aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/testing/debugfs-kmemtrace71
-rw-r--r--Documentation/cputopology.txt6
-rw-r--r--Documentation/ftrace.txt1134
-rw-r--r--Documentation/kernel-parameters.txt19
-rw-r--r--Documentation/lockdep-design.txt30
-rw-r--r--Documentation/sysrq.txt2
-rw-r--r--Documentation/tracepoints.txt8
-rw-r--r--Documentation/vm/kmemtrace.txt126
-rw-r--r--Documentation/x86/boot.txt18
-rw-r--r--Documentation/x86/earlyprintk.txt101
-rw-r--r--MAINTAINERS6
-rw-r--r--Makefile3
-rw-r--r--arch/Kconfig1
-rw-r--r--arch/alpha/include/asm/ftrace.h1
-rw-r--r--arch/alpha/include/asm/hardirq.h13
-rw-r--r--arch/alpha/include/asm/statfs.h2
-rw-r--r--arch/alpha/include/asm/swab.h2
-rw-r--r--arch/alpha/kernel/irq.c2
-rw-r--r--arch/alpha/mm/init.c20
-rw-r--r--arch/arm/include/asm/a.out.h2
-rw-r--r--arch/arm/include/asm/setup.h2
-rw-r--r--arch/arm/include/asm/swab.h2
-rw-r--r--arch/arm/kernel/irq.c18
-rw-r--r--arch/arm/kernel/vmlinux.lds.S1
-rw-r--r--arch/arm/oprofile/op_model_mpcore.c2
-rw-r--r--arch/avr32/Kconfig2
-rw-r--r--arch/avr32/include/asm/ftrace.h1
-rw-r--r--arch/avr32/include/asm/hardirq.h11
-rw-r--r--arch/avr32/include/asm/swab.h2
-rw-r--r--arch/blackfin/include/asm/ftrace.h1
-rw-r--r--arch/blackfin/include/asm/percpu.h10
-rw-r--r--arch/blackfin/include/asm/swab.h2
-rw-r--r--arch/blackfin/kernel/irqchip.c5
-rw-r--r--arch/cris/include/asm/ftrace.h1
-rw-r--r--arch/h8300/include/asm/ftrace.h1
-rw-r--r--arch/h8300/include/asm/swab.h2
-rw-r--r--arch/ia64/Kconfig3
-rw-r--r--arch/ia64/include/asm/fpu.h2
-rw-r--r--arch/ia64/include/asm/ftrace.h28
-rw-r--r--arch/ia64/include/asm/gcc_intrin.h1
-rw-r--r--arch/ia64/include/asm/hardirq.h10
-rw-r--r--arch/ia64/include/asm/intrinsics.h1
-rw-r--r--arch/ia64/include/asm/kvm.h3
-rw-r--r--arch/ia64/include/asm/percpu.h4
-rw-r--r--arch/ia64/include/asm/swab.h2
-rw-r--r--arch/ia64/include/asm/topology.h2
-rw-r--r--arch/ia64/include/asm/uv/uv.h13
-rw-r--r--arch/ia64/kernel/Makefile5
-rw-r--r--arch/ia64/kernel/acpi.c4
-rw-r--r--arch/ia64/kernel/entry.S100
-rw-r--r--arch/ia64/kernel/ftrace.c206
-rw-r--r--arch/ia64/kernel/ia64_ksyms.c6
-rw-r--r--arch/ia64/kernel/iosapic.c2
-rw-r--r--arch/ia64/kernel/irq.c4
-rw-r--r--arch/ia64/kernel/irq_ia64.c12
-rw-r--r--arch/ia64/kernel/msi_ia64.c4
-rw-r--r--arch/ia64/kernel/vmlinux.lds.S1
-rw-r--r--arch/ia64/sn/kernel/msi_sn.c2
-rw-r--r--arch/m68k/include/asm/ftrace.h1
-rw-r--r--arch/mips/include/asm/ftrace.h1
-rw-r--r--arch/mips/include/asm/irq.h2
-rw-r--r--arch/mips/include/asm/sigcontext.h1
-rw-r--r--arch/mips/include/asm/swab.h2
-rw-r--r--arch/mips/kernel/irq-gic.c2
-rw-r--r--arch/mips/kernel/smtc.c6
-rw-r--r--arch/mips/mti-malta/malta-smtc.c5
-rw-r--r--arch/mips/sgi-ip22/ip22-int.c2
-rw-r--r--arch/mips/sgi-ip22/ip22-time.c2
-rw-r--r--arch/mips/sibyte/bcm1480/smp.c3
-rw-r--r--arch/mips/sibyte/sb1250/smp.c3
-rw-r--r--arch/mn10300/kernel/mn10300-watchdog.c3
-rw-r--r--arch/parisc/include/asm/ftrace.h1
-rw-r--r--arch/parisc/include/asm/pdc.h3
-rw-r--r--arch/parisc/include/asm/swab.h2
-rw-r--r--arch/parisc/kernel/irq.c2
-rw-r--r--arch/powerpc/include/asm/bootx.h2
-rw-r--r--arch/powerpc/include/asm/elf.h2
-rw-r--r--arch/powerpc/include/asm/kvm.h2
-rw-r--r--arch/powerpc/include/asm/ps3fb.h1
-rw-r--r--arch/powerpc/include/asm/spu_info.h3
-rw-r--r--arch/powerpc/include/asm/swab.h2
-rw-r--r--arch/powerpc/kernel/irq.c2
-rw-r--r--arch/powerpc/kernel/vmlinux.lds.S1
-rw-r--r--arch/powerpc/platforms/pseries/xics.c5
-rw-r--r--arch/powerpc/sysdev/mpic.c3
-rw-r--r--arch/sparc/kernel/irq_64.c5
-rw-r--r--arch/sparc/kernel/time_64.c2
-rw-r--r--arch/um/include/asm/ftrace.h1
-rw-r--r--arch/x86/Kconfig672
-rw-r--r--arch/x86/Kconfig.cpu80
-rw-r--r--arch/x86/Kconfig.debug50
-rw-r--r--arch/x86/Makefile43
-rw-r--r--arch/x86/boot/Makefile24
-rw-r--r--arch/x86/boot/a20.c79
-rw-r--r--arch/x86/boot/boot.h3
-rw-r--r--arch/x86/boot/compressed/Makefile21
-rw-r--r--arch/x86/boot/compressed/head_32.S8
-rw-r--r--arch/x86/boot/compressed/head_64.S10
-rw-r--r--arch/x86/boot/compressed/misc.c118
-rw-r--r--arch/x86/boot/copy.S40
-rw-r--r--arch/x86/boot/header.S31
-rw-r--r--arch/x86/boot/main.c5
-rw-r--r--arch/x86/boot/pm.c44
-rw-r--r--arch/x86/boot/pmjump.S16
-rw-r--r--arch/x86/boot/tools/build.c9
-rw-r--r--arch/x86/boot/voyager.c40
-rw-r--r--arch/x86/configs/i386_defconfig419
-rw-r--r--arch/x86/configs/x86_64_defconfig425
-rw-r--r--arch/x86/ia32/ia32_signal.c405
-rw-r--r--arch/x86/ia32/ia32entry.S8
-rw-r--r--arch/x86/include/asm/a.out-core.h2
-rw-r--r--arch/x86/include/asm/acpi.h3
-rw-r--r--arch/x86/include/asm/apic.h444
-rw-r--r--arch/x86/include/asm/apicdef.h1
-rw-r--r--arch/x86/include/asm/apicnum.h12
-rw-r--r--arch/x86/include/asm/apm.h (renamed from arch/x86/include/asm/mach-default/apm.h)0
-rw-r--r--arch/x86/include/asm/arch_hooks.h26
-rw-r--r--arch/x86/include/asm/bigsmp/apic.h155
-rw-r--r--arch/x86/include/asm/bigsmp/apicdef.h13
-rw-r--r--arch/x86/include/asm/bigsmp/ipi.h22
-rw-r--r--arch/x86/include/asm/boot.h20
-rw-r--r--arch/x86/include/asm/cacheflush.h58
-rw-r--r--arch/x86/include/asm/calling.h56
-rw-r--r--arch/x86/include/asm/cpu.h17
-rwxr-xr-xarch/x86/include/asm/cpu_debug.h193
-rw-r--r--arch/x86/include/asm/cpumask.h32
-rw-r--r--arch/x86/include/asm/current.h24
-rw-r--r--arch/x86/include/asm/desc.h3
-rw-r--r--arch/x86/include/asm/do_timer.h (renamed from arch/x86/include/asm/mach-default/do_timer.h)0
-rw-r--r--arch/x86/include/asm/elf.h15
-rw-r--r--arch/x86/include/asm/entry_arch.h (renamed from arch/x86/include/asm/mach-default/entry_arch.h)27
-rw-r--r--arch/x86/include/asm/es7000/apic.h242
-rw-r--r--arch/x86/include/asm/es7000/apicdef.h13
-rw-r--r--arch/x86/include/asm/es7000/ipi.h22
-rw-r--r--arch/x86/include/asm/es7000/mpparse.h29
-rw-r--r--arch/x86/include/asm/es7000/wakecpu.h37
-rw-r--r--arch/x86/include/asm/fixmap.h141
-rw-r--r--arch/x86/include/asm/fixmap_32.h119
-rw-r--r--arch/x86/include/asm/fixmap_64.h79
-rw-r--r--arch/x86/include/asm/ftrace.h32
-rw-r--r--arch/x86/include/asm/genapic.h6
-rw-r--r--arch/x86/include/asm/genapic_32.h148
-rw-r--r--arch/x86/include/asm/genapic_64.h66
-rw-r--r--arch/x86/include/asm/hardirq.h50
-rw-r--r--arch/x86/include/asm/hardirq_32.h30
-rw-r--r--arch/x86/include/asm/hardirq_64.h25
-rw-r--r--arch/x86/include/asm/highmem.h1
-rw-r--r--arch/x86/include/asm/hw_irq.h25
-rw-r--r--arch/x86/include/asm/i8259.h4
-rw-r--r--arch/x86/include/asm/init.h18
-rw-r--r--arch/x86/include/asm/io.h97
-rw-r--r--arch/x86/include/asm/io_32.h88
-rw-r--r--arch/x86/include/asm/io_64.h61
-rw-r--r--arch/x86/include/asm/io_apic.h41
-rw-r--r--arch/x86/include/asm/ipi.h75
-rw-r--r--arch/x86/include/asm/irq.h5
-rw-r--r--arch/x86/include/asm/irq_regs.h36
-rw-r--r--arch/x86/include/asm/irq_regs_32.h31
-rw-r--r--arch/x86/include/asm/irq_regs_64.h1
-rw-r--r--arch/x86/include/asm/irq_vectors.h214
-rw-r--r--arch/x86/include/asm/kexec.h34
-rw-r--r--arch/x86/include/asm/linkage.h77
-rw-r--r--arch/x86/include/asm/mach-default/mach_apic.h168
-rw-r--r--arch/x86/include/asm/mach-default/mach_apicdef.h24
-rw-r--r--arch/x86/include/asm/mach-default/mach_ipi.h64
-rw-r--r--arch/x86/include/asm/mach-default/mach_mpparse.h17
-rw-r--r--arch/x86/include/asm/mach-default/mach_mpspec.h12
-rw-r--r--arch/x86/include/asm/mach-default/mach_wakecpu.h41
-rw-r--r--arch/x86/include/asm/mach-generic/gpio.h15
-rw-r--r--arch/x86/include/asm/mach-generic/mach_apic.h35
-rw-r--r--arch/x86/include/asm/mach-generic/mach_apicdef.h11
-rw-r--r--arch/x86/include/asm/mach-generic/mach_ipi.h10
-rw-r--r--arch/x86/include/asm/mach-generic/mach_mpparse.h9
-rw-r--r--arch/x86/include/asm/mach-generic/mach_mpspec.h12
-rw-r--r--arch/x86/include/asm/mach-generic/mach_wakecpu.h12
-rw-r--r--arch/x86/include/asm/mach-rdc321x/gpio.h60
-rw-r--r--arch/x86/include/asm/mach-voyager/do_timer.h17
-rw-r--r--arch/x86/include/asm/mach-voyager/entry_arch.h26
-rw-r--r--arch/x86/include/asm/mach-voyager/setup_arch.h12
-rw-r--r--arch/x86/include/asm/mach_timer.h (renamed from arch/x86/include/asm/mach-default/mach_timer.h)0
-rw-r--r--arch/x86/include/asm/mach_traps.h (renamed from arch/x86/include/asm/mach-default/mach_traps.h)0
-rw-r--r--arch/x86/include/asm/mce.h35
-rw-r--r--arch/x86/include/asm/mmu_context.h63
-rw-r--r--arch/x86/include/asm/mmu_context_32.h55
-rw-r--r--arch/x86/include/asm/mmu_context_64.h54
-rw-r--r--arch/x86/include/asm/mmzone_32.h43
-rw-r--r--arch/x86/include/asm/mpspec.h33
-rw-r--r--arch/x86/include/asm/mpspec_def.h23
-rw-r--r--arch/x86/include/asm/msr-index.h5
-rw-r--r--arch/x86/include/asm/numa_32.h6
-rw-r--r--arch/x86/include/asm/numaq.h2
-rw-r--r--arch/x86/include/asm/numaq/apic.h142
-rw-r--r--arch/x86/include/asm/numaq/apicdef.h14
-rw-r--r--arch/x86/include/asm/numaq/ipi.h22
-rw-r--r--arch/x86/include/asm/numaq/mpparse.h6
-rw-r--r--arch/x86/include/asm/numaq/wakecpu.h45
-rw-r--r--arch/x86/include/asm/page.h152
-rw-r--r--arch/x86/include/asm/page_32.h87
-rw-r--r--arch/x86/include/asm/page_32_types.h60
-rw-r--r--arch/x86/include/asm/page_64.h101
-rw-r--r--arch/x86/include/asm/page_64_types.h89
-rw-r--r--arch/x86/include/asm/page_types.h51
-rw-r--r--arch/x86/include/asm/paravirt.h465
-rw-r--r--arch/x86/include/asm/pat.h10
-rw-r--r--arch/x86/include/asm/pci-functions.h (renamed from arch/x86/include/asm/mach-default/pci-functions.h)0
-rw-r--r--arch/x86/include/asm/pda.h137
-rw-r--r--arch/x86/include/asm/percpu.h177
-rw-r--r--arch/x86/include/asm/pgtable-2level.h2
-rw-r--r--arch/x86/include/asm/pgtable-2level_types.h (renamed from arch/x86/include/asm/pgtable-2level-defs.h)17
-rw-r--r--arch/x86/include/asm/pgtable-3level.h35
-rw-r--r--arch/x86/include/asm/pgtable-3level_types.h (renamed from arch/x86/include/asm/pgtable-3level-defs.h)20
-rw-r--r--arch/x86/include/asm/pgtable.h509
-rw-r--r--arch/x86/include/asm/pgtable_32.h88
-rw-r--r--arch/x86/include/asm/pgtable_32_types.h51
-rw-r--r--arch/x86/include/asm/pgtable_64.h113
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h63
-rw-r--r--arch/x86/include/asm/pgtable_types.h329
-rw-r--r--arch/x86/include/asm/prctl.h4
-rw-r--r--arch/x86/include/asm/processor.h45
-rw-r--r--arch/x86/include/asm/proto.h4
-rw-r--r--arch/x86/include/asm/ptrace-abi.h3
-rw-r--r--arch/x86/include/asm/ptrace.h4
-rw-r--r--arch/x86/include/asm/rdc321x_defs.h (renamed from arch/x86/include/asm/mach-rdc321x/rdc321x_defs.h)0
-rw-r--r--arch/x86/include/asm/segment.h9
-rw-r--r--arch/x86/include/asm/setup.h62
-rw-r--r--arch/x86/include/asm/setup_arch.h (renamed from arch/x86/include/asm/mach-default/setup_arch.h)0
-rw-r--r--arch/x86/include/asm/smp.h69
-rw-r--r--arch/x86/include/asm/smpboot_hooks.h (renamed from arch/x86/include/asm/mach-default/smpboot_hooks.h)6
-rw-r--r--arch/x86/include/asm/spinlock.h69
-rw-r--r--arch/x86/include/asm/stackprotector.h124
-rw-r--r--arch/x86/include/asm/summit/apic.h202
-rw-r--r--arch/x86/include/asm/summit/apicdef.h13
-rw-r--r--arch/x86/include/asm/summit/ipi.h26
-rw-r--r--arch/x86/include/asm/summit/mpparse.h109
-rw-r--r--arch/x86/include/asm/syscalls.h23
-rw-r--r--arch/x86/include/asm/system.h70
-rw-r--r--arch/x86/include/asm/thread_info.h30
-rw-r--r--arch/x86/include/asm/timer.h2
-rw-r--r--arch/x86/include/asm/tlbflush.h17
-rw-r--r--arch/x86/include/asm/topology.h31
-rw-r--r--arch/x86/include/asm/trampoline.h1
-rw-r--r--arch/x86/include/asm/traps.h2
-rw-r--r--arch/x86/include/asm/uaccess.h138
-rw-r--r--arch/x86/include/asm/uaccess_64.h10
-rw-r--r--arch/x86/include/asm/uv/uv.h33
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h1
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h4
-rw-r--r--arch/x86/include/asm/vic.h61
-rw-r--r--arch/x86/include/asm/voyager.h529
-rw-r--r--arch/x86/include/asm/xen/events.h6
-rw-r--r--arch/x86/include/asm/xen/hypervisor.h28
-rw-r--r--arch/x86/include/asm/xen/page.h1
-rw-r--r--arch/x86/kernel/Makefile45
-rw-r--r--arch/x86/kernel/acpi/boot.c169
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.S4
-rw-r--r--arch/x86/kernel/acpi/sleep.c1
-rw-r--r--arch/x86/kernel/acpi/wakeup_32.S2
-rw-r--r--arch/x86/kernel/acpi/wakeup_64.S4
-rw-r--r--arch/x86/kernel/alternative.c42
-rw-r--r--arch/x86/kernel/apic/Makefile19
-rw-r--r--arch/x86/kernel/apic/apic.c (renamed from arch/x86/kernel/apic.c)314
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c (renamed from arch/x86/kernel/genapic_flat_64.c)192
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c267
-rw-r--r--arch/x86/kernel/apic/es7000_32.c780
-rw-r--r--arch/x86/kernel/apic/io_apic.c (renamed from arch/x86/kernel/io_apic.c)443
-rw-r--r--arch/x86/kernel/apic/ipi.c164
-rw-r--r--arch/x86/kernel/apic/nmi.c (renamed from arch/x86/kernel/nmi.c)12
-rw-r--r--arch/x86/kernel/apic/numaq_32.c557
-rw-r--r--arch/x86/kernel/apic/probe_32.c284
-rw-r--r--arch/x86/kernel/apic/probe_64.c (renamed from arch/x86/kernel/genapic_64.c)55
-rw-r--r--arch/x86/kernel/apic/summit_32.c579
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c (renamed from arch/x86/kernel/genx2apic_cluster.c)153
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c (renamed from arch/x86/kernel/genx2apic_phys.c)150
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c (renamed from arch/x86/kernel/genx2apic_uv_x.c)161
-rw-r--r--arch/x86/kernel/apm_32.c2
-rw-r--r--arch/x86/kernel/asm-offsets_32.c1
-rw-r--r--arch/x86/kernel/asm-offsets_64.c11
-rw-r--r--arch/x86/kernel/cpu/Makefile2
-rw-r--r--arch/x86/kernel/cpu/addon_cpuid_features.c54
-rw-r--r--arch/x86/kernel/cpu/amd.c54
-rw-r--r--arch/x86/kernel/cpu/common.c263
-rwxr-xr-xarch/x86/kernel/cpu/cpu_debug.c785
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c6
-rw-r--r--arch/x86/kernel/cpu/cpufreq/e_powersaver.c6
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c6
-rw-r--r--arch/x86/kernel/cpu/intel.c50
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c63
-rw-r--r--arch/x86/kernel/cpu/mcheck/Makefile1
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_32.c14
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c530
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd_64.c43
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel_64.c214
-rw-r--r--arch/x86/kernel/cpu/mcheck/p4.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/threshold.c29
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c2
-rw-r--r--arch/x86/kernel/cpu/proc.c20
-rw-r--r--arch/x86/kernel/crash.c4
-rw-r--r--arch/x86/kernel/dumpstack.c9
-rw-r--r--arch/x86/kernel/dumpstack_64.c35
-rw-r--r--arch/x86/kernel/e820.c3
-rw-r--r--arch/x86/kernel/early_printk.c2
-rw-r--r--arch/x86/kernel/efi.c2
-rw-r--r--arch/x86/kernel/efi_64.c1
-rw-r--r--arch/x86/kernel/efi_stub_32.S3
-rw-r--r--arch/x86/kernel/efi_stub_64.S7
-rw-r--r--arch/x86/kernel/entry_32.S473
-rw-r--r--arch/x86/kernel/entry_64.S78
-rw-r--r--arch/x86/kernel/es7000_32.c378
-rw-r--r--arch/x86/kernel/ftrace.c206
-rw-r--r--arch/x86/kernel/head64.c23
-rw-r--r--arch/x86/kernel/head_32.S44
-rw-r--r--arch/x86/kernel/head_64.S23
-rw-r--r--arch/x86/kernel/i8259.c1
-rw-r--r--arch/x86/kernel/ioport.c14
-rw-r--r--arch/x86/kernel/ipi.c190
-rw-r--r--arch/x86/kernel/irq.c78
-rw-r--r--arch/x86/kernel/irq_32.c61
-rw-r--r--arch/x86/kernel/irq_64.c43
-rw-r--r--arch/x86/kernel/irqinit_32.c39
-rw-r--r--arch/x86/kernel/irqinit_64.c3
-rw-r--r--arch/x86/kernel/kgdb.c4
-rw-r--r--arch/x86/kernel/kvmclock.c1
-rw-r--r--arch/x86/kernel/machine_kexec_32.c19
-rw-r--r--arch/x86/kernel/machine_kexec_64.c179
-rw-r--r--arch/x86/kernel/mca_32.c5
-rw-r--r--arch/x86/kernel/microcode_intel.c10
-rw-r--r--arch/x86/kernel/module_32.c6
-rw-r--r--arch/x86/kernel/module_64.c32
-rw-r--r--arch/x86/kernel/mpparse.c212
-rw-r--r--arch/x86/kernel/msr.c2
-rw-r--r--arch/x86/kernel/numaq_32.c293
-rw-r--r--arch/x86/kernel/paravirt-spinlocks.c10
-rw-r--r--arch/x86/kernel/paravirt.c56
-rw-r--r--arch/x86/kernel/paravirt_patch_32.c12
-rw-r--r--arch/x86/kernel/paravirt_patch_64.c15
-rw-r--r--arch/x86/kernel/probe_roms_32.c2
-rw-r--r--arch/x86/kernel/process.c198
-rw-r--r--arch/x86/kernel/process_32.c241
-rw-r--r--arch/x86/kernel/process_64.c230
-rw-r--r--arch/x86/kernel/ptrace.c28
-rw-r--r--arch/x86/kernel/quirks.c3
-rw-r--r--arch/x86/kernel/reboot.c5
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S26
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S312
-rw-r--r--arch/x86/kernel/setup.c143
-rw-r--r--arch/x86/kernel/setup_percpu.c704
-rw-r--r--arch/x86/kernel/signal.c463
-rw-r--r--arch/x86/kernel/smp.c15
-rw-r--r--arch/x86/kernel/smpboot.c225
-rw-r--r--arch/x86/kernel/smpcommon.c30
-rw-r--r--arch/x86/kernel/stacktrace.c2
-rw-r--r--arch/x86/kernel/summit_32.c188
-rw-r--r--arch/x86/kernel/syscall_table_32.S20
-rw-r--r--arch/x86/kernel/time_32.c8
-rw-r--r--arch/x86/kernel/tlb_32.c256
-rw-r--r--arch/x86/kernel/tlb_uv.c72
-rw-r--r--arch/x86/kernel/trampoline_32.S2
-rw-r--r--arch/x86/kernel/trampoline_64.S23
-rw-r--r--arch/x86/kernel/traps.c64
-rw-r--r--arch/x86/kernel/tsc.c2
-rw-r--r--arch/x86/kernel/uv_time.c393
-rw-r--r--arch/x86/kernel/visws_quirks.c12
-rw-r--r--arch/x86/kernel/vm86_32.c20
-rw-r--r--arch/x86/kernel/vmi_32.c13
-rw-r--r--arch/x86/kernel/vmiclock_32.c6
-rw-r--r--arch/x86/kernel/vmlinux_32.lds.S11
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S44
-rw-r--r--arch/x86/kernel/vsmp_64.c24
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c2
-rw-r--r--arch/x86/kvm/Kconfig3
-rw-r--r--arch/x86/lguest/Kconfig1
-rw-r--r--arch/x86/lguest/boot.c32
-rw-r--r--arch/x86/lib/getuser.S2
-rw-r--r--arch/x86/lib/memcpy_64.S143
-rw-r--r--arch/x86/mach-default/Makefile5
-rw-r--r--arch/x86/mach-default/setup.c174
-rw-r--r--arch/x86/mach-generic/Makefile11
-rw-r--r--arch/x86/mach-generic/bigsmp.c60
-rw-r--r--arch/x86/mach-generic/default.c27
-rw-r--r--arch/x86/mach-generic/es7000.c103
-rw-r--r--arch/x86/mach-generic/numaq.c53
-rw-r--r--arch/x86/mach-generic/probe.c152
-rw-r--r--arch/x86/mach-generic/summit.c40
-rw-r--r--arch/x86/mach-rdc321x/Makefile5
-rw-r--r--arch/x86/mach-rdc321x/gpio.c194
-rw-r--r--arch/x86/mach-rdc321x/platform.c69
-rw-r--r--arch/x86/mach-voyager/Makefile8
-rw-r--r--arch/x86/mach-voyager/setup.c118
-rw-r--r--arch/x86/mach-voyager/voyager_basic.c317
-rw-r--r--arch/x86/mach-voyager/voyager_cat.c1197
-rw-r--r--arch/x86/mach-voyager/voyager_smp.c1807
-rw-r--r--arch/x86/mach-voyager/voyager_thread.c128
-rw-r--r--arch/x86/math-emu/get_address.c6
-rw-r--r--arch/x86/mm/Makefile4
-rw-r--r--arch/x86/mm/extable.c6
-rw-r--r--arch/x86/mm/fault.c1333
-rw-r--r--arch/x86/mm/highmem_32.c44
-rw-r--r--arch/x86/mm/init.c393
-rw-r--r--arch/x86/mm/init_32.c496
-rw-r--r--arch/x86/mm/init_64.c416
-rw-r--r--arch/x86/mm/iomap_32.c13
-rw-r--r--arch/x86/mm/ioremap.c37
-rw-r--r--arch/x86/mm/kmmio.c2
-rw-r--r--arch/x86/mm/memtest.c159
-rw-r--r--arch/x86/mm/mmap.c2
-rw-r--r--arch/x86/mm/numa_32.c33
-rw-r--r--arch/x86/mm/numa_64.c217
-rw-r--r--arch/x86/mm/pageattr.c7
-rw-r--r--arch/x86/mm/pat.c77
-rw-r--r--arch/x86/mm/pgtable.c18
-rw-r--r--arch/x86/mm/pgtable_32.c18
-rw-r--r--arch/x86/mm/srat_64.c3
-rw-r--r--arch/x86/mm/tlb.c (renamed from arch/x86/kernel/tlb_64.c)123
-rw-r--r--arch/x86/pci/numaq_32.c6
-rw-r--r--arch/x86/pci/pcbios.c2
-rw-r--r--arch/x86/power/hibernate_asm_32.S2
-rw-r--r--arch/x86/power/hibernate_asm_64.S2
-rw-r--r--arch/x86/vdso/Makefile2
-rw-r--r--arch/x86/vdso/vma.c4
-rw-r--r--arch/x86/xen/Kconfig2
-rw-r--r--arch/x86/xen/Makefile3
-rw-r--r--arch/x86/xen/enlighten.c816
-rw-r--r--arch/x86/xen/irq.c39
-rw-r--r--arch/x86/xen/mmu.c757
-rw-r--r--arch/x86/xen/mmu.h3
-rw-r--r--arch/x86/xen/multicalls.c15
-rw-r--r--arch/x86/xen/multicalls.h2
-rw-r--r--arch/x86/xen/smp.c47
-rw-r--r--arch/x86/xen/suspend.c1
-rw-r--r--arch/x86/xen/xen-asm.S142
-rw-r--r--arch/x86/xen/xen-asm.h12
-rw-r--r--arch/x86/xen/xen-asm_32.S343
-rw-r--r--arch/x86/xen/xen-asm_64.S252
-rw-r--r--arch/x86/xen/xen-head.S2
-rw-r--r--arch/x86/xen/xen-ops.h10
-rw-r--r--arch/xtensa/include/asm/ftrace.h1
-rw-r--r--arch/xtensa/include/asm/swab.h2
-rw-r--r--block/Kconfig16
-rw-r--r--block/Makefile1
-rw-r--r--drivers/acpi/acpica/tbxface.c17
-rw-r--r--drivers/acpi/osl.c13
-rw-r--r--drivers/acpi/processor_perflib.c4
-rw-r--r--drivers/acpi/tables.c20
-rw-r--r--drivers/base/cpu.c2
-rw-r--r--drivers/base/topology.c33
-rw-r--r--drivers/char/sysrq.c2
-rw-r--r--drivers/clocksource/acpi_pm.c2
-rw-r--r--drivers/clocksource/cyclone.c2
-rw-r--r--drivers/eisa/Kconfig6
-rw-r--r--drivers/firmware/dcdbas.c12
-rw-r--r--drivers/firmware/iscsi_ibft.c4
-rw-r--r--drivers/gpu/drm/drm_proc.c4
-rw-r--r--drivers/input/keyboard/Kconfig4
-rw-r--r--drivers/input/mouse/Kconfig2
-rw-r--r--drivers/lguest/Kconfig2
-rw-r--r--drivers/misc/Kconfig4
-rw-r--r--drivers/misc/sgi-gru/grufile.c18
-rw-r--r--drivers/misc/sgi-xp/xp.h24
-rw-r--r--drivers/misc/sgi-xp/xpc_main.c2
-rw-r--r--drivers/mtd/nand/Kconfig2
-rw-r--r--drivers/net/ne3210.c3
-rw-r--r--drivers/net/sfc/efx.c17
-rw-r--r--drivers/net/sfc/falcon.c24
-rw-r--r--drivers/net/wireless/arlan-main.c4
-rw-r--r--drivers/oprofile/buffer_sync.c22
-rw-r--r--drivers/oprofile/buffer_sync.h4
-rw-r--r--drivers/oprofile/cpu_buffer.c5
-rw-r--r--drivers/oprofile/oprof.c9
-rw-r--r--drivers/pci/dmar.c7
-rw-r--r--drivers/pci/intr_remapping.c1
-rw-r--r--drivers/watchdog/rdc321x_wdt.c2
-rw-r--r--drivers/xen/events.c251
-rw-r--r--drivers/xen/manage.c2
-rw-r--r--fs/partitions/check.c4
-rw-r--r--include/acpi/acpiosxf.h1
-rw-r--r--include/acpi/acpixf.h4
-rw-r--r--include/asm-frv/ftrace.h1
-rw-r--r--include/asm-frv/swab.h2
-rw-r--r--include/asm-generic/percpu.h52
-rw-r--r--include/asm-generic/sections.h2
-rw-r--r--include/asm-generic/vmlinux.lds.h84
-rw-r--r--include/asm-m32r/ftrace.h1
-rw-r--r--include/asm-m32r/swab.h2
-rw-r--r--include/asm-mn10300/ftrace.h1
-rw-r--r--include/asm-mn10300/swab.h2
-rw-r--r--include/linux/acpi.h1
-rw-r--r--include/linux/blktrace_api.h5
-rw-r--r--include/linux/bootmem.h36
-rw-r--r--include/linux/coda_psdev.h15
-rw-r--r--include/linux/compiler.h6
-rw-r--r--include/linux/decompress/bunzip2.h10
-rw-r--r--include/linux/decompress/generic.h33
-rw-r--r--include/linux/decompress/inflate.h13
-rw-r--r--include/linux/decompress/mm.h87
-rw-r--r--include/linux/decompress/unlzma.h12
-rw-r--r--include/linux/elfcore.h9
-rw-r--r--include/linux/ftrace.h253
-rw-r--r--include/linux/ftrace_irq.h2
-rw-r--r--include/linux/hardirq.h73
-rw-r--r--include/linux/in6.h2
-rw-r--r--include/linux/interrupt.h6
-rw-r--r--include/linux/irq.h86
-rw-r--r--include/linux/irqflags.h8
-rw-r--r--include/linux/irqnr.h1
-rw-r--r--include/linux/kernel.h150
-rw-r--r--include/linux/kprobes.h22
-rw-r--r--include/linux/lockdep.h50
-rw-r--r--include/linux/magic.h1
-rw-r--r--include/linux/memory.h6
-rw-r--r--include/linux/mmiotrace.h78
-rw-r--r--include/linux/module.h5
-rw-r--r--include/linux/mutex.h5
-rw-r--r--include/linux/nubus.h2
-rw-r--r--include/linux/percpu.h155
-rw-r--r--include/linux/reiserfs_fs.h56
-rw-r--r--include/linux/ring_buffer.h27
-rw-r--r--include/linux/sched.h31
-rw-r--r--include/linux/slab_def.h68
-rw-r--r--include/linux/slob_def.h9
-rw-r--r--include/linux/slub_def.h72
-rw-r--r--include/linux/smp.h6
-rw-r--r--include/linux/socket.h6
-rw-r--r--include/linux/stackprotector.h16
-rw-r--r--include/linux/string.h7
-rw-r--r--include/linux/syscalls.h60
-rw-r--r--include/linux/timer.h93
-rw-r--r--include/linux/topology.h6
-rw-r--r--include/linux/trace_clock.h19
-rw-r--r--include/linux/tracepoint.h116
-rw-r--r--include/linux/types.h6
-rw-r--r--include/linux/vmalloc.h4
-rw-r--r--include/trace/block.h70
-rw-r--r--include/trace/irq.h9
-rw-r--r--include/trace/irq_event_types.h55
-rw-r--r--include/trace/kmemtrace.h75
-rw-r--r--include/trace/lockdep.h9
-rw-r--r--include/trace/lockdep_event_types.h44
-rw-r--r--include/trace/power.h32
-rw-r--r--include/trace/sched.h49
-rw-r--r--include/trace/sched_event_types.h337
-rw-r--r--include/trace/trace_event_types.h5
-rw-r--r--include/trace/trace_events.h5
-rw-r--r--include/trace/workqueue.h25
-rw-r--r--init/Kconfig62
-rw-r--r--init/do_mounts_rd.c178
-rw-r--r--init/initramfs.c122
-rw-r--r--init/main.c21
-rw-r--r--kernel/exit.c5
-rw-r--r--kernel/extable.c4
-rw-r--r--kernel/fork.c5
-rw-r--r--kernel/irq/chip.c5
-rw-r--r--kernel/irq/handle.c63
-rw-r--r--kernel/irq/internals.h7
-rw-r--r--kernel/irq/manage.c12
-rw-r--r--kernel/irq/migration.c12
-rw-r--r--kernel/irq/numa_migrate.c19
-rw-r--r--kernel/irq/proc.c4
-rw-r--r--kernel/kexec.c2
-rw-r--r--kernel/kprobes.c19
-rw-r--r--kernel/lockdep.c538
-rw-r--r--kernel/lockdep_internals.h45
-rw-r--r--kernel/lockdep_proc.c22
-rw-r--r--kernel/lockdep_states.h9
-rw-r--r--kernel/module.c66
-rw-r--r--kernel/mutex-debug.c9
-rw-r--r--kernel/mutex-debug.h18
-rw-r--r--kernel/mutex.c121
-rw-r--r--kernel/mutex.h22
-rw-r--r--kernel/panic.c8
-rw-r--r--kernel/relay.c4
-rw-r--r--kernel/sched.c92
-rw-r--r--kernel/sched_clock.c53
-rw-r--r--kernel/sched_features.h1
-rw-r--r--kernel/sched_rt.c32
-rw-r--r--kernel/softirq.c37
-rw-r--r--kernel/stop_machine.c2
-rw-r--r--kernel/timer.c68
-rw-r--r--kernel/trace/Kconfig119
-rw-r--r--kernel/trace/Makefile11
-rw-r--r--kernel/trace/blktrace.c (renamed from block/blktrace.c)734
-rw-r--r--kernel/trace/events.c15
-rw-r--r--kernel/trace/ftrace.c1071
-rw-r--r--kernel/trace/kmemtrace.c339
-rw-r--r--kernel/trace/ring_buffer.c581
-rw-r--r--kernel/trace/trace.c2941
-rw-r--r--kernel/trace/trace.h243
-rw-r--r--kernel/trace/trace_boot.c36
-rw-r--r--kernel/trace/trace_branch.c278
-rw-r--r--kernel/trace/trace_clock.c108
-rw-r--r--kernel/trace/trace_event_types.h175
-rw-r--r--kernel/trace/trace_events.c604
-rw-r--r--kernel/trace/trace_events_stage_1.h39
-rw-r--r--kernel/trace/trace_events_stage_2.h131
-rw-r--r--kernel/trace/trace_events_stage_3.h217
-rw-r--r--kernel/trace/trace_export.c102
-rw-r--r--kernel/trace/trace_functions.c369
-rw-r--r--kernel/trace/trace_functions_graph.c504
-rw-r--r--kernel/trace/trace_hw_branches.c185
-rw-r--r--kernel/trace/trace_irqsoff.c54
-rw-r--r--kernel/trace/trace_mmiotrace.c43
-rw-r--r--kernel/trace/trace_nop.c5
-rw-r--r--kernel/trace/trace_output.c967
-rw-r--r--kernel/trace/trace_output.h63
-rw-r--r--kernel/trace/trace_power.c194
-rw-r--r--kernel/trace/trace_printk.c270
-rw-r--r--kernel/trace/trace_sched_switch.c24
-rw-r--r--kernel/trace/trace_sched_wakeup.c96
-rw-r--r--kernel/trace/trace_selftest.c133
-rw-r--r--kernel/trace/trace_stack.c19
-rw-r--r--kernel/trace/trace_stat.c319
-rw-r--r--kernel/trace/trace_stat.h31
-rw-r--r--kernel/trace/trace_syscalls.c250
-rw-r--r--kernel/trace/trace_sysprof.c23
-rw-r--r--kernel/trace/trace_workqueue.c288
-rw-r--r--kernel/tracepoint.c7
-rw-r--r--kernel/workqueue.c16
-rw-r--r--lib/Kconfig17
-rw-r--r--lib/Kconfig.debug2
-rw-r--r--lib/Makefile7
-rw-r--r--lib/decompress.c54
-rw-r--r--lib/decompress_bunzip2.c735
-rw-r--r--lib/decompress_inflate.c167
-rw-r--r--lib/decompress_unlzma.c647
-rw-r--r--lib/locking-selftest.c4
-rw-r--r--lib/vsprintf.c1005
-rw-r--r--lib/zlib_inflate/inflate.h4
-rw-r--r--lib/zlib_inflate/inftrees.h4
-rw-r--r--mm/Makefile4
-rw-r--r--mm/allocpercpu.c32
-rw-r--r--mm/bootmem.c35
-rw-r--r--mm/filemap.c7
-rw-r--r--mm/memory.c10
-rw-r--r--mm/page_alloc.c5
-rw-r--r--mm/percpu.c1226
-rw-r--r--mm/slab.c75
-rw-r--r--mm/slob.c39
-rw-r--r--mm/slub.c98
-rw-r--r--mm/vmalloc.c97
-rw-r--r--mm/vmscan.c2
-rw-r--r--net/ipv4/af_inet.c4
-rw-r--r--net/ipv4/route.c2
-rw-r--r--samples/tracepoints/tp-samples-trace.h8
-rw-r--r--scripts/Makefile.build13
-rw-r--r--scripts/Makefile.lib14
-rw-r--r--scripts/bin_size10
-rw-r--r--scripts/gcc-x86_32-has-stack-protector.sh8
-rw-r--r--scripts/gcc-x86_64-has-stack-protector.sh6
-rw-r--r--scripts/gen_initramfs_list.sh18
-rw-r--r--scripts/headers_check.pl2
-rw-r--r--scripts/kallsyms.c57
-rw-r--r--scripts/mod/modpost.c5
-rwxr-xr-xscripts/recordmcount.pl37
-rw-r--r--sound/drivers/Kconfig2
-rw-r--r--usr/Kconfig89
-rw-r--r--usr/Makefile36
-rw-r--r--usr/initramfs_data.S2
-rw-r--r--usr/initramfs_data.bz2.S29
-rw-r--r--usr/initramfs_data.gz.S29
-rw-r--r--usr/initramfs_data.lzma.S29
659 files changed, 34855 insertions, 22729 deletions
diff --git a/Documentation/ABI/testing/debugfs-kmemtrace b/Documentation/ABI/testing/debugfs-kmemtrace
new file mode 100644
index 000000000000..5e6a92a02d85
--- /dev/null
+++ b/Documentation/ABI/testing/debugfs-kmemtrace
@@ -0,0 +1,71 @@
1What: /sys/kernel/debug/kmemtrace/
2Date: July 2008
3Contact: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
4Description:
5
6In kmemtrace-enabled kernels, the following files are created:
7
8/sys/kernel/debug/kmemtrace/
9 cpu<n> (0400) Per-CPU tracing data, see below. (binary)
10 total_overruns (0400) Total number of bytes which were dropped from
11 cpu<n> files because of full buffer condition,
12 non-binary. (text)
13 abi_version (0400) Kernel's kmemtrace ABI version. (text)
14
15Each per-CPU file should be read according to the relay interface. That is,
16the reader should set affinity to that specific CPU and, as currently done by
17the userspace application (though there are other methods), use poll() with
18an infinite timeout before every read(). Otherwise, erroneous data may be
19read. The binary data has the following _core_ format:
20
21 Event ID (1 byte) Unsigned integer, one of:
22 0 - represents an allocation (KMEMTRACE_EVENT_ALLOC)
23 1 - represents a freeing of previously allocated memory
24 (KMEMTRACE_EVENT_FREE)
25 Type ID (1 byte) Unsigned integer, one of:
26 0 - this is a kmalloc() / kfree()
27 1 - this is a kmem_cache_alloc() / kmem_cache_free()
28 2 - this is a __get_free_pages() et al.
29 Event size (2 bytes) Unsigned integer representing the
30 size of this event. Used to extend
31 kmemtrace. Discard the bytes you
32 don't know about.
33 Sequence number (4 bytes) Signed integer used to reorder data
34 logged on SMP machines. Wraparound
35 must be taken into account, although
36 it is unlikely.
37 Caller address (8 bytes) Return address to the caller.
38 Pointer to mem (8 bytes) Pointer to target memory area. Can be
39 NULL, but not all such calls might be
40 recorded.
41
42In case of KMEMTRACE_EVENT_ALLOC events, the next fields follow:
43
44 Requested bytes (8 bytes) Total number of requested bytes,
45 unsigned, must not be zero.
46 Allocated bytes (8 bytes) Total number of actually allocated
47 bytes, unsigned, must not be lower
48 than requested bytes.
49 Requested flags (4 bytes) GFP flags supplied by the caller.
50 Target CPU (4 bytes) Signed integer, valid for event id 1.
51 If equal to -1, target CPU is the same
52 as origin CPU, but the reverse might
53 not be true.
54
55The data is made available in the same endianness the machine has.
56
57Other event ids and type ids may be defined and added. Other fields may be
58added by increasing event size, but see below for details.
59Every modification to the ABI, including new id definitions, are followed
60by bumping the ABI version by one.
61
62Adding new data to the packet (features) is done at the end of the mandatory
63data:
64 Feature size (2 byte)
65 Feature ID (1 byte)
66 Feature data (Feature size - 3 bytes)
67
68
69Users:
70 kmemtrace-user - git://repo.or.cz/kmemtrace-user.git
71
diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt
index 45932ec21cee..b41f3e58aefa 100644
--- a/Documentation/cputopology.txt
+++ b/Documentation/cputopology.txt
@@ -18,11 +18,11 @@ For an architecture to support this feature, it must define some of
18these macros in include/asm-XXX/topology.h: 18these macros in include/asm-XXX/topology.h:
19#define topology_physical_package_id(cpu) 19#define topology_physical_package_id(cpu)
20#define topology_core_id(cpu) 20#define topology_core_id(cpu)
21#define topology_thread_siblings(cpu) 21#define topology_thread_cpumask(cpu)
22#define topology_core_siblings(cpu) 22#define topology_core_cpumask(cpu)
23 23
24The type of **_id is int. 24The type of **_id is int.
25The type of siblings is cpumask_t. 25The type of siblings is (const) struct cpumask *.
26 26
27To be consistent on all architectures, include/linux/topology.h 27To be consistent on all architectures, include/linux/topology.h
28provides default definitions for any of the above macros that are 28provides default definitions for any of the above macros that are
diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt
index 803b1318b13d..fd9a3e693813 100644
--- a/Documentation/ftrace.txt
+++ b/Documentation/ftrace.txt
@@ -15,31 +15,31 @@ Introduction
15 15
16Ftrace is an internal tracer designed to help out developers and 16Ftrace is an internal tracer designed to help out developers and
17designers of systems to find what is going on inside the kernel. 17designers of systems to find what is going on inside the kernel.
18It can be used for debugging or analyzing latencies and performance 18It can be used for debugging or analyzing latencies and
19issues that take place outside of user-space. 19performance issues that take place outside of user-space.
20 20
21Although ftrace is the function tracer, it also includes an 21Although ftrace is the function tracer, it also includes an
22infrastructure that allows for other types of tracing. Some of the 22infrastructure that allows for other types of tracing. Some of
23tracers that are currently in ftrace include a tracer to trace 23the tracers that are currently in ftrace include a tracer to
24context switches, the time it takes for a high priority task to 24trace context switches, the time it takes for a high priority
25run after it was woken up, the time interrupts are disabled, and 25task to run after it was woken up, the time interrupts are
26more (ftrace allows for tracer plugins, which means that the list of 26disabled, and more (ftrace allows for tracer plugins, which
27tracers can always grow). 27means that the list of tracers can always grow).
28 28
29 29
30The File System 30The File System
31--------------- 31---------------
32 32
33Ftrace uses the debugfs file system to hold the control files as well 33Ftrace uses the debugfs file system to hold the control files as
34as the files to display output. 34well as the files to display output.
35 35
36To mount the debugfs system: 36To mount the debugfs system:
37 37
38 # mkdir /debug 38 # mkdir /debug
39 # mount -t debugfs nodev /debug 39 # mount -t debugfs nodev /debug
40 40
41(Note: it is more common to mount at /sys/kernel/debug, but for simplicity 41( Note: it is more common to mount at /sys/kernel/debug, but for
42 this document will use /debug) 42 simplicity this document will use /debug)
43 43
44That's it! (assuming that you have ftrace configured into your kernel) 44That's it! (assuming that you have ftrace configured into your kernel)
45 45
@@ -50,90 +50,124 @@ of ftrace. Here is a list of some of the key files:
50 50
51 Note: all time values are in microseconds. 51 Note: all time values are in microseconds.
52 52
53 current_tracer: This is used to set or display the current tracer 53 current_tracer:
54 that is configured. 54
55 55 This is used to set or display the current tracer
56 available_tracers: This holds the different types of tracers that 56 that is configured.
57 have been compiled into the kernel. The tracers 57
58 listed here can be configured by echoing their name 58 available_tracers:
59 into current_tracer. 59
60 60 This holds the different types of tracers that
61 tracing_enabled: This sets or displays whether the current_tracer 61 have been compiled into the kernel. The
62 is activated and tracing or not. Echo 0 into this 62 tracers listed here can be configured by
63 file to disable the tracer or 1 to enable it. 63 echoing their name into current_tracer.
64 64
65 trace: This file holds the output of the trace in a human readable 65 tracing_enabled:
66 format (described below). 66
67 67 This sets or displays whether the current_tracer
68 latency_trace: This file shows the same trace but the information 68 is activated and tracing or not. Echo 0 into this
69 is organized more to display possible latencies 69 file to disable the tracer or 1 to enable it.
70 in the system (described below). 70
71 71 trace:
72 trace_pipe: The output is the same as the "trace" file but this 72
73 file is meant to be streamed with live tracing. 73 This file holds the output of the trace in a human
74 Reads from this file will block until new data 74 readable format (described below).
75 is retrieved. Unlike the "trace" and "latency_trace" 75
76 files, this file is a consumer. This means reading 76 latency_trace:
77 from this file causes sequential reads to display 77
78 more current data. Once data is read from this 78 This file shows the same trace but the information
79 file, it is consumed, and will not be read 79 is organized more to display possible latencies
80 again with a sequential read. The "trace" and 80 in the system (described below).
81 "latency_trace" files are static, and if the 81
82 tracer is not adding more data, they will display 82 trace_pipe:
83 the same information every time they are read. 83
84 84 The output is the same as the "trace" file but this
85 trace_options: This file lets the user control the amount of data 85 file is meant to be streamed with live tracing.
86 that is displayed in one of the above output 86 Reads from this file will block until new data
87 files. 87 is retrieved. Unlike the "trace" and "latency_trace"
88 88 files, this file is a consumer. This means reading
89 trace_max_latency: Some of the tracers record the max latency. 89 from this file causes sequential reads to display
90 For example, the time interrupts are disabled. 90 more current data. Once data is read from this
91 This time is saved in this file. The max trace 91 file, it is consumed, and will not be read
92 will also be stored, and displayed by either 92 again with a sequential read. The "trace" and
93 "trace" or "latency_trace". A new max trace will 93 "latency_trace" files are static, and if the
94 only be recorded if the latency is greater than 94 tracer is not adding more data, they will display
95 the value in this file. (in microseconds) 95 the same information every time they are read.
96 96
97 buffer_size_kb: This sets or displays the number of kilobytes each CPU 97 trace_options:
98 buffer can hold. The tracer buffers are the same size 98
99 for each CPU. The displayed number is the size of the 99 This file lets the user control the amount of data
100 CPU buffer and not total size of all buffers. The 100 that is displayed in one of the above output
101 trace buffers are allocated in pages (blocks of memory 101 files.
102 that the kernel uses for allocation, usually 4 KB in size). 102
103 If the last page allocated has room for more bytes 103 tracing_max_latency:
104 than requested, the rest of the page will be used, 104
105 making the actual allocation bigger than requested. 105 Some of the tracers record the max latency.
106 (Note, the size may not be a multiple of the page size due 106 For example, the time interrupts are disabled.
107 to buffer managment overhead.) 107 This time is saved in this file. The max trace
108 108 will also be stored, and displayed by either
109 This can only be updated when the current_tracer 109 "trace" or "latency_trace". A new max trace will
110 is set to "nop". 110 only be recorded if the latency is greater than
111 111 the value in this file. (in microseconds)
112 tracing_cpumask: This is a mask that lets the user only trace 112
113 on specified CPUS. The format is a hex string 113 buffer_size_kb:
114 representing the CPUS. 114
115 115 This sets or displays the number of kilobytes each CPU
116 set_ftrace_filter: When dynamic ftrace is configured in (see the 116 buffer can hold. The tracer buffers are the same size
117 section below "dynamic ftrace"), the code is dynamically 117 for each CPU. The displayed number is the size of the
118 modified (code text rewrite) to disable calling of the 118 CPU buffer and not total size of all buffers. The
119 function profiler (mcount). This lets tracing be configured 119 trace buffers are allocated in pages (blocks of memory
120 in with practically no overhead in performance. This also 120 that the kernel uses for allocation, usually 4 KB in size).
121 has a side effect of enabling or disabling specific functions 121 If the last page allocated has room for more bytes
122 to be traced. Echoing names of functions into this file 122 than requested, the rest of the page will be used,
123 will limit the trace to only those functions. 123 making the actual allocation bigger than requested.
124 124 ( Note, the size may not be a multiple of the page size
125 set_ftrace_notrace: This has an effect opposite to that of 125 due to buffer managment overhead. )
126 set_ftrace_filter. Any function that is added here will not 126
127 be traced. If a function exists in both set_ftrace_filter 127 This can only be updated when the current_tracer
128 and set_ftrace_notrace, the function will _not_ be traced. 128 is set to "nop".
129 129
130 set_ftrace_pid: Have the function tracer only trace a single thread. 130 tracing_cpumask:
131 131
132 available_filter_functions: This lists the functions that ftrace 132 This is a mask that lets the user only trace
133 has processed and can trace. These are the function 133 on specified CPUS. The format is a hex string
134 names that you can pass to "set_ftrace_filter" or 134 representing the CPUS.
135 "set_ftrace_notrace". (See the section "dynamic ftrace" 135
136 below for more details.) 136 set_ftrace_filter:
137
138 When dynamic ftrace is configured in (see the
139 section below "dynamic ftrace"), the code is dynamically
140 modified (code text rewrite) to disable calling of the
141 function profiler (mcount). This lets tracing be configured
142 in with practically no overhead in performance. This also
143 has a side effect of enabling or disabling specific functions
144 to be traced. Echoing names of functions into this file
145 will limit the trace to only those functions.
146
147 set_ftrace_notrace:
148
149 This has an effect opposite to that of
150 set_ftrace_filter. Any function that is added here will not
151 be traced. If a function exists in both set_ftrace_filter
152 and set_ftrace_notrace, the function will _not_ be traced.
153
154 set_ftrace_pid:
155
156 Have the function tracer only trace a single thread.
157
158 set_graph_function:
159
160 Set a "trigger" function where tracing should start
161 with the function graph tracer (See the section
162 "dynamic ftrace" for more details).
163
164 available_filter_functions:
165
166 This lists the functions that ftrace
167 has processed and can trace. These are the function
168 names that you can pass to "set_ftrace_filter" or
169 "set_ftrace_notrace". (See the section "dynamic ftrace"
170 below for more details.)
137 171
138 172
139The Tracers 173The Tracers
@@ -141,36 +175,66 @@ The Tracers
141 175
142Here is the list of current tracers that may be configured. 176Here is the list of current tracers that may be configured.
143 177
144 function - function tracer that uses mcount to trace all functions. 178 "function"
179
180 Function call tracer to trace all kernel functions.
181
182 "function_graph_tracer"
183
184 Similar to the function tracer except that the
185 function tracer probes the functions on their entry
186 whereas the function graph tracer traces on both entry
187 and exit of the functions. It then provides the ability
188 to draw a graph of function calls similar to C code
189 source.
145 190
146 sched_switch - traces the context switches between tasks. 191 "sched_switch"
147 192
148 irqsoff - traces the areas that disable interrupts and saves 193 Traces the context switches and wakeups between tasks.
149 the trace with the longest max latency.
150 See tracing_max_latency. When a new max is recorded,
151 it replaces the old trace. It is best to view this
152 trace via the latency_trace file.
153 194
154 preemptoff - Similar to irqsoff but traces and records the amount of 195 "irqsoff"
155 time for which preemption is disabled.
156 196
157 preemptirqsoff - Similar to irqsoff and preemptoff, but traces and 197 Traces the areas that disable interrupts and saves
158 records the largest time for which irqs and/or preemption 198 the trace with the longest max latency.
159 is disabled. 199 See tracing_max_latency. When a new max is recorded,
200 it replaces the old trace. It is best to view this
201 trace via the latency_trace file.
160 202
161 wakeup - Traces and records the max latency that it takes for 203 "preemptoff"
162 the highest priority task to get scheduled after
163 it has been woken up.
164 204
165 nop - This is not a tracer. To remove all tracers from tracing 205 Similar to irqsoff but traces and records the amount of
166 simply echo "nop" into current_tracer. 206 time for which preemption is disabled.
207
208 "preemptirqsoff"
209
210 Similar to irqsoff and preemptoff, but traces and
211 records the largest time for which irqs and/or preemption
212 is disabled.
213
214 "wakeup"
215
216 Traces and records the max latency that it takes for
217 the highest priority task to get scheduled after
218 it has been woken up.
219
220 "hw-branch-tracer"
221
222 Uses the BTS CPU feature on x86 CPUs to traces all
223 branches executed.
224
225 "nop"
226
227 This is the "trace nothing" tracer. To remove all
228 tracers from tracing simply echo "nop" into
229 current_tracer.
167 230
168 231
169Examples of using the tracer 232Examples of using the tracer
170---------------------------- 233----------------------------
171 234
172Here are typical examples of using the tracers when controlling them only 235Here are typical examples of using the tracers when controlling
173with the debugfs interface (without using any user-land utilities). 236them only with the debugfs interface (without using any
237user-land utilities).
174 238
175Output format: 239Output format:
176-------------- 240--------------
@@ -187,16 +251,16 @@ Here is an example of the output format of the file "trace"
187 bash-4251 [01] 10152.583855: _atomic_dec_and_lock <-dput 251 bash-4251 [01] 10152.583855: _atomic_dec_and_lock <-dput
188 -------- 252 --------
189 253
190A header is printed with the tracer name that is represented by the trace. 254A header is printed with the tracer name that is represented by
191In this case the tracer is "function". Then a header showing the format. Task 255the trace. In this case the tracer is "function". Then a header
192name "bash", the task PID "4251", the CPU that it was running on 256showing the format. Task name "bash", the task PID "4251", the
193"01", the timestamp in <secs>.<usecs> format, the function name that was 257CPU that it was running on "01", the timestamp in <secs>.<usecs>
194traced "path_put" and the parent function that called this function 258format, the function name that was traced "path_put" and the
195"path_walk". The timestamp is the time at which the function was 259parent function that called this function "path_walk". The
196entered. 260timestamp is the time at which the function was entered.
197 261
198The sched_switch tracer also includes tracing of task wakeups and 262The sched_switch tracer also includes tracing of task wakeups
199context switches. 263and context switches.
200 264
201 ksoftirqd/1-7 [01] 1453.070013: 7:115:R + 2916:115:S 265 ksoftirqd/1-7 [01] 1453.070013: 7:115:R + 2916:115:S
202 ksoftirqd/1-7 [01] 1453.070013: 7:115:R + 10:115:S 266 ksoftirqd/1-7 [01] 1453.070013: 7:115:R + 10:115:S
@@ -205,8 +269,8 @@ context switches.
205 kondemand/1-2916 [01] 1453.070013: 2916:115:S ==> 7:115:R 269 kondemand/1-2916 [01] 1453.070013: 2916:115:S ==> 7:115:R
206 ksoftirqd/1-7 [01] 1453.070013: 7:115:S ==> 0:140:R 270 ksoftirqd/1-7 [01] 1453.070013: 7:115:S ==> 0:140:R
207 271
208Wake ups are represented by a "+" and the context switches are shown as 272Wake ups are represented by a "+" and the context switches are
209"==>". The format is: 273shown as "==>". The format is:
210 274
211 Context switches: 275 Context switches:
212 276
@@ -220,19 +284,20 @@ Wake ups are represented by a "+" and the context switches are shown as
220 284
221 <pid>:<prio>:<state> + <pid>:<prio>:<state> 285 <pid>:<prio>:<state> + <pid>:<prio>:<state>
222 286
223The prio is the internal kernel priority, which is the inverse of the 287The prio is the internal kernel priority, which is the inverse
224priority that is usually displayed by user-space tools. Zero represents 288of the priority that is usually displayed by user-space tools.
225the highest priority (99). Prio 100 starts the "nice" priorities with 289Zero represents the highest priority (99). Prio 100 starts the
226100 being equal to nice -20 and 139 being nice 19. The prio "140" is 290"nice" priorities with 100 being equal to nice -20 and 139 being
227reserved for the idle task which is the lowest priority thread (pid 0). 291nice 19. The prio "140" is reserved for the idle task which is
292the lowest priority thread (pid 0).
228 293
229 294
230Latency trace format 295Latency trace format
231-------------------- 296--------------------
232 297
233For traces that display latency times, the latency_trace file gives 298For traces that display latency times, the latency_trace file
234somewhat more information to see why a latency happened. Here is a typical 299gives somewhat more information to see why a latency happened.
235trace. 300Here is a typical trace.
236 301
237# tracer: irqsoff 302# tracer: irqsoff
238# 303#
@@ -259,20 +324,20 @@ irqsoff latency trace v1.1.5 on 2.6.26-rc8
259 <idle>-0 0d.s1 98us : trace_hardirqs_on (do_softirq) 324 <idle>-0 0d.s1 98us : trace_hardirqs_on (do_softirq)
260 325
261 326
327This shows that the current tracer is "irqsoff" tracing the time
328for which interrupts were disabled. It gives the trace version
329and the version of the kernel upon which this was executed on
330(2.6.26-rc8). Then it displays the max latency in microsecs (97
331us). The number of trace entries displayed and the total number
332recorded (both are three: #3/3). The type of preemption that was
333used (PREEMPT). VP, KP, SP, and HP are always zero and are
334reserved for later use. #P is the number of online CPUS (#P:2).
262 335
263This shows that the current tracer is "irqsoff" tracing the time for which 336The task is the process that was running when the latency
264interrupts were disabled. It gives the trace version and the version 337occurred. (swapper pid: 0).
265of the kernel upon which this was executed on (2.6.26-rc8). Then it displays
266the max latency in microsecs (97 us). The number of trace entries displayed
267and the total number recorded (both are three: #3/3). The type of
268preemption that was used (PREEMPT). VP, KP, SP, and HP are always zero
269and are reserved for later use. #P is the number of online CPUS (#P:2).
270
271The task is the process that was running when the latency occurred.
272(swapper pid: 0).
273 338
274The start and stop (the functions in which the interrupts were disabled and 339The start and stop (the functions in which the interrupts were
275enabled respectively) that caused the latencies: 340disabled and enabled respectively) that caused the latencies:
276 341
277 apic_timer_interrupt is where the interrupts were disabled. 342 apic_timer_interrupt is where the interrupts were disabled.
278 do_softirq is where they were enabled again. 343 do_softirq is where they were enabled again.
@@ -308,12 +373,12 @@ The above is mostly meaningful for kernel developers.
308 latency_trace file is relative to the start of the trace. 373 latency_trace file is relative to the start of the trace.
309 374
310 delay: This is just to help catch your eye a bit better. And 375 delay: This is just to help catch your eye a bit better. And
311 needs to be fixed to be only relative to the same CPU. 376 needs to be fixed to be only relative to the same CPU.
312 The marks are determined by the difference between this 377 The marks are determined by the difference between this
313 current trace and the next trace. 378 current trace and the next trace.
314 '!' - greater than preempt_mark_thresh (default 100) 379 '!' - greater than preempt_mark_thresh (default 100)
315 '+' - greater than 1 microsecond 380 '+' - greater than 1 microsecond
316 ' ' - less than or equal to 1 microsecond. 381 ' ' - less than or equal to 1 microsecond.
317 382
318 The rest is the same as the 'trace' file. 383 The rest is the same as the 'trace' file.
319 384
@@ -321,14 +386,15 @@ The above is mostly meaningful for kernel developers.
321trace_options 386trace_options
322------------- 387-------------
323 388
324The trace_options file is used to control what gets printed in the trace 389The trace_options file is used to control what gets printed in
325output. To see what is available, simply cat the file: 390the trace output. To see what is available, simply cat the file:
326 391
327 cat /debug/tracing/trace_options 392 cat /debug/tracing/trace_options
328 print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \ 393 print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \
329 noblock nostacktrace nosched-tree nouserstacktrace nosym-userobj 394 noblock nostacktrace nosched-tree nouserstacktrace nosym-userobj
330 395
331To disable one of the options, echo in the option prepended with "no". 396To disable one of the options, echo in the option prepended with
397"no".
332 398
333 echo noprint-parent > /debug/tracing/trace_options 399 echo noprint-parent > /debug/tracing/trace_options
334 400
@@ -338,8 +404,8 @@ To enable an option, leave off the "no".
338 404
339Here are the available options: 405Here are the available options:
340 406
341 print-parent - On function traces, display the calling function 407 print-parent - On function traces, display the calling (parent)
342 as well as the function being traced. 408 function as well as the function being traced.
343 409
344 print-parent: 410 print-parent:
345 bash-4000 [01] 1477.606694: simple_strtoul <-strict_strtoul 411 bash-4000 [01] 1477.606694: simple_strtoul <-strict_strtoul
@@ -348,15 +414,16 @@ Here are the available options:
348 bash-4000 [01] 1477.606694: simple_strtoul 414 bash-4000 [01] 1477.606694: simple_strtoul
349 415
350 416
351 sym-offset - Display not only the function name, but also the offset 417 sym-offset - Display not only the function name, but also the
352 in the function. For example, instead of seeing just 418 offset in the function. For example, instead of
353 "ktime_get", you will see "ktime_get+0xb/0x20". 419 seeing just "ktime_get", you will see
420 "ktime_get+0xb/0x20".
354 421
355 sym-offset: 422 sym-offset:
356 bash-4000 [01] 1477.606694: simple_strtoul+0x6/0xa0 423 bash-4000 [01] 1477.606694: simple_strtoul+0x6/0xa0
357 424
358 sym-addr - this will also display the function address as well as 425 sym-addr - this will also display the function address as well
359 the function name. 426 as the function name.
360 427
361 sym-addr: 428 sym-addr:
362 bash-4000 [01] 1477.606694: simple_strtoul <c0339346> 429 bash-4000 [01] 1477.606694: simple_strtoul <c0339346>
@@ -366,35 +433,41 @@ Here are the available options:
366 bash 4000 1 0 00000000 00010a95 [58127d26] 1720.415ms \ 433 bash 4000 1 0 00000000 00010a95 [58127d26] 1720.415ms \
367 (+0.000ms): simple_strtoul (strict_strtoul) 434 (+0.000ms): simple_strtoul (strict_strtoul)
368 435
369 raw - This will display raw numbers. This option is best for use with 436 raw - This will display raw numbers. This option is best for
370 user applications that can translate the raw numbers better than 437 use with user applications that can translate the raw
371 having it done in the kernel. 438 numbers better than having it done in the kernel.
372 439
373 hex - Similar to raw, but the numbers will be in a hexadecimal format. 440 hex - Similar to raw, but the numbers will be in a hexadecimal
441 format.
374 442
375 bin - This will print out the formats in raw binary. 443 bin - This will print out the formats in raw binary.
376 444
377 block - TBD (needs update) 445 block - TBD (needs update)
378 446
379 stacktrace - This is one of the options that changes the trace itself. 447 stacktrace - This is one of the options that changes the trace
380 When a trace is recorded, so is the stack of functions. 448 itself. When a trace is recorded, so is the stack
381 This allows for back traces of trace sites. 449 of functions. This allows for back traces of
450 trace sites.
382 451
383 userstacktrace - This option changes the trace. 452 userstacktrace - This option changes the trace. It records a
384 It records a stacktrace of the current userspace thread. 453 stacktrace of the current userspace thread.
385 454
386 sym-userobj - when user stacktrace are enabled, look up which object the 455 sym-userobj - when user stacktrace are enabled, look up which
387 address belongs to, and print a relative address 456 object the address belongs to, and print a
388 This is especially useful when ASLR is on, otherwise you don't 457 relative address. This is especially useful when
389 get a chance to resolve the address to object/file/line after the app is no 458 ASLR is on, otherwise you don't get a chance to
390 longer running 459 resolve the address to object/file/line after
460 the app is no longer running
391 461
392 The lookup is performed when you read trace,trace_pipe,latency_trace. Example: 462 The lookup is performed when you read
463 trace,trace_pipe,latency_trace. Example:
393 464
394 a.out-1623 [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0 465 a.out-1623 [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0
395x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6] 466x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6]
396 467
397 sched-tree - TBD (any users??) 468 sched-tree - trace all tasks that are on the runqueue, at
469 every scheduling event. Will add overhead if
470 there's a lot of tasks running at once.
398 471
399 472
400sched_switch 473sched_switch
@@ -431,18 +504,19 @@ of how to use it.
431 [...] 504 [...]
432 505
433 506
434As we have discussed previously about this format, the header shows 507As we have discussed previously about this format, the header
435the name of the trace and points to the options. The "FUNCTION" 508shows the name of the trace and points to the options. The
436is a misnomer since here it represents the wake ups and context 509"FUNCTION" is a misnomer since here it represents the wake ups
437switches. 510and context switches.
438 511
439The sched_switch file only lists the wake ups (represented with '+') 512The sched_switch file only lists the wake ups (represented with
440and context switches ('==>') with the previous task or current task 513'+') and context switches ('==>') with the previous task or
441first followed by the next task or task waking up. The format for both 514current task first followed by the next task or task waking up.
442of these is PID:KERNEL-PRIO:TASK-STATE. Remember that the KERNEL-PRIO 515The format for both of these is PID:KERNEL-PRIO:TASK-STATE.
443is the inverse of the actual priority with zero (0) being the highest 516Remember that the KERNEL-PRIO is the inverse of the actual
444priority and the nice values starting at 100 (nice -20). Below is 517priority with zero (0) being the highest priority and the nice
445a quick chart to map the kernel priority to user land priorities. 518values starting at 100 (nice -20). Below is a quick chart to map
519the kernel priority to user land priorities.
446 520
447 Kernel priority: 0 to 99 ==> user RT priority 99 to 0 521 Kernel priority: 0 to 99 ==> user RT priority 99 to 0
448 Kernel priority: 100 to 139 ==> user nice -20 to 19 522 Kernel priority: 100 to 139 ==> user nice -20 to 19
@@ -463,10 +537,10 @@ The task states are:
463ftrace_enabled 537ftrace_enabled
464-------------- 538--------------
465 539
466The following tracers (listed below) give different output depending 540The following tracers (listed below) give different output
467on whether or not the sysctl ftrace_enabled is set. To set ftrace_enabled, 541depending on whether or not the sysctl ftrace_enabled is set. To
468one can either use the sysctl function or set it via the proc 542set ftrace_enabled, one can either use the sysctl function or
469file system interface. 543set it via the proc file system interface.
470 544
471 sysctl kernel.ftrace_enabled=1 545 sysctl kernel.ftrace_enabled=1
472 546
@@ -474,12 +548,12 @@ file system interface.
474 548
475 echo 1 > /proc/sys/kernel/ftrace_enabled 549 echo 1 > /proc/sys/kernel/ftrace_enabled
476 550
477To disable ftrace_enabled simply replace the '1' with '0' in 551To disable ftrace_enabled simply replace the '1' with '0' in the
478the above commands. 552above commands.
479 553
480When ftrace_enabled is set the tracers will also record the functions 554When ftrace_enabled is set the tracers will also record the
481that are within the trace. The descriptions of the tracers 555functions that are within the trace. The descriptions of the
482will also show an example with ftrace enabled. 556tracers will also show an example with ftrace enabled.
483 557
484 558
485irqsoff 559irqsoff
@@ -487,17 +561,18 @@ irqsoff
487 561
488When interrupts are disabled, the CPU can not react to any other 562When interrupts are disabled, the CPU can not react to any other
489external event (besides NMIs and SMIs). This prevents the timer 563external event (besides NMIs and SMIs). This prevents the timer
490interrupt from triggering or the mouse interrupt from letting the 564interrupt from triggering or the mouse interrupt from letting
491kernel know of a new mouse event. The result is a latency with the 565the kernel know of a new mouse event. The result is a latency
492reaction time. 566with the reaction time.
493 567
494The irqsoff tracer tracks the time for which interrupts are disabled. 568The irqsoff tracer tracks the time for which interrupts are
495When a new maximum latency is hit, the tracer saves the trace leading up 569disabled. When a new maximum latency is hit, the tracer saves
496to that latency point so that every time a new maximum is reached, the old 570the trace leading up to that latency point so that every time a
497saved trace is discarded and the new trace is saved. 571new maximum is reached, the old saved trace is discarded and the
572new trace is saved.
498 573
499To reset the maximum, echo 0 into tracing_max_latency. Here is an 574To reset the maximum, echo 0 into tracing_max_latency. Here is
500example: 575an example:
501 576
502 # echo irqsoff > /debug/tracing/current_tracer 577 # echo irqsoff > /debug/tracing/current_tracer
503 # echo 0 > /debug/tracing/tracing_max_latency 578 # echo 0 > /debug/tracing/tracing_max_latency
@@ -532,10 +607,11 @@ irqsoff latency trace v1.1.5 on 2.6.26
532 607
533 608
534Here we see that that we had a latency of 12 microsecs (which is 609Here we see that that we had a latency of 12 microsecs (which is
535very good). The _write_lock_irq in sys_setpgid disabled interrupts. 610very good). The _write_lock_irq in sys_setpgid disabled
536The difference between the 12 and the displayed timestamp 14us occurred 611interrupts. The difference between the 12 and the displayed
537because the clock was incremented between the time of recording the max 612timestamp 14us occurred because the clock was incremented
538latency and the time of recording the function that had that latency. 613between the time of recording the max latency and the time of
614recording the function that had that latency.
539 615
540Note the above example had ftrace_enabled not set. If we set the 616Note the above example had ftrace_enabled not set. If we set the
541ftrace_enabled, we get a much larger output: 617ftrace_enabled, we get a much larger output:
@@ -586,24 +662,24 @@ irqsoff latency trace v1.1.5 on 2.6.26-rc8
586 662
587 663
588Here we traced a 50 microsecond latency. But we also see all the 664Here we traced a 50 microsecond latency. But we also see all the
589functions that were called during that time. Note that by enabling 665functions that were called during that time. Note that by
590function tracing, we incur an added overhead. This overhead may 666enabling function tracing, we incur an added overhead. This
591extend the latency times. But nevertheless, this trace has provided 667overhead may extend the latency times. But nevertheless, this
592some very helpful debugging information. 668trace has provided some very helpful debugging information.
593 669
594 670
595preemptoff 671preemptoff
596---------- 672----------
597 673
598When preemption is disabled, we may be able to receive interrupts but 674When preemption is disabled, we may be able to receive
599the task cannot be preempted and a higher priority task must wait 675interrupts but the task cannot be preempted and a higher
600for preemption to be enabled again before it can preempt a lower 676priority task must wait for preemption to be enabled again
601priority task. 677before it can preempt a lower priority task.
602 678
603The preemptoff tracer traces the places that disable preemption. 679The preemptoff tracer traces the places that disable preemption.
604Like the irqsoff tracer, it records the maximum latency for which preemption 680Like the irqsoff tracer, it records the maximum latency for
605was disabled. The control of preemptoff tracer is much like the irqsoff 681which preemption was disabled. The control of preemptoff tracer
606tracer. 682is much like the irqsoff tracer.
607 683
608 # echo preemptoff > /debug/tracing/current_tracer 684 # echo preemptoff > /debug/tracing/current_tracer
609 # echo 0 > /debug/tracing/tracing_max_latency 685 # echo 0 > /debug/tracing/tracing_max_latency
@@ -637,11 +713,12 @@ preemptoff latency trace v1.1.5 on 2.6.26-rc8
637 sshd-4261 0d.s1 30us : trace_preempt_on (__do_softirq) 713 sshd-4261 0d.s1 30us : trace_preempt_on (__do_softirq)
638 714
639 715
640This has some more changes. Preemption was disabled when an interrupt 716This has some more changes. Preemption was disabled when an
641came in (notice the 'h'), and was enabled while doing a softirq. 717interrupt came in (notice the 'h'), and was enabled while doing
642(notice the 's'). But we also see that interrupts have been disabled 718a softirq. (notice the 's'). But we also see that interrupts
643when entering the preempt off section and leaving it (the 'd'). 719have been disabled when entering the preempt off section and
644We do not know if interrupts were enabled in the mean time. 720leaving it (the 'd'). We do not know if interrupts were enabled
721in the mean time.
645 722
646# tracer: preemptoff 723# tracer: preemptoff
647# 724#
@@ -700,28 +777,30 @@ preemptoff latency trace v1.1.5 on 2.6.26-rc8
700 sshd-4261 0d.s1 64us : trace_preempt_on (__do_softirq) 777 sshd-4261 0d.s1 64us : trace_preempt_on (__do_softirq)
701 778
702 779
703The above is an example of the preemptoff trace with ftrace_enabled 780The above is an example of the preemptoff trace with
704set. Here we see that interrupts were disabled the entire time. 781ftrace_enabled set. Here we see that interrupts were disabled
705The irq_enter code lets us know that we entered an interrupt 'h'. 782the entire time. The irq_enter code lets us know that we entered
706Before that, the functions being traced still show that it is not 783an interrupt 'h'. Before that, the functions being traced still
707in an interrupt, but we can see from the functions themselves that 784show that it is not in an interrupt, but we can see from the
708this is not the case. 785functions themselves that this is not the case.
709 786
710Notice that __do_softirq when called does not have a preempt_count. 787Notice that __do_softirq when called does not have a
711It may seem that we missed a preempt enabling. What really happened 788preempt_count. It may seem that we missed a preempt enabling.
712is that the preempt count is held on the thread's stack and we 789What really happened is that the preempt count is held on the
713switched to the softirq stack (4K stacks in effect). The code 790thread's stack and we switched to the softirq stack (4K stacks
714does not copy the preempt count, but because interrupts are disabled, 791in effect). The code does not copy the preempt count, but
715we do not need to worry about it. Having a tracer like this is good 792because interrupts are disabled, we do not need to worry about
716for letting people know what really happens inside the kernel. 793it. Having a tracer like this is good for letting people know
794what really happens inside the kernel.
717 795
718 796
719preemptirqsoff 797preemptirqsoff
720-------------- 798--------------
721 799
722Knowing the locations that have interrupts disabled or preemption 800Knowing the locations that have interrupts disabled or
723disabled for the longest times is helpful. But sometimes we would 801preemption disabled for the longest times is helpful. But
724like to know when either preemption and/or interrupts are disabled. 802sometimes we would like to know when either preemption and/or
803interrupts are disabled.
725 804
726Consider the following code: 805Consider the following code:
727 806
@@ -741,11 +820,13 @@ The preemptoff tracer will record the total length of
741call_function_with_irqs_and_preemption_off() and 820call_function_with_irqs_and_preemption_off() and
742call_function_with_preemption_off(). 821call_function_with_preemption_off().
743 822
744But neither will trace the time that interrupts and/or preemption 823But neither will trace the time that interrupts and/or
745is disabled. This total time is the time that we can not schedule. 824preemption is disabled. This total time is the time that we can
746To record this time, use the preemptirqsoff tracer. 825not schedule. To record this time, use the preemptirqsoff
826tracer.
747 827
748Again, using this trace is much like the irqsoff and preemptoff tracers. 828Again, using this trace is much like the irqsoff and preemptoff
829tracers.
749 830
750 # echo preemptirqsoff > /debug/tracing/current_tracer 831 # echo preemptirqsoff > /debug/tracing/current_tracer
751 # echo 0 > /debug/tracing/tracing_max_latency 832 # echo 0 > /debug/tracing/tracing_max_latency
@@ -781,9 +862,10 @@ preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8
781 862
782 863
783The trace_hardirqs_off_thunk is called from assembly on x86 when 864The trace_hardirqs_off_thunk is called from assembly on x86 when
784interrupts are disabled in the assembly code. Without the function 865interrupts are disabled in the assembly code. Without the
785tracing, we do not know if interrupts were enabled within the preemption 866function tracing, we do not know if interrupts were enabled
786points. We do see that it started with preemption enabled. 867within the preemption points. We do see that it started with
868preemption enabled.
787 869
788Here is a trace with ftrace_enabled set: 870Here is a trace with ftrace_enabled set:
789 871
@@ -871,40 +953,42 @@ preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8
871 sshd-4261 0d.s1 105us : trace_preempt_on (__do_softirq) 953 sshd-4261 0d.s1 105us : trace_preempt_on (__do_softirq)
872 954
873 955
874This is a very interesting trace. It started with the preemption of 956This is a very interesting trace. It started with the preemption
875the ls task. We see that the task had the "need_resched" bit set 957of the ls task. We see that the task had the "need_resched" bit
876via the 'N' in the trace. Interrupts were disabled before the spin_lock 958set via the 'N' in the trace. Interrupts were disabled before
877at the beginning of the trace. We see that a schedule took place to run 959the spin_lock at the beginning of the trace. We see that a
878sshd. When the interrupts were enabled, we took an interrupt. 960schedule took place to run sshd. When the interrupts were
879On return from the interrupt handler, the softirq ran. We took another 961enabled, we took an interrupt. On return from the interrupt
880interrupt while running the softirq as we see from the capital 'H'. 962handler, the softirq ran. We took another interrupt while
963running the softirq as we see from the capital 'H'.
881 964
882 965
883wakeup 966wakeup
884------ 967------
885 968
886In a Real-Time environment it is very important to know the wakeup 969In a Real-Time environment it is very important to know the
887time it takes for the highest priority task that is woken up to the 970wakeup time it takes for the highest priority task that is woken
888time that it executes. This is also known as "schedule latency". 971up to the time that it executes. This is also known as "schedule
889I stress the point that this is about RT tasks. It is also important 972latency". I stress the point that this is about RT tasks. It is
890to know the scheduling latency of non-RT tasks, but the average 973also important to know the scheduling latency of non-RT tasks,
891schedule latency is better for non-RT tasks. Tools like 974but the average schedule latency is better for non-RT tasks.
892LatencyTop are more appropriate for such measurements. 975Tools like LatencyTop are more appropriate for such
976measurements.
893 977
894Real-Time environments are interested in the worst case latency. 978Real-Time environments are interested in the worst case latency.
895That is the longest latency it takes for something to happen, and 979That is the longest latency it takes for something to happen,
896not the average. We can have a very fast scheduler that may only 980and not the average. We can have a very fast scheduler that may
897have a large latency once in a while, but that would not work well 981only have a large latency once in a while, but that would not
898with Real-Time tasks. The wakeup tracer was designed to record 982work well with Real-Time tasks. The wakeup tracer was designed
899the worst case wakeups of RT tasks. Non-RT tasks are not recorded 983to record the worst case wakeups of RT tasks. Non-RT tasks are
900because the tracer only records one worst case and tracing non-RT 984not recorded because the tracer only records one worst case and
901tasks that are unpredictable will overwrite the worst case latency 985tracing non-RT tasks that are unpredictable will overwrite the
902of RT tasks. 986worst case latency of RT tasks.
903 987
904Since this tracer only deals with RT tasks, we will run this slightly 988Since this tracer only deals with RT tasks, we will run this
905differently than we did with the previous tracers. Instead of performing 989slightly differently than we did with the previous tracers.
906an 'ls', we will run 'sleep 1' under 'chrt' which changes the 990Instead of performing an 'ls', we will run 'sleep 1' under
907priority of the task. 991'chrt' which changes the priority of the task.
908 992
909 # echo wakeup > /debug/tracing/current_tracer 993 # echo wakeup > /debug/tracing/current_tracer
910 # echo 0 > /debug/tracing/tracing_max_latency 994 # echo 0 > /debug/tracing/tracing_max_latency
@@ -934,17 +1018,16 @@ wakeup latency trace v1.1.5 on 2.6.26-rc8
934 <idle>-0 1d..4 4us : schedule (cpu_idle) 1018 <idle>-0 1d..4 4us : schedule (cpu_idle)
935 1019
936 1020
1021Running this on an idle system, we see that it only took 4
1022microseconds to perform the task switch. Note, since the trace
1023marker in the schedule is before the actual "switch", we stop
1024the tracing when the recorded task is about to schedule in. This
1025may change if we add a new marker at the end of the scheduler.
937 1026
938Running this on an idle system, we see that it only took 4 microseconds 1027Notice that the recorded task is 'sleep' with the PID of 4901
939to perform the task switch. Note, since the trace marker in the 1028and it has an rt_prio of 5. This priority is user-space priority
940schedule is before the actual "switch", we stop the tracing when 1029and not the internal kernel priority. The policy is 1 for
941the recorded task is about to schedule in. This may change if 1030SCHED_FIFO and 2 for SCHED_RR.
942we add a new marker at the end of the scheduler.
943
944Notice that the recorded task is 'sleep' with the PID of 4901 and it
945has an rt_prio of 5. This priority is user-space priority and not
946the internal kernel priority. The policy is 1 for SCHED_FIFO and 2
947for SCHED_RR.
948 1031
949Doing the same with chrt -r 5 and ftrace_enabled set. 1032Doing the same with chrt -r 5 and ftrace_enabled set.
950 1033
@@ -1001,24 +1084,25 @@ ksoftirq-7 1d..6 49us : _spin_unlock (tracing_record_cmdline)
1001ksoftirq-7 1d..6 49us : sub_preempt_count (_spin_unlock) 1084ksoftirq-7 1d..6 49us : sub_preempt_count (_spin_unlock)
1002ksoftirq-7 1d..4 50us : schedule (__cond_resched) 1085ksoftirq-7 1d..4 50us : schedule (__cond_resched)
1003 1086
1004The interrupt went off while running ksoftirqd. This task runs at 1087The interrupt went off while running ksoftirqd. This task runs
1005SCHED_OTHER. Why did not we see the 'N' set early? This may be 1088at SCHED_OTHER. Why did not we see the 'N' set early? This may
1006a harmless bug with x86_32 and 4K stacks. On x86_32 with 4K stacks 1089be a harmless bug with x86_32 and 4K stacks. On x86_32 with 4K
1007configured, the interrupt and softirq run with their own stack. 1090stacks configured, the interrupt and softirq run with their own
1008Some information is held on the top of the task's stack (need_resched 1091stack. Some information is held on the top of the task's stack
1009and preempt_count are both stored there). The setting of the NEED_RESCHED 1092(need_resched and preempt_count are both stored there). The
1010bit is done directly to the task's stack, but the reading of the 1093setting of the NEED_RESCHED bit is done directly to the task's
1011NEED_RESCHED is done by looking at the current stack, which in this case 1094stack, but the reading of the NEED_RESCHED is done by looking at
1012is the stack for the hard interrupt. This hides the fact that NEED_RESCHED 1095the current stack, which in this case is the stack for the hard
1013has been set. We do not see the 'N' until we switch back to the task's 1096interrupt. This hides the fact that NEED_RESCHED has been set.
1097We do not see the 'N' until we switch back to the task's
1014assigned stack. 1098assigned stack.
1015 1099
1016function 1100function
1017-------- 1101--------
1018 1102
1019This tracer is the function tracer. Enabling the function tracer 1103This tracer is the function tracer. Enabling the function tracer
1020can be done from the debug file system. Make sure the ftrace_enabled is 1104can be done from the debug file system. Make sure the
1021set; otherwise this tracer is a nop. 1105ftrace_enabled is set; otherwise this tracer is a nop.
1022 1106
1023 # sysctl kernel.ftrace_enabled=1 1107 # sysctl kernel.ftrace_enabled=1
1024 # echo function > /debug/tracing/current_tracer 1108 # echo function > /debug/tracing/current_tracer
@@ -1048,14 +1132,15 @@ set; otherwise this tracer is a nop.
1048[...] 1132[...]
1049 1133
1050 1134
1051Note: function tracer uses ring buffers to store the above entries. 1135Note: function tracer uses ring buffers to store the above
1052The newest data may overwrite the oldest data. Sometimes using echo to 1136entries. The newest data may overwrite the oldest data.
1053stop the trace is not sufficient because the tracing could have overwritten 1137Sometimes using echo to stop the trace is not sufficient because
1054the data that you wanted to record. For this reason, it is sometimes better to 1138the tracing could have overwritten the data that you wanted to
1055disable tracing directly from a program. This allows you to stop the 1139record. For this reason, it is sometimes better to disable
1056tracing at the point that you hit the part that you are interested in. 1140tracing directly from a program. This allows you to stop the
1057To disable the tracing directly from a C program, something like following 1141tracing at the point that you hit the part that you are
1058code snippet can be used: 1142interested in. To disable the tracing directly from a C program,
1143something like following code snippet can be used:
1059 1144
1060int trace_fd; 1145int trace_fd;
1061[...] 1146[...]
@@ -1070,10 +1155,10 @@ int main(int argc, char *argv[]) {
1070} 1155}
1071 1156
1072Note: Here we hard coded the path name. The debugfs mount is not 1157Note: Here we hard coded the path name. The debugfs mount is not
1073guaranteed to be at /debug (and is more commonly at /sys/kernel/debug). 1158guaranteed to be at /debug (and is more commonly at
1074For simple one time traces, the above is sufficent. For anything else, 1159/sys/kernel/debug). For simple one time traces, the above is
1075a search through /proc/mounts may be needed to find where the debugfs 1160sufficent. For anything else, a search through /proc/mounts may
1076file-system is mounted. 1161be needed to find where the debugfs file-system is mounted.
1077 1162
1078 1163
1079Single thread tracing 1164Single thread tracing
@@ -1152,49 +1237,297 @@ int main (int argc, char **argv)
1152 return 0; 1237 return 0;
1153} 1238}
1154 1239
1240
1241hw-branch-tracer (x86 only)
1242---------------------------
1243
1244This tracer uses the x86 last branch tracing hardware feature to
1245collect a branch trace on all cpus with relatively low overhead.
1246
1247The tracer uses a fixed-size circular buffer per cpu and only
1248traces ring 0 branches. The trace file dumps that buffer in the
1249following format:
1250
1251# tracer: hw-branch-tracer
1252#
1253# CPU# TO <- FROM
1254 0 scheduler_tick+0xb5/0x1bf <- task_tick_idle+0x5/0x6
1255 2 run_posix_cpu_timers+0x2b/0x72a <- run_posix_cpu_timers+0x25/0x72a
1256 0 scheduler_tick+0x139/0x1bf <- scheduler_tick+0xed/0x1bf
1257 0 scheduler_tick+0x17c/0x1bf <- scheduler_tick+0x148/0x1bf
1258 2 run_posix_cpu_timers+0x9e/0x72a <- run_posix_cpu_timers+0x5e/0x72a
1259 0 scheduler_tick+0x1b6/0x1bf <- scheduler_tick+0x1aa/0x1bf
1260
1261
1262The tracer may be used to dump the trace for the oops'ing cpu on
1263a kernel oops into the system log. To enable this,
1264ftrace_dump_on_oops must be set. To set ftrace_dump_on_oops, one
1265can either use the sysctl function or set it via the proc system
1266interface.
1267
1268 sysctl kernel.ftrace_dump_on_oops=1
1269
1270or
1271
1272 echo 1 > /proc/sys/kernel/ftrace_dump_on_oops
1273
1274
1275Here's an example of such a dump after a null pointer
1276dereference in a kernel module:
1277
1278[57848.105921] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
1279[57848.106019] IP: [<ffffffffa0000006>] open+0x6/0x14 [oops]
1280[57848.106019] PGD 2354e9067 PUD 2375e7067 PMD 0
1281[57848.106019] Oops: 0002 [#1] SMP
1282[57848.106019] last sysfs file: /sys/devices/pci0000:00/0000:00:1e.0/0000:20:05.0/local_cpus
1283[57848.106019] Dumping ftrace buffer:
1284[57848.106019] ---------------------------------
1285[...]
1286[57848.106019] 0 chrdev_open+0xe6/0x165 <- cdev_put+0x23/0x24
1287[57848.106019] 0 chrdev_open+0x117/0x165 <- chrdev_open+0xfa/0x165
1288[57848.106019] 0 chrdev_open+0x120/0x165 <- chrdev_open+0x11c/0x165
1289[57848.106019] 0 chrdev_open+0x134/0x165 <- chrdev_open+0x12b/0x165
1290[57848.106019] 0 open+0x0/0x14 [oops] <- chrdev_open+0x144/0x165
1291[57848.106019] 0 page_fault+0x0/0x30 <- open+0x6/0x14 [oops]
1292[57848.106019] 0 error_entry+0x0/0x5b <- page_fault+0x4/0x30
1293[57848.106019] 0 error_kernelspace+0x0/0x31 <- error_entry+0x59/0x5b
1294[57848.106019] 0 error_sti+0x0/0x1 <- error_kernelspace+0x2d/0x31
1295[57848.106019] 0 page_fault+0x9/0x30 <- error_sti+0x0/0x1
1296[57848.106019] 0 do_page_fault+0x0/0x881 <- page_fault+0x1a/0x30
1297[...]
1298[57848.106019] 0 do_page_fault+0x66b/0x881 <- is_prefetch+0x1ee/0x1f2
1299[57848.106019] 0 do_page_fault+0x6e0/0x881 <- do_page_fault+0x67a/0x881
1300[57848.106019] 0 oops_begin+0x0/0x96 <- do_page_fault+0x6e0/0x881
1301[57848.106019] 0 trace_hw_branch_oops+0x0/0x2d <- oops_begin+0x9/0x96
1302[...]
1303[57848.106019] 0 ds_suspend_bts+0x2a/0xe3 <- ds_suspend_bts+0x1a/0xe3
1304[57848.106019] ---------------------------------
1305[57848.106019] CPU 0
1306[57848.106019] Modules linked in: oops
1307[57848.106019] Pid: 5542, comm: cat Tainted: G W 2.6.28 #23
1308[57848.106019] RIP: 0010:[<ffffffffa0000006>] [<ffffffffa0000006>] open+0x6/0x14 [oops]
1309[57848.106019] RSP: 0018:ffff880235457d48 EFLAGS: 00010246
1310[...]
1311
1312
1313function graph tracer
1314---------------------------
1315
1316This tracer is similar to the function tracer except that it
1317probes a function on its entry and its exit. This is done by
1318using a dynamically allocated stack of return addresses in each
1319task_struct. On function entry the tracer overwrites the return
1320address of each function traced to set a custom probe. Thus the
1321original return address is stored on the stack of return address
1322in the task_struct.
1323
1324Probing on both ends of a function leads to special features
1325such as:
1326
1327- measure of a function's time execution
1328- having a reliable call stack to draw function calls graph
1329
1330This tracer is useful in several situations:
1331
1332- you want to find the reason of a strange kernel behavior and
1333 need to see what happens in detail on any areas (or specific
1334 ones).
1335
1336- you are experiencing weird latencies but it's difficult to
1337 find its origin.
1338
1339- you want to find quickly which path is taken by a specific
1340 function
1341
1342- you just want to peek inside a working kernel and want to see
1343 what happens there.
1344
1345# tracer: function_graph
1346#
1347# CPU DURATION FUNCTION CALLS
1348# | | | | | | |
1349
1350 0) | sys_open() {
1351 0) | do_sys_open() {
1352 0) | getname() {
1353 0) | kmem_cache_alloc() {
1354 0) 1.382 us | __might_sleep();
1355 0) 2.478 us | }
1356 0) | strncpy_from_user() {
1357 0) | might_fault() {
1358 0) 1.389 us | __might_sleep();
1359 0) 2.553 us | }
1360 0) 3.807 us | }
1361 0) 7.876 us | }
1362 0) | alloc_fd() {
1363 0) 0.668 us | _spin_lock();
1364 0) 0.570 us | expand_files();
1365 0) 0.586 us | _spin_unlock();
1366
1367
1368There are several columns that can be dynamically
1369enabled/disabled. You can use every combination of options you
1370want, depending on your needs.
1371
1372- The cpu number on which the function executed is default
1373 enabled. It is sometimes better to only trace one cpu (see
1374 tracing_cpu_mask file) or you might sometimes see unordered
1375 function calls while cpu tracing switch.
1376
1377 hide: echo nofuncgraph-cpu > /debug/tracing/trace_options
1378 show: echo funcgraph-cpu > /debug/tracing/trace_options
1379
1380- The duration (function's time of execution) is displayed on
1381 the closing bracket line of a function or on the same line
1382 than the current function in case of a leaf one. It is default
1383 enabled.
1384
1385 hide: echo nofuncgraph-duration > /debug/tracing/trace_options
1386 show: echo funcgraph-duration > /debug/tracing/trace_options
1387
1388- The overhead field precedes the duration field in case of
1389 reached duration thresholds.
1390
1391 hide: echo nofuncgraph-overhead > /debug/tracing/trace_options
1392 show: echo funcgraph-overhead > /debug/tracing/trace_options
1393 depends on: funcgraph-duration
1394
1395 ie:
1396
1397 0) | up_write() {
1398 0) 0.646 us | _spin_lock_irqsave();
1399 0) 0.684 us | _spin_unlock_irqrestore();
1400 0) 3.123 us | }
1401 0) 0.548 us | fput();
1402 0) + 58.628 us | }
1403
1404 [...]
1405
1406 0) | putname() {
1407 0) | kmem_cache_free() {
1408 0) 0.518 us | __phys_addr();
1409 0) 1.757 us | }
1410 0) 2.861 us | }
1411 0) ! 115.305 us | }
1412 0) ! 116.402 us | }
1413
1414 + means that the function exceeded 10 usecs.
1415 ! means that the function exceeded 100 usecs.
1416
1417
1418- The task/pid field displays the thread cmdline and pid which
1419 executed the function. It is default disabled.
1420
1421 hide: echo nofuncgraph-proc > /debug/tracing/trace_options
1422 show: echo funcgraph-proc > /debug/tracing/trace_options
1423
1424 ie:
1425
1426 # tracer: function_graph
1427 #
1428 # CPU TASK/PID DURATION FUNCTION CALLS
1429 # | | | | | | | | |
1430 0) sh-4802 | | d_free() {
1431 0) sh-4802 | | call_rcu() {
1432 0) sh-4802 | | __call_rcu() {
1433 0) sh-4802 | 0.616 us | rcu_process_gp_end();
1434 0) sh-4802 | 0.586 us | check_for_new_grace_period();
1435 0) sh-4802 | 2.899 us | }
1436 0) sh-4802 | 4.040 us | }
1437 0) sh-4802 | 5.151 us | }
1438 0) sh-4802 | + 49.370 us | }
1439
1440
1441- The absolute time field is an absolute timestamp given by the
1442 system clock since it started. A snapshot of this time is
1443 given on each entry/exit of functions
1444
1445 hide: echo nofuncgraph-abstime > /debug/tracing/trace_options
1446 show: echo funcgraph-abstime > /debug/tracing/trace_options
1447
1448 ie:
1449
1450 #
1451 # TIME CPU DURATION FUNCTION CALLS
1452 # | | | | | | | |
1453 360.774522 | 1) 0.541 us | }
1454 360.774522 | 1) 4.663 us | }
1455 360.774523 | 1) 0.541 us | __wake_up_bit();
1456 360.774524 | 1) 6.796 us | }
1457 360.774524 | 1) 7.952 us | }
1458 360.774525 | 1) 9.063 us | }
1459 360.774525 | 1) 0.615 us | journal_mark_dirty();
1460 360.774527 | 1) 0.578 us | __brelse();
1461 360.774528 | 1) | reiserfs_prepare_for_journal() {
1462 360.774528 | 1) | unlock_buffer() {
1463 360.774529 | 1) | wake_up_bit() {
1464 360.774529 | 1) | bit_waitqueue() {
1465 360.774530 | 1) 0.594 us | __phys_addr();
1466
1467
1468You can put some comments on specific functions by using
1469trace_printk() For example, if you want to put a comment inside
1470the __might_sleep() function, you just have to include
1471<linux/ftrace.h> and call trace_printk() inside __might_sleep()
1472
1473trace_printk("I'm a comment!\n")
1474
1475will produce:
1476
1477 1) | __might_sleep() {
1478 1) | /* I'm a comment! */
1479 1) 1.449 us | }
1480
1481
1482You might find other useful features for this tracer in the
1483following "dynamic ftrace" section such as tracing only specific
1484functions or tasks.
1485
1155dynamic ftrace 1486dynamic ftrace
1156-------------- 1487--------------
1157 1488
1158If CONFIG_DYNAMIC_FTRACE is set, the system will run with 1489If CONFIG_DYNAMIC_FTRACE is set, the system will run with
1159virtually no overhead when function tracing is disabled. The way 1490virtually no overhead when function tracing is disabled. The way
1160this works is the mcount function call (placed at the start of 1491this works is the mcount function call (placed at the start of
1161every kernel function, produced by the -pg switch in gcc), starts 1492every kernel function, produced by the -pg switch in gcc),
1162of pointing to a simple return. (Enabling FTRACE will include the 1493starts of pointing to a simple return. (Enabling FTRACE will
1163-pg switch in the compiling of the kernel.) 1494include the -pg switch in the compiling of the kernel.)
1164 1495
1165At compile time every C file object is run through the 1496At compile time every C file object is run through the
1166recordmcount.pl script (located in the scripts directory). This 1497recordmcount.pl script (located in the scripts directory). This
1167script will process the C object using objdump to find all the 1498script will process the C object using objdump to find all the
1168locations in the .text section that call mcount. (Note, only 1499locations in the .text section that call mcount. (Note, only the
1169the .text section is processed, since processing other sections 1500.text section is processed, since processing other sections like
1170like .init.text may cause races due to those sections being freed). 1501.init.text may cause races due to those sections being freed).
1171 1502
1172A new section called "__mcount_loc" is created that holds references 1503A new section called "__mcount_loc" is created that holds
1173to all the mcount call sites in the .text section. This section is 1504references to all the mcount call sites in the .text section.
1174compiled back into the original object. The final linker will add 1505This section is compiled back into the original object. The
1175all these references into a single table. 1506final linker will add all these references into a single table.
1176 1507
1177On boot up, before SMP is initialized, the dynamic ftrace code 1508On boot up, before SMP is initialized, the dynamic ftrace code
1178scans this table and updates all the locations into nops. It also 1509scans this table and updates all the locations into nops. It
1179records the locations, which are added to the available_filter_functions 1510also records the locations, which are added to the
1180list. Modules are processed as they are loaded and before they are 1511available_filter_functions list. Modules are processed as they
1181executed. When a module is unloaded, it also removes its functions from 1512are loaded and before they are executed. When a module is
1182the ftrace function list. This is automatic in the module unload 1513unloaded, it also removes its functions from the ftrace function
1183code, and the module author does not need to worry about it. 1514list. This is automatic in the module unload code, and the
1184 1515module author does not need to worry about it.
1185When tracing is enabled, kstop_machine is called to prevent races 1516
1186with the CPUS executing code being modified (which can cause the 1517When tracing is enabled, kstop_machine is called to prevent
1187CPU to do undesireable things), and the nops are patched back 1518races with the CPUS executing code being modified (which can
1188to calls. But this time, they do not call mcount (which is just 1519cause the CPU to do undesireable things), and the nops are
1189a function stub). They now call into the ftrace infrastructure. 1520patched back to calls. But this time, they do not call mcount
1521(which is just a function stub). They now call into the ftrace
1522infrastructure.
1190 1523
1191One special side-effect to the recording of the functions being 1524One special side-effect to the recording of the functions being
1192traced is that we can now selectively choose which functions we 1525traced is that we can now selectively choose which functions we
1193wish to trace and which ones we want the mcount calls to remain as 1526wish to trace and which ones we want the mcount calls to remain
1194nops. 1527as nops.
1195 1528
1196Two files are used, one for enabling and one for disabling the tracing 1529Two files are used, one for enabling and one for disabling the
1197of specified functions. They are: 1530tracing of specified functions. They are:
1198 1531
1199 set_ftrace_filter 1532 set_ftrace_filter
1200 1533
@@ -1202,8 +1535,8 @@ and
1202 1535
1203 set_ftrace_notrace 1536 set_ftrace_notrace
1204 1537
1205A list of available functions that you can add to these files is listed 1538A list of available functions that you can add to these files is
1206in: 1539listed in:
1207 1540
1208 available_filter_functions 1541 available_filter_functions
1209 1542
@@ -1240,8 +1573,8 @@ hrtimer_interrupt
1240sys_nanosleep 1573sys_nanosleep
1241 1574
1242 1575
1243Perhaps this is not enough. The filters also allow simple wild cards. 1576Perhaps this is not enough. The filters also allow simple wild
1244Only the following are currently available 1577cards. Only the following are currently available
1245 1578
1246 <match>* - will match functions that begin with <match> 1579 <match>* - will match functions that begin with <match>
1247 *<match> - will match functions that end with <match> 1580 *<match> - will match functions that end with <match>
@@ -1251,9 +1584,9 @@ These are the only wild cards which are supported.
1251 1584
1252 <match>*<match> will not work. 1585 <match>*<match> will not work.
1253 1586
1254Note: It is better to use quotes to enclose the wild cards, otherwise 1587Note: It is better to use quotes to enclose the wild cards,
1255 the shell may expand the parameters into names of files in the local 1588 otherwise the shell may expand the parameters into names
1256 directory. 1589 of files in the local directory.
1257 1590
1258 # echo 'hrtimer_*' > /debug/tracing/set_ftrace_filter 1591 # echo 'hrtimer_*' > /debug/tracing/set_ftrace_filter
1259 1592
@@ -1299,7 +1632,8 @@ This is because the '>' and '>>' act just like they do in bash.
1299To rewrite the filters, use '>' 1632To rewrite the filters, use '>'
1300To append to the filters, use '>>' 1633To append to the filters, use '>>'
1301 1634
1302To clear out a filter so that all functions will be recorded again: 1635To clear out a filter so that all functions will be recorded
1636again:
1303 1637
1304 # echo > /debug/tracing/set_ftrace_filter 1638 # echo > /debug/tracing/set_ftrace_filter
1305 # cat /debug/tracing/set_ftrace_filter 1639 # cat /debug/tracing/set_ftrace_filter
@@ -1331,7 +1665,8 @@ hrtimer_get_res
1331hrtimer_init_sleeper 1665hrtimer_init_sleeper
1332 1666
1333 1667
1334The set_ftrace_notrace prevents those functions from being traced. 1668The set_ftrace_notrace prevents those functions from being
1669traced.
1335 1670
1336 # echo '*preempt*' '*lock*' > /debug/tracing/set_ftrace_notrace 1671 # echo '*preempt*' '*lock*' > /debug/tracing/set_ftrace_notrace
1337 1672
@@ -1353,13 +1688,75 @@ Produces:
1353 1688
1354We can see that there's no more lock or preempt tracing. 1689We can see that there's no more lock or preempt tracing.
1355 1690
1691
1692Dynamic ftrace with the function graph tracer
1693---------------------------------------------
1694
1695Although what has been explained above concerns both the
1696function tracer and the function-graph-tracer, there are some
1697special features only available in the function-graph tracer.
1698
1699If you want to trace only one function and all of its children,
1700you just have to echo its name into set_graph_function:
1701
1702 echo __do_fault > set_graph_function
1703
1704will produce the following "expanded" trace of the __do_fault()
1705function:
1706
1707 0) | __do_fault() {
1708 0) | filemap_fault() {
1709 0) | find_lock_page() {
1710 0) 0.804 us | find_get_page();
1711 0) | __might_sleep() {
1712 0) 1.329 us | }
1713 0) 3.904 us | }
1714 0) 4.979 us | }
1715 0) 0.653 us | _spin_lock();
1716 0) 0.578 us | page_add_file_rmap();
1717 0) 0.525 us | native_set_pte_at();
1718 0) 0.585 us | _spin_unlock();
1719 0) | unlock_page() {
1720 0) 0.541 us | page_waitqueue();
1721 0) 0.639 us | __wake_up_bit();
1722 0) 2.786 us | }
1723 0) + 14.237 us | }
1724 0) | __do_fault() {
1725 0) | filemap_fault() {
1726 0) | find_lock_page() {
1727 0) 0.698 us | find_get_page();
1728 0) | __might_sleep() {
1729 0) 1.412 us | }
1730 0) 3.950 us | }
1731 0) 5.098 us | }
1732 0) 0.631 us | _spin_lock();
1733 0) 0.571 us | page_add_file_rmap();
1734 0) 0.526 us | native_set_pte_at();
1735 0) 0.586 us | _spin_unlock();
1736 0) | unlock_page() {
1737 0) 0.533 us | page_waitqueue();
1738 0) 0.638 us | __wake_up_bit();
1739 0) 2.793 us | }
1740 0) + 14.012 us | }
1741
1742You can also expand several functions at once:
1743
1744 echo sys_open > set_graph_function
1745 echo sys_close >> set_graph_function
1746
1747Now if you want to go back to trace all functions you can clear
1748this special filter via:
1749
1750 echo > set_graph_function
1751
1752
1356trace_pipe 1753trace_pipe
1357---------- 1754----------
1358 1755
1359The trace_pipe outputs the same content as the trace file, but the effect 1756The trace_pipe outputs the same content as the trace file, but
1360on the tracing is different. Every read from trace_pipe is consumed. 1757the effect on the tracing is different. Every read from
1361This means that subsequent reads will be different. The trace 1758trace_pipe is consumed. This means that subsequent reads will be
1362is live. 1759different. The trace is live.
1363 1760
1364 # echo function > /debug/tracing/current_tracer 1761 # echo function > /debug/tracing/current_tracer
1365 # cat /debug/tracing/trace_pipe > /tmp/trace.out & 1762 # cat /debug/tracing/trace_pipe > /tmp/trace.out &
@@ -1387,38 +1784,45 @@ is live.
1387 bash-4043 [00] 41.267111: select_task_rq_rt <-try_to_wake_up 1784 bash-4043 [00] 41.267111: select_task_rq_rt <-try_to_wake_up
1388 1785
1389 1786
1390Note, reading the trace_pipe file will block until more input is added. 1787Note, reading the trace_pipe file will block until more input is
1391By changing the tracer, trace_pipe will issue an EOF. We needed 1788added. By changing the tracer, trace_pipe will issue an EOF. We
1392to set the function tracer _before_ we "cat" the trace_pipe file. 1789needed to set the function tracer _before_ we "cat" the
1790trace_pipe file.
1393 1791
1394 1792
1395trace entries 1793trace entries
1396------------- 1794-------------
1397 1795
1398Having too much or not enough data can be troublesome in diagnosing 1796Having too much or not enough data can be troublesome in
1399an issue in the kernel. The file buffer_size_kb is used to modify 1797diagnosing an issue in the kernel. The file buffer_size_kb is
1400the size of the internal trace buffers. The number listed 1798used to modify the size of the internal trace buffers. The
1401is the number of entries that can be recorded per CPU. To know 1799number listed is the number of entries that can be recorded per
1402the full size, multiply the number of possible CPUS with the 1800CPU. To know the full size, multiply the number of possible CPUS
1403number of entries. 1801with the number of entries.
1404 1802
1405 # cat /debug/tracing/buffer_size_kb 1803 # cat /debug/tracing/buffer_size_kb
14061408 (units kilobytes) 18041408 (units kilobytes)
1407 1805
1408Note, to modify this, you must have tracing completely disabled. To do that, 1806Note, to modify this, you must have tracing completely disabled.
1409echo "nop" into the current_tracer. If the current_tracer is not set 1807To do that, echo "nop" into the current_tracer. If the
1410to "nop", an EINVAL error will be returned. 1808current_tracer is not set to "nop", an EINVAL error will be
1809returned.
1411 1810
1412 # echo nop > /debug/tracing/current_tracer 1811 # echo nop > /debug/tracing/current_tracer
1413 # echo 10000 > /debug/tracing/buffer_size_kb 1812 # echo 10000 > /debug/tracing/buffer_size_kb
1414 # cat /debug/tracing/buffer_size_kb 1813 # cat /debug/tracing/buffer_size_kb
141510000 (units kilobytes) 181410000 (units kilobytes)
1416 1815
1417The number of pages which will be allocated is limited to a percentage 1816The number of pages which will be allocated is limited to a
1418of available memory. Allocating too much will produce an error. 1817percentage of available memory. Allocating too much will produce
1818an error.
1419 1819
1420 # echo 1000000000000 > /debug/tracing/buffer_size_kb 1820 # echo 1000000000000 > /debug/tracing/buffer_size_kb
1421-bash: echo: write error: Cannot allocate memory 1821-bash: echo: write error: Cannot allocate memory
1422 # cat /debug/tracing/buffer_size_kb 1822 # cat /debug/tracing/buffer_size_kb
142385 182385
1424 1824
1825-----------
1826
1827More details can be found in the source code, in the
1828kernel/tracing/*.c files.
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 54f21a5c262b..7643483bdd6a 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -49,6 +49,7 @@ parameter is applicable:
49 ISAPNP ISA PnP code is enabled. 49 ISAPNP ISA PnP code is enabled.
50 ISDN Appropriate ISDN support is enabled. 50 ISDN Appropriate ISDN support is enabled.
51 JOY Appropriate joystick support is enabled. 51 JOY Appropriate joystick support is enabled.
52 KMEMTRACE kmemtrace is enabled.
52 LIBATA Libata driver is enabled 53 LIBATA Libata driver is enabled
53 LP Printer support is enabled. 54 LP Printer support is enabled.
54 LOOP Loopback device support is enabled. 55 LOOP Loopback device support is enabled.
@@ -1047,6 +1048,15 @@ and is between 256 and 4096 characters. It is defined in the file
1047 use the HighMem zone if it exists, and the Normal 1048 use the HighMem zone if it exists, and the Normal
1048 zone if it does not. 1049 zone if it does not.
1049 1050
1051 kmemtrace.enable= [KNL,KMEMTRACE] Format: { yes | no }
1052 Controls whether kmemtrace is enabled
1053 at boot-time.
1054
1055 kmemtrace.subbufs=n [KNL,KMEMTRACE] Overrides the number of
1056 subbufs kmemtrace's relay channel has. Set this
1057 higher than default (KMEMTRACE_N_SUBBUFS in code) if
1058 you experience buffer overruns.
1059
1050 movablecore=nn[KMG] [KNL,X86-32,IA-64,PPC,X86-64] This parameter 1060 movablecore=nn[KMG] [KNL,X86-32,IA-64,PPC,X86-64] This parameter
1051 is similar to kernelcore except it specifies the 1061 is similar to kernelcore except it specifies the
1052 amount of memory used for migratable allocations. 1062 amount of memory used for migratable allocations.
@@ -1310,8 +1320,13 @@ and is between 256 and 4096 characters. It is defined in the file
1310 1320
1311 memtest= [KNL,X86] Enable memtest 1321 memtest= [KNL,X86] Enable memtest
1312 Format: <integer> 1322 Format: <integer>
1313 range: 0,4 : pattern number
1314 default : 0 <disable> 1323 default : 0 <disable>
1324 Specifies the number of memtest passes to be
1325 performed. Each pass selects another test
1326 pattern from a given set of patterns. Memtest
1327 fills the memory with this pattern, validates
1328 memory contents and reserves bad memory
1329 regions that are detected.
1315 1330
1316 meye.*= [HW] Set MotionEye Camera parameters 1331 meye.*= [HW] Set MotionEye Camera parameters
1317 See Documentation/video4linux/meye.txt. 1332 See Documentation/video4linux/meye.txt.
@@ -2329,6 +2344,8 @@ and is between 256 and 4096 characters. It is defined in the file
2329 2344
2330 tp720= [HW,PS2] 2345 tp720= [HW,PS2]
2331 2346
2347 trace_buf_size=nn[KMG] [ftrace] will set tracing buffer size.
2348
2332 trix= [HW,OSS] MediaTrix AudioTrix Pro 2349 trix= [HW,OSS] MediaTrix AudioTrix Pro
2333 Format: 2350 Format:
2334 <io>,<irq>,<dma>,<dma2>,<sb_io>,<sb_irq>,<sb_dma>,<mpu_io>,<mpu_irq> 2351 <io>,<irq>,<dma>,<dma2>,<sb_io>,<sb_irq>,<sb_dma>,<mpu_io>,<mpu_irq>
diff --git a/Documentation/lockdep-design.txt b/Documentation/lockdep-design.txt
index 488773018152..938ea22f2cc0 100644
--- a/Documentation/lockdep-design.txt
+++ b/Documentation/lockdep-design.txt
@@ -27,33 +27,37 @@ lock-class.
27State 27State
28----- 28-----
29 29
30The validator tracks lock-class usage history into 5 separate state bits: 30The validator tracks lock-class usage history into 4n + 1 separate state bits:
31 31
32- 'ever held in hardirq context' [ == hardirq-safe ] 32- 'ever held in STATE context'
33- 'ever held in softirq context' [ == softirq-safe ] 33- 'ever head as readlock in STATE context'
34- 'ever held with hardirqs enabled' [ == hardirq-unsafe ] 34- 'ever head with STATE enabled'
35- 'ever held with softirqs and hardirqs enabled' [ == softirq-unsafe ] 35- 'ever head as readlock with STATE enabled'
36
37Where STATE can be either one of (kernel/lockdep_states.h)
38 - hardirq
39 - softirq
40 - reclaim_fs
36 41
37- 'ever used' [ == !unused ] 42- 'ever used' [ == !unused ]
38 43
39When locking rules are violated, these 4 state bits are presented in the 44When locking rules are violated, these state bits are presented in the
40locking error messages, inside curlies. A contrived example: 45locking error messages, inside curlies. A contrived example:
41 46
42 modprobe/2287 is trying to acquire lock: 47 modprobe/2287 is trying to acquire lock:
43 (&sio_locks[i].lock){--..}, at: [<c02867fd>] mutex_lock+0x21/0x24 48 (&sio_locks[i].lock){-.-...}, at: [<c02867fd>] mutex_lock+0x21/0x24
44 49
45 but task is already holding lock: 50 but task is already holding lock:
46 (&sio_locks[i].lock){--..}, at: [<c02867fd>] mutex_lock+0x21/0x24 51 (&sio_locks[i].lock){-.-...}, at: [<c02867fd>] mutex_lock+0x21/0x24
47 52
48 53
49The bit position indicates hardirq, softirq, hardirq-read, 54The bit position indicates STATE, STATE-read, for each of the states listed
50softirq-read respectively, and the character displayed in each 55above, and the character displayed in each indicates:
51indicates:
52 56
53 '.' acquired while irqs disabled 57 '.' acquired while irqs disabled
54 '+' acquired in irq context 58 '+' acquired in irq context
55 '-' acquired with irqs enabled 59 '-' acquired with irqs enabled
56 '?' read acquired in irq context with irqs enabled. 60 '?' acquired in irq context with irqs enabled.
57 61
58Unused mutexes cannot be part of the cause of an error. 62Unused mutexes cannot be part of the cause of an error.
59 63
diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
index 9e592c718afb..535aeb936dbc 100644
--- a/Documentation/sysrq.txt
+++ b/Documentation/sysrq.txt
@@ -113,6 +113,8 @@ On all - write a character to /proc/sysrq-trigger. e.g.:
113 113
114'x' - Used by xmon interface on ppc/powerpc platforms. 114'x' - Used by xmon interface on ppc/powerpc platforms.
115 115
116'z' - Dump the ftrace buffer
117
116'0'-'9' - Sets the console log level, controlling which kernel messages 118'0'-'9' - Sets the console log level, controlling which kernel messages
117 will be printed to your console. ('0', for example would make 119 will be printed to your console. ('0', for example would make
118 it so that only emergency messages like PANICs or OOPSes would 120 it so that only emergency messages like PANICs or OOPSes would
diff --git a/Documentation/tracepoints.txt b/Documentation/tracepoints.txt
index 6f0a044f5b5e..4ff43c6de299 100644
--- a/Documentation/tracepoints.txt
+++ b/Documentation/tracepoints.txt
@@ -45,8 +45,8 @@ In include/trace/subsys.h :
45#include <linux/tracepoint.h> 45#include <linux/tracepoint.h>
46 46
47DECLARE_TRACE(subsys_eventname, 47DECLARE_TRACE(subsys_eventname,
48 TPPROTO(int firstarg, struct task_struct *p), 48 TP_PROTO(int firstarg, struct task_struct *p),
49 TPARGS(firstarg, p)); 49 TP_ARGS(firstarg, p));
50 50
51In subsys/file.c (where the tracing statement must be added) : 51In subsys/file.c (where the tracing statement must be added) :
52 52
@@ -66,10 +66,10 @@ Where :
66 - subsys is the name of your subsystem. 66 - subsys is the name of your subsystem.
67 - eventname is the name of the event to trace. 67 - eventname is the name of the event to trace.
68 68
69- TPPROTO(int firstarg, struct task_struct *p) is the prototype of the 69- TP_PROTO(int firstarg, struct task_struct *p) is the prototype of the
70 function called by this tracepoint. 70 function called by this tracepoint.
71 71
72- TPARGS(firstarg, p) are the parameters names, same as found in the 72- TP_ARGS(firstarg, p) are the parameters names, same as found in the
73 prototype. 73 prototype.
74 74
75Connecting a function (probe) to a tracepoint is done by providing a 75Connecting a function (probe) to a tracepoint is done by providing a
diff --git a/Documentation/vm/kmemtrace.txt b/Documentation/vm/kmemtrace.txt
new file mode 100644
index 000000000000..a956d9b7f943
--- /dev/null
+++ b/Documentation/vm/kmemtrace.txt
@@ -0,0 +1,126 @@
1 kmemtrace - Kernel Memory Tracer
2
3 by Eduard - Gabriel Munteanu
4 <eduard.munteanu@linux360.ro>
5
6I. Introduction
7===============
8
9kmemtrace helps kernel developers figure out two things:
101) how different allocators (SLAB, SLUB etc.) perform
112) how kernel code allocates memory and how much
12
13To do this, we trace every allocation and export information to the userspace
14through the relay interface. We export things such as the number of requested
15bytes, the number of bytes actually allocated (i.e. including internal
16fragmentation), whether this is a slab allocation or a plain kmalloc() and so
17on.
18
19The actual analysis is performed by a userspace tool (see section III for
20details on where to get it from). It logs the data exported by the kernel,
21processes it and (as of writing this) can provide the following information:
22- the total amount of memory allocated and fragmentation per call-site
23- the amount of memory allocated and fragmentation per allocation
24- total memory allocated and fragmentation in the collected dataset
25- number of cross-CPU allocation and frees (makes sense in NUMA environments)
26
27Moreover, it can potentially find inconsistent and erroneous behavior in
28kernel code, such as using slab free functions on kmalloc'ed memory or
29allocating less memory than requested (but not truly failed allocations).
30
31kmemtrace also makes provisions for tracing on some arch and analysing the
32data on another.
33
34II. Design and goals
35====================
36
37kmemtrace was designed to handle rather large amounts of data. Thus, it uses
38the relay interface to export whatever is logged to userspace, which then
39stores it. Analysis and reporting is done asynchronously, that is, after the
40data is collected and stored. By design, it allows one to log and analyse
41on different machines and different arches.
42
43As of writing this, the ABI is not considered stable, though it might not
44change much. However, no guarantees are made about compatibility yet. When
45deemed stable, the ABI should still allow easy extension while maintaining
46backward compatibility. This is described further in Documentation/ABI.
47
48Summary of design goals:
49 - allow logging and analysis to be done across different machines
50 - be fast and anticipate usage in high-load environments (*)
51 - be reasonably extensible
52 - make it possible for GNU/Linux distributions to have kmemtrace
53 included in their repositories
54
55(*) - one of the reasons Pekka Enberg's original userspace data analysis
56 tool's code was rewritten from Perl to C (although this is more than a
57 simple conversion)
58
59
60III. Quick usage guide
61======================
62
631) Get a kernel that supports kmemtrace and build it accordingly (i.e. enable
64CONFIG_KMEMTRACE).
65
662) Get the userspace tool and build it:
67$ git-clone git://repo.or.cz/kmemtrace-user.git # current repository
68$ cd kmemtrace-user/
69$ ./autogen.sh
70$ ./configure
71$ make
72
733) Boot the kmemtrace-enabled kernel if you haven't, preferably in the
74'single' runlevel (so that relay buffers don't fill up easily), and run
75kmemtrace:
76# '$' does not mean user, but root here.
77$ mount -t debugfs none /sys/kernel/debug
78$ mount -t proc none /proc
79$ cd path/to/kmemtrace-user/
80$ ./kmemtraced
81Wait a bit, then stop it with CTRL+C.
82$ cat /sys/kernel/debug/kmemtrace/total_overruns # Check if we didn't
83 # overrun, should
84 # be zero.
85$ (Optionally) [Run kmemtrace_check separately on each cpu[0-9]*.out file to
86 check its correctness]
87$ ./kmemtrace-report
88
89Now you should have a nice and short summary of how the allocator performs.
90
91IV. FAQ and known issues
92========================
93
94Q: 'cat /sys/kernel/debug/kmemtrace/total_overruns' is non-zero, how do I fix
95this? Should I worry?
96A: If it's non-zero, this affects kmemtrace's accuracy, depending on how
97large the number is. You can fix it by supplying a higher
98'kmemtrace.subbufs=N' kernel parameter.
99---
100
101Q: kmemtrace_check reports errors, how do I fix this? Should I worry?
102A: This is a bug and should be reported. It can occur for a variety of
103reasons:
104 - possible bugs in relay code
105 - possible misuse of relay by kmemtrace
106 - timestamps being collected unorderly
107Or you may fix it yourself and send us a patch.
108---
109
110Q: kmemtrace_report shows many errors, how do I fix this? Should I worry?
111A: This is a known issue and I'm working on it. These might be true errors
112in kernel code, which may have inconsistent behavior (e.g. allocating memory
113with kmem_cache_alloc() and freeing it with kfree()). Pekka Enberg pointed
114out this behavior may work with SLAB, but may fail with other allocators.
115
116It may also be due to lack of tracing in some unusual allocator functions.
117
118We don't want bug reports regarding this issue yet.
119---
120
121V. See also
122===========
123
124Documentation/kernel-parameters.txt
125Documentation/ABI/testing/debugfs-kmemtrace
126
diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt
index 7b4596ac4120..e0203662f9e9 100644
--- a/Documentation/x86/boot.txt
+++ b/Documentation/x86/boot.txt
@@ -158,7 +158,7 @@ Offset Proto Name Meaning
1580202/4 2.00+ header Magic signature "HdrS" 1580202/4 2.00+ header Magic signature "HdrS"
1590206/2 2.00+ version Boot protocol version supported 1590206/2 2.00+ version Boot protocol version supported
1600208/4 2.00+ realmode_swtch Boot loader hook (see below) 1600208/4 2.00+ realmode_swtch Boot loader hook (see below)
161020C/2 2.00+ start_sys The load-low segment (0x1000) (obsolete) 161020C/2 2.00+ start_sys_seg The load-low segment (0x1000) (obsolete)
162020E/2 2.00+ kernel_version Pointer to kernel version string 162020E/2 2.00+ kernel_version Pointer to kernel version string
1630210/1 2.00+ type_of_loader Boot loader identifier 1630210/1 2.00+ type_of_loader Boot loader identifier
1640211/1 2.00+ loadflags Boot protocol option flags 1640211/1 2.00+ loadflags Boot protocol option flags
@@ -170,10 +170,11 @@ Offset Proto Name Meaning
1700224/2 2.01+ heap_end_ptr Free memory after setup end 1700224/2 2.01+ heap_end_ptr Free memory after setup end
1710226/2 N/A pad1 Unused 1710226/2 N/A pad1 Unused
1720228/4 2.02+ cmd_line_ptr 32-bit pointer to the kernel command line 1720228/4 2.02+ cmd_line_ptr 32-bit pointer to the kernel command line
173022C/4 2.03+ initrd_addr_max Highest legal initrd address 173022C/4 2.03+ ramdisk_max Highest legal initrd address
1740230/4 2.05+ kernel_alignment Physical addr alignment required for kernel 1740230/4 2.05+ kernel_alignment Physical addr alignment required for kernel
1750234/1 2.05+ relocatable_kernel Whether kernel is relocatable or not 1750234/1 2.05+ relocatable_kernel Whether kernel is relocatable or not
1760235/3 N/A pad2 Unused 1760235/1 N/A pad2 Unused
1770236/2 N/A pad3 Unused
1770238/4 2.06+ cmdline_size Maximum size of the kernel command line 1780238/4 2.06+ cmdline_size Maximum size of the kernel command line
178023C/4 2.07+ hardware_subarch Hardware subarchitecture 179023C/4 2.07+ hardware_subarch Hardware subarchitecture
1790240/8 2.07+ hardware_subarch_data Subarchitecture-specific data 1800240/8 2.07+ hardware_subarch_data Subarchitecture-specific data
@@ -299,14 +300,14 @@ Protocol: 2.00+
299 e.g. 0x0204 for version 2.04, and 0x0a11 for a hypothetical version 300 e.g. 0x0204 for version 2.04, and 0x0a11 for a hypothetical version
300 10.17. 301 10.17.
301 302
302Field name: readmode_swtch 303Field name: realmode_swtch
303Type: modify (optional) 304Type: modify (optional)
304Offset/size: 0x208/4 305Offset/size: 0x208/4
305Protocol: 2.00+ 306Protocol: 2.00+
306 307
307 Boot loader hook (see ADVANCED BOOT LOADER HOOKS below.) 308 Boot loader hook (see ADVANCED BOOT LOADER HOOKS below.)
308 309
309Field name: start_sys 310Field name: start_sys_seg
310Type: read 311Type: read
311Offset/size: 0x20c/2 312Offset/size: 0x20c/2
312Protocol: 2.00+ 313Protocol: 2.00+
@@ -468,7 +469,7 @@ Protocol: 2.02+
468 zero, the kernel will assume that your boot loader does not support 469 zero, the kernel will assume that your boot loader does not support
469 the 2.02+ protocol. 470 the 2.02+ protocol.
470 471
471Field name: initrd_addr_max 472Field name: ramdisk_max
472Type: read 473Type: read
473Offset/size: 0x22c/4 474Offset/size: 0x22c/4
474Protocol: 2.03+ 475Protocol: 2.03+
@@ -542,7 +543,10 @@ Protocol: 2.08+
542 543
543 The payload may be compressed. The format of both the compressed and 544 The payload may be compressed. The format of both the compressed and
544 uncompressed data should be determined using the standard magic 545 uncompressed data should be determined using the standard magic
545 numbers. Currently only gzip compressed ELF is used. 546 numbers. The currently supported compression formats are gzip
547 (magic numbers 1F 8B or 1F 9E), bzip2 (magic number 42 5A) and LZMA
548 (magic number 5D 00). The uncompressed payload is currently always ELF
549 (magic number 7F 45 4C 46).
546 550
547Field name: payload_length 551Field name: payload_length
548Type: read 552Type: read
diff --git a/Documentation/x86/earlyprintk.txt b/Documentation/x86/earlyprintk.txt
new file mode 100644
index 000000000000..607b1a016064
--- /dev/null
+++ b/Documentation/x86/earlyprintk.txt
@@ -0,0 +1,101 @@
1
2Mini-HOWTO for using the earlyprintk=dbgp boot option with a
3USB2 Debug port key and a debug cable, on x86 systems.
4
5You need two computers, the 'USB debug key' special gadget and
6and two USB cables, connected like this:
7
8 [host/target] <-------> [USB debug key] <-------> [client/console]
9
101. There are three specific hardware requirements:
11
12 a.) Host/target system needs to have USB debug port capability.
13
14 You can check this capability by looking at a 'Debug port' bit in
15 the lspci -vvv output:
16
17 # lspci -vvv
18 ...
19 00:1d.7 USB Controller: Intel Corporation 82801H (ICH8 Family) USB2 EHCI Controller #1 (rev 03) (prog-if 20 [EHCI])
20 Subsystem: Lenovo ThinkPad T61
21 Control: I/O- Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR+ FastB2B- DisINTx-
22 Status: Cap+ 66MHz- UDF- FastB2B+ ParErr- DEVSEL=medium >TAbort- <TAbort- <MAbort- >SERR- <PERR- INTx-
23 Latency: 0
24 Interrupt: pin D routed to IRQ 19
25 Region 0: Memory at fe227000 (32-bit, non-prefetchable) [size=1K]
26 Capabilities: [50] Power Management version 2
27 Flags: PMEClk- DSI- D1- D2- AuxCurrent=375mA PME(D0+,D1-,D2-,D3hot+,D3cold+)
28 Status: D0 PME-Enable- DSel=0 DScale=0 PME+
29 Capabilities: [58] Debug port: BAR=1 offset=00a0
30 ^^^^^^^^^^^ <==================== [ HERE ]
31 Kernel driver in use: ehci_hcd
32 Kernel modules: ehci-hcd
33 ...
34
35( If your system does not list a debug port capability then you probably
36 wont be able to use the USB debug key. )
37
38 b.) You also need a Netchip USB debug cable/key:
39
40 http://www.plxtech.com/products/NET2000/NET20DC/default.asp
41
42 This is a small blue plastic connector with two USB connections,
43 it draws power from its USB connections.
44
45 c.) Thirdly, you need a second client/console system with a regular USB port.
46
472. Software requirements:
48
49 a.) On the host/target system:
50
51 You need to enable the following kernel config option:
52
53 CONFIG_EARLY_PRINTK_DBGP=y
54
55 And you need to add the boot command line: "earlyprintk=dbgp".
56 (If you are using Grub, append it to the 'kernel' line in
57 /etc/grub.conf)
58
59 NOTE: normally earlyprintk console gets turned off once the
60 regular console is alive - use "earlyprintk=dbgp,keep" to keep
61 this channel open beyond early bootup. This can be useful for
62 debugging crashes under Xorg, etc.
63
64 b.) On the client/console system:
65
66 You should enable the following kernel config option:
67
68 CONFIG_USB_SERIAL_DEBUG=y
69
70 On the next bootup with the modified kernel you should
71 get a /dev/ttyUSBx device(s).
72
73 Now this channel of kernel messages is ready to be used: start
74 your favorite terminal emulator (minicom, etc.) and set
75 it up to use /dev/ttyUSB0 - or use a raw 'cat /dev/ttyUSBx' to
76 see the raw output.
77
78 c.) On Nvidia Southbridge based systems: the kernel will try to probe
79 and find out which port has debug device connected.
80
813. Testing that it works fine:
82
83 You can test the output by using earlyprintk=dbgp,keep and provoking
84 kernel messages on the host/target system. You can provoke a harmless
85 kernel message by for example doing:
86
87 echo h > /proc/sysrq-trigger
88
89 On the host/target system you should see this help line in "dmesg" output:
90
91 SysRq : HELP : loglevel(0-9) reBoot Crashdump terminate-all-tasks(E) memory-full-oom-kill(F) kill-all-tasks(I) saK show-backtrace-all-active-cpus(L) show-memory-usage(M) nice-all-RT-tasks(N) powerOff show-registers(P) show-all-timers(Q) unRaw Sync show-task-states(T) Unmount show-blocked-tasks(W) dump-ftrace-buffer(Z)
92
93 On the client/console system do:
94
95 cat /dev/ttyUSB0
96
97 And you should see the help line above displayed shortly after you've
98 provoked it on the host system.
99
100If it does not work then please ask about it on the linux-kernel@vger.kernel.org
101mailing list or contact the x86 maintainers.
diff --git a/MAINTAINERS b/MAINTAINERS
index 5d460c9d1c2c..dd3c11c4c3d2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2621,6 +2621,12 @@ M: jason.wessel@windriver.com
2621L: kgdb-bugreport@lists.sourceforge.net 2621L: kgdb-bugreport@lists.sourceforge.net
2622S: Maintained 2622S: Maintained
2623 2623
2624KMEMTRACE
2625P: Eduard - Gabriel Munteanu
2626M: eduard.munteanu@linux360.ro
2627L: linux-kernel@vger.kernel.org
2628S: Maintained
2629
2624KPROBES 2630KPROBES
2625P: Ananth N Mavinakayanahalli 2631P: Ananth N Mavinakayanahalli
2626M: ananth@in.ibm.com 2632M: ananth@in.ibm.com
diff --git a/Makefile b/Makefile
index a2c2efe9e82e..828028d4a448 100644
--- a/Makefile
+++ b/Makefile
@@ -533,8 +533,9 @@ KBUILD_CFLAGS += $(call cc-option,-Wframe-larger-than=${CONFIG_FRAME_WARN})
533endif 533endif
534 534
535# Force gcc to behave correct even for buggy distributions 535# Force gcc to behave correct even for buggy distributions
536# Arch Makefiles may override this setting 536ifndef CONFIG_CC_STACKPROTECTOR
537KBUILD_CFLAGS += $(call cc-option, -fno-stack-protector) 537KBUILD_CFLAGS += $(call cc-option, -fno-stack-protector)
538endif
538 539
539ifdef CONFIG_FRAME_POINTER 540ifdef CONFIG_FRAME_POINTER
540KBUILD_CFLAGS += -fno-omit-frame-pointer -fno-optimize-sibling-calls 541KBUILD_CFLAGS += -fno-omit-frame-pointer -fno-optimize-sibling-calls
diff --git a/arch/Kconfig b/arch/Kconfig
index 550dab22daa1..a092dc77c24d 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -6,6 +6,7 @@ config OPROFILE
6 tristate "OProfile system profiling (EXPERIMENTAL)" 6 tristate "OProfile system profiling (EXPERIMENTAL)"
7 depends on PROFILING 7 depends on PROFILING
8 depends on HAVE_OPROFILE 8 depends on HAVE_OPROFILE
9 depends on TRACING_SUPPORT
9 select TRACING 10 select TRACING
10 select RING_BUFFER 11 select RING_BUFFER
11 help 12 help
diff --git a/arch/alpha/include/asm/ftrace.h b/arch/alpha/include/asm/ftrace.h
new file mode 100644
index 000000000000..40a8c178f10d
--- /dev/null
+++ b/arch/alpha/include/asm/ftrace.h
@@ -0,0 +1 @@
/* empty */
diff --git a/arch/alpha/include/asm/hardirq.h b/arch/alpha/include/asm/hardirq.h
index d953e234daa8..88971460fa6c 100644
--- a/arch/alpha/include/asm/hardirq.h
+++ b/arch/alpha/include/asm/hardirq.h
@@ -14,17 +14,4 @@ typedef struct {
14 14
15void ack_bad_irq(unsigned int irq); 15void ack_bad_irq(unsigned int irq);
16 16
17#define HARDIRQ_BITS 12
18
19/*
20 * The hardirq mask has to be large enough to have
21 * space for potentially nestable IRQ sources in the system
22 * to nest on a single CPU. On Alpha, interrupts are masked at the CPU
23 * by IPL as well as at the system level. We only have 8 IPLs (UNIX PALcode)
24 * so we really only have 8 nestable IRQs, but allow some overhead
25 */
26#if (1 << HARDIRQ_BITS) < 16
27#error HARDIRQ_BITS is too low!
28#endif
29
30#endif /* _ALPHA_HARDIRQ_H */ 17#endif /* _ALPHA_HARDIRQ_H */
diff --git a/arch/alpha/include/asm/statfs.h b/arch/alpha/include/asm/statfs.h
index de35cd438a10..ccd2e186bfd8 100644
--- a/arch/alpha/include/asm/statfs.h
+++ b/arch/alpha/include/asm/statfs.h
@@ -1,6 +1,8 @@
1#ifndef _ALPHA_STATFS_H 1#ifndef _ALPHA_STATFS_H
2#define _ALPHA_STATFS_H 2#define _ALPHA_STATFS_H
3 3
4#include <linux/types.h>
5
4/* Alpha is the only 64-bit platform with 32-bit statfs. And doesn't 6/* Alpha is the only 64-bit platform with 32-bit statfs. And doesn't
5 even seem to implement statfs64 */ 7 even seem to implement statfs64 */
6#define __statfs_word __u32 8#define __statfs_word __u32
diff --git a/arch/alpha/include/asm/swab.h b/arch/alpha/include/asm/swab.h
index 68e7089e02d5..4d682b16c7c4 100644
--- a/arch/alpha/include/asm/swab.h
+++ b/arch/alpha/include/asm/swab.h
@@ -1,7 +1,7 @@
1#ifndef _ALPHA_SWAB_H 1#ifndef _ALPHA_SWAB_H
2#define _ALPHA_SWAB_H 2#define _ALPHA_SWAB_H
3 3
4#include <asm/types.h> 4#include <linux/types.h>
5#include <linux/compiler.h> 5#include <linux/compiler.h>
6#include <asm/compiler.h> 6#include <asm/compiler.h>
7 7
diff --git a/arch/alpha/kernel/irq.c b/arch/alpha/kernel/irq.c
index 703731accda6..7bc7489223f3 100644
--- a/arch/alpha/kernel/irq.c
+++ b/arch/alpha/kernel/irq.c
@@ -55,7 +55,7 @@ int irq_select_affinity(unsigned int irq)
55 cpu = (cpu < (NR_CPUS-1) ? cpu + 1 : 0); 55 cpu = (cpu < (NR_CPUS-1) ? cpu + 1 : 0);
56 last_cpu = cpu; 56 last_cpu = cpu;
57 57
58 irq_desc[irq].affinity = cpumask_of_cpu(cpu); 58 cpumask_copy(irq_desc[irq].affinity, cpumask_of(cpu));
59 irq_desc[irq].chip->set_affinity(irq, cpumask_of(cpu)); 59 irq_desc[irq].chip->set_affinity(irq, cpumask_of(cpu));
60 return 0; 60 return 0;
61} 61}
diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c
index 5d7a16eab312..af71d38c8e41 100644
--- a/arch/alpha/mm/init.c
+++ b/arch/alpha/mm/init.c
@@ -189,9 +189,21 @@ callback_init(void * kernel_end)
189 189
190 if (alpha_using_srm) { 190 if (alpha_using_srm) {
191 static struct vm_struct console_remap_vm; 191 static struct vm_struct console_remap_vm;
192 unsigned long vaddr = VMALLOC_START; 192 unsigned long nr_pages = 0;
193 unsigned long vaddr;
193 unsigned long i, j; 194 unsigned long i, j;
194 195
196 /* calculate needed size */
197 for (i = 0; i < crb->map_entries; ++i)
198 nr_pages += crb->map[i].count;
199
200 /* register the vm area */
201 console_remap_vm.flags = VM_ALLOC;
202 console_remap_vm.size = nr_pages << PAGE_SHIFT;
203 vm_area_register_early(&console_remap_vm, PAGE_SIZE);
204
205 vaddr = (unsigned long)console_remap_vm.addr;
206
195 /* Set up the third level PTEs and update the virtual 207 /* Set up the third level PTEs and update the virtual
196 addresses of the CRB entries. */ 208 addresses of the CRB entries. */
197 for (i = 0; i < crb->map_entries; ++i) { 209 for (i = 0; i < crb->map_entries; ++i) {
@@ -213,12 +225,6 @@ callback_init(void * kernel_end)
213 vaddr += PAGE_SIZE; 225 vaddr += PAGE_SIZE;
214 } 226 }
215 } 227 }
216
217 /* Let vmalloc know that we've allocated some space. */
218 console_remap_vm.flags = VM_ALLOC;
219 console_remap_vm.addr = (void *) VMALLOC_START;
220 console_remap_vm.size = vaddr - VMALLOC_START;
221 vmlist = &console_remap_vm;
222 } 228 }
223 229
224 callback_init_done = 1; 230 callback_init_done = 1;
diff --git a/arch/arm/include/asm/a.out.h b/arch/arm/include/asm/a.out.h
index 79489fdcc8b8..083894b2e3bc 100644
--- a/arch/arm/include/asm/a.out.h
+++ b/arch/arm/include/asm/a.out.h
@@ -2,7 +2,7 @@
2#define __ARM_A_OUT_H__ 2#define __ARM_A_OUT_H__
3 3
4#include <linux/personality.h> 4#include <linux/personality.h>
5#include <asm/types.h> 5#include <linux/types.h>
6 6
7struct exec 7struct exec
8{ 8{
diff --git a/arch/arm/include/asm/setup.h b/arch/arm/include/asm/setup.h
index f2cd18a0932b..ee1304f22f94 100644
--- a/arch/arm/include/asm/setup.h
+++ b/arch/arm/include/asm/setup.h
@@ -14,7 +14,7 @@
14#ifndef __ASMARM_SETUP_H 14#ifndef __ASMARM_SETUP_H
15#define __ASMARM_SETUP_H 15#define __ASMARM_SETUP_H
16 16
17#include <asm/types.h> 17#include <linux/types.h>
18 18
19#define COMMAND_LINE_SIZE 1024 19#define COMMAND_LINE_SIZE 1024
20 20
diff --git a/arch/arm/include/asm/swab.h b/arch/arm/include/asm/swab.h
index 27a689be0856..ca2bf2f6d6ea 100644
--- a/arch/arm/include/asm/swab.h
+++ b/arch/arm/include/asm/swab.h
@@ -16,7 +16,7 @@
16#define __ASM_ARM_SWAB_H 16#define __ASM_ARM_SWAB_H
17 17
18#include <linux/compiler.h> 18#include <linux/compiler.h>
19#include <asm/types.h> 19#include <linux/types.h>
20 20
21#if !defined(__STRICT_ANSI__) || defined(__KERNEL__) 21#if !defined(__STRICT_ANSI__) || defined(__KERNEL__)
22# define __SWAB_64_THRU_32__ 22# define __SWAB_64_THRU_32__
diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c
index 363db186cb93..45eacb5a2ecd 100644
--- a/arch/arm/kernel/irq.c
+++ b/arch/arm/kernel/irq.c
@@ -104,6 +104,11 @@ static struct irq_desc bad_irq_desc = {
104 .lock = __SPIN_LOCK_UNLOCKED(bad_irq_desc.lock), 104 .lock = __SPIN_LOCK_UNLOCKED(bad_irq_desc.lock),
105}; 105};
106 106
107#ifdef CONFIG_CPUMASK_OFFSTACK
108/* We are not allocating bad_irq_desc.affinity or .pending_mask */
109#error "ARM architecture does not support CONFIG_CPUMASK_OFFSTACK."
110#endif
111
107/* 112/*
108 * do_IRQ handles all hardware IRQ's. Decoded IRQs should not 113 * do_IRQ handles all hardware IRQ's. Decoded IRQs should not
109 * come via this function. Instead, they should provide their 114 * come via this function. Instead, they should provide their
@@ -161,7 +166,7 @@ void __init init_IRQ(void)
161 irq_desc[irq].status |= IRQ_NOREQUEST | IRQ_NOPROBE; 166 irq_desc[irq].status |= IRQ_NOREQUEST | IRQ_NOPROBE;
162 167
163#ifdef CONFIG_SMP 168#ifdef CONFIG_SMP
164 bad_irq_desc.affinity = CPU_MASK_ALL; 169 cpumask_setall(bad_irq_desc.affinity);
165 bad_irq_desc.cpu = smp_processor_id(); 170 bad_irq_desc.cpu = smp_processor_id();
166#endif 171#endif
167 init_arch_irq(); 172 init_arch_irq();
@@ -191,15 +196,16 @@ void migrate_irqs(void)
191 struct irq_desc *desc = irq_desc + i; 196 struct irq_desc *desc = irq_desc + i;
192 197
193 if (desc->cpu == cpu) { 198 if (desc->cpu == cpu) {
194 unsigned int newcpu = any_online_cpu(desc->affinity); 199 unsigned int newcpu = cpumask_any_and(desc->affinity,
195 200 cpu_online_mask);
196 if (newcpu == NR_CPUS) { 201 if (newcpu >= nr_cpu_ids) {
197 if (printk_ratelimit()) 202 if (printk_ratelimit())
198 printk(KERN_INFO "IRQ%u no longer affine to CPU%u\n", 203 printk(KERN_INFO "IRQ%u no longer affine to CPU%u\n",
199 i, cpu); 204 i, cpu);
200 205
201 cpus_setall(desc->affinity); 206 cpumask_setall(desc->affinity);
202 newcpu = any_online_cpu(desc->affinity); 207 newcpu = cpumask_any_and(desc->affinity,
208 cpu_online_mask);
203 } 209 }
204 210
205 route_irq(desc, i, newcpu); 211 route_irq(desc, i, newcpu);
diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
index 00216071eaf7..85598f7da407 100644
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S
@@ -65,6 +65,7 @@ SECTIONS
65#endif 65#endif
66 . = ALIGN(4096); 66 . = ALIGN(4096);
67 __per_cpu_start = .; 67 __per_cpu_start = .;
68 *(.data.percpu.page_aligned)
68 *(.data.percpu) 69 *(.data.percpu)
69 *(.data.percpu.shared_aligned) 70 *(.data.percpu.shared_aligned)
70 __per_cpu_end = .; 71 __per_cpu_end = .;
diff --git a/arch/arm/oprofile/op_model_mpcore.c b/arch/arm/oprofile/op_model_mpcore.c
index 6d6bd5899240..853d42bb8682 100644
--- a/arch/arm/oprofile/op_model_mpcore.c
+++ b/arch/arm/oprofile/op_model_mpcore.c
@@ -263,7 +263,7 @@ static void em_route_irq(int irq, unsigned int cpu)
263 const struct cpumask *mask = cpumask_of(cpu); 263 const struct cpumask *mask = cpumask_of(cpu);
264 264
265 spin_lock_irq(&desc->lock); 265 spin_lock_irq(&desc->lock);
266 desc->affinity = *mask; 266 cpumask_copy(desc->affinity, mask);
267 desc->chip->set_affinity(irq, mask); 267 desc->chip->set_affinity(irq, mask);
268 spin_unlock_irq(&desc->lock); 268 spin_unlock_irq(&desc->lock);
269} 269}
diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig
index b189680d18b0..05fe3053dcae 100644
--- a/arch/avr32/Kconfig
+++ b/arch/avr32/Kconfig
@@ -181,7 +181,7 @@ source "kernel/Kconfig.preempt"
181config QUICKLIST 181config QUICKLIST
182 def_bool y 182 def_bool y
183 183
184config HAVE_ARCH_BOOTMEM_NODE 184config HAVE_ARCH_BOOTMEM
185 def_bool n 185 def_bool n
186 186
187config ARCH_HAVE_MEMORY_PRESENT 187config ARCH_HAVE_MEMORY_PRESENT
diff --git a/arch/avr32/include/asm/ftrace.h b/arch/avr32/include/asm/ftrace.h
new file mode 100644
index 000000000000..40a8c178f10d
--- /dev/null
+++ b/arch/avr32/include/asm/ftrace.h
@@ -0,0 +1 @@
/* empty */
diff --git a/arch/avr32/include/asm/hardirq.h b/arch/avr32/include/asm/hardirq.h
index 267354356f60..015bc75ea798 100644
--- a/arch/avr32/include/asm/hardirq.h
+++ b/arch/avr32/include/asm/hardirq.h
@@ -20,15 +20,4 @@ void ack_bad_irq(unsigned int irq);
20 20
21#endif /* __ASSEMBLY__ */ 21#endif /* __ASSEMBLY__ */
22 22
23#define HARDIRQ_BITS 12
24
25/*
26 * The hardirq mask has to be large enough to have
27 * space for potentially all IRQ sources in the system
28 * nesting on a single CPU:
29 */
30#if (1 << HARDIRQ_BITS) < NR_IRQS
31# error HARDIRQ_BITS is too low!
32#endif
33
34#endif /* __ASM_AVR32_HARDIRQ_H */ 23#endif /* __ASM_AVR32_HARDIRQ_H */
diff --git a/arch/avr32/include/asm/swab.h b/arch/avr32/include/asm/swab.h
index a14aa5b46d98..14cc737bbca6 100644
--- a/arch/avr32/include/asm/swab.h
+++ b/arch/avr32/include/asm/swab.h
@@ -4,7 +4,7 @@
4#ifndef __ASM_AVR32_SWAB_H 4#ifndef __ASM_AVR32_SWAB_H
5#define __ASM_AVR32_SWAB_H 5#define __ASM_AVR32_SWAB_H
6 6
7#include <asm/types.h> 7#include <linux/types.h>
8#include <linux/compiler.h> 8#include <linux/compiler.h>
9 9
10#define __SWAB_64_THRU_32__ 10#define __SWAB_64_THRU_32__
diff --git a/arch/blackfin/include/asm/ftrace.h b/arch/blackfin/include/asm/ftrace.h
new file mode 100644
index 000000000000..40a8c178f10d
--- /dev/null
+++ b/arch/blackfin/include/asm/ftrace.h
@@ -0,0 +1 @@
/* empty */
diff --git a/arch/blackfin/include/asm/percpu.h b/arch/blackfin/include/asm/percpu.h
index 797c0c165069..c94c7bc88c71 100644
--- a/arch/blackfin/include/asm/percpu.h
+++ b/arch/blackfin/include/asm/percpu.h
@@ -3,14 +3,4 @@
3 3
4#include <asm-generic/percpu.h> 4#include <asm-generic/percpu.h>
5 5
6#ifdef CONFIG_MODULES
7#define PERCPU_MODULE_RESERVE 8192
8#else
9#define PERCPU_MODULE_RESERVE 0
10#endif
11
12#define PERCPU_ENOUGH_ROOM \
13 (ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES) + \
14 PERCPU_MODULE_RESERVE)
15
16#endif /* __ARCH_BLACKFIN_PERCPU__ */ 6#endif /* __ARCH_BLACKFIN_PERCPU__ */
diff --git a/arch/blackfin/include/asm/swab.h b/arch/blackfin/include/asm/swab.h
index 69a051b612bd..6403ad2932eb 100644
--- a/arch/blackfin/include/asm/swab.h
+++ b/arch/blackfin/include/asm/swab.h
@@ -1,7 +1,7 @@
1#ifndef _BLACKFIN_SWAB_H 1#ifndef _BLACKFIN_SWAB_H
2#define _BLACKFIN_SWAB_H 2#define _BLACKFIN_SWAB_H
3 3
4#include <asm/types.h> 4#include <linux/types.h>
5#include <linux/compiler.h> 5#include <linux/compiler.h>
6 6
7#if defined(__GNUC__) && !defined(__STRICT_ANSI__) || defined(__KERNEL__) 7#if defined(__GNUC__) && !defined(__STRICT_ANSI__) || defined(__KERNEL__)
diff --git a/arch/blackfin/kernel/irqchip.c b/arch/blackfin/kernel/irqchip.c
index 7fd126564846..1ab5b532ec72 100644
--- a/arch/blackfin/kernel/irqchip.c
+++ b/arch/blackfin/kernel/irqchip.c
@@ -70,6 +70,11 @@ static struct irq_desc bad_irq_desc = {
70#endif 70#endif
71}; 71};
72 72
73#ifdef CONFIG_CPUMASK_OFFSTACK
74/* We are not allocating a variable-sized bad_irq_desc.affinity */
75#error "Blackfin architecture does not support CONFIG_CPUMASK_OFFSTACK."
76#endif
77
73int show_interrupts(struct seq_file *p, void *v) 78int show_interrupts(struct seq_file *p, void *v)
74{ 79{
75 int i = *(loff_t *) v, j; 80 int i = *(loff_t *) v, j;
diff --git a/arch/cris/include/asm/ftrace.h b/arch/cris/include/asm/ftrace.h
new file mode 100644
index 000000000000..40a8c178f10d
--- /dev/null
+++ b/arch/cris/include/asm/ftrace.h
@@ -0,0 +1 @@
/* empty */
diff --git a/arch/h8300/include/asm/ftrace.h b/arch/h8300/include/asm/ftrace.h
new file mode 100644
index 000000000000..40a8c178f10d
--- /dev/null
+++ b/arch/h8300/include/asm/ftrace.h
@@ -0,0 +1 @@
/* empty */
diff --git a/arch/h8300/include/asm/swab.h b/arch/h8300/include/asm/swab.h
index c108f39b8bc4..39abbf52807d 100644
--- a/arch/h8300/include/asm/swab.h
+++ b/arch/h8300/include/asm/swab.h
@@ -1,7 +1,7 @@
1#ifndef _H8300_SWAB_H 1#ifndef _H8300_SWAB_H
2#define _H8300_SWAB_H 2#define _H8300_SWAB_H
3 3
4#include <asm/types.h> 4#include <linux/types.h>
5 5
6#if defined(__GNUC__) && !defined(__STRICT_ANSI__) || defined(__KERNEL__) 6#if defined(__GNUC__) && !defined(__STRICT_ANSI__) || defined(__KERNEL__)
7# define __SWAB_64_THRU_32__ 7# define __SWAB_64_THRU_32__
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 153e727a6e8e..294a3b13ecac 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -22,6 +22,9 @@ config IA64
22 select HAVE_OPROFILE 22 select HAVE_OPROFILE
23 select HAVE_KPROBES 23 select HAVE_KPROBES
24 select HAVE_KRETPROBES 24 select HAVE_KRETPROBES
25 select HAVE_FTRACE_MCOUNT_RECORD
26 select HAVE_DYNAMIC_FTRACE if (!ITANIUM)
27 select HAVE_FUNCTION_TRACER
25 select HAVE_DMA_ATTRS 28 select HAVE_DMA_ATTRS
26 select HAVE_KVM 29 select HAVE_KVM
27 select HAVE_ARCH_TRACEHOOK 30 select HAVE_ARCH_TRACEHOOK
diff --git a/arch/ia64/include/asm/fpu.h b/arch/ia64/include/asm/fpu.h
index 3859558ff0a4..0c26157cffa5 100644
--- a/arch/ia64/include/asm/fpu.h
+++ b/arch/ia64/include/asm/fpu.h
@@ -6,8 +6,6 @@
6 * David Mosberger-Tang <davidm@hpl.hp.com> 6 * David Mosberger-Tang <davidm@hpl.hp.com>
7 */ 7 */
8 8
9#include <asm/types.h>
10
11/* floating point status register: */ 9/* floating point status register: */
12#define FPSR_TRAP_VD (1 << 0) /* invalid op trap disabled */ 10#define FPSR_TRAP_VD (1 << 0) /* invalid op trap disabled */
13#define FPSR_TRAP_DD (1 << 1) /* denormal trap disabled */ 11#define FPSR_TRAP_DD (1 << 1) /* denormal trap disabled */
diff --git a/arch/ia64/include/asm/ftrace.h b/arch/ia64/include/asm/ftrace.h
new file mode 100644
index 000000000000..d20db3c2a656
--- /dev/null
+++ b/arch/ia64/include/asm/ftrace.h
@@ -0,0 +1,28 @@
1#ifndef _ASM_IA64_FTRACE_H
2#define _ASM_IA64_FTRACE_H
3
4#ifdef CONFIG_FUNCTION_TRACER
5#define MCOUNT_INSN_SIZE 32 /* sizeof mcount call */
6
7#ifndef __ASSEMBLY__
8extern void _mcount(unsigned long pfs, unsigned long r1, unsigned long b0, unsigned long r0);
9#define mcount _mcount
10
11#include <asm/kprobes.h>
12/* In IA64, MCOUNT_ADDR is set in link time, so it's not a constant at compile time */
13#define MCOUNT_ADDR (((struct fnptr *)mcount)->ip)
14#define FTRACE_ADDR (((struct fnptr *)ftrace_caller)->ip)
15
16static inline unsigned long ftrace_call_adjust(unsigned long addr)
17{
18 /* second bundle, insn 2 */
19 return addr - 0x12;
20}
21
22struct dyn_arch_ftrace {
23};
24#endif
25
26#endif /* CONFIG_FUNCTION_TRACER */
27
28#endif /* _ASM_IA64_FTRACE_H */
diff --git a/arch/ia64/include/asm/gcc_intrin.h b/arch/ia64/include/asm/gcc_intrin.h
index 0f5b55921758..c2c5fd8fcac4 100644
--- a/arch/ia64/include/asm/gcc_intrin.h
+++ b/arch/ia64/include/asm/gcc_intrin.h
@@ -6,6 +6,7 @@
6 * Copyright (C) 2002,2003 Suresh Siddha <suresh.b.siddha@intel.com> 6 * Copyright (C) 2002,2003 Suresh Siddha <suresh.b.siddha@intel.com>
7 */ 7 */
8 8
9#include <linux/types.h>
9#include <linux/compiler.h> 10#include <linux/compiler.h>
10 11
11/* define this macro to get some asm stmts included in 'c' files */ 12/* define this macro to get some asm stmts included in 'c' files */
diff --git a/arch/ia64/include/asm/hardirq.h b/arch/ia64/include/asm/hardirq.h
index 140e495b8e0e..d514cd9edb49 100644
--- a/arch/ia64/include/asm/hardirq.h
+++ b/arch/ia64/include/asm/hardirq.h
@@ -20,16 +20,6 @@
20 20
21#define local_softirq_pending() (local_cpu_data->softirq_pending) 21#define local_softirq_pending() (local_cpu_data->softirq_pending)
22 22
23#define HARDIRQ_BITS 14
24
25/*
26 * The hardirq mask has to be large enough to have space for potentially all IRQ sources
27 * in the system nesting on a single CPU:
28 */
29#if (1 << HARDIRQ_BITS) < NR_IRQS
30# error HARDIRQ_BITS is too low!
31#endif
32
33extern void __iomem *ipi_base_addr; 23extern void __iomem *ipi_base_addr;
34 24
35void ack_bad_irq(unsigned int irq); 25void ack_bad_irq(unsigned int irq);
diff --git a/arch/ia64/include/asm/intrinsics.h b/arch/ia64/include/asm/intrinsics.h
index a3e44a5ed497..c47830e26cb7 100644
--- a/arch/ia64/include/asm/intrinsics.h
+++ b/arch/ia64/include/asm/intrinsics.h
@@ -10,6 +10,7 @@
10 10
11#ifndef __ASSEMBLY__ 11#ifndef __ASSEMBLY__
12 12
13#include <linux/types.h>
13/* include compiler specific intrinsics */ 14/* include compiler specific intrinsics */
14#include <asm/ia64regs.h> 15#include <asm/ia64regs.h>
15#ifdef __INTEL_COMPILER 16#ifdef __INTEL_COMPILER
diff --git a/arch/ia64/include/asm/kvm.h b/arch/ia64/include/asm/kvm.h
index bfa86b6af7cd..2b0a38e84705 100644
--- a/arch/ia64/include/asm/kvm.h
+++ b/arch/ia64/include/asm/kvm.h
@@ -21,8 +21,7 @@
21 * 21 *
22 */ 22 */
23 23
24#include <asm/types.h> 24#include <linux/types.h>
25
26#include <linux/ioctl.h> 25#include <linux/ioctl.h>
27 26
28/* Select x86 specific features in <linux/kvm.h> */ 27/* Select x86 specific features in <linux/kvm.h> */
diff --git a/arch/ia64/include/asm/percpu.h b/arch/ia64/include/asm/percpu.h
index 77f30b664b4e..30cf46534dd2 100644
--- a/arch/ia64/include/asm/percpu.h
+++ b/arch/ia64/include/asm/percpu.h
@@ -27,12 +27,12 @@ extern void *per_cpu_init(void);
27 27
28#else /* ! SMP */ 28#else /* ! SMP */
29 29
30#define PER_CPU_ATTRIBUTES __attribute__((__section__(".data.percpu")))
31
32#define per_cpu_init() (__phys_per_cpu_start) 30#define per_cpu_init() (__phys_per_cpu_start)
33 31
34#endif /* SMP */ 32#endif /* SMP */
35 33
34#define PER_CPU_BASE_SECTION ".data.percpu"
35
36/* 36/*
37 * Be extremely careful when taking the address of this variable! Due to virtual 37 * Be extremely careful when taking the address of this variable! Due to virtual
38 * remapping, it is different from the canonical address returned by __get_cpu_var(var)! 38 * remapping, it is different from the canonical address returned by __get_cpu_var(var)!
diff --git a/arch/ia64/include/asm/swab.h b/arch/ia64/include/asm/swab.h
index 6aa58b699eea..c89a8cb5d8a5 100644
--- a/arch/ia64/include/asm/swab.h
+++ b/arch/ia64/include/asm/swab.h
@@ -6,7 +6,7 @@
6 * David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co. 6 * David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co.
7 */ 7 */
8 8
9#include <asm/types.h> 9#include <linux/types.h>
10#include <asm/intrinsics.h> 10#include <asm/intrinsics.h>
11#include <linux/compiler.h> 11#include <linux/compiler.h>
12 12
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index 32f3af1641c5..3193f4417e16 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -84,7 +84,7 @@ void build_cpu_to_node_map(void);
84 .child = NULL, \ 84 .child = NULL, \
85 .groups = NULL, \ 85 .groups = NULL, \
86 .min_interval = 8, \ 86 .min_interval = 8, \
87 .max_interval = 8*(min(num_online_cpus(), 32)), \ 87 .max_interval = 8*(min(num_online_cpus(), 32U)), \
88 .busy_factor = 64, \ 88 .busy_factor = 64, \
89 .imbalance_pct = 125, \ 89 .imbalance_pct = 125, \
90 .cache_nice_tries = 2, \ 90 .cache_nice_tries = 2, \
diff --git a/arch/ia64/include/asm/uv/uv.h b/arch/ia64/include/asm/uv/uv.h
new file mode 100644
index 000000000000..61b5bdfd980e
--- /dev/null
+++ b/arch/ia64/include/asm/uv/uv.h
@@ -0,0 +1,13 @@
1#ifndef _ASM_IA64_UV_UV_H
2#define _ASM_IA64_UV_UV_H
3
4#include <asm/system.h>
5#include <asm/sn/simulator.h>
6
7static inline int is_uv_system(void)
8{
9 /* temporary support for running on hardware simulator */
10 return IS_MEDUSA() || ia64_platform_is("uv");
11}
12
13#endif /* _ASM_IA64_UV_UV_H */
diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile
index c381ea954892..ab6e7ec0bba3 100644
--- a/arch/ia64/kernel/Makefile
+++ b/arch/ia64/kernel/Makefile
@@ -2,6 +2,10 @@
2# Makefile for the linux kernel. 2# Makefile for the linux kernel.
3# 3#
4 4
5ifdef CONFIG_DYNAMIC_FTRACE
6CFLAGS_REMOVE_ftrace.o = -pg
7endif
8
5extra-y := head.o init_task.o vmlinux.lds 9extra-y := head.o init_task.o vmlinux.lds
6 10
7obj-y := acpi.o entry.o efi.o efi_stub.o gate-data.o fsys.o ia64_ksyms.o irq.o irq_ia64.o \ 11obj-y := acpi.o entry.o efi.o efi_stub.o gate-data.o fsys.o ia64_ksyms.o irq.o irq_ia64.o \
@@ -28,6 +32,7 @@ obj-$(CONFIG_IA64_CYCLONE) += cyclone.o
28obj-$(CONFIG_CPU_FREQ) += cpufreq/ 32obj-$(CONFIG_CPU_FREQ) += cpufreq/
29obj-$(CONFIG_IA64_MCA_RECOVERY) += mca_recovery.o 33obj-$(CONFIG_IA64_MCA_RECOVERY) += mca_recovery.o
30obj-$(CONFIG_KPROBES) += kprobes.o jprobes.o 34obj-$(CONFIG_KPROBES) += kprobes.o jprobes.o
35obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
31obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o 36obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o
32obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 37obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
33obj-$(CONFIG_IA64_UNCACHED_ALLOCATOR) += uncached.o 38obj-$(CONFIG_IA64_UNCACHED_ALLOCATOR) += uncached.o
diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index d541671caf4a..bdef2ce38c8b 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -199,6 +199,10 @@ char *__init __acpi_map_table(unsigned long phys_addr, unsigned long size)
199 return __va(phys_addr); 199 return __va(phys_addr);
200} 200}
201 201
202void __init __acpi_unmap_table(char *map, unsigned long size)
203{
204}
205
202/* -------------------------------------------------------------------------- 206/* --------------------------------------------------------------------------
203 Boot-time Table Parsing 207 Boot-time Table Parsing
204 -------------------------------------------------------------------------- */ 208 -------------------------------------------------------------------------- */
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index e5341e2c1175..7e3382b06d56 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -47,6 +47,7 @@
47#include <asm/processor.h> 47#include <asm/processor.h>
48#include <asm/thread_info.h> 48#include <asm/thread_info.h>
49#include <asm/unistd.h> 49#include <asm/unistd.h>
50#include <asm/ftrace.h>
50 51
51#include "minstate.h" 52#include "minstate.h"
52 53
@@ -1404,6 +1405,105 @@ GLOBAL_ENTRY(unw_init_running)
1404 br.ret.sptk.many rp 1405 br.ret.sptk.many rp
1405END(unw_init_running) 1406END(unw_init_running)
1406 1407
1408#ifdef CONFIG_FUNCTION_TRACER
1409#ifdef CONFIG_DYNAMIC_FTRACE
1410GLOBAL_ENTRY(_mcount)
1411 br ftrace_stub
1412END(_mcount)
1413
1414.here:
1415 br.ret.sptk.many b0
1416
1417GLOBAL_ENTRY(ftrace_caller)
1418 alloc out0 = ar.pfs, 8, 0, 4, 0
1419 mov out3 = r0
1420 ;;
1421 mov out2 = b0
1422 add r3 = 0x20, r3
1423 mov out1 = r1;
1424 br.call.sptk.many b0 = ftrace_patch_gp
1425 //this might be called from module, so we must patch gp
1426ftrace_patch_gp:
1427 movl gp=__gp
1428 mov b0 = r3
1429 ;;
1430.global ftrace_call;
1431ftrace_call:
1432{
1433 .mlx
1434 nop.m 0x0
1435 movl r3 = .here;;
1436}
1437 alloc loc0 = ar.pfs, 4, 4, 2, 0
1438 ;;
1439 mov loc1 = b0
1440 mov out0 = b0
1441 mov loc2 = r8
1442 mov loc3 = r15
1443 ;;
1444 adds out0 = -MCOUNT_INSN_SIZE, out0
1445 mov out1 = in2
1446 mov b6 = r3
1447
1448 br.call.sptk.many b0 = b6
1449 ;;
1450 mov ar.pfs = loc0
1451 mov b0 = loc1
1452 mov r8 = loc2
1453 mov r15 = loc3
1454 br ftrace_stub
1455 ;;
1456END(ftrace_caller)
1457
1458#else
1459GLOBAL_ENTRY(_mcount)
1460 movl r2 = ftrace_stub
1461 movl r3 = ftrace_trace_function;;
1462 ld8 r3 = [r3];;
1463 ld8 r3 = [r3];;
1464 cmp.eq p7,p0 = r2, r3
1465(p7) br.sptk.many ftrace_stub
1466 ;;
1467
1468 alloc loc0 = ar.pfs, 4, 4, 2, 0
1469 ;;
1470 mov loc1 = b0
1471 mov out0 = b0
1472 mov loc2 = r8
1473 mov loc3 = r15
1474 ;;
1475 adds out0 = -MCOUNT_INSN_SIZE, out0
1476 mov out1 = in2
1477 mov b6 = r3
1478
1479 br.call.sptk.many b0 = b6
1480 ;;
1481 mov ar.pfs = loc0
1482 mov b0 = loc1
1483 mov r8 = loc2
1484 mov r15 = loc3
1485 br ftrace_stub
1486 ;;
1487END(_mcount)
1488#endif
1489
1490GLOBAL_ENTRY(ftrace_stub)
1491 mov r3 = b0
1492 movl r2 = _mcount_ret_helper
1493 ;;
1494 mov b6 = r2
1495 mov b7 = r3
1496 br.ret.sptk.many b6
1497
1498_mcount_ret_helper:
1499 mov b0 = r42
1500 mov r1 = r41
1501 mov ar.pfs = r40
1502 br b7
1503END(ftrace_stub)
1504
1505#endif /* CONFIG_FUNCTION_TRACER */
1506
1407 .rodata 1507 .rodata
1408 .align 8 1508 .align 8
1409 .globl sys_call_table 1509 .globl sys_call_table
diff --git a/arch/ia64/kernel/ftrace.c b/arch/ia64/kernel/ftrace.c
new file mode 100644
index 000000000000..7fc8c961b1f7
--- /dev/null
+++ b/arch/ia64/kernel/ftrace.c
@@ -0,0 +1,206 @@
1/*
2 * Dynamic function tracing support.
3 *
4 * Copyright (C) 2008 Shaohua Li <shaohua.li@intel.com>
5 *
6 * For licencing details, see COPYING.
7 *
8 * Defines low-level handling of mcount calls when the kernel
9 * is compiled with the -pg flag. When using dynamic ftrace, the
10 * mcount call-sites get patched lazily with NOP till they are
11 * enabled. All code mutation routines here take effect atomically.
12 */
13
14#include <linux/uaccess.h>
15#include <linux/ftrace.h>
16
17#include <asm/cacheflush.h>
18#include <asm/patch.h>
19
20/* In IA64, each function will be added below two bundles with -pg option */
21static unsigned char __attribute__((aligned(8)))
22ftrace_orig_code[MCOUNT_INSN_SIZE] = {
23 0x02, 0x40, 0x31, 0x10, 0x80, 0x05, /* alloc r40=ar.pfs,12,8,0 */
24 0xb0, 0x02, 0x00, 0x00, 0x42, 0x40, /* mov r43=r0;; */
25 0x05, 0x00, 0xc4, 0x00, /* mov r42=b0 */
26 0x11, 0x48, 0x01, 0x02, 0x00, 0x21, /* mov r41=r1 */
27 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, /* nop.i 0x0 */
28 0x08, 0x00, 0x00, 0x50 /* br.call.sptk.many b0 = _mcount;; */
29};
30
31struct ftrace_orig_insn {
32 u64 dummy1, dummy2, dummy3;
33 u64 dummy4:64-41+13;
34 u64 imm20:20;
35 u64 dummy5:3;
36 u64 sign:1;
37 u64 dummy6:4;
38};
39
40/* mcount stub will be converted below for nop */
41static unsigned char ftrace_nop_code[MCOUNT_INSN_SIZE] = {
42 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MII] nop.m 0x0 */
43 0x30, 0x00, 0x00, 0x60, 0x00, 0x00, /* mov r3=ip */
44 0x00, 0x00, 0x04, 0x00, /* nop.i 0x0 */
45 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0x0 */
46 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* nop.x 0x0;; */
47 0x00, 0x00, 0x04, 0x00
48};
49
50static unsigned char *ftrace_nop_replace(void)
51{
52 return ftrace_nop_code;
53}
54
55/*
56 * mcount stub will be converted below for call
57 * Note: Just the last instruction is changed against nop
58 * */
59static unsigned char __attribute__((aligned(8)))
60ftrace_call_code[MCOUNT_INSN_SIZE] = {
61 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MII] nop.m 0x0 */
62 0x30, 0x00, 0x00, 0x60, 0x00, 0x00, /* mov r3=ip */
63 0x00, 0x00, 0x04, 0x00, /* nop.i 0x0 */
64 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0x0 */
65 0xff, 0xff, 0xff, 0xff, 0x7f, 0x00, /* brl.many .;;*/
66 0xf8, 0xff, 0xff, 0xc8
67};
68
69struct ftrace_call_insn {
70 u64 dummy1, dummy2;
71 u64 dummy3:48;
72 u64 imm39_l:16;
73 u64 imm39_h:23;
74 u64 dummy4:13;
75 u64 imm20:20;
76 u64 dummy5:3;
77 u64 i:1;
78 u64 dummy6:4;
79};
80
81static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
82{
83 struct ftrace_call_insn *code = (void *)ftrace_call_code;
84 unsigned long offset = addr - (ip + 0x10);
85
86 code->imm39_l = offset >> 24;
87 code->imm39_h = offset >> 40;
88 code->imm20 = offset >> 4;
89 code->i = offset >> 63;
90 return ftrace_call_code;
91}
92
93static int
94ftrace_modify_code(unsigned long ip, unsigned char *old_code,
95 unsigned char *new_code, int do_check)
96{
97 unsigned char replaced[MCOUNT_INSN_SIZE];
98
99 /*
100 * Note: Due to modules and __init, code can
101 * disappear and change, we need to protect against faulting
102 * as well as code changing. We do this by using the
103 * probe_kernel_* functions.
104 *
105 * No real locking needed, this code is run through
106 * kstop_machine, or before SMP starts.
107 */
108
109 if (!do_check)
110 goto skip_check;
111
112 /* read the text we want to modify */
113 if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
114 return -EFAULT;
115
116 /* Make sure it is what we expect it to be */
117 if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
118 return -EINVAL;
119
120skip_check:
121 /* replace the text with the new text */
122 if (probe_kernel_write(((void *)ip), new_code, MCOUNT_INSN_SIZE))
123 return -EPERM;
124 flush_icache_range(ip, ip + MCOUNT_INSN_SIZE);
125
126 return 0;
127}
128
129static int ftrace_make_nop_check(struct dyn_ftrace *rec, unsigned long addr)
130{
131 unsigned char __attribute__((aligned(8))) replaced[MCOUNT_INSN_SIZE];
132 unsigned long ip = rec->ip;
133
134 if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
135 return -EFAULT;
136 if (rec->flags & FTRACE_FL_CONVERTED) {
137 struct ftrace_call_insn *call_insn, *tmp_call;
138
139 call_insn = (void *)ftrace_call_code;
140 tmp_call = (void *)replaced;
141 call_insn->imm39_l = tmp_call->imm39_l;
142 call_insn->imm39_h = tmp_call->imm39_h;
143 call_insn->imm20 = tmp_call->imm20;
144 call_insn->i = tmp_call->i;
145 if (memcmp(replaced, ftrace_call_code, MCOUNT_INSN_SIZE) != 0)
146 return -EINVAL;
147 return 0;
148 } else {
149 struct ftrace_orig_insn *call_insn, *tmp_call;
150
151 call_insn = (void *)ftrace_orig_code;
152 tmp_call = (void *)replaced;
153 call_insn->sign = tmp_call->sign;
154 call_insn->imm20 = tmp_call->imm20;
155 if (memcmp(replaced, ftrace_orig_code, MCOUNT_INSN_SIZE) != 0)
156 return -EINVAL;
157 return 0;
158 }
159}
160
161int ftrace_make_nop(struct module *mod,
162 struct dyn_ftrace *rec, unsigned long addr)
163{
164 int ret;
165 char *new;
166
167 ret = ftrace_make_nop_check(rec, addr);
168 if (ret)
169 return ret;
170 new = ftrace_nop_replace();
171 return ftrace_modify_code(rec->ip, NULL, new, 0);
172}
173
174int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
175{
176 unsigned long ip = rec->ip;
177 unsigned char *old, *new;
178
179 old= ftrace_nop_replace();
180 new = ftrace_call_replace(ip, addr);
181 return ftrace_modify_code(ip, old, new, 1);
182}
183
184/* in IA64, _mcount can't directly call ftrace_stub. Only jump is ok */
185int ftrace_update_ftrace_func(ftrace_func_t func)
186{
187 unsigned long ip;
188 unsigned long addr = ((struct fnptr *)ftrace_call)->ip;
189
190 if (func == ftrace_stub)
191 return 0;
192 ip = ((struct fnptr *)func)->ip;
193
194 ia64_patch_imm64(addr + 2, ip);
195
196 flush_icache_range(addr, addr + 16);
197 return 0;
198}
199
200/* run from kstop_machine */
201int __init ftrace_dyn_arch_init(void *data)
202{
203 *(unsigned long *)data = 0;
204
205 return 0;
206}
diff --git a/arch/ia64/kernel/ia64_ksyms.c b/arch/ia64/kernel/ia64_ksyms.c
index 6da1f20d7372..2d311864e359 100644
--- a/arch/ia64/kernel/ia64_ksyms.c
+++ b/arch/ia64/kernel/ia64_ksyms.c
@@ -112,3 +112,9 @@ EXPORT_SYMBOL_GPL(esi_call_phys);
112#endif 112#endif
113extern char ia64_ivt[]; 113extern char ia64_ivt[];
114EXPORT_SYMBOL(ia64_ivt); 114EXPORT_SYMBOL(ia64_ivt);
115
116#include <asm/ftrace.h>
117#ifdef CONFIG_FUNCTION_TRACER
118/* mcount is defined in assembly */
119EXPORT_SYMBOL(_mcount);
120#endif
diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c
index e13125058bed..166e0d839fa0 100644
--- a/arch/ia64/kernel/iosapic.c
+++ b/arch/ia64/kernel/iosapic.c
@@ -880,7 +880,7 @@ iosapic_unregister_intr (unsigned int gsi)
880 if (iosapic_intr_info[irq].count == 0) { 880 if (iosapic_intr_info[irq].count == 0) {
881#ifdef CONFIG_SMP 881#ifdef CONFIG_SMP
882 /* Clear affinity */ 882 /* Clear affinity */
883 cpus_setall(idesc->affinity); 883 cpumask_setall(idesc->affinity);
884#endif 884#endif
885 /* Clear the interrupt information */ 885 /* Clear the interrupt information */
886 iosapic_intr_info[irq].dest = 0; 886 iosapic_intr_info[irq].dest = 0;
diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c
index a58f64ca9f0e..226233a6fa19 100644
--- a/arch/ia64/kernel/irq.c
+++ b/arch/ia64/kernel/irq.c
@@ -103,7 +103,7 @@ static char irq_redir [NR_IRQS]; // = { [0 ... NR_IRQS-1] = 1 };
103void set_irq_affinity_info (unsigned int irq, int hwid, int redir) 103void set_irq_affinity_info (unsigned int irq, int hwid, int redir)
104{ 104{
105 if (irq < NR_IRQS) { 105 if (irq < NR_IRQS) {
106 cpumask_copy(&irq_desc[irq].affinity, 106 cpumask_copy(irq_desc[irq].affinity,
107 cpumask_of(cpu_logical_id(hwid))); 107 cpumask_of(cpu_logical_id(hwid)));
108 irq_redir[irq] = (char) (redir & 0xff); 108 irq_redir[irq] = (char) (redir & 0xff);
109 } 109 }
@@ -148,7 +148,7 @@ static void migrate_irqs(void)
148 if (desc->status == IRQ_PER_CPU) 148 if (desc->status == IRQ_PER_CPU)
149 continue; 149 continue;
150 150
151 if (cpumask_any_and(&irq_desc[irq].affinity, cpu_online_mask) 151 if (cpumask_any_and(irq_desc[irq].affinity, cpu_online_mask)
152 >= nr_cpu_ids) { 152 >= nr_cpu_ids) {
153 /* 153 /*
154 * Save it for phase 2 processing 154 * Save it for phase 2 processing
diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c
index 28d3d483db92..927ad027820c 100644
--- a/arch/ia64/kernel/irq_ia64.c
+++ b/arch/ia64/kernel/irq_ia64.c
@@ -493,11 +493,13 @@ ia64_handle_irq (ia64_vector vector, struct pt_regs *regs)
493 saved_tpr = ia64_getreg(_IA64_REG_CR_TPR); 493 saved_tpr = ia64_getreg(_IA64_REG_CR_TPR);
494 ia64_srlz_d(); 494 ia64_srlz_d();
495 while (vector != IA64_SPURIOUS_INT_VECTOR) { 495 while (vector != IA64_SPURIOUS_INT_VECTOR) {
496 struct irq_desc *desc = irq_to_desc(vector);
497
496 if (unlikely(IS_LOCAL_TLB_FLUSH(vector))) { 498 if (unlikely(IS_LOCAL_TLB_FLUSH(vector))) {
497 smp_local_flush_tlb(); 499 smp_local_flush_tlb();
498 kstat_this_cpu.irqs[vector]++; 500 kstat_incr_irqs_this_cpu(vector, desc);
499 } else if (unlikely(IS_RESCHEDULE(vector))) 501 } else if (unlikely(IS_RESCHEDULE(vector)))
500 kstat_this_cpu.irqs[vector]++; 502 kstat_incr_irqs_this_cpu(vector, desc);
501 else { 503 else {
502 int irq = local_vector_to_irq(vector); 504 int irq = local_vector_to_irq(vector);
503 505
@@ -551,11 +553,13 @@ void ia64_process_pending_intr(void)
551 * Perform normal interrupt style processing 553 * Perform normal interrupt style processing
552 */ 554 */
553 while (vector != IA64_SPURIOUS_INT_VECTOR) { 555 while (vector != IA64_SPURIOUS_INT_VECTOR) {
556 struct irq_desc *desc = irq_to_desc(vector);
557
554 if (unlikely(IS_LOCAL_TLB_FLUSH(vector))) { 558 if (unlikely(IS_LOCAL_TLB_FLUSH(vector))) {
555 smp_local_flush_tlb(); 559 smp_local_flush_tlb();
556 kstat_this_cpu.irqs[vector]++; 560 kstat_incr_irqs_this_cpu(vector, desc);
557 } else if (unlikely(IS_RESCHEDULE(vector))) 561 } else if (unlikely(IS_RESCHEDULE(vector)))
558 kstat_this_cpu.irqs[vector]++; 562 kstat_incr_irqs_this_cpu(vector, desc);
559 else { 563 else {
560 struct pt_regs *old_regs = set_irq_regs(NULL); 564 struct pt_regs *old_regs = set_irq_regs(NULL);
561 int irq = local_vector_to_irq(vector); 565 int irq = local_vector_to_irq(vector);
diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c
index 890339339035..dcb6b7c51ea7 100644
--- a/arch/ia64/kernel/msi_ia64.c
+++ b/arch/ia64/kernel/msi_ia64.c
@@ -75,7 +75,7 @@ static void ia64_set_msi_irq_affinity(unsigned int irq,
75 msg.data = data; 75 msg.data = data;
76 76
77 write_msi_msg(irq, &msg); 77 write_msi_msg(irq, &msg);
78 irq_desc[irq].affinity = cpumask_of_cpu(cpu); 78 cpumask_copy(irq_desc[irq].affinity, cpumask_of(cpu));
79} 79}
80#endif /* CONFIG_SMP */ 80#endif /* CONFIG_SMP */
81 81
@@ -187,7 +187,7 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
187 msg.address_lo |= MSI_ADDR_DESTID_CPU(cpu_physical_id(cpu)); 187 msg.address_lo |= MSI_ADDR_DESTID_CPU(cpu_physical_id(cpu));
188 188
189 dmar_msi_write(irq, &msg); 189 dmar_msi_write(irq, &msg);
190 irq_desc[irq].affinity = *mask; 190 cpumask_copy(irq_desc[irq].affinity, mask);
191} 191}
192#endif /* CONFIG_SMP */ 192#endif /* CONFIG_SMP */
193 193
diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S
index 10a7d47e8510..f45e4e508eca 100644
--- a/arch/ia64/kernel/vmlinux.lds.S
+++ b/arch/ia64/kernel/vmlinux.lds.S
@@ -219,6 +219,7 @@ SECTIONS
219 .data.percpu PERCPU_ADDR : AT(__phys_per_cpu_start - LOAD_OFFSET) 219 .data.percpu PERCPU_ADDR : AT(__phys_per_cpu_start - LOAD_OFFSET)
220 { 220 {
221 __per_cpu_start = .; 221 __per_cpu_start = .;
222 *(.data.percpu.page_aligned)
222 *(.data.percpu) 223 *(.data.percpu)
223 *(.data.percpu.shared_aligned) 224 *(.data.percpu.shared_aligned)
224 __per_cpu_end = .; 225 __per_cpu_end = .;
diff --git a/arch/ia64/sn/kernel/msi_sn.c b/arch/ia64/sn/kernel/msi_sn.c
index ca553b0429ce..81e428943d73 100644
--- a/arch/ia64/sn/kernel/msi_sn.c
+++ b/arch/ia64/sn/kernel/msi_sn.c
@@ -205,7 +205,7 @@ static void sn_set_msi_irq_affinity(unsigned int irq,
205 msg.address_lo = (u32)(bus_addr & 0x00000000ffffffff); 205 msg.address_lo = (u32)(bus_addr & 0x00000000ffffffff);
206 206
207 write_msi_msg(irq, &msg); 207 write_msi_msg(irq, &msg);
208 irq_desc[irq].affinity = *cpu_mask; 208 cpumask_copy(irq_desc[irq].affinity, cpu_mask);
209} 209}
210#endif /* CONFIG_SMP */ 210#endif /* CONFIG_SMP */
211 211
diff --git a/arch/m68k/include/asm/ftrace.h b/arch/m68k/include/asm/ftrace.h
new file mode 100644
index 000000000000..40a8c178f10d
--- /dev/null
+++ b/arch/m68k/include/asm/ftrace.h
@@ -0,0 +1 @@
/* empty */
diff --git a/arch/mips/include/asm/ftrace.h b/arch/mips/include/asm/ftrace.h
new file mode 100644
index 000000000000..40a8c178f10d
--- /dev/null
+++ b/arch/mips/include/asm/ftrace.h
@@ -0,0 +1 @@
/* empty */
diff --git a/arch/mips/include/asm/irq.h b/arch/mips/include/asm/irq.h
index abc62aa744ac..3214ade02d10 100644
--- a/arch/mips/include/asm/irq.h
+++ b/arch/mips/include/asm/irq.h
@@ -66,7 +66,7 @@ extern void smtc_forward_irq(unsigned int irq);
66 */ 66 */
67#define IRQ_AFFINITY_HOOK(irq) \ 67#define IRQ_AFFINITY_HOOK(irq) \
68do { \ 68do { \
69 if (!cpu_isset(smp_processor_id(), irq_desc[irq].affinity)) { \ 69 if (!cpumask_test_cpu(smp_processor_id(), irq_desc[irq].affinity)) {\
70 smtc_forward_irq(irq); \ 70 smtc_forward_irq(irq); \
71 irq_exit(); \ 71 irq_exit(); \
72 return; \ 72 return; \
diff --git a/arch/mips/include/asm/sigcontext.h b/arch/mips/include/asm/sigcontext.h
index 9ce0607d7a4e..9e89cf99d4e4 100644
--- a/arch/mips/include/asm/sigcontext.h
+++ b/arch/mips/include/asm/sigcontext.h
@@ -9,6 +9,7 @@
9#ifndef _ASM_SIGCONTEXT_H 9#ifndef _ASM_SIGCONTEXT_H
10#define _ASM_SIGCONTEXT_H 10#define _ASM_SIGCONTEXT_H
11 11
12#include <linux/types.h>
12#include <asm/sgidefs.h> 13#include <asm/sgidefs.h>
13 14
14#if _MIPS_SIM == _MIPS_SIM_ABI32 15#if _MIPS_SIM == _MIPS_SIM_ABI32
diff --git a/arch/mips/include/asm/swab.h b/arch/mips/include/asm/swab.h
index 88f1f7d555cb..99993c0d6c12 100644
--- a/arch/mips/include/asm/swab.h
+++ b/arch/mips/include/asm/swab.h
@@ -9,7 +9,7 @@
9#define _ASM_SWAB_H 9#define _ASM_SWAB_H
10 10
11#include <linux/compiler.h> 11#include <linux/compiler.h>
12#include <asm/types.h> 12#include <linux/types.h>
13 13
14#define __SWAB_64_THRU_32__ 14#define __SWAB_64_THRU_32__
15 15
diff --git a/arch/mips/kernel/irq-gic.c b/arch/mips/kernel/irq-gic.c
index 494a49a317e9..87deb8f6c458 100644
--- a/arch/mips/kernel/irq-gic.c
+++ b/arch/mips/kernel/irq-gic.c
@@ -187,7 +187,7 @@ static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
187 set_bit(irq, pcpu_masks[first_cpu(tmp)].pcpu_mask); 187 set_bit(irq, pcpu_masks[first_cpu(tmp)].pcpu_mask);
188 188
189 } 189 }
190 irq_desc[irq].affinity = *cpumask; 190 cpumask_copy(irq_desc[irq].affinity, cpumask);
191 spin_unlock_irqrestore(&gic_lock, flags); 191 spin_unlock_irqrestore(&gic_lock, flags);
192 192
193} 193}
diff --git a/arch/mips/kernel/smtc.c b/arch/mips/kernel/smtc.c
index b6cca01ff82b..5f5af7d4c890 100644
--- a/arch/mips/kernel/smtc.c
+++ b/arch/mips/kernel/smtc.c
@@ -686,7 +686,7 @@ void smtc_forward_irq(unsigned int irq)
686 * and efficiency, we just pick the easiest one to find. 686 * and efficiency, we just pick the easiest one to find.
687 */ 687 */
688 688
689 target = first_cpu(irq_desc[irq].affinity); 689 target = cpumask_first(irq_desc[irq].affinity);
690 690
691 /* 691 /*
692 * We depend on the platform code to have correctly processed 692 * We depend on the platform code to have correctly processed
@@ -921,11 +921,13 @@ void ipi_decode(struct smtc_ipi *pipi)
921 struct clock_event_device *cd; 921 struct clock_event_device *cd;
922 void *arg_copy = pipi->arg; 922 void *arg_copy = pipi->arg;
923 int type_copy = pipi->type; 923 int type_copy = pipi->type;
924 int irq = MIPS_CPU_IRQ_BASE + 1;
925
924 smtc_ipi_nq(&freeIPIq, pipi); 926 smtc_ipi_nq(&freeIPIq, pipi);
925 switch (type_copy) { 927 switch (type_copy) {
926 case SMTC_CLOCK_TICK: 928 case SMTC_CLOCK_TICK:
927 irq_enter(); 929 irq_enter();
928 kstat_this_cpu.irqs[MIPS_CPU_IRQ_BASE + 1]++; 930 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
929 cd = &per_cpu(mips_clockevent_device, cpu); 931 cd = &per_cpu(mips_clockevent_device, cpu);
930 cd->event_handler(cd); 932 cd->event_handler(cd);
931 irq_exit(); 933 irq_exit();
diff --git a/arch/mips/mti-malta/malta-smtc.c b/arch/mips/mti-malta/malta-smtc.c
index aabd7274507b..5ba31888fefb 100644
--- a/arch/mips/mti-malta/malta-smtc.c
+++ b/arch/mips/mti-malta/malta-smtc.c
@@ -116,7 +116,7 @@ struct plat_smp_ops msmtc_smp_ops = {
116 116
117void plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity) 117void plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
118{ 118{
119 cpumask_t tmask = *affinity; 119 cpumask_t tmask;
120 int cpu = 0; 120 int cpu = 0;
121 void smtc_set_irq_affinity(unsigned int irq, cpumask_t aff); 121 void smtc_set_irq_affinity(unsigned int irq, cpumask_t aff);
122 122
@@ -139,11 +139,12 @@ void plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
139 * be made to forward to an offline "CPU". 139 * be made to forward to an offline "CPU".
140 */ 140 */
141 141
142 cpumask_copy(&tmask, affinity);
142 for_each_cpu(cpu, affinity) { 143 for_each_cpu(cpu, affinity) {
143 if ((cpu_data[cpu].vpe_id != 0) || !cpu_online(cpu)) 144 if ((cpu_data[cpu].vpe_id != 0) || !cpu_online(cpu))
144 cpu_clear(cpu, tmask); 145 cpu_clear(cpu, tmask);
145 } 146 }
146 irq_desc[irq].affinity = tmask; 147 cpumask_copy(irq_desc[irq].affinity, &tmask);
147 148
148 if (cpus_empty(tmask)) 149 if (cpus_empty(tmask))
149 /* 150 /*
diff --git a/arch/mips/sgi-ip22/ip22-int.c b/arch/mips/sgi-ip22/ip22-int.c
index f8b18af141a1..0ecd5fe9486e 100644
--- a/arch/mips/sgi-ip22/ip22-int.c
+++ b/arch/mips/sgi-ip22/ip22-int.c
@@ -155,7 +155,7 @@ static void indy_buserror_irq(void)
155 int irq = SGI_BUSERR_IRQ; 155 int irq = SGI_BUSERR_IRQ;
156 156
157 irq_enter(); 157 irq_enter();
158 kstat_this_cpu.irqs[irq]++; 158 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
159 ip22_be_interrupt(irq); 159 ip22_be_interrupt(irq);
160 irq_exit(); 160 irq_exit();
161} 161}
diff --git a/arch/mips/sgi-ip22/ip22-time.c b/arch/mips/sgi-ip22/ip22-time.c
index 3dcb27ec0c53..c8f7d2328b24 100644
--- a/arch/mips/sgi-ip22/ip22-time.c
+++ b/arch/mips/sgi-ip22/ip22-time.c
@@ -122,7 +122,7 @@ void indy_8254timer_irq(void)
122 char c; 122 char c;
123 123
124 irq_enter(); 124 irq_enter();
125 kstat_this_cpu.irqs[irq]++; 125 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
126 printk(KERN_ALERT "Oops, got 8254 interrupt.\n"); 126 printk(KERN_ALERT "Oops, got 8254 interrupt.\n");
127 ArcRead(0, &c, 1, &cnt); 127 ArcRead(0, &c, 1, &cnt);
128 ArcEnterInteractiveMode(); 128 ArcEnterInteractiveMode();
diff --git a/arch/mips/sibyte/bcm1480/smp.c b/arch/mips/sibyte/bcm1480/smp.c
index dddfda8e8294..314691648c97 100644
--- a/arch/mips/sibyte/bcm1480/smp.c
+++ b/arch/mips/sibyte/bcm1480/smp.c
@@ -178,9 +178,10 @@ struct plat_smp_ops bcm1480_smp_ops = {
178void bcm1480_mailbox_interrupt(void) 178void bcm1480_mailbox_interrupt(void)
179{ 179{
180 int cpu = smp_processor_id(); 180 int cpu = smp_processor_id();
181 int irq = K_BCM1480_INT_MBOX_0_0;
181 unsigned int action; 182 unsigned int action;
182 183
183 kstat_this_cpu.irqs[K_BCM1480_INT_MBOX_0_0]++; 184 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
184 /* Load the mailbox register to figure out what we're supposed to do */ 185 /* Load the mailbox register to figure out what we're supposed to do */
185 action = (__raw_readq(mailbox_0_regs[cpu]) >> 48) & 0xffff; 186 action = (__raw_readq(mailbox_0_regs[cpu]) >> 48) & 0xffff;
186 187
diff --git a/arch/mips/sibyte/sb1250/smp.c b/arch/mips/sibyte/sb1250/smp.c
index 5950a288a7da..cad14003b84f 100644
--- a/arch/mips/sibyte/sb1250/smp.c
+++ b/arch/mips/sibyte/sb1250/smp.c
@@ -166,9 +166,10 @@ struct plat_smp_ops sb_smp_ops = {
166void sb1250_mailbox_interrupt(void) 166void sb1250_mailbox_interrupt(void)
167{ 167{
168 int cpu = smp_processor_id(); 168 int cpu = smp_processor_id();
169 int irq = K_INT_MBOX_0;
169 unsigned int action; 170 unsigned int action;
170 171
171 kstat_this_cpu.irqs[K_INT_MBOX_0]++; 172 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
172 /* Load the mailbox register to figure out what we're supposed to do */ 173 /* Load the mailbox register to figure out what we're supposed to do */
173 action = (____raw_readq(mailbox_regs[cpu]) >> 48) & 0xffff; 174 action = (____raw_readq(mailbox_regs[cpu]) >> 48) & 0xffff;
174 175
diff --git a/arch/mn10300/kernel/mn10300-watchdog.c b/arch/mn10300/kernel/mn10300-watchdog.c
index 10811e981d20..2e370d88a87a 100644
--- a/arch/mn10300/kernel/mn10300-watchdog.c
+++ b/arch/mn10300/kernel/mn10300-watchdog.c
@@ -130,6 +130,7 @@ void watchdog_interrupt(struct pt_regs *regs, enum exception_code excep)
130 * the stack NMI-atomically, it's safe to use smp_processor_id(). 130 * the stack NMI-atomically, it's safe to use smp_processor_id().
131 */ 131 */
132 int sum, cpu = smp_processor_id(); 132 int sum, cpu = smp_processor_id();
133 int irq = NMIIRQ;
133 u8 wdt, tmp; 134 u8 wdt, tmp;
134 135
135 wdt = WDCTR & ~WDCTR_WDCNE; 136 wdt = WDCTR & ~WDCTR_WDCNE;
@@ -138,7 +139,7 @@ void watchdog_interrupt(struct pt_regs *regs, enum exception_code excep)
138 NMICR = NMICR_WDIF; 139 NMICR = NMICR_WDIF;
139 140
140 nmi_count(cpu)++; 141 nmi_count(cpu)++;
141 kstat_this_cpu.irqs[NMIIRQ]++; 142 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
142 sum = irq_stat[cpu].__irq_count; 143 sum = irq_stat[cpu].__irq_count;
143 144
144 if (last_irq_sums[cpu] == sum) { 145 if (last_irq_sums[cpu] == sum) {
diff --git a/arch/parisc/include/asm/ftrace.h b/arch/parisc/include/asm/ftrace.h
new file mode 100644
index 000000000000..40a8c178f10d
--- /dev/null
+++ b/arch/parisc/include/asm/ftrace.h
@@ -0,0 +1 @@
/* empty */
diff --git a/arch/parisc/include/asm/pdc.h b/arch/parisc/include/asm/pdc.h
index c584b00c6074..430f1aeea0b8 100644
--- a/arch/parisc/include/asm/pdc.h
+++ b/arch/parisc/include/asm/pdc.h
@@ -336,10 +336,11 @@
336#define NUM_PDC_RESULT 32 336#define NUM_PDC_RESULT 32
337 337
338#if !defined(__ASSEMBLY__) 338#if !defined(__ASSEMBLY__)
339#ifdef __KERNEL__
340 339
341#include <linux/types.h> 340#include <linux/types.h>
342 341
342#ifdef __KERNEL__
343
343extern int pdc_type; 344extern int pdc_type;
344 345
345/* Values for pdc_type */ 346/* Values for pdc_type */
diff --git a/arch/parisc/include/asm/swab.h b/arch/parisc/include/asm/swab.h
index 3ff16c5a3358..e78403b129ef 100644
--- a/arch/parisc/include/asm/swab.h
+++ b/arch/parisc/include/asm/swab.h
@@ -1,7 +1,7 @@
1#ifndef _PARISC_SWAB_H 1#ifndef _PARISC_SWAB_H
2#define _PARISC_SWAB_H 2#define _PARISC_SWAB_H
3 3
4#include <asm/types.h> 4#include <linux/types.h>
5#include <linux/compiler.h> 5#include <linux/compiler.h>
6 6
7#define __SWAB_64_THRU_32__ 7#define __SWAB_64_THRU_32__
diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c
index 29e70e16ede8..2b5f5915dd1d 100644
--- a/arch/parisc/kernel/irq.c
+++ b/arch/parisc/kernel/irq.c
@@ -138,7 +138,7 @@ static void cpu_set_affinity_irq(unsigned int irq, const struct cpumask *dest)
138 if (cpu_dest < 0) 138 if (cpu_dest < 0)
139 return; 139 return;
140 140
141 cpumask_copy(&irq_desc[irq].affinity, &cpumask_of_cpu(cpu_dest)); 141 cpumask_copy(&irq_desc[irq].affinity, dest);
142} 142}
143#endif 143#endif
144 144
diff --git a/arch/powerpc/include/asm/bootx.h b/arch/powerpc/include/asm/bootx.h
index 57b82e3f89ce..60a3c9ef3017 100644
--- a/arch/powerpc/include/asm/bootx.h
+++ b/arch/powerpc/include/asm/bootx.h
@@ -9,7 +9,7 @@
9#ifndef __ASM_BOOTX_H__ 9#ifndef __ASM_BOOTX_H__
10#define __ASM_BOOTX_H__ 10#define __ASM_BOOTX_H__
11 11
12#include <asm/types.h> 12#include <linux/types.h>
13 13
14#ifdef macintosh 14#ifdef macintosh
15#include <Types.h> 15#include <Types.h>
diff --git a/arch/powerpc/include/asm/elf.h b/arch/powerpc/include/asm/elf.h
index cd46f023ec6d..b5600ce6055e 100644
--- a/arch/powerpc/include/asm/elf.h
+++ b/arch/powerpc/include/asm/elf.h
@@ -7,7 +7,7 @@
7#include <asm/string.h> 7#include <asm/string.h>
8#endif 8#endif
9 9
10#include <asm/types.h> 10#include <linux/types.h>
11#include <asm/ptrace.h> 11#include <asm/ptrace.h>
12#include <asm/cputable.h> 12#include <asm/cputable.h>
13#include <asm/auxvec.h> 13#include <asm/auxvec.h>
diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index f993e4198d5c..4e0cf65f7f5a 100644
--- a/arch/powerpc/include/asm/kvm.h
+++ b/arch/powerpc/include/asm/kvm.h
@@ -20,7 +20,7 @@
20#ifndef __LINUX_KVM_POWERPC_H 20#ifndef __LINUX_KVM_POWERPC_H
21#define __LINUX_KVM_POWERPC_H 21#define __LINUX_KVM_POWERPC_H
22 22
23#include <asm/types.h> 23#include <linux/types.h>
24 24
25struct kvm_regs { 25struct kvm_regs {
26 __u64 pc; 26 __u64 pc;
diff --git a/arch/powerpc/include/asm/ps3fb.h b/arch/powerpc/include/asm/ps3fb.h
index 3f121fe4010d..e7233a849680 100644
--- a/arch/powerpc/include/asm/ps3fb.h
+++ b/arch/powerpc/include/asm/ps3fb.h
@@ -19,6 +19,7 @@
19#ifndef _ASM_POWERPC_PS3FB_H_ 19#ifndef _ASM_POWERPC_PS3FB_H_
20#define _ASM_POWERPC_PS3FB_H_ 20#define _ASM_POWERPC_PS3FB_H_
21 21
22#include <linux/types.h>
22#include <linux/ioctl.h> 23#include <linux/ioctl.h>
23 24
24/* ioctl */ 25/* ioctl */
diff --git a/arch/powerpc/include/asm/spu_info.h b/arch/powerpc/include/asm/spu_info.h
index 3545efbf9891..1286c823f0d8 100644
--- a/arch/powerpc/include/asm/spu_info.h
+++ b/arch/powerpc/include/asm/spu_info.h
@@ -23,9 +23,10 @@
23#ifndef _SPU_INFO_H 23#ifndef _SPU_INFO_H
24#define _SPU_INFO_H 24#define _SPU_INFO_H
25 25
26#include <linux/types.h>
27
26#ifdef __KERNEL__ 28#ifdef __KERNEL__
27#include <asm/spu.h> 29#include <asm/spu.h>
28#include <linux/types.h>
29#else 30#else
30struct mfc_cq_sr { 31struct mfc_cq_sr {
31 __u64 mfc_cq_data0_RW; 32 __u64 mfc_cq_data0_RW;
diff --git a/arch/powerpc/include/asm/swab.h b/arch/powerpc/include/asm/swab.h
index ef824ae4b79c..c581e3ef73ed 100644
--- a/arch/powerpc/include/asm/swab.h
+++ b/arch/powerpc/include/asm/swab.h
@@ -8,7 +8,7 @@
8 * 2 of the License, or (at your option) any later version. 8 * 2 of the License, or (at your option) any later version.
9 */ 9 */
10 10
11#include <asm/types.h> 11#include <linux/types.h>
12#include <linux/compiler.h> 12#include <linux/compiler.h>
13 13
14#ifdef __GNUC__ 14#ifdef __GNUC__
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 23b8b5e36f98..ad1e5ac721d8 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -231,7 +231,7 @@ void fixup_irqs(cpumask_t map)
231 if (irq_desc[irq].status & IRQ_PER_CPU) 231 if (irq_desc[irq].status & IRQ_PER_CPU)
232 continue; 232 continue;
233 233
234 cpus_and(mask, irq_desc[irq].affinity, map); 234 cpumask_and(&mask, irq_desc[irq].affinity, &map);
235 if (any_online_cpu(mask) == NR_CPUS) { 235 if (any_online_cpu(mask) == NR_CPUS) {
236 printk("Breaking affinity for irq %i\n", irq); 236 printk("Breaking affinity for irq %i\n", irq);
237 mask = map; 237 mask = map;
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 161b9b9691f0..295ccc5e86b1 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -184,6 +184,7 @@ SECTIONS
184 . = ALIGN(PAGE_SIZE); 184 . = ALIGN(PAGE_SIZE);
185 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { 185 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
186 __per_cpu_start = .; 186 __per_cpu_start = .;
187 *(.data.percpu.page_aligned)
187 *(.data.percpu) 188 *(.data.percpu)
188 *(.data.percpu.shared_aligned) 189 *(.data.percpu.shared_aligned)
189 __per_cpu_end = .; 190 __per_cpu_end = .;
diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c
index 84e058f1e1cc..80b513449f4c 100644
--- a/arch/powerpc/platforms/pseries/xics.c
+++ b/arch/powerpc/platforms/pseries/xics.c
@@ -153,9 +153,10 @@ static int get_irq_server(unsigned int virq, unsigned int strict_check)
153{ 153{
154 int server; 154 int server;
155 /* For the moment only implement delivery to all cpus or one cpu */ 155 /* For the moment only implement delivery to all cpus or one cpu */
156 cpumask_t cpumask = irq_desc[virq].affinity; 156 cpumask_t cpumask;
157 cpumask_t tmp = CPU_MASK_NONE; 157 cpumask_t tmp = CPU_MASK_NONE;
158 158
159 cpumask_copy(&cpumask, irq_desc[virq].affinity);
159 if (!distribute_irqs) 160 if (!distribute_irqs)
160 return default_server; 161 return default_server;
161 162
@@ -869,7 +870,7 @@ void xics_migrate_irqs_away(void)
869 virq, cpu); 870 virq, cpu);
870 871
871 /* Reset affinity to all cpus */ 872 /* Reset affinity to all cpus */
872 irq_desc[virq].affinity = CPU_MASK_ALL; 873 cpumask_setall(irq_desc[virq].affinity);
873 desc->chip->set_affinity(virq, cpu_all_mask); 874 desc->chip->set_affinity(virq, cpu_all_mask);
874unlock: 875unlock:
875 spin_unlock_irqrestore(&desc->lock, flags); 876 spin_unlock_irqrestore(&desc->lock, flags);
diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c
index a35297dbac28..532e205303a2 100644
--- a/arch/powerpc/sysdev/mpic.c
+++ b/arch/powerpc/sysdev/mpic.c
@@ -566,9 +566,10 @@ static void __init mpic_scan_ht_pics(struct mpic *mpic)
566#ifdef CONFIG_SMP 566#ifdef CONFIG_SMP
567static int irq_choose_cpu(unsigned int virt_irq) 567static int irq_choose_cpu(unsigned int virt_irq)
568{ 568{
569 cpumask_t mask = irq_desc[virt_irq].affinity; 569 cpumask_t mask;
570 int cpuid; 570 int cpuid;
571 571
572 cpumask_copy(&mask, irq_desc[virt_irq].affinity);
572 if (cpus_equal(mask, CPU_MASK_ALL)) { 573 if (cpus_equal(mask, CPU_MASK_ALL)) {
573 static int irq_rover; 574 static int irq_rover;
574 static DEFINE_SPINLOCK(irq_rover_lock); 575 static DEFINE_SPINLOCK(irq_rover_lock);
diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
index 1c378d8e90c5..233bd87a9637 100644
--- a/arch/sparc/kernel/irq_64.c
+++ b/arch/sparc/kernel/irq_64.c
@@ -252,9 +252,10 @@ struct irq_handler_data {
252#ifdef CONFIG_SMP 252#ifdef CONFIG_SMP
253static int irq_choose_cpu(unsigned int virt_irq) 253static int irq_choose_cpu(unsigned int virt_irq)
254{ 254{
255 cpumask_t mask = irq_desc[virt_irq].affinity; 255 cpumask_t mask;
256 int cpuid; 256 int cpuid;
257 257
258 cpumask_copy(&mask, irq_desc[virt_irq].affinity);
258 if (cpus_equal(mask, CPU_MASK_ALL)) { 259 if (cpus_equal(mask, CPU_MASK_ALL)) {
259 static int irq_rover; 260 static int irq_rover;
260 static DEFINE_SPINLOCK(irq_rover_lock); 261 static DEFINE_SPINLOCK(irq_rover_lock);
@@ -805,7 +806,7 @@ void fixup_irqs(void)
805 !(irq_desc[irq].status & IRQ_PER_CPU)) { 806 !(irq_desc[irq].status & IRQ_PER_CPU)) {
806 if (irq_desc[irq].chip->set_affinity) 807 if (irq_desc[irq].chip->set_affinity)
807 irq_desc[irq].chip->set_affinity(irq, 808 irq_desc[irq].chip->set_affinity(irq,
808 &irq_desc[irq].affinity); 809 irq_desc[irq].affinity);
809 } 810 }
810 spin_unlock_irqrestore(&irq_desc[irq].lock, flags); 811 spin_unlock_irqrestore(&irq_desc[irq].lock, flags);
811 } 812 }
diff --git a/arch/sparc/kernel/time_64.c b/arch/sparc/kernel/time_64.c
index 2db3c2229b95..db310aa00183 100644
--- a/arch/sparc/kernel/time_64.c
+++ b/arch/sparc/kernel/time_64.c
@@ -729,7 +729,7 @@ void timer_interrupt(int irq, struct pt_regs *regs)
729 729
730 irq_enter(); 730 irq_enter();
731 731
732 kstat_this_cpu.irqs[0]++; 732 kstat_incr_irqs_this_cpu(0, irq_to_desc(0));
733 733
734 if (unlikely(!evt->event_handler)) { 734 if (unlikely(!evt->event_handler)) {
735 printk(KERN_WARNING 735 printk(KERN_WARNING
diff --git a/arch/um/include/asm/ftrace.h b/arch/um/include/asm/ftrace.h
new file mode 100644
index 000000000000..40a8c178f10d
--- /dev/null
+++ b/arch/um/include/asm/ftrace.h
@@ -0,0 +1 @@
/* empty */
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index bc2fbadff9f9..1a3150570785 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -5,7 +5,7 @@ mainmenu "Linux Kernel Configuration for x86"
5config 64BIT 5config 64BIT
6 bool "64-bit kernel" if ARCH = "x86" 6 bool "64-bit kernel" if ARCH = "x86"
7 default ARCH = "x86_64" 7 default ARCH = "x86_64"
8 help 8 ---help---
9 Say yes to build a 64-bit kernel - formerly known as x86_64 9 Say yes to build a 64-bit kernel - formerly known as x86_64
10 Say no to build a 32-bit kernel - formerly known as i386 10 Say no to build a 32-bit kernel - formerly known as i386
11 11
@@ -34,12 +34,17 @@ config X86
34 select HAVE_FUNCTION_TRACER 34 select HAVE_FUNCTION_TRACER
35 select HAVE_FUNCTION_GRAPH_TRACER 35 select HAVE_FUNCTION_GRAPH_TRACER
36 select HAVE_FUNCTION_TRACE_MCOUNT_TEST 36 select HAVE_FUNCTION_TRACE_MCOUNT_TEST
37 select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) 37 select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE
38 select HAVE_ARCH_KGDB if !X86_VOYAGER 38 select HAVE_FTRACE_SYSCALLS
39 select HAVE_KVM
40 select HAVE_ARCH_KGDB
39 select HAVE_ARCH_TRACEHOOK 41 select HAVE_ARCH_TRACEHOOK
40 select HAVE_GENERIC_DMA_COHERENT if X86_32 42 select HAVE_GENERIC_DMA_COHERENT if X86_32
41 select HAVE_EFFICIENT_UNALIGNED_ACCESS 43 select HAVE_EFFICIENT_UNALIGNED_ACCESS
42 select USER_STACKTRACE_SUPPORT 44 select USER_STACKTRACE_SUPPORT
45 select HAVE_KERNEL_GZIP
46 select HAVE_KERNEL_BZIP2
47 select HAVE_KERNEL_LZMA
43 48
44config ARCH_DEFCONFIG 49config ARCH_DEFCONFIG
45 string 50 string
@@ -133,18 +138,19 @@ config ARCH_HAS_CACHE_LINE_SIZE
133 def_bool y 138 def_bool y
134 139
135config HAVE_SETUP_PER_CPU_AREA 140config HAVE_SETUP_PER_CPU_AREA
136 def_bool X86_64_SMP || (X86_SMP && !X86_VOYAGER) 141 def_bool y
142
143config HAVE_DYNAMIC_PER_CPU_AREA
144 def_bool y
137 145
138config HAVE_CPUMASK_OF_CPU_MAP 146config HAVE_CPUMASK_OF_CPU_MAP
139 def_bool X86_64_SMP 147 def_bool X86_64_SMP
140 148
141config ARCH_HIBERNATION_POSSIBLE 149config ARCH_HIBERNATION_POSSIBLE
142 def_bool y 150 def_bool y
143 depends on !SMP || !X86_VOYAGER
144 151
145config ARCH_SUSPEND_POSSIBLE 152config ARCH_SUSPEND_POSSIBLE
146 def_bool y 153 def_bool y
147 depends on !X86_VOYAGER
148 154
149config ZONE_DMA32 155config ZONE_DMA32
150 bool 156 bool
@@ -174,11 +180,6 @@ config GENERIC_PENDING_IRQ
174 depends on GENERIC_HARDIRQS && SMP 180 depends on GENERIC_HARDIRQS && SMP
175 default y 181 default y
176 182
177config X86_SMP
178 bool
179 depends on SMP && ((X86_32 && !X86_VOYAGER) || X86_64)
180 default y
181
182config USE_GENERIC_SMP_HELPERS 183config USE_GENERIC_SMP_HELPERS
183 def_bool y 184 def_bool y
184 depends on SMP 185 depends on SMP
@@ -194,19 +195,17 @@ config X86_64_SMP
194config X86_HT 195config X86_HT
195 bool 196 bool
196 depends on SMP 197 depends on SMP
197 depends on (X86_32 && !X86_VOYAGER) || X86_64
198 default y
199
200config X86_BIOS_REBOOT
201 bool
202 depends on !X86_VOYAGER
203 default y 198 default y
204 199
205config X86_TRAMPOLINE 200config X86_TRAMPOLINE
206 bool 201 bool
207 depends on X86_SMP || (X86_VOYAGER && SMP) || (64BIT && ACPI_SLEEP) 202 depends on SMP || (64BIT && ACPI_SLEEP)
208 default y 203 default y
209 204
205config X86_32_LAZY_GS
206 def_bool y
207 depends on X86_32 && !CC_STACKPROTECTOR
208
210config KTIME_SCALAR 209config KTIME_SCALAR
211 def_bool X86_32 210 def_bool X86_32
212source "init/Kconfig" 211source "init/Kconfig"
@@ -244,14 +243,24 @@ config SMP
244 243
245 If you don't know what to do here, say N. 244 If you don't know what to do here, say N.
246 245
247config X86_HAS_BOOT_CPU_ID 246config X86_X2APIC
248 def_bool y 247 bool "Support x2apic"
249 depends on X86_VOYAGER 248 depends on X86_LOCAL_APIC && X86_64
249 ---help---
250 This enables x2apic support on CPUs that have this feature.
251
252 This allows 32-bit apic IDs (so it can support very large systems),
253 and accesses the local apic via MSRs not via mmio.
254
255 ( On certain CPU models you may need to enable INTR_REMAP too,
256 to get functional x2apic mode. )
257
258 If you don't know what to do here, say N.
250 259
251config SPARSE_IRQ 260config SPARSE_IRQ
252 bool "Support sparse irq numbering" 261 bool "Support sparse irq numbering"
253 depends on PCI_MSI || HT_IRQ 262 depends on PCI_MSI || HT_IRQ
254 help 263 ---help---
255 This enables support for sparse irqs. This is useful for distro 264 This enables support for sparse irqs. This is useful for distro
256 kernels that want to define a high CONFIG_NR_CPUS value but still 265 kernels that want to define a high CONFIG_NR_CPUS value but still
257 want to have low kernel memory footprint on smaller machines. 266 want to have low kernel memory footprint on smaller machines.
@@ -265,114 +274,140 @@ config NUMA_MIGRATE_IRQ_DESC
265 bool "Move irq desc when changing irq smp_affinity" 274 bool "Move irq desc when changing irq smp_affinity"
266 depends on SPARSE_IRQ && NUMA 275 depends on SPARSE_IRQ && NUMA
267 default n 276 default n
268 help 277 ---help---
269 This enables moving irq_desc to cpu/node that irq will use handled. 278 This enables moving irq_desc to cpu/node that irq will use handled.
270 279
271 If you don't know what to do here, say N. 280 If you don't know what to do here, say N.
272 281
273config X86_FIND_SMP_CONFIG
274 def_bool y
275 depends on X86_MPPARSE || X86_VOYAGER
276
277config X86_MPPARSE 282config X86_MPPARSE
278 bool "Enable MPS table" if ACPI 283 bool "Enable MPS table" if ACPI
279 default y 284 default y
280 depends on X86_LOCAL_APIC 285 depends on X86_LOCAL_APIC
281 help 286 ---help---
282 For old smp systems that do not have proper acpi support. Newer systems 287 For old smp systems that do not have proper acpi support. Newer systems
283 (esp with 64bit cpus) with acpi support, MADT and DSDT will override it 288 (esp with 64bit cpus) with acpi support, MADT and DSDT will override it
284 289
285choice 290config X86_BIGSMP
286 prompt "Subarchitecture Type" 291 bool "Support for big SMP systems with more than 8 CPUs"
287 default X86_PC 292 depends on X86_32 && SMP
293 ---help---
294 This option is needed for the systems that have more than 8 CPUs
288 295
289config X86_PC 296if X86_32
290 bool "PC-compatible" 297config X86_EXTENDED_PLATFORM
291 help 298 bool "Support for extended (non-PC) x86 platforms"
292 Choose this option if your computer is a standard PC or compatible. 299 default y
300 ---help---
301 If you disable this option then the kernel will only support
302 standard PC platforms. (which covers the vast majority of
303 systems out there.)
304
305 If you enable this option then you'll be able to select support
306 for the following (non-PC) 32 bit x86 platforms:
307 AMD Elan
308 NUMAQ (IBM/Sequent)
309 RDC R-321x SoC
310 SGI 320/540 (Visual Workstation)
311 Summit/EXA (IBM x440)
312 Unisys ES7000 IA32 series
313
314 If you have one of these systems, or if you want to build a
315 generic distribution kernel, say Y here - otherwise say N.
316endif
317
318if X86_64
319config X86_EXTENDED_PLATFORM
320 bool "Support for extended (non-PC) x86 platforms"
321 default y
322 ---help---
323 If you disable this option then the kernel will only support
324 standard PC platforms. (which covers the vast majority of
325 systems out there.)
326
327 If you enable this option then you'll be able to select support
328 for the following (non-PC) 64 bit x86 platforms:
329 ScaleMP vSMP
330 SGI Ultraviolet
331
332 If you have one of these systems, or if you want to build a
333 generic distribution kernel, say Y here - otherwise say N.
334endif
335# This is an alphabetically sorted list of 64 bit extended platforms
336# Please maintain the alphabetic order if and when there are additions
337
338config X86_VSMP
339 bool "ScaleMP vSMP"
340 select PARAVIRT
341 depends on X86_64 && PCI
342 depends on X86_EXTENDED_PLATFORM
343 ---help---
344 Support for ScaleMP vSMP systems. Say 'Y' here if this kernel is
345 supposed to run on these EM64T-based machines. Only choose this option
346 if you have one of these machines.
347
348config X86_UV
349 bool "SGI Ultraviolet"
350 depends on X86_64
351 depends on X86_EXTENDED_PLATFORM
352 select X86_X2APIC
353 ---help---
354 This option is needed in order to support SGI Ultraviolet systems.
355 If you don't have one of these, you should say N here.
356
357# Following is an alphabetically sorted list of 32 bit extended platforms
358# Please maintain the alphabetic order if and when there are additions
293 359
294config X86_ELAN 360config X86_ELAN
295 bool "AMD Elan" 361 bool "AMD Elan"
296 depends on X86_32 362 depends on X86_32
297 help 363 depends on X86_EXTENDED_PLATFORM
364 ---help---
298 Select this for an AMD Elan processor. 365 Select this for an AMD Elan processor.
299 366
300 Do not use this option for K6/Athlon/Opteron processors! 367 Do not use this option for K6/Athlon/Opteron processors!
301 368
302 If unsure, choose "PC-compatible" instead. 369 If unsure, choose "PC-compatible" instead.
303 370
304config X86_VOYAGER 371config X86_RDC321X
305 bool "Voyager (NCR)" 372 bool "RDC R-321x SoC"
306 depends on X86_32 && (SMP || BROKEN) && !PCI
307 help
308 Voyager is an MCA-based 32-way capable SMP architecture proprietary
309 to NCR Corp. Machine classes 345x/35xx/4100/51xx are Voyager-based.
310
311 *** WARNING ***
312
313 If you do not specifically know you have a Voyager based machine,
314 say N here, otherwise the kernel you build will not be bootable.
315
316config X86_GENERICARCH
317 bool "Generic architecture"
318 depends on X86_32 373 depends on X86_32
319 help 374 depends on X86_EXTENDED_PLATFORM
320 This option compiles in the NUMAQ, Summit, bigsmp, ES7000, default 375 select M486
376 select X86_REBOOTFIXUPS
377 ---help---
378 This option is needed for RDC R-321x system-on-chip, also known
379 as R-8610-(G).
380 If you don't have one of these chips, you should say N here.
381
382config X86_32_NON_STANDARD
383 bool "Support non-standard 32-bit SMP architectures"
384 depends on X86_32 && SMP
385 depends on X86_EXTENDED_PLATFORM
386 ---help---
387 This option compiles in the NUMAQ, Summit, bigsmp, ES7000, default
321 subarchitectures. It is intended for a generic binary kernel. 388 subarchitectures. It is intended for a generic binary kernel.
322 if you select them all, kernel will probe it one by one. and will 389 if you select them all, kernel will probe it one by one. and will
323 fallback to default. 390 fallback to default.
324 391
325if X86_GENERICARCH 392# Alphabetically sorted list of Non standard 32 bit platforms
326 393
327config X86_NUMAQ 394config X86_NUMAQ
328 bool "NUMAQ (IBM/Sequent)" 395 bool "NUMAQ (IBM/Sequent)"
329 depends on SMP && X86_32 && PCI && X86_MPPARSE 396 depends on X86_32_NON_STANDARD
330 select NUMA 397 select NUMA
331 help 398 select X86_MPPARSE
399 ---help---
332 This option is used for getting Linux to run on a NUMAQ (IBM/Sequent) 400 This option is used for getting Linux to run on a NUMAQ (IBM/Sequent)
333 NUMA multiquad box. This changes the way that processors are 401 NUMA multiquad box. This changes the way that processors are
334 bootstrapped, and uses Clustered Logical APIC addressing mode instead 402 bootstrapped, and uses Clustered Logical APIC addressing mode instead
335 of Flat Logical. You will need a new lynxer.elf file to flash your 403 of Flat Logical. You will need a new lynxer.elf file to flash your
336 firmware with - send email to <Martin.Bligh@us.ibm.com>. 404 firmware with - send email to <Martin.Bligh@us.ibm.com>.
337 405
338config X86_SUMMIT
339 bool "Summit/EXA (IBM x440)"
340 depends on X86_32 && SMP
341 help
342 This option is needed for IBM systems that use the Summit/EXA chipset.
343 In particular, it is needed for the x440.
344
345config X86_ES7000
346 bool "Support for Unisys ES7000 IA32 series"
347 depends on X86_32 && SMP
348 help
349 Support for Unisys ES7000 systems. Say 'Y' here if this kernel is
350 supposed to run on an IA32-based Unisys ES7000 system.
351
352config X86_BIGSMP
353 bool "Support for big SMP systems with more than 8 CPUs"
354 depends on X86_32 && SMP
355 help
356 This option is needed for the systems that have more than 8 CPUs
357 and if the system is not of any sub-arch type above.
358
359endif
360
361config X86_VSMP
362 bool "Support for ScaleMP vSMP"
363 select PARAVIRT
364 depends on X86_64 && PCI
365 help
366 Support for ScaleMP vSMP systems. Say 'Y' here if this kernel is
367 supposed to run on these EM64T-based machines. Only choose this option
368 if you have one of these machines.
369
370endchoice
371
372config X86_VISWS 406config X86_VISWS
373 bool "SGI 320/540 (Visual Workstation)" 407 bool "SGI 320/540 (Visual Workstation)"
374 depends on X86_32 && PCI && !X86_VOYAGER && X86_MPPARSE && PCI_GODIRECT 408 depends on X86_32 && PCI && X86_MPPARSE && PCI_GODIRECT
375 help 409 depends on X86_32_NON_STANDARD
410 ---help---
376 The SGI Visual Workstation series is an IA32-based workstation 411 The SGI Visual Workstation series is an IA32-based workstation
377 based on SGI systems chips with some legacy PC hardware attached. 412 based on SGI systems chips with some legacy PC hardware attached.
378 413
@@ -381,21 +416,25 @@ config X86_VISWS
381 A kernel compiled for the Visual Workstation will run on general 416 A kernel compiled for the Visual Workstation will run on general
382 PCs as well. See <file:Documentation/sgi-visws.txt> for details. 417 PCs as well. See <file:Documentation/sgi-visws.txt> for details.
383 418
384config X86_RDC321X 419config X86_SUMMIT
385 bool "RDC R-321x SoC" 420 bool "Summit/EXA (IBM x440)"
386 depends on X86_32 421 depends on X86_32_NON_STANDARD
387 select M486 422 ---help---
388 select X86_REBOOTFIXUPS 423 This option is needed for IBM systems that use the Summit/EXA chipset.
389 help 424 In particular, it is needed for the x440.
390 This option is needed for RDC R-321x system-on-chip, also known 425
391 as R-8610-(G). 426config X86_ES7000
392 If you don't have one of these chips, you should say N here. 427 bool "Unisys ES7000 IA32 series"
428 depends on X86_32_NON_STANDARD && X86_BIGSMP
429 ---help---
430 Support for Unisys ES7000 systems. Say 'Y' here if this kernel is
431 supposed to run on an IA32-based Unisys ES7000 system.
393 432
394config SCHED_OMIT_FRAME_POINTER 433config SCHED_OMIT_FRAME_POINTER
395 def_bool y 434 def_bool y
396 prompt "Single-depth WCHAN output" 435 prompt "Single-depth WCHAN output"
397 depends on X86 436 depends on X86
398 help 437 ---help---
399 Calculate simpler /proc/<PID>/wchan values. If this option 438 Calculate simpler /proc/<PID>/wchan values. If this option
400 is disabled then wchan values will recurse back to the 439 is disabled then wchan values will recurse back to the
401 caller function. This provides more accurate wchan values, 440 caller function. This provides more accurate wchan values,
@@ -405,7 +444,7 @@ config SCHED_OMIT_FRAME_POINTER
405 444
406menuconfig PARAVIRT_GUEST 445menuconfig PARAVIRT_GUEST
407 bool "Paravirtualized guest support" 446 bool "Paravirtualized guest support"
408 help 447 ---help---
409 Say Y here to get to see options related to running Linux under 448 Say Y here to get to see options related to running Linux under
410 various hypervisors. This option alone does not add any kernel code. 449 various hypervisors. This option alone does not add any kernel code.
411 450
@@ -419,8 +458,7 @@ config VMI
419 bool "VMI Guest support" 458 bool "VMI Guest support"
420 select PARAVIRT 459 select PARAVIRT
421 depends on X86_32 460 depends on X86_32
422 depends on !X86_VOYAGER 461 ---help---
423 help
424 VMI provides a paravirtualized interface to the VMware ESX server 462 VMI provides a paravirtualized interface to the VMware ESX server
425 (it could be used by other hypervisors in theory too, but is not 463 (it could be used by other hypervisors in theory too, but is not
426 at the moment), by linking the kernel to a GPL-ed ROM module 464 at the moment), by linking the kernel to a GPL-ed ROM module
@@ -430,8 +468,7 @@ config KVM_CLOCK
430 bool "KVM paravirtualized clock" 468 bool "KVM paravirtualized clock"
431 select PARAVIRT 469 select PARAVIRT
432 select PARAVIRT_CLOCK 470 select PARAVIRT_CLOCK
433 depends on !X86_VOYAGER 471 ---help---
434 help
435 Turning on this option will allow you to run a paravirtualized clock 472 Turning on this option will allow you to run a paravirtualized clock
436 when running over the KVM hypervisor. Instead of relying on a PIT 473 when running over the KVM hypervisor. Instead of relying on a PIT
437 (or probably other) emulation by the underlying device model, the host 474 (or probably other) emulation by the underlying device model, the host
@@ -441,17 +478,15 @@ config KVM_CLOCK
441config KVM_GUEST 478config KVM_GUEST
442 bool "KVM Guest support" 479 bool "KVM Guest support"
443 select PARAVIRT 480 select PARAVIRT
444 depends on !X86_VOYAGER 481 ---help---
445 help 482 This option enables various optimizations for running under the KVM
446 This option enables various optimizations for running under the KVM 483 hypervisor.
447 hypervisor.
448 484
449source "arch/x86/lguest/Kconfig" 485source "arch/x86/lguest/Kconfig"
450 486
451config PARAVIRT 487config PARAVIRT
452 bool "Enable paravirtualization code" 488 bool "Enable paravirtualization code"
453 depends on !X86_VOYAGER 489 ---help---
454 help
455 This changes the kernel so it can modify itself when it is run 490 This changes the kernel so it can modify itself when it is run
456 under a hypervisor, potentially improving performance significantly 491 under a hypervisor, potentially improving performance significantly
457 over full virtualization. However, when run without a hypervisor 492 over full virtualization. However, when run without a hypervisor
@@ -464,51 +499,51 @@ config PARAVIRT_CLOCK
464endif 499endif
465 500
466config PARAVIRT_DEBUG 501config PARAVIRT_DEBUG
467 bool "paravirt-ops debugging" 502 bool "paravirt-ops debugging"
468 depends on PARAVIRT && DEBUG_KERNEL 503 depends on PARAVIRT && DEBUG_KERNEL
469 help 504 ---help---
470 Enable to debug paravirt_ops internals. Specifically, BUG if 505 Enable to debug paravirt_ops internals. Specifically, BUG if
471 a paravirt_op is missing when it is called. 506 a paravirt_op is missing when it is called.
472 507
473config MEMTEST 508config MEMTEST
474 bool "Memtest" 509 bool "Memtest"
475 help 510 ---help---
476 This option adds a kernel parameter 'memtest', which allows memtest 511 This option adds a kernel parameter 'memtest', which allows memtest
477 to be set. 512 to be set.
478 memtest=0, mean disabled; -- default 513 memtest=0, mean disabled; -- default
479 memtest=1, mean do 1 test pattern; 514 memtest=1, mean do 1 test pattern;
480 ... 515 ...
481 memtest=4, mean do 4 test patterns. 516 memtest=4, mean do 4 test patterns.
482 If you are unsure how to answer this question, answer N. 517 If you are unsure how to answer this question, answer N.
483 518
484config X86_SUMMIT_NUMA 519config X86_SUMMIT_NUMA
485 def_bool y 520 def_bool y
486 depends on X86_32 && NUMA && X86_GENERICARCH 521 depends on X86_32 && NUMA && X86_32_NON_STANDARD
487 522
488config X86_CYCLONE_TIMER 523config X86_CYCLONE_TIMER
489 def_bool y 524 def_bool y
490 depends on X86_GENERICARCH 525 depends on X86_32_NON_STANDARD
491 526
492source "arch/x86/Kconfig.cpu" 527source "arch/x86/Kconfig.cpu"
493 528
494config HPET_TIMER 529config HPET_TIMER
495 def_bool X86_64 530 def_bool X86_64
496 prompt "HPET Timer Support" if X86_32 531 prompt "HPET Timer Support" if X86_32
497 help 532 ---help---
498 Use the IA-PC HPET (High Precision Event Timer) to manage 533 Use the IA-PC HPET (High Precision Event Timer) to manage
499 time in preference to the PIT and RTC, if a HPET is 534 time in preference to the PIT and RTC, if a HPET is
500 present. 535 present.
501 HPET is the next generation timer replacing legacy 8254s. 536 HPET is the next generation timer replacing legacy 8254s.
502 The HPET provides a stable time base on SMP 537 The HPET provides a stable time base on SMP
503 systems, unlike the TSC, but it is more expensive to access, 538 systems, unlike the TSC, but it is more expensive to access,
504 as it is off-chip. You can find the HPET spec at 539 as it is off-chip. You can find the HPET spec at
505 <http://www.intel.com/hardwaredesign/hpetspec_1.pdf>. 540 <http://www.intel.com/hardwaredesign/hpetspec_1.pdf>.
506 541
507 You can safely choose Y here. However, HPET will only be 542 You can safely choose Y here. However, HPET will only be
508 activated if the platform and the BIOS support this feature. 543 activated if the platform and the BIOS support this feature.
509 Otherwise the 8254 will be used for timing services. 544 Otherwise the 8254 will be used for timing services.
510 545
511 Choose N to continue using the legacy 8254 timer. 546 Choose N to continue using the legacy 8254 timer.
512 547
513config HPET_EMULATE_RTC 548config HPET_EMULATE_RTC
514 def_bool y 549 def_bool y
@@ -519,7 +554,7 @@ config HPET_EMULATE_RTC
519config DMI 554config DMI
520 default y 555 default y
521 bool "Enable DMI scanning" if EMBEDDED 556 bool "Enable DMI scanning" if EMBEDDED
522 help 557 ---help---
523 Enabled scanning of DMI to identify machine quirks. Say Y 558 Enabled scanning of DMI to identify machine quirks. Say Y
524 here unless you have verified that your setup is not 559 here unless you have verified that your setup is not
525 affected by entries in the DMI blacklist. Required by PNP 560 affected by entries in the DMI blacklist. Required by PNP
@@ -531,7 +566,7 @@ config GART_IOMMU
531 select SWIOTLB 566 select SWIOTLB
532 select AGP 567 select AGP
533 depends on X86_64 && PCI 568 depends on X86_64 && PCI
534 help 569 ---help---
535 Support for full DMA access of devices with 32bit memory access only 570 Support for full DMA access of devices with 32bit memory access only
536 on systems with more than 3GB. This is usually needed for USB, 571 on systems with more than 3GB. This is usually needed for USB,
537 sound, many IDE/SATA chipsets and some other devices. 572 sound, many IDE/SATA chipsets and some other devices.
@@ -546,7 +581,7 @@ config CALGARY_IOMMU
546 bool "IBM Calgary IOMMU support" 581 bool "IBM Calgary IOMMU support"
547 select SWIOTLB 582 select SWIOTLB
548 depends on X86_64 && PCI && EXPERIMENTAL 583 depends on X86_64 && PCI && EXPERIMENTAL
549 help 584 ---help---
550 Support for hardware IOMMUs in IBM's xSeries x366 and x460 585 Support for hardware IOMMUs in IBM's xSeries x366 and x460
551 systems. Needed to run systems with more than 3GB of memory 586 systems. Needed to run systems with more than 3GB of memory
552 properly with 32-bit PCI devices that do not support DAC 587 properly with 32-bit PCI devices that do not support DAC
@@ -564,7 +599,7 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
564 def_bool y 599 def_bool y
565 prompt "Should Calgary be enabled by default?" 600 prompt "Should Calgary be enabled by default?"
566 depends on CALGARY_IOMMU 601 depends on CALGARY_IOMMU
567 help 602 ---help---
568 Should Calgary be enabled by default? if you choose 'y', Calgary 603 Should Calgary be enabled by default? if you choose 'y', Calgary
569 will be used (if it exists). If you choose 'n', Calgary will not be 604 will be used (if it exists). If you choose 'n', Calgary will not be
570 used even if it exists. If you choose 'n' and would like to use 605 used even if it exists. If you choose 'n' and would like to use
@@ -576,7 +611,7 @@ config AMD_IOMMU
576 select SWIOTLB 611 select SWIOTLB
577 select PCI_MSI 612 select PCI_MSI
578 depends on X86_64 && PCI && ACPI 613 depends on X86_64 && PCI && ACPI
579 help 614 ---help---
580 With this option you can enable support for AMD IOMMU hardware in 615 With this option you can enable support for AMD IOMMU hardware in
581 your system. An IOMMU is a hardware component which provides 616 your system. An IOMMU is a hardware component which provides
582 remapping of DMA memory accesses from devices. With an AMD IOMMU you 617 remapping of DMA memory accesses from devices. With an AMD IOMMU you
@@ -591,7 +626,7 @@ config AMD_IOMMU_STATS
591 bool "Export AMD IOMMU statistics to debugfs" 626 bool "Export AMD IOMMU statistics to debugfs"
592 depends on AMD_IOMMU 627 depends on AMD_IOMMU
593 select DEBUG_FS 628 select DEBUG_FS
594 help 629 ---help---
595 This option enables code in the AMD IOMMU driver to collect various 630 This option enables code in the AMD IOMMU driver to collect various
596 statistics about whats happening in the driver and exports that 631 statistics about whats happening in the driver and exports that
597 information to userspace via debugfs. 632 information to userspace via debugfs.
@@ -600,7 +635,7 @@ config AMD_IOMMU_STATS
600# need this always selected by IOMMU for the VIA workaround 635# need this always selected by IOMMU for the VIA workaround
601config SWIOTLB 636config SWIOTLB
602 def_bool y if X86_64 637 def_bool y if X86_64
603 help 638 ---help---
604 Support for software bounce buffers used on x86-64 systems 639 Support for software bounce buffers used on x86-64 systems
605 which don't have a hardware IOMMU (e.g. the current generation 640 which don't have a hardware IOMMU (e.g. the current generation
606 of Intel's x86-64 CPUs). Using this PCI devices which can only 641 of Intel's x86-64 CPUs). Using this PCI devices which can only
@@ -618,7 +653,7 @@ config MAXSMP
618 depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL 653 depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL
619 select CPUMASK_OFFSTACK 654 select CPUMASK_OFFSTACK
620 default n 655 default n
621 help 656 ---help---
622 Configure maximum number of CPUS and NUMA Nodes for this architecture. 657 Configure maximum number of CPUS and NUMA Nodes for this architecture.
623 If unsure, say N. 658 If unsure, say N.
624 659
@@ -629,7 +664,7 @@ config NR_CPUS
629 default "4096" if MAXSMP 664 default "4096" if MAXSMP
630 default "32" if SMP && (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000) 665 default "32" if SMP && (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000)
631 default "8" if SMP 666 default "8" if SMP
632 help 667 ---help---
633 This allows you to specify the maximum number of CPUs which this 668 This allows you to specify the maximum number of CPUs which this
634 kernel will support. The maximum supported value is 512 and the 669 kernel will support. The maximum supported value is 512 and the
635 minimum value which makes sense is 2. 670 minimum value which makes sense is 2.
@@ -640,7 +675,7 @@ config NR_CPUS
640config SCHED_SMT 675config SCHED_SMT
641 bool "SMT (Hyperthreading) scheduler support" 676 bool "SMT (Hyperthreading) scheduler support"
642 depends on X86_HT 677 depends on X86_HT
643 help 678 ---help---
644 SMT scheduler support improves the CPU scheduler's decision making 679 SMT scheduler support improves the CPU scheduler's decision making
645 when dealing with Intel Pentium 4 chips with HyperThreading at a 680 when dealing with Intel Pentium 4 chips with HyperThreading at a
646 cost of slightly increased overhead in some places. If unsure say 681 cost of slightly increased overhead in some places. If unsure say
@@ -650,7 +685,7 @@ config SCHED_MC
650 def_bool y 685 def_bool y
651 prompt "Multi-core scheduler support" 686 prompt "Multi-core scheduler support"
652 depends on X86_HT 687 depends on X86_HT
653 help 688 ---help---
654 Multi-core scheduler support improves the CPU scheduler's decision 689 Multi-core scheduler support improves the CPU scheduler's decision
655 making when dealing with multi-core CPU chips at a cost of slightly 690 making when dealing with multi-core CPU chips at a cost of slightly
656 increased overhead in some places. If unsure say N here. 691 increased overhead in some places. If unsure say N here.
@@ -659,8 +694,8 @@ source "kernel/Kconfig.preempt"
659 694
660config X86_UP_APIC 695config X86_UP_APIC
661 bool "Local APIC support on uniprocessors" 696 bool "Local APIC support on uniprocessors"
662 depends on X86_32 && !SMP && !(X86_VOYAGER || X86_GENERICARCH) 697 depends on X86_32 && !SMP && !X86_32_NON_STANDARD
663 help 698 ---help---
664 A local APIC (Advanced Programmable Interrupt Controller) is an 699 A local APIC (Advanced Programmable Interrupt Controller) is an
665 integrated interrupt controller in the CPU. If you have a single-CPU 700 integrated interrupt controller in the CPU. If you have a single-CPU
666 system which has a processor with a local APIC, you can say Y here to 701 system which has a processor with a local APIC, you can say Y here to
@@ -673,7 +708,7 @@ config X86_UP_APIC
673config X86_UP_IOAPIC 708config X86_UP_IOAPIC
674 bool "IO-APIC support on uniprocessors" 709 bool "IO-APIC support on uniprocessors"
675 depends on X86_UP_APIC 710 depends on X86_UP_APIC
676 help 711 ---help---
677 An IO-APIC (I/O Advanced Programmable Interrupt Controller) is an 712 An IO-APIC (I/O Advanced Programmable Interrupt Controller) is an
678 SMP-capable replacement for PC-style interrupt controllers. Most 713 SMP-capable replacement for PC-style interrupt controllers. Most
679 SMP systems and many recent uniprocessor systems have one. 714 SMP systems and many recent uniprocessor systems have one.
@@ -684,11 +719,11 @@ config X86_UP_IOAPIC
684 719
685config X86_LOCAL_APIC 720config X86_LOCAL_APIC
686 def_bool y 721 def_bool y
687 depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH)) 722 depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
688 723
689config X86_IO_APIC 724config X86_IO_APIC
690 def_bool y 725 def_bool y
691 depends on X86_64 || (X86_32 && (X86_UP_IOAPIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH)) 726 depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
692 727
693config X86_VISWS_APIC 728config X86_VISWS_APIC
694 def_bool y 729 def_bool y
@@ -698,7 +733,7 @@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
698 bool "Reroute for broken boot IRQs" 733 bool "Reroute for broken boot IRQs"
699 default n 734 default n
700 depends on X86_IO_APIC 735 depends on X86_IO_APIC
701 help 736 ---help---
702 This option enables a workaround that fixes a source of 737 This option enables a workaround that fixes a source of
703 spurious interrupts. This is recommended when threaded 738 spurious interrupts. This is recommended when threaded
704 interrupt handling is used on systems where the generation of 739 interrupt handling is used on systems where the generation of
@@ -720,7 +755,6 @@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
720 755
721config X86_MCE 756config X86_MCE
722 bool "Machine Check Exception" 757 bool "Machine Check Exception"
723 depends on !X86_VOYAGER
724 ---help--- 758 ---help---
725 Machine Check Exception support allows the processor to notify the 759 Machine Check Exception support allows the processor to notify the
726 kernel if it detects a problem (e.g. overheating, component failure). 760 kernel if it detects a problem (e.g. overheating, component failure).
@@ -739,7 +773,7 @@ config X86_MCE_INTEL
739 def_bool y 773 def_bool y
740 prompt "Intel MCE features" 774 prompt "Intel MCE features"
741 depends on X86_64 && X86_MCE && X86_LOCAL_APIC 775 depends on X86_64 && X86_MCE && X86_LOCAL_APIC
742 help 776 ---help---
743 Additional support for intel specific MCE features such as 777 Additional support for intel specific MCE features such as
744 the thermal monitor. 778 the thermal monitor.
745 779
@@ -747,14 +781,19 @@ config X86_MCE_AMD
747 def_bool y 781 def_bool y
748 prompt "AMD MCE features" 782 prompt "AMD MCE features"
749 depends on X86_64 && X86_MCE && X86_LOCAL_APIC 783 depends on X86_64 && X86_MCE && X86_LOCAL_APIC
750 help 784 ---help---
751 Additional support for AMD specific MCE features such as 785 Additional support for AMD specific MCE features such as
752 the DRAM Error Threshold. 786 the DRAM Error Threshold.
753 787
788config X86_MCE_THRESHOLD
789 depends on X86_MCE_AMD || X86_MCE_INTEL
790 bool
791 default y
792
754config X86_MCE_NONFATAL 793config X86_MCE_NONFATAL
755 tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" 794 tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
756 depends on X86_32 && X86_MCE 795 depends on X86_32 && X86_MCE
757 help 796 ---help---
758 Enabling this feature starts a timer that triggers every 5 seconds which 797 Enabling this feature starts a timer that triggers every 5 seconds which
759 will look at the machine check registers to see if anything happened. 798 will look at the machine check registers to see if anything happened.
760 Non-fatal problems automatically get corrected (but still logged). 799 Non-fatal problems automatically get corrected (but still logged).
@@ -767,7 +806,7 @@ config X86_MCE_NONFATAL
767config X86_MCE_P4THERMAL 806config X86_MCE_P4THERMAL
768 bool "check for P4 thermal throttling interrupt." 807 bool "check for P4 thermal throttling interrupt."
769 depends on X86_32 && X86_MCE && (X86_UP_APIC || SMP) 808 depends on X86_32 && X86_MCE && (X86_UP_APIC || SMP)
770 help 809 ---help---
771 Enabling this feature will cause a message to be printed when the P4 810 Enabling this feature will cause a message to be printed when the P4
772 enters thermal throttling. 811 enters thermal throttling.
773 812
@@ -775,11 +814,11 @@ config VM86
775 bool "Enable VM86 support" if EMBEDDED 814 bool "Enable VM86 support" if EMBEDDED
776 default y 815 default y
777 depends on X86_32 816 depends on X86_32
778 help 817 ---help---
779 This option is required by programs like DOSEMU to run 16-bit legacy 818 This option is required by programs like DOSEMU to run 16-bit legacy
780 code on X86 processors. It also may be needed by software like 819 code on X86 processors. It also may be needed by software like
781 XFree86 to initialize some video cards via BIOS. Disabling this 820 XFree86 to initialize some video cards via BIOS. Disabling this
782 option saves about 6k. 821 option saves about 6k.
783 822
784config TOSHIBA 823config TOSHIBA
785 tristate "Toshiba Laptop support" 824 tristate "Toshiba Laptop support"
@@ -853,33 +892,33 @@ config MICROCODE
853 module will be called microcode. 892 module will be called microcode.
854 893
855config MICROCODE_INTEL 894config MICROCODE_INTEL
856 bool "Intel microcode patch loading support" 895 bool "Intel microcode patch loading support"
857 depends on MICROCODE 896 depends on MICROCODE
858 default MICROCODE 897 default MICROCODE
859 select FW_LOADER 898 select FW_LOADER
860 --help--- 899 ---help---
861 This options enables microcode patch loading support for Intel 900 This options enables microcode patch loading support for Intel
862 processors. 901 processors.
863 902
864 For latest news and information on obtaining all the required 903 For latest news and information on obtaining all the required
865 Intel ingredients for this driver, check: 904 Intel ingredients for this driver, check:
866 <http://www.urbanmyth.org/microcode/>. 905 <http://www.urbanmyth.org/microcode/>.
867 906
868config MICROCODE_AMD 907config MICROCODE_AMD
869 bool "AMD microcode patch loading support" 908 bool "AMD microcode patch loading support"
870 depends on MICROCODE 909 depends on MICROCODE
871 select FW_LOADER 910 select FW_LOADER
872 --help--- 911 ---help---
873 If you select this option, microcode patch loading support for AMD 912 If you select this option, microcode patch loading support for AMD
874 processors will be enabled. 913 processors will be enabled.
875 914
876 config MICROCODE_OLD_INTERFACE 915config MICROCODE_OLD_INTERFACE
877 def_bool y 916 def_bool y
878 depends on MICROCODE 917 depends on MICROCODE
879 918
880config X86_MSR 919config X86_MSR
881 tristate "/dev/cpu/*/msr - Model-specific register support" 920 tristate "/dev/cpu/*/msr - Model-specific register support"
882 help 921 ---help---
883 This device gives privileged processes access to the x86 922 This device gives privileged processes access to the x86
884 Model-Specific Registers (MSRs). It is a character device with 923 Model-Specific Registers (MSRs). It is a character device with
885 major 202 and minors 0 to 31 for /dev/cpu/0/msr to /dev/cpu/31/msr. 924 major 202 and minors 0 to 31 for /dev/cpu/0/msr to /dev/cpu/31/msr.
@@ -888,12 +927,18 @@ config X86_MSR
888 927
889config X86_CPUID 928config X86_CPUID
890 tristate "/dev/cpu/*/cpuid - CPU information support" 929 tristate "/dev/cpu/*/cpuid - CPU information support"
891 help 930 ---help---
892 This device gives processes access to the x86 CPUID instruction to 931 This device gives processes access to the x86 CPUID instruction to
893 be executed on a specific processor. It is a character device 932 be executed on a specific processor. It is a character device
894 with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to 933 with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
895 /dev/cpu/31/cpuid. 934 /dev/cpu/31/cpuid.
896 935
936config X86_CPU_DEBUG
937 tristate "/sys/kernel/debug/x86/cpu/* - CPU Debug support"
938 ---help---
939 If you select this option, this will provide various x86 CPUs
940 information through debugfs.
941
897choice 942choice
898 prompt "High Memory Support" 943 prompt "High Memory Support"
899 default HIGHMEM4G if !X86_NUMAQ 944 default HIGHMEM4G if !X86_NUMAQ
@@ -940,7 +985,7 @@ config NOHIGHMEM
940config HIGHMEM4G 985config HIGHMEM4G
941 bool "4GB" 986 bool "4GB"
942 depends on !X86_NUMAQ 987 depends on !X86_NUMAQ
943 help 988 ---help---
944 Select this if you have a 32-bit processor and between 1 and 4 989 Select this if you have a 32-bit processor and between 1 and 4
945 gigabytes of physical RAM. 990 gigabytes of physical RAM.
946 991
@@ -948,7 +993,7 @@ config HIGHMEM64G
948 bool "64GB" 993 bool "64GB"
949 depends on !M386 && !M486 994 depends on !M386 && !M486
950 select X86_PAE 995 select X86_PAE
951 help 996 ---help---
952 Select this if you have a 32-bit processor and more than 4 997 Select this if you have a 32-bit processor and more than 4
953 gigabytes of physical RAM. 998 gigabytes of physical RAM.
954 999
@@ -959,7 +1004,7 @@ choice
959 prompt "Memory split" if EMBEDDED 1004 prompt "Memory split" if EMBEDDED
960 default VMSPLIT_3G 1005 default VMSPLIT_3G
961 depends on X86_32 1006 depends on X86_32
962 help 1007 ---help---
963 Select the desired split between kernel and user memory. 1008 Select the desired split between kernel and user memory.
964 1009
965 If the address range available to the kernel is less than the 1010 If the address range available to the kernel is less than the
@@ -1005,20 +1050,20 @@ config HIGHMEM
1005config X86_PAE 1050config X86_PAE
1006 bool "PAE (Physical Address Extension) Support" 1051 bool "PAE (Physical Address Extension) Support"
1007 depends on X86_32 && !HIGHMEM4G 1052 depends on X86_32 && !HIGHMEM4G
1008 help 1053 ---help---
1009 PAE is required for NX support, and furthermore enables 1054 PAE is required for NX support, and furthermore enables
1010 larger swapspace support for non-overcommit purposes. It 1055 larger swapspace support for non-overcommit purposes. It
1011 has the cost of more pagetable lookup overhead, and also 1056 has the cost of more pagetable lookup overhead, and also
1012 consumes more pagetable space per process. 1057 consumes more pagetable space per process.
1013 1058
1014config ARCH_PHYS_ADDR_T_64BIT 1059config ARCH_PHYS_ADDR_T_64BIT
1015 def_bool X86_64 || X86_PAE 1060 def_bool X86_64 || X86_PAE
1016 1061
1017config DIRECT_GBPAGES 1062config DIRECT_GBPAGES
1018 bool "Enable 1GB pages for kernel pagetables" if EMBEDDED 1063 bool "Enable 1GB pages for kernel pagetables" if EMBEDDED
1019 default y 1064 default y
1020 depends on X86_64 1065 depends on X86_64
1021 help 1066 ---help---
1022 Allow the kernel linear mapping to use 1GB pages on CPUs that 1067 Allow the kernel linear mapping to use 1GB pages on CPUs that
1023 support it. This can improve the kernel's performance a tiny bit by 1068 support it. This can improve the kernel's performance a tiny bit by
1024 reducing TLB pressure. If in doubt, say "Y". 1069 reducing TLB pressure. If in doubt, say "Y".
@@ -1028,9 +1073,8 @@ config NUMA
1028 bool "Numa Memory Allocation and Scheduler Support" 1073 bool "Numa Memory Allocation and Scheduler Support"
1029 depends on SMP 1074 depends on SMP
1030 depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL) 1075 depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL)
1031 default n if X86_PC
1032 default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP) 1076 default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP)
1033 help 1077 ---help---
1034 Enable NUMA (Non Uniform Memory Access) support. 1078 Enable NUMA (Non Uniform Memory Access) support.
1035 1079
1036 The kernel will try to allocate memory used by a CPU on the 1080 The kernel will try to allocate memory used by a CPU on the
@@ -1053,19 +1097,19 @@ config K8_NUMA
1053 def_bool y 1097 def_bool y
1054 prompt "Old style AMD Opteron NUMA detection" 1098 prompt "Old style AMD Opteron NUMA detection"
1055 depends on X86_64 && NUMA && PCI 1099 depends on X86_64 && NUMA && PCI
1056 help 1100 ---help---
1057 Enable K8 NUMA node topology detection. You should say Y here if 1101 Enable K8 NUMA node topology detection. You should say Y here if
1058 you have a multi processor AMD K8 system. This uses an old 1102 you have a multi processor AMD K8 system. This uses an old
1059 method to read the NUMA configuration directly from the builtin 1103 method to read the NUMA configuration directly from the builtin
1060 Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA 1104 Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA
1061 instead, which also takes priority if both are compiled in. 1105 instead, which also takes priority if both are compiled in.
1062 1106
1063config X86_64_ACPI_NUMA 1107config X86_64_ACPI_NUMA
1064 def_bool y 1108 def_bool y
1065 prompt "ACPI NUMA detection" 1109 prompt "ACPI NUMA detection"
1066 depends on X86_64 && NUMA && ACPI && PCI 1110 depends on X86_64 && NUMA && ACPI && PCI
1067 select ACPI_NUMA 1111 select ACPI_NUMA
1068 help 1112 ---help---
1069 Enable ACPI SRAT based node topology detection. 1113 Enable ACPI SRAT based node topology detection.
1070 1114
1071# Some NUMA nodes have memory ranges that span 1115# Some NUMA nodes have memory ranges that span
@@ -1080,7 +1124,7 @@ config NODES_SPAN_OTHER_NODES
1080config NUMA_EMU 1124config NUMA_EMU
1081 bool "NUMA emulation" 1125 bool "NUMA emulation"
1082 depends on X86_64 && NUMA 1126 depends on X86_64 && NUMA
1083 help 1127 ---help---
1084 Enable NUMA emulation. A flat machine will be split 1128 Enable NUMA emulation. A flat machine will be split
1085 into virtual nodes when booted with "numa=fake=N", where N is the 1129 into virtual nodes when booted with "numa=fake=N", where N is the
1086 number of nodes. This is only useful for debugging. 1130 number of nodes. This is only useful for debugging.
@@ -1093,11 +1137,11 @@ config NODES_SHIFT
1093 default "4" if X86_NUMAQ 1137 default "4" if X86_NUMAQ
1094 default "3" 1138 default "3"
1095 depends on NEED_MULTIPLE_NODES 1139 depends on NEED_MULTIPLE_NODES
1096 help 1140 ---help---
1097 Specify the maximum number of NUMA Nodes available on the target 1141 Specify the maximum number of NUMA Nodes available on the target
1098 system. Increases memory reserved to accomodate various tables. 1142 system. Increases memory reserved to accomodate various tables.
1099 1143
1100config HAVE_ARCH_BOOTMEM_NODE 1144config HAVE_ARCH_BOOTMEM
1101 def_bool y 1145 def_bool y
1102 depends on X86_32 && NUMA 1146 depends on X86_32 && NUMA
1103 1147
@@ -1131,7 +1175,7 @@ config ARCH_SPARSEMEM_DEFAULT
1131 1175
1132config ARCH_SPARSEMEM_ENABLE 1176config ARCH_SPARSEMEM_ENABLE
1133 def_bool y 1177 def_bool y
1134 depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC) || X86_GENERICARCH 1178 depends on X86_64 || NUMA || (EXPERIMENTAL && X86_32) || X86_32_NON_STANDARD
1135 select SPARSEMEM_STATIC if X86_32 1179 select SPARSEMEM_STATIC if X86_32
1136 select SPARSEMEM_VMEMMAP_ENABLE if X86_64 1180 select SPARSEMEM_VMEMMAP_ENABLE if X86_64
1137 1181
@@ -1148,61 +1192,61 @@ source "mm/Kconfig"
1148config HIGHPTE 1192config HIGHPTE
1149 bool "Allocate 3rd-level pagetables from highmem" 1193 bool "Allocate 3rd-level pagetables from highmem"
1150 depends on X86_32 && (HIGHMEM4G || HIGHMEM64G) 1194 depends on X86_32 && (HIGHMEM4G || HIGHMEM64G)
1151 help 1195 ---help---
1152 The VM uses one page table entry for each page of physical memory. 1196 The VM uses one page table entry for each page of physical memory.
1153 For systems with a lot of RAM, this can be wasteful of precious 1197 For systems with a lot of RAM, this can be wasteful of precious
1154 low memory. Setting this option will put user-space page table 1198 low memory. Setting this option will put user-space page table
1155 entries in high memory. 1199 entries in high memory.
1156 1200
1157config X86_CHECK_BIOS_CORRUPTION 1201config X86_CHECK_BIOS_CORRUPTION
1158 bool "Check for low memory corruption" 1202 bool "Check for low memory corruption"
1159 help 1203 ---help---
1160 Periodically check for memory corruption in low memory, which 1204 Periodically check for memory corruption in low memory, which
1161 is suspected to be caused by BIOS. Even when enabled in the 1205 is suspected to be caused by BIOS. Even when enabled in the
1162 configuration, it is disabled at runtime. Enable it by 1206 configuration, it is disabled at runtime. Enable it by
1163 setting "memory_corruption_check=1" on the kernel command 1207 setting "memory_corruption_check=1" on the kernel command
1164 line. By default it scans the low 64k of memory every 60 1208 line. By default it scans the low 64k of memory every 60
1165 seconds; see the memory_corruption_check_size and 1209 seconds; see the memory_corruption_check_size and
1166 memory_corruption_check_period parameters in 1210 memory_corruption_check_period parameters in
1167 Documentation/kernel-parameters.txt to adjust this. 1211 Documentation/kernel-parameters.txt to adjust this.
1168 1212
1169 When enabled with the default parameters, this option has 1213 When enabled with the default parameters, this option has
1170 almost no overhead, as it reserves a relatively small amount 1214 almost no overhead, as it reserves a relatively small amount
1171 of memory and scans it infrequently. It both detects corruption 1215 of memory and scans it infrequently. It both detects corruption
1172 and prevents it from affecting the running system. 1216 and prevents it from affecting the running system.
1173 1217
1174 It is, however, intended as a diagnostic tool; if repeatable 1218 It is, however, intended as a diagnostic tool; if repeatable
1175 BIOS-originated corruption always affects the same memory, 1219 BIOS-originated corruption always affects the same memory,
1176 you can use memmap= to prevent the kernel from using that 1220 you can use memmap= to prevent the kernel from using that
1177 memory. 1221 memory.
1178 1222
1179config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK 1223config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
1180 bool "Set the default setting of memory_corruption_check" 1224 bool "Set the default setting of memory_corruption_check"
1181 depends on X86_CHECK_BIOS_CORRUPTION 1225 depends on X86_CHECK_BIOS_CORRUPTION
1182 default y 1226 default y
1183 help 1227 ---help---
1184 Set whether the default state of memory_corruption_check is 1228 Set whether the default state of memory_corruption_check is
1185 on or off. 1229 on or off.
1186 1230
1187config X86_RESERVE_LOW_64K 1231config X86_RESERVE_LOW_64K
1188 bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen" 1232 bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen"
1189 default y 1233 default y
1190 help 1234 ---help---
1191 Reserve the first 64K of physical RAM on BIOSes that are known 1235 Reserve the first 64K of physical RAM on BIOSes that are known
1192 to potentially corrupt that memory range. A numbers of BIOSes are 1236 to potentially corrupt that memory range. A numbers of BIOSes are
1193 known to utilize this area during suspend/resume, so it must not 1237 known to utilize this area during suspend/resume, so it must not
1194 be used by the kernel. 1238 be used by the kernel.
1195 1239
1196 Set this to N if you are absolutely sure that you trust the BIOS 1240 Set this to N if you are absolutely sure that you trust the BIOS
1197 to get all its memory reservations and usages right. 1241 to get all its memory reservations and usages right.
1198 1242
1199 If you have doubts about the BIOS (e.g. suspend/resume does not 1243 If you have doubts about the BIOS (e.g. suspend/resume does not
1200 work or there's kernel crashes after certain hardware hotplug 1244 work or there's kernel crashes after certain hardware hotplug
1201 events) and it's not AMI or Phoenix, then you might want to enable 1245 events) and it's not AMI or Phoenix, then you might want to enable
1202 X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical 1246 X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical
1203 corruption patterns. 1247 corruption patterns.
1204 1248
1205 Say Y if unsure. 1249 Say Y if unsure.
1206 1250
1207config MATH_EMULATION 1251config MATH_EMULATION
1208 bool 1252 bool
@@ -1268,7 +1312,7 @@ config MTRR_SANITIZER
1268 def_bool y 1312 def_bool y
1269 prompt "MTRR cleanup support" 1313 prompt "MTRR cleanup support"
1270 depends on MTRR 1314 depends on MTRR
1271 help 1315 ---help---
1272 Convert MTRR layout from continuous to discrete, so X drivers can 1316 Convert MTRR layout from continuous to discrete, so X drivers can
1273 add writeback entries. 1317 add writeback entries.
1274 1318
@@ -1283,7 +1327,7 @@ config MTRR_SANITIZER_ENABLE_DEFAULT
1283 range 0 1 1327 range 0 1
1284 default "0" 1328 default "0"
1285 depends on MTRR_SANITIZER 1329 depends on MTRR_SANITIZER
1286 help 1330 ---help---
1287 Enable mtrr cleanup default value 1331 Enable mtrr cleanup default value
1288 1332
1289config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT 1333config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
@@ -1291,7 +1335,7 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
1291 range 0 7 1335 range 0 7
1292 default "1" 1336 default "1"
1293 depends on MTRR_SANITIZER 1337 depends on MTRR_SANITIZER
1294 help 1338 ---help---
1295 mtrr cleanup spare entries default, it can be changed via 1339 mtrr cleanup spare entries default, it can be changed via
1296 mtrr_spare_reg_nr=N on the kernel command line. 1340 mtrr_spare_reg_nr=N on the kernel command line.
1297 1341
@@ -1299,7 +1343,7 @@ config X86_PAT
1299 bool 1343 bool
1300 prompt "x86 PAT support" 1344 prompt "x86 PAT support"
1301 depends on MTRR 1345 depends on MTRR
1302 help 1346 ---help---
1303 Use PAT attributes to setup page level cache control. 1347 Use PAT attributes to setup page level cache control.
1304 1348
1305 PATs are the modern equivalents of MTRRs and are much more 1349 PATs are the modern equivalents of MTRRs and are much more
@@ -1314,20 +1358,20 @@ config EFI
1314 bool "EFI runtime service support" 1358 bool "EFI runtime service support"
1315 depends on ACPI 1359 depends on ACPI
1316 ---help--- 1360 ---help---
1317 This enables the kernel to use EFI runtime services that are 1361 This enables the kernel to use EFI runtime services that are
1318 available (such as the EFI variable services). 1362 available (such as the EFI variable services).
1319 1363
1320 This option is only useful on systems that have EFI firmware. 1364 This option is only useful on systems that have EFI firmware.
1321 In addition, you should use the latest ELILO loader available 1365 In addition, you should use the latest ELILO loader available
1322 at <http://elilo.sourceforge.net> in order to take advantage 1366 at <http://elilo.sourceforge.net> in order to take advantage
1323 of EFI runtime services. However, even with this option, the 1367 of EFI runtime services. However, even with this option, the
1324 resultant kernel should continue to boot on existing non-EFI 1368 resultant kernel should continue to boot on existing non-EFI
1325 platforms. 1369 platforms.
1326 1370
1327config SECCOMP 1371config SECCOMP
1328 def_bool y 1372 def_bool y
1329 prompt "Enable seccomp to safely compute untrusted bytecode" 1373 prompt "Enable seccomp to safely compute untrusted bytecode"
1330 help 1374 ---help---
1331 This kernel feature is useful for number crunching applications 1375 This kernel feature is useful for number crunching applications
1332 that may need to compute untrusted bytecode during their 1376 that may need to compute untrusted bytecode during their
1333 execution. By using pipes or other transports made available to 1377 execution. By using pipes or other transports made available to
@@ -1340,13 +1384,16 @@ config SECCOMP
1340 1384
1341 If unsure, say Y. Only embedded should say N here. 1385 If unsure, say Y. Only embedded should say N here.
1342 1386
1387config CC_STACKPROTECTOR_ALL
1388 bool
1389
1343config CC_STACKPROTECTOR 1390config CC_STACKPROTECTOR
1344 bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)" 1391 bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
1345 depends on X86_64 && EXPERIMENTAL && BROKEN 1392 select CC_STACKPROTECTOR_ALL
1346 help 1393 ---help---
1347 This option turns on the -fstack-protector GCC feature. This 1394 This option turns on the -fstack-protector GCC feature. This
1348 feature puts, at the beginning of critical functions, a canary 1395 feature puts, at the beginning of functions, a canary value on
1349 value on the stack just before the return address, and validates 1396 the stack just before the return address, and validates
1350 the value just before actually returning. Stack based buffer 1397 the value just before actually returning. Stack based buffer
1351 overflows (that need to overwrite this return address) now also 1398 overflows (that need to overwrite this return address) now also
1352 overwrite the canary, which gets detected and the attack is then 1399 overwrite the canary, which gets detected and the attack is then
@@ -1354,22 +1401,14 @@ config CC_STACKPROTECTOR
1354 1401
1355 This feature requires gcc version 4.2 or above, or a distribution 1402 This feature requires gcc version 4.2 or above, or a distribution
1356 gcc with the feature backported. Older versions are automatically 1403 gcc with the feature backported. Older versions are automatically
1357 detected and for those versions, this configuration option is ignored. 1404 detected and for those versions, this configuration option is
1358 1405 ignored. (and a warning is printed during bootup)
1359config CC_STACKPROTECTOR_ALL
1360 bool "Use stack-protector for all functions"
1361 depends on CC_STACKPROTECTOR
1362 help
1363 Normally, GCC only inserts the canary value protection for
1364 functions that use large-ish on-stack buffers. By enabling
1365 this option, GCC will be asked to do this for ALL functions.
1366 1406
1367source kernel/Kconfig.hz 1407source kernel/Kconfig.hz
1368 1408
1369config KEXEC 1409config KEXEC
1370 bool "kexec system call" 1410 bool "kexec system call"
1371 depends on X86_BIOS_REBOOT 1411 ---help---
1372 help
1373 kexec is a system call that implements the ability to shutdown your 1412 kexec is a system call that implements the ability to shutdown your
1374 current kernel, and to start another kernel. It is like a reboot 1413 current kernel, and to start another kernel. It is like a reboot
1375 but it is independent of the system firmware. And like a reboot 1414 but it is independent of the system firmware. And like a reboot
@@ -1386,7 +1425,7 @@ config KEXEC
1386config CRASH_DUMP 1425config CRASH_DUMP
1387 bool "kernel crash dumps" 1426 bool "kernel crash dumps"
1388 depends on X86_64 || (X86_32 && HIGHMEM) 1427 depends on X86_64 || (X86_32 && HIGHMEM)
1389 help 1428 ---help---
1390 Generate crash dump after being started by kexec. 1429 Generate crash dump after being started by kexec.
1391 This should be normally only set in special crash dump kernels 1430 This should be normally only set in special crash dump kernels
1392 which are loaded in the main kernel with kexec-tools into 1431 which are loaded in the main kernel with kexec-tools into
@@ -1400,8 +1439,8 @@ config CRASH_DUMP
1400config KEXEC_JUMP 1439config KEXEC_JUMP
1401 bool "kexec jump (EXPERIMENTAL)" 1440 bool "kexec jump (EXPERIMENTAL)"
1402 depends on EXPERIMENTAL 1441 depends on EXPERIMENTAL
1403 depends on KEXEC && HIBERNATION && X86_32 1442 depends on KEXEC && HIBERNATION
1404 help 1443 ---help---
1405 Jump between original kernel and kexeced kernel and invoke 1444 Jump between original kernel and kexeced kernel and invoke
1406 code in physical address mode via KEXEC 1445 code in physical address mode via KEXEC
1407 1446
@@ -1410,7 +1449,7 @@ config PHYSICAL_START
1410 default "0x1000000" if X86_NUMAQ 1449 default "0x1000000" if X86_NUMAQ
1411 default "0x200000" if X86_64 1450 default "0x200000" if X86_64
1412 default "0x100000" 1451 default "0x100000"
1413 help 1452 ---help---
1414 This gives the physical address where the kernel is loaded. 1453 This gives the physical address where the kernel is loaded.
1415 1454
1416 If kernel is a not relocatable (CONFIG_RELOCATABLE=n) then 1455 If kernel is a not relocatable (CONFIG_RELOCATABLE=n) then
@@ -1451,7 +1490,7 @@ config PHYSICAL_START
1451config RELOCATABLE 1490config RELOCATABLE
1452 bool "Build a relocatable kernel (EXPERIMENTAL)" 1491 bool "Build a relocatable kernel (EXPERIMENTAL)"
1453 depends on EXPERIMENTAL 1492 depends on EXPERIMENTAL
1454 help 1493 ---help---
1455 This builds a kernel image that retains relocation information 1494 This builds a kernel image that retains relocation information
1456 so it can be loaded someplace besides the default 1MB. 1495 so it can be loaded someplace besides the default 1MB.
1457 The relocations tend to make the kernel binary about 10% larger, 1496 The relocations tend to make the kernel binary about 10% larger,
@@ -1471,7 +1510,7 @@ config PHYSICAL_ALIGN
1471 default "0x100000" if X86_32 1510 default "0x100000" if X86_32
1472 default "0x200000" if X86_64 1511 default "0x200000" if X86_64
1473 range 0x2000 0x400000 1512 range 0x2000 0x400000
1474 help 1513 ---help---
1475 This value puts the alignment restrictions on physical address 1514 This value puts the alignment restrictions on physical address
1476 where kernel is loaded and run from. Kernel is compiled for an 1515 where kernel is loaded and run from. Kernel is compiled for an
1477 address which meets above alignment restriction. 1516 address which meets above alignment restriction.
@@ -1492,7 +1531,7 @@ config PHYSICAL_ALIGN
1492 1531
1493config HOTPLUG_CPU 1532config HOTPLUG_CPU
1494 bool "Support for hot-pluggable CPUs" 1533 bool "Support for hot-pluggable CPUs"
1495 depends on SMP && HOTPLUG && !X86_VOYAGER 1534 depends on SMP && HOTPLUG
1496 ---help--- 1535 ---help---
1497 Say Y here to allow turning CPUs off and on. CPUs can be 1536 Say Y here to allow turning CPUs off and on. CPUs can be
1498 controlled through /sys/devices/system/cpu. 1537 controlled through /sys/devices/system/cpu.
@@ -1504,7 +1543,7 @@ config COMPAT_VDSO
1504 def_bool y 1543 def_bool y
1505 prompt "Compat VDSO support" 1544 prompt "Compat VDSO support"
1506 depends on X86_32 || IA32_EMULATION 1545 depends on X86_32 || IA32_EMULATION
1507 help 1546 ---help---
1508 Map the 32-bit VDSO to the predictable old-style address too. 1547 Map the 32-bit VDSO to the predictable old-style address too.
1509 ---help--- 1548 ---help---
1510 Say N here if you are running a sufficiently recent glibc 1549 Say N here if you are running a sufficiently recent glibc
@@ -1516,7 +1555,7 @@ config COMPAT_VDSO
1516config CMDLINE_BOOL 1555config CMDLINE_BOOL
1517 bool "Built-in kernel command line" 1556 bool "Built-in kernel command line"
1518 default n 1557 default n
1519 help 1558 ---help---
1520 Allow for specifying boot arguments to the kernel at 1559 Allow for specifying boot arguments to the kernel at
1521 build time. On some systems (e.g. embedded ones), it is 1560 build time. On some systems (e.g. embedded ones), it is
1522 necessary or convenient to provide some or all of the 1561 necessary or convenient to provide some or all of the
@@ -1534,7 +1573,7 @@ config CMDLINE
1534 string "Built-in kernel command string" 1573 string "Built-in kernel command string"
1535 depends on CMDLINE_BOOL 1574 depends on CMDLINE_BOOL
1536 default "" 1575 default ""
1537 help 1576 ---help---
1538 Enter arguments here that should be compiled into the kernel 1577 Enter arguments here that should be compiled into the kernel
1539 image and used at boot time. If the boot loader provides a 1578 image and used at boot time. If the boot loader provides a
1540 command line at boot time, it is appended to this string to 1579 command line at boot time, it is appended to this string to
@@ -1551,7 +1590,7 @@ config CMDLINE_OVERRIDE
1551 bool "Built-in command line overrides boot loader arguments" 1590 bool "Built-in command line overrides boot loader arguments"
1552 default n 1591 default n
1553 depends on CMDLINE_BOOL 1592 depends on CMDLINE_BOOL
1554 help 1593 ---help---
1555 Set this option to 'Y' to have the kernel ignore the boot loader 1594 Set this option to 'Y' to have the kernel ignore the boot loader
1556 command line, and use ONLY the built-in command line. 1595 command line, and use ONLY the built-in command line.
1557 1596
@@ -1573,7 +1612,6 @@ config HAVE_ARCH_EARLY_PFN_TO_NID
1573 depends on NUMA 1612 depends on NUMA
1574 1613
1575menu "Power management and ACPI options" 1614menu "Power management and ACPI options"
1576 depends on !X86_VOYAGER
1577 1615
1578config ARCH_HIBERNATION_HEADER 1616config ARCH_HIBERNATION_HEADER
1579 def_bool y 1617 def_bool y
@@ -1651,7 +1689,7 @@ if APM
1651 1689
1652config APM_IGNORE_USER_SUSPEND 1690config APM_IGNORE_USER_SUSPEND
1653 bool "Ignore USER SUSPEND" 1691 bool "Ignore USER SUSPEND"
1654 help 1692 ---help---
1655 This option will ignore USER SUSPEND requests. On machines with a 1693 This option will ignore USER SUSPEND requests. On machines with a
1656 compliant APM BIOS, you want to say N. However, on the NEC Versa M 1694 compliant APM BIOS, you want to say N. However, on the NEC Versa M
1657 series notebooks, it is necessary to say Y because of a BIOS bug. 1695 series notebooks, it is necessary to say Y because of a BIOS bug.
@@ -1675,7 +1713,7 @@ config APM_DO_ENABLE
1675 1713
1676config APM_CPU_IDLE 1714config APM_CPU_IDLE
1677 bool "Make CPU Idle calls when idle" 1715 bool "Make CPU Idle calls when idle"
1678 help 1716 ---help---
1679 Enable calls to APM CPU Idle/CPU Busy inside the kernel's idle loop. 1717 Enable calls to APM CPU Idle/CPU Busy inside the kernel's idle loop.
1680 On some machines, this can activate improved power savings, such as 1718 On some machines, this can activate improved power savings, such as
1681 a slowed CPU clock rate, when the machine is idle. These idle calls 1719 a slowed CPU clock rate, when the machine is idle. These idle calls
@@ -1686,7 +1724,7 @@ config APM_CPU_IDLE
1686 1724
1687config APM_DISPLAY_BLANK 1725config APM_DISPLAY_BLANK
1688 bool "Enable console blanking using APM" 1726 bool "Enable console blanking using APM"
1689 help 1727 ---help---
1690 Enable console blanking using the APM. Some laptops can use this to 1728 Enable console blanking using the APM. Some laptops can use this to
1691 turn off the LCD backlight when the screen blanker of the Linux 1729 turn off the LCD backlight when the screen blanker of the Linux
1692 virtual console blanks the screen. Note that this is only used by 1730 virtual console blanks the screen. Note that this is only used by
@@ -1699,7 +1737,7 @@ config APM_DISPLAY_BLANK
1699 1737
1700config APM_ALLOW_INTS 1738config APM_ALLOW_INTS
1701 bool "Allow interrupts during APM BIOS calls" 1739 bool "Allow interrupts during APM BIOS calls"
1702 help 1740 ---help---
1703 Normally we disable external interrupts while we are making calls to 1741 Normally we disable external interrupts while we are making calls to
1704 the APM BIOS as a measure to lessen the effects of a badly behaving 1742 the APM BIOS as a measure to lessen the effects of a badly behaving
1705 BIOS implementation. The BIOS should reenable interrupts if it 1743 BIOS implementation. The BIOS should reenable interrupts if it
@@ -1724,7 +1762,7 @@ config PCI
1724 bool "PCI support" 1762 bool "PCI support"
1725 default y 1763 default y
1726 select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC) 1764 select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC)
1727 help 1765 ---help---
1728 Find out whether you have a PCI motherboard. PCI is the name of a 1766 Find out whether you have a PCI motherboard. PCI is the name of a
1729 bus system, i.e. the way the CPU talks to the other stuff inside 1767 bus system, i.e. the way the CPU talks to the other stuff inside
1730 your box. Other bus systems are ISA, EISA, MicroChannel (MCA) or 1768 your box. Other bus systems are ISA, EISA, MicroChannel (MCA) or
@@ -1795,7 +1833,7 @@ config PCI_MMCONFIG
1795config DMAR 1833config DMAR
1796 bool "Support for DMA Remapping Devices (EXPERIMENTAL)" 1834 bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
1797 depends on X86_64 && PCI_MSI && ACPI && EXPERIMENTAL 1835 depends on X86_64 && PCI_MSI && ACPI && EXPERIMENTAL
1798 help 1836 ---help---
1799 DMA remapping (DMAR) devices support enables independent address 1837 DMA remapping (DMAR) devices support enables independent address
1800 translations for Direct Memory Access (DMA) from devices. 1838 translations for Direct Memory Access (DMA) from devices.
1801 These DMA remapping devices are reported via ACPI tables 1839 These DMA remapping devices are reported via ACPI tables
@@ -1817,29 +1855,30 @@ config DMAR_GFX_WA
1817 def_bool y 1855 def_bool y
1818 prompt "Support for Graphics workaround" 1856 prompt "Support for Graphics workaround"
1819 depends on DMAR 1857 depends on DMAR
1820 help 1858 ---help---
1821 Current Graphics drivers tend to use physical address 1859 Current Graphics drivers tend to use physical address
1822 for DMA and avoid using DMA APIs. Setting this config 1860 for DMA and avoid using DMA APIs. Setting this config
1823 option permits the IOMMU driver to set a unity map for 1861 option permits the IOMMU driver to set a unity map for
1824 all the OS-visible memory. Hence the driver can continue 1862 all the OS-visible memory. Hence the driver can continue
1825 to use physical addresses for DMA. 1863 to use physical addresses for DMA.
1826 1864
1827config DMAR_FLOPPY_WA 1865config DMAR_FLOPPY_WA
1828 def_bool y 1866 def_bool y
1829 depends on DMAR 1867 depends on DMAR
1830 help 1868 ---help---
1831 Floppy disk drivers are know to bypass DMA API calls 1869 Floppy disk drivers are know to bypass DMA API calls
1832 thereby failing to work when IOMMU is enabled. This 1870 thereby failing to work when IOMMU is enabled. This
1833 workaround will setup a 1:1 mapping for the first 1871 workaround will setup a 1:1 mapping for the first
1834 16M to make floppy (an ISA device) work. 1872 16M to make floppy (an ISA device) work.
1835 1873
1836config INTR_REMAP 1874config INTR_REMAP
1837 bool "Support for Interrupt Remapping (EXPERIMENTAL)" 1875 bool "Support for Interrupt Remapping (EXPERIMENTAL)"
1838 depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL 1876 depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL
1839 help 1877 select X86_X2APIC
1840 Supports Interrupt remapping for IO-APIC and MSI devices. 1878 ---help---
1841 To use x2apic mode in the CPU's which support x2APIC enhancements or 1879 Supports Interrupt remapping for IO-APIC and MSI devices.
1842 to support platforms with CPU's having > 8 bit APIC ID, say Y. 1880 To use x2apic mode in the CPU's which support x2APIC enhancements or
1881 to support platforms with CPU's having > 8 bit APIC ID, say Y.
1843 1882
1844source "drivers/pci/pcie/Kconfig" 1883source "drivers/pci/pcie/Kconfig"
1845 1884
@@ -1853,8 +1892,7 @@ if X86_32
1853 1892
1854config ISA 1893config ISA
1855 bool "ISA support" 1894 bool "ISA support"
1856 depends on !X86_VOYAGER 1895 ---help---
1857 help
1858 Find out whether you have ISA slots on your motherboard. ISA is the 1896 Find out whether you have ISA slots on your motherboard. ISA is the
1859 name of a bus system, i.e. the way the CPU talks to the other stuff 1897 name of a bus system, i.e. the way the CPU talks to the other stuff
1860 inside your box. Other bus systems are PCI, EISA, MicroChannel 1898 inside your box. Other bus systems are PCI, EISA, MicroChannel
@@ -1880,9 +1918,8 @@ config EISA
1880source "drivers/eisa/Kconfig" 1918source "drivers/eisa/Kconfig"
1881 1919
1882config MCA 1920config MCA
1883 bool "MCA support" if !X86_VOYAGER 1921 bool "MCA support"
1884 default y if X86_VOYAGER 1922 ---help---
1885 help
1886 MicroChannel Architecture is found in some IBM PS/2 machines and 1923 MicroChannel Architecture is found in some IBM PS/2 machines and
1887 laptops. It is a bus system similar to PCI or ISA. See 1924 laptops. It is a bus system similar to PCI or ISA. See
1888 <file:Documentation/mca.txt> (and especially the web page given 1925 <file:Documentation/mca.txt> (and especially the web page given
@@ -1892,8 +1929,7 @@ source "drivers/mca/Kconfig"
1892 1929
1893config SCx200 1930config SCx200
1894 tristate "NatSemi SCx200 support" 1931 tristate "NatSemi SCx200 support"
1895 depends on !X86_VOYAGER 1932 ---help---
1896 help
1897 This provides basic support for National Semiconductor's 1933 This provides basic support for National Semiconductor's
1898 (now AMD's) Geode processors. The driver probes for the 1934 (now AMD's) Geode processors. The driver probes for the
1899 PCI-IDs of several on-chip devices, so its a good dependency 1935 PCI-IDs of several on-chip devices, so its a good dependency
@@ -1905,7 +1941,7 @@ config SCx200HR_TIMER
1905 tristate "NatSemi SCx200 27MHz High-Resolution Timer Support" 1941 tristate "NatSemi SCx200 27MHz High-Resolution Timer Support"
1906 depends on SCx200 && GENERIC_TIME 1942 depends on SCx200 && GENERIC_TIME
1907 default y 1943 default y
1908 help 1944 ---help---
1909 This driver provides a clocksource built upon the on-chip 1945 This driver provides a clocksource built upon the on-chip
1910 27MHz high-resolution timer. Its also a workaround for 1946 27MHz high-resolution timer. Its also a workaround for
1911 NSC Geode SC-1100's buggy TSC, which loses time when the 1947 NSC Geode SC-1100's buggy TSC, which loses time when the
@@ -1916,7 +1952,7 @@ config GEODE_MFGPT_TIMER
1916 def_bool y 1952 def_bool y
1917 prompt "Geode Multi-Function General Purpose Timer (MFGPT) events" 1953 prompt "Geode Multi-Function General Purpose Timer (MFGPT) events"
1918 depends on MGEODE_LX && GENERIC_TIME && GENERIC_CLOCKEVENTS 1954 depends on MGEODE_LX && GENERIC_TIME && GENERIC_CLOCKEVENTS
1919 help 1955 ---help---
1920 This driver provides a clock event source based on the MFGPT 1956 This driver provides a clock event source based on the MFGPT
1921 timer(s) in the CS5535 and CS5536 companion chip for the geode. 1957 timer(s) in the CS5535 and CS5536 companion chip for the geode.
1922 MFGPTs have a better resolution and max interval than the 1958 MFGPTs have a better resolution and max interval than the
@@ -1925,7 +1961,7 @@ config GEODE_MFGPT_TIMER
1925config OLPC 1961config OLPC
1926 bool "One Laptop Per Child support" 1962 bool "One Laptop Per Child support"
1927 default n 1963 default n
1928 help 1964 ---help---
1929 Add support for detecting the unique features of the OLPC 1965 Add support for detecting the unique features of the OLPC
1930 XO hardware. 1966 XO hardware.
1931 1967
@@ -1950,16 +1986,16 @@ config IA32_EMULATION
1950 bool "IA32 Emulation" 1986 bool "IA32 Emulation"
1951 depends on X86_64 1987 depends on X86_64
1952 select COMPAT_BINFMT_ELF 1988 select COMPAT_BINFMT_ELF
1953 help 1989 ---help---
1954 Include code to run 32-bit programs under a 64-bit kernel. You should 1990 Include code to run 32-bit programs under a 64-bit kernel. You should
1955 likely turn this on, unless you're 100% sure that you don't have any 1991 likely turn this on, unless you're 100% sure that you don't have any
1956 32-bit programs left. 1992 32-bit programs left.
1957 1993
1958config IA32_AOUT 1994config IA32_AOUT
1959 tristate "IA32 a.out support" 1995 tristate "IA32 a.out support"
1960 depends on IA32_EMULATION 1996 depends on IA32_EMULATION
1961 help 1997 ---help---
1962 Support old a.out binaries in the 32bit emulation. 1998 Support old a.out binaries in the 32bit emulation.
1963 1999
1964config COMPAT 2000config COMPAT
1965 def_bool y 2001 def_bool y
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index c98d52e82966..a95eaf0e582a 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -50,7 +50,7 @@ config M386
50config M486 50config M486
51 bool "486" 51 bool "486"
52 depends on X86_32 52 depends on X86_32
53 help 53 ---help---
54 Select this for a 486 series processor, either Intel or one of the 54 Select this for a 486 series processor, either Intel or one of the
55 compatible processors from AMD, Cyrix, IBM, or Intel. Includes DX, 55 compatible processors from AMD, Cyrix, IBM, or Intel. Includes DX,
56 DX2, and DX4 variants; also SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or 56 DX2, and DX4 variants; also SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or
@@ -59,7 +59,7 @@ config M486
59config M586 59config M586
60 bool "586/K5/5x86/6x86/6x86MX" 60 bool "586/K5/5x86/6x86/6x86MX"
61 depends on X86_32 61 depends on X86_32
62 help 62 ---help---
63 Select this for an 586 or 686 series processor such as the AMD K5, 63 Select this for an 586 or 686 series processor such as the AMD K5,
64 the Cyrix 5x86, 6x86 and 6x86MX. This choice does not 64 the Cyrix 5x86, 6x86 and 6x86MX. This choice does not
65 assume the RDTSC (Read Time Stamp Counter) instruction. 65 assume the RDTSC (Read Time Stamp Counter) instruction.
@@ -67,21 +67,21 @@ config M586
67config M586TSC 67config M586TSC
68 bool "Pentium-Classic" 68 bool "Pentium-Classic"
69 depends on X86_32 69 depends on X86_32
70 help 70 ---help---
71 Select this for a Pentium Classic processor with the RDTSC (Read 71 Select this for a Pentium Classic processor with the RDTSC (Read
72 Time Stamp Counter) instruction for benchmarking. 72 Time Stamp Counter) instruction for benchmarking.
73 73
74config M586MMX 74config M586MMX
75 bool "Pentium-MMX" 75 bool "Pentium-MMX"
76 depends on X86_32 76 depends on X86_32
77 help 77 ---help---
78 Select this for a Pentium with the MMX graphics/multimedia 78 Select this for a Pentium with the MMX graphics/multimedia
79 extended instructions. 79 extended instructions.
80 80
81config M686 81config M686
82 bool "Pentium-Pro" 82 bool "Pentium-Pro"
83 depends on X86_32 83 depends on X86_32
84 help 84 ---help---
85 Select this for Intel Pentium Pro chips. This enables the use of 85 Select this for Intel Pentium Pro chips. This enables the use of
86 Pentium Pro extended instructions, and disables the init-time guard 86 Pentium Pro extended instructions, and disables the init-time guard
87 against the f00f bug found in earlier Pentiums. 87 against the f00f bug found in earlier Pentiums.
@@ -89,7 +89,7 @@ config M686
89config MPENTIUMII 89config MPENTIUMII
90 bool "Pentium-II/Celeron(pre-Coppermine)" 90 bool "Pentium-II/Celeron(pre-Coppermine)"
91 depends on X86_32 91 depends on X86_32
92 help 92 ---help---
93 Select this for Intel chips based on the Pentium-II and 93 Select this for Intel chips based on the Pentium-II and
94 pre-Coppermine Celeron core. This option enables an unaligned 94 pre-Coppermine Celeron core. This option enables an unaligned
95 copy optimization, compiles the kernel with optimization flags 95 copy optimization, compiles the kernel with optimization flags
@@ -99,7 +99,7 @@ config MPENTIUMII
99config MPENTIUMIII 99config MPENTIUMIII
100 bool "Pentium-III/Celeron(Coppermine)/Pentium-III Xeon" 100 bool "Pentium-III/Celeron(Coppermine)/Pentium-III Xeon"
101 depends on X86_32 101 depends on X86_32
102 help 102 ---help---
103 Select this for Intel chips based on the Pentium-III and 103 Select this for Intel chips based on the Pentium-III and
104 Celeron-Coppermine core. This option enables use of some 104 Celeron-Coppermine core. This option enables use of some
105 extended prefetch instructions in addition to the Pentium II 105 extended prefetch instructions in addition to the Pentium II
@@ -108,14 +108,14 @@ config MPENTIUMIII
108config MPENTIUMM 108config MPENTIUMM
109 bool "Pentium M" 109 bool "Pentium M"
110 depends on X86_32 110 depends on X86_32
111 help 111 ---help---
112 Select this for Intel Pentium M (not Pentium-4 M) 112 Select this for Intel Pentium M (not Pentium-4 M)
113 notebook chips. 113 notebook chips.
114 114
115config MPENTIUM4 115config MPENTIUM4
116 bool "Pentium-4/Celeron(P4-based)/Pentium-4 M/older Xeon" 116 bool "Pentium-4/Celeron(P4-based)/Pentium-4 M/older Xeon"
117 depends on X86_32 117 depends on X86_32
118 help 118 ---help---
119 Select this for Intel Pentium 4 chips. This includes the 119 Select this for Intel Pentium 4 chips. This includes the
120 Pentium 4, Pentium D, P4-based Celeron and Xeon, and 120 Pentium 4, Pentium D, P4-based Celeron and Xeon, and
121 Pentium-4 M (not Pentium M) chips. This option enables compile 121 Pentium-4 M (not Pentium M) chips. This option enables compile
@@ -151,7 +151,7 @@ config MPENTIUM4
151config MK6 151config MK6
152 bool "K6/K6-II/K6-III" 152 bool "K6/K6-II/K6-III"
153 depends on X86_32 153 depends on X86_32
154 help 154 ---help---
155 Select this for an AMD K6-family processor. Enables use of 155 Select this for an AMD K6-family processor. Enables use of
156 some extended instructions, and passes appropriate optimization 156 some extended instructions, and passes appropriate optimization
157 flags to GCC. 157 flags to GCC.
@@ -159,14 +159,14 @@ config MK6
159config MK7 159config MK7
160 bool "Athlon/Duron/K7" 160 bool "Athlon/Duron/K7"
161 depends on X86_32 161 depends on X86_32
162 help 162 ---help---
163 Select this for an AMD Athlon K7-family processor. Enables use of 163 Select this for an AMD Athlon K7-family processor. Enables use of
164 some extended instructions, and passes appropriate optimization 164 some extended instructions, and passes appropriate optimization
165 flags to GCC. 165 flags to GCC.
166 166
167config MK8 167config MK8
168 bool "Opteron/Athlon64/Hammer/K8" 168 bool "Opteron/Athlon64/Hammer/K8"
169 help 169 ---help---
170 Select this for an AMD Opteron or Athlon64 Hammer-family processor. 170 Select this for an AMD Opteron or Athlon64 Hammer-family processor.
171 Enables use of some extended instructions, and passes appropriate 171 Enables use of some extended instructions, and passes appropriate
172 optimization flags to GCC. 172 optimization flags to GCC.
@@ -174,7 +174,7 @@ config MK8
174config MCRUSOE 174config MCRUSOE
175 bool "Crusoe" 175 bool "Crusoe"
176 depends on X86_32 176 depends on X86_32
177 help 177 ---help---
178 Select this for a Transmeta Crusoe processor. Treats the processor 178 Select this for a Transmeta Crusoe processor. Treats the processor
179 like a 586 with TSC, and sets some GCC optimization flags (like a 179 like a 586 with TSC, and sets some GCC optimization flags (like a
180 Pentium Pro with no alignment requirements). 180 Pentium Pro with no alignment requirements).
@@ -182,13 +182,13 @@ config MCRUSOE
182config MEFFICEON 182config MEFFICEON
183 bool "Efficeon" 183 bool "Efficeon"
184 depends on X86_32 184 depends on X86_32
185 help 185 ---help---
186 Select this for a Transmeta Efficeon processor. 186 Select this for a Transmeta Efficeon processor.
187 187
188config MWINCHIPC6 188config MWINCHIPC6
189 bool "Winchip-C6" 189 bool "Winchip-C6"
190 depends on X86_32 190 depends on X86_32
191 help 191 ---help---
192 Select this for an IDT Winchip C6 chip. Linux and GCC 192 Select this for an IDT Winchip C6 chip. Linux and GCC
193 treat this chip as a 586TSC with some extended instructions 193 treat this chip as a 586TSC with some extended instructions
194 and alignment requirements. 194 and alignment requirements.
@@ -196,7 +196,7 @@ config MWINCHIPC6
196config MWINCHIP3D 196config MWINCHIP3D
197 bool "Winchip-2/Winchip-2A/Winchip-3" 197 bool "Winchip-2/Winchip-2A/Winchip-3"
198 depends on X86_32 198 depends on X86_32
199 help 199 ---help---
200 Select this for an IDT Winchip-2, 2A or 3. Linux and GCC 200 Select this for an IDT Winchip-2, 2A or 3. Linux and GCC
201 treat this chip as a 586TSC with some extended instructions 201 treat this chip as a 586TSC with some extended instructions
202 and alignment requirements. Also enable out of order memory 202 and alignment requirements. Also enable out of order memory
@@ -206,19 +206,19 @@ config MWINCHIP3D
206config MGEODEGX1 206config MGEODEGX1
207 bool "GeodeGX1" 207 bool "GeodeGX1"
208 depends on X86_32 208 depends on X86_32
209 help 209 ---help---
210 Select this for a Geode GX1 (Cyrix MediaGX) chip. 210 Select this for a Geode GX1 (Cyrix MediaGX) chip.
211 211
212config MGEODE_LX 212config MGEODE_LX
213 bool "Geode GX/LX" 213 bool "Geode GX/LX"
214 depends on X86_32 214 depends on X86_32
215 help 215 ---help---
216 Select this for AMD Geode GX and LX processors. 216 Select this for AMD Geode GX and LX processors.
217 217
218config MCYRIXIII 218config MCYRIXIII
219 bool "CyrixIII/VIA-C3" 219 bool "CyrixIII/VIA-C3"
220 depends on X86_32 220 depends on X86_32
221 help 221 ---help---
222 Select this for a Cyrix III or C3 chip. Presently Linux and GCC 222 Select this for a Cyrix III or C3 chip. Presently Linux and GCC
223 treat this chip as a generic 586. Whilst the CPU is 686 class, 223 treat this chip as a generic 586. Whilst the CPU is 686 class,
224 it lacks the cmov extension which gcc assumes is present when 224 it lacks the cmov extension which gcc assumes is present when
@@ -230,7 +230,7 @@ config MCYRIXIII
230config MVIAC3_2 230config MVIAC3_2
231 bool "VIA C3-2 (Nehemiah)" 231 bool "VIA C3-2 (Nehemiah)"
232 depends on X86_32 232 depends on X86_32
233 help 233 ---help---
234 Select this for a VIA C3 "Nehemiah". Selecting this enables usage 234 Select this for a VIA C3 "Nehemiah". Selecting this enables usage
235 of SSE and tells gcc to treat the CPU as a 686. 235 of SSE and tells gcc to treat the CPU as a 686.
236 Note, this kernel will not boot on older (pre model 9) C3s. 236 Note, this kernel will not boot on older (pre model 9) C3s.
@@ -238,14 +238,14 @@ config MVIAC3_2
238config MVIAC7 238config MVIAC7
239 bool "VIA C7" 239 bool "VIA C7"
240 depends on X86_32 240 depends on X86_32
241 help 241 ---help---
242 Select this for a VIA C7. Selecting this uses the correct cache 242 Select this for a VIA C7. Selecting this uses the correct cache
243 shift and tells gcc to treat the CPU as a 686. 243 shift and tells gcc to treat the CPU as a 686.
244 244
245config MPSC 245config MPSC
246 bool "Intel P4 / older Netburst based Xeon" 246 bool "Intel P4 / older Netburst based Xeon"
247 depends on X86_64 247 depends on X86_64
248 help 248 ---help---
249 Optimize for Intel Pentium 4, Pentium D and older Nocona/Dempsey 249 Optimize for Intel Pentium 4, Pentium D and older Nocona/Dempsey
250 Xeon CPUs with Intel 64bit which is compatible with x86-64. 250 Xeon CPUs with Intel 64bit which is compatible with x86-64.
251 Note that the latest Xeons (Xeon 51xx and 53xx) are not based on the 251 Note that the latest Xeons (Xeon 51xx and 53xx) are not based on the
@@ -255,7 +255,7 @@ config MPSC
255 255
256config MCORE2 256config MCORE2
257 bool "Core 2/newer Xeon" 257 bool "Core 2/newer Xeon"
258 help 258 ---help---
259 259
260 Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and 260 Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and
261 53xx) CPUs. You can distinguish newer from older Xeons by the CPU 261 53xx) CPUs. You can distinguish newer from older Xeons by the CPU
@@ -265,7 +265,7 @@ config MCORE2
265config GENERIC_CPU 265config GENERIC_CPU
266 bool "Generic-x86-64" 266 bool "Generic-x86-64"
267 depends on X86_64 267 depends on X86_64
268 help 268 ---help---
269 Generic x86-64 CPU. 269 Generic x86-64 CPU.
270 Run equally well on all x86-64 CPUs. 270 Run equally well on all x86-64 CPUs.
271 271
@@ -274,7 +274,7 @@ endchoice
274config X86_GENERIC 274config X86_GENERIC
275 bool "Generic x86 support" 275 bool "Generic x86 support"
276 depends on X86_32 276 depends on X86_32
277 help 277 ---help---
278 Instead of just including optimizations for the selected 278 Instead of just including optimizations for the selected
279 x86 variant (e.g. PII, Crusoe or Athlon), include some more 279 x86 variant (e.g. PII, Crusoe or Athlon), include some more
280 generic optimizations as well. This will make the kernel 280 generic optimizations as well. This will make the kernel
@@ -294,25 +294,23 @@ config X86_CPU
294# Define implied options from the CPU selection here 294# Define implied options from the CPU selection here
295config X86_L1_CACHE_BYTES 295config X86_L1_CACHE_BYTES
296 int 296 int
297 default "128" if GENERIC_CPU || MPSC 297 default "128" if MPSC
298 default "64" if MK8 || MCORE2 298 default "64" if GENERIC_CPU || MK8 || MCORE2 || X86_32
299 depends on X86_64
300 299
301config X86_INTERNODE_CACHE_BYTES 300config X86_INTERNODE_CACHE_BYTES
302 int 301 int
303 default "4096" if X86_VSMP 302 default "4096" if X86_VSMP
304 default X86_L1_CACHE_BYTES if !X86_VSMP 303 default X86_L1_CACHE_BYTES if !X86_VSMP
305 depends on X86_64
306 304
307config X86_CMPXCHG 305config X86_CMPXCHG
308 def_bool X86_64 || (X86_32 && !M386) 306 def_bool X86_64 || (X86_32 && !M386)
309 307
310config X86_L1_CACHE_SHIFT 308config X86_L1_CACHE_SHIFT
311 int 309 int
312 default "7" if MPENTIUM4 || X86_GENERIC || GENERIC_CPU || MPSC 310 default "7" if MPENTIUM4 || MPSC
313 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 311 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
314 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX 312 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
315 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7 313 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7 || X86_GENERIC || GENERIC_CPU
316 314
317config X86_XADD 315config X86_XADD
318 def_bool y 316 def_bool y
@@ -321,7 +319,7 @@ config X86_XADD
321config X86_PPRO_FENCE 319config X86_PPRO_FENCE
322 bool "PentiumPro memory ordering errata workaround" 320 bool "PentiumPro memory ordering errata workaround"
323 depends on M686 || M586MMX || M586TSC || M586 || M486 || M386 || MGEODEGX1 321 depends on M686 || M586MMX || M586TSC || M586 || M486 || M386 || MGEODEGX1
324 help 322 ---help---
325 Old PentiumPro multiprocessor systems had errata that could cause 323 Old PentiumPro multiprocessor systems had errata that could cause
326 memory operations to violate the x86 ordering standard in rare cases. 324 memory operations to violate the x86 ordering standard in rare cases.
327 Enabling this option will attempt to work around some (but not all) 325 Enabling this option will attempt to work around some (but not all)
@@ -414,14 +412,14 @@ config X86_DEBUGCTLMSR
414 412
415menuconfig PROCESSOR_SELECT 413menuconfig PROCESSOR_SELECT
416 bool "Supported processor vendors" if EMBEDDED 414 bool "Supported processor vendors" if EMBEDDED
417 help 415 ---help---
418 This lets you choose what x86 vendor support code your kernel 416 This lets you choose what x86 vendor support code your kernel
419 will include. 417 will include.
420 418
421config CPU_SUP_INTEL 419config CPU_SUP_INTEL
422 default y 420 default y
423 bool "Support Intel processors" if PROCESSOR_SELECT 421 bool "Support Intel processors" if PROCESSOR_SELECT
424 help 422 ---help---
425 This enables detection, tunings and quirks for Intel processors 423 This enables detection, tunings and quirks for Intel processors
426 424
427 You need this enabled if you want your kernel to run on an 425 You need this enabled if you want your kernel to run on an
@@ -435,7 +433,7 @@ config CPU_SUP_CYRIX_32
435 default y 433 default y
436 bool "Support Cyrix processors" if PROCESSOR_SELECT 434 bool "Support Cyrix processors" if PROCESSOR_SELECT
437 depends on !64BIT 435 depends on !64BIT
438 help 436 ---help---
439 This enables detection, tunings and quirks for Cyrix processors 437 This enables detection, tunings and quirks for Cyrix processors
440 438
441 You need this enabled if you want your kernel to run on a 439 You need this enabled if you want your kernel to run on a
@@ -448,7 +446,7 @@ config CPU_SUP_CYRIX_32
448config CPU_SUP_AMD 446config CPU_SUP_AMD
449 default y 447 default y
450 bool "Support AMD processors" if PROCESSOR_SELECT 448 bool "Support AMD processors" if PROCESSOR_SELECT
451 help 449 ---help---
452 This enables detection, tunings and quirks for AMD processors 450 This enables detection, tunings and quirks for AMD processors
453 451
454 You need this enabled if you want your kernel to run on an 452 You need this enabled if you want your kernel to run on an
@@ -462,7 +460,7 @@ config CPU_SUP_CENTAUR_32
462 default y 460 default y
463 bool "Support Centaur processors" if PROCESSOR_SELECT 461 bool "Support Centaur processors" if PROCESSOR_SELECT
464 depends on !64BIT 462 depends on !64BIT
465 help 463 ---help---
466 This enables detection, tunings and quirks for Centaur processors 464 This enables detection, tunings and quirks for Centaur processors
467 465
468 You need this enabled if you want your kernel to run on a 466 You need this enabled if you want your kernel to run on a
@@ -476,7 +474,7 @@ config CPU_SUP_CENTAUR_64
476 default y 474 default y
477 bool "Support Centaur processors" if PROCESSOR_SELECT 475 bool "Support Centaur processors" if PROCESSOR_SELECT
478 depends on 64BIT 476 depends on 64BIT
479 help 477 ---help---
480 This enables detection, tunings and quirks for Centaur processors 478 This enables detection, tunings and quirks for Centaur processors
481 479
482 You need this enabled if you want your kernel to run on a 480 You need this enabled if you want your kernel to run on a
@@ -490,7 +488,7 @@ config CPU_SUP_TRANSMETA_32
490 default y 488 default y
491 bool "Support Transmeta processors" if PROCESSOR_SELECT 489 bool "Support Transmeta processors" if PROCESSOR_SELECT
492 depends on !64BIT 490 depends on !64BIT
493 help 491 ---help---
494 This enables detection, tunings and quirks for Transmeta processors 492 This enables detection, tunings and quirks for Transmeta processors
495 493
496 You need this enabled if you want your kernel to run on a 494 You need this enabled if you want your kernel to run on a
@@ -504,7 +502,7 @@ config CPU_SUP_UMC_32
504 default y 502 default y
505 bool "Support UMC processors" if PROCESSOR_SELECT 503 bool "Support UMC processors" if PROCESSOR_SELECT
506 depends on !64BIT 504 depends on !64BIT
507 help 505 ---help---
508 This enables detection, tunings and quirks for UMC processors 506 This enables detection, tunings and quirks for UMC processors
509 507
510 You need this enabled if you want your kernel to run on a 508 You need this enabled if you want your kernel to run on a
@@ -523,7 +521,7 @@ config X86_PTRACE_BTS
523 bool "Branch Trace Store" 521 bool "Branch Trace Store"
524 default y 522 default y
525 depends on X86_DEBUGCTLMSR 523 depends on X86_DEBUGCTLMSR
526 help 524 ---help---
527 This adds a ptrace interface to the hardware's branch trace store. 525 This adds a ptrace interface to the hardware's branch trace store.
528 526
529 Debuggers may use it to collect an execution trace of the debugged 527 Debuggers may use it to collect an execution trace of the debugged
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index e1983fa025d2..fdb45df608b6 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -7,7 +7,7 @@ source "lib/Kconfig.debug"
7 7
8config STRICT_DEVMEM 8config STRICT_DEVMEM
9 bool "Filter access to /dev/mem" 9 bool "Filter access to /dev/mem"
10 help 10 ---help---
11 If this option is disabled, you allow userspace (root) access to all 11 If this option is disabled, you allow userspace (root) access to all
12 of memory, including kernel and userspace memory. Accidental 12 of memory, including kernel and userspace memory. Accidental
13 access to this is obviously disastrous, but specific access can 13 access to this is obviously disastrous, but specific access can
@@ -25,7 +25,7 @@ config STRICT_DEVMEM
25config X86_VERBOSE_BOOTUP 25config X86_VERBOSE_BOOTUP
26 bool "Enable verbose x86 bootup info messages" 26 bool "Enable verbose x86 bootup info messages"
27 default y 27 default y
28 help 28 ---help---
29 Enables the informational output from the decompression stage 29 Enables the informational output from the decompression stage
30 (e.g. bzImage) of the boot. If you disable this you will still 30 (e.g. bzImage) of the boot. If you disable this you will still
31 see errors. Disable this if you want silent bootup. 31 see errors. Disable this if you want silent bootup.
@@ -33,7 +33,7 @@ config X86_VERBOSE_BOOTUP
33config EARLY_PRINTK 33config EARLY_PRINTK
34 bool "Early printk" if EMBEDDED 34 bool "Early printk" if EMBEDDED
35 default y 35 default y
36 help 36 ---help---
37 Write kernel log output directly into the VGA buffer or to a serial 37 Write kernel log output directly into the VGA buffer or to a serial
38 port. 38 port.
39 39
@@ -47,7 +47,7 @@ config EARLY_PRINTK_DBGP
47 bool "Early printk via EHCI debug port" 47 bool "Early printk via EHCI debug port"
48 default n 48 default n
49 depends on EARLY_PRINTK && PCI 49 depends on EARLY_PRINTK && PCI
50 help 50 ---help---
51 Write kernel log output directly into the EHCI debug port. 51 Write kernel log output directly into the EHCI debug port.
52 52
53 This is useful for kernel debugging when your machine crashes very 53 This is useful for kernel debugging when your machine crashes very
@@ -59,14 +59,14 @@ config EARLY_PRINTK_DBGP
59config DEBUG_STACKOVERFLOW 59config DEBUG_STACKOVERFLOW
60 bool "Check for stack overflows" 60 bool "Check for stack overflows"
61 depends on DEBUG_KERNEL 61 depends on DEBUG_KERNEL
62 help 62 ---help---
63 This option will cause messages to be printed if free stack space 63 This option will cause messages to be printed if free stack space
64 drops below a certain limit. 64 drops below a certain limit.
65 65
66config DEBUG_STACK_USAGE 66config DEBUG_STACK_USAGE
67 bool "Stack utilization instrumentation" 67 bool "Stack utilization instrumentation"
68 depends on DEBUG_KERNEL 68 depends on DEBUG_KERNEL
69 help 69 ---help---
70 Enables the display of the minimum amount of free stack which each 70 Enables the display of the minimum amount of free stack which each
71 task has ever had available in the sysrq-T and sysrq-P debug output. 71 task has ever had available in the sysrq-T and sysrq-P debug output.
72 72
@@ -75,7 +75,7 @@ config DEBUG_STACK_USAGE
75config DEBUG_PAGEALLOC 75config DEBUG_PAGEALLOC
76 bool "Debug page memory allocations" 76 bool "Debug page memory allocations"
77 depends on DEBUG_KERNEL 77 depends on DEBUG_KERNEL
78 help 78 ---help---
79 Unmap pages from the kernel linear mapping after free_pages(). 79 Unmap pages from the kernel linear mapping after free_pages().
80 This results in a large slowdown, but helps to find certain types 80 This results in a large slowdown, but helps to find certain types
81 of memory corruptions. 81 of memory corruptions.
@@ -83,9 +83,9 @@ config DEBUG_PAGEALLOC
83config DEBUG_PER_CPU_MAPS 83config DEBUG_PER_CPU_MAPS
84 bool "Debug access to per_cpu maps" 84 bool "Debug access to per_cpu maps"
85 depends on DEBUG_KERNEL 85 depends on DEBUG_KERNEL
86 depends on X86_SMP 86 depends on SMP
87 default n 87 default n
88 help 88 ---help---
89 Say Y to verify that the per_cpu map being accessed has 89 Say Y to verify that the per_cpu map being accessed has
90 been setup. Adds a fair amount of code to kernel memory 90 been setup. Adds a fair amount of code to kernel memory
91 and decreases performance. 91 and decreases performance.
@@ -96,7 +96,7 @@ config X86_PTDUMP
96 bool "Export kernel pagetable layout to userspace via debugfs" 96 bool "Export kernel pagetable layout to userspace via debugfs"
97 depends on DEBUG_KERNEL 97 depends on DEBUG_KERNEL
98 select DEBUG_FS 98 select DEBUG_FS
99 help 99 ---help---
100 Say Y here if you want to show the kernel pagetable layout in a 100 Say Y here if you want to show the kernel pagetable layout in a
101 debugfs file. This information is only useful for kernel developers 101 debugfs file. This information is only useful for kernel developers
102 who are working in architecture specific areas of the kernel. 102 who are working in architecture specific areas of the kernel.
@@ -108,7 +108,7 @@ config DEBUG_RODATA
108 bool "Write protect kernel read-only data structures" 108 bool "Write protect kernel read-only data structures"
109 default y 109 default y
110 depends on DEBUG_KERNEL 110 depends on DEBUG_KERNEL
111 help 111 ---help---
112 Mark the kernel read-only data as write-protected in the pagetables, 112 Mark the kernel read-only data as write-protected in the pagetables,
113 in order to catch accidental (and incorrect) writes to such const 113 in order to catch accidental (and incorrect) writes to such const
114 data. This is recommended so that we can catch kernel bugs sooner. 114 data. This is recommended so that we can catch kernel bugs sooner.
@@ -117,7 +117,8 @@ config DEBUG_RODATA
117config DEBUG_RODATA_TEST 117config DEBUG_RODATA_TEST
118 bool "Testcase for the DEBUG_RODATA feature" 118 bool "Testcase for the DEBUG_RODATA feature"
119 depends on DEBUG_RODATA 119 depends on DEBUG_RODATA
120 help 120 default y
121 ---help---
121 This option enables a testcase for the DEBUG_RODATA 122 This option enables a testcase for the DEBUG_RODATA
122 feature as well as for the change_page_attr() infrastructure. 123 feature as well as for the change_page_attr() infrastructure.
123 If in doubt, say "N" 124 If in doubt, say "N"
@@ -125,7 +126,7 @@ config DEBUG_RODATA_TEST
125config DEBUG_NX_TEST 126config DEBUG_NX_TEST
126 tristate "Testcase for the NX non-executable stack feature" 127 tristate "Testcase for the NX non-executable stack feature"
127 depends on DEBUG_KERNEL && m 128 depends on DEBUG_KERNEL && m
128 help 129 ---help---
129 This option enables a testcase for the CPU NX capability 130 This option enables a testcase for the CPU NX capability
130 and the software setup of this feature. 131 and the software setup of this feature.
131 If in doubt, say "N" 132 If in doubt, say "N"
@@ -133,7 +134,7 @@ config DEBUG_NX_TEST
133config 4KSTACKS 134config 4KSTACKS
134 bool "Use 4Kb for kernel stacks instead of 8Kb" 135 bool "Use 4Kb for kernel stacks instead of 8Kb"
135 depends on X86_32 136 depends on X86_32
136 help 137 ---help---
137 If you say Y here the kernel will use a 4Kb stacksize for the 138 If you say Y here the kernel will use a 4Kb stacksize for the
138 kernel stack attached to each process/thread. This facilitates 139 kernel stack attached to each process/thread. This facilitates
139 running more threads on a system and also reduces the pressure 140 running more threads on a system and also reduces the pressure
@@ -144,7 +145,7 @@ config DOUBLEFAULT
144 default y 145 default y
145 bool "Enable doublefault exception handler" if EMBEDDED 146 bool "Enable doublefault exception handler" if EMBEDDED
146 depends on X86_32 147 depends on X86_32
147 help 148 ---help---
148 This option allows trapping of rare doublefault exceptions that 149 This option allows trapping of rare doublefault exceptions that
149 would otherwise cause a system to silently reboot. Disabling this 150 would otherwise cause a system to silently reboot. Disabling this
150 option saves about 4k and might cause you much additional grey 151 option saves about 4k and might cause you much additional grey
@@ -154,7 +155,7 @@ config IOMMU_DEBUG
154 bool "Enable IOMMU debugging" 155 bool "Enable IOMMU debugging"
155 depends on GART_IOMMU && DEBUG_KERNEL 156 depends on GART_IOMMU && DEBUG_KERNEL
156 depends on X86_64 157 depends on X86_64
157 help 158 ---help---
158 Force the IOMMU to on even when you have less than 4GB of 159 Force the IOMMU to on even when you have less than 4GB of
159 memory and add debugging code. On overflow always panic. And 160 memory and add debugging code. On overflow always panic. And
160 allow to enable IOMMU leak tracing. Can be disabled at boot 161 allow to enable IOMMU leak tracing. Can be disabled at boot
@@ -170,7 +171,7 @@ config IOMMU_LEAK
170 bool "IOMMU leak tracing" 171 bool "IOMMU leak tracing"
171 depends on DEBUG_KERNEL 172 depends on DEBUG_KERNEL
172 depends on IOMMU_DEBUG 173 depends on IOMMU_DEBUG
173 help 174 ---help---
174 Add a simple leak tracer to the IOMMU code. This is useful when you 175 Add a simple leak tracer to the IOMMU code. This is useful when you
175 are debugging a buggy device driver that leaks IOMMU mappings. 176 are debugging a buggy device driver that leaks IOMMU mappings.
176 177
@@ -203,25 +204,25 @@ choice
203 204
204config IO_DELAY_0X80 205config IO_DELAY_0X80
205 bool "port 0x80 based port-IO delay [recommended]" 206 bool "port 0x80 based port-IO delay [recommended]"
206 help 207 ---help---
207 This is the traditional Linux IO delay used for in/out_p. 208 This is the traditional Linux IO delay used for in/out_p.
208 It is the most tested hence safest selection here. 209 It is the most tested hence safest selection here.
209 210
210config IO_DELAY_0XED 211config IO_DELAY_0XED
211 bool "port 0xed based port-IO delay" 212 bool "port 0xed based port-IO delay"
212 help 213 ---help---
213 Use port 0xed as the IO delay. This frees up port 0x80 which is 214 Use port 0xed as the IO delay. This frees up port 0x80 which is
214 often used as a hardware-debug port. 215 often used as a hardware-debug port.
215 216
216config IO_DELAY_UDELAY 217config IO_DELAY_UDELAY
217 bool "udelay based port-IO delay" 218 bool "udelay based port-IO delay"
218 help 219 ---help---
219 Use udelay(2) as the IO delay method. This provides the delay 220 Use udelay(2) as the IO delay method. This provides the delay
220 while not having any side-effect on the IO port space. 221 while not having any side-effect on the IO port space.
221 222
222config IO_DELAY_NONE 223config IO_DELAY_NONE
223 bool "no port-IO delay" 224 bool "no port-IO delay"
224 help 225 ---help---
225 No port-IO delay. Will break on old boxes that require port-IO 226 No port-IO delay. Will break on old boxes that require port-IO
226 delay for certain operations. Should work on most new machines. 227 delay for certain operations. Should work on most new machines.
227 228
@@ -255,18 +256,18 @@ config DEBUG_BOOT_PARAMS
255 bool "Debug boot parameters" 256 bool "Debug boot parameters"
256 depends on DEBUG_KERNEL 257 depends on DEBUG_KERNEL
257 depends on DEBUG_FS 258 depends on DEBUG_FS
258 help 259 ---help---
259 This option will cause struct boot_params to be exported via debugfs. 260 This option will cause struct boot_params to be exported via debugfs.
260 261
261config CPA_DEBUG 262config CPA_DEBUG
262 bool "CPA self-test code" 263 bool "CPA self-test code"
263 depends on DEBUG_KERNEL 264 depends on DEBUG_KERNEL
264 help 265 ---help---
265 Do change_page_attr() self-tests every 30 seconds. 266 Do change_page_attr() self-tests every 30 seconds.
266 267
267config OPTIMIZE_INLINING 268config OPTIMIZE_INLINING
268 bool "Allow gcc to uninline functions marked 'inline'" 269 bool "Allow gcc to uninline functions marked 'inline'"
269 help 270 ---help---
270 This option determines if the kernel forces gcc to inline the functions 271 This option determines if the kernel forces gcc to inline the functions
271 developers have marked 'inline'. Doing so takes away freedom from gcc to 272 developers have marked 'inline'. Doing so takes away freedom from gcc to
272 do what it thinks is best, which is desirable for the gcc 3.x series of 273 do what it thinks is best, which is desirable for the gcc 3.x series of
@@ -279,4 +280,3 @@ config OPTIMIZE_INLINING
279 If unsure, say N. 280 If unsure, say N.
280 281
281endmenu 282endmenu
282
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index d1a47adb5aec..1836191839ee 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -70,14 +70,17 @@ else
70 # this works around some issues with generating unwind tables in older gccs 70 # this works around some issues with generating unwind tables in older gccs
71 # newer gccs do it by default 71 # newer gccs do it by default
72 KBUILD_CFLAGS += -maccumulate-outgoing-args 72 KBUILD_CFLAGS += -maccumulate-outgoing-args
73endif
73 74
74 stackp := $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh 75ifdef CONFIG_CC_STACKPROTECTOR
75 stackp-$(CONFIG_CC_STACKPROTECTOR) := $(shell $(stackp) \ 76 cc_has_sp := $(srctree)/scripts/gcc-x86_$(BITS)-has-stack-protector.sh
76 "$(CC)" -fstack-protector ) 77 ifeq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC)),y)
77 stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += $(shell $(stackp) \ 78 stackp-y := -fstack-protector
78 "$(CC)" -fstack-protector-all ) 79 stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += -fstack-protector-all
79 80 KBUILD_CFLAGS += $(stackp-y)
80 KBUILD_CFLAGS += $(stackp-y) 81 else
82 $(warning stack protector enabled but no compiler support)
83 endif
81endif 84endif
82 85
83# Stackpointer is addressed different for 32 bit and 64 bit x86 86# Stackpointer is addressed different for 32 bit and 64 bit x86
@@ -102,29 +105,6 @@ KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
102# prevent gcc from generating any FP code by mistake 105# prevent gcc from generating any FP code by mistake
103KBUILD_CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) 106KBUILD_CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
104 107
105###
106# Sub architecture support
107# fcore-y is linked before mcore-y files.
108
109# Default subarch .c files
110mcore-y := arch/x86/mach-default/
111
112# Voyager subarch support
113mflags-$(CONFIG_X86_VOYAGER) := -Iarch/x86/include/asm/mach-voyager
114mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager/
115
116# generic subarchitecture
117mflags-$(CONFIG_X86_GENERICARCH):= -Iarch/x86/include/asm/mach-generic
118fcore-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/
119mcore-$(CONFIG_X86_GENERICARCH) := arch/x86/mach-default/
120
121# default subarch .h files
122mflags-y += -Iarch/x86/include/asm/mach-default
123
124# 64 bit does not support subarch support - clear sub arch variables
125fcore-$(CONFIG_X86_64) :=
126mcore-$(CONFIG_X86_64) :=
127
128KBUILD_CFLAGS += $(mflags-y) 108KBUILD_CFLAGS += $(mflags-y)
129KBUILD_AFLAGS += $(mflags-y) 109KBUILD_AFLAGS += $(mflags-y)
130 110
@@ -150,9 +130,6 @@ core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/
150core-y += arch/x86/kernel/ 130core-y += arch/x86/kernel/
151core-y += arch/x86/mm/ 131core-y += arch/x86/mm/
152 132
153# Remaining sub architecture files
154core-y += $(mcore-y)
155
156core-y += arch/x86/crypto/ 133core-y += arch/x86/crypto/
157core-y += arch/x86/vdso/ 134core-y += arch/x86/vdso/
158core-$(CONFIG_IA32_EMULATION) += arch/x86/ia32/ 135core-$(CONFIG_IA32_EMULATION) += arch/x86/ia32/
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index cd48c7210016..57a29fecf6bb 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -6,33 +6,29 @@
6# for more details. 6# for more details.
7# 7#
8# Copyright (C) 1994 by Linus Torvalds 8# Copyright (C) 1994 by Linus Torvalds
9# Changed by many, many contributors over the years.
9# 10#
10 11
11# ROOT_DEV specifies the default root-device when making the image. 12# ROOT_DEV specifies the default root-device when making the image.
12# This can be either FLOPPY, CURRENT, /dev/xxxx or empty, in which case 13# This can be either FLOPPY, CURRENT, /dev/xxxx or empty, in which case
13# the default of FLOPPY is used by 'build'. 14# the default of FLOPPY is used by 'build'.
14 15
15ROOT_DEV := CURRENT 16ROOT_DEV := CURRENT
16 17
17# If you want to preset the SVGA mode, uncomment the next line and 18# If you want to preset the SVGA mode, uncomment the next line and
18# set SVGA_MODE to whatever number you want. 19# set SVGA_MODE to whatever number you want.
19# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode. 20# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
20# The number is the same as you would ordinarily press at bootup. 21# The number is the same as you would ordinarily press at bootup.
21 22
22SVGA_MODE := -DSVGA_MODE=NORMAL_VGA 23SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
23 24
24# If you want the RAM disk device, define this to be the size in blocks. 25targets := vmlinux.bin setup.bin setup.elf bzImage
25
26#RAMDISK := -DRAMDISK=512
27
28targets := vmlinux.bin setup.bin setup.elf zImage bzImage
29subdir- := compressed 26subdir- := compressed
30 27
31setup-y += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o 28setup-y += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o
32setup-y += header.o main.o mca.o memory.o pm.o pmjump.o 29setup-y += header.o main.o mca.o memory.o pm.o pmjump.o
33setup-y += printf.o string.o tty.o video.o video-mode.o version.o 30setup-y += printf.o string.o tty.o video.o video-mode.o version.o
34setup-$(CONFIG_X86_APM_BOOT) += apm.o 31setup-$(CONFIG_X86_APM_BOOT) += apm.o
35setup-$(CONFIG_X86_VOYAGER) += voyager.o
36 32
37# The link order of the video-*.o modules can matter. In particular, 33# The link order of the video-*.o modules can matter. In particular,
38# video-vga.o *must* be listed first, followed by video-vesa.o. 34# video-vga.o *must* be listed first, followed by video-vesa.o.
@@ -72,17 +68,13 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
72KBUILD_CFLAGS += $(call cc-option,-m32) 68KBUILD_CFLAGS += $(call cc-option,-m32)
73KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ 69KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
74 70
75$(obj)/zImage: asflags-y := $(SVGA_MODE) $(RAMDISK) 71$(obj)/bzImage: asflags-y := $(SVGA_MODE)
76$(obj)/bzImage: ccflags-y := -D__BIG_KERNEL__
77$(obj)/bzImage: asflags-y := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__
78$(obj)/bzImage: BUILDFLAGS := -b
79 72
80quiet_cmd_image = BUILD $@ 73quiet_cmd_image = BUILD $@
81cmd_image = $(obj)/tools/build $(BUILDFLAGS) $(obj)/setup.bin \ 74cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin \
82 $(obj)/vmlinux.bin $(ROOT_DEV) > $@ 75 $(ROOT_DEV) > $@
83 76
84$(obj)/zImage $(obj)/bzImage: $(obj)/setup.bin \ 77$(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/tools/build FORCE
85 $(obj)/vmlinux.bin $(obj)/tools/build FORCE
86 $(call if_changed,image) 78 $(call if_changed,image)
87 @echo 'Kernel: $@ is ready' ' (#'`cat .version`')' 79 @echo 'Kernel: $@ is ready' ' (#'`cat .version`')'
88 80
diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c
index 4063d630deff..7c19ce8c2442 100644
--- a/arch/x86/boot/a20.c
+++ b/arch/x86/boot/a20.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007-2008 rPath, Inc. - All Rights Reserved 4 * Copyright 2007-2008 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -15,16 +16,23 @@
15#include "boot.h" 16#include "boot.h"
16 17
17#define MAX_8042_LOOPS 100000 18#define MAX_8042_LOOPS 100000
19#define MAX_8042_FF 32
18 20
19static int empty_8042(void) 21static int empty_8042(void)
20{ 22{
21 u8 status; 23 u8 status;
22 int loops = MAX_8042_LOOPS; 24 int loops = MAX_8042_LOOPS;
25 int ffs = MAX_8042_FF;
23 26
24 while (loops--) { 27 while (loops--) {
25 io_delay(); 28 io_delay();
26 29
27 status = inb(0x64); 30 status = inb(0x64);
31 if (status == 0xff) {
32 /* FF is a plausible, but very unlikely status */
33 if (!--ffs)
34 return -1; /* Assume no KBC present */
35 }
28 if (status & 1) { 36 if (status & 1) {
29 /* Read and discard input data */ 37 /* Read and discard input data */
30 io_delay(); 38 io_delay();
@@ -118,44 +126,37 @@ static void enable_a20_fast(void)
118 126
119int enable_a20(void) 127int enable_a20(void)
120{ 128{
121#if defined(CONFIG_X86_ELAN)
122 /* Elan croaks if we try to touch the KBC */
123 enable_a20_fast();
124 while (!a20_test_long())
125 ;
126 return 0;
127#elif defined(CONFIG_X86_VOYAGER)
128 /* On Voyager, a20_test() is unsafe? */
129 enable_a20_kbc();
130 return 0;
131#else
132 int loops = A20_ENABLE_LOOPS; 129 int loops = A20_ENABLE_LOOPS;
133 while (loops--) { 130 int kbc_err;
134 /* First, check to see if A20 is already enabled 131
135 (legacy free, etc.) */ 132 while (loops--) {
136 if (a20_test_short()) 133 /* First, check to see if A20 is already enabled
137 return 0; 134 (legacy free, etc.) */
138 135 if (a20_test_short())
139 /* Next, try the BIOS (INT 0x15, AX=0x2401) */ 136 return 0;
140 enable_a20_bios(); 137
141 if (a20_test_short()) 138 /* Next, try the BIOS (INT 0x15, AX=0x2401) */
142 return 0; 139 enable_a20_bios();
143 140 if (a20_test_short())
144 /* Try enabling A20 through the keyboard controller */ 141 return 0;
145 empty_8042(); 142
146 if (a20_test_short()) 143 /* Try enabling A20 through the keyboard controller */
147 return 0; /* BIOS worked, but with delayed reaction */ 144 kbc_err = empty_8042();
148 145
149 enable_a20_kbc(); 146 if (a20_test_short())
150 if (a20_test_long()) 147 return 0; /* BIOS worked, but with delayed reaction */
151 return 0; 148
152 149 if (!kbc_err) {
153 /* Finally, try enabling the "fast A20 gate" */ 150 enable_a20_kbc();
154 enable_a20_fast(); 151 if (a20_test_long())
155 if (a20_test_long()) 152 return 0;
156 return 0; 153 }
157 } 154
158 155 /* Finally, try enabling the "fast A20 gate" */
159 return -1; 156 enable_a20_fast();
160#endif 157 if (a20_test_long())
158 return 0;
159 }
160
161 return -1;
161} 162}
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index cc0ef13fba7a..7b2692e897e5 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -302,9 +302,6 @@ void probe_cards(int unsafe);
302/* video-vesa.c */ 302/* video-vesa.c */
303void vesa_store_edid(void); 303void vesa_store_edid(void);
304 304
305/* voyager.c */
306int query_voyager(void);
307
308#endif /* __ASSEMBLY__ */ 305#endif /* __ASSEMBLY__ */
309 306
310#endif /* BOOT_BOOT_H */ 307#endif /* BOOT_BOOT_H */
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 1771c804e02f..3ca4c194b8e5 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -4,7 +4,7 @@
4# create a compressed vmlinux image from the original vmlinux 4# create a compressed vmlinux image from the original vmlinux
5# 5#
6 6
7targets := vmlinux vmlinux.bin vmlinux.bin.gz head_$(BITS).o misc.o piggy.o 7targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma head_$(BITS).o misc.o piggy.o
8 8
9KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 9KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2
10KBUILD_CFLAGS += -fno-strict-aliasing -fPIC 10KBUILD_CFLAGS += -fno-strict-aliasing -fPIC
@@ -47,18 +47,35 @@ ifeq ($(CONFIG_X86_32),y)
47ifdef CONFIG_RELOCATABLE 47ifdef CONFIG_RELOCATABLE
48$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE 48$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE
49 $(call if_changed,gzip) 49 $(call if_changed,gzip)
50$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin.all FORCE
51 $(call if_changed,bzip2)
52$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin.all FORCE
53 $(call if_changed,lzma)
50else 54else
51$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE 55$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
52 $(call if_changed,gzip) 56 $(call if_changed,gzip)
57$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE
58 $(call if_changed,bzip2)
59$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE
60 $(call if_changed,lzma)
53endif 61endif
54LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T 62LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T
55 63
56else 64else
65
57$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE 66$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
58 $(call if_changed,gzip) 67 $(call if_changed,gzip)
68$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE
69 $(call if_changed,bzip2)
70$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE
71 $(call if_changed,lzma)
59 72
60LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T 73LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T
61endif 74endif
62 75
63$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE 76suffix_$(CONFIG_KERNEL_GZIP) = gz
77suffix_$(CONFIG_KERNEL_BZIP2) = bz2
78suffix_$(CONFIG_KERNEL_LZMA) = lzma
79
80$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix_y) FORCE
64 $(call if_changed,ld) 81 $(call if_changed,ld)
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 29c5fbf08392..3a8a866fb2e2 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -25,14 +25,12 @@
25 25
26#include <linux/linkage.h> 26#include <linux/linkage.h>
27#include <asm/segment.h> 27#include <asm/segment.h>
28#include <asm/page.h> 28#include <asm/page_types.h>
29#include <asm/boot.h> 29#include <asm/boot.h>
30#include <asm/asm-offsets.h> 30#include <asm/asm-offsets.h>
31 31
32.section ".text.head","ax",@progbits 32.section ".text.head","ax",@progbits
33 .globl startup_32 33ENTRY(startup_32)
34
35startup_32:
36 cld 34 cld
37 /* test KEEP_SEGMENTS flag to see if the bootloader is asking 35 /* test KEEP_SEGMENTS flag to see if the bootloader is asking
38 * us to not reload segments */ 36 * us to not reload segments */
@@ -113,6 +111,8 @@ startup_32:
113 */ 111 */
114 leal relocated(%ebx), %eax 112 leal relocated(%ebx), %eax
115 jmp *%eax 113 jmp *%eax
114ENDPROC(startup_32)
115
116.section ".text" 116.section ".text"
117relocated: 117relocated:
118 118
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 1d5dff4123e1..ed4a82948002 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -26,8 +26,8 @@
26 26
27#include <linux/linkage.h> 27#include <linux/linkage.h>
28#include <asm/segment.h> 28#include <asm/segment.h>
29#include <asm/pgtable.h> 29#include <asm/pgtable_types.h>
30#include <asm/page.h> 30#include <asm/page_types.h>
31#include <asm/boot.h> 31#include <asm/boot.h>
32#include <asm/msr.h> 32#include <asm/msr.h>
33#include <asm/processor-flags.h> 33#include <asm/processor-flags.h>
@@ -35,9 +35,7 @@
35 35
36.section ".text.head" 36.section ".text.head"
37 .code32 37 .code32
38 .globl startup_32 38ENTRY(startup_32)
39
40startup_32:
41 cld 39 cld
42 /* test KEEP_SEGMENTS flag to see if the bootloader is asking 40 /* test KEEP_SEGMENTS flag to see if the bootloader is asking
43 * us to not reload segments */ 41 * us to not reload segments */
@@ -176,6 +174,7 @@ startup_32:
176 174
177 /* Jump from 32bit compatibility mode into 64bit mode. */ 175 /* Jump from 32bit compatibility mode into 64bit mode. */
178 lret 176 lret
177ENDPROC(startup_32)
179 178
180no_longmode: 179no_longmode:
181 /* This isn't an x86-64 CPU so hang */ 180 /* This isn't an x86-64 CPU so hang */
@@ -295,7 +294,6 @@ relocated:
295 call decompress_kernel 294 call decompress_kernel
296 popq %rsi 295 popq %rsi
297 296
298
299/* 297/*
300 * Jump to the decompressed kernel. 298 * Jump to the decompressed kernel.
301 */ 299 */
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index da062216948a..e45be73684ff 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -116,71 +116,13 @@
116/* 116/*
117 * gzip declarations 117 * gzip declarations
118 */ 118 */
119
120#define OF(args) args
121#define STATIC static 119#define STATIC static
122 120
123#undef memset 121#undef memset
124#undef memcpy 122#undef memcpy
125#define memzero(s, n) memset((s), 0, (n)) 123#define memzero(s, n) memset((s), 0, (n))
126 124
127typedef unsigned char uch;
128typedef unsigned short ush;
129typedef unsigned long ulg;
130
131/*
132 * Window size must be at least 32k, and a power of two.
133 * We don't actually have a window just a huge output buffer,
134 * so we report a 2G window size, as that should always be
135 * larger than our output buffer:
136 */
137#define WSIZE 0x80000000
138
139/* Input buffer: */
140static unsigned char *inbuf;
141
142/* Sliding window buffer (and final output buffer): */
143static unsigned char *window;
144
145/* Valid bytes in inbuf: */
146static unsigned insize;
147
148/* Index of next byte to be processed in inbuf: */
149static unsigned inptr;
150
151/* Bytes in output buffer: */
152static unsigned outcnt;
153
154/* gzip flag byte */
155#define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */
156#define CONTINUATION 0x02 /* bit 1 set: continuation of multi-part gz file */
157#define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */
158#define ORIG_NAM 0x08 /* bit 3 set: original file name present */
159#define COMMENT 0x10 /* bit 4 set: file comment present */
160#define ENCRYPTED 0x20 /* bit 5 set: file is encrypted */
161#define RESERVED 0xC0 /* bit 6, 7: reserved */
162
163#define get_byte() (inptr < insize ? inbuf[inptr++] : fill_inbuf())
164
165/* Diagnostic functions */
166#ifdef DEBUG
167# define Assert(cond, msg) do { if (!(cond)) error(msg); } while (0)
168# define Trace(x) do { fprintf x; } while (0)
169# define Tracev(x) do { if (verbose) fprintf x ; } while (0)
170# define Tracevv(x) do { if (verbose > 1) fprintf x ; } while (0)
171# define Tracec(c, x) do { if (verbose && (c)) fprintf x ; } while (0)
172# define Tracecv(c, x) do { if (verbose > 1 && (c)) fprintf x ; } while (0)
173#else
174# define Assert(cond, msg)
175# define Trace(x)
176# define Tracev(x)
177# define Tracevv(x)
178# define Tracec(c, x)
179# define Tracecv(c, x)
180#endif
181 125
182static int fill_inbuf(void);
183static void flush_window(void);
184static void error(char *m); 126static void error(char *m);
185 127
186/* 128/*
@@ -189,13 +131,8 @@ static void error(char *m);
189static struct boot_params *real_mode; /* Pointer to real-mode data */ 131static struct boot_params *real_mode; /* Pointer to real-mode data */
190static int quiet; 132static int quiet;
191 133
192extern unsigned char input_data[];
193extern int input_len;
194
195static long bytes_out;
196
197static void *memset(void *s, int c, unsigned n); 134static void *memset(void *s, int c, unsigned n);
198static void *memcpy(void *dest, const void *src, unsigned n); 135void *memcpy(void *dest, const void *src, unsigned n);
199 136
200static void __putstr(int, const char *); 137static void __putstr(int, const char *);
201#define putstr(__x) __putstr(0, __x) 138#define putstr(__x) __putstr(0, __x)
@@ -213,7 +150,17 @@ static char *vidmem;
213static int vidport; 150static int vidport;
214static int lines, cols; 151static int lines, cols;
215 152
216#include "../../../../lib/inflate.c" 153#ifdef CONFIG_KERNEL_GZIP
154#include "../../../../lib/decompress_inflate.c"
155#endif
156
157#ifdef CONFIG_KERNEL_BZIP2
158#include "../../../../lib/decompress_bunzip2.c"
159#endif
160
161#ifdef CONFIG_KERNEL_LZMA
162#include "../../../../lib/decompress_unlzma.c"
163#endif
217 164
218static void scroll(void) 165static void scroll(void)
219{ 166{
@@ -282,7 +229,7 @@ static void *memset(void *s, int c, unsigned n)
282 return s; 229 return s;
283} 230}
284 231
285static void *memcpy(void *dest, const void *src, unsigned n) 232void *memcpy(void *dest, const void *src, unsigned n)
286{ 233{
287 int i; 234 int i;
288 const char *s = src; 235 const char *s = src;
@@ -293,38 +240,6 @@ static void *memcpy(void *dest, const void *src, unsigned n)
293 return dest; 240 return dest;
294} 241}
295 242
296/* ===========================================================================
297 * Fill the input buffer. This is called only when the buffer is empty
298 * and at least one byte is really needed.
299 */
300static int fill_inbuf(void)
301{
302 error("ran out of input data");
303 return 0;
304}
305
306/* ===========================================================================
307 * Write the output window window[0..outcnt-1] and update crc and bytes_out.
308 * (Used for the decompressed data only.)
309 */
310static void flush_window(void)
311{
312 /* With my window equal to my output buffer
313 * I only need to compute the crc here.
314 */
315 unsigned long c = crc; /* temporary variable */
316 unsigned n;
317 unsigned char *in, ch;
318
319 in = window;
320 for (n = 0; n < outcnt; n++) {
321 ch = *in++;
322 c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
323 }
324 crc = c;
325 bytes_out += (unsigned long)outcnt;
326 outcnt = 0;
327}
328 243
329static void error(char *x) 244static void error(char *x)
330{ 245{
@@ -407,12 +322,8 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
407 lines = real_mode->screen_info.orig_video_lines; 322 lines = real_mode->screen_info.orig_video_lines;
408 cols = real_mode->screen_info.orig_video_cols; 323 cols = real_mode->screen_info.orig_video_cols;
409 324
410 window = output; /* Output buffer (Normally at 1M) */
411 free_mem_ptr = heap; /* Heap */ 325 free_mem_ptr = heap; /* Heap */
412 free_mem_end_ptr = heap + BOOT_HEAP_SIZE; 326 free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
413 inbuf = input_data; /* Input buffer */
414 insize = input_len;
415 inptr = 0;
416 327
417#ifdef CONFIG_X86_64 328#ifdef CONFIG_X86_64
418 if ((unsigned long)output & (__KERNEL_ALIGN - 1)) 329 if ((unsigned long)output & (__KERNEL_ALIGN - 1))
@@ -430,10 +341,9 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
430#endif 341#endif
431#endif 342#endif
432 343
433 makecrc();
434 if (!quiet) 344 if (!quiet)
435 putstr("\nDecompressing Linux... "); 345 putstr("\nDecompressing Linux... ");
436 gunzip(); 346 decompress(input_data, input_len, NULL, NULL, output, NULL, error);
437 parse_elf(output); 347 parse_elf(output);
438 if (!quiet) 348 if (!quiet)
439 putstr("done.\nBooting the kernel.\n"); 349 putstr("done.\nBooting the kernel.\n");
diff --git a/arch/x86/boot/copy.S b/arch/x86/boot/copy.S
index ef50c84e8b4b..11f272c6f5e9 100644
--- a/arch/x86/boot/copy.S
+++ b/arch/x86/boot/copy.S
@@ -8,6 +8,8 @@
8 * 8 *
9 * ----------------------------------------------------------------------- */ 9 * ----------------------------------------------------------------------- */
10 10
11#include <linux/linkage.h>
12
11/* 13/*
12 * Memory copy routines 14 * Memory copy routines
13 */ 15 */
@@ -15,9 +17,7 @@
15 .code16gcc 17 .code16gcc
16 .text 18 .text
17 19
18 .globl memcpy 20GLOBAL(memcpy)
19 .type memcpy, @function
20memcpy:
21 pushw %si 21 pushw %si
22 pushw %di 22 pushw %di
23 movw %ax, %di 23 movw %ax, %di
@@ -31,11 +31,9 @@ memcpy:
31 popw %di 31 popw %di
32 popw %si 32 popw %si
33 ret 33 ret
34 .size memcpy, .-memcpy 34ENDPROC(memcpy)
35 35
36 .globl memset 36GLOBAL(memset)
37 .type memset, @function
38memset:
39 pushw %di 37 pushw %di
40 movw %ax, %di 38 movw %ax, %di
41 movzbl %dl, %eax 39 movzbl %dl, %eax
@@ -48,52 +46,42 @@ memset:
48 rep; stosb 46 rep; stosb
49 popw %di 47 popw %di
50 ret 48 ret
51 .size memset, .-memset 49ENDPROC(memset)
52 50
53 .globl copy_from_fs 51GLOBAL(copy_from_fs)
54 .type copy_from_fs, @function
55copy_from_fs:
56 pushw %ds 52 pushw %ds
57 pushw %fs 53 pushw %fs
58 popw %ds 54 popw %ds
59 call memcpy 55 call memcpy
60 popw %ds 56 popw %ds
61 ret 57 ret
62 .size copy_from_fs, .-copy_from_fs 58ENDPROC(copy_from_fs)
63 59
64 .globl copy_to_fs 60GLOBAL(copy_to_fs)
65 .type copy_to_fs, @function
66copy_to_fs:
67 pushw %es 61 pushw %es
68 pushw %fs 62 pushw %fs
69 popw %es 63 popw %es
70 call memcpy 64 call memcpy
71 popw %es 65 popw %es
72 ret 66 ret
73 .size copy_to_fs, .-copy_to_fs 67ENDPROC(copy_to_fs)
74 68
75#if 0 /* Not currently used, but can be enabled as needed */ 69#if 0 /* Not currently used, but can be enabled as needed */
76 70GLOBAL(copy_from_gs)
77 .globl copy_from_gs
78 .type copy_from_gs, @function
79copy_from_gs:
80 pushw %ds 71 pushw %ds
81 pushw %gs 72 pushw %gs
82 popw %ds 73 popw %ds
83 call memcpy 74 call memcpy
84 popw %ds 75 popw %ds
85 ret 76 ret
86 .size copy_from_gs, .-copy_from_gs 77ENDPROC(copy_from_gs)
87 .globl copy_to_gs
88 78
89 .type copy_to_gs, @function 79GLOBAL(copy_to_gs)
90copy_to_gs:
91 pushw %es 80 pushw %es
92 pushw %gs 81 pushw %gs
93 popw %es 82 popw %es
94 call memcpy 83 call memcpy
95 popw %es 84 popw %es
96 ret 85 ret
97 .size copy_to_gs, .-copy_to_gs 86ENDPROC(copy_to_gs)
98
99#endif 87#endif
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index b993062e9a5f..5d84d1c74e4c 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -19,17 +19,13 @@
19#include <linux/utsrelease.h> 19#include <linux/utsrelease.h>
20#include <asm/boot.h> 20#include <asm/boot.h>
21#include <asm/e820.h> 21#include <asm/e820.h>
22#include <asm/page.h> 22#include <asm/page_types.h>
23#include <asm/setup.h> 23#include <asm/setup.h>
24#include "boot.h" 24#include "boot.h"
25#include "offsets.h" 25#include "offsets.h"
26 26
27SETUPSECTS = 4 /* default nr of setup-sectors */
28BOOTSEG = 0x07C0 /* original address of boot-sector */ 27BOOTSEG = 0x07C0 /* original address of boot-sector */
29SYSSEG = DEF_SYSSEG /* system loaded at 0x10000 (65536) */ 28SYSSEG = 0x1000 /* historical load address >> 4 */
30SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */
31 /* to be loaded */
32ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */
33 29
34#ifndef SVGA_MODE 30#ifndef SVGA_MODE
35#define SVGA_MODE ASK_VGA 31#define SVGA_MODE ASK_VGA
@@ -97,12 +93,12 @@ bugger_off_msg:
97 .section ".header", "a" 93 .section ".header", "a"
98 .globl hdr 94 .globl hdr
99hdr: 95hdr:
100setup_sects: .byte SETUPSECTS 96setup_sects: .byte 0 /* Filled in by build.c */
101root_flags: .word ROOT_RDONLY 97root_flags: .word ROOT_RDONLY
102syssize: .long SYSSIZE 98syssize: .long 0 /* Filled in by build.c */
103ram_size: .word RAMDISK 99ram_size: .word 0 /* Obsolete */
104vid_mode: .word SVGA_MODE 100vid_mode: .word SVGA_MODE
105root_dev: .word ROOT_DEV 101root_dev: .word 0 /* Filled in by build.c */
106boot_flag: .word 0xAA55 102boot_flag: .word 0xAA55
107 103
108 # offset 512, entry point 104 # offset 512, entry point
@@ -123,14 +119,15 @@ _start:
123 # or else old loadlin-1.5 will fail) 119 # or else old loadlin-1.5 will fail)
124 .globl realmode_swtch 120 .globl realmode_swtch
125realmode_swtch: .word 0, 0 # default_switch, SETUPSEG 121realmode_swtch: .word 0, 0 # default_switch, SETUPSEG
126start_sys_seg: .word SYSSEG 122start_sys_seg: .word SYSSEG # obsolete and meaningless, but just
123 # in case something decided to "use" it
127 .word kernel_version-512 # pointing to kernel version string 124 .word kernel_version-512 # pointing to kernel version string
128 # above section of header is compatible 125 # above section of header is compatible
129 # with loadlin-1.5 (header v1.5). Don't 126 # with loadlin-1.5 (header v1.5). Don't
130 # change it. 127 # change it.
131 128
132type_of_loader: .byte 0 # = 0, old one (LILO, Loadlin, 129type_of_loader: .byte 0 # 0 means ancient bootloader, newer
133 # Bootlin, SYSLX, bootsect...) 130 # bootloaders know to change this.
134 # See Documentation/i386/boot.txt for 131 # See Documentation/i386/boot.txt for
135 # assigned ids 132 # assigned ids
136 133
@@ -142,11 +139,7 @@ CAN_USE_HEAP = 0x80 # If set, the loader also has set
142 # space behind setup.S can be used for 139 # space behind setup.S can be used for
143 # heap purposes. 140 # heap purposes.
144 # Only the loader knows what is free 141 # Only the loader knows what is free
145#ifndef __BIG_KERNEL__
146 .byte 0
147#else
148 .byte LOADED_HIGH 142 .byte LOADED_HIGH
149#endif
150 143
151setup_move_size: .word 0x8000 # size to move, when setup is not 144setup_move_size: .word 0x8000 # size to move, when setup is not
152 # loaded at 0x90000. We will move setup 145 # loaded at 0x90000. We will move setup
@@ -157,11 +150,7 @@ setup_move_size: .word 0x8000 # size to move, when setup is not
157 150
158code32_start: # here loaders can put a different 151code32_start: # here loaders can put a different
159 # start address for 32-bit code. 152 # start address for 32-bit code.
160#ifndef __BIG_KERNEL__
161 .long 0x1000 # 0x1000 = default for zImage
162#else
163 .long 0x100000 # 0x100000 = default for big kernel 153 .long 0x100000 # 0x100000 = default for big kernel
164#endif
165 154
166ramdisk_image: .long 0 # address of loaded ramdisk image 155ramdisk_image: .long 0 # address of loaded ramdisk image
167 # Here the loader puts the 32-bit 156 # Here the loader puts the 32-bit
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index 197421db1af1..58f0415d3ae0 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -149,11 +149,6 @@ void main(void)
149 /* Query MCA information */ 149 /* Query MCA information */
150 query_mca(); 150 query_mca();
151 151
152 /* Voyager */
153#ifdef CONFIG_X86_VOYAGER
154 query_voyager();
155#endif
156
157 /* Query Intel SpeedStep (IST) information */ 152 /* Query Intel SpeedStep (IST) information */
158 query_ist(); 153 query_ist();
159 154
diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c
index 85a1cd8a8ff8..8062f8915250 100644
--- a/arch/x86/boot/pm.c
+++ b/arch/x86/boot/pm.c
@@ -33,47 +33,6 @@ static void realmode_switch_hook(void)
33} 33}
34 34
35/* 35/*
36 * A zImage kernel is loaded at 0x10000 but wants to run at 0x1000.
37 * A bzImage kernel is loaded and runs at 0x100000.
38 */
39static void move_kernel_around(void)
40{
41 /* Note: rely on the compile-time option here rather than
42 the LOADED_HIGH flag. The Qemu kernel loader unconditionally
43 sets the loadflags to zero. */
44#ifndef __BIG_KERNEL__
45 u16 dst_seg, src_seg;
46 u32 syssize;
47
48 dst_seg = 0x1000 >> 4;
49 src_seg = 0x10000 >> 4;
50 syssize = boot_params.hdr.syssize; /* Size in 16-byte paragraphs */
51
52 while (syssize) {
53 int paras = (syssize >= 0x1000) ? 0x1000 : syssize;
54 int dwords = paras << 2;
55
56 asm volatile("pushw %%es ; "
57 "pushw %%ds ; "
58 "movw %1,%%es ; "
59 "movw %2,%%ds ; "
60 "xorw %%di,%%di ; "
61 "xorw %%si,%%si ; "
62 "rep;movsl ; "
63 "popw %%ds ; "
64 "popw %%es"
65 : "+c" (dwords)
66 : "r" (dst_seg), "r" (src_seg)
67 : "esi", "edi");
68
69 syssize -= paras;
70 dst_seg += paras;
71 src_seg += paras;
72 }
73#endif
74}
75
76/*
77 * Disable all interrupts at the legacy PIC. 36 * Disable all interrupts at the legacy PIC.
78 */ 37 */
79static void mask_all_interrupts(void) 38static void mask_all_interrupts(void)
@@ -147,9 +106,6 @@ void go_to_protected_mode(void)
147 /* Hook before leaving real mode, also disables interrupts */ 106 /* Hook before leaving real mode, also disables interrupts */
148 realmode_switch_hook(); 107 realmode_switch_hook();
149 108
150 /* Move the kernel/setup to their final resting places */
151 move_kernel_around();
152
153 /* Enable the A20 gate */ 109 /* Enable the A20 gate */
154 if (enable_a20()) { 110 if (enable_a20()) {
155 puts("A20 gate not responding, unable to boot...\n"); 111 puts("A20 gate not responding, unable to boot...\n");
diff --git a/arch/x86/boot/pmjump.S b/arch/x86/boot/pmjump.S
index 141b6e20ed31..019c17a75851 100644
--- a/arch/x86/boot/pmjump.S
+++ b/arch/x86/boot/pmjump.S
@@ -15,18 +15,15 @@
15#include <asm/boot.h> 15#include <asm/boot.h>
16#include <asm/processor-flags.h> 16#include <asm/processor-flags.h>
17#include <asm/segment.h> 17#include <asm/segment.h>
18#include <linux/linkage.h>
18 19
19 .text 20 .text
20
21 .globl protected_mode_jump
22 .type protected_mode_jump, @function
23
24 .code16 21 .code16
25 22
26/* 23/*
27 * void protected_mode_jump(u32 entrypoint, u32 bootparams); 24 * void protected_mode_jump(u32 entrypoint, u32 bootparams);
28 */ 25 */
29protected_mode_jump: 26GLOBAL(protected_mode_jump)
30 movl %edx, %esi # Pointer to boot_params table 27 movl %edx, %esi # Pointer to boot_params table
31 28
32 xorl %ebx, %ebx 29 xorl %ebx, %ebx
@@ -47,12 +44,10 @@ protected_mode_jump:
47 .byte 0x66, 0xea # ljmpl opcode 44 .byte 0x66, 0xea # ljmpl opcode
482: .long in_pm32 # offset 452: .long in_pm32 # offset
49 .word __BOOT_CS # segment 46 .word __BOOT_CS # segment
50 47ENDPROC(protected_mode_jump)
51 .size protected_mode_jump, .-protected_mode_jump
52 48
53 .code32 49 .code32
54 .type in_pm32, @function 50GLOBAL(in_pm32)
55in_pm32:
56 # Set up data segments for flat 32-bit mode 51 # Set up data segments for flat 32-bit mode
57 movl %ecx, %ds 52 movl %ecx, %ds
58 movl %ecx, %es 53 movl %ecx, %es
@@ -78,5 +73,4 @@ in_pm32:
78 lldt %cx 73 lldt %cx
79 74
80 jmpl *%eax # Jump to the 32-bit entrypoint 75 jmpl *%eax # Jump to the 32-bit entrypoint
81 76ENDPROC(in_pm32)
82 .size in_pm32, .-in_pm32
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index 44dc1923c0e3..ee3a4ea923ac 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -130,7 +130,7 @@ static void die(const char * str, ...)
130 130
131static void usage(void) 131static void usage(void)
132{ 132{
133 die("Usage: build [-b] setup system [rootdev] [> image]"); 133 die("Usage: build setup system [rootdev] [> image]");
134} 134}
135 135
136int main(int argc, char ** argv) 136int main(int argc, char ** argv)
@@ -145,11 +145,6 @@ int main(int argc, char ** argv)
145 void *kernel; 145 void *kernel;
146 u32 crc = 0xffffffffUL; 146 u32 crc = 0xffffffffUL;
147 147
148 if (argc > 2 && !strcmp(argv[1], "-b"))
149 {
150 is_big_kernel = 1;
151 argc--, argv++;
152 }
153 if ((argc < 3) || (argc > 4)) 148 if ((argc < 3) || (argc > 4))
154 usage(); 149 usage();
155 if (argc > 3) { 150 if (argc > 3) {
@@ -216,8 +211,6 @@ int main(int argc, char ** argv)
216 die("Unable to mmap '%s': %m", argv[2]); 211 die("Unable to mmap '%s': %m", argv[2]);
217 /* Number of 16-byte paragraphs, including space for a 4-byte CRC */ 212 /* Number of 16-byte paragraphs, including space for a 4-byte CRC */
218 sys_size = (sz + 15 + 4) / 16; 213 sys_size = (sz + 15 + 4) / 16;
219 if (!is_big_kernel && sys_size > DEF_SYSSIZE)
220 die("System is too big. Try using bzImage or modules.");
221 214
222 /* Patch the setup code with the appropriate size parameters */ 215 /* Patch the setup code with the appropriate size parameters */
223 buf[0x1f1] = setup_sectors-1; 216 buf[0x1f1] = setup_sectors-1;
diff --git a/arch/x86/boot/voyager.c b/arch/x86/boot/voyager.c
deleted file mode 100644
index 433909d61e5c..000000000000
--- a/arch/x86/boot/voyager.c
+++ /dev/null
@@ -1,40 +0,0 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * Get the Voyager config information
13 */
14
15#include "boot.h"
16
17int query_voyager(void)
18{
19 u8 err;
20 u16 es, di;
21 /* Abuse the apm_bios_info area for this */
22 u8 *data_ptr = (u8 *)&boot_params.apm_bios_info;
23
24 data_ptr[0] = 0xff; /* Flag on config not found(?) */
25
26 asm("pushw %%es ; "
27 "int $0x15 ; "
28 "setc %0 ; "
29 "movw %%es, %1 ; "
30 "popw %%es"
31 : "=q" (err), "=r" (es), "=D" (di)
32 : "a" (0xffc0));
33
34 if (err)
35 return -1; /* Not Voyager */
36
37 set_fs(es);
38 copy_from_fs(data_ptr, di, 7); /* Table is 7 bytes apparently */
39 return 0;
40}
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index edba00d98ac3..235b81d0f6f2 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1,14 +1,13 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.27-rc5 3# Linux kernel version: 2.6.29-rc4
4# Wed Sep 3 17:23:09 2008 4# Tue Feb 24 15:50:58 2009
5# 5#
6# CONFIG_64BIT is not set 6# CONFIG_64BIT is not set
7CONFIG_X86_32=y 7CONFIG_X86_32=y
8# CONFIG_X86_64 is not set 8# CONFIG_X86_64 is not set
9CONFIG_X86=y 9CONFIG_X86=y
10CONFIG_ARCH_DEFCONFIG="arch/x86/configs/i386_defconfig" 10CONFIG_ARCH_DEFCONFIG="arch/x86/configs/i386_defconfig"
11# CONFIG_GENERIC_LOCKBREAK is not set
12CONFIG_GENERIC_TIME=y 11CONFIG_GENERIC_TIME=y
13CONFIG_GENERIC_CMOS_UPDATE=y 12CONFIG_GENERIC_CMOS_UPDATE=y
14CONFIG_CLOCKSOURCE_WATCHDOG=y 13CONFIG_CLOCKSOURCE_WATCHDOG=y
@@ -24,16 +23,14 @@ CONFIG_GENERIC_ISA_DMA=y
24CONFIG_GENERIC_IOMAP=y 23CONFIG_GENERIC_IOMAP=y
25CONFIG_GENERIC_BUG=y 24CONFIG_GENERIC_BUG=y
26CONFIG_GENERIC_HWEIGHT=y 25CONFIG_GENERIC_HWEIGHT=y
27# CONFIG_GENERIC_GPIO is not set
28CONFIG_ARCH_MAY_HAVE_PC_FDC=y 26CONFIG_ARCH_MAY_HAVE_PC_FDC=y
29# CONFIG_RWSEM_GENERIC_SPINLOCK is not set 27# CONFIG_RWSEM_GENERIC_SPINLOCK is not set
30CONFIG_RWSEM_XCHGADD_ALGORITHM=y 28CONFIG_RWSEM_XCHGADD_ALGORITHM=y
31# CONFIG_ARCH_HAS_ILOG2_U32 is not set
32# CONFIG_ARCH_HAS_ILOG2_U64 is not set
33CONFIG_ARCH_HAS_CPU_IDLE_WAIT=y 29CONFIG_ARCH_HAS_CPU_IDLE_WAIT=y
34CONFIG_GENERIC_CALIBRATE_DELAY=y 30CONFIG_GENERIC_CALIBRATE_DELAY=y
35# CONFIG_GENERIC_TIME_VSYSCALL is not set 31# CONFIG_GENERIC_TIME_VSYSCALL is not set
36CONFIG_ARCH_HAS_CPU_RELAX=y 32CONFIG_ARCH_HAS_CPU_RELAX=y
33CONFIG_ARCH_HAS_DEFAULT_IDLE=y
37CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y 34CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
38CONFIG_HAVE_SETUP_PER_CPU_AREA=y 35CONFIG_HAVE_SETUP_PER_CPU_AREA=y
39# CONFIG_HAVE_CPUMASK_OF_CPU_MAP is not set 36# CONFIG_HAVE_CPUMASK_OF_CPU_MAP is not set
@@ -42,12 +39,12 @@ CONFIG_ARCH_SUSPEND_POSSIBLE=y
42# CONFIG_ZONE_DMA32 is not set 39# CONFIG_ZONE_DMA32 is not set
43CONFIG_ARCH_POPULATES_NODE_MAP=y 40CONFIG_ARCH_POPULATES_NODE_MAP=y
44# CONFIG_AUDIT_ARCH is not set 41# CONFIG_AUDIT_ARCH is not set
45CONFIG_ARCH_SUPPORTS_AOUT=y
46CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y 42CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
47CONFIG_GENERIC_HARDIRQS=y 43CONFIG_GENERIC_HARDIRQS=y
48CONFIG_GENERIC_IRQ_PROBE=y 44CONFIG_GENERIC_IRQ_PROBE=y
49CONFIG_GENERIC_PENDING_IRQ=y 45CONFIG_GENERIC_PENDING_IRQ=y
50CONFIG_X86_SMP=y 46CONFIG_X86_SMP=y
47CONFIG_USE_GENERIC_SMP_HELPERS=y
51CONFIG_X86_32_SMP=y 48CONFIG_X86_32_SMP=y
52CONFIG_X86_HT=y 49CONFIG_X86_HT=y
53CONFIG_X86_BIOS_REBOOT=y 50CONFIG_X86_BIOS_REBOOT=y
@@ -76,30 +73,44 @@ CONFIG_TASK_IO_ACCOUNTING=y
76CONFIG_AUDIT=y 73CONFIG_AUDIT=y
77CONFIG_AUDITSYSCALL=y 74CONFIG_AUDITSYSCALL=y
78CONFIG_AUDIT_TREE=y 75CONFIG_AUDIT_TREE=y
76
77#
78# RCU Subsystem
79#
80# CONFIG_CLASSIC_RCU is not set
81CONFIG_TREE_RCU=y
82# CONFIG_PREEMPT_RCU is not set
83# CONFIG_RCU_TRACE is not set
84CONFIG_RCU_FANOUT=32
85# CONFIG_RCU_FANOUT_EXACT is not set
86# CONFIG_TREE_RCU_TRACE is not set
87# CONFIG_PREEMPT_RCU_TRACE is not set
79# CONFIG_IKCONFIG is not set 88# CONFIG_IKCONFIG is not set
80CONFIG_LOG_BUF_SHIFT=18 89CONFIG_LOG_BUF_SHIFT=18
81CONFIG_CGROUPS=y
82# CONFIG_CGROUP_DEBUG is not set
83CONFIG_CGROUP_NS=y
84# CONFIG_CGROUP_DEVICE is not set
85CONFIG_CPUSETS=y
86CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y 90CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y
87CONFIG_GROUP_SCHED=y 91CONFIG_GROUP_SCHED=y
88CONFIG_FAIR_GROUP_SCHED=y 92CONFIG_FAIR_GROUP_SCHED=y
89# CONFIG_RT_GROUP_SCHED is not set 93# CONFIG_RT_GROUP_SCHED is not set
90# CONFIG_USER_SCHED is not set 94# CONFIG_USER_SCHED is not set
91CONFIG_CGROUP_SCHED=y 95CONFIG_CGROUP_SCHED=y
96CONFIG_CGROUPS=y
97# CONFIG_CGROUP_DEBUG is not set
98CONFIG_CGROUP_NS=y
99CONFIG_CGROUP_FREEZER=y
100# CONFIG_CGROUP_DEVICE is not set
101CONFIG_CPUSETS=y
102CONFIG_PROC_PID_CPUSET=y
92CONFIG_CGROUP_CPUACCT=y 103CONFIG_CGROUP_CPUACCT=y
93CONFIG_RESOURCE_COUNTERS=y 104CONFIG_RESOURCE_COUNTERS=y
94# CONFIG_CGROUP_MEM_RES_CTLR is not set 105# CONFIG_CGROUP_MEM_RES_CTLR is not set
95# CONFIG_SYSFS_DEPRECATED_V2 is not set 106# CONFIG_SYSFS_DEPRECATED_V2 is not set
96CONFIG_PROC_PID_CPUSET=y
97CONFIG_RELAY=y 107CONFIG_RELAY=y
98CONFIG_NAMESPACES=y 108CONFIG_NAMESPACES=y
99CONFIG_UTS_NS=y 109CONFIG_UTS_NS=y
100CONFIG_IPC_NS=y 110CONFIG_IPC_NS=y
101CONFIG_USER_NS=y 111CONFIG_USER_NS=y
102CONFIG_PID_NS=y 112CONFIG_PID_NS=y
113CONFIG_NET_NS=y
103CONFIG_BLK_DEV_INITRD=y 114CONFIG_BLK_DEV_INITRD=y
104CONFIG_INITRAMFS_SOURCE="" 115CONFIG_INITRAMFS_SOURCE=""
105CONFIG_CC_OPTIMIZE_FOR_SIZE=y 116CONFIG_CC_OPTIMIZE_FOR_SIZE=y
@@ -124,12 +135,15 @@ CONFIG_SIGNALFD=y
124CONFIG_TIMERFD=y 135CONFIG_TIMERFD=y
125CONFIG_EVENTFD=y 136CONFIG_EVENTFD=y
126CONFIG_SHMEM=y 137CONFIG_SHMEM=y
138CONFIG_AIO=y
127CONFIG_VM_EVENT_COUNTERS=y 139CONFIG_VM_EVENT_COUNTERS=y
140CONFIG_PCI_QUIRKS=y
128CONFIG_SLUB_DEBUG=y 141CONFIG_SLUB_DEBUG=y
129# CONFIG_SLAB is not set 142# CONFIG_SLAB is not set
130CONFIG_SLUB=y 143CONFIG_SLUB=y
131# CONFIG_SLOB is not set 144# CONFIG_SLOB is not set
132CONFIG_PROFILING=y 145CONFIG_PROFILING=y
146CONFIG_TRACEPOINTS=y
133CONFIG_MARKERS=y 147CONFIG_MARKERS=y
134# CONFIG_OPROFILE is not set 148# CONFIG_OPROFILE is not set
135CONFIG_HAVE_OPROFILE=y 149CONFIG_HAVE_OPROFILE=y
@@ -139,15 +153,10 @@ CONFIG_KRETPROBES=y
139CONFIG_HAVE_IOREMAP_PROT=y 153CONFIG_HAVE_IOREMAP_PROT=y
140CONFIG_HAVE_KPROBES=y 154CONFIG_HAVE_KPROBES=y
141CONFIG_HAVE_KRETPROBES=y 155CONFIG_HAVE_KRETPROBES=y
142# CONFIG_HAVE_ARCH_TRACEHOOK is not set 156CONFIG_HAVE_ARCH_TRACEHOOK=y
143# CONFIG_HAVE_DMA_ATTRS is not set
144CONFIG_USE_GENERIC_SMP_HELPERS=y
145# CONFIG_HAVE_CLK is not set
146CONFIG_PROC_PAGE_MONITOR=y
147CONFIG_HAVE_GENERIC_DMA_COHERENT=y 157CONFIG_HAVE_GENERIC_DMA_COHERENT=y
148CONFIG_SLABINFO=y 158CONFIG_SLABINFO=y
149CONFIG_RT_MUTEXES=y 159CONFIG_RT_MUTEXES=y
150# CONFIG_TINY_SHMEM is not set
151CONFIG_BASE_SMALL=0 160CONFIG_BASE_SMALL=0
152CONFIG_MODULES=y 161CONFIG_MODULES=y
153# CONFIG_MODULE_FORCE_LOAD is not set 162# CONFIG_MODULE_FORCE_LOAD is not set
@@ -155,12 +164,10 @@ CONFIG_MODULE_UNLOAD=y
155CONFIG_MODULE_FORCE_UNLOAD=y 164CONFIG_MODULE_FORCE_UNLOAD=y
156# CONFIG_MODVERSIONS is not set 165# CONFIG_MODVERSIONS is not set
157# CONFIG_MODULE_SRCVERSION_ALL is not set 166# CONFIG_MODULE_SRCVERSION_ALL is not set
158CONFIG_KMOD=y
159CONFIG_STOP_MACHINE=y 167CONFIG_STOP_MACHINE=y
160CONFIG_BLOCK=y 168CONFIG_BLOCK=y
161# CONFIG_LBD is not set 169# CONFIG_LBD is not set
162CONFIG_BLK_DEV_IO_TRACE=y 170CONFIG_BLK_DEV_IO_TRACE=y
163# CONFIG_LSF is not set
164CONFIG_BLK_DEV_BSG=y 171CONFIG_BLK_DEV_BSG=y
165# CONFIG_BLK_DEV_INTEGRITY is not set 172# CONFIG_BLK_DEV_INTEGRITY is not set
166 173
@@ -176,7 +183,7 @@ CONFIG_IOSCHED_CFQ=y
176CONFIG_DEFAULT_CFQ=y 183CONFIG_DEFAULT_CFQ=y
177# CONFIG_DEFAULT_NOOP is not set 184# CONFIG_DEFAULT_NOOP is not set
178CONFIG_DEFAULT_IOSCHED="cfq" 185CONFIG_DEFAULT_IOSCHED="cfq"
179CONFIG_CLASSIC_RCU=y 186CONFIG_FREEZER=y
180 187
181# 188#
182# Processor type and features 189# Processor type and features
@@ -186,15 +193,14 @@ CONFIG_NO_HZ=y
186CONFIG_HIGH_RES_TIMERS=y 193CONFIG_HIGH_RES_TIMERS=y
187CONFIG_GENERIC_CLOCKEVENTS_BUILD=y 194CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
188CONFIG_SMP=y 195CONFIG_SMP=y
196CONFIG_SPARSE_IRQ=y
189CONFIG_X86_FIND_SMP_CONFIG=y 197CONFIG_X86_FIND_SMP_CONFIG=y
190CONFIG_X86_MPPARSE=y 198CONFIG_X86_MPPARSE=y
191CONFIG_X86_PC=y
192# CONFIG_X86_ELAN is not set 199# CONFIG_X86_ELAN is not set
193# CONFIG_X86_VOYAGER is not set
194# CONFIG_X86_GENERICARCH is not set 200# CONFIG_X86_GENERICARCH is not set
195# CONFIG_X86_VSMP is not set 201# CONFIG_X86_VSMP is not set
196# CONFIG_X86_RDC321X is not set 202# CONFIG_X86_RDC321X is not set
197CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y 203CONFIG_SCHED_OMIT_FRAME_POINTER=y
198# CONFIG_PARAVIRT_GUEST is not set 204# CONFIG_PARAVIRT_GUEST is not set
199# CONFIG_MEMTEST is not set 205# CONFIG_MEMTEST is not set
200# CONFIG_M386 is not set 206# CONFIG_M386 is not set
@@ -238,10 +244,19 @@ CONFIG_X86_TSC=y
238CONFIG_X86_CMOV=y 244CONFIG_X86_CMOV=y
239CONFIG_X86_MINIMUM_CPU_FAMILY=4 245CONFIG_X86_MINIMUM_CPU_FAMILY=4
240CONFIG_X86_DEBUGCTLMSR=y 246CONFIG_X86_DEBUGCTLMSR=y
247CONFIG_CPU_SUP_INTEL=y
248CONFIG_CPU_SUP_CYRIX_32=y
249CONFIG_CPU_SUP_AMD=y
250CONFIG_CPU_SUP_CENTAUR_32=y
251CONFIG_CPU_SUP_TRANSMETA_32=y
252CONFIG_CPU_SUP_UMC_32=y
253CONFIG_X86_DS=y
254CONFIG_X86_PTRACE_BTS=y
241CONFIG_HPET_TIMER=y 255CONFIG_HPET_TIMER=y
242CONFIG_HPET_EMULATE_RTC=y 256CONFIG_HPET_EMULATE_RTC=y
243CONFIG_DMI=y 257CONFIG_DMI=y
244# CONFIG_IOMMU_HELPER is not set 258# CONFIG_IOMMU_HELPER is not set
259# CONFIG_IOMMU_API is not set
245CONFIG_NR_CPUS=64 260CONFIG_NR_CPUS=64
246CONFIG_SCHED_SMT=y 261CONFIG_SCHED_SMT=y
247CONFIG_SCHED_MC=y 262CONFIG_SCHED_MC=y
@@ -250,12 +265,17 @@ CONFIG_PREEMPT_VOLUNTARY=y
250# CONFIG_PREEMPT is not set 265# CONFIG_PREEMPT is not set
251CONFIG_X86_LOCAL_APIC=y 266CONFIG_X86_LOCAL_APIC=y
252CONFIG_X86_IO_APIC=y 267CONFIG_X86_IO_APIC=y
253# CONFIG_X86_MCE is not set 268CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y
269CONFIG_X86_MCE=y
270CONFIG_X86_MCE_NONFATAL=y
271CONFIG_X86_MCE_P4THERMAL=y
254CONFIG_VM86=y 272CONFIG_VM86=y
255# CONFIG_TOSHIBA is not set 273# CONFIG_TOSHIBA is not set
256# CONFIG_I8K is not set 274# CONFIG_I8K is not set
257CONFIG_X86_REBOOTFIXUPS=y 275CONFIG_X86_REBOOTFIXUPS=y
258CONFIG_MICROCODE=y 276CONFIG_MICROCODE=y
277CONFIG_MICROCODE_INTEL=y
278CONFIG_MICROCODE_AMD=y
259CONFIG_MICROCODE_OLD_INTERFACE=y 279CONFIG_MICROCODE_OLD_INTERFACE=y
260CONFIG_X86_MSR=y 280CONFIG_X86_MSR=y
261CONFIG_X86_CPUID=y 281CONFIG_X86_CPUID=y
@@ -264,6 +284,7 @@ CONFIG_HIGHMEM4G=y
264# CONFIG_HIGHMEM64G is not set 284# CONFIG_HIGHMEM64G is not set
265CONFIG_PAGE_OFFSET=0xC0000000 285CONFIG_PAGE_OFFSET=0xC0000000
266CONFIG_HIGHMEM=y 286CONFIG_HIGHMEM=y
287# CONFIG_ARCH_PHYS_ADDR_T_64BIT is not set
267CONFIG_ARCH_FLATMEM_ENABLE=y 288CONFIG_ARCH_FLATMEM_ENABLE=y
268CONFIG_ARCH_SPARSEMEM_ENABLE=y 289CONFIG_ARCH_SPARSEMEM_ENABLE=y
269CONFIG_ARCH_SELECT_MEMORY_MODEL=y 290CONFIG_ARCH_SELECT_MEMORY_MODEL=y
@@ -274,14 +295,17 @@ CONFIG_FLATMEM_MANUAL=y
274CONFIG_FLATMEM=y 295CONFIG_FLATMEM=y
275CONFIG_FLAT_NODE_MEM_MAP=y 296CONFIG_FLAT_NODE_MEM_MAP=y
276CONFIG_SPARSEMEM_STATIC=y 297CONFIG_SPARSEMEM_STATIC=y
277# CONFIG_SPARSEMEM_VMEMMAP_ENABLE is not set
278CONFIG_PAGEFLAGS_EXTENDED=y 298CONFIG_PAGEFLAGS_EXTENDED=y
279CONFIG_SPLIT_PTLOCK_CPUS=4 299CONFIG_SPLIT_PTLOCK_CPUS=4
280CONFIG_RESOURCES_64BIT=y 300# CONFIG_PHYS_ADDR_T_64BIT is not set
281CONFIG_ZONE_DMA_FLAG=1 301CONFIG_ZONE_DMA_FLAG=1
282CONFIG_BOUNCE=y 302CONFIG_BOUNCE=y
283CONFIG_VIRT_TO_BUS=y 303CONFIG_VIRT_TO_BUS=y
304CONFIG_UNEVICTABLE_LRU=y
284CONFIG_HIGHPTE=y 305CONFIG_HIGHPTE=y
306CONFIG_X86_CHECK_BIOS_CORRUPTION=y
307CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y
308CONFIG_X86_RESERVE_LOW_64K=y
285# CONFIG_MATH_EMULATION is not set 309# CONFIG_MATH_EMULATION is not set
286CONFIG_MTRR=y 310CONFIG_MTRR=y
287# CONFIG_MTRR_SANITIZER is not set 311# CONFIG_MTRR_SANITIZER is not set
@@ -302,10 +326,11 @@ CONFIG_PHYSICAL_START=0x1000000
302CONFIG_PHYSICAL_ALIGN=0x200000 326CONFIG_PHYSICAL_ALIGN=0x200000
303CONFIG_HOTPLUG_CPU=y 327CONFIG_HOTPLUG_CPU=y
304# CONFIG_COMPAT_VDSO is not set 328# CONFIG_COMPAT_VDSO is not set
329# CONFIG_CMDLINE_BOOL is not set
305CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y 330CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
306 331
307# 332#
308# Power management options 333# Power management and ACPI options
309# 334#
310CONFIG_PM=y 335CONFIG_PM=y
311CONFIG_PM_DEBUG=y 336CONFIG_PM_DEBUG=y
@@ -331,19 +356,13 @@ CONFIG_ACPI_BATTERY=y
331CONFIG_ACPI_BUTTON=y 356CONFIG_ACPI_BUTTON=y
332CONFIG_ACPI_FAN=y 357CONFIG_ACPI_FAN=y
333CONFIG_ACPI_DOCK=y 358CONFIG_ACPI_DOCK=y
334# CONFIG_ACPI_BAY is not set
335CONFIG_ACPI_PROCESSOR=y 359CONFIG_ACPI_PROCESSOR=y
336CONFIG_ACPI_HOTPLUG_CPU=y 360CONFIG_ACPI_HOTPLUG_CPU=y
337CONFIG_ACPI_THERMAL=y 361CONFIG_ACPI_THERMAL=y
338# CONFIG_ACPI_WMI is not set
339# CONFIG_ACPI_ASUS is not set
340# CONFIG_ACPI_TOSHIBA is not set
341# CONFIG_ACPI_CUSTOM_DSDT is not set 362# CONFIG_ACPI_CUSTOM_DSDT is not set
342CONFIG_ACPI_BLACKLIST_YEAR=0 363CONFIG_ACPI_BLACKLIST_YEAR=0
343# CONFIG_ACPI_DEBUG is not set 364# CONFIG_ACPI_DEBUG is not set
344CONFIG_ACPI_EC=y
345# CONFIG_ACPI_PCI_SLOT is not set 365# CONFIG_ACPI_PCI_SLOT is not set
346CONFIG_ACPI_POWER=y
347CONFIG_ACPI_SYSTEM=y 366CONFIG_ACPI_SYSTEM=y
348CONFIG_X86_PM_TIMER=y 367CONFIG_X86_PM_TIMER=y
349CONFIG_ACPI_CONTAINER=y 368CONFIG_ACPI_CONTAINER=y
@@ -388,7 +407,6 @@ CONFIG_X86_ACPI_CPUFREQ=y
388# 407#
389# shared options 408# shared options
390# 409#
391# CONFIG_X86_ACPI_CPUFREQ_PROC_INTF is not set
392# CONFIG_X86_SPEEDSTEP_LIB is not set 410# CONFIG_X86_SPEEDSTEP_LIB is not set
393CONFIG_CPU_IDLE=y 411CONFIG_CPU_IDLE=y
394CONFIG_CPU_IDLE_GOV_LADDER=y 412CONFIG_CPU_IDLE_GOV_LADDER=y
@@ -415,6 +433,7 @@ CONFIG_ARCH_SUPPORTS_MSI=y
415CONFIG_PCI_MSI=y 433CONFIG_PCI_MSI=y
416# CONFIG_PCI_LEGACY is not set 434# CONFIG_PCI_LEGACY is not set
417# CONFIG_PCI_DEBUG is not set 435# CONFIG_PCI_DEBUG is not set
436# CONFIG_PCI_STUB is not set
418CONFIG_HT_IRQ=y 437CONFIG_HT_IRQ=y
419CONFIG_ISA_DMA_API=y 438CONFIG_ISA_DMA_API=y
420# CONFIG_ISA is not set 439# CONFIG_ISA is not set
@@ -452,13 +471,17 @@ CONFIG_HOTPLUG_PCI=y
452# Executable file formats / Emulations 471# Executable file formats / Emulations
453# 472#
454CONFIG_BINFMT_ELF=y 473CONFIG_BINFMT_ELF=y
474CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
475CONFIG_HAVE_AOUT=y
455# CONFIG_BINFMT_AOUT is not set 476# CONFIG_BINFMT_AOUT is not set
456CONFIG_BINFMT_MISC=y 477CONFIG_BINFMT_MISC=y
478CONFIG_HAVE_ATOMIC_IOMAP=y
457CONFIG_NET=y 479CONFIG_NET=y
458 480
459# 481#
460# Networking options 482# Networking options
461# 483#
484CONFIG_COMPAT_NET_DEV_OPS=y
462CONFIG_PACKET=y 485CONFIG_PACKET=y
463CONFIG_PACKET_MMAP=y 486CONFIG_PACKET_MMAP=y
464CONFIG_UNIX=y 487CONFIG_UNIX=y
@@ -519,7 +542,6 @@ CONFIG_DEFAULT_CUBIC=y
519# CONFIG_DEFAULT_RENO is not set 542# CONFIG_DEFAULT_RENO is not set
520CONFIG_DEFAULT_TCP_CONG="cubic" 543CONFIG_DEFAULT_TCP_CONG="cubic"
521CONFIG_TCP_MD5SIG=y 544CONFIG_TCP_MD5SIG=y
522# CONFIG_IP_VS is not set
523CONFIG_IPV6=y 545CONFIG_IPV6=y
524# CONFIG_IPV6_PRIVACY is not set 546# CONFIG_IPV6_PRIVACY is not set
525# CONFIG_IPV6_ROUTER_PREF is not set 547# CONFIG_IPV6_ROUTER_PREF is not set
@@ -557,19 +579,21 @@ CONFIG_NF_CONNTRACK_IRC=y
557CONFIG_NF_CONNTRACK_SIP=y 579CONFIG_NF_CONNTRACK_SIP=y
558CONFIG_NF_CT_NETLINK=y 580CONFIG_NF_CT_NETLINK=y
559CONFIG_NETFILTER_XTABLES=y 581CONFIG_NETFILTER_XTABLES=y
582CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
560CONFIG_NETFILTER_XT_TARGET_MARK=y 583CONFIG_NETFILTER_XT_TARGET_MARK=y
561CONFIG_NETFILTER_XT_TARGET_NFLOG=y 584CONFIG_NETFILTER_XT_TARGET_NFLOG=y
562CONFIG_NETFILTER_XT_TARGET_SECMARK=y 585CONFIG_NETFILTER_XT_TARGET_SECMARK=y
563CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
564CONFIG_NETFILTER_XT_TARGET_TCPMSS=y 586CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
565CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y 587CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
566CONFIG_NETFILTER_XT_MATCH_MARK=y 588CONFIG_NETFILTER_XT_MATCH_MARK=y
567CONFIG_NETFILTER_XT_MATCH_POLICY=y 589CONFIG_NETFILTER_XT_MATCH_POLICY=y
568CONFIG_NETFILTER_XT_MATCH_STATE=y 590CONFIG_NETFILTER_XT_MATCH_STATE=y
591# CONFIG_IP_VS is not set
569 592
570# 593#
571# IP: Netfilter Configuration 594# IP: Netfilter Configuration
572# 595#
596CONFIG_NF_DEFRAG_IPV4=y
573CONFIG_NF_CONNTRACK_IPV4=y 597CONFIG_NF_CONNTRACK_IPV4=y
574CONFIG_NF_CONNTRACK_PROC_COMPAT=y 598CONFIG_NF_CONNTRACK_PROC_COMPAT=y
575CONFIG_IP_NF_IPTABLES=y 599CONFIG_IP_NF_IPTABLES=y
@@ -595,8 +619,8 @@ CONFIG_IP_NF_MANGLE=y
595CONFIG_NF_CONNTRACK_IPV6=y 619CONFIG_NF_CONNTRACK_IPV6=y
596CONFIG_IP6_NF_IPTABLES=y 620CONFIG_IP6_NF_IPTABLES=y
597CONFIG_IP6_NF_MATCH_IPV6HEADER=y 621CONFIG_IP6_NF_MATCH_IPV6HEADER=y
598CONFIG_IP6_NF_FILTER=y
599CONFIG_IP6_NF_TARGET_LOG=y 622CONFIG_IP6_NF_TARGET_LOG=y
623CONFIG_IP6_NF_FILTER=y
600CONFIG_IP6_NF_TARGET_REJECT=y 624CONFIG_IP6_NF_TARGET_REJECT=y
601CONFIG_IP6_NF_MANGLE=y 625CONFIG_IP6_NF_MANGLE=y
602# CONFIG_IP_DCCP is not set 626# CONFIG_IP_DCCP is not set
@@ -604,6 +628,7 @@ CONFIG_IP6_NF_MANGLE=y
604# CONFIG_TIPC is not set 628# CONFIG_TIPC is not set
605# CONFIG_ATM is not set 629# CONFIG_ATM is not set
606# CONFIG_BRIDGE is not set 630# CONFIG_BRIDGE is not set
631# CONFIG_NET_DSA is not set
607# CONFIG_VLAN_8021Q is not set 632# CONFIG_VLAN_8021Q is not set
608# CONFIG_DECNET is not set 633# CONFIG_DECNET is not set
609CONFIG_LLC=y 634CONFIG_LLC=y
@@ -623,6 +648,7 @@ CONFIG_NET_SCHED=y
623# CONFIG_NET_SCH_HTB is not set 648# CONFIG_NET_SCH_HTB is not set
624# CONFIG_NET_SCH_HFSC is not set 649# CONFIG_NET_SCH_HFSC is not set
625# CONFIG_NET_SCH_PRIO is not set 650# CONFIG_NET_SCH_PRIO is not set
651# CONFIG_NET_SCH_MULTIQ is not set
626# CONFIG_NET_SCH_RED is not set 652# CONFIG_NET_SCH_RED is not set
627# CONFIG_NET_SCH_SFQ is not set 653# CONFIG_NET_SCH_SFQ is not set
628# CONFIG_NET_SCH_TEQL is not set 654# CONFIG_NET_SCH_TEQL is not set
@@ -630,6 +656,7 @@ CONFIG_NET_SCHED=y
630# CONFIG_NET_SCH_GRED is not set 656# CONFIG_NET_SCH_GRED is not set
631# CONFIG_NET_SCH_DSMARK is not set 657# CONFIG_NET_SCH_DSMARK is not set
632# CONFIG_NET_SCH_NETEM is not set 658# CONFIG_NET_SCH_NETEM is not set
659# CONFIG_NET_SCH_DRR is not set
633# CONFIG_NET_SCH_INGRESS is not set 660# CONFIG_NET_SCH_INGRESS is not set
634 661
635# 662#
@@ -644,6 +671,7 @@ CONFIG_NET_CLS=y
644# CONFIG_NET_CLS_RSVP is not set 671# CONFIG_NET_CLS_RSVP is not set
645# CONFIG_NET_CLS_RSVP6 is not set 672# CONFIG_NET_CLS_RSVP6 is not set
646# CONFIG_NET_CLS_FLOW is not set 673# CONFIG_NET_CLS_FLOW is not set
674# CONFIG_NET_CLS_CGROUP is not set
647CONFIG_NET_EMATCH=y 675CONFIG_NET_EMATCH=y
648CONFIG_NET_EMATCH_STACK=32 676CONFIG_NET_EMATCH_STACK=32
649# CONFIG_NET_EMATCH_CMP is not set 677# CONFIG_NET_EMATCH_CMP is not set
@@ -659,7 +687,9 @@ CONFIG_NET_CLS_ACT=y
659# CONFIG_NET_ACT_NAT is not set 687# CONFIG_NET_ACT_NAT is not set
660# CONFIG_NET_ACT_PEDIT is not set 688# CONFIG_NET_ACT_PEDIT is not set
661# CONFIG_NET_ACT_SIMP is not set 689# CONFIG_NET_ACT_SIMP is not set
690# CONFIG_NET_ACT_SKBEDIT is not set
662CONFIG_NET_SCH_FIFO=y 691CONFIG_NET_SCH_FIFO=y
692# CONFIG_DCB is not set
663 693
664# 694#
665# Network testing 695# Network testing
@@ -676,29 +706,33 @@ CONFIG_HAMRADIO=y
676# CONFIG_IRDA is not set 706# CONFIG_IRDA is not set
677# CONFIG_BT is not set 707# CONFIG_BT is not set
678# CONFIG_AF_RXRPC is not set 708# CONFIG_AF_RXRPC is not set
709# CONFIG_PHONET is not set
679CONFIG_FIB_RULES=y 710CONFIG_FIB_RULES=y
680 711CONFIG_WIRELESS=y
681#
682# Wireless
683#
684CONFIG_CFG80211=y 712CONFIG_CFG80211=y
713# CONFIG_CFG80211_REG_DEBUG is not set
685CONFIG_NL80211=y 714CONFIG_NL80211=y
715CONFIG_WIRELESS_OLD_REGULATORY=y
686CONFIG_WIRELESS_EXT=y 716CONFIG_WIRELESS_EXT=y
687CONFIG_WIRELESS_EXT_SYSFS=y 717CONFIG_WIRELESS_EXT_SYSFS=y
718# CONFIG_LIB80211 is not set
688CONFIG_MAC80211=y 719CONFIG_MAC80211=y
689 720
690# 721#
691# Rate control algorithm selection 722# Rate control algorithm selection
692# 723#
693CONFIG_MAC80211_RC_PID=y 724CONFIG_MAC80211_RC_MINSTREL=y
694CONFIG_MAC80211_RC_DEFAULT_PID=y 725# CONFIG_MAC80211_RC_DEFAULT_PID is not set
695CONFIG_MAC80211_RC_DEFAULT="pid" 726CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y
727CONFIG_MAC80211_RC_DEFAULT="minstrel"
696# CONFIG_MAC80211_MESH is not set 728# CONFIG_MAC80211_MESH is not set
697CONFIG_MAC80211_LEDS=y 729CONFIG_MAC80211_LEDS=y
698# CONFIG_MAC80211_DEBUGFS is not set 730# CONFIG_MAC80211_DEBUGFS is not set
699# CONFIG_MAC80211_DEBUG_MENU is not set 731# CONFIG_MAC80211_DEBUG_MENU is not set
700# CONFIG_IEEE80211 is not set 732# CONFIG_WIMAX is not set
701# CONFIG_RFKILL is not set 733CONFIG_RFKILL=y
734# CONFIG_RFKILL_INPUT is not set
735CONFIG_RFKILL_LEDS=y
702# CONFIG_NET_9P is not set 736# CONFIG_NET_9P is not set
703 737
704# 738#
@@ -722,7 +756,7 @@ CONFIG_PROC_EVENTS=y
722# CONFIG_MTD is not set 756# CONFIG_MTD is not set
723# CONFIG_PARPORT is not set 757# CONFIG_PARPORT is not set
724CONFIG_PNP=y 758CONFIG_PNP=y
725# CONFIG_PNP_DEBUG is not set 759CONFIG_PNP_DEBUG_MESSAGES=y
726 760
727# 761#
728# Protocols 762# Protocols
@@ -750,20 +784,19 @@ CONFIG_BLK_DEV_RAM_SIZE=16384
750CONFIG_MISC_DEVICES=y 784CONFIG_MISC_DEVICES=y
751# CONFIG_IBM_ASM is not set 785# CONFIG_IBM_ASM is not set
752# CONFIG_PHANTOM is not set 786# CONFIG_PHANTOM is not set
753# CONFIG_EEPROM_93CX6 is not set
754# CONFIG_SGI_IOC4 is not set 787# CONFIG_SGI_IOC4 is not set
755# CONFIG_TIFM_CORE is not set 788# CONFIG_TIFM_CORE is not set
756# CONFIG_ACER_WMI is not set 789# CONFIG_ICS932S401 is not set
757# CONFIG_ASUS_LAPTOP is not set
758# CONFIG_FUJITSU_LAPTOP is not set
759# CONFIG_TC1100_WMI is not set
760# CONFIG_MSI_LAPTOP is not set
761# CONFIG_COMPAL_LAPTOP is not set
762# CONFIG_SONY_LAPTOP is not set
763# CONFIG_THINKPAD_ACPI is not set
764# CONFIG_INTEL_MENLOW is not set
765# CONFIG_ENCLOSURE_SERVICES is not set 790# CONFIG_ENCLOSURE_SERVICES is not set
766# CONFIG_HP_ILO is not set 791# CONFIG_HP_ILO is not set
792# CONFIG_C2PORT is not set
793
794#
795# EEPROM support
796#
797# CONFIG_EEPROM_AT24 is not set
798# CONFIG_EEPROM_LEGACY is not set
799# CONFIG_EEPROM_93CX6 is not set
767CONFIG_HAVE_IDE=y 800CONFIG_HAVE_IDE=y
768# CONFIG_IDE is not set 801# CONFIG_IDE is not set
769 802
@@ -802,7 +835,7 @@ CONFIG_SCSI_WAIT_SCAN=m
802# 835#
803CONFIG_SCSI_SPI_ATTRS=y 836CONFIG_SCSI_SPI_ATTRS=y
804# CONFIG_SCSI_FC_ATTRS is not set 837# CONFIG_SCSI_FC_ATTRS is not set
805CONFIG_SCSI_ISCSI_ATTRS=y 838# CONFIG_SCSI_ISCSI_ATTRS is not set
806# CONFIG_SCSI_SAS_ATTRS is not set 839# CONFIG_SCSI_SAS_ATTRS is not set
807# CONFIG_SCSI_SAS_LIBSAS is not set 840# CONFIG_SCSI_SAS_LIBSAS is not set
808# CONFIG_SCSI_SRP_ATTRS is not set 841# CONFIG_SCSI_SRP_ATTRS is not set
@@ -875,6 +908,7 @@ CONFIG_PATA_OLDPIIX=y
875CONFIG_PATA_SCH=y 908CONFIG_PATA_SCH=y
876CONFIG_MD=y 909CONFIG_MD=y
877CONFIG_BLK_DEV_MD=y 910CONFIG_BLK_DEV_MD=y
911CONFIG_MD_AUTODETECT=y
878# CONFIG_MD_LINEAR is not set 912# CONFIG_MD_LINEAR is not set
879# CONFIG_MD_RAID0 is not set 913# CONFIG_MD_RAID0 is not set
880# CONFIG_MD_RAID1 is not set 914# CONFIG_MD_RAID1 is not set
@@ -930,6 +964,9 @@ CONFIG_PHYLIB=y
930# CONFIG_BROADCOM_PHY is not set 964# CONFIG_BROADCOM_PHY is not set
931# CONFIG_ICPLUS_PHY is not set 965# CONFIG_ICPLUS_PHY is not set
932# CONFIG_REALTEK_PHY is not set 966# CONFIG_REALTEK_PHY is not set
967# CONFIG_NATIONAL_PHY is not set
968# CONFIG_STE10XP is not set
969# CONFIG_LSI_ET1011C_PHY is not set
933# CONFIG_FIXED_PHY is not set 970# CONFIG_FIXED_PHY is not set
934# CONFIG_MDIO_BITBANG is not set 971# CONFIG_MDIO_BITBANG is not set
935CONFIG_NET_ETHERNET=y 972CONFIG_NET_ETHERNET=y
@@ -953,6 +990,9 @@ CONFIG_NET_TULIP=y
953# CONFIG_IBM_NEW_EMAC_RGMII is not set 990# CONFIG_IBM_NEW_EMAC_RGMII is not set
954# CONFIG_IBM_NEW_EMAC_TAH is not set 991# CONFIG_IBM_NEW_EMAC_TAH is not set
955# CONFIG_IBM_NEW_EMAC_EMAC4 is not set 992# CONFIG_IBM_NEW_EMAC_EMAC4 is not set
993# CONFIG_IBM_NEW_EMAC_NO_FLOW_CTRL is not set
994# CONFIG_IBM_NEW_EMAC_MAL_CLR_ICINTSTAT is not set
995# CONFIG_IBM_NEW_EMAC_MAL_COMMON_ERR is not set
956CONFIG_NET_PCI=y 996CONFIG_NET_PCI=y
957# CONFIG_PCNET32 is not set 997# CONFIG_PCNET32 is not set
958# CONFIG_AMD8111_ETH is not set 998# CONFIG_AMD8111_ETH is not set
@@ -960,7 +1000,6 @@ CONFIG_NET_PCI=y
960# CONFIG_B44 is not set 1000# CONFIG_B44 is not set
961CONFIG_FORCEDETH=y 1001CONFIG_FORCEDETH=y
962# CONFIG_FORCEDETH_NAPI is not set 1002# CONFIG_FORCEDETH_NAPI is not set
963# CONFIG_EEPRO100 is not set
964CONFIG_E100=y 1003CONFIG_E100=y
965# CONFIG_FEALNX is not set 1004# CONFIG_FEALNX is not set
966# CONFIG_NATSEMI is not set 1005# CONFIG_NATSEMI is not set
@@ -974,15 +1013,16 @@ CONFIG_8139TOO=y
974# CONFIG_R6040 is not set 1013# CONFIG_R6040 is not set
975# CONFIG_SIS900 is not set 1014# CONFIG_SIS900 is not set
976# CONFIG_EPIC100 is not set 1015# CONFIG_EPIC100 is not set
1016# CONFIG_SMSC9420 is not set
977# CONFIG_SUNDANCE is not set 1017# CONFIG_SUNDANCE is not set
978# CONFIG_TLAN is not set 1018# CONFIG_TLAN is not set
979# CONFIG_VIA_RHINE is not set 1019# CONFIG_VIA_RHINE is not set
980# CONFIG_SC92031 is not set 1020# CONFIG_SC92031 is not set
1021# CONFIG_ATL2 is not set
981CONFIG_NETDEV_1000=y 1022CONFIG_NETDEV_1000=y
982# CONFIG_ACENIC is not set 1023# CONFIG_ACENIC is not set
983# CONFIG_DL2K is not set 1024# CONFIG_DL2K is not set
984CONFIG_E1000=y 1025CONFIG_E1000=y
985# CONFIG_E1000_DISABLE_PACKET_SPLIT is not set
986CONFIG_E1000E=y 1026CONFIG_E1000E=y
987# CONFIG_IP1000 is not set 1027# CONFIG_IP1000 is not set
988# CONFIG_IGB is not set 1028# CONFIG_IGB is not set
@@ -1000,18 +1040,23 @@ CONFIG_BNX2=y
1000# CONFIG_QLA3XXX is not set 1040# CONFIG_QLA3XXX is not set
1001# CONFIG_ATL1 is not set 1041# CONFIG_ATL1 is not set
1002# CONFIG_ATL1E is not set 1042# CONFIG_ATL1E is not set
1043# CONFIG_JME is not set
1003CONFIG_NETDEV_10000=y 1044CONFIG_NETDEV_10000=y
1004# CONFIG_CHELSIO_T1 is not set 1045# CONFIG_CHELSIO_T1 is not set
1046CONFIG_CHELSIO_T3_DEPENDS=y
1005# CONFIG_CHELSIO_T3 is not set 1047# CONFIG_CHELSIO_T3 is not set
1048# CONFIG_ENIC is not set
1006# CONFIG_IXGBE is not set 1049# CONFIG_IXGBE is not set
1007# CONFIG_IXGB is not set 1050# CONFIG_IXGB is not set
1008# CONFIG_S2IO is not set 1051# CONFIG_S2IO is not set
1009# CONFIG_MYRI10GE is not set 1052# CONFIG_MYRI10GE is not set
1010# CONFIG_NETXEN_NIC is not set 1053# CONFIG_NETXEN_NIC is not set
1011# CONFIG_NIU is not set 1054# CONFIG_NIU is not set
1055# CONFIG_MLX4_EN is not set
1012# CONFIG_MLX4_CORE is not set 1056# CONFIG_MLX4_CORE is not set
1013# CONFIG_TEHUTI is not set 1057# CONFIG_TEHUTI is not set
1014# CONFIG_BNX2X is not set 1058# CONFIG_BNX2X is not set
1059# CONFIG_QLGE is not set
1015# CONFIG_SFC is not set 1060# CONFIG_SFC is not set
1016CONFIG_TR=y 1061CONFIG_TR=y
1017# CONFIG_IBMOL is not set 1062# CONFIG_IBMOL is not set
@@ -1025,9 +1070,8 @@ CONFIG_TR=y
1025# CONFIG_WLAN_PRE80211 is not set 1070# CONFIG_WLAN_PRE80211 is not set
1026CONFIG_WLAN_80211=y 1071CONFIG_WLAN_80211=y
1027# CONFIG_PCMCIA_RAYCS is not set 1072# CONFIG_PCMCIA_RAYCS is not set
1028# CONFIG_IPW2100 is not set
1029# CONFIG_IPW2200 is not set
1030# CONFIG_LIBERTAS is not set 1073# CONFIG_LIBERTAS is not set
1074# CONFIG_LIBERTAS_THINFIRM is not set
1031# CONFIG_AIRO is not set 1075# CONFIG_AIRO is not set
1032# CONFIG_HERMES is not set 1076# CONFIG_HERMES is not set
1033# CONFIG_ATMEL is not set 1077# CONFIG_ATMEL is not set
@@ -1044,6 +1088,8 @@ CONFIG_WLAN_80211=y
1044CONFIG_ATH5K=y 1088CONFIG_ATH5K=y
1045# CONFIG_ATH5K_DEBUG is not set 1089# CONFIG_ATH5K_DEBUG is not set
1046# CONFIG_ATH9K is not set 1090# CONFIG_ATH9K is not set
1091# CONFIG_IPW2100 is not set
1092# CONFIG_IPW2200 is not set
1047# CONFIG_IWLCORE is not set 1093# CONFIG_IWLCORE is not set
1048# CONFIG_IWLWIFI_LEDS is not set 1094# CONFIG_IWLWIFI_LEDS is not set
1049# CONFIG_IWLAGN is not set 1095# CONFIG_IWLAGN is not set
@@ -1055,6 +1101,10 @@ CONFIG_ATH5K=y
1055# CONFIG_RT2X00 is not set 1101# CONFIG_RT2X00 is not set
1056 1102
1057# 1103#
1104# Enable WiMAX (Networking options) to see the WiMAX drivers
1105#
1106
1107#
1058# USB Network Adapters 1108# USB Network Adapters
1059# 1109#
1060# CONFIG_USB_CATC is not set 1110# CONFIG_USB_CATC is not set
@@ -1062,6 +1112,7 @@ CONFIG_ATH5K=y
1062# CONFIG_USB_PEGASUS is not set 1112# CONFIG_USB_PEGASUS is not set
1063# CONFIG_USB_RTL8150 is not set 1113# CONFIG_USB_RTL8150 is not set
1064# CONFIG_USB_USBNET is not set 1114# CONFIG_USB_USBNET is not set
1115# CONFIG_USB_HSO is not set
1065CONFIG_NET_PCMCIA=y 1116CONFIG_NET_PCMCIA=y
1066# CONFIG_PCMCIA_3C589 is not set 1117# CONFIG_PCMCIA_3C589 is not set
1067# CONFIG_PCMCIA_3C574 is not set 1118# CONFIG_PCMCIA_3C574 is not set
@@ -1123,6 +1174,7 @@ CONFIG_MOUSE_PS2_LOGIPS2PP=y
1123CONFIG_MOUSE_PS2_SYNAPTICS=y 1174CONFIG_MOUSE_PS2_SYNAPTICS=y
1124CONFIG_MOUSE_PS2_LIFEBOOK=y 1175CONFIG_MOUSE_PS2_LIFEBOOK=y
1125CONFIG_MOUSE_PS2_TRACKPOINT=y 1176CONFIG_MOUSE_PS2_TRACKPOINT=y
1177# CONFIG_MOUSE_PS2_ELANTECH is not set
1126# CONFIG_MOUSE_PS2_TOUCHKIT is not set 1178# CONFIG_MOUSE_PS2_TOUCHKIT is not set
1127# CONFIG_MOUSE_SERIAL is not set 1179# CONFIG_MOUSE_SERIAL is not set
1128# CONFIG_MOUSE_APPLETOUCH is not set 1180# CONFIG_MOUSE_APPLETOUCH is not set
@@ -1160,15 +1212,16 @@ CONFIG_INPUT_TOUCHSCREEN=y
1160# CONFIG_TOUCHSCREEN_FUJITSU is not set 1212# CONFIG_TOUCHSCREEN_FUJITSU is not set
1161# CONFIG_TOUCHSCREEN_GUNZE is not set 1213# CONFIG_TOUCHSCREEN_GUNZE is not set
1162# CONFIG_TOUCHSCREEN_ELO is not set 1214# CONFIG_TOUCHSCREEN_ELO is not set
1215# CONFIG_TOUCHSCREEN_WACOM_W8001 is not set
1163# CONFIG_TOUCHSCREEN_MTOUCH is not set 1216# CONFIG_TOUCHSCREEN_MTOUCH is not set
1164# CONFIG_TOUCHSCREEN_INEXIO is not set 1217# CONFIG_TOUCHSCREEN_INEXIO is not set
1165# CONFIG_TOUCHSCREEN_MK712 is not set 1218# CONFIG_TOUCHSCREEN_MK712 is not set
1166# CONFIG_TOUCHSCREEN_PENMOUNT is not set 1219# CONFIG_TOUCHSCREEN_PENMOUNT is not set
1167# CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set 1220# CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set
1168# CONFIG_TOUCHSCREEN_TOUCHWIN is not set 1221# CONFIG_TOUCHSCREEN_TOUCHWIN is not set
1169# CONFIG_TOUCHSCREEN_UCB1400 is not set
1170# CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set 1222# CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set
1171# CONFIG_TOUCHSCREEN_TOUCHIT213 is not set 1223# CONFIG_TOUCHSCREEN_TOUCHIT213 is not set
1224# CONFIG_TOUCHSCREEN_TSC2007 is not set
1172CONFIG_INPUT_MISC=y 1225CONFIG_INPUT_MISC=y
1173# CONFIG_INPUT_PCSPKR is not set 1226# CONFIG_INPUT_PCSPKR is not set
1174# CONFIG_INPUT_APANEL is not set 1227# CONFIG_INPUT_APANEL is not set
@@ -1179,6 +1232,7 @@ CONFIG_INPUT_MISC=y
1179# CONFIG_INPUT_KEYSPAN_REMOTE is not set 1232# CONFIG_INPUT_KEYSPAN_REMOTE is not set
1180# CONFIG_INPUT_POWERMATE is not set 1233# CONFIG_INPUT_POWERMATE is not set
1181# CONFIG_INPUT_YEALINK is not set 1234# CONFIG_INPUT_YEALINK is not set
1235# CONFIG_INPUT_CM109 is not set
1182# CONFIG_INPUT_UINPUT is not set 1236# CONFIG_INPUT_UINPUT is not set
1183 1237
1184# 1238#
@@ -1245,6 +1299,7 @@ CONFIG_SERIAL_CORE=y
1245CONFIG_SERIAL_CORE_CONSOLE=y 1299CONFIG_SERIAL_CORE_CONSOLE=y
1246# CONFIG_SERIAL_JSM is not set 1300# CONFIG_SERIAL_JSM is not set
1247CONFIG_UNIX98_PTYS=y 1301CONFIG_UNIX98_PTYS=y
1302# CONFIG_DEVPTS_MULTIPLE_INSTANCES is not set
1248# CONFIG_LEGACY_PTYS is not set 1303# CONFIG_LEGACY_PTYS is not set
1249# CONFIG_IPMI_HANDLER is not set 1304# CONFIG_IPMI_HANDLER is not set
1250CONFIG_HW_RANDOM=y 1305CONFIG_HW_RANDOM=y
@@ -1279,6 +1334,7 @@ CONFIG_I2C=y
1279CONFIG_I2C_BOARDINFO=y 1334CONFIG_I2C_BOARDINFO=y
1280# CONFIG_I2C_CHARDEV is not set 1335# CONFIG_I2C_CHARDEV is not set
1281CONFIG_I2C_HELPER_AUTO=y 1336CONFIG_I2C_HELPER_AUTO=y
1337CONFIG_I2C_ALGOBIT=y
1282 1338
1283# 1339#
1284# I2C Hardware Bus support 1340# I2C Hardware Bus support
@@ -1331,8 +1387,6 @@ CONFIG_I2C_I801=y
1331# Miscellaneous I2C Chip support 1387# Miscellaneous I2C Chip support
1332# 1388#
1333# CONFIG_DS1682 is not set 1389# CONFIG_DS1682 is not set
1334# CONFIG_EEPROM_AT24 is not set
1335# CONFIG_EEPROM_LEGACY is not set
1336# CONFIG_SENSORS_PCF8574 is not set 1390# CONFIG_SENSORS_PCF8574 is not set
1337# CONFIG_PCF8575 is not set 1391# CONFIG_PCF8575 is not set
1338# CONFIG_SENSORS_PCA9539 is not set 1392# CONFIG_SENSORS_PCA9539 is not set
@@ -1351,8 +1405,78 @@ CONFIG_POWER_SUPPLY=y
1351# CONFIG_POWER_SUPPLY_DEBUG is not set 1405# CONFIG_POWER_SUPPLY_DEBUG is not set
1352# CONFIG_PDA_POWER is not set 1406# CONFIG_PDA_POWER is not set
1353# CONFIG_BATTERY_DS2760 is not set 1407# CONFIG_BATTERY_DS2760 is not set
1354# CONFIG_HWMON is not set 1408# CONFIG_BATTERY_BQ27x00 is not set
1409CONFIG_HWMON=y
1410# CONFIG_HWMON_VID is not set
1411# CONFIG_SENSORS_ABITUGURU is not set
1412# CONFIG_SENSORS_ABITUGURU3 is not set
1413# CONFIG_SENSORS_AD7414 is not set
1414# CONFIG_SENSORS_AD7418 is not set
1415# CONFIG_SENSORS_ADM1021 is not set
1416# CONFIG_SENSORS_ADM1025 is not set
1417# CONFIG_SENSORS_ADM1026 is not set
1418# CONFIG_SENSORS_ADM1029 is not set
1419# CONFIG_SENSORS_ADM1031 is not set
1420# CONFIG_SENSORS_ADM9240 is not set
1421# CONFIG_SENSORS_ADT7462 is not set
1422# CONFIG_SENSORS_ADT7470 is not set
1423# CONFIG_SENSORS_ADT7473 is not set
1424# CONFIG_SENSORS_ADT7475 is not set
1425# CONFIG_SENSORS_K8TEMP is not set
1426# CONFIG_SENSORS_ASB100 is not set
1427# CONFIG_SENSORS_ATXP1 is not set
1428# CONFIG_SENSORS_DS1621 is not set
1429# CONFIG_SENSORS_I5K_AMB is not set
1430# CONFIG_SENSORS_F71805F is not set
1431# CONFIG_SENSORS_F71882FG is not set
1432# CONFIG_SENSORS_F75375S is not set
1433# CONFIG_SENSORS_FSCHER is not set
1434# CONFIG_SENSORS_FSCPOS is not set
1435# CONFIG_SENSORS_FSCHMD is not set
1436# CONFIG_SENSORS_GL518SM is not set
1437# CONFIG_SENSORS_GL520SM is not set
1438# CONFIG_SENSORS_CORETEMP is not set
1439# CONFIG_SENSORS_IT87 is not set
1440# CONFIG_SENSORS_LM63 is not set
1441# CONFIG_SENSORS_LM75 is not set
1442# CONFIG_SENSORS_LM77 is not set
1443# CONFIG_SENSORS_LM78 is not set
1444# CONFIG_SENSORS_LM80 is not set
1445# CONFIG_SENSORS_LM83 is not set
1446# CONFIG_SENSORS_LM85 is not set
1447# CONFIG_SENSORS_LM87 is not set
1448# CONFIG_SENSORS_LM90 is not set
1449# CONFIG_SENSORS_LM92 is not set
1450# CONFIG_SENSORS_LM93 is not set
1451# CONFIG_SENSORS_LTC4245 is not set
1452# CONFIG_SENSORS_MAX1619 is not set
1453# CONFIG_SENSORS_MAX6650 is not set
1454# CONFIG_SENSORS_PC87360 is not set
1455# CONFIG_SENSORS_PC87427 is not set
1456# CONFIG_SENSORS_SIS5595 is not set
1457# CONFIG_SENSORS_DME1737 is not set
1458# CONFIG_SENSORS_SMSC47M1 is not set
1459# CONFIG_SENSORS_SMSC47M192 is not set
1460# CONFIG_SENSORS_SMSC47B397 is not set
1461# CONFIG_SENSORS_ADS7828 is not set
1462# CONFIG_SENSORS_THMC50 is not set
1463# CONFIG_SENSORS_VIA686A is not set
1464# CONFIG_SENSORS_VT1211 is not set
1465# CONFIG_SENSORS_VT8231 is not set
1466# CONFIG_SENSORS_W83781D is not set
1467# CONFIG_SENSORS_W83791D is not set
1468# CONFIG_SENSORS_W83792D is not set
1469# CONFIG_SENSORS_W83793 is not set
1470# CONFIG_SENSORS_W83L785TS is not set
1471# CONFIG_SENSORS_W83L786NG is not set
1472# CONFIG_SENSORS_W83627HF is not set
1473# CONFIG_SENSORS_W83627EHF is not set
1474# CONFIG_SENSORS_HDAPS is not set
1475# CONFIG_SENSORS_LIS3LV02D is not set
1476# CONFIG_SENSORS_APPLESMC is not set
1477# CONFIG_HWMON_DEBUG_CHIP is not set
1355CONFIG_THERMAL=y 1478CONFIG_THERMAL=y
1479# CONFIG_THERMAL_HWMON is not set
1356CONFIG_WATCHDOG=y 1480CONFIG_WATCHDOG=y
1357# CONFIG_WATCHDOG_NOWAYOUT is not set 1481# CONFIG_WATCHDOG_NOWAYOUT is not set
1358 1482
@@ -1372,6 +1496,7 @@ CONFIG_WATCHDOG=y
1372# CONFIG_I6300ESB_WDT is not set 1496# CONFIG_I6300ESB_WDT is not set
1373# CONFIG_ITCO_WDT is not set 1497# CONFIG_ITCO_WDT is not set
1374# CONFIG_IT8712F_WDT is not set 1498# CONFIG_IT8712F_WDT is not set
1499# CONFIG_IT87_WDT is not set
1375# CONFIG_HP_WATCHDOG is not set 1500# CONFIG_HP_WATCHDOG is not set
1376# CONFIG_SC1200_WDT is not set 1501# CONFIG_SC1200_WDT is not set
1377# CONFIG_PC87413_WDT is not set 1502# CONFIG_PC87413_WDT is not set
@@ -1379,9 +1504,11 @@ CONFIG_WATCHDOG=y
1379# CONFIG_SBC8360_WDT is not set 1504# CONFIG_SBC8360_WDT is not set
1380# CONFIG_SBC7240_WDT is not set 1505# CONFIG_SBC7240_WDT is not set
1381# CONFIG_CPU5_WDT is not set 1506# CONFIG_CPU5_WDT is not set
1507# CONFIG_SMSC_SCH311X_WDT is not set
1382# CONFIG_SMSC37B787_WDT is not set 1508# CONFIG_SMSC37B787_WDT is not set
1383# CONFIG_W83627HF_WDT is not set 1509# CONFIG_W83627HF_WDT is not set
1384# CONFIG_W83697HF_WDT is not set 1510# CONFIG_W83697HF_WDT is not set
1511# CONFIG_W83697UG_WDT is not set
1385# CONFIG_W83877F_WDT is not set 1512# CONFIG_W83877F_WDT is not set
1386# CONFIG_W83977F_WDT is not set 1513# CONFIG_W83977F_WDT is not set
1387# CONFIG_MACHZ_WDT is not set 1514# CONFIG_MACHZ_WDT is not set
@@ -1397,11 +1524,11 @@ CONFIG_WATCHDOG=y
1397# USB-based Watchdog Cards 1524# USB-based Watchdog Cards
1398# 1525#
1399# CONFIG_USBPCWATCHDOG is not set 1526# CONFIG_USBPCWATCHDOG is not set
1527CONFIG_SSB_POSSIBLE=y
1400 1528
1401# 1529#
1402# Sonics Silicon Backplane 1530# Sonics Silicon Backplane
1403# 1531#
1404CONFIG_SSB_POSSIBLE=y
1405# CONFIG_SSB is not set 1532# CONFIG_SSB is not set
1406 1533
1407# 1534#
@@ -1410,7 +1537,13 @@ CONFIG_SSB_POSSIBLE=y
1410# CONFIG_MFD_CORE is not set 1537# CONFIG_MFD_CORE is not set
1411# CONFIG_MFD_SM501 is not set 1538# CONFIG_MFD_SM501 is not set
1412# CONFIG_HTC_PASIC3 is not set 1539# CONFIG_HTC_PASIC3 is not set
1540# CONFIG_TWL4030_CORE is not set
1413# CONFIG_MFD_TMIO is not set 1541# CONFIG_MFD_TMIO is not set
1542# CONFIG_PMIC_DA903X is not set
1543# CONFIG_MFD_WM8400 is not set
1544# CONFIG_MFD_WM8350_I2C is not set
1545# CONFIG_MFD_PCF50633 is not set
1546# CONFIG_REGULATOR is not set
1414 1547
1415# 1548#
1416# Multimedia devices 1549# Multimedia devices
@@ -1450,6 +1583,7 @@ CONFIG_DRM=y
1450# CONFIG_DRM_I810 is not set 1583# CONFIG_DRM_I810 is not set
1451# CONFIG_DRM_I830 is not set 1584# CONFIG_DRM_I830 is not set
1452CONFIG_DRM_I915=y 1585CONFIG_DRM_I915=y
1586# CONFIG_DRM_I915_KMS is not set
1453# CONFIG_DRM_MGA is not set 1587# CONFIG_DRM_MGA is not set
1454# CONFIG_DRM_SIS is not set 1588# CONFIG_DRM_SIS is not set
1455# CONFIG_DRM_VIA is not set 1589# CONFIG_DRM_VIA is not set
@@ -1459,6 +1593,7 @@ CONFIG_DRM_I915=y
1459CONFIG_FB=y 1593CONFIG_FB=y
1460# CONFIG_FIRMWARE_EDID is not set 1594# CONFIG_FIRMWARE_EDID is not set
1461# CONFIG_FB_DDC is not set 1595# CONFIG_FB_DDC is not set
1596# CONFIG_FB_BOOT_VESA_SUPPORT is not set
1462CONFIG_FB_CFB_FILLRECT=y 1597CONFIG_FB_CFB_FILLRECT=y
1463CONFIG_FB_CFB_COPYAREA=y 1598CONFIG_FB_CFB_COPYAREA=y
1464CONFIG_FB_CFB_IMAGEBLIT=y 1599CONFIG_FB_CFB_IMAGEBLIT=y
@@ -1487,7 +1622,6 @@ CONFIG_FB_TILEBLITTING=y
1487# CONFIG_FB_UVESA is not set 1622# CONFIG_FB_UVESA is not set
1488# CONFIG_FB_VESA is not set 1623# CONFIG_FB_VESA is not set
1489CONFIG_FB_EFI=y 1624CONFIG_FB_EFI=y
1490# CONFIG_FB_IMAC is not set
1491# CONFIG_FB_N411 is not set 1625# CONFIG_FB_N411 is not set
1492# CONFIG_FB_HGA is not set 1626# CONFIG_FB_HGA is not set
1493# CONFIG_FB_S1D13XXX is not set 1627# CONFIG_FB_S1D13XXX is not set
@@ -1503,6 +1637,7 @@ CONFIG_FB_EFI=y
1503# CONFIG_FB_S3 is not set 1637# CONFIG_FB_S3 is not set
1504# CONFIG_FB_SAVAGE is not set 1638# CONFIG_FB_SAVAGE is not set
1505# CONFIG_FB_SIS is not set 1639# CONFIG_FB_SIS is not set
1640# CONFIG_FB_VIA is not set
1506# CONFIG_FB_NEOMAGIC is not set 1641# CONFIG_FB_NEOMAGIC is not set
1507# CONFIG_FB_KYRO is not set 1642# CONFIG_FB_KYRO is not set
1508# CONFIG_FB_3DFX is not set 1643# CONFIG_FB_3DFX is not set
@@ -1515,12 +1650,15 @@ CONFIG_FB_EFI=y
1515# CONFIG_FB_CARMINE is not set 1650# CONFIG_FB_CARMINE is not set
1516# CONFIG_FB_GEODE is not set 1651# CONFIG_FB_GEODE is not set
1517# CONFIG_FB_VIRTUAL is not set 1652# CONFIG_FB_VIRTUAL is not set
1653# CONFIG_FB_METRONOME is not set
1654# CONFIG_FB_MB862XX is not set
1518CONFIG_BACKLIGHT_LCD_SUPPORT=y 1655CONFIG_BACKLIGHT_LCD_SUPPORT=y
1519# CONFIG_LCD_CLASS_DEVICE is not set 1656# CONFIG_LCD_CLASS_DEVICE is not set
1520CONFIG_BACKLIGHT_CLASS_DEVICE=y 1657CONFIG_BACKLIGHT_CLASS_DEVICE=y
1521# CONFIG_BACKLIGHT_CORGI is not set 1658CONFIG_BACKLIGHT_GENERIC=y
1522# CONFIG_BACKLIGHT_PROGEAR is not set 1659# CONFIG_BACKLIGHT_PROGEAR is not set
1523# CONFIG_BACKLIGHT_MBP_NVIDIA is not set 1660# CONFIG_BACKLIGHT_MBP_NVIDIA is not set
1661# CONFIG_BACKLIGHT_SAHARA is not set
1524 1662
1525# 1663#
1526# Display device support 1664# Display device support
@@ -1540,10 +1678,12 @@ CONFIG_LOGO=y
1540# CONFIG_LOGO_LINUX_VGA16 is not set 1678# CONFIG_LOGO_LINUX_VGA16 is not set
1541CONFIG_LOGO_LINUX_CLUT224=y 1679CONFIG_LOGO_LINUX_CLUT224=y
1542CONFIG_SOUND=y 1680CONFIG_SOUND=y
1681CONFIG_SOUND_OSS_CORE=y
1543CONFIG_SND=y 1682CONFIG_SND=y
1544CONFIG_SND_TIMER=y 1683CONFIG_SND_TIMER=y
1545CONFIG_SND_PCM=y 1684CONFIG_SND_PCM=y
1546CONFIG_SND_HWDEP=y 1685CONFIG_SND_HWDEP=y
1686CONFIG_SND_JACK=y
1547CONFIG_SND_SEQUENCER=y 1687CONFIG_SND_SEQUENCER=y
1548CONFIG_SND_SEQ_DUMMY=y 1688CONFIG_SND_SEQ_DUMMY=y
1549CONFIG_SND_OSSEMUL=y 1689CONFIG_SND_OSSEMUL=y
@@ -1551,6 +1691,8 @@ CONFIG_SND_MIXER_OSS=y
1551CONFIG_SND_PCM_OSS=y 1691CONFIG_SND_PCM_OSS=y
1552CONFIG_SND_PCM_OSS_PLUGINS=y 1692CONFIG_SND_PCM_OSS_PLUGINS=y
1553CONFIG_SND_SEQUENCER_OSS=y 1693CONFIG_SND_SEQUENCER_OSS=y
1694CONFIG_SND_HRTIMER=y
1695CONFIG_SND_SEQ_HRTIMER_DEFAULT=y
1554CONFIG_SND_DYNAMIC_MINORS=y 1696CONFIG_SND_DYNAMIC_MINORS=y
1555CONFIG_SND_SUPPORT_OLD_API=y 1697CONFIG_SND_SUPPORT_OLD_API=y
1556CONFIG_SND_VERBOSE_PROCFS=y 1698CONFIG_SND_VERBOSE_PROCFS=y
@@ -1605,11 +1747,16 @@ CONFIG_SND_PCI=y
1605# CONFIG_SND_FM801 is not set 1747# CONFIG_SND_FM801 is not set
1606CONFIG_SND_HDA_INTEL=y 1748CONFIG_SND_HDA_INTEL=y
1607CONFIG_SND_HDA_HWDEP=y 1749CONFIG_SND_HDA_HWDEP=y
1750# CONFIG_SND_HDA_RECONFIG is not set
1751# CONFIG_SND_HDA_INPUT_BEEP is not set
1608CONFIG_SND_HDA_CODEC_REALTEK=y 1752CONFIG_SND_HDA_CODEC_REALTEK=y
1609CONFIG_SND_HDA_CODEC_ANALOG=y 1753CONFIG_SND_HDA_CODEC_ANALOG=y
1610CONFIG_SND_HDA_CODEC_SIGMATEL=y 1754CONFIG_SND_HDA_CODEC_SIGMATEL=y
1611CONFIG_SND_HDA_CODEC_VIA=y 1755CONFIG_SND_HDA_CODEC_VIA=y
1612CONFIG_SND_HDA_CODEC_ATIHDMI=y 1756CONFIG_SND_HDA_CODEC_ATIHDMI=y
1757CONFIG_SND_HDA_CODEC_NVHDMI=y
1758CONFIG_SND_HDA_CODEC_INTELHDMI=y
1759CONFIG_SND_HDA_ELD=y
1613CONFIG_SND_HDA_CODEC_CONEXANT=y 1760CONFIG_SND_HDA_CODEC_CONEXANT=y
1614CONFIG_SND_HDA_CODEC_CMEDIA=y 1761CONFIG_SND_HDA_CODEC_CMEDIA=y
1615CONFIG_SND_HDA_CODEC_SI3054=y 1762CONFIG_SND_HDA_CODEC_SI3054=y
@@ -1643,6 +1790,7 @@ CONFIG_SND_USB=y
1643# CONFIG_SND_USB_AUDIO is not set 1790# CONFIG_SND_USB_AUDIO is not set
1644# CONFIG_SND_USB_USX2Y is not set 1791# CONFIG_SND_USB_USX2Y is not set
1645# CONFIG_SND_USB_CAIAQ is not set 1792# CONFIG_SND_USB_CAIAQ is not set
1793# CONFIG_SND_USB_US122L is not set
1646CONFIG_SND_PCMCIA=y 1794CONFIG_SND_PCMCIA=y
1647# CONFIG_SND_VXPOCKET is not set 1795# CONFIG_SND_VXPOCKET is not set
1648# CONFIG_SND_PDAUDIOCF is not set 1796# CONFIG_SND_PDAUDIOCF is not set
@@ -1657,15 +1805,37 @@ CONFIG_HIDRAW=y
1657# USB Input Devices 1805# USB Input Devices
1658# 1806#
1659CONFIG_USB_HID=y 1807CONFIG_USB_HID=y
1660CONFIG_USB_HIDINPUT_POWERBOOK=y
1661CONFIG_HID_FF=y
1662CONFIG_HID_PID=y 1808CONFIG_HID_PID=y
1809CONFIG_USB_HIDDEV=y
1810
1811#
1812# Special HID drivers
1813#
1814CONFIG_HID_COMPAT=y
1815CONFIG_HID_A4TECH=y
1816CONFIG_HID_APPLE=y
1817CONFIG_HID_BELKIN=y
1818CONFIG_HID_CHERRY=y
1819CONFIG_HID_CHICONY=y
1820CONFIG_HID_CYPRESS=y
1821CONFIG_HID_EZKEY=y
1822CONFIG_HID_GYRATION=y
1823CONFIG_HID_LOGITECH=y
1663CONFIG_LOGITECH_FF=y 1824CONFIG_LOGITECH_FF=y
1664# CONFIG_LOGIRUMBLEPAD2_FF is not set 1825# CONFIG_LOGIRUMBLEPAD2_FF is not set
1826CONFIG_HID_MICROSOFT=y
1827CONFIG_HID_MONTEREY=y
1828CONFIG_HID_NTRIG=y
1829CONFIG_HID_PANTHERLORD=y
1665CONFIG_PANTHERLORD_FF=y 1830CONFIG_PANTHERLORD_FF=y
1831CONFIG_HID_PETALYNX=y
1832CONFIG_HID_SAMSUNG=y
1833CONFIG_HID_SONY=y
1834CONFIG_HID_SUNPLUS=y
1835# CONFIG_GREENASIA_FF is not set
1836CONFIG_HID_TOPSEED=y
1666CONFIG_THRUSTMASTER_FF=y 1837CONFIG_THRUSTMASTER_FF=y
1667CONFIG_ZEROPLUS_FF=y 1838CONFIG_ZEROPLUS_FF=y
1668CONFIG_USB_HIDDEV=y
1669CONFIG_USB_SUPPORT=y 1839CONFIG_USB_SUPPORT=y
1670CONFIG_USB_ARCH_HAS_HCD=y 1840CONFIG_USB_ARCH_HAS_HCD=y
1671CONFIG_USB_ARCH_HAS_OHCI=y 1841CONFIG_USB_ARCH_HAS_OHCI=y
@@ -1683,6 +1853,8 @@ CONFIG_USB_DEVICEFS=y
1683CONFIG_USB_SUSPEND=y 1853CONFIG_USB_SUSPEND=y
1684# CONFIG_USB_OTG is not set 1854# CONFIG_USB_OTG is not set
1685CONFIG_USB_MON=y 1855CONFIG_USB_MON=y
1856# CONFIG_USB_WUSB is not set
1857# CONFIG_USB_WUSB_CBAF is not set
1686 1858
1687# 1859#
1688# USB Host Controller Drivers 1860# USB Host Controller Drivers
@@ -1691,6 +1863,7 @@ CONFIG_USB_MON=y
1691CONFIG_USB_EHCI_HCD=y 1863CONFIG_USB_EHCI_HCD=y
1692# CONFIG_USB_EHCI_ROOT_HUB_TT is not set 1864# CONFIG_USB_EHCI_ROOT_HUB_TT is not set
1693# CONFIG_USB_EHCI_TT_NEWSCHED is not set 1865# CONFIG_USB_EHCI_TT_NEWSCHED is not set
1866# CONFIG_USB_OXU210HP_HCD is not set
1694# CONFIG_USB_ISP116X_HCD is not set 1867# CONFIG_USB_ISP116X_HCD is not set
1695# CONFIG_USB_ISP1760_HCD is not set 1868# CONFIG_USB_ISP1760_HCD is not set
1696CONFIG_USB_OHCI_HCD=y 1869CONFIG_USB_OHCI_HCD=y
@@ -1700,6 +1873,8 @@ CONFIG_USB_OHCI_LITTLE_ENDIAN=y
1700CONFIG_USB_UHCI_HCD=y 1873CONFIG_USB_UHCI_HCD=y
1701# CONFIG_USB_SL811_HCD is not set 1874# CONFIG_USB_SL811_HCD is not set
1702# CONFIG_USB_R8A66597_HCD is not set 1875# CONFIG_USB_R8A66597_HCD is not set
1876# CONFIG_USB_WHCI_HCD is not set
1877# CONFIG_USB_HWA_HCD is not set
1703 1878
1704# 1879#
1705# USB Device Class drivers 1880# USB Device Class drivers
@@ -1707,20 +1882,20 @@ CONFIG_USB_UHCI_HCD=y
1707# CONFIG_USB_ACM is not set 1882# CONFIG_USB_ACM is not set
1708CONFIG_USB_PRINTER=y 1883CONFIG_USB_PRINTER=y
1709# CONFIG_USB_WDM is not set 1884# CONFIG_USB_WDM is not set
1885# CONFIG_USB_TMC is not set
1710 1886
1711# 1887#
1712# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' 1888# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed;
1713# 1889#
1714 1890
1715# 1891#
1716# may also be needed; see USB_STORAGE Help for more information 1892# see USB_STORAGE Help for more information
1717# 1893#
1718CONFIG_USB_STORAGE=y 1894CONFIG_USB_STORAGE=y
1719# CONFIG_USB_STORAGE_DEBUG is not set 1895# CONFIG_USB_STORAGE_DEBUG is not set
1720# CONFIG_USB_STORAGE_DATAFAB is not set 1896# CONFIG_USB_STORAGE_DATAFAB is not set
1721# CONFIG_USB_STORAGE_FREECOM is not set 1897# CONFIG_USB_STORAGE_FREECOM is not set
1722# CONFIG_USB_STORAGE_ISD200 is not set 1898# CONFIG_USB_STORAGE_ISD200 is not set
1723# CONFIG_USB_STORAGE_DPCM is not set
1724# CONFIG_USB_STORAGE_USBAT is not set 1899# CONFIG_USB_STORAGE_USBAT is not set
1725# CONFIG_USB_STORAGE_SDDR09 is not set 1900# CONFIG_USB_STORAGE_SDDR09 is not set
1726# CONFIG_USB_STORAGE_SDDR55 is not set 1901# CONFIG_USB_STORAGE_SDDR55 is not set
@@ -1728,7 +1903,6 @@ CONFIG_USB_STORAGE=y
1728# CONFIG_USB_STORAGE_ALAUDA is not set 1903# CONFIG_USB_STORAGE_ALAUDA is not set
1729# CONFIG_USB_STORAGE_ONETOUCH is not set 1904# CONFIG_USB_STORAGE_ONETOUCH is not set
1730# CONFIG_USB_STORAGE_KARMA is not set 1905# CONFIG_USB_STORAGE_KARMA is not set
1731# CONFIG_USB_STORAGE_SIERRA is not set
1732# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set 1906# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set
1733CONFIG_USB_LIBUSUAL=y 1907CONFIG_USB_LIBUSUAL=y
1734 1908
@@ -1749,6 +1923,7 @@ CONFIG_USB_LIBUSUAL=y
1749# CONFIG_USB_EMI62 is not set 1923# CONFIG_USB_EMI62 is not set
1750# CONFIG_USB_EMI26 is not set 1924# CONFIG_USB_EMI26 is not set
1751# CONFIG_USB_ADUTUX is not set 1925# CONFIG_USB_ADUTUX is not set
1926# CONFIG_USB_SEVSEG is not set
1752# CONFIG_USB_RIO500 is not set 1927# CONFIG_USB_RIO500 is not set
1753# CONFIG_USB_LEGOTOWER is not set 1928# CONFIG_USB_LEGOTOWER is not set
1754# CONFIG_USB_LCD is not set 1929# CONFIG_USB_LCD is not set
@@ -1766,7 +1941,13 @@ CONFIG_USB_LIBUSUAL=y
1766# CONFIG_USB_IOWARRIOR is not set 1941# CONFIG_USB_IOWARRIOR is not set
1767# CONFIG_USB_TEST is not set 1942# CONFIG_USB_TEST is not set
1768# CONFIG_USB_ISIGHTFW is not set 1943# CONFIG_USB_ISIGHTFW is not set
1944# CONFIG_USB_VST is not set
1769# CONFIG_USB_GADGET is not set 1945# CONFIG_USB_GADGET is not set
1946
1947#
1948# OTG and related infrastructure
1949#
1950# CONFIG_UWB is not set
1770# CONFIG_MMC is not set 1951# CONFIG_MMC is not set
1771# CONFIG_MEMSTICK is not set 1952# CONFIG_MEMSTICK is not set
1772CONFIG_NEW_LEDS=y 1953CONFIG_NEW_LEDS=y
@@ -1775,6 +1956,7 @@ CONFIG_LEDS_CLASS=y
1775# 1956#
1776# LED drivers 1957# LED drivers
1777# 1958#
1959# CONFIG_LEDS_ALIX2 is not set
1778# CONFIG_LEDS_PCA9532 is not set 1960# CONFIG_LEDS_PCA9532 is not set
1779# CONFIG_LEDS_CLEVO_MAIL is not set 1961# CONFIG_LEDS_CLEVO_MAIL is not set
1780# CONFIG_LEDS_PCA955X is not set 1962# CONFIG_LEDS_PCA955X is not set
@@ -1785,6 +1967,7 @@ CONFIG_LEDS_CLASS=y
1785CONFIG_LEDS_TRIGGERS=y 1967CONFIG_LEDS_TRIGGERS=y
1786# CONFIG_LEDS_TRIGGER_TIMER is not set 1968# CONFIG_LEDS_TRIGGER_TIMER is not set
1787# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set 1969# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set
1970# CONFIG_LEDS_TRIGGER_BACKLIGHT is not set
1788# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set 1971# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set
1789# CONFIG_ACCESSIBILITY is not set 1972# CONFIG_ACCESSIBILITY is not set
1790# CONFIG_INFINIBAND is not set 1973# CONFIG_INFINIBAND is not set
@@ -1824,6 +2007,7 @@ CONFIG_RTC_INTF_DEV=y
1824# CONFIG_RTC_DRV_M41T80 is not set 2007# CONFIG_RTC_DRV_M41T80 is not set
1825# CONFIG_RTC_DRV_S35390A is not set 2008# CONFIG_RTC_DRV_S35390A is not set
1826# CONFIG_RTC_DRV_FM3130 is not set 2009# CONFIG_RTC_DRV_FM3130 is not set
2010# CONFIG_RTC_DRV_RX8581 is not set
1827 2011
1828# 2012#
1829# SPI RTC drivers 2013# SPI RTC drivers
@@ -1833,12 +2017,15 @@ CONFIG_RTC_INTF_DEV=y
1833# Platform RTC drivers 2017# Platform RTC drivers
1834# 2018#
1835CONFIG_RTC_DRV_CMOS=y 2019CONFIG_RTC_DRV_CMOS=y
2020# CONFIG_RTC_DRV_DS1286 is not set
1836# CONFIG_RTC_DRV_DS1511 is not set 2021# CONFIG_RTC_DRV_DS1511 is not set
1837# CONFIG_RTC_DRV_DS1553 is not set 2022# CONFIG_RTC_DRV_DS1553 is not set
1838# CONFIG_RTC_DRV_DS1742 is not set 2023# CONFIG_RTC_DRV_DS1742 is not set
1839# CONFIG_RTC_DRV_STK17TA8 is not set 2024# CONFIG_RTC_DRV_STK17TA8 is not set
1840# CONFIG_RTC_DRV_M48T86 is not set 2025# CONFIG_RTC_DRV_M48T86 is not set
2026# CONFIG_RTC_DRV_M48T35 is not set
1841# CONFIG_RTC_DRV_M48T59 is not set 2027# CONFIG_RTC_DRV_M48T59 is not set
2028# CONFIG_RTC_DRV_BQ4802 is not set
1842# CONFIG_RTC_DRV_V3020 is not set 2029# CONFIG_RTC_DRV_V3020 is not set
1843 2030
1844# 2031#
@@ -1851,6 +2038,22 @@ CONFIG_DMADEVICES=y
1851# 2038#
1852# CONFIG_INTEL_IOATDMA is not set 2039# CONFIG_INTEL_IOATDMA is not set
1853# CONFIG_UIO is not set 2040# CONFIG_UIO is not set
2041# CONFIG_STAGING is not set
2042CONFIG_X86_PLATFORM_DEVICES=y
2043# CONFIG_ACER_WMI is not set
2044# CONFIG_ASUS_LAPTOP is not set
2045# CONFIG_FUJITSU_LAPTOP is not set
2046# CONFIG_TC1100_WMI is not set
2047# CONFIG_MSI_LAPTOP is not set
2048# CONFIG_PANASONIC_LAPTOP is not set
2049# CONFIG_COMPAL_LAPTOP is not set
2050# CONFIG_SONY_LAPTOP is not set
2051# CONFIG_THINKPAD_ACPI is not set
2052# CONFIG_INTEL_MENLOW is not set
2053CONFIG_EEEPC_LAPTOP=y
2054# CONFIG_ACPI_WMI is not set
2055# CONFIG_ACPI_ASUS is not set
2056# CONFIG_ACPI_TOSHIBA is not set
1854 2057
1855# 2058#
1856# Firmware Drivers 2059# Firmware Drivers
@@ -1861,8 +2064,7 @@ CONFIG_EFI_VARS=y
1861# CONFIG_DELL_RBU is not set 2064# CONFIG_DELL_RBU is not set
1862# CONFIG_DCDBAS is not set 2065# CONFIG_DCDBAS is not set
1863CONFIG_DMIID=y 2066CONFIG_DMIID=y
1864CONFIG_ISCSI_IBFT_FIND=y 2067# CONFIG_ISCSI_IBFT_FIND is not set
1865CONFIG_ISCSI_IBFT=y
1866 2068
1867# 2069#
1868# File systems 2070# File systems
@@ -1872,21 +2074,24 @@ CONFIG_EXT3_FS=y
1872CONFIG_EXT3_FS_XATTR=y 2074CONFIG_EXT3_FS_XATTR=y
1873CONFIG_EXT3_FS_POSIX_ACL=y 2075CONFIG_EXT3_FS_POSIX_ACL=y
1874CONFIG_EXT3_FS_SECURITY=y 2076CONFIG_EXT3_FS_SECURITY=y
1875# CONFIG_EXT4DEV_FS is not set 2077# CONFIG_EXT4_FS is not set
1876CONFIG_JBD=y 2078CONFIG_JBD=y
1877# CONFIG_JBD_DEBUG is not set 2079# CONFIG_JBD_DEBUG is not set
1878CONFIG_FS_MBCACHE=y 2080CONFIG_FS_MBCACHE=y
1879# CONFIG_REISERFS_FS is not set 2081# CONFIG_REISERFS_FS is not set
1880# CONFIG_JFS_FS is not set 2082# CONFIG_JFS_FS is not set
1881CONFIG_FS_POSIX_ACL=y 2083CONFIG_FS_POSIX_ACL=y
2084CONFIG_FILE_LOCKING=y
1882# CONFIG_XFS_FS is not set 2085# CONFIG_XFS_FS is not set
1883# CONFIG_OCFS2_FS is not set 2086# CONFIG_OCFS2_FS is not set
2087# CONFIG_BTRFS_FS is not set
1884CONFIG_DNOTIFY=y 2088CONFIG_DNOTIFY=y
1885CONFIG_INOTIFY=y 2089CONFIG_INOTIFY=y
1886CONFIG_INOTIFY_USER=y 2090CONFIG_INOTIFY_USER=y
1887CONFIG_QUOTA=y 2091CONFIG_QUOTA=y
1888CONFIG_QUOTA_NETLINK_INTERFACE=y 2092CONFIG_QUOTA_NETLINK_INTERFACE=y
1889# CONFIG_PRINT_QUOTA_WARNING is not set 2093# CONFIG_PRINT_QUOTA_WARNING is not set
2094CONFIG_QUOTA_TREE=y
1890# CONFIG_QFMT_V1 is not set 2095# CONFIG_QFMT_V1 is not set
1891CONFIG_QFMT_V2=y 2096CONFIG_QFMT_V2=y
1892CONFIG_QUOTACTL=y 2097CONFIG_QUOTACTL=y
@@ -1920,16 +2125,14 @@ CONFIG_PROC_FS=y
1920CONFIG_PROC_KCORE=y 2125CONFIG_PROC_KCORE=y
1921CONFIG_PROC_VMCORE=y 2126CONFIG_PROC_VMCORE=y
1922CONFIG_PROC_SYSCTL=y 2127CONFIG_PROC_SYSCTL=y
2128CONFIG_PROC_PAGE_MONITOR=y
1923CONFIG_SYSFS=y 2129CONFIG_SYSFS=y
1924CONFIG_TMPFS=y 2130CONFIG_TMPFS=y
1925CONFIG_TMPFS_POSIX_ACL=y 2131CONFIG_TMPFS_POSIX_ACL=y
1926CONFIG_HUGETLBFS=y 2132CONFIG_HUGETLBFS=y
1927CONFIG_HUGETLB_PAGE=y 2133CONFIG_HUGETLB_PAGE=y
1928# CONFIG_CONFIGFS_FS is not set 2134# CONFIG_CONFIGFS_FS is not set
1929 2135CONFIG_MISC_FILESYSTEMS=y
1930#
1931# Miscellaneous filesystems
1932#
1933# CONFIG_ADFS_FS is not set 2136# CONFIG_ADFS_FS is not set
1934# CONFIG_AFFS_FS is not set 2137# CONFIG_AFFS_FS is not set
1935# CONFIG_ECRYPT_FS is not set 2138# CONFIG_ECRYPT_FS is not set
@@ -1939,6 +2142,7 @@ CONFIG_HUGETLB_PAGE=y
1939# CONFIG_BFS_FS is not set 2142# CONFIG_BFS_FS is not set
1940# CONFIG_EFS_FS is not set 2143# CONFIG_EFS_FS is not set
1941# CONFIG_CRAMFS is not set 2144# CONFIG_CRAMFS is not set
2145# CONFIG_SQUASHFS is not set
1942# CONFIG_VXFS_FS is not set 2146# CONFIG_VXFS_FS is not set
1943# CONFIG_MINIX_FS is not set 2147# CONFIG_MINIX_FS is not set
1944# CONFIG_OMFS_FS is not set 2148# CONFIG_OMFS_FS is not set
@@ -1960,6 +2164,7 @@ CONFIG_NFS_ACL_SUPPORT=y
1960CONFIG_NFS_COMMON=y 2164CONFIG_NFS_COMMON=y
1961CONFIG_SUNRPC=y 2165CONFIG_SUNRPC=y
1962CONFIG_SUNRPC_GSS=y 2166CONFIG_SUNRPC_GSS=y
2167# CONFIG_SUNRPC_REGISTER_V4 is not set
1963CONFIG_RPCSEC_GSS_KRB5=y 2168CONFIG_RPCSEC_GSS_KRB5=y
1964# CONFIG_RPCSEC_GSS_SPKM3 is not set 2169# CONFIG_RPCSEC_GSS_SPKM3 is not set
1965# CONFIG_SMB_FS is not set 2170# CONFIG_SMB_FS is not set
@@ -2036,7 +2241,7 @@ CONFIG_NLS_UTF8=y
2036# 2241#
2037CONFIG_TRACE_IRQFLAGS_SUPPORT=y 2242CONFIG_TRACE_IRQFLAGS_SUPPORT=y
2038CONFIG_PRINTK_TIME=y 2243CONFIG_PRINTK_TIME=y
2039CONFIG_ENABLE_WARN_DEPRECATED=y 2244# CONFIG_ENABLE_WARN_DEPRECATED is not set
2040CONFIG_ENABLE_MUST_CHECK=y 2245CONFIG_ENABLE_MUST_CHECK=y
2041CONFIG_FRAME_WARN=2048 2246CONFIG_FRAME_WARN=2048
2042CONFIG_MAGIC_SYSRQ=y 2247CONFIG_MAGIC_SYSRQ=y
@@ -2066,33 +2271,54 @@ CONFIG_TIMER_STATS=y
2066CONFIG_DEBUG_BUGVERBOSE=y 2271CONFIG_DEBUG_BUGVERBOSE=y
2067# CONFIG_DEBUG_INFO is not set 2272# CONFIG_DEBUG_INFO is not set
2068# CONFIG_DEBUG_VM is not set 2273# CONFIG_DEBUG_VM is not set
2274# CONFIG_DEBUG_VIRTUAL is not set
2069# CONFIG_DEBUG_WRITECOUNT is not set 2275# CONFIG_DEBUG_WRITECOUNT is not set
2070CONFIG_DEBUG_MEMORY_INIT=y 2276CONFIG_DEBUG_MEMORY_INIT=y
2071# CONFIG_DEBUG_LIST is not set 2277# CONFIG_DEBUG_LIST is not set
2072# CONFIG_DEBUG_SG is not set 2278# CONFIG_DEBUG_SG is not set
2279# CONFIG_DEBUG_NOTIFIERS is not set
2280CONFIG_ARCH_WANT_FRAME_POINTERS=y
2073CONFIG_FRAME_POINTER=y 2281CONFIG_FRAME_POINTER=y
2074# CONFIG_BOOT_PRINTK_DELAY is not set 2282# CONFIG_BOOT_PRINTK_DELAY is not set
2075# CONFIG_RCU_TORTURE_TEST is not set 2283# CONFIG_RCU_TORTURE_TEST is not set
2284# CONFIG_RCU_CPU_STALL_DETECTOR is not set
2076# CONFIG_KPROBES_SANITY_TEST is not set 2285# CONFIG_KPROBES_SANITY_TEST is not set
2077# CONFIG_BACKTRACE_SELF_TEST is not set 2286# CONFIG_BACKTRACE_SELF_TEST is not set
2287# CONFIG_DEBUG_BLOCK_EXT_DEVT is not set
2078# CONFIG_LKDTM is not set 2288# CONFIG_LKDTM is not set
2079# CONFIG_FAULT_INJECTION is not set 2289# CONFIG_FAULT_INJECTION is not set
2080# CONFIG_LATENCYTOP is not set 2290# CONFIG_LATENCYTOP is not set
2081CONFIG_SYSCTL_SYSCALL_CHECK=y 2291CONFIG_SYSCTL_SYSCALL_CHECK=y
2082CONFIG_HAVE_FTRACE=y 2292CONFIG_USER_STACKTRACE_SUPPORT=y
2293CONFIG_HAVE_FUNCTION_TRACER=y
2294CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
2295CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
2083CONFIG_HAVE_DYNAMIC_FTRACE=y 2296CONFIG_HAVE_DYNAMIC_FTRACE=y
2084# CONFIG_FTRACE is not set 2297CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
2298CONFIG_HAVE_HW_BRANCH_TRACER=y
2299
2300#
2301# Tracers
2302#
2303# CONFIG_FUNCTION_TRACER is not set
2085# CONFIG_IRQSOFF_TRACER is not set 2304# CONFIG_IRQSOFF_TRACER is not set
2086# CONFIG_SYSPROF_TRACER is not set 2305# CONFIG_SYSPROF_TRACER is not set
2087# CONFIG_SCHED_TRACER is not set 2306# CONFIG_SCHED_TRACER is not set
2088# CONFIG_CONTEXT_SWITCH_TRACER is not set 2307# CONFIG_CONTEXT_SWITCH_TRACER is not set
2308# CONFIG_BOOT_TRACER is not set
2309# CONFIG_TRACE_BRANCH_PROFILING is not set
2310# CONFIG_POWER_TRACER is not set
2311# CONFIG_STACK_TRACER is not set
2312# CONFIG_HW_BRANCH_TRACER is not set
2089CONFIG_PROVIDE_OHCI1394_DMA_INIT=y 2313CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
2314# CONFIG_DYNAMIC_PRINTK_DEBUG is not set
2090# CONFIG_SAMPLES is not set 2315# CONFIG_SAMPLES is not set
2091CONFIG_HAVE_ARCH_KGDB=y 2316CONFIG_HAVE_ARCH_KGDB=y
2092# CONFIG_KGDB is not set 2317# CONFIG_KGDB is not set
2093# CONFIG_STRICT_DEVMEM is not set 2318# CONFIG_STRICT_DEVMEM is not set
2094CONFIG_X86_VERBOSE_BOOTUP=y 2319CONFIG_X86_VERBOSE_BOOTUP=y
2095CONFIG_EARLY_PRINTK=y 2320CONFIG_EARLY_PRINTK=y
2321CONFIG_EARLY_PRINTK_DBGP=y
2096CONFIG_DEBUG_STACKOVERFLOW=y 2322CONFIG_DEBUG_STACKOVERFLOW=y
2097CONFIG_DEBUG_STACK_USAGE=y 2323CONFIG_DEBUG_STACK_USAGE=y
2098# CONFIG_DEBUG_PAGEALLOC is not set 2324# CONFIG_DEBUG_PAGEALLOC is not set
@@ -2123,8 +2349,10 @@ CONFIG_OPTIMIZE_INLINING=y
2123CONFIG_KEYS=y 2349CONFIG_KEYS=y
2124CONFIG_KEYS_DEBUG_PROC_KEYS=y 2350CONFIG_KEYS_DEBUG_PROC_KEYS=y
2125CONFIG_SECURITY=y 2351CONFIG_SECURITY=y
2352# CONFIG_SECURITYFS is not set
2126CONFIG_SECURITY_NETWORK=y 2353CONFIG_SECURITY_NETWORK=y
2127# CONFIG_SECURITY_NETWORK_XFRM is not set 2354# CONFIG_SECURITY_NETWORK_XFRM is not set
2355# CONFIG_SECURITY_PATH is not set
2128CONFIG_SECURITY_FILE_CAPABILITIES=y 2356CONFIG_SECURITY_FILE_CAPABILITIES=y
2129# CONFIG_SECURITY_ROOTPLUG is not set 2357# CONFIG_SECURITY_ROOTPLUG is not set
2130CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR=65536 2358CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR=65536
@@ -2135,7 +2363,6 @@ CONFIG_SECURITY_SELINUX_DISABLE=y
2135CONFIG_SECURITY_SELINUX_DEVELOP=y 2363CONFIG_SECURITY_SELINUX_DEVELOP=y
2136CONFIG_SECURITY_SELINUX_AVC_STATS=y 2364CONFIG_SECURITY_SELINUX_AVC_STATS=y
2137CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1 2365CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
2138# CONFIG_SECURITY_SELINUX_ENABLE_SECMARK_DEFAULT is not set
2139# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set 2366# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set
2140# CONFIG_SECURITY_SMACK is not set 2367# CONFIG_SECURITY_SMACK is not set
2141CONFIG_CRYPTO=y 2368CONFIG_CRYPTO=y
@@ -2143,11 +2370,18 @@ CONFIG_CRYPTO=y
2143# 2370#
2144# Crypto core or helper 2371# Crypto core or helper
2145# 2372#
2373# CONFIG_CRYPTO_FIPS is not set
2146CONFIG_CRYPTO_ALGAPI=y 2374CONFIG_CRYPTO_ALGAPI=y
2375CONFIG_CRYPTO_ALGAPI2=y
2147CONFIG_CRYPTO_AEAD=y 2376CONFIG_CRYPTO_AEAD=y
2377CONFIG_CRYPTO_AEAD2=y
2148CONFIG_CRYPTO_BLKCIPHER=y 2378CONFIG_CRYPTO_BLKCIPHER=y
2379CONFIG_CRYPTO_BLKCIPHER2=y
2149CONFIG_CRYPTO_HASH=y 2380CONFIG_CRYPTO_HASH=y
2381CONFIG_CRYPTO_HASH2=y
2382CONFIG_CRYPTO_RNG2=y
2150CONFIG_CRYPTO_MANAGER=y 2383CONFIG_CRYPTO_MANAGER=y
2384CONFIG_CRYPTO_MANAGER2=y
2151# CONFIG_CRYPTO_GF128MUL is not set 2385# CONFIG_CRYPTO_GF128MUL is not set
2152# CONFIG_CRYPTO_NULL is not set 2386# CONFIG_CRYPTO_NULL is not set
2153# CONFIG_CRYPTO_CRYPTD is not set 2387# CONFIG_CRYPTO_CRYPTD is not set
@@ -2182,6 +2416,7 @@ CONFIG_CRYPTO_HMAC=y
2182# Digest 2416# Digest
2183# 2417#
2184# CONFIG_CRYPTO_CRC32C is not set 2418# CONFIG_CRYPTO_CRC32C is not set
2419# CONFIG_CRYPTO_CRC32C_INTEL is not set
2185# CONFIG_CRYPTO_MD4 is not set 2420# CONFIG_CRYPTO_MD4 is not set
2186CONFIG_CRYPTO_MD5=y 2421CONFIG_CRYPTO_MD5=y
2187# CONFIG_CRYPTO_MICHAEL_MIC is not set 2422# CONFIG_CRYPTO_MICHAEL_MIC is not set
@@ -2222,6 +2457,11 @@ CONFIG_CRYPTO_DES=y
2222# 2457#
2223# CONFIG_CRYPTO_DEFLATE is not set 2458# CONFIG_CRYPTO_DEFLATE is not set
2224# CONFIG_CRYPTO_LZO is not set 2459# CONFIG_CRYPTO_LZO is not set
2460
2461#
2462# Random Number Generation
2463#
2464# CONFIG_CRYPTO_ANSI_CPRNG is not set
2225CONFIG_CRYPTO_HW=y 2465CONFIG_CRYPTO_HW=y
2226# CONFIG_CRYPTO_DEV_PADLOCK is not set 2466# CONFIG_CRYPTO_DEV_PADLOCK is not set
2227# CONFIG_CRYPTO_DEV_GEODE is not set 2467# CONFIG_CRYPTO_DEV_GEODE is not set
@@ -2239,6 +2479,7 @@ CONFIG_VIRTUALIZATION=y
2239CONFIG_BITREVERSE=y 2479CONFIG_BITREVERSE=y
2240CONFIG_GENERIC_FIND_FIRST_BIT=y 2480CONFIG_GENERIC_FIND_FIRST_BIT=y
2241CONFIG_GENERIC_FIND_NEXT_BIT=y 2481CONFIG_GENERIC_FIND_NEXT_BIT=y
2482CONFIG_GENERIC_FIND_LAST_BIT=y
2242# CONFIG_CRC_CCITT is not set 2483# CONFIG_CRC_CCITT is not set
2243# CONFIG_CRC16 is not set 2484# CONFIG_CRC16 is not set
2244CONFIG_CRC_T10DIF=y 2485CONFIG_CRC_T10DIF=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 322dd2748fc9..9fe5d212ab4c 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1,14 +1,13 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.27-rc5 3# Linux kernel version: 2.6.29-rc4
4# Wed Sep 3 17:13:39 2008 4# Tue Feb 24 15:44:16 2009
5# 5#
6CONFIG_64BIT=y 6CONFIG_64BIT=y
7# CONFIG_X86_32 is not set 7# CONFIG_X86_32 is not set
8CONFIG_X86_64=y 8CONFIG_X86_64=y
9CONFIG_X86=y 9CONFIG_X86=y
10CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig" 10CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig"
11# CONFIG_GENERIC_LOCKBREAK is not set
12CONFIG_GENERIC_TIME=y 11CONFIG_GENERIC_TIME=y
13CONFIG_GENERIC_CMOS_UPDATE=y 12CONFIG_GENERIC_CMOS_UPDATE=y
14CONFIG_CLOCKSOURCE_WATCHDOG=y 13CONFIG_CLOCKSOURCE_WATCHDOG=y
@@ -23,17 +22,16 @@ CONFIG_ZONE_DMA=y
23CONFIG_GENERIC_ISA_DMA=y 22CONFIG_GENERIC_ISA_DMA=y
24CONFIG_GENERIC_IOMAP=y 23CONFIG_GENERIC_IOMAP=y
25CONFIG_GENERIC_BUG=y 24CONFIG_GENERIC_BUG=y
25CONFIG_GENERIC_BUG_RELATIVE_POINTERS=y
26CONFIG_GENERIC_HWEIGHT=y 26CONFIG_GENERIC_HWEIGHT=y
27# CONFIG_GENERIC_GPIO is not set
28CONFIG_ARCH_MAY_HAVE_PC_FDC=y 27CONFIG_ARCH_MAY_HAVE_PC_FDC=y
29CONFIG_RWSEM_GENERIC_SPINLOCK=y 28CONFIG_RWSEM_GENERIC_SPINLOCK=y
30# CONFIG_RWSEM_XCHGADD_ALGORITHM is not set 29# CONFIG_RWSEM_XCHGADD_ALGORITHM is not set
31# CONFIG_ARCH_HAS_ILOG2_U32 is not set
32# CONFIG_ARCH_HAS_ILOG2_U64 is not set
33CONFIG_ARCH_HAS_CPU_IDLE_WAIT=y 30CONFIG_ARCH_HAS_CPU_IDLE_WAIT=y
34CONFIG_GENERIC_CALIBRATE_DELAY=y 31CONFIG_GENERIC_CALIBRATE_DELAY=y
35CONFIG_GENERIC_TIME_VSYSCALL=y 32CONFIG_GENERIC_TIME_VSYSCALL=y
36CONFIG_ARCH_HAS_CPU_RELAX=y 33CONFIG_ARCH_HAS_CPU_RELAX=y
34CONFIG_ARCH_HAS_DEFAULT_IDLE=y
37CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y 35CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
38CONFIG_HAVE_SETUP_PER_CPU_AREA=y 36CONFIG_HAVE_SETUP_PER_CPU_AREA=y
39CONFIG_HAVE_CPUMASK_OF_CPU_MAP=y 37CONFIG_HAVE_CPUMASK_OF_CPU_MAP=y
@@ -42,12 +40,12 @@ CONFIG_ARCH_SUSPEND_POSSIBLE=y
42CONFIG_ZONE_DMA32=y 40CONFIG_ZONE_DMA32=y
43CONFIG_ARCH_POPULATES_NODE_MAP=y 41CONFIG_ARCH_POPULATES_NODE_MAP=y
44CONFIG_AUDIT_ARCH=y 42CONFIG_AUDIT_ARCH=y
45CONFIG_ARCH_SUPPORTS_AOUT=y
46CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y 43CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
47CONFIG_GENERIC_HARDIRQS=y 44CONFIG_GENERIC_HARDIRQS=y
48CONFIG_GENERIC_IRQ_PROBE=y 45CONFIG_GENERIC_IRQ_PROBE=y
49CONFIG_GENERIC_PENDING_IRQ=y 46CONFIG_GENERIC_PENDING_IRQ=y
50CONFIG_X86_SMP=y 47CONFIG_X86_SMP=y
48CONFIG_USE_GENERIC_SMP_HELPERS=y
51CONFIG_X86_64_SMP=y 49CONFIG_X86_64_SMP=y
52CONFIG_X86_HT=y 50CONFIG_X86_HT=y
53CONFIG_X86_BIOS_REBOOT=y 51CONFIG_X86_BIOS_REBOOT=y
@@ -76,30 +74,44 @@ CONFIG_TASK_IO_ACCOUNTING=y
76CONFIG_AUDIT=y 74CONFIG_AUDIT=y
77CONFIG_AUDITSYSCALL=y 75CONFIG_AUDITSYSCALL=y
78CONFIG_AUDIT_TREE=y 76CONFIG_AUDIT_TREE=y
77
78#
79# RCU Subsystem
80#
81# CONFIG_CLASSIC_RCU is not set
82CONFIG_TREE_RCU=y
83# CONFIG_PREEMPT_RCU is not set
84# CONFIG_RCU_TRACE is not set
85CONFIG_RCU_FANOUT=64
86# CONFIG_RCU_FANOUT_EXACT is not set
87# CONFIG_TREE_RCU_TRACE is not set
88# CONFIG_PREEMPT_RCU_TRACE is not set
79# CONFIG_IKCONFIG is not set 89# CONFIG_IKCONFIG is not set
80CONFIG_LOG_BUF_SHIFT=18 90CONFIG_LOG_BUF_SHIFT=18
81CONFIG_CGROUPS=y
82# CONFIG_CGROUP_DEBUG is not set
83CONFIG_CGROUP_NS=y
84# CONFIG_CGROUP_DEVICE is not set
85CONFIG_CPUSETS=y
86CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y 91CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y
87CONFIG_GROUP_SCHED=y 92CONFIG_GROUP_SCHED=y
88CONFIG_FAIR_GROUP_SCHED=y 93CONFIG_FAIR_GROUP_SCHED=y
89# CONFIG_RT_GROUP_SCHED is not set 94# CONFIG_RT_GROUP_SCHED is not set
90# CONFIG_USER_SCHED is not set 95# CONFIG_USER_SCHED is not set
91CONFIG_CGROUP_SCHED=y 96CONFIG_CGROUP_SCHED=y
97CONFIG_CGROUPS=y
98# CONFIG_CGROUP_DEBUG is not set
99CONFIG_CGROUP_NS=y
100CONFIG_CGROUP_FREEZER=y
101# CONFIG_CGROUP_DEVICE is not set
102CONFIG_CPUSETS=y
103CONFIG_PROC_PID_CPUSET=y
92CONFIG_CGROUP_CPUACCT=y 104CONFIG_CGROUP_CPUACCT=y
93CONFIG_RESOURCE_COUNTERS=y 105CONFIG_RESOURCE_COUNTERS=y
94# CONFIG_CGROUP_MEM_RES_CTLR is not set 106# CONFIG_CGROUP_MEM_RES_CTLR is not set
95# CONFIG_SYSFS_DEPRECATED_V2 is not set 107# CONFIG_SYSFS_DEPRECATED_V2 is not set
96CONFIG_PROC_PID_CPUSET=y
97CONFIG_RELAY=y 108CONFIG_RELAY=y
98CONFIG_NAMESPACES=y 109CONFIG_NAMESPACES=y
99CONFIG_UTS_NS=y 110CONFIG_UTS_NS=y
100CONFIG_IPC_NS=y 111CONFIG_IPC_NS=y
101CONFIG_USER_NS=y 112CONFIG_USER_NS=y
102CONFIG_PID_NS=y 113CONFIG_PID_NS=y
114CONFIG_NET_NS=y
103CONFIG_BLK_DEV_INITRD=y 115CONFIG_BLK_DEV_INITRD=y
104CONFIG_INITRAMFS_SOURCE="" 116CONFIG_INITRAMFS_SOURCE=""
105CONFIG_CC_OPTIMIZE_FOR_SIZE=y 117CONFIG_CC_OPTIMIZE_FOR_SIZE=y
@@ -124,12 +136,15 @@ CONFIG_SIGNALFD=y
124CONFIG_TIMERFD=y 136CONFIG_TIMERFD=y
125CONFIG_EVENTFD=y 137CONFIG_EVENTFD=y
126CONFIG_SHMEM=y 138CONFIG_SHMEM=y
139CONFIG_AIO=y
127CONFIG_VM_EVENT_COUNTERS=y 140CONFIG_VM_EVENT_COUNTERS=y
141CONFIG_PCI_QUIRKS=y
128CONFIG_SLUB_DEBUG=y 142CONFIG_SLUB_DEBUG=y
129# CONFIG_SLAB is not set 143# CONFIG_SLAB is not set
130CONFIG_SLUB=y 144CONFIG_SLUB=y
131# CONFIG_SLOB is not set 145# CONFIG_SLOB is not set
132CONFIG_PROFILING=y 146CONFIG_PROFILING=y
147CONFIG_TRACEPOINTS=y
133CONFIG_MARKERS=y 148CONFIG_MARKERS=y
134# CONFIG_OPROFILE is not set 149# CONFIG_OPROFILE is not set
135CONFIG_HAVE_OPROFILE=y 150CONFIG_HAVE_OPROFILE=y
@@ -139,15 +154,10 @@ CONFIG_KRETPROBES=y
139CONFIG_HAVE_IOREMAP_PROT=y 154CONFIG_HAVE_IOREMAP_PROT=y
140CONFIG_HAVE_KPROBES=y 155CONFIG_HAVE_KPROBES=y
141CONFIG_HAVE_KRETPROBES=y 156CONFIG_HAVE_KRETPROBES=y
142# CONFIG_HAVE_ARCH_TRACEHOOK is not set 157CONFIG_HAVE_ARCH_TRACEHOOK=y
143# CONFIG_HAVE_DMA_ATTRS is not set
144CONFIG_USE_GENERIC_SMP_HELPERS=y
145# CONFIG_HAVE_CLK is not set
146CONFIG_PROC_PAGE_MONITOR=y
147# CONFIG_HAVE_GENERIC_DMA_COHERENT is not set 158# CONFIG_HAVE_GENERIC_DMA_COHERENT is not set
148CONFIG_SLABINFO=y 159CONFIG_SLABINFO=y
149CONFIG_RT_MUTEXES=y 160CONFIG_RT_MUTEXES=y
150# CONFIG_TINY_SHMEM is not set
151CONFIG_BASE_SMALL=0 161CONFIG_BASE_SMALL=0
152CONFIG_MODULES=y 162CONFIG_MODULES=y
153# CONFIG_MODULE_FORCE_LOAD is not set 163# CONFIG_MODULE_FORCE_LOAD is not set
@@ -155,7 +165,6 @@ CONFIG_MODULE_UNLOAD=y
155CONFIG_MODULE_FORCE_UNLOAD=y 165CONFIG_MODULE_FORCE_UNLOAD=y
156# CONFIG_MODVERSIONS is not set 166# CONFIG_MODVERSIONS is not set
157# CONFIG_MODULE_SRCVERSION_ALL is not set 167# CONFIG_MODULE_SRCVERSION_ALL is not set
158CONFIG_KMOD=y
159CONFIG_STOP_MACHINE=y 168CONFIG_STOP_MACHINE=y
160CONFIG_BLOCK=y 169CONFIG_BLOCK=y
161CONFIG_BLK_DEV_IO_TRACE=y 170CONFIG_BLK_DEV_IO_TRACE=y
@@ -175,7 +184,7 @@ CONFIG_IOSCHED_CFQ=y
175CONFIG_DEFAULT_CFQ=y 184CONFIG_DEFAULT_CFQ=y
176# CONFIG_DEFAULT_NOOP is not set 185# CONFIG_DEFAULT_NOOP is not set
177CONFIG_DEFAULT_IOSCHED="cfq" 186CONFIG_DEFAULT_IOSCHED="cfq"
178CONFIG_CLASSIC_RCU=y 187CONFIG_FREEZER=y
179 188
180# 189#
181# Processor type and features 190# Processor type and features
@@ -185,13 +194,14 @@ CONFIG_NO_HZ=y
185CONFIG_HIGH_RES_TIMERS=y 194CONFIG_HIGH_RES_TIMERS=y
186CONFIG_GENERIC_CLOCKEVENTS_BUILD=y 195CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
187CONFIG_SMP=y 196CONFIG_SMP=y
197CONFIG_SPARSE_IRQ=y
198# CONFIG_NUMA_MIGRATE_IRQ_DESC is not set
188CONFIG_X86_FIND_SMP_CONFIG=y 199CONFIG_X86_FIND_SMP_CONFIG=y
189CONFIG_X86_MPPARSE=y 200CONFIG_X86_MPPARSE=y
190CONFIG_X86_PC=y
191# CONFIG_X86_ELAN is not set 201# CONFIG_X86_ELAN is not set
192# CONFIG_X86_VOYAGER is not set
193# CONFIG_X86_GENERICARCH is not set 202# CONFIG_X86_GENERICARCH is not set
194# CONFIG_X86_VSMP is not set 203# CONFIG_X86_VSMP is not set
204CONFIG_SCHED_OMIT_FRAME_POINTER=y
195# CONFIG_PARAVIRT_GUEST is not set 205# CONFIG_PARAVIRT_GUEST is not set
196# CONFIG_MEMTEST is not set 206# CONFIG_MEMTEST is not set
197# CONFIG_M386 is not set 207# CONFIG_M386 is not set
@@ -230,6 +240,11 @@ CONFIG_X86_CMPXCHG64=y
230CONFIG_X86_CMOV=y 240CONFIG_X86_CMOV=y
231CONFIG_X86_MINIMUM_CPU_FAMILY=64 241CONFIG_X86_MINIMUM_CPU_FAMILY=64
232CONFIG_X86_DEBUGCTLMSR=y 242CONFIG_X86_DEBUGCTLMSR=y
243CONFIG_CPU_SUP_INTEL=y
244CONFIG_CPU_SUP_AMD=y
245CONFIG_CPU_SUP_CENTAUR_64=y
246CONFIG_X86_DS=y
247CONFIG_X86_PTRACE_BTS=y
233CONFIG_HPET_TIMER=y 248CONFIG_HPET_TIMER=y
234CONFIG_HPET_EMULATE_RTC=y 249CONFIG_HPET_EMULATE_RTC=y
235CONFIG_DMI=y 250CONFIG_DMI=y
@@ -237,8 +252,11 @@ CONFIG_GART_IOMMU=y
237CONFIG_CALGARY_IOMMU=y 252CONFIG_CALGARY_IOMMU=y
238CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT=y 253CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT=y
239CONFIG_AMD_IOMMU=y 254CONFIG_AMD_IOMMU=y
255CONFIG_AMD_IOMMU_STATS=y
240CONFIG_SWIOTLB=y 256CONFIG_SWIOTLB=y
241CONFIG_IOMMU_HELPER=y 257CONFIG_IOMMU_HELPER=y
258CONFIG_IOMMU_API=y
259# CONFIG_MAXSMP is not set
242CONFIG_NR_CPUS=64 260CONFIG_NR_CPUS=64
243CONFIG_SCHED_SMT=y 261CONFIG_SCHED_SMT=y
244CONFIG_SCHED_MC=y 262CONFIG_SCHED_MC=y
@@ -247,12 +265,19 @@ CONFIG_PREEMPT_VOLUNTARY=y
247# CONFIG_PREEMPT is not set 265# CONFIG_PREEMPT is not set
248CONFIG_X86_LOCAL_APIC=y 266CONFIG_X86_LOCAL_APIC=y
249CONFIG_X86_IO_APIC=y 267CONFIG_X86_IO_APIC=y
250# CONFIG_X86_MCE is not set 268CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y
269CONFIG_X86_MCE=y
270CONFIG_X86_MCE_INTEL=y
271CONFIG_X86_MCE_AMD=y
251# CONFIG_I8K is not set 272# CONFIG_I8K is not set
252CONFIG_MICROCODE=y 273CONFIG_MICROCODE=y
274CONFIG_MICROCODE_INTEL=y
275CONFIG_MICROCODE_AMD=y
253CONFIG_MICROCODE_OLD_INTERFACE=y 276CONFIG_MICROCODE_OLD_INTERFACE=y
254CONFIG_X86_MSR=y 277CONFIG_X86_MSR=y
255CONFIG_X86_CPUID=y 278CONFIG_X86_CPUID=y
279CONFIG_ARCH_PHYS_ADDR_T_64BIT=y
280CONFIG_DIRECT_GBPAGES=y
256CONFIG_NUMA=y 281CONFIG_NUMA=y
257CONFIG_K8_NUMA=y 282CONFIG_K8_NUMA=y
258CONFIG_X86_64_ACPI_NUMA=y 283CONFIG_X86_64_ACPI_NUMA=y
@@ -269,7 +294,6 @@ CONFIG_SPARSEMEM_MANUAL=y
269CONFIG_SPARSEMEM=y 294CONFIG_SPARSEMEM=y
270CONFIG_NEED_MULTIPLE_NODES=y 295CONFIG_NEED_MULTIPLE_NODES=y
271CONFIG_HAVE_MEMORY_PRESENT=y 296CONFIG_HAVE_MEMORY_PRESENT=y
272# CONFIG_SPARSEMEM_STATIC is not set
273CONFIG_SPARSEMEM_EXTREME=y 297CONFIG_SPARSEMEM_EXTREME=y
274CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y 298CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y
275CONFIG_SPARSEMEM_VMEMMAP=y 299CONFIG_SPARSEMEM_VMEMMAP=y
@@ -280,10 +304,14 @@ CONFIG_SPARSEMEM_VMEMMAP=y
280CONFIG_PAGEFLAGS_EXTENDED=y 304CONFIG_PAGEFLAGS_EXTENDED=y
281CONFIG_SPLIT_PTLOCK_CPUS=4 305CONFIG_SPLIT_PTLOCK_CPUS=4
282CONFIG_MIGRATION=y 306CONFIG_MIGRATION=y
283CONFIG_RESOURCES_64BIT=y 307CONFIG_PHYS_ADDR_T_64BIT=y
284CONFIG_ZONE_DMA_FLAG=1 308CONFIG_ZONE_DMA_FLAG=1
285CONFIG_BOUNCE=y 309CONFIG_BOUNCE=y
286CONFIG_VIRT_TO_BUS=y 310CONFIG_VIRT_TO_BUS=y
311CONFIG_UNEVICTABLE_LRU=y
312CONFIG_X86_CHECK_BIOS_CORRUPTION=y
313CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y
314CONFIG_X86_RESERVE_LOW_64K=y
287CONFIG_MTRR=y 315CONFIG_MTRR=y
288# CONFIG_MTRR_SANITIZER is not set 316# CONFIG_MTRR_SANITIZER is not set
289CONFIG_X86_PAT=y 317CONFIG_X86_PAT=y
@@ -302,11 +330,12 @@ CONFIG_PHYSICAL_START=0x1000000
302CONFIG_PHYSICAL_ALIGN=0x200000 330CONFIG_PHYSICAL_ALIGN=0x200000
303CONFIG_HOTPLUG_CPU=y 331CONFIG_HOTPLUG_CPU=y
304# CONFIG_COMPAT_VDSO is not set 332# CONFIG_COMPAT_VDSO is not set
333# CONFIG_CMDLINE_BOOL is not set
305CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y 334CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
306CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y 335CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y
307 336
308# 337#
309# Power management options 338# Power management and ACPI options
310# 339#
311CONFIG_ARCH_HIBERNATION_HEADER=y 340CONFIG_ARCH_HIBERNATION_HEADER=y
312CONFIG_PM=y 341CONFIG_PM=y
@@ -333,20 +362,14 @@ CONFIG_ACPI_BATTERY=y
333CONFIG_ACPI_BUTTON=y 362CONFIG_ACPI_BUTTON=y
334CONFIG_ACPI_FAN=y 363CONFIG_ACPI_FAN=y
335CONFIG_ACPI_DOCK=y 364CONFIG_ACPI_DOCK=y
336# CONFIG_ACPI_BAY is not set
337CONFIG_ACPI_PROCESSOR=y 365CONFIG_ACPI_PROCESSOR=y
338CONFIG_ACPI_HOTPLUG_CPU=y 366CONFIG_ACPI_HOTPLUG_CPU=y
339CONFIG_ACPI_THERMAL=y 367CONFIG_ACPI_THERMAL=y
340CONFIG_ACPI_NUMA=y 368CONFIG_ACPI_NUMA=y
341# CONFIG_ACPI_WMI is not set
342# CONFIG_ACPI_ASUS is not set
343# CONFIG_ACPI_TOSHIBA is not set
344# CONFIG_ACPI_CUSTOM_DSDT is not set 369# CONFIG_ACPI_CUSTOM_DSDT is not set
345CONFIG_ACPI_BLACKLIST_YEAR=0 370CONFIG_ACPI_BLACKLIST_YEAR=0
346# CONFIG_ACPI_DEBUG is not set 371# CONFIG_ACPI_DEBUG is not set
347CONFIG_ACPI_EC=y
348# CONFIG_ACPI_PCI_SLOT is not set 372# CONFIG_ACPI_PCI_SLOT is not set
349CONFIG_ACPI_POWER=y
350CONFIG_ACPI_SYSTEM=y 373CONFIG_ACPI_SYSTEM=y
351CONFIG_X86_PM_TIMER=y 374CONFIG_X86_PM_TIMER=y
352CONFIG_ACPI_CONTAINER=y 375CONFIG_ACPI_CONTAINER=y
@@ -381,13 +404,17 @@ CONFIG_X86_ACPI_CPUFREQ=y
381# 404#
382# shared options 405# shared options
383# 406#
384# CONFIG_X86_ACPI_CPUFREQ_PROC_INTF is not set
385# CONFIG_X86_SPEEDSTEP_LIB is not set 407# CONFIG_X86_SPEEDSTEP_LIB is not set
386CONFIG_CPU_IDLE=y 408CONFIG_CPU_IDLE=y
387CONFIG_CPU_IDLE_GOV_LADDER=y 409CONFIG_CPU_IDLE_GOV_LADDER=y
388CONFIG_CPU_IDLE_GOV_MENU=y 410CONFIG_CPU_IDLE_GOV_MENU=y
389 411
390# 412#
413# Memory power savings
414#
415# CONFIG_I7300_IDLE is not set
416
417#
391# Bus options (PCI etc.) 418# Bus options (PCI etc.)
392# 419#
393CONFIG_PCI=y 420CONFIG_PCI=y
@@ -395,8 +422,10 @@ CONFIG_PCI_DIRECT=y
395CONFIG_PCI_MMCONFIG=y 422CONFIG_PCI_MMCONFIG=y
396CONFIG_PCI_DOMAINS=y 423CONFIG_PCI_DOMAINS=y
397CONFIG_DMAR=y 424CONFIG_DMAR=y
425# CONFIG_DMAR_DEFAULT_ON is not set
398CONFIG_DMAR_GFX_WA=y 426CONFIG_DMAR_GFX_WA=y
399CONFIG_DMAR_FLOPPY_WA=y 427CONFIG_DMAR_FLOPPY_WA=y
428# CONFIG_INTR_REMAP is not set
400CONFIG_PCIEPORTBUS=y 429CONFIG_PCIEPORTBUS=y
401# CONFIG_HOTPLUG_PCI_PCIE is not set 430# CONFIG_HOTPLUG_PCI_PCIE is not set
402CONFIG_PCIEAER=y 431CONFIG_PCIEAER=y
@@ -405,6 +434,7 @@ CONFIG_ARCH_SUPPORTS_MSI=y
405CONFIG_PCI_MSI=y 434CONFIG_PCI_MSI=y
406# CONFIG_PCI_LEGACY is not set 435# CONFIG_PCI_LEGACY is not set
407# CONFIG_PCI_DEBUG is not set 436# CONFIG_PCI_DEBUG is not set
437# CONFIG_PCI_STUB is not set
408CONFIG_HT_IRQ=y 438CONFIG_HT_IRQ=y
409CONFIG_ISA_DMA_API=y 439CONFIG_ISA_DMA_API=y
410CONFIG_K8_NB=y 440CONFIG_K8_NB=y
@@ -438,6 +468,8 @@ CONFIG_HOTPLUG_PCI=y
438# 468#
439CONFIG_BINFMT_ELF=y 469CONFIG_BINFMT_ELF=y
440CONFIG_COMPAT_BINFMT_ELF=y 470CONFIG_COMPAT_BINFMT_ELF=y
471CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
472# CONFIG_HAVE_AOUT is not set
441CONFIG_BINFMT_MISC=y 473CONFIG_BINFMT_MISC=y
442CONFIG_IA32_EMULATION=y 474CONFIG_IA32_EMULATION=y
443# CONFIG_IA32_AOUT is not set 475# CONFIG_IA32_AOUT is not set
@@ -449,6 +481,7 @@ CONFIG_NET=y
449# 481#
450# Networking options 482# Networking options
451# 483#
484CONFIG_COMPAT_NET_DEV_OPS=y
452CONFIG_PACKET=y 485CONFIG_PACKET=y
453CONFIG_PACKET_MMAP=y 486CONFIG_PACKET_MMAP=y
454CONFIG_UNIX=y 487CONFIG_UNIX=y
@@ -509,7 +542,6 @@ CONFIG_DEFAULT_CUBIC=y
509# CONFIG_DEFAULT_RENO is not set 542# CONFIG_DEFAULT_RENO is not set
510CONFIG_DEFAULT_TCP_CONG="cubic" 543CONFIG_DEFAULT_TCP_CONG="cubic"
511CONFIG_TCP_MD5SIG=y 544CONFIG_TCP_MD5SIG=y
512# CONFIG_IP_VS is not set
513CONFIG_IPV6=y 545CONFIG_IPV6=y
514# CONFIG_IPV6_PRIVACY is not set 546# CONFIG_IPV6_PRIVACY is not set
515# CONFIG_IPV6_ROUTER_PREF is not set 547# CONFIG_IPV6_ROUTER_PREF is not set
@@ -547,19 +579,21 @@ CONFIG_NF_CONNTRACK_IRC=y
547CONFIG_NF_CONNTRACK_SIP=y 579CONFIG_NF_CONNTRACK_SIP=y
548CONFIG_NF_CT_NETLINK=y 580CONFIG_NF_CT_NETLINK=y
549CONFIG_NETFILTER_XTABLES=y 581CONFIG_NETFILTER_XTABLES=y
582CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
550CONFIG_NETFILTER_XT_TARGET_MARK=y 583CONFIG_NETFILTER_XT_TARGET_MARK=y
551CONFIG_NETFILTER_XT_TARGET_NFLOG=y 584CONFIG_NETFILTER_XT_TARGET_NFLOG=y
552CONFIG_NETFILTER_XT_TARGET_SECMARK=y 585CONFIG_NETFILTER_XT_TARGET_SECMARK=y
553CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
554CONFIG_NETFILTER_XT_TARGET_TCPMSS=y 586CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
555CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y 587CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
556CONFIG_NETFILTER_XT_MATCH_MARK=y 588CONFIG_NETFILTER_XT_MATCH_MARK=y
557CONFIG_NETFILTER_XT_MATCH_POLICY=y 589CONFIG_NETFILTER_XT_MATCH_POLICY=y
558CONFIG_NETFILTER_XT_MATCH_STATE=y 590CONFIG_NETFILTER_XT_MATCH_STATE=y
591# CONFIG_IP_VS is not set
559 592
560# 593#
561# IP: Netfilter Configuration 594# IP: Netfilter Configuration
562# 595#
596CONFIG_NF_DEFRAG_IPV4=y
563CONFIG_NF_CONNTRACK_IPV4=y 597CONFIG_NF_CONNTRACK_IPV4=y
564CONFIG_NF_CONNTRACK_PROC_COMPAT=y 598CONFIG_NF_CONNTRACK_PROC_COMPAT=y
565CONFIG_IP_NF_IPTABLES=y 599CONFIG_IP_NF_IPTABLES=y
@@ -585,8 +619,8 @@ CONFIG_IP_NF_MANGLE=y
585CONFIG_NF_CONNTRACK_IPV6=y 619CONFIG_NF_CONNTRACK_IPV6=y
586CONFIG_IP6_NF_IPTABLES=y 620CONFIG_IP6_NF_IPTABLES=y
587CONFIG_IP6_NF_MATCH_IPV6HEADER=y 621CONFIG_IP6_NF_MATCH_IPV6HEADER=y
588CONFIG_IP6_NF_FILTER=y
589CONFIG_IP6_NF_TARGET_LOG=y 622CONFIG_IP6_NF_TARGET_LOG=y
623CONFIG_IP6_NF_FILTER=y
590CONFIG_IP6_NF_TARGET_REJECT=y 624CONFIG_IP6_NF_TARGET_REJECT=y
591CONFIG_IP6_NF_MANGLE=y 625CONFIG_IP6_NF_MANGLE=y
592# CONFIG_IP_DCCP is not set 626# CONFIG_IP_DCCP is not set
@@ -594,6 +628,7 @@ CONFIG_IP6_NF_MANGLE=y
594# CONFIG_TIPC is not set 628# CONFIG_TIPC is not set
595# CONFIG_ATM is not set 629# CONFIG_ATM is not set
596# CONFIG_BRIDGE is not set 630# CONFIG_BRIDGE is not set
631# CONFIG_NET_DSA is not set
597# CONFIG_VLAN_8021Q is not set 632# CONFIG_VLAN_8021Q is not set
598# CONFIG_DECNET is not set 633# CONFIG_DECNET is not set
599CONFIG_LLC=y 634CONFIG_LLC=y
@@ -613,6 +648,7 @@ CONFIG_NET_SCHED=y
613# CONFIG_NET_SCH_HTB is not set 648# CONFIG_NET_SCH_HTB is not set
614# CONFIG_NET_SCH_HFSC is not set 649# CONFIG_NET_SCH_HFSC is not set
615# CONFIG_NET_SCH_PRIO is not set 650# CONFIG_NET_SCH_PRIO is not set
651# CONFIG_NET_SCH_MULTIQ is not set
616# CONFIG_NET_SCH_RED is not set 652# CONFIG_NET_SCH_RED is not set
617# CONFIG_NET_SCH_SFQ is not set 653# CONFIG_NET_SCH_SFQ is not set
618# CONFIG_NET_SCH_TEQL is not set 654# CONFIG_NET_SCH_TEQL is not set
@@ -620,6 +656,7 @@ CONFIG_NET_SCHED=y
620# CONFIG_NET_SCH_GRED is not set 656# CONFIG_NET_SCH_GRED is not set
621# CONFIG_NET_SCH_DSMARK is not set 657# CONFIG_NET_SCH_DSMARK is not set
622# CONFIG_NET_SCH_NETEM is not set 658# CONFIG_NET_SCH_NETEM is not set
659# CONFIG_NET_SCH_DRR is not set
623# CONFIG_NET_SCH_INGRESS is not set 660# CONFIG_NET_SCH_INGRESS is not set
624 661
625# 662#
@@ -634,6 +671,7 @@ CONFIG_NET_CLS=y
634# CONFIG_NET_CLS_RSVP is not set 671# CONFIG_NET_CLS_RSVP is not set
635# CONFIG_NET_CLS_RSVP6 is not set 672# CONFIG_NET_CLS_RSVP6 is not set
636# CONFIG_NET_CLS_FLOW is not set 673# CONFIG_NET_CLS_FLOW is not set
674# CONFIG_NET_CLS_CGROUP is not set
637CONFIG_NET_EMATCH=y 675CONFIG_NET_EMATCH=y
638CONFIG_NET_EMATCH_STACK=32 676CONFIG_NET_EMATCH_STACK=32
639# CONFIG_NET_EMATCH_CMP is not set 677# CONFIG_NET_EMATCH_CMP is not set
@@ -649,7 +687,9 @@ CONFIG_NET_CLS_ACT=y
649# CONFIG_NET_ACT_NAT is not set 687# CONFIG_NET_ACT_NAT is not set
650# CONFIG_NET_ACT_PEDIT is not set 688# CONFIG_NET_ACT_PEDIT is not set
651# CONFIG_NET_ACT_SIMP is not set 689# CONFIG_NET_ACT_SIMP is not set
690# CONFIG_NET_ACT_SKBEDIT is not set
652CONFIG_NET_SCH_FIFO=y 691CONFIG_NET_SCH_FIFO=y
692# CONFIG_DCB is not set
653 693
654# 694#
655# Network testing 695# Network testing
@@ -666,29 +706,33 @@ CONFIG_HAMRADIO=y
666# CONFIG_IRDA is not set 706# CONFIG_IRDA is not set
667# CONFIG_BT is not set 707# CONFIG_BT is not set
668# CONFIG_AF_RXRPC is not set 708# CONFIG_AF_RXRPC is not set
709# CONFIG_PHONET is not set
669CONFIG_FIB_RULES=y 710CONFIG_FIB_RULES=y
670 711CONFIG_WIRELESS=y
671#
672# Wireless
673#
674CONFIG_CFG80211=y 712CONFIG_CFG80211=y
713# CONFIG_CFG80211_REG_DEBUG is not set
675CONFIG_NL80211=y 714CONFIG_NL80211=y
715CONFIG_WIRELESS_OLD_REGULATORY=y
676CONFIG_WIRELESS_EXT=y 716CONFIG_WIRELESS_EXT=y
677CONFIG_WIRELESS_EXT_SYSFS=y 717CONFIG_WIRELESS_EXT_SYSFS=y
718# CONFIG_LIB80211 is not set
678CONFIG_MAC80211=y 719CONFIG_MAC80211=y
679 720
680# 721#
681# Rate control algorithm selection 722# Rate control algorithm selection
682# 723#
683CONFIG_MAC80211_RC_PID=y 724CONFIG_MAC80211_RC_MINSTREL=y
684CONFIG_MAC80211_RC_DEFAULT_PID=y 725# CONFIG_MAC80211_RC_DEFAULT_PID is not set
685CONFIG_MAC80211_RC_DEFAULT="pid" 726CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y
727CONFIG_MAC80211_RC_DEFAULT="minstrel"
686# CONFIG_MAC80211_MESH is not set 728# CONFIG_MAC80211_MESH is not set
687CONFIG_MAC80211_LEDS=y 729CONFIG_MAC80211_LEDS=y
688# CONFIG_MAC80211_DEBUGFS is not set 730# CONFIG_MAC80211_DEBUGFS is not set
689# CONFIG_MAC80211_DEBUG_MENU is not set 731# CONFIG_MAC80211_DEBUG_MENU is not set
690# CONFIG_IEEE80211 is not set 732# CONFIG_WIMAX is not set
691# CONFIG_RFKILL is not set 733CONFIG_RFKILL=y
734# CONFIG_RFKILL_INPUT is not set
735CONFIG_RFKILL_LEDS=y
692# CONFIG_NET_9P is not set 736# CONFIG_NET_9P is not set
693 737
694# 738#
@@ -712,7 +756,7 @@ CONFIG_PROC_EVENTS=y
712# CONFIG_MTD is not set 756# CONFIG_MTD is not set
713# CONFIG_PARPORT is not set 757# CONFIG_PARPORT is not set
714CONFIG_PNP=y 758CONFIG_PNP=y
715# CONFIG_PNP_DEBUG is not set 759CONFIG_PNP_DEBUG_MESSAGES=y
716 760
717# 761#
718# Protocols 762# Protocols
@@ -740,21 +784,21 @@ CONFIG_BLK_DEV_RAM_SIZE=16384
740CONFIG_MISC_DEVICES=y 784CONFIG_MISC_DEVICES=y
741# CONFIG_IBM_ASM is not set 785# CONFIG_IBM_ASM is not set
742# CONFIG_PHANTOM is not set 786# CONFIG_PHANTOM is not set
743# CONFIG_EEPROM_93CX6 is not set
744# CONFIG_SGI_IOC4 is not set 787# CONFIG_SGI_IOC4 is not set
745# CONFIG_TIFM_CORE is not set 788# CONFIG_TIFM_CORE is not set
746# CONFIG_ACER_WMI is not set 789# CONFIG_ICS932S401 is not set
747# CONFIG_ASUS_LAPTOP is not set
748# CONFIG_FUJITSU_LAPTOP is not set
749# CONFIG_MSI_LAPTOP is not set
750# CONFIG_COMPAL_LAPTOP is not set
751# CONFIG_SONY_LAPTOP is not set
752# CONFIG_THINKPAD_ACPI is not set
753# CONFIG_INTEL_MENLOW is not set
754# CONFIG_ENCLOSURE_SERVICES is not set 790# CONFIG_ENCLOSURE_SERVICES is not set
755# CONFIG_SGI_XP is not set 791# CONFIG_SGI_XP is not set
756# CONFIG_HP_ILO is not set 792# CONFIG_HP_ILO is not set
757# CONFIG_SGI_GRU is not set 793# CONFIG_SGI_GRU is not set
794# CONFIG_C2PORT is not set
795
796#
797# EEPROM support
798#
799# CONFIG_EEPROM_AT24 is not set
800# CONFIG_EEPROM_LEGACY is not set
801# CONFIG_EEPROM_93CX6 is not set
758CONFIG_HAVE_IDE=y 802CONFIG_HAVE_IDE=y
759# CONFIG_IDE is not set 803# CONFIG_IDE is not set
760 804
@@ -793,7 +837,7 @@ CONFIG_SCSI_WAIT_SCAN=m
793# 837#
794CONFIG_SCSI_SPI_ATTRS=y 838CONFIG_SCSI_SPI_ATTRS=y
795# CONFIG_SCSI_FC_ATTRS is not set 839# CONFIG_SCSI_FC_ATTRS is not set
796CONFIG_SCSI_ISCSI_ATTRS=y 840# CONFIG_SCSI_ISCSI_ATTRS is not set
797# CONFIG_SCSI_SAS_ATTRS is not set 841# CONFIG_SCSI_SAS_ATTRS is not set
798# CONFIG_SCSI_SAS_LIBSAS is not set 842# CONFIG_SCSI_SAS_LIBSAS is not set
799# CONFIG_SCSI_SRP_ATTRS is not set 843# CONFIG_SCSI_SRP_ATTRS is not set
@@ -864,6 +908,7 @@ CONFIG_PATA_OLDPIIX=y
864CONFIG_PATA_SCH=y 908CONFIG_PATA_SCH=y
865CONFIG_MD=y 909CONFIG_MD=y
866CONFIG_BLK_DEV_MD=y 910CONFIG_BLK_DEV_MD=y
911CONFIG_MD_AUTODETECT=y
867# CONFIG_MD_LINEAR is not set 912# CONFIG_MD_LINEAR is not set
868# CONFIG_MD_RAID0 is not set 913# CONFIG_MD_RAID0 is not set
869# CONFIG_MD_RAID1 is not set 914# CONFIG_MD_RAID1 is not set
@@ -919,6 +964,9 @@ CONFIG_PHYLIB=y
919# CONFIG_BROADCOM_PHY is not set 964# CONFIG_BROADCOM_PHY is not set
920# CONFIG_ICPLUS_PHY is not set 965# CONFIG_ICPLUS_PHY is not set
921# CONFIG_REALTEK_PHY is not set 966# CONFIG_REALTEK_PHY is not set
967# CONFIG_NATIONAL_PHY is not set
968# CONFIG_STE10XP is not set
969# CONFIG_LSI_ET1011C_PHY is not set
922# CONFIG_FIXED_PHY is not set 970# CONFIG_FIXED_PHY is not set
923# CONFIG_MDIO_BITBANG is not set 971# CONFIG_MDIO_BITBANG is not set
924CONFIG_NET_ETHERNET=y 972CONFIG_NET_ETHERNET=y
@@ -942,6 +990,9 @@ CONFIG_NET_TULIP=y
942# CONFIG_IBM_NEW_EMAC_RGMII is not set 990# CONFIG_IBM_NEW_EMAC_RGMII is not set
943# CONFIG_IBM_NEW_EMAC_TAH is not set 991# CONFIG_IBM_NEW_EMAC_TAH is not set
944# CONFIG_IBM_NEW_EMAC_EMAC4 is not set 992# CONFIG_IBM_NEW_EMAC_EMAC4 is not set
993# CONFIG_IBM_NEW_EMAC_NO_FLOW_CTRL is not set
994# CONFIG_IBM_NEW_EMAC_MAL_CLR_ICINTSTAT is not set
995# CONFIG_IBM_NEW_EMAC_MAL_COMMON_ERR is not set
945CONFIG_NET_PCI=y 996CONFIG_NET_PCI=y
946# CONFIG_PCNET32 is not set 997# CONFIG_PCNET32 is not set
947# CONFIG_AMD8111_ETH is not set 998# CONFIG_AMD8111_ETH is not set
@@ -949,7 +1000,6 @@ CONFIG_NET_PCI=y
949# CONFIG_B44 is not set 1000# CONFIG_B44 is not set
950CONFIG_FORCEDETH=y 1001CONFIG_FORCEDETH=y
951# CONFIG_FORCEDETH_NAPI is not set 1002# CONFIG_FORCEDETH_NAPI is not set
952# CONFIG_EEPRO100 is not set
953CONFIG_E100=y 1003CONFIG_E100=y
954# CONFIG_FEALNX is not set 1004# CONFIG_FEALNX is not set
955# CONFIG_NATSEMI is not set 1005# CONFIG_NATSEMI is not set
@@ -963,15 +1013,16 @@ CONFIG_8139TOO_PIO=y
963# CONFIG_R6040 is not set 1013# CONFIG_R6040 is not set
964# CONFIG_SIS900 is not set 1014# CONFIG_SIS900 is not set
965# CONFIG_EPIC100 is not set 1015# CONFIG_EPIC100 is not set
1016# CONFIG_SMSC9420 is not set
966# CONFIG_SUNDANCE is not set 1017# CONFIG_SUNDANCE is not set
967# CONFIG_TLAN is not set 1018# CONFIG_TLAN is not set
968# CONFIG_VIA_RHINE is not set 1019# CONFIG_VIA_RHINE is not set
969# CONFIG_SC92031 is not set 1020# CONFIG_SC92031 is not set
1021# CONFIG_ATL2 is not set
970CONFIG_NETDEV_1000=y 1022CONFIG_NETDEV_1000=y
971# CONFIG_ACENIC is not set 1023# CONFIG_ACENIC is not set
972# CONFIG_DL2K is not set 1024# CONFIG_DL2K is not set
973CONFIG_E1000=y 1025CONFIG_E1000=y
974# CONFIG_E1000_DISABLE_PACKET_SPLIT is not set
975# CONFIG_E1000E is not set 1026# CONFIG_E1000E is not set
976# CONFIG_IP1000 is not set 1027# CONFIG_IP1000 is not set
977# CONFIG_IGB is not set 1028# CONFIG_IGB is not set
@@ -989,18 +1040,23 @@ CONFIG_TIGON3=y
989# CONFIG_QLA3XXX is not set 1040# CONFIG_QLA3XXX is not set
990# CONFIG_ATL1 is not set 1041# CONFIG_ATL1 is not set
991# CONFIG_ATL1E is not set 1042# CONFIG_ATL1E is not set
1043# CONFIG_JME is not set
992CONFIG_NETDEV_10000=y 1044CONFIG_NETDEV_10000=y
993# CONFIG_CHELSIO_T1 is not set 1045# CONFIG_CHELSIO_T1 is not set
1046CONFIG_CHELSIO_T3_DEPENDS=y
994# CONFIG_CHELSIO_T3 is not set 1047# CONFIG_CHELSIO_T3 is not set
1048# CONFIG_ENIC is not set
995# CONFIG_IXGBE is not set 1049# CONFIG_IXGBE is not set
996# CONFIG_IXGB is not set 1050# CONFIG_IXGB is not set
997# CONFIG_S2IO is not set 1051# CONFIG_S2IO is not set
998# CONFIG_MYRI10GE is not set 1052# CONFIG_MYRI10GE is not set
999# CONFIG_NETXEN_NIC is not set 1053# CONFIG_NETXEN_NIC is not set
1000# CONFIG_NIU is not set 1054# CONFIG_NIU is not set
1055# CONFIG_MLX4_EN is not set
1001# CONFIG_MLX4_CORE is not set 1056# CONFIG_MLX4_CORE is not set
1002# CONFIG_TEHUTI is not set 1057# CONFIG_TEHUTI is not set
1003# CONFIG_BNX2X is not set 1058# CONFIG_BNX2X is not set
1059# CONFIG_QLGE is not set
1004# CONFIG_SFC is not set 1060# CONFIG_SFC is not set
1005CONFIG_TR=y 1061CONFIG_TR=y
1006# CONFIG_IBMOL is not set 1062# CONFIG_IBMOL is not set
@@ -1013,9 +1069,8 @@ CONFIG_TR=y
1013# CONFIG_WLAN_PRE80211 is not set 1069# CONFIG_WLAN_PRE80211 is not set
1014CONFIG_WLAN_80211=y 1070CONFIG_WLAN_80211=y
1015# CONFIG_PCMCIA_RAYCS is not set 1071# CONFIG_PCMCIA_RAYCS is not set
1016# CONFIG_IPW2100 is not set
1017# CONFIG_IPW2200 is not set
1018# CONFIG_LIBERTAS is not set 1072# CONFIG_LIBERTAS is not set
1073# CONFIG_LIBERTAS_THINFIRM is not set
1019# CONFIG_AIRO is not set 1074# CONFIG_AIRO is not set
1020# CONFIG_HERMES is not set 1075# CONFIG_HERMES is not set
1021# CONFIG_ATMEL is not set 1076# CONFIG_ATMEL is not set
@@ -1032,6 +1087,8 @@ CONFIG_WLAN_80211=y
1032CONFIG_ATH5K=y 1087CONFIG_ATH5K=y
1033# CONFIG_ATH5K_DEBUG is not set 1088# CONFIG_ATH5K_DEBUG is not set
1034# CONFIG_ATH9K is not set 1089# CONFIG_ATH9K is not set
1090# CONFIG_IPW2100 is not set
1091# CONFIG_IPW2200 is not set
1035# CONFIG_IWLCORE is not set 1092# CONFIG_IWLCORE is not set
1036# CONFIG_IWLWIFI_LEDS is not set 1093# CONFIG_IWLWIFI_LEDS is not set
1037# CONFIG_IWLAGN is not set 1094# CONFIG_IWLAGN is not set
@@ -1043,6 +1100,10 @@ CONFIG_ATH5K=y
1043# CONFIG_RT2X00 is not set 1100# CONFIG_RT2X00 is not set
1044 1101
1045# 1102#
1103# Enable WiMAX (Networking options) to see the WiMAX drivers
1104#
1105
1106#
1046# USB Network Adapters 1107# USB Network Adapters
1047# 1108#
1048# CONFIG_USB_CATC is not set 1109# CONFIG_USB_CATC is not set
@@ -1050,6 +1111,7 @@ CONFIG_ATH5K=y
1050# CONFIG_USB_PEGASUS is not set 1111# CONFIG_USB_PEGASUS is not set
1051# CONFIG_USB_RTL8150 is not set 1112# CONFIG_USB_RTL8150 is not set
1052# CONFIG_USB_USBNET is not set 1113# CONFIG_USB_USBNET is not set
1114# CONFIG_USB_HSO is not set
1053CONFIG_NET_PCMCIA=y 1115CONFIG_NET_PCMCIA=y
1054# CONFIG_PCMCIA_3C589 is not set 1116# CONFIG_PCMCIA_3C589 is not set
1055# CONFIG_PCMCIA_3C574 is not set 1117# CONFIG_PCMCIA_3C574 is not set
@@ -1059,6 +1121,7 @@ CONFIG_NET_PCMCIA=y
1059# CONFIG_PCMCIA_SMC91C92 is not set 1121# CONFIG_PCMCIA_SMC91C92 is not set
1060# CONFIG_PCMCIA_XIRC2PS is not set 1122# CONFIG_PCMCIA_XIRC2PS is not set
1061# CONFIG_PCMCIA_AXNET is not set 1123# CONFIG_PCMCIA_AXNET is not set
1124# CONFIG_PCMCIA_IBMTR is not set
1062# CONFIG_WAN is not set 1125# CONFIG_WAN is not set
1063CONFIG_FDDI=y 1126CONFIG_FDDI=y
1064# CONFIG_DEFXX is not set 1127# CONFIG_DEFXX is not set
@@ -1110,6 +1173,7 @@ CONFIG_MOUSE_PS2_LOGIPS2PP=y
1110CONFIG_MOUSE_PS2_SYNAPTICS=y 1173CONFIG_MOUSE_PS2_SYNAPTICS=y
1111CONFIG_MOUSE_PS2_LIFEBOOK=y 1174CONFIG_MOUSE_PS2_LIFEBOOK=y
1112CONFIG_MOUSE_PS2_TRACKPOINT=y 1175CONFIG_MOUSE_PS2_TRACKPOINT=y
1176# CONFIG_MOUSE_PS2_ELANTECH is not set
1113# CONFIG_MOUSE_PS2_TOUCHKIT is not set 1177# CONFIG_MOUSE_PS2_TOUCHKIT is not set
1114# CONFIG_MOUSE_SERIAL is not set 1178# CONFIG_MOUSE_SERIAL is not set
1115# CONFIG_MOUSE_APPLETOUCH is not set 1179# CONFIG_MOUSE_APPLETOUCH is not set
@@ -1147,15 +1211,16 @@ CONFIG_INPUT_TOUCHSCREEN=y
1147# CONFIG_TOUCHSCREEN_FUJITSU is not set 1211# CONFIG_TOUCHSCREEN_FUJITSU is not set
1148# CONFIG_TOUCHSCREEN_GUNZE is not set 1212# CONFIG_TOUCHSCREEN_GUNZE is not set
1149# CONFIG_TOUCHSCREEN_ELO is not set 1213# CONFIG_TOUCHSCREEN_ELO is not set
1214# CONFIG_TOUCHSCREEN_WACOM_W8001 is not set
1150# CONFIG_TOUCHSCREEN_MTOUCH is not set 1215# CONFIG_TOUCHSCREEN_MTOUCH is not set
1151# CONFIG_TOUCHSCREEN_INEXIO is not set 1216# CONFIG_TOUCHSCREEN_INEXIO is not set
1152# CONFIG_TOUCHSCREEN_MK712 is not set 1217# CONFIG_TOUCHSCREEN_MK712 is not set
1153# CONFIG_TOUCHSCREEN_PENMOUNT is not set 1218# CONFIG_TOUCHSCREEN_PENMOUNT is not set
1154# CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set 1219# CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set
1155# CONFIG_TOUCHSCREEN_TOUCHWIN is not set 1220# CONFIG_TOUCHSCREEN_TOUCHWIN is not set
1156# CONFIG_TOUCHSCREEN_UCB1400 is not set
1157# CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set 1221# CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set
1158# CONFIG_TOUCHSCREEN_TOUCHIT213 is not set 1222# CONFIG_TOUCHSCREEN_TOUCHIT213 is not set
1223# CONFIG_TOUCHSCREEN_TSC2007 is not set
1159CONFIG_INPUT_MISC=y 1224CONFIG_INPUT_MISC=y
1160# CONFIG_INPUT_PCSPKR is not set 1225# CONFIG_INPUT_PCSPKR is not set
1161# CONFIG_INPUT_APANEL is not set 1226# CONFIG_INPUT_APANEL is not set
@@ -1165,6 +1230,7 @@ CONFIG_INPUT_MISC=y
1165# CONFIG_INPUT_KEYSPAN_REMOTE is not set 1230# CONFIG_INPUT_KEYSPAN_REMOTE is not set
1166# CONFIG_INPUT_POWERMATE is not set 1231# CONFIG_INPUT_POWERMATE is not set
1167# CONFIG_INPUT_YEALINK is not set 1232# CONFIG_INPUT_YEALINK is not set
1233# CONFIG_INPUT_CM109 is not set
1168# CONFIG_INPUT_UINPUT is not set 1234# CONFIG_INPUT_UINPUT is not set
1169 1235
1170# 1236#
@@ -1231,6 +1297,7 @@ CONFIG_SERIAL_CORE=y
1231CONFIG_SERIAL_CORE_CONSOLE=y 1297CONFIG_SERIAL_CORE_CONSOLE=y
1232# CONFIG_SERIAL_JSM is not set 1298# CONFIG_SERIAL_JSM is not set
1233CONFIG_UNIX98_PTYS=y 1299CONFIG_UNIX98_PTYS=y
1300# CONFIG_DEVPTS_MULTIPLE_INSTANCES is not set
1234# CONFIG_LEGACY_PTYS is not set 1301# CONFIG_LEGACY_PTYS is not set
1235# CONFIG_IPMI_HANDLER is not set 1302# CONFIG_IPMI_HANDLER is not set
1236CONFIG_HW_RANDOM=y 1303CONFIG_HW_RANDOM=y
@@ -1260,6 +1327,7 @@ CONFIG_I2C=y
1260CONFIG_I2C_BOARDINFO=y 1327CONFIG_I2C_BOARDINFO=y
1261# CONFIG_I2C_CHARDEV is not set 1328# CONFIG_I2C_CHARDEV is not set
1262CONFIG_I2C_HELPER_AUTO=y 1329CONFIG_I2C_HELPER_AUTO=y
1330CONFIG_I2C_ALGOBIT=y
1263 1331
1264# 1332#
1265# I2C Hardware Bus support 1333# I2C Hardware Bus support
@@ -1311,8 +1379,6 @@ CONFIG_I2C_I801=y
1311# Miscellaneous I2C Chip support 1379# Miscellaneous I2C Chip support
1312# 1380#
1313# CONFIG_DS1682 is not set 1381# CONFIG_DS1682 is not set
1314# CONFIG_EEPROM_AT24 is not set
1315# CONFIG_EEPROM_LEGACY is not set
1316# CONFIG_SENSORS_PCF8574 is not set 1382# CONFIG_SENSORS_PCF8574 is not set
1317# CONFIG_PCF8575 is not set 1383# CONFIG_PCF8575 is not set
1318# CONFIG_SENSORS_PCA9539 is not set 1384# CONFIG_SENSORS_PCA9539 is not set
@@ -1331,8 +1397,78 @@ CONFIG_POWER_SUPPLY=y
1331# CONFIG_POWER_SUPPLY_DEBUG is not set 1397# CONFIG_POWER_SUPPLY_DEBUG is not set
1332# CONFIG_PDA_POWER is not set 1398# CONFIG_PDA_POWER is not set
1333# CONFIG_BATTERY_DS2760 is not set 1399# CONFIG_BATTERY_DS2760 is not set
1334# CONFIG_HWMON is not set 1400# CONFIG_BATTERY_BQ27x00 is not set
1401CONFIG_HWMON=y
1402# CONFIG_HWMON_VID is not set
1403# CONFIG_SENSORS_ABITUGURU is not set
1404# CONFIG_SENSORS_ABITUGURU3 is not set
1405# CONFIG_SENSORS_AD7414 is not set
1406# CONFIG_SENSORS_AD7418 is not set
1407# CONFIG_SENSORS_ADM1021 is not set
1408# CONFIG_SENSORS_ADM1025 is not set
1409# CONFIG_SENSORS_ADM1026 is not set
1410# CONFIG_SENSORS_ADM1029 is not set
1411# CONFIG_SENSORS_ADM1031 is not set
1412# CONFIG_SENSORS_ADM9240 is not set
1413# CONFIG_SENSORS_ADT7462 is not set
1414# CONFIG_SENSORS_ADT7470 is not set
1415# CONFIG_SENSORS_ADT7473 is not set
1416# CONFIG_SENSORS_ADT7475 is not set
1417# CONFIG_SENSORS_K8TEMP is not set
1418# CONFIG_SENSORS_ASB100 is not set
1419# CONFIG_SENSORS_ATXP1 is not set
1420# CONFIG_SENSORS_DS1621 is not set
1421# CONFIG_SENSORS_I5K_AMB is not set
1422# CONFIG_SENSORS_F71805F is not set
1423# CONFIG_SENSORS_F71882FG is not set
1424# CONFIG_SENSORS_F75375S is not set
1425# CONFIG_SENSORS_FSCHER is not set
1426# CONFIG_SENSORS_FSCPOS is not set
1427# CONFIG_SENSORS_FSCHMD is not set
1428# CONFIG_SENSORS_GL518SM is not set
1429# CONFIG_SENSORS_GL520SM is not set
1430# CONFIG_SENSORS_CORETEMP is not set
1431# CONFIG_SENSORS_IT87 is not set
1432# CONFIG_SENSORS_LM63 is not set
1433# CONFIG_SENSORS_LM75 is not set
1434# CONFIG_SENSORS_LM77 is not set
1435# CONFIG_SENSORS_LM78 is not set
1436# CONFIG_SENSORS_LM80 is not set
1437# CONFIG_SENSORS_LM83 is not set
1438# CONFIG_SENSORS_LM85 is not set
1439# CONFIG_SENSORS_LM87 is not set
1440# CONFIG_SENSORS_LM90 is not set
1441# CONFIG_SENSORS_LM92 is not set
1442# CONFIG_SENSORS_LM93 is not set
1443# CONFIG_SENSORS_LTC4245 is not set
1444# CONFIG_SENSORS_MAX1619 is not set
1445# CONFIG_SENSORS_MAX6650 is not set
1446# CONFIG_SENSORS_PC87360 is not set
1447# CONFIG_SENSORS_PC87427 is not set
1448# CONFIG_SENSORS_SIS5595 is not set
1449# CONFIG_SENSORS_DME1737 is not set
1450# CONFIG_SENSORS_SMSC47M1 is not set
1451# CONFIG_SENSORS_SMSC47M192 is not set
1452# CONFIG_SENSORS_SMSC47B397 is not set
1453# CONFIG_SENSORS_ADS7828 is not set
1454# CONFIG_SENSORS_THMC50 is not set
1455# CONFIG_SENSORS_VIA686A is not set
1456# CONFIG_SENSORS_VT1211 is not set
1457# CONFIG_SENSORS_VT8231 is not set
1458# CONFIG_SENSORS_W83781D is not set
1459# CONFIG_SENSORS_W83791D is not set
1460# CONFIG_SENSORS_W83792D is not set
1461# CONFIG_SENSORS_W83793 is not set
1462# CONFIG_SENSORS_W83L785TS is not set
1463# CONFIG_SENSORS_W83L786NG is not set
1464# CONFIG_SENSORS_W83627HF is not set
1465# CONFIG_SENSORS_W83627EHF is not set
1466# CONFIG_SENSORS_HDAPS is not set
1467# CONFIG_SENSORS_LIS3LV02D is not set
1468# CONFIG_SENSORS_APPLESMC is not set
1469# CONFIG_HWMON_DEBUG_CHIP is not set
1335CONFIG_THERMAL=y 1470CONFIG_THERMAL=y
1471# CONFIG_THERMAL_HWMON is not set
1336CONFIG_WATCHDOG=y 1472CONFIG_WATCHDOG=y
1337# CONFIG_WATCHDOG_NOWAYOUT is not set 1473# CONFIG_WATCHDOG_NOWAYOUT is not set
1338 1474
@@ -1352,15 +1488,18 @@ CONFIG_WATCHDOG=y
1352# CONFIG_I6300ESB_WDT is not set 1488# CONFIG_I6300ESB_WDT is not set
1353# CONFIG_ITCO_WDT is not set 1489# CONFIG_ITCO_WDT is not set
1354# CONFIG_IT8712F_WDT is not set 1490# CONFIG_IT8712F_WDT is not set
1491# CONFIG_IT87_WDT is not set
1355# CONFIG_HP_WATCHDOG is not set 1492# CONFIG_HP_WATCHDOG is not set
1356# CONFIG_SC1200_WDT is not set 1493# CONFIG_SC1200_WDT is not set
1357# CONFIG_PC87413_WDT is not set 1494# CONFIG_PC87413_WDT is not set
1358# CONFIG_60XX_WDT is not set 1495# CONFIG_60XX_WDT is not set
1359# CONFIG_SBC8360_WDT is not set 1496# CONFIG_SBC8360_WDT is not set
1360# CONFIG_CPU5_WDT is not set 1497# CONFIG_CPU5_WDT is not set
1498# CONFIG_SMSC_SCH311X_WDT is not set
1361# CONFIG_SMSC37B787_WDT is not set 1499# CONFIG_SMSC37B787_WDT is not set
1362# CONFIG_W83627HF_WDT is not set 1500# CONFIG_W83627HF_WDT is not set
1363# CONFIG_W83697HF_WDT is not set 1501# CONFIG_W83697HF_WDT is not set
1502# CONFIG_W83697UG_WDT is not set
1364# CONFIG_W83877F_WDT is not set 1503# CONFIG_W83877F_WDT is not set
1365# CONFIG_W83977F_WDT is not set 1504# CONFIG_W83977F_WDT is not set
1366# CONFIG_MACHZ_WDT is not set 1505# CONFIG_MACHZ_WDT is not set
@@ -1376,11 +1515,11 @@ CONFIG_WATCHDOG=y
1376# USB-based Watchdog Cards 1515# USB-based Watchdog Cards
1377# 1516#
1378# CONFIG_USBPCWATCHDOG is not set 1517# CONFIG_USBPCWATCHDOG is not set
1518CONFIG_SSB_POSSIBLE=y
1379 1519
1380# 1520#
1381# Sonics Silicon Backplane 1521# Sonics Silicon Backplane
1382# 1522#
1383CONFIG_SSB_POSSIBLE=y
1384# CONFIG_SSB is not set 1523# CONFIG_SSB is not set
1385 1524
1386# 1525#
@@ -1389,7 +1528,13 @@ CONFIG_SSB_POSSIBLE=y
1389# CONFIG_MFD_CORE is not set 1528# CONFIG_MFD_CORE is not set
1390# CONFIG_MFD_SM501 is not set 1529# CONFIG_MFD_SM501 is not set
1391# CONFIG_HTC_PASIC3 is not set 1530# CONFIG_HTC_PASIC3 is not set
1531# CONFIG_TWL4030_CORE is not set
1392# CONFIG_MFD_TMIO is not set 1532# CONFIG_MFD_TMIO is not set
1533# CONFIG_PMIC_DA903X is not set
1534# CONFIG_MFD_WM8400 is not set
1535# CONFIG_MFD_WM8350_I2C is not set
1536# CONFIG_MFD_PCF50633 is not set
1537# CONFIG_REGULATOR is not set
1393 1538
1394# 1539#
1395# Multimedia devices 1540# Multimedia devices
@@ -1423,6 +1568,7 @@ CONFIG_DRM=y
1423# CONFIG_DRM_I810 is not set 1568# CONFIG_DRM_I810 is not set
1424# CONFIG_DRM_I830 is not set 1569# CONFIG_DRM_I830 is not set
1425CONFIG_DRM_I915=y 1570CONFIG_DRM_I915=y
1571CONFIG_DRM_I915_KMS=y
1426# CONFIG_DRM_MGA is not set 1572# CONFIG_DRM_MGA is not set
1427# CONFIG_DRM_SIS is not set 1573# CONFIG_DRM_SIS is not set
1428# CONFIG_DRM_VIA is not set 1574# CONFIG_DRM_VIA is not set
@@ -1432,6 +1578,7 @@ CONFIG_DRM_I915=y
1432CONFIG_FB=y 1578CONFIG_FB=y
1433# CONFIG_FIRMWARE_EDID is not set 1579# CONFIG_FIRMWARE_EDID is not set
1434# CONFIG_FB_DDC is not set 1580# CONFIG_FB_DDC is not set
1581# CONFIG_FB_BOOT_VESA_SUPPORT is not set
1435CONFIG_FB_CFB_FILLRECT=y 1582CONFIG_FB_CFB_FILLRECT=y
1436CONFIG_FB_CFB_COPYAREA=y 1583CONFIG_FB_CFB_COPYAREA=y
1437CONFIG_FB_CFB_IMAGEBLIT=y 1584CONFIG_FB_CFB_IMAGEBLIT=y
@@ -1460,7 +1607,6 @@ CONFIG_FB_TILEBLITTING=y
1460# CONFIG_FB_UVESA is not set 1607# CONFIG_FB_UVESA is not set
1461# CONFIG_FB_VESA is not set 1608# CONFIG_FB_VESA is not set
1462CONFIG_FB_EFI=y 1609CONFIG_FB_EFI=y
1463# CONFIG_FB_IMAC is not set
1464# CONFIG_FB_N411 is not set 1610# CONFIG_FB_N411 is not set
1465# CONFIG_FB_HGA is not set 1611# CONFIG_FB_HGA is not set
1466# CONFIG_FB_S1D13XXX is not set 1612# CONFIG_FB_S1D13XXX is not set
@@ -1475,6 +1621,7 @@ CONFIG_FB_EFI=y
1475# CONFIG_FB_S3 is not set 1621# CONFIG_FB_S3 is not set
1476# CONFIG_FB_SAVAGE is not set 1622# CONFIG_FB_SAVAGE is not set
1477# CONFIG_FB_SIS is not set 1623# CONFIG_FB_SIS is not set
1624# CONFIG_FB_VIA is not set
1478# CONFIG_FB_NEOMAGIC is not set 1625# CONFIG_FB_NEOMAGIC is not set
1479# CONFIG_FB_KYRO is not set 1626# CONFIG_FB_KYRO is not set
1480# CONFIG_FB_3DFX is not set 1627# CONFIG_FB_3DFX is not set
@@ -1486,12 +1633,15 @@ CONFIG_FB_EFI=y
1486# CONFIG_FB_CARMINE is not set 1633# CONFIG_FB_CARMINE is not set
1487# CONFIG_FB_GEODE is not set 1634# CONFIG_FB_GEODE is not set
1488# CONFIG_FB_VIRTUAL is not set 1635# CONFIG_FB_VIRTUAL is not set
1636# CONFIG_FB_METRONOME is not set
1637# CONFIG_FB_MB862XX is not set
1489CONFIG_BACKLIGHT_LCD_SUPPORT=y 1638CONFIG_BACKLIGHT_LCD_SUPPORT=y
1490# CONFIG_LCD_CLASS_DEVICE is not set 1639# CONFIG_LCD_CLASS_DEVICE is not set
1491CONFIG_BACKLIGHT_CLASS_DEVICE=y 1640CONFIG_BACKLIGHT_CLASS_DEVICE=y
1492# CONFIG_BACKLIGHT_CORGI is not set 1641CONFIG_BACKLIGHT_GENERIC=y
1493# CONFIG_BACKLIGHT_PROGEAR is not set 1642# CONFIG_BACKLIGHT_PROGEAR is not set
1494# CONFIG_BACKLIGHT_MBP_NVIDIA is not set 1643# CONFIG_BACKLIGHT_MBP_NVIDIA is not set
1644# CONFIG_BACKLIGHT_SAHARA is not set
1495 1645
1496# 1646#
1497# Display device support 1647# Display device support
@@ -1511,10 +1661,12 @@ CONFIG_LOGO=y
1511# CONFIG_LOGO_LINUX_VGA16 is not set 1661# CONFIG_LOGO_LINUX_VGA16 is not set
1512CONFIG_LOGO_LINUX_CLUT224=y 1662CONFIG_LOGO_LINUX_CLUT224=y
1513CONFIG_SOUND=y 1663CONFIG_SOUND=y
1664CONFIG_SOUND_OSS_CORE=y
1514CONFIG_SND=y 1665CONFIG_SND=y
1515CONFIG_SND_TIMER=y 1666CONFIG_SND_TIMER=y
1516CONFIG_SND_PCM=y 1667CONFIG_SND_PCM=y
1517CONFIG_SND_HWDEP=y 1668CONFIG_SND_HWDEP=y
1669CONFIG_SND_JACK=y
1518CONFIG_SND_SEQUENCER=y 1670CONFIG_SND_SEQUENCER=y
1519CONFIG_SND_SEQ_DUMMY=y 1671CONFIG_SND_SEQ_DUMMY=y
1520CONFIG_SND_OSSEMUL=y 1672CONFIG_SND_OSSEMUL=y
@@ -1522,6 +1674,8 @@ CONFIG_SND_MIXER_OSS=y
1522CONFIG_SND_PCM_OSS=y 1674CONFIG_SND_PCM_OSS=y
1523CONFIG_SND_PCM_OSS_PLUGINS=y 1675CONFIG_SND_PCM_OSS_PLUGINS=y
1524CONFIG_SND_SEQUENCER_OSS=y 1676CONFIG_SND_SEQUENCER_OSS=y
1677CONFIG_SND_HRTIMER=y
1678CONFIG_SND_SEQ_HRTIMER_DEFAULT=y
1525CONFIG_SND_DYNAMIC_MINORS=y 1679CONFIG_SND_DYNAMIC_MINORS=y
1526CONFIG_SND_SUPPORT_OLD_API=y 1680CONFIG_SND_SUPPORT_OLD_API=y
1527CONFIG_SND_VERBOSE_PROCFS=y 1681CONFIG_SND_VERBOSE_PROCFS=y
@@ -1575,11 +1729,16 @@ CONFIG_SND_PCI=y
1575# CONFIG_SND_FM801 is not set 1729# CONFIG_SND_FM801 is not set
1576CONFIG_SND_HDA_INTEL=y 1730CONFIG_SND_HDA_INTEL=y
1577CONFIG_SND_HDA_HWDEP=y 1731CONFIG_SND_HDA_HWDEP=y
1732# CONFIG_SND_HDA_RECONFIG is not set
1733# CONFIG_SND_HDA_INPUT_BEEP is not set
1578CONFIG_SND_HDA_CODEC_REALTEK=y 1734CONFIG_SND_HDA_CODEC_REALTEK=y
1579CONFIG_SND_HDA_CODEC_ANALOG=y 1735CONFIG_SND_HDA_CODEC_ANALOG=y
1580CONFIG_SND_HDA_CODEC_SIGMATEL=y 1736CONFIG_SND_HDA_CODEC_SIGMATEL=y
1581CONFIG_SND_HDA_CODEC_VIA=y 1737CONFIG_SND_HDA_CODEC_VIA=y
1582CONFIG_SND_HDA_CODEC_ATIHDMI=y 1738CONFIG_SND_HDA_CODEC_ATIHDMI=y
1739CONFIG_SND_HDA_CODEC_NVHDMI=y
1740CONFIG_SND_HDA_CODEC_INTELHDMI=y
1741CONFIG_SND_HDA_ELD=y
1583CONFIG_SND_HDA_CODEC_CONEXANT=y 1742CONFIG_SND_HDA_CODEC_CONEXANT=y
1584CONFIG_SND_HDA_CODEC_CMEDIA=y 1743CONFIG_SND_HDA_CODEC_CMEDIA=y
1585CONFIG_SND_HDA_CODEC_SI3054=y 1744CONFIG_SND_HDA_CODEC_SI3054=y
@@ -1612,6 +1771,7 @@ CONFIG_SND_USB=y
1612# CONFIG_SND_USB_AUDIO is not set 1771# CONFIG_SND_USB_AUDIO is not set
1613# CONFIG_SND_USB_USX2Y is not set 1772# CONFIG_SND_USB_USX2Y is not set
1614# CONFIG_SND_USB_CAIAQ is not set 1773# CONFIG_SND_USB_CAIAQ is not set
1774# CONFIG_SND_USB_US122L is not set
1615CONFIG_SND_PCMCIA=y 1775CONFIG_SND_PCMCIA=y
1616# CONFIG_SND_VXPOCKET is not set 1776# CONFIG_SND_VXPOCKET is not set
1617# CONFIG_SND_PDAUDIOCF is not set 1777# CONFIG_SND_PDAUDIOCF is not set
@@ -1626,15 +1786,37 @@ CONFIG_HIDRAW=y
1626# USB Input Devices 1786# USB Input Devices
1627# 1787#
1628CONFIG_USB_HID=y 1788CONFIG_USB_HID=y
1629CONFIG_USB_HIDINPUT_POWERBOOK=y
1630CONFIG_HID_FF=y
1631CONFIG_HID_PID=y 1789CONFIG_HID_PID=y
1790CONFIG_USB_HIDDEV=y
1791
1792#
1793# Special HID drivers
1794#
1795CONFIG_HID_COMPAT=y
1796CONFIG_HID_A4TECH=y
1797CONFIG_HID_APPLE=y
1798CONFIG_HID_BELKIN=y
1799CONFIG_HID_CHERRY=y
1800CONFIG_HID_CHICONY=y
1801CONFIG_HID_CYPRESS=y
1802CONFIG_HID_EZKEY=y
1803CONFIG_HID_GYRATION=y
1804CONFIG_HID_LOGITECH=y
1632CONFIG_LOGITECH_FF=y 1805CONFIG_LOGITECH_FF=y
1633# CONFIG_LOGIRUMBLEPAD2_FF is not set 1806# CONFIG_LOGIRUMBLEPAD2_FF is not set
1807CONFIG_HID_MICROSOFT=y
1808CONFIG_HID_MONTEREY=y
1809CONFIG_HID_NTRIG=y
1810CONFIG_HID_PANTHERLORD=y
1634CONFIG_PANTHERLORD_FF=y 1811CONFIG_PANTHERLORD_FF=y
1812CONFIG_HID_PETALYNX=y
1813CONFIG_HID_SAMSUNG=y
1814CONFIG_HID_SONY=y
1815CONFIG_HID_SUNPLUS=y
1816# CONFIG_GREENASIA_FF is not set
1817CONFIG_HID_TOPSEED=y
1635CONFIG_THRUSTMASTER_FF=y 1818CONFIG_THRUSTMASTER_FF=y
1636CONFIG_ZEROPLUS_FF=y 1819CONFIG_ZEROPLUS_FF=y
1637CONFIG_USB_HIDDEV=y
1638CONFIG_USB_SUPPORT=y 1820CONFIG_USB_SUPPORT=y
1639CONFIG_USB_ARCH_HAS_HCD=y 1821CONFIG_USB_ARCH_HAS_HCD=y
1640CONFIG_USB_ARCH_HAS_OHCI=y 1822CONFIG_USB_ARCH_HAS_OHCI=y
@@ -1652,6 +1834,8 @@ CONFIG_USB_DEVICEFS=y
1652CONFIG_USB_SUSPEND=y 1834CONFIG_USB_SUSPEND=y
1653# CONFIG_USB_OTG is not set 1835# CONFIG_USB_OTG is not set
1654CONFIG_USB_MON=y 1836CONFIG_USB_MON=y
1837# CONFIG_USB_WUSB is not set
1838# CONFIG_USB_WUSB_CBAF is not set
1655 1839
1656# 1840#
1657# USB Host Controller Drivers 1841# USB Host Controller Drivers
@@ -1660,6 +1844,7 @@ CONFIG_USB_MON=y
1660CONFIG_USB_EHCI_HCD=y 1844CONFIG_USB_EHCI_HCD=y
1661# CONFIG_USB_EHCI_ROOT_HUB_TT is not set 1845# CONFIG_USB_EHCI_ROOT_HUB_TT is not set
1662# CONFIG_USB_EHCI_TT_NEWSCHED is not set 1846# CONFIG_USB_EHCI_TT_NEWSCHED is not set
1847# CONFIG_USB_OXU210HP_HCD is not set
1663# CONFIG_USB_ISP116X_HCD is not set 1848# CONFIG_USB_ISP116X_HCD is not set
1664# CONFIG_USB_ISP1760_HCD is not set 1849# CONFIG_USB_ISP1760_HCD is not set
1665CONFIG_USB_OHCI_HCD=y 1850CONFIG_USB_OHCI_HCD=y
@@ -1669,6 +1854,8 @@ CONFIG_USB_OHCI_LITTLE_ENDIAN=y
1669CONFIG_USB_UHCI_HCD=y 1854CONFIG_USB_UHCI_HCD=y
1670# CONFIG_USB_SL811_HCD is not set 1855# CONFIG_USB_SL811_HCD is not set
1671# CONFIG_USB_R8A66597_HCD is not set 1856# CONFIG_USB_R8A66597_HCD is not set
1857# CONFIG_USB_WHCI_HCD is not set
1858# CONFIG_USB_HWA_HCD is not set
1672 1859
1673# 1860#
1674# USB Device Class drivers 1861# USB Device Class drivers
@@ -1676,20 +1863,20 @@ CONFIG_USB_UHCI_HCD=y
1676# CONFIG_USB_ACM is not set 1863# CONFIG_USB_ACM is not set
1677CONFIG_USB_PRINTER=y 1864CONFIG_USB_PRINTER=y
1678# CONFIG_USB_WDM is not set 1865# CONFIG_USB_WDM is not set
1866# CONFIG_USB_TMC is not set
1679 1867
1680# 1868#
1681# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' 1869# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed;
1682# 1870#
1683 1871
1684# 1872#
1685# may also be needed; see USB_STORAGE Help for more information 1873# see USB_STORAGE Help for more information
1686# 1874#
1687CONFIG_USB_STORAGE=y 1875CONFIG_USB_STORAGE=y
1688# CONFIG_USB_STORAGE_DEBUG is not set 1876# CONFIG_USB_STORAGE_DEBUG is not set
1689# CONFIG_USB_STORAGE_DATAFAB is not set 1877# CONFIG_USB_STORAGE_DATAFAB is not set
1690# CONFIG_USB_STORAGE_FREECOM is not set 1878# CONFIG_USB_STORAGE_FREECOM is not set
1691# CONFIG_USB_STORAGE_ISD200 is not set 1879# CONFIG_USB_STORAGE_ISD200 is not set
1692# CONFIG_USB_STORAGE_DPCM is not set
1693# CONFIG_USB_STORAGE_USBAT is not set 1880# CONFIG_USB_STORAGE_USBAT is not set
1694# CONFIG_USB_STORAGE_SDDR09 is not set 1881# CONFIG_USB_STORAGE_SDDR09 is not set
1695# CONFIG_USB_STORAGE_SDDR55 is not set 1882# CONFIG_USB_STORAGE_SDDR55 is not set
@@ -1697,7 +1884,6 @@ CONFIG_USB_STORAGE=y
1697# CONFIG_USB_STORAGE_ALAUDA is not set 1884# CONFIG_USB_STORAGE_ALAUDA is not set
1698# CONFIG_USB_STORAGE_ONETOUCH is not set 1885# CONFIG_USB_STORAGE_ONETOUCH is not set
1699# CONFIG_USB_STORAGE_KARMA is not set 1886# CONFIG_USB_STORAGE_KARMA is not set
1700# CONFIG_USB_STORAGE_SIERRA is not set
1701# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set 1887# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set
1702CONFIG_USB_LIBUSUAL=y 1888CONFIG_USB_LIBUSUAL=y
1703 1889
@@ -1718,6 +1904,7 @@ CONFIG_USB_LIBUSUAL=y
1718# CONFIG_USB_EMI62 is not set 1904# CONFIG_USB_EMI62 is not set
1719# CONFIG_USB_EMI26 is not set 1905# CONFIG_USB_EMI26 is not set
1720# CONFIG_USB_ADUTUX is not set 1906# CONFIG_USB_ADUTUX is not set
1907# CONFIG_USB_SEVSEG is not set
1721# CONFIG_USB_RIO500 is not set 1908# CONFIG_USB_RIO500 is not set
1722# CONFIG_USB_LEGOTOWER is not set 1909# CONFIG_USB_LEGOTOWER is not set
1723# CONFIG_USB_LCD is not set 1910# CONFIG_USB_LCD is not set
@@ -1735,7 +1922,13 @@ CONFIG_USB_LIBUSUAL=y
1735# CONFIG_USB_IOWARRIOR is not set 1922# CONFIG_USB_IOWARRIOR is not set
1736# CONFIG_USB_TEST is not set 1923# CONFIG_USB_TEST is not set
1737# CONFIG_USB_ISIGHTFW is not set 1924# CONFIG_USB_ISIGHTFW is not set
1925# CONFIG_USB_VST is not set
1738# CONFIG_USB_GADGET is not set 1926# CONFIG_USB_GADGET is not set
1927
1928#
1929# OTG and related infrastructure
1930#
1931# CONFIG_UWB is not set
1739# CONFIG_MMC is not set 1932# CONFIG_MMC is not set
1740# CONFIG_MEMSTICK is not set 1933# CONFIG_MEMSTICK is not set
1741CONFIG_NEW_LEDS=y 1934CONFIG_NEW_LEDS=y
@@ -1744,6 +1937,7 @@ CONFIG_LEDS_CLASS=y
1744# 1937#
1745# LED drivers 1938# LED drivers
1746# 1939#
1940# CONFIG_LEDS_ALIX2 is not set
1747# CONFIG_LEDS_PCA9532 is not set 1941# CONFIG_LEDS_PCA9532 is not set
1748# CONFIG_LEDS_CLEVO_MAIL is not set 1942# CONFIG_LEDS_CLEVO_MAIL is not set
1749# CONFIG_LEDS_PCA955X is not set 1943# CONFIG_LEDS_PCA955X is not set
@@ -1754,6 +1948,7 @@ CONFIG_LEDS_CLASS=y
1754CONFIG_LEDS_TRIGGERS=y 1948CONFIG_LEDS_TRIGGERS=y
1755# CONFIG_LEDS_TRIGGER_TIMER is not set 1949# CONFIG_LEDS_TRIGGER_TIMER is not set
1756# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set 1950# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set
1951# CONFIG_LEDS_TRIGGER_BACKLIGHT is not set
1757# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set 1952# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set
1758# CONFIG_ACCESSIBILITY is not set 1953# CONFIG_ACCESSIBILITY is not set
1759# CONFIG_INFINIBAND is not set 1954# CONFIG_INFINIBAND is not set
@@ -1793,6 +1988,7 @@ CONFIG_RTC_INTF_DEV=y
1793# CONFIG_RTC_DRV_M41T80 is not set 1988# CONFIG_RTC_DRV_M41T80 is not set
1794# CONFIG_RTC_DRV_S35390A is not set 1989# CONFIG_RTC_DRV_S35390A is not set
1795# CONFIG_RTC_DRV_FM3130 is not set 1990# CONFIG_RTC_DRV_FM3130 is not set
1991# CONFIG_RTC_DRV_RX8581 is not set
1796 1992
1797# 1993#
1798# SPI RTC drivers 1994# SPI RTC drivers
@@ -1802,12 +1998,15 @@ CONFIG_RTC_INTF_DEV=y
1802# Platform RTC drivers 1998# Platform RTC drivers
1803# 1999#
1804CONFIG_RTC_DRV_CMOS=y 2000CONFIG_RTC_DRV_CMOS=y
2001# CONFIG_RTC_DRV_DS1286 is not set
1805# CONFIG_RTC_DRV_DS1511 is not set 2002# CONFIG_RTC_DRV_DS1511 is not set
1806# CONFIG_RTC_DRV_DS1553 is not set 2003# CONFIG_RTC_DRV_DS1553 is not set
1807# CONFIG_RTC_DRV_DS1742 is not set 2004# CONFIG_RTC_DRV_DS1742 is not set
1808# CONFIG_RTC_DRV_STK17TA8 is not set 2005# CONFIG_RTC_DRV_STK17TA8 is not set
1809# CONFIG_RTC_DRV_M48T86 is not set 2006# CONFIG_RTC_DRV_M48T86 is not set
2007# CONFIG_RTC_DRV_M48T35 is not set
1810# CONFIG_RTC_DRV_M48T59 is not set 2008# CONFIG_RTC_DRV_M48T59 is not set
2009# CONFIG_RTC_DRV_BQ4802 is not set
1811# CONFIG_RTC_DRV_V3020 is not set 2010# CONFIG_RTC_DRV_V3020 is not set
1812 2011
1813# 2012#
@@ -1820,6 +2019,21 @@ CONFIG_DMADEVICES=y
1820# 2019#
1821# CONFIG_INTEL_IOATDMA is not set 2020# CONFIG_INTEL_IOATDMA is not set
1822# CONFIG_UIO is not set 2021# CONFIG_UIO is not set
2022# CONFIG_STAGING is not set
2023CONFIG_X86_PLATFORM_DEVICES=y
2024# CONFIG_ACER_WMI is not set
2025# CONFIG_ASUS_LAPTOP is not set
2026# CONFIG_FUJITSU_LAPTOP is not set
2027# CONFIG_MSI_LAPTOP is not set
2028# CONFIG_PANASONIC_LAPTOP is not set
2029# CONFIG_COMPAL_LAPTOP is not set
2030# CONFIG_SONY_LAPTOP is not set
2031# CONFIG_THINKPAD_ACPI is not set
2032# CONFIG_INTEL_MENLOW is not set
2033CONFIG_EEEPC_LAPTOP=y
2034# CONFIG_ACPI_WMI is not set
2035# CONFIG_ACPI_ASUS is not set
2036# CONFIG_ACPI_TOSHIBA is not set
1823 2037
1824# 2038#
1825# Firmware Drivers 2039# Firmware Drivers
@@ -1830,8 +2044,7 @@ CONFIG_EFI_VARS=y
1830# CONFIG_DELL_RBU is not set 2044# CONFIG_DELL_RBU is not set
1831# CONFIG_DCDBAS is not set 2045# CONFIG_DCDBAS is not set
1832CONFIG_DMIID=y 2046CONFIG_DMIID=y
1833CONFIG_ISCSI_IBFT_FIND=y 2047# CONFIG_ISCSI_IBFT_FIND is not set
1834CONFIG_ISCSI_IBFT=y
1835 2048
1836# 2049#
1837# File systems 2050# File systems
@@ -1841,22 +2054,25 @@ CONFIG_EXT3_FS=y
1841CONFIG_EXT3_FS_XATTR=y 2054CONFIG_EXT3_FS_XATTR=y
1842CONFIG_EXT3_FS_POSIX_ACL=y 2055CONFIG_EXT3_FS_POSIX_ACL=y
1843CONFIG_EXT3_FS_SECURITY=y 2056CONFIG_EXT3_FS_SECURITY=y
1844# CONFIG_EXT4DEV_FS is not set 2057# CONFIG_EXT4_FS is not set
1845CONFIG_JBD=y 2058CONFIG_JBD=y
1846# CONFIG_JBD_DEBUG is not set 2059# CONFIG_JBD_DEBUG is not set
1847CONFIG_FS_MBCACHE=y 2060CONFIG_FS_MBCACHE=y
1848# CONFIG_REISERFS_FS is not set 2061# CONFIG_REISERFS_FS is not set
1849# CONFIG_JFS_FS is not set 2062# CONFIG_JFS_FS is not set
1850CONFIG_FS_POSIX_ACL=y 2063CONFIG_FS_POSIX_ACL=y
2064CONFIG_FILE_LOCKING=y
1851# CONFIG_XFS_FS is not set 2065# CONFIG_XFS_FS is not set
1852# CONFIG_GFS2_FS is not set 2066# CONFIG_GFS2_FS is not set
1853# CONFIG_OCFS2_FS is not set 2067# CONFIG_OCFS2_FS is not set
2068# CONFIG_BTRFS_FS is not set
1854CONFIG_DNOTIFY=y 2069CONFIG_DNOTIFY=y
1855CONFIG_INOTIFY=y 2070CONFIG_INOTIFY=y
1856CONFIG_INOTIFY_USER=y 2071CONFIG_INOTIFY_USER=y
1857CONFIG_QUOTA=y 2072CONFIG_QUOTA=y
1858CONFIG_QUOTA_NETLINK_INTERFACE=y 2073CONFIG_QUOTA_NETLINK_INTERFACE=y
1859# CONFIG_PRINT_QUOTA_WARNING is not set 2074# CONFIG_PRINT_QUOTA_WARNING is not set
2075CONFIG_QUOTA_TREE=y
1860# CONFIG_QFMT_V1 is not set 2076# CONFIG_QFMT_V1 is not set
1861CONFIG_QFMT_V2=y 2077CONFIG_QFMT_V2=y
1862CONFIG_QUOTACTL=y 2078CONFIG_QUOTACTL=y
@@ -1890,16 +2106,14 @@ CONFIG_PROC_FS=y
1890CONFIG_PROC_KCORE=y 2106CONFIG_PROC_KCORE=y
1891CONFIG_PROC_VMCORE=y 2107CONFIG_PROC_VMCORE=y
1892CONFIG_PROC_SYSCTL=y 2108CONFIG_PROC_SYSCTL=y
2109CONFIG_PROC_PAGE_MONITOR=y
1893CONFIG_SYSFS=y 2110CONFIG_SYSFS=y
1894CONFIG_TMPFS=y 2111CONFIG_TMPFS=y
1895CONFIG_TMPFS_POSIX_ACL=y 2112CONFIG_TMPFS_POSIX_ACL=y
1896CONFIG_HUGETLBFS=y 2113CONFIG_HUGETLBFS=y
1897CONFIG_HUGETLB_PAGE=y 2114CONFIG_HUGETLB_PAGE=y
1898# CONFIG_CONFIGFS_FS is not set 2115# CONFIG_CONFIGFS_FS is not set
1899 2116CONFIG_MISC_FILESYSTEMS=y
1900#
1901# Miscellaneous filesystems
1902#
1903# CONFIG_ADFS_FS is not set 2117# CONFIG_ADFS_FS is not set
1904# CONFIG_AFFS_FS is not set 2118# CONFIG_AFFS_FS is not set
1905# CONFIG_ECRYPT_FS is not set 2119# CONFIG_ECRYPT_FS is not set
@@ -1909,6 +2123,7 @@ CONFIG_HUGETLB_PAGE=y
1909# CONFIG_BFS_FS is not set 2123# CONFIG_BFS_FS is not set
1910# CONFIG_EFS_FS is not set 2124# CONFIG_EFS_FS is not set
1911# CONFIG_CRAMFS is not set 2125# CONFIG_CRAMFS is not set
2126# CONFIG_SQUASHFS is not set
1912# CONFIG_VXFS_FS is not set 2127# CONFIG_VXFS_FS is not set
1913# CONFIG_MINIX_FS is not set 2128# CONFIG_MINIX_FS is not set
1914# CONFIG_OMFS_FS is not set 2129# CONFIG_OMFS_FS is not set
@@ -1930,6 +2145,7 @@ CONFIG_NFS_ACL_SUPPORT=y
1930CONFIG_NFS_COMMON=y 2145CONFIG_NFS_COMMON=y
1931CONFIG_SUNRPC=y 2146CONFIG_SUNRPC=y
1932CONFIG_SUNRPC_GSS=y 2147CONFIG_SUNRPC_GSS=y
2148# CONFIG_SUNRPC_REGISTER_V4 is not set
1933CONFIG_RPCSEC_GSS_KRB5=y 2149CONFIG_RPCSEC_GSS_KRB5=y
1934# CONFIG_RPCSEC_GSS_SPKM3 is not set 2150# CONFIG_RPCSEC_GSS_SPKM3 is not set
1935# CONFIG_SMB_FS is not set 2151# CONFIG_SMB_FS is not set
@@ -2006,7 +2222,7 @@ CONFIG_NLS_UTF8=y
2006# 2222#
2007CONFIG_TRACE_IRQFLAGS_SUPPORT=y 2223CONFIG_TRACE_IRQFLAGS_SUPPORT=y
2008CONFIG_PRINTK_TIME=y 2224CONFIG_PRINTK_TIME=y
2009CONFIG_ENABLE_WARN_DEPRECATED=y 2225# CONFIG_ENABLE_WARN_DEPRECATED is not set
2010CONFIG_ENABLE_MUST_CHECK=y 2226CONFIG_ENABLE_MUST_CHECK=y
2011CONFIG_FRAME_WARN=2048 2227CONFIG_FRAME_WARN=2048
2012CONFIG_MAGIC_SYSRQ=y 2228CONFIG_MAGIC_SYSRQ=y
@@ -2035,40 +2251,60 @@ CONFIG_TIMER_STATS=y
2035CONFIG_DEBUG_BUGVERBOSE=y 2251CONFIG_DEBUG_BUGVERBOSE=y
2036# CONFIG_DEBUG_INFO is not set 2252# CONFIG_DEBUG_INFO is not set
2037# CONFIG_DEBUG_VM is not set 2253# CONFIG_DEBUG_VM is not set
2254# CONFIG_DEBUG_VIRTUAL is not set
2038# CONFIG_DEBUG_WRITECOUNT is not set 2255# CONFIG_DEBUG_WRITECOUNT is not set
2039CONFIG_DEBUG_MEMORY_INIT=y 2256CONFIG_DEBUG_MEMORY_INIT=y
2040# CONFIG_DEBUG_LIST is not set 2257# CONFIG_DEBUG_LIST is not set
2041# CONFIG_DEBUG_SG is not set 2258# CONFIG_DEBUG_SG is not set
2259# CONFIG_DEBUG_NOTIFIERS is not set
2260CONFIG_ARCH_WANT_FRAME_POINTERS=y
2042CONFIG_FRAME_POINTER=y 2261CONFIG_FRAME_POINTER=y
2043# CONFIG_BOOT_PRINTK_DELAY is not set 2262# CONFIG_BOOT_PRINTK_DELAY is not set
2044# CONFIG_RCU_TORTURE_TEST is not set 2263# CONFIG_RCU_TORTURE_TEST is not set
2264# CONFIG_RCU_CPU_STALL_DETECTOR is not set
2045# CONFIG_KPROBES_SANITY_TEST is not set 2265# CONFIG_KPROBES_SANITY_TEST is not set
2046# CONFIG_BACKTRACE_SELF_TEST is not set 2266# CONFIG_BACKTRACE_SELF_TEST is not set
2267# CONFIG_DEBUG_BLOCK_EXT_DEVT is not set
2047# CONFIG_LKDTM is not set 2268# CONFIG_LKDTM is not set
2048# CONFIG_FAULT_INJECTION is not set 2269# CONFIG_FAULT_INJECTION is not set
2049# CONFIG_LATENCYTOP is not set 2270# CONFIG_LATENCYTOP is not set
2050CONFIG_SYSCTL_SYSCALL_CHECK=y 2271CONFIG_SYSCTL_SYSCALL_CHECK=y
2051CONFIG_HAVE_FTRACE=y 2272CONFIG_USER_STACKTRACE_SUPPORT=y
2273CONFIG_HAVE_FUNCTION_TRACER=y
2274CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
2275CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
2052CONFIG_HAVE_DYNAMIC_FTRACE=y 2276CONFIG_HAVE_DYNAMIC_FTRACE=y
2053# CONFIG_FTRACE is not set 2277CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
2278CONFIG_HAVE_HW_BRANCH_TRACER=y
2279
2280#
2281# Tracers
2282#
2283# CONFIG_FUNCTION_TRACER is not set
2054# CONFIG_IRQSOFF_TRACER is not set 2284# CONFIG_IRQSOFF_TRACER is not set
2055# CONFIG_SYSPROF_TRACER is not set 2285# CONFIG_SYSPROF_TRACER is not set
2056# CONFIG_SCHED_TRACER is not set 2286# CONFIG_SCHED_TRACER is not set
2057# CONFIG_CONTEXT_SWITCH_TRACER is not set 2287# CONFIG_CONTEXT_SWITCH_TRACER is not set
2288# CONFIG_BOOT_TRACER is not set
2289# CONFIG_TRACE_BRANCH_PROFILING is not set
2290# CONFIG_POWER_TRACER is not set
2291# CONFIG_STACK_TRACER is not set
2292# CONFIG_HW_BRANCH_TRACER is not set
2058CONFIG_PROVIDE_OHCI1394_DMA_INIT=y 2293CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
2294# CONFIG_DYNAMIC_PRINTK_DEBUG is not set
2059# CONFIG_SAMPLES is not set 2295# CONFIG_SAMPLES is not set
2060CONFIG_HAVE_ARCH_KGDB=y 2296CONFIG_HAVE_ARCH_KGDB=y
2061# CONFIG_KGDB is not set 2297# CONFIG_KGDB is not set
2062# CONFIG_STRICT_DEVMEM is not set 2298# CONFIG_STRICT_DEVMEM is not set
2063CONFIG_X86_VERBOSE_BOOTUP=y 2299CONFIG_X86_VERBOSE_BOOTUP=y
2064CONFIG_EARLY_PRINTK=y 2300CONFIG_EARLY_PRINTK=y
2301CONFIG_EARLY_PRINTK_DBGP=y
2065CONFIG_DEBUG_STACKOVERFLOW=y 2302CONFIG_DEBUG_STACKOVERFLOW=y
2066CONFIG_DEBUG_STACK_USAGE=y 2303CONFIG_DEBUG_STACK_USAGE=y
2067# CONFIG_DEBUG_PAGEALLOC is not set 2304# CONFIG_DEBUG_PAGEALLOC is not set
2068# CONFIG_DEBUG_PER_CPU_MAPS is not set 2305# CONFIG_DEBUG_PER_CPU_MAPS is not set
2069# CONFIG_X86_PTDUMP is not set 2306# CONFIG_X86_PTDUMP is not set
2070CONFIG_DEBUG_RODATA=y 2307CONFIG_DEBUG_RODATA=y
2071# CONFIG_DIRECT_GBPAGES is not set
2072# CONFIG_DEBUG_RODATA_TEST is not set 2308# CONFIG_DEBUG_RODATA_TEST is not set
2073CONFIG_DEBUG_NX_TEST=m 2309CONFIG_DEBUG_NX_TEST=m
2074# CONFIG_IOMMU_DEBUG is not set 2310# CONFIG_IOMMU_DEBUG is not set
@@ -2092,8 +2328,10 @@ CONFIG_OPTIMIZE_INLINING=y
2092CONFIG_KEYS=y 2328CONFIG_KEYS=y
2093CONFIG_KEYS_DEBUG_PROC_KEYS=y 2329CONFIG_KEYS_DEBUG_PROC_KEYS=y
2094CONFIG_SECURITY=y 2330CONFIG_SECURITY=y
2331# CONFIG_SECURITYFS is not set
2095CONFIG_SECURITY_NETWORK=y 2332CONFIG_SECURITY_NETWORK=y
2096# CONFIG_SECURITY_NETWORK_XFRM is not set 2333# CONFIG_SECURITY_NETWORK_XFRM is not set
2334# CONFIG_SECURITY_PATH is not set
2097CONFIG_SECURITY_FILE_CAPABILITIES=y 2335CONFIG_SECURITY_FILE_CAPABILITIES=y
2098# CONFIG_SECURITY_ROOTPLUG is not set 2336# CONFIG_SECURITY_ROOTPLUG is not set
2099CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR=65536 2337CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR=65536
@@ -2104,7 +2342,6 @@ CONFIG_SECURITY_SELINUX_DISABLE=y
2104CONFIG_SECURITY_SELINUX_DEVELOP=y 2342CONFIG_SECURITY_SELINUX_DEVELOP=y
2105CONFIG_SECURITY_SELINUX_AVC_STATS=y 2343CONFIG_SECURITY_SELINUX_AVC_STATS=y
2106CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1 2344CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
2107# CONFIG_SECURITY_SELINUX_ENABLE_SECMARK_DEFAULT is not set
2108# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set 2345# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set
2109# CONFIG_SECURITY_SMACK is not set 2346# CONFIG_SECURITY_SMACK is not set
2110CONFIG_CRYPTO=y 2347CONFIG_CRYPTO=y
@@ -2112,11 +2349,18 @@ CONFIG_CRYPTO=y
2112# 2349#
2113# Crypto core or helper 2350# Crypto core or helper
2114# 2351#
2352# CONFIG_CRYPTO_FIPS is not set
2115CONFIG_CRYPTO_ALGAPI=y 2353CONFIG_CRYPTO_ALGAPI=y
2354CONFIG_CRYPTO_ALGAPI2=y
2116CONFIG_CRYPTO_AEAD=y 2355CONFIG_CRYPTO_AEAD=y
2356CONFIG_CRYPTO_AEAD2=y
2117CONFIG_CRYPTO_BLKCIPHER=y 2357CONFIG_CRYPTO_BLKCIPHER=y
2358CONFIG_CRYPTO_BLKCIPHER2=y
2118CONFIG_CRYPTO_HASH=y 2359CONFIG_CRYPTO_HASH=y
2360CONFIG_CRYPTO_HASH2=y
2361CONFIG_CRYPTO_RNG2=y
2119CONFIG_CRYPTO_MANAGER=y 2362CONFIG_CRYPTO_MANAGER=y
2363CONFIG_CRYPTO_MANAGER2=y
2120# CONFIG_CRYPTO_GF128MUL is not set 2364# CONFIG_CRYPTO_GF128MUL is not set
2121# CONFIG_CRYPTO_NULL is not set 2365# CONFIG_CRYPTO_NULL is not set
2122# CONFIG_CRYPTO_CRYPTD is not set 2366# CONFIG_CRYPTO_CRYPTD is not set
@@ -2151,6 +2395,7 @@ CONFIG_CRYPTO_HMAC=y
2151# Digest 2395# Digest
2152# 2396#
2153# CONFIG_CRYPTO_CRC32C is not set 2397# CONFIG_CRYPTO_CRC32C is not set
2398# CONFIG_CRYPTO_CRC32C_INTEL is not set
2154# CONFIG_CRYPTO_MD4 is not set 2399# CONFIG_CRYPTO_MD4 is not set
2155CONFIG_CRYPTO_MD5=y 2400CONFIG_CRYPTO_MD5=y
2156# CONFIG_CRYPTO_MICHAEL_MIC is not set 2401# CONFIG_CRYPTO_MICHAEL_MIC is not set
@@ -2191,6 +2436,11 @@ CONFIG_CRYPTO_DES=y
2191# 2436#
2192# CONFIG_CRYPTO_DEFLATE is not set 2437# CONFIG_CRYPTO_DEFLATE is not set
2193# CONFIG_CRYPTO_LZO is not set 2438# CONFIG_CRYPTO_LZO is not set
2439
2440#
2441# Random Number Generation
2442#
2443# CONFIG_CRYPTO_ANSI_CPRNG is not set
2194CONFIG_CRYPTO_HW=y 2444CONFIG_CRYPTO_HW=y
2195# CONFIG_CRYPTO_DEV_HIFN_795X is not set 2445# CONFIG_CRYPTO_DEV_HIFN_795X is not set
2196CONFIG_HAVE_KVM=y 2446CONFIG_HAVE_KVM=y
@@ -2205,6 +2455,7 @@ CONFIG_VIRTUALIZATION=y
2205CONFIG_BITREVERSE=y 2455CONFIG_BITREVERSE=y
2206CONFIG_GENERIC_FIND_FIRST_BIT=y 2456CONFIG_GENERIC_FIND_FIRST_BIT=y
2207CONFIG_GENERIC_FIND_NEXT_BIT=y 2457CONFIG_GENERIC_FIND_NEXT_BIT=y
2458CONFIG_GENERIC_FIND_LAST_BIT=y
2208# CONFIG_CRC_CCITT is not set 2459# CONFIG_CRC_CCITT is not set
2209# CONFIG_CRC16 is not set 2460# CONFIG_CRC16 is not set
2210CONFIG_CRC_T10DIF=y 2461CONFIG_CRC_T10DIF=y
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 9dabd00e9805..588a7aa937e1 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -33,8 +33,6 @@
33#include <asm/sigframe.h> 33#include <asm/sigframe.h>
34#include <asm/sys_ia32.h> 34#include <asm/sys_ia32.h>
35 35
36#define DEBUG_SIG 0
37
38#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) 36#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
39 37
40#define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \ 38#define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \
@@ -46,78 +44,83 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
46 44
47int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) 45int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
48{ 46{
49 int err; 47 int err = 0;
50 48
51 if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t))) 49 if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
52 return -EFAULT; 50 return -EFAULT;
53 51
54 /* If you change siginfo_t structure, please make sure that 52 put_user_try {
55 this code is fixed accordingly. 53 /* If you change siginfo_t structure, please make sure that
56 It should never copy any pad contained in the structure 54 this code is fixed accordingly.
57 to avoid security leaks, but must copy the generic 55 It should never copy any pad contained in the structure
58 3 ints plus the relevant union member. */ 56 to avoid security leaks, but must copy the generic
59 err = __put_user(from->si_signo, &to->si_signo); 57 3 ints plus the relevant union member. */
60 err |= __put_user(from->si_errno, &to->si_errno); 58 put_user_ex(from->si_signo, &to->si_signo);
61 err |= __put_user((short)from->si_code, &to->si_code); 59 put_user_ex(from->si_errno, &to->si_errno);
62 60 put_user_ex((short)from->si_code, &to->si_code);
63 if (from->si_code < 0) { 61
64 err |= __put_user(from->si_pid, &to->si_pid); 62 if (from->si_code < 0) {
65 err |= __put_user(from->si_uid, &to->si_uid); 63 put_user_ex(from->si_pid, &to->si_pid);
66 err |= __put_user(ptr_to_compat(from->si_ptr), &to->si_ptr); 64 put_user_ex(from->si_uid, &to->si_uid);
67 } else { 65 put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr);
68 /* 66 } else {
69 * First 32bits of unions are always present: 67 /*
70 * si_pid === si_band === si_tid === si_addr(LS half) 68 * First 32bits of unions are always present:
71 */ 69 * si_pid === si_band === si_tid === si_addr(LS half)
72 err |= __put_user(from->_sifields._pad[0], 70 */
73 &to->_sifields._pad[0]); 71 put_user_ex(from->_sifields._pad[0],
74 switch (from->si_code >> 16) { 72 &to->_sifields._pad[0]);
75 case __SI_FAULT >> 16: 73 switch (from->si_code >> 16) {
76 break; 74 case __SI_FAULT >> 16:
77 case __SI_CHLD >> 16: 75 break;
78 err |= __put_user(from->si_utime, &to->si_utime); 76 case __SI_CHLD >> 16:
79 err |= __put_user(from->si_stime, &to->si_stime); 77 put_user_ex(from->si_utime, &to->si_utime);
80 err |= __put_user(from->si_status, &to->si_status); 78 put_user_ex(from->si_stime, &to->si_stime);
81 /* FALL THROUGH */ 79 put_user_ex(from->si_status, &to->si_status);
82 default: 80 /* FALL THROUGH */
83 case __SI_KILL >> 16: 81 default:
84 err |= __put_user(from->si_uid, &to->si_uid); 82 case __SI_KILL >> 16:
85 break; 83 put_user_ex(from->si_uid, &to->si_uid);
86 case __SI_POLL >> 16: 84 break;
87 err |= __put_user(from->si_fd, &to->si_fd); 85 case __SI_POLL >> 16:
88 break; 86 put_user_ex(from->si_fd, &to->si_fd);
89 case __SI_TIMER >> 16: 87 break;
90 err |= __put_user(from->si_overrun, &to->si_overrun); 88 case __SI_TIMER >> 16:
91 err |= __put_user(ptr_to_compat(from->si_ptr), 89 put_user_ex(from->si_overrun, &to->si_overrun);
92 &to->si_ptr); 90 put_user_ex(ptr_to_compat(from->si_ptr),
93 break; 91 &to->si_ptr);
94 /* This is not generated by the kernel as of now. */ 92 break;
95 case __SI_RT >> 16: 93 /* This is not generated by the kernel as of now. */
96 case __SI_MESGQ >> 16: 94 case __SI_RT >> 16:
97 err |= __put_user(from->si_uid, &to->si_uid); 95 case __SI_MESGQ >> 16:
98 err |= __put_user(from->si_int, &to->si_int); 96 put_user_ex(from->si_uid, &to->si_uid);
99 break; 97 put_user_ex(from->si_int, &to->si_int);
98 break;
99 }
100 } 100 }
101 } 101 } put_user_catch(err);
102
102 return err; 103 return err;
103} 104}
104 105
105int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) 106int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
106{ 107{
107 int err; 108 int err = 0;
108 u32 ptr32; 109 u32 ptr32;
109 110
110 if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t))) 111 if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t)))
111 return -EFAULT; 112 return -EFAULT;
112 113
113 err = __get_user(to->si_signo, &from->si_signo); 114 get_user_try {
114 err |= __get_user(to->si_errno, &from->si_errno); 115 get_user_ex(to->si_signo, &from->si_signo);
115 err |= __get_user(to->si_code, &from->si_code); 116 get_user_ex(to->si_errno, &from->si_errno);
117 get_user_ex(to->si_code, &from->si_code);
116 118
117 err |= __get_user(to->si_pid, &from->si_pid); 119 get_user_ex(to->si_pid, &from->si_pid);
118 err |= __get_user(to->si_uid, &from->si_uid); 120 get_user_ex(to->si_uid, &from->si_uid);
119 err |= __get_user(ptr32, &from->si_ptr); 121 get_user_ex(ptr32, &from->si_ptr);
120 to->si_ptr = compat_ptr(ptr32); 122 to->si_ptr = compat_ptr(ptr32);
123 } get_user_catch(err);
121 124
122 return err; 125 return err;
123} 126}
@@ -142,17 +145,23 @@ asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *uss_ptr,
142 struct pt_regs *regs) 145 struct pt_regs *regs)
143{ 146{
144 stack_t uss, uoss; 147 stack_t uss, uoss;
145 int ret; 148 int ret, err = 0;
146 mm_segment_t seg; 149 mm_segment_t seg;
147 150
148 if (uss_ptr) { 151 if (uss_ptr) {
149 u32 ptr; 152 u32 ptr;
150 153
151 memset(&uss, 0, sizeof(stack_t)); 154 memset(&uss, 0, sizeof(stack_t));
152 if (!access_ok(VERIFY_READ, uss_ptr, sizeof(stack_ia32_t)) || 155 if (!access_ok(VERIFY_READ, uss_ptr, sizeof(stack_ia32_t)))
153 __get_user(ptr, &uss_ptr->ss_sp) || 156 return -EFAULT;
154 __get_user(uss.ss_flags, &uss_ptr->ss_flags) || 157
155 __get_user(uss.ss_size, &uss_ptr->ss_size)) 158 get_user_try {
159 get_user_ex(ptr, &uss_ptr->ss_sp);
160 get_user_ex(uss.ss_flags, &uss_ptr->ss_flags);
161 get_user_ex(uss.ss_size, &uss_ptr->ss_size);
162 } get_user_catch(err);
163
164 if (err)
156 return -EFAULT; 165 return -EFAULT;
157 uss.ss_sp = compat_ptr(ptr); 166 uss.ss_sp = compat_ptr(ptr);
158 } 167 }
@@ -161,10 +170,16 @@ asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *uss_ptr,
161 ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, regs->sp); 170 ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, regs->sp);
162 set_fs(seg); 171 set_fs(seg);
163 if (ret >= 0 && uoss_ptr) { 172 if (ret >= 0 && uoss_ptr) {
164 if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(stack_ia32_t)) || 173 if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(stack_ia32_t)))
165 __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) || 174 return -EFAULT;
166 __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) || 175
167 __put_user(uoss.ss_size, &uoss_ptr->ss_size)) 176 put_user_try {
177 put_user_ex(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp);
178 put_user_ex(uoss.ss_flags, &uoss_ptr->ss_flags);
179 put_user_ex(uoss.ss_size, &uoss_ptr->ss_size);
180 } put_user_catch(err);
181
182 if (err)
168 ret = -EFAULT; 183 ret = -EFAULT;
169 } 184 }
170 return ret; 185 return ret;
@@ -173,75 +188,78 @@ asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *uss_ptr,
173/* 188/*
174 * Do a signal return; undo the signal stack. 189 * Do a signal return; undo the signal stack.
175 */ 190 */
191#define loadsegment_gs(v) load_gs_index(v)
192#define loadsegment_fs(v) loadsegment(fs, v)
193#define loadsegment_ds(v) loadsegment(ds, v)
194#define loadsegment_es(v) loadsegment(es, v)
195
196#define get_user_seg(seg) ({ unsigned int v; savesegment(seg, v); v; })
197#define set_user_seg(seg, v) loadsegment_##seg(v)
198
176#define COPY(x) { \ 199#define COPY(x) { \
177 err |= __get_user(regs->x, &sc->x); \ 200 get_user_ex(regs->x, &sc->x); \
178} 201}
179 202
180#define COPY_SEG_CPL3(seg) { \ 203#define GET_SEG(seg) ({ \
181 unsigned short tmp; \ 204 unsigned short tmp; \
182 err |= __get_user(tmp, &sc->seg); \ 205 get_user_ex(tmp, &sc->seg); \
183 regs->seg = tmp | 3; \ 206 tmp; \
184} 207})
208
209#define COPY_SEG_CPL3(seg) do { \
210 regs->seg = GET_SEG(seg) | 3; \
211} while (0)
185 212
186#define RELOAD_SEG(seg) { \ 213#define RELOAD_SEG(seg) { \
187 unsigned int cur, pre; \ 214 unsigned int pre = GET_SEG(seg); \
188 err |= __get_user(pre, &sc->seg); \ 215 unsigned int cur = get_user_seg(seg); \
189 savesegment(seg, cur); \
190 pre |= 3; \ 216 pre |= 3; \
191 if (pre != cur) \ 217 if (pre != cur) \
192 loadsegment(seg, pre); \ 218 set_user_seg(seg, pre); \
193} 219}
194 220
195static int ia32_restore_sigcontext(struct pt_regs *regs, 221static int ia32_restore_sigcontext(struct pt_regs *regs,
196 struct sigcontext_ia32 __user *sc, 222 struct sigcontext_ia32 __user *sc,
197 unsigned int *pax) 223 unsigned int *pax)
198{ 224{
199 unsigned int tmpflags, gs, oldgs, err = 0; 225 unsigned int tmpflags, err = 0;
200 void __user *buf; 226 void __user *buf;
201 u32 tmp; 227 u32 tmp;
202 228
203 /* Always make any pending restarted system calls return -EINTR */ 229 /* Always make any pending restarted system calls return -EINTR */
204 current_thread_info()->restart_block.fn = do_no_restart_syscall; 230 current_thread_info()->restart_block.fn = do_no_restart_syscall;
205 231
206#if DEBUG_SIG 232 get_user_try {
207 printk(KERN_DEBUG "SIG restore_sigcontext: " 233 /*
208 "sc=%p err(%x) eip(%x) cs(%x) flg(%x)\n", 234 * Reload fs and gs if they have changed in the signal
209 sc, sc->err, sc->ip, sc->cs, sc->flags); 235 * handler. This does not handle long fs/gs base changes in
210#endif 236 * the handler, but does not clobber them at least in the
211 237 * normal case.
212 /* 238 */
213 * Reload fs and gs if they have changed in the signal 239 RELOAD_SEG(gs);
214 * handler. This does not handle long fs/gs base changes in 240 RELOAD_SEG(fs);
215 * the handler, but does not clobber them at least in the 241 RELOAD_SEG(ds);
216 * normal case. 242 RELOAD_SEG(es);
217 */ 243
218 err |= __get_user(gs, &sc->gs); 244 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
219 gs |= 3; 245 COPY(dx); COPY(cx); COPY(ip);
220 savesegment(gs, oldgs); 246 /* Don't touch extended registers */
221 if (gs != oldgs) 247
222 load_gs_index(gs); 248 COPY_SEG_CPL3(cs);
223 249 COPY_SEG_CPL3(ss);
224 RELOAD_SEG(fs); 250
225 RELOAD_SEG(ds); 251 get_user_ex(tmpflags, &sc->flags);
226 RELOAD_SEG(es); 252 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
227 253 /* disable syscall checks */
228 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); 254 regs->orig_ax = -1;
229 COPY(dx); COPY(cx); COPY(ip); 255
230 /* Don't touch extended registers */ 256 get_user_ex(tmp, &sc->fpstate);
231 257 buf = compat_ptr(tmp);
232 COPY_SEG_CPL3(cs); 258 err |= restore_i387_xstate_ia32(buf);
233 COPY_SEG_CPL3(ss); 259
234 260 get_user_ex(*pax, &sc->ax);
235 err |= __get_user(tmpflags, &sc->flags); 261 } get_user_catch(err);
236 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); 262
237 /* disable syscall checks */
238 regs->orig_ax = -1;
239
240 err |= __get_user(tmp, &sc->fpstate);
241 buf = compat_ptr(tmp);
242 err |= restore_i387_xstate_ia32(buf);
243
244 err |= __get_user(*pax, &sc->ax);
245 return err; 263 return err;
246} 264}
247 265
@@ -317,38 +335,36 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc,
317 void __user *fpstate, 335 void __user *fpstate,
318 struct pt_regs *regs, unsigned int mask) 336 struct pt_regs *regs, unsigned int mask)
319{ 337{
320 int tmp, err = 0; 338 int err = 0;
321 339
322 savesegment(gs, tmp); 340 put_user_try {
323 err |= __put_user(tmp, (unsigned int __user *)&sc->gs); 341 put_user_ex(get_user_seg(gs), (unsigned int __user *)&sc->gs);
324 savesegment(fs, tmp); 342 put_user_ex(get_user_seg(fs), (unsigned int __user *)&sc->fs);
325 err |= __put_user(tmp, (unsigned int __user *)&sc->fs); 343 put_user_ex(get_user_seg(ds), (unsigned int __user *)&sc->ds);
326 savesegment(ds, tmp); 344 put_user_ex(get_user_seg(es), (unsigned int __user *)&sc->es);
327 err |= __put_user(tmp, (unsigned int __user *)&sc->ds); 345
328 savesegment(es, tmp); 346 put_user_ex(regs->di, &sc->di);
329 err |= __put_user(tmp, (unsigned int __user *)&sc->es); 347 put_user_ex(regs->si, &sc->si);
330 348 put_user_ex(regs->bp, &sc->bp);
331 err |= __put_user(regs->di, &sc->di); 349 put_user_ex(regs->sp, &sc->sp);
332 err |= __put_user(regs->si, &sc->si); 350 put_user_ex(regs->bx, &sc->bx);
333 err |= __put_user(regs->bp, &sc->bp); 351 put_user_ex(regs->dx, &sc->dx);
334 err |= __put_user(regs->sp, &sc->sp); 352 put_user_ex(regs->cx, &sc->cx);
335 err |= __put_user(regs->bx, &sc->bx); 353 put_user_ex(regs->ax, &sc->ax);
336 err |= __put_user(regs->dx, &sc->dx); 354 put_user_ex(current->thread.trap_no, &sc->trapno);
337 err |= __put_user(regs->cx, &sc->cx); 355 put_user_ex(current->thread.error_code, &sc->err);
338 err |= __put_user(regs->ax, &sc->ax); 356 put_user_ex(regs->ip, &sc->ip);
339 err |= __put_user(current->thread.trap_no, &sc->trapno); 357 put_user_ex(regs->cs, (unsigned int __user *)&sc->cs);
340 err |= __put_user(current->thread.error_code, &sc->err); 358 put_user_ex(regs->flags, &sc->flags);
341 err |= __put_user(regs->ip, &sc->ip); 359 put_user_ex(regs->sp, &sc->sp_at_signal);
342 err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs); 360 put_user_ex(regs->ss, (unsigned int __user *)&sc->ss);
343 err |= __put_user(regs->flags, &sc->flags); 361
344 err |= __put_user(regs->sp, &sc->sp_at_signal); 362 put_user_ex(ptr_to_compat(fpstate), &sc->fpstate);
345 err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss); 363
346 364 /* non-iBCS2 extensions.. */
347 err |= __put_user(ptr_to_compat(fpstate), &sc->fpstate); 365 put_user_ex(mask, &sc->oldmask);
348 366 put_user_ex(current->thread.cr2, &sc->cr2);
349 /* non-iBCS2 extensions.. */ 367 } put_user_catch(err);
350 err |= __put_user(mask, &sc->oldmask);
351 err |= __put_user(current->thread.cr2, &sc->cr2);
352 368
353 return err; 369 return err;
354} 370}
@@ -437,13 +453,17 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
437 else 453 else
438 restorer = &frame->retcode; 454 restorer = &frame->retcode;
439 } 455 }
440 err |= __put_user(ptr_to_compat(restorer), &frame->pretcode);
441 456
442 /* 457 put_user_try {
443 * These are actually not used anymore, but left because some 458 put_user_ex(ptr_to_compat(restorer), &frame->pretcode);
444 * gdb versions depend on them as a marker. 459
445 */ 460 /*
446 err |= __put_user(*((u64 *)&code), (u64 *)frame->retcode); 461 * These are actually not used anymore, but left because some
462 * gdb versions depend on them as a marker.
463 */
464 put_user_ex(*((u64 *)&code), (u64 *)frame->retcode);
465 } put_user_catch(err);
466
447 if (err) 467 if (err)
448 return -EFAULT; 468 return -EFAULT;
449 469
@@ -462,11 +482,6 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
462 regs->cs = __USER32_CS; 482 regs->cs = __USER32_CS;
463 regs->ss = __USER32_DS; 483 regs->ss = __USER32_DS;
464 484
465#if DEBUG_SIG
466 printk(KERN_DEBUG "SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n",
467 current->comm, current->pid, frame, regs->ip, frame->pretcode);
468#endif
469
470 return 0; 485 return 0;
471} 486}
472 487
@@ -496,41 +511,40 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
496 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 511 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
497 return -EFAULT; 512 return -EFAULT;
498 513
499 err |= __put_user(sig, &frame->sig); 514 put_user_try {
500 err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo); 515 put_user_ex(sig, &frame->sig);
501 err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc); 516 put_user_ex(ptr_to_compat(&frame->info), &frame->pinfo);
502 err |= copy_siginfo_to_user32(&frame->info, info); 517 put_user_ex(ptr_to_compat(&frame->uc), &frame->puc);
503 if (err) 518 err |= copy_siginfo_to_user32(&frame->info, info);
504 return -EFAULT;
505 519
506 /* Create the ucontext. */ 520 /* Create the ucontext. */
507 if (cpu_has_xsave) 521 if (cpu_has_xsave)
508 err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags); 522 put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
509 else 523 else
510 err |= __put_user(0, &frame->uc.uc_flags); 524 put_user_ex(0, &frame->uc.uc_flags);
511 err |= __put_user(0, &frame->uc.uc_link); 525 put_user_ex(0, &frame->uc.uc_link);
512 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); 526 put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
513 err |= __put_user(sas_ss_flags(regs->sp), 527 put_user_ex(sas_ss_flags(regs->sp),
514 &frame->uc.uc_stack.ss_flags); 528 &frame->uc.uc_stack.ss_flags);
515 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); 529 put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
516 err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate, 530 err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
517 regs, set->sig[0]); 531 regs, set->sig[0]);
518 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); 532 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
519 if (err) 533
520 return -EFAULT; 534 if (ka->sa.sa_flags & SA_RESTORER)
535 restorer = ka->sa.sa_restorer;
536 else
537 restorer = VDSO32_SYMBOL(current->mm->context.vdso,
538 rt_sigreturn);
539 put_user_ex(ptr_to_compat(restorer), &frame->pretcode);
540
541 /*
542 * Not actually used anymore, but left because some gdb
543 * versions need it.
544 */
545 put_user_ex(*((u64 *)&code), (u64 *)frame->retcode);
546 } put_user_catch(err);
521 547
522 if (ka->sa.sa_flags & SA_RESTORER)
523 restorer = ka->sa.sa_restorer;
524 else
525 restorer = VDSO32_SYMBOL(current->mm->context.vdso,
526 rt_sigreturn);
527 err |= __put_user(ptr_to_compat(restorer), &frame->pretcode);
528
529 /*
530 * Not actually used anymore, but left because some gdb
531 * versions need it.
532 */
533 err |= __put_user(*((u64 *)&code), (u64 *)frame->retcode);
534 if (err) 548 if (err)
535 return -EFAULT; 549 return -EFAULT;
536 550
@@ -549,10 +563,5 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
549 regs->cs = __USER32_CS; 563 regs->cs = __USER32_CS;
550 regs->ss = __USER32_DS; 564 regs->ss = __USER32_DS;
551 565
552#if DEBUG_SIG
553 printk(KERN_DEBUG "SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n",
554 current->comm, current->pid, frame, regs->ip, frame->pretcode);
555#endif
556
557 return 0; 566 return 0;
558} 567}
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 5a0d76dc56a4..097a6b64c24d 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -112,8 +112,8 @@ ENTRY(ia32_sysenter_target)
112 CFI_DEF_CFA rsp,0 112 CFI_DEF_CFA rsp,0
113 CFI_REGISTER rsp,rbp 113 CFI_REGISTER rsp,rbp
114 SWAPGS_UNSAFE_STACK 114 SWAPGS_UNSAFE_STACK
115 movq %gs:pda_kernelstack, %rsp 115 movq PER_CPU_VAR(kernel_stack), %rsp
116 addq $(PDA_STACKOFFSET),%rsp 116 addq $(KERNEL_STACK_OFFSET),%rsp
117 /* 117 /*
118 * No need to follow this irqs on/off section: the syscall 118 * No need to follow this irqs on/off section: the syscall
119 * disabled irqs, here we enable it straight after entry: 119 * disabled irqs, here we enable it straight after entry:
@@ -273,13 +273,13 @@ ENDPROC(ia32_sysenter_target)
273ENTRY(ia32_cstar_target) 273ENTRY(ia32_cstar_target)
274 CFI_STARTPROC32 simple 274 CFI_STARTPROC32 simple
275 CFI_SIGNAL_FRAME 275 CFI_SIGNAL_FRAME
276 CFI_DEF_CFA rsp,PDA_STACKOFFSET 276 CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET
277 CFI_REGISTER rip,rcx 277 CFI_REGISTER rip,rcx
278 /*CFI_REGISTER rflags,r11*/ 278 /*CFI_REGISTER rflags,r11*/
279 SWAPGS_UNSAFE_STACK 279 SWAPGS_UNSAFE_STACK
280 movl %esp,%r8d 280 movl %esp,%r8d
281 CFI_REGISTER rsp,r8 281 CFI_REGISTER rsp,r8
282 movq %gs:pda_kernelstack,%rsp 282 movq PER_CPU_VAR(kernel_stack),%rsp
283 /* 283 /*
284 * No need to follow this irqs on/off section: the syscall 284 * No need to follow this irqs on/off section: the syscall
285 * disabled irqs and here we enable it straight after entry: 285 * disabled irqs and here we enable it straight after entry:
diff --git a/arch/x86/include/asm/a.out-core.h b/arch/x86/include/asm/a.out-core.h
index 3c601f8224be..bb70e397aa84 100644
--- a/arch/x86/include/asm/a.out-core.h
+++ b/arch/x86/include/asm/a.out-core.h
@@ -55,7 +55,7 @@ static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump)
55 dump->regs.ds = (u16)regs->ds; 55 dump->regs.ds = (u16)regs->ds;
56 dump->regs.es = (u16)regs->es; 56 dump->regs.es = (u16)regs->es;
57 dump->regs.fs = (u16)regs->fs; 57 dump->regs.fs = (u16)regs->fs;
58 savesegment(gs, dump->regs.gs); 58 dump->regs.gs = get_user_gs(regs);
59 dump->regs.orig_ax = regs->orig_ax; 59 dump->regs.orig_ax = regs->orig_ax;
60 dump->regs.ip = regs->ip; 60 dump->regs.ip = regs->ip;
61 dump->regs.cs = (u16)regs->cs; 61 dump->regs.cs = (u16)regs->cs;
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 9830681446ad..4518dc500903 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -102,9 +102,6 @@ static inline void disable_acpi(void)
102 acpi_noirq = 1; 102 acpi_noirq = 1;
103} 103}
104 104
105/* Fixmap pages to reserve for ACPI boot-time tables (see fixmap.h) */
106#define FIX_ACPI_PAGES 4
107
108extern int acpi_gsi_to_irq(u32 gsi, unsigned int *irq); 105extern int acpi_gsi_to_irq(u32 gsi, unsigned int *irq);
109 106
110static inline void acpi_noirq_set(void) { acpi_noirq = 1; } 107static inline void acpi_noirq_set(void) { acpi_noirq = 1; }
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index ab1d51a8855e..394d177d721b 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -1,15 +1,18 @@
1#ifndef _ASM_X86_APIC_H 1#ifndef _ASM_X86_APIC_H
2#define _ASM_X86_APIC_H 2#define _ASM_X86_APIC_H
3 3
4#include <linux/pm.h> 4#include <linux/cpumask.h>
5#include <linux/delay.h> 5#include <linux/delay.h>
6#include <linux/pm.h>
6 7
7#include <asm/alternative.h> 8#include <asm/alternative.h>
8#include <asm/fixmap.h> 9#include <asm/cpufeature.h>
9#include <asm/apicdef.h>
10#include <asm/processor.h> 10#include <asm/processor.h>
11#include <asm/apicdef.h>
12#include <asm/atomic.h>
13#include <asm/fixmap.h>
14#include <asm/mpspec.h>
11#include <asm/system.h> 15#include <asm/system.h>
12#include <asm/cpufeature.h>
13#include <asm/msr.h> 16#include <asm/msr.h>
14 17
15#define ARCH_APICTIMER_STOPS_ON_C3 1 18#define ARCH_APICTIMER_STOPS_ON_C3 1
@@ -33,7 +36,13 @@
33 } while (0) 36 } while (0)
34 37
35 38
39#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
36extern void generic_apic_probe(void); 40extern void generic_apic_probe(void);
41#else
42static inline void generic_apic_probe(void)
43{
44}
45#endif
37 46
38#ifdef CONFIG_X86_LOCAL_APIC 47#ifdef CONFIG_X86_LOCAL_APIC
39 48
@@ -41,6 +50,21 @@ extern unsigned int apic_verbosity;
41extern int local_apic_timer_c2_ok; 50extern int local_apic_timer_c2_ok;
42 51
43extern int disable_apic; 52extern int disable_apic;
53
54#ifdef CONFIG_SMP
55extern void __inquire_remote_apic(int apicid);
56#else /* CONFIG_SMP */
57static inline void __inquire_remote_apic(int apicid)
58{
59}
60#endif /* CONFIG_SMP */
61
62static inline void default_inquire_remote_apic(int apicid)
63{
64 if (apic_verbosity >= APIC_DEBUG)
65 __inquire_remote_apic(apicid);
66}
67
44/* 68/*
45 * Basic functions accessing APICs. 69 * Basic functions accessing APICs.
46 */ 70 */
@@ -51,7 +75,14 @@ extern int disable_apic;
51#define setup_secondary_clock setup_secondary_APIC_clock 75#define setup_secondary_clock setup_secondary_APIC_clock
52#endif 76#endif
53 77
78#ifdef CONFIG_X86_VSMP
54extern int is_vsmp_box(void); 79extern int is_vsmp_box(void);
80#else
81static inline int is_vsmp_box(void)
82{
83 return 0;
84}
85#endif
55extern void xapic_wait_icr_idle(void); 86extern void xapic_wait_icr_idle(void);
56extern u32 safe_xapic_wait_icr_idle(void); 87extern u32 safe_xapic_wait_icr_idle(void);
57extern void xapic_icr_write(u32, u32); 88extern void xapic_icr_write(u32, u32);
@@ -71,6 +102,12 @@ static inline u32 native_apic_mem_read(u32 reg)
71 return *((volatile u32 *)(APIC_BASE + reg)); 102 return *((volatile u32 *)(APIC_BASE + reg));
72} 103}
73 104
105extern void native_apic_wait_icr_idle(void);
106extern u32 native_safe_apic_wait_icr_idle(void);
107extern void native_apic_icr_write(u32 low, u32 id);
108extern u64 native_apic_icr_read(void);
109
110#ifdef CONFIG_X86_X2APIC
74static inline void native_apic_msr_write(u32 reg, u32 v) 111static inline void native_apic_msr_write(u32 reg, u32 v)
75{ 112{
76 if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR || 113 if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR ||
@@ -91,8 +128,32 @@ static inline u32 native_apic_msr_read(u32 reg)
91 return low; 128 return low;
92} 129}
93 130
94#ifndef CONFIG_X86_32 131static inline void native_x2apic_wait_icr_idle(void)
95extern int x2apic; 132{
133 /* no need to wait for icr idle in x2apic */
134 return;
135}
136
137static inline u32 native_safe_x2apic_wait_icr_idle(void)
138{
139 /* no need to wait for icr idle in x2apic */
140 return 0;
141}
142
143static inline void native_x2apic_icr_write(u32 low, u32 id)
144{
145 wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low);
146}
147
148static inline u64 native_x2apic_icr_read(void)
149{
150 unsigned long val;
151
152 rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val);
153 return val;
154}
155
156extern int x2apic, x2apic_phys;
96extern void check_x2apic(void); 157extern void check_x2apic(void);
97extern void enable_x2apic(void); 158extern void enable_x2apic(void);
98extern void enable_IR_x2apic(void); 159extern void enable_IR_x2apic(void);
@@ -110,30 +171,24 @@ static inline int x2apic_enabled(void)
110 return 0; 171 return 0;
111} 172}
112#else 173#else
113#define x2apic_enabled() 0 174static inline void check_x2apic(void)
175{
176}
177static inline void enable_x2apic(void)
178{
179}
180static inline void enable_IR_x2apic(void)
181{
182}
183static inline int x2apic_enabled(void)
184{
185 return 0;
186}
114#endif 187#endif
115 188
116struct apic_ops {
117 u32 (*read)(u32 reg);
118 void (*write)(u32 reg, u32 v);
119 u64 (*icr_read)(void);
120 void (*icr_write)(u32 low, u32 high);
121 void (*wait_icr_idle)(void);
122 u32 (*safe_wait_icr_idle)(void);
123};
124
125extern struct apic_ops *apic_ops;
126
127#define apic_read (apic_ops->read)
128#define apic_write (apic_ops->write)
129#define apic_icr_read (apic_ops->icr_read)
130#define apic_icr_write (apic_ops->icr_write)
131#define apic_wait_icr_idle (apic_ops->wait_icr_idle)
132#define safe_apic_wait_icr_idle (apic_ops->safe_wait_icr_idle)
133
134extern int get_physical_broadcast(void); 189extern int get_physical_broadcast(void);
135 190
136#ifdef CONFIG_X86_64 191#ifdef CONFIG_X86_X2APIC
137static inline void ack_x2APIC_irq(void) 192static inline void ack_x2APIC_irq(void)
138{ 193{
139 /* Docs say use 0 for future compatibility */ 194 /* Docs say use 0 for future compatibility */
@@ -141,18 +196,6 @@ static inline void ack_x2APIC_irq(void)
141} 196}
142#endif 197#endif
143 198
144
145static inline void ack_APIC_irq(void)
146{
147 /*
148 * ack_APIC_irq() actually gets compiled as a single instruction
149 * ... yummie.
150 */
151
152 /* Docs say use 0 for future compatibility */
153 apic_write(APIC_EOI, 0);
154}
155
156extern int lapic_get_maxlvt(void); 199extern int lapic_get_maxlvt(void);
157extern void clear_local_APIC(void); 200extern void clear_local_APIC(void);
158extern void connect_bsp_APIC(void); 201extern void connect_bsp_APIC(void);
@@ -196,4 +239,329 @@ static inline void disable_local_APIC(void) { }
196 239
197#endif /* !CONFIG_X86_LOCAL_APIC */ 240#endif /* !CONFIG_X86_LOCAL_APIC */
198 241
242#ifdef CONFIG_X86_64
243#define SET_APIC_ID(x) (apic->set_apic_id(x))
244#else
245
246#endif
247
248/*
249 * Copyright 2004 James Cleverdon, IBM.
250 * Subject to the GNU Public License, v.2
251 *
252 * Generic APIC sub-arch data struct.
253 *
254 * Hacked for x86-64 by James Cleverdon from i386 architecture code by
255 * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
256 * James Cleverdon.
257 */
258struct apic {
259 char *name;
260
261 int (*probe)(void);
262 int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
263 int (*apic_id_registered)(void);
264
265 u32 irq_delivery_mode;
266 u32 irq_dest_mode;
267
268 const struct cpumask *(*target_cpus)(void);
269
270 int disable_esr;
271
272 int dest_logical;
273 unsigned long (*check_apicid_used)(physid_mask_t bitmap, int apicid);
274 unsigned long (*check_apicid_present)(int apicid);
275
276 void (*vector_allocation_domain)(int cpu, struct cpumask *retmask);
277 void (*init_apic_ldr)(void);
278
279 physid_mask_t (*ioapic_phys_id_map)(physid_mask_t map);
280
281 void (*setup_apic_routing)(void);
282 int (*multi_timer_check)(int apic, int irq);
283 int (*apicid_to_node)(int logical_apicid);
284 int (*cpu_to_logical_apicid)(int cpu);
285 int (*cpu_present_to_apicid)(int mps_cpu);
286 physid_mask_t (*apicid_to_cpu_present)(int phys_apicid);
287 void (*setup_portio_remap)(void);
288 int (*check_phys_apicid_present)(int boot_cpu_physical_apicid);
289 void (*enable_apic_mode)(void);
290 int (*phys_pkg_id)(int cpuid_apic, int index_msb);
291
292 /*
293 * When one of the next two hooks returns 1 the apic
294 * is switched to this. Essentially they are additional
295 * probe functions:
296 */
297 int (*mps_oem_check)(struct mpc_table *mpc, char *oem, char *productid);
298
299 unsigned int (*get_apic_id)(unsigned long x);
300 unsigned long (*set_apic_id)(unsigned int id);
301 unsigned long apic_id_mask;
302
303 unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask);
304 unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask,
305 const struct cpumask *andmask);
306
307 /* ipi */
308 void (*send_IPI_mask)(const struct cpumask *mask, int vector);
309 void (*send_IPI_mask_allbutself)(const struct cpumask *mask,
310 int vector);
311 void (*send_IPI_allbutself)(int vector);
312 void (*send_IPI_all)(int vector);
313 void (*send_IPI_self)(int vector);
314
315 /* wakeup_secondary_cpu */
316 int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip);
317
318 int trampoline_phys_low;
319 int trampoline_phys_high;
320
321 void (*wait_for_init_deassert)(atomic_t *deassert);
322 void (*smp_callin_clear_local_apic)(void);
323 void (*inquire_remote_apic)(int apicid);
324
325 /* apic ops */
326 u32 (*read)(u32 reg);
327 void (*write)(u32 reg, u32 v);
328 u64 (*icr_read)(void);
329 void (*icr_write)(u32 low, u32 high);
330 void (*wait_icr_idle)(void);
331 u32 (*safe_wait_icr_idle)(void);
332};
333
334/*
335 * Pointer to the local APIC driver in use on this system (there's
336 * always just one such driver in use - the kernel decides via an
337 * early probing process which one it picks - and then sticks to it):
338 */
339extern struct apic *apic;
340
341/*
342 * APIC functionality to boot other CPUs - only used on SMP:
343 */
344#ifdef CONFIG_SMP
345extern atomic_t init_deasserted;
346extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip);
347#endif
348
349static inline u32 apic_read(u32 reg)
350{
351 return apic->read(reg);
352}
353
354static inline void apic_write(u32 reg, u32 val)
355{
356 apic->write(reg, val);
357}
358
359static inline u64 apic_icr_read(void)
360{
361 return apic->icr_read();
362}
363
364static inline void apic_icr_write(u32 low, u32 high)
365{
366 apic->icr_write(low, high);
367}
368
369static inline void apic_wait_icr_idle(void)
370{
371 apic->wait_icr_idle();
372}
373
374static inline u32 safe_apic_wait_icr_idle(void)
375{
376 return apic->safe_wait_icr_idle();
377}
378
379
380static inline void ack_APIC_irq(void)
381{
382#ifdef CONFIG_X86_LOCAL_APIC
383 /*
384 * ack_APIC_irq() actually gets compiled as a single instruction
385 * ... yummie.
386 */
387
388 /* Docs say use 0 for future compatibility */
389 apic_write(APIC_EOI, 0);
390#endif
391}
392
393static inline unsigned default_get_apic_id(unsigned long x)
394{
395 unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
396
397 if (APIC_XAPIC(ver))
398 return (x >> 24) & 0xFF;
399 else
400 return (x >> 24) & 0x0F;
401}
402
403/*
404 * Warm reset vector default position:
405 */
406#define DEFAULT_TRAMPOLINE_PHYS_LOW 0x467
407#define DEFAULT_TRAMPOLINE_PHYS_HIGH 0x469
408
409#ifdef CONFIG_X86_64
410extern struct apic apic_flat;
411extern struct apic apic_physflat;
412extern struct apic apic_x2apic_cluster;
413extern struct apic apic_x2apic_phys;
414extern int default_acpi_madt_oem_check(char *, char *);
415
416extern void apic_send_IPI_self(int vector);
417
418extern struct apic apic_x2apic_uv_x;
419DECLARE_PER_CPU(int, x2apic_extra_bits);
420
421extern int default_cpu_present_to_apicid(int mps_cpu);
422extern int default_check_phys_apicid_present(int boot_cpu_physical_apicid);
423#endif
424
425static inline void default_wait_for_init_deassert(atomic_t *deassert)
426{
427 while (!atomic_read(deassert))
428 cpu_relax();
429 return;
430}
431
432extern void generic_bigsmp_probe(void);
433
434
435#ifdef CONFIG_X86_LOCAL_APIC
436
437#include <asm/smp.h>
438
439#define APIC_DFR_VALUE (APIC_DFR_FLAT)
440
441static inline const struct cpumask *default_target_cpus(void)
442{
443#ifdef CONFIG_SMP
444 return cpu_online_mask;
445#else
446 return cpumask_of(0);
447#endif
448}
449
450DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
451
452
453static inline unsigned int read_apic_id(void)
454{
455 unsigned int reg;
456
457 reg = apic_read(APIC_ID);
458
459 return apic->get_apic_id(reg);
460}
461
462extern void default_setup_apic_routing(void);
463
464#ifdef CONFIG_X86_32
465/*
466 * Set up the logical destination ID.
467 *
468 * Intel recommends to set DFR, LDR and TPR before enabling
469 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
470 * document number 292116). So here it goes...
471 */
472extern void default_init_apic_ldr(void);
473
474static inline int default_apic_id_registered(void)
475{
476 return physid_isset(read_apic_id(), phys_cpu_present_map);
477}
478
479static inline unsigned int
480default_cpu_mask_to_apicid(const struct cpumask *cpumask)
481{
482 return cpumask_bits(cpumask)[0];
483}
484
485static inline unsigned int
486default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
487 const struct cpumask *andmask)
488{
489 unsigned long mask1 = cpumask_bits(cpumask)[0];
490 unsigned long mask2 = cpumask_bits(andmask)[0];
491 unsigned long mask3 = cpumask_bits(cpu_online_mask)[0];
492
493 return (unsigned int)(mask1 & mask2 & mask3);
494}
495
496static inline int default_phys_pkg_id(int cpuid_apic, int index_msb)
497{
498 return cpuid_apic >> index_msb;
499}
500
501extern int default_apicid_to_node(int logical_apicid);
502
503#endif
504
505static inline unsigned long default_check_apicid_used(physid_mask_t bitmap, int apicid)
506{
507 return physid_isset(apicid, bitmap);
508}
509
510static inline unsigned long default_check_apicid_present(int bit)
511{
512 return physid_isset(bit, phys_cpu_present_map);
513}
514
515static inline physid_mask_t default_ioapic_phys_id_map(physid_mask_t phys_map)
516{
517 return phys_map;
518}
519
520/* Mapping from cpu number to logical apicid */
521static inline int default_cpu_to_logical_apicid(int cpu)
522{
523 return 1 << cpu;
524}
525
526static inline int __default_cpu_present_to_apicid(int mps_cpu)
527{
528 if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu))
529 return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
530 else
531 return BAD_APICID;
532}
533
534static inline int
535__default_check_phys_apicid_present(int boot_cpu_physical_apicid)
536{
537 return physid_isset(boot_cpu_physical_apicid, phys_cpu_present_map);
538}
539
540#ifdef CONFIG_X86_32
541static inline int default_cpu_present_to_apicid(int mps_cpu)
542{
543 return __default_cpu_present_to_apicid(mps_cpu);
544}
545
546static inline int
547default_check_phys_apicid_present(int boot_cpu_physical_apicid)
548{
549 return __default_check_phys_apicid_present(boot_cpu_physical_apicid);
550}
551#else
552extern int default_cpu_present_to_apicid(int mps_cpu);
553extern int default_check_phys_apicid_present(int boot_cpu_physical_apicid);
554#endif
555
556static inline physid_mask_t default_apicid_to_cpu_present(int phys_apicid)
557{
558 return physid_mask_of_physid(phys_apicid);
559}
560
561#endif /* CONFIG_X86_LOCAL_APIC */
562
563#ifdef CONFIG_X86_32
564extern u8 cpu_2_logical_apicid[NR_CPUS];
565#endif
566
199#endif /* _ASM_X86_APIC_H */ 567#endif /* _ASM_X86_APIC_H */
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 63134e31e8b9..bc9514fb3b13 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -53,6 +53,7 @@
53#define APIC_ESR_SENDILL 0x00020 53#define APIC_ESR_SENDILL 0x00020
54#define APIC_ESR_RECVILL 0x00040 54#define APIC_ESR_RECVILL 0x00040
55#define APIC_ESR_ILLREGA 0x00080 55#define APIC_ESR_ILLREGA 0x00080
56#define APIC_LVTCMCI 0x2f0
56#define APIC_ICR 0x300 57#define APIC_ICR 0x300
57#define APIC_DEST_SELF 0x40000 58#define APIC_DEST_SELF 0x40000
58#define APIC_DEST_ALLINC 0x80000 59#define APIC_DEST_ALLINC 0x80000
diff --git a/arch/x86/include/asm/apicnum.h b/arch/x86/include/asm/apicnum.h
new file mode 100644
index 000000000000..82f613c607ce
--- /dev/null
+++ b/arch/x86/include/asm/apicnum.h
@@ -0,0 +1,12 @@
1#ifndef _ASM_X86_APICNUM_H
2#define _ASM_X86_APICNUM_H
3
4/* define MAX_IO_APICS */
5#ifdef CONFIG_X86_32
6# define MAX_IO_APICS 64
7#else
8# define MAX_IO_APICS 128
9# define MAX_LOCAL_APIC 32768
10#endif
11
12#endif /* _ASM_X86_APICNUM_H */
diff --git a/arch/x86/include/asm/mach-default/apm.h b/arch/x86/include/asm/apm.h
index 20370c6db74b..20370c6db74b 100644
--- a/arch/x86/include/asm/mach-default/apm.h
+++ b/arch/x86/include/asm/apm.h
diff --git a/arch/x86/include/asm/arch_hooks.h b/arch/x86/include/asm/arch_hooks.h
deleted file mode 100644
index cbd4957838a6..000000000000
--- a/arch/x86/include/asm/arch_hooks.h
+++ /dev/null
@@ -1,26 +0,0 @@
1#ifndef _ASM_X86_ARCH_HOOKS_H
2#define _ASM_X86_ARCH_HOOKS_H
3
4#include <linux/interrupt.h>
5
6/*
7 * linux/include/asm/arch_hooks.h
8 *
9 * define the architecture specific hooks
10 */
11
12/* these aren't arch hooks, they are generic routines
13 * that can be used by the hooks */
14extern void init_ISA_irqs(void);
15extern irqreturn_t timer_interrupt(int irq, void *dev_id);
16
17/* these are the defined hooks */
18extern void intr_init_hook(void);
19extern void pre_intr_init_hook(void);
20extern void pre_setup_arch_hook(void);
21extern void trap_init_hook(void);
22extern void pre_time_init_hook(void);
23extern void time_init_hook(void);
24extern void mca_nmi_hook(void);
25
26#endif /* _ASM_X86_ARCH_HOOKS_H */
diff --git a/arch/x86/include/asm/bigsmp/apic.h b/arch/x86/include/asm/bigsmp/apic.h
deleted file mode 100644
index d8dd9f537911..000000000000
--- a/arch/x86/include/asm/bigsmp/apic.h
+++ /dev/null
@@ -1,155 +0,0 @@
1#ifndef __ASM_MACH_APIC_H
2#define __ASM_MACH_APIC_H
3
4#define xapic_phys_to_log_apicid(cpu) (per_cpu(x86_bios_cpu_apicid, cpu))
5#define esr_disable (1)
6
7static inline int apic_id_registered(void)
8{
9 return (1);
10}
11
12static inline const cpumask_t *target_cpus(void)
13{
14#ifdef CONFIG_SMP
15 return &cpu_online_map;
16#else
17 return &cpumask_of_cpu(0);
18#endif
19}
20
21#undef APIC_DEST_LOGICAL
22#define APIC_DEST_LOGICAL 0
23#define APIC_DFR_VALUE (APIC_DFR_FLAT)
24#define INT_DELIVERY_MODE (dest_Fixed)
25#define INT_DEST_MODE (0) /* phys delivery to target proc */
26#define NO_BALANCE_IRQ (0)
27
28static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
29{
30 return (0);
31}
32
33static inline unsigned long check_apicid_present(int bit)
34{
35 return (1);
36}
37
38static inline unsigned long calculate_ldr(int cpu)
39{
40 unsigned long val, id;
41 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
42 id = xapic_phys_to_log_apicid(cpu);
43 val |= SET_APIC_LOGICAL_ID(id);
44 return val;
45}
46
47/*
48 * Set up the logical destination ID.
49 *
50 * Intel recommends to set DFR, LDR and TPR before enabling
51 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
52 * document number 292116). So here it goes...
53 */
54static inline void init_apic_ldr(void)
55{
56 unsigned long val;
57 int cpu = smp_processor_id();
58
59 apic_write(APIC_DFR, APIC_DFR_VALUE);
60 val = calculate_ldr(cpu);
61 apic_write(APIC_LDR, val);
62}
63
64static inline void setup_apic_routing(void)
65{
66 printk("Enabling APIC mode: %s. Using %d I/O APICs\n",
67 "Physflat", nr_ioapics);
68}
69
70static inline int multi_timer_check(int apic, int irq)
71{
72 return (0);
73}
74
75static inline int apicid_to_node(int logical_apicid)
76{
77 return apicid_2_node[hard_smp_processor_id()];
78}
79
80static inline int cpu_present_to_apicid(int mps_cpu)
81{
82 if (mps_cpu < nr_cpu_ids)
83 return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
84
85 return BAD_APICID;
86}
87
88static inline physid_mask_t apicid_to_cpu_present(int phys_apicid)
89{
90 return physid_mask_of_physid(phys_apicid);
91}
92
93extern u8 cpu_2_logical_apicid[];
94/* Mapping from cpu number to logical apicid */
95static inline int cpu_to_logical_apicid(int cpu)
96{
97 if (cpu >= nr_cpu_ids)
98 return BAD_APICID;
99 return cpu_physical_id(cpu);
100}
101
102static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map)
103{
104 /* For clustered we don't have a good way to do this yet - hack */
105 return physids_promote(0xFFL);
106}
107
108static inline void setup_portio_remap(void)
109{
110}
111
112static inline void enable_apic_mode(void)
113{
114}
115
116static inline int check_phys_apicid_present(int boot_cpu_physical_apicid)
117{
118 return (1);
119}
120
121/* As we are using single CPU as destination, pick only one CPU here */
122static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask)
123{
124 int cpu;
125 int apicid;
126
127 cpu = first_cpu(*cpumask);
128 apicid = cpu_to_logical_apicid(cpu);
129 return apicid;
130}
131
132static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *cpumask,
133 const struct cpumask *andmask)
134{
135 int cpu;
136
137 /*
138 * We're using fixed IRQ delivery, can only return one phys APIC ID.
139 * May as well be the first.
140 */
141 for_each_cpu_and(cpu, cpumask, andmask)
142 if (cpumask_test_cpu(cpu, cpu_online_mask))
143 break;
144 if (cpu < nr_cpu_ids)
145 return cpu_to_logical_apicid(cpu);
146
147 return BAD_APICID;
148}
149
150static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
151{
152 return cpuid_apic >> index_msb;
153}
154
155#endif /* __ASM_MACH_APIC_H */
diff --git a/arch/x86/include/asm/bigsmp/apicdef.h b/arch/x86/include/asm/bigsmp/apicdef.h
deleted file mode 100644
index 392c3f5ef2fe..000000000000
--- a/arch/x86/include/asm/bigsmp/apicdef.h
+++ /dev/null
@@ -1,13 +0,0 @@
1#ifndef __ASM_MACH_APICDEF_H
2#define __ASM_MACH_APICDEF_H
3
4#define APIC_ID_MASK (0xFF<<24)
5
6static inline unsigned get_apic_id(unsigned long x)
7{
8 return (((x)>>24)&0xFF);
9}
10
11#define GET_APIC_ID(x) get_apic_id(x)
12
13#endif
diff --git a/arch/x86/include/asm/bigsmp/ipi.h b/arch/x86/include/asm/bigsmp/ipi.h
deleted file mode 100644
index 27fcd01b3ae6..000000000000
--- a/arch/x86/include/asm/bigsmp/ipi.h
+++ /dev/null
@@ -1,22 +0,0 @@
1#ifndef __ASM_MACH_IPI_H
2#define __ASM_MACH_IPI_H
3
4void send_IPI_mask_sequence(const struct cpumask *mask, int vector);
5void send_IPI_mask_allbutself(const struct cpumask *mask, int vector);
6
7static inline void send_IPI_mask(const struct cpumask *mask, int vector)
8{
9 send_IPI_mask_sequence(mask, vector);
10}
11
12static inline void send_IPI_allbutself(int vector)
13{
14 send_IPI_mask_allbutself(cpu_online_mask, vector);
15}
16
17static inline void send_IPI_all(int vector)
18{
19 send_IPI_mask(cpu_online_mask, vector);
20}
21
22#endif /* __ASM_MACH_IPI_H */
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index dd61616cb73d..6ba23dd9fc92 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -1,26 +1,36 @@
1#ifndef _ASM_X86_BOOT_H 1#ifndef _ASM_X86_BOOT_H
2#define _ASM_X86_BOOT_H 2#define _ASM_X86_BOOT_H
3 3
4/* Don't touch these, unless you really know what you're doing. */
5#define DEF_SYSSEG 0x1000
6#define DEF_SYSSIZE 0x7F00
7
8/* Internal svga startup constants */ 4/* Internal svga startup constants */
9#define NORMAL_VGA 0xffff /* 80x25 mode */ 5#define NORMAL_VGA 0xffff /* 80x25 mode */
10#define EXTENDED_VGA 0xfffe /* 80x50 mode */ 6#define EXTENDED_VGA 0xfffe /* 80x50 mode */
11#define ASK_VGA 0xfffd /* ask for it at bootup */ 7#define ASK_VGA 0xfffd /* ask for it at bootup */
12 8
9#ifdef __KERNEL__
10
13/* Physical address where kernel should be loaded. */ 11/* Physical address where kernel should be loaded. */
14#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \ 12#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
15 + (CONFIG_PHYSICAL_ALIGN - 1)) \ 13 + (CONFIG_PHYSICAL_ALIGN - 1)) \
16 & ~(CONFIG_PHYSICAL_ALIGN - 1)) 14 & ~(CONFIG_PHYSICAL_ALIGN - 1))
17 15
16#ifdef CONFIG_KERNEL_BZIP2
17#define BOOT_HEAP_SIZE 0x400000
18#else /* !CONFIG_KERNEL_BZIP2 */
19
18#ifdef CONFIG_X86_64 20#ifdef CONFIG_X86_64
19#define BOOT_HEAP_SIZE 0x7000 21#define BOOT_HEAP_SIZE 0x7000
20#define BOOT_STACK_SIZE 0x4000
21#else 22#else
22#define BOOT_HEAP_SIZE 0x4000 23#define BOOT_HEAP_SIZE 0x4000
24#endif
25
26#endif /* !CONFIG_KERNEL_BZIP2 */
27
28#ifdef CONFIG_X86_64
29#define BOOT_STACK_SIZE 0x4000
30#else
23#define BOOT_STACK_SIZE 0x1000 31#define BOOT_STACK_SIZE 0x1000
24#endif 32#endif
25 33
34#endif /* __KERNEL__ */
35
26#endif /* _ASM_X86_BOOT_H */ 36#endif /* _ASM_X86_BOOT_H */
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 2f8466540fb5..eb2221d5add2 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -5,24 +5,43 @@
5#include <linux/mm.h> 5#include <linux/mm.h>
6 6
7/* Caches aren't brain-dead on the intel. */ 7/* Caches aren't brain-dead on the intel. */
8#define flush_cache_all() do { } while (0) 8static inline void flush_cache_all(void) { }
9#define flush_cache_mm(mm) do { } while (0) 9static inline void flush_cache_mm(struct mm_struct *mm) { }
10#define flush_cache_dup_mm(mm) do { } while (0) 10static inline void flush_cache_dup_mm(struct mm_struct *mm) { }
11#define flush_cache_range(vma, start, end) do { } while (0) 11static inline void flush_cache_range(struct vm_area_struct *vma,
12#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) 12 unsigned long start, unsigned long end) { }
13#define flush_dcache_page(page) do { } while (0) 13static inline void flush_cache_page(struct vm_area_struct *vma,
14#define flush_dcache_mmap_lock(mapping) do { } while (0) 14 unsigned long vmaddr, unsigned long pfn) { }
15#define flush_dcache_mmap_unlock(mapping) do { } while (0) 15static inline void flush_dcache_page(struct page *page) { }
16#define flush_icache_range(start, end) do { } while (0) 16static inline void flush_dcache_mmap_lock(struct address_space *mapping) { }
17#define flush_icache_page(vma, pg) do { } while (0) 17static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { }
18#define flush_icache_user_range(vma, pg, adr, len) do { } while (0) 18static inline void flush_icache_range(unsigned long start,
19#define flush_cache_vmap(start, end) do { } while (0) 19 unsigned long end) { }
20#define flush_cache_vunmap(start, end) do { } while (0) 20static inline void flush_icache_page(struct vm_area_struct *vma,
21 struct page *page) { }
22static inline void flush_icache_user_range(struct vm_area_struct *vma,
23 struct page *page,
24 unsigned long addr,
25 unsigned long len) { }
26static inline void flush_cache_vmap(unsigned long start, unsigned long end) { }
27static inline void flush_cache_vunmap(unsigned long start,
28 unsigned long end) { }
21 29
22#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ 30static inline void copy_to_user_page(struct vm_area_struct *vma,
23 memcpy((dst), (src), (len)) 31 struct page *page, unsigned long vaddr,
24#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ 32 void *dst, const void *src,
25 memcpy((dst), (src), (len)) 33 unsigned long len)
34{
35 memcpy(dst, src, len);
36}
37
38static inline void copy_from_user_page(struct vm_area_struct *vma,
39 struct page *page, unsigned long vaddr,
40 void *dst, const void *src,
41 unsigned long len)
42{
43 memcpy(dst, src, len);
44}
26 45
27#define PG_non_WB PG_arch_1 46#define PG_non_WB PG_arch_1
28PAGEFLAG(NonWB, non_WB) 47PAGEFLAG(NonWB, non_WB)
@@ -104,6 +123,11 @@ void clflush_cache_range(void *addr, unsigned int size);
104#ifdef CONFIG_DEBUG_RODATA 123#ifdef CONFIG_DEBUG_RODATA
105void mark_rodata_ro(void); 124void mark_rodata_ro(void);
106extern const int rodata_test_data; 125extern const int rodata_test_data;
126void set_kernel_text_rw(void);
127void set_kernel_text_ro(void);
128#else
129static inline void set_kernel_text_rw(void) { }
130static inline void set_kernel_text_ro(void) { }
107#endif 131#endif
108 132
109#ifdef CONFIG_DEBUG_RODATA_TEST 133#ifdef CONFIG_DEBUG_RODATA_TEST
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index 2bc162e0ec6e..0e63c9a2a8d0 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -1,5 +1,55 @@
1/* 1/*
2 * Some macros to handle stack frames in assembly. 2
3 x86 function call convention, 64-bit:
4 -------------------------------------
5 arguments | callee-saved | extra caller-saved | return
6 [callee-clobbered] | | [callee-clobbered] |
7 ---------------------------------------------------------------------------
8 rdi rsi rdx rcx r8-9 | rbx rbp [*] r12-15 | r10-11 | rax, rdx [**]
9
10 ( rsp is obviously invariant across normal function calls. (gcc can 'merge'
11 functions when it sees tail-call optimization possibilities) rflags is
12 clobbered. Leftover arguments are passed over the stack frame.)
13
14 [*] In the frame-pointers case rbp is fixed to the stack frame.
15
16 [**] for struct return values wider than 64 bits the return convention is a
17 bit more complex: up to 128 bits width we return small structures
18 straight in rax, rdx. For structures larger than that (3 words or
19 larger) the caller puts a pointer to an on-stack return struct
20 [allocated in the caller's stack frame] into the first argument - i.e.
21 into rdi. All other arguments shift up by one in this case.
22 Fortunately this case is rare in the kernel.
23
24For 32-bit we have the following conventions - kernel is built with
25-mregparm=3 and -freg-struct-return:
26
27 x86 function calling convention, 32-bit:
28 ----------------------------------------
29 arguments | callee-saved | extra caller-saved | return
30 [callee-clobbered] | | [callee-clobbered] |
31 -------------------------------------------------------------------------
32 eax edx ecx | ebx edi esi ebp [*] | <none> | eax, edx [**]
33
34 ( here too esp is obviously invariant across normal function calls. eflags
35 is clobbered. Leftover arguments are passed over the stack frame. )
36
37 [*] In the frame-pointers case ebp is fixed to the stack frame.
38
39 [**] We build with -freg-struct-return, which on 32-bit means similar
40 semantics as on 64-bit: edx can be used for a second return value
41 (i.e. covering integer and structure sizes up to 64 bits) - after that
42 it gets more complex and more expensive: 3-word or larger struct returns
43 get done in the caller's frame and the pointer to the return struct goes
44 into regparm0, i.e. eax - the other arguments shift up and the
45 function's register parameters degenerate to regparm=2 in essence.
46
47*/
48
49
50/*
51 * 64-bit system call stack frame layout defines and helpers,
52 * for assembly code:
3 */ 53 */
4 54
5#define R15 0 55#define R15 0
@@ -9,7 +59,7 @@
9#define RBP 32 59#define RBP 32
10#define RBX 40 60#define RBX 40
11 61
12/* arguments: interrupts/non tracing syscalls only save upto here*/ 62/* arguments: interrupts/non tracing syscalls only save up to here: */
13#define R11 48 63#define R11 48
14#define R10 56 64#define R10 56
15#define R9 64 65#define R9 64
@@ -22,7 +72,7 @@
22#define ORIG_RAX 120 /* + error_code */ 72#define ORIG_RAX 120 /* + error_code */
23/* end of arguments */ 73/* end of arguments */
24 74
25/* cpu exception frame or undefined in case of fast syscall. */ 75/* cpu exception frame or undefined in case of fast syscall: */
26#define RIP 128 76#define RIP 128
27#define CS 136 77#define CS 136
28#define EFLAGS 144 78#define EFLAGS 144
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index bae482df6039..b185091bf19c 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -7,6 +7,20 @@
7#include <linux/nodemask.h> 7#include <linux/nodemask.h>
8#include <linux/percpu.h> 8#include <linux/percpu.h>
9 9
10#ifdef CONFIG_SMP
11
12extern void prefill_possible_map(void);
13
14#else /* CONFIG_SMP */
15
16static inline void prefill_possible_map(void) {}
17
18#define cpu_physical_id(cpu) boot_cpu_physical_apicid
19#define safe_smp_processor_id() 0
20#define stack_smp_processor_id() 0
21
22#endif /* CONFIG_SMP */
23
10struct x86_cpu { 24struct x86_cpu {
11 struct cpu cpu; 25 struct cpu cpu;
12}; 26};
@@ -17,4 +31,7 @@ extern void arch_unregister_cpu(int);
17#endif 31#endif
18 32
19DECLARE_PER_CPU(int, cpu_state); 33DECLARE_PER_CPU(int, cpu_state);
34
35extern unsigned int boot_cpu_id;
36
20#endif /* _ASM_X86_CPU_H */ 37#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/cpu_debug.h b/arch/x86/include/asm/cpu_debug.h
new file mode 100755
index 000000000000..d24d64fcee04
--- /dev/null
+++ b/arch/x86/include/asm/cpu_debug.h
@@ -0,0 +1,193 @@
1#ifndef _ASM_X86_CPU_DEBUG_H
2#define _ASM_X86_CPU_DEBUG_H
3
4/*
5 * CPU x86 architecture debug
6 *
7 * Copyright(C) 2009 Jaswinder Singh Rajput
8 */
9
10/* Register flags */
11enum cpu_debug_bit {
12/* Model Specific Registers (MSRs) */
13 CPU_MC_BIT, /* Machine Check */
14 CPU_MONITOR_BIT, /* Monitor */
15 CPU_TIME_BIT, /* Time */
16 CPU_PMC_BIT, /* Performance Monitor */
17 CPU_PLATFORM_BIT, /* Platform */
18 CPU_APIC_BIT, /* APIC */
19 CPU_POWERON_BIT, /* Power-on */
20 CPU_CONTROL_BIT, /* Control */
21 CPU_FEATURES_BIT, /* Features control */
22 CPU_LBRANCH_BIT, /* Last Branch */
23 CPU_BIOS_BIT, /* BIOS */
24 CPU_FREQ_BIT, /* Frequency */
25 CPU_MTTR_BIT, /* MTRR */
26 CPU_PERF_BIT, /* Performance */
27 CPU_CACHE_BIT, /* Cache */
28 CPU_SYSENTER_BIT, /* Sysenter */
29 CPU_THERM_BIT, /* Thermal */
30 CPU_MISC_BIT, /* Miscellaneous */
31 CPU_DEBUG_BIT, /* Debug */
32 CPU_PAT_BIT, /* PAT */
33 CPU_VMX_BIT, /* VMX */
34 CPU_CALL_BIT, /* System Call */
35 CPU_BASE_BIT, /* BASE Address */
36 CPU_SMM_BIT, /* System mgmt mode */
37 CPU_SVM_BIT, /*Secure Virtual Machine*/
38 CPU_OSVM_BIT, /* OS-Visible Workaround*/
39/* Standard Registers */
40 CPU_TSS_BIT, /* Task Stack Segment */
41 CPU_CR_BIT, /* Control Registers */
42 CPU_DT_BIT, /* Descriptor Table */
43/* End of Registers flags */
44 CPU_REG_ALL_BIT, /* Select all Registers */
45};
46
47#define CPU_REG_ALL (~0) /* Select all Registers */
48
49#define CPU_MC (1 << CPU_MC_BIT)
50#define CPU_MONITOR (1 << CPU_MONITOR_BIT)
51#define CPU_TIME (1 << CPU_TIME_BIT)
52#define CPU_PMC (1 << CPU_PMC_BIT)
53#define CPU_PLATFORM (1 << CPU_PLATFORM_BIT)
54#define CPU_APIC (1 << CPU_APIC_BIT)
55#define CPU_POWERON (1 << CPU_POWERON_BIT)
56#define CPU_CONTROL (1 << CPU_CONTROL_BIT)
57#define CPU_FEATURES (1 << CPU_FEATURES_BIT)
58#define CPU_LBRANCH (1 << CPU_LBRANCH_BIT)
59#define CPU_BIOS (1 << CPU_BIOS_BIT)
60#define CPU_FREQ (1 << CPU_FREQ_BIT)
61#define CPU_MTRR (1 << CPU_MTTR_BIT)
62#define CPU_PERF (1 << CPU_PERF_BIT)
63#define CPU_CACHE (1 << CPU_CACHE_BIT)
64#define CPU_SYSENTER (1 << CPU_SYSENTER_BIT)
65#define CPU_THERM (1 << CPU_THERM_BIT)
66#define CPU_MISC (1 << CPU_MISC_BIT)
67#define CPU_DEBUG (1 << CPU_DEBUG_BIT)
68#define CPU_PAT (1 << CPU_PAT_BIT)
69#define CPU_VMX (1 << CPU_VMX_BIT)
70#define CPU_CALL (1 << CPU_CALL_BIT)
71#define CPU_BASE (1 << CPU_BASE_BIT)
72#define CPU_SMM (1 << CPU_SMM_BIT)
73#define CPU_SVM (1 << CPU_SVM_BIT)
74#define CPU_OSVM (1 << CPU_OSVM_BIT)
75#define CPU_TSS (1 << CPU_TSS_BIT)
76#define CPU_CR (1 << CPU_CR_BIT)
77#define CPU_DT (1 << CPU_DT_BIT)
78
79/* Register file flags */
80enum cpu_file_bit {
81 CPU_INDEX_BIT, /* index */
82 CPU_VALUE_BIT, /* value */
83};
84
85#define CPU_FILE_VALUE (1 << CPU_VALUE_BIT)
86
87/*
88 * DisplayFamily_DisplayModel Processor Families/Processor Number Series
89 * -------------------------- ------------------------------------------
90 * 05_01, 05_02, 05_04 Pentium, Pentium with MMX
91 *
92 * 06_01 Pentium Pro
93 * 06_03, 06_05 Pentium II Xeon, Pentium II
94 * 06_07, 06_08, 06_0A, 06_0B Pentium III Xeon, Pentum III
95 *
96 * 06_09, 060D Pentium M
97 *
98 * 06_0E Core Duo, Core Solo
99 *
100 * 06_0F Xeon 3000, 3200, 5100, 5300, 7300 series,
101 * Core 2 Quad, Core 2 Extreme, Core 2 Duo,
102 * Pentium dual-core
103 * 06_17 Xeon 5200, 5400 series, Core 2 Quad Q9650
104 *
105 * 06_1C Atom
106 *
107 * 0F_00, 0F_01, 0F_02 Xeon, Xeon MP, Pentium 4
108 * 0F_03, 0F_04 Xeon, Xeon MP, Pentium 4, Pentium D
109 *
110 * 0F_06 Xeon 7100, 5000 Series, Xeon MP,
111 * Pentium 4, Pentium D
112 */
113
114/* Register processors bits */
115enum cpu_processor_bit {
116 CPU_NONE,
117/* Intel */
118 CPU_INTEL_PENTIUM_BIT,
119 CPU_INTEL_P6_BIT,
120 CPU_INTEL_PENTIUM_M_BIT,
121 CPU_INTEL_CORE_BIT,
122 CPU_INTEL_CORE2_BIT,
123 CPU_INTEL_ATOM_BIT,
124 CPU_INTEL_XEON_P4_BIT,
125 CPU_INTEL_XEON_MP_BIT,
126};
127
128#define CPU_ALL (~0) /* Select all CPUs */
129
130#define CPU_INTEL_PENTIUM (1 << CPU_INTEL_PENTIUM_BIT)
131#define CPU_INTEL_P6 (1 << CPU_INTEL_P6_BIT)
132#define CPU_INTEL_PENTIUM_M (1 << CPU_INTEL_PENTIUM_M_BIT)
133#define CPU_INTEL_CORE (1 << CPU_INTEL_CORE_BIT)
134#define CPU_INTEL_CORE2 (1 << CPU_INTEL_CORE2_BIT)
135#define CPU_INTEL_ATOM (1 << CPU_INTEL_ATOM_BIT)
136#define CPU_INTEL_XEON_P4 (1 << CPU_INTEL_XEON_P4_BIT)
137#define CPU_INTEL_XEON_MP (1 << CPU_INTEL_XEON_MP_BIT)
138
139#define CPU_INTEL_PX (CPU_INTEL_P6 | CPU_INTEL_PENTIUM_M)
140#define CPU_INTEL_COREX (CPU_INTEL_CORE | CPU_INTEL_CORE2)
141#define CPU_INTEL_XEON (CPU_INTEL_XEON_P4 | CPU_INTEL_XEON_MP)
142#define CPU_CO_AT (CPU_INTEL_CORE | CPU_INTEL_ATOM)
143#define CPU_C2_AT (CPU_INTEL_CORE2 | CPU_INTEL_ATOM)
144#define CPU_CX_AT (CPU_INTEL_COREX | CPU_INTEL_ATOM)
145#define CPU_CX_XE (CPU_INTEL_COREX | CPU_INTEL_XEON)
146#define CPU_P6_XE (CPU_INTEL_P6 | CPU_INTEL_XEON)
147#define CPU_PM_CO_AT (CPU_INTEL_PENTIUM_M | CPU_CO_AT)
148#define CPU_C2_AT_XE (CPU_C2_AT | CPU_INTEL_XEON)
149#define CPU_CX_AT_XE (CPU_CX_AT | CPU_INTEL_XEON)
150#define CPU_P6_CX_AT (CPU_INTEL_P6 | CPU_CX_AT)
151#define CPU_P6_CX_XE (CPU_P6_XE | CPU_INTEL_COREX)
152#define CPU_P6_CX_AT_XE (CPU_INTEL_P6 | CPU_CX_AT_XE)
153#define CPU_PM_CX_AT_XE (CPU_INTEL_PENTIUM_M | CPU_CX_AT_XE)
154#define CPU_PM_CX_AT (CPU_INTEL_PENTIUM_M | CPU_CX_AT)
155#define CPU_PM_CX_XE (CPU_INTEL_PENTIUM_M | CPU_CX_XE)
156#define CPU_PX_CX_AT (CPU_INTEL_PX | CPU_CX_AT)
157#define CPU_PX_CX_AT_XE (CPU_INTEL_PX | CPU_CX_AT_XE)
158
159/* Select all Intel CPUs*/
160#define CPU_INTEL_ALL (CPU_INTEL_PENTIUM | CPU_PX_CX_AT_XE)
161
162#define MAX_CPU_FILES 512
163
164struct cpu_private {
165 unsigned cpu;
166 unsigned type;
167 unsigned reg;
168 unsigned file;
169};
170
171struct cpu_debug_base {
172 char *name; /* Register name */
173 unsigned flag; /* Register flag */
174};
175
176struct cpu_cpuX_base {
177 struct dentry *dentry; /* Register dentry */
178 int init; /* Register index file */
179};
180
181struct cpu_file_base {
182 char *name; /* Register file name */
183 unsigned flag; /* Register file flag */
184};
185
186struct cpu_debug_range {
187 unsigned min; /* Register range min */
188 unsigned max; /* Register range max */
189 unsigned flag; /* Supported flags */
190 unsigned model; /* Supported models */
191};
192
193#endif /* _ASM_X86_CPU_DEBUG_H */
diff --git a/arch/x86/include/asm/cpumask.h b/arch/x86/include/asm/cpumask.h
new file mode 100644
index 000000000000..a7f3c75f8ad7
--- /dev/null
+++ b/arch/x86/include/asm/cpumask.h
@@ -0,0 +1,32 @@
1#ifndef _ASM_X86_CPUMASK_H
2#define _ASM_X86_CPUMASK_H
3#ifndef __ASSEMBLY__
4#include <linux/cpumask.h>
5
6#ifdef CONFIG_X86_64
7
8extern cpumask_var_t cpu_callin_mask;
9extern cpumask_var_t cpu_callout_mask;
10extern cpumask_var_t cpu_initialized_mask;
11extern cpumask_var_t cpu_sibling_setup_mask;
12
13extern void setup_cpu_local_masks(void);
14
15#else /* CONFIG_X86_32 */
16
17extern cpumask_t cpu_callin_map;
18extern cpumask_t cpu_callout_map;
19extern cpumask_t cpu_initialized;
20extern cpumask_t cpu_sibling_setup_map;
21
22#define cpu_callin_mask ((struct cpumask *)&cpu_callin_map)
23#define cpu_callout_mask ((struct cpumask *)&cpu_callout_map)
24#define cpu_initialized_mask ((struct cpumask *)&cpu_initialized)
25#define cpu_sibling_setup_mask ((struct cpumask *)&cpu_sibling_setup_map)
26
27static inline void setup_cpu_local_masks(void) { }
28
29#endif /* CONFIG_X86_32 */
30
31#endif /* __ASSEMBLY__ */
32#endif /* _ASM_X86_CPUMASK_H */
diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h
index 0930b4f8d672..c68c361697e1 100644
--- a/arch/x86/include/asm/current.h
+++ b/arch/x86/include/asm/current.h
@@ -1,39 +1,21 @@
1#ifndef _ASM_X86_CURRENT_H 1#ifndef _ASM_X86_CURRENT_H
2#define _ASM_X86_CURRENT_H 2#define _ASM_X86_CURRENT_H
3 3
4#ifdef CONFIG_X86_32
5#include <linux/compiler.h> 4#include <linux/compiler.h>
6#include <asm/percpu.h> 5#include <asm/percpu.h>
7 6
7#ifndef __ASSEMBLY__
8struct task_struct; 8struct task_struct;
9 9
10DECLARE_PER_CPU(struct task_struct *, current_task); 10DECLARE_PER_CPU(struct task_struct *, current_task);
11static __always_inline struct task_struct *get_current(void)
12{
13 return x86_read_percpu(current_task);
14}
15
16#else /* X86_32 */
17
18#ifndef __ASSEMBLY__
19#include <asm/pda.h>
20
21struct task_struct;
22 11
23static __always_inline struct task_struct *get_current(void) 12static __always_inline struct task_struct *get_current(void)
24{ 13{
25 return read_pda(pcurrent); 14 return percpu_read(current_task);
26} 15}
27 16
28#else /* __ASSEMBLY__ */ 17#define current get_current()
29
30#include <asm/asm-offsets.h>
31#define GET_CURRENT(reg) movq %gs:(pda_pcurrent),reg
32 18
33#endif /* __ASSEMBLY__ */ 19#endif /* __ASSEMBLY__ */
34 20
35#endif /* X86_32 */
36
37#define current get_current()
38
39#endif /* _ASM_X86_CURRENT_H */ 21#endif /* _ASM_X86_CURRENT_H */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index dc27705f5443..5623c50d67b2 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -91,7 +91,6 @@ static inline int desc_empty(const void *ptr)
91#define store_gdt(dtr) native_store_gdt(dtr) 91#define store_gdt(dtr) native_store_gdt(dtr)
92#define store_idt(dtr) native_store_idt(dtr) 92#define store_idt(dtr) native_store_idt(dtr)
93#define store_tr(tr) (tr = native_store_tr()) 93#define store_tr(tr) (tr = native_store_tr())
94#define store_ldt(ldt) asm("sldt %0":"=m" (ldt))
95 94
96#define load_TLS(t, cpu) native_load_tls(t, cpu) 95#define load_TLS(t, cpu) native_load_tls(t, cpu)
97#define set_ldt native_set_ldt 96#define set_ldt native_set_ldt
@@ -112,6 +111,8 @@ static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
112} 111}
113#endif /* CONFIG_PARAVIRT */ 112#endif /* CONFIG_PARAVIRT */
114 113
114#define store_ldt(ldt) asm("sldt %0" : "=m"(ldt))
115
115static inline void native_write_idt_entry(gate_desc *idt, int entry, 116static inline void native_write_idt_entry(gate_desc *idt, int entry,
116 const gate_desc *gate) 117 const gate_desc *gate)
117{ 118{
diff --git a/arch/x86/include/asm/mach-default/do_timer.h b/arch/x86/include/asm/do_timer.h
index 23ecda0b28a0..23ecda0b28a0 100644
--- a/arch/x86/include/asm/mach-default/do_timer.h
+++ b/arch/x86/include/asm/do_timer.h
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index f51a3ddde01a..83c1bc8d2e8a 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -112,7 +112,7 @@ extern unsigned int vdso_enabled;
112 * now struct_user_regs, they are different) 112 * now struct_user_regs, they are different)
113 */ 113 */
114 114
115#define ELF_CORE_COPY_REGS(pr_reg, regs) \ 115#define ELF_CORE_COPY_REGS_COMMON(pr_reg, regs) \
116do { \ 116do { \
117 pr_reg[0] = regs->bx; \ 117 pr_reg[0] = regs->bx; \
118 pr_reg[1] = regs->cx; \ 118 pr_reg[1] = regs->cx; \
@@ -124,7 +124,6 @@ do { \
124 pr_reg[7] = regs->ds & 0xffff; \ 124 pr_reg[7] = regs->ds & 0xffff; \
125 pr_reg[8] = regs->es & 0xffff; \ 125 pr_reg[8] = regs->es & 0xffff; \
126 pr_reg[9] = regs->fs & 0xffff; \ 126 pr_reg[9] = regs->fs & 0xffff; \
127 savesegment(gs, pr_reg[10]); \
128 pr_reg[11] = regs->orig_ax; \ 127 pr_reg[11] = regs->orig_ax; \
129 pr_reg[12] = regs->ip; \ 128 pr_reg[12] = regs->ip; \
130 pr_reg[13] = regs->cs & 0xffff; \ 129 pr_reg[13] = regs->cs & 0xffff; \
@@ -133,6 +132,18 @@ do { \
133 pr_reg[16] = regs->ss & 0xffff; \ 132 pr_reg[16] = regs->ss & 0xffff; \
134} while (0); 133} while (0);
135 134
135#define ELF_CORE_COPY_REGS(pr_reg, regs) \
136do { \
137 ELF_CORE_COPY_REGS_COMMON(pr_reg, regs);\
138 pr_reg[10] = get_user_gs(regs); \
139} while (0);
140
141#define ELF_CORE_COPY_KERNEL_REGS(pr_reg, regs) \
142do { \
143 ELF_CORE_COPY_REGS_COMMON(pr_reg, regs);\
144 savesegment(gs, pr_reg[10]); \
145} while (0);
146
136#define ELF_PLATFORM (utsname()->machine) 147#define ELF_PLATFORM (utsname()->machine)
137#define set_personality_64bit() do { } while (0) 148#define set_personality_64bit() do { } while (0)
138 149
diff --git a/arch/x86/include/asm/mach-default/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 6b1add8e31dd..c2e6bedaf258 100644
--- a/arch/x86/include/asm/mach-default/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -9,14 +9,32 @@
9 * is no hardware IRQ pin equivalent for them, they are triggered 9 * is no hardware IRQ pin equivalent for them, they are triggered
10 * through the ICC by us (IPIs) 10 * through the ICC by us (IPIs)
11 */ 11 */
12#ifdef CONFIG_X86_SMP 12#ifdef CONFIG_SMP
13BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR) 13BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
14BUILD_INTERRUPT(invalidate_interrupt,INVALIDATE_TLB_VECTOR)
15BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) 14BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
16BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) 15BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
17BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) 16BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
17
18BUILD_INTERRUPT3(invalidate_interrupt0,INVALIDATE_TLB_VECTOR_START+0,
19 smp_invalidate_interrupt)
20BUILD_INTERRUPT3(invalidate_interrupt1,INVALIDATE_TLB_VECTOR_START+1,
21 smp_invalidate_interrupt)
22BUILD_INTERRUPT3(invalidate_interrupt2,INVALIDATE_TLB_VECTOR_START+2,
23 smp_invalidate_interrupt)
24BUILD_INTERRUPT3(invalidate_interrupt3,INVALIDATE_TLB_VECTOR_START+3,
25 smp_invalidate_interrupt)
26BUILD_INTERRUPT3(invalidate_interrupt4,INVALIDATE_TLB_VECTOR_START+4,
27 smp_invalidate_interrupt)
28BUILD_INTERRUPT3(invalidate_interrupt5,INVALIDATE_TLB_VECTOR_START+5,
29 smp_invalidate_interrupt)
30BUILD_INTERRUPT3(invalidate_interrupt6,INVALIDATE_TLB_VECTOR_START+6,
31 smp_invalidate_interrupt)
32BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7,
33 smp_invalidate_interrupt)
18#endif 34#endif
19 35
36BUILD_INTERRUPT(generic_interrupt, GENERIC_INTERRUPT_VECTOR)
37
20/* 38/*
21 * every pentium local APIC has two 'local interrupts', with a 39 * every pentium local APIC has two 'local interrupts', with a
22 * soft-definable vector attached to both interrupts, one of 40 * soft-definable vector attached to both interrupts, one of
@@ -25,10 +43,15 @@ BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
25 * a much simpler SMP time architecture: 43 * a much simpler SMP time architecture:
26 */ 44 */
27#ifdef CONFIG_X86_LOCAL_APIC 45#ifdef CONFIG_X86_LOCAL_APIC
46
28BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR) 47BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
29BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) 48BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
30BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) 49BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
31 50
51#ifdef CONFIG_PERF_COUNTERS
52BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR)
53#endif
54
32#ifdef CONFIG_X86_MCE_P4THERMAL 55#ifdef CONFIG_X86_MCE_P4THERMAL
33BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) 56BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
34#endif 57#endif
diff --git a/arch/x86/include/asm/es7000/apic.h b/arch/x86/include/asm/es7000/apic.h
deleted file mode 100644
index c58b9cc74465..000000000000
--- a/arch/x86/include/asm/es7000/apic.h
+++ /dev/null
@@ -1,242 +0,0 @@
1#ifndef __ASM_ES7000_APIC_H
2#define __ASM_ES7000_APIC_H
3
4#include <linux/gfp.h>
5
6#define xapic_phys_to_log_apicid(cpu) per_cpu(x86_bios_cpu_apicid, cpu)
7#define esr_disable (1)
8
9static inline int apic_id_registered(void)
10{
11 return (1);
12}
13
14static inline const cpumask_t *target_cpus_cluster(void)
15{
16 return &CPU_MASK_ALL;
17}
18
19static inline const cpumask_t *target_cpus(void)
20{
21 return &cpumask_of_cpu(smp_processor_id());
22}
23
24#define APIC_DFR_VALUE_CLUSTER (APIC_DFR_CLUSTER)
25#define INT_DELIVERY_MODE_CLUSTER (dest_LowestPrio)
26#define INT_DEST_MODE_CLUSTER (1) /* logical delivery broadcast to all procs */
27#define NO_BALANCE_IRQ_CLUSTER (1)
28
29#define APIC_DFR_VALUE (APIC_DFR_FLAT)
30#define INT_DELIVERY_MODE (dest_Fixed)
31#define INT_DEST_MODE (0) /* phys delivery to target procs */
32#define NO_BALANCE_IRQ (0)
33#undef APIC_DEST_LOGICAL
34#define APIC_DEST_LOGICAL 0x0
35
36static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
37{
38 return 0;
39}
40static inline unsigned long check_apicid_present(int bit)
41{
42 return physid_isset(bit, phys_cpu_present_map);
43}
44
45#define apicid_cluster(apicid) (apicid & 0xF0)
46
47static inline unsigned long calculate_ldr(int cpu)
48{
49 unsigned long id;
50 id = xapic_phys_to_log_apicid(cpu);
51 return (SET_APIC_LOGICAL_ID(id));
52}
53
54/*
55 * Set up the logical destination ID.
56 *
57 * Intel recommends to set DFR, LdR and TPR before enabling
58 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
59 * document number 292116). So here it goes...
60 */
61static inline void init_apic_ldr_cluster(void)
62{
63 unsigned long val;
64 int cpu = smp_processor_id();
65
66 apic_write(APIC_DFR, APIC_DFR_VALUE_CLUSTER);
67 val = calculate_ldr(cpu);
68 apic_write(APIC_LDR, val);
69}
70
71static inline void init_apic_ldr(void)
72{
73 unsigned long val;
74 int cpu = smp_processor_id();
75
76 apic_write(APIC_DFR, APIC_DFR_VALUE);
77 val = calculate_ldr(cpu);
78 apic_write(APIC_LDR, val);
79}
80
81extern int apic_version [MAX_APICS];
82static inline void setup_apic_routing(void)
83{
84 int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id());
85 printk("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
86 (apic_version[apic] == 0x14) ?
87 "Physical Cluster" : "Logical Cluster",
88 nr_ioapics, cpus_addr(*target_cpus())[0]);
89}
90
91static inline int multi_timer_check(int apic, int irq)
92{
93 return 0;
94}
95
96static inline int apicid_to_node(int logical_apicid)
97{
98 return 0;
99}
100
101
102static inline int cpu_present_to_apicid(int mps_cpu)
103{
104 if (!mps_cpu)
105 return boot_cpu_physical_apicid;
106 else if (mps_cpu < nr_cpu_ids)
107 return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
108 else
109 return BAD_APICID;
110}
111
112static inline physid_mask_t apicid_to_cpu_present(int phys_apicid)
113{
114 static int id = 0;
115 physid_mask_t mask;
116 mask = physid_mask_of_physid(id);
117 ++id;
118 return mask;
119}
120
121extern u8 cpu_2_logical_apicid[];
122/* Mapping from cpu number to logical apicid */
123static inline int cpu_to_logical_apicid(int cpu)
124{
125#ifdef CONFIG_SMP
126 if (cpu >= nr_cpu_ids)
127 return BAD_APICID;
128 return (int)cpu_2_logical_apicid[cpu];
129#else
130 return logical_smp_processor_id();
131#endif
132}
133
134static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map)
135{
136 /* For clustered we don't have a good way to do this yet - hack */
137 return physids_promote(0xff);
138}
139
140
141static inline void setup_portio_remap(void)
142{
143}
144
145extern unsigned int boot_cpu_physical_apicid;
146static inline int check_phys_apicid_present(int cpu_physical_apicid)
147{
148 boot_cpu_physical_apicid = read_apic_id();
149 return (1);
150}
151
152static inline unsigned int
153cpu_mask_to_apicid_cluster(const struct cpumask *cpumask)
154{
155 int num_bits_set;
156 int cpus_found = 0;
157 int cpu;
158 int apicid;
159
160 num_bits_set = cpumask_weight(cpumask);
161 /* Return id to all */
162 if (num_bits_set == nr_cpu_ids)
163 return 0xFF;
164 /*
165 * The cpus in the mask must all be on the apic cluster. If are not
166 * on the same apicid cluster return default value of TARGET_CPUS.
167 */
168 cpu = cpumask_first(cpumask);
169 apicid = cpu_to_logical_apicid(cpu);
170 while (cpus_found < num_bits_set) {
171 if (cpumask_test_cpu(cpu, cpumask)) {
172 int new_apicid = cpu_to_logical_apicid(cpu);
173 if (apicid_cluster(apicid) !=
174 apicid_cluster(new_apicid)){
175 printk ("%s: Not a valid mask!\n", __func__);
176 return 0xFF;
177 }
178 apicid = new_apicid;
179 cpus_found++;
180 }
181 cpu++;
182 }
183 return apicid;
184}
185
186static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask)
187{
188 int num_bits_set;
189 int cpus_found = 0;
190 int cpu;
191 int apicid;
192
193 num_bits_set = cpus_weight(*cpumask);
194 /* Return id to all */
195 if (num_bits_set == nr_cpu_ids)
196 return cpu_to_logical_apicid(0);
197 /*
198 * The cpus in the mask must all be on the apic cluster. If are not
199 * on the same apicid cluster return default value of TARGET_CPUS.
200 */
201 cpu = first_cpu(*cpumask);
202 apicid = cpu_to_logical_apicid(cpu);
203 while (cpus_found < num_bits_set) {
204 if (cpu_isset(cpu, *cpumask)) {
205 int new_apicid = cpu_to_logical_apicid(cpu);
206 if (apicid_cluster(apicid) !=
207 apicid_cluster(new_apicid)){
208 printk ("%s: Not a valid mask!\n", __func__);
209 return cpu_to_logical_apicid(0);
210 }
211 apicid = new_apicid;
212 cpus_found++;
213 }
214 cpu++;
215 }
216 return apicid;
217}
218
219
220static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *inmask,
221 const struct cpumask *andmask)
222{
223 int apicid = cpu_to_logical_apicid(0);
224 cpumask_var_t cpumask;
225
226 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
227 return apicid;
228
229 cpumask_and(cpumask, inmask, andmask);
230 cpumask_and(cpumask, cpumask, cpu_online_mask);
231 apicid = cpu_mask_to_apicid(cpumask);
232
233 free_cpumask_var(cpumask);
234 return apicid;
235}
236
237static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
238{
239 return cpuid_apic >> index_msb;
240}
241
242#endif /* __ASM_ES7000_APIC_H */
diff --git a/arch/x86/include/asm/es7000/apicdef.h b/arch/x86/include/asm/es7000/apicdef.h
deleted file mode 100644
index 8b234a3cb851..000000000000
--- a/arch/x86/include/asm/es7000/apicdef.h
+++ /dev/null
@@ -1,13 +0,0 @@
1#ifndef __ASM_ES7000_APICDEF_H
2#define __ASM_ES7000_APICDEF_H
3
4#define APIC_ID_MASK (0xFF<<24)
5
6static inline unsigned get_apic_id(unsigned long x)
7{
8 return (((x)>>24)&0xFF);
9}
10
11#define GET_APIC_ID(x) get_apic_id(x)
12
13#endif
diff --git a/arch/x86/include/asm/es7000/ipi.h b/arch/x86/include/asm/es7000/ipi.h
deleted file mode 100644
index 7e8ed24d4b8a..000000000000
--- a/arch/x86/include/asm/es7000/ipi.h
+++ /dev/null
@@ -1,22 +0,0 @@
1#ifndef __ASM_ES7000_IPI_H
2#define __ASM_ES7000_IPI_H
3
4void send_IPI_mask_sequence(const struct cpumask *mask, int vector);
5void send_IPI_mask_allbutself(const struct cpumask *mask, int vector);
6
7static inline void send_IPI_mask(const struct cpumask *mask, int vector)
8{
9 send_IPI_mask_sequence(mask, vector);
10}
11
12static inline void send_IPI_allbutself(int vector)
13{
14 send_IPI_mask_allbutself(cpu_online_mask, vector);
15}
16
17static inline void send_IPI_all(int vector)
18{
19 send_IPI_mask(cpu_online_mask, vector);
20}
21
22#endif /* __ASM_ES7000_IPI_H */
diff --git a/arch/x86/include/asm/es7000/mpparse.h b/arch/x86/include/asm/es7000/mpparse.h
deleted file mode 100644
index c1629b090ec2..000000000000
--- a/arch/x86/include/asm/es7000/mpparse.h
+++ /dev/null
@@ -1,29 +0,0 @@
1#ifndef __ASM_ES7000_MPPARSE_H
2#define __ASM_ES7000_MPPARSE_H
3
4#include <linux/acpi.h>
5
6extern int parse_unisys_oem (char *oemptr);
7extern int find_unisys_acpi_oem_table(unsigned long *oem_addr);
8extern void unmap_unisys_acpi_oem_table(unsigned long oem_addr);
9extern void setup_unisys(void);
10
11#ifndef CONFIG_X86_GENERICARCH
12extern int acpi_madt_oem_check(char *oem_id, char *oem_table_id);
13extern int mps_oem_check(struct mpc_table *mpc, char *oem, char *productid);
14#endif
15
16#ifdef CONFIG_ACPI
17
18static inline int es7000_check_dsdt(void)
19{
20 struct acpi_table_header header;
21
22 if (ACPI_SUCCESS(acpi_get_table_header(ACPI_SIG_DSDT, 0, &header)) &&
23 !strncmp(header.oem_id, "UNISYS", 6))
24 return 1;
25 return 0;
26}
27#endif
28
29#endif /* __ASM_MACH_MPPARSE_H */
diff --git a/arch/x86/include/asm/es7000/wakecpu.h b/arch/x86/include/asm/es7000/wakecpu.h
deleted file mode 100644
index 78f0daaee436..000000000000
--- a/arch/x86/include/asm/es7000/wakecpu.h
+++ /dev/null
@@ -1,37 +0,0 @@
1#ifndef __ASM_ES7000_WAKECPU_H
2#define __ASM_ES7000_WAKECPU_H
3
4#define TRAMPOLINE_PHYS_LOW 0x467
5#define TRAMPOLINE_PHYS_HIGH 0x469
6
7static inline void wait_for_init_deassert(atomic_t *deassert)
8{
9#ifndef CONFIG_ES7000_CLUSTERED_APIC
10 while (!atomic_read(deassert))
11 cpu_relax();
12#endif
13 return;
14}
15
16/* Nothing to do for most platforms, since cleared by the INIT cycle */
17static inline void smp_callin_clear_local_apic(void)
18{
19}
20
21static inline void store_NMI_vector(unsigned short *high, unsigned short *low)
22{
23}
24
25static inline void restore_NMI_vector(unsigned short *high, unsigned short *low)
26{
27}
28
29extern void __inquire_remote_apic(int apicid);
30
31static inline void inquire_remote_apic(int apicid)
32{
33 if (apic_verbosity >= APIC_DEBUG)
34 __inquire_remote_apic(apicid);
35}
36
37#endif /* __ASM_MACH_WAKECPU_H */
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 23696d44a0af..81937a5dc77c 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -1,11 +1,147 @@
1/*
2 * fixmap.h: compile-time virtual memory allocation
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (C) 1998 Ingo Molnar
9 *
10 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
11 * x86_32 and x86_64 integration by Gustavo F. Padovan, February 2009
12 */
13
1#ifndef _ASM_X86_FIXMAP_H 14#ifndef _ASM_X86_FIXMAP_H
2#define _ASM_X86_FIXMAP_H 15#define _ASM_X86_FIXMAP_H
3 16
17#ifndef __ASSEMBLY__
18#include <linux/kernel.h>
19#include <asm/acpi.h>
20#include <asm/apicdef.h>
21#include <asm/page.h>
22#ifdef CONFIG_X86_32
23#include <linux/threads.h>
24#include <asm/kmap_types.h>
25#else
26#include <asm/vsyscall.h>
27#endif
28
29/*
30 * We can't declare FIXADDR_TOP as variable for x86_64 because vsyscall
31 * uses fixmaps that relies on FIXADDR_TOP for proper address calculation.
32 * Because of this, FIXADDR_TOP x86 integration was left as later work.
33 */
34#ifdef CONFIG_X86_32
35/* used by vmalloc.c, vsyscall.lds.S.
36 *
37 * Leave one empty page between vmalloc'ed areas and
38 * the start of the fixmap.
39 */
40extern unsigned long __FIXADDR_TOP;
41#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
42
43#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO)
44#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1)
45#else
46#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
47
48/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */
49#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
50#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
51#endif
52
53
54/*
55 * Here we define all the compile-time 'special' virtual
56 * addresses. The point is to have a constant address at
57 * compile time, but to set the physical address only
58 * in the boot process.
59 * for x86_32: We allocate these special addresses
60 * from the end of virtual memory (0xfffff000) backwards.
61 * Also this lets us do fail-safe vmalloc(), we
62 * can guarantee that these special addresses and
63 * vmalloc()-ed addresses never overlap.
64 *
65 * These 'compile-time allocated' memory buffers are
66 * fixed-size 4k pages (or larger if used with an increment
67 * higher than 1). Use set_fixmap(idx,phys) to associate
68 * physical memory with fixmap indices.
69 *
70 * TLB entries of such buffers will not be flushed across
71 * task switches.
72 */
73enum fixed_addresses {
4#ifdef CONFIG_X86_32 74#ifdef CONFIG_X86_32
5# include "fixmap_32.h" 75 FIX_HOLE,
76 FIX_VDSO,
6#else 77#else
7# include "fixmap_64.h" 78 VSYSCALL_LAST_PAGE,
79 VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
80 + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
81 VSYSCALL_HPET,
8#endif 82#endif
83 FIX_DBGP_BASE,
84 FIX_EARLYCON_MEM_BASE,
85#ifdef CONFIG_X86_LOCAL_APIC
86 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
87#endif
88#ifdef CONFIG_X86_IO_APIC
89 FIX_IO_APIC_BASE_0,
90 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
91#endif
92#ifdef CONFIG_X86_VISWS_APIC
93 FIX_CO_CPU, /* Cobalt timer */
94 FIX_CO_APIC, /* Cobalt APIC Redirection Table */
95 FIX_LI_PCIA, /* Lithium PCI Bridge A */
96 FIX_LI_PCIB, /* Lithium PCI Bridge B */
97#endif
98#ifdef CONFIG_X86_F00F_BUG
99 FIX_F00F_IDT, /* Virtual mapping for IDT */
100#endif
101#ifdef CONFIG_X86_CYCLONE_TIMER
102 FIX_CYCLONE_TIMER, /*cyclone timer register*/
103#endif
104#ifdef CONFIG_X86_32
105 FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
106 FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
107#ifdef CONFIG_PCI_MMCONFIG
108 FIX_PCIE_MCFG,
109#endif
110#endif
111#ifdef CONFIG_PARAVIRT
112 FIX_PARAVIRT_BOOTMAP,
113#endif
114 FIX_TEXT_POKE0, /* reserve 2 pages for text_poke() */
115 FIX_TEXT_POKE1,
116 __end_of_permanent_fixed_addresses,
117#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
118 FIX_OHCI1394_BASE,
119#endif
120 /*
121 * 256 temporary boot-time mappings, used by early_ioremap(),
122 * before ioremap() is functional.
123 *
124 * We round it up to the next 256 pages boundary so that we
125 * can have a single pgd entry and a single pte table:
126 */
127#define NR_FIX_BTMAPS 64
128#define FIX_BTMAPS_SLOTS 4
129 FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
130 (__end_of_permanent_fixed_addresses & 255),
131 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
132#ifdef CONFIG_X86_32
133 FIX_WP_TEST,
134#endif
135 __end_of_fixed_addresses
136};
137
138
139extern void reserve_top_address(unsigned long reserve);
140
141#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
142#define FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
143#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
144#define FIXADDR_BOOT_START (FIXADDR_TOP - FIXADDR_BOOT_SIZE)
9 145
10extern int fixmaps_set; 146extern int fixmaps_set;
11 147
@@ -69,4 +205,5 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr)
69 BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START); 205 BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
70 return __virt_to_fix(vaddr); 206 return __virt_to_fix(vaddr);
71} 207}
208#endif /* !__ASSEMBLY__ */
72#endif /* _ASM_X86_FIXMAP_H */ 209#endif /* _ASM_X86_FIXMAP_H */
diff --git a/arch/x86/include/asm/fixmap_32.h b/arch/x86/include/asm/fixmap_32.h
deleted file mode 100644
index c7115c1d7217..000000000000
--- a/arch/x86/include/asm/fixmap_32.h
+++ /dev/null
@@ -1,119 +0,0 @@
1/*
2 * fixmap.h: compile-time virtual memory allocation
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (C) 1998 Ingo Molnar
9 *
10 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
11 */
12
13#ifndef _ASM_X86_FIXMAP_32_H
14#define _ASM_X86_FIXMAP_32_H
15
16
17/* used by vmalloc.c, vsyscall.lds.S.
18 *
19 * Leave one empty page between vmalloc'ed areas and
20 * the start of the fixmap.
21 */
22extern unsigned long __FIXADDR_TOP;
23#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO)
24#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1)
25
26#ifndef __ASSEMBLY__
27#include <linux/kernel.h>
28#include <asm/acpi.h>
29#include <asm/apicdef.h>
30#include <asm/page.h>
31#include <linux/threads.h>
32#include <asm/kmap_types.h>
33
34/*
35 * Here we define all the compile-time 'special' virtual
36 * addresses. The point is to have a constant address at
37 * compile time, but to set the physical address only
38 * in the boot process. We allocate these special addresses
39 * from the end of virtual memory (0xfffff000) backwards.
40 * Also this lets us do fail-safe vmalloc(), we
41 * can guarantee that these special addresses and
42 * vmalloc()-ed addresses never overlap.
43 *
44 * these 'compile-time allocated' memory buffers are
45 * fixed-size 4k pages. (or larger if used with an increment
46 * highger than 1) use fixmap_set(idx,phys) to associate
47 * physical memory with fixmap indices.
48 *
49 * TLB entries of such buffers will not be flushed across
50 * task switches.
51 */
52enum fixed_addresses {
53 FIX_HOLE,
54 FIX_VDSO,
55 FIX_DBGP_BASE,
56 FIX_EARLYCON_MEM_BASE,
57#ifdef CONFIG_X86_LOCAL_APIC
58 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
59#endif
60#ifdef CONFIG_X86_IO_APIC
61 FIX_IO_APIC_BASE_0,
62 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
63#endif
64#ifdef CONFIG_X86_VISWS_APIC
65 FIX_CO_CPU, /* Cobalt timer */
66 FIX_CO_APIC, /* Cobalt APIC Redirection Table */
67 FIX_LI_PCIA, /* Lithium PCI Bridge A */
68 FIX_LI_PCIB, /* Lithium PCI Bridge B */
69#endif
70#ifdef CONFIG_X86_F00F_BUG
71 FIX_F00F_IDT, /* Virtual mapping for IDT */
72#endif
73#ifdef CONFIG_X86_CYCLONE_TIMER
74 FIX_CYCLONE_TIMER, /*cyclone timer register*/
75#endif
76 FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
77 FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
78#ifdef CONFIG_PCI_MMCONFIG
79 FIX_PCIE_MCFG,
80#endif
81#ifdef CONFIG_PARAVIRT
82 FIX_PARAVIRT_BOOTMAP,
83#endif
84 __end_of_permanent_fixed_addresses,
85 /*
86 * 256 temporary boot-time mappings, used by early_ioremap(),
87 * before ioremap() is functional.
88 *
89 * We round it up to the next 256 pages boundary so that we
90 * can have a single pgd entry and a single pte table:
91 */
92#define NR_FIX_BTMAPS 64
93#define FIX_BTMAPS_SLOTS 4
94 FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
95 (__end_of_permanent_fixed_addresses & 255),
96 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
97 FIX_WP_TEST,
98#ifdef CONFIG_ACPI
99 FIX_ACPI_BEGIN,
100 FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
101#endif
102#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
103 FIX_OHCI1394_BASE,
104#endif
105 __end_of_fixed_addresses
106};
107
108extern void reserve_top_address(unsigned long reserve);
109
110
111#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
112
113#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
114#define __FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
115#define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE)
116#define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE)
117
118#endif /* !__ASSEMBLY__ */
119#endif /* _ASM_X86_FIXMAP_32_H */
diff --git a/arch/x86/include/asm/fixmap_64.h b/arch/x86/include/asm/fixmap_64.h
deleted file mode 100644
index 8be740977db8..000000000000
--- a/arch/x86/include/asm/fixmap_64.h
+++ /dev/null
@@ -1,79 +0,0 @@
1/*
2 * fixmap.h: compile-time virtual memory allocation
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (C) 1998 Ingo Molnar
9 */
10
11#ifndef _ASM_X86_FIXMAP_64_H
12#define _ASM_X86_FIXMAP_64_H
13
14#include <linux/kernel.h>
15#include <asm/acpi.h>
16#include <asm/apicdef.h>
17#include <asm/page.h>
18#include <asm/vsyscall.h>
19
20/*
21 * Here we define all the compile-time 'special' virtual
22 * addresses. The point is to have a constant address at
23 * compile time, but to set the physical address only
24 * in the boot process.
25 *
26 * These 'compile-time allocated' memory buffers are
27 * fixed-size 4k pages (or larger if used with an increment
28 * higher than 1). Use set_fixmap(idx,phys) to associate
29 * physical memory with fixmap indices.
30 *
31 * TLB entries of such buffers will not be flushed across
32 * task switches.
33 */
34
35enum fixed_addresses {
36 VSYSCALL_LAST_PAGE,
37 VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
38 + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
39 VSYSCALL_HPET,
40 FIX_DBGP_BASE,
41 FIX_EARLYCON_MEM_BASE,
42 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
43 FIX_IO_APIC_BASE_0,
44 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
45#ifdef CONFIG_PARAVIRT
46 FIX_PARAVIRT_BOOTMAP,
47#endif
48 __end_of_permanent_fixed_addresses,
49#ifdef CONFIG_ACPI
50 FIX_ACPI_BEGIN,
51 FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
52#endif
53#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
54 FIX_OHCI1394_BASE,
55#endif
56 /*
57 * 256 temporary boot-time mappings, used by early_ioremap(),
58 * before ioremap() is functional.
59 *
60 * We round it up to the next 256 pages boundary so that we
61 * can have a single pgd entry and a single pte table:
62 */
63#define NR_FIX_BTMAPS 64
64#define FIX_BTMAPS_SLOTS 4
65 FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
66 (__end_of_permanent_fixed_addresses & 255),
67 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
68 __end_of_fixed_addresses
69};
70
71#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
72#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
73#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
74
75/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */
76#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
77#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
78
79#endif /* _ASM_X86_FIXMAP_64_H */
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index b55b4a7fbefd..bd2c6511c887 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -28,6 +28,13 @@
28 28
29#endif 29#endif
30 30
31/* FIXME: I don't want to stay hardcoded */
32#ifdef CONFIG_X86_64
33# define FTRACE_SYSCALL_MAX 296
34#else
35# define FTRACE_SYSCALL_MAX 333
36#endif
37
31#ifdef CONFIG_FUNCTION_TRACER 38#ifdef CONFIG_FUNCTION_TRACER
32#define MCOUNT_ADDR ((long)(mcount)) 39#define MCOUNT_ADDR ((long)(mcount))
33#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ 40#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */
@@ -55,29 +62,4 @@ struct dyn_arch_ftrace {
55#endif /* __ASSEMBLY__ */ 62#endif /* __ASSEMBLY__ */
56#endif /* CONFIG_FUNCTION_TRACER */ 63#endif /* CONFIG_FUNCTION_TRACER */
57 64
58#ifdef CONFIG_FUNCTION_GRAPH_TRACER
59
60#ifndef __ASSEMBLY__
61
62/*
63 * Stack of return addresses for functions
64 * of a thread.
65 * Used in struct thread_info
66 */
67struct ftrace_ret_stack {
68 unsigned long ret;
69 unsigned long func;
70 unsigned long long calltime;
71};
72
73/*
74 * Primary handler of a function return.
75 * It relays on ftrace_return_to_handler.
76 * Defined in entry_32/64.S
77 */
78extern void return_to_handler(void);
79
80#endif /* __ASSEMBLY__ */
81#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
82
83#endif /* _ASM_X86_FTRACE_H */ 65#endif /* _ASM_X86_FTRACE_H */
diff --git a/arch/x86/include/asm/genapic.h b/arch/x86/include/asm/genapic.h
index d48bee663a6f..4b8b98fa7f25 100644
--- a/arch/x86/include/asm/genapic.h
+++ b/arch/x86/include/asm/genapic.h
@@ -1,5 +1 @@
1#ifdef CONFIG_X86_32 #include <asm/apic.h>
2# include "genapic_32.h"
3#else
4# include "genapic_64.h"
5#endif
diff --git a/arch/x86/include/asm/genapic_32.h b/arch/x86/include/asm/genapic_32.h
deleted file mode 100644
index 2c05b737ee22..000000000000
--- a/arch/x86/include/asm/genapic_32.h
+++ /dev/null
@@ -1,148 +0,0 @@
1#ifndef _ASM_X86_GENAPIC_32_H
2#define _ASM_X86_GENAPIC_32_H
3
4#include <asm/mpspec.h>
5#include <asm/atomic.h>
6
7/*
8 * Generic APIC driver interface.
9 *
10 * An straight forward mapping of the APIC related parts of the
11 * x86 subarchitecture interface to a dynamic object.
12 *
13 * This is used by the "generic" x86 subarchitecture.
14 *
15 * Copyright 2003 Andi Kleen, SuSE Labs.
16 */
17
18struct mpc_bus;
19struct mpc_table;
20struct mpc_cpu;
21
22struct genapic {
23 char *name;
24 int (*probe)(void);
25
26 int (*apic_id_registered)(void);
27 const struct cpumask *(*target_cpus)(void);
28 int int_delivery_mode;
29 int int_dest_mode;
30 int ESR_DISABLE;
31 int apic_destination_logical;
32 unsigned long (*check_apicid_used)(physid_mask_t bitmap, int apicid);
33 unsigned long (*check_apicid_present)(int apicid);
34 int no_balance_irq;
35 int no_ioapic_check;
36 void (*init_apic_ldr)(void);
37 physid_mask_t (*ioapic_phys_id_map)(physid_mask_t map);
38
39 void (*setup_apic_routing)(void);
40 int (*multi_timer_check)(int apic, int irq);
41 int (*apicid_to_node)(int logical_apicid);
42 int (*cpu_to_logical_apicid)(int cpu);
43 int (*cpu_present_to_apicid)(int mps_cpu);
44 physid_mask_t (*apicid_to_cpu_present)(int phys_apicid);
45 void (*setup_portio_remap)(void);
46 int (*check_phys_apicid_present)(int boot_cpu_physical_apicid);
47 void (*enable_apic_mode)(void);
48 u32 (*phys_pkg_id)(u32 cpuid_apic, int index_msb);
49
50 /* mpparse */
51 /* When one of the next two hooks returns 1 the genapic
52 is switched to this. Essentially they are additional probe
53 functions. */
54 int (*mps_oem_check)(struct mpc_table *mpc, char *oem,
55 char *productid);
56 int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
57
58 unsigned (*get_apic_id)(unsigned long x);
59 unsigned long apic_id_mask;
60 unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask);
61 unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask,
62 const struct cpumask *andmask);
63 void (*vector_allocation_domain)(int cpu, struct cpumask *retmask);
64
65#ifdef CONFIG_SMP
66 /* ipi */
67 void (*send_IPI_mask)(const struct cpumask *mask, int vector);
68 void (*send_IPI_mask_allbutself)(const struct cpumask *mask,
69 int vector);
70 void (*send_IPI_allbutself)(int vector);
71 void (*send_IPI_all)(int vector);
72#endif
73 int (*wakeup_cpu)(int apicid, unsigned long start_eip);
74 int trampoline_phys_low;
75 int trampoline_phys_high;
76 void (*wait_for_init_deassert)(atomic_t *deassert);
77 void (*smp_callin_clear_local_apic)(void);
78 void (*store_NMI_vector)(unsigned short *high, unsigned short *low);
79 void (*restore_NMI_vector)(unsigned short *high, unsigned short *low);
80 void (*inquire_remote_apic)(int apicid);
81};
82
83#define APICFUNC(x) .x = x,
84
85/* More functions could be probably marked IPIFUNC and save some space
86 in UP GENERICARCH kernels, but I don't have the nerve right now
87 to untangle this mess. -AK */
88#ifdef CONFIG_SMP
89#define IPIFUNC(x) APICFUNC(x)
90#else
91#define IPIFUNC(x)
92#endif
93
94#define APIC_INIT(aname, aprobe) \
95{ \
96 .name = aname, \
97 .probe = aprobe, \
98 .int_delivery_mode = INT_DELIVERY_MODE, \
99 .int_dest_mode = INT_DEST_MODE, \
100 .no_balance_irq = NO_BALANCE_IRQ, \
101 .ESR_DISABLE = esr_disable, \
102 .apic_destination_logical = APIC_DEST_LOGICAL, \
103 APICFUNC(apic_id_registered) \
104 APICFUNC(target_cpus) \
105 APICFUNC(check_apicid_used) \
106 APICFUNC(check_apicid_present) \
107 APICFUNC(init_apic_ldr) \
108 APICFUNC(ioapic_phys_id_map) \
109 APICFUNC(setup_apic_routing) \
110 APICFUNC(multi_timer_check) \
111 APICFUNC(apicid_to_node) \
112 APICFUNC(cpu_to_logical_apicid) \
113 APICFUNC(cpu_present_to_apicid) \
114 APICFUNC(apicid_to_cpu_present) \
115 APICFUNC(setup_portio_remap) \
116 APICFUNC(check_phys_apicid_present) \
117 APICFUNC(mps_oem_check) \
118 APICFUNC(get_apic_id) \
119 .apic_id_mask = APIC_ID_MASK, \
120 APICFUNC(cpu_mask_to_apicid) \
121 APICFUNC(cpu_mask_to_apicid_and) \
122 APICFUNC(vector_allocation_domain) \
123 APICFUNC(acpi_madt_oem_check) \
124 IPIFUNC(send_IPI_mask) \
125 IPIFUNC(send_IPI_allbutself) \
126 IPIFUNC(send_IPI_all) \
127 APICFUNC(enable_apic_mode) \
128 APICFUNC(phys_pkg_id) \
129 .trampoline_phys_low = TRAMPOLINE_PHYS_LOW, \
130 .trampoline_phys_high = TRAMPOLINE_PHYS_HIGH, \
131 APICFUNC(wait_for_init_deassert) \
132 APICFUNC(smp_callin_clear_local_apic) \
133 APICFUNC(store_NMI_vector) \
134 APICFUNC(restore_NMI_vector) \
135 APICFUNC(inquire_remote_apic) \
136}
137
138extern struct genapic *genapic;
139extern void es7000_update_genapic_to_cluster(void);
140
141enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
142#define get_uv_system_type() UV_NONE
143#define is_uv_system() 0
144#define uv_wakeup_secondary(a, b) 1
145#define uv_system_init() do {} while (0)
146
147
148#endif /* _ASM_X86_GENAPIC_32_H */
diff --git a/arch/x86/include/asm/genapic_64.h b/arch/x86/include/asm/genapic_64.h
deleted file mode 100644
index adf32fb56aa6..000000000000
--- a/arch/x86/include/asm/genapic_64.h
+++ /dev/null
@@ -1,66 +0,0 @@
1#ifndef _ASM_X86_GENAPIC_64_H
2#define _ASM_X86_GENAPIC_64_H
3
4#include <linux/cpumask.h>
5
6/*
7 * Copyright 2004 James Cleverdon, IBM.
8 * Subject to the GNU Public License, v.2
9 *
10 * Generic APIC sub-arch data struct.
11 *
12 * Hacked for x86-64 by James Cleverdon from i386 architecture code by
13 * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
14 * James Cleverdon.
15 */
16
17struct genapic {
18 char *name;
19 int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
20 u32 int_delivery_mode;
21 u32 int_dest_mode;
22 int (*apic_id_registered)(void);
23 const struct cpumask *(*target_cpus)(void);
24 void (*vector_allocation_domain)(int cpu, struct cpumask *retmask);
25 void (*init_apic_ldr)(void);
26 /* ipi */
27 void (*send_IPI_mask)(const struct cpumask *mask, int vector);
28 void (*send_IPI_mask_allbutself)(const struct cpumask *mask,
29 int vector);
30 void (*send_IPI_allbutself)(int vector);
31 void (*send_IPI_all)(int vector);
32 void (*send_IPI_self)(int vector);
33 /* */
34 unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask);
35 unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask,
36 const struct cpumask *andmask);
37 unsigned int (*phys_pkg_id)(int index_msb);
38 unsigned int (*get_apic_id)(unsigned long x);
39 unsigned long (*set_apic_id)(unsigned int id);
40 unsigned long apic_id_mask;
41 /* wakeup_secondary_cpu */
42 int (*wakeup_cpu)(int apicid, unsigned long start_eip);
43};
44
45extern struct genapic *genapic;
46
47extern struct genapic apic_flat;
48extern struct genapic apic_physflat;
49extern struct genapic apic_x2apic_cluster;
50extern struct genapic apic_x2apic_phys;
51extern int acpi_madt_oem_check(char *, char *);
52
53extern void apic_send_IPI_self(int vector);
54enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
55extern enum uv_system_type get_uv_system_type(void);
56extern int is_uv_system(void);
57
58extern struct genapic apic_x2apic_uv_x;
59DECLARE_PER_CPU(int, x2apic_extra_bits);
60extern void uv_cpu_init(void);
61extern void uv_system_init(void);
62extern int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip);
63
64extern void setup_apic_routing(void);
65
66#endif /* _ASM_X86_GENAPIC_64_H */
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 000787df66e6..039db6aa8e02 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -1,11 +1,53 @@
1#ifdef CONFIG_X86_32 1#ifndef _ASM_X86_HARDIRQ_H
2# include "hardirq_32.h" 2#define _ASM_X86_HARDIRQ_H
3#else 3
4# include "hardirq_64.h" 4#include <linux/threads.h>
5#include <linux/irq.h>
6
7typedef struct {
8 unsigned int __softirq_pending;
9 unsigned int __nmi_count; /* arch dependent */
10 unsigned int irq0_irqs;
11#ifdef CONFIG_X86_LOCAL_APIC
12 unsigned int apic_timer_irqs; /* arch dependent */
13 unsigned int irq_spurious_count;
14#endif
15 unsigned int generic_irqs; /* arch dependent */
16#ifdef CONFIG_SMP
17 unsigned int irq_resched_count;
18 unsigned int irq_call_count;
19 unsigned int irq_tlb_count;
20#endif
21#ifdef CONFIG_X86_MCE
22 unsigned int irq_thermal_count;
23# ifdef CONFIG_X86_64
24 unsigned int irq_threshold_count;
25# endif
5#endif 26#endif
27} ____cacheline_aligned irq_cpustat_t;
28
29DECLARE_PER_CPU(irq_cpustat_t, irq_stat);
30
31/* We can have at most NR_VECTORS irqs routed to a cpu at a time */
32#define MAX_HARDIRQS_PER_CPU NR_VECTORS
33
34#define __ARCH_IRQ_STAT
35
36#define inc_irq_stat(member) percpu_add(irq_stat.member, 1)
37
38#define local_softirq_pending() percpu_read(irq_stat.__softirq_pending)
39
40#define __ARCH_SET_SOFTIRQ_PENDING
41
42#define set_softirq_pending(x) percpu_write(irq_stat.__softirq_pending, (x))
43#define or_softirq_pending(x) percpu_or(irq_stat.__softirq_pending, (x))
44
45extern void ack_bad_irq(unsigned int irq);
6 46
7extern u64 arch_irq_stat_cpu(unsigned int cpu); 47extern u64 arch_irq_stat_cpu(unsigned int cpu);
8#define arch_irq_stat_cpu arch_irq_stat_cpu 48#define arch_irq_stat_cpu arch_irq_stat_cpu
9 49
10extern u64 arch_irq_stat(void); 50extern u64 arch_irq_stat(void);
11#define arch_irq_stat arch_irq_stat 51#define arch_irq_stat arch_irq_stat
52
53#endif /* _ASM_X86_HARDIRQ_H */
diff --git a/arch/x86/include/asm/hardirq_32.h b/arch/x86/include/asm/hardirq_32.h
deleted file mode 100644
index cf7954d1405f..000000000000
--- a/arch/x86/include/asm/hardirq_32.h
+++ /dev/null
@@ -1,30 +0,0 @@
1#ifndef _ASM_X86_HARDIRQ_32_H
2#define _ASM_X86_HARDIRQ_32_H
3
4#include <linux/threads.h>
5#include <linux/irq.h>
6
7typedef struct {
8 unsigned int __softirq_pending;
9 unsigned long idle_timestamp;
10 unsigned int __nmi_count; /* arch dependent */
11 unsigned int apic_timer_irqs; /* arch dependent */
12 unsigned int irq0_irqs;
13 unsigned int irq_resched_count;
14 unsigned int irq_call_count;
15 unsigned int irq_tlb_count;
16 unsigned int irq_thermal_count;
17 unsigned int irq_spurious_count;
18} ____cacheline_aligned irq_cpustat_t;
19
20DECLARE_PER_CPU(irq_cpustat_t, irq_stat);
21
22#define __ARCH_IRQ_STAT
23#define __IRQ_STAT(cpu, member) (per_cpu(irq_stat, cpu).member)
24
25#define inc_irq_stat(member) (__get_cpu_var(irq_stat).member++)
26
27void ack_bad_irq(unsigned int irq);
28#include <linux/irq_cpustat.h>
29
30#endif /* _ASM_X86_HARDIRQ_32_H */
diff --git a/arch/x86/include/asm/hardirq_64.h b/arch/x86/include/asm/hardirq_64.h
deleted file mode 100644
index b5a6b5d56704..000000000000
--- a/arch/x86/include/asm/hardirq_64.h
+++ /dev/null
@@ -1,25 +0,0 @@
1#ifndef _ASM_X86_HARDIRQ_64_H
2#define _ASM_X86_HARDIRQ_64_H
3
4#include <linux/threads.h>
5#include <linux/irq.h>
6#include <asm/pda.h>
7#include <asm/apic.h>
8
9/* We can have at most NR_VECTORS irqs routed to a cpu at a time */
10#define MAX_HARDIRQS_PER_CPU NR_VECTORS
11
12#define __ARCH_IRQ_STAT 1
13
14#define inc_irq_stat(member) add_pda(member, 1)
15
16#define local_softirq_pending() read_pda(__softirq_pending)
17
18#define __ARCH_SET_SOFTIRQ_PENDING 1
19
20#define set_softirq_pending(x) write_pda(__softirq_pending, (x))
21#define or_softirq_pending(x) or_pda(__softirq_pending, (x))
22
23extern void ack_bad_irq(unsigned int irq);
24
25#endif /* _ASM_X86_HARDIRQ_64_H */
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
index bf9276bea660..014c2b85ae45 100644
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -63,6 +63,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot);
63void *kmap_atomic(struct page *page, enum km_type type); 63void *kmap_atomic(struct page *page, enum km_type type);
64void kunmap_atomic(void *kvaddr, enum km_type type); 64void kunmap_atomic(void *kvaddr, enum km_type type);
65void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); 65void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
66void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
66struct page *kmap_atomic_to_page(void *ptr); 67struct page *kmap_atomic_to_page(void *ptr);
67 68
68#ifndef CONFIG_PARAVIRT 69#ifndef CONFIG_PARAVIRT
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 8de644b6b959..b762ea49bd70 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -25,10 +25,9 @@
25#include <asm/irq.h> 25#include <asm/irq.h>
26#include <asm/sections.h> 26#include <asm/sections.h>
27 27
28#define platform_legacy_irq(irq) ((irq) < 16)
29
30/* Interrupt handlers registered during init_IRQ */ 28/* Interrupt handlers registered during init_IRQ */
31extern void apic_timer_interrupt(void); 29extern void apic_timer_interrupt(void);
30extern void generic_interrupt(void);
32extern void error_interrupt(void); 31extern void error_interrupt(void);
33extern void spurious_interrupt(void); 32extern void spurious_interrupt(void);
34extern void thermal_interrupt(void); 33extern void thermal_interrupt(void);
@@ -58,7 +57,7 @@ extern void make_8259A_irq(unsigned int irq);
58extern void init_8259A(int aeoi); 57extern void init_8259A(int aeoi);
59 58
60/* IOAPIC */ 59/* IOAPIC */
61#define IO_APIC_IRQ(x) (((x) >= 16) || ((1<<(x)) & io_apic_irqs)) 60#define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs))
62extern unsigned long io_apic_irqs; 61extern unsigned long io_apic_irqs;
63 62
64extern void init_VISWS_APIC_irqs(void); 63extern void init_VISWS_APIC_irqs(void);
@@ -67,15 +66,7 @@ extern void disable_IO_APIC(void);
67extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn); 66extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
68extern void setup_ioapic_dest(void); 67extern void setup_ioapic_dest(void);
69 68
70#ifdef CONFIG_X86_64
71extern void enable_IO_APIC(void); 69extern void enable_IO_APIC(void);
72#endif
73
74/* IPI functions */
75#ifdef CONFIG_X86_32
76extern void send_IPI_self(int vector);
77#endif
78extern void send_IPI(int dest, int vector);
79 70
80/* Statistics */ 71/* Statistics */
81extern atomic_t irq_err_count; 72extern atomic_t irq_err_count;
@@ -84,21 +75,11 @@ extern atomic_t irq_mis_count;
84/* EISA */ 75/* EISA */
85extern void eisa_set_level_irq(unsigned int irq); 76extern void eisa_set_level_irq(unsigned int irq);
86 77
87/* Voyager functions */
88extern asmlinkage void vic_cpi_interrupt(void);
89extern asmlinkage void vic_sys_interrupt(void);
90extern asmlinkage void vic_cmn_interrupt(void);
91extern asmlinkage void qic_timer_interrupt(void);
92extern asmlinkage void qic_invalidate_interrupt(void);
93extern asmlinkage void qic_reschedule_interrupt(void);
94extern asmlinkage void qic_enable_irq_interrupt(void);
95extern asmlinkage void qic_call_function_interrupt(void);
96
97/* SMP */ 78/* SMP */
98extern void smp_apic_timer_interrupt(struct pt_regs *); 79extern void smp_apic_timer_interrupt(struct pt_regs *);
99extern void smp_spurious_interrupt(struct pt_regs *); 80extern void smp_spurious_interrupt(struct pt_regs *);
100extern void smp_error_interrupt(struct pt_regs *); 81extern void smp_error_interrupt(struct pt_regs *);
101#ifdef CONFIG_X86_SMP 82#ifdef CONFIG_SMP
102extern void smp_reschedule_interrupt(struct pt_regs *); 83extern void smp_reschedule_interrupt(struct pt_regs *);
103extern void smp_call_function_interrupt(struct pt_regs *); 84extern void smp_call_function_interrupt(struct pt_regs *);
104extern void smp_call_function_single_interrupt(struct pt_regs *); 85extern void smp_call_function_single_interrupt(struct pt_regs *);
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h
index 58d7091eeb1f..1a99e6c092af 100644
--- a/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@ -60,4 +60,8 @@ extern struct irq_chip i8259A_chip;
60extern void mask_8259A(void); 60extern void mask_8259A(void);
61extern void unmask_8259A(void); 61extern void unmask_8259A(void);
62 62
63#ifdef CONFIG_X86_32
64extern void init_ISA_irqs(void);
65#endif
66
63#endif /* _ASM_X86_I8259_H */ 67#endif /* _ASM_X86_I8259_H */
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
new file mode 100644
index 000000000000..36fb1a6a5109
--- /dev/null
+++ b/arch/x86/include/asm/init.h
@@ -0,0 +1,18 @@
1#ifndef _ASM_X86_INIT_32_H
2#define _ASM_X86_INIT_32_H
3
4#ifdef CONFIG_X86_32
5extern void __init early_ioremap_page_table_range_init(void);
6#endif
7
8extern unsigned long __init
9kernel_physical_mapping_init(unsigned long start,
10 unsigned long end,
11 unsigned long page_size_mask);
12
13
14extern unsigned long __initdata e820_table_start;
15extern unsigned long __meminitdata e820_table_end;
16extern unsigned long __meminitdata e820_table_top;
17
18#endif /* _ASM_X86_INIT_32_H */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 1dbbdf4be9b4..e5383e3d2f8c 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -5,6 +5,7 @@
5 5
6#include <linux/compiler.h> 6#include <linux/compiler.h>
7#include <asm-generic/int-ll64.h> 7#include <asm-generic/int-ll64.h>
8#include <asm/page.h>
8 9
9#define build_mmio_read(name, size, type, reg, barrier) \ 10#define build_mmio_read(name, size, type, reg, barrier) \
10static inline type name(const volatile void __iomem *addr) \ 11static inline type name(const volatile void __iomem *addr) \
@@ -80,6 +81,98 @@ static inline void writeq(__u64 val, volatile void __iomem *addr)
80#define readq readq 81#define readq readq
81#define writeq writeq 82#define writeq writeq
82 83
84/**
85 * virt_to_phys - map virtual addresses to physical
86 * @address: address to remap
87 *
88 * The returned physical address is the physical (CPU) mapping for
89 * the memory address given. It is only valid to use this function on
90 * addresses directly mapped or allocated via kmalloc.
91 *
92 * This function does not give bus mappings for DMA transfers. In
93 * almost all conceivable cases a device driver should not be using
94 * this function
95 */
96
97static inline phys_addr_t virt_to_phys(volatile void *address)
98{
99 return __pa(address);
100}
101
102/**
103 * phys_to_virt - map physical address to virtual
104 * @address: address to remap
105 *
106 * The returned virtual address is a current CPU mapping for
107 * the memory address given. It is only valid to use this function on
108 * addresses that have a kernel mapping
109 *
110 * This function does not handle bus mappings for DMA transfers. In
111 * almost all conceivable cases a device driver should not be using
112 * this function
113 */
114
115static inline void *phys_to_virt(phys_addr_t address)
116{
117 return __va(address);
118}
119
120/*
121 * Change "struct page" to physical address.
122 */
123#define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
124
125/*
126 * ISA I/O bus memory addresses are 1:1 with the physical address.
127 * However, we truncate the address to unsigned int to avoid undesirable
128 * promitions in legacy drivers.
129 */
130static inline unsigned int isa_virt_to_bus(volatile void *address)
131{
132 return (unsigned int)virt_to_phys(address);
133}
134#define isa_page_to_bus(page) ((unsigned int)page_to_phys(page))
135#define isa_bus_to_virt phys_to_virt
136
137/*
138 * However PCI ones are not necessarily 1:1 and therefore these interfaces
139 * are forbidden in portable PCI drivers.
140 *
141 * Allow them on x86 for legacy drivers, though.
142 */
143#define virt_to_bus virt_to_phys
144#define bus_to_virt phys_to_virt
145
146/**
147 * ioremap - map bus memory into CPU space
148 * @offset: bus address of the memory
149 * @size: size of the resource to map
150 *
151 * ioremap performs a platform specific sequence of operations to
152 * make bus memory CPU accessible via the readb/readw/readl/writeb/
153 * writew/writel functions and the other mmio helpers. The returned
154 * address is not guaranteed to be usable directly as a virtual
155 * address.
156 *
157 * If the area you are trying to map is a PCI BAR you should have a
158 * look at pci_iomap().
159 */
160extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
161extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
162extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
163 unsigned long prot_val);
164
165/*
166 * The default ioremap() behavior is non-cached:
167 */
168static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
169{
170 return ioremap_nocache(offset, size);
171}
172
173extern void iounmap(volatile void __iomem *addr);
174
175
83#ifdef CONFIG_X86_32 176#ifdef CONFIG_X86_32
84# include "io_32.h" 177# include "io_32.h"
85#else 178#else
@@ -91,7 +184,7 @@ extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr);
91 184
92extern int ioremap_change_attr(unsigned long vaddr, unsigned long size, 185extern int ioremap_change_attr(unsigned long vaddr, unsigned long size,
93 unsigned long prot_val); 186 unsigned long prot_val);
94extern void __iomem *ioremap_wc(unsigned long offset, unsigned long size); 187extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size);
95 188
96/* 189/*
97 * early_ioremap() and early_iounmap() are for temporary early boot-time 190 * early_ioremap() and early_iounmap() are for temporary early boot-time
@@ -103,7 +196,7 @@ extern void early_ioremap_reset(void);
103extern void __iomem *early_ioremap(unsigned long offset, unsigned long size); 196extern void __iomem *early_ioremap(unsigned long offset, unsigned long size);
104extern void __iomem *early_memremap(unsigned long offset, unsigned long size); 197extern void __iomem *early_memremap(unsigned long offset, unsigned long size);
105extern void early_iounmap(void __iomem *addr, unsigned long size); 198extern void early_iounmap(void __iomem *addr, unsigned long size);
106extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
107 199
200#define IO_SPACE_LIMIT 0xffff
108 201
109#endif /* _ASM_X86_IO_H */ 202#endif /* _ASM_X86_IO_H */
diff --git a/arch/x86/include/asm/io_32.h b/arch/x86/include/asm/io_32.h
index d8e242e1b396..a299900f5920 100644
--- a/arch/x86/include/asm/io_32.h
+++ b/arch/x86/include/asm/io_32.h
@@ -37,8 +37,6 @@
37 * - Arnaldo Carvalho de Melo <acme@conectiva.com.br> 37 * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
38 */ 38 */
39 39
40#define IO_SPACE_LIMIT 0xffff
41
42#define XQUAD_PORTIO_BASE 0xfe400000 40#define XQUAD_PORTIO_BASE 0xfe400000
43#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */ 41#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
44 42
@@ -53,92 +51,6 @@
53 */ 51 */
54#define xlate_dev_kmem_ptr(p) p 52#define xlate_dev_kmem_ptr(p) p
55 53
56/**
57 * virt_to_phys - map virtual addresses to physical
58 * @address: address to remap
59 *
60 * The returned physical address is the physical (CPU) mapping for
61 * the memory address given. It is only valid to use this function on
62 * addresses directly mapped or allocated via kmalloc.
63 *
64 * This function does not give bus mappings for DMA transfers. In
65 * almost all conceivable cases a device driver should not be using
66 * this function
67 */
68
69static inline unsigned long virt_to_phys(volatile void *address)
70{
71 return __pa(address);
72}
73
74/**
75 * phys_to_virt - map physical address to virtual
76 * @address: address to remap
77 *
78 * The returned virtual address is a current CPU mapping for
79 * the memory address given. It is only valid to use this function on
80 * addresses that have a kernel mapping
81 *
82 * This function does not handle bus mappings for DMA transfers. In
83 * almost all conceivable cases a device driver should not be using
84 * this function
85 */
86
87static inline void *phys_to_virt(unsigned long address)
88{
89 return __va(address);
90}
91
92/*
93 * Change "struct page" to physical address.
94 */
95#define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
96
97/**
98 * ioremap - map bus memory into CPU space
99 * @offset: bus address of the memory
100 * @size: size of the resource to map
101 *
102 * ioremap performs a platform specific sequence of operations to
103 * make bus memory CPU accessible via the readb/readw/readl/writeb/
104 * writew/writel functions and the other mmio helpers. The returned
105 * address is not guaranteed to be usable directly as a virtual
106 * address.
107 *
108 * If the area you are trying to map is a PCI BAR you should have a
109 * look at pci_iomap().
110 */
111extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
112extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
113extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
114 unsigned long prot_val);
115
116/*
117 * The default ioremap() behavior is non-cached:
118 */
119static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
120{
121 return ioremap_nocache(offset, size);
122}
123
124extern void iounmap(volatile void __iomem *addr);
125
126/*
127 * ISA I/O bus memory addresses are 1:1 with the physical address.
128 */
129#define isa_virt_to_bus virt_to_phys
130#define isa_page_to_bus page_to_phys
131#define isa_bus_to_virt phys_to_virt
132
133/*
134 * However PCI ones are not necessarily 1:1 and therefore these interfaces
135 * are forbidden in portable PCI drivers.
136 *
137 * Allow them on x86 for legacy drivers, though.
138 */
139#define virt_to_bus virt_to_phys
140#define bus_to_virt phys_to_virt
141
142static inline void 54static inline void
143memset_io(volatile void __iomem *addr, unsigned char val, int count) 55memset_io(volatile void __iomem *addr, unsigned char val, int count)
144{ 56{
diff --git a/arch/x86/include/asm/io_64.h b/arch/x86/include/asm/io_64.h
index 563c16270ba6..244067893af4 100644
--- a/arch/x86/include/asm/io_64.h
+++ b/arch/x86/include/asm/io_64.h
@@ -136,73 +136,12 @@ __OUTS(b)
136__OUTS(w) 136__OUTS(w)
137__OUTS(l) 137__OUTS(l)
138 138
139#define IO_SPACE_LIMIT 0xffff
140
141#if defined(__KERNEL__) && defined(__x86_64__) 139#if defined(__KERNEL__) && defined(__x86_64__)
142 140
143#include <linux/vmalloc.h> 141#include <linux/vmalloc.h>
144 142
145#ifndef __i386__
146/*
147 * Change virtual addresses to physical addresses and vv.
148 * These are pretty trivial
149 */
150static inline unsigned long virt_to_phys(volatile void *address)
151{
152 return __pa(address);
153}
154
155static inline void *phys_to_virt(unsigned long address)
156{
157 return __va(address);
158}
159#endif
160
161/*
162 * Change "struct page" to physical address.
163 */
164#define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
165
166#include <asm-generic/iomap.h> 143#include <asm-generic/iomap.h>
167 144
168/*
169 * This one maps high address device memory and turns off caching for that area.
170 * it's useful if some control registers are in such an area and write combining
171 * or read caching is not desirable:
172 */
173extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
174extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
175extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
176 unsigned long prot_val);
177
178/*
179 * The default ioremap() behavior is non-cached:
180 */
181static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
182{
183 return ioremap_nocache(offset, size);
184}
185
186extern void iounmap(volatile void __iomem *addr);
187
188extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
189
190/*
191 * ISA I/O bus memory addresses are 1:1 with the physical address.
192 */
193#define isa_virt_to_bus virt_to_phys
194#define isa_page_to_bus page_to_phys
195#define isa_bus_to_virt phys_to_virt
196
197/*
198 * However PCI ones are not necessarily 1:1 and therefore these interfaces
199 * are forbidden in portable PCI drivers.
200 *
201 * Allow them on x86 for legacy drivers, though.
202 */
203#define virt_to_bus virt_to_phys
204#define bus_to_virt phys_to_virt
205
206void __memcpy_fromio(void *, unsigned long, unsigned); 145void __memcpy_fromio(void *, unsigned long, unsigned);
207void __memcpy_toio(unsigned long, const void *, unsigned); 146void __memcpy_toio(unsigned long, const void *, unsigned);
208 147
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 7a1f44ac1f17..59cb4a1317b7 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -114,38 +114,16 @@ struct IR_IO_APIC_route_entry {
114extern int nr_ioapics; 114extern int nr_ioapics;
115extern int nr_ioapic_registers[MAX_IO_APICS]; 115extern int nr_ioapic_registers[MAX_IO_APICS];
116 116
117/*
118 * MP-BIOS irq configuration table structures:
119 */
120
121#define MP_MAX_IOAPIC_PIN 127 117#define MP_MAX_IOAPIC_PIN 127
122 118
123struct mp_config_ioapic {
124 unsigned long mp_apicaddr;
125 unsigned int mp_apicid;
126 unsigned char mp_type;
127 unsigned char mp_apicver;
128 unsigned char mp_flags;
129};
130
131struct mp_config_intsrc {
132 unsigned int mp_dstapic;
133 unsigned char mp_type;
134 unsigned char mp_irqtype;
135 unsigned short mp_irqflag;
136 unsigned char mp_srcbus;
137 unsigned char mp_srcbusirq;
138 unsigned char mp_dstirq;
139};
140
141/* I/O APIC entries */ 119/* I/O APIC entries */
142extern struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; 120extern struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
143 121
144/* # of MP IRQ source entries */ 122/* # of MP IRQ source entries */
145extern int mp_irq_entries; 123extern int mp_irq_entries;
146 124
147/* MP IRQ source entries */ 125/* MP IRQ source entries */
148extern struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; 126extern struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
149 127
150/* non-0 if default (table-less) MP configuration */ 128/* non-0 if default (table-less) MP configuration */
151extern int mpc_default_type; 129extern int mpc_default_type;
@@ -165,15 +143,6 @@ extern int noioapicreroute;
165/* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */ 143/* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */
166extern int timer_through_8259; 144extern int timer_through_8259;
167 145
168static inline void disable_ioapic_setup(void)
169{
170#ifdef CONFIG_PCI
171 noioapicquirk = 1;
172 noioapicreroute = -1;
173#endif
174 skip_ioapic_setup = 1;
175}
176
177/* 146/*
178 * If we use the IO-APIC for IRQ routing, disable automatic 147 * If we use the IO-APIC for IRQ routing, disable automatic
179 * assignment of PCI IRQ's. 148 * assignment of PCI IRQ's.
@@ -200,6 +169,12 @@ extern void reinit_intr_remapped_IO_APIC(int);
200 169
201extern void probe_nr_irqs_gsi(void); 170extern void probe_nr_irqs_gsi(void);
202 171
172extern int setup_ioapic_entry(int apic, int irq,
173 struct IO_APIC_route_entry *entry,
174 unsigned int destination, int trigger,
175 int polarity, int vector);
176extern void ioapic_write_entry(int apic, int pin,
177 struct IO_APIC_route_entry e);
203#else /* !CONFIG_X86_IO_APIC */ 178#else /* !CONFIG_X86_IO_APIC */
204#define io_apic_assign_pci_irqs 0 179#define io_apic_assign_pci_irqs 0
205static const int timer_through_8259 = 0; 180static const int timer_through_8259 = 0;
diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h
index c745a306f7d3..0b7228268a63 100644
--- a/arch/x86/include/asm/ipi.h
+++ b/arch/x86/include/asm/ipi.h
@@ -1,6 +1,8 @@
1#ifndef _ASM_X86_IPI_H 1#ifndef _ASM_X86_IPI_H
2#define _ASM_X86_IPI_H 2#define _ASM_X86_IPI_H
3 3
4#ifdef CONFIG_X86_LOCAL_APIC
5
4/* 6/*
5 * Copyright 2004 James Cleverdon, IBM. 7 * Copyright 2004 James Cleverdon, IBM.
6 * Subject to the GNU Public License, v.2 8 * Subject to the GNU Public License, v.2
@@ -55,8 +57,8 @@ static inline void __xapic_wait_icr_idle(void)
55 cpu_relax(); 57 cpu_relax();
56} 58}
57 59
58static inline void __send_IPI_shortcut(unsigned int shortcut, int vector, 60static inline void
59 unsigned int dest) 61__default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest)
60{ 62{
61 /* 63 /*
62 * Subtle. In the case of the 'never do double writes' workaround 64 * Subtle. In the case of the 'never do double writes' workaround
@@ -87,8 +89,8 @@ static inline void __send_IPI_shortcut(unsigned int shortcut, int vector,
87 * This is used to send an IPI with no shorthand notation (the destination is 89 * This is used to send an IPI with no shorthand notation (the destination is
88 * specified in bits 56 to 63 of the ICR). 90 * specified in bits 56 to 63 of the ICR).
89 */ 91 */
90static inline void __send_IPI_dest_field(unsigned int mask, int vector, 92static inline void
91 unsigned int dest) 93 __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest)
92{ 94{
93 unsigned long cfg; 95 unsigned long cfg;
94 96
@@ -117,41 +119,44 @@ static inline void __send_IPI_dest_field(unsigned int mask, int vector,
117 native_apic_mem_write(APIC_ICR, cfg); 119 native_apic_mem_write(APIC_ICR, cfg);
118} 120}
119 121
120static inline void send_IPI_mask_sequence(const struct cpumask *mask, 122extern void default_send_IPI_mask_sequence_phys(const struct cpumask *mask,
121 int vector) 123 int vector);
122{ 124extern void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
123 unsigned long flags; 125 int vector);
124 unsigned long query_cpu; 126extern void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
127 int vector);
128extern void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
129 int vector);
125 130
126 /* 131/* Avoid include hell */
127 * Hack. The clustered APIC addressing mode doesn't allow us to send 132#define NMI_VECTOR 0x02
128 * to an arbitrary mask, so I do a unicast to each CPU instead. 133
129 * - mbligh 134extern int no_broadcast;
130 */ 135
131 local_irq_save(flags); 136static inline void __default_local_send_IPI_allbutself(int vector)
132 for_each_cpu(query_cpu, mask) { 137{
133 __send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, query_cpu), 138 if (no_broadcast || vector == NMI_VECTOR)
134 vector, APIC_DEST_PHYSICAL); 139 apic->send_IPI_mask_allbutself(cpu_online_mask, vector);
135 } 140 else
136 local_irq_restore(flags); 141 __default_send_IPI_shortcut(APIC_DEST_ALLBUT, vector, apic->dest_logical);
137} 142}
138 143
139static inline void send_IPI_mask_allbutself(const struct cpumask *mask, 144static inline void __default_local_send_IPI_all(int vector)
140 int vector)
141{ 145{
142 unsigned long flags; 146 if (no_broadcast || vector == NMI_VECTOR)
143 unsigned int query_cpu; 147 apic->send_IPI_mask(cpu_online_mask, vector);
144 unsigned int this_cpu = smp_processor_id(); 148 else
145 149 __default_send_IPI_shortcut(APIC_DEST_ALLINC, vector, apic->dest_logical);
146 /* See Hack comment above */
147
148 local_irq_save(flags);
149 for_each_cpu(query_cpu, mask)
150 if (query_cpu != this_cpu)
151 __send_IPI_dest_field(
152 per_cpu(x86_cpu_to_apicid, query_cpu),
153 vector, APIC_DEST_PHYSICAL);
154 local_irq_restore(flags);
155} 150}
156 151
152#ifdef CONFIG_X86_32
153extern void default_send_IPI_mask_logical(const struct cpumask *mask,
154 int vector);
155extern void default_send_IPI_allbutself(int vector);
156extern void default_send_IPI_all(int vector);
157extern void default_send_IPI_self(int vector);
158#endif
159
160#endif
161
157#endif /* _ASM_X86_IPI_H */ 162#endif /* _ASM_X86_IPI_H */
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 592688ed04d3..f38481bcd455 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -36,9 +36,12 @@ static inline int irq_canonicalize(int irq)
36extern void fixup_irqs(void); 36extern void fixup_irqs(void);
37#endif 37#endif
38 38
39extern unsigned int do_IRQ(struct pt_regs *regs); 39extern void (*generic_interrupt_extension)(void);
40extern void init_IRQ(void); 40extern void init_IRQ(void);
41extern void native_init_IRQ(void); 41extern void native_init_IRQ(void);
42extern bool handle_irq(unsigned irq, struct pt_regs *regs);
43
44extern unsigned int do_IRQ(struct pt_regs *regs);
42 45
43/* Interrupt vector management */ 46/* Interrupt vector management */
44extern DECLARE_BITMAP(used_vectors, NR_VECTORS); 47extern DECLARE_BITMAP(used_vectors, NR_VECTORS);
diff --git a/arch/x86/include/asm/irq_regs.h b/arch/x86/include/asm/irq_regs.h
index 89c898ab298b..77843225b7ea 100644
--- a/arch/x86/include/asm/irq_regs.h
+++ b/arch/x86/include/asm/irq_regs.h
@@ -1,5 +1,31 @@
1#ifdef CONFIG_X86_32 1/*
2# include "irq_regs_32.h" 2 * Per-cpu current frame pointer - the location of the last exception frame on
3#else 3 * the stack, stored in the per-cpu area.
4# include "irq_regs_64.h" 4 *
5#endif 5 * Jeremy Fitzhardinge <jeremy@goop.org>
6 */
7#ifndef _ASM_X86_IRQ_REGS_H
8#define _ASM_X86_IRQ_REGS_H
9
10#include <asm/percpu.h>
11
12#define ARCH_HAS_OWN_IRQ_REGS
13
14DECLARE_PER_CPU(struct pt_regs *, irq_regs);
15
16static inline struct pt_regs *get_irq_regs(void)
17{
18 return percpu_read(irq_regs);
19}
20
21static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
22{
23 struct pt_regs *old_regs;
24
25 old_regs = get_irq_regs();
26 percpu_write(irq_regs, new_regs);
27
28 return old_regs;
29}
30
31#endif /* _ASM_X86_IRQ_REGS_32_H */
diff --git a/arch/x86/include/asm/irq_regs_32.h b/arch/x86/include/asm/irq_regs_32.h
deleted file mode 100644
index 86afd7473457..000000000000
--- a/arch/x86/include/asm/irq_regs_32.h
+++ /dev/null
@@ -1,31 +0,0 @@
1/*
2 * Per-cpu current frame pointer - the location of the last exception frame on
3 * the stack, stored in the per-cpu area.
4 *
5 * Jeremy Fitzhardinge <jeremy@goop.org>
6 */
7#ifndef _ASM_X86_IRQ_REGS_32_H
8#define _ASM_X86_IRQ_REGS_32_H
9
10#include <asm/percpu.h>
11
12#define ARCH_HAS_OWN_IRQ_REGS
13
14DECLARE_PER_CPU(struct pt_regs *, irq_regs);
15
16static inline struct pt_regs *get_irq_regs(void)
17{
18 return x86_read_percpu(irq_regs);
19}
20
21static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
22{
23 struct pt_regs *old_regs;
24
25 old_regs = get_irq_regs();
26 x86_write_percpu(irq_regs, new_regs);
27
28 return old_regs;
29}
30
31#endif /* _ASM_X86_IRQ_REGS_32_H */
diff --git a/arch/x86/include/asm/irq_regs_64.h b/arch/x86/include/asm/irq_regs_64.h
deleted file mode 100644
index 3dd9c0b70270..000000000000
--- a/arch/x86/include/asm/irq_regs_64.h
+++ /dev/null
@@ -1 +0,0 @@
1#include <asm-generic/irq_regs.h>
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index f7ff65032b9d..3cbd79bbb47c 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -1,47 +1,69 @@
1#ifndef _ASM_X86_IRQ_VECTORS_H 1#ifndef _ASM_X86_IRQ_VECTORS_H
2#define _ASM_X86_IRQ_VECTORS_H 2#define _ASM_X86_IRQ_VECTORS_H
3 3
4#include <linux/threads.h> 4/*
5 * Linux IRQ vector layout.
6 *
7 * There are 256 IDT entries (per CPU - each entry is 8 bytes) which can
8 * be defined by Linux. They are used as a jump table by the CPU when a
9 * given vector is triggered - by a CPU-external, CPU-internal or
10 * software-triggered event.
11 *
12 * Linux sets the kernel code address each entry jumps to early during
13 * bootup, and never changes them. This is the general layout of the
14 * IDT entries:
15 *
16 * Vectors 0 ... 31 : system traps and exceptions - hardcoded events
17 * Vectors 32 ... 127 : device interrupts
18 * Vector 128 : legacy int80 syscall interface
19 * Vectors 129 ... 237 : device interrupts
20 * Vectors 238 ... 255 : special interrupts
21 *
22 * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table.
23 *
24 * This file enumerates the exact layout of them:
25 */
5 26
6#define NMI_VECTOR 0x02 27#define NMI_VECTOR 0x02
7 28
8/* 29/*
9 * IDT vectors usable for external interrupt sources start 30 * IDT vectors usable for external interrupt sources start
10 * at 0x20: 31 * at 0x20:
11 */ 32 */
12#define FIRST_EXTERNAL_VECTOR 0x20 33#define FIRST_EXTERNAL_VECTOR 0x20
13 34
14#ifdef CONFIG_X86_32 35#ifdef CONFIG_X86_32
15# define SYSCALL_VECTOR 0x80 36# define SYSCALL_VECTOR 0x80
16#else 37#else
17# define IA32_SYSCALL_VECTOR 0x80 38# define IA32_SYSCALL_VECTOR 0x80
18#endif 39#endif
19 40
20/* 41/*
21 * Reserve the lowest usable priority level 0x20 - 0x2f for triggering 42 * Reserve the lowest usable priority level 0x20 - 0x2f for triggering
22 * cleanup after irq migration. 43 * cleanup after irq migration.
23 */ 44 */
24#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR 45#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR
25 46
26/* 47/*
27 * Vectors 0x30-0x3f are used for ISA interrupts. 48 * Vectors 0x30-0x3f are used for ISA interrupts.
28 */ 49 */
29#define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR + 0x10) 50#define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR + 0x10)
30#define IRQ1_VECTOR (IRQ0_VECTOR + 1) 51
31#define IRQ2_VECTOR (IRQ0_VECTOR + 2) 52#define IRQ1_VECTOR (IRQ0_VECTOR + 1)
32#define IRQ3_VECTOR (IRQ0_VECTOR + 3) 53#define IRQ2_VECTOR (IRQ0_VECTOR + 2)
33#define IRQ4_VECTOR (IRQ0_VECTOR + 4) 54#define IRQ3_VECTOR (IRQ0_VECTOR + 3)
34#define IRQ5_VECTOR (IRQ0_VECTOR + 5) 55#define IRQ4_VECTOR (IRQ0_VECTOR + 4)
35#define IRQ6_VECTOR (IRQ0_VECTOR + 6) 56#define IRQ5_VECTOR (IRQ0_VECTOR + 5)
36#define IRQ7_VECTOR (IRQ0_VECTOR + 7) 57#define IRQ6_VECTOR (IRQ0_VECTOR + 6)
37#define IRQ8_VECTOR (IRQ0_VECTOR + 8) 58#define IRQ7_VECTOR (IRQ0_VECTOR + 7)
38#define IRQ9_VECTOR (IRQ0_VECTOR + 9) 59#define IRQ8_VECTOR (IRQ0_VECTOR + 8)
39#define IRQ10_VECTOR (IRQ0_VECTOR + 10) 60#define IRQ9_VECTOR (IRQ0_VECTOR + 9)
40#define IRQ11_VECTOR (IRQ0_VECTOR + 11) 61#define IRQ10_VECTOR (IRQ0_VECTOR + 10)
41#define IRQ12_VECTOR (IRQ0_VECTOR + 12) 62#define IRQ11_VECTOR (IRQ0_VECTOR + 11)
42#define IRQ13_VECTOR (IRQ0_VECTOR + 13) 63#define IRQ12_VECTOR (IRQ0_VECTOR + 12)
43#define IRQ14_VECTOR (IRQ0_VECTOR + 14) 64#define IRQ13_VECTOR (IRQ0_VECTOR + 13)
44#define IRQ15_VECTOR (IRQ0_VECTOR + 15) 65#define IRQ14_VECTOR (IRQ0_VECTOR + 14)
66#define IRQ15_VECTOR (IRQ0_VECTOR + 15)
45 67
46/* 68/*
47 * Special IRQ vectors used by the SMP architecture, 0xf0-0xff 69 * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
@@ -49,119 +71,103 @@
49 * some of the following vectors are 'rare', they are merged 71 * some of the following vectors are 'rare', they are merged
50 * into a single vector (CALL_FUNCTION_VECTOR) to save vector space. 72 * into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
51 * TLB, reschedule and local APIC vectors are performance-critical. 73 * TLB, reschedule and local APIC vectors are performance-critical.
52 *
53 * Vectors 0xf0-0xfa are free (reserved for future Linux use).
54 */ 74 */
55#ifdef CONFIG_X86_32
56
57# define SPURIOUS_APIC_VECTOR 0xff
58# define ERROR_APIC_VECTOR 0xfe
59# define INVALIDATE_TLB_VECTOR 0xfd
60# define RESCHEDULE_VECTOR 0xfc
61# define CALL_FUNCTION_VECTOR 0xfb
62# define CALL_FUNCTION_SINGLE_VECTOR 0xfa
63# define THERMAL_APIC_VECTOR 0xf0
64
65#else
66 75
67#define SPURIOUS_APIC_VECTOR 0xff 76#define SPURIOUS_APIC_VECTOR 0xff
77/*
78 * Sanity check
79 */
80#if ((SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F)
81# error SPURIOUS_APIC_VECTOR definition error
82#endif
83
68#define ERROR_APIC_VECTOR 0xfe 84#define ERROR_APIC_VECTOR 0xfe
69#define RESCHEDULE_VECTOR 0xfd 85#define RESCHEDULE_VECTOR 0xfd
70#define CALL_FUNCTION_VECTOR 0xfc 86#define CALL_FUNCTION_VECTOR 0xfc
71#define CALL_FUNCTION_SINGLE_VECTOR 0xfb 87#define CALL_FUNCTION_SINGLE_VECTOR 0xfb
72#define THERMAL_APIC_VECTOR 0xfa 88#define THERMAL_APIC_VECTOR 0xfa
73#define THRESHOLD_APIC_VECTOR 0xf9
74#define UV_BAU_MESSAGE 0xf8
75#define INVALIDATE_TLB_VECTOR_END 0xf7
76#define INVALIDATE_TLB_VECTOR_START 0xf0 /* f0-f7 used for TLB flush */
77
78#define NUM_INVALIDATE_TLB_VECTORS 8
79 89
90#ifdef CONFIG_X86_32
91/* 0xf8 - 0xf9 : free */
92#else
93# define THRESHOLD_APIC_VECTOR 0xf9
94# define UV_BAU_MESSAGE 0xf8
80#endif 95#endif
81 96
97/* f0-f7 used for spreading out TLB flushes: */
98#define INVALIDATE_TLB_VECTOR_END 0xf7
99#define INVALIDATE_TLB_VECTOR_START 0xf0
100#define NUM_INVALIDATE_TLB_VECTORS 8
101
82/* 102/*
83 * Local APIC timer IRQ vector is on a different priority level, 103 * Local APIC timer IRQ vector is on a different priority level,
84 * to work around the 'lost local interrupt if more than 2 IRQ 104 * to work around the 'lost local interrupt if more than 2 IRQ
85 * sources per level' errata. 105 * sources per level' errata.
86 */ 106 */
87#define LOCAL_TIMER_VECTOR 0xef 107#define LOCAL_TIMER_VECTOR 0xef
108
109/*
110 * Performance monitoring interrupt vector:
111 */
112#define LOCAL_PERF_VECTOR 0xee
113
114/*
115 * Generic system vector for platform specific use
116 */
117#define GENERIC_INTERRUPT_VECTOR 0xed
88 118
89/* 119/*
90 * First APIC vector available to drivers: (vectors 0x30-0xee) we 120 * First APIC vector available to drivers: (vectors 0x30-0xee) we
91 * start at 0x31(0x41) to spread out vectors evenly between priority 121 * start at 0x31(0x41) to spread out vectors evenly between priority
92 * levels. (0x80 is the syscall vector) 122 * levels. (0x80 is the syscall vector)
93 */ 123 */
94#define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2) 124#define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2)
95 125
96#define NR_VECTORS 256 126#define NR_VECTORS 256
97 127
98#define FPU_IRQ 13 128#define FPU_IRQ 13
99 129
100#define FIRST_VM86_IRQ 3 130#define FIRST_VM86_IRQ 3
101#define LAST_VM86_IRQ 15 131#define LAST_VM86_IRQ 15
102#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
103 132
104#define NR_IRQS_LEGACY 16 133#ifndef __ASSEMBLY__
105 134static inline int invalid_vm86_irq(int irq)
106#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER) 135{
107 136 return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ;
108#ifndef CONFIG_SPARSE_IRQ 137}
109# if NR_CPUS < MAX_IO_APICS
110# define NR_IRQS (NR_VECTORS + (32 * NR_CPUS))
111# else
112# define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
113# endif
114#else
115# if (8 * NR_CPUS) > (32 * MAX_IO_APICS)
116# define NR_IRQS (NR_VECTORS + (8 * NR_CPUS))
117# else
118# define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
119# endif
120#endif 138#endif
121 139
122#elif defined(CONFIG_X86_VOYAGER) 140/*
123 141 * Size the maximum number of interrupts.
124# define NR_IRQS 224 142 *
143 * If the irq_desc[] array has a sparse layout, we can size things
144 * generously - it scales up linearly with the maximum number of CPUs,
145 * and the maximum number of IO-APICs, whichever is higher.
146 *
147 * In other cases we size more conservatively, to not create too large
148 * static arrays.
149 */
125 150
126#else /* IO_APIC || VOYAGER */ 151#define NR_IRQS_LEGACY 16
127 152
128# define NR_IRQS 16 153#define CPU_VECTOR_LIMIT ( 8 * NR_CPUS )
154#define IO_APIC_VECTOR_LIMIT ( 32 * MAX_IO_APICS )
129 155
156#ifdef CONFIG_X86_IO_APIC
157# ifdef CONFIG_SPARSE_IRQ
158# define NR_IRQS \
159 (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? \
160 (NR_VECTORS + CPU_VECTOR_LIMIT) : \
161 (NR_VECTORS + IO_APIC_VECTOR_LIMIT))
162# else
163# if NR_CPUS < MAX_IO_APICS
164# define NR_IRQS (NR_VECTORS + 4*CPU_VECTOR_LIMIT)
165# else
166# define NR_IRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT)
167# endif
168# endif
169#else /* !CONFIG_X86_IO_APIC: */
170# define NR_IRQS NR_IRQS_LEGACY
130#endif 171#endif
131 172
132/* Voyager specific defines */
133/* These define the CPIs we use in linux */
134#define VIC_CPI_LEVEL0 0
135#define VIC_CPI_LEVEL1 1
136/* now the fake CPIs */
137#define VIC_TIMER_CPI 2
138#define VIC_INVALIDATE_CPI 3
139#define VIC_RESCHEDULE_CPI 4
140#define VIC_ENABLE_IRQ_CPI 5
141#define VIC_CALL_FUNCTION_CPI 6
142#define VIC_CALL_FUNCTION_SINGLE_CPI 7
143
144/* Now the QIC CPIs: Since we don't need the two initial levels,
145 * these are 2 less than the VIC CPIs */
146#define QIC_CPI_OFFSET 1
147#define QIC_TIMER_CPI (VIC_TIMER_CPI - QIC_CPI_OFFSET)
148#define QIC_INVALIDATE_CPI (VIC_INVALIDATE_CPI - QIC_CPI_OFFSET)
149#define QIC_RESCHEDULE_CPI (VIC_RESCHEDULE_CPI - QIC_CPI_OFFSET)
150#define QIC_ENABLE_IRQ_CPI (VIC_ENABLE_IRQ_CPI - QIC_CPI_OFFSET)
151#define QIC_CALL_FUNCTION_CPI (VIC_CALL_FUNCTION_CPI - QIC_CPI_OFFSET)
152#define QIC_CALL_FUNCTION_SINGLE_CPI (VIC_CALL_FUNCTION_SINGLE_CPI - QIC_CPI_OFFSET)
153
154#define VIC_START_FAKE_CPI VIC_TIMER_CPI
155#define VIC_END_FAKE_CPI VIC_CALL_FUNCTION_SINGLE_CPI
156
157/* this is the SYS_INT CPI. */
158#define VIC_SYS_INT 8
159#define VIC_CMN_INT 15
160
161/* This is the boot CPI for alternate processors. It gets overwritten
162 * by the above once the system has activated all available processors */
163#define VIC_CPU_BOOT_CPI VIC_CPI_LEVEL0
164#define VIC_CPU_BOOT_ERRATA_CPI (VIC_CPI_LEVEL0 + 8)
165
166
167#endif /* _ASM_X86_IRQ_VECTORS_H */ 173#endif /* _ASM_X86_IRQ_VECTORS_H */
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index c61d8b2ab8b9..317ff1703d0b 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -10,27 +10,12 @@
10#else 10#else
11# define PA_CONTROL_PAGE 0 11# define PA_CONTROL_PAGE 0
12# define VA_CONTROL_PAGE 1 12# define VA_CONTROL_PAGE 1
13# define PA_PGD 2 13# define PA_TABLE_PAGE 2
14# define VA_PGD 3 14# define PA_SWAP_PAGE 3
15# define PA_PUD_0 4 15# define PAGES_NR 4
16# define VA_PUD_0 5
17# define PA_PMD_0 6
18# define VA_PMD_0 7
19# define PA_PTE_0 8
20# define VA_PTE_0 9
21# define PA_PUD_1 10
22# define VA_PUD_1 11
23# define PA_PMD_1 12
24# define VA_PMD_1 13
25# define PA_PTE_1 14
26# define VA_PTE_1 15
27# define PA_TABLE_PAGE 16
28# define PAGES_NR 17
29#endif 16#endif
30 17
31#ifdef CONFIG_X86_32
32# define KEXEC_CONTROL_CODE_MAX_SIZE 2048 18# define KEXEC_CONTROL_CODE_MAX_SIZE 2048
33#endif
34 19
35#ifndef __ASSEMBLY__ 20#ifndef __ASSEMBLY__
36 21
@@ -151,15 +136,16 @@ relocate_kernel(unsigned long indirection_page,
151 unsigned int has_pae, 136 unsigned int has_pae,
152 unsigned int preserve_context); 137 unsigned int preserve_context);
153#else 138#else
154NORET_TYPE void 139unsigned long
155relocate_kernel(unsigned long indirection_page, 140relocate_kernel(unsigned long indirection_page,
156 unsigned long page_list, 141 unsigned long page_list,
157 unsigned long start_address) ATTRIB_NORET; 142 unsigned long start_address,
143 unsigned int preserve_context);
158#endif 144#endif
159 145
160#ifdef CONFIG_X86_32
161#define ARCH_HAS_KIMAGE_ARCH 146#define ARCH_HAS_KIMAGE_ARCH
162 147
148#ifdef CONFIG_X86_32
163struct kimage_arch { 149struct kimage_arch {
164 pgd_t *pgd; 150 pgd_t *pgd;
165#ifdef CONFIG_X86_PAE 151#ifdef CONFIG_X86_PAE
@@ -169,6 +155,12 @@ struct kimage_arch {
169 pte_t *pte0; 155 pte_t *pte0;
170 pte_t *pte1; 156 pte_t *pte1;
171}; 157};
158#else
159struct kimage_arch {
160 pud_t *pud;
161 pmd_t *pmd;
162 pte_t *pte;
163};
172#endif 164#endif
173 165
174#endif /* __ASSEMBLY__ */ 166#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index 5d98d0b68ffc..12d55e773eb6 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -1,14 +1,11 @@
1#ifndef _ASM_X86_LINKAGE_H 1#ifndef _ASM_X86_LINKAGE_H
2#define _ASM_X86_LINKAGE_H 2#define _ASM_X86_LINKAGE_H
3 3
4#include <linux/stringify.h>
5
4#undef notrace 6#undef notrace
5#define notrace __attribute__((no_instrument_function)) 7#define notrace __attribute__((no_instrument_function))
6 8
7#ifdef CONFIG_X86_64
8#define __ALIGN .p2align 4,,15
9#define __ALIGN_STR ".p2align 4,,15"
10#endif
11
12#ifdef CONFIG_X86_32 9#ifdef CONFIG_X86_32
13#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) 10#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
14/* 11/*
@@ -50,72 +47,20 @@
50 __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \ 47 __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
51 "g" (arg4), "g" (arg5), "g" (arg6)) 48 "g" (arg4), "g" (arg5), "g" (arg6))
52 49
53#endif 50#endif /* CONFIG_X86_32 */
54
55#ifdef CONFIG_X86_ALIGNMENT_16
56#define __ALIGN .align 16,0x90
57#define __ALIGN_STR ".align 16,0x90"
58#endif
59
60/*
61 * to check ENTRY_X86/END_X86 and
62 * KPROBE_ENTRY_X86/KPROBE_END_X86
63 * unbalanced-missed-mixed appearance
64 */
65#define __set_entry_x86 .set ENTRY_X86_IN, 0
66#define __unset_entry_x86 .set ENTRY_X86_IN, 1
67#define __set_kprobe_x86 .set KPROBE_X86_IN, 0
68#define __unset_kprobe_x86 .set KPROBE_X86_IN, 1
69
70#define __macro_err_x86 .error "ENTRY_X86/KPROBE_X86 unbalanced,missed,mixed"
71
72#define __check_entry_x86 \
73 .ifdef ENTRY_X86_IN; \
74 .ifeq ENTRY_X86_IN; \
75 __macro_err_x86; \
76 .abort; \
77 .endif; \
78 .endif
79
80#define __check_kprobe_x86 \
81 .ifdef KPROBE_X86_IN; \
82 .ifeq KPROBE_X86_IN; \
83 __macro_err_x86; \
84 .abort; \
85 .endif; \
86 .endif
87 51
88#define __check_entry_kprobe_x86 \ 52#ifdef __ASSEMBLY__
89 __check_entry_x86; \
90 __check_kprobe_x86
91 53
92#define ENTRY_KPROBE_FINAL_X86 __check_entry_kprobe_x86 54#define GLOBAL(name) \
93 55 .globl name; \
94#define ENTRY_X86(name) \
95 __check_entry_kprobe_x86; \
96 __set_entry_x86; \
97 .globl name; \
98 __ALIGN; \
99 name: 56 name:
100 57
101#define END_X86(name) \ 58#if defined(CONFIG_X86_64) || defined(CONFIG_X86_ALIGNMENT_16)
102 __unset_entry_x86; \ 59#define __ALIGN .p2align 4, 0x90
103 __check_entry_kprobe_x86; \ 60#define __ALIGN_STR __stringify(__ALIGN)
104 .size name, .-name 61#endif
105
106#define KPROBE_ENTRY_X86(name) \
107 __check_entry_kprobe_x86; \
108 __set_kprobe_x86; \
109 .pushsection .kprobes.text, "ax"; \
110 .globl name; \
111 __ALIGN; \
112 name:
113 62
114#define KPROBE_END_X86(name) \ 63#endif /* __ASSEMBLY__ */
115 __unset_kprobe_x86; \
116 __check_entry_kprobe_x86; \
117 .size name, .-name; \
118 .popsection
119 64
120#endif /* _ASM_X86_LINKAGE_H */ 65#endif /* _ASM_X86_LINKAGE_H */
121 66
diff --git a/arch/x86/include/asm/mach-default/mach_apic.h b/arch/x86/include/asm/mach-default/mach_apic.h
deleted file mode 100644
index cc09cbbee27e..000000000000
--- a/arch/x86/include/asm/mach-default/mach_apic.h
+++ /dev/null
@@ -1,168 +0,0 @@
1#ifndef _ASM_X86_MACH_DEFAULT_MACH_APIC_H
2#define _ASM_X86_MACH_DEFAULT_MACH_APIC_H
3
4#ifdef CONFIG_X86_LOCAL_APIC
5
6#include <mach_apicdef.h>
7#include <asm/smp.h>
8
9#define APIC_DFR_VALUE (APIC_DFR_FLAT)
10
11static inline const struct cpumask *target_cpus(void)
12{
13#ifdef CONFIG_SMP
14 return cpu_online_mask;
15#else
16 return cpumask_of(0);
17#endif
18}
19
20#define NO_BALANCE_IRQ (0)
21#define esr_disable (0)
22
23#ifdef CONFIG_X86_64
24#include <asm/genapic.h>
25#define INT_DELIVERY_MODE (genapic->int_delivery_mode)
26#define INT_DEST_MODE (genapic->int_dest_mode)
27#define TARGET_CPUS (genapic->target_cpus())
28#define apic_id_registered (genapic->apic_id_registered)
29#define init_apic_ldr (genapic->init_apic_ldr)
30#define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid)
31#define cpu_mask_to_apicid_and (genapic->cpu_mask_to_apicid_and)
32#define phys_pkg_id (genapic->phys_pkg_id)
33#define vector_allocation_domain (genapic->vector_allocation_domain)
34#define read_apic_id() (GET_APIC_ID(apic_read(APIC_ID)))
35#define send_IPI_self (genapic->send_IPI_self)
36#define wakeup_secondary_cpu (genapic->wakeup_cpu)
37extern void setup_apic_routing(void);
38#else
39#define INT_DELIVERY_MODE dest_LowestPrio
40#define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */
41#define TARGET_CPUS (target_cpus())
42#define wakeup_secondary_cpu wakeup_secondary_cpu_via_init
43/*
44 * Set up the logical destination ID.
45 *
46 * Intel recommends to set DFR, LDR and TPR before enabling
47 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
48 * document number 292116). So here it goes...
49 */
50static inline void init_apic_ldr(void)
51{
52 unsigned long val;
53
54 apic_write(APIC_DFR, APIC_DFR_VALUE);
55 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
56 val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id());
57 apic_write(APIC_LDR, val);
58}
59
60static inline int apic_id_registered(void)
61{
62 return physid_isset(read_apic_id(), phys_cpu_present_map);
63}
64
65static inline unsigned int cpu_mask_to_apicid(const struct cpumask *cpumask)
66{
67 return cpumask_bits(cpumask)[0];
68}
69
70static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *cpumask,
71 const struct cpumask *andmask)
72{
73 unsigned long mask1 = cpumask_bits(cpumask)[0];
74 unsigned long mask2 = cpumask_bits(andmask)[0];
75 unsigned long mask3 = cpumask_bits(cpu_online_mask)[0];
76
77 return (unsigned int)(mask1 & mask2 & mask3);
78}
79
80static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
81{
82 return cpuid_apic >> index_msb;
83}
84
85static inline void setup_apic_routing(void)
86{
87#ifdef CONFIG_X86_IO_APIC
88 printk("Enabling APIC mode: %s. Using %d I/O APICs\n",
89 "Flat", nr_ioapics);
90#endif
91}
92
93static inline int apicid_to_node(int logical_apicid)
94{
95#ifdef CONFIG_SMP
96 return apicid_2_node[hard_smp_processor_id()];
97#else
98 return 0;
99#endif
100}
101
102static inline void vector_allocation_domain(int cpu, struct cpumask *retmask)
103{
104 /* Careful. Some cpus do not strictly honor the set of cpus
105 * specified in the interrupt destination when using lowest
106 * priority interrupt delivery mode.
107 *
108 * In particular there was a hyperthreading cpu observed to
109 * deliver interrupts to the wrong hyperthread when only one
110 * hyperthread was specified in the interrupt desitination.
111 */
112 *retmask = (cpumask_t) { { [0] = APIC_ALL_CPUS } };
113}
114#endif
115
116static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
117{
118 return physid_isset(apicid, bitmap);
119}
120
121static inline unsigned long check_apicid_present(int bit)
122{
123 return physid_isset(bit, phys_cpu_present_map);
124}
125
126static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map)
127{
128 return phys_map;
129}
130
131static inline int multi_timer_check(int apic, int irq)
132{
133 return 0;
134}
135
136/* Mapping from cpu number to logical apicid */
137static inline int cpu_to_logical_apicid(int cpu)
138{
139 return 1 << cpu;
140}
141
142static inline int cpu_present_to_apicid(int mps_cpu)
143{
144 if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu))
145 return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
146 else
147 return BAD_APICID;
148}
149
150static inline physid_mask_t apicid_to_cpu_present(int phys_apicid)
151{
152 return physid_mask_of_physid(phys_apicid);
153}
154
155static inline void setup_portio_remap(void)
156{
157}
158
159static inline int check_phys_apicid_present(int boot_cpu_physical_apicid)
160{
161 return physid_isset(boot_cpu_physical_apicid, phys_cpu_present_map);
162}
163
164static inline void enable_apic_mode(void)
165{
166}
167#endif /* CONFIG_X86_LOCAL_APIC */
168#endif /* _ASM_X86_MACH_DEFAULT_MACH_APIC_H */
diff --git a/arch/x86/include/asm/mach-default/mach_apicdef.h b/arch/x86/include/asm/mach-default/mach_apicdef.h
deleted file mode 100644
index 53179936d6c6..000000000000
--- a/arch/x86/include/asm/mach-default/mach_apicdef.h
+++ /dev/null
@@ -1,24 +0,0 @@
1#ifndef _ASM_X86_MACH_DEFAULT_MACH_APICDEF_H
2#define _ASM_X86_MACH_DEFAULT_MACH_APICDEF_H
3
4#include <asm/apic.h>
5
6#ifdef CONFIG_X86_64
7#define APIC_ID_MASK (genapic->apic_id_mask)
8#define GET_APIC_ID(x) (genapic->get_apic_id(x))
9#define SET_APIC_ID(x) (genapic->set_apic_id(x))
10#else
11#define APIC_ID_MASK (0xF<<24)
12static inline unsigned get_apic_id(unsigned long x)
13{
14 unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
15 if (APIC_XAPIC(ver))
16 return (((x)>>24)&0xFF);
17 else
18 return (((x)>>24)&0xF);
19}
20
21#define GET_APIC_ID(x) get_apic_id(x)
22#endif
23
24#endif /* _ASM_X86_MACH_DEFAULT_MACH_APICDEF_H */
diff --git a/arch/x86/include/asm/mach-default/mach_ipi.h b/arch/x86/include/asm/mach-default/mach_ipi.h
deleted file mode 100644
index 191312d155da..000000000000
--- a/arch/x86/include/asm/mach-default/mach_ipi.h
+++ /dev/null
@@ -1,64 +0,0 @@
1#ifndef _ASM_X86_MACH_DEFAULT_MACH_IPI_H
2#define _ASM_X86_MACH_DEFAULT_MACH_IPI_H
3
4/* Avoid include hell */
5#define NMI_VECTOR 0x02
6
7void send_IPI_mask_bitmask(const struct cpumask *mask, int vector);
8void send_IPI_mask_allbutself(const struct cpumask *mask, int vector);
9void __send_IPI_shortcut(unsigned int shortcut, int vector);
10
11extern int no_broadcast;
12
13#ifdef CONFIG_X86_64
14#include <asm/genapic.h>
15#define send_IPI_mask (genapic->send_IPI_mask)
16#define send_IPI_mask_allbutself (genapic->send_IPI_mask_allbutself)
17#else
18static inline void send_IPI_mask(const struct cpumask *mask, int vector)
19{
20 send_IPI_mask_bitmask(mask, vector);
21}
22void send_IPI_mask_allbutself(const struct cpumask *mask, int vector);
23#endif
24
25static inline void __local_send_IPI_allbutself(int vector)
26{
27 if (no_broadcast || vector == NMI_VECTOR)
28 send_IPI_mask_allbutself(cpu_online_mask, vector);
29 else
30 __send_IPI_shortcut(APIC_DEST_ALLBUT, vector);
31}
32
33static inline void __local_send_IPI_all(int vector)
34{
35 if (no_broadcast || vector == NMI_VECTOR)
36 send_IPI_mask(cpu_online_mask, vector);
37 else
38 __send_IPI_shortcut(APIC_DEST_ALLINC, vector);
39}
40
41#ifdef CONFIG_X86_64
42#define send_IPI_allbutself (genapic->send_IPI_allbutself)
43#define send_IPI_all (genapic->send_IPI_all)
44#else
45static inline void send_IPI_allbutself(int vector)
46{
47 /*
48 * if there are no other CPUs in the system then we get an APIC send
49 * error if we try to broadcast, thus avoid sending IPIs in this case.
50 */
51 if (!(num_online_cpus() > 1))
52 return;
53
54 __local_send_IPI_allbutself(vector);
55 return;
56}
57
58static inline void send_IPI_all(int vector)
59{
60 __local_send_IPI_all(vector);
61}
62#endif
63
64#endif /* _ASM_X86_MACH_DEFAULT_MACH_IPI_H */
diff --git a/arch/x86/include/asm/mach-default/mach_mpparse.h b/arch/x86/include/asm/mach-default/mach_mpparse.h
deleted file mode 100644
index c70a263d68cd..000000000000
--- a/arch/x86/include/asm/mach-default/mach_mpparse.h
+++ /dev/null
@@ -1,17 +0,0 @@
1#ifndef _ASM_X86_MACH_DEFAULT_MACH_MPPARSE_H
2#define _ASM_X86_MACH_DEFAULT_MACH_MPPARSE_H
3
4static inline int
5mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
6{
7 return 0;
8}
9
10/* Hook from generic ACPI tables.c */
11static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id)
12{
13 return 0;
14}
15
16
17#endif /* _ASM_X86_MACH_DEFAULT_MACH_MPPARSE_H */
diff --git a/arch/x86/include/asm/mach-default/mach_mpspec.h b/arch/x86/include/asm/mach-default/mach_mpspec.h
deleted file mode 100644
index e85ede686be8..000000000000
--- a/arch/x86/include/asm/mach-default/mach_mpspec.h
+++ /dev/null
@@ -1,12 +0,0 @@
1#ifndef _ASM_X86_MACH_DEFAULT_MACH_MPSPEC_H
2#define _ASM_X86_MACH_DEFAULT_MACH_MPSPEC_H
3
4#define MAX_IRQ_SOURCES 256
5
6#if CONFIG_BASE_SMALL == 0
7#define MAX_MP_BUSSES 256
8#else
9#define MAX_MP_BUSSES 32
10#endif
11
12#endif /* _ASM_X86_MACH_DEFAULT_MACH_MPSPEC_H */
diff --git a/arch/x86/include/asm/mach-default/mach_wakecpu.h b/arch/x86/include/asm/mach-default/mach_wakecpu.h
deleted file mode 100644
index 89897a6a65b9..000000000000
--- a/arch/x86/include/asm/mach-default/mach_wakecpu.h
+++ /dev/null
@@ -1,41 +0,0 @@
1#ifndef _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H
2#define _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H
3
4#define TRAMPOLINE_PHYS_LOW (0x467)
5#define TRAMPOLINE_PHYS_HIGH (0x469)
6
7static inline void wait_for_init_deassert(atomic_t *deassert)
8{
9 while (!atomic_read(deassert))
10 cpu_relax();
11 return;
12}
13
14/* Nothing to do for most platforms, since cleared by the INIT cycle */
15static inline void smp_callin_clear_local_apic(void)
16{
17}
18
19static inline void store_NMI_vector(unsigned short *high, unsigned short *low)
20{
21}
22
23static inline void restore_NMI_vector(unsigned short *high, unsigned short *low)
24{
25}
26
27#ifdef CONFIG_SMP
28extern void __inquire_remote_apic(int apicid);
29#else /* CONFIG_SMP */
30static inline void __inquire_remote_apic(int apicid)
31{
32}
33#endif /* CONFIG_SMP */
34
35static inline void inquire_remote_apic(int apicid)
36{
37 if (apic_verbosity >= APIC_DEBUG)
38 __inquire_remote_apic(apicid);
39}
40
41#endif /* _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H */
diff --git a/arch/x86/include/asm/mach-generic/gpio.h b/arch/x86/include/asm/mach-generic/gpio.h
deleted file mode 100644
index 995c45efdb33..000000000000
--- a/arch/x86/include/asm/mach-generic/gpio.h
+++ /dev/null
@@ -1,15 +0,0 @@
1#ifndef _ASM_X86_MACH_GENERIC_GPIO_H
2#define _ASM_X86_MACH_GENERIC_GPIO_H
3
4int gpio_request(unsigned gpio, const char *label);
5void gpio_free(unsigned gpio);
6int gpio_direction_input(unsigned gpio);
7int gpio_direction_output(unsigned gpio, int value);
8int gpio_get_value(unsigned gpio);
9void gpio_set_value(unsigned gpio, int value);
10int gpio_to_irq(unsigned gpio);
11int irq_to_gpio(unsigned irq);
12
13#include <asm-generic/gpio.h> /* cansleep wrappers */
14
15#endif /* _ASM_X86_MACH_GENERIC_GPIO_H */
diff --git a/arch/x86/include/asm/mach-generic/mach_apic.h b/arch/x86/include/asm/mach-generic/mach_apic.h
deleted file mode 100644
index 48553e958ad5..000000000000
--- a/arch/x86/include/asm/mach-generic/mach_apic.h
+++ /dev/null
@@ -1,35 +0,0 @@
1#ifndef _ASM_X86_MACH_GENERIC_MACH_APIC_H
2#define _ASM_X86_MACH_GENERIC_MACH_APIC_H
3
4#include <asm/genapic.h>
5
6#define esr_disable (genapic->ESR_DISABLE)
7#define NO_BALANCE_IRQ (genapic->no_balance_irq)
8#define INT_DELIVERY_MODE (genapic->int_delivery_mode)
9#define INT_DEST_MODE (genapic->int_dest_mode)
10#undef APIC_DEST_LOGICAL
11#define APIC_DEST_LOGICAL (genapic->apic_destination_logical)
12#define TARGET_CPUS (genapic->target_cpus())
13#define apic_id_registered (genapic->apic_id_registered)
14#define init_apic_ldr (genapic->init_apic_ldr)
15#define ioapic_phys_id_map (genapic->ioapic_phys_id_map)
16#define setup_apic_routing (genapic->setup_apic_routing)
17#define multi_timer_check (genapic->multi_timer_check)
18#define apicid_to_node (genapic->apicid_to_node)
19#define cpu_to_logical_apicid (genapic->cpu_to_logical_apicid)
20#define cpu_present_to_apicid (genapic->cpu_present_to_apicid)
21#define apicid_to_cpu_present (genapic->apicid_to_cpu_present)
22#define setup_portio_remap (genapic->setup_portio_remap)
23#define check_apicid_present (genapic->check_apicid_present)
24#define check_phys_apicid_present (genapic->check_phys_apicid_present)
25#define check_apicid_used (genapic->check_apicid_used)
26#define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid)
27#define cpu_mask_to_apicid_and (genapic->cpu_mask_to_apicid_and)
28#define vector_allocation_domain (genapic->vector_allocation_domain)
29#define enable_apic_mode (genapic->enable_apic_mode)
30#define phys_pkg_id (genapic->phys_pkg_id)
31#define wakeup_secondary_cpu (genapic->wakeup_cpu)
32
33extern void generic_bigsmp_probe(void);
34
35#endif /* _ASM_X86_MACH_GENERIC_MACH_APIC_H */
diff --git a/arch/x86/include/asm/mach-generic/mach_apicdef.h b/arch/x86/include/asm/mach-generic/mach_apicdef.h
deleted file mode 100644
index 68041f3802f4..000000000000
--- a/arch/x86/include/asm/mach-generic/mach_apicdef.h
+++ /dev/null
@@ -1,11 +0,0 @@
1#ifndef _ASM_X86_MACH_GENERIC_MACH_APICDEF_H
2#define _ASM_X86_MACH_GENERIC_MACH_APICDEF_H
3
4#ifndef APIC_DEFINITION
5#include <asm/genapic.h>
6
7#define GET_APIC_ID (genapic->get_apic_id)
8#define APIC_ID_MASK (genapic->apic_id_mask)
9#endif
10
11#endif /* _ASM_X86_MACH_GENERIC_MACH_APICDEF_H */
diff --git a/arch/x86/include/asm/mach-generic/mach_ipi.h b/arch/x86/include/asm/mach-generic/mach_ipi.h
deleted file mode 100644
index ffd637e3c3d9..000000000000
--- a/arch/x86/include/asm/mach-generic/mach_ipi.h
+++ /dev/null
@@ -1,10 +0,0 @@
1#ifndef _ASM_X86_MACH_GENERIC_MACH_IPI_H
2#define _ASM_X86_MACH_GENERIC_MACH_IPI_H
3
4#include <asm/genapic.h>
5
6#define send_IPI_mask (genapic->send_IPI_mask)
7#define send_IPI_allbutself (genapic->send_IPI_allbutself)
8#define send_IPI_all (genapic->send_IPI_all)
9
10#endif /* _ASM_X86_MACH_GENERIC_MACH_IPI_H */
diff --git a/arch/x86/include/asm/mach-generic/mach_mpparse.h b/arch/x86/include/asm/mach-generic/mach_mpparse.h
deleted file mode 100644
index 9444ab8dca94..000000000000
--- a/arch/x86/include/asm/mach-generic/mach_mpparse.h
+++ /dev/null
@@ -1,9 +0,0 @@
1#ifndef _ASM_X86_MACH_GENERIC_MACH_MPPARSE_H
2#define _ASM_X86_MACH_GENERIC_MACH_MPPARSE_H
3
4
5extern int mps_oem_check(struct mpc_table *, char *, char *);
6
7extern int acpi_madt_oem_check(char *, char *);
8
9#endif /* _ASM_X86_MACH_GENERIC_MACH_MPPARSE_H */
diff --git a/arch/x86/include/asm/mach-generic/mach_mpspec.h b/arch/x86/include/asm/mach-generic/mach_mpspec.h
deleted file mode 100644
index 3bc407226578..000000000000
--- a/arch/x86/include/asm/mach-generic/mach_mpspec.h
+++ /dev/null
@@ -1,12 +0,0 @@
1#ifndef _ASM_X86_MACH_GENERIC_MACH_MPSPEC_H
2#define _ASM_X86_MACH_GENERIC_MACH_MPSPEC_H
3
4#define MAX_IRQ_SOURCES 256
5
6/* Summit or generic (i.e. installer) kernels need lots of bus entries. */
7/* Maximum 256 PCI busses, plus 1 ISA bus in each of 4 cabinets. */
8#define MAX_MP_BUSSES 260
9
10extern void numaq_mps_oem_check(struct mpc_table *, char *, char *);
11
12#endif /* _ASM_X86_MACH_GENERIC_MACH_MPSPEC_H */
diff --git a/arch/x86/include/asm/mach-generic/mach_wakecpu.h b/arch/x86/include/asm/mach-generic/mach_wakecpu.h
deleted file mode 100644
index 1ab16b168c8a..000000000000
--- a/arch/x86/include/asm/mach-generic/mach_wakecpu.h
+++ /dev/null
@@ -1,12 +0,0 @@
1#ifndef _ASM_X86_MACH_GENERIC_MACH_WAKECPU_H
2#define _ASM_X86_MACH_GENERIC_MACH_WAKECPU_H
3
4#define TRAMPOLINE_PHYS_LOW (genapic->trampoline_phys_low)
5#define TRAMPOLINE_PHYS_HIGH (genapic->trampoline_phys_high)
6#define wait_for_init_deassert (genapic->wait_for_init_deassert)
7#define smp_callin_clear_local_apic (genapic->smp_callin_clear_local_apic)
8#define store_NMI_vector (genapic->store_NMI_vector)
9#define restore_NMI_vector (genapic->restore_NMI_vector)
10#define inquire_remote_apic (genapic->inquire_remote_apic)
11
12#endif /* _ASM_X86_MACH_GENERIC_MACH_APIC_H */
diff --git a/arch/x86/include/asm/mach-rdc321x/gpio.h b/arch/x86/include/asm/mach-rdc321x/gpio.h
deleted file mode 100644
index c210ab5788b0..000000000000
--- a/arch/x86/include/asm/mach-rdc321x/gpio.h
+++ /dev/null
@@ -1,60 +0,0 @@
1#ifndef _ASM_X86_MACH_RDC321X_GPIO_H
2#define _ASM_X86_MACH_RDC321X_GPIO_H
3
4#include <linux/kernel.h>
5
6extern int rdc_gpio_get_value(unsigned gpio);
7extern void rdc_gpio_set_value(unsigned gpio, int value);
8extern int rdc_gpio_direction_input(unsigned gpio);
9extern int rdc_gpio_direction_output(unsigned gpio, int value);
10extern int rdc_gpio_request(unsigned gpio, const char *label);
11extern void rdc_gpio_free(unsigned gpio);
12extern void __init rdc321x_gpio_setup(void);
13
14/* Wrappers for the arch-neutral GPIO API */
15
16static inline int gpio_request(unsigned gpio, const char *label)
17{
18 return rdc_gpio_request(gpio, label);
19}
20
21static inline void gpio_free(unsigned gpio)
22{
23 might_sleep();
24 rdc_gpio_free(gpio);
25}
26
27static inline int gpio_direction_input(unsigned gpio)
28{
29 return rdc_gpio_direction_input(gpio);
30}
31
32static inline int gpio_direction_output(unsigned gpio, int value)
33{
34 return rdc_gpio_direction_output(gpio, value);
35}
36
37static inline int gpio_get_value(unsigned gpio)
38{
39 return rdc_gpio_get_value(gpio);
40}
41
42static inline void gpio_set_value(unsigned gpio, int value)
43{
44 rdc_gpio_set_value(gpio, value);
45}
46
47static inline int gpio_to_irq(unsigned gpio)
48{
49 return gpio;
50}
51
52static inline int irq_to_gpio(unsigned irq)
53{
54 return irq;
55}
56
57/* For cansleep */
58#include <asm-generic/gpio.h>
59
60#endif /* _ASM_X86_MACH_RDC321X_GPIO_H */
diff --git a/arch/x86/include/asm/mach-voyager/do_timer.h b/arch/x86/include/asm/mach-voyager/do_timer.h
deleted file mode 100644
index 9e5a459fd15b..000000000000
--- a/arch/x86/include/asm/mach-voyager/do_timer.h
+++ /dev/null
@@ -1,17 +0,0 @@
1/* defines for inline arch setup functions */
2#include <linux/clockchips.h>
3
4#include <asm/voyager.h>
5#include <asm/i8253.h>
6
7/**
8 * do_timer_interrupt_hook - hook into timer tick
9 *
10 * Call the pit clock event handler. see asm/i8253.h
11 **/
12static inline void do_timer_interrupt_hook(void)
13{
14 global_clock_event->event_handler(global_clock_event);
15 voyager_timer_interrupt();
16}
17
diff --git a/arch/x86/include/asm/mach-voyager/entry_arch.h b/arch/x86/include/asm/mach-voyager/entry_arch.h
deleted file mode 100644
index ae52624b5937..000000000000
--- a/arch/x86/include/asm/mach-voyager/entry_arch.h
+++ /dev/null
@@ -1,26 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8 -*- */
2
3/* Copyright (C) 2002
4 *
5 * Author: James.Bottomley@HansenPartnership.com
6 *
7 * linux/arch/i386/voyager/entry_arch.h
8 *
9 * This file builds the VIC and QIC CPI gates
10 */
11
12/* initialise the voyager interrupt gates
13 *
14 * This uses the macros in irq.h to set up assembly jump gates. The
15 * calls are then redirected to the same routine with smp_ prefixed */
16BUILD_INTERRUPT(vic_sys_interrupt, VIC_SYS_INT)
17BUILD_INTERRUPT(vic_cmn_interrupt, VIC_CMN_INT)
18BUILD_INTERRUPT(vic_cpi_interrupt, VIC_CPI_LEVEL0);
19
20/* do all the QIC interrupts */
21BUILD_INTERRUPT(qic_timer_interrupt, QIC_TIMER_CPI);
22BUILD_INTERRUPT(qic_invalidate_interrupt, QIC_INVALIDATE_CPI);
23BUILD_INTERRUPT(qic_reschedule_interrupt, QIC_RESCHEDULE_CPI);
24BUILD_INTERRUPT(qic_enable_irq_interrupt, QIC_ENABLE_IRQ_CPI);
25BUILD_INTERRUPT(qic_call_function_interrupt, QIC_CALL_FUNCTION_CPI);
26BUILD_INTERRUPT(qic_call_function_single_interrupt, QIC_CALL_FUNCTION_SINGLE_CPI);
diff --git a/arch/x86/include/asm/mach-voyager/setup_arch.h b/arch/x86/include/asm/mach-voyager/setup_arch.h
deleted file mode 100644
index 71729ca05cd7..000000000000
--- a/arch/x86/include/asm/mach-voyager/setup_arch.h
+++ /dev/null
@@ -1,12 +0,0 @@
1#include <asm/voyager.h>
2#include <asm/setup.h>
3#define VOYAGER_BIOS_INFO ((struct voyager_bios_info *) \
4 (&boot_params.apm_bios_info))
5
6/* Hook to call BIOS initialisation function */
7
8/* for voyager, pass the voyager BIOS/SUS info area to the detection
9 * routines */
10
11#define ARCH_SETUP voyager_detect(VOYAGER_BIOS_INFO);
12
diff --git a/arch/x86/include/asm/mach-default/mach_timer.h b/arch/x86/include/asm/mach_timer.h
index 853728519ae9..853728519ae9 100644
--- a/arch/x86/include/asm/mach-default/mach_timer.h
+++ b/arch/x86/include/asm/mach_timer.h
diff --git a/arch/x86/include/asm/mach-default/mach_traps.h b/arch/x86/include/asm/mach_traps.h
index f7920601e472..f7920601e472 100644
--- a/arch/x86/include/asm/mach-default/mach_traps.h
+++ b/arch/x86/include/asm/mach_traps.h
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 32c6e17b960b..563933e06a35 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -11,6 +11,8 @@
11 */ 11 */
12 12
13#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */ 13#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */
14#define MCG_EXT_P (1ULL<<9) /* Extended registers available */
15#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */
14 16
15#define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */ 17#define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */
16#define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */ 18#define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */
@@ -90,14 +92,29 @@ extern int mce_disabled;
90 92
91#include <asm/atomic.h> 93#include <asm/atomic.h>
92 94
95void mce_setup(struct mce *m);
93void mce_log(struct mce *m); 96void mce_log(struct mce *m);
94DECLARE_PER_CPU(struct sys_device, device_mce); 97DECLARE_PER_CPU(struct sys_device, device_mce);
95extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 98extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
96 99
100/*
101 * To support more than 128 would need to escape the predefined
102 * Linux defined extended banks first.
103 */
104#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)
105
97#ifdef CONFIG_X86_MCE_INTEL 106#ifdef CONFIG_X86_MCE_INTEL
98void mce_intel_feature_init(struct cpuinfo_x86 *c); 107void mce_intel_feature_init(struct cpuinfo_x86 *c);
108void cmci_clear(void);
109void cmci_reenable(void);
110void cmci_rediscover(int dying);
111void cmci_recheck(void);
99#else 112#else
100static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { } 113static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { }
114static inline void cmci_clear(void) {}
115static inline void cmci_reenable(void) {}
116static inline void cmci_rediscover(int dying) {}
117static inline void cmci_recheck(void) {}
101#endif 118#endif
102 119
103#ifdef CONFIG_X86_MCE_AMD 120#ifdef CONFIG_X86_MCE_AMD
@@ -106,11 +123,23 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c);
106static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } 123static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
107#endif 124#endif
108 125
109void mce_log_therm_throt_event(unsigned int cpu, __u64 status); 126extern int mce_available(struct cpuinfo_x86 *c);
127
128void mce_log_therm_throt_event(__u64 status);
110 129
111extern atomic_t mce_entry; 130extern atomic_t mce_entry;
112 131
113extern void do_machine_check(struct pt_regs *, long); 132extern void do_machine_check(struct pt_regs *, long);
133
134typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
135DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
136
137enum mcp_flags {
138 MCP_TIMESTAMP = (1 << 0), /* log time stamp */
139 MCP_UC = (1 << 1), /* log uncorrected errors */
140};
141extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
142
114extern int mce_notify_user(void); 143extern int mce_notify_user(void);
115 144
116#endif /* !CONFIG_X86_32 */ 145#endif /* !CONFIG_X86_32 */
@@ -120,8 +149,8 @@ extern void mcheck_init(struct cpuinfo_x86 *c);
120#else 149#else
121#define mcheck_init(c) do { } while (0) 150#define mcheck_init(c) do { } while (0)
122#endif 151#endif
123extern void stop_mce(void); 152
124extern void restart_mce(void); 153extern void (*mce_threshold_vector)(void);
125 154
126#endif /* __KERNEL__ */ 155#endif /* __KERNEL__ */
127#endif /* _ASM_X86_MCE_H */ 156#endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 8aeeb3fd73db..f923203dc39a 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -21,11 +21,54 @@ static inline void paravirt_activate_mm(struct mm_struct *prev,
21int init_new_context(struct task_struct *tsk, struct mm_struct *mm); 21int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
22void destroy_context(struct mm_struct *mm); 22void destroy_context(struct mm_struct *mm);
23 23
24#ifdef CONFIG_X86_32 24
25# include "mmu_context_32.h" 25static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
26#else 26{
27# include "mmu_context_64.h" 27#ifdef CONFIG_SMP
28 if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
29 percpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
30#endif
31}
32
33static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
34 struct task_struct *tsk)
35{
36 unsigned cpu = smp_processor_id();
37
38 if (likely(prev != next)) {
39 /* stop flush ipis for the previous mm */
40 cpu_clear(cpu, prev->cpu_vm_mask);
41#ifdef CONFIG_SMP
42 percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
43 percpu_write(cpu_tlbstate.active_mm, next);
28#endif 44#endif
45 cpu_set(cpu, next->cpu_vm_mask);
46
47 /* Re-load page tables */
48 load_cr3(next->pgd);
49
50 /*
51 * load the LDT, if the LDT is different:
52 */
53 if (unlikely(prev->context.ldt != next->context.ldt))
54 load_LDT_nolock(&next->context);
55 }
56#ifdef CONFIG_SMP
57 else {
58 percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
59 BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);
60
61 if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
62 /* We were in lazy tlb mode and leave_mm disabled
63 * tlb flush IPI delivery. We must reload CR3
64 * to make sure to use no freed page tables.
65 */
66 load_cr3(next->pgd);
67 load_LDT_nolock(&next->context);
68 }
69 }
70#endif
71}
29 72
30#define activate_mm(prev, next) \ 73#define activate_mm(prev, next) \
31do { \ 74do { \
@@ -33,5 +76,17 @@ do { \
33 switch_mm((prev), (next), NULL); \ 76 switch_mm((prev), (next), NULL); \
34} while (0); 77} while (0);
35 78
79#ifdef CONFIG_X86_32
80#define deactivate_mm(tsk, mm) \
81do { \
82 lazy_load_gs(0); \
83} while (0)
84#else
85#define deactivate_mm(tsk, mm) \
86do { \
87 load_gs_index(0); \
88 loadsegment(fs, 0); \
89} while (0)
90#endif
36 91
37#endif /* _ASM_X86_MMU_CONTEXT_H */ 92#endif /* _ASM_X86_MMU_CONTEXT_H */
diff --git a/arch/x86/include/asm/mmu_context_32.h b/arch/x86/include/asm/mmu_context_32.h
deleted file mode 100644
index 7e98ce1d2c0e..000000000000
--- a/arch/x86/include/asm/mmu_context_32.h
+++ /dev/null
@@ -1,55 +0,0 @@
1#ifndef _ASM_X86_MMU_CONTEXT_32_H
2#define _ASM_X86_MMU_CONTEXT_32_H
3
4static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
5{
6#ifdef CONFIG_SMP
7 if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK)
8 x86_write_percpu(cpu_tlbstate.state, TLBSTATE_LAZY);
9#endif
10}
11
12static inline void switch_mm(struct mm_struct *prev,
13 struct mm_struct *next,
14 struct task_struct *tsk)
15{
16 int cpu = smp_processor_id();
17
18 if (likely(prev != next)) {
19 /* stop flush ipis for the previous mm */
20 cpu_clear(cpu, prev->cpu_vm_mask);
21#ifdef CONFIG_SMP
22 x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
23 x86_write_percpu(cpu_tlbstate.active_mm, next);
24#endif
25 cpu_set(cpu, next->cpu_vm_mask);
26
27 /* Re-load page tables */
28 load_cr3(next->pgd);
29
30 /*
31 * load the LDT, if the LDT is different:
32 */
33 if (unlikely(prev->context.ldt != next->context.ldt))
34 load_LDT_nolock(&next->context);
35 }
36#ifdef CONFIG_SMP
37 else {
38 x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
39 BUG_ON(x86_read_percpu(cpu_tlbstate.active_mm) != next);
40
41 if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
42 /* We were in lazy tlb mode and leave_mm disabled
43 * tlb flush IPI delivery. We must reload %cr3.
44 */
45 load_cr3(next->pgd);
46 load_LDT_nolock(&next->context);
47 }
48 }
49#endif
50}
51
52#define deactivate_mm(tsk, mm) \
53 asm("movl %0,%%gs": :"r" (0));
54
55#endif /* _ASM_X86_MMU_CONTEXT_32_H */
diff --git a/arch/x86/include/asm/mmu_context_64.h b/arch/x86/include/asm/mmu_context_64.h
deleted file mode 100644
index 677d36e9540a..000000000000
--- a/arch/x86/include/asm/mmu_context_64.h
+++ /dev/null
@@ -1,54 +0,0 @@
1#ifndef _ASM_X86_MMU_CONTEXT_64_H
2#define _ASM_X86_MMU_CONTEXT_64_H
3
4#include <asm/pda.h>
5
6static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
7{
8#ifdef CONFIG_SMP
9 if (read_pda(mmu_state) == TLBSTATE_OK)
10 write_pda(mmu_state, TLBSTATE_LAZY);
11#endif
12}
13
14static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
15 struct task_struct *tsk)
16{
17 unsigned cpu = smp_processor_id();
18 if (likely(prev != next)) {
19 /* stop flush ipis for the previous mm */
20 cpu_clear(cpu, prev->cpu_vm_mask);
21#ifdef CONFIG_SMP
22 write_pda(mmu_state, TLBSTATE_OK);
23 write_pda(active_mm, next);
24#endif
25 cpu_set(cpu, next->cpu_vm_mask);
26 load_cr3(next->pgd);
27
28 if (unlikely(next->context.ldt != prev->context.ldt))
29 load_LDT_nolock(&next->context);
30 }
31#ifdef CONFIG_SMP
32 else {
33 write_pda(mmu_state, TLBSTATE_OK);
34 if (read_pda(active_mm) != next)
35 BUG();
36 if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
37 /* We were in lazy tlb mode and leave_mm disabled
38 * tlb flush IPI delivery. We must reload CR3
39 * to make sure to use no freed page tables.
40 */
41 load_cr3(next->pgd);
42 load_LDT_nolock(&next->context);
43 }
44 }
45#endif
46}
47
48#define deactivate_mm(tsk, mm) \
49do { \
50 load_gs_index(0); \
51 asm volatile("movl %0,%%fs"::"r"(0)); \
52} while (0)
53
54#endif /* _ASM_X86_MMU_CONTEXT_64_H */
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index 105fb90a0635..ede6998bd92c 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -91,46 +91,9 @@ static inline int pfn_valid(int pfn)
91#endif /* CONFIG_DISCONTIGMEM */ 91#endif /* CONFIG_DISCONTIGMEM */
92 92
93#ifdef CONFIG_NEED_MULTIPLE_NODES 93#ifdef CONFIG_NEED_MULTIPLE_NODES
94 94/* always use node 0 for bootmem on this numa platform */
95/* 95#define bootmem_arch_preferred_node(__bdata, size, align, goal, limit) \
96 * Following are macros that are specific to this numa platform. 96 (NODE_DATA(0)->bdata)
97 */
98#define reserve_bootmem(addr, size, flags) \
99 reserve_bootmem_node(NODE_DATA(0), (addr), (size), (flags))
100#define alloc_bootmem(x) \
101 __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
102#define alloc_bootmem_nopanic(x) \
103 __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
104 __pa(MAX_DMA_ADDRESS))
105#define alloc_bootmem_low(x) \
106 __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0)
107#define alloc_bootmem_pages(x) \
108 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
109#define alloc_bootmem_pages_nopanic(x) \
110 __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), PAGE_SIZE, \
111 __pa(MAX_DMA_ADDRESS))
112#define alloc_bootmem_low_pages(x) \
113 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0)
114#define alloc_bootmem_node(pgdat, x) \
115({ \
116 struct pglist_data __maybe_unused \
117 *__alloc_bootmem_node__pgdat = (pgdat); \
118 __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
119 __pa(MAX_DMA_ADDRESS)); \
120})
121#define alloc_bootmem_pages_node(pgdat, x) \
122({ \
123 struct pglist_data __maybe_unused \
124 *__alloc_bootmem_node__pgdat = (pgdat); \
125 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, \
126 __pa(MAX_DMA_ADDRESS)); \
127})
128#define alloc_bootmem_low_pages_node(pgdat, x) \
129({ \
130 struct pglist_data __maybe_unused \
131 *__alloc_bootmem_node__pgdat = (pgdat); \
132 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0); \
133})
134#endif /* CONFIG_NEED_MULTIPLE_NODES */ 97#endif /* CONFIG_NEED_MULTIPLE_NODES */
135 98
136#endif /* _ASM_X86_MMZONE_32_H */ 99#endif /* _ASM_X86_MMZONE_32_H */
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index bd22f2a3713f..642fc7fc8cdc 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -9,7 +9,18 @@ extern int apic_version[MAX_APICS];
9extern int pic_mode; 9extern int pic_mode;
10 10
11#ifdef CONFIG_X86_32 11#ifdef CONFIG_X86_32
12#include <mach_mpspec.h> 12
13/*
14 * Summit or generic (i.e. installer) kernels need lots of bus entries.
15 * Maximum 256 PCI busses, plus 1 ISA bus in each of 4 cabinets.
16 */
17#if CONFIG_BASE_SMALL == 0
18# define MAX_MP_BUSSES 260
19#else
20# define MAX_MP_BUSSES 32
21#endif
22
23#define MAX_IRQ_SOURCES 256
13 24
14extern unsigned int def_to_bigsmp; 25extern unsigned int def_to_bigsmp;
15extern u8 apicid_2_node[]; 26extern u8 apicid_2_node[];
@@ -20,15 +31,15 @@ extern int mp_bus_id_to_local[MAX_MP_BUSSES];
20extern int quad_local_to_mp_bus_id [NR_CPUS/4][4]; 31extern int quad_local_to_mp_bus_id [NR_CPUS/4][4];
21#endif 32#endif
22 33
23#define MAX_APICID 256 34#define MAX_APICID 256
24 35
25#else 36#else /* CONFIG_X86_64: */
26 37
27#define MAX_MP_BUSSES 256 38#define MAX_MP_BUSSES 256
28/* Each PCI slot may be a combo card with its own bus. 4 IRQ pins per slot. */ 39/* Each PCI slot may be a combo card with its own bus. 4 IRQ pins per slot. */
29#define MAX_IRQ_SOURCES (MAX_MP_BUSSES * 4) 40#define MAX_IRQ_SOURCES (MAX_MP_BUSSES * 4)
30 41
31#endif 42#endif /* CONFIG_X86_64 */
32 43
33extern void early_find_smp_config(void); 44extern void early_find_smp_config(void);
34extern void early_get_smp_config(void); 45extern void early_get_smp_config(void);
@@ -45,11 +56,13 @@ extern int smp_found_config;
45extern int mpc_default_type; 56extern int mpc_default_type;
46extern unsigned long mp_lapic_addr; 57extern unsigned long mp_lapic_addr;
47 58
48extern void find_smp_config(void);
49extern void get_smp_config(void); 59extern void get_smp_config(void);
60
50#ifdef CONFIG_X86_MPPARSE 61#ifdef CONFIG_X86_MPPARSE
62extern void find_smp_config(void);
51extern void early_reserve_e820_mpc_new(void); 63extern void early_reserve_e820_mpc_new(void);
52#else 64#else
65static inline void find_smp_config(void) { }
53static inline void early_reserve_e820_mpc_new(void) { } 66static inline void early_reserve_e820_mpc_new(void) { }
54#endif 67#endif
55 68
@@ -64,6 +77,8 @@ extern int acpi_probe_gsi(void);
64#ifdef CONFIG_X86_IO_APIC 77#ifdef CONFIG_X86_IO_APIC
65extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, 78extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
66 u32 gsi, int triggering, int polarity); 79 u32 gsi, int triggering, int polarity);
80extern int mp_find_ioapic(int gsi);
81extern int mp_find_ioapic_pin(int ioapic, int gsi);
67#else 82#else
68static inline int 83static inline int
69mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, 84mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
@@ -148,4 +163,8 @@ static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map)
148 163
149extern physid_mask_t phys_cpu_present_map; 164extern physid_mask_t phys_cpu_present_map;
150 165
166extern int generic_mps_oem_check(struct mpc_table *, char *, char *);
167
168extern int default_acpi_madt_oem_check(char *, char *);
169
151#endif /* _ASM_X86_MPSPEC_H */ 170#endif /* _ASM_X86_MPSPEC_H */
diff --git a/arch/x86/include/asm/mpspec_def.h b/arch/x86/include/asm/mpspec_def.h
index 59568bc4767f..4a7f96d7c188 100644
--- a/arch/x86/include/asm/mpspec_def.h
+++ b/arch/x86/include/asm/mpspec_def.h
@@ -24,17 +24,18 @@
24# endif 24# endif
25#endif 25#endif
26 26
27struct intel_mp_floating { 27/* Intel MP Floating Pointer Structure */
28 char mpf_signature[4]; /* "_MP_" */ 28struct mpf_intel {
29 unsigned int mpf_physptr; /* Configuration table address */ 29 char signature[4]; /* "_MP_" */
30 unsigned char mpf_length; /* Our length (paragraphs) */ 30 unsigned int physptr; /* Configuration table address */
31 unsigned char mpf_specification;/* Specification version */ 31 unsigned char length; /* Our length (paragraphs) */
32 unsigned char mpf_checksum; /* Checksum (makes sum 0) */ 32 unsigned char specification; /* Specification version */
33 unsigned char mpf_feature1; /* Standard or configuration ? */ 33 unsigned char checksum; /* Checksum (makes sum 0) */
34 unsigned char mpf_feature2; /* Bit7 set for IMCR|PIC */ 34 unsigned char feature1; /* Standard or configuration ? */
35 unsigned char mpf_feature3; /* Unused (0) */ 35 unsigned char feature2; /* Bit7 set for IMCR|PIC */
36 unsigned char mpf_feature4; /* Unused (0) */ 36 unsigned char feature3; /* Unused (0) */
37 unsigned char mpf_feature5; /* Unused (0) */ 37 unsigned char feature4; /* Unused (0) */
38 unsigned char feature5; /* Unused (0) */
38}; 39};
39 40
40#define MPC_SIGNATURE "PCMP" 41#define MPC_SIGNATURE "PCMP"
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 358acc59ae04..2dbd2314139e 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -77,6 +77,11 @@
77#define MSR_IA32_MC0_ADDR 0x00000402 77#define MSR_IA32_MC0_ADDR 0x00000402
78#define MSR_IA32_MC0_MISC 0x00000403 78#define MSR_IA32_MC0_MISC 0x00000403
79 79
80/* These are consecutive and not in the normal 4er MCE bank block */
81#define MSR_IA32_MC0_CTL2 0x00000280
82#define CMCI_EN (1ULL << 30)
83#define CMCI_THRESHOLD_MASK 0xffffULL
84
80#define MSR_P6_PERFCTR0 0x000000c1 85#define MSR_P6_PERFCTR0 0x000000c1
81#define MSR_P6_PERFCTR1 0x000000c2 86#define MSR_P6_PERFCTR1 0x000000c2
82#define MSR_P6_EVNTSEL0 0x00000186 87#define MSR_P6_EVNTSEL0 0x00000186
diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h
index e9f5db796244..a37229011b56 100644
--- a/arch/x86/include/asm/numa_32.h
+++ b/arch/x86/include/asm/numa_32.h
@@ -4,8 +4,12 @@
4extern int pxm_to_nid(int pxm); 4extern int pxm_to_nid(int pxm);
5extern void numa_remove_cpu(int cpu); 5extern void numa_remove_cpu(int cpu);
6 6
7#ifdef CONFIG_NUMA 7#ifdef CONFIG_HIGHMEM
8extern void set_highmem_pages_init(void); 8extern void set_highmem_pages_init(void);
9#else
10static inline void set_highmem_pages_init(void)
11{
12}
9#endif 13#endif
10 14
11#endif /* _ASM_X86_NUMA_32_H */ 15#endif /* _ASM_X86_NUMA_32_H */
diff --git a/arch/x86/include/asm/numaq.h b/arch/x86/include/asm/numaq.h
index 1e8bd30b4c16..9f0a5f5d29ec 100644
--- a/arch/x86/include/asm/numaq.h
+++ b/arch/x86/include/asm/numaq.h
@@ -31,6 +31,8 @@
31extern int found_numaq; 31extern int found_numaq;
32extern int get_memcfg_numaq(void); 32extern int get_memcfg_numaq(void);
33 33
34extern void *xquad_portio;
35
34/* 36/*
35 * SYS_CFG_DATA_PRIV_ADDR, struct eachquadmem, and struct sys_cfg_data are the 37 * SYS_CFG_DATA_PRIV_ADDR, struct eachquadmem, and struct sys_cfg_data are the
36 */ 38 */
diff --git a/arch/x86/include/asm/numaq/apic.h b/arch/x86/include/asm/numaq/apic.h
deleted file mode 100644
index bf37bc49bd8e..000000000000
--- a/arch/x86/include/asm/numaq/apic.h
+++ /dev/null
@@ -1,142 +0,0 @@
1#ifndef __ASM_NUMAQ_APIC_H
2#define __ASM_NUMAQ_APIC_H
3
4#include <asm/io.h>
5#include <linux/mmzone.h>
6#include <linux/nodemask.h>
7
8#define APIC_DFR_VALUE (APIC_DFR_CLUSTER)
9
10static inline const cpumask_t *target_cpus(void)
11{
12 return &CPU_MASK_ALL;
13}
14
15#define NO_BALANCE_IRQ (1)
16#define esr_disable (1)
17
18#define INT_DELIVERY_MODE dest_LowestPrio
19#define INT_DEST_MODE 0 /* physical delivery on LOCAL quad */
20
21static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
22{
23 return physid_isset(apicid, bitmap);
24}
25static inline unsigned long check_apicid_present(int bit)
26{
27 return physid_isset(bit, phys_cpu_present_map);
28}
29#define apicid_cluster(apicid) (apicid & 0xF0)
30
31static inline int apic_id_registered(void)
32{
33 return 1;
34}
35
36static inline void init_apic_ldr(void)
37{
38 /* Already done in NUMA-Q firmware */
39}
40
41static inline void setup_apic_routing(void)
42{
43 printk("Enabling APIC mode: %s. Using %d I/O APICs\n",
44 "NUMA-Q", nr_ioapics);
45}
46
47/*
48 * Skip adding the timer int on secondary nodes, which causes
49 * a small but painful rift in the time-space continuum.
50 */
51static inline int multi_timer_check(int apic, int irq)
52{
53 return apic != 0 && irq == 0;
54}
55
56static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map)
57{
58 /* We don't have a good way to do this yet - hack */
59 return physids_promote(0xFUL);
60}
61
62/* Mapping from cpu number to logical apicid */
63extern u8 cpu_2_logical_apicid[];
64static inline int cpu_to_logical_apicid(int cpu)
65{
66 if (cpu >= nr_cpu_ids)
67 return BAD_APICID;
68 return (int)cpu_2_logical_apicid[cpu];
69}
70
71/*
72 * Supporting over 60 cpus on NUMA-Q requires a locality-dependent
73 * cpu to APIC ID relation to properly interact with the intelligent
74 * mode of the cluster controller.
75 */
76static inline int cpu_present_to_apicid(int mps_cpu)
77{
78 if (mps_cpu < 60)
79 return ((mps_cpu >> 2) << 4) | (1 << (mps_cpu & 0x3));
80 else
81 return BAD_APICID;
82}
83
84static inline int apicid_to_node(int logical_apicid)
85{
86 return logical_apicid >> 4;
87}
88
89static inline physid_mask_t apicid_to_cpu_present(int logical_apicid)
90{
91 int node = apicid_to_node(logical_apicid);
92 int cpu = __ffs(logical_apicid & 0xf);
93
94 return physid_mask_of_physid(cpu + 4*node);
95}
96
97extern void *xquad_portio;
98
99static inline void setup_portio_remap(void)
100{
101 int num_quads = num_online_nodes();
102
103 if (num_quads <= 1)
104 return;
105
106 printk("Remapping cross-quad port I/O for %d quads\n", num_quads);
107 xquad_portio = ioremap(XQUAD_PORTIO_BASE, num_quads*XQUAD_PORTIO_QUAD);
108 printk("xquad_portio vaddr 0x%08lx, len %08lx\n",
109 (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD);
110}
111
112static inline int check_phys_apicid_present(int boot_cpu_physical_apicid)
113{
114 return (1);
115}
116
117static inline void enable_apic_mode(void)
118{
119}
120
121/*
122 * We use physical apicids here, not logical, so just return the default
123 * physical broadcast to stop people from breaking us
124 */
125static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask)
126{
127 return (int) 0xF;
128}
129
130static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *cpumask,
131 const struct cpumask *andmask)
132{
133 return (int) 0xF;
134}
135
136/* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */
137static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
138{
139 return cpuid_apic >> index_msb;
140}
141
142#endif /* __ASM_NUMAQ_APIC_H */
diff --git a/arch/x86/include/asm/numaq/apicdef.h b/arch/x86/include/asm/numaq/apicdef.h
deleted file mode 100644
index e012a46cc22a..000000000000
--- a/arch/x86/include/asm/numaq/apicdef.h
+++ /dev/null
@@ -1,14 +0,0 @@
1#ifndef __ASM_NUMAQ_APICDEF_H
2#define __ASM_NUMAQ_APICDEF_H
3
4
5#define APIC_ID_MASK (0xF<<24)
6
7static inline unsigned get_apic_id(unsigned long x)
8{
9 return (((x)>>24)&0x0F);
10}
11
12#define GET_APIC_ID(x) get_apic_id(x)
13
14#endif
diff --git a/arch/x86/include/asm/numaq/ipi.h b/arch/x86/include/asm/numaq/ipi.h
deleted file mode 100644
index a8374c652778..000000000000
--- a/arch/x86/include/asm/numaq/ipi.h
+++ /dev/null
@@ -1,22 +0,0 @@
1#ifndef __ASM_NUMAQ_IPI_H
2#define __ASM_NUMAQ_IPI_H
3
4void send_IPI_mask_sequence(const struct cpumask *mask, int vector);
5void send_IPI_mask_allbutself(const struct cpumask *mask, int vector);
6
7static inline void send_IPI_mask(const struct cpumask *mask, int vector)
8{
9 send_IPI_mask_sequence(mask, vector);
10}
11
12static inline void send_IPI_allbutself(int vector)
13{
14 send_IPI_mask_allbutself(cpu_online_mask, vector);
15}
16
17static inline void send_IPI_all(int vector)
18{
19 send_IPI_mask(cpu_online_mask, vector);
20}
21
22#endif /* __ASM_NUMAQ_IPI_H */
diff --git a/arch/x86/include/asm/numaq/mpparse.h b/arch/x86/include/asm/numaq/mpparse.h
deleted file mode 100644
index a2eeefcd1cc7..000000000000
--- a/arch/x86/include/asm/numaq/mpparse.h
+++ /dev/null
@@ -1,6 +0,0 @@
1#ifndef __ASM_NUMAQ_MPPARSE_H
2#define __ASM_NUMAQ_MPPARSE_H
3
4extern void numaq_mps_oem_check(struct mpc_table *, char *, char *);
5
6#endif /* __ASM_NUMAQ_MPPARSE_H */
diff --git a/arch/x86/include/asm/numaq/wakecpu.h b/arch/x86/include/asm/numaq/wakecpu.h
deleted file mode 100644
index 6f499df8eddb..000000000000
--- a/arch/x86/include/asm/numaq/wakecpu.h
+++ /dev/null
@@ -1,45 +0,0 @@
1#ifndef __ASM_NUMAQ_WAKECPU_H
2#define __ASM_NUMAQ_WAKECPU_H
3
4/* This file copes with machines that wakeup secondary CPUs by NMIs */
5
6#define TRAMPOLINE_PHYS_LOW (0x8)
7#define TRAMPOLINE_PHYS_HIGH (0xa)
8
9/* We don't do anything here because we use NMI's to boot instead */
10static inline void wait_for_init_deassert(atomic_t *deassert)
11{
12}
13
14/*
15 * Because we use NMIs rather than the INIT-STARTUP sequence to
16 * bootstrap the CPUs, the APIC may be in a weird state. Kick it.
17 */
18static inline void smp_callin_clear_local_apic(void)
19{
20 clear_local_APIC();
21}
22
23static inline void store_NMI_vector(unsigned short *high, unsigned short *low)
24{
25 printk("Storing NMI vector\n");
26 *high =
27 *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH));
28 *low =
29 *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW));
30}
31
32static inline void restore_NMI_vector(unsigned short *high, unsigned short *low)
33{
34 printk("Restoring NMI vector\n");
35 *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
36 *high;
37 *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
38 *low;
39}
40
41static inline void inquire_remote_apic(int apicid)
42{
43}
44
45#endif /* __ASM_NUMAQ_WAKECPU_H */
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 776579119a00..89ed9d70b0aa 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -1,42 +1,11 @@
1#ifndef _ASM_X86_PAGE_H 1#ifndef _ASM_X86_PAGE_H
2#define _ASM_X86_PAGE_H 2#define _ASM_X86_PAGE_H
3 3
4#include <linux/const.h> 4#include <linux/types.h>
5
6/* PAGE_SHIFT determines the page size */
7#define PAGE_SHIFT 12
8#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
9#define PAGE_MASK (~(PAGE_SIZE-1))
10 5
11#ifdef __KERNEL__ 6#ifdef __KERNEL__
12 7
13#define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1) 8#include <asm/page_types.h>
14#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
15
16/* Cast PAGE_MASK to a signed type so that it is sign-extended if
17 virtual addresses are 32-bits but physical addresses are larger
18 (ie, 32-bit PAE). */
19#define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK)
20
21/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
22#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
23
24/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */
25#define PTE_FLAGS_MASK (~PTE_PFN_MASK)
26
27#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
28#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
29
30#define HPAGE_SHIFT PMD_SHIFT
31#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT)
32#define HPAGE_MASK (~(HPAGE_SIZE - 1))
33#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
34
35#define HUGE_MAX_HSTATE 2
36
37#ifndef __ASSEMBLY__
38#include <linux/types.h>
39#endif
40 9
41#ifdef CONFIG_X86_64 10#ifdef CONFIG_X86_64
42#include <asm/page_64.h> 11#include <asm/page_64.h>
@@ -44,38 +13,18 @@
44#include <asm/page_32.h> 13#include <asm/page_32.h>
45#endif /* CONFIG_X86_64 */ 14#endif /* CONFIG_X86_64 */
46 15
47#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
48
49#define VM_DATA_DEFAULT_FLAGS \
50 (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
51 VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
52
53
54#ifndef __ASSEMBLY__ 16#ifndef __ASSEMBLY__
55 17
56typedef struct { pgdval_t pgd; } pgd_t;
57typedef struct { pgprotval_t pgprot; } pgprot_t;
58
59extern int page_is_ram(unsigned long pagenr);
60extern int devmem_is_allowed(unsigned long pagenr);
61extern void map_devmem(unsigned long pfn, unsigned long size,
62 pgprot_t vma_prot);
63extern void unmap_devmem(unsigned long pfn, unsigned long size,
64 pgprot_t vma_prot);
65
66extern unsigned long max_low_pfn_mapped;
67extern unsigned long max_pfn_mapped;
68
69struct page; 18struct page;
70 19
71static inline void clear_user_page(void *page, unsigned long vaddr, 20static inline void clear_user_page(void *page, unsigned long vaddr,
72 struct page *pg) 21 struct page *pg)
73{ 22{
74 clear_page(page); 23 clear_page(page);
75} 24}
76 25
77static inline void copy_user_page(void *to, void *from, unsigned long vaddr, 26static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
78 struct page *topage) 27 struct page *topage)
79{ 28{
80 copy_page(to, from); 29 copy_page(to, from);
81} 30}
@@ -84,99 +33,6 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
84 alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr) 33 alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
85#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE 34#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
86 35
87static inline pgd_t native_make_pgd(pgdval_t val)
88{
89 return (pgd_t) { val };
90}
91
92static inline pgdval_t native_pgd_val(pgd_t pgd)
93{
94 return pgd.pgd;
95}
96
97#if PAGETABLE_LEVELS >= 3
98#if PAGETABLE_LEVELS == 4
99typedef struct { pudval_t pud; } pud_t;
100
101static inline pud_t native_make_pud(pmdval_t val)
102{
103 return (pud_t) { val };
104}
105
106static inline pudval_t native_pud_val(pud_t pud)
107{
108 return pud.pud;
109}
110#else /* PAGETABLE_LEVELS == 3 */
111#include <asm-generic/pgtable-nopud.h>
112
113static inline pudval_t native_pud_val(pud_t pud)
114{
115 return native_pgd_val(pud.pgd);
116}
117#endif /* PAGETABLE_LEVELS == 4 */
118
119typedef struct { pmdval_t pmd; } pmd_t;
120
121static inline pmd_t native_make_pmd(pmdval_t val)
122{
123 return (pmd_t) { val };
124}
125
126static inline pmdval_t native_pmd_val(pmd_t pmd)
127{
128 return pmd.pmd;
129}
130#else /* PAGETABLE_LEVELS == 2 */
131#include <asm-generic/pgtable-nopmd.h>
132
133static inline pmdval_t native_pmd_val(pmd_t pmd)
134{
135 return native_pgd_val(pmd.pud.pgd);
136}
137#endif /* PAGETABLE_LEVELS >= 3 */
138
139static inline pte_t native_make_pte(pteval_t val)
140{
141 return (pte_t) { .pte = val };
142}
143
144static inline pteval_t native_pte_val(pte_t pte)
145{
146 return pte.pte;
147}
148
149static inline pteval_t native_pte_flags(pte_t pte)
150{
151 return native_pte_val(pte) & PTE_FLAGS_MASK;
152}
153
154#define pgprot_val(x) ((x).pgprot)
155#define __pgprot(x) ((pgprot_t) { (x) } )
156
157#ifdef CONFIG_PARAVIRT
158#include <asm/paravirt.h>
159#else /* !CONFIG_PARAVIRT */
160
161#define pgd_val(x) native_pgd_val(x)
162#define __pgd(x) native_make_pgd(x)
163
164#ifndef __PAGETABLE_PUD_FOLDED
165#define pud_val(x) native_pud_val(x)
166#define __pud(x) native_make_pud(x)
167#endif
168
169#ifndef __PAGETABLE_PMD_FOLDED
170#define pmd_val(x) native_pmd_val(x)
171#define __pmd(x) native_make_pmd(x)
172#endif
173
174#define pte_val(x) native_pte_val(x)
175#define pte_flags(x) native_pte_flags(x)
176#define __pte(x) native_make_pte(x)
177
178#endif /* CONFIG_PARAVIRT */
179
180#define __pa(x) __phys_addr((unsigned long)(x)) 36#define __pa(x) __phys_addr((unsigned long)(x))
181#define __pa_nodebug(x) __phys_addr_nodebug((unsigned long)(x)) 37#define __pa_nodebug(x) __phys_addr_nodebug((unsigned long)(x))
182/* __pa_symbol should be used for C visible symbols. 38/* __pa_symbol should be used for C visible symbols.
diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h
index bcde0d7b4325..da4e762406f7 100644
--- a/arch/x86/include/asm/page_32.h
+++ b/arch/x86/include/asm/page_32.h
@@ -1,82 +1,14 @@
1#ifndef _ASM_X86_PAGE_32_H 1#ifndef _ASM_X86_PAGE_32_H
2#define _ASM_X86_PAGE_32_H 2#define _ASM_X86_PAGE_32_H
3 3
4/* 4#include <asm/page_32_types.h>
5 * This handles the memory map.
6 *
7 * A __PAGE_OFFSET of 0xC0000000 means that the kernel has
8 * a virtual address space of one gigabyte, which limits the
9 * amount of physical memory you can use to about 950MB.
10 *
11 * If you want more physical memory than this then see the CONFIG_HIGHMEM4G
12 * and CONFIG_HIGHMEM64G options in the kernel configuration.
13 */
14#define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL)
15
16#ifdef CONFIG_4KSTACKS
17#define THREAD_ORDER 0
18#else
19#define THREAD_ORDER 1
20#endif
21#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
22
23#define STACKFAULT_STACK 0
24#define DOUBLEFAULT_STACK 1
25#define NMI_STACK 0
26#define DEBUG_STACK 0
27#define MCE_STACK 0
28#define N_EXCEPTION_STACKS 1
29
30#ifdef CONFIG_X86_PAE
31/* 44=32+12, the limit we can fit into an unsigned long pfn */
32#define __PHYSICAL_MASK_SHIFT 44
33#define __VIRTUAL_MASK_SHIFT 32
34#define PAGETABLE_LEVELS 3
35
36#ifndef __ASSEMBLY__
37typedef u64 pteval_t;
38typedef u64 pmdval_t;
39typedef u64 pudval_t;
40typedef u64 pgdval_t;
41typedef u64 pgprotval_t;
42
43typedef union {
44 struct {
45 unsigned long pte_low, pte_high;
46 };
47 pteval_t pte;
48} pte_t;
49#endif /* __ASSEMBLY__
50 */
51#else /* !CONFIG_X86_PAE */
52#define __PHYSICAL_MASK_SHIFT 32
53#define __VIRTUAL_MASK_SHIFT 32
54#define PAGETABLE_LEVELS 2
55
56#ifndef __ASSEMBLY__
57typedef unsigned long pteval_t;
58typedef unsigned long pmdval_t;
59typedef unsigned long pudval_t;
60typedef unsigned long pgdval_t;
61typedef unsigned long pgprotval_t;
62
63typedef union {
64 pteval_t pte;
65 pteval_t pte_low;
66} pte_t;
67
68#endif /* __ASSEMBLY__ */
69#endif /* CONFIG_X86_PAE */
70 5
71#ifndef __ASSEMBLY__ 6#ifndef __ASSEMBLY__
72typedef struct page *pgtable_t;
73#endif
74 7
75#ifdef CONFIG_HUGETLB_PAGE 8#ifdef CONFIG_HUGETLB_PAGE
76#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA 9#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
77#endif 10#endif
78 11
79#ifndef __ASSEMBLY__
80#define __phys_addr_nodebug(x) ((x) - PAGE_OFFSET) 12#define __phys_addr_nodebug(x) ((x) - PAGE_OFFSET)
81#ifdef CONFIG_DEBUG_VIRTUAL 13#ifdef CONFIG_DEBUG_VIRTUAL
82extern unsigned long __phys_addr(unsigned long); 14extern unsigned long __phys_addr(unsigned long);
@@ -89,23 +21,6 @@ extern unsigned long __phys_addr(unsigned long);
89#define pfn_valid(pfn) ((pfn) < max_mapnr) 21#define pfn_valid(pfn) ((pfn) < max_mapnr)
90#endif /* CONFIG_FLATMEM */ 22#endif /* CONFIG_FLATMEM */
91 23
92extern int nx_enabled;
93
94/*
95 * This much address space is reserved for vmalloc() and iomap()
96 * as well as fixmap mappings.
97 */
98extern unsigned int __VMALLOC_RESERVE;
99extern int sysctl_legacy_va_layout;
100
101extern void find_low_pfn_range(void);
102extern unsigned long init_memory_mapping(unsigned long start,
103 unsigned long end);
104extern void initmem_init(unsigned long, unsigned long);
105extern void free_initmem(void);
106extern void setup_bootmem_allocator(void);
107
108
109#ifdef CONFIG_X86_USE_3DNOW 24#ifdef CONFIG_X86_USE_3DNOW
110#include <asm/mmx.h> 25#include <asm/mmx.h>
111 26
diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h
new file mode 100644
index 000000000000..f1e4a79a6e41
--- /dev/null
+++ b/arch/x86/include/asm/page_32_types.h
@@ -0,0 +1,60 @@
1#ifndef _ASM_X86_PAGE_32_DEFS_H
2#define _ASM_X86_PAGE_32_DEFS_H
3
4#include <linux/const.h>
5
6/*
7 * This handles the memory map.
8 *
9 * A __PAGE_OFFSET of 0xC0000000 means that the kernel has
10 * a virtual address space of one gigabyte, which limits the
11 * amount of physical memory you can use to about 950MB.
12 *
13 * If you want more physical memory than this then see the CONFIG_HIGHMEM4G
14 * and CONFIG_HIGHMEM64G options in the kernel configuration.
15 */
16#define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL)
17
18#ifdef CONFIG_4KSTACKS
19#define THREAD_ORDER 0
20#else
21#define THREAD_ORDER 1
22#endif
23#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
24
25#define STACKFAULT_STACK 0
26#define DOUBLEFAULT_STACK 1
27#define NMI_STACK 0
28#define DEBUG_STACK 0
29#define MCE_STACK 0
30#define N_EXCEPTION_STACKS 1
31
32#ifdef CONFIG_X86_PAE
33/* 44=32+12, the limit we can fit into an unsigned long pfn */
34#define __PHYSICAL_MASK_SHIFT 44
35#define __VIRTUAL_MASK_SHIFT 32
36
37#else /* !CONFIG_X86_PAE */
38#define __PHYSICAL_MASK_SHIFT 32
39#define __VIRTUAL_MASK_SHIFT 32
40#endif /* CONFIG_X86_PAE */
41
42#ifndef __ASSEMBLY__
43
44/*
45 * This much address space is reserved for vmalloc() and iomap()
46 * as well as fixmap mappings.
47 */
48extern unsigned int __VMALLOC_RESERVE;
49extern int sysctl_legacy_va_layout;
50
51extern void find_low_pfn_range(void);
52extern unsigned long init_memory_mapping(unsigned long start,
53 unsigned long end);
54extern void initmem_init(unsigned long, unsigned long);
55extern void free_initmem(void);
56extern void setup_bootmem_allocator(void);
57
58#endif /* !__ASSEMBLY__ */
59
60#endif /* _ASM_X86_PAGE_32_DEFS_H */
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 5ebca29f44f0..072694ed81a5 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -1,105 +1,6 @@
1#ifndef _ASM_X86_PAGE_64_H 1#ifndef _ASM_X86_PAGE_64_H
2#define _ASM_X86_PAGE_64_H 2#define _ASM_X86_PAGE_64_H
3 3
4#define PAGETABLE_LEVELS 4 4#include <asm/page_64_types.h>
5
6#define THREAD_ORDER 1
7#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
8#define CURRENT_MASK (~(THREAD_SIZE - 1))
9
10#define EXCEPTION_STACK_ORDER 0
11#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
12
13#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1)
14#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER)
15
16#define IRQSTACK_ORDER 2
17#define IRQSTACKSIZE (PAGE_SIZE << IRQSTACK_ORDER)
18
19#define STACKFAULT_STACK 1
20#define DOUBLEFAULT_STACK 2
21#define NMI_STACK 3
22#define DEBUG_STACK 4
23#define MCE_STACK 5
24#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */
25
26#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
27#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
28
29/*
30 * Set __PAGE_OFFSET to the most negative possible address +
31 * PGDIR_SIZE*16 (pgd slot 272). The gap is to allow a space for a
32 * hypervisor to fit. Choosing 16 slots here is arbitrary, but it's
33 * what Xen requires.
34 */
35#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
36
37#define __PHYSICAL_START CONFIG_PHYSICAL_START
38#define __KERNEL_ALIGN 0x200000
39
40/*
41 * Make sure kernel is aligned to 2MB address. Catching it at compile
42 * time is better. Change your config file and compile the kernel
43 * for a 2MB aligned address (CONFIG_PHYSICAL_START)
44 */
45#if (CONFIG_PHYSICAL_START % __KERNEL_ALIGN) != 0
46#error "CONFIG_PHYSICAL_START must be a multiple of 2MB"
47#endif
48
49#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
50#define __START_KERNEL_map _AC(0xffffffff80000000, UL)
51
52/* See Documentation/x86_64/mm.txt for a description of the memory map. */
53#define __PHYSICAL_MASK_SHIFT 46
54#define __VIRTUAL_MASK_SHIFT 48
55
56/*
57 * Kernel image size is limited to 512 MB (see level2_kernel_pgt in
58 * arch/x86/kernel/head_64.S), and it is mapped here:
59 */
60#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
61#define KERNEL_IMAGE_START _AC(0xffffffff80000000, UL)
62
63#ifndef __ASSEMBLY__
64void clear_page(void *page);
65void copy_page(void *to, void *from);
66
67/* duplicated to the one in bootmem.h */
68extern unsigned long max_pfn;
69extern unsigned long phys_base;
70
71extern unsigned long __phys_addr(unsigned long);
72#define __phys_reloc_hide(x) (x)
73
74/*
75 * These are used to make use of C type-checking..
76 */
77typedef unsigned long pteval_t;
78typedef unsigned long pmdval_t;
79typedef unsigned long pudval_t;
80typedef unsigned long pgdval_t;
81typedef unsigned long pgprotval_t;
82
83typedef struct page *pgtable_t;
84
85typedef struct { pteval_t pte; } pte_t;
86
87#define vmemmap ((struct page *)VMEMMAP_START)
88
89extern unsigned long init_memory_mapping(unsigned long start,
90 unsigned long end);
91
92extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
93extern void free_initmem(void);
94
95extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
96extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
97
98#endif /* !__ASSEMBLY__ */
99
100#ifdef CONFIG_FLATMEM
101#define pfn_valid(pfn) ((pfn) < max_pfn)
102#endif
103
104 5
105#endif /* _ASM_X86_PAGE_64_H */ 6#endif /* _ASM_X86_PAGE_64_H */
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
new file mode 100644
index 000000000000..d38c91b70248
--- /dev/null
+++ b/arch/x86/include/asm/page_64_types.h
@@ -0,0 +1,89 @@
1#ifndef _ASM_X86_PAGE_64_DEFS_H
2#define _ASM_X86_PAGE_64_DEFS_H
3
4#define THREAD_ORDER 1
5#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
6#define CURRENT_MASK (~(THREAD_SIZE - 1))
7
8#define EXCEPTION_STACK_ORDER 0
9#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
10
11#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1)
12#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER)
13
14#define IRQ_STACK_ORDER 2
15#define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER)
16
17#define STACKFAULT_STACK 1
18#define DOUBLEFAULT_STACK 2
19#define NMI_STACK 3
20#define DEBUG_STACK 4
21#define MCE_STACK 5
22#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */
23
24#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
25#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
26
27/*
28 * Set __PAGE_OFFSET to the most negative possible address +
29 * PGDIR_SIZE*16 (pgd slot 272). The gap is to allow a space for a
30 * hypervisor to fit. Choosing 16 slots here is arbitrary, but it's
31 * what Xen requires.
32 */
33#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
34
35#define __PHYSICAL_START CONFIG_PHYSICAL_START
36#define __KERNEL_ALIGN 0x200000
37
38/*
39 * Make sure kernel is aligned to 2MB address. Catching it at compile
40 * time is better. Change your config file and compile the kernel
41 * for a 2MB aligned address (CONFIG_PHYSICAL_START)
42 */
43#if (CONFIG_PHYSICAL_START % __KERNEL_ALIGN) != 0
44#error "CONFIG_PHYSICAL_START must be a multiple of 2MB"
45#endif
46
47#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
48#define __START_KERNEL_map _AC(0xffffffff80000000, UL)
49
50/* See Documentation/x86_64/mm.txt for a description of the memory map. */
51#define __PHYSICAL_MASK_SHIFT 46
52#define __VIRTUAL_MASK_SHIFT 48
53
54/*
55 * Kernel image size is limited to 512 MB (see level2_kernel_pgt in
56 * arch/x86/kernel/head_64.S), and it is mapped here:
57 */
58#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
59#define KERNEL_IMAGE_START _AC(0xffffffff80000000, UL)
60
61#ifndef __ASSEMBLY__
62void clear_page(void *page);
63void copy_page(void *to, void *from);
64
65/* duplicated to the one in bootmem.h */
66extern unsigned long max_pfn;
67extern unsigned long phys_base;
68
69extern unsigned long __phys_addr(unsigned long);
70#define __phys_reloc_hide(x) (x)
71
72#define vmemmap ((struct page *)VMEMMAP_START)
73
74extern unsigned long init_memory_mapping(unsigned long start,
75 unsigned long end);
76
77extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
78extern void free_initmem(void);
79
80extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
81extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
82
83#endif /* !__ASSEMBLY__ */
84
85#ifdef CONFIG_FLATMEM
86#define pfn_valid(pfn) ((pfn) < max_pfn)
87#endif
88
89#endif /* _ASM_X86_PAGE_64_DEFS_H */
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
new file mode 100644
index 000000000000..826ad37006ab
--- /dev/null
+++ b/arch/x86/include/asm/page_types.h
@@ -0,0 +1,51 @@
1#ifndef _ASM_X86_PAGE_DEFS_H
2#define _ASM_X86_PAGE_DEFS_H
3
4#include <linux/const.h>
5
6/* PAGE_SHIFT determines the page size */
7#define PAGE_SHIFT 12
8#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
9#define PAGE_MASK (~(PAGE_SIZE-1))
10
11#define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1)
12#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
13
14/* Cast PAGE_MASK to a signed type so that it is sign-extended if
15 virtual addresses are 32-bits but physical addresses are larger
16 (ie, 32-bit PAE). */
17#define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK)
18
19#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
20#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
21
22#define HPAGE_SHIFT PMD_SHIFT
23#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT)
24#define HPAGE_MASK (~(HPAGE_SIZE - 1))
25#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
26
27#define HUGE_MAX_HSTATE 2
28
29#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
30
31#define VM_DATA_DEFAULT_FLAGS \
32 (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
33 VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
34
35#ifdef CONFIG_X86_64
36#include <asm/page_64_types.h>
37#else
38#include <asm/page_32_types.h>
39#endif /* CONFIG_X86_64 */
40
41#ifndef __ASSEMBLY__
42
43extern int page_is_ram(unsigned long pagenr);
44extern int devmem_is_allowed(unsigned long pagenr);
45
46extern unsigned long max_low_pfn_mapped;
47extern unsigned long max_pfn_mapped;
48
49#endif /* !__ASSEMBLY__ */
50
51#endif /* _ASM_X86_PAGE_DEFS_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index e299287e8e33..0617d5cc9712 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -4,7 +4,7 @@
4 * para-virtualization: those hooks are defined here. */ 4 * para-virtualization: those hooks are defined here. */
5 5
6#ifdef CONFIG_PARAVIRT 6#ifdef CONFIG_PARAVIRT
7#include <asm/page.h> 7#include <asm/pgtable_types.h>
8#include <asm/asm.h> 8#include <asm/asm.h>
9 9
10/* Bitmask of what can be clobbered: usually at least eax. */ 10/* Bitmask of what can be clobbered: usually at least eax. */
@@ -12,21 +12,38 @@
12#define CLBR_EAX (1 << 0) 12#define CLBR_EAX (1 << 0)
13#define CLBR_ECX (1 << 1) 13#define CLBR_ECX (1 << 1)
14#define CLBR_EDX (1 << 2) 14#define CLBR_EDX (1 << 2)
15#define CLBR_EDI (1 << 3)
15 16
16#ifdef CONFIG_X86_64 17#ifdef CONFIG_X86_32
17#define CLBR_RSI (1 << 3) 18/* CLBR_ANY should match all regs platform has. For i386, that's just it */
18#define CLBR_RDI (1 << 4) 19#define CLBR_ANY ((1 << 4) - 1)
20
21#define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX)
22#define CLBR_RET_REG (CLBR_EAX | CLBR_EDX)
23#define CLBR_SCRATCH (0)
24#else
25#define CLBR_RAX CLBR_EAX
26#define CLBR_RCX CLBR_ECX
27#define CLBR_RDX CLBR_EDX
28#define CLBR_RDI CLBR_EDI
29#define CLBR_RSI (1 << 4)
19#define CLBR_R8 (1 << 5) 30#define CLBR_R8 (1 << 5)
20#define CLBR_R9 (1 << 6) 31#define CLBR_R9 (1 << 6)
21#define CLBR_R10 (1 << 7) 32#define CLBR_R10 (1 << 7)
22#define CLBR_R11 (1 << 8) 33#define CLBR_R11 (1 << 8)
34
23#define CLBR_ANY ((1 << 9) - 1) 35#define CLBR_ANY ((1 << 9) - 1)
36
37#define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \
38 CLBR_RCX | CLBR_R8 | CLBR_R9)
39#define CLBR_RET_REG (CLBR_RAX)
40#define CLBR_SCRATCH (CLBR_R10 | CLBR_R11)
41
24#include <asm/desc_defs.h> 42#include <asm/desc_defs.h>
25#else
26/* CLBR_ANY should match all regs platform has. For i386, that's just it */
27#define CLBR_ANY ((1 << 3) - 1)
28#endif /* X86_64 */ 43#endif /* X86_64 */
29 44
45#define CLBR_CALLEE_SAVE ((CLBR_ARG_REGS | CLBR_SCRATCH) & ~CLBR_RET_REG)
46
30#ifndef __ASSEMBLY__ 47#ifndef __ASSEMBLY__
31#include <linux/types.h> 48#include <linux/types.h>
32#include <linux/cpumask.h> 49#include <linux/cpumask.h>
@@ -40,6 +57,14 @@ struct tss_struct;
40struct mm_struct; 57struct mm_struct;
41struct desc_struct; 58struct desc_struct;
42 59
60/*
61 * Wrapper type for pointers to code which uses the non-standard
62 * calling convention. See PV_CALL_SAVE_REGS_THUNK below.
63 */
64struct paravirt_callee_save {
65 void *func;
66};
67
43/* general info */ 68/* general info */
44struct pv_info { 69struct pv_info {
45 unsigned int kernel_rpl; 70 unsigned int kernel_rpl;
@@ -189,11 +214,15 @@ struct pv_irq_ops {
189 * expected to use X86_EFLAGS_IF; all other bits 214 * expected to use X86_EFLAGS_IF; all other bits
190 * returned from save_fl are undefined, and may be ignored by 215 * returned from save_fl are undefined, and may be ignored by
191 * restore_fl. 216 * restore_fl.
217 *
218 * NOTE: These functions callers expect the callee to preserve
219 * more registers than the standard C calling convention.
192 */ 220 */
193 unsigned long (*save_fl)(void); 221 struct paravirt_callee_save save_fl;
194 void (*restore_fl)(unsigned long); 222 struct paravirt_callee_save restore_fl;
195 void (*irq_disable)(void); 223 struct paravirt_callee_save irq_disable;
196 void (*irq_enable)(void); 224 struct paravirt_callee_save irq_enable;
225
197 void (*safe_halt)(void); 226 void (*safe_halt)(void);
198 void (*halt)(void); 227 void (*halt)(void);
199 228
@@ -244,7 +273,8 @@ struct pv_mmu_ops {
244 void (*flush_tlb_user)(void); 273 void (*flush_tlb_user)(void);
245 void (*flush_tlb_kernel)(void); 274 void (*flush_tlb_kernel)(void);
246 void (*flush_tlb_single)(unsigned long addr); 275 void (*flush_tlb_single)(unsigned long addr);
247 void (*flush_tlb_others)(const cpumask_t *cpus, struct mm_struct *mm, 276 void (*flush_tlb_others)(const struct cpumask *cpus,
277 struct mm_struct *mm,
248 unsigned long va); 278 unsigned long va);
249 279
250 /* Hooks for allocating and freeing a pagetable top-level */ 280 /* Hooks for allocating and freeing a pagetable top-level */
@@ -278,12 +308,11 @@ struct pv_mmu_ops {
278 void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr, 308 void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr,
279 pte_t *ptep, pte_t pte); 309 pte_t *ptep, pte_t pte);
280 310
281 pteval_t (*pte_val)(pte_t); 311 struct paravirt_callee_save pte_val;
282 pteval_t (*pte_flags)(pte_t); 312 struct paravirt_callee_save make_pte;
283 pte_t (*make_pte)(pteval_t pte);
284 313
285 pgdval_t (*pgd_val)(pgd_t); 314 struct paravirt_callee_save pgd_val;
286 pgd_t (*make_pgd)(pgdval_t pgd); 315 struct paravirt_callee_save make_pgd;
287 316
288#if PAGETABLE_LEVELS >= 3 317#if PAGETABLE_LEVELS >= 3
289#ifdef CONFIG_X86_PAE 318#ifdef CONFIG_X86_PAE
@@ -298,12 +327,12 @@ struct pv_mmu_ops {
298 327
299 void (*set_pud)(pud_t *pudp, pud_t pudval); 328 void (*set_pud)(pud_t *pudp, pud_t pudval);
300 329
301 pmdval_t (*pmd_val)(pmd_t); 330 struct paravirt_callee_save pmd_val;
302 pmd_t (*make_pmd)(pmdval_t pmd); 331 struct paravirt_callee_save make_pmd;
303 332
304#if PAGETABLE_LEVELS == 4 333#if PAGETABLE_LEVELS == 4
305 pudval_t (*pud_val)(pud_t); 334 struct paravirt_callee_save pud_val;
306 pud_t (*make_pud)(pudval_t pud); 335 struct paravirt_callee_save make_pud;
307 336
308 void (*set_pgd)(pgd_t *pudp, pgd_t pgdval); 337 void (*set_pgd)(pgd_t *pudp, pgd_t pgdval);
309#endif /* PAGETABLE_LEVELS == 4 */ 338#endif /* PAGETABLE_LEVELS == 4 */
@@ -388,6 +417,8 @@ extern struct pv_lock_ops pv_lock_ops;
388 asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") 417 asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
389 418
390unsigned paravirt_patch_nop(void); 419unsigned paravirt_patch_nop(void);
420unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len);
421unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len);
391unsigned paravirt_patch_ignore(unsigned len); 422unsigned paravirt_patch_ignore(unsigned len);
392unsigned paravirt_patch_call(void *insnbuf, 423unsigned paravirt_patch_call(void *insnbuf,
393 const void *target, u16 tgt_clobbers, 424 const void *target, u16 tgt_clobbers,
@@ -479,25 +510,45 @@ int paravirt_disable_iospace(void);
479 * makes sure the incoming and outgoing types are always correct. 510 * makes sure the incoming and outgoing types are always correct.
480 */ 511 */
481#ifdef CONFIG_X86_32 512#ifdef CONFIG_X86_32
482#define PVOP_VCALL_ARGS unsigned long __eax, __edx, __ecx 513#define PVOP_VCALL_ARGS \
514 unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx
483#define PVOP_CALL_ARGS PVOP_VCALL_ARGS 515#define PVOP_CALL_ARGS PVOP_VCALL_ARGS
516
517#define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x))
518#define PVOP_CALL_ARG2(x) "d" ((unsigned long)(x))
519#define PVOP_CALL_ARG3(x) "c" ((unsigned long)(x))
520
484#define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \ 521#define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \
485 "=c" (__ecx) 522 "=c" (__ecx)
486#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS 523#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS
524
525#define PVOP_VCALLEE_CLOBBERS "=a" (__eax), "=d" (__edx)
526#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS
527
487#define EXTRA_CLOBBERS 528#define EXTRA_CLOBBERS
488#define VEXTRA_CLOBBERS 529#define VEXTRA_CLOBBERS
489#else 530#else /* CONFIG_X86_64 */
490#define PVOP_VCALL_ARGS unsigned long __edi, __esi, __edx, __ecx 531#define PVOP_VCALL_ARGS \
532 unsigned long __edi = __edi, __esi = __esi, \
533 __edx = __edx, __ecx = __ecx
491#define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax 534#define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax
535
536#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x))
537#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x))
538#define PVOP_CALL_ARG3(x) "d" ((unsigned long)(x))
539#define PVOP_CALL_ARG4(x) "c" ((unsigned long)(x))
540
492#define PVOP_VCALL_CLOBBERS "=D" (__edi), \ 541#define PVOP_VCALL_CLOBBERS "=D" (__edi), \
493 "=S" (__esi), "=d" (__edx), \ 542 "=S" (__esi), "=d" (__edx), \
494 "=c" (__ecx) 543 "=c" (__ecx)
495
496#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax) 544#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax)
497 545
546#define PVOP_VCALLEE_CLOBBERS "=a" (__eax)
547#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS
548
498#define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11" 549#define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11"
499#define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11" 550#define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11"
500#endif 551#endif /* CONFIG_X86_32 */
501 552
502#ifdef CONFIG_PARAVIRT_DEBUG 553#ifdef CONFIG_PARAVIRT_DEBUG
503#define PVOP_TEST_NULL(op) BUG_ON(op == NULL) 554#define PVOP_TEST_NULL(op) BUG_ON(op == NULL)
@@ -505,10 +556,11 @@ int paravirt_disable_iospace(void);
505#define PVOP_TEST_NULL(op) ((void)op) 556#define PVOP_TEST_NULL(op) ((void)op)
506#endif 557#endif
507 558
508#define __PVOP_CALL(rettype, op, pre, post, ...) \ 559#define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr, \
560 pre, post, ...) \
509 ({ \ 561 ({ \
510 rettype __ret; \ 562 rettype __ret; \
511 PVOP_CALL_ARGS; \ 563 PVOP_CALL_ARGS; \
512 PVOP_TEST_NULL(op); \ 564 PVOP_TEST_NULL(op); \
513 /* This is 32-bit specific, but is okay in 64-bit */ \ 565 /* This is 32-bit specific, but is okay in 64-bit */ \
514 /* since this condition will never hold */ \ 566 /* since this condition will never hold */ \
@@ -516,70 +568,113 @@ int paravirt_disable_iospace(void);
516 asm volatile(pre \ 568 asm volatile(pre \
517 paravirt_alt(PARAVIRT_CALL) \ 569 paravirt_alt(PARAVIRT_CALL) \
518 post \ 570 post \
519 : PVOP_CALL_CLOBBERS \ 571 : call_clbr \
520 : paravirt_type(op), \ 572 : paravirt_type(op), \
521 paravirt_clobber(CLBR_ANY), \ 573 paravirt_clobber(clbr), \
522 ##__VA_ARGS__ \ 574 ##__VA_ARGS__ \
523 : "memory", "cc" EXTRA_CLOBBERS); \ 575 : "memory", "cc" extra_clbr); \
524 __ret = (rettype)((((u64)__edx) << 32) | __eax); \ 576 __ret = (rettype)((((u64)__edx) << 32) | __eax); \
525 } else { \ 577 } else { \
526 asm volatile(pre \ 578 asm volatile(pre \
527 paravirt_alt(PARAVIRT_CALL) \ 579 paravirt_alt(PARAVIRT_CALL) \
528 post \ 580 post \
529 : PVOP_CALL_CLOBBERS \ 581 : call_clbr \
530 : paravirt_type(op), \ 582 : paravirt_type(op), \
531 paravirt_clobber(CLBR_ANY), \ 583 paravirt_clobber(clbr), \
532 ##__VA_ARGS__ \ 584 ##__VA_ARGS__ \
533 : "memory", "cc" EXTRA_CLOBBERS); \ 585 : "memory", "cc" extra_clbr); \
534 __ret = (rettype)__eax; \ 586 __ret = (rettype)__eax; \
535 } \ 587 } \
536 __ret; \ 588 __ret; \
537 }) 589 })
538#define __PVOP_VCALL(op, pre, post, ...) \ 590
591#define __PVOP_CALL(rettype, op, pre, post, ...) \
592 ____PVOP_CALL(rettype, op, CLBR_ANY, PVOP_CALL_CLOBBERS, \
593 EXTRA_CLOBBERS, pre, post, ##__VA_ARGS__)
594
595#define __PVOP_CALLEESAVE(rettype, op, pre, post, ...) \
596 ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \
597 PVOP_CALLEE_CLOBBERS, , \
598 pre, post, ##__VA_ARGS__)
599
600
601#define ____PVOP_VCALL(op, clbr, call_clbr, extra_clbr, pre, post, ...) \
539 ({ \ 602 ({ \
540 PVOP_VCALL_ARGS; \ 603 PVOP_VCALL_ARGS; \
541 PVOP_TEST_NULL(op); \ 604 PVOP_TEST_NULL(op); \
542 asm volatile(pre \ 605 asm volatile(pre \
543 paravirt_alt(PARAVIRT_CALL) \ 606 paravirt_alt(PARAVIRT_CALL) \
544 post \ 607 post \
545 : PVOP_VCALL_CLOBBERS \ 608 : call_clbr \
546 : paravirt_type(op), \ 609 : paravirt_type(op), \
547 paravirt_clobber(CLBR_ANY), \ 610 paravirt_clobber(clbr), \
548 ##__VA_ARGS__ \ 611 ##__VA_ARGS__ \
549 : "memory", "cc" VEXTRA_CLOBBERS); \ 612 : "memory", "cc" extra_clbr); \
550 }) 613 })
551 614
615#define __PVOP_VCALL(op, pre, post, ...) \
616 ____PVOP_VCALL(op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \
617 VEXTRA_CLOBBERS, \
618 pre, post, ##__VA_ARGS__)
619
620#define __PVOP_VCALLEESAVE(rettype, op, pre, post, ...) \
621 ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \
622 PVOP_VCALLEE_CLOBBERS, , \
623 pre, post, ##__VA_ARGS__)
624
625
626
552#define PVOP_CALL0(rettype, op) \ 627#define PVOP_CALL0(rettype, op) \
553 __PVOP_CALL(rettype, op, "", "") 628 __PVOP_CALL(rettype, op, "", "")
554#define PVOP_VCALL0(op) \ 629#define PVOP_VCALL0(op) \
555 __PVOP_VCALL(op, "", "") 630 __PVOP_VCALL(op, "", "")
556 631
632#define PVOP_CALLEE0(rettype, op) \
633 __PVOP_CALLEESAVE(rettype, op, "", "")
634#define PVOP_VCALLEE0(op) \
635 __PVOP_VCALLEESAVE(op, "", "")
636
637
557#define PVOP_CALL1(rettype, op, arg1) \ 638#define PVOP_CALL1(rettype, op, arg1) \
558 __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1))) 639 __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1))
559#define PVOP_VCALL1(op, arg1) \ 640#define PVOP_VCALL1(op, arg1) \
560 __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1))) 641 __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1))
642
643#define PVOP_CALLEE1(rettype, op, arg1) \
644 __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1))
645#define PVOP_VCALLEE1(op, arg1) \
646 __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1))
647
561 648
562#define PVOP_CALL2(rettype, op, arg1, arg2) \ 649#define PVOP_CALL2(rettype, op, arg1, arg2) \
563 __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \ 650 __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \
564 "1" ((unsigned long)(arg2))) 651 PVOP_CALL_ARG2(arg2))
565#define PVOP_VCALL2(op, arg1, arg2) \ 652#define PVOP_VCALL2(op, arg1, arg2) \
566 __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \ 653 __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \
567 "1" ((unsigned long)(arg2))) 654 PVOP_CALL_ARG2(arg2))
655
656#define PVOP_CALLEE2(rettype, op, arg1, arg2) \
657 __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \
658 PVOP_CALL_ARG2(arg2))
659#define PVOP_VCALLEE2(op, arg1, arg2) \
660 __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1), \
661 PVOP_CALL_ARG2(arg2))
662
568 663
569#define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \ 664#define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \
570 __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \ 665 __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \
571 "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3))) 666 PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3))
572#define PVOP_VCALL3(op, arg1, arg2, arg3) \ 667#define PVOP_VCALL3(op, arg1, arg2, arg3) \
573 __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \ 668 __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \
574 "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3))) 669 PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3))
575 670
576/* This is the only difference in x86_64. We can make it much simpler */ 671/* This is the only difference in x86_64. We can make it much simpler */
577#ifdef CONFIG_X86_32 672#ifdef CONFIG_X86_32
578#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ 673#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \
579 __PVOP_CALL(rettype, op, \ 674 __PVOP_CALL(rettype, op, \
580 "push %[_arg4];", "lea 4(%%esp),%%esp;", \ 675 "push %[_arg4];", "lea 4(%%esp),%%esp;", \
581 "0" ((u32)(arg1)), "1" ((u32)(arg2)), \ 676 PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
582 "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4))) 677 PVOP_CALL_ARG3(arg3), [_arg4] "mr" ((u32)(arg4)))
583#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ 678#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \
584 __PVOP_VCALL(op, \ 679 __PVOP_VCALL(op, \
585 "push %[_arg4];", "lea 4(%%esp),%%esp;", \ 680 "push %[_arg4];", "lea 4(%%esp),%%esp;", \
@@ -587,13 +682,13 @@ int paravirt_disable_iospace(void);
587 "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4))) 682 "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))
588#else 683#else
589#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ 684#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \
590 __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \ 685 __PVOP_CALL(rettype, op, "", "", \
591 "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)), \ 686 PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
592 "3"((unsigned long)(arg4))) 687 PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
593#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ 688#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \
594 __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \ 689 __PVOP_VCALL(op, "", "", \
595 "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)), \ 690 PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
596 "3"((unsigned long)(arg4))) 691 PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
597#endif 692#endif
598 693
599static inline int paravirt_enabled(void) 694static inline int paravirt_enabled(void)
@@ -984,10 +1079,11 @@ static inline void __flush_tlb_single(unsigned long addr)
984 PVOP_VCALL1(pv_mmu_ops.flush_tlb_single, addr); 1079 PVOP_VCALL1(pv_mmu_ops.flush_tlb_single, addr);
985} 1080}
986 1081
987static inline void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, 1082static inline void flush_tlb_others(const struct cpumask *cpumask,
1083 struct mm_struct *mm,
988 unsigned long va) 1084 unsigned long va)
989{ 1085{
990 PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, &cpumask, mm, va); 1086 PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, cpumask, mm, va);
991} 1087}
992 1088
993static inline int paravirt_pgd_alloc(struct mm_struct *mm) 1089static inline int paravirt_pgd_alloc(struct mm_struct *mm)
@@ -1059,13 +1155,13 @@ static inline pte_t __pte(pteval_t val)
1059 pteval_t ret; 1155 pteval_t ret;
1060 1156
1061 if (sizeof(pteval_t) > sizeof(long)) 1157 if (sizeof(pteval_t) > sizeof(long))
1062 ret = PVOP_CALL2(pteval_t, 1158 ret = PVOP_CALLEE2(pteval_t,
1063 pv_mmu_ops.make_pte, 1159 pv_mmu_ops.make_pte,
1064 val, (u64)val >> 32); 1160 val, (u64)val >> 32);
1065 else 1161 else
1066 ret = PVOP_CALL1(pteval_t, 1162 ret = PVOP_CALLEE1(pteval_t,
1067 pv_mmu_ops.make_pte, 1163 pv_mmu_ops.make_pte,
1068 val); 1164 val);
1069 1165
1070 return (pte_t) { .pte = ret }; 1166 return (pte_t) { .pte = ret };
1071} 1167}
@@ -1075,29 +1171,12 @@ static inline pteval_t pte_val(pte_t pte)
1075 pteval_t ret; 1171 pteval_t ret;
1076 1172
1077 if (sizeof(pteval_t) > sizeof(long)) 1173 if (sizeof(pteval_t) > sizeof(long))
1078 ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_val, 1174 ret = PVOP_CALLEE2(pteval_t, pv_mmu_ops.pte_val,
1079 pte.pte, (u64)pte.pte >> 32); 1175 pte.pte, (u64)pte.pte >> 32);
1080 else
1081 ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_val,
1082 pte.pte);
1083
1084 return ret;
1085}
1086
1087static inline pteval_t pte_flags(pte_t pte)
1088{
1089 pteval_t ret;
1090
1091 if (sizeof(pteval_t) > sizeof(long))
1092 ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_flags,
1093 pte.pte, (u64)pte.pte >> 32);
1094 else 1176 else
1095 ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_flags, 1177 ret = PVOP_CALLEE1(pteval_t, pv_mmu_ops.pte_val,
1096 pte.pte); 1178 pte.pte);
1097 1179
1098#ifdef CONFIG_PARAVIRT_DEBUG
1099 BUG_ON(ret & PTE_PFN_MASK);
1100#endif
1101 return ret; 1180 return ret;
1102} 1181}
1103 1182
@@ -1106,11 +1185,11 @@ static inline pgd_t __pgd(pgdval_t val)
1106 pgdval_t ret; 1185 pgdval_t ret;
1107 1186
1108 if (sizeof(pgdval_t) > sizeof(long)) 1187 if (sizeof(pgdval_t) > sizeof(long))
1109 ret = PVOP_CALL2(pgdval_t, pv_mmu_ops.make_pgd, 1188 ret = PVOP_CALLEE2(pgdval_t, pv_mmu_ops.make_pgd,
1110 val, (u64)val >> 32); 1189 val, (u64)val >> 32);
1111 else 1190 else
1112 ret = PVOP_CALL1(pgdval_t, pv_mmu_ops.make_pgd, 1191 ret = PVOP_CALLEE1(pgdval_t, pv_mmu_ops.make_pgd,
1113 val); 1192 val);
1114 1193
1115 return (pgd_t) { ret }; 1194 return (pgd_t) { ret };
1116} 1195}
@@ -1120,11 +1199,11 @@ static inline pgdval_t pgd_val(pgd_t pgd)
1120 pgdval_t ret; 1199 pgdval_t ret;
1121 1200
1122 if (sizeof(pgdval_t) > sizeof(long)) 1201 if (sizeof(pgdval_t) > sizeof(long))
1123 ret = PVOP_CALL2(pgdval_t, pv_mmu_ops.pgd_val, 1202 ret = PVOP_CALLEE2(pgdval_t, pv_mmu_ops.pgd_val,
1124 pgd.pgd, (u64)pgd.pgd >> 32); 1203 pgd.pgd, (u64)pgd.pgd >> 32);
1125 else 1204 else
1126 ret = PVOP_CALL1(pgdval_t, pv_mmu_ops.pgd_val, 1205 ret = PVOP_CALLEE1(pgdval_t, pv_mmu_ops.pgd_val,
1127 pgd.pgd); 1206 pgd.pgd);
1128 1207
1129 return ret; 1208 return ret;
1130} 1209}
@@ -1188,11 +1267,11 @@ static inline pmd_t __pmd(pmdval_t val)
1188 pmdval_t ret; 1267 pmdval_t ret;
1189 1268
1190 if (sizeof(pmdval_t) > sizeof(long)) 1269 if (sizeof(pmdval_t) > sizeof(long))
1191 ret = PVOP_CALL2(pmdval_t, pv_mmu_ops.make_pmd, 1270 ret = PVOP_CALLEE2(pmdval_t, pv_mmu_ops.make_pmd,
1192 val, (u64)val >> 32); 1271 val, (u64)val >> 32);
1193 else 1272 else
1194 ret = PVOP_CALL1(pmdval_t, pv_mmu_ops.make_pmd, 1273 ret = PVOP_CALLEE1(pmdval_t, pv_mmu_ops.make_pmd,
1195 val); 1274 val);
1196 1275
1197 return (pmd_t) { ret }; 1276 return (pmd_t) { ret };
1198} 1277}
@@ -1202,11 +1281,11 @@ static inline pmdval_t pmd_val(pmd_t pmd)
1202 pmdval_t ret; 1281 pmdval_t ret;
1203 1282
1204 if (sizeof(pmdval_t) > sizeof(long)) 1283 if (sizeof(pmdval_t) > sizeof(long))
1205 ret = PVOP_CALL2(pmdval_t, pv_mmu_ops.pmd_val, 1284 ret = PVOP_CALLEE2(pmdval_t, pv_mmu_ops.pmd_val,
1206 pmd.pmd, (u64)pmd.pmd >> 32); 1285 pmd.pmd, (u64)pmd.pmd >> 32);
1207 else 1286 else
1208 ret = PVOP_CALL1(pmdval_t, pv_mmu_ops.pmd_val, 1287 ret = PVOP_CALLEE1(pmdval_t, pv_mmu_ops.pmd_val,
1209 pmd.pmd); 1288 pmd.pmd);
1210 1289
1211 return ret; 1290 return ret;
1212} 1291}
@@ -1228,11 +1307,11 @@ static inline pud_t __pud(pudval_t val)
1228 pudval_t ret; 1307 pudval_t ret;
1229 1308
1230 if (sizeof(pudval_t) > sizeof(long)) 1309 if (sizeof(pudval_t) > sizeof(long))
1231 ret = PVOP_CALL2(pudval_t, pv_mmu_ops.make_pud, 1310 ret = PVOP_CALLEE2(pudval_t, pv_mmu_ops.make_pud,
1232 val, (u64)val >> 32); 1311 val, (u64)val >> 32);
1233 else 1312 else
1234 ret = PVOP_CALL1(pudval_t, pv_mmu_ops.make_pud, 1313 ret = PVOP_CALLEE1(pudval_t, pv_mmu_ops.make_pud,
1235 val); 1314 val);
1236 1315
1237 return (pud_t) { ret }; 1316 return (pud_t) { ret };
1238} 1317}
@@ -1242,11 +1321,11 @@ static inline pudval_t pud_val(pud_t pud)
1242 pudval_t ret; 1321 pudval_t ret;
1243 1322
1244 if (sizeof(pudval_t) > sizeof(long)) 1323 if (sizeof(pudval_t) > sizeof(long))
1245 ret = PVOP_CALL2(pudval_t, pv_mmu_ops.pud_val, 1324 ret = PVOP_CALLEE2(pudval_t, pv_mmu_ops.pud_val,
1246 pud.pud, (u64)pud.pud >> 32); 1325 pud.pud, (u64)pud.pud >> 32);
1247 else 1326 else
1248 ret = PVOP_CALL1(pudval_t, pv_mmu_ops.pud_val, 1327 ret = PVOP_CALLEE1(pudval_t, pv_mmu_ops.pud_val,
1249 pud.pud); 1328 pud.pud);
1250 1329
1251 return ret; 1330 return ret;
1252} 1331}
@@ -1374,9 +1453,10 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
1374} 1453}
1375 1454
1376void _paravirt_nop(void); 1455void _paravirt_nop(void);
1377#define paravirt_nop ((void *)_paravirt_nop) 1456u32 _paravirt_ident_32(u32);
1457u64 _paravirt_ident_64(u64);
1378 1458
1379void paravirt_use_bytelocks(void); 1459#define paravirt_nop ((void *)_paravirt_nop)
1380 1460
1381#ifdef CONFIG_SMP 1461#ifdef CONFIG_SMP
1382 1462
@@ -1426,12 +1506,37 @@ extern struct paravirt_patch_site __parainstructions[],
1426 __parainstructions_end[]; 1506 __parainstructions_end[];
1427 1507
1428#ifdef CONFIG_X86_32 1508#ifdef CONFIG_X86_32
1429#define PV_SAVE_REGS "pushl %%ecx; pushl %%edx;" 1509#define PV_SAVE_REGS "pushl %ecx; pushl %edx;"
1430#define PV_RESTORE_REGS "popl %%edx; popl %%ecx" 1510#define PV_RESTORE_REGS "popl %edx; popl %ecx;"
1511
1512/* save and restore all caller-save registers, except return value */
1513#define PV_SAVE_ALL_CALLER_REGS "pushl %ecx;"
1514#define PV_RESTORE_ALL_CALLER_REGS "popl %ecx;"
1515
1431#define PV_FLAGS_ARG "0" 1516#define PV_FLAGS_ARG "0"
1432#define PV_EXTRA_CLOBBERS 1517#define PV_EXTRA_CLOBBERS
1433#define PV_VEXTRA_CLOBBERS 1518#define PV_VEXTRA_CLOBBERS
1434#else 1519#else
1520/* save and restore all caller-save registers, except return value */
1521#define PV_SAVE_ALL_CALLER_REGS \
1522 "push %rcx;" \
1523 "push %rdx;" \
1524 "push %rsi;" \
1525 "push %rdi;" \
1526 "push %r8;" \
1527 "push %r9;" \
1528 "push %r10;" \
1529 "push %r11;"
1530#define PV_RESTORE_ALL_CALLER_REGS \
1531 "pop %r11;" \
1532 "pop %r10;" \
1533 "pop %r9;" \
1534 "pop %r8;" \
1535 "pop %rdi;" \
1536 "pop %rsi;" \
1537 "pop %rdx;" \
1538 "pop %rcx;"
1539
1435/* We save some registers, but all of them, that's too much. We clobber all 1540/* We save some registers, but all of them, that's too much. We clobber all
1436 * caller saved registers but the argument parameter */ 1541 * caller saved registers but the argument parameter */
1437#define PV_SAVE_REGS "pushq %%rdi;" 1542#define PV_SAVE_REGS "pushq %%rdi;"
@@ -1441,52 +1546,76 @@ extern struct paravirt_patch_site __parainstructions[],
1441#define PV_FLAGS_ARG "D" 1546#define PV_FLAGS_ARG "D"
1442#endif 1547#endif
1443 1548
1549/*
1550 * Generate a thunk around a function which saves all caller-save
1551 * registers except for the return value. This allows C functions to
1552 * be called from assembler code where fewer than normal registers are
1553 * available. It may also help code generation around calls from C
1554 * code if the common case doesn't use many registers.
1555 *
1556 * When a callee is wrapped in a thunk, the caller can assume that all
1557 * arg regs and all scratch registers are preserved across the
1558 * call. The return value in rax/eax will not be saved, even for void
1559 * functions.
1560 */
1561#define PV_CALLEE_SAVE_REGS_THUNK(func) \
1562 extern typeof(func) __raw_callee_save_##func; \
1563 static void *__##func##__ __used = func; \
1564 \
1565 asm(".pushsection .text;" \
1566 "__raw_callee_save_" #func ": " \
1567 PV_SAVE_ALL_CALLER_REGS \
1568 "call " #func ";" \
1569 PV_RESTORE_ALL_CALLER_REGS \
1570 "ret;" \
1571 ".popsection")
1572
1573/* Get a reference to a callee-save function */
1574#define PV_CALLEE_SAVE(func) \
1575 ((struct paravirt_callee_save) { __raw_callee_save_##func })
1576
1577/* Promise that "func" already uses the right calling convention */
1578#define __PV_IS_CALLEE_SAVE(func) \
1579 ((struct paravirt_callee_save) { func })
1580
1444static inline unsigned long __raw_local_save_flags(void) 1581static inline unsigned long __raw_local_save_flags(void)
1445{ 1582{
1446 unsigned long f; 1583 unsigned long f;
1447 1584
1448 asm volatile(paravirt_alt(PV_SAVE_REGS 1585 asm volatile(paravirt_alt(PARAVIRT_CALL)
1449 PARAVIRT_CALL
1450 PV_RESTORE_REGS)
1451 : "=a"(f) 1586 : "=a"(f)
1452 : paravirt_type(pv_irq_ops.save_fl), 1587 : paravirt_type(pv_irq_ops.save_fl),
1453 paravirt_clobber(CLBR_EAX) 1588 paravirt_clobber(CLBR_EAX)
1454 : "memory", "cc" PV_VEXTRA_CLOBBERS); 1589 : "memory", "cc");
1455 return f; 1590 return f;
1456} 1591}
1457 1592
1458static inline void raw_local_irq_restore(unsigned long f) 1593static inline void raw_local_irq_restore(unsigned long f)
1459{ 1594{
1460 asm volatile(paravirt_alt(PV_SAVE_REGS 1595 asm volatile(paravirt_alt(PARAVIRT_CALL)
1461 PARAVIRT_CALL
1462 PV_RESTORE_REGS)
1463 : "=a"(f) 1596 : "=a"(f)
1464 : PV_FLAGS_ARG(f), 1597 : PV_FLAGS_ARG(f),
1465 paravirt_type(pv_irq_ops.restore_fl), 1598 paravirt_type(pv_irq_ops.restore_fl),
1466 paravirt_clobber(CLBR_EAX) 1599 paravirt_clobber(CLBR_EAX)
1467 : "memory", "cc" PV_EXTRA_CLOBBERS); 1600 : "memory", "cc");
1468} 1601}
1469 1602
1470static inline void raw_local_irq_disable(void) 1603static inline void raw_local_irq_disable(void)
1471{ 1604{
1472 asm volatile(paravirt_alt(PV_SAVE_REGS 1605 asm volatile(paravirt_alt(PARAVIRT_CALL)
1473 PARAVIRT_CALL
1474 PV_RESTORE_REGS)
1475 : 1606 :
1476 : paravirt_type(pv_irq_ops.irq_disable), 1607 : paravirt_type(pv_irq_ops.irq_disable),
1477 paravirt_clobber(CLBR_EAX) 1608 paravirt_clobber(CLBR_EAX)
1478 : "memory", "eax", "cc" PV_EXTRA_CLOBBERS); 1609 : "memory", "eax", "cc");
1479} 1610}
1480 1611
1481static inline void raw_local_irq_enable(void) 1612static inline void raw_local_irq_enable(void)
1482{ 1613{
1483 asm volatile(paravirt_alt(PV_SAVE_REGS 1614 asm volatile(paravirt_alt(PARAVIRT_CALL)
1484 PARAVIRT_CALL
1485 PV_RESTORE_REGS)
1486 : 1615 :
1487 : paravirt_type(pv_irq_ops.irq_enable), 1616 : paravirt_type(pv_irq_ops.irq_enable),
1488 paravirt_clobber(CLBR_EAX) 1617 paravirt_clobber(CLBR_EAX)
1489 : "memory", "eax", "cc" PV_EXTRA_CLOBBERS); 1618 : "memory", "eax", "cc");
1490} 1619}
1491 1620
1492static inline unsigned long __raw_local_irq_save(void) 1621static inline unsigned long __raw_local_irq_save(void)
@@ -1529,33 +1658,49 @@ static inline unsigned long __raw_local_irq_save(void)
1529 .popsection 1658 .popsection
1530 1659
1531 1660
1661#define COND_PUSH(set, mask, reg) \
1662 .if ((~(set)) & mask); push %reg; .endif
1663#define COND_POP(set, mask, reg) \
1664 .if ((~(set)) & mask); pop %reg; .endif
1665
1532#ifdef CONFIG_X86_64 1666#ifdef CONFIG_X86_64
1533#define PV_SAVE_REGS \ 1667
1534 push %rax; \ 1668#define PV_SAVE_REGS(set) \
1535 push %rcx; \ 1669 COND_PUSH(set, CLBR_RAX, rax); \
1536 push %rdx; \ 1670 COND_PUSH(set, CLBR_RCX, rcx); \
1537 push %rsi; \ 1671 COND_PUSH(set, CLBR_RDX, rdx); \
1538 push %rdi; \ 1672 COND_PUSH(set, CLBR_RSI, rsi); \
1539 push %r8; \ 1673 COND_PUSH(set, CLBR_RDI, rdi); \
1540 push %r9; \ 1674 COND_PUSH(set, CLBR_R8, r8); \
1541 push %r10; \ 1675 COND_PUSH(set, CLBR_R9, r9); \
1542 push %r11 1676 COND_PUSH(set, CLBR_R10, r10); \
1543#define PV_RESTORE_REGS \ 1677 COND_PUSH(set, CLBR_R11, r11)
1544 pop %r11; \ 1678#define PV_RESTORE_REGS(set) \
1545 pop %r10; \ 1679 COND_POP(set, CLBR_R11, r11); \
1546 pop %r9; \ 1680 COND_POP(set, CLBR_R10, r10); \
1547 pop %r8; \ 1681 COND_POP(set, CLBR_R9, r9); \
1548 pop %rdi; \ 1682 COND_POP(set, CLBR_R8, r8); \
1549 pop %rsi; \ 1683 COND_POP(set, CLBR_RDI, rdi); \
1550 pop %rdx; \ 1684 COND_POP(set, CLBR_RSI, rsi); \
1551 pop %rcx; \ 1685 COND_POP(set, CLBR_RDX, rdx); \
1552 pop %rax 1686 COND_POP(set, CLBR_RCX, rcx); \
1687 COND_POP(set, CLBR_RAX, rax)
1688
1553#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 8) 1689#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 8)
1554#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .quad, 8) 1690#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .quad, 8)
1555#define PARA_INDIRECT(addr) *addr(%rip) 1691#define PARA_INDIRECT(addr) *addr(%rip)
1556#else 1692#else
1557#define PV_SAVE_REGS pushl %eax; pushl %edi; pushl %ecx; pushl %edx 1693#define PV_SAVE_REGS(set) \
1558#define PV_RESTORE_REGS popl %edx; popl %ecx; popl %edi; popl %eax 1694 COND_PUSH(set, CLBR_EAX, eax); \
1695 COND_PUSH(set, CLBR_EDI, edi); \
1696 COND_PUSH(set, CLBR_ECX, ecx); \
1697 COND_PUSH(set, CLBR_EDX, edx)
1698#define PV_RESTORE_REGS(set) \
1699 COND_POP(set, CLBR_EDX, edx); \
1700 COND_POP(set, CLBR_ECX, ecx); \
1701 COND_POP(set, CLBR_EDI, edi); \
1702 COND_POP(set, CLBR_EAX, eax)
1703
1559#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 4) 1704#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 4)
1560#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .long, 4) 1705#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .long, 4)
1561#define PARA_INDIRECT(addr) *%cs:addr 1706#define PARA_INDIRECT(addr) *%cs:addr
@@ -1567,15 +1712,15 @@ static inline unsigned long __raw_local_irq_save(void)
1567 1712
1568#define DISABLE_INTERRUPTS(clobbers) \ 1713#define DISABLE_INTERRUPTS(clobbers) \
1569 PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \ 1714 PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \
1570 PV_SAVE_REGS; \ 1715 PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
1571 call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_disable); \ 1716 call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_disable); \
1572 PV_RESTORE_REGS;) \ 1717 PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
1573 1718
1574#define ENABLE_INTERRUPTS(clobbers) \ 1719#define ENABLE_INTERRUPTS(clobbers) \
1575 PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers, \ 1720 PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers, \
1576 PV_SAVE_REGS; \ 1721 PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
1577 call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable); \ 1722 call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable); \
1578 PV_RESTORE_REGS;) 1723 PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
1579 1724
1580#define USERGS_SYSRET32 \ 1725#define USERGS_SYSRET32 \
1581 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret32), \ 1726 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret32), \
@@ -1605,11 +1750,15 @@ static inline unsigned long __raw_local_irq_save(void)
1605 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \ 1750 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \
1606 swapgs) 1751 swapgs)
1607 1752
1753/*
1754 * Note: swapgs is very special, and in practise is either going to be
1755 * implemented with a single "swapgs" instruction or something very
1756 * special. Either way, we don't need to save any registers for
1757 * it.
1758 */
1608#define SWAPGS \ 1759#define SWAPGS \
1609 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \ 1760 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \
1610 PV_SAVE_REGS; \ 1761 call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs) \
1611 call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs); \
1612 PV_RESTORE_REGS \
1613 ) 1762 )
1614 1763
1615#define GET_CR2_INTO_RCX \ 1764#define GET_CR2_INTO_RCX \
diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h
index b8493b3b9890..2cd07b9422f4 100644
--- a/arch/x86/include/asm/pat.h
+++ b/arch/x86/include/asm/pat.h
@@ -2,13 +2,12 @@
2#define _ASM_X86_PAT_H 2#define _ASM_X86_PAT_H
3 3
4#include <linux/types.h> 4#include <linux/types.h>
5#include <asm/pgtable_types.h>
5 6
6#ifdef CONFIG_X86_PAT 7#ifdef CONFIG_X86_PAT
7extern int pat_enabled; 8extern int pat_enabled;
8extern void validate_pat_support(struct cpuinfo_x86 *c);
9#else 9#else
10static const int pat_enabled; 10static const int pat_enabled;
11static inline void validate_pat_support(struct cpuinfo_x86 *c) { }
12#endif 11#endif
13 12
14extern void pat_init(void); 13extern void pat_init(void);
@@ -17,6 +16,11 @@ extern int reserve_memtype(u64 start, u64 end,
17 unsigned long req_type, unsigned long *ret_type); 16 unsigned long req_type, unsigned long *ret_type);
18extern int free_memtype(u64 start, u64 end); 17extern int free_memtype(u64 start, u64 end);
19 18
20extern void pat_disable(char *reason); 19extern int kernel_map_sync_memtype(u64 base, unsigned long size,
20 unsigned long flag);
21extern void map_devmem(unsigned long pfn, unsigned long size,
22 struct pgprot vma_prot);
23extern void unmap_devmem(unsigned long pfn, unsigned long size,
24 struct pgprot vma_prot);
21 25
22#endif /* _ASM_X86_PAT_H */ 26#endif /* _ASM_X86_PAT_H */
diff --git a/arch/x86/include/asm/mach-default/pci-functions.h b/arch/x86/include/asm/pci-functions.h
index ed0bab427354..ed0bab427354 100644
--- a/arch/x86/include/asm/mach-default/pci-functions.h
+++ b/arch/x86/include/asm/pci-functions.h
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
deleted file mode 100644
index 2fbfff88df37..000000000000
--- a/arch/x86/include/asm/pda.h
+++ /dev/null
@@ -1,137 +0,0 @@
1#ifndef _ASM_X86_PDA_H
2#define _ASM_X86_PDA_H
3
4#ifndef __ASSEMBLY__
5#include <linux/stddef.h>
6#include <linux/types.h>
7#include <linux/cache.h>
8#include <asm/page.h>
9
10/* Per processor datastructure. %gs points to it while the kernel runs */
11struct x8664_pda {
12 struct task_struct *pcurrent; /* 0 Current process */
13 unsigned long data_offset; /* 8 Per cpu data offset from linker
14 address */
15 unsigned long kernelstack; /* 16 top of kernel stack for current */
16 unsigned long oldrsp; /* 24 user rsp for system call */
17 int irqcount; /* 32 Irq nesting counter. Starts -1 */
18 unsigned int cpunumber; /* 36 Logical CPU number */
19#ifdef CONFIG_CC_STACKPROTECTOR
20 unsigned long stack_canary; /* 40 stack canary value */
21 /* gcc-ABI: this canary MUST be at
22 offset 40!!! */
23#endif
24 char *irqstackptr;
25 short nodenumber; /* number of current node (32k max) */
26 short in_bootmem; /* pda lives in bootmem */
27 unsigned int __softirq_pending;
28 unsigned int __nmi_count; /* number of NMI on this CPUs */
29 short mmu_state;
30 short isidle;
31 struct mm_struct *active_mm;
32 unsigned apic_timer_irqs;
33 unsigned irq0_irqs;
34 unsigned irq_resched_count;
35 unsigned irq_call_count;
36 unsigned irq_tlb_count;
37 unsigned irq_thermal_count;
38 unsigned irq_threshold_count;
39 unsigned irq_spurious_count;
40} ____cacheline_aligned_in_smp;
41
42extern struct x8664_pda **_cpu_pda;
43extern void pda_init(int);
44
45#define cpu_pda(i) (_cpu_pda[i])
46
47/*
48 * There is no fast way to get the base address of the PDA, all the accesses
49 * have to mention %fs/%gs. So it needs to be done this Torvaldian way.
50 */
51extern void __bad_pda_field(void) __attribute__((noreturn));
52
53/*
54 * proxy_pda doesn't actually exist, but tell gcc it is accessed for
55 * all PDA accesses so it gets read/write dependencies right.
56 */
57extern struct x8664_pda _proxy_pda;
58
59#define pda_offset(field) offsetof(struct x8664_pda, field)
60
61#define pda_to_op(op, field, val) \
62do { \
63 typedef typeof(_proxy_pda.field) T__; \
64 if (0) { T__ tmp__; tmp__ = (val); } /* type checking */ \
65 switch (sizeof(_proxy_pda.field)) { \
66 case 2: \
67 asm(op "w %1,%%gs:%c2" : \
68 "+m" (_proxy_pda.field) : \
69 "ri" ((T__)val), \
70 "i"(pda_offset(field))); \
71 break; \
72 case 4: \
73 asm(op "l %1,%%gs:%c2" : \
74 "+m" (_proxy_pda.field) : \
75 "ri" ((T__)val), \
76 "i" (pda_offset(field))); \
77 break; \
78 case 8: \
79 asm(op "q %1,%%gs:%c2": \
80 "+m" (_proxy_pda.field) : \
81 "ri" ((T__)val), \
82 "i"(pda_offset(field))); \
83 break; \
84 default: \
85 __bad_pda_field(); \
86 } \
87} while (0)
88
89#define pda_from_op(op, field) \
90({ \
91 typeof(_proxy_pda.field) ret__; \
92 switch (sizeof(_proxy_pda.field)) { \
93 case 2: \
94 asm(op "w %%gs:%c1,%0" : \
95 "=r" (ret__) : \
96 "i" (pda_offset(field)), \
97 "m" (_proxy_pda.field)); \
98 break; \
99 case 4: \
100 asm(op "l %%gs:%c1,%0": \
101 "=r" (ret__): \
102 "i" (pda_offset(field)), \
103 "m" (_proxy_pda.field)); \
104 break; \
105 case 8: \
106 asm(op "q %%gs:%c1,%0": \
107 "=r" (ret__) : \
108 "i" (pda_offset(field)), \
109 "m" (_proxy_pda.field)); \
110 break; \
111 default: \
112 __bad_pda_field(); \
113 } \
114 ret__; \
115})
116
117#define read_pda(field) pda_from_op("mov", field)
118#define write_pda(field, val) pda_to_op("mov", field, val)
119#define add_pda(field, val) pda_to_op("add", field, val)
120#define sub_pda(field, val) pda_to_op("sub", field, val)
121#define or_pda(field, val) pda_to_op("or", field, val)
122
123/* This is not atomic against other CPUs -- CPU preemption needs to be off */
124#define test_and_clear_bit_pda(bit, field) \
125({ \
126 int old__; \
127 asm volatile("btr %2,%%gs:%c3\n\tsbbl %0,%0" \
128 : "=r" (old__), "+m" (_proxy_pda.field) \
129 : "dIr" (bit), "i" (pda_offset(field)) : "memory");\
130 old__; \
131})
132
133#endif
134
135#define PDA_STACKOFFSET (5*8)
136
137#endif /* _ASM_X86_PDA_H */
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index ece72053ba63..8f1d2fbec1d4 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -2,53 +2,12 @@
2#define _ASM_X86_PERCPU_H 2#define _ASM_X86_PERCPU_H
3 3
4#ifdef CONFIG_X86_64 4#ifdef CONFIG_X86_64
5#include <linux/compiler.h> 5#define __percpu_seg gs
6 6#define __percpu_mov_op movq
7/* Same as asm-generic/percpu.h, except that we store the per cpu offset 7#else
8 in the PDA. Longer term the PDA and every per cpu variable 8#define __percpu_seg fs
9 should be just put into a single section and referenced directly 9#define __percpu_mov_op movl
10 from %gs */
11
12#ifdef CONFIG_SMP
13#include <asm/pda.h>
14
15#define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset)
16#define __my_cpu_offset read_pda(data_offset)
17
18#define per_cpu_offset(x) (__per_cpu_offset(x))
19
20#endif 10#endif
21#include <asm-generic/percpu.h>
22
23DECLARE_PER_CPU(struct x8664_pda, pda);
24
25/*
26 * These are supposed to be implemented as a single instruction which
27 * operates on the per-cpu data base segment. x86-64 doesn't have
28 * that yet, so this is a fairly inefficient workaround for the
29 * meantime. The single instruction is atomic with respect to
30 * preemption and interrupts, so we need to explicitly disable
31 * interrupts here to achieve the same effect. However, because it
32 * can be used from within interrupt-disable/enable, we can't actually
33 * disable interrupts; disabling preemption is enough.
34 */
35#define x86_read_percpu(var) \
36 ({ \
37 typeof(per_cpu_var(var)) __tmp; \
38 preempt_disable(); \
39 __tmp = __get_cpu_var(var); \
40 preempt_enable(); \
41 __tmp; \
42 })
43
44#define x86_write_percpu(var, val) \
45 do { \
46 preempt_disable(); \
47 __get_cpu_var(var) = (val); \
48 preempt_enable(); \
49 } while(0)
50
51#else /* CONFIG_X86_64 */
52 11
53#ifdef __ASSEMBLY__ 12#ifdef __ASSEMBLY__
54 13
@@ -65,47 +24,56 @@ DECLARE_PER_CPU(struct x8664_pda, pda);
65 * PER_CPU(cpu_gdt_descr, %ebx) 24 * PER_CPU(cpu_gdt_descr, %ebx)
66 */ 25 */
67#ifdef CONFIG_SMP 26#ifdef CONFIG_SMP
68#define PER_CPU(var, reg) \ 27#define PER_CPU(var, reg) \
69 movl %fs:per_cpu__##this_cpu_off, reg; \ 28 __percpu_mov_op %__percpu_seg:per_cpu__this_cpu_off, reg; \
70 lea per_cpu__##var(reg), reg 29 lea per_cpu__##var(reg), reg
71#define PER_CPU_VAR(var) %fs:per_cpu__##var 30#define PER_CPU_VAR(var) %__percpu_seg:per_cpu__##var
72#else /* ! SMP */ 31#else /* ! SMP */
73#define PER_CPU(var, reg) \ 32#define PER_CPU(var, reg) \
74 movl $per_cpu__##var, reg 33 __percpu_mov_op $per_cpu__##var, reg
75#define PER_CPU_VAR(var) per_cpu__##var 34#define PER_CPU_VAR(var) per_cpu__##var
76#endif /* SMP */ 35#endif /* SMP */
77 36
78#else /* ...!ASSEMBLY */ 37#ifdef CONFIG_X86_64_SMP
79 38#define INIT_PER_CPU_VAR(var) init_per_cpu__##var
80/* 39#else
81 * PER_CPU finds an address of a per-cpu variable. 40#define INIT_PER_CPU_VAR(var) per_cpu__##var
82 * 41#endif
83 * Args:
84 * var - variable name
85 * cpu - 32bit register containing the current CPU number
86 *
87 * The resulting address is stored in the "cpu" argument.
88 *
89 * Example:
90 * PER_CPU(cpu_gdt_descr, %ebx)
91 */
92#ifdef CONFIG_SMP
93
94#define __my_cpu_offset x86_read_percpu(this_cpu_off)
95 42
96/* fs segment starts at (positive) offset == __per_cpu_offset[cpu] */ 43#else /* ...!ASSEMBLY */
97#define __percpu_seg "%%fs:"
98 44
99#else /* !SMP */ 45#include <linux/stringify.h>
46#include <asm/sections.h>
100 47
101#define __percpu_seg "" 48#define __addr_to_pcpu_ptr(addr) \
49 (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \
50 + (unsigned long)__per_cpu_start)
51#define __pcpu_ptr_to_addr(ptr) \
52 (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \
53 - (unsigned long)__per_cpu_start)
102 54
103#endif /* SMP */ 55#ifdef CONFIG_SMP
56#define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x
57#define __my_cpu_offset percpu_read(this_cpu_off)
58#else
59#define __percpu_arg(x) "%" #x
60#endif
104 61
105#include <asm-generic/percpu.h> 62/*
63 * Initialized pointers to per-cpu variables needed for the boot
64 * processor need to use these macros to get the proper address
65 * offset from __per_cpu_load on SMP.
66 *
67 * There also must be an entry in vmlinux_64.lds.S
68 */
69#define DECLARE_INIT_PER_CPU(var) \
70 extern typeof(per_cpu_var(var)) init_per_cpu_var(var)
106 71
107/* We can use this directly for local CPU (faster). */ 72#ifdef CONFIG_X86_64_SMP
108DECLARE_PER_CPU(unsigned long, this_cpu_off); 73#define init_per_cpu_var(var) init_per_cpu__##var
74#else
75#define init_per_cpu_var(var) per_cpu_var(var)
76#endif
109 77
110/* For arch-specific code, we can use direct single-insn ops (they 78/* For arch-specific code, we can use direct single-insn ops (they
111 * don't give an lvalue though). */ 79 * don't give an lvalue though). */
@@ -120,20 +88,25 @@ do { \
120 } \ 88 } \
121 switch (sizeof(var)) { \ 89 switch (sizeof(var)) { \
122 case 1: \ 90 case 1: \
123 asm(op "b %1,"__percpu_seg"%0" \ 91 asm(op "b %1,"__percpu_arg(0) \
124 : "+m" (var) \ 92 : "+m" (var) \
125 : "ri" ((T__)val)); \ 93 : "ri" ((T__)val)); \
126 break; \ 94 break; \
127 case 2: \ 95 case 2: \
128 asm(op "w %1,"__percpu_seg"%0" \ 96 asm(op "w %1,"__percpu_arg(0) \
129 : "+m" (var) \ 97 : "+m" (var) \
130 : "ri" ((T__)val)); \ 98 : "ri" ((T__)val)); \
131 break; \ 99 break; \
132 case 4: \ 100 case 4: \
133 asm(op "l %1,"__percpu_seg"%0" \ 101 asm(op "l %1,"__percpu_arg(0) \
134 : "+m" (var) \ 102 : "+m" (var) \
135 : "ri" ((T__)val)); \ 103 : "ri" ((T__)val)); \
136 break; \ 104 break; \
105 case 8: \
106 asm(op "q %1,"__percpu_arg(0) \
107 : "+m" (var) \
108 : "re" ((T__)val)); \
109 break; \
137 default: __bad_percpu_size(); \ 110 default: __bad_percpu_size(); \
138 } \ 111 } \
139} while (0) 112} while (0)
@@ -143,17 +116,22 @@ do { \
143 typeof(var) ret__; \ 116 typeof(var) ret__; \
144 switch (sizeof(var)) { \ 117 switch (sizeof(var)) { \
145 case 1: \ 118 case 1: \
146 asm(op "b "__percpu_seg"%1,%0" \ 119 asm(op "b "__percpu_arg(1)",%0" \
147 : "=r" (ret__) \ 120 : "=r" (ret__) \
148 : "m" (var)); \ 121 : "m" (var)); \
149 break; \ 122 break; \
150 case 2: \ 123 case 2: \
151 asm(op "w "__percpu_seg"%1,%0" \ 124 asm(op "w "__percpu_arg(1)",%0" \
152 : "=r" (ret__) \ 125 : "=r" (ret__) \
153 : "m" (var)); \ 126 : "m" (var)); \
154 break; \ 127 break; \
155 case 4: \ 128 case 4: \
156 asm(op "l "__percpu_seg"%1,%0" \ 129 asm(op "l "__percpu_arg(1)",%0" \
130 : "=r" (ret__) \
131 : "m" (var)); \
132 break; \
133 case 8: \
134 asm(op "q "__percpu_arg(1)",%0" \
157 : "=r" (ret__) \ 135 : "=r" (ret__) \
158 : "m" (var)); \ 136 : "m" (var)); \
159 break; \ 137 break; \
@@ -162,13 +140,30 @@ do { \
162 ret__; \ 140 ret__; \
163}) 141})
164 142
165#define x86_read_percpu(var) percpu_from_op("mov", per_cpu__##var) 143#define percpu_read(var) percpu_from_op("mov", per_cpu__##var)
166#define x86_write_percpu(var, val) percpu_to_op("mov", per_cpu__##var, val) 144#define percpu_write(var, val) percpu_to_op("mov", per_cpu__##var, val)
167#define x86_add_percpu(var, val) percpu_to_op("add", per_cpu__##var, val) 145#define percpu_add(var, val) percpu_to_op("add", per_cpu__##var, val)
168#define x86_sub_percpu(var, val) percpu_to_op("sub", per_cpu__##var, val) 146#define percpu_sub(var, val) percpu_to_op("sub", per_cpu__##var, val)
169#define x86_or_percpu(var, val) percpu_to_op("or", per_cpu__##var, val) 147#define percpu_and(var, val) percpu_to_op("and", per_cpu__##var, val)
148#define percpu_or(var, val) percpu_to_op("or", per_cpu__##var, val)
149#define percpu_xor(var, val) percpu_to_op("xor", per_cpu__##var, val)
150
151/* This is not atomic against other CPUs -- CPU preemption needs to be off */
152#define x86_test_and_clear_bit_percpu(bit, var) \
153({ \
154 int old__; \
155 asm volatile("btr %2,"__percpu_arg(1)"\n\tsbbl %0,%0" \
156 : "=r" (old__), "+m" (per_cpu__##var) \
157 : "dIr" (bit)); \
158 old__; \
159})
160
161#include <asm-generic/percpu.h>
162
163/* We can use this directly for local CPU (faster). */
164DECLARE_PER_CPU(unsigned long, this_cpu_off);
165
170#endif /* !__ASSEMBLY__ */ 166#endif /* !__ASSEMBLY__ */
171#endif /* !CONFIG_X86_64 */
172 167
173#ifdef CONFIG_SMP 168#ifdef CONFIG_SMP
174 169
@@ -195,9 +190,9 @@ do { \
195#define early_per_cpu_ptr(_name) (_name##_early_ptr) 190#define early_per_cpu_ptr(_name) (_name##_early_ptr)
196#define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx]) 191#define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx])
197#define early_per_cpu(_name, _cpu) \ 192#define early_per_cpu(_name, _cpu) \
198 (early_per_cpu_ptr(_name) ? \ 193 *(early_per_cpu_ptr(_name) ? \
199 early_per_cpu_ptr(_name)[_cpu] : \ 194 &early_per_cpu_ptr(_name)[_cpu] : \
200 per_cpu(_name, _cpu)) 195 &per_cpu(_name, _cpu))
201 196
202#else /* !CONFIG_SMP */ 197#else /* !CONFIG_SMP */
203#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \ 198#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index e0d199fe1d83..c1774ac9da7a 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -53,8 +53,6 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
53#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) 53#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
54#endif 54#endif
55 55
56#define pte_none(x) (!(x).pte_low)
57
58/* 56/*
59 * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, 57 * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken,
60 * split up the 29 bits of offset into this range: 58 * split up the 29 bits of offset into this range:
diff --git a/arch/x86/include/asm/pgtable-2level-defs.h b/arch/x86/include/asm/pgtable-2level_types.h
index d77db8990eaa..daacc23e3fb9 100644
--- a/arch/x86/include/asm/pgtable-2level-defs.h
+++ b/arch/x86/include/asm/pgtable-2level_types.h
@@ -1,7 +1,23 @@
1#ifndef _ASM_X86_PGTABLE_2LEVEL_DEFS_H 1#ifndef _ASM_X86_PGTABLE_2LEVEL_DEFS_H
2#define _ASM_X86_PGTABLE_2LEVEL_DEFS_H 2#define _ASM_X86_PGTABLE_2LEVEL_DEFS_H
3 3
4#ifndef __ASSEMBLY__
5#include <linux/types.h>
6
7typedef unsigned long pteval_t;
8typedef unsigned long pmdval_t;
9typedef unsigned long pudval_t;
10typedef unsigned long pgdval_t;
11typedef unsigned long pgprotval_t;
12
13typedef union {
14 pteval_t pte;
15 pteval_t pte_low;
16} pte_t;
17#endif /* !__ASSEMBLY__ */
18
4#define SHARED_KERNEL_PMD 0 19#define SHARED_KERNEL_PMD 0
20#define PAGETABLE_LEVELS 2
5 21
6/* 22/*
7 * traditional i386 two-level paging structure: 23 * traditional i386 two-level paging structure:
@@ -10,6 +26,7 @@
10#define PGDIR_SHIFT 22 26#define PGDIR_SHIFT 22
11#define PTRS_PER_PGD 1024 27#define PTRS_PER_PGD 1024
12 28
29
13/* 30/*
14 * the i386 is two-level, so we don't really have any 31 * the i386 is two-level, so we don't really have any
15 * PMD directory physically. 32 * PMD directory physically.
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 447da43cddb3..3f13cdf61156 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -18,21 +18,6 @@
18 printk("%s:%d: bad pgd %p(%016Lx).\n", \ 18 printk("%s:%d: bad pgd %p(%016Lx).\n", \
19 __FILE__, __LINE__, &(e), pgd_val(e)) 19 __FILE__, __LINE__, &(e), pgd_val(e))
20 20
21static inline int pud_none(pud_t pud)
22{
23 return pud_val(pud) == 0;
24}
25
26static inline int pud_bad(pud_t pud)
27{
28 return (pud_val(pud) & ~(PTE_PFN_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
29}
30
31static inline int pud_present(pud_t pud)
32{
33 return pud_val(pud) & _PAGE_PRESENT;
34}
35
36/* Rules for using set_pte: the pte being assigned *must* be 21/* Rules for using set_pte: the pte being assigned *must* be
37 * either not present or in a state where the hardware will 22 * either not present or in a state where the hardware will
38 * not attempt to update the pte. In places where this is 23 * not attempt to update the pte. In places where this is
@@ -120,15 +105,6 @@ static inline void pud_clear(pud_t *pudp)
120 write_cr3(pgd); 105 write_cr3(pgd);
121} 106}
122 107
123#define pud_page(pud) pfn_to_page(pud_val(pud) >> PAGE_SHIFT)
124
125#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_PFN_MASK))
126
127
128/* Find an entry in the second-level page table.. */
129#define pmd_offset(pud, address) ((pmd_t *)pud_page_vaddr(*(pud)) + \
130 pmd_index(address))
131
132#ifdef CONFIG_SMP 108#ifdef CONFIG_SMP
133static inline pte_t native_ptep_get_and_clear(pte_t *ptep) 109static inline pte_t native_ptep_get_and_clear(pte_t *ptep)
134{ 110{
@@ -145,17 +121,6 @@ static inline pte_t native_ptep_get_and_clear(pte_t *ptep)
145#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) 121#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
146#endif 122#endif
147 123
148#define __HAVE_ARCH_PTE_SAME
149static inline int pte_same(pte_t a, pte_t b)
150{
151 return a.pte_low == b.pte_low && a.pte_high == b.pte_high;
152}
153
154static inline int pte_none(pte_t pte)
155{
156 return !pte.pte_low && !pte.pte_high;
157}
158
159/* 124/*
160 * Bits 0, 6 and 7 are taken in the low part of the pte, 125 * Bits 0, 6 and 7 are taken in the low part of the pte,
161 * put the 32 bits of offset into the high part. 126 * put the 32 bits of offset into the high part.
diff --git a/arch/x86/include/asm/pgtable-3level-defs.h b/arch/x86/include/asm/pgtable-3level_types.h
index 62561367653c..1bd5876c8649 100644
--- a/arch/x86/include/asm/pgtable-3level-defs.h
+++ b/arch/x86/include/asm/pgtable-3level_types.h
@@ -1,12 +1,31 @@
1#ifndef _ASM_X86_PGTABLE_3LEVEL_DEFS_H 1#ifndef _ASM_X86_PGTABLE_3LEVEL_DEFS_H
2#define _ASM_X86_PGTABLE_3LEVEL_DEFS_H 2#define _ASM_X86_PGTABLE_3LEVEL_DEFS_H
3 3
4#ifndef __ASSEMBLY__
5#include <linux/types.h>
6
7typedef u64 pteval_t;
8typedef u64 pmdval_t;
9typedef u64 pudval_t;
10typedef u64 pgdval_t;
11typedef u64 pgprotval_t;
12
13typedef union {
14 struct {
15 unsigned long pte_low, pte_high;
16 };
17 pteval_t pte;
18} pte_t;
19#endif /* !__ASSEMBLY__ */
20
4#ifdef CONFIG_PARAVIRT 21#ifdef CONFIG_PARAVIRT
5#define SHARED_KERNEL_PMD (pv_info.shared_kernel_pmd) 22#define SHARED_KERNEL_PMD (pv_info.shared_kernel_pmd)
6#else 23#else
7#define SHARED_KERNEL_PMD 1 24#define SHARED_KERNEL_PMD 1
8#endif 25#endif
9 26
27#define PAGETABLE_LEVELS 3
28
10/* 29/*
11 * PGDIR_SHIFT determines what a top-level page table entry can map 30 * PGDIR_SHIFT determines what a top-level page table entry can map
12 */ 31 */
@@ -25,4 +44,5 @@
25 */ 44 */
26#define PTRS_PER_PTE 512 45#define PTRS_PER_PTE 512
27 46
47
28#endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */ 48#endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 4f5af8447d54..d0812e155f1d 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1,164 +1,9 @@
1#ifndef _ASM_X86_PGTABLE_H 1#ifndef _ASM_X86_PGTABLE_H
2#define _ASM_X86_PGTABLE_H 2#define _ASM_X86_PGTABLE_H
3 3
4#define FIRST_USER_ADDRESS 0 4#include <asm/page.h>
5
6#define _PAGE_BIT_PRESENT 0 /* is present */
7#define _PAGE_BIT_RW 1 /* writeable */
8#define _PAGE_BIT_USER 2 /* userspace addressable */
9#define _PAGE_BIT_PWT 3 /* page write through */
10#define _PAGE_BIT_PCD 4 /* page cache disabled */
11#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */
12#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */
13#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
14#define _PAGE_BIT_PAT 7 /* on 4KB pages */
15#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
16#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
17#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */
18#define _PAGE_BIT_UNUSED3 11
19#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
20#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
21#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
22#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
23
24/* If _PAGE_BIT_PRESENT is clear, we use these: */
25/* - if the user mapped it with PROT_NONE; pte_present gives true */
26#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
27/* - set: nonlinear file mapping, saved PTE; unset:swap */
28#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY
29
30#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
31#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW)
32#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER)
33#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
34#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
35#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
36#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
37#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
38#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
39#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
40#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
41#define _PAGE_UNUSED3 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED3)
42#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
43#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
44#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
45#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
46#define __HAVE_ARCH_PTE_SPECIAL
47
48#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
49#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
50#else
51#define _PAGE_NX (_AT(pteval_t, 0))
52#endif
53 5
54#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) 6#include <asm/pgtable_types.h>
55#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
56
57#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
58 _PAGE_ACCESSED | _PAGE_DIRTY)
59#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
60 _PAGE_DIRTY)
61
62/* Set of bits not changed in pte_modify */
63#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
64 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
65
66#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT)
67#define _PAGE_CACHE_WB (0)
68#define _PAGE_CACHE_WC (_PAGE_PWT)
69#define _PAGE_CACHE_UC_MINUS (_PAGE_PCD)
70#define _PAGE_CACHE_UC (_PAGE_PCD | _PAGE_PWT)
71
72#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
73#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
74 _PAGE_ACCESSED | _PAGE_NX)
75
76#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \
77 _PAGE_USER | _PAGE_ACCESSED)
78#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
79 _PAGE_ACCESSED | _PAGE_NX)
80#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
81 _PAGE_ACCESSED)
82#define PAGE_COPY PAGE_COPY_NOEXEC
83#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \
84 _PAGE_ACCESSED | _PAGE_NX)
85#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
86 _PAGE_ACCESSED)
87
88#define __PAGE_KERNEL_EXEC \
89 (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_GLOBAL)
90#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
91
92#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
93#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
94#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT)
95#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC)
96#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
97#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
98#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
99#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
100#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
101#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
102#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
103
104#define __PAGE_KERNEL_IO (__PAGE_KERNEL | _PAGE_IOMAP)
105#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE | _PAGE_IOMAP)
106#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS | _PAGE_IOMAP)
107#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC | _PAGE_IOMAP)
108
109#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
110#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
111#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
112#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
113#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC)
114#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
115#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS)
116#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE)
117#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
118#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE)
119#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
120#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
121#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE)
122
123#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO)
124#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE)
125#define PAGE_KERNEL_IO_UC_MINUS __pgprot(__PAGE_KERNEL_IO_UC_MINUS)
126#define PAGE_KERNEL_IO_WC __pgprot(__PAGE_KERNEL_IO_WC)
127
128/* xwr */
129#define __P000 PAGE_NONE
130#define __P001 PAGE_READONLY
131#define __P010 PAGE_COPY
132#define __P011 PAGE_COPY
133#define __P100 PAGE_READONLY_EXEC
134#define __P101 PAGE_READONLY_EXEC
135#define __P110 PAGE_COPY_EXEC
136#define __P111 PAGE_COPY_EXEC
137
138#define __S000 PAGE_NONE
139#define __S001 PAGE_READONLY
140#define __S010 PAGE_SHARED
141#define __S011 PAGE_SHARED
142#define __S100 PAGE_READONLY_EXEC
143#define __S101 PAGE_READONLY_EXEC
144#define __S110 PAGE_SHARED_EXEC
145#define __S111 PAGE_SHARED_EXEC
146
147/*
148 * early identity mapping pte attrib macros.
149 */
150#ifdef CONFIG_X86_64
151#define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
152#else
153/*
154 * For PDE_IDENT_ATTR include USER bit. As the PDE and PTE protection
155 * bits are combined, this will alow user to access the high address mapped
156 * VDSO in the presence of CONFIG_COMPAT_VDSO
157 */
158#define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */
159#define PDE_IDENT_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */
160#define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */
161#endif
162 7
163/* 8/*
164 * Macro to mark a page protection value as UC- 9 * Macro to mark a page protection value as UC-
@@ -170,9 +15,6 @@
170 15
171#ifndef __ASSEMBLY__ 16#ifndef __ASSEMBLY__
172 17
173#define pgprot_writecombine pgprot_writecombine
174extern pgprot_t pgprot_writecombine(pgprot_t prot);
175
176/* 18/*
177 * ZERO_PAGE is a global shared page that is always zero: used 19 * ZERO_PAGE is a global shared page that is always zero: used
178 * for zero-mapped memory areas etc.. 20 * for zero-mapped memory areas etc..
@@ -183,6 +25,66 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
183extern spinlock_t pgd_lock; 25extern spinlock_t pgd_lock;
184extern struct list_head pgd_list; 26extern struct list_head pgd_list;
185 27
28#ifdef CONFIG_PARAVIRT
29#include <asm/paravirt.h>
30#else /* !CONFIG_PARAVIRT */
31#define set_pte(ptep, pte) native_set_pte(ptep, pte)
32#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte)
33
34#define set_pte_present(mm, addr, ptep, pte) \
35 native_set_pte_present(mm, addr, ptep, pte)
36#define set_pte_atomic(ptep, pte) \
37 native_set_pte_atomic(ptep, pte)
38
39#define set_pmd(pmdp, pmd) native_set_pmd(pmdp, pmd)
40
41#ifndef __PAGETABLE_PUD_FOLDED
42#define set_pgd(pgdp, pgd) native_set_pgd(pgdp, pgd)
43#define pgd_clear(pgd) native_pgd_clear(pgd)
44#endif
45
46#ifndef set_pud
47# define set_pud(pudp, pud) native_set_pud(pudp, pud)
48#endif
49
50#ifndef __PAGETABLE_PMD_FOLDED
51#define pud_clear(pud) native_pud_clear(pud)
52#endif
53
54#define pte_clear(mm, addr, ptep) native_pte_clear(mm, addr, ptep)
55#define pmd_clear(pmd) native_pmd_clear(pmd)
56
57#define pte_update(mm, addr, ptep) do { } while (0)
58#define pte_update_defer(mm, addr, ptep) do { } while (0)
59
60static inline void __init paravirt_pagetable_setup_start(pgd_t *base)
61{
62 native_pagetable_setup_start(base);
63}
64
65static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
66{
67 native_pagetable_setup_done(base);
68}
69
70#define pgd_val(x) native_pgd_val(x)
71#define __pgd(x) native_make_pgd(x)
72
73#ifndef __PAGETABLE_PUD_FOLDED
74#define pud_val(x) native_pud_val(x)
75#define __pud(x) native_make_pud(x)
76#endif
77
78#ifndef __PAGETABLE_PMD_FOLDED
79#define pmd_val(x) native_pmd_val(x)
80#define __pmd(x) native_make_pmd(x)
81#endif
82
83#define pte_val(x) native_pte_val(x)
84#define __pte(x) native_make_pte(x)
85
86#endif /* CONFIG_PARAVIRT */
87
186/* 88/*
187 * The following only work if pte_present() is true. 89 * The following only work if pte_present() is true.
188 * Undefined behaviour if not.. 90 * Undefined behaviour if not..
@@ -236,72 +138,84 @@ static inline unsigned long pte_pfn(pte_t pte)
236 138
237static inline int pmd_large(pmd_t pte) 139static inline int pmd_large(pmd_t pte)
238{ 140{
239 return (pmd_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) == 141 return (pmd_flags(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
240 (_PAGE_PSE | _PAGE_PRESENT); 142 (_PAGE_PSE | _PAGE_PRESENT);
241} 143}
242 144
145static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
146{
147 pteval_t v = native_pte_val(pte);
148
149 return native_make_pte(v | set);
150}
151
152static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
153{
154 pteval_t v = native_pte_val(pte);
155
156 return native_make_pte(v & ~clear);
157}
158
243static inline pte_t pte_mkclean(pte_t pte) 159static inline pte_t pte_mkclean(pte_t pte)
244{ 160{
245 return __pte(pte_val(pte) & ~_PAGE_DIRTY); 161 return pte_clear_flags(pte, _PAGE_DIRTY);
246} 162}
247 163
248static inline pte_t pte_mkold(pte_t pte) 164static inline pte_t pte_mkold(pte_t pte)
249{ 165{
250 return __pte(pte_val(pte) & ~_PAGE_ACCESSED); 166 return pte_clear_flags(pte, _PAGE_ACCESSED);
251} 167}
252 168
253static inline pte_t pte_wrprotect(pte_t pte) 169static inline pte_t pte_wrprotect(pte_t pte)
254{ 170{
255 return __pte(pte_val(pte) & ~_PAGE_RW); 171 return pte_clear_flags(pte, _PAGE_RW);
256} 172}
257 173
258static inline pte_t pte_mkexec(pte_t pte) 174static inline pte_t pte_mkexec(pte_t pte)
259{ 175{
260 return __pte(pte_val(pte) & ~_PAGE_NX); 176 return pte_clear_flags(pte, _PAGE_NX);
261} 177}
262 178
263static inline pte_t pte_mkdirty(pte_t pte) 179static inline pte_t pte_mkdirty(pte_t pte)
264{ 180{
265 return __pte(pte_val(pte) | _PAGE_DIRTY); 181 return pte_set_flags(pte, _PAGE_DIRTY);
266} 182}
267 183
268static inline pte_t pte_mkyoung(pte_t pte) 184static inline pte_t pte_mkyoung(pte_t pte)
269{ 185{
270 return __pte(pte_val(pte) | _PAGE_ACCESSED); 186 return pte_set_flags(pte, _PAGE_ACCESSED);
271} 187}
272 188
273static inline pte_t pte_mkwrite(pte_t pte) 189static inline pte_t pte_mkwrite(pte_t pte)
274{ 190{
275 return __pte(pte_val(pte) | _PAGE_RW); 191 return pte_set_flags(pte, _PAGE_RW);
276} 192}
277 193
278static inline pte_t pte_mkhuge(pte_t pte) 194static inline pte_t pte_mkhuge(pte_t pte)
279{ 195{
280 return __pte(pte_val(pte) | _PAGE_PSE); 196 return pte_set_flags(pte, _PAGE_PSE);
281} 197}
282 198
283static inline pte_t pte_clrhuge(pte_t pte) 199static inline pte_t pte_clrhuge(pte_t pte)
284{ 200{
285 return __pte(pte_val(pte) & ~_PAGE_PSE); 201 return pte_clear_flags(pte, _PAGE_PSE);
286} 202}
287 203
288static inline pte_t pte_mkglobal(pte_t pte) 204static inline pte_t pte_mkglobal(pte_t pte)
289{ 205{
290 return __pte(pte_val(pte) | _PAGE_GLOBAL); 206 return pte_set_flags(pte, _PAGE_GLOBAL);
291} 207}
292 208
293static inline pte_t pte_clrglobal(pte_t pte) 209static inline pte_t pte_clrglobal(pte_t pte)
294{ 210{
295 return __pte(pte_val(pte) & ~_PAGE_GLOBAL); 211 return pte_clear_flags(pte, _PAGE_GLOBAL);
296} 212}
297 213
298static inline pte_t pte_mkspecial(pte_t pte) 214static inline pte_t pte_mkspecial(pte_t pte)
299{ 215{
300 return __pte(pte_val(pte) | _PAGE_SPECIAL); 216 return pte_set_flags(pte, _PAGE_SPECIAL);
301} 217}
302 218
303extern pteval_t __supported_pte_mask;
304
305/* 219/*
306 * Mask out unsupported bits in a present pgprot. Non-present pgprots 220 * Mask out unsupported bits in a present pgprot. Non-present pgprots
307 * can use those bits for other purposes, so leave them be. 221 * can use those bits for other purposes, so leave them be.
@@ -374,82 +288,197 @@ static inline int is_new_memtype_allowed(unsigned long flags,
374 return 1; 288 return 1;
375} 289}
376 290
377#ifndef __ASSEMBLY__ 291pmd_t *populate_extra_pmd(unsigned long vaddr);
378/* Indicate that x86 has its own track and untrack pfn vma functions */ 292pte_t *populate_extra_pte(unsigned long vaddr);
379#define __HAVE_PFNMAP_TRACKING 293#endif /* __ASSEMBLY__ */
380
381#define __HAVE_PHYS_MEM_ACCESS_PROT
382struct file;
383pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
384 unsigned long size, pgprot_t vma_prot);
385int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
386 unsigned long size, pgprot_t *vma_prot);
387#endif
388
389/* Install a pte for a particular vaddr in kernel space. */
390void set_pte_vaddr(unsigned long vaddr, pte_t pte);
391 294
392#ifdef CONFIG_X86_32 295#ifdef CONFIG_X86_32
393extern void native_pagetable_setup_start(pgd_t *base); 296# include "pgtable_32.h"
394extern void native_pagetable_setup_done(pgd_t *base);
395#else 297#else
396static inline void native_pagetable_setup_start(pgd_t *base) {} 298# include "pgtable_64.h"
397static inline void native_pagetable_setup_done(pgd_t *base) {}
398#endif 299#endif
399 300
400struct seq_file; 301#ifndef __ASSEMBLY__
401extern void arch_report_meminfo(struct seq_file *m); 302#include <linux/mm_types.h>
402 303
403#ifdef CONFIG_PARAVIRT 304static inline int pte_none(pte_t pte)
404#include <asm/paravirt.h> 305{
405#else /* !CONFIG_PARAVIRT */ 306 return !pte.pte;
406#define set_pte(ptep, pte) native_set_pte(ptep, pte) 307}
407#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte)
408 308
409#define set_pte_present(mm, addr, ptep, pte) \ 309#define __HAVE_ARCH_PTE_SAME
410 native_set_pte_present(mm, addr, ptep, pte) 310static inline int pte_same(pte_t a, pte_t b)
411#define set_pte_atomic(ptep, pte) \ 311{
412 native_set_pte_atomic(ptep, pte) 312 return a.pte == b.pte;
313}
413 314
414#define set_pmd(pmdp, pmd) native_set_pmd(pmdp, pmd) 315static inline int pte_present(pte_t a)
316{
317 return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
318}
415 319
416#ifndef __PAGETABLE_PUD_FOLDED 320static inline int pmd_present(pmd_t pmd)
417#define set_pgd(pgdp, pgd) native_set_pgd(pgdp, pgd) 321{
418#define pgd_clear(pgd) native_pgd_clear(pgd) 322 return pmd_flags(pmd) & _PAGE_PRESENT;
419#endif 323}
420 324
421#ifndef set_pud 325static inline int pmd_none(pmd_t pmd)
422# define set_pud(pudp, pud) native_set_pud(pudp, pud) 326{
423#endif 327 /* Only check low word on 32-bit platforms, since it might be
328 out of sync with upper half. */
329 return (unsigned long)native_pmd_val(pmd) == 0;
330}
424 331
425#ifndef __PAGETABLE_PMD_FOLDED 332static inline unsigned long pmd_page_vaddr(pmd_t pmd)
426#define pud_clear(pud) native_pud_clear(pud) 333{
427#endif 334 return (unsigned long)__va(pmd_val(pmd) & PTE_PFN_MASK);
335}
428 336
429#define pte_clear(mm, addr, ptep) native_pte_clear(mm, addr, ptep) 337/*
430#define pmd_clear(pmd) native_pmd_clear(pmd) 338 * Currently stuck as a macro due to indirect forward reference to
339 * linux/mmzone.h's __section_mem_map_addr() definition:
340 */
341#define pmd_page(pmd) pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)
431 342
432#define pte_update(mm, addr, ptep) do { } while (0) 343/*
433#define pte_update_defer(mm, addr, ptep) do { } while (0) 344 * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
345 *
346 * this macro returns the index of the entry in the pmd page which would
347 * control the given virtual address
348 */
349static inline unsigned pmd_index(unsigned long address)
350{
351 return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1);
352}
434 353
435static inline void __init paravirt_pagetable_setup_start(pgd_t *base) 354/*
355 * Conversion functions: convert a page and protection to a page entry,
356 * and a page entry and page directory to the page they refer to.
357 *
358 * (Currently stuck as a macro because of indirect forward reference
359 * to linux/mm.h:page_to_nid())
360 */
361#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
362
363/*
364 * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE]
365 *
366 * this function returns the index of the entry in the pte page which would
367 * control the given virtual address
368 */
369static inline unsigned pte_index(unsigned long address)
436{ 370{
437 native_pagetable_setup_start(base); 371 return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
438} 372}
439 373
440static inline void __init paravirt_pagetable_setup_done(pgd_t *base) 374static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
441{ 375{
442 native_pagetable_setup_done(base); 376 return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address);
443} 377}
444#endif /* CONFIG_PARAVIRT */
445 378
446#endif /* __ASSEMBLY__ */ 379static inline int pmd_bad(pmd_t pmd)
380{
381 return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
382}
447 383
448#ifdef CONFIG_X86_32 384static inline unsigned long pages_to_mb(unsigned long npg)
449# include "pgtable_32.h" 385{
386 return npg >> (20 - PAGE_SHIFT);
387}
388
389#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
390 remap_pfn_range(vma, vaddr, pfn, size, prot)
391
392#if PAGETABLE_LEVELS > 2
393static inline int pud_none(pud_t pud)
394{
395 return native_pud_val(pud) == 0;
396}
397
398static inline int pud_present(pud_t pud)
399{
400 return pud_flags(pud) & _PAGE_PRESENT;
401}
402
403static inline unsigned long pud_page_vaddr(pud_t pud)
404{
405 return (unsigned long)__va((unsigned long)pud_val(pud) & PTE_PFN_MASK);
406}
407
408/*
409 * Currently stuck as a macro due to indirect forward reference to
410 * linux/mmzone.h's __section_mem_map_addr() definition:
411 */
412#define pud_page(pud) pfn_to_page(pud_val(pud) >> PAGE_SHIFT)
413
414/* Find an entry in the second-level page table.. */
415static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
416{
417 return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address);
418}
419
420static inline unsigned long pmd_pfn(pmd_t pmd)
421{
422 return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT;
423}
424
425static inline int pud_large(pud_t pud)
426{
427 return (pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) ==
428 (_PAGE_PSE | _PAGE_PRESENT);
429}
430
431static inline int pud_bad(pud_t pud)
432{
433 return (pud_flags(pud) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0;
434}
450#else 435#else
451# include "pgtable_64.h" 436static inline int pud_large(pud_t pud)
452#endif 437{
438 return 0;
439}
440#endif /* PAGETABLE_LEVELS > 2 */
441
442#if PAGETABLE_LEVELS > 3
443static inline int pgd_present(pgd_t pgd)
444{
445 return pgd_flags(pgd) & _PAGE_PRESENT;
446}
447
448static inline unsigned long pgd_page_vaddr(pgd_t pgd)
449{
450 return (unsigned long)__va((unsigned long)pgd_val(pgd) & PTE_PFN_MASK);
451}
452
453/*
454 * Currently stuck as a macro due to indirect forward reference to
455 * linux/mmzone.h's __section_mem_map_addr() definition:
456 */
457#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT)
458
459/* to find an entry in a page-table-directory. */
460static inline unsigned pud_index(unsigned long address)
461{
462 return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
463}
464
465static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
466{
467 return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(address);
468}
469
470static inline int pgd_bad(pgd_t pgd)
471{
472 return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
473}
474
475static inline int pgd_none(pgd_t pgd)
476{
477 return !native_pgd_val(pgd);
478}
479#endif /* PAGETABLE_LEVELS > 3 */
480
481#endif /* __ASSEMBLY__ */
453 482
454/* 483/*
455 * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD] 484 * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
@@ -476,28 +505,6 @@ static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
476 505
477#ifndef __ASSEMBLY__ 506#ifndef __ASSEMBLY__
478 507
479enum {
480 PG_LEVEL_NONE,
481 PG_LEVEL_4K,
482 PG_LEVEL_2M,
483 PG_LEVEL_1G,
484 PG_LEVEL_NUM
485};
486
487#ifdef CONFIG_PROC_FS
488extern void update_page_count(int level, unsigned long pages);
489#else
490static inline void update_page_count(int level, unsigned long pages) { }
491#endif
492
493/*
494 * Helper function that returns the kernel pagetable entry controlling
495 * the virtual address 'address'. NULL means no pagetable entry present.
496 * NOTE: the return type is pte_t but if the pmd is PSE then we return it
497 * as a pte too.
498 */
499extern pte_t *lookup_address(unsigned long address, unsigned int *level);
500
501/* local pte updates need not use xchg for locking */ 508/* local pte updates need not use xchg for locking */
502static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) 509static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
503{ 510{
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 72b020deb46b..97612fc7632f 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -1,6 +1,7 @@
1#ifndef _ASM_X86_PGTABLE_32_H 1#ifndef _ASM_X86_PGTABLE_32_H
2#define _ASM_X86_PGTABLE_32_H 2#define _ASM_X86_PGTABLE_32_H
3 3
4#include <asm/pgtable_32_types.h>
4 5
5/* 6/*
6 * The Linux memory management assumes a three-level page table setup. On 7 * The Linux memory management assumes a three-level page table setup. On
@@ -33,47 +34,6 @@ void paging_init(void);
33 34
34extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t); 35extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
35 36
36/*
37 * The Linux x86 paging architecture is 'compile-time dual-mode', it
38 * implements both the traditional 2-level x86 page tables and the
39 * newer 3-level PAE-mode page tables.
40 */
41#ifdef CONFIG_X86_PAE
42# include <asm/pgtable-3level-defs.h>
43# define PMD_SIZE (1UL << PMD_SHIFT)
44# define PMD_MASK (~(PMD_SIZE - 1))
45#else
46# include <asm/pgtable-2level-defs.h>
47#endif
48
49#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
50#define PGDIR_MASK (~(PGDIR_SIZE - 1))
51
52/* Just any arbitrary offset to the start of the vmalloc VM area: the
53 * current 8MB value just means that there will be a 8MB "hole" after the
54 * physical memory until the kernel virtual memory starts. That means that
55 * any out-of-bounds memory accesses will hopefully be caught.
56 * The vmalloc() routines leaves a hole of 4kB between each vmalloced
57 * area for the same reason. ;)
58 */
59#define VMALLOC_OFFSET (8 * 1024 * 1024)
60#define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET)
61#ifdef CONFIG_X86_PAE
62#define LAST_PKMAP 512
63#else
64#define LAST_PKMAP 1024
65#endif
66
67#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE * (LAST_PKMAP + 1)) \
68 & PMD_MASK)
69
70#ifdef CONFIG_HIGHMEM
71# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE)
72#else
73# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE)
74#endif
75
76#define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE)
77 37
78/* 38/*
79 * Define this if things work differently on an i386 and an i486: 39 * Define this if things work differently on an i386 and an i486:
@@ -85,55 +45,12 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
85/* The boot page tables (all created as a single array) */ 45/* The boot page tables (all created as a single array) */
86extern unsigned long pg0[]; 46extern unsigned long pg0[];
87 47
88#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
89
90/* To avoid harmful races, pmd_none(x) should check only the lower when PAE */
91#define pmd_none(x) (!(unsigned long)pmd_val((x)))
92#define pmd_present(x) (pmd_val((x)) & _PAGE_PRESENT)
93#define pmd_bad(x) ((pmd_val(x) & (PTE_FLAGS_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
94
95#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
96
97#ifdef CONFIG_X86_PAE 48#ifdef CONFIG_X86_PAE
98# include <asm/pgtable-3level.h> 49# include <asm/pgtable-3level.h>
99#else 50#else
100# include <asm/pgtable-2level.h> 51# include <asm/pgtable-2level.h>
101#endif 52#endif
102 53
103/*
104 * Conversion functions: convert a page and protection to a page entry,
105 * and a page entry and page directory to the page they refer to.
106 */
107#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
108
109
110static inline int pud_large(pud_t pud) { return 0; }
111
112/*
113 * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
114 *
115 * this macro returns the index of the entry in the pmd page which would
116 * control the given virtual address
117 */
118#define pmd_index(address) \
119 (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
120
121/*
122 * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE]
123 *
124 * this macro returns the index of the entry in the pte page which would
125 * control the given virtual address
126 */
127#define pte_index(address) \
128 (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
129#define pte_offset_kernel(dir, address) \
130 ((pte_t *)pmd_page_vaddr(*(dir)) + pte_index((address)))
131
132#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
133
134#define pmd_page_vaddr(pmd) \
135 ((unsigned long)__va(pmd_val((pmd)) & PTE_PFN_MASK))
136
137#if defined(CONFIG_HIGHPTE) 54#if defined(CONFIG_HIGHPTE)
138#define pte_offset_map(dir, address) \ 55#define pte_offset_map(dir, address) \
139 ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE0) + \ 56 ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE0) + \
@@ -176,7 +93,4 @@ do { \
176#define kern_addr_valid(kaddr) (0) 93#define kern_addr_valid(kaddr) (0)
177#endif 94#endif
178 95
179#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
180 remap_pfn_range(vma, vaddr, pfn, size, prot)
181
182#endif /* _ASM_X86_PGTABLE_32_H */ 96#endif /* _ASM_X86_PGTABLE_32_H */
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
new file mode 100644
index 000000000000..2733fad45f98
--- /dev/null
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -0,0 +1,51 @@
1#ifndef _ASM_X86_PGTABLE_32_DEFS_H
2#define _ASM_X86_PGTABLE_32_DEFS_H
3
4/*
5 * The Linux x86 paging architecture is 'compile-time dual-mode', it
6 * implements both the traditional 2-level x86 page tables and the
7 * newer 3-level PAE-mode page tables.
8 */
9#ifdef CONFIG_X86_PAE
10# include <asm/pgtable-3level_types.h>
11# define PMD_SIZE (1UL << PMD_SHIFT)
12# define PMD_MASK (~(PMD_SIZE - 1))
13#else
14# include <asm/pgtable-2level_types.h>
15#endif
16
17#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
18#define PGDIR_MASK (~(PGDIR_SIZE - 1))
19
20/* Just any arbitrary offset to the start of the vmalloc VM area: the
21 * current 8MB value just means that there will be a 8MB "hole" after the
22 * physical memory until the kernel virtual memory starts. That means that
23 * any out-of-bounds memory accesses will hopefully be caught.
24 * The vmalloc() routines leaves a hole of 4kB between each vmalloced
25 * area for the same reason. ;)
26 */
27#define VMALLOC_OFFSET (8 * 1024 * 1024)
28
29#ifndef __ASSEMBLER__
30extern bool __vmalloc_start_set; /* set once high_memory is set */
31#endif
32
33#define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET)
34#ifdef CONFIG_X86_PAE
35#define LAST_PKMAP 512
36#else
37#define LAST_PKMAP 1024
38#endif
39
40#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE * (LAST_PKMAP + 1)) \
41 & PMD_MASK)
42
43#ifdef CONFIG_HIGHMEM
44# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE)
45#else
46# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE)
47#endif
48
49#define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE)
50
51#endif /* _ASM_X86_PGTABLE_32_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index ba09289accaa..6b87bc6d5018 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -2,6 +2,8 @@
2#define _ASM_X86_PGTABLE_64_H 2#define _ASM_X86_PGTABLE_64_H
3 3
4#include <linux/const.h> 4#include <linux/const.h>
5#include <asm/pgtable_64_types.h>
6
5#ifndef __ASSEMBLY__ 7#ifndef __ASSEMBLY__
6 8
7/* 9/*
@@ -11,7 +13,6 @@
11#include <asm/processor.h> 13#include <asm/processor.h>
12#include <linux/bitops.h> 14#include <linux/bitops.h>
13#include <linux/threads.h> 15#include <linux/threads.h>
14#include <asm/pda.h>
15 16
16extern pud_t level3_kernel_pgt[512]; 17extern pud_t level3_kernel_pgt[512];
17extern pud_t level3_ident_pgt[512]; 18extern pud_t level3_ident_pgt[512];
@@ -26,32 +27,6 @@ extern void paging_init(void);
26 27
27#endif /* !__ASSEMBLY__ */ 28#endif /* !__ASSEMBLY__ */
28 29
29#define SHARED_KERNEL_PMD 0
30
31/*
32 * PGDIR_SHIFT determines what a top-level page table entry can map
33 */
34#define PGDIR_SHIFT 39
35#define PTRS_PER_PGD 512
36
37/*
38 * 3rd level page
39 */
40#define PUD_SHIFT 30
41#define PTRS_PER_PUD 512
42
43/*
44 * PMD_SHIFT determines the size of the area a middle-level
45 * page table can map
46 */
47#define PMD_SHIFT 21
48#define PTRS_PER_PMD 512
49
50/*
51 * entries per page directory level
52 */
53#define PTRS_PER_PTE 512
54
55#ifndef __ASSEMBLY__ 30#ifndef __ASSEMBLY__
56 31
57#define pte_ERROR(e) \ 32#define pte_ERROR(e) \
@@ -67,9 +42,6 @@ extern void paging_init(void);
67 printk("%s:%d: bad pgd %p(%016lx).\n", \ 42 printk("%s:%d: bad pgd %p(%016lx).\n", \
68 __FILE__, __LINE__, &(e), pgd_val(e)) 43 __FILE__, __LINE__, &(e), pgd_val(e))
69 44
70#define pgd_none(x) (!pgd_val(x))
71#define pud_none(x) (!pud_val(x))
72
73struct mm_struct; 45struct mm_struct;
74 46
75void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte); 47void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
@@ -134,48 +106,6 @@ static inline void native_pgd_clear(pgd_t *pgd)
134 native_set_pgd(pgd, native_make_pgd(0)); 106 native_set_pgd(pgd, native_make_pgd(0));
135} 107}
136 108
137#define pte_same(a, b) ((a).pte == (b).pte)
138
139#endif /* !__ASSEMBLY__ */
140
141#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT)
142#define PMD_MASK (~(PMD_SIZE - 1))
143#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT)
144#define PUD_MASK (~(PUD_SIZE - 1))
145#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
146#define PGDIR_MASK (~(PGDIR_SIZE - 1))
147
148
149#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
150#define VMALLOC_START _AC(0xffffc20000000000, UL)
151#define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
152#define VMEMMAP_START _AC(0xffffe20000000000, UL)
153#define MODULES_VADDR _AC(0xffffffffa0000000, UL)
154#define MODULES_END _AC(0xffffffffff000000, UL)
155#define MODULES_LEN (MODULES_END - MODULES_VADDR)
156
157#ifndef __ASSEMBLY__
158
159static inline int pgd_bad(pgd_t pgd)
160{
161 return (pgd_val(pgd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
162}
163
164static inline int pud_bad(pud_t pud)
165{
166 return (pud_val(pud) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
167}
168
169static inline int pmd_bad(pmd_t pmd)
170{
171 return (pmd_val(pmd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
172}
173
174#define pte_none(x) (!pte_val((x)))
175#define pte_present(x) (pte_val((x)) & (_PAGE_PRESENT | _PAGE_PROTNONE))
176
177#define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */
178
179/* 109/*
180 * Conversion functions: convert a page and protection to a page entry, 110 * Conversion functions: convert a page and protection to a page entry,
181 * and a page entry and page directory to the page they refer to. 111 * and a page entry and page directory to the page they refer to.
@@ -184,41 +114,12 @@ static inline int pmd_bad(pmd_t pmd)
184/* 114/*
185 * Level 4 access. 115 * Level 4 access.
186 */ 116 */
187#define pgd_page_vaddr(pgd) \
188 ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_PFN_MASK))
189#define pgd_page(pgd) (pfn_to_page(pgd_val((pgd)) >> PAGE_SHIFT))
190#define pgd_present(pgd) (pgd_val(pgd) & _PAGE_PRESENT)
191static inline int pgd_large(pgd_t pgd) { return 0; } 117static inline int pgd_large(pgd_t pgd) { return 0; }
192#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE) 118#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
193 119
194/* PUD - Level3 access */ 120/* PUD - Level3 access */
195/* to find an entry in a page-table-directory. */
196#define pud_page_vaddr(pud) \
197 ((unsigned long)__va(pud_val((pud)) & PHYSICAL_PAGE_MASK))
198#define pud_page(pud) (pfn_to_page(pud_val((pud)) >> PAGE_SHIFT))
199#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
200#define pud_offset(pgd, address) \
201 ((pud_t *)pgd_page_vaddr(*(pgd)) + pud_index((address)))
202#define pud_present(pud) (pud_val((pud)) & _PAGE_PRESENT)
203
204static inline int pud_large(pud_t pte)
205{
206 return (pud_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
207 (_PAGE_PSE | _PAGE_PRESENT);
208}
209 121
210/* PMD - Level 2 access */ 122/* PMD - Level 2 access */
211#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_PFN_MASK))
212#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
213
214#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
215#define pmd_offset(dir, address) ((pmd_t *)pud_page_vaddr(*(dir)) + \
216 pmd_index(address))
217#define pmd_none(x) (!pmd_val((x)))
218#define pmd_present(x) (pmd_val((x)) & _PAGE_PRESENT)
219#define pfn_pmd(nr, prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val((prot))))
220#define pmd_pfn(x) ((pmd_val((x)) & __PHYSICAL_MASK) >> PAGE_SHIFT)
221
222#define pte_to_pgoff(pte) ((pte_val((pte)) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT) 123#define pte_to_pgoff(pte) ((pte_val((pte)) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
223#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | \ 124#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | \
224 _PAGE_FILE }) 125 _PAGE_FILE })
@@ -226,13 +127,6 @@ static inline int pud_large(pud_t pte)
226 127
227/* PTE - Level 1 access. */ 128/* PTE - Level 1 access. */
228 129
229/* page, protection -> pte */
230#define mk_pte(page, pgprot) pfn_pte(page_to_pfn((page)), (pgprot))
231
232#define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
233#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \
234 pte_index((address)))
235
236/* x86-64 always has all page tables mapped. */ 130/* x86-64 always has all page tables mapped. */
237#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) 131#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
238#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address)) 132#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address))
@@ -266,9 +160,6 @@ extern int direct_gbpages;
266extern int kern_addr_valid(unsigned long addr); 160extern int kern_addr_valid(unsigned long addr);
267extern void cleanup_highmap(void); 161extern void cleanup_highmap(void);
268 162
269#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
270 remap_pfn_range(vma, vaddr, pfn, size, prot)
271
272#define HAVE_ARCH_UNMAPPED_AREA 163#define HAVE_ARCH_UNMAPPED_AREA
273#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN 164#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
274 165
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
new file mode 100644
index 000000000000..fbf42b8e0383
--- /dev/null
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -0,0 +1,63 @@
1#ifndef _ASM_X86_PGTABLE_64_DEFS_H
2#define _ASM_X86_PGTABLE_64_DEFS_H
3
4#ifndef __ASSEMBLY__
5#include <linux/types.h>
6
7/*
8 * These are used to make use of C type-checking..
9 */
10typedef unsigned long pteval_t;
11typedef unsigned long pmdval_t;
12typedef unsigned long pudval_t;
13typedef unsigned long pgdval_t;
14typedef unsigned long pgprotval_t;
15
16typedef struct { pteval_t pte; } pte_t;
17
18#endif /* !__ASSEMBLY__ */
19
20#define SHARED_KERNEL_PMD 0
21#define PAGETABLE_LEVELS 4
22
23/*
24 * PGDIR_SHIFT determines what a top-level page table entry can map
25 */
26#define PGDIR_SHIFT 39
27#define PTRS_PER_PGD 512
28
29/*
30 * 3rd level page
31 */
32#define PUD_SHIFT 30
33#define PTRS_PER_PUD 512
34
35/*
36 * PMD_SHIFT determines the size of the area a middle-level
37 * page table can map
38 */
39#define PMD_SHIFT 21
40#define PTRS_PER_PMD 512
41
42/*
43 * entries per page directory level
44 */
45#define PTRS_PER_PTE 512
46
47#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT)
48#define PMD_MASK (~(PMD_SIZE - 1))
49#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT)
50#define PUD_MASK (~(PUD_SIZE - 1))
51#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
52#define PGDIR_MASK (~(PGDIR_SIZE - 1))
53
54
55#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
56#define VMALLOC_START _AC(0xffffc20000000000, UL)
57#define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
58#define VMEMMAP_START _AC(0xffffe20000000000, UL)
59#define MODULES_VADDR _AC(0xffffffffa0000000, UL)
60#define MODULES_END _AC(0xffffffffff000000, UL)
61#define MODULES_LEN (MODULES_END - MODULES_VADDR)
62
63#endif /* _ASM_X86_PGTABLE_64_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
new file mode 100644
index 000000000000..b8238dc8786d
--- /dev/null
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -0,0 +1,329 @@
1#ifndef _ASM_X86_PGTABLE_DEFS_H
2#define _ASM_X86_PGTABLE_DEFS_H
3
4#include <linux/const.h>
5#include <asm/page_types.h>
6
7#define FIRST_USER_ADDRESS 0
8
9#define _PAGE_BIT_PRESENT 0 /* is present */
10#define _PAGE_BIT_RW 1 /* writeable */
11#define _PAGE_BIT_USER 2 /* userspace addressable */
12#define _PAGE_BIT_PWT 3 /* page write through */
13#define _PAGE_BIT_PCD 4 /* page cache disabled */
14#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */
15#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */
16#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
17#define _PAGE_BIT_PAT 7 /* on 4KB pages */
18#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
19#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
20#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */
21#define _PAGE_BIT_UNUSED3 11
22#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
23#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
24#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
25#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
26
27/* If _PAGE_BIT_PRESENT is clear, we use these: */
28/* - if the user mapped it with PROT_NONE; pte_present gives true */
29#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
30/* - set: nonlinear file mapping, saved PTE; unset:swap */
31#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY
32
33#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
34#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW)
35#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER)
36#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
37#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
38#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
39#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
40#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
41#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
42#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
43#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
44#define _PAGE_UNUSED3 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED3)
45#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
46#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
47#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
48#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
49#define __HAVE_ARCH_PTE_SPECIAL
50
51#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
52#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
53#else
54#define _PAGE_NX (_AT(pteval_t, 0))
55#endif
56
57#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
58#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
59
60#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
61 _PAGE_ACCESSED | _PAGE_DIRTY)
62#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
63 _PAGE_DIRTY)
64
65/* Set of bits not changed in pte_modify */
66#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
67 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
68
69#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT)
70#define _PAGE_CACHE_WB (0)
71#define _PAGE_CACHE_WC (_PAGE_PWT)
72#define _PAGE_CACHE_UC_MINUS (_PAGE_PCD)
73#define _PAGE_CACHE_UC (_PAGE_PCD | _PAGE_PWT)
74
75#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
76#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
77 _PAGE_ACCESSED | _PAGE_NX)
78
79#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \
80 _PAGE_USER | _PAGE_ACCESSED)
81#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
82 _PAGE_ACCESSED | _PAGE_NX)
83#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
84 _PAGE_ACCESSED)
85#define PAGE_COPY PAGE_COPY_NOEXEC
86#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \
87 _PAGE_ACCESSED | _PAGE_NX)
88#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
89 _PAGE_ACCESSED)
90
91#define __PAGE_KERNEL_EXEC \
92 (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_GLOBAL)
93#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
94
95#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
96#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
97#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT)
98#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC)
99#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
100#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
101#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
102#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
103#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
104#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
105#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
106
107#define __PAGE_KERNEL_IO (__PAGE_KERNEL | _PAGE_IOMAP)
108#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE | _PAGE_IOMAP)
109#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS | _PAGE_IOMAP)
110#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC | _PAGE_IOMAP)
111
112#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
113#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
114#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
115#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
116#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC)
117#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
118#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS)
119#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE)
120#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
121#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE)
122#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
123#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
124#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE)
125
126#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO)
127#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE)
128#define PAGE_KERNEL_IO_UC_MINUS __pgprot(__PAGE_KERNEL_IO_UC_MINUS)
129#define PAGE_KERNEL_IO_WC __pgprot(__PAGE_KERNEL_IO_WC)
130
131/* xwr */
132#define __P000 PAGE_NONE
133#define __P001 PAGE_READONLY
134#define __P010 PAGE_COPY
135#define __P011 PAGE_COPY
136#define __P100 PAGE_READONLY_EXEC
137#define __P101 PAGE_READONLY_EXEC
138#define __P110 PAGE_COPY_EXEC
139#define __P111 PAGE_COPY_EXEC
140
141#define __S000 PAGE_NONE
142#define __S001 PAGE_READONLY
143#define __S010 PAGE_SHARED
144#define __S011 PAGE_SHARED
145#define __S100 PAGE_READONLY_EXEC
146#define __S101 PAGE_READONLY_EXEC
147#define __S110 PAGE_SHARED_EXEC
148#define __S111 PAGE_SHARED_EXEC
149
150/*
151 * early identity mapping pte attrib macros.
152 */
153#ifdef CONFIG_X86_64
154#define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
155#else
156/*
157 * For PDE_IDENT_ATTR include USER bit. As the PDE and PTE protection
158 * bits are combined, this will alow user to access the high address mapped
159 * VDSO in the presence of CONFIG_COMPAT_VDSO
160 */
161#define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */
162#define PDE_IDENT_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */
163#define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */
164#endif
165
166#ifdef CONFIG_X86_32
167# include "pgtable_32_types.h"
168#else
169# include "pgtable_64_types.h"
170#endif
171
172#ifndef __ASSEMBLY__
173
174#include <linux/types.h>
175
176/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
177#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
178
179/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */
180#define PTE_FLAGS_MASK (~PTE_PFN_MASK)
181
182typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;
183
184typedef struct { pgdval_t pgd; } pgd_t;
185
186static inline pgd_t native_make_pgd(pgdval_t val)
187{
188 return (pgd_t) { val };
189}
190
191static inline pgdval_t native_pgd_val(pgd_t pgd)
192{
193 return pgd.pgd;
194}
195
196static inline pgdval_t pgd_flags(pgd_t pgd)
197{
198 return native_pgd_val(pgd) & PTE_FLAGS_MASK;
199}
200
201#if PAGETABLE_LEVELS > 3
202typedef struct { pudval_t pud; } pud_t;
203
204static inline pud_t native_make_pud(pmdval_t val)
205{
206 return (pud_t) { val };
207}
208
209static inline pudval_t native_pud_val(pud_t pud)
210{
211 return pud.pud;
212}
213#else
214#include <asm-generic/pgtable-nopud.h>
215
216static inline pudval_t native_pud_val(pud_t pud)
217{
218 return native_pgd_val(pud.pgd);
219}
220#endif
221
222#if PAGETABLE_LEVELS > 2
223typedef struct { pmdval_t pmd; } pmd_t;
224
225static inline pmd_t native_make_pmd(pmdval_t val)
226{
227 return (pmd_t) { val };
228}
229
230static inline pmdval_t native_pmd_val(pmd_t pmd)
231{
232 return pmd.pmd;
233}
234#else
235#include <asm-generic/pgtable-nopmd.h>
236
237static inline pmdval_t native_pmd_val(pmd_t pmd)
238{
239 return native_pgd_val(pmd.pud.pgd);
240}
241#endif
242
243static inline pudval_t pud_flags(pud_t pud)
244{
245 return native_pud_val(pud) & PTE_FLAGS_MASK;
246}
247
248static inline pmdval_t pmd_flags(pmd_t pmd)
249{
250 return native_pmd_val(pmd) & PTE_FLAGS_MASK;
251}
252
253static inline pte_t native_make_pte(pteval_t val)
254{
255 return (pte_t) { .pte = val };
256}
257
258static inline pteval_t native_pte_val(pte_t pte)
259{
260 return pte.pte;
261}
262
263static inline pteval_t pte_flags(pte_t pte)
264{
265 return native_pte_val(pte) & PTE_FLAGS_MASK;
266}
267
268#define pgprot_val(x) ((x).pgprot)
269#define __pgprot(x) ((pgprot_t) { (x) } )
270
271
272typedef struct page *pgtable_t;
273
274extern pteval_t __supported_pte_mask;
275extern int nx_enabled;
276extern void set_nx(void);
277
278#define pgprot_writecombine pgprot_writecombine
279extern pgprot_t pgprot_writecombine(pgprot_t prot);
280
281/* Indicate that x86 has its own track and untrack pfn vma functions */
282#define __HAVE_PFNMAP_TRACKING
283
284#define __HAVE_PHYS_MEM_ACCESS_PROT
285struct file;
286pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
287 unsigned long size, pgprot_t vma_prot);
288int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
289 unsigned long size, pgprot_t *vma_prot);
290
291/* Install a pte for a particular vaddr in kernel space. */
292void set_pte_vaddr(unsigned long vaddr, pte_t pte);
293
294#ifdef CONFIG_X86_32
295extern void native_pagetable_setup_start(pgd_t *base);
296extern void native_pagetable_setup_done(pgd_t *base);
297#else
298static inline void native_pagetable_setup_start(pgd_t *base) {}
299static inline void native_pagetable_setup_done(pgd_t *base) {}
300#endif
301
302struct seq_file;
303extern void arch_report_meminfo(struct seq_file *m);
304
305enum {
306 PG_LEVEL_NONE,
307 PG_LEVEL_4K,
308 PG_LEVEL_2M,
309 PG_LEVEL_1G,
310 PG_LEVEL_NUM
311};
312
313#ifdef CONFIG_PROC_FS
314extern void update_page_count(int level, unsigned long pages);
315#else
316static inline void update_page_count(int level, unsigned long pages) { }
317#endif
318
319/*
320 * Helper function that returns the kernel pagetable entry controlling
321 * the virtual address 'address'. NULL means no pagetable entry present.
322 * NOTE: the return type is pte_t but if the pmd is PSE then we return it
323 * as a pte too.
324 */
325extern pte_t *lookup_address(unsigned long address, unsigned int *level);
326
327#endif /* !__ASSEMBLY__ */
328
329#endif /* _ASM_X86_PGTABLE_DEFS_H */
diff --git a/arch/x86/include/asm/prctl.h b/arch/x86/include/asm/prctl.h
index a8894647dd9a..3ac5032fae09 100644
--- a/arch/x86/include/asm/prctl.h
+++ b/arch/x86/include/asm/prctl.h
@@ -6,8 +6,4 @@
6#define ARCH_GET_FS 0x1003 6#define ARCH_GET_FS 0x1003
7#define ARCH_GET_GS 0x1004 7#define ARCH_GET_GS 0x1004
8 8
9#ifdef CONFIG_X86_64
10extern long sys_arch_prctl(int, unsigned long);
11#endif /* CONFIG_X86_64 */
12
13#endif /* _ASM_X86_PRCTL_H */ 9#endif /* _ASM_X86_PRCTL_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 3bfd5235a9eb..76139506c3e4 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -16,6 +16,7 @@ struct mm_struct;
16#include <asm/cpufeature.h> 16#include <asm/cpufeature.h>
17#include <asm/system.h> 17#include <asm/system.h>
18#include <asm/page.h> 18#include <asm/page.h>
19#include <asm/pgtable_types.h>
19#include <asm/percpu.h> 20#include <asm/percpu.h>
20#include <asm/msr.h> 21#include <asm/msr.h>
21#include <asm/desc_defs.h> 22#include <asm/desc_defs.h>
@@ -73,7 +74,7 @@ struct cpuinfo_x86 {
73 char pad0; 74 char pad0;
74#else 75#else
75 /* Number of 4K pages in DTLB/ITLB combined(in pages): */ 76 /* Number of 4K pages in DTLB/ITLB combined(in pages): */
76 int x86_tlbsize; 77 int x86_tlbsize;
77 __u8 x86_virt_bits; 78 __u8 x86_virt_bits;
78 __u8 x86_phys_bits; 79 __u8 x86_phys_bits;
79#endif 80#endif
@@ -247,7 +248,6 @@ struct x86_hw_tss {
247#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long)) 248#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
248#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap) 249#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
249#define INVALID_IO_BITMAP_OFFSET 0x8000 250#define INVALID_IO_BITMAP_OFFSET 0x8000
250#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
251 251
252struct tss_struct { 252struct tss_struct {
253 /* 253 /*
@@ -262,11 +262,6 @@ struct tss_struct {
262 * be within the limit. 262 * be within the limit.
263 */ 263 */
264 unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; 264 unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
265 /*
266 * Cache the current maximum and the last task that used the bitmap:
267 */
268 unsigned long io_bitmap_max;
269 struct thread_struct *io_bitmap_owner;
270 265
271 /* 266 /*
272 * .. and then another 0x100 bytes for the emergency kernel stack: 267 * .. and then another 0x100 bytes for the emergency kernel stack:
@@ -378,9 +373,30 @@ union thread_xstate {
378 373
379#ifdef CONFIG_X86_64 374#ifdef CONFIG_X86_64
380DECLARE_PER_CPU(struct orig_ist, orig_ist); 375DECLARE_PER_CPU(struct orig_ist, orig_ist);
376
377union irq_stack_union {
378 char irq_stack[IRQ_STACK_SIZE];
379 /*
380 * GCC hardcodes the stack canary as %gs:40. Since the
381 * irq_stack is the object at %gs:0, we reserve the bottom
382 * 48 bytes of the irq stack for the canary.
383 */
384 struct {
385 char gs_base[40];
386 unsigned long stack_canary;
387 };
388};
389
390DECLARE_PER_CPU(union irq_stack_union, irq_stack_union);
391DECLARE_INIT_PER_CPU(irq_stack_union);
392
393DECLARE_PER_CPU(char *, irq_stack_ptr);
394#else /* X86_64 */
395#ifdef CONFIG_CC_STACKPROTECTOR
396DECLARE_PER_CPU(unsigned long, stack_canary);
381#endif 397#endif
398#endif /* X86_64 */
382 399
383extern void print_cpu_info(struct cpuinfo_x86 *);
384extern unsigned int xstate_size; 400extern unsigned int xstate_size;
385extern void free_thread_xstate(struct task_struct *); 401extern void free_thread_xstate(struct task_struct *);
386extern struct kmem_cache *task_xstate_cachep; 402extern struct kmem_cache *task_xstate_cachep;
@@ -752,9 +768,9 @@ extern int sysenter_setup(void);
752extern struct desc_ptr early_gdt_descr; 768extern struct desc_ptr early_gdt_descr;
753 769
754extern void cpu_set_gdt(int); 770extern void cpu_set_gdt(int);
755extern void switch_to_new_gdt(void); 771extern void switch_to_new_gdt(int);
772extern void load_percpu_segment(int);
756extern void cpu_init(void); 773extern void cpu_init(void);
757extern void init_gdt(int cpu);
758 774
759static inline unsigned long get_debugctlmsr(void) 775static inline unsigned long get_debugctlmsr(void)
760{ 776{
@@ -839,6 +855,7 @@ static inline void spin_lock_prefetch(const void *x)
839 * User space process size: 3GB (default). 855 * User space process size: 3GB (default).
840 */ 856 */
841#define TASK_SIZE PAGE_OFFSET 857#define TASK_SIZE PAGE_OFFSET
858#define TASK_SIZE_MAX TASK_SIZE
842#define STACK_TOP TASK_SIZE 859#define STACK_TOP TASK_SIZE
843#define STACK_TOP_MAX STACK_TOP 860#define STACK_TOP_MAX STACK_TOP
844 861
@@ -898,7 +915,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
898/* 915/*
899 * User space process size. 47bits minus one guard page. 916 * User space process size. 47bits minus one guard page.
900 */ 917 */
901#define TASK_SIZE64 ((1UL << 47) - PAGE_SIZE) 918#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE)
902 919
903/* This decides where the kernel will search for a free chunk of vm 920/* This decides where the kernel will search for a free chunk of vm
904 * space during mmap's. 921 * space during mmap's.
@@ -907,12 +924,12 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
907 0xc0000000 : 0xFFFFe000) 924 0xc0000000 : 0xFFFFe000)
908 925
909#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \ 926#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \
910 IA32_PAGE_OFFSET : TASK_SIZE64) 927 IA32_PAGE_OFFSET : TASK_SIZE_MAX)
911#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \ 928#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \
912 IA32_PAGE_OFFSET : TASK_SIZE64) 929 IA32_PAGE_OFFSET : TASK_SIZE_MAX)
913 930
914#define STACK_TOP TASK_SIZE 931#define STACK_TOP TASK_SIZE
915#define STACK_TOP_MAX TASK_SIZE64 932#define STACK_TOP_MAX TASK_SIZE_MAX
916 933
917#define INIT_THREAD { \ 934#define INIT_THREAD { \
918 .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ 935 .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index d6a22f92ba77..49fb3ecf3bb3 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -18,11 +18,7 @@ extern void syscall32_cpu_init(void);
18 18
19extern void check_efer(void); 19extern void check_efer(void);
20 20
21#ifdef CONFIG_X86_BIOS_REBOOT
22extern int reboot_force; 21extern int reboot_force;
23#else
24static const int reboot_force = 0;
25#endif
26 22
27long do_arch_prctl(struct task_struct *task, int code, unsigned long addr); 23long do_arch_prctl(struct task_struct *task, int code, unsigned long addr);
28 24
diff --git a/arch/x86/include/asm/ptrace-abi.h b/arch/x86/include/asm/ptrace-abi.h
index 8e0f8d199e05..86723035a515 100644
--- a/arch/x86/include/asm/ptrace-abi.h
+++ b/arch/x86/include/asm/ptrace-abi.h
@@ -80,8 +80,6 @@
80 80
81#define PTRACE_SINGLEBLOCK 33 /* resume execution until next branch */ 81#define PTRACE_SINGLEBLOCK 33 /* resume execution until next branch */
82 82
83#ifdef CONFIG_X86_PTRACE_BTS
84
85#ifndef __ASSEMBLY__ 83#ifndef __ASSEMBLY__
86#include <linux/types.h> 84#include <linux/types.h>
87 85
@@ -140,6 +138,5 @@ struct ptrace_bts_config {
140 BTS records are read from oldest to newest. 138 BTS records are read from oldest to newest.
141 Returns number of BTS records drained. 139 Returns number of BTS records drained.
142*/ 140*/
143#endif /* CONFIG_X86_PTRACE_BTS */
144 141
145#endif /* _ASM_X86_PTRACE_ABI_H */ 142#endif /* _ASM_X86_PTRACE_ABI_H */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 6d34d954c228..e304b66abeea 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -28,7 +28,7 @@ struct pt_regs {
28 int xds; 28 int xds;
29 int xes; 29 int xes;
30 int xfs; 30 int xfs;
31 /* int gs; */ 31 int xgs;
32 long orig_eax; 32 long orig_eax;
33 long eip; 33 long eip;
34 int xcs; 34 int xcs;
@@ -50,7 +50,7 @@ struct pt_regs {
50 unsigned long ds; 50 unsigned long ds;
51 unsigned long es; 51 unsigned long es;
52 unsigned long fs; 52 unsigned long fs;
53 /* int gs; */ 53 unsigned long gs;
54 unsigned long orig_ax; 54 unsigned long orig_ax;
55 unsigned long ip; 55 unsigned long ip;
56 unsigned long cs; 56 unsigned long cs;
diff --git a/arch/x86/include/asm/mach-rdc321x/rdc321x_defs.h b/arch/x86/include/asm/rdc321x_defs.h
index c8e9c8bed3d0..c8e9c8bed3d0 100644
--- a/arch/x86/include/asm/mach-rdc321x/rdc321x_defs.h
+++ b/arch/x86/include/asm/rdc321x_defs.h
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 1dc1b51ac623..14e0ed86a6f9 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -61,7 +61,7 @@
61 * 61 *
62 * 26 - ESPFIX small SS 62 * 26 - ESPFIX small SS
63 * 27 - per-cpu [ offset to per-cpu data area ] 63 * 27 - per-cpu [ offset to per-cpu data area ]
64 * 28 - unused 64 * 28 - stack_canary-20 [ for stack protector ]
65 * 29 - unused 65 * 29 - unused
66 * 30 - unused 66 * 30 - unused
67 * 31 - TSS for double fault handler 67 * 31 - TSS for double fault handler
@@ -95,6 +95,13 @@
95#define __KERNEL_PERCPU 0 95#define __KERNEL_PERCPU 0
96#endif 96#endif
97 97
98#define GDT_ENTRY_STACK_CANARY (GDT_ENTRY_KERNEL_BASE + 16)
99#ifdef CONFIG_CC_STACKPROTECTOR
100#define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY * 8)
101#else
102#define __KERNEL_STACK_CANARY 0
103#endif
104
98#define GDT_ENTRY_DOUBLEFAULT_TSS 31 105#define GDT_ENTRY_DOUBLEFAULT_TSS 31
99 106
100/* 107/*
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index ebe858cdc8a3..05c6f6b11fd5 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -1,33 +1,19 @@
1#ifndef _ASM_X86_SETUP_H 1#ifndef _ASM_X86_SETUP_H
2#define _ASM_X86_SETUP_H 2#define _ASM_X86_SETUP_H
3 3
4#ifdef __KERNEL__
5
4#define COMMAND_LINE_SIZE 2048 6#define COMMAND_LINE_SIZE 2048
5 7
6#ifndef __ASSEMBLY__ 8#ifndef __ASSEMBLY__
7 9
8/* Interrupt control for vSMPowered x86_64 systems */
9void vsmp_init(void);
10
11
12void setup_bios_corruption_check(void);
13
14
15#ifdef CONFIG_X86_VISWS
16extern void visws_early_detect(void);
17extern int is_visws_box(void);
18#else
19static inline void visws_early_detect(void) { }
20static inline int is_visws_box(void) { return 0; }
21#endif
22
23extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip);
24extern int wakeup_secondary_cpu_via_init(int apicid, unsigned long start_eip);
25/* 10/*
26 * Any setup quirks to be performed? 11 * Any setup quirks to be performed?
27 */ 12 */
28struct mpc_cpu; 13struct mpc_cpu;
29struct mpc_bus; 14struct mpc_bus;
30struct mpc_oemtable; 15struct mpc_oemtable;
16
31struct x86_quirks { 17struct x86_quirks {
32 int (*arch_pre_time_init)(void); 18 int (*arch_pre_time_init)(void);
33 int (*arch_time_init)(void); 19 int (*arch_time_init)(void);
@@ -43,20 +29,19 @@ struct x86_quirks {
43 void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); 29 void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name);
44 void (*mpc_oem_pci_bus)(struct mpc_bus *m); 30 void (*mpc_oem_pci_bus)(struct mpc_bus *m);
45 void (*smp_read_mpc_oem)(struct mpc_oemtable *oemtable, 31 void (*smp_read_mpc_oem)(struct mpc_oemtable *oemtable,
46 unsigned short oemsize); 32 unsigned short oemsize);
47 int (*setup_ioapic_ids)(void); 33 int (*setup_ioapic_ids)(void);
48 int (*update_genapic)(void);
49}; 34};
50 35
51extern struct x86_quirks *x86_quirks; 36extern void x86_quirk_pre_intr_init(void);
52extern unsigned long saved_video_mode; 37extern void x86_quirk_intr_init(void);
53 38
54#ifndef CONFIG_PARAVIRT 39extern void x86_quirk_trap_init(void);
55#define paravirt_post_allocator_init() do {} while (0)
56#endif
57#endif /* __ASSEMBLY__ */
58 40
59#ifdef __KERNEL__ 41extern void x86_quirk_pre_time_init(void);
42extern void x86_quirk_time_init(void);
43
44#endif /* __ASSEMBLY__ */
60 45
61#ifdef __i386__ 46#ifdef __i386__
62 47
@@ -78,6 +63,30 @@ extern unsigned long saved_video_mode;
78#ifndef __ASSEMBLY__ 63#ifndef __ASSEMBLY__
79#include <asm/bootparam.h> 64#include <asm/bootparam.h>
80 65
66/* Interrupt control for vSMPowered x86_64 systems */
67#ifdef CONFIG_X86_VSMP
68void vsmp_init(void);
69#else
70static inline void vsmp_init(void) { }
71#endif
72
73void setup_bios_corruption_check(void);
74
75#ifdef CONFIG_X86_VISWS
76extern void visws_early_detect(void);
77extern int is_visws_box(void);
78#else
79static inline void visws_early_detect(void) { }
80static inline int is_visws_box(void) { return 0; }
81#endif
82
83extern struct x86_quirks *x86_quirks;
84extern unsigned long saved_video_mode;
85
86#ifndef CONFIG_PARAVIRT
87#define paravirt_post_allocator_init() do {} while (0)
88#endif
89
81#ifndef _SETUP 90#ifndef _SETUP
82 91
83/* 92/*
@@ -100,7 +109,6 @@ extern unsigned long init_pg_tables_start;
100extern unsigned long init_pg_tables_end; 109extern unsigned long init_pg_tables_end;
101 110
102#else 111#else
103void __init x86_64_init_pda(void);
104void __init x86_64_start_kernel(char *real_mode); 112void __init x86_64_start_kernel(char *real_mode);
105void __init x86_64_start_reservations(char *real_mode_data); 113void __init x86_64_start_reservations(char *real_mode_data);
106 114
diff --git a/arch/x86/include/asm/mach-default/setup_arch.h b/arch/x86/include/asm/setup_arch.h
index 38846208b548..38846208b548 100644
--- a/arch/x86/include/asm/mach-default/setup_arch.h
+++ b/arch/x86/include/asm/setup_arch.h
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 19953df61c52..47d0e21f2b9e 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -15,34 +15,8 @@
15# include <asm/io_apic.h> 15# include <asm/io_apic.h>
16# endif 16# endif
17#endif 17#endif
18#include <asm/pda.h>
19#include <asm/thread_info.h> 18#include <asm/thread_info.h>
20 19#include <asm/cpumask.h>
21#ifdef CONFIG_X86_64
22
23extern cpumask_var_t cpu_callin_mask;
24extern cpumask_var_t cpu_callout_mask;
25extern cpumask_var_t cpu_initialized_mask;
26extern cpumask_var_t cpu_sibling_setup_mask;
27
28#else /* CONFIG_X86_32 */
29
30extern cpumask_t cpu_callin_map;
31extern cpumask_t cpu_callout_map;
32extern cpumask_t cpu_initialized;
33extern cpumask_t cpu_sibling_setup_map;
34
35#define cpu_callin_mask ((struct cpumask *)&cpu_callin_map)
36#define cpu_callout_mask ((struct cpumask *)&cpu_callout_map)
37#define cpu_initialized_mask ((struct cpumask *)&cpu_initialized)
38#define cpu_sibling_setup_mask ((struct cpumask *)&cpu_sibling_setup_map)
39
40#endif /* CONFIG_X86_32 */
41
42extern void (*mtrr_hook)(void);
43extern void zap_low_mappings(void);
44
45extern int __cpuinit get_local_pda(int cpu);
46 20
47extern int smp_num_siblings; 21extern int smp_num_siblings;
48extern unsigned int num_processors; 22extern unsigned int num_processors;
@@ -50,9 +24,7 @@ extern unsigned int num_processors;
50DECLARE_PER_CPU(cpumask_t, cpu_sibling_map); 24DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
51DECLARE_PER_CPU(cpumask_t, cpu_core_map); 25DECLARE_PER_CPU(cpumask_t, cpu_core_map);
52DECLARE_PER_CPU(u16, cpu_llc_id); 26DECLARE_PER_CPU(u16, cpu_llc_id);
53#ifdef CONFIG_X86_32
54DECLARE_PER_CPU(int, cpu_number); 27DECLARE_PER_CPU(int, cpu_number);
55#endif
56 28
57static inline struct cpumask *cpu_sibling_mask(int cpu) 29static inline struct cpumask *cpu_sibling_mask(int cpu)
58{ 30{
@@ -167,8 +139,6 @@ void play_dead_common(void);
167void native_send_call_func_ipi(const struct cpumask *mask); 139void native_send_call_func_ipi(const struct cpumask *mask);
168void native_send_call_func_single_ipi(int cpu); 140void native_send_call_func_single_ipi(int cpu);
169 141
170extern void prefill_possible_map(void);
171
172void smp_store_cpu_info(int id); 142void smp_store_cpu_info(int id);
173#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) 143#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
174 144
@@ -177,10 +147,6 @@ static inline int num_booting_cpus(void)
177{ 147{
178 return cpumask_weight(cpu_callout_mask); 148 return cpumask_weight(cpu_callout_mask);
179} 149}
180#else
181static inline void prefill_possible_map(void)
182{
183}
184#endif /* CONFIG_SMP */ 150#endif /* CONFIG_SMP */
185 151
186extern unsigned disabled_cpus __cpuinitdata; 152extern unsigned disabled_cpus __cpuinitdata;
@@ -191,11 +157,11 @@ extern unsigned disabled_cpus __cpuinitdata;
191 * from the initial startup. We map APIC_BASE very early in page_setup(), 157 * from the initial startup. We map APIC_BASE very early in page_setup(),
192 * so this is correct in the x86 case. 158 * so this is correct in the x86 case.
193 */ 159 */
194#define raw_smp_processor_id() (x86_read_percpu(cpu_number)) 160#define raw_smp_processor_id() (percpu_read(cpu_number))
195extern int safe_smp_processor_id(void); 161extern int safe_smp_processor_id(void);
196 162
197#elif defined(CONFIG_X86_64_SMP) 163#elif defined(CONFIG_X86_64_SMP)
198#define raw_smp_processor_id() read_pda(cpunumber) 164#define raw_smp_processor_id() (percpu_read(cpu_number))
199 165
200#define stack_smp_processor_id() \ 166#define stack_smp_processor_id() \
201({ \ 167({ \
@@ -205,10 +171,6 @@ extern int safe_smp_processor_id(void);
205}) 171})
206#define safe_smp_processor_id() smp_processor_id() 172#define safe_smp_processor_id() smp_processor_id()
207 173
208#else /* !CONFIG_X86_32_SMP && !CONFIG_X86_64_SMP */
209#define cpu_physical_id(cpu) boot_cpu_physical_apicid
210#define safe_smp_processor_id() 0
211#define stack_smp_processor_id() 0
212#endif 174#endif
213 175
214#ifdef CONFIG_X86_LOCAL_APIC 176#ifdef CONFIG_X86_LOCAL_APIC
@@ -220,28 +182,9 @@ static inline int logical_smp_processor_id(void)
220 return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR)); 182 return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
221} 183}
222 184
223#include <mach_apicdef.h>
224static inline unsigned int read_apic_id(void)
225{
226 unsigned int reg;
227
228 reg = *(u32 *)(APIC_BASE + APIC_ID);
229
230 return GET_APIC_ID(reg);
231}
232#endif 185#endif
233 186
234
235# if defined(APIC_DEFINITION) || defined(CONFIG_X86_64)
236extern int hard_smp_processor_id(void); 187extern int hard_smp_processor_id(void);
237# else
238#include <mach_apicdef.h>
239static inline int hard_smp_processor_id(void)
240{
241 /* we don't want to mark this access volatile - bad code generation */
242 return read_apic_id();
243}
244# endif /* APIC_DEFINITION */
245 188
246#else /* CONFIG_X86_LOCAL_APIC */ 189#else /* CONFIG_X86_LOCAL_APIC */
247 190
@@ -251,11 +194,5 @@ static inline int hard_smp_processor_id(void)
251 194
252#endif /* CONFIG_X86_LOCAL_APIC */ 195#endif /* CONFIG_X86_LOCAL_APIC */
253 196
254#ifdef CONFIG_X86_HAS_BOOT_CPU_ID
255extern unsigned char boot_cpu_id;
256#else
257#define boot_cpu_id 0
258#endif
259
260#endif /* __ASSEMBLY__ */ 197#endif /* __ASSEMBLY__ */
261#endif /* _ASM_X86_SMP_H */ 198#endif /* _ASM_X86_SMP_H */
diff --git a/arch/x86/include/asm/mach-default/smpboot_hooks.h b/arch/x86/include/asm/smpboot_hooks.h
index 23bf52103b89..1def60114906 100644
--- a/arch/x86/include/asm/mach-default/smpboot_hooks.h
+++ b/arch/x86/include/asm/smpboot_hooks.h
@@ -13,10 +13,10 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
13 CMOS_WRITE(0xa, 0xf); 13 CMOS_WRITE(0xa, 0xf);
14 local_flush_tlb(); 14 local_flush_tlb();
15 pr_debug("1.\n"); 15 pr_debug("1.\n");
16 *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = 16 *((volatile unsigned short *)phys_to_virt(apic->trampoline_phys_high)) =
17 start_eip >> 4; 17 start_eip >> 4;
18 pr_debug("2.\n"); 18 pr_debug("2.\n");
19 *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 19 *((volatile unsigned short *)phys_to_virt(apic->trampoline_phys_low)) =
20 start_eip & 0xf; 20 start_eip & 0xf;
21 pr_debug("3.\n"); 21 pr_debug("3.\n");
22} 22}
@@ -34,7 +34,7 @@ static inline void smpboot_restore_warm_reset_vector(void)
34 */ 34 */
35 CMOS_WRITE(0, 0xf); 35 CMOS_WRITE(0, 0xf);
36 36
37 *((volatile long *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0; 37 *((volatile long *)phys_to_virt(apic->trampoline_phys_low)) = 0;
38} 38}
39 39
40static inline void __init smpboot_setup_io_apic(void) 40static inline void __init smpboot_setup_io_apic(void)
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 8247e94ac6b1..3a5696656680 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -172,70 +172,8 @@ static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
172 return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1; 172 return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1;
173} 173}
174 174
175#ifdef CONFIG_PARAVIRT 175#ifndef CONFIG_PARAVIRT
176/*
177 * Define virtualization-friendly old-style lock byte lock, for use in
178 * pv_lock_ops if desired.
179 *
180 * This differs from the pre-2.6.24 spinlock by always using xchgb
181 * rather than decb to take the lock; this allows it to use a
182 * zero-initialized lock structure. It also maintains a 1-byte
183 * contention counter, so that we can implement
184 * __byte_spin_is_contended.
185 */
186struct __byte_spinlock {
187 s8 lock;
188 s8 spinners;
189};
190
191static inline int __byte_spin_is_locked(raw_spinlock_t *lock)
192{
193 struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
194 return bl->lock != 0;
195}
196
197static inline int __byte_spin_is_contended(raw_spinlock_t *lock)
198{
199 struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
200 return bl->spinners != 0;
201}
202
203static inline void __byte_spin_lock(raw_spinlock_t *lock)
204{
205 struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
206 s8 val = 1;
207
208 asm("1: xchgb %1, %0\n"
209 " test %1,%1\n"
210 " jz 3f\n"
211 " " LOCK_PREFIX "incb %2\n"
212 "2: rep;nop\n"
213 " cmpb $1, %0\n"
214 " je 2b\n"
215 " " LOCK_PREFIX "decb %2\n"
216 " jmp 1b\n"
217 "3:"
218 : "+m" (bl->lock), "+q" (val), "+m" (bl->spinners): : "memory");
219}
220
221static inline int __byte_spin_trylock(raw_spinlock_t *lock)
222{
223 struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
224 u8 old = 1;
225
226 asm("xchgb %1,%0"
227 : "+m" (bl->lock), "+q" (old) : : "memory");
228 176
229 return old == 0;
230}
231
232static inline void __byte_spin_unlock(raw_spinlock_t *lock)
233{
234 struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
235 smp_wmb();
236 bl->lock = 0;
237}
238#else /* !CONFIG_PARAVIRT */
239static inline int __raw_spin_is_locked(raw_spinlock_t *lock) 177static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
240{ 178{
241 return __ticket_spin_is_locked(lock); 179 return __ticket_spin_is_locked(lock);
@@ -268,7 +206,7 @@ static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
268 __raw_spin_lock(lock); 206 __raw_spin_lock(lock);
269} 207}
270 208
271#endif /* CONFIG_PARAVIRT */ 209#endif
272 210
273static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock) 211static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
274{ 212{
@@ -330,8 +268,7 @@ static inline int __raw_read_trylock(raw_rwlock_t *lock)
330{ 268{
331 atomic_t *count = (atomic_t *)lock; 269 atomic_t *count = (atomic_t *)lock;
332 270
333 atomic_dec(count); 271 if (atomic_dec_return(count) >= 0)
334 if (atomic_read(count) >= 0)
335 return 1; 272 return 1;
336 atomic_inc(count); 273 atomic_inc(count);
337 return 0; 274 return 0;
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
new file mode 100644
index 000000000000..c2d742c6e15f
--- /dev/null
+++ b/arch/x86/include/asm/stackprotector.h
@@ -0,0 +1,124 @@
1/*
2 * GCC stack protector support.
3 *
4 * Stack protector works by putting predefined pattern at the start of
5 * the stack frame and verifying that it hasn't been overwritten when
6 * returning from the function. The pattern is called stack canary
7 * and unfortunately gcc requires it to be at a fixed offset from %gs.
8 * On x86_64, the offset is 40 bytes and on x86_32 20 bytes. x86_64
9 * and x86_32 use segment registers differently and thus handles this
10 * requirement differently.
11 *
12 * On x86_64, %gs is shared by percpu area and stack canary. All
13 * percpu symbols are zero based and %gs points to the base of percpu
14 * area. The first occupant of the percpu area is always
15 * irq_stack_union which contains stack_canary at offset 40. Userland
16 * %gs is always saved and restored on kernel entry and exit using
17 * swapgs, so stack protector doesn't add any complexity there.
18 *
19 * On x86_32, it's slightly more complicated. As in x86_64, %gs is
20 * used for userland TLS. Unfortunately, some processors are much
21 * slower at loading segment registers with different value when
22 * entering and leaving the kernel, so the kernel uses %fs for percpu
23 * area and manages %gs lazily so that %gs is switched only when
24 * necessary, usually during task switch.
25 *
26 * As gcc requires the stack canary at %gs:20, %gs can't be managed
27 * lazily if stack protector is enabled, so the kernel saves and
28 * restores userland %gs on kernel entry and exit. This behavior is
29 * controlled by CONFIG_X86_32_LAZY_GS and accessors are defined in
30 * system.h to hide the details.
31 */
32
33#ifndef _ASM_STACKPROTECTOR_H
34#define _ASM_STACKPROTECTOR_H 1
35
36#ifdef CONFIG_CC_STACKPROTECTOR
37
38#include <asm/tsc.h>
39#include <asm/processor.h>
40#include <asm/percpu.h>
41#include <asm/system.h>
42#include <asm/desc.h>
43#include <linux/random.h>
44
45/*
46 * 24 byte read-only segment initializer for stack canary. Linker
47 * can't handle the address bit shifting. Address will be set in
48 * head_32 for boot CPU and setup_per_cpu_areas() for others.
49 */
50#define GDT_STACK_CANARY_INIT \
51 [GDT_ENTRY_STACK_CANARY] = { { { 0x00000018, 0x00409000 } } },
52
53/*
54 * Initialize the stackprotector canary value.
55 *
56 * NOTE: this must only be called from functions that never return,
57 * and it must always be inlined.
58 */
59static __always_inline void boot_init_stack_canary(void)
60{
61 u64 canary;
62 u64 tsc;
63
64#ifdef CONFIG_X86_64
65 BUILD_BUG_ON(offsetof(union irq_stack_union, stack_canary) != 40);
66#endif
67 /*
68 * We both use the random pool and the current TSC as a source
69 * of randomness. The TSC only matters for very early init,
70 * there it already has some randomness on most systems. Later
71 * on during the bootup the random pool has true entropy too.
72 */
73 get_random_bytes(&canary, sizeof(canary));
74 tsc = __native_read_tsc();
75 canary += tsc + (tsc << 32UL);
76
77 current->stack_canary = canary;
78#ifdef CONFIG_X86_64
79 percpu_write(irq_stack_union.stack_canary, canary);
80#else
81 percpu_write(stack_canary, canary);
82#endif
83}
84
85static inline void setup_stack_canary_segment(int cpu)
86{
87#ifdef CONFIG_X86_32
88 unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu) - 20;
89 struct desc_struct *gdt_table = get_cpu_gdt_table(cpu);
90 struct desc_struct desc;
91
92 desc = gdt_table[GDT_ENTRY_STACK_CANARY];
93 desc.base0 = canary & 0xffff;
94 desc.base1 = (canary >> 16) & 0xff;
95 desc.base2 = (canary >> 24) & 0xff;
96 write_gdt_entry(gdt_table, GDT_ENTRY_STACK_CANARY, &desc, DESCTYPE_S);
97#endif
98}
99
100static inline void load_stack_canary_segment(void)
101{
102#ifdef CONFIG_X86_32
103 asm("mov %0, %%gs" : : "r" (__KERNEL_STACK_CANARY) : "memory");
104#endif
105}
106
107#else /* CC_STACKPROTECTOR */
108
109#define GDT_STACK_CANARY_INIT
110
111/* dummy boot_init_stack_canary() is defined in linux/stackprotector.h */
112
113static inline void setup_stack_canary_segment(int cpu)
114{ }
115
116static inline void load_stack_canary_segment(void)
117{
118#ifdef CONFIG_X86_32
119 asm volatile ("mov %0, %%gs" : : "r" (0));
120#endif
121}
122
123#endif /* CC_STACKPROTECTOR */
124#endif /* _ASM_STACKPROTECTOR_H */
diff --git a/arch/x86/include/asm/summit/apic.h b/arch/x86/include/asm/summit/apic.h
deleted file mode 100644
index 93d2c8667cfe..000000000000
--- a/arch/x86/include/asm/summit/apic.h
+++ /dev/null
@@ -1,202 +0,0 @@
1#ifndef __ASM_SUMMIT_APIC_H
2#define __ASM_SUMMIT_APIC_H
3
4#include <asm/smp.h>
5#include <linux/gfp.h>
6
7#define esr_disable (1)
8#define NO_BALANCE_IRQ (0)
9
10/* In clustered mode, the high nibble of APIC ID is a cluster number.
11 * The low nibble is a 4-bit bitmap. */
12#define XAPIC_DEST_CPUS_SHIFT 4
13#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
14#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)
15
16#define APIC_DFR_VALUE (APIC_DFR_CLUSTER)
17
18static inline const cpumask_t *target_cpus(void)
19{
20 /* CPU_MASK_ALL (0xff) has undefined behaviour with
21 * dest_LowestPrio mode logical clustered apic interrupt routing
22 * Just start on cpu 0. IRQ balancing will spread load
23 */
24 return &cpumask_of_cpu(0);
25}
26
27#define INT_DELIVERY_MODE (dest_LowestPrio)
28#define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */
29
30static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
31{
32 return 0;
33}
34
35/* we don't use the phys_cpu_present_map to indicate apicid presence */
36static inline unsigned long check_apicid_present(int bit)
37{
38 return 1;
39}
40
41#define apicid_cluster(apicid) ((apicid) & XAPIC_DEST_CLUSTER_MASK)
42
43extern u8 cpu_2_logical_apicid[];
44
45static inline void init_apic_ldr(void)
46{
47 unsigned long val, id;
48 int count = 0;
49 u8 my_id = (u8)hard_smp_processor_id();
50 u8 my_cluster = (u8)apicid_cluster(my_id);
51#ifdef CONFIG_SMP
52 u8 lid;
53 int i;
54
55 /* Create logical APIC IDs by counting CPUs already in cluster. */
56 for (count = 0, i = nr_cpu_ids; --i >= 0; ) {
57 lid = cpu_2_logical_apicid[i];
58 if (lid != BAD_APICID && apicid_cluster(lid) == my_cluster)
59 ++count;
60 }
61#endif
62 /* We only have a 4 wide bitmap in cluster mode. If a deranged
63 * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */
64 BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT);
65 id = my_cluster | (1UL << count);
66 apic_write(APIC_DFR, APIC_DFR_VALUE);
67 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
68 val |= SET_APIC_LOGICAL_ID(id);
69 apic_write(APIC_LDR, val);
70}
71
72static inline int multi_timer_check(int apic, int irq)
73{
74 return 0;
75}
76
77static inline int apic_id_registered(void)
78{
79 return 1;
80}
81
82static inline void setup_apic_routing(void)
83{
84 printk("Enabling APIC mode: Summit. Using %d I/O APICs\n",
85 nr_ioapics);
86}
87
88static inline int apicid_to_node(int logical_apicid)
89{
90#ifdef CONFIG_SMP
91 return apicid_2_node[hard_smp_processor_id()];
92#else
93 return 0;
94#endif
95}
96
97/* Mapping from cpu number to logical apicid */
98static inline int cpu_to_logical_apicid(int cpu)
99{
100#ifdef CONFIG_SMP
101 if (cpu >= nr_cpu_ids)
102 return BAD_APICID;
103 return (int)cpu_2_logical_apicid[cpu];
104#else
105 return logical_smp_processor_id();
106#endif
107}
108
109static inline int cpu_present_to_apicid(int mps_cpu)
110{
111 if (mps_cpu < nr_cpu_ids)
112 return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
113 else
114 return BAD_APICID;
115}
116
117static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_id_map)
118{
119 /* For clustered we don't have a good way to do this yet - hack */
120 return physids_promote(0x0F);
121}
122
123static inline physid_mask_t apicid_to_cpu_present(int apicid)
124{
125 return physid_mask_of_physid(0);
126}
127
128static inline void setup_portio_remap(void)
129{
130}
131
132static inline int check_phys_apicid_present(int boot_cpu_physical_apicid)
133{
134 return 1;
135}
136
137static inline void enable_apic_mode(void)
138{
139}
140
141static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask)
142{
143 int num_bits_set;
144 int cpus_found = 0;
145 int cpu;
146 int apicid;
147
148 num_bits_set = cpus_weight(*cpumask);
149 /* Return id to all */
150 if (num_bits_set >= nr_cpu_ids)
151 return (int) 0xFF;
152 /*
153 * The cpus in the mask must all be on the apic cluster. If are not
154 * on the same apicid cluster return default value of TARGET_CPUS.
155 */
156 cpu = first_cpu(*cpumask);
157 apicid = cpu_to_logical_apicid(cpu);
158 while (cpus_found < num_bits_set) {
159 if (cpu_isset(cpu, *cpumask)) {
160 int new_apicid = cpu_to_logical_apicid(cpu);
161 if (apicid_cluster(apicid) !=
162 apicid_cluster(new_apicid)){
163 printk ("%s: Not a valid mask!\n", __func__);
164 return 0xFF;
165 }
166 apicid = apicid | new_apicid;
167 cpus_found++;
168 }
169 cpu++;
170 }
171 return apicid;
172}
173
174static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *inmask,
175 const struct cpumask *andmask)
176{
177 int apicid = cpu_to_logical_apicid(0);
178 cpumask_var_t cpumask;
179
180 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
181 return apicid;
182
183 cpumask_and(cpumask, inmask, andmask);
184 cpumask_and(cpumask, cpumask, cpu_online_mask);
185 apicid = cpu_mask_to_apicid(cpumask);
186
187 free_cpumask_var(cpumask);
188 return apicid;
189}
190
191/* cpuid returns the value latched in the HW at reset, not the APIC ID
192 * register's value. For any box whose BIOS changes APIC IDs, like
193 * clustered APIC systems, we must use hard_smp_processor_id.
194 *
195 * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID.
196 */
197static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
198{
199 return hard_smp_processor_id() >> index_msb;
200}
201
202#endif /* __ASM_SUMMIT_APIC_H */
diff --git a/arch/x86/include/asm/summit/apicdef.h b/arch/x86/include/asm/summit/apicdef.h
deleted file mode 100644
index f3fbca1f61c1..000000000000
--- a/arch/x86/include/asm/summit/apicdef.h
+++ /dev/null
@@ -1,13 +0,0 @@
1#ifndef __ASM_SUMMIT_APICDEF_H
2#define __ASM_SUMMIT_APICDEF_H
3
4#define APIC_ID_MASK (0xFF<<24)
5
6static inline unsigned get_apic_id(unsigned long x)
7{
8 return (x>>24)&0xFF;
9}
10
11#define GET_APIC_ID(x) get_apic_id(x)
12
13#endif
diff --git a/arch/x86/include/asm/summit/ipi.h b/arch/x86/include/asm/summit/ipi.h
deleted file mode 100644
index a8a2c24f50cc..000000000000
--- a/arch/x86/include/asm/summit/ipi.h
+++ /dev/null
@@ -1,26 +0,0 @@
1#ifndef __ASM_SUMMIT_IPI_H
2#define __ASM_SUMMIT_IPI_H
3
4void send_IPI_mask_sequence(const cpumask_t *mask, int vector);
5void send_IPI_mask_allbutself(const cpumask_t *mask, int vector);
6
7static inline void send_IPI_mask(const cpumask_t *mask, int vector)
8{
9 send_IPI_mask_sequence(mask, vector);
10}
11
12static inline void send_IPI_allbutself(int vector)
13{
14 cpumask_t mask = cpu_online_map;
15 cpu_clear(smp_processor_id(), mask);
16
17 if (!cpus_empty(mask))
18 send_IPI_mask(&mask, vector);
19}
20
21static inline void send_IPI_all(int vector)
22{
23 send_IPI_mask(&cpu_online_map, vector);
24}
25
26#endif /* __ASM_SUMMIT_IPI_H */
diff --git a/arch/x86/include/asm/summit/mpparse.h b/arch/x86/include/asm/summit/mpparse.h
deleted file mode 100644
index 380e86c02363..000000000000
--- a/arch/x86/include/asm/summit/mpparse.h
+++ /dev/null
@@ -1,109 +0,0 @@
1#ifndef __ASM_SUMMIT_MPPARSE_H
2#define __ASM_SUMMIT_MPPARSE_H
3
4#include <asm/tsc.h>
5
6extern int use_cyclone;
7
8#ifdef CONFIG_X86_SUMMIT_NUMA
9extern void setup_summit(void);
10#else
11#define setup_summit() {}
12#endif
13
14static inline int mps_oem_check(struct mpc_table *mpc, char *oem,
15 char *productid)
16{
17 if (!strncmp(oem, "IBM ENSW", 8) &&
18 (!strncmp(productid, "VIGIL SMP", 9)
19 || !strncmp(productid, "EXA", 3)
20 || !strncmp(productid, "RUTHLESS SMP", 12))){
21 mark_tsc_unstable("Summit based system");
22 use_cyclone = 1; /*enable cyclone-timer*/
23 setup_summit();
24 return 1;
25 }
26 return 0;
27}
28
29/* Hook from generic ACPI tables.c */
30static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id)
31{
32 if (!strncmp(oem_id, "IBM", 3) &&
33 (!strncmp(oem_table_id, "SERVIGIL", 8)
34 || !strncmp(oem_table_id, "EXA", 3))){
35 mark_tsc_unstable("Summit based system");
36 use_cyclone = 1; /*enable cyclone-timer*/
37 setup_summit();
38 return 1;
39 }
40 return 0;
41}
42
43struct rio_table_hdr {
44 unsigned char version; /* Version number of this data structure */
45 /* Version 3 adds chassis_num & WP_index */
46 unsigned char num_scal_dev; /* # of Scalability devices (Twisters for Vigil) */
47 unsigned char num_rio_dev; /* # of RIO I/O devices (Cyclones and Winnipegs) */
48} __attribute__((packed));
49
50struct scal_detail {
51 unsigned char node_id; /* Scalability Node ID */
52 unsigned long CBAR; /* Address of 1MB register space */
53 unsigned char port0node; /* Node ID port connected to: 0xFF=None */
54 unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */
55 unsigned char port1node; /* Node ID port connected to: 0xFF = None */
56 unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */
57 unsigned char port2node; /* Node ID port connected to: 0xFF = None */
58 unsigned char port2port; /* Port num port connected to: 0,1,2, or 0xFF=None */
59 unsigned char chassis_num; /* 1 based Chassis number (1 = boot node) */
60} __attribute__((packed));
61
62struct rio_detail {
63 unsigned char node_id; /* RIO Node ID */
64 unsigned long BBAR; /* Address of 1MB register space */
65 unsigned char type; /* Type of device */
66 unsigned char owner_id; /* For WPEG: Node ID of Cyclone that owns this WPEG*/
67 /* For CYC: Node ID of Twister that owns this CYC */
68 unsigned char port0node; /* Node ID port connected to: 0xFF=None */
69 unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */
70 unsigned char port1node; /* Node ID port connected to: 0xFF=None */
71 unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */
72 unsigned char first_slot; /* For WPEG: Lowest slot number below this WPEG */
73 /* For CYC: 0 */
74 unsigned char status; /* For WPEG: Bit 0 = 1 : the XAPIC is used */
75 /* = 0 : the XAPIC is not used, ie:*/
76 /* ints fwded to another XAPIC */
77 /* Bits1:7 Reserved */
78 /* For CYC: Bits0:7 Reserved */
79 unsigned char WP_index; /* For WPEG: WPEG instance index - lower ones have */
80 /* lower slot numbers/PCI bus numbers */
81 /* For CYC: No meaning */
82 unsigned char chassis_num; /* 1 based Chassis number */
83 /* For LookOut WPEGs this field indicates the */
84 /* Expansion Chassis #, enumerated from Boot */
85 /* Node WPEG external port, then Boot Node CYC */
86 /* external port, then Next Vigil chassis WPEG */
87 /* external port, etc. */
88 /* Shared Lookouts have only 1 chassis number (the */
89 /* first one assigned) */
90} __attribute__((packed));
91
92
93typedef enum {
94 CompatTwister = 0, /* Compatibility Twister */
95 AltTwister = 1, /* Alternate Twister of internal 8-way */
96 CompatCyclone = 2, /* Compatibility Cyclone */
97 AltCyclone = 3, /* Alternate Cyclone of internal 8-way */
98 CompatWPEG = 4, /* Compatibility WPEG */
99 AltWPEG = 5, /* Second Planar WPEG */
100 LookOutAWPEG = 6, /* LookOut WPEG */
101 LookOutBWPEG = 7, /* LookOut WPEG */
102} node_type;
103
104static inline int is_WPEG(struct rio_detail *rio){
105 return (rio->type == CompatWPEG || rio->type == AltWPEG ||
106 rio->type == LookOutAWPEG || rio->type == LookOutBWPEG);
107}
108
109#endif /* __ASM_SUMMIT_MPPARSE_H */
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index c0b0bda754ee..7043408f6904 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -29,21 +29,21 @@ asmlinkage int sys_get_thread_area(struct user_desc __user *);
29/* X86_32 only */ 29/* X86_32 only */
30#ifdef CONFIG_X86_32 30#ifdef CONFIG_X86_32
31/* kernel/process_32.c */ 31/* kernel/process_32.c */
32asmlinkage int sys_fork(struct pt_regs); 32int sys_fork(struct pt_regs *);
33asmlinkage int sys_clone(struct pt_regs); 33int sys_clone(struct pt_regs *);
34asmlinkage int sys_vfork(struct pt_regs); 34int sys_vfork(struct pt_regs *);
35asmlinkage int sys_execve(struct pt_regs); 35int sys_execve(struct pt_regs *);
36 36
37/* kernel/signal_32.c */ 37/* kernel/signal_32.c */
38asmlinkage int sys_sigsuspend(int, int, old_sigset_t); 38asmlinkage int sys_sigsuspend(int, int, old_sigset_t);
39asmlinkage int sys_sigaction(int, const struct old_sigaction __user *, 39asmlinkage int sys_sigaction(int, const struct old_sigaction __user *,
40 struct old_sigaction __user *); 40 struct old_sigaction __user *);
41asmlinkage int sys_sigaltstack(unsigned long); 41int sys_sigaltstack(struct pt_regs *);
42asmlinkage unsigned long sys_sigreturn(unsigned long); 42unsigned long sys_sigreturn(struct pt_regs *);
43asmlinkage int sys_rt_sigreturn(unsigned long); 43long sys_rt_sigreturn(struct pt_regs *);
44 44
45/* kernel/ioport.c */ 45/* kernel/ioport.c */
46asmlinkage long sys_iopl(unsigned long); 46long sys_iopl(struct pt_regs *);
47 47
48/* kernel/sys_i386_32.c */ 48/* kernel/sys_i386_32.c */
49asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long, 49asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long,
@@ -59,8 +59,8 @@ struct oldold_utsname;
59asmlinkage int sys_olduname(struct oldold_utsname __user *); 59asmlinkage int sys_olduname(struct oldold_utsname __user *);
60 60
61/* kernel/vm86_32.c */ 61/* kernel/vm86_32.c */
62asmlinkage int sys_vm86old(struct pt_regs); 62int sys_vm86old(struct pt_regs *);
63asmlinkage int sys_vm86(struct pt_regs); 63int sys_vm86(struct pt_regs *);
64 64
65#else /* CONFIG_X86_32 */ 65#else /* CONFIG_X86_32 */
66 66
@@ -74,6 +74,7 @@ asmlinkage long sys_vfork(struct pt_regs *);
74asmlinkage long sys_execve(char __user *, char __user * __user *, 74asmlinkage long sys_execve(char __user *, char __user * __user *,
75 char __user * __user *, 75 char __user * __user *,
76 struct pt_regs *); 76 struct pt_regs *);
77long sys_arch_prctl(int, unsigned long);
77 78
78/* kernel/ioport.c */ 79/* kernel/ioport.c */
79asmlinkage long sys_iopl(unsigned int, struct pt_regs *); 80asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
@@ -81,7 +82,7 @@ asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
81/* kernel/signal_64.c */ 82/* kernel/signal_64.c */
82asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *, 83asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *,
83 struct pt_regs *); 84 struct pt_regs *);
84asmlinkage long sys_rt_sigreturn(struct pt_regs *); 85long sys_rt_sigreturn(struct pt_regs *);
85 86
86/* kernel/sys_x86_64.c */ 87/* kernel/sys_x86_64.c */
87asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long, 88asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long,
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index 8e626ea33a1a..643c59b4bc6e 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -20,9 +20,26 @@
20struct task_struct; /* one of the stranger aspects of C forward declarations */ 20struct task_struct; /* one of the stranger aspects of C forward declarations */
21struct task_struct *__switch_to(struct task_struct *prev, 21struct task_struct *__switch_to(struct task_struct *prev,
22 struct task_struct *next); 22 struct task_struct *next);
23struct tss_struct;
24void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
25 struct tss_struct *tss);
23 26
24#ifdef CONFIG_X86_32 27#ifdef CONFIG_X86_32
25 28
29#ifdef CONFIG_CC_STACKPROTECTOR
30#define __switch_canary \
31 "movl %P[task_canary](%[next]), %%ebx\n\t" \
32 "movl %%ebx, "__percpu_arg([stack_canary])"\n\t"
33#define __switch_canary_oparam \
34 , [stack_canary] "=m" (per_cpu_var(stack_canary))
35#define __switch_canary_iparam \
36 , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
37#else /* CC_STACKPROTECTOR */
38#define __switch_canary
39#define __switch_canary_oparam
40#define __switch_canary_iparam
41#endif /* CC_STACKPROTECTOR */
42
26/* 43/*
27 * Saving eflags is important. It switches not only IOPL between tasks, 44 * Saving eflags is important. It switches not only IOPL between tasks,
28 * it also protects other tasks from NT leaking through sysenter etc. 45 * it also protects other tasks from NT leaking through sysenter etc.
@@ -44,6 +61,7 @@ do { \
44 "movl %[next_sp],%%esp\n\t" /* restore ESP */ \ 61 "movl %[next_sp],%%esp\n\t" /* restore ESP */ \
45 "movl $1f,%[prev_ip]\n\t" /* save EIP */ \ 62 "movl $1f,%[prev_ip]\n\t" /* save EIP */ \
46 "pushl %[next_ip]\n\t" /* restore EIP */ \ 63 "pushl %[next_ip]\n\t" /* restore EIP */ \
64 __switch_canary \
47 "jmp __switch_to\n" /* regparm call */ \ 65 "jmp __switch_to\n" /* regparm call */ \
48 "1:\t" \ 66 "1:\t" \
49 "popl %%ebp\n\t" /* restore EBP */ \ 67 "popl %%ebp\n\t" /* restore EBP */ \
@@ -58,6 +76,8 @@ do { \
58 "=b" (ebx), "=c" (ecx), "=d" (edx), \ 76 "=b" (ebx), "=c" (ecx), "=d" (edx), \
59 "=S" (esi), "=D" (edi) \ 77 "=S" (esi), "=D" (edi) \
60 \ 78 \
79 __switch_canary_oparam \
80 \
61 /* input parameters: */ \ 81 /* input parameters: */ \
62 : [next_sp] "m" (next->thread.sp), \ 82 : [next_sp] "m" (next->thread.sp), \
63 [next_ip] "m" (next->thread.ip), \ 83 [next_ip] "m" (next->thread.ip), \
@@ -66,6 +86,8 @@ do { \
66 [prev] "a" (prev), \ 86 [prev] "a" (prev), \
67 [next] "d" (next) \ 87 [next] "d" (next) \
68 \ 88 \
89 __switch_canary_iparam \
90 \
69 : /* reloaded segment registers */ \ 91 : /* reloaded segment registers */ \
70 "memory"); \ 92 "memory"); \
71} while (0) 93} while (0)
@@ -86,27 +108,44 @@ do { \
86 , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \ 108 , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
87 "r12", "r13", "r14", "r15" 109 "r12", "r13", "r14", "r15"
88 110
111#ifdef CONFIG_CC_STACKPROTECTOR
112#define __switch_canary \
113 "movq %P[task_canary](%%rsi),%%r8\n\t" \
114 "movq %%r8,"__percpu_arg([gs_canary])"\n\t"
115#define __switch_canary_oparam \
116 , [gs_canary] "=m" (per_cpu_var(irq_stack_union.stack_canary))
117#define __switch_canary_iparam \
118 , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
119#else /* CC_STACKPROTECTOR */
120#define __switch_canary
121#define __switch_canary_oparam
122#define __switch_canary_iparam
123#endif /* CC_STACKPROTECTOR */
124
89/* Save restore flags to clear handle leaking NT */ 125/* Save restore flags to clear handle leaking NT */
90#define switch_to(prev, next, last) \ 126#define switch_to(prev, next, last) \
91 asm volatile(SAVE_CONTEXT \ 127 asm volatile(SAVE_CONTEXT \
92 "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ 128 "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
93 "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ 129 "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
94 "call __switch_to\n\t" \ 130 "call __switch_to\n\t" \
95 ".globl thread_return\n" \ 131 ".globl thread_return\n" \
96 "thread_return:\n\t" \ 132 "thread_return:\n\t" \
97 "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \ 133 "movq "__percpu_arg([current_task])",%%rsi\n\t" \
134 __switch_canary \
98 "movq %P[thread_info](%%rsi),%%r8\n\t" \ 135 "movq %P[thread_info](%%rsi),%%r8\n\t" \
99 LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \
100 "movq %%rax,%%rdi\n\t" \ 136 "movq %%rax,%%rdi\n\t" \
101 "jc ret_from_fork\n\t" \ 137 "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \
138 "jnz ret_from_fork\n\t" \
102 RESTORE_CONTEXT \ 139 RESTORE_CONTEXT \
103 : "=a" (last) \ 140 : "=a" (last) \
141 __switch_canary_oparam \
104 : [next] "S" (next), [prev] "D" (prev), \ 142 : [next] "S" (next), [prev] "D" (prev), \
105 [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \ 143 [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \
106 [ti_flags] "i" (offsetof(struct thread_info, flags)), \ 144 [ti_flags] "i" (offsetof(struct thread_info, flags)), \
107 [tif_fork] "i" (TIF_FORK), \ 145 [_tif_fork] "i" (_TIF_FORK), \
108 [thread_info] "i" (offsetof(struct task_struct, stack)), \ 146 [thread_info] "i" (offsetof(struct task_struct, stack)), \
109 [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \ 147 [current_task] "m" (per_cpu_var(current_task)) \
148 __switch_canary_iparam \
110 : "memory", "cc" __EXTRA_CLOBBER) 149 : "memory", "cc" __EXTRA_CLOBBER)
111#endif 150#endif
112 151
@@ -165,6 +204,25 @@ extern void native_load_gs_index(unsigned);
165#define savesegment(seg, value) \ 204#define savesegment(seg, value) \
166 asm("mov %%" #seg ",%0":"=r" (value) : : "memory") 205 asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
167 206
207/*
208 * x86_32 user gs accessors.
209 */
210#ifdef CONFIG_X86_32
211#ifdef CONFIG_X86_32_LAZY_GS
212#define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;})
213#define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v))
214#define task_user_gs(tsk) ((tsk)->thread.gs)
215#define lazy_save_gs(v) savesegment(gs, (v))
216#define lazy_load_gs(v) loadsegment(gs, (v))
217#else /* X86_32_LAZY_GS */
218#define get_user_gs(regs) (u16)((regs)->gs)
219#define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0)
220#define task_user_gs(tsk) (task_pt_regs(tsk)->gs)
221#define lazy_save_gs(v) do { } while (0)
222#define lazy_load_gs(v) do { } while (0)
223#endif /* X86_32_LAZY_GS */
224#endif /* X86_32 */
225
168static inline unsigned long get_limit(unsigned long segment) 226static inline unsigned long get_limit(unsigned long segment)
169{ 227{
170 unsigned long __limit; 228 unsigned long __limit;
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 98789647baa9..8820a73ae090 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -40,6 +40,7 @@ struct thread_info {
40 */ 40 */
41 __u8 supervisor_stack[0]; 41 __u8 supervisor_stack[0];
42#endif 42#endif
43 int uaccess_err;
43}; 44};
44 45
45#define INIT_THREAD_INFO(tsk) \ 46#define INIT_THREAD_INFO(tsk) \
@@ -93,6 +94,7 @@ struct thread_info {
93#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ 94#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
94#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ 95#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */
95#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ 96#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
97#define TIF_SYSCALL_FTRACE 27 /* for ftrace syscall instrumentation */
96 98
97#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) 99#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
98#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) 100#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
@@ -114,15 +116,17 @@ struct thread_info {
114#define _TIF_FORCED_TF (1 << TIF_FORCED_TF) 116#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
115#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) 117#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR)
116#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) 118#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
119#define _TIF_SYSCALL_FTRACE (1 << TIF_SYSCALL_FTRACE)
117 120
118/* work to do in syscall_trace_enter() */ 121/* work to do in syscall_trace_enter() */
119#define _TIF_WORK_SYSCALL_ENTRY \ 122#define _TIF_WORK_SYSCALL_ENTRY \
120 (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | \ 123 (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_FTRACE | \
121 _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | _TIF_SINGLESTEP) 124 _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | _TIF_SINGLESTEP)
122 125
123/* work to do in syscall_trace_leave() */ 126/* work to do in syscall_trace_leave() */
124#define _TIF_WORK_SYSCALL_EXIT \ 127#define _TIF_WORK_SYSCALL_EXIT \
125 (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP) 128 (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \
129 _TIF_SYSCALL_FTRACE)
126 130
127/* work to do on interrupt/exception return */ 131/* work to do on interrupt/exception return */
128#define _TIF_WORK_MASK \ 132#define _TIF_WORK_MASK \
@@ -131,7 +135,7 @@ struct thread_info {
131 _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU)) 135 _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU))
132 136
133/* work to do on any return to user space */ 137/* work to do on any return to user space */
134#define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP) 138#define _TIF_ALLWORK_MASK ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_FTRACE)
135 139
136/* Only used for 64 bit */ 140/* Only used for 64 bit */
137#define _TIF_DO_NOTIFY_MASK \ 141#define _TIF_DO_NOTIFY_MASK \
@@ -194,25 +198,21 @@ static inline struct thread_info *current_thread_info(void)
194 198
195#else /* X86_32 */ 199#else /* X86_32 */
196 200
197#include <asm/pda.h> 201#include <asm/percpu.h>
202#define KERNEL_STACK_OFFSET (5*8)
198 203
199/* 204/*
200 * macros/functions for gaining access to the thread information structure 205 * macros/functions for gaining access to the thread information structure
201 * preempt_count needs to be 1 initially, until the scheduler is functional. 206 * preempt_count needs to be 1 initially, until the scheduler is functional.
202 */ 207 */
203#ifndef __ASSEMBLY__ 208#ifndef __ASSEMBLY__
204static inline struct thread_info *current_thread_info(void) 209DECLARE_PER_CPU(unsigned long, kernel_stack);
205{
206 struct thread_info *ti;
207 ti = (void *)(read_pda(kernelstack) + PDA_STACKOFFSET - THREAD_SIZE);
208 return ti;
209}
210 210
211/* do not use in interrupt context */ 211static inline struct thread_info *current_thread_info(void)
212static inline struct thread_info *stack_thread_info(void)
213{ 212{
214 struct thread_info *ti; 213 struct thread_info *ti;
215 asm("andq %%rsp,%0; " : "=r" (ti) : "0" (~(THREAD_SIZE - 1))); 214 ti = (void *)(percpu_read(kernel_stack) +
215 KERNEL_STACK_OFFSET - THREAD_SIZE);
216 return ti; 216 return ti;
217} 217}
218 218
@@ -220,8 +220,8 @@ static inline struct thread_info *stack_thread_info(void)
220 220
221/* how to get the thread information struct from ASM */ 221/* how to get the thread information struct from ASM */
222#define GET_THREAD_INFO(reg) \ 222#define GET_THREAD_INFO(reg) \
223 movq %gs:pda_kernelstack,reg ; \ 223 movq PER_CPU_VAR(kernel_stack),reg ; \
224 subq $(THREAD_SIZE-PDA_STACKOFFSET),reg 224 subq $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg
225 225
226#endif 226#endif
227 227
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 2bb6a835c453..a81195eaa2b3 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -3,6 +3,7 @@
3#include <linux/init.h> 3#include <linux/init.h>
4#include <linux/pm.h> 4#include <linux/pm.h>
5#include <linux/percpu.h> 5#include <linux/percpu.h>
6#include <linux/interrupt.h>
6 7
7#define TICK_SIZE (tick_nsec / 1000) 8#define TICK_SIZE (tick_nsec / 1000)
8 9
@@ -12,6 +13,7 @@ unsigned long native_calibrate_tsc(void);
12#ifdef CONFIG_X86_32 13#ifdef CONFIG_X86_32
13extern int timer_ack; 14extern int timer_ack;
14extern int recalibrate_cpu_khz(void); 15extern int recalibrate_cpu_khz(void);
16extern irqreturn_t timer_interrupt(int irq, void *dev_id);
15#endif /* CONFIG_X86_32 */ 17#endif /* CONFIG_X86_32 */
16 18
17extern int no_timer_check; 19extern int no_timer_check;
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 0e7bbb549116..d3539f998f88 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -113,7 +113,7 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
113 __flush_tlb(); 113 __flush_tlb();
114} 114}
115 115
116static inline void native_flush_tlb_others(const cpumask_t *cpumask, 116static inline void native_flush_tlb_others(const struct cpumask *cpumask,
117 struct mm_struct *mm, 117 struct mm_struct *mm,
118 unsigned long va) 118 unsigned long va)
119{ 119{
@@ -142,31 +142,28 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
142 flush_tlb_mm(vma->vm_mm); 142 flush_tlb_mm(vma->vm_mm);
143} 143}
144 144
145void native_flush_tlb_others(const cpumask_t *cpumask, struct mm_struct *mm, 145void native_flush_tlb_others(const struct cpumask *cpumask,
146 unsigned long va); 146 struct mm_struct *mm, unsigned long va);
147 147
148#define TLBSTATE_OK 1 148#define TLBSTATE_OK 1
149#define TLBSTATE_LAZY 2 149#define TLBSTATE_LAZY 2
150 150
151#ifdef CONFIG_X86_32
152struct tlb_state { 151struct tlb_state {
153 struct mm_struct *active_mm; 152 struct mm_struct *active_mm;
154 int state; 153 int state;
155 char __cacheline_padding[L1_CACHE_BYTES-8];
156}; 154};
157DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate); 155DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
158 156
159void reset_lazy_tlbstate(void);
160#else
161static inline void reset_lazy_tlbstate(void) 157static inline void reset_lazy_tlbstate(void)
162{ 158{
159 percpu_write(cpu_tlbstate.state, 0);
160 percpu_write(cpu_tlbstate.active_mm, &init_mm);
163} 161}
164#endif
165 162
166#endif /* SMP */ 163#endif /* SMP */
167 164
168#ifndef CONFIG_PARAVIRT 165#ifndef CONFIG_PARAVIRT
169#define flush_tlb_others(mask, mm, va) native_flush_tlb_others(&mask, mm, va) 166#define flush_tlb_others(mask, mm, va) native_flush_tlb_others(mask, mm, va)
170#endif 167#endif
171 168
172static inline void flush_tlb_kernel_range(unsigned long start, 169static inline void flush_tlb_kernel_range(unsigned long start,
@@ -175,4 +172,6 @@ static inline void flush_tlb_kernel_range(unsigned long start,
175 flush_tlb_all(); 172 flush_tlb_all();
176} 173}
177 174
175extern void zap_low_mappings(void);
176
178#endif /* _ASM_X86_TLBFLUSH_H */ 177#endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 4e2f2e0aab27..77cfb2cfb386 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -74,6 +74,8 @@ static inline const struct cpumask *cpumask_of_node(int node)
74 return &node_to_cpumask_map[node]; 74 return &node_to_cpumask_map[node];
75} 75}
76 76
77static inline void setup_node_to_cpumask_map(void) { }
78
77#else /* CONFIG_X86_64 */ 79#else /* CONFIG_X86_64 */
78 80
79/* Mappings between node number and cpus on that node. */ 81/* Mappings between node number and cpus on that node. */
@@ -83,7 +85,8 @@ extern cpumask_t *node_to_cpumask_map;
83DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map); 85DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
84 86
85/* Returns the number of the current Node. */ 87/* Returns the number of the current Node. */
86#define numa_node_id() read_pda(nodenumber) 88DECLARE_PER_CPU(int, node_number);
89#define numa_node_id() percpu_read(node_number)
87 90
88#ifdef CONFIG_DEBUG_PER_CPU_MAPS 91#ifdef CONFIG_DEBUG_PER_CPU_MAPS
89extern int cpu_to_node(int cpu); 92extern int cpu_to_node(int cpu);
@@ -102,10 +105,7 @@ static inline int cpu_to_node(int cpu)
102/* Same function but used if called before per_cpu areas are setup */ 105/* Same function but used if called before per_cpu areas are setup */
103static inline int early_cpu_to_node(int cpu) 106static inline int early_cpu_to_node(int cpu)
104{ 107{
105 if (early_per_cpu_ptr(x86_cpu_to_node_map)) 108 return early_per_cpu(x86_cpu_to_node_map, cpu);
106 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
107
108 return per_cpu(x86_cpu_to_node_map, cpu);
109} 109}
110 110
111/* Returns a pointer to the cpumask of CPUs on Node 'node'. */ 111/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
@@ -122,6 +122,8 @@ static inline cpumask_t node_to_cpumask(int node)
122 122
123#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ 123#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
124 124
125extern void setup_node_to_cpumask_map(void);
126
125/* 127/*
126 * Replace default node_to_cpumask_ptr with optimized version 128 * Replace default node_to_cpumask_ptr with optimized version
127 * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)" 129 * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)"
@@ -192,9 +194,20 @@ extern int __node_distance(int, int);
192 194
193#else /* !CONFIG_NUMA */ 195#else /* !CONFIG_NUMA */
194 196
195#define numa_node_id() 0 197static inline int numa_node_id(void)
196#define cpu_to_node(cpu) 0 198{
197#define early_cpu_to_node(cpu) 0 199 return 0;
200}
201
202static inline int cpu_to_node(int cpu)
203{
204 return 0;
205}
206
207static inline int early_cpu_to_node(int cpu)
208{
209 return 0;
210}
198 211
199static inline const cpumask_t *cpumask_of_node(int node) 212static inline const cpumask_t *cpumask_of_node(int node)
200{ 213{
@@ -209,6 +222,8 @@ static inline int node_to_first_cpu(int node)
209 return first_cpu(cpu_online_map); 222 return first_cpu(cpu_online_map);
210} 223}
211 224
225static inline void setup_node_to_cpumask_map(void) { }
226
212/* 227/*
213 * Replace default node_to_cpumask_ptr with optimized version 228 * Replace default node_to_cpumask_ptr with optimized version
214 * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)" 229 * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)"
diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h
index 780ba0ab94f9..90f06c25221d 100644
--- a/arch/x86/include/asm/trampoline.h
+++ b/arch/x86/include/asm/trampoline.h
@@ -13,6 +13,7 @@ extern unsigned char *trampoline_base;
13 13
14extern unsigned long init_rsp; 14extern unsigned long init_rsp;
15extern unsigned long initial_code; 15extern unsigned long initial_code;
16extern unsigned long initial_gs;
16 17
17#define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE) 18#define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE)
18#define TRAMPOLINE_BASE 0x6000 19#define TRAMPOLINE_BASE 0x6000
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index cf3bb053da0b..0d5342515b86 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -41,7 +41,7 @@ dotraplinkage void do_int3(struct pt_regs *, long);
41dotraplinkage void do_overflow(struct pt_regs *, long); 41dotraplinkage void do_overflow(struct pt_regs *, long);
42dotraplinkage void do_bounds(struct pt_regs *, long); 42dotraplinkage void do_bounds(struct pt_regs *, long);
43dotraplinkage void do_invalid_op(struct pt_regs *, long); 43dotraplinkage void do_invalid_op(struct pt_regs *, long);
44dotraplinkage void do_device_not_available(struct pt_regs); 44dotraplinkage void do_device_not_available(struct pt_regs *, long);
45dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *, long); 45dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *, long);
46dotraplinkage void do_invalid_TSS(struct pt_regs *, long); 46dotraplinkage void do_invalid_TSS(struct pt_regs *, long);
47dotraplinkage void do_segment_not_present(struct pt_regs *, long); 47dotraplinkage void do_segment_not_present(struct pt_regs *, long);
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 4340055b7559..b685ece89d5c 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -121,7 +121,7 @@ extern int __get_user_bad(void);
121 121
122#define __get_user_x(size, ret, x, ptr) \ 122#define __get_user_x(size, ret, x, ptr) \
123 asm volatile("call __get_user_" #size \ 123 asm volatile("call __get_user_" #size \
124 : "=a" (ret),"=d" (x) \ 124 : "=a" (ret), "=d" (x) \
125 : "0" (ptr)) \ 125 : "0" (ptr)) \
126 126
127/* Careful: we have to cast the result to the type of the pointer 127/* Careful: we have to cast the result to the type of the pointer
@@ -181,12 +181,12 @@ extern int __get_user_bad(void);
181 181
182#define __put_user_x(size, x, ptr, __ret_pu) \ 182#define __put_user_x(size, x, ptr, __ret_pu) \
183 asm volatile("call __put_user_" #size : "=a" (__ret_pu) \ 183 asm volatile("call __put_user_" #size : "=a" (__ret_pu) \
184 :"0" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx") 184 : "0" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx")
185 185
186 186
187 187
188#ifdef CONFIG_X86_32 188#ifdef CONFIG_X86_32
189#define __put_user_u64(x, addr, err) \ 189#define __put_user_asm_u64(x, addr, err, errret) \
190 asm volatile("1: movl %%eax,0(%2)\n" \ 190 asm volatile("1: movl %%eax,0(%2)\n" \
191 "2: movl %%edx,4(%2)\n" \ 191 "2: movl %%edx,4(%2)\n" \
192 "3:\n" \ 192 "3:\n" \
@@ -197,14 +197,24 @@ extern int __get_user_bad(void);
197 _ASM_EXTABLE(1b, 4b) \ 197 _ASM_EXTABLE(1b, 4b) \
198 _ASM_EXTABLE(2b, 4b) \ 198 _ASM_EXTABLE(2b, 4b) \
199 : "=r" (err) \ 199 : "=r" (err) \
200 : "A" (x), "r" (addr), "i" (-EFAULT), "0" (err)) 200 : "A" (x), "r" (addr), "i" (errret), "0" (err))
201
202#define __put_user_asm_ex_u64(x, addr) \
203 asm volatile("1: movl %%eax,0(%1)\n" \
204 "2: movl %%edx,4(%1)\n" \
205 "3:\n" \
206 _ASM_EXTABLE(1b, 2b - 1b) \
207 _ASM_EXTABLE(2b, 3b - 2b) \
208 : : "A" (x), "r" (addr))
201 209
202#define __put_user_x8(x, ptr, __ret_pu) \ 210#define __put_user_x8(x, ptr, __ret_pu) \
203 asm volatile("call __put_user_8" : "=a" (__ret_pu) \ 211 asm volatile("call __put_user_8" : "=a" (__ret_pu) \
204 : "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx") 212 : "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx")
205#else 213#else
206#define __put_user_u64(x, ptr, retval) \ 214#define __put_user_asm_u64(x, ptr, retval, errret) \
207 __put_user_asm(x, ptr, retval, "q", "", "Zr", -EFAULT) 215 __put_user_asm(x, ptr, retval, "q", "", "Zr", errret)
216#define __put_user_asm_ex_u64(x, addr) \
217 __put_user_asm_ex(x, addr, "q", "", "Zr")
208#define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu) 218#define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu)
209#endif 219#endif
210 220
@@ -276,10 +286,32 @@ do { \
276 __put_user_asm(x, ptr, retval, "w", "w", "ir", errret); \ 286 __put_user_asm(x, ptr, retval, "w", "w", "ir", errret); \
277 break; \ 287 break; \
278 case 4: \ 288 case 4: \
279 __put_user_asm(x, ptr, retval, "l", "k", "ir", errret);\ 289 __put_user_asm(x, ptr, retval, "l", "k", "ir", errret); \
280 break; \ 290 break; \
281 case 8: \ 291 case 8: \
282 __put_user_u64((__typeof__(*ptr))(x), ptr, retval); \ 292 __put_user_asm_u64((__typeof__(*ptr))(x), ptr, retval, \
293 errret); \
294 break; \
295 default: \
296 __put_user_bad(); \
297 } \
298} while (0)
299
300#define __put_user_size_ex(x, ptr, size) \
301do { \
302 __chk_user_ptr(ptr); \
303 switch (size) { \
304 case 1: \
305 __put_user_asm_ex(x, ptr, "b", "b", "iq"); \
306 break; \
307 case 2: \
308 __put_user_asm_ex(x, ptr, "w", "w", "ir"); \
309 break; \
310 case 4: \
311 __put_user_asm_ex(x, ptr, "l", "k", "ir"); \
312 break; \
313 case 8: \
314 __put_user_asm_ex_u64((__typeof__(*ptr))(x), ptr); \
283 break; \ 315 break; \
284 default: \ 316 default: \
285 __put_user_bad(); \ 317 __put_user_bad(); \
@@ -311,9 +343,12 @@ do { \
311 343
312#ifdef CONFIG_X86_32 344#ifdef CONFIG_X86_32
313#define __get_user_asm_u64(x, ptr, retval, errret) (x) = __get_user_bad() 345#define __get_user_asm_u64(x, ptr, retval, errret) (x) = __get_user_bad()
346#define __get_user_asm_ex_u64(x, ptr) (x) = __get_user_bad()
314#else 347#else
315#define __get_user_asm_u64(x, ptr, retval, errret) \ 348#define __get_user_asm_u64(x, ptr, retval, errret) \
316 __get_user_asm(x, ptr, retval, "q", "", "=r", errret) 349 __get_user_asm(x, ptr, retval, "q", "", "=r", errret)
350#define __get_user_asm_ex_u64(x, ptr) \
351 __get_user_asm_ex(x, ptr, "q", "", "=r")
317#endif 352#endif
318 353
319#define __get_user_size(x, ptr, size, retval, errret) \ 354#define __get_user_size(x, ptr, size, retval, errret) \
@@ -350,6 +385,33 @@ do { \
350 : "=r" (err), ltype(x) \ 385 : "=r" (err), ltype(x) \
351 : "m" (__m(addr)), "i" (errret), "0" (err)) 386 : "m" (__m(addr)), "i" (errret), "0" (err))
352 387
388#define __get_user_size_ex(x, ptr, size) \
389do { \
390 __chk_user_ptr(ptr); \
391 switch (size) { \
392 case 1: \
393 __get_user_asm_ex(x, ptr, "b", "b", "=q"); \
394 break; \
395 case 2: \
396 __get_user_asm_ex(x, ptr, "w", "w", "=r"); \
397 break; \
398 case 4: \
399 __get_user_asm_ex(x, ptr, "l", "k", "=r"); \
400 break; \
401 case 8: \
402 __get_user_asm_ex_u64(x, ptr); \
403 break; \
404 default: \
405 (x) = __get_user_bad(); \
406 } \
407} while (0)
408
409#define __get_user_asm_ex(x, addr, itype, rtype, ltype) \
410 asm volatile("1: mov"itype" %1,%"rtype"0\n" \
411 "2:\n" \
412 _ASM_EXTABLE(1b, 2b - 1b) \
413 : ltype(x) : "m" (__m(addr)))
414
353#define __put_user_nocheck(x, ptr, size) \ 415#define __put_user_nocheck(x, ptr, size) \
354({ \ 416({ \
355 int __pu_err; \ 417 int __pu_err; \
@@ -385,6 +447,26 @@ struct __large_struct { unsigned long buf[100]; };
385 _ASM_EXTABLE(1b, 3b) \ 447 _ASM_EXTABLE(1b, 3b) \
386 : "=r"(err) \ 448 : "=r"(err) \
387 : ltype(x), "m" (__m(addr)), "i" (errret), "0" (err)) 449 : ltype(x), "m" (__m(addr)), "i" (errret), "0" (err))
450
451#define __put_user_asm_ex(x, addr, itype, rtype, ltype) \
452 asm volatile("1: mov"itype" %"rtype"0,%1\n" \
453 "2:\n" \
454 _ASM_EXTABLE(1b, 2b - 1b) \
455 : : ltype(x), "m" (__m(addr)))
456
457/*
458 * uaccess_try and catch
459 */
460#define uaccess_try do { \
461 int prev_err = current_thread_info()->uaccess_err; \
462 current_thread_info()->uaccess_err = 0; \
463 barrier();
464
465#define uaccess_catch(err) \
466 (err) |= current_thread_info()->uaccess_err; \
467 current_thread_info()->uaccess_err = prev_err; \
468} while (0)
469
388/** 470/**
389 * __get_user: - Get a simple variable from user space, with less checking. 471 * __get_user: - Get a simple variable from user space, with less checking.
390 * @x: Variable to store result. 472 * @x: Variable to store result.
@@ -408,6 +490,7 @@ struct __large_struct { unsigned long buf[100]; };
408 490
409#define __get_user(x, ptr) \ 491#define __get_user(x, ptr) \
410 __get_user_nocheck((x), (ptr), sizeof(*(ptr))) 492 __get_user_nocheck((x), (ptr), sizeof(*(ptr)))
493
411/** 494/**
412 * __put_user: - Write a simple value into user space, with less checking. 495 * __put_user: - Write a simple value into user space, with less checking.
413 * @x: Value to copy to user space. 496 * @x: Value to copy to user space.
@@ -435,6 +518,45 @@ struct __large_struct { unsigned long buf[100]; };
435#define __put_user_unaligned __put_user 518#define __put_user_unaligned __put_user
436 519
437/* 520/*
521 * {get|put}_user_try and catch
522 *
523 * get_user_try {
524 * get_user_ex(...);
525 * } get_user_catch(err)
526 */
527#define get_user_try uaccess_try
528#define get_user_catch(err) uaccess_catch(err)
529
530#define get_user_ex(x, ptr) do { \
531 unsigned long __gue_val; \
532 __get_user_size_ex((__gue_val), (ptr), (sizeof(*(ptr)))); \
533 (x) = (__force __typeof__(*(ptr)))__gue_val; \
534} while (0)
535
536#ifdef CONFIG_X86_WP_WORKS_OK
537
538#define put_user_try uaccess_try
539#define put_user_catch(err) uaccess_catch(err)
540
541#define put_user_ex(x, ptr) \
542 __put_user_size_ex((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)))
543
544#else /* !CONFIG_X86_WP_WORKS_OK */
545
546#define put_user_try do { \
547 int __uaccess_err = 0;
548
549#define put_user_catch(err) \
550 (err) |= __uaccess_err; \
551} while (0)
552
553#define put_user_ex(x, ptr) do { \
554 __uaccess_err |= __put_user(x, ptr); \
555} while (0)
556
557#endif /* CONFIG_X86_WP_WORKS_OK */
558
559/*
438 * movsl can be slow when source and dest are not both 8-byte aligned 560 * movsl can be slow when source and dest are not both 8-byte aligned
439 */ 561 */
440#ifdef CONFIG_X86_INTEL_USERCOPY 562#ifdef CONFIG_X86_INTEL_USERCOPY
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 84210c479fca..8cc687326eb8 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -188,16 +188,16 @@ __copy_to_user_inatomic(void __user *dst, const void *src, unsigned size)
188extern long __copy_user_nocache(void *dst, const void __user *src, 188extern long __copy_user_nocache(void *dst, const void __user *src,
189 unsigned size, int zerorest); 189 unsigned size, int zerorest);
190 190
191static inline int __copy_from_user_nocache(void *dst, const void __user *src, 191static inline int
192 unsigned size) 192__copy_from_user_nocache(void *dst, const void __user *src, unsigned size)
193{ 193{
194 might_sleep(); 194 might_sleep();
195 return __copy_user_nocache(dst, src, size, 1); 195 return __copy_user_nocache(dst, src, size, 1);
196} 196}
197 197
198static inline int __copy_from_user_inatomic_nocache(void *dst, 198static inline int
199 const void __user *src, 199__copy_from_user_inatomic_nocache(void *dst, const void __user *src,
200 unsigned size) 200 unsigned size)
201{ 201{
202 return __copy_user_nocache(dst, src, size, 0); 202 return __copy_user_nocache(dst, src, size, 0);
203} 203}
diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
new file mode 100644
index 000000000000..c0a01b5d985b
--- /dev/null
+++ b/arch/x86/include/asm/uv/uv.h
@@ -0,0 +1,33 @@
1#ifndef _ASM_X86_UV_UV_H
2#define _ASM_X86_UV_UV_H
3
4enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
5
6struct cpumask;
7struct mm_struct;
8
9#ifdef CONFIG_X86_UV
10
11extern enum uv_system_type get_uv_system_type(void);
12extern int is_uv_system(void);
13extern void uv_cpu_init(void);
14extern void uv_system_init(void);
15extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
16 struct mm_struct *mm,
17 unsigned long va,
18 unsigned int cpu);
19
20#else /* X86_UV */
21
22static inline enum uv_system_type get_uv_system_type(void) { return UV_NONE; }
23static inline int is_uv_system(void) { return 0; }
24static inline void uv_cpu_init(void) { }
25static inline void uv_system_init(void) { }
26static inline const struct cpumask *
27uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm,
28 unsigned long va, unsigned int cpu)
29{ return cpumask; }
30
31#endif /* X86_UV */
32
33#endif /* _ASM_X86_UV_UV_H */
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 50423c7b56b2..9b0e61bf7a88 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -325,7 +325,6 @@ static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits)
325#define cpubit_isset(cpu, bau_local_cpumask) \ 325#define cpubit_isset(cpu, bau_local_cpumask) \
326 test_bit((cpu), (bau_local_cpumask).bits) 326 test_bit((cpu), (bau_local_cpumask).bits)
327 327
328extern int uv_flush_tlb_others(cpumask_t *, struct mm_struct *, unsigned long);
329extern void uv_bau_message_intr1(void); 328extern void uv_bau_message_intr1(void);
330extern void uv_bau_timeout_intr1(void); 329extern void uv_bau_timeout_intr1(void);
331 330
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 777327ef05c1..9f4dfba33b28 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -199,6 +199,10 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
199#define SCIR_CPU_ACTIVITY 0x02 /* not idle */ 199#define SCIR_CPU_ACTIVITY 0x02 /* not idle */
200#define SCIR_CPU_HB_INTERVAL (HZ) /* once per second */ 200#define SCIR_CPU_HB_INTERVAL (HZ) /* once per second */
201 201
202/* Loop through all installed blades */
203#define for_each_possible_blade(bid) \
204 for ((bid) = 0; (bid) < uv_num_possible_blades(); (bid)++)
205
202/* 206/*
203 * Macros for converting between kernel virtual addresses, socket local physical 207 * Macros for converting between kernel virtual addresses, socket local physical
204 * addresses, and UV global physical addresses. 208 * addresses, and UV global physical addresses.
diff --git a/arch/x86/include/asm/vic.h b/arch/x86/include/asm/vic.h
deleted file mode 100644
index 53100f353612..000000000000
--- a/arch/x86/include/asm/vic.h
+++ /dev/null
@@ -1,61 +0,0 @@
1/* Copyright (C) 1999,2001
2 *
3 * Author: J.E.J.Bottomley@HansenPartnership.com
4 *
5 * Standard include definitions for the NCR Voyager Interrupt Controller */
6
7/* The eight CPI vectors. To activate a CPI, you write a bit mask
8 * corresponding to the processor set to be interrupted into the
9 * relevant register. That set of CPUs will then be interrupted with
10 * the CPI */
11static const int VIC_CPI_Registers[] =
12 {0xFC00, 0xFC01, 0xFC08, 0xFC09,
13 0xFC10, 0xFC11, 0xFC18, 0xFC19 };
14
15#define VIC_PROC_WHO_AM_I 0xfc29
16# define QUAD_IDENTIFIER 0xC0
17# define EIGHT_SLOT_IDENTIFIER 0xE0
18#define QIC_EXTENDED_PROCESSOR_SELECT 0xFC72
19#define VIC_CPI_BASE_REGISTER 0xFC41
20#define VIC_PROCESSOR_ID 0xFC21
21# define VIC_CPU_MASQUERADE_ENABLE 0x8
22
23#define VIC_CLAIM_REGISTER_0 0xFC38
24#define VIC_CLAIM_REGISTER_1 0xFC39
25#define VIC_REDIRECT_REGISTER_0 0xFC60
26#define VIC_REDIRECT_REGISTER_1 0xFC61
27#define VIC_PRIORITY_REGISTER 0xFC20
28
29#define VIC_PRIMARY_MC_BASE 0xFC48
30#define VIC_SECONDARY_MC_BASE 0xFC49
31
32#define QIC_PROCESSOR_ID 0xFC71
33# define QIC_CPUID_ENABLE 0x08
34
35#define QIC_VIC_CPI_BASE_REGISTER 0xFC79
36#define QIC_CPI_BASE_REGISTER 0xFC7A
37
38#define QIC_MASK_REGISTER0 0xFC80
39/* NOTE: these are masked high, enabled low */
40# define QIC_PERF_TIMER 0x01
41# define QIC_LPE 0x02
42# define QIC_SYS_INT 0x04
43# define QIC_CMN_INT 0x08
44/* at the moment, just enable CMN_INT, disable SYS_INT */
45# define QIC_DEFAULT_MASK0 (~(QIC_CMN_INT /* | VIC_SYS_INT */))
46#define QIC_MASK_REGISTER1 0xFC81
47# define QIC_BOOT_CPI_MASK 0xFE
48/* Enable CPI's 1-6 inclusive */
49# define QIC_CPI_ENABLE 0x81
50
51#define QIC_INTERRUPT_CLEAR0 0xFC8A
52#define QIC_INTERRUPT_CLEAR1 0xFC8B
53
54/* this is where we place the CPI vectors */
55#define VIC_DEFAULT_CPI_BASE 0xC0
56/* this is where we place the QIC CPI vectors */
57#define QIC_DEFAULT_CPI_BASE 0xD0
58
59#define VIC_BOOT_INTERRUPT_MASK 0xfe
60
61extern void smp_vic_timer_interrupt(void);
diff --git a/arch/x86/include/asm/voyager.h b/arch/x86/include/asm/voyager.h
deleted file mode 100644
index b3e647307625..000000000000
--- a/arch/x86/include/asm/voyager.h
+++ /dev/null
@@ -1,529 +0,0 @@
1/* Copyright (C) 1999,2001
2 *
3 * Author: J.E.J.Bottomley@HansenPartnership.com
4 *
5 * Standard include definitions for the NCR Voyager system */
6
7#undef VOYAGER_DEBUG
8#undef VOYAGER_CAT_DEBUG
9
10#ifdef VOYAGER_DEBUG
11#define VDEBUG(x) printk x
12#else
13#define VDEBUG(x)
14#endif
15
16/* There are three levels of voyager machine: 3,4 and 5. The rule is
17 * if it's less than 3435 it's a Level 3 except for a 3360 which is
18 * a level 4. A 3435 or above is a Level 5 */
19#define VOYAGER_LEVEL5_AND_ABOVE 0x3435
20#define VOYAGER_LEVEL4 0x3360
21
22/* The L4 DINO ASIC */
23#define VOYAGER_DINO 0x43
24
25/* voyager ports in standard I/O space */
26#define VOYAGER_MC_SETUP 0x96
27
28
29#define VOYAGER_CAT_CONFIG_PORT 0x97
30# define VOYAGER_CAT_DESELECT 0xff
31#define VOYAGER_SSPB_RELOCATION_PORT 0x98
32
33/* Valid CAT controller commands */
34/* start instruction register cycle */
35#define VOYAGER_CAT_IRCYC 0x01
36/* start data register cycle */
37#define VOYAGER_CAT_DRCYC 0x02
38/* move to execute state */
39#define VOYAGER_CAT_RUN 0x0F
40/* end operation */
41#define VOYAGER_CAT_END 0x80
42/* hold in idle state */
43#define VOYAGER_CAT_HOLD 0x90
44/* single step an "intest" vector */
45#define VOYAGER_CAT_STEP 0xE0
46/* return cat controller to CLEMSON mode */
47#define VOYAGER_CAT_CLEMSON 0xFF
48
49/* the default cat command header */
50#define VOYAGER_CAT_HEADER 0x7F
51
52/* the range of possible CAT module ids in the system */
53#define VOYAGER_MIN_MODULE 0x10
54#define VOYAGER_MAX_MODULE 0x1f
55
56/* The voyager registers per asic */
57#define VOYAGER_ASIC_ID_REG 0x00
58#define VOYAGER_ASIC_TYPE_REG 0x01
59/* the sub address registers can be made auto incrementing on reads */
60#define VOYAGER_AUTO_INC_REG 0x02
61# define VOYAGER_AUTO_INC 0x04
62# define VOYAGER_NO_AUTO_INC 0xfb
63#define VOYAGER_SUBADDRDATA 0x03
64#define VOYAGER_SCANPATH 0x05
65# define VOYAGER_CONNECT_ASIC 0x01
66# define VOYAGER_DISCONNECT_ASIC 0xfe
67#define VOYAGER_SUBADDRLO 0x06
68#define VOYAGER_SUBADDRHI 0x07
69#define VOYAGER_SUBMODSELECT 0x08
70#define VOYAGER_SUBMODPRESENT 0x09
71
72#define VOYAGER_SUBADDR_LO 0xff
73#define VOYAGER_SUBADDR_HI 0xffff
74
75/* the maximum size of a scan path -- used to form instructions */
76#define VOYAGER_MAX_SCAN_PATH 0x100
77/* the biggest possible register size (in bytes) */
78#define VOYAGER_MAX_REG_SIZE 4
79
80/* Total number of possible modules (including submodules) */
81#define VOYAGER_MAX_MODULES 16
82/* Largest number of asics per module */
83#define VOYAGER_MAX_ASICS_PER_MODULE 7
84
85/* the CAT asic of each module is always the first one */
86#define VOYAGER_CAT_ID 0
87#define VOYAGER_PSI 0x1a
88
89/* voyager instruction operations and registers */
90#define VOYAGER_READ_CONFIG 0x1
91#define VOYAGER_WRITE_CONFIG 0x2
92#define VOYAGER_BYPASS 0xff
93
94typedef struct voyager_asic {
95 __u8 asic_addr; /* ASIC address; Level 4 */
96 __u8 asic_type; /* ASIC type */
97 __u8 asic_id; /* ASIC id */
98 __u8 jtag_id[4]; /* JTAG id */
99 __u8 asic_location; /* Location within scan path; start w/ 0 */
100 __u8 bit_location; /* Location within bit stream; start w/ 0 */
101 __u8 ireg_length; /* Instruction register length */
102 __u16 subaddr; /* Amount of sub address space */
103 struct voyager_asic *next; /* Next asic in linked list */
104} voyager_asic_t;
105
106typedef struct voyager_module {
107 __u8 module_addr; /* Module address */
108 __u8 scan_path_connected; /* Scan path connected */
109 __u16 ee_size; /* Size of the EEPROM */
110 __u16 num_asics; /* Number of Asics */
111 __u16 inst_bits; /* Instruction bits in the scan path */
112 __u16 largest_reg; /* Largest register in the scan path */
113 __u16 smallest_reg; /* Smallest register in the scan path */
114 voyager_asic_t *asic; /* First ASIC in scan path (CAT_I) */
115 struct voyager_module *submodule; /* Submodule pointer */
116 struct voyager_module *next; /* Next module in linked list */
117} voyager_module_t;
118
119typedef struct voyager_eeprom_hdr {
120 __u8 module_id[4];
121 __u8 version_id;
122 __u8 config_id;
123 __u16 boundry_id; /* boundary scan id */
124 __u16 ee_size; /* size of EEPROM */
125 __u8 assembly[11]; /* assembly # */
126 __u8 assembly_rev; /* assembly rev */
127 __u8 tracer[4]; /* tracer number */
128 __u16 assembly_cksum; /* asm checksum */
129 __u16 power_consump; /* pwr requirements */
130 __u16 num_asics; /* number of asics */
131 __u16 bist_time; /* min. bist time */
132 __u16 err_log_offset; /* error log offset */
133 __u16 scan_path_offset;/* scan path offset */
134 __u16 cct_offset;
135 __u16 log_length; /* length of err log */
136 __u16 xsum_end; /* offset to end of
137 checksum */
138 __u8 reserved[4];
139 __u8 sflag; /* starting sentinal */
140 __u8 part_number[13]; /* prom part number */
141 __u8 version[10]; /* version number */
142 __u8 signature[8];
143 __u16 eeprom_chksum;
144 __u32 data_stamp_offset;
145 __u8 eflag ; /* ending sentinal */
146} __attribute__((packed)) voyager_eprom_hdr_t;
147
148
149
150#define VOYAGER_EPROM_SIZE_OFFSET \
151 ((__u16)(&(((voyager_eprom_hdr_t *)0)->ee_size)))
152#define VOYAGER_XSUM_END_OFFSET 0x2a
153
154/* the following three definitions are for internal table layouts
155 * in the module EPROMs. We really only care about the IDs and
156 * offsets */
157typedef struct voyager_sp_table {
158 __u8 asic_id;
159 __u8 bypass_flag;
160 __u16 asic_data_offset;
161 __u16 config_data_offset;
162} __attribute__((packed)) voyager_sp_table_t;
163
164typedef struct voyager_jtag_table {
165 __u8 icode[4];
166 __u8 runbist[4];
167 __u8 intest[4];
168 __u8 samp_preld[4];
169 __u8 ireg_len;
170} __attribute__((packed)) voyager_jtt_t;
171
172typedef struct voyager_asic_data_table {
173 __u8 jtag_id[4];
174 __u16 length_bsr;
175 __u16 length_bist_reg;
176 __u32 bist_clk;
177 __u16 subaddr_bits;
178 __u16 seed_bits;
179 __u16 sig_bits;
180 __u16 jtag_offset;
181} __attribute__((packed)) voyager_at_t;
182
183/* Voyager Interrupt Controller (VIC) registers */
184
185/* Base to add to Cross Processor Interrupts (CPIs) when triggering
186 * the CPU IRQ line */
187/* register defines for the WCBICs (one per processor) */
188#define VOYAGER_WCBIC0 0x41 /* bus A node P1 processor 0 */
189#define VOYAGER_WCBIC1 0x49 /* bus A node P1 processor 1 */
190#define VOYAGER_WCBIC2 0x51 /* bus A node P2 processor 0 */
191#define VOYAGER_WCBIC3 0x59 /* bus A node P2 processor 1 */
192#define VOYAGER_WCBIC4 0x61 /* bus B node P1 processor 0 */
193#define VOYAGER_WCBIC5 0x69 /* bus B node P1 processor 1 */
194#define VOYAGER_WCBIC6 0x71 /* bus B node P2 processor 0 */
195#define VOYAGER_WCBIC7 0x79 /* bus B node P2 processor 1 */
196
197
198/* top of memory registers */
199#define VOYAGER_WCBIC_TOM_L 0x4
200#define VOYAGER_WCBIC_TOM_H 0x5
201
202/* register defines for Voyager Memory Contol (VMC)
203 * these are present on L4 machines only */
204#define VOYAGER_VMC1 0x81
205#define VOYAGER_VMC2 0x91
206#define VOYAGER_VMC3 0xa1
207#define VOYAGER_VMC4 0xb1
208
209/* VMC Ports */
210#define VOYAGER_VMC_MEMORY_SETUP 0x9
211# define VMC_Interleaving 0x01
212# define VMC_4Way 0x02
213# define VMC_EvenCacheLines 0x04
214# define VMC_HighLine 0x08
215# define VMC_Start0_Enable 0x20
216# define VMC_Start1_Enable 0x40
217# define VMC_Vremap 0x80
218#define VOYAGER_VMC_BANK_DENSITY 0xa
219# define VMC_BANK_EMPTY 0
220# define VMC_BANK_4MB 1
221# define VMC_BANK_16MB 2
222# define VMC_BANK_64MB 3
223# define VMC_BANK0_MASK 0x03
224# define VMC_BANK1_MASK 0x0C
225# define VMC_BANK2_MASK 0x30
226# define VMC_BANK3_MASK 0xC0
227
228/* Magellan Memory Controller (MMC) defines - present on L5 */
229#define VOYAGER_MMC_ASIC_ID 1
230/* the two memory modules corresponding to memory cards in the system */
231#define VOYAGER_MMC_MEMORY0_MODULE 0x14
232#define VOYAGER_MMC_MEMORY1_MODULE 0x15
233/* the Magellan Memory Address (MMA) defines */
234#define VOYAGER_MMA_ASIC_ID 2
235
236/* Submodule number for the Quad Baseboard */
237#define VOYAGER_QUAD_BASEBOARD 1
238
239/* ASIC defines for the Quad Baseboard */
240#define VOYAGER_QUAD_QDATA0 1
241#define VOYAGER_QUAD_QDATA1 2
242#define VOYAGER_QUAD_QABC 3
243
244/* Useful areas in extended CMOS */
245#define VOYAGER_PROCESSOR_PRESENT_MASK 0x88a
246#define VOYAGER_MEMORY_CLICKMAP 0xa23
247#define VOYAGER_DUMP_LOCATION 0xb1a
248
249/* SUS In Control bit - used to tell SUS that we don't need to be
250 * babysat anymore */
251#define VOYAGER_SUS_IN_CONTROL_PORT 0x3ff
252# define VOYAGER_IN_CONTROL_FLAG 0x80
253
254/* Voyager PSI defines */
255#define VOYAGER_PSI_STATUS_REG 0x08
256# define PSI_DC_FAIL 0x01
257# define PSI_MON 0x02
258# define PSI_FAULT 0x04
259# define PSI_ALARM 0x08
260# define PSI_CURRENT 0x10
261# define PSI_DVM 0x20
262# define PSI_PSCFAULT 0x40
263# define PSI_STAT_CHG 0x80
264
265#define VOYAGER_PSI_SUPPLY_REG 0x8000
266 /* read */
267# define PSI_FAIL_DC 0x01
268# define PSI_FAIL_AC 0x02
269# define PSI_MON_INT 0x04
270# define PSI_SWITCH_OFF 0x08
271# define PSI_HX_OFF 0x10
272# define PSI_SECURITY 0x20
273# define PSI_CMOS_BATT_LOW 0x40
274# define PSI_CMOS_BATT_FAIL 0x80
275 /* write */
276# define PSI_CLR_SWITCH_OFF 0x13
277# define PSI_CLR_HX_OFF 0x14
278# define PSI_CLR_CMOS_BATT_FAIL 0x17
279
280#define VOYAGER_PSI_MASK 0x8001
281# define PSI_MASK_MASK 0x10
282
283#define VOYAGER_PSI_AC_FAIL_REG 0x8004
284#define AC_FAIL_STAT_CHANGE 0x80
285
286#define VOYAGER_PSI_GENERAL_REG 0x8007
287 /* read */
288# define PSI_SWITCH_ON 0x01
289# define PSI_SWITCH_ENABLED 0x02
290# define PSI_ALARM_ENABLED 0x08
291# define PSI_SECURE_ENABLED 0x10
292# define PSI_COLD_RESET 0x20
293# define PSI_COLD_START 0x80
294 /* write */
295# define PSI_POWER_DOWN 0x10
296# define PSI_SWITCH_DISABLE 0x01
297# define PSI_SWITCH_ENABLE 0x11
298# define PSI_CLEAR 0x12
299# define PSI_ALARM_DISABLE 0x03
300# define PSI_ALARM_ENABLE 0x13
301# define PSI_CLEAR_COLD_RESET 0x05
302# define PSI_SET_COLD_RESET 0x15
303# define PSI_CLEAR_COLD_START 0x07
304# define PSI_SET_COLD_START 0x17
305
306
307
308struct voyager_bios_info {
309 __u8 len;
310 __u8 major;
311 __u8 minor;
312 __u8 debug;
313 __u8 num_classes;
314 __u8 class_1;
315 __u8 class_2;
316};
317
318/* The following structures and definitions are for the Kernel/SUS
319 * interface these are needed to find out how SUS initialised any Quad
320 * boards in the system */
321
322#define NUMBER_OF_MC_BUSSES 2
323#define SLOTS_PER_MC_BUS 8
324#define MAX_CPUS 16 /* 16 way CPU system */
325#define MAX_PROCESSOR_BOARDS 4 /* 4 processor slot system */
326#define MAX_CACHE_LEVELS 4 /* # of cache levels supported */
327#define MAX_SHARED_CPUS 4 /* # of CPUs that can share a LARC */
328#define NUMBER_OF_POS_REGS 8
329
330typedef struct {
331 __u8 MC_Slot;
332 __u8 POS_Values[NUMBER_OF_POS_REGS];
333} __attribute__((packed)) MC_SlotInformation_t;
334
335struct QuadDescription {
336 __u8 Type; /* for type 0 (DYADIC or MONADIC) all fields
337 * will be zero except for slot */
338 __u8 StructureVersion;
339 __u32 CPI_BaseAddress;
340 __u32 LARC_BankSize;
341 __u32 LocalMemoryStateBits;
342 __u8 Slot; /* Processor slots 1 - 4 */
343} __attribute__((packed));
344
345struct ProcBoardInfo {
346 __u8 Type;
347 __u8 StructureVersion;
348 __u8 NumberOfBoards;
349 struct QuadDescription QuadData[MAX_PROCESSOR_BOARDS];
350} __attribute__((packed));
351
352struct CacheDescription {
353 __u8 Level;
354 __u32 TotalSize;
355 __u16 LineSize;
356 __u8 Associativity;
357 __u8 CacheType;
358 __u8 WriteType;
359 __u8 Number_CPUs_SharedBy;
360 __u8 Shared_CPUs_Hardware_IDs[MAX_SHARED_CPUS];
361
362} __attribute__((packed));
363
364struct CPU_Description {
365 __u8 CPU_HardwareId;
366 char *FRU_String;
367 __u8 NumberOfCacheLevels;
368 struct CacheDescription CacheLevelData[MAX_CACHE_LEVELS];
369} __attribute__((packed));
370
371struct CPU_Info {
372 __u8 Type;
373 __u8 StructureVersion;
374 __u8 NumberOf_CPUs;
375 struct CPU_Description CPU_Data[MAX_CPUS];
376} __attribute__((packed));
377
378
379/*
380 * This structure will be used by SUS and the OS.
381 * The assumption about this structure is that no blank space is
382 * packed in it by our friend the compiler.
383 */
384typedef struct {
385 __u8 Mailbox_SUS; /* Written to by SUS to give
386 commands/response to the OS */
387 __u8 Mailbox_OS; /* Written to by the OS to give
388 commands/response to SUS */
389 __u8 SUS_MailboxVersion; /* Tells the OS which iteration of the
390 interface SUS supports */
391 __u8 OS_MailboxVersion; /* Tells SUS which iteration of the
392 interface the OS supports */
393 __u32 OS_Flags; /* Flags set by the OS as info for
394 SUS */
395 __u32 SUS_Flags; /* Flags set by SUS as info
396 for the OS */
397 __u32 WatchDogPeriod; /* Watchdog period (in seconds) which
398 the DP uses to see if the OS
399 is dead */
400 __u32 WatchDogCount; /* Updated by the OS on every tic. */
401 __u32 MemoryFor_SUS_ErrorLog; /* Flat 32 bit address which tells SUS
402 where to stuff the SUS error log
403 on a dump */
404 MC_SlotInformation_t MC_SlotInfo[NUMBER_OF_MC_BUSSES*SLOTS_PER_MC_BUS];
405 /* Storage for MCA POS data */
406 /* All new SECOND_PASS_INTERFACE fields added from this point */
407 struct ProcBoardInfo *BoardData;
408 struct CPU_Info *CPU_Data;
409 /* All new fields must be added from this point */
410} Voyager_KernelSUS_Mbox_t;
411
412/* structure for finding the right memory address to send a QIC CPI to */
413struct voyager_qic_cpi {
414 /* Each cache line (32 bytes) can trigger a cpi. The cpi
415 * read/write may occur anywhere in the cache line---pick the
416 * middle to be safe */
417 struct {
418 __u32 pad1[3];
419 __u32 cpi;
420 __u32 pad2[4];
421 } qic_cpi[8];
422};
423
424struct voyager_status {
425 __u32 power_fail:1;
426 __u32 switch_off:1;
427 __u32 request_from_kernel:1;
428};
429
430struct voyager_psi_regs {
431 __u8 cat_id;
432 __u8 cat_dev;
433 __u8 cat_control;
434 __u8 subaddr;
435 __u8 dummy4;
436 __u8 checkbit;
437 __u8 subaddr_low;
438 __u8 subaddr_high;
439 __u8 intstatus;
440 __u8 stat1;
441 __u8 stat3;
442 __u8 fault;
443 __u8 tms;
444 __u8 gen;
445 __u8 sysconf;
446 __u8 dummy15;
447};
448
449struct voyager_psi_subregs {
450 __u8 supply;
451 __u8 mask;
452 __u8 present;
453 __u8 DCfail;
454 __u8 ACfail;
455 __u8 fail;
456 __u8 UPSfail;
457 __u8 genstatus;
458};
459
460struct voyager_psi {
461 struct voyager_psi_regs regs;
462 struct voyager_psi_subregs subregs;
463};
464
465struct voyager_SUS {
466#define VOYAGER_DUMP_BUTTON_NMI 0x1
467#define VOYAGER_SUS_VALID 0x2
468#define VOYAGER_SYSINT_COMPLETE 0x3
469 __u8 SUS_mbox;
470#define VOYAGER_NO_COMMAND 0x0
471#define VOYAGER_IGNORE_DUMP 0x1
472#define VOYAGER_DO_DUMP 0x2
473#define VOYAGER_SYSINT_HANDSHAKE 0x3
474#define VOYAGER_DO_MEM_DUMP 0x4
475#define VOYAGER_SYSINT_WAS_RECOVERED 0x5
476 __u8 kernel_mbox;
477#define VOYAGER_MAILBOX_VERSION 0x10
478 __u8 SUS_version;
479 __u8 kernel_version;
480#define VOYAGER_OS_HAS_SYSINT 0x1
481#define VOYAGER_OS_IN_PROGRESS 0x2
482#define VOYAGER_UPDATING_WDPERIOD 0x4
483 __u32 kernel_flags;
484#define VOYAGER_SUS_BOOTING 0x1
485#define VOYAGER_SUS_IN_PROGRESS 0x2
486 __u32 SUS_flags;
487 __u32 watchdog_period;
488 __u32 watchdog_count;
489 __u32 SUS_errorlog;
490 /* lots of system configuration stuff under here */
491};
492
493/* Variables exported by voyager_smp */
494extern __u32 voyager_extended_vic_processors;
495extern __u32 voyager_allowed_boot_processors;
496extern __u32 voyager_quad_processors;
497extern struct voyager_qic_cpi *voyager_quad_cpi_addr[NR_CPUS];
498extern struct voyager_SUS *voyager_SUS;
499
500/* variables exported always */
501extern struct task_struct *voyager_thread;
502extern int voyager_level;
503extern struct voyager_status voyager_status;
504
505/* functions exported by the voyager and voyager_smp modules */
506extern int voyager_cat_readb(__u8 module, __u8 asic, int reg);
507extern void voyager_cat_init(void);
508extern void voyager_detect(struct voyager_bios_info *);
509extern void voyager_trap_init(void);
510extern void voyager_setup_irqs(void);
511extern int voyager_memory_detect(int region, __u32 *addr, __u32 *length);
512extern void voyager_smp_intr_init(void);
513extern __u8 voyager_extended_cmos_read(__u16 cmos_address);
514extern void voyager_smp_dump(void);
515extern void voyager_timer_interrupt(void);
516extern void smp_local_timer_interrupt(void);
517extern void voyager_power_off(void);
518extern void smp_voyager_power_off(void *dummy);
519extern void voyager_restart(void);
520extern void voyager_cat_power_off(void);
521extern void voyager_cat_do_common_interrupt(void);
522extern void voyager_handle_nmi(void);
523extern void voyager_smp_intr_init(void);
524/* Commands for the following are */
525#define VOYAGER_PSI_READ 0
526#define VOYAGER_PSI_WRITE 1
527#define VOYAGER_PSI_SUBREAD 2
528#define VOYAGER_PSI_SUBWRITE 3
529extern void voyager_cat_psi(__u8, __u16, __u8 *);
diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h
index 19144184983a..1df35417c412 100644
--- a/arch/x86/include/asm/xen/events.h
+++ b/arch/x86/include/asm/xen/events.h
@@ -15,10 +15,4 @@ static inline int xen_irqs_disabled(struct pt_regs *regs)
15 return raw_irqs_disabled_flags(regs->flags); 15 return raw_irqs_disabled_flags(regs->flags);
16} 16}
17 17
18static inline void xen_do_IRQ(int irq, struct pt_regs *regs)
19{
20 regs->orig_ax = ~irq;
21 do_IRQ(regs);
22}
23
24#endif /* _ASM_X86_XEN_EVENTS_H */ 18#endif /* _ASM_X86_XEN_EVENTS_H */
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index 81fbd735aec4..d5b7e90c0edf 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -38,22 +38,30 @@ extern struct shared_info *HYPERVISOR_shared_info;
38extern struct start_info *xen_start_info; 38extern struct start_info *xen_start_info;
39 39
40enum xen_domain_type { 40enum xen_domain_type {
41 XEN_NATIVE, 41 XEN_NATIVE, /* running on bare hardware */
42 XEN_PV_DOMAIN, 42 XEN_PV_DOMAIN, /* running in a PV domain */
43 XEN_HVM_DOMAIN, 43 XEN_HVM_DOMAIN, /* running in a Xen hvm domain */
44}; 44};
45 45
46extern enum xen_domain_type xen_domain_type;
47
48#ifdef CONFIG_XEN 46#ifdef CONFIG_XEN
49#define xen_domain() (xen_domain_type != XEN_NATIVE) 47extern enum xen_domain_type xen_domain_type;
50#else 48#else
51#define xen_domain() (0) 49#define xen_domain_type XEN_NATIVE
52#endif 50#endif
53 51
54#define xen_pv_domain() (xen_domain() && xen_domain_type == XEN_PV_DOMAIN) 52#define xen_domain() (xen_domain_type != XEN_NATIVE)
55#define xen_hvm_domain() (xen_domain() && xen_domain_type == XEN_HVM_DOMAIN) 53#define xen_pv_domain() (xen_domain() && \
54 xen_domain_type == XEN_PV_DOMAIN)
55#define xen_hvm_domain() (xen_domain() && \
56 xen_domain_type == XEN_HVM_DOMAIN)
57
58#ifdef CONFIG_XEN_DOM0
59#include <xen/interface/xen.h>
56 60
57#define xen_initial_domain() (xen_pv_domain() && xen_start_info->flags & SIF_INITDOMAIN) 61#define xen_initial_domain() (xen_pv_domain() && \
62 xen_start_info->flags & SIF_INITDOMAIN)
63#else /* !CONFIG_XEN_DOM0 */
64#define xen_initial_domain() (0)
65#endif /* CONFIG_XEN_DOM0 */
58 66
59#endif /* _ASM_X86_XEN_HYPERVISOR_H */ 67#endif /* _ASM_X86_XEN_HYPERVISOR_H */
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 4bd990ee43df..1a918dde46b5 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -164,6 +164,7 @@ static inline pte_t __pte_ma(pteval_t x)
164 164
165 165
166xmaddr_t arbitrary_virt_to_machine(void *address); 166xmaddr_t arbitrary_virt_to_machine(void *address);
167unsigned long arbitrary_virt_to_mfn(void *vaddr);
167void make_lowmem_page_readonly(void *vaddr); 168void make_lowmem_page_readonly(void *vaddr);
168void make_lowmem_page_readwrite(void *vaddr); 169void make_lowmem_page_readwrite(void *vaddr);
169 170
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d364df03c1d6..84000eb931ff 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -23,11 +23,12 @@ nostackp := $(call cc-option, -fno-stack-protector)
23CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) 23CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
24CFLAGS_hpet.o := $(nostackp) 24CFLAGS_hpet.o := $(nostackp)
25CFLAGS_tsc.o := $(nostackp) 25CFLAGS_tsc.o := $(nostackp)
26CFLAGS_paravirt.o := $(nostackp)
26 27
27obj-y := process_$(BITS).o signal.o entry_$(BITS).o 28obj-y := process_$(BITS).o signal.o entry_$(BITS).o
28obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o 29obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
29obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o 30obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o
30obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o 31obj-y += setup.o i8259.o irqinit_$(BITS).o
31obj-$(CONFIG_X86_VISWS) += visws_quirks.o 32obj-$(CONFIG_X86_VISWS) += visws_quirks.o
32obj-$(CONFIG_X86_32) += probe_roms_32.o 33obj-$(CONFIG_X86_32) += probe_roms_32.o
33obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o 34obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
@@ -49,31 +50,28 @@ obj-y += step.o
49obj-$(CONFIG_STACKTRACE) += stacktrace.o 50obj-$(CONFIG_STACKTRACE) += stacktrace.o
50obj-y += cpu/ 51obj-y += cpu/
51obj-y += acpi/ 52obj-y += acpi/
52obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o 53obj-y += reboot.o
53obj-$(CONFIG_MCA) += mca_32.o 54obj-$(CONFIG_MCA) += mca_32.o
54obj-$(CONFIG_X86_MSR) += msr.o 55obj-$(CONFIG_X86_MSR) += msr.o
55obj-$(CONFIG_X86_CPUID) += cpuid.o 56obj-$(CONFIG_X86_CPUID) += cpuid.o
56obj-$(CONFIG_PCI) += early-quirks.o 57obj-$(CONFIG_PCI) += early-quirks.o
57apm-y := apm_32.o 58apm-y := apm_32.o
58obj-$(CONFIG_APM) += apm.o 59obj-$(CONFIG_APM) += apm.o
59obj-$(CONFIG_X86_SMP) += smp.o 60obj-$(CONFIG_SMP) += smp.o
60obj-$(CONFIG_X86_SMP) += smpboot.o tsc_sync.o ipi.o tlb_$(BITS).o 61obj-$(CONFIG_SMP) += smpboot.o tsc_sync.o
61obj-$(CONFIG_X86_32_SMP) += smpcommon.o 62obj-$(CONFIG_SMP) += setup_percpu.o
62obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o 63obj-$(CONFIG_X86_64_SMP) += tsc_sync.o
63obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o 64obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
64obj-$(CONFIG_X86_MPPARSE) += mpparse.o 65obj-$(CONFIG_X86_MPPARSE) += mpparse.o
65obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o 66obj-y += apic/
66obj-$(CONFIG_X86_IO_APIC) += io_apic.o
67obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o 67obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
68obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o 68obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
69obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o 69obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
70obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
70obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o 71obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
71obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o 72obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
72obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o 73obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
73obj-$(CONFIG_X86_NUMAQ) += numaq_32.o 74obj-$(CONFIG_X86_VSMP) += vsmp_64.o
74obj-$(CONFIG_X86_ES7000) += es7000_32.o
75obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o
76obj-y += vsmp_64.o
77obj-$(CONFIG_KPROBES) += kprobes.o 75obj-$(CONFIG_KPROBES) += kprobes.o
78obj-$(CONFIG_MODULES) += module_$(BITS).o 76obj-$(CONFIG_MODULES) += module_$(BITS).o
79obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o 77obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
@@ -114,16 +112,13 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o # NB rename without _64
114### 112###
115# 64 bit specific files 113# 64 bit specific files
116ifeq ($(CONFIG_X86_64),y) 114ifeq ($(CONFIG_X86_64),y)
117 obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o 115 obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o
118 obj-y += bios_uv.o uv_irq.o uv_sysfs.o 116 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
119 obj-y += genx2apic_cluster.o 117 obj-$(CONFIG_AUDIT) += audit_64.o
120 obj-y += genx2apic_phys.o 118
121 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o 119 obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
122 obj-$(CONFIG_AUDIT) += audit_64.o 120 obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
123 121 obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
124 obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o 122
125 obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o 123 obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
126 obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
127
128 obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
129endif 124endif
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 7678f10c4568..a18eb7ce2236 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -37,15 +37,10 @@
37#include <asm/pgtable.h> 37#include <asm/pgtable.h>
38#include <asm/io_apic.h> 38#include <asm/io_apic.h>
39#include <asm/apic.h> 39#include <asm/apic.h>
40#include <asm/genapic.h>
41#include <asm/io.h> 40#include <asm/io.h>
42#include <asm/mpspec.h> 41#include <asm/mpspec.h>
43#include <asm/smp.h> 42#include <asm/smp.h>
44 43
45#ifdef CONFIG_X86_LOCAL_APIC
46# include <mach_apic.h>
47#endif
48
49static int __initdata acpi_force = 0; 44static int __initdata acpi_force = 0;
50u32 acpi_rsdt_forced; 45u32 acpi_rsdt_forced;
51#ifdef CONFIG_ACPI 46#ifdef CONFIG_ACPI
@@ -56,16 +51,7 @@ int acpi_disabled = 1;
56EXPORT_SYMBOL(acpi_disabled); 51EXPORT_SYMBOL(acpi_disabled);
57 52
58#ifdef CONFIG_X86_64 53#ifdef CONFIG_X86_64
59 54# include <asm/proto.h>
60#include <asm/proto.h>
61
62#else /* X86 */
63
64#ifdef CONFIG_X86_LOCAL_APIC
65#include <mach_apic.h>
66#include <mach_mpparse.h>
67#endif /* CONFIG_X86_LOCAL_APIC */
68
69#endif /* X86 */ 55#endif /* X86 */
70 56
71#define BAD_MADT_ENTRY(entry, end) ( \ 57#define BAD_MADT_ENTRY(entry, end) ( \
@@ -121,35 +107,18 @@ enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
121 */ 107 */
122char *__init __acpi_map_table(unsigned long phys, unsigned long size) 108char *__init __acpi_map_table(unsigned long phys, unsigned long size)
123{ 109{
124 unsigned long base, offset, mapped_size;
125 int idx;
126 110
127 if (!phys || !size) 111 if (!phys || !size)
128 return NULL; 112 return NULL;
129 113
130 if (phys+size <= (max_low_pfn_mapped << PAGE_SHIFT)) 114 return early_ioremap(phys, size);
131 return __va(phys); 115}
132 116void __init __acpi_unmap_table(char *map, unsigned long size)
133 offset = phys & (PAGE_SIZE - 1); 117{
134 mapped_size = PAGE_SIZE - offset; 118 if (!map || !size)
135 clear_fixmap(FIX_ACPI_END); 119 return;
136 set_fixmap(FIX_ACPI_END, phys);
137 base = fix_to_virt(FIX_ACPI_END);
138
139 /*
140 * Most cases can be covered by the below.
141 */
142 idx = FIX_ACPI_END;
143 while (mapped_size < size) {
144 if (--idx < FIX_ACPI_BEGIN)
145 return NULL; /* cannot handle this */
146 phys += PAGE_SIZE;
147 clear_fixmap(idx);
148 set_fixmap(idx, phys);
149 mapped_size += PAGE_SIZE;
150 }
151 120
152 return ((unsigned char *)base + offset); 121 early_iounmap(map, size);
153} 122}
154 123
155#ifdef CONFIG_PCI_MMCONFIG 124#ifdef CONFIG_PCI_MMCONFIG
@@ -239,7 +208,8 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)
239 madt->address); 208 madt->address);
240 } 209 }
241 210
242 acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id); 211 default_acpi_madt_oem_check(madt->header.oem_id,
212 madt->header.oem_table_id);
243 213
244 return 0; 214 return 0;
245} 215}
@@ -884,7 +854,7 @@ static struct {
884 DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); 854 DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
885} mp_ioapic_routing[MAX_IO_APICS]; 855} mp_ioapic_routing[MAX_IO_APICS];
886 856
887static int mp_find_ioapic(int gsi) 857int mp_find_ioapic(int gsi)
888{ 858{
889 int i = 0; 859 int i = 0;
890 860
@@ -899,6 +869,16 @@ static int mp_find_ioapic(int gsi)
899 return -1; 869 return -1;
900} 870}
901 871
872int mp_find_ioapic_pin(int ioapic, int gsi)
873{
874 if (WARN_ON(ioapic == -1))
875 return -1;
876 if (WARN_ON(gsi > mp_ioapic_routing[ioapic].gsi_end))
877 return -1;
878
879 return gsi - mp_ioapic_routing[ioapic].gsi_base;
880}
881
902static u8 __init uniq_ioapic_id(u8 id) 882static u8 __init uniq_ioapic_id(u8 id)
903{ 883{
904#ifdef CONFIG_X86_32 884#ifdef CONFIG_X86_32
@@ -912,8 +892,8 @@ static u8 __init uniq_ioapic_id(u8 id)
912 DECLARE_BITMAP(used, 256); 892 DECLARE_BITMAP(used, 256);
913 bitmap_zero(used, 256); 893 bitmap_zero(used, 256);
914 for (i = 0; i < nr_ioapics; i++) { 894 for (i = 0; i < nr_ioapics; i++) {
915 struct mp_config_ioapic *ia = &mp_ioapics[i]; 895 struct mpc_ioapic *ia = &mp_ioapics[i];
916 __set_bit(ia->mp_apicid, used); 896 __set_bit(ia->apicid, used);
917 } 897 }
918 if (!test_bit(id, used)) 898 if (!test_bit(id, used))
919 return id; 899 return id;
@@ -945,29 +925,29 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
945 925
946 idx = nr_ioapics; 926 idx = nr_ioapics;
947 927
948 mp_ioapics[idx].mp_type = MP_IOAPIC; 928 mp_ioapics[idx].type = MP_IOAPIC;
949 mp_ioapics[idx].mp_flags = MPC_APIC_USABLE; 929 mp_ioapics[idx].flags = MPC_APIC_USABLE;
950 mp_ioapics[idx].mp_apicaddr = address; 930 mp_ioapics[idx].apicaddr = address;
951 931
952 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); 932 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
953 mp_ioapics[idx].mp_apicid = uniq_ioapic_id(id); 933 mp_ioapics[idx].apicid = uniq_ioapic_id(id);
954#ifdef CONFIG_X86_32 934#ifdef CONFIG_X86_32
955 mp_ioapics[idx].mp_apicver = io_apic_get_version(idx); 935 mp_ioapics[idx].apicver = io_apic_get_version(idx);
956#else 936#else
957 mp_ioapics[idx].mp_apicver = 0; 937 mp_ioapics[idx].apicver = 0;
958#endif 938#endif
959 /* 939 /*
960 * Build basic GSI lookup table to facilitate gsi->io_apic lookups 940 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
961 * and to prevent reprogramming of IOAPIC pins (PCI GSIs). 941 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
962 */ 942 */
963 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mp_apicid; 943 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].apicid;
964 mp_ioapic_routing[idx].gsi_base = gsi_base; 944 mp_ioapic_routing[idx].gsi_base = gsi_base;
965 mp_ioapic_routing[idx].gsi_end = gsi_base + 945 mp_ioapic_routing[idx].gsi_end = gsi_base +
966 io_apic_get_redir_entries(idx); 946 io_apic_get_redir_entries(idx);
967 947
968 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, " 948 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
969 "GSI %d-%d\n", idx, mp_ioapics[idx].mp_apicid, 949 "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
970 mp_ioapics[idx].mp_apicver, mp_ioapics[idx].mp_apicaddr, 950 mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr,
971 mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end); 951 mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
972 952
973 nr_ioapics++; 953 nr_ioapics++;
@@ -996,19 +976,19 @@ int __init acpi_probe_gsi(void)
996 return max_gsi + 1; 976 return max_gsi + 1;
997} 977}
998 978
999static void assign_to_mp_irq(struct mp_config_intsrc *m, 979static void assign_to_mp_irq(struct mpc_intsrc *m,
1000 struct mp_config_intsrc *mp_irq) 980 struct mpc_intsrc *mp_irq)
1001{ 981{
1002 memcpy(mp_irq, m, sizeof(struct mp_config_intsrc)); 982 memcpy(mp_irq, m, sizeof(struct mpc_intsrc));
1003} 983}
1004 984
1005static int mp_irq_cmp(struct mp_config_intsrc *mp_irq, 985static int mp_irq_cmp(struct mpc_intsrc *mp_irq,
1006 struct mp_config_intsrc *m) 986 struct mpc_intsrc *m)
1007{ 987{
1008 return memcmp(mp_irq, m, sizeof(struct mp_config_intsrc)); 988 return memcmp(mp_irq, m, sizeof(struct mpc_intsrc));
1009} 989}
1010 990
1011static void save_mp_irq(struct mp_config_intsrc *m) 991static void save_mp_irq(struct mpc_intsrc *m)
1012{ 992{
1013 int i; 993 int i;
1014 994
@@ -1026,7 +1006,7 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
1026{ 1006{
1027 int ioapic; 1007 int ioapic;
1028 int pin; 1008 int pin;
1029 struct mp_config_intsrc mp_irq; 1009 struct mpc_intsrc mp_irq;
1030 1010
1031 /* 1011 /*
1032 * Convert 'gsi' to 'ioapic.pin'. 1012 * Convert 'gsi' to 'ioapic.pin'.
@@ -1034,7 +1014,7 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
1034 ioapic = mp_find_ioapic(gsi); 1014 ioapic = mp_find_ioapic(gsi);
1035 if (ioapic < 0) 1015 if (ioapic < 0)
1036 return; 1016 return;
1037 pin = gsi - mp_ioapic_routing[ioapic].gsi_base; 1017 pin = mp_find_ioapic_pin(ioapic, gsi);
1038 1018
1039 /* 1019 /*
1040 * TBD: This check is for faulty timer entries, where the override 1020 * TBD: This check is for faulty timer entries, where the override
@@ -1044,13 +1024,13 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
1044 if ((bus_irq == 0) && (trigger == 3)) 1024 if ((bus_irq == 0) && (trigger == 3))
1045 trigger = 1; 1025 trigger = 1;
1046 1026
1047 mp_irq.mp_type = MP_INTSRC; 1027 mp_irq.type = MP_INTSRC;
1048 mp_irq.mp_irqtype = mp_INT; 1028 mp_irq.irqtype = mp_INT;
1049 mp_irq.mp_irqflag = (trigger << 2) | polarity; 1029 mp_irq.irqflag = (trigger << 2) | polarity;
1050 mp_irq.mp_srcbus = MP_ISA_BUS; 1030 mp_irq.srcbus = MP_ISA_BUS;
1051 mp_irq.mp_srcbusirq = bus_irq; /* IRQ */ 1031 mp_irq.srcbusirq = bus_irq; /* IRQ */
1052 mp_irq.mp_dstapic = mp_ioapics[ioapic].mp_apicid; /* APIC ID */ 1032 mp_irq.dstapic = mp_ioapics[ioapic].apicid; /* APIC ID */
1053 mp_irq.mp_dstirq = pin; /* INTIN# */ 1033 mp_irq.dstirq = pin; /* INTIN# */
1054 1034
1055 save_mp_irq(&mp_irq); 1035 save_mp_irq(&mp_irq);
1056} 1036}
@@ -1060,7 +1040,7 @@ void __init mp_config_acpi_legacy_irqs(void)
1060 int i; 1040 int i;
1061 int ioapic; 1041 int ioapic;
1062 unsigned int dstapic; 1042 unsigned int dstapic;
1063 struct mp_config_intsrc mp_irq; 1043 struct mpc_intsrc mp_irq;
1064 1044
1065#if defined (CONFIG_MCA) || defined (CONFIG_EISA) 1045#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
1066 /* 1046 /*
@@ -1085,7 +1065,7 @@ void __init mp_config_acpi_legacy_irqs(void)
1085 ioapic = mp_find_ioapic(0); 1065 ioapic = mp_find_ioapic(0);
1086 if (ioapic < 0) 1066 if (ioapic < 0)
1087 return; 1067 return;
1088 dstapic = mp_ioapics[ioapic].mp_apicid; 1068 dstapic = mp_ioapics[ioapic].apicid;
1089 1069
1090 /* 1070 /*
1091 * Use the default configuration for the IRQs 0-15. Unless 1071 * Use the default configuration for the IRQs 0-15. Unless
@@ -1095,16 +1075,14 @@ void __init mp_config_acpi_legacy_irqs(void)
1095 int idx; 1075 int idx;
1096 1076
1097 for (idx = 0; idx < mp_irq_entries; idx++) { 1077 for (idx = 0; idx < mp_irq_entries; idx++) {
1098 struct mp_config_intsrc *irq = mp_irqs + idx; 1078 struct mpc_intsrc *irq = mp_irqs + idx;
1099 1079
1100 /* Do we already have a mapping for this ISA IRQ? */ 1080 /* Do we already have a mapping for this ISA IRQ? */
1101 if (irq->mp_srcbus == MP_ISA_BUS 1081 if (irq->srcbus == MP_ISA_BUS && irq->srcbusirq == i)
1102 && irq->mp_srcbusirq == i)
1103 break; 1082 break;
1104 1083
1105 /* Do we already have a mapping for this IOAPIC pin */ 1084 /* Do we already have a mapping for this IOAPIC pin */
1106 if (irq->mp_dstapic == dstapic && 1085 if (irq->dstapic == dstapic && irq->dstirq == i)
1107 irq->mp_dstirq == i)
1108 break; 1086 break;
1109 } 1087 }
1110 1088
@@ -1113,13 +1091,13 @@ void __init mp_config_acpi_legacy_irqs(void)
1113 continue; /* IRQ already used */ 1091 continue; /* IRQ already used */
1114 } 1092 }
1115 1093
1116 mp_irq.mp_type = MP_INTSRC; 1094 mp_irq.type = MP_INTSRC;
1117 mp_irq.mp_irqflag = 0; /* Conforming */ 1095 mp_irq.irqflag = 0; /* Conforming */
1118 mp_irq.mp_srcbus = MP_ISA_BUS; 1096 mp_irq.srcbus = MP_ISA_BUS;
1119 mp_irq.mp_dstapic = dstapic; 1097 mp_irq.dstapic = dstapic;
1120 mp_irq.mp_irqtype = mp_INT; 1098 mp_irq.irqtype = mp_INT;
1121 mp_irq.mp_srcbusirq = i; /* Identity mapped */ 1099 mp_irq.srcbusirq = i; /* Identity mapped */
1122 mp_irq.mp_dstirq = i; 1100 mp_irq.dstirq = i;
1123 1101
1124 save_mp_irq(&mp_irq); 1102 save_mp_irq(&mp_irq);
1125 } 1103 }
@@ -1156,7 +1134,7 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
1156 return gsi; 1134 return gsi;
1157 } 1135 }
1158 1136
1159 ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base; 1137 ioapic_pin = mp_find_ioapic_pin(ioapic, gsi);
1160 1138
1161#ifdef CONFIG_X86_32 1139#ifdef CONFIG_X86_32
1162 if (ioapic_renumber_irq) 1140 if (ioapic_renumber_irq)
@@ -1230,22 +1208,22 @@ int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
1230 u32 gsi, int triggering, int polarity) 1208 u32 gsi, int triggering, int polarity)
1231{ 1209{
1232#ifdef CONFIG_X86_MPPARSE 1210#ifdef CONFIG_X86_MPPARSE
1233 struct mp_config_intsrc mp_irq; 1211 struct mpc_intsrc mp_irq;
1234 int ioapic; 1212 int ioapic;
1235 1213
1236 if (!acpi_ioapic) 1214 if (!acpi_ioapic)
1237 return 0; 1215 return 0;
1238 1216
1239 /* print the entry should happen on mptable identically */ 1217 /* print the entry should happen on mptable identically */
1240 mp_irq.mp_type = MP_INTSRC; 1218 mp_irq.type = MP_INTSRC;
1241 mp_irq.mp_irqtype = mp_INT; 1219 mp_irq.irqtype = mp_INT;
1242 mp_irq.mp_irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | 1220 mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
1243 (polarity == ACPI_ACTIVE_HIGH ? 1 : 3); 1221 (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
1244 mp_irq.mp_srcbus = number; 1222 mp_irq.srcbus = number;
1245 mp_irq.mp_srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); 1223 mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
1246 ioapic = mp_find_ioapic(gsi); 1224 ioapic = mp_find_ioapic(gsi);
1247 mp_irq.mp_dstapic = mp_ioapic_routing[ioapic].apic_id; 1225 mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id;
1248 mp_irq.mp_dstirq = gsi - mp_ioapic_routing[ioapic].gsi_base; 1226 mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
1249 1227
1250 save_mp_irq(&mp_irq); 1228 save_mp_irq(&mp_irq);
1251#endif 1229#endif
@@ -1372,7 +1350,7 @@ static void __init acpi_process_madt(void)
1372 if (!error) { 1350 if (!error) {
1373 acpi_lapic = 1; 1351 acpi_lapic = 1;
1374 1352
1375#ifdef CONFIG_X86_GENERICARCH 1353#ifdef CONFIG_X86_BIGSMP
1376 generic_bigsmp_probe(); 1354 generic_bigsmp_probe();
1377#endif 1355#endif
1378 /* 1356 /*
@@ -1384,9 +1362,8 @@ static void __init acpi_process_madt(void)
1384 acpi_ioapic = 1; 1362 acpi_ioapic = 1;
1385 1363
1386 smp_found_config = 1; 1364 smp_found_config = 1;
1387#ifdef CONFIG_X86_32 1365 if (apic->setup_apic_routing)
1388 setup_apic_routing(); 1366 apic->setup_apic_routing();
1389#endif
1390 } 1367 }
1391 } 1368 }
1392 if (error == -EINVAL) { 1369 if (error == -EINVAL) {
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
index 3355973b12ac..580b4e296010 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.S
@@ -3,8 +3,8 @@
3 */ 3 */
4#include <asm/segment.h> 4#include <asm/segment.h>
5#include <asm/msr-index.h> 5#include <asm/msr-index.h>
6#include <asm/page.h> 6#include <asm/page_types.h>
7#include <asm/pgtable.h> 7#include <asm/pgtable_types.h>
8#include <asm/processor-flags.h> 8#include <asm/processor-flags.h>
9 9
10 .code16 10 .code16
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index a60c1f3bcb87..7c243a2c5115 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -101,6 +101,7 @@ int acpi_save_state_mem(void)
101 stack_start.sp = temp_stack + sizeof(temp_stack); 101 stack_start.sp = temp_stack + sizeof(temp_stack);
102 early_gdt_descr.address = 102 early_gdt_descr.address =
103 (unsigned long)get_cpu_gdt_table(smp_processor_id()); 103 (unsigned long)get_cpu_gdt_table(smp_processor_id());
104 initial_gs = per_cpu_offset(smp_processor_id());
104#endif 105#endif
105 initial_code = (unsigned long)wakeup_long64; 106 initial_code = (unsigned long)wakeup_long64;
106 saved_magic = 0x123456789abcdef0; 107 saved_magic = 0x123456789abcdef0;
diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
index a12e6a9fb659..8ded418b0593 100644
--- a/arch/x86/kernel/acpi/wakeup_32.S
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -1,7 +1,7 @@
1 .section .text.page_aligned 1 .section .text.page_aligned
2#include <linux/linkage.h> 2#include <linux/linkage.h>
3#include <asm/segment.h> 3#include <asm/segment.h>
4#include <asm/page.h> 4#include <asm/page_types.h>
5 5
6# Copyright 2003, 2008 Pavel Machek <pavel@suse.cz>, distribute under GPLv2 6# Copyright 2003, 2008 Pavel Machek <pavel@suse.cz>, distribute under GPLv2
7 7
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index 96258d9dc974..8ea5164cbd04 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -1,8 +1,8 @@
1.text 1.text
2#include <linux/linkage.h> 2#include <linux/linkage.h>
3#include <asm/segment.h> 3#include <asm/segment.h>
4#include <asm/pgtable.h> 4#include <asm/pgtable_types.h>
5#include <asm/page.h> 5#include <asm/page_types.h>
6#include <asm/msr.h> 6#include <asm/msr.h>
7#include <asm/asm-offsets.h> 7#include <asm/asm-offsets.h>
8 8
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index a84ac7b570e6..f57658702571 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -5,6 +5,7 @@
5#include <linux/kprobes.h> 5#include <linux/kprobes.h>
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/vmalloc.h> 7#include <linux/vmalloc.h>
8#include <linux/memory.h>
8#include <asm/alternative.h> 9#include <asm/alternative.h>
9#include <asm/sections.h> 10#include <asm/sections.h>
10#include <asm/pgtable.h> 11#include <asm/pgtable.h>
@@ -12,7 +13,9 @@
12#include <asm/nmi.h> 13#include <asm/nmi.h>
13#include <asm/vsyscall.h> 14#include <asm/vsyscall.h>
14#include <asm/cacheflush.h> 15#include <asm/cacheflush.h>
16#include <asm/tlbflush.h>
15#include <asm/io.h> 17#include <asm/io.h>
18#include <asm/fixmap.h>
16 19
17#define MAX_PATCH_LEN (255-1) 20#define MAX_PATCH_LEN (255-1)
18 21
@@ -226,6 +229,7 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
226{ 229{
227 u8 **ptr; 230 u8 **ptr;
228 231
232 mutex_lock(&text_mutex);
229 for (ptr = start; ptr < end; ptr++) { 233 for (ptr = start; ptr < end; ptr++) {
230 if (*ptr < text) 234 if (*ptr < text)
231 continue; 235 continue;
@@ -234,6 +238,7 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
234 /* turn DS segment override prefix into lock prefix */ 238 /* turn DS segment override prefix into lock prefix */
235 text_poke(*ptr, ((unsigned char []){0xf0}), 1); 239 text_poke(*ptr, ((unsigned char []){0xf0}), 1);
236 }; 240 };
241 mutex_unlock(&text_mutex);
237} 242}
238 243
239static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end) 244static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
@@ -243,6 +248,7 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end
243 if (noreplace_smp) 248 if (noreplace_smp)
244 return; 249 return;
245 250
251 mutex_lock(&text_mutex);
246 for (ptr = start; ptr < end; ptr++) { 252 for (ptr = start; ptr < end; ptr++) {
247 if (*ptr < text) 253 if (*ptr < text)
248 continue; 254 continue;
@@ -251,6 +257,7 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end
251 /* turn lock prefix into DS segment override prefix */ 257 /* turn lock prefix into DS segment override prefix */
252 text_poke(*ptr, ((unsigned char []){0x3E}), 1); 258 text_poke(*ptr, ((unsigned char []){0x3E}), 1);
253 }; 259 };
260 mutex_unlock(&text_mutex);
254} 261}
255 262
256struct smp_alt_module { 263struct smp_alt_module {
@@ -414,9 +421,17 @@ void __init alternative_instructions(void)
414 that might execute the to be patched code. 421 that might execute the to be patched code.
415 Other CPUs are not running. */ 422 Other CPUs are not running. */
416 stop_nmi(); 423 stop_nmi();
417#ifdef CONFIG_X86_MCE 424
418 stop_mce(); 425 /*
419#endif 426 * Don't stop machine check exceptions while patching.
427 * MCEs only happen when something got corrupted and in this
428 * case we must do something about the corruption.
429 * Ignoring it is worse than a unlikely patching race.
430 * Also machine checks tend to be broadcast and if one CPU
431 * goes into machine check the others follow quickly, so we don't
432 * expect a machine check to cause undue problems during to code
433 * patching.
434 */
420 435
421 apply_alternatives(__alt_instructions, __alt_instructions_end); 436 apply_alternatives(__alt_instructions, __alt_instructions_end);
422 437
@@ -456,9 +471,6 @@ void __init alternative_instructions(void)
456 (unsigned long)__smp_locks_end); 471 (unsigned long)__smp_locks_end);
457 472
458 restart_nmi(); 473 restart_nmi();
459#ifdef CONFIG_X86_MCE
460 restart_mce();
461#endif
462} 474}
463 475
464/** 476/**
@@ -495,12 +507,13 @@ void *text_poke_early(void *addr, const void *opcode, size_t len)
495 * It means the size must be writable atomically and the address must be aligned 507 * It means the size must be writable atomically and the address must be aligned
496 * in a way that permits an atomic write. It also makes sure we fit on a single 508 * in a way that permits an atomic write. It also makes sure we fit on a single
497 * page. 509 * page.
510 *
511 * Note: Must be called under text_mutex.
498 */ 512 */
499void *__kprobes text_poke(void *addr, const void *opcode, size_t len) 513void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
500{ 514{
501 unsigned long flags; 515 unsigned long flags;
502 char *vaddr; 516 char *vaddr;
503 int nr_pages = 2;
504 struct page *pages[2]; 517 struct page *pages[2];
505 int i; 518 int i;
506 519
@@ -513,18 +526,21 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
513 pages[1] = virt_to_page(addr + PAGE_SIZE); 526 pages[1] = virt_to_page(addr + PAGE_SIZE);
514 } 527 }
515 BUG_ON(!pages[0]); 528 BUG_ON(!pages[0]);
516 if (!pages[1])
517 nr_pages = 1;
518 vaddr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
519 BUG_ON(!vaddr);
520 local_irq_save(flags); 529 local_irq_save(flags);
530 set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
531 if (pages[1])
532 set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
533 vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
521 memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len); 534 memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
522 local_irq_restore(flags); 535 clear_fixmap(FIX_TEXT_POKE0);
523 vunmap(vaddr); 536 if (pages[1])
537 clear_fixmap(FIX_TEXT_POKE1);
538 local_flush_tlb();
524 sync_core(); 539 sync_core();
525 /* Could also do a CLFLUSH here to speed up CPU recovery; but 540 /* Could also do a CLFLUSH here to speed up CPU recovery; but
526 that causes hangs on some VIA CPUs. */ 541 that causes hangs on some VIA CPUs. */
527 for (i = 0; i < len; i++) 542 for (i = 0; i < len; i++)
528 BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]); 543 BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
544 local_irq_restore(flags);
529 return addr; 545 return addr;
530} 546}
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
new file mode 100644
index 000000000000..da7b7b9f8bd8
--- /dev/null
+++ b/arch/x86/kernel/apic/Makefile
@@ -0,0 +1,19 @@
1#
2# Makefile for local APIC drivers and for the IO-APIC code
3#
4
5obj-$(CONFIG_X86_LOCAL_APIC) += apic.o probe_$(BITS).o ipi.o nmi.o
6obj-$(CONFIG_X86_IO_APIC) += io_apic.o
7obj-$(CONFIG_SMP) += ipi.o
8
9ifeq ($(CONFIG_X86_64),y)
10obj-y += apic_flat_64.o
11obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o
12obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o
13obj-$(CONFIG_X86_UV) += x2apic_uv_x.o
14endif
15
16obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o
17obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
18obj-$(CONFIG_X86_ES7000) += es7000_32.o
19obj-$(CONFIG_X86_SUMMIT) += summit_32.o
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic/apic.c
index 570f36e44e59..30909a258d0f 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Local APIC handling, local APIC timers 2 * Local APIC handling, local APIC timers
3 * 3 *
4 * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com> 4 * (c) 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
5 * 5 *
6 * Fixes 6 * Fixes
7 * Maciej W. Rozycki : Bits for genuine 82489DX APICs; 7 * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
@@ -14,51 +14,70 @@
14 * Mikael Pettersson : PM converted to driver model. 14 * Mikael Pettersson : PM converted to driver model.
15 */ 15 */
16 16
17#include <linux/init.h>
18
19#include <linux/mm.h>
20#include <linux/delay.h>
21#include <linux/bootmem.h>
22#include <linux/interrupt.h>
23#include <linux/mc146818rtc.h>
24#include <linux/kernel_stat.h> 17#include <linux/kernel_stat.h>
25#include <linux/sysdev.h> 18#include <linux/mc146818rtc.h>
26#include <linux/ioport.h>
27#include <linux/cpu.h>
28#include <linux/clockchips.h>
29#include <linux/acpi_pmtmr.h> 19#include <linux/acpi_pmtmr.h>
20#include <linux/clockchips.h>
21#include <linux/interrupt.h>
22#include <linux/bootmem.h>
23#include <linux/ftrace.h>
24#include <linux/ioport.h>
30#include <linux/module.h> 25#include <linux/module.h>
31#include <linux/dmi.h> 26#include <linux/sysdev.h>
27#include <linux/delay.h>
28#include <linux/timex.h>
32#include <linux/dmar.h> 29#include <linux/dmar.h>
33#include <linux/ftrace.h> 30#include <linux/init.h>
34#include <linux/smp.h> 31#include <linux/cpu.h>
32#include <linux/dmi.h>
35#include <linux/nmi.h> 33#include <linux/nmi.h>
36#include <linux/timex.h> 34#include <linux/smp.h>
35#include <linux/mm.h>
37 36
37#include <asm/pgalloc.h>
38#include <asm/atomic.h> 38#include <asm/atomic.h>
39#include <asm/mtrr.h>
40#include <asm/mpspec.h> 39#include <asm/mpspec.h>
41#include <asm/desc.h>
42#include <asm/arch_hooks.h>
43#include <asm/hpet.h>
44#include <asm/pgalloc.h>
45#include <asm/i8253.h> 40#include <asm/i8253.h>
46#include <asm/idle.h> 41#include <asm/i8259.h>
47#include <asm/proto.h> 42#include <asm/proto.h>
48#include <asm/apic.h> 43#include <asm/apic.h>
49#include <asm/i8259.h> 44#include <asm/desc.h>
45#include <asm/hpet.h>
46#include <asm/idle.h>
47#include <asm/mtrr.h>
50#include <asm/smp.h> 48#include <asm/smp.h>
49#include <asm/mce.h>
50
51unsigned int num_processors;
52
53unsigned disabled_cpus __cpuinitdata;
51 54
52#include <mach_apic.h> 55/* Processor that is doing the boot up */
53#include <mach_apicdef.h> 56unsigned int boot_cpu_physical_apicid = -1U;
54#include <mach_ipi.h>
55 57
56/* 58/*
57 * Sanity check 59 * The highest APIC ID seen during enumeration.
60 *
61 * This determines the messaging protocol we can use: if all APIC IDs
62 * are in the 0 ... 7 range, then we can use logical addressing which
63 * has some performance advantages (better broadcasting).
64 *
65 * If there's an APIC ID above 8, we use physical addressing.
58 */ 66 */
59#if ((SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F) 67unsigned int max_physical_apicid;
60# error SPURIOUS_APIC_VECTOR definition error 68
61#endif 69/*
70 * Bitmask of physically existing CPUs:
71 */
72physid_mask_t phys_cpu_present_map;
73
74/*
75 * Map cpu index to physical APIC ID
76 */
77DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
78DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
79EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
80EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
62 81
63#ifdef CONFIG_X86_32 82#ifdef CONFIG_X86_32
64/* 83/*
@@ -92,11 +111,7 @@ static __init int setup_apicpmtimer(char *s)
92__setup("apicpmtimer", setup_apicpmtimer); 111__setup("apicpmtimer", setup_apicpmtimer);
93#endif 112#endif
94 113
95#ifdef CONFIG_X86_64 114#ifdef CONFIG_X86_X2APIC
96#define HAVE_X2APIC
97#endif
98
99#ifdef HAVE_X2APIC
100int x2apic; 115int x2apic;
101/* x2apic enabled before OS handover */ 116/* x2apic enabled before OS handover */
102static int x2apic_preenabled; 117static int x2apic_preenabled;
@@ -194,18 +209,13 @@ static int modern_apic(void)
194 return lapic_get_version() >= 0x14; 209 return lapic_get_version() >= 0x14;
195} 210}
196 211
197/* 212void native_apic_wait_icr_idle(void)
198 * Paravirt kernels also might be using these below ops. So we still
199 * use generic apic_read()/apic_write(), which might be pointing to different
200 * ops in PARAVIRT case.
201 */
202void xapic_wait_icr_idle(void)
203{ 213{
204 while (apic_read(APIC_ICR) & APIC_ICR_BUSY) 214 while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
205 cpu_relax(); 215 cpu_relax();
206} 216}
207 217
208u32 safe_xapic_wait_icr_idle(void) 218u32 native_safe_apic_wait_icr_idle(void)
209{ 219{
210 u32 send_status; 220 u32 send_status;
211 int timeout; 221 int timeout;
@@ -221,13 +231,13 @@ u32 safe_xapic_wait_icr_idle(void)
221 return send_status; 231 return send_status;
222} 232}
223 233
224void xapic_icr_write(u32 low, u32 id) 234void native_apic_icr_write(u32 low, u32 id)
225{ 235{
226 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id)); 236 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));
227 apic_write(APIC_ICR, low); 237 apic_write(APIC_ICR, low);
228} 238}
229 239
230static u64 xapic_icr_read(void) 240u64 native_apic_icr_read(void)
231{ 241{
232 u32 icr1, icr2; 242 u32 icr1, icr2;
233 243
@@ -237,54 +247,6 @@ static u64 xapic_icr_read(void)
237 return icr1 | ((u64)icr2 << 32); 247 return icr1 | ((u64)icr2 << 32);
238} 248}
239 249
240static struct apic_ops xapic_ops = {
241 .read = native_apic_mem_read,
242 .write = native_apic_mem_write,
243 .icr_read = xapic_icr_read,
244 .icr_write = xapic_icr_write,
245 .wait_icr_idle = xapic_wait_icr_idle,
246 .safe_wait_icr_idle = safe_xapic_wait_icr_idle,
247};
248
249struct apic_ops __read_mostly *apic_ops = &xapic_ops;
250EXPORT_SYMBOL_GPL(apic_ops);
251
252#ifdef HAVE_X2APIC
253static void x2apic_wait_icr_idle(void)
254{
255 /* no need to wait for icr idle in x2apic */
256 return;
257}
258
259static u32 safe_x2apic_wait_icr_idle(void)
260{
261 /* no need to wait for icr idle in x2apic */
262 return 0;
263}
264
265void x2apic_icr_write(u32 low, u32 id)
266{
267 wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low);
268}
269
270static u64 x2apic_icr_read(void)
271{
272 unsigned long val;
273
274 rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val);
275 return val;
276}
277
278static struct apic_ops x2apic_ops = {
279 .read = native_apic_msr_read,
280 .write = native_apic_msr_write,
281 .icr_read = x2apic_icr_read,
282 .icr_write = x2apic_icr_write,
283 .wait_icr_idle = x2apic_wait_icr_idle,
284 .safe_wait_icr_idle = safe_x2apic_wait_icr_idle,
285};
286#endif
287
288/** 250/**
289 * enable_NMI_through_LVT0 - enable NMI through local vector table 0 251 * enable_NMI_through_LVT0 - enable NMI through local vector table 0
290 */ 252 */
@@ -457,7 +419,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
457static void lapic_timer_broadcast(const struct cpumask *mask) 419static void lapic_timer_broadcast(const struct cpumask *mask)
458{ 420{
459#ifdef CONFIG_SMP 421#ifdef CONFIG_SMP
460 send_IPI_mask(mask, LOCAL_TIMER_VECTOR); 422 apic->send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
461#endif 423#endif
462} 424}
463 425
@@ -535,7 +497,8 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
535 } 497 }
536} 498}
537 499
538static int __init calibrate_by_pmtimer(long deltapm, long *delta) 500static int __init
501calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
539{ 502{
540 const long pm_100ms = PMTMR_TICKS_PER_SEC / 10; 503 const long pm_100ms = PMTMR_TICKS_PER_SEC / 10;
541 const long pm_thresh = pm_100ms / 100; 504 const long pm_thresh = pm_100ms / 100;
@@ -546,7 +509,7 @@ static int __init calibrate_by_pmtimer(long deltapm, long *delta)
546 return -1; 509 return -1;
547#endif 510#endif
548 511
549 apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); 512 apic_printk(APIC_VERBOSE, "... PM-Timer delta = %ld\n", deltapm);
550 513
551 /* Check, if the PM timer is available */ 514 /* Check, if the PM timer is available */
552 if (!deltapm) 515 if (!deltapm)
@@ -556,19 +519,30 @@ static int __init calibrate_by_pmtimer(long deltapm, long *delta)
556 519
557 if (deltapm > (pm_100ms - pm_thresh) && 520 if (deltapm > (pm_100ms - pm_thresh) &&
558 deltapm < (pm_100ms + pm_thresh)) { 521 deltapm < (pm_100ms + pm_thresh)) {
559 apic_printk(APIC_VERBOSE, "... PM timer result ok\n"); 522 apic_printk(APIC_VERBOSE, "... PM-Timer result ok\n");
560 } else { 523 return 0;
561 res = (((u64)deltapm) * mult) >> 22; 524 }
562 do_div(res, 1000000); 525
563 pr_warning("APIC calibration not consistent " 526 res = (((u64)deltapm) * mult) >> 22;
564 "with PM Timer: %ldms instead of 100ms\n", 527 do_div(res, 1000000);
565 (long)res); 528 pr_warning("APIC calibration not consistent "
566 /* Correct the lapic counter value */ 529 "with PM-Timer: %ldms instead of 100ms\n",(long)res);
567 res = (((u64)(*delta)) * pm_100ms); 530
531 /* Correct the lapic counter value */
532 res = (((u64)(*delta)) * pm_100ms);
533 do_div(res, deltapm);
534 pr_info("APIC delta adjusted to PM-Timer: "
535 "%lu (%ld)\n", (unsigned long)res, *delta);
536 *delta = (long)res;
537
538 /* Correct the tsc counter value */
539 if (cpu_has_tsc) {
540 res = (((u64)(*deltatsc)) * pm_100ms);
568 do_div(res, deltapm); 541 do_div(res, deltapm);
569 pr_info("APIC delta adjusted to PM-Timer: " 542 apic_printk(APIC_VERBOSE, "TSC delta adjusted to "
570 "%lu (%ld)\n", (unsigned long)res, *delta); 543 "PM-Timer: %lu (%ld) \n",
571 *delta = (long)res; 544 (unsigned long)res, *deltatsc);
545 *deltatsc = (long)res;
572 } 546 }
573 547
574 return 0; 548 return 0;
@@ -579,7 +553,7 @@ static int __init calibrate_APIC_clock(void)
579 struct clock_event_device *levt = &__get_cpu_var(lapic_events); 553 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
580 void (*real_handler)(struct clock_event_device *dev); 554 void (*real_handler)(struct clock_event_device *dev);
581 unsigned long deltaj; 555 unsigned long deltaj;
582 long delta; 556 long delta, deltatsc;
583 int pm_referenced = 0; 557 int pm_referenced = 0;
584 558
585 local_irq_disable(); 559 local_irq_disable();
@@ -609,9 +583,11 @@ static int __init calibrate_APIC_clock(void)
609 delta = lapic_cal_t1 - lapic_cal_t2; 583 delta = lapic_cal_t1 - lapic_cal_t2;
610 apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); 584 apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta);
611 585
586 deltatsc = (long)(lapic_cal_tsc2 - lapic_cal_tsc1);
587
612 /* we trust the PM based calibration if possible */ 588 /* we trust the PM based calibration if possible */
613 pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1, 589 pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1,
614 &delta); 590 &delta, &deltatsc);
615 591
616 /* Calculate the scaled math multiplication factor */ 592 /* Calculate the scaled math multiplication factor */
617 lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, 593 lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS,
@@ -629,11 +605,10 @@ static int __init calibrate_APIC_clock(void)
629 calibration_result); 605 calibration_result);
630 606
631 if (cpu_has_tsc) { 607 if (cpu_has_tsc) {
632 delta = (long)(lapic_cal_tsc2 - lapic_cal_tsc1);
633 apic_printk(APIC_VERBOSE, "..... CPU clock speed is " 608 apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
634 "%ld.%04ld MHz.\n", 609 "%ld.%04ld MHz.\n",
635 (delta / LAPIC_CAL_LOOPS) / (1000000 / HZ), 610 (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ),
636 (delta / LAPIC_CAL_LOOPS) % (1000000 / HZ)); 611 (deltatsc / LAPIC_CAL_LOOPS) % (1000000 / HZ));
637 } 612 }
638 613
639 apic_printk(APIC_VERBOSE, "..... host bus clock speed is " 614 apic_printk(APIC_VERBOSE, "..... host bus clock speed is "
@@ -868,6 +843,14 @@ void clear_local_APIC(void)
868 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); 843 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
869 } 844 }
870#endif 845#endif
846#ifdef CONFIG_X86_MCE_INTEL
847 if (maxlvt >= 6) {
848 v = apic_read(APIC_LVTCMCI);
849 if (!(v & APIC_LVT_MASKED))
850 apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED);
851 }
852#endif
853
871 /* 854 /*
872 * Clean APIC state for other OSs: 855 * Clean APIC state for other OSs:
873 */ 856 */
@@ -991,11 +974,11 @@ int __init verify_local_APIC(void)
991 */ 974 */
992 reg0 = apic_read(APIC_ID); 975 reg0 = apic_read(APIC_ID);
993 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); 976 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
994 apic_write(APIC_ID, reg0 ^ APIC_ID_MASK); 977 apic_write(APIC_ID, reg0 ^ apic->apic_id_mask);
995 reg1 = apic_read(APIC_ID); 978 reg1 = apic_read(APIC_ID);
996 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); 979 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
997 apic_write(APIC_ID, reg0); 980 apic_write(APIC_ID, reg0);
998 if (reg1 != (reg0 ^ APIC_ID_MASK)) 981 if (reg1 != (reg0 ^ apic->apic_id_mask))
999 return 0; 982 return 0;
1000 983
1001 /* 984 /*
@@ -1089,7 +1072,7 @@ static void __cpuinit lapic_setup_esr(void)
1089 return; 1072 return;
1090 } 1073 }
1091 1074
1092 if (esr_disable) { 1075 if (apic->disable_esr) {
1093 /* 1076 /*
1094 * Something untraceable is creating bad interrupts on 1077 * Something untraceable is creating bad interrupts on
1095 * secondary quads ... for the moment, just leave the 1078 * secondary quads ... for the moment, just leave the
@@ -1130,9 +1113,14 @@ void __cpuinit setup_local_APIC(void)
1130 unsigned int value; 1113 unsigned int value;
1131 int i, j; 1114 int i, j;
1132 1115
1116 if (disable_apic) {
1117 arch_disable_smp_support();
1118 return;
1119 }
1120
1133#ifdef CONFIG_X86_32 1121#ifdef CONFIG_X86_32
1134 /* Pound the ESR really hard over the head with a big hammer - mbligh */ 1122 /* Pound the ESR really hard over the head with a big hammer - mbligh */
1135 if (lapic_is_integrated() && esr_disable) { 1123 if (lapic_is_integrated() && apic->disable_esr) {
1136 apic_write(APIC_ESR, 0); 1124 apic_write(APIC_ESR, 0);
1137 apic_write(APIC_ESR, 0); 1125 apic_write(APIC_ESR, 0);
1138 apic_write(APIC_ESR, 0); 1126 apic_write(APIC_ESR, 0);
@@ -1146,7 +1134,7 @@ void __cpuinit setup_local_APIC(void)
1146 * Double-check whether this APIC is really registered. 1134 * Double-check whether this APIC is really registered.
1147 * This is meaningless in clustered apic mode, so we skip it. 1135 * This is meaningless in clustered apic mode, so we skip it.
1148 */ 1136 */
1149 if (!apic_id_registered()) 1137 if (!apic->apic_id_registered())
1150 BUG(); 1138 BUG();
1151 1139
1152 /* 1140 /*
@@ -1154,7 +1142,7 @@ void __cpuinit setup_local_APIC(void)
1154 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel 1142 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
1155 * document number 292116). So here it goes... 1143 * document number 292116). So here it goes...
1156 */ 1144 */
1157 init_apic_ldr(); 1145 apic->init_apic_ldr();
1158 1146
1159 /* 1147 /*
1160 * Set Task Priority to 'accept all'. We never change this 1148 * Set Task Priority to 'accept all'. We never change this
@@ -1262,6 +1250,12 @@ void __cpuinit setup_local_APIC(void)
1262 apic_write(APIC_LVT1, value); 1250 apic_write(APIC_LVT1, value);
1263 1251
1264 preempt_enable(); 1252 preempt_enable();
1253
1254#ifdef CONFIG_X86_MCE_INTEL
1255 /* Recheck CMCI information after local APIC is up on CPU #0 */
1256 if (smp_processor_id() == 0)
1257 cmci_recheck();
1258#endif
1265} 1259}
1266 1260
1267void __cpuinit end_local_APIC_setup(void) 1261void __cpuinit end_local_APIC_setup(void)
@@ -1282,17 +1276,12 @@ void __cpuinit end_local_APIC_setup(void)
1282 apic_pm_activate(); 1276 apic_pm_activate();
1283} 1277}
1284 1278
1285#ifdef HAVE_X2APIC 1279#ifdef CONFIG_X86_X2APIC
1286void check_x2apic(void) 1280void check_x2apic(void)
1287{ 1281{
1288 int msr, msr2; 1282 if (x2apic_enabled()) {
1289
1290 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1291
1292 if (msr & X2APIC_ENABLE) {
1293 pr_info("x2apic enabled by BIOS, switching to x2apic ops\n"); 1283 pr_info("x2apic enabled by BIOS, switching to x2apic ops\n");
1294 x2apic_preenabled = x2apic = 1; 1284 x2apic_preenabled = x2apic = 1;
1295 apic_ops = &x2apic_ops;
1296 } 1285 }
1297} 1286}
1298 1287
@@ -1300,6 +1289,9 @@ void enable_x2apic(void)
1300{ 1289{
1301 int msr, msr2; 1290 int msr, msr2;
1302 1291
1292 if (!x2apic)
1293 return;
1294
1303 rdmsr(MSR_IA32_APICBASE, msr, msr2); 1295 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1304 if (!(msr & X2APIC_ENABLE)) { 1296 if (!(msr & X2APIC_ENABLE)) {
1305 pr_info("Enabling x2apic\n"); 1297 pr_info("Enabling x2apic\n");
@@ -1363,7 +1355,6 @@ void __init enable_IR_x2apic(void)
1363 1355
1364 if (!x2apic) { 1356 if (!x2apic) {
1365 x2apic = 1; 1357 x2apic = 1;
1366 apic_ops = &x2apic_ops;
1367 enable_x2apic(); 1358 enable_x2apic();
1368 } 1359 }
1369 1360
@@ -1401,7 +1392,7 @@ end:
1401 1392
1402 return; 1393 return;
1403} 1394}
1404#endif /* HAVE_X2APIC */ 1395#endif /* CONFIG_X86_X2APIC */
1405 1396
1406#ifdef CONFIG_X86_64 1397#ifdef CONFIG_X86_64
1407/* 1398/*
@@ -1532,7 +1523,7 @@ void __init early_init_lapic_mapping(void)
1532 */ 1523 */
1533void __init init_apic_mappings(void) 1524void __init init_apic_mappings(void)
1534{ 1525{
1535#ifdef HAVE_X2APIC 1526#ifdef CONFIG_X86_X2APIC
1536 if (x2apic) { 1527 if (x2apic) {
1537 boot_cpu_physical_apicid = read_apic_id(); 1528 boot_cpu_physical_apicid = read_apic_id();
1538 return; 1529 return;
@@ -1570,11 +1561,11 @@ int apic_version[MAX_APICS];
1570 1561
1571int __init APIC_init_uniprocessor(void) 1562int __init APIC_init_uniprocessor(void)
1572{ 1563{
1573#ifdef CONFIG_X86_64
1574 if (disable_apic) { 1564 if (disable_apic) {
1575 pr_info("Apic disabled\n"); 1565 pr_info("Apic disabled\n");
1576 return -1; 1566 return -1;
1577 } 1567 }
1568#ifdef CONFIG_X86_64
1578 if (!cpu_has_apic) { 1569 if (!cpu_has_apic) {
1579 disable_apic = 1; 1570 disable_apic = 1;
1580 pr_info("Apic disabled by BIOS\n"); 1571 pr_info("Apic disabled by BIOS\n");
@@ -1596,11 +1587,9 @@ int __init APIC_init_uniprocessor(void)
1596 } 1587 }
1597#endif 1588#endif
1598 1589
1599#ifdef HAVE_X2APIC
1600 enable_IR_x2apic(); 1590 enable_IR_x2apic();
1601#endif
1602#ifdef CONFIG_X86_64 1591#ifdef CONFIG_X86_64
1603 setup_apic_routing(); 1592 default_setup_apic_routing();
1604#endif 1593#endif
1605 1594
1606 verify_local_APIC(); 1595 verify_local_APIC();
@@ -1621,35 +1610,31 @@ int __init APIC_init_uniprocessor(void)
1621 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); 1610 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
1622 setup_local_APIC(); 1611 setup_local_APIC();
1623 1612
1624#ifdef CONFIG_X86_64 1613#ifdef CONFIG_X86_IO_APIC
1625 /* 1614 /*
1626 * Now enable IO-APICs, actually call clear_IO_APIC 1615 * Now enable IO-APICs, actually call clear_IO_APIC
1627 * We need clear_IO_APIC before enabling vector on BP 1616 * We need clear_IO_APIC before enabling error vector
1628 */ 1617 */
1629 if (!skip_ioapic_setup && nr_ioapics) 1618 if (!skip_ioapic_setup && nr_ioapics)
1630 enable_IO_APIC(); 1619 enable_IO_APIC();
1631#endif 1620#endif
1632 1621
1633#ifdef CONFIG_X86_IO_APIC
1634 if (!smp_found_config || skip_ioapic_setup || !nr_ioapics)
1635#endif
1636 localise_nmi_watchdog();
1637 end_local_APIC_setup(); 1622 end_local_APIC_setup();
1638 1623
1639#ifdef CONFIG_X86_IO_APIC 1624#ifdef CONFIG_X86_IO_APIC
1640 if (smp_found_config && !skip_ioapic_setup && nr_ioapics) 1625 if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
1641 setup_IO_APIC(); 1626 setup_IO_APIC();
1642# ifdef CONFIG_X86_64 1627 else {
1643 else
1644 nr_ioapics = 0; 1628 nr_ioapics = 0;
1645# endif 1629 localise_nmi_watchdog();
1630 }
1631#else
1632 localise_nmi_watchdog();
1646#endif 1633#endif
1647 1634
1635 setup_boot_clock();
1648#ifdef CONFIG_X86_64 1636#ifdef CONFIG_X86_64
1649 setup_boot_APIC_clock();
1650 check_nmi_watchdog(); 1637 check_nmi_watchdog();
1651#else
1652 setup_boot_clock();
1653#endif 1638#endif
1654 1639
1655 return 0; 1640 return 0;
@@ -1738,7 +1723,8 @@ void __init connect_bsp_APIC(void)
1738 outb(0x01, 0x23); 1723 outb(0x01, 0x23);
1739 } 1724 }
1740#endif 1725#endif
1741 enable_apic_mode(); 1726 if (apic->enable_apic_mode)
1727 apic->enable_apic_mode();
1742} 1728}
1743 1729
1744/** 1730/**
@@ -1876,29 +1862,39 @@ void __cpuinit generic_processor_info(int apicid, int version)
1876 } 1862 }
1877#endif 1863#endif
1878 1864
1879#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64) 1865#if defined(CONFIG_SMP) || defined(CONFIG_X86_64)
1880 /* are we being called early in kernel startup? */ 1866 early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
1881 if (early_per_cpu_ptr(x86_cpu_to_apicid)) { 1867 early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1882 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
1883 u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
1884
1885 cpu_to_apicid[cpu] = apicid;
1886 bios_cpu_apicid[cpu] = apicid;
1887 } else {
1888 per_cpu(x86_cpu_to_apicid, cpu) = apicid;
1889 per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1890 }
1891#endif 1868#endif
1892 1869
1893 set_cpu_possible(cpu, true); 1870 set_cpu_possible(cpu, true);
1894 set_cpu_present(cpu, true); 1871 set_cpu_present(cpu, true);
1895} 1872}
1896 1873
1897#ifdef CONFIG_X86_64
1898int hard_smp_processor_id(void) 1874int hard_smp_processor_id(void)
1899{ 1875{
1900 return read_apic_id(); 1876 return read_apic_id();
1901} 1877}
1878
1879void default_init_apic_ldr(void)
1880{
1881 unsigned long val;
1882
1883 apic_write(APIC_DFR, APIC_DFR_VALUE);
1884 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
1885 val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id());
1886 apic_write(APIC_LDR, val);
1887}
1888
1889#ifdef CONFIG_X86_32
1890int default_apicid_to_node(int logical_apicid)
1891{
1892#ifdef CONFIG_SMP
1893 return apicid_2_node[hard_smp_processor_id()];
1894#else
1895 return 0;
1896#endif
1897}
1902#endif 1898#endif
1903 1899
1904/* 1900/*
@@ -1976,7 +1972,7 @@ static int lapic_resume(struct sys_device *dev)
1976 1972
1977 local_irq_save(flags); 1973 local_irq_save(flags);
1978 1974
1979#ifdef HAVE_X2APIC 1975#ifdef CONFIG_X86_X2APIC
1980 if (x2apic) 1976 if (x2apic)
1981 enable_x2apic(); 1977 enable_x2apic();
1982 else 1978 else
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 34185488e4fb..f933822dba18 100644
--- a/arch/x86/kernel/genapic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -17,9 +17,8 @@
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/hardirq.h> 18#include <linux/hardirq.h>
19#include <asm/smp.h> 19#include <asm/smp.h>
20#include <asm/apic.h>
20#include <asm/ipi.h> 21#include <asm/ipi.h>
21#include <asm/genapic.h>
22#include <mach_apicdef.h>
23 22
24#ifdef CONFIG_ACPI 23#ifdef CONFIG_ACPI
25#include <acpi/acpi_bus.h> 24#include <acpi/acpi_bus.h>
@@ -74,7 +73,7 @@ static inline void _flat_send_IPI_mask(unsigned long mask, int vector)
74 unsigned long flags; 73 unsigned long flags;
75 74
76 local_irq_save(flags); 75 local_irq_save(flags);
77 __send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL); 76 __default_send_IPI_dest_field(mask, vector, apic->dest_logical);
78 local_irq_restore(flags); 77 local_irq_restore(flags);
79} 78}
80 79
@@ -85,14 +84,15 @@ static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector)
85 _flat_send_IPI_mask(mask, vector); 84 _flat_send_IPI_mask(mask, vector);
86} 85}
87 86
88static void flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, 87static void
89 int vector) 88 flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)
90{ 89{
91 unsigned long mask = cpumask_bits(cpumask)[0]; 90 unsigned long mask = cpumask_bits(cpumask)[0];
92 int cpu = smp_processor_id(); 91 int cpu = smp_processor_id();
93 92
94 if (cpu < BITS_PER_LONG) 93 if (cpu < BITS_PER_LONG)
95 clear_bit(cpu, &mask); 94 clear_bit(cpu, &mask);
95
96 _flat_send_IPI_mask(mask, vector); 96 _flat_send_IPI_mask(mask, vector);
97} 97}
98 98
@@ -114,23 +114,27 @@ static void flat_send_IPI_allbutself(int vector)
114 _flat_send_IPI_mask(mask, vector); 114 _flat_send_IPI_mask(mask, vector);
115 } 115 }
116 } else if (num_online_cpus() > 1) { 116 } else if (num_online_cpus() > 1) {
117 __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL); 117 __default_send_IPI_shortcut(APIC_DEST_ALLBUT,
118 vector, apic->dest_logical);
118 } 119 }
119} 120}
120 121
121static void flat_send_IPI_all(int vector) 122static void flat_send_IPI_all(int vector)
122{ 123{
123 if (vector == NMI_VECTOR) 124 if (vector == NMI_VECTOR) {
124 flat_send_IPI_mask(cpu_online_mask, vector); 125 flat_send_IPI_mask(cpu_online_mask, vector);
125 else 126 } else {
126 __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); 127 __default_send_IPI_shortcut(APIC_DEST_ALLINC,
128 vector, apic->dest_logical);
129 }
127} 130}
128 131
129static unsigned int get_apic_id(unsigned long x) 132static unsigned int flat_get_apic_id(unsigned long x)
130{ 133{
131 unsigned int id; 134 unsigned int id;
132 135
133 id = (((x)>>24) & 0xFFu); 136 id = (((x)>>24) & 0xFFu);
137
134 return id; 138 return id;
135} 139}
136 140
@@ -146,7 +150,7 @@ static unsigned int read_xapic_id(void)
146{ 150{
147 unsigned int id; 151 unsigned int id;
148 152
149 id = get_apic_id(apic_read(APIC_ID)); 153 id = flat_get_apic_id(apic_read(APIC_ID));
150 return id; 154 return id;
151} 155}
152 156
@@ -169,31 +173,67 @@ static unsigned int flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
169 return mask1 & mask2; 173 return mask1 & mask2;
170} 174}
171 175
172static unsigned int phys_pkg_id(int index_msb) 176static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
173{ 177{
174 return hard_smp_processor_id() >> index_msb; 178 return hard_smp_processor_id() >> index_msb;
175} 179}
176 180
177struct genapic apic_flat = { 181struct apic apic_flat = {
178 .name = "flat", 182 .name = "flat",
179 .acpi_madt_oem_check = flat_acpi_madt_oem_check, 183 .probe = NULL,
180 .int_delivery_mode = dest_LowestPrio, 184 .acpi_madt_oem_check = flat_acpi_madt_oem_check,
181 .int_dest_mode = (APIC_DEST_LOGICAL != 0), 185 .apic_id_registered = flat_apic_id_registered,
182 .target_cpus = flat_target_cpus, 186
183 .vector_allocation_domain = flat_vector_allocation_domain, 187 .irq_delivery_mode = dest_LowestPrio,
184 .apic_id_registered = flat_apic_id_registered, 188 .irq_dest_mode = 1, /* logical */
185 .init_apic_ldr = flat_init_apic_ldr, 189
186 .send_IPI_all = flat_send_IPI_all, 190 .target_cpus = flat_target_cpus,
187 .send_IPI_allbutself = flat_send_IPI_allbutself, 191 .disable_esr = 0,
188 .send_IPI_mask = flat_send_IPI_mask, 192 .dest_logical = APIC_DEST_LOGICAL,
189 .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself, 193 .check_apicid_used = NULL,
190 .send_IPI_self = apic_send_IPI_self, 194 .check_apicid_present = NULL,
191 .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, 195
192 .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, 196 .vector_allocation_domain = flat_vector_allocation_domain,
193 .phys_pkg_id = phys_pkg_id, 197 .init_apic_ldr = flat_init_apic_ldr,
194 .get_apic_id = get_apic_id, 198
195 .set_apic_id = set_apic_id, 199 .ioapic_phys_id_map = NULL,
196 .apic_id_mask = (0xFFu<<24), 200 .setup_apic_routing = NULL,
201 .multi_timer_check = NULL,
202 .apicid_to_node = NULL,
203 .cpu_to_logical_apicid = NULL,
204 .cpu_present_to_apicid = default_cpu_present_to_apicid,
205 .apicid_to_cpu_present = NULL,
206 .setup_portio_remap = NULL,
207 .check_phys_apicid_present = default_check_phys_apicid_present,
208 .enable_apic_mode = NULL,
209 .phys_pkg_id = flat_phys_pkg_id,
210 .mps_oem_check = NULL,
211
212 .get_apic_id = flat_get_apic_id,
213 .set_apic_id = set_apic_id,
214 .apic_id_mask = 0xFFu << 24,
215
216 .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
217 .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
218
219 .send_IPI_mask = flat_send_IPI_mask,
220 .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself,
221 .send_IPI_allbutself = flat_send_IPI_allbutself,
222 .send_IPI_all = flat_send_IPI_all,
223 .send_IPI_self = apic_send_IPI_self,
224
225 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
226 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
227 .wait_for_init_deassert = NULL,
228 .smp_callin_clear_local_apic = NULL,
229 .inquire_remote_apic = NULL,
230
231 .read = native_apic_mem_read,
232 .write = native_apic_mem_write,
233 .icr_read = native_apic_icr_read,
234 .icr_write = native_apic_icr_write,
235 .wait_icr_idle = native_apic_wait_icr_idle,
236 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
197}; 237};
198 238
199/* 239/*
@@ -232,18 +272,18 @@ static void physflat_vector_allocation_domain(int cpu, struct cpumask *retmask)
232 272
233static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector) 273static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector)
234{ 274{
235 send_IPI_mask_sequence(cpumask, vector); 275 default_send_IPI_mask_sequence_phys(cpumask, vector);
236} 276}
237 277
238static void physflat_send_IPI_mask_allbutself(const struct cpumask *cpumask, 278static void physflat_send_IPI_mask_allbutself(const struct cpumask *cpumask,
239 int vector) 279 int vector)
240{ 280{
241 send_IPI_mask_allbutself(cpumask, vector); 281 default_send_IPI_mask_allbutself_phys(cpumask, vector);
242} 282}
243 283
244static void physflat_send_IPI_allbutself(int vector) 284static void physflat_send_IPI_allbutself(int vector)
245{ 285{
246 send_IPI_mask_allbutself(cpu_online_mask, vector); 286 default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
247} 287}
248 288
249static void physflat_send_IPI_all(int vector) 289static void physflat_send_IPI_all(int vector)
@@ -276,32 +316,72 @@ physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
276 * We're using fixed IRQ delivery, can only return one phys APIC ID. 316 * We're using fixed IRQ delivery, can only return one phys APIC ID.
277 * May as well be the first. 317 * May as well be the first.
278 */ 318 */
279 for_each_cpu_and(cpu, cpumask, andmask) 319 for_each_cpu_and(cpu, cpumask, andmask) {
280 if (cpumask_test_cpu(cpu, cpu_online_mask)) 320 if (cpumask_test_cpu(cpu, cpu_online_mask))
281 break; 321 break;
322 }
282 if (cpu < nr_cpu_ids) 323 if (cpu < nr_cpu_ids)
283 return per_cpu(x86_cpu_to_apicid, cpu); 324 return per_cpu(x86_cpu_to_apicid, cpu);
325
284 return BAD_APICID; 326 return BAD_APICID;
285} 327}
286 328
287struct genapic apic_physflat = { 329struct apic apic_physflat = {
288 .name = "physical flat", 330
289 .acpi_madt_oem_check = physflat_acpi_madt_oem_check, 331 .name = "physical flat",
290 .int_delivery_mode = dest_Fixed, 332 .probe = NULL,
291 .int_dest_mode = (APIC_DEST_PHYSICAL != 0), 333 .acpi_madt_oem_check = physflat_acpi_madt_oem_check,
292 .target_cpus = physflat_target_cpus, 334 .apic_id_registered = flat_apic_id_registered,
293 .vector_allocation_domain = physflat_vector_allocation_domain, 335
294 .apic_id_registered = flat_apic_id_registered, 336 .irq_delivery_mode = dest_Fixed,
295 .init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/ 337 .irq_dest_mode = 0, /* physical */
296 .send_IPI_all = physflat_send_IPI_all, 338
297 .send_IPI_allbutself = physflat_send_IPI_allbutself, 339 .target_cpus = physflat_target_cpus,
298 .send_IPI_mask = physflat_send_IPI_mask, 340 .disable_esr = 0,
299 .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself, 341 .dest_logical = 0,
300 .send_IPI_self = apic_send_IPI_self, 342 .check_apicid_used = NULL,
301 .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, 343 .check_apicid_present = NULL,
302 .cpu_mask_to_apicid_and = physflat_cpu_mask_to_apicid_and, 344
303 .phys_pkg_id = phys_pkg_id, 345 .vector_allocation_domain = physflat_vector_allocation_domain,
304 .get_apic_id = get_apic_id, 346 /* not needed, but shouldn't hurt: */
305 .set_apic_id = set_apic_id, 347 .init_apic_ldr = flat_init_apic_ldr,
306 .apic_id_mask = (0xFFu<<24), 348
349 .ioapic_phys_id_map = NULL,
350 .setup_apic_routing = NULL,
351 .multi_timer_check = NULL,
352 .apicid_to_node = NULL,
353 .cpu_to_logical_apicid = NULL,
354 .cpu_present_to_apicid = default_cpu_present_to_apicid,
355 .apicid_to_cpu_present = NULL,
356 .setup_portio_remap = NULL,
357 .check_phys_apicid_present = default_check_phys_apicid_present,
358 .enable_apic_mode = NULL,
359 .phys_pkg_id = flat_phys_pkg_id,
360 .mps_oem_check = NULL,
361
362 .get_apic_id = flat_get_apic_id,
363 .set_apic_id = set_apic_id,
364 .apic_id_mask = 0xFFu << 24,
365
366 .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
367 .cpu_mask_to_apicid_and = physflat_cpu_mask_to_apicid_and,
368
369 .send_IPI_mask = physflat_send_IPI_mask,
370 .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself,
371 .send_IPI_allbutself = physflat_send_IPI_allbutself,
372 .send_IPI_all = physflat_send_IPI_all,
373 .send_IPI_self = apic_send_IPI_self,
374
375 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
376 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
377 .wait_for_init_deassert = NULL,
378 .smp_callin_clear_local_apic = NULL,
379 .inquire_remote_apic = NULL,
380
381 .read = native_apic_mem_read,
382 .write = native_apic_mem_write,
383 .icr_read = native_apic_icr_read,
384 .icr_write = native_apic_icr_write,
385 .wait_icr_idle = native_apic_wait_icr_idle,
386 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
307}; 387};
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
new file mode 100644
index 000000000000..d806ecaa948f
--- /dev/null
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -0,0 +1,267 @@
1/*
2 * APIC driver for "bigsmp" xAPIC machines with more than 8 virtual CPUs.
3 *
4 * Drives the local APIC in "clustered mode".
5 */
6#include <linux/threads.h>
7#include <linux/cpumask.h>
8#include <linux/kernel.h>
9#include <linux/init.h>
10#include <linux/dmi.h>
11#include <linux/smp.h>
12
13#include <asm/apicdef.h>
14#include <asm/fixmap.h>
15#include <asm/mpspec.h>
16#include <asm/apic.h>
17#include <asm/ipi.h>
18
19static unsigned bigsmp_get_apic_id(unsigned long x)
20{
21 return (x >> 24) & 0xFF;
22}
23
24static int bigsmp_apic_id_registered(void)
25{
26 return 1;
27}
28
29static const cpumask_t *bigsmp_target_cpus(void)
30{
31#ifdef CONFIG_SMP
32 return &cpu_online_map;
33#else
34 return &cpumask_of_cpu(0);
35#endif
36}
37
38static unsigned long bigsmp_check_apicid_used(physid_mask_t bitmap, int apicid)
39{
40 return 0;
41}
42
43static unsigned long bigsmp_check_apicid_present(int bit)
44{
45 return 1;
46}
47
48static inline unsigned long calculate_ldr(int cpu)
49{
50 unsigned long val, id;
51
52 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
53 id = per_cpu(x86_bios_cpu_apicid, cpu);
54 val |= SET_APIC_LOGICAL_ID(id);
55
56 return val;
57}
58
59/*
60 * Set up the logical destination ID.
61 *
62 * Intel recommends to set DFR, LDR and TPR before enabling
63 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
64 * document number 292116). So here it goes...
65 */
66static void bigsmp_init_apic_ldr(void)
67{
68 unsigned long val;
69 int cpu = smp_processor_id();
70
71 apic_write(APIC_DFR, APIC_DFR_FLAT);
72 val = calculate_ldr(cpu);
73 apic_write(APIC_LDR, val);
74}
75
76static void bigsmp_setup_apic_routing(void)
77{
78 printk(KERN_INFO
79 "Enabling APIC mode: Physflat. Using %d I/O APICs\n",
80 nr_ioapics);
81}
82
83static int bigsmp_apicid_to_node(int logical_apicid)
84{
85 return apicid_2_node[hard_smp_processor_id()];
86}
87
88static int bigsmp_cpu_present_to_apicid(int mps_cpu)
89{
90 if (mps_cpu < nr_cpu_ids)
91 return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
92
93 return BAD_APICID;
94}
95
96static physid_mask_t bigsmp_apicid_to_cpu_present(int phys_apicid)
97{
98 return physid_mask_of_physid(phys_apicid);
99}
100
101/* Mapping from cpu number to logical apicid */
102static inline int bigsmp_cpu_to_logical_apicid(int cpu)
103{
104 if (cpu >= nr_cpu_ids)
105 return BAD_APICID;
106 return cpu_physical_id(cpu);
107}
108
109static physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map)
110{
111 /* For clustered we don't have a good way to do this yet - hack */
112 return physids_promote(0xFFL);
113}
114
115static int bigsmp_check_phys_apicid_present(int boot_cpu_physical_apicid)
116{
117 return 1;
118}
119
120/* As we are using single CPU as destination, pick only one CPU here */
121static unsigned int bigsmp_cpu_mask_to_apicid(const cpumask_t *cpumask)
122{
123 return bigsmp_cpu_to_logical_apicid(first_cpu(*cpumask));
124}
125
126static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
127 const struct cpumask *andmask)
128{
129 int cpu;
130
131 /*
132 * We're using fixed IRQ delivery, can only return one phys APIC ID.
133 * May as well be the first.
134 */
135 for_each_cpu_and(cpu, cpumask, andmask) {
136 if (cpumask_test_cpu(cpu, cpu_online_mask))
137 break;
138 }
139 if (cpu < nr_cpu_ids)
140 return bigsmp_cpu_to_logical_apicid(cpu);
141
142 return BAD_APICID;
143}
144
145static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
146{
147 return cpuid_apic >> index_msb;
148}
149
150static inline void bigsmp_send_IPI_mask(const struct cpumask *mask, int vector)
151{
152 default_send_IPI_mask_sequence_phys(mask, vector);
153}
154
155static void bigsmp_send_IPI_allbutself(int vector)
156{
157 default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
158}
159
160static void bigsmp_send_IPI_all(int vector)
161{
162 bigsmp_send_IPI_mask(cpu_online_mask, vector);
163}
164
165static int dmi_bigsmp; /* can be set by dmi scanners */
166
167static int hp_ht_bigsmp(const struct dmi_system_id *d)
168{
169 printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident);
170 dmi_bigsmp = 1;
171
172 return 0;
173}
174
175
176static const struct dmi_system_id bigsmp_dmi_table[] = {
177 { hp_ht_bigsmp, "HP ProLiant DL760 G2",
178 { DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
179 DMI_MATCH(DMI_BIOS_VERSION, "P44-"),
180 }
181 },
182
183 { hp_ht_bigsmp, "HP ProLiant DL740",
184 { DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
185 DMI_MATCH(DMI_BIOS_VERSION, "P47-"),
186 }
187 },
188 { } /* NULL entry stops DMI scanning */
189};
190
191static void bigsmp_vector_allocation_domain(int cpu, cpumask_t *retmask)
192{
193 cpus_clear(*retmask);
194 cpu_set(cpu, *retmask);
195}
196
197static int probe_bigsmp(void)
198{
199 if (def_to_bigsmp)
200 dmi_bigsmp = 1;
201 else
202 dmi_check_system(bigsmp_dmi_table);
203
204 return dmi_bigsmp;
205}
206
207struct apic apic_bigsmp = {
208
209 .name = "bigsmp",
210 .probe = probe_bigsmp,
211 .acpi_madt_oem_check = NULL,
212 .apic_id_registered = bigsmp_apic_id_registered,
213
214 .irq_delivery_mode = dest_Fixed,
215 /* phys delivery to target CPU: */
216 .irq_dest_mode = 0,
217
218 .target_cpus = bigsmp_target_cpus,
219 .disable_esr = 1,
220 .dest_logical = 0,
221 .check_apicid_used = bigsmp_check_apicid_used,
222 .check_apicid_present = bigsmp_check_apicid_present,
223
224 .vector_allocation_domain = bigsmp_vector_allocation_domain,
225 .init_apic_ldr = bigsmp_init_apic_ldr,
226
227 .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map,
228 .setup_apic_routing = bigsmp_setup_apic_routing,
229 .multi_timer_check = NULL,
230 .apicid_to_node = bigsmp_apicid_to_node,
231 .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid,
232 .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid,
233 .apicid_to_cpu_present = bigsmp_apicid_to_cpu_present,
234 .setup_portio_remap = NULL,
235 .check_phys_apicid_present = bigsmp_check_phys_apicid_present,
236 .enable_apic_mode = NULL,
237 .phys_pkg_id = bigsmp_phys_pkg_id,
238 .mps_oem_check = NULL,
239
240 .get_apic_id = bigsmp_get_apic_id,
241 .set_apic_id = NULL,
242 .apic_id_mask = 0xFF << 24,
243
244 .cpu_mask_to_apicid = bigsmp_cpu_mask_to_apicid,
245 .cpu_mask_to_apicid_and = bigsmp_cpu_mask_to_apicid_and,
246
247 .send_IPI_mask = bigsmp_send_IPI_mask,
248 .send_IPI_mask_allbutself = NULL,
249 .send_IPI_allbutself = bigsmp_send_IPI_allbutself,
250 .send_IPI_all = bigsmp_send_IPI_all,
251 .send_IPI_self = default_send_IPI_self,
252
253 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
254 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
255
256 .wait_for_init_deassert = default_wait_for_init_deassert,
257
258 .smp_callin_clear_local_apic = NULL,
259 .inquire_remote_apic = default_inquire_remote_apic,
260
261 .read = native_apic_mem_read,
262 .write = native_apic_mem_write,
263 .icr_read = native_apic_icr_read,
264 .icr_write = native_apic_icr_write,
265 .wait_icr_idle = native_apic_wait_icr_idle,
266 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
267};
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
new file mode 100644
index 000000000000..19588f2770ee
--- /dev/null
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -0,0 +1,780 @@
1/*
2 * Written by: Garry Forsgren, Unisys Corporation
3 * Natalie Protasevich, Unisys Corporation
4 *
5 * This file contains the code to configure and interface
6 * with Unisys ES7000 series hardware system manager.
7 *
8 * Copyright (c) 2003 Unisys Corporation.
9 * Copyright (C) 2009, Red Hat, Inc., Ingo Molnar
10 *
11 * All Rights Reserved.
12 *
13 * This program is free software; you can redistribute it and/or modify it
14 * under the terms of version 2 of the GNU General Public License as
15 * published by the Free Software Foundation.
16 *
17 * This program is distributed in the hope that it would be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
20 *
21 * You should have received a copy of the GNU General Public License along
22 * with this program; if not, write the Free Software Foundation, Inc., 59
23 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
24 *
25 * Contact information: Unisys Corporation, Township Line & Union Meeting
26 * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or:
27 *
28 * http://www.unisys.com
29 */
30#include <linux/notifier.h>
31#include <linux/spinlock.h>
32#include <linux/cpumask.h>
33#include <linux/threads.h>
34#include <linux/kernel.h>
35#include <linux/module.h>
36#include <linux/reboot.h>
37#include <linux/string.h>
38#include <linux/types.h>
39#include <linux/errno.h>
40#include <linux/acpi.h>
41#include <linux/init.h>
42#include <linux/nmi.h>
43#include <linux/smp.h>
44#include <linux/io.h>
45
46#include <asm/apicdef.h>
47#include <asm/atomic.h>
48#include <asm/fixmap.h>
49#include <asm/mpspec.h>
50#include <asm/setup.h>
51#include <asm/apic.h>
52#include <asm/ipi.h>
53
54/*
55 * ES7000 chipsets
56 */
57
58#define NON_UNISYS 0
59#define ES7000_CLASSIC 1
60#define ES7000_ZORRO 2
61
62#define MIP_REG 1
63#define MIP_PSAI_REG 4
64
65#define MIP_BUSY 1
66#define MIP_SPIN 0xf0000
67#define MIP_VALID 0x0100000000000000ULL
68#define MIP_SW_APIC 0x1020b
69
70#define MIP_PORT(val) ((val >> 32) & 0xffff)
71
72#define MIP_RD_LO(val) (val & 0xffffffff)
73
74struct mip_reg {
75 unsigned long long off_0x00;
76 unsigned long long off_0x08;
77 unsigned long long off_0x10;
78 unsigned long long off_0x18;
79 unsigned long long off_0x20;
80 unsigned long long off_0x28;
81 unsigned long long off_0x30;
82 unsigned long long off_0x38;
83};
84
85struct mip_reg_info {
86 unsigned long long mip_info;
87 unsigned long long delivery_info;
88 unsigned long long host_reg;
89 unsigned long long mip_reg;
90};
91
92struct psai {
93 unsigned long long entry_type;
94 unsigned long long addr;
95 unsigned long long bep_addr;
96};
97
98#ifdef CONFIG_ACPI
99
100struct es7000_oem_table {
101 struct acpi_table_header Header;
102 u32 OEMTableAddr;
103 u32 OEMTableSize;
104};
105
106static unsigned long oem_addrX;
107static unsigned long oem_size;
108
109#endif
110
111/*
112 * ES7000 Globals
113 */
114
115static volatile unsigned long *psai;
116static struct mip_reg *mip_reg;
117static struct mip_reg *host_reg;
118static int mip_port;
119static unsigned long mip_addr;
120static unsigned long host_addr;
121
122int es7000_plat;
123
124/*
125 * GSI override for ES7000 platforms.
126 */
127
128static unsigned int base;
129
130static int
131es7000_rename_gsi(int ioapic, int gsi)
132{
133 if (es7000_plat == ES7000_ZORRO)
134 return gsi;
135
136 if (!base) {
137 int i;
138 for (i = 0; i < nr_ioapics; i++)
139 base += nr_ioapic_registers[i];
140 }
141
142 if (!ioapic && (gsi < 16))
143 gsi += base;
144
145 return gsi;
146}
147
148static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
149{
150 unsigned long vect = 0, psaival = 0;
151
152 if (psai == NULL)
153 return -1;
154
155 vect = ((unsigned long)__pa(eip)/0x1000) << 16;
156 psaival = (0x1000000 | vect | cpu);
157
158 while (*psai & 0x1000000)
159 ;
160
161 *psai = psaival;
162
163 return 0;
164}
165
166static int es7000_apic_is_cluster(void)
167{
168 /* MPENTIUMIII */
169 if (boot_cpu_data.x86 == 6 &&
170 (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11))
171 return 1;
172
173 return 0;
174}
175
176static void setup_unisys(void)
177{
178 /*
179 * Determine the generation of the ES7000 currently running.
180 *
181 * es7000_plat = 1 if the machine is a 5xx ES7000 box
182 * es7000_plat = 2 if the machine is a x86_64 ES7000 box
183 *
184 */
185 if (!(boot_cpu_data.x86 <= 15 && boot_cpu_data.x86_model <= 2))
186 es7000_plat = ES7000_ZORRO;
187 else
188 es7000_plat = ES7000_CLASSIC;
189 ioapic_renumber_irq = es7000_rename_gsi;
190}
191
192/*
193 * Parse the OEM Table:
194 */
195static int parse_unisys_oem(char *oemptr)
196{
197 int i;
198 int success = 0;
199 unsigned char type, size;
200 unsigned long val;
201 char *tp = NULL;
202 struct psai *psaip = NULL;
203 struct mip_reg_info *mi;
204 struct mip_reg *host, *mip;
205
206 tp = oemptr;
207
208 tp += 8;
209
210 for (i = 0; i <= 6; i++) {
211 type = *tp++;
212 size = *tp++;
213 tp -= 2;
214 switch (type) {
215 case MIP_REG:
216 mi = (struct mip_reg_info *)tp;
217 val = MIP_RD_LO(mi->host_reg);
218 host_addr = val;
219 host = (struct mip_reg *)val;
220 host_reg = __va(host);
221 val = MIP_RD_LO(mi->mip_reg);
222 mip_port = MIP_PORT(mi->mip_info);
223 mip_addr = val;
224 mip = (struct mip_reg *)val;
225 mip_reg = __va(mip);
226 pr_debug("es7000_mipcfg: host_reg = 0x%lx \n",
227 (unsigned long)host_reg);
228 pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n",
229 (unsigned long)mip_reg);
230 success++;
231 break;
232 case MIP_PSAI_REG:
233 psaip = (struct psai *)tp;
234 if (tp != NULL) {
235 if (psaip->addr)
236 psai = __va(psaip->addr);
237 else
238 psai = NULL;
239 success++;
240 }
241 break;
242 default:
243 break;
244 }
245 tp += size;
246 }
247
248 if (success < 2)
249 es7000_plat = NON_UNISYS;
250 else
251 setup_unisys();
252
253 return es7000_plat;
254}
255
256#ifdef CONFIG_ACPI
257static int find_unisys_acpi_oem_table(unsigned long *oem_addr)
258{
259 struct acpi_table_header *header = NULL;
260 struct es7000_oem_table *table;
261 acpi_size tbl_size;
262 acpi_status ret;
263 int i = 0;
264
265 for (;;) {
266 ret = acpi_get_table_with_size("OEM1", i++, &header, &tbl_size);
267 if (!ACPI_SUCCESS(ret))
268 return -1;
269
270 if (!memcmp((char *) &header->oem_id, "UNISYS", 6))
271 break;
272
273 early_acpi_os_unmap_memory(header, tbl_size);
274 }
275
276 table = (void *)header;
277
278 oem_addrX = table->OEMTableAddr;
279 oem_size = table->OEMTableSize;
280
281 early_acpi_os_unmap_memory(header, tbl_size);
282
283 *oem_addr = (unsigned long)__acpi_map_table(oem_addrX, oem_size);
284
285 return 0;
286}
287
288static void unmap_unisys_acpi_oem_table(unsigned long oem_addr)
289{
290 if (!oem_addr)
291 return;
292
293 __acpi_unmap_table((char *)oem_addr, oem_size);
294}
295
296static int es7000_check_dsdt(void)
297{
298 struct acpi_table_header header;
299
300 if (ACPI_SUCCESS(acpi_get_table_header(ACPI_SIG_DSDT, 0, &header)) &&
301 !strncmp(header.oem_id, "UNISYS", 6))
302 return 1;
303 return 0;
304}
305
306static int es7000_acpi_ret;
307
308/* Hook from generic ACPI tables.c */
309static int es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
310{
311 unsigned long oem_addr = 0;
312 int check_dsdt;
313 int ret = 0;
314
315 /* check dsdt at first to avoid clear fix_map for oem_addr */
316 check_dsdt = es7000_check_dsdt();
317
318 if (!find_unisys_acpi_oem_table(&oem_addr)) {
319 if (check_dsdt) {
320 ret = parse_unisys_oem((char *)oem_addr);
321 } else {
322 setup_unisys();
323 ret = 1;
324 }
325 /*
326 * we need to unmap it
327 */
328 unmap_unisys_acpi_oem_table(oem_addr);
329 }
330
331 es7000_acpi_ret = ret;
332
333 return ret && !es7000_apic_is_cluster();
334}
335
336static int es7000_acpi_madt_oem_check_cluster(char *oem_id, char *oem_table_id)
337{
338 int ret = es7000_acpi_ret;
339
340 return ret && es7000_apic_is_cluster();
341}
342
343#else /* !CONFIG_ACPI: */
344static int es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
345{
346 return 0;
347}
348
349static int es7000_acpi_madt_oem_check_cluster(char *oem_id, char *oem_table_id)
350{
351 return 0;
352}
353#endif /* !CONFIG_ACPI */
354
355static void es7000_spin(int n)
356{
357 int i = 0;
358
359 while (i++ < n)
360 rep_nop();
361}
362
363static int es7000_mip_write(struct mip_reg *mip_reg)
364{
365 int status = 0;
366 int spin;
367
368 spin = MIP_SPIN;
369 while ((host_reg->off_0x38 & MIP_VALID) != 0) {
370 if (--spin <= 0) {
371 WARN(1, "Timeout waiting for Host Valid Flag\n");
372 return -1;
373 }
374 es7000_spin(MIP_SPIN);
375 }
376
377 memcpy(host_reg, mip_reg, sizeof(struct mip_reg));
378 outb(1, mip_port);
379
380 spin = MIP_SPIN;
381
382 while ((mip_reg->off_0x38 & MIP_VALID) == 0) {
383 if (--spin <= 0) {
384 WARN(1, "Timeout waiting for MIP Valid Flag\n");
385 return -1;
386 }
387 es7000_spin(MIP_SPIN);
388 }
389
390 status = (mip_reg->off_0x00 & 0xffff0000000000ULL) >> 48;
391 mip_reg->off_0x38 &= ~MIP_VALID;
392
393 return status;
394}
395
396static void es7000_enable_apic_mode(void)
397{
398 struct mip_reg es7000_mip_reg;
399 int mip_status;
400
401 if (!es7000_plat)
402 return;
403
404 printk(KERN_INFO "ES7000: Enabling APIC mode.\n");
405 memset(&es7000_mip_reg, 0, sizeof(struct mip_reg));
406 es7000_mip_reg.off_0x00 = MIP_SW_APIC;
407 es7000_mip_reg.off_0x38 = MIP_VALID;
408
409 while ((mip_status = es7000_mip_write(&es7000_mip_reg)) != 0)
410 WARN(1, "Command failed, status = %x\n", mip_status);
411}
412
413static void es7000_vector_allocation_domain(int cpu, cpumask_t *retmask)
414{
415 /* Careful. Some cpus do not strictly honor the set of cpus
416 * specified in the interrupt destination when using lowest
417 * priority interrupt delivery mode.
418 *
419 * In particular there was a hyperthreading cpu observed to
420 * deliver interrupts to the wrong hyperthread when only one
421 * hyperthread was specified in the interrupt desitination.
422 */
423 *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } };
424}
425
426
427static void es7000_wait_for_init_deassert(atomic_t *deassert)
428{
429 while (!atomic_read(deassert))
430 cpu_relax();
431}
432
433static unsigned int es7000_get_apic_id(unsigned long x)
434{
435 return (x >> 24) & 0xFF;
436}
437
438static void es7000_send_IPI_mask(const struct cpumask *mask, int vector)
439{
440 default_send_IPI_mask_sequence_phys(mask, vector);
441}
442
443static void es7000_send_IPI_allbutself(int vector)
444{
445 default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
446}
447
448static void es7000_send_IPI_all(int vector)
449{
450 es7000_send_IPI_mask(cpu_online_mask, vector);
451}
452
453static int es7000_apic_id_registered(void)
454{
455 return 1;
456}
457
458static const cpumask_t *target_cpus_cluster(void)
459{
460 return &CPU_MASK_ALL;
461}
462
463static const cpumask_t *es7000_target_cpus(void)
464{
465 return &cpumask_of_cpu(smp_processor_id());
466}
467
468static unsigned long
469es7000_check_apicid_used(physid_mask_t bitmap, int apicid)
470{
471 return 0;
472}
473static unsigned long es7000_check_apicid_present(int bit)
474{
475 return physid_isset(bit, phys_cpu_present_map);
476}
477
478static unsigned long calculate_ldr(int cpu)
479{
480 unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu);
481
482 return SET_APIC_LOGICAL_ID(id);
483}
484
485/*
486 * Set up the logical destination ID.
487 *
488 * Intel recommends to set DFR, LdR and TPR before enabling
489 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
490 * document number 292116). So here it goes...
491 */
492static void es7000_init_apic_ldr_cluster(void)
493{
494 unsigned long val;
495 int cpu = smp_processor_id();
496
497 apic_write(APIC_DFR, APIC_DFR_CLUSTER);
498 val = calculate_ldr(cpu);
499 apic_write(APIC_LDR, val);
500}
501
502static void es7000_init_apic_ldr(void)
503{
504 unsigned long val;
505 int cpu = smp_processor_id();
506
507 apic_write(APIC_DFR, APIC_DFR_FLAT);
508 val = calculate_ldr(cpu);
509 apic_write(APIC_LDR, val);
510}
511
512static void es7000_setup_apic_routing(void)
513{
514 int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id());
515
516 printk(KERN_INFO
517 "Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
518 (apic_version[apic] == 0x14) ?
519 "Physical Cluster" : "Logical Cluster",
520 nr_ioapics, cpus_addr(*es7000_target_cpus())[0]);
521}
522
523static int es7000_apicid_to_node(int logical_apicid)
524{
525 return 0;
526}
527
528
529static int es7000_cpu_present_to_apicid(int mps_cpu)
530{
531 if (!mps_cpu)
532 return boot_cpu_physical_apicid;
533 else if (mps_cpu < nr_cpu_ids)
534 return per_cpu(x86_bios_cpu_apicid, mps_cpu);
535 else
536 return BAD_APICID;
537}
538
539static int cpu_id;
540
541static physid_mask_t es7000_apicid_to_cpu_present(int phys_apicid)
542{
543 physid_mask_t mask;
544
545 mask = physid_mask_of_physid(cpu_id);
546 ++cpu_id;
547
548 return mask;
549}
550
551/* Mapping from cpu number to logical apicid */
552static int es7000_cpu_to_logical_apicid(int cpu)
553{
554#ifdef CONFIG_SMP
555 if (cpu >= nr_cpu_ids)
556 return BAD_APICID;
557 return cpu_2_logical_apicid[cpu];
558#else
559 return logical_smp_processor_id();
560#endif
561}
562
563static physid_mask_t es7000_ioapic_phys_id_map(physid_mask_t phys_map)
564{
565 /* For clustered we don't have a good way to do this yet - hack */
566 return physids_promote(0xff);
567}
568
569static int es7000_check_phys_apicid_present(int cpu_physical_apicid)
570{
571 boot_cpu_physical_apicid = read_apic_id();
572 return 1;
573}
574
575static unsigned int es7000_cpu_mask_to_apicid(const cpumask_t *cpumask)
576{
577 unsigned int round = 0;
578 int cpu, uninitialized_var(apicid);
579
580 /*
581 * The cpus in the mask must all be on the apic cluster.
582 */
583 for_each_cpu(cpu, cpumask) {
584 int new_apicid = es7000_cpu_to_logical_apicid(cpu);
585
586 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
587 WARN(1, "Not a valid mask!");
588
589 return BAD_APICID;
590 }
591 apicid = new_apicid;
592 round++;
593 }
594 return apicid;
595}
596
597static unsigned int
598es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask,
599 const struct cpumask *andmask)
600{
601 int apicid = es7000_cpu_to_logical_apicid(0);
602 cpumask_var_t cpumask;
603
604 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
605 return apicid;
606
607 cpumask_and(cpumask, inmask, andmask);
608 cpumask_and(cpumask, cpumask, cpu_online_mask);
609 apicid = es7000_cpu_mask_to_apicid(cpumask);
610
611 free_cpumask_var(cpumask);
612
613 return apicid;
614}
615
616static int es7000_phys_pkg_id(int cpuid_apic, int index_msb)
617{
618 return cpuid_apic >> index_msb;
619}
620
621static int probe_es7000(void)
622{
623 /* probed later in mptable/ACPI hooks */
624 return 0;
625}
626
627static int es7000_mps_ret;
628static int es7000_mps_oem_check(struct mpc_table *mpc, char *oem,
629 char *productid)
630{
631 int ret = 0;
632
633 if (mpc->oemptr) {
634 struct mpc_oemtable *oem_table =
635 (struct mpc_oemtable *)mpc->oemptr;
636
637 if (!strncmp(oem, "UNISYS", 6))
638 ret = parse_unisys_oem((char *)oem_table);
639 }
640
641 es7000_mps_ret = ret;
642
643 return ret && !es7000_apic_is_cluster();
644}
645
646static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem,
647 char *productid)
648{
649 int ret = es7000_mps_ret;
650
651 return ret && es7000_apic_is_cluster();
652}
653
654struct apic apic_es7000_cluster = {
655
656 .name = "es7000",
657 .probe = probe_es7000,
658 .acpi_madt_oem_check = es7000_acpi_madt_oem_check_cluster,
659 .apic_id_registered = es7000_apic_id_registered,
660
661 .irq_delivery_mode = dest_LowestPrio,
662 /* logical delivery broadcast to all procs: */
663 .irq_dest_mode = 1,
664
665 .target_cpus = target_cpus_cluster,
666 .disable_esr = 1,
667 .dest_logical = 0,
668 .check_apicid_used = es7000_check_apicid_used,
669 .check_apicid_present = es7000_check_apicid_present,
670
671 .vector_allocation_domain = es7000_vector_allocation_domain,
672 .init_apic_ldr = es7000_init_apic_ldr_cluster,
673
674 .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
675 .setup_apic_routing = es7000_setup_apic_routing,
676 .multi_timer_check = NULL,
677 .apicid_to_node = es7000_apicid_to_node,
678 .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid,
679 .cpu_present_to_apicid = es7000_cpu_present_to_apicid,
680 .apicid_to_cpu_present = es7000_apicid_to_cpu_present,
681 .setup_portio_remap = NULL,
682 .check_phys_apicid_present = es7000_check_phys_apicid_present,
683 .enable_apic_mode = es7000_enable_apic_mode,
684 .phys_pkg_id = es7000_phys_pkg_id,
685 .mps_oem_check = es7000_mps_oem_check_cluster,
686
687 .get_apic_id = es7000_get_apic_id,
688 .set_apic_id = NULL,
689 .apic_id_mask = 0xFF << 24,
690
691 .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid,
692 .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and,
693
694 .send_IPI_mask = es7000_send_IPI_mask,
695 .send_IPI_mask_allbutself = NULL,
696 .send_IPI_allbutself = es7000_send_IPI_allbutself,
697 .send_IPI_all = es7000_send_IPI_all,
698 .send_IPI_self = default_send_IPI_self,
699
700 .wakeup_secondary_cpu = wakeup_secondary_cpu_via_mip,
701
702 .trampoline_phys_low = 0x467,
703 .trampoline_phys_high = 0x469,
704
705 .wait_for_init_deassert = NULL,
706
707 /* Nothing to do for most platforms, since cleared by the INIT cycle: */
708 .smp_callin_clear_local_apic = NULL,
709 .inquire_remote_apic = default_inquire_remote_apic,
710
711 .read = native_apic_mem_read,
712 .write = native_apic_mem_write,
713 .icr_read = native_apic_icr_read,
714 .icr_write = native_apic_icr_write,
715 .wait_icr_idle = native_apic_wait_icr_idle,
716 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
717};
718
719struct apic apic_es7000 = {
720
721 .name = "es7000",
722 .probe = probe_es7000,
723 .acpi_madt_oem_check = es7000_acpi_madt_oem_check,
724 .apic_id_registered = es7000_apic_id_registered,
725
726 .irq_delivery_mode = dest_Fixed,
727 /* phys delivery to target CPUs: */
728 .irq_dest_mode = 0,
729
730 .target_cpus = es7000_target_cpus,
731 .disable_esr = 1,
732 .dest_logical = 0,
733 .check_apicid_used = es7000_check_apicid_used,
734 .check_apicid_present = es7000_check_apicid_present,
735
736 .vector_allocation_domain = es7000_vector_allocation_domain,
737 .init_apic_ldr = es7000_init_apic_ldr,
738
739 .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
740 .setup_apic_routing = es7000_setup_apic_routing,
741 .multi_timer_check = NULL,
742 .apicid_to_node = es7000_apicid_to_node,
743 .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid,
744 .cpu_present_to_apicid = es7000_cpu_present_to_apicid,
745 .apicid_to_cpu_present = es7000_apicid_to_cpu_present,
746 .setup_portio_remap = NULL,
747 .check_phys_apicid_present = es7000_check_phys_apicid_present,
748 .enable_apic_mode = es7000_enable_apic_mode,
749 .phys_pkg_id = es7000_phys_pkg_id,
750 .mps_oem_check = es7000_mps_oem_check,
751
752 .get_apic_id = es7000_get_apic_id,
753 .set_apic_id = NULL,
754 .apic_id_mask = 0xFF << 24,
755
756 .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid,
757 .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and,
758
759 .send_IPI_mask = es7000_send_IPI_mask,
760 .send_IPI_mask_allbutself = NULL,
761 .send_IPI_allbutself = es7000_send_IPI_allbutself,
762 .send_IPI_all = es7000_send_IPI_all,
763 .send_IPI_self = default_send_IPI_self,
764
765 .trampoline_phys_low = 0x467,
766 .trampoline_phys_high = 0x469,
767
768 .wait_for_init_deassert = es7000_wait_for_init_deassert,
769
770 /* Nothing to do for most platforms, since cleared by the INIT cycle: */
771 .smp_callin_clear_local_apic = NULL,
772 .inquire_remote_apic = default_inquire_remote_apic,
773
774 .read = native_apic_mem_read,
775 .write = native_apic_mem_write,
776 .icr_read = native_apic_icr_read,
777 .icr_write = native_apic_icr_write,
778 .wait_icr_idle = native_apic_wait_icr_idle,
779 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
780};
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index bc7ac4da90d7..00e6071cefc4 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Intel IO-APIC support for multi-Pentium hosts. 2 * Intel IO-APIC support for multi-Pentium hosts.
3 * 3 *
4 * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo 4 * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
5 * 5 *
6 * Many thanks to Stig Venaas for trying out countless experimental 6 * Many thanks to Stig Venaas for trying out countless experimental
7 * patches and reporting/debugging problems patiently! 7 * patches and reporting/debugging problems patiently!
@@ -46,6 +46,7 @@
46#include <asm/idle.h> 46#include <asm/idle.h>
47#include <asm/io.h> 47#include <asm/io.h>
48#include <asm/smp.h> 48#include <asm/smp.h>
49#include <asm/cpu.h>
49#include <asm/desc.h> 50#include <asm/desc.h>
50#include <asm/proto.h> 51#include <asm/proto.h>
51#include <asm/acpi.h> 52#include <asm/acpi.h>
@@ -61,9 +62,7 @@
61#include <asm/uv/uv_hub.h> 62#include <asm/uv/uv_hub.h>
62#include <asm/uv/uv_irq.h> 63#include <asm/uv/uv_irq.h>
63 64
64#include <mach_ipi.h> 65#include <asm/apic.h>
65#include <mach_apic.h>
66#include <mach_apicdef.h>
67 66
68#define __apicdebuginit(type) static type __init 67#define __apicdebuginit(type) static type __init
69 68
@@ -82,11 +81,11 @@ static DEFINE_SPINLOCK(vector_lock);
82int nr_ioapic_registers[MAX_IO_APICS]; 81int nr_ioapic_registers[MAX_IO_APICS];
83 82
84/* I/O APIC entries */ 83/* I/O APIC entries */
85struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; 84struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
86int nr_ioapics; 85int nr_ioapics;
87 86
88/* MP IRQ source entries */ 87/* MP IRQ source entries */
89struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; 88struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
90 89
91/* # of MP IRQ source entries */ 90/* # of MP IRQ source entries */
92int mp_irq_entries; 91int mp_irq_entries;
@@ -99,10 +98,19 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
99 98
100int skip_ioapic_setup; 99int skip_ioapic_setup;
101 100
101void arch_disable_smp_support(void)
102{
103#ifdef CONFIG_PCI
104 noioapicquirk = 1;
105 noioapicreroute = -1;
106#endif
107 skip_ioapic_setup = 1;
108}
109
102static int __init parse_noapic(char *str) 110static int __init parse_noapic(char *str)
103{ 111{
104 /* disable IO-APIC */ 112 /* disable IO-APIC */
105 disable_ioapic_setup(); 113 arch_disable_smp_support();
106 return 0; 114 return 0;
107} 115}
108early_param("noapic", parse_noapic); 116early_param("noapic", parse_noapic);
@@ -356,7 +364,7 @@ set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
356 364
357 if (!cfg->move_in_progress) { 365 if (!cfg->move_in_progress) {
358 /* it means that domain is not changed */ 366 /* it means that domain is not changed */
359 if (!cpumask_intersects(&desc->affinity, mask)) 367 if (!cpumask_intersects(desc->affinity, mask))
360 cfg->move_desc_pending = 1; 368 cfg->move_desc_pending = 1;
361 } 369 }
362} 370}
@@ -386,7 +394,7 @@ struct io_apic {
386static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) 394static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
387{ 395{
388 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) 396 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
389 + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK); 397 + (mp_ioapics[idx].apicaddr & ~PAGE_MASK);
390} 398}
391 399
392static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) 400static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
@@ -478,7 +486,7 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
478 io_apic_write(apic, 0x10 + 2*pin, eu.w1); 486 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
479} 487}
480 488
481static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) 489void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
482{ 490{
483 unsigned long flags; 491 unsigned long flags;
484 spin_lock_irqsave(&ioapic_lock, flags); 492 spin_lock_irqsave(&ioapic_lock, flags);
@@ -513,11 +521,11 @@ static void send_cleanup_vector(struct irq_cfg *cfg)
513 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) 521 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
514 cfg->move_cleanup_count++; 522 cfg->move_cleanup_count++;
515 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) 523 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
516 send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); 524 apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
517 } else { 525 } else {
518 cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); 526 cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
519 cfg->move_cleanup_count = cpumask_weight(cleanup_mask); 527 cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
520 send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); 528 apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
521 free_cpumask_var(cleanup_mask); 529 free_cpumask_var(cleanup_mask);
522 } 530 }
523 cfg->move_in_progress = 0; 531 cfg->move_in_progress = 0;
@@ -562,8 +570,9 @@ static int
562assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); 570assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
563 571
564/* 572/*
565 * Either sets desc->affinity to a valid value, and returns cpu_mask_to_apicid 573 * Either sets desc->affinity to a valid value, and returns
566 * of that, or returns BAD_APICID and leaves desc->affinity untouched. 574 * ->cpu_mask_to_apicid of that, or returns BAD_APICID and
575 * leaves desc->affinity untouched.
567 */ 576 */
568static unsigned int 577static unsigned int
569set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) 578set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
@@ -579,9 +588,10 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
579 if (assign_irq_vector(irq, cfg, mask)) 588 if (assign_irq_vector(irq, cfg, mask))
580 return BAD_APICID; 589 return BAD_APICID;
581 590
582 cpumask_and(&desc->affinity, cfg->domain, mask); 591 cpumask_and(desc->affinity, cfg->domain, mask);
583 set_extra_move_desc(desc, mask); 592 set_extra_move_desc(desc, mask);
584 return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask); 593
594 return apic->cpu_mask_to_apicid_and(desc->affinity, cpu_online_mask);
585} 595}
586 596
587static void 597static void
@@ -796,23 +806,6 @@ static void clear_IO_APIC (void)
796 clear_IO_APIC_pin(apic, pin); 806 clear_IO_APIC_pin(apic, pin);
797} 807}
798 808
799#if !defined(CONFIG_SMP) && defined(CONFIG_X86_32)
800void send_IPI_self(int vector)
801{
802 unsigned int cfg;
803
804 /*
805 * Wait for idle.
806 */
807 apic_wait_icr_idle();
808 cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
809 /*
810 * Send the IPI. The write to APIC_ICR fires this off.
811 */
812 apic_write(APIC_ICR, cfg);
813}
814#endif /* !CONFIG_SMP && CONFIG_X86_32*/
815
816#ifdef CONFIG_X86_32 809#ifdef CONFIG_X86_32
817/* 810/*
818 * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to 811 * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
@@ -820,8 +813,9 @@ void send_IPI_self(int vector)
820 */ 813 */
821 814
822#define MAX_PIRQS 8 815#define MAX_PIRQS 8
823static int pirq_entries [MAX_PIRQS]; 816static int pirq_entries[MAX_PIRQS] = {
824static int pirqs_enabled; 817 [0 ... MAX_PIRQS - 1] = -1
818};
825 819
826static int __init ioapic_pirq_setup(char *str) 820static int __init ioapic_pirq_setup(char *str)
827{ 821{
@@ -830,10 +824,6 @@ static int __init ioapic_pirq_setup(char *str)
830 824
831 get_options(str, ARRAY_SIZE(ints), ints); 825 get_options(str, ARRAY_SIZE(ints), ints);
832 826
833 for (i = 0; i < MAX_PIRQS; i++)
834 pirq_entries[i] = -1;
835
836 pirqs_enabled = 1;
837 apic_printk(APIC_VERBOSE, KERN_INFO 827 apic_printk(APIC_VERBOSE, KERN_INFO
838 "PIRQ redirection, working around broken MP-BIOS.\n"); 828 "PIRQ redirection, working around broken MP-BIOS.\n");
839 max = MAX_PIRQS; 829 max = MAX_PIRQS;
@@ -944,10 +934,10 @@ static int find_irq_entry(int apic, int pin, int type)
944 int i; 934 int i;
945 935
946 for (i = 0; i < mp_irq_entries; i++) 936 for (i = 0; i < mp_irq_entries; i++)
947 if (mp_irqs[i].mp_irqtype == type && 937 if (mp_irqs[i].irqtype == type &&
948 (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid || 938 (mp_irqs[i].dstapic == mp_ioapics[apic].apicid ||
949 mp_irqs[i].mp_dstapic == MP_APIC_ALL) && 939 mp_irqs[i].dstapic == MP_APIC_ALL) &&
950 mp_irqs[i].mp_dstirq == pin) 940 mp_irqs[i].dstirq == pin)
951 return i; 941 return i;
952 942
953 return -1; 943 return -1;
@@ -961,13 +951,13 @@ static int __init find_isa_irq_pin(int irq, int type)
961 int i; 951 int i;
962 952
963 for (i = 0; i < mp_irq_entries; i++) { 953 for (i = 0; i < mp_irq_entries; i++) {
964 int lbus = mp_irqs[i].mp_srcbus; 954 int lbus = mp_irqs[i].srcbus;
965 955
966 if (test_bit(lbus, mp_bus_not_pci) && 956 if (test_bit(lbus, mp_bus_not_pci) &&
967 (mp_irqs[i].mp_irqtype == type) && 957 (mp_irqs[i].irqtype == type) &&
968 (mp_irqs[i].mp_srcbusirq == irq)) 958 (mp_irqs[i].srcbusirq == irq))
969 959
970 return mp_irqs[i].mp_dstirq; 960 return mp_irqs[i].dstirq;
971 } 961 }
972 return -1; 962 return -1;
973} 963}
@@ -977,17 +967,17 @@ static int __init find_isa_irq_apic(int irq, int type)
977 int i; 967 int i;
978 968
979 for (i = 0; i < mp_irq_entries; i++) { 969 for (i = 0; i < mp_irq_entries; i++) {
980 int lbus = mp_irqs[i].mp_srcbus; 970 int lbus = mp_irqs[i].srcbus;
981 971
982 if (test_bit(lbus, mp_bus_not_pci) && 972 if (test_bit(lbus, mp_bus_not_pci) &&
983 (mp_irqs[i].mp_irqtype == type) && 973 (mp_irqs[i].irqtype == type) &&
984 (mp_irqs[i].mp_srcbusirq == irq)) 974 (mp_irqs[i].srcbusirq == irq))
985 break; 975 break;
986 } 976 }
987 if (i < mp_irq_entries) { 977 if (i < mp_irq_entries) {
988 int apic; 978 int apic;
989 for(apic = 0; apic < nr_ioapics; apic++) { 979 for(apic = 0; apic < nr_ioapics; apic++) {
990 if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic) 980 if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic)
991 return apic; 981 return apic;
992 } 982 }
993 } 983 }
@@ -1012,23 +1002,23 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
1012 return -1; 1002 return -1;
1013 } 1003 }
1014 for (i = 0; i < mp_irq_entries; i++) { 1004 for (i = 0; i < mp_irq_entries; i++) {
1015 int lbus = mp_irqs[i].mp_srcbus; 1005 int lbus = mp_irqs[i].srcbus;
1016 1006
1017 for (apic = 0; apic < nr_ioapics; apic++) 1007 for (apic = 0; apic < nr_ioapics; apic++)
1018 if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic || 1008 if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
1019 mp_irqs[i].mp_dstapic == MP_APIC_ALL) 1009 mp_irqs[i].dstapic == MP_APIC_ALL)
1020 break; 1010 break;
1021 1011
1022 if (!test_bit(lbus, mp_bus_not_pci) && 1012 if (!test_bit(lbus, mp_bus_not_pci) &&
1023 !mp_irqs[i].mp_irqtype && 1013 !mp_irqs[i].irqtype &&
1024 (bus == lbus) && 1014 (bus == lbus) &&
1025 (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) { 1015 (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
1026 int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq); 1016 int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
1027 1017
1028 if (!(apic || IO_APIC_IRQ(irq))) 1018 if (!(apic || IO_APIC_IRQ(irq)))
1029 continue; 1019 continue;
1030 1020
1031 if (pin == (mp_irqs[i].mp_srcbusirq & 3)) 1021 if (pin == (mp_irqs[i].srcbusirq & 3))
1032 return irq; 1022 return irq;
1033 /* 1023 /*
1034 * Use the first all-but-pin matching entry as a 1024 * Use the first all-but-pin matching entry as a
@@ -1071,7 +1061,7 @@ static int EISA_ELCR(unsigned int irq)
1071 * EISA conforming in the MP table, that means its trigger type must 1061 * EISA conforming in the MP table, that means its trigger type must
1072 * be read in from the ELCR */ 1062 * be read in from the ELCR */
1073 1063
1074#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq)) 1064#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq))
1075#define default_EISA_polarity(idx) default_ISA_polarity(idx) 1065#define default_EISA_polarity(idx) default_ISA_polarity(idx)
1076 1066
1077/* PCI interrupts are always polarity one level triggered, 1067/* PCI interrupts are always polarity one level triggered,
@@ -1088,13 +1078,13 @@ static int EISA_ELCR(unsigned int irq)
1088 1078
1089static int MPBIOS_polarity(int idx) 1079static int MPBIOS_polarity(int idx)
1090{ 1080{
1091 int bus = mp_irqs[idx].mp_srcbus; 1081 int bus = mp_irqs[idx].srcbus;
1092 int polarity; 1082 int polarity;
1093 1083
1094 /* 1084 /*
1095 * Determine IRQ line polarity (high active or low active): 1085 * Determine IRQ line polarity (high active or low active):
1096 */ 1086 */
1097 switch (mp_irqs[idx].mp_irqflag & 3) 1087 switch (mp_irqs[idx].irqflag & 3)
1098 { 1088 {
1099 case 0: /* conforms, ie. bus-type dependent polarity */ 1089 case 0: /* conforms, ie. bus-type dependent polarity */
1100 if (test_bit(bus, mp_bus_not_pci)) 1090 if (test_bit(bus, mp_bus_not_pci))
@@ -1130,13 +1120,13 @@ static int MPBIOS_polarity(int idx)
1130 1120
1131static int MPBIOS_trigger(int idx) 1121static int MPBIOS_trigger(int idx)
1132{ 1122{
1133 int bus = mp_irqs[idx].mp_srcbus; 1123 int bus = mp_irqs[idx].srcbus;
1134 int trigger; 1124 int trigger;
1135 1125
1136 /* 1126 /*
1137 * Determine IRQ trigger mode (edge or level sensitive): 1127 * Determine IRQ trigger mode (edge or level sensitive):
1138 */ 1128 */
1139 switch ((mp_irqs[idx].mp_irqflag>>2) & 3) 1129 switch ((mp_irqs[idx].irqflag>>2) & 3)
1140 { 1130 {
1141 case 0: /* conforms, ie. bus-type dependent */ 1131 case 0: /* conforms, ie. bus-type dependent */
1142 if (test_bit(bus, mp_bus_not_pci)) 1132 if (test_bit(bus, mp_bus_not_pci))
@@ -1214,16 +1204,16 @@ int (*ioapic_renumber_irq)(int ioapic, int irq);
1214static int pin_2_irq(int idx, int apic, int pin) 1204static int pin_2_irq(int idx, int apic, int pin)
1215{ 1205{
1216 int irq, i; 1206 int irq, i;
1217 int bus = mp_irqs[idx].mp_srcbus; 1207 int bus = mp_irqs[idx].srcbus;
1218 1208
1219 /* 1209 /*
1220 * Debugging check, we are in big trouble if this message pops up! 1210 * Debugging check, we are in big trouble if this message pops up!
1221 */ 1211 */
1222 if (mp_irqs[idx].mp_dstirq != pin) 1212 if (mp_irqs[idx].dstirq != pin)
1223 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); 1213 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
1224 1214
1225 if (test_bit(bus, mp_bus_not_pci)) { 1215 if (test_bit(bus, mp_bus_not_pci)) {
1226 irq = mp_irqs[idx].mp_srcbusirq; 1216 irq = mp_irqs[idx].srcbusirq;
1227 } else { 1217 } else {
1228 /* 1218 /*
1229 * PCI IRQs are mapped in order 1219 * PCI IRQs are mapped in order
@@ -1315,7 +1305,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1315 int new_cpu; 1305 int new_cpu;
1316 int vector, offset; 1306 int vector, offset;
1317 1307
1318 vector_allocation_domain(cpu, tmp_mask); 1308 apic->vector_allocation_domain(cpu, tmp_mask);
1319 1309
1320 vector = current_vector; 1310 vector = current_vector;
1321 offset = current_offset; 1311 offset = current_offset;
@@ -1485,10 +1475,10 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
1485 handle_edge_irq, "edge"); 1475 handle_edge_irq, "edge");
1486} 1476}
1487 1477
1488static int setup_ioapic_entry(int apic, int irq, 1478int setup_ioapic_entry(int apic_id, int irq,
1489 struct IO_APIC_route_entry *entry, 1479 struct IO_APIC_route_entry *entry,
1490 unsigned int destination, int trigger, 1480 unsigned int destination, int trigger,
1491 int polarity, int vector) 1481 int polarity, int vector)
1492{ 1482{
1493 /* 1483 /*
1494 * add it to the IO-APIC irq-routing table: 1484 * add it to the IO-APIC irq-routing table:
@@ -1497,25 +1487,25 @@ static int setup_ioapic_entry(int apic, int irq,
1497 1487
1498#ifdef CONFIG_INTR_REMAP 1488#ifdef CONFIG_INTR_REMAP
1499 if (intr_remapping_enabled) { 1489 if (intr_remapping_enabled) {
1500 struct intel_iommu *iommu = map_ioapic_to_ir(apic); 1490 struct intel_iommu *iommu = map_ioapic_to_ir(apic_id);
1501 struct irte irte; 1491 struct irte irte;
1502 struct IR_IO_APIC_route_entry *ir_entry = 1492 struct IR_IO_APIC_route_entry *ir_entry =
1503 (struct IR_IO_APIC_route_entry *) entry; 1493 (struct IR_IO_APIC_route_entry *) entry;
1504 int index; 1494 int index;
1505 1495
1506 if (!iommu) 1496 if (!iommu)
1507 panic("No mapping iommu for ioapic %d\n", apic); 1497 panic("No mapping iommu for ioapic %d\n", apic_id);
1508 1498
1509 index = alloc_irte(iommu, irq, 1); 1499 index = alloc_irte(iommu, irq, 1);
1510 if (index < 0) 1500 if (index < 0)
1511 panic("Failed to allocate IRTE for ioapic %d\n", apic); 1501 panic("Failed to allocate IRTE for ioapic %d\n", apic_id);
1512 1502
1513 memset(&irte, 0, sizeof(irte)); 1503 memset(&irte, 0, sizeof(irte));
1514 1504
1515 irte.present = 1; 1505 irte.present = 1;
1516 irte.dst_mode = INT_DEST_MODE; 1506 irte.dst_mode = apic->irq_dest_mode;
1517 irte.trigger_mode = trigger; 1507 irte.trigger_mode = trigger;
1518 irte.dlvry_mode = INT_DELIVERY_MODE; 1508 irte.dlvry_mode = apic->irq_delivery_mode;
1519 irte.vector = vector; 1509 irte.vector = vector;
1520 irte.dest_id = IRTE_DEST(destination); 1510 irte.dest_id = IRTE_DEST(destination);
1521 1511
@@ -1528,8 +1518,8 @@ static int setup_ioapic_entry(int apic, int irq,
1528 } else 1518 } else
1529#endif 1519#endif
1530 { 1520 {
1531 entry->delivery_mode = INT_DELIVERY_MODE; 1521 entry->delivery_mode = apic->irq_delivery_mode;
1532 entry->dest_mode = INT_DEST_MODE; 1522 entry->dest_mode = apic->irq_dest_mode;
1533 entry->dest = destination; 1523 entry->dest = destination;
1534 } 1524 }
1535 1525
@@ -1546,7 +1536,7 @@ static int setup_ioapic_entry(int apic, int irq,
1546 return 0; 1536 return 0;
1547} 1537}
1548 1538
1549static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_desc *desc, 1539static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq_desc *desc,
1550 int trigger, int polarity) 1540 int trigger, int polarity)
1551{ 1541{
1552 struct irq_cfg *cfg; 1542 struct irq_cfg *cfg;
@@ -1558,22 +1548,22 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_de
1558 1548
1559 cfg = desc->chip_data; 1549 cfg = desc->chip_data;
1560 1550
1561 if (assign_irq_vector(irq, cfg, TARGET_CPUS)) 1551 if (assign_irq_vector(irq, cfg, apic->target_cpus()))
1562 return; 1552 return;
1563 1553
1564 dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); 1554 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
1565 1555
1566 apic_printk(APIC_VERBOSE,KERN_DEBUG 1556 apic_printk(APIC_VERBOSE,KERN_DEBUG
1567 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " 1557 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
1568 "IRQ %d Mode:%i Active:%i)\n", 1558 "IRQ %d Mode:%i Active:%i)\n",
1569 apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector, 1559 apic_id, mp_ioapics[apic_id].apicid, pin, cfg->vector,
1570 irq, trigger, polarity); 1560 irq, trigger, polarity);
1571 1561
1572 1562
1573 if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry, 1563 if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry,
1574 dest, trigger, polarity, cfg->vector)) { 1564 dest, trigger, polarity, cfg->vector)) {
1575 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", 1565 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
1576 mp_ioapics[apic].mp_apicid, pin); 1566 mp_ioapics[apic_id].apicid, pin);
1577 __clear_irq_vector(irq, cfg); 1567 __clear_irq_vector(irq, cfg);
1578 return; 1568 return;
1579 } 1569 }
@@ -1582,12 +1572,12 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_de
1582 if (irq < NR_IRQS_LEGACY) 1572 if (irq < NR_IRQS_LEGACY)
1583 disable_8259A_irq(irq); 1573 disable_8259A_irq(irq);
1584 1574
1585 ioapic_write_entry(apic, pin, entry); 1575 ioapic_write_entry(apic_id, pin, entry);
1586} 1576}
1587 1577
1588static void __init setup_IO_APIC_irqs(void) 1578static void __init setup_IO_APIC_irqs(void)
1589{ 1579{
1590 int apic, pin, idx, irq; 1580 int apic_id, pin, idx, irq;
1591 int notcon = 0; 1581 int notcon = 0;
1592 struct irq_desc *desc; 1582 struct irq_desc *desc;
1593 struct irq_cfg *cfg; 1583 struct irq_cfg *cfg;
@@ -1595,21 +1585,19 @@ static void __init setup_IO_APIC_irqs(void)
1595 1585
1596 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); 1586 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1597 1587
1598 for (apic = 0; apic < nr_ioapics; apic++) { 1588 for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
1599 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { 1589 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
1600 1590
1601 idx = find_irq_entry(apic, pin, mp_INT); 1591 idx = find_irq_entry(apic_id, pin, mp_INT);
1602 if (idx == -1) { 1592 if (idx == -1) {
1603 if (!notcon) { 1593 if (!notcon) {
1604 notcon = 1; 1594 notcon = 1;
1605 apic_printk(APIC_VERBOSE, 1595 apic_printk(APIC_VERBOSE,
1606 KERN_DEBUG " %d-%d", 1596 KERN_DEBUG " %d-%d",
1607 mp_ioapics[apic].mp_apicid, 1597 mp_ioapics[apic_id].apicid, pin);
1608 pin);
1609 } else 1598 } else
1610 apic_printk(APIC_VERBOSE, " %d-%d", 1599 apic_printk(APIC_VERBOSE, " %d-%d",
1611 mp_ioapics[apic].mp_apicid, 1600 mp_ioapics[apic_id].apicid, pin);
1612 pin);
1613 continue; 1601 continue;
1614 } 1602 }
1615 if (notcon) { 1603 if (notcon) {
@@ -1618,20 +1606,25 @@ static void __init setup_IO_APIC_irqs(void)
1618 notcon = 0; 1606 notcon = 0;
1619 } 1607 }
1620 1608
1621 irq = pin_2_irq(idx, apic, pin); 1609 irq = pin_2_irq(idx, apic_id, pin);
1622#ifdef CONFIG_X86_32 1610
1623 if (multi_timer_check(apic, irq)) 1611 /*
1612 * Skip the timer IRQ if there's a quirk handler
1613 * installed and if it returns 1:
1614 */
1615 if (apic->multi_timer_check &&
1616 apic->multi_timer_check(apic_id, irq))
1624 continue; 1617 continue;
1625#endif 1618
1626 desc = irq_to_desc_alloc_cpu(irq, cpu); 1619 desc = irq_to_desc_alloc_cpu(irq, cpu);
1627 if (!desc) { 1620 if (!desc) {
1628 printk(KERN_INFO "can not get irq_desc for %d\n", irq); 1621 printk(KERN_INFO "can not get irq_desc for %d\n", irq);
1629 continue; 1622 continue;
1630 } 1623 }
1631 cfg = desc->chip_data; 1624 cfg = desc->chip_data;
1632 add_pin_to_irq_cpu(cfg, cpu, apic, pin); 1625 add_pin_to_irq_cpu(cfg, cpu, apic_id, pin);
1633 1626
1634 setup_IO_APIC_irq(apic, pin, irq, desc, 1627 setup_IO_APIC_irq(apic_id, pin, irq, desc,
1635 irq_trigger(idx), irq_polarity(idx)); 1628 irq_trigger(idx), irq_polarity(idx));
1636 } 1629 }
1637 } 1630 }
@@ -1644,7 +1637,7 @@ static void __init setup_IO_APIC_irqs(void)
1644/* 1637/*
1645 * Set up the timer pin, possibly with the 8259A-master behind. 1638 * Set up the timer pin, possibly with the 8259A-master behind.
1646 */ 1639 */
1647static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, 1640static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
1648 int vector) 1641 int vector)
1649{ 1642{
1650 struct IO_APIC_route_entry entry; 1643 struct IO_APIC_route_entry entry;
@@ -1660,10 +1653,10 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
1660 * We use logical delivery to get the timer IRQ 1653 * We use logical delivery to get the timer IRQ
1661 * to the first CPU. 1654 * to the first CPU.
1662 */ 1655 */
1663 entry.dest_mode = INT_DEST_MODE; 1656 entry.dest_mode = apic->irq_dest_mode;
1664 entry.mask = 1; /* mask IRQ now */ 1657 entry.mask = 0; /* don't mask IRQ for edge */
1665 entry.dest = cpu_mask_to_apicid(TARGET_CPUS); 1658 entry.dest = apic->cpu_mask_to_apicid(apic->target_cpus());
1666 entry.delivery_mode = INT_DELIVERY_MODE; 1659 entry.delivery_mode = apic->irq_delivery_mode;
1667 entry.polarity = 0; 1660 entry.polarity = 0;
1668 entry.trigger = 0; 1661 entry.trigger = 0;
1669 entry.vector = vector; 1662 entry.vector = vector;
@@ -1677,7 +1670,7 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
1677 /* 1670 /*
1678 * Add it to the IO-APIC irq-routing table: 1671 * Add it to the IO-APIC irq-routing table:
1679 */ 1672 */
1680 ioapic_write_entry(apic, pin, entry); 1673 ioapic_write_entry(apic_id, pin, entry);
1681} 1674}
1682 1675
1683 1676
@@ -1699,7 +1692,7 @@ __apicdebuginit(void) print_IO_APIC(void)
1699 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); 1692 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
1700 for (i = 0; i < nr_ioapics; i++) 1693 for (i = 0; i < nr_ioapics; i++)
1701 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", 1694 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
1702 mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]); 1695 mp_ioapics[i].apicid, nr_ioapic_registers[i]);
1703 1696
1704 /* 1697 /*
1705 * We are a bit conservative about what we expect. We have to 1698 * We are a bit conservative about what we expect. We have to
@@ -1719,7 +1712,7 @@ __apicdebuginit(void) print_IO_APIC(void)
1719 spin_unlock_irqrestore(&ioapic_lock, flags); 1712 spin_unlock_irqrestore(&ioapic_lock, flags);
1720 1713
1721 printk("\n"); 1714 printk("\n");
1722 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); 1715 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid);
1723 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); 1716 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
1724 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); 1717 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1725 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); 1718 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
@@ -1980,13 +1973,6 @@ void __init enable_IO_APIC(void)
1980 int apic; 1973 int apic;
1981 unsigned long flags; 1974 unsigned long flags;
1982 1975
1983#ifdef CONFIG_X86_32
1984 int i;
1985 if (!pirqs_enabled)
1986 for (i = 0; i < MAX_PIRQS; i++)
1987 pirq_entries[i] = -1;
1988#endif
1989
1990 /* 1976 /*
1991 * The number of IO-APIC IRQ registers (== #pins): 1977 * The number of IO-APIC IRQ registers (== #pins):
1992 */ 1978 */
@@ -2090,7 +2076,7 @@ static void __init setup_ioapic_ids_from_mpc(void)
2090{ 2076{
2091 union IO_APIC_reg_00 reg_00; 2077 union IO_APIC_reg_00 reg_00;
2092 physid_mask_t phys_id_present_map; 2078 physid_mask_t phys_id_present_map;
2093 int apic; 2079 int apic_id;
2094 int i; 2080 int i;
2095 unsigned char old_id; 2081 unsigned char old_id;
2096 unsigned long flags; 2082 unsigned long flags;
@@ -2109,26 +2095,26 @@ static void __init setup_ioapic_ids_from_mpc(void)
2109 * This is broken; anything with a real cpu count has to 2095 * This is broken; anything with a real cpu count has to
2110 * circumvent this idiocy regardless. 2096 * circumvent this idiocy regardless.
2111 */ 2097 */
2112 phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); 2098 phys_id_present_map = apic->ioapic_phys_id_map(phys_cpu_present_map);
2113 2099
2114 /* 2100 /*
2115 * Set the IOAPIC ID to the value stored in the MPC table. 2101 * Set the IOAPIC ID to the value stored in the MPC table.
2116 */ 2102 */
2117 for (apic = 0; apic < nr_ioapics; apic++) { 2103 for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
2118 2104
2119 /* Read the register 0 value */ 2105 /* Read the register 0 value */
2120 spin_lock_irqsave(&ioapic_lock, flags); 2106 spin_lock_irqsave(&ioapic_lock, flags);
2121 reg_00.raw = io_apic_read(apic, 0); 2107 reg_00.raw = io_apic_read(apic_id, 0);
2122 spin_unlock_irqrestore(&ioapic_lock, flags); 2108 spin_unlock_irqrestore(&ioapic_lock, flags);
2123 2109
2124 old_id = mp_ioapics[apic].mp_apicid; 2110 old_id = mp_ioapics[apic_id].apicid;
2125 2111
2126 if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) { 2112 if (mp_ioapics[apic_id].apicid >= get_physical_broadcast()) {
2127 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", 2113 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
2128 apic, mp_ioapics[apic].mp_apicid); 2114 apic_id, mp_ioapics[apic_id].apicid);
2129 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", 2115 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
2130 reg_00.bits.ID); 2116 reg_00.bits.ID);
2131 mp_ioapics[apic].mp_apicid = reg_00.bits.ID; 2117 mp_ioapics[apic_id].apicid = reg_00.bits.ID;
2132 } 2118 }
2133 2119
2134 /* 2120 /*
@@ -2136,10 +2122,10 @@ static void __init setup_ioapic_ids_from_mpc(void)
2136 * system must have a unique ID or we get lots of nice 2122 * system must have a unique ID or we get lots of nice
2137 * 'stuck on smp_invalidate_needed IPI wait' messages. 2123 * 'stuck on smp_invalidate_needed IPI wait' messages.
2138 */ 2124 */
2139 if (check_apicid_used(phys_id_present_map, 2125 if (apic->check_apicid_used(phys_id_present_map,
2140 mp_ioapics[apic].mp_apicid)) { 2126 mp_ioapics[apic_id].apicid)) {
2141 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", 2127 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
2142 apic, mp_ioapics[apic].mp_apicid); 2128 apic_id, mp_ioapics[apic_id].apicid);
2143 for (i = 0; i < get_physical_broadcast(); i++) 2129 for (i = 0; i < get_physical_broadcast(); i++)
2144 if (!physid_isset(i, phys_id_present_map)) 2130 if (!physid_isset(i, phys_id_present_map))
2145 break; 2131 break;
@@ -2148,13 +2134,13 @@ static void __init setup_ioapic_ids_from_mpc(void)
2148 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", 2134 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
2149 i); 2135 i);
2150 physid_set(i, phys_id_present_map); 2136 physid_set(i, phys_id_present_map);
2151 mp_ioapics[apic].mp_apicid = i; 2137 mp_ioapics[apic_id].apicid = i;
2152 } else { 2138 } else {
2153 physid_mask_t tmp; 2139 physid_mask_t tmp;
2154 tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid); 2140 tmp = apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid);
2155 apic_printk(APIC_VERBOSE, "Setting %d in the " 2141 apic_printk(APIC_VERBOSE, "Setting %d in the "
2156 "phys_id_present_map\n", 2142 "phys_id_present_map\n",
2157 mp_ioapics[apic].mp_apicid); 2143 mp_ioapics[apic_id].apicid);
2158 physids_or(phys_id_present_map, phys_id_present_map, tmp); 2144 physids_or(phys_id_present_map, phys_id_present_map, tmp);
2159 } 2145 }
2160 2146
@@ -2163,11 +2149,11 @@ static void __init setup_ioapic_ids_from_mpc(void)
2163 * We need to adjust the IRQ routing table 2149 * We need to adjust the IRQ routing table
2164 * if the ID changed. 2150 * if the ID changed.
2165 */ 2151 */
2166 if (old_id != mp_ioapics[apic].mp_apicid) 2152 if (old_id != mp_ioapics[apic_id].apicid)
2167 for (i = 0; i < mp_irq_entries; i++) 2153 for (i = 0; i < mp_irq_entries; i++)
2168 if (mp_irqs[i].mp_dstapic == old_id) 2154 if (mp_irqs[i].dstapic == old_id)
2169 mp_irqs[i].mp_dstapic 2155 mp_irqs[i].dstapic
2170 = mp_ioapics[apic].mp_apicid; 2156 = mp_ioapics[apic_id].apicid;
2171 2157
2172 /* 2158 /*
2173 * Read the right value from the MPC table and 2159 * Read the right value from the MPC table and
@@ -2175,20 +2161,20 @@ static void __init setup_ioapic_ids_from_mpc(void)
2175 */ 2161 */
2176 apic_printk(APIC_VERBOSE, KERN_INFO 2162 apic_printk(APIC_VERBOSE, KERN_INFO
2177 "...changing IO-APIC physical APIC ID to %d ...", 2163 "...changing IO-APIC physical APIC ID to %d ...",
2178 mp_ioapics[apic].mp_apicid); 2164 mp_ioapics[apic_id].apicid);
2179 2165
2180 reg_00.bits.ID = mp_ioapics[apic].mp_apicid; 2166 reg_00.bits.ID = mp_ioapics[apic_id].apicid;
2181 spin_lock_irqsave(&ioapic_lock, flags); 2167 spin_lock_irqsave(&ioapic_lock, flags);
2182 io_apic_write(apic, 0, reg_00.raw); 2168 io_apic_write(apic_id, 0, reg_00.raw);
2183 spin_unlock_irqrestore(&ioapic_lock, flags); 2169 spin_unlock_irqrestore(&ioapic_lock, flags);
2184 2170
2185 /* 2171 /*
2186 * Sanity check 2172 * Sanity check
2187 */ 2173 */
2188 spin_lock_irqsave(&ioapic_lock, flags); 2174 spin_lock_irqsave(&ioapic_lock, flags);
2189 reg_00.raw = io_apic_read(apic, 0); 2175 reg_00.raw = io_apic_read(apic_id, 0);
2190 spin_unlock_irqrestore(&ioapic_lock, flags); 2176 spin_unlock_irqrestore(&ioapic_lock, flags);
2191 if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid) 2177 if (reg_00.bits.ID != mp_ioapics[apic_id].apicid)
2192 printk("could not set ID!\n"); 2178 printk("could not set ID!\n");
2193 else 2179 else
2194 apic_printk(APIC_VERBOSE, " ok.\n"); 2180 apic_printk(APIC_VERBOSE, " ok.\n");
@@ -2291,7 +2277,7 @@ static int ioapic_retrigger_irq(unsigned int irq)
2291 unsigned long flags; 2277 unsigned long flags;
2292 2278
2293 spin_lock_irqsave(&vector_lock, flags); 2279 spin_lock_irqsave(&vector_lock, flags);
2294 send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); 2280 apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
2295 spin_unlock_irqrestore(&vector_lock, flags); 2281 spin_unlock_irqrestore(&vector_lock, flags);
2296 2282
2297 return 1; 2283 return 1;
@@ -2299,7 +2285,7 @@ static int ioapic_retrigger_irq(unsigned int irq)
2299#else 2285#else
2300static int ioapic_retrigger_irq(unsigned int irq) 2286static int ioapic_retrigger_irq(unsigned int irq)
2301{ 2287{
2302 send_IPI_self(irq_cfg(irq)->vector); 2288 apic->send_IPI_self(irq_cfg(irq)->vector);
2303 2289
2304 return 1; 2290 return 1;
2305} 2291}
@@ -2363,7 +2349,7 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2363 2349
2364 set_extra_move_desc(desc, mask); 2350 set_extra_move_desc(desc, mask);
2365 2351
2366 dest = cpu_mask_to_apicid_and(cfg->domain, mask); 2352 dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
2367 2353
2368 modify_ioapic_rte = desc->status & IRQ_LEVEL; 2354 modify_ioapic_rte = desc->status & IRQ_LEVEL;
2369 if (modify_ioapic_rte) { 2355 if (modify_ioapic_rte) {
@@ -2383,7 +2369,7 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2383 if (cfg->move_in_progress) 2369 if (cfg->move_in_progress)
2384 send_cleanup_vector(cfg); 2370 send_cleanup_vector(cfg);
2385 2371
2386 cpumask_copy(&desc->affinity, mask); 2372 cpumask_copy(desc->affinity, mask);
2387} 2373}
2388 2374
2389static int migrate_irq_remapped_level_desc(struct irq_desc *desc) 2375static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
@@ -2405,11 +2391,11 @@ static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
2405 } 2391 }
2406 2392
2407 /* everthing is clear. we have right of way */ 2393 /* everthing is clear. we have right of way */
2408 migrate_ioapic_irq_desc(desc, &desc->pending_mask); 2394 migrate_ioapic_irq_desc(desc, desc->pending_mask);
2409 2395
2410 ret = 0; 2396 ret = 0;
2411 desc->status &= ~IRQ_MOVE_PENDING; 2397 desc->status &= ~IRQ_MOVE_PENDING;
2412 cpumask_clear(&desc->pending_mask); 2398 cpumask_clear(desc->pending_mask);
2413 2399
2414unmask: 2400unmask:
2415 unmask_IO_APIC_irq_desc(desc); 2401 unmask_IO_APIC_irq_desc(desc);
@@ -2434,7 +2420,7 @@ static void ir_irq_migration(struct work_struct *work)
2434 continue; 2420 continue;
2435 } 2421 }
2436 2422
2437 desc->chip->set_affinity(irq, &desc->pending_mask); 2423 desc->chip->set_affinity(irq, desc->pending_mask);
2438 spin_unlock_irqrestore(&desc->lock, flags); 2424 spin_unlock_irqrestore(&desc->lock, flags);
2439 } 2425 }
2440 } 2426 }
@@ -2448,7 +2434,7 @@ static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
2448{ 2434{
2449 if (desc->status & IRQ_LEVEL) { 2435 if (desc->status & IRQ_LEVEL) {
2450 desc->status |= IRQ_MOVE_PENDING; 2436 desc->status |= IRQ_MOVE_PENDING;
2451 cpumask_copy(&desc->pending_mask, mask); 2437 cpumask_copy(desc->pending_mask, mask);
2452 migrate_irq_remapped_level_desc(desc); 2438 migrate_irq_remapped_level_desc(desc);
2453 return; 2439 return;
2454 } 2440 }
@@ -2516,7 +2502,7 @@ static void irq_complete_move(struct irq_desc **descp)
2516 2502
2517 /* domain has not changed, but affinity did */ 2503 /* domain has not changed, but affinity did */
2518 me = smp_processor_id(); 2504 me = smp_processor_id();
2519 if (cpu_isset(me, desc->affinity)) { 2505 if (cpumask_test_cpu(me, desc->affinity)) {
2520 *descp = desc = move_irq_desc(desc, me); 2506 *descp = desc = move_irq_desc(desc, me);
2521 /* get the new one */ 2507 /* get the new one */
2522 cfg = desc->chip_data; 2508 cfg = desc->chip_data;
@@ -2867,19 +2853,15 @@ static inline void __init check_timer(void)
2867 int cpu = boot_cpu_id; 2853 int cpu = boot_cpu_id;
2868 int apic1, pin1, apic2, pin2; 2854 int apic1, pin1, apic2, pin2;
2869 unsigned long flags; 2855 unsigned long flags;
2870 unsigned int ver;
2871 int no_pin1 = 0; 2856 int no_pin1 = 0;
2872 2857
2873 local_irq_save(flags); 2858 local_irq_save(flags);
2874 2859
2875 ver = apic_read(APIC_LVR);
2876 ver = GET_APIC_VERSION(ver);
2877
2878 /* 2860 /*
2879 * get/set the timer IRQ vector: 2861 * get/set the timer IRQ vector:
2880 */ 2862 */
2881 disable_8259A_irq(0); 2863 disable_8259A_irq(0);
2882 assign_irq_vector(0, cfg, TARGET_CPUS); 2864 assign_irq_vector(0, cfg, apic->target_cpus());
2883 2865
2884 /* 2866 /*
2885 * As IRQ0 is to be enabled in the 8259A, the virtual 2867 * As IRQ0 is to be enabled in the 8259A, the virtual
@@ -2893,7 +2875,13 @@ static inline void __init check_timer(void)
2893 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); 2875 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
2894 init_8259A(1); 2876 init_8259A(1);
2895#ifdef CONFIG_X86_32 2877#ifdef CONFIG_X86_32
2896 timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); 2878 {
2879 unsigned int ver;
2880
2881 ver = apic_read(APIC_LVR);
2882 ver = GET_APIC_VERSION(ver);
2883 timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
2884 }
2897#endif 2885#endif
2898 2886
2899 pin1 = find_isa_irq_pin(0, mp_INT); 2887 pin1 = find_isa_irq_pin(0, mp_INT);
@@ -2932,8 +2920,17 @@ static inline void __init check_timer(void)
2932 if (no_pin1) { 2920 if (no_pin1) {
2933 add_pin_to_irq_cpu(cfg, cpu, apic1, pin1); 2921 add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
2934 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); 2922 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
2923 } else {
2924 /* for edge trigger, setup_IO_APIC_irq already
2925 * leave it unmasked.
2926 * so only need to unmask if it is level-trigger
2927 * do we really have level trigger timer?
2928 */
2929 int idx;
2930 idx = find_irq_entry(apic1, pin1, mp_INT);
2931 if (idx != -1 && irq_trigger(idx))
2932 unmask_IO_APIC_irq_desc(desc);
2935 } 2933 }
2936 unmask_IO_APIC_irq_desc(desc);
2937 if (timer_irq_works()) { 2934 if (timer_irq_works()) {
2938 if (nmi_watchdog == NMI_IO_APIC) { 2935 if (nmi_watchdog == NMI_IO_APIC) {
2939 setup_nmi(); 2936 setup_nmi();
@@ -2947,6 +2944,7 @@ static inline void __init check_timer(void)
2947 if (intr_remapping_enabled) 2944 if (intr_remapping_enabled)
2948 panic("timer doesn't work through Interrupt-remapped IO-APIC"); 2945 panic("timer doesn't work through Interrupt-remapped IO-APIC");
2949#endif 2946#endif
2947 local_irq_disable();
2950 clear_IO_APIC_pin(apic1, pin1); 2948 clear_IO_APIC_pin(apic1, pin1);
2951 if (!no_pin1) 2949 if (!no_pin1)
2952 apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " 2950 apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
@@ -2961,7 +2959,6 @@ static inline void __init check_timer(void)
2961 */ 2959 */
2962 replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2); 2960 replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
2963 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); 2961 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
2964 unmask_IO_APIC_irq_desc(desc);
2965 enable_8259A_irq(0); 2962 enable_8259A_irq(0);
2966 if (timer_irq_works()) { 2963 if (timer_irq_works()) {
2967 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); 2964 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
@@ -2976,6 +2973,7 @@ static inline void __init check_timer(void)
2976 /* 2973 /*
2977 * Cleanup, just in case ... 2974 * Cleanup, just in case ...
2978 */ 2975 */
2976 local_irq_disable();
2979 disable_8259A_irq(0); 2977 disable_8259A_irq(0);
2980 clear_IO_APIC_pin(apic2, pin2); 2978 clear_IO_APIC_pin(apic2, pin2);
2981 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); 2979 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
@@ -3001,6 +2999,7 @@ static inline void __init check_timer(void)
3001 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); 2999 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
3002 goto out; 3000 goto out;
3003 } 3001 }
3002 local_irq_disable();
3004 disable_8259A_irq(0); 3003 disable_8259A_irq(0);
3005 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); 3004 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
3006 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); 3005 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
@@ -3018,6 +3017,7 @@ static inline void __init check_timer(void)
3018 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); 3017 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
3019 goto out; 3018 goto out;
3020 } 3019 }
3020 local_irq_disable();
3021 apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); 3021 apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
3022 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " 3022 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
3023 "report. Then try booting with the 'noapic' option.\n"); 3023 "report. Then try booting with the 'noapic' option.\n");
@@ -3047,13 +3047,9 @@ out:
3047void __init setup_IO_APIC(void) 3047void __init setup_IO_APIC(void)
3048{ 3048{
3049 3049
3050#ifdef CONFIG_X86_32
3051 enable_IO_APIC();
3052#else
3053 /* 3050 /*
3054 * calling enable_IO_APIC() is moved to setup_local_APIC for BP 3051 * calling enable_IO_APIC() is moved to setup_local_APIC for BP
3055 */ 3052 */
3056#endif
3057 3053
3058 io_apic_irqs = ~PIC_IRQS; 3054 io_apic_irqs = ~PIC_IRQS;
3059 3055
@@ -3118,8 +3114,8 @@ static int ioapic_resume(struct sys_device *dev)
3118 3114
3119 spin_lock_irqsave(&ioapic_lock, flags); 3115 spin_lock_irqsave(&ioapic_lock, flags);
3120 reg_00.raw = io_apic_read(dev->id, 0); 3116 reg_00.raw = io_apic_read(dev->id, 0);
3121 if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) { 3117 if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) {
3122 reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid; 3118 reg_00.bits.ID = mp_ioapics[dev->id].apicid;
3123 io_apic_write(dev->id, 0, reg_00.raw); 3119 io_apic_write(dev->id, 0, reg_00.raw);
3124 } 3120 }
3125 spin_unlock_irqrestore(&ioapic_lock, flags); 3121 spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -3169,6 +3165,7 @@ static int __init ioapic_init_sysfs(void)
3169 3165
3170device_initcall(ioapic_init_sysfs); 3166device_initcall(ioapic_init_sysfs);
3171 3167
3168static int nr_irqs_gsi = NR_IRQS_LEGACY;
3172/* 3169/*
3173 * Dynamic irq allocate and deallocation 3170 * Dynamic irq allocate and deallocation
3174 */ 3171 */
@@ -3183,11 +3180,11 @@ unsigned int create_irq_nr(unsigned int irq_want)
3183 struct irq_desc *desc_new = NULL; 3180 struct irq_desc *desc_new = NULL;
3184 3181
3185 irq = 0; 3182 irq = 0;
3186 spin_lock_irqsave(&vector_lock, flags); 3183 if (irq_want < nr_irqs_gsi)
3187 for (new = irq_want; new < NR_IRQS; new++) { 3184 irq_want = nr_irqs_gsi;
3188 if (platform_legacy_irq(new))
3189 continue;
3190 3185
3186 spin_lock_irqsave(&vector_lock, flags);
3187 for (new = irq_want; new < nr_irqs; new++) {
3191 desc_new = irq_to_desc_alloc_cpu(new, cpu); 3188 desc_new = irq_to_desc_alloc_cpu(new, cpu);
3192 if (!desc_new) { 3189 if (!desc_new) {
3193 printk(KERN_INFO "can not get irq_desc for %d\n", new); 3190 printk(KERN_INFO "can not get irq_desc for %d\n", new);
@@ -3197,7 +3194,7 @@ unsigned int create_irq_nr(unsigned int irq_want)
3197 3194
3198 if (cfg_new->vector != 0) 3195 if (cfg_new->vector != 0)
3199 continue; 3196 continue;
3200 if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0) 3197 if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
3201 irq = new; 3198 irq = new;
3202 break; 3199 break;
3203 } 3200 }
@@ -3212,7 +3209,6 @@ unsigned int create_irq_nr(unsigned int irq_want)
3212 return irq; 3209 return irq;
3213} 3210}
3214 3211
3215static int nr_irqs_gsi = NR_IRQS_LEGACY;
3216int create_irq(void) 3212int create_irq(void)
3217{ 3213{
3218 unsigned int irq_want; 3214 unsigned int irq_want;
@@ -3259,12 +3255,15 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3259 int err; 3255 int err;
3260 unsigned dest; 3256 unsigned dest;
3261 3257
3258 if (disable_apic)
3259 return -ENXIO;
3260
3262 cfg = irq_cfg(irq); 3261 cfg = irq_cfg(irq);
3263 err = assign_irq_vector(irq, cfg, TARGET_CPUS); 3262 err = assign_irq_vector(irq, cfg, apic->target_cpus());
3264 if (err) 3263 if (err)
3265 return err; 3264 return err;
3266 3265
3267 dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); 3266 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
3268 3267
3269#ifdef CONFIG_INTR_REMAP 3268#ifdef CONFIG_INTR_REMAP
3270 if (irq_remapped(irq)) { 3269 if (irq_remapped(irq)) {
@@ -3278,9 +3277,9 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3278 memset (&irte, 0, sizeof(irte)); 3277 memset (&irte, 0, sizeof(irte));
3279 3278
3280 irte.present = 1; 3279 irte.present = 1;
3281 irte.dst_mode = INT_DEST_MODE; 3280 irte.dst_mode = apic->irq_dest_mode;
3282 irte.trigger_mode = 0; /* edge */ 3281 irte.trigger_mode = 0; /* edge */
3283 irte.dlvry_mode = INT_DELIVERY_MODE; 3282 irte.dlvry_mode = apic->irq_delivery_mode;
3284 irte.vector = cfg->vector; 3283 irte.vector = cfg->vector;
3285 irte.dest_id = IRTE_DEST(dest); 3284 irte.dest_id = IRTE_DEST(dest);
3286 3285
@@ -3298,10 +3297,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3298 msg->address_hi = MSI_ADDR_BASE_HI; 3297 msg->address_hi = MSI_ADDR_BASE_HI;
3299 msg->address_lo = 3298 msg->address_lo =
3300 MSI_ADDR_BASE_LO | 3299 MSI_ADDR_BASE_LO |
3301 ((INT_DEST_MODE == 0) ? 3300 ((apic->irq_dest_mode == 0) ?
3302 MSI_ADDR_DEST_MODE_PHYSICAL: 3301 MSI_ADDR_DEST_MODE_PHYSICAL:
3303 MSI_ADDR_DEST_MODE_LOGICAL) | 3302 MSI_ADDR_DEST_MODE_LOGICAL) |
3304 ((INT_DELIVERY_MODE != dest_LowestPrio) ? 3303 ((apic->irq_delivery_mode != dest_LowestPrio) ?
3305 MSI_ADDR_REDIRECTION_CPU: 3304 MSI_ADDR_REDIRECTION_CPU:
3306 MSI_ADDR_REDIRECTION_LOWPRI) | 3305 MSI_ADDR_REDIRECTION_LOWPRI) |
3307 MSI_ADDR_DEST_ID(dest); 3306 MSI_ADDR_DEST_ID(dest);
@@ -3309,7 +3308,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3309 msg->data = 3308 msg->data =
3310 MSI_DATA_TRIGGER_EDGE | 3309 MSI_DATA_TRIGGER_EDGE |
3311 MSI_DATA_LEVEL_ASSERT | 3310 MSI_DATA_LEVEL_ASSERT |
3312 ((INT_DELIVERY_MODE != dest_LowestPrio) ? 3311 ((apic->irq_delivery_mode != dest_LowestPrio) ?
3313 MSI_DATA_DELIVERY_FIXED: 3312 MSI_DATA_DELIVERY_FIXED:
3314 MSI_DATA_DELIVERY_LOWPRI) | 3313 MSI_DATA_DELIVERY_LOWPRI) |
3315 MSI_DATA_VECTOR(cfg->vector); 3314 MSI_DATA_VECTOR(cfg->vector);
@@ -3464,40 +3463,6 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3464 return 0; 3463 return 0;
3465} 3464}
3466 3465
3467int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc)
3468{
3469 unsigned int irq;
3470 int ret;
3471 unsigned int irq_want;
3472
3473 irq_want = nr_irqs_gsi;
3474 irq = create_irq_nr(irq_want);
3475 if (irq == 0)
3476 return -1;
3477
3478#ifdef CONFIG_INTR_REMAP
3479 if (!intr_remapping_enabled)
3480 goto no_ir;
3481
3482 ret = msi_alloc_irte(dev, irq, 1);
3483 if (ret < 0)
3484 goto error;
3485no_ir:
3486#endif
3487 ret = setup_msi_irq(dev, msidesc, irq);
3488 if (ret < 0) {
3489 destroy_irq(irq);
3490 return ret;
3491 }
3492 return 0;
3493
3494#ifdef CONFIG_INTR_REMAP
3495error:
3496 destroy_irq(irq);
3497 return ret;
3498#endif
3499}
3500
3501int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) 3466int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3502{ 3467{
3503 unsigned int irq; 3468 unsigned int irq;
@@ -3514,9 +3479,9 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3514 sub_handle = 0; 3479 sub_handle = 0;
3515 list_for_each_entry(msidesc, &dev->msi_list, list) { 3480 list_for_each_entry(msidesc, &dev->msi_list, list) {
3516 irq = create_irq_nr(irq_want); 3481 irq = create_irq_nr(irq_want);
3517 irq_want++;
3518 if (irq == 0) 3482 if (irq == 0)
3519 return -1; 3483 return -1;
3484 irq_want = irq + 1;
3520#ifdef CONFIG_INTR_REMAP 3485#ifdef CONFIG_INTR_REMAP
3521 if (!intr_remapping_enabled) 3486 if (!intr_remapping_enabled)
3522 goto no_ir; 3487 goto no_ir;
@@ -3727,13 +3692,17 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3727 struct irq_cfg *cfg; 3692 struct irq_cfg *cfg;
3728 int err; 3693 int err;
3729 3694
3695 if (disable_apic)
3696 return -ENXIO;
3697
3730 cfg = irq_cfg(irq); 3698 cfg = irq_cfg(irq);
3731 err = assign_irq_vector(irq, cfg, TARGET_CPUS); 3699 err = assign_irq_vector(irq, cfg, apic->target_cpus());
3732 if (!err) { 3700 if (!err) {
3733 struct ht_irq_msg msg; 3701 struct ht_irq_msg msg;
3734 unsigned dest; 3702 unsigned dest;
3735 3703
3736 dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); 3704 dest = apic->cpu_mask_to_apicid_and(cfg->domain,
3705 apic->target_cpus());
3737 3706
3738 msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); 3707 msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
3739 3708
@@ -3741,11 +3710,11 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3741 HT_IRQ_LOW_BASE | 3710 HT_IRQ_LOW_BASE |
3742 HT_IRQ_LOW_DEST_ID(dest) | 3711 HT_IRQ_LOW_DEST_ID(dest) |
3743 HT_IRQ_LOW_VECTOR(cfg->vector) | 3712 HT_IRQ_LOW_VECTOR(cfg->vector) |
3744 ((INT_DEST_MODE == 0) ? 3713 ((apic->irq_dest_mode == 0) ?
3745 HT_IRQ_LOW_DM_PHYSICAL : 3714 HT_IRQ_LOW_DM_PHYSICAL :
3746 HT_IRQ_LOW_DM_LOGICAL) | 3715 HT_IRQ_LOW_DM_LOGICAL) |
3747 HT_IRQ_LOW_RQEOI_EDGE | 3716 HT_IRQ_LOW_RQEOI_EDGE |
3748 ((INT_DELIVERY_MODE != dest_LowestPrio) ? 3717 ((apic->irq_delivery_mode != dest_LowestPrio) ?
3749 HT_IRQ_LOW_MT_FIXED : 3718 HT_IRQ_LOW_MT_FIXED :
3750 HT_IRQ_LOW_MT_ARBITRATED) | 3719 HT_IRQ_LOW_MT_ARBITRATED) |
3751 HT_IRQ_LOW_IRQ_MASKED; 3720 HT_IRQ_LOW_IRQ_MASKED;
@@ -3761,7 +3730,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3761} 3730}
3762#endif /* CONFIG_HT_IRQ */ 3731#endif /* CONFIG_HT_IRQ */
3763 3732
3764#ifdef CONFIG_X86_64 3733#ifdef CONFIG_X86_UV
3765/* 3734/*
3766 * Re-target the irq to the specified CPU and enable the specified MMR located 3735 * Re-target the irq to the specified CPU and enable the specified MMR located
3767 * on the specified blade to allow the sending of MSIs to the specified CPU. 3736 * on the specified blade to allow the sending of MSIs to the specified CPU.
@@ -3793,12 +3762,12 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3793 BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); 3762 BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3794 3763
3795 entry->vector = cfg->vector; 3764 entry->vector = cfg->vector;
3796 entry->delivery_mode = INT_DELIVERY_MODE; 3765 entry->delivery_mode = apic->irq_delivery_mode;
3797 entry->dest_mode = INT_DEST_MODE; 3766 entry->dest_mode = apic->irq_dest_mode;
3798 entry->polarity = 0; 3767 entry->polarity = 0;
3799 entry->trigger = 0; 3768 entry->trigger = 0;
3800 entry->mask = 0; 3769 entry->mask = 0;
3801 entry->dest = cpu_mask_to_apicid(eligible_cpu); 3770 entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
3802 3771
3803 mmr_pnode = uv_blade_to_pnode(mmr_blade); 3772 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3804 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); 3773 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
@@ -3861,6 +3830,28 @@ void __init probe_nr_irqs_gsi(void)
3861 printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); 3830 printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
3862} 3831}
3863 3832
3833#ifdef CONFIG_SPARSE_IRQ
3834int __init arch_probe_nr_irqs(void)
3835{
3836 int nr;
3837
3838 if (nr_irqs > (NR_VECTORS * nr_cpu_ids))
3839 nr_irqs = NR_VECTORS * nr_cpu_ids;
3840
3841 nr = nr_irqs_gsi + 8 * nr_cpu_ids;
3842#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ)
3843 /*
3844 * for MSI and HT dyn irq
3845 */
3846 nr += nr_irqs_gsi * 16;
3847#endif
3848 if (nr < nr_irqs)
3849 nr_irqs = nr;
3850
3851 return 0;
3852}
3853#endif
3854
3864/* -------------------------------------------------------------------------- 3855/* --------------------------------------------------------------------------
3865 ACPI-based IOAPIC Configuration 3856 ACPI-based IOAPIC Configuration
3866 -------------------------------------------------------------------------- */ 3857 -------------------------------------------------------------------------- */
@@ -3886,7 +3877,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3886 */ 3877 */
3887 3878
3888 if (physids_empty(apic_id_map)) 3879 if (physids_empty(apic_id_map))
3889 apic_id_map = ioapic_phys_id_map(phys_cpu_present_map); 3880 apic_id_map = apic->ioapic_phys_id_map(phys_cpu_present_map);
3890 3881
3891 spin_lock_irqsave(&ioapic_lock, flags); 3882 spin_lock_irqsave(&ioapic_lock, flags);
3892 reg_00.raw = io_apic_read(ioapic, 0); 3883 reg_00.raw = io_apic_read(ioapic, 0);
@@ -3902,10 +3893,10 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3902 * Every APIC in a system must have a unique ID or we get lots of nice 3893 * Every APIC in a system must have a unique ID or we get lots of nice
3903 * 'stuck on smp_invalidate_needed IPI wait' messages. 3894 * 'stuck on smp_invalidate_needed IPI wait' messages.
3904 */ 3895 */
3905 if (check_apicid_used(apic_id_map, apic_id)) { 3896 if (apic->check_apicid_used(apic_id_map, apic_id)) {
3906 3897
3907 for (i = 0; i < get_physical_broadcast(); i++) { 3898 for (i = 0; i < get_physical_broadcast(); i++) {
3908 if (!check_apicid_used(apic_id_map, i)) 3899 if (!apic->check_apicid_used(apic_id_map, i))
3909 break; 3900 break;
3910 } 3901 }
3911 3902
@@ -3918,7 +3909,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3918 apic_id = i; 3909 apic_id = i;
3919 } 3910 }
3920 3911
3921 tmp = apicid_to_cpu_present(apic_id); 3912 tmp = apic->apicid_to_cpu_present(apic_id);
3922 physids_or(apic_id_map, apic_id_map, tmp); 3913 physids_or(apic_id_map, apic_id_map, tmp);
3923 3914
3924 if (reg_00.bits.ID != apic_id) { 3915 if (reg_00.bits.ID != apic_id) {
@@ -3995,8 +3986,8 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
3995 return -1; 3986 return -1;
3996 3987
3997 for (i = 0; i < mp_irq_entries; i++) 3988 for (i = 0; i < mp_irq_entries; i++)
3998 if (mp_irqs[i].mp_irqtype == mp_INT && 3989 if (mp_irqs[i].irqtype == mp_INT &&
3999 mp_irqs[i].mp_srcbusirq == bus_irq) 3990 mp_irqs[i].srcbusirq == bus_irq)
4000 break; 3991 break;
4001 if (i >= mp_irq_entries) 3992 if (i >= mp_irq_entries)
4002 return -1; 3993 return -1;
@@ -4011,7 +4002,7 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
4011/* 4002/*
4012 * This function currently is only a helper for the i386 smp boot process where 4003 * This function currently is only a helper for the i386 smp boot process where
4013 * we need to reprogram the ioredtbls to cater for the cpus which have come online 4004 * we need to reprogram the ioredtbls to cater for the cpus which have come online
4014 * so mask in all cases should simply be TARGET_CPUS 4005 * so mask in all cases should simply be apic->target_cpus()
4015 */ 4006 */
4016#ifdef CONFIG_SMP 4007#ifdef CONFIG_SMP
4017void __init setup_ioapic_dest(void) 4008void __init setup_ioapic_dest(void)
@@ -4050,9 +4041,9 @@ void __init setup_ioapic_dest(void)
4050 */ 4041 */
4051 if (desc->status & 4042 if (desc->status &
4052 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) 4043 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
4053 mask = &desc->affinity; 4044 mask = desc->affinity;
4054 else 4045 else
4055 mask = TARGET_CPUS; 4046 mask = apic->target_cpus();
4056 4047
4057#ifdef CONFIG_INTR_REMAP 4048#ifdef CONFIG_INTR_REMAP
4058 if (intr_remapping_enabled) 4049 if (intr_remapping_enabled)
@@ -4111,7 +4102,7 @@ void __init ioapic_init_mappings(void)
4111 ioapic_res = ioapic_setup_resources(); 4102 ioapic_res = ioapic_setup_resources();
4112 for (i = 0; i < nr_ioapics; i++) { 4103 for (i = 0; i < nr_ioapics; i++) {
4113 if (smp_found_config) { 4104 if (smp_found_config) {
4114 ioapic_phys = mp_ioapics[i].mp_apicaddr; 4105 ioapic_phys = mp_ioapics[i].apicaddr;
4115#ifdef CONFIG_X86_32 4106#ifdef CONFIG_X86_32
4116 if (!ioapic_phys) { 4107 if (!ioapic_phys) {
4117 printk(KERN_ERR 4108 printk(KERN_ERR
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
new file mode 100644
index 000000000000..dbf5445727a9
--- /dev/null
+++ b/arch/x86/kernel/apic/ipi.c
@@ -0,0 +1,164 @@
1#include <linux/cpumask.h>
2#include <linux/interrupt.h>
3#include <linux/init.h>
4
5#include <linux/mm.h>
6#include <linux/delay.h>
7#include <linux/spinlock.h>
8#include <linux/kernel_stat.h>
9#include <linux/mc146818rtc.h>
10#include <linux/cache.h>
11#include <linux/cpu.h>
12#include <linux/module.h>
13
14#include <asm/smp.h>
15#include <asm/mtrr.h>
16#include <asm/tlbflush.h>
17#include <asm/mmu_context.h>
18#include <asm/apic.h>
19#include <asm/proto.h>
20#include <asm/ipi.h>
21
22void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector)
23{
24 unsigned long query_cpu;
25 unsigned long flags;
26
27 /*
28 * Hack. The clustered APIC addressing mode doesn't allow us to send
29 * to an arbitrary mask, so I do a unicast to each CPU instead.
30 * - mbligh
31 */
32 local_irq_save(flags);
33 for_each_cpu(query_cpu, mask) {
34 __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid,
35 query_cpu), vector, APIC_DEST_PHYSICAL);
36 }
37 local_irq_restore(flags);
38}
39
40void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
41 int vector)
42{
43 unsigned int this_cpu = smp_processor_id();
44 unsigned int query_cpu;
45 unsigned long flags;
46
47 /* See Hack comment above */
48
49 local_irq_save(flags);
50 for_each_cpu(query_cpu, mask) {
51 if (query_cpu == this_cpu)
52 continue;
53 __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid,
54 query_cpu), vector, APIC_DEST_PHYSICAL);
55 }
56 local_irq_restore(flags);
57}
58
59void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
60 int vector)
61{
62 unsigned long flags;
63 unsigned int query_cpu;
64
65 /*
66 * Hack. The clustered APIC addressing mode doesn't allow us to send
67 * to an arbitrary mask, so I do a unicasts to each CPU instead. This
68 * should be modified to do 1 message per cluster ID - mbligh
69 */
70
71 local_irq_save(flags);
72 for_each_cpu(query_cpu, mask)
73 __default_send_IPI_dest_field(
74 apic->cpu_to_logical_apicid(query_cpu), vector,
75 apic->dest_logical);
76 local_irq_restore(flags);
77}
78
79void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
80 int vector)
81{
82 unsigned long flags;
83 unsigned int query_cpu;
84 unsigned int this_cpu = smp_processor_id();
85
86 /* See Hack comment above */
87
88 local_irq_save(flags);
89 for_each_cpu(query_cpu, mask) {
90 if (query_cpu == this_cpu)
91 continue;
92 __default_send_IPI_dest_field(
93 apic->cpu_to_logical_apicid(query_cpu), vector,
94 apic->dest_logical);
95 }
96 local_irq_restore(flags);
97}
98
99#ifdef CONFIG_X86_32
100
101/*
102 * This is only used on smaller machines.
103 */
104void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector)
105{
106 unsigned long mask = cpumask_bits(cpumask)[0];
107 unsigned long flags;
108
109 local_irq_save(flags);
110 WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]);
111 __default_send_IPI_dest_field(mask, vector, apic->dest_logical);
112 local_irq_restore(flags);
113}
114
115void default_send_IPI_allbutself(int vector)
116{
117 /*
118 * if there are no other CPUs in the system then we get an APIC send
119 * error if we try to broadcast, thus avoid sending IPIs in this case.
120 */
121 if (!(num_online_cpus() > 1))
122 return;
123
124 __default_local_send_IPI_allbutself(vector);
125}
126
127void default_send_IPI_all(int vector)
128{
129 __default_local_send_IPI_all(vector);
130}
131
132void default_send_IPI_self(int vector)
133{
134 __default_send_IPI_shortcut(APIC_DEST_SELF, vector, apic->dest_logical);
135}
136
137/* must come after the send_IPI functions above for inlining */
138static int convert_apicid_to_cpu(int apic_id)
139{
140 int i;
141
142 for_each_possible_cpu(i) {
143 if (per_cpu(x86_cpu_to_apicid, i) == apic_id)
144 return i;
145 }
146 return -1;
147}
148
149int safe_smp_processor_id(void)
150{
151 int apicid, cpuid;
152
153 if (!boot_cpu_has(X86_FEATURE_APIC))
154 return 0;
155
156 apicid = hard_smp_processor_id();
157 if (apicid == BAD_APICID)
158 return 0;
159
160 cpuid = convert_apicid_to_cpu(apicid);
161
162 return cpuid >= 0 ? cpuid : 0;
163}
164#endif
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/apic/nmi.c
index 7228979f1e7f..bdfad80c3cf1 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -34,7 +34,7 @@
34 34
35#include <asm/mce.h> 35#include <asm/mce.h>
36 36
37#include <mach_traps.h> 37#include <asm/mach_traps.h>
38 38
39int unknown_nmi_panic; 39int unknown_nmi_panic;
40int nmi_watchdog_enabled; 40int nmi_watchdog_enabled;
@@ -61,11 +61,7 @@ static int endflag __initdata;
61 61
62static inline unsigned int get_nmi_count(int cpu) 62static inline unsigned int get_nmi_count(int cpu)
63{ 63{
64#ifdef CONFIG_X86_64 64 return per_cpu(irq_stat, cpu).__nmi_count;
65 return cpu_pda(cpu)->__nmi_count;
66#else
67 return nmi_count(cpu);
68#endif
69} 65}
70 66
71static inline int mce_in_progress(void) 67static inline int mce_in_progress(void)
@@ -82,12 +78,8 @@ static inline int mce_in_progress(void)
82 */ 78 */
83static inline unsigned int get_timer_irqs(int cpu) 79static inline unsigned int get_timer_irqs(int cpu)
84{ 80{
85#ifdef CONFIG_X86_64
86 return read_pda(apic_timer_irqs) + read_pda(irq0_irqs);
87#else
88 return per_cpu(irq_stat, cpu).apic_timer_irqs + 81 return per_cpu(irq_stat, cpu).apic_timer_irqs +
89 per_cpu(irq_stat, cpu).irq0_irqs; 82 per_cpu(irq_stat, cpu).irq0_irqs;
90#endif
91} 83}
92 84
93#ifdef CONFIG_SMP 85#ifdef CONFIG_SMP
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
new file mode 100644
index 000000000000..ba2fc6465534
--- /dev/null
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -0,0 +1,557 @@
1/*
2 * Written by: Patricia Gaughen, IBM Corporation
3 *
4 * Copyright (C) 2002, IBM Corp.
5 * Copyright (C) 2009, Red Hat, Inc., Ingo Molnar
6 *
7 * All rights reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
17 * NON INFRINGEMENT. See the GNU General Public License for more
18 * details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * Send feedback to <gone@us.ibm.com>
25 */
26#include <linux/nodemask.h>
27#include <linux/topology.h>
28#include <linux/bootmem.h>
29#include <linux/threads.h>
30#include <linux/cpumask.h>
31#include <linux/kernel.h>
32#include <linux/mmzone.h>
33#include <linux/module.h>
34#include <linux/string.h>
35#include <linux/init.h>
36#include <linux/numa.h>
37#include <linux/smp.h>
38#include <linux/io.h>
39#include <linux/mm.h>
40
41#include <asm/processor.h>
42#include <asm/fixmap.h>
43#include <asm/mpspec.h>
44#include <asm/numaq.h>
45#include <asm/setup.h>
46#include <asm/apic.h>
47#include <asm/e820.h>
48#include <asm/ipi.h>
49
50#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
51
52int found_numaq;
53
54/*
55 * Have to match translation table entries to main table entries by counter
56 * hence the mpc_record variable .... can't see a less disgusting way of
57 * doing this ....
58 */
59struct mpc_trans {
60 unsigned char mpc_type;
61 unsigned char trans_len;
62 unsigned char trans_type;
63 unsigned char trans_quad;
64 unsigned char trans_global;
65 unsigned char trans_local;
66 unsigned short trans_reserved;
67};
68
69/* x86_quirks member */
70static int mpc_record;
71
72static struct mpc_trans *translation_table[MAX_MPC_ENTRY];
73
74int mp_bus_id_to_node[MAX_MP_BUSSES];
75int mp_bus_id_to_local[MAX_MP_BUSSES];
76int quad_local_to_mp_bus_id[NR_CPUS/4][4];
77
78
79static inline void numaq_register_node(int node, struct sys_cfg_data *scd)
80{
81 struct eachquadmem *eq = scd->eq + node;
82
83 node_set_online(node);
84
85 /* Convert to pages */
86 node_start_pfn[node] =
87 MB_TO_PAGES(eq->hi_shrd_mem_start - eq->priv_mem_size);
88
89 node_end_pfn[node] =
90 MB_TO_PAGES(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
91
92 e820_register_active_regions(node, node_start_pfn[node],
93 node_end_pfn[node]);
94
95 memory_present(node, node_start_pfn[node], node_end_pfn[node]);
96
97 node_remap_size[node] = node_memmap_size_bytes(node,
98 node_start_pfn[node],
99 node_end_pfn[node]);
100}
101
102/*
103 * Function: smp_dump_qct()
104 *
105 * Description: gets memory layout from the quad config table. This
106 * function also updates node_online_map with the nodes (quads) present.
107 */
108static void __init smp_dump_qct(void)
109{
110 struct sys_cfg_data *scd;
111 int node;
112
113 scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR);
114
115 nodes_clear(node_online_map);
116 for_each_node(node) {
117 if (scd->quads_present31_0 & (1 << node))
118 numaq_register_node(node, scd);
119 }
120}
121
122void __cpuinit numaq_tsc_disable(void)
123{
124 if (!found_numaq)
125 return;
126
127 if (num_online_nodes() > 1) {
128 printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
129 setup_clear_cpu_cap(X86_FEATURE_TSC);
130 }
131}
132
133static int __init numaq_pre_time_init(void)
134{
135 numaq_tsc_disable();
136 return 0;
137}
138
139static inline int generate_logical_apicid(int quad, int phys_apicid)
140{
141 return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
142}
143
144/* x86_quirks member */
145static int mpc_apic_id(struct mpc_cpu *m)
146{
147 int quad = translation_table[mpc_record]->trans_quad;
148 int logical_apicid = generate_logical_apicid(quad, m->apicid);
149
150 printk(KERN_DEBUG
151 "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
152 m->apicid, (m->cpufeature & CPU_FAMILY_MASK) >> 8,
153 (m->cpufeature & CPU_MODEL_MASK) >> 4,
154 m->apicver, quad, logical_apicid);
155
156 return logical_apicid;
157}
158
159/* x86_quirks member */
160static void mpc_oem_bus_info(struct mpc_bus *m, char *name)
161{
162 int quad = translation_table[mpc_record]->trans_quad;
163 int local = translation_table[mpc_record]->trans_local;
164
165 mp_bus_id_to_node[m->busid] = quad;
166 mp_bus_id_to_local[m->busid] = local;
167
168 printk(KERN_INFO "Bus #%d is %s (node %d)\n", m->busid, name, quad);
169}
170
171/* x86_quirks member */
172static void mpc_oem_pci_bus(struct mpc_bus *m)
173{
174 int quad = translation_table[mpc_record]->trans_quad;
175 int local = translation_table[mpc_record]->trans_local;
176
177 quad_local_to_mp_bus_id[quad][local] = m->busid;
178}
179
180static void __init MP_translation_info(struct mpc_trans *m)
181{
182 printk(KERN_INFO
183 "Translation: record %d, type %d, quad %d, global %d, local %d\n",
184 mpc_record, m->trans_type, m->trans_quad, m->trans_global,
185 m->trans_local);
186
187 if (mpc_record >= MAX_MPC_ENTRY)
188 printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
189 else
190 translation_table[mpc_record] = m; /* stash this for later */
191
192 if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
193 node_set_online(m->trans_quad);
194}
195
196static int __init mpf_checksum(unsigned char *mp, int len)
197{
198 int sum = 0;
199
200 while (len--)
201 sum += *mp++;
202
203 return sum & 0xFF;
204}
205
206/*
207 * Read/parse the MPC oem tables
208 */
209static void __init
210 smp_read_mpc_oem(struct mpc_oemtable *oemtable, unsigned short oemsize)
211{
212 int count = sizeof(*oemtable); /* the header size */
213 unsigned char *oemptr = ((unsigned char *)oemtable) + count;
214
215 mpc_record = 0;
216 printk(KERN_INFO
217 "Found an OEM MPC table at %8p - parsing it ... \n", oemtable);
218
219 if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) {
220 printk(KERN_WARNING
221 "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
222 oemtable->signature[0], oemtable->signature[1],
223 oemtable->signature[2], oemtable->signature[3]);
224 return;
225 }
226
227 if (mpf_checksum((unsigned char *)oemtable, oemtable->length)) {
228 printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
229 return;
230 }
231
232 while (count < oemtable->length) {
233 switch (*oemptr) {
234 case MP_TRANSLATION:
235 {
236 struct mpc_trans *m = (void *)oemptr;
237
238 MP_translation_info(m);
239 oemptr += sizeof(*m);
240 count += sizeof(*m);
241 ++mpc_record;
242 break;
243 }
244 default:
245 printk(KERN_WARNING
246 "Unrecognised OEM table entry type! - %d\n",
247 (int)*oemptr);
248 return;
249 }
250 }
251}
252
253static int __init numaq_setup_ioapic_ids(void)
254{
255 /* so can skip it */
256 return 1;
257}
258
259static struct x86_quirks numaq_x86_quirks __initdata = {
260 .arch_pre_time_init = numaq_pre_time_init,
261 .arch_time_init = NULL,
262 .arch_pre_intr_init = NULL,
263 .arch_memory_setup = NULL,
264 .arch_intr_init = NULL,
265 .arch_trap_init = NULL,
266 .mach_get_smp_config = NULL,
267 .mach_find_smp_config = NULL,
268 .mpc_record = &mpc_record,
269 .mpc_apic_id = mpc_apic_id,
270 .mpc_oem_bus_info = mpc_oem_bus_info,
271 .mpc_oem_pci_bus = mpc_oem_pci_bus,
272 .smp_read_mpc_oem = smp_read_mpc_oem,
273 .setup_ioapic_ids = numaq_setup_ioapic_ids,
274};
275
276static __init void early_check_numaq(void)
277{
278 /*
279 * Find possible boot-time SMP configuration:
280 */
281 early_find_smp_config();
282
283 /*
284 * get boot-time SMP configuration:
285 */
286 if (smp_found_config)
287 early_get_smp_config();
288
289 if (found_numaq)
290 x86_quirks = &numaq_x86_quirks;
291}
292
293int __init get_memcfg_numaq(void)
294{
295 early_check_numaq();
296 if (!found_numaq)
297 return 0;
298 smp_dump_qct();
299
300 return 1;
301}
302
303#define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
304
305static inline unsigned int numaq_get_apic_id(unsigned long x)
306{
307 return (x >> 24) & 0x0F;
308}
309
310static inline void numaq_send_IPI_mask(const struct cpumask *mask, int vector)
311{
312 default_send_IPI_mask_sequence_logical(mask, vector);
313}
314
315static inline void numaq_send_IPI_allbutself(int vector)
316{
317 default_send_IPI_mask_allbutself_logical(cpu_online_mask, vector);
318}
319
320static inline void numaq_send_IPI_all(int vector)
321{
322 numaq_send_IPI_mask(cpu_online_mask, vector);
323}
324
325#define NUMAQ_TRAMPOLINE_PHYS_LOW (0x8)
326#define NUMAQ_TRAMPOLINE_PHYS_HIGH (0xa)
327
328/*
329 * Because we use NMIs rather than the INIT-STARTUP sequence to
330 * bootstrap the CPUs, the APIC may be in a weird state. Kick it:
331 */
332static inline void numaq_smp_callin_clear_local_apic(void)
333{
334 clear_local_APIC();
335}
336
337static inline const cpumask_t *numaq_target_cpus(void)
338{
339 return &CPU_MASK_ALL;
340}
341
342static inline unsigned long
343numaq_check_apicid_used(physid_mask_t bitmap, int apicid)
344{
345 return physid_isset(apicid, bitmap);
346}
347
348static inline unsigned long numaq_check_apicid_present(int bit)
349{
350 return physid_isset(bit, phys_cpu_present_map);
351}
352
353static inline int numaq_apic_id_registered(void)
354{
355 return 1;
356}
357
358static inline void numaq_init_apic_ldr(void)
359{
360 /* Already done in NUMA-Q firmware */
361}
362
363static inline void numaq_setup_apic_routing(void)
364{
365 printk(KERN_INFO
366 "Enabling APIC mode: NUMA-Q. Using %d I/O APICs\n",
367 nr_ioapics);
368}
369
370/*
371 * Skip adding the timer int on secondary nodes, which causes
372 * a small but painful rift in the time-space continuum.
373 */
374static inline int numaq_multi_timer_check(int apic, int irq)
375{
376 return apic != 0 && irq == 0;
377}
378
379static inline physid_mask_t numaq_ioapic_phys_id_map(physid_mask_t phys_map)
380{
381 /* We don't have a good way to do this yet - hack */
382 return physids_promote(0xFUL);
383}
384
385static inline int numaq_cpu_to_logical_apicid(int cpu)
386{
387 if (cpu >= nr_cpu_ids)
388 return BAD_APICID;
389 return cpu_2_logical_apicid[cpu];
390}
391
392/*
393 * Supporting over 60 cpus on NUMA-Q requires a locality-dependent
394 * cpu to APIC ID relation to properly interact with the intelligent
395 * mode of the cluster controller.
396 */
397static inline int numaq_cpu_present_to_apicid(int mps_cpu)
398{
399 if (mps_cpu < 60)
400 return ((mps_cpu >> 2) << 4) | (1 << (mps_cpu & 0x3));
401 else
402 return BAD_APICID;
403}
404
405static inline int numaq_apicid_to_node(int logical_apicid)
406{
407 return logical_apicid >> 4;
408}
409
410static inline physid_mask_t numaq_apicid_to_cpu_present(int logical_apicid)
411{
412 int node = numaq_apicid_to_node(logical_apicid);
413 int cpu = __ffs(logical_apicid & 0xf);
414
415 return physid_mask_of_physid(cpu + 4*node);
416}
417
418/* Where the IO area was mapped on multiquad, always 0 otherwise */
419void *xquad_portio;
420
421static inline int numaq_check_phys_apicid_present(int boot_cpu_physical_apicid)
422{
423 return 1;
424}
425
426/*
427 * We use physical apicids here, not logical, so just return the default
428 * physical broadcast to stop people from breaking us
429 */
430static inline unsigned int numaq_cpu_mask_to_apicid(const cpumask_t *cpumask)
431{
432 return 0x0F;
433}
434
435static inline unsigned int
436numaq_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
437 const struct cpumask *andmask)
438{
439 return 0x0F;
440}
441
442/* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */
443static inline int numaq_phys_pkg_id(int cpuid_apic, int index_msb)
444{
445 return cpuid_apic >> index_msb;
446}
447
448static int
449numaq_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
450{
451 if (strncmp(oem, "IBM NUMA", 8))
452 printk(KERN_ERR "Warning! Not a NUMA-Q system!\n");
453 else
454 found_numaq = 1;
455
456 return found_numaq;
457}
458
459static int probe_numaq(void)
460{
461 /* already know from get_memcfg_numaq() */
462 return found_numaq;
463}
464
465static void numaq_vector_allocation_domain(int cpu, cpumask_t *retmask)
466{
467 /* Careful. Some cpus do not strictly honor the set of cpus
468 * specified in the interrupt destination when using lowest
469 * priority interrupt delivery mode.
470 *
471 * In particular there was a hyperthreading cpu observed to
472 * deliver interrupts to the wrong hyperthread when only one
473 * hyperthread was specified in the interrupt desitination.
474 */
475 *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } };
476}
477
478static void numaq_setup_portio_remap(void)
479{
480 int num_quads = num_online_nodes();
481
482 if (num_quads <= 1)
483 return;
484
485 printk(KERN_INFO
486 "Remapping cross-quad port I/O for %d quads\n", num_quads);
487
488 xquad_portio = ioremap(XQUAD_PORTIO_BASE, num_quads*XQUAD_PORTIO_QUAD);
489
490 printk(KERN_INFO
491 "xquad_portio vaddr 0x%08lx, len %08lx\n",
492 (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD);
493}
494
495struct apic apic_numaq = {
496
497 .name = "NUMAQ",
498 .probe = probe_numaq,
499 .acpi_madt_oem_check = NULL,
500 .apic_id_registered = numaq_apic_id_registered,
501
502 .irq_delivery_mode = dest_LowestPrio,
503 /* physical delivery on LOCAL quad: */
504 .irq_dest_mode = 0,
505
506 .target_cpus = numaq_target_cpus,
507 .disable_esr = 1,
508 .dest_logical = APIC_DEST_LOGICAL,
509 .check_apicid_used = numaq_check_apicid_used,
510 .check_apicid_present = numaq_check_apicid_present,
511
512 .vector_allocation_domain = numaq_vector_allocation_domain,
513 .init_apic_ldr = numaq_init_apic_ldr,
514
515 .ioapic_phys_id_map = numaq_ioapic_phys_id_map,
516 .setup_apic_routing = numaq_setup_apic_routing,
517 .multi_timer_check = numaq_multi_timer_check,
518 .apicid_to_node = numaq_apicid_to_node,
519 .cpu_to_logical_apicid = numaq_cpu_to_logical_apicid,
520 .cpu_present_to_apicid = numaq_cpu_present_to_apicid,
521 .apicid_to_cpu_present = numaq_apicid_to_cpu_present,
522 .setup_portio_remap = numaq_setup_portio_remap,
523 .check_phys_apicid_present = numaq_check_phys_apicid_present,
524 .enable_apic_mode = NULL,
525 .phys_pkg_id = numaq_phys_pkg_id,
526 .mps_oem_check = numaq_mps_oem_check,
527
528 .get_apic_id = numaq_get_apic_id,
529 .set_apic_id = NULL,
530 .apic_id_mask = 0x0F << 24,
531
532 .cpu_mask_to_apicid = numaq_cpu_mask_to_apicid,
533 .cpu_mask_to_apicid_and = numaq_cpu_mask_to_apicid_and,
534
535 .send_IPI_mask = numaq_send_IPI_mask,
536 .send_IPI_mask_allbutself = NULL,
537 .send_IPI_allbutself = numaq_send_IPI_allbutself,
538 .send_IPI_all = numaq_send_IPI_all,
539 .send_IPI_self = default_send_IPI_self,
540
541 .wakeup_secondary_cpu = wakeup_secondary_cpu_via_nmi,
542 .trampoline_phys_low = NUMAQ_TRAMPOLINE_PHYS_LOW,
543 .trampoline_phys_high = NUMAQ_TRAMPOLINE_PHYS_HIGH,
544
545 /* We don't do anything here because we use NMI's to boot instead */
546 .wait_for_init_deassert = NULL,
547
548 .smp_callin_clear_local_apic = numaq_smp_callin_clear_local_apic,
549 .inquire_remote_apic = NULL,
550
551 .read = native_apic_mem_read,
552 .write = native_apic_mem_write,
553 .icr_read = native_apic_icr_read,
554 .icr_write = native_apic_icr_write,
555 .wait_icr_idle = native_apic_wait_icr_idle,
556 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
557};
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
new file mode 100644
index 000000000000..141c99a1c264
--- /dev/null
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -0,0 +1,284 @@
1/*
2 * Default generic APIC driver. This handles up to 8 CPUs.
3 *
4 * Copyright 2003 Andi Kleen, SuSE Labs.
5 * Subject to the GNU Public License, v.2
6 *
7 * Generic x86 APIC driver probe layer.
8 */
9#include <linux/threads.h>
10#include <linux/cpumask.h>
11#include <linux/module.h>
12#include <linux/string.h>
13#include <linux/kernel.h>
14#include <linux/ctype.h>
15#include <linux/init.h>
16#include <linux/errno.h>
17#include <asm/fixmap.h>
18#include <asm/mpspec.h>
19#include <asm/apicdef.h>
20#include <asm/apic.h>
21#include <asm/setup.h>
22
23#include <linux/threads.h>
24#include <linux/cpumask.h>
25#include <asm/mpspec.h>
26#include <asm/fixmap.h>
27#include <asm/apicdef.h>
28#include <linux/kernel.h>
29#include <linux/string.h>
30#include <linux/smp.h>
31#include <linux/init.h>
32#include <asm/ipi.h>
33
34#include <linux/smp.h>
35#include <linux/init.h>
36#include <linux/interrupt.h>
37#include <asm/acpi.h>
38#include <asm/e820.h>
39#include <asm/setup.h>
40
41#ifdef CONFIG_HOTPLUG_CPU
42#define DEFAULT_SEND_IPI (1)
43#else
44#define DEFAULT_SEND_IPI (0)
45#endif
46
47int no_broadcast = DEFAULT_SEND_IPI;
48
49static __init int no_ipi_broadcast(char *str)
50{
51 get_option(&str, &no_broadcast);
52 pr_info("Using %s mode\n",
53 no_broadcast ? "No IPI Broadcast" : "IPI Broadcast");
54 return 1;
55}
56__setup("no_ipi_broadcast=", no_ipi_broadcast);
57
58static int __init print_ipi_mode(void)
59{
60 pr_info("Using IPI %s mode\n",
61 no_broadcast ? "No-Shortcut" : "Shortcut");
62 return 0;
63}
64late_initcall(print_ipi_mode);
65
66void default_setup_apic_routing(void)
67{
68#ifdef CONFIG_X86_IO_APIC
69 printk(KERN_INFO
70 "Enabling APIC mode: Flat. Using %d I/O APICs\n",
71 nr_ioapics);
72#endif
73}
74
75static void default_vector_allocation_domain(int cpu, struct cpumask *retmask)
76{
77 /*
78 * Careful. Some cpus do not strictly honor the set of cpus
79 * specified in the interrupt destination when using lowest
80 * priority interrupt delivery mode.
81 *
82 * In particular there was a hyperthreading cpu observed to
83 * deliver interrupts to the wrong hyperthread when only one
84 * hyperthread was specified in the interrupt desitination.
85 */
86 *retmask = (cpumask_t) { { [0] = APIC_ALL_CPUS } };
87}
88
89/* should be called last. */
90static int probe_default(void)
91{
92 return 1;
93}
94
95struct apic apic_default = {
96
97 .name = "default",
98 .probe = probe_default,
99 .acpi_madt_oem_check = NULL,
100 .apic_id_registered = default_apic_id_registered,
101
102 .irq_delivery_mode = dest_LowestPrio,
103 /* logical delivery broadcast to all CPUs: */
104 .irq_dest_mode = 1,
105
106 .target_cpus = default_target_cpus,
107 .disable_esr = 0,
108 .dest_logical = APIC_DEST_LOGICAL,
109 .check_apicid_used = default_check_apicid_used,
110 .check_apicid_present = default_check_apicid_present,
111
112 .vector_allocation_domain = default_vector_allocation_domain,
113 .init_apic_ldr = default_init_apic_ldr,
114
115 .ioapic_phys_id_map = default_ioapic_phys_id_map,
116 .setup_apic_routing = default_setup_apic_routing,
117 .multi_timer_check = NULL,
118 .apicid_to_node = default_apicid_to_node,
119 .cpu_to_logical_apicid = default_cpu_to_logical_apicid,
120 .cpu_present_to_apicid = default_cpu_present_to_apicid,
121 .apicid_to_cpu_present = default_apicid_to_cpu_present,
122 .setup_portio_remap = NULL,
123 .check_phys_apicid_present = default_check_phys_apicid_present,
124 .enable_apic_mode = NULL,
125 .phys_pkg_id = default_phys_pkg_id,
126 .mps_oem_check = NULL,
127
128 .get_apic_id = default_get_apic_id,
129 .set_apic_id = NULL,
130 .apic_id_mask = 0x0F << 24,
131
132 .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
133 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
134
135 .send_IPI_mask = default_send_IPI_mask_logical,
136 .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical,
137 .send_IPI_allbutself = default_send_IPI_allbutself,
138 .send_IPI_all = default_send_IPI_all,
139 .send_IPI_self = default_send_IPI_self,
140
141 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
142 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
143
144 .wait_for_init_deassert = default_wait_for_init_deassert,
145
146 .smp_callin_clear_local_apic = NULL,
147 .inquire_remote_apic = default_inquire_remote_apic,
148
149 .read = native_apic_mem_read,
150 .write = native_apic_mem_write,
151 .icr_read = native_apic_icr_read,
152 .icr_write = native_apic_icr_write,
153 .wait_icr_idle = native_apic_wait_icr_idle,
154 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
155};
156
157extern struct apic apic_numaq;
158extern struct apic apic_summit;
159extern struct apic apic_bigsmp;
160extern struct apic apic_es7000;
161extern struct apic apic_es7000_cluster;
162extern struct apic apic_default;
163
164struct apic *apic = &apic_default;
165EXPORT_SYMBOL_GPL(apic);
166
167static struct apic *apic_probe[] __initdata = {
168#ifdef CONFIG_X86_NUMAQ
169 &apic_numaq,
170#endif
171#ifdef CONFIG_X86_SUMMIT
172 &apic_summit,
173#endif
174#ifdef CONFIG_X86_BIGSMP
175 &apic_bigsmp,
176#endif
177#ifdef CONFIG_X86_ES7000
178 &apic_es7000,
179 &apic_es7000_cluster,
180#endif
181 &apic_default, /* must be last */
182 NULL,
183};
184
185static int cmdline_apic __initdata;
186static int __init parse_apic(char *arg)
187{
188 int i;
189
190 if (!arg)
191 return -EINVAL;
192
193 for (i = 0; apic_probe[i]; i++) {
194 if (!strcmp(apic_probe[i]->name, arg)) {
195 apic = apic_probe[i];
196 cmdline_apic = 1;
197 return 0;
198 }
199 }
200
201 /* Parsed again by __setup for debug/verbose */
202 return 0;
203}
204early_param("apic", parse_apic);
205
206void __init generic_bigsmp_probe(void)
207{
208#ifdef CONFIG_X86_BIGSMP
209 /*
210 * This routine is used to switch to bigsmp mode when
211 * - There is no apic= option specified by the user
212 * - generic_apic_probe() has chosen apic_default as the sub_arch
213 * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support
214 */
215
216 if (!cmdline_apic && apic == &apic_default) {
217 if (apic_bigsmp.probe()) {
218 apic = &apic_bigsmp;
219 printk(KERN_INFO "Overriding APIC driver with %s\n",
220 apic->name);
221 }
222 }
223#endif
224}
225
226void __init generic_apic_probe(void)
227{
228 if (!cmdline_apic) {
229 int i;
230 for (i = 0; apic_probe[i]; i++) {
231 if (apic_probe[i]->probe()) {
232 apic = apic_probe[i];
233 break;
234 }
235 }
236 /* Not visible without early console */
237 if (!apic_probe[i])
238 panic("Didn't find an APIC driver");
239 }
240 printk(KERN_INFO "Using APIC driver %s\n", apic->name);
241}
242
243/* These functions can switch the APIC even after the initial ->probe() */
244
245int __init
246generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
247{
248 int i;
249
250 for (i = 0; apic_probe[i]; ++i) {
251 if (!apic_probe[i]->mps_oem_check)
252 continue;
253 if (!apic_probe[i]->mps_oem_check(mpc, oem, productid))
254 continue;
255
256 if (!cmdline_apic) {
257 apic = apic_probe[i];
258 printk(KERN_INFO "Switched to APIC driver `%s'.\n",
259 apic->name);
260 }
261 return 1;
262 }
263 return 0;
264}
265
266int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
267{
268 int i;
269
270 for (i = 0; apic_probe[i]; ++i) {
271 if (!apic_probe[i]->acpi_madt_oem_check)
272 continue;
273 if (!apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id))
274 continue;
275
276 if (!cmdline_apic) {
277 apic = apic_probe[i];
278 printk(KERN_INFO "Switched to APIC driver `%s'.\n",
279 apic->name);
280 }
281 return 1;
282 }
283 return 0;
284}
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/apic/probe_64.c
index 2bced78b0b8e..8d7748efe6a8 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -19,22 +19,27 @@
19#include <linux/dmar.h> 19#include <linux/dmar.h>
20 20
21#include <asm/smp.h> 21#include <asm/smp.h>
22#include <asm/apic.h>
22#include <asm/ipi.h> 23#include <asm/ipi.h>
23#include <asm/genapic.h>
24#include <asm/setup.h> 24#include <asm/setup.h>
25 25
26extern struct genapic apic_flat; 26extern struct apic apic_flat;
27extern struct genapic apic_physflat; 27extern struct apic apic_physflat;
28extern struct genapic apic_x2xpic_uv_x; 28extern struct apic apic_x2xpic_uv_x;
29extern struct genapic apic_x2apic_phys; 29extern struct apic apic_x2apic_phys;
30extern struct genapic apic_x2apic_cluster; 30extern struct apic apic_x2apic_cluster;
31 31
32struct genapic __read_mostly *genapic = &apic_flat; 32struct apic __read_mostly *apic = &apic_flat;
33EXPORT_SYMBOL_GPL(apic);
33 34
34static struct genapic *apic_probe[] __initdata = { 35static struct apic *apic_probe[] __initdata = {
36#ifdef CONFIG_X86_UV
35 &apic_x2apic_uv_x, 37 &apic_x2apic_uv_x,
38#endif
39#ifdef CONFIG_X86_X2APIC
36 &apic_x2apic_phys, 40 &apic_x2apic_phys,
37 &apic_x2apic_cluster, 41 &apic_x2apic_cluster,
42#endif
38 &apic_physflat, 43 &apic_physflat,
39 NULL, 44 NULL,
40}; 45};
@@ -42,39 +47,45 @@ static struct genapic *apic_probe[] __initdata = {
42/* 47/*
43 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. 48 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
44 */ 49 */
45void __init setup_apic_routing(void) 50void __init default_setup_apic_routing(void)
46{ 51{
47 if (genapic == &apic_x2apic_phys || genapic == &apic_x2apic_cluster) { 52#ifdef CONFIG_X86_X2APIC
48 if (!intr_remapping_enabled) 53 if (x2apic && (apic != &apic_x2apic_phys &&
49 genapic = &apic_flat; 54#ifdef CONFIG_X86_UV
55 apic != &apic_x2apic_uv_x &&
56#endif
57 apic != &apic_x2apic_cluster)) {
58 if (x2apic_phys)
59 apic = &apic_x2apic_phys;
60 else
61 apic = &apic_x2apic_cluster;
62 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
50 } 63 }
64#endif
51 65
52 if (genapic == &apic_flat) { 66 if (apic == &apic_flat) {
53 if (max_physical_apicid >= 8) 67 if (max_physical_apicid >= 8)
54 genapic = &apic_physflat; 68 apic = &apic_physflat;
55 printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); 69 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
56 } 70 }
57
58 if (x86_quirks->update_genapic)
59 x86_quirks->update_genapic();
60} 71}
61 72
62/* Same for both flat and physical. */ 73/* Same for both flat and physical. */
63 74
64void apic_send_IPI_self(int vector) 75void apic_send_IPI_self(int vector)
65{ 76{
66 __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); 77 __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
67} 78}
68 79
69int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) 80int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
70{ 81{
71 int i; 82 int i;
72 83
73 for (i = 0; apic_probe[i]; ++i) { 84 for (i = 0; apic_probe[i]; ++i) {
74 if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) { 85 if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) {
75 genapic = apic_probe[i]; 86 apic = apic_probe[i];
76 printk(KERN_INFO "Setting APIC routing to %s.\n", 87 printk(KERN_INFO "Setting APIC routing to %s.\n",
77 genapic->name); 88 apic->name);
78 return 1; 89 return 1;
79 } 90 }
80 } 91 }
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
new file mode 100644
index 000000000000..aac52fa873ff
--- /dev/null
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -0,0 +1,579 @@
1/*
2 * IBM Summit-Specific Code
3 *
4 * Written By: Matthew Dobson, IBM Corporation
5 *
6 * Copyright (c) 2003 IBM Corp.
7 *
8 * All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or (at
13 * your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
18 * NON INFRINGEMENT. See the GNU General Public License for more
19 * details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 * Send feedback to <colpatch@us.ibm.com>
26 *
27 */
28
29#include <linux/mm.h>
30#include <linux/init.h>
31#include <asm/io.h>
32#include <asm/bios_ebda.h>
33
34/*
35 * APIC driver for the IBM "Summit" chipset.
36 */
37#include <linux/threads.h>
38#include <linux/cpumask.h>
39#include <asm/mpspec.h>
40#include <asm/apic.h>
41#include <asm/smp.h>
42#include <asm/fixmap.h>
43#include <asm/apicdef.h>
44#include <asm/ipi.h>
45#include <linux/kernel.h>
46#include <linux/string.h>
47#include <linux/init.h>
48#include <linux/gfp.h>
49#include <linux/smp.h>
50
51static unsigned summit_get_apic_id(unsigned long x)
52{
53 return (x >> 24) & 0xFF;
54}
55
56static inline void summit_send_IPI_mask(const cpumask_t *mask, int vector)
57{
58 default_send_IPI_mask_sequence_logical(mask, vector);
59}
60
61static void summit_send_IPI_allbutself(int vector)
62{
63 cpumask_t mask = cpu_online_map;
64 cpu_clear(smp_processor_id(), mask);
65
66 if (!cpus_empty(mask))
67 summit_send_IPI_mask(&mask, vector);
68}
69
70static void summit_send_IPI_all(int vector)
71{
72 summit_send_IPI_mask(&cpu_online_map, vector);
73}
74
75#include <asm/tsc.h>
76
77extern int use_cyclone;
78
79#ifdef CONFIG_X86_SUMMIT_NUMA
80static void setup_summit(void);
81#else
82static inline void setup_summit(void) {}
83#endif
84
85static int summit_mps_oem_check(struct mpc_table *mpc, char *oem,
86 char *productid)
87{
88 if (!strncmp(oem, "IBM ENSW", 8) &&
89 (!strncmp(productid, "VIGIL SMP", 9)
90 || !strncmp(productid, "EXA", 3)
91 || !strncmp(productid, "RUTHLESS SMP", 12))){
92 mark_tsc_unstable("Summit based system");
93 use_cyclone = 1; /*enable cyclone-timer*/
94 setup_summit();
95 return 1;
96 }
97 return 0;
98}
99
100/* Hook from generic ACPI tables.c */
101static int summit_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
102{
103 if (!strncmp(oem_id, "IBM", 3) &&
104 (!strncmp(oem_table_id, "SERVIGIL", 8)
105 || !strncmp(oem_table_id, "EXA", 3))){
106 mark_tsc_unstable("Summit based system");
107 use_cyclone = 1; /*enable cyclone-timer*/
108 setup_summit();
109 return 1;
110 }
111 return 0;
112}
113
114struct rio_table_hdr {
115 unsigned char version; /* Version number of this data structure */
116 /* Version 3 adds chassis_num & WP_index */
117 unsigned char num_scal_dev; /* # of Scalability devices (Twisters for Vigil) */
118 unsigned char num_rio_dev; /* # of RIO I/O devices (Cyclones and Winnipegs) */
119} __attribute__((packed));
120
121struct scal_detail {
122 unsigned char node_id; /* Scalability Node ID */
123 unsigned long CBAR; /* Address of 1MB register space */
124 unsigned char port0node; /* Node ID port connected to: 0xFF=None */
125 unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */
126 unsigned char port1node; /* Node ID port connected to: 0xFF = None */
127 unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */
128 unsigned char port2node; /* Node ID port connected to: 0xFF = None */
129 unsigned char port2port; /* Port num port connected to: 0,1,2, or 0xFF=None */
130 unsigned char chassis_num; /* 1 based Chassis number (1 = boot node) */
131} __attribute__((packed));
132
133struct rio_detail {
134 unsigned char node_id; /* RIO Node ID */
135 unsigned long BBAR; /* Address of 1MB register space */
136 unsigned char type; /* Type of device */
137 unsigned char owner_id; /* For WPEG: Node ID of Cyclone that owns this WPEG*/
138 /* For CYC: Node ID of Twister that owns this CYC */
139 unsigned char port0node; /* Node ID port connected to: 0xFF=None */
140 unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */
141 unsigned char port1node; /* Node ID port connected to: 0xFF=None */
142 unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */
143 unsigned char first_slot; /* For WPEG: Lowest slot number below this WPEG */
144 /* For CYC: 0 */
145 unsigned char status; /* For WPEG: Bit 0 = 1 : the XAPIC is used */
146 /* = 0 : the XAPIC is not used, ie:*/
147 /* ints fwded to another XAPIC */
148 /* Bits1:7 Reserved */
149 /* For CYC: Bits0:7 Reserved */
150 unsigned char WP_index; /* For WPEG: WPEG instance index - lower ones have */
151 /* lower slot numbers/PCI bus numbers */
152 /* For CYC: No meaning */
153 unsigned char chassis_num; /* 1 based Chassis number */
154 /* For LookOut WPEGs this field indicates the */
155 /* Expansion Chassis #, enumerated from Boot */
156 /* Node WPEG external port, then Boot Node CYC */
157 /* external port, then Next Vigil chassis WPEG */
158 /* external port, etc. */
159 /* Shared Lookouts have only 1 chassis number (the */
160 /* first one assigned) */
161} __attribute__((packed));
162
163
164typedef enum {
165 CompatTwister = 0, /* Compatibility Twister */
166 AltTwister = 1, /* Alternate Twister of internal 8-way */
167 CompatCyclone = 2, /* Compatibility Cyclone */
168 AltCyclone = 3, /* Alternate Cyclone of internal 8-way */
169 CompatWPEG = 4, /* Compatibility WPEG */
170 AltWPEG = 5, /* Second Planar WPEG */
171 LookOutAWPEG = 6, /* LookOut WPEG */
172 LookOutBWPEG = 7, /* LookOut WPEG */
173} node_type;
174
175static inline int is_WPEG(struct rio_detail *rio){
176 return (rio->type == CompatWPEG || rio->type == AltWPEG ||
177 rio->type == LookOutAWPEG || rio->type == LookOutBWPEG);
178}
179
180
181/* In clustered mode, the high nibble of APIC ID is a cluster number.
182 * The low nibble is a 4-bit bitmap. */
183#define XAPIC_DEST_CPUS_SHIFT 4
184#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
185#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)
186
187#define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
188
189static const cpumask_t *summit_target_cpus(void)
190{
191 /* CPU_MASK_ALL (0xff) has undefined behaviour with
192 * dest_LowestPrio mode logical clustered apic interrupt routing
193 * Just start on cpu 0. IRQ balancing will spread load
194 */
195 return &cpumask_of_cpu(0);
196}
197
198static unsigned long summit_check_apicid_used(physid_mask_t bitmap, int apicid)
199{
200 return 0;
201}
202
203/* we don't use the phys_cpu_present_map to indicate apicid presence */
204static unsigned long summit_check_apicid_present(int bit)
205{
206 return 1;
207}
208
209static void summit_init_apic_ldr(void)
210{
211 unsigned long val, id;
212 int count = 0;
213 u8 my_id = (u8)hard_smp_processor_id();
214 u8 my_cluster = APIC_CLUSTER(my_id);
215#ifdef CONFIG_SMP
216 u8 lid;
217 int i;
218
219 /* Create logical APIC IDs by counting CPUs already in cluster. */
220 for (count = 0, i = nr_cpu_ids; --i >= 0; ) {
221 lid = cpu_2_logical_apicid[i];
222 if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster)
223 ++count;
224 }
225#endif
226 /* We only have a 4 wide bitmap in cluster mode. If a deranged
227 * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */
228 BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT);
229 id = my_cluster | (1UL << count);
230 apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE);
231 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
232 val |= SET_APIC_LOGICAL_ID(id);
233 apic_write(APIC_LDR, val);
234}
235
236static int summit_apic_id_registered(void)
237{
238 return 1;
239}
240
241static void summit_setup_apic_routing(void)
242{
243 printk("Enabling APIC mode: Summit. Using %d I/O APICs\n",
244 nr_ioapics);
245}
246
247static int summit_apicid_to_node(int logical_apicid)
248{
249#ifdef CONFIG_SMP
250 return apicid_2_node[hard_smp_processor_id()];
251#else
252 return 0;
253#endif
254}
255
256/* Mapping from cpu number to logical apicid */
257static inline int summit_cpu_to_logical_apicid(int cpu)
258{
259#ifdef CONFIG_SMP
260 if (cpu >= nr_cpu_ids)
261 return BAD_APICID;
262 return cpu_2_logical_apicid[cpu];
263#else
264 return logical_smp_processor_id();
265#endif
266}
267
268static int summit_cpu_present_to_apicid(int mps_cpu)
269{
270 if (mps_cpu < nr_cpu_ids)
271 return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
272 else
273 return BAD_APICID;
274}
275
276static physid_mask_t summit_ioapic_phys_id_map(physid_mask_t phys_id_map)
277{
278 /* For clustered we don't have a good way to do this yet - hack */
279 return physids_promote(0x0F);
280}
281
282static physid_mask_t summit_apicid_to_cpu_present(int apicid)
283{
284 return physid_mask_of_physid(0);
285}
286
287static int summit_check_phys_apicid_present(int boot_cpu_physical_apicid)
288{
289 return 1;
290}
291
292static unsigned int summit_cpu_mask_to_apicid(const cpumask_t *cpumask)
293{
294 unsigned int round = 0;
295 int cpu, apicid = 0;
296
297 /*
298 * The cpus in the mask must all be on the apic cluster.
299 */
300 for_each_cpu(cpu, cpumask) {
301 int new_apicid = summit_cpu_to_logical_apicid(cpu);
302
303 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
304 printk("%s: Not a valid mask!\n", __func__);
305 return BAD_APICID;
306 }
307 apicid |= new_apicid;
308 round++;
309 }
310 return apicid;
311}
312
313static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask,
314 const struct cpumask *andmask)
315{
316 int apicid = summit_cpu_to_logical_apicid(0);
317 cpumask_var_t cpumask;
318
319 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
320 return apicid;
321
322 cpumask_and(cpumask, inmask, andmask);
323 cpumask_and(cpumask, cpumask, cpu_online_mask);
324 apicid = summit_cpu_mask_to_apicid(cpumask);
325
326 free_cpumask_var(cpumask);
327
328 return apicid;
329}
330
331/*
332 * cpuid returns the value latched in the HW at reset, not the APIC ID
333 * register's value. For any box whose BIOS changes APIC IDs, like
334 * clustered APIC systems, we must use hard_smp_processor_id.
335 *
336 * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID.
337 */
338static int summit_phys_pkg_id(int cpuid_apic, int index_msb)
339{
340 return hard_smp_processor_id() >> index_msb;
341}
342
343static int probe_summit(void)
344{
345 /* probed later in mptable/ACPI hooks */
346 return 0;
347}
348
349static void summit_vector_allocation_domain(int cpu, cpumask_t *retmask)
350{
351 /* Careful. Some cpus do not strictly honor the set of cpus
352 * specified in the interrupt destination when using lowest
353 * priority interrupt delivery mode.
354 *
355 * In particular there was a hyperthreading cpu observed to
356 * deliver interrupts to the wrong hyperthread when only one
357 * hyperthread was specified in the interrupt desitination.
358 */
359 *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } };
360}
361
362#ifdef CONFIG_X86_SUMMIT_NUMA
363static struct rio_table_hdr *rio_table_hdr;
364static struct scal_detail *scal_devs[MAX_NUMNODES];
365static struct rio_detail *rio_devs[MAX_NUMNODES*4];
366
367#ifndef CONFIG_X86_NUMAQ
368static int mp_bus_id_to_node[MAX_MP_BUSSES];
369#endif
370
371static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
372{
373 int twister = 0, node = 0;
374 int i, bus, num_buses;
375
376 for (i = 0; i < rio_table_hdr->num_rio_dev; i++) {
377 if (rio_devs[i]->node_id == rio_devs[wpeg_num]->owner_id) {
378 twister = rio_devs[i]->owner_id;
379 break;
380 }
381 }
382 if (i == rio_table_hdr->num_rio_dev) {
383 printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __func__);
384 return last_bus;
385 }
386
387 for (i = 0; i < rio_table_hdr->num_scal_dev; i++) {
388 if (scal_devs[i]->node_id == twister) {
389 node = scal_devs[i]->node_id;
390 break;
391 }
392 }
393 if (i == rio_table_hdr->num_scal_dev) {
394 printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __func__);
395 return last_bus;
396 }
397
398 switch (rio_devs[wpeg_num]->type) {
399 case CompatWPEG:
400 /*
401 * The Compatibility Winnipeg controls the 2 legacy buses,
402 * the 66MHz PCI bus [2 slots] and the 2 "extra" buses in case
403 * a PCI-PCI bridge card is used in either slot: total 5 buses.
404 */
405 num_buses = 5;
406 break;
407 case AltWPEG:
408 /*
409 * The Alternate Winnipeg controls the 2 133MHz buses [1 slot
410 * each], their 2 "extra" buses, the 100MHz bus [2 slots] and
411 * the "extra" buses for each of those slots: total 7 buses.
412 */
413 num_buses = 7;
414 break;
415 case LookOutAWPEG:
416 case LookOutBWPEG:
417 /*
418 * A Lookout Winnipeg controls 3 100MHz buses [2 slots each]
419 * & the "extra" buses for each of those slots: total 9 buses.
420 */
421 num_buses = 9;
422 break;
423 default:
424 printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __func__);
425 return last_bus;
426 }
427
428 for (bus = last_bus; bus < last_bus + num_buses; bus++)
429 mp_bus_id_to_node[bus] = node;
430 return bus;
431}
432
433static int build_detail_arrays(void)
434{
435 unsigned long ptr;
436 int i, scal_detail_size, rio_detail_size;
437
438 if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) {
439 printk(KERN_WARNING "%s: MAX_NUMNODES too low! Defined as %d, but system has %d nodes.\n", __func__, MAX_NUMNODES, rio_table_hdr->num_scal_dev);
440 return 0;
441 }
442
443 switch (rio_table_hdr->version) {
444 default:
445 printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __func__, rio_table_hdr->version);
446 return 0;
447 case 2:
448 scal_detail_size = 11;
449 rio_detail_size = 13;
450 break;
451 case 3:
452 scal_detail_size = 12;
453 rio_detail_size = 15;
454 break;
455 }
456
457 ptr = (unsigned long)rio_table_hdr + 3;
458 for (i = 0; i < rio_table_hdr->num_scal_dev; i++, ptr += scal_detail_size)
459 scal_devs[i] = (struct scal_detail *)ptr;
460
461 for (i = 0; i < rio_table_hdr->num_rio_dev; i++, ptr += rio_detail_size)
462 rio_devs[i] = (struct rio_detail *)ptr;
463
464 return 1;
465}
466
467void setup_summit(void)
468{
469 unsigned long ptr;
470 unsigned short offset;
471 int i, next_wpeg, next_bus = 0;
472
473 /* The pointer to the EBDA is stored in the word @ phys 0x40E(40:0E) */
474 ptr = get_bios_ebda();
475 ptr = (unsigned long)phys_to_virt(ptr);
476
477 rio_table_hdr = NULL;
478 offset = 0x180;
479 while (offset) {
480 /* The block id is stored in the 2nd word */
481 if (*((unsigned short *)(ptr + offset + 2)) == 0x4752) {
482 /* set the pointer past the offset & block id */
483 rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
484 break;
485 }
486 /* The next offset is stored in the 1st word. 0 means no more */
487 offset = *((unsigned short *)(ptr + offset));
488 }
489 if (!rio_table_hdr) {
490 printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __func__);
491 return;
492 }
493
494 if (!build_detail_arrays())
495 return;
496
497 /* The first Winnipeg we're looking for has an index of 0 */
498 next_wpeg = 0;
499 do {
500 for (i = 0; i < rio_table_hdr->num_rio_dev; i++) {
501 if (is_WPEG(rio_devs[i]) && rio_devs[i]->WP_index == next_wpeg) {
502 /* It's the Winnipeg we're looking for! */
503 next_bus = setup_pci_node_map_for_wpeg(i, next_bus);
504 next_wpeg++;
505 break;
506 }
507 }
508 /*
509 * If we go through all Rio devices and don't find one with
510 * the next index, it means we've found all the Winnipegs,
511 * and thus all the PCI buses.
512 */
513 if (i == rio_table_hdr->num_rio_dev)
514 next_wpeg = 0;
515 } while (next_wpeg != 0);
516}
517#endif
518
519struct apic apic_summit = {
520
521 .name = "summit",
522 .probe = probe_summit,
523 .acpi_madt_oem_check = summit_acpi_madt_oem_check,
524 .apic_id_registered = summit_apic_id_registered,
525
526 .irq_delivery_mode = dest_LowestPrio,
527 /* logical delivery broadcast to all CPUs: */
528 .irq_dest_mode = 1,
529
530 .target_cpus = summit_target_cpus,
531 .disable_esr = 1,
532 .dest_logical = APIC_DEST_LOGICAL,
533 .check_apicid_used = summit_check_apicid_used,
534 .check_apicid_present = summit_check_apicid_present,
535
536 .vector_allocation_domain = summit_vector_allocation_domain,
537 .init_apic_ldr = summit_init_apic_ldr,
538
539 .ioapic_phys_id_map = summit_ioapic_phys_id_map,
540 .setup_apic_routing = summit_setup_apic_routing,
541 .multi_timer_check = NULL,
542 .apicid_to_node = summit_apicid_to_node,
543 .cpu_to_logical_apicid = summit_cpu_to_logical_apicid,
544 .cpu_present_to_apicid = summit_cpu_present_to_apicid,
545 .apicid_to_cpu_present = summit_apicid_to_cpu_present,
546 .setup_portio_remap = NULL,
547 .check_phys_apicid_present = summit_check_phys_apicid_present,
548 .enable_apic_mode = NULL,
549 .phys_pkg_id = summit_phys_pkg_id,
550 .mps_oem_check = summit_mps_oem_check,
551
552 .get_apic_id = summit_get_apic_id,
553 .set_apic_id = NULL,
554 .apic_id_mask = 0xFF << 24,
555
556 .cpu_mask_to_apicid = summit_cpu_mask_to_apicid,
557 .cpu_mask_to_apicid_and = summit_cpu_mask_to_apicid_and,
558
559 .send_IPI_mask = summit_send_IPI_mask,
560 .send_IPI_mask_allbutself = NULL,
561 .send_IPI_allbutself = summit_send_IPI_allbutself,
562 .send_IPI_all = summit_send_IPI_all,
563 .send_IPI_self = default_send_IPI_self,
564
565 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
566 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
567
568 .wait_for_init_deassert = default_wait_for_init_deassert,
569
570 .smp_callin_clear_local_apic = NULL,
571 .inquire_remote_apic = default_inquire_remote_apic,
572
573 .read = native_apic_mem_read,
574 .write = native_apic_mem_write,
575 .icr_read = native_apic_icr_read,
576 .icr_write = native_apic_icr_write,
577 .wait_icr_idle = native_apic_wait_icr_idle,
578 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
579};
diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 6ce497cc372d..8fb87b6dd633 100644
--- a/arch/x86/kernel/genx2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -7,17 +7,14 @@
7#include <linux/dmar.h> 7#include <linux/dmar.h>
8 8
9#include <asm/smp.h> 9#include <asm/smp.h>
10#include <asm/apic.h>
10#include <asm/ipi.h> 11#include <asm/ipi.h>
11#include <asm/genapic.h>
12 12
13DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); 13DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
14 14
15static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 15static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
16{ 16{
17 if (cpu_has_x2apic) 17 return x2apic_enabled();
18 return 1;
19
20 return 0;
21} 18}
22 19
23/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ 20/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
@@ -36,8 +33,8 @@ static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
36 cpumask_set_cpu(cpu, retmask); 33 cpumask_set_cpu(cpu, retmask);
37} 34}
38 35
39static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, 36static void
40 unsigned int dest) 37 __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
41{ 38{
42 unsigned long cfg; 39 unsigned long cfg;
43 40
@@ -46,7 +43,7 @@ static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
46 /* 43 /*
47 * send the IPI. 44 * send the IPI.
48 */ 45 */
49 x2apic_icr_write(cfg, apicid); 46 native_x2apic_icr_write(cfg, apicid);
50} 47}
51 48
52/* 49/*
@@ -57,45 +54,50 @@ static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
57 */ 54 */
58static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) 55static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
59{ 56{
60 unsigned long flags;
61 unsigned long query_cpu; 57 unsigned long query_cpu;
58 unsigned long flags;
62 59
63 local_irq_save(flags); 60 local_irq_save(flags);
64 for_each_cpu(query_cpu, mask) 61 for_each_cpu(query_cpu, mask) {
65 __x2apic_send_IPI_dest( 62 __x2apic_send_IPI_dest(
66 per_cpu(x86_cpu_to_logical_apicid, query_cpu), 63 per_cpu(x86_cpu_to_logical_apicid, query_cpu),
67 vector, APIC_DEST_LOGICAL); 64 vector, apic->dest_logical);
65 }
68 local_irq_restore(flags); 66 local_irq_restore(flags);
69} 67}
70 68
71static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, 69static void
72 int vector) 70 x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
73{ 71{
74 unsigned long flags;
75 unsigned long query_cpu;
76 unsigned long this_cpu = smp_processor_id(); 72 unsigned long this_cpu = smp_processor_id();
73 unsigned long query_cpu;
74 unsigned long flags;
77 75
78 local_irq_save(flags); 76 local_irq_save(flags);
79 for_each_cpu(query_cpu, mask) 77 for_each_cpu(query_cpu, mask) {
80 if (query_cpu != this_cpu) 78 if (query_cpu == this_cpu)
81 __x2apic_send_IPI_dest( 79 continue;
80 __x2apic_send_IPI_dest(
82 per_cpu(x86_cpu_to_logical_apicid, query_cpu), 81 per_cpu(x86_cpu_to_logical_apicid, query_cpu),
83 vector, APIC_DEST_LOGICAL); 82 vector, apic->dest_logical);
83 }
84 local_irq_restore(flags); 84 local_irq_restore(flags);
85} 85}
86 86
87static void x2apic_send_IPI_allbutself(int vector) 87static void x2apic_send_IPI_allbutself(int vector)
88{ 88{
89 unsigned long flags;
90 unsigned long query_cpu;
91 unsigned long this_cpu = smp_processor_id(); 89 unsigned long this_cpu = smp_processor_id();
90 unsigned long query_cpu;
91 unsigned long flags;
92 92
93 local_irq_save(flags); 93 local_irq_save(flags);
94 for_each_online_cpu(query_cpu) 94 for_each_online_cpu(query_cpu) {
95 if (query_cpu != this_cpu) 95 if (query_cpu == this_cpu)
96 __x2apic_send_IPI_dest( 96 continue;
97 __x2apic_send_IPI_dest(
97 per_cpu(x86_cpu_to_logical_apicid, query_cpu), 98 per_cpu(x86_cpu_to_logical_apicid, query_cpu),
98 vector, APIC_DEST_LOGICAL); 99 vector, apic->dest_logical);
100 }
99 local_irq_restore(flags); 101 local_irq_restore(flags);
100} 102}
101 103
@@ -111,21 +113,21 @@ static int x2apic_apic_id_registered(void)
111 113
112static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) 114static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
113{ 115{
114 int cpu;
115
116 /* 116 /*
117 * We're using fixed IRQ delivery, can only return one logical APIC ID. 117 * We're using fixed IRQ delivery, can only return one logical APIC ID.
118 * May as well be the first. 118 * May as well be the first.
119 */ 119 */
120 cpu = cpumask_first(cpumask); 120 int cpu = cpumask_first(cpumask);
121
121 if ((unsigned)cpu < nr_cpu_ids) 122 if ((unsigned)cpu < nr_cpu_ids)
122 return per_cpu(x86_cpu_to_logical_apicid, cpu); 123 return per_cpu(x86_cpu_to_logical_apicid, cpu);
123 else 124 else
124 return BAD_APICID; 125 return BAD_APICID;
125} 126}
126 127
127static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, 128static unsigned int
128 const struct cpumask *andmask) 129x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
130 const struct cpumask *andmask)
129{ 131{
130 int cpu; 132 int cpu;
131 133
@@ -133,15 +135,18 @@ static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
133 * We're using fixed IRQ delivery, can only return one logical APIC ID. 135 * We're using fixed IRQ delivery, can only return one logical APIC ID.
134 * May as well be the first. 136 * May as well be the first.
135 */ 137 */
136 for_each_cpu_and(cpu, cpumask, andmask) 138 for_each_cpu_and(cpu, cpumask, andmask) {
137 if (cpumask_test_cpu(cpu, cpu_online_mask)) 139 if (cpumask_test_cpu(cpu, cpu_online_mask))
138 break; 140 break;
141 }
142
139 if (cpu < nr_cpu_ids) 143 if (cpu < nr_cpu_ids)
140 return per_cpu(x86_cpu_to_logical_apicid, cpu); 144 return per_cpu(x86_cpu_to_logical_apicid, cpu);
145
141 return BAD_APICID; 146 return BAD_APICID;
142} 147}
143 148
144static unsigned int get_apic_id(unsigned long x) 149static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x)
145{ 150{
146 unsigned int id; 151 unsigned int id;
147 152
@@ -157,7 +162,7 @@ static unsigned long set_apic_id(unsigned int id)
157 return x; 162 return x;
158} 163}
159 164
160static unsigned int phys_pkg_id(int index_msb) 165static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb)
161{ 166{
162 return current_cpu_data.initial_apicid >> index_msb; 167 return current_cpu_data.initial_apicid >> index_msb;
163} 168}
@@ -172,27 +177,63 @@ static void init_x2apic_ldr(void)
172 int cpu = smp_processor_id(); 177 int cpu = smp_processor_id();
173 178
174 per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR); 179 per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR);
175 return; 180}
176} 181
177 182struct apic apic_x2apic_cluster = {
178struct genapic apic_x2apic_cluster = { 183
179 .name = "cluster x2apic", 184 .name = "cluster x2apic",
180 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, 185 .probe = NULL,
181 .int_delivery_mode = dest_LowestPrio, 186 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
182 .int_dest_mode = (APIC_DEST_LOGICAL != 0), 187 .apic_id_registered = x2apic_apic_id_registered,
183 .target_cpus = x2apic_target_cpus, 188
184 .vector_allocation_domain = x2apic_vector_allocation_domain, 189 .irq_delivery_mode = dest_LowestPrio,
185 .apic_id_registered = x2apic_apic_id_registered, 190 .irq_dest_mode = 1, /* logical */
186 .init_apic_ldr = init_x2apic_ldr, 191
187 .send_IPI_all = x2apic_send_IPI_all, 192 .target_cpus = x2apic_target_cpus,
188 .send_IPI_allbutself = x2apic_send_IPI_allbutself, 193 .disable_esr = 0,
189 .send_IPI_mask = x2apic_send_IPI_mask, 194 .dest_logical = APIC_DEST_LOGICAL,
190 .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, 195 .check_apicid_used = NULL,
191 .send_IPI_self = x2apic_send_IPI_self, 196 .check_apicid_present = NULL,
192 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, 197
193 .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, 198 .vector_allocation_domain = x2apic_vector_allocation_domain,
194 .phys_pkg_id = phys_pkg_id, 199 .init_apic_ldr = init_x2apic_ldr,
195 .get_apic_id = get_apic_id, 200
196 .set_apic_id = set_apic_id, 201 .ioapic_phys_id_map = NULL,
197 .apic_id_mask = (0xFFFFFFFFu), 202 .setup_apic_routing = NULL,
203 .multi_timer_check = NULL,
204 .apicid_to_node = NULL,
205 .cpu_to_logical_apicid = NULL,
206 .cpu_present_to_apicid = default_cpu_present_to_apicid,
207 .apicid_to_cpu_present = NULL,
208 .setup_portio_remap = NULL,
209 .check_phys_apicid_present = default_check_phys_apicid_present,
210 .enable_apic_mode = NULL,
211 .phys_pkg_id = x2apic_cluster_phys_pkg_id,
212 .mps_oem_check = NULL,
213
214 .get_apic_id = x2apic_cluster_phys_get_apic_id,
215 .set_apic_id = set_apic_id,
216 .apic_id_mask = 0xFFFFFFFFu,
217
218 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
219 .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
220
221 .send_IPI_mask = x2apic_send_IPI_mask,
222 .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
223 .send_IPI_allbutself = x2apic_send_IPI_allbutself,
224 .send_IPI_all = x2apic_send_IPI_all,
225 .send_IPI_self = x2apic_send_IPI_self,
226
227 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
228 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
229 .wait_for_init_deassert = NULL,
230 .smp_callin_clear_local_apic = NULL,
231 .inquire_remote_apic = NULL,
232
233 .read = native_apic_msr_read,
234 .write = native_apic_msr_write,
235 .icr_read = native_x2apic_icr_read,
236 .icr_write = native_x2apic_icr_write,
237 .wait_icr_idle = native_x2apic_wait_icr_idle,
238 .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
198}; 239};
diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 21bcc0e098ba..23625b9f98b2 100644
--- a/arch/x86/kernel/genx2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -7,10 +7,10 @@
7#include <linux/dmar.h> 7#include <linux/dmar.h>
8 8
9#include <asm/smp.h> 9#include <asm/smp.h>
10#include <asm/apic.h>
10#include <asm/ipi.h> 11#include <asm/ipi.h>
11#include <asm/genapic.h>
12 12
13static int x2apic_phys; 13int x2apic_phys;
14 14
15static int set_x2apic_phys_mode(char *arg) 15static int set_x2apic_phys_mode(char *arg)
16{ 16{
@@ -21,10 +21,10 @@ early_param("x2apic_phys", set_x2apic_phys_mode);
21 21
22static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 22static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
23{ 23{
24 if (cpu_has_x2apic && x2apic_phys) 24 if (x2apic_phys)
25 return 1; 25 return x2apic_enabled();
26 26 else
27 return 0; 27 return 0;
28} 28}
29 29
30/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ 30/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
@@ -50,13 +50,13 @@ static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
50 /* 50 /*
51 * send the IPI. 51 * send the IPI.
52 */ 52 */
53 x2apic_icr_write(cfg, apicid); 53 native_x2apic_icr_write(cfg, apicid);
54} 54}
55 55
56static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) 56static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
57{ 57{
58 unsigned long flags;
59 unsigned long query_cpu; 58 unsigned long query_cpu;
59 unsigned long flags;
60 60
61 local_irq_save(flags); 61 local_irq_save(flags);
62 for_each_cpu(query_cpu, mask) { 62 for_each_cpu(query_cpu, mask) {
@@ -66,12 +66,12 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
66 local_irq_restore(flags); 66 local_irq_restore(flags);
67} 67}
68 68
69static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, 69static void
70 int vector) 70 x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
71{ 71{
72 unsigned long flags;
73 unsigned long query_cpu;
74 unsigned long this_cpu = smp_processor_id(); 72 unsigned long this_cpu = smp_processor_id();
73 unsigned long query_cpu;
74 unsigned long flags;
75 75
76 local_irq_save(flags); 76 local_irq_save(flags);
77 for_each_cpu(query_cpu, mask) { 77 for_each_cpu(query_cpu, mask) {
@@ -85,16 +85,17 @@ static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask,
85 85
86static void x2apic_send_IPI_allbutself(int vector) 86static void x2apic_send_IPI_allbutself(int vector)
87{ 87{
88 unsigned long flags;
89 unsigned long query_cpu;
90 unsigned long this_cpu = smp_processor_id(); 88 unsigned long this_cpu = smp_processor_id();
89 unsigned long query_cpu;
90 unsigned long flags;
91 91
92 local_irq_save(flags); 92 local_irq_save(flags);
93 for_each_online_cpu(query_cpu) 93 for_each_online_cpu(query_cpu) {
94 if (query_cpu != this_cpu) 94 if (query_cpu == this_cpu)
95 __x2apic_send_IPI_dest( 95 continue;
96 per_cpu(x86_cpu_to_apicid, query_cpu), 96 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
97 vector, APIC_DEST_PHYSICAL); 97 vector, APIC_DEST_PHYSICAL);
98 }
98 local_irq_restore(flags); 99 local_irq_restore(flags);
99} 100}
100 101
@@ -110,21 +111,21 @@ static int x2apic_apic_id_registered(void)
110 111
111static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) 112static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
112{ 113{
113 int cpu;
114
115 /* 114 /*
116 * We're using fixed IRQ delivery, can only return one phys APIC ID. 115 * We're using fixed IRQ delivery, can only return one phys APIC ID.
117 * May as well be the first. 116 * May as well be the first.
118 */ 117 */
119 cpu = cpumask_first(cpumask); 118 int cpu = cpumask_first(cpumask);
119
120 if ((unsigned)cpu < nr_cpu_ids) 120 if ((unsigned)cpu < nr_cpu_ids)
121 return per_cpu(x86_cpu_to_apicid, cpu); 121 return per_cpu(x86_cpu_to_apicid, cpu);
122 else 122 else
123 return BAD_APICID; 123 return BAD_APICID;
124} 124}
125 125
126static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, 126static unsigned int
127 const struct cpumask *andmask) 127x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
128 const struct cpumask *andmask)
128{ 129{
129 int cpu; 130 int cpu;
130 131
@@ -132,31 +133,28 @@ static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
132 * We're using fixed IRQ delivery, can only return one phys APIC ID. 133 * We're using fixed IRQ delivery, can only return one phys APIC ID.
133 * May as well be the first. 134 * May as well be the first.
134 */ 135 */
135 for_each_cpu_and(cpu, cpumask, andmask) 136 for_each_cpu_and(cpu, cpumask, andmask) {
136 if (cpumask_test_cpu(cpu, cpu_online_mask)) 137 if (cpumask_test_cpu(cpu, cpu_online_mask))
137 break; 138 break;
139 }
140
138 if (cpu < nr_cpu_ids) 141 if (cpu < nr_cpu_ids)
139 return per_cpu(x86_cpu_to_apicid, cpu); 142 return per_cpu(x86_cpu_to_apicid, cpu);
143
140 return BAD_APICID; 144 return BAD_APICID;
141} 145}
142 146
143static unsigned int get_apic_id(unsigned long x) 147static unsigned int x2apic_phys_get_apic_id(unsigned long x)
144{ 148{
145 unsigned int id; 149 return x;
146
147 id = x;
148 return id;
149} 150}
150 151
151static unsigned long set_apic_id(unsigned int id) 152static unsigned long set_apic_id(unsigned int id)
152{ 153{
153 unsigned long x; 154 return id;
154
155 x = id;
156 return x;
157} 155}
158 156
159static unsigned int phys_pkg_id(int index_msb) 157static int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
160{ 158{
161 return current_cpu_data.initial_apicid >> index_msb; 159 return current_cpu_data.initial_apicid >> index_msb;
162} 160}
@@ -168,27 +166,63 @@ static void x2apic_send_IPI_self(int vector)
168 166
169static void init_x2apic_ldr(void) 167static void init_x2apic_ldr(void)
170{ 168{
171 return; 169}
172} 170
173 171struct apic apic_x2apic_phys = {
174struct genapic apic_x2apic_phys = { 172
175 .name = "physical x2apic", 173 .name = "physical x2apic",
176 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, 174 .probe = NULL,
177 .int_delivery_mode = dest_Fixed, 175 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
178 .int_dest_mode = (APIC_DEST_PHYSICAL != 0), 176 .apic_id_registered = x2apic_apic_id_registered,
179 .target_cpus = x2apic_target_cpus, 177
180 .vector_allocation_domain = x2apic_vector_allocation_domain, 178 .irq_delivery_mode = dest_Fixed,
181 .apic_id_registered = x2apic_apic_id_registered, 179 .irq_dest_mode = 0, /* physical */
182 .init_apic_ldr = init_x2apic_ldr, 180
183 .send_IPI_all = x2apic_send_IPI_all, 181 .target_cpus = x2apic_target_cpus,
184 .send_IPI_allbutself = x2apic_send_IPI_allbutself, 182 .disable_esr = 0,
185 .send_IPI_mask = x2apic_send_IPI_mask, 183 .dest_logical = 0,
186 .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, 184 .check_apicid_used = NULL,
187 .send_IPI_self = x2apic_send_IPI_self, 185 .check_apicid_present = NULL,
188 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, 186
189 .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, 187 .vector_allocation_domain = x2apic_vector_allocation_domain,
190 .phys_pkg_id = phys_pkg_id, 188 .init_apic_ldr = init_x2apic_ldr,
191 .get_apic_id = get_apic_id, 189
192 .set_apic_id = set_apic_id, 190 .ioapic_phys_id_map = NULL,
193 .apic_id_mask = (0xFFFFFFFFu), 191 .setup_apic_routing = NULL,
192 .multi_timer_check = NULL,
193 .apicid_to_node = NULL,
194 .cpu_to_logical_apicid = NULL,
195 .cpu_present_to_apicid = default_cpu_present_to_apicid,
196 .apicid_to_cpu_present = NULL,
197 .setup_portio_remap = NULL,
198 .check_phys_apicid_present = default_check_phys_apicid_present,
199 .enable_apic_mode = NULL,
200 .phys_pkg_id = x2apic_phys_pkg_id,
201 .mps_oem_check = NULL,
202
203 .get_apic_id = x2apic_phys_get_apic_id,
204 .set_apic_id = set_apic_id,
205 .apic_id_mask = 0xFFFFFFFFu,
206
207 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
208 .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
209
210 .send_IPI_mask = x2apic_send_IPI_mask,
211 .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
212 .send_IPI_allbutself = x2apic_send_IPI_allbutself,
213 .send_IPI_all = x2apic_send_IPI_all,
214 .send_IPI_self = x2apic_send_IPI_self,
215
216 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
217 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
218 .wait_for_init_deassert = NULL,
219 .smp_callin_clear_local_apic = NULL,
220 .inquire_remote_apic = NULL,
221
222 .read = native_apic_msr_read,
223 .write = native_apic_msr_write,
224 .icr_read = native_x2apic_icr_read,
225 .icr_write = native_x2apic_icr_write,
226 .wait_icr_idle = native_x2apic_wait_icr_idle,
227 .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
194}; 228};
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index b193e082f6ce..1bd6da1f8fad 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -7,27 +7,28 @@
7 * 7 *
8 * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. 8 * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
9 */ 9 */
10
11#include <linux/kernel.h>
12#include <linux/threads.h>
13#include <linux/cpu.h>
14#include <linux/cpumask.h> 10#include <linux/cpumask.h>
11#include <linux/hardirq.h>
12#include <linux/proc_fs.h>
13#include <linux/threads.h>
14#include <linux/kernel.h>
15#include <linux/module.h>
15#include <linux/string.h> 16#include <linux/string.h>
16#include <linux/ctype.h> 17#include <linux/ctype.h>
17#include <linux/init.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/module.h>
20#include <linux/hardirq.h>
21#include <linux/timer.h> 19#include <linux/timer.h>
22#include <linux/proc_fs.h> 20#include <linux/cpu.h>
23#include <asm/current.h> 21#include <linux/init.h>
24#include <asm/smp.h> 22
25#include <asm/ipi.h>
26#include <asm/genapic.h>
27#include <asm/pgtable.h>
28#include <asm/uv/uv_mmrs.h> 23#include <asm/uv/uv_mmrs.h>
29#include <asm/uv/uv_hub.h> 24#include <asm/uv/uv_hub.h>
25#include <asm/current.h>
26#include <asm/pgtable.h>
30#include <asm/uv/bios.h> 27#include <asm/uv/bios.h>
28#include <asm/uv/uv.h>
29#include <asm/apic.h>
30#include <asm/ipi.h>
31#include <asm/smp.h>
31 32
32DEFINE_PER_CPU(int, x2apic_extra_bits); 33DEFINE_PER_CPU(int, x2apic_extra_bits);
33 34
@@ -90,39 +91,43 @@ static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
90 cpumask_set_cpu(cpu, retmask); 91 cpumask_set_cpu(cpu, retmask);
91} 92}
92 93
93int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip) 94static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
94{ 95{
96#ifdef CONFIG_SMP
95 unsigned long val; 97 unsigned long val;
96 int pnode; 98 int pnode;
97 99
98 pnode = uv_apicid_to_pnode(phys_apicid); 100 pnode = uv_apicid_to_pnode(phys_apicid);
99 val = (1UL << UVH_IPI_INT_SEND_SHFT) | 101 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
100 (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | 102 (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
101 (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | 103 ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
102 APIC_DM_INIT; 104 APIC_DM_INIT;
103 uv_write_global_mmr64(pnode, UVH_IPI_INT, val); 105 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
104 mdelay(10); 106 mdelay(10);
105 107
106 val = (1UL << UVH_IPI_INT_SEND_SHFT) | 108 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
107 (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | 109 (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
108 (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | 110 ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
109 APIC_DM_STARTUP; 111 APIC_DM_STARTUP;
110 uv_write_global_mmr64(pnode, UVH_IPI_INT, val); 112 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
113
114 atomic_set(&init_deasserted, 1);
115#endif
111 return 0; 116 return 0;
112} 117}
113 118
114static void uv_send_IPI_one(int cpu, int vector) 119static void uv_send_IPI_one(int cpu, int vector)
115{ 120{
116 unsigned long val, apicid, lapicid; 121 unsigned long val, apicid;
117 int pnode; 122 int pnode;
118 123
119 apicid = per_cpu(x86_cpu_to_apicid, cpu); 124 apicid = per_cpu(x86_cpu_to_apicid, cpu);
120 lapicid = apicid & 0x3f; /* ZZZ macro needed */
121 pnode = uv_apicid_to_pnode(apicid); 125 pnode = uv_apicid_to_pnode(apicid);
122 val = 126
123 (1UL << UVH_IPI_INT_SEND_SHFT) | (lapicid << 127 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
124 UVH_IPI_INT_APIC_ID_SHFT) | 128 (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
125 (vector << UVH_IPI_INT_VECTOR_SHFT); 129 (vector << UVH_IPI_INT_VECTOR_SHFT);
130
126 uv_write_global_mmr64(pnode, UVH_IPI_INT, val); 131 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
127} 132}
128 133
@@ -136,22 +141,24 @@ static void uv_send_IPI_mask(const struct cpumask *mask, int vector)
136 141
137static void uv_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) 142static void uv_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
138{ 143{
139 unsigned int cpu;
140 unsigned int this_cpu = smp_processor_id(); 144 unsigned int this_cpu = smp_processor_id();
145 unsigned int cpu;
141 146
142 for_each_cpu(cpu, mask) 147 for_each_cpu(cpu, mask) {
143 if (cpu != this_cpu) 148 if (cpu != this_cpu)
144 uv_send_IPI_one(cpu, vector); 149 uv_send_IPI_one(cpu, vector);
150 }
145} 151}
146 152
147static void uv_send_IPI_allbutself(int vector) 153static void uv_send_IPI_allbutself(int vector)
148{ 154{
149 unsigned int cpu;
150 unsigned int this_cpu = smp_processor_id(); 155 unsigned int this_cpu = smp_processor_id();
156 unsigned int cpu;
151 157
152 for_each_online_cpu(cpu) 158 for_each_online_cpu(cpu) {
153 if (cpu != this_cpu) 159 if (cpu != this_cpu)
154 uv_send_IPI_one(cpu, vector); 160 uv_send_IPI_one(cpu, vector);
161 }
155} 162}
156 163
157static void uv_send_IPI_all(int vector) 164static void uv_send_IPI_all(int vector)
@@ -170,21 +177,21 @@ static void uv_init_apic_ldr(void)
170 177
171static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask) 178static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask)
172{ 179{
173 int cpu;
174
175 /* 180 /*
176 * We're using fixed IRQ delivery, can only return one phys APIC ID. 181 * We're using fixed IRQ delivery, can only return one phys APIC ID.
177 * May as well be the first. 182 * May as well be the first.
178 */ 183 */
179 cpu = cpumask_first(cpumask); 184 int cpu = cpumask_first(cpumask);
185
180 if ((unsigned)cpu < nr_cpu_ids) 186 if ((unsigned)cpu < nr_cpu_ids)
181 return per_cpu(x86_cpu_to_apicid, cpu); 187 return per_cpu(x86_cpu_to_apicid, cpu);
182 else 188 else
183 return BAD_APICID; 189 return BAD_APICID;
184} 190}
185 191
186static unsigned int uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, 192static unsigned int
187 const struct cpumask *andmask) 193uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
194 const struct cpumask *andmask)
188{ 195{
189 int cpu; 196 int cpu;
190 197
@@ -192,15 +199,17 @@ static unsigned int uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
192 * We're using fixed IRQ delivery, can only return one phys APIC ID. 199 * We're using fixed IRQ delivery, can only return one phys APIC ID.
193 * May as well be the first. 200 * May as well be the first.
194 */ 201 */
195 for_each_cpu_and(cpu, cpumask, andmask) 202 for_each_cpu_and(cpu, cpumask, andmask) {
196 if (cpumask_test_cpu(cpu, cpu_online_mask)) 203 if (cpumask_test_cpu(cpu, cpu_online_mask))
197 break; 204 break;
205 }
198 if (cpu < nr_cpu_ids) 206 if (cpu < nr_cpu_ids)
199 return per_cpu(x86_cpu_to_apicid, cpu); 207 return per_cpu(x86_cpu_to_apicid, cpu);
208
200 return BAD_APICID; 209 return BAD_APICID;
201} 210}
202 211
203static unsigned int get_apic_id(unsigned long x) 212static unsigned int x2apic_get_apic_id(unsigned long x)
204{ 213{
205 unsigned int id; 214 unsigned int id;
206 215
@@ -222,10 +231,10 @@ static unsigned long set_apic_id(unsigned int id)
222static unsigned int uv_read_apic_id(void) 231static unsigned int uv_read_apic_id(void)
223{ 232{
224 233
225 return get_apic_id(apic_read(APIC_ID)); 234 return x2apic_get_apic_id(apic_read(APIC_ID));
226} 235}
227 236
228static unsigned int phys_pkg_id(int index_msb) 237static int uv_phys_pkg_id(int initial_apicid, int index_msb)
229{ 238{
230 return uv_read_apic_id() >> index_msb; 239 return uv_read_apic_id() >> index_msb;
231} 240}
@@ -235,26 +244,64 @@ static void uv_send_IPI_self(int vector)
235 apic_write(APIC_SELF_IPI, vector); 244 apic_write(APIC_SELF_IPI, vector);
236} 245}
237 246
238struct genapic apic_x2apic_uv_x = { 247struct apic apic_x2apic_uv_x = {
239 .name = "UV large system", 248
240 .acpi_madt_oem_check = uv_acpi_madt_oem_check, 249 .name = "UV large system",
241 .int_delivery_mode = dest_Fixed, 250 .probe = NULL,
242 .int_dest_mode = (APIC_DEST_PHYSICAL != 0), 251 .acpi_madt_oem_check = uv_acpi_madt_oem_check,
243 .target_cpus = uv_target_cpus, 252 .apic_id_registered = uv_apic_id_registered,
244 .vector_allocation_domain = uv_vector_allocation_domain, 253
245 .apic_id_registered = uv_apic_id_registered, 254 .irq_delivery_mode = dest_Fixed,
246 .init_apic_ldr = uv_init_apic_ldr, 255 .irq_dest_mode = 1, /* logical */
247 .send_IPI_all = uv_send_IPI_all, 256
248 .send_IPI_allbutself = uv_send_IPI_allbutself, 257 .target_cpus = uv_target_cpus,
249 .send_IPI_mask = uv_send_IPI_mask, 258 .disable_esr = 0,
250 .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself, 259 .dest_logical = APIC_DEST_LOGICAL,
251 .send_IPI_self = uv_send_IPI_self, 260 .check_apicid_used = NULL,
252 .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, 261 .check_apicid_present = NULL,
253 .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and, 262
254 .phys_pkg_id = phys_pkg_id, 263 .vector_allocation_domain = uv_vector_allocation_domain,
255 .get_apic_id = get_apic_id, 264 .init_apic_ldr = uv_init_apic_ldr,
256 .set_apic_id = set_apic_id, 265
257 .apic_id_mask = (0xFFFFFFFFu), 266 .ioapic_phys_id_map = NULL,
267 .setup_apic_routing = NULL,
268 .multi_timer_check = NULL,
269 .apicid_to_node = NULL,
270 .cpu_to_logical_apicid = NULL,
271 .cpu_present_to_apicid = default_cpu_present_to_apicid,
272 .apicid_to_cpu_present = NULL,
273 .setup_portio_remap = NULL,
274 .check_phys_apicid_present = default_check_phys_apicid_present,
275 .enable_apic_mode = NULL,
276 .phys_pkg_id = uv_phys_pkg_id,
277 .mps_oem_check = NULL,
278
279 .get_apic_id = x2apic_get_apic_id,
280 .set_apic_id = set_apic_id,
281 .apic_id_mask = 0xFFFFFFFFu,
282
283 .cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
284 .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and,
285
286 .send_IPI_mask = uv_send_IPI_mask,
287 .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself,
288 .send_IPI_allbutself = uv_send_IPI_allbutself,
289 .send_IPI_all = uv_send_IPI_all,
290 .send_IPI_self = uv_send_IPI_self,
291
292 .wakeup_secondary_cpu = uv_wakeup_secondary,
293 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
294 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
295 .wait_for_init_deassert = NULL,
296 .smp_callin_clear_local_apic = NULL,
297 .inquire_remote_apic = NULL,
298
299 .read = native_apic_msr_read,
300 .write = native_apic_msr_write,
301 .icr_read = native_x2apic_icr_read,
302 .icr_write = native_x2apic_icr_write,
303 .wait_icr_idle = native_x2apic_wait_icr_idle,
304 .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
258}; 305};
259 306
260static __cpuinit void set_x2apic_extra_bits(int pnode) 307static __cpuinit void set_x2apic_extra_bits(int pnode)
@@ -322,7 +369,7 @@ static __init void map_high(char *id, unsigned long base, int shift,
322 paddr = base << shift; 369 paddr = base << shift;
323 bytes = (1UL << shift) * (max_pnode + 1); 370 bytes = (1UL << shift) * (max_pnode + 1);
324 printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, 371 printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr,
325 paddr + bytes); 372 paddr + bytes);
326 if (map_type == map_uc) 373 if (map_type == map_uc)
327 init_extra_mapping_uc(paddr, bytes); 374 init_extra_mapping_uc(paddr, bytes);
328 else 375 else
@@ -485,7 +532,7 @@ late_initcall(uv_init_heartbeat);
485 532
486/* 533/*
487 * Called on each cpu to initialize the per_cpu UV data area. 534 * Called on each cpu to initialize the per_cpu UV data area.
488 * ZZZ hotplug not supported yet 535 * FIXME: hotplug not supported yet
489 */ 536 */
490void __cpuinit uv_cpu_init(void) 537void __cpuinit uv_cpu_init(void)
491{ 538{
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 266ec6c18b6c..10033fe718e0 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -301,7 +301,7 @@ extern int (*console_blank_hook)(int);
301 */ 301 */
302#define APM_ZERO_SEGS 302#define APM_ZERO_SEGS
303 303
304#include "apm.h" 304#include <asm/apm.h>
305 305
306/* 306/*
307 * Define to re-initialize the interrupt 0 timer to 100 Hz after a suspend. 307 * Define to re-initialize the interrupt 0 timer to 100 Hz after a suspend.
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index ee4df08feee6..fbf2f33e3080 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -75,6 +75,7 @@ void foo(void)
75 OFFSET(PT_DS, pt_regs, ds); 75 OFFSET(PT_DS, pt_regs, ds);
76 OFFSET(PT_ES, pt_regs, es); 76 OFFSET(PT_ES, pt_regs, es);
77 OFFSET(PT_FS, pt_regs, fs); 77 OFFSET(PT_FS, pt_regs, fs);
78 OFFSET(PT_GS, pt_regs, gs);
78 OFFSET(PT_ORIG_EAX, pt_regs, orig_ax); 79 OFFSET(PT_ORIG_EAX, pt_regs, orig_ax);
79 OFFSET(PT_EIP, pt_regs, ip); 80 OFFSET(PT_EIP, pt_regs, ip);
80 OFFSET(PT_CS, pt_regs, cs); 81 OFFSET(PT_CS, pt_regs, cs);
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 1d41d3f1edbc..8793ab33e2c1 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -11,7 +11,6 @@
11#include <linux/hardirq.h> 11#include <linux/hardirq.h>
12#include <linux/suspend.h> 12#include <linux/suspend.h>
13#include <linux/kbuild.h> 13#include <linux/kbuild.h>
14#include <asm/pda.h>
15#include <asm/processor.h> 14#include <asm/processor.h>
16#include <asm/segment.h> 15#include <asm/segment.h>
17#include <asm/thread_info.h> 16#include <asm/thread_info.h>
@@ -48,16 +47,6 @@ int main(void)
48#endif 47#endif
49 BLANK(); 48 BLANK();
50#undef ENTRY 49#undef ENTRY
51#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry))
52 ENTRY(kernelstack);
53 ENTRY(oldrsp);
54 ENTRY(pcurrent);
55 ENTRY(irqcount);
56 ENTRY(cpunumber);
57 ENTRY(irqstackptr);
58 ENTRY(data_offset);
59 BLANK();
60#undef ENTRY
61#ifdef CONFIG_PARAVIRT 50#ifdef CONFIG_PARAVIRT
62 BLANK(); 51 BLANK();
63 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); 52 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 82db7f45e2de..d4356f8b7522 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -14,6 +14,8 @@ obj-y += vmware.o hypervisor.o
14obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o 14obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
15obj-$(CONFIG_X86_64) += bugs_64.o 15obj-$(CONFIG_X86_64) += bugs_64.o
16 16
17obj-$(CONFIG_X86_CPU_DEBUG) += cpu_debug.o
18
17obj-$(CONFIG_CPU_SUP_INTEL) += intel.o 19obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
18obj-$(CONFIG_CPU_SUP_AMD) += amd.o 20obj-$(CONFIG_CPU_SUP_AMD) += amd.o
19obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o 21obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 2cf23634b6d9..6882a735d9c0 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -7,7 +7,7 @@
7#include <asm/pat.h> 7#include <asm/pat.h>
8#include <asm/processor.h> 8#include <asm/processor.h>
9 9
10#include <mach_apic.h> 10#include <asm/apic.h>
11 11
12struct cpuid_bit { 12struct cpuid_bit {
13 u16 feature; 13 u16 feature;
@@ -69,7 +69,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
69 */ 69 */
70void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) 70void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
71{ 71{
72#ifdef CONFIG_X86_SMP 72#ifdef CONFIG_SMP
73 unsigned int eax, ebx, ecx, edx, sub_index; 73 unsigned int eax, ebx, ecx, edx, sub_index;
74 unsigned int ht_mask_width, core_plus_mask_width; 74 unsigned int ht_mask_width, core_plus_mask_width;
75 unsigned int core_select_mask, core_level_siblings; 75 unsigned int core_select_mask, core_level_siblings;
@@ -116,22 +116,14 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
116 116
117 core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width; 117 core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width;
118 118
119#ifdef CONFIG_X86_32 119 c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, ht_mask_width)
120 c->cpu_core_id = phys_pkg_id(c->initial_apicid, ht_mask_width)
121 & core_select_mask; 120 & core_select_mask;
122 c->phys_proc_id = phys_pkg_id(c->initial_apicid, core_plus_mask_width); 121 c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, core_plus_mask_width);
123 /* 122 /*
124 * Reinit the apicid, now that we have extended initial_apicid. 123 * Reinit the apicid, now that we have extended initial_apicid.
125 */ 124 */
126 c->apicid = phys_pkg_id(c->initial_apicid, 0); 125 c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
127#else 126
128 c->cpu_core_id = phys_pkg_id(ht_mask_width) & core_select_mask;
129 c->phys_proc_id = phys_pkg_id(core_plus_mask_width);
130 /*
131 * Reinit the apicid, now that we have extended initial_apicid.
132 */
133 c->apicid = phys_pkg_id(0);
134#endif
135 c->x86_max_cores = (core_level_siblings / smp_num_siblings); 127 c->x86_max_cores = (core_level_siblings / smp_num_siblings);
136 128
137 129
@@ -143,37 +135,3 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
143 return; 135 return;
144#endif 136#endif
145} 137}
146
147#ifdef CONFIG_X86_PAT
148void __cpuinit validate_pat_support(struct cpuinfo_x86 *c)
149{
150 if (!cpu_has_pat)
151 pat_disable("PAT not supported by CPU.");
152
153 switch (c->x86_vendor) {
154 case X86_VENDOR_INTEL:
155 /*
156 * There is a known erratum on Pentium III and Core Solo
157 * and Core Duo CPUs.
158 * " Page with PAT set to WC while associated MTRR is UC
159 * may consolidate to UC "
160 * Because of this erratum, it is better to stick with
161 * setting WC in MTRR rather than using PAT on these CPUs.
162 *
163 * Enable PAT WC only on P4, Core 2 or later CPUs.
164 */
165 if (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 15))
166 return;
167
168 pat_disable("PAT WC disabled due to known CPU erratum.");
169 return;
170
171 case X86_VENDOR_AMD:
172 case X86_VENDOR_CENTAUR:
173 case X86_VENDOR_TRANSMETA:
174 return;
175 }
176
177 pat_disable("PAT disabled. Not yet verified on this CPU type.");
178}
179#endif
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 7c878f6aa919..f47df59016c5 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -5,6 +5,7 @@
5#include <asm/io.h> 5#include <asm/io.h>
6#include <asm/processor.h> 6#include <asm/processor.h>
7#include <asm/apic.h> 7#include <asm/apic.h>
8#include <asm/cpu.h>
8 9
9#ifdef CONFIG_X86_64 10#ifdef CONFIG_X86_64
10# include <asm/numa_64.h> 11# include <asm/numa_64.h>
@@ -12,8 +13,6 @@
12# include <asm/cacheflush.h> 13# include <asm/cacheflush.h>
13#endif 14#endif
14 15
15#include <mach_apic.h>
16
17#include "cpu.h" 16#include "cpu.h"
18 17
19#ifdef CONFIG_X86_32 18#ifdef CONFIG_X86_32
@@ -143,6 +142,55 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
143 } 142 }
144} 143}
145 144
145static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
146{
147#ifdef CONFIG_SMP
148 /* calling is from identify_secondary_cpu() ? */
149 if (c->cpu_index == boot_cpu_id)
150 return;
151
152 /*
153 * Certain Athlons might work (for various values of 'work') in SMP
154 * but they are not certified as MP capable.
155 */
156 /* Athlon 660/661 is valid. */
157 if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
158 (c->x86_mask == 1)))
159 goto valid_k7;
160
161 /* Duron 670 is valid */
162 if ((c->x86_model == 7) && (c->x86_mask == 0))
163 goto valid_k7;
164
165 /*
166 * Athlon 662, Duron 671, and Athlon >model 7 have capability
167 * bit. It's worth noting that the A5 stepping (662) of some
168 * Athlon XP's have the MP bit set.
169 * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
170 * more.
171 */
172 if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
173 ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
174 (c->x86_model > 7))
175 if (cpu_has_mp)
176 goto valid_k7;
177
178 /* If we get here, not a certified SMP capable AMD system. */
179
180 /*
181 * Don't taint if we are running SMP kernel on a single non-MP
182 * approved Athlon
183 */
184 WARN_ONCE(1, "WARNING: This combination of AMD"
185 "processors is not suitable for SMP.\n");
186 if (!test_taint(TAINT_UNSAFE_SMP))
187 add_taint(TAINT_UNSAFE_SMP);
188
189valid_k7:
190 ;
191#endif
192}
193
146static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) 194static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
147{ 195{
148 u32 l, h; 196 u32 l, h;
@@ -177,6 +225,8 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
177 } 225 }
178 226
179 set_cpu_cap(c, X86_FEATURE_K7); 227 set_cpu_cap(c, X86_FEATURE_K7);
228
229 amd_k7_smp_check(c);
180} 230}
181#endif 231#endif
182 232
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 83492b1f93b1..f8869978bbb7 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -21,14 +21,14 @@
21#include <asm/asm.h> 21#include <asm/asm.h>
22#include <asm/numa.h> 22#include <asm/numa.h>
23#include <asm/smp.h> 23#include <asm/smp.h>
24#ifdef CONFIG_X86_LOCAL_APIC 24#include <asm/cpu.h>
25#include <asm/mpspec.h> 25#include <asm/cpumask.h>
26#include <asm/apic.h> 26#include <asm/apic.h>
27#include <mach_apic.h> 27
28#include <asm/genapic.h> 28#ifdef CONFIG_X86_LOCAL_APIC
29#include <asm/uv/uv.h>
29#endif 30#endif
30 31
31#include <asm/pda.h>
32#include <asm/pgtable.h> 32#include <asm/pgtable.h>
33#include <asm/processor.h> 33#include <asm/processor.h>
34#include <asm/desc.h> 34#include <asm/desc.h>
@@ -37,6 +37,7 @@
37#include <asm/sections.h> 37#include <asm/sections.h>
38#include <asm/setup.h> 38#include <asm/setup.h>
39#include <asm/hypervisor.h> 39#include <asm/hypervisor.h>
40#include <asm/stackprotector.h>
40 41
41#include "cpu.h" 42#include "cpu.h"
42 43
@@ -50,6 +51,15 @@ cpumask_var_t cpu_initialized_mask;
50/* representing cpus for which sibling maps can be computed */ 51/* representing cpus for which sibling maps can be computed */
51cpumask_var_t cpu_sibling_setup_mask; 52cpumask_var_t cpu_sibling_setup_mask;
52 53
54/* correctly size the local cpu masks */
55void __init setup_cpu_local_masks(void)
56{
57 alloc_bootmem_cpumask_var(&cpu_initialized_mask);
58 alloc_bootmem_cpumask_var(&cpu_callin_mask);
59 alloc_bootmem_cpumask_var(&cpu_callout_mask);
60 alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
61}
62
53#else /* CONFIG_X86_32 */ 63#else /* CONFIG_X86_32 */
54 64
55cpumask_t cpu_callin_map; 65cpumask_t cpu_callin_map;
@@ -62,23 +72,23 @@ cpumask_t cpu_sibling_setup_map;
62 72
63static struct cpu_dev *this_cpu __cpuinitdata; 73static struct cpu_dev *this_cpu __cpuinitdata;
64 74
75DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
65#ifdef CONFIG_X86_64 76#ifdef CONFIG_X86_64
66/* We need valid kernel segments for data and code in long mode too 77 /*
67 * IRET will check the segment types kkeil 2000/10/28 78 * We need valid kernel segments for data and code in long mode too
68 * Also sysret mandates a special GDT layout 79 * IRET will check the segment types kkeil 2000/10/28
69 */ 80 * Also sysret mandates a special GDT layout
70/* The TLS descriptors are currently at a different place compared to i386. 81 *
71 Hopefully nobody expects them at a fixed place (Wine?) */ 82 * The TLS descriptors are currently at a different place compared to i386.
72DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { 83 * Hopefully nobody expects them at a fixed place (Wine?)
84 */
73 [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, 85 [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
74 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, 86 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
75 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, 87 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
76 [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, 88 [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
77 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, 89 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
78 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, 90 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
79} };
80#else 91#else
81DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
82 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, 92 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
83 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, 93 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
84 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, 94 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
@@ -110,9 +120,10 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
110 [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, 120 [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
111 121
112 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, 122 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
113 [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } }, 123 [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
114} }; 124 GDT_STACK_CANARY_INIT
115#endif 125#endif
126} };
116EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); 127EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
117 128
118#ifdef CONFIG_X86_32 129#ifdef CONFIG_X86_32
@@ -213,6 +224,49 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
213#endif 224#endif
214 225
215/* 226/*
227 * Some CPU features depend on higher CPUID levels, which may not always
228 * be available due to CPUID level capping or broken virtualization
229 * software. Add those features to this table to auto-disable them.
230 */
231struct cpuid_dependent_feature {
232 u32 feature;
233 u32 level;
234};
235static const struct cpuid_dependent_feature __cpuinitconst
236cpuid_dependent_features[] = {
237 { X86_FEATURE_MWAIT, 0x00000005 },
238 { X86_FEATURE_DCA, 0x00000009 },
239 { X86_FEATURE_XSAVE, 0x0000000d },
240 { 0, 0 }
241};
242
243static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
244{
245 const struct cpuid_dependent_feature *df;
246 for (df = cpuid_dependent_features; df->feature; df++) {
247 /*
248 * Note: cpuid_level is set to -1 if unavailable, but
249 * extended_extended_level is set to 0 if unavailable
250 * and the legitimate extended levels are all negative
251 * when signed; hence the weird messing around with
252 * signs here...
253 */
254 if (cpu_has(c, df->feature) &&
255 ((s32)df->level < 0 ?
256 (u32)df->level > (u32)c->extended_cpuid_level :
257 (s32)df->level > (s32)c->cpuid_level)) {
258 clear_cpu_cap(c, df->feature);
259 if (warn)
260 printk(KERN_WARNING
261 "CPU: CPU feature %s disabled "
262 "due to lack of CPUID level 0x%x\n",
263 x86_cap_flags[df->feature],
264 df->level);
265 }
266 }
267}
268
269/*
216 * Naming convention should be: <Name> [(<Codename>)] 270 * Naming convention should be: <Name> [(<Codename>)]
217 * This table only is used unless init_<vendor>() below doesn't set it; 271 * This table only is used unless init_<vendor>() below doesn't set it;
218 * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used 272 * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
@@ -242,18 +296,29 @@ static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
242 296
243__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; 297__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
244 298
299void load_percpu_segment(int cpu)
300{
301#ifdef CONFIG_X86_32
302 loadsegment(fs, __KERNEL_PERCPU);
303#else
304 loadsegment(gs, 0);
305 wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu));
306#endif
307 load_stack_canary_segment();
308}
309
245/* Current gdt points %fs at the "master" per-cpu area: after this, 310/* Current gdt points %fs at the "master" per-cpu area: after this,
246 * it's on the real one. */ 311 * it's on the real one. */
247void switch_to_new_gdt(void) 312void switch_to_new_gdt(int cpu)
248{ 313{
249 struct desc_ptr gdt_descr; 314 struct desc_ptr gdt_descr;
250 315
251 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); 316 gdt_descr.address = (long)get_cpu_gdt_table(cpu);
252 gdt_descr.size = GDT_SIZE - 1; 317 gdt_descr.size = GDT_SIZE - 1;
253 load_gdt(&gdt_descr); 318 load_gdt(&gdt_descr);
254#ifdef CONFIG_X86_32 319 /* Reload the per-cpu base */
255 asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory"); 320
256#endif 321 load_percpu_segment(cpu);
257} 322}
258 323
259static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; 324static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
@@ -383,11 +448,7 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
383 } 448 }
384 449
385 index_msb = get_count_order(smp_num_siblings); 450 index_msb = get_count_order(smp_num_siblings);
386#ifdef CONFIG_X86_64 451 c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
387 c->phys_proc_id = phys_pkg_id(index_msb);
388#else
389 c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb);
390#endif
391 452
392 smp_num_siblings = smp_num_siblings / c->x86_max_cores; 453 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
393 454
@@ -395,13 +456,8 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
395 456
396 core_bits = get_count_order(c->x86_max_cores); 457 core_bits = get_count_order(c->x86_max_cores);
397 458
398#ifdef CONFIG_X86_64 459 c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
399 c->cpu_core_id = phys_pkg_id(index_msb) &
400 ((1 << core_bits) - 1); 460 ((1 << core_bits) - 1);
401#else
402 c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) &
403 ((1 << core_bits) - 1);
404#endif
405 } 461 }
406 462
407out: 463out:
@@ -570,11 +626,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
570 if (this_cpu->c_early_init) 626 if (this_cpu->c_early_init)
571 this_cpu->c_early_init(c); 627 this_cpu->c_early_init(c);
572 628
573 validate_pat_support(c);
574
575#ifdef CONFIG_SMP 629#ifdef CONFIG_SMP
576 c->cpu_index = boot_cpu_id; 630 c->cpu_index = boot_cpu_id;
577#endif 631#endif
632 filter_cpuid_features(c, false);
578} 633}
579 634
580void __init early_cpu_init(void) 635void __init early_cpu_init(void)
@@ -637,7 +692,7 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
637 c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; 692 c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
638#ifdef CONFIG_X86_32 693#ifdef CONFIG_X86_32
639# ifdef CONFIG_X86_HT 694# ifdef CONFIG_X86_HT
640 c->apicid = phys_pkg_id(c->initial_apicid, 0); 695 c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
641# else 696# else
642 c->apicid = c->initial_apicid; 697 c->apicid = c->initial_apicid;
643# endif 698# endif
@@ -684,7 +739,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
684 this_cpu->c_identify(c); 739 this_cpu->c_identify(c);
685 740
686#ifdef CONFIG_X86_64 741#ifdef CONFIG_X86_64
687 c->apicid = phys_pkg_id(0); 742 c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
688#endif 743#endif
689 744
690 /* 745 /*
@@ -708,6 +763,9 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
708 * we do "generic changes." 763 * we do "generic changes."
709 */ 764 */
710 765
766 /* Filter out anything that depends on CPUID levels we don't have */
767 filter_cpuid_features(c, true);
768
711 /* If the model name is still unset, do table lookup. */ 769 /* If the model name is still unset, do table lookup. */
712 if (!c->x86_model_id[0]) { 770 if (!c->x86_model_id[0]) {
713 char *p; 771 char *p;
@@ -877,54 +935,22 @@ static __init int setup_disablecpuid(char *arg)
877__setup("clearcpuid=", setup_disablecpuid); 935__setup("clearcpuid=", setup_disablecpuid);
878 936
879#ifdef CONFIG_X86_64 937#ifdef CONFIG_X86_64
880struct x8664_pda **_cpu_pda __read_mostly;
881EXPORT_SYMBOL(_cpu_pda);
882
883struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; 938struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
884 939
885static char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; 940DEFINE_PER_CPU_FIRST(union irq_stack_union,
941 irq_stack_union) __aligned(PAGE_SIZE);
942DEFINE_PER_CPU(char *, irq_stack_ptr) =
943 init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
886 944
887void __cpuinit pda_init(int cpu) 945DEFINE_PER_CPU(unsigned long, kernel_stack) =
888{ 946 (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
889 struct x8664_pda *pda = cpu_pda(cpu); 947EXPORT_PER_CPU_SYMBOL(kernel_stack);
890 948
891 /* Setup up data that may be needed in __get_free_pages early */ 949DEFINE_PER_CPU(unsigned int, irq_count) = -1;
892 loadsegment(fs, 0);
893 loadsegment(gs, 0);
894 /* Memory clobbers used to order PDA accessed */
895 mb();
896 wrmsrl(MSR_GS_BASE, pda);
897 mb();
898
899 pda->cpunumber = cpu;
900 pda->irqcount = -1;
901 pda->kernelstack = (unsigned long)stack_thread_info() -
902 PDA_STACKOFFSET + THREAD_SIZE;
903 pda->active_mm = &init_mm;
904 pda->mmu_state = 0;
905
906 if (cpu == 0) {
907 /* others are initialized in smpboot.c */
908 pda->pcurrent = &init_task;
909 pda->irqstackptr = boot_cpu_stack;
910 pda->irqstackptr += IRQSTACKSIZE - 64;
911 } else {
912 if (!pda->irqstackptr) {
913 pda->irqstackptr = (char *)
914 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
915 if (!pda->irqstackptr)
916 panic("cannot allocate irqstack for cpu %d",
917 cpu);
918 pda->irqstackptr += IRQSTACKSIZE - 64;
919 }
920 950
921 if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) 951static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
922 pda->nodenumber = cpu_to_node(cpu); 952 [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ])
923 } 953 __aligned(PAGE_SIZE);
924}
925
926static char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
927 DEBUG_STKSZ] __page_aligned_bss;
928 954
929extern asmlinkage void ignore_sysret(void); 955extern asmlinkage void ignore_sysret(void);
930 956
@@ -957,16 +983,21 @@ unsigned long kernel_eflags;
957 */ 983 */
958DEFINE_PER_CPU(struct orig_ist, orig_ist); 984DEFINE_PER_CPU(struct orig_ist, orig_ist);
959 985
960#else 986#else /* x86_64 */
961 987
962/* Make sure %fs is initialized properly in idle threads */ 988#ifdef CONFIG_CC_STACKPROTECTOR
989DEFINE_PER_CPU(unsigned long, stack_canary);
990#endif
991
992/* Make sure %fs and %gs are initialized properly in idle threads */
963struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) 993struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
964{ 994{
965 memset(regs, 0, sizeof(struct pt_regs)); 995 memset(regs, 0, sizeof(struct pt_regs));
966 regs->fs = __KERNEL_PERCPU; 996 regs->fs = __KERNEL_PERCPU;
997 regs->gs = __KERNEL_STACK_CANARY;
967 return regs; 998 return regs;
968} 999}
969#endif 1000#endif /* x86_64 */
970 1001
971/* 1002/*
972 * cpu_init() initializes state that is per-CPU. Some data is already 1003 * cpu_init() initializes state that is per-CPU. Some data is already
@@ -982,15 +1013,14 @@ void __cpuinit cpu_init(void)
982 struct tss_struct *t = &per_cpu(init_tss, cpu); 1013 struct tss_struct *t = &per_cpu(init_tss, cpu);
983 struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu); 1014 struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
984 unsigned long v; 1015 unsigned long v;
985 char *estacks = NULL;
986 struct task_struct *me; 1016 struct task_struct *me;
987 int i; 1017 int i;
988 1018
989 /* CPU 0 is initialised in head64.c */ 1019#ifdef CONFIG_NUMA
990 if (cpu != 0) 1020 if (cpu != 0 && percpu_read(node_number) == 0 &&
991 pda_init(cpu); 1021 cpu_to_node(cpu) != NUMA_NO_NODE)
992 else 1022 percpu_write(node_number, cpu_to_node(cpu));
993 estacks = boot_exception_stacks; 1023#endif
994 1024
995 me = current; 1025 me = current;
996 1026
@@ -1006,7 +1036,9 @@ void __cpuinit cpu_init(void)
1006 * and set up the GDT descriptor: 1036 * and set up the GDT descriptor:
1007 */ 1037 */
1008 1038
1009 switch_to_new_gdt(); 1039 switch_to_new_gdt(cpu);
1040 loadsegment(fs, 0);
1041
1010 load_idt((const struct desc_ptr *)&idt_descr); 1042 load_idt((const struct desc_ptr *)&idt_descr);
1011 1043
1012 memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); 1044 memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
@@ -1017,25 +1049,20 @@ void __cpuinit cpu_init(void)
1017 barrier(); 1049 barrier();
1018 1050
1019 check_efer(); 1051 check_efer();
1020 if (cpu != 0 && x2apic) 1052 if (cpu != 0)
1021 enable_x2apic(); 1053 enable_x2apic();
1022 1054
1023 /* 1055 /*
1024 * set up and load the per-CPU TSS 1056 * set up and load the per-CPU TSS
1025 */ 1057 */
1026 if (!orig_ist->ist[0]) { 1058 if (!orig_ist->ist[0]) {
1027 static const unsigned int order[N_EXCEPTION_STACKS] = { 1059 static const unsigned int sizes[N_EXCEPTION_STACKS] = {
1028 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, 1060 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
1029 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER 1061 [DEBUG_STACK - 1] = DEBUG_STKSZ
1030 }; 1062 };
1063 char *estacks = per_cpu(exception_stacks, cpu);
1031 for (v = 0; v < N_EXCEPTION_STACKS; v++) { 1064 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
1032 if (cpu) { 1065 estacks += sizes[v];
1033 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
1034 if (!estacks)
1035 panic("Cannot allocate exception "
1036 "stack %ld %d\n", v, cpu);
1037 }
1038 estacks += PAGE_SIZE << order[v];
1039 orig_ist->ist[v] = t->x86_tss.ist[v] = 1066 orig_ist->ist[v] = t->x86_tss.ist[v] =
1040 (unsigned long)estacks; 1067 (unsigned long)estacks;
1041 } 1068 }
@@ -1051,8 +1078,7 @@ void __cpuinit cpu_init(void)
1051 1078
1052 atomic_inc(&init_mm.mm_count); 1079 atomic_inc(&init_mm.mm_count);
1053 me->active_mm = &init_mm; 1080 me->active_mm = &init_mm;
1054 if (me->mm) 1081 BUG_ON(me->mm);
1055 BUG();
1056 enter_lazy_tlb(&init_mm, me); 1082 enter_lazy_tlb(&init_mm, me);
1057 1083
1058 load_sp0(t, &current->thread); 1084 load_sp0(t, &current->thread);
@@ -1069,22 +1095,19 @@ void __cpuinit cpu_init(void)
1069 */ 1095 */
1070 if (kgdb_connected && arch_kgdb_ops.correct_hw_break) 1096 if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
1071 arch_kgdb_ops.correct_hw_break(); 1097 arch_kgdb_ops.correct_hw_break();
1072 else { 1098 else
1073#endif 1099#endif
1074 /* 1100 {
1075 * Clear all 6 debug registers: 1101 /*
1076 */ 1102 * Clear all 6 debug registers:
1077 1103 */
1078 set_debugreg(0UL, 0); 1104 set_debugreg(0UL, 0);
1079 set_debugreg(0UL, 1); 1105 set_debugreg(0UL, 1);
1080 set_debugreg(0UL, 2); 1106 set_debugreg(0UL, 2);
1081 set_debugreg(0UL, 3); 1107 set_debugreg(0UL, 3);
1082 set_debugreg(0UL, 6); 1108 set_debugreg(0UL, 6);
1083 set_debugreg(0UL, 7); 1109 set_debugreg(0UL, 7);
1084#ifdef CONFIG_KGDB
1085 /* If the kgdb is connected no debug regs should be altered. */
1086 } 1110 }
1087#endif
1088 1111
1089 fpu_init(); 1112 fpu_init();
1090 1113
@@ -1114,15 +1137,14 @@ void __cpuinit cpu_init(void)
1114 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); 1137 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
1115 1138
1116 load_idt(&idt_descr); 1139 load_idt(&idt_descr);
1117 switch_to_new_gdt(); 1140 switch_to_new_gdt(cpu);
1118 1141
1119 /* 1142 /*
1120 * Set up and load the per-CPU TSS and LDT 1143 * Set up and load the per-CPU TSS and LDT
1121 */ 1144 */
1122 atomic_inc(&init_mm.mm_count); 1145 atomic_inc(&init_mm.mm_count);
1123 curr->active_mm = &init_mm; 1146 curr->active_mm = &init_mm;
1124 if (curr->mm) 1147 BUG_ON(curr->mm);
1125 BUG();
1126 enter_lazy_tlb(&init_mm, curr); 1148 enter_lazy_tlb(&init_mm, curr);
1127 1149
1128 load_sp0(t, thread); 1150 load_sp0(t, thread);
@@ -1135,9 +1157,6 @@ void __cpuinit cpu_init(void)
1135 __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); 1157 __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
1136#endif 1158#endif
1137 1159
1138 /* Clear %gs. */
1139 asm volatile ("mov %0, %%gs" : : "r" (0));
1140
1141 /* Clear all 6 debug registers: */ 1160 /* Clear all 6 debug registers: */
1142 set_debugreg(0, 0); 1161 set_debugreg(0, 0);
1143 set_debugreg(0, 1); 1162 set_debugreg(0, 1);
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
new file mode 100755
index 000000000000..9abbcbd933cc
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -0,0 +1,785 @@
1/*
2 * CPU x86 architecture debug code
3 *
4 * Copyright(C) 2009 Jaswinder Singh Rajput
5 *
6 * For licencing details see kernel-base/COPYING
7 */
8
9#include <linux/interrupt.h>
10#include <linux/compiler.h>
11#include <linux/seq_file.h>
12#include <linux/debugfs.h>
13#include <linux/kprobes.h>
14#include <linux/kernel.h>
15#include <linux/module.h>
16#include <linux/percpu.h>
17#include <linux/signal.h>
18#include <linux/errno.h>
19#include <linux/sched.h>
20#include <linux/types.h>
21#include <linux/init.h>
22#include <linux/slab.h>
23#include <linux/smp.h>
24
25#include <asm/cpu_debug.h>
26#include <asm/paravirt.h>
27#include <asm/system.h>
28#include <asm/traps.h>
29#include <asm/apic.h>
30#include <asm/desc.h>
31
32static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]);
33static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]);
34static DEFINE_PER_CPU(unsigned, cpu_modelflag);
35static DEFINE_PER_CPU(int, cpu_priv_count);
36static DEFINE_PER_CPU(unsigned, cpu_model);
37
38static DEFINE_MUTEX(cpu_debug_lock);
39
40static struct dentry *cpu_debugfs_dir;
41
42static struct cpu_debug_base cpu_base[] = {
43 { "mc", CPU_MC }, /* Machine Check */
44 { "monitor", CPU_MONITOR }, /* Monitor */
45 { "time", CPU_TIME }, /* Time */
46 { "pmc", CPU_PMC }, /* Performance Monitor */
47 { "platform", CPU_PLATFORM }, /* Platform */
48 { "apic", CPU_APIC }, /* APIC */
49 { "poweron", CPU_POWERON }, /* Power-on */
50 { "control", CPU_CONTROL }, /* Control */
51 { "features", CPU_FEATURES }, /* Features control */
52 { "lastbranch", CPU_LBRANCH }, /* Last Branch */
53 { "bios", CPU_BIOS }, /* BIOS */
54 { "freq", CPU_FREQ }, /* Frequency */
55 { "mtrr", CPU_MTRR }, /* MTRR */
56 { "perf", CPU_PERF }, /* Performance */
57 { "cache", CPU_CACHE }, /* Cache */
58 { "sysenter", CPU_SYSENTER }, /* Sysenter */
59 { "therm", CPU_THERM }, /* Thermal */
60 { "misc", CPU_MISC }, /* Miscellaneous */
61 { "debug", CPU_DEBUG }, /* Debug */
62 { "pat", CPU_PAT }, /* PAT */
63 { "vmx", CPU_VMX }, /* VMX */
64 { "call", CPU_CALL }, /* System Call */
65 { "base", CPU_BASE }, /* BASE Address */
66 { "smm", CPU_SMM }, /* System mgmt mode */
67 { "svm", CPU_SVM }, /*Secure Virtial Machine*/
68 { "osvm", CPU_OSVM }, /* OS-Visible Workaround*/
69 { "tss", CPU_TSS }, /* Task Stack Segment */
70 { "cr", CPU_CR }, /* Control Registers */
71 { "dt", CPU_DT }, /* Descriptor Table */
72 { "registers", CPU_REG_ALL }, /* Select all Registers */
73};
74
75static struct cpu_file_base cpu_file[] = {
76 { "index", CPU_REG_ALL }, /* index */
77 { "value", CPU_REG_ALL }, /* value */
78};
79
80/* Intel Registers Range */
81static struct cpu_debug_range cpu_intel_range[] = {
82 { 0x00000000, 0x00000001, CPU_MC, CPU_INTEL_ALL },
83 { 0x00000006, 0x00000007, CPU_MONITOR, CPU_CX_AT_XE },
84 { 0x00000010, 0x00000010, CPU_TIME, CPU_INTEL_ALL },
85 { 0x00000011, 0x00000013, CPU_PMC, CPU_INTEL_PENTIUM },
86 { 0x00000017, 0x00000017, CPU_PLATFORM, CPU_PX_CX_AT_XE },
87 { 0x0000001B, 0x0000001B, CPU_APIC, CPU_P6_CX_AT_XE },
88
89 { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_PX_CX_AT_XE },
90 { 0x0000002B, 0x0000002B, CPU_POWERON, CPU_INTEL_XEON },
91 { 0x0000002C, 0x0000002C, CPU_FREQ, CPU_INTEL_XEON },
92 { 0x0000003A, 0x0000003A, CPU_CONTROL, CPU_CX_AT_XE },
93
94 { 0x00000040, 0x00000043, CPU_LBRANCH, CPU_PM_CX_AT_XE },
95 { 0x00000044, 0x00000047, CPU_LBRANCH, CPU_PM_CO_AT },
96 { 0x00000060, 0x00000063, CPU_LBRANCH, CPU_C2_AT },
97 { 0x00000064, 0x00000067, CPU_LBRANCH, CPU_INTEL_ATOM },
98
99 { 0x00000079, 0x00000079, CPU_BIOS, CPU_P6_CX_AT_XE },
100 { 0x00000088, 0x0000008A, CPU_CACHE, CPU_INTEL_P6 },
101 { 0x0000008B, 0x0000008B, CPU_BIOS, CPU_P6_CX_AT_XE },
102 { 0x0000009B, 0x0000009B, CPU_MONITOR, CPU_INTEL_XEON },
103
104 { 0x000000C1, 0x000000C2, CPU_PMC, CPU_P6_CX_AT },
105 { 0x000000CD, 0x000000CD, CPU_FREQ, CPU_CX_AT },
106 { 0x000000E7, 0x000000E8, CPU_PERF, CPU_CX_AT },
107 { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_P6_CX_XE },
108
109 { 0x00000116, 0x00000116, CPU_CACHE, CPU_INTEL_P6 },
110 { 0x00000118, 0x00000118, CPU_CACHE, CPU_INTEL_P6 },
111 { 0x00000119, 0x00000119, CPU_CACHE, CPU_INTEL_PX },
112 { 0x0000011A, 0x0000011B, CPU_CACHE, CPU_INTEL_P6 },
113 { 0x0000011E, 0x0000011E, CPU_CACHE, CPU_PX_CX_AT },
114
115 { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_P6_CX_AT_XE },
116 { 0x00000179, 0x0000017A, CPU_MC, CPU_PX_CX_AT_XE },
117 { 0x0000017B, 0x0000017B, CPU_MC, CPU_P6_XE },
118 { 0x00000186, 0x00000187, CPU_PMC, CPU_P6_CX_AT },
119 { 0x00000198, 0x00000199, CPU_PERF, CPU_PM_CX_AT_XE },
120 { 0x0000019A, 0x0000019A, CPU_TIME, CPU_PM_CX_AT_XE },
121 { 0x0000019B, 0x0000019D, CPU_THERM, CPU_PM_CX_AT_XE },
122 { 0x000001A0, 0x000001A0, CPU_MISC, CPU_PM_CX_AT_XE },
123
124 { 0x000001C9, 0x000001C9, CPU_LBRANCH, CPU_PM_CX_AT },
125 { 0x000001D7, 0x000001D8, CPU_LBRANCH, CPU_INTEL_XEON },
126 { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_CX_AT_XE },
127 { 0x000001DA, 0x000001DA, CPU_LBRANCH, CPU_INTEL_XEON },
128 { 0x000001DB, 0x000001DB, CPU_LBRANCH, CPU_P6_XE },
129 { 0x000001DC, 0x000001DC, CPU_LBRANCH, CPU_INTEL_P6 },
130 { 0x000001DD, 0x000001DE, CPU_LBRANCH, CPU_PX_CX_AT_XE },
131 { 0x000001E0, 0x000001E0, CPU_LBRANCH, CPU_INTEL_P6 },
132
133 { 0x00000200, 0x0000020F, CPU_MTRR, CPU_P6_CX_XE },
134 { 0x00000250, 0x00000250, CPU_MTRR, CPU_P6_CX_XE },
135 { 0x00000258, 0x00000259, CPU_MTRR, CPU_P6_CX_XE },
136 { 0x00000268, 0x0000026F, CPU_MTRR, CPU_P6_CX_XE },
137 { 0x00000277, 0x00000277, CPU_PAT, CPU_C2_AT_XE },
138 { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_P6_CX_XE },
139
140 { 0x00000300, 0x00000308, CPU_PMC, CPU_INTEL_XEON },
141 { 0x00000309, 0x0000030B, CPU_PMC, CPU_C2_AT_XE },
142 { 0x0000030C, 0x00000311, CPU_PMC, CPU_INTEL_XEON },
143 { 0x00000345, 0x00000345, CPU_PMC, CPU_C2_AT },
144 { 0x00000360, 0x00000371, CPU_PMC, CPU_INTEL_XEON },
145 { 0x0000038D, 0x00000390, CPU_PMC, CPU_C2_AT },
146 { 0x000003A0, 0x000003BE, CPU_PMC, CPU_INTEL_XEON },
147 { 0x000003C0, 0x000003CD, CPU_PMC, CPU_INTEL_XEON },
148 { 0x000003E0, 0x000003E1, CPU_PMC, CPU_INTEL_XEON },
149 { 0x000003F0, 0x000003F0, CPU_PMC, CPU_INTEL_XEON },
150 { 0x000003F1, 0x000003F1, CPU_PMC, CPU_C2_AT_XE },
151 { 0x000003F2, 0x000003F2, CPU_PMC, CPU_INTEL_XEON },
152
153 { 0x00000400, 0x00000402, CPU_MC, CPU_PM_CX_AT_XE },
154 { 0x00000403, 0x00000403, CPU_MC, CPU_INTEL_XEON },
155 { 0x00000404, 0x00000406, CPU_MC, CPU_PM_CX_AT_XE },
156 { 0x00000407, 0x00000407, CPU_MC, CPU_INTEL_XEON },
157 { 0x00000408, 0x0000040A, CPU_MC, CPU_PM_CX_AT_XE },
158 { 0x0000040B, 0x0000040B, CPU_MC, CPU_INTEL_XEON },
159 { 0x0000040C, 0x0000040E, CPU_MC, CPU_PM_CX_XE },
160 { 0x0000040F, 0x0000040F, CPU_MC, CPU_INTEL_XEON },
161 { 0x00000410, 0x00000412, CPU_MC, CPU_PM_CX_AT_XE },
162 { 0x00000413, 0x00000417, CPU_MC, CPU_CX_AT_XE },
163 { 0x00000480, 0x0000048B, CPU_VMX, CPU_CX_AT_XE },
164
165 { 0x00000600, 0x00000600, CPU_DEBUG, CPU_PM_CX_AT_XE },
166 { 0x00000680, 0x0000068F, CPU_LBRANCH, CPU_INTEL_XEON },
167 { 0x000006C0, 0x000006CF, CPU_LBRANCH, CPU_INTEL_XEON },
168
169 { 0x000107CC, 0x000107D3, CPU_PMC, CPU_INTEL_XEON_MP },
170
171 { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_INTEL_XEON },
172 { 0xC0000081, 0xC0000082, CPU_CALL, CPU_INTEL_XEON },
173 { 0xC0000084, 0xC0000084, CPU_CALL, CPU_INTEL_XEON },
174 { 0xC0000100, 0xC0000102, CPU_BASE, CPU_INTEL_XEON },
175};
176
177/* AMD Registers Range */
178static struct cpu_debug_range cpu_amd_range[] = {
179 { 0x00000010, 0x00000010, CPU_TIME, CPU_ALL, },
180 { 0x0000001B, 0x0000001B, CPU_APIC, CPU_ALL, },
181 { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_ALL, },
182
183 { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_ALL, },
184 { 0x00000179, 0x0000017A, CPU_MC, CPU_ALL, },
185 { 0x0000017B, 0x0000017B, CPU_MC, CPU_ALL, },
186 { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_ALL, },
187 { 0x000001DB, 0x000001DE, CPU_LBRANCH, CPU_ALL, },
188
189 { 0x00000200, 0x0000020F, CPU_MTRR, CPU_ALL, },
190 { 0x00000250, 0x00000250, CPU_MTRR, CPU_ALL, },
191 { 0x00000258, 0x00000259, CPU_MTRR, CPU_ALL, },
192 { 0x00000268, 0x0000026F, CPU_MTRR, CPU_ALL, },
193 { 0x00000277, 0x00000277, CPU_PAT, CPU_ALL, },
194 { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_ALL, },
195
196 { 0x00000400, 0x00000417, CPU_MC, CPU_ALL, },
197
198 { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_ALL, },
199 { 0xC0000081, 0xC0000084, CPU_CALL, CPU_ALL, },
200 { 0xC0000100, 0xC0000102, CPU_BASE, CPU_ALL, },
201 { 0xC0000103, 0xC0000103, CPU_TIME, CPU_ALL, },
202
203 { 0xC0000408, 0xC000040A, CPU_MC, CPU_ALL, },
204
205 { 0xc0010000, 0xc0010007, CPU_PMC, CPU_ALL, },
206 { 0xc0010010, 0xc0010010, CPU_MTRR, CPU_ALL, },
207 { 0xc0010016, 0xc001001A, CPU_MTRR, CPU_ALL, },
208 { 0xc001001D, 0xc001001D, CPU_MTRR, CPU_ALL, },
209 { 0xc0010030, 0xc0010035, CPU_BIOS, CPU_ALL, },
210 { 0xc0010056, 0xc0010056, CPU_SMM, CPU_ALL, },
211 { 0xc0010061, 0xc0010063, CPU_SMM, CPU_ALL, },
212 { 0xc0010074, 0xc0010074, CPU_MC, CPU_ALL, },
213 { 0xc0010111, 0xc0010113, CPU_SMM, CPU_ALL, },
214 { 0xc0010114, 0xc0010118, CPU_SVM, CPU_ALL, },
215 { 0xc0010119, 0xc001011A, CPU_SMM, CPU_ALL, },
216 { 0xc0010140, 0xc0010141, CPU_OSVM, CPU_ALL, },
217 { 0xc0010156, 0xc0010156, CPU_SMM, CPU_ALL, },
218};
219
220
221static int get_cpu_modelflag(unsigned cpu)
222{
223 int flag;
224
225 switch (per_cpu(cpu_model, cpu)) {
226 /* Intel */
227 case 0x0501:
228 case 0x0502:
229 case 0x0504:
230 flag = CPU_INTEL_PENTIUM;
231 break;
232 case 0x0601:
233 case 0x0603:
234 case 0x0605:
235 case 0x0607:
236 case 0x0608:
237 case 0x060A:
238 case 0x060B:
239 flag = CPU_INTEL_P6;
240 break;
241 case 0x0609:
242 case 0x060D:
243 flag = CPU_INTEL_PENTIUM_M;
244 break;
245 case 0x060E:
246 flag = CPU_INTEL_CORE;
247 break;
248 case 0x060F:
249 case 0x0617:
250 flag = CPU_INTEL_CORE2;
251 break;
252 case 0x061C:
253 flag = CPU_INTEL_ATOM;
254 break;
255 case 0x0F00:
256 case 0x0F01:
257 case 0x0F02:
258 case 0x0F03:
259 case 0x0F04:
260 flag = CPU_INTEL_XEON_P4;
261 break;
262 case 0x0F06:
263 flag = CPU_INTEL_XEON_MP;
264 break;
265 default:
266 flag = CPU_NONE;
267 break;
268 }
269
270 return flag;
271}
272
273static int get_cpu_range_count(unsigned cpu)
274{
275 int index;
276
277 switch (per_cpu(cpu_model, cpu) >> 16) {
278 case X86_VENDOR_INTEL:
279 index = ARRAY_SIZE(cpu_intel_range);
280 break;
281 case X86_VENDOR_AMD:
282 index = ARRAY_SIZE(cpu_amd_range);
283 break;
284 default:
285 index = 0;
286 break;
287 }
288
289 return index;
290}
291
292static int is_typeflag_valid(unsigned cpu, unsigned flag)
293{
294 unsigned vendor, modelflag;
295 int i, index;
296
297 /* Standard Registers should be always valid */
298 if (flag >= CPU_TSS)
299 return 1;
300
301 modelflag = per_cpu(cpu_modelflag, cpu);
302 vendor = per_cpu(cpu_model, cpu) >> 16;
303 index = get_cpu_range_count(cpu);
304
305 for (i = 0; i < index; i++) {
306 switch (vendor) {
307 case X86_VENDOR_INTEL:
308 if ((cpu_intel_range[i].model & modelflag) &&
309 (cpu_intel_range[i].flag & flag))
310 return 1;
311 break;
312 case X86_VENDOR_AMD:
313 if (cpu_amd_range[i].flag & flag)
314 return 1;
315 break;
316 }
317 }
318
319 /* Invalid */
320 return 0;
321}
322
323static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
324 int index, unsigned flag)
325{
326 unsigned modelflag;
327
328 modelflag = per_cpu(cpu_modelflag, cpu);
329 *max = 0;
330 switch (per_cpu(cpu_model, cpu) >> 16) {
331 case X86_VENDOR_INTEL:
332 if ((cpu_intel_range[index].model & modelflag) &&
333 (cpu_intel_range[index].flag & flag)) {
334 *min = cpu_intel_range[index].min;
335 *max = cpu_intel_range[index].max;
336 }
337 break;
338 case X86_VENDOR_AMD:
339 if (cpu_amd_range[index].flag & flag) {
340 *min = cpu_amd_range[index].min;
341 *max = cpu_amd_range[index].max;
342 }
343 break;
344 }
345
346 return *max;
347}
348
349/* This function can also be called with seq = NULL for printk */
350static void print_cpu_data(struct seq_file *seq, unsigned type,
351 u32 low, u32 high)
352{
353 struct cpu_private *priv;
354 u64 val = high;
355
356 if (seq) {
357 priv = seq->private;
358 if (priv->file) {
359 val = (val << 32) | low;
360 seq_printf(seq, "0x%llx\n", val);
361 } else
362 seq_printf(seq, " %08x: %08x_%08x\n",
363 type, high, low);
364 } else
365 printk(KERN_INFO " %08x: %08x_%08x\n", type, high, low);
366}
367
368/* This function can also be called with seq = NULL for printk */
369static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
370{
371 unsigned msr, msr_min, msr_max;
372 struct cpu_private *priv;
373 u32 low, high;
374 int i, range;
375
376 if (seq) {
377 priv = seq->private;
378 if (priv->file) {
379 if (!rdmsr_safe_on_cpu(priv->cpu, priv->reg,
380 &low, &high))
381 print_cpu_data(seq, priv->reg, low, high);
382 return;
383 }
384 }
385
386 range = get_cpu_range_count(cpu);
387
388 for (i = 0; i < range; i++) {
389 if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))
390 continue;
391
392 for (msr = msr_min; msr <= msr_max; msr++) {
393 if (rdmsr_safe_on_cpu(cpu, msr, &low, &high))
394 continue;
395 print_cpu_data(seq, msr, low, high);
396 }
397 }
398}
399
400static void print_tss(void *arg)
401{
402 struct pt_regs *regs = task_pt_regs(current);
403 struct seq_file *seq = arg;
404 unsigned int seg;
405
406 seq_printf(seq, " RAX\t: %016lx\n", regs->ax);
407 seq_printf(seq, " RBX\t: %016lx\n", regs->bx);
408 seq_printf(seq, " RCX\t: %016lx\n", regs->cx);
409 seq_printf(seq, " RDX\t: %016lx\n", regs->dx);
410
411 seq_printf(seq, " RSI\t: %016lx\n", regs->si);
412 seq_printf(seq, " RDI\t: %016lx\n", regs->di);
413 seq_printf(seq, " RBP\t: %016lx\n", regs->bp);
414 seq_printf(seq, " ESP\t: %016lx\n", regs->sp);
415
416#ifdef CONFIG_X86_64
417 seq_printf(seq, " R08\t: %016lx\n", regs->r8);
418 seq_printf(seq, " R09\t: %016lx\n", regs->r9);
419 seq_printf(seq, " R10\t: %016lx\n", regs->r10);
420 seq_printf(seq, " R11\t: %016lx\n", regs->r11);
421 seq_printf(seq, " R12\t: %016lx\n", regs->r12);
422 seq_printf(seq, " R13\t: %016lx\n", regs->r13);
423 seq_printf(seq, " R14\t: %016lx\n", regs->r14);
424 seq_printf(seq, " R15\t: %016lx\n", regs->r15);
425#endif
426
427 asm("movl %%cs,%0" : "=r" (seg));
428 seq_printf(seq, " CS\t: %04x\n", seg);
429 asm("movl %%ds,%0" : "=r" (seg));
430 seq_printf(seq, " DS\t: %04x\n", seg);
431 seq_printf(seq, " SS\t: %04lx\n", regs->ss & 0xffff);
432 asm("movl %%es,%0" : "=r" (seg));
433 seq_printf(seq, " ES\t: %04x\n", seg);
434 asm("movl %%fs,%0" : "=r" (seg));
435 seq_printf(seq, " FS\t: %04x\n", seg);
436 asm("movl %%gs,%0" : "=r" (seg));
437 seq_printf(seq, " GS\t: %04x\n", seg);
438
439 seq_printf(seq, " EFLAGS\t: %016lx\n", regs->flags);
440
441 seq_printf(seq, " EIP\t: %016lx\n", regs->ip);
442}
443
444static void print_cr(void *arg)
445{
446 struct seq_file *seq = arg;
447
448 seq_printf(seq, " cr0\t: %016lx\n", read_cr0());
449 seq_printf(seq, " cr2\t: %016lx\n", read_cr2());
450 seq_printf(seq, " cr3\t: %016lx\n", read_cr3());
451 seq_printf(seq, " cr4\t: %016lx\n", read_cr4_safe());
452#ifdef CONFIG_X86_64
453 seq_printf(seq, " cr8\t: %016lx\n", read_cr8());
454#endif
455}
456
457static void print_desc_ptr(char *str, struct seq_file *seq, struct desc_ptr dt)
458{
459 seq_printf(seq, " %s\t: %016llx\n", str, (u64)(dt.address | dt.size));
460}
461
462static void print_dt(void *seq)
463{
464 struct desc_ptr dt;
465 unsigned long ldt;
466
467 /* IDT */
468 store_idt((struct desc_ptr *)&dt);
469 print_desc_ptr("IDT", seq, dt);
470
471 /* GDT */
472 store_gdt((struct desc_ptr *)&dt);
473 print_desc_ptr("GDT", seq, dt);
474
475 /* LDT */
476 store_ldt(ldt);
477 seq_printf(seq, " LDT\t: %016lx\n", ldt);
478
479 /* TR */
480 store_tr(ldt);
481 seq_printf(seq, " TR\t: %016lx\n", ldt);
482}
483
484static void print_dr(void *arg)
485{
486 struct seq_file *seq = arg;
487 unsigned long dr;
488 int i;
489
490 for (i = 0; i < 8; i++) {
491 /* Ignore db4, db5 */
492 if ((i == 4) || (i == 5))
493 continue;
494 get_debugreg(dr, i);
495 seq_printf(seq, " dr%d\t: %016lx\n", i, dr);
496 }
497
498 seq_printf(seq, "\n MSR\t:\n");
499}
500
501static void print_apic(void *arg)
502{
503 struct seq_file *seq = arg;
504
505#ifdef CONFIG_X86_LOCAL_APIC
506 seq_printf(seq, " LAPIC\t:\n");
507 seq_printf(seq, " ID\t\t: %08x\n", apic_read(APIC_ID) >> 24);
508 seq_printf(seq, " LVR\t\t: %08x\n", apic_read(APIC_LVR));
509 seq_printf(seq, " TASKPRI\t: %08x\n", apic_read(APIC_TASKPRI));
510 seq_printf(seq, " ARBPRI\t\t: %08x\n", apic_read(APIC_ARBPRI));
511 seq_printf(seq, " PROCPRI\t: %08x\n", apic_read(APIC_PROCPRI));
512 seq_printf(seq, " LDR\t\t: %08x\n", apic_read(APIC_LDR));
513 seq_printf(seq, " DFR\t\t: %08x\n", apic_read(APIC_DFR));
514 seq_printf(seq, " SPIV\t\t: %08x\n", apic_read(APIC_SPIV));
515 seq_printf(seq, " ISR\t\t: %08x\n", apic_read(APIC_ISR));
516 seq_printf(seq, " ESR\t\t: %08x\n", apic_read(APIC_ESR));
517 seq_printf(seq, " ICR\t\t: %08x\n", apic_read(APIC_ICR));
518 seq_printf(seq, " ICR2\t\t: %08x\n", apic_read(APIC_ICR2));
519 seq_printf(seq, " LVTT\t\t: %08x\n", apic_read(APIC_LVTT));
520 seq_printf(seq, " LVTTHMR\t: %08x\n", apic_read(APIC_LVTTHMR));
521 seq_printf(seq, " LVTPC\t\t: %08x\n", apic_read(APIC_LVTPC));
522 seq_printf(seq, " LVT0\t\t: %08x\n", apic_read(APIC_LVT0));
523 seq_printf(seq, " LVT1\t\t: %08x\n", apic_read(APIC_LVT1));
524 seq_printf(seq, " LVTERR\t\t: %08x\n", apic_read(APIC_LVTERR));
525 seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT));
526 seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT));
527 seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR));
528#endif /* CONFIG_X86_LOCAL_APIC */
529
530 seq_printf(seq, "\n MSR\t:\n");
531}
532
533static int cpu_seq_show(struct seq_file *seq, void *v)
534{
535 struct cpu_private *priv = seq->private;
536
537 if (priv == NULL)
538 return -EINVAL;
539
540 switch (cpu_base[priv->type].flag) {
541 case CPU_TSS:
542 smp_call_function_single(priv->cpu, print_tss, seq, 1);
543 break;
544 case CPU_CR:
545 smp_call_function_single(priv->cpu, print_cr, seq, 1);
546 break;
547 case CPU_DT:
548 smp_call_function_single(priv->cpu, print_dt, seq, 1);
549 break;
550 case CPU_DEBUG:
551 if (priv->file == CPU_INDEX_BIT)
552 smp_call_function_single(priv->cpu, print_dr, seq, 1);
553 print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
554 break;
555 case CPU_APIC:
556 if (priv->file == CPU_INDEX_BIT)
557 smp_call_function_single(priv->cpu, print_apic, seq, 1);
558 print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
559 break;
560
561 default:
562 print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
563 break;
564 }
565 seq_printf(seq, "\n");
566
567 return 0;
568}
569
570static void *cpu_seq_start(struct seq_file *seq, loff_t *pos)
571{
572 if (*pos == 0) /* One time is enough ;-) */
573 return seq;
574
575 return NULL;
576}
577
578static void *cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
579{
580 (*pos)++;
581
582 return cpu_seq_start(seq, pos);
583}
584
585static void cpu_seq_stop(struct seq_file *seq, void *v)
586{
587}
588
589static const struct seq_operations cpu_seq_ops = {
590 .start = cpu_seq_start,
591 .next = cpu_seq_next,
592 .stop = cpu_seq_stop,
593 .show = cpu_seq_show,
594};
595
596static int cpu_seq_open(struct inode *inode, struct file *file)
597{
598 struct cpu_private *priv = inode->i_private;
599 struct seq_file *seq;
600 int err;
601
602 err = seq_open(file, &cpu_seq_ops);
603 if (!err) {
604 seq = file->private_data;
605 seq->private = priv;
606 }
607
608 return err;
609}
610
611static const struct file_operations cpu_fops = {
612 .open = cpu_seq_open,
613 .read = seq_read,
614 .llseek = seq_lseek,
615 .release = seq_release,
616};
617
618static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg,
619 unsigned file, struct dentry *dentry)
620{
621 struct cpu_private *priv = NULL;
622
623 /* Already intialized */
624 if (file == CPU_INDEX_BIT)
625 if (per_cpu(cpu_arr[type].init, cpu))
626 return 0;
627
628 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
629 if (priv == NULL)
630 return -ENOMEM;
631
632 priv->cpu = cpu;
633 priv->type = type;
634 priv->reg = reg;
635 priv->file = file;
636 mutex_lock(&cpu_debug_lock);
637 per_cpu(priv_arr[type], cpu) = priv;
638 per_cpu(cpu_priv_count, cpu)++;
639 mutex_unlock(&cpu_debug_lock);
640
641 if (file)
642 debugfs_create_file(cpu_file[file].name, S_IRUGO,
643 dentry, (void *)priv, &cpu_fops);
644 else {
645 debugfs_create_file(cpu_base[type].name, S_IRUGO,
646 per_cpu(cpu_arr[type].dentry, cpu),
647 (void *)priv, &cpu_fops);
648 mutex_lock(&cpu_debug_lock);
649 per_cpu(cpu_arr[type].init, cpu) = 1;
650 mutex_unlock(&cpu_debug_lock);
651 }
652
653 return 0;
654}
655
656static int cpu_init_regfiles(unsigned cpu, unsigned int type, unsigned reg,
657 struct dentry *dentry)
658{
659 unsigned file;
660 int err = 0;
661
662 for (file = 0; file < ARRAY_SIZE(cpu_file); file++) {
663 err = cpu_create_file(cpu, type, reg, file, dentry);
664 if (err)
665 return err;
666 }
667
668 return err;
669}
670
671static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)
672{
673 struct dentry *cpu_dentry = NULL;
674 unsigned reg, reg_min, reg_max;
675 int i, range, err = 0;
676 char reg_dir[12];
677 u32 low, high;
678
679 range = get_cpu_range_count(cpu);
680
681 for (i = 0; i < range; i++) {
682 if (!get_cpu_range(cpu, &reg_min, &reg_max, i,
683 cpu_base[type].flag))
684 continue;
685
686 for (reg = reg_min; reg <= reg_max; reg++) {
687 if (rdmsr_safe_on_cpu(cpu, reg, &low, &high))
688 continue;
689
690 sprintf(reg_dir, "0x%x", reg);
691 cpu_dentry = debugfs_create_dir(reg_dir, dentry);
692 err = cpu_init_regfiles(cpu, type, reg, cpu_dentry);
693 if (err)
694 return err;
695 }
696 }
697
698 return err;
699}
700
701static int cpu_init_allreg(unsigned cpu, struct dentry *dentry)
702{
703 struct dentry *cpu_dentry = NULL;
704 unsigned type;
705 int err = 0;
706
707 for (type = 0; type < ARRAY_SIZE(cpu_base) - 1; type++) {
708 if (!is_typeflag_valid(cpu, cpu_base[type].flag))
709 continue;
710 cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry);
711 per_cpu(cpu_arr[type].dentry, cpu) = cpu_dentry;
712
713 if (type < CPU_TSS_BIT)
714 err = cpu_init_msr(cpu, type, cpu_dentry);
715 else
716 err = cpu_create_file(cpu, type, 0, CPU_INDEX_BIT,
717 cpu_dentry);
718 if (err)
719 return err;
720 }
721
722 return err;
723}
724
725static int cpu_init_cpu(void)
726{
727 struct dentry *cpu_dentry = NULL;
728 struct cpuinfo_x86 *cpui;
729 char cpu_dir[12];
730 unsigned cpu;
731 int err = 0;
732
733 for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
734 cpui = &cpu_data(cpu);
735 if (!cpu_has(cpui, X86_FEATURE_MSR))
736 continue;
737 per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) |
738 (cpui->x86 << 8) |
739 (cpui->x86_model));
740 per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu);
741
742 sprintf(cpu_dir, "cpu%d", cpu);
743 cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir);
744 err = cpu_init_allreg(cpu, cpu_dentry);
745
746 pr_info("cpu%d(%d) debug files %d\n",
747 cpu, nr_cpu_ids, per_cpu(cpu_priv_count, cpu));
748 if (per_cpu(cpu_priv_count, cpu) > MAX_CPU_FILES) {
749 pr_err("Register files count %d exceeds limit %d\n",
750 per_cpu(cpu_priv_count, cpu), MAX_CPU_FILES);
751 per_cpu(cpu_priv_count, cpu) = MAX_CPU_FILES;
752 err = -ENFILE;
753 }
754 if (err)
755 return err;
756 }
757
758 return err;
759}
760
761static int __init cpu_debug_init(void)
762{
763 cpu_debugfs_dir = debugfs_create_dir("cpu", arch_debugfs_dir);
764
765 return cpu_init_cpu();
766}
767
768static void __exit cpu_debug_exit(void)
769{
770 int i, cpu;
771
772 if (cpu_debugfs_dir)
773 debugfs_remove_recursive(cpu_debugfs_dir);
774
775 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
776 for (i = 0; i < per_cpu(cpu_priv_count, cpu); i++)
777 kfree(per_cpu(priv_arr[i], cpu));
778}
779
780module_init(cpu_debug_init);
781module_exit(cpu_debug_exit);
782
783MODULE_AUTHOR("Jaswinder Singh Rajput");
784MODULE_DESCRIPTION("CPU Debug module");
785MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 4b1c319d30c3..5e40f54171e7 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -33,7 +33,7 @@
33#include <linux/cpufreq.h> 33#include <linux/cpufreq.h>
34#include <linux/compiler.h> 34#include <linux/compiler.h>
35#include <linux/dmi.h> 35#include <linux/dmi.h>
36#include <linux/ftrace.h> 36#include <trace/power.h>
37 37
38#include <linux/acpi.h> 38#include <linux/acpi.h>
39#include <acpi/processor.h> 39#include <acpi/processor.h>
@@ -70,6 +70,8 @@ struct acpi_cpufreq_data {
70 70
71static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); 71static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data);
72 72
73DEFINE_TRACE(power_mark);
74
73/* acpi_perf_data is a pointer to percpu data. */ 75/* acpi_perf_data is a pointer to percpu data. */
74static struct acpi_processor_performance *acpi_perf_data; 76static struct acpi_processor_performance *acpi_perf_data;
75 77
@@ -601,7 +603,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
601 if (!data) 603 if (!data)
602 return -ENOMEM; 604 return -ENOMEM;
603 605
604 data->acpi_data = percpu_ptr(acpi_perf_data, cpu); 606 data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
605 per_cpu(drv_data, cpu) = data; 607 per_cpu(drv_data, cpu) = data;
606 608
607 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) 609 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
diff --git a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
index c2f930d86640..41ab3f064cb1 100644
--- a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
+++ b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
@@ -204,12 +204,12 @@ static int eps_cpu_init(struct cpufreq_policy *policy)
204 } 204 }
205 /* Enable Enhanced PowerSaver */ 205 /* Enable Enhanced PowerSaver */
206 rdmsrl(MSR_IA32_MISC_ENABLE, val); 206 rdmsrl(MSR_IA32_MISC_ENABLE, val);
207 if (!(val & 1 << 16)) { 207 if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
208 val |= 1 << 16; 208 val |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP;
209 wrmsrl(MSR_IA32_MISC_ENABLE, val); 209 wrmsrl(MSR_IA32_MISC_ENABLE, val);
210 /* Can be locked at 0 */ 210 /* Can be locked at 0 */
211 rdmsrl(MSR_IA32_MISC_ENABLE, val); 211 rdmsrl(MSR_IA32_MISC_ENABLE, val);
212 if (!(val & 1 << 16)) { 212 if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
213 printk(KERN_INFO "eps: Can't enable Enhanced PowerSaver\n"); 213 printk(KERN_INFO "eps: Can't enable Enhanced PowerSaver\n");
214 return -ENODEV; 214 return -ENODEV;
215 } 215 }
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index f08998278a3a..c9f1fdc02830 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -390,14 +390,14 @@ static int centrino_cpu_init(struct cpufreq_policy *policy)
390 enable it if not. */ 390 enable it if not. */
391 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 391 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
392 392
393 if (!(l & (1<<16))) { 393 if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
394 l |= (1<<16); 394 l |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP;
395 dprintk("trying to enable Enhanced SpeedStep (%x)\n", l); 395 dprintk("trying to enable Enhanced SpeedStep (%x)\n", l);
396 wrmsr(MSR_IA32_MISC_ENABLE, l, h); 396 wrmsr(MSR_IA32_MISC_ENABLE, l, h);
397 397
398 /* check to see if it stuck */ 398 /* check to see if it stuck */
399 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 399 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
400 if (!(l & (1<<16))) { 400 if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
401 printk(KERN_INFO PFX 401 printk(KERN_INFO PFX
402 "couldn't enable Enhanced SpeedStep\n"); 402 "couldn't enable Enhanced SpeedStep\n");
403 return -ENODEV; 403 return -ENODEV;
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 24ff26a38ade..c1c04bf0df77 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -4,6 +4,7 @@
4#include <linux/string.h> 4#include <linux/string.h>
5#include <linux/bitops.h> 5#include <linux/bitops.h>
6#include <linux/smp.h> 6#include <linux/smp.h>
7#include <linux/sched.h>
7#include <linux/thread_info.h> 8#include <linux/thread_info.h>
8#include <linux/module.h> 9#include <linux/module.h>
9 10
@@ -13,6 +14,7 @@
13#include <asm/uaccess.h> 14#include <asm/uaccess.h>
14#include <asm/ds.h> 15#include <asm/ds.h>
15#include <asm/bugs.h> 16#include <asm/bugs.h>
17#include <asm/cpu.h>
16 18
17#ifdef CONFIG_X86_64 19#ifdef CONFIG_X86_64
18#include <asm/topology.h> 20#include <asm/topology.h>
@@ -24,7 +26,6 @@
24#ifdef CONFIG_X86_LOCAL_APIC 26#ifdef CONFIG_X86_LOCAL_APIC
25#include <asm/mpspec.h> 27#include <asm/mpspec.h>
26#include <asm/apic.h> 28#include <asm/apic.h>
27#include <mach_apic.h>
28#endif 29#endif
29 30
30static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) 31static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
@@ -56,13 +57,30 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
56 57
57 /* 58 /*
58 * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate 59 * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
59 * with P/T states and does not stop in deep C-states 60 * with P/T states and does not stop in deep C-states.
61 *
62 * It is also reliable across cores and sockets. (but not across
63 * cabinets - we turn it off in that case explicitly.)
60 */ 64 */
61 if (c->x86_power & (1 << 8)) { 65 if (c->x86_power & (1 << 8)) {
62 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 66 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
63 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); 67 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
68 set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
69 sched_clock_stable = 1;
64 } 70 }
65 71
72 /*
73 * There is a known erratum on Pentium III and Core Solo
74 * and Core Duo CPUs.
75 * " Page with PAT set to WC while associated MTRR is UC
76 * may consolidate to UC "
77 * Because of this erratum, it is better to stick with
78 * setting WC in MTRR rather than using PAT on these CPUs.
79 *
80 * Enable PAT WC only on P4, Core 2 or later CPUs.
81 */
82 if (c->x86 == 6 && c->x86_model < 15)
83 clear_cpu_cap(c, X86_FEATURE_PAT);
66} 84}
67 85
68#ifdef CONFIG_X86_32 86#ifdef CONFIG_X86_32
@@ -99,6 +117,28 @@ static void __cpuinit trap_init_f00f_bug(void)
99} 117}
100#endif 118#endif
101 119
120static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
121{
122#ifdef CONFIG_SMP
123 /* calling is from identify_secondary_cpu() ? */
124 if (c->cpu_index == boot_cpu_id)
125 return;
126
127 /*
128 * Mask B, Pentium, but not Pentium MMX
129 */
130 if (c->x86 == 5 &&
131 c->x86_mask >= 1 && c->x86_mask <= 4 &&
132 c->x86_model <= 3) {
133 /*
134 * Remember we have B step Pentia with bugs
135 */
136 WARN_ONCE(1, "WARNING: SMP operation may be unreliable"
137 "with B stepping processors.\n");
138 }
139#endif
140}
141
102static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) 142static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
103{ 143{
104 unsigned long lo, hi; 144 unsigned long lo, hi;
@@ -135,10 +175,10 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
135 */ 175 */
136 if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) { 176 if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
137 rdmsr(MSR_IA32_MISC_ENABLE, lo, hi); 177 rdmsr(MSR_IA32_MISC_ENABLE, lo, hi);
138 if ((lo & (1<<9)) == 0) { 178 if ((lo & MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE) == 0) {
139 printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n"); 179 printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n");
140 printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n"); 180 printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n");
141 lo |= (1<<9); /* Disable hw prefetching */ 181 lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE;
142 wrmsr (MSR_IA32_MISC_ENABLE, lo, hi); 182 wrmsr (MSR_IA32_MISC_ENABLE, lo, hi);
143 } 183 }
144 } 184 }
@@ -175,6 +215,8 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
175#ifdef CONFIG_X86_NUMAQ 215#ifdef CONFIG_X86_NUMAQ
176 numaq_tsc_disable(); 216 numaq_tsc_disable();
177#endif 217#endif
218
219 intel_smp_check(c);
178} 220}
179#else 221#else
180static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) 222static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index da299eb85fc0..7293508d8f5c 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -147,7 +147,16 @@ struct _cpuid4_info {
147 union _cpuid4_leaf_ecx ecx; 147 union _cpuid4_leaf_ecx ecx;
148 unsigned long size; 148 unsigned long size;
149 unsigned long can_disable; 149 unsigned long can_disable;
150 cpumask_t shared_cpu_map; /* future?: only cpus/node is needed */ 150 DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
151};
152
153/* subset of above _cpuid4_info w/o shared_cpu_map */
154struct _cpuid4_info_regs {
155 union _cpuid4_leaf_eax eax;
156 union _cpuid4_leaf_ebx ebx;
157 union _cpuid4_leaf_ecx ecx;
158 unsigned long size;
159 unsigned long can_disable;
151}; 160};
152 161
153#ifdef CONFIG_PCI 162#ifdef CONFIG_PCI
@@ -278,7 +287,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
278} 287}
279 288
280static void __cpuinit 289static void __cpuinit
281amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf) 290amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
282{ 291{
283 if (index < 3) 292 if (index < 3)
284 return; 293 return;
@@ -286,7 +295,8 @@ amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf)
286} 295}
287 296
288static int 297static int
289__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) 298__cpuinit cpuid4_cache_lookup_regs(int index,
299 struct _cpuid4_info_regs *this_leaf)
290{ 300{
291 union _cpuid4_leaf_eax eax; 301 union _cpuid4_leaf_eax eax;
292 union _cpuid4_leaf_ebx ebx; 302 union _cpuid4_leaf_ebx ebx;
@@ -314,6 +324,15 @@ __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
314 return 0; 324 return 0;
315} 325}
316 326
327static int
328__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
329{
330 struct _cpuid4_info_regs *leaf_regs =
331 (struct _cpuid4_info_regs *)this_leaf;
332
333 return cpuid4_cache_lookup_regs(index, leaf_regs);
334}
335
317static int __cpuinit find_num_cache_leaves(void) 336static int __cpuinit find_num_cache_leaves(void)
318{ 337{
319 unsigned int eax, ebx, ecx, edx; 338 unsigned int eax, ebx, ecx, edx;
@@ -353,11 +372,10 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
353 * parameters cpuid leaf to find the cache details 372 * parameters cpuid leaf to find the cache details
354 */ 373 */
355 for (i = 0; i < num_cache_leaves; i++) { 374 for (i = 0; i < num_cache_leaves; i++) {
356 struct _cpuid4_info this_leaf; 375 struct _cpuid4_info_regs this_leaf;
357
358 int retval; 376 int retval;
359 377
360 retval = cpuid4_cache_lookup(i, &this_leaf); 378 retval = cpuid4_cache_lookup_regs(i, &this_leaf);
361 if (retval >= 0) { 379 if (retval >= 0) {
362 switch(this_leaf.eax.split.level) { 380 switch(this_leaf.eax.split.level) {
363 case 1: 381 case 1:
@@ -506,17 +524,20 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
506 num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing; 524 num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing;
507 525
508 if (num_threads_sharing == 1) 526 if (num_threads_sharing == 1)
509 cpu_set(cpu, this_leaf->shared_cpu_map); 527 cpumask_set_cpu(cpu, to_cpumask(this_leaf->shared_cpu_map));
510 else { 528 else {
511 index_msb = get_count_order(num_threads_sharing); 529 index_msb = get_count_order(num_threads_sharing);
512 530
513 for_each_online_cpu(i) { 531 for_each_online_cpu(i) {
514 if (cpu_data(i).apicid >> index_msb == 532 if (cpu_data(i).apicid >> index_msb ==
515 c->apicid >> index_msb) { 533 c->apicid >> index_msb) {
516 cpu_set(i, this_leaf->shared_cpu_map); 534 cpumask_set_cpu(i,
535 to_cpumask(this_leaf->shared_cpu_map));
517 if (i != cpu && per_cpu(cpuid4_info, i)) { 536 if (i != cpu && per_cpu(cpuid4_info, i)) {
518 sibling_leaf = CPUID4_INFO_IDX(i, index); 537 sibling_leaf =
519 cpu_set(cpu, sibling_leaf->shared_cpu_map); 538 CPUID4_INFO_IDX(i, index);
539 cpumask_set_cpu(cpu, to_cpumask(
540 sibling_leaf->shared_cpu_map));
520 } 541 }
521 } 542 }
522 } 543 }
@@ -528,9 +549,10 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
528 int sibling; 549 int sibling;
529 550
530 this_leaf = CPUID4_INFO_IDX(cpu, index); 551 this_leaf = CPUID4_INFO_IDX(cpu, index);
531 for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) { 552 for_each_cpu(sibling, to_cpumask(this_leaf->shared_cpu_map)) {
532 sibling_leaf = CPUID4_INFO_IDX(sibling, index); 553 sibling_leaf = CPUID4_INFO_IDX(sibling, index);
533 cpu_clear(cpu, sibling_leaf->shared_cpu_map); 554 cpumask_clear_cpu(cpu,
555 to_cpumask(sibling_leaf->shared_cpu_map));
534 } 556 }
535} 557}
536#else 558#else
@@ -635,8 +657,9 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
635 int n = 0; 657 int n = 0;
636 658
637 if (len > 1) { 659 if (len > 1) {
638 cpumask_t *mask = &this_leaf->shared_cpu_map; 660 const struct cpumask *mask;
639 661
662 mask = to_cpumask(this_leaf->shared_cpu_map);
640 n = type? 663 n = type?
641 cpulist_scnprintf(buf, len-2, mask) : 664 cpulist_scnprintf(buf, len-2, mask) :
642 cpumask_scnprintf(buf, len-2, mask); 665 cpumask_scnprintf(buf, len-2, mask);
@@ -699,7 +722,8 @@ static struct pci_dev *get_k8_northbridge(int node)
699 722
700static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf) 723static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
701{ 724{
702 int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map)); 725 const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
726 int node = cpu_to_node(cpumask_first(mask));
703 struct pci_dev *dev = NULL; 727 struct pci_dev *dev = NULL;
704 ssize_t ret = 0; 728 ssize_t ret = 0;
705 int i; 729 int i;
@@ -733,7 +757,8 @@ static ssize_t
733store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, 757store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf,
734 size_t count) 758 size_t count)
735{ 759{
736 int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map)); 760 const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
761 int node = cpu_to_node(cpumask_first(mask));
737 struct pci_dev *dev = NULL; 762 struct pci_dev *dev = NULL;
738 unsigned int ret, index, val; 763 unsigned int ret, index, val;
739 764
@@ -878,7 +903,7 @@ err_out:
878 return -ENOMEM; 903 return -ENOMEM;
879} 904}
880 905
881static cpumask_t cache_dev_map = CPU_MASK_NONE; 906static DECLARE_BITMAP(cache_dev_map, NR_CPUS);
882 907
883/* Add/Remove cache interface for CPU device */ 908/* Add/Remove cache interface for CPU device */
884static int __cpuinit cache_add_dev(struct sys_device * sys_dev) 909static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
@@ -918,7 +943,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
918 } 943 }
919 kobject_uevent(&(this_object->kobj), KOBJ_ADD); 944 kobject_uevent(&(this_object->kobj), KOBJ_ADD);
920 } 945 }
921 cpu_set(cpu, cache_dev_map); 946 cpumask_set_cpu(cpu, to_cpumask(cache_dev_map));
922 947
923 kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD); 948 kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD);
924 return 0; 949 return 0;
@@ -931,9 +956,9 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
931 956
932 if (per_cpu(cpuid4_info, cpu) == NULL) 957 if (per_cpu(cpuid4_info, cpu) == NULL)
933 return; 958 return;
934 if (!cpu_isset(cpu, cache_dev_map)) 959 if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map)))
935 return; 960 return;
936 cpu_clear(cpu, cache_dev_map); 961 cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map));
937 962
938 for (i = 0; i < num_cache_leaves; i++) 963 for (i = 0; i < num_cache_leaves; i++)
939 kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj)); 964 kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj));
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index d7d2323bbb69..b2f89829bbe8 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o
4obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o 4obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o
5obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o 5obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o
6obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o 6obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
7obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
index dfaebce3633e..3552119b091d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
@@ -60,20 +60,6 @@ void mcheck_init(struct cpuinfo_x86 *c)
60 } 60 }
61} 61}
62 62
63static unsigned long old_cr4 __initdata;
64
65void __init stop_mce(void)
66{
67 old_cr4 = read_cr4();
68 clear_in_cr4(X86_CR4_MCE);
69}
70
71void __init restart_mce(void)
72{
73 if (old_cr4 & X86_CR4_MCE)
74 set_in_cr4(X86_CR4_MCE);
75}
76
77static int __init mcheck_disable(char *str) 63static int __init mcheck_disable(char *str)
78{ 64{
79 mce_disabled = 1; 65 mce_disabled = 1;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index fe79985ce0f2..ca14604611ec 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -3,6 +3,8 @@
3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. 3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 * Rest from unknown author(s). 4 * Rest from unknown author(s).
5 * 2004 Andi Kleen. Rewrote most of it. 5 * 2004 Andi Kleen. Rewrote most of it.
6 * Copyright 2008 Intel Corporation
7 * Author: Andi Kleen
6 */ 8 */
7 9
8#include <linux/init.h> 10#include <linux/init.h>
@@ -24,6 +26,9 @@
24#include <linux/ctype.h> 26#include <linux/ctype.h>
25#include <linux/kmod.h> 27#include <linux/kmod.h>
26#include <linux/kdebug.h> 28#include <linux/kdebug.h>
29#include <linux/kobject.h>
30#include <linux/sysfs.h>
31#include <linux/ratelimit.h>
27#include <asm/processor.h> 32#include <asm/processor.h>
28#include <asm/msr.h> 33#include <asm/msr.h>
29#include <asm/mce.h> 34#include <asm/mce.h>
@@ -32,7 +37,6 @@
32#include <asm/idle.h> 37#include <asm/idle.h>
33 38
34#define MISC_MCELOG_MINOR 227 39#define MISC_MCELOG_MINOR 227
35#define NR_SYSFS_BANKS 6
36 40
37atomic_t mce_entry; 41atomic_t mce_entry;
38 42
@@ -47,7 +51,7 @@ static int mce_dont_init;
47 */ 51 */
48static int tolerant = 1; 52static int tolerant = 1;
49static int banks; 53static int banks;
50static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL }; 54static u64 *bank;
51static unsigned long notify_user; 55static unsigned long notify_user;
52static int rip_msr; 56static int rip_msr;
53static int mce_bootlog = -1; 57static int mce_bootlog = -1;
@@ -58,6 +62,19 @@ static char *trigger_argv[2] = { trigger, NULL };
58 62
59static DECLARE_WAIT_QUEUE_HEAD(mce_wait); 63static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
60 64
65/* MCA banks polled by the period polling timer for corrected events */
66DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
67 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
68};
69
70/* Do initial initialization of a struct mce */
71void mce_setup(struct mce *m)
72{
73 memset(m, 0, sizeof(struct mce));
74 m->cpu = smp_processor_id();
75 rdtscll(m->tsc);
76}
77
61/* 78/*
62 * Lockless MCE logging infrastructure. 79 * Lockless MCE logging infrastructure.
63 * This avoids deadlocks on printk locks without having to break locks. Also 80 * This avoids deadlocks on printk locks without having to break locks. Also
@@ -119,11 +136,11 @@ static void print_mce(struct mce *m)
119 print_symbol("{%s}", m->ip); 136 print_symbol("{%s}", m->ip);
120 printk("\n"); 137 printk("\n");
121 } 138 }
122 printk(KERN_EMERG "TSC %Lx ", m->tsc); 139 printk(KERN_EMERG "TSC %llx ", m->tsc);
123 if (m->addr) 140 if (m->addr)
124 printk("ADDR %Lx ", m->addr); 141 printk("ADDR %llx ", m->addr);
125 if (m->misc) 142 if (m->misc)
126 printk("MISC %Lx ", m->misc); 143 printk("MISC %llx ", m->misc);
127 printk("\n"); 144 printk("\n");
128 printk(KERN_EMERG "This is not a software problem!\n"); 145 printk(KERN_EMERG "This is not a software problem!\n");
129 printk(KERN_EMERG "Run through mcelog --ascii to decode " 146 printk(KERN_EMERG "Run through mcelog --ascii to decode "
@@ -149,8 +166,10 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start)
149 panic(msg); 166 panic(msg);
150} 167}
151 168
152static int mce_available(struct cpuinfo_x86 *c) 169int mce_available(struct cpuinfo_x86 *c)
153{ 170{
171 if (mce_dont_init)
172 return 0;
154 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); 173 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
155} 174}
156 175
@@ -172,7 +191,77 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
172} 191}
173 192
174/* 193/*
175 * The actual machine check handler 194 * Poll for corrected events or events that happened before reset.
195 * Those are just logged through /dev/mcelog.
196 *
197 * This is executed in standard interrupt context.
198 */
199void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
200{
201 struct mce m;
202 int i;
203
204 mce_setup(&m);
205
206 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
207 for (i = 0; i < banks; i++) {
208 if (!bank[i] || !test_bit(i, *b))
209 continue;
210
211 m.misc = 0;
212 m.addr = 0;
213 m.bank = i;
214 m.tsc = 0;
215
216 barrier();
217 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
218 if (!(m.status & MCI_STATUS_VAL))
219 continue;
220
221 /*
222 * Uncorrected events are handled by the exception handler
223 * when it is enabled. But when the exception is disabled log
224 * everything.
225 *
226 * TBD do the same check for MCI_STATUS_EN here?
227 */
228 if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
229 continue;
230
231 if (m.status & MCI_STATUS_MISCV)
232 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
233 if (m.status & MCI_STATUS_ADDRV)
234 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
235
236 if (!(flags & MCP_TIMESTAMP))
237 m.tsc = 0;
238 /*
239 * Don't get the IP here because it's unlikely to
240 * have anything to do with the actual error location.
241 */
242
243 mce_log(&m);
244 add_taint(TAINT_MACHINE_CHECK);
245
246 /*
247 * Clear state for this bank.
248 */
249 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
250 }
251
252 /*
253 * Don't clear MCG_STATUS here because it's only defined for
254 * exceptions.
255 */
256}
257
258/*
259 * The actual machine check handler. This only handles real
260 * exceptions when something got corrupted coming in through int 18.
261 *
262 * This is executed in NMI context not subject to normal locking rules. This
263 * implies that most kernel services cannot be safely used. Don't even
264 * think about putting a printk in there!
176 */ 265 */
177void do_machine_check(struct pt_regs * regs, long error_code) 266void do_machine_check(struct pt_regs * regs, long error_code)
178{ 267{
@@ -190,17 +279,18 @@ void do_machine_check(struct pt_regs * regs, long error_code)
190 * error. 279 * error.
191 */ 280 */
192 int kill_it = 0; 281 int kill_it = 0;
282 DECLARE_BITMAP(toclear, MAX_NR_BANKS);
193 283
194 atomic_inc(&mce_entry); 284 atomic_inc(&mce_entry);
195 285
196 if ((regs 286 if (notify_die(DIE_NMI, "machine check", regs, error_code,
197 && notify_die(DIE_NMI, "machine check", regs, error_code,
198 18, SIGKILL) == NOTIFY_STOP) 287 18, SIGKILL) == NOTIFY_STOP)
199 || !banks) 288 goto out2;
289 if (!banks)
200 goto out2; 290 goto out2;
201 291
202 memset(&m, 0, sizeof(struct mce)); 292 mce_setup(&m);
203 m.cpu = smp_processor_id(); 293
204 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); 294 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
205 /* if the restart IP is not valid, we're done for */ 295 /* if the restart IP is not valid, we're done for */
206 if (!(m.mcgstatus & MCG_STATUS_RIPV)) 296 if (!(m.mcgstatus & MCG_STATUS_RIPV))
@@ -210,18 +300,32 @@ void do_machine_check(struct pt_regs * regs, long error_code)
210 barrier(); 300 barrier();
211 301
212 for (i = 0; i < banks; i++) { 302 for (i = 0; i < banks; i++) {
213 if (i < NR_SYSFS_BANKS && !bank[i]) 303 __clear_bit(i, toclear);
304 if (!bank[i])
214 continue; 305 continue;
215 306
216 m.misc = 0; 307 m.misc = 0;
217 m.addr = 0; 308 m.addr = 0;
218 m.bank = i; 309 m.bank = i;
219 m.tsc = 0;
220 310
221 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); 311 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
222 if ((m.status & MCI_STATUS_VAL) == 0) 312 if ((m.status & MCI_STATUS_VAL) == 0)
223 continue; 313 continue;
224 314
315 /*
316 * Non uncorrected errors are handled by machine_check_poll
317 * Leave them alone.
318 */
319 if ((m.status & MCI_STATUS_UC) == 0)
320 continue;
321
322 /*
323 * Set taint even when machine check was not enabled.
324 */
325 add_taint(TAINT_MACHINE_CHECK);
326
327 __set_bit(i, toclear);
328
225 if (m.status & MCI_STATUS_EN) { 329 if (m.status & MCI_STATUS_EN) {
226 /* if PCC was set, there's no way out */ 330 /* if PCC was set, there's no way out */
227 no_way_out |= !!(m.status & MCI_STATUS_PCC); 331 no_way_out |= !!(m.status & MCI_STATUS_PCC);
@@ -235,6 +339,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)
235 no_way_out = 1; 339 no_way_out = 1;
236 kill_it = 1; 340 kill_it = 1;
237 } 341 }
342 } else {
343 /*
344 * Machine check event was not enabled. Clear, but
345 * ignore.
346 */
347 continue;
238 } 348 }
239 349
240 if (m.status & MCI_STATUS_MISCV) 350 if (m.status & MCI_STATUS_MISCV)
@@ -243,10 +353,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
243 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); 353 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
244 354
245 mce_get_rip(&m, regs); 355 mce_get_rip(&m, regs);
246 if (error_code >= 0) 356 mce_log(&m);
247 rdtscll(m.tsc);
248 if (error_code != -2)
249 mce_log(&m);
250 357
251 /* Did this bank cause the exception? */ 358 /* Did this bank cause the exception? */
252 /* Assume that the bank with uncorrectable errors did it, 359 /* Assume that the bank with uncorrectable errors did it,
@@ -255,14 +362,8 @@ void do_machine_check(struct pt_regs * regs, long error_code)
255 panicm = m; 362 panicm = m;
256 panicm_found = 1; 363 panicm_found = 1;
257 } 364 }
258
259 add_taint(TAINT_MACHINE_CHECK);
260 } 365 }
261 366
262 /* Never do anything final in the polling timer */
263 if (!regs)
264 goto out;
265
266 /* If we didn't find an uncorrectable error, pick 367 /* If we didn't find an uncorrectable error, pick
267 the last one (shouldn't happen, just being safe). */ 368 the last one (shouldn't happen, just being safe). */
268 if (!panicm_found) 369 if (!panicm_found)
@@ -309,10 +410,11 @@ void do_machine_check(struct pt_regs * regs, long error_code)
309 /* notify userspace ASAP */ 410 /* notify userspace ASAP */
310 set_thread_flag(TIF_MCE_NOTIFY); 411 set_thread_flag(TIF_MCE_NOTIFY);
311 412
312 out:
313 /* the last thing we do is clear state */ 413 /* the last thing we do is clear state */
314 for (i = 0; i < banks; i++) 414 for (i = 0; i < banks; i++) {
315 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 415 if (test_bit(i, toclear))
416 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
417 }
316 wrmsrl(MSR_IA32_MCG_STATUS, 0); 418 wrmsrl(MSR_IA32_MCG_STATUS, 0);
317 out2: 419 out2:
318 atomic_dec(&mce_entry); 420 atomic_dec(&mce_entry);
@@ -332,15 +434,13 @@ void do_machine_check(struct pt_regs * regs, long error_code)
332 * and historically has been the register value of the 434 * and historically has been the register value of the
333 * MSR_IA32_THERMAL_STATUS (Intel) msr. 435 * MSR_IA32_THERMAL_STATUS (Intel) msr.
334 */ 436 */
335void mce_log_therm_throt_event(unsigned int cpu, __u64 status) 437void mce_log_therm_throt_event(__u64 status)
336{ 438{
337 struct mce m; 439 struct mce m;
338 440
339 memset(&m, 0, sizeof(m)); 441 mce_setup(&m);
340 m.cpu = cpu;
341 m.bank = MCE_THERMAL_BANK; 442 m.bank = MCE_THERMAL_BANK;
342 m.status = status; 443 m.status = status;
343 rdtscll(m.tsc);
344 mce_log(&m); 444 mce_log(&m);
345} 445}
346#endif /* CONFIG_X86_MCE_INTEL */ 446#endif /* CONFIG_X86_MCE_INTEL */
@@ -353,18 +453,18 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
353 453
354static int check_interval = 5 * 60; /* 5 minutes */ 454static int check_interval = 5 * 60; /* 5 minutes */
355static int next_interval; /* in jiffies */ 455static int next_interval; /* in jiffies */
356static void mcheck_timer(struct work_struct *work); 456static void mcheck_timer(unsigned long);
357static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer); 457static DEFINE_PER_CPU(struct timer_list, mce_timer);
358 458
359static void mcheck_check_cpu(void *info) 459static void mcheck_timer(unsigned long data)
360{ 460{
361 if (mce_available(&current_cpu_data)) 461 struct timer_list *t = &per_cpu(mce_timer, data);
362 do_machine_check(NULL, 0);
363}
364 462
365static void mcheck_timer(struct work_struct *work) 463 WARN_ON(smp_processor_id() != data);
366{ 464
367 on_each_cpu(mcheck_check_cpu, NULL, 1); 465 if (mce_available(&current_cpu_data))
466 machine_check_poll(MCP_TIMESTAMP,
467 &__get_cpu_var(mce_poll_banks));
368 468
369 /* 469 /*
370 * Alert userspace if needed. If we logged an MCE, reduce the 470 * Alert userspace if needed. If we logged an MCE, reduce the
@@ -377,31 +477,41 @@ static void mcheck_timer(struct work_struct *work)
377 (int)round_jiffies_relative(check_interval*HZ)); 477 (int)round_jiffies_relative(check_interval*HZ));
378 } 478 }
379 479
380 schedule_delayed_work(&mcheck_work, next_interval); 480 t->expires = jiffies + next_interval;
481 add_timer(t);
482}
483
484static void mce_do_trigger(struct work_struct *work)
485{
486 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
381} 487}
382 488
489static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
490
383/* 491/*
384 * This is only called from process context. This is where we do 492 * Notify the user(s) about new machine check events.
385 * anything we need to alert userspace about new MCEs. This is called 493 * Can be called from interrupt context, but not from machine check/NMI
386 * directly from the poller and also from entry.S and idle, thanks to 494 * context.
387 * TIF_MCE_NOTIFY.
388 */ 495 */
389int mce_notify_user(void) 496int mce_notify_user(void)
390{ 497{
498 /* Not more than two messages every minute */
499 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
500
391 clear_thread_flag(TIF_MCE_NOTIFY); 501 clear_thread_flag(TIF_MCE_NOTIFY);
392 if (test_and_clear_bit(0, &notify_user)) { 502 if (test_and_clear_bit(0, &notify_user)) {
393 static unsigned long last_print;
394 unsigned long now = jiffies;
395
396 wake_up_interruptible(&mce_wait); 503 wake_up_interruptible(&mce_wait);
397 if (trigger[0])
398 call_usermodehelper(trigger, trigger_argv, NULL,
399 UMH_NO_WAIT);
400 504
401 if (time_after_eq(now, last_print + (check_interval*HZ))) { 505 /*
402 last_print = now; 506 * There is no risk of missing notifications because
507 * work_pending is always cleared before the function is
508 * executed.
509 */
510 if (trigger[0] && !work_pending(&mce_trigger_work))
511 schedule_work(&mce_trigger_work);
512
513 if (__ratelimit(&ratelimit))
403 printk(KERN_INFO "Machine check events logged\n"); 514 printk(KERN_INFO "Machine check events logged\n");
404 }
405 515
406 return 1; 516 return 1;
407 } 517 }
@@ -425,63 +535,78 @@ static struct notifier_block mce_idle_notifier = {
425 535
426static __init int periodic_mcheck_init(void) 536static __init int periodic_mcheck_init(void)
427{ 537{
428 next_interval = check_interval * HZ; 538 idle_notifier_register(&mce_idle_notifier);
429 if (next_interval) 539 return 0;
430 schedule_delayed_work(&mcheck_work,
431 round_jiffies_relative(next_interval));
432 idle_notifier_register(&mce_idle_notifier);
433 return 0;
434} 540}
435__initcall(periodic_mcheck_init); 541__initcall(periodic_mcheck_init);
436 542
437
438/* 543/*
439 * Initialize Machine Checks for a CPU. 544 * Initialize Machine Checks for a CPU.
440 */ 545 */
441static void mce_init(void *dummy) 546static int mce_cap_init(void)
442{ 547{
443 u64 cap; 548 u64 cap;
444 int i; 549 unsigned b;
445 550
446 rdmsrl(MSR_IA32_MCG_CAP, cap); 551 rdmsrl(MSR_IA32_MCG_CAP, cap);
447 banks = cap & 0xff; 552 b = cap & 0xff;
448 if (banks > MCE_EXTENDED_BANK) { 553 if (b > MAX_NR_BANKS) {
449 banks = MCE_EXTENDED_BANK; 554 printk(KERN_WARNING
450 printk(KERN_INFO "MCE: warning: using only %d banks\n", 555 "MCE: Using only %u machine check banks out of %u\n",
451 MCE_EXTENDED_BANK); 556 MAX_NR_BANKS, b);
557 b = MAX_NR_BANKS;
452 } 558 }
559
560 /* Don't support asymmetric configurations today */
561 WARN_ON(banks != 0 && b != banks);
562 banks = b;
563 if (!bank) {
564 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
565 if (!bank)
566 return -ENOMEM;
567 memset(bank, 0xff, banks * sizeof(u64));
568 }
569
453 /* Use accurate RIP reporting if available. */ 570 /* Use accurate RIP reporting if available. */
454 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) 571 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
455 rip_msr = MSR_IA32_MCG_EIP; 572 rip_msr = MSR_IA32_MCG_EIP;
456 573
457 /* Log the machine checks left over from the previous reset. 574 return 0;
458 This also clears all registers */ 575}
459 do_machine_check(NULL, mce_bootlog ? -1 : -2); 576
577static void mce_init(void *dummy)
578{
579 u64 cap;
580 int i;
581 mce_banks_t all_banks;
582
583 /*
584 * Log the machine checks left over from the previous reset.
585 */
586 bitmap_fill(all_banks, MAX_NR_BANKS);
587 machine_check_poll(MCP_UC, &all_banks);
460 588
461 set_in_cr4(X86_CR4_MCE); 589 set_in_cr4(X86_CR4_MCE);
462 590
591 rdmsrl(MSR_IA32_MCG_CAP, cap);
463 if (cap & MCG_CTL_P) 592 if (cap & MCG_CTL_P)
464 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 593 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
465 594
466 for (i = 0; i < banks; i++) { 595 for (i = 0; i < banks; i++) {
467 if (i < NR_SYSFS_BANKS) 596 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
468 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
469 else
470 wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);
471
472 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 597 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
473 } 598 }
474} 599}
475 600
476/* Add per CPU specific workarounds here */ 601/* Add per CPU specific workarounds here */
477static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) 602static void mce_cpu_quirks(struct cpuinfo_x86 *c)
478{ 603{
479 /* This should be disabled by the BIOS, but isn't always */ 604 /* This should be disabled by the BIOS, but isn't always */
480 if (c->x86_vendor == X86_VENDOR_AMD) { 605 if (c->x86_vendor == X86_VENDOR_AMD) {
481 if(c->x86 == 15) 606 if (c->x86 == 15 && banks > 4)
482 /* disable GART TBL walk error reporting, which trips off 607 /* disable GART TBL walk error reporting, which trips off
483 incorrectly with the IOMMU & 3ware & Cerberus. */ 608 incorrectly with the IOMMU & 3ware & Cerberus. */
484 clear_bit(10, &bank[4]); 609 clear_bit(10, (unsigned long *)&bank[4]);
485 if(c->x86 <= 17 && mce_bootlog < 0) 610 if(c->x86 <= 17 && mce_bootlog < 0)
486 /* Lots of broken BIOS around that don't clear them 611 /* Lots of broken BIOS around that don't clear them
487 by default and leave crap in there. Don't log. */ 612 by default and leave crap in there. Don't log. */
@@ -504,20 +629,38 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
504 } 629 }
505} 630}
506 631
632static void mce_init_timer(void)
633{
634 struct timer_list *t = &__get_cpu_var(mce_timer);
635
636 /* data race harmless because everyone sets to the same value */
637 if (!next_interval)
638 next_interval = check_interval * HZ;
639 if (!next_interval)
640 return;
641 setup_timer(t, mcheck_timer, smp_processor_id());
642 t->expires = round_jiffies(jiffies + next_interval);
643 add_timer(t);
644}
645
507/* 646/*
508 * Called for each booted CPU to set up machine checks. 647 * Called for each booted CPU to set up machine checks.
509 * Must be called with preempt off. 648 * Must be called with preempt off.
510 */ 649 */
511void __cpuinit mcheck_init(struct cpuinfo_x86 *c) 650void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
512{ 651{
513 mce_cpu_quirks(c); 652 if (!mce_available(c))
653 return;
514 654
515 if (mce_dont_init || 655 if (mce_cap_init() < 0) {
516 !mce_available(c)) 656 mce_dont_init = 1;
517 return; 657 return;
658 }
659 mce_cpu_quirks(c);
518 660
519 mce_init(NULL); 661 mce_init(NULL);
520 mce_cpu_features(c); 662 mce_cpu_features(c);
663 mce_init_timer();
521} 664}
522 665
523/* 666/*
@@ -573,7 +716,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
573{ 716{
574 unsigned long *cpu_tsc; 717 unsigned long *cpu_tsc;
575 static DEFINE_MUTEX(mce_read_mutex); 718 static DEFINE_MUTEX(mce_read_mutex);
576 unsigned next; 719 unsigned prev, next;
577 char __user *buf = ubuf; 720 char __user *buf = ubuf;
578 int i, err; 721 int i, err;
579 722
@@ -592,25 +735,32 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
592 } 735 }
593 736
594 err = 0; 737 err = 0;
595 for (i = 0; i < next; i++) { 738 prev = 0;
596 unsigned long start = jiffies; 739 do {
597 740 for (i = prev; i < next; i++) {
598 while (!mcelog.entry[i].finished) { 741 unsigned long start = jiffies;
599 if (time_after_eq(jiffies, start + 2)) { 742
600 memset(mcelog.entry + i,0, sizeof(struct mce)); 743 while (!mcelog.entry[i].finished) {
601 goto timeout; 744 if (time_after_eq(jiffies, start + 2)) {
745 memset(mcelog.entry + i, 0,
746 sizeof(struct mce));
747 goto timeout;
748 }
749 cpu_relax();
602 } 750 }
603 cpu_relax(); 751 smp_rmb();
752 err |= copy_to_user(buf, mcelog.entry + i,
753 sizeof(struct mce));
754 buf += sizeof(struct mce);
755timeout:
756 ;
604 } 757 }
605 smp_rmb();
606 err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
607 buf += sizeof(struct mce);
608 timeout:
609 ;
610 }
611 758
612 memset(mcelog.entry, 0, next * sizeof(struct mce)); 759 memset(mcelog.entry + prev, 0,
613 mcelog.next = 0; 760 (next - prev) * sizeof(struct mce));
761 prev = next;
762 next = cmpxchg(&mcelog.next, prev, 0);
763 } while (next != prev);
614 764
615 synchronize_sched(); 765 synchronize_sched();
616 766
@@ -680,20 +830,6 @@ static struct miscdevice mce_log_device = {
680 &mce_chrdev_ops, 830 &mce_chrdev_ops,
681}; 831};
682 832
683static unsigned long old_cr4 __initdata;
684
685void __init stop_mce(void)
686{
687 old_cr4 = read_cr4();
688 clear_in_cr4(X86_CR4_MCE);
689}
690
691void __init restart_mce(void)
692{
693 if (old_cr4 & X86_CR4_MCE)
694 set_in_cr4(X86_CR4_MCE);
695}
696
697/* 833/*
698 * Old style boot options parsing. Only for compatibility. 834 * Old style boot options parsing. Only for compatibility.
699 */ 835 */
@@ -703,8 +839,7 @@ static int __init mcheck_disable(char *str)
703 return 1; 839 return 1;
704} 840}
705 841
706/* mce=off disables machine check. Note you can re-enable it later 842/* mce=off disables machine check.
707 using sysfs.
708 mce=TOLERANCELEVEL (number, see above) 843 mce=TOLERANCELEVEL (number, see above)
709 mce=bootlog Log MCEs from before booting. Disabled by default on AMD. 844 mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
710 mce=nobootlog Don't log MCEs from before booting. */ 845 mce=nobootlog Don't log MCEs from before booting. */
@@ -728,6 +863,29 @@ __setup("mce=", mcheck_enable);
728 * Sysfs support 863 * Sysfs support
729 */ 864 */
730 865
866/*
867 * Disable machine checks on suspend and shutdown. We can't really handle
868 * them later.
869 */
870static int mce_disable(void)
871{
872 int i;
873
874 for (i = 0; i < banks; i++)
875 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
876 return 0;
877}
878
879static int mce_suspend(struct sys_device *dev, pm_message_t state)
880{
881 return mce_disable();
882}
883
884static int mce_shutdown(struct sys_device *dev)
885{
886 return mce_disable();
887}
888
731/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. 889/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
732 Only one CPU is active at this time, the others get readded later using 890 Only one CPU is active at this time, the others get readded later using
733 CPU hotplug. */ 891 CPU hotplug. */
@@ -738,20 +896,24 @@ static int mce_resume(struct sys_device *dev)
738 return 0; 896 return 0;
739} 897}
740 898
899static void mce_cpu_restart(void *data)
900{
901 del_timer_sync(&__get_cpu_var(mce_timer));
902 if (mce_available(&current_cpu_data))
903 mce_init(NULL);
904 mce_init_timer();
905}
906
741/* Reinit MCEs after user configuration changes */ 907/* Reinit MCEs after user configuration changes */
742static void mce_restart(void) 908static void mce_restart(void)
743{ 909{
744 if (next_interval)
745 cancel_delayed_work(&mcheck_work);
746 /* Timer race is harmless here */
747 on_each_cpu(mce_init, NULL, 1);
748 next_interval = check_interval * HZ; 910 next_interval = check_interval * HZ;
749 if (next_interval) 911 on_each_cpu(mce_cpu_restart, NULL, 1);
750 schedule_delayed_work(&mcheck_work,
751 round_jiffies_relative(next_interval));
752} 912}
753 913
754static struct sysdev_class mce_sysclass = { 914static struct sysdev_class mce_sysclass = {
915 .suspend = mce_suspend,
916 .shutdown = mce_shutdown,
755 .resume = mce_resume, 917 .resume = mce_resume,
756 .name = "machinecheck", 918 .name = "machinecheck",
757}; 919};
@@ -778,16 +940,26 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit
778 } \ 940 } \
779 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); 941 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
780 942
781/* 943static struct sysdev_attribute *bank_attrs;
782 * TBD should generate these dynamically based on number of available banks. 944
783 * Have only 6 contol banks in /sysfs until then. 945static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
784 */ 946 char *buf)
785ACCESSOR(bank0ctl,bank[0],mce_restart()) 947{
786ACCESSOR(bank1ctl,bank[1],mce_restart()) 948 u64 b = bank[attr - bank_attrs];
787ACCESSOR(bank2ctl,bank[2],mce_restart()) 949 return sprintf(buf, "%llx\n", b);
788ACCESSOR(bank3ctl,bank[3],mce_restart()) 950}
789ACCESSOR(bank4ctl,bank[4],mce_restart()) 951
790ACCESSOR(bank5ctl,bank[5],mce_restart()) 952static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
953 const char *buf, size_t siz)
954{
955 char *end;
956 u64 new = simple_strtoull(buf, &end, 0);
957 if (end == buf)
958 return -EINVAL;
959 bank[attr - bank_attrs] = new;
960 mce_restart();
961 return end-buf;
962}
791 963
792static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, 964static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
793 char *buf) 965 char *buf)
@@ -814,8 +986,6 @@ static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
814static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); 986static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
815ACCESSOR(check_interval,check_interval,mce_restart()) 987ACCESSOR(check_interval,check_interval,mce_restart())
816static struct sysdev_attribute *mce_attributes[] = { 988static struct sysdev_attribute *mce_attributes[] = {
817 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
818 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
819 &attr_tolerant.attr, &attr_check_interval, &attr_trigger, 989 &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
820 NULL 990 NULL
821}; 991};
@@ -845,11 +1015,22 @@ static __cpuinit int mce_create_device(unsigned int cpu)
845 if (err) 1015 if (err)
846 goto error; 1016 goto error;
847 } 1017 }
1018 for (i = 0; i < banks; i++) {
1019 err = sysdev_create_file(&per_cpu(device_mce, cpu),
1020 &bank_attrs[i]);
1021 if (err)
1022 goto error2;
1023 }
848 cpu_set(cpu, mce_device_initialized); 1024 cpu_set(cpu, mce_device_initialized);
849 1025
850 return 0; 1026 return 0;
1027error2:
1028 while (--i >= 0) {
1029 sysdev_remove_file(&per_cpu(device_mce, cpu),
1030 &bank_attrs[i]);
1031 }
851error: 1032error:
852 while (i--) { 1033 while (--i >= 0) {
853 sysdev_remove_file(&per_cpu(device_mce,cpu), 1034 sysdev_remove_file(&per_cpu(device_mce,cpu),
854 mce_attributes[i]); 1035 mce_attributes[i]);
855 } 1036 }
@@ -868,15 +1049,46 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
868 for (i = 0; mce_attributes[i]; i++) 1049 for (i = 0; mce_attributes[i]; i++)
869 sysdev_remove_file(&per_cpu(device_mce,cpu), 1050 sysdev_remove_file(&per_cpu(device_mce,cpu),
870 mce_attributes[i]); 1051 mce_attributes[i]);
1052 for (i = 0; i < banks; i++)
1053 sysdev_remove_file(&per_cpu(device_mce, cpu),
1054 &bank_attrs[i]);
871 sysdev_unregister(&per_cpu(device_mce,cpu)); 1055 sysdev_unregister(&per_cpu(device_mce,cpu));
872 cpu_clear(cpu, mce_device_initialized); 1056 cpu_clear(cpu, mce_device_initialized);
873} 1057}
874 1058
1059/* Make sure there are no machine checks on offlined CPUs. */
1060static void mce_disable_cpu(void *h)
1061{
1062 int i;
1063 unsigned long action = *(unsigned long *)h;
1064
1065 if (!mce_available(&current_cpu_data))
1066 return;
1067 if (!(action & CPU_TASKS_FROZEN))
1068 cmci_clear();
1069 for (i = 0; i < banks; i++)
1070 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1071}
1072
1073static void mce_reenable_cpu(void *h)
1074{
1075 int i;
1076 unsigned long action = *(unsigned long *)h;
1077
1078 if (!mce_available(&current_cpu_data))
1079 return;
1080 if (!(action & CPU_TASKS_FROZEN))
1081 cmci_reenable();
1082 for (i = 0; i < banks; i++)
1083 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
1084}
1085
875/* Get notified when a cpu comes on/off. Be hotplug friendly. */ 1086/* Get notified when a cpu comes on/off. Be hotplug friendly. */
876static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, 1087static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
877 unsigned long action, void *hcpu) 1088 unsigned long action, void *hcpu)
878{ 1089{
879 unsigned int cpu = (unsigned long)hcpu; 1090 unsigned int cpu = (unsigned long)hcpu;
1091 struct timer_list *t = &per_cpu(mce_timer, cpu);
880 1092
881 switch (action) { 1093 switch (action) {
882 case CPU_ONLINE: 1094 case CPU_ONLINE:
@@ -891,6 +1103,21 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
891 threshold_cpu_callback(action, cpu); 1103 threshold_cpu_callback(action, cpu);
892 mce_remove_device(cpu); 1104 mce_remove_device(cpu);
893 break; 1105 break;
1106 case CPU_DOWN_PREPARE:
1107 case CPU_DOWN_PREPARE_FROZEN:
1108 del_timer_sync(t);
1109 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
1110 break;
1111 case CPU_DOWN_FAILED:
1112 case CPU_DOWN_FAILED_FROZEN:
1113 t->expires = round_jiffies(jiffies + next_interval);
1114 add_timer_on(t, cpu);
1115 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1116 break;
1117 case CPU_POST_DEAD:
1118 /* intentionally ignoring frozen here */
1119 cmci_rediscover(cpu);
1120 break;
894 } 1121 }
895 return NOTIFY_OK; 1122 return NOTIFY_OK;
896} 1123}
@@ -899,6 +1126,34 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = {
899 .notifier_call = mce_cpu_callback, 1126 .notifier_call = mce_cpu_callback,
900}; 1127};
901 1128
1129static __init int mce_init_banks(void)
1130{
1131 int i;
1132
1133 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1134 GFP_KERNEL);
1135 if (!bank_attrs)
1136 return -ENOMEM;
1137
1138 for (i = 0; i < banks; i++) {
1139 struct sysdev_attribute *a = &bank_attrs[i];
1140 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
1141 if (!a->attr.name)
1142 goto nomem;
1143 a->attr.mode = 0644;
1144 a->show = show_bank;
1145 a->store = set_bank;
1146 }
1147 return 0;
1148
1149nomem:
1150 while (--i >= 0)
1151 kfree(bank_attrs[i].attr.name);
1152 kfree(bank_attrs);
1153 bank_attrs = NULL;
1154 return -ENOMEM;
1155}
1156
902static __init int mce_init_device(void) 1157static __init int mce_init_device(void)
903{ 1158{
904 int err; 1159 int err;
@@ -906,6 +1161,11 @@ static __init int mce_init_device(void)
906 1161
907 if (!mce_available(&boot_cpu_data)) 1162 if (!mce_available(&boot_cpu_data))
908 return -EIO; 1163 return -EIO;
1164
1165 err = mce_init_banks();
1166 if (err)
1167 return err;
1168
909 err = sysdev_class_register(&mce_sysclass); 1169 err = sysdev_class_register(&mce_sysclass);
910 if (err) 1170 if (err)
911 return err; 1171 return err;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index f2ee0ae29bd6..c5a32f92d07e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -67,7 +67,7 @@ static struct threshold_block threshold_defaults = {
67struct threshold_bank { 67struct threshold_bank {
68 struct kobject *kobj; 68 struct kobject *kobj;
69 struct threshold_block *blocks; 69 struct threshold_block *blocks;
70 cpumask_t cpus; 70 cpumask_var_t cpus;
71}; 71};
72static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); 72static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
73 73
@@ -79,6 +79,8 @@ static unsigned char shared_bank[NR_BANKS] = {
79 79
80static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ 80static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
81 81
82static void amd_threshold_interrupt(void);
83
82/* 84/*
83 * CPU Initialization 85 * CPU Initialization
84 */ 86 */
@@ -174,6 +176,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
174 tr.reset = 0; 176 tr.reset = 0;
175 tr.old_limit = 0; 177 tr.old_limit = 0;
176 threshold_restart_bank(&tr); 178 threshold_restart_bank(&tr);
179
180 mce_threshold_vector = amd_threshold_interrupt;
177 } 181 }
178 } 182 }
179} 183}
@@ -187,19 +191,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
187 * the interrupt goes off when error_count reaches threshold_limit. 191 * the interrupt goes off when error_count reaches threshold_limit.
188 * the handler will simply log mcelog w/ software defined bank number. 192 * the handler will simply log mcelog w/ software defined bank number.
189 */ 193 */
190asmlinkage void mce_threshold_interrupt(void) 194static void amd_threshold_interrupt(void)
191{ 195{
192 unsigned int bank, block; 196 unsigned int bank, block;
193 struct mce m; 197 struct mce m;
194 u32 low = 0, high = 0, address = 0; 198 u32 low = 0, high = 0, address = 0;
195 199
196 ack_APIC_irq(); 200 mce_setup(&m);
197 exit_idle();
198 irq_enter();
199
200 memset(&m, 0, sizeof(m));
201 rdtscll(m.tsc);
202 m.cpu = smp_processor_id();
203 201
204 /* assume first bank caused it */ 202 /* assume first bank caused it */
205 for (bank = 0; bank < NR_BANKS; ++bank) { 203 for (bank = 0; bank < NR_BANKS; ++bank) {
@@ -233,7 +231,8 @@ asmlinkage void mce_threshold_interrupt(void)
233 231
234 /* Log the machine check that caused the threshold 232 /* Log the machine check that caused the threshold
235 event. */ 233 event. */
236 do_machine_check(NULL, 0); 234 machine_check_poll(MCP_TIMESTAMP,
235 &__get_cpu_var(mce_poll_banks));
237 236
238 if (high & MASK_OVERFLOW_HI) { 237 if (high & MASK_OVERFLOW_HI) {
239 rdmsrl(address, m.misc); 238 rdmsrl(address, m.misc);
@@ -243,13 +242,10 @@ asmlinkage void mce_threshold_interrupt(void)
243 + bank * NR_BLOCKS 242 + bank * NR_BLOCKS
244 + block; 243 + block;
245 mce_log(&m); 244 mce_log(&m);
246 goto out; 245 return;
247 } 246 }
248 } 247 }
249 } 248 }
250out:
251 inc_irq_stat(irq_threshold_count);
252 irq_exit();
253} 249}
254 250
255/* 251/*
@@ -481,7 +477,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
481 477
482#ifdef CONFIG_SMP 478#ifdef CONFIG_SMP
483 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ 479 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */
484 i = first_cpu(per_cpu(cpu_core_map, cpu)); 480 i = cpumask_first(&per_cpu(cpu_core_map, cpu));
485 481
486 /* first core not up yet */ 482 /* first core not up yet */
487 if (cpu_data(i).cpu_core_id) 483 if (cpu_data(i).cpu_core_id)
@@ -501,7 +497,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
501 if (err) 497 if (err)
502 goto out; 498 goto out;
503 499
504 b->cpus = per_cpu(cpu_core_map, cpu); 500 cpumask_copy(b->cpus, &per_cpu(cpu_core_map, cpu));
505 per_cpu(threshold_banks, cpu)[bank] = b; 501 per_cpu(threshold_banks, cpu)[bank] = b;
506 goto out; 502 goto out;
507 } 503 }
@@ -512,15 +508,20 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
512 err = -ENOMEM; 508 err = -ENOMEM;
513 goto out; 509 goto out;
514 } 510 }
511 if (!alloc_cpumask_var(&b->cpus, GFP_KERNEL)) {
512 kfree(b);
513 err = -ENOMEM;
514 goto out;
515 }
515 516
516 b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj); 517 b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj);
517 if (!b->kobj) 518 if (!b->kobj)
518 goto out_free; 519 goto out_free;
519 520
520#ifndef CONFIG_SMP 521#ifndef CONFIG_SMP
521 b->cpus = CPU_MASK_ALL; 522 cpumask_setall(b->cpus);
522#else 523#else
523 b->cpus = per_cpu(cpu_core_map, cpu); 524 cpumask_copy(b->cpus, &per_cpu(cpu_core_map, cpu));
524#endif 525#endif
525 526
526 per_cpu(threshold_banks, cpu)[bank] = b; 527 per_cpu(threshold_banks, cpu)[bank] = b;
@@ -529,7 +530,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
529 if (err) 530 if (err)
530 goto out_free; 531 goto out_free;
531 532
532 for_each_cpu_mask_nr(i, b->cpus) { 533 for_each_cpu(i, b->cpus) {
533 if (i == cpu) 534 if (i == cpu)
534 continue; 535 continue;
535 536
@@ -545,6 +546,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
545 546
546out_free: 547out_free:
547 per_cpu(threshold_banks, cpu)[bank] = NULL; 548 per_cpu(threshold_banks, cpu)[bank] = NULL;
549 free_cpumask_var(b->cpus);
548 kfree(b); 550 kfree(b);
549out: 551out:
550 return err; 552 return err;
@@ -619,7 +621,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
619#endif 621#endif
620 622
621 /* remove all sibling symlinks before unregistering */ 623 /* remove all sibling symlinks before unregistering */
622 for_each_cpu_mask_nr(i, b->cpus) { 624 for_each_cpu(i, b->cpus) {
623 if (i == cpu) 625 if (i == cpu)
624 continue; 626 continue;
625 627
@@ -632,6 +634,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
632free_out: 634free_out:
633 kobject_del(b->kobj); 635 kobject_del(b->kobj);
634 kobject_put(b->kobj); 636 kobject_put(b->kobj);
637 free_cpumask_var(b->cpus);
635 kfree(b); 638 kfree(b);
636 per_cpu(threshold_banks, cpu)[bank] = NULL; 639 per_cpu(threshold_banks, cpu)[bank] = NULL;
637} 640}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index f44c36624360..aaa7d9730938 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -1,17 +1,21 @@
1/* 1/*
2 * Intel specific MCE features. 2 * Intel specific MCE features.
3 * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca> 3 * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
4 * Copyright (C) 2008, 2009 Intel Corporation
5 * Author: Andi Kleen
4 */ 6 */
5 7
6#include <linux/init.h> 8#include <linux/init.h>
7#include <linux/interrupt.h> 9#include <linux/interrupt.h>
8#include <linux/percpu.h> 10#include <linux/percpu.h>
9#include <asm/processor.h> 11#include <asm/processor.h>
12#include <asm/apic.h>
10#include <asm/msr.h> 13#include <asm/msr.h>
11#include <asm/mce.h> 14#include <asm/mce.h>
12#include <asm/hw_irq.h> 15#include <asm/hw_irq.h>
13#include <asm/idle.h> 16#include <asm/idle.h>
14#include <asm/therm_throt.h> 17#include <asm/therm_throt.h>
18#include <asm/apic.h>
15 19
16asmlinkage void smp_thermal_interrupt(void) 20asmlinkage void smp_thermal_interrupt(void)
17{ 21{
@@ -24,7 +28,7 @@ asmlinkage void smp_thermal_interrupt(void)
24 28
25 rdmsrl(MSR_IA32_THERM_STATUS, msr_val); 29 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
26 if (therm_throt_process(msr_val & 1)) 30 if (therm_throt_process(msr_val & 1))
27 mce_log_therm_throt_event(smp_processor_id(), msr_val); 31 mce_log_therm_throt_event(msr_val);
28 32
29 inc_irq_stat(irq_thermal_count); 33 inc_irq_stat(irq_thermal_count);
30 irq_exit(); 34 irq_exit();
@@ -48,13 +52,13 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
48 */ 52 */
49 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 53 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
50 h = apic_read(APIC_LVTTHMR); 54 h = apic_read(APIC_LVTTHMR);
51 if ((l & (1 << 3)) && (h & APIC_DM_SMI)) { 55 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
52 printk(KERN_DEBUG 56 printk(KERN_DEBUG
53 "CPU%d: Thermal monitoring handled by SMI\n", cpu); 57 "CPU%d: Thermal monitoring handled by SMI\n", cpu);
54 return; 58 return;
55 } 59 }
56 60
57 if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13))) 61 if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
58 tm2 = 1; 62 tm2 = 1;
59 63
60 if (h & APIC_VECTOR_MASK) { 64 if (h & APIC_VECTOR_MASK) {
@@ -72,7 +76,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
72 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); 76 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h);
73 77
74 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 78 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
75 wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h); 79 wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
76 80
77 l = apic_read(APIC_LVTTHMR); 81 l = apic_read(APIC_LVTTHMR);
78 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); 82 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
@@ -84,7 +88,209 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
84 return; 88 return;
85} 89}
86 90
91/*
92 * Support for Intel Correct Machine Check Interrupts. This allows
93 * the CPU to raise an interrupt when a corrected machine check happened.
94 * Normally we pick those up using a regular polling timer.
95 * Also supports reliable discovery of shared banks.
96 */
97
98static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
99
100/*
101 * cmci_discover_lock protects against parallel discovery attempts
102 * which could race against each other.
103 */
104static DEFINE_SPINLOCK(cmci_discover_lock);
105
106#define CMCI_THRESHOLD 1
107
108static int cmci_supported(int *banks)
109{
110 u64 cap;
111
112 /*
113 * Vendor check is not strictly needed, but the initial
114 * initialization is vendor keyed and this
115 * makes sure none of the backdoors are entered otherwise.
116 */
117 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
118 return 0;
119 if (!cpu_has_apic || lapic_get_maxlvt() < 6)
120 return 0;
121 rdmsrl(MSR_IA32_MCG_CAP, cap);
122 *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
123 return !!(cap & MCG_CMCI_P);
124}
125
126/*
127 * The interrupt handler. This is called on every event.
128 * Just call the poller directly to log any events.
129 * This could in theory increase the threshold under high load,
130 * but doesn't for now.
131 */
132static void intel_threshold_interrupt(void)
133{
134 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
135 mce_notify_user();
136}
137
138static void print_update(char *type, int *hdr, int num)
139{
140 if (*hdr == 0)
141 printk(KERN_INFO "CPU %d MCA banks", smp_processor_id());
142 *hdr = 1;
143 printk(KERN_CONT " %s:%d", type, num);
144}
145
146/*
147 * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
148 * on this CPU. Use the algorithm recommended in the SDM to discover shared
149 * banks.
150 */
151static void cmci_discover(int banks, int boot)
152{
153 unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
154 int hdr = 0;
155 int i;
156
157 spin_lock(&cmci_discover_lock);
158 for (i = 0; i < banks; i++) {
159 u64 val;
160
161 if (test_bit(i, owned))
162 continue;
163
164 rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
165
166 /* Already owned by someone else? */
167 if (val & CMCI_EN) {
168 if (test_and_clear_bit(i, owned) || boot)
169 print_update("SHD", &hdr, i);
170 __clear_bit(i, __get_cpu_var(mce_poll_banks));
171 continue;
172 }
173
174 val |= CMCI_EN | CMCI_THRESHOLD;
175 wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
176 rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
177
178 /* Did the enable bit stick? -- the bank supports CMCI */
179 if (val & CMCI_EN) {
180 if (!test_and_set_bit(i, owned) || boot)
181 print_update("CMCI", &hdr, i);
182 __clear_bit(i, __get_cpu_var(mce_poll_banks));
183 } else {
184 WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
185 }
186 }
187 spin_unlock(&cmci_discover_lock);
188 if (hdr)
189 printk(KERN_CONT "\n");
190}
191
192/*
193 * Just in case we missed an event during initialization check
194 * all the CMCI owned banks.
195 */
196void cmci_recheck(void)
197{
198 unsigned long flags;
199 int banks;
200
201 if (!mce_available(&current_cpu_data) || !cmci_supported(&banks))
202 return;
203 local_irq_save(flags);
204 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
205 local_irq_restore(flags);
206}
207
208/*
209 * Disable CMCI on this CPU for all banks it owns when it goes down.
210 * This allows other CPUs to claim the banks on rediscovery.
211 */
212void cmci_clear(void)
213{
214 int i;
215 int banks;
216 u64 val;
217
218 if (!cmci_supported(&banks))
219 return;
220 spin_lock(&cmci_discover_lock);
221 for (i = 0; i < banks; i++) {
222 if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
223 continue;
224 /* Disable CMCI */
225 rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
226 val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
227 wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
228 __clear_bit(i, __get_cpu_var(mce_banks_owned));
229 }
230 spin_unlock(&cmci_discover_lock);
231}
232
233/*
234 * After a CPU went down cycle through all the others and rediscover
235 * Must run in process context.
236 */
237void cmci_rediscover(int dying)
238{
239 int banks;
240 int cpu;
241 cpumask_var_t old;
242
243 if (!cmci_supported(&banks))
244 return;
245 if (!alloc_cpumask_var(&old, GFP_KERNEL))
246 return;
247 cpumask_copy(old, &current->cpus_allowed);
248
249 for_each_online_cpu (cpu) {
250 if (cpu == dying)
251 continue;
252 if (set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)))
253 continue;
254 /* Recheck banks in case CPUs don't all have the same */
255 if (cmci_supported(&banks))
256 cmci_discover(banks, 0);
257 }
258
259 set_cpus_allowed_ptr(current, old);
260 free_cpumask_var(old);
261}
262
263/*
264 * Reenable CMCI on this CPU in case a CPU down failed.
265 */
266void cmci_reenable(void)
267{
268 int banks;
269 if (cmci_supported(&banks))
270 cmci_discover(banks, 0);
271}
272
273static __cpuinit void intel_init_cmci(void)
274{
275 int banks;
276
277 if (!cmci_supported(&banks))
278 return;
279
280 mce_threshold_vector = intel_threshold_interrupt;
281 cmci_discover(banks, 1);
282 /*
283 * For CPU #0 this runs with still disabled APIC, but that's
284 * ok because only the vector is set up. We still do another
285 * check for the banks later for CPU #0 just to make sure
286 * to not miss any events.
287 */
288 apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
289 cmci_recheck();
290}
291
87void mce_intel_feature_init(struct cpuinfo_x86 *c) 292void mce_intel_feature_init(struct cpuinfo_x86 *c)
88{ 293{
89 intel_init_thermal(c); 294 intel_init_thermal(c);
295 intel_init_cmci();
90} 296}
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index 9b60fce09f75..f53bdcbaf382 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -85,7 +85,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
85 */ 85 */
86 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 86 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
87 h = apic_read(APIC_LVTTHMR); 87 h = apic_read(APIC_LVTTHMR);
88 if ((l & (1<<3)) && (h & APIC_DM_SMI)) { 88 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
89 printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", 89 printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",
90 cpu); 90 cpu);
91 return; /* -EBUSY */ 91 return; /* -EBUSY */
@@ -111,7 +111,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
111 vendor_thermal_interrupt = intel_thermal_interrupt; 111 vendor_thermal_interrupt = intel_thermal_interrupt;
112 112
113 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 113 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
114 wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h); 114 wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
115 115
116 l = apic_read(APIC_LVTTHMR); 116 l = apic_read(APIC_LVTTHMR);
117 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); 117 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
new file mode 100644
index 000000000000..23ee9e730f78
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -0,0 +1,29 @@
1/*
2 * Common corrected MCE threshold handler code:
3 */
4#include <linux/interrupt.h>
5#include <linux/kernel.h>
6
7#include <asm/irq_vectors.h>
8#include <asm/apic.h>
9#include <asm/idle.h>
10#include <asm/mce.h>
11
12static void default_threshold_interrupt(void)
13{
14 printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n",
15 THRESHOLD_APIC_VECTOR);
16}
17
18void (*mce_threshold_vector)(void) = default_threshold_interrupt;
19
20asmlinkage void mce_threshold_interrupt(void)
21{
22 exit_idle();
23 irq_enter();
24 inc_irq_stat(irq_threshold_count);
25 mce_threshold_vector();
26 irq_exit();
27 /* Ack only at the end to avoid potential reentry */
28 ack_APIC_irq();
29}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 9abd48b22674..f6c70a164e32 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -19,7 +19,7 @@
19#include <linux/nmi.h> 19#include <linux/nmi.h>
20#include <linux/kprobes.h> 20#include <linux/kprobes.h>
21 21
22#include <asm/apic.h> 22#include <asm/genapic.h>
23#include <asm/intel_arch_perfmon.h> 23#include <asm/intel_arch_perfmon.h>
24 24
25struct nmi_watchdog_ctlblk { 25struct nmi_watchdog_ctlblk {
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 01b1244ef1c0..d67e0e48bc2d 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -7,11 +7,10 @@
7/* 7/*
8 * Get CPU information for use by the procfs. 8 * Get CPU information for use by the procfs.
9 */ 9 */
10#ifdef CONFIG_X86_32
11static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c, 10static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
12 unsigned int cpu) 11 unsigned int cpu)
13{ 12{
14#ifdef CONFIG_X86_HT 13#ifdef CONFIG_SMP
15 if (c->x86_max_cores * smp_num_siblings > 1) { 14 if (c->x86_max_cores * smp_num_siblings > 1) {
16 seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); 15 seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
17 seq_printf(m, "siblings\t: %d\n", 16 seq_printf(m, "siblings\t: %d\n",
@@ -24,6 +23,7 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
24#endif 23#endif
25} 24}
26 25
26#ifdef CONFIG_X86_32
27static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) 27static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
28{ 28{
29 /* 29 /*
@@ -50,22 +50,6 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
50 c->wp_works_ok ? "yes" : "no"); 50 c->wp_works_ok ? "yes" : "no");
51} 51}
52#else 52#else
53static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
54 unsigned int cpu)
55{
56#ifdef CONFIG_SMP
57 if (c->x86_max_cores * smp_num_siblings > 1) {
58 seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
59 seq_printf(m, "siblings\t: %d\n",
60 cpus_weight(per_cpu(cpu_core_map, cpu)));
61 seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
62 seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
63 seq_printf(m, "apicid\t\t: %d\n", c->apicid);
64 seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid);
65 }
66#endif
67}
68
69static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) 53static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
70{ 54{
71 seq_printf(m, 55 seq_printf(m,
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index c689d19e35ab..ff958248e61d 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -24,12 +24,10 @@
24#include <asm/apic.h> 24#include <asm/apic.h>
25#include <asm/hpet.h> 25#include <asm/hpet.h>
26#include <linux/kdebug.h> 26#include <linux/kdebug.h>
27#include <asm/smp.h> 27#include <asm/cpu.h>
28#include <asm/reboot.h> 28#include <asm/reboot.h>
29#include <asm/virtext.h> 29#include <asm/virtext.h>
30 30
31#include <mach_ipi.h>
32
33 31
34#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) 32#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
35 33
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 6b1f6f6f8661..95ea5fa7d444 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -10,10 +10,12 @@
10#include <linux/kdebug.h> 10#include <linux/kdebug.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/ptrace.h> 12#include <linux/ptrace.h>
13#include <linux/ftrace.h>
13#include <linux/kexec.h> 14#include <linux/kexec.h>
14#include <linux/bug.h> 15#include <linux/bug.h>
15#include <linux/nmi.h> 16#include <linux/nmi.h>
16#include <linux/sysfs.h> 17#include <linux/sysfs.h>
18#include <linux/ftrace.h>
17 19
18#include <asm/stacktrace.h> 20#include <asm/stacktrace.h>
19 21
@@ -99,7 +101,7 @@ print_context_stack(struct thread_info *tinfo,
99 frame = frame->next_frame; 101 frame = frame->next_frame;
100 bp = (unsigned long) frame; 102 bp = (unsigned long) frame;
101 } else { 103 } else {
102 ops->address(data, addr, bp == 0); 104 ops->address(data, addr, 0);
103 } 105 }
104 print_ftrace_graph_addr(addr, data, ops, tinfo, graph); 106 print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
105 } 107 }
@@ -195,6 +197,11 @@ unsigned __kprobes long oops_begin(void)
195 int cpu; 197 int cpu;
196 unsigned long flags; 198 unsigned long flags;
197 199
200 /* notify the hw-branch tracer so it may disable tracing and
201 add the last trace to the trace buffer -
202 the earlier this happens, the more useful the trace. */
203 trace_hw_branch_oops();
204
198 oops_enter(); 205 oops_enter();
199 206
200 /* racy, but better than risking deadlock. */ 207 /* racy, but better than risking deadlock. */
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index c302d0707048..d35db5993fd6 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -106,7 +106,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
106 const struct stacktrace_ops *ops, void *data) 106 const struct stacktrace_ops *ops, void *data)
107{ 107{
108 const unsigned cpu = get_cpu(); 108 const unsigned cpu = get_cpu();
109 unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; 109 unsigned long *irq_stack_end =
110 (unsigned long *)per_cpu(irq_stack_ptr, cpu);
110 unsigned used = 0; 111 unsigned used = 0;
111 struct thread_info *tinfo; 112 struct thread_info *tinfo;
112 int graph = 0; 113 int graph = 0;
@@ -160,23 +161,23 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
160 stack = (unsigned long *) estack_end[-2]; 161 stack = (unsigned long *) estack_end[-2];
161 continue; 162 continue;
162 } 163 }
163 if (irqstack_end) { 164 if (irq_stack_end) {
164 unsigned long *irqstack; 165 unsigned long *irq_stack;
165 irqstack = irqstack_end - 166 irq_stack = irq_stack_end -
166 (IRQSTACKSIZE - 64) / sizeof(*irqstack); 167 (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack);
167 168
168 if (stack >= irqstack && stack < irqstack_end) { 169 if (stack >= irq_stack && stack < irq_stack_end) {
169 if (ops->stack(data, "IRQ") < 0) 170 if (ops->stack(data, "IRQ") < 0)
170 break; 171 break;
171 bp = print_context_stack(tinfo, stack, bp, 172 bp = print_context_stack(tinfo, stack, bp,
172 ops, data, irqstack_end, &graph); 173 ops, data, irq_stack_end, &graph);
173 /* 174 /*
174 * We link to the next stack (which would be 175 * We link to the next stack (which would be
175 * the process stack normally) the last 176 * the process stack normally) the last
176 * pointer (index -1 to end) in the IRQ stack: 177 * pointer (index -1 to end) in the IRQ stack:
177 */ 178 */
178 stack = (unsigned long *) (irqstack_end[-1]); 179 stack = (unsigned long *) (irq_stack_end[-1]);
179 irqstack_end = NULL; 180 irq_stack_end = NULL;
180 ops->stack(data, "EOI"); 181 ops->stack(data, "EOI");
181 continue; 182 continue;
182 } 183 }
@@ -199,10 +200,10 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
199 unsigned long *stack; 200 unsigned long *stack;
200 int i; 201 int i;
201 const int cpu = smp_processor_id(); 202 const int cpu = smp_processor_id();
202 unsigned long *irqstack_end = 203 unsigned long *irq_stack_end =
203 (unsigned long *) (cpu_pda(cpu)->irqstackptr); 204 (unsigned long *)(per_cpu(irq_stack_ptr, cpu));
204 unsigned long *irqstack = 205 unsigned long *irq_stack =
205 (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); 206 (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE);
206 207
207 /* 208 /*
208 * debugging aid: "show_stack(NULL, NULL);" prints the 209 * debugging aid: "show_stack(NULL, NULL);" prints the
@@ -218,9 +219,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
218 219
219 stack = sp; 220 stack = sp;
220 for (i = 0; i < kstack_depth_to_print; i++) { 221 for (i = 0; i < kstack_depth_to_print; i++) {
221 if (stack >= irqstack && stack <= irqstack_end) { 222 if (stack >= irq_stack && stack <= irq_stack_end) {
222 if (stack == irqstack_end) { 223 if (stack == irq_stack_end) {
223 stack = (unsigned long *) (irqstack_end[-1]); 224 stack = (unsigned long *) (irq_stack_end[-1]);
224 printk(" <EOI> "); 225 printk(" <EOI> ");
225 } 226 }
226 } else { 227 } else {
@@ -241,7 +242,7 @@ void show_registers(struct pt_regs *regs)
241 int i; 242 int i;
242 unsigned long sp; 243 unsigned long sp;
243 const int cpu = smp_processor_id(); 244 const int cpu = smp_processor_id();
244 struct task_struct *cur = cpu_pda(cpu)->pcurrent; 245 struct task_struct *cur = current;
245 246
246 sp = regs->sp; 247 sp = regs->sp;
247 printk("CPU %d ", cpu); 248 printk("CPU %d ", cpu);
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index e85826829cf2..508bec1cee27 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -858,6 +858,9 @@ void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
858 */ 858 */
859void __init reserve_early(u64 start, u64 end, char *name) 859void __init reserve_early(u64 start, u64 end, char *name)
860{ 860{
861 if (start >= end)
862 return;
863
861 drop_overlaps_that_are_ok(start, end); 864 drop_overlaps_that_are_ok(start, end);
862 __reserve_early(start, end, name, 0); 865 __reserve_early(start, end, name, 0);
863} 866}
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 504ad198e4ad..639ad98238a2 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -13,8 +13,8 @@
13#include <asm/setup.h> 13#include <asm/setup.h>
14#include <xen/hvc-console.h> 14#include <xen/hvc-console.h>
15#include <asm/pci-direct.h> 15#include <asm/pci-direct.h>
16#include <asm/pgtable.h>
17#include <asm/fixmap.h> 16#include <asm/fixmap.h>
17#include <asm/pgtable.h>
18#include <linux/usb/ehci_def.h> 18#include <linux/usb/ehci_def.h>
19 19
20/* Simple VGA output */ 20/* Simple VGA output */
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index eb1ef3b67dd5..1736acc4d7aa 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -366,10 +366,12 @@ void __init efi_init(void)
366 SMBIOS_TABLE_GUID)) { 366 SMBIOS_TABLE_GUID)) {
367 efi.smbios = config_tables[i].table; 367 efi.smbios = config_tables[i].table;
368 printk(" SMBIOS=0x%lx ", config_tables[i].table); 368 printk(" SMBIOS=0x%lx ", config_tables[i].table);
369#ifdef CONFIG_X86_UV
369 } else if (!efi_guidcmp(config_tables[i].guid, 370 } else if (!efi_guidcmp(config_tables[i].guid,
370 UV_SYSTEM_TABLE_GUID)) { 371 UV_SYSTEM_TABLE_GUID)) {
371 efi.uv_systab = config_tables[i].table; 372 efi.uv_systab = config_tables[i].table;
372 printk(" UVsystab=0x%lx ", config_tables[i].table); 373 printk(" UVsystab=0x%lx ", config_tables[i].table);
374#endif
373 } else if (!efi_guidcmp(config_tables[i].guid, 375 } else if (!efi_guidcmp(config_tables[i].guid,
374 HCDP_TABLE_GUID)) { 376 HCDP_TABLE_GUID)) {
375 efi.hcdp = config_tables[i].table; 377 efi.hcdp = config_tables[i].table;
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index cb783b92c50c..22c3b7828c50 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -36,6 +36,7 @@
36#include <asm/proto.h> 36#include <asm/proto.h>
37#include <asm/efi.h> 37#include <asm/efi.h>
38#include <asm/cacheflush.h> 38#include <asm/cacheflush.h>
39#include <asm/fixmap.h>
39 40
40static pgd_t save_pgd __initdata; 41static pgd_t save_pgd __initdata;
41static unsigned long efi_flags __initdata; 42static unsigned long efi_flags __initdata;
diff --git a/arch/x86/kernel/efi_stub_32.S b/arch/x86/kernel/efi_stub_32.S
index ef00bb77d7e4..fbe66e626c09 100644
--- a/arch/x86/kernel/efi_stub_32.S
+++ b/arch/x86/kernel/efi_stub_32.S
@@ -6,7 +6,7 @@
6 */ 6 */
7 7
8#include <linux/linkage.h> 8#include <linux/linkage.h>
9#include <asm/page.h> 9#include <asm/page_types.h>
10 10
11/* 11/*
12 * efi_call_phys(void *, ...) is a function with variable parameters. 12 * efi_call_phys(void *, ...) is a function with variable parameters.
@@ -113,6 +113,7 @@ ENTRY(efi_call_phys)
113 movl (%edx), %ecx 113 movl (%edx), %ecx
114 pushl %ecx 114 pushl %ecx
115 ret 115 ret
116ENDPROC(efi_call_phys)
116.previous 117.previous
117 118
118.data 119.data
diff --git a/arch/x86/kernel/efi_stub_64.S b/arch/x86/kernel/efi_stub_64.S
index 99b47d48c9f4..4c07ccab8146 100644
--- a/arch/x86/kernel/efi_stub_64.S
+++ b/arch/x86/kernel/efi_stub_64.S
@@ -41,6 +41,7 @@ ENTRY(efi_call0)
41 addq $32, %rsp 41 addq $32, %rsp
42 RESTORE_XMM 42 RESTORE_XMM
43 ret 43 ret
44ENDPROC(efi_call0)
44 45
45ENTRY(efi_call1) 46ENTRY(efi_call1)
46 SAVE_XMM 47 SAVE_XMM
@@ -50,6 +51,7 @@ ENTRY(efi_call1)
50 addq $32, %rsp 51 addq $32, %rsp
51 RESTORE_XMM 52 RESTORE_XMM
52 ret 53 ret
54ENDPROC(efi_call1)
53 55
54ENTRY(efi_call2) 56ENTRY(efi_call2)
55 SAVE_XMM 57 SAVE_XMM
@@ -59,6 +61,7 @@ ENTRY(efi_call2)
59 addq $32, %rsp 61 addq $32, %rsp
60 RESTORE_XMM 62 RESTORE_XMM
61 ret 63 ret
64ENDPROC(efi_call2)
62 65
63ENTRY(efi_call3) 66ENTRY(efi_call3)
64 SAVE_XMM 67 SAVE_XMM
@@ -69,6 +72,7 @@ ENTRY(efi_call3)
69 addq $32, %rsp 72 addq $32, %rsp
70 RESTORE_XMM 73 RESTORE_XMM
71 ret 74 ret
75ENDPROC(efi_call3)
72 76
73ENTRY(efi_call4) 77ENTRY(efi_call4)
74 SAVE_XMM 78 SAVE_XMM
@@ -80,6 +84,7 @@ ENTRY(efi_call4)
80 addq $32, %rsp 84 addq $32, %rsp
81 RESTORE_XMM 85 RESTORE_XMM
82 ret 86 ret
87ENDPROC(efi_call4)
83 88
84ENTRY(efi_call5) 89ENTRY(efi_call5)
85 SAVE_XMM 90 SAVE_XMM
@@ -92,6 +97,7 @@ ENTRY(efi_call5)
92 addq $48, %rsp 97 addq $48, %rsp
93 RESTORE_XMM 98 RESTORE_XMM
94 ret 99 ret
100ENDPROC(efi_call5)
95 101
96ENTRY(efi_call6) 102ENTRY(efi_call6)
97 SAVE_XMM 103 SAVE_XMM
@@ -107,3 +113,4 @@ ENTRY(efi_call6)
107 addq $48, %rsp 113 addq $48, %rsp
108 RESTORE_XMM 114 RESTORE_XMM
109 ret 115 ret
116ENDPROC(efi_call6)
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 46469029e9d3..c929add475c9 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -30,12 +30,13 @@
30 * 1C(%esp) - %ds 30 * 1C(%esp) - %ds
31 * 20(%esp) - %es 31 * 20(%esp) - %es
32 * 24(%esp) - %fs 32 * 24(%esp) - %fs
33 * 28(%esp) - orig_eax 33 * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS
34 * 2C(%esp) - %eip 34 * 2C(%esp) - orig_eax
35 * 30(%esp) - %cs 35 * 30(%esp) - %eip
36 * 34(%esp) - %eflags 36 * 34(%esp) - %cs
37 * 38(%esp) - %oldesp 37 * 38(%esp) - %eflags
38 * 3C(%esp) - %oldss 38 * 3C(%esp) - %oldesp
39 * 40(%esp) - %oldss
39 * 40 *
40 * "current" is in register %ebx during any slow entries. 41 * "current" is in register %ebx during any slow entries.
41 */ 42 */
@@ -46,7 +47,7 @@
46#include <asm/errno.h> 47#include <asm/errno.h>
47#include <asm/segment.h> 48#include <asm/segment.h>
48#include <asm/smp.h> 49#include <asm/smp.h>
49#include <asm/page.h> 50#include <asm/page_types.h>
50#include <asm/desc.h> 51#include <asm/desc.h>
51#include <asm/percpu.h> 52#include <asm/percpu.h>
52#include <asm/dwarf2.h> 53#include <asm/dwarf2.h>
@@ -101,121 +102,221 @@
101#define resume_userspace_sig resume_userspace 102#define resume_userspace_sig resume_userspace
102#endif 103#endif
103 104
104#define SAVE_ALL \ 105/*
105 cld; \ 106 * User gs save/restore
106 pushl %fs; \ 107 *
107 CFI_ADJUST_CFA_OFFSET 4;\ 108 * %gs is used for userland TLS and kernel only uses it for stack
108 /*CFI_REL_OFFSET fs, 0;*/\ 109 * canary which is required to be at %gs:20 by gcc. Read the comment
109 pushl %es; \ 110 * at the top of stackprotector.h for more info.
110 CFI_ADJUST_CFA_OFFSET 4;\ 111 *
111 /*CFI_REL_OFFSET es, 0;*/\ 112 * Local labels 98 and 99 are used.
112 pushl %ds; \ 113 */
113 CFI_ADJUST_CFA_OFFSET 4;\ 114#ifdef CONFIG_X86_32_LAZY_GS
114 /*CFI_REL_OFFSET ds, 0;*/\ 115
115 pushl %eax; \ 116 /* unfortunately push/pop can't be no-op */
116 CFI_ADJUST_CFA_OFFSET 4;\ 117.macro PUSH_GS
117 CFI_REL_OFFSET eax, 0;\ 118 pushl $0
118 pushl %ebp; \ 119 CFI_ADJUST_CFA_OFFSET 4
119 CFI_ADJUST_CFA_OFFSET 4;\ 120.endm
120 CFI_REL_OFFSET ebp, 0;\ 121.macro POP_GS pop=0
121 pushl %edi; \ 122 addl $(4 + \pop), %esp
122 CFI_ADJUST_CFA_OFFSET 4;\ 123 CFI_ADJUST_CFA_OFFSET -(4 + \pop)
123 CFI_REL_OFFSET edi, 0;\ 124.endm
124 pushl %esi; \ 125.macro POP_GS_EX
125 CFI_ADJUST_CFA_OFFSET 4;\ 126.endm
126 CFI_REL_OFFSET esi, 0;\ 127
127 pushl %edx; \ 128 /* all the rest are no-op */
128 CFI_ADJUST_CFA_OFFSET 4;\ 129.macro PTGS_TO_GS
129 CFI_REL_OFFSET edx, 0;\ 130.endm
130 pushl %ecx; \ 131.macro PTGS_TO_GS_EX
131 CFI_ADJUST_CFA_OFFSET 4;\ 132.endm
132 CFI_REL_OFFSET ecx, 0;\ 133.macro GS_TO_REG reg
133 pushl %ebx; \ 134.endm
134 CFI_ADJUST_CFA_OFFSET 4;\ 135.macro REG_TO_PTGS reg
135 CFI_REL_OFFSET ebx, 0;\ 136.endm
136 movl $(__USER_DS), %edx; \ 137.macro SET_KERNEL_GS reg
137 movl %edx, %ds; \ 138.endm
138 movl %edx, %es; \ 139
139 movl $(__KERNEL_PERCPU), %edx; \ 140#else /* CONFIG_X86_32_LAZY_GS */
141
142.macro PUSH_GS
143 pushl %gs
144 CFI_ADJUST_CFA_OFFSET 4
145 /*CFI_REL_OFFSET gs, 0*/
146.endm
147
148.macro POP_GS pop=0
14998: popl %gs
150 CFI_ADJUST_CFA_OFFSET -4
151 /*CFI_RESTORE gs*/
152 .if \pop <> 0
153 add $\pop, %esp
154 CFI_ADJUST_CFA_OFFSET -\pop
155 .endif
156.endm
157.macro POP_GS_EX
158.pushsection .fixup, "ax"
15999: movl $0, (%esp)
160 jmp 98b
161.section __ex_table, "a"
162 .align 4
163 .long 98b, 99b
164.popsection
165.endm
166
167.macro PTGS_TO_GS
16898: mov PT_GS(%esp), %gs
169.endm
170.macro PTGS_TO_GS_EX
171.pushsection .fixup, "ax"
17299: movl $0, PT_GS(%esp)
173 jmp 98b
174.section __ex_table, "a"
175 .align 4
176 .long 98b, 99b
177.popsection
178.endm
179
180.macro GS_TO_REG reg
181 movl %gs, \reg
182 /*CFI_REGISTER gs, \reg*/
183.endm
184.macro REG_TO_PTGS reg
185 movl \reg, PT_GS(%esp)
186 /*CFI_REL_OFFSET gs, PT_GS*/
187.endm
188.macro SET_KERNEL_GS reg
189 movl $(__KERNEL_STACK_CANARY), \reg
190 movl \reg, %gs
191.endm
192
193#endif /* CONFIG_X86_32_LAZY_GS */
194
195.macro SAVE_ALL
196 cld
197 PUSH_GS
198 pushl %fs
199 CFI_ADJUST_CFA_OFFSET 4
200 /*CFI_REL_OFFSET fs, 0;*/
201 pushl %es
202 CFI_ADJUST_CFA_OFFSET 4
203 /*CFI_REL_OFFSET es, 0;*/
204 pushl %ds
205 CFI_ADJUST_CFA_OFFSET 4
206 /*CFI_REL_OFFSET ds, 0;*/
207 pushl %eax
208 CFI_ADJUST_CFA_OFFSET 4
209 CFI_REL_OFFSET eax, 0
210 pushl %ebp
211 CFI_ADJUST_CFA_OFFSET 4
212 CFI_REL_OFFSET ebp, 0
213 pushl %edi
214 CFI_ADJUST_CFA_OFFSET 4
215 CFI_REL_OFFSET edi, 0
216 pushl %esi
217 CFI_ADJUST_CFA_OFFSET 4
218 CFI_REL_OFFSET esi, 0
219 pushl %edx
220 CFI_ADJUST_CFA_OFFSET 4
221 CFI_REL_OFFSET edx, 0
222 pushl %ecx
223 CFI_ADJUST_CFA_OFFSET 4
224 CFI_REL_OFFSET ecx, 0
225 pushl %ebx
226 CFI_ADJUST_CFA_OFFSET 4
227 CFI_REL_OFFSET ebx, 0
228 movl $(__USER_DS), %edx
229 movl %edx, %ds
230 movl %edx, %es
231 movl $(__KERNEL_PERCPU), %edx
140 movl %edx, %fs 232 movl %edx, %fs
233 SET_KERNEL_GS %edx
234.endm
141 235
142#define RESTORE_INT_REGS \ 236.macro RESTORE_INT_REGS
143 popl %ebx; \ 237 popl %ebx
144 CFI_ADJUST_CFA_OFFSET -4;\ 238 CFI_ADJUST_CFA_OFFSET -4
145 CFI_RESTORE ebx;\ 239 CFI_RESTORE ebx
146 popl %ecx; \ 240 popl %ecx
147 CFI_ADJUST_CFA_OFFSET -4;\ 241 CFI_ADJUST_CFA_OFFSET -4
148 CFI_RESTORE ecx;\ 242 CFI_RESTORE ecx
149 popl %edx; \ 243 popl %edx
150 CFI_ADJUST_CFA_OFFSET -4;\ 244 CFI_ADJUST_CFA_OFFSET -4
151 CFI_RESTORE edx;\ 245 CFI_RESTORE edx
152 popl %esi; \ 246 popl %esi
153 CFI_ADJUST_CFA_OFFSET -4;\ 247 CFI_ADJUST_CFA_OFFSET -4
154 CFI_RESTORE esi;\ 248 CFI_RESTORE esi
155 popl %edi; \ 249 popl %edi
156 CFI_ADJUST_CFA_OFFSET -4;\ 250 CFI_ADJUST_CFA_OFFSET -4
157 CFI_RESTORE edi;\ 251 CFI_RESTORE edi
158 popl %ebp; \ 252 popl %ebp
159 CFI_ADJUST_CFA_OFFSET -4;\ 253 CFI_ADJUST_CFA_OFFSET -4
160 CFI_RESTORE ebp;\ 254 CFI_RESTORE ebp
161 popl %eax; \ 255 popl %eax
162 CFI_ADJUST_CFA_OFFSET -4;\ 256 CFI_ADJUST_CFA_OFFSET -4
163 CFI_RESTORE eax 257 CFI_RESTORE eax
258.endm
164 259
165#define RESTORE_REGS \ 260.macro RESTORE_REGS pop=0
166 RESTORE_INT_REGS; \ 261 RESTORE_INT_REGS
1671: popl %ds; \ 2621: popl %ds
168 CFI_ADJUST_CFA_OFFSET -4;\ 263 CFI_ADJUST_CFA_OFFSET -4
169 /*CFI_RESTORE ds;*/\ 264 /*CFI_RESTORE ds;*/
1702: popl %es; \ 2652: popl %es
171 CFI_ADJUST_CFA_OFFSET -4;\ 266 CFI_ADJUST_CFA_OFFSET -4
172 /*CFI_RESTORE es;*/\ 267 /*CFI_RESTORE es;*/
1733: popl %fs; \ 2683: popl %fs
174 CFI_ADJUST_CFA_OFFSET -4;\ 269 CFI_ADJUST_CFA_OFFSET -4
175 /*CFI_RESTORE fs;*/\ 270 /*CFI_RESTORE fs;*/
176.pushsection .fixup,"ax"; \ 271 POP_GS \pop
1774: movl $0,(%esp); \ 272.pushsection .fixup, "ax"
178 jmp 1b; \ 2734: movl $0, (%esp)
1795: movl $0,(%esp); \ 274 jmp 1b
180 jmp 2b; \ 2755: movl $0, (%esp)
1816: movl $0,(%esp); \ 276 jmp 2b
182 jmp 3b; \ 2776: movl $0, (%esp)
183.section __ex_table,"a";\ 278 jmp 3b
184 .align 4; \ 279.section __ex_table, "a"
185 .long 1b,4b; \ 280 .align 4
186 .long 2b,5b; \ 281 .long 1b, 4b
187 .long 3b,6b; \ 282 .long 2b, 5b
283 .long 3b, 6b
188.popsection 284.popsection
285 POP_GS_EX
286.endm
189 287
190#define RING0_INT_FRAME \ 288.macro RING0_INT_FRAME
191 CFI_STARTPROC simple;\ 289 CFI_STARTPROC simple
192 CFI_SIGNAL_FRAME;\ 290 CFI_SIGNAL_FRAME
193 CFI_DEF_CFA esp, 3*4;\ 291 CFI_DEF_CFA esp, 3*4
194 /*CFI_OFFSET cs, -2*4;*/\ 292 /*CFI_OFFSET cs, -2*4;*/
195 CFI_OFFSET eip, -3*4 293 CFI_OFFSET eip, -3*4
294.endm
196 295
197#define RING0_EC_FRAME \ 296.macro RING0_EC_FRAME
198 CFI_STARTPROC simple;\ 297 CFI_STARTPROC simple
199 CFI_SIGNAL_FRAME;\ 298 CFI_SIGNAL_FRAME
200 CFI_DEF_CFA esp, 4*4;\ 299 CFI_DEF_CFA esp, 4*4
201 /*CFI_OFFSET cs, -2*4;*/\ 300 /*CFI_OFFSET cs, -2*4;*/
202 CFI_OFFSET eip, -3*4 301 CFI_OFFSET eip, -3*4
302.endm
203 303
204#define RING0_PTREGS_FRAME \ 304.macro RING0_PTREGS_FRAME
205 CFI_STARTPROC simple;\ 305 CFI_STARTPROC simple
206 CFI_SIGNAL_FRAME;\ 306 CFI_SIGNAL_FRAME
207 CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\ 307 CFI_DEF_CFA esp, PT_OLDESP-PT_EBX
208 /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\ 308 /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/
209 CFI_OFFSET eip, PT_EIP-PT_OLDESP;\ 309 CFI_OFFSET eip, PT_EIP-PT_OLDESP
210 /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\ 310 /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/
211 /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\ 311 /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/
212 CFI_OFFSET eax, PT_EAX-PT_OLDESP;\ 312 CFI_OFFSET eax, PT_EAX-PT_OLDESP
213 CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\ 313 CFI_OFFSET ebp, PT_EBP-PT_OLDESP
214 CFI_OFFSET edi, PT_EDI-PT_OLDESP;\ 314 CFI_OFFSET edi, PT_EDI-PT_OLDESP
215 CFI_OFFSET esi, PT_ESI-PT_OLDESP;\ 315 CFI_OFFSET esi, PT_ESI-PT_OLDESP
216 CFI_OFFSET edx, PT_EDX-PT_OLDESP;\ 316 CFI_OFFSET edx, PT_EDX-PT_OLDESP
217 CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\ 317 CFI_OFFSET ecx, PT_ECX-PT_OLDESP
218 CFI_OFFSET ebx, PT_EBX-PT_OLDESP 318 CFI_OFFSET ebx, PT_EBX-PT_OLDESP
319.endm
219 320
220ENTRY(ret_from_fork) 321ENTRY(ret_from_fork)
221 CFI_STARTPROC 322 CFI_STARTPROC
@@ -341,8 +442,7 @@ sysenter_past_esp:
341 442
342 GET_THREAD_INFO(%ebp) 443 GET_THREAD_INFO(%ebp)
343 444
344 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ 445 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
345 testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
346 jnz sysenter_audit 446 jnz sysenter_audit
347sysenter_do_call: 447sysenter_do_call:
348 cmpl $(nr_syscalls), %eax 448 cmpl $(nr_syscalls), %eax
@@ -353,7 +453,7 @@ sysenter_do_call:
353 DISABLE_INTERRUPTS(CLBR_ANY) 453 DISABLE_INTERRUPTS(CLBR_ANY)
354 TRACE_IRQS_OFF 454 TRACE_IRQS_OFF
355 movl TI_flags(%ebp), %ecx 455 movl TI_flags(%ebp), %ecx
356 testw $_TIF_ALLWORK_MASK, %cx 456 testl $_TIF_ALLWORK_MASK, %ecx
357 jne sysexit_audit 457 jne sysexit_audit
358sysenter_exit: 458sysenter_exit:
359/* if something modifies registers it must also disable sysexit */ 459/* if something modifies registers it must also disable sysexit */
@@ -362,11 +462,12 @@ sysenter_exit:
362 xorl %ebp,%ebp 462 xorl %ebp,%ebp
363 TRACE_IRQS_ON 463 TRACE_IRQS_ON
3641: mov PT_FS(%esp), %fs 4641: mov PT_FS(%esp), %fs
465 PTGS_TO_GS
365 ENABLE_INTERRUPTS_SYSEXIT 466 ENABLE_INTERRUPTS_SYSEXIT
366 467
367#ifdef CONFIG_AUDITSYSCALL 468#ifdef CONFIG_AUDITSYSCALL
368sysenter_audit: 469sysenter_audit:
369 testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) 470 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
370 jnz syscall_trace_entry 471 jnz syscall_trace_entry
371 addl $4,%esp 472 addl $4,%esp
372 CFI_ADJUST_CFA_OFFSET -4 473 CFI_ADJUST_CFA_OFFSET -4
@@ -383,7 +484,7 @@ sysenter_audit:
383 jmp sysenter_do_call 484 jmp sysenter_do_call
384 485
385sysexit_audit: 486sysexit_audit:
386 testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx 487 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
387 jne syscall_exit_work 488 jne syscall_exit_work
388 TRACE_IRQS_ON 489 TRACE_IRQS_ON
389 ENABLE_INTERRUPTS(CLBR_ANY) 490 ENABLE_INTERRUPTS(CLBR_ANY)
@@ -396,7 +497,7 @@ sysexit_audit:
396 DISABLE_INTERRUPTS(CLBR_ANY) 497 DISABLE_INTERRUPTS(CLBR_ANY)
397 TRACE_IRQS_OFF 498 TRACE_IRQS_OFF
398 movl TI_flags(%ebp), %ecx 499 movl TI_flags(%ebp), %ecx
399 testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx 500 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
400 jne syscall_exit_work 501 jne syscall_exit_work
401 movl PT_EAX(%esp),%eax /* reload syscall return value */ 502 movl PT_EAX(%esp),%eax /* reload syscall return value */
402 jmp sysenter_exit 503 jmp sysenter_exit
@@ -410,6 +511,7 @@ sysexit_audit:
410 .align 4 511 .align 4
411 .long 1b,2b 512 .long 1b,2b
412.popsection 513.popsection
514 PTGS_TO_GS_EX
413ENDPROC(ia32_sysenter_target) 515ENDPROC(ia32_sysenter_target)
414 516
415 # system call handler stub 517 # system call handler stub
@@ -420,8 +522,7 @@ ENTRY(system_call)
420 SAVE_ALL 522 SAVE_ALL
421 GET_THREAD_INFO(%ebp) 523 GET_THREAD_INFO(%ebp)
422 # system call tracing in operation / emulation 524 # system call tracing in operation / emulation
423 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ 525 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
424 testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
425 jnz syscall_trace_entry 526 jnz syscall_trace_entry
426 cmpl $(nr_syscalls), %eax 527 cmpl $(nr_syscalls), %eax
427 jae syscall_badsys 528 jae syscall_badsys
@@ -435,7 +536,7 @@ syscall_exit:
435 # between sampling and the iret 536 # between sampling and the iret
436 TRACE_IRQS_OFF 537 TRACE_IRQS_OFF
437 movl TI_flags(%ebp), %ecx 538 movl TI_flags(%ebp), %ecx
438 testw $_TIF_ALLWORK_MASK, %cx # current->work 539 testl $_TIF_ALLWORK_MASK, %ecx # current->work
439 jne syscall_exit_work 540 jne syscall_exit_work
440 541
441restore_all: 542restore_all:
@@ -452,8 +553,7 @@ restore_all:
452restore_nocheck: 553restore_nocheck:
453 TRACE_IRQS_IRET 554 TRACE_IRQS_IRET
454restore_nocheck_notrace: 555restore_nocheck_notrace:
455 RESTORE_REGS 556 RESTORE_REGS 4 # skip orig_eax/error_code
456 addl $4, %esp # skip orig_eax/error_code
457 CFI_ADJUST_CFA_OFFSET -4 557 CFI_ADJUST_CFA_OFFSET -4
458irq_return: 558irq_return:
459 INTERRUPT_RETURN 559 INTERRUPT_RETURN
@@ -571,7 +671,7 @@ END(syscall_trace_entry)
571 # perform syscall exit tracing 671 # perform syscall exit tracing
572 ALIGN 672 ALIGN
573syscall_exit_work: 673syscall_exit_work:
574 testb $_TIF_WORK_SYSCALL_EXIT, %cl 674 testl $_TIF_WORK_SYSCALL_EXIT, %ecx
575 jz work_pending 675 jz work_pending
576 TRACE_IRQS_ON 676 TRACE_IRQS_ON
577 ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call 677 ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
@@ -595,28 +695,50 @@ syscall_badsys:
595END(syscall_badsys) 695END(syscall_badsys)
596 CFI_ENDPROC 696 CFI_ENDPROC
597 697
598#define FIXUP_ESPFIX_STACK \ 698/*
599 /* since we are on a wrong stack, we cant make it a C code :( */ \ 699 * System calls that need a pt_regs pointer.
600 PER_CPU(gdt_page, %ebx); \ 700 */
601 GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \ 701#define PTREGSCALL(name) \
602 addl %esp, %eax; \ 702 ALIGN; \
603 pushl $__KERNEL_DS; \ 703ptregs_##name: \
604 CFI_ADJUST_CFA_OFFSET 4; \ 704 leal 4(%esp),%eax; \
605 pushl %eax; \ 705 jmp sys_##name;
606 CFI_ADJUST_CFA_OFFSET 4; \ 706
607 lss (%esp), %esp; \ 707PTREGSCALL(iopl)
608 CFI_ADJUST_CFA_OFFSET -8; 708PTREGSCALL(fork)
609#define UNWIND_ESPFIX_STACK \ 709PTREGSCALL(clone)
610 movl %ss, %eax; \ 710PTREGSCALL(vfork)
611 /* see if on espfix stack */ \ 711PTREGSCALL(execve)
612 cmpw $__ESPFIX_SS, %ax; \ 712PTREGSCALL(sigaltstack)
613 jne 27f; \ 713PTREGSCALL(sigreturn)
614 movl $__KERNEL_DS, %eax; \ 714PTREGSCALL(rt_sigreturn)
615 movl %eax, %ds; \ 715PTREGSCALL(vm86)
616 movl %eax, %es; \ 716PTREGSCALL(vm86old)
617 /* switch to normal stack */ \ 717
618 FIXUP_ESPFIX_STACK; \ 718.macro FIXUP_ESPFIX_STACK
61927:; 719 /* since we are on a wrong stack, we cant make it a C code :( */
720 PER_CPU(gdt_page, %ebx)
721 GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
722 addl %esp, %eax
723 pushl $__KERNEL_DS
724 CFI_ADJUST_CFA_OFFSET 4
725 pushl %eax
726 CFI_ADJUST_CFA_OFFSET 4
727 lss (%esp), %esp
728 CFI_ADJUST_CFA_OFFSET -8
729.endm
730.macro UNWIND_ESPFIX_STACK
731 movl %ss, %eax
732 /* see if on espfix stack */
733 cmpw $__ESPFIX_SS, %ax
734 jne 27f
735 movl $__KERNEL_DS, %eax
736 movl %eax, %ds
737 movl %eax, %es
738 /* switch to normal stack */
739 FIXUP_ESPFIX_STACK
74027:
741.endm
620 742
621/* 743/*
622 * Build the entry stubs and pointer table with some assembler magic. 744 * Build the entry stubs and pointer table with some assembler magic.
@@ -672,7 +794,7 @@ common_interrupt:
672ENDPROC(common_interrupt) 794ENDPROC(common_interrupt)
673 CFI_ENDPROC 795 CFI_ENDPROC
674 796
675#define BUILD_INTERRUPT(name, nr) \ 797#define BUILD_INTERRUPT3(name, nr, fn) \
676ENTRY(name) \ 798ENTRY(name) \
677 RING0_INT_FRAME; \ 799 RING0_INT_FRAME; \
678 pushl $~(nr); \ 800 pushl $~(nr); \
@@ -680,13 +802,15 @@ ENTRY(name) \
680 SAVE_ALL; \ 802 SAVE_ALL; \
681 TRACE_IRQS_OFF \ 803 TRACE_IRQS_OFF \
682 movl %esp,%eax; \ 804 movl %esp,%eax; \
683 call smp_##name; \ 805 call fn; \
684 jmp ret_from_intr; \ 806 jmp ret_from_intr; \
685 CFI_ENDPROC; \ 807 CFI_ENDPROC; \
686ENDPROC(name) 808ENDPROC(name)
687 809
810#define BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(name, nr, smp_##name)
811
688/* The include is where all of the SMP etc. interrupts come from */ 812/* The include is where all of the SMP etc. interrupts come from */
689#include "entry_arch.h" 813#include <asm/entry_arch.h>
690 814
691ENTRY(coprocessor_error) 815ENTRY(coprocessor_error)
692 RING0_INT_FRAME 816 RING0_INT_FRAME
@@ -1068,7 +1192,10 @@ ENTRY(page_fault)
1068 CFI_ADJUST_CFA_OFFSET 4 1192 CFI_ADJUST_CFA_OFFSET 4
1069 ALIGN 1193 ALIGN
1070error_code: 1194error_code:
1071 /* the function address is in %fs's slot on the stack */ 1195 /* the function address is in %gs's slot on the stack */
1196 pushl %fs
1197 CFI_ADJUST_CFA_OFFSET 4
1198 /*CFI_REL_OFFSET fs, 0*/
1072 pushl %es 1199 pushl %es
1073 CFI_ADJUST_CFA_OFFSET 4 1200 CFI_ADJUST_CFA_OFFSET 4
1074 /*CFI_REL_OFFSET es, 0*/ 1201 /*CFI_REL_OFFSET es, 0*/
@@ -1097,20 +1224,15 @@ error_code:
1097 CFI_ADJUST_CFA_OFFSET 4 1224 CFI_ADJUST_CFA_OFFSET 4
1098 CFI_REL_OFFSET ebx, 0 1225 CFI_REL_OFFSET ebx, 0
1099 cld 1226 cld
1100 pushl %fs
1101 CFI_ADJUST_CFA_OFFSET 4
1102 /*CFI_REL_OFFSET fs, 0*/
1103 movl $(__KERNEL_PERCPU), %ecx 1227 movl $(__KERNEL_PERCPU), %ecx
1104 movl %ecx, %fs 1228 movl %ecx, %fs
1105 UNWIND_ESPFIX_STACK 1229 UNWIND_ESPFIX_STACK
1106 popl %ecx 1230 GS_TO_REG %ecx
1107 CFI_ADJUST_CFA_OFFSET -4 1231 movl PT_GS(%esp), %edi # get the function address
1108 /*CFI_REGISTER es, ecx*/
1109 movl PT_FS(%esp), %edi # get the function address
1110 movl PT_ORIG_EAX(%esp), %edx # get the error code 1232 movl PT_ORIG_EAX(%esp), %edx # get the error code
1111 movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart 1233 movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
1112 mov %ecx, PT_FS(%esp) 1234 REG_TO_PTGS %ecx
1113 /*CFI_REL_OFFSET fs, ES*/ 1235 SET_KERNEL_GS %ecx
1114 movl $(__USER_DS), %ecx 1236 movl $(__USER_DS), %ecx
1115 movl %ecx, %ds 1237 movl %ecx, %ds
1116 movl %ecx, %es 1238 movl %ecx, %es
@@ -1134,26 +1256,27 @@ END(page_fault)
1134 * by hand onto the new stack - while updating the return eip past 1256 * by hand onto the new stack - while updating the return eip past
1135 * the instruction that would have done it for sysenter. 1257 * the instruction that would have done it for sysenter.
1136 */ 1258 */
1137#define FIX_STACK(offset, ok, label) \ 1259.macro FIX_STACK offset ok label
1138 cmpw $__KERNEL_CS,4(%esp); \ 1260 cmpw $__KERNEL_CS, 4(%esp)
1139 jne ok; \ 1261 jne \ok
1140label: \ 1262\label:
1141 movl TSS_sysenter_sp0+offset(%esp),%esp; \ 1263 movl TSS_sysenter_sp0 + \offset(%esp), %esp
1142 CFI_DEF_CFA esp, 0; \ 1264 CFI_DEF_CFA esp, 0
1143 CFI_UNDEFINED eip; \ 1265 CFI_UNDEFINED eip
1144 pushfl; \ 1266 pushfl
1145 CFI_ADJUST_CFA_OFFSET 4; \ 1267 CFI_ADJUST_CFA_OFFSET 4
1146 pushl $__KERNEL_CS; \ 1268 pushl $__KERNEL_CS
1147 CFI_ADJUST_CFA_OFFSET 4; \ 1269 CFI_ADJUST_CFA_OFFSET 4
1148 pushl $sysenter_past_esp; \ 1270 pushl $sysenter_past_esp
1149 CFI_ADJUST_CFA_OFFSET 4; \ 1271 CFI_ADJUST_CFA_OFFSET 4
1150 CFI_REL_OFFSET eip, 0 1272 CFI_REL_OFFSET eip, 0
1273.endm
1151 1274
1152ENTRY(debug) 1275ENTRY(debug)
1153 RING0_INT_FRAME 1276 RING0_INT_FRAME
1154 cmpl $ia32_sysenter_target,(%esp) 1277 cmpl $ia32_sysenter_target,(%esp)
1155 jne debug_stack_correct 1278 jne debug_stack_correct
1156 FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) 1279 FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
1157debug_stack_correct: 1280debug_stack_correct:
1158 pushl $-1 # mark this as an int 1281 pushl $-1 # mark this as an int
1159 CFI_ADJUST_CFA_OFFSET 4 1282 CFI_ADJUST_CFA_OFFSET 4
@@ -1211,7 +1334,7 @@ nmi_stack_correct:
1211 1334
1212nmi_stack_fixup: 1335nmi_stack_fixup:
1213 RING0_INT_FRAME 1336 RING0_INT_FRAME
1214 FIX_STACK(12,nmi_stack_correct, 1) 1337 FIX_STACK 12, nmi_stack_correct, 1
1215 jmp nmi_stack_correct 1338 jmp nmi_stack_correct
1216 1339
1217nmi_debug_stack_check: 1340nmi_debug_stack_check:
@@ -1222,7 +1345,7 @@ nmi_debug_stack_check:
1222 jb nmi_stack_correct 1345 jb nmi_stack_correct
1223 cmpl $debug_esp_fix_insn,(%esp) 1346 cmpl $debug_esp_fix_insn,(%esp)
1224 ja nmi_stack_correct 1347 ja nmi_stack_correct
1225 FIX_STACK(24,nmi_stack_correct, 1) 1348 FIX_STACK 24, nmi_stack_correct, 1
1226 jmp nmi_stack_correct 1349 jmp nmi_stack_correct
1227 1350
1228nmi_espfix_stack: 1351nmi_espfix_stack:
@@ -1234,7 +1357,7 @@ nmi_espfix_stack:
1234 CFI_ADJUST_CFA_OFFSET 4 1357 CFI_ADJUST_CFA_OFFSET 4
1235 pushl %esp 1358 pushl %esp
1236 CFI_ADJUST_CFA_OFFSET 4 1359 CFI_ADJUST_CFA_OFFSET 4
1237 addw $4, (%esp) 1360 addl $4, (%esp)
1238 /* copy the iret frame of 12 bytes */ 1361 /* copy the iret frame of 12 bytes */
1239 .rept 3 1362 .rept 3
1240 pushl 16(%esp) 1363 pushl 16(%esp)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index a1346217e43c..a331ec38af9e 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -48,10 +48,11 @@
48#include <asm/unistd.h> 48#include <asm/unistd.h>
49#include <asm/thread_info.h> 49#include <asm/thread_info.h>
50#include <asm/hw_irq.h> 50#include <asm/hw_irq.h>
51#include <asm/page.h> 51#include <asm/page_types.h>
52#include <asm/irqflags.h> 52#include <asm/irqflags.h>
53#include <asm/paravirt.h> 53#include <asm/paravirt.h>
54#include <asm/ftrace.h> 54#include <asm/ftrace.h>
55#include <asm/percpu.h>
55 56
56/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ 57/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
57#include <linux/elf-em.h> 58#include <linux/elf-em.h>
@@ -76,20 +77,17 @@ ENTRY(ftrace_caller)
76 movq 8(%rbp), %rsi 77 movq 8(%rbp), %rsi
77 subq $MCOUNT_INSN_SIZE, %rdi 78 subq $MCOUNT_INSN_SIZE, %rdi
78 79
79.globl ftrace_call 80GLOBAL(ftrace_call)
80ftrace_call:
81 call ftrace_stub 81 call ftrace_stub
82 82
83 MCOUNT_RESTORE_FRAME 83 MCOUNT_RESTORE_FRAME
84 84
85#ifdef CONFIG_FUNCTION_GRAPH_TRACER 85#ifdef CONFIG_FUNCTION_GRAPH_TRACER
86.globl ftrace_graph_call 86GLOBAL(ftrace_graph_call)
87ftrace_graph_call:
88 jmp ftrace_stub 87 jmp ftrace_stub
89#endif 88#endif
90 89
91.globl ftrace_stub 90GLOBAL(ftrace_stub)
92ftrace_stub:
93 retq 91 retq
94END(ftrace_caller) 92END(ftrace_caller)
95 93
@@ -109,8 +107,7 @@ ENTRY(mcount)
109 jnz ftrace_graph_caller 107 jnz ftrace_graph_caller
110#endif 108#endif
111 109
112.globl ftrace_stub 110GLOBAL(ftrace_stub)
113ftrace_stub:
114 retq 111 retq
115 112
116trace: 113trace:
@@ -147,9 +144,7 @@ ENTRY(ftrace_graph_caller)
147 retq 144 retq
148END(ftrace_graph_caller) 145END(ftrace_graph_caller)
149 146
150 147GLOBAL(return_to_handler)
151.globl return_to_handler
152return_to_handler:
153 subq $80, %rsp 148 subq $80, %rsp
154 149
155 movq %rax, (%rsp) 150 movq %rax, (%rsp)
@@ -187,6 +182,7 @@ return_to_handler:
187ENTRY(native_usergs_sysret64) 182ENTRY(native_usergs_sysret64)
188 swapgs 183 swapgs
189 sysretq 184 sysretq
185ENDPROC(native_usergs_sysret64)
190#endif /* CONFIG_PARAVIRT */ 186#endif /* CONFIG_PARAVIRT */
191 187
192 188
@@ -209,7 +205,7 @@ ENTRY(native_usergs_sysret64)
209 205
210 /* %rsp:at FRAMEEND */ 206 /* %rsp:at FRAMEEND */
211 .macro FIXUP_TOP_OF_STACK tmp offset=0 207 .macro FIXUP_TOP_OF_STACK tmp offset=0
212 movq %gs:pda_oldrsp,\tmp 208 movq PER_CPU_VAR(old_rsp),\tmp
213 movq \tmp,RSP+\offset(%rsp) 209 movq \tmp,RSP+\offset(%rsp)
214 movq $__USER_DS,SS+\offset(%rsp) 210 movq $__USER_DS,SS+\offset(%rsp)
215 movq $__USER_CS,CS+\offset(%rsp) 211 movq $__USER_CS,CS+\offset(%rsp)
@@ -220,7 +216,7 @@ ENTRY(native_usergs_sysret64)
220 216
221 .macro RESTORE_TOP_OF_STACK tmp offset=0 217 .macro RESTORE_TOP_OF_STACK tmp offset=0
222 movq RSP+\offset(%rsp),\tmp 218 movq RSP+\offset(%rsp),\tmp
223 movq \tmp,%gs:pda_oldrsp 219 movq \tmp,PER_CPU_VAR(old_rsp)
224 movq EFLAGS+\offset(%rsp),\tmp 220 movq EFLAGS+\offset(%rsp),\tmp
225 movq \tmp,R11+\offset(%rsp) 221 movq \tmp,R11+\offset(%rsp)
226 .endm 222 .endm
@@ -336,15 +332,15 @@ ENTRY(save_args)
336 je 1f 332 je 1f
337 SWAPGS 333 SWAPGS
338 /* 334 /*
339 * irqcount is used to check if a CPU is already on an interrupt stack 335 * irq_count is used to check if a CPU is already on an interrupt stack
340 * or not. While this is essentially redundant with preempt_count it is 336 * or not. While this is essentially redundant with preempt_count it is
341 * a little cheaper to use a separate counter in the PDA (short of 337 * a little cheaper to use a separate counter in the PDA (short of
342 * moving irq_enter into assembly, which would be too much work) 338 * moving irq_enter into assembly, which would be too much work)
343 */ 339 */
3441: incl %gs:pda_irqcount 3401: incl PER_CPU_VAR(irq_count)
345 jne 2f 341 jne 2f
346 popq_cfi %rax /* move return address... */ 342 popq_cfi %rax /* move return address... */
347 mov %gs:pda_irqstackptr,%rsp 343 mov PER_CPU_VAR(irq_stack_ptr),%rsp
348 EMPTY_FRAME 0 344 EMPTY_FRAME 0
349 pushq_cfi %rbp /* backlink for unwinder */ 345 pushq_cfi %rbp /* backlink for unwinder */
350 pushq_cfi %rax /* ... to the new stack */ 346 pushq_cfi %rax /* ... to the new stack */
@@ -372,6 +368,7 @@ ENTRY(save_rest)
372END(save_rest) 368END(save_rest)
373 369
374/* save complete stack frame */ 370/* save complete stack frame */
371 .pushsection .kprobes.text, "ax"
375ENTRY(save_paranoid) 372ENTRY(save_paranoid)
376 XCPT_FRAME 1 RDI+8 373 XCPT_FRAME 1 RDI+8
377 cld 374 cld
@@ -400,6 +397,7 @@ ENTRY(save_paranoid)
4001: ret 3971: ret
401 CFI_ENDPROC 398 CFI_ENDPROC
402END(save_paranoid) 399END(save_paranoid)
400 .popsection
403 401
404/* 402/*
405 * A newly forked process directly context switches into this address. 403 * A newly forked process directly context switches into this address.
@@ -409,6 +407,8 @@ END(save_paranoid)
409ENTRY(ret_from_fork) 407ENTRY(ret_from_fork)
410 DEFAULT_FRAME 408 DEFAULT_FRAME
411 409
410 LOCK ; btr $TIF_FORK,TI_flags(%r8)
411
412 push kernel_eflags(%rip) 412 push kernel_eflags(%rip)
413 CFI_ADJUST_CFA_OFFSET 8 413 CFI_ADJUST_CFA_OFFSET 8
414 popf # reset kernel eflags 414 popf # reset kernel eflags
@@ -418,7 +418,6 @@ ENTRY(ret_from_fork)
418 418
419 GET_THREAD_INFO(%rcx) 419 GET_THREAD_INFO(%rcx)
420 420
421 CFI_REMEMBER_STATE
422 RESTORE_REST 421 RESTORE_REST
423 422
424 testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? 423 testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
@@ -430,7 +429,6 @@ ENTRY(ret_from_fork)
430 RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET 429 RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
431 jmp ret_from_sys_call # go to the SYSRET fastpath 430 jmp ret_from_sys_call # go to the SYSRET fastpath
432 431
433 CFI_RESTORE_STATE
434 CFI_ENDPROC 432 CFI_ENDPROC
435END(ret_from_fork) 433END(ret_from_fork)
436 434
@@ -468,7 +466,7 @@ END(ret_from_fork)
468ENTRY(system_call) 466ENTRY(system_call)
469 CFI_STARTPROC simple 467 CFI_STARTPROC simple
470 CFI_SIGNAL_FRAME 468 CFI_SIGNAL_FRAME
471 CFI_DEF_CFA rsp,PDA_STACKOFFSET 469 CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET
472 CFI_REGISTER rip,rcx 470 CFI_REGISTER rip,rcx
473 /*CFI_REGISTER rflags,r11*/ 471 /*CFI_REGISTER rflags,r11*/
474 SWAPGS_UNSAFE_STACK 472 SWAPGS_UNSAFE_STACK
@@ -479,8 +477,8 @@ ENTRY(system_call)
479 */ 477 */
480ENTRY(system_call_after_swapgs) 478ENTRY(system_call_after_swapgs)
481 479
482 movq %rsp,%gs:pda_oldrsp 480 movq %rsp,PER_CPU_VAR(old_rsp)
483 movq %gs:pda_kernelstack,%rsp 481 movq PER_CPU_VAR(kernel_stack),%rsp
484 /* 482 /*
485 * No need to follow this irqs off/on section - it's straight 483 * No need to follow this irqs off/on section - it's straight
486 * and short: 484 * and short:
@@ -523,7 +521,7 @@ sysret_check:
523 CFI_REGISTER rip,rcx 521 CFI_REGISTER rip,rcx
524 RESTORE_ARGS 0,-ARG_SKIP,1 522 RESTORE_ARGS 0,-ARG_SKIP,1
525 /*CFI_REGISTER rflags,r11*/ 523 /*CFI_REGISTER rflags,r11*/
526 movq %gs:pda_oldrsp, %rsp 524 movq PER_CPU_VAR(old_rsp), %rsp
527 USERGS_SYSRET64 525 USERGS_SYSRET64
528 526
529 CFI_RESTORE_STATE 527 CFI_RESTORE_STATE
@@ -630,16 +628,14 @@ tracesys:
630 * Syscall return path ending with IRET. 628 * Syscall return path ending with IRET.
631 * Has correct top of stack, but partial stack frame. 629 * Has correct top of stack, but partial stack frame.
632 */ 630 */
633 .globl int_ret_from_sys_call 631GLOBAL(int_ret_from_sys_call)
634 .globl int_with_check
635int_ret_from_sys_call:
636 DISABLE_INTERRUPTS(CLBR_NONE) 632 DISABLE_INTERRUPTS(CLBR_NONE)
637 TRACE_IRQS_OFF 633 TRACE_IRQS_OFF
638 testl $3,CS-ARGOFFSET(%rsp) 634 testl $3,CS-ARGOFFSET(%rsp)
639 je retint_restore_args 635 je retint_restore_args
640 movl $_TIF_ALLWORK_MASK,%edi 636 movl $_TIF_ALLWORK_MASK,%edi
641 /* edi: mask to check */ 637 /* edi: mask to check */
642int_with_check: 638GLOBAL(int_with_check)
643 LOCKDEP_SYS_EXIT_IRQ 639 LOCKDEP_SYS_EXIT_IRQ
644 GET_THREAD_INFO(%rcx) 640 GET_THREAD_INFO(%rcx)
645 movl TI_flags(%rcx),%edx 641 movl TI_flags(%rcx),%edx
@@ -833,11 +829,11 @@ common_interrupt:
833 XCPT_FRAME 829 XCPT_FRAME
834 addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ 830 addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
835 interrupt do_IRQ 831 interrupt do_IRQ
836 /* 0(%rsp): oldrsp-ARGOFFSET */ 832 /* 0(%rsp): old_rsp-ARGOFFSET */
837ret_from_intr: 833ret_from_intr:
838 DISABLE_INTERRUPTS(CLBR_NONE) 834 DISABLE_INTERRUPTS(CLBR_NONE)
839 TRACE_IRQS_OFF 835 TRACE_IRQS_OFF
840 decl %gs:pda_irqcount 836 decl PER_CPU_VAR(irq_count)
841 leaveq 837 leaveq
842 CFI_DEF_CFA_REGISTER rsp 838 CFI_DEF_CFA_REGISTER rsp
843 CFI_ADJUST_CFA_OFFSET -8 839 CFI_ADJUST_CFA_OFFSET -8
@@ -982,10 +978,14 @@ apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
982 irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt 978 irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
983#endif 979#endif
984 980
981#ifdef CONFIG_X86_UV
985apicinterrupt UV_BAU_MESSAGE \ 982apicinterrupt UV_BAU_MESSAGE \
986 uv_bau_message_intr1 uv_bau_message_interrupt 983 uv_bau_message_intr1 uv_bau_message_interrupt
984#endif
987apicinterrupt LOCAL_TIMER_VECTOR \ 985apicinterrupt LOCAL_TIMER_VECTOR \
988 apic_timer_interrupt smp_apic_timer_interrupt 986 apic_timer_interrupt smp_apic_timer_interrupt
987apicinterrupt GENERIC_INTERRUPT_VECTOR \
988 generic_interrupt smp_generic_interrupt
989 989
990#ifdef CONFIG_SMP 990#ifdef CONFIG_SMP
991apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ 991apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
@@ -1073,10 +1073,10 @@ ENTRY(\sym)
1073 TRACE_IRQS_OFF 1073 TRACE_IRQS_OFF
1074 movq %rsp,%rdi /* pt_regs pointer */ 1074 movq %rsp,%rdi /* pt_regs pointer */
1075 xorl %esi,%esi /* no error code */ 1075 xorl %esi,%esi /* no error code */
1076 movq %gs:pda_data_offset, %rbp 1076 PER_CPU(init_tss, %rbp)
1077 subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) 1077 subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp)
1078 call \do_sym 1078 call \do_sym
1079 addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) 1079 addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp)
1080 jmp paranoid_exit /* %ebx: no swapgs flag */ 1080 jmp paranoid_exit /* %ebx: no swapgs flag */
1081 CFI_ENDPROC 1081 CFI_ENDPROC
1082END(\sym) 1082END(\sym)
@@ -1138,7 +1138,7 @@ ENTRY(native_load_gs_index)
1138 CFI_STARTPROC 1138 CFI_STARTPROC
1139 pushf 1139 pushf
1140 CFI_ADJUST_CFA_OFFSET 8 1140 CFI_ADJUST_CFA_OFFSET 8
1141 DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI)) 1141 DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
1142 SWAPGS 1142 SWAPGS
1143gs_change: 1143gs_change:
1144 movl %edi,%gs 1144 movl %edi,%gs
@@ -1260,14 +1260,14 @@ ENTRY(call_softirq)
1260 CFI_REL_OFFSET rbp,0 1260 CFI_REL_OFFSET rbp,0
1261 mov %rsp,%rbp 1261 mov %rsp,%rbp
1262 CFI_DEF_CFA_REGISTER rbp 1262 CFI_DEF_CFA_REGISTER rbp
1263 incl %gs:pda_irqcount 1263 incl PER_CPU_VAR(irq_count)
1264 cmove %gs:pda_irqstackptr,%rsp 1264 cmove PER_CPU_VAR(irq_stack_ptr),%rsp
1265 push %rbp # backlink for old unwinder 1265 push %rbp # backlink for old unwinder
1266 call __do_softirq 1266 call __do_softirq
1267 leaveq 1267 leaveq
1268 CFI_DEF_CFA_REGISTER rsp 1268 CFI_DEF_CFA_REGISTER rsp
1269 CFI_ADJUST_CFA_OFFSET -8 1269 CFI_ADJUST_CFA_OFFSET -8
1270 decl %gs:pda_irqcount 1270 decl PER_CPU_VAR(irq_count)
1271 ret 1271 ret
1272 CFI_ENDPROC 1272 CFI_ENDPROC
1273END(call_softirq) 1273END(call_softirq)
@@ -1297,15 +1297,15 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
1297 movq %rdi, %rsp # we don't return, adjust the stack frame 1297 movq %rdi, %rsp # we don't return, adjust the stack frame
1298 CFI_ENDPROC 1298 CFI_ENDPROC
1299 DEFAULT_FRAME 1299 DEFAULT_FRAME
130011: incl %gs:pda_irqcount 130011: incl PER_CPU_VAR(irq_count)
1301 movq %rsp,%rbp 1301 movq %rsp,%rbp
1302 CFI_DEF_CFA_REGISTER rbp 1302 CFI_DEF_CFA_REGISTER rbp
1303 cmovzq %gs:pda_irqstackptr,%rsp 1303 cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
1304 pushq %rbp # backlink for old unwinder 1304 pushq %rbp # backlink for old unwinder
1305 call xen_evtchn_do_upcall 1305 call xen_evtchn_do_upcall
1306 popq %rsp 1306 popq %rsp
1307 CFI_DEF_CFA_REGISTER rsp 1307 CFI_DEF_CFA_REGISTER rsp
1308 decl %gs:pda_irqcount 1308 decl PER_CPU_VAR(irq_count)
1309 jmp error_exit 1309 jmp error_exit
1310 CFI_ENDPROC 1310 CFI_ENDPROC
1311END(do_hypervisor_callback) 1311END(do_hypervisor_callback)
diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c
deleted file mode 100644
index 53699c931ad4..000000000000
--- a/arch/x86/kernel/es7000_32.c
+++ /dev/null
@@ -1,378 +0,0 @@
1/*
2 * Written by: Garry Forsgren, Unisys Corporation
3 * Natalie Protasevich, Unisys Corporation
4 * This file contains the code to configure and interface
5 * with Unisys ES7000 series hardware system manager.
6 *
7 * Copyright (c) 2003 Unisys Corporation. All Rights Reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify it
10 * under the terms of version 2 of the GNU General Public License as
11 * published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it would be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
16 *
17 * You should have received a copy of the GNU General Public License along
18 * with this program; if not, write the Free Software Foundation, Inc., 59
19 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
20 *
21 * Contact information: Unisys Corporation, Township Line & Union Meeting
22 * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or:
23 *
24 * http://www.unisys.com
25 */
26
27#include <linux/module.h>
28#include <linux/types.h>
29#include <linux/kernel.h>
30#include <linux/smp.h>
31#include <linux/string.h>
32#include <linux/spinlock.h>
33#include <linux/errno.h>
34#include <linux/notifier.h>
35#include <linux/reboot.h>
36#include <linux/init.h>
37#include <linux/acpi.h>
38#include <asm/io.h>
39#include <asm/nmi.h>
40#include <asm/smp.h>
41#include <asm/atomic.h>
42#include <asm/apicdef.h>
43#include <mach_mpparse.h>
44#include <asm/genapic.h>
45#include <asm/setup.h>
46
47/*
48 * ES7000 chipsets
49 */
50
51#define NON_UNISYS 0
52#define ES7000_CLASSIC 1
53#define ES7000_ZORRO 2
54
55
56#define MIP_REG 1
57#define MIP_PSAI_REG 4
58
59#define MIP_BUSY 1
60#define MIP_SPIN 0xf0000
61#define MIP_VALID 0x0100000000000000ULL
62#define MIP_PORT(VALUE) ((VALUE >> 32) & 0xffff)
63
64#define MIP_RD_LO(VALUE) (VALUE & 0xffffffff)
65
66struct mip_reg_info {
67 unsigned long long mip_info;
68 unsigned long long delivery_info;
69 unsigned long long host_reg;
70 unsigned long long mip_reg;
71};
72
73struct part_info {
74 unsigned char type;
75 unsigned char length;
76 unsigned char part_id;
77 unsigned char apic_mode;
78 unsigned long snum;
79 char ptype[16];
80 char sname[64];
81 char pname[64];
82};
83
84struct psai {
85 unsigned long long entry_type;
86 unsigned long long addr;
87 unsigned long long bep_addr;
88};
89
90struct es7000_mem_info {
91 unsigned char type;
92 unsigned char length;
93 unsigned char resv[6];
94 unsigned long long start;
95 unsigned long long size;
96};
97
98struct es7000_oem_table {
99 unsigned long long hdr;
100 struct mip_reg_info mip;
101 struct part_info pif;
102 struct es7000_mem_info shm;
103 struct psai psai;
104};
105
106#ifdef CONFIG_ACPI
107
108struct oem_table {
109 struct acpi_table_header Header;
110 u32 OEMTableAddr;
111 u32 OEMTableSize;
112};
113
114extern int find_unisys_acpi_oem_table(unsigned long *oem_addr);
115extern void unmap_unisys_acpi_oem_table(unsigned long oem_addr);
116#endif
117
118struct mip_reg {
119 unsigned long long off_0;
120 unsigned long long off_8;
121 unsigned long long off_10;
122 unsigned long long off_18;
123 unsigned long long off_20;
124 unsigned long long off_28;
125 unsigned long long off_30;
126 unsigned long long off_38;
127};
128
129#define MIP_SW_APIC 0x1020b
130#define MIP_FUNC(VALUE) (VALUE & 0xff)
131
132/*
133 * ES7000 Globals
134 */
135
136static volatile unsigned long *psai = NULL;
137static struct mip_reg *mip_reg;
138static struct mip_reg *host_reg;
139static int mip_port;
140static unsigned long mip_addr, host_addr;
141
142int es7000_plat;
143
144/*
145 * GSI override for ES7000 platforms.
146 */
147
148static unsigned int base;
149
150static int
151es7000_rename_gsi(int ioapic, int gsi)
152{
153 if (es7000_plat == ES7000_ZORRO)
154 return gsi;
155
156 if (!base) {
157 int i;
158 for (i = 0; i < nr_ioapics; i++)
159 base += nr_ioapic_registers[i];
160 }
161
162 if (!ioapic && (gsi < 16))
163 gsi += base;
164 return gsi;
165}
166
167static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
168{
169 unsigned long vect = 0, psaival = 0;
170
171 if (psai == NULL)
172 return -1;
173
174 vect = ((unsigned long)__pa(eip)/0x1000) << 16;
175 psaival = (0x1000000 | vect | cpu);
176
177 while (*psai & 0x1000000)
178 ;
179
180 *psai = psaival;
181
182 return 0;
183}
184
185static void noop_wait_for_deassert(atomic_t *deassert_not_used)
186{
187}
188
189static int __init es7000_update_genapic(void)
190{
191 genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip;
192
193 /* MPENTIUMIII */
194 if (boot_cpu_data.x86 == 6 &&
195 (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11)) {
196 es7000_update_genapic_to_cluster();
197 genapic->wait_for_init_deassert = noop_wait_for_deassert;
198 genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip;
199 }
200
201 return 0;
202}
203
204void __init
205setup_unisys(void)
206{
207 /*
208 * Determine the generation of the ES7000 currently running.
209 *
210 * es7000_plat = 1 if the machine is a 5xx ES7000 box
211 * es7000_plat = 2 if the machine is a x86_64 ES7000 box
212 *
213 */
214 if (!(boot_cpu_data.x86 <= 15 && boot_cpu_data.x86_model <= 2))
215 es7000_plat = ES7000_ZORRO;
216 else
217 es7000_plat = ES7000_CLASSIC;
218 ioapic_renumber_irq = es7000_rename_gsi;
219
220 x86_quirks->update_genapic = es7000_update_genapic;
221}
222
223/*
224 * Parse the OEM Table
225 */
226
227int __init
228parse_unisys_oem (char *oemptr)
229{
230 int i;
231 int success = 0;
232 unsigned char type, size;
233 unsigned long val;
234 char *tp = NULL;
235 struct psai *psaip = NULL;
236 struct mip_reg_info *mi;
237 struct mip_reg *host, *mip;
238
239 tp = oemptr;
240
241 tp += 8;
242
243 for (i=0; i <= 6; i++) {
244 type = *tp++;
245 size = *tp++;
246 tp -= 2;
247 switch (type) {
248 case MIP_REG:
249 mi = (struct mip_reg_info *)tp;
250 val = MIP_RD_LO(mi->host_reg);
251 host_addr = val;
252 host = (struct mip_reg *)val;
253 host_reg = __va(host);
254 val = MIP_RD_LO(mi->mip_reg);
255 mip_port = MIP_PORT(mi->mip_info);
256 mip_addr = val;
257 mip = (struct mip_reg *)val;
258 mip_reg = __va(mip);
259 pr_debug("es7000_mipcfg: host_reg = 0x%lx \n",
260 (unsigned long)host_reg);
261 pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n",
262 (unsigned long)mip_reg);
263 success++;
264 break;
265 case MIP_PSAI_REG:
266 psaip = (struct psai *)tp;
267 if (tp != NULL) {
268 if (psaip->addr)
269 psai = __va(psaip->addr);
270 else
271 psai = NULL;
272 success++;
273 }
274 break;
275 default:
276 break;
277 }
278 tp += size;
279 }
280
281 if (success < 2) {
282 es7000_plat = NON_UNISYS;
283 } else
284 setup_unisys();
285 return es7000_plat;
286}
287
288#ifdef CONFIG_ACPI
289static unsigned long oem_addrX;
290static unsigned long oem_size;
291int __init find_unisys_acpi_oem_table(unsigned long *oem_addr)
292{
293 struct acpi_table_header *header = NULL;
294 int i = 0;
295
296 while (ACPI_SUCCESS(acpi_get_table("OEM1", i++, &header))) {
297 if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) {
298 struct oem_table *t = (struct oem_table *)header;
299
300 oem_addrX = t->OEMTableAddr;
301 oem_size = t->OEMTableSize;
302
303 *oem_addr = (unsigned long)__acpi_map_table(oem_addrX,
304 oem_size);
305 return 0;
306 }
307 }
308 return -1;
309}
310
311void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr)
312{
313}
314#endif
315
316static void
317es7000_spin(int n)
318{
319 int i = 0;
320
321 while (i++ < n)
322 rep_nop();
323}
324
325static int __init
326es7000_mip_write(struct mip_reg *mip_reg)
327{
328 int status = 0;
329 int spin;
330
331 spin = MIP_SPIN;
332 while (((unsigned long long)host_reg->off_38 &
333 (unsigned long long)MIP_VALID) != 0) {
334 if (--spin <= 0) {
335 printk("es7000_mip_write: Timeout waiting for Host Valid Flag");
336 return -1;
337 }
338 es7000_spin(MIP_SPIN);
339 }
340
341 memcpy(host_reg, mip_reg, sizeof(struct mip_reg));
342 outb(1, mip_port);
343
344 spin = MIP_SPIN;
345
346 while (((unsigned long long)mip_reg->off_38 &
347 (unsigned long long)MIP_VALID) == 0) {
348 if (--spin <= 0) {
349 printk("es7000_mip_write: Timeout waiting for MIP Valid Flag");
350 return -1;
351 }
352 es7000_spin(MIP_SPIN);
353 }
354
355 status = ((unsigned long long)mip_reg->off_0 &
356 (unsigned long long)0xffff0000000000ULL) >> 48;
357 mip_reg->off_38 = ((unsigned long long)mip_reg->off_38 &
358 (unsigned long long)~MIP_VALID);
359 return status;
360}
361
362void __init
363es7000_sw_apic(void)
364{
365 if (es7000_plat) {
366 int mip_status;
367 struct mip_reg es7000_mip_reg;
368
369 printk("ES7000: Enabling APIC mode.\n");
370 memset(&es7000_mip_reg, 0, sizeof(struct mip_reg));
371 es7000_mip_reg.off_0 = MIP_SW_APIC;
372 es7000_mip_reg.off_38 = (MIP_VALID);
373 while ((mip_status = es7000_mip_write(&es7000_mip_reg)) != 0)
374 printk("es7000_sw_apic: command failed, status = %x\n",
375 mip_status);
376 return;
377 }
378}
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 231bdd3c5b1c..1d0d7f42efe3 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -18,6 +18,7 @@
18#include <linux/init.h> 18#include <linux/init.h>
19#include <linux/list.h> 19#include <linux/list.h>
20 20
21#include <asm/cacheflush.h>
21#include <asm/ftrace.h> 22#include <asm/ftrace.h>
22#include <linux/ftrace.h> 23#include <linux/ftrace.h>
23#include <asm/nops.h> 24#include <asm/nops.h>
@@ -26,6 +27,18 @@
26 27
27#ifdef CONFIG_DYNAMIC_FTRACE 28#ifdef CONFIG_DYNAMIC_FTRACE
28 29
30int ftrace_arch_code_modify_prepare(void)
31{
32 set_kernel_text_rw();
33 return 0;
34}
35
36int ftrace_arch_code_modify_post_process(void)
37{
38 set_kernel_text_ro();
39 return 0;
40}
41
29union ftrace_code_union { 42union ftrace_code_union {
30 char code[MCOUNT_INSN_SIZE]; 43 char code[MCOUNT_INSN_SIZE];
31 struct { 44 struct {
@@ -82,7 +95,7 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
82 * are the same as what exists. 95 * are the same as what exists.
83 */ 96 */
84 97
85static atomic_t in_nmi = ATOMIC_INIT(0); 98static atomic_t nmi_running = ATOMIC_INIT(0);
86static int mod_code_status; /* holds return value of text write */ 99static int mod_code_status; /* holds return value of text write */
87static int mod_code_write; /* set when NMI should do the write */ 100static int mod_code_write; /* set when NMI should do the write */
88static void *mod_code_ip; /* holds the IP to write to */ 101static void *mod_code_ip; /* holds the IP to write to */
@@ -111,12 +124,16 @@ static void ftrace_mod_code(void)
111 */ 124 */
112 mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode, 125 mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
113 MCOUNT_INSN_SIZE); 126 MCOUNT_INSN_SIZE);
127
128 /* if we fail, then kill any new writers */
129 if (mod_code_status)
130 mod_code_write = 0;
114} 131}
115 132
116void ftrace_nmi_enter(void) 133void ftrace_nmi_enter(void)
117{ 134{
118 atomic_inc(&in_nmi); 135 atomic_inc(&nmi_running);
119 /* Must have in_nmi seen before reading write flag */ 136 /* Must have nmi_running seen before reading write flag */
120 smp_mb(); 137 smp_mb();
121 if (mod_code_write) { 138 if (mod_code_write) {
122 ftrace_mod_code(); 139 ftrace_mod_code();
@@ -126,22 +143,21 @@ void ftrace_nmi_enter(void)
126 143
127void ftrace_nmi_exit(void) 144void ftrace_nmi_exit(void)
128{ 145{
129 /* Finish all executions before clearing in_nmi */ 146 /* Finish all executions before clearing nmi_running */
130 smp_wmb(); 147 smp_wmb();
131 atomic_dec(&in_nmi); 148 atomic_dec(&nmi_running);
132} 149}
133 150
134static void wait_for_nmi(void) 151static void wait_for_nmi(void)
135{ 152{
136 int waited = 0; 153 if (!atomic_read(&nmi_running))
154 return;
137 155
138 while (atomic_read(&in_nmi)) { 156 do {
139 waited = 1;
140 cpu_relax(); 157 cpu_relax();
141 } 158 } while (atomic_read(&nmi_running));
142 159
143 if (waited) 160 nmi_wait_count++;
144 nmi_wait_count++;
145} 161}
146 162
147static int 163static int
@@ -368,100 +384,8 @@ int ftrace_disable_ftrace_graph_caller(void)
368 return ftrace_mod_jmp(ip, old_offset, new_offset); 384 return ftrace_mod_jmp(ip, old_offset, new_offset);
369} 385}
370 386
371#else /* CONFIG_DYNAMIC_FTRACE */
372
373/*
374 * These functions are picked from those used on
375 * this page for dynamic ftrace. They have been
376 * simplified to ignore all traces in NMI context.
377 */
378static atomic_t in_nmi;
379
380void ftrace_nmi_enter(void)
381{
382 atomic_inc(&in_nmi);
383}
384
385void ftrace_nmi_exit(void)
386{
387 atomic_dec(&in_nmi);
388}
389
390#endif /* !CONFIG_DYNAMIC_FTRACE */ 387#endif /* !CONFIG_DYNAMIC_FTRACE */
391 388
392/* Add a function return address to the trace stack on thread info.*/
393static int push_return_trace(unsigned long ret, unsigned long long time,
394 unsigned long func, int *depth)
395{
396 int index;
397
398 if (!current->ret_stack)
399 return -EBUSY;
400
401 /* The return trace stack is full */
402 if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
403 atomic_inc(&current->trace_overrun);
404 return -EBUSY;
405 }
406
407 index = ++current->curr_ret_stack;
408 barrier();
409 current->ret_stack[index].ret = ret;
410 current->ret_stack[index].func = func;
411 current->ret_stack[index].calltime = time;
412 *depth = index;
413
414 return 0;
415}
416
417/* Retrieve a function return address to the trace stack on thread info.*/
418static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
419{
420 int index;
421
422 index = current->curr_ret_stack;
423
424 if (unlikely(index < 0)) {
425 ftrace_graph_stop();
426 WARN_ON(1);
427 /* Might as well panic, otherwise we have no where to go */
428 *ret = (unsigned long)panic;
429 return;
430 }
431
432 *ret = current->ret_stack[index].ret;
433 trace->func = current->ret_stack[index].func;
434 trace->calltime = current->ret_stack[index].calltime;
435 trace->overrun = atomic_read(&current->trace_overrun);
436 trace->depth = index;
437 barrier();
438 current->curr_ret_stack--;
439
440}
441
442/*
443 * Send the trace to the ring-buffer.
444 * @return the original return address.
445 */
446unsigned long ftrace_return_to_handler(void)
447{
448 struct ftrace_graph_ret trace;
449 unsigned long ret;
450
451 pop_return_trace(&trace, &ret);
452 trace.rettime = cpu_clock(raw_smp_processor_id());
453 ftrace_graph_return(&trace);
454
455 if (unlikely(!ret)) {
456 ftrace_graph_stop();
457 WARN_ON(1);
458 /* Might as well panic. What else to do? */
459 ret = (unsigned long)panic;
460 }
461
462 return ret;
463}
464
465/* 389/*
466 * Hook the return address and push it in the stack of return addrs 390 * Hook the return address and push it in the stack of return addrs
467 * in current thread info. 391 * in current thread info.
@@ -476,7 +400,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
476 &return_to_handler; 400 &return_to_handler;
477 401
478 /* Nmi's are currently unsupported */ 402 /* Nmi's are currently unsupported */
479 if (unlikely(atomic_read(&in_nmi))) 403 if (unlikely(in_nmi()))
480 return; 404 return;
481 405
482 if (unlikely(atomic_read(&current->tracing_graph_pause))) 406 if (unlikely(atomic_read(&current->tracing_graph_pause)))
@@ -512,16 +436,9 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
512 return; 436 return;
513 } 437 }
514 438
515 if (unlikely(!__kernel_text_address(old))) { 439 calltime = trace_clock_local();
516 ftrace_graph_stop();
517 *parent = old;
518 WARN_ON(1);
519 return;
520 }
521
522 calltime = cpu_clock(raw_smp_processor_id());
523 440
524 if (push_return_trace(old, calltime, 441 if (ftrace_push_return_trace(old, calltime,
525 self_addr, &trace.depth) == -EBUSY) { 442 self_addr, &trace.depth) == -EBUSY) {
526 *parent = old; 443 *parent = old;
527 return; 444 return;
@@ -536,3 +453,66 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
536 } 453 }
537} 454}
538#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 455#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
456
457#ifdef CONFIG_FTRACE_SYSCALLS
458
459extern unsigned long __start_syscalls_metadata[];
460extern unsigned long __stop_syscalls_metadata[];
461extern unsigned long *sys_call_table;
462
463static struct syscall_metadata **syscalls_metadata;
464
465static struct syscall_metadata *find_syscall_meta(unsigned long *syscall)
466{
467 struct syscall_metadata *start;
468 struct syscall_metadata *stop;
469 char str[KSYM_SYMBOL_LEN];
470
471
472 start = (struct syscall_metadata *)__start_syscalls_metadata;
473 stop = (struct syscall_metadata *)__stop_syscalls_metadata;
474 kallsyms_lookup((unsigned long) syscall, NULL, NULL, NULL, str);
475
476 for ( ; start < stop; start++) {
477 if (start->name && !strcmp(start->name, str))
478 return start;
479 }
480 return NULL;
481}
482
483struct syscall_metadata *syscall_nr_to_meta(int nr)
484{
485 if (!syscalls_metadata || nr >= FTRACE_SYSCALL_MAX || nr < 0)
486 return NULL;
487
488 return syscalls_metadata[nr];
489}
490
491void arch_init_ftrace_syscalls(void)
492{
493 int i;
494 struct syscall_metadata *meta;
495 unsigned long **psys_syscall_table = &sys_call_table;
496 static atomic_t refs;
497
498 if (atomic_inc_return(&refs) != 1)
499 goto end;
500
501 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
502 FTRACE_SYSCALL_MAX, GFP_KERNEL);
503 if (!syscalls_metadata) {
504 WARN_ON(1);
505 return;
506 }
507
508 for (i = 0; i < FTRACE_SYSCALL_MAX; i++) {
509 meta = find_syscall_meta(psys_syscall_table[i]);
510 syscalls_metadata[i] = meta;
511 }
512 return;
513
514 /* Paranoid: avoid overflow */
515end:
516 atomic_dec(&refs);
517}
518#endif
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index b9a4d8c4b935..f5b272247690 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -26,27 +26,6 @@
26#include <asm/bios_ebda.h> 26#include <asm/bios_ebda.h>
27#include <asm/trampoline.h> 27#include <asm/trampoline.h>
28 28
29/* boot cpu pda */
30static struct x8664_pda _boot_cpu_pda;
31
32#ifdef CONFIG_SMP
33/*
34 * We install an empty cpu_pda pointer table to indicate to early users
35 * (numa_set_node) that the cpu_pda pointer table for cpus other than
36 * the boot cpu is not yet setup.
37 */
38static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
39#else
40static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
41#endif
42
43void __init x86_64_init_pda(void)
44{
45 _cpu_pda = __cpu_pda;
46 cpu_pda(0) = &_boot_cpu_pda;
47 pda_init(0);
48}
49
50static void __init zap_identity_mappings(void) 29static void __init zap_identity_mappings(void)
51{ 30{
52 pgd_t *pgd = pgd_offset_k(0UL); 31 pgd_t *pgd = pgd_offset_k(0UL);
@@ -112,8 +91,6 @@ void __init x86_64_start_kernel(char * real_mode_data)
112 if (console_loglevel == 10) 91 if (console_loglevel == 10)
113 early_printk("Kernel alive\n"); 92 early_printk("Kernel alive\n");
114 93
115 x86_64_init_pda();
116
117 x86_64_start_reservations(real_mode_data); 94 x86_64_start_reservations(real_mode_data);
118} 95}
119 96
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index e835b4eea70b..c32ca19d591a 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -11,14 +11,15 @@
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/linkage.h> 12#include <linux/linkage.h>
13#include <asm/segment.h> 13#include <asm/segment.h>
14#include <asm/page.h> 14#include <asm/page_types.h>
15#include <asm/pgtable.h> 15#include <asm/pgtable_types.h>
16#include <asm/desc.h> 16#include <asm/desc.h>
17#include <asm/cache.h> 17#include <asm/cache.h>
18#include <asm/thread_info.h> 18#include <asm/thread_info.h>
19#include <asm/asm-offsets.h> 19#include <asm/asm-offsets.h>
20#include <asm/setup.h> 20#include <asm/setup.h>
21#include <asm/processor-flags.h> 21#include <asm/processor-flags.h>
22#include <asm/percpu.h>
22 23
23/* Physical address */ 24/* Physical address */
24#define pa(X) ((X) - __PAGE_OFFSET) 25#define pa(X) ((X) - __PAGE_OFFSET)
@@ -429,14 +430,34 @@ is386: movl $2,%ecx # set MP
429 ljmp $(__KERNEL_CS),$1f 430 ljmp $(__KERNEL_CS),$1f
4301: movl $(__KERNEL_DS),%eax # reload all the segment registers 4311: movl $(__KERNEL_DS),%eax # reload all the segment registers
431 movl %eax,%ss # after changing gdt. 432 movl %eax,%ss # after changing gdt.
432 movl %eax,%fs # gets reset once there's real percpu
433 433
434 movl $(__USER_DS),%eax # DS/ES contains default USER segment 434 movl $(__USER_DS),%eax # DS/ES contains default USER segment
435 movl %eax,%ds 435 movl %eax,%ds
436 movl %eax,%es 436 movl %eax,%es
437 437
438 xorl %eax,%eax # Clear GS and LDT 438 movl $(__KERNEL_PERCPU), %eax
439 movl %eax,%fs # set this cpu's percpu
440
441#ifdef CONFIG_CC_STACKPROTECTOR
442 /*
443 * The linker can't handle this by relocation. Manually set
444 * base address in stack canary segment descriptor.
445 */
446 cmpb $0,ready
447 jne 1f
448 movl $per_cpu__gdt_page,%eax
449 movl $per_cpu__stack_canary,%ecx
450 subl $20, %ecx
451 movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
452 shrl $16, %ecx
453 movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
454 movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax)
4551:
456#endif
457 movl $(__KERNEL_STACK_CANARY),%eax
439 movl %eax,%gs 458 movl %eax,%gs
459
460 xorl %eax,%eax # Clear LDT
440 lldt %ax 461 lldt %ax
441 462
442 cld # gcc2 wants the direction flag cleared at all times 463 cld # gcc2 wants the direction flag cleared at all times
@@ -446,8 +467,6 @@ is386: movl $2,%ecx # set MP
446 movb $1, ready 467 movb $1, ready
447 cmpb $0,%cl # the first CPU calls start_kernel 468 cmpb $0,%cl # the first CPU calls start_kernel
448 je 1f 469 je 1f
449 movl $(__KERNEL_PERCPU), %eax
450 movl %eax,%fs # set this cpu's percpu
451 movl (stack_start), %esp 470 movl (stack_start), %esp
4521: 4711:
453#endif /* CONFIG_SMP */ 472#endif /* CONFIG_SMP */
@@ -548,12 +567,8 @@ early_fault:
548 pushl %eax 567 pushl %eax
549 pushl %edx /* trapno */ 568 pushl %edx /* trapno */
550 pushl $fault_msg 569 pushl $fault_msg
551#ifdef CONFIG_EARLY_PRINTK
552 call early_printk
553#else
554 call printk 570 call printk
555#endif 571#endif
556#endif
557 call dump_stack 572 call dump_stack
558hlt_loop: 573hlt_loop:
559 hlt 574 hlt
@@ -580,11 +595,10 @@ ignore_int:
580 pushl 32(%esp) 595 pushl 32(%esp)
581 pushl 40(%esp) 596 pushl 40(%esp)
582 pushl $int_msg 597 pushl $int_msg
583#ifdef CONFIG_EARLY_PRINTK
584 call early_printk
585#else
586 call printk 598 call printk
587#endif 599
600 call dump_stack
601
588 addl $(5*4),%esp 602 addl $(5*4),%esp
589 popl %ds 603 popl %ds
590 popl %es 604 popl %es
@@ -660,7 +674,7 @@ early_recursion_flag:
660 .long 0 674 .long 0
661 675
662int_msg: 676int_msg:
663 .asciz "Unknown interrupt or fault at EIP %p %p %p\n" 677 .asciz "Unknown interrupt or fault at: %p %p %p\n"
664 678
665fault_msg: 679fault_msg:
666/* fault info: */ 680/* fault info: */
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 0e275d495563..54b29bb24e71 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -19,6 +19,7 @@
19#include <asm/msr.h> 19#include <asm/msr.h>
20#include <asm/cache.h> 20#include <asm/cache.h>
21#include <asm/processor-flags.h> 21#include <asm/processor-flags.h>
22#include <asm/percpu.h>
22 23
23#ifdef CONFIG_PARAVIRT 24#ifdef CONFIG_PARAVIRT
24#include <asm/asm-offsets.h> 25#include <asm/asm-offsets.h>
@@ -226,12 +227,15 @@ ENTRY(secondary_startup_64)
226 movl %eax,%fs 227 movl %eax,%fs
227 movl %eax,%gs 228 movl %eax,%gs
228 229
229 /* 230 /* Set up %gs.
230 * Setup up a dummy PDA. this is just for some early bootup code 231 *
231 * that does in_interrupt() 232 * The base of %gs always points to the bottom of the irqstack
232 */ 233 * union. If the stack protector canary is enabled, it is
234 * located at %gs:40. Note that, on SMP, the boot cpu uses
235 * init data section till per cpu areas are set up.
236 */
233 movl $MSR_GS_BASE,%ecx 237 movl $MSR_GS_BASE,%ecx
234 movq $empty_zero_page,%rax 238 movq initial_gs(%rip),%rax
235 movq %rax,%rdx 239 movq %rax,%rdx
236 shrq $32,%rdx 240 shrq $32,%rdx
237 wrmsr 241 wrmsr
@@ -257,6 +261,8 @@ ENTRY(secondary_startup_64)
257 .align 8 261 .align 8
258 ENTRY(initial_code) 262 ENTRY(initial_code)
259 .quad x86_64_start_kernel 263 .quad x86_64_start_kernel
264 ENTRY(initial_gs)
265 .quad INIT_PER_CPU_VAR(irq_stack_union)
260 __FINITDATA 266 __FINITDATA
261 267
262 ENTRY(stack_start) 268 ENTRY(stack_start)
@@ -323,8 +329,6 @@ early_idt_ripmsg:
323#endif /* CONFIG_EARLY_PRINTK */ 329#endif /* CONFIG_EARLY_PRINTK */
324 .previous 330 .previous
325 331
326.balign PAGE_SIZE
327
328#define NEXT_PAGE(name) \ 332#define NEXT_PAGE(name) \
329 .balign PAGE_SIZE; \ 333 .balign PAGE_SIZE; \
330ENTRY(name) 334ENTRY(name)
@@ -401,7 +405,8 @@ NEXT_PAGE(level2_spare_pgt)
401 .globl early_gdt_descr 405 .globl early_gdt_descr
402early_gdt_descr: 406early_gdt_descr:
403 .word GDT_ENTRIES*8-1 407 .word GDT_ENTRIES*8-1
404 .quad per_cpu__gdt_page 408early_gdt_descr_base:
409 .quad INIT_PER_CPU_VAR(gdt_page)
405 410
406ENTRY(phys_base) 411ENTRY(phys_base)
407 /* This must match the first entry in level2_kernel_pgt */ 412 /* This must match the first entry in level2_kernel_pgt */
@@ -412,7 +417,7 @@ ENTRY(phys_base)
412 .section .bss, "aw", @nobits 417 .section .bss, "aw", @nobits
413 .align L1_CACHE_BYTES 418 .align L1_CACHE_BYTES
414ENTRY(idt_table) 419ENTRY(idt_table)
415 .skip 256 * 16 420 .skip IDT_ENTRIES * 16
416 421
417 .section .bss.page_aligned, "aw", @nobits 422 .section .bss.page_aligned, "aw", @nobits
418 .align PAGE_SIZE 423 .align PAGE_SIZE
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 11d5093eb281..df89102bef80 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -22,7 +22,6 @@
22#include <asm/pgtable.h> 22#include <asm/pgtable.h>
23#include <asm/desc.h> 23#include <asm/desc.h>
24#include <asm/apic.h> 24#include <asm/apic.h>
25#include <asm/arch_hooks.h>
26#include <asm/i8259.h> 25#include <asm/i8259.h>
27 26
28/* 27/*
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index b12208f4dfee..99c4d308f16b 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -85,19 +85,8 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
85 85
86 t->io_bitmap_max = bytes; 86 t->io_bitmap_max = bytes;
87 87
88#ifdef CONFIG_X86_32
89 /*
90 * Sets the lazy trigger so that the next I/O operation will
91 * reload the correct bitmap.
92 * Reset the owner so that a process switch will not set
93 * tss->io_bitmap_base to IO_BITMAP_OFFSET.
94 */
95 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
96 tss->io_bitmap_owner = NULL;
97#else
98 /* Update the TSS: */ 88 /* Update the TSS: */
99 memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated); 89 memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
100#endif
101 90
102 put_cpu(); 91 put_cpu();
103 92
@@ -131,9 +120,8 @@ static int do_iopl(unsigned int level, struct pt_regs *regs)
131} 120}
132 121
133#ifdef CONFIG_X86_32 122#ifdef CONFIG_X86_32
134asmlinkage long sys_iopl(unsigned long regsp) 123long sys_iopl(struct pt_regs *regs)
135{ 124{
136 struct pt_regs *regs = (struct pt_regs *)&regsp;
137 unsigned int level = regs->bx; 125 unsigned int level = regs->bx;
138 struct thread_struct *t = &current->thread; 126 struct thread_struct *t = &current->thread;
139 int rc; 127 int rc;
diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c
deleted file mode 100644
index 285bbf8831fa..000000000000
--- a/arch/x86/kernel/ipi.c
+++ /dev/null
@@ -1,190 +0,0 @@
1#include <linux/cpumask.h>
2#include <linux/interrupt.h>
3#include <linux/init.h>
4
5#include <linux/mm.h>
6#include <linux/delay.h>
7#include <linux/spinlock.h>
8#include <linux/kernel_stat.h>
9#include <linux/mc146818rtc.h>
10#include <linux/cache.h>
11#include <linux/cpu.h>
12#include <linux/module.h>
13
14#include <asm/smp.h>
15#include <asm/mtrr.h>
16#include <asm/tlbflush.h>
17#include <asm/mmu_context.h>
18#include <asm/apic.h>
19#include <asm/proto.h>
20
21#ifdef CONFIG_X86_32
22#include <mach_apic.h>
23#include <mach_ipi.h>
24
25/*
26 * the following functions deal with sending IPIs between CPUs.
27 *
28 * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
29 */
30
31static inline int __prepare_ICR(unsigned int shortcut, int vector)
32{
33 unsigned int icr = shortcut | APIC_DEST_LOGICAL;
34
35 switch (vector) {
36 default:
37 icr |= APIC_DM_FIXED | vector;
38 break;
39 case NMI_VECTOR:
40 icr |= APIC_DM_NMI;
41 break;
42 }
43 return icr;
44}
45
46static inline int __prepare_ICR2(unsigned int mask)
47{
48 return SET_APIC_DEST_FIELD(mask);
49}
50
51void __send_IPI_shortcut(unsigned int shortcut, int vector)
52{
53 /*
54 * Subtle. In the case of the 'never do double writes' workaround
55 * we have to lock out interrupts to be safe. As we don't care
56 * of the value read we use an atomic rmw access to avoid costly
57 * cli/sti. Otherwise we use an even cheaper single atomic write
58 * to the APIC.
59 */
60 unsigned int cfg;
61
62 /*
63 * Wait for idle.
64 */
65 apic_wait_icr_idle();
66
67 /*
68 * No need to touch the target chip field
69 */
70 cfg = __prepare_ICR(shortcut, vector);
71
72 /*
73 * Send the IPI. The write to APIC_ICR fires this off.
74 */
75 apic_write(APIC_ICR, cfg);
76}
77
78void send_IPI_self(int vector)
79{
80 __send_IPI_shortcut(APIC_DEST_SELF, vector);
81}
82
83/*
84 * This is used to send an IPI with no shorthand notation (the destination is
85 * specified in bits 56 to 63 of the ICR).
86 */
87static inline void __send_IPI_dest_field(unsigned long mask, int vector)
88{
89 unsigned long cfg;
90
91 /*
92 * Wait for idle.
93 */
94 if (unlikely(vector == NMI_VECTOR))
95 safe_apic_wait_icr_idle();
96 else
97 apic_wait_icr_idle();
98
99 /*
100 * prepare target chip field
101 */
102 cfg = __prepare_ICR2(mask);
103 apic_write(APIC_ICR2, cfg);
104
105 /*
106 * program the ICR
107 */
108 cfg = __prepare_ICR(0, vector);
109
110 /*
111 * Send the IPI. The write to APIC_ICR fires this off.
112 */
113 apic_write(APIC_ICR, cfg);
114}
115
116/*
117 * This is only used on smaller machines.
118 */
119void send_IPI_mask_bitmask(const struct cpumask *cpumask, int vector)
120{
121 unsigned long mask = cpumask_bits(cpumask)[0];
122 unsigned long flags;
123
124 local_irq_save(flags);
125 WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]);
126 __send_IPI_dest_field(mask, vector);
127 local_irq_restore(flags);
128}
129
130void send_IPI_mask_sequence(const struct cpumask *mask, int vector)
131{
132 unsigned long flags;
133 unsigned int query_cpu;
134
135 /*
136 * Hack. The clustered APIC addressing mode doesn't allow us to send
137 * to an arbitrary mask, so I do a unicasts to each CPU instead. This
138 * should be modified to do 1 message per cluster ID - mbligh
139 */
140
141 local_irq_save(flags);
142 for_each_cpu(query_cpu, mask)
143 __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), vector);
144 local_irq_restore(flags);
145}
146
147void send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
148{
149 unsigned long flags;
150 unsigned int query_cpu;
151 unsigned int this_cpu = smp_processor_id();
152
153 /* See Hack comment above */
154
155 local_irq_save(flags);
156 for_each_cpu(query_cpu, mask)
157 if (query_cpu != this_cpu)
158 __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu),
159 vector);
160 local_irq_restore(flags);
161}
162
163/* must come after the send_IPI functions above for inlining */
164static int convert_apicid_to_cpu(int apic_id)
165{
166 int i;
167
168 for_each_possible_cpu(i) {
169 if (per_cpu(x86_cpu_to_apicid, i) == apic_id)
170 return i;
171 }
172 return -1;
173}
174
175int safe_smp_processor_id(void)
176{
177 int apicid, cpuid;
178
179 if (!boot_cpu_has(X86_FEATURE_APIC))
180 return 0;
181
182 apicid = hard_smp_processor_id();
183 if (apicid == BAD_APICID)
184 return 0;
185
186 cpuid = convert_apicid_to_cpu(apicid);
187
188 return cpuid >= 0 ? cpuid : 0;
189}
190#endif
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 3973e2df7f87..b864341dcc45 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -6,13 +6,18 @@
6#include <linux/kernel_stat.h> 6#include <linux/kernel_stat.h>
7#include <linux/seq_file.h> 7#include <linux/seq_file.h>
8#include <linux/smp.h> 8#include <linux/smp.h>
9#include <linux/ftrace.h>
9 10
10#include <asm/apic.h> 11#include <asm/apic.h>
11#include <asm/io_apic.h> 12#include <asm/io_apic.h>
12#include <asm/irq.h> 13#include <asm/irq.h>
14#include <asm/idle.h>
13 15
14atomic_t irq_err_count; 16atomic_t irq_err_count;
15 17
18/* Function pointer for generic interrupt vector handling */
19void (*generic_interrupt_extension)(void) = NULL;
20
16/* 21/*
17 * 'what should we do if we get a hw irq event on an illegal vector'. 22 * 'what should we do if we get a hw irq event on an illegal vector'.
18 * each architecture has to answer this themselves. 23 * each architecture has to answer this themselves.
@@ -36,11 +41,7 @@ void ack_bad_irq(unsigned int irq)
36#endif 41#endif
37} 42}
38 43
39#ifdef CONFIG_X86_32 44#define irq_stats(x) (&per_cpu(irq_stat, x))
40# define irq_stats(x) (&per_cpu(irq_stat, x))
41#else
42# define irq_stats(x) cpu_pda(x)
43#endif
44/* 45/*
45 * /proc/interrupts printing: 46 * /proc/interrupts printing:
46 */ 47 */
@@ -58,6 +59,12 @@ static int show_other_interrupts(struct seq_file *p)
58 seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); 59 seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
59 seq_printf(p, " Local timer interrupts\n"); 60 seq_printf(p, " Local timer interrupts\n");
60#endif 61#endif
62 if (generic_interrupt_extension) {
63 seq_printf(p, "PLT: ");
64 for_each_online_cpu(j)
65 seq_printf(p, "%10u ", irq_stats(j)->generic_irqs);
66 seq_printf(p, " Platform interrupts\n");
67 }
61#ifdef CONFIG_SMP 68#ifdef CONFIG_SMP
62 seq_printf(p, "RES: "); 69 seq_printf(p, "RES: ");
63 for_each_online_cpu(j) 70 for_each_online_cpu(j)
@@ -165,6 +172,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
165#ifdef CONFIG_X86_LOCAL_APIC 172#ifdef CONFIG_X86_LOCAL_APIC
166 sum += irq_stats(cpu)->apic_timer_irqs; 173 sum += irq_stats(cpu)->apic_timer_irqs;
167#endif 174#endif
175 if (generic_interrupt_extension)
176 sum += irq_stats(cpu)->generic_irqs;
168#ifdef CONFIG_SMP 177#ifdef CONFIG_SMP
169 sum += irq_stats(cpu)->irq_resched_count; 178 sum += irq_stats(cpu)->irq_resched_count;
170 sum += irq_stats(cpu)->irq_call_count; 179 sum += irq_stats(cpu)->irq_call_count;
@@ -192,4 +201,63 @@ u64 arch_irq_stat(void)
192 return sum; 201 return sum;
193} 202}
194 203
204
205/*
206 * do_IRQ handles all normal device IRQ's (the special
207 * SMP cross-CPU interrupts have their own specific
208 * handlers).
209 */
210unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
211{
212 struct pt_regs *old_regs = set_irq_regs(regs);
213
214 /* high bit used in ret_from_ code */
215 unsigned vector = ~regs->orig_ax;
216 unsigned irq;
217
218 exit_idle();
219 irq_enter();
220
221 irq = __get_cpu_var(vector_irq)[vector];
222
223 if (!handle_irq(irq, regs)) {
224#ifdef CONFIG_X86_64
225 if (!disable_apic)
226 ack_APIC_irq();
227#endif
228
229 if (printk_ratelimit())
230 printk(KERN_EMERG "%s: %d.%d No irq handler for vector (irq %d)\n",
231 __func__, smp_processor_id(), vector, irq);
232 }
233
234 irq_exit();
235
236 set_irq_regs(old_regs);
237 return 1;
238}
239
240/*
241 * Handler for GENERIC_INTERRUPT_VECTOR.
242 */
243void smp_generic_interrupt(struct pt_regs *regs)
244{
245 struct pt_regs *old_regs = set_irq_regs(regs);
246
247 ack_APIC_irq();
248
249 exit_idle();
250
251 irq_enter();
252
253 inc_irq_stat(generic_irqs);
254
255 if (generic_interrupt_extension)
256 generic_interrupt_extension();
257
258 irq_exit();
259
260 set_irq_regs(old_regs);
261}
262
195EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); 263EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 74b9ff7341e9..3b09634a5153 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -16,6 +16,7 @@
16#include <linux/cpu.h> 16#include <linux/cpu.h>
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/uaccess.h> 18#include <linux/uaccess.h>
19#include <linux/percpu.h>
19 20
20#include <asm/apic.h> 21#include <asm/apic.h>
21 22
@@ -55,13 +56,13 @@ static inline void print_stack_overflow(void) { }
55union irq_ctx { 56union irq_ctx {
56 struct thread_info tinfo; 57 struct thread_info tinfo;
57 u32 stack[THREAD_SIZE/sizeof(u32)]; 58 u32 stack[THREAD_SIZE/sizeof(u32)];
58}; 59} __attribute__((aligned(PAGE_SIZE)));
59 60
60static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; 61static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx);
61static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; 62static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx);
62 63
63static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; 64static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack);
64static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; 65static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack);
65 66
66static void call_on_stack(void *func, void *stack) 67static void call_on_stack(void *func, void *stack)
67{ 68{
@@ -81,7 +82,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
81 u32 *isp, arg1, arg2; 82 u32 *isp, arg1, arg2;
82 83
83 curctx = (union irq_ctx *) current_thread_info(); 84 curctx = (union irq_ctx *) current_thread_info();
84 irqctx = hardirq_ctx[smp_processor_id()]; 85 irqctx = __get_cpu_var(hardirq_ctx);
85 86
86 /* 87 /*
87 * this is where we switch to the IRQ stack. However, if we are 88 * this is where we switch to the IRQ stack. However, if we are
@@ -125,34 +126,34 @@ void __cpuinit irq_ctx_init(int cpu)
125{ 126{
126 union irq_ctx *irqctx; 127 union irq_ctx *irqctx;
127 128
128 if (hardirq_ctx[cpu]) 129 if (per_cpu(hardirq_ctx, cpu))
129 return; 130 return;
130 131
131 irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE]; 132 irqctx = &per_cpu(hardirq_stack, cpu);
132 irqctx->tinfo.task = NULL; 133 irqctx->tinfo.task = NULL;
133 irqctx->tinfo.exec_domain = NULL; 134 irqctx->tinfo.exec_domain = NULL;
134 irqctx->tinfo.cpu = cpu; 135 irqctx->tinfo.cpu = cpu;
135 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; 136 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
136 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 137 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
137 138
138 hardirq_ctx[cpu] = irqctx; 139 per_cpu(hardirq_ctx, cpu) = irqctx;
139 140
140 irqctx = (union irq_ctx *) &softirq_stack[cpu*THREAD_SIZE]; 141 irqctx = &per_cpu(softirq_stack, cpu);
141 irqctx->tinfo.task = NULL; 142 irqctx->tinfo.task = NULL;
142 irqctx->tinfo.exec_domain = NULL; 143 irqctx->tinfo.exec_domain = NULL;
143 irqctx->tinfo.cpu = cpu; 144 irqctx->tinfo.cpu = cpu;
144 irqctx->tinfo.preempt_count = 0; 145 irqctx->tinfo.preempt_count = 0;
145 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 146 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
146 147
147 softirq_ctx[cpu] = irqctx; 148 per_cpu(softirq_ctx, cpu) = irqctx;
148 149
149 printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n", 150 printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
150 cpu, hardirq_ctx[cpu], softirq_ctx[cpu]); 151 cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu));
151} 152}
152 153
153void irq_ctx_exit(int cpu) 154void irq_ctx_exit(int cpu)
154{ 155{
155 hardirq_ctx[cpu] = NULL; 156 per_cpu(hardirq_ctx, cpu) = NULL;
156} 157}
157 158
158asmlinkage void do_softirq(void) 159asmlinkage void do_softirq(void)
@@ -169,7 +170,7 @@ asmlinkage void do_softirq(void)
169 170
170 if (local_softirq_pending()) { 171 if (local_softirq_pending()) {
171 curctx = current_thread_info(); 172 curctx = current_thread_info();
172 irqctx = softirq_ctx[smp_processor_id()]; 173 irqctx = __get_cpu_var(softirq_ctx);
173 irqctx->tinfo.task = curctx->task; 174 irqctx->tinfo.task = curctx->task;
174 irqctx->tinfo.previous_esp = current_stack_pointer; 175 irqctx->tinfo.previous_esp = current_stack_pointer;
175 176
@@ -191,33 +192,16 @@ static inline int
191execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; } 192execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; }
192#endif 193#endif
193 194
194/* 195bool handle_irq(unsigned irq, struct pt_regs *regs)
195 * do_IRQ handles all normal device IRQ's (the special
196 * SMP cross-CPU interrupts have their own specific
197 * handlers).
198 */
199unsigned int do_IRQ(struct pt_regs *regs)
200{ 196{
201 struct pt_regs *old_regs;
202 /* high bit used in ret_from_ code */
203 int overflow;
204 unsigned vector = ~regs->orig_ax;
205 struct irq_desc *desc; 197 struct irq_desc *desc;
206 unsigned irq; 198 int overflow;
207
208
209 old_regs = set_irq_regs(regs);
210 irq_enter();
211 irq = __get_cpu_var(vector_irq)[vector];
212 199
213 overflow = check_stack_overflow(); 200 overflow = check_stack_overflow();
214 201
215 desc = irq_to_desc(irq); 202 desc = irq_to_desc(irq);
216 if (unlikely(!desc)) { 203 if (unlikely(!desc))
217 printk(KERN_EMERG "%s: cannot handle IRQ %d vector %#x cpu %d\n", 204 return false;
218 __func__, irq, vector, smp_processor_id());
219 BUG();
220 }
221 205
222 if (!execute_on_irq_stack(overflow, desc, irq)) { 206 if (!execute_on_irq_stack(overflow, desc, irq)) {
223 if (unlikely(overflow)) 207 if (unlikely(overflow))
@@ -225,13 +209,10 @@ unsigned int do_IRQ(struct pt_regs *regs)
225 desc->handle_irq(irq, desc); 209 desc->handle_irq(irq, desc);
226 } 210 }
227 211
228 irq_exit(); 212 return true;
229 set_irq_regs(old_regs);
230 return 1;
231} 213}
232 214
233#ifdef CONFIG_HOTPLUG_CPU 215#ifdef CONFIG_HOTPLUG_CPU
234#include <mach_apic.h>
235 216
236/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ 217/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
237void fixup_irqs(void) 218void fixup_irqs(void)
@@ -248,7 +229,7 @@ void fixup_irqs(void)
248 if (irq == 2) 229 if (irq == 2)
249 continue; 230 continue;
250 231
251 affinity = &desc->affinity; 232 affinity = desc->affinity;
252 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { 233 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
253 printk("Breaking affinity for irq %i\n", irq); 234 printk("Breaking affinity for irq %i\n", irq);
254 affinity = cpu_all_mask; 235 affinity = cpu_all_mask;
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 63c88e6ec025..977d8b43a0dd 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -18,6 +18,13 @@
18#include <linux/smp.h> 18#include <linux/smp.h>
19#include <asm/io_apic.h> 19#include <asm/io_apic.h>
20#include <asm/idle.h> 20#include <asm/idle.h>
21#include <asm/apic.h>
22
23DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
24EXPORT_PER_CPU_SYMBOL(irq_stat);
25
26DEFINE_PER_CPU(struct pt_regs *, irq_regs);
27EXPORT_PER_CPU_SYMBOL(irq_regs);
21 28
22/* 29/*
23 * Probabilistic stack overflow check: 30 * Probabilistic stack overflow check:
@@ -41,42 +48,18 @@ static inline void stack_overflow_check(struct pt_regs *regs)
41#endif 48#endif
42} 49}
43 50
44/* 51bool handle_irq(unsigned irq, struct pt_regs *regs)
45 * do_IRQ handles all normal device IRQ's (the special
46 * SMP cross-CPU interrupts have their own specific
47 * handlers).
48 */
49asmlinkage unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
50{ 52{
51 struct pt_regs *old_regs = set_irq_regs(regs);
52 struct irq_desc *desc; 53 struct irq_desc *desc;
53 54
54 /* high bit used in ret_from_ code */
55 unsigned vector = ~regs->orig_ax;
56 unsigned irq;
57
58 exit_idle();
59 irq_enter();
60 irq = __get_cpu_var(vector_irq)[vector];
61
62 stack_overflow_check(regs); 55 stack_overflow_check(regs);
63 56
64 desc = irq_to_desc(irq); 57 desc = irq_to_desc(irq);
65 if (likely(desc)) 58 if (unlikely(!desc))
66 generic_handle_irq_desc(irq, desc); 59 return false;
67 else {
68 if (!disable_apic)
69 ack_APIC_irq();
70
71 if (printk_ratelimit())
72 printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n",
73 __func__, smp_processor_id(), vector);
74 }
75
76 irq_exit();
77 60
78 set_irq_regs(old_regs); 61 generic_handle_irq_desc(irq, desc);
79 return 1; 62 return true;
80} 63}
81 64
82#ifdef CONFIG_HOTPLUG_CPU 65#ifdef CONFIG_HOTPLUG_CPU
@@ -100,7 +83,7 @@ void fixup_irqs(void)
100 /* interrupt's are disabled at this point */ 83 /* interrupt's are disabled at this point */
101 spin_lock(&desc->lock); 84 spin_lock(&desc->lock);
102 85
103 affinity = &desc->affinity; 86 affinity = desc->affinity;
104 if (!irq_has_action(irq) || 87 if (!irq_has_action(irq) ||
105 cpumask_equal(affinity, cpu_online_mask)) { 88 cpumask_equal(affinity, cpu_online_mask)) {
106 spin_unlock(&desc->lock); 89 spin_unlock(&desc->lock);
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 10a09c2f1828..bc1326105448 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -18,7 +18,7 @@
18#include <asm/pgtable.h> 18#include <asm/pgtable.h>
19#include <asm/desc.h> 19#include <asm/desc.h>
20#include <asm/apic.h> 20#include <asm/apic.h>
21#include <asm/arch_hooks.h> 21#include <asm/setup.h>
22#include <asm/i8259.h> 22#include <asm/i8259.h>
23#include <asm/traps.h> 23#include <asm/traps.h>
24 24
@@ -78,6 +78,15 @@ void __init init_ISA_irqs(void)
78 } 78 }
79} 79}
80 80
81/*
82 * IRQ2 is cascade interrupt to second interrupt controller
83 */
84static struct irqaction irq2 = {
85 .handler = no_action,
86 .mask = CPU_MASK_NONE,
87 .name = "cascade",
88};
89
81DEFINE_PER_CPU(vector_irq_t, vector_irq) = { 90DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
82 [0 ... IRQ0_VECTOR - 1] = -1, 91 [0 ... IRQ0_VECTOR - 1] = -1,
83 [IRQ0_VECTOR] = 0, 92 [IRQ0_VECTOR] = 0,
@@ -118,8 +127,8 @@ void __init native_init_IRQ(void)
118{ 127{
119 int i; 128 int i;
120 129
121 /* all the set up before the call gates are initialised */ 130 /* Execute any quirks before the call gates are initialised: */
122 pre_intr_init_hook(); 131 x86_quirk_pre_intr_init();
123 132
124 /* 133 /*
125 * Cover the whole vector space, no vector can escape 134 * Cover the whole vector space, no vector can escape
@@ -140,8 +149,15 @@ void __init native_init_IRQ(void)
140 */ 149 */
141 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); 150 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
142 151
143 /* IPI for invalidation */ 152 /* IPIs for invalidation */
144 alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); 153 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
154 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
155 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
156 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
157 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
158 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
159 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
160 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
145 161
146 /* IPI for generic function call */ 162 /* IPI for generic function call */
147 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); 163 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
@@ -159,6 +175,9 @@ void __init native_init_IRQ(void)
159 /* self generated IPI for local APIC timer */ 175 /* self generated IPI for local APIC timer */
160 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); 176 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
161 177
178 /* generic IPI for platform specific use */
179 alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
180
162 /* IPI vectors for APIC spurious and error interrupts */ 181 /* IPI vectors for APIC spurious and error interrupts */
163 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 182 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
164 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 183 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
@@ -169,10 +188,14 @@ void __init native_init_IRQ(void)
169 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); 188 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
170#endif 189#endif
171 190
172 /* setup after call gates are initialised (usually add in 191 if (!acpi_ioapic)
173 * the architecture specific gates) 192 setup_irq(2, &irq2);
193
194 /*
195 * Call quirks after call gates are initialised (usually add in
196 * the architecture specific gates):
174 */ 197 */
175 intr_init_hook(); 198 x86_quirk_intr_init();
176 199
177 /* 200 /*
178 * External FPU? Set up irq13 if so, for 201 * External FPU? Set up irq13 if so, for
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index da481a1e3f30..c7a49e0ffbfb 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -147,6 +147,9 @@ static void __init apic_intr_init(void)
147 /* self generated IPI for local APIC timer */ 147 /* self generated IPI for local APIC timer */
148 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); 148 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
149 149
150 /* generic IPI for platform specific use */
151 alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
152
150 /* IPI vectors for APIC spurious and error interrupts */ 153 /* IPI vectors for APIC spurious and error interrupts */
151 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 154 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
152 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 155 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 10435a120d22..eedfaebe1063 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -46,7 +46,7 @@
46#include <asm/apicdef.h> 46#include <asm/apicdef.h>
47#include <asm/system.h> 47#include <asm/system.h>
48 48
49#include <mach_ipi.h> 49#include <asm/apic.h>
50 50
51/* 51/*
52 * Put the error code here just in case the user cares: 52 * Put the error code here just in case the user cares:
@@ -347,7 +347,7 @@ void kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code)
347 */ 347 */
348void kgdb_roundup_cpus(unsigned long flags) 348void kgdb_roundup_cpus(unsigned long flags)
349{ 349{
350 send_IPI_allbutself(APIC_DM_NMI); 350 apic->send_IPI_allbutself(APIC_DM_NMI);
351} 351}
352#endif 352#endif
353 353
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 652fce6d2cce..137f2e8132df 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -19,7 +19,6 @@
19#include <linux/clocksource.h> 19#include <linux/clocksource.h>
20#include <linux/kvm_para.h> 20#include <linux/kvm_para.h>
21#include <asm/pvclock.h> 21#include <asm/pvclock.h>
22#include <asm/arch_hooks.h>
23#include <asm/msr.h> 22#include <asm/msr.h>
24#include <asm/apic.h> 23#include <asm/apic.h>
25#include <linux/percpu.h> 24#include <linux/percpu.h>
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 37f420018a41..e7368c1da01d 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -14,12 +14,12 @@
14#include <linux/ftrace.h> 14#include <linux/ftrace.h>
15#include <linux/suspend.h> 15#include <linux/suspend.h>
16#include <linux/gfp.h> 16#include <linux/gfp.h>
17#include <linux/io.h>
17 18
18#include <asm/pgtable.h> 19#include <asm/pgtable.h>
19#include <asm/pgalloc.h> 20#include <asm/pgalloc.h>
20#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
21#include <asm/mmu_context.h> 22#include <asm/mmu_context.h>
22#include <asm/io.h>
23#include <asm/apic.h> 23#include <asm/apic.h>
24#include <asm/cpufeature.h> 24#include <asm/cpufeature.h>
25#include <asm/desc.h> 25#include <asm/desc.h>
@@ -63,7 +63,7 @@ static void load_segments(void)
63 "\tmovl %%eax,%%fs\n" 63 "\tmovl %%eax,%%fs\n"
64 "\tmovl %%eax,%%gs\n" 64 "\tmovl %%eax,%%gs\n"
65 "\tmovl %%eax,%%ss\n" 65 "\tmovl %%eax,%%ss\n"
66 ::: "eax", "memory"); 66 : : : "eax", "memory");
67#undef STR 67#undef STR
68#undef __STR 68#undef __STR
69} 69}
@@ -121,7 +121,7 @@ static void machine_kexec_page_table_set_one(
121static void machine_kexec_prepare_page_tables(struct kimage *image) 121static void machine_kexec_prepare_page_tables(struct kimage *image)
122{ 122{
123 void *control_page; 123 void *control_page;
124 pmd_t *pmd = 0; 124 pmd_t *pmd = NULL;
125 125
126 control_page = page_address(image->control_code_page); 126 control_page = page_address(image->control_code_page);
127#ifdef CONFIG_X86_PAE 127#ifdef CONFIG_X86_PAE
@@ -205,7 +205,8 @@ void machine_kexec(struct kimage *image)
205 205
206 if (image->preserve_context) { 206 if (image->preserve_context) {
207#ifdef CONFIG_X86_IO_APIC 207#ifdef CONFIG_X86_IO_APIC
208 /* We need to put APICs in legacy mode so that we can 208 /*
209 * We need to put APICs in legacy mode so that we can
209 * get timer interrupts in second kernel. kexec/kdump 210 * get timer interrupts in second kernel. kexec/kdump
210 * paths already have calls to disable_IO_APIC() in 211 * paths already have calls to disable_IO_APIC() in
211 * one form or other. kexec jump path also need 212 * one form or other. kexec jump path also need
@@ -227,7 +228,8 @@ void machine_kexec(struct kimage *image)
227 page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) 228 page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
228 << PAGE_SHIFT); 229 << PAGE_SHIFT);
229 230
230 /* The segment registers are funny things, they have both a 231 /*
232 * The segment registers are funny things, they have both a
231 * visible and an invisible part. Whenever the visible part is 233 * visible and an invisible part. Whenever the visible part is
232 * set to a specific selector, the invisible part is loaded 234 * set to a specific selector, the invisible part is loaded
233 * with from a table in memory. At no other time is the 235 * with from a table in memory. At no other time is the
@@ -237,11 +239,12 @@ void machine_kexec(struct kimage *image)
237 * segments, before I zap the gdt with an invalid value. 239 * segments, before I zap the gdt with an invalid value.
238 */ 240 */
239 load_segments(); 241 load_segments();
240 /* The gdt & idt are now invalid. 242 /*
243 * The gdt & idt are now invalid.
241 * If you want to load them you must set up your own idt & gdt. 244 * If you want to load them you must set up your own idt & gdt.
242 */ 245 */
243 set_gdt(phys_to_virt(0),0); 246 set_gdt(phys_to_virt(0), 0);
244 set_idt(phys_to_virt(0),0); 247 set_idt(phys_to_virt(0), 0);
245 248
246 /* now call it */ 249 /* now call it */
247 image->start = relocate_kernel_ptr((unsigned long)image->head, 250 image->start = relocate_kernel_ptr((unsigned long)image->head,
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index c43caa3a91f3..89cea4d44679 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -12,20 +12,47 @@
12#include <linux/reboot.h> 12#include <linux/reboot.h>
13#include <linux/numa.h> 13#include <linux/numa.h>
14#include <linux/ftrace.h> 14#include <linux/ftrace.h>
15#include <linux/io.h>
16#include <linux/suspend.h>
15 17
16#include <asm/pgtable.h> 18#include <asm/pgtable.h>
17#include <asm/tlbflush.h> 19#include <asm/tlbflush.h>
18#include <asm/mmu_context.h> 20#include <asm/mmu_context.h>
19#include <asm/io.h>
20 21
21#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) 22static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
22static u64 kexec_pgd[512] PAGE_ALIGNED; 23 unsigned long addr)
23static u64 kexec_pud0[512] PAGE_ALIGNED; 24{
24static u64 kexec_pmd0[512] PAGE_ALIGNED; 25 pud_t *pud;
25static u64 kexec_pte0[512] PAGE_ALIGNED; 26 pmd_t *pmd;
26static u64 kexec_pud1[512] PAGE_ALIGNED; 27 struct page *page;
27static u64 kexec_pmd1[512] PAGE_ALIGNED; 28 int result = -ENOMEM;
28static u64 kexec_pte1[512] PAGE_ALIGNED; 29
30 addr &= PMD_MASK;
31 pgd += pgd_index(addr);
32 if (!pgd_present(*pgd)) {
33 page = kimage_alloc_control_pages(image, 0);
34 if (!page)
35 goto out;
36 pud = (pud_t *)page_address(page);
37 memset(pud, 0, PAGE_SIZE);
38 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
39 }
40 pud = pud_offset(pgd, addr);
41 if (!pud_present(*pud)) {
42 page = kimage_alloc_control_pages(image, 0);
43 if (!page)
44 goto out;
45 pmd = (pmd_t *)page_address(page);
46 memset(pmd, 0, PAGE_SIZE);
47 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
48 }
49 pmd = pmd_offset(pud, addr);
50 if (!pmd_present(*pmd))
51 set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
52 result = 0;
53out:
54 return result;
55}
29 56
30static void init_level2_page(pmd_t *level2p, unsigned long addr) 57static void init_level2_page(pmd_t *level2p, unsigned long addr)
31{ 58{
@@ -92,9 +119,8 @@ static int init_level4_page(struct kimage *image, pgd_t *level4p,
92 } 119 }
93 level3p = (pud_t *)page_address(page); 120 level3p = (pud_t *)page_address(page);
94 result = init_level3_page(image, level3p, addr, last_addr); 121 result = init_level3_page(image, level3p, addr, last_addr);
95 if (result) { 122 if (result)
96 goto out; 123 goto out;
97 }
98 set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE)); 124 set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
99 addr += PGDIR_SIZE; 125 addr += PGDIR_SIZE;
100 } 126 }
@@ -107,12 +133,72 @@ out:
107 return result; 133 return result;
108} 134}
109 135
136static void free_transition_pgtable(struct kimage *image)
137{
138 free_page((unsigned long)image->arch.pud);
139 free_page((unsigned long)image->arch.pmd);
140 free_page((unsigned long)image->arch.pte);
141}
142
143static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
144{
145 pud_t *pud;
146 pmd_t *pmd;
147 pte_t *pte;
148 unsigned long vaddr, paddr;
149 int result = -ENOMEM;
150
151 vaddr = (unsigned long)relocate_kernel;
152 paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE);
153 pgd += pgd_index(vaddr);
154 if (!pgd_present(*pgd)) {
155 pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
156 if (!pud)
157 goto err;
158 image->arch.pud = pud;
159 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
160 }
161 pud = pud_offset(pgd, vaddr);
162 if (!pud_present(*pud)) {
163 pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
164 if (!pmd)
165 goto err;
166 image->arch.pmd = pmd;
167 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
168 }
169 pmd = pmd_offset(pud, vaddr);
170 if (!pmd_present(*pmd)) {
171 pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
172 if (!pte)
173 goto err;
174 image->arch.pte = pte;
175 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
176 }
177 pte = pte_offset_kernel(pmd, vaddr);
178 set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
179 return 0;
180err:
181 free_transition_pgtable(image);
182 return result;
183}
184
110 185
111static int init_pgtable(struct kimage *image, unsigned long start_pgtable) 186static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
112{ 187{
113 pgd_t *level4p; 188 pgd_t *level4p;
189 int result;
114 level4p = (pgd_t *)__va(start_pgtable); 190 level4p = (pgd_t *)__va(start_pgtable);
115 return init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT); 191 result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT);
192 if (result)
193 return result;
194 /*
195 * image->start may be outside 0 ~ max_pfn, for example when
196 * jump back to original kernel from kexeced kernel
197 */
198 result = init_one_level2_page(image, level4p, image->start);
199 if (result)
200 return result;
201 return init_transition_pgtable(image, level4p);
116} 202}
117 203
118static void set_idt(void *newidt, u16 limit) 204static void set_idt(void *newidt, u16 limit)
@@ -174,7 +260,7 @@ int machine_kexec_prepare(struct kimage *image)
174 260
175void machine_kexec_cleanup(struct kimage *image) 261void machine_kexec_cleanup(struct kimage *image)
176{ 262{
177 return; 263 free_transition_pgtable(image);
178} 264}
179 265
180/* 266/*
@@ -185,36 +271,45 @@ void machine_kexec(struct kimage *image)
185{ 271{
186 unsigned long page_list[PAGES_NR]; 272 unsigned long page_list[PAGES_NR];
187 void *control_page; 273 void *control_page;
274 int save_ftrace_enabled;
188 275
189 tracer_disable(); 276#ifdef CONFIG_KEXEC_JUMP
277 if (kexec_image->preserve_context)
278 save_processor_state();
279#endif
280
281 save_ftrace_enabled = __ftrace_enabled_save();
190 282
191 /* Interrupts aren't acceptable while we reboot */ 283 /* Interrupts aren't acceptable while we reboot */
192 local_irq_disable(); 284 local_irq_disable();
193 285
286 if (image->preserve_context) {
287#ifdef CONFIG_X86_IO_APIC
288 /*
289 * We need to put APICs in legacy mode so that we can
290 * get timer interrupts in second kernel. kexec/kdump
291 * paths already have calls to disable_IO_APIC() in
292 * one form or other. kexec jump path also need
293 * one.
294 */
295 disable_IO_APIC();
296#endif
297 }
298
194 control_page = page_address(image->control_code_page) + PAGE_SIZE; 299 control_page = page_address(image->control_code_page) + PAGE_SIZE;
195 memcpy(control_page, relocate_kernel, PAGE_SIZE); 300 memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
196 301
197 page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); 302 page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
198 page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; 303 page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
199 page_list[PA_PGD] = virt_to_phys(&kexec_pgd);
200 page_list[VA_PGD] = (unsigned long)kexec_pgd;
201 page_list[PA_PUD_0] = virt_to_phys(&kexec_pud0);
202 page_list[VA_PUD_0] = (unsigned long)kexec_pud0;
203 page_list[PA_PMD_0] = virt_to_phys(&kexec_pmd0);
204 page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
205 page_list[PA_PTE_0] = virt_to_phys(&kexec_pte0);
206 page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
207 page_list[PA_PUD_1] = virt_to_phys(&kexec_pud1);
208 page_list[VA_PUD_1] = (unsigned long)kexec_pud1;
209 page_list[PA_PMD_1] = virt_to_phys(&kexec_pmd1);
210 page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
211 page_list[PA_PTE_1] = virt_to_phys(&kexec_pte1);
212 page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
213
214 page_list[PA_TABLE_PAGE] = 304 page_list[PA_TABLE_PAGE] =
215 (unsigned long)__pa(page_address(image->control_code_page)); 305 (unsigned long)__pa(page_address(image->control_code_page));
216 306
217 /* The segment registers are funny things, they have both a 307 if (image->type == KEXEC_TYPE_DEFAULT)
308 page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
309 << PAGE_SHIFT);
310
311 /*
312 * The segment registers are funny things, they have both a
218 * visible and an invisible part. Whenever the visible part is 313 * visible and an invisible part. Whenever the visible part is
219 * set to a specific selector, the invisible part is loaded 314 * set to a specific selector, the invisible part is loaded
220 * with from a table in memory. At no other time is the 315 * with from a table in memory. At no other time is the
@@ -224,15 +319,25 @@ void machine_kexec(struct kimage *image)
224 * segments, before I zap the gdt with an invalid value. 319 * segments, before I zap the gdt with an invalid value.
225 */ 320 */
226 load_segments(); 321 load_segments();
227 /* The gdt & idt are now invalid. 322 /*
323 * The gdt & idt are now invalid.
228 * If you want to load them you must set up your own idt & gdt. 324 * If you want to load them you must set up your own idt & gdt.
229 */ 325 */
230 set_gdt(phys_to_virt(0),0); 326 set_gdt(phys_to_virt(0), 0);
231 set_idt(phys_to_virt(0),0); 327 set_idt(phys_to_virt(0), 0);
232 328
233 /* now call it */ 329 /* now call it */
234 relocate_kernel((unsigned long)image->head, (unsigned long)page_list, 330 image->start = relocate_kernel((unsigned long)image->head,
235 image->start); 331 (unsigned long)page_list,
332 image->start,
333 image->preserve_context);
334
335#ifdef CONFIG_KEXEC_JUMP
336 if (kexec_image->preserve_context)
337 restore_processor_state();
338#endif
339
340 __ftrace_enabled_restore(save_ftrace_enabled);
236} 341}
237 342
238void arch_crash_save_vmcoreinfo(void) 343void arch_crash_save_vmcoreinfo(void)
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c
index 2dc183758be3..845d80ce1ef1 100644
--- a/arch/x86/kernel/mca_32.c
+++ b/arch/x86/kernel/mca_32.c
@@ -51,7 +51,6 @@
51#include <linux/ioport.h> 51#include <linux/ioport.h>
52#include <asm/uaccess.h> 52#include <asm/uaccess.h>
53#include <linux/init.h> 53#include <linux/init.h>
54#include <asm/arch_hooks.h>
55 54
56static unsigned char which_scsi; 55static unsigned char which_scsi;
57 56
@@ -474,6 +473,4 @@ void __kprobes mca_handle_nmi(void)
474 * adapter was responsible for the error. 473 * adapter was responsible for the error.
475 */ 474 */
476 bus_for_each_dev(&mca_bus_type, NULL, NULL, mca_handle_nmi_callback); 475 bus_for_each_dev(&mca_bus_type, NULL, NULL, mca_handle_nmi_callback);
477 476}
478 mca_nmi_hook();
479} /* mca_handle_nmi */
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index b7f4c929e615..5e9f4fc51385 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -87,9 +87,9 @@
87#include <linux/cpu.h> 87#include <linux/cpu.h>
88#include <linux/firmware.h> 88#include <linux/firmware.h>
89#include <linux/platform_device.h> 89#include <linux/platform_device.h>
90#include <linux/uaccess.h>
90 91
91#include <asm/msr.h> 92#include <asm/msr.h>
92#include <asm/uaccess.h>
93#include <asm/processor.h> 93#include <asm/processor.h>
94#include <asm/microcode.h> 94#include <asm/microcode.h>
95 95
@@ -196,7 +196,7 @@ static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf)
196 return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1; 196 return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1;
197} 197}
198 198
199static inline int 199static inline int
200update_match_revision(struct microcode_header_intel *mc_header, int rev) 200update_match_revision(struct microcode_header_intel *mc_header, int rev)
201{ 201{
202 return (mc_header->rev <= rev) ? 0 : 1; 202 return (mc_header->rev <= rev) ? 0 : 1;
@@ -442,8 +442,8 @@ static int request_microcode_fw(int cpu, struct device *device)
442 return ret; 442 return ret;
443 } 443 }
444 444
445 ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size, 445 ret = generic_load_microcode(cpu, (void *)firmware->data,
446 &get_ucode_fw); 446 firmware->size, &get_ucode_fw);
447 447
448 release_firmware(firmware); 448 release_firmware(firmware);
449 449
@@ -460,7 +460,7 @@ static int request_microcode_user(int cpu, const void __user *buf, size_t size)
460 /* We should bind the task to the CPU */ 460 /* We should bind the task to the CPU */
461 BUG_ON(cpu != raw_smp_processor_id()); 461 BUG_ON(cpu != raw_smp_processor_id());
462 462
463 return generic_load_microcode(cpu, (void*)buf, size, &get_ucode_user); 463 return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
464} 464}
465 465
466static void microcode_fini_cpu(int cpu) 466static void microcode_fini_cpu(int cpu)
diff --git a/arch/x86/kernel/module_32.c b/arch/x86/kernel/module_32.c
index 3db0a5442eb1..0edd819050e7 100644
--- a/arch/x86/kernel/module_32.c
+++ b/arch/x86/kernel/module_32.c
@@ -42,7 +42,7 @@ void module_free(struct module *mod, void *module_region)
42{ 42{
43 vfree(module_region); 43 vfree(module_region);
44 /* FIXME: If module_region == mod->init_region, trim exception 44 /* FIXME: If module_region == mod->init_region, trim exception
45 table entries. */ 45 table entries. */
46} 46}
47 47
48/* We don't need anything special. */ 48/* We don't need anything special. */
@@ -113,13 +113,13 @@ int module_finalize(const Elf_Ehdr *hdr,
113 *para = NULL; 113 *para = NULL;
114 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; 114 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
115 115
116 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { 116 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
117 if (!strcmp(".text", secstrings + s->sh_name)) 117 if (!strcmp(".text", secstrings + s->sh_name))
118 text = s; 118 text = s;
119 if (!strcmp(".altinstructions", secstrings + s->sh_name)) 119 if (!strcmp(".altinstructions", secstrings + s->sh_name))
120 alt = s; 120 alt = s;
121 if (!strcmp(".smp_locks", secstrings + s->sh_name)) 121 if (!strcmp(".smp_locks", secstrings + s->sh_name))
122 locks= s; 122 locks = s;
123 if (!strcmp(".parainstructions", secstrings + s->sh_name)) 123 if (!strcmp(".parainstructions", secstrings + s->sh_name))
124 para = s; 124 para = s;
125 } 125 }
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c
index 6ba87830d4b1..c23880b90b5c 100644
--- a/arch/x86/kernel/module_64.c
+++ b/arch/x86/kernel/module_64.c
@@ -30,14 +30,14 @@
30#include <asm/page.h> 30#include <asm/page.h>
31#include <asm/pgtable.h> 31#include <asm/pgtable.h>
32 32
33#define DEBUGP(fmt...) 33#define DEBUGP(fmt...)
34 34
35#ifndef CONFIG_UML 35#ifndef CONFIG_UML
36void module_free(struct module *mod, void *module_region) 36void module_free(struct module *mod, void *module_region)
37{ 37{
38 vfree(module_region); 38 vfree(module_region);
39 /* FIXME: If module_region == mod->init_region, trim exception 39 /* FIXME: If module_region == mod->init_region, trim exception
40 table entries. */ 40 table entries. */
41} 41}
42 42
43void *module_alloc(unsigned long size) 43void *module_alloc(unsigned long size)
@@ -77,7 +77,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
77 Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr; 77 Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr;
78 Elf64_Sym *sym; 78 Elf64_Sym *sym;
79 void *loc; 79 void *loc;
80 u64 val; 80 u64 val;
81 81
82 DEBUGP("Applying relocate section %u to %u\n", relsec, 82 DEBUGP("Applying relocate section %u to %u\n", relsec,
83 sechdrs[relsec].sh_info); 83 sechdrs[relsec].sh_info);
@@ -91,11 +91,11 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
91 sym = (Elf64_Sym *)sechdrs[symindex].sh_addr 91 sym = (Elf64_Sym *)sechdrs[symindex].sh_addr
92 + ELF64_R_SYM(rel[i].r_info); 92 + ELF64_R_SYM(rel[i].r_info);
93 93
94 DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n", 94 DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n",
95 (int)ELF64_R_TYPE(rel[i].r_info), 95 (int)ELF64_R_TYPE(rel[i].r_info),
96 sym->st_value, rel[i].r_addend, (u64)loc); 96 sym->st_value, rel[i].r_addend, (u64)loc);
97 97
98 val = sym->st_value + rel[i].r_addend; 98 val = sym->st_value + rel[i].r_addend;
99 99
100 switch (ELF64_R_TYPE(rel[i].r_info)) { 100 switch (ELF64_R_TYPE(rel[i].r_info)) {
101 case R_X86_64_NONE: 101 case R_X86_64_NONE:
@@ -113,16 +113,16 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
113 if ((s64)val != *(s32 *)loc) 113 if ((s64)val != *(s32 *)loc)
114 goto overflow; 114 goto overflow;
115 break; 115 break;
116 case R_X86_64_PC32: 116 case R_X86_64_PC32:
117 val -= (u64)loc; 117 val -= (u64)loc;
118 *(u32 *)loc = val; 118 *(u32 *)loc = val;
119#if 0 119#if 0
120 if ((s64)val != *(s32 *)loc) 120 if ((s64)val != *(s32 *)loc)
121 goto overflow; 121 goto overflow;
122#endif 122#endif
123 break; 123 break;
124 default: 124 default:
125 printk(KERN_ERR "module %s: Unknown rela relocation: %Lu\n", 125 printk(KERN_ERR "module %s: Unknown rela relocation: %llu\n",
126 me->name, ELF64_R_TYPE(rel[i].r_info)); 126 me->name, ELF64_R_TYPE(rel[i].r_info));
127 return -ENOEXEC; 127 return -ENOEXEC;
128 } 128 }
@@ -130,7 +130,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
130 return 0; 130 return 0;
131 131
132overflow: 132overflow:
133 printk(KERN_ERR "overflow in relocation type %d val %Lx\n", 133 printk(KERN_ERR "overflow in relocation type %d val %Lx\n",
134 (int)ELF64_R_TYPE(rel[i].r_info), val); 134 (int)ELF64_R_TYPE(rel[i].r_info), val);
135 printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n", 135 printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n",
136 me->name); 136 me->name);
@@ -143,13 +143,13 @@ int apply_relocate(Elf_Shdr *sechdrs,
143 unsigned int relsec, 143 unsigned int relsec,
144 struct module *me) 144 struct module *me)
145{ 145{
146 printk("non add relocation not supported\n"); 146 printk(KERN_ERR "non add relocation not supported\n");
147 return -ENOSYS; 147 return -ENOSYS;
148} 148}
149 149
150int module_finalize(const Elf_Ehdr *hdr, 150int module_finalize(const Elf_Ehdr *hdr,
151 const Elf_Shdr *sechdrs, 151 const Elf_Shdr *sechdrs,
152 struct module *me) 152 struct module *me)
153{ 153{
154 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, 154 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
155 *para = NULL; 155 *para = NULL;
@@ -161,7 +161,7 @@ int module_finalize(const Elf_Ehdr *hdr,
161 if (!strcmp(".altinstructions", secstrings + s->sh_name)) 161 if (!strcmp(".altinstructions", secstrings + s->sh_name))
162 alt = s; 162 alt = s;
163 if (!strcmp(".smp_locks", secstrings + s->sh_name)) 163 if (!strcmp(".smp_locks", secstrings + s->sh_name))
164 locks= s; 164 locks = s;
165 if (!strcmp(".parainstructions", secstrings + s->sh_name)) 165 if (!strcmp(".parainstructions", secstrings + s->sh_name))
166 para = s; 166 para = s;
167 } 167 }
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index a649a4ccad43..e8192401da47 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -3,7 +3,7 @@
3 * compliant MP-table parsing routines. 3 * compliant MP-table parsing routines.
4 * 4 *
5 * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk> 5 * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
6 * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com> 6 * (c) 1998, 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
7 * (c) 2008 Alexey Starikovskiy <astarikovskiy@suse.de> 7 * (c) 2008 Alexey Starikovskiy <astarikovskiy@suse.de>
8 */ 8 */
9 9
@@ -29,12 +29,7 @@
29#include <asm/setup.h> 29#include <asm/setup.h>
30#include <asm/smp.h> 30#include <asm/smp.h>
31 31
32#include <mach_apic.h> 32#include <asm/apic.h>
33#ifdef CONFIG_X86_32
34#include <mach_apicdef.h>
35#include <mach_mpparse.h>
36#endif
37
38/* 33/*
39 * Checksum an MP configuration block. 34 * Checksum an MP configuration block.
40 */ 35 */
@@ -144,11 +139,11 @@ static void __init MP_ioapic_info(struct mpc_ioapic *m)
144 if (bad_ioapic(m->apicaddr)) 139 if (bad_ioapic(m->apicaddr))
145 return; 140 return;
146 141
147 mp_ioapics[nr_ioapics].mp_apicaddr = m->apicaddr; 142 mp_ioapics[nr_ioapics].apicaddr = m->apicaddr;
148 mp_ioapics[nr_ioapics].mp_apicid = m->apicid; 143 mp_ioapics[nr_ioapics].apicid = m->apicid;
149 mp_ioapics[nr_ioapics].mp_type = m->type; 144 mp_ioapics[nr_ioapics].type = m->type;
150 mp_ioapics[nr_ioapics].mp_apicver = m->apicver; 145 mp_ioapics[nr_ioapics].apicver = m->apicver;
151 mp_ioapics[nr_ioapics].mp_flags = m->flags; 146 mp_ioapics[nr_ioapics].flags = m->flags;
152 nr_ioapics++; 147 nr_ioapics++;
153} 148}
154 149
@@ -160,55 +155,55 @@ static void print_MP_intsrc_info(struct mpc_intsrc *m)
160 m->srcbusirq, m->dstapic, m->dstirq); 155 m->srcbusirq, m->dstapic, m->dstirq);
161} 156}
162 157
163static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq) 158static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq)
164{ 159{
165 apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," 160 apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
166 " IRQ %02x, APIC ID %x, APIC INT %02x\n", 161 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
167 mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3, 162 mp_irq->irqtype, mp_irq->irqflag & 3,
168 (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus, 163 (mp_irq->irqflag >> 2) & 3, mp_irq->srcbus,
169 mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq); 164 mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq);
170} 165}
171 166
172static void __init assign_to_mp_irq(struct mpc_intsrc *m, 167static void __init assign_to_mp_irq(struct mpc_intsrc *m,
173 struct mp_config_intsrc *mp_irq) 168 struct mpc_intsrc *mp_irq)
174{ 169{
175 mp_irq->mp_dstapic = m->dstapic; 170 mp_irq->dstapic = m->dstapic;
176 mp_irq->mp_type = m->type; 171 mp_irq->type = m->type;
177 mp_irq->mp_irqtype = m->irqtype; 172 mp_irq->irqtype = m->irqtype;
178 mp_irq->mp_irqflag = m->irqflag; 173 mp_irq->irqflag = m->irqflag;
179 mp_irq->mp_srcbus = m->srcbus; 174 mp_irq->srcbus = m->srcbus;
180 mp_irq->mp_srcbusirq = m->srcbusirq; 175 mp_irq->srcbusirq = m->srcbusirq;
181 mp_irq->mp_dstirq = m->dstirq; 176 mp_irq->dstirq = m->dstirq;
182} 177}
183 178
184static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq, 179static void __init assign_to_mpc_intsrc(struct mpc_intsrc *mp_irq,
185 struct mpc_intsrc *m) 180 struct mpc_intsrc *m)
186{ 181{
187 m->dstapic = mp_irq->mp_dstapic; 182 m->dstapic = mp_irq->dstapic;
188 m->type = mp_irq->mp_type; 183 m->type = mp_irq->type;
189 m->irqtype = mp_irq->mp_irqtype; 184 m->irqtype = mp_irq->irqtype;
190 m->irqflag = mp_irq->mp_irqflag; 185 m->irqflag = mp_irq->irqflag;
191 m->srcbus = mp_irq->mp_srcbus; 186 m->srcbus = mp_irq->srcbus;
192 m->srcbusirq = mp_irq->mp_srcbusirq; 187 m->srcbusirq = mp_irq->srcbusirq;
193 m->dstirq = mp_irq->mp_dstirq; 188 m->dstirq = mp_irq->dstirq;
194} 189}
195 190
196static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq, 191static int __init mp_irq_mpc_intsrc_cmp(struct mpc_intsrc *mp_irq,
197 struct mpc_intsrc *m) 192 struct mpc_intsrc *m)
198{ 193{
199 if (mp_irq->mp_dstapic != m->dstapic) 194 if (mp_irq->dstapic != m->dstapic)
200 return 1; 195 return 1;
201 if (mp_irq->mp_type != m->type) 196 if (mp_irq->type != m->type)
202 return 2; 197 return 2;
203 if (mp_irq->mp_irqtype != m->irqtype) 198 if (mp_irq->irqtype != m->irqtype)
204 return 3; 199 return 3;
205 if (mp_irq->mp_irqflag != m->irqflag) 200 if (mp_irq->irqflag != m->irqflag)
206 return 4; 201 return 4;
207 if (mp_irq->mp_srcbus != m->srcbus) 202 if (mp_irq->srcbus != m->srcbus)
208 return 5; 203 return 5;
209 if (mp_irq->mp_srcbusirq != m->srcbusirq) 204 if (mp_irq->srcbusirq != m->srcbusirq)
210 return 6; 205 return 6;
211 if (mp_irq->mp_dstirq != m->dstirq) 206 if (mp_irq->dstirq != m->dstirq)
212 return 7; 207 return 7;
213 208
214 return 0; 209 return 0;
@@ -292,16 +287,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
292 return 0; 287 return 0;
293 288
294#ifdef CONFIG_X86_32 289#ifdef CONFIG_X86_32
295 /* 290 generic_mps_oem_check(mpc, oem, str);
296 * need to make sure summit and es7000's mps_oem_check is safe to be
297 * called early via genericarch 's mps_oem_check
298 */
299 if (early) {
300#ifdef CONFIG_X86_NUMAQ
301 numaq_mps_oem_check(mpc, oem, str);
302#endif
303 } else
304 mps_oem_check(mpc, oem, str);
305#endif 291#endif
306 /* save the local APIC address, it might be non-default */ 292 /* save the local APIC address, it might be non-default */
307 if (!acpi_lapic) 293 if (!acpi_lapic)
@@ -386,13 +372,13 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
386 (*x86_quirks->mpc_record)++; 372 (*x86_quirks->mpc_record)++;
387 } 373 }
388 374
389#ifdef CONFIG_X86_GENERICARCH 375#ifdef CONFIG_X86_BIGSMP
390 generic_bigsmp_probe(); 376 generic_bigsmp_probe();
391#endif 377#endif
392 378
393#ifdef CONFIG_X86_32 379 if (apic->setup_apic_routing)
394 setup_apic_routing(); 380 apic->setup_apic_routing();
395#endif 381
396 if (!num_processors) 382 if (!num_processors)
397 printk(KERN_ERR "MPTABLE: no processors registered!\n"); 383 printk(KERN_ERR "MPTABLE: no processors registered!\n");
398 return num_processors; 384 return num_processors;
@@ -417,7 +403,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
417 intsrc.type = MP_INTSRC; 403 intsrc.type = MP_INTSRC;
418 intsrc.irqflag = 0; /* conforming */ 404 intsrc.irqflag = 0; /* conforming */
419 intsrc.srcbus = 0; 405 intsrc.srcbus = 0;
420 intsrc.dstapic = mp_ioapics[0].mp_apicid; 406 intsrc.dstapic = mp_ioapics[0].apicid;
421 407
422 intsrc.irqtype = mp_INT; 408 intsrc.irqtype = mp_INT;
423 409
@@ -570,14 +556,27 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
570 } 556 }
571} 557}
572 558
573static struct intel_mp_floating *mpf_found; 559static struct mpf_intel *mpf_found;
560
561static unsigned long __init get_mpc_size(unsigned long physptr)
562{
563 struct mpc_table *mpc;
564 unsigned long size;
565
566 mpc = early_ioremap(physptr, PAGE_SIZE);
567 size = mpc->length;
568 early_iounmap(mpc, PAGE_SIZE);
569 apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size);
570
571 return size;
572}
574 573
575/* 574/*
576 * Scan the memory blocks for an SMP configuration block. 575 * Scan the memory blocks for an SMP configuration block.
577 */ 576 */
578static void __init __get_smp_config(unsigned int early) 577static void __init __get_smp_config(unsigned int early)
579{ 578{
580 struct intel_mp_floating *mpf = mpf_found; 579 struct mpf_intel *mpf = mpf_found;
581 580
582 if (!mpf) 581 if (!mpf)
583 return; 582 return;
@@ -598,9 +597,9 @@ static void __init __get_smp_config(unsigned int early)
598 } 597 }
599 598
600 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", 599 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
601 mpf->mpf_specification); 600 mpf->specification);
602#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) 601#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
603 if (mpf->mpf_feature2 & (1 << 7)) { 602 if (mpf->feature2 & (1 << 7)) {
604 printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); 603 printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
605 pic_mode = 1; 604 pic_mode = 1;
606 } else { 605 } else {
@@ -611,7 +610,7 @@ static void __init __get_smp_config(unsigned int early)
611 /* 610 /*
612 * Now see if we need to read further. 611 * Now see if we need to read further.
613 */ 612 */
614 if (mpf->mpf_feature1 != 0) { 613 if (mpf->feature1 != 0) {
615 if (early) { 614 if (early) {
616 /* 615 /*
617 * local APIC has default address 616 * local APIC has default address
@@ -621,16 +620,20 @@ static void __init __get_smp_config(unsigned int early)
621 } 620 }
622 621
623 printk(KERN_INFO "Default MP configuration #%d\n", 622 printk(KERN_INFO "Default MP configuration #%d\n",
624 mpf->mpf_feature1); 623 mpf->feature1);
625 construct_default_ISA_mptable(mpf->mpf_feature1); 624 construct_default_ISA_mptable(mpf->feature1);
626 625
627 } else if (mpf->mpf_physptr) { 626 } else if (mpf->physptr) {
627 struct mpc_table *mpc;
628 unsigned long size;
628 629
630 size = get_mpc_size(mpf->physptr);
631 mpc = early_ioremap(mpf->physptr, size);
629 /* 632 /*
630 * Read the physical hardware table. Anything here will 633 * Read the physical hardware table. Anything here will
631 * override the defaults. 634 * override the defaults.
632 */ 635 */
633 if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr), early)) { 636 if (!smp_read_mpc(mpc, early)) {
634#ifdef CONFIG_X86_LOCAL_APIC 637#ifdef CONFIG_X86_LOCAL_APIC
635 smp_found_config = 0; 638 smp_found_config = 0;
636#endif 639#endif
@@ -638,8 +641,10 @@ static void __init __get_smp_config(unsigned int early)
638 "BIOS bug, MP table errors detected!...\n"); 641 "BIOS bug, MP table errors detected!...\n");
639 printk(KERN_ERR "... disabling SMP support. " 642 printk(KERN_ERR "... disabling SMP support. "
640 "(tell your hw vendor)\n"); 643 "(tell your hw vendor)\n");
644 early_iounmap(mpc, size);
641 return; 645 return;
642 } 646 }
647 early_iounmap(mpc, size);
643 648
644 if (early) 649 if (early)
645 return; 650 return;
@@ -688,33 +693,33 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
688 unsigned reserve) 693 unsigned reserve)
689{ 694{
690 unsigned int *bp = phys_to_virt(base); 695 unsigned int *bp = phys_to_virt(base);
691 struct intel_mp_floating *mpf; 696 struct mpf_intel *mpf;
692 697
693 apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n", 698 apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
694 bp, length); 699 bp, length);
695 BUILD_BUG_ON(sizeof(*mpf) != 16); 700 BUILD_BUG_ON(sizeof(*mpf) != 16);
696 701
697 while (length > 0) { 702 while (length > 0) {
698 mpf = (struct intel_mp_floating *)bp; 703 mpf = (struct mpf_intel *)bp;
699 if ((*bp == SMP_MAGIC_IDENT) && 704 if ((*bp == SMP_MAGIC_IDENT) &&
700 (mpf->mpf_length == 1) && 705 (mpf->length == 1) &&
701 !mpf_checksum((unsigned char *)bp, 16) && 706 !mpf_checksum((unsigned char *)bp, 16) &&
702 ((mpf->mpf_specification == 1) 707 ((mpf->specification == 1)
703 || (mpf->mpf_specification == 4))) { 708 || (mpf->specification == 4))) {
704#ifdef CONFIG_X86_LOCAL_APIC 709#ifdef CONFIG_X86_LOCAL_APIC
705 smp_found_config = 1; 710 smp_found_config = 1;
706#endif 711#endif
707 mpf_found = mpf; 712 mpf_found = mpf;
708 713
709 printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n", 714 printk(KERN_INFO "found SMP MP-table at [%p] %llx\n",
710 mpf, virt_to_phys(mpf)); 715 mpf, (u64)virt_to_phys(mpf));
711 716
712 if (!reserve) 717 if (!reserve)
713 return 1; 718 return 1;
714 reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE, 719 reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf),
715 BOOTMEM_DEFAULT); 720 BOOTMEM_DEFAULT);
716 if (mpf->mpf_physptr) { 721 if (mpf->physptr) {
717 unsigned long size = PAGE_SIZE; 722 unsigned long size = get_mpc_size(mpf->physptr);
718#ifdef CONFIG_X86_32 723#ifdef CONFIG_X86_32
719 /* 724 /*
720 * We cannot access to MPC table to compute 725 * We cannot access to MPC table to compute
@@ -722,15 +727,24 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
722 * the bottom is mapped now. 727 * the bottom is mapped now.
723 * PC-9800's MPC table places on the very last 728 * PC-9800's MPC table places on the very last
724 * of physical memory; so that simply reserving 729 * of physical memory; so that simply reserving
725 * PAGE_SIZE from mpg->mpf_physptr yields BUG() 730 * PAGE_SIZE from mpf->physptr yields BUG()
726 * in reserve_bootmem. 731 * in reserve_bootmem.
732 * also need to make sure physptr is below than
733 * max_low_pfn
734 * we don't need reserve the area above max_low_pfn
727 */ 735 */
728 unsigned long end = max_low_pfn * PAGE_SIZE; 736 unsigned long end = max_low_pfn * PAGE_SIZE;
729 if (mpf->mpf_physptr + size > end) 737
730 size = end - mpf->mpf_physptr; 738 if (mpf->physptr < end) {
731#endif 739 if (mpf->physptr + size > end)
732 reserve_bootmem_generic(mpf->mpf_physptr, size, 740 size = end - mpf->physptr;
741 reserve_bootmem_generic(mpf->physptr, size,
742 BOOTMEM_DEFAULT);
743 }
744#else
745 reserve_bootmem_generic(mpf->physptr, size,
733 BOOTMEM_DEFAULT); 746 BOOTMEM_DEFAULT);
747#endif
734 } 748 }
735 749
736 return 1; 750 return 1;
@@ -809,15 +823,15 @@ static int __init get_MP_intsrc_index(struct mpc_intsrc *m)
809 /* not legacy */ 823 /* not legacy */
810 824
811 for (i = 0; i < mp_irq_entries; i++) { 825 for (i = 0; i < mp_irq_entries; i++) {
812 if (mp_irqs[i].mp_irqtype != mp_INT) 826 if (mp_irqs[i].irqtype != mp_INT)
813 continue; 827 continue;
814 828
815 if (mp_irqs[i].mp_irqflag != 0x0f) 829 if (mp_irqs[i].irqflag != 0x0f)
816 continue; 830 continue;
817 831
818 if (mp_irqs[i].mp_srcbus != m->srcbus) 832 if (mp_irqs[i].srcbus != m->srcbus)
819 continue; 833 continue;
820 if (mp_irqs[i].mp_srcbusirq != m->srcbusirq) 834 if (mp_irqs[i].srcbusirq != m->srcbusirq)
821 continue; 835 continue;
822 if (irq_used[i]) { 836 if (irq_used[i]) {
823 /* already claimed */ 837 /* already claimed */
@@ -922,10 +936,10 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
922 if (irq_used[i]) 936 if (irq_used[i])
923 continue; 937 continue;
924 938
925 if (mp_irqs[i].mp_irqtype != mp_INT) 939 if (mp_irqs[i].irqtype != mp_INT)
926 continue; 940 continue;
927 941
928 if (mp_irqs[i].mp_irqflag != 0x0f) 942 if (mp_irqs[i].irqflag != 0x0f)
929 continue; 943 continue;
930 944
931 if (nr_m_spare > 0) { 945 if (nr_m_spare > 0) {
@@ -1001,7 +1015,7 @@ static int __init update_mp_table(void)
1001{ 1015{
1002 char str[16]; 1016 char str[16];
1003 char oem[10]; 1017 char oem[10];
1004 struct intel_mp_floating *mpf; 1018 struct mpf_intel *mpf;
1005 struct mpc_table *mpc, *mpc_new; 1019 struct mpc_table *mpc, *mpc_new;
1006 1020
1007 if (!enable_update_mptable) 1021 if (!enable_update_mptable)
@@ -1014,19 +1028,19 @@ static int __init update_mp_table(void)
1014 /* 1028 /*
1015 * Now see if we need to go further. 1029 * Now see if we need to go further.
1016 */ 1030 */
1017 if (mpf->mpf_feature1 != 0) 1031 if (mpf->feature1 != 0)
1018 return 0; 1032 return 0;
1019 1033
1020 if (!mpf->mpf_physptr) 1034 if (!mpf->physptr)
1021 return 0; 1035 return 0;
1022 1036
1023 mpc = phys_to_virt(mpf->mpf_physptr); 1037 mpc = phys_to_virt(mpf->physptr);
1024 1038
1025 if (!smp_check_mpc(mpc, oem, str)) 1039 if (!smp_check_mpc(mpc, oem, str))
1026 return 0; 1040 return 0;
1027 1041
1028 printk(KERN_INFO "mpf: %lx\n", virt_to_phys(mpf)); 1042 printk(KERN_INFO "mpf: %llx\n", (u64)virt_to_phys(mpf));
1029 printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr); 1043 printk(KERN_INFO "physptr: %x\n", mpf->physptr);
1030 1044
1031 if (mpc_new_phys && mpc->length > mpc_new_length) { 1045 if (mpc_new_phys && mpc->length > mpc_new_length) {
1032 mpc_new_phys = 0; 1046 mpc_new_phys = 0;
@@ -1047,23 +1061,23 @@ static int __init update_mp_table(void)
1047 } 1061 }
1048 printk(KERN_INFO "use in-positon replacing\n"); 1062 printk(KERN_INFO "use in-positon replacing\n");
1049 } else { 1063 } else {
1050 mpf->mpf_physptr = mpc_new_phys; 1064 mpf->physptr = mpc_new_phys;
1051 mpc_new = phys_to_virt(mpc_new_phys); 1065 mpc_new = phys_to_virt(mpc_new_phys);
1052 memcpy(mpc_new, mpc, mpc->length); 1066 memcpy(mpc_new, mpc, mpc->length);
1053 mpc = mpc_new; 1067 mpc = mpc_new;
1054 /* check if we can modify that */ 1068 /* check if we can modify that */
1055 if (mpc_new_phys - mpf->mpf_physptr) { 1069 if (mpc_new_phys - mpf->physptr) {
1056 struct intel_mp_floating *mpf_new; 1070 struct mpf_intel *mpf_new;
1057 /* steal 16 bytes from [0, 1k) */ 1071 /* steal 16 bytes from [0, 1k) */
1058 printk(KERN_INFO "mpf new: %x\n", 0x400 - 16); 1072 printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
1059 mpf_new = phys_to_virt(0x400 - 16); 1073 mpf_new = phys_to_virt(0x400 - 16);
1060 memcpy(mpf_new, mpf, 16); 1074 memcpy(mpf_new, mpf, 16);
1061 mpf = mpf_new; 1075 mpf = mpf_new;
1062 mpf->mpf_physptr = mpc_new_phys; 1076 mpf->physptr = mpc_new_phys;
1063 } 1077 }
1064 mpf->mpf_checksum = 0; 1078 mpf->checksum = 0;
1065 mpf->mpf_checksum -= mpf_checksum((unsigned char *)mpf, 16); 1079 mpf->checksum -= mpf_checksum((unsigned char *)mpf, 16);
1066 printk(KERN_INFO "mpf_physptr new: %x\n", mpf->mpf_physptr); 1080 printk(KERN_INFO "physptr new: %x\n", mpf->physptr);
1067 } 1081 }
1068 1082
1069 /* 1083 /*
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 726266695b2c..3cf3413ec626 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -35,10 +35,10 @@
35#include <linux/device.h> 35#include <linux/device.h>
36#include <linux/cpu.h> 36#include <linux/cpu.h>
37#include <linux/notifier.h> 37#include <linux/notifier.h>
38#include <linux/uaccess.h>
38 39
39#include <asm/processor.h> 40#include <asm/processor.h>
40#include <asm/msr.h> 41#include <asm/msr.h>
41#include <asm/uaccess.h>
42#include <asm/system.h> 42#include <asm/system.h>
43 43
44static struct class *msr_class; 44static struct class *msr_class;
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
deleted file mode 100644
index f2191d4f2717..000000000000
--- a/arch/x86/kernel/numaq_32.c
+++ /dev/null
@@ -1,293 +0,0 @@
1/*
2 * Written by: Patricia Gaughen, IBM Corporation
3 *
4 * Copyright (C) 2002, IBM Corp.
5 *
6 * All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
16 * NON INFRINGEMENT. See the GNU General Public License for more
17 * details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 *
23 * Send feedback to <gone@us.ibm.com>
24 */
25
26#include <linux/mm.h>
27#include <linux/bootmem.h>
28#include <linux/mmzone.h>
29#include <linux/module.h>
30#include <linux/nodemask.h>
31#include <asm/numaq.h>
32#include <asm/topology.h>
33#include <asm/processor.h>
34#include <asm/genapic.h>
35#include <asm/e820.h>
36#include <asm/setup.h>
37
38#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
39
40/*
41 * Function: smp_dump_qct()
42 *
43 * Description: gets memory layout from the quad config table. This
44 * function also updates node_online_map with the nodes (quads) present.
45 */
46static void __init smp_dump_qct(void)
47{
48 int node;
49 struct eachquadmem *eq;
50 struct sys_cfg_data *scd =
51 (struct sys_cfg_data *)__va(SYS_CFG_DATA_PRIV_ADDR);
52
53 nodes_clear(node_online_map);
54 for_each_node(node) {
55 if (scd->quads_present31_0 & (1 << node)) {
56 node_set_online(node);
57 eq = &scd->eq[node];
58 /* Convert to pages */
59 node_start_pfn[node] = MB_TO_PAGES(
60 eq->hi_shrd_mem_start - eq->priv_mem_size);
61 node_end_pfn[node] = MB_TO_PAGES(
62 eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
63
64 e820_register_active_regions(node, node_start_pfn[node],
65 node_end_pfn[node]);
66 memory_present(node,
67 node_start_pfn[node], node_end_pfn[node]);
68 node_remap_size[node] = node_memmap_size_bytes(node,
69 node_start_pfn[node],
70 node_end_pfn[node]);
71 }
72 }
73}
74
75
76void __cpuinit numaq_tsc_disable(void)
77{
78 if (!found_numaq)
79 return;
80
81 if (num_online_nodes() > 1) {
82 printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
83 setup_clear_cpu_cap(X86_FEATURE_TSC);
84 }
85}
86
87static int __init numaq_pre_time_init(void)
88{
89 numaq_tsc_disable();
90 return 0;
91}
92
93int found_numaq;
94/*
95 * Have to match translation table entries to main table entries by counter
96 * hence the mpc_record variable .... can't see a less disgusting way of
97 * doing this ....
98 */
99struct mpc_config_translation {
100 unsigned char mpc_type;
101 unsigned char trans_len;
102 unsigned char trans_type;
103 unsigned char trans_quad;
104 unsigned char trans_global;
105 unsigned char trans_local;
106 unsigned short trans_reserved;
107};
108
109/* x86_quirks member */
110static int mpc_record;
111static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
112 __cpuinitdata;
113
114static inline int generate_logical_apicid(int quad, int phys_apicid)
115{
116 return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
117}
118
119/* x86_quirks member */
120static int mpc_apic_id(struct mpc_cpu *m)
121{
122 int quad = translation_table[mpc_record]->trans_quad;
123 int logical_apicid = generate_logical_apicid(quad, m->apicid);
124
125 printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
126 m->apicid, (m->cpufeature & CPU_FAMILY_MASK) >> 8,
127 (m->cpufeature & CPU_MODEL_MASK) >> 4,
128 m->apicver, quad, logical_apicid);
129 return logical_apicid;
130}
131
132int mp_bus_id_to_node[MAX_MP_BUSSES];
133
134int mp_bus_id_to_local[MAX_MP_BUSSES];
135
136/* x86_quirks member */
137static void mpc_oem_bus_info(struct mpc_bus *m, char *name)
138{
139 int quad = translation_table[mpc_record]->trans_quad;
140 int local = translation_table[mpc_record]->trans_local;
141
142 mp_bus_id_to_node[m->busid] = quad;
143 mp_bus_id_to_local[m->busid] = local;
144 printk(KERN_INFO "Bus #%d is %s (node %d)\n",
145 m->busid, name, quad);
146}
147
148int quad_local_to_mp_bus_id [NR_CPUS/4][4];
149
150/* x86_quirks member */
151static void mpc_oem_pci_bus(struct mpc_bus *m)
152{
153 int quad = translation_table[mpc_record]->trans_quad;
154 int local = translation_table[mpc_record]->trans_local;
155
156 quad_local_to_mp_bus_id[quad][local] = m->busid;
157}
158
159static void __init MP_translation_info(struct mpc_config_translation *m)
160{
161 printk(KERN_INFO
162 "Translation: record %d, type %d, quad %d, global %d, local %d\n",
163 mpc_record, m->trans_type, m->trans_quad, m->trans_global,
164 m->trans_local);
165
166 if (mpc_record >= MAX_MPC_ENTRY)
167 printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
168 else
169 translation_table[mpc_record] = m; /* stash this for later */
170 if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
171 node_set_online(m->trans_quad);
172}
173
174static int __init mpf_checksum(unsigned char *mp, int len)
175{
176 int sum = 0;
177
178 while (len--)
179 sum += *mp++;
180
181 return sum & 0xFF;
182}
183
184/*
185 * Read/parse the MPC oem tables
186 */
187
188static void __init smp_read_mpc_oem(struct mpc_oemtable *oemtable,
189 unsigned short oemsize)
190{
191 int count = sizeof(*oemtable); /* the header size */
192 unsigned char *oemptr = ((unsigned char *)oemtable) + count;
193
194 mpc_record = 0;
195 printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
196 oemtable);
197 if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) {
198 printk(KERN_WARNING
199 "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
200 oemtable->signature[0], oemtable->signature[1],
201 oemtable->signature[2], oemtable->signature[3]);
202 return;
203 }
204 if (mpf_checksum((unsigned char *)oemtable, oemtable->length)) {
205 printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
206 return;
207 }
208 while (count < oemtable->length) {
209 switch (*oemptr) {
210 case MP_TRANSLATION:
211 {
212 struct mpc_config_translation *m =
213 (struct mpc_config_translation *)oemptr;
214 MP_translation_info(m);
215 oemptr += sizeof(*m);
216 count += sizeof(*m);
217 ++mpc_record;
218 break;
219 }
220 default:
221 {
222 printk(KERN_WARNING
223 "Unrecognised OEM table entry type! - %d\n",
224 (int)*oemptr);
225 return;
226 }
227 }
228 }
229}
230
231static int __init numaq_setup_ioapic_ids(void)
232{
233 /* so can skip it */
234 return 1;
235}
236
237static int __init numaq_update_genapic(void)
238{
239 genapic->wakeup_cpu = wakeup_secondary_cpu_via_nmi;
240
241 return 0;
242}
243
244static struct x86_quirks numaq_x86_quirks __initdata = {
245 .arch_pre_time_init = numaq_pre_time_init,
246 .arch_time_init = NULL,
247 .arch_pre_intr_init = NULL,
248 .arch_memory_setup = NULL,
249 .arch_intr_init = NULL,
250 .arch_trap_init = NULL,
251 .mach_get_smp_config = NULL,
252 .mach_find_smp_config = NULL,
253 .mpc_record = &mpc_record,
254 .mpc_apic_id = mpc_apic_id,
255 .mpc_oem_bus_info = mpc_oem_bus_info,
256 .mpc_oem_pci_bus = mpc_oem_pci_bus,
257 .smp_read_mpc_oem = smp_read_mpc_oem,
258 .setup_ioapic_ids = numaq_setup_ioapic_ids,
259 .update_genapic = numaq_update_genapic,
260};
261
262void numaq_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
263{
264 if (strncmp(oem, "IBM NUMA", 8))
265 printk("Warning! Not a NUMA-Q system!\n");
266 else
267 found_numaq = 1;
268}
269
270static __init void early_check_numaq(void)
271{
272 /*
273 * Find possible boot-time SMP configuration:
274 */
275 early_find_smp_config();
276 /*
277 * get boot-time SMP configuration:
278 */
279 if (smp_found_config)
280 early_get_smp_config();
281
282 if (found_numaq)
283 x86_quirks = &numaq_x86_quirks;
284}
285
286int __init get_memcfg_numaq(void)
287{
288 early_check_numaq();
289 if (!found_numaq)
290 return 0;
291 smp_dump_qct();
292 return 1;
293}
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 95777b0faa73..3a7c5a44082e 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -26,13 +26,3 @@ struct pv_lock_ops pv_lock_ops = {
26}; 26};
27EXPORT_SYMBOL(pv_lock_ops); 27EXPORT_SYMBOL(pv_lock_ops);
28 28
29void __init paravirt_use_bytelocks(void)
30{
31#ifdef CONFIG_SMP
32 pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
33 pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
34 pv_lock_ops.spin_lock = __byte_spin_lock;
35 pv_lock_ops.spin_trylock = __byte_spin_trylock;
36 pv_lock_ops.spin_unlock = __byte_spin_unlock;
37#endif
38}
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index c6520a4e85d4..63dd358d8ee1 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -28,7 +28,6 @@
28#include <asm/paravirt.h> 28#include <asm/paravirt.h>
29#include <asm/desc.h> 29#include <asm/desc.h>
30#include <asm/setup.h> 30#include <asm/setup.h>
31#include <asm/arch_hooks.h>
32#include <asm/pgtable.h> 31#include <asm/pgtable.h>
33#include <asm/time.h> 32#include <asm/time.h>
34#include <asm/pgalloc.h> 33#include <asm/pgalloc.h>
@@ -44,6 +43,17 @@ void _paravirt_nop(void)
44{ 43{
45} 44}
46 45
46/* identity function, which can be inlined */
47u32 _paravirt_ident_32(u32 x)
48{
49 return x;
50}
51
52u64 _paravirt_ident_64(u64 x)
53{
54 return x;
55}
56
47static void __init default_banner(void) 57static void __init default_banner(void)
48{ 58{
49 printk(KERN_INFO "Booting paravirtualized kernel on %s\n", 59 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
@@ -138,9 +148,16 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
138 if (opfunc == NULL) 148 if (opfunc == NULL)
139 /* If there's no function, patch it with a ud2a (BUG) */ 149 /* If there's no function, patch it with a ud2a (BUG) */
140 ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a)); 150 ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a));
141 else if (opfunc == paravirt_nop) 151 else if (opfunc == _paravirt_nop)
142 /* If the operation is a nop, then nop the callsite */ 152 /* If the operation is a nop, then nop the callsite */
143 ret = paravirt_patch_nop(); 153 ret = paravirt_patch_nop();
154
155 /* identity functions just return their single argument */
156 else if (opfunc == _paravirt_ident_32)
157 ret = paravirt_patch_ident_32(insnbuf, len);
158 else if (opfunc == _paravirt_ident_64)
159 ret = paravirt_patch_ident_64(insnbuf, len);
160
144 else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || 161 else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
145 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) || 162 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) ||
146 type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) || 163 type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) ||
@@ -318,10 +335,10 @@ struct pv_time_ops pv_time_ops = {
318 335
319struct pv_irq_ops pv_irq_ops = { 336struct pv_irq_ops pv_irq_ops = {
320 .init_IRQ = native_init_IRQ, 337 .init_IRQ = native_init_IRQ,
321 .save_fl = native_save_fl, 338 .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
322 .restore_fl = native_restore_fl, 339 .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
323 .irq_disable = native_irq_disable, 340 .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
324 .irq_enable = native_irq_enable, 341 .irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
325 .safe_halt = native_safe_halt, 342 .safe_halt = native_safe_halt,
326 .halt = native_halt, 343 .halt = native_halt,
327#ifdef CONFIG_X86_64 344#ifdef CONFIG_X86_64
@@ -399,6 +416,14 @@ struct pv_apic_ops pv_apic_ops = {
399#endif 416#endif
400}; 417};
401 418
419#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE)
420/* 32-bit pagetable entries */
421#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_32)
422#else
423/* 64-bit pagetable entries */
424#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64)
425#endif
426
402struct pv_mmu_ops pv_mmu_ops = { 427struct pv_mmu_ops pv_mmu_ops = {
403#ifndef CONFIG_X86_64 428#ifndef CONFIG_X86_64
404 .pagetable_setup_start = native_pagetable_setup_start, 429 .pagetable_setup_start = native_pagetable_setup_start,
@@ -450,22 +475,23 @@ struct pv_mmu_ops pv_mmu_ops = {
450 .pmd_clear = native_pmd_clear, 475 .pmd_clear = native_pmd_clear,
451#endif 476#endif
452 .set_pud = native_set_pud, 477 .set_pud = native_set_pud,
453 .pmd_val = native_pmd_val, 478
454 .make_pmd = native_make_pmd, 479 .pmd_val = PTE_IDENT,
480 .make_pmd = PTE_IDENT,
455 481
456#if PAGETABLE_LEVELS == 4 482#if PAGETABLE_LEVELS == 4
457 .pud_val = native_pud_val, 483 .pud_val = PTE_IDENT,
458 .make_pud = native_make_pud, 484 .make_pud = PTE_IDENT,
485
459 .set_pgd = native_set_pgd, 486 .set_pgd = native_set_pgd,
460#endif 487#endif
461#endif /* PAGETABLE_LEVELS >= 3 */ 488#endif /* PAGETABLE_LEVELS >= 3 */
462 489
463 .pte_val = native_pte_val, 490 .pte_val = PTE_IDENT,
464 .pte_flags = native_pte_flags, 491 .pgd_val = PTE_IDENT,
465 .pgd_val = native_pgd_val,
466 492
467 .make_pte = native_make_pte, 493 .make_pte = PTE_IDENT,
468 .make_pgd = native_make_pgd, 494 .make_pgd = PTE_IDENT,
469 495
470 .dup_mmap = paravirt_nop, 496 .dup_mmap = paravirt_nop,
471 .exit_mmap = paravirt_nop, 497 .exit_mmap = paravirt_nop,
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index 9fe644f4861d..d9f32e6d6ab6 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -12,6 +12,18 @@ DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
12DEF_NATIVE(pv_cpu_ops, clts, "clts"); 12DEF_NATIVE(pv_cpu_ops, clts, "clts");
13DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc"); 13DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
14 14
15unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
16{
17 /* arg in %eax, return in %eax */
18 return 0;
19}
20
21unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
22{
23 /* arg in %edx:%eax, return in %edx:%eax */
24 return 0;
25}
26
15unsigned native_patch(u8 type, u16 clobbers, void *ibuf, 27unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
16 unsigned long addr, unsigned len) 28 unsigned long addr, unsigned len)
17{ 29{
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 061d01df9ae6..3f08f34f93eb 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -19,6 +19,21 @@ DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
19DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl"); 19DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl");
20DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); 20DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
21 21
22DEF_NATIVE(, mov32, "mov %edi, %eax");
23DEF_NATIVE(, mov64, "mov %rdi, %rax");
24
25unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
26{
27 return paravirt_patch_insns(insnbuf, len,
28 start__mov32, end__mov32);
29}
30
31unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
32{
33 return paravirt_patch_insns(insnbuf, len,
34 start__mov64, end__mov64);
35}
36
22unsigned native_patch(u8 type, u16 clobbers, void *ibuf, 37unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
23 unsigned long addr, unsigned len) 38 unsigned long addr, unsigned len)
24{ 39{
diff --git a/arch/x86/kernel/probe_roms_32.c b/arch/x86/kernel/probe_roms_32.c
index 675a48c404a5..071e7fea42e5 100644
--- a/arch/x86/kernel/probe_roms_32.c
+++ b/arch/x86/kernel/probe_roms_32.c
@@ -18,7 +18,7 @@
18#include <asm/setup.h> 18#include <asm/setup.h>
19#include <asm/sections.h> 19#include <asm/sections.h>
20#include <asm/io.h> 20#include <asm/io.h>
21#include <setup_arch.h> 21#include <asm/setup_arch.h>
22 22
23static struct resource system_rom_resource = { 23static struct resource system_rom_resource = {
24 .name = "System ROM", 24 .name = "System ROM",
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 6d12f7e37f8c..8c037051b353 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -1,16 +1,19 @@
1#include <linux/errno.h> 1#include <linux/errno.h>
2#include <linux/kernel.h> 2#include <linux/kernel.h>
3#include <linux/mm.h> 3#include <linux/mm.h>
4#include <asm/idle.h>
5#include <linux/smp.h> 4#include <linux/smp.h>
5#include <linux/prctl.h>
6#include <linux/slab.h> 6#include <linux/slab.h>
7#include <linux/sched.h> 7#include <linux/sched.h>
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/pm.h> 9#include <linux/pm.h>
10#include <linux/clockchips.h> 10#include <linux/clockchips.h>
11#include <linux/ftrace.h> 11#include <trace/power.h>
12#include <asm/system.h> 12#include <asm/system.h>
13#include <asm/apic.h> 13#include <asm/apic.h>
14#include <asm/idle.h>
15#include <asm/uaccess.h>
16#include <asm/i387.h>
14 17
15unsigned long idle_halt; 18unsigned long idle_halt;
16EXPORT_SYMBOL(idle_halt); 19EXPORT_SYMBOL(idle_halt);
@@ -19,6 +22,9 @@ EXPORT_SYMBOL(idle_nomwait);
19 22
20struct kmem_cache *task_xstate_cachep; 23struct kmem_cache *task_xstate_cachep;
21 24
25DEFINE_TRACE(power_start);
26DEFINE_TRACE(power_end);
27
22int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 28int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
23{ 29{
24 *dst = *src; 30 *dst = *src;
@@ -56,6 +62,192 @@ void arch_task_cache_init(void)
56} 62}
57 63
58/* 64/*
65 * Free current thread data structures etc..
66 */
67void exit_thread(void)
68{
69 struct task_struct *me = current;
70 struct thread_struct *t = &me->thread;
71
72 if (me->thread.io_bitmap_ptr) {
73 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
74
75 kfree(t->io_bitmap_ptr);
76 t->io_bitmap_ptr = NULL;
77 clear_thread_flag(TIF_IO_BITMAP);
78 /*
79 * Careful, clear this in the TSS too:
80 */
81 memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
82 t->io_bitmap_max = 0;
83 put_cpu();
84 }
85
86 ds_exit_thread(current);
87}
88
89void flush_thread(void)
90{
91 struct task_struct *tsk = current;
92
93#ifdef CONFIG_X86_64
94 if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
95 clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
96 if (test_tsk_thread_flag(tsk, TIF_IA32)) {
97 clear_tsk_thread_flag(tsk, TIF_IA32);
98 } else {
99 set_tsk_thread_flag(tsk, TIF_IA32);
100 current_thread_info()->status |= TS_COMPAT;
101 }
102 }
103#endif
104
105 clear_tsk_thread_flag(tsk, TIF_DEBUG);
106
107 tsk->thread.debugreg0 = 0;
108 tsk->thread.debugreg1 = 0;
109 tsk->thread.debugreg2 = 0;
110 tsk->thread.debugreg3 = 0;
111 tsk->thread.debugreg6 = 0;
112 tsk->thread.debugreg7 = 0;
113 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
114 /*
115 * Forget coprocessor state..
116 */
117 tsk->fpu_counter = 0;
118 clear_fpu(tsk);
119 clear_used_math();
120}
121
122static void hard_disable_TSC(void)
123{
124 write_cr4(read_cr4() | X86_CR4_TSD);
125}
126
127void disable_TSC(void)
128{
129 preempt_disable();
130 if (!test_and_set_thread_flag(TIF_NOTSC))
131 /*
132 * Must flip the CPU state synchronously with
133 * TIF_NOTSC in the current running context.
134 */
135 hard_disable_TSC();
136 preempt_enable();
137}
138
139static void hard_enable_TSC(void)
140{
141 write_cr4(read_cr4() & ~X86_CR4_TSD);
142}
143
144static void enable_TSC(void)
145{
146 preempt_disable();
147 if (test_and_clear_thread_flag(TIF_NOTSC))
148 /*
149 * Must flip the CPU state synchronously with
150 * TIF_NOTSC in the current running context.
151 */
152 hard_enable_TSC();
153 preempt_enable();
154}
155
156int get_tsc_mode(unsigned long adr)
157{
158 unsigned int val;
159
160 if (test_thread_flag(TIF_NOTSC))
161 val = PR_TSC_SIGSEGV;
162 else
163 val = PR_TSC_ENABLE;
164
165 return put_user(val, (unsigned int __user *)adr);
166}
167
168int set_tsc_mode(unsigned int val)
169{
170 if (val == PR_TSC_SIGSEGV)
171 disable_TSC();
172 else if (val == PR_TSC_ENABLE)
173 enable_TSC();
174 else
175 return -EINVAL;
176
177 return 0;
178}
179
180void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
181 struct tss_struct *tss)
182{
183 struct thread_struct *prev, *next;
184
185 prev = &prev_p->thread;
186 next = &next_p->thread;
187
188 if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
189 test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
190 ds_switch_to(prev_p, next_p);
191 else if (next->debugctlmsr != prev->debugctlmsr)
192 update_debugctlmsr(next->debugctlmsr);
193
194 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
195 set_debugreg(next->debugreg0, 0);
196 set_debugreg(next->debugreg1, 1);
197 set_debugreg(next->debugreg2, 2);
198 set_debugreg(next->debugreg3, 3);
199 /* no 4 and 5 */
200 set_debugreg(next->debugreg6, 6);
201 set_debugreg(next->debugreg7, 7);
202 }
203
204 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
205 test_tsk_thread_flag(next_p, TIF_NOTSC)) {
206 /* prev and next are different */
207 if (test_tsk_thread_flag(next_p, TIF_NOTSC))
208 hard_disable_TSC();
209 else
210 hard_enable_TSC();
211 }
212
213 if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
214 /*
215 * Copy the relevant range of the IO bitmap.
216 * Normally this is 128 bytes or less:
217 */
218 memcpy(tss->io_bitmap, next->io_bitmap_ptr,
219 max(prev->io_bitmap_max, next->io_bitmap_max));
220 } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
221 /*
222 * Clear any possible leftover bits:
223 */
224 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
225 }
226}
227
228int sys_fork(struct pt_regs *regs)
229{
230 return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
231}
232
233/*
234 * This is trivial, and on the face of it looks like it
235 * could equally well be done in user mode.
236 *
237 * Not so, for quite unobvious reasons - register pressure.
238 * In user mode vfork() cannot have a stack frame, and if
239 * done by calling the "clone()" system call directly, you
240 * do not have enough call-clobbered registers to hold all
241 * the information you need.
242 */
243int sys_vfork(struct pt_regs *regs)
244{
245 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
246 NULL, NULL);
247}
248
249
250/*
59 * Idle related variables and functions 251 * Idle related variables and functions
60 */ 252 */
61unsigned long boot_option_idle_override = 0; 253unsigned long boot_option_idle_override = 0;
@@ -350,7 +542,7 @@ static void c1e_idle(void)
350 542
351void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) 543void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
352{ 544{
353#ifdef CONFIG_X86_SMP 545#ifdef CONFIG_SMP
354 if (pm_idle == poll_idle && smp_num_siblings > 1) { 546 if (pm_idle == poll_idle && smp_num_siblings > 1) {
355 printk(KERN_WARNING "WARNING: polling idle and HT enabled," 547 printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
356 " performance may degrade.\n"); 548 " performance may degrade.\n");
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index bd4da2af08ae..14014d766cad 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -11,6 +11,7 @@
11 11
12#include <stdarg.h> 12#include <stdarg.h>
13 13
14#include <linux/stackprotector.h>
14#include <linux/cpu.h> 15#include <linux/cpu.h>
15#include <linux/errno.h> 16#include <linux/errno.h>
16#include <linux/sched.h> 17#include <linux/sched.h>
@@ -66,9 +67,6 @@ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
66DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; 67DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
67EXPORT_PER_CPU_SYMBOL(current_task); 68EXPORT_PER_CPU_SYMBOL(current_task);
68 69
69DEFINE_PER_CPU(int, cpu_number);
70EXPORT_PER_CPU_SYMBOL(cpu_number);
71
72/* 70/*
73 * Return saved PC of a blocked thread. 71 * Return saved PC of a blocked thread.
74 */ 72 */
@@ -94,6 +92,15 @@ void cpu_idle(void)
94{ 92{
95 int cpu = smp_processor_id(); 93 int cpu = smp_processor_id();
96 94
95 /*
96 * If we're the non-boot CPU, nothing set the stack canary up
97 * for us. CPU0 already has it initialized but no harm in
98 * doing it again. This is a good place for updating it, as
99 * we wont ever return from this function (so the invalid
100 * canaries already on the stack wont ever trigger).
101 */
102 boot_init_stack_canary();
103
97 current_thread_info()->status |= TS_POLLING; 104 current_thread_info()->status |= TS_POLLING;
98 105
99 /* endless idle loop with no priority at all */ 106 /* endless idle loop with no priority at all */
@@ -108,7 +115,6 @@ void cpu_idle(void)
108 play_dead(); 115 play_dead();
109 116
110 local_irq_disable(); 117 local_irq_disable();
111 __get_cpu_var(irq_stat).idle_timestamp = jiffies;
112 /* Don't trace irqs off for idle */ 118 /* Don't trace irqs off for idle */
113 stop_critical_timings(); 119 stop_critical_timings();
114 pm_idle(); 120 pm_idle();
@@ -132,7 +138,7 @@ void __show_regs(struct pt_regs *regs, int all)
132 if (user_mode_vm(regs)) { 138 if (user_mode_vm(regs)) {
133 sp = regs->sp; 139 sp = regs->sp;
134 ss = regs->ss & 0xffff; 140 ss = regs->ss & 0xffff;
135 savesegment(gs, gs); 141 gs = get_user_gs(regs);
136 } else { 142 } else {
137 sp = (unsigned long) (&regs->sp); 143 sp = (unsigned long) (&regs->sp);
138 savesegment(ss, ss); 144 savesegment(ss, ss);
@@ -213,6 +219,7 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
213 regs.ds = __USER_DS; 219 regs.ds = __USER_DS;
214 regs.es = __USER_DS; 220 regs.es = __USER_DS;
215 regs.fs = __KERNEL_PERCPU; 221 regs.fs = __KERNEL_PERCPU;
222 regs.gs = __KERNEL_STACK_CANARY;
216 regs.orig_ax = -1; 223 regs.orig_ax = -1;
217 regs.ip = (unsigned long) kernel_thread_helper; 224 regs.ip = (unsigned long) kernel_thread_helper;
218 regs.cs = __KERNEL_CS | get_kernel_rpl(); 225 regs.cs = __KERNEL_CS | get_kernel_rpl();
@@ -223,55 +230,6 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
223} 230}
224EXPORT_SYMBOL(kernel_thread); 231EXPORT_SYMBOL(kernel_thread);
225 232
226/*
227 * Free current thread data structures etc..
228 */
229void exit_thread(void)
230{
231 /* The process may have allocated an io port bitmap... nuke it. */
232 if (unlikely(test_thread_flag(TIF_IO_BITMAP))) {
233 struct task_struct *tsk = current;
234 struct thread_struct *t = &tsk->thread;
235 int cpu = get_cpu();
236 struct tss_struct *tss = &per_cpu(init_tss, cpu);
237
238 kfree(t->io_bitmap_ptr);
239 t->io_bitmap_ptr = NULL;
240 clear_thread_flag(TIF_IO_BITMAP);
241 /*
242 * Careful, clear this in the TSS too:
243 */
244 memset(tss->io_bitmap, 0xff, tss->io_bitmap_max);
245 t->io_bitmap_max = 0;
246 tss->io_bitmap_owner = NULL;
247 tss->io_bitmap_max = 0;
248 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
249 put_cpu();
250 }
251
252 ds_exit_thread(current);
253}
254
255void flush_thread(void)
256{
257 struct task_struct *tsk = current;
258
259 tsk->thread.debugreg0 = 0;
260 tsk->thread.debugreg1 = 0;
261 tsk->thread.debugreg2 = 0;
262 tsk->thread.debugreg3 = 0;
263 tsk->thread.debugreg6 = 0;
264 tsk->thread.debugreg7 = 0;
265 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
266 clear_tsk_thread_flag(tsk, TIF_DEBUG);
267 /*
268 * Forget coprocessor state..
269 */
270 tsk->fpu_counter = 0;
271 clear_fpu(tsk);
272 clear_used_math();
273}
274
275void release_thread(struct task_struct *dead_task) 233void release_thread(struct task_struct *dead_task)
276{ 234{
277 BUG_ON(dead_task->mm); 235 BUG_ON(dead_task->mm);
@@ -305,7 +263,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
305 263
306 p->thread.ip = (unsigned long) ret_from_fork; 264 p->thread.ip = (unsigned long) ret_from_fork;
307 265
308 savesegment(gs, p->thread.gs); 266 task_user_gs(p) = get_user_gs(regs);
309 267
310 tsk = current; 268 tsk = current;
311 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { 269 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
@@ -343,7 +301,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
343void 301void
344start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) 302start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
345{ 303{
346 __asm__("movl %0, %%gs" : : "r"(0)); 304 set_user_gs(regs, 0);
347 regs->fs = 0; 305 regs->fs = 0;
348 set_fs(USER_DS); 306 set_fs(USER_DS);
349 regs->ds = __USER_DS; 307 regs->ds = __USER_DS;
@@ -359,127 +317,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
359} 317}
360EXPORT_SYMBOL_GPL(start_thread); 318EXPORT_SYMBOL_GPL(start_thread);
361 319
362static void hard_disable_TSC(void)
363{
364 write_cr4(read_cr4() | X86_CR4_TSD);
365}
366
367void disable_TSC(void)
368{
369 preempt_disable();
370 if (!test_and_set_thread_flag(TIF_NOTSC))
371 /*
372 * Must flip the CPU state synchronously with
373 * TIF_NOTSC in the current running context.
374 */
375 hard_disable_TSC();
376 preempt_enable();
377}
378
379static void hard_enable_TSC(void)
380{
381 write_cr4(read_cr4() & ~X86_CR4_TSD);
382}
383
384static void enable_TSC(void)
385{
386 preempt_disable();
387 if (test_and_clear_thread_flag(TIF_NOTSC))
388 /*
389 * Must flip the CPU state synchronously with
390 * TIF_NOTSC in the current running context.
391 */
392 hard_enable_TSC();
393 preempt_enable();
394}
395
396int get_tsc_mode(unsigned long adr)
397{
398 unsigned int val;
399
400 if (test_thread_flag(TIF_NOTSC))
401 val = PR_TSC_SIGSEGV;
402 else
403 val = PR_TSC_ENABLE;
404
405 return put_user(val, (unsigned int __user *)adr);
406}
407
408int set_tsc_mode(unsigned int val)
409{
410 if (val == PR_TSC_SIGSEGV)
411 disable_TSC();
412 else if (val == PR_TSC_ENABLE)
413 enable_TSC();
414 else
415 return -EINVAL;
416
417 return 0;
418}
419
420static noinline void
421__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
422 struct tss_struct *tss)
423{
424 struct thread_struct *prev, *next;
425
426 prev = &prev_p->thread;
427 next = &next_p->thread;
428
429 if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
430 test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
431 ds_switch_to(prev_p, next_p);
432 else if (next->debugctlmsr != prev->debugctlmsr)
433 update_debugctlmsr(next->debugctlmsr);
434
435 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
436 set_debugreg(next->debugreg0, 0);
437 set_debugreg(next->debugreg1, 1);
438 set_debugreg(next->debugreg2, 2);
439 set_debugreg(next->debugreg3, 3);
440 /* no 4 and 5 */
441 set_debugreg(next->debugreg6, 6);
442 set_debugreg(next->debugreg7, 7);
443 }
444
445 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
446 test_tsk_thread_flag(next_p, TIF_NOTSC)) {
447 /* prev and next are different */
448 if (test_tsk_thread_flag(next_p, TIF_NOTSC))
449 hard_disable_TSC();
450 else
451 hard_enable_TSC();
452 }
453
454 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
455 /*
456 * Disable the bitmap via an invalid offset. We still cache
457 * the previous bitmap owner and the IO bitmap contents:
458 */
459 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
460 return;
461 }
462
463 if (likely(next == tss->io_bitmap_owner)) {
464 /*
465 * Previous owner of the bitmap (hence the bitmap content)
466 * matches the next task, we dont have to do anything but
467 * to set a valid offset in the TSS:
468 */
469 tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
470 return;
471 }
472 /*
473 * Lazy TSS's I/O bitmap copy. We set an invalid offset here
474 * and we let the task to get a GPF in case an I/O instruction
475 * is performed. The handler of the GPF will verify that the
476 * faulting task has a valid I/O bitmap and, it true, does the
477 * real copy and restart the instruction. This will save us
478 * redundant copies when the currently switched task does not
479 * perform any I/O during its timeslice.
480 */
481 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
482}
483 320
484/* 321/*
485 * switch_to(x,yn) should switch tasks from x to y. 322 * switch_to(x,yn) should switch tasks from x to y.
@@ -540,7 +377,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
540 * used %fs or %gs (it does not today), or if the kernel is 377 * used %fs or %gs (it does not today), or if the kernel is
541 * running inside of a hypervisor layer. 378 * running inside of a hypervisor layer.
542 */ 379 */
543 savesegment(gs, prev->gs); 380 lazy_save_gs(prev->gs);
544 381
545 /* 382 /*
546 * Load the per-thread Thread-Local Storage descriptor. 383 * Load the per-thread Thread-Local Storage descriptor.
@@ -586,64 +423,44 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
586 * Restore %gs if needed (which is common) 423 * Restore %gs if needed (which is common)
587 */ 424 */
588 if (prev->gs | next->gs) 425 if (prev->gs | next->gs)
589 loadsegment(gs, next->gs); 426 lazy_load_gs(next->gs);
590 427
591 x86_write_percpu(current_task, next_p); 428 percpu_write(current_task, next_p);
592 429
593 return prev_p; 430 return prev_p;
594} 431}
595 432
596asmlinkage int sys_fork(struct pt_regs regs) 433int sys_clone(struct pt_regs *regs)
597{
598 return do_fork(SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
599}
600
601asmlinkage int sys_clone(struct pt_regs regs)
602{ 434{
603 unsigned long clone_flags; 435 unsigned long clone_flags;
604 unsigned long newsp; 436 unsigned long newsp;
605 int __user *parent_tidptr, *child_tidptr; 437 int __user *parent_tidptr, *child_tidptr;
606 438
607 clone_flags = regs.bx; 439 clone_flags = regs->bx;
608 newsp = regs.cx; 440 newsp = regs->cx;
609 parent_tidptr = (int __user *)regs.dx; 441 parent_tidptr = (int __user *)regs->dx;
610 child_tidptr = (int __user *)regs.di; 442 child_tidptr = (int __user *)regs->di;
611 if (!newsp) 443 if (!newsp)
612 newsp = regs.sp; 444 newsp = regs->sp;
613 return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr); 445 return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr);
614}
615
616/*
617 * This is trivial, and on the face of it looks like it
618 * could equally well be done in user mode.
619 *
620 * Not so, for quite unobvious reasons - register pressure.
621 * In user mode vfork() cannot have a stack frame, and if
622 * done by calling the "clone()" system call directly, you
623 * do not have enough call-clobbered registers to hold all
624 * the information you need.
625 */
626asmlinkage int sys_vfork(struct pt_regs regs)
627{
628 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
629} 446}
630 447
631/* 448/*
632 * sys_execve() executes a new program. 449 * sys_execve() executes a new program.
633 */ 450 */
634asmlinkage int sys_execve(struct pt_regs regs) 451int sys_execve(struct pt_regs *regs)
635{ 452{
636 int error; 453 int error;
637 char *filename; 454 char *filename;
638 455
639 filename = getname((char __user *) regs.bx); 456 filename = getname((char __user *) regs->bx);
640 error = PTR_ERR(filename); 457 error = PTR_ERR(filename);
641 if (IS_ERR(filename)) 458 if (IS_ERR(filename))
642 goto out; 459 goto out;
643 error = do_execve(filename, 460 error = do_execve(filename,
644 (char __user * __user *) regs.cx, 461 (char __user * __user *) regs->cx,
645 (char __user * __user *) regs.dx, 462 (char __user * __user *) regs->dx,
646 &regs); 463 regs);
647 if (error == 0) { 464 if (error == 0) {
648 /* Make sure we don't return using sysenter.. */ 465 /* Make sure we don't return using sysenter.. */
649 set_thread_flag(TIF_IRET); 466 set_thread_flag(TIF_IRET);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 85b4cb5c1980..abb7e6a7f0c6 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -16,6 +16,7 @@
16 16
17#include <stdarg.h> 17#include <stdarg.h>
18 18
19#include <linux/stackprotector.h>
19#include <linux/cpu.h> 20#include <linux/cpu.h>
20#include <linux/errno.h> 21#include <linux/errno.h>
21#include <linux/sched.h> 22#include <linux/sched.h>
@@ -47,7 +48,6 @@
47#include <asm/processor.h> 48#include <asm/processor.h>
48#include <asm/i387.h> 49#include <asm/i387.h>
49#include <asm/mmu_context.h> 50#include <asm/mmu_context.h>
50#include <asm/pda.h>
51#include <asm/prctl.h> 51#include <asm/prctl.h>
52#include <asm/desc.h> 52#include <asm/desc.h>
53#include <asm/proto.h> 53#include <asm/proto.h>
@@ -58,6 +58,12 @@
58 58
59asmlinkage extern void ret_from_fork(void); 59asmlinkage extern void ret_from_fork(void);
60 60
61DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
62EXPORT_PER_CPU_SYMBOL(current_task);
63
64DEFINE_PER_CPU(unsigned long, old_rsp);
65static DEFINE_PER_CPU(unsigned char, is_idle);
66
61unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; 67unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
62 68
63static ATOMIC_NOTIFIER_HEAD(idle_notifier); 69static ATOMIC_NOTIFIER_HEAD(idle_notifier);
@@ -76,13 +82,13 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister);
76 82
77void enter_idle(void) 83void enter_idle(void)
78{ 84{
79 write_pda(isidle, 1); 85 percpu_write(is_idle, 1);
80 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); 86 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
81} 87}
82 88
83static void __exit_idle(void) 89static void __exit_idle(void)
84{ 90{
85 if (test_and_clear_bit_pda(0, isidle) == 0) 91 if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
86 return; 92 return;
87 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); 93 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
88} 94}
@@ -112,6 +118,16 @@ static inline void play_dead(void)
112void cpu_idle(void) 118void cpu_idle(void)
113{ 119{
114 current_thread_info()->status |= TS_POLLING; 120 current_thread_info()->status |= TS_POLLING;
121
122 /*
123 * If we're the non-boot CPU, nothing set the stack canary up
124 * for us. CPU0 already has it initialized but no harm in
125 * doing it again. This is a good place for updating it, as
126 * we wont ever return from this function (so the invalid
127 * canaries already on the stack wont ever trigger).
128 */
129 boot_init_stack_canary();
130
115 /* endless idle loop with no priority at all */ 131 /* endless idle loop with no priority at all */
116 while (1) { 132 while (1) {
117 tick_nohz_stop_sched_tick(1); 133 tick_nohz_stop_sched_tick(1);
@@ -221,61 +237,6 @@ void show_regs(struct pt_regs *regs)
221 show_trace(NULL, regs, (void *)(regs + 1), regs->bp); 237 show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
222} 238}
223 239
224/*
225 * Free current thread data structures etc..
226 */
227void exit_thread(void)
228{
229 struct task_struct *me = current;
230 struct thread_struct *t = &me->thread;
231
232 if (me->thread.io_bitmap_ptr) {
233 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
234
235 kfree(t->io_bitmap_ptr);
236 t->io_bitmap_ptr = NULL;
237 clear_thread_flag(TIF_IO_BITMAP);
238 /*
239 * Careful, clear this in the TSS too:
240 */
241 memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
242 t->io_bitmap_max = 0;
243 put_cpu();
244 }
245
246 ds_exit_thread(current);
247}
248
249void flush_thread(void)
250{
251 struct task_struct *tsk = current;
252
253 if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
254 clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
255 if (test_tsk_thread_flag(tsk, TIF_IA32)) {
256 clear_tsk_thread_flag(tsk, TIF_IA32);
257 } else {
258 set_tsk_thread_flag(tsk, TIF_IA32);
259 current_thread_info()->status |= TS_COMPAT;
260 }
261 }
262 clear_tsk_thread_flag(tsk, TIF_DEBUG);
263
264 tsk->thread.debugreg0 = 0;
265 tsk->thread.debugreg1 = 0;
266 tsk->thread.debugreg2 = 0;
267 tsk->thread.debugreg3 = 0;
268 tsk->thread.debugreg6 = 0;
269 tsk->thread.debugreg7 = 0;
270 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
271 /*
272 * Forget coprocessor state..
273 */
274 tsk->fpu_counter = 0;
275 clear_fpu(tsk);
276 clear_used_math();
277}
278
279void release_thread(struct task_struct *dead_task) 240void release_thread(struct task_struct *dead_task)
280{ 241{
281 if (dead_task->mm) { 242 if (dead_task->mm) {
@@ -397,7 +358,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
397 load_gs_index(0); 358 load_gs_index(0);
398 regs->ip = new_ip; 359 regs->ip = new_ip;
399 regs->sp = new_sp; 360 regs->sp = new_sp;
400 write_pda(oldrsp, new_sp); 361 percpu_write(old_rsp, new_sp);
401 regs->cs = __USER_CS; 362 regs->cs = __USER_CS;
402 regs->ss = __USER_DS; 363 regs->ss = __USER_DS;
403 regs->flags = 0x200; 364 regs->flags = 0x200;
@@ -409,118 +370,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
409} 370}
410EXPORT_SYMBOL_GPL(start_thread); 371EXPORT_SYMBOL_GPL(start_thread);
411 372
412static void hard_disable_TSC(void)
413{
414 write_cr4(read_cr4() | X86_CR4_TSD);
415}
416
417void disable_TSC(void)
418{
419 preempt_disable();
420 if (!test_and_set_thread_flag(TIF_NOTSC))
421 /*
422 * Must flip the CPU state synchronously with
423 * TIF_NOTSC in the current running context.
424 */
425 hard_disable_TSC();
426 preempt_enable();
427}
428
429static void hard_enable_TSC(void)
430{
431 write_cr4(read_cr4() & ~X86_CR4_TSD);
432}
433
434static void enable_TSC(void)
435{
436 preempt_disable();
437 if (test_and_clear_thread_flag(TIF_NOTSC))
438 /*
439 * Must flip the CPU state synchronously with
440 * TIF_NOTSC in the current running context.
441 */
442 hard_enable_TSC();
443 preempt_enable();
444}
445
446int get_tsc_mode(unsigned long adr)
447{
448 unsigned int val;
449
450 if (test_thread_flag(TIF_NOTSC))
451 val = PR_TSC_SIGSEGV;
452 else
453 val = PR_TSC_ENABLE;
454
455 return put_user(val, (unsigned int __user *)adr);
456}
457
458int set_tsc_mode(unsigned int val)
459{
460 if (val == PR_TSC_SIGSEGV)
461 disable_TSC();
462 else if (val == PR_TSC_ENABLE)
463 enable_TSC();
464 else
465 return -EINVAL;
466
467 return 0;
468}
469
470/*
471 * This special macro can be used to load a debugging register
472 */
473#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
474
475static inline void __switch_to_xtra(struct task_struct *prev_p,
476 struct task_struct *next_p,
477 struct tss_struct *tss)
478{
479 struct thread_struct *prev, *next;
480
481 prev = &prev_p->thread,
482 next = &next_p->thread;
483
484 if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
485 test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
486 ds_switch_to(prev_p, next_p);
487 else if (next->debugctlmsr != prev->debugctlmsr)
488 update_debugctlmsr(next->debugctlmsr);
489
490 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
491 loaddebug(next, 0);
492 loaddebug(next, 1);
493 loaddebug(next, 2);
494 loaddebug(next, 3);
495 /* no 4 and 5 */
496 loaddebug(next, 6);
497 loaddebug(next, 7);
498 }
499
500 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
501 test_tsk_thread_flag(next_p, TIF_NOTSC)) {
502 /* prev and next are different */
503 if (test_tsk_thread_flag(next_p, TIF_NOTSC))
504 hard_disable_TSC();
505 else
506 hard_enable_TSC();
507 }
508
509 if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
510 /*
511 * Copy the relevant range of the IO bitmap.
512 * Normally this is 128 bytes or less:
513 */
514 memcpy(tss->io_bitmap, next->io_bitmap_ptr,
515 max(prev->io_bitmap_max, next->io_bitmap_max));
516 } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
517 /*
518 * Clear any possible leftover bits:
519 */
520 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
521 }
522}
523
524/* 373/*
525 * switch_to(x,y) should switch tasks from x to y. 374 * switch_to(x,y) should switch tasks from x to y.
526 * 375 *
@@ -618,21 +467,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
618 /* 467 /*
619 * Switch the PDA and FPU contexts. 468 * Switch the PDA and FPU contexts.
620 */ 469 */
621 prev->usersp = read_pda(oldrsp); 470 prev->usersp = percpu_read(old_rsp);
622 write_pda(oldrsp, next->usersp); 471 percpu_write(old_rsp, next->usersp);
623 write_pda(pcurrent, next_p); 472 percpu_write(current_task, next_p);
624 473
625 write_pda(kernelstack, 474 percpu_write(kernel_stack,
626 (unsigned long)task_stack_page(next_p) + 475 (unsigned long)task_stack_page(next_p) +
627 THREAD_SIZE - PDA_STACKOFFSET); 476 THREAD_SIZE - KERNEL_STACK_OFFSET);
628#ifdef CONFIG_CC_STACKPROTECTOR
629 write_pda(stack_canary, next_p->stack_canary);
630 /*
631 * Build time only check to make sure the stack_canary is at
632 * offset 40 in the pda; this is a gcc ABI requirement
633 */
634 BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
635#endif
636 477
637 /* 478 /*
638 * Now maybe reload the debug registers and handle I/O bitmaps 479 * Now maybe reload the debug registers and handle I/O bitmaps
@@ -686,11 +527,6 @@ void set_personality_64bit(void)
686 current->personality &= ~READ_IMPLIES_EXEC; 527 current->personality &= ~READ_IMPLIES_EXEC;
687} 528}
688 529
689asmlinkage long sys_fork(struct pt_regs *regs)
690{
691 return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
692}
693
694asmlinkage long 530asmlinkage long
695sys_clone(unsigned long clone_flags, unsigned long newsp, 531sys_clone(unsigned long clone_flags, unsigned long newsp,
696 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) 532 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
@@ -700,22 +536,6 @@ sys_clone(unsigned long clone_flags, unsigned long newsp,
700 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); 536 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
701} 537}
702 538
703/*
704 * This is trivial, and on the face of it looks like it
705 * could equally well be done in user mode.
706 *
707 * Not so, for quite unobvious reasons - register pressure.
708 * In user mode vfork() cannot have a stack frame, and if
709 * done by calling the "clone()" system call directly, you
710 * do not have enough call-clobbered registers to hold all
711 * the information you need.
712 */
713asmlinkage long sys_vfork(struct pt_regs *regs)
714{
715 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
716 NULL, NULL);
717}
718
719unsigned long get_wchan(struct task_struct *p) 539unsigned long get_wchan(struct task_struct *p)
720{ 540{
721 unsigned long stack; 541 unsigned long stack;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 06ca07f6ad86..99749d6e87a8 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -21,6 +21,7 @@
21#include <linux/audit.h> 21#include <linux/audit.h>
22#include <linux/seccomp.h> 22#include <linux/seccomp.h>
23#include <linux/signal.h> 23#include <linux/signal.h>
24#include <linux/ftrace.h>
24 25
25#include <asm/uaccess.h> 26#include <asm/uaccess.h>
26#include <asm/pgtable.h> 27#include <asm/pgtable.h>
@@ -75,10 +76,7 @@ static inline bool invalid_selector(u16 value)
75static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno) 76static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
76{ 77{
77 BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); 78 BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0);
78 regno >>= 2; 79 return &regs->bx + (regno >> 2);
79 if (regno > FS)
80 --regno;
81 return &regs->bx + regno;
82} 80}
83 81
84static u16 get_segment_reg(struct task_struct *task, unsigned long offset) 82static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
@@ -90,9 +88,10 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
90 if (offset != offsetof(struct user_regs_struct, gs)) 88 if (offset != offsetof(struct user_regs_struct, gs))
91 retval = *pt_regs_access(task_pt_regs(task), offset); 89 retval = *pt_regs_access(task_pt_regs(task), offset);
92 else { 90 else {
93 retval = task->thread.gs;
94 if (task == current) 91 if (task == current)
95 savesegment(gs, retval); 92 retval = get_user_gs(task_pt_regs(task));
93 else
94 retval = task_user_gs(task);
96 } 95 }
97 return retval; 96 return retval;
98} 97}
@@ -126,13 +125,10 @@ static int set_segment_reg(struct task_struct *task,
126 break; 125 break;
127 126
128 case offsetof(struct user_regs_struct, gs): 127 case offsetof(struct user_regs_struct, gs):
129 task->thread.gs = value;
130 if (task == current) 128 if (task == current)
131 /* 129 set_user_gs(task_pt_regs(task), value);
132 * The user-mode %gs is not affected by 130 else
133 * kernel entry, so we must update the CPU. 131 task_user_gs(task) = value;
134 */
135 loadsegment(gs, value);
136 } 132 }
137 133
138 return 0; 134 return 0;
@@ -273,7 +269,7 @@ static unsigned long debugreg_addr_limit(struct task_struct *task)
273 if (test_tsk_thread_flag(task, TIF_IA32)) 269 if (test_tsk_thread_flag(task, TIF_IA32))
274 return IA32_PAGE_OFFSET - 3; 270 return IA32_PAGE_OFFSET - 3;
275#endif 271#endif
276 return TASK_SIZE64 - 7; 272 return TASK_SIZE_MAX - 7;
277} 273}
278 274
279#endif /* CONFIG_X86_32 */ 275#endif /* CONFIG_X86_32 */
@@ -1421,6 +1417,9 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
1421 tracehook_report_syscall_entry(regs)) 1417 tracehook_report_syscall_entry(regs))
1422 ret = -1L; 1418 ret = -1L;
1423 1419
1420 if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE)))
1421 ftrace_syscall_enter(regs);
1422
1424 if (unlikely(current->audit_context)) { 1423 if (unlikely(current->audit_context)) {
1425 if (IS_IA32) 1424 if (IS_IA32)
1426 audit_syscall_entry(AUDIT_ARCH_I386, 1425 audit_syscall_entry(AUDIT_ARCH_I386,
@@ -1444,6 +1443,9 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
1444 if (unlikely(current->audit_context)) 1443 if (unlikely(current->audit_context))
1445 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); 1444 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1446 1445
1446 if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE)))
1447 ftrace_syscall_exit(regs);
1448
1447 if (test_thread_flag(TIF_SYSCALL_TRACE)) 1449 if (test_thread_flag(TIF_SYSCALL_TRACE))
1448 tracehook_report_syscall_exit(regs, 0); 1450 tracehook_report_syscall_exit(regs, 0);
1449 1451
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 309949e9e1c1..6a5a2970f4c5 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -74,8 +74,7 @@ static void ich_force_hpet_resume(void)
74 if (!force_hpet_address) 74 if (!force_hpet_address)
75 return; 75 return;
76 76
77 if (rcba_base == NULL) 77 BUG_ON(rcba_base == NULL);
78 BUG();
79 78
80 /* read the Function Disable register, dword mode only */ 79 /* read the Function Disable register, dword mode only */
81 val = readl(rcba_base + 0x3404); 80 val = readl(rcba_base + 0x3404);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 4526b3a75ed2..2aef36d8aca2 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -14,6 +14,7 @@
14#include <asm/reboot.h> 14#include <asm/reboot.h>
15#include <asm/pci_x86.h> 15#include <asm/pci_x86.h>
16#include <asm/virtext.h> 16#include <asm/virtext.h>
17#include <asm/cpu.h>
17 18
18#ifdef CONFIG_X86_32 19#ifdef CONFIG_X86_32
19# include <linux/dmi.h> 20# include <linux/dmi.h>
@@ -23,8 +24,6 @@
23# include <asm/iommu.h> 24# include <asm/iommu.h>
24#endif 25#endif
25 26
26#include <mach_ipi.h>
27
28/* 27/*
29 * Power off function, if any 28 * Power off function, if any
30 */ 29 */
@@ -658,7 +657,7 @@ static int crash_nmi_callback(struct notifier_block *self,
658 657
659static void smp_send_nmi_allbutself(void) 658static void smp_send_nmi_allbutself(void)
660{ 659{
661 send_IPI_allbutself(NMI_VECTOR); 660 apic->send_IPI_allbutself(NMI_VECTOR);
662} 661}
663 662
664static struct notifier_block crash_nmi_nb = { 663static struct notifier_block crash_nmi_nb = {
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index a160f3119725..41235531b11c 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -7,7 +7,7 @@
7 */ 7 */
8 8
9#include <linux/linkage.h> 9#include <linux/linkage.h>
10#include <asm/page.h> 10#include <asm/page_types.h>
11#include <asm/kexec.h> 11#include <asm/kexec.h>
12#include <asm/processor-flags.h> 12#include <asm/processor-flags.h>
13 13
@@ -17,7 +17,8 @@
17 17
18#define PTR(x) (x << 2) 18#define PTR(x) (x << 2)
19 19
20/* control_page + KEXEC_CONTROL_CODE_MAX_SIZE 20/*
21 * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
21 * ~ control_page + PAGE_SIZE are used as data storage and stack for 22 * ~ control_page + PAGE_SIZE are used as data storage and stack for
22 * jumping back 23 * jumping back
23 */ 24 */
@@ -76,8 +77,10 @@ relocate_kernel:
76 movl %eax, CP_PA_SWAP_PAGE(%edi) 77 movl %eax, CP_PA_SWAP_PAGE(%edi)
77 movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi) 78 movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi)
78 79
79 /* get physical address of control page now */ 80 /*
80 /* this is impossible after page table switch */ 81 * get physical address of control page now
82 * this is impossible after page table switch
83 */
81 movl PTR(PA_CONTROL_PAGE)(%ebp), %edi 84 movl PTR(PA_CONTROL_PAGE)(%ebp), %edi
82 85
83 /* switch to new set of page tables */ 86 /* switch to new set of page tables */
@@ -97,7 +100,8 @@ identity_mapped:
97 /* store the start address on the stack */ 100 /* store the start address on the stack */
98 pushl %edx 101 pushl %edx
99 102
100 /* Set cr0 to a known state: 103 /*
104 * Set cr0 to a known state:
101 * - Paging disabled 105 * - Paging disabled
102 * - Alignment check disabled 106 * - Alignment check disabled
103 * - Write protect disabled 107 * - Write protect disabled
@@ -113,7 +117,8 @@ identity_mapped:
113 /* clear cr4 if applicable */ 117 /* clear cr4 if applicable */
114 testl %ecx, %ecx 118 testl %ecx, %ecx
115 jz 1f 119 jz 1f
116 /* Set cr4 to a known state: 120 /*
121 * Set cr4 to a known state:
117 * Setting everything to zero seems safe. 122 * Setting everything to zero seems safe.
118 */ 123 */
119 xorl %eax, %eax 124 xorl %eax, %eax
@@ -132,15 +137,18 @@ identity_mapped:
132 call swap_pages 137 call swap_pages
133 addl $8, %esp 138 addl $8, %esp
134 139
135 /* To be certain of avoiding problems with self-modifying code 140 /*
141 * To be certain of avoiding problems with self-modifying code
136 * I need to execute a serializing instruction here. 142 * I need to execute a serializing instruction here.
137 * So I flush the TLB, it's handy, and not processor dependent. 143 * So I flush the TLB, it's handy, and not processor dependent.
138 */ 144 */
139 xorl %eax, %eax 145 xorl %eax, %eax
140 movl %eax, %cr3 146 movl %eax, %cr3
141 147
142 /* set all of the registers to known values */ 148 /*
143 /* leave %esp alone */ 149 * set all of the registers to known values
150 * leave %esp alone
151 */
144 152
145 testl %esi, %esi 153 testl %esi, %esi
146 jnz 1f 154 jnz 1f
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index f5afe665a82b..4de8f5b3d476 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -7,10 +7,10 @@
7 */ 7 */
8 8
9#include <linux/linkage.h> 9#include <linux/linkage.h>
10#include <asm/page.h> 10#include <asm/page_types.h>
11#include <asm/kexec.h> 11#include <asm/kexec.h>
12#include <asm/processor-flags.h> 12#include <asm/processor-flags.h>
13#include <asm/pgtable.h> 13#include <asm/pgtable_types.h>
14 14
15/* 15/*
16 * Must be relocatable PIC code callable as a C function 16 * Must be relocatable PIC code callable as a C function
@@ -19,145 +19,76 @@
19#define PTR(x) (x << 3) 19#define PTR(x) (x << 3)
20#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) 20#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
21 21
22/*
23 * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
24 * ~ control_page + PAGE_SIZE are used as data storage and stack for
25 * jumping back
26 */
27#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
28
29/* Minimal CPU state */
30#define RSP DATA(0x0)
31#define CR0 DATA(0x8)
32#define CR3 DATA(0x10)
33#define CR4 DATA(0x18)
34
35/* other data */
36#define CP_PA_TABLE_PAGE DATA(0x20)
37#define CP_PA_SWAP_PAGE DATA(0x28)
38#define CP_PA_BACKUP_PAGES_MAP DATA(0x30)
39
22 .text 40 .text
23 .align PAGE_SIZE 41 .align PAGE_SIZE
24 .code64 42 .code64
25 .globl relocate_kernel 43 .globl relocate_kernel
26relocate_kernel: 44relocate_kernel:
27 /* %rdi indirection_page 45 /*
46 * %rdi indirection_page
28 * %rsi page_list 47 * %rsi page_list
29 * %rdx start address 48 * %rdx start address
49 * %rcx preserve_context
30 */ 50 */
31 51
32 /* map the control page at its virtual address */ 52 /* Save the CPU context, used for jumping back */
33 53 pushq %rbx
34 movq $0x0000ff8000000000, %r10 /* mask */ 54 pushq %rbp
35 mov $(39 - 3), %cl /* bits to shift */ 55 pushq %r12
36 movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 /* address to map */ 56 pushq %r13
37 57 pushq %r14
38 movq %r11, %r9 58 pushq %r15
39 andq %r10, %r9 59 pushf
40 shrq %cl, %r9 60
41 61 movq PTR(VA_CONTROL_PAGE)(%rsi), %r11
42 movq PTR(VA_PGD)(%rsi), %r8 62 movq %rsp, RSP(%r11)
43 addq %r8, %r9 63 movq %cr0, %rax
44 movq PTR(PA_PUD_0)(%rsi), %r8 64 movq %rax, CR0(%r11)
45 orq $PAGE_ATTR, %r8 65 movq %cr3, %rax
46 movq %r8, (%r9) 66 movq %rax, CR3(%r11)
47 67 movq %cr4, %rax
48 shrq $9, %r10 68 movq %rax, CR4(%r11)
49 sub $9, %cl
50
51 movq %r11, %r9
52 andq %r10, %r9
53 shrq %cl, %r9
54
55 movq PTR(VA_PUD_0)(%rsi), %r8
56 addq %r8, %r9
57 movq PTR(PA_PMD_0)(%rsi), %r8
58 orq $PAGE_ATTR, %r8
59 movq %r8, (%r9)
60
61 shrq $9, %r10
62 sub $9, %cl
63
64 movq %r11, %r9
65 andq %r10, %r9
66 shrq %cl, %r9
67
68 movq PTR(VA_PMD_0)(%rsi), %r8
69 addq %r8, %r9
70 movq PTR(PA_PTE_0)(%rsi), %r8
71 orq $PAGE_ATTR, %r8
72 movq %r8, (%r9)
73
74 shrq $9, %r10
75 sub $9, %cl
76
77 movq %r11, %r9
78 andq %r10, %r9
79 shrq %cl, %r9
80
81 movq PTR(VA_PTE_0)(%rsi), %r8
82 addq %r8, %r9
83 movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
84 orq $PAGE_ATTR, %r8
85 movq %r8, (%r9)
86
87 /* identity map the control page at its physical address */
88
89 movq $0x0000ff8000000000, %r10 /* mask */
90 mov $(39 - 3), %cl /* bits to shift */
91 movq PTR(PA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
92
93 movq %r11, %r9
94 andq %r10, %r9
95 shrq %cl, %r9
96
97 movq PTR(VA_PGD)(%rsi), %r8
98 addq %r8, %r9
99 movq PTR(PA_PUD_1)(%rsi), %r8
100 orq $PAGE_ATTR, %r8
101 movq %r8, (%r9)
102
103 shrq $9, %r10
104 sub $9, %cl
105
106 movq %r11, %r9
107 andq %r10, %r9
108 shrq %cl, %r9
109
110 movq PTR(VA_PUD_1)(%rsi), %r8
111 addq %r8, %r9
112 movq PTR(PA_PMD_1)(%rsi), %r8
113 orq $PAGE_ATTR, %r8
114 movq %r8, (%r9)
115
116 shrq $9, %r10
117 sub $9, %cl
118
119 movq %r11, %r9
120 andq %r10, %r9
121 shrq %cl, %r9
122
123 movq PTR(VA_PMD_1)(%rsi), %r8
124 addq %r8, %r9
125 movq PTR(PA_PTE_1)(%rsi), %r8
126 orq $PAGE_ATTR, %r8
127 movq %r8, (%r9)
128
129 shrq $9, %r10
130 sub $9, %cl
131
132 movq %r11, %r9
133 andq %r10, %r9
134 shrq %cl, %r9
135
136 movq PTR(VA_PTE_1)(%rsi), %r8
137 addq %r8, %r9
138 movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
139 orq $PAGE_ATTR, %r8
140 movq %r8, (%r9)
141
142relocate_new_kernel:
143 /* %rdi indirection_page
144 * %rsi page_list
145 * %rdx start address
146 */
147 69
148 /* zero out flags, and disable interrupts */ 70 /* zero out flags, and disable interrupts */
149 pushq $0 71 pushq $0
150 popfq 72 popfq
151 73
152 /* get physical address of control page now */ 74 /*
153 /* this is impossible after page table switch */ 75 * get physical address of control page now
76 * this is impossible after page table switch
77 */
154 movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 78 movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
155 79
156 /* get physical address of page table now too */ 80 /* get physical address of page table now too */
157 movq PTR(PA_TABLE_PAGE)(%rsi), %rcx 81 movq PTR(PA_TABLE_PAGE)(%rsi), %r9
82
83 /* get physical address of swap page now */
84 movq PTR(PA_SWAP_PAGE)(%rsi), %r10
158 85
159 /* switch to new set of page tables */ 86 /* save some information for jumping back */
160 movq PTR(PA_PGD)(%rsi), %r9 87 movq %r9, CP_PA_TABLE_PAGE(%r11)
88 movq %r10, CP_PA_SWAP_PAGE(%r11)
89 movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11)
90
91 /* Switch to the identity mapped page tables */
161 movq %r9, %cr3 92 movq %r9, %cr3
162 93
163 /* setup a new stack at the end of the physical control page */ 94 /* setup a new stack at the end of the physical control page */
@@ -172,7 +103,8 @@ identity_mapped:
172 /* store the start address on the stack */ 103 /* store the start address on the stack */
173 pushq %rdx 104 pushq %rdx
174 105
175 /* Set cr0 to a known state: 106 /*
107 * Set cr0 to a known state:
176 * - Paging enabled 108 * - Paging enabled
177 * - Alignment check disabled 109 * - Alignment check disabled
178 * - Write protect disabled 110 * - Write protect disabled
@@ -185,7 +117,8 @@ identity_mapped:
185 orl $(X86_CR0_PG | X86_CR0_PE), %eax 117 orl $(X86_CR0_PG | X86_CR0_PE), %eax
186 movq %rax, %cr0 118 movq %rax, %cr0
187 119
188 /* Set cr4 to a known state: 120 /*
121 * Set cr4 to a known state:
189 * - physical address extension enabled 122 * - physical address extension enabled
190 */ 123 */
191 movq $X86_CR4_PAE, %rax 124 movq $X86_CR4_PAE, %rax
@@ -194,12 +127,88 @@ identity_mapped:
194 jmp 1f 127 jmp 1f
1951: 1281:
196 129
197 /* Switch to the identity mapped page tables, 130 /* Flush the TLB (needed?) */
198 * and flush the TLB. 131 movq %r9, %cr3
199 */ 132
200 movq %rcx, %cr3 133 movq %rcx, %r11
134 call swap_pages
135
136 /*
137 * To be certain of avoiding problems with self-modifying code
138 * I need to execute a serializing instruction here.
139 * So I flush the TLB by reloading %cr3 here, it's handy,
140 * and not processor dependent.
141 */
142 movq %cr3, %rax
143 movq %rax, %cr3
144
145 /*
146 * set all of the registers to known values
147 * leave %rsp alone
148 */
149
150 testq %r11, %r11
151 jnz 1f
152 xorq %rax, %rax
153 xorq %rbx, %rbx
154 xorq %rcx, %rcx
155 xorq %rdx, %rdx
156 xorq %rsi, %rsi
157 xorq %rdi, %rdi
158 xorq %rbp, %rbp
159 xorq %r8, %r8
160 xorq %r9, %r9
161 xorq %r10, %r9
162 xorq %r11, %r11
163 xorq %r12, %r12
164 xorq %r13, %r13
165 xorq %r14, %r14
166 xorq %r15, %r15
167
168 ret
169
1701:
171 popq %rdx
172 leaq PAGE_SIZE(%r10), %rsp
173 call *%rdx
174
175 /* get the re-entry point of the peer system */
176 movq 0(%rsp), %rbp
177 call 1f
1781:
179 popq %r8
180 subq $(1b - relocate_kernel), %r8
181 movq CP_PA_SWAP_PAGE(%r8), %r10
182 movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi
183 movq CP_PA_TABLE_PAGE(%r8), %rax
184 movq %rax, %cr3
185 lea PAGE_SIZE(%r8), %rsp
186 call swap_pages
187 movq $virtual_mapped, %rax
188 pushq %rax
189 ret
190
191virtual_mapped:
192 movq RSP(%r8), %rsp
193 movq CR4(%r8), %rax
194 movq %rax, %cr4
195 movq CR3(%r8), %rax
196 movq CR0(%r8), %r8
197 movq %rax, %cr3
198 movq %r8, %cr0
199 movq %rbp, %rax
200
201 popf
202 popq %r15
203 popq %r14
204 popq %r13
205 popq %r12
206 popq %rbp
207 popq %rbx
208 ret
201 209
202 /* Do the copies */ 210 /* Do the copies */
211swap_pages:
203 movq %rdi, %rcx /* Put the page_list in %rcx */ 212 movq %rdi, %rcx /* Put the page_list in %rcx */
204 xorq %rdi, %rdi 213 xorq %rdi, %rdi
205 xorq %rsi, %rsi 214 xorq %rsi, %rsi
@@ -231,36 +240,27 @@ identity_mapped:
231 movq %rcx, %rsi /* For ever source page do a copy */ 240 movq %rcx, %rsi /* For ever source page do a copy */
232 andq $0xfffffffffffff000, %rsi 241 andq $0xfffffffffffff000, %rsi
233 242
243 movq %rdi, %rdx
244 movq %rsi, %rax
245
246 movq %r10, %rdi
234 movq $512, %rcx 247 movq $512, %rcx
235 rep ; movsq 248 rep ; movsq
236 jmp 0b
2373:
238
239 /* To be certain of avoiding problems with self-modifying code
240 * I need to execute a serializing instruction here.
241 * So I flush the TLB by reloading %cr3 here, it's handy,
242 * and not processor dependent.
243 */
244 movq %cr3, %rax
245 movq %rax, %cr3
246 249
247 /* set all of the registers to known values */ 250 movq %rax, %rdi
248 /* leave %rsp alone */ 251 movq %rdx, %rsi
252 movq $512, %rcx
253 rep ; movsq
249 254
250 xorq %rax, %rax 255 movq %rdx, %rdi
251 xorq %rbx, %rbx 256 movq %r10, %rsi
252 xorq %rcx, %rcx 257 movq $512, %rcx
253 xorq %rdx, %rdx 258 rep ; movsq
254 xorq %rsi, %rsi
255 xorq %rdi, %rdi
256 xorq %rbp, %rbp
257 xorq %r8, %r8
258 xorq %r9, %r9
259 xorq %r10, %r9
260 xorq %r11, %r11
261 xorq %r12, %r12
262 xorq %r13, %r13
263 xorq %r14, %r14
264 xorq %r15, %r15
265 259
260 lea PAGE_SIZE(%rax), %rsi
261 jmp 0b
2623:
266 ret 263 ret
264
265 .globl kexec_control_code_size
266.set kexec_control_code_size, . - relocate_kernel
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 6a8811a69324..f28c56e6bf94 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -74,14 +74,15 @@
74#include <asm/e820.h> 74#include <asm/e820.h>
75#include <asm/mpspec.h> 75#include <asm/mpspec.h>
76#include <asm/setup.h> 76#include <asm/setup.h>
77#include <asm/arch_hooks.h>
78#include <asm/efi.h> 77#include <asm/efi.h>
78#include <asm/timer.h>
79#include <asm/i8259.h>
79#include <asm/sections.h> 80#include <asm/sections.h>
80#include <asm/dmi.h> 81#include <asm/dmi.h>
81#include <asm/io_apic.h> 82#include <asm/io_apic.h>
82#include <asm/ist.h> 83#include <asm/ist.h>
83#include <asm/vmi.h> 84#include <asm/vmi.h>
84#include <setup_arch.h> 85#include <asm/setup_arch.h>
85#include <asm/bios_ebda.h> 86#include <asm/bios_ebda.h>
86#include <asm/cacheflush.h> 87#include <asm/cacheflush.h>
87#include <asm/processor.h> 88#include <asm/processor.h>
@@ -89,7 +90,7 @@
89 90
90#include <asm/system.h> 91#include <asm/system.h>
91#include <asm/vsyscall.h> 92#include <asm/vsyscall.h>
92#include <asm/smp.h> 93#include <asm/cpu.h>
93#include <asm/desc.h> 94#include <asm/desc.h>
94#include <asm/dma.h> 95#include <asm/dma.h>
95#include <asm/iommu.h> 96#include <asm/iommu.h>
@@ -97,7 +98,6 @@
97#include <asm/mmu_context.h> 98#include <asm/mmu_context.h>
98#include <asm/proto.h> 99#include <asm/proto.h>
99 100
100#include <mach_apic.h>
101#include <asm/paravirt.h> 101#include <asm/paravirt.h>
102#include <asm/hypervisor.h> 102#include <asm/hypervisor.h>
103 103
@@ -112,6 +112,20 @@
112#define ARCH_SETUP 112#define ARCH_SETUP
113#endif 113#endif
114 114
115unsigned int boot_cpu_id __read_mostly;
116
117#ifdef CONFIG_X86_64
118int default_cpu_present_to_apicid(int mps_cpu)
119{
120 return __default_cpu_present_to_apicid(mps_cpu);
121}
122
123int default_check_phys_apicid_present(int boot_cpu_physical_apicid)
124{
125 return __default_check_phys_apicid_present(boot_cpu_physical_apicid);
126}
127#endif
128
115#ifndef CONFIG_DEBUG_BOOT_PARAMS 129#ifndef CONFIG_DEBUG_BOOT_PARAMS
116struct boot_params __initdata boot_params; 130struct boot_params __initdata boot_params;
117#else 131#else
@@ -188,7 +202,9 @@ struct ist_info ist_info;
188#endif 202#endif
189 203
190#else 204#else
191struct cpuinfo_x86 boot_cpu_data __read_mostly; 205struct cpuinfo_x86 boot_cpu_data __read_mostly = {
206 .x86_phys_bits = MAX_PHYSMEM_BITS,
207};
192EXPORT_SYMBOL(boot_cpu_data); 208EXPORT_SYMBOL(boot_cpu_data);
193#endif 209#endif
194 210
@@ -586,20 +602,7 @@ static int __init setup_elfcorehdr(char *arg)
586early_param("elfcorehdr", setup_elfcorehdr); 602early_param("elfcorehdr", setup_elfcorehdr);
587#endif 603#endif
588 604
589static int __init default_update_genapic(void) 605static struct x86_quirks default_x86_quirks __initdata;
590{
591#ifdef CONFIG_X86_SMP
592# if defined(CONFIG_X86_GENERICARCH) || defined(CONFIG_X86_64)
593 genapic->wakeup_cpu = wakeup_secondary_cpu_via_init;
594# endif
595#endif
596
597 return 0;
598}
599
600static struct x86_quirks default_x86_quirks __initdata = {
601 .update_genapic = default_update_genapic,
602};
603 606
604struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; 607struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
605 608
@@ -656,7 +659,6 @@ void __init setup_arch(char **cmdline_p)
656#ifdef CONFIG_X86_32 659#ifdef CONFIG_X86_32
657 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); 660 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
658 visws_early_detect(); 661 visws_early_detect();
659 pre_setup_arch_hook();
660#else 662#else
661 printk(KERN_INFO "Command line: %s\n", boot_command_line); 663 printk(KERN_INFO "Command line: %s\n", boot_command_line);
662#endif 664#endif
@@ -824,8 +826,7 @@ void __init setup_arch(char **cmdline_p)
824#else 826#else
825 num_physpages = max_pfn; 827 num_physpages = max_pfn;
826 828
827 if (cpu_has_x2apic) 829 check_x2apic();
828 check_x2apic();
829 830
830 /* How many end-of-memory variables you have, grandma! */ 831 /* How many end-of-memory variables you have, grandma! */
831 /* need this before calling reserve_initrd */ 832 /* need this before calling reserve_initrd */
@@ -865,9 +866,7 @@ void __init setup_arch(char **cmdline_p)
865 866
866 reserve_initrd(); 867 reserve_initrd();
867 868
868#ifdef CONFIG_X86_64
869 vsmp_init(); 869 vsmp_init();
870#endif
871 870
872 io_delay_init(); 871 io_delay_init();
873 872
@@ -893,12 +892,11 @@ void __init setup_arch(char **cmdline_p)
893 */ 892 */
894 acpi_reserve_bootmem(); 893 acpi_reserve_bootmem();
895#endif 894#endif
896#ifdef CONFIG_X86_FIND_SMP_CONFIG
897 /* 895 /*
898 * Find and reserve possible boot-time SMP configuration: 896 * Find and reserve possible boot-time SMP configuration:
899 */ 897 */
900 find_smp_config(); 898 find_smp_config();
901#endif 899
902 reserve_crashkernel(); 900 reserve_crashkernel();
903 901
904#ifdef CONFIG_X86_64 902#ifdef CONFIG_X86_64
@@ -925,9 +923,7 @@ void __init setup_arch(char **cmdline_p)
925 map_vsyscall(); 923 map_vsyscall();
926#endif 924#endif
927 925
928#ifdef CONFIG_X86_GENERICARCH
929 generic_apic_probe(); 926 generic_apic_probe();
930#endif
931 927
932 early_quirks(); 928 early_quirks();
933 929
@@ -978,4 +974,95 @@ void __init setup_arch(char **cmdline_p)
978#endif 974#endif
979} 975}
980 976
977#ifdef CONFIG_X86_32
978
979/**
980 * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
981 *
982 * Description:
983 * Perform any necessary interrupt initialisation prior to setting up
984 * the "ordinary" interrupt call gates. For legacy reasons, the ISA
985 * interrupts should be initialised here if the machine emulates a PC
986 * in any way.
987 **/
988void __init x86_quirk_pre_intr_init(void)
989{
990 if (x86_quirks->arch_pre_intr_init) {
991 if (x86_quirks->arch_pre_intr_init())
992 return;
993 }
994 init_ISA_irqs();
995}
996
997/**
998 * x86_quirk_intr_init - post gate setup interrupt initialisation
999 *
1000 * Description:
1001 * Fill in any interrupts that may have been left out by the general
1002 * init_IRQ() routine. interrupts having to do with the machine rather
1003 * than the devices on the I/O bus (like APIC interrupts in intel MP
1004 * systems) are started here.
1005 **/
1006void __init x86_quirk_intr_init(void)
1007{
1008 if (x86_quirks->arch_intr_init) {
1009 if (x86_quirks->arch_intr_init())
1010 return;
1011 }
1012}
1013
1014/**
1015 * x86_quirk_trap_init - initialise system specific traps
1016 *
1017 * Description:
1018 * Called as the final act of trap_init(). Used in VISWS to initialise
1019 * the various board specific APIC traps.
1020 **/
1021void __init x86_quirk_trap_init(void)
1022{
1023 if (x86_quirks->arch_trap_init) {
1024 if (x86_quirks->arch_trap_init())
1025 return;
1026 }
1027}
1028
1029static struct irqaction irq0 = {
1030 .handler = timer_interrupt,
1031 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
1032 .mask = CPU_MASK_NONE,
1033 .name = "timer"
1034};
1035
1036/**
1037 * x86_quirk_pre_time_init - do any specific initialisations before.
1038 *
1039 **/
1040void __init x86_quirk_pre_time_init(void)
1041{
1042 if (x86_quirks->arch_pre_time_init)
1043 x86_quirks->arch_pre_time_init();
1044}
981 1045
1046/**
1047 * x86_quirk_time_init - do any specific initialisations for the system timer.
1048 *
1049 * Description:
1050 * Must plug the system timer interrupt source at HZ into the IRQ listed
1051 * in irq_vectors.h:TIMER_IRQ
1052 **/
1053void __init x86_quirk_time_init(void)
1054{
1055 if (x86_quirks->arch_time_init) {
1056 /*
1057 * A nonzero return code does not mean failure, it means
1058 * that the architecture quirk does not want any
1059 * generic (timer) setup to be performed after this:
1060 */
1061 if (x86_quirks->arch_time_init())
1062 return;
1063 }
1064
1065 irq0.mask = cpumask_of_cpu(0);
1066 setup_irq(0, &irq0);
1067}
1068#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 01161077a49c..efa615f2bf43 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -7,402 +7,482 @@
7#include <linux/crash_dump.h> 7#include <linux/crash_dump.h>
8#include <linux/smp.h> 8#include <linux/smp.h>
9#include <linux/topology.h> 9#include <linux/topology.h>
10#include <linux/pfn.h>
10#include <asm/sections.h> 11#include <asm/sections.h>
11#include <asm/processor.h> 12#include <asm/processor.h>
12#include <asm/setup.h> 13#include <asm/setup.h>
13#include <asm/mpspec.h> 14#include <asm/mpspec.h>
14#include <asm/apicdef.h> 15#include <asm/apicdef.h>
15#include <asm/highmem.h> 16#include <asm/highmem.h>
17#include <asm/proto.h>
18#include <asm/cpumask.h>
19#include <asm/cpu.h>
20#include <asm/stackprotector.h>
16 21
17#ifdef CONFIG_X86_LOCAL_APIC 22#ifdef CONFIG_DEBUG_PER_CPU_MAPS
18unsigned int num_processors; 23# define DBG(x...) printk(KERN_DEBUG x)
19unsigned disabled_cpus __cpuinitdata; 24#else
20/* Processor that is doing the boot up */ 25# define DBG(x...)
21unsigned int boot_cpu_physical_apicid = -1U;
22EXPORT_SYMBOL(boot_cpu_physical_apicid);
23unsigned int max_physical_apicid;
24
25/* Bitmask of physically existing CPUs */
26physid_mask_t phys_cpu_present_map;
27#endif 26#endif
28 27
29/* map cpu index to physical APIC ID */ 28DEFINE_PER_CPU(int, cpu_number);
30DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID); 29EXPORT_PER_CPU_SYMBOL(cpu_number);
31DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
32EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
33EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
34
35#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
36#define X86_64_NUMA 1
37 30
38/* map cpu index to node index */ 31#ifdef CONFIG_X86_64
39DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); 32#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load)
40EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); 33#else
34#define BOOT_PERCPU_OFFSET 0
35#endif
41 36
42/* which logical CPUs are on which nodes */ 37DEFINE_PER_CPU(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
43cpumask_t *node_to_cpumask_map; 38EXPORT_PER_CPU_SYMBOL(this_cpu_off);
44EXPORT_SYMBOL(node_to_cpumask_map);
45 39
46/* setup node_to_cpumask_map */ 40unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
47static void __init setup_node_to_cpumask_map(void); 41 [0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET,
42};
43EXPORT_SYMBOL(__per_cpu_offset);
48 44
45/*
46 * On x86_64 symbols referenced from code should be reachable using
47 * 32bit relocations. Reserve space for static percpu variables in
48 * modules so that they are always served from the first chunk which
49 * is located at the percpu segment base. On x86_32, anything can
50 * address anywhere. No need to reserve space in the first chunk.
51 */
52#ifdef CONFIG_X86_64
53#define PERCPU_FIRST_CHUNK_RESERVE PERCPU_MODULE_RESERVE
49#else 54#else
50static inline void setup_node_to_cpumask_map(void) { } 55#define PERCPU_FIRST_CHUNK_RESERVE 0
51#endif 56#endif
52 57
53#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP) 58/**
54/* 59 * pcpu_need_numa - determine percpu allocation needs to consider NUMA
55 * Copy data used in early init routines from the initial arrays to the 60 *
56 * per cpu data areas. These arrays then become expendable and the 61 * If NUMA is not configured or there is only one NUMA node available,
57 * *_early_ptr's are zeroed indicating that the static arrays are gone. 62 * there is no reason to consider NUMA. This function determines
63 * whether percpu allocation should consider NUMA or not.
64 *
65 * RETURNS:
66 * true if NUMA should be considered; otherwise, false.
58 */ 67 */
59static void __init setup_per_cpu_maps(void) 68static bool __init pcpu_need_numa(void)
60{ 69{
61 int cpu; 70#ifdef CONFIG_NEED_MULTIPLE_NODES
71 pg_data_t *last = NULL;
72 unsigned int cpu;
62 73
63 for_each_possible_cpu(cpu) { 74 for_each_possible_cpu(cpu) {
64 per_cpu(x86_cpu_to_apicid, cpu) = 75 int node = early_cpu_to_node(cpu);
65 early_per_cpu_map(x86_cpu_to_apicid, cpu);
66 per_cpu(x86_bios_cpu_apicid, cpu) =
67 early_per_cpu_map(x86_bios_cpu_apicid, cpu);
68#ifdef X86_64_NUMA
69 per_cpu(x86_cpu_to_node_map, cpu) =
70 early_per_cpu_map(x86_cpu_to_node_map, cpu);
71#endif
72 }
73 76
74 /* indicate the early static arrays will soon be gone */ 77 if (node_online(node) && NODE_DATA(node) &&
75 early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; 78 last && last != NODE_DATA(node))
76 early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; 79 return true;
77#ifdef X86_64_NUMA 80
78 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; 81 last = NODE_DATA(node);
82 }
79#endif 83#endif
84 return false;
80} 85}
81 86
82#ifdef CONFIG_X86_32 87/**
83/* 88 * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
84 * Great future not-so-futuristic plan: make i386 and x86_64 do it 89 * @cpu: cpu to allocate for
85 * the same way 90 * @size: size allocation in bytes
86 */ 91 * @align: alignment
87unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; 92 *
88EXPORT_SYMBOL(__per_cpu_offset); 93 * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper
89static inline void setup_cpu_pda_map(void) { } 94 * does the right thing for NUMA regardless of the current
90 95 * configuration.
91#elif !defined(CONFIG_SMP) 96 *
92static inline void setup_cpu_pda_map(void) { } 97 * RETURNS:
93 98 * Pointer to the allocated area on success, NULL on failure.
94#else /* CONFIG_SMP && CONFIG_X86_64 */
95
96/*
97 * Allocate cpu_pda pointer table and array via alloc_bootmem.
98 */ 99 */
99static void __init setup_cpu_pda_map(void) 100static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
101 unsigned long align)
100{ 102{
101 char *pda; 103 const unsigned long goal = __pa(MAX_DMA_ADDRESS);
102 struct x8664_pda **new_cpu_pda; 104#ifdef CONFIG_NEED_MULTIPLE_NODES
103 unsigned long size; 105 int node = early_cpu_to_node(cpu);
104 int cpu; 106 void *ptr;
105 107
106 size = roundup(sizeof(struct x8664_pda), cache_line_size()); 108 if (!node_online(node) || !NODE_DATA(node)) {
107 109 ptr = __alloc_bootmem_nopanic(size, align, goal);
108 /* allocate cpu_pda array and pointer table */ 110 pr_info("cpu %d has no node %d or node-local memory\n",
109 { 111 cpu, node);
110 unsigned long tsize = nr_cpu_ids * sizeof(void *); 112 pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
111 unsigned long asize = size * (nr_cpu_ids - 1); 113 cpu, size, __pa(ptr));
112 114 } else {
113 tsize = roundup(tsize, cache_line_size()); 115 ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
114 new_cpu_pda = alloc_bootmem(tsize + asize); 116 size, align, goal);
115 pda = (char *)new_cpu_pda + tsize; 117 pr_debug("per cpu data for cpu%d %lu bytes on node%d at "
118 "%016lx\n", cpu, size, node, __pa(ptr));
116 } 119 }
117 120 return ptr;
118 /* initialize pointer table to static pda's */ 121#else
119 for_each_possible_cpu(cpu) { 122 return __alloc_bootmem_nopanic(size, align, goal);
120 if (cpu == 0) { 123#endif
121 /* leave boot cpu pda in place */
122 new_cpu_pda[0] = cpu_pda(0);
123 continue;
124 }
125 new_cpu_pda[cpu] = (struct x8664_pda *)pda;
126 new_cpu_pda[cpu]->in_bootmem = 1;
127 pda += size;
128 }
129
130 /* point to new pointer table */
131 _cpu_pda = new_cpu_pda;
132} 124}
133 125
134#endif /* CONFIG_SMP && CONFIG_X86_64 */ 126/*
135 127 * Remap allocator
136#ifdef CONFIG_X86_64 128 *
129 * This allocator uses PMD page as unit. A PMD page is allocated for
130 * each cpu and each is remapped into vmalloc area using PMD mapping.
131 * As PMD page is quite large, only part of it is used for the first
132 * chunk. Unused part is returned to the bootmem allocator.
133 *
134 * So, the PMD pages are mapped twice - once to the physical mapping
135 * and to the vmalloc area for the first percpu chunk. The double
136 * mapping does add one more PMD TLB entry pressure but still is much
137 * better than only using 4k mappings while still being NUMA friendly.
138 */
139#ifdef CONFIG_NEED_MULTIPLE_NODES
140static size_t pcpur_size __initdata;
141static void **pcpur_ptrs __initdata;
137 142
138/* correctly size the local cpu masks */ 143static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
139static void __init setup_cpu_local_masks(void)
140{ 144{
141 alloc_bootmem_cpumask_var(&cpu_initialized_mask); 145 size_t off = (size_t)pageno << PAGE_SHIFT;
142 alloc_bootmem_cpumask_var(&cpu_callin_mask);
143 alloc_bootmem_cpumask_var(&cpu_callout_mask);
144 alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
145}
146 146
147#else /* CONFIG_X86_32 */ 147 if (off >= pcpur_size)
148 return NULL;
148 149
149static inline void setup_cpu_local_masks(void) 150 return virt_to_page(pcpur_ptrs[cpu] + off);
150{
151} 151}
152 152
153#endif /* CONFIG_X86_32 */ 153static ssize_t __init setup_pcpu_remap(size_t static_size)
154
155/*
156 * Great future plan:
157 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
158 * Always point %gs to its beginning
159 */
160void __init setup_per_cpu_areas(void)
161{ 154{
162 ssize_t size, old_size; 155 static struct vm_struct vm;
163 char *ptr; 156 pg_data_t *last;
164 int cpu; 157 size_t ptrs_size, dyn_size;
165 unsigned long align = 1; 158 unsigned int cpu;
166 159 ssize_t ret;
167 /* Setup cpu_pda map */ 160
168 setup_cpu_pda_map(); 161 /*
162 * If large page isn't supported, there's no benefit in doing
163 * this. Also, on non-NUMA, embedding is better.
164 */
165 if (!cpu_has_pse || pcpu_need_numa())
166 return -EINVAL;
167
168 last = NULL;
169 for_each_possible_cpu(cpu) {
170 int node = early_cpu_to_node(cpu);
169 171
170 /* Copy section for each CPU (we discard the original) */ 172 if (node_online(node) && NODE_DATA(node) &&
171 old_size = PERCPU_ENOUGH_ROOM; 173 last && last != NODE_DATA(node))
172 align = max_t(unsigned long, PAGE_SIZE, align); 174 goto proceed;
173 size = roundup(old_size, align);
174 175
175 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", 176 last = NODE_DATA(node);
176 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); 177 }
178 return -EINVAL;
179
180proceed:
181 /*
182 * Currently supports only single page. Supporting multiple
183 * pages won't be too difficult if it ever becomes necessary.
184 */
185 pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
186 PERCPU_DYNAMIC_RESERVE);
187 if (pcpur_size > PMD_SIZE) {
188 pr_warning("PERCPU: static data is larger than large page, "
189 "can't use large page\n");
190 return -EINVAL;
191 }
192 dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
177 193
178 pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size); 194 /* allocate pointer array and alloc large pages */
195 ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
196 pcpur_ptrs = alloc_bootmem(ptrs_size);
179 197
180 for_each_possible_cpu(cpu) { 198 for_each_possible_cpu(cpu) {
181#ifndef CONFIG_NEED_MULTIPLE_NODES 199 pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE);
182 ptr = __alloc_bootmem(size, align, 200 if (!pcpur_ptrs[cpu])
183 __pa(MAX_DMA_ADDRESS)); 201 goto enomem;
184#else 202
185 int node = early_cpu_to_node(cpu); 203 /*
186 if (!node_online(node) || !NODE_DATA(node)) { 204 * Only use pcpur_size bytes and give back the rest.
187 ptr = __alloc_bootmem(size, align, 205 *
188 __pa(MAX_DMA_ADDRESS)); 206 * Ingo: The 2MB up-rounding bootmem is needed to make
189 pr_info("cpu %d has no node %d or node-local memory\n", 207 * sure the partial 2MB page is still fully RAM - it's
190 cpu, node); 208 * not well-specified to have a PAT-incompatible area
191 pr_debug("per cpu data for cpu%d at %016lx\n", 209 * (unmapped RAM, device memory, etc.) in that hole.
192 cpu, __pa(ptr)); 210 */
193 } else { 211 free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
194 ptr = __alloc_bootmem_node(NODE_DATA(node), size, align, 212 PMD_SIZE - pcpur_size);
195 __pa(MAX_DMA_ADDRESS)); 213
196 pr_debug("per cpu data for cpu%d on node%d at %016lx\n", 214 memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
197 cpu, node, __pa(ptr));
198 }
199#endif
200 per_cpu_offset(cpu) = ptr - __per_cpu_start;
201 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
202 } 215 }
203 216
204 /* Setup percpu data maps */ 217 /* allocate address and map */
205 setup_per_cpu_maps(); 218 vm.flags = VM_ALLOC;
219 vm.size = num_possible_cpus() * PMD_SIZE;
220 vm_area_register_early(&vm, PMD_SIZE);
206 221
207 /* Setup node to cpumask map */ 222 for_each_possible_cpu(cpu) {
208 setup_node_to_cpumask_map(); 223 pmd_t *pmd;
209 224
210 /* Setup cpu initialized, callin, callout masks */ 225 pmd = populate_extra_pmd((unsigned long)vm.addr
211 setup_cpu_local_masks(); 226 + cpu * PMD_SIZE);
212} 227 set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])),
228 PAGE_KERNEL_LARGE));
229 }
213 230
231 /* we're ready, commit */
232 pr_info("PERCPU: Remapped at %p with large pages, static data "
233 "%zu bytes\n", vm.addr, static_size);
234
235 ret = pcpu_setup_first_chunk(pcpur_get_page, static_size,
236 PERCPU_FIRST_CHUNK_RESERVE,
237 PMD_SIZE, dyn_size, vm.addr, NULL);
238 goto out_free_ar;
239
240enomem:
241 for_each_possible_cpu(cpu)
242 if (pcpur_ptrs[cpu])
243 free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE);
244 ret = -ENOMEM;
245out_free_ar:
246 free_bootmem(__pa(pcpur_ptrs), ptrs_size);
247 return ret;
248}
249#else
250static ssize_t __init setup_pcpu_remap(size_t static_size)
251{
252 return -EINVAL;
253}
214#endif 254#endif
215 255
216#ifdef X86_64_NUMA
217
218/* 256/*
219 * Allocate node_to_cpumask_map based on number of available nodes 257 * Embedding allocator
220 * Requires node_possible_map to be valid.
221 * 258 *
222 * Note: node_to_cpumask() is not valid until after this is done. 259 * The first chunk is sized to just contain the static area plus
260 * module and dynamic reserves, and allocated as a contiguous area
261 * using bootmem allocator and used as-is without being mapped into
262 * vmalloc area. This enables the first chunk to piggy back on the
263 * linear physical PMD mapping and doesn't add any additional pressure
264 * to TLB. Note that if the needed size is smaller than the minimum
265 * unit size, the leftover is returned to the bootmem allocator.
223 */ 266 */
224static void __init setup_node_to_cpumask_map(void) 267static void *pcpue_ptr __initdata;
225{ 268static size_t pcpue_size __initdata;
226 unsigned int node, num = 0; 269static size_t pcpue_unit_size __initdata;
227 cpumask_t *map;
228
229 /* setup nr_node_ids if not done yet */
230 if (nr_node_ids == MAX_NUMNODES) {
231 for_each_node_mask(node, node_possible_map)
232 num = node;
233 nr_node_ids = num + 1;
234 }
235 270
236 /* allocate the map */ 271static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
237 map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); 272{
273 size_t off = (size_t)pageno << PAGE_SHIFT;
238 274
239 pr_debug("Node to cpumask map at %p for %d nodes\n", 275 if (off >= pcpue_size)
240 map, nr_node_ids); 276 return NULL;
241 277
242 /* node_to_cpumask() will now work */ 278 return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off);
243 node_to_cpumask_map = map;
244} 279}
245 280
246void __cpuinit numa_set_node(int cpu, int node) 281static ssize_t __init setup_pcpu_embed(size_t static_size)
247{ 282{
248 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); 283 unsigned int cpu;
249 284 size_t dyn_size;
250 if (cpu_pda(cpu) && node != NUMA_NO_NODE) 285
251 cpu_pda(cpu)->nodenumber = node; 286 /*
287 * If large page isn't supported, there's no benefit in doing
288 * this. Also, embedding allocation doesn't play well with
289 * NUMA.
290 */
291 if (!cpu_has_pse || pcpu_need_numa())
292 return -EINVAL;
293
294 /* allocate and copy */
295 pcpue_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
296 PERCPU_DYNAMIC_RESERVE);
297 pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
298 dyn_size = pcpue_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
299
300 pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size,
301 PAGE_SIZE);
302 if (!pcpue_ptr)
303 return -ENOMEM;
252 304
253 if (cpu_to_node_map) 305 for_each_possible_cpu(cpu) {
254 cpu_to_node_map[cpu] = node; 306 void *ptr = pcpue_ptr + cpu * pcpue_unit_size;
255 307
256 else if (per_cpu_offset(cpu)) 308 free_bootmem(__pa(ptr + pcpue_size),
257 per_cpu(x86_cpu_to_node_map, cpu) = node; 309 pcpue_unit_size - pcpue_size);
310 memcpy(ptr, __per_cpu_load, static_size);
311 }
258 312
259 else 313 /* we're ready, commit */
260 pr_debug("Setting node for non-present cpu %d\n", cpu); 314 pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
261} 315 pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);
262 316
263void __cpuinit numa_clear_node(int cpu) 317 return pcpu_setup_first_chunk(pcpue_get_page, static_size,
264{ 318 PERCPU_FIRST_CHUNK_RESERVE,
265 numa_set_node(cpu, NUMA_NO_NODE); 319 pcpue_unit_size, dyn_size,
320 pcpue_ptr, NULL);
266} 321}
267 322
268#ifndef CONFIG_DEBUG_PER_CPU_MAPS 323/*
324 * 4k page allocator
325 *
326 * This is the basic allocator. Static percpu area is allocated
327 * page-by-page and most of initialization is done by the generic
328 * setup function.
329 */
330static struct page **pcpu4k_pages __initdata;
331static int pcpu4k_nr_static_pages __initdata;
269 332
270void __cpuinit numa_add_cpu(int cpu) 333static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
271{ 334{
272 cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); 335 if (pageno < pcpu4k_nr_static_pages)
336 return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
337 return NULL;
273} 338}
274 339
275void __cpuinit numa_remove_cpu(int cpu) 340static void __init pcpu4k_populate_pte(unsigned long addr)
276{ 341{
277 cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]); 342 populate_extra_pte(addr);
278} 343}
279 344
280#else /* CONFIG_DEBUG_PER_CPU_MAPS */ 345static ssize_t __init setup_pcpu_4k(size_t static_size)
281
282/*
283 * --------- debug versions of the numa functions ---------
284 */
285static void __cpuinit numa_set_cpumask(int cpu, int enable)
286{ 346{
287 int node = cpu_to_node(cpu); 347 size_t pages_size;
288 cpumask_t *mask; 348 unsigned int cpu;
289 char buf[64]; 349 int i, j;
290 350 ssize_t ret;
291 if (node_to_cpumask_map == NULL) { 351
292 printk(KERN_ERR "node_to_cpumask_map NULL\n"); 352 pcpu4k_nr_static_pages = PFN_UP(static_size);
293 dump_stack(); 353
294 return; 354 /* unaligned allocations can't be freed, round up to page size */
295 } 355 pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus()
296 356 * sizeof(pcpu4k_pages[0]));
297 mask = &node_to_cpumask_map[node]; 357 pcpu4k_pages = alloc_bootmem(pages_size);
298 if (enable) 358
299 cpu_set(cpu, *mask); 359 /* allocate and copy */
300 else 360 j = 0;
301 cpu_clear(cpu, *mask); 361 for_each_possible_cpu(cpu)
302 362 for (i = 0; i < pcpu4k_nr_static_pages; i++) {
303 cpulist_scnprintf(buf, sizeof(buf), mask); 363 void *ptr;
304 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", 364
305 enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf); 365 ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
306} 366 if (!ptr)
367 goto enomem;
368
369 memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
370 pcpu4k_pages[j++] = virt_to_page(ptr);
371 }
307 372
308void __cpuinit numa_add_cpu(int cpu) 373 /* we're ready, commit */
309{ 374 pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
310 numa_set_cpumask(cpu, 1); 375 pcpu4k_nr_static_pages, static_size);
376
377 ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
378 PERCPU_FIRST_CHUNK_RESERVE, -1, -1, NULL,
379 pcpu4k_populate_pte);
380 goto out_free_ar;
381
382enomem:
383 while (--j >= 0)
384 free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
385 ret = -ENOMEM;
386out_free_ar:
387 free_bootmem(__pa(pcpu4k_pages), pages_size);
388 return ret;
311} 389}
312 390
313void __cpuinit numa_remove_cpu(int cpu) 391static inline void setup_percpu_segment(int cpu)
314{ 392{
315 numa_set_cpumask(cpu, 0); 393#ifdef CONFIG_X86_32
316} 394 struct desc_struct gdt;
317 395
318int cpu_to_node(int cpu) 396 pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF,
319{ 397 0x2 | DESCTYPE_S, 0x8);
320 if (early_per_cpu_ptr(x86_cpu_to_node_map)) { 398 gdt.s = 1;
321 printk(KERN_WARNING 399 write_gdt_entry(get_cpu_gdt_table(cpu),
322 "cpu_to_node(%d): usage too early!\n", cpu); 400 GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
323 dump_stack(); 401#endif
324 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
325 }
326 return per_cpu(x86_cpu_to_node_map, cpu);
327} 402}
328EXPORT_SYMBOL(cpu_to_node);
329 403
330/* 404/*
331 * Same function as cpu_to_node() but used if called before the 405 * Great future plan:
332 * per_cpu areas are setup. 406 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
407 * Always point %gs to its beginning
333 */ 408 */
334int early_cpu_to_node(int cpu) 409void __init setup_per_cpu_areas(void)
335{ 410{
336 if (early_per_cpu_ptr(x86_cpu_to_node_map)) 411 size_t static_size = __per_cpu_end - __per_cpu_start;
337 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; 412 unsigned int cpu;
338 413 unsigned long delta;
339 if (!per_cpu_offset(cpu)) { 414 size_t pcpu_unit_size;
340 printk(KERN_WARNING 415 ssize_t ret;
341 "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
342 dump_stack();
343 return NUMA_NO_NODE;
344 }
345 return per_cpu(x86_cpu_to_node_map, cpu);
346}
347
348
349/* empty cpumask */
350static const cpumask_t cpu_mask_none;
351 416
352/* 417 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
353 * Returns a pointer to the bitmask of CPUs on Node 'node'. 418 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
354 */
355const cpumask_t *cpumask_of_node(int node)
356{
357 if (node_to_cpumask_map == NULL) {
358 printk(KERN_WARNING
359 "cpumask_of_node(%d): no node_to_cpumask_map!\n",
360 node);
361 dump_stack();
362 return (const cpumask_t *)&cpu_online_map;
363 }
364 if (node >= nr_node_ids) {
365 printk(KERN_WARNING
366 "cpumask_of_node(%d): node > nr_node_ids(%d)\n",
367 node, nr_node_ids);
368 dump_stack();
369 return &cpu_mask_none;
370 }
371 return &node_to_cpumask_map[node];
372}
373EXPORT_SYMBOL(cpumask_of_node);
374 419
375/* 420 /*
376 * Returns a bitmask of CPUs on Node 'node'. 421 * Allocate percpu area. If PSE is supported, try to make use
377 * 422 * of large page mappings. Please read comments on top of
378 * Side note: this function creates the returned cpumask on the stack 423 * each allocator for details.
379 * so with a high NR_CPUS count, excessive stack space is used. The 424 */
380 * node_to_cpumask_ptr function should be used whenever possible. 425 ret = setup_pcpu_remap(static_size);
381 */ 426 if (ret < 0)
382cpumask_t node_to_cpumask(int node) 427 ret = setup_pcpu_embed(static_size);
383{ 428 if (ret < 0)
384 if (node_to_cpumask_map == NULL) { 429 ret = setup_pcpu_4k(static_size);
385 printk(KERN_WARNING 430 if (ret < 0)
386 "node_to_cpumask(%d): no node_to_cpumask_map!\n", node); 431 panic("cannot allocate static percpu area (%zu bytes, err=%zd)",
387 dump_stack(); 432 static_size, ret);
388 return cpu_online_map; 433
389 } 434 pcpu_unit_size = ret;
390 if (node >= nr_node_ids) { 435
391 printk(KERN_WARNING 436 /* alrighty, percpu areas up and running */
392 "node_to_cpumask(%d): node > nr_node_ids(%d)\n", 437 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
393 node, nr_node_ids); 438 for_each_possible_cpu(cpu) {
394 dump_stack(); 439 per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
395 return cpu_mask_none; 440 per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
441 per_cpu(cpu_number, cpu) = cpu;
442 setup_percpu_segment(cpu);
443 setup_stack_canary_segment(cpu);
444 /*
445 * Copy data used in early init routines from the
446 * initial arrays to the per cpu data areas. These
447 * arrays then become expendable and the *_early_ptr's
448 * are zeroed indicating that the static arrays are
449 * gone.
450 */
451#ifdef CONFIG_X86_LOCAL_APIC
452 per_cpu(x86_cpu_to_apicid, cpu) =
453 early_per_cpu_map(x86_cpu_to_apicid, cpu);
454 per_cpu(x86_bios_cpu_apicid, cpu) =
455 early_per_cpu_map(x86_bios_cpu_apicid, cpu);
456#endif
457#ifdef CONFIG_X86_64
458 per_cpu(irq_stack_ptr, cpu) =
459 per_cpu(irq_stack_union.irq_stack, cpu) +
460 IRQ_STACK_SIZE - 64;
461#ifdef CONFIG_NUMA
462 per_cpu(x86_cpu_to_node_map, cpu) =
463 early_per_cpu_map(x86_cpu_to_node_map, cpu);
464#endif
465#endif
466 /*
467 * Up to this point, the boot CPU has been using .data.init
468 * area. Reload any changed state for the boot CPU.
469 */
470 if (cpu == boot_cpu_id)
471 switch_to_new_gdt(cpu);
396 } 472 }
397 return node_to_cpumask_map[node];
398}
399EXPORT_SYMBOL(node_to_cpumask);
400
401/*
402 * --------- end of debug versions of the numa functions ---------
403 */
404 473
405#endif /* CONFIG_DEBUG_PER_CPU_MAPS */ 474 /* indicate the early static arrays will soon be gone */
475#ifdef CONFIG_X86_LOCAL_APIC
476 early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
477 early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
478#endif
479#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
480 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
481#endif
406 482
407#endif /* X86_64_NUMA */ 483 /* Setup node to cpumask map */
484 setup_node_to_cpumask_map();
408 485
486 /* Setup cpu initialized, callin, callout masks */
487 setup_cpu_local_masks();
488}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index df0587f24c54..d2cc6428c587 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -50,27 +50,23 @@
50# define FIX_EFLAGS __FIX_EFLAGS 50# define FIX_EFLAGS __FIX_EFLAGS
51#endif 51#endif
52 52
53#define COPY(x) { \ 53#define COPY(x) do { \
54 err |= __get_user(regs->x, &sc->x); \ 54 get_user_ex(regs->x, &sc->x); \
55} 55} while (0)
56 56
57#define COPY_SEG(seg) { \ 57#define GET_SEG(seg) ({ \
58 unsigned short tmp; \ 58 unsigned short tmp; \
59 err |= __get_user(tmp, &sc->seg); \ 59 get_user_ex(tmp, &sc->seg); \
60 regs->seg = tmp; \ 60 tmp; \
61} 61})
62 62
63#define COPY_SEG_CPL3(seg) { \ 63#define COPY_SEG(seg) do { \
64 unsigned short tmp; \ 64 regs->seg = GET_SEG(seg); \
65 err |= __get_user(tmp, &sc->seg); \ 65} while (0)
66 regs->seg = tmp | 3; \
67}
68 66
69#define GET_SEG(seg) { \ 67#define COPY_SEG_CPL3(seg) do { \
70 unsigned short tmp; \ 68 regs->seg = GET_SEG(seg) | 3; \
71 err |= __get_user(tmp, &sc->seg); \ 69} while (0)
72 loadsegment(seg, tmp); \
73}
74 70
75static int 71static int
76restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, 72restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
@@ -83,45 +79,49 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
83 /* Always make any pending restarted system calls return -EINTR */ 79 /* Always make any pending restarted system calls return -EINTR */
84 current_thread_info()->restart_block.fn = do_no_restart_syscall; 80 current_thread_info()->restart_block.fn = do_no_restart_syscall;
85 81
82 get_user_try {
83
86#ifdef CONFIG_X86_32 84#ifdef CONFIG_X86_32
87 GET_SEG(gs); 85 set_user_gs(regs, GET_SEG(gs));
88 COPY_SEG(fs); 86 COPY_SEG(fs);
89 COPY_SEG(es); 87 COPY_SEG(es);
90 COPY_SEG(ds); 88 COPY_SEG(ds);
91#endif /* CONFIG_X86_32 */ 89#endif /* CONFIG_X86_32 */
92 90
93 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); 91 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
94 COPY(dx); COPY(cx); COPY(ip); 92 COPY(dx); COPY(cx); COPY(ip);
95 93
96#ifdef CONFIG_X86_64 94#ifdef CONFIG_X86_64
97 COPY(r8); 95 COPY(r8);
98 COPY(r9); 96 COPY(r9);
99 COPY(r10); 97 COPY(r10);
100 COPY(r11); 98 COPY(r11);
101 COPY(r12); 99 COPY(r12);
102 COPY(r13); 100 COPY(r13);
103 COPY(r14); 101 COPY(r14);
104 COPY(r15); 102 COPY(r15);
105#endif /* CONFIG_X86_64 */ 103#endif /* CONFIG_X86_64 */
106 104
107#ifdef CONFIG_X86_32 105#ifdef CONFIG_X86_32
108 COPY_SEG_CPL3(cs); 106 COPY_SEG_CPL3(cs);
109 COPY_SEG_CPL3(ss); 107 COPY_SEG_CPL3(ss);
110#else /* !CONFIG_X86_32 */ 108#else /* !CONFIG_X86_32 */
111 /* Kernel saves and restores only the CS segment register on signals, 109 /* Kernel saves and restores only the CS segment register on signals,
112 * which is the bare minimum needed to allow mixed 32/64-bit code. 110 * which is the bare minimum needed to allow mixed 32/64-bit code.
113 * App's signal handler can save/restore other segments if needed. */ 111 * App's signal handler can save/restore other segments if needed. */
114 COPY_SEG_CPL3(cs); 112 COPY_SEG_CPL3(cs);
115#endif /* CONFIG_X86_32 */ 113#endif /* CONFIG_X86_32 */
116 114
117 err |= __get_user(tmpflags, &sc->flags); 115 get_user_ex(tmpflags, &sc->flags);
118 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); 116 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
119 regs->orig_ax = -1; /* disable syscall checks */ 117 regs->orig_ax = -1; /* disable syscall checks */
120 118
121 err |= __get_user(buf, &sc->fpstate); 119 get_user_ex(buf, &sc->fpstate);
122 err |= restore_i387_xstate(buf); 120 err |= restore_i387_xstate(buf);
121
122 get_user_ex(*pax, &sc->ax);
123 } get_user_catch(err);
123 124
124 err |= __get_user(*pax, &sc->ax);
125 return err; 125 return err;
126} 126}
127 127
@@ -131,57 +131,55 @@ setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
131{ 131{
132 int err = 0; 132 int err = 0;
133 133
134#ifdef CONFIG_X86_32 134 put_user_try {
135 {
136 unsigned int tmp;
137 135
138 savesegment(gs, tmp); 136#ifdef CONFIG_X86_32
139 err |= __put_user(tmp, (unsigned int __user *)&sc->gs); 137 put_user_ex(get_user_gs(regs), (unsigned int __user *)&sc->gs);
140 } 138 put_user_ex(regs->fs, (unsigned int __user *)&sc->fs);
141 err |= __put_user(regs->fs, (unsigned int __user *)&sc->fs); 139 put_user_ex(regs->es, (unsigned int __user *)&sc->es);
142 err |= __put_user(regs->es, (unsigned int __user *)&sc->es); 140 put_user_ex(regs->ds, (unsigned int __user *)&sc->ds);
143 err |= __put_user(regs->ds, (unsigned int __user *)&sc->ds);
144#endif /* CONFIG_X86_32 */ 141#endif /* CONFIG_X86_32 */
145 142
146 err |= __put_user(regs->di, &sc->di); 143 put_user_ex(regs->di, &sc->di);
147 err |= __put_user(regs->si, &sc->si); 144 put_user_ex(regs->si, &sc->si);
148 err |= __put_user(regs->bp, &sc->bp); 145 put_user_ex(regs->bp, &sc->bp);
149 err |= __put_user(regs->sp, &sc->sp); 146 put_user_ex(regs->sp, &sc->sp);
150 err |= __put_user(regs->bx, &sc->bx); 147 put_user_ex(regs->bx, &sc->bx);
151 err |= __put_user(regs->dx, &sc->dx); 148 put_user_ex(regs->dx, &sc->dx);
152 err |= __put_user(regs->cx, &sc->cx); 149 put_user_ex(regs->cx, &sc->cx);
153 err |= __put_user(regs->ax, &sc->ax); 150 put_user_ex(regs->ax, &sc->ax);
154#ifdef CONFIG_X86_64 151#ifdef CONFIG_X86_64
155 err |= __put_user(regs->r8, &sc->r8); 152 put_user_ex(regs->r8, &sc->r8);
156 err |= __put_user(regs->r9, &sc->r9); 153 put_user_ex(regs->r9, &sc->r9);
157 err |= __put_user(regs->r10, &sc->r10); 154 put_user_ex(regs->r10, &sc->r10);
158 err |= __put_user(regs->r11, &sc->r11); 155 put_user_ex(regs->r11, &sc->r11);
159 err |= __put_user(regs->r12, &sc->r12); 156 put_user_ex(regs->r12, &sc->r12);
160 err |= __put_user(regs->r13, &sc->r13); 157 put_user_ex(regs->r13, &sc->r13);
161 err |= __put_user(regs->r14, &sc->r14); 158 put_user_ex(regs->r14, &sc->r14);
162 err |= __put_user(regs->r15, &sc->r15); 159 put_user_ex(regs->r15, &sc->r15);
163#endif /* CONFIG_X86_64 */ 160#endif /* CONFIG_X86_64 */
164 161
165 err |= __put_user(current->thread.trap_no, &sc->trapno); 162 put_user_ex(current->thread.trap_no, &sc->trapno);
166 err |= __put_user(current->thread.error_code, &sc->err); 163 put_user_ex(current->thread.error_code, &sc->err);
167 err |= __put_user(regs->ip, &sc->ip); 164 put_user_ex(regs->ip, &sc->ip);
168#ifdef CONFIG_X86_32 165#ifdef CONFIG_X86_32
169 err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs); 166 put_user_ex(regs->cs, (unsigned int __user *)&sc->cs);
170 err |= __put_user(regs->flags, &sc->flags); 167 put_user_ex(regs->flags, &sc->flags);
171 err |= __put_user(regs->sp, &sc->sp_at_signal); 168 put_user_ex(regs->sp, &sc->sp_at_signal);
172 err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss); 169 put_user_ex(regs->ss, (unsigned int __user *)&sc->ss);
173#else /* !CONFIG_X86_32 */ 170#else /* !CONFIG_X86_32 */
174 err |= __put_user(regs->flags, &sc->flags); 171 put_user_ex(regs->flags, &sc->flags);
175 err |= __put_user(regs->cs, &sc->cs); 172 put_user_ex(regs->cs, &sc->cs);
176 err |= __put_user(0, &sc->gs); 173 put_user_ex(0, &sc->gs);
177 err |= __put_user(0, &sc->fs); 174 put_user_ex(0, &sc->fs);
178#endif /* CONFIG_X86_32 */ 175#endif /* CONFIG_X86_32 */
179 176
180 err |= __put_user(fpstate, &sc->fpstate); 177 put_user_ex(fpstate, &sc->fpstate);
181 178
182 /* non-iBCS2 extensions.. */ 179 /* non-iBCS2 extensions.. */
183 err |= __put_user(mask, &sc->oldmask); 180 put_user_ex(mask, &sc->oldmask);
184 err |= __put_user(current->thread.cr2, &sc->cr2); 181 put_user_ex(current->thread.cr2, &sc->cr2);
182 } put_user_catch(err);
185 183
186 return err; 184 return err;
187} 185}
@@ -189,40 +187,35 @@ setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
189/* 187/*
190 * Set up a signal frame. 188 * Set up a signal frame.
191 */ 189 */
192#ifdef CONFIG_X86_32
193static const struct {
194 u16 poplmovl;
195 u32 val;
196 u16 int80;
197} __attribute__((packed)) retcode = {
198 0xb858, /* popl %eax; movl $..., %eax */
199 __NR_sigreturn,
200 0x80cd, /* int $0x80 */
201};
202
203static const struct {
204 u8 movl;
205 u32 val;
206 u16 int80;
207 u8 pad;
208} __attribute__((packed)) rt_retcode = {
209 0xb8, /* movl $..., %eax */
210 __NR_rt_sigreturn,
211 0x80cd, /* int $0x80 */
212 0
213};
214 190
215/* 191/*
216 * Determine which stack to use.. 192 * Determine which stack to use..
217 */ 193 */
194static unsigned long align_sigframe(unsigned long sp)
195{
196#ifdef CONFIG_X86_32
197 /*
198 * Align the stack pointer according to the i386 ABI,
199 * i.e. so that on function entry ((sp + 4) & 15) == 0.
200 */
201 sp = ((sp + 4) & -16ul) - 4;
202#else /* !CONFIG_X86_32 */
203 sp = round_down(sp, 16) - 8;
204#endif
205 return sp;
206}
207
218static inline void __user * 208static inline void __user *
219get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, 209get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
220 void **fpstate) 210 void __user **fpstate)
221{ 211{
222 unsigned long sp;
223
224 /* Default to using normal stack */ 212 /* Default to using normal stack */
225 sp = regs->sp; 213 unsigned long sp = regs->sp;
214
215#ifdef CONFIG_X86_64
216 /* redzone */
217 sp -= 128;
218#endif /* CONFIG_X86_64 */
226 219
227 /* 220 /*
228 * If we are on the alternate signal stack and would overflow it, don't. 221 * If we are on the alternate signal stack and would overflow it, don't.
@@ -236,30 +229,52 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
236 if (sas_ss_flags(sp) == 0) 229 if (sas_ss_flags(sp) == 0)
237 sp = current->sas_ss_sp + current->sas_ss_size; 230 sp = current->sas_ss_sp + current->sas_ss_size;
238 } else { 231 } else {
232#ifdef CONFIG_X86_32
239 /* This is the legacy signal stack switching. */ 233 /* This is the legacy signal stack switching. */
240 if ((regs->ss & 0xffff) != __USER_DS && 234 if ((regs->ss & 0xffff) != __USER_DS &&
241 !(ka->sa.sa_flags & SA_RESTORER) && 235 !(ka->sa.sa_flags & SA_RESTORER) &&
242 ka->sa.sa_restorer) 236 ka->sa.sa_restorer)
243 sp = (unsigned long) ka->sa.sa_restorer; 237 sp = (unsigned long) ka->sa.sa_restorer;
238#endif /* CONFIG_X86_32 */
244 } 239 }
245 240
246 if (used_math()) { 241 if (used_math()) {
247 sp = sp - sig_xstate_size; 242 sp -= sig_xstate_size;
248 *fpstate = (struct _fpstate *) sp; 243#ifdef CONFIG_X86_64
244 sp = round_down(sp, 64);
245#endif /* CONFIG_X86_64 */
246 *fpstate = (void __user *)sp;
247
249 if (save_i387_xstate(*fpstate) < 0) 248 if (save_i387_xstate(*fpstate) < 0)
250 return (void __user *)-1L; 249 return (void __user *)-1L;
251 } 250 }
252 251
253 sp -= frame_size; 252 return (void __user *)align_sigframe(sp - frame_size);
254 /*
255 * Align the stack pointer according to the i386 ABI,
256 * i.e. so that on function entry ((sp + 4) & 15) == 0.
257 */
258 sp = ((sp + 4) & -16ul) - 4;
259
260 return (void __user *) sp;
261} 253}
262 254
255#ifdef CONFIG_X86_32
256static const struct {
257 u16 poplmovl;
258 u32 val;
259 u16 int80;
260} __attribute__((packed)) retcode = {
261 0xb858, /* popl %eax; movl $..., %eax */
262 __NR_sigreturn,
263 0x80cd, /* int $0x80 */
264};
265
266static const struct {
267 u8 movl;
268 u32 val;
269 u16 int80;
270 u8 pad;
271} __attribute__((packed)) rt_retcode = {
272 0xb8, /* movl $..., %eax */
273 __NR_rt_sigreturn,
274 0x80cd, /* int $0x80 */
275 0
276};
277
263static int 278static int
264__setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, 279__setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
265 struct pt_regs *regs) 280 struct pt_regs *regs)
@@ -336,43 +351,41 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
336 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 351 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
337 return -EFAULT; 352 return -EFAULT;
338 353
339 err |= __put_user(sig, &frame->sig); 354 put_user_try {
340 err |= __put_user(&frame->info, &frame->pinfo); 355 put_user_ex(sig, &frame->sig);
341 err |= __put_user(&frame->uc, &frame->puc); 356 put_user_ex(&frame->info, &frame->pinfo);
342 err |= copy_siginfo_to_user(&frame->info, info); 357 put_user_ex(&frame->uc, &frame->puc);
343 if (err) 358 err |= copy_siginfo_to_user(&frame->info, info);
344 return -EFAULT;
345 359
346 /* Create the ucontext. */ 360 /* Create the ucontext. */
347 if (cpu_has_xsave) 361 if (cpu_has_xsave)
348 err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags); 362 put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
349 else 363 else
350 err |= __put_user(0, &frame->uc.uc_flags); 364 put_user_ex(0, &frame->uc.uc_flags);
351 err |= __put_user(0, &frame->uc.uc_link); 365 put_user_ex(0, &frame->uc.uc_link);
352 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); 366 put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
353 err |= __put_user(sas_ss_flags(regs->sp), 367 put_user_ex(sas_ss_flags(regs->sp),
354 &frame->uc.uc_stack.ss_flags); 368 &frame->uc.uc_stack.ss_flags);
355 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); 369 put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
356 err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, 370 err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
357 regs, set->sig[0]); 371 regs, set->sig[0]);
358 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); 372 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
359 if (err) 373
360 return -EFAULT; 374 /* Set up to return from userspace. */
361 375 restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
362 /* Set up to return from userspace. */ 376 if (ka->sa.sa_flags & SA_RESTORER)
363 restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); 377 restorer = ka->sa.sa_restorer;
364 if (ka->sa.sa_flags & SA_RESTORER) 378 put_user_ex(restorer, &frame->pretcode);
365 restorer = ka->sa.sa_restorer;
366 err |= __put_user(restorer, &frame->pretcode);
367 379
368 /* 380 /*
369 * This is movl $__NR_rt_sigreturn, %ax ; int $0x80 381 * This is movl $__NR_rt_sigreturn, %ax ; int $0x80
370 * 382 *
371 * WE DO NOT USE IT ANY MORE! It's only left here for historical 383 * WE DO NOT USE IT ANY MORE! It's only left here for historical
372 * reasons and because gdb uses it as a signature to notice 384 * reasons and because gdb uses it as a signature to notice
373 * signal handler stack frames. 385 * signal handler stack frames.
374 */ 386 */
375 err |= __put_user(*((u64 *)&rt_retcode), (u64 *)frame->retcode); 387 put_user_ex(*((u64 *)&rt_retcode), (u64 *)frame->retcode);
388 } put_user_catch(err);
376 389
377 if (err) 390 if (err)
378 return -EFAULT; 391 return -EFAULT;
@@ -392,24 +405,6 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
392 return 0; 405 return 0;
393} 406}
394#else /* !CONFIG_X86_32 */ 407#else /* !CONFIG_X86_32 */
395/*
396 * Determine which stack to use..
397 */
398static void __user *
399get_stack(struct k_sigaction *ka, unsigned long sp, unsigned long size)
400{
401 /* Default to using normal stack - redzone*/
402 sp -= 128;
403
404 /* This is the X/Open sanctioned signal stack switching. */
405 if (ka->sa.sa_flags & SA_ONSTACK) {
406 if (sas_ss_flags(sp) == 0)
407 sp = current->sas_ss_sp + current->sas_ss_size;
408 }
409
410 return (void __user *)round_down(sp - size, 64);
411}
412
413static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 408static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
414 sigset_t *set, struct pt_regs *regs) 409 sigset_t *set, struct pt_regs *regs)
415{ 410{
@@ -418,15 +413,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
418 int err = 0; 413 int err = 0;
419 struct task_struct *me = current; 414 struct task_struct *me = current;
420 415
421 if (used_math()) { 416 frame = get_sigframe(ka, regs, sizeof(struct rt_sigframe), &fp);
422 fp = get_stack(ka, regs->sp, sig_xstate_size);
423 frame = (void __user *)round_down(
424 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
425
426 if (save_i387_xstate(fp) < 0)
427 return -EFAULT;
428 } else
429 frame = get_stack(ka, regs->sp, sizeof(struct rt_sigframe)) - 8;
430 417
431 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 418 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
432 return -EFAULT; 419 return -EFAULT;
@@ -436,28 +423,30 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
436 return -EFAULT; 423 return -EFAULT;
437 } 424 }
438 425
439 /* Create the ucontext. */ 426 put_user_try {
440 if (cpu_has_xsave) 427 /* Create the ucontext. */
441 err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags); 428 if (cpu_has_xsave)
442 else 429 put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
443 err |= __put_user(0, &frame->uc.uc_flags); 430 else
444 err |= __put_user(0, &frame->uc.uc_link); 431 put_user_ex(0, &frame->uc.uc_flags);
445 err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); 432 put_user_ex(0, &frame->uc.uc_link);
446 err |= __put_user(sas_ss_flags(regs->sp), 433 put_user_ex(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
447 &frame->uc.uc_stack.ss_flags); 434 put_user_ex(sas_ss_flags(regs->sp),
448 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); 435 &frame->uc.uc_stack.ss_flags);
449 err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]); 436 put_user_ex(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
450 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); 437 err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]);
451 438 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
452 /* Set up to return from userspace. If provided, use a stub 439
453 already in userspace. */ 440 /* Set up to return from userspace. If provided, use a stub
454 /* x86-64 should always use SA_RESTORER. */ 441 already in userspace. */
455 if (ka->sa.sa_flags & SA_RESTORER) { 442 /* x86-64 should always use SA_RESTORER. */
456 err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); 443 if (ka->sa.sa_flags & SA_RESTORER) {
457 } else { 444 put_user_ex(ka->sa.sa_restorer, &frame->pretcode);
458 /* could use a vstub here */ 445 } else {
459 return -EFAULT; 446 /* could use a vstub here */
460 } 447 err |= -EFAULT;
448 }
449 } put_user_catch(err);
461 450
462 if (err) 451 if (err)
463 return -EFAULT; 452 return -EFAULT;
@@ -509,31 +498,41 @@ sys_sigaction(int sig, const struct old_sigaction __user *act,
509 struct old_sigaction __user *oact) 498 struct old_sigaction __user *oact)
510{ 499{
511 struct k_sigaction new_ka, old_ka; 500 struct k_sigaction new_ka, old_ka;
512 int ret; 501 int ret = 0;
513 502
514 if (act) { 503 if (act) {
515 old_sigset_t mask; 504 old_sigset_t mask;
516 505
517 if (!access_ok(VERIFY_READ, act, sizeof(*act)) || 506 if (!access_ok(VERIFY_READ, act, sizeof(*act)))
518 __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
519 __get_user(new_ka.sa.sa_restorer, &act->sa_restorer))
520 return -EFAULT; 507 return -EFAULT;
521 508
522 __get_user(new_ka.sa.sa_flags, &act->sa_flags); 509 get_user_try {
523 __get_user(mask, &act->sa_mask); 510 get_user_ex(new_ka.sa.sa_handler, &act->sa_handler);
511 get_user_ex(new_ka.sa.sa_flags, &act->sa_flags);
512 get_user_ex(mask, &act->sa_mask);
513 get_user_ex(new_ka.sa.sa_restorer, &act->sa_restorer);
514 } get_user_catch(ret);
515
516 if (ret)
517 return -EFAULT;
524 siginitset(&new_ka.sa.sa_mask, mask); 518 siginitset(&new_ka.sa.sa_mask, mask);
525 } 519 }
526 520
527 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); 521 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
528 522
529 if (!ret && oact) { 523 if (!ret && oact) {
530 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || 524 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)))
531 __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
532 __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer))
533 return -EFAULT; 525 return -EFAULT;
534 526
535 __put_user(old_ka.sa.sa_flags, &oact->sa_flags); 527 put_user_try {
536 __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask); 528 put_user_ex(old_ka.sa.sa_handler, &oact->sa_handler);
529 put_user_ex(old_ka.sa.sa_flags, &oact->sa_flags);
530 put_user_ex(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
531 put_user_ex(old_ka.sa.sa_restorer, &oact->sa_restorer);
532 } put_user_catch(ret);
533
534 if (ret)
535 return -EFAULT;
537 } 536 }
538 537
539 return ret; 538 return ret;
@@ -541,14 +540,9 @@ sys_sigaction(int sig, const struct old_sigaction __user *act,
541#endif /* CONFIG_X86_32 */ 540#endif /* CONFIG_X86_32 */
542 541
543#ifdef CONFIG_X86_32 542#ifdef CONFIG_X86_32
544asmlinkage int sys_sigaltstack(unsigned long bx) 543int sys_sigaltstack(struct pt_regs *regs)
545{ 544{
546 /* 545 const stack_t __user *uss = (const stack_t __user *)regs->bx;
547 * This is needed to make gcc realize it doesn't own the
548 * "struct pt_regs"
549 */
550 struct pt_regs *regs = (struct pt_regs *)&bx;
551 const stack_t __user *uss = (const stack_t __user *)bx;
552 stack_t __user *uoss = (stack_t __user *)regs->cx; 546 stack_t __user *uoss = (stack_t __user *)regs->cx;
553 547
554 return do_sigaltstack(uss, uoss, regs->sp); 548 return do_sigaltstack(uss, uoss, regs->sp);
@@ -566,14 +560,12 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
566 * Do a signal return; undo the signal stack. 560 * Do a signal return; undo the signal stack.
567 */ 561 */
568#ifdef CONFIG_X86_32 562#ifdef CONFIG_X86_32
569asmlinkage unsigned long sys_sigreturn(unsigned long __unused) 563unsigned long sys_sigreturn(struct pt_regs *regs)
570{ 564{
571 struct sigframe __user *frame; 565 struct sigframe __user *frame;
572 struct pt_regs *regs;
573 unsigned long ax; 566 unsigned long ax;
574 sigset_t set; 567 sigset_t set;
575 568
576 regs = (struct pt_regs *) &__unused;
577 frame = (struct sigframe __user *)(regs->sp - 8); 569 frame = (struct sigframe __user *)(regs->sp - 8);
578 570
579 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) 571 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
@@ -600,7 +592,7 @@ badframe:
600} 592}
601#endif /* CONFIG_X86_32 */ 593#endif /* CONFIG_X86_32 */
602 594
603static long do_rt_sigreturn(struct pt_regs *regs) 595long sys_rt_sigreturn(struct pt_regs *regs)
604{ 596{
605 struct rt_sigframe __user *frame; 597 struct rt_sigframe __user *frame;
606 unsigned long ax; 598 unsigned long ax;
@@ -631,25 +623,6 @@ badframe:
631 return 0; 623 return 0;
632} 624}
633 625
634#ifdef CONFIG_X86_32
635/*
636 * Note: do not pass in pt_regs directly as with tail-call optimization
637 * GCC will incorrectly stomp on the caller's frame and corrupt user-space
638 * register state:
639 */
640asmlinkage int sys_rt_sigreturn(unsigned long __unused)
641{
642 struct pt_regs *regs = (struct pt_regs *)&__unused;
643
644 return do_rt_sigreturn(regs);
645}
646#else /* !CONFIG_X86_32 */
647asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
648{
649 return do_rt_sigreturn(regs);
650}
651#endif /* CONFIG_X86_32 */
652
653/* 626/*
654 * OK, we're invoking a handler: 627 * OK, we're invoking a handler:
655 */ 628 */
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index e6faa3316bd2..13f33ea8ccaa 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -2,7 +2,7 @@
2 * Intel SMP support routines. 2 * Intel SMP support routines.
3 * 3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk> 4 * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
5 * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com> 5 * (c) 1998-99, 2000, 2009 Ingo Molnar <mingo@redhat.com>
6 * (c) 2002,2003 Andi Kleen, SuSE Labs. 6 * (c) 2002,2003 Andi Kleen, SuSE Labs.
7 * 7 *
8 * i386 and x86_64 integration by Glauber Costa <gcosta@redhat.com> 8 * i386 and x86_64 integration by Glauber Costa <gcosta@redhat.com>
@@ -26,8 +26,7 @@
26#include <asm/tlbflush.h> 26#include <asm/tlbflush.h>
27#include <asm/mmu_context.h> 27#include <asm/mmu_context.h>
28#include <asm/proto.h> 28#include <asm/proto.h>
29#include <mach_ipi.h> 29#include <asm/apic.h>
30#include <mach_apic.h>
31/* 30/*
32 * Some notes on x86 processor bugs affecting SMP operation: 31 * Some notes on x86 processor bugs affecting SMP operation:
33 * 32 *
@@ -118,12 +117,12 @@ static void native_smp_send_reschedule(int cpu)
118 WARN_ON(1); 117 WARN_ON(1);
119 return; 118 return;
120 } 119 }
121 send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR); 120 apic->send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
122} 121}
123 122
124void native_send_call_func_single_ipi(int cpu) 123void native_send_call_func_single_ipi(int cpu)
125{ 124{
126 send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR); 125 apic->send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR);
127} 126}
128 127
129void native_send_call_func_ipi(const struct cpumask *mask) 128void native_send_call_func_ipi(const struct cpumask *mask)
@@ -131,7 +130,7 @@ void native_send_call_func_ipi(const struct cpumask *mask)
131 cpumask_var_t allbutself; 130 cpumask_var_t allbutself;
132 131
133 if (!alloc_cpumask_var(&allbutself, GFP_ATOMIC)) { 132 if (!alloc_cpumask_var(&allbutself, GFP_ATOMIC)) {
134 send_IPI_mask(mask, CALL_FUNCTION_VECTOR); 133 apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
135 return; 134 return;
136 } 135 }
137 136
@@ -140,9 +139,9 @@ void native_send_call_func_ipi(const struct cpumask *mask)
140 139
141 if (cpumask_equal(mask, allbutself) && 140 if (cpumask_equal(mask, allbutself) &&
142 cpumask_equal(cpu_online_mask, cpu_callout_mask)) 141 cpumask_equal(cpu_online_mask, cpu_callout_mask))
143 send_IPI_allbutself(CALL_FUNCTION_VECTOR); 142 apic->send_IPI_allbutself(CALL_FUNCTION_VECTOR);
144 else 143 else
145 send_IPI_mask(mask, CALL_FUNCTION_VECTOR); 144 apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
146 145
147 free_cpumask_var(allbutself); 146 free_cpumask_var(allbutself);
148} 147}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index bb1a3b1fc87f..ef7d10170c30 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -2,7 +2,7 @@
2 * x86 SMP booting functions 2 * x86 SMP booting functions
3 * 3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk> 4 * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
5 * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com> 5 * (c) 1998, 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
6 * Copyright 2001 Andi Kleen, SuSE Labs. 6 * Copyright 2001 Andi Kleen, SuSE Labs.
7 * 7 *
8 * Much of the core SMP work is based on previous work by Thomas Radke, to 8 * Much of the core SMP work is based on previous work by Thomas Radke, to
@@ -53,7 +53,6 @@
53#include <asm/nmi.h> 53#include <asm/nmi.h>
54#include <asm/irq.h> 54#include <asm/irq.h>
55#include <asm/idle.h> 55#include <asm/idle.h>
56#include <asm/smp.h>
57#include <asm/trampoline.h> 56#include <asm/trampoline.h>
58#include <asm/cpu.h> 57#include <asm/cpu.h>
59#include <asm/numa.h> 58#include <asm/numa.h>
@@ -61,13 +60,12 @@
61#include <asm/tlbflush.h> 60#include <asm/tlbflush.h>
62#include <asm/mtrr.h> 61#include <asm/mtrr.h>
63#include <asm/vmi.h> 62#include <asm/vmi.h>
64#include <asm/genapic.h> 63#include <asm/apic.h>
65#include <asm/setup.h> 64#include <asm/setup.h>
65#include <asm/uv/uv.h>
66#include <linux/mc146818rtc.h> 66#include <linux/mc146818rtc.h>
67 67
68#include <mach_apic.h> 68#include <asm/smpboot_hooks.h>
69#include <mach_wakecpu.h>
70#include <smpboot_hooks.h>
71 69
72#ifdef CONFIG_X86_32 70#ifdef CONFIG_X86_32
73u8 apicid_2_node[MAX_APICID]; 71u8 apicid_2_node[MAX_APICID];
@@ -114,11 +112,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_core_map);
114DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); 112DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
115EXPORT_PER_CPU_SYMBOL(cpu_info); 113EXPORT_PER_CPU_SYMBOL(cpu_info);
116 114
117static atomic_t init_deasserted; 115atomic_t init_deasserted;
118
119
120/* Set if we find a B stepping CPU */
121static int __cpuinitdata smp_b_stepping;
122 116
123#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32) 117#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
124 118
@@ -163,7 +157,7 @@ static void map_cpu_to_logical_apicid(void)
163{ 157{
164 int cpu = smp_processor_id(); 158 int cpu = smp_processor_id();
165 int apicid = logical_smp_processor_id(); 159 int apicid = logical_smp_processor_id();
166 int node = apicid_to_node(apicid); 160 int node = apic->apicid_to_node(apicid);
167 161
168 if (!node_online(node)) 162 if (!node_online(node))
169 node = first_online_node; 163 node = first_online_node;
@@ -196,7 +190,8 @@ static void __cpuinit smp_callin(void)
196 * our local APIC. We have to wait for the IPI or we'll 190 * our local APIC. We have to wait for the IPI or we'll
197 * lock up on an APIC access. 191 * lock up on an APIC access.
198 */ 192 */
199 wait_for_init_deassert(&init_deasserted); 193 if (apic->wait_for_init_deassert)
194 apic->wait_for_init_deassert(&init_deasserted);
200 195
201 /* 196 /*
202 * (This works even if the APIC is not enabled.) 197 * (This works even if the APIC is not enabled.)
@@ -243,7 +238,8 @@ static void __cpuinit smp_callin(void)
243 */ 238 */
244 239
245 pr_debug("CALLIN, before setup_local_APIC().\n"); 240 pr_debug("CALLIN, before setup_local_APIC().\n");
246 smp_callin_clear_local_apic(); 241 if (apic->smp_callin_clear_local_apic)
242 apic->smp_callin_clear_local_apic();
247 setup_local_APIC(); 243 setup_local_APIC();
248 end_local_APIC_setup(); 244 end_local_APIC_setup();
249 map_cpu_to_logical_apicid(); 245 map_cpu_to_logical_apicid();
@@ -271,8 +267,6 @@ static void __cpuinit smp_callin(void)
271 cpumask_set_cpu(cpuid, cpu_callin_mask); 267 cpumask_set_cpu(cpuid, cpu_callin_mask);
272} 268}
273 269
274static int __cpuinitdata unsafe_smp;
275
276/* 270/*
277 * Activate a secondary processor. 271 * Activate a secondary processor.
278 */ 272 */
@@ -340,76 +334,6 @@ notrace static void __cpuinit start_secondary(void *unused)
340 cpu_idle(); 334 cpu_idle();
341} 335}
342 336
343static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c)
344{
345 /*
346 * Mask B, Pentium, but not Pentium MMX
347 */
348 if (c->x86_vendor == X86_VENDOR_INTEL &&
349 c->x86 == 5 &&
350 c->x86_mask >= 1 && c->x86_mask <= 4 &&
351 c->x86_model <= 3)
352 /*
353 * Remember we have B step Pentia with bugs
354 */
355 smp_b_stepping = 1;
356
357 /*
358 * Certain Athlons might work (for various values of 'work') in SMP
359 * but they are not certified as MP capable.
360 */
361 if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) {
362
363 if (num_possible_cpus() == 1)
364 goto valid_k7;
365
366 /* Athlon 660/661 is valid. */
367 if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
368 (c->x86_mask == 1)))
369 goto valid_k7;
370
371 /* Duron 670 is valid */
372 if ((c->x86_model == 7) && (c->x86_mask == 0))
373 goto valid_k7;
374
375 /*
376 * Athlon 662, Duron 671, and Athlon >model 7 have capability
377 * bit. It's worth noting that the A5 stepping (662) of some
378 * Athlon XP's have the MP bit set.
379 * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
380 * more.
381 */
382 if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
383 ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
384 (c->x86_model > 7))
385 if (cpu_has_mp)
386 goto valid_k7;
387
388 /* If we get here, not a certified SMP capable AMD system. */
389 unsafe_smp = 1;
390 }
391
392valid_k7:
393 ;
394}
395
396static void __cpuinit smp_checks(void)
397{
398 if (smp_b_stepping)
399 printk(KERN_WARNING "WARNING: SMP operation may be unreliable"
400 "with B stepping processors.\n");
401
402 /*
403 * Don't taint if we are running SMP kernel on a single non-MP
404 * approved Athlon
405 */
406 if (unsafe_smp && num_online_cpus() > 1) {
407 printk(KERN_INFO "WARNING: This combination of AMD"
408 "processors is not suitable for SMP.\n");
409 add_taint(TAINT_UNSAFE_SMP);
410 }
411}
412
413/* 337/*
414 * The bootstrap kernel entry code has set these up. Save them for 338 * The bootstrap kernel entry code has set these up. Save them for
415 * a given CPU 339 * a given CPU
@@ -423,7 +347,6 @@ void __cpuinit smp_store_cpu_info(int id)
423 c->cpu_index = id; 347 c->cpu_index = id;
424 if (id != 0) 348 if (id != 0)
425 identify_secondary_cpu(c); 349 identify_secondary_cpu(c);
426 smp_apply_quirks(c);
427} 350}
428 351
429 352
@@ -583,7 +506,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
583 /* Target chip */ 506 /* Target chip */
584 /* Boot on the stack */ 507 /* Boot on the stack */
585 /* Kick the second */ 508 /* Kick the second */
586 apic_icr_write(APIC_DM_NMI | APIC_DEST_LOGICAL, logical_apicid); 509 apic_icr_write(APIC_DM_NMI | apic->dest_logical, logical_apicid);
587 510
588 pr_debug("Waiting for send to finish...\n"); 511 pr_debug("Waiting for send to finish...\n");
589 send_status = safe_apic_wait_icr_idle(); 512 send_status = safe_apic_wait_icr_idle();
@@ -614,12 +537,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
614 unsigned long send_status, accept_status = 0; 537 unsigned long send_status, accept_status = 0;
615 int maxlvt, num_starts, j; 538 int maxlvt, num_starts, j;
616 539
617 if (get_uv_system_type() == UV_NON_UNIQUE_APIC) {
618 send_status = uv_wakeup_secondary(phys_apicid, start_eip);
619 atomic_set(&init_deasserted, 1);
620 return send_status;
621 }
622
623 maxlvt = lapic_get_maxlvt(); 540 maxlvt = lapic_get_maxlvt();
624 541
625 /* 542 /*
@@ -745,78 +662,23 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
745 complete(&c_idle->done); 662 complete(&c_idle->done);
746} 663}
747 664
748#ifdef CONFIG_X86_64
749
750/* __ref because it's safe to call free_bootmem when after_bootmem == 0. */
751static void __ref free_bootmem_pda(struct x8664_pda *oldpda)
752{
753 if (!after_bootmem)
754 free_bootmem((unsigned long)oldpda, sizeof(*oldpda));
755}
756
757/*
758 * Allocate node local memory for the AP pda.
759 *
760 * Must be called after the _cpu_pda pointer table is initialized.
761 */
762int __cpuinit get_local_pda(int cpu)
763{
764 struct x8664_pda *oldpda, *newpda;
765 unsigned long size = sizeof(struct x8664_pda);
766 int node = cpu_to_node(cpu);
767
768 if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem)
769 return 0;
770
771 oldpda = cpu_pda(cpu);
772 newpda = kmalloc_node(size, GFP_ATOMIC, node);
773 if (!newpda) {
774 printk(KERN_ERR "Could not allocate node local PDA "
775 "for CPU %d on node %d\n", cpu, node);
776
777 if (oldpda)
778 return 0; /* have a usable pda */
779 else
780 return -1;
781 }
782
783 if (oldpda) {
784 memcpy(newpda, oldpda, size);
785 free_bootmem_pda(oldpda);
786 }
787
788 newpda->in_bootmem = 0;
789 cpu_pda(cpu) = newpda;
790 return 0;
791}
792#endif /* CONFIG_X86_64 */
793
794static int __cpuinit do_boot_cpu(int apicid, int cpu)
795/* 665/*
796 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad 666 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
797 * (ie clustered apic addressing mode), this is a LOGICAL apic ID. 667 * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
798 * Returns zero if CPU booted OK, else error code from wakeup_secondary_cpu. 668 * Returns zero if CPU booted OK, else error code from
669 * ->wakeup_secondary_cpu.
799 */ 670 */
671static int __cpuinit do_boot_cpu(int apicid, int cpu)
800{ 672{
801 unsigned long boot_error = 0; 673 unsigned long boot_error = 0;
802 int timeout;
803 unsigned long start_ip; 674 unsigned long start_ip;
804 unsigned short nmi_high = 0, nmi_low = 0; 675 int timeout;
805 struct create_idle c_idle = { 676 struct create_idle c_idle = {
806 .cpu = cpu, 677 .cpu = cpu,
807 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), 678 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
808 }; 679 };
809 INIT_WORK(&c_idle.work, do_fork_idle);
810 680
811#ifdef CONFIG_X86_64 681 INIT_WORK(&c_idle.work, do_fork_idle);
812 /* Allocate node local memory for AP pdas */
813 if (cpu > 0) {
814 boot_error = get_local_pda(cpu);
815 if (boot_error)
816 goto restore_state;
817 /* if can't get pda memory, can't start cpu */
818 }
819#endif
820 682
821 alternatives_smp_switch(1); 683 alternatives_smp_switch(1);
822 684
@@ -847,14 +709,16 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
847 709
848 set_idle_for_cpu(cpu, c_idle.idle); 710 set_idle_for_cpu(cpu, c_idle.idle);
849do_rest: 711do_rest:
850#ifdef CONFIG_X86_32
851 per_cpu(current_task, cpu) = c_idle.idle; 712 per_cpu(current_task, cpu) = c_idle.idle;
852 init_gdt(cpu); 713#ifdef CONFIG_X86_32
853 /* Stack for startup_32 can be just as for start_secondary onwards */ 714 /* Stack for startup_32 can be just as for start_secondary onwards */
854 irq_ctx_init(cpu); 715 irq_ctx_init(cpu);
855#else 716#else
856 cpu_pda(cpu)->pcurrent = c_idle.idle;
857 clear_tsk_thread_flag(c_idle.idle, TIF_FORK); 717 clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
718 initial_gs = per_cpu_offset(cpu);
719 per_cpu(kernel_stack, cpu) =
720 (unsigned long)task_stack_page(c_idle.idle) -
721 KERNEL_STACK_OFFSET + THREAD_SIZE;
858#endif 722#endif
859 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); 723 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
860 initial_code = (unsigned long)start_secondary; 724 initial_code = (unsigned long)start_secondary;
@@ -878,8 +742,6 @@ do_rest:
878 742
879 pr_debug("Setting warm reset code and vector.\n"); 743 pr_debug("Setting warm reset code and vector.\n");
880 744
881 store_NMI_vector(&nmi_high, &nmi_low);
882
883 smpboot_setup_warm_reset_vector(start_ip); 745 smpboot_setup_warm_reset_vector(start_ip);
884 /* 746 /*
885 * Be paranoid about clearing APIC errors. 747 * Be paranoid about clearing APIC errors.
@@ -891,9 +753,13 @@ do_rest:
891 } 753 }
892 754
893 /* 755 /*
894 * Starting actual IPI sequence... 756 * Kick the secondary CPU. Use the method in the APIC driver
757 * if it's defined - or use an INIT boot APIC message otherwise:
895 */ 758 */
896 boot_error = wakeup_secondary_cpu(apicid, start_ip); 759 if (apic->wakeup_secondary_cpu)
760 boot_error = apic->wakeup_secondary_cpu(apicid, start_ip);
761 else
762 boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip);
897 763
898 if (!boot_error) { 764 if (!boot_error) {
899 /* 765 /*
@@ -927,13 +793,11 @@ do_rest:
927 else 793 else
928 /* trampoline code not run */ 794 /* trampoline code not run */
929 printk(KERN_ERR "Not responding.\n"); 795 printk(KERN_ERR "Not responding.\n");
930 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) 796 if (apic->inquire_remote_apic)
931 inquire_remote_apic(apicid); 797 apic->inquire_remote_apic(apicid);
932 } 798 }
933 } 799 }
934#ifdef CONFIG_X86_64 800
935restore_state:
936#endif
937 if (boot_error) { 801 if (boot_error) {
938 /* Try to put things back the way they were before ... */ 802 /* Try to put things back the way they were before ... */
939 numa_remove_cpu(cpu); /* was set by numa_add_cpu */ 803 numa_remove_cpu(cpu); /* was set by numa_add_cpu */
@@ -961,7 +825,7 @@ restore_state:
961 825
962int __cpuinit native_cpu_up(unsigned int cpu) 826int __cpuinit native_cpu_up(unsigned int cpu)
963{ 827{
964 int apicid = cpu_present_to_apicid(cpu); 828 int apicid = apic->cpu_present_to_apicid(cpu);
965 unsigned long flags; 829 unsigned long flags;
966 int err; 830 int err;
967 831
@@ -1054,14 +918,14 @@ static int __init smp_sanity_check(unsigned max_cpus)
1054{ 918{
1055 preempt_disable(); 919 preempt_disable();
1056 920
1057#if defined(CONFIG_X86_PC) && defined(CONFIG_X86_32) 921#if !defined(CONFIG_X86_BIGSMP) && defined(CONFIG_X86_32)
1058 if (def_to_bigsmp && nr_cpu_ids > 8) { 922 if (def_to_bigsmp && nr_cpu_ids > 8) {
1059 unsigned int cpu; 923 unsigned int cpu;
1060 unsigned nr; 924 unsigned nr;
1061 925
1062 printk(KERN_WARNING 926 printk(KERN_WARNING
1063 "More than 8 CPUs detected - skipping them.\n" 927 "More than 8 CPUs detected - skipping them.\n"
1064 "Use CONFIG_X86_GENERICARCH and CONFIG_X86_BIGSMP.\n"); 928 "Use CONFIG_X86_BIGSMP.\n");
1065 929
1066 nr = 0; 930 nr = 0;
1067 for_each_present_cpu(cpu) { 931 for_each_present_cpu(cpu) {
@@ -1107,7 +971,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
1107 * Should not be necessary because the MP table should list the boot 971 * Should not be necessary because the MP table should list the boot
1108 * CPU too, but we do it for the sake of robustness anyway. 972 * CPU too, but we do it for the sake of robustness anyway.
1109 */ 973 */
1110 if (!check_phys_apicid_present(boot_cpu_physical_apicid)) { 974 if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) {
1111 printk(KERN_NOTICE 975 printk(KERN_NOTICE
1112 "weird, boot CPU (#%d) not listed by the BIOS.\n", 976 "weird, boot CPU (#%d) not listed by the BIOS.\n",
1113 boot_cpu_physical_apicid); 977 boot_cpu_physical_apicid);
@@ -1125,6 +989,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
1125 printk(KERN_ERR "... forcing use of dummy APIC emulation." 989 printk(KERN_ERR "... forcing use of dummy APIC emulation."
1126 "(tell your hw vendor)\n"); 990 "(tell your hw vendor)\n");
1127 smpboot_clear_io_apic(); 991 smpboot_clear_io_apic();
992 arch_disable_smp_support();
1128 return -1; 993 return -1;
1129 } 994 }
1130 995
@@ -1181,9 +1046,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1181 current_thread_info()->cpu = 0; /* needed? */ 1046 current_thread_info()->cpu = 0; /* needed? */
1182 set_cpu_sibling_map(0); 1047 set_cpu_sibling_map(0);
1183 1048
1184#ifdef CONFIG_X86_64
1185 enable_IR_x2apic(); 1049 enable_IR_x2apic();
1186 setup_apic_routing(); 1050#ifdef CONFIG_X86_64
1051 default_setup_apic_routing();
1187#endif 1052#endif
1188 1053
1189 if (smp_sanity_check(max_cpus) < 0) { 1054 if (smp_sanity_check(max_cpus) < 0) {
@@ -1207,18 +1072,18 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1207 */ 1072 */
1208 setup_local_APIC(); 1073 setup_local_APIC();
1209 1074
1210#ifdef CONFIG_X86_64
1211 /* 1075 /*
1212 * Enable IO APIC before setting up error vector 1076 * Enable IO APIC before setting up error vector
1213 */ 1077 */
1214 if (!skip_ioapic_setup && nr_ioapics) 1078 if (!skip_ioapic_setup && nr_ioapics)
1215 enable_IO_APIC(); 1079 enable_IO_APIC();
1216#endif 1080
1217 end_local_APIC_setup(); 1081 end_local_APIC_setup();
1218 1082
1219 map_cpu_to_logical_apicid(); 1083 map_cpu_to_logical_apicid();
1220 1084
1221 setup_portio_remap(); 1085 if (apic->setup_portio_remap)
1086 apic->setup_portio_remap();
1222 1087
1223 smpboot_setup_io_apic(); 1088 smpboot_setup_io_apic();
1224 /* 1089 /*
@@ -1240,10 +1105,7 @@ out:
1240void __init native_smp_prepare_boot_cpu(void) 1105void __init native_smp_prepare_boot_cpu(void)
1241{ 1106{
1242 int me = smp_processor_id(); 1107 int me = smp_processor_id();
1243#ifdef CONFIG_X86_32 1108 switch_to_new_gdt(me);
1244 init_gdt(me);
1245#endif
1246 switch_to_new_gdt();
1247 /* already set me in cpu_online_mask in boot_cpu_init() */ 1109 /* already set me in cpu_online_mask in boot_cpu_init() */
1248 cpumask_set_cpu(me, cpu_callout_mask); 1110 cpumask_set_cpu(me, cpu_callout_mask);
1249 per_cpu(cpu_state, me) = CPU_ONLINE; 1111 per_cpu(cpu_state, me) = CPU_ONLINE;
@@ -1254,7 +1116,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1254 pr_debug("Boot done.\n"); 1116 pr_debug("Boot done.\n");
1255 1117
1256 impress_friends(); 1118 impress_friends();
1257 smp_checks();
1258#ifdef CONFIG_X86_IO_APIC 1119#ifdef CONFIG_X86_IO_APIC
1259 setup_ioapic_dest(); 1120 setup_ioapic_dest();
1260#endif 1121#endif
diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c
deleted file mode 100644
index 397e309839dd..000000000000
--- a/arch/x86/kernel/smpcommon.c
+++ /dev/null
@@ -1,30 +0,0 @@
1/*
2 * SMP stuff which is common to all sub-architectures.
3 */
4#include <linux/module.h>
5#include <asm/smp.h>
6
7#ifdef CONFIG_X86_32
8DEFINE_PER_CPU(unsigned long, this_cpu_off);
9EXPORT_PER_CPU_SYMBOL(this_cpu_off);
10
11/*
12 * Initialize the CPU's GDT. This is either the boot CPU doing itself
13 * (still using the master per-cpu area), or a CPU doing it for a
14 * secondary which will soon come up.
15 */
16__cpuinit void init_gdt(int cpu)
17{
18 struct desc_struct gdt;
19
20 pack_descriptor(&gdt, __per_cpu_offset[cpu], 0xFFFFF,
21 0x2 | DESCTYPE_S, 0x8);
22 gdt.s = 1;
23
24 write_gdt_entry(get_cpu_gdt_table(cpu),
25 GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
26
27 per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
28 per_cpu(cpu_number, cpu) = cpu;
29}
30#endif
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 10786af95545..f7bddc2e37d1 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Stack trace management functions 2 * Stack trace management functions
3 * 3 *
4 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 4 * Copyright (C) 2006-2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5 */ 5 */
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/stacktrace.h> 7#include <linux/stacktrace.h>
diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c
deleted file mode 100644
index 7b987852e876..000000000000
--- a/arch/x86/kernel/summit_32.c
+++ /dev/null
@@ -1,188 +0,0 @@
1/*
2 * IBM Summit-Specific Code
3 *
4 * Written By: Matthew Dobson, IBM Corporation
5 *
6 * Copyright (c) 2003 IBM Corp.
7 *
8 * All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or (at
13 * your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
18 * NON INFRINGEMENT. See the GNU General Public License for more
19 * details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 * Send feedback to <colpatch@us.ibm.com>
26 *
27 */
28
29#include <linux/mm.h>
30#include <linux/init.h>
31#include <asm/io.h>
32#include <asm/bios_ebda.h>
33#include <asm/summit/mpparse.h>
34
35static struct rio_table_hdr *rio_table_hdr __initdata;
36static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata;
37static struct rio_detail *rio_devs[MAX_NUMNODES*4] __initdata;
38
39#ifndef CONFIG_X86_NUMAQ
40static int mp_bus_id_to_node[MAX_MP_BUSSES] __initdata;
41#endif
42
43static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
44{
45 int twister = 0, node = 0;
46 int i, bus, num_buses;
47
48 for (i = 0; i < rio_table_hdr->num_rio_dev; i++) {
49 if (rio_devs[i]->node_id == rio_devs[wpeg_num]->owner_id) {
50 twister = rio_devs[i]->owner_id;
51 break;
52 }
53 }
54 if (i == rio_table_hdr->num_rio_dev) {
55 printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __func__);
56 return last_bus;
57 }
58
59 for (i = 0; i < rio_table_hdr->num_scal_dev; i++) {
60 if (scal_devs[i]->node_id == twister) {
61 node = scal_devs[i]->node_id;
62 break;
63 }
64 }
65 if (i == rio_table_hdr->num_scal_dev) {
66 printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __func__);
67 return last_bus;
68 }
69
70 switch (rio_devs[wpeg_num]->type) {
71 case CompatWPEG:
72 /*
73 * The Compatibility Winnipeg controls the 2 legacy buses,
74 * the 66MHz PCI bus [2 slots] and the 2 "extra" buses in case
75 * a PCI-PCI bridge card is used in either slot: total 5 buses.
76 */
77 num_buses = 5;
78 break;
79 case AltWPEG:
80 /*
81 * The Alternate Winnipeg controls the 2 133MHz buses [1 slot
82 * each], their 2 "extra" buses, the 100MHz bus [2 slots] and
83 * the "extra" buses for each of those slots: total 7 buses.
84 */
85 num_buses = 7;
86 break;
87 case LookOutAWPEG:
88 case LookOutBWPEG:
89 /*
90 * A Lookout Winnipeg controls 3 100MHz buses [2 slots each]
91 * & the "extra" buses for each of those slots: total 9 buses.
92 */
93 num_buses = 9;
94 break;
95 default:
96 printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __func__);
97 return last_bus;
98 }
99
100 for (bus = last_bus; bus < last_bus + num_buses; bus++)
101 mp_bus_id_to_node[bus] = node;
102 return bus;
103}
104
105static int __init build_detail_arrays(void)
106{
107 unsigned long ptr;
108 int i, scal_detail_size, rio_detail_size;
109
110 if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) {
111 printk(KERN_WARNING "%s: MAX_NUMNODES too low! Defined as %d, but system has %d nodes.\n", __func__, MAX_NUMNODES, rio_table_hdr->num_scal_dev);
112 return 0;
113 }
114
115 switch (rio_table_hdr->version) {
116 default:
117 printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __func__, rio_table_hdr->version);
118 return 0;
119 case 2:
120 scal_detail_size = 11;
121 rio_detail_size = 13;
122 break;
123 case 3:
124 scal_detail_size = 12;
125 rio_detail_size = 15;
126 break;
127 }
128
129 ptr = (unsigned long)rio_table_hdr + 3;
130 for (i = 0; i < rio_table_hdr->num_scal_dev; i++, ptr += scal_detail_size)
131 scal_devs[i] = (struct scal_detail *)ptr;
132
133 for (i = 0; i < rio_table_hdr->num_rio_dev; i++, ptr += rio_detail_size)
134 rio_devs[i] = (struct rio_detail *)ptr;
135
136 return 1;
137}
138
139void __init setup_summit(void)
140{
141 unsigned long ptr;
142 unsigned short offset;
143 int i, next_wpeg, next_bus = 0;
144
145 /* The pointer to the EBDA is stored in the word @ phys 0x40E(40:0E) */
146 ptr = get_bios_ebda();
147 ptr = (unsigned long)phys_to_virt(ptr);
148
149 rio_table_hdr = NULL;
150 offset = 0x180;
151 while (offset) {
152 /* The block id is stored in the 2nd word */
153 if (*((unsigned short *)(ptr + offset + 2)) == 0x4752) {
154 /* set the pointer past the offset & block id */
155 rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
156 break;
157 }
158 /* The next offset is stored in the 1st word. 0 means no more */
159 offset = *((unsigned short *)(ptr + offset));
160 }
161 if (!rio_table_hdr) {
162 printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __func__);
163 return;
164 }
165
166 if (!build_detail_arrays())
167 return;
168
169 /* The first Winnipeg we're looking for has an index of 0 */
170 next_wpeg = 0;
171 do {
172 for (i = 0; i < rio_table_hdr->num_rio_dev; i++) {
173 if (is_WPEG(rio_devs[i]) && rio_devs[i]->WP_index == next_wpeg) {
174 /* It's the Winnipeg we're looking for! */
175 next_bus = setup_pci_node_map_for_wpeg(i, next_bus);
176 next_wpeg++;
177 break;
178 }
179 }
180 /*
181 * If we go through all Rio devices and don't find one with
182 * the next index, it means we've found all the Winnipegs,
183 * and thus all the PCI buses.
184 */
185 if (i == rio_table_hdr->num_rio_dev)
186 next_wpeg = 0;
187 } while (next_wpeg != 0);
188}
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index e2e86a08f31d..3bdb64829b82 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -1,7 +1,7 @@
1ENTRY(sys_call_table) 1ENTRY(sys_call_table)
2 .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */ 2 .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */
3 .long sys_exit 3 .long sys_exit
4 .long sys_fork 4 .long ptregs_fork
5 .long sys_read 5 .long sys_read
6 .long sys_write 6 .long sys_write
7 .long sys_open /* 5 */ 7 .long sys_open /* 5 */
@@ -10,7 +10,7 @@ ENTRY(sys_call_table)
10 .long sys_creat 10 .long sys_creat
11 .long sys_link 11 .long sys_link
12 .long sys_unlink /* 10 */ 12 .long sys_unlink /* 10 */
13 .long sys_execve 13 .long ptregs_execve
14 .long sys_chdir 14 .long sys_chdir
15 .long sys_time 15 .long sys_time
16 .long sys_mknod 16 .long sys_mknod
@@ -109,17 +109,17 @@ ENTRY(sys_call_table)
109 .long sys_newlstat 109 .long sys_newlstat
110 .long sys_newfstat 110 .long sys_newfstat
111 .long sys_uname 111 .long sys_uname
112 .long sys_iopl /* 110 */ 112 .long ptregs_iopl /* 110 */
113 .long sys_vhangup 113 .long sys_vhangup
114 .long sys_ni_syscall /* old "idle" system call */ 114 .long sys_ni_syscall /* old "idle" system call */
115 .long sys_vm86old 115 .long ptregs_vm86old
116 .long sys_wait4 116 .long sys_wait4
117 .long sys_swapoff /* 115 */ 117 .long sys_swapoff /* 115 */
118 .long sys_sysinfo 118 .long sys_sysinfo
119 .long sys_ipc 119 .long sys_ipc
120 .long sys_fsync 120 .long sys_fsync
121 .long sys_sigreturn 121 .long ptregs_sigreturn
122 .long sys_clone /* 120 */ 122 .long ptregs_clone /* 120 */
123 .long sys_setdomainname 123 .long sys_setdomainname
124 .long sys_newuname 124 .long sys_newuname
125 .long sys_modify_ldt 125 .long sys_modify_ldt
@@ -165,14 +165,14 @@ ENTRY(sys_call_table)
165 .long sys_mremap 165 .long sys_mremap
166 .long sys_setresuid16 166 .long sys_setresuid16
167 .long sys_getresuid16 /* 165 */ 167 .long sys_getresuid16 /* 165 */
168 .long sys_vm86 168 .long ptregs_vm86
169 .long sys_ni_syscall /* Old sys_query_module */ 169 .long sys_ni_syscall /* Old sys_query_module */
170 .long sys_poll 170 .long sys_poll
171 .long sys_nfsservctl 171 .long sys_nfsservctl
172 .long sys_setresgid16 /* 170 */ 172 .long sys_setresgid16 /* 170 */
173 .long sys_getresgid16 173 .long sys_getresgid16
174 .long sys_prctl 174 .long sys_prctl
175 .long sys_rt_sigreturn 175 .long ptregs_rt_sigreturn
176 .long sys_rt_sigaction 176 .long sys_rt_sigaction
177 .long sys_rt_sigprocmask /* 175 */ 177 .long sys_rt_sigprocmask /* 175 */
178 .long sys_rt_sigpending 178 .long sys_rt_sigpending
@@ -185,11 +185,11 @@ ENTRY(sys_call_table)
185 .long sys_getcwd 185 .long sys_getcwd
186 .long sys_capget 186 .long sys_capget
187 .long sys_capset /* 185 */ 187 .long sys_capset /* 185 */
188 .long sys_sigaltstack 188 .long ptregs_sigaltstack
189 .long sys_sendfile 189 .long sys_sendfile
190 .long sys_ni_syscall /* reserved for streams1 */ 190 .long sys_ni_syscall /* reserved for streams1 */
191 .long sys_ni_syscall /* reserved for streams2 */ 191 .long sys_ni_syscall /* reserved for streams2 */
192 .long sys_vfork /* 190 */ 192 .long ptregs_vfork /* 190 */
193 .long sys_getrlimit 193 .long sys_getrlimit
194 .long sys_mmap2 194 .long sys_mmap2
195 .long sys_truncate64 195 .long sys_truncate64
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
index 3985cac0ed47..5c5d87f0b2e1 100644
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -33,12 +33,12 @@
33#include <linux/time.h> 33#include <linux/time.h>
34#include <linux/mca.h> 34#include <linux/mca.h>
35 35
36#include <asm/arch_hooks.h> 36#include <asm/setup.h>
37#include <asm/hpet.h> 37#include <asm/hpet.h>
38#include <asm/time.h> 38#include <asm/time.h>
39#include <asm/timer.h> 39#include <asm/timer.h>
40 40
41#include "do_timer.h" 41#include <asm/do_timer.h>
42 42
43int timer_ack; 43int timer_ack;
44 44
@@ -118,7 +118,7 @@ void __init hpet_time_init(void)
118{ 118{
119 if (!hpet_enable()) 119 if (!hpet_enable())
120 setup_pit_timer(); 120 setup_pit_timer();
121 time_init_hook(); 121 x86_quirk_time_init();
122} 122}
123 123
124/* 124/*
@@ -131,7 +131,7 @@ void __init hpet_time_init(void)
131 */ 131 */
132void __init time_init(void) 132void __init time_init(void)
133{ 133{
134 pre_time_init_hook(); 134 x86_quirk_pre_time_init();
135 tsc_init(); 135 tsc_init();
136 late_time_init = choose_time_init(); 136 late_time_init = choose_time_init();
137} 137}
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
deleted file mode 100644
index ce5054642247..000000000000
--- a/arch/x86/kernel/tlb_32.c
+++ /dev/null
@@ -1,256 +0,0 @@
1#include <linux/spinlock.h>
2#include <linux/cpu.h>
3#include <linux/interrupt.h>
4
5#include <asm/tlbflush.h>
6
7DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate)
8 ____cacheline_aligned = { &init_mm, 0, };
9
10/* must come after the send_IPI functions above for inlining */
11#include <mach_ipi.h>
12
13/*
14 * Smarter SMP flushing macros.
15 * c/o Linus Torvalds.
16 *
17 * These mean you can really definitely utterly forget about
18 * writing to user space from interrupts. (Its not allowed anyway).
19 *
20 * Optimizations Manfred Spraul <manfred@colorfullife.com>
21 */
22
23static cpumask_t flush_cpumask;
24static struct mm_struct *flush_mm;
25static unsigned long flush_va;
26static DEFINE_SPINLOCK(tlbstate_lock);
27
28/*
29 * We cannot call mmdrop() because we are in interrupt context,
30 * instead update mm->cpu_vm_mask.
31 *
32 * We need to reload %cr3 since the page tables may be going
33 * away from under us..
34 */
35void leave_mm(int cpu)
36{
37 BUG_ON(x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK);
38 cpu_clear(cpu, x86_read_percpu(cpu_tlbstate.active_mm)->cpu_vm_mask);
39 load_cr3(swapper_pg_dir);
40}
41EXPORT_SYMBOL_GPL(leave_mm);
42
43/*
44 *
45 * The flush IPI assumes that a thread switch happens in this order:
46 * [cpu0: the cpu that switches]
47 * 1) switch_mm() either 1a) or 1b)
48 * 1a) thread switch to a different mm
49 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
50 * Stop ipi delivery for the old mm. This is not synchronized with
51 * the other cpus, but smp_invalidate_interrupt ignore flush ipis
52 * for the wrong mm, and in the worst case we perform a superfluous
53 * tlb flush.
54 * 1a2) set cpu_tlbstate to TLBSTATE_OK
55 * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
56 * was in lazy tlb mode.
57 * 1a3) update cpu_tlbstate[].active_mm
58 * Now cpu0 accepts tlb flushes for the new mm.
59 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
60 * Now the other cpus will send tlb flush ipis.
61 * 1a4) change cr3.
62 * 1b) thread switch without mm change
63 * cpu_tlbstate[].active_mm is correct, cpu0 already handles
64 * flush ipis.
65 * 1b1) set cpu_tlbstate to TLBSTATE_OK
66 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
67 * Atomically set the bit [other cpus will start sending flush ipis],
68 * and test the bit.
69 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
70 * 2) switch %%esp, ie current
71 *
72 * The interrupt must handle 2 special cases:
73 * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
74 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
75 * runs in kernel space, the cpu could load tlb entries for user space
76 * pages.
77 *
78 * The good news is that cpu_tlbstate is local to each cpu, no
79 * write/read ordering problems.
80 */
81
82/*
83 * TLB flush IPI:
84 *
85 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
86 * 2) Leave the mm if we are in the lazy tlb mode.
87 */
88
89void smp_invalidate_interrupt(struct pt_regs *regs)
90{
91 unsigned long cpu;
92
93 cpu = get_cpu();
94
95 if (!cpu_isset(cpu, flush_cpumask))
96 goto out;
97 /*
98 * This was a BUG() but until someone can quote me the
99 * line from the intel manual that guarantees an IPI to
100 * multiple CPUs is retried _only_ on the erroring CPUs
101 * its staying as a return
102 *
103 * BUG();
104 */
105
106 if (flush_mm == x86_read_percpu(cpu_tlbstate.active_mm)) {
107 if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) {
108 if (flush_va == TLB_FLUSH_ALL)
109 local_flush_tlb();
110 else
111 __flush_tlb_one(flush_va);
112 } else
113 leave_mm(cpu);
114 }
115 ack_APIC_irq();
116 smp_mb__before_clear_bit();
117 cpu_clear(cpu, flush_cpumask);
118 smp_mb__after_clear_bit();
119out:
120 put_cpu_no_resched();
121 inc_irq_stat(irq_tlb_count);
122}
123
124void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
125 unsigned long va)
126{
127 cpumask_t cpumask = *cpumaskp;
128
129 /*
130 * A couple of (to be removed) sanity checks:
131 *
132 * - current CPU must not be in mask
133 * - mask must exist :)
134 */
135 BUG_ON(cpus_empty(cpumask));
136 BUG_ON(cpu_isset(smp_processor_id(), cpumask));
137 BUG_ON(!mm);
138
139#ifdef CONFIG_HOTPLUG_CPU
140 /* If a CPU which we ran on has gone down, OK. */
141 cpus_and(cpumask, cpumask, cpu_online_map);
142 if (unlikely(cpus_empty(cpumask)))
143 return;
144#endif
145
146 /*
147 * i'm not happy about this global shared spinlock in the
148 * MM hot path, but we'll see how contended it is.
149 * AK: x86-64 has a faster method that could be ported.
150 */
151 spin_lock(&tlbstate_lock);
152
153 flush_mm = mm;
154 flush_va = va;
155 cpus_or(flush_cpumask, cpumask, flush_cpumask);
156
157 /*
158 * Make the above memory operations globally visible before
159 * sending the IPI.
160 */
161 smp_mb();
162 /*
163 * We have to send the IPI only to
164 * CPUs affected.
165 */
166 send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR);
167
168 while (!cpus_empty(flush_cpumask))
169 /* nothing. lockup detection does not belong here */
170 cpu_relax();
171
172 flush_mm = NULL;
173 flush_va = 0;
174 spin_unlock(&tlbstate_lock);
175}
176
177void flush_tlb_current_task(void)
178{
179 struct mm_struct *mm = current->mm;
180 cpumask_t cpu_mask;
181
182 preempt_disable();
183 cpu_mask = mm->cpu_vm_mask;
184 cpu_clear(smp_processor_id(), cpu_mask);
185
186 local_flush_tlb();
187 if (!cpus_empty(cpu_mask))
188 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
189 preempt_enable();
190}
191
192void flush_tlb_mm(struct mm_struct *mm)
193{
194 cpumask_t cpu_mask;
195
196 preempt_disable();
197 cpu_mask = mm->cpu_vm_mask;
198 cpu_clear(smp_processor_id(), cpu_mask);
199
200 if (current->active_mm == mm) {
201 if (current->mm)
202 local_flush_tlb();
203 else
204 leave_mm(smp_processor_id());
205 }
206 if (!cpus_empty(cpu_mask))
207 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
208
209 preempt_enable();
210}
211
212void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
213{
214 struct mm_struct *mm = vma->vm_mm;
215 cpumask_t cpu_mask;
216
217 preempt_disable();
218 cpu_mask = mm->cpu_vm_mask;
219 cpu_clear(smp_processor_id(), cpu_mask);
220
221 if (current->active_mm == mm) {
222 if (current->mm)
223 __flush_tlb_one(va);
224 else
225 leave_mm(smp_processor_id());
226 }
227
228 if (!cpus_empty(cpu_mask))
229 flush_tlb_others(cpu_mask, mm, va);
230
231 preempt_enable();
232}
233EXPORT_SYMBOL(flush_tlb_page);
234
235static void do_flush_tlb_all(void *info)
236{
237 unsigned long cpu = smp_processor_id();
238
239 __flush_tlb_all();
240 if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_LAZY)
241 leave_mm(cpu);
242}
243
244void flush_tlb_all(void)
245{
246 on_each_cpu(do_flush_tlb_all, NULL, 1);
247}
248
249void reset_lazy_tlbstate(void)
250{
251 int cpu = raw_smp_processor_id();
252
253 per_cpu(cpu_tlbstate, cpu).state = 0;
254 per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
255}
256
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 6812b829ed83..d038b9c45cf8 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -11,16 +11,15 @@
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12 12
13#include <asm/mmu_context.h> 13#include <asm/mmu_context.h>
14#include <asm/uv/uv.h>
14#include <asm/uv/uv_mmrs.h> 15#include <asm/uv/uv_mmrs.h>
15#include <asm/uv/uv_hub.h> 16#include <asm/uv/uv_hub.h>
16#include <asm/uv/uv_bau.h> 17#include <asm/uv/uv_bau.h>
17#include <asm/genapic.h> 18#include <asm/apic.h>
18#include <asm/idle.h> 19#include <asm/idle.h>
19#include <asm/tsc.h> 20#include <asm/tsc.h>
20#include <asm/irq_vectors.h> 21#include <asm/irq_vectors.h>
21 22
22#include <mach_apic.h>
23
24static struct bau_control **uv_bau_table_bases __read_mostly; 23static struct bau_control **uv_bau_table_bases __read_mostly;
25static int uv_bau_retry_limit __read_mostly; 24static int uv_bau_retry_limit __read_mostly;
26 25
@@ -210,14 +209,15 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
210 * 209 *
211 * Send a broadcast and wait for a broadcast message to complete. 210 * Send a broadcast and wait for a broadcast message to complete.
212 * 211 *
213 * The cpumaskp mask contains the cpus the broadcast was sent to. 212 * The flush_mask contains the cpus the broadcast was sent to.
214 * 213 *
215 * Returns 1 if all remote flushing was done. The mask is zeroed. 214 * Returns NULL if all remote flushing was done. The mask is zeroed.
216 * Returns 0 if some remote flushing remains to be done. The mask is left 215 * Returns @flush_mask if some remote flushing remains to be done. The
217 * unchanged. 216 * mask will have some bits still set.
218 */ 217 */
219int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc, 218const struct cpumask *uv_flush_send_and_wait(int cpu, int this_blade,
220 cpumask_t *cpumaskp) 219 struct bau_desc *bau_desc,
220 struct cpumask *flush_mask)
221{ 221{
222 int completion_status = 0; 222 int completion_status = 0;
223 int right_shift; 223 int right_shift;
@@ -257,66 +257,74 @@ int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc,
257 * the cpu's, all of which are still in the mask. 257 * the cpu's, all of which are still in the mask.
258 */ 258 */
259 __get_cpu_var(ptcstats).ptc_i++; 259 __get_cpu_var(ptcstats).ptc_i++;
260 return 0; 260 return flush_mask;
261 } 261 }
262 262
263 /* 263 /*
264 * Success, so clear the remote cpu's from the mask so we don't 264 * Success, so clear the remote cpu's from the mask so we don't
265 * use the IPI method of shootdown on them. 265 * use the IPI method of shootdown on them.
266 */ 266 */
267 for_each_cpu_mask(bit, *cpumaskp) { 267 for_each_cpu(bit, flush_mask) {
268 blade = uv_cpu_to_blade_id(bit); 268 blade = uv_cpu_to_blade_id(bit);
269 if (blade == this_blade) 269 if (blade == this_blade)
270 continue; 270 continue;
271 cpu_clear(bit, *cpumaskp); 271 cpumask_clear_cpu(bit, flush_mask);
272 } 272 }
273 if (!cpus_empty(*cpumaskp)) 273 if (!cpumask_empty(flush_mask))
274 return 0; 274 return flush_mask;
275 return 1; 275 return NULL;
276} 276}
277 277
278/** 278/**
279 * uv_flush_tlb_others - globally purge translation cache of a virtual 279 * uv_flush_tlb_others - globally purge translation cache of a virtual
280 * address or all TLB's 280 * address or all TLB's
281 * @cpumaskp: mask of all cpu's in which the address is to be removed 281 * @cpumask: mask of all cpu's in which the address is to be removed
282 * @mm: mm_struct containing virtual address range 282 * @mm: mm_struct containing virtual address range
283 * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu) 283 * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu)
284 * @cpu: the current cpu
284 * 285 *
285 * This is the entry point for initiating any UV global TLB shootdown. 286 * This is the entry point for initiating any UV global TLB shootdown.
286 * 287 *
287 * Purges the translation caches of all specified processors of the given 288 * Purges the translation caches of all specified processors of the given
288 * virtual address, or purges all TLB's on specified processors. 289 * virtual address, or purges all TLB's on specified processors.
289 * 290 *
290 * The caller has derived the cpumaskp from the mm_struct and has subtracted 291 * The caller has derived the cpumask from the mm_struct. This function
291 * the local cpu from the mask. This function is called only if there 292 * is called only if there are bits set in the mask. (e.g. flush_tlb_page())
292 * are bits set in the mask. (e.g. flush_tlb_page())
293 * 293 *
294 * The cpumaskp is converted into a nodemask of the nodes containing 294 * The cpumask is converted into a nodemask of the nodes containing
295 * the cpus. 295 * the cpus.
296 * 296 *
297 * Returns 1 if all remote flushing was done. 297 * Note that this function should be called with preemption disabled.
298 * Returns 0 if some remote flushing remains to be done. 298 *
299 * Returns NULL if all remote flushing was done.
300 * Returns pointer to cpumask if some remote flushing remains to be
301 * done. The returned pointer is valid till preemption is re-enabled.
299 */ 302 */
300int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm, 303const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
301 unsigned long va) 304 struct mm_struct *mm,
305 unsigned long va, unsigned int cpu)
302{ 306{
307 static DEFINE_PER_CPU(cpumask_t, flush_tlb_mask);
308 struct cpumask *flush_mask = &__get_cpu_var(flush_tlb_mask);
303 int i; 309 int i;
304 int bit; 310 int bit;
305 int blade; 311 int blade;
306 int cpu; 312 int uv_cpu;
307 int this_blade; 313 int this_blade;
308 int locals = 0; 314 int locals = 0;
309 struct bau_desc *bau_desc; 315 struct bau_desc *bau_desc;
310 316
311 cpu = uv_blade_processor_id(); 317 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
318
319 uv_cpu = uv_blade_processor_id();
312 this_blade = uv_numa_blade_id(); 320 this_blade = uv_numa_blade_id();
313 bau_desc = __get_cpu_var(bau_control).descriptor_base; 321 bau_desc = __get_cpu_var(bau_control).descriptor_base;
314 bau_desc += UV_ITEMS_PER_DESCRIPTOR * cpu; 322 bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu;
315 323
316 bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); 324 bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
317 325
318 i = 0; 326 i = 0;
319 for_each_cpu_mask(bit, *cpumaskp) { 327 for_each_cpu(bit, flush_mask) {
320 blade = uv_cpu_to_blade_id(bit); 328 blade = uv_cpu_to_blade_id(bit);
321 BUG_ON(blade > (UV_DISTRIBUTION_SIZE - 1)); 329 BUG_ON(blade > (UV_DISTRIBUTION_SIZE - 1));
322 if (blade == this_blade) { 330 if (blade == this_blade) {
@@ -331,17 +339,17 @@ int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm,
331 * no off_node flushing; return status for local node 339 * no off_node flushing; return status for local node
332 */ 340 */
333 if (locals) 341 if (locals)
334 return 0; 342 return flush_mask;
335 else 343 else
336 return 1; 344 return NULL;
337 } 345 }
338 __get_cpu_var(ptcstats).requestor++; 346 __get_cpu_var(ptcstats).requestor++;
339 __get_cpu_var(ptcstats).ntargeted += i; 347 __get_cpu_var(ptcstats).ntargeted += i;
340 348
341 bau_desc->payload.address = va; 349 bau_desc->payload.address = va;
342 bau_desc->payload.sending_cpu = smp_processor_id(); 350 bau_desc->payload.sending_cpu = cpu;
343 351
344 return uv_flush_send_and_wait(cpu, this_blade, bau_desc, cpumaskp); 352 return uv_flush_send_and_wait(uv_cpu, this_blade, bau_desc, flush_mask);
345} 353}
346 354
347/* 355/*
diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S
index d8ccc3c6552f..66d874e5404c 100644
--- a/arch/x86/kernel/trampoline_32.S
+++ b/arch/x86/kernel/trampoline_32.S
@@ -29,7 +29,7 @@
29 29
30#include <linux/linkage.h> 30#include <linux/linkage.h>
31#include <asm/segment.h> 31#include <asm/segment.h>
32#include <asm/page.h> 32#include <asm/page_types.h>
33 33
34/* We can free up trampoline after bootup if cpu hotplug is not supported. */ 34/* We can free up trampoline after bootup if cpu hotplug is not supported. */
35#ifndef CONFIG_HOTPLUG_CPU 35#ifndef CONFIG_HOTPLUG_CPU
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
index 894293c598db..cddfb8d386b9 100644
--- a/arch/x86/kernel/trampoline_64.S
+++ b/arch/x86/kernel/trampoline_64.S
@@ -25,10 +25,11 @@
25 */ 25 */
26 26
27#include <linux/linkage.h> 27#include <linux/linkage.h>
28#include <asm/pgtable.h> 28#include <asm/pgtable_types.h>
29#include <asm/page.h> 29#include <asm/page_types.h>
30#include <asm/msr.h> 30#include <asm/msr.h>
31#include <asm/segment.h> 31#include <asm/segment.h>
32#include <asm/processor-flags.h>
32 33
33.section .rodata, "a", @progbits 34.section .rodata, "a", @progbits
34 35
@@ -37,7 +38,7 @@
37ENTRY(trampoline_data) 38ENTRY(trampoline_data)
38r_base = . 39r_base = .
39 cli # We should be safe anyway 40 cli # We should be safe anyway
40 wbinvd 41 wbinvd
41 mov %cs, %ax # Code and data in the same place 42 mov %cs, %ax # Code and data in the same place
42 mov %ax, %ds 43 mov %ax, %ds
43 mov %ax, %es 44 mov %ax, %es
@@ -73,9 +74,8 @@ r_base = .
73 lidtl tidt - r_base # load idt with 0, 0 74 lidtl tidt - r_base # load idt with 0, 0
74 lgdtl tgdt - r_base # load gdt with whatever is appropriate 75 lgdtl tgdt - r_base # load gdt with whatever is appropriate
75 76
76 xor %ax, %ax 77 mov $X86_CR0_PE, %ax # protected mode (PE) bit
77 inc %ax # protected mode (PE) bit 78 lmsw %ax # into protected mode
78 lmsw %ax # into protected mode
79 79
80 # flush prefetch and jump to startup_32 80 # flush prefetch and jump to startup_32
81 ljmpl *(startup_32_vector - r_base) 81 ljmpl *(startup_32_vector - r_base)
@@ -86,9 +86,8 @@ startup_32:
86 movl $__KERNEL_DS, %eax # Initialize the %ds segment register 86 movl $__KERNEL_DS, %eax # Initialize the %ds segment register
87 movl %eax, %ds 87 movl %eax, %ds
88 88
89 xorl %eax, %eax 89 movl $X86_CR4_PAE, %eax
90 btsl $5, %eax # Enable PAE mode 90 movl %eax, %cr4 # Enable PAE mode
91 movl %eax, %cr4
92 91
93 # Setup trampoline 4 level pagetables 92 # Setup trampoline 4 level pagetables
94 leal (trampoline_level4_pgt - r_base)(%esi), %eax 93 leal (trampoline_level4_pgt - r_base)(%esi), %eax
@@ -99,9 +98,9 @@ startup_32:
99 xorl %edx, %edx 98 xorl %edx, %edx
100 wrmsr 99 wrmsr
101 100
102 xorl %eax, %eax 101 # Enable paging and in turn activate Long Mode
103 btsl $31, %eax # Enable paging and in turn activate Long Mode 102 # Enable protected mode
104 btsl $0, %eax # Enable protected mode 103 movl $(X86_CR0_PG | X86_CR0_PE), %eax
105 movl %eax, %cr0 104 movl %eax, %cr0
106 105
107 /* 106 /*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a9e7548e1790..a1d288327ff0 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -54,15 +54,14 @@
54#include <asm/desc.h> 54#include <asm/desc.h>
55#include <asm/i387.h> 55#include <asm/i387.h>
56 56
57#include <mach_traps.h> 57#include <asm/mach_traps.h>
58 58
59#ifdef CONFIG_X86_64 59#ifdef CONFIG_X86_64
60#include <asm/pgalloc.h> 60#include <asm/pgalloc.h>
61#include <asm/proto.h> 61#include <asm/proto.h>
62#include <asm/pda.h>
63#else 62#else
64#include <asm/processor-flags.h> 63#include <asm/processor-flags.h>
65#include <asm/arch_hooks.h> 64#include <asm/setup.h>
66#include <asm/traps.h> 65#include <asm/traps.h>
67 66
68#include "cpu/mcheck/mce.h" 67#include "cpu/mcheck/mce.h"
@@ -119,47 +118,6 @@ die_if_kernel(const char *str, struct pt_regs *regs, long err)
119 if (!user_mode_vm(regs)) 118 if (!user_mode_vm(regs))
120 die(str, regs, err); 119 die(str, regs, err);
121} 120}
122
123/*
124 * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
125 * invalid offset set (the LAZY one) and the faulting thread has
126 * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS,
127 * we set the offset field correctly and return 1.
128 */
129static int lazy_iobitmap_copy(void)
130{
131 struct thread_struct *thread;
132 struct tss_struct *tss;
133 int cpu;
134
135 cpu = get_cpu();
136 tss = &per_cpu(init_tss, cpu);
137 thread = &current->thread;
138
139 if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
140 thread->io_bitmap_ptr) {
141 memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
142 thread->io_bitmap_max);
143 /*
144 * If the previously set map was extending to higher ports
145 * than the current one, pad extra space with 0xff (no access).
146 */
147 if (thread->io_bitmap_max < tss->io_bitmap_max) {
148 memset((char *) tss->io_bitmap +
149 thread->io_bitmap_max, 0xff,
150 tss->io_bitmap_max - thread->io_bitmap_max);
151 }
152 tss->io_bitmap_max = thread->io_bitmap_max;
153 tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
154 tss->io_bitmap_owner = thread;
155 put_cpu();
156
157 return 1;
158 }
159 put_cpu();
160
161 return 0;
162}
163#endif 121#endif
164 122
165static void __kprobes 123static void __kprobes
@@ -310,11 +268,6 @@ do_general_protection(struct pt_regs *regs, long error_code)
310 conditional_sti(regs); 268 conditional_sti(regs);
311 269
312#ifdef CONFIG_X86_32 270#ifdef CONFIG_X86_32
313 if (lazy_iobitmap_copy()) {
314 /* restart the faulting instruction */
315 return;
316 }
317
318 if (regs->flags & X86_VM_MASK) 271 if (regs->flags & X86_VM_MASK)
319 goto gp_in_vm86; 272 goto gp_in_vm86;
320#endif 273#endif
@@ -914,19 +867,20 @@ void math_emulate(struct math_emu_info *info)
914} 867}
915#endif /* CONFIG_MATH_EMULATION */ 868#endif /* CONFIG_MATH_EMULATION */
916 869
917dotraplinkage void __kprobes do_device_not_available(struct pt_regs regs) 870dotraplinkage void __kprobes
871do_device_not_available(struct pt_regs *regs, long error_code)
918{ 872{
919#ifdef CONFIG_X86_32 873#ifdef CONFIG_X86_32
920 if (read_cr0() & X86_CR0_EM) { 874 if (read_cr0() & X86_CR0_EM) {
921 struct math_emu_info info = { }; 875 struct math_emu_info info = { };
922 876
923 conditional_sti(&regs); 877 conditional_sti(regs);
924 878
925 info.regs = &regs; 879 info.regs = regs;
926 math_emulate(&info); 880 math_emulate(&info);
927 } else { 881 } else {
928 math_state_restore(); /* interrupts still off */ 882 math_state_restore(); /* interrupts still off */
929 conditional_sti(&regs); 883 conditional_sti(regs);
930 } 884 }
931#else 885#else
932 math_state_restore(); 886 math_state_restore();
@@ -942,7 +896,7 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
942 info.si_signo = SIGILL; 896 info.si_signo = SIGILL;
943 info.si_errno = 0; 897 info.si_errno = 0;
944 info.si_code = ILL_BADSTK; 898 info.si_code = ILL_BADSTK;
945 info.si_addr = 0; 899 info.si_addr = NULL;
946 if (notify_die(DIE_TRAP, "iret exception", 900 if (notify_die(DIE_TRAP, "iret exception",
947 regs, error_code, 32, SIGILL) == NOTIFY_STOP) 901 regs, error_code, 32, SIGILL) == NOTIFY_STOP)
948 return; 902 return;
@@ -1026,6 +980,6 @@ void __init trap_init(void)
1026 cpu_init(); 980 cpu_init();
1027 981
1028#ifdef CONFIG_X86_32 982#ifdef CONFIG_X86_32
1029 trap_init_hook(); 983 x86_quirk_trap_init();
1030#endif 984#endif
1031} 985}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index d5cebb52d45b..462b9ba67e92 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -793,7 +793,7 @@ __cpuinit int unsynchronized_tsc(void)
793 if (!cpu_has_tsc || tsc_unstable) 793 if (!cpu_has_tsc || tsc_unstable)
794 return 1; 794 return 1;
795 795
796#ifdef CONFIG_X86_SMP 796#ifdef CONFIG_SMP
797 if (apic_is_clustered_box()) 797 if (apic_is_clustered_box())
798 return 1; 798 return 1;
799#endif 799#endif
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
new file mode 100644
index 000000000000..2ffb6c53326e
--- /dev/null
+++ b/arch/x86/kernel/uv_time.c
@@ -0,0 +1,393 @@
1/*
2 * SGI RTC clock/timer routines.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Copyright (c) 2009 Silicon Graphics, Inc. All Rights Reserved.
19 * Copyright (c) Dimitri Sivanich
20 */
21#include <linux/clockchips.h>
22
23#include <asm/uv/uv_mmrs.h>
24#include <asm/uv/uv_hub.h>
25#include <asm/uv/bios.h>
26#include <asm/uv/uv.h>
27#include <asm/apic.h>
28#include <asm/cpu.h>
29
30#define RTC_NAME "sgi_rtc"
31
32static cycle_t uv_read_rtc(void);
33static int uv_rtc_next_event(unsigned long, struct clock_event_device *);
34static void uv_rtc_timer_setup(enum clock_event_mode,
35 struct clock_event_device *);
36
37static struct clocksource clocksource_uv = {
38 .name = RTC_NAME,
39 .rating = 400,
40 .read = uv_read_rtc,
41 .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK,
42 .shift = 10,
43 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
44};
45
46static struct clock_event_device clock_event_device_uv = {
47 .name = RTC_NAME,
48 .features = CLOCK_EVT_FEAT_ONESHOT,
49 .shift = 20,
50 .rating = 400,
51 .irq = -1,
52 .set_next_event = uv_rtc_next_event,
53 .set_mode = uv_rtc_timer_setup,
54 .event_handler = NULL,
55};
56
57static DEFINE_PER_CPU(struct clock_event_device, cpu_ced);
58
59/* There is one of these allocated per node */
60struct uv_rtc_timer_head {
61 spinlock_t lock;
62 /* next cpu waiting for timer, local node relative: */
63 int next_cpu;
64 /* number of cpus on this node: */
65 int ncpus;
66 struct {
67 int lcpu; /* systemwide logical cpu number */
68 u64 expires; /* next timer expiration for this cpu */
69 } cpu[1];
70};
71
72/*
73 * Access to uv_rtc_timer_head via blade id.
74 */
75static struct uv_rtc_timer_head **blade_info __read_mostly;
76
77static int uv_rtc_enable;
78
79/*
80 * Hardware interface routines
81 */
82
83/* Send IPIs to another node */
84static void uv_rtc_send_IPI(int cpu)
85{
86 unsigned long apicid, val;
87 int pnode;
88
89 apicid = cpu_physical_id(cpu);
90 pnode = uv_apicid_to_pnode(apicid);
91 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
92 (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
93 (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT);
94
95 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
96}
97
98/* Check for an RTC interrupt pending */
99static int uv_intr_pending(int pnode)
100{
101 return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) &
102 UVH_EVENT_OCCURRED0_RTC1_MASK;
103}
104
105/* Setup interrupt and return non-zero if early expiration occurred. */
106static int uv_setup_intr(int cpu, u64 expires)
107{
108 u64 val;
109 int pnode = uv_cpu_to_pnode(cpu);
110
111 uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
112 UVH_RTC1_INT_CONFIG_M_MASK);
113 uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L);
114
115 uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
116 UVH_EVENT_OCCURRED0_RTC1_MASK);
117
118 val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
119 ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
120
121 /* Set configuration */
122 uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val);
123 /* Initialize comparator value */
124 uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires);
125
126 return (expires < uv_read_rtc() && !uv_intr_pending(pnode));
127}
128
129/*
130 * Per-cpu timer tracking routines
131 */
132
133static __init void uv_rtc_deallocate_timers(void)
134{
135 int bid;
136
137 for_each_possible_blade(bid) {
138 kfree(blade_info[bid]);
139 }
140 kfree(blade_info);
141}
142
143/* Allocate per-node list of cpu timer expiration times. */
144static __init int uv_rtc_allocate_timers(void)
145{
146 int cpu;
147
148 blade_info = kmalloc(uv_possible_blades * sizeof(void *), GFP_KERNEL);
149 if (!blade_info)
150 return -ENOMEM;
151 memset(blade_info, 0, uv_possible_blades * sizeof(void *));
152
153 for_each_present_cpu(cpu) {
154 int nid = cpu_to_node(cpu);
155 int bid = uv_cpu_to_blade_id(cpu);
156 int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
157 struct uv_rtc_timer_head *head = blade_info[bid];
158
159 if (!head) {
160 head = kmalloc_node(sizeof(struct uv_rtc_timer_head) +
161 (uv_blade_nr_possible_cpus(bid) *
162 2 * sizeof(u64)),
163 GFP_KERNEL, nid);
164 if (!head) {
165 uv_rtc_deallocate_timers();
166 return -ENOMEM;
167 }
168 spin_lock_init(&head->lock);
169 head->ncpus = uv_blade_nr_possible_cpus(bid);
170 head->next_cpu = -1;
171 blade_info[bid] = head;
172 }
173
174 head->cpu[bcpu].lcpu = cpu;
175 head->cpu[bcpu].expires = ULLONG_MAX;
176 }
177
178 return 0;
179}
180
181/* Find and set the next expiring timer. */
182static void uv_rtc_find_next_timer(struct uv_rtc_timer_head *head, int pnode)
183{
184 u64 lowest = ULLONG_MAX;
185 int c, bcpu = -1;
186
187 head->next_cpu = -1;
188 for (c = 0; c < head->ncpus; c++) {
189 u64 exp = head->cpu[c].expires;
190 if (exp < lowest) {
191 bcpu = c;
192 lowest = exp;
193 }
194 }
195 if (bcpu >= 0) {
196 head->next_cpu = bcpu;
197 c = head->cpu[bcpu].lcpu;
198 if (uv_setup_intr(c, lowest))
199 /* If we didn't set it up in time, trigger */
200 uv_rtc_send_IPI(c);
201 } else {
202 uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
203 UVH_RTC1_INT_CONFIG_M_MASK);
204 }
205}
206
207/*
208 * Set expiration time for current cpu.
209 *
210 * Returns 1 if we missed the expiration time.
211 */
212static int uv_rtc_set_timer(int cpu, u64 expires)
213{
214 int pnode = uv_cpu_to_pnode(cpu);
215 int bid = uv_cpu_to_blade_id(cpu);
216 struct uv_rtc_timer_head *head = blade_info[bid];
217 int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
218 u64 *t = &head->cpu[bcpu].expires;
219 unsigned long flags;
220 int next_cpu;
221
222 spin_lock_irqsave(&head->lock, flags);
223
224 next_cpu = head->next_cpu;
225 *t = expires;
226 /* Will this one be next to go off? */
227 if (next_cpu < 0 || bcpu == next_cpu ||
228 expires < head->cpu[next_cpu].expires) {
229 head->next_cpu = bcpu;
230 if (uv_setup_intr(cpu, expires)) {
231 *t = ULLONG_MAX;
232 uv_rtc_find_next_timer(head, pnode);
233 spin_unlock_irqrestore(&head->lock, flags);
234 return 1;
235 }
236 }
237
238 spin_unlock_irqrestore(&head->lock, flags);
239 return 0;
240}
241
242/*
243 * Unset expiration time for current cpu.
244 *
245 * Returns 1 if this timer was pending.
246 */
247static int uv_rtc_unset_timer(int cpu)
248{
249 int pnode = uv_cpu_to_pnode(cpu);
250 int bid = uv_cpu_to_blade_id(cpu);
251 struct uv_rtc_timer_head *head = blade_info[bid];
252 int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
253 u64 *t = &head->cpu[bcpu].expires;
254 unsigned long flags;
255 int rc = 0;
256
257 spin_lock_irqsave(&head->lock, flags);
258
259 if (head->next_cpu == bcpu && uv_read_rtc() >= *t)
260 rc = 1;
261
262 *t = ULLONG_MAX;
263
264 /* Was the hardware setup for this timer? */
265 if (head->next_cpu == bcpu)
266 uv_rtc_find_next_timer(head, pnode);
267
268 spin_unlock_irqrestore(&head->lock, flags);
269
270 return rc;
271}
272
273
274/*
275 * Kernel interface routines.
276 */
277
278/*
279 * Read the RTC.
280 */
281static cycle_t uv_read_rtc(void)
282{
283 return (cycle_t)uv_read_local_mmr(UVH_RTC);
284}
285
286/*
287 * Program the next event, relative to now
288 */
289static int uv_rtc_next_event(unsigned long delta,
290 struct clock_event_device *ced)
291{
292 int ced_cpu = cpumask_first(ced->cpumask);
293
294 return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc());
295}
296
297/*
298 * Setup the RTC timer in oneshot mode
299 */
300static void uv_rtc_timer_setup(enum clock_event_mode mode,
301 struct clock_event_device *evt)
302{
303 int ced_cpu = cpumask_first(evt->cpumask);
304
305 switch (mode) {
306 case CLOCK_EVT_MODE_PERIODIC:
307 case CLOCK_EVT_MODE_ONESHOT:
308 case CLOCK_EVT_MODE_RESUME:
309 /* Nothing to do here yet */
310 break;
311 case CLOCK_EVT_MODE_UNUSED:
312 case CLOCK_EVT_MODE_SHUTDOWN:
313 uv_rtc_unset_timer(ced_cpu);
314 break;
315 }
316}
317
318static void uv_rtc_interrupt(void)
319{
320 struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
321 int cpu = smp_processor_id();
322
323 if (!ced || !ced->event_handler)
324 return;
325
326 if (uv_rtc_unset_timer(cpu) != 1)
327 return;
328
329 ced->event_handler(ced);
330}
331
332static int __init uv_enable_rtc(char *str)
333{
334 uv_rtc_enable = 1;
335
336 return 1;
337}
338__setup("uvrtc", uv_enable_rtc);
339
340static __init void uv_rtc_register_clockevents(struct work_struct *dummy)
341{
342 struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
343
344 *ced = clock_event_device_uv;
345 ced->cpumask = cpumask_of(smp_processor_id());
346 clockevents_register_device(ced);
347}
348
349static __init int uv_rtc_setup_clock(void)
350{
351 int rc;
352
353 if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension)
354 return -ENODEV;
355
356 generic_interrupt_extension = uv_rtc_interrupt;
357
358 clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
359 clocksource_uv.shift);
360
361 rc = clocksource_register(&clocksource_uv);
362 if (rc) {
363 generic_interrupt_extension = NULL;
364 return rc;
365 }
366
367 /* Setup and register clockevents */
368 rc = uv_rtc_allocate_timers();
369 if (rc) {
370 clocksource_unregister(&clocksource_uv);
371 generic_interrupt_extension = NULL;
372 return rc;
373 }
374
375 clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second,
376 NSEC_PER_SEC, clock_event_device_uv.shift);
377
378 clock_event_device_uv.min_delta_ns = NSEC_PER_SEC /
379 sn_rtc_cycles_per_second;
380
381 clock_event_device_uv.max_delta_ns = clocksource_uv.mask *
382 (NSEC_PER_SEC / sn_rtc_cycles_per_second);
383
384 rc = schedule_on_each_cpu(uv_rtc_register_clockevents);
385 if (rc) {
386 clocksource_unregister(&clocksource_uv);
387 generic_interrupt_extension = NULL;
388 uv_rtc_deallocate_timers();
389 }
390
391 return rc;
392}
393arch_initcall(uv_rtc_setup_clock);
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index d801d06af068..31ffc24eec4d 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -24,18 +24,14 @@
24 24
25#include <asm/visws/cobalt.h> 25#include <asm/visws/cobalt.h>
26#include <asm/visws/piix4.h> 26#include <asm/visws/piix4.h>
27#include <asm/arch_hooks.h>
28#include <asm/io_apic.h> 27#include <asm/io_apic.h>
29#include <asm/fixmap.h> 28#include <asm/fixmap.h>
30#include <asm/reboot.h> 29#include <asm/reboot.h>
31#include <asm/setup.h> 30#include <asm/setup.h>
31#include <asm/apic.h>
32#include <asm/e820.h> 32#include <asm/e820.h>
33#include <asm/io.h> 33#include <asm/io.h>
34 34
35#include <mach_ipi.h>
36
37#include "mach_apic.h"
38
39#include <linux/kernel_stat.h> 35#include <linux/kernel_stat.h>
40 36
41#include <asm/i8259.h> 37#include <asm/i8259.h>
@@ -49,8 +45,6 @@
49 45
50extern int no_broadcast; 46extern int no_broadcast;
51 47
52#include <asm/apic.h>
53
54char visws_board_type = -1; 48char visws_board_type = -1;
55char visws_board_rev = -1; 49char visws_board_rev = -1;
56 50
@@ -200,7 +194,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
200 return; 194 return;
201 } 195 }
202 196
203 apic_cpus = apicid_to_cpu_present(m->apicid); 197 apic_cpus = apic->apicid_to_cpu_present(m->apicid);
204 physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus); 198 physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus);
205 /* 199 /*
206 * Validate version 200 * Validate version
@@ -584,7 +578,7 @@ static struct irq_chip piix4_virtual_irq_type = {
584static irqreturn_t piix4_master_intr(int irq, void *dev_id) 578static irqreturn_t piix4_master_intr(int irq, void *dev_id)
585{ 579{
586 int realirq; 580 int realirq;
587 irq_desc_t *desc; 581 struct irq_desc *desc;
588 unsigned long flags; 582 unsigned long flags;
589 583
590 spin_lock_irqsave(&i8259A_lock, flags); 584 spin_lock_irqsave(&i8259A_lock, flags);
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 4eeb5cf9720d..d7ac84e7fc1c 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -158,7 +158,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
158 ret = KVM86->regs32; 158 ret = KVM86->regs32;
159 159
160 ret->fs = current->thread.saved_fs; 160 ret->fs = current->thread.saved_fs;
161 loadsegment(gs, current->thread.saved_gs); 161 set_user_gs(ret, current->thread.saved_gs);
162 162
163 return ret; 163 return ret;
164} 164}
@@ -197,9 +197,9 @@ out:
197static int do_vm86_irq_handling(int subfunction, int irqnumber); 197static int do_vm86_irq_handling(int subfunction, int irqnumber);
198static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk); 198static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk);
199 199
200asmlinkage int sys_vm86old(struct pt_regs regs) 200int sys_vm86old(struct pt_regs *regs)
201{ 201{
202 struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.bx; 202 struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs->bx;
203 struct kernel_vm86_struct info; /* declare this _on top_, 203 struct kernel_vm86_struct info; /* declare this _on top_,
204 * this avoids wasting of stack space. 204 * this avoids wasting of stack space.
205 * This remains on the stack until we 205 * This remains on the stack until we
@@ -218,7 +218,7 @@ asmlinkage int sys_vm86old(struct pt_regs regs)
218 if (tmp) 218 if (tmp)
219 goto out; 219 goto out;
220 memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus); 220 memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus);
221 info.regs32 = &regs; 221 info.regs32 = regs;
222 tsk->thread.vm86_info = v86; 222 tsk->thread.vm86_info = v86;
223 do_sys_vm86(&info, tsk); 223 do_sys_vm86(&info, tsk);
224 ret = 0; /* we never return here */ 224 ret = 0; /* we never return here */
@@ -227,7 +227,7 @@ out:
227} 227}
228 228
229 229
230asmlinkage int sys_vm86(struct pt_regs regs) 230int sys_vm86(struct pt_regs *regs)
231{ 231{
232 struct kernel_vm86_struct info; /* declare this _on top_, 232 struct kernel_vm86_struct info; /* declare this _on top_,
233 * this avoids wasting of stack space. 233 * this avoids wasting of stack space.
@@ -239,12 +239,12 @@ asmlinkage int sys_vm86(struct pt_regs regs)
239 struct vm86plus_struct __user *v86; 239 struct vm86plus_struct __user *v86;
240 240
241 tsk = current; 241 tsk = current;
242 switch (regs.bx) { 242 switch (regs->bx) {
243 case VM86_REQUEST_IRQ: 243 case VM86_REQUEST_IRQ:
244 case VM86_FREE_IRQ: 244 case VM86_FREE_IRQ:
245 case VM86_GET_IRQ_BITS: 245 case VM86_GET_IRQ_BITS:
246 case VM86_GET_AND_RESET_IRQ: 246 case VM86_GET_AND_RESET_IRQ:
247 ret = do_vm86_irq_handling(regs.bx, (int)regs.cx); 247 ret = do_vm86_irq_handling(regs->bx, (int)regs->cx);
248 goto out; 248 goto out;
249 case VM86_PLUS_INSTALL_CHECK: 249 case VM86_PLUS_INSTALL_CHECK:
250 /* 250 /*
@@ -261,14 +261,14 @@ asmlinkage int sys_vm86(struct pt_regs regs)
261 ret = -EPERM; 261 ret = -EPERM;
262 if (tsk->thread.saved_sp0) 262 if (tsk->thread.saved_sp0)
263 goto out; 263 goto out;
264 v86 = (struct vm86plus_struct __user *)regs.cx; 264 v86 = (struct vm86plus_struct __user *)regs->cx;
265 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, 265 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
266 offsetof(struct kernel_vm86_struct, regs32) - 266 offsetof(struct kernel_vm86_struct, regs32) -
267 sizeof(info.regs)); 267 sizeof(info.regs));
268 ret = -EFAULT; 268 ret = -EFAULT;
269 if (tmp) 269 if (tmp)
270 goto out; 270 goto out;
271 info.regs32 = &regs; 271 info.regs32 = regs;
272 info.vm86plus.is_vm86pus = 1; 272 info.vm86plus.is_vm86pus = 1;
273 tsk->thread.vm86_info = (struct vm86_struct __user *)v86; 273 tsk->thread.vm86_info = (struct vm86_struct __user *)v86;
274 do_sys_vm86(&info, tsk); 274 do_sys_vm86(&info, tsk);
@@ -323,7 +323,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
323 info->regs32->ax = 0; 323 info->regs32->ax = 0;
324 tsk->thread.saved_sp0 = tsk->thread.sp0; 324 tsk->thread.saved_sp0 = tsk->thread.sp0;
325 tsk->thread.saved_fs = info->regs32->fs; 325 tsk->thread.saved_fs = info->regs32->fs;
326 savesegment(gs, tsk->thread.saved_gs); 326 tsk->thread.saved_gs = get_user_gs(info->regs32);
327 327
328 tss = &per_cpu(init_tss, get_cpu()); 328 tss = &per_cpu(init_tss, get_cpu());
329 tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; 329 tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0;
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index bef58b4982db..2cc4a90e2cb3 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -680,10 +680,11 @@ static inline int __init activate_vmi(void)
680 para_fill(pv_mmu_ops.write_cr2, SetCR2); 680 para_fill(pv_mmu_ops.write_cr2, SetCR2);
681 para_fill(pv_mmu_ops.write_cr3, SetCR3); 681 para_fill(pv_mmu_ops.write_cr3, SetCR3);
682 para_fill(pv_cpu_ops.write_cr4, SetCR4); 682 para_fill(pv_cpu_ops.write_cr4, SetCR4);
683 para_fill(pv_irq_ops.save_fl, GetInterruptMask); 683
684 para_fill(pv_irq_ops.restore_fl, SetInterruptMask); 684 para_fill(pv_irq_ops.save_fl.func, GetInterruptMask);
685 para_fill(pv_irq_ops.irq_disable, DisableInterrupts); 685 para_fill(pv_irq_ops.restore_fl.func, SetInterruptMask);
686 para_fill(pv_irq_ops.irq_enable, EnableInterrupts); 686 para_fill(pv_irq_ops.irq_disable.func, DisableInterrupts);
687 para_fill(pv_irq_ops.irq_enable.func, EnableInterrupts);
687 688
688 para_fill(pv_cpu_ops.wbinvd, WBINVD); 689 para_fill(pv_cpu_ops.wbinvd, WBINVD);
689 para_fill(pv_cpu_ops.read_tsc, RDTSC); 690 para_fill(pv_cpu_ops.read_tsc, RDTSC);
@@ -797,8 +798,8 @@ static inline int __init activate_vmi(void)
797#endif 798#endif
798 799
799#ifdef CONFIG_X86_LOCAL_APIC 800#ifdef CONFIG_X86_LOCAL_APIC
800 para_fill(apic_ops->read, APICRead); 801 para_fill(apic->read, APICRead);
801 para_fill(apic_ops->write, APICWrite); 802 para_fill(apic->write, APICWrite);
802#endif 803#endif
803 804
804 /* 805 /*
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index e5b088fffa40..33a788d5879c 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -28,7 +28,6 @@
28 28
29#include <asm/vmi.h> 29#include <asm/vmi.h>
30#include <asm/vmi_time.h> 30#include <asm/vmi_time.h>
31#include <asm/arch_hooks.h>
32#include <asm/apicdef.h> 31#include <asm/apicdef.h>
33#include <asm/apic.h> 32#include <asm/apic.h>
34#include <asm/timer.h> 33#include <asm/timer.h>
@@ -256,7 +255,7 @@ void __devinit vmi_time_bsp_init(void)
256 */ 255 */
257 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 256 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
258 local_irq_disable(); 257 local_irq_disable();
259#ifdef CONFIG_X86_SMP 258#ifdef CONFIG_SMP
260 /* 259 /*
261 * XXX handle_percpu_irq only defined for SMP; we need to switch over 260 * XXX handle_percpu_irq only defined for SMP; we need to switch over
262 * to using it, since this is a local interrupt, which each CPU must 261 * to using it, since this is a local interrupt, which each CPU must
@@ -288,8 +287,7 @@ static struct clocksource clocksource_vmi;
288static cycle_t read_real_cycles(void) 287static cycle_t read_real_cycles(void)
289{ 288{
290 cycle_t ret = (cycle_t)vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL); 289 cycle_t ret = (cycle_t)vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
291 return ret >= clocksource_vmi.cycle_last ? 290 return max(ret, clocksource_vmi.cycle_last);
292 ret : clocksource_vmi.cycle_last;
293} 291}
294 292
295static struct clocksource clocksource_vmi = { 293static struct clocksource clocksource_vmi = {
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 82c67559dde7..0d860963f268 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -12,7 +12,7 @@
12 12
13#include <asm-generic/vmlinux.lds.h> 13#include <asm-generic/vmlinux.lds.h>
14#include <asm/thread_info.h> 14#include <asm/thread_info.h>
15#include <asm/page.h> 15#include <asm/page_types.h>
16#include <asm/cache.h> 16#include <asm/cache.h>
17#include <asm/boot.h> 17#include <asm/boot.h>
18 18
@@ -178,14 +178,7 @@ SECTIONS
178 __initramfs_end = .; 178 __initramfs_end = .;
179 } 179 }
180#endif 180#endif
181 . = ALIGN(PAGE_SIZE); 181 PERCPU(PAGE_SIZE)
182 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
183 __per_cpu_start = .;
184 *(.data.percpu.page_aligned)
185 *(.data.percpu)
186 *(.data.percpu.shared_aligned)
187 __per_cpu_end = .;
188 }
189 . = ALIGN(PAGE_SIZE); 182 . = ALIGN(PAGE_SIZE);
190 /* freed after init ends here */ 183 /* freed after init ends here */
191 184
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 1a614c0e6bef..5bf54e40c6ef 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -5,7 +5,8 @@
5#define LOAD_OFFSET __START_KERNEL_map 5#define LOAD_OFFSET __START_KERNEL_map
6 6
7#include <asm-generic/vmlinux.lds.h> 7#include <asm-generic/vmlinux.lds.h>
8#include <asm/page.h> 8#include <asm/asm-offsets.h>
9#include <asm/page_types.h>
9 10
10#undef i386 /* in case the preprocessor is a 32bit one */ 11#undef i386 /* in case the preprocessor is a 32bit one */
11 12
@@ -13,12 +14,15 @@ OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
13OUTPUT_ARCH(i386:x86-64) 14OUTPUT_ARCH(i386:x86-64)
14ENTRY(phys_startup_64) 15ENTRY(phys_startup_64)
15jiffies_64 = jiffies; 16jiffies_64 = jiffies;
16_proxy_pda = 1;
17PHDRS { 17PHDRS {
18 text PT_LOAD FLAGS(5); /* R_E */ 18 text PT_LOAD FLAGS(5); /* R_E */
19 data PT_LOAD FLAGS(7); /* RWE */ 19 data PT_LOAD FLAGS(7); /* RWE */
20 user PT_LOAD FLAGS(7); /* RWE */ 20 user PT_LOAD FLAGS(7); /* RWE */
21 data.init PT_LOAD FLAGS(7); /* RWE */ 21 data.init PT_LOAD FLAGS(7); /* RWE */
22#ifdef CONFIG_SMP
23 percpu PT_LOAD FLAGS(7); /* RWE */
24#endif
25 data.init2 PT_LOAD FLAGS(7); /* RWE */
22 note PT_NOTE FLAGS(0); /* ___ */ 26 note PT_NOTE FLAGS(0); /* ___ */
23} 27}
24SECTIONS 28SECTIONS
@@ -208,14 +212,28 @@ SECTIONS
208 __initramfs_end = .; 212 __initramfs_end = .;
209#endif 213#endif
210 214
215#ifdef CONFIG_SMP
216 /*
217 * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
218 * output PHDR, so the next output section - __data_nosave - should
219 * start another section data.init2. Also, pda should be at the head of
220 * percpu area. Preallocate it and define the percpu offset symbol
221 * so that it can be accessed as a percpu variable.
222 */
223 . = ALIGN(PAGE_SIZE);
224 PERCPU_VADDR(0, :percpu)
225#else
211 PERCPU(PAGE_SIZE) 226 PERCPU(PAGE_SIZE)
227#endif
212 228
213 . = ALIGN(PAGE_SIZE); 229 . = ALIGN(PAGE_SIZE);
214 __init_end = .; 230 __init_end = .;
215 231
216 . = ALIGN(PAGE_SIZE); 232 . = ALIGN(PAGE_SIZE);
217 __nosave_begin = .; 233 __nosave_begin = .;
218 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) } 234 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
235 *(.data.nosave)
236 } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */
219 . = ALIGN(PAGE_SIZE); 237 . = ALIGN(PAGE_SIZE);
220 __nosave_end = .; 238 __nosave_end = .;
221 239
@@ -239,8 +257,28 @@ SECTIONS
239 DWARF_DEBUG 257 DWARF_DEBUG
240} 258}
241 259
260 /*
261 * Per-cpu symbols which need to be offset from __per_cpu_load
262 * for the boot processor.
263 */
264#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
265INIT_PER_CPU(gdt_page);
266INIT_PER_CPU(irq_stack_union);
267
242/* 268/*
243 * Build-time check on the image size: 269 * Build-time check on the image size:
244 */ 270 */
245ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), 271ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
246 "kernel image bigger than KERNEL_IMAGE_SIZE") 272 "kernel image bigger than KERNEL_IMAGE_SIZE")
273
274#ifdef CONFIG_SMP
275ASSERT((per_cpu__irq_stack_union == 0),
276 "irq_stack_union is not at start of per-cpu area");
277#endif
278
279#ifdef CONFIG_KEXEC
280#include <asm/kexec.h>
281
282ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
283 "kexec control code size is too big")
284#endif
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index a688f3bfaec2..74de562812cc 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -22,7 +22,7 @@
22#include <asm/paravirt.h> 22#include <asm/paravirt.h>
23#include <asm/setup.h> 23#include <asm/setup.h>
24 24
25#if defined CONFIG_PCI && defined CONFIG_PARAVIRT 25#ifdef CONFIG_PARAVIRT
26/* 26/*
27 * Interrupt control on vSMPowered systems: 27 * Interrupt control on vSMPowered systems:
28 * ~AC is a shadow of IF. If IF is 'on' AC should be 'off' 28 * ~AC is a shadow of IF. If IF is 'on' AC should be 'off'
@@ -37,6 +37,7 @@ static unsigned long vsmp_save_fl(void)
37 flags &= ~X86_EFLAGS_IF; 37 flags &= ~X86_EFLAGS_IF;
38 return flags; 38 return flags;
39} 39}
40PV_CALLEE_SAVE_REGS_THUNK(vsmp_save_fl);
40 41
41static void vsmp_restore_fl(unsigned long flags) 42static void vsmp_restore_fl(unsigned long flags)
42{ 43{
@@ -46,6 +47,7 @@ static void vsmp_restore_fl(unsigned long flags)
46 flags |= X86_EFLAGS_AC; 47 flags |= X86_EFLAGS_AC;
47 native_restore_fl(flags); 48 native_restore_fl(flags);
48} 49}
50PV_CALLEE_SAVE_REGS_THUNK(vsmp_restore_fl);
49 51
50static void vsmp_irq_disable(void) 52static void vsmp_irq_disable(void)
51{ 53{
@@ -53,6 +55,7 @@ static void vsmp_irq_disable(void)
53 55
54 native_restore_fl((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC); 56 native_restore_fl((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC);
55} 57}
58PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_disable);
56 59
57static void vsmp_irq_enable(void) 60static void vsmp_irq_enable(void)
58{ 61{
@@ -60,6 +63,7 @@ static void vsmp_irq_enable(void)
60 63
61 native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC)); 64 native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
62} 65}
66PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_enable);
63 67
64static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf, 68static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf,
65 unsigned long addr, unsigned len) 69 unsigned long addr, unsigned len)
@@ -90,10 +94,10 @@ static void __init set_vsmp_pv_ops(void)
90 cap, ctl); 94 cap, ctl);
91 if (cap & ctl & (1 << 4)) { 95 if (cap & ctl & (1 << 4)) {
92 /* Setup irq ops and turn on vSMP IRQ fastpath handling */ 96 /* Setup irq ops and turn on vSMP IRQ fastpath handling */
93 pv_irq_ops.irq_disable = vsmp_irq_disable; 97 pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable);
94 pv_irq_ops.irq_enable = vsmp_irq_enable; 98 pv_irq_ops.irq_enable = PV_CALLEE_SAVE(vsmp_irq_enable);
95 pv_irq_ops.save_fl = vsmp_save_fl; 99 pv_irq_ops.save_fl = PV_CALLEE_SAVE(vsmp_save_fl);
96 pv_irq_ops.restore_fl = vsmp_restore_fl; 100 pv_irq_ops.restore_fl = PV_CALLEE_SAVE(vsmp_restore_fl);
97 pv_init_ops.patch = vsmp_patch; 101 pv_init_ops.patch = vsmp_patch;
98 102
99 ctl &= ~(1 << 4); 103 ctl &= ~(1 << 4);
@@ -110,7 +114,6 @@ static void __init set_vsmp_pv_ops(void)
110} 114}
111#endif 115#endif
112 116
113#ifdef CONFIG_PCI
114static int is_vsmp = -1; 117static int is_vsmp = -1;
115 118
116static void __init detect_vsmp_box(void) 119static void __init detect_vsmp_box(void)
@@ -135,15 +138,6 @@ int is_vsmp_box(void)
135 return 0; 138 return 0;
136 } 139 }
137} 140}
138#else
139static void __init detect_vsmp_box(void)
140{
141}
142int is_vsmp_box(void)
143{
144 return 0;
145}
146#endif
147 141
148void __init vsmp_init(void) 142void __init vsmp_init(void)
149{ 143{
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 695e426aa354..3909e3ba5ce3 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -58,5 +58,3 @@ EXPORT_SYMBOL(__memcpy);
58EXPORT_SYMBOL(empty_zero_page); 58EXPORT_SYMBOL(empty_zero_page);
59EXPORT_SYMBOL(init_level4_pgt); 59EXPORT_SYMBOL(init_level4_pgt);
60EXPORT_SYMBOL(load_gs_index); 60EXPORT_SYMBOL(load_gs_index);
61
62EXPORT_SYMBOL(_proxy_pda);
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index b81125f0bdee..c7da3683f4c5 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -55,7 +55,8 @@ config KVM_AMD
55 55
56config KVM_TRACE 56config KVM_TRACE
57 bool "KVM trace support" 57 bool "KVM trace support"
58 depends on KVM && MARKERS && SYSFS 58 depends on KVM && SYSFS
59 select MARKERS
59 select RELAY 60 select RELAY
60 select DEBUG_FS 61 select DEBUG_FS
61 default n 62 default n
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
index c70e12b1a637..8dab8f7844d3 100644
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -3,7 +3,6 @@ config LGUEST_GUEST
3 select PARAVIRT 3 select PARAVIRT
4 depends on X86_32 4 depends on X86_32
5 depends on !X86_PAE 5 depends on !X86_PAE
6 depends on !X86_VOYAGER
7 select VIRTIO 6 select VIRTIO
8 select VIRTIO_RING 7 select VIRTIO_RING
9 select VIRTIO_CONSOLE 8 select VIRTIO_CONSOLE
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 960a8d9c049c..9fe4ddaa8f6f 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -173,24 +173,29 @@ static unsigned long save_fl(void)
173{ 173{
174 return lguest_data.irq_enabled; 174 return lguest_data.irq_enabled;
175} 175}
176PV_CALLEE_SAVE_REGS_THUNK(save_fl);
176 177
177/* restore_flags() just sets the flags back to the value given. */ 178/* restore_flags() just sets the flags back to the value given. */
178static void restore_fl(unsigned long flags) 179static void restore_fl(unsigned long flags)
179{ 180{
180 lguest_data.irq_enabled = flags; 181 lguest_data.irq_enabled = flags;
181} 182}
183PV_CALLEE_SAVE_REGS_THUNK(restore_fl);
182 184
183/* Interrupts go off... */ 185/* Interrupts go off... */
184static void irq_disable(void) 186static void irq_disable(void)
185{ 187{
186 lguest_data.irq_enabled = 0; 188 lguest_data.irq_enabled = 0;
187} 189}
190PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
188 191
189/* Interrupts go on... */ 192/* Interrupts go on... */
190static void irq_enable(void) 193static void irq_enable(void)
191{ 194{
192 lguest_data.irq_enabled = X86_EFLAGS_IF; 195 lguest_data.irq_enabled = X86_EFLAGS_IF;
193} 196}
197PV_CALLEE_SAVE_REGS_THUNK(irq_enable);
198
194/*:*/ 199/*:*/
195/*M:003 Note that we don't check for outstanding interrupts when we re-enable 200/*M:003 Note that we don't check for outstanding interrupts when we re-enable
196 * them (or when we unmask an interrupt). This seems to work for the moment, 201 * them (or when we unmask an interrupt). This seems to work for the moment,
@@ -278,7 +283,7 @@ static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
278 /* There's one problem which normal hardware doesn't have: the Host 283 /* There's one problem which normal hardware doesn't have: the Host
279 * can't handle us removing entries we're currently using. So we clear 284 * can't handle us removing entries we're currently using. So we clear
280 * the GS register here: if it's needed it'll be reloaded anyway. */ 285 * the GS register here: if it's needed it'll be reloaded anyway. */
281 loadsegment(gs, 0); 286 lazy_load_gs(0);
282 lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0); 287 lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0);
283} 288}
284 289
@@ -830,13 +835,14 @@ static u32 lguest_apic_safe_wait_icr_idle(void)
830 return 0; 835 return 0;
831} 836}
832 837
833static struct apic_ops lguest_basic_apic_ops = { 838static void set_lguest_basic_apic_ops(void)
834 .read = lguest_apic_read, 839{
835 .write = lguest_apic_write, 840 apic->read = lguest_apic_read;
836 .icr_read = lguest_apic_icr_read, 841 apic->write = lguest_apic_write;
837 .icr_write = lguest_apic_icr_write, 842 apic->icr_read = lguest_apic_icr_read;
838 .wait_icr_idle = lguest_apic_wait_icr_idle, 843 apic->icr_write = lguest_apic_icr_write;
839 .safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle, 844 apic->wait_icr_idle = lguest_apic_wait_icr_idle;
845 apic->safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle;
840}; 846};
841#endif 847#endif
842 848
@@ -991,10 +997,10 @@ __init void lguest_init(void)
991 997
992 /* interrupt-related operations */ 998 /* interrupt-related operations */
993 pv_irq_ops.init_IRQ = lguest_init_IRQ; 999 pv_irq_ops.init_IRQ = lguest_init_IRQ;
994 pv_irq_ops.save_fl = save_fl; 1000 pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
995 pv_irq_ops.restore_fl = restore_fl; 1001 pv_irq_ops.restore_fl = PV_CALLEE_SAVE(restore_fl);
996 pv_irq_ops.irq_disable = irq_disable; 1002 pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable);
997 pv_irq_ops.irq_enable = irq_enable; 1003 pv_irq_ops.irq_enable = PV_CALLEE_SAVE(irq_enable);
998 pv_irq_ops.safe_halt = lguest_safe_halt; 1004 pv_irq_ops.safe_halt = lguest_safe_halt;
999 1005
1000 /* init-time operations */ 1006 /* init-time operations */
@@ -1037,7 +1043,7 @@ __init void lguest_init(void)
1037 1043
1038#ifdef CONFIG_X86_LOCAL_APIC 1044#ifdef CONFIG_X86_LOCAL_APIC
1039 /* apic read/write intercepts */ 1045 /* apic read/write intercepts */
1040 apic_ops = &lguest_basic_apic_ops; 1046 set_lguest_basic_apic_ops();
1041#endif 1047#endif
1042 1048
1043 /* time operations */ 1049 /* time operations */
diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
index ad374003742f..51f1504cddd9 100644
--- a/arch/x86/lib/getuser.S
+++ b/arch/x86/lib/getuser.S
@@ -28,7 +28,7 @@
28 28
29#include <linux/linkage.h> 29#include <linux/linkage.h>
30#include <asm/dwarf2.h> 30#include <asm/dwarf2.h>
31#include <asm/page.h> 31#include <asm/page_types.h>
32#include <asm/errno.h> 32#include <asm/errno.h>
33#include <asm/asm-offsets.h> 33#include <asm/asm-offsets.h>
34#include <asm/thread_info.h> 34#include <asm/thread_info.h>
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index c22981fa2f3a..ad5441ed1b57 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -1,30 +1,38 @@
1/* Copyright 2002 Andi Kleen */ 1/* Copyright 2002 Andi Kleen */
2 2
3#include <linux/linkage.h> 3#include <linux/linkage.h>
4#include <asm/dwarf2.h> 4
5#include <asm/cpufeature.h> 5#include <asm/cpufeature.h>
6#include <asm/dwarf2.h>
6 7
7/* 8/*
8 * memcpy - Copy a memory block. 9 * memcpy - Copy a memory block.
9 * 10 *
10 * Input: 11 * Input:
11 * rdi destination 12 * rdi destination
12 * rsi source 13 * rsi source
13 * rdx count 14 * rdx count
14 * 15 *
15 * Output: 16 * Output:
16 * rax original destination 17 * rax original destination
17 */ 18 */
18 19
20/*
21 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
22 *
23 * Calls to this get patched into the kernel image via the
24 * alternative instructions framework:
25 */
19 ALIGN 26 ALIGN
20memcpy_c: 27memcpy_c:
21 CFI_STARTPROC 28 CFI_STARTPROC
22 movq %rdi,%rax 29 movq %rdi, %rax
23 movl %edx,%ecx 30
24 shrl $3,%ecx 31 movl %edx, %ecx
25 andl $7,%edx 32 shrl $3, %ecx
33 andl $7, %edx
26 rep movsq 34 rep movsq
27 movl %edx,%ecx 35 movl %edx, %ecx
28 rep movsb 36 rep movsb
29 ret 37 ret
30 CFI_ENDPROC 38 CFI_ENDPROC
@@ -33,99 +41,110 @@ ENDPROC(memcpy_c)
33ENTRY(__memcpy) 41ENTRY(__memcpy)
34ENTRY(memcpy) 42ENTRY(memcpy)
35 CFI_STARTPROC 43 CFI_STARTPROC
36 pushq %rbx
37 CFI_ADJUST_CFA_OFFSET 8
38 CFI_REL_OFFSET rbx, 0
39 movq %rdi,%rax
40 44
41 movl %edx,%ecx 45 /*
42 shrl $6,%ecx 46 * Put the number of full 64-byte blocks into %ecx.
47 * Tail portion is handled at the end:
48 */
49 movq %rdi, %rax
50 movl %edx, %ecx
51 shrl $6, %ecx
43 jz .Lhandle_tail 52 jz .Lhandle_tail
44 53
45 .p2align 4 54 .p2align 4
46.Lloop_64: 55.Lloop_64:
56 /*
57 * We decrement the loop index here - and the zero-flag is
58 * checked at the end of the loop (instructions inbetween do
59 * not change the zero flag):
60 */
47 decl %ecx 61 decl %ecx
48 62
49 movq (%rsi),%r11 63 /*
50 movq 8(%rsi),%r8 64 * Move in blocks of 4x16 bytes:
65 */
66 movq 0*8(%rsi), %r11
67 movq 1*8(%rsi), %r8
68 movq %r11, 0*8(%rdi)
69 movq %r8, 1*8(%rdi)
51 70
52 movq %r11,(%rdi) 71 movq 2*8(%rsi), %r9
53 movq %r8,1*8(%rdi) 72 movq 3*8(%rsi), %r10
73 movq %r9, 2*8(%rdi)
74 movq %r10, 3*8(%rdi)
54 75
55 movq 2*8(%rsi),%r9 76 movq 4*8(%rsi), %r11
56 movq 3*8(%rsi),%r10 77 movq 5*8(%rsi), %r8
78 movq %r11, 4*8(%rdi)
79 movq %r8, 5*8(%rdi)
57 80
58 movq %r9,2*8(%rdi) 81 movq 6*8(%rsi), %r9
59 movq %r10,3*8(%rdi) 82 movq 7*8(%rsi), %r10
83 movq %r9, 6*8(%rdi)
84 movq %r10, 7*8(%rdi)
60 85
61 movq 4*8(%rsi),%r11 86 leaq 64(%rsi), %rsi
62 movq 5*8(%rsi),%r8 87 leaq 64(%rdi), %rdi
63 88
64 movq %r11,4*8(%rdi)
65 movq %r8,5*8(%rdi)
66
67 movq 6*8(%rsi),%r9
68 movq 7*8(%rsi),%r10
69
70 movq %r9,6*8(%rdi)
71 movq %r10,7*8(%rdi)
72
73 leaq 64(%rsi),%rsi
74 leaq 64(%rdi),%rdi
75 jnz .Lloop_64 89 jnz .Lloop_64
76 90
77.Lhandle_tail: 91.Lhandle_tail:
78 movl %edx,%ecx 92 movl %edx, %ecx
79 andl $63,%ecx 93 andl $63, %ecx
80 shrl $3,%ecx 94 shrl $3, %ecx
81 jz .Lhandle_7 95 jz .Lhandle_7
96
82 .p2align 4 97 .p2align 4
83.Lloop_8: 98.Lloop_8:
84 decl %ecx 99 decl %ecx
85 movq (%rsi),%r8 100 movq (%rsi), %r8
86 movq %r8,(%rdi) 101 movq %r8, (%rdi)
87 leaq 8(%rdi),%rdi 102 leaq 8(%rdi), %rdi
88 leaq 8(%rsi),%rsi 103 leaq 8(%rsi), %rsi
89 jnz .Lloop_8 104 jnz .Lloop_8
90 105
91.Lhandle_7: 106.Lhandle_7:
92 movl %edx,%ecx 107 movl %edx, %ecx
93 andl $7,%ecx 108 andl $7, %ecx
94 jz .Lende 109 jz .Lend
110
95 .p2align 4 111 .p2align 4
96.Lloop_1: 112.Lloop_1:
97 movb (%rsi),%r8b 113 movb (%rsi), %r8b
98 movb %r8b,(%rdi) 114 movb %r8b, (%rdi)
99 incq %rdi 115 incq %rdi
100 incq %rsi 116 incq %rsi
101 decl %ecx 117 decl %ecx
102 jnz .Lloop_1 118 jnz .Lloop_1
103 119
104.Lende: 120.Lend:
105 popq %rbx
106 CFI_ADJUST_CFA_OFFSET -8
107 CFI_RESTORE rbx
108 ret 121 ret
109.Lfinal:
110 CFI_ENDPROC 122 CFI_ENDPROC
111ENDPROC(memcpy) 123ENDPROC(memcpy)
112ENDPROC(__memcpy) 124ENDPROC(__memcpy)
113 125
114 /* Some CPUs run faster using the string copy instructions. 126 /*
115 It is also a lot simpler. Use this when possible */ 127 * Some CPUs run faster using the string copy instructions.
128 * It is also a lot simpler. Use this when possible:
129 */
116 130
117 .section .altinstr_replacement,"ax" 131 .section .altinstr_replacement, "ax"
1181: .byte 0xeb /* jmp <disp8> */ 1321: .byte 0xeb /* jmp <disp8> */
119 .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */ 133 .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */
1202: 1342:
121 .previous 135 .previous
122 .section .altinstructions,"a" 136
137 .section .altinstructions, "a"
123 .align 8 138 .align 8
124 .quad memcpy 139 .quad memcpy
125 .quad 1b 140 .quad 1b
126 .byte X86_FEATURE_REP_GOOD 141 .byte X86_FEATURE_REP_GOOD
127 /* Replace only beginning, memcpy is used to apply alternatives, so it 142
128 * is silly to overwrite itself with nops - reboot is only outcome... */ 143 /*
144 * Replace only beginning, memcpy is used to apply alternatives,
145 * so it is silly to overwrite itself with nops - reboot is the
146 * only outcome...
147 */
129 .byte 2b - 1b 148 .byte 2b - 1b
130 .byte 2b - 1b 149 .byte 2b - 1b
131 .previous 150 .previous
diff --git a/arch/x86/mach-default/Makefile b/arch/x86/mach-default/Makefile
deleted file mode 100644
index 012fe34459e6..000000000000
--- a/arch/x86/mach-default/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
1#
2# Makefile for the linux kernel.
3#
4
5obj-y := setup.o
diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c
deleted file mode 100644
index 50b591871128..000000000000
--- a/arch/x86/mach-default/setup.c
+++ /dev/null
@@ -1,174 +0,0 @@
1/*
2 * Machine specific setup for generic
3 */
4
5#include <linux/smp.h>
6#include <linux/init.h>
7#include <linux/interrupt.h>
8#include <asm/acpi.h>
9#include <asm/arch_hooks.h>
10#include <asm/e820.h>
11#include <asm/setup.h>
12
13#include <mach_ipi.h>
14
15#ifdef CONFIG_HOTPLUG_CPU
16#define DEFAULT_SEND_IPI (1)
17#else
18#define DEFAULT_SEND_IPI (0)
19#endif
20
21int no_broadcast = DEFAULT_SEND_IPI;
22
23/**
24 * pre_intr_init_hook - initialisation prior to setting up interrupt vectors
25 *
26 * Description:
27 * Perform any necessary interrupt initialisation prior to setting up
28 * the "ordinary" interrupt call gates. For legacy reasons, the ISA
29 * interrupts should be initialised here if the machine emulates a PC
30 * in any way.
31 **/
32void __init pre_intr_init_hook(void)
33{
34 if (x86_quirks->arch_pre_intr_init) {
35 if (x86_quirks->arch_pre_intr_init())
36 return;
37 }
38 init_ISA_irqs();
39}
40
41/*
42 * IRQ2 is cascade interrupt to second interrupt controller
43 */
44static struct irqaction irq2 = {
45 .handler = no_action,
46 .mask = CPU_MASK_NONE,
47 .name = "cascade",
48};
49
50/**
51 * intr_init_hook - post gate setup interrupt initialisation
52 *
53 * Description:
54 * Fill in any interrupts that may have been left out by the general
55 * init_IRQ() routine. interrupts having to do with the machine rather
56 * than the devices on the I/O bus (like APIC interrupts in intel MP
57 * systems) are started here.
58 **/
59void __init intr_init_hook(void)
60{
61 if (x86_quirks->arch_intr_init) {
62 if (x86_quirks->arch_intr_init())
63 return;
64 }
65 if (!acpi_ioapic)
66 setup_irq(2, &irq2);
67
68}
69
70/**
71 * pre_setup_arch_hook - hook called prior to any setup_arch() execution
72 *
73 * Description:
74 * generally used to activate any machine specific identification
75 * routines that may be needed before setup_arch() runs. On Voyager
76 * this is used to get the board revision and type.
77 **/
78void __init pre_setup_arch_hook(void)
79{
80}
81
82/**
83 * trap_init_hook - initialise system specific traps
84 *
85 * Description:
86 * Called as the final act of trap_init(). Used in VISWS to initialise
87 * the various board specific APIC traps.
88 **/
89void __init trap_init_hook(void)
90{
91 if (x86_quirks->arch_trap_init) {
92 if (x86_quirks->arch_trap_init())
93 return;
94 }
95}
96
97static struct irqaction irq0 = {
98 .handler = timer_interrupt,
99 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
100 .mask = CPU_MASK_NONE,
101 .name = "timer"
102};
103
104/**
105 * pre_time_init_hook - do any specific initialisations before.
106 *
107 **/
108void __init pre_time_init_hook(void)
109{
110 if (x86_quirks->arch_pre_time_init)
111 x86_quirks->arch_pre_time_init();
112}
113
114/**
115 * time_init_hook - do any specific initialisations for the system timer.
116 *
117 * Description:
118 * Must plug the system timer interrupt source at HZ into the IRQ listed
119 * in irq_vectors.h:TIMER_IRQ
120 **/
121void __init time_init_hook(void)
122{
123 if (x86_quirks->arch_time_init) {
124 /*
125 * A nonzero return code does not mean failure, it means
126 * that the architecture quirk does not want any
127 * generic (timer) setup to be performed after this:
128 */
129 if (x86_quirks->arch_time_init())
130 return;
131 }
132
133 irq0.mask = cpumask_of_cpu(0);
134 setup_irq(0, &irq0);
135}
136
137#ifdef CONFIG_MCA
138/**
139 * mca_nmi_hook - hook into MCA specific NMI chain
140 *
141 * Description:
142 * The MCA (Microchannel Architecture) has an NMI chain for NMI sources
143 * along the MCA bus. Use this to hook into that chain if you will need
144 * it.
145 **/
146void mca_nmi_hook(void)
147{
148 /*
149 * If I recall correctly, there's a whole bunch of other things that
150 * we can do to check for NMI problems, but that's all I know about
151 * at the moment.
152 */
153 pr_warning("NMI generated from unknown source!\n");
154}
155#endif
156
157static __init int no_ipi_broadcast(char *str)
158{
159 get_option(&str, &no_broadcast);
160 pr_info("Using %s mode\n",
161 no_broadcast ? "No IPI Broadcast" : "IPI Broadcast");
162 return 1;
163}
164__setup("no_ipi_broadcast=", no_ipi_broadcast);
165
166static int __init print_ipi_mode(void)
167{
168 pr_info("Using IPI %s mode\n",
169 no_broadcast ? "No-Shortcut" : "Shortcut");
170 return 0;
171}
172
173late_initcall(print_ipi_mode);
174
diff --git a/arch/x86/mach-generic/Makefile b/arch/x86/mach-generic/Makefile
deleted file mode 100644
index 6730f4e7c744..000000000000
--- a/arch/x86/mach-generic/Makefile
+++ /dev/null
@@ -1,11 +0,0 @@
1#
2# Makefile for the generic architecture
3#
4
5EXTRA_CFLAGS := -Iarch/x86/kernel
6
7obj-y := probe.o default.o
8obj-$(CONFIG_X86_NUMAQ) += numaq.o
9obj-$(CONFIG_X86_SUMMIT) += summit.o
10obj-$(CONFIG_X86_BIGSMP) += bigsmp.o
11obj-$(CONFIG_X86_ES7000) += es7000.o
diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c
deleted file mode 100644
index bc4c7840b2a8..000000000000
--- a/arch/x86/mach-generic/bigsmp.c
+++ /dev/null
@@ -1,60 +0,0 @@
1/*
2 * APIC driver for "bigsmp" XAPIC machines with more than 8 virtual CPUs.
3 * Drives the local APIC in "clustered mode".
4 */
5#define APIC_DEFINITION 1
6#include <linux/threads.h>
7#include <linux/cpumask.h>
8#include <asm/mpspec.h>
9#include <asm/genapic.h>
10#include <asm/fixmap.h>
11#include <asm/apicdef.h>
12#include <linux/kernel.h>
13#include <linux/init.h>
14#include <linux/dmi.h>
15#include <asm/bigsmp/apicdef.h>
16#include <linux/smp.h>
17#include <asm/bigsmp/apic.h>
18#include <asm/bigsmp/ipi.h>
19#include <asm/mach-default/mach_mpparse.h>
20#include <asm/mach-default/mach_wakecpu.h>
21
22static int dmi_bigsmp; /* can be set by dmi scanners */
23
24static int hp_ht_bigsmp(const struct dmi_system_id *d)
25{
26 printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident);
27 dmi_bigsmp = 1;
28 return 0;
29}
30
31
32static const struct dmi_system_id bigsmp_dmi_table[] = {
33 { hp_ht_bigsmp, "HP ProLiant DL760 G2",
34 { DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
35 DMI_MATCH(DMI_BIOS_VERSION, "P44-"),}
36 },
37
38 { hp_ht_bigsmp, "HP ProLiant DL740",
39 { DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
40 DMI_MATCH(DMI_BIOS_VERSION, "P47-"),}
41 },
42 { }
43};
44
45static void vector_allocation_domain(int cpu, cpumask_t *retmask)
46{
47 cpus_clear(*retmask);
48 cpu_set(cpu, *retmask);
49}
50
51static int probe_bigsmp(void)
52{
53 if (def_to_bigsmp)
54 dmi_bigsmp = 1;
55 else
56 dmi_check_system(bigsmp_dmi_table);
57 return dmi_bigsmp;
58}
59
60struct genapic apic_bigsmp = APIC_INIT("bigsmp", probe_bigsmp);
diff --git a/arch/x86/mach-generic/default.c b/arch/x86/mach-generic/default.c
deleted file mode 100644
index e63a4a76d8cd..000000000000
--- a/arch/x86/mach-generic/default.c
+++ /dev/null
@@ -1,27 +0,0 @@
1/*
2 * Default generic APIC driver. This handles up to 8 CPUs.
3 */
4#define APIC_DEFINITION 1
5#include <linux/threads.h>
6#include <linux/cpumask.h>
7#include <asm/mpspec.h>
8#include <asm/mach-default/mach_apicdef.h>
9#include <asm/genapic.h>
10#include <asm/fixmap.h>
11#include <asm/apicdef.h>
12#include <linux/kernel.h>
13#include <linux/string.h>
14#include <linux/smp.h>
15#include <linux/init.h>
16#include <asm/mach-default/mach_apic.h>
17#include <asm/mach-default/mach_ipi.h>
18#include <asm/mach-default/mach_mpparse.h>
19#include <asm/mach-default/mach_wakecpu.h>
20
21/* should be called last. */
22static int probe_default(void)
23{
24 return 1;
25}
26
27struct genapic apic_default = APIC_INIT("default", probe_default);
diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c
deleted file mode 100644
index c2ded1448024..000000000000
--- a/arch/x86/mach-generic/es7000.c
+++ /dev/null
@@ -1,103 +0,0 @@
1/*
2 * APIC driver for the Unisys ES7000 chipset.
3 */
4#define APIC_DEFINITION 1
5#include <linux/threads.h>
6#include <linux/cpumask.h>
7#include <asm/mpspec.h>
8#include <asm/genapic.h>
9#include <asm/fixmap.h>
10#include <asm/apicdef.h>
11#include <linux/kernel.h>
12#include <linux/string.h>
13#include <linux/init.h>
14#include <asm/es7000/apicdef.h>
15#include <linux/smp.h>
16#include <asm/es7000/apic.h>
17#include <asm/es7000/ipi.h>
18#include <asm/es7000/mpparse.h>
19#include <asm/mach-default/mach_wakecpu.h>
20
21void __init es7000_update_genapic_to_cluster(void)
22{
23 genapic->target_cpus = target_cpus_cluster;
24 genapic->int_delivery_mode = INT_DELIVERY_MODE_CLUSTER;
25 genapic->int_dest_mode = INT_DEST_MODE_CLUSTER;
26 genapic->no_balance_irq = NO_BALANCE_IRQ_CLUSTER;
27
28 genapic->init_apic_ldr = init_apic_ldr_cluster;
29
30 genapic->cpu_mask_to_apicid = cpu_mask_to_apicid_cluster;
31}
32
33static int probe_es7000(void)
34{
35 /* probed later in mptable/ACPI hooks */
36 return 0;
37}
38
39extern void es7000_sw_apic(void);
40static void __init enable_apic_mode(void)
41{
42 es7000_sw_apic();
43 return;
44}
45
46static __init int
47mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
48{
49 if (mpc->oemptr) {
50 struct mpc_oemtable *oem_table =
51 (struct mpc_oemtable *)mpc->oemptr;
52 if (!strncmp(oem, "UNISYS", 6))
53 return parse_unisys_oem((char *)oem_table);
54 }
55 return 0;
56}
57
58#ifdef CONFIG_ACPI
59/* Hook from generic ACPI tables.c */
60static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
61{
62 unsigned long oem_addr = 0;
63 int check_dsdt;
64 int ret = 0;
65
66 /* check dsdt at first to avoid clear fix_map for oem_addr */
67 check_dsdt = es7000_check_dsdt();
68
69 if (!find_unisys_acpi_oem_table(&oem_addr)) {
70 if (check_dsdt)
71 ret = parse_unisys_oem((char *)oem_addr);
72 else {
73 setup_unisys();
74 ret = 1;
75 }
76 /*
77 * we need to unmap it
78 */
79 unmap_unisys_acpi_oem_table(oem_addr);
80 }
81 return ret;
82}
83#else
84static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
85{
86 return 0;
87}
88#endif
89
90static void vector_allocation_domain(int cpu, cpumask_t *retmask)
91{
92 /* Careful. Some cpus do not strictly honor the set of cpus
93 * specified in the interrupt destination when using lowest
94 * priority interrupt delivery mode.
95 *
96 * In particular there was a hyperthreading cpu observed to
97 * deliver interrupts to the wrong hyperthread when only one
98 * hyperthread was specified in the interrupt desitination.
99 */
100 *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } };
101}
102
103struct genapic __initdata_refok apic_es7000 = APIC_INIT("es7000", probe_es7000);
diff --git a/arch/x86/mach-generic/numaq.c b/arch/x86/mach-generic/numaq.c
deleted file mode 100644
index 3679e2255645..000000000000
--- a/arch/x86/mach-generic/numaq.c
+++ /dev/null
@@ -1,53 +0,0 @@
1/*
2 * APIC driver for the IBM NUMAQ chipset.
3 */
4#define APIC_DEFINITION 1
5#include <linux/threads.h>
6#include <linux/cpumask.h>
7#include <asm/mpspec.h>
8#include <asm/genapic.h>
9#include <asm/fixmap.h>
10#include <asm/apicdef.h>
11#include <linux/kernel.h>
12#include <linux/string.h>
13#include <linux/init.h>
14#include <asm/numaq/apicdef.h>
15#include <linux/smp.h>
16#include <asm/numaq/apic.h>
17#include <asm/numaq/ipi.h>
18#include <asm/numaq/mpparse.h>
19#include <asm/numaq/wakecpu.h>
20#include <asm/numaq.h>
21
22static int mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
23{
24 numaq_mps_oem_check(mpc, oem, productid);
25 return found_numaq;
26}
27
28static int probe_numaq(void)
29{
30 /* already know from get_memcfg_numaq() */
31 return found_numaq;
32}
33
34/* Hook from generic ACPI tables.c */
35static int acpi_madt_oem_check(char *oem_id, char *oem_table_id)
36{
37 return 0;
38}
39
40static void vector_allocation_domain(int cpu, cpumask_t *retmask)
41{
42 /* Careful. Some cpus do not strictly honor the set of cpus
43 * specified in the interrupt destination when using lowest
44 * priority interrupt delivery mode.
45 *
46 * In particular there was a hyperthreading cpu observed to
47 * deliver interrupts to the wrong hyperthread when only one
48 * hyperthread was specified in the interrupt desitination.
49 */
50 *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } };
51}
52
53struct genapic apic_numaq = APIC_INIT("NUMAQ", probe_numaq);
diff --git a/arch/x86/mach-generic/probe.c b/arch/x86/mach-generic/probe.c
deleted file mode 100644
index 15a38daef1a8..000000000000
--- a/arch/x86/mach-generic/probe.c
+++ /dev/null
@@ -1,152 +0,0 @@
1/*
2 * Copyright 2003 Andi Kleen, SuSE Labs.
3 * Subject to the GNU Public License, v.2
4 *
5 * Generic x86 APIC driver probe layer.
6 */
7#include <linux/threads.h>
8#include <linux/cpumask.h>
9#include <linux/string.h>
10#include <linux/kernel.h>
11#include <linux/ctype.h>
12#include <linux/init.h>
13#include <linux/errno.h>
14#include <asm/fixmap.h>
15#include <asm/mpspec.h>
16#include <asm/apicdef.h>
17#include <asm/genapic.h>
18#include <asm/setup.h>
19
20extern struct genapic apic_numaq;
21extern struct genapic apic_summit;
22extern struct genapic apic_bigsmp;
23extern struct genapic apic_es7000;
24extern struct genapic apic_default;
25
26struct genapic *genapic = &apic_default;
27
28static struct genapic *apic_probe[] __initdata = {
29#ifdef CONFIG_X86_NUMAQ
30 &apic_numaq,
31#endif
32#ifdef CONFIG_X86_SUMMIT
33 &apic_summit,
34#endif
35#ifdef CONFIG_X86_BIGSMP
36 &apic_bigsmp,
37#endif
38#ifdef CONFIG_X86_ES7000
39 &apic_es7000,
40#endif
41 &apic_default, /* must be last */
42 NULL,
43};
44
45static int cmdline_apic __initdata;
46static int __init parse_apic(char *arg)
47{
48 int i;
49
50 if (!arg)
51 return -EINVAL;
52
53 for (i = 0; apic_probe[i]; i++) {
54 if (!strcmp(apic_probe[i]->name, arg)) {
55 genapic = apic_probe[i];
56 cmdline_apic = 1;
57 return 0;
58 }
59 }
60
61 if (x86_quirks->update_genapic)
62 x86_quirks->update_genapic();
63
64 /* Parsed again by __setup for debug/verbose */
65 return 0;
66}
67early_param("apic", parse_apic);
68
69void __init generic_bigsmp_probe(void)
70{
71#ifdef CONFIG_X86_BIGSMP
72 /*
73 * This routine is used to switch to bigsmp mode when
74 * - There is no apic= option specified by the user
75 * - generic_apic_probe() has chosen apic_default as the sub_arch
76 * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support
77 */
78
79 if (!cmdline_apic && genapic == &apic_default) {
80 if (apic_bigsmp.probe()) {
81 genapic = &apic_bigsmp;
82 if (x86_quirks->update_genapic)
83 x86_quirks->update_genapic();
84 printk(KERN_INFO "Overriding APIC driver with %s\n",
85 genapic->name);
86 }
87 }
88#endif
89}
90
91void __init generic_apic_probe(void)
92{
93 if (!cmdline_apic) {
94 int i;
95 for (i = 0; apic_probe[i]; i++) {
96 if (apic_probe[i]->probe()) {
97 genapic = apic_probe[i];
98 break;
99 }
100 }
101 /* Not visible without early console */
102 if (!apic_probe[i])
103 panic("Didn't find an APIC driver");
104
105 if (x86_quirks->update_genapic)
106 x86_quirks->update_genapic();
107 }
108 printk(KERN_INFO "Using APIC driver %s\n", genapic->name);
109}
110
111/* These functions can switch the APIC even after the initial ->probe() */
112
113int __init mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
114{
115 int i;
116 for (i = 0; apic_probe[i]; ++i) {
117 if (apic_probe[i]->mps_oem_check(mpc, oem, productid)) {
118 if (!cmdline_apic) {
119 genapic = apic_probe[i];
120 if (x86_quirks->update_genapic)
121 x86_quirks->update_genapic();
122 printk(KERN_INFO "Switched to APIC driver `%s'.\n",
123 genapic->name);
124 }
125 return 1;
126 }
127 }
128 return 0;
129}
130
131int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
132{
133 int i;
134 for (i = 0; apic_probe[i]; ++i) {
135 if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) {
136 if (!cmdline_apic) {
137 genapic = apic_probe[i];
138 if (x86_quirks->update_genapic)
139 x86_quirks->update_genapic();
140 printk(KERN_INFO "Switched to APIC driver `%s'.\n",
141 genapic->name);
142 }
143 return 1;
144 }
145 }
146 return 0;
147}
148
149int hard_smp_processor_id(void)
150{
151 return genapic->get_apic_id(*(unsigned long *)(APIC_BASE+APIC_ID));
152}
diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c
deleted file mode 100644
index 2821ffc188b5..000000000000
--- a/arch/x86/mach-generic/summit.c
+++ /dev/null
@@ -1,40 +0,0 @@
1/*
2 * APIC driver for the IBM "Summit" chipset.
3 */
4#define APIC_DEFINITION 1
5#include <linux/threads.h>
6#include <linux/cpumask.h>
7#include <asm/mpspec.h>
8#include <asm/genapic.h>
9#include <asm/fixmap.h>
10#include <asm/apicdef.h>
11#include <linux/kernel.h>
12#include <linux/string.h>
13#include <linux/init.h>
14#include <asm/summit/apicdef.h>
15#include <linux/smp.h>
16#include <asm/summit/apic.h>
17#include <asm/summit/ipi.h>
18#include <asm/summit/mpparse.h>
19#include <asm/mach-default/mach_wakecpu.h>
20
21static int probe_summit(void)
22{
23 /* probed later in mptable/ACPI hooks */
24 return 0;
25}
26
27static void vector_allocation_domain(int cpu, cpumask_t *retmask)
28{
29 /* Careful. Some cpus do not strictly honor the set of cpus
30 * specified in the interrupt destination when using lowest
31 * priority interrupt delivery mode.
32 *
33 * In particular there was a hyperthreading cpu observed to
34 * deliver interrupts to the wrong hyperthread when only one
35 * hyperthread was specified in the interrupt desitination.
36 */
37 *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } };
38}
39
40struct genapic apic_summit = APIC_INIT("summit", probe_summit);
diff --git a/arch/x86/mach-rdc321x/Makefile b/arch/x86/mach-rdc321x/Makefile
deleted file mode 100644
index 8325b4ca431c..000000000000
--- a/arch/x86/mach-rdc321x/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
1#
2# Makefile for the RDC321x specific parts of the kernel
3#
4obj-$(CONFIG_X86_RDC321X) := gpio.o platform.o
5
diff --git a/arch/x86/mach-rdc321x/gpio.c b/arch/x86/mach-rdc321x/gpio.c
deleted file mode 100644
index 247f33d3a407..000000000000
--- a/arch/x86/mach-rdc321x/gpio.c
+++ /dev/null
@@ -1,194 +0,0 @@
1/*
2 * GPIO support for RDC SoC R3210/R8610
3 *
4 * Copyright (C) 2007, Florian Fainelli <florian@openwrt.org>
5 * Copyright (C) 2008, Volker Weiss <dev@tintuc.de>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 */
22
23
24#include <linux/spinlock.h>
25#include <linux/io.h>
26#include <linux/types.h>
27#include <linux/module.h>
28
29#include <asm/gpio.h>
30#include <asm/mach-rdc321x/rdc321x_defs.h>
31
32
33/* spin lock to protect our private copy of GPIO data register plus
34 the access to PCI conf registers. */
35static DEFINE_SPINLOCK(gpio_lock);
36
37/* copy of GPIO data registers */
38static u32 gpio_data_reg1;
39static u32 gpio_data_reg2;
40
41static u32 gpio_request_data[2];
42
43
44static inline void rdc321x_conf_write(unsigned addr, u32 value)
45{
46 outl((1 << 31) | (7 << 11) | addr, RDC3210_CFGREG_ADDR);
47 outl(value, RDC3210_CFGREG_DATA);
48}
49
50static inline void rdc321x_conf_or(unsigned addr, u32 value)
51{
52 outl((1 << 31) | (7 << 11) | addr, RDC3210_CFGREG_ADDR);
53 value |= inl(RDC3210_CFGREG_DATA);
54 outl(value, RDC3210_CFGREG_DATA);
55}
56
57static inline u32 rdc321x_conf_read(unsigned addr)
58{
59 outl((1 << 31) | (7 << 11) | addr, RDC3210_CFGREG_ADDR);
60
61 return inl(RDC3210_CFGREG_DATA);
62}
63
64/* configure pin as GPIO */
65static void rdc321x_configure_gpio(unsigned gpio)
66{
67 unsigned long flags;
68
69 spin_lock_irqsave(&gpio_lock, flags);
70 rdc321x_conf_or(gpio < 32
71 ? RDC321X_GPIO_CTRL_REG1 : RDC321X_GPIO_CTRL_REG2,
72 1 << (gpio & 0x1f));
73 spin_unlock_irqrestore(&gpio_lock, flags);
74}
75
76/* initially setup the 2 copies of the gpio data registers.
77 This function must be called by the platform setup code. */
78void __init rdc321x_gpio_setup()
79{
80 /* this might not be, what others (BIOS, bootloader, etc.)
81 wrote to these registers before, but it's a good guess. Still
82 better than just using 0xffffffff. */
83
84 gpio_data_reg1 = rdc321x_conf_read(RDC321X_GPIO_DATA_REG1);
85 gpio_data_reg2 = rdc321x_conf_read(RDC321X_GPIO_DATA_REG2);
86}
87
88/* determine, if gpio number is valid */
89static inline int rdc321x_is_gpio(unsigned gpio)
90{
91 return gpio <= RDC321X_MAX_GPIO;
92}
93
94/* request GPIO */
95int rdc_gpio_request(unsigned gpio, const char *label)
96{
97 unsigned long flags;
98
99 if (!rdc321x_is_gpio(gpio))
100 return -EINVAL;
101
102 spin_lock_irqsave(&gpio_lock, flags);
103 if (gpio_request_data[(gpio & 0x20) ? 1 : 0] & (1 << (gpio & 0x1f)))
104 goto inuse;
105 gpio_request_data[(gpio & 0x20) ? 1 : 0] |= (1 << (gpio & 0x1f));
106 spin_unlock_irqrestore(&gpio_lock, flags);
107
108 return 0;
109inuse:
110 spin_unlock_irqrestore(&gpio_lock, flags);
111 return -EINVAL;
112}
113EXPORT_SYMBOL(rdc_gpio_request);
114
115/* release previously-claimed GPIO */
116void rdc_gpio_free(unsigned gpio)
117{
118 unsigned long flags;
119
120 if (!rdc321x_is_gpio(gpio))
121 return;
122
123 spin_lock_irqsave(&gpio_lock, flags);
124 gpio_request_data[(gpio & 0x20) ? 1 : 0] &= ~(1 << (gpio & 0x1f));
125 spin_unlock_irqrestore(&gpio_lock, flags);
126}
127EXPORT_SYMBOL(rdc_gpio_free);
128
129/* read GPIO pin */
130int rdc_gpio_get_value(unsigned gpio)
131{
132 u32 reg;
133 unsigned long flags;
134
135 spin_lock_irqsave(&gpio_lock, flags);
136 reg = rdc321x_conf_read(gpio < 32
137 ? RDC321X_GPIO_DATA_REG1 : RDC321X_GPIO_DATA_REG2);
138 spin_unlock_irqrestore(&gpio_lock, flags);
139
140 return (1 << (gpio & 0x1f)) & reg ? 1 : 0;
141}
142EXPORT_SYMBOL(rdc_gpio_get_value);
143
144/* set GPIO pin to value */
145void rdc_gpio_set_value(unsigned gpio, int value)
146{
147 unsigned long flags;
148 u32 reg;
149
150 reg = 1 << (gpio & 0x1f);
151 if (gpio < 32) {
152 spin_lock_irqsave(&gpio_lock, flags);
153 if (value)
154 gpio_data_reg1 |= reg;
155 else
156 gpio_data_reg1 &= ~reg;
157 rdc321x_conf_write(RDC321X_GPIO_DATA_REG1, gpio_data_reg1);
158 spin_unlock_irqrestore(&gpio_lock, flags);
159 } else {
160 spin_lock_irqsave(&gpio_lock, flags);
161 if (value)
162 gpio_data_reg2 |= reg;
163 else
164 gpio_data_reg2 &= ~reg;
165 rdc321x_conf_write(RDC321X_GPIO_DATA_REG2, gpio_data_reg2);
166 spin_unlock_irqrestore(&gpio_lock, flags);
167 }
168}
169EXPORT_SYMBOL(rdc_gpio_set_value);
170
171/* configure GPIO pin as input */
172int rdc_gpio_direction_input(unsigned gpio)
173{
174 if (!rdc321x_is_gpio(gpio))
175 return -EINVAL;
176
177 rdc321x_configure_gpio(gpio);
178
179 return 0;
180}
181EXPORT_SYMBOL(rdc_gpio_direction_input);
182
183/* configure GPIO pin as output and set value */
184int rdc_gpio_direction_output(unsigned gpio, int value)
185{
186 if (!rdc321x_is_gpio(gpio))
187 return -EINVAL;
188
189 gpio_set_value(gpio, value);
190 rdc321x_configure_gpio(gpio);
191
192 return 0;
193}
194EXPORT_SYMBOL(rdc_gpio_direction_output);
diff --git a/arch/x86/mach-rdc321x/platform.c b/arch/x86/mach-rdc321x/platform.c
deleted file mode 100644
index 4f4e50c3ad3b..000000000000
--- a/arch/x86/mach-rdc321x/platform.c
+++ /dev/null
@@ -1,69 +0,0 @@
1/*
2 * Generic RDC321x platform devices
3 *
4 * Copyright (C) 2007 Florian Fainelli <florian@openwrt.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the
18 * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19 * Boston, MA 02110-1301, USA.
20 *
21 */
22
23#include <linux/init.h>
24#include <linux/kernel.h>
25#include <linux/list.h>
26#include <linux/device.h>
27#include <linux/platform_device.h>
28#include <linux/leds.h>
29
30#include <asm/gpio.h>
31
32/* LEDS */
33static struct gpio_led default_leds[] = {
34 { .name = "rdc:dmz", .gpio = 1, },
35};
36
37static struct gpio_led_platform_data rdc321x_led_data = {
38 .num_leds = ARRAY_SIZE(default_leds),
39 .leds = default_leds,
40};
41
42static struct platform_device rdc321x_leds = {
43 .name = "leds-gpio",
44 .id = -1,
45 .dev = {
46 .platform_data = &rdc321x_led_data,
47 }
48};
49
50/* Watchdog */
51static struct platform_device rdc321x_wdt = {
52 .name = "rdc321x-wdt",
53 .id = -1,
54 .num_resources = 0,
55};
56
57static struct platform_device *rdc321x_devs[] = {
58 &rdc321x_leds,
59 &rdc321x_wdt
60};
61
62static int __init rdc_board_setup(void)
63{
64 rdc321x_gpio_setup();
65
66 return platform_add_devices(rdc321x_devs, ARRAY_SIZE(rdc321x_devs));
67}
68
69arch_initcall(rdc_board_setup);
diff --git a/arch/x86/mach-voyager/Makefile b/arch/x86/mach-voyager/Makefile
deleted file mode 100644
index 15c250b371d3..000000000000
--- a/arch/x86/mach-voyager/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
1#
2# Makefile for the linux kernel.
3#
4
5EXTRA_CFLAGS := -Iarch/x86/kernel
6obj-y := setup.o voyager_basic.o voyager_thread.o
7
8obj-$(CONFIG_SMP) += voyager_smp.o voyager_cat.o
diff --git a/arch/x86/mach-voyager/setup.c b/arch/x86/mach-voyager/setup.c
deleted file mode 100644
index 8e5118371f0f..000000000000
--- a/arch/x86/mach-voyager/setup.c
+++ /dev/null
@@ -1,118 +0,0 @@
1/*
2 * Machine specific setup for generic
3 */
4
5#include <linux/init.h>
6#include <linux/interrupt.h>
7#include <asm/arch_hooks.h>
8#include <asm/voyager.h>
9#include <asm/e820.h>
10#include <asm/io.h>
11#include <asm/setup.h>
12
13void __init pre_intr_init_hook(void)
14{
15 init_ISA_irqs();
16}
17
18/*
19 * IRQ2 is cascade interrupt to second interrupt controller
20 */
21static struct irqaction irq2 = {
22 .handler = no_action,
23 .mask = CPU_MASK_NONE,
24 .name = "cascade",
25};
26
27void __init intr_init_hook(void)
28{
29#ifdef CONFIG_SMP
30 voyager_smp_intr_init();
31#endif
32
33 setup_irq(2, &irq2);
34}
35
36static void voyager_disable_tsc(void)
37{
38 /* Voyagers run their CPUs from independent clocks, so disable
39 * the TSC code because we can't sync them */
40 setup_clear_cpu_cap(X86_FEATURE_TSC);
41}
42
43void __init pre_setup_arch_hook(void)
44{
45 voyager_disable_tsc();
46}
47
48void __init pre_time_init_hook(void)
49{
50 voyager_disable_tsc();
51}
52
53void __init trap_init_hook(void)
54{
55}
56
57static struct irqaction irq0 = {
58 .handler = timer_interrupt,
59 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
60 .mask = CPU_MASK_NONE,
61 .name = "timer"
62};
63
64void __init time_init_hook(void)
65{
66 irq0.mask = cpumask_of_cpu(safe_smp_processor_id());
67 setup_irq(0, &irq0);
68}
69
70/* Hook for machine specific memory setup. */
71
72char *__init machine_specific_memory_setup(void)
73{
74 char *who;
75 int new_nr;
76
77 who = "NOT VOYAGER";
78
79 if (voyager_level == 5) {
80 __u32 addr, length;
81 int i;
82
83 who = "Voyager-SUS";
84
85 e820.nr_map = 0;
86 for (i = 0; voyager_memory_detect(i, &addr, &length); i++) {
87 e820_add_region(addr, length, E820_RAM);
88 }
89 return who;
90 } else if (voyager_level == 4) {
91 __u32 tom;
92 __u16 catbase = inb(VOYAGER_SSPB_RELOCATION_PORT) << 8;
93 /* select the DINO config space */
94 outb(VOYAGER_DINO, VOYAGER_CAT_CONFIG_PORT);
95 /* Read DINO top of memory register */
96 tom = ((inb(catbase + 0x4) & 0xf0) << 16)
97 + ((inb(catbase + 0x5) & 0x7f) << 24);
98
99 if (inb(catbase) != VOYAGER_DINO) {
100 printk(KERN_ERR
101 "Voyager: Failed to get DINO for L4, setting tom to EXT_MEM_K\n");
102 tom = (boot_params.screen_info.ext_mem_k) << 10;
103 }
104 who = "Voyager-TOM";
105 e820_add_region(0, 0x9f000, E820_RAM);
106 /* map from 1M to top of memory */
107 e820_add_region(1 * 1024 * 1024, tom - 1 * 1024 * 1024,
108 E820_RAM);
109 /* FIXME: Should check the ASICs to see if I need to
110 * take out the 8M window. Just do it at the moment
111 * */
112 e820_add_region(8 * 1024 * 1024, 8 * 1024 * 1024,
113 E820_RESERVED);
114 return who;
115 }
116
117 return default_machine_specific_memory_setup();
118}
diff --git a/arch/x86/mach-voyager/voyager_basic.c b/arch/x86/mach-voyager/voyager_basic.c
deleted file mode 100644
index 46d6f8067690..000000000000
--- a/arch/x86/mach-voyager/voyager_basic.c
+++ /dev/null
@@ -1,317 +0,0 @@
1/* Copyright (C) 1999,2001
2 *
3 * Author: J.E.J.Bottomley@HansenPartnership.com
4 *
5 * This file contains all the voyager specific routines for getting
6 * initialisation of the architecture to function. For additional
7 * features see:
8 *
9 * voyager_cat.c - Voyager CAT bus interface
10 * voyager_smp.c - Voyager SMP hal (emulates linux smp.c)
11 */
12
13#include <linux/module.h>
14#include <linux/types.h>
15#include <linux/sched.h>
16#include <linux/ptrace.h>
17#include <linux/ioport.h>
18#include <linux/interrupt.h>
19#include <linux/init.h>
20#include <linux/delay.h>
21#include <linux/reboot.h>
22#include <linux/sysrq.h>
23#include <linux/smp.h>
24#include <linux/nodemask.h>
25#include <asm/io.h>
26#include <asm/voyager.h>
27#include <asm/vic.h>
28#include <linux/pm.h>
29#include <asm/tlbflush.h>
30#include <asm/arch_hooks.h>
31#include <asm/i8253.h>
32
33/*
34 * Power off function, if any
35 */
36void (*pm_power_off) (void);
37EXPORT_SYMBOL(pm_power_off);
38
39int voyager_level = 0;
40
41struct voyager_SUS *voyager_SUS = NULL;
42
43#ifdef CONFIG_SMP
44static void voyager_dump(int dummy1, struct tty_struct *dummy3)
45{
46 /* get here via a sysrq */
47 voyager_smp_dump();
48}
49
50static struct sysrq_key_op sysrq_voyager_dump_op = {
51 .handler = voyager_dump,
52 .help_msg = "Voyager",
53 .action_msg = "Dump Voyager Status",
54};
55#endif
56
57void voyager_detect(struct voyager_bios_info *bios)
58{
59 if (bios->len != 0xff) {
60 int class = (bios->class_1 << 8)
61 | (bios->class_2 & 0xff);
62
63 printk("Voyager System detected.\n"
64 " Class %x, Revision %d.%d\n",
65 class, bios->major, bios->minor);
66 if (class == VOYAGER_LEVEL4)
67 voyager_level = 4;
68 else if (class < VOYAGER_LEVEL5_AND_ABOVE)
69 voyager_level = 3;
70 else
71 voyager_level = 5;
72 printk(" Architecture Level %d\n", voyager_level);
73 if (voyager_level < 4)
74 printk
75 ("\n**WARNING**: Voyager HAL only supports Levels 4 and 5 Architectures at the moment\n\n");
76 /* install the power off handler */
77 pm_power_off = voyager_power_off;
78#ifdef CONFIG_SMP
79 register_sysrq_key('v', &sysrq_voyager_dump_op);
80#endif
81 } else {
82 printk("\n\n**WARNING**: No Voyager Subsystem Found\n");
83 }
84}
85
86void voyager_system_interrupt(int cpl, void *dev_id)
87{
88 printk("Voyager: detected system interrupt\n");
89}
90
91/* Routine to read information from the extended CMOS area */
92__u8 voyager_extended_cmos_read(__u16 addr)
93{
94 outb(addr & 0xff, 0x74);
95 outb((addr >> 8) & 0xff, 0x75);
96 return inb(0x76);
97}
98
99/* internal definitions for the SUS Click Map of memory */
100
101#define CLICK_ENTRIES 16
102#define CLICK_SIZE 4096 /* click to byte conversion for Length */
103
104typedef struct ClickMap {
105 struct Entry {
106 __u32 Address;
107 __u32 Length;
108 } Entry[CLICK_ENTRIES];
109} ClickMap_t;
110
111/* This routine is pretty much an awful hack to read the bios clickmap by
112 * mapping it into page 0. There are usually three regions in the map:
113 * Base Memory
114 * Extended Memory
115 * zero length marker for end of map
116 *
117 * Returns are 0 for failure and 1 for success on extracting region.
118 */
119int __init voyager_memory_detect(int region, __u32 * start, __u32 * length)
120{
121 int i;
122 int retval = 0;
123 __u8 cmos[4];
124 ClickMap_t *map;
125 unsigned long map_addr;
126 unsigned long old;
127
128 if (region >= CLICK_ENTRIES) {
129 printk("Voyager: Illegal ClickMap region %d\n", region);
130 return 0;
131 }
132
133 for (i = 0; i < sizeof(cmos); i++)
134 cmos[i] =
135 voyager_extended_cmos_read(VOYAGER_MEMORY_CLICKMAP + i);
136
137 map_addr = *(unsigned long *)cmos;
138
139 /* steal page 0 for this */
140 old = pg0[0];
141 pg0[0] = ((map_addr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT);
142 local_flush_tlb();
143 /* now clear everything out but page 0 */
144 map = (ClickMap_t *) (map_addr & (~PAGE_MASK));
145
146 /* zero length is the end of the clickmap */
147 if (map->Entry[region].Length != 0) {
148 *length = map->Entry[region].Length * CLICK_SIZE;
149 *start = map->Entry[region].Address;
150 retval = 1;
151 }
152
153 /* replace the mapping */
154 pg0[0] = old;
155 local_flush_tlb();
156 return retval;
157}
158
159/* voyager specific handling code for timer interrupts. Used to hand
160 * off the timer tick to the SMP code, since the VIC doesn't have an
161 * internal timer (The QIC does, but that's another story). */
162void voyager_timer_interrupt(void)
163{
164 if ((jiffies & 0x3ff) == 0) {
165
166 /* There seems to be something flaky in either
167 * hardware or software that is resetting the timer 0
168 * count to something much higher than it should be
169 * This seems to occur in the boot sequence, just
170 * before root is mounted. Therefore, every 10
171 * seconds or so, we sanity check the timer zero count
172 * and kick it back to where it should be.
173 *
174 * FIXME: This is the most awful hack yet seen. I
175 * should work out exactly what is interfering with
176 * the timer count settings early in the boot sequence
177 * and swiftly introduce it to something sharp and
178 * pointy. */
179 __u16 val;
180
181 spin_lock(&i8253_lock);
182
183 outb_p(0x00, 0x43);
184 val = inb_p(0x40);
185 val |= inb(0x40) << 8;
186 spin_unlock(&i8253_lock);
187
188 if (val > LATCH) {
189 printk
190 ("\nVOYAGER: countdown timer value too high (%d), resetting\n\n",
191 val);
192 spin_lock(&i8253_lock);
193 outb(0x34, 0x43);
194 outb_p(LATCH & 0xff, 0x40); /* LSB */
195 outb(LATCH >> 8, 0x40); /* MSB */
196 spin_unlock(&i8253_lock);
197 }
198 }
199#ifdef CONFIG_SMP
200 smp_vic_timer_interrupt();
201#endif
202}
203
204void voyager_power_off(void)
205{
206 printk("VOYAGER Power Off\n");
207
208 if (voyager_level == 5) {
209 voyager_cat_power_off();
210 } else if (voyager_level == 4) {
211 /* This doesn't apparently work on most L4 machines,
212 * but the specs say to do this to get automatic power
213 * off. Unfortunately, if it doesn't power off the
214 * machine, it ends up doing a cold restart, which
215 * isn't really intended, so comment out the code */
216#if 0
217 int port;
218
219 /* enable the voyager Configuration Space */
220 outb((inb(VOYAGER_MC_SETUP) & 0xf0) | 0x8, VOYAGER_MC_SETUP);
221 /* the port for the power off flag is an offset from the
222 floating base */
223 port = (inb(VOYAGER_SSPB_RELOCATION_PORT) << 8) + 0x21;
224 /* set the power off flag */
225 outb(inb(port) | 0x1, port);
226#endif
227 }
228 /* and wait for it to happen */
229 local_irq_disable();
230 for (;;)
231 halt();
232}
233
234/* copied from process.c */
235static inline void kb_wait(void)
236{
237 int i;
238
239 for (i = 0; i < 0x10000; i++)
240 if ((inb_p(0x64) & 0x02) == 0)
241 break;
242}
243
244void machine_shutdown(void)
245{
246 /* Architecture specific shutdown needed before a kexec */
247}
248
249void machine_restart(char *cmd)
250{
251 printk("Voyager Warm Restart\n");
252 kb_wait();
253
254 if (voyager_level == 5) {
255 /* write magic values to the RTC to inform system that
256 * shutdown is beginning */
257 outb(0x8f, 0x70);
258 outb(0x5, 0x71);
259
260 udelay(50);
261 outb(0xfe, 0x64); /* pull reset low */
262 } else if (voyager_level == 4) {
263 __u16 catbase = inb(VOYAGER_SSPB_RELOCATION_PORT) << 8;
264 __u8 basebd = inb(VOYAGER_MC_SETUP);
265
266 outb(basebd | 0x08, VOYAGER_MC_SETUP);
267 outb(0x02, catbase + 0x21);
268 }
269 local_irq_disable();
270 for (;;)
271 halt();
272}
273
274void machine_emergency_restart(void)
275{
276 /*for now, just hook this to a warm restart */
277 machine_restart(NULL);
278}
279
280void mca_nmi_hook(void)
281{
282 __u8 dumpval __maybe_unused = inb(0xf823);
283 __u8 swnmi __maybe_unused = inb(0xf813);
284
285 /* FIXME: assume dump switch pressed */
286 /* check to see if the dump switch was pressed */
287 VDEBUG(("VOYAGER: dumpval = 0x%x, swnmi = 0x%x\n", dumpval, swnmi));
288 /* clear swnmi */
289 outb(0xff, 0xf813);
290 /* tell SUS to ignore dump */
291 if (voyager_level == 5 && voyager_SUS != NULL) {
292 if (voyager_SUS->SUS_mbox == VOYAGER_DUMP_BUTTON_NMI) {
293 voyager_SUS->kernel_mbox = VOYAGER_NO_COMMAND;
294 voyager_SUS->kernel_flags |= VOYAGER_OS_IN_PROGRESS;
295 udelay(1000);
296 voyager_SUS->kernel_mbox = VOYAGER_IGNORE_DUMP;
297 voyager_SUS->kernel_flags &= ~VOYAGER_OS_IN_PROGRESS;
298 }
299 }
300 printk(KERN_ERR
301 "VOYAGER: Dump switch pressed, printing CPU%d tracebacks\n",
302 smp_processor_id());
303 show_stack(NULL, NULL);
304 show_state();
305}
306
307void machine_halt(void)
308{
309 /* treat a halt like a power off */
310 machine_power_off();
311}
312
313void machine_power_off(void)
314{
315 if (pm_power_off)
316 pm_power_off();
317}
diff --git a/arch/x86/mach-voyager/voyager_cat.c b/arch/x86/mach-voyager/voyager_cat.c
deleted file mode 100644
index 2ad598c104af..000000000000
--- a/arch/x86/mach-voyager/voyager_cat.c
+++ /dev/null
@@ -1,1197 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8 -*- */
2
3/* Copyright (C) 1999,2001
4 *
5 * Author: J.E.J.Bottomley@HansenPartnership.com
6 *
7 * This file contains all the logic for manipulating the CAT bus
8 * in a level 5 machine.
9 *
10 * The CAT bus is a serial configuration and test bus. Its primary
11 * uses are to probe the initial configuration of the system and to
12 * diagnose error conditions when a system interrupt occurs. The low
13 * level interface is fairly primitive, so most of this file consists
14 * of bit shift manipulations to send and receive packets on the
15 * serial bus */
16
17#include <linux/types.h>
18#include <linux/completion.h>
19#include <linux/sched.h>
20#include <asm/voyager.h>
21#include <asm/vic.h>
22#include <linux/ioport.h>
23#include <linux/init.h>
24#include <linux/slab.h>
25#include <linux/delay.h>
26#include <asm/io.h>
27
28#ifdef VOYAGER_CAT_DEBUG
29#define CDEBUG(x) printk x
30#else
31#define CDEBUG(x)
32#endif
33
34/* the CAT command port */
35#define CAT_CMD (sspb + 0xe)
36/* the CAT data port */
37#define CAT_DATA (sspb + 0xd)
38
39/* the internal cat functions */
40static void cat_pack(__u8 * msg, __u16 start_bit, __u8 * data, __u16 num_bits);
41static void cat_unpack(__u8 * msg, __u16 start_bit, __u8 * data,
42 __u16 num_bits);
43static void cat_build_header(__u8 * header, const __u16 len,
44 const __u16 smallest_reg_bits,
45 const __u16 longest_reg_bits);
46static int cat_sendinst(voyager_module_t * modp, voyager_asic_t * asicp,
47 __u8 reg, __u8 op);
48static int cat_getdata(voyager_module_t * modp, voyager_asic_t * asicp,
49 __u8 reg, __u8 * value);
50static int cat_shiftout(__u8 * data, __u16 data_bytes, __u16 header_bytes,
51 __u8 pad_bits);
52static int cat_write(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg,
53 __u8 value);
54static int cat_read(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg,
55 __u8 * value);
56static int cat_subread(voyager_module_t * modp, voyager_asic_t * asicp,
57 __u16 offset, __u16 len, void *buf);
58static int cat_senddata(voyager_module_t * modp, voyager_asic_t * asicp,
59 __u8 reg, __u8 value);
60static int cat_disconnect(voyager_module_t * modp, voyager_asic_t * asicp);
61static int cat_connect(voyager_module_t * modp, voyager_asic_t * asicp);
62
63static inline const char *cat_module_name(int module_id)
64{
65 switch (module_id) {
66 case 0x10:
67 return "Processor Slot 0";
68 case 0x11:
69 return "Processor Slot 1";
70 case 0x12:
71 return "Processor Slot 2";
72 case 0x13:
73 return "Processor Slot 4";
74 case 0x14:
75 return "Memory Slot 0";
76 case 0x15:
77 return "Memory Slot 1";
78 case 0x18:
79 return "Primary Microchannel";
80 case 0x19:
81 return "Secondary Microchannel";
82 case 0x1a:
83 return "Power Supply Interface";
84 case 0x1c:
85 return "Processor Slot 5";
86 case 0x1d:
87 return "Processor Slot 6";
88 case 0x1e:
89 return "Processor Slot 7";
90 case 0x1f:
91 return "Processor Slot 8";
92 default:
93 return "Unknown Module";
94 }
95}
96
97static int sspb = 0; /* stores the super port location */
98int voyager_8slot = 0; /* set to true if a 51xx monster */
99
100voyager_module_t *voyager_cat_list;
101
102/* the I/O port assignments for the VIC and QIC */
103static struct resource vic_res = {
104 .name = "Voyager Interrupt Controller",
105 .start = 0xFC00,
106 .end = 0xFC6F
107};
108static struct resource qic_res = {
109 .name = "Quad Interrupt Controller",
110 .start = 0xFC70,
111 .end = 0xFCFF
112};
113
114/* This function is used to pack a data bit stream inside a message.
115 * It writes num_bits of the data buffer in msg starting at start_bit.
116 * Note: This function assumes that any unused bit in the data stream
117 * is set to zero so that the ors will work correctly */
118static void
119cat_pack(__u8 * msg, const __u16 start_bit, __u8 * data, const __u16 num_bits)
120{
121 /* compute initial shift needed */
122 const __u16 offset = start_bit % BITS_PER_BYTE;
123 __u16 len = num_bits / BITS_PER_BYTE;
124 __u16 byte = start_bit / BITS_PER_BYTE;
125 __u16 residue = (num_bits % BITS_PER_BYTE) + offset;
126 int i;
127
128 /* adjust if we have more than a byte of residue */
129 if (residue >= BITS_PER_BYTE) {
130 residue -= BITS_PER_BYTE;
131 len++;
132 }
133
134 /* clear out the bits. We assume here that if len==0 then
135 * residue >= offset. This is always true for the catbus
136 * operations */
137 msg[byte] &= 0xff << (BITS_PER_BYTE - offset);
138 msg[byte++] |= data[0] >> offset;
139 if (len == 0)
140 return;
141 for (i = 1; i < len; i++)
142 msg[byte++] = (data[i - 1] << (BITS_PER_BYTE - offset))
143 | (data[i] >> offset);
144 if (residue != 0) {
145 __u8 mask = 0xff >> residue;
146 __u8 last_byte = data[i - 1] << (BITS_PER_BYTE - offset)
147 | (data[i] >> offset);
148
149 last_byte &= ~mask;
150 msg[byte] &= mask;
151 msg[byte] |= last_byte;
152 }
153 return;
154}
155
156/* unpack the data again (same arguments as cat_pack()). data buffer
157 * must be zero populated.
158 *
159 * Function: given a message string move to start_bit and copy num_bits into
160 * data (starting at bit 0 in data).
161 */
162static void
163cat_unpack(__u8 * msg, const __u16 start_bit, __u8 * data, const __u16 num_bits)
164{
165 /* compute initial shift needed */
166 const __u16 offset = start_bit % BITS_PER_BYTE;
167 __u16 len = num_bits / BITS_PER_BYTE;
168 const __u8 last_bits = num_bits % BITS_PER_BYTE;
169 __u16 byte = start_bit / BITS_PER_BYTE;
170 int i;
171
172 if (last_bits != 0)
173 len++;
174
175 /* special case: want < 8 bits from msg and we can get it from
176 * a single byte of the msg */
177 if (len == 0 && BITS_PER_BYTE - offset >= num_bits) {
178 data[0] = msg[byte] << offset;
179 data[0] &= 0xff >> (BITS_PER_BYTE - num_bits);
180 return;
181 }
182 for (i = 0; i < len; i++) {
183 /* this annoying if has to be done just in case a read of
184 * msg one beyond the array causes a panic */
185 if (offset != 0) {
186 data[i] = msg[byte++] << offset;
187 data[i] |= msg[byte] >> (BITS_PER_BYTE - offset);
188 } else {
189 data[i] = msg[byte++];
190 }
191 }
192 /* do we need to truncate the final byte */
193 if (last_bits != 0) {
194 data[i - 1] &= 0xff << (BITS_PER_BYTE - last_bits);
195 }
196 return;
197}
198
199static void
200cat_build_header(__u8 * header, const __u16 len, const __u16 smallest_reg_bits,
201 const __u16 longest_reg_bits)
202{
203 int i;
204 __u16 start_bit = (smallest_reg_bits - 1) % BITS_PER_BYTE;
205 __u8 *last_byte = &header[len - 1];
206
207 if (start_bit == 0)
208 start_bit = 1; /* must have at least one bit in the hdr */
209
210 for (i = 0; i < len; i++)
211 header[i] = 0;
212
213 for (i = start_bit; i > 0; i--)
214 *last_byte = ((*last_byte) << 1) + 1;
215
216}
217
218static int
219cat_sendinst(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg, __u8 op)
220{
221 __u8 parity, inst, inst_buf[4] = { 0 };
222 __u8 iseq[VOYAGER_MAX_SCAN_PATH], hseq[VOYAGER_MAX_REG_SIZE];
223 __u16 ibytes, hbytes, padbits;
224 int i;
225
226 /*
227 * Parity is the parity of the register number + 1 (READ_REGISTER
228 * and WRITE_REGISTER always add '1' to the number of bits == 1)
229 */
230 parity = (__u8) (1 + (reg & 0x01) +
231 ((__u8) (reg & 0x02) >> 1) +
232 ((__u8) (reg & 0x04) >> 2) +
233 ((__u8) (reg & 0x08) >> 3)) % 2;
234
235 inst = ((parity << 7) | (reg << 2) | op);
236
237 outb(VOYAGER_CAT_IRCYC, CAT_CMD);
238 if (!modp->scan_path_connected) {
239 if (asicp->asic_id != VOYAGER_CAT_ID) {
240 printk
241 ("**WARNING***: cat_sendinst has disconnected scan path not to CAT asic\n");
242 return 1;
243 }
244 outb(VOYAGER_CAT_HEADER, CAT_DATA);
245 outb(inst, CAT_DATA);
246 if (inb(CAT_DATA) != VOYAGER_CAT_HEADER) {
247 CDEBUG(("VOYAGER CAT: cat_sendinst failed to get CAT_HEADER\n"));
248 return 1;
249 }
250 return 0;
251 }
252 ibytes = modp->inst_bits / BITS_PER_BYTE;
253 if ((padbits = modp->inst_bits % BITS_PER_BYTE) != 0) {
254 padbits = BITS_PER_BYTE - padbits;
255 ibytes++;
256 }
257 hbytes = modp->largest_reg / BITS_PER_BYTE;
258 if (modp->largest_reg % BITS_PER_BYTE)
259 hbytes++;
260 CDEBUG(("cat_sendinst: ibytes=%d, hbytes=%d\n", ibytes, hbytes));
261 /* initialise the instruction sequence to 0xff */
262 for (i = 0; i < ibytes + hbytes; i++)
263 iseq[i] = 0xff;
264 cat_build_header(hseq, hbytes, modp->smallest_reg, modp->largest_reg);
265 cat_pack(iseq, modp->inst_bits, hseq, hbytes * BITS_PER_BYTE);
266 inst_buf[0] = inst;
267 inst_buf[1] = 0xFF >> (modp->largest_reg % BITS_PER_BYTE);
268 cat_pack(iseq, asicp->bit_location, inst_buf, asicp->ireg_length);
269#ifdef VOYAGER_CAT_DEBUG
270 printk("ins = 0x%x, iseq: ", inst);
271 for (i = 0; i < ibytes + hbytes; i++)
272 printk("0x%x ", iseq[i]);
273 printk("\n");
274#endif
275 if (cat_shiftout(iseq, ibytes, hbytes, padbits)) {
276 CDEBUG(("VOYAGER CAT: cat_sendinst: cat_shiftout failed\n"));
277 return 1;
278 }
279 CDEBUG(("CAT SHIFTOUT DONE\n"));
280 return 0;
281}
282
283static int
284cat_getdata(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg,
285 __u8 * value)
286{
287 if (!modp->scan_path_connected) {
288 if (asicp->asic_id != VOYAGER_CAT_ID) {
289 CDEBUG(("VOYAGER CAT: ERROR: cat_getdata to CAT asic with scan path connected\n"));
290 return 1;
291 }
292 if (reg > VOYAGER_SUBADDRHI)
293 outb(VOYAGER_CAT_RUN, CAT_CMD);
294 outb(VOYAGER_CAT_DRCYC, CAT_CMD);
295 outb(VOYAGER_CAT_HEADER, CAT_DATA);
296 *value = inb(CAT_DATA);
297 outb(0xAA, CAT_DATA);
298 if (inb(CAT_DATA) != VOYAGER_CAT_HEADER) {
299 CDEBUG(("cat_getdata: failed to get VOYAGER_CAT_HEADER\n"));
300 return 1;
301 }
302 return 0;
303 } else {
304 __u16 sbits = modp->num_asics - 1 + asicp->ireg_length;
305 __u16 sbytes = sbits / BITS_PER_BYTE;
306 __u16 tbytes;
307 __u8 string[VOYAGER_MAX_SCAN_PATH],
308 trailer[VOYAGER_MAX_REG_SIZE];
309 __u8 padbits;
310 int i;
311
312 outb(VOYAGER_CAT_DRCYC, CAT_CMD);
313
314 if ((padbits = sbits % BITS_PER_BYTE) != 0) {
315 padbits = BITS_PER_BYTE - padbits;
316 sbytes++;
317 }
318 tbytes = asicp->ireg_length / BITS_PER_BYTE;
319 if (asicp->ireg_length % BITS_PER_BYTE)
320 tbytes++;
321 CDEBUG(("cat_getdata: tbytes = %d, sbytes = %d, padbits = %d\n",
322 tbytes, sbytes, padbits));
323 cat_build_header(trailer, tbytes, 1, asicp->ireg_length);
324
325 for (i = tbytes - 1; i >= 0; i--) {
326 outb(trailer[i], CAT_DATA);
327 string[sbytes + i] = inb(CAT_DATA);
328 }
329
330 for (i = sbytes - 1; i >= 0; i--) {
331 outb(0xaa, CAT_DATA);
332 string[i] = inb(CAT_DATA);
333 }
334 *value = 0;
335 cat_unpack(string,
336 padbits + (tbytes * BITS_PER_BYTE) +
337 asicp->asic_location, value, asicp->ireg_length);
338#ifdef VOYAGER_CAT_DEBUG
339 printk("value=0x%x, string: ", *value);
340 for (i = 0; i < tbytes + sbytes; i++)
341 printk("0x%x ", string[i]);
342 printk("\n");
343#endif
344
345 /* sanity check the rest of the return */
346 for (i = 0; i < tbytes; i++) {
347 __u8 input = 0;
348
349 cat_unpack(string, padbits + (i * BITS_PER_BYTE),
350 &input, BITS_PER_BYTE);
351 if (trailer[i] != input) {
352 CDEBUG(("cat_getdata: failed to sanity check rest of ret(%d) 0x%x != 0x%x\n", i, input, trailer[i]));
353 return 1;
354 }
355 }
356 CDEBUG(("cat_getdata DONE\n"));
357 return 0;
358 }
359}
360
361static int
362cat_shiftout(__u8 * data, __u16 data_bytes, __u16 header_bytes, __u8 pad_bits)
363{
364 int i;
365
366 for (i = data_bytes + header_bytes - 1; i >= header_bytes; i--)
367 outb(data[i], CAT_DATA);
368
369 for (i = header_bytes - 1; i >= 0; i--) {
370 __u8 header = 0;
371 __u8 input;
372
373 outb(data[i], CAT_DATA);
374 input = inb(CAT_DATA);
375 CDEBUG(("cat_shiftout: returned 0x%x\n", input));
376 cat_unpack(data, ((data_bytes + i) * BITS_PER_BYTE) - pad_bits,
377 &header, BITS_PER_BYTE);
378 if (input != header) {
379 CDEBUG(("VOYAGER CAT: cat_shiftout failed to return header 0x%x != 0x%x\n", input, header));
380 return 1;
381 }
382 }
383 return 0;
384}
385
386static int
387cat_senddata(voyager_module_t * modp, voyager_asic_t * asicp,
388 __u8 reg, __u8 value)
389{
390 outb(VOYAGER_CAT_DRCYC, CAT_CMD);
391 if (!modp->scan_path_connected) {
392 if (asicp->asic_id != VOYAGER_CAT_ID) {
393 CDEBUG(("VOYAGER CAT: ERROR: scan path disconnected when asic != CAT\n"));
394 return 1;
395 }
396 outb(VOYAGER_CAT_HEADER, CAT_DATA);
397 outb(value, CAT_DATA);
398 if (inb(CAT_DATA) != VOYAGER_CAT_HEADER) {
399 CDEBUG(("cat_senddata: failed to get correct header response to sent data\n"));
400 return 1;
401 }
402 if (reg > VOYAGER_SUBADDRHI) {
403 outb(VOYAGER_CAT_RUN, CAT_CMD);
404 outb(VOYAGER_CAT_END, CAT_CMD);
405 outb(VOYAGER_CAT_RUN, CAT_CMD);
406 }
407
408 return 0;
409 } else {
410 __u16 hbytes = asicp->ireg_length / BITS_PER_BYTE;
411 __u16 dbytes =
412 (modp->num_asics - 1 + asicp->ireg_length) / BITS_PER_BYTE;
413 __u8 padbits, dseq[VOYAGER_MAX_SCAN_PATH],
414 hseq[VOYAGER_MAX_REG_SIZE];
415 int i;
416
417 if ((padbits = (modp->num_asics - 1
418 + asicp->ireg_length) % BITS_PER_BYTE) != 0) {
419 padbits = BITS_PER_BYTE - padbits;
420 dbytes++;
421 }
422 if (asicp->ireg_length % BITS_PER_BYTE)
423 hbytes++;
424
425 cat_build_header(hseq, hbytes, 1, asicp->ireg_length);
426
427 for (i = 0; i < dbytes + hbytes; i++)
428 dseq[i] = 0xff;
429 CDEBUG(("cat_senddata: dbytes=%d, hbytes=%d, padbits=%d\n",
430 dbytes, hbytes, padbits));
431 cat_pack(dseq, modp->num_asics - 1 + asicp->ireg_length,
432 hseq, hbytes * BITS_PER_BYTE);
433 cat_pack(dseq, asicp->asic_location, &value,
434 asicp->ireg_length);
435#ifdef VOYAGER_CAT_DEBUG
436 printk("dseq ");
437 for (i = 0; i < hbytes + dbytes; i++) {
438 printk("0x%x ", dseq[i]);
439 }
440 printk("\n");
441#endif
442 return cat_shiftout(dseq, dbytes, hbytes, padbits);
443 }
444}
445
446static int
447cat_write(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg, __u8 value)
448{
449 if (cat_sendinst(modp, asicp, reg, VOYAGER_WRITE_CONFIG))
450 return 1;
451 return cat_senddata(modp, asicp, reg, value);
452}
453
454static int
455cat_read(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg,
456 __u8 * value)
457{
458 if (cat_sendinst(modp, asicp, reg, VOYAGER_READ_CONFIG))
459 return 1;
460 return cat_getdata(modp, asicp, reg, value);
461}
462
463static int
464cat_subaddrsetup(voyager_module_t * modp, voyager_asic_t * asicp, __u16 offset,
465 __u16 len)
466{
467 __u8 val;
468
469 if (len > 1) {
470 /* set auto increment */
471 __u8 newval;
472
473 if (cat_read(modp, asicp, VOYAGER_AUTO_INC_REG, &val)) {
474 CDEBUG(("cat_subaddrsetup: read of VOYAGER_AUTO_INC_REG failed\n"));
475 return 1;
476 }
477 CDEBUG(("cat_subaddrsetup: VOYAGER_AUTO_INC_REG = 0x%x\n",
478 val));
479 newval = val | VOYAGER_AUTO_INC;
480 if (newval != val) {
481 if (cat_write(modp, asicp, VOYAGER_AUTO_INC_REG, val)) {
482 CDEBUG(("cat_subaddrsetup: write to VOYAGER_AUTO_INC_REG failed\n"));
483 return 1;
484 }
485 }
486 }
487 if (cat_write(modp, asicp, VOYAGER_SUBADDRLO, (__u8) (offset & 0xff))) {
488 CDEBUG(("cat_subaddrsetup: write to SUBADDRLO failed\n"));
489 return 1;
490 }
491 if (asicp->subaddr > VOYAGER_SUBADDR_LO) {
492 if (cat_write
493 (modp, asicp, VOYAGER_SUBADDRHI, (__u8) (offset >> 8))) {
494 CDEBUG(("cat_subaddrsetup: write to SUBADDRHI failed\n"));
495 return 1;
496 }
497 cat_read(modp, asicp, VOYAGER_SUBADDRHI, &val);
498 CDEBUG(("cat_subaddrsetup: offset = %d, hi = %d\n", offset,
499 val));
500 }
501 cat_read(modp, asicp, VOYAGER_SUBADDRLO, &val);
502 CDEBUG(("cat_subaddrsetup: offset = %d, lo = %d\n", offset, val));
503 return 0;
504}
505
506static int
507cat_subwrite(voyager_module_t * modp, voyager_asic_t * asicp, __u16 offset,
508 __u16 len, void *buf)
509{
510 int i, retval;
511
512 /* FIXME: need special actions for VOYAGER_CAT_ID here */
513 if (asicp->asic_id == VOYAGER_CAT_ID) {
514 CDEBUG(("cat_subwrite: ATTEMPT TO WRITE TO CAT ASIC\n"));
515 /* FIXME -- This is supposed to be handled better
516 * There is a problem writing to the cat asic in the
517 * PSI. The 30us delay seems to work, though */
518 udelay(30);
519 }
520
521 if ((retval = cat_subaddrsetup(modp, asicp, offset, len)) != 0) {
522 printk("cat_subwrite: cat_subaddrsetup FAILED\n");
523 return retval;
524 }
525
526 if (cat_sendinst
527 (modp, asicp, VOYAGER_SUBADDRDATA, VOYAGER_WRITE_CONFIG)) {
528 printk("cat_subwrite: cat_sendinst FAILED\n");
529 return 1;
530 }
531 for (i = 0; i < len; i++) {
532 if (cat_senddata(modp, asicp, 0xFF, ((__u8 *) buf)[i])) {
533 printk
534 ("cat_subwrite: cat_sendata element at %d FAILED\n",
535 i);
536 return 1;
537 }
538 }
539 return 0;
540}
541static int
542cat_subread(voyager_module_t * modp, voyager_asic_t * asicp, __u16 offset,
543 __u16 len, void *buf)
544{
545 int i, retval;
546
547 if ((retval = cat_subaddrsetup(modp, asicp, offset, len)) != 0) {
548 CDEBUG(("cat_subread: cat_subaddrsetup FAILED\n"));
549 return retval;
550 }
551
552 if (cat_sendinst(modp, asicp, VOYAGER_SUBADDRDATA, VOYAGER_READ_CONFIG)) {
553 CDEBUG(("cat_subread: cat_sendinst failed\n"));
554 return 1;
555 }
556 for (i = 0; i < len; i++) {
557 if (cat_getdata(modp, asicp, 0xFF, &((__u8 *) buf)[i])) {
558 CDEBUG(("cat_subread: cat_getdata element %d failed\n",
559 i));
560 return 1;
561 }
562 }
563 return 0;
564}
565
566/* buffer for storing EPROM data read in during initialisation */
567static __initdata __u8 eprom_buf[0xFFFF];
568static voyager_module_t *voyager_initial_module;
569
570/* Initialise the cat bus components. We assume this is called by the
571 * boot cpu *after* all memory initialisation has been done (so we can
572 * use kmalloc) but before smp initialisation, so we can probe the SMP
573 * configuration and pick up necessary information. */
574void __init voyager_cat_init(void)
575{
576 voyager_module_t **modpp = &voyager_initial_module;
577 voyager_asic_t **asicpp;
578 voyager_asic_t *qabc_asic = NULL;
579 int i, j;
580 unsigned long qic_addr = 0;
581 __u8 qabc_data[0x20];
582 __u8 num_submodules, val;
583 voyager_eprom_hdr_t *eprom_hdr = (voyager_eprom_hdr_t *) & eprom_buf[0];
584
585 __u8 cmos[4];
586 unsigned long addr;
587
588 /* initiallise the SUS mailbox */
589 for (i = 0; i < sizeof(cmos); i++)
590 cmos[i] = voyager_extended_cmos_read(VOYAGER_DUMP_LOCATION + i);
591 addr = *(unsigned long *)cmos;
592 if ((addr & 0xff000000) != 0xff000000) {
593 printk(KERN_ERR
594 "Voyager failed to get SUS mailbox (addr = 0x%lx\n",
595 addr);
596 } else {
597 static struct resource res;
598
599 res.name = "voyager SUS";
600 res.start = addr;
601 res.end = addr + 0x3ff;
602
603 request_resource(&iomem_resource, &res);
604 voyager_SUS = (struct voyager_SUS *)
605 ioremap(addr, 0x400);
606 printk(KERN_NOTICE "Voyager SUS mailbox version 0x%x\n",
607 voyager_SUS->SUS_version);
608 voyager_SUS->kernel_version = VOYAGER_MAILBOX_VERSION;
609 voyager_SUS->kernel_flags = VOYAGER_OS_HAS_SYSINT;
610 }
611
612 /* clear the processor counts */
613 voyager_extended_vic_processors = 0;
614 voyager_quad_processors = 0;
615
616 printk("VOYAGER: beginning CAT bus probe\n");
617 /* set up the SuperSet Port Block which tells us where the
618 * CAT communication port is */
619 sspb = inb(VOYAGER_SSPB_RELOCATION_PORT) * 0x100;
620 VDEBUG(("VOYAGER DEBUG: sspb = 0x%x\n", sspb));
621
622 /* now find out if were 8 slot or normal */
623 if ((inb(VIC_PROC_WHO_AM_I) & EIGHT_SLOT_IDENTIFIER)
624 == EIGHT_SLOT_IDENTIFIER) {
625 voyager_8slot = 1;
626 printk(KERN_NOTICE
627 "Voyager: Eight slot 51xx configuration detected\n");
628 }
629
630 for (i = VOYAGER_MIN_MODULE; i <= VOYAGER_MAX_MODULE; i++) {
631 __u8 input;
632 int asic;
633 __u16 eprom_size;
634 __u16 sp_offset;
635
636 outb(VOYAGER_CAT_DESELECT, VOYAGER_CAT_CONFIG_PORT);
637 outb(i, VOYAGER_CAT_CONFIG_PORT);
638
639 /* check the presence of the module */
640 outb(VOYAGER_CAT_RUN, CAT_CMD);
641 outb(VOYAGER_CAT_IRCYC, CAT_CMD);
642 outb(VOYAGER_CAT_HEADER, CAT_DATA);
643 /* stream series of alternating 1's and 0's to stimulate
644 * response */
645 outb(0xAA, CAT_DATA);
646 input = inb(CAT_DATA);
647 outb(VOYAGER_CAT_END, CAT_CMD);
648 if (input != VOYAGER_CAT_HEADER) {
649 continue;
650 }
651 CDEBUG(("VOYAGER DEBUG: found module id 0x%x, %s\n", i,
652 cat_module_name(i)));
653 *modpp = kmalloc(sizeof(voyager_module_t), GFP_KERNEL); /*&voyager_module_storage[cat_count++]; */
654 if (*modpp == NULL) {
655 printk("**WARNING** kmalloc failure in cat_init\n");
656 continue;
657 }
658 memset(*modpp, 0, sizeof(voyager_module_t));
659 /* need temporary asic for cat_subread. It will be
660 * filled in correctly later */
661 (*modpp)->asic = kmalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count]; */
662 if ((*modpp)->asic == NULL) {
663 printk("**WARNING** kmalloc failure in cat_init\n");
664 continue;
665 }
666 memset((*modpp)->asic, 0, sizeof(voyager_asic_t));
667 (*modpp)->asic->asic_id = VOYAGER_CAT_ID;
668 (*modpp)->asic->subaddr = VOYAGER_SUBADDR_HI;
669 (*modpp)->module_addr = i;
670 (*modpp)->scan_path_connected = 0;
671 if (i == VOYAGER_PSI) {
672 /* Exception leg for modules with no EEPROM */
673 printk("Module \"%s\"\n", cat_module_name(i));
674 continue;
675 }
676
677 CDEBUG(("cat_init: Reading eeprom for module 0x%x at offset %d\n", i, VOYAGER_XSUM_END_OFFSET));
678 outb(VOYAGER_CAT_RUN, CAT_CMD);
679 cat_disconnect(*modpp, (*modpp)->asic);
680 if (cat_subread(*modpp, (*modpp)->asic,
681 VOYAGER_XSUM_END_OFFSET, sizeof(eprom_size),
682 &eprom_size)) {
683 printk
684 ("**WARNING**: Voyager couldn't read EPROM size for module 0x%x\n",
685 i);
686 outb(VOYAGER_CAT_END, CAT_CMD);
687 continue;
688 }
689 if (eprom_size > sizeof(eprom_buf)) {
690 printk
691 ("**WARNING**: Voyager insufficient size to read EPROM data, module 0x%x. Need %d\n",
692 i, eprom_size);
693 outb(VOYAGER_CAT_END, CAT_CMD);
694 continue;
695 }
696 outb(VOYAGER_CAT_END, CAT_CMD);
697 outb(VOYAGER_CAT_RUN, CAT_CMD);
698 CDEBUG(("cat_init: module 0x%x, eeprom_size %d\n", i,
699 eprom_size));
700 if (cat_subread
701 (*modpp, (*modpp)->asic, 0, eprom_size, eprom_buf)) {
702 outb(VOYAGER_CAT_END, CAT_CMD);
703 continue;
704 }
705 outb(VOYAGER_CAT_END, CAT_CMD);
706 printk("Module \"%s\", version 0x%x, tracer 0x%x, asics %d\n",
707 cat_module_name(i), eprom_hdr->version_id,
708 *((__u32 *) eprom_hdr->tracer), eprom_hdr->num_asics);
709 (*modpp)->ee_size = eprom_hdr->ee_size;
710 (*modpp)->num_asics = eprom_hdr->num_asics;
711 asicpp = &((*modpp)->asic);
712 sp_offset = eprom_hdr->scan_path_offset;
713 /* All we really care about are the Quad cards. We
714 * identify them because they are in a processor slot
715 * and have only four asics */
716 if ((i < 0x10 || (i >= 0x14 && i < 0x1c) || i > 0x1f)) {
717 modpp = &((*modpp)->next);
718 continue;
719 }
720 /* Now we know it's in a processor slot, does it have
721 * a quad baseboard submodule */
722 outb(VOYAGER_CAT_RUN, CAT_CMD);
723 cat_read(*modpp, (*modpp)->asic, VOYAGER_SUBMODPRESENT,
724 &num_submodules);
725 /* lowest two bits, active low */
726 num_submodules = ~(0xfc | num_submodules);
727 CDEBUG(("VOYAGER CAT: %d submodules present\n",
728 num_submodules));
729 if (num_submodules == 0) {
730 /* fill in the dyadic extended processors */
731 __u8 cpu = i & 0x07;
732
733 printk("Module \"%s\": Dyadic Processor Card\n",
734 cat_module_name(i));
735 voyager_extended_vic_processors |= (1 << cpu);
736 cpu += 4;
737 voyager_extended_vic_processors |= (1 << cpu);
738 outb(VOYAGER_CAT_END, CAT_CMD);
739 continue;
740 }
741
742 /* now we want to read the asics on the first submodule,
743 * which should be the quad base board */
744
745 cat_read(*modpp, (*modpp)->asic, VOYAGER_SUBMODSELECT, &val);
746 CDEBUG(("cat_init: SUBMODSELECT value = 0x%x\n", val));
747 val = (val & 0x7c) | VOYAGER_QUAD_BASEBOARD;
748 cat_write(*modpp, (*modpp)->asic, VOYAGER_SUBMODSELECT, val);
749
750 outb(VOYAGER_CAT_END, CAT_CMD);
751
752 CDEBUG(("cat_init: Reading eeprom for module 0x%x at offset %d\n", i, VOYAGER_XSUM_END_OFFSET));
753 outb(VOYAGER_CAT_RUN, CAT_CMD);
754 cat_disconnect(*modpp, (*modpp)->asic);
755 if (cat_subread(*modpp, (*modpp)->asic,
756 VOYAGER_XSUM_END_OFFSET, sizeof(eprom_size),
757 &eprom_size)) {
758 printk
759 ("**WARNING**: Voyager couldn't read EPROM size for module 0x%x\n",
760 i);
761 outb(VOYAGER_CAT_END, CAT_CMD);
762 continue;
763 }
764 if (eprom_size > sizeof(eprom_buf)) {
765 printk
766 ("**WARNING**: Voyager insufficient size to read EPROM data, module 0x%x. Need %d\n",
767 i, eprom_size);
768 outb(VOYAGER_CAT_END, CAT_CMD);
769 continue;
770 }
771 outb(VOYAGER_CAT_END, CAT_CMD);
772 outb(VOYAGER_CAT_RUN, CAT_CMD);
773 CDEBUG(("cat_init: module 0x%x, eeprom_size %d\n", i,
774 eprom_size));
775 if (cat_subread
776 (*modpp, (*modpp)->asic, 0, eprom_size, eprom_buf)) {
777 outb(VOYAGER_CAT_END, CAT_CMD);
778 continue;
779 }
780 outb(VOYAGER_CAT_END, CAT_CMD);
781 /* Now do everything for the QBB submodule 1 */
782 (*modpp)->ee_size = eprom_hdr->ee_size;
783 (*modpp)->num_asics = eprom_hdr->num_asics;
784 asicpp = &((*modpp)->asic);
785 sp_offset = eprom_hdr->scan_path_offset;
786 /* get rid of the dummy CAT asic and read the real one */
787 kfree((*modpp)->asic);
788 for (asic = 0; asic < (*modpp)->num_asics; asic++) {
789 int j;
790 voyager_asic_t *asicp = *asicpp = kzalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count++]; */
791 voyager_sp_table_t *sp_table;
792 voyager_at_t *asic_table;
793 voyager_jtt_t *jtag_table;
794
795 if (asicp == NULL) {
796 printk
797 ("**WARNING** kmalloc failure in cat_init\n");
798 continue;
799 }
800 asicpp = &(asicp->next);
801 asicp->asic_location = asic;
802 sp_table =
803 (voyager_sp_table_t *) (eprom_buf + sp_offset);
804 asicp->asic_id = sp_table->asic_id;
805 asic_table =
806 (voyager_at_t *) (eprom_buf +
807 sp_table->asic_data_offset);
808 for (j = 0; j < 4; j++)
809 asicp->jtag_id[j] = asic_table->jtag_id[j];
810 jtag_table =
811 (voyager_jtt_t *) (eprom_buf +
812 asic_table->jtag_offset);
813 asicp->ireg_length = jtag_table->ireg_len;
814 asicp->bit_location = (*modpp)->inst_bits;
815 (*modpp)->inst_bits += asicp->ireg_length;
816 if (asicp->ireg_length > (*modpp)->largest_reg)
817 (*modpp)->largest_reg = asicp->ireg_length;
818 if (asicp->ireg_length < (*modpp)->smallest_reg ||
819 (*modpp)->smallest_reg == 0)
820 (*modpp)->smallest_reg = asicp->ireg_length;
821 CDEBUG(("asic 0x%x, ireg_length=%d, bit_location=%d\n",
822 asicp->asic_id, asicp->ireg_length,
823 asicp->bit_location));
824 if (asicp->asic_id == VOYAGER_QUAD_QABC) {
825 CDEBUG(("VOYAGER CAT: QABC ASIC found\n"));
826 qabc_asic = asicp;
827 }
828 sp_offset += sizeof(voyager_sp_table_t);
829 }
830 CDEBUG(("Module inst_bits = %d, largest_reg = %d, smallest_reg=%d\n", (*modpp)->inst_bits, (*modpp)->largest_reg, (*modpp)->smallest_reg));
831 /* OK, now we have the QUAD ASICs set up, use them.
832 * we need to:
833 *
834 * 1. Find the Memory area for the Quad CPIs.
835 * 2. Find the Extended VIC processor
836 * 3. Configure a second extended VIC processor (This
837 * cannot be done for the 51xx.
838 * */
839 outb(VOYAGER_CAT_RUN, CAT_CMD);
840 cat_connect(*modpp, (*modpp)->asic);
841 CDEBUG(("CAT CONNECTED!!\n"));
842 cat_subread(*modpp, qabc_asic, 0, sizeof(qabc_data), qabc_data);
843 qic_addr = qabc_data[5] << 8;
844 qic_addr = (qic_addr | qabc_data[6]) << 8;
845 qic_addr = (qic_addr | qabc_data[7]) << 8;
846 printk
847 ("Module \"%s\": Quad Processor Card; CPI 0x%lx, SET=0x%x\n",
848 cat_module_name(i), qic_addr, qabc_data[8]);
849#if 0 /* plumbing fails---FIXME */
850 if ((qabc_data[8] & 0xf0) == 0) {
851 /* FIXME: 32 way 8 CPU slot monster cannot be
852 * plumbed this way---need to check for it */
853
854 printk("Plumbing second Extended Quad Processor\n");
855 /* second VIC line hardwired to Quad CPU 1 */
856 qabc_data[8] |= 0x20;
857 cat_subwrite(*modpp, qabc_asic, 8, 1, &qabc_data[8]);
858#ifdef VOYAGER_CAT_DEBUG
859 /* verify plumbing */
860 cat_subread(*modpp, qabc_asic, 8, 1, &qabc_data[8]);
861 if ((qabc_data[8] & 0xf0) == 0) {
862 CDEBUG(("PLUMBING FAILED: 0x%x\n",
863 qabc_data[8]));
864 }
865#endif
866 }
867#endif
868
869 {
870 struct resource *res =
871 kzalloc(sizeof(struct resource), GFP_KERNEL);
872 res->name = kmalloc(128, GFP_KERNEL);
873 sprintf((char *)res->name, "Voyager %s Quad CPI",
874 cat_module_name(i));
875 res->start = qic_addr;
876 res->end = qic_addr + 0x3ff;
877 request_resource(&iomem_resource, res);
878 }
879
880 qic_addr = (unsigned long)ioremap_cache(qic_addr, 0x400);
881
882 for (j = 0; j < 4; j++) {
883 __u8 cpu;
884
885 if (voyager_8slot) {
886 /* 8 slot has a different mapping,
887 * each slot has only one vic line, so
888 * 1 cpu in each slot must be < 8 */
889 cpu = (i & 0x07) + j * 8;
890 } else {
891 cpu = (i & 0x03) + j * 4;
892 }
893 if ((qabc_data[8] & (1 << j))) {
894 voyager_extended_vic_processors |= (1 << cpu);
895 }
896 if (qabc_data[8] & (1 << (j + 4))) {
897 /* Second SET register plumbed: Quad
898 * card has two VIC connected CPUs.
899 * Secondary cannot be booted as a VIC
900 * CPU */
901 voyager_extended_vic_processors |= (1 << cpu);
902 voyager_allowed_boot_processors &=
903 (~(1 << cpu));
904 }
905
906 voyager_quad_processors |= (1 << cpu);
907 voyager_quad_cpi_addr[cpu] = (struct voyager_qic_cpi *)
908 (qic_addr + (j << 8));
909 CDEBUG(("CPU%d: CPI address 0x%lx\n", cpu,
910 (unsigned long)voyager_quad_cpi_addr[cpu]));
911 }
912 outb(VOYAGER_CAT_END, CAT_CMD);
913
914 *asicpp = NULL;
915 modpp = &((*modpp)->next);
916 }
917 *modpp = NULL;
918 printk
919 ("CAT Bus Initialisation finished: extended procs 0x%x, quad procs 0x%x, allowed vic boot = 0x%x\n",
920 voyager_extended_vic_processors, voyager_quad_processors,
921 voyager_allowed_boot_processors);
922 request_resource(&ioport_resource, &vic_res);
923 if (voyager_quad_processors)
924 request_resource(&ioport_resource, &qic_res);
925 /* set up the front power switch */
926}
927
928int voyager_cat_readb(__u8 module, __u8 asic, int reg)
929{
930 return 0;
931}
932
933static int cat_disconnect(voyager_module_t * modp, voyager_asic_t * asicp)
934{
935 __u8 val;
936 int err = 0;
937
938 if (!modp->scan_path_connected)
939 return 0;
940 if (asicp->asic_id != VOYAGER_CAT_ID) {
941 CDEBUG(("cat_disconnect: ASIC is not CAT\n"));
942 return 1;
943 }
944 err = cat_read(modp, asicp, VOYAGER_SCANPATH, &val);
945 if (err) {
946 CDEBUG(("cat_disconnect: failed to read SCANPATH\n"));
947 return err;
948 }
949 val &= VOYAGER_DISCONNECT_ASIC;
950 err = cat_write(modp, asicp, VOYAGER_SCANPATH, val);
951 if (err) {
952 CDEBUG(("cat_disconnect: failed to write SCANPATH\n"));
953 return err;
954 }
955 outb(VOYAGER_CAT_END, CAT_CMD);
956 outb(VOYAGER_CAT_RUN, CAT_CMD);
957 modp->scan_path_connected = 0;
958
959 return 0;
960}
961
962static int cat_connect(voyager_module_t * modp, voyager_asic_t * asicp)
963{
964 __u8 val;
965 int err = 0;
966
967 if (modp->scan_path_connected)
968 return 0;
969 if (asicp->asic_id != VOYAGER_CAT_ID) {
970 CDEBUG(("cat_connect: ASIC is not CAT\n"));
971 return 1;
972 }
973
974 err = cat_read(modp, asicp, VOYAGER_SCANPATH, &val);
975 if (err) {
976 CDEBUG(("cat_connect: failed to read SCANPATH\n"));
977 return err;
978 }
979 val |= VOYAGER_CONNECT_ASIC;
980 err = cat_write(modp, asicp, VOYAGER_SCANPATH, val);
981 if (err) {
982 CDEBUG(("cat_connect: failed to write SCANPATH\n"));
983 return err;
984 }
985 outb(VOYAGER_CAT_END, CAT_CMD);
986 outb(VOYAGER_CAT_RUN, CAT_CMD);
987 modp->scan_path_connected = 1;
988
989 return 0;
990}
991
992void voyager_cat_power_off(void)
993{
994 /* Power the machine off by writing to the PSI over the CAT
995 * bus */
996 __u8 data;
997 voyager_module_t psi = { 0 };
998 voyager_asic_t psi_asic = { 0 };
999
1000 psi.asic = &psi_asic;
1001 psi.asic->asic_id = VOYAGER_CAT_ID;
1002 psi.asic->subaddr = VOYAGER_SUBADDR_HI;
1003 psi.module_addr = VOYAGER_PSI;
1004 psi.scan_path_connected = 0;
1005
1006 outb(VOYAGER_CAT_END, CAT_CMD);
1007 /* Connect the PSI to the CAT Bus */
1008 outb(VOYAGER_CAT_DESELECT, VOYAGER_CAT_CONFIG_PORT);
1009 outb(VOYAGER_PSI, VOYAGER_CAT_CONFIG_PORT);
1010 outb(VOYAGER_CAT_RUN, CAT_CMD);
1011 cat_disconnect(&psi, &psi_asic);
1012 /* Read the status */
1013 cat_subread(&psi, &psi_asic, VOYAGER_PSI_GENERAL_REG, 1, &data);
1014 outb(VOYAGER_CAT_END, CAT_CMD);
1015 CDEBUG(("PSI STATUS 0x%x\n", data));
1016 /* These two writes are power off prep and perform */
1017 data = PSI_CLEAR;
1018 outb(VOYAGER_CAT_RUN, CAT_CMD);
1019 cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_GENERAL_REG, 1, &data);
1020 outb(VOYAGER_CAT_END, CAT_CMD);
1021 data = PSI_POWER_DOWN;
1022 outb(VOYAGER_CAT_RUN, CAT_CMD);
1023 cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_GENERAL_REG, 1, &data);
1024 outb(VOYAGER_CAT_END, CAT_CMD);
1025}
1026
1027struct voyager_status voyager_status = { 0 };
1028
1029void voyager_cat_psi(__u8 cmd, __u16 reg, __u8 * data)
1030{
1031 voyager_module_t psi = { 0 };
1032 voyager_asic_t psi_asic = { 0 };
1033
1034 psi.asic = &psi_asic;
1035 psi.asic->asic_id = VOYAGER_CAT_ID;
1036 psi.asic->subaddr = VOYAGER_SUBADDR_HI;
1037 psi.module_addr = VOYAGER_PSI;
1038 psi.scan_path_connected = 0;
1039
1040 outb(VOYAGER_CAT_END, CAT_CMD);
1041 /* Connect the PSI to the CAT Bus */
1042 outb(VOYAGER_CAT_DESELECT, VOYAGER_CAT_CONFIG_PORT);
1043 outb(VOYAGER_PSI, VOYAGER_CAT_CONFIG_PORT);
1044 outb(VOYAGER_CAT_RUN, CAT_CMD);
1045 cat_disconnect(&psi, &psi_asic);
1046 switch (cmd) {
1047 case VOYAGER_PSI_READ:
1048 cat_read(&psi, &psi_asic, reg, data);
1049 break;
1050 case VOYAGER_PSI_WRITE:
1051 cat_write(&psi, &psi_asic, reg, *data);
1052 break;
1053 case VOYAGER_PSI_SUBREAD:
1054 cat_subread(&psi, &psi_asic, reg, 1, data);
1055 break;
1056 case VOYAGER_PSI_SUBWRITE:
1057 cat_subwrite(&psi, &psi_asic, reg, 1, data);
1058 break;
1059 default:
1060 printk(KERN_ERR "Voyager PSI, unrecognised command %d\n", cmd);
1061 break;
1062 }
1063 outb(VOYAGER_CAT_END, CAT_CMD);
1064}
1065
1066void voyager_cat_do_common_interrupt(void)
1067{
1068 /* This is caused either by a memory parity error or something
1069 * in the PSI */
1070 __u8 data;
1071 voyager_module_t psi = { 0 };
1072 voyager_asic_t psi_asic = { 0 };
1073 struct voyager_psi psi_reg;
1074 int i;
1075 re_read:
1076 psi.asic = &psi_asic;
1077 psi.asic->asic_id = VOYAGER_CAT_ID;
1078 psi.asic->subaddr = VOYAGER_SUBADDR_HI;
1079 psi.module_addr = VOYAGER_PSI;
1080 psi.scan_path_connected = 0;
1081
1082 outb(VOYAGER_CAT_END, CAT_CMD);
1083 /* Connect the PSI to the CAT Bus */
1084 outb(VOYAGER_CAT_DESELECT, VOYAGER_CAT_CONFIG_PORT);
1085 outb(VOYAGER_PSI, VOYAGER_CAT_CONFIG_PORT);
1086 outb(VOYAGER_CAT_RUN, CAT_CMD);
1087 cat_disconnect(&psi, &psi_asic);
1088 /* Read the status. NOTE: Need to read *all* the PSI regs here
1089 * otherwise the cmn int will be reasserted */
1090 for (i = 0; i < sizeof(psi_reg.regs); i++) {
1091 cat_read(&psi, &psi_asic, i, &((__u8 *) & psi_reg.regs)[i]);
1092 }
1093 outb(VOYAGER_CAT_END, CAT_CMD);
1094 if ((psi_reg.regs.checkbit & 0x02) == 0) {
1095 psi_reg.regs.checkbit |= 0x02;
1096 cat_write(&psi, &psi_asic, 5, psi_reg.regs.checkbit);
1097 printk("VOYAGER RE-READ PSI\n");
1098 goto re_read;
1099 }
1100 outb(VOYAGER_CAT_RUN, CAT_CMD);
1101 for (i = 0; i < sizeof(psi_reg.subregs); i++) {
1102 /* This looks strange, but the PSI doesn't do auto increment
1103 * correctly */
1104 cat_subread(&psi, &psi_asic, VOYAGER_PSI_SUPPLY_REG + i,
1105 1, &((__u8 *) & psi_reg.subregs)[i]);
1106 }
1107 outb(VOYAGER_CAT_END, CAT_CMD);
1108#ifdef VOYAGER_CAT_DEBUG
1109 printk("VOYAGER PSI: ");
1110 for (i = 0; i < sizeof(psi_reg.regs); i++)
1111 printk("%02x ", ((__u8 *) & psi_reg.regs)[i]);
1112 printk("\n ");
1113 for (i = 0; i < sizeof(psi_reg.subregs); i++)
1114 printk("%02x ", ((__u8 *) & psi_reg.subregs)[i]);
1115 printk("\n");
1116#endif
1117 if (psi_reg.regs.intstatus & PSI_MON) {
1118 /* switch off or power fail */
1119
1120 if (psi_reg.subregs.supply & PSI_SWITCH_OFF) {
1121 if (voyager_status.switch_off) {
1122 printk(KERN_ERR
1123 "Voyager front panel switch turned off again---Immediate power off!\n");
1124 voyager_cat_power_off();
1125 /* not reached */
1126 } else {
1127 printk(KERN_ERR
1128 "Voyager front panel switch turned off\n");
1129 voyager_status.switch_off = 1;
1130 voyager_status.request_from_kernel = 1;
1131 wake_up_process(voyager_thread);
1132 }
1133 /* Tell the hardware we're taking care of the
1134 * shutdown, otherwise it will power the box off
1135 * within 3 seconds of the switch being pressed and,
1136 * which is much more important to us, continue to
1137 * assert the common interrupt */
1138 data = PSI_CLR_SWITCH_OFF;
1139 outb(VOYAGER_CAT_RUN, CAT_CMD);
1140 cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_SUPPLY_REG,
1141 1, &data);
1142 outb(VOYAGER_CAT_END, CAT_CMD);
1143 } else {
1144
1145 VDEBUG(("Voyager ac fail reg 0x%x\n",
1146 psi_reg.subregs.ACfail));
1147 if ((psi_reg.subregs.ACfail & AC_FAIL_STAT_CHANGE) == 0) {
1148 /* No further update */
1149 return;
1150 }
1151#if 0
1152 /* Don't bother trying to find out who failed.
1153 * FIXME: This probably makes the code incorrect on
1154 * anything other than a 345x */
1155 for (i = 0; i < 5; i++) {
1156 if (psi_reg.subregs.ACfail & (1 << i)) {
1157 break;
1158 }
1159 }
1160 printk(KERN_NOTICE "AC FAIL IN SUPPLY %d\n", i);
1161#endif
1162 /* DON'T do this: it shuts down the AC PSI
1163 outb(VOYAGER_CAT_RUN, CAT_CMD);
1164 data = PSI_MASK_MASK | i;
1165 cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_MASK,
1166 1, &data);
1167 outb(VOYAGER_CAT_END, CAT_CMD);
1168 */
1169 printk(KERN_ERR "Voyager AC power failure\n");
1170 outb(VOYAGER_CAT_RUN, CAT_CMD);
1171 data = PSI_COLD_START;
1172 cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_GENERAL_REG,
1173 1, &data);
1174 outb(VOYAGER_CAT_END, CAT_CMD);
1175 voyager_status.power_fail = 1;
1176 voyager_status.request_from_kernel = 1;
1177 wake_up_process(voyager_thread);
1178 }
1179
1180 } else if (psi_reg.regs.intstatus & PSI_FAULT) {
1181 /* Major fault! */
1182 printk(KERN_ERR
1183 "Voyager PSI Detected major fault, immediate power off!\n");
1184 voyager_cat_power_off();
1185 /* not reached */
1186 } else if (psi_reg.regs.intstatus & (PSI_DC_FAIL | PSI_ALARM
1187 | PSI_CURRENT | PSI_DVM
1188 | PSI_PSCFAULT | PSI_STAT_CHG)) {
1189 /* other psi fault */
1190
1191 printk(KERN_WARNING "Voyager PSI status 0x%x\n", data);
1192 /* clear the PSI fault */
1193 outb(VOYAGER_CAT_RUN, CAT_CMD);
1194 cat_write(&psi, &psi_asic, VOYAGER_PSI_STATUS_REG, 0);
1195 outb(VOYAGER_CAT_END, CAT_CMD);
1196 }
1197}
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
deleted file mode 100644
index b9cc84a2a4fc..000000000000
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ /dev/null
@@ -1,1807 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8 -*- */
2
3/* Copyright (C) 1999,2001
4 *
5 * Author: J.E.J.Bottomley@HansenPartnership.com
6 *
7 * This file provides all the same external entries as smp.c but uses
8 * the voyager hal to provide the functionality
9 */
10#include <linux/cpu.h>
11#include <linux/module.h>
12#include <linux/mm.h>
13#include <linux/kernel_stat.h>
14#include <linux/delay.h>
15#include <linux/mc146818rtc.h>
16#include <linux/cache.h>
17#include <linux/interrupt.h>
18#include <linux/init.h>
19#include <linux/kernel.h>
20#include <linux/bootmem.h>
21#include <linux/completion.h>
22#include <asm/desc.h>
23#include <asm/voyager.h>
24#include <asm/vic.h>
25#include <asm/mtrr.h>
26#include <asm/pgalloc.h>
27#include <asm/tlbflush.h>
28#include <asm/arch_hooks.h>
29#include <asm/trampoline.h>
30
31/* TLB state -- visible externally, indexed physically */
32DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { &init_mm, 0 };
33
34/* CPU IRQ affinity -- set to all ones initially */
35static unsigned long cpu_irq_affinity[NR_CPUS] __cacheline_aligned =
36 {[0 ... NR_CPUS-1] = ~0UL };
37
38/* per CPU data structure (for /proc/cpuinfo et al), visible externally
39 * indexed physically */
40DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
41EXPORT_PER_CPU_SYMBOL(cpu_info);
42
43/* physical ID of the CPU used to boot the system */
44unsigned char boot_cpu_id;
45
46/* The memory line addresses for the Quad CPIs */
47struct voyager_qic_cpi *voyager_quad_cpi_addr[NR_CPUS] __cacheline_aligned;
48
49/* The masks for the Extended VIC processors, filled in by cat_init */
50__u32 voyager_extended_vic_processors = 0;
51
52/* Masks for the extended Quad processors which cannot be VIC booted */
53__u32 voyager_allowed_boot_processors = 0;
54
55/* The mask for the Quad Processors (both extended and non-extended) */
56__u32 voyager_quad_processors = 0;
57
58/* Total count of live CPUs, used in process.c to display
59 * the CPU information and in irq.c for the per CPU irq
60 * activity count. Finally exported by i386_ksyms.c */
61static int voyager_extended_cpus = 1;
62
63/* Used for the invalidate map that's also checked in the spinlock */
64static volatile unsigned long smp_invalidate_needed;
65
66/* Bitmask of CPUs present in the system - exported by i386_syms.c, used
67 * by scheduler but indexed physically */
68static cpumask_t voyager_phys_cpu_present_map = CPU_MASK_NONE;
69
70/* The internal functions */
71static void send_CPI(__u32 cpuset, __u8 cpi);
72static void ack_CPI(__u8 cpi);
73static int ack_QIC_CPI(__u8 cpi);
74static void ack_special_QIC_CPI(__u8 cpi);
75static void ack_VIC_CPI(__u8 cpi);
76static void send_CPI_allbutself(__u8 cpi);
77static void mask_vic_irq(unsigned int irq);
78static void unmask_vic_irq(unsigned int irq);
79static unsigned int startup_vic_irq(unsigned int irq);
80static void enable_local_vic_irq(unsigned int irq);
81static void disable_local_vic_irq(unsigned int irq);
82static void before_handle_vic_irq(unsigned int irq);
83static void after_handle_vic_irq(unsigned int irq);
84static void set_vic_irq_affinity(unsigned int irq, const struct cpumask *mask);
85static void ack_vic_irq(unsigned int irq);
86static void vic_enable_cpi(void);
87static void do_boot_cpu(__u8 cpuid);
88static void do_quad_bootstrap(void);
89static void initialize_secondary(void);
90
91int hard_smp_processor_id(void);
92int safe_smp_processor_id(void);
93
94/* Inline functions */
95static inline void send_one_QIC_CPI(__u8 cpu, __u8 cpi)
96{
97 voyager_quad_cpi_addr[cpu]->qic_cpi[cpi].cpi =
98 (smp_processor_id() << 16) + cpi;
99}
100
101static inline void send_QIC_CPI(__u32 cpuset, __u8 cpi)
102{
103 int cpu;
104
105 for_each_online_cpu(cpu) {
106 if (cpuset & (1 << cpu)) {
107#ifdef VOYAGER_DEBUG
108 if (!cpu_online(cpu))
109 VDEBUG(("CPU%d sending cpi %d to CPU%d not in "
110 "cpu_online_map\n",
111 hard_smp_processor_id(), cpi, cpu));
112#endif
113 send_one_QIC_CPI(cpu, cpi - QIC_CPI_OFFSET);
114 }
115 }
116}
117
118static inline void wrapper_smp_local_timer_interrupt(void)
119{
120 irq_enter();
121 smp_local_timer_interrupt();
122 irq_exit();
123}
124
125static inline void send_one_CPI(__u8 cpu, __u8 cpi)
126{
127 if (voyager_quad_processors & (1 << cpu))
128 send_one_QIC_CPI(cpu, cpi - QIC_CPI_OFFSET);
129 else
130 send_CPI(1 << cpu, cpi);
131}
132
133static inline void send_CPI_allbutself(__u8 cpi)
134{
135 __u8 cpu = smp_processor_id();
136 __u32 mask = cpus_addr(cpu_online_map)[0] & ~(1 << cpu);
137 send_CPI(mask, cpi);
138}
139
140static inline int is_cpu_quad(void)
141{
142 __u8 cpumask = inb(VIC_PROC_WHO_AM_I);
143 return ((cpumask & QUAD_IDENTIFIER) == QUAD_IDENTIFIER);
144}
145
146static inline int is_cpu_extended(void)
147{
148 __u8 cpu = hard_smp_processor_id();
149
150 return (voyager_extended_vic_processors & (1 << cpu));
151}
152
153static inline int is_cpu_vic_boot(void)
154{
155 __u8 cpu = hard_smp_processor_id();
156
157 return (voyager_extended_vic_processors
158 & voyager_allowed_boot_processors & (1 << cpu));
159}
160
161static inline void ack_CPI(__u8 cpi)
162{
163 switch (cpi) {
164 case VIC_CPU_BOOT_CPI:
165 if (is_cpu_quad() && !is_cpu_vic_boot())
166 ack_QIC_CPI(cpi);
167 else
168 ack_VIC_CPI(cpi);
169 break;
170 case VIC_SYS_INT:
171 case VIC_CMN_INT:
172 /* These are slightly strange. Even on the Quad card,
173 * They are vectored as VIC CPIs */
174 if (is_cpu_quad())
175 ack_special_QIC_CPI(cpi);
176 else
177 ack_VIC_CPI(cpi);
178 break;
179 default:
180 printk("VOYAGER ERROR: CPI%d is in common CPI code\n", cpi);
181 break;
182 }
183}
184
185/* local variables */
186
187/* The VIC IRQ descriptors -- these look almost identical to the
188 * 8259 IRQs except that masks and things must be kept per processor
189 */
190static struct irq_chip vic_chip = {
191 .name = "VIC",
192 .startup = startup_vic_irq,
193 .mask = mask_vic_irq,
194 .unmask = unmask_vic_irq,
195 .set_affinity = set_vic_irq_affinity,
196};
197
198/* used to count up as CPUs are brought on line (starts at 0) */
199static int cpucount = 0;
200
201/* The per cpu profile stuff - used in smp_local_timer_interrupt */
202static DEFINE_PER_CPU(int, prof_multiplier) = 1;
203static DEFINE_PER_CPU(int, prof_old_multiplier) = 1;
204static DEFINE_PER_CPU(int, prof_counter) = 1;
205
206/* the map used to check if a CPU has booted */
207static __u32 cpu_booted_map;
208
209/* the synchronize flag used to hold all secondary CPUs spinning in
210 * a tight loop until the boot sequence is ready for them */
211static cpumask_t smp_commenced_mask = CPU_MASK_NONE;
212
213/* This is for the new dynamic CPU boot code */
214
215/* The per processor IRQ masks (these are usually kept in sync) */
216static __u16 vic_irq_mask[NR_CPUS] __cacheline_aligned;
217
218/* the list of IRQs to be enabled by the VIC_ENABLE_IRQ_CPI */
219static __u16 vic_irq_enable_mask[NR_CPUS] __cacheline_aligned = { 0 };
220
221/* Lock for enable/disable of VIC interrupts */
222static __cacheline_aligned DEFINE_SPINLOCK(vic_irq_lock);
223
224/* The boot processor is correctly set up in PC mode when it
225 * comes up, but the secondaries need their master/slave 8259
226 * pairs initializing correctly */
227
228/* Interrupt counters (per cpu) and total - used to try to
229 * even up the interrupt handling routines */
230static long vic_intr_total = 0;
231static long vic_intr_count[NR_CPUS] __cacheline_aligned = { 0 };
232static unsigned long vic_tick[NR_CPUS] __cacheline_aligned = { 0 };
233
234/* Since we can only use CPI0, we fake all the other CPIs */
235static unsigned long vic_cpi_mailbox[NR_CPUS] __cacheline_aligned;
236
237/* debugging routine to read the isr of the cpu's pic */
238static inline __u16 vic_read_isr(void)
239{
240 __u16 isr;
241
242 outb(0x0b, 0xa0);
243 isr = inb(0xa0) << 8;
244 outb(0x0b, 0x20);
245 isr |= inb(0x20);
246
247 return isr;
248}
249
250static __init void qic_setup(void)
251{
252 if (!is_cpu_quad()) {
253 /* not a quad, no setup */
254 return;
255 }
256 outb(QIC_DEFAULT_MASK0, QIC_MASK_REGISTER0);
257 outb(QIC_CPI_ENABLE, QIC_MASK_REGISTER1);
258
259 if (is_cpu_extended()) {
260 /* the QIC duplicate of the VIC base register */
261 outb(VIC_DEFAULT_CPI_BASE, QIC_VIC_CPI_BASE_REGISTER);
262 outb(QIC_DEFAULT_CPI_BASE, QIC_CPI_BASE_REGISTER);
263
264 /* FIXME: should set up the QIC timer and memory parity
265 * error vectors here */
266 }
267}
268
269static __init void vic_setup_pic(void)
270{
271 outb(1, VIC_REDIRECT_REGISTER_1);
272 /* clear the claim registers for dynamic routing */
273 outb(0, VIC_CLAIM_REGISTER_0);
274 outb(0, VIC_CLAIM_REGISTER_1);
275
276 outb(0, VIC_PRIORITY_REGISTER);
277 /* Set the Primary and Secondary Microchannel vector
278 * bases to be the same as the ordinary interrupts
279 *
280 * FIXME: This would be more efficient using separate
281 * vectors. */
282 outb(FIRST_EXTERNAL_VECTOR, VIC_PRIMARY_MC_BASE);
283 outb(FIRST_EXTERNAL_VECTOR, VIC_SECONDARY_MC_BASE);
284 /* Now initiallise the master PIC belonging to this CPU by
285 * sending the four ICWs */
286
287 /* ICW1: level triggered, ICW4 needed */
288 outb(0x19, 0x20);
289
290 /* ICW2: vector base */
291 outb(FIRST_EXTERNAL_VECTOR, 0x21);
292
293 /* ICW3: slave at line 2 */
294 outb(0x04, 0x21);
295
296 /* ICW4: 8086 mode */
297 outb(0x01, 0x21);
298
299 /* now the same for the slave PIC */
300
301 /* ICW1: level trigger, ICW4 needed */
302 outb(0x19, 0xA0);
303
304 /* ICW2: slave vector base */
305 outb(FIRST_EXTERNAL_VECTOR + 8, 0xA1);
306
307 /* ICW3: slave ID */
308 outb(0x02, 0xA1);
309
310 /* ICW4: 8086 mode */
311 outb(0x01, 0xA1);
312}
313
314static void do_quad_bootstrap(void)
315{
316 if (is_cpu_quad() && is_cpu_vic_boot()) {
317 int i;
318 unsigned long flags;
319 __u8 cpuid = hard_smp_processor_id();
320
321 local_irq_save(flags);
322
323 for (i = 0; i < 4; i++) {
324 /* FIXME: this would be >>3 &0x7 on the 32 way */
325 if (((cpuid >> 2) & 0x03) == i)
326 /* don't lower our own mask! */
327 continue;
328
329 /* masquerade as local Quad CPU */
330 outb(QIC_CPUID_ENABLE | i, QIC_PROCESSOR_ID);
331 /* enable the startup CPI */
332 outb(QIC_BOOT_CPI_MASK, QIC_MASK_REGISTER1);
333 /* restore cpu id */
334 outb(0, QIC_PROCESSOR_ID);
335 }
336 local_irq_restore(flags);
337 }
338}
339
340void prefill_possible_map(void)
341{
342 /* This is empty on voyager because we need a much
343 * earlier detection which is done in find_smp_config */
344}
345
346/* Set up all the basic stuff: read the SMP config and make all the
347 * SMP information reflect only the boot cpu. All others will be
348 * brought on-line later. */
349void __init find_smp_config(void)
350{
351 int i;
352
353 boot_cpu_id = hard_smp_processor_id();
354
355 printk("VOYAGER SMP: Boot cpu is %d\n", boot_cpu_id);
356
357 /* initialize the CPU structures (moved from smp_boot_cpus) */
358 for (i = 0; i < nr_cpu_ids; i++)
359 cpu_irq_affinity[i] = ~0;
360 cpu_online_map = cpumask_of_cpu(boot_cpu_id);
361
362 /* The boot CPU must be extended */
363 voyager_extended_vic_processors = 1 << boot_cpu_id;
364 /* initially, all of the first 8 CPUs can boot */
365 voyager_allowed_boot_processors = 0xff;
366 /* set up everything for just this CPU, we can alter
367 * this as we start the other CPUs later */
368 /* now get the CPU disposition from the extended CMOS */
369 cpus_addr(voyager_phys_cpu_present_map)[0] =
370 voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK);
371 cpus_addr(voyager_phys_cpu_present_map)[0] |=
372 voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + 1) << 8;
373 cpus_addr(voyager_phys_cpu_present_map)[0] |=
374 voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK +
375 2) << 16;
376 cpus_addr(voyager_phys_cpu_present_map)[0] |=
377 voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK +
378 3) << 24;
379 init_cpu_possible(&voyager_phys_cpu_present_map);
380 printk("VOYAGER SMP: voyager_phys_cpu_present_map = 0x%lx\n",
381 cpus_addr(voyager_phys_cpu_present_map)[0]);
382 /* Here we set up the VIC to enable SMP */
383 /* enable the CPIs by writing the base vector to their register */
384 outb(VIC_DEFAULT_CPI_BASE, VIC_CPI_BASE_REGISTER);
385 outb(1, VIC_REDIRECT_REGISTER_1);
386 /* set the claim registers for static routing --- Boot CPU gets
387 * all interrupts untill all other CPUs started */
388 outb(0xff, VIC_CLAIM_REGISTER_0);
389 outb(0xff, VIC_CLAIM_REGISTER_1);
390 /* Set the Primary and Secondary Microchannel vector
391 * bases to be the same as the ordinary interrupts
392 *
393 * FIXME: This would be more efficient using separate
394 * vectors. */
395 outb(FIRST_EXTERNAL_VECTOR, VIC_PRIMARY_MC_BASE);
396 outb(FIRST_EXTERNAL_VECTOR, VIC_SECONDARY_MC_BASE);
397
398 /* Finally tell the firmware that we're driving */
399 outb(inb(VOYAGER_SUS_IN_CONTROL_PORT) | VOYAGER_IN_CONTROL_FLAG,
400 VOYAGER_SUS_IN_CONTROL_PORT);
401
402 current_thread_info()->cpu = boot_cpu_id;
403 x86_write_percpu(cpu_number, boot_cpu_id);
404}
405
406/*
407 * The bootstrap kernel entry code has set these up. Save them
408 * for a given CPU, id is physical */
409void __init smp_store_cpu_info(int id)
410{
411 struct cpuinfo_x86 *c = &cpu_data(id);
412
413 *c = boot_cpu_data;
414 c->cpu_index = id;
415
416 identify_secondary_cpu(c);
417}
418
419/* Routine initially called when a non-boot CPU is brought online */
420static void __init start_secondary(void *unused)
421{
422 __u8 cpuid = hard_smp_processor_id();
423
424 cpu_init();
425
426 /* OK, we're in the routine */
427 ack_CPI(VIC_CPU_BOOT_CPI);
428
429 /* setup the 8259 master slave pair belonging to this CPU ---
430 * we won't actually receive any until the boot CPU
431 * relinquishes it's static routing mask */
432 vic_setup_pic();
433
434 qic_setup();
435
436 if (is_cpu_quad() && !is_cpu_vic_boot()) {
437 /* clear the boot CPI */
438 __u8 dummy;
439
440 dummy =
441 voyager_quad_cpi_addr[cpuid]->qic_cpi[VIC_CPU_BOOT_CPI].cpi;
442 printk("read dummy %d\n", dummy);
443 }
444
445 /* lower the mask to receive CPIs */
446 vic_enable_cpi();
447
448 VDEBUG(("VOYAGER SMP: CPU%d, stack at about %p\n", cpuid, &cpuid));
449
450 notify_cpu_starting(cpuid);
451
452 /* enable interrupts */
453 local_irq_enable();
454
455 /* get our bogomips */
456 calibrate_delay();
457
458 /* save our processor parameters */
459 smp_store_cpu_info(cpuid);
460
461 /* if we're a quad, we may need to bootstrap other CPUs */
462 do_quad_bootstrap();
463
464 /* FIXME: this is rather a poor hack to prevent the CPU
465 * activating softirqs while it's supposed to be waiting for
466 * permission to proceed. Without this, the new per CPU stuff
467 * in the softirqs will fail */
468 local_irq_disable();
469 cpu_set(cpuid, cpu_callin_map);
470
471 /* signal that we're done */
472 cpu_booted_map = 1;
473
474 while (!cpu_isset(cpuid, smp_commenced_mask))
475 rep_nop();
476 local_irq_enable();
477
478 local_flush_tlb();
479
480 cpu_set(cpuid, cpu_online_map);
481 wmb();
482 cpu_idle();
483}
484
485/* Routine to kick start the given CPU and wait for it to report ready
486 * (or timeout in startup). When this routine returns, the requested
487 * CPU is either fully running and configured or known to be dead.
488 *
489 * We call this routine sequentially 1 CPU at a time, so no need for
490 * locking */
491
492static void __init do_boot_cpu(__u8 cpu)
493{
494 struct task_struct *idle;
495 int timeout;
496 unsigned long flags;
497 int quad_boot = (1 << cpu) & voyager_quad_processors
498 & ~(voyager_extended_vic_processors
499 & voyager_allowed_boot_processors);
500
501 /* This is the format of the CPI IDT gate (in real mode) which
502 * we're hijacking to boot the CPU */
503 union IDTFormat {
504 struct seg {
505 __u16 Offset;
506 __u16 Segment;
507 } idt;
508 __u32 val;
509 } hijack_source;
510
511 __u32 *hijack_vector;
512 __u32 start_phys_address = setup_trampoline();
513
514 /* There's a clever trick to this: The linux trampoline is
515 * compiled to begin at absolute location zero, so make the
516 * address zero but have the data segment selector compensate
517 * for the actual address */
518 hijack_source.idt.Offset = start_phys_address & 0x000F;
519 hijack_source.idt.Segment = (start_phys_address >> 4) & 0xFFFF;
520
521 cpucount++;
522 alternatives_smp_switch(1);
523
524 idle = fork_idle(cpu);
525 if (IS_ERR(idle))
526 panic("failed fork for CPU%d", cpu);
527 idle->thread.ip = (unsigned long)start_secondary;
528 /* init_tasks (in sched.c) is indexed logically */
529 stack_start.sp = (void *)idle->thread.sp;
530
531 init_gdt(cpu);
532 per_cpu(current_task, cpu) = idle;
533 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
534 irq_ctx_init(cpu);
535
536 /* Note: Don't modify initial ss override */
537 VDEBUG(("VOYAGER SMP: Booting CPU%d at 0x%lx[%x:%x], stack %p\n", cpu,
538 (unsigned long)hijack_source.val, hijack_source.idt.Segment,
539 hijack_source.idt.Offset, stack_start.sp));
540
541 /* init lowmem identity mapping */
542 clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY,
543 min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
544 flush_tlb_all();
545
546 if (quad_boot) {
547 printk("CPU %d: non extended Quad boot\n", cpu);
548 hijack_vector =
549 (__u32 *)
550 phys_to_virt((VIC_CPU_BOOT_CPI + QIC_DEFAULT_CPI_BASE) * 4);
551 *hijack_vector = hijack_source.val;
552 } else {
553 printk("CPU%d: extended VIC boot\n", cpu);
554 hijack_vector =
555 (__u32 *)
556 phys_to_virt((VIC_CPU_BOOT_CPI + VIC_DEFAULT_CPI_BASE) * 4);
557 *hijack_vector = hijack_source.val;
558 /* VIC errata, may also receive interrupt at this address */
559 hijack_vector =
560 (__u32 *)
561 phys_to_virt((VIC_CPU_BOOT_ERRATA_CPI +
562 VIC_DEFAULT_CPI_BASE) * 4);
563 *hijack_vector = hijack_source.val;
564 }
565 /* All non-boot CPUs start with interrupts fully masked. Need
566 * to lower the mask of the CPI we're about to send. We do
567 * this in the VIC by masquerading as the processor we're
568 * about to boot and lowering its interrupt mask */
569 local_irq_save(flags);
570 if (quad_boot) {
571 send_one_QIC_CPI(cpu, VIC_CPU_BOOT_CPI);
572 } else {
573 outb(VIC_CPU_MASQUERADE_ENABLE | cpu, VIC_PROCESSOR_ID);
574 /* here we're altering registers belonging to `cpu' */
575
576 outb(VIC_BOOT_INTERRUPT_MASK, 0x21);
577 /* now go back to our original identity */
578 outb(boot_cpu_id, VIC_PROCESSOR_ID);
579
580 /* and boot the CPU */
581
582 send_CPI((1 << cpu), VIC_CPU_BOOT_CPI);
583 }
584 cpu_booted_map = 0;
585 local_irq_restore(flags);
586
587 /* now wait for it to become ready (or timeout) */
588 for (timeout = 0; timeout < 50000; timeout++) {
589 if (cpu_booted_map)
590 break;
591 udelay(100);
592 }
593 /* reset the page table */
594 zap_low_mappings();
595
596 if (cpu_booted_map) {
597 VDEBUG(("CPU%d: Booted successfully, back in CPU %d\n",
598 cpu, smp_processor_id()));
599
600 printk("CPU%d: ", cpu);
601 print_cpu_info(&cpu_data(cpu));
602 wmb();
603 cpu_set(cpu, cpu_callout_map);
604 cpu_set(cpu, cpu_present_map);
605 } else {
606 printk("CPU%d FAILED TO BOOT: ", cpu);
607 if (*
608 ((volatile unsigned char *)phys_to_virt(start_phys_address))
609 == 0xA5)
610 printk("Stuck.\n");
611 else
612 printk("Not responding.\n");
613
614 cpucount--;
615 }
616}
617
618void __init smp_boot_cpus(void)
619{
620 int i;
621
622 /* CAT BUS initialisation must be done after the memory */
623 /* FIXME: The L4 has a catbus too, it just needs to be
624 * accessed in a totally different way */
625 if (voyager_level == 5) {
626 voyager_cat_init();
627
628 /* now that the cat has probed the Voyager System Bus, sanity
629 * check the cpu map */
630 if (((voyager_quad_processors | voyager_extended_vic_processors)
631 & cpus_addr(voyager_phys_cpu_present_map)[0]) !=
632 cpus_addr(voyager_phys_cpu_present_map)[0]) {
633 /* should panic */
634 printk("\n\n***WARNING*** "
635 "Sanity check of CPU present map FAILED\n");
636 }
637 } else if (voyager_level == 4)
638 voyager_extended_vic_processors =
639 cpus_addr(voyager_phys_cpu_present_map)[0];
640
641 /* this sets up the idle task to run on the current cpu */
642 voyager_extended_cpus = 1;
643 /* Remove the global_irq_holder setting, it triggers a BUG() on
644 * schedule at the moment */
645 //global_irq_holder = boot_cpu_id;
646
647 /* FIXME: Need to do something about this but currently only works
648 * on CPUs with a tsc which none of mine have.
649 smp_tune_scheduling();
650 */
651 smp_store_cpu_info(boot_cpu_id);
652 /* setup the jump vector */
653 initial_code = (unsigned long)initialize_secondary;
654 printk("CPU%d: ", boot_cpu_id);
655 print_cpu_info(&cpu_data(boot_cpu_id));
656
657 if (is_cpu_quad()) {
658 /* booting on a Quad CPU */
659 printk("VOYAGER SMP: Boot CPU is Quad\n");
660 qic_setup();
661 do_quad_bootstrap();
662 }
663
664 /* enable our own CPIs */
665 vic_enable_cpi();
666
667 cpu_set(boot_cpu_id, cpu_online_map);
668 cpu_set(boot_cpu_id, cpu_callout_map);
669
670 /* loop over all the extended VIC CPUs and boot them. The
671 * Quad CPUs must be bootstrapped by their extended VIC cpu */
672 for (i = 0; i < nr_cpu_ids; i++) {
673 if (i == boot_cpu_id || !cpu_isset(i, voyager_phys_cpu_present_map))
674 continue;
675 do_boot_cpu(i);
676 /* This udelay seems to be needed for the Quad boots
677 * don't remove unless you know what you're doing */
678 udelay(1000);
679 }
680 /* we could compute the total bogomips here, but why bother?,
681 * Code added from smpboot.c */
682 {
683 unsigned long bogosum = 0;
684
685 for_each_online_cpu(i)
686 bogosum += cpu_data(i).loops_per_jiffy;
687 printk(KERN_INFO "Total of %d processors activated "
688 "(%lu.%02lu BogoMIPS).\n",
689 cpucount + 1, bogosum / (500000 / HZ),
690 (bogosum / (5000 / HZ)) % 100);
691 }
692 voyager_extended_cpus = hweight32(voyager_extended_vic_processors);
693 printk("VOYAGER: Extended (interrupt handling CPUs): "
694 "%d, non-extended: %d\n", voyager_extended_cpus,
695 num_booting_cpus() - voyager_extended_cpus);
696 /* that's it, switch to symmetric mode */
697 outb(0, VIC_PRIORITY_REGISTER);
698 outb(0, VIC_CLAIM_REGISTER_0);
699 outb(0, VIC_CLAIM_REGISTER_1);
700
701 VDEBUG(("VOYAGER SMP: Booted with %d CPUs\n", num_booting_cpus()));
702}
703
704/* Reload the secondary CPUs task structure (this function does not
705 * return ) */
706static void __init initialize_secondary(void)
707{
708#if 0
709 // AC kernels only
710 set_current(hard_get_current());
711#endif
712
713 /*
714 * We don't actually need to load the full TSS,
715 * basically just the stack pointer and the eip.
716 */
717
718 asm volatile ("movl %0,%%esp\n\t"
719 "jmp *%1"::"r" (current->thread.sp),
720 "r"(current->thread.ip));
721}
722
723/* handle a Voyager SYS_INT -- If we don't, the base board will
724 * panic the system.
725 *
726 * System interrupts occur because some problem was detected on the
727 * various busses. To find out what you have to probe all the
728 * hardware via the CAT bus. FIXME: At the moment we do nothing. */
729void smp_vic_sys_interrupt(struct pt_regs *regs)
730{
731 ack_CPI(VIC_SYS_INT);
732 printk("Voyager SYSTEM INTERRUPT\n");
733}
734
735/* Handle a voyager CMN_INT; These interrupts occur either because of
736 * a system status change or because a single bit memory error
737 * occurred. FIXME: At the moment, ignore all this. */
738void smp_vic_cmn_interrupt(struct pt_regs *regs)
739{
740 static __u8 in_cmn_int = 0;
741 static DEFINE_SPINLOCK(cmn_int_lock);
742
743 /* common ints are broadcast, so make sure we only do this once */
744 _raw_spin_lock(&cmn_int_lock);
745 if (in_cmn_int)
746 goto unlock_end;
747
748 in_cmn_int++;
749 _raw_spin_unlock(&cmn_int_lock);
750
751 VDEBUG(("Voyager COMMON INTERRUPT\n"));
752
753 if (voyager_level == 5)
754 voyager_cat_do_common_interrupt();
755
756 _raw_spin_lock(&cmn_int_lock);
757 in_cmn_int = 0;
758 unlock_end:
759 _raw_spin_unlock(&cmn_int_lock);
760 ack_CPI(VIC_CMN_INT);
761}
762
763/*
764 * Reschedule call back. Nothing to do, all the work is done
765 * automatically when we return from the interrupt. */
766static void smp_reschedule_interrupt(void)
767{
768 /* do nothing */
769}
770
771static struct mm_struct *flush_mm;
772static unsigned long flush_va;
773static DEFINE_SPINLOCK(tlbstate_lock);
774
775/*
776 * We cannot call mmdrop() because we are in interrupt context,
777 * instead update mm->cpu_vm_mask.
778 *
779 * We need to reload %cr3 since the page tables may be going
780 * away from under us..
781 */
782static inline void voyager_leave_mm(unsigned long cpu)
783{
784 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
785 BUG();
786 cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
787 load_cr3(swapper_pg_dir);
788}
789
790/*
791 * Invalidate call-back
792 */
793static void smp_invalidate_interrupt(void)
794{
795 __u8 cpu = smp_processor_id();
796
797 if (!test_bit(cpu, &smp_invalidate_needed))
798 return;
799 /* This will flood messages. Don't uncomment unless you see
800 * Problems with cross cpu invalidation
801 VDEBUG(("VOYAGER SMP: CPU%d received INVALIDATE_CPI\n",
802 smp_processor_id()));
803 */
804
805 if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
806 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
807 if (flush_va == TLB_FLUSH_ALL)
808 local_flush_tlb();
809 else
810 __flush_tlb_one(flush_va);
811 } else
812 voyager_leave_mm(cpu);
813 }
814 smp_mb__before_clear_bit();
815 clear_bit(cpu, &smp_invalidate_needed);
816 smp_mb__after_clear_bit();
817}
818
819/* All the new flush operations for 2.4 */
820
821/* This routine is called with a physical cpu mask */
822static void
823voyager_flush_tlb_others(unsigned long cpumask, struct mm_struct *mm,
824 unsigned long va)
825{
826 int stuck = 50000;
827
828 if (!cpumask)
829 BUG();
830 if ((cpumask & cpus_addr(cpu_online_map)[0]) != cpumask)
831 BUG();
832 if (cpumask & (1 << smp_processor_id()))
833 BUG();
834 if (!mm)
835 BUG();
836
837 spin_lock(&tlbstate_lock);
838
839 flush_mm = mm;
840 flush_va = va;
841 atomic_set_mask(cpumask, &smp_invalidate_needed);
842 /*
843 * We have to send the CPI only to
844 * CPUs affected.
845 */
846 send_CPI(cpumask, VIC_INVALIDATE_CPI);
847
848 while (smp_invalidate_needed) {
849 mb();
850 if (--stuck == 0) {
851 printk("***WARNING*** Stuck doing invalidate CPI "
852 "(CPU%d)\n", smp_processor_id());
853 break;
854 }
855 }
856
857 /* Uncomment only to debug invalidation problems
858 VDEBUG(("VOYAGER SMP: Completed invalidate CPI (CPU%d)\n", cpu));
859 */
860
861 flush_mm = NULL;
862 flush_va = 0;
863 spin_unlock(&tlbstate_lock);
864}
865
866void flush_tlb_current_task(void)
867{
868 struct mm_struct *mm = current->mm;
869 unsigned long cpu_mask;
870
871 preempt_disable();
872
873 cpu_mask = cpus_addr(mm->cpu_vm_mask)[0] & ~(1 << smp_processor_id());
874 local_flush_tlb();
875 if (cpu_mask)
876 voyager_flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
877
878 preempt_enable();
879}
880
881void flush_tlb_mm(struct mm_struct *mm)
882{
883 unsigned long cpu_mask;
884
885 preempt_disable();
886
887 cpu_mask = cpus_addr(mm->cpu_vm_mask)[0] & ~(1 << smp_processor_id());
888
889 if (current->active_mm == mm) {
890 if (current->mm)
891 local_flush_tlb();
892 else
893 voyager_leave_mm(smp_processor_id());
894 }
895 if (cpu_mask)
896 voyager_flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
897
898 preempt_enable();
899}
900
901void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
902{
903 struct mm_struct *mm = vma->vm_mm;
904 unsigned long cpu_mask;
905
906 preempt_disable();
907
908 cpu_mask = cpus_addr(mm->cpu_vm_mask)[0] & ~(1 << smp_processor_id());
909 if (current->active_mm == mm) {
910 if (current->mm)
911 __flush_tlb_one(va);
912 else
913 voyager_leave_mm(smp_processor_id());
914 }
915
916 if (cpu_mask)
917 voyager_flush_tlb_others(cpu_mask, mm, va);
918
919 preempt_enable();
920}
921
922EXPORT_SYMBOL(flush_tlb_page);
923
924/* enable the requested IRQs */
925static void smp_enable_irq_interrupt(void)
926{
927 __u8 irq;
928 __u8 cpu = get_cpu();
929
930 VDEBUG(("VOYAGER SMP: CPU%d enabling irq mask 0x%x\n", cpu,
931 vic_irq_enable_mask[cpu]));
932
933 spin_lock(&vic_irq_lock);
934 for (irq = 0; irq < 16; irq++) {
935 if (vic_irq_enable_mask[cpu] & (1 << irq))
936 enable_local_vic_irq(irq);
937 }
938 vic_irq_enable_mask[cpu] = 0;
939 spin_unlock(&vic_irq_lock);
940
941 put_cpu_no_resched();
942}
943
944/*
945 * CPU halt call-back
946 */
947static void smp_stop_cpu_function(void *dummy)
948{
949 VDEBUG(("VOYAGER SMP: CPU%d is STOPPING\n", smp_processor_id()));
950 cpu_clear(smp_processor_id(), cpu_online_map);
951 local_irq_disable();
952 for (;;)
953 halt();
954}
955
956/* execute a thread on a new CPU. The function to be called must be
957 * previously set up. This is used to schedule a function for
958 * execution on all CPUs - set up the function then broadcast a
959 * function_interrupt CPI to come here on each CPU */
960static void smp_call_function_interrupt(void)
961{
962 irq_enter();
963 generic_smp_call_function_interrupt();
964 __get_cpu_var(irq_stat).irq_call_count++;
965 irq_exit();
966}
967
968static void smp_call_function_single_interrupt(void)
969{
970 irq_enter();
971 generic_smp_call_function_single_interrupt();
972 __get_cpu_var(irq_stat).irq_call_count++;
973 irq_exit();
974}
975
976/* Sorry about the name. In an APIC based system, the APICs
977 * themselves are programmed to send a timer interrupt. This is used
978 * by linux to reschedule the processor. Voyager doesn't have this,
979 * so we use the system clock to interrupt one processor, which in
980 * turn, broadcasts a timer CPI to all the others --- we receive that
981 * CPI here. We don't use this actually for counting so losing
982 * ticks doesn't matter
983 *
984 * FIXME: For those CPUs which actually have a local APIC, we could
985 * try to use it to trigger this interrupt instead of having to
986 * broadcast the timer tick. Unfortunately, all my pentium DYADs have
987 * no local APIC, so I can't do this
988 *
989 * This function is currently a placeholder and is unused in the code */
990void smp_apic_timer_interrupt(struct pt_regs *regs)
991{
992 struct pt_regs *old_regs = set_irq_regs(regs);
993 wrapper_smp_local_timer_interrupt();
994 set_irq_regs(old_regs);
995}
996
997/* All of the QUAD interrupt GATES */
998void smp_qic_timer_interrupt(struct pt_regs *regs)
999{
1000 struct pt_regs *old_regs = set_irq_regs(regs);
1001 ack_QIC_CPI(QIC_TIMER_CPI);
1002 wrapper_smp_local_timer_interrupt();
1003 set_irq_regs(old_regs);
1004}
1005
1006void smp_qic_invalidate_interrupt(struct pt_regs *regs)
1007{
1008 ack_QIC_CPI(QIC_INVALIDATE_CPI);
1009 smp_invalidate_interrupt();
1010}
1011
1012void smp_qic_reschedule_interrupt(struct pt_regs *regs)
1013{
1014 ack_QIC_CPI(QIC_RESCHEDULE_CPI);
1015 smp_reschedule_interrupt();
1016}
1017
1018void smp_qic_enable_irq_interrupt(struct pt_regs *regs)
1019{
1020 ack_QIC_CPI(QIC_ENABLE_IRQ_CPI);
1021 smp_enable_irq_interrupt();
1022}
1023
1024void smp_qic_call_function_interrupt(struct pt_regs *regs)
1025{
1026 ack_QIC_CPI(QIC_CALL_FUNCTION_CPI);
1027 smp_call_function_interrupt();
1028}
1029
1030void smp_qic_call_function_single_interrupt(struct pt_regs *regs)
1031{
1032 ack_QIC_CPI(QIC_CALL_FUNCTION_SINGLE_CPI);
1033 smp_call_function_single_interrupt();
1034}
1035
1036void smp_vic_cpi_interrupt(struct pt_regs *regs)
1037{
1038 struct pt_regs *old_regs = set_irq_regs(regs);
1039 __u8 cpu = smp_processor_id();
1040
1041 if (is_cpu_quad())
1042 ack_QIC_CPI(VIC_CPI_LEVEL0);
1043 else
1044 ack_VIC_CPI(VIC_CPI_LEVEL0);
1045
1046 if (test_and_clear_bit(VIC_TIMER_CPI, &vic_cpi_mailbox[cpu]))
1047 wrapper_smp_local_timer_interrupt();
1048 if (test_and_clear_bit(VIC_INVALIDATE_CPI, &vic_cpi_mailbox[cpu]))
1049 smp_invalidate_interrupt();
1050 if (test_and_clear_bit(VIC_RESCHEDULE_CPI, &vic_cpi_mailbox[cpu]))
1051 smp_reschedule_interrupt();
1052 if (test_and_clear_bit(VIC_ENABLE_IRQ_CPI, &vic_cpi_mailbox[cpu]))
1053 smp_enable_irq_interrupt();
1054 if (test_and_clear_bit(VIC_CALL_FUNCTION_CPI, &vic_cpi_mailbox[cpu]))
1055 smp_call_function_interrupt();
1056 if (test_and_clear_bit(VIC_CALL_FUNCTION_SINGLE_CPI, &vic_cpi_mailbox[cpu]))
1057 smp_call_function_single_interrupt();
1058 set_irq_regs(old_regs);
1059}
1060
1061static void do_flush_tlb_all(void *info)
1062{
1063 unsigned long cpu = smp_processor_id();
1064
1065 __flush_tlb_all();
1066 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
1067 voyager_leave_mm(cpu);
1068}
1069
1070/* flush the TLB of every active CPU in the system */
1071void flush_tlb_all(void)
1072{
1073 on_each_cpu(do_flush_tlb_all, 0, 1);
1074}
1075
1076/* send a reschedule CPI to one CPU by physical CPU number*/
1077static void voyager_smp_send_reschedule(int cpu)
1078{
1079 send_one_CPI(cpu, VIC_RESCHEDULE_CPI);
1080}
1081
1082int hard_smp_processor_id(void)
1083{
1084 __u8 i;
1085 __u8 cpumask = inb(VIC_PROC_WHO_AM_I);
1086 if ((cpumask & QUAD_IDENTIFIER) == QUAD_IDENTIFIER)
1087 return cpumask & 0x1F;
1088
1089 for (i = 0; i < 8; i++) {
1090 if (cpumask & (1 << i))
1091 return i;
1092 }
1093 printk("** WARNING ** Illegal cpuid returned by VIC: %d", cpumask);
1094 return 0;
1095}
1096
1097int safe_smp_processor_id(void)
1098{
1099 return hard_smp_processor_id();
1100}
1101
1102/* broadcast a halt to all other CPUs */
1103static void voyager_smp_send_stop(void)
1104{
1105 smp_call_function(smp_stop_cpu_function, NULL, 1);
1106}
1107
1108/* this function is triggered in time.c when a clock tick fires
1109 * we need to re-broadcast the tick to all CPUs */
1110void smp_vic_timer_interrupt(void)
1111{
1112 send_CPI_allbutself(VIC_TIMER_CPI);
1113 smp_local_timer_interrupt();
1114}
1115
1116/* local (per CPU) timer interrupt. It does both profiling and
1117 * process statistics/rescheduling.
1118 *
1119 * We do profiling in every local tick, statistics/rescheduling
1120 * happen only every 'profiling multiplier' ticks. The default
1121 * multiplier is 1 and it can be changed by writing the new multiplier
1122 * value into /proc/profile.
1123 */
1124void smp_local_timer_interrupt(void)
1125{
1126 int cpu = smp_processor_id();
1127 long weight;
1128
1129 profile_tick(CPU_PROFILING);
1130 if (--per_cpu(prof_counter, cpu) <= 0) {
1131 /*
1132 * The multiplier may have changed since the last time we got
1133 * to this point as a result of the user writing to
1134 * /proc/profile. In this case we need to adjust the APIC
1135 * timer accordingly.
1136 *
1137 * Interrupts are already masked off at this point.
1138 */
1139 per_cpu(prof_counter, cpu) = per_cpu(prof_multiplier, cpu);
1140 if (per_cpu(prof_counter, cpu) !=
1141 per_cpu(prof_old_multiplier, cpu)) {
1142 /* FIXME: need to update the vic timer tick here */
1143 per_cpu(prof_old_multiplier, cpu) =
1144 per_cpu(prof_counter, cpu);
1145 }
1146
1147 update_process_times(user_mode_vm(get_irq_regs()));
1148 }
1149
1150 if (((1 << cpu) & voyager_extended_vic_processors) == 0)
1151 /* only extended VIC processors participate in
1152 * interrupt distribution */
1153 return;
1154
1155 /*
1156 * We take the 'long' return path, and there every subsystem
1157 * grabs the appropriate locks (kernel lock/ irq lock).
1158 *
1159 * we might want to decouple profiling from the 'long path',
1160 * and do the profiling totally in assembly.
1161 *
1162 * Currently this isn't too much of an issue (performance wise),
1163 * we can take more than 100K local irqs per second on a 100 MHz P5.
1164 */
1165
1166 if ((++vic_tick[cpu] & 0x7) != 0)
1167 return;
1168 /* get here every 16 ticks (about every 1/6 of a second) */
1169
1170 /* Change our priority to give someone else a chance at getting
1171 * the IRQ. The algorithm goes like this:
1172 *
1173 * In the VIC, the dynamically routed interrupt is always
1174 * handled by the lowest priority eligible (i.e. receiving
1175 * interrupts) CPU. If >1 eligible CPUs are equal lowest, the
1176 * lowest processor number gets it.
1177 *
1178 * The priority of a CPU is controlled by a special per-CPU
1179 * VIC priority register which is 3 bits wide 0 being lowest
1180 * and 7 highest priority..
1181 *
1182 * Therefore we subtract the average number of interrupts from
1183 * the number we've fielded. If this number is negative, we
1184 * lower the activity count and if it is positive, we raise
1185 * it.
1186 *
1187 * I'm afraid this still leads to odd looking interrupt counts:
1188 * the totals are all roughly equal, but the individual ones
1189 * look rather skewed.
1190 *
1191 * FIXME: This algorithm is total crap when mixed with SMP
1192 * affinity code since we now try to even up the interrupt
1193 * counts when an affinity binding is keeping them on a
1194 * particular CPU*/
1195 weight = (vic_intr_count[cpu] * voyager_extended_cpus
1196 - vic_intr_total) >> 4;
1197 weight += 4;
1198 if (weight > 7)
1199 weight = 7;
1200 if (weight < 0)
1201 weight = 0;
1202
1203 outb((__u8) weight, VIC_PRIORITY_REGISTER);
1204
1205#ifdef VOYAGER_DEBUG
1206 if ((vic_tick[cpu] & 0xFFF) == 0) {
1207 /* print this message roughly every 25 secs */
1208 printk("VOYAGER SMP: vic_tick[%d] = %lu, weight = %ld\n",
1209 cpu, vic_tick[cpu], weight);
1210 }
1211#endif
1212}
1213
1214/* setup the profiling timer */
1215int setup_profiling_timer(unsigned int multiplier)
1216{
1217 int i;
1218
1219 if ((!multiplier))
1220 return -EINVAL;
1221
1222 /*
1223 * Set the new multiplier for each CPU. CPUs don't start using the
1224 * new values until the next timer interrupt in which they do process
1225 * accounting.
1226 */
1227 for (i = 0; i < nr_cpu_ids; ++i)
1228 per_cpu(prof_multiplier, i) = multiplier;
1229
1230 return 0;
1231}
1232
1233/* This is a bit of a mess, but forced on us by the genirq changes
1234 * there's no genirq handler that really does what voyager wants
1235 * so hack it up with the simple IRQ handler */
1236static void handle_vic_irq(unsigned int irq, struct irq_desc *desc)
1237{
1238 before_handle_vic_irq(irq);
1239 handle_simple_irq(irq, desc);
1240 after_handle_vic_irq(irq);
1241}
1242
1243/* The CPIs are handled in the per cpu 8259s, so they must be
1244 * enabled to be received: FIX: enabling the CPIs in the early
1245 * boot sequence interferes with bug checking; enable them later
1246 * on in smp_init */
1247#define VIC_SET_GATE(cpi, vector) \
1248 set_intr_gate((cpi) + VIC_DEFAULT_CPI_BASE, (vector))
1249#define QIC_SET_GATE(cpi, vector) \
1250 set_intr_gate((cpi) + QIC_DEFAULT_CPI_BASE, (vector))
1251
1252void __init voyager_smp_intr_init(void)
1253{
1254 int i;
1255
1256 /* initialize the per cpu irq mask to all disabled */
1257 for (i = 0; i < nr_cpu_ids; i++)
1258 vic_irq_mask[i] = 0xFFFF;
1259
1260 VIC_SET_GATE(VIC_CPI_LEVEL0, vic_cpi_interrupt);
1261
1262 VIC_SET_GATE(VIC_SYS_INT, vic_sys_interrupt);
1263 VIC_SET_GATE(VIC_CMN_INT, vic_cmn_interrupt);
1264
1265 QIC_SET_GATE(QIC_TIMER_CPI, qic_timer_interrupt);
1266 QIC_SET_GATE(QIC_INVALIDATE_CPI, qic_invalidate_interrupt);
1267 QIC_SET_GATE(QIC_RESCHEDULE_CPI, qic_reschedule_interrupt);
1268 QIC_SET_GATE(QIC_ENABLE_IRQ_CPI, qic_enable_irq_interrupt);
1269 QIC_SET_GATE(QIC_CALL_FUNCTION_CPI, qic_call_function_interrupt);
1270
1271 /* now put the VIC descriptor into the first 48 IRQs
1272 *
1273 * This is for later: first 16 correspond to PC IRQs; next 16
1274 * are Primary MC IRQs and final 16 are Secondary MC IRQs */
1275 for (i = 0; i < 48; i++)
1276 set_irq_chip_and_handler(i, &vic_chip, handle_vic_irq);
1277}
1278
1279/* send a CPI at level cpi to a set of cpus in cpuset (set 1 bit per
1280 * processor to receive CPI */
1281static void send_CPI(__u32 cpuset, __u8 cpi)
1282{
1283 int cpu;
1284 __u32 quad_cpuset = (cpuset & voyager_quad_processors);
1285
1286 if (cpi < VIC_START_FAKE_CPI) {
1287 /* fake CPI are only used for booting, so send to the
1288 * extended quads as well---Quads must be VIC booted */
1289 outb((__u8) (cpuset), VIC_CPI_Registers[cpi]);
1290 return;
1291 }
1292 if (quad_cpuset)
1293 send_QIC_CPI(quad_cpuset, cpi);
1294 cpuset &= ~quad_cpuset;
1295 cpuset &= 0xff; /* only first 8 CPUs vaild for VIC CPI */
1296 if (cpuset == 0)
1297 return;
1298 for_each_online_cpu(cpu) {
1299 if (cpuset & (1 << cpu))
1300 set_bit(cpi, &vic_cpi_mailbox[cpu]);
1301 }
1302 if (cpuset)
1303 outb((__u8) cpuset, VIC_CPI_Registers[VIC_CPI_LEVEL0]);
1304}
1305
1306/* Acknowledge receipt of CPI in the QIC, clear in QIC hardware and
1307 * set the cache line to shared by reading it.
1308 *
1309 * DON'T make this inline otherwise the cache line read will be
1310 * optimised away
1311 * */
1312static int ack_QIC_CPI(__u8 cpi)
1313{
1314 __u8 cpu = hard_smp_processor_id();
1315
1316 cpi &= 7;
1317
1318 outb(1 << cpi, QIC_INTERRUPT_CLEAR1);
1319 return voyager_quad_cpi_addr[cpu]->qic_cpi[cpi].cpi;
1320}
1321
1322static void ack_special_QIC_CPI(__u8 cpi)
1323{
1324 switch (cpi) {
1325 case VIC_CMN_INT:
1326 outb(QIC_CMN_INT, QIC_INTERRUPT_CLEAR0);
1327 break;
1328 case VIC_SYS_INT:
1329 outb(QIC_SYS_INT, QIC_INTERRUPT_CLEAR0);
1330 break;
1331 }
1332 /* also clear at the VIC, just in case (nop for non-extended proc) */
1333 ack_VIC_CPI(cpi);
1334}
1335
1336/* Acknowledge receipt of CPI in the VIC (essentially an EOI) */
1337static void ack_VIC_CPI(__u8 cpi)
1338{
1339#ifdef VOYAGER_DEBUG
1340 unsigned long flags;
1341 __u16 isr;
1342 __u8 cpu = smp_processor_id();
1343
1344 local_irq_save(flags);
1345 isr = vic_read_isr();
1346 if ((isr & (1 << (cpi & 7))) == 0) {
1347 printk("VOYAGER SMP: CPU%d lost CPI%d\n", cpu, cpi);
1348 }
1349#endif
1350 /* send specific EOI; the two system interrupts have
1351 * bit 4 set for a separate vector but behave as the
1352 * corresponding 3 bit intr */
1353 outb_p(0x60 | (cpi & 7), 0x20);
1354
1355#ifdef VOYAGER_DEBUG
1356 if ((vic_read_isr() & (1 << (cpi & 7))) != 0) {
1357 printk("VOYAGER SMP: CPU%d still asserting CPI%d\n", cpu, cpi);
1358 }
1359 local_irq_restore(flags);
1360#endif
1361}
1362
1363/* cribbed with thanks from irq.c */
1364#define __byte(x,y) (((unsigned char *)&(y))[x])
1365#define cached_21(cpu) (__byte(0,vic_irq_mask[cpu]))
1366#define cached_A1(cpu) (__byte(1,vic_irq_mask[cpu]))
1367
1368static unsigned int startup_vic_irq(unsigned int irq)
1369{
1370 unmask_vic_irq(irq);
1371
1372 return 0;
1373}
1374
1375/* The enable and disable routines. This is where we run into
1376 * conflicting architectural philosophy. Fundamentally, the voyager
1377 * architecture does not expect to have to disable interrupts globally
1378 * (the IRQ controllers belong to each CPU). The processor masquerade
1379 * which is used to start the system shouldn't be used in a running OS
1380 * since it will cause great confusion if two separate CPUs drive to
1381 * the same IRQ controller (I know, I've tried it).
1382 *
1383 * The solution is a variant on the NCR lazy SPL design:
1384 *
1385 * 1) To disable an interrupt, do nothing (other than set the
1386 * IRQ_DISABLED flag). This dares the interrupt actually to arrive.
1387 *
1388 * 2) If the interrupt dares to come in, raise the local mask against
1389 * it (this will result in all the CPU masks being raised
1390 * eventually).
1391 *
1392 * 3) To enable the interrupt, lower the mask on the local CPU and
1393 * broadcast an Interrupt enable CPI which causes all other CPUs to
1394 * adjust their masks accordingly. */
1395
1396static void unmask_vic_irq(unsigned int irq)
1397{
1398 /* linux doesn't to processor-irq affinity, so enable on
1399 * all CPUs we know about */
1400 int cpu = smp_processor_id(), real_cpu;
1401 __u16 mask = (1 << irq);
1402 __u32 processorList = 0;
1403 unsigned long flags;
1404
1405 VDEBUG(("VOYAGER: unmask_vic_irq(%d) CPU%d affinity 0x%lx\n",
1406 irq, cpu, cpu_irq_affinity[cpu]));
1407 spin_lock_irqsave(&vic_irq_lock, flags);
1408 for_each_online_cpu(real_cpu) {
1409 if (!(voyager_extended_vic_processors & (1 << real_cpu)))
1410 continue;
1411 if (!(cpu_irq_affinity[real_cpu] & mask)) {
1412 /* irq has no affinity for this CPU, ignore */
1413 continue;
1414 }
1415 if (real_cpu == cpu) {
1416 enable_local_vic_irq(irq);
1417 } else if (vic_irq_mask[real_cpu] & mask) {
1418 vic_irq_enable_mask[real_cpu] |= mask;
1419 processorList |= (1 << real_cpu);
1420 }
1421 }
1422 spin_unlock_irqrestore(&vic_irq_lock, flags);
1423 if (processorList)
1424 send_CPI(processorList, VIC_ENABLE_IRQ_CPI);
1425}
1426
1427static void mask_vic_irq(unsigned int irq)
1428{
1429 /* lazy disable, do nothing */
1430}
1431
1432static void enable_local_vic_irq(unsigned int irq)
1433{
1434 __u8 cpu = smp_processor_id();
1435 __u16 mask = ~(1 << irq);
1436 __u16 old_mask = vic_irq_mask[cpu];
1437
1438 vic_irq_mask[cpu] &= mask;
1439 if (vic_irq_mask[cpu] == old_mask)
1440 return;
1441
1442 VDEBUG(("VOYAGER DEBUG: Enabling irq %d in hardware on CPU %d\n",
1443 irq, cpu));
1444
1445 if (irq & 8) {
1446 outb_p(cached_A1(cpu), 0xA1);
1447 (void)inb_p(0xA1);
1448 } else {
1449 outb_p(cached_21(cpu), 0x21);
1450 (void)inb_p(0x21);
1451 }
1452}
1453
1454static void disable_local_vic_irq(unsigned int irq)
1455{
1456 __u8 cpu = smp_processor_id();
1457 __u16 mask = (1 << irq);
1458 __u16 old_mask = vic_irq_mask[cpu];
1459
1460 if (irq == 7)
1461 return;
1462
1463 vic_irq_mask[cpu] |= mask;
1464 if (old_mask == vic_irq_mask[cpu])
1465 return;
1466
1467 VDEBUG(("VOYAGER DEBUG: Disabling irq %d in hardware on CPU %d\n",
1468 irq, cpu));
1469
1470 if (irq & 8) {
1471 outb_p(cached_A1(cpu), 0xA1);
1472 (void)inb_p(0xA1);
1473 } else {
1474 outb_p(cached_21(cpu), 0x21);
1475 (void)inb_p(0x21);
1476 }
1477}
1478
1479/* The VIC is level triggered, so the ack can only be issued after the
1480 * interrupt completes. However, we do Voyager lazy interrupt
1481 * handling here: It is an extremely expensive operation to mask an
1482 * interrupt in the vic, so we merely set a flag (IRQ_DISABLED). If
1483 * this interrupt actually comes in, then we mask and ack here to push
1484 * the interrupt off to another CPU */
1485static void before_handle_vic_irq(unsigned int irq)
1486{
1487 irq_desc_t *desc = irq_to_desc(irq);
1488 __u8 cpu = smp_processor_id();
1489
1490 _raw_spin_lock(&vic_irq_lock);
1491 vic_intr_total++;
1492 vic_intr_count[cpu]++;
1493
1494 if (!(cpu_irq_affinity[cpu] & (1 << irq))) {
1495 /* The irq is not in our affinity mask, push it off
1496 * onto another CPU */
1497 VDEBUG(("VOYAGER DEBUG: affinity triggered disable of irq %d "
1498 "on cpu %d\n", irq, cpu));
1499 disable_local_vic_irq(irq);
1500 /* set IRQ_INPROGRESS to prevent the handler in irq.c from
1501 * actually calling the interrupt routine */
1502 desc->status |= IRQ_REPLAY | IRQ_INPROGRESS;
1503 } else if (desc->status & IRQ_DISABLED) {
1504 /* Damn, the interrupt actually arrived, do the lazy
1505 * disable thing. The interrupt routine in irq.c will
1506 * not handle a IRQ_DISABLED interrupt, so nothing more
1507 * need be done here */
1508 VDEBUG(("VOYAGER DEBUG: lazy disable of irq %d on CPU %d\n",
1509 irq, cpu));
1510 disable_local_vic_irq(irq);
1511 desc->status |= IRQ_REPLAY;
1512 } else {
1513 desc->status &= ~IRQ_REPLAY;
1514 }
1515
1516 _raw_spin_unlock(&vic_irq_lock);
1517}
1518
1519/* Finish the VIC interrupt: basically mask */
1520static void after_handle_vic_irq(unsigned int irq)
1521{
1522 irq_desc_t *desc = irq_to_desc(irq);
1523
1524 _raw_spin_lock(&vic_irq_lock);
1525 {
1526 unsigned int status = desc->status & ~IRQ_INPROGRESS;
1527#ifdef VOYAGER_DEBUG
1528 __u16 isr;
1529#endif
1530
1531 desc->status = status;
1532 if ((status & IRQ_DISABLED))
1533 disable_local_vic_irq(irq);
1534#ifdef VOYAGER_DEBUG
1535 /* DEBUG: before we ack, check what's in progress */
1536 isr = vic_read_isr();
1537 if ((isr & (1 << irq) && !(status & IRQ_REPLAY)) == 0) {
1538 int i;
1539 __u8 cpu = smp_processor_id();
1540 __u8 real_cpu;
1541 int mask; /* Um... initialize me??? --RR */
1542
1543 printk("VOYAGER SMP: CPU%d lost interrupt %d\n",
1544 cpu, irq);
1545 for_each_possible_cpu(real_cpu, mask) {
1546
1547 outb(VIC_CPU_MASQUERADE_ENABLE | real_cpu,
1548 VIC_PROCESSOR_ID);
1549 isr = vic_read_isr();
1550 if (isr & (1 << irq)) {
1551 printk
1552 ("VOYAGER SMP: CPU%d ack irq %d\n",
1553 real_cpu, irq);
1554 ack_vic_irq(irq);
1555 }
1556 outb(cpu, VIC_PROCESSOR_ID);
1557 }
1558 }
1559#endif /* VOYAGER_DEBUG */
1560 /* as soon as we ack, the interrupt is eligible for
1561 * receipt by another CPU so everything must be in
1562 * order here */
1563 ack_vic_irq(irq);
1564 if (status & IRQ_REPLAY) {
1565 /* replay is set if we disable the interrupt
1566 * in the before_handle_vic_irq() routine, so
1567 * clear the in progress bit here to allow the
1568 * next CPU to handle this correctly */
1569 desc->status &= ~(IRQ_REPLAY | IRQ_INPROGRESS);
1570 }
1571#ifdef VOYAGER_DEBUG
1572 isr = vic_read_isr();
1573 if ((isr & (1 << irq)) != 0)
1574 printk("VOYAGER SMP: after_handle_vic_irq() after "
1575 "ack irq=%d, isr=0x%x\n", irq, isr);
1576#endif /* VOYAGER_DEBUG */
1577 }
1578 _raw_spin_unlock(&vic_irq_lock);
1579
1580 /* All code after this point is out of the main path - the IRQ
1581 * may be intercepted by another CPU if reasserted */
1582}
1583
1584/* Linux processor - interrupt affinity manipulations.
1585 *
1586 * For each processor, we maintain a 32 bit irq affinity mask.
1587 * Initially it is set to all 1's so every processor accepts every
1588 * interrupt. In this call, we change the processor's affinity mask:
1589 *
1590 * Change from enable to disable:
1591 *
1592 * If the interrupt ever comes in to the processor, we will disable it
1593 * and ack it to push it off to another CPU, so just accept the mask here.
1594 *
1595 * Change from disable to enable:
1596 *
1597 * change the mask and then do an interrupt enable CPI to re-enable on
1598 * the selected processors */
1599
1600void set_vic_irq_affinity(unsigned int irq, const struct cpumask *mask)
1601{
1602 /* Only extended processors handle interrupts */
1603 unsigned long real_mask;
1604 unsigned long irq_mask = 1 << irq;
1605 int cpu;
1606
1607 real_mask = cpus_addr(*mask)[0] & voyager_extended_vic_processors;
1608
1609 if (cpus_addr(*mask)[0] == 0)
1610 /* can't have no CPUs to accept the interrupt -- extremely
1611 * bad things will happen */
1612 return;
1613
1614 if (irq == 0)
1615 /* can't change the affinity of the timer IRQ. This
1616 * is due to the constraint in the voyager
1617 * architecture that the CPI also comes in on and IRQ
1618 * line and we have chosen IRQ0 for this. If you
1619 * raise the mask on this interrupt, the processor
1620 * will no-longer be able to accept VIC CPIs */
1621 return;
1622
1623 if (irq >= 32)
1624 /* You can only have 32 interrupts in a voyager system
1625 * (and 32 only if you have a secondary microchannel
1626 * bus) */
1627 return;
1628
1629 for_each_online_cpu(cpu) {
1630 unsigned long cpu_mask = 1 << cpu;
1631
1632 if (cpu_mask & real_mask) {
1633 /* enable the interrupt for this cpu */
1634 cpu_irq_affinity[cpu] |= irq_mask;
1635 } else {
1636 /* disable the interrupt for this cpu */
1637 cpu_irq_affinity[cpu] &= ~irq_mask;
1638 }
1639 }
1640 /* this is magic, we now have the correct affinity maps, so
1641 * enable the interrupt. This will send an enable CPI to
1642 * those CPUs who need to enable it in their local masks,
1643 * causing them to correct for the new affinity . If the
1644 * interrupt is currently globally disabled, it will simply be
1645 * disabled again as it comes in (voyager lazy disable). If
1646 * the affinity map is tightened to disable the interrupt on a
1647 * cpu, it will be pushed off when it comes in */
1648 unmask_vic_irq(irq);
1649}
1650
1651static void ack_vic_irq(unsigned int irq)
1652{
1653 if (irq & 8) {
1654 outb(0x62, 0x20); /* Specific EOI to cascade */
1655 outb(0x60 | (irq & 7), 0xA0);
1656 } else {
1657 outb(0x60 | (irq & 7), 0x20);
1658 }
1659}
1660
1661/* enable the CPIs. In the VIC, the CPIs are delivered by the 8259
1662 * but are not vectored by it. This means that the 8259 mask must be
1663 * lowered to receive them */
1664static __init void vic_enable_cpi(void)
1665{
1666 __u8 cpu = smp_processor_id();
1667
1668 /* just take a copy of the current mask (nop for boot cpu) */
1669 vic_irq_mask[cpu] = vic_irq_mask[boot_cpu_id];
1670
1671 enable_local_vic_irq(VIC_CPI_LEVEL0);
1672 enable_local_vic_irq(VIC_CPI_LEVEL1);
1673 /* for sys int and cmn int */
1674 enable_local_vic_irq(7);
1675
1676 if (is_cpu_quad()) {
1677 outb(QIC_DEFAULT_MASK0, QIC_MASK_REGISTER0);
1678 outb(QIC_CPI_ENABLE, QIC_MASK_REGISTER1);
1679 VDEBUG(("VOYAGER SMP: QIC ENABLE CPI: CPU%d: MASK 0x%x\n",
1680 cpu, QIC_CPI_ENABLE));
1681 }
1682
1683 VDEBUG(("VOYAGER SMP: ENABLE CPI: CPU%d: MASK 0x%x\n",
1684 cpu, vic_irq_mask[cpu]));
1685}
1686
1687void voyager_smp_dump()
1688{
1689 int old_cpu = smp_processor_id(), cpu;
1690
1691 /* dump the interrupt masks of each processor */
1692 for_each_online_cpu(cpu) {
1693 __u16 imr, isr, irr;
1694 unsigned long flags;
1695
1696 local_irq_save(flags);
1697 outb(VIC_CPU_MASQUERADE_ENABLE | cpu, VIC_PROCESSOR_ID);
1698 imr = (inb(0xa1) << 8) | inb(0x21);
1699 outb(0x0a, 0xa0);
1700 irr = inb(0xa0) << 8;
1701 outb(0x0a, 0x20);
1702 irr |= inb(0x20);
1703 outb(0x0b, 0xa0);
1704 isr = inb(0xa0) << 8;
1705 outb(0x0b, 0x20);
1706 isr |= inb(0x20);
1707 outb(old_cpu, VIC_PROCESSOR_ID);
1708 local_irq_restore(flags);
1709 printk("\tCPU%d: mask=0x%x, IMR=0x%x, IRR=0x%x, ISR=0x%x\n",
1710 cpu, vic_irq_mask[cpu], imr, irr, isr);
1711#if 0
1712 /* These lines are put in to try to unstick an un ack'd irq */
1713 if (isr != 0) {
1714 int irq;
1715 for (irq = 0; irq < 16; irq++) {
1716 if (isr & (1 << irq)) {
1717 printk("\tCPU%d: ack irq %d\n",
1718 cpu, irq);
1719 local_irq_save(flags);
1720 outb(VIC_CPU_MASQUERADE_ENABLE | cpu,
1721 VIC_PROCESSOR_ID);
1722 ack_vic_irq(irq);
1723 outb(old_cpu, VIC_PROCESSOR_ID);
1724 local_irq_restore(flags);
1725 }
1726 }
1727 }
1728#endif
1729 }
1730}
1731
1732void smp_voyager_power_off(void *dummy)
1733{
1734 if (smp_processor_id() == boot_cpu_id)
1735 voyager_power_off();
1736 else
1737 smp_stop_cpu_function(NULL);
1738}
1739
1740static void __init voyager_smp_prepare_cpus(unsigned int max_cpus)
1741{
1742 /* FIXME: ignore max_cpus for now */
1743 smp_boot_cpus();
1744}
1745
1746static void __cpuinit voyager_smp_prepare_boot_cpu(void)
1747{
1748 init_gdt(smp_processor_id());
1749 switch_to_new_gdt();
1750
1751 cpu_online_map = cpumask_of_cpu(smp_processor_id());
1752 cpu_callout_map = cpumask_of_cpu(smp_processor_id());
1753 cpu_callin_map = CPU_MASK_NONE;
1754 cpu_present_map = cpumask_of_cpu(smp_processor_id());
1755
1756}
1757
1758static int __cpuinit voyager_cpu_up(unsigned int cpu)
1759{
1760 /* This only works at boot for x86. See "rewrite" above. */
1761 if (cpu_isset(cpu, smp_commenced_mask))
1762 return -ENOSYS;
1763
1764 /* In case one didn't come up */
1765 if (!cpu_isset(cpu, cpu_callin_map))
1766 return -EIO;
1767 /* Unleash the CPU! */
1768 cpu_set(cpu, smp_commenced_mask);
1769 while (!cpu_online(cpu))
1770 mb();
1771 return 0;
1772}
1773
1774static void __init voyager_smp_cpus_done(unsigned int max_cpus)
1775{
1776 zap_low_mappings();
1777}
1778
1779void __init smp_setup_processor_id(void)
1780{
1781 current_thread_info()->cpu = hard_smp_processor_id();
1782 x86_write_percpu(cpu_number, hard_smp_processor_id());
1783}
1784
1785static void voyager_send_call_func(const struct cpumask *callmask)
1786{
1787 __u32 mask = cpus_addr(*callmask)[0] & ~(1 << smp_processor_id());
1788 send_CPI(mask, VIC_CALL_FUNCTION_CPI);
1789}
1790
1791static void voyager_send_call_func_single(int cpu)
1792{
1793 send_CPI(1 << cpu, VIC_CALL_FUNCTION_SINGLE_CPI);
1794}
1795
1796struct smp_ops smp_ops = {
1797 .smp_prepare_boot_cpu = voyager_smp_prepare_boot_cpu,
1798 .smp_prepare_cpus = voyager_smp_prepare_cpus,
1799 .cpu_up = voyager_cpu_up,
1800 .smp_cpus_done = voyager_smp_cpus_done,
1801
1802 .smp_send_stop = voyager_smp_send_stop,
1803 .smp_send_reschedule = voyager_smp_send_reschedule,
1804
1805 .send_call_func_ipi = voyager_send_call_func,
1806 .send_call_func_single_ipi = voyager_send_call_func_single,
1807};
diff --git a/arch/x86/mach-voyager/voyager_thread.c b/arch/x86/mach-voyager/voyager_thread.c
deleted file mode 100644
index 15464a20fb38..000000000000
--- a/arch/x86/mach-voyager/voyager_thread.c
+++ /dev/null
@@ -1,128 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8 -*- */
2
3/* Copyright (C) 2001
4 *
5 * Author: J.E.J.Bottomley@HansenPartnership.com
6 *
7 * This module provides the machine status monitor thread for the
8 * voyager architecture. This allows us to monitor the machine
9 * environment (temp, voltage, fan function) and the front panel and
10 * internal UPS. If a fault is detected, this thread takes corrective
11 * action (usually just informing init)
12 * */
13
14#include <linux/module.h>
15#include <linux/mm.h>
16#include <linux/kernel_stat.h>
17#include <linux/delay.h>
18#include <linux/mc146818rtc.h>
19#include <linux/init.h>
20#include <linux/bootmem.h>
21#include <linux/kmod.h>
22#include <linux/completion.h>
23#include <linux/sched.h>
24#include <linux/kthread.h>
25#include <asm/desc.h>
26#include <asm/voyager.h>
27#include <asm/vic.h>
28#include <asm/mtrr.h>
29#include <asm/msr.h>
30
31struct task_struct *voyager_thread;
32static __u8 set_timeout;
33
34static int execute(const char *string)
35{
36 int ret;
37
38 char *envp[] = {
39 "HOME=/",
40 "TERM=linux",
41 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
42 NULL,
43 };
44 char *argv[] = {
45 "/bin/bash",
46 "-c",
47 (char *)string,
48 NULL,
49 };
50
51 if ((ret =
52 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC)) != 0) {
53 printk(KERN_ERR "Voyager failed to run \"%s\": %i\n", string,
54 ret);
55 }
56 return ret;
57}
58
59static void check_from_kernel(void)
60{
61 if (voyager_status.switch_off) {
62
63 /* FIXME: This should be configurable via proc */
64 execute("umask 600; echo 0 > /etc/initrunlvl; kill -HUP 1");
65 } else if (voyager_status.power_fail) {
66 VDEBUG(("Voyager daemon detected AC power failure\n"));
67
68 /* FIXME: This should be configureable via proc */
69 execute("umask 600; echo F > /etc/powerstatus; kill -PWR 1");
70 set_timeout = 1;
71 }
72}
73
74static void check_continuing_condition(void)
75{
76 if (voyager_status.power_fail) {
77 __u8 data;
78 voyager_cat_psi(VOYAGER_PSI_SUBREAD,
79 VOYAGER_PSI_AC_FAIL_REG, &data);
80 if ((data & 0x1f) == 0) {
81 /* all power restored */
82 printk(KERN_NOTICE
83 "VOYAGER AC power restored, cancelling shutdown\n");
84 /* FIXME: should be user configureable */
85 execute
86 ("umask 600; echo O > /etc/powerstatus; kill -PWR 1");
87 set_timeout = 0;
88 }
89 }
90}
91
92static int thread(void *unused)
93{
94 printk(KERN_NOTICE "Voyager starting monitor thread\n");
95
96 for (;;) {
97 set_current_state(TASK_INTERRUPTIBLE);
98 schedule_timeout(set_timeout ? HZ : MAX_SCHEDULE_TIMEOUT);
99
100 VDEBUG(("Voyager Daemon awoken\n"));
101 if (voyager_status.request_from_kernel == 0) {
102 /* probably awoken from timeout */
103 check_continuing_condition();
104 } else {
105 check_from_kernel();
106 voyager_status.request_from_kernel = 0;
107 }
108 }
109}
110
111static int __init voyager_thread_start(void)
112{
113 voyager_thread = kthread_run(thread, NULL, "kvoyagerd");
114 if (IS_ERR(voyager_thread)) {
115 printk(KERN_ERR
116 "Voyager: Failed to create system monitor thread.\n");
117 return PTR_ERR(voyager_thread);
118 }
119 return 0;
120}
121
122static void __exit voyager_thread_stop(void)
123{
124 kthread_stop(voyager_thread);
125}
126
127module_init(voyager_thread_start);
128module_exit(voyager_thread_stop);
diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c
index 420b3b6e3915..6ef5e99380f9 100644
--- a/arch/x86/math-emu/get_address.c
+++ b/arch/x86/math-emu/get_address.c
@@ -150,11 +150,9 @@ static long pm_address(u_char FPU_modrm, u_char segment,
150#endif /* PARANOID */ 150#endif /* PARANOID */
151 151
152 switch (segment) { 152 switch (segment) {
153 /* gs isn't used by the kernel, so it still has its
154 user-space value. */
155 case PREFIX_GS_ - 1: 153 case PREFIX_GS_ - 1:
156 /* N.B. - movl %seg, mem is a 2 byte write regardless of prefix */ 154 /* user gs handling can be lazy, use special accessors */
157 savesegment(gs, addr->selector); 155 addr->selector = get_user_gs(FPU_info->regs);
158 break; 156 break;
159 default: 157 default:
160 addr->selector = PM_REG_(segment); 158 addr->selector = PM_REG_(segment);
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index d8cc96a2738f..08537747cb58 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,6 +1,8 @@
1obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ 1obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
2 pat.o pgtable.o gup.o 2 pat.o pgtable.o gup.o
3 3
4obj-$(CONFIG_SMP) += tlb.o
5
4obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o 6obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o
5 7
6obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o 8obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 7e8db53528a7..61b41ca3b5a2 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -23,6 +23,12 @@ int fixup_exception(struct pt_regs *regs)
23 23
24 fixup = search_exception_tables(regs->ip); 24 fixup = search_exception_tables(regs->ip);
25 if (fixup) { 25 if (fixup) {
26 /* If fixup is less than 16, it means uaccess error */
27 if (fixup->fixup < 16) {
28 current_thread_info()->uaccess_err = -EFAULT;
29 regs->ip += fixup->fixup;
30 return 1;
31 }
26 regs->ip = fixup->fixup; 32 regs->ip = fixup->fixup;
27 return 1; 33 return 1;
28 } 34 }
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index c76ef1d701c9..a03b7279efa0 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1,73 +1,79 @@
1/* 1/*
2 * Copyright (C) 1995 Linus Torvalds 2 * Copyright (C) 1995 Linus Torvalds
3 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. 3 * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
4 * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
4 */ 5 */
5
6#include <linux/signal.h>
7#include <linux/sched.h>
8#include <linux/kernel.h>
9#include <linux/errno.h>
10#include <linux/string.h>
11#include <linux/types.h>
12#include <linux/ptrace.h>
13#include <linux/mmiotrace.h>
14#include <linux/mman.h>
15#include <linux/mm.h>
16#include <linux/smp.h>
17#include <linux/interrupt.h> 6#include <linux/interrupt.h>
18#include <linux/init.h> 7#include <linux/mmiotrace.h>
19#include <linux/tty.h> 8#include <linux/bootmem.h>
20#include <linux/vt_kern.h> /* For unblank_screen() */
21#include <linux/compiler.h> 9#include <linux/compiler.h>
22#include <linux/highmem.h> 10#include <linux/highmem.h>
23#include <linux/bootmem.h> /* for max_low_pfn */
24#include <linux/vmalloc.h>
25#include <linux/module.h>
26#include <linux/kprobes.h> 11#include <linux/kprobes.h>
27#include <linux/uaccess.h> 12#include <linux/uaccess.h>
13#include <linux/vmalloc.h>
14#include <linux/vt_kern.h>
15#include <linux/signal.h>
16#include <linux/kernel.h>
17#include <linux/ptrace.h>
18#include <linux/string.h>
19#include <linux/module.h>
28#include <linux/kdebug.h> 20#include <linux/kdebug.h>
21#include <linux/errno.h>
22#include <linux/magic.h>
23#include <linux/sched.h>
24#include <linux/types.h>
25#include <linux/init.h>
26#include <linux/mman.h>
27#include <linux/tty.h>
28#include <linux/smp.h>
29#include <linux/mm.h>
30
31#include <asm-generic/sections.h>
29 32
30#include <asm/system.h>
31#include <asm/desc.h>
32#include <asm/segment.h>
33#include <asm/pgalloc.h>
34#include <asm/smp.h>
35#include <asm/tlbflush.h> 33#include <asm/tlbflush.h>
34#include <asm/pgalloc.h>
35#include <asm/segment.h>
36#include <asm/system.h>
36#include <asm/proto.h> 37#include <asm/proto.h>
37#include <asm-generic/sections.h>
38#include <asm/traps.h> 38#include <asm/traps.h>
39#include <asm/desc.h>
39 40
40/* 41/*
41 * Page fault error code bits 42 * Page fault error code bits:
42 * bit 0 == 0 means no page found, 1 means protection fault 43 *
43 * bit 1 == 0 means read, 1 means write 44 * bit 0 == 0: no page found 1: protection fault
44 * bit 2 == 0 means kernel, 1 means user-mode 45 * bit 1 == 0: read access 1: write access
45 * bit 3 == 1 means use of reserved bit detected 46 * bit 2 == 0: kernel-mode access 1: user-mode access
46 * bit 4 == 1 means fault was an instruction fetch 47 * bit 3 == 1: use of reserved bit detected
48 * bit 4 == 1: fault was an instruction fetch
47 */ 49 */
48#define PF_PROT (1<<0) 50enum x86_pf_error_code {
49#define PF_WRITE (1<<1) 51
50#define PF_USER (1<<2) 52 PF_PROT = 1 << 0,
51#define PF_RSVD (1<<3) 53 PF_WRITE = 1 << 1,
52#define PF_INSTR (1<<4) 54 PF_USER = 1 << 2,
55 PF_RSVD = 1 << 3,
56 PF_INSTR = 1 << 4,
57};
53 58
59/*
60 * Returns 0 if mmiotrace is disabled, or if the fault is not
61 * handled by mmiotrace:
62 */
54static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) 63static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
55{ 64{
56#ifdef CONFIG_MMIOTRACE
57 if (unlikely(is_kmmio_active())) 65 if (unlikely(is_kmmio_active()))
58 if (kmmio_handler(regs, addr) == 1) 66 if (kmmio_handler(regs, addr) == 1)
59 return -1; 67 return -1;
60#endif
61 return 0; 68 return 0;
62} 69}
63 70
64static inline int notify_page_fault(struct pt_regs *regs) 71static inline int notify_page_fault(struct pt_regs *regs)
65{ 72{
66#ifdef CONFIG_KPROBES
67 int ret = 0; 73 int ret = 0;
68 74
69 /* kprobe_running() needs smp_processor_id() */ 75 /* kprobe_running() needs smp_processor_id() */
70 if (!user_mode_vm(regs)) { 76 if (kprobes_built_in() && !user_mode_vm(regs)) {
71 preempt_disable(); 77 preempt_disable();
72 if (kprobe_running() && kprobe_fault_handler(regs, 14)) 78 if (kprobe_running() && kprobe_fault_handler(regs, 14))
73 ret = 1; 79 ret = 1;
@@ -75,29 +81,76 @@ static inline int notify_page_fault(struct pt_regs *regs)
75 } 81 }
76 82
77 return ret; 83 return ret;
78#else
79 return 0;
80#endif
81} 84}
82 85
83/* 86/*
84 * X86_32 87 * Prefetch quirks:
85 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. 88 *
86 * Check that here and ignore it. 89 * 32-bit mode:
90 *
91 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
92 * Check that here and ignore it.
93 *
94 * 64-bit mode:
87 * 95 *
88 * X86_64 96 * Sometimes the CPU reports invalid exceptions on prefetch.
89 * Sometimes the CPU reports invalid exceptions on prefetch. 97 * Check that here and ignore it.
90 * Check that here and ignore it.
91 * 98 *
92 * Opcode checker based on code by Richard Brunner 99 * Opcode checker based on code by Richard Brunner.
93 */ 100 */
94static int is_prefetch(struct pt_regs *regs, unsigned long addr, 101static inline int
95 unsigned long error_code) 102check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
103 unsigned char opcode, int *prefetch)
96{ 104{
105 unsigned char instr_hi = opcode & 0xf0;
106 unsigned char instr_lo = opcode & 0x0f;
107
108 switch (instr_hi) {
109 case 0x20:
110 case 0x30:
111 /*
112 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
113 * In X86_64 long mode, the CPU will signal invalid
114 * opcode if some of these prefixes are present so
115 * X86_64 will never get here anyway
116 */
117 return ((instr_lo & 7) == 0x6);
118#ifdef CONFIG_X86_64
119 case 0x40:
120 /*
121 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
122 * Need to figure out under what instruction mode the
123 * instruction was issued. Could check the LDT for lm,
124 * but for now it's good enough to assume that long
125 * mode only uses well known segments or kernel.
126 */
127 return (!user_mode(regs)) || (regs->cs == __USER_CS);
128#endif
129 case 0x60:
130 /* 0x64 thru 0x67 are valid prefixes in all modes. */
131 return (instr_lo & 0xC) == 0x4;
132 case 0xF0:
133 /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
134 return !instr_lo || (instr_lo>>1) == 1;
135 case 0x00:
136 /* Prefetch instruction is 0x0F0D or 0x0F18 */
137 if (probe_kernel_address(instr, opcode))
138 return 0;
139
140 *prefetch = (instr_lo == 0xF) &&
141 (opcode == 0x0D || opcode == 0x18);
142 return 0;
143 default:
144 return 0;
145 }
146}
147
148static int
149is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
150{
151 unsigned char *max_instr;
97 unsigned char *instr; 152 unsigned char *instr;
98 int scan_more = 1;
99 int prefetch = 0; 153 int prefetch = 0;
100 unsigned char *max_instr;
101 154
102 /* 155 /*
103 * If it was a exec (instruction fetch) fault on NX page, then 156 * If it was a exec (instruction fetch) fault on NX page, then
@@ -106,106 +159,170 @@ static int is_prefetch(struct pt_regs *regs, unsigned long addr,
106 if (error_code & PF_INSTR) 159 if (error_code & PF_INSTR)
107 return 0; 160 return 0;
108 161
109 instr = (unsigned char *)convert_ip_to_linear(current, regs); 162 instr = (void *)convert_ip_to_linear(current, regs);
110 max_instr = instr + 15; 163 max_instr = instr + 15;
111 164
112 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) 165 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
113 return 0; 166 return 0;
114 167
115 while (scan_more && instr < max_instr) { 168 while (instr < max_instr) {
116 unsigned char opcode; 169 unsigned char opcode;
117 unsigned char instr_hi;
118 unsigned char instr_lo;
119 170
120 if (probe_kernel_address(instr, opcode)) 171 if (probe_kernel_address(instr, opcode))
121 break; 172 break;
122 173
123 instr_hi = opcode & 0xf0;
124 instr_lo = opcode & 0x0f;
125 instr++; 174 instr++;
126 175
127 switch (instr_hi) { 176 if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
128 case 0x20:
129 case 0x30:
130 /*
131 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
132 * In X86_64 long mode, the CPU will signal invalid
133 * opcode if some of these prefixes are present so
134 * X86_64 will never get here anyway
135 */
136 scan_more = ((instr_lo & 7) == 0x6);
137 break; 177 break;
138#ifdef CONFIG_X86_64
139 case 0x40:
140 /*
141 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
142 * Need to figure out under what instruction mode the
143 * instruction was issued. Could check the LDT for lm,
144 * but for now it's good enough to assume that long
145 * mode only uses well known segments or kernel.
146 */
147 scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
148 break;
149#endif
150 case 0x60:
151 /* 0x64 thru 0x67 are valid prefixes in all modes. */
152 scan_more = (instr_lo & 0xC) == 0x4;
153 break;
154 case 0xF0:
155 /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
156 scan_more = !instr_lo || (instr_lo>>1) == 1;
157 break;
158 case 0x00:
159 /* Prefetch instruction is 0x0F0D or 0x0F18 */
160 scan_more = 0;
161
162 if (probe_kernel_address(instr, opcode))
163 break;
164 prefetch = (instr_lo == 0xF) &&
165 (opcode == 0x0D || opcode == 0x18);
166 break;
167 default:
168 scan_more = 0;
169 break;
170 }
171 } 178 }
172 return prefetch; 179 return prefetch;
173} 180}
174 181
175static void force_sig_info_fault(int si_signo, int si_code, 182static void
176 unsigned long address, struct task_struct *tsk) 183force_sig_info_fault(int si_signo, int si_code, unsigned long address,
184 struct task_struct *tsk)
177{ 185{
178 siginfo_t info; 186 siginfo_t info;
179 187
180 info.si_signo = si_signo; 188 info.si_signo = si_signo;
181 info.si_errno = 0; 189 info.si_errno = 0;
182 info.si_code = si_code; 190 info.si_code = si_code;
183 info.si_addr = (void __user *)address; 191 info.si_addr = (void __user *)address;
192
184 force_sig_info(si_signo, &info, tsk); 193 force_sig_info(si_signo, &info, tsk);
185} 194}
186 195
187#ifdef CONFIG_X86_64 196DEFINE_SPINLOCK(pgd_lock);
188static int bad_address(void *p) 197LIST_HEAD(pgd_list);
198
199#ifdef CONFIG_X86_32
200static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
189{ 201{
190 unsigned long dummy; 202 unsigned index = pgd_index(address);
191 return probe_kernel_address((unsigned long *)p, dummy); 203 pgd_t *pgd_k;
204 pud_t *pud, *pud_k;
205 pmd_t *pmd, *pmd_k;
206
207 pgd += index;
208 pgd_k = init_mm.pgd + index;
209
210 if (!pgd_present(*pgd_k))
211 return NULL;
212
213 /*
214 * set_pgd(pgd, *pgd_k); here would be useless on PAE
215 * and redundant with the set_pmd() on non-PAE. As would
216 * set_pud.
217 */
218 pud = pud_offset(pgd, address);
219 pud_k = pud_offset(pgd_k, address);
220 if (!pud_present(*pud_k))
221 return NULL;
222
223 pmd = pmd_offset(pud, address);
224 pmd_k = pmd_offset(pud_k, address);
225 if (!pmd_present(*pmd_k))
226 return NULL;
227
228 if (!pmd_present(*pmd)) {
229 set_pmd(pmd, *pmd_k);
230 arch_flush_lazy_mmu_mode();
231 } else {
232 BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
233 }
234
235 return pmd_k;
236}
237
238void vmalloc_sync_all(void)
239{
240 unsigned long address;
241
242 if (SHARED_KERNEL_PMD)
243 return;
244
245 for (address = VMALLOC_START & PMD_MASK;
246 address >= TASK_SIZE && address < FIXADDR_TOP;
247 address += PMD_SIZE) {
248
249 unsigned long flags;
250 struct page *page;
251
252 spin_lock_irqsave(&pgd_lock, flags);
253 list_for_each_entry(page, &pgd_list, lru) {
254 if (!vmalloc_sync_one(page_address(page), address))
255 break;
256 }
257 spin_unlock_irqrestore(&pgd_lock, flags);
258 }
259}
260
261/*
262 * 32-bit:
263 *
264 * Handle a fault on the vmalloc or module mapping area
265 */
266static noinline int vmalloc_fault(unsigned long address)
267{
268 unsigned long pgd_paddr;
269 pmd_t *pmd_k;
270 pte_t *pte_k;
271
272 /* Make sure we are in vmalloc area: */
273 if (!(address >= VMALLOC_START && address < VMALLOC_END))
274 return -1;
275
276 /*
277 * Synchronize this task's top level page-table
278 * with the 'reference' page table.
279 *
280 * Do _not_ use "current" here. We might be inside
281 * an interrupt in the middle of a task switch..
282 */
283 pgd_paddr = read_cr3();
284 pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
285 if (!pmd_k)
286 return -1;
287
288 pte_k = pte_offset_kernel(pmd_k, address);
289 if (!pte_present(*pte_k))
290 return -1;
291
292 return 0;
293}
294
295/*
296 * Did it hit the DOS screen memory VA from vm86 mode?
297 */
298static inline void
299check_v8086_mode(struct pt_regs *regs, unsigned long address,
300 struct task_struct *tsk)
301{
302 unsigned long bit;
303
304 if (!v8086_mode(regs))
305 return;
306
307 bit = (address - 0xA0000) >> PAGE_SHIFT;
308 if (bit < 32)
309 tsk->thread.screen_bitmap |= 1 << bit;
192} 310}
193#endif
194 311
195static void dump_pagetable(unsigned long address) 312static void dump_pagetable(unsigned long address)
196{ 313{
197#ifdef CONFIG_X86_32
198 __typeof__(pte_val(__pte(0))) page; 314 __typeof__(pte_val(__pte(0))) page;
199 315
200 page = read_cr3(); 316 page = read_cr3();
201 page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; 317 page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
318
202#ifdef CONFIG_X86_PAE 319#ifdef CONFIG_X86_PAE
203 printk("*pdpt = %016Lx ", page); 320 printk("*pdpt = %016Lx ", page);
204 if ((page >> PAGE_SHIFT) < max_low_pfn 321 if ((page >> PAGE_SHIFT) < max_low_pfn
205 && page & _PAGE_PRESENT) { 322 && page & _PAGE_PRESENT) {
206 page &= PAGE_MASK; 323 page &= PAGE_MASK;
207 page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT) 324 page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
208 & (PTRS_PER_PMD - 1)]; 325 & (PTRS_PER_PMD - 1)];
209 printk(KERN_CONT "*pde = %016Lx ", page); 326 printk(KERN_CONT "*pde = %016Lx ", page);
210 page &= ~_PAGE_NX; 327 page &= ~_PAGE_NX;
211 } 328 }
@@ -217,19 +334,145 @@ static void dump_pagetable(unsigned long address)
217 * We must not directly access the pte in the highpte 334 * We must not directly access the pte in the highpte
218 * case if the page table is located in highmem. 335 * case if the page table is located in highmem.
219 * And let's rather not kmap-atomic the pte, just in case 336 * And let's rather not kmap-atomic the pte, just in case
220 * it's allocated already. 337 * it's allocated already:
221 */ 338 */
222 if ((page >> PAGE_SHIFT) < max_low_pfn 339 if ((page >> PAGE_SHIFT) < max_low_pfn
223 && (page & _PAGE_PRESENT) 340 && (page & _PAGE_PRESENT)
224 && !(page & _PAGE_PSE)) { 341 && !(page & _PAGE_PSE)) {
342
225 page &= PAGE_MASK; 343 page &= PAGE_MASK;
226 page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT) 344 page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
227 & (PTRS_PER_PTE - 1)]; 345 & (PTRS_PER_PTE - 1)];
228 printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page); 346 printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
229 } 347 }
230 348
231 printk("\n"); 349 printk("\n");
232#else /* CONFIG_X86_64 */ 350}
351
352#else /* CONFIG_X86_64: */
353
354void vmalloc_sync_all(void)
355{
356 unsigned long address;
357
358 for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
359 address += PGDIR_SIZE) {
360
361 const pgd_t *pgd_ref = pgd_offset_k(address);
362 unsigned long flags;
363 struct page *page;
364
365 if (pgd_none(*pgd_ref))
366 continue;
367
368 spin_lock_irqsave(&pgd_lock, flags);
369 list_for_each_entry(page, &pgd_list, lru) {
370 pgd_t *pgd;
371 pgd = (pgd_t *)page_address(page) + pgd_index(address);
372 if (pgd_none(*pgd))
373 set_pgd(pgd, *pgd_ref);
374 else
375 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
376 }
377 spin_unlock_irqrestore(&pgd_lock, flags);
378 }
379}
380
381/*
382 * 64-bit:
383 *
384 * Handle a fault on the vmalloc area
385 *
386 * This assumes no large pages in there.
387 */
388static noinline int vmalloc_fault(unsigned long address)
389{
390 pgd_t *pgd, *pgd_ref;
391 pud_t *pud, *pud_ref;
392 pmd_t *pmd, *pmd_ref;
393 pte_t *pte, *pte_ref;
394
395 /* Make sure we are in vmalloc area: */
396 if (!(address >= VMALLOC_START && address < VMALLOC_END))
397 return -1;
398
399 /*
400 * Copy kernel mappings over when needed. This can also
401 * happen within a race in page table update. In the later
402 * case just flush:
403 */
404 pgd = pgd_offset(current->active_mm, address);
405 pgd_ref = pgd_offset_k(address);
406 if (pgd_none(*pgd_ref))
407 return -1;
408
409 if (pgd_none(*pgd))
410 set_pgd(pgd, *pgd_ref);
411 else
412 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
413
414 /*
415 * Below here mismatches are bugs because these lower tables
416 * are shared:
417 */
418
419 pud = pud_offset(pgd, address);
420 pud_ref = pud_offset(pgd_ref, address);
421 if (pud_none(*pud_ref))
422 return -1;
423
424 if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
425 BUG();
426
427 pmd = pmd_offset(pud, address);
428 pmd_ref = pmd_offset(pud_ref, address);
429 if (pmd_none(*pmd_ref))
430 return -1;
431
432 if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
433 BUG();
434
435 pte_ref = pte_offset_kernel(pmd_ref, address);
436 if (!pte_present(*pte_ref))
437 return -1;
438
439 pte = pte_offset_kernel(pmd, address);
440
441 /*
442 * Don't use pte_page here, because the mappings can point
443 * outside mem_map, and the NUMA hash lookup cannot handle
444 * that:
445 */
446 if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
447 BUG();
448
449 return 0;
450}
451
452static const char errata93_warning[] =
453KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
454KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
455KERN_ERR "******* Please consider a BIOS update.\n"
456KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
457
458/*
459 * No vm86 mode in 64-bit mode:
460 */
461static inline void
462check_v8086_mode(struct pt_regs *regs, unsigned long address,
463 struct task_struct *tsk)
464{
465}
466
467static int bad_address(void *p)
468{
469 unsigned long dummy;
470
471 return probe_kernel_address((unsigned long *)p, dummy);
472}
473
474static void dump_pagetable(unsigned long address)
475{
233 pgd_t *pgd; 476 pgd_t *pgd;
234 pud_t *pud; 477 pud_t *pud;
235 pmd_t *pmd; 478 pmd_t *pmd;
@@ -238,102 +481,77 @@ static void dump_pagetable(unsigned long address)
238 pgd = (pgd_t *)read_cr3(); 481 pgd = (pgd_t *)read_cr3();
239 482
240 pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); 483 pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
484
241 pgd += pgd_index(address); 485 pgd += pgd_index(address);
242 if (bad_address(pgd)) goto bad; 486 if (bad_address(pgd))
487 goto bad;
488
243 printk("PGD %lx ", pgd_val(*pgd)); 489 printk("PGD %lx ", pgd_val(*pgd));
244 if (!pgd_present(*pgd)) goto ret; 490
491 if (!pgd_present(*pgd))
492 goto out;
245 493
246 pud = pud_offset(pgd, address); 494 pud = pud_offset(pgd, address);
247 if (bad_address(pud)) goto bad; 495 if (bad_address(pud))
496 goto bad;
497
248 printk("PUD %lx ", pud_val(*pud)); 498 printk("PUD %lx ", pud_val(*pud));
249 if (!pud_present(*pud) || pud_large(*pud)) 499 if (!pud_present(*pud) || pud_large(*pud))
250 goto ret; 500 goto out;
251 501
252 pmd = pmd_offset(pud, address); 502 pmd = pmd_offset(pud, address);
253 if (bad_address(pmd)) goto bad; 503 if (bad_address(pmd))
504 goto bad;
505
254 printk("PMD %lx ", pmd_val(*pmd)); 506 printk("PMD %lx ", pmd_val(*pmd));
255 if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret; 507 if (!pmd_present(*pmd) || pmd_large(*pmd))
508 goto out;
256 509
257 pte = pte_offset_kernel(pmd, address); 510 pte = pte_offset_kernel(pmd, address);
258 if (bad_address(pte)) goto bad; 511 if (bad_address(pte))
512 goto bad;
513
259 printk("PTE %lx", pte_val(*pte)); 514 printk("PTE %lx", pte_val(*pte));
260ret: 515out:
261 printk("\n"); 516 printk("\n");
262 return; 517 return;
263bad: 518bad:
264 printk("BAD\n"); 519 printk("BAD\n");
265#endif
266}
267
268#ifdef CONFIG_X86_32
269static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
270{
271 unsigned index = pgd_index(address);
272 pgd_t *pgd_k;
273 pud_t *pud, *pud_k;
274 pmd_t *pmd, *pmd_k;
275
276 pgd += index;
277 pgd_k = init_mm.pgd + index;
278
279 if (!pgd_present(*pgd_k))
280 return NULL;
281
282 /*
283 * set_pgd(pgd, *pgd_k); here would be useless on PAE
284 * and redundant with the set_pmd() on non-PAE. As would
285 * set_pud.
286 */
287
288 pud = pud_offset(pgd, address);
289 pud_k = pud_offset(pgd_k, address);
290 if (!pud_present(*pud_k))
291 return NULL;
292
293 pmd = pmd_offset(pud, address);
294 pmd_k = pmd_offset(pud_k, address);
295 if (!pmd_present(*pmd_k))
296 return NULL;
297 if (!pmd_present(*pmd)) {
298 set_pmd(pmd, *pmd_k);
299 arch_flush_lazy_mmu_mode();
300 } else
301 BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
302 return pmd_k;
303} 520}
304#endif
305 521
306#ifdef CONFIG_X86_64 522#endif /* CONFIG_X86_64 */
307static const char errata93_warning[] =
308KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
309KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
310KERN_ERR "******* Please consider a BIOS update.\n"
311KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
312#endif
313 523
314/* Workaround for K8 erratum #93 & buggy BIOS. 524/*
315 BIOS SMM functions are required to use a specific workaround 525 * Workaround for K8 erratum #93 & buggy BIOS.
316 to avoid corruption of the 64bit RIP register on C stepping K8. 526 *
317 A lot of BIOS that didn't get tested properly miss this. 527 * BIOS SMM functions are required to use a specific workaround
318 The OS sees this as a page fault with the upper 32bits of RIP cleared. 528 * to avoid corruption of the 64bit RIP register on C stepping K8.
319 Try to work around it here. 529 *
320 Note we only handle faults in kernel here. 530 * A lot of BIOS that didn't get tested properly miss this.
321 Does nothing for X86_32 531 *
532 * The OS sees this as a page fault with the upper 32bits of RIP cleared.
533 * Try to work around it here.
534 *
535 * Note we only handle faults in kernel here.
536 * Does nothing on 32-bit.
322 */ 537 */
323static int is_errata93(struct pt_regs *regs, unsigned long address) 538static int is_errata93(struct pt_regs *regs, unsigned long address)
324{ 539{
325#ifdef CONFIG_X86_64 540#ifdef CONFIG_X86_64
326 static int warned; 541 static int once;
542
327 if (address != regs->ip) 543 if (address != regs->ip)
328 return 0; 544 return 0;
545
329 if ((address >> 32) != 0) 546 if ((address >> 32) != 0)
330 return 0; 547 return 0;
548
331 address |= 0xffffffffUL << 32; 549 address |= 0xffffffffUL << 32;
332 if ((address >= (u64)_stext && address <= (u64)_etext) || 550 if ((address >= (u64)_stext && address <= (u64)_etext) ||
333 (address >= MODULES_VADDR && address <= MODULES_END)) { 551 (address >= MODULES_VADDR && address <= MODULES_END)) {
334 if (!warned) { 552 if (!once) {
335 printk(errata93_warning); 553 printk(errata93_warning);
336 warned = 1; 554 once = 1;
337 } 555 }
338 regs->ip = address; 556 regs->ip = address;
339 return 1; 557 return 1;
@@ -343,16 +561,17 @@ static int is_errata93(struct pt_regs *regs, unsigned long address)
343} 561}
344 562
345/* 563/*
346 * Work around K8 erratum #100 K8 in compat mode occasionally jumps to illegal 564 * Work around K8 erratum #100 K8 in compat mode occasionally jumps
347 * addresses >4GB. We catch this in the page fault handler because these 565 * to illegal addresses >4GB.
348 * addresses are not reachable. Just detect this case and return. Any code 566 *
567 * We catch this in the page fault handler because these addresses
568 * are not reachable. Just detect this case and return. Any code
349 * segment in LDT is compatibility mode. 569 * segment in LDT is compatibility mode.
350 */ 570 */
351static int is_errata100(struct pt_regs *regs, unsigned long address) 571static int is_errata100(struct pt_regs *regs, unsigned long address)
352{ 572{
353#ifdef CONFIG_X86_64 573#ifdef CONFIG_X86_64
354 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && 574 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
355 (address >> 32))
356 return 1; 575 return 1;
357#endif 576#endif
358 return 0; 577 return 0;
@@ -362,8 +581,9 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
362{ 581{
363#ifdef CONFIG_X86_F00F_BUG 582#ifdef CONFIG_X86_F00F_BUG
364 unsigned long nr; 583 unsigned long nr;
584
365 /* 585 /*
366 * Pentium F0 0F C7 C8 bug workaround. 586 * Pentium F0 0F C7 C8 bug workaround:
367 */ 587 */
368 if (boot_cpu_data.f00f_bug) { 588 if (boot_cpu_data.f00f_bug) {
369 nr = (address - idt_descr.address) >> 3; 589 nr = (address - idt_descr.address) >> 3;
@@ -377,62 +597,277 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
377 return 0; 597 return 0;
378} 598}
379 599
380static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, 600static const char nx_warning[] = KERN_CRIT
381 unsigned long address) 601"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";
602
603static void
604show_fault_oops(struct pt_regs *regs, unsigned long error_code,
605 unsigned long address)
382{ 606{
383#ifdef CONFIG_X86_32
384 if (!oops_may_print()) 607 if (!oops_may_print())
385 return; 608 return;
386#endif
387 609
388#ifdef CONFIG_X86_PAE
389 if (error_code & PF_INSTR) { 610 if (error_code & PF_INSTR) {
390 unsigned int level; 611 unsigned int level;
612
391 pte_t *pte = lookup_address(address, &level); 613 pte_t *pte = lookup_address(address, &level);
392 614
393 if (pte && pte_present(*pte) && !pte_exec(*pte)) 615 if (pte && pte_present(*pte) && !pte_exec(*pte))
394 printk(KERN_CRIT "kernel tried to execute " 616 printk(nx_warning, current_uid());
395 "NX-protected page - exploit attempt? "
396 "(uid: %d)\n", current_uid());
397 } 617 }
398#endif
399 618
400 printk(KERN_ALERT "BUG: unable to handle kernel "); 619 printk(KERN_ALERT "BUG: unable to handle kernel ");
401 if (address < PAGE_SIZE) 620 if (address < PAGE_SIZE)
402 printk(KERN_CONT "NULL pointer dereference"); 621 printk(KERN_CONT "NULL pointer dereference");
403 else 622 else
404 printk(KERN_CONT "paging request"); 623 printk(KERN_CONT "paging request");
624
405 printk(KERN_CONT " at %p\n", (void *) address); 625 printk(KERN_CONT " at %p\n", (void *) address);
406 printk(KERN_ALERT "IP:"); 626 printk(KERN_ALERT "IP:");
407 printk_address(regs->ip, 1); 627 printk_address(regs->ip, 1);
628
408 dump_pagetable(address); 629 dump_pagetable(address);
409} 630}
410 631
411#ifdef CONFIG_X86_64 632static noinline void
412static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, 633pgtable_bad(struct pt_regs *regs, unsigned long error_code,
413 unsigned long error_code) 634 unsigned long address)
414{ 635{
415 unsigned long flags = oops_begin();
416 int sig = SIGKILL;
417 struct task_struct *tsk; 636 struct task_struct *tsk;
637 unsigned long flags;
638 int sig;
639
640 flags = oops_begin();
641 tsk = current;
642 sig = SIGKILL;
418 643
419 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", 644 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
420 current->comm, address); 645 tsk->comm, address);
421 dump_pagetable(address); 646 dump_pagetable(address);
422 tsk = current; 647
423 tsk->thread.cr2 = address; 648 tsk->thread.cr2 = address;
424 tsk->thread.trap_no = 14; 649 tsk->thread.trap_no = 14;
425 tsk->thread.error_code = error_code; 650 tsk->thread.error_code = error_code;
651
426 if (__die("Bad pagetable", regs, error_code)) 652 if (__die("Bad pagetable", regs, error_code))
427 sig = 0; 653 sig = 0;
654
428 oops_end(flags, regs, sig); 655 oops_end(flags, regs, sig);
429} 656}
430#endif 657
658static noinline void
659no_context(struct pt_regs *regs, unsigned long error_code,
660 unsigned long address)
661{
662 struct task_struct *tsk = current;
663 unsigned long *stackend;
664 unsigned long flags;
665 int sig;
666
667 /* Are we prepared to handle this kernel fault? */
668 if (fixup_exception(regs))
669 return;
670
671 /*
672 * 32-bit:
673 *
674 * Valid to do another page fault here, because if this fault
675 * had been triggered by is_prefetch fixup_exception would have
676 * handled it.
677 *
678 * 64-bit:
679 *
680 * Hall of shame of CPU/BIOS bugs.
681 */
682 if (is_prefetch(regs, error_code, address))
683 return;
684
685 if (is_errata93(regs, address))
686 return;
687
688 /*
689 * Oops. The kernel tried to access some bad page. We'll have to
690 * terminate things with extreme prejudice:
691 */
692 flags = oops_begin();
693
694 show_fault_oops(regs, error_code, address);
695
696 stackend = end_of_stack(tsk);
697 if (*stackend != STACK_END_MAGIC)
698 printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
699
700 tsk->thread.cr2 = address;
701 tsk->thread.trap_no = 14;
702 tsk->thread.error_code = error_code;
703
704 sig = SIGKILL;
705 if (__die("Oops", regs, error_code))
706 sig = 0;
707
708 /* Executive summary in case the body of the oops scrolled away */
709 printk(KERN_EMERG "CR2: %016lx\n", address);
710
711 oops_end(flags, regs, sig);
712}
713
714/*
715 * Print out info about fatal segfaults, if the show_unhandled_signals
716 * sysctl is set:
717 */
718static inline void
719show_signal_msg(struct pt_regs *regs, unsigned long error_code,
720 unsigned long address, struct task_struct *tsk)
721{
722 if (!unhandled_signal(tsk, SIGSEGV))
723 return;
724
725 if (!printk_ratelimit())
726 return;
727
728 printk(KERN_CONT "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
729 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
730 tsk->comm, task_pid_nr(tsk), address,
731 (void *)regs->ip, (void *)regs->sp, error_code);
732
733 print_vma_addr(KERN_CONT " in ", regs->ip);
734
735 printk(KERN_CONT "\n");
736}
737
738static void
739__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
740 unsigned long address, int si_code)
741{
742 struct task_struct *tsk = current;
743
744 /* User mode accesses just cause a SIGSEGV */
745 if (error_code & PF_USER) {
746 /*
747 * It's possible to have interrupts off here:
748 */
749 local_irq_enable();
750
751 /*
752 * Valid to do another page fault here because this one came
753 * from user space:
754 */
755 if (is_prefetch(regs, error_code, address))
756 return;
757
758 if (is_errata100(regs, address))
759 return;
760
761 if (unlikely(show_unhandled_signals))
762 show_signal_msg(regs, error_code, address, tsk);
763
764 /* Kernel addresses are always protection faults: */
765 tsk->thread.cr2 = address;
766 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
767 tsk->thread.trap_no = 14;
768
769 force_sig_info_fault(SIGSEGV, si_code, address, tsk);
770
771 return;
772 }
773
774 if (is_f00f_bug(regs, address))
775 return;
776
777 no_context(regs, error_code, address);
778}
779
780static noinline void
781bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
782 unsigned long address)
783{
784 __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
785}
786
787static void
788__bad_area(struct pt_regs *regs, unsigned long error_code,
789 unsigned long address, int si_code)
790{
791 struct mm_struct *mm = current->mm;
792
793 /*
794 * Something tried to access memory that isn't in our memory map..
795 * Fix it, but check if it's kernel or user first..
796 */
797 up_read(&mm->mmap_sem);
798
799 __bad_area_nosemaphore(regs, error_code, address, si_code);
800}
801
802static noinline void
803bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
804{
805 __bad_area(regs, error_code, address, SEGV_MAPERR);
806}
807
808static noinline void
809bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
810 unsigned long address)
811{
812 __bad_area(regs, error_code, address, SEGV_ACCERR);
813}
814
815/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
816static void
817out_of_memory(struct pt_regs *regs, unsigned long error_code,
818 unsigned long address)
819{
820 /*
821 * We ran out of memory, call the OOM killer, and return the userspace
822 * (which will retry the fault, or kill us if we got oom-killed):
823 */
824 up_read(&current->mm->mmap_sem);
825
826 pagefault_out_of_memory();
827}
828
829static void
830do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address)
831{
832 struct task_struct *tsk = current;
833 struct mm_struct *mm = tsk->mm;
834
835 up_read(&mm->mmap_sem);
836
837 /* Kernel mode? Handle exceptions or die: */
838 if (!(error_code & PF_USER))
839 no_context(regs, error_code, address);
840
841 /* User-space => ok to do another page fault: */
842 if (is_prefetch(regs, error_code, address))
843 return;
844
845 tsk->thread.cr2 = address;
846 tsk->thread.error_code = error_code;
847 tsk->thread.trap_no = 14;
848
849 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
850}
851
852static noinline void
853mm_fault_error(struct pt_regs *regs, unsigned long error_code,
854 unsigned long address, unsigned int fault)
855{
856 if (fault & VM_FAULT_OOM) {
857 out_of_memory(regs, error_code, address);
858 } else {
859 if (fault & VM_FAULT_SIGBUS)
860 do_sigbus(regs, error_code, address);
861 else
862 BUG();
863 }
864}
431 865
432static int spurious_fault_check(unsigned long error_code, pte_t *pte) 866static int spurious_fault_check(unsigned long error_code, pte_t *pte)
433{ 867{
434 if ((error_code & PF_WRITE) && !pte_write(*pte)) 868 if ((error_code & PF_WRITE) && !pte_write(*pte))
435 return 0; 869 return 0;
870
436 if ((error_code & PF_INSTR) && !pte_exec(*pte)) 871 if ((error_code & PF_INSTR) && !pte_exec(*pte))
437 return 0; 872 return 0;
438 873
@@ -440,21 +875,25 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
440} 875}
441 876
442/* 877/*
443 * Handle a spurious fault caused by a stale TLB entry. This allows 878 * Handle a spurious fault caused by a stale TLB entry.
444 * us to lazily refresh the TLB when increasing the permissions of a 879 *
445 * kernel page (RO -> RW or NX -> X). Doing it eagerly is very 880 * This allows us to lazily refresh the TLB when increasing the
446 * expensive since that implies doing a full cross-processor TLB 881 * permissions of a kernel page (RO -> RW or NX -> X). Doing it
447 * flush, even if no stale TLB entries exist on other processors. 882 * eagerly is very expensive since that implies doing a full
883 * cross-processor TLB flush, even if no stale TLB entries exist
884 * on other processors.
885 *
448 * There are no security implications to leaving a stale TLB when 886 * There are no security implications to leaving a stale TLB when
449 * increasing the permissions on a page. 887 * increasing the permissions on a page.
450 */ 888 */
451static int spurious_fault(unsigned long address, 889static noinline int
452 unsigned long error_code) 890spurious_fault(unsigned long error_code, unsigned long address)
453{ 891{
454 pgd_t *pgd; 892 pgd_t *pgd;
455 pud_t *pud; 893 pud_t *pud;
456 pmd_t *pmd; 894 pmd_t *pmd;
457 pte_t *pte; 895 pte_t *pte;
896 int ret;
458 897
459 /* Reserved-bit violation or user access to kernel space? */ 898 /* Reserved-bit violation or user access to kernel space? */
460 if (error_code & (PF_USER | PF_RSVD)) 899 if (error_code & (PF_USER | PF_RSVD))
@@ -482,127 +921,71 @@ static int spurious_fault(unsigned long address,
482 if (!pte_present(*pte)) 921 if (!pte_present(*pte))
483 return 0; 922 return 0;
484 923
485 return spurious_fault_check(error_code, pte); 924 ret = spurious_fault_check(error_code, pte);
486} 925 if (!ret)
487 926 return 0;
488/*
489 * X86_32
490 * Handle a fault on the vmalloc or module mapping area
491 *
492 * X86_64
493 * Handle a fault on the vmalloc area
494 *
495 * This assumes no large pages in there.
496 */
497static int vmalloc_fault(unsigned long address)
498{
499#ifdef CONFIG_X86_32
500 unsigned long pgd_paddr;
501 pmd_t *pmd_k;
502 pte_t *pte_k;
503
504 /* Make sure we are in vmalloc area */
505 if (!(address >= VMALLOC_START && address < VMALLOC_END))
506 return -1;
507 927
508 /* 928 /*
509 * Synchronize this task's top level page-table 929 * Make sure we have permissions in PMD.
510 * with the 'reference' page table. 930 * If not, then there's a bug in the page tables:
511 *
512 * Do _not_ use "current" here. We might be inside
513 * an interrupt in the middle of a task switch..
514 */ 931 */
515 pgd_paddr = read_cr3(); 932 ret = spurious_fault_check(error_code, (pte_t *) pmd);
516 pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); 933 WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
517 if (!pmd_k)
518 return -1;
519 pte_k = pte_offset_kernel(pmd_k, address);
520 if (!pte_present(*pte_k))
521 return -1;
522 return 0;
523#else
524 pgd_t *pgd, *pgd_ref;
525 pud_t *pud, *pud_ref;
526 pmd_t *pmd, *pmd_ref;
527 pte_t *pte, *pte_ref;
528 934
529 /* Make sure we are in vmalloc area */ 935 return ret;
530 if (!(address >= VMALLOC_START && address < VMALLOC_END)) 936}
531 return -1;
532 937
533 /* Copy kernel mappings over when needed. This can also 938int show_unhandled_signals = 1;
534 happen within a race in page table update. In the later
535 case just flush. */
536 939
537 pgd = pgd_offset(current->active_mm, address); 940static inline int
538 pgd_ref = pgd_offset_k(address); 941access_error(unsigned long error_code, int write, struct vm_area_struct *vma)
539 if (pgd_none(*pgd_ref)) 942{
540 return -1; 943 if (write) {
541 if (pgd_none(*pgd)) 944 /* write, present and write, not present: */
542 set_pgd(pgd, *pgd_ref); 945 if (unlikely(!(vma->vm_flags & VM_WRITE)))
543 else 946 return 1;
544 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); 947 return 0;
948 }
545 949
546 /* Below here mismatches are bugs because these lower tables 950 /* read, present: */
547 are shared */ 951 if (unlikely(error_code & PF_PROT))
952 return 1;
953
954 /* read, not present: */
955 if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
956 return 1;
548 957
549 pud = pud_offset(pgd, address);
550 pud_ref = pud_offset(pgd_ref, address);
551 if (pud_none(*pud_ref))
552 return -1;
553 if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
554 BUG();
555 pmd = pmd_offset(pud, address);
556 pmd_ref = pmd_offset(pud_ref, address);
557 if (pmd_none(*pmd_ref))
558 return -1;
559 if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
560 BUG();
561 pte_ref = pte_offset_kernel(pmd_ref, address);
562 if (!pte_present(*pte_ref))
563 return -1;
564 pte = pte_offset_kernel(pmd, address);
565 /* Don't use pte_page here, because the mappings can point
566 outside mem_map, and the NUMA hash lookup cannot handle
567 that. */
568 if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
569 BUG();
570 return 0; 958 return 0;
571#endif
572} 959}
573 960
574int show_unhandled_signals = 1; 961static int fault_in_kernel_space(unsigned long address)
962{
963 return address >= TASK_SIZE_MAX;
964}
575 965
576/* 966/*
577 * This routine handles page faults. It determines the address, 967 * This routine handles page faults. It determines the address,
578 * and the problem, and then passes it off to one of the appropriate 968 * and the problem, and then passes it off to one of the appropriate
579 * routines. 969 * routines.
580 */ 970 */
581#ifdef CONFIG_X86_64 971dotraplinkage void __kprobes
582asmlinkage 972do_page_fault(struct pt_regs *regs, unsigned long error_code)
583#endif
584void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
585{ 973{
586 struct task_struct *tsk;
587 struct mm_struct *mm;
588 struct vm_area_struct *vma; 974 struct vm_area_struct *vma;
975 struct task_struct *tsk;
589 unsigned long address; 976 unsigned long address;
590 int write, si_code; 977 struct mm_struct *mm;
978 int write;
591 int fault; 979 int fault;
592#ifdef CONFIG_X86_64
593 unsigned long flags;
594 int sig;
595#endif
596 980
597 tsk = current; 981 tsk = current;
598 mm = tsk->mm; 982 mm = tsk->mm;
983
599 prefetchw(&mm->mmap_sem); 984 prefetchw(&mm->mmap_sem);
600 985
601 /* get the address */ 986 /* Get the faulting address: */
602 address = read_cr2(); 987 address = read_cr2();
603 988
604 si_code = SEGV_MAPERR;
605
606 if (unlikely(kmmio_fault(regs, address))) 989 if (unlikely(kmmio_fault(regs, address)))
607 return; 990 return;
608 991
@@ -619,319 +1002,147 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
619 * (error_code & 4) == 0, and that the fault was not a 1002 * (error_code & 4) == 0, and that the fault was not a
620 * protection error (error_code & 9) == 0. 1003 * protection error (error_code & 9) == 0.
621 */ 1004 */
622#ifdef CONFIG_X86_32 1005 if (unlikely(fault_in_kernel_space(address))) {
623 if (unlikely(address >= TASK_SIZE)) {
624#else
625 if (unlikely(address >= TASK_SIZE64)) {
626#endif
627 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && 1006 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
628 vmalloc_fault(address) >= 0) 1007 vmalloc_fault(address) >= 0)
629 return; 1008 return;
630 1009
631 /* Can handle a stale RO->RW TLB */ 1010 /* Can handle a stale RO->RW TLB: */
632 if (spurious_fault(address, error_code)) 1011 if (spurious_fault(error_code, address))
633 return; 1012 return;
634 1013
635 /* kprobes don't want to hook the spurious faults. */ 1014 /* kprobes don't want to hook the spurious faults: */
636 if (notify_page_fault(regs)) 1015 if (notify_page_fault(regs))
637 return; 1016 return;
638 /* 1017 /*
639 * Don't take the mm semaphore here. If we fixup a prefetch 1018 * Don't take the mm semaphore here. If we fixup a prefetch
640 * fault we could otherwise deadlock. 1019 * fault we could otherwise deadlock:
641 */ 1020 */
642 goto bad_area_nosemaphore; 1021 bad_area_nosemaphore(regs, error_code, address);
643 }
644 1022
645 /* kprobes don't want to hook the spurious faults. */
646 if (notify_page_fault(regs))
647 return; 1023 return;
1024 }
648 1025
1026 /* kprobes don't want to hook the spurious faults: */
1027 if (unlikely(notify_page_fault(regs)))
1028 return;
649 /* 1029 /*
650 * It's safe to allow irq's after cr2 has been saved and the 1030 * It's safe to allow irq's after cr2 has been saved and the
651 * vmalloc fault has been handled. 1031 * vmalloc fault has been handled.
652 * 1032 *
653 * User-mode registers count as a user access even for any 1033 * User-mode registers count as a user access even for any
654 * potential system fault or CPU buglet. 1034 * potential system fault or CPU buglet:
655 */ 1035 */
656 if (user_mode_vm(regs)) { 1036 if (user_mode_vm(regs)) {
657 local_irq_enable(); 1037 local_irq_enable();
658 error_code |= PF_USER; 1038 error_code |= PF_USER;
659 } else if (regs->flags & X86_EFLAGS_IF) 1039 } else {
660 local_irq_enable(); 1040 if (regs->flags & X86_EFLAGS_IF)
1041 local_irq_enable();
1042 }
661 1043
662#ifdef CONFIG_X86_64
663 if (unlikely(error_code & PF_RSVD)) 1044 if (unlikely(error_code & PF_RSVD))
664 pgtable_bad(address, regs, error_code); 1045 pgtable_bad(regs, error_code, address);
665#endif
666 1046
667 /* 1047 /*
668 * If we're in an interrupt, have no user context or are running in an 1048 * If we're in an interrupt, have no user context or are running
669 * atomic region then we must not take the fault. 1049 * in an atomic region then we must not take the fault:
670 */ 1050 */
671 if (unlikely(in_atomic() || !mm)) 1051 if (unlikely(in_atomic() || !mm)) {
672 goto bad_area_nosemaphore; 1052 bad_area_nosemaphore(regs, error_code, address);
1053 return;
1054 }
673 1055
674 /* 1056 /*
675 * When running in the kernel we expect faults to occur only to 1057 * When running in the kernel we expect faults to occur only to
676 * addresses in user space. All other faults represent errors in the 1058 * addresses in user space. All other faults represent errors in
677 * kernel and should generate an OOPS. Unfortunately, in the case of an 1059 * the kernel and should generate an OOPS. Unfortunately, in the
678 * erroneous fault occurring in a code path which already holds mmap_sem 1060 * case of an erroneous fault occurring in a code path which already
679 * we will deadlock attempting to validate the fault against the 1061 * holds mmap_sem we will deadlock attempting to validate the fault
680 * address space. Luckily the kernel only validly references user 1062 * against the address space. Luckily the kernel only validly
681 * space from well defined areas of code, which are listed in the 1063 * references user space from well defined areas of code, which are
682 * exceptions table. 1064 * listed in the exceptions table.
683 * 1065 *
684 * As the vast majority of faults will be valid we will only perform 1066 * As the vast majority of faults will be valid we will only perform
685 * the source reference check when there is a possibility of a deadlock. 1067 * the source reference check when there is a possibility of a
686 * Attempt to lock the address space, if we cannot we then validate the 1068 * deadlock. Attempt to lock the address space, if we cannot we then
687 * source. If this is invalid we can skip the address space check, 1069 * validate the source. If this is invalid we can skip the address
688 * thus avoiding the deadlock. 1070 * space check, thus avoiding the deadlock:
689 */ 1071 */
690 if (!down_read_trylock(&mm->mmap_sem)) { 1072 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
691 if ((error_code & PF_USER) == 0 && 1073 if ((error_code & PF_USER) == 0 &&
692 !search_exception_tables(regs->ip)) 1074 !search_exception_tables(regs->ip)) {
693 goto bad_area_nosemaphore; 1075 bad_area_nosemaphore(regs, error_code, address);
1076 return;
1077 }
694 down_read(&mm->mmap_sem); 1078 down_read(&mm->mmap_sem);
1079 } else {
1080 /*
1081 * The above down_read_trylock() might have succeeded in
1082 * which case we'll have missed the might_sleep() from
1083 * down_read():
1084 */
1085 might_sleep();
695 } 1086 }
696 1087
697 vma = find_vma(mm, address); 1088 vma = find_vma(mm, address);
698 if (!vma) 1089 if (unlikely(!vma)) {
699 goto bad_area; 1090 bad_area(regs, error_code, address);
700 if (vma->vm_start <= address) 1091 return;
1092 }
1093 if (likely(vma->vm_start <= address))
701 goto good_area; 1094 goto good_area;
702 if (!(vma->vm_flags & VM_GROWSDOWN)) 1095 if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
703 goto bad_area; 1096 bad_area(regs, error_code, address);
1097 return;
1098 }
704 if (error_code & PF_USER) { 1099 if (error_code & PF_USER) {
705 /* 1100 /*
706 * Accessing the stack below %sp is always a bug. 1101 * Accessing the stack below %sp is always a bug.
707 * The large cushion allows instructions like enter 1102 * The large cushion allows instructions like enter
708 * and pusha to work. ("enter $65535,$31" pushes 1103 * and pusha to work. ("enter $65535, $31" pushes
709 * 32 pointers and then decrements %sp by 65535.) 1104 * 32 pointers and then decrements %sp by 65535.)
710 */ 1105 */
711 if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp) 1106 if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
712 goto bad_area; 1107 bad_area(regs, error_code, address);
1108 return;
1109 }
713 } 1110 }
714 if (expand_stack(vma, address)) 1111 if (unlikely(expand_stack(vma, address))) {
715 goto bad_area; 1112 bad_area(regs, error_code, address);
716/* 1113 return;
717 * Ok, we have a good vm_area for this memory access, so 1114 }
718 * we can handle it.. 1115
719 */ 1116 /*
1117 * Ok, we have a good vm_area for this memory access, so
1118 * we can handle it..
1119 */
720good_area: 1120good_area:
721 si_code = SEGV_ACCERR; 1121 write = error_code & PF_WRITE;
722 write = 0; 1122
723 switch (error_code & (PF_PROT|PF_WRITE)) { 1123 if (unlikely(access_error(error_code, write, vma))) {
724 default: /* 3: write, present */ 1124 bad_area_access_error(regs, error_code, address);
725 /* fall through */ 1125 return;
726 case PF_WRITE: /* write, not present */
727 if (!(vma->vm_flags & VM_WRITE))
728 goto bad_area;
729 write++;
730 break;
731 case PF_PROT: /* read, present */
732 goto bad_area;
733 case 0: /* read, not present */
734 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
735 goto bad_area;
736 } 1126 }
737 1127
738 /* 1128 /*
739 * If for any reason at all we couldn't handle the fault, 1129 * If for any reason at all we couldn't handle the fault,
740 * make sure we exit gracefully rather than endlessly redo 1130 * make sure we exit gracefully rather than endlessly redo
741 * the fault. 1131 * the fault:
742 */ 1132 */
743 fault = handle_mm_fault(mm, vma, address, write); 1133 fault = handle_mm_fault(mm, vma, address, write);
1134
744 if (unlikely(fault & VM_FAULT_ERROR)) { 1135 if (unlikely(fault & VM_FAULT_ERROR)) {
745 if (fault & VM_FAULT_OOM) 1136 mm_fault_error(regs, error_code, address, fault);
746 goto out_of_memory; 1137 return;
747 else if (fault & VM_FAULT_SIGBUS)
748 goto do_sigbus;
749 BUG();
750 } 1138 }
1139
751 if (fault & VM_FAULT_MAJOR) 1140 if (fault & VM_FAULT_MAJOR)
752 tsk->maj_flt++; 1141 tsk->maj_flt++;
753 else 1142 else
754 tsk->min_flt++; 1143 tsk->min_flt++;
755 1144
756#ifdef CONFIG_X86_32 1145 check_v8086_mode(regs, address, tsk);
757 /*
758 * Did it hit the DOS screen memory VA from vm86 mode?
759 */
760 if (v8086_mode(regs)) {
761 unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
762 if (bit < 32)
763 tsk->thread.screen_bitmap |= 1 << bit;
764 }
765#endif
766 up_read(&mm->mmap_sem);
767 return;
768 1146
769/*
770 * Something tried to access memory that isn't in our memory map..
771 * Fix it, but check if it's kernel or user first..
772 */
773bad_area:
774 up_read(&mm->mmap_sem); 1147 up_read(&mm->mmap_sem);
775
776bad_area_nosemaphore:
777 /* User mode accesses just cause a SIGSEGV */
778 if (error_code & PF_USER) {
779 /*
780 * It's possible to have interrupts off here.
781 */
782 local_irq_enable();
783
784 /*
785 * Valid to do another page fault here because this one came
786 * from user space.
787 */
788 if (is_prefetch(regs, address, error_code))
789 return;
790
791 if (is_errata100(regs, address))
792 return;
793
794 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
795 printk_ratelimit()) {
796 printk(
797 "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
798 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
799 tsk->comm, task_pid_nr(tsk), address,
800 (void *) regs->ip, (void *) regs->sp, error_code);
801 print_vma_addr(" in ", regs->ip);
802 printk("\n");
803 }
804
805 tsk->thread.cr2 = address;
806 /* Kernel addresses are always protection faults */
807 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
808 tsk->thread.trap_no = 14;
809 force_sig_info_fault(SIGSEGV, si_code, address, tsk);
810 return;
811 }
812
813 if (is_f00f_bug(regs, address))
814 return;
815
816no_context:
817 /* Are we prepared to handle this kernel fault? */
818 if (fixup_exception(regs))
819 return;
820
821 /*
822 * X86_32
823 * Valid to do another page fault here, because if this fault
824 * had been triggered by is_prefetch fixup_exception would have
825 * handled it.
826 *
827 * X86_64
828 * Hall of shame of CPU/BIOS bugs.
829 */
830 if (is_prefetch(regs, address, error_code))
831 return;
832
833 if (is_errata93(regs, address))
834 return;
835
836/*
837 * Oops. The kernel tried to access some bad page. We'll have to
838 * terminate things with extreme prejudice.
839 */
840#ifdef CONFIG_X86_32
841 bust_spinlocks(1);
842#else
843 flags = oops_begin();
844#endif
845
846 show_fault_oops(regs, error_code, address);
847
848 tsk->thread.cr2 = address;
849 tsk->thread.trap_no = 14;
850 tsk->thread.error_code = error_code;
851
852#ifdef CONFIG_X86_32
853 die("Oops", regs, error_code);
854 bust_spinlocks(0);
855 do_exit(SIGKILL);
856#else
857 sig = SIGKILL;
858 if (__die("Oops", regs, error_code))
859 sig = 0;
860 /* Executive summary in case the body of the oops scrolled away */
861 printk(KERN_EMERG "CR2: %016lx\n", address);
862 oops_end(flags, regs, sig);
863#endif
864
865out_of_memory:
866 /*
867 * We ran out of memory, call the OOM killer, and return the userspace
868 * (which will retry the fault, or kill us if we got oom-killed).
869 */
870 up_read(&mm->mmap_sem);
871 pagefault_out_of_memory();
872 return;
873
874do_sigbus:
875 up_read(&mm->mmap_sem);
876
877 /* Kernel mode? Handle exceptions or die */
878 if (!(error_code & PF_USER))
879 goto no_context;
880#ifdef CONFIG_X86_32
881 /* User space => ok to do another page fault */
882 if (is_prefetch(regs, address, error_code))
883 return;
884#endif
885 tsk->thread.cr2 = address;
886 tsk->thread.error_code = error_code;
887 tsk->thread.trap_no = 14;
888 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
889}
890
891DEFINE_SPINLOCK(pgd_lock);
892LIST_HEAD(pgd_list);
893
894void vmalloc_sync_all(void)
895{
896 unsigned long address;
897
898#ifdef CONFIG_X86_32
899 if (SHARED_KERNEL_PMD)
900 return;
901
902 for (address = VMALLOC_START & PMD_MASK;
903 address >= TASK_SIZE && address < FIXADDR_TOP;
904 address += PMD_SIZE) {
905 unsigned long flags;
906 struct page *page;
907
908 spin_lock_irqsave(&pgd_lock, flags);
909 list_for_each_entry(page, &pgd_list, lru) {
910 if (!vmalloc_sync_one(page_address(page),
911 address))
912 break;
913 }
914 spin_unlock_irqrestore(&pgd_lock, flags);
915 }
916#else /* CONFIG_X86_64 */
917 for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
918 address += PGDIR_SIZE) {
919 const pgd_t *pgd_ref = pgd_offset_k(address);
920 unsigned long flags;
921 struct page *page;
922
923 if (pgd_none(*pgd_ref))
924 continue;
925 spin_lock_irqsave(&pgd_lock, flags);
926 list_for_each_entry(page, &pgd_list, lru) {
927 pgd_t *pgd;
928 pgd = (pgd_t *)page_address(page) + pgd_index(address);
929 if (pgd_none(*pgd))
930 set_pgd(pgd, *pgd_ref);
931 else
932 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
933 }
934 spin_unlock_irqrestore(&pgd_lock, flags);
935 }
936#endif
937} 1148}
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index bcc079c282dd..f256e73542d7 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -1,5 +1,6 @@
1#include <linux/highmem.h> 1#include <linux/highmem.h>
2#include <linux/module.h> 2#include <linux/module.h>
3#include <linux/swap.h> /* for totalram_pages */
3 4
4void *kmap(struct page *page) 5void *kmap(struct page *page)
5{ 6{
@@ -120,23 +121,30 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
120 pagefault_enable(); 121 pagefault_enable();
121} 122}
122 123
123/* This is the same as kmap_atomic() but can map memory that doesn't 124void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
124 * have a struct page associated with it.
125 */
126void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
127{ 125{
128 enum fixed_addresses idx; 126 enum fixed_addresses idx;
129 unsigned long vaddr; 127 unsigned long vaddr;
130 128
131 pagefault_disable(); 129 pagefault_disable();
132 130
133 idx = type + KM_TYPE_NR*smp_processor_id(); 131 debug_kmap_atomic_prot(type);
132
133 idx = type + KM_TYPE_NR * smp_processor_id();
134 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); 134 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
135 set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot)); 135 set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
136 arch_flush_lazy_mmu_mode(); 136 arch_flush_lazy_mmu_mode();
137 137
138 return (void*) vaddr; 138 return (void*) vaddr;
139} 139}
140
141/* This is the same as kmap_atomic() but can map memory that doesn't
142 * have a struct page associated with it.
143 */
144void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
145{
146 return kmap_atomic_prot_pfn(pfn, type, kmap_prot);
147}
140EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */ 148EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */
141 149
142struct page *kmap_atomic_to_page(void *ptr) 150struct page *kmap_atomic_to_page(void *ptr)
@@ -156,3 +164,27 @@ EXPORT_SYMBOL(kmap);
156EXPORT_SYMBOL(kunmap); 164EXPORT_SYMBOL(kunmap);
157EXPORT_SYMBOL(kmap_atomic); 165EXPORT_SYMBOL(kmap_atomic);
158EXPORT_SYMBOL(kunmap_atomic); 166EXPORT_SYMBOL(kunmap_atomic);
167
168void __init set_highmem_pages_init(void)
169{
170 struct zone *zone;
171 int nid;
172
173 for_each_zone(zone) {
174 unsigned long zone_start_pfn, zone_end_pfn;
175
176 if (!is_highmem(zone))
177 continue;
178
179 zone_start_pfn = zone->zone_start_pfn;
180 zone_end_pfn = zone_start_pfn + zone->spanned_pages;
181
182 nid = zone_to_nid(zone);
183 printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
184 zone->name, nid, zone_start_pfn, zone_end_pfn);
185
186 add_highpages_with_active_regions(nid, zone_start_pfn,
187 zone_end_pfn);
188 }
189 totalram_pages += totalhigh_pages;
190}
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
new file mode 100644
index 000000000000..15219e0d1243
--- /dev/null
+++ b/arch/x86/mm/init.c
@@ -0,0 +1,393 @@
1#include <linux/ioport.h>
2#include <linux/swap.h>
3
4#include <asm/cacheflush.h>
5#include <asm/e820.h>
6#include <asm/init.h>
7#include <asm/page.h>
8#include <asm/page_types.h>
9#include <asm/sections.h>
10#include <asm/system.h>
11#include <asm/tlbflush.h>
12
13unsigned long __initdata e820_table_start;
14unsigned long __meminitdata e820_table_end;
15unsigned long __meminitdata e820_table_top;
16
17int after_bootmem;
18
19int direct_gbpages
20#ifdef CONFIG_DIRECT_GBPAGES
21 = 1
22#endif
23;
24
25static void __init find_early_table_space(unsigned long end, int use_pse,
26 int use_gbpages)
27{
28 unsigned long puds, pmds, ptes, tables, start;
29
30 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
31 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
32
33 if (use_gbpages) {
34 unsigned long extra;
35
36 extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
37 pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
38 } else
39 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
40
41 tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
42
43 if (use_pse) {
44 unsigned long extra;
45
46 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
47#ifdef CONFIG_X86_32
48 extra += PMD_SIZE;
49#endif
50 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
51 } else
52 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
53
54 tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
55
56#ifdef CONFIG_X86_32
57 /* for fixmap */
58 tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
59#endif
60
61 /*
62 * RED-PEN putting page tables only on node 0 could
63 * cause a hotspot and fill up ZONE_DMA. The page tables
64 * need roughly 0.5KB per GB.
65 */
66#ifdef CONFIG_X86_32
67 start = 0x7000;
68 e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
69 tables, PAGE_SIZE);
70#else /* CONFIG_X86_64 */
71 start = 0x8000;
72 e820_table_start = find_e820_area(start, end, tables, PAGE_SIZE);
73#endif
74 if (e820_table_start == -1UL)
75 panic("Cannot find space for the kernel page tables");
76
77 e820_table_start >>= PAGE_SHIFT;
78 e820_table_end = e820_table_start;
79 e820_table_top = e820_table_start + (tables >> PAGE_SHIFT);
80
81 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
82 end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT);
83}
84
85struct map_range {
86 unsigned long start;
87 unsigned long end;
88 unsigned page_size_mask;
89};
90
91#ifdef CONFIG_X86_32
92#define NR_RANGE_MR 3
93#else /* CONFIG_X86_64 */
94#define NR_RANGE_MR 5
95#endif
96
97static int save_mr(struct map_range *mr, int nr_range,
98 unsigned long start_pfn, unsigned long end_pfn,
99 unsigned long page_size_mask)
100{
101 if (start_pfn < end_pfn) {
102 if (nr_range >= NR_RANGE_MR)
103 panic("run out of range for init_memory_mapping\n");
104 mr[nr_range].start = start_pfn<<PAGE_SHIFT;
105 mr[nr_range].end = end_pfn<<PAGE_SHIFT;
106 mr[nr_range].page_size_mask = page_size_mask;
107 nr_range++;
108 }
109
110 return nr_range;
111}
112
113#ifdef CONFIG_X86_64
114static void __init init_gbpages(void)
115{
116 if (direct_gbpages && cpu_has_gbpages)
117 printk(KERN_INFO "Using GB pages for direct mapping\n");
118 else
119 direct_gbpages = 0;
120}
121#else
122static inline void init_gbpages(void)
123{
124}
125#endif
126
127/*
128 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
129 * This runs before bootmem is initialized and gets pages directly from
130 * the physical memory. To access them they are temporarily mapped.
131 */
132unsigned long __init_refok init_memory_mapping(unsigned long start,
133 unsigned long end)
134{
135 unsigned long page_size_mask = 0;
136 unsigned long start_pfn, end_pfn;
137 unsigned long ret = 0;
138 unsigned long pos;
139
140 struct map_range mr[NR_RANGE_MR];
141 int nr_range, i;
142 int use_pse, use_gbpages;
143
144 printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
145
146 if (!after_bootmem)
147 init_gbpages();
148
149#ifdef CONFIG_DEBUG_PAGEALLOC
150 /*
151 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
152 * This will simplify cpa(), which otherwise needs to support splitting
153 * large pages into small in interrupt context, etc.
154 */
155 use_pse = use_gbpages = 0;
156#else
157 use_pse = cpu_has_pse;
158 use_gbpages = direct_gbpages;
159#endif
160
161#ifdef CONFIG_X86_32
162#ifdef CONFIG_X86_PAE
163 set_nx();
164 if (nx_enabled)
165 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
166#endif
167
168 /* Enable PSE if available */
169 if (cpu_has_pse)
170 set_in_cr4(X86_CR4_PSE);
171
172 /* Enable PGE if available */
173 if (cpu_has_pge) {
174 set_in_cr4(X86_CR4_PGE);
175 __supported_pte_mask |= _PAGE_GLOBAL;
176 }
177#endif
178
179 if (use_gbpages)
180 page_size_mask |= 1 << PG_LEVEL_1G;
181 if (use_pse)
182 page_size_mask |= 1 << PG_LEVEL_2M;
183
184 memset(mr, 0, sizeof(mr));
185 nr_range = 0;
186
187 /* head if not big page alignment ? */
188 start_pfn = start >> PAGE_SHIFT;
189 pos = start_pfn << PAGE_SHIFT;
190#ifdef CONFIG_X86_32
191 /*
192 * Don't use a large page for the first 2/4MB of memory
193 * because there are often fixed size MTRRs in there
194 * and overlapping MTRRs into large pages can cause
195 * slowdowns.
196 */
197 if (pos == 0)
198 end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
199 else
200 end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
201 << (PMD_SHIFT - PAGE_SHIFT);
202#else /* CONFIG_X86_64 */
203 end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
204 << (PMD_SHIFT - PAGE_SHIFT);
205#endif
206 if (end_pfn > (end >> PAGE_SHIFT))
207 end_pfn = end >> PAGE_SHIFT;
208 if (start_pfn < end_pfn) {
209 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
210 pos = end_pfn << PAGE_SHIFT;
211 }
212
213 /* big page (2M) range */
214 start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
215 << (PMD_SHIFT - PAGE_SHIFT);
216#ifdef CONFIG_X86_32
217 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
218#else /* CONFIG_X86_64 */
219 end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
220 << (PUD_SHIFT - PAGE_SHIFT);
221 if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
222 end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
223#endif
224
225 if (start_pfn < end_pfn) {
226 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
227 page_size_mask & (1<<PG_LEVEL_2M));
228 pos = end_pfn << PAGE_SHIFT;
229 }
230
231#ifdef CONFIG_X86_64
232 /* big page (1G) range */
233 start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
234 << (PUD_SHIFT - PAGE_SHIFT);
235 end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
236 if (start_pfn < end_pfn) {
237 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
238 page_size_mask &
239 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
240 pos = end_pfn << PAGE_SHIFT;
241 }
242
243 /* tail is not big page (1G) alignment */
244 start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
245 << (PMD_SHIFT - PAGE_SHIFT);
246 end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
247 if (start_pfn < end_pfn) {
248 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
249 page_size_mask & (1<<PG_LEVEL_2M));
250 pos = end_pfn << PAGE_SHIFT;
251 }
252#endif
253
254 /* tail is not big page (2M) alignment */
255 start_pfn = pos>>PAGE_SHIFT;
256 end_pfn = end>>PAGE_SHIFT;
257 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
258
259 /* try to merge same page size and continuous */
260 for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
261 unsigned long old_start;
262 if (mr[i].end != mr[i+1].start ||
263 mr[i].page_size_mask != mr[i+1].page_size_mask)
264 continue;
265 /* move it */
266 old_start = mr[i].start;
267 memmove(&mr[i], &mr[i+1],
268 (nr_range - 1 - i) * sizeof(struct map_range));
269 mr[i--].start = old_start;
270 nr_range--;
271 }
272
273 for (i = 0; i < nr_range; i++)
274 printk(KERN_DEBUG " %010lx - %010lx page %s\n",
275 mr[i].start, mr[i].end,
276 (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
277 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
278
279 /*
280 * Find space for the kernel direct mapping tables.
281 *
282 * Later we should allocate these tables in the local node of the
283 * memory mapped. Unfortunately this is done currently before the
284 * nodes are discovered.
285 */
286 if (!after_bootmem)
287 find_early_table_space(end, use_pse, use_gbpages);
288
289#ifdef CONFIG_X86_32
290 for (i = 0; i < nr_range; i++)
291 kernel_physical_mapping_init(mr[i].start, mr[i].end,
292 mr[i].page_size_mask);
293 ret = end;
294#else /* CONFIG_X86_64 */
295 for (i = 0; i < nr_range; i++)
296 ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
297 mr[i].page_size_mask);
298#endif
299
300#ifdef CONFIG_X86_32
301 early_ioremap_page_table_range_init();
302
303 load_cr3(swapper_pg_dir);
304#endif
305
306#ifdef CONFIG_X86_64
307 if (!after_bootmem)
308 mmu_cr4_features = read_cr4();
309#endif
310 __flush_tlb_all();
311
312 if (!after_bootmem && e820_table_end > e820_table_start)
313 reserve_early(e820_table_start << PAGE_SHIFT,
314 e820_table_end << PAGE_SHIFT, "PGTABLE");
315
316 if (!after_bootmem)
317 early_memtest(start, end);
318
319 return ret >> PAGE_SHIFT;
320}
321
322
323/*
324 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
325 * is valid. The argument is a physical page number.
326 *
327 *
328 * On x86, access has to be given to the first megabyte of ram because that area
329 * contains bios code and data regions used by X and dosemu and similar apps.
330 * Access has to be given to non-kernel-ram areas as well, these contain the PCI
331 * mmio resources as well as potential bios/acpi data regions.
332 */
333int devmem_is_allowed(unsigned long pagenr)
334{
335 if (pagenr <= 256)
336 return 1;
337 if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
338 return 0;
339 if (!page_is_ram(pagenr))
340 return 1;
341 return 0;
342}
343
344void free_init_pages(char *what, unsigned long begin, unsigned long end)
345{
346 unsigned long addr = begin;
347
348 if (addr >= end)
349 return;
350
351 /*
352 * If debugging page accesses then do not free this memory but
353 * mark them not present - any buggy init-section access will
354 * create a kernel page fault:
355 */
356#ifdef CONFIG_DEBUG_PAGEALLOC
357 printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
358 begin, PAGE_ALIGN(end));
359 set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
360#else
361 /*
362 * We just marked the kernel text read only above, now that
363 * we are going to free part of that, we need to make that
364 * writeable first.
365 */
366 set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
367
368 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
369
370 for (; addr < end; addr += PAGE_SIZE) {
371 ClearPageReserved(virt_to_page(addr));
372 init_page_count(virt_to_page(addr));
373 memset((void *)(addr & ~(PAGE_SIZE-1)),
374 POISON_FREE_INITMEM, PAGE_SIZE);
375 free_page(addr);
376 totalram_pages++;
377 }
378#endif
379}
380
381void free_initmem(void)
382{
383 free_init_pages("unused kernel memory",
384 (unsigned long)(&__init_begin),
385 (unsigned long)(&__init_end));
386}
387
388#ifdef CONFIG_BLK_DEV_INITRD
389void free_initrd_mem(unsigned long start, unsigned long end)
390{
391 free_init_pages("initrd memory", start, end);
392}
393#endif
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 2cef05074413..749559ed80f5 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -49,9 +49,7 @@
49#include <asm/paravirt.h> 49#include <asm/paravirt.h>
50#include <asm/setup.h> 50#include <asm/setup.h>
51#include <asm/cacheflush.h> 51#include <asm/cacheflush.h>
52#include <asm/smp.h> 52#include <asm/init.h>
53
54unsigned int __VMALLOC_RESERVE = 128 << 20;
55 53
56unsigned long max_low_pfn_mapped; 54unsigned long max_low_pfn_mapped;
57unsigned long max_pfn_mapped; 55unsigned long max_pfn_mapped;
@@ -61,19 +59,14 @@ unsigned long highstart_pfn, highend_pfn;
61 59
62static noinline int do_test_wp_bit(void); 60static noinline int do_test_wp_bit(void);
63 61
64 62bool __read_mostly __vmalloc_start_set = false;
65static unsigned long __initdata table_start;
66static unsigned long __meminitdata table_end;
67static unsigned long __meminitdata table_top;
68
69static int __initdata after_init_bootmem;
70 63
71static __init void *alloc_low_page(void) 64static __init void *alloc_low_page(void)
72{ 65{
73 unsigned long pfn = table_end++; 66 unsigned long pfn = e820_table_end++;
74 void *adr; 67 void *adr;
75 68
76 if (pfn >= table_top) 69 if (pfn >= e820_table_top)
77 panic("alloc_low_page: ran out of memory"); 70 panic("alloc_low_page: ran out of memory");
78 71
79 adr = __va(pfn * PAGE_SIZE); 72 adr = __va(pfn * PAGE_SIZE);
@@ -93,7 +86,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
93 86
94#ifdef CONFIG_X86_PAE 87#ifdef CONFIG_X86_PAE
95 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { 88 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
96 if (after_init_bootmem) 89 if (after_bootmem)
97 pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); 90 pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
98 else 91 else
99 pmd_table = (pmd_t *)alloc_low_page(); 92 pmd_table = (pmd_t *)alloc_low_page();
@@ -120,7 +113,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
120 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { 113 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
121 pte_t *page_table = NULL; 114 pte_t *page_table = NULL;
122 115
123 if (after_init_bootmem) { 116 if (after_bootmem) {
124#ifdef CONFIG_DEBUG_PAGEALLOC 117#ifdef CONFIG_DEBUG_PAGEALLOC
125 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); 118 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
126#endif 119#endif
@@ -138,6 +131,23 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
138 return pte_offset_kernel(pmd, 0); 131 return pte_offset_kernel(pmd, 0);
139} 132}
140 133
134pmd_t * __init populate_extra_pmd(unsigned long vaddr)
135{
136 int pgd_idx = pgd_index(vaddr);
137 int pmd_idx = pmd_index(vaddr);
138
139 return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx;
140}
141
142pte_t * __init populate_extra_pte(unsigned long vaddr)
143{
144 int pte_idx = pte_index(vaddr);
145 pmd_t *pmd;
146
147 pmd = populate_extra_pmd(vaddr);
148 return one_page_table_init(pmd) + pte_idx;
149}
150
141static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, 151static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
142 unsigned long vaddr, pte_t *lastpte) 152 unsigned long vaddr, pte_t *lastpte)
143{ 153{
@@ -154,12 +164,12 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
154 if (pmd_idx_kmap_begin != pmd_idx_kmap_end 164 if (pmd_idx_kmap_begin != pmd_idx_kmap_end
155 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin 165 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
156 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end 166 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
157 && ((__pa(pte) >> PAGE_SHIFT) < table_start 167 && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start
158 || (__pa(pte) >> PAGE_SHIFT) >= table_end)) { 168 || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) {
159 pte_t *newpte; 169 pte_t *newpte;
160 int i; 170 int i;
161 171
162 BUG_ON(after_init_bootmem); 172 BUG_ON(after_bootmem);
163 newpte = alloc_low_page(); 173 newpte = alloc_low_page();
164 for (i = 0; i < PTRS_PER_PTE; i++) 174 for (i = 0; i < PTRS_PER_PTE; i++)
165 set_pte(newpte + i, pte[i]); 175 set_pte(newpte + i, pte[i]);
@@ -228,11 +238,14 @@ static inline int is_kernel_text(unsigned long addr)
228 * of max_low_pfn pages, by creating page tables starting from address 238 * of max_low_pfn pages, by creating page tables starting from address
229 * PAGE_OFFSET: 239 * PAGE_OFFSET:
230 */ 240 */
231static void __init kernel_physical_mapping_init(pgd_t *pgd_base, 241unsigned long __init
232 unsigned long start_pfn, 242kernel_physical_mapping_init(unsigned long start,
233 unsigned long end_pfn, 243 unsigned long end,
234 int use_pse) 244 unsigned long page_size_mask)
235{ 245{
246 int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
247 unsigned long start_pfn, end_pfn;
248 pgd_t *pgd_base = swapper_pg_dir;
236 int pgd_idx, pmd_idx, pte_ofs; 249 int pgd_idx, pmd_idx, pte_ofs;
237 unsigned long pfn; 250 unsigned long pfn;
238 pgd_t *pgd; 251 pgd_t *pgd;
@@ -241,6 +254,9 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
241 unsigned pages_2m, pages_4k; 254 unsigned pages_2m, pages_4k;
242 int mapping_iter; 255 int mapping_iter;
243 256
257 start_pfn = start >> PAGE_SHIFT;
258 end_pfn = end >> PAGE_SHIFT;
259
244 /* 260 /*
245 * First iteration will setup identity mapping using large/small pages 261 * First iteration will setup identity mapping using large/small pages
246 * based on use_pse, with other attributes same as set by 262 * based on use_pse, with other attributes same as set by
@@ -355,26 +371,6 @@ repeat:
355 mapping_iter = 2; 371 mapping_iter = 2;
356 goto repeat; 372 goto repeat;
357 } 373 }
358}
359
360/*
361 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
362 * is valid. The argument is a physical page number.
363 *
364 *
365 * On x86, access has to be given to the first megabyte of ram because that area
366 * contains bios code and data regions used by X and dosemu and similar apps.
367 * Access has to be given to non-kernel-ram areas as well, these contain the PCI
368 * mmio resources as well as potential bios/acpi data regions.
369 */
370int devmem_is_allowed(unsigned long pagenr)
371{
372 if (pagenr <= 256)
373 return 1;
374 if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
375 return 0;
376 if (!page_is_ram(pagenr))
377 return 1;
378 return 0; 374 return 0;
379} 375}
380 376
@@ -470,22 +466,10 @@ void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
470 work_with_active_regions(nid, add_highpages_work_fn, &data); 466 work_with_active_regions(nid, add_highpages_work_fn, &data);
471} 467}
472 468
473#ifndef CONFIG_NUMA
474static void __init set_highmem_pages_init(void)
475{
476 add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
477
478 totalram_pages += totalhigh_pages;
479}
480#endif /* !CONFIG_NUMA */
481
482#else 469#else
483static inline void permanent_kmaps_init(pgd_t *pgd_base) 470static inline void permanent_kmaps_init(pgd_t *pgd_base)
484{ 471{
485} 472}
486static inline void set_highmem_pages_init(void)
487{
488}
489#endif /* CONFIG_HIGHMEM */ 473#endif /* CONFIG_HIGHMEM */
490 474
491void __init native_pagetable_setup_start(pgd_t *base) 475void __init native_pagetable_setup_start(pgd_t *base)
@@ -543,8 +527,9 @@ void __init native_pagetable_setup_done(pgd_t *base)
543 * be partially populated, and so it avoids stomping on any existing 527 * be partially populated, and so it avoids stomping on any existing
544 * mappings. 528 * mappings.
545 */ 529 */
546static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base) 530void __init early_ioremap_page_table_range_init(void)
547{ 531{
532 pgd_t *pgd_base = swapper_pg_dir;
548 unsigned long vaddr, end; 533 unsigned long vaddr, end;
549 534
550 /* 535 /*
@@ -639,7 +624,7 @@ static int __init noexec_setup(char *str)
639} 624}
640early_param("noexec", noexec_setup); 625early_param("noexec", noexec_setup);
641 626
642static void __init set_nx(void) 627void __init set_nx(void)
643{ 628{
644 unsigned int v[4], l, h; 629 unsigned int v[4], l, h;
645 630
@@ -675,75 +660,97 @@ static int __init parse_highmem(char *arg)
675} 660}
676early_param("highmem", parse_highmem); 661early_param("highmem", parse_highmem);
677 662
663#define MSG_HIGHMEM_TOO_BIG \
664 "highmem size (%luMB) is bigger than pages available (%luMB)!\n"
665
666#define MSG_LOWMEM_TOO_SMALL \
667 "highmem size (%luMB) results in <64MB lowmem, ignoring it!\n"
678/* 668/*
679 * Determine low and high memory ranges: 669 * All of RAM fits into lowmem - but if user wants highmem
670 * artificially via the highmem=x boot parameter then create
671 * it:
680 */ 672 */
681void __init find_low_pfn_range(void) 673void __init lowmem_pfn_init(void)
682{ 674{
683 /* it could update max_pfn */
684
685 /* max_low_pfn is 0, we already have early_res support */ 675 /* max_low_pfn is 0, we already have early_res support */
686
687 max_low_pfn = max_pfn; 676 max_low_pfn = max_pfn;
688 if (max_low_pfn > MAXMEM_PFN) { 677
689 if (highmem_pages == -1) 678 if (highmem_pages == -1)
690 highmem_pages = max_pfn - MAXMEM_PFN; 679 highmem_pages = 0;
691 if (highmem_pages + MAXMEM_PFN < max_pfn) 680#ifdef CONFIG_HIGHMEM
692 max_pfn = MAXMEM_PFN + highmem_pages; 681 if (highmem_pages >= max_pfn) {
693 if (highmem_pages + MAXMEM_PFN > max_pfn) { 682 printk(KERN_ERR MSG_HIGHMEM_TOO_BIG,
694 printk(KERN_WARNING "only %luMB highmem pages " 683 pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
695 "available, ignoring highmem size of %uMB.\n", 684 highmem_pages = 0;
696 pages_to_mb(max_pfn - MAXMEM_PFN), 685 }
686 if (highmem_pages) {
687 if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE) {
688 printk(KERN_ERR MSG_LOWMEM_TOO_SMALL,
697 pages_to_mb(highmem_pages)); 689 pages_to_mb(highmem_pages));
698 highmem_pages = 0; 690 highmem_pages = 0;
699 } 691 }
700 max_low_pfn = MAXMEM_PFN; 692 max_low_pfn -= highmem_pages;
693 }
694#else
695 if (highmem_pages)
696 printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
697#endif
698}
699
700#define MSG_HIGHMEM_TOO_SMALL \
701 "only %luMB highmem pages available, ignoring highmem size of %luMB!\n"
702
703#define MSG_HIGHMEM_TRIMMED \
704 "Warning: only 4GB will be used. Use a HIGHMEM64G enabled kernel!\n"
705/*
706 * We have more RAM than fits into lowmem - we try to put it into
707 * highmem, also taking the highmem=x boot parameter into account:
708 */
709void __init highmem_pfn_init(void)
710{
711 max_low_pfn = MAXMEM_PFN;
712
713 if (highmem_pages == -1)
714 highmem_pages = max_pfn - MAXMEM_PFN;
715
716 if (highmem_pages + MAXMEM_PFN < max_pfn)
717 max_pfn = MAXMEM_PFN + highmem_pages;
718
719 if (highmem_pages + MAXMEM_PFN > max_pfn) {
720 printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL,
721 pages_to_mb(max_pfn - MAXMEM_PFN),
722 pages_to_mb(highmem_pages));
723 highmem_pages = 0;
724 }
701#ifndef CONFIG_HIGHMEM 725#ifndef CONFIG_HIGHMEM
702 /* Maximum memory usable is what is directly addressable */ 726 /* Maximum memory usable is what is directly addressable */
703 printk(KERN_WARNING "Warning only %ldMB will be used.\n", 727 printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20);
704 MAXMEM>>20); 728 if (max_pfn > MAX_NONPAE_PFN)
705 if (max_pfn > MAX_NONPAE_PFN) 729 printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
706 printk(KERN_WARNING 730 else
707 "Use a HIGHMEM64G enabled kernel.\n"); 731 printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
708 else 732 max_pfn = MAXMEM_PFN;
709 printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
710 max_pfn = MAXMEM_PFN;
711#else /* !CONFIG_HIGHMEM */ 733#else /* !CONFIG_HIGHMEM */
712#ifndef CONFIG_HIGHMEM64G 734#ifndef CONFIG_HIGHMEM64G
713 if (max_pfn > MAX_NONPAE_PFN) { 735 if (max_pfn > MAX_NONPAE_PFN) {
714 max_pfn = MAX_NONPAE_PFN; 736 max_pfn = MAX_NONPAE_PFN;
715 printk(KERN_WARNING "Warning only 4GB will be used." 737 printk(KERN_WARNING MSG_HIGHMEM_TRIMMED);
716 "Use a HIGHMEM64G enabled kernel.\n"); 738 }
717 }
718#endif /* !CONFIG_HIGHMEM64G */ 739#endif /* !CONFIG_HIGHMEM64G */
719#endif /* !CONFIG_HIGHMEM */ 740#endif /* !CONFIG_HIGHMEM */
720 } else { 741}
721 if (highmem_pages == -1) 742
722 highmem_pages = 0; 743/*
723#ifdef CONFIG_HIGHMEM 744 * Determine low and high memory ranges:
724 if (highmem_pages >= max_pfn) { 745 */
725 printk(KERN_ERR "highmem size specified (%uMB) is " 746void __init find_low_pfn_range(void)
726 "bigger than pages available (%luMB)!.\n", 747{
727 pages_to_mb(highmem_pages), 748 /* it could update max_pfn */
728 pages_to_mb(max_pfn)); 749
729 highmem_pages = 0; 750 if (max_pfn <= MAXMEM_PFN)
730 } 751 lowmem_pfn_init();
731 if (highmem_pages) { 752 else
732 if (max_low_pfn - highmem_pages < 753 highmem_pfn_init();
733 64*1024*1024/PAGE_SIZE){
734 printk(KERN_ERR "highmem size %uMB results in "
735 "smaller than 64MB lowmem, ignoring it.\n"
736 , pages_to_mb(highmem_pages));
737 highmem_pages = 0;
738 }
739 max_low_pfn -= highmem_pages;
740 }
741#else
742 if (highmem_pages)
743 printk(KERN_ERR "ignoring highmem size on non-highmem"
744 " kernel!\n");
745#endif
746 }
747} 754}
748 755
749#ifndef CONFIG_NEED_MULTIPLE_NODES 756#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -769,6 +776,8 @@ void __init initmem_init(unsigned long start_pfn,
769#ifdef CONFIG_FLATMEM 776#ifdef CONFIG_FLATMEM
770 max_mapnr = num_physpages; 777 max_mapnr = num_physpages;
771#endif 778#endif
779 __vmalloc_start_set = true;
780
772 printk(KERN_NOTICE "%ldMB LOWMEM available.\n", 781 printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
773 pages_to_mb(max_low_pfn)); 782 pages_to_mb(max_low_pfn));
774 783
@@ -790,176 +799,66 @@ static void __init zone_sizes_init(void)
790 free_area_init_nodes(max_zone_pfns); 799 free_area_init_nodes(max_zone_pfns);
791} 800}
792 801
802static unsigned long __init setup_node_bootmem(int nodeid,
803 unsigned long start_pfn,
804 unsigned long end_pfn,
805 unsigned long bootmap)
806{
807 unsigned long bootmap_size;
808
809 /* don't touch min_low_pfn */
810 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
811 bootmap >> PAGE_SHIFT,
812 start_pfn, end_pfn);
813 printk(KERN_INFO " node %d low ram: %08lx - %08lx\n",
814 nodeid, start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
815 printk(KERN_INFO " node %d bootmap %08lx - %08lx\n",
816 nodeid, bootmap, bootmap + bootmap_size);
817 free_bootmem_with_active_regions(nodeid, end_pfn);
818 early_res_to_bootmem(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
819
820 return bootmap + bootmap_size;
821}
822
793void __init setup_bootmem_allocator(void) 823void __init setup_bootmem_allocator(void)
794{ 824{
795 int i; 825 int nodeid;
796 unsigned long bootmap_size, bootmap; 826 unsigned long bootmap_size, bootmap;
797 /* 827 /*
798 * Initialize the boot-time allocator (with low memory only): 828 * Initialize the boot-time allocator (with low memory only):
799 */ 829 */
800 bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT; 830 bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
801 bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT, 831 bootmap = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
802 max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
803 PAGE_SIZE); 832 PAGE_SIZE);
804 if (bootmap == -1L) 833 if (bootmap == -1L)
805 panic("Cannot find bootmem map of size %ld\n", bootmap_size); 834 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
806 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); 835 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
807 836
808 /* don't touch min_low_pfn */
809 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
810 min_low_pfn, max_low_pfn);
811 printk(KERN_INFO " mapped low ram: 0 - %08lx\n", 837 printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
812 max_pfn_mapped<<PAGE_SHIFT); 838 max_pfn_mapped<<PAGE_SHIFT);
813 printk(KERN_INFO " low ram: %08lx - %08lx\n", 839 printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
814 min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
815 printk(KERN_INFO " bootmap %08lx - %08lx\n",
816 bootmap, bootmap + bootmap_size);
817 for_each_online_node(i)
818 free_bootmem_with_active_regions(i, max_low_pfn);
819 early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
820
821 after_init_bootmem = 1;
822}
823
824static void __init find_early_table_space(unsigned long end, int use_pse)
825{
826 unsigned long puds, pmds, ptes, tables, start;
827
828 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
829 tables = PAGE_ALIGN(puds * sizeof(pud_t));
830 840
831 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; 841 for_each_online_node(nodeid) {
832 tables += PAGE_ALIGN(pmds * sizeof(pmd_t)); 842 unsigned long start_pfn, end_pfn;
833 843
834 if (use_pse) { 844#ifdef CONFIG_NEED_MULTIPLE_NODES
835 unsigned long extra; 845 start_pfn = node_start_pfn[nodeid];
836 846 end_pfn = node_end_pfn[nodeid];
837 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); 847 if (start_pfn > max_low_pfn)
838 extra += PMD_SIZE; 848 continue;
839 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; 849 if (end_pfn > max_low_pfn)
840 } else 850 end_pfn = max_low_pfn;
841 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
842
843 tables += PAGE_ALIGN(ptes * sizeof(pte_t));
844
845 /* for fixmap */
846 tables += PAGE_ALIGN(__end_of_fixed_addresses * sizeof(pte_t));
847
848 /*
849 * RED-PEN putting page tables only on node 0 could
850 * cause a hotspot and fill up ZONE_DMA. The page tables
851 * need roughly 0.5KB per GB.
852 */
853 start = 0x7000;
854 table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
855 tables, PAGE_SIZE);
856 if (table_start == -1UL)
857 panic("Cannot find space for the kernel page tables");
858
859 table_start >>= PAGE_SHIFT;
860 table_end = table_start;
861 table_top = table_start + (tables>>PAGE_SHIFT);
862
863 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
864 end, table_start << PAGE_SHIFT,
865 (table_start << PAGE_SHIFT) + tables);
866}
867
868unsigned long __init_refok init_memory_mapping(unsigned long start,
869 unsigned long end)
870{
871 pgd_t *pgd_base = swapper_pg_dir;
872 unsigned long start_pfn, end_pfn;
873 unsigned long big_page_start;
874#ifdef CONFIG_DEBUG_PAGEALLOC
875 /*
876 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
877 * This will simplify cpa(), which otherwise needs to support splitting
878 * large pages into small in interrupt context, etc.
879 */
880 int use_pse = 0;
881#else 851#else
882 int use_pse = cpu_has_pse; 852 start_pfn = 0;
853 end_pfn = max_low_pfn;
883#endif 854#endif
884 855 bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn,
885 /* 856 bootmap);
886 * Find space for the kernel direct mapping tables.
887 */
888 if (!after_init_bootmem)
889 find_early_table_space(end, use_pse);
890
891#ifdef CONFIG_X86_PAE
892 set_nx();
893 if (nx_enabled)
894 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
895#endif
896
897 /* Enable PSE if available */
898 if (cpu_has_pse)
899 set_in_cr4(X86_CR4_PSE);
900
901 /* Enable PGE if available */
902 if (cpu_has_pge) {
903 set_in_cr4(X86_CR4_PGE);
904 __supported_pte_mask |= _PAGE_GLOBAL;
905 }
906
907 /*
908 * Don't use a large page for the first 2/4MB of memory
909 * because there are often fixed size MTRRs in there
910 * and overlapping MTRRs into large pages can cause
911 * slowdowns.
912 */
913 big_page_start = PMD_SIZE;
914
915 if (start < big_page_start) {
916 start_pfn = start >> PAGE_SHIFT;
917 end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
918 } else {
919 /* head is not big page alignment ? */
920 start_pfn = start >> PAGE_SHIFT;
921 end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
922 << (PMD_SHIFT - PAGE_SHIFT);
923 }
924 if (start_pfn < end_pfn)
925 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
926
927 /* big page range */
928 start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
929 << (PMD_SHIFT - PAGE_SHIFT);
930 if (start_pfn < (big_page_start >> PAGE_SHIFT))
931 start_pfn = big_page_start >> PAGE_SHIFT;
932 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
933 if (start_pfn < end_pfn)
934 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
935 use_pse);
936
937 /* tail is not big page alignment ? */
938 start_pfn = end_pfn;
939 if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
940 end_pfn = end >> PAGE_SHIFT;
941 if (start_pfn < end_pfn)
942 kernel_physical_mapping_init(pgd_base, start_pfn,
943 end_pfn, 0);
944 } 857 }
945 858
946 early_ioremap_page_table_range_init(pgd_base); 859 after_bootmem = 1;
947
948 load_cr3(swapper_pg_dir);
949
950 __flush_tlb_all();
951
952 if (!after_init_bootmem)
953 reserve_early(table_start << PAGE_SHIFT,
954 table_end << PAGE_SHIFT, "PGTABLE");
955
956 if (!after_init_bootmem)
957 early_memtest(start, end);
958
959 return end >> PAGE_SHIFT;
960} 860}
961 861
962
963/* 862/*
964 * paging_init() sets up the page tables - note that the first 8MB are 863 * paging_init() sets up the page tables - note that the first 8MB are
965 * already mapped by head.S. 864 * already mapped by head.S.
@@ -1155,17 +1054,47 @@ static noinline int do_test_wp_bit(void)
1155const int rodata_test_data = 0xC3; 1054const int rodata_test_data = 0xC3;
1156EXPORT_SYMBOL_GPL(rodata_test_data); 1055EXPORT_SYMBOL_GPL(rodata_test_data);
1157 1056
1057static int kernel_set_to_readonly;
1058
1059void set_kernel_text_rw(void)
1060{
1061 unsigned long start = PFN_ALIGN(_text);
1062 unsigned long size = PFN_ALIGN(_etext) - start;
1063
1064 if (!kernel_set_to_readonly)
1065 return;
1066
1067 pr_debug("Set kernel text: %lx - %lx for read write\n",
1068 start, start+size);
1069
1070 set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
1071}
1072
1073void set_kernel_text_ro(void)
1074{
1075 unsigned long start = PFN_ALIGN(_text);
1076 unsigned long size = PFN_ALIGN(_etext) - start;
1077
1078 if (!kernel_set_to_readonly)
1079 return;
1080
1081 pr_debug("Set kernel text: %lx - %lx for read only\n",
1082 start, start+size);
1083
1084 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
1085}
1086
1158void mark_rodata_ro(void) 1087void mark_rodata_ro(void)
1159{ 1088{
1160 unsigned long start = PFN_ALIGN(_text); 1089 unsigned long start = PFN_ALIGN(_text);
1161 unsigned long size = PFN_ALIGN(_etext) - start; 1090 unsigned long size = PFN_ALIGN(_etext) - start;
1162 1091
1163#ifndef CONFIG_DYNAMIC_FTRACE
1164 /* Dynamic tracing modifies the kernel text section */
1165 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 1092 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
1166 printk(KERN_INFO "Write protecting the kernel text: %luk\n", 1093 printk(KERN_INFO "Write protecting the kernel text: %luk\n",
1167 size >> 10); 1094 size >> 10);
1168 1095
1096 kernel_set_to_readonly = 1;
1097
1169#ifdef CONFIG_CPA_DEBUG 1098#ifdef CONFIG_CPA_DEBUG
1170 printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n", 1099 printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
1171 start, start+size); 1100 start, start+size);
@@ -1174,7 +1103,6 @@ void mark_rodata_ro(void)
1174 printk(KERN_INFO "Testing CPA: write protecting again\n"); 1103 printk(KERN_INFO "Testing CPA: write protecting again\n");
1175 set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); 1104 set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
1176#endif 1105#endif
1177#endif /* CONFIG_DYNAMIC_FTRACE */
1178 1106
1179 start += size; 1107 start += size;
1180 size = (unsigned long)__end_rodata - start; 1108 size = (unsigned long)__end_rodata - start;
@@ -1193,52 +1121,6 @@ void mark_rodata_ro(void)
1193} 1121}
1194#endif 1122#endif
1195 1123
1196void free_init_pages(char *what, unsigned long begin, unsigned long end)
1197{
1198#ifdef CONFIG_DEBUG_PAGEALLOC
1199 /*
1200 * If debugging page accesses then do not free this memory but
1201 * mark them not present - any buggy init-section access will
1202 * create a kernel page fault:
1203 */
1204 printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
1205 begin, PAGE_ALIGN(end));
1206 set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
1207#else
1208 unsigned long addr;
1209
1210 /*
1211 * We just marked the kernel text read only above, now that
1212 * we are going to free part of that, we need to make that
1213 * writeable first.
1214 */
1215 set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
1216
1217 for (addr = begin; addr < end; addr += PAGE_SIZE) {
1218 ClearPageReserved(virt_to_page(addr));
1219 init_page_count(virt_to_page(addr));
1220 memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
1221 free_page(addr);
1222 totalram_pages++;
1223 }
1224 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
1225#endif
1226}
1227
1228void free_initmem(void)
1229{
1230 free_init_pages("unused kernel memory",
1231 (unsigned long)(&__init_begin),
1232 (unsigned long)(&__init_end));
1233}
1234
1235#ifdef CONFIG_BLK_DEV_INITRD
1236void free_initrd_mem(unsigned long start, unsigned long end)
1237{
1238 free_init_pages("initrd memory", start, end);
1239}
1240#endif
1241
1242int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, 1124int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
1243 int flags) 1125 int flags)
1244{ 1126{
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index b1352250096e..1753e8020df6 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -48,6 +48,7 @@
48#include <asm/kdebug.h> 48#include <asm/kdebug.h>
49#include <asm/numa.h> 49#include <asm/numa.h>
50#include <asm/cacheflush.h> 50#include <asm/cacheflush.h>
51#include <asm/init.h>
51 52
52/* 53/*
53 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 54 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -61,12 +62,6 @@ static unsigned long dma_reserve __initdata;
61 62
62DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 63DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
63 64
64int direct_gbpages
65#ifdef CONFIG_DIRECT_GBPAGES
66 = 1
67#endif
68;
69
70static int __init parse_direct_gbpages_off(char *arg) 65static int __init parse_direct_gbpages_off(char *arg)
71{ 66{
72 direct_gbpages = 0; 67 direct_gbpages = 0;
@@ -87,12 +82,10 @@ early_param("gbpages", parse_direct_gbpages_on);
87 * around without checking the pgd every time. 82 * around without checking the pgd every time.
88 */ 83 */
89 84
90int after_bootmem;
91
92pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; 85pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
93EXPORT_SYMBOL_GPL(__supported_pte_mask); 86EXPORT_SYMBOL_GPL(__supported_pte_mask);
94 87
95static int do_not_nx __cpuinitdata; 88static int disable_nx __cpuinitdata;
96 89
97/* 90/*
98 * noexec=on|off 91 * noexec=on|off
@@ -107,9 +100,9 @@ static int __init nonx_setup(char *str)
107 return -EINVAL; 100 return -EINVAL;
108 if (!strncmp(str, "on", 2)) { 101 if (!strncmp(str, "on", 2)) {
109 __supported_pte_mask |= _PAGE_NX; 102 __supported_pte_mask |= _PAGE_NX;
110 do_not_nx = 0; 103 disable_nx = 0;
111 } else if (!strncmp(str, "off", 3)) { 104 } else if (!strncmp(str, "off", 3)) {
112 do_not_nx = 1; 105 disable_nx = 1;
113 __supported_pte_mask &= ~_PAGE_NX; 106 __supported_pte_mask &= ~_PAGE_NX;
114 } 107 }
115 return 0; 108 return 0;
@@ -121,7 +114,7 @@ void __cpuinit check_efer(void)
121 unsigned long efer; 114 unsigned long efer;
122 115
123 rdmsrl(MSR_EFER, efer); 116 rdmsrl(MSR_EFER, efer);
124 if (!(efer & EFER_NX) || do_not_nx) 117 if (!(efer & EFER_NX) || disable_nx)
125 __supported_pte_mask &= ~_PAGE_NX; 118 __supported_pte_mask &= ~_PAGE_NX;
126} 119}
127 120
@@ -168,34 +161,51 @@ static __ref void *spp_getpage(void)
168 return ptr; 161 return ptr;
169} 162}
170 163
171void 164static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
172set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
173{ 165{
174 pud_t *pud; 166 if (pgd_none(*pgd)) {
175 pmd_t *pmd; 167 pud_t *pud = (pud_t *)spp_getpage();
176 pte_t *pte; 168 pgd_populate(&init_mm, pgd, pud);
169 if (pud != pud_offset(pgd, 0))
170 printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
171 pud, pud_offset(pgd, 0));
172 }
173 return pud_offset(pgd, vaddr);
174}
177 175
178 pud = pud_page + pud_index(vaddr); 176static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
177{
179 if (pud_none(*pud)) { 178 if (pud_none(*pud)) {
180 pmd = (pmd_t *) spp_getpage(); 179 pmd_t *pmd = (pmd_t *) spp_getpage();
181 pud_populate(&init_mm, pud, pmd); 180 pud_populate(&init_mm, pud, pmd);
182 if (pmd != pmd_offset(pud, 0)) { 181 if (pmd != pmd_offset(pud, 0))
183 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", 182 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
184 pmd, pmd_offset(pud, 0)); 183 pmd, pmd_offset(pud, 0));
185 return;
186 }
187 } 184 }
188 pmd = pmd_offset(pud, vaddr); 185 return pmd_offset(pud, vaddr);
186}
187
188static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
189{
189 if (pmd_none(*pmd)) { 190 if (pmd_none(*pmd)) {
190 pte = (pte_t *) spp_getpage(); 191 pte_t *pte = (pte_t *) spp_getpage();
191 pmd_populate_kernel(&init_mm, pmd, pte); 192 pmd_populate_kernel(&init_mm, pmd, pte);
192 if (pte != pte_offset_kernel(pmd, 0)) { 193 if (pte != pte_offset_kernel(pmd, 0))
193 printk(KERN_ERR "PAGETABLE BUG #02!\n"); 194 printk(KERN_ERR "PAGETABLE BUG #02!\n");
194 return;
195 }
196 } 195 }
196 return pte_offset_kernel(pmd, vaddr);
197}
198
199void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
200{
201 pud_t *pud;
202 pmd_t *pmd;
203 pte_t *pte;
204
205 pud = pud_page + pud_index(vaddr);
206 pmd = fill_pmd(pud, vaddr);
207 pte = fill_pte(pmd, vaddr);
197 208
198 pte = pte_offset_kernel(pmd, vaddr);
199 set_pte(pte, new_pte); 209 set_pte(pte, new_pte);
200 210
201 /* 211 /*
@@ -205,8 +215,7 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
205 __flush_tlb_one(vaddr); 215 __flush_tlb_one(vaddr);
206} 216}
207 217
208void 218void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
209set_pte_vaddr(unsigned long vaddr, pte_t pteval)
210{ 219{
211 pgd_t *pgd; 220 pgd_t *pgd;
212 pud_t *pud_page; 221 pud_t *pud_page;
@@ -223,6 +232,24 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval)
223 set_pte_vaddr_pud(pud_page, vaddr, pteval); 232 set_pte_vaddr_pud(pud_page, vaddr, pteval);
224} 233}
225 234
235pmd_t * __init populate_extra_pmd(unsigned long vaddr)
236{
237 pgd_t *pgd;
238 pud_t *pud;
239
240 pgd = pgd_offset_k(vaddr);
241 pud = fill_pud(pgd, vaddr);
242 return fill_pmd(pud, vaddr);
243}
244
245pte_t * __init populate_extra_pte(unsigned long vaddr)
246{
247 pmd_t *pmd;
248
249 pmd = populate_extra_pmd(vaddr);
250 return fill_pte(pmd, vaddr);
251}
252
226/* 253/*
227 * Create large page table mappings for a range of physical addresses. 254 * Create large page table mappings for a range of physical addresses.
228 */ 255 */
@@ -291,13 +318,9 @@ void __init cleanup_highmap(void)
291 } 318 }
292} 319}
293 320
294static unsigned long __initdata table_start;
295static unsigned long __meminitdata table_end;
296static unsigned long __meminitdata table_top;
297
298static __ref void *alloc_low_page(unsigned long *phys) 321static __ref void *alloc_low_page(unsigned long *phys)
299{ 322{
300 unsigned long pfn = table_end++; 323 unsigned long pfn = e820_table_end++;
301 void *adr; 324 void *adr;
302 325
303 if (after_bootmem) { 326 if (after_bootmem) {
@@ -307,7 +330,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
307 return adr; 330 return adr;
308 } 331 }
309 332
310 if (pfn >= table_top) 333 if (pfn >= e820_table_top)
311 panic("alloc_low_page: ran out of memory"); 334 panic("alloc_low_page: ran out of memory");
312 335
313 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); 336 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
@@ -547,58 +570,10 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
547 return phys_pud_init(pud, addr, end, page_size_mask); 570 return phys_pud_init(pud, addr, end, page_size_mask);
548} 571}
549 572
550static void __init find_early_table_space(unsigned long end, int use_pse, 573unsigned long __init
551 int use_gbpages) 574kernel_physical_mapping_init(unsigned long start,
552{ 575 unsigned long end,
553 unsigned long puds, pmds, ptes, tables, start; 576 unsigned long page_size_mask)
554
555 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
556 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
557 if (use_gbpages) {
558 unsigned long extra;
559 extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
560 pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
561 } else
562 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
563 tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
564
565 if (use_pse) {
566 unsigned long extra;
567 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
568 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
569 } else
570 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
571 tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
572
573 /*
574 * RED-PEN putting page tables only on node 0 could
575 * cause a hotspot and fill up ZONE_DMA. The page tables
576 * need roughly 0.5KB per GB.
577 */
578 start = 0x8000;
579 table_start = find_e820_area(start, end, tables, PAGE_SIZE);
580 if (table_start == -1UL)
581 panic("Cannot find space for the kernel page tables");
582
583 table_start >>= PAGE_SHIFT;
584 table_end = table_start;
585 table_top = table_start + (tables >> PAGE_SHIFT);
586
587 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
588 end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
589}
590
591static void __init init_gbpages(void)
592{
593 if (direct_gbpages && cpu_has_gbpages)
594 printk(KERN_INFO "Using GB pages for direct mapping\n");
595 else
596 direct_gbpages = 0;
597}
598
599static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
600 unsigned long end,
601 unsigned long page_size_mask)
602{ 577{
603 578
604 unsigned long next, last_map_addr = end; 579 unsigned long next, last_map_addr = end;
@@ -635,176 +610,6 @@ static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
635 return last_map_addr; 610 return last_map_addr;
636} 611}
637 612
638struct map_range {
639 unsigned long start;
640 unsigned long end;
641 unsigned page_size_mask;
642};
643
644#define NR_RANGE_MR 5
645
646static int save_mr(struct map_range *mr, int nr_range,
647 unsigned long start_pfn, unsigned long end_pfn,
648 unsigned long page_size_mask)
649{
650
651 if (start_pfn < end_pfn) {
652 if (nr_range >= NR_RANGE_MR)
653 panic("run out of range for init_memory_mapping\n");
654 mr[nr_range].start = start_pfn<<PAGE_SHIFT;
655 mr[nr_range].end = end_pfn<<PAGE_SHIFT;
656 mr[nr_range].page_size_mask = page_size_mask;
657 nr_range++;
658 }
659
660 return nr_range;
661}
662
663/*
664 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
665 * This runs before bootmem is initialized and gets pages directly from
666 * the physical memory. To access them they are temporarily mapped.
667 */
668unsigned long __init_refok init_memory_mapping(unsigned long start,
669 unsigned long end)
670{
671 unsigned long last_map_addr = 0;
672 unsigned long page_size_mask = 0;
673 unsigned long start_pfn, end_pfn;
674 unsigned long pos;
675
676 struct map_range mr[NR_RANGE_MR];
677 int nr_range, i;
678 int use_pse, use_gbpages;
679
680 printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
681
682 /*
683 * Find space for the kernel direct mapping tables.
684 *
685 * Later we should allocate these tables in the local node of the
686 * memory mapped. Unfortunately this is done currently before the
687 * nodes are discovered.
688 */
689 if (!after_bootmem)
690 init_gbpages();
691
692#ifdef CONFIG_DEBUG_PAGEALLOC
693 /*
694 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
695 * This will simplify cpa(), which otherwise needs to support splitting
696 * large pages into small in interrupt context, etc.
697 */
698 use_pse = use_gbpages = 0;
699#else
700 use_pse = cpu_has_pse;
701 use_gbpages = direct_gbpages;
702#endif
703
704 if (use_gbpages)
705 page_size_mask |= 1 << PG_LEVEL_1G;
706 if (use_pse)
707 page_size_mask |= 1 << PG_LEVEL_2M;
708
709 memset(mr, 0, sizeof(mr));
710 nr_range = 0;
711
712 /* head if not big page alignment ?*/
713 start_pfn = start >> PAGE_SHIFT;
714 pos = start_pfn << PAGE_SHIFT;
715 end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
716 << (PMD_SHIFT - PAGE_SHIFT);
717 if (end_pfn > (end >> PAGE_SHIFT))
718 end_pfn = end >> PAGE_SHIFT;
719 if (start_pfn < end_pfn) {
720 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
721 pos = end_pfn << PAGE_SHIFT;
722 }
723
724 /* big page (2M) range*/
725 start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
726 << (PMD_SHIFT - PAGE_SHIFT);
727 end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
728 << (PUD_SHIFT - PAGE_SHIFT);
729 if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
730 end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
731 if (start_pfn < end_pfn) {
732 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
733 page_size_mask & (1<<PG_LEVEL_2M));
734 pos = end_pfn << PAGE_SHIFT;
735 }
736
737 /* big page (1G) range */
738 start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
739 << (PUD_SHIFT - PAGE_SHIFT);
740 end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
741 if (start_pfn < end_pfn) {
742 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
743 page_size_mask &
744 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
745 pos = end_pfn << PAGE_SHIFT;
746 }
747
748 /* tail is not big page (1G) alignment */
749 start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
750 << (PMD_SHIFT - PAGE_SHIFT);
751 end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
752 if (start_pfn < end_pfn) {
753 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
754 page_size_mask & (1<<PG_LEVEL_2M));
755 pos = end_pfn << PAGE_SHIFT;
756 }
757
758 /* tail is not big page (2M) alignment */
759 start_pfn = pos>>PAGE_SHIFT;
760 end_pfn = end>>PAGE_SHIFT;
761 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
762
763 /* try to merge same page size and continuous */
764 for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
765 unsigned long old_start;
766 if (mr[i].end != mr[i+1].start ||
767 mr[i].page_size_mask != mr[i+1].page_size_mask)
768 continue;
769 /* move it */
770 old_start = mr[i].start;
771 memmove(&mr[i], &mr[i+1],
772 (nr_range - 1 - i) * sizeof (struct map_range));
773 mr[i--].start = old_start;
774 nr_range--;
775 }
776
777 for (i = 0; i < nr_range; i++)
778 printk(KERN_DEBUG " %010lx - %010lx page %s\n",
779 mr[i].start, mr[i].end,
780 (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
781 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
782
783 if (!after_bootmem)
784 find_early_table_space(end, use_pse, use_gbpages);
785
786 for (i = 0; i < nr_range; i++)
787 last_map_addr = kernel_physical_mapping_init(
788 mr[i].start, mr[i].end,
789 mr[i].page_size_mask);
790
791 if (!after_bootmem)
792 mmu_cr4_features = read_cr4();
793 __flush_tlb_all();
794
795 if (!after_bootmem && table_end > table_start)
796 reserve_early(table_start << PAGE_SHIFT,
797 table_end << PAGE_SHIFT, "PGTABLE");
798
799 printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
800 last_map_addr, end);
801
802 if (!after_bootmem)
803 early_memtest(start, end);
804
805 return last_map_addr >> PAGE_SHIFT;
806}
807
808#ifndef CONFIG_NUMA 613#ifndef CONFIG_NUMA
809void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) 614void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
810{ 615{
@@ -876,28 +681,6 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
876 681
877#endif /* CONFIG_MEMORY_HOTPLUG */ 682#endif /* CONFIG_MEMORY_HOTPLUG */
878 683
879/*
880 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
881 * is valid. The argument is a physical page number.
882 *
883 *
884 * On x86, access has to be given to the first megabyte of ram because that area
885 * contains bios code and data regions used by X and dosemu and similar apps.
886 * Access has to be given to non-kernel-ram areas as well, these contain the PCI
887 * mmio resources as well as potential bios/acpi data regions.
888 */
889int devmem_is_allowed(unsigned long pagenr)
890{
891 if (pagenr <= 256)
892 return 1;
893 if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
894 return 0;
895 if (!page_is_ram(pagenr))
896 return 1;
897 return 0;
898}
899
900
901static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, 684static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
902 kcore_modules, kcore_vsyscall; 685 kcore_modules, kcore_vsyscall;
903 686
@@ -947,46 +730,39 @@ void __init mem_init(void)
947 initsize >> 10); 730 initsize >> 10);
948} 731}
949 732
950void free_init_pages(char *what, unsigned long begin, unsigned long end) 733#ifdef CONFIG_DEBUG_RODATA
734const int rodata_test_data = 0xC3;
735EXPORT_SYMBOL_GPL(rodata_test_data);
736
737static int kernel_set_to_readonly;
738
739void set_kernel_text_rw(void)
951{ 740{
952 unsigned long addr = begin; 741 unsigned long start = PFN_ALIGN(_stext);
742 unsigned long end = PFN_ALIGN(__start_rodata);
953 743
954 if (addr >= end) 744 if (!kernel_set_to_readonly)
955 return; 745 return;
956 746
957 /* 747 pr_debug("Set kernel text: %lx - %lx for read write\n",
958 * If debugging page accesses then do not free this memory but 748 start, end);
959 * mark them not present - any buggy init-section access will 749
960 * create a kernel page fault: 750 set_memory_rw(start, (end - start) >> PAGE_SHIFT);
961 */
962#ifdef CONFIG_DEBUG_PAGEALLOC
963 printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
964 begin, PAGE_ALIGN(end));
965 set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
966#else
967 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
968
969 for (; addr < end; addr += PAGE_SIZE) {
970 ClearPageReserved(virt_to_page(addr));
971 init_page_count(virt_to_page(addr));
972 memset((void *)(addr & ~(PAGE_SIZE-1)),
973 POISON_FREE_INITMEM, PAGE_SIZE);
974 free_page(addr);
975 totalram_pages++;
976 }
977#endif
978} 751}
979 752
980void free_initmem(void) 753void set_kernel_text_ro(void)
981{ 754{
982 free_init_pages("unused kernel memory", 755 unsigned long start = PFN_ALIGN(_stext);
983 (unsigned long)(&__init_begin), 756 unsigned long end = PFN_ALIGN(__start_rodata);
984 (unsigned long)(&__init_end));
985}
986 757
987#ifdef CONFIG_DEBUG_RODATA 758 if (!kernel_set_to_readonly)
988const int rodata_test_data = 0xC3; 759 return;
989EXPORT_SYMBOL_GPL(rodata_test_data); 760
761 pr_debug("Set kernel text: %lx - %lx for read only\n",
762 start, end);
763
764 set_memory_ro(start, (end - start) >> PAGE_SHIFT);
765}
990 766
991void mark_rodata_ro(void) 767void mark_rodata_ro(void)
992{ 768{
@@ -994,15 +770,12 @@ void mark_rodata_ro(void)
994 unsigned long rodata_start = 770 unsigned long rodata_start =
995 ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; 771 ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
996 772
997#ifdef CONFIG_DYNAMIC_FTRACE
998 /* Dynamic tracing modifies the kernel text section */
999 start = rodata_start;
1000#endif
1001
1002 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", 773 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
1003 (end - start) >> 10); 774 (end - start) >> 10);
1004 set_memory_ro(start, (end - start) >> PAGE_SHIFT); 775 set_memory_ro(start, (end - start) >> PAGE_SHIFT);
1005 776
777 kernel_set_to_readonly = 1;
778
1006 /* 779 /*
1007 * The rodata section (but not the kernel text!) should also be 780 * The rodata section (but not the kernel text!) should also be
1008 * not-executable. 781 * not-executable.
@@ -1022,13 +795,6 @@ void mark_rodata_ro(void)
1022 795
1023#endif 796#endif
1024 797
1025#ifdef CONFIG_BLK_DEV_INITRD
1026void free_initrd_mem(unsigned long start, unsigned long end)
1027{
1028 free_init_pages("initrd memory", start, end);
1029}
1030#endif
1031
1032int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, 798int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
1033 int flags) 799 int flags)
1034{ 800{
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index 04102d42ff42..592984e5496b 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -18,6 +18,7 @@
18 18
19#include <asm/iomap.h> 19#include <asm/iomap.h>
20#include <asm/pat.h> 20#include <asm/pat.h>
21#include <asm/highmem.h>
21#include <linux/module.h> 22#include <linux/module.h>
22 23
23int is_io_mapping_possible(resource_size_t base, unsigned long size) 24int is_io_mapping_possible(resource_size_t base, unsigned long size)
@@ -36,11 +37,6 @@ EXPORT_SYMBOL_GPL(is_io_mapping_possible);
36void * 37void *
37iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) 38iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
38{ 39{
39 enum fixed_addresses idx;
40 unsigned long vaddr;
41
42 pagefault_disable();
43
44 /* 40 /*
45 * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS. 41 * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS.
46 * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the 42 * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the
@@ -50,12 +46,7 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
50 if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC)) 46 if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC))
51 prot = PAGE_KERNEL_UC_MINUS; 47 prot = PAGE_KERNEL_UC_MINUS;
52 48
53 idx = type + KM_TYPE_NR*smp_processor_id(); 49 return kmap_atomic_prot_pfn(pfn, type, prot);
54 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
55 set_pte(kmap_pte-idx, pfn_pte(pfn, prot));
56 arch_flush_lazy_mmu_mode();
57
58 return (void*) vaddr;
59} 50}
60EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); 51EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn);
61 52
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index f45d5e29a72e..aca924a30ee6 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -38,8 +38,7 @@ unsigned long __phys_addr(unsigned long x)
38 } else { 38 } else {
39 VIRTUAL_BUG_ON(x < PAGE_OFFSET); 39 VIRTUAL_BUG_ON(x < PAGE_OFFSET);
40 x -= PAGE_OFFSET; 40 x -= PAGE_OFFSET;
41 VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM : 41 VIRTUAL_BUG_ON(!phys_addr_valid(x));
42 !phys_addr_valid(x));
43 } 42 }
44 return x; 43 return x;
45} 44}
@@ -56,10 +55,8 @@ bool __virt_addr_valid(unsigned long x)
56 if (x < PAGE_OFFSET) 55 if (x < PAGE_OFFSET)
57 return false; 56 return false;
58 x -= PAGE_OFFSET; 57 x -= PAGE_OFFSET;
59 if (system_state == SYSTEM_BOOTING ? 58 if (!phys_addr_valid(x))
60 x > MAXMEM : !phys_addr_valid(x)) {
61 return false; 59 return false;
62 }
63 } 60 }
64 61
65 return pfn_valid(x >> PAGE_SHIFT); 62 return pfn_valid(x >> PAGE_SHIFT);
@@ -76,10 +73,9 @@ static inline int phys_addr_valid(unsigned long addr)
76#ifdef CONFIG_DEBUG_VIRTUAL 73#ifdef CONFIG_DEBUG_VIRTUAL
77unsigned long __phys_addr(unsigned long x) 74unsigned long __phys_addr(unsigned long x)
78{ 75{
79 /* VMALLOC_* aren't constants; not available at the boot time */ 76 /* VMALLOC_* aren't constants */
80 VIRTUAL_BUG_ON(x < PAGE_OFFSET); 77 VIRTUAL_BUG_ON(x < PAGE_OFFSET);
81 VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING && 78 VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
82 is_vmalloc_addr((void *) x));
83 return x - PAGE_OFFSET; 79 return x - PAGE_OFFSET;
84} 80}
85EXPORT_SYMBOL(__phys_addr); 81EXPORT_SYMBOL(__phys_addr);
@@ -89,7 +85,9 @@ bool __virt_addr_valid(unsigned long x)
89{ 85{
90 if (x < PAGE_OFFSET) 86 if (x < PAGE_OFFSET)
91 return false; 87 return false;
92 if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x)) 88 if (__vmalloc_start_set && is_vmalloc_addr((void *) x))
89 return false;
90 if (x >= FIXADDR_START)
93 return false; 91 return false;
94 return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); 92 return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
95} 93}
@@ -348,7 +346,7 @@ EXPORT_SYMBOL(ioremap_nocache);
348 * 346 *
349 * Must be freed with iounmap. 347 * Must be freed with iounmap.
350 */ 348 */
351void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size) 349void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
352{ 350{
353 if (pat_enabled) 351 if (pat_enabled)
354 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC, 352 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
@@ -508,13 +506,19 @@ static inline pte_t * __init early_ioremap_pte(unsigned long addr)
508 return &bm_pte[pte_index(addr)]; 506 return &bm_pte[pte_index(addr)];
509} 507}
510 508
509static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
510
511void __init early_ioremap_init(void) 511void __init early_ioremap_init(void)
512{ 512{
513 pmd_t *pmd; 513 pmd_t *pmd;
514 int i;
514 515
515 if (early_ioremap_debug) 516 if (early_ioremap_debug)
516 printk(KERN_INFO "early_ioremap_init()\n"); 517 printk(KERN_INFO "early_ioremap_init()\n");
517 518
519 for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
520 slot_virt[i] = fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
521
518 pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); 522 pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
519 memset(bm_pte, 0, sizeof(bm_pte)); 523 memset(bm_pte, 0, sizeof(bm_pte));
520 pmd_populate_kernel(&init_mm, pmd, bm_pte); 524 pmd_populate_kernel(&init_mm, pmd, bm_pte);
@@ -581,6 +585,7 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx)
581 585
582static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata; 586static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
583static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; 587static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
588
584static int __init check_early_ioremap_leak(void) 589static int __init check_early_ioremap_leak(void)
585{ 590{
586 int count = 0; 591 int count = 0;
@@ -602,7 +607,8 @@ static int __init check_early_ioremap_leak(void)
602} 607}
603late_initcall(check_early_ioremap_leak); 608late_initcall(check_early_ioremap_leak);
604 609
605static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot) 610static void __init __iomem *
611__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot)
606{ 612{
607 unsigned long offset, last_addr; 613 unsigned long offset, last_addr;
608 unsigned int nrpages; 614 unsigned int nrpages;
@@ -668,9 +674,9 @@ static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned lo
668 --nrpages; 674 --nrpages;
669 } 675 }
670 if (early_ioremap_debug) 676 if (early_ioremap_debug)
671 printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0)); 677 printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]);
672 678
673 prev_map[slot] = (void __iomem *)(offset + fix_to_virt(idx0)); 679 prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
674 return prev_map[slot]; 680 return prev_map[slot];
675} 681}
676 682
@@ -738,8 +744,3 @@ void __init early_iounmap(void __iomem *addr, unsigned long size)
738 } 744 }
739 prev_map[slot] = NULL; 745 prev_map[slot] = NULL;
740} 746}
741
742void __this_fixmap_does_not_exist(void)
743{
744 WARN_ON(1);
745}
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 6a518dd08a36..4f115e00486b 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -310,7 +310,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
310 struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); 310 struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
311 311
312 if (!ctx->active) { 312 if (!ctx->active) {
313 pr_warning("kmmio: spurious debug trap on CPU %d.\n", 313 pr_debug("kmmio: spurious debug trap on CPU %d.\n",
314 smp_processor_id()); 314 smp_processor_id());
315 goto out; 315 goto out;
316 } 316 }
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 9cab18b0b857..605c8be06217 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -9,44 +9,44 @@
9 9
10#include <asm/e820.h> 10#include <asm/e820.h>
11 11
12static void __init memtest(unsigned long start_phys, unsigned long size, 12static u64 patterns[] __initdata = {
13 unsigned pattern) 13 0,
14 0xffffffffffffffffULL,
15 0x5555555555555555ULL,
16 0xaaaaaaaaaaaaaaaaULL,
17 0x1111111111111111ULL,
18 0x2222222222222222ULL,
19 0x4444444444444444ULL,
20 0x8888888888888888ULL,
21 0x3333333333333333ULL,
22 0x6666666666666666ULL,
23 0x9999999999999999ULL,
24 0xccccccccccccccccULL,
25 0x7777777777777777ULL,
26 0xbbbbbbbbbbbbbbbbULL,
27 0xddddddddddddddddULL,
28 0xeeeeeeeeeeeeeeeeULL,
29 0x7a6c7258554e494cULL, /* yeah ;-) */
30};
31
32static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
14{ 33{
15 unsigned long i; 34 printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n",
16 unsigned long *start; 35 (unsigned long long) pattern,
17 unsigned long start_bad; 36 (unsigned long long) start_bad,
18 unsigned long last_bad; 37 (unsigned long long) end_bad);
19 unsigned long val; 38 reserve_early(start_bad, end_bad, "BAD RAM");
20 unsigned long start_phys_aligned; 39}
21 unsigned long count; 40
22 unsigned long incr; 41static void __init memtest(u64 pattern, u64 start_phys, u64 size)
23 42{
24 switch (pattern) { 43 u64 i, count;
25 case 0: 44 u64 *start;
26 val = 0UL; 45 u64 start_bad, last_bad;
27 break; 46 u64 start_phys_aligned;
28 case 1: 47 size_t incr;
29 val = -1UL;
30 break;
31 case 2:
32#ifdef CONFIG_X86_64
33 val = 0x5555555555555555UL;
34#else
35 val = 0x55555555UL;
36#endif
37 break;
38 case 3:
39#ifdef CONFIG_X86_64
40 val = 0xaaaaaaaaaaaaaaaaUL;
41#else
42 val = 0xaaaaaaaaUL;
43#endif
44 break;
45 default:
46 return;
47 }
48 48
49 incr = sizeof(unsigned long); 49 incr = sizeof(pattern);
50 start_phys_aligned = ALIGN(start_phys, incr); 50 start_phys_aligned = ALIGN(start_phys, incr);
51 count = (size - (start_phys_aligned - start_phys))/incr; 51 count = (size - (start_phys_aligned - start_phys))/incr;
52 start = __va(start_phys_aligned); 52 start = __va(start_phys_aligned);
@@ -54,25 +54,42 @@ static void __init memtest(unsigned long start_phys, unsigned long size,
54 last_bad = 0; 54 last_bad = 0;
55 55
56 for (i = 0; i < count; i++) 56 for (i = 0; i < count; i++)
57 start[i] = val; 57 start[i] = pattern;
58 for (i = 0; i < count; i++, start++, start_phys_aligned += incr) { 58 for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
59 if (*start != val) { 59 if (*start == pattern)
60 if (start_phys_aligned == last_bad + incr) { 60 continue;
61 last_bad += incr; 61 if (start_phys_aligned == last_bad + incr) {
62 } else { 62 last_bad += incr;
63 if (start_bad) { 63 continue;
64 printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved",
65 val, start_bad, last_bad + incr);
66 reserve_early(start_bad, last_bad + incr, "BAD RAM");
67 }
68 start_bad = last_bad = start_phys_aligned;
69 }
70 } 64 }
65 if (start_bad)
66 reserve_bad_mem(pattern, start_bad, last_bad + incr);
67 start_bad = last_bad = start_phys_aligned;
71 } 68 }
72 if (start_bad) { 69 if (start_bad)
73 printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved", 70 reserve_bad_mem(pattern, start_bad, last_bad + incr);
74 val, start_bad, last_bad + incr); 71}
75 reserve_early(start_bad, last_bad + incr, "BAD RAM"); 72
73static void __init do_one_pass(u64 pattern, u64 start, u64 end)
74{
75 u64 size = 0;
76
77 while (start < end) {
78 start = find_e820_area_size(start, &size, 1);
79
80 /* done ? */
81 if (start >= end)
82 break;
83 if (start + size > end)
84 size = end - start;
85
86 printk(KERN_INFO " %010llx - %010llx pattern %016llx\n",
87 (unsigned long long) start,
88 (unsigned long long) start + size,
89 (unsigned long long) cpu_to_be64(pattern));
90 memtest(pattern, start, size);
91
92 start += size;
76 } 93 }
77} 94}
78 95
@@ -83,6 +100,9 @@ static int __init parse_memtest(char *arg)
83{ 100{
84 if (arg) 101 if (arg)
85 memtest_pattern = simple_strtoul(arg, NULL, 0); 102 memtest_pattern = simple_strtoul(arg, NULL, 0);
103 else
104 memtest_pattern = ARRAY_SIZE(patterns);
105
86 return 0; 106 return 0;
87} 107}
88 108
@@ -90,33 +110,22 @@ early_param("memtest", parse_memtest);
90 110
91void __init early_memtest(unsigned long start, unsigned long end) 111void __init early_memtest(unsigned long start, unsigned long end)
92{ 112{
93 u64 t_start, t_size; 113 unsigned int i;
94 unsigned pattern; 114 unsigned int idx = 0;
95 115
96 if (!memtest_pattern) 116 if (!memtest_pattern)
97 return; 117 return;
98 118
99 printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern); 119 printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern);
100 for (pattern = 0; pattern < memtest_pattern; pattern++) { 120 for (i = 0; i < memtest_pattern; i++) {
101 t_start = start; 121 idx = i % ARRAY_SIZE(patterns);
102 t_size = 0; 122 do_one_pass(patterns[idx], start, end);
103 while (t_start < end) { 123 }
104 t_start = find_e820_area_size(t_start, &t_size, 1);
105
106 /* done ? */
107 if (t_start >= end)
108 break;
109 if (t_start + t_size > end)
110 t_size = end - t_start;
111
112 printk(KERN_CONT "\n %010llx - %010llx pattern %d",
113 (unsigned long long)t_start,
114 (unsigned long long)t_start + t_size, pattern);
115
116 memtest(t_start, t_size, pattern);
117 124
118 t_start += t_size; 125 if (idx > 0) {
119 } 126 printk(KERN_INFO "early_memtest: wipe out "
127 "test pattern from memory\n");
128 /* additional test with pattern 0 will do this */
129 do_one_pass(0, start, end);
120 } 130 }
121 printk(KERN_CONT "\n");
122} 131}
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 56fe7124fbec..165829600566 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -4,7 +4,7 @@
4 * Based on code by Ingo Molnar and Andi Kleen, copyrighted 4 * Based on code by Ingo Molnar and Andi Kleen, copyrighted
5 * as follows: 5 * as follows:
6 * 6 *
7 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. 7 * Copyright 2003-2009 Red Hat Inc.
8 * All Rights Reserved. 8 * All Rights Reserved.
9 * Copyright 2005 Andi Kleen, SUSE Labs. 9 * Copyright 2005 Andi Kleen, SUSE Labs.
10 * Copyright 2007 Jiri Kosina, SUSE Labs. 10 * Copyright 2007 Jiri Kosina, SUSE Labs.
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index d1f7439d173c..3daefa04ace5 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -194,7 +194,7 @@ void *alloc_remap(int nid, unsigned long size)
194 size = ALIGN(size, L1_CACHE_BYTES); 194 size = ALIGN(size, L1_CACHE_BYTES);
195 195
196 if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid]) 196 if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid])
197 return 0; 197 return NULL;
198 198
199 node_remap_alloc_vaddr[nid] += size; 199 node_remap_alloc_vaddr[nid] += size;
200 memset(allocation, 0, size); 200 memset(allocation, 0, size);
@@ -416,39 +416,14 @@ void __init initmem_init(unsigned long start_pfn,
416 for_each_online_node(nid) 416 for_each_online_node(nid)
417 propagate_e820_map_node(nid); 417 propagate_e820_map_node(nid);
418 418
419 for_each_online_node(nid) 419 for_each_online_node(nid) {
420 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); 420 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
421 NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
422 }
421 423
422 NODE_DATA(0)->bdata = &bootmem_node_data[0];
423 setup_bootmem_allocator(); 424 setup_bootmem_allocator();
424} 425}
425 426
426void __init set_highmem_pages_init(void)
427{
428#ifdef CONFIG_HIGHMEM
429 struct zone *zone;
430 int nid;
431
432 for_each_zone(zone) {
433 unsigned long zone_start_pfn, zone_end_pfn;
434
435 if (!is_highmem(zone))
436 continue;
437
438 zone_start_pfn = zone->zone_start_pfn;
439 zone_end_pfn = zone_start_pfn + zone->spanned_pages;
440
441 nid = zone_to_nid(zone);
442 printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
443 zone->name, nid, zone_start_pfn, zone_end_pfn);
444
445 add_highpages_with_active_regions(nid, zone_start_pfn,
446 zone_end_pfn);
447 }
448 totalram_pages += totalhigh_pages;
449#endif
450}
451
452#ifdef CONFIG_MEMORY_HOTPLUG 427#ifdef CONFIG_MEMORY_HOTPLUG
453static int paddr_to_nid(u64 addr) 428static int paddr_to_nid(u64 addr)
454{ 429{
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index f3516da035d1..64c9cf043cdd 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -20,6 +20,12 @@
20#include <asm/acpi.h> 20#include <asm/acpi.h>
21#include <asm/k8.h> 21#include <asm/k8.h>
22 22
23#ifdef CONFIG_DEBUG_PER_CPU_MAPS
24# define DBG(x...) printk(KERN_DEBUG x)
25#else
26# define DBG(x...)
27#endif
28
23struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 29struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
24EXPORT_SYMBOL(node_data); 30EXPORT_SYMBOL(node_data);
25 31
@@ -33,6 +39,21 @@ int numa_off __initdata;
33static unsigned long __initdata nodemap_addr; 39static unsigned long __initdata nodemap_addr;
34static unsigned long __initdata nodemap_size; 40static unsigned long __initdata nodemap_size;
35 41
42DEFINE_PER_CPU(int, node_number) = 0;
43EXPORT_PER_CPU_SYMBOL(node_number);
44
45/*
46 * Map cpu index to node index
47 */
48DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
49EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
50
51/*
52 * Which logical CPUs are on which nodes
53 */
54cpumask_t *node_to_cpumask_map;
55EXPORT_SYMBOL(node_to_cpumask_map);
56
36/* 57/*
37 * Given a shift value, try to populate memnodemap[] 58 * Given a shift value, try to populate memnodemap[]
38 * Returns : 59 * Returns :
@@ -640,3 +661,199 @@ void __init init_cpu_to_node(void)
640#endif 661#endif
641 662
642 663
664/*
665 * Allocate node_to_cpumask_map based on number of available nodes
666 * Requires node_possible_map to be valid.
667 *
668 * Note: node_to_cpumask() is not valid until after this is done.
669 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
670 */
671void __init setup_node_to_cpumask_map(void)
672{
673 unsigned int node, num = 0;
674 cpumask_t *map;
675
676 /* setup nr_node_ids if not done yet */
677 if (nr_node_ids == MAX_NUMNODES) {
678 for_each_node_mask(node, node_possible_map)
679 num = node;
680 nr_node_ids = num + 1;
681 }
682
683 /* allocate the map */
684 map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
685 DBG("node_to_cpumask_map at %p for %d nodes\n", map, nr_node_ids);
686
687 pr_debug("Node to cpumask map at %p for %d nodes\n",
688 map, nr_node_ids);
689
690 /* node_to_cpumask() will now work */
691 node_to_cpumask_map = map;
692}
693
694void __cpuinit numa_set_node(int cpu, int node)
695{
696 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
697
698 /* early setting, no percpu area yet */
699 if (cpu_to_node_map) {
700 cpu_to_node_map[cpu] = node;
701 return;
702 }
703
704#ifdef CONFIG_DEBUG_PER_CPU_MAPS
705 if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
706 printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
707 dump_stack();
708 return;
709 }
710#endif
711 per_cpu(x86_cpu_to_node_map, cpu) = node;
712
713 if (node != NUMA_NO_NODE)
714 per_cpu(node_number, cpu) = node;
715}
716
717void __cpuinit numa_clear_node(int cpu)
718{
719 numa_set_node(cpu, NUMA_NO_NODE);
720}
721
722#ifndef CONFIG_DEBUG_PER_CPU_MAPS
723
724void __cpuinit numa_add_cpu(int cpu)
725{
726 cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
727}
728
729void __cpuinit numa_remove_cpu(int cpu)
730{
731 cpu_clear(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
732}
733
734#else /* CONFIG_DEBUG_PER_CPU_MAPS */
735
736/*
737 * --------- debug versions of the numa functions ---------
738 */
739static void __cpuinit numa_set_cpumask(int cpu, int enable)
740{
741 int node = early_cpu_to_node(cpu);
742 cpumask_t *mask;
743 char buf[64];
744
745 if (node_to_cpumask_map == NULL) {
746 printk(KERN_ERR "node_to_cpumask_map NULL\n");
747 dump_stack();
748 return;
749 }
750
751 mask = &node_to_cpumask_map[node];
752 if (enable)
753 cpu_set(cpu, *mask);
754 else
755 cpu_clear(cpu, *mask);
756
757 cpulist_scnprintf(buf, sizeof(buf), mask);
758 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
759 enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
760}
761
762void __cpuinit numa_add_cpu(int cpu)
763{
764 numa_set_cpumask(cpu, 1);
765}
766
767void __cpuinit numa_remove_cpu(int cpu)
768{
769 numa_set_cpumask(cpu, 0);
770}
771
772int cpu_to_node(int cpu)
773{
774 if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
775 printk(KERN_WARNING
776 "cpu_to_node(%d): usage too early!\n", cpu);
777 dump_stack();
778 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
779 }
780 return per_cpu(x86_cpu_to_node_map, cpu);
781}
782EXPORT_SYMBOL(cpu_to_node);
783
784/*
785 * Same function as cpu_to_node() but used if called before the
786 * per_cpu areas are setup.
787 */
788int early_cpu_to_node(int cpu)
789{
790 if (early_per_cpu_ptr(x86_cpu_to_node_map))
791 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
792
793 if (!cpu_possible(cpu)) {
794 printk(KERN_WARNING
795 "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
796 dump_stack();
797 return NUMA_NO_NODE;
798 }
799 return per_cpu(x86_cpu_to_node_map, cpu);
800}
801
802
803/* empty cpumask */
804static const cpumask_t cpu_mask_none;
805
806/*
807 * Returns a pointer to the bitmask of CPUs on Node 'node'.
808 */
809const cpumask_t *cpumask_of_node(int node)
810{
811 if (node_to_cpumask_map == NULL) {
812 printk(KERN_WARNING
813 "cpumask_of_node(%d): no node_to_cpumask_map!\n",
814 node);
815 dump_stack();
816 return (const cpumask_t *)&cpu_online_map;
817 }
818 if (node >= nr_node_ids) {
819 printk(KERN_WARNING
820 "cpumask_of_node(%d): node > nr_node_ids(%d)\n",
821 node, nr_node_ids);
822 dump_stack();
823 return &cpu_mask_none;
824 }
825 return &node_to_cpumask_map[node];
826}
827EXPORT_SYMBOL(cpumask_of_node);
828
829/*
830 * Returns a bitmask of CPUs on Node 'node'.
831 *
832 * Side note: this function creates the returned cpumask on the stack
833 * so with a high NR_CPUS count, excessive stack space is used. The
834 * node_to_cpumask_ptr function should be used whenever possible.
835 */
836cpumask_t node_to_cpumask(int node)
837{
838 if (node_to_cpumask_map == NULL) {
839 printk(KERN_WARNING
840 "node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
841 dump_stack();
842 return cpu_online_map;
843 }
844 if (node >= nr_node_ids) {
845 printk(KERN_WARNING
846 "node_to_cpumask(%d): node > nr_node_ids(%d)\n",
847 node, nr_node_ids);
848 dump_stack();
849 return cpu_mask_none;
850 }
851 return node_to_cpumask_map[node];
852}
853EXPORT_SYMBOL(node_to_cpumask);
854
855/*
856 * --------- end of debug versions of the numa functions ---------
857 */
858
859#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 7233bd7e357b..9c4294986af7 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -482,6 +482,13 @@ static int split_large_page(pte_t *kpte, unsigned long address)
482 pbase = (pte_t *)page_address(base); 482 pbase = (pte_t *)page_address(base);
483 paravirt_alloc_pte(&init_mm, page_to_pfn(base)); 483 paravirt_alloc_pte(&init_mm, page_to_pfn(base));
484 ref_prot = pte_pgprot(pte_clrhuge(*kpte)); 484 ref_prot = pte_pgprot(pte_clrhuge(*kpte));
485 /*
486 * If we ever want to utilize the PAT bit, we need to
487 * update this function to make sure it's converted from
488 * bit 12 to bit 7 when we cross from the 2MB level to
489 * the 4K level:
490 */
491 WARN_ON_ONCE(pgprot_val(ref_prot) & _PAGE_PAT_LARGE);
485 492
486#ifdef CONFIG_X86_64 493#ifdef CONFIG_X86_64
487 if (level == PG_LEVEL_1G) { 494 if (level == PG_LEVEL_1G) {
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index e0ab173b6974..2ed37158012d 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -31,7 +31,7 @@
31#ifdef CONFIG_X86_PAT 31#ifdef CONFIG_X86_PAT
32int __read_mostly pat_enabled = 1; 32int __read_mostly pat_enabled = 1;
33 33
34void __cpuinit pat_disable(char *reason) 34void __cpuinit pat_disable(const char *reason)
35{ 35{
36 pat_enabled = 0; 36 pat_enabled = 0;
37 printk(KERN_INFO "%s\n", reason); 37 printk(KERN_INFO "%s\n", reason);
@@ -43,6 +43,11 @@ static int __init nopat(char *str)
43 return 0; 43 return 0;
44} 44}
45early_param("nopat", nopat); 45early_param("nopat", nopat);
46#else
47static inline void pat_disable(const char *reason)
48{
49 (void)reason;
50}
46#endif 51#endif
47 52
48 53
@@ -79,16 +84,20 @@ void pat_init(void)
79 if (!pat_enabled) 84 if (!pat_enabled)
80 return; 85 return;
81 86
82 /* Paranoia check. */ 87 if (!cpu_has_pat) {
83 if (!cpu_has_pat && boot_pat_state) { 88 if (!boot_pat_state) {
84 /* 89 pat_disable("PAT not supported by CPU.");
85 * If this happens we are on a secondary CPU, but 90 return;
86 * switched to PAT on the boot CPU. We have no way to 91 } else {
87 * undo PAT. 92 /*
88 */ 93 * If this happens we are on a secondary CPU, but
89 printk(KERN_ERR "PAT enabled, " 94 * switched to PAT on the boot CPU. We have no way to
90 "but not supported by secondary CPU\n"); 95 * undo PAT.
91 BUG(); 96 */
97 printk(KERN_ERR "PAT enabled, "
98 "but not supported by secondary CPU\n");
99 BUG();
100 }
92 } 101 }
93 102
94 /* Set PWT to Write-Combining. All other bits stay the same */ 103 /* Set PWT to Write-Combining. All other bits stay the same */
@@ -626,6 +635,33 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
626} 635}
627 636
628/* 637/*
638 * Change the memory type for the physial address range in kernel identity
639 * mapping space if that range is a part of identity map.
640 */
641int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags)
642{
643 unsigned long id_sz;
644
645 if (!pat_enabled || base >= __pa(high_memory))
646 return 0;
647
648 id_sz = (__pa(high_memory) < base + size) ?
649 __pa(high_memory) - base :
650 size;
651
652 if (ioremap_change_attr((unsigned long)__va(base), id_sz, flags) < 0) {
653 printk(KERN_INFO
654 "%s:%d ioremap_change_attr failed %s "
655 "for %Lx-%Lx\n",
656 current->comm, current->pid,
657 cattr_name(flags),
658 base, (unsigned long long)(base + size));
659 return -EINVAL;
660 }
661 return 0;
662}
663
664/*
629 * Internal interface to reserve a range of physical memory with prot. 665 * Internal interface to reserve a range of physical memory with prot.
630 * Reserved non RAM regions only and after successful reserve_memtype, 666 * Reserved non RAM regions only and after successful reserve_memtype,
631 * this func also keeps identity mapping (if any) in sync with this new prot. 667 * this func also keeps identity mapping (if any) in sync with this new prot.
@@ -634,7 +670,7 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
634 int strict_prot) 670 int strict_prot)
635{ 671{
636 int is_ram = 0; 672 int is_ram = 0;
637 int id_sz, ret; 673 int ret;
638 unsigned long flags; 674 unsigned long flags;
639 unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK); 675 unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK);
640 676
@@ -671,23 +707,8 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
671 flags); 707 flags);
672 } 708 }
673 709
674 /* Need to keep identity mapping in sync */ 710 if (kernel_map_sync_memtype(paddr, size, flags) < 0) {
675 if (paddr >= __pa(high_memory))
676 return 0;
677
678 id_sz = (__pa(high_memory) < paddr + size) ?
679 __pa(high_memory) - paddr :
680 size;
681
682 if (ioremap_change_attr((unsigned long)__va(paddr), id_sz, flags) < 0) {
683 free_memtype(paddr, paddr + size); 711 free_memtype(paddr, paddr + size);
684 printk(KERN_ERR
685 "%s:%d reserve_pfn_range ioremap_change_attr failed %s "
686 "for %Lx-%Lx\n",
687 current->comm, current->pid,
688 cattr_name(flags),
689 (unsigned long long)paddr,
690 (unsigned long long)(paddr + size));
691 return -EINVAL; 712 return -EINVAL;
692 } 713 }
693 return 0; 714 return 0;
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 86f2ffc43c3d..5b7c7c8464fe 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -313,6 +313,24 @@ int ptep_clear_flush_young(struct vm_area_struct *vma,
313 return young; 313 return young;
314} 314}
315 315
316/**
317 * reserve_top_address - reserves a hole in the top of kernel address space
318 * @reserve - size of hole to reserve
319 *
320 * Can be used to relocate the fixmap area and poke a hole in the top
321 * of kernel address space to make room for a hypervisor.
322 */
323void __init reserve_top_address(unsigned long reserve)
324{
325#ifdef CONFIG_X86_32
326 BUG_ON(fixmaps_set > 0);
327 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
328 (int)-reserve);
329 __FIXADDR_TOP = -reserve - PAGE_SIZE;
330 __VMALLOC_RESERVE += reserve;
331#endif
332}
333
316int fixmaps_set; 334int fixmaps_set;
317 335
318void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) 336void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 0951db9ee519..f2e477c91c1b 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -20,6 +20,8 @@
20#include <asm/tlb.h> 20#include <asm/tlb.h>
21#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
22 22
23unsigned int __VMALLOC_RESERVE = 128 << 20;
24
23/* 25/*
24 * Associate a virtual page frame with a given physical page frame 26 * Associate a virtual page frame with a given physical page frame
25 * and protection flags for that frame. 27 * and protection flags for that frame.
@@ -97,22 +99,6 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
97unsigned long __FIXADDR_TOP = 0xfffff000; 99unsigned long __FIXADDR_TOP = 0xfffff000;
98EXPORT_SYMBOL(__FIXADDR_TOP); 100EXPORT_SYMBOL(__FIXADDR_TOP);
99 101
100/**
101 * reserve_top_address - reserves a hole in the top of kernel address space
102 * @reserve - size of hole to reserve
103 *
104 * Can be used to relocate the fixmap area and poke a hole in the top
105 * of kernel address space to make room for a hypervisor.
106 */
107void __init reserve_top_address(unsigned long reserve)
108{
109 BUG_ON(fixmaps_set > 0);
110 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
111 (int)-reserve);
112 __FIXADDR_TOP = -reserve - PAGE_SIZE;
113 __VMALLOC_RESERVE += reserve;
114}
115
116/* 102/*
117 * vmalloc=size forces the vmalloc area to be exactly 'size' 103 * vmalloc=size forces the vmalloc area to be exactly 'size'
118 * bytes. This can be used to increase (or decrease) the 104 * bytes. This can be used to increase (or decrease) the
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 09737c8af074..574c8bc95ef0 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -20,7 +20,8 @@
20#include <asm/proto.h> 20#include <asm/proto.h>
21#include <asm/numa.h> 21#include <asm/numa.h>
22#include <asm/e820.h> 22#include <asm/e820.h>
23#include <asm/genapic.h> 23#include <asm/apic.h>
24#include <asm/uv/uv.h>
24 25
25int acpi_numa __initdata; 26int acpi_numa __initdata;
26 27
diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/mm/tlb.c
index f8be6f1d2e48..a654d59e4483 100644
--- a/arch/x86/kernel/tlb_64.c
+++ b/arch/x86/mm/tlb.c
@@ -1,24 +1,19 @@
1#include <linux/init.h> 1#include <linux/init.h>
2 2
3#include <linux/mm.h> 3#include <linux/mm.h>
4#include <linux/delay.h>
5#include <linux/spinlock.h> 4#include <linux/spinlock.h>
6#include <linux/smp.h> 5#include <linux/smp.h>
7#include <linux/kernel_stat.h>
8#include <linux/mc146818rtc.h>
9#include <linux/interrupt.h> 6#include <linux/interrupt.h>
7#include <linux/module.h>
10 8
11#include <asm/mtrr.h>
12#include <asm/pgalloc.h>
13#include <asm/tlbflush.h> 9#include <asm/tlbflush.h>
14#include <asm/mmu_context.h> 10#include <asm/mmu_context.h>
15#include <asm/proto.h> 11#include <asm/apic.h>
16#include <asm/apicdef.h> 12#include <asm/uv/uv.h>
17#include <asm/idle.h> 13
18#include <asm/uv/uv_hub.h> 14DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
19#include <asm/uv/uv_bau.h> 15 = { &init_mm, 0, };
20 16
21#include <mach_ipi.h>
22/* 17/*
23 * Smarter SMP flushing macros. 18 * Smarter SMP flushing macros.
24 * c/o Linus Torvalds. 19 * c/o Linus Torvalds.
@@ -33,7 +28,7 @@
33 * To avoid global state use 8 different call vectors. 28 * To avoid global state use 8 different call vectors.
34 * Each CPU uses a specific vector to trigger flushes on other 29 * Each CPU uses a specific vector to trigger flushes on other
35 * CPUs. Depending on the received vector the target CPUs look into 30 * CPUs. Depending on the received vector the target CPUs look into
36 * the right per cpu variable for the flush data. 31 * the right array slot for the flush data.
37 * 32 *
38 * With more than 8 CPUs they are hashed to the 8 available 33 * With more than 8 CPUs they are hashed to the 8 available
39 * vectors. The limited global vector space forces us to this right now. 34 * vectors. The limited global vector space forces us to this right now.
@@ -43,18 +38,18 @@
43 38
44union smp_flush_state { 39union smp_flush_state {
45 struct { 40 struct {
46 cpumask_t flush_cpumask;
47 struct mm_struct *flush_mm; 41 struct mm_struct *flush_mm;
48 unsigned long flush_va; 42 unsigned long flush_va;
49 spinlock_t tlbstate_lock; 43 spinlock_t tlbstate_lock;
44 DECLARE_BITMAP(flush_cpumask, NR_CPUS);
50 }; 45 };
51 char pad[SMP_CACHE_BYTES]; 46 char pad[CONFIG_X86_INTERNODE_CACHE_BYTES];
52} ____cacheline_aligned; 47} ____cacheline_internodealigned_in_smp;
53 48
54/* State is put into the per CPU data section, but padded 49/* State is put into the per CPU data section, but padded
55 to a full cache line because other CPUs can access it and we don't 50 to a full cache line because other CPUs can access it and we don't
56 want false sharing in the per cpu data segment. */ 51 want false sharing in the per cpu data segment. */
57static DEFINE_PER_CPU(union smp_flush_state, flush_state); 52static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
58 53
59/* 54/*
60 * We cannot call mmdrop() because we are in interrupt context, 55 * We cannot call mmdrop() because we are in interrupt context,
@@ -62,9 +57,9 @@ static DEFINE_PER_CPU(union smp_flush_state, flush_state);
62 */ 57 */
63void leave_mm(int cpu) 58void leave_mm(int cpu)
64{ 59{
65 if (read_pda(mmu_state) == TLBSTATE_OK) 60 if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
66 BUG(); 61 BUG();
67 cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask); 62 cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask);
68 load_cr3(swapper_pg_dir); 63 load_cr3(swapper_pg_dir);
69} 64}
70EXPORT_SYMBOL_GPL(leave_mm); 65EXPORT_SYMBOL_GPL(leave_mm);
@@ -117,10 +112,20 @@ EXPORT_SYMBOL_GPL(leave_mm);
117 * Interrupts are disabled. 112 * Interrupts are disabled.
118 */ 113 */
119 114
120asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs) 115/*
116 * FIXME: use of asmlinkage is not consistent. On x86_64 it's noop
117 * but still used for documentation purpose but the usage is slightly
118 * inconsistent. On x86_32, asmlinkage is regparm(0) but interrupt
119 * entry calls in with the first parameter in %eax. Maybe define
120 * intrlinkage?
121 */
122#ifdef CONFIG_X86_64
123asmlinkage
124#endif
125void smp_invalidate_interrupt(struct pt_regs *regs)
121{ 126{
122 int cpu; 127 unsigned int cpu;
123 int sender; 128 unsigned int sender;
124 union smp_flush_state *f; 129 union smp_flush_state *f;
125 130
126 cpu = smp_processor_id(); 131 cpu = smp_processor_id();
@@ -129,9 +134,9 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
129 * Use that to determine where the sender put the data. 134 * Use that to determine where the sender put the data.
130 */ 135 */
131 sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START; 136 sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
132 f = &per_cpu(flush_state, sender); 137 f = &flush_state[sender];
133 138
134 if (!cpu_isset(cpu, f->flush_cpumask)) 139 if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask)))
135 goto out; 140 goto out;
136 /* 141 /*
137 * This was a BUG() but until someone can quote me the 142 * This was a BUG() but until someone can quote me the
@@ -142,8 +147,8 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
142 * BUG(); 147 * BUG();
143 */ 148 */
144 149
145 if (f->flush_mm == read_pda(active_mm)) { 150 if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
146 if (read_pda(mmu_state) == TLBSTATE_OK) { 151 if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
147 if (f->flush_va == TLB_FLUSH_ALL) 152 if (f->flush_va == TLB_FLUSH_ALL)
148 local_flush_tlb(); 153 local_flush_tlb();
149 else 154 else
@@ -153,23 +158,21 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
153 } 158 }
154out: 159out:
155 ack_APIC_irq(); 160 ack_APIC_irq();
156 cpu_clear(cpu, f->flush_cpumask); 161 smp_mb__before_clear_bit();
162 cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask));
163 smp_mb__after_clear_bit();
157 inc_irq_stat(irq_tlb_count); 164 inc_irq_stat(irq_tlb_count);
158} 165}
159 166
160void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, 167static void flush_tlb_others_ipi(const struct cpumask *cpumask,
161 unsigned long va) 168 struct mm_struct *mm, unsigned long va)
162{ 169{
163 int sender; 170 unsigned int sender;
164 union smp_flush_state *f; 171 union smp_flush_state *f;
165 cpumask_t cpumask = *cpumaskp;
166
167 if (is_uv_system() && uv_flush_tlb_others(&cpumask, mm, va))
168 return;
169 172
170 /* Caller has disabled preemption */ 173 /* Caller has disabled preemption */
171 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; 174 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
172 f = &per_cpu(flush_state, sender); 175 f = &flush_state[sender];
173 176
174 /* 177 /*
175 * Could avoid this lock when 178 * Could avoid this lock when
@@ -180,7 +183,8 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
180 183
181 f->flush_mm = mm; 184 f->flush_mm = mm;
182 f->flush_va = va; 185 f->flush_va = va;
183 cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask); 186 cpumask_andnot(to_cpumask(f->flush_cpumask),
187 cpumask, cpumask_of(smp_processor_id()));
184 188
185 /* 189 /*
186 * Make the above memory operations globally visible before 190 * Make the above memory operations globally visible before
@@ -191,9 +195,10 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
191 * We have to send the IPI only to 195 * We have to send the IPI only to
192 * CPUs affected. 196 * CPUs affected.
193 */ 197 */
194 send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR_START + sender); 198 apic->send_IPI_mask(to_cpumask(f->flush_cpumask),
199 INVALIDATE_TLB_VECTOR_START + sender);
195 200
196 while (!cpus_empty(f->flush_cpumask)) 201 while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
197 cpu_relax(); 202 cpu_relax();
198 203
199 f->flush_mm = NULL; 204 f->flush_mm = NULL;
@@ -201,12 +206,28 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
201 spin_unlock(&f->tlbstate_lock); 206 spin_unlock(&f->tlbstate_lock);
202} 207}
203 208
209void native_flush_tlb_others(const struct cpumask *cpumask,
210 struct mm_struct *mm, unsigned long va)
211{
212 if (is_uv_system()) {
213 unsigned int cpu;
214
215 cpu = get_cpu();
216 cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu);
217 if (cpumask)
218 flush_tlb_others_ipi(cpumask, mm, va);
219 put_cpu();
220 return;
221 }
222 flush_tlb_others_ipi(cpumask, mm, va);
223}
224
204static int __cpuinit init_smp_flush(void) 225static int __cpuinit init_smp_flush(void)
205{ 226{
206 int i; 227 int i;
207 228
208 for_each_possible_cpu(i) 229 for (i = 0; i < ARRAY_SIZE(flush_state); i++)
209 spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock); 230 spin_lock_init(&flush_state[i].tlbstate_lock);
210 231
211 return 0; 232 return 0;
212} 233}
@@ -215,25 +236,18 @@ core_initcall(init_smp_flush);
215void flush_tlb_current_task(void) 236void flush_tlb_current_task(void)
216{ 237{
217 struct mm_struct *mm = current->mm; 238 struct mm_struct *mm = current->mm;
218 cpumask_t cpu_mask;
219 239
220 preempt_disable(); 240 preempt_disable();
221 cpu_mask = mm->cpu_vm_mask;
222 cpu_clear(smp_processor_id(), cpu_mask);
223 241
224 local_flush_tlb(); 242 local_flush_tlb();
225 if (!cpus_empty(cpu_mask)) 243 if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
226 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); 244 flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
227 preempt_enable(); 245 preempt_enable();
228} 246}
229 247
230void flush_tlb_mm(struct mm_struct *mm) 248void flush_tlb_mm(struct mm_struct *mm)
231{ 249{
232 cpumask_t cpu_mask;
233
234 preempt_disable(); 250 preempt_disable();
235 cpu_mask = mm->cpu_vm_mask;
236 cpu_clear(smp_processor_id(), cpu_mask);
237 251
238 if (current->active_mm == mm) { 252 if (current->active_mm == mm) {
239 if (current->mm) 253 if (current->mm)
@@ -241,8 +255,8 @@ void flush_tlb_mm(struct mm_struct *mm)
241 else 255 else
242 leave_mm(smp_processor_id()); 256 leave_mm(smp_processor_id());
243 } 257 }
244 if (!cpus_empty(cpu_mask)) 258 if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
245 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); 259 flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
246 260
247 preempt_enable(); 261 preempt_enable();
248} 262}
@@ -250,11 +264,8 @@ void flush_tlb_mm(struct mm_struct *mm)
250void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) 264void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
251{ 265{
252 struct mm_struct *mm = vma->vm_mm; 266 struct mm_struct *mm = vma->vm_mm;
253 cpumask_t cpu_mask;
254 267
255 preempt_disable(); 268 preempt_disable();
256 cpu_mask = mm->cpu_vm_mask;
257 cpu_clear(smp_processor_id(), cpu_mask);
258 269
259 if (current->active_mm == mm) { 270 if (current->active_mm == mm) {
260 if (current->mm) 271 if (current->mm)
@@ -263,8 +274,8 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
263 leave_mm(smp_processor_id()); 274 leave_mm(smp_processor_id());
264 } 275 }
265 276
266 if (!cpus_empty(cpu_mask)) 277 if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
267 flush_tlb_others(cpu_mask, mm, va); 278 flush_tlb_others(&mm->cpu_vm_mask, mm, va);
268 279
269 preempt_enable(); 280 preempt_enable();
270} 281}
@@ -274,7 +285,7 @@ static void do_flush_tlb_all(void *info)
274 unsigned long cpu = smp_processor_id(); 285 unsigned long cpu = smp_processor_id();
275 286
276 __flush_tlb_all(); 287 __flush_tlb_all();
277 if (read_pda(mmu_state) == TLBSTATE_LAZY) 288 if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
278 leave_mm(cpu); 289 leave_mm(cpu);
279} 290}
280 291
diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c
index 2089354968a2..8eb295e116f6 100644
--- a/arch/x86/pci/numaq_32.c
+++ b/arch/x86/pci/numaq_32.c
@@ -5,7 +5,7 @@
5#include <linux/pci.h> 5#include <linux/pci.h>
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/nodemask.h> 7#include <linux/nodemask.h>
8#include <mach_apic.h> 8#include <asm/apic.h>
9#include <asm/mpspec.h> 9#include <asm/mpspec.h>
10#include <asm/pci_x86.h> 10#include <asm/pci_x86.h>
11 11
@@ -18,10 +18,6 @@
18 18
19#define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local]) 19#define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local])
20 20
21/* Where the IO area was mapped on multiquad, always 0 otherwise */
22void *xquad_portio;
23EXPORT_SYMBOL(xquad_portio);
24
25#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port) 21#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
26 22
27#define PCI_CONF1_MQ_ADDRESS(bus, devfn, reg) \ 23#define PCI_CONF1_MQ_ADDRESS(bus, devfn, reg) \
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index b82cae970dfd..1c975cc9839e 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -7,7 +7,7 @@
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/uaccess.h> 8#include <linux/uaccess.h>
9#include <asm/pci_x86.h> 9#include <asm/pci_x86.h>
10#include <asm/mach-default/pci-functions.h> 10#include <asm/pci-functions.h>
11 11
12/* BIOS32 signature: "_32_" */ 12/* BIOS32 signature: "_32_" */
13#define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24)) 13#define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24))
diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S
index d1e9b53f9d33..b641388d8286 100644
--- a/arch/x86/power/hibernate_asm_32.S
+++ b/arch/x86/power/hibernate_asm_32.S
@@ -8,7 +8,7 @@
8 8
9#include <linux/linkage.h> 9#include <linux/linkage.h>
10#include <asm/segment.h> 10#include <asm/segment.h>
11#include <asm/page.h> 11#include <asm/page_types.h>
12#include <asm/asm-offsets.h> 12#include <asm/asm-offsets.h>
13#include <asm/processor-flags.h> 13#include <asm/processor-flags.h>
14 14
diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S
index 000415947d93..9356547d8c01 100644
--- a/arch/x86/power/hibernate_asm_64.S
+++ b/arch/x86/power/hibernate_asm_64.S
@@ -18,7 +18,7 @@
18 .text 18 .text
19#include <linux/linkage.h> 19#include <linux/linkage.h>
20#include <asm/segment.h> 20#include <asm/segment.h>
21#include <asm/page.h> 21#include <asm/page_types.h>
22#include <asm/asm-offsets.h> 22#include <asm/asm-offsets.h>
23#include <asm/processor-flags.h> 23#include <asm/processor-flags.h>
24 24
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 4d6ef0a336d6..16a9020c8f11 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -38,7 +38,7 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE
38 $(call if_changed,objcopy) 38 $(call if_changed,objcopy)
39 39
40CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ 40CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
41 $(filter -g%,$(KBUILD_CFLAGS)) 41 $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector)
42 42
43$(vobjs): KBUILD_CFLAGS += $(CFL) 43$(vobjs): KBUILD_CFLAGS += $(CFL)
44 44
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 9c98cc6ba978..7133cdf9098b 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -85,8 +85,8 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
85 unsigned long addr, end; 85 unsigned long addr, end;
86 unsigned offset; 86 unsigned offset;
87 end = (start + PMD_SIZE - 1) & PMD_MASK; 87 end = (start + PMD_SIZE - 1) & PMD_MASK;
88 if (end >= TASK_SIZE64) 88 if (end >= TASK_SIZE_MAX)
89 end = TASK_SIZE64; 89 end = TASK_SIZE_MAX;
90 end -= len; 90 end -= len;
91 /* This loses some more bits than a modulo, but is cheaper */ 91 /* This loses some more bits than a modulo, but is cheaper */
92 offset = get_random_int() & (PTRS_PER_PTE - 1); 92 offset = get_random_int() & (PTRS_PER_PTE - 1);
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 87b9ab166423..b83e119fbeb0 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -6,7 +6,7 @@ config XEN
6 bool "Xen guest support" 6 bool "Xen guest support"
7 select PARAVIRT 7 select PARAVIRT
8 select PARAVIRT_CLOCK 8 select PARAVIRT_CLOCK
9 depends on X86_64 || (X86_32 && X86_PAE && !(X86_VISWS || X86_VOYAGER)) 9 depends on X86_64 || (X86_32 && X86_PAE && !X86_VISWS)
10 depends on X86_CMPXCHG && X86_TSC 10 depends on X86_CMPXCHG && X86_TSC
11 help 11 help
12 This is the Linux Xen port. Enabling this will allow the 12 This is the Linux Xen port. Enabling this will allow the
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 6dcefba7836f..3b767d03fd6a 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -6,7 +6,8 @@ CFLAGS_REMOVE_irq.o = -pg
6endif 6endif
7 7
8obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ 8obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
9 time.o xen-asm_$(BITS).o grant-table.o suspend.o 9 time.o xen-asm.o xen-asm_$(BITS).o \
10 grant-table.o suspend.o
10 11
11obj-$(CONFIG_SMP) += smp.o spinlock.o 12obj-$(CONFIG_SMP) += smp.o spinlock.o
12obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o \ No newline at end of file 13obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o \ No newline at end of file
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index b58e96338149..82cd39a6cbd3 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -61,40 +61,13 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
61enum xen_domain_type xen_domain_type = XEN_NATIVE; 61enum xen_domain_type xen_domain_type = XEN_NATIVE;
62EXPORT_SYMBOL_GPL(xen_domain_type); 62EXPORT_SYMBOL_GPL(xen_domain_type);
63 63
64/*
65 * Identity map, in addition to plain kernel map. This needs to be
66 * large enough to allocate page table pages to allocate the rest.
67 * Each page can map 2MB.
68 */
69static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
70
71#ifdef CONFIG_X86_64
72/* l3 pud for userspace vsyscall mapping */
73static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
74#endif /* CONFIG_X86_64 */
75
76/*
77 * Note about cr3 (pagetable base) values:
78 *
79 * xen_cr3 contains the current logical cr3 value; it contains the
80 * last set cr3. This may not be the current effective cr3, because
81 * its update may be being lazily deferred. However, a vcpu looking
82 * at its own cr3 can use this value knowing that it everything will
83 * be self-consistent.
84 *
85 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
86 * hypercall to set the vcpu cr3 is complete (so it may be a little
87 * out of date, but it will never be set early). If one vcpu is
88 * looking at another vcpu's cr3 value, it should use this variable.
89 */
90DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
91DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
92
93struct start_info *xen_start_info; 64struct start_info *xen_start_info;
94EXPORT_SYMBOL_GPL(xen_start_info); 65EXPORT_SYMBOL_GPL(xen_start_info);
95 66
96struct shared_info xen_dummy_shared_info; 67struct shared_info xen_dummy_shared_info;
97 68
69void *xen_initial_gdt;
70
98/* 71/*
99 * Point at some empty memory to start with. We map the real shared_info 72 * Point at some empty memory to start with. We map the real shared_info
100 * page as soon as fixmap is up and running. 73 * page as soon as fixmap is up and running.
@@ -114,14 +87,7 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
114 * 87 *
115 * 0: not available, 1: available 88 * 0: not available, 1: available
116 */ 89 */
117static int have_vcpu_info_placement = 90static int have_vcpu_info_placement = 1;
118#ifdef CONFIG_X86_32
119 1
120#else
121 0
122#endif
123 ;
124
125 91
126static void xen_vcpu_setup(int cpu) 92static void xen_vcpu_setup(int cpu)
127{ 93{
@@ -137,7 +103,7 @@ static void xen_vcpu_setup(int cpu)
137 103
138 vcpup = &per_cpu(xen_vcpu_info, cpu); 104 vcpup = &per_cpu(xen_vcpu_info, cpu);
139 105
140 info.mfn = virt_to_mfn(vcpup); 106 info.mfn = arbitrary_virt_to_mfn(vcpup);
141 info.offset = offset_in_page(vcpup); 107 info.offset = offset_in_page(vcpup);
142 108
143 printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n", 109 printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
@@ -237,7 +203,7 @@ static unsigned long xen_get_debugreg(int reg)
237 return HYPERVISOR_get_debugreg(reg); 203 return HYPERVISOR_get_debugreg(reg);
238} 204}
239 205
240static void xen_leave_lazy(void) 206void xen_leave_lazy(void)
241{ 207{
242 paravirt_leave_lazy(paravirt_get_lazy_mode()); 208 paravirt_leave_lazy(paravirt_get_lazy_mode());
243 xen_mc_flush(); 209 xen_mc_flush();
@@ -335,8 +301,10 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
335 frames = mcs.args; 301 frames = mcs.args;
336 302
337 for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { 303 for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
338 frames[f] = virt_to_mfn(va); 304 frames[f] = arbitrary_virt_to_mfn((void *)va);
305
339 make_lowmem_page_readonly((void *)va); 306 make_lowmem_page_readonly((void *)va);
307 make_lowmem_page_readonly(mfn_to_virt(frames[f]));
340 } 308 }
341 309
342 MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct)); 310 MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
@@ -348,7 +316,7 @@ static void load_TLS_descriptor(struct thread_struct *t,
348 unsigned int cpu, unsigned int i) 316 unsigned int cpu, unsigned int i)
349{ 317{
350 struct desc_struct *gdt = get_cpu_gdt_table(cpu); 318 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
351 xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); 319 xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
352 struct multicall_space mc = __xen_mc_entry(0); 320 struct multicall_space mc = __xen_mc_entry(0);
353 321
354 MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); 322 MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
@@ -357,13 +325,14 @@ static void load_TLS_descriptor(struct thread_struct *t,
357static void xen_load_tls(struct thread_struct *t, unsigned int cpu) 325static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
358{ 326{
359 /* 327 /*
360 * XXX sleazy hack: If we're being called in a lazy-cpu zone, 328 * XXX sleazy hack: If we're being called in a lazy-cpu zone
361 * it means we're in a context switch, and %gs has just been 329 * and lazy gs handling is enabled, it means we're in a
362 * saved. This means we can zero it out to prevent faults on 330 * context switch, and %gs has just been saved. This means we
363 * exit from the hypervisor if the next process has no %gs. 331 * can zero it out to prevent faults on exit from the
364 * Either way, it has been saved, and the new value will get 332 * hypervisor if the next process has no %gs. Either way, it
365 * loaded properly. This will go away as soon as Xen has been 333 * has been saved, and the new value will get loaded properly.
366 * modified to not save/restore %gs for normal hypercalls. 334 * This will go away as soon as Xen has been modified to not
335 * save/restore %gs for normal hypercalls.
367 * 336 *
368 * On x86_64, this hack is not used for %gs, because gs points 337 * On x86_64, this hack is not used for %gs, because gs points
369 * to KERNEL_GS_BASE (and uses it for PDA references), so we 338 * to KERNEL_GS_BASE (and uses it for PDA references), so we
@@ -375,7 +344,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
375 */ 344 */
376 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) { 345 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
377#ifdef CONFIG_X86_32 346#ifdef CONFIG_X86_32
378 loadsegment(gs, 0); 347 lazy_load_gs(0);
379#else 348#else
380 loadsegment(fs, 0); 349 loadsegment(fs, 0);
381#endif 350#endif
@@ -521,7 +490,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
521 break; 490 break;
522 491
523 default: { 492 default: {
524 xmaddr_t maddr = virt_to_machine(&dt[entry]); 493 xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]);
525 494
526 xen_mc_flush(); 495 xen_mc_flush();
527 if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc)) 496 if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
@@ -587,94 +556,18 @@ static u32 xen_safe_apic_wait_icr_idle(void)
587 return 0; 556 return 0;
588} 557}
589 558
590static struct apic_ops xen_basic_apic_ops = { 559static void set_xen_basic_apic_ops(void)
591 .read = xen_apic_read,
592 .write = xen_apic_write,
593 .icr_read = xen_apic_icr_read,
594 .icr_write = xen_apic_icr_write,
595 .wait_icr_idle = xen_apic_wait_icr_idle,
596 .safe_wait_icr_idle = xen_safe_apic_wait_icr_idle,
597};
598
599#endif
600
601static void xen_flush_tlb(void)
602{
603 struct mmuext_op *op;
604 struct multicall_space mcs;
605
606 preempt_disable();
607
608 mcs = xen_mc_entry(sizeof(*op));
609
610 op = mcs.args;
611 op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
612 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
613
614 xen_mc_issue(PARAVIRT_LAZY_MMU);
615
616 preempt_enable();
617}
618
619static void xen_flush_tlb_single(unsigned long addr)
620{ 560{
621 struct mmuext_op *op; 561 apic->read = xen_apic_read;
622 struct multicall_space mcs; 562 apic->write = xen_apic_write;
623 563 apic->icr_read = xen_apic_icr_read;
624 preempt_disable(); 564 apic->icr_write = xen_apic_icr_write;
625 565 apic->wait_icr_idle = xen_apic_wait_icr_idle;
626 mcs = xen_mc_entry(sizeof(*op)); 566 apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle;
627 op = mcs.args;
628 op->cmd = MMUEXT_INVLPG_LOCAL;
629 op->arg1.linear_addr = addr & PAGE_MASK;
630 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
631
632 xen_mc_issue(PARAVIRT_LAZY_MMU);
633
634 preempt_enable();
635} 567}
636 568
637static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm, 569#endif
638 unsigned long va)
639{
640 struct {
641 struct mmuext_op op;
642 cpumask_t mask;
643 } *args;
644 cpumask_t cpumask = *cpus;
645 struct multicall_space mcs;
646
647 /*
648 * A couple of (to be removed) sanity checks:
649 *
650 * - current CPU must not be in mask
651 * - mask must exist :)
652 */
653 BUG_ON(cpus_empty(cpumask));
654 BUG_ON(cpu_isset(smp_processor_id(), cpumask));
655 BUG_ON(!mm);
656
657 /* If a CPU which we ran on has gone down, OK. */
658 cpus_and(cpumask, cpumask, cpu_online_map);
659 if (cpus_empty(cpumask))
660 return;
661
662 mcs = xen_mc_entry(sizeof(*args));
663 args = mcs.args;
664 args->mask = cpumask;
665 args->op.arg2.vcpumask = &args->mask;
666
667 if (va == TLB_FLUSH_ALL) {
668 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
669 } else {
670 args->op.cmd = MMUEXT_INVLPG_MULTI;
671 args->op.arg1.linear_addr = va;
672 }
673
674 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
675 570
676 xen_mc_issue(PARAVIRT_LAZY_MMU);
677}
678 571
679static void xen_clts(void) 572static void xen_clts(void)
680{ 573{
@@ -700,21 +593,6 @@ static void xen_write_cr0(unsigned long cr0)
700 xen_mc_issue(PARAVIRT_LAZY_CPU); 593 xen_mc_issue(PARAVIRT_LAZY_CPU);
701} 594}
702 595
703static void xen_write_cr2(unsigned long cr2)
704{
705 x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
706}
707
708static unsigned long xen_read_cr2(void)
709{
710 return x86_read_percpu(xen_vcpu)->arch.cr2;
711}
712
713static unsigned long xen_read_cr2_direct(void)
714{
715 return x86_read_percpu(xen_vcpu_info.arch.cr2);
716}
717
718static void xen_write_cr4(unsigned long cr4) 596static void xen_write_cr4(unsigned long cr4)
719{ 597{
720 cr4 &= ~X86_CR4_PGE; 598 cr4 &= ~X86_CR4_PGE;
@@ -723,71 +601,6 @@ static void xen_write_cr4(unsigned long cr4)
723 native_write_cr4(cr4); 601 native_write_cr4(cr4);
724} 602}
725 603
726static unsigned long xen_read_cr3(void)
727{
728 return x86_read_percpu(xen_cr3);
729}
730
731static void set_current_cr3(void *v)
732{
733 x86_write_percpu(xen_current_cr3, (unsigned long)v);
734}
735
736static void __xen_write_cr3(bool kernel, unsigned long cr3)
737{
738 struct mmuext_op *op;
739 struct multicall_space mcs;
740 unsigned long mfn;
741
742 if (cr3)
743 mfn = pfn_to_mfn(PFN_DOWN(cr3));
744 else
745 mfn = 0;
746
747 WARN_ON(mfn == 0 && kernel);
748
749 mcs = __xen_mc_entry(sizeof(*op));
750
751 op = mcs.args;
752 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
753 op->arg1.mfn = mfn;
754
755 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
756
757 if (kernel) {
758 x86_write_percpu(xen_cr3, cr3);
759
760 /* Update xen_current_cr3 once the batch has actually
761 been submitted. */
762 xen_mc_callback(set_current_cr3, (void *)cr3);
763 }
764}
765
766static void xen_write_cr3(unsigned long cr3)
767{
768 BUG_ON(preemptible());
769
770 xen_mc_batch(); /* disables interrupts */
771
772 /* Update while interrupts are disabled, so its atomic with
773 respect to ipis */
774 x86_write_percpu(xen_cr3, cr3);
775
776 __xen_write_cr3(true, cr3);
777
778#ifdef CONFIG_X86_64
779 {
780 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
781 if (user_pgd)
782 __xen_write_cr3(false, __pa(user_pgd));
783 else
784 __xen_write_cr3(false, 0);
785 }
786#endif
787
788 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
789}
790
791static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) 604static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
792{ 605{
793 int ret; 606 int ret;
@@ -829,185 +642,6 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
829 return ret; 642 return ret;
830} 643}
831 644
832/* Early in boot, while setting up the initial pagetable, assume
833 everything is pinned. */
834static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
835{
836#ifdef CONFIG_FLATMEM
837 BUG_ON(mem_map); /* should only be used early */
838#endif
839 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
840}
841
842/* Early release_pte assumes that all pts are pinned, since there's
843 only init_mm and anything attached to that is pinned. */
844static void xen_release_pte_init(unsigned long pfn)
845{
846 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
847}
848
849static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
850{
851 struct mmuext_op op;
852 op.cmd = cmd;
853 op.arg1.mfn = pfn_to_mfn(pfn);
854 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
855 BUG();
856}
857
858/* This needs to make sure the new pte page is pinned iff its being
859 attached to a pinned pagetable. */
860static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
861{
862 struct page *page = pfn_to_page(pfn);
863
864 if (PagePinned(virt_to_page(mm->pgd))) {
865 SetPagePinned(page);
866
867 vm_unmap_aliases();
868 if (!PageHighMem(page)) {
869 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
870 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
871 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
872 } else {
873 /* make sure there are no stray mappings of
874 this page */
875 kmap_flush_unused();
876 }
877 }
878}
879
880static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
881{
882 xen_alloc_ptpage(mm, pfn, PT_PTE);
883}
884
885static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
886{
887 xen_alloc_ptpage(mm, pfn, PT_PMD);
888}
889
890static int xen_pgd_alloc(struct mm_struct *mm)
891{
892 pgd_t *pgd = mm->pgd;
893 int ret = 0;
894
895 BUG_ON(PagePinned(virt_to_page(pgd)));
896
897#ifdef CONFIG_X86_64
898 {
899 struct page *page = virt_to_page(pgd);
900 pgd_t *user_pgd;
901
902 BUG_ON(page->private != 0);
903
904 ret = -ENOMEM;
905
906 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
907 page->private = (unsigned long)user_pgd;
908
909 if (user_pgd != NULL) {
910 user_pgd[pgd_index(VSYSCALL_START)] =
911 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
912 ret = 0;
913 }
914
915 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
916 }
917#endif
918
919 return ret;
920}
921
922static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
923{
924#ifdef CONFIG_X86_64
925 pgd_t *user_pgd = xen_get_user_pgd(pgd);
926
927 if (user_pgd)
928 free_page((unsigned long)user_pgd);
929#endif
930}
931
932/* This should never happen until we're OK to use struct page */
933static void xen_release_ptpage(unsigned long pfn, unsigned level)
934{
935 struct page *page = pfn_to_page(pfn);
936
937 if (PagePinned(page)) {
938 if (!PageHighMem(page)) {
939 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
940 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
941 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
942 }
943 ClearPagePinned(page);
944 }
945}
946
947static void xen_release_pte(unsigned long pfn)
948{
949 xen_release_ptpage(pfn, PT_PTE);
950}
951
952static void xen_release_pmd(unsigned long pfn)
953{
954 xen_release_ptpage(pfn, PT_PMD);
955}
956
957#if PAGETABLE_LEVELS == 4
958static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
959{
960 xen_alloc_ptpage(mm, pfn, PT_PUD);
961}
962
963static void xen_release_pud(unsigned long pfn)
964{
965 xen_release_ptpage(pfn, PT_PUD);
966}
967#endif
968
969#ifdef CONFIG_HIGHPTE
970static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
971{
972 pgprot_t prot = PAGE_KERNEL;
973
974 if (PagePinned(page))
975 prot = PAGE_KERNEL_RO;
976
977 if (0 && PageHighMem(page))
978 printk("mapping highpte %lx type %d prot %s\n",
979 page_to_pfn(page), type,
980 (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
981
982 return kmap_atomic_prot(page, type, prot);
983}
984#endif
985
986#ifdef CONFIG_X86_32
987static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
988{
989 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
990 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
991 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
992 pte_val_ma(pte));
993
994 return pte;
995}
996
997/* Init-time set_pte while constructing initial pagetables, which
998 doesn't allow RO pagetable pages to be remapped RW */
999static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
1000{
1001 pte = mask_rw_pte(ptep, pte);
1002
1003 xen_set_pte(ptep, pte);
1004}
1005#endif
1006
1007static __init void xen_pagetable_setup_start(pgd_t *base)
1008{
1009}
1010
1011void xen_setup_shared_info(void) 645void xen_setup_shared_info(void)
1012{ 646{
1013 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 647 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
@@ -1028,37 +662,6 @@ void xen_setup_shared_info(void)
1028 xen_setup_mfn_list_list(); 662 xen_setup_mfn_list_list();
1029} 663}
1030 664
1031static __init void xen_pagetable_setup_done(pgd_t *base)
1032{
1033 xen_setup_shared_info();
1034}
1035
1036static __init void xen_post_allocator_init(void)
1037{
1038 pv_mmu_ops.set_pte = xen_set_pte;
1039 pv_mmu_ops.set_pmd = xen_set_pmd;
1040 pv_mmu_ops.set_pud = xen_set_pud;
1041#if PAGETABLE_LEVELS == 4
1042 pv_mmu_ops.set_pgd = xen_set_pgd;
1043#endif
1044
1045 /* This will work as long as patching hasn't happened yet
1046 (which it hasn't) */
1047 pv_mmu_ops.alloc_pte = xen_alloc_pte;
1048 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1049 pv_mmu_ops.release_pte = xen_release_pte;
1050 pv_mmu_ops.release_pmd = xen_release_pmd;
1051#if PAGETABLE_LEVELS == 4
1052 pv_mmu_ops.alloc_pud = xen_alloc_pud;
1053 pv_mmu_ops.release_pud = xen_release_pud;
1054#endif
1055
1056#ifdef CONFIG_X86_64
1057 SetPagePinned(virt_to_page(level3_user_vsyscall));
1058#endif
1059 xen_mark_init_mm_pinned();
1060}
1061
1062/* This is called once we have the cpu_possible_map */ 665/* This is called once we have the cpu_possible_map */
1063void xen_setup_vcpu_info_placement(void) 666void xen_setup_vcpu_info_placement(void)
1064{ 667{
@@ -1072,10 +675,10 @@ void xen_setup_vcpu_info_placement(void)
1072 if (have_vcpu_info_placement) { 675 if (have_vcpu_info_placement) {
1073 printk(KERN_INFO "Xen: using vcpu_info placement\n"); 676 printk(KERN_INFO "Xen: using vcpu_info placement\n");
1074 677
1075 pv_irq_ops.save_fl = xen_save_fl_direct; 678 pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
1076 pv_irq_ops.restore_fl = xen_restore_fl_direct; 679 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
1077 pv_irq_ops.irq_disable = xen_irq_disable_direct; 680 pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
1078 pv_irq_ops.irq_enable = xen_irq_enable_direct; 681 pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
1079 pv_mmu_ops.read_cr2 = xen_read_cr2_direct; 682 pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
1080 } 683 }
1081} 684}
@@ -1133,49 +736,6 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
1133 return ret; 736 return ret;
1134} 737}
1135 738
1136static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1137{
1138 pte_t pte;
1139
1140 phys >>= PAGE_SHIFT;
1141
1142 switch (idx) {
1143 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1144#ifdef CONFIG_X86_F00F_BUG
1145 case FIX_F00F_IDT:
1146#endif
1147#ifdef CONFIG_X86_32
1148 case FIX_WP_TEST:
1149 case FIX_VDSO:
1150# ifdef CONFIG_HIGHMEM
1151 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1152# endif
1153#else
1154 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1155#endif
1156#ifdef CONFIG_X86_LOCAL_APIC
1157 case FIX_APIC_BASE: /* maps dummy local APIC */
1158#endif
1159 pte = pfn_pte(phys, prot);
1160 break;
1161
1162 default:
1163 pte = mfn_pte(phys, prot);
1164 break;
1165 }
1166
1167 __native_set_fixmap(idx, pte);
1168
1169#ifdef CONFIG_X86_64
1170 /* Replicate changes to map the vsyscall page into the user
1171 pagetable vsyscall mapping. */
1172 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1173 unsigned long vaddr = __fix_to_virt(idx);
1174 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1175 }
1176#endif
1177}
1178
1179static const struct pv_info xen_info __initdata = { 739static const struct pv_info xen_info __initdata = {
1180 .paravirt_enabled = 1, 740 .paravirt_enabled = 1,
1181 .shared_kernel_pmd = 0, 741 .shared_kernel_pmd = 0,
@@ -1271,87 +831,6 @@ static const struct pv_apic_ops xen_apic_ops __initdata = {
1271#endif 831#endif
1272}; 832};
1273 833
1274static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1275 .pagetable_setup_start = xen_pagetable_setup_start,
1276 .pagetable_setup_done = xen_pagetable_setup_done,
1277
1278 .read_cr2 = xen_read_cr2,
1279 .write_cr2 = xen_write_cr2,
1280
1281 .read_cr3 = xen_read_cr3,
1282 .write_cr3 = xen_write_cr3,
1283
1284 .flush_tlb_user = xen_flush_tlb,
1285 .flush_tlb_kernel = xen_flush_tlb,
1286 .flush_tlb_single = xen_flush_tlb_single,
1287 .flush_tlb_others = xen_flush_tlb_others,
1288
1289 .pte_update = paravirt_nop,
1290 .pte_update_defer = paravirt_nop,
1291
1292 .pgd_alloc = xen_pgd_alloc,
1293 .pgd_free = xen_pgd_free,
1294
1295 .alloc_pte = xen_alloc_pte_init,
1296 .release_pte = xen_release_pte_init,
1297 .alloc_pmd = xen_alloc_pte_init,
1298 .alloc_pmd_clone = paravirt_nop,
1299 .release_pmd = xen_release_pte_init,
1300
1301#ifdef CONFIG_HIGHPTE
1302 .kmap_atomic_pte = xen_kmap_atomic_pte,
1303#endif
1304
1305#ifdef CONFIG_X86_64
1306 .set_pte = xen_set_pte,
1307#else
1308 .set_pte = xen_set_pte_init,
1309#endif
1310 .set_pte_at = xen_set_pte_at,
1311 .set_pmd = xen_set_pmd_hyper,
1312
1313 .ptep_modify_prot_start = __ptep_modify_prot_start,
1314 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
1315
1316 .pte_val = xen_pte_val,
1317 .pte_flags = native_pte_flags,
1318 .pgd_val = xen_pgd_val,
1319
1320 .make_pte = xen_make_pte,
1321 .make_pgd = xen_make_pgd,
1322
1323#ifdef CONFIG_X86_PAE
1324 .set_pte_atomic = xen_set_pte_atomic,
1325 .set_pte_present = xen_set_pte_at,
1326 .pte_clear = xen_pte_clear,
1327 .pmd_clear = xen_pmd_clear,
1328#endif /* CONFIG_X86_PAE */
1329 .set_pud = xen_set_pud_hyper,
1330
1331 .make_pmd = xen_make_pmd,
1332 .pmd_val = xen_pmd_val,
1333
1334#if PAGETABLE_LEVELS == 4
1335 .pud_val = xen_pud_val,
1336 .make_pud = xen_make_pud,
1337 .set_pgd = xen_set_pgd_hyper,
1338
1339 .alloc_pud = xen_alloc_pte_init,
1340 .release_pud = xen_release_pte_init,
1341#endif /* PAGETABLE_LEVELS == 4 */
1342
1343 .activate_mm = xen_activate_mm,
1344 .dup_mmap = xen_dup_mmap,
1345 .exit_mmap = xen_exit_mmap,
1346
1347 .lazy_mode = {
1348 .enter = paravirt_enter_lazy_mmu,
1349 .leave = xen_leave_lazy,
1350 },
1351
1352 .set_fixmap = xen_set_fixmap,
1353};
1354
1355static void xen_reboot(int reason) 834static void xen_reboot(int reason)
1356{ 835{
1357 struct sched_shutdown r = { .reason = reason }; 836 struct sched_shutdown r = { .reason = reason };
@@ -1394,223 +873,6 @@ static const struct machine_ops __initdata xen_machine_ops = {
1394}; 873};
1395 874
1396 875
1397static void __init xen_reserve_top(void)
1398{
1399#ifdef CONFIG_X86_32
1400 unsigned long top = HYPERVISOR_VIRT_START;
1401 struct xen_platform_parameters pp;
1402
1403 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1404 top = pp.virt_start;
1405
1406 reserve_top_address(-top);
1407#endif /* CONFIG_X86_32 */
1408}
1409
1410/*
1411 * Like __va(), but returns address in the kernel mapping (which is
1412 * all we have until the physical memory mapping has been set up.
1413 */
1414static void *__ka(phys_addr_t paddr)
1415{
1416#ifdef CONFIG_X86_64
1417 return (void *)(paddr + __START_KERNEL_map);
1418#else
1419 return __va(paddr);
1420#endif
1421}
1422
1423/* Convert a machine address to physical address */
1424static unsigned long m2p(phys_addr_t maddr)
1425{
1426 phys_addr_t paddr;
1427
1428 maddr &= PTE_PFN_MASK;
1429 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1430
1431 return paddr;
1432}
1433
1434/* Convert a machine address to kernel virtual */
1435static void *m2v(phys_addr_t maddr)
1436{
1437 return __ka(m2p(maddr));
1438}
1439
1440static void set_page_prot(void *addr, pgprot_t prot)
1441{
1442 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1443 pte_t pte = pfn_pte(pfn, prot);
1444
1445 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1446 BUG();
1447}
1448
1449static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1450{
1451 unsigned pmdidx, pteidx;
1452 unsigned ident_pte;
1453 unsigned long pfn;
1454
1455 ident_pte = 0;
1456 pfn = 0;
1457 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1458 pte_t *pte_page;
1459
1460 /* Reuse or allocate a page of ptes */
1461 if (pmd_present(pmd[pmdidx]))
1462 pte_page = m2v(pmd[pmdidx].pmd);
1463 else {
1464 /* Check for free pte pages */
1465 if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
1466 break;
1467
1468 pte_page = &level1_ident_pgt[ident_pte];
1469 ident_pte += PTRS_PER_PTE;
1470
1471 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1472 }
1473
1474 /* Install mappings */
1475 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1476 pte_t pte;
1477
1478 if (pfn > max_pfn_mapped)
1479 max_pfn_mapped = pfn;
1480
1481 if (!pte_none(pte_page[pteidx]))
1482 continue;
1483
1484 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1485 pte_page[pteidx] = pte;
1486 }
1487 }
1488
1489 for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1490 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1491
1492 set_page_prot(pmd, PAGE_KERNEL_RO);
1493}
1494
1495#ifdef CONFIG_X86_64
1496static void convert_pfn_mfn(void *v)
1497{
1498 pte_t *pte = v;
1499 int i;
1500
1501 /* All levels are converted the same way, so just treat them
1502 as ptes. */
1503 for (i = 0; i < PTRS_PER_PTE; i++)
1504 pte[i] = xen_make_pte(pte[i].pte);
1505}
1506
1507/*
1508 * Set up the inital kernel pagetable.
1509 *
1510 * We can construct this by grafting the Xen provided pagetable into
1511 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
1512 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
1513 * means that only the kernel has a physical mapping to start with -
1514 * but that's enough to get __va working. We need to fill in the rest
1515 * of the physical mapping once some sort of allocator has been set
1516 * up.
1517 */
1518static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1519 unsigned long max_pfn)
1520{
1521 pud_t *l3;
1522 pmd_t *l2;
1523
1524 /* Zap identity mapping */
1525 init_level4_pgt[0] = __pgd(0);
1526
1527 /* Pre-constructed entries are in pfn, so convert to mfn */
1528 convert_pfn_mfn(init_level4_pgt);
1529 convert_pfn_mfn(level3_ident_pgt);
1530 convert_pfn_mfn(level3_kernel_pgt);
1531
1532 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1533 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1534
1535 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1536 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1537
1538 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1539 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1540 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1541
1542 /* Set up identity map */
1543 xen_map_identity_early(level2_ident_pgt, max_pfn);
1544
1545 /* Make pagetable pieces RO */
1546 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1547 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1548 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1549 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1550 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1551 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1552
1553 /* Pin down new L4 */
1554 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1555 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1556
1557 /* Unpin Xen-provided one */
1558 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1559
1560 /* Switch over */
1561 pgd = init_level4_pgt;
1562
1563 /*
1564 * At this stage there can be no user pgd, and no page
1565 * structure to attach it to, so make sure we just set kernel
1566 * pgd.
1567 */
1568 xen_mc_batch();
1569 __xen_write_cr3(true, __pa(pgd));
1570 xen_mc_issue(PARAVIRT_LAZY_CPU);
1571
1572 reserve_early(__pa(xen_start_info->pt_base),
1573 __pa(xen_start_info->pt_base +
1574 xen_start_info->nr_pt_frames * PAGE_SIZE),
1575 "XEN PAGETABLES");
1576
1577 return pgd;
1578}
1579#else /* !CONFIG_X86_64 */
1580static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
1581
1582static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1583 unsigned long max_pfn)
1584{
1585 pmd_t *kernel_pmd;
1586
1587 init_pg_tables_start = __pa(pgd);
1588 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
1589 max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
1590
1591 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1592 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1593
1594 xen_map_identity_early(level2_kernel_pgt, max_pfn);
1595
1596 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1597 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
1598 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
1599
1600 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1601 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1602 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1603
1604 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1605
1606 xen_write_cr3(__pa(swapper_pg_dir));
1607
1608 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1609
1610 return swapper_pg_dir;
1611}
1612#endif /* CONFIG_X86_64 */
1613
1614/* First C function to be called on Xen boot */ 876/* First C function to be called on Xen boot */
1615asmlinkage void __init xen_start_kernel(void) 877asmlinkage void __init xen_start_kernel(void)
1616{ 878{
@@ -1639,7 +901,7 @@ asmlinkage void __init xen_start_kernel(void)
1639 /* 901 /*
1640 * set up the basic apic ops. 902 * set up the basic apic ops.
1641 */ 903 */
1642 apic_ops = &xen_basic_apic_ops; 904 set_xen_basic_apic_ops();
1643#endif 905#endif
1644 906
1645 if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { 907 if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
@@ -1650,10 +912,18 @@ asmlinkage void __init xen_start_kernel(void)
1650 machine_ops = xen_machine_ops; 912 machine_ops = xen_machine_ops;
1651 913
1652#ifdef CONFIG_X86_64 914#ifdef CONFIG_X86_64
1653 /* Disable until direct per-cpu data access. */ 915 /*
1654 have_vcpu_info_placement = 0; 916 * Setup percpu state. We only need to do this for 64-bit
1655 x86_64_init_pda(); 917 * because 32-bit already has %fs set properly.
918 */
919 load_percpu_segment(0);
1656#endif 920#endif
921 /*
922 * The only reliable way to retain the initial address of the
923 * percpu gdt_page is to remember it here, so we can go and
924 * mark it RW later, when the initial percpu area is freed.
925 */
926 xen_initial_gdt = &per_cpu(gdt_page, 0);
1657 927
1658 xen_smp_init(); 928 xen_smp_init();
1659 929
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index bb042608c602..cfd17799bd6d 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -19,27 +19,12 @@ void xen_force_evtchn_callback(void)
19 (void)HYPERVISOR_xen_version(0, NULL); 19 (void)HYPERVISOR_xen_version(0, NULL);
20} 20}
21 21
22static void __init __xen_init_IRQ(void)
23{
24 int i;
25
26 /* Create identity vector->irq map */
27 for(i = 0; i < NR_VECTORS; i++) {
28 int cpu;
29
30 for_each_possible_cpu(cpu)
31 per_cpu(vector_irq, cpu)[i] = i;
32 }
33
34 xen_init_IRQ();
35}
36
37static unsigned long xen_save_fl(void) 22static unsigned long xen_save_fl(void)
38{ 23{
39 struct vcpu_info *vcpu; 24 struct vcpu_info *vcpu;
40 unsigned long flags; 25 unsigned long flags;
41 26
42 vcpu = x86_read_percpu(xen_vcpu); 27 vcpu = percpu_read(xen_vcpu);
43 28
44 /* flag has opposite sense of mask */ 29 /* flag has opposite sense of mask */
45 flags = !vcpu->evtchn_upcall_mask; 30 flags = !vcpu->evtchn_upcall_mask;
@@ -50,6 +35,7 @@ static unsigned long xen_save_fl(void)
50 */ 35 */
51 return (-flags) & X86_EFLAGS_IF; 36 return (-flags) & X86_EFLAGS_IF;
52} 37}
38PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl);
53 39
54static void xen_restore_fl(unsigned long flags) 40static void xen_restore_fl(unsigned long flags)
55{ 41{
@@ -62,7 +48,7 @@ static void xen_restore_fl(unsigned long flags)
62 make sure we're don't switch CPUs between getting the vcpu 48 make sure we're don't switch CPUs between getting the vcpu
63 pointer and updating the mask. */ 49 pointer and updating the mask. */
64 preempt_disable(); 50 preempt_disable();
65 vcpu = x86_read_percpu(xen_vcpu); 51 vcpu = percpu_read(xen_vcpu);
66 vcpu->evtchn_upcall_mask = flags; 52 vcpu->evtchn_upcall_mask = flags;
67 preempt_enable_no_resched(); 53 preempt_enable_no_resched();
68 54
@@ -76,6 +62,7 @@ static void xen_restore_fl(unsigned long flags)
76 xen_force_evtchn_callback(); 62 xen_force_evtchn_callback();
77 } 63 }
78} 64}
65PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl);
79 66
80static void xen_irq_disable(void) 67static void xen_irq_disable(void)
81{ 68{
@@ -83,9 +70,10 @@ static void xen_irq_disable(void)
83 make sure we're don't switch CPUs between getting the vcpu 70 make sure we're don't switch CPUs between getting the vcpu
84 pointer and updating the mask. */ 71 pointer and updating the mask. */
85 preempt_disable(); 72 preempt_disable();
86 x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1; 73 percpu_read(xen_vcpu)->evtchn_upcall_mask = 1;
87 preempt_enable_no_resched(); 74 preempt_enable_no_resched();
88} 75}
76PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable);
89 77
90static void xen_irq_enable(void) 78static void xen_irq_enable(void)
91{ 79{
@@ -96,7 +84,7 @@ static void xen_irq_enable(void)
96 the caller is confused and is trying to re-enable interrupts 84 the caller is confused and is trying to re-enable interrupts
97 on an indeterminate processor. */ 85 on an indeterminate processor. */
98 86
99 vcpu = x86_read_percpu(xen_vcpu); 87 vcpu = percpu_read(xen_vcpu);
100 vcpu->evtchn_upcall_mask = 0; 88 vcpu->evtchn_upcall_mask = 0;
101 89
102 /* Doesn't matter if we get preempted here, because any 90 /* Doesn't matter if we get preempted here, because any
@@ -106,6 +94,7 @@ static void xen_irq_enable(void)
106 if (unlikely(vcpu->evtchn_upcall_pending)) 94 if (unlikely(vcpu->evtchn_upcall_pending))
107 xen_force_evtchn_callback(); 95 xen_force_evtchn_callback();
108} 96}
97PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable);
109 98
110static void xen_safe_halt(void) 99static void xen_safe_halt(void)
111{ 100{
@@ -123,11 +112,13 @@ static void xen_halt(void)
123} 112}
124 113
125static const struct pv_irq_ops xen_irq_ops __initdata = { 114static const struct pv_irq_ops xen_irq_ops __initdata = {
126 .init_IRQ = __xen_init_IRQ, 115 .init_IRQ = xen_init_IRQ,
127 .save_fl = xen_save_fl, 116
128 .restore_fl = xen_restore_fl, 117 .save_fl = PV_CALLEE_SAVE(xen_save_fl),
129 .irq_disable = xen_irq_disable, 118 .restore_fl = PV_CALLEE_SAVE(xen_restore_fl),
130 .irq_enable = xen_irq_enable, 119 .irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
120 .irq_enable = PV_CALLEE_SAVE(xen_irq_enable),
121
131 .safe_halt = xen_safe_halt, 122 .safe_halt = xen_safe_halt,
132 .halt = xen_halt, 123 .halt = xen_halt,
133#ifdef CONFIG_X86_64 124#ifdef CONFIG_X86_64
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 503c240e26c7..cb6afa4ec95c 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -47,6 +47,7 @@
47#include <asm/tlbflush.h> 47#include <asm/tlbflush.h>
48#include <asm/fixmap.h> 48#include <asm/fixmap.h>
49#include <asm/mmu_context.h> 49#include <asm/mmu_context.h>
50#include <asm/setup.h>
50#include <asm/paravirt.h> 51#include <asm/paravirt.h>
51#include <asm/linkage.h> 52#include <asm/linkage.h>
52 53
@@ -55,6 +56,8 @@
55 56
56#include <xen/page.h> 57#include <xen/page.h>
57#include <xen/interface/xen.h> 58#include <xen/interface/xen.h>
59#include <xen/interface/version.h>
60#include <xen/hvc-console.h>
58 61
59#include "multicalls.h" 62#include "multicalls.h"
60#include "mmu.h" 63#include "mmu.h"
@@ -114,6 +117,37 @@ static inline void check_zero(void)
114 117
115#endif /* CONFIG_XEN_DEBUG_FS */ 118#endif /* CONFIG_XEN_DEBUG_FS */
116 119
120
121/*
122 * Identity map, in addition to plain kernel map. This needs to be
123 * large enough to allocate page table pages to allocate the rest.
124 * Each page can map 2MB.
125 */
126static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
127
128#ifdef CONFIG_X86_64
129/* l3 pud for userspace vsyscall mapping */
130static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
131#endif /* CONFIG_X86_64 */
132
133/*
134 * Note about cr3 (pagetable base) values:
135 *
136 * xen_cr3 contains the current logical cr3 value; it contains the
137 * last set cr3. This may not be the current effective cr3, because
138 * its update may be being lazily deferred. However, a vcpu looking
139 * at its own cr3 can use this value knowing that it everything will
140 * be self-consistent.
141 *
142 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
143 * hypercall to set the vcpu cr3 is complete (so it may be a little
144 * out of date, but it will never be set early). If one vcpu is
145 * looking at another vcpu's cr3 value, it should use this variable.
146 */
147DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
148DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
149
150
117/* 151/*
118 * Just beyond the highest usermode address. STACK_TOP_MAX has a 152 * Just beyond the highest usermode address. STACK_TOP_MAX has a
119 * redzone above it, so round it up to a PGD boundary. 153 * redzone above it, so round it up to a PGD boundary.
@@ -242,6 +276,13 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
242 p2m_top[topidx][idx] = mfn; 276 p2m_top[topidx][idx] = mfn;
243} 277}
244 278
279unsigned long arbitrary_virt_to_mfn(void *vaddr)
280{
281 xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
282
283 return PFN_DOWN(maddr.maddr);
284}
285
245xmaddr_t arbitrary_virt_to_machine(void *vaddr) 286xmaddr_t arbitrary_virt_to_machine(void *vaddr)
246{ 287{
247 unsigned long address = (unsigned long)vaddr; 288 unsigned long address = (unsigned long)vaddr;
@@ -458,28 +499,33 @@ pteval_t xen_pte_val(pte_t pte)
458{ 499{
459 return pte_mfn_to_pfn(pte.pte); 500 return pte_mfn_to_pfn(pte.pte);
460} 501}
502PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
461 503
462pgdval_t xen_pgd_val(pgd_t pgd) 504pgdval_t xen_pgd_val(pgd_t pgd)
463{ 505{
464 return pte_mfn_to_pfn(pgd.pgd); 506 return pte_mfn_to_pfn(pgd.pgd);
465} 507}
508PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
466 509
467pte_t xen_make_pte(pteval_t pte) 510pte_t xen_make_pte(pteval_t pte)
468{ 511{
469 pte = pte_pfn_to_mfn(pte); 512 pte = pte_pfn_to_mfn(pte);
470 return native_make_pte(pte); 513 return native_make_pte(pte);
471} 514}
515PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
472 516
473pgd_t xen_make_pgd(pgdval_t pgd) 517pgd_t xen_make_pgd(pgdval_t pgd)
474{ 518{
475 pgd = pte_pfn_to_mfn(pgd); 519 pgd = pte_pfn_to_mfn(pgd);
476 return native_make_pgd(pgd); 520 return native_make_pgd(pgd);
477} 521}
522PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
478 523
479pmdval_t xen_pmd_val(pmd_t pmd) 524pmdval_t xen_pmd_val(pmd_t pmd)
480{ 525{
481 return pte_mfn_to_pfn(pmd.pmd); 526 return pte_mfn_to_pfn(pmd.pmd);
482} 527}
528PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
483 529
484void xen_set_pud_hyper(pud_t *ptr, pud_t val) 530void xen_set_pud_hyper(pud_t *ptr, pud_t val)
485{ 531{
@@ -556,12 +602,14 @@ pmd_t xen_make_pmd(pmdval_t pmd)
556 pmd = pte_pfn_to_mfn(pmd); 602 pmd = pte_pfn_to_mfn(pmd);
557 return native_make_pmd(pmd); 603 return native_make_pmd(pmd);
558} 604}
605PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
559 606
560#if PAGETABLE_LEVELS == 4 607#if PAGETABLE_LEVELS == 4
561pudval_t xen_pud_val(pud_t pud) 608pudval_t xen_pud_val(pud_t pud)
562{ 609{
563 return pte_mfn_to_pfn(pud.pud); 610 return pte_mfn_to_pfn(pud.pud);
564} 611}
612PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
565 613
566pud_t xen_make_pud(pudval_t pud) 614pud_t xen_make_pud(pudval_t pud)
567{ 615{
@@ -569,6 +617,7 @@ pud_t xen_make_pud(pudval_t pud)
569 617
570 return native_make_pud(pud); 618 return native_make_pud(pud);
571} 619}
620PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
572 621
573pgd_t *xen_get_user_pgd(pgd_t *pgd) 622pgd_t *xen_get_user_pgd(pgd_t *pgd)
574{ 623{
@@ -1063,18 +1112,14 @@ static void drop_other_mm_ref(void *info)
1063 struct mm_struct *mm = info; 1112 struct mm_struct *mm = info;
1064 struct mm_struct *active_mm; 1113 struct mm_struct *active_mm;
1065 1114
1066#ifdef CONFIG_X86_64 1115 active_mm = percpu_read(cpu_tlbstate.active_mm);
1067 active_mm = read_pda(active_mm);
1068#else
1069 active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
1070#endif
1071 1116
1072 if (active_mm == mm) 1117 if (active_mm == mm)
1073 leave_mm(smp_processor_id()); 1118 leave_mm(smp_processor_id());
1074 1119
1075 /* If this cpu still has a stale cr3 reference, then make sure 1120 /* If this cpu still has a stale cr3 reference, then make sure
1076 it has been flushed. */ 1121 it has been flushed. */
1077 if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) { 1122 if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) {
1078 load_cr3(swapper_pg_dir); 1123 load_cr3(swapper_pg_dir);
1079 arch_flush_lazy_cpu_mode(); 1124 arch_flush_lazy_cpu_mode();
1080 } 1125 }
@@ -1156,6 +1201,706 @@ void xen_exit_mmap(struct mm_struct *mm)
1156 spin_unlock(&mm->page_table_lock); 1201 spin_unlock(&mm->page_table_lock);
1157} 1202}
1158 1203
1204static __init void xen_pagetable_setup_start(pgd_t *base)
1205{
1206}
1207
1208static __init void xen_pagetable_setup_done(pgd_t *base)
1209{
1210 xen_setup_shared_info();
1211}
1212
1213static void xen_write_cr2(unsigned long cr2)
1214{
1215 percpu_read(xen_vcpu)->arch.cr2 = cr2;
1216}
1217
1218static unsigned long xen_read_cr2(void)
1219{
1220 return percpu_read(xen_vcpu)->arch.cr2;
1221}
1222
1223unsigned long xen_read_cr2_direct(void)
1224{
1225 return percpu_read(xen_vcpu_info.arch.cr2);
1226}
1227
1228static void xen_flush_tlb(void)
1229{
1230 struct mmuext_op *op;
1231 struct multicall_space mcs;
1232
1233 preempt_disable();
1234
1235 mcs = xen_mc_entry(sizeof(*op));
1236
1237 op = mcs.args;
1238 op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1239 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1240
1241 xen_mc_issue(PARAVIRT_LAZY_MMU);
1242
1243 preempt_enable();
1244}
1245
1246static void xen_flush_tlb_single(unsigned long addr)
1247{
1248 struct mmuext_op *op;
1249 struct multicall_space mcs;
1250
1251 preempt_disable();
1252
1253 mcs = xen_mc_entry(sizeof(*op));
1254 op = mcs.args;
1255 op->cmd = MMUEXT_INVLPG_LOCAL;
1256 op->arg1.linear_addr = addr & PAGE_MASK;
1257 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1258
1259 xen_mc_issue(PARAVIRT_LAZY_MMU);
1260
1261 preempt_enable();
1262}
1263
1264static void xen_flush_tlb_others(const struct cpumask *cpus,
1265 struct mm_struct *mm, unsigned long va)
1266{
1267 struct {
1268 struct mmuext_op op;
1269 DECLARE_BITMAP(mask, NR_CPUS);
1270 } *args;
1271 struct multicall_space mcs;
1272
1273 BUG_ON(cpumask_empty(cpus));
1274 BUG_ON(!mm);
1275
1276 mcs = xen_mc_entry(sizeof(*args));
1277 args = mcs.args;
1278 args->op.arg2.vcpumask = to_cpumask(args->mask);
1279
1280 /* Remove us, and any offline CPUS. */
1281 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1282 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1283
1284 if (va == TLB_FLUSH_ALL) {
1285 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1286 } else {
1287 args->op.cmd = MMUEXT_INVLPG_MULTI;
1288 args->op.arg1.linear_addr = va;
1289 }
1290
1291 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1292
1293 xen_mc_issue(PARAVIRT_LAZY_MMU);
1294}
1295
1296static unsigned long xen_read_cr3(void)
1297{
1298 return percpu_read(xen_cr3);
1299}
1300
1301static void set_current_cr3(void *v)
1302{
1303 percpu_write(xen_current_cr3, (unsigned long)v);
1304}
1305
1306static void __xen_write_cr3(bool kernel, unsigned long cr3)
1307{
1308 struct mmuext_op *op;
1309 struct multicall_space mcs;
1310 unsigned long mfn;
1311
1312 if (cr3)
1313 mfn = pfn_to_mfn(PFN_DOWN(cr3));
1314 else
1315 mfn = 0;
1316
1317 WARN_ON(mfn == 0 && kernel);
1318
1319 mcs = __xen_mc_entry(sizeof(*op));
1320
1321 op = mcs.args;
1322 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1323 op->arg1.mfn = mfn;
1324
1325 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1326
1327 if (kernel) {
1328 percpu_write(xen_cr3, cr3);
1329
1330 /* Update xen_current_cr3 once the batch has actually
1331 been submitted. */
1332 xen_mc_callback(set_current_cr3, (void *)cr3);
1333 }
1334}
1335
1336static void xen_write_cr3(unsigned long cr3)
1337{
1338 BUG_ON(preemptible());
1339
1340 xen_mc_batch(); /* disables interrupts */
1341
1342 /* Update while interrupts are disabled, so its atomic with
1343 respect to ipis */
1344 percpu_write(xen_cr3, cr3);
1345
1346 __xen_write_cr3(true, cr3);
1347
1348#ifdef CONFIG_X86_64
1349 {
1350 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1351 if (user_pgd)
1352 __xen_write_cr3(false, __pa(user_pgd));
1353 else
1354 __xen_write_cr3(false, 0);
1355 }
1356#endif
1357
1358 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
1359}
1360
1361static int xen_pgd_alloc(struct mm_struct *mm)
1362{
1363 pgd_t *pgd = mm->pgd;
1364 int ret = 0;
1365
1366 BUG_ON(PagePinned(virt_to_page(pgd)));
1367
1368#ifdef CONFIG_X86_64
1369 {
1370 struct page *page = virt_to_page(pgd);
1371 pgd_t *user_pgd;
1372
1373 BUG_ON(page->private != 0);
1374
1375 ret = -ENOMEM;
1376
1377 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1378 page->private = (unsigned long)user_pgd;
1379
1380 if (user_pgd != NULL) {
1381 user_pgd[pgd_index(VSYSCALL_START)] =
1382 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1383 ret = 0;
1384 }
1385
1386 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1387 }
1388#endif
1389
1390 return ret;
1391}
1392
1393static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1394{
1395#ifdef CONFIG_X86_64
1396 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1397
1398 if (user_pgd)
1399 free_page((unsigned long)user_pgd);
1400#endif
1401}
1402
1403#ifdef CONFIG_HIGHPTE
1404static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
1405{
1406 pgprot_t prot = PAGE_KERNEL;
1407
1408 if (PagePinned(page))
1409 prot = PAGE_KERNEL_RO;
1410
1411 if (0 && PageHighMem(page))
1412 printk("mapping highpte %lx type %d prot %s\n",
1413 page_to_pfn(page), type,
1414 (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
1415
1416 return kmap_atomic_prot(page, type, prot);
1417}
1418#endif
1419
1420#ifdef CONFIG_X86_32
1421static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1422{
1423 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
1424 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1425 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1426 pte_val_ma(pte));
1427
1428 return pte;
1429}
1430
1431/* Init-time set_pte while constructing initial pagetables, which
1432 doesn't allow RO pagetable pages to be remapped RW */
1433static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
1434{
1435 pte = mask_rw_pte(ptep, pte);
1436
1437 xen_set_pte(ptep, pte);
1438}
1439#endif
1440
1441/* Early in boot, while setting up the initial pagetable, assume
1442 everything is pinned. */
1443static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1444{
1445#ifdef CONFIG_FLATMEM
1446 BUG_ON(mem_map); /* should only be used early */
1447#endif
1448 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1449}
1450
1451/* Early release_pte assumes that all pts are pinned, since there's
1452 only init_mm and anything attached to that is pinned. */
1453static void xen_release_pte_init(unsigned long pfn)
1454{
1455 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1456}
1457
1458static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1459{
1460 struct mmuext_op op;
1461 op.cmd = cmd;
1462 op.arg1.mfn = pfn_to_mfn(pfn);
1463 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1464 BUG();
1465}
1466
1467/* This needs to make sure the new pte page is pinned iff its being
1468 attached to a pinned pagetable. */
1469static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
1470{
1471 struct page *page = pfn_to_page(pfn);
1472
1473 if (PagePinned(virt_to_page(mm->pgd))) {
1474 SetPagePinned(page);
1475
1476 vm_unmap_aliases();
1477 if (!PageHighMem(page)) {
1478 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
1479 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1480 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1481 } else {
1482 /* make sure there are no stray mappings of
1483 this page */
1484 kmap_flush_unused();
1485 }
1486 }
1487}
1488
1489static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1490{
1491 xen_alloc_ptpage(mm, pfn, PT_PTE);
1492}
1493
1494static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1495{
1496 xen_alloc_ptpage(mm, pfn, PT_PMD);
1497}
1498
1499/* This should never happen until we're OK to use struct page */
1500static void xen_release_ptpage(unsigned long pfn, unsigned level)
1501{
1502 struct page *page = pfn_to_page(pfn);
1503
1504 if (PagePinned(page)) {
1505 if (!PageHighMem(page)) {
1506 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1507 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1508 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1509 }
1510 ClearPagePinned(page);
1511 }
1512}
1513
1514static void xen_release_pte(unsigned long pfn)
1515{
1516 xen_release_ptpage(pfn, PT_PTE);
1517}
1518
1519static void xen_release_pmd(unsigned long pfn)
1520{
1521 xen_release_ptpage(pfn, PT_PMD);
1522}
1523
1524#if PAGETABLE_LEVELS == 4
1525static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1526{
1527 xen_alloc_ptpage(mm, pfn, PT_PUD);
1528}
1529
1530static void xen_release_pud(unsigned long pfn)
1531{
1532 xen_release_ptpage(pfn, PT_PUD);
1533}
1534#endif
1535
1536void __init xen_reserve_top(void)
1537{
1538#ifdef CONFIG_X86_32
1539 unsigned long top = HYPERVISOR_VIRT_START;
1540 struct xen_platform_parameters pp;
1541
1542 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1543 top = pp.virt_start;
1544
1545 reserve_top_address(-top);
1546#endif /* CONFIG_X86_32 */
1547}
1548
1549/*
1550 * Like __va(), but returns address in the kernel mapping (which is
1551 * all we have until the physical memory mapping has been set up.
1552 */
1553static void *__ka(phys_addr_t paddr)
1554{
1555#ifdef CONFIG_X86_64
1556 return (void *)(paddr + __START_KERNEL_map);
1557#else
1558 return __va(paddr);
1559#endif
1560}
1561
1562/* Convert a machine address to physical address */
1563static unsigned long m2p(phys_addr_t maddr)
1564{
1565 phys_addr_t paddr;
1566
1567 maddr &= PTE_PFN_MASK;
1568 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1569
1570 return paddr;
1571}
1572
1573/* Convert a machine address to kernel virtual */
1574static void *m2v(phys_addr_t maddr)
1575{
1576 return __ka(m2p(maddr));
1577}
1578
1579static void set_page_prot(void *addr, pgprot_t prot)
1580{
1581 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1582 pte_t pte = pfn_pte(pfn, prot);
1583
1584 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1585 BUG();
1586}
1587
1588static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1589{
1590 unsigned pmdidx, pteidx;
1591 unsigned ident_pte;
1592 unsigned long pfn;
1593
1594 ident_pte = 0;
1595 pfn = 0;
1596 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1597 pte_t *pte_page;
1598
1599 /* Reuse or allocate a page of ptes */
1600 if (pmd_present(pmd[pmdidx]))
1601 pte_page = m2v(pmd[pmdidx].pmd);
1602 else {
1603 /* Check for free pte pages */
1604 if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
1605 break;
1606
1607 pte_page = &level1_ident_pgt[ident_pte];
1608 ident_pte += PTRS_PER_PTE;
1609
1610 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1611 }
1612
1613 /* Install mappings */
1614 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1615 pte_t pte;
1616
1617 if (pfn > max_pfn_mapped)
1618 max_pfn_mapped = pfn;
1619
1620 if (!pte_none(pte_page[pteidx]))
1621 continue;
1622
1623 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1624 pte_page[pteidx] = pte;
1625 }
1626 }
1627
1628 for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1629 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1630
1631 set_page_prot(pmd, PAGE_KERNEL_RO);
1632}
1633
1634#ifdef CONFIG_X86_64
1635static void convert_pfn_mfn(void *v)
1636{
1637 pte_t *pte = v;
1638 int i;
1639
1640 /* All levels are converted the same way, so just treat them
1641 as ptes. */
1642 for (i = 0; i < PTRS_PER_PTE; i++)
1643 pte[i] = xen_make_pte(pte[i].pte);
1644}
1645
1646/*
1647 * Set up the inital kernel pagetable.
1648 *
1649 * We can construct this by grafting the Xen provided pagetable into
1650 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
1651 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
1652 * means that only the kernel has a physical mapping to start with -
1653 * but that's enough to get __va working. We need to fill in the rest
1654 * of the physical mapping once some sort of allocator has been set
1655 * up.
1656 */
1657__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1658 unsigned long max_pfn)
1659{
1660 pud_t *l3;
1661 pmd_t *l2;
1662
1663 /* Zap identity mapping */
1664 init_level4_pgt[0] = __pgd(0);
1665
1666 /* Pre-constructed entries are in pfn, so convert to mfn */
1667 convert_pfn_mfn(init_level4_pgt);
1668 convert_pfn_mfn(level3_ident_pgt);
1669 convert_pfn_mfn(level3_kernel_pgt);
1670
1671 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1672 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1673
1674 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1675 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1676
1677 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1678 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1679 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1680
1681 /* Set up identity map */
1682 xen_map_identity_early(level2_ident_pgt, max_pfn);
1683
1684 /* Make pagetable pieces RO */
1685 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1686 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1687 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1688 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1689 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1690 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1691
1692 /* Pin down new L4 */
1693 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1694 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1695
1696 /* Unpin Xen-provided one */
1697 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1698
1699 /* Switch over */
1700 pgd = init_level4_pgt;
1701
1702 /*
1703 * At this stage there can be no user pgd, and no page
1704 * structure to attach it to, so make sure we just set kernel
1705 * pgd.
1706 */
1707 xen_mc_batch();
1708 __xen_write_cr3(true, __pa(pgd));
1709 xen_mc_issue(PARAVIRT_LAZY_CPU);
1710
1711 reserve_early(__pa(xen_start_info->pt_base),
1712 __pa(xen_start_info->pt_base +
1713 xen_start_info->nr_pt_frames * PAGE_SIZE),
1714 "XEN PAGETABLES");
1715
1716 return pgd;
1717}
1718#else /* !CONFIG_X86_64 */
1719static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
1720
1721__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1722 unsigned long max_pfn)
1723{
1724 pmd_t *kernel_pmd;
1725
1726 init_pg_tables_start = __pa(pgd);
1727 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
1728 max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
1729
1730 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1731 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1732
1733 xen_map_identity_early(level2_kernel_pgt, max_pfn);
1734
1735 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1736 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
1737 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
1738
1739 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1740 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1741 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1742
1743 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1744
1745 xen_write_cr3(__pa(swapper_pg_dir));
1746
1747 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1748
1749 return swapper_pg_dir;
1750}
1751#endif /* CONFIG_X86_64 */
1752
1753static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1754{
1755 pte_t pte;
1756
1757 phys >>= PAGE_SHIFT;
1758
1759 switch (idx) {
1760 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1761#ifdef CONFIG_X86_F00F_BUG
1762 case FIX_F00F_IDT:
1763#endif
1764#ifdef CONFIG_X86_32
1765 case FIX_WP_TEST:
1766 case FIX_VDSO:
1767# ifdef CONFIG_HIGHMEM
1768 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1769# endif
1770#else
1771 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1772#endif
1773#ifdef CONFIG_X86_LOCAL_APIC
1774 case FIX_APIC_BASE: /* maps dummy local APIC */
1775#endif
1776 pte = pfn_pte(phys, prot);
1777 break;
1778
1779 default:
1780 pte = mfn_pte(phys, prot);
1781 break;
1782 }
1783
1784 __native_set_fixmap(idx, pte);
1785
1786#ifdef CONFIG_X86_64
1787 /* Replicate changes to map the vsyscall page into the user
1788 pagetable vsyscall mapping. */
1789 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1790 unsigned long vaddr = __fix_to_virt(idx);
1791 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1792 }
1793#endif
1794}
1795
1796__init void xen_post_allocator_init(void)
1797{
1798 pv_mmu_ops.set_pte = xen_set_pte;
1799 pv_mmu_ops.set_pmd = xen_set_pmd;
1800 pv_mmu_ops.set_pud = xen_set_pud;
1801#if PAGETABLE_LEVELS == 4
1802 pv_mmu_ops.set_pgd = xen_set_pgd;
1803#endif
1804
1805 /* This will work as long as patching hasn't happened yet
1806 (which it hasn't) */
1807 pv_mmu_ops.alloc_pte = xen_alloc_pte;
1808 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1809 pv_mmu_ops.release_pte = xen_release_pte;
1810 pv_mmu_ops.release_pmd = xen_release_pmd;
1811#if PAGETABLE_LEVELS == 4
1812 pv_mmu_ops.alloc_pud = xen_alloc_pud;
1813 pv_mmu_ops.release_pud = xen_release_pud;
1814#endif
1815
1816#ifdef CONFIG_X86_64
1817 SetPagePinned(virt_to_page(level3_user_vsyscall));
1818#endif
1819 xen_mark_init_mm_pinned();
1820}
1821
1822
1823const struct pv_mmu_ops xen_mmu_ops __initdata = {
1824 .pagetable_setup_start = xen_pagetable_setup_start,
1825 .pagetable_setup_done = xen_pagetable_setup_done,
1826
1827 .read_cr2 = xen_read_cr2,
1828 .write_cr2 = xen_write_cr2,
1829
1830 .read_cr3 = xen_read_cr3,
1831 .write_cr3 = xen_write_cr3,
1832
1833 .flush_tlb_user = xen_flush_tlb,
1834 .flush_tlb_kernel = xen_flush_tlb,
1835 .flush_tlb_single = xen_flush_tlb_single,
1836 .flush_tlb_others = xen_flush_tlb_others,
1837
1838 .pte_update = paravirt_nop,
1839 .pte_update_defer = paravirt_nop,
1840
1841 .pgd_alloc = xen_pgd_alloc,
1842 .pgd_free = xen_pgd_free,
1843
1844 .alloc_pte = xen_alloc_pte_init,
1845 .release_pte = xen_release_pte_init,
1846 .alloc_pmd = xen_alloc_pte_init,
1847 .alloc_pmd_clone = paravirt_nop,
1848 .release_pmd = xen_release_pte_init,
1849
1850#ifdef CONFIG_HIGHPTE
1851 .kmap_atomic_pte = xen_kmap_atomic_pte,
1852#endif
1853
1854#ifdef CONFIG_X86_64
1855 .set_pte = xen_set_pte,
1856#else
1857 .set_pte = xen_set_pte_init,
1858#endif
1859 .set_pte_at = xen_set_pte_at,
1860 .set_pmd = xen_set_pmd_hyper,
1861
1862 .ptep_modify_prot_start = __ptep_modify_prot_start,
1863 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
1864
1865 .pte_val = PV_CALLEE_SAVE(xen_pte_val),
1866 .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
1867
1868 .make_pte = PV_CALLEE_SAVE(xen_make_pte),
1869 .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
1870
1871#ifdef CONFIG_X86_PAE
1872 .set_pte_atomic = xen_set_pte_atomic,
1873 .set_pte_present = xen_set_pte_at,
1874 .pte_clear = xen_pte_clear,
1875 .pmd_clear = xen_pmd_clear,
1876#endif /* CONFIG_X86_PAE */
1877 .set_pud = xen_set_pud_hyper,
1878
1879 .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
1880 .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
1881
1882#if PAGETABLE_LEVELS == 4
1883 .pud_val = PV_CALLEE_SAVE(xen_pud_val),
1884 .make_pud = PV_CALLEE_SAVE(xen_make_pud),
1885 .set_pgd = xen_set_pgd_hyper,
1886
1887 .alloc_pud = xen_alloc_pte_init,
1888 .release_pud = xen_release_pte_init,
1889#endif /* PAGETABLE_LEVELS == 4 */
1890
1891 .activate_mm = xen_activate_mm,
1892 .dup_mmap = xen_dup_mmap,
1893 .exit_mmap = xen_exit_mmap,
1894
1895 .lazy_mode = {
1896 .enter = paravirt_enter_lazy_mmu,
1897 .leave = xen_leave_lazy,
1898 },
1899
1900 .set_fixmap = xen_set_fixmap,
1901};
1902
1903
1159#ifdef CONFIG_XEN_DEBUG_FS 1904#ifdef CONFIG_XEN_DEBUG_FS
1160 1905
1161static struct dentry *d_mmu_debug; 1906static struct dentry *d_mmu_debug;
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 98d71659da5a..24d1b44a337d 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -54,4 +54,7 @@ pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t
54void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, 54void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
55 pte_t *ptep, pte_t pte); 55 pte_t *ptep, pte_t pte);
56 56
57unsigned long xen_read_cr2_direct(void);
58
59extern const struct pv_mmu_ops xen_mmu_ops;
57#endif /* _XEN_MMU_H */ 60#endif /* _XEN_MMU_H */
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index c738644b5435..8bff7e7c290b 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -39,6 +39,7 @@ struct mc_buffer {
39 struct multicall_entry entries[MC_BATCH]; 39 struct multicall_entry entries[MC_BATCH];
40#if MC_DEBUG 40#if MC_DEBUG
41 struct multicall_entry debug[MC_BATCH]; 41 struct multicall_entry debug[MC_BATCH];
42 void *caller[MC_BATCH];
42#endif 43#endif
43 unsigned char args[MC_ARGS]; 44 unsigned char args[MC_ARGS];
44 struct callback { 45 struct callback {
@@ -154,11 +155,12 @@ void xen_mc_flush(void)
154 ret, smp_processor_id()); 155 ret, smp_processor_id());
155 dump_stack(); 156 dump_stack();
156 for (i = 0; i < b->mcidx; i++) { 157 for (i = 0; i < b->mcidx; i++) {
157 printk(KERN_DEBUG " call %2d/%d: op=%lu arg=[%lx] result=%ld\n", 158 printk(KERN_DEBUG " call %2d/%d: op=%lu arg=[%lx] result=%ld\t%pF\n",
158 i+1, b->mcidx, 159 i+1, b->mcidx,
159 b->debug[i].op, 160 b->debug[i].op,
160 b->debug[i].args[0], 161 b->debug[i].args[0],
161 b->entries[i].result); 162 b->entries[i].result,
163 b->caller[i]);
162 } 164 }
163 } 165 }
164#endif 166#endif
@@ -168,8 +170,6 @@ void xen_mc_flush(void)
168 } else 170 } else
169 BUG_ON(b->argidx != 0); 171 BUG_ON(b->argidx != 0);
170 172
171 local_irq_restore(flags);
172
173 for (i = 0; i < b->cbidx; i++) { 173 for (i = 0; i < b->cbidx; i++) {
174 struct callback *cb = &b->callbacks[i]; 174 struct callback *cb = &b->callbacks[i];
175 175
@@ -177,7 +177,9 @@ void xen_mc_flush(void)
177 } 177 }
178 b->cbidx = 0; 178 b->cbidx = 0;
179 179
180 BUG_ON(ret); 180 local_irq_restore(flags);
181
182 WARN_ON(ret);
181} 183}
182 184
183struct multicall_space __xen_mc_entry(size_t args) 185struct multicall_space __xen_mc_entry(size_t args)
@@ -197,6 +199,9 @@ struct multicall_space __xen_mc_entry(size_t args)
197 } 199 }
198 200
199 ret.mc = &b->entries[b->mcidx]; 201 ret.mc = &b->entries[b->mcidx];
202#ifdef MC_DEBUG
203 b->caller[b->mcidx] = __builtin_return_address(0);
204#endif
200 b->mcidx++; 205 b->mcidx++;
201 ret.args = &b->args[argidx]; 206 ret.args = &b->args[argidx];
202 b->argidx = argidx + args; 207 b->argidx = argidx + args;
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index fa3e10725d98..9e565da5d1f7 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -41,7 +41,7 @@ static inline void xen_mc_issue(unsigned mode)
41 xen_mc_flush(); 41 xen_mc_flush();
42 42
43 /* restore flags saved in xen_mc_batch */ 43 /* restore flags saved in xen_mc_batch */
44 local_irq_restore(x86_read_percpu(xen_mc_irq_flags)); 44 local_irq_restore(percpu_read(xen_mc_irq_flags));
45} 45}
46 46
47/* Set up a callback to be called when the current batch is flushed */ 47/* Set up a callback to be called when the current batch is flushed */
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index c44e2069c7c7..8d470562ffc9 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -50,11 +50,7 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
50 */ 50 */
51static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) 51static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
52{ 52{
53#ifdef CONFIG_X86_32 53 inc_irq_stat(irq_resched_count);
54 __get_cpu_var(irq_stat).irq_resched_count++;
55#else
56 add_pda(irq_resched_count, 1);
57#endif
58 54
59 return IRQ_HANDLED; 55 return IRQ_HANDLED;
60} 56}
@@ -78,7 +74,7 @@ static __cpuinit void cpu_bringup(void)
78 xen_setup_cpu_clockevents(); 74 xen_setup_cpu_clockevents();
79 75
80 cpu_set(cpu, cpu_online_map); 76 cpu_set(cpu, cpu_online_map);
81 x86_write_percpu(cpu_state, CPU_ONLINE); 77 percpu_write(cpu_state, CPU_ONLINE);
82 wmb(); 78 wmb();
83 79
84 /* We can take interrupts now: we're officially "up". */ 80 /* We can take interrupts now: we're officially "up". */
@@ -174,7 +170,7 @@ static void __init xen_smp_prepare_boot_cpu(void)
174 170
175 /* We've switched to the "real" per-cpu gdt, so make sure the 171 /* We've switched to the "real" per-cpu gdt, so make sure the
176 old memory can be recycled */ 172 old memory can be recycled */
177 make_lowmem_page_readwrite(&per_cpu_var(gdt_page)); 173 make_lowmem_page_readwrite(xen_initial_gdt);
178 174
179 xen_setup_vcpu_info_placement(); 175 xen_setup_vcpu_info_placement();
180} 176}
@@ -223,6 +219,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
223{ 219{
224 struct vcpu_guest_context *ctxt; 220 struct vcpu_guest_context *ctxt;
225 struct desc_struct *gdt; 221 struct desc_struct *gdt;
222 unsigned long gdt_mfn;
226 223
227 if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map)) 224 if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map))
228 return 0; 225 return 0;
@@ -239,6 +236,8 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
239 ctxt->user_regs.ss = __KERNEL_DS; 236 ctxt->user_regs.ss = __KERNEL_DS;
240#ifdef CONFIG_X86_32 237#ifdef CONFIG_X86_32
241 ctxt->user_regs.fs = __KERNEL_PERCPU; 238 ctxt->user_regs.fs = __KERNEL_PERCPU;
239#else
240 ctxt->gs_base_kernel = per_cpu_offset(cpu);
242#endif 241#endif
243 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; 242 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
244 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ 243 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
@@ -250,9 +249,12 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
250 ctxt->ldt_ents = 0; 249 ctxt->ldt_ents = 0;
251 250
252 BUG_ON((unsigned long)gdt & ~PAGE_MASK); 251 BUG_ON((unsigned long)gdt & ~PAGE_MASK);
252
253 gdt_mfn = arbitrary_virt_to_mfn(gdt);
253 make_lowmem_page_readonly(gdt); 254 make_lowmem_page_readonly(gdt);
255 make_lowmem_page_readonly(mfn_to_virt(gdt_mfn));
254 256
255 ctxt->gdt_frames[0] = virt_to_mfn(gdt); 257 ctxt->gdt_frames[0] = gdt_mfn;
256 ctxt->gdt_ents = GDT_ENTRIES; 258 ctxt->gdt_ents = GDT_ENTRIES;
257 259
258 ctxt->user_regs.cs = __KERNEL_CS; 260 ctxt->user_regs.cs = __KERNEL_CS;
@@ -283,23 +285,14 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
283 struct task_struct *idle = idle_task(cpu); 285 struct task_struct *idle = idle_task(cpu);
284 int rc; 286 int rc;
285 287
286#ifdef CONFIG_X86_64
287 /* Allocate node local memory for AP pdas */
288 WARN_ON(cpu == 0);
289 if (cpu > 0) {
290 rc = get_local_pda(cpu);
291 if (rc)
292 return rc;
293 }
294#endif
295
296#ifdef CONFIG_X86_32
297 init_gdt(cpu);
298 per_cpu(current_task, cpu) = idle; 288 per_cpu(current_task, cpu) = idle;
289#ifdef CONFIG_X86_32
299 irq_ctx_init(cpu); 290 irq_ctx_init(cpu);
300#else 291#else
301 cpu_pda(cpu)->pcurrent = idle;
302 clear_tsk_thread_flag(idle, TIF_FORK); 292 clear_tsk_thread_flag(idle, TIF_FORK);
293 per_cpu(kernel_stack, cpu) =
294 (unsigned long)task_stack_page(idle) -
295 KERNEL_STACK_OFFSET + THREAD_SIZE;
303#endif 296#endif
304 xen_setup_timer(cpu); 297 xen_setup_timer(cpu);
305 xen_init_lock_cpu(cpu); 298 xen_init_lock_cpu(cpu);
@@ -445,11 +438,7 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
445{ 438{
446 irq_enter(); 439 irq_enter();
447 generic_smp_call_function_interrupt(); 440 generic_smp_call_function_interrupt();
448#ifdef CONFIG_X86_32 441 inc_irq_stat(irq_call_count);
449 __get_cpu_var(irq_stat).irq_call_count++;
450#else
451 add_pda(irq_call_count, 1);
452#endif
453 irq_exit(); 442 irq_exit();
454 443
455 return IRQ_HANDLED; 444 return IRQ_HANDLED;
@@ -459,11 +448,7 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
459{ 448{
460 irq_enter(); 449 irq_enter();
461 generic_smp_call_function_single_interrupt(); 450 generic_smp_call_function_single_interrupt();
462#ifdef CONFIG_X86_32 451 inc_irq_stat(irq_call_count);
463 __get_cpu_var(irq_stat).irq_call_count++;
464#else
465 add_pda(irq_call_count, 1);
466#endif
467 irq_exit(); 452 irq_exit();
468 453
469 return IRQ_HANDLED; 454 return IRQ_HANDLED;
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 212ffe012b76..95be7b434724 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -6,6 +6,7 @@
6 6
7#include <asm/xen/hypercall.h> 7#include <asm/xen/hypercall.h>
8#include <asm/xen/page.h> 8#include <asm/xen/page.h>
9#include <asm/fixmap.h>
9 10
10#include "xen-ops.h" 11#include "xen-ops.h"
11#include "mmu.h" 12#include "mmu.h"
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
new file mode 100644
index 000000000000..79d7362ad6d1
--- /dev/null
+++ b/arch/x86/xen/xen-asm.S
@@ -0,0 +1,142 @@
1/*
2 * Asm versions of Xen pv-ops, suitable for either direct use or
3 * inlining. The inline versions are the same as the direct-use
4 * versions, with the pre- and post-amble chopped off.
5 *
6 * This code is encoded for size rather than absolute efficiency, with
7 * a view to being able to inline as much as possible.
8 *
9 * We only bother with direct forms (ie, vcpu in percpu data) of the
10 * operations here; the indirect forms are better handled in C, since
11 * they're generally too large to inline anyway.
12 */
13
14#include <asm/asm-offsets.h>
15#include <asm/percpu.h>
16#include <asm/processor-flags.h>
17
18#include "xen-asm.h"
19
20/*
21 * Enable events. This clears the event mask and tests the pending
22 * event status with one and operation. If there are pending events,
23 * then enter the hypervisor to get them handled.
24 */
25ENTRY(xen_irq_enable_direct)
26 /* Unmask events */
27 movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
28
29 /*
30 * Preempt here doesn't matter because that will deal with any
31 * pending interrupts. The pending check may end up being run
32 * on the wrong CPU, but that doesn't hurt.
33 */
34
35 /* Test for pending */
36 testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
37 jz 1f
38
392: call check_events
401:
41ENDPATCH(xen_irq_enable_direct)
42 ret
43 ENDPROC(xen_irq_enable_direct)
44 RELOC(xen_irq_enable_direct, 2b+1)
45
46
47/*
48 * Disabling events is simply a matter of making the event mask
49 * non-zero.
50 */
51ENTRY(xen_irq_disable_direct)
52 movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
53ENDPATCH(xen_irq_disable_direct)
54 ret
55 ENDPROC(xen_irq_disable_direct)
56 RELOC(xen_irq_disable_direct, 0)
57
58/*
59 * (xen_)save_fl is used to get the current interrupt enable status.
60 * Callers expect the status to be in X86_EFLAGS_IF, and other bits
61 * may be set in the return value. We take advantage of this by
62 * making sure that X86_EFLAGS_IF has the right value (and other bits
63 * in that byte are 0), but other bits in the return value are
64 * undefined. We need to toggle the state of the bit, because Xen and
65 * x86 use opposite senses (mask vs enable).
66 */
67ENTRY(xen_save_fl_direct)
68 testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
69 setz %ah
70 addb %ah, %ah
71ENDPATCH(xen_save_fl_direct)
72 ret
73 ENDPROC(xen_save_fl_direct)
74 RELOC(xen_save_fl_direct, 0)
75
76
77/*
78 * In principle the caller should be passing us a value return from
79 * xen_save_fl_direct, but for robustness sake we test only the
80 * X86_EFLAGS_IF flag rather than the whole byte. After setting the
81 * interrupt mask state, it checks for unmasked pending events and
82 * enters the hypervisor to get them delivered if so.
83 */
84ENTRY(xen_restore_fl_direct)
85#ifdef CONFIG_X86_64
86 testw $X86_EFLAGS_IF, %di
87#else
88 testb $X86_EFLAGS_IF>>8, %ah
89#endif
90 setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
91 /*
92 * Preempt here doesn't matter because that will deal with any
93 * pending interrupts. The pending check may end up being run
94 * on the wrong CPU, but that doesn't hurt.
95 */
96
97 /* check for unmasked and pending */
98 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
99 jz 1f
1002: call check_events
1011:
102ENDPATCH(xen_restore_fl_direct)
103 ret
104 ENDPROC(xen_restore_fl_direct)
105 RELOC(xen_restore_fl_direct, 2b+1)
106
107
108/*
109 * Force an event check by making a hypercall, but preserve regs
110 * before making the call.
111 */
112check_events:
113#ifdef CONFIG_X86_32
114 push %eax
115 push %ecx
116 push %edx
117 call xen_force_evtchn_callback
118 pop %edx
119 pop %ecx
120 pop %eax
121#else
122 push %rax
123 push %rcx
124 push %rdx
125 push %rsi
126 push %rdi
127 push %r8
128 push %r9
129 push %r10
130 push %r11
131 call xen_force_evtchn_callback
132 pop %r11
133 pop %r10
134 pop %r9
135 pop %r8
136 pop %rdi
137 pop %rsi
138 pop %rdx
139 pop %rcx
140 pop %rax
141#endif
142 ret
diff --git a/arch/x86/xen/xen-asm.h b/arch/x86/xen/xen-asm.h
new file mode 100644
index 000000000000..465276467a47
--- /dev/null
+++ b/arch/x86/xen/xen-asm.h
@@ -0,0 +1,12 @@
1#ifndef _XEN_XEN_ASM_H
2#define _XEN_XEN_ASM_H
3
4#include <linux/linkage.h>
5
6#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
7#define ENDPATCH(x) .globl x##_end; x##_end=.
8
9/* Pseudo-flag used for virtual NMI, which we don't implement yet */
10#define XEN_EFLAGS_NMI 0x80000000
11
12#endif
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index 42786f59d9c0..88e15deb8b82 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -1,117 +1,43 @@
1/* 1/*
2 Asm versions of Xen pv-ops, suitable for either direct use or inlining. 2 * Asm versions of Xen pv-ops, suitable for either direct use or
3 The inline versions are the same as the direct-use versions, with the 3 * inlining. The inline versions are the same as the direct-use
4 pre- and post-amble chopped off. 4 * versions, with the pre- and post-amble chopped off.
5 5 *
6 This code is encoded for size rather than absolute efficiency, 6 * This code is encoded for size rather than absolute efficiency, with
7 with a view to being able to inline as much as possible. 7 * a view to being able to inline as much as possible.
8 8 *
9 We only bother with direct forms (ie, vcpu in pda) of the operations 9 * We only bother with direct forms (ie, vcpu in pda) of the
10 here; the indirect forms are better handled in C, since they're 10 * operations here; the indirect forms are better handled in C, since
11 generally too large to inline anyway. 11 * they're generally too large to inline anyway.
12 */ 12 */
13 13
14#include <linux/linkage.h>
15
16#include <asm/asm-offsets.h>
17#include <asm/thread_info.h> 14#include <asm/thread_info.h>
18#include <asm/percpu.h>
19#include <asm/processor-flags.h> 15#include <asm/processor-flags.h>
20#include <asm/segment.h> 16#include <asm/segment.h>
21 17
22#include <xen/interface/xen.h> 18#include <xen/interface/xen.h>
23 19
24#define RELOC(x, v) .globl x##_reloc; x##_reloc=v 20#include "xen-asm.h"
25#define ENDPATCH(x) .globl x##_end; x##_end=.
26
27/* Pseudo-flag used for virtual NMI, which we don't implement yet */
28#define XEN_EFLAGS_NMI 0x80000000
29
30/*
31 Enable events. This clears the event mask and tests the pending
32 event status with one and operation. If there are pending
33 events, then enter the hypervisor to get them handled.
34 */
35ENTRY(xen_irq_enable_direct)
36 /* Unmask events */
37 movb $0, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
38
39 /* Preempt here doesn't matter because that will deal with
40 any pending interrupts. The pending check may end up being
41 run on the wrong CPU, but that doesn't hurt. */
42
43 /* Test for pending */
44 testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
45 jz 1f
46
472: call check_events
481:
49ENDPATCH(xen_irq_enable_direct)
50 ret
51 ENDPROC(xen_irq_enable_direct)
52 RELOC(xen_irq_enable_direct, 2b+1)
53
54
55/*
56 Disabling events is simply a matter of making the event mask
57 non-zero.
58 */
59ENTRY(xen_irq_disable_direct)
60 movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
61ENDPATCH(xen_irq_disable_direct)
62 ret
63 ENDPROC(xen_irq_disable_direct)
64 RELOC(xen_irq_disable_direct, 0)
65 21
66/* 22/*
67 (xen_)save_fl is used to get the current interrupt enable status. 23 * Force an event check by making a hypercall, but preserve regs
68 Callers expect the status to be in X86_EFLAGS_IF, and other bits 24 * before making the call.
69 may be set in the return value. We take advantage of this by
70 making sure that X86_EFLAGS_IF has the right value (and other bits
71 in that byte are 0), but other bits in the return value are
72 undefined. We need to toggle the state of the bit, because
73 Xen and x86 use opposite senses (mask vs enable).
74 */ 25 */
75ENTRY(xen_save_fl_direct) 26check_events:
76 testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask 27 push %eax
77 setz %ah 28 push %ecx
78 addb %ah,%ah 29 push %edx
79ENDPATCH(xen_save_fl_direct) 30 call xen_force_evtchn_callback
80 ret 31 pop %edx
81 ENDPROC(xen_save_fl_direct) 32 pop %ecx
82 RELOC(xen_save_fl_direct, 0) 33 pop %eax
83
84
85/*
86 In principle the caller should be passing us a value return
87 from xen_save_fl_direct, but for robustness sake we test only
88 the X86_EFLAGS_IF flag rather than the whole byte. After
89 setting the interrupt mask state, it checks for unmasked
90 pending events and enters the hypervisor to get them delivered
91 if so.
92 */
93ENTRY(xen_restore_fl_direct)
94 testb $X86_EFLAGS_IF>>8, %ah
95 setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
96 /* Preempt here doesn't matter because that will deal with
97 any pending interrupts. The pending check may end up being
98 run on the wrong CPU, but that doesn't hurt. */
99
100 /* check for unmasked and pending */
101 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
102 jz 1f
1032: call check_events
1041:
105ENDPATCH(xen_restore_fl_direct)
106 ret 34 ret
107 ENDPROC(xen_restore_fl_direct)
108 RELOC(xen_restore_fl_direct, 2b+1)
109 35
110/* 36/*
111 We can't use sysexit directly, because we're not running in ring0. 37 * We can't use sysexit directly, because we're not running in ring0.
112 But we can easily fake it up using iret. Assuming xen_sysexit 38 * But we can easily fake it up using iret. Assuming xen_sysexit is
113 is jumped to with a standard stack frame, we can just strip it 39 * jumped to with a standard stack frame, we can just strip it back to
114 back to a standard iret frame and use iret. 40 * a standard iret frame and use iret.
115 */ 41 */
116ENTRY(xen_sysexit) 42ENTRY(xen_sysexit)
117 movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */ 43 movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */
@@ -122,33 +48,31 @@ ENTRY(xen_sysexit)
122ENDPROC(xen_sysexit) 48ENDPROC(xen_sysexit)
123 49
124/* 50/*
125 This is run where a normal iret would be run, with the same stack setup: 51 * This is run where a normal iret would be run, with the same stack setup:
126 8: eflags 52 * 8: eflags
127 4: cs 53 * 4: cs
128 esp-> 0: eip 54 * esp-> 0: eip
129 55 *
130 This attempts to make sure that any pending events are dealt 56 * This attempts to make sure that any pending events are dealt with
131 with on return to usermode, but there is a small window in 57 * on return to usermode, but there is a small window in which an
132 which an event can happen just before entering usermode. If 58 * event can happen just before entering usermode. If the nested
133 the nested interrupt ends up setting one of the TIF_WORK_MASK 59 * interrupt ends up setting one of the TIF_WORK_MASK pending work
134 pending work flags, they will not be tested again before 60 * flags, they will not be tested again before returning to
135 returning to usermode. This means that a process can end up 61 * usermode. This means that a process can end up with pending work,
136 with pending work, which will be unprocessed until the process 62 * which will be unprocessed until the process enters and leaves the
137 enters and leaves the kernel again, which could be an 63 * kernel again, which could be an unbounded amount of time. This
138 unbounded amount of time. This means that a pending signal or 64 * means that a pending signal or reschedule event could be
139 reschedule event could be indefinitely delayed. 65 * indefinitely delayed.
140 66 *
141 The fix is to notice a nested interrupt in the critical 67 * The fix is to notice a nested interrupt in the critical window, and
142 window, and if one occurs, then fold the nested interrupt into 68 * if one occurs, then fold the nested interrupt into the current
143 the current interrupt stack frame, and re-process it 69 * interrupt stack frame, and re-process it iteratively rather than
144 iteratively rather than recursively. This means that it will 70 * recursively. This means that it will exit via the normal path, and
145 exit via the normal path, and all pending work will be dealt 71 * all pending work will be dealt with appropriately.
146 with appropriately. 72 *
147 73 * Because the nested interrupt handler needs to deal with the current
148 Because the nested interrupt handler needs to deal with the 74 * stack state in whatever form its in, we keep things simple by only
149 current stack state in whatever form its in, we keep things 75 * using a single register which is pushed/popped on the stack.
150 simple by only using a single register which is pushed/popped
151 on the stack.
152 */ 76 */
153ENTRY(xen_iret) 77ENTRY(xen_iret)
154 /* test eflags for special cases */ 78 /* test eflags for special cases */
@@ -158,13 +82,15 @@ ENTRY(xen_iret)
158 push %eax 82 push %eax
159 ESP_OFFSET=4 # bytes pushed onto stack 83 ESP_OFFSET=4 # bytes pushed onto stack
160 84
161 /* Store vcpu_info pointer for easy access. Do it this 85 /*
162 way to avoid having to reload %fs */ 86 * Store vcpu_info pointer for easy access. Do it this way to
87 * avoid having to reload %fs
88 */
163#ifdef CONFIG_SMP 89#ifdef CONFIG_SMP
164 GET_THREAD_INFO(%eax) 90 GET_THREAD_INFO(%eax)
165 movl TI_cpu(%eax),%eax 91 movl TI_cpu(%eax), %eax
166 movl __per_cpu_offset(,%eax,4),%eax 92 movl __per_cpu_offset(,%eax,4), %eax
167 mov per_cpu__xen_vcpu(%eax),%eax 93 mov per_cpu__xen_vcpu(%eax), %eax
168#else 94#else
169 movl per_cpu__xen_vcpu, %eax 95 movl per_cpu__xen_vcpu, %eax
170#endif 96#endif
@@ -172,37 +98,46 @@ ENTRY(xen_iret)
172 /* check IF state we're restoring */ 98 /* check IF state we're restoring */
173 testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp) 99 testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
174 100
175 /* Maybe enable events. Once this happens we could get a 101 /*
176 recursive event, so the critical region starts immediately 102 * Maybe enable events. Once this happens we could get a
177 afterwards. However, if that happens we don't end up 103 * recursive event, so the critical region starts immediately
178 resuming the code, so we don't have to be worried about 104 * afterwards. However, if that happens we don't end up
179 being preempted to another CPU. */ 105 * resuming the code, so we don't have to be worried about
106 * being preempted to another CPU.
107 */
180 setz XEN_vcpu_info_mask(%eax) 108 setz XEN_vcpu_info_mask(%eax)
181xen_iret_start_crit: 109xen_iret_start_crit:
182 110
183 /* check for unmasked and pending */ 111 /* check for unmasked and pending */
184 cmpw $0x0001, XEN_vcpu_info_pending(%eax) 112 cmpw $0x0001, XEN_vcpu_info_pending(%eax)
185 113
186 /* If there's something pending, mask events again so we 114 /*
187 can jump back into xen_hypervisor_callback */ 115 * If there's something pending, mask events again so we can
116 * jump back into xen_hypervisor_callback
117 */
188 sete XEN_vcpu_info_mask(%eax) 118 sete XEN_vcpu_info_mask(%eax)
189 119
190 popl %eax 120 popl %eax
191 121
192 /* From this point on the registers are restored and the stack 122 /*
193 updated, so we don't need to worry about it if we're preempted */ 123 * From this point on the registers are restored and the stack
124 * updated, so we don't need to worry about it if we're
125 * preempted
126 */
194iret_restore_end: 127iret_restore_end:
195 128
196 /* Jump to hypervisor_callback after fixing up the stack. 129 /*
197 Events are masked, so jumping out of the critical 130 * Jump to hypervisor_callback after fixing up the stack.
198 region is OK. */ 131 * Events are masked, so jumping out of the critical region is
132 * OK.
133 */
199 je xen_hypervisor_callback 134 je xen_hypervisor_callback
200 135
2011: iret 1361: iret
202xen_iret_end_crit: 137xen_iret_end_crit:
203.section __ex_table,"a" 138.section __ex_table, "a"
204 .align 4 139 .align 4
205 .long 1b,iret_exc 140 .long 1b, iret_exc
206.previous 141.previous
207 142
208hyper_iret: 143hyper_iret:
@@ -212,55 +147,55 @@ hyper_iret:
212 .globl xen_iret_start_crit, xen_iret_end_crit 147 .globl xen_iret_start_crit, xen_iret_end_crit
213 148
214/* 149/*
215 This is called by xen_hypervisor_callback in entry.S when it sees 150 * This is called by xen_hypervisor_callback in entry.S when it sees
216 that the EIP at the time of interrupt was between xen_iret_start_crit 151 * that the EIP at the time of interrupt was between
217 and xen_iret_end_crit. We're passed the EIP in %eax so we can do 152 * xen_iret_start_crit and xen_iret_end_crit. We're passed the EIP in
218 a more refined determination of what to do. 153 * %eax so we can do a more refined determination of what to do.
219 154 *
220 The stack format at this point is: 155 * The stack format at this point is:
221 ---------------- 156 * ----------------
222 ss : (ss/esp may be present if we came from usermode) 157 * ss : (ss/esp may be present if we came from usermode)
223 esp : 158 * esp :
224 eflags } outer exception info 159 * eflags } outer exception info
225 cs } 160 * cs }
226 eip } 161 * eip }
227 ---------------- <- edi (copy dest) 162 * ---------------- <- edi (copy dest)
228 eax : outer eax if it hasn't been restored 163 * eax : outer eax if it hasn't been restored
229 ---------------- 164 * ----------------
230 eflags } nested exception info 165 * eflags } nested exception info
231 cs } (no ss/esp because we're nested 166 * cs } (no ss/esp because we're nested
232 eip } from the same ring) 167 * eip } from the same ring)
233 orig_eax }<- esi (copy src) 168 * orig_eax }<- esi (copy src)
234 - - - - - - - - 169 * - - - - - - - -
235 fs } 170 * fs }
236 es } 171 * es }
237 ds } SAVE_ALL state 172 * ds } SAVE_ALL state
238 eax } 173 * eax }
239 : : 174 * : :
240 ebx }<- esp 175 * ebx }<- esp
241 ---------------- 176 * ----------------
242 177 *
243 In order to deliver the nested exception properly, we need to shift 178 * In order to deliver the nested exception properly, we need to shift
244 everything from the return addr up to the error code so it 179 * everything from the return addr up to the error code so it sits
245 sits just under the outer exception info. This means that when we 180 * just under the outer exception info. This means that when we
246 handle the exception, we do it in the context of the outer exception 181 * handle the exception, we do it in the context of the outer
247 rather than starting a new one. 182 * exception rather than starting a new one.
248 183 *
249 The only caveat is that if the outer eax hasn't been 184 * The only caveat is that if the outer eax hasn't been restored yet
250 restored yet (ie, it's still on stack), we need to insert 185 * (ie, it's still on stack), we need to insert its value into the
251 its value into the SAVE_ALL state before going on, since 186 * SAVE_ALL state before going on, since it's usermode state which we
252 it's usermode state which we eventually need to restore. 187 * eventually need to restore.
253 */ 188 */
254ENTRY(xen_iret_crit_fixup) 189ENTRY(xen_iret_crit_fixup)
255 /* 190 /*
256 Paranoia: Make sure we're really coming from kernel space. 191 * Paranoia: Make sure we're really coming from kernel space.
257 One could imagine a case where userspace jumps into the 192 * One could imagine a case where userspace jumps into the
258 critical range address, but just before the CPU delivers a GP, 193 * critical range address, but just before the CPU delivers a
259 it decides to deliver an interrupt instead. Unlikely? 194 * GP, it decides to deliver an interrupt instead. Unlikely?
260 Definitely. Easy to avoid? Yes. The Intel documents 195 * Definitely. Easy to avoid? Yes. The Intel documents
261 explicitly say that the reported EIP for a bad jump is the 196 * explicitly say that the reported EIP for a bad jump is the
262 jump instruction itself, not the destination, but some virtual 197 * jump instruction itself, not the destination, but some
263 environments get this wrong. 198 * virtual environments get this wrong.
264 */ 199 */
265 movl PT_CS(%esp), %ecx 200 movl PT_CS(%esp), %ecx
266 andl $SEGMENT_RPL_MASK, %ecx 201 andl $SEGMENT_RPL_MASK, %ecx
@@ -270,15 +205,17 @@ ENTRY(xen_iret_crit_fixup)
270 lea PT_ORIG_EAX(%esp), %esi 205 lea PT_ORIG_EAX(%esp), %esi
271 lea PT_EFLAGS(%esp), %edi 206 lea PT_EFLAGS(%esp), %edi
272 207
273 /* If eip is before iret_restore_end then stack 208 /*
274 hasn't been restored yet. */ 209 * If eip is before iret_restore_end then stack
210 * hasn't been restored yet.
211 */
275 cmp $iret_restore_end, %eax 212 cmp $iret_restore_end, %eax
276 jae 1f 213 jae 1f
277 214
278 movl 0+4(%edi),%eax /* copy EAX (just above top of frame) */ 215 movl 0+4(%edi), %eax /* copy EAX (just above top of frame) */
279 movl %eax, PT_EAX(%esp) 216 movl %eax, PT_EAX(%esp)
280 217
281 lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */ 218 lea ESP_OFFSET(%edi), %edi /* move dest up over saved regs */
282 219
283 /* set up the copy */ 220 /* set up the copy */
2841: std 2211: std
@@ -286,20 +223,6 @@ ENTRY(xen_iret_crit_fixup)
286 rep movsl 223 rep movsl
287 cld 224 cld
288 225
289 lea 4(%edi),%esp /* point esp to new frame */ 226 lea 4(%edi), %esp /* point esp to new frame */
2902: jmp xen_do_upcall 2272: jmp xen_do_upcall
291 228
292
293/*
294 Force an event check by making a hypercall,
295 but preserve regs before making the call.
296 */
297check_events:
298 push %eax
299 push %ecx
300 push %edx
301 call xen_force_evtchn_callback
302 pop %edx
303 pop %ecx
304 pop %eax
305 ret
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 05794c566e87..02f496a8dbaa 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -1,174 +1,45 @@
1/* 1/*
2 Asm versions of Xen pv-ops, suitable for either direct use or inlining. 2 * Asm versions of Xen pv-ops, suitable for either direct use or
3 The inline versions are the same as the direct-use versions, with the 3 * inlining. The inline versions are the same as the direct-use
4 pre- and post-amble chopped off. 4 * versions, with the pre- and post-amble chopped off.
5 5 *
6 This code is encoded for size rather than absolute efficiency, 6 * This code is encoded for size rather than absolute efficiency, with
7 with a view to being able to inline as much as possible. 7 * a view to being able to inline as much as possible.
8 8 *
9 We only bother with direct forms (ie, vcpu in pda) of the operations 9 * We only bother with direct forms (ie, vcpu in pda) of the
10 here; the indirect forms are better handled in C, since they're 10 * operations here; the indirect forms are better handled in C, since
11 generally too large to inline anyway. 11 * they're generally too large to inline anyway.
12 */ 12 */
13 13
14#include <linux/linkage.h>
15
16#include <asm/asm-offsets.h>
17#include <asm/processor-flags.h>
18#include <asm/errno.h> 14#include <asm/errno.h>
15#include <asm/percpu.h>
16#include <asm/processor-flags.h>
19#include <asm/segment.h> 17#include <asm/segment.h>
20 18
21#include <xen/interface/xen.h> 19#include <xen/interface/xen.h>
22 20
23#define RELOC(x, v) .globl x##_reloc; x##_reloc=v 21#include "xen-asm.h"
24#define ENDPATCH(x) .globl x##_end; x##_end=.
25
26/* Pseudo-flag used for virtual NMI, which we don't implement yet */
27#define XEN_EFLAGS_NMI 0x80000000
28
29#if 1
30/*
31 x86-64 does not yet support direct access to percpu variables
32 via a segment override, so we just need to make sure this code
33 never gets used
34 */
35#define BUG ud2a
36#define PER_CPU_VAR(var, off) 0xdeadbeef
37#endif
38
39/*
40 Enable events. This clears the event mask and tests the pending
41 event status with one and operation. If there are pending
42 events, then enter the hypervisor to get them handled.
43 */
44ENTRY(xen_irq_enable_direct)
45 BUG
46
47 /* Unmask events */
48 movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
49
50 /* Preempt here doesn't matter because that will deal with
51 any pending interrupts. The pending check may end up being
52 run on the wrong CPU, but that doesn't hurt. */
53
54 /* Test for pending */
55 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
56 jz 1f
57
582: call check_events
591:
60ENDPATCH(xen_irq_enable_direct)
61 ret
62 ENDPROC(xen_irq_enable_direct)
63 RELOC(xen_irq_enable_direct, 2b+1)
64
65/*
66 Disabling events is simply a matter of making the event mask
67 non-zero.
68 */
69ENTRY(xen_irq_disable_direct)
70 BUG
71
72 movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
73ENDPATCH(xen_irq_disable_direct)
74 ret
75 ENDPROC(xen_irq_disable_direct)
76 RELOC(xen_irq_disable_direct, 0)
77
78/*
79 (xen_)save_fl is used to get the current interrupt enable status.
80 Callers expect the status to be in X86_EFLAGS_IF, and other bits
81 may be set in the return value. We take advantage of this by
82 making sure that X86_EFLAGS_IF has the right value (and other bits
83 in that byte are 0), but other bits in the return value are
84 undefined. We need to toggle the state of the bit, because
85 Xen and x86 use opposite senses (mask vs enable).
86 */
87ENTRY(xen_save_fl_direct)
88 BUG
89
90 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
91 setz %ah
92 addb %ah,%ah
93ENDPATCH(xen_save_fl_direct)
94 ret
95 ENDPROC(xen_save_fl_direct)
96 RELOC(xen_save_fl_direct, 0)
97
98/*
99 In principle the caller should be passing us a value return
100 from xen_save_fl_direct, but for robustness sake we test only
101 the X86_EFLAGS_IF flag rather than the whole byte. After
102 setting the interrupt mask state, it checks for unmasked
103 pending events and enters the hypervisor to get them delivered
104 if so.
105 */
106ENTRY(xen_restore_fl_direct)
107 BUG
108
109 testb $X86_EFLAGS_IF>>8, %ah
110 setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
111 /* Preempt here doesn't matter because that will deal with
112 any pending interrupts. The pending check may end up being
113 run on the wrong CPU, but that doesn't hurt. */
114
115 /* check for unmasked and pending */
116 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
117 jz 1f
1182: call check_events
1191:
120ENDPATCH(xen_restore_fl_direct)
121 ret
122 ENDPROC(xen_restore_fl_direct)
123 RELOC(xen_restore_fl_direct, 2b+1)
124
125
126/*
127 Force an event check by making a hypercall,
128 but preserve regs before making the call.
129 */
130check_events:
131 push %rax
132 push %rcx
133 push %rdx
134 push %rsi
135 push %rdi
136 push %r8
137 push %r9
138 push %r10
139 push %r11
140 call xen_force_evtchn_callback
141 pop %r11
142 pop %r10
143 pop %r9
144 pop %r8
145 pop %rdi
146 pop %rsi
147 pop %rdx
148 pop %rcx
149 pop %rax
150 ret
151 22
152ENTRY(xen_adjust_exception_frame) 23ENTRY(xen_adjust_exception_frame)
153 mov 8+0(%rsp),%rcx 24 mov 8+0(%rsp), %rcx
154 mov 8+8(%rsp),%r11 25 mov 8+8(%rsp), %r11
155 ret $16 26 ret $16
156 27
157hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 28hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
158/* 29/*
159 Xen64 iret frame: 30 * Xen64 iret frame:
160 31 *
161 ss 32 * ss
162 rsp 33 * rsp
163 rflags 34 * rflags
164 cs 35 * cs
165 rip <-- standard iret frame 36 * rip <-- standard iret frame
166 37 *
167 flags 38 * flags
168 39 *
169 rcx } 40 * rcx }
170 r11 }<-- pushed by hypercall page 41 * r11 }<-- pushed by hypercall page
171rsp -> rax } 42 * rsp->rax }
172 */ 43 */
173ENTRY(xen_iret) 44ENTRY(xen_iret)
174 pushq $0 45 pushq $0
@@ -177,8 +48,8 @@ ENDPATCH(xen_iret)
177RELOC(xen_iret, 1b+1) 48RELOC(xen_iret, 1b+1)
178 49
179/* 50/*
180 sysexit is not used for 64-bit processes, so it's 51 * sysexit is not used for 64-bit processes, so it's only ever used to
181 only ever used to return to 32-bit compat userspace. 52 * return to 32-bit compat userspace.
182 */ 53 */
183ENTRY(xen_sysexit) 54ENTRY(xen_sysexit)
184 pushq $__USER32_DS 55 pushq $__USER32_DS
@@ -193,13 +64,15 @@ ENDPATCH(xen_sysexit)
193RELOC(xen_sysexit, 1b+1) 64RELOC(xen_sysexit, 1b+1)
194 65
195ENTRY(xen_sysret64) 66ENTRY(xen_sysret64)
196 /* We're already on the usermode stack at this point, but still 67 /*
197 with the kernel gs, so we can easily switch back */ 68 * We're already on the usermode stack at this point, but
198 movq %rsp, %gs:pda_oldrsp 69 * still with the kernel gs, so we can easily switch back
199 movq %gs:pda_kernelstack,%rsp 70 */
71 movq %rsp, PER_CPU_VAR(old_rsp)
72 movq PER_CPU_VAR(kernel_stack), %rsp
200 73
201 pushq $__USER_DS 74 pushq $__USER_DS
202 pushq %gs:pda_oldrsp 75 pushq PER_CPU_VAR(old_rsp)
203 pushq %r11 76 pushq %r11
204 pushq $__USER_CS 77 pushq $__USER_CS
205 pushq %rcx 78 pushq %rcx
@@ -210,13 +83,15 @@ ENDPATCH(xen_sysret64)
210RELOC(xen_sysret64, 1b+1) 83RELOC(xen_sysret64, 1b+1)
211 84
212ENTRY(xen_sysret32) 85ENTRY(xen_sysret32)
213 /* We're already on the usermode stack at this point, but still 86 /*
214 with the kernel gs, so we can easily switch back */ 87 * We're already on the usermode stack at this point, but
215 movq %rsp, %gs:pda_oldrsp 88 * still with the kernel gs, so we can easily switch back
216 movq %gs:pda_kernelstack, %rsp 89 */
90 movq %rsp, PER_CPU_VAR(old_rsp)
91 movq PER_CPU_VAR(kernel_stack), %rsp
217 92
218 pushq $__USER32_DS 93 pushq $__USER32_DS
219 pushq %gs:pda_oldrsp 94 pushq PER_CPU_VAR(old_rsp)
220 pushq %r11 95 pushq %r11
221 pushq $__USER32_CS 96 pushq $__USER32_CS
222 pushq %rcx 97 pushq %rcx
@@ -227,28 +102,27 @@ ENDPATCH(xen_sysret32)
227RELOC(xen_sysret32, 1b+1) 102RELOC(xen_sysret32, 1b+1)
228 103
229/* 104/*
230 Xen handles syscall callbacks much like ordinary exceptions, 105 * Xen handles syscall callbacks much like ordinary exceptions, which
231 which means we have: 106 * means we have:
232 - kernel gs 107 * - kernel gs
233 - kernel rsp 108 * - kernel rsp
234 - an iret-like stack frame on the stack (including rcx and r11): 109 * - an iret-like stack frame on the stack (including rcx and r11):
235 ss 110 * ss
236 rsp 111 * rsp
237 rflags 112 * rflags
238 cs 113 * cs
239 rip 114 * rip
240 r11 115 * r11
241 rsp-> rcx 116 * rsp->rcx
242 117 *
243 In all the entrypoints, we undo all that to make it look 118 * In all the entrypoints, we undo all that to make it look like a
244 like a CPU-generated syscall/sysenter and jump to the normal 119 * CPU-generated syscall/sysenter and jump to the normal entrypoint.
245 entrypoint.
246 */ 120 */
247 121
248.macro undo_xen_syscall 122.macro undo_xen_syscall
249 mov 0*8(%rsp),%rcx 123 mov 0*8(%rsp), %rcx
250 mov 1*8(%rsp),%r11 124 mov 1*8(%rsp), %r11
251 mov 5*8(%rsp),%rsp 125 mov 5*8(%rsp), %rsp
252.endm 126.endm
253 127
254/* Normal 64-bit system call target */ 128/* Normal 64-bit system call target */
@@ -275,7 +149,7 @@ ENDPROC(xen_sysenter_target)
275 149
276ENTRY(xen_syscall32_target) 150ENTRY(xen_syscall32_target)
277ENTRY(xen_sysenter_target) 151ENTRY(xen_sysenter_target)
278 lea 16(%rsp), %rsp /* strip %rcx,%r11 */ 152 lea 16(%rsp), %rsp /* strip %rcx, %r11 */
279 mov $-ENOSYS, %rax 153 mov $-ENOSYS, %rax
280 pushq $VGCF_in_syscall 154 pushq $VGCF_in_syscall
281 jmp hypercall_iret 155 jmp hypercall_iret
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 63d49a523ed3..1a5ff24e29c0 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -8,7 +8,7 @@
8 8
9#include <asm/boot.h> 9#include <asm/boot.h>
10#include <asm/asm.h> 10#include <asm/asm.h>
11#include <asm/page.h> 11#include <asm/page_types.h>
12 12
13#include <xen/interface/elfnote.h> 13#include <xen/interface/elfnote.h>
14#include <asm/xen/interface.h> 14#include <asm/xen/interface.h>
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index c1f8faf0a2c5..2f5ef2632ea2 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -10,9 +10,12 @@
10extern const char xen_hypervisor_callback[]; 10extern const char xen_hypervisor_callback[];
11extern const char xen_failsafe_callback[]; 11extern const char xen_failsafe_callback[];
12 12
13extern void *xen_initial_gdt;
14
13struct trap_info; 15struct trap_info;
14void xen_copy_trap_info(struct trap_info *traps); 16void xen_copy_trap_info(struct trap_info *traps);
15 17
18DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info);
16DECLARE_PER_CPU(unsigned long, xen_cr3); 19DECLARE_PER_CPU(unsigned long, xen_cr3);
17DECLARE_PER_CPU(unsigned long, xen_current_cr3); 20DECLARE_PER_CPU(unsigned long, xen_current_cr3);
18 21
@@ -22,6 +25,13 @@ extern struct shared_info *HYPERVISOR_shared_info;
22 25
23void xen_setup_mfn_list_list(void); 26void xen_setup_mfn_list_list(void);
24void xen_setup_shared_info(void); 27void xen_setup_shared_info(void);
28void xen_setup_machphys_mapping(void);
29pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
30void xen_ident_map_ISA(void);
31void xen_reserve_top(void);
32
33void xen_leave_lazy(void);
34void xen_post_allocator_init(void);
25 35
26char * __init xen_memory_setup(void); 36char * __init xen_memory_setup(void);
27void __init xen_arch_setup(void); 37void __init xen_arch_setup(void);
diff --git a/arch/xtensa/include/asm/ftrace.h b/arch/xtensa/include/asm/ftrace.h
new file mode 100644
index 000000000000..40a8c178f10d
--- /dev/null
+++ b/arch/xtensa/include/asm/ftrace.h
@@ -0,0 +1 @@
/* empty */
diff --git a/arch/xtensa/include/asm/swab.h b/arch/xtensa/include/asm/swab.h
index f50b697eb601..226a39162310 100644
--- a/arch/xtensa/include/asm/swab.h
+++ b/arch/xtensa/include/asm/swab.h
@@ -11,7 +11,7 @@
11#ifndef _XTENSA_SWAB_H 11#ifndef _XTENSA_SWAB_H
12#define _XTENSA_SWAB_H 12#define _XTENSA_SWAB_H
13 13
14#include <asm/types.h> 14#include <linux/types.h>
15#include <linux/compiler.h> 15#include <linux/compiler.h>
16 16
17#define __SWAB_64_THRU_32__ 17#define __SWAB_64_THRU_32__
diff --git a/block/Kconfig b/block/Kconfig
index 0cbb3b88b59a..e7d12782bcfb 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -44,22 +44,6 @@ config LBD
44 44
45 If unsure, say N. 45 If unsure, say N.
46 46
47config BLK_DEV_IO_TRACE
48 bool "Support for tracing block io actions"
49 depends on SYSFS
50 select RELAY
51 select DEBUG_FS
52 select TRACEPOINTS
53 help
54 Say Y here if you want to be able to trace the block layer actions
55 on a given queue. Tracing allows you to see any traffic happening
56 on a block device queue. For more information (and the userspace
57 support tools needed), fetch the blktrace tools from:
58
59 git://git.kernel.dk/blktrace.git
60
61 If unsure, say N.
62
63config BLK_DEV_BSG 47config BLK_DEV_BSG
64 bool "Block layer SG support v4 (EXPERIMENTAL)" 48 bool "Block layer SG support v4 (EXPERIMENTAL)"
65 depends on EXPERIMENTAL 49 depends on EXPERIMENTAL
diff --git a/block/Makefile b/block/Makefile
index bfe73049f939..e9fa4dd690f2 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -13,6 +13,5 @@ obj-$(CONFIG_IOSCHED_AS) += as-iosched.o
13obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o 13obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
14obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o 14obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
15 15
16obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
17obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o 16obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
18obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o 17obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o
diff --git a/drivers/acpi/acpica/tbxface.c b/drivers/acpi/acpica/tbxface.c
index c3e841f3cde9..ab0aff3c7d6a 100644
--- a/drivers/acpi/acpica/tbxface.c
+++ b/drivers/acpi/acpica/tbxface.c
@@ -365,7 +365,7 @@ ACPI_EXPORT_SYMBOL(acpi_unload_table_id)
365 365
366/******************************************************************************* 366/*******************************************************************************
367 * 367 *
368 * FUNCTION: acpi_get_table 368 * FUNCTION: acpi_get_table_with_size
369 * 369 *
370 * PARAMETERS: Signature - ACPI signature of needed table 370 * PARAMETERS: Signature - ACPI signature of needed table
371 * Instance - Which instance (for SSDTs) 371 * Instance - Which instance (for SSDTs)
@@ -377,8 +377,9 @@ ACPI_EXPORT_SYMBOL(acpi_unload_table_id)
377 * 377 *
378 *****************************************************************************/ 378 *****************************************************************************/
379acpi_status 379acpi_status
380acpi_get_table(char *signature, 380acpi_get_table_with_size(char *signature,
381 u32 instance, struct acpi_table_header **out_table) 381 u32 instance, struct acpi_table_header **out_table,
382 acpi_size *tbl_size)
382{ 383{
383 u32 i; 384 u32 i;
384 u32 j; 385 u32 j;
@@ -408,6 +409,7 @@ acpi_get_table(char *signature,
408 acpi_tb_verify_table(&acpi_gbl_root_table_list.tables[i]); 409 acpi_tb_verify_table(&acpi_gbl_root_table_list.tables[i]);
409 if (ACPI_SUCCESS(status)) { 410 if (ACPI_SUCCESS(status)) {
410 *out_table = acpi_gbl_root_table_list.tables[i].pointer; 411 *out_table = acpi_gbl_root_table_list.tables[i].pointer;
412 *tbl_size = acpi_gbl_root_table_list.tables[i].length;
411 } 413 }
412 414
413 if (!acpi_gbl_permanent_mmap) { 415 if (!acpi_gbl_permanent_mmap) {
@@ -420,6 +422,15 @@ acpi_get_table(char *signature,
420 return (AE_NOT_FOUND); 422 return (AE_NOT_FOUND);
421} 423}
422 424
425acpi_status
426acpi_get_table(char *signature,
427 u32 instance, struct acpi_table_header **out_table)
428{
429 acpi_size tbl_size;
430
431 return acpi_get_table_with_size(signature,
432 instance, out_table, &tbl_size);
433}
423ACPI_EXPORT_SYMBOL(acpi_get_table) 434ACPI_EXPORT_SYMBOL(acpi_get_table)
424 435
425/******************************************************************************* 436/*******************************************************************************
diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c
index 1e35f342957c..eb8980d67368 100644
--- a/drivers/acpi/osl.c
+++ b/drivers/acpi/osl.c
@@ -272,14 +272,21 @@ acpi_os_map_memory(acpi_physical_address phys, acpi_size size)
272} 272}
273EXPORT_SYMBOL_GPL(acpi_os_map_memory); 273EXPORT_SYMBOL_GPL(acpi_os_map_memory);
274 274
275void acpi_os_unmap_memory(void __iomem * virt, acpi_size size) 275void __ref acpi_os_unmap_memory(void __iomem *virt, acpi_size size)
276{ 276{
277 if (acpi_gbl_permanent_mmap) { 277 if (acpi_gbl_permanent_mmap)
278 iounmap(virt); 278 iounmap(virt);
279 } 279 else
280 __acpi_unmap_table(virt, size);
280} 281}
281EXPORT_SYMBOL_GPL(acpi_os_unmap_memory); 282EXPORT_SYMBOL_GPL(acpi_os_unmap_memory);
282 283
284void __init early_acpi_os_unmap_memory(void __iomem *virt, acpi_size size)
285{
286 if (!acpi_gbl_permanent_mmap)
287 __acpi_unmap_table(virt, size);
288}
289
283#ifdef ACPI_FUTURE_USAGE 290#ifdef ACPI_FUTURE_USAGE
284acpi_status 291acpi_status
285acpi_os_get_physical_address(void *virt, acpi_physical_address * phys) 292acpi_os_get_physical_address(void *virt, acpi_physical_address * phys)
diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c
index 9cc769b587ff..68fd3d292799 100644
--- a/drivers/acpi/processor_perflib.c
+++ b/drivers/acpi/processor_perflib.c
@@ -516,12 +516,12 @@ int acpi_processor_preregister_performance(
516 continue; 516 continue;
517 } 517 }
518 518
519 if (!performance || !percpu_ptr(performance, i)) { 519 if (!performance || !per_cpu_ptr(performance, i)) {
520 retval = -EINVAL; 520 retval = -EINVAL;
521 continue; 521 continue;
522 } 522 }
523 523
524 pr->performance = percpu_ptr(performance, i); 524 pr->performance = per_cpu_ptr(performance, i);
525 cpumask_set_cpu(i, pr->performance->shared_cpu_map); 525 cpumask_set_cpu(i, pr->performance->shared_cpu_map);
526 if (acpi_processor_get_psd(pr)) { 526 if (acpi_processor_get_psd(pr)) {
527 retval = -EINVAL; 527 retval = -EINVAL;
diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c
index a8852952fac4..fec1ae36d431 100644
--- a/drivers/acpi/tables.c
+++ b/drivers/acpi/tables.c
@@ -181,14 +181,15 @@ acpi_table_parse_entries(char *id,
181 struct acpi_subtable_header *entry; 181 struct acpi_subtable_header *entry;
182 unsigned int count = 0; 182 unsigned int count = 0;
183 unsigned long table_end; 183 unsigned long table_end;
184 acpi_size tbl_size;
184 185
185 if (!handler) 186 if (!handler)
186 return -EINVAL; 187 return -EINVAL;
187 188
188 if (strncmp(id, ACPI_SIG_MADT, 4) == 0) 189 if (strncmp(id, ACPI_SIG_MADT, 4) == 0)
189 acpi_get_table(id, acpi_apic_instance, &table_header); 190 acpi_get_table_with_size(id, acpi_apic_instance, &table_header, &tbl_size);
190 else 191 else
191 acpi_get_table(id, 0, &table_header); 192 acpi_get_table_with_size(id, 0, &table_header, &tbl_size);
192 193
193 if (!table_header) { 194 if (!table_header) {
194 printk(KERN_WARNING PREFIX "%4.4s not present\n", id); 195 printk(KERN_WARNING PREFIX "%4.4s not present\n", id);
@@ -206,8 +207,10 @@ acpi_table_parse_entries(char *id,
206 table_end) { 207 table_end) {
207 if (entry->type == entry_id 208 if (entry->type == entry_id
208 && (!max_entries || count++ < max_entries)) 209 && (!max_entries || count++ < max_entries))
209 if (handler(entry, table_end)) 210 if (handler(entry, table_end)) {
211 early_acpi_os_unmap_memory((char *)table_header, tbl_size);
210 return -EINVAL; 212 return -EINVAL;
213 }
211 214
212 entry = (struct acpi_subtable_header *) 215 entry = (struct acpi_subtable_header *)
213 ((unsigned long)entry + entry->length); 216 ((unsigned long)entry + entry->length);
@@ -217,6 +220,7 @@ acpi_table_parse_entries(char *id,
217 "%i found\n", id, entry_id, count - max_entries, count); 220 "%i found\n", id, entry_id, count - max_entries, count);
218 } 221 }
219 222
223 early_acpi_os_unmap_memory((char *)table_header, tbl_size);
220 return count; 224 return count;
221} 225}
222 226
@@ -241,17 +245,19 @@ acpi_table_parse_madt(enum acpi_madt_type id,
241int __init acpi_table_parse(char *id, acpi_table_handler handler) 245int __init acpi_table_parse(char *id, acpi_table_handler handler)
242{ 246{
243 struct acpi_table_header *table = NULL; 247 struct acpi_table_header *table = NULL;
248 acpi_size tbl_size;
244 249
245 if (!handler) 250 if (!handler)
246 return -EINVAL; 251 return -EINVAL;
247 252
248 if (strncmp(id, ACPI_SIG_MADT, 4) == 0) 253 if (strncmp(id, ACPI_SIG_MADT, 4) == 0)
249 acpi_get_table(id, acpi_apic_instance, &table); 254 acpi_get_table_with_size(id, acpi_apic_instance, &table, &tbl_size);
250 else 255 else
251 acpi_get_table(id, 0, &table); 256 acpi_get_table_with_size(id, 0, &table, &tbl_size);
252 257
253 if (table) { 258 if (table) {
254 handler(table); 259 handler(table);
260 early_acpi_os_unmap_memory(table, tbl_size);
255 return 0; 261 return 0;
256 } else 262 } else
257 return 1; 263 return 1;
@@ -265,8 +271,9 @@ int __init acpi_table_parse(char *id, acpi_table_handler handler)
265static void __init check_multiple_madt(void) 271static void __init check_multiple_madt(void)
266{ 272{
267 struct acpi_table_header *table = NULL; 273 struct acpi_table_header *table = NULL;
274 acpi_size tbl_size;
268 275
269 acpi_get_table(ACPI_SIG_MADT, 2, &table); 276 acpi_get_table_with_size(ACPI_SIG_MADT, 2, &table, &tbl_size);
270 if (table) { 277 if (table) {
271 printk(KERN_WARNING PREFIX 278 printk(KERN_WARNING PREFIX
272 "BIOS bug: multiple APIC/MADT found," 279 "BIOS bug: multiple APIC/MADT found,"
@@ -275,6 +282,7 @@ static void __init check_multiple_madt(void)
275 "If \"acpi_apic_instance=%d\" works better, " 282 "If \"acpi_apic_instance=%d\" works better, "
276 "notify linux-acpi@vger.kernel.org\n", 283 "notify linux-acpi@vger.kernel.org\n",
277 acpi_apic_instance ? 0 : 2); 284 acpi_apic_instance ? 0 : 2);
285 early_acpi_os_unmap_memory(table, tbl_size);
278 286
279 } else 287 } else
280 acpi_apic_instance = 0; 288 acpi_apic_instance = 0;
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 719ee5c1c8d9..5b257a57bc57 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -107,7 +107,7 @@ static SYSDEV_ATTR(crash_notes, 0400, show_crash_notes, NULL);
107/* 107/*
108 * Print cpu online, possible, present, and system maps 108 * Print cpu online, possible, present, and system maps
109 */ 109 */
110static ssize_t print_cpus_map(char *buf, cpumask_t *map) 110static ssize_t print_cpus_map(char *buf, const struct cpumask *map)
111{ 111{
112 int n = cpulist_scnprintf(buf, PAGE_SIZE-2, map); 112 int n = cpulist_scnprintf(buf, PAGE_SIZE-2, map);
113 113
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index a778fb52b11f..bf6b13206d00 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -31,7 +31,10 @@
31#include <linux/hardirq.h> 31#include <linux/hardirq.h>
32#include <linux/topology.h> 32#include <linux/topology.h>
33 33
34#define define_one_ro(_name) \ 34#define define_one_ro_named(_name, _func) \
35static SYSDEV_ATTR(_name, 0444, _func, NULL)
36
37#define define_one_ro(_name) \
35static SYSDEV_ATTR(_name, 0444, show_##_name, NULL) 38static SYSDEV_ATTR(_name, 0444, show_##_name, NULL)
36 39
37#define define_id_show_func(name) \ 40#define define_id_show_func(name) \
@@ -42,8 +45,8 @@ static ssize_t show_##name(struct sys_device *dev, \
42 return sprintf(buf, "%d\n", topology_##name(cpu)); \ 45 return sprintf(buf, "%d\n", topology_##name(cpu)); \
43} 46}
44 47
45#if defined(topology_thread_siblings) || defined(topology_core_siblings) 48#if defined(topology_thread_cpumask) || defined(topology_core_cpumask)
46static ssize_t show_cpumap(int type, cpumask_t *mask, char *buf) 49static ssize_t show_cpumap(int type, const struct cpumask *mask, char *buf)
47{ 50{
48 ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf; 51 ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf;
49 int n = 0; 52 int n = 0;
@@ -65,7 +68,7 @@ static ssize_t show_##name(struct sys_device *dev, \
65 struct sysdev_attribute *attr, char *buf) \ 68 struct sysdev_attribute *attr, char *buf) \
66{ \ 69{ \
67 unsigned int cpu = dev->id; \ 70 unsigned int cpu = dev->id; \
68 return show_cpumap(0, &(topology_##name(cpu)), buf); \ 71 return show_cpumap(0, topology_##name(cpu), buf); \
69} 72}
70 73
71#define define_siblings_show_list(name) \ 74#define define_siblings_show_list(name) \
@@ -74,7 +77,7 @@ static ssize_t show_##name##_list(struct sys_device *dev, \
74 char *buf) \ 77 char *buf) \
75{ \ 78{ \
76 unsigned int cpu = dev->id; \ 79 unsigned int cpu = dev->id; \
77 return show_cpumap(1, &(topology_##name(cpu)), buf); \ 80 return show_cpumap(1, topology_##name(cpu), buf); \
78} 81}
79 82
80#else 83#else
@@ -82,9 +85,7 @@ static ssize_t show_##name##_list(struct sys_device *dev, \
82static ssize_t show_##name(struct sys_device *dev, \ 85static ssize_t show_##name(struct sys_device *dev, \
83 struct sysdev_attribute *attr, char *buf) \ 86 struct sysdev_attribute *attr, char *buf) \
84{ \ 87{ \
85 unsigned int cpu = dev->id; \ 88 return show_cpumap(0, topology_##name(dev->id), buf); \
86 cpumask_t mask = topology_##name(cpu); \
87 return show_cpumap(0, &mask, buf); \
88} 89}
89 90
90#define define_siblings_show_list(name) \ 91#define define_siblings_show_list(name) \
@@ -92,9 +93,7 @@ static ssize_t show_##name##_list(struct sys_device *dev, \
92 struct sysdev_attribute *attr, \ 93 struct sysdev_attribute *attr, \
93 char *buf) \ 94 char *buf) \
94{ \ 95{ \
95 unsigned int cpu = dev->id; \ 96 return show_cpumap(1, topology_##name(dev->id), buf); \
96 cpumask_t mask = topology_##name(cpu); \
97 return show_cpumap(1, &mask, buf); \
98} 97}
99#endif 98#endif
100 99
@@ -107,13 +106,13 @@ define_one_ro(physical_package_id);
107define_id_show_func(core_id); 106define_id_show_func(core_id);
108define_one_ro(core_id); 107define_one_ro(core_id);
109 108
110define_siblings_show_func(thread_siblings); 109define_siblings_show_func(thread_cpumask);
111define_one_ro(thread_siblings); 110define_one_ro_named(thread_siblings, show_thread_cpumask);
112define_one_ro(thread_siblings_list); 111define_one_ro_named(thread_siblings_list, show_thread_cpumask_list);
113 112
114define_siblings_show_func(core_siblings); 113define_siblings_show_func(core_cpumask);
115define_one_ro(core_siblings); 114define_one_ro_named(core_siblings, show_core_cpumask);
116define_one_ro(core_siblings_list); 115define_one_ro_named(core_siblings_list, show_core_cpumask_list);
117 116
118static struct attribute *default_attrs[] = { 117static struct attribute *default_attrs[] = {
119 &attr_physical_package_id.attr, 118 &attr_physical_package_id.attr,
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index 33a9351c896d..30659ce9bcf4 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -283,7 +283,7 @@ static void sysrq_ftrace_dump(int key, struct tty_struct *tty)
283} 283}
284static struct sysrq_key_op sysrq_ftrace_dump_op = { 284static struct sysrq_key_op sysrq_ftrace_dump_op = {
285 .handler = sysrq_ftrace_dump, 285 .handler = sysrq_ftrace_dump,
286 .help_msg = "dumpZ-ftrace-buffer", 286 .help_msg = "dump-ftrace-buffer(Z)",
287 .action_msg = "Dump ftrace buffer", 287 .action_msg = "Dump ftrace buffer",
288 .enable_mask = SYSRQ_ENABLE_DUMP, 288 .enable_mask = SYSRQ_ENABLE_DUMP,
289}; 289};
diff --git a/drivers/clocksource/acpi_pm.c b/drivers/clocksource/acpi_pm.c
index e1129fad96dd..ee19b6e8fcb4 100644
--- a/drivers/clocksource/acpi_pm.c
+++ b/drivers/clocksource/acpi_pm.c
@@ -143,7 +143,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_LE,
143#endif 143#endif
144 144
145#ifndef CONFIG_X86_64 145#ifndef CONFIG_X86_64
146#include "mach_timer.h" 146#include <asm/mach_timer.h>
147#define PMTMR_EXPECTED_RATE \ 147#define PMTMR_EXPECTED_RATE \
148 ((CALIBRATE_LATCH * (PMTMR_TICKS_PER_SEC >> 10)) / (CLOCK_TICK_RATE>>10)) 148 ((CALIBRATE_LATCH * (PMTMR_TICKS_PER_SEC >> 10)) / (CLOCK_TICK_RATE>>10))
149/* 149/*
diff --git a/drivers/clocksource/cyclone.c b/drivers/clocksource/cyclone.c
index 1bde303b970b..8615059a8729 100644
--- a/drivers/clocksource/cyclone.c
+++ b/drivers/clocksource/cyclone.c
@@ -7,7 +7,7 @@
7#include <asm/pgtable.h> 7#include <asm/pgtable.h>
8#include <asm/io.h> 8#include <asm/io.h>
9 9
10#include "mach_timer.h" 10#include <asm/mach_timer.h>
11 11
12#define CYCLONE_CBAR_ADDR 0xFEB00CD0 /* base address ptr */ 12#define CYCLONE_CBAR_ADDR 0xFEB00CD0 /* base address ptr */
13#define CYCLONE_PMCC_OFFSET 0x51A0 /* offset to control register */ 13#define CYCLONE_PMCC_OFFSET 0x51A0 /* offset to control register */
diff --git a/drivers/eisa/Kconfig b/drivers/eisa/Kconfig
index c0646576cf47..2705284f6223 100644
--- a/drivers/eisa/Kconfig
+++ b/drivers/eisa/Kconfig
@@ -3,7 +3,7 @@
3# 3#
4config EISA_VLB_PRIMING 4config EISA_VLB_PRIMING
5 bool "Vesa Local Bus priming" 5 bool "Vesa Local Bus priming"
6 depends on X86_PC && EISA 6 depends on X86 && EISA
7 default n 7 default n
8 ---help--- 8 ---help---
9 Activate this option if your system contains a Vesa Local 9 Activate this option if your system contains a Vesa Local
@@ -24,11 +24,11 @@ config EISA_PCI_EISA
24 When in doubt, say Y. 24 When in doubt, say Y.
25 25
26# Using EISA_VIRTUAL_ROOT on something other than an Alpha or 26# Using EISA_VIRTUAL_ROOT on something other than an Alpha or
27# an X86_PC may lead to crashes... 27# an X86 may lead to crashes...
28 28
29config EISA_VIRTUAL_ROOT 29config EISA_VIRTUAL_ROOT
30 bool "EISA virtual root device" 30 bool "EISA virtual root device"
31 depends on EISA && (ALPHA || X86_PC) 31 depends on EISA && (ALPHA || X86)
32 default y 32 default y
33 ---help--- 33 ---help---
34 Activate this option if your system only have EISA bus 34 Activate this option if your system only have EISA bus
diff --git a/drivers/firmware/dcdbas.c b/drivers/firmware/dcdbas.c
index 777fba48d2d3..3009e0171e54 100644
--- a/drivers/firmware/dcdbas.c
+++ b/drivers/firmware/dcdbas.c
@@ -244,7 +244,7 @@ static ssize_t host_control_on_shutdown_store(struct device *dev,
244 */ 244 */
245int dcdbas_smi_request(struct smi_cmd *smi_cmd) 245int dcdbas_smi_request(struct smi_cmd *smi_cmd)
246{ 246{
247 cpumask_t old_mask; 247 cpumask_var_t old_mask;
248 int ret = 0; 248 int ret = 0;
249 249
250 if (smi_cmd->magic != SMI_CMD_MAGIC) { 250 if (smi_cmd->magic != SMI_CMD_MAGIC) {
@@ -254,8 +254,11 @@ int dcdbas_smi_request(struct smi_cmd *smi_cmd)
254 } 254 }
255 255
256 /* SMI requires CPU 0 */ 256 /* SMI requires CPU 0 */
257 old_mask = current->cpus_allowed; 257 if (!alloc_cpumask_var(&old_mask, GFP_KERNEL))
258 set_cpus_allowed_ptr(current, &cpumask_of_cpu(0)); 258 return -ENOMEM;
259
260 cpumask_copy(old_mask, &current->cpus_allowed);
261 set_cpus_allowed_ptr(current, cpumask_of(0));
259 if (smp_processor_id() != 0) { 262 if (smp_processor_id() != 0) {
260 dev_dbg(&dcdbas_pdev->dev, "%s: failed to get CPU 0\n", 263 dev_dbg(&dcdbas_pdev->dev, "%s: failed to get CPU 0\n",
261 __func__); 264 __func__);
@@ -275,7 +278,8 @@ int dcdbas_smi_request(struct smi_cmd *smi_cmd)
275 ); 278 );
276 279
277out: 280out:
278 set_cpus_allowed_ptr(current, &old_mask); 281 set_cpus_allowed_ptr(current, old_mask);
282 free_cpumask_var(old_mask);
279 return ret; 283 return ret;
280} 284}
281 285
diff --git a/drivers/firmware/iscsi_ibft.c b/drivers/firmware/iscsi_ibft.c
index 3ab3e4a41d67..7b7ddc2d51c9 100644
--- a/drivers/firmware/iscsi_ibft.c
+++ b/drivers/firmware/iscsi_ibft.c
@@ -938,8 +938,8 @@ static int __init ibft_init(void)
938 return -ENOMEM; 938 return -ENOMEM;
939 939
940 if (ibft_addr) { 940 if (ibft_addr) {
941 printk(KERN_INFO "iBFT detected at 0x%lx.\n", 941 printk(KERN_INFO "iBFT detected at 0x%llx.\n",
942 virt_to_phys((void *)ibft_addr)); 942 (u64)virt_to_phys((void *)ibft_addr));
943 943
944 rc = ibft_check_device(); 944 rc = ibft_check_device();
945 if (rc) 945 if (rc)
diff --git a/drivers/gpu/drm/drm_proc.c b/drivers/gpu/drm/drm_proc.c
index 8df849f66830..b756f043a5f4 100644
--- a/drivers/gpu/drm/drm_proc.c
+++ b/drivers/gpu/drm/drm_proc.c
@@ -678,9 +678,9 @@ static int drm__vma_info(char *buf, char **start, off_t offset, int request,
678 *start = &buf[offset]; 678 *start = &buf[offset];
679 *eof = 0; 679 *eof = 0;
680 680
681 DRM_PROC_PRINT("vma use count: %d, high_memory = %p, 0x%08lx\n", 681 DRM_PROC_PRINT("vma use count: %d, high_memory = %p, 0x%llx\n",
682 atomic_read(&dev->vma_count), 682 atomic_read(&dev->vma_count),
683 high_memory, virt_to_phys(high_memory)); 683 high_memory, (u64)virt_to_phys(high_memory));
684 list_for_each_entry(pt, &dev->vmalist, head) { 684 list_for_each_entry(pt, &dev->vmalist, head) {
685 if (!(vma = pt->vma)) 685 if (!(vma = pt->vma))
686 continue; 686 continue;
diff --git a/drivers/input/keyboard/Kconfig b/drivers/input/keyboard/Kconfig
index 35561689ff38..ea2638b41982 100644
--- a/drivers/input/keyboard/Kconfig
+++ b/drivers/input/keyboard/Kconfig
@@ -13,11 +13,11 @@ menuconfig INPUT_KEYBOARD
13if INPUT_KEYBOARD 13if INPUT_KEYBOARD
14 14
15config KEYBOARD_ATKBD 15config KEYBOARD_ATKBD
16 tristate "AT keyboard" if EMBEDDED || !X86_PC 16 tristate "AT keyboard" if EMBEDDED || !X86
17 default y 17 default y
18 select SERIO 18 select SERIO
19 select SERIO_LIBPS2 19 select SERIO_LIBPS2
20 select SERIO_I8042 if X86_PC 20 select SERIO_I8042 if X86
21 select SERIO_GSCPS2 if GSC 21 select SERIO_GSCPS2 if GSC
22 help 22 help
23 Say Y here if you want to use a standard AT or PS/2 keyboard. Usually 23 Say Y here if you want to use a standard AT or PS/2 keyboard. Usually
diff --git a/drivers/input/mouse/Kconfig b/drivers/input/mouse/Kconfig
index 9705f3a00a3d..4f38e6f7dfdd 100644
--- a/drivers/input/mouse/Kconfig
+++ b/drivers/input/mouse/Kconfig
@@ -17,7 +17,7 @@ config MOUSE_PS2
17 default y 17 default y
18 select SERIO 18 select SERIO
19 select SERIO_LIBPS2 19 select SERIO_LIBPS2
20 select SERIO_I8042 if X86_PC 20 select SERIO_I8042 if X86
21 select SERIO_GSCPS2 if GSC 21 select SERIO_GSCPS2 if GSC
22 help 22 help
23 Say Y here if you have a PS/2 mouse connected to your system. This 23 Say Y here if you have a PS/2 mouse connected to your system. This
diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig
index 76f2b36881c3..a3d3cbab359a 100644
--- a/drivers/lguest/Kconfig
+++ b/drivers/lguest/Kconfig
@@ -1,6 +1,6 @@
1config LGUEST 1config LGUEST
2 tristate "Linux hypervisor example code" 2 tristate "Linux hypervisor example code"
3 depends on X86_32 && EXPERIMENTAL && !X86_PAE && FUTEX && !X86_VOYAGER 3 depends on X86_32 && EXPERIMENTAL && !X86_PAE && FUTEX
4 select HVC_DRIVER 4 select HVC_DRIVER
5 ---help--- 5 ---help---
6 This is a very simple module which allows you to run 6 This is a very simple module which allows you to run
diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index c64e6798878a..1c484084ed4f 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -162,7 +162,7 @@ config ENCLOSURE_SERVICES
162config SGI_XP 162config SGI_XP
163 tristate "Support communication between SGI SSIs" 163 tristate "Support communication between SGI SSIs"
164 depends on NET 164 depends on NET
165 depends on (IA64_GENERIC || IA64_SGI_SN2 || IA64_SGI_UV || X86_64) && SMP 165 depends on (IA64_GENERIC || IA64_SGI_SN2 || IA64_SGI_UV || X86_UV) && SMP
166 select IA64_UNCACHED_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2 166 select IA64_UNCACHED_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2
167 select GENERIC_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2 167 select GENERIC_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2
168 select SGI_GRU if (IA64_GENERIC || IA64_SGI_UV || X86_64) && SMP 168 select SGI_GRU if (IA64_GENERIC || IA64_SGI_UV || X86_64) && SMP
@@ -189,7 +189,7 @@ config HP_ILO
189 189
190config SGI_GRU 190config SGI_GRU
191 tristate "SGI GRU driver" 191 tristate "SGI GRU driver"
192 depends on (X86_64 || IA64_SGI_UV || IA64_GENERIC) && SMP 192 depends on (X86_UV || IA64_SGI_UV || IA64_GENERIC) && SMP
193 default n 193 default n
194 select MMU_NOTIFIER 194 select MMU_NOTIFIER
195 ---help--- 195 ---help---
diff --git a/drivers/misc/sgi-gru/grufile.c b/drivers/misc/sgi-gru/grufile.c
index 650983806392..c67e4e8bd62c 100644
--- a/drivers/misc/sgi-gru/grufile.c
+++ b/drivers/misc/sgi-gru/grufile.c
@@ -36,23 +36,11 @@
36#include <linux/interrupt.h> 36#include <linux/interrupt.h>
37#include <linux/proc_fs.h> 37#include <linux/proc_fs.h>
38#include <linux/uaccess.h> 38#include <linux/uaccess.h>
39#include <asm/uv/uv.h>
39#include "gru.h" 40#include "gru.h"
40#include "grulib.h" 41#include "grulib.h"
41#include "grutables.h" 42#include "grutables.h"
42 43
43#if defined CONFIG_X86_64
44#include <asm/genapic.h>
45#include <asm/irq.h>
46#define IS_UV() is_uv_system()
47#elif defined CONFIG_IA64
48#include <asm/system.h>
49#include <asm/sn/simulator.h>
50/* temp support for running on hardware simulator */
51#define IS_UV() IS_MEDUSA() || ia64_platform_is("uv")
52#else
53#define IS_UV() 0
54#endif
55
56#include <asm/uv/uv_hub.h> 44#include <asm/uv/uv_hub.h>
57#include <asm/uv/uv_mmrs.h> 45#include <asm/uv/uv_mmrs.h>
58 46
@@ -381,7 +369,7 @@ static int __init gru_init(void)
381 char id[10]; 369 char id[10];
382 void *gru_start_vaddr; 370 void *gru_start_vaddr;
383 371
384 if (!IS_UV()) 372 if (!is_uv_system())
385 return 0; 373 return 0;
386 374
387#if defined CONFIG_IA64 375#if defined CONFIG_IA64
@@ -451,7 +439,7 @@ static void __exit gru_exit(void)
451 int order = get_order(sizeof(struct gru_state) * 439 int order = get_order(sizeof(struct gru_state) *
452 GRU_CHIPLETS_PER_BLADE); 440 GRU_CHIPLETS_PER_BLADE);
453 441
454 if (!IS_UV()) 442 if (!is_uv_system())
455 return; 443 return;
456 444
457 for (i = 0; i < GRU_CHIPLETS_PER_BLADE; i++) 445 for (i = 0; i < GRU_CHIPLETS_PER_BLADE; i++)
diff --git a/drivers/misc/sgi-xp/xp.h b/drivers/misc/sgi-xp/xp.h
index 7b4cbd5e03e9..2275126cb334 100644
--- a/drivers/misc/sgi-xp/xp.h
+++ b/drivers/misc/sgi-xp/xp.h
@@ -15,19 +15,19 @@
15 15
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17 17
18#ifdef CONFIG_IA64 18#if defined CONFIG_X86_UV || defined CONFIG_IA64_SGI_UV
19#include <asm/uv/uv.h>
20#define is_uv() is_uv_system()
21#endif
22
23#ifndef is_uv
24#define is_uv() 0
25#endif
26
27#if defined CONFIG_IA64
19#include <asm/system.h> 28#include <asm/system.h>
20#include <asm/sn/arch.h> /* defines is_shub1() and is_shub2() */ 29#include <asm/sn/arch.h> /* defines is_shub1() and is_shub2() */
21#define is_shub() ia64_platform_is("sn2") 30#define is_shub() ia64_platform_is("sn2")
22#ifdef CONFIG_IA64_SGI_UV
23#define is_uv() ia64_platform_is("uv")
24#else
25#define is_uv() 0
26#endif
27#endif
28#ifdef CONFIG_X86_64
29#include <asm/genapic.h>
30#define is_uv() is_uv_system()
31#endif 31#endif
32 32
33#ifndef is_shub1 33#ifndef is_shub1
@@ -42,10 +42,6 @@
42#define is_shub() 0 42#define is_shub() 0
43#endif 43#endif
44 44
45#ifndef is_uv
46#define is_uv() 0
47#endif
48
49#ifdef USE_DBUG_ON 45#ifdef USE_DBUG_ON
50#define DBUG_ON(condition) BUG_ON(condition) 46#define DBUG_ON(condition) BUG_ON(condition)
51#else 47#else
diff --git a/drivers/misc/sgi-xp/xpc_main.c b/drivers/misc/sgi-xp/xpc_main.c
index 89218f7cfaa7..6576170de962 100644
--- a/drivers/misc/sgi-xp/xpc_main.c
+++ b/drivers/misc/sgi-xp/xpc_main.c
@@ -318,7 +318,7 @@ xpc_hb_checker(void *ignore)
318 318
319 /* this thread was marked active by xpc_hb_init() */ 319 /* this thread was marked active by xpc_hb_init() */
320 320
321 set_cpus_allowed_ptr(current, &cpumask_of_cpu(XPC_HB_CHECK_CPU)); 321 set_cpus_allowed_ptr(current, cpumask_of(XPC_HB_CHECK_CPU));
322 322
323 /* set our heartbeating to other partitions into motion */ 323 /* set our heartbeating to other partitions into motion */
324 xpc_hb_check_timeout = jiffies + (xpc_hb_check_interval * HZ); 324 xpc_hb_check_timeout = jiffies + (xpc_hb_check_interval * HZ);
diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig
index 8b12e6e109d3..2ff88791cebc 100644
--- a/drivers/mtd/nand/Kconfig
+++ b/drivers/mtd/nand/Kconfig
@@ -273,7 +273,7 @@ config MTD_NAND_CAFE
273 273
274config MTD_NAND_CS553X 274config MTD_NAND_CS553X
275 tristate "NAND support for CS5535/CS5536 (AMD Geode companion chip)" 275 tristate "NAND support for CS5535/CS5536 (AMD Geode companion chip)"
276 depends on X86_32 && (X86_PC || X86_GENERICARCH) 276 depends on X86_32
277 help 277 help
278 The CS553x companion chips for the AMD Geode processor 278 The CS553x companion chips for the AMD Geode processor
279 include NAND flash controllers with built-in hardware ECC 279 include NAND flash controllers with built-in hardware ECC
diff --git a/drivers/net/ne3210.c b/drivers/net/ne3210.c
index fac43fd6fc87..6a843f7350ab 100644
--- a/drivers/net/ne3210.c
+++ b/drivers/net/ne3210.c
@@ -150,7 +150,8 @@ static int __init ne3210_eisa_probe (struct device *device)
150 if (phys_mem < virt_to_phys(high_memory)) { 150 if (phys_mem < virt_to_phys(high_memory)) {
151 printk(KERN_CRIT "ne3210.c: Card RAM overlaps with normal memory!!!\n"); 151 printk(KERN_CRIT "ne3210.c: Card RAM overlaps with normal memory!!!\n");
152 printk(KERN_CRIT "ne3210.c: Use EISA SCU to set card memory below 1MB,\n"); 152 printk(KERN_CRIT "ne3210.c: Use EISA SCU to set card memory below 1MB,\n");
153 printk(KERN_CRIT "ne3210.c: or to an address above 0x%lx.\n", virt_to_phys(high_memory)); 153 printk(KERN_CRIT "ne3210.c: or to an address above 0x%llx.\n",
154 (u64)virt_to_phys(high_memory));
154 printk(KERN_CRIT "ne3210.c: Driver NOT installed.\n"); 155 printk(KERN_CRIT "ne3210.c: Driver NOT installed.\n");
155 retval = -EINVAL; 156 retval = -EINVAL;
156 goto out3; 157 goto out3;
diff --git a/drivers/net/sfc/efx.c b/drivers/net/sfc/efx.c
index ab0e09bf154d..847e9bb0098f 100644
--- a/drivers/net/sfc/efx.c
+++ b/drivers/net/sfc/efx.c
@@ -854,20 +854,27 @@ static void efx_fini_io(struct efx_nic *efx)
854 * interrupts across them. */ 854 * interrupts across them. */
855static int efx_wanted_rx_queues(void) 855static int efx_wanted_rx_queues(void)
856{ 856{
857 cpumask_t core_mask; 857 cpumask_var_t core_mask;
858 int count; 858 int count;
859 int cpu; 859 int cpu;
860 860
861 cpus_clear(core_mask); 861 if (!alloc_cpumask_var(&core_mask, GFP_KERNEL)) {
862 printk(KERN_WARNING
863 "efx.c: allocation failure, irq balancing hobbled\n");
864 return 1;
865 }
866
867 cpumask_clear(core_mask);
862 count = 0; 868 count = 0;
863 for_each_online_cpu(cpu) { 869 for_each_online_cpu(cpu) {
864 if (!cpu_isset(cpu, core_mask)) { 870 if (!cpumask_test_cpu(cpu, core_mask)) {
865 ++count; 871 ++count;
866 cpus_or(core_mask, core_mask, 872 cpumask_or(core_mask, core_mask,
867 topology_core_siblings(cpu)); 873 topology_core_cpumask(cpu));
868 } 874 }
869 } 875 }
870 876
877 free_cpumask_var(core_mask);
871 return count; 878 return count;
872} 879}
873 880
diff --git a/drivers/net/sfc/falcon.c b/drivers/net/sfc/falcon.c
index d5378e60fcdd..064307c2277e 100644
--- a/drivers/net/sfc/falcon.c
+++ b/drivers/net/sfc/falcon.c
@@ -338,10 +338,10 @@ static int falcon_alloc_special_buffer(struct efx_nic *efx,
338 nic_data->next_buffer_table += buffer->entries; 338 nic_data->next_buffer_table += buffer->entries;
339 339
340 EFX_LOG(efx, "allocating special buffers %d-%d at %llx+%x " 340 EFX_LOG(efx, "allocating special buffers %d-%d at %llx+%x "
341 "(virt %p phys %lx)\n", buffer->index, 341 "(virt %p phys %llx)\n", buffer->index,
342 buffer->index + buffer->entries - 1, 342 buffer->index + buffer->entries - 1,
343 (unsigned long long)buffer->dma_addr, len, 343 (u64)buffer->dma_addr, len,
344 buffer->addr, virt_to_phys(buffer->addr)); 344 buffer->addr, (u64)virt_to_phys(buffer->addr));
345 345
346 return 0; 346 return 0;
347} 347}
@@ -353,10 +353,10 @@ static void falcon_free_special_buffer(struct efx_nic *efx,
353 return; 353 return;
354 354
355 EFX_LOG(efx, "deallocating special buffers %d-%d at %llx+%x " 355 EFX_LOG(efx, "deallocating special buffers %d-%d at %llx+%x "
356 "(virt %p phys %lx)\n", buffer->index, 356 "(virt %p phys %llx)\n", buffer->index,
357 buffer->index + buffer->entries - 1, 357 buffer->index + buffer->entries - 1,
358 (unsigned long long)buffer->dma_addr, buffer->len, 358 (u64)buffer->dma_addr, buffer->len,
359 buffer->addr, virt_to_phys(buffer->addr)); 359 buffer->addr, (u64)virt_to_phys(buffer->addr));
360 360
361 pci_free_consistent(efx->pci_dev, buffer->len, buffer->addr, 361 pci_free_consistent(efx->pci_dev, buffer->len, buffer->addr,
362 buffer->dma_addr); 362 buffer->dma_addr);
@@ -2343,10 +2343,10 @@ int falcon_probe_port(struct efx_nic *efx)
2343 FALCON_MAC_STATS_SIZE); 2343 FALCON_MAC_STATS_SIZE);
2344 if (rc) 2344 if (rc)
2345 return rc; 2345 return rc;
2346 EFX_LOG(efx, "stats buffer at %llx (virt %p phys %lx)\n", 2346 EFX_LOG(efx, "stats buffer at %llx (virt %p phys %llx)\n",
2347 (unsigned long long)efx->stats_buffer.dma_addr, 2347 (u64)efx->stats_buffer.dma_addr,
2348 efx->stats_buffer.addr, 2348 efx->stats_buffer.addr,
2349 virt_to_phys(efx->stats_buffer.addr)); 2349 (u64)virt_to_phys(efx->stats_buffer.addr));
2350 2350
2351 return 0; 2351 return 0;
2352} 2352}
@@ -2921,9 +2921,9 @@ int falcon_probe_nic(struct efx_nic *efx)
2921 goto fail4; 2921 goto fail4;
2922 BUG_ON(efx->irq_status.dma_addr & 0x0f); 2922 BUG_ON(efx->irq_status.dma_addr & 0x0f);
2923 2923
2924 EFX_LOG(efx, "INT_KER at %llx (virt %p phys %lx)\n", 2924 EFX_LOG(efx, "INT_KER at %llx (virt %p phys %llx)\n",
2925 (unsigned long long)efx->irq_status.dma_addr, 2925 (u64)efx->irq_status.dma_addr,
2926 efx->irq_status.addr, virt_to_phys(efx->irq_status.addr)); 2926 efx->irq_status.addr, (u64)virt_to_phys(efx->irq_status.addr));
2927 2927
2928 falcon_probe_spi_devices(efx); 2928 falcon_probe_spi_devices(efx);
2929 2929
diff --git a/drivers/net/wireless/arlan-main.c b/drivers/net/wireless/arlan-main.c
index bfca15da6f0f..14c11656e82c 100644
--- a/drivers/net/wireless/arlan-main.c
+++ b/drivers/net/wireless/arlan-main.c
@@ -1082,8 +1082,8 @@ static int __init arlan_probe_here(struct net_device *dev,
1082 if (arlan_check_fingerprint(memaddr)) 1082 if (arlan_check_fingerprint(memaddr))
1083 return -ENODEV; 1083 return -ENODEV;
1084 1084
1085 printk(KERN_NOTICE "%s: Arlan found at %x, \n ", dev->name, 1085 printk(KERN_NOTICE "%s: Arlan found at %llx, \n ", dev->name,
1086 (int) virt_to_phys((void*)memaddr)); 1086 (u64) virt_to_phys((void*)memaddr));
1087 1087
1088 ap->card = (void *) memaddr; 1088 ap->card = (void *) memaddr;
1089 dev->mem_start = memaddr; 1089 dev->mem_start = memaddr;
diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c
index 9da5a4b81133..c3ea5fa7d05a 100644
--- a/drivers/oprofile/buffer_sync.c
+++ b/drivers/oprofile/buffer_sync.c
@@ -38,7 +38,7 @@
38 38
39static LIST_HEAD(dying_tasks); 39static LIST_HEAD(dying_tasks);
40static LIST_HEAD(dead_tasks); 40static LIST_HEAD(dead_tasks);
41static cpumask_t marked_cpus = CPU_MASK_NONE; 41static cpumask_var_t marked_cpus;
42static DEFINE_SPINLOCK(task_mortuary); 42static DEFINE_SPINLOCK(task_mortuary);
43static void process_task_mortuary(void); 43static void process_task_mortuary(void);
44 44
@@ -456,10 +456,10 @@ static void mark_done(int cpu)
456{ 456{
457 int i; 457 int i;
458 458
459 cpu_set(cpu, marked_cpus); 459 cpumask_set_cpu(cpu, marked_cpus);
460 460
461 for_each_online_cpu(i) { 461 for_each_online_cpu(i) {
462 if (!cpu_isset(i, marked_cpus)) 462 if (!cpumask_test_cpu(i, marked_cpus))
463 return; 463 return;
464 } 464 }
465 465
@@ -468,7 +468,7 @@ static void mark_done(int cpu)
468 */ 468 */
469 process_task_mortuary(); 469 process_task_mortuary();
470 470
471 cpus_clear(marked_cpus); 471 cpumask_clear(marked_cpus);
472} 472}
473 473
474 474
@@ -565,6 +565,20 @@ void sync_buffer(int cpu)
565 mutex_unlock(&buffer_mutex); 565 mutex_unlock(&buffer_mutex);
566} 566}
567 567
568int __init buffer_sync_init(void)
569{
570 if (!alloc_cpumask_var(&marked_cpus, GFP_KERNEL))
571 return -ENOMEM;
572
573 cpumask_clear(marked_cpus);
574 return 0;
575}
576
577void __exit buffer_sync_cleanup(void)
578{
579 free_cpumask_var(marked_cpus);
580}
581
568/* The function can be used to add a buffer worth of data directly to 582/* The function can be used to add a buffer worth of data directly to
569 * the kernel buffer. The buffer is assumed to be a circular buffer. 583 * the kernel buffer. The buffer is assumed to be a circular buffer.
570 * Take the entries from index start and end at index end, wrapping 584 * Take the entries from index start and end at index end, wrapping
diff --git a/drivers/oprofile/buffer_sync.h b/drivers/oprofile/buffer_sync.h
index 3110732c1835..0ebf5db62679 100644
--- a/drivers/oprofile/buffer_sync.h
+++ b/drivers/oprofile/buffer_sync.h
@@ -19,4 +19,8 @@ void sync_stop(void);
19/* sync the given CPU's buffer */ 19/* sync the given CPU's buffer */
20void sync_buffer(int cpu); 20void sync_buffer(int cpu);
21 21
22/* initialize/destroy the buffer system. */
23int buffer_sync_init(void);
24void buffer_sync_cleanup(void);
25
22#endif /* OPROFILE_BUFFER_SYNC_H */ 26#endif /* OPROFILE_BUFFER_SYNC_H */
diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c
index e76d715e4342..f0e99d4c066b 100644
--- a/drivers/oprofile/cpu_buffer.c
+++ b/drivers/oprofile/cpu_buffer.c
@@ -161,7 +161,7 @@ struct op_sample
161{ 161{
162 entry->event = ring_buffer_lock_reserve 162 entry->event = ring_buffer_lock_reserve
163 (op_ring_buffer_write, sizeof(struct op_sample) + 163 (op_ring_buffer_write, sizeof(struct op_sample) +
164 size * sizeof(entry->sample->data[0]), &entry->irq_flags); 164 size * sizeof(entry->sample->data[0]));
165 if (entry->event) 165 if (entry->event)
166 entry->sample = ring_buffer_event_data(entry->event); 166 entry->sample = ring_buffer_event_data(entry->event);
167 else 167 else
@@ -178,8 +178,7 @@ struct op_sample
178 178
179int op_cpu_buffer_write_commit(struct op_entry *entry) 179int op_cpu_buffer_write_commit(struct op_entry *entry)
180{ 180{
181 return ring_buffer_unlock_commit(op_ring_buffer_write, entry->event, 181 return ring_buffer_unlock_commit(op_ring_buffer_write, entry->event);
182 entry->irq_flags);
183} 182}
184 183
185struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu) 184struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu)
diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c
index 3cffce90f82a..ced39f602292 100644
--- a/drivers/oprofile/oprof.c
+++ b/drivers/oprofile/oprof.c
@@ -183,6 +183,10 @@ static int __init oprofile_init(void)
183{ 183{
184 int err; 184 int err;
185 185
186 err = buffer_sync_init();
187 if (err)
188 return err;
189
186 err = oprofile_arch_init(&oprofile_ops); 190 err = oprofile_arch_init(&oprofile_ops);
187 191
188 if (err < 0 || timer) { 192 if (err < 0 || timer) {
@@ -191,8 +195,10 @@ static int __init oprofile_init(void)
191 } 195 }
192 196
193 err = oprofilefs_register(); 197 err = oprofilefs_register();
194 if (err) 198 if (err) {
195 oprofile_arch_exit(); 199 oprofile_arch_exit();
200 buffer_sync_cleanup();
201 }
196 202
197 return err; 203 return err;
198} 204}
@@ -202,6 +208,7 @@ static void __exit oprofile_exit(void)
202{ 208{
203 oprofilefs_unregister(); 209 oprofilefs_unregister();
204 oprofile_arch_exit(); 210 oprofile_arch_exit();
211 buffer_sync_cleanup();
205} 212}
206 213
207 214
diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
index 26c536b51c5a..5f333403c2ea 100644
--- a/drivers/pci/dmar.c
+++ b/drivers/pci/dmar.c
@@ -42,6 +42,7 @@
42LIST_HEAD(dmar_drhd_units); 42LIST_HEAD(dmar_drhd_units);
43 43
44static struct acpi_table_header * __initdata dmar_tbl; 44static struct acpi_table_header * __initdata dmar_tbl;
45static acpi_size dmar_tbl_size;
45 46
46static void __init dmar_register_drhd_unit(struct dmar_drhd_unit *drhd) 47static void __init dmar_register_drhd_unit(struct dmar_drhd_unit *drhd)
47{ 48{
@@ -288,8 +289,9 @@ static int __init dmar_table_detect(void)
288 acpi_status status = AE_OK; 289 acpi_status status = AE_OK;
289 290
290 /* if we could find DMAR table, then there are DMAR devices */ 291 /* if we could find DMAR table, then there are DMAR devices */
291 status = acpi_get_table(ACPI_SIG_DMAR, 0, 292 status = acpi_get_table_with_size(ACPI_SIG_DMAR, 0,
292 (struct acpi_table_header **)&dmar_tbl); 293 (struct acpi_table_header **)&dmar_tbl,
294 &dmar_tbl_size);
293 295
294 if (ACPI_SUCCESS(status) && !dmar_tbl) { 296 if (ACPI_SUCCESS(status) && !dmar_tbl) {
295 printk (KERN_WARNING PREFIX "Unable to map DMAR\n"); 297 printk (KERN_WARNING PREFIX "Unable to map DMAR\n");
@@ -489,6 +491,7 @@ void __init detect_intel_iommu(void)
489 iommu_detected = 1; 491 iommu_detected = 1;
490#endif 492#endif
491 } 493 }
494 early_acpi_os_unmap_memory(dmar_tbl, dmar_tbl_size);
492 dmar_tbl = NULL; 495 dmar_tbl = NULL;
493} 496}
494 497
diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c
index 45effc5726c0..8e44db040db7 100644
--- a/drivers/pci/intr_remapping.c
+++ b/drivers/pci/intr_remapping.c
@@ -6,6 +6,7 @@
6#include <linux/irq.h> 6#include <linux/irq.h>
7#include <asm/io_apic.h> 7#include <asm/io_apic.h>
8#include <asm/smp.h> 8#include <asm/smp.h>
9#include <asm/cpu.h>
9#include <linux/intel-iommu.h> 10#include <linux/intel-iommu.h>
10#include "intr_remapping.h" 11#include "intr_remapping.h"
11 12
diff --git a/drivers/watchdog/rdc321x_wdt.c b/drivers/watchdog/rdc321x_wdt.c
index bf92802f2bbe..36e221beedcd 100644
--- a/drivers/watchdog/rdc321x_wdt.c
+++ b/drivers/watchdog/rdc321x_wdt.c
@@ -37,7 +37,7 @@
37#include <linux/io.h> 37#include <linux/io.h>
38#include <linux/uaccess.h> 38#include <linux/uaccess.h>
39 39
40#include <asm/mach-rdc321x/rdc321x_defs.h> 40#include <asm/rdc321x_defs.h>
41 41
42#define RDC_WDT_MASK 0x80000000 /* Mask */ 42#define RDC_WDT_MASK 0x80000000 /* Mask */
43#define RDC_WDT_EN 0x00800000 /* Enable bit */ 43#define RDC_WDT_EN 0x00800000 /* Enable bit */
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index eb0dfdeaa949..30963af5dba0 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -26,9 +26,11 @@
26#include <linux/irq.h> 26#include <linux/irq.h>
27#include <linux/module.h> 27#include <linux/module.h>
28#include <linux/string.h> 28#include <linux/string.h>
29#include <linux/bootmem.h>
29 30
30#include <asm/ptrace.h> 31#include <asm/ptrace.h>
31#include <asm/irq.h> 32#include <asm/irq.h>
33#include <asm/idle.h>
32#include <asm/sync_bitops.h> 34#include <asm/sync_bitops.h>
33#include <asm/xen/hypercall.h> 35#include <asm/xen/hypercall.h>
34#include <asm/xen/hypervisor.h> 36#include <asm/xen/hypervisor.h>
@@ -50,36 +52,55 @@ static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
50/* IRQ <-> IPI mapping */ 52/* IRQ <-> IPI mapping */
51static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1}; 53static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1};
52 54
53/* Packed IRQ information: binding type, sub-type index, and event channel. */ 55/* Interrupt types. */
54struct packed_irq 56enum xen_irq_type {
55{ 57 IRQT_UNBOUND = 0,
56 unsigned short evtchn;
57 unsigned char index;
58 unsigned char type;
59};
60
61static struct packed_irq irq_info[NR_IRQS];
62
63/* Binding types. */
64enum {
65 IRQT_UNBOUND,
66 IRQT_PIRQ, 58 IRQT_PIRQ,
67 IRQT_VIRQ, 59 IRQT_VIRQ,
68 IRQT_IPI, 60 IRQT_IPI,
69 IRQT_EVTCHN 61 IRQT_EVTCHN
70}; 62};
71 63
72/* Convenient shorthand for packed representation of an unbound IRQ. */ 64/*
73#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0) 65 * Packed IRQ information:
66 * type - enum xen_irq_type
67 * event channel - irq->event channel mapping
68 * cpu - cpu this event channel is bound to
69 * index - type-specific information:
70 * PIRQ - vector, with MSB being "needs EIO"
71 * VIRQ - virq number
72 * IPI - IPI vector
73 * EVTCHN -
74 */
75struct irq_info
76{
77 enum xen_irq_type type; /* type */
78 unsigned short evtchn; /* event channel */
79 unsigned short cpu; /* cpu bound */
80
81 union {
82 unsigned short virq;
83 enum ipi_vector ipi;
84 struct {
85 unsigned short gsi;
86 unsigned short vector;
87 } pirq;
88 } u;
89};
90
91static struct irq_info irq_info[NR_IRQS];
74 92
75static int evtchn_to_irq[NR_EVENT_CHANNELS] = { 93static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
76 [0 ... NR_EVENT_CHANNELS-1] = -1 94 [0 ... NR_EVENT_CHANNELS-1] = -1
77}; 95};
78static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG]; 96struct cpu_evtchn_s {
79static u8 cpu_evtchn[NR_EVENT_CHANNELS]; 97 unsigned long bits[NR_EVENT_CHANNELS/BITS_PER_LONG];
80 98};
81/* Reference counts for bindings to IRQs. */ 99static struct cpu_evtchn_s *cpu_evtchn_mask_p;
82static int irq_bindcount[NR_IRQS]; 100static inline unsigned long *cpu_evtchn_mask(int cpu)
101{
102 return cpu_evtchn_mask_p[cpu].bits;
103}
83 104
84/* Xen will never allocate port zero for any purpose. */ 105/* Xen will never allocate port zero for any purpose. */
85#define VALID_EVTCHN(chn) ((chn) != 0) 106#define VALID_EVTCHN(chn) ((chn) != 0)
@@ -87,27 +108,108 @@ static int irq_bindcount[NR_IRQS];
87static struct irq_chip xen_dynamic_chip; 108static struct irq_chip xen_dynamic_chip;
88 109
89/* Constructor for packed IRQ information. */ 110/* Constructor for packed IRQ information. */
90static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn) 111static struct irq_info mk_unbound_info(void)
112{
113 return (struct irq_info) { .type = IRQT_UNBOUND };
114}
115
116static struct irq_info mk_evtchn_info(unsigned short evtchn)
117{
118 return (struct irq_info) { .type = IRQT_EVTCHN, .evtchn = evtchn,
119 .cpu = 0 };
120}
121
122static struct irq_info mk_ipi_info(unsigned short evtchn, enum ipi_vector ipi)
91{ 123{
92 return (struct packed_irq) { evtchn, index, type }; 124 return (struct irq_info) { .type = IRQT_IPI, .evtchn = evtchn,
125 .cpu = 0, .u.ipi = ipi };
126}
127
128static struct irq_info mk_virq_info(unsigned short evtchn, unsigned short virq)
129{
130 return (struct irq_info) { .type = IRQT_VIRQ, .evtchn = evtchn,
131 .cpu = 0, .u.virq = virq };
132}
133
134static struct irq_info mk_pirq_info(unsigned short evtchn,
135 unsigned short gsi, unsigned short vector)
136{
137 return (struct irq_info) { .type = IRQT_PIRQ, .evtchn = evtchn,
138 .cpu = 0, .u.pirq = { .gsi = gsi, .vector = vector } };
93} 139}
94 140
95/* 141/*
96 * Accessors for packed IRQ information. 142 * Accessors for packed IRQ information.
97 */ 143 */
98static inline unsigned int evtchn_from_irq(int irq) 144static struct irq_info *info_for_irq(unsigned irq)
145{
146 return &irq_info[irq];
147}
148
149static unsigned int evtchn_from_irq(unsigned irq)
99{ 150{
100 return irq_info[irq].evtchn; 151 return info_for_irq(irq)->evtchn;
101} 152}
102 153
103static inline unsigned int index_from_irq(int irq) 154static enum ipi_vector ipi_from_irq(unsigned irq)
104{ 155{
105 return irq_info[irq].index; 156 struct irq_info *info = info_for_irq(irq);
157
158 BUG_ON(info == NULL);
159 BUG_ON(info->type != IRQT_IPI);
160
161 return info->u.ipi;
106} 162}
107 163
108static inline unsigned int type_from_irq(int irq) 164static unsigned virq_from_irq(unsigned irq)
109{ 165{
110 return irq_info[irq].type; 166 struct irq_info *info = info_for_irq(irq);
167
168 BUG_ON(info == NULL);
169 BUG_ON(info->type != IRQT_VIRQ);
170
171 return info->u.virq;
172}
173
174static unsigned gsi_from_irq(unsigned irq)
175{
176 struct irq_info *info = info_for_irq(irq);
177
178 BUG_ON(info == NULL);
179 BUG_ON(info->type != IRQT_PIRQ);
180
181 return info->u.pirq.gsi;
182}
183
184static unsigned vector_from_irq(unsigned irq)
185{
186 struct irq_info *info = info_for_irq(irq);
187
188 BUG_ON(info == NULL);
189 BUG_ON(info->type != IRQT_PIRQ);
190
191 return info->u.pirq.vector;
192}
193
194static enum xen_irq_type type_from_irq(unsigned irq)
195{
196 return info_for_irq(irq)->type;
197}
198
199static unsigned cpu_from_irq(unsigned irq)
200{
201 return info_for_irq(irq)->cpu;
202}
203
204static unsigned int cpu_from_evtchn(unsigned int evtchn)
205{
206 int irq = evtchn_to_irq[evtchn];
207 unsigned ret = 0;
208
209 if (irq != -1)
210 ret = cpu_from_irq(irq);
211
212 return ret;
111} 213}
112 214
113static inline unsigned long active_evtchns(unsigned int cpu, 215static inline unsigned long active_evtchns(unsigned int cpu,
@@ -115,7 +217,7 @@ static inline unsigned long active_evtchns(unsigned int cpu,
115 unsigned int idx) 217 unsigned int idx)
116{ 218{
117 return (sh->evtchn_pending[idx] & 219 return (sh->evtchn_pending[idx] &
118 cpu_evtchn_mask[cpu][idx] & 220 cpu_evtchn_mask(cpu)[idx] &
119 ~sh->evtchn_mask[idx]); 221 ~sh->evtchn_mask[idx]);
120} 222}
121 223
@@ -125,13 +227,13 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
125 227
126 BUG_ON(irq == -1); 228 BUG_ON(irq == -1);
127#ifdef CONFIG_SMP 229#ifdef CONFIG_SMP
128 irq_to_desc(irq)->affinity = cpumask_of_cpu(cpu); 230 cpumask_copy(irq_to_desc(irq)->affinity, cpumask_of(cpu));
129#endif 231#endif
130 232
131 __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]); 233 __clear_bit(chn, cpu_evtchn_mask(cpu_from_irq(irq)));
132 __set_bit(chn, cpu_evtchn_mask[cpu]); 234 __set_bit(chn, cpu_evtchn_mask(cpu));
133 235
134 cpu_evtchn[chn] = cpu; 236 irq_info[irq].cpu = cpu;
135} 237}
136 238
137static void init_evtchn_cpu_bindings(void) 239static void init_evtchn_cpu_bindings(void)
@@ -142,17 +244,11 @@ static void init_evtchn_cpu_bindings(void)
142 244
143 /* By default all event channels notify CPU#0. */ 245 /* By default all event channels notify CPU#0. */
144 for_each_irq_desc(i, desc) { 246 for_each_irq_desc(i, desc) {
145 desc->affinity = cpumask_of_cpu(0); 247 cpumask_copy(desc->affinity, cpumask_of(0));
146 } 248 }
147#endif 249#endif
148 250
149 memset(cpu_evtchn, 0, sizeof(cpu_evtchn)); 251 memset(cpu_evtchn_mask(0), ~0, sizeof(cpu_evtchn_mask(0)));
150 memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
151}
152
153static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
154{
155 return cpu_evtchn[evtchn];
156} 252}
157 253
158static inline void clear_evtchn(int port) 254static inline void clear_evtchn(int port)
@@ -232,9 +328,8 @@ static int find_unbound_irq(void)
232 int irq; 328 int irq;
233 struct irq_desc *desc; 329 struct irq_desc *desc;
234 330
235 /* Only allocate from dynirq range */
236 for (irq = 0; irq < nr_irqs; irq++) 331 for (irq = 0; irq < nr_irqs; irq++)
237 if (irq_bindcount[irq] == 0) 332 if (irq_info[irq].type == IRQT_UNBOUND)
238 break; 333 break;
239 334
240 if (irq == nr_irqs) 335 if (irq == nr_irqs)
@@ -244,6 +339,8 @@ static int find_unbound_irq(void)
244 if (WARN_ON(desc == NULL)) 339 if (WARN_ON(desc == NULL))
245 return -1; 340 return -1;
246 341
342 dynamic_irq_init(irq);
343
247 return irq; 344 return irq;
248} 345}
249 346
@@ -258,16 +355,13 @@ int bind_evtchn_to_irq(unsigned int evtchn)
258 if (irq == -1) { 355 if (irq == -1) {
259 irq = find_unbound_irq(); 356 irq = find_unbound_irq();
260 357
261 dynamic_irq_init(irq);
262 set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, 358 set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
263 handle_level_irq, "event"); 359 handle_level_irq, "event");
264 360
265 evtchn_to_irq[evtchn] = irq; 361 evtchn_to_irq[evtchn] = irq;
266 irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn); 362 irq_info[irq] = mk_evtchn_info(evtchn);
267 } 363 }
268 364
269 irq_bindcount[irq]++;
270
271 spin_unlock(&irq_mapping_update_lock); 365 spin_unlock(&irq_mapping_update_lock);
272 366
273 return irq; 367 return irq;
@@ -282,12 +376,12 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
282 spin_lock(&irq_mapping_update_lock); 376 spin_lock(&irq_mapping_update_lock);
283 377
284 irq = per_cpu(ipi_to_irq, cpu)[ipi]; 378 irq = per_cpu(ipi_to_irq, cpu)[ipi];
379
285 if (irq == -1) { 380 if (irq == -1) {
286 irq = find_unbound_irq(); 381 irq = find_unbound_irq();
287 if (irq < 0) 382 if (irq < 0)
288 goto out; 383 goto out;
289 384
290 dynamic_irq_init(irq);
291 set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, 385 set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
292 handle_level_irq, "ipi"); 386 handle_level_irq, "ipi");
293 387
@@ -298,15 +392,12 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
298 evtchn = bind_ipi.port; 392 evtchn = bind_ipi.port;
299 393
300 evtchn_to_irq[evtchn] = irq; 394 evtchn_to_irq[evtchn] = irq;
301 irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn); 395 irq_info[irq] = mk_ipi_info(evtchn, ipi);
302
303 per_cpu(ipi_to_irq, cpu)[ipi] = irq; 396 per_cpu(ipi_to_irq, cpu)[ipi] = irq;
304 397
305 bind_evtchn_to_cpu(evtchn, cpu); 398 bind_evtchn_to_cpu(evtchn, cpu);
306 } 399 }
307 400
308 irq_bindcount[irq]++;
309
310 out: 401 out:
311 spin_unlock(&irq_mapping_update_lock); 402 spin_unlock(&irq_mapping_update_lock);
312 return irq; 403 return irq;
@@ -332,20 +423,17 @@ static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
332 423
333 irq = find_unbound_irq(); 424 irq = find_unbound_irq();
334 425
335 dynamic_irq_init(irq);
336 set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, 426 set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
337 handle_level_irq, "virq"); 427 handle_level_irq, "virq");
338 428
339 evtchn_to_irq[evtchn] = irq; 429 evtchn_to_irq[evtchn] = irq;
340 irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn); 430 irq_info[irq] = mk_virq_info(evtchn, virq);
341 431
342 per_cpu(virq_to_irq, cpu)[virq] = irq; 432 per_cpu(virq_to_irq, cpu)[virq] = irq;
343 433
344 bind_evtchn_to_cpu(evtchn, cpu); 434 bind_evtchn_to_cpu(evtchn, cpu);
345 } 435 }
346 436
347 irq_bindcount[irq]++;
348
349 spin_unlock(&irq_mapping_update_lock); 437 spin_unlock(&irq_mapping_update_lock);
350 438
351 return irq; 439 return irq;
@@ -358,7 +446,7 @@ static void unbind_from_irq(unsigned int irq)
358 446
359 spin_lock(&irq_mapping_update_lock); 447 spin_lock(&irq_mapping_update_lock);
360 448
361 if ((--irq_bindcount[irq] == 0) && VALID_EVTCHN(evtchn)) { 449 if (VALID_EVTCHN(evtchn)) {
362 close.port = evtchn; 450 close.port = evtchn;
363 if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) 451 if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
364 BUG(); 452 BUG();
@@ -366,11 +454,11 @@ static void unbind_from_irq(unsigned int irq)
366 switch (type_from_irq(irq)) { 454 switch (type_from_irq(irq)) {
367 case IRQT_VIRQ: 455 case IRQT_VIRQ:
368 per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) 456 per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
369 [index_from_irq(irq)] = -1; 457 [virq_from_irq(irq)] = -1;
370 break; 458 break;
371 case IRQT_IPI: 459 case IRQT_IPI:
372 per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn)) 460 per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn))
373 [index_from_irq(irq)] = -1; 461 [ipi_from_irq(irq)] = -1;
374 break; 462 break;
375 default: 463 default:
376 break; 464 break;
@@ -380,7 +468,7 @@ static void unbind_from_irq(unsigned int irq)
380 bind_evtchn_to_cpu(evtchn, 0); 468 bind_evtchn_to_cpu(evtchn, 0);
381 469
382 evtchn_to_irq[evtchn] = -1; 470 evtchn_to_irq[evtchn] = -1;
383 irq_info[irq] = IRQ_UNBOUND; 471 irq_info[irq] = mk_unbound_info();
384 472
385 dynamic_irq_cleanup(irq); 473 dynamic_irq_cleanup(irq);
386 } 474 }
@@ -498,8 +586,8 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
498 for(i = 0; i < NR_EVENT_CHANNELS; i++) { 586 for(i = 0; i < NR_EVENT_CHANNELS; i++) {
499 if (sync_test_bit(i, sh->evtchn_pending)) { 587 if (sync_test_bit(i, sh->evtchn_pending)) {
500 printk(" %d: event %d -> irq %d\n", 588 printk(" %d: event %d -> irq %d\n",
501 cpu_evtchn[i], i, 589 cpu_from_evtchn(i), i,
502 evtchn_to_irq[i]); 590 evtchn_to_irq[i]);
503 } 591 }
504 } 592 }
505 593
@@ -508,7 +596,6 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
508 return IRQ_HANDLED; 596 return IRQ_HANDLED;
509} 597}
510 598
511
512/* 599/*
513 * Search the CPUs pending events bitmasks. For each one found, map 600 * Search the CPUs pending events bitmasks. For each one found, map
514 * the event number to an irq, and feed it into do_IRQ() for 601 * the event number to an irq, and feed it into do_IRQ() for
@@ -521,11 +608,15 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
521void xen_evtchn_do_upcall(struct pt_regs *regs) 608void xen_evtchn_do_upcall(struct pt_regs *regs)
522{ 609{
523 int cpu = get_cpu(); 610 int cpu = get_cpu();
611 struct pt_regs *old_regs = set_irq_regs(regs);
524 struct shared_info *s = HYPERVISOR_shared_info; 612 struct shared_info *s = HYPERVISOR_shared_info;
525 struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); 613 struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
526 static DEFINE_PER_CPU(unsigned, nesting_count); 614 static DEFINE_PER_CPU(unsigned, nesting_count);
527 unsigned count; 615 unsigned count;
528 616
617 exit_idle();
618 irq_enter();
619
529 do { 620 do {
530 unsigned long pending_words; 621 unsigned long pending_words;
531 622
@@ -550,7 +641,7 @@ void xen_evtchn_do_upcall(struct pt_regs *regs)
550 int irq = evtchn_to_irq[port]; 641 int irq = evtchn_to_irq[port];
551 642
552 if (irq != -1) 643 if (irq != -1)
553 xen_do_IRQ(irq, regs); 644 handle_irq(irq, regs);
554 } 645 }
555 } 646 }
556 647
@@ -561,12 +652,17 @@ void xen_evtchn_do_upcall(struct pt_regs *regs)
561 } while(count != 1); 652 } while(count != 1);
562 653
563out: 654out:
655 irq_exit();
656 set_irq_regs(old_regs);
657
564 put_cpu(); 658 put_cpu();
565} 659}
566 660
567/* Rebind a new event channel to an existing irq. */ 661/* Rebind a new event channel to an existing irq. */
568void rebind_evtchn_irq(int evtchn, int irq) 662void rebind_evtchn_irq(int evtchn, int irq)
569{ 663{
664 struct irq_info *info = info_for_irq(irq);
665
570 /* Make sure the irq is masked, since the new event channel 666 /* Make sure the irq is masked, since the new event channel
571 will also be masked. */ 667 will also be masked. */
572 disable_irq(irq); 668 disable_irq(irq);
@@ -576,11 +672,11 @@ void rebind_evtchn_irq(int evtchn, int irq)
576 /* After resume the irq<->evtchn mappings are all cleared out */ 672 /* After resume the irq<->evtchn mappings are all cleared out */
577 BUG_ON(evtchn_to_irq[evtchn] != -1); 673 BUG_ON(evtchn_to_irq[evtchn] != -1);
578 /* Expect irq to have been bound before, 674 /* Expect irq to have been bound before,
579 so the bindcount should be non-0 */ 675 so there should be a proper type */
580 BUG_ON(irq_bindcount[irq] == 0); 676 BUG_ON(info->type == IRQT_UNBOUND);
581 677
582 evtchn_to_irq[evtchn] = irq; 678 evtchn_to_irq[evtchn] = irq;
583 irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn); 679 irq_info[irq] = mk_evtchn_info(evtchn);
584 680
585 spin_unlock(&irq_mapping_update_lock); 681 spin_unlock(&irq_mapping_update_lock);
586 682
@@ -690,8 +786,7 @@ static void restore_cpu_virqs(unsigned int cpu)
690 if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) 786 if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1)
691 continue; 787 continue;
692 788
693 BUG_ON(irq_info[irq].type != IRQT_VIRQ); 789 BUG_ON(virq_from_irq(irq) != virq);
694 BUG_ON(irq_info[irq].index != virq);
695 790
696 /* Get a new binding from Xen. */ 791 /* Get a new binding from Xen. */
697 bind_virq.virq = virq; 792 bind_virq.virq = virq;
@@ -703,7 +798,7 @@ static void restore_cpu_virqs(unsigned int cpu)
703 798
704 /* Record the new mapping. */ 799 /* Record the new mapping. */
705 evtchn_to_irq[evtchn] = irq; 800 evtchn_to_irq[evtchn] = irq;
706 irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn); 801 irq_info[irq] = mk_virq_info(evtchn, virq);
707 bind_evtchn_to_cpu(evtchn, cpu); 802 bind_evtchn_to_cpu(evtchn, cpu);
708 803
709 /* Ready for use. */ 804 /* Ready for use. */
@@ -720,8 +815,7 @@ static void restore_cpu_ipis(unsigned int cpu)
720 if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) 815 if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1)
721 continue; 816 continue;
722 817
723 BUG_ON(irq_info[irq].type != IRQT_IPI); 818 BUG_ON(ipi_from_irq(irq) != ipi);
724 BUG_ON(irq_info[irq].index != ipi);
725 819
726 /* Get a new binding from Xen. */ 820 /* Get a new binding from Xen. */
727 bind_ipi.vcpu = cpu; 821 bind_ipi.vcpu = cpu;
@@ -732,7 +826,7 @@ static void restore_cpu_ipis(unsigned int cpu)
732 826
733 /* Record the new mapping. */ 827 /* Record the new mapping. */
734 evtchn_to_irq[evtchn] = irq; 828 evtchn_to_irq[evtchn] = irq;
735 irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn); 829 irq_info[irq] = mk_ipi_info(evtchn, ipi);
736 bind_evtchn_to_cpu(evtchn, cpu); 830 bind_evtchn_to_cpu(evtchn, cpu);
737 831
738 /* Ready for use. */ 832 /* Ready for use. */
@@ -812,8 +906,11 @@ void xen_irq_resume(void)
812 906
813static struct irq_chip xen_dynamic_chip __read_mostly = { 907static struct irq_chip xen_dynamic_chip __read_mostly = {
814 .name = "xen-dyn", 908 .name = "xen-dyn",
909
910 .disable = disable_dynirq,
815 .mask = disable_dynirq, 911 .mask = disable_dynirq,
816 .unmask = enable_dynirq, 912 .unmask = enable_dynirq,
913
817 .ack = ack_dynirq, 914 .ack = ack_dynirq,
818 .set_affinity = set_affinity_irq, 915 .set_affinity = set_affinity_irq,
819 .retrigger = retrigger_dynirq, 916 .retrigger = retrigger_dynirq,
@@ -822,6 +919,10 @@ static struct irq_chip xen_dynamic_chip __read_mostly = {
822void __init xen_init_IRQ(void) 919void __init xen_init_IRQ(void)
823{ 920{
824 int i; 921 int i;
922 size_t size = nr_cpu_ids * sizeof(struct cpu_evtchn_s);
923
924 cpu_evtchn_mask_p = alloc_bootmem(size);
925 BUG_ON(cpu_evtchn_mask_p == NULL);
825 926
826 init_evtchn_cpu_bindings(); 927 init_evtchn_cpu_bindings();
827 928
@@ -829,9 +930,5 @@ void __init xen_init_IRQ(void)
829 for (i = 0; i < NR_EVENT_CHANNELS; i++) 930 for (i = 0; i < NR_EVENT_CHANNELS; i++)
830 mask_evtchn(i); 931 mask_evtchn(i);
831 932
832 /* Dynamic IRQ space is currently unbound. Zero the refcnts. */
833 for (i = 0; i < nr_irqs; i++)
834 irq_bindcount[i] = 0;
835
836 irq_ctx_init(smp_processor_id()); 933 irq_ctx_init(smp_processor_id());
837} 934}
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 56892a142ee2..3ccd348d112d 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -108,7 +108,7 @@ static void do_suspend(void)
108 /* XXX use normal device tree? */ 108 /* XXX use normal device tree? */
109 xenbus_suspend(); 109 xenbus_suspend();
110 110
111 err = stop_machine(xen_suspend, &cancelled, &cpumask_of_cpu(0)); 111 err = stop_machine(xen_suspend, &cancelled, cpumask_of(0));
112 if (err) { 112 if (err) {
113 printk(KERN_ERR "failed to start xen_suspend: %d\n", err); 113 printk(KERN_ERR "failed to start xen_suspend: %d\n", err);
114 goto out; 114 goto out;
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 6d720243f5f4..8a17f7edcc74 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -19,6 +19,7 @@
19#include <linux/kmod.h> 19#include <linux/kmod.h>
20#include <linux/ctype.h> 20#include <linux/ctype.h>
21#include <linux/genhd.h> 21#include <linux/genhd.h>
22#include <linux/blktrace_api.h>
22 23
23#include "check.h" 24#include "check.h"
24 25
@@ -294,6 +295,9 @@ static struct attribute_group part_attr_group = {
294 295
295static struct attribute_group *part_attr_groups[] = { 296static struct attribute_group *part_attr_groups[] = {
296 &part_attr_group, 297 &part_attr_group,
298#ifdef CONFIG_BLK_DEV_IO_TRACE
299 &blk_trace_attr_group,
300#endif
297 NULL 301 NULL
298}; 302};
299 303
diff --git a/include/acpi/acpiosxf.h b/include/acpi/acpiosxf.h
index a62720a7edc0..ab0b85cf21f3 100644
--- a/include/acpi/acpiosxf.h
+++ b/include/acpi/acpiosxf.h
@@ -144,6 +144,7 @@ void __iomem *acpi_os_map_memory(acpi_physical_address where,
144 acpi_size length); 144 acpi_size length);
145 145
146void acpi_os_unmap_memory(void __iomem * logical_address, acpi_size size); 146void acpi_os_unmap_memory(void __iomem * logical_address, acpi_size size);
147void early_acpi_os_unmap_memory(void __iomem * virt, acpi_size size);
147 148
148#ifdef ACPI_FUTURE_USAGE 149#ifdef ACPI_FUTURE_USAGE
149acpi_status 150acpi_status
diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h
index c8e8cf45830f..cc40102fe2f3 100644
--- a/include/acpi/acpixf.h
+++ b/include/acpi/acpixf.h
@@ -130,6 +130,10 @@ acpi_get_table_header(acpi_string signature,
130 struct acpi_table_header *out_table_header); 130 struct acpi_table_header *out_table_header);
131 131
132acpi_status 132acpi_status
133acpi_get_table_with_size(acpi_string signature,
134 u32 instance, struct acpi_table_header **out_table,
135 acpi_size *tbl_size);
136acpi_status
133acpi_get_table(acpi_string signature, 137acpi_get_table(acpi_string signature,
134 u32 instance, struct acpi_table_header **out_table); 138 u32 instance, struct acpi_table_header **out_table);
135 139
diff --git a/include/asm-frv/ftrace.h b/include/asm-frv/ftrace.h
new file mode 100644
index 000000000000..40a8c178f10d
--- /dev/null
+++ b/include/asm-frv/ftrace.h
@@ -0,0 +1 @@
/* empty */
diff --git a/include/asm-frv/swab.h b/include/asm-frv/swab.h
index afb3396ba5ed..f305834b4799 100644
--- a/include/asm-frv/swab.h
+++ b/include/asm-frv/swab.h
@@ -1,7 +1,7 @@
1#ifndef _ASM_SWAB_H 1#ifndef _ASM_SWAB_H
2#define _ASM_SWAB_H 2#define _ASM_SWAB_H
3 3
4#include <asm/types.h> 4#include <linux/types.h>
5 5
6#if defined(__GNUC__) && !defined(__STRICT_ANSI__) || defined(__KERNEL__) 6#if defined(__GNUC__) && !defined(__STRICT_ANSI__) || defined(__KERNEL__)
7# define __SWAB_64_THRU_32__ 7# define __SWAB_64_THRU_32__
diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index b0e63c672ebd..00f45ff081a6 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -80,4 +80,56 @@ extern void setup_per_cpu_areas(void);
80#define DECLARE_PER_CPU(type, name) extern PER_CPU_ATTRIBUTES \ 80#define DECLARE_PER_CPU(type, name) extern PER_CPU_ATTRIBUTES \
81 __typeof__(type) per_cpu_var(name) 81 __typeof__(type) per_cpu_var(name)
82 82
83/*
84 * Optional methods for optimized non-lvalue per-cpu variable access.
85 *
86 * @var can be a percpu variable or a field of it and its size should
87 * equal char, int or long. percpu_read() evaluates to a lvalue and
88 * all others to void.
89 *
90 * These operations are guaranteed to be atomic w.r.t. preemption.
91 * The generic versions use plain get/put_cpu_var(). Archs are
92 * encouraged to implement single-instruction alternatives which don't
93 * require preemption protection.
94 */
95#ifndef percpu_read
96# define percpu_read(var) \
97 ({ \
98 typeof(per_cpu_var(var)) __tmp_var__; \
99 __tmp_var__ = get_cpu_var(var); \
100 put_cpu_var(var); \
101 __tmp_var__; \
102 })
103#endif
104
105#define __percpu_generic_to_op(var, val, op) \
106do { \
107 get_cpu_var(var) op val; \
108 put_cpu_var(var); \
109} while (0)
110
111#ifndef percpu_write
112# define percpu_write(var, val) __percpu_generic_to_op(var, (val), =)
113#endif
114
115#ifndef percpu_add
116# define percpu_add(var, val) __percpu_generic_to_op(var, (val), +=)
117#endif
118
119#ifndef percpu_sub
120# define percpu_sub(var, val) __percpu_generic_to_op(var, (val), -=)
121#endif
122
123#ifndef percpu_and
124# define percpu_and(var, val) __percpu_generic_to_op(var, (val), &=)
125#endif
126
127#ifndef percpu_or
128# define percpu_or(var, val) __percpu_generic_to_op(var, (val), |=)
129#endif
130
131#ifndef percpu_xor
132# define percpu_xor(var, val) __percpu_generic_to_op(var, (val), ^=)
133#endif
134
83#endif /* _ASM_GENERIC_PERCPU_H_ */ 135#endif /* _ASM_GENERIC_PERCPU_H_ */
diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index 79a7ff925bf8..4ce48e878530 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -9,7 +9,7 @@ extern char __bss_start[], __bss_stop[];
9extern char __init_begin[], __init_end[]; 9extern char __init_begin[], __init_end[];
10extern char _sinittext[], _einittext[]; 10extern char _sinittext[], _einittext[];
11extern char _end[]; 11extern char _end[];
12extern char __per_cpu_start[], __per_cpu_end[]; 12extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[];
13extern char __kprobes_text_start[], __kprobes_text_end[]; 13extern char __kprobes_text_start[], __kprobes_text_end[];
14extern char __initdata_begin[], __initdata_end[]; 14extern char __initdata_begin[], __initdata_end[];
15extern char __start_rodata[], __end_rodata[]; 15extern char __start_rodata[], __end_rodata[];
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index c61fab1dd2f8..d3bc3c86df6a 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -61,6 +61,30 @@
61#define BRANCH_PROFILE() 61#define BRANCH_PROFILE()
62#endif 62#endif
63 63
64#ifdef CONFIG_EVENT_TRACER
65#define FTRACE_EVENTS() VMLINUX_SYMBOL(__start_ftrace_events) = .; \
66 *(_ftrace_events) \
67 VMLINUX_SYMBOL(__stop_ftrace_events) = .;
68#else
69#define FTRACE_EVENTS()
70#endif
71
72#ifdef CONFIG_TRACING
73#define TRACE_PRINTKS() VMLINUX_SYMBOL(__start___trace_bprintk_fmt) = .; \
74 *(__trace_printk_fmt) /* Trace_printk fmt' pointer */ \
75 VMLINUX_SYMBOL(__stop___trace_bprintk_fmt) = .;
76#else
77#define TRACE_PRINTKS()
78#endif
79
80#ifdef CONFIG_FTRACE_SYSCALLS
81#define TRACE_SYSCALLS() VMLINUX_SYMBOL(__start_syscalls_metadata) = .; \
82 *(__syscalls_metadata) \
83 VMLINUX_SYMBOL(__stop_syscalls_metadata) = .;
84#else
85#define TRACE_SYSCALLS()
86#endif
87
64/* .data section */ 88/* .data section */
65#define DATA_DATA \ 89#define DATA_DATA \
66 *(.data) \ 90 *(.data) \
@@ -81,7 +105,10 @@
81 *(__tracepoints) \ 105 *(__tracepoints) \
82 VMLINUX_SYMBOL(__stop___tracepoints) = .; \ 106 VMLINUX_SYMBOL(__stop___tracepoints) = .; \
83 LIKELY_PROFILE() \ 107 LIKELY_PROFILE() \
84 BRANCH_PROFILE() 108 BRANCH_PROFILE() \
109 TRACE_PRINTKS() \
110 FTRACE_EVENTS() \
111 TRACE_SYSCALLS()
85 112
86#define RO_DATA(align) \ 113#define RO_DATA(align) \
87 . = ALIGN((align)); \ 114 . = ALIGN((align)); \
@@ -430,12 +457,59 @@
430 *(.initcall7.init) \ 457 *(.initcall7.init) \
431 *(.initcall7s.init) 458 *(.initcall7s.init)
432 459
460/**
461 * PERCPU_VADDR - define output section for percpu area
462 * @vaddr: explicit base address (optional)
463 * @phdr: destination PHDR (optional)
464 *
465 * Macro which expands to output section for percpu area. If @vaddr
466 * is not blank, it specifies explicit base address and all percpu
467 * symbols will be offset from the given address. If blank, @vaddr
468 * always equals @laddr + LOAD_OFFSET.
469 *
470 * @phdr defines the output PHDR to use if not blank. Be warned that
471 * output PHDR is sticky. If @phdr is specified, the next output
472 * section in the linker script will go there too. @phdr should have
473 * a leading colon.
474 *
475 * Note that this macros defines __per_cpu_load as an absolute symbol.
476 * If there is no need to put the percpu section at a predetermined
477 * address, use PERCPU().
478 */
479#define PERCPU_VADDR(vaddr, phdr) \
480 VMLINUX_SYMBOL(__per_cpu_load) = .; \
481 .data.percpu vaddr : AT(VMLINUX_SYMBOL(__per_cpu_load) \
482 - LOAD_OFFSET) { \
483 VMLINUX_SYMBOL(__per_cpu_start) = .; \
484 *(.data.percpu.first) \
485 *(.data.percpu.page_aligned) \
486 *(.data.percpu) \
487 *(.data.percpu.shared_aligned) \
488 VMLINUX_SYMBOL(__per_cpu_end) = .; \
489 } phdr \
490 . = VMLINUX_SYMBOL(__per_cpu_load) + SIZEOF(.data.percpu);
491
492/**
493 * PERCPU - define output section for percpu area, simple version
494 * @align: required alignment
495 *
496 * Align to @align and outputs output section for percpu area. This
497 * macro doesn't maniuplate @vaddr or @phdr and __per_cpu_load and
498 * __per_cpu_start will be identical.
499 *
500 * This macro is equivalent to ALIGN(align); PERCPU_VADDR( , ) except
501 * that __per_cpu_load is defined as a relative symbol against
502 * .data.percpu which is required for relocatable x86_32
503 * configuration.
504 */
433#define PERCPU(align) \ 505#define PERCPU(align) \
434 . = ALIGN(align); \ 506 . = ALIGN(align); \
435 VMLINUX_SYMBOL(__per_cpu_start) = .; \ 507 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { \
436 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { \ 508 VMLINUX_SYMBOL(__per_cpu_load) = .; \
509 VMLINUX_SYMBOL(__per_cpu_start) = .; \
510 *(.data.percpu.first) \
437 *(.data.percpu.page_aligned) \ 511 *(.data.percpu.page_aligned) \
438 *(.data.percpu) \ 512 *(.data.percpu) \
439 *(.data.percpu.shared_aligned) \ 513 *(.data.percpu.shared_aligned) \
440 } \ 514 VMLINUX_SYMBOL(__per_cpu_end) = .; \
441 VMLINUX_SYMBOL(__per_cpu_end) = .; 515 }
diff --git a/include/asm-m32r/ftrace.h b/include/asm-m32r/ftrace.h
new file mode 100644
index 000000000000..40a8c178f10d
--- /dev/null
+++ b/include/asm-m32r/ftrace.h
@@ -0,0 +1 @@
/* empty */
diff --git a/include/asm-m32r/swab.h b/include/asm-m32r/swab.h
index 97973e101825..54dab001d6d1 100644
--- a/include/asm-m32r/swab.h
+++ b/include/asm-m32r/swab.h
@@ -1,7 +1,7 @@
1#ifndef _ASM_M32R_SWAB_H 1#ifndef _ASM_M32R_SWAB_H
2#define _ASM_M32R_SWAB_H 2#define _ASM_M32R_SWAB_H
3 3
4#include <asm/types.h> 4#include <linux/types.h>
5 5
6#if !defined(__STRICT_ANSI__) || defined(__KERNEL__) 6#if !defined(__STRICT_ANSI__) || defined(__KERNEL__)
7# define __SWAB_64_THRU_32__ 7# define __SWAB_64_THRU_32__
diff --git a/include/asm-mn10300/ftrace.h b/include/asm-mn10300/ftrace.h
new file mode 100644
index 000000000000..40a8c178f10d
--- /dev/null
+++ b/include/asm-mn10300/ftrace.h
@@ -0,0 +1 @@
/* empty */
diff --git a/include/asm-mn10300/swab.h b/include/asm-mn10300/swab.h
index 4504d1b4b477..bd818a820ca8 100644
--- a/include/asm-mn10300/swab.h
+++ b/include/asm-mn10300/swab.h
@@ -11,7 +11,7 @@
11#ifndef _ASM_SWAB_H 11#ifndef _ASM_SWAB_H
12#define _ASM_SWAB_H 12#define _ASM_SWAB_H
13 13
14#include <asm/types.h> 14#include <linux/types.h>
15 15
16#ifdef __GNUC__ 16#ifdef __GNUC__
17 17
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 6fce2fc2d124..78199151c00b 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -79,6 +79,7 @@ typedef int (*acpi_table_handler) (struct acpi_table_header *table);
79typedef int (*acpi_table_entry_handler) (struct acpi_subtable_header *header, const unsigned long end); 79typedef int (*acpi_table_entry_handler) (struct acpi_subtable_header *header, const unsigned long end);
80 80
81char * __acpi_map_table (unsigned long phys_addr, unsigned long size); 81char * __acpi_map_table (unsigned long phys_addr, unsigned long size);
82void __acpi_unmap_table(char *map, unsigned long size);
82int early_acpi_boot_init(void); 83int early_acpi_boot_init(void);
83int acpi_boot_init (void); 84int acpi_boot_init (void);
84int acpi_boot_table_init (void); 85int acpi_boot_table_init (void);
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 6e915878e88c..d960889e92ef 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -144,6 +144,9 @@ struct blk_user_trace_setup {
144 144
145#ifdef __KERNEL__ 145#ifdef __KERNEL__
146#if defined(CONFIG_BLK_DEV_IO_TRACE) 146#if defined(CONFIG_BLK_DEV_IO_TRACE)
147
148#include <linux/sysfs.h>
149
147struct blk_trace { 150struct blk_trace {
148 int trace_state; 151 int trace_state;
149 struct rchan *rchan; 152 struct rchan *rchan;
@@ -194,6 +197,8 @@ extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
194extern int blk_trace_startstop(struct request_queue *q, int start); 197extern int blk_trace_startstop(struct request_queue *q, int start);
195extern int blk_trace_remove(struct request_queue *q); 198extern int blk_trace_remove(struct request_queue *q);
196 199
200extern struct attribute_group blk_trace_attr_group;
201
197#else /* !CONFIG_BLK_DEV_IO_TRACE */ 202#else /* !CONFIG_BLK_DEV_IO_TRACE */
198#define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY) 203#define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY)
199#define blk_trace_shutdown(q) do { } while (0) 204#define blk_trace_shutdown(q) do { } while (0)
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 95837bfb5256..455d83219fae 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -65,23 +65,20 @@ extern void free_bootmem(unsigned long addr, unsigned long size);
65#define BOOTMEM_DEFAULT 0 65#define BOOTMEM_DEFAULT 0
66#define BOOTMEM_EXCLUSIVE (1<<0) 66#define BOOTMEM_EXCLUSIVE (1<<0)
67 67
68extern int reserve_bootmem(unsigned long addr,
69 unsigned long size,
70 int flags);
68extern int reserve_bootmem_node(pg_data_t *pgdat, 71extern int reserve_bootmem_node(pg_data_t *pgdat,
69 unsigned long physaddr, 72 unsigned long physaddr,
70 unsigned long size, 73 unsigned long size,
71 int flags); 74 int flags);
72#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
73extern int reserve_bootmem(unsigned long addr, unsigned long size, int flags);
74#endif
75 75
76extern void *__alloc_bootmem_nopanic(unsigned long size, 76extern void *__alloc_bootmem(unsigned long size,
77 unsigned long align, 77 unsigned long align,
78 unsigned long goal); 78 unsigned long goal);
79extern void *__alloc_bootmem(unsigned long size, 79extern void *__alloc_bootmem_nopanic(unsigned long size,
80 unsigned long align, 80 unsigned long align,
81 unsigned long goal); 81 unsigned long goal);
82extern void *__alloc_bootmem_low(unsigned long size,
83 unsigned long align,
84 unsigned long goal);
85extern void *__alloc_bootmem_node(pg_data_t *pgdat, 82extern void *__alloc_bootmem_node(pg_data_t *pgdat,
86 unsigned long size, 83 unsigned long size,
87 unsigned long align, 84 unsigned long align,
@@ -90,30 +87,35 @@ extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
90 unsigned long size, 87 unsigned long size,
91 unsigned long align, 88 unsigned long align,
92 unsigned long goal); 89 unsigned long goal);
90extern void *__alloc_bootmem_low(unsigned long size,
91 unsigned long align,
92 unsigned long goal);
93extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, 93extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
94 unsigned long size, 94 unsigned long size,
95 unsigned long align, 95 unsigned long align,
96 unsigned long goal); 96 unsigned long goal);
97#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE 97
98#define alloc_bootmem(x) \ 98#define alloc_bootmem(x) \
99 __alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) 99 __alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
100#define alloc_bootmem_nopanic(x) \ 100#define alloc_bootmem_nopanic(x) \
101 __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) 101 __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
102#define alloc_bootmem_low(x) \
103 __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
104#define alloc_bootmem_pages(x) \ 102#define alloc_bootmem_pages(x) \
105 __alloc_bootmem(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) 103 __alloc_bootmem(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
106#define alloc_bootmem_pages_nopanic(x) \ 104#define alloc_bootmem_pages_nopanic(x) \
107 __alloc_bootmem_nopanic(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) 105 __alloc_bootmem_nopanic(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
108#define alloc_bootmem_low_pages(x) \
109 __alloc_bootmem_low(x, PAGE_SIZE, 0)
110#define alloc_bootmem_node(pgdat, x) \ 106#define alloc_bootmem_node(pgdat, x) \
111 __alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) 107 __alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
112#define alloc_bootmem_pages_node(pgdat, x) \ 108#define alloc_bootmem_pages_node(pgdat, x) \
113 __alloc_bootmem_node(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) 109 __alloc_bootmem_node(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
110#define alloc_bootmem_pages_node_nopanic(pgdat, x) \
111 __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
112
113#define alloc_bootmem_low(x) \
114 __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
115#define alloc_bootmem_low_pages(x) \
116 __alloc_bootmem_low(x, PAGE_SIZE, 0)
114#define alloc_bootmem_low_pages_node(pgdat, x) \ 117#define alloc_bootmem_low_pages_node(pgdat, x) \
115 __alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0) 118 __alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0)
116#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
117 119
118extern int reserve_bootmem_generic(unsigned long addr, unsigned long size, 120extern int reserve_bootmem_generic(unsigned long addr, unsigned long size,
119 int flags); 121 int flags);
diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h
index 07ae8f846055..5b5d4731f956 100644
--- a/include/linux/coda_psdev.h
+++ b/include/linux/coda_psdev.h
@@ -6,6 +6,7 @@
6#define CODA_PSDEV_MAJOR 67 6#define CODA_PSDEV_MAJOR 67
7#define MAX_CODADEVS 5 /* how many do we allow */ 7#define MAX_CODADEVS 5 /* how many do we allow */
8 8
9#ifdef __KERNEL__
9struct kstatfs; 10struct kstatfs;
10 11
11/* communication pending/processing queues */ 12/* communication pending/processing queues */
@@ -24,7 +25,6 @@ static inline struct venus_comm *coda_vcp(struct super_block *sb)
24 return (struct venus_comm *)((sb)->s_fs_info); 25 return (struct venus_comm *)((sb)->s_fs_info);
25} 26}
26 27
27
28/* upcalls */ 28/* upcalls */
29int venus_rootfid(struct super_block *sb, struct CodaFid *fidp); 29int venus_rootfid(struct super_block *sb, struct CodaFid *fidp);
30int venus_getattr(struct super_block *sb, struct CodaFid *fid, 30int venus_getattr(struct super_block *sb, struct CodaFid *fid,
@@ -64,6 +64,12 @@ int coda_downcall(int opcode, union outputArgs *out, struct super_block *sb);
64int venus_fsync(struct super_block *sb, struct CodaFid *fid); 64int venus_fsync(struct super_block *sb, struct CodaFid *fid);
65int venus_statfs(struct dentry *dentry, struct kstatfs *sfs); 65int venus_statfs(struct dentry *dentry, struct kstatfs *sfs);
66 66
67/*
68 * Statistics
69 */
70
71extern struct venus_comm coda_comms[];
72#endif /* __KERNEL__ */
67 73
68/* messages between coda filesystem in kernel and Venus */ 74/* messages between coda filesystem in kernel and Venus */
69struct upc_req { 75struct upc_req {
@@ -82,11 +88,4 @@ struct upc_req {
82#define REQ_WRITE 0x4 88#define REQ_WRITE 0x4
83#define REQ_ABORT 0x8 89#define REQ_ABORT 0x8
84 90
85
86/*
87 * Statistics
88 */
89
90extern struct venus_comm coda_comms[];
91
92#endif 91#endif
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index d95da1020f1c..6faa7e549de4 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -68,6 +68,7 @@ struct ftrace_branch_data {
68 unsigned long miss; 68 unsigned long miss;
69 unsigned long hit; 69 unsigned long hit;
70 }; 70 };
71 unsigned long miss_hit[2];
71 }; 72 };
72}; 73};
73 74
@@ -125,10 +126,7 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
125 .line = __LINE__, \ 126 .line = __LINE__, \
126 }; \ 127 }; \
127 ______r = !!(cond); \ 128 ______r = !!(cond); \
128 if (______r) \ 129 ______f.miss_hit[______r]++; \
129 ______f.hit++; \
130 else \
131 ______f.miss++; \
132 ______r; \ 130 ______r; \
133 })) 131 }))
134#endif /* CONFIG_PROFILE_ALL_BRANCHES */ 132#endif /* CONFIG_PROFILE_ALL_BRANCHES */
diff --git a/include/linux/decompress/bunzip2.h b/include/linux/decompress/bunzip2.h
new file mode 100644
index 000000000000..115272137a9c
--- /dev/null
+++ b/include/linux/decompress/bunzip2.h
@@ -0,0 +1,10 @@
1#ifndef DECOMPRESS_BUNZIP2_H
2#define DECOMPRESS_BUNZIP2_H
3
4int bunzip2(unsigned char *inbuf, int len,
5 int(*fill)(void*, unsigned int),
6 int(*flush)(void*, unsigned int),
7 unsigned char *output,
8 int *pos,
9 void(*error)(char *x));
10#endif
diff --git a/include/linux/decompress/generic.h b/include/linux/decompress/generic.h
new file mode 100644
index 000000000000..6dfb856327bb
--- /dev/null
+++ b/include/linux/decompress/generic.h
@@ -0,0 +1,33 @@
1#ifndef DECOMPRESS_GENERIC_H
2#define DECOMPRESS_GENERIC_H
3
4/* Minimal chunksize to be read.
5 *Bzip2 prefers at least 4096
6 *Lzma prefers 0x10000 */
7#define COMPR_IOBUF_SIZE 4096
8
9typedef int (*decompress_fn) (unsigned char *inbuf, int len,
10 int(*fill)(void*, unsigned int),
11 int(*writebb)(void*, unsigned int),
12 unsigned char *output,
13 int *posp,
14 void(*error)(char *x));
15
16/* inbuf - input buffer
17 *len - len of pre-read data in inbuf
18 *fill - function to fill inbuf if empty
19 *writebb - function to write out outbug
20 *posp - if non-null, input position (number of bytes read) will be
21 * returned here
22 *
23 *If len != 0, the inbuf is initialized (with as much data), and fill
24 *should not be called
25 *If len = 0, the inbuf is allocated, but empty. Its size is IOBUF_SIZE
26 *fill should be called (repeatedly...) to read data, at most IOBUF_SIZE
27 */
28
29/* Utility routine to detect the decompression method */
30decompress_fn decompress_method(const unsigned char *inbuf, int len,
31 const char **name);
32
33#endif
diff --git a/include/linux/decompress/inflate.h b/include/linux/decompress/inflate.h
new file mode 100644
index 000000000000..f9b06ccc3e5c
--- /dev/null
+++ b/include/linux/decompress/inflate.h
@@ -0,0 +1,13 @@
1#ifndef INFLATE_H
2#define INFLATE_H
3
4/* Other housekeeping constants */
5#define INBUFSIZ 4096
6
7int gunzip(unsigned char *inbuf, int len,
8 int(*fill)(void*, unsigned int),
9 int(*flush)(void*, unsigned int),
10 unsigned char *output,
11 int *pos,
12 void(*error_fn)(char *x));
13#endif
diff --git a/include/linux/decompress/mm.h b/include/linux/decompress/mm.h
new file mode 100644
index 000000000000..12ff8c3f1d05
--- /dev/null
+++ b/include/linux/decompress/mm.h
@@ -0,0 +1,87 @@
1/*
2 * linux/compr_mm.h
3 *
4 * Memory management for pre-boot and ramdisk uncompressors
5 *
6 * Authors: Alain Knaff <alain@knaff.lu>
7 *
8 */
9
10#ifndef DECOMPR_MM_H
11#define DECOMPR_MM_H
12
13#ifdef STATIC
14
15/* Code active when included from pre-boot environment: */
16
17/* A trivial malloc implementation, adapted from
18 * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994
19 */
20static unsigned long malloc_ptr;
21static int malloc_count;
22
23static void *malloc(int size)
24{
25 void *p;
26
27 if (size < 0)
28 error("Malloc error");
29 if (!malloc_ptr)
30 malloc_ptr = free_mem_ptr;
31
32 malloc_ptr = (malloc_ptr + 3) & ~3; /* Align */
33
34 p = (void *)malloc_ptr;
35 malloc_ptr += size;
36
37 if (free_mem_end_ptr && malloc_ptr >= free_mem_end_ptr)
38 error("Out of memory");
39
40 malloc_count++;
41 return p;
42}
43
44static void free(void *where)
45{
46 malloc_count--;
47 if (!malloc_count)
48 malloc_ptr = free_mem_ptr;
49}
50
51#define large_malloc(a) malloc(a)
52#define large_free(a) free(a)
53
54#define set_error_fn(x)
55
56#define INIT
57
58#else /* STATIC */
59
60/* Code active when compiled standalone for use when loading ramdisk: */
61
62#include <linux/kernel.h>
63#include <linux/fs.h>
64#include <linux/string.h>
65#include <linux/vmalloc.h>
66
67/* Use defines rather than static inline in order to avoid spurious
68 * warnings when not needed (indeed large_malloc / large_free are not
69 * needed by inflate */
70
71#define malloc(a) kmalloc(a, GFP_KERNEL)
72#define free(a) kfree(a)
73
74#define large_malloc(a) vmalloc(a)
75#define large_free(a) vfree(a)
76
77static void(*error)(char *m);
78#define set_error_fn(x) error = x;
79
80#define INIT __init
81#define STATIC
82
83#include <linux/init.h>
84
85#endif /* STATIC */
86
87#endif /* DECOMPR_MM_H */
diff --git a/include/linux/decompress/unlzma.h b/include/linux/decompress/unlzma.h
new file mode 100644
index 000000000000..7796538f1bf4
--- /dev/null
+++ b/include/linux/decompress/unlzma.h
@@ -0,0 +1,12 @@
1#ifndef DECOMPRESS_UNLZMA_H
2#define DECOMPRESS_UNLZMA_H
3
4int unlzma(unsigned char *, int,
5 int(*fill)(void*, unsigned int),
6 int(*flush)(void*, unsigned int),
7 unsigned char *output,
8 int *posp,
9 void(*error)(char *x)
10 );
11
12#endif
diff --git a/include/linux/elfcore.h b/include/linux/elfcore.h
index 5ca54d77079f..7605c5e9589f 100644
--- a/include/linux/elfcore.h
+++ b/include/linux/elfcore.h
@@ -111,6 +111,15 @@ static inline void elf_core_copy_regs(elf_gregset_t *elfregs, struct pt_regs *re
111#endif 111#endif
112} 112}
113 113
114static inline void elf_core_copy_kernel_regs(elf_gregset_t *elfregs, struct pt_regs *regs)
115{
116#ifdef ELF_CORE_COPY_KERNEL_REGS
117 ELF_CORE_COPY_KERNEL_REGS((*elfregs), regs);
118#else
119 elf_core_copy_regs(elfregs, regs);
120#endif
121}
122
114static inline int elf_core_copy_task_regs(struct task_struct *t, elf_gregset_t* elfregs) 123static inline int elf_core_copy_task_regs(struct task_struct *t, elf_gregset_t* elfregs)
115{ 124{
116#ifdef ELF_CORE_COPY_TASK_REGS 125#ifdef ELF_CORE_COPY_TASK_REGS
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 677432b9cb7e..db3fed630db3 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -1,15 +1,18 @@
1#ifndef _LINUX_FTRACE_H 1#ifndef _LINUX_FTRACE_H
2#define _LINUX_FTRACE_H 2#define _LINUX_FTRACE_H
3 3
4#include <linux/linkage.h> 4#include <linux/trace_clock.h>
5#include <linux/fs.h>
6#include <linux/ktime.h>
7#include <linux/init.h>
8#include <linux/types.h>
9#include <linux/module.h>
10#include <linux/kallsyms.h> 5#include <linux/kallsyms.h>
6#include <linux/linkage.h>
11#include <linux/bitops.h> 7#include <linux/bitops.h>
8#include <linux/module.h>
9#include <linux/ktime.h>
12#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/types.h>
12#include <linux/init.h>
13#include <linux/fs.h>
14
15#include <asm/ftrace.h>
13 16
14#ifdef CONFIG_FUNCTION_TRACER 17#ifdef CONFIG_FUNCTION_TRACER
15 18
@@ -95,9 +98,41 @@ stack_trace_sysctl(struct ctl_table *table, int write,
95 loff_t *ppos); 98 loff_t *ppos);
96#endif 99#endif
97 100
101struct ftrace_func_command {
102 struct list_head list;
103 char *name;
104 int (*func)(char *func, char *cmd,
105 char *params, int enable);
106};
107
98#ifdef CONFIG_DYNAMIC_FTRACE 108#ifdef CONFIG_DYNAMIC_FTRACE
99/* asm/ftrace.h must be defined for archs supporting dynamic ftrace */ 109
100#include <asm/ftrace.h> 110int ftrace_arch_code_modify_prepare(void);
111int ftrace_arch_code_modify_post_process(void);
112
113struct seq_file;
114
115struct ftrace_probe_ops {
116 void (*func)(unsigned long ip,
117 unsigned long parent_ip,
118 void **data);
119 int (*callback)(unsigned long ip, void **data);
120 void (*free)(void **data);
121 int (*print)(struct seq_file *m,
122 unsigned long ip,
123 struct ftrace_probe_ops *ops,
124 void *data);
125};
126
127extern int
128register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
129 void *data);
130extern void
131unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
132 void *data);
133extern void
134unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops);
135extern void unregister_ftrace_function_probe_all(char *glob);
101 136
102enum { 137enum {
103 FTRACE_FL_FREE = (1 << 0), 138 FTRACE_FL_FREE = (1 << 0),
@@ -110,7 +145,6 @@ enum {
110}; 145};
111 146
112struct dyn_ftrace { 147struct dyn_ftrace {
113 struct list_head list;
114 unsigned long ip; /* address of mcount call-site */ 148 unsigned long ip; /* address of mcount call-site */
115 unsigned long flags; 149 unsigned long flags;
116 struct dyn_arch_ftrace arch; 150 struct dyn_arch_ftrace arch;
@@ -119,6 +153,9 @@ struct dyn_ftrace {
119int ftrace_force_update(void); 153int ftrace_force_update(void);
120void ftrace_set_filter(unsigned char *buf, int len, int reset); 154void ftrace_set_filter(unsigned char *buf, int len, int reset);
121 155
156int register_ftrace_command(struct ftrace_func_command *cmd);
157int unregister_ftrace_command(struct ftrace_func_command *cmd);
158
122/* defined in arch */ 159/* defined in arch */
123extern int ftrace_ip_converted(unsigned long ip); 160extern int ftrace_ip_converted(unsigned long ip);
124extern int ftrace_dyn_arch_init(void *data); 161extern int ftrace_dyn_arch_init(void *data);
@@ -126,6 +163,10 @@ extern int ftrace_update_ftrace_func(ftrace_func_t func);
126extern void ftrace_caller(void); 163extern void ftrace_caller(void);
127extern void ftrace_call(void); 164extern void ftrace_call(void);
128extern void mcount_call(void); 165extern void mcount_call(void);
166
167#ifndef FTRACE_ADDR
168#define FTRACE_ADDR ((unsigned long)ftrace_caller)
169#endif
129#ifdef CONFIG_FUNCTION_GRAPH_TRACER 170#ifdef CONFIG_FUNCTION_GRAPH_TRACER
130extern void ftrace_graph_caller(void); 171extern void ftrace_graph_caller(void);
131extern int ftrace_enable_ftrace_graph_caller(void); 172extern int ftrace_enable_ftrace_graph_caller(void);
@@ -136,7 +177,7 @@ static inline int ftrace_disable_ftrace_graph_caller(void) { return 0; }
136#endif 177#endif
137 178
138/** 179/**
139 * ftrace_make_nop - convert code into top 180 * ftrace_make_nop - convert code into nop
140 * @mod: module structure if called by module load initialization 181 * @mod: module structure if called by module load initialization
141 * @rec: the mcount call site record 182 * @rec: the mcount call site record
142 * @addr: the address that the call site should be calling 183 * @addr: the address that the call site should be calling
@@ -181,7 +222,6 @@ extern int ftrace_make_nop(struct module *mod,
181 */ 222 */
182extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr); 223extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr);
183 224
184
185/* May be defined in arch */ 225/* May be defined in arch */
186extern int ftrace_arch_read_dyn_info(char *buf, int size); 226extern int ftrace_arch_read_dyn_info(char *buf, int size);
187 227
@@ -198,6 +238,14 @@ extern void ftrace_enable_daemon(void);
198# define ftrace_disable_daemon() do { } while (0) 238# define ftrace_disable_daemon() do { } while (0)
199# define ftrace_enable_daemon() do { } while (0) 239# define ftrace_enable_daemon() do { } while (0)
200static inline void ftrace_release(void *start, unsigned long size) { } 240static inline void ftrace_release(void *start, unsigned long size) { }
241static inline int register_ftrace_command(struct ftrace_func_command *cmd)
242{
243 return -EINVAL;
244}
245static inline int unregister_ftrace_command(char *cmd_name)
246{
247 return -EINVAL;
248}
201#endif /* CONFIG_DYNAMIC_FTRACE */ 249#endif /* CONFIG_DYNAMIC_FTRACE */
202 250
203/* totally disable ftrace - can not re-enable after this */ 251/* totally disable ftrace - can not re-enable after this */
@@ -233,24 +281,25 @@ static inline void __ftrace_enabled_restore(int enabled)
233#endif 281#endif
234} 282}
235 283
236#ifdef CONFIG_FRAME_POINTER 284#ifndef HAVE_ARCH_CALLER_ADDR
237/* TODO: need to fix this for ARM */ 285# ifdef CONFIG_FRAME_POINTER
238# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) 286# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
239# define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1)) 287# define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1))
240# define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2)) 288# define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2))
241# define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3)) 289# define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3))
242# define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4)) 290# define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4))
243# define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5)) 291# define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5))
244# define CALLER_ADDR6 ((unsigned long)__builtin_return_address(6)) 292# define CALLER_ADDR6 ((unsigned long)__builtin_return_address(6))
245#else 293# else
246# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) 294# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
247# define CALLER_ADDR1 0UL 295# define CALLER_ADDR1 0UL
248# define CALLER_ADDR2 0UL 296# define CALLER_ADDR2 0UL
249# define CALLER_ADDR3 0UL 297# define CALLER_ADDR3 0UL
250# define CALLER_ADDR4 0UL 298# define CALLER_ADDR4 0UL
251# define CALLER_ADDR5 0UL 299# define CALLER_ADDR5 0UL
252# define CALLER_ADDR6 0UL 300# define CALLER_ADDR6 0UL
253#endif 301# endif
302#endif /* ifndef HAVE_ARCH_CALLER_ADDR */
254 303
255#ifdef CONFIG_IRQSOFF_TRACER 304#ifdef CONFIG_IRQSOFF_TRACER
256 extern void time_hardirqs_on(unsigned long a0, unsigned long a1); 305 extern void time_hardirqs_on(unsigned long a0, unsigned long a1);
@@ -268,54 +317,6 @@ static inline void __ftrace_enabled_restore(int enabled)
268# define trace_preempt_off(a0, a1) do { } while (0) 317# define trace_preempt_off(a0, a1) do { } while (0)
269#endif 318#endif
270 319
271#ifdef CONFIG_TRACING
272extern int ftrace_dump_on_oops;
273
274extern void tracing_start(void);
275extern void tracing_stop(void);
276extern void ftrace_off_permanent(void);
277
278extern void
279ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3);
280
281/**
282 * ftrace_printk - printf formatting in the ftrace buffer
283 * @fmt: the printf format for printing
284 *
285 * Note: __ftrace_printk is an internal function for ftrace_printk and
286 * the @ip is passed in via the ftrace_printk macro.
287 *
288 * This function allows a kernel developer to debug fast path sections
289 * that printk is not appropriate for. By scattering in various
290 * printk like tracing in the code, a developer can quickly see
291 * where problems are occurring.
292 *
293 * This is intended as a debugging tool for the developer only.
294 * Please refrain from leaving ftrace_printks scattered around in
295 * your code.
296 */
297# define ftrace_printk(fmt...) __ftrace_printk(_THIS_IP_, fmt)
298extern int
299__ftrace_printk(unsigned long ip, const char *fmt, ...)
300 __attribute__ ((format (printf, 2, 3)));
301extern void ftrace_dump(void);
302#else
303static inline void
304ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { }
305static inline int
306ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2)));
307
308static inline void tracing_start(void) { }
309static inline void tracing_stop(void) { }
310static inline void ftrace_off_permanent(void) { }
311static inline int
312ftrace_printk(const char *fmt, ...)
313{
314 return 0;
315}
316static inline void ftrace_dump(void) { }
317#endif
318
319#ifdef CONFIG_FTRACE_MCOUNT_RECORD 320#ifdef CONFIG_FTRACE_MCOUNT_RECORD
320extern void ftrace_init(void); 321extern void ftrace_init(void);
321extern void ftrace_init_module(struct module *mod, 322extern void ftrace_init_module(struct module *mod,
@@ -327,36 +328,6 @@ ftrace_init_module(struct module *mod,
327 unsigned long *start, unsigned long *end) { } 328 unsigned long *start, unsigned long *end) { }
328#endif 329#endif
329 330
330enum {
331 POWER_NONE = 0,
332 POWER_CSTATE = 1,
333 POWER_PSTATE = 2,
334};
335
336struct power_trace {
337#ifdef CONFIG_POWER_TRACER
338 ktime_t stamp;
339 ktime_t end;
340 int type;
341 int state;
342#endif
343};
344
345#ifdef CONFIG_POWER_TRACER
346extern void trace_power_start(struct power_trace *it, unsigned int type,
347 unsigned int state);
348extern void trace_power_mark(struct power_trace *it, unsigned int type,
349 unsigned int state);
350extern void trace_power_end(struct power_trace *it);
351#else
352static inline void trace_power_start(struct power_trace *it, unsigned int type,
353 unsigned int state) { }
354static inline void trace_power_mark(struct power_trace *it, unsigned int type,
355 unsigned int state) { }
356static inline void trace_power_end(struct power_trace *it) { }
357#endif
358
359
360/* 331/*
361 * Structure that defines an entry function trace. 332 * Structure that defines an entry function trace.
362 */ 333 */
@@ -380,6 +351,30 @@ struct ftrace_graph_ret {
380#ifdef CONFIG_FUNCTION_GRAPH_TRACER 351#ifdef CONFIG_FUNCTION_GRAPH_TRACER
381 352
382/* 353/*
354 * Stack of return addresses for functions
355 * of a thread.
356 * Used in struct thread_info
357 */
358struct ftrace_ret_stack {
359 unsigned long ret;
360 unsigned long func;
361 unsigned long long calltime;
362};
363
364/*
365 * Primary handler of a function return.
366 * It relays on ftrace_return_to_handler.
367 * Defined in entry_32/64.S
368 */
369extern void return_to_handler(void);
370
371extern int
372ftrace_push_return_trace(unsigned long ret, unsigned long long time,
373 unsigned long func, int *depth);
374extern void
375ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret);
376
377/*
383 * Sometimes we don't want to trace a function with the function 378 * Sometimes we don't want to trace a function with the function
384 * graph tracer but we want them to keep traced by the usual function 379 * graph tracer but we want them to keep traced by the usual function
385 * tracer if the function graph tracer is not configured. 380 * tracer if the function graph tracer is not configured.
@@ -490,6 +485,50 @@ static inline int test_tsk_trace_graph(struct task_struct *tsk)
490 return tsk->trace & TSK_TRACE_FL_GRAPH; 485 return tsk->trace & TSK_TRACE_FL_GRAPH;
491} 486}
492 487
488extern int ftrace_dump_on_oops;
489
493#endif /* CONFIG_TRACING */ 490#endif /* CONFIG_TRACING */
494 491
492
493#ifdef CONFIG_HW_BRANCH_TRACER
494
495void trace_hw_branch(u64 from, u64 to);
496void trace_hw_branch_oops(void);
497
498#else /* CONFIG_HW_BRANCH_TRACER */
499
500static inline void trace_hw_branch(u64 from, u64 to) {}
501static inline void trace_hw_branch_oops(void) {}
502
503#endif /* CONFIG_HW_BRANCH_TRACER */
504
505/*
506 * A syscall entry in the ftrace syscalls array.
507 *
508 * @name: name of the syscall
509 * @nb_args: number of parameters it takes
510 * @types: list of types as strings
511 * @args: list of args as strings (args[i] matches types[i])
512 */
513struct syscall_metadata {
514 const char *name;
515 int nb_args;
516 const char **types;
517 const char **args;
518};
519
520#ifdef CONFIG_FTRACE_SYSCALLS
521extern void arch_init_ftrace_syscalls(void);
522extern struct syscall_metadata *syscall_nr_to_meta(int nr);
523extern void start_ftrace_syscalls(void);
524extern void stop_ftrace_syscalls(void);
525extern void ftrace_syscall_enter(struct pt_regs *regs);
526extern void ftrace_syscall_exit(struct pt_regs *regs);
527#else
528static inline void start_ftrace_syscalls(void) { }
529static inline void stop_ftrace_syscalls(void) { }
530static inline void ftrace_syscall_enter(struct pt_regs *regs) { }
531static inline void ftrace_syscall_exit(struct pt_regs *regs) { }
532#endif
533
495#endif /* _LINUX_FTRACE_H */ 534#endif /* _LINUX_FTRACE_H */
diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h
index 366a054d0b05..dca7bf8cffe2 100644
--- a/include/linux/ftrace_irq.h
+++ b/include/linux/ftrace_irq.h
@@ -2,7 +2,7 @@
2#define _LINUX_FTRACE_IRQ_H 2#define _LINUX_FTRACE_IRQ_H
3 3
4 4
5#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_FUNCTION_GRAPH_TRACER) 5#ifdef CONFIG_FTRACE_NMI_ENTER
6extern void ftrace_nmi_enter(void); 6extern void ftrace_nmi_enter(void);
7extern void ftrace_nmi_exit(void); 7extern void ftrace_nmi_exit(void);
8#else 8#else
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index f83288347dda..faa1cf848bcd 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -15,55 +15,61 @@
15 * - bits 0-7 are the preemption count (max preemption depth: 256) 15 * - bits 0-7 are the preemption count (max preemption depth: 256)
16 * - bits 8-15 are the softirq count (max # of softirqs: 256) 16 * - bits 8-15 are the softirq count (max # of softirqs: 256)
17 * 17 *
18 * The hardirq count can be overridden per architecture, the default is: 18 * The hardirq count can in theory reach the same as NR_IRQS.
19 * In reality, the number of nested IRQS is limited to the stack
20 * size as well. For archs with over 1000 IRQS it is not practical
21 * to expect that they will all nest. We give a max of 10 bits for
22 * hardirq nesting. An arch may choose to give less than 10 bits.
23 * m68k expects it to be 8.
19 * 24 *
20 * - bits 16-27 are the hardirq count (max # of hardirqs: 4096) 25 * - bits 16-25 are the hardirq count (max # of nested hardirqs: 1024)
21 * - ( bit 28 is the PREEMPT_ACTIVE flag. ) 26 * - bit 26 is the NMI_MASK
27 * - bit 28 is the PREEMPT_ACTIVE flag
22 * 28 *
23 * PREEMPT_MASK: 0x000000ff 29 * PREEMPT_MASK: 0x000000ff
24 * SOFTIRQ_MASK: 0x0000ff00 30 * SOFTIRQ_MASK: 0x0000ff00
25 * HARDIRQ_MASK: 0x0fff0000 31 * HARDIRQ_MASK: 0x03ff0000
32 * NMI_MASK: 0x04000000
26 */ 33 */
27#define PREEMPT_BITS 8 34#define PREEMPT_BITS 8
28#define SOFTIRQ_BITS 8 35#define SOFTIRQ_BITS 8
36#define NMI_BITS 1
29 37
30#ifndef HARDIRQ_BITS 38#define MAX_HARDIRQ_BITS 10
31#define HARDIRQ_BITS 12
32 39
33#ifndef MAX_HARDIRQS_PER_CPU 40#ifndef HARDIRQ_BITS
34#define MAX_HARDIRQS_PER_CPU NR_IRQS 41# define HARDIRQ_BITS MAX_HARDIRQ_BITS
35#endif 42#endif
36 43
37/* 44#if HARDIRQ_BITS > MAX_HARDIRQ_BITS
38 * The hardirq mask has to be large enough to have space for potentially 45#error HARDIRQ_BITS too high!
39 * all IRQ sources in the system nesting on a single CPU.
40 */
41#if (1 << HARDIRQ_BITS) < MAX_HARDIRQS_PER_CPU
42# error HARDIRQ_BITS is too low!
43#endif
44#endif 46#endif
45 47
46#define PREEMPT_SHIFT 0 48#define PREEMPT_SHIFT 0
47#define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS) 49#define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS)
48#define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS) 50#define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS)
51#define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS)
49 52
50#define __IRQ_MASK(x) ((1UL << (x))-1) 53#define __IRQ_MASK(x) ((1UL << (x))-1)
51 54
52#define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT) 55#define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
53#define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT) 56#define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
54#define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT) 57#define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
58#define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT)
55 59
56#define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT) 60#define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT)
57#define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT) 61#define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT)
58#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) 62#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
63#define NMI_OFFSET (1UL << NMI_SHIFT)
59 64
60#if PREEMPT_ACTIVE < (1 << (HARDIRQ_SHIFT + HARDIRQ_BITS)) 65#if PREEMPT_ACTIVE < (1 << (NMI_SHIFT + NMI_BITS))
61#error PREEMPT_ACTIVE is too low! 66#error PREEMPT_ACTIVE is too low!
62#endif 67#endif
63 68
64#define hardirq_count() (preempt_count() & HARDIRQ_MASK) 69#define hardirq_count() (preempt_count() & HARDIRQ_MASK)
65#define softirq_count() (preempt_count() & SOFTIRQ_MASK) 70#define softirq_count() (preempt_count() & SOFTIRQ_MASK)
66#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK)) 71#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
72 | NMI_MASK))
67 73
68/* 74/*
69 * Are we doing bottom half or hardware interrupt processing? 75 * Are we doing bottom half or hardware interrupt processing?
@@ -73,6 +79,11 @@
73#define in_softirq() (softirq_count()) 79#define in_softirq() (softirq_count())
74#define in_interrupt() (irq_count()) 80#define in_interrupt() (irq_count())
75 81
82/*
83 * Are we in NMI context?
84 */
85#define in_nmi() (preempt_count() & NMI_MASK)
86
76#if defined(CONFIG_PREEMPT) 87#if defined(CONFIG_PREEMPT)
77# define PREEMPT_INATOMIC_BASE kernel_locked() 88# define PREEMPT_INATOMIC_BASE kernel_locked()
78# define PREEMPT_CHECK_OFFSET 1 89# define PREEMPT_CHECK_OFFSET 1
@@ -164,20 +175,24 @@ extern void irq_enter(void);
164 */ 175 */
165extern void irq_exit(void); 176extern void irq_exit(void);
166 177
167#define nmi_enter() \ 178#define nmi_enter() \
168 do { \ 179 do { \
169 ftrace_nmi_enter(); \ 180 ftrace_nmi_enter(); \
170 lockdep_off(); \ 181 BUG_ON(in_nmi()); \
171 rcu_nmi_enter(); \ 182 add_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \
172 __irq_enter(); \ 183 lockdep_off(); \
184 rcu_nmi_enter(); \
185 trace_hardirq_enter(); \
173 } while (0) 186 } while (0)
174 187
175#define nmi_exit() \ 188#define nmi_exit() \
176 do { \ 189 do { \
177 __irq_exit(); \ 190 trace_hardirq_exit(); \
178 rcu_nmi_exit(); \ 191 rcu_nmi_exit(); \
179 lockdep_on(); \ 192 lockdep_on(); \
180 ftrace_nmi_exit(); \ 193 BUG_ON(!in_nmi()); \
194 sub_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \
195 ftrace_nmi_exit(); \
181 } while (0) 196 } while (0)
182 197
183#endif /* LINUX_HARDIRQ_H */ 198#endif /* LINUX_HARDIRQ_H */
diff --git a/include/linux/in6.h b/include/linux/in6.h
index bc492048c349..718bf21c5754 100644
--- a/include/linux/in6.h
+++ b/include/linux/in6.h
@@ -44,11 +44,11 @@ struct in6_addr
44 * NOTE: Be aware the IN6ADDR_* constants and in6addr_* externals are defined 44 * NOTE: Be aware the IN6ADDR_* constants and in6addr_* externals are defined
45 * in network byte order, not in host byte order as are the IPv4 equivalents 45 * in network byte order, not in host byte order as are the IPv4 equivalents
46 */ 46 */
47#ifdef __KERNEL__
47extern const struct in6_addr in6addr_any; 48extern const struct in6_addr in6addr_any;
48#define IN6ADDR_ANY_INIT { { { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } } } 49#define IN6ADDR_ANY_INIT { { { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } } }
49extern const struct in6_addr in6addr_loopback; 50extern const struct in6_addr in6addr_loopback;
50#define IN6ADDR_LOOPBACK_INIT { { { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 } } } 51#define IN6ADDR_LOOPBACK_INIT { { { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 } } }
51#ifdef __KERNEL__
52extern const struct in6_addr in6addr_linklocal_allnodes; 52extern const struct in6_addr in6addr_linklocal_allnodes;
53#define IN6ADDR_LINKLOCAL_ALLNODES_INIT \ 53#define IN6ADDR_LINKLOCAL_ALLNODES_INIT \
54 { { { 0xff,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1 } } } 54 { { { 0xff,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1 } } }
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 9127f6b51a39..9b7e9d743476 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -258,6 +258,11 @@ enum
258 NR_SOFTIRQS 258 NR_SOFTIRQS
259}; 259};
260 260
261/* map softirq index to softirq name. update 'softirq_to_name' in
262 * kernel/softirq.c when adding a new softirq.
263 */
264extern char *softirq_to_name[NR_SOFTIRQS];
265
261/* softirq mask and active fields moved to irq_cpustat_t in 266/* softirq mask and active fields moved to irq_cpustat_t in
262 * asm/hardirq.h to get better cache usage. KAO 267 * asm/hardirq.h to get better cache usage. KAO
263 */ 268 */
@@ -467,6 +472,7 @@ int show_interrupts(struct seq_file *p, void *v);
467struct irq_desc; 472struct irq_desc;
468 473
469extern int early_irq_init(void); 474extern int early_irq_init(void);
475extern int arch_probe_nr_irqs(void);
470extern int arch_early_irq_init(void); 476extern int arch_early_irq_init(void);
471extern int arch_init_chip_data(struct irq_desc *desc, int cpu); 477extern int arch_init_chip_data(struct irq_desc *desc, int cpu);
472 478
diff --git a/include/linux/irq.h b/include/linux/irq.h
index f899b502f186..27a67536511e 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -182,11 +182,11 @@ struct irq_desc {
182 unsigned int irqs_unhandled; 182 unsigned int irqs_unhandled;
183 spinlock_t lock; 183 spinlock_t lock;
184#ifdef CONFIG_SMP 184#ifdef CONFIG_SMP
185 cpumask_t affinity; 185 cpumask_var_t affinity;
186 unsigned int cpu; 186 unsigned int cpu;
187#endif
188#ifdef CONFIG_GENERIC_PENDING_IRQ 187#ifdef CONFIG_GENERIC_PENDING_IRQ
189 cpumask_t pending_mask; 188 cpumask_var_t pending_mask;
189#endif
190#endif 190#endif
191#ifdef CONFIG_PROC_FS 191#ifdef CONFIG_PROC_FS
192 struct proc_dir_entry *dir; 192 struct proc_dir_entry *dir;
@@ -422,4 +422,84 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry);
422 422
423#endif /* !CONFIG_S390 */ 423#endif /* !CONFIG_S390 */
424 424
425#ifdef CONFIG_SMP
426/**
427 * init_alloc_desc_masks - allocate cpumasks for irq_desc
428 * @desc: pointer to irq_desc struct
429 * @cpu: cpu which will be handling the cpumasks
430 * @boot: true if need bootmem
431 *
432 * Allocates affinity and pending_mask cpumask if required.
433 * Returns true if successful (or not required).
434 * Side effect: affinity has all bits set, pending_mask has all bits clear.
435 */
436static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu,
437 bool boot)
438{
439 int node;
440
441 if (boot) {
442 alloc_bootmem_cpumask_var(&desc->affinity);
443 cpumask_setall(desc->affinity);
444
445#ifdef CONFIG_GENERIC_PENDING_IRQ
446 alloc_bootmem_cpumask_var(&desc->pending_mask);
447 cpumask_clear(desc->pending_mask);
448#endif
449 return true;
450 }
451
452 node = cpu_to_node(cpu);
453
454 if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node))
455 return false;
456 cpumask_setall(desc->affinity);
457
458#ifdef CONFIG_GENERIC_PENDING_IRQ
459 if (!alloc_cpumask_var_node(&desc->pending_mask, GFP_ATOMIC, node)) {
460 free_cpumask_var(desc->affinity);
461 return false;
462 }
463 cpumask_clear(desc->pending_mask);
464#endif
465 return true;
466}
467
468/**
469 * init_copy_desc_masks - copy cpumasks for irq_desc
470 * @old_desc: pointer to old irq_desc struct
471 * @new_desc: pointer to new irq_desc struct
472 *
473 * Insures affinity and pending_masks are copied to new irq_desc.
474 * If !CONFIG_CPUMASKS_OFFSTACK the cpumasks are embedded in the
475 * irq_desc struct so the copy is redundant.
476 */
477
478static inline void init_copy_desc_masks(struct irq_desc *old_desc,
479 struct irq_desc *new_desc)
480{
481#ifdef CONFIG_CPUMASKS_OFFSTACK
482 cpumask_copy(new_desc->affinity, old_desc->affinity);
483
484#ifdef CONFIG_GENERIC_PENDING_IRQ
485 cpumask_copy(new_desc->pending_mask, old_desc->pending_mask);
486#endif
487#endif
488}
489
490#else /* !CONFIG_SMP */
491
492static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu,
493 bool boot)
494{
495 return true;
496}
497
498static inline void init_copy_desc_masks(struct irq_desc *old_desc,
499 struct irq_desc *new_desc)
500{
501}
502
503#endif /* CONFIG_SMP */
504
425#endif /* _LINUX_IRQ_H */ 505#endif /* _LINUX_IRQ_H */
diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index 74bde13224c9..b02a3f1d46a0 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -24,8 +24,8 @@
24# define trace_softirqs_enabled(p) ((p)->softirqs_enabled) 24# define trace_softirqs_enabled(p) ((p)->softirqs_enabled)
25# define trace_hardirq_enter() do { current->hardirq_context++; } while (0) 25# define trace_hardirq_enter() do { current->hardirq_context++; } while (0)
26# define trace_hardirq_exit() do { current->hardirq_context--; } while (0) 26# define trace_hardirq_exit() do { current->hardirq_context--; } while (0)
27# define trace_softirq_enter() do { current->softirq_context++; } while (0) 27# define lockdep_softirq_enter() do { current->softirq_context++; } while (0)
28# define trace_softirq_exit() do { current->softirq_context--; } while (0) 28# define lockdep_softirq_exit() do { current->softirq_context--; } while (0)
29# define INIT_TRACE_IRQFLAGS .softirqs_enabled = 1, 29# define INIT_TRACE_IRQFLAGS .softirqs_enabled = 1,
30#else 30#else
31# define trace_hardirqs_on() do { } while (0) 31# define trace_hardirqs_on() do { } while (0)
@@ -38,8 +38,8 @@
38# define trace_softirqs_enabled(p) 0 38# define trace_softirqs_enabled(p) 0
39# define trace_hardirq_enter() do { } while (0) 39# define trace_hardirq_enter() do { } while (0)
40# define trace_hardirq_exit() do { } while (0) 40# define trace_hardirq_exit() do { } while (0)
41# define trace_softirq_enter() do { } while (0) 41# define lockdep_softirq_enter() do { } while (0)
42# define trace_softirq_exit() do { } while (0) 42# define lockdep_softirq_exit() do { } while (0)
43# define INIT_TRACE_IRQFLAGS 43# define INIT_TRACE_IRQFLAGS
44#endif 44#endif
45 45
diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h
index 86af92e9e84c..887477bc2ab0 100644
--- a/include/linux/irqnr.h
+++ b/include/linux/irqnr.h
@@ -20,6 +20,7 @@
20 20
21# define for_each_irq_desc_reverse(irq, desc) \ 21# define for_each_irq_desc_reverse(irq, desc) \
22 for (irq = nr_irqs - 1; irq >= 0; irq--) 22 for (irq = nr_irqs - 1; irq >= 0; irq--)
23
23#else /* CONFIG_GENERIC_HARDIRQS */ 24#else /* CONFIG_GENERIC_HARDIRQS */
24 25
25extern int nr_irqs; 26extern int nr_irqs;
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 7fa371898e3e..1daca3b062bb 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -242,6 +242,19 @@ extern struct ratelimit_state printk_ratelimit_state;
242extern int printk_ratelimit(void); 242extern int printk_ratelimit(void);
243extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, 243extern bool printk_timed_ratelimit(unsigned long *caller_jiffies,
244 unsigned int interval_msec); 244 unsigned int interval_msec);
245
246/*
247 * Print a one-time message (analogous to WARN_ONCE() et al):
248 */
249#define printk_once(x...) ({ \
250 static int __print_once = 1; \
251 \
252 if (__print_once) { \
253 __print_once = 0; \
254 printk(x); \
255 } \
256})
257
245#else 258#else
246static inline int vprintk(const char *s, va_list args) 259static inline int vprintk(const char *s, va_list args)
247 __attribute__ ((format (printf, 1, 0))); 260 __attribute__ ((format (printf, 1, 0)));
@@ -253,6 +266,10 @@ static inline int printk_ratelimit(void) { return 0; }
253static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, \ 266static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, \
254 unsigned int interval_msec) \ 267 unsigned int interval_msec) \
255 { return false; } 268 { return false; }
269
270/* No effect, but we still get type checking even in the !PRINTK case: */
271#define printk_once(x...) printk(x)
272
256#endif 273#endif
257 274
258extern int printk_needs_cpu(int cpu); 275extern int printk_needs_cpu(int cpu);
@@ -368,6 +385,139 @@ static inline char *pack_hex_byte(char *buf, u8 byte)
368#endif 385#endif
369 386
370/* 387/*
388 * General tracing related utility functions - trace_printk(),
389 * tracing_on/tracing_off and tracing_start()/tracing_stop
390 *
391 * Use tracing_on/tracing_off when you want to quickly turn on or off
392 * tracing. It simply enables or disables the recording of the trace events.
393 * This also corresponds to the user space debugfs/tracing/tracing_on
394 * file, which gives a means for the kernel and userspace to interact.
395 * Place a tracing_off() in the kernel where you want tracing to end.
396 * From user space, examine the trace, and then echo 1 > tracing_on
397 * to continue tracing.
398 *
399 * tracing_stop/tracing_start has slightly more overhead. It is used
400 * by things like suspend to ram where disabling the recording of the
401 * trace is not enough, but tracing must actually stop because things
402 * like calling smp_processor_id() may crash the system.
403 *
404 * Most likely, you want to use tracing_on/tracing_off.
405 */
406#ifdef CONFIG_RING_BUFFER
407void tracing_on(void);
408void tracing_off(void);
409/* trace_off_permanent stops recording with no way to bring it back */
410void tracing_off_permanent(void);
411int tracing_is_on(void);
412#else
413static inline void tracing_on(void) { }
414static inline void tracing_off(void) { }
415static inline void tracing_off_permanent(void) { }
416static inline int tracing_is_on(void) { return 0; }
417#endif
418#ifdef CONFIG_TRACING
419extern void tracing_start(void);
420extern void tracing_stop(void);
421extern void ftrace_off_permanent(void);
422
423extern void
424ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3);
425
426static inline void __attribute__ ((format (printf, 1, 2)))
427____trace_printk_check_format(const char *fmt, ...)
428{
429}
430#define __trace_printk_check_format(fmt, args...) \
431do { \
432 if (0) \
433 ____trace_printk_check_format(fmt, ##args); \
434} while (0)
435
436/**
437 * trace_printk - printf formatting in the ftrace buffer
438 * @fmt: the printf format for printing
439 *
440 * Note: __trace_printk is an internal function for trace_printk and
441 * the @ip is passed in via the trace_printk macro.
442 *
443 * This function allows a kernel developer to debug fast path sections
444 * that printk is not appropriate for. By scattering in various
445 * printk like tracing in the code, a developer can quickly see
446 * where problems are occurring.
447 *
448 * This is intended as a debugging tool for the developer only.
449 * Please refrain from leaving trace_printks scattered around in
450 * your code.
451 */
452
453#define trace_printk(fmt, args...) \
454do { \
455 __trace_printk_check_format(fmt, ##args); \
456 if (__builtin_constant_p(fmt)) { \
457 static const char *trace_printk_fmt \
458 __attribute__((section("__trace_printk_fmt"))) = \
459 __builtin_constant_p(fmt) ? fmt : NULL; \
460 \
461 __trace_bprintk(_THIS_IP_, trace_printk_fmt, ##args); \
462 } else \
463 __trace_printk(_THIS_IP_, fmt, ##args); \
464} while (0)
465
466extern int
467__trace_bprintk(unsigned long ip, const char *fmt, ...)
468 __attribute__ ((format (printf, 2, 3)));
469
470extern int
471__trace_printk(unsigned long ip, const char *fmt, ...)
472 __attribute__ ((format (printf, 2, 3)));
473
474/*
475 * The double __builtin_constant_p is because gcc will give us an error
476 * if we try to allocate the static variable to fmt if it is not a
477 * constant. Even with the outer if statement.
478 */
479#define ftrace_vprintk(fmt, vargs) \
480do { \
481 if (__builtin_constant_p(fmt)) { \
482 static const char *trace_printk_fmt \
483 __attribute__((section("__trace_printk_fmt"))) = \
484 __builtin_constant_p(fmt) ? fmt : NULL; \
485 \
486 __ftrace_vbprintk(_THIS_IP_, trace_printk_fmt, vargs); \
487 } else \
488 __ftrace_vprintk(_THIS_IP_, fmt, vargs); \
489} while (0)
490
491extern int
492__ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap);
493
494extern int
495__ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap);
496
497extern void ftrace_dump(void);
498#else
499static inline void
500ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { }
501static inline int
502trace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2)));
503
504static inline void tracing_start(void) { }
505static inline void tracing_stop(void) { }
506static inline void ftrace_off_permanent(void) { }
507static inline int
508trace_printk(const char *fmt, ...)
509{
510 return 0;
511}
512static inline int
513ftrace_vprintk(const char *fmt, va_list ap)
514{
515 return 0;
516}
517static inline void ftrace_dump(void) { }
518#endif /* CONFIG_TRACING */
519
520/*
371 * Display an IP address in readable format. 521 * Display an IP address in readable format.
372 */ 522 */
373 523
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 32851eef48f0..2ec6cc14a114 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -182,6 +182,14 @@ struct kprobe_blackpoint {
182DECLARE_PER_CPU(struct kprobe *, current_kprobe); 182DECLARE_PER_CPU(struct kprobe *, current_kprobe);
183DECLARE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); 183DECLARE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
184 184
185/*
186 * For #ifdef avoidance:
187 */
188static inline int kprobes_built_in(void)
189{
190 return 1;
191}
192
185#ifdef CONFIG_KRETPROBES 193#ifdef CONFIG_KRETPROBES
186extern void arch_prepare_kretprobe(struct kretprobe_instance *ri, 194extern void arch_prepare_kretprobe(struct kretprobe_instance *ri,
187 struct pt_regs *regs); 195 struct pt_regs *regs);
@@ -271,8 +279,16 @@ void unregister_kretprobes(struct kretprobe **rps, int num);
271void kprobe_flush_task(struct task_struct *tk); 279void kprobe_flush_task(struct task_struct *tk);
272void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head); 280void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head);
273 281
274#else /* CONFIG_KPROBES */ 282#else /* !CONFIG_KPROBES: */
275 283
284static inline int kprobes_built_in(void)
285{
286 return 0;
287}
288static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
289{
290 return 0;
291}
276static inline struct kprobe *get_kprobe(void *addr) 292static inline struct kprobe *get_kprobe(void *addr)
277{ 293{
278 return NULL; 294 return NULL;
@@ -329,5 +345,5 @@ static inline void unregister_kretprobes(struct kretprobe **rps, int num)
329static inline void kprobe_flush_task(struct task_struct *tk) 345static inline void kprobe_flush_task(struct task_struct *tk)
330{ 346{
331} 347}
332#endif /* CONFIG_KPROBES */ 348#endif /* CONFIG_KPROBES */
333#endif /* _LINUX_KPROBES_H */ 349#endif /* _LINUX_KPROBES_H */
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 23bf02fb124f..5a58ea3e91e9 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -20,43 +20,10 @@ struct lockdep_map;
20#include <linux/stacktrace.h> 20#include <linux/stacktrace.h>
21 21
22/* 22/*
23 * Lock-class usage-state bits: 23 * We'd rather not expose kernel/lockdep_states.h this wide, but we do need
24 * the total number of states... :-(
24 */ 25 */
25enum lock_usage_bit 26#define XXX_LOCK_USAGE_STATES (1+3*4)
26{
27 LOCK_USED = 0,
28 LOCK_USED_IN_HARDIRQ,
29 LOCK_USED_IN_SOFTIRQ,
30 LOCK_ENABLED_SOFTIRQS,
31 LOCK_ENABLED_HARDIRQS,
32 LOCK_USED_IN_HARDIRQ_READ,
33 LOCK_USED_IN_SOFTIRQ_READ,
34 LOCK_ENABLED_SOFTIRQS_READ,
35 LOCK_ENABLED_HARDIRQS_READ,
36 LOCK_USAGE_STATES
37};
38
39/*
40 * Usage-state bitmasks:
41 */
42#define LOCKF_USED (1 << LOCK_USED)
43#define LOCKF_USED_IN_HARDIRQ (1 << LOCK_USED_IN_HARDIRQ)
44#define LOCKF_USED_IN_SOFTIRQ (1 << LOCK_USED_IN_SOFTIRQ)
45#define LOCKF_ENABLED_HARDIRQS (1 << LOCK_ENABLED_HARDIRQS)
46#define LOCKF_ENABLED_SOFTIRQS (1 << LOCK_ENABLED_SOFTIRQS)
47
48#define LOCKF_ENABLED_IRQS (LOCKF_ENABLED_HARDIRQS | LOCKF_ENABLED_SOFTIRQS)
49#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ)
50
51#define LOCKF_USED_IN_HARDIRQ_READ (1 << LOCK_USED_IN_HARDIRQ_READ)
52#define LOCKF_USED_IN_SOFTIRQ_READ (1 << LOCK_USED_IN_SOFTIRQ_READ)
53#define LOCKF_ENABLED_HARDIRQS_READ (1 << LOCK_ENABLED_HARDIRQS_READ)
54#define LOCKF_ENABLED_SOFTIRQS_READ (1 << LOCK_ENABLED_SOFTIRQS_READ)
55
56#define LOCKF_ENABLED_IRQS_READ \
57 (LOCKF_ENABLED_HARDIRQS_READ | LOCKF_ENABLED_SOFTIRQS_READ)
58#define LOCKF_USED_IN_IRQ_READ \
59 (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ)
60 27
61#define MAX_LOCKDEP_SUBCLASSES 8UL 28#define MAX_LOCKDEP_SUBCLASSES 8UL
62 29
@@ -97,7 +64,7 @@ struct lock_class {
97 * IRQ/softirq usage tracking bits: 64 * IRQ/softirq usage tracking bits:
98 */ 65 */
99 unsigned long usage_mask; 66 unsigned long usage_mask;
100 struct stack_trace usage_traces[LOCK_USAGE_STATES]; 67 struct stack_trace usage_traces[XXX_LOCK_USAGE_STATES];
101 68
102 /* 69 /*
103 * These fields represent a directed graph of lock dependencies, 70 * These fields represent a directed graph of lock dependencies,
@@ -324,7 +291,11 @@ static inline void lock_set_subclass(struct lockdep_map *lock,
324 lock_set_class(lock, lock->name, lock->key, subclass, ip); 291 lock_set_class(lock, lock->name, lock->key, subclass, ip);
325} 292}
326 293
327# define INIT_LOCKDEP .lockdep_recursion = 0, 294extern void lockdep_set_current_reclaim_state(gfp_t gfp_mask);
295extern void lockdep_clear_current_reclaim_state(void);
296extern void lockdep_trace_alloc(gfp_t mask);
297
298# define INIT_LOCKDEP .lockdep_recursion = 0, .lockdep_reclaim_gfp = 0,
328 299
329#define lockdep_depth(tsk) (debug_locks ? (tsk)->lockdep_depth : 0) 300#define lockdep_depth(tsk) (debug_locks ? (tsk)->lockdep_depth : 0)
330 301
@@ -342,6 +313,9 @@ static inline void lockdep_on(void)
342# define lock_release(l, n, i) do { } while (0) 313# define lock_release(l, n, i) do { } while (0)
343# define lock_set_class(l, n, k, s, i) do { } while (0) 314# define lock_set_class(l, n, k, s, i) do { } while (0)
344# define lock_set_subclass(l, s, i) do { } while (0) 315# define lock_set_subclass(l, s, i) do { } while (0)
316# define lockdep_set_current_reclaim_state(g) do { } while (0)
317# define lockdep_clear_current_reclaim_state() do { } while (0)
318# define lockdep_trace_alloc(g) do { } while (0)
345# define lockdep_init() do { } while (0) 319# define lockdep_init() do { } while (0)
346# define lockdep_info() do { } while (0) 320# define lockdep_info() do { } while (0)
347# define lockdep_init_map(lock, name, key, sub) \ 321# define lockdep_init_map(lock, name, key, sub) \
diff --git a/include/linux/magic.h b/include/linux/magic.h
index 0b4df7eba852..5b4e28bcb788 100644
--- a/include/linux/magic.h
+++ b/include/linux/magic.h
@@ -49,4 +49,5 @@
49#define FUTEXFS_SUPER_MAGIC 0xBAD1DEA 49#define FUTEXFS_SUPER_MAGIC 0xBAD1DEA
50#define INOTIFYFS_SUPER_MAGIC 0x2BAD1DEA 50#define INOTIFYFS_SUPER_MAGIC 0x2BAD1DEA
51 51
52#define STACK_END_MAGIC 0x57AC6E9D
52#endif /* __LINUX_MAGIC_H__ */ 53#endif /* __LINUX_MAGIC_H__ */
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 3fdc10806d31..86a6c0f0518d 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -99,4 +99,10 @@ enum mem_add_context { BOOT, HOTPLUG };
99#define hotplug_memory_notifier(fn, pri) do { } while (0) 99#define hotplug_memory_notifier(fn, pri) do { } while (0)
100#endif 100#endif
101 101
102/*
103 * Kernel text modification mutex, used for code patching. Users of this lock
104 * can sleep.
105 */
106extern struct mutex text_mutex;
107
102#endif /* _LINUX_MEMORY_H_ */ 108#endif /* _LINUX_MEMORY_H_ */
diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h
index 139d7c88d9c9..3d1b7bde1283 100644
--- a/include/linux/mmiotrace.h
+++ b/include/linux/mmiotrace.h
@@ -1,5 +1,5 @@
1#ifndef MMIOTRACE_H 1#ifndef _LINUX_MMIOTRACE_H
2#define MMIOTRACE_H 2#define _LINUX_MMIOTRACE_H
3 3
4#include <linux/types.h> 4#include <linux/types.h>
5#include <linux/list.h> 5#include <linux/list.h>
@@ -13,28 +13,34 @@ typedef void (*kmmio_post_handler_t)(struct kmmio_probe *,
13 unsigned long condition, struct pt_regs *); 13 unsigned long condition, struct pt_regs *);
14 14
15struct kmmio_probe { 15struct kmmio_probe {
16 struct list_head list; /* kmmio internal list */ 16 /* kmmio internal list: */
17 unsigned long addr; /* start location of the probe point */ 17 struct list_head list;
18 unsigned long len; /* length of the probe region */ 18 /* start location of the probe point: */
19 kmmio_pre_handler_t pre_handler; /* Called before addr is executed. */ 19 unsigned long addr;
20 kmmio_post_handler_t post_handler; /* Called after addr is executed */ 20 /* length of the probe region: */
21 void *private; 21 unsigned long len;
22 /* Called before addr is executed: */
23 kmmio_pre_handler_t pre_handler;
24 /* Called after addr is executed: */
25 kmmio_post_handler_t post_handler;
26 void *private;
22}; 27};
23 28
29extern unsigned int kmmio_count;
30
31extern int register_kmmio_probe(struct kmmio_probe *p);
32extern void unregister_kmmio_probe(struct kmmio_probe *p);
33
34#ifdef CONFIG_MMIOTRACE
24/* kmmio is active by some kmmio_probes? */ 35/* kmmio is active by some kmmio_probes? */
25static inline int is_kmmio_active(void) 36static inline int is_kmmio_active(void)
26{ 37{
27 extern unsigned int kmmio_count;
28 return kmmio_count; 38 return kmmio_count;
29} 39}
30 40
31extern int register_kmmio_probe(struct kmmio_probe *p);
32extern void unregister_kmmio_probe(struct kmmio_probe *p);
33
34/* Called from page fault handler. */ 41/* Called from page fault handler. */
35extern int kmmio_handler(struct pt_regs *regs, unsigned long addr); 42extern int kmmio_handler(struct pt_regs *regs, unsigned long addr);
36 43
37#ifdef CONFIG_MMIOTRACE
38/* Called from ioremap.c */ 44/* Called from ioremap.c */
39extern void mmiotrace_ioremap(resource_size_t offset, unsigned long size, 45extern void mmiotrace_ioremap(resource_size_t offset, unsigned long size,
40 void __iomem *addr); 46 void __iomem *addr);
@@ -43,7 +49,17 @@ extern void mmiotrace_iounmap(volatile void __iomem *addr);
43/* For anyone to insert markers. Remember trailing newline. */ 49/* For anyone to insert markers. Remember trailing newline. */
44extern int mmiotrace_printk(const char *fmt, ...) 50extern int mmiotrace_printk(const char *fmt, ...)
45 __attribute__ ((format (printf, 1, 2))); 51 __attribute__ ((format (printf, 1, 2)));
46#else 52#else /* !CONFIG_MMIOTRACE: */
53static inline int is_kmmio_active(void)
54{
55 return 0;
56}
57
58static inline int kmmio_handler(struct pt_regs *regs, unsigned long addr)
59{
60 return 0;
61}
62
47static inline void mmiotrace_ioremap(resource_size_t offset, 63static inline void mmiotrace_ioremap(resource_size_t offset,
48 unsigned long size, void __iomem *addr) 64 unsigned long size, void __iomem *addr)
49{ 65{
@@ -63,28 +79,28 @@ static inline int mmiotrace_printk(const char *fmt, ...)
63#endif /* CONFIG_MMIOTRACE */ 79#endif /* CONFIG_MMIOTRACE */
64 80
65enum mm_io_opcode { 81enum mm_io_opcode {
66 MMIO_READ = 0x1, /* struct mmiotrace_rw */ 82 MMIO_READ = 0x1, /* struct mmiotrace_rw */
67 MMIO_WRITE = 0x2, /* struct mmiotrace_rw */ 83 MMIO_WRITE = 0x2, /* struct mmiotrace_rw */
68 MMIO_PROBE = 0x3, /* struct mmiotrace_map */ 84 MMIO_PROBE = 0x3, /* struct mmiotrace_map */
69 MMIO_UNPROBE = 0x4, /* struct mmiotrace_map */ 85 MMIO_UNPROBE = 0x4, /* struct mmiotrace_map */
70 MMIO_UNKNOWN_OP = 0x5, /* struct mmiotrace_rw */ 86 MMIO_UNKNOWN_OP = 0x5, /* struct mmiotrace_rw */
71}; 87};
72 88
73struct mmiotrace_rw { 89struct mmiotrace_rw {
74 resource_size_t phys; /* PCI address of register */ 90 resource_size_t phys; /* PCI address of register */
75 unsigned long value; 91 unsigned long value;
76 unsigned long pc; /* optional program counter */ 92 unsigned long pc; /* optional program counter */
77 int map_id; 93 int map_id;
78 unsigned char opcode; /* one of MMIO_{READ,WRITE,UNKNOWN_OP} */ 94 unsigned char opcode; /* one of MMIO_{READ,WRITE,UNKNOWN_OP} */
79 unsigned char width; /* size of register access in bytes */ 95 unsigned char width; /* size of register access in bytes */
80}; 96};
81 97
82struct mmiotrace_map { 98struct mmiotrace_map {
83 resource_size_t phys; /* base address in PCI space */ 99 resource_size_t phys; /* base address in PCI space */
84 unsigned long virt; /* base virtual address */ 100 unsigned long virt; /* base virtual address */
85 unsigned long len; /* mapping size */ 101 unsigned long len; /* mapping size */
86 int map_id; 102 int map_id;
87 unsigned char opcode; /* MMIO_PROBE or MMIO_UNPROBE */ 103 unsigned char opcode; /* MMIO_PROBE or MMIO_UNPROBE */
88}; 104};
89 105
90/* in kernel/trace/trace_mmiotrace.c */ 106/* in kernel/trace/trace_mmiotrace.c */
@@ -94,4 +110,4 @@ extern void mmio_trace_rw(struct mmiotrace_rw *rw);
94extern void mmio_trace_mapping(struct mmiotrace_map *map); 110extern void mmio_trace_mapping(struct mmiotrace_map *map);
95extern int mmio_trace_printk(const char *fmt, va_list args); 111extern int mmio_trace_printk(const char *fmt, va_list args);
96 112
97#endif /* MMIOTRACE_H */ 113#endif /* _LINUX_MMIOTRACE_H */
diff --git a/include/linux/module.h b/include/linux/module.h
index 145a75528cc1..22d9878e868c 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -329,6 +329,11 @@ struct module
329 unsigned int num_tracepoints; 329 unsigned int num_tracepoints;
330#endif 330#endif
331 331
332#ifdef CONFIG_TRACING
333 const char **trace_bprintk_fmt_start;
334 unsigned int num_trace_bprintk_fmt;
335#endif
336
332#ifdef CONFIG_MODULE_UNLOAD 337#ifdef CONFIG_MODULE_UNLOAD
333 /* What modules depend on me? */ 338 /* What modules depend on me? */
334 struct list_head modules_which_use_me; 339 struct list_head modules_which_use_me;
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 7a0e5c4f8072..3069ec7e0ab8 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -50,8 +50,10 @@ struct mutex {
50 atomic_t count; 50 atomic_t count;
51 spinlock_t wait_lock; 51 spinlock_t wait_lock;
52 struct list_head wait_list; 52 struct list_head wait_list;
53#ifdef CONFIG_DEBUG_MUTEXES 53#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP)
54 struct thread_info *owner; 54 struct thread_info *owner;
55#endif
56#ifdef CONFIG_DEBUG_MUTEXES
55 const char *name; 57 const char *name;
56 void *magic; 58 void *magic;
57#endif 59#endif
@@ -68,7 +70,6 @@ struct mutex_waiter {
68 struct list_head list; 70 struct list_head list;
69 struct task_struct *task; 71 struct task_struct *task;
70#ifdef CONFIG_DEBUG_MUTEXES 72#ifdef CONFIG_DEBUG_MUTEXES
71 struct mutex *lock;
72 void *magic; 73 void *magic;
73#endif 74#endif
74}; 75};
diff --git a/include/linux/nubus.h b/include/linux/nubus.h
index 7382af374731..e137b3c486a7 100644
--- a/include/linux/nubus.h
+++ b/include/linux/nubus.h
@@ -237,6 +237,7 @@ struct nubus_dirent
237 int mask; 237 int mask;
238}; 238};
239 239
240#ifdef __KERNEL__
240struct nubus_board { 241struct nubus_board {
241 struct nubus_board* next; 242 struct nubus_board* next;
242 struct nubus_dev* first_dev; 243 struct nubus_dev* first_dev;
@@ -351,6 +352,7 @@ void nubus_get_rsrc_mem(void* dest,
351void nubus_get_rsrc_str(void* dest, 352void nubus_get_rsrc_str(void* dest,
352 const struct nubus_dirent *dirent, 353 const struct nubus_dirent *dirent,
353 int maxlen); 354 int maxlen);
355#endif /* __KERNEL__ */
354 356
355/* We'd like to get rid of this eventually. Only daynaport.c uses it now. */ 357/* We'd like to get rid of this eventually. Only daynaport.c uses it now. */
356static inline void *nubus_slot_addr(int slot) 358static inline void *nubus_slot_addr(int slot)
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 9f2a3751873a..54a968b4b924 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -5,53 +5,66 @@
5#include <linux/slab.h> /* For kmalloc() */ 5#include <linux/slab.h> /* For kmalloc() */
6#include <linux/smp.h> 6#include <linux/smp.h>
7#include <linux/cpumask.h> 7#include <linux/cpumask.h>
8#include <linux/pfn.h>
8 9
9#include <asm/percpu.h> 10#include <asm/percpu.h>
10 11
12#ifndef PER_CPU_BASE_SECTION
13#ifdef CONFIG_SMP
14#define PER_CPU_BASE_SECTION ".data.percpu"
15#else
16#define PER_CPU_BASE_SECTION ".data"
17#endif
18#endif
19
11#ifdef CONFIG_SMP 20#ifdef CONFIG_SMP
12#define DEFINE_PER_CPU(type, name) \
13 __attribute__((__section__(".data.percpu"))) \
14 PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name
15 21
16#ifdef MODULE 22#ifdef MODULE
17#define SHARED_ALIGNED_SECTION ".data.percpu" 23#define PER_CPU_SHARED_ALIGNED_SECTION ""
18#else 24#else
19#define SHARED_ALIGNED_SECTION ".data.percpu.shared_aligned" 25#define PER_CPU_SHARED_ALIGNED_SECTION ".shared_aligned"
20#endif 26#endif
27#define PER_CPU_FIRST_SECTION ".first"
21 28
22#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \ 29#else
23 __attribute__((__section__(SHARED_ALIGNED_SECTION))) \ 30
24 PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name \ 31#define PER_CPU_SHARED_ALIGNED_SECTION ""
25 ____cacheline_aligned_in_smp 32#define PER_CPU_FIRST_SECTION ""
26 33
27#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \ 34#endif
28 __attribute__((__section__(".data.percpu.page_aligned"))) \ 35
36#define DEFINE_PER_CPU_SECTION(type, name, section) \
37 __attribute__((__section__(PER_CPU_BASE_SECTION section))) \
29 PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name 38 PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name
30#else 39
31#define DEFINE_PER_CPU(type, name) \ 40#define DEFINE_PER_CPU(type, name) \
32 PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name 41 DEFINE_PER_CPU_SECTION(type, name, "")
33 42
34#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \ 43#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
35 DEFINE_PER_CPU(type, name) 44 DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
45 ____cacheline_aligned_in_smp
36 46
37#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \ 47#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \
38 DEFINE_PER_CPU(type, name) 48 DEFINE_PER_CPU_SECTION(type, name, ".page_aligned")
39#endif 49
50#define DEFINE_PER_CPU_FIRST(type, name) \
51 DEFINE_PER_CPU_SECTION(type, name, PER_CPU_FIRST_SECTION)
40 52
41#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var) 53#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
42#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var) 54#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
43 55
44/* Enough to cover all DEFINE_PER_CPUs in kernel, including modules. */ 56/* enough to cover all DEFINE_PER_CPUs in modules */
45#ifndef PERCPU_ENOUGH_ROOM
46#ifdef CONFIG_MODULES 57#ifdef CONFIG_MODULES
47#define PERCPU_MODULE_RESERVE 8192 58#define PERCPU_MODULE_RESERVE (8 << 10)
48#else 59#else
49#define PERCPU_MODULE_RESERVE 0 60#define PERCPU_MODULE_RESERVE 0
50#endif 61#endif
51 62
63#ifndef PERCPU_ENOUGH_ROOM
52#define PERCPU_ENOUGH_ROOM \ 64#define PERCPU_ENOUGH_ROOM \
53 (__per_cpu_end - __per_cpu_start + PERCPU_MODULE_RESERVE) 65 (ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES) + \
54#endif /* PERCPU_ENOUGH_ROOM */ 66 PERCPU_MODULE_RESERVE)
67#endif
55 68
56/* 69/*
57 * Must be an lvalue. Since @var must be a simple identifier, 70 * Must be an lvalue. Since @var must be a simple identifier,
@@ -65,52 +78,90 @@
65 78
66#ifdef CONFIG_SMP 79#ifdef CONFIG_SMP
67 80
81#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
82
83/* minimum unit size, also is the maximum supported allocation size */
84#define PCPU_MIN_UNIT_SIZE PFN_ALIGN(64 << 10)
85
86/*
87 * PERCPU_DYNAMIC_RESERVE indicates the amount of free area to piggy
88 * back on the first chunk for dynamic percpu allocation if arch is
89 * manually allocating and mapping it for faster access (as a part of
90 * large page mapping for example).
91 *
92 * The following values give between one and two pages of free space
93 * after typical minimal boot (2-way SMP, single disk and NIC) with
94 * both defconfig and a distro config on x86_64 and 32. More
95 * intelligent way to determine this would be nice.
96 */
97#if BITS_PER_LONG > 32
98#define PERCPU_DYNAMIC_RESERVE (20 << 10)
99#else
100#define PERCPU_DYNAMIC_RESERVE (12 << 10)
101#endif
102
103extern void *pcpu_base_addr;
104
105typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno);
106typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr);
107
108extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
109 size_t static_size, size_t reserved_size,
110 ssize_t unit_size, ssize_t dyn_size,
111 void *base_addr,
112 pcpu_populate_pte_fn_t populate_pte_fn);
113
114/*
115 * Use this to get to a cpu's version of the per-cpu object
116 * dynamically allocated. Non-atomic access to the current CPU's
117 * version should probably be combined with get_cpu()/put_cpu().
118 */
119#define per_cpu_ptr(ptr, cpu) SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu)))
120
121extern void *__alloc_reserved_percpu(size_t size, size_t align);
122
123#else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
124
68struct percpu_data { 125struct percpu_data {
69 void *ptrs[1]; 126 void *ptrs[1];
70}; 127};
71 128
72#define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata) 129#define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata)
73/* 130
74 * Use this to get to a cpu's version of the per-cpu object dynamically 131#define per_cpu_ptr(ptr, cpu) \
75 * allocated. Non-atomic access to the current CPU's version should 132({ \
76 * probably be combined with get_cpu()/put_cpu(). 133 struct percpu_data *__p = __percpu_disguise(ptr); \
77 */ 134 (__typeof__(ptr))__p->ptrs[(cpu)]; \
78#define percpu_ptr(ptr, cpu) \
79({ \
80 struct percpu_data *__p = __percpu_disguise(ptr); \
81 (__typeof__(ptr))__p->ptrs[(cpu)]; \
82}) 135})
83 136
84extern void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask); 137#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
85extern void percpu_free(void *__pdata); 138
139extern void *__alloc_percpu(size_t size, size_t align);
140extern void free_percpu(void *__pdata);
86 141
87#else /* CONFIG_SMP */ 142#else /* CONFIG_SMP */
88 143
89#define percpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) 144#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); })
90 145
91static __always_inline void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask) 146static inline void *__alloc_percpu(size_t size, size_t align)
92{ 147{
93 return kzalloc(size, gfp); 148 /*
149 * Can't easily make larger alignment work with kmalloc. WARN
150 * on it. Larger alignment should only be used for module
151 * percpu sections on SMP for which this path isn't used.
152 */
153 WARN_ON_ONCE(align > SMP_CACHE_BYTES);
154 return kzalloc(size, GFP_KERNEL);
94} 155}
95 156
96static inline void percpu_free(void *__pdata) 157static inline void free_percpu(void *p)
97{ 158{
98 kfree(__pdata); 159 kfree(p);
99} 160}
100 161
101#endif /* CONFIG_SMP */ 162#endif /* CONFIG_SMP */
102 163
103#define percpu_alloc_mask(size, gfp, mask) \ 164#define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type), \
104 __percpu_alloc_mask((size), (gfp), &(mask)) 165 __alignof__(type))
105
106#define percpu_alloc(size, gfp) percpu_alloc_mask((size), (gfp), cpu_online_map)
107
108/* (legacy) interface for use without CPU hotplug handling */
109
110#define __alloc_percpu(size) percpu_alloc_mask((size), GFP_KERNEL, \
111 cpu_possible_map)
112#define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type))
113#define free_percpu(ptr) percpu_free((ptr))
114#define per_cpu_ptr(ptr, cpu) percpu_ptr((ptr), (cpu))
115 166
116#endif /* __LINUX_PERCPU_H */ 167#endif /* __LINUX_PERCPU_H */
diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h
index bc5114d35e99..e356c99f0659 100644
--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -28,8 +28,6 @@
28#include <linux/reiserfs_fs_sb.h> 28#include <linux/reiserfs_fs_sb.h>
29#endif 29#endif
30 30
31struct fid;
32
33/* 31/*
34 * include/linux/reiser_fs.h 32 * include/linux/reiser_fs.h
35 * 33 *
@@ -37,6 +35,33 @@ struct fid;
37 * 35 *
38 */ 36 */
39 37
38/* ioctl's command */
39#define REISERFS_IOC_UNPACK _IOW(0xCD,1,long)
40/* define following flags to be the same as in ext2, so that chattr(1),
41 lsattr(1) will work with us. */
42#define REISERFS_IOC_GETFLAGS FS_IOC_GETFLAGS
43#define REISERFS_IOC_SETFLAGS FS_IOC_SETFLAGS
44#define REISERFS_IOC_GETVERSION FS_IOC_GETVERSION
45#define REISERFS_IOC_SETVERSION FS_IOC_SETVERSION
46
47#ifdef __KERNEL__
48/* the 32 bit compat definitions with int argument */
49#define REISERFS_IOC32_UNPACK _IOW(0xCD, 1, int)
50#define REISERFS_IOC32_GETFLAGS FS_IOC32_GETFLAGS
51#define REISERFS_IOC32_SETFLAGS FS_IOC32_SETFLAGS
52#define REISERFS_IOC32_GETVERSION FS_IOC32_GETVERSION
53#define REISERFS_IOC32_SETVERSION FS_IOC32_SETVERSION
54
55/* Locking primitives */
56/* Right now we are still falling back to (un)lock_kernel, but eventually that
57 would evolve into real per-fs locks */
58#define reiserfs_write_lock( sb ) lock_kernel()
59#define reiserfs_write_unlock( sb ) unlock_kernel()
60
61/* xattr stuff */
62#define REISERFS_XATTR_DIR_SEM(s) (REISERFS_SB(s)->xattr_dir_sem)
63struct fid;
64
40/* in reading the #defines, it may help to understand that they employ 65/* in reading the #defines, it may help to understand that they employ
41 the following abbreviations: 66 the following abbreviations:
42 67
@@ -698,6 +723,7 @@ static inline void cpu_key_k_offset_dec(struct cpu_key *key)
698/* object identifier for root dir */ 723/* object identifier for root dir */
699#define REISERFS_ROOT_OBJECTID 2 724#define REISERFS_ROOT_OBJECTID 2
700#define REISERFS_ROOT_PARENT_OBJECTID 1 725#define REISERFS_ROOT_PARENT_OBJECTID 1
726
701extern struct reiserfs_key root_key; 727extern struct reiserfs_key root_key;
702 728
703/* 729/*
@@ -1540,7 +1566,6 @@ struct reiserfs_iget_args {
1540/* FUNCTION DECLARATIONS */ 1566/* FUNCTION DECLARATIONS */
1541/***************************************************************************/ 1567/***************************************************************************/
1542 1568
1543/*#ifdef __KERNEL__*/
1544#define get_journal_desc_magic(bh) (bh->b_data + bh->b_size - 12) 1569#define get_journal_desc_magic(bh) (bh->b_data + bh->b_size - 12)
1545 1570
1546#define journal_trans_half(blocksize) \ 1571#define journal_trans_half(blocksize) \
@@ -2178,29 +2203,6 @@ long reiserfs_compat_ioctl(struct file *filp,
2178 unsigned int cmd, unsigned long arg); 2203 unsigned int cmd, unsigned long arg);
2179int reiserfs_unpack(struct inode *inode, struct file *filp); 2204int reiserfs_unpack(struct inode *inode, struct file *filp);
2180 2205
2181/* ioctl's command */
2182#define REISERFS_IOC_UNPACK _IOW(0xCD,1,long)
2183/* define following flags to be the same as in ext2, so that chattr(1),
2184 lsattr(1) will work with us. */
2185#define REISERFS_IOC_GETFLAGS FS_IOC_GETFLAGS
2186#define REISERFS_IOC_SETFLAGS FS_IOC_SETFLAGS
2187#define REISERFS_IOC_GETVERSION FS_IOC_GETVERSION
2188#define REISERFS_IOC_SETVERSION FS_IOC_SETVERSION
2189
2190/* the 32 bit compat definitions with int argument */
2191#define REISERFS_IOC32_UNPACK _IOW(0xCD, 1, int)
2192#define REISERFS_IOC32_GETFLAGS FS_IOC32_GETFLAGS
2193#define REISERFS_IOC32_SETFLAGS FS_IOC32_SETFLAGS
2194#define REISERFS_IOC32_GETVERSION FS_IOC32_GETVERSION
2195#define REISERFS_IOC32_SETVERSION FS_IOC32_SETVERSION
2196
2197/* Locking primitives */
2198/* Right now we are still falling back to (un)lock_kernel, but eventually that
2199 would evolve into real per-fs locks */
2200#define reiserfs_write_lock( sb ) lock_kernel()
2201#define reiserfs_write_unlock( sb ) unlock_kernel()
2202
2203/* xattr stuff */
2204#define REISERFS_XATTR_DIR_SEM(s) (REISERFS_SB(s)->xattr_dir_sem)
2205 2206
2207#endif /* __KERNEL__ */
2206#endif /* _LINUX_REISER_FS_H */ 2208#endif /* _LINUX_REISER_FS_H */
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index b3b359660082..9e6052bd1a1c 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -8,7 +8,7 @@ struct ring_buffer;
8struct ring_buffer_iter; 8struct ring_buffer_iter;
9 9
10/* 10/*
11 * Don't reference this struct directly, use functions below. 11 * Don't refer to this struct directly, use functions below.
12 */ 12 */
13struct ring_buffer_event { 13struct ring_buffer_event {
14 u32 type:2, len:3, time_delta:27; 14 u32 type:2, len:3, time_delta:27;
@@ -74,13 +74,10 @@ void ring_buffer_free(struct ring_buffer *buffer);
74 74
75int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size); 75int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size);
76 76
77struct ring_buffer_event * 77struct ring_buffer_event *ring_buffer_lock_reserve(struct ring_buffer *buffer,
78ring_buffer_lock_reserve(struct ring_buffer *buffer, 78 unsigned long length);
79 unsigned long length,
80 unsigned long *flags);
81int ring_buffer_unlock_commit(struct ring_buffer *buffer, 79int ring_buffer_unlock_commit(struct ring_buffer *buffer,
82 struct ring_buffer_event *event, 80 struct ring_buffer_event *event);
83 unsigned long flags);
84int ring_buffer_write(struct ring_buffer *buffer, 81int ring_buffer_write(struct ring_buffer *buffer,
85 unsigned long length, void *data); 82 unsigned long length, void *data);
86 83
@@ -121,17 +118,19 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer);
121unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu); 118unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu);
122unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu); 119unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu);
123 120
124u64 ring_buffer_time_stamp(int cpu); 121u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu);
125void ring_buffer_normalize_time_stamp(int cpu, u64 *ts); 122void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
123 int cpu, u64 *ts);
124void ring_buffer_set_clock(struct ring_buffer *buffer,
125 u64 (*clock)(void));
126
127size_t ring_buffer_page_len(void *page);
126 128
127void tracing_on(void);
128void tracing_off(void);
129void tracing_off_permanent(void);
130 129
131void *ring_buffer_alloc_read_page(struct ring_buffer *buffer); 130void *ring_buffer_alloc_read_page(struct ring_buffer *buffer);
132void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data); 131void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data);
133int ring_buffer_read_page(struct ring_buffer *buffer, 132int ring_buffer_read_page(struct ring_buffer *buffer, void **data_page,
134 void **data_page, int cpu, int full); 133 size_t len, int cpu, int full);
135 134
136enum ring_buffer_flags { 135enum ring_buffer_flags {
137 RB_FL_OVERWRITE = 1 << 0, 136 RB_FL_OVERWRITE = 1 << 0,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 011db2f4c94c..89cd308cc7a5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -137,6 +137,8 @@ extern unsigned long nr_uninterruptible(void);
137extern unsigned long nr_active(void); 137extern unsigned long nr_active(void);
138extern unsigned long nr_iowait(void); 138extern unsigned long nr_iowait(void);
139 139
140extern unsigned long get_parent_ip(unsigned long addr);
141
140struct seq_file; 142struct seq_file;
141struct cfs_rq; 143struct cfs_rq;
142struct task_group; 144struct task_group;
@@ -331,7 +333,9 @@ extern signed long schedule_timeout(signed long timeout);
331extern signed long schedule_timeout_interruptible(signed long timeout); 333extern signed long schedule_timeout_interruptible(signed long timeout);
332extern signed long schedule_timeout_killable(signed long timeout); 334extern signed long schedule_timeout_killable(signed long timeout);
333extern signed long schedule_timeout_uninterruptible(signed long timeout); 335extern signed long schedule_timeout_uninterruptible(signed long timeout);
336asmlinkage void __schedule(void);
334asmlinkage void schedule(void); 337asmlinkage void schedule(void);
338extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner);
335 339
336struct nsproxy; 340struct nsproxy;
337struct user_namespace; 341struct user_namespace;
@@ -1178,10 +1182,9 @@ struct task_struct {
1178 pid_t pid; 1182 pid_t pid;
1179 pid_t tgid; 1183 pid_t tgid;
1180 1184
1181#ifdef CONFIG_CC_STACKPROTECTOR
1182 /* Canary value for the -fstack-protector gcc feature */ 1185 /* Canary value for the -fstack-protector gcc feature */
1183 unsigned long stack_canary; 1186 unsigned long stack_canary;
1184#endif 1187
1185 /* 1188 /*
1186 * pointers to (original) parent process, youngest child, younger sibling, 1189 * pointers to (original) parent process, youngest child, younger sibling,
1187 * older sibling, respectively. (p->father can be replaced with 1190 * older sibling, respectively. (p->father can be replaced with
@@ -1328,6 +1331,7 @@ struct task_struct {
1328 int lockdep_depth; 1331 int lockdep_depth;
1329 unsigned int lockdep_recursion; 1332 unsigned int lockdep_recursion;
1330 struct held_lock held_locks[MAX_LOCK_DEPTH]; 1333 struct held_lock held_locks[MAX_LOCK_DEPTH];
1334 gfp_t lockdep_reclaim_gfp;
1331#endif 1335#endif
1332 1336
1333/* journalling filesystem info */ 1337/* journalling filesystem info */
@@ -1673,6 +1677,16 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
1673 return set_cpus_allowed_ptr(p, &new_mask); 1677 return set_cpus_allowed_ptr(p, &new_mask);
1674} 1678}
1675 1679
1680/*
1681 * Architectures can set this to 1 if they have specified
1682 * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
1683 * but then during bootup it turns out that sched_clock()
1684 * is reliable after all:
1685 */
1686#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
1687extern int sched_clock_stable;
1688#endif
1689
1676extern unsigned long long sched_clock(void); 1690extern unsigned long long sched_clock(void);
1677 1691
1678extern void sched_clock_init(void); 1692extern void sched_clock_init(void);
@@ -2090,6 +2104,19 @@ static inline int object_is_on_stack(void *obj)
2090 2104
2091extern void thread_info_cache_init(void); 2105extern void thread_info_cache_init(void);
2092 2106
2107#ifdef CONFIG_DEBUG_STACK_USAGE
2108static inline unsigned long stack_not_used(struct task_struct *p)
2109{
2110 unsigned long *n = end_of_stack(p);
2111
2112 do { /* Skip over canary */
2113 n++;
2114 } while (!*n);
2115
2116 return (unsigned long)n - (unsigned long)end_of_stack(p);
2117}
2118#endif
2119
2093/* set thread flags in other task's structures 2120/* set thread flags in other task's structures
2094 * - see asm/thread_info.h for TIF_xxxx flags available 2121 * - see asm/thread_info.h for TIF_xxxx flags available
2095 */ 2122 */
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 6ca6a7b66d75..f4523651fa42 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -14,6 +14,7 @@
14#include <asm/page.h> /* kmalloc_sizes.h needs PAGE_SIZE */ 14#include <asm/page.h> /* kmalloc_sizes.h needs PAGE_SIZE */
15#include <asm/cache.h> /* kmalloc_sizes.h needs L1_CACHE_BYTES */ 15#include <asm/cache.h> /* kmalloc_sizes.h needs L1_CACHE_BYTES */
16#include <linux/compiler.h> 16#include <linux/compiler.h>
17#include <trace/kmemtrace.h>
17 18
18/* Size description struct for general caches. */ 19/* Size description struct for general caches. */
19struct cache_sizes { 20struct cache_sizes {
@@ -28,8 +29,26 @@ extern struct cache_sizes malloc_sizes[];
28void *kmem_cache_alloc(struct kmem_cache *, gfp_t); 29void *kmem_cache_alloc(struct kmem_cache *, gfp_t);
29void *__kmalloc(size_t size, gfp_t flags); 30void *__kmalloc(size_t size, gfp_t flags);
30 31
31static inline void *kmalloc(size_t size, gfp_t flags) 32#ifdef CONFIG_KMEMTRACE
33extern void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags);
34extern size_t slab_buffer_size(struct kmem_cache *cachep);
35#else
36static __always_inline void *
37kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags)
32{ 38{
39 return kmem_cache_alloc(cachep, flags);
40}
41static inline size_t slab_buffer_size(struct kmem_cache *cachep)
42{
43 return 0;
44}
45#endif
46
47static __always_inline void *kmalloc(size_t size, gfp_t flags)
48{
49 struct kmem_cache *cachep;
50 void *ret;
51
33 if (__builtin_constant_p(size)) { 52 if (__builtin_constant_p(size)) {
34 int i = 0; 53 int i = 0;
35 54
@@ -47,10 +66,17 @@ static inline void *kmalloc(size_t size, gfp_t flags)
47found: 66found:
48#ifdef CONFIG_ZONE_DMA 67#ifdef CONFIG_ZONE_DMA
49 if (flags & GFP_DMA) 68 if (flags & GFP_DMA)
50 return kmem_cache_alloc(malloc_sizes[i].cs_dmacachep, 69 cachep = malloc_sizes[i].cs_dmacachep;
51 flags); 70 else
52#endif 71#endif
53 return kmem_cache_alloc(malloc_sizes[i].cs_cachep, flags); 72 cachep = malloc_sizes[i].cs_cachep;
73
74 ret = kmem_cache_alloc_notrace(cachep, flags);
75
76 kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, _THIS_IP_, ret,
77 size, slab_buffer_size(cachep), flags);
78
79 return ret;
54 } 80 }
55 return __kmalloc(size, flags); 81 return __kmalloc(size, flags);
56} 82}
@@ -59,8 +85,25 @@ found:
59extern void *__kmalloc_node(size_t size, gfp_t flags, int node); 85extern void *__kmalloc_node(size_t size, gfp_t flags, int node);
60extern void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); 86extern void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
61 87
62static inline void *kmalloc_node(size_t size, gfp_t flags, int node) 88#ifdef CONFIG_KMEMTRACE
89extern void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep,
90 gfp_t flags,
91 int nodeid);
92#else
93static __always_inline void *
94kmem_cache_alloc_node_notrace(struct kmem_cache *cachep,
95 gfp_t flags,
96 int nodeid)
97{
98 return kmem_cache_alloc_node(cachep, flags, nodeid);
99}
100#endif
101
102static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
63{ 103{
104 struct kmem_cache *cachep;
105 void *ret;
106
64 if (__builtin_constant_p(size)) { 107 if (__builtin_constant_p(size)) {
65 int i = 0; 108 int i = 0;
66 109
@@ -78,11 +121,18 @@ static inline void *kmalloc_node(size_t size, gfp_t flags, int node)
78found: 121found:
79#ifdef CONFIG_ZONE_DMA 122#ifdef CONFIG_ZONE_DMA
80 if (flags & GFP_DMA) 123 if (flags & GFP_DMA)
81 return kmem_cache_alloc_node(malloc_sizes[i].cs_dmacachep, 124 cachep = malloc_sizes[i].cs_dmacachep;
82 flags, node); 125 else
83#endif 126#endif
84 return kmem_cache_alloc_node(malloc_sizes[i].cs_cachep, 127 cachep = malloc_sizes[i].cs_cachep;
85 flags, node); 128
129 ret = kmem_cache_alloc_node_notrace(cachep, flags, node);
130
131 kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, _THIS_IP_,
132 ret, size, slab_buffer_size(cachep),
133 flags, node);
134
135 return ret;
86 } 136 }
87 return __kmalloc_node(size, flags, node); 137 return __kmalloc_node(size, flags, node);
88} 138}
diff --git a/include/linux/slob_def.h b/include/linux/slob_def.h
index 59a3fa476ab9..0ec00b39d006 100644
--- a/include/linux/slob_def.h
+++ b/include/linux/slob_def.h
@@ -3,14 +3,15 @@
3 3
4void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); 4void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
5 5
6static inline void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) 6static __always_inline void *kmem_cache_alloc(struct kmem_cache *cachep,
7 gfp_t flags)
7{ 8{
8 return kmem_cache_alloc_node(cachep, flags, -1); 9 return kmem_cache_alloc_node(cachep, flags, -1);
9} 10}
10 11
11void *__kmalloc_node(size_t size, gfp_t flags, int node); 12void *__kmalloc_node(size_t size, gfp_t flags, int node);
12 13
13static inline void *kmalloc_node(size_t size, gfp_t flags, int node) 14static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
14{ 15{
15 return __kmalloc_node(size, flags, node); 16 return __kmalloc_node(size, flags, node);
16} 17}
@@ -23,12 +24,12 @@ static inline void *kmalloc_node(size_t size, gfp_t flags, int node)
23 * kmalloc is the normal method of allocating memory 24 * kmalloc is the normal method of allocating memory
24 * in the kernel. 25 * in the kernel.
25 */ 26 */
26static inline void *kmalloc(size_t size, gfp_t flags) 27static __always_inline void *kmalloc(size_t size, gfp_t flags)
27{ 28{
28 return __kmalloc_node(size, flags, -1); 29 return __kmalloc_node(size, flags, -1);
29} 30}
30 31
31static inline void *__kmalloc(size_t size, gfp_t flags) 32static __always_inline void *__kmalloc(size_t size, gfp_t flags)
32{ 33{
33 return kmalloc(size, flags); 34 return kmalloc(size, flags);
34} 35}
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 2f5c16b1aacd..9e3a575b2c30 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -10,6 +10,7 @@
10#include <linux/gfp.h> 10#include <linux/gfp.h>
11#include <linux/workqueue.h> 11#include <linux/workqueue.h>
12#include <linux/kobject.h> 12#include <linux/kobject.h>
13#include <trace/kmemtrace.h>
13 14
14enum stat_item { 15enum stat_item {
15 ALLOC_FASTPATH, /* Allocation from cpu slab */ 16 ALLOC_FASTPATH, /* Allocation from cpu slab */
@@ -121,10 +122,23 @@ struct kmem_cache {
121#define KMALLOC_SHIFT_LOW ilog2(KMALLOC_MIN_SIZE) 122#define KMALLOC_SHIFT_LOW ilog2(KMALLOC_MIN_SIZE)
122 123
123/* 124/*
125 * Maximum kmalloc object size handled by SLUB. Larger object allocations
126 * are passed through to the page allocator. The page allocator "fastpath"
127 * is relatively slow so we need this value sufficiently high so that
128 * performance critical objects are allocated through the SLUB fastpath.
129 *
130 * This should be dropped to PAGE_SIZE / 2 once the page allocator
131 * "fastpath" becomes competitive with the slab allocator fastpaths.
132 */
133#define SLUB_MAX_SIZE (PAGE_SIZE)
134
135#define SLUB_PAGE_SHIFT (PAGE_SHIFT + 1)
136
137/*
124 * We keep the general caches in an array of slab caches that are used for 138 * We keep the general caches in an array of slab caches that are used for
125 * 2^x bytes of allocations. 139 * 2^x bytes of allocations.
126 */ 140 */
127extern struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1]; 141extern struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT];
128 142
129/* 143/*
130 * Sorry that the following has to be that ugly but some versions of GCC 144 * Sorry that the following has to be that ugly but some versions of GCC
@@ -204,15 +218,33 @@ static __always_inline struct kmem_cache *kmalloc_slab(size_t size)
204void *kmem_cache_alloc(struct kmem_cache *, gfp_t); 218void *kmem_cache_alloc(struct kmem_cache *, gfp_t);
205void *__kmalloc(size_t size, gfp_t flags); 219void *__kmalloc(size_t size, gfp_t flags);
206 220
221#ifdef CONFIG_KMEMTRACE
222extern void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags);
223#else
224static __always_inline void *
225kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags)
226{
227 return kmem_cache_alloc(s, gfpflags);
228}
229#endif
230
207static __always_inline void *kmalloc_large(size_t size, gfp_t flags) 231static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
208{ 232{
209 return (void *)__get_free_pages(flags | __GFP_COMP, get_order(size)); 233 unsigned int order = get_order(size);
234 void *ret = (void *) __get_free_pages(flags | __GFP_COMP, order);
235
236 kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, _THIS_IP_, ret,
237 size, PAGE_SIZE << order, flags);
238
239 return ret;
210} 240}
211 241
212static __always_inline void *kmalloc(size_t size, gfp_t flags) 242static __always_inline void *kmalloc(size_t size, gfp_t flags)
213{ 243{
244 void *ret;
245
214 if (__builtin_constant_p(size)) { 246 if (__builtin_constant_p(size)) {
215 if (size > PAGE_SIZE) 247 if (size > SLUB_MAX_SIZE)
216 return kmalloc_large(size, flags); 248 return kmalloc_large(size, flags);
217 249
218 if (!(flags & SLUB_DMA)) { 250 if (!(flags & SLUB_DMA)) {
@@ -221,7 +253,13 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags)
221 if (!s) 253 if (!s)
222 return ZERO_SIZE_PTR; 254 return ZERO_SIZE_PTR;
223 255
224 return kmem_cache_alloc(s, flags); 256 ret = kmem_cache_alloc_notrace(s, flags);
257
258 kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC,
259 _THIS_IP_, ret,
260 size, s->size, flags);
261
262 return ret;
225 } 263 }
226 } 264 }
227 return __kmalloc(size, flags); 265 return __kmalloc(size, flags);
@@ -231,16 +269,38 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags)
231void *__kmalloc_node(size_t size, gfp_t flags, int node); 269void *__kmalloc_node(size_t size, gfp_t flags, int node);
232void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); 270void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
233 271
272#ifdef CONFIG_KMEMTRACE
273extern void *kmem_cache_alloc_node_notrace(struct kmem_cache *s,
274 gfp_t gfpflags,
275 int node);
276#else
277static __always_inline void *
278kmem_cache_alloc_node_notrace(struct kmem_cache *s,
279 gfp_t gfpflags,
280 int node)
281{
282 return kmem_cache_alloc_node(s, gfpflags, node);
283}
284#endif
285
234static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) 286static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
235{ 287{
288 void *ret;
289
236 if (__builtin_constant_p(size) && 290 if (__builtin_constant_p(size) &&
237 size <= PAGE_SIZE && !(flags & SLUB_DMA)) { 291 size <= SLUB_MAX_SIZE && !(flags & SLUB_DMA)) {
238 struct kmem_cache *s = kmalloc_slab(size); 292 struct kmem_cache *s = kmalloc_slab(size);
239 293
240 if (!s) 294 if (!s)
241 return ZERO_SIZE_PTR; 295 return ZERO_SIZE_PTR;
242 296
243 return kmem_cache_alloc_node(s, flags, node); 297 ret = kmem_cache_alloc_node_notrace(s, flags, node);
298
299 kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC,
300 _THIS_IP_, ret,
301 size, s->size, flags, node);
302
303 return ret;
244 } 304 }
245 return __kmalloc_node(size, flags, node); 305 return __kmalloc_node(size, flags, node);
246} 306}
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 715196b09d67..bbacb7baa446 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -176,6 +176,12 @@ static inline void init_call_single_data(void)
176#define put_cpu() preempt_enable() 176#define put_cpu() preempt_enable()
177#define put_cpu_no_resched() preempt_enable_no_resched() 177#define put_cpu_no_resched() preempt_enable_no_resched()
178 178
179/*
180 * Callback to arch code if there's nosmp or maxcpus=0 on the
181 * boot command line:
182 */
183extern void arch_disable_smp_support(void);
184
179void smp_setup_processor_id(void); 185void smp_setup_processor_id(void);
180 186
181#endif /* __LINUX_SMP_H */ 187#endif /* __LINUX_SMP_H */
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 20fc4bbfca42..afc01909a428 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -24,10 +24,12 @@ struct __kernel_sockaddr_storage {
24#include <linux/types.h> /* pid_t */ 24#include <linux/types.h> /* pid_t */
25#include <linux/compiler.h> /* __user */ 25#include <linux/compiler.h> /* __user */
26 26
27#ifdef CONFIG_PROC_FS 27#ifdef __KERNEL__
28# ifdef CONFIG_PROC_FS
28struct seq_file; 29struct seq_file;
29extern void socket_seq_show(struct seq_file *seq); 30extern void socket_seq_show(struct seq_file *seq);
30#endif 31# endif
32#endif /* __KERNEL__ */
31 33
32typedef unsigned short sa_family_t; 34typedef unsigned short sa_family_t;
33 35
diff --git a/include/linux/stackprotector.h b/include/linux/stackprotector.h
new file mode 100644
index 000000000000..6f3e54c704c0
--- /dev/null
+++ b/include/linux/stackprotector.h
@@ -0,0 +1,16 @@
1#ifndef _LINUX_STACKPROTECTOR_H
2#define _LINUX_STACKPROTECTOR_H 1
3
4#include <linux/compiler.h>
5#include <linux/sched.h>
6#include <linux/random.h>
7
8#ifdef CONFIG_CC_STACKPROTECTOR
9# include <asm/stackprotector.h>
10#else
11static inline void boot_init_stack_canary(void)
12{
13}
14#endif
15
16#endif
diff --git a/include/linux/string.h b/include/linux/string.h
index d18fc198aa2f..27ac31784ad2 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -10,6 +10,7 @@
10#include <linux/compiler.h> /* for inline */ 10#include <linux/compiler.h> /* for inline */
11#include <linux/types.h> /* for size_t */ 11#include <linux/types.h> /* for size_t */
12#include <linux/stddef.h> /* for NULL */ 12#include <linux/stddef.h> /* for NULL */
13#include <stdarg.h>
13 14
14extern char *strndup_user(const char __user *, long); 15extern char *strndup_user(const char __user *, long);
15 16
@@ -111,6 +112,12 @@ extern void argv_free(char **argv);
111 112
112extern bool sysfs_streq(const char *s1, const char *s2); 113extern bool sysfs_streq(const char *s1, const char *s2);
113 114
115#ifdef CONFIG_BINARY_PRINTF
116int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args);
117int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf);
118int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) __printf(3, 4);
119#endif
120
114extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, 121extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
115 const void *from, size_t available); 122 const void *from, size_t available);
116 123
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index f9f900cfd066..0cff9bb80b02 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -65,6 +65,7 @@ struct old_linux_dirent;
65#include <asm/signal.h> 65#include <asm/signal.h>
66#include <linux/quota.h> 66#include <linux/quota.h>
67#include <linux/key.h> 67#include <linux/key.h>
68#include <linux/ftrace.h>
68 69
69#define __SC_DECL1(t1, a1) t1 a1 70#define __SC_DECL1(t1, a1) t1 a1
70#define __SC_DECL2(t2, a2, ...) t2 a2, __SC_DECL1(__VA_ARGS__) 71#define __SC_DECL2(t2, a2, ...) t2 a2, __SC_DECL1(__VA_ARGS__)
@@ -95,7 +96,46 @@ struct old_linux_dirent;
95#define __SC_TEST5(t5, a5, ...) __SC_TEST(t5); __SC_TEST4(__VA_ARGS__) 96#define __SC_TEST5(t5, a5, ...) __SC_TEST(t5); __SC_TEST4(__VA_ARGS__)
96#define __SC_TEST6(t6, a6, ...) __SC_TEST(t6); __SC_TEST5(__VA_ARGS__) 97#define __SC_TEST6(t6, a6, ...) __SC_TEST(t6); __SC_TEST5(__VA_ARGS__)
97 98
99#ifdef CONFIG_FTRACE_SYSCALLS
100#define __SC_STR_ADECL1(t, a) #a
101#define __SC_STR_ADECL2(t, a, ...) #a, __SC_STR_ADECL1(__VA_ARGS__)
102#define __SC_STR_ADECL3(t, a, ...) #a, __SC_STR_ADECL2(__VA_ARGS__)
103#define __SC_STR_ADECL4(t, a, ...) #a, __SC_STR_ADECL3(__VA_ARGS__)
104#define __SC_STR_ADECL5(t, a, ...) #a, __SC_STR_ADECL4(__VA_ARGS__)
105#define __SC_STR_ADECL6(t, a, ...) #a, __SC_STR_ADECL5(__VA_ARGS__)
106
107#define __SC_STR_TDECL1(t, a) #t
108#define __SC_STR_TDECL2(t, a, ...) #t, __SC_STR_TDECL1(__VA_ARGS__)
109#define __SC_STR_TDECL3(t, a, ...) #t, __SC_STR_TDECL2(__VA_ARGS__)
110#define __SC_STR_TDECL4(t, a, ...) #t, __SC_STR_TDECL3(__VA_ARGS__)
111#define __SC_STR_TDECL5(t, a, ...) #t, __SC_STR_TDECL4(__VA_ARGS__)
112#define __SC_STR_TDECL6(t, a, ...) #t, __SC_STR_TDECL5(__VA_ARGS__)
113
114#define SYSCALL_METADATA(sname, nb) \
115 static const struct syscall_metadata __used \
116 __attribute__((__aligned__(4))) \
117 __attribute__((section("__syscalls_metadata"))) \
118 __syscall_meta_##sname = { \
119 .name = "sys"#sname, \
120 .nb_args = nb, \
121 .types = types_##sname, \
122 .args = args_##sname, \
123 }
124
125#define SYSCALL_DEFINE0(sname) \
126 static const struct syscall_metadata __used \
127 __attribute__((__aligned__(4))) \
128 __attribute__((section("__syscalls_metadata"))) \
129 __syscall_meta_##sname = { \
130 .name = "sys_"#sname, \
131 .nb_args = 0, \
132 }; \
133 asmlinkage long sys_##sname(void)
134
135#else
98#define SYSCALL_DEFINE0(name) asmlinkage long sys_##name(void) 136#define SYSCALL_DEFINE0(name) asmlinkage long sys_##name(void)
137#endif
138
99#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__) 139#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
100#define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__) 140#define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
101#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__) 141#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
@@ -117,10 +157,26 @@ struct old_linux_dirent;
117#endif 157#endif
118#endif 158#endif
119 159
160#ifdef CONFIG_FTRACE_SYSCALLS
161#define SYSCALL_DEFINEx(x, sname, ...) \
162 static const char *types_##sname[] = { \
163 __SC_STR_TDECL##x(__VA_ARGS__) \
164 }; \
165 static const char *args_##sname[] = { \
166 __SC_STR_ADECL##x(__VA_ARGS__) \
167 }; \
168 SYSCALL_METADATA(sname, x); \
169 __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
170#else
171#define SYSCALL_DEFINEx(x, sname, ...) \
172 __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
173#endif
174
120#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS 175#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
121 176
122#define SYSCALL_DEFINE(name) static inline long SYSC_##name 177#define SYSCALL_DEFINE(name) static inline long SYSC_##name
123#define SYSCALL_DEFINEx(x, name, ...) \ 178
179#define __SYSCALL_DEFINEx(x, name, ...) \
124 asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__)); \ 180 asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__)); \
125 static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__)); \ 181 static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__)); \
126 asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__)) \ 182 asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__)) \
@@ -134,7 +190,7 @@ struct old_linux_dirent;
134#else /* CONFIG_HAVE_SYSCALL_WRAPPERS */ 190#else /* CONFIG_HAVE_SYSCALL_WRAPPERS */
135 191
136#define SYSCALL_DEFINE(name) asmlinkage long sys_##name 192#define SYSCALL_DEFINE(name) asmlinkage long sys_##name
137#define SYSCALL_DEFINEx(x, name, ...) \ 193#define __SYSCALL_DEFINEx(x, name, ...) \
138 asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__)) 194 asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__))
139 195
140#endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */ 196#endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */
diff --git a/include/linux/timer.h b/include/linux/timer.h
index daf9685b861c..51774eb87cc6 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -5,6 +5,7 @@
5#include <linux/ktime.h> 5#include <linux/ktime.h>
6#include <linux/stddef.h> 6#include <linux/stddef.h>
7#include <linux/debugobjects.h> 7#include <linux/debugobjects.h>
8#include <linux/stringify.h>
8 9
9struct tvec_base; 10struct tvec_base;
10 11
@@ -21,52 +22,126 @@ struct timer_list {
21 char start_comm[16]; 22 char start_comm[16];
22 int start_pid; 23 int start_pid;
23#endif 24#endif
25#ifdef CONFIG_LOCKDEP
26 struct lockdep_map lockdep_map;
27#endif
24}; 28};
25 29
26extern struct tvec_base boot_tvec_bases; 30extern struct tvec_base boot_tvec_bases;
27 31
32#ifdef CONFIG_LOCKDEP
33/*
34 * NB: because we have to copy the lockdep_map, setting the lockdep_map key
35 * (second argument) here is required, otherwise it could be initialised to
36 * the copy of the lockdep_map later! We use the pointer to and the string
37 * "<file>:<line>" as the key resp. the name of the lockdep_map.
38 */
39#define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn) \
40 .lockdep_map = STATIC_LOCKDEP_MAP_INIT(_kn, &_kn),
41#else
42#define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn)
43#endif
44
28#define TIMER_INITIALIZER(_function, _expires, _data) { \ 45#define TIMER_INITIALIZER(_function, _expires, _data) { \
29 .entry = { .prev = TIMER_ENTRY_STATIC }, \ 46 .entry = { .prev = TIMER_ENTRY_STATIC }, \
30 .function = (_function), \ 47 .function = (_function), \
31 .expires = (_expires), \ 48 .expires = (_expires), \
32 .data = (_data), \ 49 .data = (_data), \
33 .base = &boot_tvec_bases, \ 50 .base = &boot_tvec_bases, \
51 __TIMER_LOCKDEP_MAP_INITIALIZER( \
52 __FILE__ ":" __stringify(__LINE__)) \
34 } 53 }
35 54
36#define DEFINE_TIMER(_name, _function, _expires, _data) \ 55#define DEFINE_TIMER(_name, _function, _expires, _data) \
37 struct timer_list _name = \ 56 struct timer_list _name = \
38 TIMER_INITIALIZER(_function, _expires, _data) 57 TIMER_INITIALIZER(_function, _expires, _data)
39 58
40void init_timer(struct timer_list *timer); 59void init_timer_key(struct timer_list *timer,
41void init_timer_deferrable(struct timer_list *timer); 60 const char *name,
61 struct lock_class_key *key);
62void init_timer_deferrable_key(struct timer_list *timer,
63 const char *name,
64 struct lock_class_key *key);
65
66#ifdef CONFIG_LOCKDEP
67#define init_timer(timer) \
68 do { \
69 static struct lock_class_key __key; \
70 init_timer_key((timer), #timer, &__key); \
71 } while (0)
72
73#define init_timer_deferrable(timer) \
74 do { \
75 static struct lock_class_key __key; \
76 init_timer_deferrable_key((timer), #timer, &__key); \
77 } while (0)
78
79#define init_timer_on_stack(timer) \
80 do { \
81 static struct lock_class_key __key; \
82 init_timer_on_stack_key((timer), #timer, &__key); \
83 } while (0)
84
85#define setup_timer(timer, fn, data) \
86 do { \
87 static struct lock_class_key __key; \
88 setup_timer_key((timer), #timer, &__key, (fn), (data));\
89 } while (0)
90
91#define setup_timer_on_stack(timer, fn, data) \
92 do { \
93 static struct lock_class_key __key; \
94 setup_timer_on_stack_key((timer), #timer, &__key, \
95 (fn), (data)); \
96 } while (0)
97#else
98#define init_timer(timer)\
99 init_timer_key((timer), NULL, NULL)
100#define init_timer_deferrable(timer)\
101 init_timer_deferrable_key((timer), NULL, NULL)
102#define init_timer_on_stack(timer)\
103 init_timer_on_stack_key((timer), NULL, NULL)
104#define setup_timer(timer, fn, data)\
105 setup_timer_key((timer), NULL, NULL, (fn), (data))
106#define setup_timer_on_stack(timer, fn, data)\
107 setup_timer_on_stack_key((timer), NULL, NULL, (fn), (data))
108#endif
42 109
43#ifdef CONFIG_DEBUG_OBJECTS_TIMERS 110#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
44extern void init_timer_on_stack(struct timer_list *timer); 111extern void init_timer_on_stack_key(struct timer_list *timer,
112 const char *name,
113 struct lock_class_key *key);
45extern void destroy_timer_on_stack(struct timer_list *timer); 114extern void destroy_timer_on_stack(struct timer_list *timer);
46#else 115#else
47static inline void destroy_timer_on_stack(struct timer_list *timer) { } 116static inline void destroy_timer_on_stack(struct timer_list *timer) { }
48static inline void init_timer_on_stack(struct timer_list *timer) 117static inline void init_timer_on_stack_key(struct timer_list *timer,
118 const char *name,
119 struct lock_class_key *key)
49{ 120{
50 init_timer(timer); 121 init_timer_key(timer, name, key);
51} 122}
52#endif 123#endif
53 124
54static inline void setup_timer(struct timer_list * timer, 125static inline void setup_timer_key(struct timer_list * timer,
126 const char *name,
127 struct lock_class_key *key,
55 void (*function)(unsigned long), 128 void (*function)(unsigned long),
56 unsigned long data) 129 unsigned long data)
57{ 130{
58 timer->function = function; 131 timer->function = function;
59 timer->data = data; 132 timer->data = data;
60 init_timer(timer); 133 init_timer_key(timer, name, key);
61} 134}
62 135
63static inline void setup_timer_on_stack(struct timer_list *timer, 136static inline void setup_timer_on_stack_key(struct timer_list *timer,
137 const char *name,
138 struct lock_class_key *key,
64 void (*function)(unsigned long), 139 void (*function)(unsigned long),
65 unsigned long data) 140 unsigned long data)
66{ 141{
67 timer->function = function; 142 timer->function = function;
68 timer->data = data; 143 timer->data = data;
69 init_timer_on_stack(timer); 144 init_timer_on_stack_key(timer, name, key);
70} 145}
71 146
72/** 147/**
diff --git a/include/linux/topology.h b/include/linux/topology.h
index e632d29f0544..a16b9e06f2e5 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -193,5 +193,11 @@ int arch_update_cpu_topology(void);
193#ifndef topology_core_siblings 193#ifndef topology_core_siblings
194#define topology_core_siblings(cpu) cpumask_of_cpu(cpu) 194#define topology_core_siblings(cpu) cpumask_of_cpu(cpu)
195#endif 195#endif
196#ifndef topology_thread_cpumask
197#define topology_thread_cpumask(cpu) cpumask_of(cpu)
198#endif
199#ifndef topology_core_cpumask
200#define topology_core_cpumask(cpu) cpumask_of(cpu)
201#endif
196 202
197#endif /* _LINUX_TOPOLOGY_H */ 203#endif /* _LINUX_TOPOLOGY_H */
diff --git a/include/linux/trace_clock.h b/include/linux/trace_clock.h
new file mode 100644
index 000000000000..7a8130384087
--- /dev/null
+++ b/include/linux/trace_clock.h
@@ -0,0 +1,19 @@
1#ifndef _LINUX_TRACE_CLOCK_H
2#define _LINUX_TRACE_CLOCK_H
3
4/*
5 * 3 trace clock variants, with differing scalability/precision
6 * tradeoffs:
7 *
8 * - local: CPU-local trace clock
9 * - medium: scalable global clock with some jitter
10 * - global: globally monotonic, serialized clock
11 */
12#include <linux/compiler.h>
13#include <linux/types.h>
14
15extern u64 notrace trace_clock_local(void);
16extern u64 notrace trace_clock(void);
17extern u64 notrace trace_clock_global(void);
18
19#endif /* _LINUX_TRACE_CLOCK_H */
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 757005458366..d35a7ee7611f 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -31,8 +31,8 @@ struct tracepoint {
31 * Keep in sync with vmlinux.lds.h. 31 * Keep in sync with vmlinux.lds.h.
32 */ 32 */
33 33
34#define TPPROTO(args...) args 34#define TP_PROTO(args...) args
35#define TPARGS(args...) args 35#define TP_ARGS(args...) args
36 36
37#ifdef CONFIG_TRACEPOINTS 37#ifdef CONFIG_TRACEPOINTS
38 38
@@ -65,7 +65,7 @@ struct tracepoint {
65 { \ 65 { \
66 if (unlikely(__tracepoint_##name.state)) \ 66 if (unlikely(__tracepoint_##name.state)) \
67 __DO_TRACE(&__tracepoint_##name, \ 67 __DO_TRACE(&__tracepoint_##name, \
68 TPPROTO(proto), TPARGS(args)); \ 68 TP_PROTO(proto), TP_ARGS(args)); \
69 } \ 69 } \
70 static inline int register_trace_##name(void (*probe)(proto)) \ 70 static inline int register_trace_##name(void (*probe)(proto)) \
71 { \ 71 { \
@@ -153,4 +153,114 @@ static inline void tracepoint_synchronize_unregister(void)
153 synchronize_sched(); 153 synchronize_sched();
154} 154}
155 155
156#define PARAMS(args...) args
157#define TRACE_FORMAT(name, proto, args, fmt) \
158 DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
159
160
161/*
162 * For use with the TRACE_EVENT macro:
163 *
164 * We define a tracepoint, its arguments, its printk format
165 * and its 'fast binay record' layout.
166 *
167 * Firstly, name your tracepoint via TRACE_EVENT(name : the
168 * 'subsystem_event' notation is fine.
169 *
170 * Think about this whole construct as the
171 * 'trace_sched_switch() function' from now on.
172 *
173 *
174 * TRACE_EVENT(sched_switch,
175 *
176 * *
177 * * A function has a regular function arguments
178 * * prototype, declare it via TP_PROTO():
179 * *
180 *
181 * TP_PROTO(struct rq *rq, struct task_struct *prev,
182 * struct task_struct *next),
183 *
184 * *
185 * * Define the call signature of the 'function'.
186 * * (Design sidenote: we use this instead of a
187 * * TP_PROTO1/TP_PROTO2/TP_PROTO3 ugliness.)
188 * *
189 *
190 * TP_ARGS(rq, prev, next),
191 *
192 * *
193 * * Fast binary tracing: define the trace record via
194 * * TP_STRUCT__entry(). You can think about it like a
195 * * regular C structure local variable definition.
196 * *
197 * * This is how the trace record is structured and will
198 * * be saved into the ring buffer. These are the fields
199 * * that will be exposed to user-space in
200 * * /debug/tracing/events/<*>/format.
201 * *
202 * * The declared 'local variable' is called '__entry'
203 * *
204 * * __field(pid_t, prev_prid) is equivalent to a standard declariton:
205 * *
206 * * pid_t prev_pid;
207 * *
208 * * __array(char, prev_comm, TASK_COMM_LEN) is equivalent to:
209 * *
210 * * char prev_comm[TASK_COMM_LEN];
211 * *
212 *
213 * TP_STRUCT__entry(
214 * __array( char, prev_comm, TASK_COMM_LEN )
215 * __field( pid_t, prev_pid )
216 * __field( int, prev_prio )
217 * __array( char, next_comm, TASK_COMM_LEN )
218 * __field( pid_t, next_pid )
219 * __field( int, next_prio )
220 * ),
221 *
222 * *
223 * * Assign the entry into the trace record, by embedding
224 * * a full C statement block into TP_fast_assign(). You
225 * * can refer to the trace record as '__entry' -
226 * * otherwise you can put arbitrary C code in here.
227 * *
228 * * Note: this C code will execute every time a trace event
229 * * happens, on an active tracepoint.
230 * *
231 *
232 * TP_fast_assign(
233 * memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
234 * __entry->prev_pid = prev->pid;
235 * __entry->prev_prio = prev->prio;
236 * memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
237 * __entry->next_pid = next->pid;
238 * __entry->next_prio = next->prio;
239 * )
240 *
241 * *
242 * * Formatted output of a trace record via TP_printk().
243 * * This is how the tracepoint will appear under ftrace
244 * * plugins that make use of this tracepoint.
245 * *
246 * * (raw-binary tracing wont actually perform this step.)
247 * *
248 *
249 * TP_printk("task %s:%d [%d] ==> %s:%d [%d]",
250 * __entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
251 * __entry->next_comm, __entry->next_pid, __entry->next_prio),
252 *
253 * );
254 *
255 * This macro construct is thus used for the regular printk format
256 * tracing setup, it is used to construct a function pointer based
257 * tracepoint callback (this is used by programmatic plugins and
258 * can also by used by generic instrumentation like SystemTap), and
259 * it is also used to expose a structured trace record in
260 * /debug/tracing/events/.
261 */
262
263#define TRACE_EVENT(name, proto, args, struct, assign, print) \
264 DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
265
156#endif 266#endif
diff --git a/include/linux/types.h b/include/linux/types.h
index 712ca53bc348..fca82ed55f49 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -1,6 +1,9 @@
1#ifndef _LINUX_TYPES_H 1#ifndef _LINUX_TYPES_H
2#define _LINUX_TYPES_H 2#define _LINUX_TYPES_H
3 3
4#include <asm/types.h>
5
6#ifndef __ASSEMBLY__
4#ifdef __KERNEL__ 7#ifdef __KERNEL__
5 8
6#define DECLARE_BITMAP(name,bits) \ 9#define DECLARE_BITMAP(name,bits) \
@@ -9,7 +12,6 @@
9#endif 12#endif
10 13
11#include <linux/posix_types.h> 14#include <linux/posix_types.h>
12#include <asm/types.h>
13 15
14#ifndef __KERNEL_STRICT_NAMES 16#ifndef __KERNEL_STRICT_NAMES
15 17
@@ -212,5 +214,5 @@ struct ustat {
212}; 214};
213 215
214#endif /* __KERNEL__ */ 216#endif /* __KERNEL__ */
215 217#endif /* __ASSEMBLY__ */
216#endif /* _LINUX_TYPES_H */ 218#endif /* _LINUX_TYPES_H */
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 9c0890c7a06a..a43ebec3a7b9 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -95,6 +95,9 @@ extern struct vm_struct *remove_vm_area(const void *addr);
95 95
96extern int map_vm_area(struct vm_struct *area, pgprot_t prot, 96extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
97 struct page ***pages); 97 struct page ***pages);
98extern int map_kernel_range_noflush(unsigned long start, unsigned long size,
99 pgprot_t prot, struct page **pages);
100extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size);
98extern void unmap_kernel_range(unsigned long addr, unsigned long size); 101extern void unmap_kernel_range(unsigned long addr, unsigned long size);
99 102
100/* Allocate/destroy a 'vmalloc' VM area. */ 103/* Allocate/destroy a 'vmalloc' VM area. */
@@ -110,5 +113,6 @@ extern long vwrite(char *buf, char *addr, unsigned long count);
110 */ 113 */
111extern rwlock_t vmlist_lock; 114extern rwlock_t vmlist_lock;
112extern struct vm_struct *vmlist; 115extern struct vm_struct *vmlist;
116extern __init void vm_area_register_early(struct vm_struct *vm, size_t align);
113 117
114#endif /* _LINUX_VMALLOC_H */ 118#endif /* _LINUX_VMALLOC_H */
diff --git a/include/trace/block.h b/include/trace/block.h
index 25c6a1fd5b77..25b7068b819e 100644
--- a/include/trace/block.h
+++ b/include/trace/block.h
@@ -5,72 +5,72 @@
5#include <linux/tracepoint.h> 5#include <linux/tracepoint.h>
6 6
7DECLARE_TRACE(block_rq_abort, 7DECLARE_TRACE(block_rq_abort,
8 TPPROTO(struct request_queue *q, struct request *rq), 8 TP_PROTO(struct request_queue *q, struct request *rq),
9 TPARGS(q, rq)); 9 TP_ARGS(q, rq));
10 10
11DECLARE_TRACE(block_rq_insert, 11DECLARE_TRACE(block_rq_insert,
12 TPPROTO(struct request_queue *q, struct request *rq), 12 TP_PROTO(struct request_queue *q, struct request *rq),
13 TPARGS(q, rq)); 13 TP_ARGS(q, rq));
14 14
15DECLARE_TRACE(block_rq_issue, 15DECLARE_TRACE(block_rq_issue,
16 TPPROTO(struct request_queue *q, struct request *rq), 16 TP_PROTO(struct request_queue *q, struct request *rq),
17 TPARGS(q, rq)); 17 TP_ARGS(q, rq));
18 18
19DECLARE_TRACE(block_rq_requeue, 19DECLARE_TRACE(block_rq_requeue,
20 TPPROTO(struct request_queue *q, struct request *rq), 20 TP_PROTO(struct request_queue *q, struct request *rq),
21 TPARGS(q, rq)); 21 TP_ARGS(q, rq));
22 22
23DECLARE_TRACE(block_rq_complete, 23DECLARE_TRACE(block_rq_complete,
24 TPPROTO(struct request_queue *q, struct request *rq), 24 TP_PROTO(struct request_queue *q, struct request *rq),
25 TPARGS(q, rq)); 25 TP_ARGS(q, rq));
26 26
27DECLARE_TRACE(block_bio_bounce, 27DECLARE_TRACE(block_bio_bounce,
28 TPPROTO(struct request_queue *q, struct bio *bio), 28 TP_PROTO(struct request_queue *q, struct bio *bio),
29 TPARGS(q, bio)); 29 TP_ARGS(q, bio));
30 30
31DECLARE_TRACE(block_bio_complete, 31DECLARE_TRACE(block_bio_complete,
32 TPPROTO(struct request_queue *q, struct bio *bio), 32 TP_PROTO(struct request_queue *q, struct bio *bio),
33 TPARGS(q, bio)); 33 TP_ARGS(q, bio));
34 34
35DECLARE_TRACE(block_bio_backmerge, 35DECLARE_TRACE(block_bio_backmerge,
36 TPPROTO(struct request_queue *q, struct bio *bio), 36 TP_PROTO(struct request_queue *q, struct bio *bio),
37 TPARGS(q, bio)); 37 TP_ARGS(q, bio));
38 38
39DECLARE_TRACE(block_bio_frontmerge, 39DECLARE_TRACE(block_bio_frontmerge,
40 TPPROTO(struct request_queue *q, struct bio *bio), 40 TP_PROTO(struct request_queue *q, struct bio *bio),
41 TPARGS(q, bio)); 41 TP_ARGS(q, bio));
42 42
43DECLARE_TRACE(block_bio_queue, 43DECLARE_TRACE(block_bio_queue,
44 TPPROTO(struct request_queue *q, struct bio *bio), 44 TP_PROTO(struct request_queue *q, struct bio *bio),
45 TPARGS(q, bio)); 45 TP_ARGS(q, bio));
46 46
47DECLARE_TRACE(block_getrq, 47DECLARE_TRACE(block_getrq,
48 TPPROTO(struct request_queue *q, struct bio *bio, int rw), 48 TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
49 TPARGS(q, bio, rw)); 49 TP_ARGS(q, bio, rw));
50 50
51DECLARE_TRACE(block_sleeprq, 51DECLARE_TRACE(block_sleeprq,
52 TPPROTO(struct request_queue *q, struct bio *bio, int rw), 52 TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
53 TPARGS(q, bio, rw)); 53 TP_ARGS(q, bio, rw));
54 54
55DECLARE_TRACE(block_plug, 55DECLARE_TRACE(block_plug,
56 TPPROTO(struct request_queue *q), 56 TP_PROTO(struct request_queue *q),
57 TPARGS(q)); 57 TP_ARGS(q));
58 58
59DECLARE_TRACE(block_unplug_timer, 59DECLARE_TRACE(block_unplug_timer,
60 TPPROTO(struct request_queue *q), 60 TP_PROTO(struct request_queue *q),
61 TPARGS(q)); 61 TP_ARGS(q));
62 62
63DECLARE_TRACE(block_unplug_io, 63DECLARE_TRACE(block_unplug_io,
64 TPPROTO(struct request_queue *q), 64 TP_PROTO(struct request_queue *q),
65 TPARGS(q)); 65 TP_ARGS(q));
66 66
67DECLARE_TRACE(block_split, 67DECLARE_TRACE(block_split,
68 TPPROTO(struct request_queue *q, struct bio *bio, unsigned int pdu), 68 TP_PROTO(struct request_queue *q, struct bio *bio, unsigned int pdu),
69 TPARGS(q, bio, pdu)); 69 TP_ARGS(q, bio, pdu));
70 70
71DECLARE_TRACE(block_remap, 71DECLARE_TRACE(block_remap,
72 TPPROTO(struct request_queue *q, struct bio *bio, dev_t dev, 72 TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
73 sector_t from, sector_t to), 73 sector_t from, sector_t to),
74 TPARGS(q, bio, dev, from, to)); 74 TP_ARGS(q, bio, dev, from, to));
75 75
76#endif 76#endif
diff --git a/include/trace/irq.h b/include/trace/irq.h
new file mode 100644
index 000000000000..ff5d4495dc37
--- /dev/null
+++ b/include/trace/irq.h
@@ -0,0 +1,9 @@
1#ifndef _TRACE_IRQ_H
2#define _TRACE_IRQ_H
3
4#include <linux/interrupt.h>
5#include <linux/tracepoint.h>
6
7#include <trace/irq_event_types.h>
8
9#endif
diff --git a/include/trace/irq_event_types.h b/include/trace/irq_event_types.h
new file mode 100644
index 000000000000..85964ebd47ec
--- /dev/null
+++ b/include/trace/irq_event_types.h
@@ -0,0 +1,55 @@
1
2/* use <trace/irq.h> instead */
3#ifndef TRACE_FORMAT
4# error Do not include this file directly.
5# error Unless you know what you are doing.
6#endif
7
8#undef TRACE_SYSTEM
9#define TRACE_SYSTEM irq
10
11/*
12 * Tracepoint for entry of interrupt handler:
13 */
14TRACE_FORMAT(irq_handler_entry,
15 TP_PROTO(int irq, struct irqaction *action),
16 TP_ARGS(irq, action),
17 TP_FMT("irq=%d handler=%s", irq, action->name)
18 );
19
20/*
21 * Tracepoint for return of an interrupt handler:
22 */
23TRACE_EVENT(irq_handler_exit,
24
25 TP_PROTO(int irq, struct irqaction *action, int ret),
26
27 TP_ARGS(irq, action, ret),
28
29 TP_STRUCT__entry(
30 __field( int, irq )
31 __field( int, ret )
32 ),
33
34 TP_fast_assign(
35 __entry->irq = irq;
36 __entry->ret = ret;
37 ),
38
39 TP_printk("irq=%d return=%s",
40 __entry->irq, __entry->ret ? "handled" : "unhandled")
41);
42
43TRACE_FORMAT(softirq_entry,
44 TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
45 TP_ARGS(h, vec),
46 TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec])
47 );
48
49TRACE_FORMAT(softirq_exit,
50 TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
51 TP_ARGS(h, vec),
52 TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec])
53 );
54
55#undef TRACE_SYSTEM
diff --git a/include/trace/kmemtrace.h b/include/trace/kmemtrace.h
new file mode 100644
index 000000000000..ad8b7857855a
--- /dev/null
+++ b/include/trace/kmemtrace.h
@@ -0,0 +1,75 @@
1/*
2 * Copyright (C) 2008 Eduard - Gabriel Munteanu
3 *
4 * This file is released under GPL version 2.
5 */
6
7#ifndef _LINUX_KMEMTRACE_H
8#define _LINUX_KMEMTRACE_H
9
10#ifdef __KERNEL__
11
12#include <linux/types.h>
13#include <linux/marker.h>
14
15enum kmemtrace_type_id {
16 KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */
17 KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
18 KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
19};
20
21#ifdef CONFIG_KMEMTRACE
22
23extern void kmemtrace_init(void);
24
25extern void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id,
26 unsigned long call_site,
27 const void *ptr,
28 size_t bytes_req,
29 size_t bytes_alloc,
30 gfp_t gfp_flags,
31 int node);
32
33extern void kmemtrace_mark_free(enum kmemtrace_type_id type_id,
34 unsigned long call_site,
35 const void *ptr);
36
37#else /* CONFIG_KMEMTRACE */
38
39static inline void kmemtrace_init(void)
40{
41}
42
43static inline void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id,
44 unsigned long call_site,
45 const void *ptr,
46 size_t bytes_req,
47 size_t bytes_alloc,
48 gfp_t gfp_flags,
49 int node)
50{
51}
52
53static inline void kmemtrace_mark_free(enum kmemtrace_type_id type_id,
54 unsigned long call_site,
55 const void *ptr)
56{
57}
58
59#endif /* CONFIG_KMEMTRACE */
60
61static inline void kmemtrace_mark_alloc(enum kmemtrace_type_id type_id,
62 unsigned long call_site,
63 const void *ptr,
64 size_t bytes_req,
65 size_t bytes_alloc,
66 gfp_t gfp_flags)
67{
68 kmemtrace_mark_alloc_node(type_id, call_site, ptr,
69 bytes_req, bytes_alloc, gfp_flags, -1);
70}
71
72#endif /* __KERNEL__ */
73
74#endif /* _LINUX_KMEMTRACE_H */
75
diff --git a/include/trace/lockdep.h b/include/trace/lockdep.h
new file mode 100644
index 000000000000..5ca67df87f2a
--- /dev/null
+++ b/include/trace/lockdep.h
@@ -0,0 +1,9 @@
1#ifndef _TRACE_LOCKDEP_H
2#define _TRACE_LOCKDEP_H
3
4#include <linux/lockdep.h>
5#include <linux/tracepoint.h>
6
7#include <trace/lockdep_event_types.h>
8
9#endif
diff --git a/include/trace/lockdep_event_types.h b/include/trace/lockdep_event_types.h
new file mode 100644
index 000000000000..adccfcd2ec8f
--- /dev/null
+++ b/include/trace/lockdep_event_types.h
@@ -0,0 +1,44 @@
1
2#ifndef TRACE_FORMAT
3# error Do not include this file directly.
4# error Unless you know what you are doing.
5#endif
6
7#undef TRACE_SYSTEM
8#define TRACE_SYSTEM lock
9
10#ifdef CONFIG_LOCKDEP
11
12TRACE_FORMAT(lock_acquire,
13 TP_PROTO(struct lockdep_map *lock, unsigned int subclass,
14 int trylock, int read, int check,
15 struct lockdep_map *next_lock, unsigned long ip),
16 TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip),
17 TP_FMT("%s%s%s", trylock ? "try " : "",
18 read ? "read " : "", lock->name)
19 );
20
21TRACE_FORMAT(lock_release,
22 TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip),
23 TP_ARGS(lock, nested, ip),
24 TP_FMT("%s", lock->name)
25 );
26
27#ifdef CONFIG_LOCK_STAT
28
29TRACE_FORMAT(lock_contended,
30 TP_PROTO(struct lockdep_map *lock, unsigned long ip),
31 TP_ARGS(lock, ip),
32 TP_FMT("%s", lock->name)
33 );
34
35TRACE_FORMAT(lock_acquired,
36 TP_PROTO(struct lockdep_map *lock, unsigned long ip),
37 TP_ARGS(lock, ip),
38 TP_FMT("%s", lock->name)
39 );
40
41#endif
42#endif
43
44#undef TRACE_SYSTEM
diff --git a/include/trace/power.h b/include/trace/power.h
new file mode 100644
index 000000000000..ef204666e983
--- /dev/null
+++ b/include/trace/power.h
@@ -0,0 +1,32 @@
1#ifndef _TRACE_POWER_H
2#define _TRACE_POWER_H
3
4#include <linux/ktime.h>
5#include <linux/tracepoint.h>
6
7enum {
8 POWER_NONE = 0,
9 POWER_CSTATE = 1,
10 POWER_PSTATE = 2,
11};
12
13struct power_trace {
14 ktime_t stamp;
15 ktime_t end;
16 int type;
17 int state;
18};
19
20DECLARE_TRACE(power_start,
21 TP_PROTO(struct power_trace *it, unsigned int type, unsigned int state),
22 TP_ARGS(it, type, state));
23
24DECLARE_TRACE(power_mark,
25 TP_PROTO(struct power_trace *it, unsigned int type, unsigned int state),
26 TP_ARGS(it, type, state));
27
28DECLARE_TRACE(power_end,
29 TP_PROTO(struct power_trace *it),
30 TP_ARGS(it));
31
32#endif /* _TRACE_POWER_H */
diff --git a/include/trace/sched.h b/include/trace/sched.h
index 0d81098ee9fc..4e372a1a29bf 100644
--- a/include/trace/sched.h
+++ b/include/trace/sched.h
@@ -4,53 +4,6 @@
4#include <linux/sched.h> 4#include <linux/sched.h>
5#include <linux/tracepoint.h> 5#include <linux/tracepoint.h>
6 6
7DECLARE_TRACE(sched_kthread_stop, 7#include <trace/sched_event_types.h>
8 TPPROTO(struct task_struct *t),
9 TPARGS(t));
10
11DECLARE_TRACE(sched_kthread_stop_ret,
12 TPPROTO(int ret),
13 TPARGS(ret));
14
15DECLARE_TRACE(sched_wait_task,
16 TPPROTO(struct rq *rq, struct task_struct *p),
17 TPARGS(rq, p));
18
19DECLARE_TRACE(sched_wakeup,
20 TPPROTO(struct rq *rq, struct task_struct *p, int success),
21 TPARGS(rq, p, success));
22
23DECLARE_TRACE(sched_wakeup_new,
24 TPPROTO(struct rq *rq, struct task_struct *p, int success),
25 TPARGS(rq, p, success));
26
27DECLARE_TRACE(sched_switch,
28 TPPROTO(struct rq *rq, struct task_struct *prev,
29 struct task_struct *next),
30 TPARGS(rq, prev, next));
31
32DECLARE_TRACE(sched_migrate_task,
33 TPPROTO(struct task_struct *p, int orig_cpu, int dest_cpu),
34 TPARGS(p, orig_cpu, dest_cpu));
35
36DECLARE_TRACE(sched_process_free,
37 TPPROTO(struct task_struct *p),
38 TPARGS(p));
39
40DECLARE_TRACE(sched_process_exit,
41 TPPROTO(struct task_struct *p),
42 TPARGS(p));
43
44DECLARE_TRACE(sched_process_wait,
45 TPPROTO(struct pid *pid),
46 TPARGS(pid));
47
48DECLARE_TRACE(sched_process_fork,
49 TPPROTO(struct task_struct *parent, struct task_struct *child),
50 TPARGS(parent, child));
51
52DECLARE_TRACE(sched_signal_send,
53 TPPROTO(int sig, struct task_struct *p),
54 TPARGS(sig, p));
55 8
56#endif 9#endif
diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h
new file mode 100644
index 000000000000..63547dc1125f
--- /dev/null
+++ b/include/trace/sched_event_types.h
@@ -0,0 +1,337 @@
1
2/* use <trace/sched.h> instead */
3#ifndef TRACE_EVENT
4# error Do not include this file directly.
5# error Unless you know what you are doing.
6#endif
7
8#undef TRACE_SYSTEM
9#define TRACE_SYSTEM sched
10
11/*
12 * Tracepoint for calling kthread_stop, performed to end a kthread:
13 */
14TRACE_EVENT(sched_kthread_stop,
15
16 TP_PROTO(struct task_struct *t),
17
18 TP_ARGS(t),
19
20 TP_STRUCT__entry(
21 __array( char, comm, TASK_COMM_LEN )
22 __field( pid_t, pid )
23 ),
24
25 TP_fast_assign(
26 memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
27 __entry->pid = t->pid;
28 ),
29
30 TP_printk("task %s:%d", __entry->comm, __entry->pid)
31);
32
33/*
34 * Tracepoint for the return value of the kthread stopping:
35 */
36TRACE_EVENT(sched_kthread_stop_ret,
37
38 TP_PROTO(int ret),
39
40 TP_ARGS(ret),
41
42 TP_STRUCT__entry(
43 __field( int, ret )
44 ),
45
46 TP_fast_assign(
47 __entry->ret = ret;
48 ),
49
50 TP_printk("ret %d", __entry->ret)
51);
52
53/*
54 * Tracepoint for waiting on task to unschedule:
55 *
56 * (NOTE: the 'rq' argument is not used by generic trace events,
57 * but used by the latency tracer plugin. )
58 */
59TRACE_EVENT(sched_wait_task,
60
61 TP_PROTO(struct rq *rq, struct task_struct *p),
62
63 TP_ARGS(rq, p),
64
65 TP_STRUCT__entry(
66 __array( char, comm, TASK_COMM_LEN )
67 __field( pid_t, pid )
68 __field( int, prio )
69 ),
70
71 TP_fast_assign(
72 memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
73 __entry->pid = p->pid;
74 __entry->prio = p->prio;
75 ),
76
77 TP_printk("task %s:%d [%d]",
78 __entry->comm, __entry->pid, __entry->prio)
79);
80
81/*
82 * Tracepoint for waking up a task:
83 *
84 * (NOTE: the 'rq' argument is not used by generic trace events,
85 * but used by the latency tracer plugin. )
86 */
87TRACE_EVENT(sched_wakeup,
88
89 TP_PROTO(struct rq *rq, struct task_struct *p, int success),
90
91 TP_ARGS(rq, p, success),
92
93 TP_STRUCT__entry(
94 __array( char, comm, TASK_COMM_LEN )
95 __field( pid_t, pid )
96 __field( int, prio )
97 __field( int, success )
98 ),
99
100 TP_fast_assign(
101 memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
102 __entry->pid = p->pid;
103 __entry->prio = p->prio;
104 __entry->success = success;
105 ),
106
107 TP_printk("task %s:%d [%d] success=%d",
108 __entry->comm, __entry->pid, __entry->prio,
109 __entry->success)
110);
111
112/*
113 * Tracepoint for waking up a new task:
114 *
115 * (NOTE: the 'rq' argument is not used by generic trace events,
116 * but used by the latency tracer plugin. )
117 */
118TRACE_EVENT(sched_wakeup_new,
119
120 TP_PROTO(struct rq *rq, struct task_struct *p, int success),
121
122 TP_ARGS(rq, p, success),
123
124 TP_STRUCT__entry(
125 __array( char, comm, TASK_COMM_LEN )
126 __field( pid_t, pid )
127 __field( int, prio )
128 __field( int, success )
129 ),
130
131 TP_fast_assign(
132 memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
133 __entry->pid = p->pid;
134 __entry->prio = p->prio;
135 __entry->success = success;
136 ),
137
138 TP_printk("task %s:%d [%d] success=%d",
139 __entry->comm, __entry->pid, __entry->prio,
140 __entry->success)
141);
142
143/*
144 * Tracepoint for task switches, performed by the scheduler:
145 *
146 * (NOTE: the 'rq' argument is not used by generic trace events,
147 * but used by the latency tracer plugin. )
148 */
149TRACE_EVENT(sched_switch,
150
151 TP_PROTO(struct rq *rq, struct task_struct *prev,
152 struct task_struct *next),
153
154 TP_ARGS(rq, prev, next),
155
156 TP_STRUCT__entry(
157 __array( char, prev_comm, TASK_COMM_LEN )
158 __field( pid_t, prev_pid )
159 __field( int, prev_prio )
160 __array( char, next_comm, TASK_COMM_LEN )
161 __field( pid_t, next_pid )
162 __field( int, next_prio )
163 ),
164
165 TP_fast_assign(
166 memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
167 __entry->prev_pid = prev->pid;
168 __entry->prev_prio = prev->prio;
169 memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
170 __entry->next_pid = next->pid;
171 __entry->next_prio = next->prio;
172 ),
173
174 TP_printk("task %s:%d [%d] ==> %s:%d [%d]",
175 __entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
176 __entry->next_comm, __entry->next_pid, __entry->next_prio)
177);
178
179/*
180 * Tracepoint for a task being migrated:
181 */
182TRACE_EVENT(sched_migrate_task,
183
184 TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu),
185
186 TP_ARGS(p, orig_cpu, dest_cpu),
187
188 TP_STRUCT__entry(
189 __array( char, comm, TASK_COMM_LEN )
190 __field( pid_t, pid )
191 __field( int, prio )
192 __field( int, orig_cpu )
193 __field( int, dest_cpu )
194 ),
195
196 TP_fast_assign(
197 memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
198 __entry->pid = p->pid;
199 __entry->prio = p->prio;
200 __entry->orig_cpu = orig_cpu;
201 __entry->dest_cpu = dest_cpu;
202 ),
203
204 TP_printk("task %s:%d [%d] from: %d to: %d",
205 __entry->comm, __entry->pid, __entry->prio,
206 __entry->orig_cpu, __entry->dest_cpu)
207);
208
209/*
210 * Tracepoint for freeing a task:
211 */
212TRACE_EVENT(sched_process_free,
213
214 TP_PROTO(struct task_struct *p),
215
216 TP_ARGS(p),
217
218 TP_STRUCT__entry(
219 __array( char, comm, TASK_COMM_LEN )
220 __field( pid_t, pid )
221 __field( int, prio )
222 ),
223
224 TP_fast_assign(
225 memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
226 __entry->pid = p->pid;
227 __entry->prio = p->prio;
228 ),
229
230 TP_printk("task %s:%d [%d]",
231 __entry->comm, __entry->pid, __entry->prio)
232);
233
234/*
235 * Tracepoint for a task exiting:
236 */
237TRACE_EVENT(sched_process_exit,
238
239 TP_PROTO(struct task_struct *p),
240
241 TP_ARGS(p),
242
243 TP_STRUCT__entry(
244 __array( char, comm, TASK_COMM_LEN )
245 __field( pid_t, pid )
246 __field( int, prio )
247 ),
248
249 TP_fast_assign(
250 memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
251 __entry->pid = p->pid;
252 __entry->prio = p->prio;
253 ),
254
255 TP_printk("task %s:%d [%d]",
256 __entry->comm, __entry->pid, __entry->prio)
257);
258
259/*
260 * Tracepoint for a waiting task:
261 */
262TRACE_EVENT(sched_process_wait,
263
264 TP_PROTO(struct pid *pid),
265
266 TP_ARGS(pid),
267
268 TP_STRUCT__entry(
269 __array( char, comm, TASK_COMM_LEN )
270 __field( pid_t, pid )
271 __field( int, prio )
272 ),
273
274 TP_fast_assign(
275 memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
276 __entry->pid = pid_nr(pid);
277 __entry->prio = current->prio;
278 ),
279
280 TP_printk("task %s:%d [%d]",
281 __entry->comm, __entry->pid, __entry->prio)
282);
283
284/*
285 * Tracepoint for do_fork:
286 */
287TRACE_EVENT(sched_process_fork,
288
289 TP_PROTO(struct task_struct *parent, struct task_struct *child),
290
291 TP_ARGS(parent, child),
292
293 TP_STRUCT__entry(
294 __array( char, parent_comm, TASK_COMM_LEN )
295 __field( pid_t, parent_pid )
296 __array( char, child_comm, TASK_COMM_LEN )
297 __field( pid_t, child_pid )
298 ),
299
300 TP_fast_assign(
301 memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN);
302 __entry->parent_pid = parent->pid;
303 memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN);
304 __entry->child_pid = child->pid;
305 ),
306
307 TP_printk("parent %s:%d child %s:%d",
308 __entry->parent_comm, __entry->parent_pid,
309 __entry->child_comm, __entry->child_pid)
310);
311
312/*
313 * Tracepoint for sending a signal:
314 */
315TRACE_EVENT(sched_signal_send,
316
317 TP_PROTO(int sig, struct task_struct *p),
318
319 TP_ARGS(sig, p),
320
321 TP_STRUCT__entry(
322 __field( int, sig )
323 __array( char, comm, TASK_COMM_LEN )
324 __field( pid_t, pid )
325 ),
326
327 TP_fast_assign(
328 memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
329 __entry->pid = p->pid;
330 __entry->sig = sig;
331 ),
332
333 TP_printk("sig: %d task %s:%d",
334 __entry->sig, __entry->comm, __entry->pid)
335);
336
337#undef TRACE_SYSTEM
diff --git a/include/trace/trace_event_types.h b/include/trace/trace_event_types.h
new file mode 100644
index 000000000000..df56f5694be6
--- /dev/null
+++ b/include/trace/trace_event_types.h
@@ -0,0 +1,5 @@
1/* trace/<type>_event_types.h here */
2
3#include <trace/sched_event_types.h>
4#include <trace/irq_event_types.h>
5#include <trace/lockdep_event_types.h>
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
new file mode 100644
index 000000000000..fd13750ca4ba
--- /dev/null
+++ b/include/trace/trace_events.h
@@ -0,0 +1,5 @@
1/* trace/<type>.h here */
2
3#include <trace/sched.h>
4#include <trace/irq.h>
5#include <trace/lockdep.h>
diff --git a/include/trace/workqueue.h b/include/trace/workqueue.h
new file mode 100644
index 000000000000..7626523deeba
--- /dev/null
+++ b/include/trace/workqueue.h
@@ -0,0 +1,25 @@
1#ifndef __TRACE_WORKQUEUE_H
2#define __TRACE_WORKQUEUE_H
3
4#include <linux/tracepoint.h>
5#include <linux/workqueue.h>
6#include <linux/sched.h>
7
8DECLARE_TRACE(workqueue_insertion,
9 TP_PROTO(struct task_struct *wq_thread, struct work_struct *work),
10 TP_ARGS(wq_thread, work));
11
12DECLARE_TRACE(workqueue_execution,
13 TP_PROTO(struct task_struct *wq_thread, struct work_struct *work),
14 TP_ARGS(wq_thread, work));
15
16/* Trace the creation of one workqueue thread on a cpu */
17DECLARE_TRACE(workqueue_creation,
18 TP_PROTO(struct task_struct *wq_thread, int cpu),
19 TP_ARGS(wq_thread, cpu));
20
21DECLARE_TRACE(workqueue_destruction,
22 TP_PROTO(struct task_struct *wq_thread),
23 TP_ARGS(wq_thread));
24
25#endif /* __TRACE_WORKQUEUE_H */
diff --git a/init/Kconfig b/init/Kconfig
index 6a5c5fed66c9..69d5190918e5 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -101,6 +101,66 @@ config LOCALVERSION_AUTO
101 101
102 which is done within the script "scripts/setlocalversion".) 102 which is done within the script "scripts/setlocalversion".)
103 103
104config HAVE_KERNEL_GZIP
105 bool
106
107config HAVE_KERNEL_BZIP2
108 bool
109
110config HAVE_KERNEL_LZMA
111 bool
112
113choice
114 prompt "Kernel compression mode"
115 default KERNEL_GZIP
116 depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA
117 help
118 The linux kernel is a kind of self-extracting executable.
119 Several compression algorithms are available, which differ
120 in efficiency, compression and decompression speed.
121 Compression speed is only relevant when building a kernel.
122 Decompression speed is relevant at each boot.
123
124 If you have any problems with bzip2 or lzma compressed
125 kernels, mail me (Alain Knaff) <alain@knaff.lu>. (An older
126 version of this functionality (bzip2 only), for 2.4, was
127 supplied by Christian Ludwig)
128
129 High compression options are mostly useful for users, who
130 are low on disk space (embedded systems), but for whom ram
131 size matters less.
132
133 If in doubt, select 'gzip'
134
135config KERNEL_GZIP
136 bool "Gzip"
137 depends on HAVE_KERNEL_GZIP
138 help
139 The old and tried gzip compression. Its compression ratio is
140 the poorest among the 3 choices; however its speed (both
141 compression and decompression) is the fastest.
142
143config KERNEL_BZIP2
144 bool "Bzip2"
145 depends on HAVE_KERNEL_BZIP2
146 help
147 Its compression ratio and speed is intermediate.
148 Decompression speed is slowest among the three. The kernel
149 size is about 10% smaller with bzip2, in comparison to gzip.
150 Bzip2 uses a large amount of memory. For modern kernels you
151 will need at least 8MB RAM or more for booting.
152
153config KERNEL_LZMA
154 bool "LZMA"
155 depends on HAVE_KERNEL_LZMA
156 help
157 The most recent compression algorithm.
158 Its ratio is best, decompression speed is between the other
159 two. Compression is slowest. The kernel size is about 33%
160 smaller with LZMA in comparison to gzip.
161
162endchoice
163
104config SWAP 164config SWAP
105 bool "Support for paging of anonymous memory (swap)" 165 bool "Support for paging of anonymous memory (swap)"
106 depends on MMU && BLOCK 166 depends on MMU && BLOCK
@@ -945,7 +1005,7 @@ config TRACEPOINTS
945 1005
946config MARKERS 1006config MARKERS
947 bool "Activate markers" 1007 bool "Activate markers"
948 depends on TRACEPOINTS 1008 select TRACEPOINTS
949 help 1009 help
950 Place an empty function call at each marker site. Can be 1010 Place an empty function call at each marker site. Can be
951 dynamically changed for a probe function. 1011 dynamically changed for a probe function.
diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c
index 0f0f0cf3ba9a..027a402708de 100644
--- a/init/do_mounts_rd.c
+++ b/init/do_mounts_rd.c
@@ -11,6 +11,9 @@
11#include "do_mounts.h" 11#include "do_mounts.h"
12#include "../fs/squashfs/squashfs_fs.h" 12#include "../fs/squashfs/squashfs_fs.h"
13 13
14#include <linux/decompress/generic.h>
15
16
14int __initdata rd_prompt = 1;/* 1 = prompt for RAM disk, 0 = don't prompt */ 17int __initdata rd_prompt = 1;/* 1 = prompt for RAM disk, 0 = don't prompt */
15 18
16static int __init prompt_ramdisk(char *str) 19static int __init prompt_ramdisk(char *str)
@@ -29,7 +32,7 @@ static int __init ramdisk_start_setup(char *str)
29} 32}
30__setup("ramdisk_start=", ramdisk_start_setup); 33__setup("ramdisk_start=", ramdisk_start_setup);
31 34
32static int __init crd_load(int in_fd, int out_fd); 35static int __init crd_load(int in_fd, int out_fd, decompress_fn deco);
33 36
34/* 37/*
35 * This routine tries to find a RAM disk image to load, and returns the 38 * This routine tries to find a RAM disk image to load, and returns the
@@ -38,15 +41,15 @@ static int __init crd_load(int in_fd, int out_fd);
38 * numbers could not be found. 41 * numbers could not be found.
39 * 42 *
40 * We currently check for the following magic numbers: 43 * We currently check for the following magic numbers:
41 * minix 44 * minix
42 * ext2 45 * ext2
43 * romfs 46 * romfs
44 * cramfs 47 * cramfs
45 * squashfs 48 * squashfs
46 * gzip 49 * gzip
47 */ 50 */
48static int __init 51static int __init
49identify_ramdisk_image(int fd, int start_block) 52identify_ramdisk_image(int fd, int start_block, decompress_fn *decompressor)
50{ 53{
51 const int size = 512; 54 const int size = 512;
52 struct minix_super_block *minixsb; 55 struct minix_super_block *minixsb;
@@ -56,6 +59,7 @@ identify_ramdisk_image(int fd, int start_block)
56 struct squashfs_super_block *squashfsb; 59 struct squashfs_super_block *squashfsb;
57 int nblocks = -1; 60 int nblocks = -1;
58 unsigned char *buf; 61 unsigned char *buf;
62 const char *compress_name;
59 63
60 buf = kmalloc(size, GFP_KERNEL); 64 buf = kmalloc(size, GFP_KERNEL);
61 if (!buf) 65 if (!buf)
@@ -69,18 +73,19 @@ identify_ramdisk_image(int fd, int start_block)
69 memset(buf, 0xe5, size); 73 memset(buf, 0xe5, size);
70 74
71 /* 75 /*
72 * Read block 0 to test for gzipped kernel 76 * Read block 0 to test for compressed kernel
73 */ 77 */
74 sys_lseek(fd, start_block * BLOCK_SIZE, 0); 78 sys_lseek(fd, start_block * BLOCK_SIZE, 0);
75 sys_read(fd, buf, size); 79 sys_read(fd, buf, size);
76 80
77 /* 81 *decompressor = decompress_method(buf, size, &compress_name);
78 * If it matches the gzip magic numbers, return 0 82 if (compress_name) {
79 */ 83 printk(KERN_NOTICE "RAMDISK: %s image found at block %d\n",
80 if (buf[0] == 037 && ((buf[1] == 0213) || (buf[1] == 0236))) { 84 compress_name, start_block);
81 printk(KERN_NOTICE 85 if (!*decompressor)
82 "RAMDISK: Compressed image found at block %d\n", 86 printk(KERN_EMERG
83 start_block); 87 "RAMDISK: %s decompressor not configured!\n",
88 compress_name);
84 nblocks = 0; 89 nblocks = 0;
85 goto done; 90 goto done;
86 } 91 }
@@ -142,7 +147,7 @@ identify_ramdisk_image(int fd, int start_block)
142 printk(KERN_NOTICE 147 printk(KERN_NOTICE
143 "RAMDISK: Couldn't find valid RAM disk image starting at %d.\n", 148 "RAMDISK: Couldn't find valid RAM disk image starting at %d.\n",
144 start_block); 149 start_block);
145 150
146done: 151done:
147 sys_lseek(fd, start_block * BLOCK_SIZE, 0); 152 sys_lseek(fd, start_block * BLOCK_SIZE, 0);
148 kfree(buf); 153 kfree(buf);
@@ -157,6 +162,7 @@ int __init rd_load_image(char *from)
157 int nblocks, i, disk; 162 int nblocks, i, disk;
158 char *buf = NULL; 163 char *buf = NULL;
159 unsigned short rotate = 0; 164 unsigned short rotate = 0;
165 decompress_fn decompressor = NULL;
160#if !defined(CONFIG_S390) && !defined(CONFIG_PPC_ISERIES) 166#if !defined(CONFIG_S390) && !defined(CONFIG_PPC_ISERIES)
161 char rotator[4] = { '|' , '/' , '-' , '\\' }; 167 char rotator[4] = { '|' , '/' , '-' , '\\' };
162#endif 168#endif
@@ -169,12 +175,12 @@ int __init rd_load_image(char *from)
169 if (in_fd < 0) 175 if (in_fd < 0)
170 goto noclose_input; 176 goto noclose_input;
171 177
172 nblocks = identify_ramdisk_image(in_fd, rd_image_start); 178 nblocks = identify_ramdisk_image(in_fd, rd_image_start, &decompressor);
173 if (nblocks < 0) 179 if (nblocks < 0)
174 goto done; 180 goto done;
175 181
176 if (nblocks == 0) { 182 if (nblocks == 0) {
177 if (crd_load(in_fd, out_fd) == 0) 183 if (crd_load(in_fd, out_fd, decompressor) == 0)
178 goto successful_load; 184 goto successful_load;
179 goto done; 185 goto done;
180 } 186 }
@@ -200,7 +206,7 @@ int __init rd_load_image(char *from)
200 nblocks, rd_blocks); 206 nblocks, rd_blocks);
201 goto done; 207 goto done;
202 } 208 }
203 209
204 /* 210 /*
205 * OK, time to copy in the data 211 * OK, time to copy in the data
206 */ 212 */
@@ -273,138 +279,48 @@ int __init rd_load_disk(int n)
273 return rd_load_image("/dev/root"); 279 return rd_load_image("/dev/root");
274} 280}
275 281
276/*
277 * gzip declarations
278 */
279
280#define OF(args) args
281
282#ifndef memzero
283#define memzero(s, n) memset ((s), 0, (n))
284#endif
285
286typedef unsigned char uch;
287typedef unsigned short ush;
288typedef unsigned long ulg;
289
290#define INBUFSIZ 4096
291#define WSIZE 0x8000 /* window size--must be a power of two, and */
292 /* at least 32K for zip's deflate method */
293
294static uch *inbuf;
295static uch *window;
296
297static unsigned insize; /* valid bytes in inbuf */
298static unsigned inptr; /* index of next byte to be processed in inbuf */
299static unsigned outcnt; /* bytes in output buffer */
300static int exit_code; 282static int exit_code;
301static int unzip_error; 283static int decompress_error;
302static long bytes_out;
303static int crd_infd, crd_outfd; 284static int crd_infd, crd_outfd;
304 285
305#define get_byte() (inptr < insize ? inbuf[inptr++] : fill_inbuf()) 286static int __init compr_fill(void *buf, unsigned int len)
306
307/* Diagnostic functions (stubbed out) */
308#define Assert(cond,msg)
309#define Trace(x)
310#define Tracev(x)
311#define Tracevv(x)
312#define Tracec(c,x)
313#define Tracecv(c,x)
314
315#define STATIC static
316#define INIT __init
317
318static int __init fill_inbuf(void);
319static void __init flush_window(void);
320static void __init error(char *m);
321
322#define NO_INFLATE_MALLOC
323
324#include "../lib/inflate.c"
325
326/* ===========================================================================
327 * Fill the input buffer. This is called only when the buffer is empty
328 * and at least one byte is really needed.
329 * Returning -1 does not guarantee that gunzip() will ever return.
330 */
331static int __init fill_inbuf(void)
332{ 287{
333 if (exit_code) return -1; 288 int r = sys_read(crd_infd, buf, len);
334 289 if (r < 0)
335 insize = sys_read(crd_infd, inbuf, INBUFSIZ); 290 printk(KERN_ERR "RAMDISK: error while reading compressed data");
336 if (insize == 0) { 291 else if (r == 0)
337 error("RAMDISK: ran out of compressed data"); 292 printk(KERN_ERR "RAMDISK: EOF while reading compressed data");
338 return -1; 293 return r;
339 }
340
341 inptr = 1;
342
343 return inbuf[0];
344} 294}
345 295
346/* =========================================================================== 296static int __init compr_flush(void *window, unsigned int outcnt)
347 * Write the output window window[0..outcnt-1] and update crc and bytes_out.
348 * (Used for the decompressed data only.)
349 */
350static void __init flush_window(void)
351{ 297{
352 ulg c = crc; /* temporary variable */ 298 int written = sys_write(crd_outfd, window, outcnt);
353 unsigned n, written; 299 if (written != outcnt) {
354 uch *in, ch; 300 if (decompress_error == 0)
355 301 printk(KERN_ERR
356 written = sys_write(crd_outfd, window, outcnt); 302 "RAMDISK: incomplete write (%d != %d)\n",
357 if (written != outcnt && unzip_error == 0) { 303 written, outcnt);
358 printk(KERN_ERR "RAMDISK: incomplete write (%d != %d) %ld\n", 304 decompress_error = 1;
359 written, outcnt, bytes_out); 305 return -1;
360 unzip_error = 1; 306 }
361 } 307 return outcnt;
362 in = window;
363 for (n = 0; n < outcnt; n++) {
364 ch = *in++;
365 c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
366 }
367 crc = c;
368 bytes_out += (ulg)outcnt;
369 outcnt = 0;
370} 308}
371 309
372static void __init error(char *x) 310static void __init error(char *x)
373{ 311{
374 printk(KERN_ERR "%s\n", x); 312 printk(KERN_ERR "%s\n", x);
375 exit_code = 1; 313 exit_code = 1;
376 unzip_error = 1; 314 decompress_error = 1;
377} 315}
378 316
379static int __init crd_load(int in_fd, int out_fd) 317static int __init crd_load(int in_fd, int out_fd, decompress_fn deco)
380{ 318{
381 int result; 319 int result;
382
383 insize = 0; /* valid bytes in inbuf */
384 inptr = 0; /* index of next byte to be processed in inbuf */
385 outcnt = 0; /* bytes in output buffer */
386 exit_code = 0;
387 bytes_out = 0;
388 crc = (ulg)0xffffffffL; /* shift register contents */
389
390 crd_infd = in_fd; 320 crd_infd = in_fd;
391 crd_outfd = out_fd; 321 crd_outfd = out_fd;
392 inbuf = kmalloc(INBUFSIZ, GFP_KERNEL); 322 result = deco(NULL, 0, compr_fill, compr_flush, NULL, NULL, error);
393 if (!inbuf) { 323 if (decompress_error)
394 printk(KERN_ERR "RAMDISK: Couldn't allocate gzip buffer\n");
395 return -1;
396 }
397 window = kmalloc(WSIZE, GFP_KERNEL);
398 if (!window) {
399 printk(KERN_ERR "RAMDISK: Couldn't allocate gzip window\n");
400 kfree(inbuf);
401 return -1;
402 }
403 makecrc();
404 result = gunzip();
405 if (unzip_error)
406 result = 1; 324 result = 1;
407 kfree(inbuf);
408 kfree(window);
409 return result; 325 return result;
410} 326}
diff --git a/init/initramfs.c b/init/initramfs.c
index d9c941c0c3ca..7dcde7ea6603 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -390,11 +390,13 @@ static int __init write_buffer(char *buf, unsigned len)
390 return len - count; 390 return len - count;
391} 391}
392 392
393static void __init flush_buffer(char *buf, unsigned len) 393static int __init flush_buffer(void *bufv, unsigned len)
394{ 394{
395 char *buf = (char *) bufv;
395 int written; 396 int written;
397 int origLen = len;
396 if (message) 398 if (message)
397 return; 399 return -1;
398 while ((written = write_buffer(buf, len)) < len && !message) { 400 while ((written = write_buffer(buf, len)) < len && !message) {
399 char c = buf[written]; 401 char c = buf[written];
400 if (c == '0') { 402 if (c == '0') {
@@ -408,84 +410,28 @@ static void __init flush_buffer(char *buf, unsigned len)
408 } else 410 } else
409 error("junk in compressed archive"); 411 error("junk in compressed archive");
410 } 412 }
413 return origLen;
411} 414}
412 415
413/* 416static unsigned my_inptr; /* index of next byte to be processed in inbuf */
414 * gzip declarations
415 */
416 417
417#define OF(args) args 418#include <linux/decompress/generic.h>
418
419#ifndef memzero
420#define memzero(s, n) memset ((s), 0, (n))
421#endif
422
423typedef unsigned char uch;
424typedef unsigned short ush;
425typedef unsigned long ulg;
426
427#define WSIZE 0x8000 /* window size--must be a power of two, and */
428 /* at least 32K for zip's deflate method */
429
430static uch *inbuf;
431static uch *window;
432
433static unsigned insize; /* valid bytes in inbuf */
434static unsigned inptr; /* index of next byte to be processed in inbuf */
435static unsigned outcnt; /* bytes in output buffer */
436static long bytes_out;
437
438#define get_byte() (inptr < insize ? inbuf[inptr++] : -1)
439
440/* Diagnostic functions (stubbed out) */
441#define Assert(cond,msg)
442#define Trace(x)
443#define Tracev(x)
444#define Tracevv(x)
445#define Tracec(c,x)
446#define Tracecv(c,x)
447
448#define STATIC static
449#define INIT __init
450
451static void __init flush_window(void);
452static void __init error(char *m);
453
454#define NO_INFLATE_MALLOC
455
456#include "../lib/inflate.c"
457
458/* ===========================================================================
459 * Write the output window window[0..outcnt-1] and update crc and bytes_out.
460 * (Used for the decompressed data only.)
461 */
462static void __init flush_window(void)
463{
464 ulg c = crc; /* temporary variable */
465 unsigned n;
466 uch *in, ch;
467
468 flush_buffer(window, outcnt);
469 in = window;
470 for (n = 0; n < outcnt; n++) {
471 ch = *in++;
472 c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
473 }
474 crc = c;
475 bytes_out += (ulg)outcnt;
476 outcnt = 0;
477}
478 419
479static char * __init unpack_to_rootfs(char *buf, unsigned len, int check_only) 420static char * __init unpack_to_rootfs(char *buf, unsigned len, int check_only)
480{ 421{
481 int written; 422 int written;
423 decompress_fn decompress;
424 const char *compress_name;
425 static __initdata char msg_buf[64];
426
482 dry_run = check_only; 427 dry_run = check_only;
483 header_buf = kmalloc(110, GFP_KERNEL); 428 header_buf = kmalloc(110, GFP_KERNEL);
484 symlink_buf = kmalloc(PATH_MAX + N_ALIGN(PATH_MAX) + 1, GFP_KERNEL); 429 symlink_buf = kmalloc(PATH_MAX + N_ALIGN(PATH_MAX) + 1, GFP_KERNEL);
485 name_buf = kmalloc(N_ALIGN(PATH_MAX), GFP_KERNEL); 430 name_buf = kmalloc(N_ALIGN(PATH_MAX), GFP_KERNEL);
486 window = kmalloc(WSIZE, GFP_KERNEL); 431
487 if (!window || !header_buf || !symlink_buf || !name_buf) 432 if (!header_buf || !symlink_buf || !name_buf)
488 panic("can't allocate buffers"); 433 panic("can't allocate buffers");
434
489 state = Start; 435 state = Start;
490 this_header = 0; 436 this_header = 0;
491 message = NULL; 437 message = NULL;
@@ -505,22 +451,25 @@ static char * __init unpack_to_rootfs(char *buf, unsigned len, int check_only)
505 continue; 451 continue;
506 } 452 }
507 this_header = 0; 453 this_header = 0;
508 insize = len; 454 decompress = decompress_method(buf, len, &compress_name);
509 inbuf = buf; 455 if (decompress)
510 inptr = 0; 456 decompress(buf, len, NULL, flush_buffer, NULL,
511 outcnt = 0; /* bytes in output buffer */ 457 &my_inptr, error);
512 bytes_out = 0; 458 else if (compress_name) {
513 crc = (ulg)0xffffffffL; /* shift register contents */ 459 if (!message) {
514 makecrc(); 460 snprintf(msg_buf, sizeof msg_buf,
515 gunzip(); 461 "compression method %s not configured",
462 compress_name);
463 message = msg_buf;
464 }
465 }
516 if (state != Reset) 466 if (state != Reset)
517 error("junk in gzipped archive"); 467 error("junk in compressed archive");
518 this_header = saved_offset + inptr; 468 this_header = saved_offset + my_inptr;
519 buf += inptr; 469 buf += my_inptr;
520 len -= inptr; 470 len -= my_inptr;
521 } 471 }
522 dir_utime(); 472 dir_utime();
523 kfree(window);
524 kfree(name_buf); 473 kfree(name_buf);
525 kfree(symlink_buf); 474 kfree(symlink_buf);
526 kfree(header_buf); 475 kfree(header_buf);
@@ -579,7 +528,7 @@ static int __init populate_rootfs(void)
579 char *err = unpack_to_rootfs(__initramfs_start, 528 char *err = unpack_to_rootfs(__initramfs_start,
580 __initramfs_end - __initramfs_start, 0); 529 __initramfs_end - __initramfs_start, 0);
581 if (err) 530 if (err)
582 panic(err); 531 panic(err); /* Failed to decompress INTERNAL initramfs */
583 if (initrd_start) { 532 if (initrd_start) {
584#ifdef CONFIG_BLK_DEV_RAM 533#ifdef CONFIG_BLK_DEV_RAM
585 int fd; 534 int fd;
@@ -605,9 +554,12 @@ static int __init populate_rootfs(void)
605 printk(KERN_INFO "Unpacking initramfs..."); 554 printk(KERN_INFO "Unpacking initramfs...");
606 err = unpack_to_rootfs((char *)initrd_start, 555 err = unpack_to_rootfs((char *)initrd_start,
607 initrd_end - initrd_start, 0); 556 initrd_end - initrd_start, 0);
608 if (err) 557 if (err) {
609 panic(err); 558 printk(" failed!\n");
610 printk(" done\n"); 559 printk(KERN_EMERG "%s\n", err);
560 } else {
561 printk(" done\n");
562 }
611 free_initrd(); 563 free_initrd();
612#endif 564#endif
613 } 565 }
diff --git a/init/main.c b/init/main.c
index 83697e160b3a..20d784ab5ef8 100644
--- a/init/main.c
+++ b/init/main.c
@@ -14,6 +14,7 @@
14#include <linux/proc_fs.h> 14#include <linux/proc_fs.h>
15#include <linux/kernel.h> 15#include <linux/kernel.h>
16#include <linux/syscalls.h> 16#include <linux/syscalls.h>
17#include <linux/stackprotector.h>
17#include <linux/string.h> 18#include <linux/string.h>
18#include <linux/ctype.h> 19#include <linux/ctype.h>
19#include <linux/delay.h> 20#include <linux/delay.h>
@@ -70,6 +71,7 @@
70#include <asm/setup.h> 71#include <asm/setup.h>
71#include <asm/sections.h> 72#include <asm/sections.h>
72#include <asm/cacheflush.h> 73#include <asm/cacheflush.h>
74#include <trace/kmemtrace.h>
73 75
74#ifdef CONFIG_X86_LOCAL_APIC 76#ifdef CONFIG_X86_LOCAL_APIC
75#include <asm/smp.h> 77#include <asm/smp.h>
@@ -135,14 +137,14 @@ unsigned int __initdata setup_max_cpus = NR_CPUS;
135 * greater than 0, limits the maximum number of CPUs activated in 137 * greater than 0, limits the maximum number of CPUs activated in
136 * SMP mode to <NUM>. 138 * SMP mode to <NUM>.
137 */ 139 */
138#ifndef CONFIG_X86_IO_APIC 140
139static inline void disable_ioapic_setup(void) {}; 141void __weak arch_disable_smp_support(void) { }
140#endif
141 142
142static int __init nosmp(char *str) 143static int __init nosmp(char *str)
143{ 144{
144 setup_max_cpus = 0; 145 setup_max_cpus = 0;
145 disable_ioapic_setup(); 146 arch_disable_smp_support();
147
146 return 0; 148 return 0;
147} 149}
148 150
@@ -152,14 +154,14 @@ static int __init maxcpus(char *str)
152{ 154{
153 get_option(&str, &setup_max_cpus); 155 get_option(&str, &setup_max_cpus);
154 if (setup_max_cpus == 0) 156 if (setup_max_cpus == 0)
155 disable_ioapic_setup(); 157 arch_disable_smp_support();
156 158
157 return 0; 159 return 0;
158} 160}
159 161
160early_param("maxcpus", maxcpus); 162early_param("maxcpus", maxcpus);
161#else 163#else
162#define setup_max_cpus NR_CPUS 164const unsigned int setup_max_cpus = NR_CPUS;
163#endif 165#endif
164 166
165/* 167/*
@@ -540,6 +542,12 @@ asmlinkage void __init start_kernel(void)
540 */ 542 */
541 lockdep_init(); 543 lockdep_init();
542 debug_objects_early_init(); 544 debug_objects_early_init();
545
546 /*
547 * Set up the the initial canary ASAP:
548 */
549 boot_init_stack_canary();
550
543 cgroup_init_early(); 551 cgroup_init_early();
544 552
545 local_irq_disable(); 553 local_irq_disable();
@@ -642,6 +650,7 @@ asmlinkage void __init start_kernel(void)
642 enable_debug_pagealloc(); 650 enable_debug_pagealloc();
643 cpu_hotplug_init(); 651 cpu_hotplug_init();
644 kmem_cache_init(); 652 kmem_cache_init();
653 kmemtrace_init();
645 debug_objects_mem_init(); 654 debug_objects_mem_init();
646 idr_init_cache(); 655 idr_init_cache();
647 setup_per_cpu_pageset(); 656 setup_per_cpu_pageset();
diff --git a/kernel/exit.c b/kernel/exit.c
index efd30ccf3858..167e1e3ad7c6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -980,12 +980,9 @@ static void check_stack_usage(void)
980{ 980{
981 static DEFINE_SPINLOCK(low_water_lock); 981 static DEFINE_SPINLOCK(low_water_lock);
982 static int lowest_to_date = THREAD_SIZE; 982 static int lowest_to_date = THREAD_SIZE;
983 unsigned long *n = end_of_stack(current);
984 unsigned long free; 983 unsigned long free;
985 984
986 while (*n == 0) 985 free = stack_not_used(current);
987 n++;
988 free = (unsigned long)n - (unsigned long)end_of_stack(current);
989 986
990 if (free >= lowest_to_date) 987 if (free >= lowest_to_date)
991 return; 988 return;
diff --git a/kernel/extable.c b/kernel/extable.c
index e136ed8d82ba..0df6253730be 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -41,7 +41,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
41 return e; 41 return e;
42} 42}
43 43
44__notrace_funcgraph int core_kernel_text(unsigned long addr) 44int core_kernel_text(unsigned long addr)
45{ 45{
46 if (addr >= (unsigned long)_stext && 46 if (addr >= (unsigned long)_stext &&
47 addr <= (unsigned long)_etext) 47 addr <= (unsigned long)_etext)
@@ -54,7 +54,7 @@ __notrace_funcgraph int core_kernel_text(unsigned long addr)
54 return 0; 54 return 0;
55} 55}
56 56
57__notrace_funcgraph int __kernel_text_address(unsigned long addr) 57int __kernel_text_address(unsigned long addr)
58{ 58{
59 if (core_kernel_text(addr)) 59 if (core_kernel_text(addr))
60 return 1; 60 return 1;
diff --git a/kernel/fork.c b/kernel/fork.c
index 4854c2c4a82e..6715ebc3761d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -61,6 +61,7 @@
61#include <linux/proc_fs.h> 61#include <linux/proc_fs.h>
62#include <linux/blkdev.h> 62#include <linux/blkdev.h>
63#include <trace/sched.h> 63#include <trace/sched.h>
64#include <linux/magic.h>
64 65
65#include <asm/pgtable.h> 66#include <asm/pgtable.h>
66#include <asm/pgalloc.h> 67#include <asm/pgalloc.h>
@@ -212,6 +213,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
212{ 213{
213 struct task_struct *tsk; 214 struct task_struct *tsk;
214 struct thread_info *ti; 215 struct thread_info *ti;
216 unsigned long *stackend;
217
215 int err; 218 int err;
216 219
217 prepare_to_copy(orig); 220 prepare_to_copy(orig);
@@ -237,6 +240,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
237 goto out; 240 goto out;
238 241
239 setup_thread_stack(tsk, orig); 242 setup_thread_stack(tsk, orig);
243 stackend = end_of_stack(tsk);
244 *stackend = STACK_END_MAGIC; /* for overflow detection */
240 245
241#ifdef CONFIG_CC_STACKPROTECTOR 246#ifdef CONFIG_CC_STACKPROTECTOR
242 tsk->stack_canary = get_random_int(); 247 tsk->stack_canary = get_random_int();
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 7de11bd64dfe..122fef4b0bd3 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -46,7 +46,10 @@ void dynamic_irq_init(unsigned int irq)
46 desc->irq_count = 0; 46 desc->irq_count = 0;
47 desc->irqs_unhandled = 0; 47 desc->irqs_unhandled = 0;
48#ifdef CONFIG_SMP 48#ifdef CONFIG_SMP
49 cpumask_setall(&desc->affinity); 49 cpumask_setall(desc->affinity);
50#ifdef CONFIG_GENERIC_PENDING_IRQ
51 cpumask_clear(desc->pending_mask);
52#endif
50#endif 53#endif
51 spin_unlock_irqrestore(&desc->lock, flags); 54 spin_unlock_irqrestore(&desc->lock, flags);
52} 55}
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 3aba8d12f328..412370ab9a34 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -17,6 +17,8 @@
17#include <linux/kernel_stat.h> 17#include <linux/kernel_stat.h>
18#include <linux/rculist.h> 18#include <linux/rculist.h>
19#include <linux/hash.h> 19#include <linux/hash.h>
20#include <trace/irq.h>
21#include <linux/bootmem.h>
20 22
21#include "internals.h" 23#include "internals.h"
22 24
@@ -69,6 +71,7 @@ int nr_irqs = NR_IRQS;
69EXPORT_SYMBOL_GPL(nr_irqs); 71EXPORT_SYMBOL_GPL(nr_irqs);
70 72
71#ifdef CONFIG_SPARSE_IRQ 73#ifdef CONFIG_SPARSE_IRQ
74
72static struct irq_desc irq_desc_init = { 75static struct irq_desc irq_desc_init = {
73 .irq = -1, 76 .irq = -1,
74 .status = IRQ_DISABLED, 77 .status = IRQ_DISABLED,
@@ -76,9 +79,6 @@ static struct irq_desc irq_desc_init = {
76 .handle_irq = handle_bad_irq, 79 .handle_irq = handle_bad_irq,
77 .depth = 1, 80 .depth = 1,
78 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), 81 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
79#ifdef CONFIG_SMP
80 .affinity = CPU_MASK_ALL
81#endif
82}; 82};
83 83
84void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) 84void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
@@ -113,6 +113,10 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
113 printk(KERN_ERR "can not alloc kstat_irqs\n"); 113 printk(KERN_ERR "can not alloc kstat_irqs\n");
114 BUG_ON(1); 114 BUG_ON(1);
115 } 115 }
116 if (!init_alloc_desc_masks(desc, cpu, false)) {
117 printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
118 BUG_ON(1);
119 }
116 arch_init_chip_data(desc, cpu); 120 arch_init_chip_data(desc, cpu);
117} 121}
118 122
@@ -121,7 +125,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
121 */ 125 */
122DEFINE_SPINLOCK(sparse_irq_lock); 126DEFINE_SPINLOCK(sparse_irq_lock);
123 127
124struct irq_desc *irq_desc_ptrs[NR_IRQS] __read_mostly; 128struct irq_desc **irq_desc_ptrs __read_mostly;
125 129
126static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { 130static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
127 [0 ... NR_IRQS_LEGACY-1] = { 131 [0 ... NR_IRQS_LEGACY-1] = {
@@ -131,14 +135,10 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm
131 .handle_irq = handle_bad_irq, 135 .handle_irq = handle_bad_irq,
132 .depth = 1, 136 .depth = 1,
133 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), 137 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
134#ifdef CONFIG_SMP
135 .affinity = CPU_MASK_ALL
136#endif
137 } 138 }
138}; 139};
139 140
140/* FIXME: use bootmem alloc ...*/ 141static unsigned int *kstat_irqs_legacy;
141static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS];
142 142
143int __init early_irq_init(void) 143int __init early_irq_init(void)
144{ 144{
@@ -148,18 +148,30 @@ int __init early_irq_init(void)
148 148
149 init_irq_default_affinity(); 149 init_irq_default_affinity();
150 150
151 /* initialize nr_irqs based on nr_cpu_ids */
152 arch_probe_nr_irqs();
153 printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs);
154
151 desc = irq_desc_legacy; 155 desc = irq_desc_legacy;
152 legacy_count = ARRAY_SIZE(irq_desc_legacy); 156 legacy_count = ARRAY_SIZE(irq_desc_legacy);
153 157
158 /* allocate irq_desc_ptrs array based on nr_irqs */
159 irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *));
160
161 /* allocate based on nr_cpu_ids */
162 /* FIXME: invert kstat_irgs, and it'd be a per_cpu_alloc'd thing */
163 kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids *
164 sizeof(int));
165
154 for (i = 0; i < legacy_count; i++) { 166 for (i = 0; i < legacy_count; i++) {
155 desc[i].irq = i; 167 desc[i].irq = i;
156 desc[i].kstat_irqs = kstat_irqs_legacy[i]; 168 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
157 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 169 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
158 170 init_alloc_desc_masks(&desc[i], 0, true);
159 irq_desc_ptrs[i] = desc + i; 171 irq_desc_ptrs[i] = desc + i;
160 } 172 }
161 173
162 for (i = legacy_count; i < NR_IRQS; i++) 174 for (i = legacy_count; i < nr_irqs; i++)
163 irq_desc_ptrs[i] = NULL; 175 irq_desc_ptrs[i] = NULL;
164 176
165 return arch_early_irq_init(); 177 return arch_early_irq_init();
@@ -167,7 +179,10 @@ int __init early_irq_init(void)
167 179
168struct irq_desc *irq_to_desc(unsigned int irq) 180struct irq_desc *irq_to_desc(unsigned int irq)
169{ 181{
170 return (irq < NR_IRQS) ? irq_desc_ptrs[irq] : NULL; 182 if (irq_desc_ptrs && irq < nr_irqs)
183 return irq_desc_ptrs[irq];
184
185 return NULL;
171} 186}
172 187
173struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) 188struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
@@ -176,10 +191,9 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
176 unsigned long flags; 191 unsigned long flags;
177 int node; 192 int node;
178 193
179 if (irq >= NR_IRQS) { 194 if (irq >= nr_irqs) {
180 printk(KERN_WARNING "irq >= NR_IRQS in irq_to_desc_alloc: %d %d\n", 195 WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n",
181 irq, NR_IRQS); 196 irq, nr_irqs);
182 WARN_ON(1);
183 return NULL; 197 return NULL;
184 } 198 }
185 199
@@ -221,9 +235,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
221 .handle_irq = handle_bad_irq, 235 .handle_irq = handle_bad_irq,
222 .depth = 1, 236 .depth = 1,
223 .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), 237 .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock),
224#ifdef CONFIG_SMP
225 .affinity = CPU_MASK_ALL
226#endif
227 } 238 }
228}; 239};
229 240
@@ -235,12 +246,15 @@ int __init early_irq_init(void)
235 246
236 init_irq_default_affinity(); 247 init_irq_default_affinity();
237 248
249 printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
250
238 desc = irq_desc; 251 desc = irq_desc;
239 count = ARRAY_SIZE(irq_desc); 252 count = ARRAY_SIZE(irq_desc);
240 253
241 for (i = 0; i < count; i++) 254 for (i = 0; i < count; i++) {
242 desc[i].irq = i; 255 desc[i].irq = i;
243 256 init_alloc_desc_masks(&desc[i], 0, true);
257 }
244 return arch_early_irq_init(); 258 return arch_early_irq_init();
245} 259}
246 260
@@ -316,6 +330,9 @@ irqreturn_t no_action(int cpl, void *dev_id)
316 return IRQ_NONE; 330 return IRQ_NONE;
317} 331}
318 332
333DEFINE_TRACE(irq_handler_entry);
334DEFINE_TRACE(irq_handler_exit);
335
319/** 336/**
320 * handle_IRQ_event - irq action chain handler 337 * handle_IRQ_event - irq action chain handler
321 * @irq: the interrupt number 338 * @irq: the interrupt number
@@ -332,7 +349,9 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
332 local_irq_enable_in_hardirq(); 349 local_irq_enable_in_hardirq();
333 350
334 do { 351 do {
352 trace_irq_handler_entry(irq, action);
335 ret = action->handler(irq, action->dev_id); 353 ret = action->handler(irq, action->dev_id);
354 trace_irq_handler_exit(irq, action, ret);
336 if (ret == IRQ_HANDLED) 355 if (ret == IRQ_HANDLED)
337 status |= action->flags; 356 status |= action->flags;
338 retval |= ret; 357 retval |= ret;
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index e6d0a43cc125..40416a81a0f5 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -16,7 +16,14 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
16extern struct lock_class_key irq_desc_lock_class; 16extern struct lock_class_key irq_desc_lock_class;
17extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr); 17extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr);
18extern spinlock_t sparse_irq_lock; 18extern spinlock_t sparse_irq_lock;
19
20#ifdef CONFIG_SPARSE_IRQ
21/* irq_desc_ptrs allocated at boot time */
22extern struct irq_desc **irq_desc_ptrs;
23#else
24/* irq_desc_ptrs is a fixed size array */
19extern struct irq_desc *irq_desc_ptrs[NR_IRQS]; 25extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
26#endif
20 27
21#ifdef CONFIG_PROC_FS 28#ifdef CONFIG_PROC_FS
22extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); 29extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 291f03664552..a3a5dc9ef346 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -90,14 +90,14 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
90 90
91#ifdef CONFIG_GENERIC_PENDING_IRQ 91#ifdef CONFIG_GENERIC_PENDING_IRQ
92 if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) { 92 if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
93 cpumask_copy(&desc->affinity, cpumask); 93 cpumask_copy(desc->affinity, cpumask);
94 desc->chip->set_affinity(irq, cpumask); 94 desc->chip->set_affinity(irq, cpumask);
95 } else { 95 } else {
96 desc->status |= IRQ_MOVE_PENDING; 96 desc->status |= IRQ_MOVE_PENDING;
97 cpumask_copy(&desc->pending_mask, cpumask); 97 cpumask_copy(desc->pending_mask, cpumask);
98 } 98 }
99#else 99#else
100 cpumask_copy(&desc->affinity, cpumask); 100 cpumask_copy(desc->affinity, cpumask);
101 desc->chip->set_affinity(irq, cpumask); 101 desc->chip->set_affinity(irq, cpumask);
102#endif 102#endif
103 desc->status |= IRQ_AFFINITY_SET; 103 desc->status |= IRQ_AFFINITY_SET;
@@ -119,16 +119,16 @@ int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc)
119 * one of the targets is online. 119 * one of the targets is online.
120 */ 120 */
121 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { 121 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
122 if (cpumask_any_and(&desc->affinity, cpu_online_mask) 122 if (cpumask_any_and(desc->affinity, cpu_online_mask)
123 < nr_cpu_ids) 123 < nr_cpu_ids)
124 goto set_affinity; 124 goto set_affinity;
125 else 125 else
126 desc->status &= ~IRQ_AFFINITY_SET; 126 desc->status &= ~IRQ_AFFINITY_SET;
127 } 127 }
128 128
129 cpumask_and(&desc->affinity, cpu_online_mask, irq_default_affinity); 129 cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity);
130set_affinity: 130set_affinity:
131 desc->chip->set_affinity(irq, &desc->affinity); 131 desc->chip->set_affinity(irq, desc->affinity);
132 132
133 return 0; 133 return 0;
134} 134}
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index bd72329e630c..e05ad9be43b7 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -18,7 +18,7 @@ void move_masked_irq(int irq)
18 18
19 desc->status &= ~IRQ_MOVE_PENDING; 19 desc->status &= ~IRQ_MOVE_PENDING;
20 20
21 if (unlikely(cpumask_empty(&desc->pending_mask))) 21 if (unlikely(cpumask_empty(desc->pending_mask)))
22 return; 22 return;
23 23
24 if (!desc->chip->set_affinity) 24 if (!desc->chip->set_affinity)
@@ -38,13 +38,13 @@ void move_masked_irq(int irq)
38 * For correct operation this depends on the caller 38 * For correct operation this depends on the caller
39 * masking the irqs. 39 * masking the irqs.
40 */ 40 */
41 if (likely(cpumask_any_and(&desc->pending_mask, cpu_online_mask) 41 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
42 < nr_cpu_ids)) { 42 < nr_cpu_ids)) {
43 cpumask_and(&desc->affinity, 43 cpumask_and(desc->affinity,
44 &desc->pending_mask, cpu_online_mask); 44 desc->pending_mask, cpu_online_mask);
45 desc->chip->set_affinity(irq, &desc->affinity); 45 desc->chip->set_affinity(irq, desc->affinity);
46 } 46 }
47 cpumask_clear(&desc->pending_mask); 47 cpumask_clear(desc->pending_mask);
48} 48}
49 49
50void move_native_irq(int irq) 50void move_native_irq(int irq)
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index acd88356ac76..7f9b80434e32 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -38,15 +38,22 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
38 old_desc->kstat_irqs = NULL; 38 old_desc->kstat_irqs = NULL;
39} 39}
40 40
41static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, 41static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
42 struct irq_desc *desc, int cpu) 42 struct irq_desc *desc, int cpu)
43{ 43{
44 memcpy(desc, old_desc, sizeof(struct irq_desc)); 44 memcpy(desc, old_desc, sizeof(struct irq_desc));
45 if (!init_alloc_desc_masks(desc, cpu, false)) {
46 printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
47 "for migration.\n", irq);
48 return false;
49 }
45 spin_lock_init(&desc->lock); 50 spin_lock_init(&desc->lock);
46 desc->cpu = cpu; 51 desc->cpu = cpu;
47 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 52 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
48 init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids); 53 init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
54 init_copy_desc_masks(old_desc, desc);
49 arch_init_copy_chip_data(old_desc, desc, cpu); 55 arch_init_copy_chip_data(old_desc, desc, cpu);
56 return true;
50} 57}
51 58
52static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc) 59static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
@@ -76,12 +83,18 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
76 node = cpu_to_node(cpu); 83 node = cpu_to_node(cpu);
77 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); 84 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
78 if (!desc) { 85 if (!desc) {
79 printk(KERN_ERR "irq %d: can not get new irq_desc for migration.\n", irq); 86 printk(KERN_ERR "irq %d: can not get new irq_desc "
87 "for migration.\n", irq);
88 /* still use old one */
89 desc = old_desc;
90 goto out_unlock;
91 }
92 if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) {
80 /* still use old one */ 93 /* still use old one */
94 kfree(desc);
81 desc = old_desc; 95 desc = old_desc;
82 goto out_unlock; 96 goto out_unlock;
83 } 97 }
84 init_copy_one_irq_desc(irq, old_desc, desc, cpu);
85 98
86 irq_desc_ptrs[irq] = desc; 99 irq_desc_ptrs[irq] = desc;
87 spin_unlock_irqrestore(&sparse_irq_lock, flags); 100 spin_unlock_irqrestore(&sparse_irq_lock, flags);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index aae3f742bcec..692363dd591f 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -20,11 +20,11 @@ static struct proc_dir_entry *root_irq_dir;
20static int irq_affinity_proc_show(struct seq_file *m, void *v) 20static int irq_affinity_proc_show(struct seq_file *m, void *v)
21{ 21{
22 struct irq_desc *desc = irq_to_desc((long)m->private); 22 struct irq_desc *desc = irq_to_desc((long)m->private);
23 const struct cpumask *mask = &desc->affinity; 23 const struct cpumask *mask = desc->affinity;
24 24
25#ifdef CONFIG_GENERIC_PENDING_IRQ 25#ifdef CONFIG_GENERIC_PENDING_IRQ
26 if (desc->status & IRQ_MOVE_PENDING) 26 if (desc->status & IRQ_MOVE_PENDING)
27 mask = &desc->pending_mask; 27 mask = desc->pending_mask;
28#endif 28#endif
29 seq_cpumask(m, mask); 29 seq_cpumask(m, mask);
30 seq_putc(m, '\n'); 30 seq_putc(m, '\n');
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 483899578259..c7fd6692939d 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1130,7 +1130,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
1130 return; 1130 return;
1131 memset(&prstatus, 0, sizeof(prstatus)); 1131 memset(&prstatus, 0, sizeof(prstatus));
1132 prstatus.pr_pid = current->pid; 1132 prstatus.pr_pid = current->pid;
1133 elf_core_copy_regs(&prstatus.pr_reg, regs); 1133 elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
1134 buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, 1134 buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
1135 &prstatus, sizeof(prstatus)); 1135 &prstatus, sizeof(prstatus));
1136 final_note(buf); 1136 final_note(buf);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 7ba8cd9845cb..5016bfb682b9 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -43,6 +43,7 @@
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/debugfs.h> 44#include <linux/debugfs.h>
45#include <linux/kdebug.h> 45#include <linux/kdebug.h>
46#include <linux/memory.h>
46 47
47#include <asm-generic/sections.h> 48#include <asm-generic/sections.h>
48#include <asm/cacheflush.h> 49#include <asm/cacheflush.h>
@@ -699,9 +700,10 @@ int __kprobes register_kprobe(struct kprobe *p)
699 goto out; 700 goto out;
700 } 701 }
701 702
703 mutex_lock(&text_mutex);
702 ret = arch_prepare_kprobe(p); 704 ret = arch_prepare_kprobe(p);
703 if (ret) 705 if (ret)
704 goto out; 706 goto out_unlock_text;
705 707
706 INIT_HLIST_NODE(&p->hlist); 708 INIT_HLIST_NODE(&p->hlist);
707 hlist_add_head_rcu(&p->hlist, 709 hlist_add_head_rcu(&p->hlist,
@@ -710,6 +712,8 @@ int __kprobes register_kprobe(struct kprobe *p)
710 if (kprobe_enabled) 712 if (kprobe_enabled)
711 arch_arm_kprobe(p); 713 arch_arm_kprobe(p);
712 714
715out_unlock_text:
716 mutex_unlock(&text_mutex);
713out: 717out:
714 mutex_unlock(&kprobe_mutex); 718 mutex_unlock(&kprobe_mutex);
715 719
@@ -746,8 +750,11 @@ valid_p:
746 * enabled and not gone - otherwise, the breakpoint would 750 * enabled and not gone - otherwise, the breakpoint would
747 * already have been removed. We save on flushing icache. 751 * already have been removed. We save on flushing icache.
748 */ 752 */
749 if (kprobe_enabled && !kprobe_gone(old_p)) 753 if (kprobe_enabled && !kprobe_gone(old_p)) {
754 mutex_lock(&text_mutex);
750 arch_disarm_kprobe(p); 755 arch_disarm_kprobe(p);
756 mutex_unlock(&text_mutex);
757 }
751 hlist_del_rcu(&old_p->hlist); 758 hlist_del_rcu(&old_p->hlist);
752 } else { 759 } else {
753 if (p->break_handler && !kprobe_gone(p)) 760 if (p->break_handler && !kprobe_gone(p))
@@ -912,10 +919,8 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
912 ri->rp = rp; 919 ri->rp = rp;
913 ri->task = current; 920 ri->task = current;
914 921
915 if (rp->entry_handler && rp->entry_handler(ri, regs)) { 922 if (rp->entry_handler && rp->entry_handler(ri, regs))
916 spin_unlock_irqrestore(&rp->lock, flags);
917 return 0; 923 return 0;
918 }
919 924
920 arch_prepare_kretprobe(ri, regs); 925 arch_prepare_kretprobe(ri, regs);
921 926
@@ -1280,12 +1285,14 @@ static void __kprobes enable_all_kprobes(void)
1280 if (kprobe_enabled) 1285 if (kprobe_enabled)
1281 goto already_enabled; 1286 goto already_enabled;
1282 1287
1288 mutex_lock(&text_mutex);
1283 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1289 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1284 head = &kprobe_table[i]; 1290 head = &kprobe_table[i];
1285 hlist_for_each_entry_rcu(p, node, head, hlist) 1291 hlist_for_each_entry_rcu(p, node, head, hlist)
1286 if (!kprobe_gone(p)) 1292 if (!kprobe_gone(p))
1287 arch_arm_kprobe(p); 1293 arch_arm_kprobe(p);
1288 } 1294 }
1295 mutex_unlock(&text_mutex);
1289 1296
1290 kprobe_enabled = true; 1297 kprobe_enabled = true;
1291 printk(KERN_INFO "Kprobes globally enabled\n"); 1298 printk(KERN_INFO "Kprobes globally enabled\n");
@@ -1310,6 +1317,7 @@ static void __kprobes disable_all_kprobes(void)
1310 1317
1311 kprobe_enabled = false; 1318 kprobe_enabled = false;
1312 printk(KERN_INFO "Kprobes globally disabled\n"); 1319 printk(KERN_INFO "Kprobes globally disabled\n");
1320 mutex_lock(&text_mutex);
1313 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1321 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1314 head = &kprobe_table[i]; 1322 head = &kprobe_table[i];
1315 hlist_for_each_entry_rcu(p, node, head, hlist) { 1323 hlist_for_each_entry_rcu(p, node, head, hlist) {
@@ -1318,6 +1326,7 @@ static void __kprobes disable_all_kprobes(void)
1318 } 1326 }
1319 } 1327 }
1320 1328
1329 mutex_unlock(&text_mutex);
1321 mutex_unlock(&kprobe_mutex); 1330 mutex_unlock(&kprobe_mutex);
1322 /* Allow all currently running kprobes to complete */ 1331 /* Allow all currently running kprobes to complete */
1323 synchronize_sched(); 1332 synchronize_sched();
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 06b0c3568f0b..71b567f52813 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -41,6 +41,8 @@
41#include <linux/utsname.h> 41#include <linux/utsname.h>
42#include <linux/hash.h> 42#include <linux/hash.h>
43#include <linux/ftrace.h> 43#include <linux/ftrace.h>
44#include <linux/stringify.h>
45#include <trace/lockdep.h>
44 46
45#include <asm/sections.h> 47#include <asm/sections.h>
46 48
@@ -310,12 +312,14 @@ EXPORT_SYMBOL(lockdep_on);
310#if VERBOSE 312#if VERBOSE
311# define HARDIRQ_VERBOSE 1 313# define HARDIRQ_VERBOSE 1
312# define SOFTIRQ_VERBOSE 1 314# define SOFTIRQ_VERBOSE 1
315# define RECLAIM_VERBOSE 1
313#else 316#else
314# define HARDIRQ_VERBOSE 0 317# define HARDIRQ_VERBOSE 0
315# define SOFTIRQ_VERBOSE 0 318# define SOFTIRQ_VERBOSE 0
319# define RECLAIM_VERBOSE 0
316#endif 320#endif
317 321
318#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE 322#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE
319/* 323/*
320 * Quick filtering for interesting events: 324 * Quick filtering for interesting events:
321 */ 325 */
@@ -430,30 +434,24 @@ atomic_t nr_find_usage_forwards_checks;
430atomic_t nr_find_usage_forwards_recursions; 434atomic_t nr_find_usage_forwards_recursions;
431atomic_t nr_find_usage_backwards_checks; 435atomic_t nr_find_usage_backwards_checks;
432atomic_t nr_find_usage_backwards_recursions; 436atomic_t nr_find_usage_backwards_recursions;
433# define debug_atomic_inc(ptr) atomic_inc(ptr)
434# define debug_atomic_dec(ptr) atomic_dec(ptr)
435# define debug_atomic_read(ptr) atomic_read(ptr)
436#else
437# define debug_atomic_inc(ptr) do { } while (0)
438# define debug_atomic_dec(ptr) do { } while (0)
439# define debug_atomic_read(ptr) 0
440#endif 437#endif
441 438
442/* 439/*
443 * Locking printouts: 440 * Locking printouts:
444 */ 441 */
445 442
443#define __USAGE(__STATE) \
444 [LOCK_USED_IN_##__STATE] = "IN-"__stringify(__STATE)"-W", \
445 [LOCK_ENABLED_##__STATE] = __stringify(__STATE)"-ON-W", \
446 [LOCK_USED_IN_##__STATE##_READ] = "IN-"__stringify(__STATE)"-R",\
447 [LOCK_ENABLED_##__STATE##_READ] = __stringify(__STATE)"-ON-R",
448
446static const char *usage_str[] = 449static const char *usage_str[] =
447{ 450{
448 [LOCK_USED] = "initial-use ", 451#define LOCKDEP_STATE(__STATE) __USAGE(__STATE)
449 [LOCK_USED_IN_HARDIRQ] = "in-hardirq-W", 452#include "lockdep_states.h"
450 [LOCK_USED_IN_SOFTIRQ] = "in-softirq-W", 453#undef LOCKDEP_STATE
451 [LOCK_ENABLED_SOFTIRQS] = "softirq-on-W", 454 [LOCK_USED] = "INITIAL USE",
452 [LOCK_ENABLED_HARDIRQS] = "hardirq-on-W",
453 [LOCK_USED_IN_HARDIRQ_READ] = "in-hardirq-R",
454 [LOCK_USED_IN_SOFTIRQ_READ] = "in-softirq-R",
455 [LOCK_ENABLED_SOFTIRQS_READ] = "softirq-on-R",
456 [LOCK_ENABLED_HARDIRQS_READ] = "hardirq-on-R",
457}; 455};
458 456
459const char * __get_key_name(struct lockdep_subclass_key *key, char *str) 457const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
@@ -461,46 +459,45 @@ const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
461 return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str); 459 return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str);
462} 460}
463 461
464void 462static inline unsigned long lock_flag(enum lock_usage_bit bit)
465get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4)
466{ 463{
467 *c1 = '.', *c2 = '.', *c3 = '.', *c4 = '.'; 464 return 1UL << bit;
468 465}
469 if (class->usage_mask & LOCKF_USED_IN_HARDIRQ)
470 *c1 = '+';
471 else
472 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS)
473 *c1 = '-';
474 466
475 if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ) 467static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit)
476 *c2 = '+'; 468{
477 else 469 char c = '.';
478 if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS)
479 *c2 = '-';
480 470
481 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) 471 if (class->usage_mask & lock_flag(bit + 2))
482 *c3 = '-'; 472 c = '+';
483 if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) { 473 if (class->usage_mask & lock_flag(bit)) {
484 *c3 = '+'; 474 c = '-';
485 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) 475 if (class->usage_mask & lock_flag(bit + 2))
486 *c3 = '?'; 476 c = '?';
487 } 477 }
488 478
489 if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) 479 return c;
490 *c4 = '-'; 480}
491 if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) { 481
492 *c4 = '+'; 482void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
493 if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) 483{
494 *c4 = '?'; 484 int i = 0;
495 } 485
486#define LOCKDEP_STATE(__STATE) \
487 usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE); \
488 usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE##_READ);
489#include "lockdep_states.h"
490#undef LOCKDEP_STATE
491
492 usage[i] = '\0';
496} 493}
497 494
498static void print_lock_name(struct lock_class *class) 495static void print_lock_name(struct lock_class *class)
499{ 496{
500 char str[KSYM_NAME_LEN], c1, c2, c3, c4; 497 char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS];
501 const char *name; 498 const char *name;
502 499
503 get_usage_chars(class, &c1, &c2, &c3, &c4); 500 get_usage_chars(class, usage);
504 501
505 name = class->name; 502 name = class->name;
506 if (!name) { 503 if (!name) {
@@ -513,7 +510,7 @@ static void print_lock_name(struct lock_class *class)
513 if (class->subclass) 510 if (class->subclass)
514 printk("/%d", class->subclass); 511 printk("/%d", class->subclass);
515 } 512 }
516 printk("){%c%c%c%c}", c1, c2, c3, c4); 513 printk("){%s}", usage);
517} 514}
518 515
519static void print_lockdep_cache(struct lockdep_map *lock) 516static void print_lockdep_cache(struct lockdep_map *lock)
@@ -1263,9 +1260,49 @@ check_usage(struct task_struct *curr, struct held_lock *prev,
1263 bit_backwards, bit_forwards, irqclass); 1260 bit_backwards, bit_forwards, irqclass);
1264} 1261}
1265 1262
1266static int 1263static const char *state_names[] = {
1267check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, 1264#define LOCKDEP_STATE(__STATE) \
1268 struct held_lock *next) 1265 __stringify(__STATE),
1266#include "lockdep_states.h"
1267#undef LOCKDEP_STATE
1268};
1269
1270static const char *state_rnames[] = {
1271#define LOCKDEP_STATE(__STATE) \
1272 __stringify(__STATE)"-READ",
1273#include "lockdep_states.h"
1274#undef LOCKDEP_STATE
1275};
1276
1277static inline const char *state_name(enum lock_usage_bit bit)
1278{
1279 return (bit & 1) ? state_rnames[bit >> 2] : state_names[bit >> 2];
1280}
1281
1282static int exclusive_bit(int new_bit)
1283{
1284 /*
1285 * USED_IN
1286 * USED_IN_READ
1287 * ENABLED
1288 * ENABLED_READ
1289 *
1290 * bit 0 - write/read
1291 * bit 1 - used_in/enabled
1292 * bit 2+ state
1293 */
1294
1295 int state = new_bit & ~3;
1296 int dir = new_bit & 2;
1297
1298 /*
1299 * keep state, bit flip the direction and strip read.
1300 */
1301 return state | (dir ^ 2);
1302}
1303
1304static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
1305 struct held_lock *next, enum lock_usage_bit bit)
1269{ 1306{
1270 /* 1307 /*
1271 * Prove that the new dependency does not connect a hardirq-safe 1308 * Prove that the new dependency does not connect a hardirq-safe
@@ -1273,38 +1310,34 @@ check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
1273 * the backwards-subgraph starting at <prev>, and the 1310 * the backwards-subgraph starting at <prev>, and the
1274 * forwards-subgraph starting at <next>: 1311 * forwards-subgraph starting at <next>:
1275 */ 1312 */
1276 if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ, 1313 if (!check_usage(curr, prev, next, bit,
1277 LOCK_ENABLED_HARDIRQS, "hard")) 1314 exclusive_bit(bit), state_name(bit)))
1278 return 0; 1315 return 0;
1279 1316
1317 bit++; /* _READ */
1318
1280 /* 1319 /*
1281 * Prove that the new dependency does not connect a hardirq-safe-read 1320 * Prove that the new dependency does not connect a hardirq-safe-read
1282 * lock with a hardirq-unsafe lock - to achieve this we search 1321 * lock with a hardirq-unsafe lock - to achieve this we search
1283 * the backwards-subgraph starting at <prev>, and the 1322 * the backwards-subgraph starting at <prev>, and the
1284 * forwards-subgraph starting at <next>: 1323 * forwards-subgraph starting at <next>:
1285 */ 1324 */
1286 if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ, 1325 if (!check_usage(curr, prev, next, bit,
1287 LOCK_ENABLED_HARDIRQS, "hard-read")) 1326 exclusive_bit(bit), state_name(bit)))
1288 return 0; 1327 return 0;
1289 1328
1290 /* 1329 return 1;
1291 * Prove that the new dependency does not connect a softirq-safe 1330}
1292 * lock with a softirq-unsafe lock - to achieve this we search 1331
1293 * the backwards-subgraph starting at <prev>, and the 1332static int
1294 * forwards-subgraph starting at <next>: 1333check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
1295 */ 1334 struct held_lock *next)
1296 if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ, 1335{
1297 LOCK_ENABLED_SOFTIRQS, "soft")) 1336#define LOCKDEP_STATE(__STATE) \
1298 return 0; 1337 if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE)) \
1299 /*
1300 * Prove that the new dependency does not connect a softirq-safe-read
1301 * lock with a softirq-unsafe lock - to achieve this we search
1302 * the backwards-subgraph starting at <prev>, and the
1303 * forwards-subgraph starting at <next>:
1304 */
1305 if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ,
1306 LOCK_ENABLED_SOFTIRQS, "soft"))
1307 return 0; 1338 return 0;
1339#include "lockdep_states.h"
1340#undef LOCKDEP_STATE
1308 1341
1309 return 1; 1342 return 1;
1310} 1343}
@@ -1861,9 +1894,9 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
1861 curr->comm, task_pid_nr(curr)); 1894 curr->comm, task_pid_nr(curr));
1862 print_lock(this); 1895 print_lock(this);
1863 if (forwards) 1896 if (forwards)
1864 printk("but this lock took another, %s-irq-unsafe lock in the past:\n", irqclass); 1897 printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
1865 else 1898 else
1866 printk("but this lock was taken by another, %s-irq-safe lock in the past:\n", irqclass); 1899 printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
1867 print_lock_name(other); 1900 print_lock_name(other);
1868 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); 1901 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
1869 1902
@@ -1933,7 +1966,7 @@ void print_irqtrace_events(struct task_struct *curr)
1933 print_ip_sym(curr->softirq_disable_ip); 1966 print_ip_sym(curr->softirq_disable_ip);
1934} 1967}
1935 1968
1936static int hardirq_verbose(struct lock_class *class) 1969static int HARDIRQ_verbose(struct lock_class *class)
1937{ 1970{
1938#if HARDIRQ_VERBOSE 1971#if HARDIRQ_VERBOSE
1939 return class_filter(class); 1972 return class_filter(class);
@@ -1941,7 +1974,7 @@ static int hardirq_verbose(struct lock_class *class)
1941 return 0; 1974 return 0;
1942} 1975}
1943 1976
1944static int softirq_verbose(struct lock_class *class) 1977static int SOFTIRQ_verbose(struct lock_class *class)
1945{ 1978{
1946#if SOFTIRQ_VERBOSE 1979#if SOFTIRQ_VERBOSE
1947 return class_filter(class); 1980 return class_filter(class);
@@ -1949,185 +1982,95 @@ static int softirq_verbose(struct lock_class *class)
1949 return 0; 1982 return 0;
1950} 1983}
1951 1984
1985static int RECLAIM_FS_verbose(struct lock_class *class)
1986{
1987#if RECLAIM_VERBOSE
1988 return class_filter(class);
1989#endif
1990 return 0;
1991}
1992
1952#define STRICT_READ_CHECKS 1 1993#define STRICT_READ_CHECKS 1
1953 1994
1954static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, 1995static int (*state_verbose_f[])(struct lock_class *class) = {
1996#define LOCKDEP_STATE(__STATE) \
1997 __STATE##_verbose,
1998#include "lockdep_states.h"
1999#undef LOCKDEP_STATE
2000};
2001
2002static inline int state_verbose(enum lock_usage_bit bit,
2003 struct lock_class *class)
2004{
2005 return state_verbose_f[bit >> 2](class);
2006}
2007
2008typedef int (*check_usage_f)(struct task_struct *, struct held_lock *,
2009 enum lock_usage_bit bit, const char *name);
2010
2011static int
2012mark_lock_irq(struct task_struct *curr, struct held_lock *this,
1955 enum lock_usage_bit new_bit) 2013 enum lock_usage_bit new_bit)
1956{ 2014{
1957 int ret = 1; 2015 int excl_bit = exclusive_bit(new_bit);
2016 int read = new_bit & 1;
2017 int dir = new_bit & 2;
1958 2018
1959 switch(new_bit) { 2019 /*
1960 case LOCK_USED_IN_HARDIRQ: 2020 * mark USED_IN has to look forwards -- to ensure no dependency
1961 if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS)) 2021 * has ENABLED state, which would allow recursion deadlocks.
1962 return 0; 2022 *
1963 if (!valid_state(curr, this, new_bit, 2023 * mark ENABLED has to look backwards -- to ensure no dependee
1964 LOCK_ENABLED_HARDIRQS_READ)) 2024 * has USED_IN state, which, again, would allow recursion deadlocks.
1965 return 0; 2025 */
1966 /* 2026 check_usage_f usage = dir ?
1967 * just marked it hardirq-safe, check that this lock 2027 check_usage_backwards : check_usage_forwards;
1968 * took no hardirq-unsafe lock in the past: 2028
1969 */ 2029 /*
1970 if (!check_usage_forwards(curr, this, 2030 * Validate that this particular lock does not have conflicting
1971 LOCK_ENABLED_HARDIRQS, "hard")) 2031 * usage states.
1972 return 0; 2032 */
1973#if STRICT_READ_CHECKS 2033 if (!valid_state(curr, this, new_bit, excl_bit))
1974 /* 2034 return 0;
1975 * just marked it hardirq-safe, check that this lock 2035
1976 * took no hardirq-unsafe-read lock in the past: 2036 /*
1977 */ 2037 * Validate that the lock dependencies don't have conflicting usage
1978 if (!check_usage_forwards(curr, this, 2038 * states.
1979 LOCK_ENABLED_HARDIRQS_READ, "hard-read")) 2039 */
1980 return 0; 2040 if ((!read || !dir || STRICT_READ_CHECKS) &&
1981#endif 2041 !usage(curr, this, excl_bit, state_name(new_bit & ~1)))
1982 if (hardirq_verbose(hlock_class(this))) 2042 return 0;
1983 ret = 2; 2043
1984 break; 2044 /*
1985 case LOCK_USED_IN_SOFTIRQ: 2045 * Check for read in write conflicts
1986 if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS)) 2046 */
1987 return 0; 2047 if (!read) {
1988 if (!valid_state(curr, this, new_bit, 2048 if (!valid_state(curr, this, new_bit, excl_bit + 1))
1989 LOCK_ENABLED_SOFTIRQS_READ))
1990 return 0;
1991 /*
1992 * just marked it softirq-safe, check that this lock
1993 * took no softirq-unsafe lock in the past:
1994 */
1995 if (!check_usage_forwards(curr, this,
1996 LOCK_ENABLED_SOFTIRQS, "soft"))
1997 return 0;
1998#if STRICT_READ_CHECKS
1999 /*
2000 * just marked it softirq-safe, check that this lock
2001 * took no softirq-unsafe-read lock in the past:
2002 */
2003 if (!check_usage_forwards(curr, this,
2004 LOCK_ENABLED_SOFTIRQS_READ, "soft-read"))
2005 return 0;
2006#endif
2007 if (softirq_verbose(hlock_class(this)))
2008 ret = 2;
2009 break;
2010 case LOCK_USED_IN_HARDIRQ_READ:
2011 if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS))
2012 return 0;
2013 /*
2014 * just marked it hardirq-read-safe, check that this lock
2015 * took no hardirq-unsafe lock in the past:
2016 */
2017 if (!check_usage_forwards(curr, this,
2018 LOCK_ENABLED_HARDIRQS, "hard"))
2019 return 0;
2020 if (hardirq_verbose(hlock_class(this)))
2021 ret = 2;
2022 break;
2023 case LOCK_USED_IN_SOFTIRQ_READ:
2024 if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS))
2025 return 0;
2026 /*
2027 * just marked it softirq-read-safe, check that this lock
2028 * took no softirq-unsafe lock in the past:
2029 */
2030 if (!check_usage_forwards(curr, this,
2031 LOCK_ENABLED_SOFTIRQS, "soft"))
2032 return 0;
2033 if (softirq_verbose(hlock_class(this)))
2034 ret = 2;
2035 break;
2036 case LOCK_ENABLED_HARDIRQS:
2037 if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ))
2038 return 0;
2039 if (!valid_state(curr, this, new_bit,
2040 LOCK_USED_IN_HARDIRQ_READ))
2041 return 0;
2042 /*
2043 * just marked it hardirq-unsafe, check that no hardirq-safe
2044 * lock in the system ever took it in the past:
2045 */
2046 if (!check_usage_backwards(curr, this,
2047 LOCK_USED_IN_HARDIRQ, "hard"))
2048 return 0;
2049#if STRICT_READ_CHECKS
2050 /*
2051 * just marked it hardirq-unsafe, check that no
2052 * hardirq-safe-read lock in the system ever took
2053 * it in the past:
2054 */
2055 if (!check_usage_backwards(curr, this,
2056 LOCK_USED_IN_HARDIRQ_READ, "hard-read"))
2057 return 0;
2058#endif
2059 if (hardirq_verbose(hlock_class(this)))
2060 ret = 2;
2061 break;
2062 case LOCK_ENABLED_SOFTIRQS:
2063 if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ))
2064 return 0;
2065 if (!valid_state(curr, this, new_bit,
2066 LOCK_USED_IN_SOFTIRQ_READ))
2067 return 0;
2068 /*
2069 * just marked it softirq-unsafe, check that no softirq-safe
2070 * lock in the system ever took it in the past:
2071 */
2072 if (!check_usage_backwards(curr, this,
2073 LOCK_USED_IN_SOFTIRQ, "soft"))
2074 return 0;
2075#if STRICT_READ_CHECKS
2076 /*
2077 * just marked it softirq-unsafe, check that no
2078 * softirq-safe-read lock in the system ever took
2079 * it in the past:
2080 */
2081 if (!check_usage_backwards(curr, this,
2082 LOCK_USED_IN_SOFTIRQ_READ, "soft-read"))
2083 return 0;
2084#endif
2085 if (softirq_verbose(hlock_class(this)))
2086 ret = 2;
2087 break;
2088 case LOCK_ENABLED_HARDIRQS_READ:
2089 if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ))
2090 return 0;
2091#if STRICT_READ_CHECKS
2092 /*
2093 * just marked it hardirq-read-unsafe, check that no
2094 * hardirq-safe lock in the system ever took it in the past:
2095 */
2096 if (!check_usage_backwards(curr, this,
2097 LOCK_USED_IN_HARDIRQ, "hard"))
2098 return 0;
2099#endif
2100 if (hardirq_verbose(hlock_class(this)))
2101 ret = 2;
2102 break;
2103 case LOCK_ENABLED_SOFTIRQS_READ:
2104 if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ))
2105 return 0; 2049 return 0;
2106#if STRICT_READ_CHECKS 2050
2107 /* 2051 if (STRICT_READ_CHECKS &&
2108 * just marked it softirq-read-unsafe, check that no 2052 !usage(curr, this, excl_bit + 1,
2109 * softirq-safe lock in the system ever took it in the past: 2053 state_name(new_bit + 1)))
2110 */
2111 if (!check_usage_backwards(curr, this,
2112 LOCK_USED_IN_SOFTIRQ, "soft"))
2113 return 0; 2054 return 0;
2114#endif
2115 if (softirq_verbose(hlock_class(this)))
2116 ret = 2;
2117 break;
2118 default:
2119 WARN_ON(1);
2120 break;
2121 } 2055 }
2122 2056
2123 return ret; 2057 if (state_verbose(new_bit, hlock_class(this)))
2058 return 2;
2059
2060 return 1;
2124} 2061}
2125 2062
2063enum mark_type {
2064#define LOCKDEP_STATE(__STATE) __STATE,
2065#include "lockdep_states.h"
2066#undef LOCKDEP_STATE
2067};
2068
2126/* 2069/*
2127 * Mark all held locks with a usage bit: 2070 * Mark all held locks with a usage bit:
2128 */ 2071 */
2129static int 2072static int
2130mark_held_locks(struct task_struct *curr, int hardirq) 2073mark_held_locks(struct task_struct *curr, enum mark_type mark)
2131{ 2074{
2132 enum lock_usage_bit usage_bit; 2075 enum lock_usage_bit usage_bit;
2133 struct held_lock *hlock; 2076 struct held_lock *hlock;
@@ -2136,17 +2079,12 @@ mark_held_locks(struct task_struct *curr, int hardirq)
2136 for (i = 0; i < curr->lockdep_depth; i++) { 2079 for (i = 0; i < curr->lockdep_depth; i++) {
2137 hlock = curr->held_locks + i; 2080 hlock = curr->held_locks + i;
2138 2081
2139 if (hardirq) { 2082 usage_bit = 2 + (mark << 2); /* ENABLED */
2140 if (hlock->read) 2083 if (hlock->read)
2141 usage_bit = LOCK_ENABLED_HARDIRQS_READ; 2084 usage_bit += 1; /* READ */
2142 else 2085
2143 usage_bit = LOCK_ENABLED_HARDIRQS; 2086 BUG_ON(usage_bit >= LOCK_USAGE_STATES);
2144 } else { 2087
2145 if (hlock->read)
2146 usage_bit = LOCK_ENABLED_SOFTIRQS_READ;
2147 else
2148 usage_bit = LOCK_ENABLED_SOFTIRQS;
2149 }
2150 if (!mark_lock(curr, hlock, usage_bit)) 2088 if (!mark_lock(curr, hlock, usage_bit))
2151 return 0; 2089 return 0;
2152 } 2090 }
@@ -2200,7 +2138,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
2200 * We are going to turn hardirqs on, so set the 2138 * We are going to turn hardirqs on, so set the
2201 * usage bit for all held locks: 2139 * usage bit for all held locks:
2202 */ 2140 */
2203 if (!mark_held_locks(curr, 1)) 2141 if (!mark_held_locks(curr, HARDIRQ))
2204 return; 2142 return;
2205 /* 2143 /*
2206 * If we have softirqs enabled, then set the usage 2144 * If we have softirqs enabled, then set the usage
@@ -2208,7 +2146,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
2208 * this bit from being set before) 2146 * this bit from being set before)
2209 */ 2147 */
2210 if (curr->softirqs_enabled) 2148 if (curr->softirqs_enabled)
2211 if (!mark_held_locks(curr, 0)) 2149 if (!mark_held_locks(curr, SOFTIRQ))
2212 return; 2150 return;
2213 2151
2214 curr->hardirq_enable_ip = ip; 2152 curr->hardirq_enable_ip = ip;
@@ -2288,7 +2226,7 @@ void trace_softirqs_on(unsigned long ip)
2288 * enabled too: 2226 * enabled too:
2289 */ 2227 */
2290 if (curr->hardirqs_enabled) 2228 if (curr->hardirqs_enabled)
2291 mark_held_locks(curr, 0); 2229 mark_held_locks(curr, SOFTIRQ);
2292} 2230}
2293 2231
2294/* 2232/*
@@ -2317,6 +2255,31 @@ void trace_softirqs_off(unsigned long ip)
2317 debug_atomic_inc(&redundant_softirqs_off); 2255 debug_atomic_inc(&redundant_softirqs_off);
2318} 2256}
2319 2257
2258void lockdep_trace_alloc(gfp_t gfp_mask)
2259{
2260 struct task_struct *curr = current;
2261
2262 if (unlikely(!debug_locks))
2263 return;
2264
2265 /* no reclaim without waiting on it */
2266 if (!(gfp_mask & __GFP_WAIT))
2267 return;
2268
2269 /* this guy won't enter reclaim */
2270 if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
2271 return;
2272
2273 /* We're only interested __GFP_FS allocations for now */
2274 if (!(gfp_mask & __GFP_FS))
2275 return;
2276
2277 if (DEBUG_LOCKS_WARN_ON(irqs_disabled()))
2278 return;
2279
2280 mark_held_locks(curr, RECLAIM_FS);
2281}
2282
2320static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) 2283static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
2321{ 2284{
2322 /* 2285 /*
@@ -2345,19 +2308,35 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
2345 if (!hlock->hardirqs_off) { 2308 if (!hlock->hardirqs_off) {
2346 if (hlock->read) { 2309 if (hlock->read) {
2347 if (!mark_lock(curr, hlock, 2310 if (!mark_lock(curr, hlock,
2348 LOCK_ENABLED_HARDIRQS_READ)) 2311 LOCK_ENABLED_HARDIRQ_READ))
2349 return 0; 2312 return 0;
2350 if (curr->softirqs_enabled) 2313 if (curr->softirqs_enabled)
2351 if (!mark_lock(curr, hlock, 2314 if (!mark_lock(curr, hlock,
2352 LOCK_ENABLED_SOFTIRQS_READ)) 2315 LOCK_ENABLED_SOFTIRQ_READ))
2353 return 0; 2316 return 0;
2354 } else { 2317 } else {
2355 if (!mark_lock(curr, hlock, 2318 if (!mark_lock(curr, hlock,
2356 LOCK_ENABLED_HARDIRQS)) 2319 LOCK_ENABLED_HARDIRQ))
2357 return 0; 2320 return 0;
2358 if (curr->softirqs_enabled) 2321 if (curr->softirqs_enabled)
2359 if (!mark_lock(curr, hlock, 2322 if (!mark_lock(curr, hlock,
2360 LOCK_ENABLED_SOFTIRQS)) 2323 LOCK_ENABLED_SOFTIRQ))
2324 return 0;
2325 }
2326 }
2327
2328 /*
2329 * We reuse the irq context infrastructure more broadly as a general
2330 * context checking code. This tests GFP_FS recursion (a lock taken
2331 * during reclaim for a GFP_FS allocation is held over a GFP_FS
2332 * allocation).
2333 */
2334 if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) {
2335 if (hlock->read) {
2336 if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ))
2337 return 0;
2338 } else {
2339 if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS))
2361 return 0; 2340 return 0;
2362 } 2341 }
2363 } 2342 }
@@ -2412,6 +2391,10 @@ static inline int separate_irq_context(struct task_struct *curr,
2412 return 0; 2391 return 0;
2413} 2392}
2414 2393
2394void lockdep_trace_alloc(gfp_t gfp_mask)
2395{
2396}
2397
2415#endif 2398#endif
2416 2399
2417/* 2400/*
@@ -2445,14 +2428,13 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2445 return 0; 2428 return 0;
2446 2429
2447 switch (new_bit) { 2430 switch (new_bit) {
2448 case LOCK_USED_IN_HARDIRQ: 2431#define LOCKDEP_STATE(__STATE) \
2449 case LOCK_USED_IN_SOFTIRQ: 2432 case LOCK_USED_IN_##__STATE: \
2450 case LOCK_USED_IN_HARDIRQ_READ: 2433 case LOCK_USED_IN_##__STATE##_READ: \
2451 case LOCK_USED_IN_SOFTIRQ_READ: 2434 case LOCK_ENABLED_##__STATE: \
2452 case LOCK_ENABLED_HARDIRQS: 2435 case LOCK_ENABLED_##__STATE##_READ:
2453 case LOCK_ENABLED_SOFTIRQS: 2436#include "lockdep_states.h"
2454 case LOCK_ENABLED_HARDIRQS_READ: 2437#undef LOCKDEP_STATE
2455 case LOCK_ENABLED_SOFTIRQS_READ:
2456 ret = mark_lock_irq(curr, this, new_bit); 2438 ret = mark_lock_irq(curr, this, new_bit);
2457 if (!ret) 2439 if (!ret)
2458 return 0; 2440 return 0;
@@ -2925,6 +2907,8 @@ void lock_set_class(struct lockdep_map *lock, const char *name,
2925} 2907}
2926EXPORT_SYMBOL_GPL(lock_set_class); 2908EXPORT_SYMBOL_GPL(lock_set_class);
2927 2909
2910DEFINE_TRACE(lock_acquire);
2911
2928/* 2912/*
2929 * We are not always called with irqs disabled - do that here, 2913 * We are not always called with irqs disabled - do that here,
2930 * and also avoid lockdep recursion: 2914 * and also avoid lockdep recursion:
@@ -2935,6 +2919,8 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2935{ 2919{
2936 unsigned long flags; 2920 unsigned long flags;
2937 2921
2922 trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
2923
2938 if (unlikely(current->lockdep_recursion)) 2924 if (unlikely(current->lockdep_recursion))
2939 return; 2925 return;
2940 2926
@@ -2949,11 +2935,15 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2949} 2935}
2950EXPORT_SYMBOL_GPL(lock_acquire); 2936EXPORT_SYMBOL_GPL(lock_acquire);
2951 2937
2938DEFINE_TRACE(lock_release);
2939
2952void lock_release(struct lockdep_map *lock, int nested, 2940void lock_release(struct lockdep_map *lock, int nested,
2953 unsigned long ip) 2941 unsigned long ip)
2954{ 2942{
2955 unsigned long flags; 2943 unsigned long flags;
2956 2944
2945 trace_lock_release(lock, nested, ip);
2946
2957 if (unlikely(current->lockdep_recursion)) 2947 if (unlikely(current->lockdep_recursion))
2958 return; 2948 return;
2959 2949
@@ -2966,6 +2956,16 @@ void lock_release(struct lockdep_map *lock, int nested,
2966} 2956}
2967EXPORT_SYMBOL_GPL(lock_release); 2957EXPORT_SYMBOL_GPL(lock_release);
2968 2958
2959void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
2960{
2961 current->lockdep_reclaim_gfp = gfp_mask;
2962}
2963
2964void lockdep_clear_current_reclaim_state(void)
2965{
2966 current->lockdep_reclaim_gfp = 0;
2967}
2968
2969#ifdef CONFIG_LOCK_STAT 2969#ifdef CONFIG_LOCK_STAT
2970static int 2970static int
2971print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, 2971print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
@@ -3092,10 +3092,14 @@ found_it:
3092 lock->ip = ip; 3092 lock->ip = ip;
3093} 3093}
3094 3094
3095DEFINE_TRACE(lock_contended);
3096
3095void lock_contended(struct lockdep_map *lock, unsigned long ip) 3097void lock_contended(struct lockdep_map *lock, unsigned long ip)
3096{ 3098{
3097 unsigned long flags; 3099 unsigned long flags;
3098 3100
3101 trace_lock_contended(lock, ip);
3102
3099 if (unlikely(!lock_stat)) 3103 if (unlikely(!lock_stat))
3100 return; 3104 return;
3101 3105
@@ -3111,10 +3115,14 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
3111} 3115}
3112EXPORT_SYMBOL_GPL(lock_contended); 3116EXPORT_SYMBOL_GPL(lock_contended);
3113 3117
3118DEFINE_TRACE(lock_acquired);
3119
3114void lock_acquired(struct lockdep_map *lock, unsigned long ip) 3120void lock_acquired(struct lockdep_map *lock, unsigned long ip)
3115{ 3121{
3116 unsigned long flags; 3122 unsigned long flags;
3117 3123
3124 trace_lock_acquired(lock, ip);
3125
3118 if (unlikely(!lock_stat)) 3126 if (unlikely(!lock_stat))
3119 return; 3127 return;
3120 3128
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index 56b196932c08..a2cc7e9a6e84 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -7,6 +7,45 @@
7 */ 7 */
8 8
9/* 9/*
10 * Lock-class usage-state bits:
11 */
12enum lock_usage_bit {
13#define LOCKDEP_STATE(__STATE) \
14 LOCK_USED_IN_##__STATE, \
15 LOCK_USED_IN_##__STATE##_READ, \
16 LOCK_ENABLED_##__STATE, \
17 LOCK_ENABLED_##__STATE##_READ,
18#include "lockdep_states.h"
19#undef LOCKDEP_STATE
20 LOCK_USED,
21 LOCK_USAGE_STATES
22};
23
24/*
25 * Usage-state bitmasks:
26 */
27#define __LOCKF(__STATE) LOCKF_##__STATE = (1 << LOCK_##__STATE),
28
29enum {
30#define LOCKDEP_STATE(__STATE) \
31 __LOCKF(USED_IN_##__STATE) \
32 __LOCKF(USED_IN_##__STATE##_READ) \
33 __LOCKF(ENABLED_##__STATE) \
34 __LOCKF(ENABLED_##__STATE##_READ)
35#include "lockdep_states.h"
36#undef LOCKDEP_STATE
37 __LOCKF(USED)
38};
39
40#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ)
41#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ)
42
43#define LOCKF_ENABLED_IRQ_READ \
44 (LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ)
45#define LOCKF_USED_IN_IRQ_READ \
46 (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ)
47
48/*
10 * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies 49 * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies
11 * we track. 50 * we track.
12 * 51 *
@@ -31,8 +70,10 @@
31extern struct list_head all_lock_classes; 70extern struct list_head all_lock_classes;
32extern struct lock_chain lock_chains[]; 71extern struct lock_chain lock_chains[];
33 72
34extern void 73#define LOCK_USAGE_CHARS (1+LOCK_USAGE_STATES/2)
35get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4); 74
75extern void get_usage_chars(struct lock_class *class,
76 char usage[LOCK_USAGE_CHARS]);
36 77
37extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str); 78extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str);
38 79
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 13716b813896..d7135aa2d2c4 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -84,7 +84,7 @@ static int l_show(struct seq_file *m, void *v)
84{ 84{
85 struct lock_class *class = v; 85 struct lock_class *class = v;
86 struct lock_list *entry; 86 struct lock_list *entry;
87 char c1, c2, c3, c4; 87 char usage[LOCK_USAGE_CHARS];
88 88
89 if (v == SEQ_START_TOKEN) { 89 if (v == SEQ_START_TOKEN) {
90 seq_printf(m, "all lock classes:\n"); 90 seq_printf(m, "all lock classes:\n");
@@ -100,8 +100,8 @@ static int l_show(struct seq_file *m, void *v)
100 seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class)); 100 seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class));
101#endif 101#endif
102 102
103 get_usage_chars(class, &c1, &c2, &c3, &c4); 103 get_usage_chars(class, usage);
104 seq_printf(m, " %c%c%c%c", c1, c2, c3, c4); 104 seq_printf(m, " %s", usage);
105 105
106 seq_printf(m, ": "); 106 seq_printf(m, ": ");
107 print_name(m, class); 107 print_name(m, class);
@@ -300,27 +300,27 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
300 nr_uncategorized++; 300 nr_uncategorized++;
301 if (class->usage_mask & LOCKF_USED_IN_IRQ) 301 if (class->usage_mask & LOCKF_USED_IN_IRQ)
302 nr_irq_safe++; 302 nr_irq_safe++;
303 if (class->usage_mask & LOCKF_ENABLED_IRQS) 303 if (class->usage_mask & LOCKF_ENABLED_IRQ)
304 nr_irq_unsafe++; 304 nr_irq_unsafe++;
305 if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ) 305 if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ)
306 nr_softirq_safe++; 306 nr_softirq_safe++;
307 if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS) 307 if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ)
308 nr_softirq_unsafe++; 308 nr_softirq_unsafe++;
309 if (class->usage_mask & LOCKF_USED_IN_HARDIRQ) 309 if (class->usage_mask & LOCKF_USED_IN_HARDIRQ)
310 nr_hardirq_safe++; 310 nr_hardirq_safe++;
311 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS) 311 if (class->usage_mask & LOCKF_ENABLED_HARDIRQ)
312 nr_hardirq_unsafe++; 312 nr_hardirq_unsafe++;
313 if (class->usage_mask & LOCKF_USED_IN_IRQ_READ) 313 if (class->usage_mask & LOCKF_USED_IN_IRQ_READ)
314 nr_irq_read_safe++; 314 nr_irq_read_safe++;
315 if (class->usage_mask & LOCKF_ENABLED_IRQS_READ) 315 if (class->usage_mask & LOCKF_ENABLED_IRQ_READ)
316 nr_irq_read_unsafe++; 316 nr_irq_read_unsafe++;
317 if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) 317 if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ)
318 nr_softirq_read_safe++; 318 nr_softirq_read_safe++;
319 if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) 319 if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ_READ)
320 nr_softirq_read_unsafe++; 320 nr_softirq_read_unsafe++;
321 if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) 321 if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ)
322 nr_hardirq_read_safe++; 322 nr_hardirq_read_safe++;
323 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) 323 if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ)
324 nr_hardirq_read_unsafe++; 324 nr_hardirq_read_unsafe++;
325 325
326#ifdef CONFIG_PROVE_LOCKING 326#ifdef CONFIG_PROVE_LOCKING
@@ -601,6 +601,10 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
601static void seq_header(struct seq_file *m) 601static void seq_header(struct seq_file *m)
602{ 602{
603 seq_printf(m, "lock_stat version 0.3\n"); 603 seq_printf(m, "lock_stat version 0.3\n");
604
605 if (unlikely(!debug_locks))
606 seq_printf(m, "*WARNING* lock debugging disabled!! - possibly due to a lockdep warning\n");
607
604 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); 608 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
605 seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s " 609 seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s "
606 "%14s %14s\n", 610 "%14s %14s\n",
diff --git a/kernel/lockdep_states.h b/kernel/lockdep_states.h
new file mode 100644
index 000000000000..995b0cc2b84c
--- /dev/null
+++ b/kernel/lockdep_states.h
@@ -0,0 +1,9 @@
1/*
2 * Lockdep states,
3 *
4 * please update XXX_LOCK_USAGE_STATES in include/linux/lockdep.h whenever
5 * you add one, or come up with a nice dynamic solution.
6 */
7LOCKDEP_STATE(HARDIRQ)
8LOCKDEP_STATE(SOFTIRQ)
9LOCKDEP_STATE(RECLAIM_FS)
diff --git a/kernel/module.c b/kernel/module.c
index 1196f5d11700..7fa134e0cc24 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -51,6 +51,7 @@
51#include <linux/tracepoint.h> 51#include <linux/tracepoint.h>
52#include <linux/ftrace.h> 52#include <linux/ftrace.h>
53#include <linux/async.h> 53#include <linux/async.h>
54#include <linux/percpu.h>
54 55
55#if 0 56#if 0
56#define DEBUGP printk 57#define DEBUGP printk
@@ -366,6 +367,34 @@ static struct module *find_module(const char *name)
366} 367}
367 368
368#ifdef CONFIG_SMP 369#ifdef CONFIG_SMP
370
371#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
372
373static void *percpu_modalloc(unsigned long size, unsigned long align,
374 const char *name)
375{
376 void *ptr;
377
378 if (align > PAGE_SIZE) {
379 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
380 name, align, PAGE_SIZE);
381 align = PAGE_SIZE;
382 }
383
384 ptr = __alloc_reserved_percpu(size, align);
385 if (!ptr)
386 printk(KERN_WARNING
387 "Could not allocate %lu bytes percpu data\n", size);
388 return ptr;
389}
390
391static void percpu_modfree(void *freeme)
392{
393 free_percpu(freeme);
394}
395
396#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
397
369/* Number of blocks used and allocated. */ 398/* Number of blocks used and allocated. */
370static unsigned int pcpu_num_used, pcpu_num_allocated; 399static unsigned int pcpu_num_used, pcpu_num_allocated;
371/* Size of each block. -ve means used. */ 400/* Size of each block. -ve means used. */
@@ -480,21 +509,6 @@ static void percpu_modfree(void *freeme)
480 } 509 }
481} 510}
482 511
483static unsigned int find_pcpusec(Elf_Ehdr *hdr,
484 Elf_Shdr *sechdrs,
485 const char *secstrings)
486{
487 return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
488}
489
490static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
491{
492 int cpu;
493
494 for_each_possible_cpu(cpu)
495 memcpy(pcpudest + per_cpu_offset(cpu), from, size);
496}
497
498static int percpu_modinit(void) 512static int percpu_modinit(void)
499{ 513{
500 pcpu_num_used = 2; 514 pcpu_num_used = 2;
@@ -513,7 +527,26 @@ static int percpu_modinit(void)
513 return 0; 527 return 0;
514} 528}
515__initcall(percpu_modinit); 529__initcall(percpu_modinit);
530
531#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
532
533static unsigned int find_pcpusec(Elf_Ehdr *hdr,
534 Elf_Shdr *sechdrs,
535 const char *secstrings)
536{
537 return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
538}
539
540static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
541{
542 int cpu;
543
544 for_each_possible_cpu(cpu)
545 memcpy(pcpudest + per_cpu_offset(cpu), from, size);
546}
547
516#else /* ... !CONFIG_SMP */ 548#else /* ... !CONFIG_SMP */
549
517static inline void *percpu_modalloc(unsigned long size, unsigned long align, 550static inline void *percpu_modalloc(unsigned long size, unsigned long align,
518 const char *name) 551 const char *name)
519{ 552{
@@ -535,6 +568,7 @@ static inline void percpu_modcopy(void *pcpudst, const void *src,
535 /* pcpusec should be 0, and size of that section should be 0. */ 568 /* pcpusec should be 0, and size of that section should be 0. */
536 BUG_ON(size != 0); 569 BUG_ON(size != 0);
537} 570}
571
538#endif /* CONFIG_SMP */ 572#endif /* CONFIG_SMP */
539 573
540#define MODINFO_ATTR(field) \ 574#define MODINFO_ATTR(field) \
@@ -2737,7 +2771,7 @@ int is_module_address(unsigned long addr)
2737 2771
2738 2772
2739/* Is this a valid kernel address? */ 2773/* Is this a valid kernel address? */
2740__notrace_funcgraph struct module *__module_text_address(unsigned long addr) 2774struct module *__module_text_address(unsigned long addr)
2741{ 2775{
2742 struct module *mod; 2776 struct module *mod;
2743 2777
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 1d94160eb532..50d022e5a560 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -26,11 +26,6 @@
26/* 26/*
27 * Must be called with lock->wait_lock held. 27 * Must be called with lock->wait_lock held.
28 */ 28 */
29void debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner)
30{
31 lock->owner = new_owner;
32}
33
34void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) 29void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
35{ 30{
36 memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter)); 31 memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
@@ -59,7 +54,6 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
59 54
60 /* Mark the current thread as blocked on the lock: */ 55 /* Mark the current thread as blocked on the lock: */
61 ti->task->blocked_on = waiter; 56 ti->task->blocked_on = waiter;
62 waiter->lock = lock;
63} 57}
64 58
65void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, 59void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
@@ -82,7 +76,7 @@ void debug_mutex_unlock(struct mutex *lock)
82 DEBUG_LOCKS_WARN_ON(lock->magic != lock); 76 DEBUG_LOCKS_WARN_ON(lock->magic != lock);
83 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); 77 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
84 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); 78 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
85 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); 79 mutex_clear_owner(lock);
86} 80}
87 81
88void debug_mutex_init(struct mutex *lock, const char *name, 82void debug_mutex_init(struct mutex *lock, const char *name,
@@ -95,7 +89,6 @@ void debug_mutex_init(struct mutex *lock, const char *name,
95 debug_check_no_locks_freed((void *)lock, sizeof(*lock)); 89 debug_check_no_locks_freed((void *)lock, sizeof(*lock));
96 lockdep_init_map(&lock->dep_map, name, key, 0); 90 lockdep_init_map(&lock->dep_map, name, key, 0);
97#endif 91#endif
98 lock->owner = NULL;
99 lock->magic = lock; 92 lock->magic = lock;
100} 93}
101 94
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index babfbdfc534b..6b2d735846a5 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -13,14 +13,6 @@
13/* 13/*
14 * This must be called with lock->wait_lock held. 14 * This must be called with lock->wait_lock held.
15 */ 15 */
16extern void
17debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner);
18
19static inline void debug_mutex_clear_owner(struct mutex *lock)
20{
21 lock->owner = NULL;
22}
23
24extern void debug_mutex_lock_common(struct mutex *lock, 16extern void debug_mutex_lock_common(struct mutex *lock,
25 struct mutex_waiter *waiter); 17 struct mutex_waiter *waiter);
26extern void debug_mutex_wake_waiter(struct mutex *lock, 18extern void debug_mutex_wake_waiter(struct mutex *lock,
@@ -35,6 +27,16 @@ extern void debug_mutex_unlock(struct mutex *lock);
35extern void debug_mutex_init(struct mutex *lock, const char *name, 27extern void debug_mutex_init(struct mutex *lock, const char *name,
36 struct lock_class_key *key); 28 struct lock_class_key *key);
37 29
30static inline void mutex_set_owner(struct mutex *lock)
31{
32 lock->owner = current_thread_info();
33}
34
35static inline void mutex_clear_owner(struct mutex *lock)
36{
37 lock->owner = NULL;
38}
39
38#define spin_lock_mutex(lock, flags) \ 40#define spin_lock_mutex(lock, flags) \
39 do { \ 41 do { \
40 struct mutex *l = container_of(lock, struct mutex, wait_lock); \ 42 struct mutex *l = container_of(lock, struct mutex, wait_lock); \
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 4f45d4b658ef..5d79781394a3 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -10,6 +10,11 @@
10 * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and 10 * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and
11 * David Howells for suggestions and improvements. 11 * David Howells for suggestions and improvements.
12 * 12 *
13 * - Adaptive spinning for mutexes by Peter Zijlstra. (Ported to mainline
14 * from the -rt tree, where it was originally implemented for rtmutexes
15 * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale
16 * and Sven Dietrich.
17 *
13 * Also see Documentation/mutex-design.txt. 18 * Also see Documentation/mutex-design.txt.
14 */ 19 */
15#include <linux/mutex.h> 20#include <linux/mutex.h>
@@ -46,6 +51,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
46 atomic_set(&lock->count, 1); 51 atomic_set(&lock->count, 1);
47 spin_lock_init(&lock->wait_lock); 52 spin_lock_init(&lock->wait_lock);
48 INIT_LIST_HEAD(&lock->wait_list); 53 INIT_LIST_HEAD(&lock->wait_list);
54 mutex_clear_owner(lock);
49 55
50 debug_mutex_init(lock, name, key); 56 debug_mutex_init(lock, name, key);
51} 57}
@@ -91,6 +97,7 @@ void inline __sched mutex_lock(struct mutex *lock)
91 * 'unlocked' into 'locked' state. 97 * 'unlocked' into 'locked' state.
92 */ 98 */
93 __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); 99 __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
100 mutex_set_owner(lock);
94} 101}
95 102
96EXPORT_SYMBOL(mutex_lock); 103EXPORT_SYMBOL(mutex_lock);
@@ -115,6 +122,14 @@ void __sched mutex_unlock(struct mutex *lock)
115 * The unlocking fastpath is the 0->1 transition from 'locked' 122 * The unlocking fastpath is the 0->1 transition from 'locked'
116 * into 'unlocked' state: 123 * into 'unlocked' state:
117 */ 124 */
125#ifndef CONFIG_DEBUG_MUTEXES
126 /*
127 * When debugging is enabled we must not clear the owner before time,
128 * the slow path will always be taken, and that clears the owner field
129 * after verifying that it was indeed current.
130 */
131 mutex_clear_owner(lock);
132#endif
118 __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); 133 __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath);
119} 134}
120 135
@@ -129,21 +144,75 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
129{ 144{
130 struct task_struct *task = current; 145 struct task_struct *task = current;
131 struct mutex_waiter waiter; 146 struct mutex_waiter waiter;
132 unsigned int old_val;
133 unsigned long flags; 147 unsigned long flags;
134 148
149 preempt_disable();
150 mutex_acquire(&lock->dep_map, subclass, 0, ip);
151#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES)
152 /*
153 * Optimistic spinning.
154 *
155 * We try to spin for acquisition when we find that there are no
156 * pending waiters and the lock owner is currently running on a
157 * (different) CPU.
158 *
159 * The rationale is that if the lock owner is running, it is likely to
160 * release the lock soon.
161 *
162 * Since this needs the lock owner, and this mutex implementation
163 * doesn't track the owner atomically in the lock field, we need to
164 * track it non-atomically.
165 *
166 * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
167 * to serialize everything.
168 */
169
170 for (;;) {
171 struct thread_info *owner;
172
173 /*
174 * If there's an owner, wait for it to either
175 * release the lock or go to sleep.
176 */
177 owner = ACCESS_ONCE(lock->owner);
178 if (owner && !mutex_spin_on_owner(lock, owner))
179 break;
180
181 if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
182 lock_acquired(&lock->dep_map, ip);
183 mutex_set_owner(lock);
184 preempt_enable();
185 return 0;
186 }
187
188 /*
189 * When there's no owner, we might have preempted between the
190 * owner acquiring the lock and setting the owner field. If
191 * we're an RT task that will live-lock because we won't let
192 * the owner complete.
193 */
194 if (!owner && (need_resched() || rt_task(task)))
195 break;
196
197 /*
198 * The cpu_relax() call is a compiler barrier which forces
199 * everything in this loop to be re-loaded. We don't need
200 * memory barriers as we'll eventually observe the right
201 * values at the cost of a few extra spins.
202 */
203 cpu_relax();
204 }
205#endif
135 spin_lock_mutex(&lock->wait_lock, flags); 206 spin_lock_mutex(&lock->wait_lock, flags);
136 207
137 debug_mutex_lock_common(lock, &waiter); 208 debug_mutex_lock_common(lock, &waiter);
138 mutex_acquire(&lock->dep_map, subclass, 0, ip);
139 debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); 209 debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));
140 210
141 /* add waiting tasks to the end of the waitqueue (FIFO): */ 211 /* add waiting tasks to the end of the waitqueue (FIFO): */
142 list_add_tail(&waiter.list, &lock->wait_list); 212 list_add_tail(&waiter.list, &lock->wait_list);
143 waiter.task = task; 213 waiter.task = task;
144 214
145 old_val = atomic_xchg(&lock->count, -1); 215 if (atomic_xchg(&lock->count, -1) == 1)
146 if (old_val == 1)
147 goto done; 216 goto done;
148 217
149 lock_contended(&lock->dep_map, ip); 218 lock_contended(&lock->dep_map, ip);
@@ -158,8 +227,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
158 * that when we release the lock, we properly wake up the 227 * that when we release the lock, we properly wake up the
159 * other waiters: 228 * other waiters:
160 */ 229 */
161 old_val = atomic_xchg(&lock->count, -1); 230 if (atomic_xchg(&lock->count, -1) == 1)
162 if (old_val == 1)
163 break; 231 break;
164 232
165 /* 233 /*
@@ -173,21 +241,22 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
173 spin_unlock_mutex(&lock->wait_lock, flags); 241 spin_unlock_mutex(&lock->wait_lock, flags);
174 242
175 debug_mutex_free_waiter(&waiter); 243 debug_mutex_free_waiter(&waiter);
244 preempt_enable();
176 return -EINTR; 245 return -EINTR;
177 } 246 }
178 __set_task_state(task, state); 247 __set_task_state(task, state);
179 248
180 /* didnt get the lock, go to sleep: */ 249 /* didnt get the lock, go to sleep: */
181 spin_unlock_mutex(&lock->wait_lock, flags); 250 spin_unlock_mutex(&lock->wait_lock, flags);
182 schedule(); 251 __schedule();
183 spin_lock_mutex(&lock->wait_lock, flags); 252 spin_lock_mutex(&lock->wait_lock, flags);
184 } 253 }
185 254
186done: 255done:
187 lock_acquired(&lock->dep_map, ip); 256 lock_acquired(&lock->dep_map, ip);
188 /* got the lock - rejoice! */ 257 /* got the lock - rejoice! */
189 mutex_remove_waiter(lock, &waiter, task_thread_info(task)); 258 mutex_remove_waiter(lock, &waiter, current_thread_info());
190 debug_mutex_set_owner(lock, task_thread_info(task)); 259 mutex_set_owner(lock);
191 260
192 /* set it to 0 if there are no waiters left: */ 261 /* set it to 0 if there are no waiters left: */
193 if (likely(list_empty(&lock->wait_list))) 262 if (likely(list_empty(&lock->wait_list)))
@@ -196,6 +265,7 @@ done:
196 spin_unlock_mutex(&lock->wait_lock, flags); 265 spin_unlock_mutex(&lock->wait_lock, flags);
197 266
198 debug_mutex_free_waiter(&waiter); 267 debug_mutex_free_waiter(&waiter);
268 preempt_enable();
199 269
200 return 0; 270 return 0;
201} 271}
@@ -222,7 +292,8 @@ int __sched
222mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) 292mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
223{ 293{
224 might_sleep(); 294 might_sleep();
225 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, _RET_IP_); 295 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
296 subclass, _RET_IP_);
226} 297}
227 298
228EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); 299EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
@@ -260,8 +331,6 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
260 wake_up_process(waiter->task); 331 wake_up_process(waiter->task);
261 } 332 }
262 333
263 debug_mutex_clear_owner(lock);
264
265 spin_unlock_mutex(&lock->wait_lock, flags); 334 spin_unlock_mutex(&lock->wait_lock, flags);
266} 335}
267 336
@@ -298,18 +367,30 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count);
298 */ 367 */
299int __sched mutex_lock_interruptible(struct mutex *lock) 368int __sched mutex_lock_interruptible(struct mutex *lock)
300{ 369{
370 int ret;
371
301 might_sleep(); 372 might_sleep();
302 return __mutex_fastpath_lock_retval 373 ret = __mutex_fastpath_lock_retval
303 (&lock->count, __mutex_lock_interruptible_slowpath); 374 (&lock->count, __mutex_lock_interruptible_slowpath);
375 if (!ret)
376 mutex_set_owner(lock);
377
378 return ret;
304} 379}
305 380
306EXPORT_SYMBOL(mutex_lock_interruptible); 381EXPORT_SYMBOL(mutex_lock_interruptible);
307 382
308int __sched mutex_lock_killable(struct mutex *lock) 383int __sched mutex_lock_killable(struct mutex *lock)
309{ 384{
385 int ret;
386
310 might_sleep(); 387 might_sleep();
311 return __mutex_fastpath_lock_retval 388 ret = __mutex_fastpath_lock_retval
312 (&lock->count, __mutex_lock_killable_slowpath); 389 (&lock->count, __mutex_lock_killable_slowpath);
390 if (!ret)
391 mutex_set_owner(lock);
392
393 return ret;
313} 394}
314EXPORT_SYMBOL(mutex_lock_killable); 395EXPORT_SYMBOL(mutex_lock_killable);
315 396
@@ -352,9 +433,10 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
352 433
353 prev = atomic_xchg(&lock->count, -1); 434 prev = atomic_xchg(&lock->count, -1);
354 if (likely(prev == 1)) { 435 if (likely(prev == 1)) {
355 debug_mutex_set_owner(lock, current_thread_info()); 436 mutex_set_owner(lock);
356 mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); 437 mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
357 } 438 }
439
358 /* Set it back to 0 if there are no waiters: */ 440 /* Set it back to 0 if there are no waiters: */
359 if (likely(list_empty(&lock->wait_list))) 441 if (likely(list_empty(&lock->wait_list)))
360 atomic_set(&lock->count, 0); 442 atomic_set(&lock->count, 0);
@@ -380,8 +462,13 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
380 */ 462 */
381int __sched mutex_trylock(struct mutex *lock) 463int __sched mutex_trylock(struct mutex *lock)
382{ 464{
383 return __mutex_fastpath_trylock(&lock->count, 465 int ret;
384 __mutex_trylock_slowpath); 466
467 ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath);
468 if (ret)
469 mutex_set_owner(lock);
470
471 return ret;
385} 472}
386 473
387EXPORT_SYMBOL(mutex_trylock); 474EXPORT_SYMBOL(mutex_trylock);
diff --git a/kernel/mutex.h b/kernel/mutex.h
index a075dafbb290..67578ca48f94 100644
--- a/kernel/mutex.h
+++ b/kernel/mutex.h
@@ -16,8 +16,26 @@
16#define mutex_remove_waiter(lock, waiter, ti) \ 16#define mutex_remove_waiter(lock, waiter, ti) \
17 __list_del((waiter)->list.prev, (waiter)->list.next) 17 __list_del((waiter)->list.prev, (waiter)->list.next)
18 18
19#define debug_mutex_set_owner(lock, new_owner) do { } while (0) 19#ifdef CONFIG_SMP
20#define debug_mutex_clear_owner(lock) do { } while (0) 20static inline void mutex_set_owner(struct mutex *lock)
21{
22 lock->owner = current_thread_info();
23}
24
25static inline void mutex_clear_owner(struct mutex *lock)
26{
27 lock->owner = NULL;
28}
29#else
30static inline void mutex_set_owner(struct mutex *lock)
31{
32}
33
34static inline void mutex_clear_owner(struct mutex *lock)
35{
36}
37#endif
38
21#define debug_mutex_wake_waiter(lock, waiter) do { } while (0) 39#define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
22#define debug_mutex_free_waiter(waiter) do { } while (0) 40#define debug_mutex_free_waiter(waiter) do { } while (0)
23#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) 41#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0)
diff --git a/kernel/panic.c b/kernel/panic.c
index 2a2ff36ff44d..32fe4eff1b89 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -74,6 +74,9 @@ NORET_TYPE void panic(const char * fmt, ...)
74 vsnprintf(buf, sizeof(buf), fmt, args); 74 vsnprintf(buf, sizeof(buf), fmt, args);
75 va_end(args); 75 va_end(args);
76 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); 76 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
77#ifdef CONFIG_DEBUG_BUGVERBOSE
78 dump_stack();
79#endif
77 bust_spinlocks(0); 80 bust_spinlocks(0);
78 81
79 /* 82 /*
@@ -355,15 +358,18 @@ EXPORT_SYMBOL(warn_slowpath);
355#endif 358#endif
356 359
357#ifdef CONFIG_CC_STACKPROTECTOR 360#ifdef CONFIG_CC_STACKPROTECTOR
361
358/* 362/*
359 * Called when gcc's -fstack-protector feature is used, and 363 * Called when gcc's -fstack-protector feature is used, and
360 * gcc detects corruption of the on-stack canary value 364 * gcc detects corruption of the on-stack canary value
361 */ 365 */
362void __stack_chk_fail(void) 366void __stack_chk_fail(void)
363{ 367{
364 panic("stack-protector: Kernel stack is corrupted"); 368 panic("stack-protector: Kernel stack is corrupted in: %p\n",
369 __builtin_return_address(0));
365} 370}
366EXPORT_SYMBOL(__stack_chk_fail); 371EXPORT_SYMBOL(__stack_chk_fail);
372
367#endif 373#endif
368 374
369core_param(panic, panic_timeout, int, 0644); 375core_param(panic, panic_timeout, int, 0644);
diff --git a/kernel/relay.c b/kernel/relay.c
index 9d79b7854fa6..edc0ba6d8160 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -677,9 +677,7 @@ int relay_late_setup_files(struct rchan *chan,
677 */ 677 */
678 for_each_online_cpu(i) { 678 for_each_online_cpu(i) {
679 if (unlikely(!chan->buf[i])) { 679 if (unlikely(!chan->buf[i])) {
680 printk(KERN_ERR "relay_late_setup_files: CPU %u " 680 WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n");
681 "has no buffer, it must have!\n", i);
682 BUG();
683 err = -EINVAL; 681 err = -EINVAL;
684 break; 682 break;
685 } 683 }
diff --git a/kernel/sched.c b/kernel/sched.c
index 8e2558c2ba67..7299083e69e7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4404,10 +4404,7 @@ void scheduler_tick(void)
4404#endif 4404#endif
4405} 4405}
4406 4406
4407#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 4407unsigned long get_parent_ip(unsigned long addr)
4408 defined(CONFIG_PREEMPT_TRACER))
4409
4410static inline unsigned long get_parent_ip(unsigned long addr)
4411{ 4408{
4412 if (in_lock_functions(addr)) { 4409 if (in_lock_functions(addr)) {
4413 addr = CALLER_ADDR2; 4410 addr = CALLER_ADDR2;
@@ -4417,6 +4414,9 @@ static inline unsigned long get_parent_ip(unsigned long addr)
4417 return addr; 4414 return addr;
4418} 4415}
4419 4416
4417#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4418 defined(CONFIG_PREEMPT_TRACER))
4419
4420void __kprobes add_preempt_count(int val) 4420void __kprobes add_preempt_count(int val)
4421{ 4421{
4422#ifdef CONFIG_DEBUG_PREEMPT 4422#ifdef CONFIG_DEBUG_PREEMPT
@@ -4543,15 +4543,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
4543/* 4543/*
4544 * schedule() is the main scheduler function. 4544 * schedule() is the main scheduler function.
4545 */ 4545 */
4546asmlinkage void __sched schedule(void) 4546asmlinkage void __sched __schedule(void)
4547{ 4547{
4548 struct task_struct *prev, *next; 4548 struct task_struct *prev, *next;
4549 unsigned long *switch_count; 4549 unsigned long *switch_count;
4550 struct rq *rq; 4550 struct rq *rq;
4551 int cpu; 4551 int cpu;
4552 4552
4553need_resched:
4554 preempt_disable();
4555 cpu = smp_processor_id(); 4553 cpu = smp_processor_id();
4556 rq = cpu_rq(cpu); 4554 rq = cpu_rq(cpu);
4557 rcu_qsctr_inc(cpu); 4555 rcu_qsctr_inc(cpu);
@@ -4608,13 +4606,80 @@ need_resched_nonpreemptible:
4608 4606
4609 if (unlikely(reacquire_kernel_lock(current) < 0)) 4607 if (unlikely(reacquire_kernel_lock(current) < 0))
4610 goto need_resched_nonpreemptible; 4608 goto need_resched_nonpreemptible;
4609}
4611 4610
4611asmlinkage void __sched schedule(void)
4612{
4613need_resched:
4614 preempt_disable();
4615 __schedule();
4612 preempt_enable_no_resched(); 4616 preempt_enable_no_resched();
4613 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 4617 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
4614 goto need_resched; 4618 goto need_resched;
4615} 4619}
4616EXPORT_SYMBOL(schedule); 4620EXPORT_SYMBOL(schedule);
4617 4621
4622#ifdef CONFIG_SMP
4623/*
4624 * Look out! "owner" is an entirely speculative pointer
4625 * access and not reliable.
4626 */
4627int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
4628{
4629 unsigned int cpu;
4630 struct rq *rq;
4631
4632 if (!sched_feat(OWNER_SPIN))
4633 return 0;
4634
4635#ifdef CONFIG_DEBUG_PAGEALLOC
4636 /*
4637 * Need to access the cpu field knowing that
4638 * DEBUG_PAGEALLOC could have unmapped it if
4639 * the mutex owner just released it and exited.
4640 */
4641 if (probe_kernel_address(&owner->cpu, cpu))
4642 goto out;
4643#else
4644 cpu = owner->cpu;
4645#endif
4646
4647 /*
4648 * Even if the access succeeded (likely case),
4649 * the cpu field may no longer be valid.
4650 */
4651 if (cpu >= nr_cpumask_bits)
4652 goto out;
4653
4654 /*
4655 * We need to validate that we can do a
4656 * get_cpu() and that we have the percpu area.
4657 */
4658 if (!cpu_online(cpu))
4659 goto out;
4660
4661 rq = cpu_rq(cpu);
4662
4663 for (;;) {
4664 /*
4665 * Owner changed, break to re-assess state.
4666 */
4667 if (lock->owner != owner)
4668 break;
4669
4670 /*
4671 * Is that owner really running on that cpu?
4672 */
4673 if (task_thread_info(rq->curr) != owner || need_resched())
4674 return 0;
4675
4676 cpu_relax();
4677 }
4678out:
4679 return 1;
4680}
4681#endif
4682
4618#ifdef CONFIG_PREEMPT 4683#ifdef CONFIG_PREEMPT
4619/* 4684/*
4620 * this is the entry point to schedule() from in-kernel preemption 4685 * this is the entry point to schedule() from in-kernel preemption
@@ -5944,12 +6009,7 @@ void sched_show_task(struct task_struct *p)
5944 printk(KERN_CONT " %016lx ", thread_saved_pc(p)); 6009 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
5945#endif 6010#endif
5946#ifdef CONFIG_DEBUG_STACK_USAGE 6011#ifdef CONFIG_DEBUG_STACK_USAGE
5947 { 6012 free = stack_not_used(p);
5948 unsigned long *n = end_of_stack(p);
5949 while (!*n)
5950 n++;
5951 free = (unsigned long)n - (unsigned long)end_of_stack(p);
5952 }
5953#endif 6013#endif
5954 printk(KERN_CONT "%5lu %5d %6d\n", free, 6014 printk(KERN_CONT "%5lu %5d %6d\n", free,
5955 task_pid_nr(p), task_pid_nr(p->real_parent)); 6015 task_pid_nr(p), task_pid_nr(p->real_parent));
@@ -9490,7 +9550,7 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9490 9550
9491static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) 9551static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
9492{ 9552{
9493 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); 9553 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9494 u64 data; 9554 u64 data;
9495 9555
9496#ifndef CONFIG_64BIT 9556#ifndef CONFIG_64BIT
@@ -9509,7 +9569,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
9509 9569
9510static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) 9570static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
9511{ 9571{
9512 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); 9572 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9513 9573
9514#ifndef CONFIG_64BIT 9574#ifndef CONFIG_64BIT
9515 /* 9575 /*
@@ -9605,7 +9665,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9605 ca = task_ca(tsk); 9665 ca = task_ca(tsk);
9606 9666
9607 for (; ca; ca = ca->parent) { 9667 for (; ca; ca = ca->parent) {
9608 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); 9668 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9609 *cpuusage += cputime; 9669 *cpuusage += cputime;
9610 } 9670 }
9611} 9671}
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index a0b0852414cc..7ec82c1c61c5 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -24,11 +24,12 @@
24 * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat 24 * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
25 * consistent between cpus (never more than 2 jiffies difference). 25 * consistent between cpus (never more than 2 jiffies difference).
26 */ 26 */
27#include <linux/sched.h>
28#include <linux/percpu.h>
29#include <linux/spinlock.h> 27#include <linux/spinlock.h>
30#include <linux/ktime.h> 28#include <linux/hardirq.h>
31#include <linux/module.h> 29#include <linux/module.h>
30#include <linux/percpu.h>
31#include <linux/ktime.h>
32#include <linux/sched.h>
32 33
33/* 34/*
34 * Scheduler clock - returns current time in nanosec units. 35 * Scheduler clock - returns current time in nanosec units.
@@ -43,6 +44,10 @@ unsigned long long __attribute__((weak)) sched_clock(void)
43static __read_mostly int sched_clock_running; 44static __read_mostly int sched_clock_running;
44 45
45#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 46#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
47__read_mostly int sched_clock_stable;
48#else
49static const int sched_clock_stable = 1;
50#endif
46 51
47struct sched_clock_data { 52struct sched_clock_data {
48 /* 53 /*
@@ -87,7 +92,7 @@ void sched_clock_init(void)
87} 92}
88 93
89/* 94/*
90 * min,max except they take wrapping into account 95 * min, max except they take wrapping into account
91 */ 96 */
92 97
93static inline u64 wrap_min(u64 x, u64 y) 98static inline u64 wrap_min(u64 x, u64 y)
@@ -116,10 +121,13 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
116 if (unlikely(delta < 0)) 121 if (unlikely(delta < 0))
117 delta = 0; 122 delta = 0;
118 123
124 if (unlikely(!sched_clock_running))
125 return 0ull;
126
119 /* 127 /*
120 * scd->clock = clamp(scd->tick_gtod + delta, 128 * scd->clock = clamp(scd->tick_gtod + delta,
121 * max(scd->tick_gtod, scd->clock), 129 * max(scd->tick_gtod, scd->clock),
122 * scd->tick_gtod + TICK_NSEC); 130 * scd->tick_gtod + TICK_NSEC);
123 */ 131 */
124 132
125 clock = scd->tick_gtod + delta; 133 clock = scd->tick_gtod + delta;
@@ -148,8 +156,20 @@ static void lock_double_clock(struct sched_clock_data *data1,
148 156
149u64 sched_clock_cpu(int cpu) 157u64 sched_clock_cpu(int cpu)
150{ 158{
151 struct sched_clock_data *scd = cpu_sdc(cpu);
152 u64 now, clock, this_clock, remote_clock; 159 u64 now, clock, this_clock, remote_clock;
160 struct sched_clock_data *scd;
161
162 if (sched_clock_stable)
163 return sched_clock();
164
165 scd = cpu_sdc(cpu);
166
167 /*
168 * Normally this is not called in NMI context - but if it is,
169 * trying to do any locking here is totally lethal.
170 */
171 if (unlikely(in_nmi()))
172 return scd->clock;
153 173
154 if (unlikely(!sched_clock_running)) 174 if (unlikely(!sched_clock_running))
155 return 0ull; 175 return 0ull;
@@ -193,6 +213,8 @@ u64 sched_clock_cpu(int cpu)
193 return clock; 213 return clock;
194} 214}
195 215
216#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
217
196void sched_clock_tick(void) 218void sched_clock_tick(void)
197{ 219{
198 struct sched_clock_data *scd = this_scd(); 220 struct sched_clock_data *scd = this_scd();
@@ -235,22 +257,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
235} 257}
236EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); 258EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
237 259
238#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 260#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
239
240void sched_clock_init(void)
241{
242 sched_clock_running = 1;
243}
244
245u64 sched_clock_cpu(int cpu)
246{
247 if (unlikely(!sched_clock_running))
248 return 0;
249
250 return sched_clock();
251}
252
253#endif
254 261
255unsigned long long cpu_clock(int cpu) 262unsigned long long cpu_clock(int cpu)
256{ 263{
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index da5d93b5d2c6..07bc02e99ab1 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -13,3 +13,4 @@ SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
13SCHED_FEAT(ASYM_EFF_LOAD, 1) 13SCHED_FEAT(ASYM_EFF_LOAD, 1)
14SCHED_FEAT(WAKEUP_OVERLAP, 0) 14SCHED_FEAT(WAKEUP_OVERLAP, 0)
15SCHED_FEAT(LAST_BUDDY, 1) 15SCHED_FEAT(LAST_BUDDY, 1)
16SCHED_FEAT(OWNER_SPIN, 1)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index bac1061cea2f..da932f4c8524 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -960,12 +960,13 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
960 960
961static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); 961static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
962 962
963static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) 963static inline int pick_optimal_cpu(int this_cpu,
964 const struct cpumask *mask)
964{ 965{
965 int first; 966 int first;
966 967
967 /* "this_cpu" is cheaper to preempt than a remote processor */ 968 /* "this_cpu" is cheaper to preempt than a remote processor */
968 if ((this_cpu != -1) && cpu_isset(this_cpu, *mask)) 969 if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask))
969 return this_cpu; 970 return this_cpu;
970 971
971 first = cpumask_first(mask); 972 first = cpumask_first(mask);
@@ -981,6 +982,7 @@ static int find_lowest_rq(struct task_struct *task)
981 struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); 982 struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
982 int this_cpu = smp_processor_id(); 983 int this_cpu = smp_processor_id();
983 int cpu = task_cpu(task); 984 int cpu = task_cpu(task);
985 cpumask_var_t domain_mask;
984 986
985 if (task->rt.nr_cpus_allowed == 1) 987 if (task->rt.nr_cpus_allowed == 1)
986 return -1; /* No other targets possible */ 988 return -1; /* No other targets possible */
@@ -1013,19 +1015,25 @@ static int find_lowest_rq(struct task_struct *task)
1013 if (this_cpu == cpu) 1015 if (this_cpu == cpu)
1014 this_cpu = -1; /* Skip this_cpu opt if the same */ 1016 this_cpu = -1; /* Skip this_cpu opt if the same */
1015 1017
1016 for_each_domain(cpu, sd) { 1018 if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) {
1017 if (sd->flags & SD_WAKE_AFFINE) { 1019 for_each_domain(cpu, sd) {
1018 cpumask_t domain_mask; 1020 if (sd->flags & SD_WAKE_AFFINE) {
1019 int best_cpu; 1021 int best_cpu;
1020 1022
1021 cpumask_and(&domain_mask, sched_domain_span(sd), 1023 cpumask_and(domain_mask,
1022 lowest_mask); 1024 sched_domain_span(sd),
1025 lowest_mask);
1023 1026
1024 best_cpu = pick_optimal_cpu(this_cpu, 1027 best_cpu = pick_optimal_cpu(this_cpu,
1025 &domain_mask); 1028 domain_mask);
1026 if (best_cpu != -1) 1029
1027 return best_cpu; 1030 if (best_cpu != -1) {
1031 free_cpumask_var(domain_mask);
1032 return best_cpu;
1033 }
1034 }
1028 } 1035 }
1036 free_cpumask_var(domain_mask);
1029 } 1037 }
1030 1038
1031 /* 1039 /*
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 9041ea7948fe..65ff3e3961b4 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -21,8 +21,10 @@
21#include <linux/freezer.h> 21#include <linux/freezer.h>
22#include <linux/kthread.h> 22#include <linux/kthread.h>
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/ftrace.h>
24#include <linux/smp.h> 25#include <linux/smp.h>
25#include <linux/tick.h> 26#include <linux/tick.h>
27#include <trace/irq.h>
26 28
27#include <asm/irq.h> 29#include <asm/irq.h>
28/* 30/*
@@ -52,6 +54,11 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
52 54
53static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); 55static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
54 56
57char *softirq_to_name[NR_SOFTIRQS] = {
58 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK",
59 "TASKLET", "SCHED", "HRTIMER", "RCU"
60};
61
55/* 62/*
56 * we cannot loop indefinitely here to avoid userspace starvation, 63 * we cannot loop indefinitely here to avoid userspace starvation,
57 * but we also don't want to introduce a worst case 1/HZ latency 64 * but we also don't want to introduce a worst case 1/HZ latency
@@ -79,13 +86,23 @@ static void __local_bh_disable(unsigned long ip)
79 WARN_ON_ONCE(in_irq()); 86 WARN_ON_ONCE(in_irq());
80 87
81 raw_local_irq_save(flags); 88 raw_local_irq_save(flags);
82 add_preempt_count(SOFTIRQ_OFFSET); 89 /*
90 * The preempt tracer hooks into add_preempt_count and will break
91 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET
92 * is set and before current->softirq_enabled is cleared.
93 * We must manually increment preempt_count here and manually
94 * call the trace_preempt_off later.
95 */
96 preempt_count() += SOFTIRQ_OFFSET;
83 /* 97 /*
84 * Were softirqs turned off above: 98 * Were softirqs turned off above:
85 */ 99 */
86 if (softirq_count() == SOFTIRQ_OFFSET) 100 if (softirq_count() == SOFTIRQ_OFFSET)
87 trace_softirqs_off(ip); 101 trace_softirqs_off(ip);
88 raw_local_irq_restore(flags); 102 raw_local_irq_restore(flags);
103
104 if (preempt_count() == SOFTIRQ_OFFSET)
105 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
89} 106}
90#else /* !CONFIG_TRACE_IRQFLAGS */ 107#else /* !CONFIG_TRACE_IRQFLAGS */
91static inline void __local_bh_disable(unsigned long ip) 108static inline void __local_bh_disable(unsigned long ip)
@@ -169,6 +186,9 @@ EXPORT_SYMBOL(local_bh_enable_ip);
169 */ 186 */
170#define MAX_SOFTIRQ_RESTART 10 187#define MAX_SOFTIRQ_RESTART 10
171 188
189DEFINE_TRACE(softirq_entry);
190DEFINE_TRACE(softirq_exit);
191
172asmlinkage void __do_softirq(void) 192asmlinkage void __do_softirq(void)
173{ 193{
174 struct softirq_action *h; 194 struct softirq_action *h;
@@ -180,7 +200,7 @@ asmlinkage void __do_softirq(void)
180 account_system_vtime(current); 200 account_system_vtime(current);
181 201
182 __local_bh_disable((unsigned long)__builtin_return_address(0)); 202 __local_bh_disable((unsigned long)__builtin_return_address(0));
183 trace_softirq_enter(); 203 lockdep_softirq_enter();
184 204
185 cpu = smp_processor_id(); 205 cpu = smp_processor_id();
186restart: 206restart:
@@ -195,12 +215,14 @@ restart:
195 if (pending & 1) { 215 if (pending & 1) {
196 int prev_count = preempt_count(); 216 int prev_count = preempt_count();
197 217
218 trace_softirq_entry(h, softirq_vec);
198 h->action(h); 219 h->action(h);
199 220 trace_softirq_exit(h, softirq_vec);
200 if (unlikely(prev_count != preempt_count())) { 221 if (unlikely(prev_count != preempt_count())) {
201 printk(KERN_ERR "huh, entered softirq %td %p" 222 printk(KERN_ERR "huh, entered softirq %td %s %p"
202 "with preempt_count %08x," 223 "with preempt_count %08x,"
203 " exited with %08x?\n", h - softirq_vec, 224 " exited with %08x?\n", h - softirq_vec,
225 softirq_to_name[h - softirq_vec],
204 h->action, prev_count, preempt_count()); 226 h->action, prev_count, preempt_count());
205 preempt_count() = prev_count; 227 preempt_count() = prev_count;
206 } 228 }
@@ -220,7 +242,7 @@ restart:
220 if (pending) 242 if (pending)
221 wakeup_softirqd(); 243 wakeup_softirqd();
222 244
223 trace_softirq_exit(); 245 lockdep_softirq_exit();
224 246
225 account_system_vtime(current); 247 account_system_vtime(current);
226 _local_bh_enable(); 248 _local_bh_enable();
@@ -796,6 +818,11 @@ int __init __weak early_irq_init(void)
796 return 0; 818 return 0;
797} 819}
798 820
821int __init __weak arch_probe_nr_irqs(void)
822{
823 return 0;
824}
825
799int __init __weak arch_early_irq_init(void) 826int __init __weak arch_early_irq_init(void)
800{ 827{
801 return 0; 828 return 0;
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 0cd415ee62a2..74541ca49536 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -170,7 +170,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
170 * doesn't hit this CPU until we're ready. */ 170 * doesn't hit this CPU until we're ready. */
171 get_cpu(); 171 get_cpu();
172 for_each_online_cpu(i) { 172 for_each_online_cpu(i) {
173 sm_work = percpu_ptr(stop_machine_work, i); 173 sm_work = per_cpu_ptr(stop_machine_work, i);
174 INIT_WORK(sm_work, stop_cpu); 174 INIT_WORK(sm_work, stop_cpu);
175 queue_work_on(i, stop_machine_wq, sm_work); 175 queue_work_on(i, stop_machine_wq, sm_work);
176 } 176 }
diff --git a/kernel/timer.c b/kernel/timer.c
index 13dd64fe143d..ef1c385bc572 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -491,14 +491,18 @@ static inline void debug_timer_free(struct timer_list *timer)
491 debug_object_free(timer, &timer_debug_descr); 491 debug_object_free(timer, &timer_debug_descr);
492} 492}
493 493
494static void __init_timer(struct timer_list *timer); 494static void __init_timer(struct timer_list *timer,
495 const char *name,
496 struct lock_class_key *key);
495 497
496void init_timer_on_stack(struct timer_list *timer) 498void init_timer_on_stack_key(struct timer_list *timer,
499 const char *name,
500 struct lock_class_key *key)
497{ 501{
498 debug_object_init_on_stack(timer, &timer_debug_descr); 502 debug_object_init_on_stack(timer, &timer_debug_descr);
499 __init_timer(timer); 503 __init_timer(timer, name, key);
500} 504}
501EXPORT_SYMBOL_GPL(init_timer_on_stack); 505EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
502 506
503void destroy_timer_on_stack(struct timer_list *timer) 507void destroy_timer_on_stack(struct timer_list *timer)
504{ 508{
@@ -512,7 +516,9 @@ static inline void debug_timer_activate(struct timer_list *timer) { }
512static inline void debug_timer_deactivate(struct timer_list *timer) { } 516static inline void debug_timer_deactivate(struct timer_list *timer) { }
513#endif 517#endif
514 518
515static void __init_timer(struct timer_list *timer) 519static void __init_timer(struct timer_list *timer,
520 const char *name,
521 struct lock_class_key *key)
516{ 522{
517 timer->entry.next = NULL; 523 timer->entry.next = NULL;
518 timer->base = __raw_get_cpu_var(tvec_bases); 524 timer->base = __raw_get_cpu_var(tvec_bases);
@@ -521,6 +527,7 @@ static void __init_timer(struct timer_list *timer)
521 timer->start_pid = -1; 527 timer->start_pid = -1;
522 memset(timer->start_comm, 0, TASK_COMM_LEN); 528 memset(timer->start_comm, 0, TASK_COMM_LEN);
523#endif 529#endif
530 lockdep_init_map(&timer->lockdep_map, name, key, 0);
524} 531}
525 532
526/** 533/**
@@ -530,19 +537,23 @@ static void __init_timer(struct timer_list *timer)
530 * init_timer() must be done to a timer prior calling *any* of the 537 * init_timer() must be done to a timer prior calling *any* of the
531 * other timer functions. 538 * other timer functions.
532 */ 539 */
533void init_timer(struct timer_list *timer) 540void init_timer_key(struct timer_list *timer,
541 const char *name,
542 struct lock_class_key *key)
534{ 543{
535 debug_timer_init(timer); 544 debug_timer_init(timer);
536 __init_timer(timer); 545 __init_timer(timer, name, key);
537} 546}
538EXPORT_SYMBOL(init_timer); 547EXPORT_SYMBOL(init_timer_key);
539 548
540void init_timer_deferrable(struct timer_list *timer) 549void init_timer_deferrable_key(struct timer_list *timer,
550 const char *name,
551 struct lock_class_key *key)
541{ 552{
542 init_timer(timer); 553 init_timer_key(timer, name, key);
543 timer_set_deferrable(timer); 554 timer_set_deferrable(timer);
544} 555}
545EXPORT_SYMBOL(init_timer_deferrable); 556EXPORT_SYMBOL(init_timer_deferrable_key);
546 557
547static inline void detach_timer(struct timer_list *timer, 558static inline void detach_timer(struct timer_list *timer,
548 int clear_pending) 559 int clear_pending)
@@ -789,6 +800,15 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
789 */ 800 */
790int del_timer_sync(struct timer_list *timer) 801int del_timer_sync(struct timer_list *timer)
791{ 802{
803#ifdef CONFIG_LOCKDEP
804 unsigned long flags;
805
806 local_irq_save(flags);
807 lock_map_acquire(&timer->lockdep_map);
808 lock_map_release(&timer->lockdep_map);
809 local_irq_restore(flags);
810#endif
811
792 for (;;) { 812 for (;;) {
793 int ret = try_to_del_timer_sync(timer); 813 int ret = try_to_del_timer_sync(timer);
794 if (ret >= 0) 814 if (ret >= 0)
@@ -861,10 +881,36 @@ static inline void __run_timers(struct tvec_base *base)
861 881
862 set_running_timer(base, timer); 882 set_running_timer(base, timer);
863 detach_timer(timer, 1); 883 detach_timer(timer, 1);
884
864 spin_unlock_irq(&base->lock); 885 spin_unlock_irq(&base->lock);
865 { 886 {
866 int preempt_count = preempt_count(); 887 int preempt_count = preempt_count();
888
889#ifdef CONFIG_LOCKDEP
890 /*
891 * It is permissible to free the timer from
892 * inside the function that is called from
893 * it, this we need to take into account for
894 * lockdep too. To avoid bogus "held lock
895 * freed" warnings as well as problems when
896 * looking into timer->lockdep_map, make a
897 * copy and use that here.
898 */
899 struct lockdep_map lockdep_map =
900 timer->lockdep_map;
901#endif
902 /*
903 * Couple the lock chain with the lock chain at
904 * del_timer_sync() by acquiring the lock_map
905 * around the fn() call here and in
906 * del_timer_sync().
907 */
908 lock_map_acquire(&lockdep_map);
909
867 fn(data); 910 fn(data);
911
912 lock_map_release(&lockdep_map);
913
868 if (preempt_count != preempt_count()) { 914 if (preempt_count != preempt_count()) {
869 printk(KERN_ERR "huh, entered %p " 915 printk(KERN_ERR "huh, entered %p "
870 "with preempt_count %08x, exited" 916 "with preempt_count %08x, exited"
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 34e707e5ab87..b0a46f889659 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -9,6 +9,9 @@ config USER_STACKTRACE_SUPPORT
9config NOP_TRACER 9config NOP_TRACER
10 bool 10 bool
11 11
12config HAVE_FTRACE_NMI_ENTER
13 bool
14
12config HAVE_FUNCTION_TRACER 15config HAVE_FUNCTION_TRACER
13 bool 16 bool
14 17
@@ -31,12 +34,20 @@ config HAVE_FTRACE_MCOUNT_RECORD
31config HAVE_HW_BRANCH_TRACER 34config HAVE_HW_BRANCH_TRACER
32 bool 35 bool
33 36
37config HAVE_FTRACE_SYSCALLS
38 bool
39
34config TRACER_MAX_TRACE 40config TRACER_MAX_TRACE
35 bool 41 bool
36 42
37config RING_BUFFER 43config RING_BUFFER
38 bool 44 bool
39 45
46config FTRACE_NMI_ENTER
47 bool
48 depends on HAVE_FTRACE_NMI_ENTER
49 default y
50
40config TRACING 51config TRACING
41 bool 52 bool
42 select DEBUG_FS 53 select DEBUG_FS
@@ -44,13 +55,25 @@ config TRACING
44 select STACKTRACE if STACKTRACE_SUPPORT 55 select STACKTRACE if STACKTRACE_SUPPORT
45 select TRACEPOINTS 56 select TRACEPOINTS
46 select NOP_TRACER 57 select NOP_TRACER
58 select BINARY_PRINTF
59
60#
61# Minimum requirements an architecture has to meet for us to
62# be able to offer generic tracing facilities:
63#
64config TRACING_SUPPORT
65 bool
66 depends on TRACE_IRQFLAGS_SUPPORT
67 depends on STACKTRACE_SUPPORT
68 default y
69
70if TRACING_SUPPORT
47 71
48menu "Tracers" 72menu "Tracers"
49 73
50config FUNCTION_TRACER 74config FUNCTION_TRACER
51 bool "Kernel Function Tracer" 75 bool "Kernel Function Tracer"
52 depends on HAVE_FUNCTION_TRACER 76 depends on HAVE_FUNCTION_TRACER
53 depends on DEBUG_KERNEL
54 select FRAME_POINTER 77 select FRAME_POINTER
55 select KALLSYMS 78 select KALLSYMS
56 select TRACING 79 select TRACING
@@ -83,7 +106,6 @@ config IRQSOFF_TRACER
83 default n 106 default n
84 depends on TRACE_IRQFLAGS_SUPPORT 107 depends on TRACE_IRQFLAGS_SUPPORT
85 depends on GENERIC_TIME 108 depends on GENERIC_TIME
86 depends on DEBUG_KERNEL
87 select TRACE_IRQFLAGS 109 select TRACE_IRQFLAGS
88 select TRACING 110 select TRACING
89 select TRACER_MAX_TRACE 111 select TRACER_MAX_TRACE
@@ -106,7 +128,6 @@ config PREEMPT_TRACER
106 default n 128 default n
107 depends on GENERIC_TIME 129 depends on GENERIC_TIME
108 depends on PREEMPT 130 depends on PREEMPT
109 depends on DEBUG_KERNEL
110 select TRACING 131 select TRACING
111 select TRACER_MAX_TRACE 132 select TRACER_MAX_TRACE
112 help 133 help
@@ -127,13 +148,13 @@ config SYSPROF_TRACER
127 bool "Sysprof Tracer" 148 bool "Sysprof Tracer"
128 depends on X86 149 depends on X86
129 select TRACING 150 select TRACING
151 select CONTEXT_SWITCH_TRACER
130 help 152 help
131 This tracer provides the trace needed by the 'Sysprof' userspace 153 This tracer provides the trace needed by the 'Sysprof' userspace
132 tool. 154 tool.
133 155
134config SCHED_TRACER 156config SCHED_TRACER
135 bool "Scheduling Latency Tracer" 157 bool "Scheduling Latency Tracer"
136 depends on DEBUG_KERNEL
137 select TRACING 158 select TRACING
138 select CONTEXT_SWITCH_TRACER 159 select CONTEXT_SWITCH_TRACER
139 select TRACER_MAX_TRACE 160 select TRACER_MAX_TRACE
@@ -143,16 +164,30 @@ config SCHED_TRACER
143 164
144config CONTEXT_SWITCH_TRACER 165config CONTEXT_SWITCH_TRACER
145 bool "Trace process context switches" 166 bool "Trace process context switches"
146 depends on DEBUG_KERNEL
147 select TRACING 167 select TRACING
148 select MARKERS 168 select MARKERS
149 help 169 help
150 This tracer gets called from the context switch and records 170 This tracer gets called from the context switch and records
151 all switching of tasks. 171 all switching of tasks.
152 172
173config EVENT_TRACER
174 bool "Trace various events in the kernel"
175 select TRACING
176 help
177 This tracer hooks to various trace points in the kernel
178 allowing the user to pick and choose which trace point they
179 want to trace.
180
181config FTRACE_SYSCALLS
182 bool "Trace syscalls"
183 depends on HAVE_FTRACE_SYSCALLS
184 select TRACING
185 select KALLSYMS
186 help
187 Basic tracer to catch the syscall entry and exit events.
188
153config BOOT_TRACER 189config BOOT_TRACER
154 bool "Trace boot initcalls" 190 bool "Trace boot initcalls"
155 depends on DEBUG_KERNEL
156 select TRACING 191 select TRACING
157 select CONTEXT_SWITCH_TRACER 192 select CONTEXT_SWITCH_TRACER
158 help 193 help
@@ -165,13 +200,11 @@ config BOOT_TRACER
165 representation of the delays during initcalls - but the raw 200 representation of the delays during initcalls - but the raw
166 /debug/tracing/trace text output is readable too. 201 /debug/tracing/trace text output is readable too.
167 202
168 ( Note that tracing self tests can't be enabled if this tracer is 203 You must pass in ftrace=initcall to the kernel command line
169 selected, because the self-tests are an initcall as well and that 204 to enable this on bootup.
170 would invalidate the boot trace. )
171 205
172config TRACE_BRANCH_PROFILING 206config TRACE_BRANCH_PROFILING
173 bool "Trace likely/unlikely profiler" 207 bool "Trace likely/unlikely profiler"
174 depends on DEBUG_KERNEL
175 select TRACING 208 select TRACING
176 help 209 help
177 This tracer profiles all the the likely and unlikely macros 210 This tracer profiles all the the likely and unlikely macros
@@ -224,7 +257,6 @@ config BRANCH_TRACER
224 257
225config POWER_TRACER 258config POWER_TRACER
226 bool "Trace power consumption behavior" 259 bool "Trace power consumption behavior"
227 depends on DEBUG_KERNEL
228 depends on X86 260 depends on X86
229 select TRACING 261 select TRACING
230 help 262 help
@@ -236,7 +268,6 @@ config POWER_TRACER
236config STACK_TRACER 268config STACK_TRACER
237 bool "Trace max stack" 269 bool "Trace max stack"
238 depends on HAVE_FUNCTION_TRACER 270 depends on HAVE_FUNCTION_TRACER
239 depends on DEBUG_KERNEL
240 select FUNCTION_TRACER 271 select FUNCTION_TRACER
241 select STACKTRACE 272 select STACKTRACE
242 select KALLSYMS 273 select KALLSYMS
@@ -266,11 +297,66 @@ config HW_BRANCH_TRACER
266 This tracer records all branches on the system in a circular 297 This tracer records all branches on the system in a circular
267 buffer giving access to the last N branches for each cpu. 298 buffer giving access to the last N branches for each cpu.
268 299
300config KMEMTRACE
301 bool "Trace SLAB allocations"
302 select TRACING
303 help
304 kmemtrace provides tracing for slab allocator functions, such as
305 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected
306 data is then fed to the userspace application in order to analyse
307 allocation hotspots, internal fragmentation and so on, making it
308 possible to see how well an allocator performs, as well as debug
309 and profile kernel code.
310
311 This requires an userspace application to use. See
312 Documentation/vm/kmemtrace.txt for more information.
313
314 Saying Y will make the kernel somewhat larger and slower. However,
315 if you disable kmemtrace at run-time or boot-time, the performance
316 impact is minimal (depending on the arch the kernel is built for).
317
318 If unsure, say N.
319
320config WORKQUEUE_TRACER
321 bool "Trace workqueues"
322 select TRACING
323 help
324 The workqueue tracer provides some statistical informations
325 about each cpu workqueue thread such as the number of the
326 works inserted and executed since their creation. It can help
327 to evaluate the amount of work each of them have to perform.
328 For example it can help a developer to decide whether he should
329 choose a per cpu workqueue instead of a singlethreaded one.
330
331config BLK_DEV_IO_TRACE
332 bool "Support for tracing block io actions"
333 depends on SYSFS
334 depends on BLOCK
335 select RELAY
336 select DEBUG_FS
337 select TRACEPOINTS
338 select TRACING
339 select STACKTRACE
340 help
341 Say Y here if you want to be able to trace the block layer actions
342 on a given queue. Tracing allows you to see any traffic happening
343 on a block device queue. For more information (and the userspace
344 support tools needed), fetch the blktrace tools from:
345
346 git://git.kernel.dk/blktrace.git
347
348 Tracing also is possible using the ftrace interface, e.g.:
349
350 echo 1 > /sys/block/sda/sda1/trace/enable
351 echo blk > /sys/kernel/debug/tracing/current_tracer
352 cat /sys/kernel/debug/tracing/trace_pipe
353
354 If unsure, say N.
355
269config DYNAMIC_FTRACE 356config DYNAMIC_FTRACE
270 bool "enable/disable ftrace tracepoints dynamically" 357 bool "enable/disable ftrace tracepoints dynamically"
271 depends on FUNCTION_TRACER 358 depends on FUNCTION_TRACER
272 depends on HAVE_DYNAMIC_FTRACE 359 depends on HAVE_DYNAMIC_FTRACE
273 depends on DEBUG_KERNEL
274 default y 360 default y
275 help 361 help
276 This option will modify all the calls to ftrace dynamically 362 This option will modify all the calls to ftrace dynamically
@@ -296,7 +382,7 @@ config FTRACE_SELFTEST
296 382
297config FTRACE_STARTUP_TEST 383config FTRACE_STARTUP_TEST
298 bool "Perform a startup test on ftrace" 384 bool "Perform a startup test on ftrace"
299 depends on TRACING && DEBUG_KERNEL && !BOOT_TRACER 385 depends on TRACING
300 select FTRACE_SELFTEST 386 select FTRACE_SELFTEST
301 help 387 help
302 This option performs a series of startup tests on ftrace. On bootup 388 This option performs a series of startup tests on ftrace. On bootup
@@ -306,7 +392,7 @@ config FTRACE_STARTUP_TEST
306 392
307config MMIOTRACE 393config MMIOTRACE
308 bool "Memory mapped IO tracing" 394 bool "Memory mapped IO tracing"
309 depends on HAVE_MMIOTRACE_SUPPORT && DEBUG_KERNEL && PCI 395 depends on HAVE_MMIOTRACE_SUPPORT && PCI
310 select TRACING 396 select TRACING
311 help 397 help
312 Mmiotrace traces Memory Mapped I/O access and is meant for 398 Mmiotrace traces Memory Mapped I/O access and is meant for
@@ -328,3 +414,6 @@ config MMIOTRACE_TEST
328 Say N, unless you absolutely know what you are doing. 414 Say N, unless you absolutely know what you are doing.
329 415
330endmenu 416endmenu
417
418endif # TRACING_SUPPORT
419
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 349d5a93653f..c3feea01c3e0 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -19,6 +19,10 @@ obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
19obj-$(CONFIG_RING_BUFFER) += ring_buffer.o 19obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
20 20
21obj-$(CONFIG_TRACING) += trace.o 21obj-$(CONFIG_TRACING) += trace.o
22obj-$(CONFIG_TRACING) += trace_clock.o
23obj-$(CONFIG_TRACING) += trace_output.o
24obj-$(CONFIG_TRACING) += trace_stat.o
25obj-$(CONFIG_TRACING) += trace_printk.o
22obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o 26obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
23obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o 27obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
24obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o 28obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
@@ -33,5 +37,12 @@ obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
33obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o 37obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
34obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o 38obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
35obj-$(CONFIG_POWER_TRACER) += trace_power.o 39obj-$(CONFIG_POWER_TRACER) += trace_power.o
40obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
41obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
42obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
43obj-$(CONFIG_EVENT_TRACER) += trace_events.o
44obj-$(CONFIG_EVENT_TRACER) += events.o
45obj-$(CONFIG_EVENT_TRACER) += trace_export.o
46obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
36 47
37libftrace-y := ftrace.o 48libftrace-y := ftrace.o
diff --git a/block/blktrace.c b/kernel/trace/blktrace.c
index 7cf9d1ff45a0..b171778e3863 100644
--- a/block/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -24,10 +24,28 @@
24#include <linux/debugfs.h> 24#include <linux/debugfs.h>
25#include <linux/time.h> 25#include <linux/time.h>
26#include <trace/block.h> 26#include <trace/block.h>
27#include <asm/uaccess.h> 27#include <linux/uaccess.h>
28#include "trace_output.h"
28 29
29static unsigned int blktrace_seq __read_mostly = 1; 30static unsigned int blktrace_seq __read_mostly = 1;
30 31
32static struct trace_array *blk_tr;
33static int __read_mostly blk_tracer_enabled;
34
35/* Select an alternative, minimalistic output than the original one */
36#define TRACE_BLK_OPT_CLASSIC 0x1
37
38static struct tracer_opt blk_tracer_opts[] = {
39 /* Default disable the minimalistic output */
40 { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) },
41 { }
42};
43
44static struct tracer_flags blk_tracer_flags = {
45 .val = 0,
46 .opts = blk_tracer_opts,
47};
48
31/* Global reference count of probes */ 49/* Global reference count of probes */
32static DEFINE_MUTEX(blk_probe_mutex); 50static DEFINE_MUTEX(blk_probe_mutex);
33static atomic_t blk_probes_ref = ATOMIC_INIT(0); 51static atomic_t blk_probes_ref = ATOMIC_INIT(0);
@@ -43,6 +61,9 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
43{ 61{
44 struct blk_io_trace *t; 62 struct blk_io_trace *t;
45 63
64 if (!bt->rchan)
65 return;
66
46 t = relay_reserve(bt->rchan, sizeof(*t) + len); 67 t = relay_reserve(bt->rchan, sizeof(*t) + len);
47 if (t) { 68 if (t) {
48 const int cpu = smp_processor_id(); 69 const int cpu = smp_processor_id();
@@ -90,6 +111,16 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
90 unsigned long flags; 111 unsigned long flags;
91 char *buf; 112 char *buf;
92 113
114 if (blk_tr) {
115 va_start(args, fmt);
116 ftrace_vprintk(fmt, args);
117 va_end(args);
118 return;
119 }
120
121 if (!bt->msg_data)
122 return;
123
93 local_irq_save(flags); 124 local_irq_save(flags);
94 buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); 125 buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
95 va_start(args, fmt); 126 va_start(args, fmt);
@@ -117,11 +148,12 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
117/* 148/*
118 * Data direction bit lookup 149 * Data direction bit lookup
119 */ 150 */
120static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) }; 151static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ),
152 BLK_TC_ACT(BLK_TC_WRITE) };
121 153
122/* The ilog2() calls fall out because they're constant */ 154/* The ilog2() calls fall out because they're constant */
123#define MASK_TC_BIT(rw, __name) ( (rw & (1 << BIO_RW_ ## __name)) << \ 155#define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \
124 (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name) ) 156 (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name))
125 157
126/* 158/*
127 * The worker for the various blk_add_trace*() types. Fills out a 159 * The worker for the various blk_add_trace*() types. Fills out a
@@ -131,13 +163,15 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
131 int rw, u32 what, int error, int pdu_len, void *pdu_data) 163 int rw, u32 what, int error, int pdu_len, void *pdu_data)
132{ 164{
133 struct task_struct *tsk = current; 165 struct task_struct *tsk = current;
166 struct ring_buffer_event *event = NULL;
134 struct blk_io_trace *t; 167 struct blk_io_trace *t;
135 unsigned long flags; 168 unsigned long flags = 0;
136 unsigned long *sequence; 169 unsigned long *sequence;
137 pid_t pid; 170 pid_t pid;
138 int cpu; 171 int cpu, pc = 0;
139 172
140 if (unlikely(bt->trace_state != Blktrace_running)) 173 if (unlikely(bt->trace_state != Blktrace_running ||
174 !blk_tracer_enabled))
141 return; 175 return;
142 176
143 what |= ddir_act[rw & WRITE]; 177 what |= ddir_act[rw & WRITE];
@@ -150,6 +184,20 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
150 pid = tsk->pid; 184 pid = tsk->pid;
151 if (unlikely(act_log_check(bt, what, sector, pid))) 185 if (unlikely(act_log_check(bt, what, sector, pid)))
152 return; 186 return;
187 cpu = raw_smp_processor_id();
188
189 if (blk_tr) {
190 tracing_record_cmdline(current);
191
192 pc = preempt_count();
193 event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK,
194 sizeof(*t) + pdu_len,
195 0, pc);
196 if (!event)
197 return;
198 t = ring_buffer_event_data(event);
199 goto record_it;
200 }
153 201
154 /* 202 /*
155 * A word about the locking here - we disable interrupts to reserve 203 * A word about the locking here - we disable interrupts to reserve
@@ -163,23 +211,35 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
163 211
164 t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); 212 t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
165 if (t) { 213 if (t) {
166 cpu = smp_processor_id();
167 sequence = per_cpu_ptr(bt->sequence, cpu); 214 sequence = per_cpu_ptr(bt->sequence, cpu);
168 215
169 t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; 216 t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
170 t->sequence = ++(*sequence); 217 t->sequence = ++(*sequence);
171 t->time = ktime_to_ns(ktime_get()); 218 t->time = ktime_to_ns(ktime_get());
219record_it:
220 /*
221 * These two are not needed in ftrace as they are in the
222 * generic trace_entry, filled by tracing_generic_entry_update,
223 * but for the trace_event->bin() synthesizer benefit we do it
224 * here too.
225 */
226 t->cpu = cpu;
227 t->pid = pid;
228
172 t->sector = sector; 229 t->sector = sector;
173 t->bytes = bytes; 230 t->bytes = bytes;
174 t->action = what; 231 t->action = what;
175 t->pid = pid;
176 t->device = bt->dev; 232 t->device = bt->dev;
177 t->cpu = cpu;
178 t->error = error; 233 t->error = error;
179 t->pdu_len = pdu_len; 234 t->pdu_len = pdu_len;
180 235
181 if (pdu_len) 236 if (pdu_len)
182 memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); 237 memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
238
239 if (blk_tr) {
240 trace_buffer_unlock_commit(blk_tr, event, 0, pc);
241 return;
242 }
183 } 243 }
184 244
185 local_irq_restore(flags); 245 local_irq_restore(flags);
@@ -363,7 +423,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
363 if (!bt->sequence) 423 if (!bt->sequence)
364 goto err; 424 goto err;
365 425
366 bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG); 426 bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
367 if (!bt->msg_data) 427 if (!bt->msg_data)
368 goto err; 428 goto err;
369 429
@@ -385,7 +445,8 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
385 atomic_set(&bt->dropped, 0); 445 atomic_set(&bt->dropped, 0);
386 446
387 ret = -EIO; 447 ret = -EIO;
388 bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops); 448 bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt,
449 &blk_dropped_fops);
389 if (!bt->dropped_file) 450 if (!bt->dropped_file)
390 goto err; 451 goto err;
391 452
@@ -467,10 +528,10 @@ EXPORT_SYMBOL_GPL(blk_trace_setup);
467 528
468int blk_trace_startstop(struct request_queue *q, int start) 529int blk_trace_startstop(struct request_queue *q, int start)
469{ 530{
470 struct blk_trace *bt;
471 int ret; 531 int ret;
532 struct blk_trace *bt = q->blk_trace;
472 533
473 if ((bt = q->blk_trace) == NULL) 534 if (bt == NULL)
474 return -EINVAL; 535 return -EINVAL;
475 536
476 /* 537 /*
@@ -503,7 +564,7 @@ EXPORT_SYMBOL_GPL(blk_trace_startstop);
503/** 564/**
504 * blk_trace_ioctl: - handle the ioctls associated with tracing 565 * blk_trace_ioctl: - handle the ioctls associated with tracing
505 * @bdev: the block device 566 * @bdev: the block device
506 * @cmd: the ioctl cmd 567 * @cmd: the ioctl cmd
507 * @arg: the argument data, if any 568 * @arg: the argument data, if any
508 * 569 *
509 **/ 570 **/
@@ -606,12 +667,14 @@ static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq)
606 blk_add_trace_rq(q, rq, BLK_TA_ISSUE); 667 blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
607} 668}
608 669
609static void blk_add_trace_rq_requeue(struct request_queue *q, struct request *rq) 670static void blk_add_trace_rq_requeue(struct request_queue *q,
671 struct request *rq)
610{ 672{
611 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); 673 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
612} 674}
613 675
614static void blk_add_trace_rq_complete(struct request_queue *q, struct request *rq) 676static void blk_add_trace_rq_complete(struct request_queue *q,
677 struct request *rq)
615{ 678{
616 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); 679 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
617} 680}
@@ -648,12 +711,14 @@ static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio)
648 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); 711 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
649} 712}
650 713
651static void blk_add_trace_bio_backmerge(struct request_queue *q, struct bio *bio) 714static void blk_add_trace_bio_backmerge(struct request_queue *q,
715 struct bio *bio)
652{ 716{
653 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); 717 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
654} 718}
655 719
656static void blk_add_trace_bio_frontmerge(struct request_queue *q, struct bio *bio) 720static void blk_add_trace_bio_frontmerge(struct request_queue *q,
721 struct bio *bio)
657{ 722{
658 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); 723 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
659} 724}
@@ -663,7 +728,8 @@ static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio)
663 blk_add_trace_bio(q, bio, BLK_TA_QUEUE); 728 blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
664} 729}
665 730
666static void blk_add_trace_getrq(struct request_queue *q, struct bio *bio, int rw) 731static void blk_add_trace_getrq(struct request_queue *q,
732 struct bio *bio, int rw)
667{ 733{
668 if (bio) 734 if (bio)
669 blk_add_trace_bio(q, bio, BLK_TA_GETRQ); 735 blk_add_trace_bio(q, bio, BLK_TA_GETRQ);
@@ -676,7 +742,8 @@ static void blk_add_trace_getrq(struct request_queue *q, struct bio *bio, int rw
676} 742}
677 743
678 744
679static void blk_add_trace_sleeprq(struct request_queue *q, struct bio *bio, int rw) 745static void blk_add_trace_sleeprq(struct request_queue *q,
746 struct bio *bio, int rw)
680{ 747{
681 if (bio) 748 if (bio)
682 blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ); 749 blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ);
@@ -684,7 +751,8 @@ static void blk_add_trace_sleeprq(struct request_queue *q, struct bio *bio, int
684 struct blk_trace *bt = q->blk_trace; 751 struct blk_trace *bt = q->blk_trace;
685 752
686 if (bt) 753 if (bt)
687 __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, 0, 0, NULL); 754 __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ,
755 0, 0, NULL);
688 } 756 }
689} 757}
690 758
@@ -858,3 +926,625 @@ static void blk_unregister_tracepoints(void)
858 926
859 tracepoint_synchronize_unregister(); 927 tracepoint_synchronize_unregister();
860} 928}
929
930/*
931 * struct blk_io_tracer formatting routines
932 */
933
934static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
935{
936 int i = 0;
937
938 if (t->action & BLK_TC_DISCARD)
939 rwbs[i++] = 'D';
940 else if (t->action & BLK_TC_WRITE)
941 rwbs[i++] = 'W';
942 else if (t->bytes)
943 rwbs[i++] = 'R';
944 else
945 rwbs[i++] = 'N';
946
947 if (t->action & BLK_TC_AHEAD)
948 rwbs[i++] = 'A';
949 if (t->action & BLK_TC_BARRIER)
950 rwbs[i++] = 'B';
951 if (t->action & BLK_TC_SYNC)
952 rwbs[i++] = 'S';
953 if (t->action & BLK_TC_META)
954 rwbs[i++] = 'M';
955
956 rwbs[i] = '\0';
957}
958
959static inline
960const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent)
961{
962 return (const struct blk_io_trace *)ent;
963}
964
965static inline const void *pdu_start(const struct trace_entry *ent)
966{
967 return te_blk_io_trace(ent) + 1;
968}
969
970static inline u32 t_sec(const struct trace_entry *ent)
971{
972 return te_blk_io_trace(ent)->bytes >> 9;
973}
974
975static inline unsigned long long t_sector(const struct trace_entry *ent)
976{
977 return te_blk_io_trace(ent)->sector;
978}
979
980static inline __u16 t_error(const struct trace_entry *ent)
981{
982 return te_blk_io_trace(ent)->sector;
983}
984
985static __u64 get_pdu_int(const struct trace_entry *ent)
986{
987 const __u64 *val = pdu_start(ent);
988 return be64_to_cpu(*val);
989}
990
991static void get_pdu_remap(const struct trace_entry *ent,
992 struct blk_io_trace_remap *r)
993{
994 const struct blk_io_trace_remap *__r = pdu_start(ent);
995 __u64 sector = __r->sector;
996
997 r->device = be32_to_cpu(__r->device);
998 r->device_from = be32_to_cpu(__r->device_from);
999 r->sector = be64_to_cpu(sector);
1000}
1001
1002static int blk_log_action_iter(struct trace_iterator *iter, const char *act)
1003{
1004 char rwbs[6];
1005 unsigned long long ts = ns2usecs(iter->ts);
1006 unsigned long usec_rem = do_div(ts, USEC_PER_SEC);
1007 unsigned secs = (unsigned long)ts;
1008 const struct trace_entry *ent = iter->ent;
1009 const struct blk_io_trace *t = (const struct blk_io_trace *)ent;
1010
1011 fill_rwbs(rwbs, t);
1012
1013 return trace_seq_printf(&iter->seq,
1014 "%3d,%-3d %2d %5d.%06lu %5u %2s %3s ",
1015 MAJOR(t->device), MINOR(t->device), iter->cpu,
1016 secs, usec_rem, ent->pid, act, rwbs);
1017}
1018
1019static int blk_log_action_seq(struct trace_seq *s, const struct blk_io_trace *t,
1020 const char *act)
1021{
1022 char rwbs[6];
1023 fill_rwbs(rwbs, t);
1024 return trace_seq_printf(s, "%3d,%-3d %2s %3s ",
1025 MAJOR(t->device), MINOR(t->device), act, rwbs);
1026}
1027
1028static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
1029{
1030 char cmd[TASK_COMM_LEN];
1031
1032 trace_find_cmdline(ent->pid, cmd);
1033
1034 if (t_sec(ent))
1035 return trace_seq_printf(s, "%llu + %u [%s]\n",
1036 t_sector(ent), t_sec(ent), cmd);
1037 return trace_seq_printf(s, "[%s]\n", cmd);
1038}
1039
1040static int blk_log_with_error(struct trace_seq *s,
1041 const struct trace_entry *ent)
1042{
1043 if (t_sec(ent))
1044 return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent),
1045 t_sec(ent), t_error(ent));
1046 return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent));
1047}
1048
1049static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
1050{
1051 struct blk_io_trace_remap r = { .device = 0, };
1052
1053 get_pdu_remap(ent, &r);
1054 return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
1055 t_sector(ent),
1056 t_sec(ent), MAJOR(r.device), MINOR(r.device),
1057 (unsigned long long)r.sector);
1058}
1059
1060static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
1061{
1062 char cmd[TASK_COMM_LEN];
1063
1064 trace_find_cmdline(ent->pid, cmd);
1065
1066 return trace_seq_printf(s, "[%s]\n", cmd);
1067}
1068
1069static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
1070{
1071 char cmd[TASK_COMM_LEN];
1072
1073 trace_find_cmdline(ent->pid, cmd);
1074
1075 return trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent));
1076}
1077
1078static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
1079{
1080 char cmd[TASK_COMM_LEN];
1081
1082 trace_find_cmdline(ent->pid, cmd);
1083
1084 return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
1085 get_pdu_int(ent), cmd);
1086}
1087
1088/*
1089 * struct tracer operations
1090 */
1091
1092static void blk_tracer_print_header(struct seq_file *m)
1093{
1094 if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
1095 return;
1096 seq_puts(m, "# DEV CPU TIMESTAMP PID ACT FLG\n"
1097 "# | | | | | |\n");
1098}
1099
1100static void blk_tracer_start(struct trace_array *tr)
1101{
1102 mutex_lock(&blk_probe_mutex);
1103 if (atomic_add_return(1, &blk_probes_ref) == 1)
1104 if (blk_register_tracepoints())
1105 atomic_dec(&blk_probes_ref);
1106 mutex_unlock(&blk_probe_mutex);
1107 trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
1108}
1109
1110static int blk_tracer_init(struct trace_array *tr)
1111{
1112 blk_tr = tr;
1113 blk_tracer_start(tr);
1114 mutex_lock(&blk_probe_mutex);
1115 blk_tracer_enabled++;
1116 mutex_unlock(&blk_probe_mutex);
1117 return 0;
1118}
1119
1120static void blk_tracer_stop(struct trace_array *tr)
1121{
1122 trace_flags |= TRACE_ITER_CONTEXT_INFO;
1123 mutex_lock(&blk_probe_mutex);
1124 if (atomic_dec_and_test(&blk_probes_ref))
1125 blk_unregister_tracepoints();
1126 mutex_unlock(&blk_probe_mutex);
1127}
1128
1129static void blk_tracer_reset(struct trace_array *tr)
1130{
1131 if (!atomic_read(&blk_probes_ref))
1132 return;
1133
1134 mutex_lock(&blk_probe_mutex);
1135 blk_tracer_enabled--;
1136 WARN_ON(blk_tracer_enabled < 0);
1137 mutex_unlock(&blk_probe_mutex);
1138
1139 blk_tracer_stop(tr);
1140}
1141
1142static struct {
1143 const char *act[2];
1144 int (*print)(struct trace_seq *s, const struct trace_entry *ent);
1145} what2act[] __read_mostly = {
1146 [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic },
1147 [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic },
1148 [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic },
1149 [__BLK_TA_GETRQ] = {{ "G", "getrq" }, blk_log_generic },
1150 [__BLK_TA_SLEEPRQ] = {{ "S", "sleeprq" }, blk_log_generic },
1151 [__BLK_TA_REQUEUE] = {{ "R", "requeue" }, blk_log_with_error },
1152 [__BLK_TA_ISSUE] = {{ "D", "issue" }, blk_log_generic },
1153 [__BLK_TA_COMPLETE] = {{ "C", "complete" }, blk_log_with_error },
1154 [__BLK_TA_PLUG] = {{ "P", "plug" }, blk_log_plug },
1155 [__BLK_TA_UNPLUG_IO] = {{ "U", "unplug_io" }, blk_log_unplug },
1156 [__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug },
1157 [__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic },
1158 [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split },
1159 [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic },
1160 [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap },
1161};
1162
1163static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
1164 int flags)
1165{
1166 struct trace_seq *s = &iter->seq;
1167 const struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
1168 const u16 what = t->action & ((1 << BLK_TC_SHIFT) - 1);
1169 int ret;
1170
1171 if (!trace_print_context(iter))
1172 return TRACE_TYPE_PARTIAL_LINE;
1173
1174 if (unlikely(what == 0 || what > ARRAY_SIZE(what2act)))
1175 ret = trace_seq_printf(s, "Bad pc action %x\n", what);
1176 else {
1177 const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE);
1178 ret = blk_log_action_seq(s, t, what2act[what].act[long_act]);
1179 if (ret)
1180 ret = what2act[what].print(s, iter->ent);
1181 }
1182
1183 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
1184}
1185
1186static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
1187{
1188 struct trace_seq *s = &iter->seq;
1189 struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
1190 const int offset = offsetof(struct blk_io_trace, sector);
1191 struct blk_io_trace old = {
1192 .magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION,
1193 .time = ns2usecs(iter->ts),
1194 };
1195
1196 if (!trace_seq_putmem(s, &old, offset))
1197 return 0;
1198 return trace_seq_putmem(s, &t->sector,
1199 sizeof(old) - offset + t->pdu_len);
1200}
1201
1202static enum print_line_t
1203blk_trace_event_print_binary(struct trace_iterator *iter, int flags)
1204{
1205 return blk_trace_synthesize_old_trace(iter) ?
1206 TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
1207}
1208
1209static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
1210{
1211 const struct blk_io_trace *t;
1212 u16 what;
1213 int ret;
1214
1215 if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
1216 return TRACE_TYPE_UNHANDLED;
1217
1218 t = (const struct blk_io_trace *)iter->ent;
1219 what = t->action & ((1 << BLK_TC_SHIFT) - 1);
1220
1221 if (unlikely(what == 0 || what > ARRAY_SIZE(what2act)))
1222 ret = trace_seq_printf(&iter->seq, "Bad pc action %x\n", what);
1223 else {
1224 const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE);
1225 ret = blk_log_action_iter(iter, what2act[what].act[long_act]);
1226 if (ret)
1227 ret = what2act[what].print(&iter->seq, iter->ent);
1228 }
1229
1230 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
1231}
1232
1233static struct tracer blk_tracer __read_mostly = {
1234 .name = "blk",
1235 .init = blk_tracer_init,
1236 .reset = blk_tracer_reset,
1237 .start = blk_tracer_start,
1238 .stop = blk_tracer_stop,
1239 .print_header = blk_tracer_print_header,
1240 .print_line = blk_tracer_print_line,
1241 .flags = &blk_tracer_flags,
1242};
1243
1244static struct trace_event trace_blk_event = {
1245 .type = TRACE_BLK,
1246 .trace = blk_trace_event_print,
1247 .binary = blk_trace_event_print_binary,
1248};
1249
1250static int __init init_blk_tracer(void)
1251{
1252 if (!register_ftrace_event(&trace_blk_event)) {
1253 pr_warning("Warning: could not register block events\n");
1254 return 1;
1255 }
1256
1257 if (register_tracer(&blk_tracer) != 0) {
1258 pr_warning("Warning: could not register the block tracer\n");
1259 unregister_ftrace_event(&trace_blk_event);
1260 return 1;
1261 }
1262
1263 return 0;
1264}
1265
1266device_initcall(init_blk_tracer);
1267
1268static int blk_trace_remove_queue(struct request_queue *q)
1269{
1270 struct blk_trace *bt;
1271
1272 bt = xchg(&q->blk_trace, NULL);
1273 if (bt == NULL)
1274 return -EINVAL;
1275
1276 kfree(bt);
1277 return 0;
1278}
1279
1280/*
1281 * Setup everything required to start tracing
1282 */
1283static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
1284{
1285 struct blk_trace *old_bt, *bt = NULL;
1286 int ret;
1287
1288 ret = -ENOMEM;
1289 bt = kzalloc(sizeof(*bt), GFP_KERNEL);
1290 if (!bt)
1291 goto err;
1292
1293 bt->dev = dev;
1294 bt->act_mask = (u16)-1;
1295 bt->end_lba = -1ULL;
1296 bt->trace_state = Blktrace_running;
1297
1298 old_bt = xchg(&q->blk_trace, bt);
1299 if (old_bt != NULL) {
1300 (void)xchg(&q->blk_trace, old_bt);
1301 kfree(bt);
1302 ret = -EBUSY;
1303 }
1304 return 0;
1305err:
1306 return ret;
1307}
1308
1309/*
1310 * sysfs interface to enable and configure tracing
1311 */
1312
1313static ssize_t sysfs_blk_trace_enable_show(struct device *dev,
1314 struct device_attribute *attr,
1315 char *buf)
1316{
1317 struct hd_struct *p = dev_to_part(dev);
1318 struct block_device *bdev;
1319 ssize_t ret = -ENXIO;
1320
1321 lock_kernel();
1322 bdev = bdget(part_devt(p));
1323 if (bdev != NULL) {
1324 struct request_queue *q = bdev_get_queue(bdev);
1325
1326 if (q != NULL) {
1327 mutex_lock(&bdev->bd_mutex);
1328 ret = sprintf(buf, "%u\n", !!q->blk_trace);
1329 mutex_unlock(&bdev->bd_mutex);
1330 }
1331
1332 bdput(bdev);
1333 }
1334
1335 unlock_kernel();
1336 return ret;
1337}
1338
1339static ssize_t sysfs_blk_trace_enable_store(struct device *dev,
1340 struct device_attribute *attr,
1341 const char *buf, size_t count)
1342{
1343 struct block_device *bdev;
1344 struct request_queue *q;
1345 struct hd_struct *p;
1346 int value;
1347 ssize_t ret = -ENXIO;
1348
1349 if (count == 0 || sscanf(buf, "%d", &value) != 1)
1350 goto out;
1351
1352 lock_kernel();
1353 p = dev_to_part(dev);
1354 bdev = bdget(part_devt(p));
1355 if (bdev == NULL)
1356 goto out_unlock_kernel;
1357
1358 q = bdev_get_queue(bdev);
1359 if (q == NULL)
1360 goto out_bdput;
1361
1362 mutex_lock(&bdev->bd_mutex);
1363 if (value)
1364 ret = blk_trace_setup_queue(q, bdev->bd_dev);
1365 else
1366 ret = blk_trace_remove_queue(q);
1367 mutex_unlock(&bdev->bd_mutex);
1368
1369 if (ret == 0)
1370 ret = count;
1371out_bdput:
1372 bdput(bdev);
1373out_unlock_kernel:
1374 unlock_kernel();
1375out:
1376 return ret;
1377}
1378
1379static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
1380 struct device_attribute *attr,
1381 char *buf);
1382static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1383 struct device_attribute *attr,
1384 const char *buf, size_t count);
1385#define BLK_TRACE_DEVICE_ATTR(_name) \
1386 DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \
1387 sysfs_blk_trace_attr_show, \
1388 sysfs_blk_trace_attr_store)
1389
1390static DEVICE_ATTR(enable, S_IRUGO | S_IWUSR,
1391 sysfs_blk_trace_enable_show, sysfs_blk_trace_enable_store);
1392static BLK_TRACE_DEVICE_ATTR(act_mask);
1393static BLK_TRACE_DEVICE_ATTR(pid);
1394static BLK_TRACE_DEVICE_ATTR(start_lba);
1395static BLK_TRACE_DEVICE_ATTR(end_lba);
1396
1397static struct attribute *blk_trace_attrs[] = {
1398 &dev_attr_enable.attr,
1399 &dev_attr_act_mask.attr,
1400 &dev_attr_pid.attr,
1401 &dev_attr_start_lba.attr,
1402 &dev_attr_end_lba.attr,
1403 NULL
1404};
1405
1406struct attribute_group blk_trace_attr_group = {
1407 .name = "trace",
1408 .attrs = blk_trace_attrs,
1409};
1410
1411static int blk_str2act_mask(const char *str)
1412{
1413 int mask = 0;
1414 char *copy = kstrdup(str, GFP_KERNEL), *s;
1415
1416 if (copy == NULL)
1417 return -ENOMEM;
1418
1419 s = strstrip(copy);
1420
1421 while (1) {
1422 char *sep = strchr(s, ',');
1423
1424 if (sep != NULL)
1425 *sep = '\0';
1426
1427 if (strcasecmp(s, "barrier") == 0)
1428 mask |= BLK_TC_BARRIER;
1429 else if (strcasecmp(s, "complete") == 0)
1430 mask |= BLK_TC_COMPLETE;
1431 else if (strcasecmp(s, "fs") == 0)
1432 mask |= BLK_TC_FS;
1433 else if (strcasecmp(s, "issue") == 0)
1434 mask |= BLK_TC_ISSUE;
1435 else if (strcasecmp(s, "pc") == 0)
1436 mask |= BLK_TC_PC;
1437 else if (strcasecmp(s, "queue") == 0)
1438 mask |= BLK_TC_QUEUE;
1439 else if (strcasecmp(s, "read") == 0)
1440 mask |= BLK_TC_READ;
1441 else if (strcasecmp(s, "requeue") == 0)
1442 mask |= BLK_TC_REQUEUE;
1443 else if (strcasecmp(s, "sync") == 0)
1444 mask |= BLK_TC_SYNC;
1445 else if (strcasecmp(s, "write") == 0)
1446 mask |= BLK_TC_WRITE;
1447
1448 if (sep == NULL)
1449 break;
1450
1451 s = sep + 1;
1452 }
1453 kfree(copy);
1454
1455 return mask;
1456}
1457
1458static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
1459 struct device_attribute *attr,
1460 char *buf)
1461{
1462 struct hd_struct *p = dev_to_part(dev);
1463 struct request_queue *q;
1464 struct block_device *bdev;
1465 ssize_t ret = -ENXIO;
1466
1467 lock_kernel();
1468 bdev = bdget(part_devt(p));
1469 if (bdev == NULL)
1470 goto out_unlock_kernel;
1471
1472 q = bdev_get_queue(bdev);
1473 if (q == NULL)
1474 goto out_bdput;
1475 mutex_lock(&bdev->bd_mutex);
1476 if (q->blk_trace == NULL)
1477 ret = sprintf(buf, "disabled\n");
1478 else if (attr == &dev_attr_act_mask)
1479 ret = sprintf(buf, "%#x\n", q->blk_trace->act_mask);
1480 else if (attr == &dev_attr_pid)
1481 ret = sprintf(buf, "%u\n", q->blk_trace->pid);
1482 else if (attr == &dev_attr_start_lba)
1483 ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba);
1484 else if (attr == &dev_attr_end_lba)
1485 ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba);
1486 mutex_unlock(&bdev->bd_mutex);
1487out_bdput:
1488 bdput(bdev);
1489out_unlock_kernel:
1490 unlock_kernel();
1491 return ret;
1492}
1493
1494static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1495 struct device_attribute *attr,
1496 const char *buf, size_t count)
1497{
1498 struct block_device *bdev;
1499 struct request_queue *q;
1500 struct hd_struct *p;
1501 u64 value;
1502 ssize_t ret = -ENXIO;
1503
1504 if (count == 0)
1505 goto out;
1506
1507 if (attr == &dev_attr_act_mask) {
1508 if (sscanf(buf, "%llx", &value) != 1) {
1509 /* Assume it is a list of trace category names */
1510 value = blk_str2act_mask(buf);
1511 if (value < 0)
1512 goto out;
1513 }
1514 } else if (sscanf(buf, "%llu", &value) != 1)
1515 goto out;
1516
1517 lock_kernel();
1518 p = dev_to_part(dev);
1519 bdev = bdget(part_devt(p));
1520 if (bdev == NULL)
1521 goto out_unlock_kernel;
1522
1523 q = bdev_get_queue(bdev);
1524 if (q == NULL)
1525 goto out_bdput;
1526
1527 mutex_lock(&bdev->bd_mutex);
1528 ret = 0;
1529 if (q->blk_trace == NULL)
1530 ret = blk_trace_setup_queue(q, bdev->bd_dev);
1531
1532 if (ret == 0) {
1533 if (attr == &dev_attr_act_mask)
1534 q->blk_trace->act_mask = value;
1535 else if (attr == &dev_attr_pid)
1536 q->blk_trace->pid = value;
1537 else if (attr == &dev_attr_start_lba)
1538 q->blk_trace->start_lba = value;
1539 else if (attr == &dev_attr_end_lba)
1540 q->blk_trace->end_lba = value;
1541 ret = count;
1542 }
1543 mutex_unlock(&bdev->bd_mutex);
1544out_bdput:
1545 bdput(bdev);
1546out_unlock_kernel:
1547 unlock_kernel();
1548out:
1549 return ret;
1550}
diff --git a/kernel/trace/events.c b/kernel/trace/events.c
new file mode 100644
index 000000000000..9fc918da404f
--- /dev/null
+++ b/kernel/trace/events.c
@@ -0,0 +1,15 @@
1/*
2 * This is the place to register all trace points as events.
3 */
4
5#include <linux/stringify.h>
6
7#include <trace/trace_events.h>
8
9#include "trace_output.h"
10
11#include "trace_events_stage_1.h"
12#include "trace_events_stage_2.h"
13#include "trace_events_stage_3.h"
14
15#include <trace/trace_event_types.h>
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fdf913dfc7e8..7847806eefef 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -27,6 +27,7 @@
27#include <linux/sysctl.h> 27#include <linux/sysctl.h>
28#include <linux/ctype.h> 28#include <linux/ctype.h>
29#include <linux/list.h> 29#include <linux/list.h>
30#include <linux/hash.h>
30 31
31#include <asm/ftrace.h> 32#include <asm/ftrace.h>
32 33
@@ -44,14 +45,14 @@
44 ftrace_kill(); \ 45 ftrace_kill(); \
45 } while (0) 46 } while (0)
46 47
48/* hash bits for specific function selection */
49#define FTRACE_HASH_BITS 7
50#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS)
51
47/* ftrace_enabled is a method to turn ftrace on or off */ 52/* ftrace_enabled is a method to turn ftrace on or off */
48int ftrace_enabled __read_mostly; 53int ftrace_enabled __read_mostly;
49static int last_ftrace_enabled; 54static int last_ftrace_enabled;
50 55
51/* set when tracing only a pid */
52struct pid *ftrace_pid_trace;
53static struct pid * const ftrace_swapper_pid = &init_struct_pid;
54
55/* Quick disabling of function tracer. */ 56/* Quick disabling of function tracer. */
56int function_trace_stop; 57int function_trace_stop;
57 58
@@ -61,9 +62,7 @@ int function_trace_stop;
61 */ 62 */
62static int ftrace_disabled __read_mostly; 63static int ftrace_disabled __read_mostly;
63 64
64static DEFINE_SPINLOCK(ftrace_lock); 65static DEFINE_MUTEX(ftrace_lock);
65static DEFINE_MUTEX(ftrace_sysctl_lock);
66static DEFINE_MUTEX(ftrace_start_lock);
67 66
68static struct ftrace_ops ftrace_list_end __read_mostly = 67static struct ftrace_ops ftrace_list_end __read_mostly =
69{ 68{
@@ -134,9 +133,6 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
134 133
135static int __register_ftrace_function(struct ftrace_ops *ops) 134static int __register_ftrace_function(struct ftrace_ops *ops)
136{ 135{
137 /* should not be called from interrupt context */
138 spin_lock(&ftrace_lock);
139
140 ops->next = ftrace_list; 136 ops->next = ftrace_list;
141 /* 137 /*
142 * We are entering ops into the ftrace_list but another 138 * We are entering ops into the ftrace_list but another
@@ -172,18 +168,12 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
172#endif 168#endif
173 } 169 }
174 170
175 spin_unlock(&ftrace_lock);
176
177 return 0; 171 return 0;
178} 172}
179 173
180static int __unregister_ftrace_function(struct ftrace_ops *ops) 174static int __unregister_ftrace_function(struct ftrace_ops *ops)
181{ 175{
182 struct ftrace_ops **p; 176 struct ftrace_ops **p;
183 int ret = 0;
184
185 /* should not be called from interrupt context */
186 spin_lock(&ftrace_lock);
187 177
188 /* 178 /*
189 * If we are removing the last function, then simply point 179 * If we are removing the last function, then simply point
@@ -192,17 +182,15 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
192 if (ftrace_list == ops && ops->next == &ftrace_list_end) { 182 if (ftrace_list == ops && ops->next == &ftrace_list_end) {
193 ftrace_trace_function = ftrace_stub; 183 ftrace_trace_function = ftrace_stub;
194 ftrace_list = &ftrace_list_end; 184 ftrace_list = &ftrace_list_end;
195 goto out; 185 return 0;
196 } 186 }
197 187
198 for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next) 188 for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next)
199 if (*p == ops) 189 if (*p == ops)
200 break; 190 break;
201 191
202 if (*p != ops) { 192 if (*p != ops)
203 ret = -1; 193 return -1;
204 goto out;
205 }
206 194
207 *p = (*p)->next; 195 *p = (*p)->next;
208 196
@@ -223,21 +211,15 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
223 } 211 }
224 } 212 }
225 213
226 out: 214 return 0;
227 spin_unlock(&ftrace_lock);
228
229 return ret;
230} 215}
231 216
232static void ftrace_update_pid_func(void) 217static void ftrace_update_pid_func(void)
233{ 218{
234 ftrace_func_t func; 219 ftrace_func_t func;
235 220
236 /* should not be called from interrupt context */
237 spin_lock(&ftrace_lock);
238
239 if (ftrace_trace_function == ftrace_stub) 221 if (ftrace_trace_function == ftrace_stub)
240 goto out; 222 return;
241 223
242 func = ftrace_trace_function; 224 func = ftrace_trace_function;
243 225
@@ -254,23 +236,29 @@ static void ftrace_update_pid_func(void)
254#else 236#else
255 __ftrace_trace_function = func; 237 __ftrace_trace_function = func;
256#endif 238#endif
257
258 out:
259 spin_unlock(&ftrace_lock);
260} 239}
261 240
241/* set when tracing only a pid */
242struct pid *ftrace_pid_trace;
243static struct pid * const ftrace_swapper_pid = &init_struct_pid;
244
262#ifdef CONFIG_DYNAMIC_FTRACE 245#ifdef CONFIG_DYNAMIC_FTRACE
246
263#ifndef CONFIG_FTRACE_MCOUNT_RECORD 247#ifndef CONFIG_FTRACE_MCOUNT_RECORD
264# error Dynamic ftrace depends on MCOUNT_RECORD 248# error Dynamic ftrace depends on MCOUNT_RECORD
265#endif 249#endif
266 250
267/* 251static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly;
268 * Since MCOUNT_ADDR may point to mcount itself, we do not want 252
269 * to get it confused by reading a reference in the code as we 253struct ftrace_func_probe {
270 * are parsing on objcopy output of text. Use a variable for 254 struct hlist_node node;
271 * it instead. 255 struct ftrace_probe_ops *ops;
272 */ 256 unsigned long flags;
273static unsigned long mcount_addr = MCOUNT_ADDR; 257 unsigned long ip;
258 void *data;
259 struct rcu_head rcu;
260};
261
274 262
275enum { 263enum {
276 FTRACE_ENABLE_CALLS = (1 << 0), 264 FTRACE_ENABLE_CALLS = (1 << 0),
@@ -284,13 +272,13 @@ enum {
284 272
285static int ftrace_filtered; 273static int ftrace_filtered;
286 274
287static LIST_HEAD(ftrace_new_addrs); 275static struct dyn_ftrace *ftrace_new_addrs;
288 276
289static DEFINE_MUTEX(ftrace_regex_lock); 277static DEFINE_MUTEX(ftrace_regex_lock);
290 278
291struct ftrace_page { 279struct ftrace_page {
292 struct ftrace_page *next; 280 struct ftrace_page *next;
293 unsigned long index; 281 int index;
294 struct dyn_ftrace records[]; 282 struct dyn_ftrace records[];
295}; 283};
296 284
@@ -305,6 +293,19 @@ static struct ftrace_page *ftrace_pages;
305 293
306static struct dyn_ftrace *ftrace_free_records; 294static struct dyn_ftrace *ftrace_free_records;
307 295
296/*
297 * This is a double for. Do not use 'break' to break out of the loop,
298 * you must use a goto.
299 */
300#define do_for_each_ftrace_rec(pg, rec) \
301 for (pg = ftrace_pages_start; pg; pg = pg->next) { \
302 int _____i; \
303 for (_____i = 0; _____i < pg->index; _____i++) { \
304 rec = &pg->records[_____i];
305
306#define while_for_each_ftrace_rec() \
307 } \
308 }
308 309
309#ifdef CONFIG_KPROBES 310#ifdef CONFIG_KPROBES
310 311
@@ -349,23 +350,17 @@ void ftrace_release(void *start, unsigned long size)
349 struct ftrace_page *pg; 350 struct ftrace_page *pg;
350 unsigned long s = (unsigned long)start; 351 unsigned long s = (unsigned long)start;
351 unsigned long e = s + size; 352 unsigned long e = s + size;
352 int i;
353 353
354 if (ftrace_disabled || !start) 354 if (ftrace_disabled || !start)
355 return; 355 return;
356 356
357 /* should not be called from interrupt context */ 357 mutex_lock(&ftrace_lock);
358 spin_lock(&ftrace_lock); 358 do_for_each_ftrace_rec(pg, rec) {
359 359 if ((rec->ip >= s) && (rec->ip < e) &&
360 for (pg = ftrace_pages_start; pg; pg = pg->next) { 360 !(rec->flags & FTRACE_FL_FREE))
361 for (i = 0; i < pg->index; i++) { 361 ftrace_free_rec(rec);
362 rec = &pg->records[i]; 362 } while_for_each_ftrace_rec();
363 363 mutex_unlock(&ftrace_lock);
364 if ((rec->ip >= s) && (rec->ip < e))
365 ftrace_free_rec(rec);
366 }
367 }
368 spin_unlock(&ftrace_lock);
369} 364}
370 365
371static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) 366static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
@@ -414,8 +409,8 @@ ftrace_record_ip(unsigned long ip)
414 return NULL; 409 return NULL;
415 410
416 rec->ip = ip; 411 rec->ip = ip;
417 412 rec->flags = (unsigned long)ftrace_new_addrs;
418 list_add(&rec->list, &ftrace_new_addrs); 413 ftrace_new_addrs = rec;
419 414
420 return rec; 415 return rec;
421} 416}
@@ -461,10 +456,10 @@ static void ftrace_bug(int failed, unsigned long ip)
461static int 456static int
462__ftrace_replace_code(struct dyn_ftrace *rec, int enable) 457__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
463{ 458{
464 unsigned long ip, fl;
465 unsigned long ftrace_addr; 459 unsigned long ftrace_addr;
460 unsigned long ip, fl;
466 461
467 ftrace_addr = (unsigned long)ftrace_caller; 462 ftrace_addr = (unsigned long)FTRACE_ADDR;
468 463
469 ip = rec->ip; 464 ip = rec->ip;
470 465
@@ -473,7 +468,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
473 * it is not enabled then do nothing. 468 * it is not enabled then do nothing.
474 * 469 *
475 * If this record is not to be traced and 470 * If this record is not to be traced and
476 * it is enabled then disabled it. 471 * it is enabled then disable it.
477 * 472 *
478 */ 473 */
479 if (rec->flags & FTRACE_FL_NOTRACE) { 474 if (rec->flags & FTRACE_FL_NOTRACE) {
@@ -493,7 +488,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
493 if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) 488 if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED))
494 return 0; 489 return 0;
495 490
496 /* Record is not filtered and is not enabled do nothing */ 491 /* Record is not filtered or enabled, do nothing */
497 if (!fl) 492 if (!fl)
498 return 0; 493 return 0;
499 494
@@ -515,7 +510,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
515 510
516 } else { 511 } else {
517 512
518 /* if record is not enabled do nothing */ 513 /* if record is not enabled, do nothing */
519 if (!(rec->flags & FTRACE_FL_ENABLED)) 514 if (!(rec->flags & FTRACE_FL_ENABLED))
520 return 0; 515 return 0;
521 516
@@ -531,41 +526,41 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
531 526
532static void ftrace_replace_code(int enable) 527static void ftrace_replace_code(int enable)
533{ 528{
534 int i, failed;
535 struct dyn_ftrace *rec; 529 struct dyn_ftrace *rec;
536 struct ftrace_page *pg; 530 struct ftrace_page *pg;
531 int failed;
537 532
538 for (pg = ftrace_pages_start; pg; pg = pg->next) { 533 do_for_each_ftrace_rec(pg, rec) {
539 for (i = 0; i < pg->index; i++) { 534 /*
540 rec = &pg->records[i]; 535 * Skip over free records, records that have
541 536 * failed and not converted.
542 /* 537 */
543 * Skip over free records and records that have 538 if (rec->flags & FTRACE_FL_FREE ||
544 * failed. 539 rec->flags & FTRACE_FL_FAILED ||
545 */ 540 !(rec->flags & FTRACE_FL_CONVERTED))
546 if (rec->flags & FTRACE_FL_FREE || 541 continue;
547 rec->flags & FTRACE_FL_FAILED)
548 continue;
549 542
550 /* ignore updates to this record's mcount site */ 543 /* ignore updates to this record's mcount site */
551 if (get_kprobe((void *)rec->ip)) { 544 if (get_kprobe((void *)rec->ip)) {
552 freeze_record(rec); 545 freeze_record(rec);
553 continue; 546 continue;
554 } else { 547 } else {
555 unfreeze_record(rec); 548 unfreeze_record(rec);
556 } 549 }
557 550
558 failed = __ftrace_replace_code(rec, enable); 551 failed = __ftrace_replace_code(rec, enable);
559 if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { 552 if (failed) {
560 rec->flags |= FTRACE_FL_FAILED; 553 rec->flags |= FTRACE_FL_FAILED;
561 if ((system_state == SYSTEM_BOOTING) || 554 if ((system_state == SYSTEM_BOOTING) ||
562 !core_kernel_text(rec->ip)) { 555 !core_kernel_text(rec->ip)) {
563 ftrace_free_rec(rec); 556 ftrace_free_rec(rec);
564 } else 557 } else {
565 ftrace_bug(failed, rec->ip); 558 ftrace_bug(failed, rec->ip);
566 } 559 /* Stop processing */
560 return;
561 }
567 } 562 }
568 } 563 } while_for_each_ftrace_rec();
569} 564}
570 565
571static int 566static int
@@ -576,7 +571,7 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
576 571
577 ip = rec->ip; 572 ip = rec->ip;
578 573
579 ret = ftrace_make_nop(mod, rec, mcount_addr); 574 ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
580 if (ret) { 575 if (ret) {
581 ftrace_bug(ret, ip); 576 ftrace_bug(ret, ip);
582 rec->flags |= FTRACE_FL_FAILED; 577 rec->flags |= FTRACE_FL_FAILED;
@@ -585,6 +580,24 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
585 return 1; 580 return 1;
586} 581}
587 582
583/*
584 * archs can override this function if they must do something
585 * before the modifying code is performed.
586 */
587int __weak ftrace_arch_code_modify_prepare(void)
588{
589 return 0;
590}
591
592/*
593 * archs can override this function if they must do something
594 * after the modifying code is performed.
595 */
596int __weak ftrace_arch_code_modify_post_process(void)
597{
598 return 0;
599}
600
588static int __ftrace_modify_code(void *data) 601static int __ftrace_modify_code(void *data)
589{ 602{
590 int *command = data; 603 int *command = data;
@@ -607,7 +620,17 @@ static int __ftrace_modify_code(void *data)
607 620
608static void ftrace_run_update_code(int command) 621static void ftrace_run_update_code(int command)
609{ 622{
623 int ret;
624
625 ret = ftrace_arch_code_modify_prepare();
626 FTRACE_WARN_ON(ret);
627 if (ret)
628 return;
629
610 stop_machine(__ftrace_modify_code, &command, NULL); 630 stop_machine(__ftrace_modify_code, &command, NULL);
631
632 ret = ftrace_arch_code_modify_post_process();
633 FTRACE_WARN_ON(ret);
611} 634}
612 635
613static ftrace_func_t saved_ftrace_func; 636static ftrace_func_t saved_ftrace_func;
@@ -631,13 +654,10 @@ static void ftrace_startup(int command)
631 if (unlikely(ftrace_disabled)) 654 if (unlikely(ftrace_disabled))
632 return; 655 return;
633 656
634 mutex_lock(&ftrace_start_lock);
635 ftrace_start_up++; 657 ftrace_start_up++;
636 command |= FTRACE_ENABLE_CALLS; 658 command |= FTRACE_ENABLE_CALLS;
637 659
638 ftrace_startup_enable(command); 660 ftrace_startup_enable(command);
639
640 mutex_unlock(&ftrace_start_lock);
641} 661}
642 662
643static void ftrace_shutdown(int command) 663static void ftrace_shutdown(int command)
@@ -645,7 +665,6 @@ static void ftrace_shutdown(int command)
645 if (unlikely(ftrace_disabled)) 665 if (unlikely(ftrace_disabled))
646 return; 666 return;
647 667
648 mutex_lock(&ftrace_start_lock);
649 ftrace_start_up--; 668 ftrace_start_up--;
650 if (!ftrace_start_up) 669 if (!ftrace_start_up)
651 command |= FTRACE_DISABLE_CALLS; 670 command |= FTRACE_DISABLE_CALLS;
@@ -656,11 +675,9 @@ static void ftrace_shutdown(int command)
656 } 675 }
657 676
658 if (!command || !ftrace_enabled) 677 if (!command || !ftrace_enabled)
659 goto out; 678 return;
660 679
661 ftrace_run_update_code(command); 680 ftrace_run_update_code(command);
662 out:
663 mutex_unlock(&ftrace_start_lock);
664} 681}
665 682
666static void ftrace_startup_sysctl(void) 683static void ftrace_startup_sysctl(void)
@@ -670,7 +687,6 @@ static void ftrace_startup_sysctl(void)
670 if (unlikely(ftrace_disabled)) 687 if (unlikely(ftrace_disabled))
671 return; 688 return;
672 689
673 mutex_lock(&ftrace_start_lock);
674 /* Force update next time */ 690 /* Force update next time */
675 saved_ftrace_func = NULL; 691 saved_ftrace_func = NULL;
676 /* ftrace_start_up is true if we want ftrace running */ 692 /* ftrace_start_up is true if we want ftrace running */
@@ -678,7 +694,6 @@ static void ftrace_startup_sysctl(void)
678 command |= FTRACE_ENABLE_CALLS; 694 command |= FTRACE_ENABLE_CALLS;
679 695
680 ftrace_run_update_code(command); 696 ftrace_run_update_code(command);
681 mutex_unlock(&ftrace_start_lock);
682} 697}
683 698
684static void ftrace_shutdown_sysctl(void) 699static void ftrace_shutdown_sysctl(void)
@@ -688,13 +703,11 @@ static void ftrace_shutdown_sysctl(void)
688 if (unlikely(ftrace_disabled)) 703 if (unlikely(ftrace_disabled))
689 return; 704 return;
690 705
691 mutex_lock(&ftrace_start_lock);
692 /* ftrace_start_up is true if ftrace is running */ 706 /* ftrace_start_up is true if ftrace is running */
693 if (ftrace_start_up) 707 if (ftrace_start_up)
694 command |= FTRACE_DISABLE_CALLS; 708 command |= FTRACE_DISABLE_CALLS;
695 709
696 ftrace_run_update_code(command); 710 ftrace_run_update_code(command);
697 mutex_unlock(&ftrace_start_lock);
698} 711}
699 712
700static cycle_t ftrace_update_time; 713static cycle_t ftrace_update_time;
@@ -703,19 +716,21 @@ unsigned long ftrace_update_tot_cnt;
703 716
704static int ftrace_update_code(struct module *mod) 717static int ftrace_update_code(struct module *mod)
705{ 718{
706 struct dyn_ftrace *p, *t; 719 struct dyn_ftrace *p;
707 cycle_t start, stop; 720 cycle_t start, stop;
708 721
709 start = ftrace_now(raw_smp_processor_id()); 722 start = ftrace_now(raw_smp_processor_id());
710 ftrace_update_cnt = 0; 723 ftrace_update_cnt = 0;
711 724
712 list_for_each_entry_safe(p, t, &ftrace_new_addrs, list) { 725 while (ftrace_new_addrs) {
713 726
714 /* If something went wrong, bail without enabling anything */ 727 /* If something went wrong, bail without enabling anything */
715 if (unlikely(ftrace_disabled)) 728 if (unlikely(ftrace_disabled))
716 return -1; 729 return -1;
717 730
718 list_del_init(&p->list); 731 p = ftrace_new_addrs;
732 ftrace_new_addrs = (struct dyn_ftrace *)p->flags;
733 p->flags = 0L;
719 734
720 /* convert record (i.e, patch mcount-call with NOP) */ 735 /* convert record (i.e, patch mcount-call with NOP) */
721 if (ftrace_code_disable(mod, p)) { 736 if (ftrace_code_disable(mod, p)) {
@@ -781,13 +796,16 @@ enum {
781 FTRACE_ITER_CONT = (1 << 1), 796 FTRACE_ITER_CONT = (1 << 1),
782 FTRACE_ITER_NOTRACE = (1 << 2), 797 FTRACE_ITER_NOTRACE = (1 << 2),
783 FTRACE_ITER_FAILURES = (1 << 3), 798 FTRACE_ITER_FAILURES = (1 << 3),
799 FTRACE_ITER_PRINTALL = (1 << 4),
800 FTRACE_ITER_HASH = (1 << 5),
784}; 801};
785 802
786#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 803#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
787 804
788struct ftrace_iterator { 805struct ftrace_iterator {
789 struct ftrace_page *pg; 806 struct ftrace_page *pg;
790 unsigned idx; 807 int hidx;
808 int idx;
791 unsigned flags; 809 unsigned flags;
792 unsigned char buffer[FTRACE_BUFF_MAX+1]; 810 unsigned char buffer[FTRACE_BUFF_MAX+1];
793 unsigned buffer_idx; 811 unsigned buffer_idx;
@@ -795,15 +813,89 @@ struct ftrace_iterator {
795}; 813};
796 814
797static void * 815static void *
816t_hash_next(struct seq_file *m, void *v, loff_t *pos)
817{
818 struct ftrace_iterator *iter = m->private;
819 struct hlist_node *hnd = v;
820 struct hlist_head *hhd;
821
822 WARN_ON(!(iter->flags & FTRACE_ITER_HASH));
823
824 (*pos)++;
825
826 retry:
827 if (iter->hidx >= FTRACE_FUNC_HASHSIZE)
828 return NULL;
829
830 hhd = &ftrace_func_hash[iter->hidx];
831
832 if (hlist_empty(hhd)) {
833 iter->hidx++;
834 hnd = NULL;
835 goto retry;
836 }
837
838 if (!hnd)
839 hnd = hhd->first;
840 else {
841 hnd = hnd->next;
842 if (!hnd) {
843 iter->hidx++;
844 goto retry;
845 }
846 }
847
848 return hnd;
849}
850
851static void *t_hash_start(struct seq_file *m, loff_t *pos)
852{
853 struct ftrace_iterator *iter = m->private;
854 void *p = NULL;
855
856 iter->flags |= FTRACE_ITER_HASH;
857
858 return t_hash_next(m, p, pos);
859}
860
861static int t_hash_show(struct seq_file *m, void *v)
862{
863 struct ftrace_func_probe *rec;
864 struct hlist_node *hnd = v;
865 char str[KSYM_SYMBOL_LEN];
866
867 rec = hlist_entry(hnd, struct ftrace_func_probe, node);
868
869 if (rec->ops->print)
870 return rec->ops->print(m, rec->ip, rec->ops, rec->data);
871
872 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
873 seq_printf(m, "%s:", str);
874
875 kallsyms_lookup((unsigned long)rec->ops->func, NULL, NULL, NULL, str);
876 seq_printf(m, "%s", str);
877
878 if (rec->data)
879 seq_printf(m, ":%p", rec->data);
880 seq_putc(m, '\n');
881
882 return 0;
883}
884
885static void *
798t_next(struct seq_file *m, void *v, loff_t *pos) 886t_next(struct seq_file *m, void *v, loff_t *pos)
799{ 887{
800 struct ftrace_iterator *iter = m->private; 888 struct ftrace_iterator *iter = m->private;
801 struct dyn_ftrace *rec = NULL; 889 struct dyn_ftrace *rec = NULL;
802 890
891 if (iter->flags & FTRACE_ITER_HASH)
892 return t_hash_next(m, v, pos);
893
803 (*pos)++; 894 (*pos)++;
804 895
805 /* should not be called from interrupt context */ 896 if (iter->flags & FTRACE_ITER_PRINTALL)
806 spin_lock(&ftrace_lock); 897 return NULL;
898
807 retry: 899 retry:
808 if (iter->idx >= iter->pg->index) { 900 if (iter->idx >= iter->pg->index) {
809 if (iter->pg->next) { 901 if (iter->pg->next) {
@@ -832,7 +924,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
832 goto retry; 924 goto retry;
833 } 925 }
834 } 926 }
835 spin_unlock(&ftrace_lock);
836 927
837 return rec; 928 return rec;
838} 929}
@@ -842,6 +933,23 @@ static void *t_start(struct seq_file *m, loff_t *pos)
842 struct ftrace_iterator *iter = m->private; 933 struct ftrace_iterator *iter = m->private;
843 void *p = NULL; 934 void *p = NULL;
844 935
936 mutex_lock(&ftrace_lock);
937 /*
938 * For set_ftrace_filter reading, if we have the filter
939 * off, we can short cut and just print out that all
940 * functions are enabled.
941 */
942 if (iter->flags & FTRACE_ITER_FILTER && !ftrace_filtered) {
943 if (*pos > 0)
944 return t_hash_start(m, pos);
945 iter->flags |= FTRACE_ITER_PRINTALL;
946 (*pos)++;
947 return iter;
948 }
949
950 if (iter->flags & FTRACE_ITER_HASH)
951 return t_hash_start(m, pos);
952
845 if (*pos > 0) { 953 if (*pos > 0) {
846 if (iter->idx < 0) 954 if (iter->idx < 0)
847 return p; 955 return p;
@@ -851,18 +959,31 @@ static void *t_start(struct seq_file *m, loff_t *pos)
851 959
852 p = t_next(m, p, pos); 960 p = t_next(m, p, pos);
853 961
962 if (!p)
963 return t_hash_start(m, pos);
964
854 return p; 965 return p;
855} 966}
856 967
857static void t_stop(struct seq_file *m, void *p) 968static void t_stop(struct seq_file *m, void *p)
858{ 969{
970 mutex_unlock(&ftrace_lock);
859} 971}
860 972
861static int t_show(struct seq_file *m, void *v) 973static int t_show(struct seq_file *m, void *v)
862{ 974{
975 struct ftrace_iterator *iter = m->private;
863 struct dyn_ftrace *rec = v; 976 struct dyn_ftrace *rec = v;
864 char str[KSYM_SYMBOL_LEN]; 977 char str[KSYM_SYMBOL_LEN];
865 978
979 if (iter->flags & FTRACE_ITER_HASH)
980 return t_hash_show(m, v);
981
982 if (iter->flags & FTRACE_ITER_PRINTALL) {
983 seq_printf(m, "#### all functions enabled ####\n");
984 return 0;
985 }
986
866 if (!rec) 987 if (!rec)
867 return 0; 988 return 0;
868 989
@@ -941,23 +1062,16 @@ static void ftrace_filter_reset(int enable)
941 struct ftrace_page *pg; 1062 struct ftrace_page *pg;
942 struct dyn_ftrace *rec; 1063 struct dyn_ftrace *rec;
943 unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 1064 unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
944 unsigned i;
945 1065
946 /* should not be called from interrupt context */ 1066 mutex_lock(&ftrace_lock);
947 spin_lock(&ftrace_lock);
948 if (enable) 1067 if (enable)
949 ftrace_filtered = 0; 1068 ftrace_filtered = 0;
950 pg = ftrace_pages_start; 1069 do_for_each_ftrace_rec(pg, rec) {
951 while (pg) { 1070 if (rec->flags & FTRACE_FL_FAILED)
952 for (i = 0; i < pg->index; i++) { 1071 continue;
953 rec = &pg->records[i]; 1072 rec->flags &= ~type;
954 if (rec->flags & FTRACE_FL_FAILED) 1073 } while_for_each_ftrace_rec();
955 continue; 1074 mutex_unlock(&ftrace_lock);
956 rec->flags &= ~type;
957 }
958 pg = pg->next;
959 }
960 spin_unlock(&ftrace_lock);
961} 1075}
962 1076
963static int 1077static int
@@ -1008,16 +1122,6 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
1008 return ftrace_regex_open(inode, file, 0); 1122 return ftrace_regex_open(inode, file, 0);
1009} 1123}
1010 1124
1011static ssize_t
1012ftrace_regex_read(struct file *file, char __user *ubuf,
1013 size_t cnt, loff_t *ppos)
1014{
1015 if (file->f_mode & FMODE_READ)
1016 return seq_read(file, ubuf, cnt, ppos);
1017 else
1018 return -EPERM;
1019}
1020
1021static loff_t 1125static loff_t
1022ftrace_regex_lseek(struct file *file, loff_t offset, int origin) 1126ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
1023{ 1127{
@@ -1038,86 +1142,536 @@ enum {
1038 MATCH_END_ONLY, 1142 MATCH_END_ONLY,
1039}; 1143};
1040 1144
1041static void 1145/*
1042ftrace_match(unsigned char *buff, int len, int enable) 1146 * (static function - no need for kernel doc)
1147 *
1148 * Pass in a buffer containing a glob and this function will
1149 * set search to point to the search part of the buffer and
1150 * return the type of search it is (see enum above).
1151 * This does modify buff.
1152 *
1153 * Returns enum type.
1154 * search returns the pointer to use for comparison.
1155 * not returns 1 if buff started with a '!'
1156 * 0 otherwise.
1157 */
1158static int
1159ftrace_setup_glob(char *buff, int len, char **search, int *not)
1043{ 1160{
1044 char str[KSYM_SYMBOL_LEN];
1045 char *search = NULL;
1046 struct ftrace_page *pg;
1047 struct dyn_ftrace *rec;
1048 int type = MATCH_FULL; 1161 int type = MATCH_FULL;
1049 unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 1162 int i;
1050 unsigned i, match = 0, search_len = 0;
1051 int not = 0;
1052 1163
1053 if (buff[0] == '!') { 1164 if (buff[0] == '!') {
1054 not = 1; 1165 *not = 1;
1055 buff++; 1166 buff++;
1056 len--; 1167 len--;
1057 } 1168 } else
1169 *not = 0;
1170
1171 *search = buff;
1058 1172
1059 for (i = 0; i < len; i++) { 1173 for (i = 0; i < len; i++) {
1060 if (buff[i] == '*') { 1174 if (buff[i] == '*') {
1061 if (!i) { 1175 if (!i) {
1062 search = buff + i + 1; 1176 *search = buff + 1;
1063 type = MATCH_END_ONLY; 1177 type = MATCH_END_ONLY;
1064 search_len = len - (i + 1);
1065 } else { 1178 } else {
1066 if (type == MATCH_END_ONLY) { 1179 if (type == MATCH_END_ONLY)
1067 type = MATCH_MIDDLE_ONLY; 1180 type = MATCH_MIDDLE_ONLY;
1068 } else { 1181 else
1069 match = i;
1070 type = MATCH_FRONT_ONLY; 1182 type = MATCH_FRONT_ONLY;
1071 }
1072 buff[i] = 0; 1183 buff[i] = 0;
1073 break; 1184 break;
1074 } 1185 }
1075 } 1186 }
1076 } 1187 }
1077 1188
1078 /* should not be called from interrupt context */ 1189 return type;
1079 spin_lock(&ftrace_lock); 1190}
1080 if (enable) 1191
1081 ftrace_filtered = 1; 1192static int ftrace_match(char *str, char *regex, int len, int type)
1082 pg = ftrace_pages_start; 1193{
1083 while (pg) { 1194 int matched = 0;
1084 for (i = 0; i < pg->index; i++) { 1195 char *ptr;
1085 int matched = 0; 1196
1086 char *ptr; 1197 switch (type) {
1087 1198 case MATCH_FULL:
1088 rec = &pg->records[i]; 1199 if (strcmp(str, regex) == 0)
1089 if (rec->flags & FTRACE_FL_FAILED) 1200 matched = 1;
1201 break;
1202 case MATCH_FRONT_ONLY:
1203 if (strncmp(str, regex, len) == 0)
1204 matched = 1;
1205 break;
1206 case MATCH_MIDDLE_ONLY:
1207 if (strstr(str, regex))
1208 matched = 1;
1209 break;
1210 case MATCH_END_ONLY:
1211 ptr = strstr(str, regex);
1212 if (ptr && (ptr[len] == 0))
1213 matched = 1;
1214 break;
1215 }
1216
1217 return matched;
1218}
1219
1220static int
1221ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type)
1222{
1223 char str[KSYM_SYMBOL_LEN];
1224
1225 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
1226 return ftrace_match(str, regex, len, type);
1227}
1228
1229static void ftrace_match_records(char *buff, int len, int enable)
1230{
1231 unsigned int search_len;
1232 struct ftrace_page *pg;
1233 struct dyn_ftrace *rec;
1234 unsigned long flag;
1235 char *search;
1236 int type;
1237 int not;
1238
1239 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1240 type = ftrace_setup_glob(buff, len, &search, &not);
1241
1242 search_len = strlen(search);
1243
1244 mutex_lock(&ftrace_lock);
1245 do_for_each_ftrace_rec(pg, rec) {
1246
1247 if (rec->flags & FTRACE_FL_FAILED)
1248 continue;
1249
1250 if (ftrace_match_record(rec, search, search_len, type)) {
1251 if (not)
1252 rec->flags &= ~flag;
1253 else
1254 rec->flags |= flag;
1255 }
1256 /*
1257 * Only enable filtering if we have a function that
1258 * is filtered on.
1259 */
1260 if (enable && (rec->flags & FTRACE_FL_FILTER))
1261 ftrace_filtered = 1;
1262 } while_for_each_ftrace_rec();
1263 mutex_unlock(&ftrace_lock);
1264}
1265
1266static int
1267ftrace_match_module_record(struct dyn_ftrace *rec, char *mod,
1268 char *regex, int len, int type)
1269{
1270 char str[KSYM_SYMBOL_LEN];
1271 char *modname;
1272
1273 kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
1274
1275 if (!modname || strcmp(modname, mod))
1276 return 0;
1277
1278 /* blank search means to match all funcs in the mod */
1279 if (len)
1280 return ftrace_match(str, regex, len, type);
1281 else
1282 return 1;
1283}
1284
1285static void ftrace_match_module_records(char *buff, char *mod, int enable)
1286{
1287 unsigned search_len = 0;
1288 struct ftrace_page *pg;
1289 struct dyn_ftrace *rec;
1290 int type = MATCH_FULL;
1291 char *search = buff;
1292 unsigned long flag;
1293 int not = 0;
1294
1295 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1296
1297 /* blank or '*' mean the same */
1298 if (strcmp(buff, "*") == 0)
1299 buff[0] = 0;
1300
1301 /* handle the case of 'dont filter this module' */
1302 if (strcmp(buff, "!") == 0 || strcmp(buff, "!*") == 0) {
1303 buff[0] = 0;
1304 not = 1;
1305 }
1306
1307 if (strlen(buff)) {
1308 type = ftrace_setup_glob(buff, strlen(buff), &search, &not);
1309 search_len = strlen(search);
1310 }
1311
1312 mutex_lock(&ftrace_lock);
1313 do_for_each_ftrace_rec(pg, rec) {
1314
1315 if (rec->flags & FTRACE_FL_FAILED)
1316 continue;
1317
1318 if (ftrace_match_module_record(rec, mod,
1319 search, search_len, type)) {
1320 if (not)
1321 rec->flags &= ~flag;
1322 else
1323 rec->flags |= flag;
1324 }
1325 if (enable && (rec->flags & FTRACE_FL_FILTER))
1326 ftrace_filtered = 1;
1327
1328 } while_for_each_ftrace_rec();
1329 mutex_unlock(&ftrace_lock);
1330}
1331
1332/*
1333 * We register the module command as a template to show others how
1334 * to register the a command as well.
1335 */
1336
1337static int
1338ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
1339{
1340 char *mod;
1341
1342 /*
1343 * cmd == 'mod' because we only registered this func
1344 * for the 'mod' ftrace_func_command.
1345 * But if you register one func with multiple commands,
1346 * you can tell which command was used by the cmd
1347 * parameter.
1348 */
1349
1350 /* we must have a module name */
1351 if (!param)
1352 return -EINVAL;
1353
1354 mod = strsep(&param, ":");
1355 if (!strlen(mod))
1356 return -EINVAL;
1357
1358 ftrace_match_module_records(func, mod, enable);
1359 return 0;
1360}
1361
1362static struct ftrace_func_command ftrace_mod_cmd = {
1363 .name = "mod",
1364 .func = ftrace_mod_callback,
1365};
1366
1367static int __init ftrace_mod_cmd_init(void)
1368{
1369 return register_ftrace_command(&ftrace_mod_cmd);
1370}
1371device_initcall(ftrace_mod_cmd_init);
1372
1373static void
1374function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
1375{
1376 struct ftrace_func_probe *entry;
1377 struct hlist_head *hhd;
1378 struct hlist_node *n;
1379 unsigned long key;
1380 int resched;
1381
1382 key = hash_long(ip, FTRACE_HASH_BITS);
1383
1384 hhd = &ftrace_func_hash[key];
1385
1386 if (hlist_empty(hhd))
1387 return;
1388
1389 /*
1390 * Disable preemption for these calls to prevent a RCU grace
1391 * period. This syncs the hash iteration and freeing of items
1392 * on the hash. rcu_read_lock is too dangerous here.
1393 */
1394 resched = ftrace_preempt_disable();
1395 hlist_for_each_entry_rcu(entry, n, hhd, node) {
1396 if (entry->ip == ip)
1397 entry->ops->func(ip, parent_ip, &entry->data);
1398 }
1399 ftrace_preempt_enable(resched);
1400}
1401
1402static struct ftrace_ops trace_probe_ops __read_mostly =
1403{
1404 .func = function_trace_probe_call,
1405};
1406
1407static int ftrace_probe_registered;
1408
1409static void __enable_ftrace_function_probe(void)
1410{
1411 int i;
1412
1413 if (ftrace_probe_registered)
1414 return;
1415
1416 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
1417 struct hlist_head *hhd = &ftrace_func_hash[i];
1418 if (hhd->first)
1419 break;
1420 }
1421 /* Nothing registered? */
1422 if (i == FTRACE_FUNC_HASHSIZE)
1423 return;
1424
1425 __register_ftrace_function(&trace_probe_ops);
1426 ftrace_startup(0);
1427 ftrace_probe_registered = 1;
1428}
1429
1430static void __disable_ftrace_function_probe(void)
1431{
1432 int i;
1433
1434 if (!ftrace_probe_registered)
1435 return;
1436
1437 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
1438 struct hlist_head *hhd = &ftrace_func_hash[i];
1439 if (hhd->first)
1440 return;
1441 }
1442
1443 /* no more funcs left */
1444 __unregister_ftrace_function(&trace_probe_ops);
1445 ftrace_shutdown(0);
1446 ftrace_probe_registered = 0;
1447}
1448
1449
1450static void ftrace_free_entry_rcu(struct rcu_head *rhp)
1451{
1452 struct ftrace_func_probe *entry =
1453 container_of(rhp, struct ftrace_func_probe, rcu);
1454
1455 if (entry->ops->free)
1456 entry->ops->free(&entry->data);
1457 kfree(entry);
1458}
1459
1460
1461int
1462register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
1463 void *data)
1464{
1465 struct ftrace_func_probe *entry;
1466 struct ftrace_page *pg;
1467 struct dyn_ftrace *rec;
1468 int type, len, not;
1469 unsigned long key;
1470 int count = 0;
1471 char *search;
1472
1473 type = ftrace_setup_glob(glob, strlen(glob), &search, &not);
1474 len = strlen(search);
1475
1476 /* we do not support '!' for function probes */
1477 if (WARN_ON(not))
1478 return -EINVAL;
1479
1480 mutex_lock(&ftrace_lock);
1481 do_for_each_ftrace_rec(pg, rec) {
1482
1483 if (rec->flags & FTRACE_FL_FAILED)
1484 continue;
1485
1486 if (!ftrace_match_record(rec, search, len, type))
1487 continue;
1488
1489 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
1490 if (!entry) {
1491 /* If we did not process any, then return error */
1492 if (!count)
1493 count = -ENOMEM;
1494 goto out_unlock;
1495 }
1496
1497 count++;
1498
1499 entry->data = data;
1500
1501 /*
1502 * The caller might want to do something special
1503 * for each function we find. We call the callback
1504 * to give the caller an opportunity to do so.
1505 */
1506 if (ops->callback) {
1507 if (ops->callback(rec->ip, &entry->data) < 0) {
1508 /* caller does not like this func */
1509 kfree(entry);
1090 continue; 1510 continue;
1091 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
1092 switch (type) {
1093 case MATCH_FULL:
1094 if (strcmp(str, buff) == 0)
1095 matched = 1;
1096 break;
1097 case MATCH_FRONT_ONLY:
1098 if (memcmp(str, buff, match) == 0)
1099 matched = 1;
1100 break;
1101 case MATCH_MIDDLE_ONLY:
1102 if (strstr(str, search))
1103 matched = 1;
1104 break;
1105 case MATCH_END_ONLY:
1106 ptr = strstr(str, search);
1107 if (ptr && (ptr[search_len] == 0))
1108 matched = 1;
1109 break;
1110 } 1511 }
1111 if (matched) { 1512 }
1112 if (not) 1513
1113 rec->flags &= ~flag; 1514 entry->ops = ops;
1114 else 1515 entry->ip = rec->ip;
1115 rec->flags |= flag; 1516
1517 key = hash_long(entry->ip, FTRACE_HASH_BITS);
1518 hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]);
1519
1520 } while_for_each_ftrace_rec();
1521 __enable_ftrace_function_probe();
1522
1523 out_unlock:
1524 mutex_unlock(&ftrace_lock);
1525
1526 return count;
1527}
1528
1529enum {
1530 PROBE_TEST_FUNC = 1,
1531 PROBE_TEST_DATA = 2
1532};
1533
1534static void
1535__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
1536 void *data, int flags)
1537{
1538 struct ftrace_func_probe *entry;
1539 struct hlist_node *n, *tmp;
1540 char str[KSYM_SYMBOL_LEN];
1541 int type = MATCH_FULL;
1542 int i, len = 0;
1543 char *search;
1544
1545 if (glob && (strcmp(glob, "*") || !strlen(glob)))
1546 glob = NULL;
1547 else {
1548 int not;
1549
1550 type = ftrace_setup_glob(glob, strlen(glob), &search, &not);
1551 len = strlen(search);
1552
1553 /* we do not support '!' for function probes */
1554 if (WARN_ON(not))
1555 return;
1556 }
1557
1558 mutex_lock(&ftrace_lock);
1559 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
1560 struct hlist_head *hhd = &ftrace_func_hash[i];
1561
1562 hlist_for_each_entry_safe(entry, n, tmp, hhd, node) {
1563
1564 /* break up if statements for readability */
1565 if ((flags & PROBE_TEST_FUNC) && entry->ops != ops)
1566 continue;
1567
1568 if ((flags & PROBE_TEST_DATA) && entry->data != data)
1569 continue;
1570
1571 /* do this last, since it is the most expensive */
1572 if (glob) {
1573 kallsyms_lookup(entry->ip, NULL, NULL,
1574 NULL, str);
1575 if (!ftrace_match(str, glob, len, type))
1576 continue;
1116 } 1577 }
1578
1579 hlist_del(&entry->node);
1580 call_rcu(&entry->rcu, ftrace_free_entry_rcu);
1581 }
1582 }
1583 __disable_ftrace_function_probe();
1584 mutex_unlock(&ftrace_lock);
1585}
1586
1587void
1588unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
1589 void *data)
1590{
1591 __unregister_ftrace_function_probe(glob, ops, data,
1592 PROBE_TEST_FUNC | PROBE_TEST_DATA);
1593}
1594
1595void
1596unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops)
1597{
1598 __unregister_ftrace_function_probe(glob, ops, NULL, PROBE_TEST_FUNC);
1599}
1600
1601void unregister_ftrace_function_probe_all(char *glob)
1602{
1603 __unregister_ftrace_function_probe(glob, NULL, NULL, 0);
1604}
1605
1606static LIST_HEAD(ftrace_commands);
1607static DEFINE_MUTEX(ftrace_cmd_mutex);
1608
1609int register_ftrace_command(struct ftrace_func_command *cmd)
1610{
1611 struct ftrace_func_command *p;
1612 int ret = 0;
1613
1614 mutex_lock(&ftrace_cmd_mutex);
1615 list_for_each_entry(p, &ftrace_commands, list) {
1616 if (strcmp(cmd->name, p->name) == 0) {
1617 ret = -EBUSY;
1618 goto out_unlock;
1117 } 1619 }
1118 pg = pg->next;
1119 } 1620 }
1120 spin_unlock(&ftrace_lock); 1621 list_add(&cmd->list, &ftrace_commands);
1622 out_unlock:
1623 mutex_unlock(&ftrace_cmd_mutex);
1624
1625 return ret;
1626}
1627
1628int unregister_ftrace_command(struct ftrace_func_command *cmd)
1629{
1630 struct ftrace_func_command *p, *n;
1631 int ret = -ENODEV;
1632
1633 mutex_lock(&ftrace_cmd_mutex);
1634 list_for_each_entry_safe(p, n, &ftrace_commands, list) {
1635 if (strcmp(cmd->name, p->name) == 0) {
1636 ret = 0;
1637 list_del_init(&p->list);
1638 goto out_unlock;
1639 }
1640 }
1641 out_unlock:
1642 mutex_unlock(&ftrace_cmd_mutex);
1643
1644 return ret;
1645}
1646
1647static int ftrace_process_regex(char *buff, int len, int enable)
1648{
1649 char *func, *command, *next = buff;
1650 struct ftrace_func_command *p;
1651 int ret = -EINVAL;
1652
1653 func = strsep(&next, ":");
1654
1655 if (!next) {
1656 ftrace_match_records(func, len, enable);
1657 return 0;
1658 }
1659
1660 /* command found */
1661
1662 command = strsep(&next, ":");
1663
1664 mutex_lock(&ftrace_cmd_mutex);
1665 list_for_each_entry(p, &ftrace_commands, list) {
1666 if (strcmp(p->name, command) == 0) {
1667 ret = p->func(func, command, next, enable);
1668 goto out_unlock;
1669 }
1670 }
1671 out_unlock:
1672 mutex_unlock(&ftrace_cmd_mutex);
1673
1674 return ret;
1121} 1675}
1122 1676
1123static ssize_t 1677static ssize_t
@@ -1187,7 +1741,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
1187 if (isspace(ch)) { 1741 if (isspace(ch)) {
1188 iter->filtered++; 1742 iter->filtered++;
1189 iter->buffer[iter->buffer_idx] = 0; 1743 iter->buffer[iter->buffer_idx] = 0;
1190 ftrace_match(iter->buffer, iter->buffer_idx, enable); 1744 ret = ftrace_process_regex(iter->buffer,
1745 iter->buffer_idx, enable);
1746 if (ret)
1747 goto out;
1191 iter->buffer_idx = 0; 1748 iter->buffer_idx = 0;
1192 } else 1749 } else
1193 iter->flags |= FTRACE_ITER_CONT; 1750 iter->flags |= FTRACE_ITER_CONT;
@@ -1226,7 +1783,7 @@ ftrace_set_regex(unsigned char *buf, int len, int reset, int enable)
1226 if (reset) 1783 if (reset)
1227 ftrace_filter_reset(enable); 1784 ftrace_filter_reset(enable);
1228 if (buf) 1785 if (buf)
1229 ftrace_match(buf, len, enable); 1786 ftrace_match_records(buf, len, enable);
1230 mutex_unlock(&ftrace_regex_lock); 1787 mutex_unlock(&ftrace_regex_lock);
1231} 1788}
1232 1789
@@ -1276,15 +1833,13 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
1276 if (iter->buffer_idx) { 1833 if (iter->buffer_idx) {
1277 iter->filtered++; 1834 iter->filtered++;
1278 iter->buffer[iter->buffer_idx] = 0; 1835 iter->buffer[iter->buffer_idx] = 0;
1279 ftrace_match(iter->buffer, iter->buffer_idx, enable); 1836 ftrace_match_records(iter->buffer, iter->buffer_idx, enable);
1280 } 1837 }
1281 1838
1282 mutex_lock(&ftrace_sysctl_lock); 1839 mutex_lock(&ftrace_lock);
1283 mutex_lock(&ftrace_start_lock);
1284 if (ftrace_start_up && ftrace_enabled) 1840 if (ftrace_start_up && ftrace_enabled)
1285 ftrace_run_update_code(FTRACE_ENABLE_CALLS); 1841 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
1286 mutex_unlock(&ftrace_start_lock); 1842 mutex_unlock(&ftrace_lock);
1287 mutex_unlock(&ftrace_sysctl_lock);
1288 1843
1289 kfree(iter); 1844 kfree(iter);
1290 mutex_unlock(&ftrace_regex_lock); 1845 mutex_unlock(&ftrace_regex_lock);
@@ -1303,31 +1858,31 @@ ftrace_notrace_release(struct inode *inode, struct file *file)
1303 return ftrace_regex_release(inode, file, 0); 1858 return ftrace_regex_release(inode, file, 0);
1304} 1859}
1305 1860
1306static struct file_operations ftrace_avail_fops = { 1861static const struct file_operations ftrace_avail_fops = {
1307 .open = ftrace_avail_open, 1862 .open = ftrace_avail_open,
1308 .read = seq_read, 1863 .read = seq_read,
1309 .llseek = seq_lseek, 1864 .llseek = seq_lseek,
1310 .release = ftrace_avail_release, 1865 .release = ftrace_avail_release,
1311}; 1866};
1312 1867
1313static struct file_operations ftrace_failures_fops = { 1868static const struct file_operations ftrace_failures_fops = {
1314 .open = ftrace_failures_open, 1869 .open = ftrace_failures_open,
1315 .read = seq_read, 1870 .read = seq_read,
1316 .llseek = seq_lseek, 1871 .llseek = seq_lseek,
1317 .release = ftrace_avail_release, 1872 .release = ftrace_avail_release,
1318}; 1873};
1319 1874
1320static struct file_operations ftrace_filter_fops = { 1875static const struct file_operations ftrace_filter_fops = {
1321 .open = ftrace_filter_open, 1876 .open = ftrace_filter_open,
1322 .read = ftrace_regex_read, 1877 .read = seq_read,
1323 .write = ftrace_filter_write, 1878 .write = ftrace_filter_write,
1324 .llseek = ftrace_regex_lseek, 1879 .llseek = ftrace_regex_lseek,
1325 .release = ftrace_filter_release, 1880 .release = ftrace_filter_release,
1326}; 1881};
1327 1882
1328static struct file_operations ftrace_notrace_fops = { 1883static const struct file_operations ftrace_notrace_fops = {
1329 .open = ftrace_notrace_open, 1884 .open = ftrace_notrace_open,
1330 .read = ftrace_regex_read, 1885 .read = seq_read,
1331 .write = ftrace_notrace_write, 1886 .write = ftrace_notrace_write,
1332 .llseek = ftrace_regex_lseek, 1887 .llseek = ftrace_regex_lseek,
1333 .release = ftrace_notrace_release, 1888 .release = ftrace_notrace_release,
@@ -1360,6 +1915,10 @@ static void *g_start(struct seq_file *m, loff_t *pos)
1360 1915
1361 mutex_lock(&graph_lock); 1916 mutex_lock(&graph_lock);
1362 1917
1918 /* Nothing, tell g_show to print all functions are enabled */
1919 if (!ftrace_graph_count && !*pos)
1920 return (void *)1;
1921
1363 p = g_next(m, p, pos); 1922 p = g_next(m, p, pos);
1364 1923
1365 return p; 1924 return p;
@@ -1378,6 +1937,11 @@ static int g_show(struct seq_file *m, void *v)
1378 if (!ptr) 1937 if (!ptr)
1379 return 0; 1938 return 0;
1380 1939
1940 if (ptr == (unsigned long *)1) {
1941 seq_printf(m, "#### all functions enabled ####\n");
1942 return 0;
1943 }
1944
1381 kallsyms_lookup(*ptr, NULL, NULL, NULL, str); 1945 kallsyms_lookup(*ptr, NULL, NULL, NULL, str);
1382 1946
1383 seq_printf(m, "%s\n", str); 1947 seq_printf(m, "%s\n", str);
@@ -1420,53 +1984,53 @@ ftrace_graph_open(struct inode *inode, struct file *file)
1420 return ret; 1984 return ret;
1421} 1985}
1422 1986
1423static ssize_t
1424ftrace_graph_read(struct file *file, char __user *ubuf,
1425 size_t cnt, loff_t *ppos)
1426{
1427 if (file->f_mode & FMODE_READ)
1428 return seq_read(file, ubuf, cnt, ppos);
1429 else
1430 return -EPERM;
1431}
1432
1433static int 1987static int
1434ftrace_set_func(unsigned long *array, int idx, char *buffer) 1988ftrace_set_func(unsigned long *array, int *idx, char *buffer)
1435{ 1989{
1436 char str[KSYM_SYMBOL_LEN];
1437 struct dyn_ftrace *rec; 1990 struct dyn_ftrace *rec;
1438 struct ftrace_page *pg; 1991 struct ftrace_page *pg;
1992 int search_len;
1439 int found = 0; 1993 int found = 0;
1440 int i, j; 1994 int type, not;
1995 char *search;
1996 bool exists;
1997 int i;
1441 1998
1442 if (ftrace_disabled) 1999 if (ftrace_disabled)
1443 return -ENODEV; 2000 return -ENODEV;
1444 2001
1445 /* should not be called from interrupt context */ 2002 /* decode regex */
1446 spin_lock(&ftrace_lock); 2003 type = ftrace_setup_glob(buffer, strlen(buffer), &search, &not);
2004 if (not)
2005 return -EINVAL;
1447 2006
1448 for (pg = ftrace_pages_start; pg; pg = pg->next) { 2007 search_len = strlen(search);
1449 for (i = 0; i < pg->index; i++) {
1450 rec = &pg->records[i];
1451 2008
1452 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) 2009 mutex_lock(&ftrace_lock);
1453 continue; 2010 do_for_each_ftrace_rec(pg, rec) {
2011
2012 if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
2013 break;
1454 2014
1455 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 2015 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
1456 if (strcmp(str, buffer) == 0) { 2016 continue;
2017
2018 if (ftrace_match_record(rec, search, search_len, type)) {
2019 /* ensure it is not already in the array */
2020 exists = false;
2021 for (i = 0; i < *idx; i++)
2022 if (array[i] == rec->ip) {
2023 exists = true;
2024 break;
2025 }
2026 if (!exists) {
2027 array[(*idx)++] = rec->ip;
1457 found = 1; 2028 found = 1;
1458 for (j = 0; j < idx; j++)
1459 if (array[j] == rec->ip) {
1460 found = 0;
1461 break;
1462 }
1463 if (found)
1464 array[idx] = rec->ip;
1465 break;
1466 } 2029 }
1467 } 2030 }
1468 } 2031 } while_for_each_ftrace_rec();
1469 spin_unlock(&ftrace_lock); 2032
2033 mutex_unlock(&ftrace_lock);
1470 2034
1471 return found ? 0 : -EINVAL; 2035 return found ? 0 : -EINVAL;
1472} 2036}
@@ -1534,13 +2098,11 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
1534 } 2098 }
1535 buffer[index] = 0; 2099 buffer[index] = 0;
1536 2100
1537 /* we allow only one at a time */ 2101 /* we allow only one expression at a time */
1538 ret = ftrace_set_func(array, ftrace_graph_count, buffer); 2102 ret = ftrace_set_func(array, &ftrace_graph_count, buffer);
1539 if (ret) 2103 if (ret)
1540 goto out; 2104 goto out;
1541 2105
1542 ftrace_graph_count++;
1543
1544 file->f_pos += read; 2106 file->f_pos += read;
1545 2107
1546 ret = read; 2108 ret = read;
@@ -1552,7 +2114,7 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
1552 2114
1553static const struct file_operations ftrace_graph_fops = { 2115static const struct file_operations ftrace_graph_fops = {
1554 .open = ftrace_graph_open, 2116 .open = ftrace_graph_open,
1555 .read = ftrace_graph_read, 2117 .read = seq_read,
1556 .write = ftrace_graph_write, 2118 .write = ftrace_graph_write,
1557}; 2119};
1558#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 2120#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
@@ -1604,7 +2166,7 @@ static int ftrace_convert_nops(struct module *mod,
1604 unsigned long addr; 2166 unsigned long addr;
1605 unsigned long flags; 2167 unsigned long flags;
1606 2168
1607 mutex_lock(&ftrace_start_lock); 2169 mutex_lock(&ftrace_lock);
1608 p = start; 2170 p = start;
1609 while (p < end) { 2171 while (p < end) {
1610 addr = ftrace_call_adjust(*p++); 2172 addr = ftrace_call_adjust(*p++);
@@ -1623,7 +2185,7 @@ static int ftrace_convert_nops(struct module *mod,
1623 local_irq_save(flags); 2185 local_irq_save(flags);
1624 ftrace_update_code(mod); 2186 ftrace_update_code(mod);
1625 local_irq_restore(flags); 2187 local_irq_restore(flags);
1626 mutex_unlock(&ftrace_start_lock); 2188 mutex_unlock(&ftrace_lock);
1627 2189
1628 return 0; 2190 return 0;
1629} 2191}
@@ -1796,7 +2358,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
1796 if (ret < 0) 2358 if (ret < 0)
1797 return ret; 2359 return ret;
1798 2360
1799 mutex_lock(&ftrace_start_lock); 2361 mutex_lock(&ftrace_lock);
1800 if (val < 0) { 2362 if (val < 0) {
1801 /* disable pid tracing */ 2363 /* disable pid tracing */
1802 if (!ftrace_pid_trace) 2364 if (!ftrace_pid_trace)
@@ -1835,12 +2397,12 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
1835 ftrace_startup_enable(0); 2397 ftrace_startup_enable(0);
1836 2398
1837 out: 2399 out:
1838 mutex_unlock(&ftrace_start_lock); 2400 mutex_unlock(&ftrace_lock);
1839 2401
1840 return cnt; 2402 return cnt;
1841} 2403}
1842 2404
1843static struct file_operations ftrace_pid_fops = { 2405static const struct file_operations ftrace_pid_fops = {
1844 .read = ftrace_pid_read, 2406 .read = ftrace_pid_read,
1845 .write = ftrace_pid_write, 2407 .write = ftrace_pid_write,
1846}; 2408};
@@ -1863,7 +2425,6 @@ static __init int ftrace_init_debugfs(void)
1863 "'set_ftrace_pid' entry\n"); 2425 "'set_ftrace_pid' entry\n");
1864 return 0; 2426 return 0;
1865} 2427}
1866
1867fs_initcall(ftrace_init_debugfs); 2428fs_initcall(ftrace_init_debugfs);
1868 2429
1869/** 2430/**
@@ -1898,17 +2459,17 @@ int register_ftrace_function(struct ftrace_ops *ops)
1898 if (unlikely(ftrace_disabled)) 2459 if (unlikely(ftrace_disabled))
1899 return -1; 2460 return -1;
1900 2461
1901 mutex_lock(&ftrace_sysctl_lock); 2462 mutex_lock(&ftrace_lock);
1902 2463
1903 ret = __register_ftrace_function(ops); 2464 ret = __register_ftrace_function(ops);
1904 ftrace_startup(0); 2465 ftrace_startup(0);
1905 2466
1906 mutex_unlock(&ftrace_sysctl_lock); 2467 mutex_unlock(&ftrace_lock);
1907 return ret; 2468 return ret;
1908} 2469}
1909 2470
1910/** 2471/**
1911 * unregister_ftrace_function - unresgister a function for profiling. 2472 * unregister_ftrace_function - unregister a function for profiling.
1912 * @ops - ops structure that holds the function to unregister 2473 * @ops - ops structure that holds the function to unregister
1913 * 2474 *
1914 * Unregister a function that was added to be called by ftrace profiling. 2475 * Unregister a function that was added to be called by ftrace profiling.
@@ -1917,10 +2478,10 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
1917{ 2478{
1918 int ret; 2479 int ret;
1919 2480
1920 mutex_lock(&ftrace_sysctl_lock); 2481 mutex_lock(&ftrace_lock);
1921 ret = __unregister_ftrace_function(ops); 2482 ret = __unregister_ftrace_function(ops);
1922 ftrace_shutdown(0); 2483 ftrace_shutdown(0);
1923 mutex_unlock(&ftrace_sysctl_lock); 2484 mutex_unlock(&ftrace_lock);
1924 2485
1925 return ret; 2486 return ret;
1926} 2487}
@@ -1935,7 +2496,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
1935 if (unlikely(ftrace_disabled)) 2496 if (unlikely(ftrace_disabled))
1936 return -ENODEV; 2497 return -ENODEV;
1937 2498
1938 mutex_lock(&ftrace_sysctl_lock); 2499 mutex_lock(&ftrace_lock);
1939 2500
1940 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 2501 ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
1941 2502
@@ -1964,7 +2525,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
1964 } 2525 }
1965 2526
1966 out: 2527 out:
1967 mutex_unlock(&ftrace_sysctl_lock); 2528 mutex_unlock(&ftrace_lock);
1968 return ret; 2529 return ret;
1969} 2530}
1970 2531
@@ -2080,7 +2641,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
2080{ 2641{
2081 int ret = 0; 2642 int ret = 0;
2082 2643
2083 mutex_lock(&ftrace_sysctl_lock); 2644 mutex_lock(&ftrace_lock);
2084 2645
2085 ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call; 2646 ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
2086 register_pm_notifier(&ftrace_suspend_notifier); 2647 register_pm_notifier(&ftrace_suspend_notifier);
@@ -2098,13 +2659,13 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
2098 ftrace_startup(FTRACE_START_FUNC_RET); 2659 ftrace_startup(FTRACE_START_FUNC_RET);
2099 2660
2100out: 2661out:
2101 mutex_unlock(&ftrace_sysctl_lock); 2662 mutex_unlock(&ftrace_lock);
2102 return ret; 2663 return ret;
2103} 2664}
2104 2665
2105void unregister_ftrace_graph(void) 2666void unregister_ftrace_graph(void)
2106{ 2667{
2107 mutex_lock(&ftrace_sysctl_lock); 2668 mutex_lock(&ftrace_lock);
2108 2669
2109 atomic_dec(&ftrace_graph_active); 2670 atomic_dec(&ftrace_graph_active);
2110 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 2671 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
@@ -2112,7 +2673,7 @@ void unregister_ftrace_graph(void)
2112 ftrace_shutdown(FTRACE_STOP_FUNC_RET); 2673 ftrace_shutdown(FTRACE_STOP_FUNC_RET);
2113 unregister_pm_notifier(&ftrace_suspend_notifier); 2674 unregister_pm_notifier(&ftrace_suspend_notifier);
2114 2675
2115 mutex_unlock(&ftrace_sysctl_lock); 2676 mutex_unlock(&ftrace_lock);
2116} 2677}
2117 2678
2118/* Allocate a return stack for newly created task */ 2679/* Allocate a return stack for newly created task */
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
new file mode 100644
index 000000000000..ae201b3eda89
--- /dev/null
+++ b/kernel/trace/kmemtrace.c
@@ -0,0 +1,339 @@
1/*
2 * Memory allocator tracing
3 *
4 * Copyright (C) 2008 Eduard - Gabriel Munteanu
5 * Copyright (C) 2008 Pekka Enberg <penberg@cs.helsinki.fi>
6 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
7 */
8
9#include <linux/dcache.h>
10#include <linux/debugfs.h>
11#include <linux/fs.h>
12#include <linux/seq_file.h>
13#include <trace/kmemtrace.h>
14
15#include "trace.h"
16#include "trace_output.h"
17
18/* Select an alternative, minimalistic output than the original one */
19#define TRACE_KMEM_OPT_MINIMAL 0x1
20
21static struct tracer_opt kmem_opts[] = {
22 /* Default disable the minimalistic output */
23 { TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) },
24 { }
25};
26
27static struct tracer_flags kmem_tracer_flags = {
28 .val = 0,
29 .opts = kmem_opts
30};
31
32
33static bool kmem_tracing_enabled __read_mostly;
34static struct trace_array *kmemtrace_array;
35
36static int kmem_trace_init(struct trace_array *tr)
37{
38 int cpu;
39 kmemtrace_array = tr;
40
41 for_each_cpu_mask(cpu, cpu_possible_map)
42 tracing_reset(tr, cpu);
43
44 kmem_tracing_enabled = true;
45
46 return 0;
47}
48
49static void kmem_trace_reset(struct trace_array *tr)
50{
51 kmem_tracing_enabled = false;
52}
53
54static void kmemtrace_headers(struct seq_file *s)
55{
56 /* Don't need headers for the original kmemtrace output */
57 if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
58 return;
59
60 seq_printf(s, "#\n");
61 seq_printf(s, "# ALLOC TYPE REQ GIVEN FLAGS "
62 " POINTER NODE CALLER\n");
63 seq_printf(s, "# FREE | | | | "
64 " | | | |\n");
65 seq_printf(s, "# |\n\n");
66}
67
68/*
69 * The two following functions give the original output from kmemtrace,
70 * or something close to....perhaps they need some missing things
71 */
72static enum print_line_t
73kmemtrace_print_alloc_original(struct trace_iterator *iter,
74 struct kmemtrace_alloc_entry *entry)
75{
76 struct trace_seq *s = &iter->seq;
77 int ret;
78
79 /* Taken from the old linux/kmemtrace.h */
80 ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu "
81 "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n",
82 entry->type_id, entry->call_site, (unsigned long) entry->ptr,
83 (unsigned long) entry->bytes_req, (unsigned long) entry->bytes_alloc,
84 (unsigned long) entry->gfp_flags, entry->node);
85
86 if (!ret)
87 return TRACE_TYPE_PARTIAL_LINE;
88
89 return TRACE_TYPE_HANDLED;
90}
91
92static enum print_line_t
93kmemtrace_print_free_original(struct trace_iterator *iter,
94 struct kmemtrace_free_entry *entry)
95{
96 struct trace_seq *s = &iter->seq;
97 int ret;
98
99 /* Taken from the old linux/kmemtrace.h */
100 ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu\n",
101 entry->type_id, entry->call_site, (unsigned long) entry->ptr);
102
103 if (!ret)
104 return TRACE_TYPE_PARTIAL_LINE;
105
106 return TRACE_TYPE_HANDLED;
107}
108
109
110/* The two other following provide a more minimalistic output */
111static enum print_line_t
112kmemtrace_print_alloc_compress(struct trace_iterator *iter,
113 struct kmemtrace_alloc_entry *entry)
114{
115 struct trace_seq *s = &iter->seq;
116 int ret;
117
118 /* Alloc entry */
119 ret = trace_seq_printf(s, " + ");
120 if (!ret)
121 return TRACE_TYPE_PARTIAL_LINE;
122
123 /* Type */
124 switch (entry->type_id) {
125 case KMEMTRACE_TYPE_KMALLOC:
126 ret = trace_seq_printf(s, "K ");
127 break;
128 case KMEMTRACE_TYPE_CACHE:
129 ret = trace_seq_printf(s, "C ");
130 break;
131 case KMEMTRACE_TYPE_PAGES:
132 ret = trace_seq_printf(s, "P ");
133 break;
134 default:
135 ret = trace_seq_printf(s, "? ");
136 }
137
138 if (!ret)
139 return TRACE_TYPE_PARTIAL_LINE;
140
141 /* Requested */
142 ret = trace_seq_printf(s, "%4zu ", entry->bytes_req);
143 if (!ret)
144 return TRACE_TYPE_PARTIAL_LINE;
145
146 /* Allocated */
147 ret = trace_seq_printf(s, "%4zu ", entry->bytes_alloc);
148 if (!ret)
149 return TRACE_TYPE_PARTIAL_LINE;
150
151 /* Flags
152 * TODO: would be better to see the name of the GFP flag names
153 */
154 ret = trace_seq_printf(s, "%08x ", entry->gfp_flags);
155 if (!ret)
156 return TRACE_TYPE_PARTIAL_LINE;
157
158 /* Pointer to allocated */
159 ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr);
160 if (!ret)
161 return TRACE_TYPE_PARTIAL_LINE;
162
163 /* Node */
164 ret = trace_seq_printf(s, "%4d ", entry->node);
165 if (!ret)
166 return TRACE_TYPE_PARTIAL_LINE;
167
168 /* Call site */
169 ret = seq_print_ip_sym(s, entry->call_site, 0);
170 if (!ret)
171 return TRACE_TYPE_PARTIAL_LINE;
172
173 if (!trace_seq_printf(s, "\n"))
174 return TRACE_TYPE_PARTIAL_LINE;
175
176 return TRACE_TYPE_HANDLED;
177}
178
179static enum print_line_t
180kmemtrace_print_free_compress(struct trace_iterator *iter,
181 struct kmemtrace_free_entry *entry)
182{
183 struct trace_seq *s = &iter->seq;
184 int ret;
185
186 /* Free entry */
187 ret = trace_seq_printf(s, " - ");
188 if (!ret)
189 return TRACE_TYPE_PARTIAL_LINE;
190
191 /* Type */
192 switch (entry->type_id) {
193 case KMEMTRACE_TYPE_KMALLOC:
194 ret = trace_seq_printf(s, "K ");
195 break;
196 case KMEMTRACE_TYPE_CACHE:
197 ret = trace_seq_printf(s, "C ");
198 break;
199 case KMEMTRACE_TYPE_PAGES:
200 ret = trace_seq_printf(s, "P ");
201 break;
202 default:
203 ret = trace_seq_printf(s, "? ");
204 }
205
206 if (!ret)
207 return TRACE_TYPE_PARTIAL_LINE;
208
209 /* Skip requested/allocated/flags */
210 ret = trace_seq_printf(s, " ");
211 if (!ret)
212 return TRACE_TYPE_PARTIAL_LINE;
213
214 /* Pointer to allocated */
215 ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr);
216 if (!ret)
217 return TRACE_TYPE_PARTIAL_LINE;
218
219 /* Skip node */
220 ret = trace_seq_printf(s, " ");
221 if (!ret)
222 return TRACE_TYPE_PARTIAL_LINE;
223
224 /* Call site */
225 ret = seq_print_ip_sym(s, entry->call_site, 0);
226 if (!ret)
227 return TRACE_TYPE_PARTIAL_LINE;
228
229 if (!trace_seq_printf(s, "\n"))
230 return TRACE_TYPE_PARTIAL_LINE;
231
232 return TRACE_TYPE_HANDLED;
233}
234
235static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
236{
237 struct trace_entry *entry = iter->ent;
238
239 switch (entry->type) {
240 case TRACE_KMEM_ALLOC: {
241 struct kmemtrace_alloc_entry *field;
242 trace_assign_type(field, entry);
243 if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
244 return kmemtrace_print_alloc_compress(iter, field);
245 else
246 return kmemtrace_print_alloc_original(iter, field);
247 }
248
249 case TRACE_KMEM_FREE: {
250 struct kmemtrace_free_entry *field;
251 trace_assign_type(field, entry);
252 if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
253 return kmemtrace_print_free_compress(iter, field);
254 else
255 return kmemtrace_print_free_original(iter, field);
256 }
257
258 default:
259 return TRACE_TYPE_UNHANDLED;
260 }
261}
262
263/* Trace allocations */
264void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id,
265 unsigned long call_site,
266 const void *ptr,
267 size_t bytes_req,
268 size_t bytes_alloc,
269 gfp_t gfp_flags,
270 int node)
271{
272 struct ring_buffer_event *event;
273 struct kmemtrace_alloc_entry *entry;
274 struct trace_array *tr = kmemtrace_array;
275
276 if (!kmem_tracing_enabled)
277 return;
278
279 event = trace_buffer_lock_reserve(tr, TRACE_KMEM_ALLOC,
280 sizeof(*entry), 0, 0);
281 if (!event)
282 return;
283 entry = ring_buffer_event_data(event);
284
285 entry->call_site = call_site;
286 entry->ptr = ptr;
287 entry->bytes_req = bytes_req;
288 entry->bytes_alloc = bytes_alloc;
289 entry->gfp_flags = gfp_flags;
290 entry->node = node;
291
292 trace_buffer_unlock_commit(tr, event, 0, 0);
293}
294EXPORT_SYMBOL(kmemtrace_mark_alloc_node);
295
296void kmemtrace_mark_free(enum kmemtrace_type_id type_id,
297 unsigned long call_site,
298 const void *ptr)
299{
300 struct ring_buffer_event *event;
301 struct kmemtrace_free_entry *entry;
302 struct trace_array *tr = kmemtrace_array;
303
304 if (!kmem_tracing_enabled)
305 return;
306
307 event = trace_buffer_lock_reserve(tr, TRACE_KMEM_FREE,
308 sizeof(*entry), 0, 0);
309 if (!event)
310 return;
311 entry = ring_buffer_event_data(event);
312 entry->type_id = type_id;
313 entry->call_site = call_site;
314 entry->ptr = ptr;
315
316 trace_buffer_unlock_commit(tr, event, 0, 0);
317}
318EXPORT_SYMBOL(kmemtrace_mark_free);
319
320static struct tracer kmem_tracer __read_mostly = {
321 .name = "kmemtrace",
322 .init = kmem_trace_init,
323 .reset = kmem_trace_reset,
324 .print_line = kmemtrace_print_line,
325 .print_header = kmemtrace_headers,
326 .flags = &kmem_tracer_flags
327};
328
329void kmemtrace_init(void)
330{
331 /* earliest opportunity to start kmem tracing */
332}
333
334static int __init init_kmem_tracer(void)
335{
336 return register_tracer(&kmem_tracer);
337}
338
339device_initcall(init_kmem_tracer);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bd38c5cfd8ad..384ca5d9d729 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4,21 +4,92 @@
4 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> 4 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
5 */ 5 */
6#include <linux/ring_buffer.h> 6#include <linux/ring_buffer.h>
7#include <linux/trace_clock.h>
8#include <linux/ftrace_irq.h>
7#include <linux/spinlock.h> 9#include <linux/spinlock.h>
8#include <linux/debugfs.h> 10#include <linux/debugfs.h>
9#include <linux/uaccess.h> 11#include <linux/uaccess.h>
12#include <linux/hardirq.h>
10#include <linux/module.h> 13#include <linux/module.h>
11#include <linux/percpu.h> 14#include <linux/percpu.h>
12#include <linux/mutex.h> 15#include <linux/mutex.h>
13#include <linux/sched.h> /* used for sched_clock() (for now) */
14#include <linux/init.h> 16#include <linux/init.h>
15#include <linux/hash.h> 17#include <linux/hash.h>
16#include <linux/list.h> 18#include <linux/list.h>
19#include <linux/cpu.h>
17#include <linux/fs.h> 20#include <linux/fs.h>
18 21
19#include "trace.h" 22#include "trace.h"
20 23
21/* 24/*
25 * The ring buffer is made up of a list of pages. A separate list of pages is
26 * allocated for each CPU. A writer may only write to a buffer that is
27 * associated with the CPU it is currently executing on. A reader may read
28 * from any per cpu buffer.
29 *
30 * The reader is special. For each per cpu buffer, the reader has its own
31 * reader page. When a reader has read the entire reader page, this reader
32 * page is swapped with another page in the ring buffer.
33 *
34 * Now, as long as the writer is off the reader page, the reader can do what
35 * ever it wants with that page. The writer will never write to that page
36 * again (as long as it is out of the ring buffer).
37 *
38 * Here's some silly ASCII art.
39 *
40 * +------+
41 * |reader| RING BUFFER
42 * |page |
43 * +------+ +---+ +---+ +---+
44 * | |-->| |-->| |
45 * +---+ +---+ +---+
46 * ^ |
47 * | |
48 * +---------------+
49 *
50 *
51 * +------+
52 * |reader| RING BUFFER
53 * |page |------------------v
54 * +------+ +---+ +---+ +---+
55 * | |-->| |-->| |
56 * +---+ +---+ +---+
57 * ^ |
58 * | |
59 * +---------------+
60 *
61 *
62 * +------+
63 * |reader| RING BUFFER
64 * |page |------------------v
65 * +------+ +---+ +---+ +---+
66 * ^ | |-->| |-->| |
67 * | +---+ +---+ +---+
68 * | |
69 * | |
70 * +------------------------------+
71 *
72 *
73 * +------+
74 * |buffer| RING BUFFER
75 * |page |------------------v
76 * +------+ +---+ +---+ +---+
77 * ^ | | | |-->| |
78 * | New +---+ +---+ +---+
79 * | Reader------^ |
80 * | page |
81 * +------------------------------+
82 *
83 *
84 * After we make this swap, the reader can hand this page off to the splice
85 * code and be done with it. It can even allocate a new page if it needs to
86 * and swap that into the ring buffer.
87 *
88 * We will be using cmpxchg soon to make all this lockless.
89 *
90 */
91
92/*
22 * A fast way to enable or disable all ring buffers is to 93 * A fast way to enable or disable all ring buffers is to
23 * call tracing_on or tracing_off. Turning off the ring buffers 94 * call tracing_on or tracing_off. Turning off the ring buffers
24 * prevents all ring buffers from being recorded to. 95 * prevents all ring buffers from being recorded to.
@@ -57,7 +128,9 @@ enum {
57 RB_BUFFERS_DISABLED = 1 << RB_BUFFERS_DISABLED_BIT, 128 RB_BUFFERS_DISABLED = 1 << RB_BUFFERS_DISABLED_BIT,
58}; 129};
59 130
60static long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; 131static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
132
133#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)
61 134
62/** 135/**
63 * tracing_on - enable all tracing buffers 136 * tracing_on - enable all tracing buffers
@@ -89,42 +162,26 @@ EXPORT_SYMBOL_GPL(tracing_off);
89 * tracing_off_permanent - permanently disable ring buffers 162 * tracing_off_permanent - permanently disable ring buffers
90 * 163 *
91 * This function, once called, will disable all ring buffers 164 * This function, once called, will disable all ring buffers
92 * permanenty. 165 * permanently.
93 */ 166 */
94void tracing_off_permanent(void) 167void tracing_off_permanent(void)
95{ 168{
96 set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags); 169 set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags);
97} 170}
98 171
99#include "trace.h" 172/**
100 173 * tracing_is_on - show state of ring buffers enabled
101/* Up this if you want to test the TIME_EXTENTS and normalization */ 174 */
102#define DEBUG_SHIFT 0 175int tracing_is_on(void)
103
104/* FIXME!!! */
105u64 ring_buffer_time_stamp(int cpu)
106{ 176{
107 u64 time; 177 return ring_buffer_flags == RB_BUFFERS_ON;
108
109 preempt_disable_notrace();
110 /* shift to debug/test normalization and TIME_EXTENTS */
111 time = sched_clock() << DEBUG_SHIFT;
112 preempt_enable_no_resched_notrace();
113
114 return time;
115} 178}
116EXPORT_SYMBOL_GPL(ring_buffer_time_stamp); 179EXPORT_SYMBOL_GPL(tracing_is_on);
117 180
118void ring_buffer_normalize_time_stamp(int cpu, u64 *ts) 181#include "trace.h"
119{
120 /* Just stupid testing the normalize function and deltas */
121 *ts >>= DEBUG_SHIFT;
122}
123EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
124 182
125#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event)) 183#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
126#define RB_ALIGNMENT_SHIFT 2 184#define RB_ALIGNMENT 4U
127#define RB_ALIGNMENT (1 << RB_ALIGNMENT_SHIFT)
128#define RB_MAX_SMALL_DATA 28 185#define RB_MAX_SMALL_DATA 28
129 186
130enum { 187enum {
@@ -133,7 +190,7 @@ enum {
133}; 190};
134 191
135/* inline for ring buffer fast paths */ 192/* inline for ring buffer fast paths */
136static inline unsigned 193static unsigned
137rb_event_length(struct ring_buffer_event *event) 194rb_event_length(struct ring_buffer_event *event)
138{ 195{
139 unsigned length; 196 unsigned length;
@@ -151,7 +208,7 @@ rb_event_length(struct ring_buffer_event *event)
151 208
152 case RINGBUF_TYPE_DATA: 209 case RINGBUF_TYPE_DATA:
153 if (event->len) 210 if (event->len)
154 length = event->len << RB_ALIGNMENT_SHIFT; 211 length = event->len * RB_ALIGNMENT;
155 else 212 else
156 length = event->array[0]; 213 length = event->array[0];
157 return length + RB_EVNT_HDR_SIZE; 214 return length + RB_EVNT_HDR_SIZE;
@@ -179,7 +236,7 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event)
179EXPORT_SYMBOL_GPL(ring_buffer_event_length); 236EXPORT_SYMBOL_GPL(ring_buffer_event_length);
180 237
181/* inline for ring buffer fast paths */ 238/* inline for ring buffer fast paths */
182static inline void * 239static void *
183rb_event_data(struct ring_buffer_event *event) 240rb_event_data(struct ring_buffer_event *event)
184{ 241{
185 BUG_ON(event->type != RINGBUF_TYPE_DATA); 242 BUG_ON(event->type != RINGBUF_TYPE_DATA);
@@ -209,7 +266,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
209 266
210struct buffer_data_page { 267struct buffer_data_page {
211 u64 time_stamp; /* page time stamp */ 268 u64 time_stamp; /* page time stamp */
212 local_t commit; /* write commited index */ 269 local_t commit; /* write committed index */
213 unsigned char data[]; /* data of buffer page */ 270 unsigned char data[]; /* data of buffer page */
214}; 271};
215 272
@@ -225,14 +282,25 @@ static void rb_init_page(struct buffer_data_page *bpage)
225 local_set(&bpage->commit, 0); 282 local_set(&bpage->commit, 0);
226} 283}
227 284
285/**
286 * ring_buffer_page_len - the size of data on the page.
287 * @page: The page to read
288 *
289 * Returns the amount of data on the page, including buffer page header.
290 */
291size_t ring_buffer_page_len(void *page)
292{
293 return local_read(&((struct buffer_data_page *)page)->commit)
294 + BUF_PAGE_HDR_SIZE;
295}
296
228/* 297/*
229 * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing 298 * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
230 * this issue out. 299 * this issue out.
231 */ 300 */
232static inline void free_buffer_page(struct buffer_page *bpage) 301static void free_buffer_page(struct buffer_page *bpage)
233{ 302{
234 if (bpage->page) 303 free_page((unsigned long)bpage->page);
235 free_page((unsigned long)bpage->page);
236 kfree(bpage); 304 kfree(bpage);
237} 305}
238 306
@@ -246,7 +314,7 @@ static inline int test_time_stamp(u64 delta)
246 return 0; 314 return 0;
247} 315}
248 316
249#define BUF_PAGE_SIZE (PAGE_SIZE - offsetof(struct buffer_data_page, data)) 317#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
250 318
251/* 319/*
252 * head_page == tail_page && head == tail then buffer is empty. 320 * head_page == tail_page && head == tail then buffer is empty.
@@ -260,7 +328,7 @@ struct ring_buffer_per_cpu {
260 struct list_head pages; 328 struct list_head pages;
261 struct buffer_page *head_page; /* read from head */ 329 struct buffer_page *head_page; /* read from head */
262 struct buffer_page *tail_page; /* write to tail */ 330 struct buffer_page *tail_page; /* write to tail */
263 struct buffer_page *commit_page; /* commited pages */ 331 struct buffer_page *commit_page; /* committed pages */
264 struct buffer_page *reader_page; 332 struct buffer_page *reader_page;
265 unsigned long overrun; 333 unsigned long overrun;
266 unsigned long entries; 334 unsigned long entries;
@@ -273,12 +341,17 @@ struct ring_buffer {
273 unsigned pages; 341 unsigned pages;
274 unsigned flags; 342 unsigned flags;
275 int cpus; 343 int cpus;
276 cpumask_var_t cpumask;
277 atomic_t record_disabled; 344 atomic_t record_disabled;
345 cpumask_var_t cpumask;
278 346
279 struct mutex mutex; 347 struct mutex mutex;
280 348
281 struct ring_buffer_per_cpu **buffers; 349 struct ring_buffer_per_cpu **buffers;
350
351#ifdef CONFIG_HOTPLUG_CPU
352 struct notifier_block cpu_notify;
353#endif
354 u64 (*clock)(void);
282}; 355};
283 356
284struct ring_buffer_iter { 357struct ring_buffer_iter {
@@ -299,11 +372,35 @@ struct ring_buffer_iter {
299 _____ret; \ 372 _____ret; \
300 }) 373 })
301 374
375/* Up this if you want to test the TIME_EXTENTS and normalization */
376#define DEBUG_SHIFT 0
377
378u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
379{
380 u64 time;
381
382 preempt_disable_notrace();
383 /* shift to debug/test normalization and TIME_EXTENTS */
384 time = buffer->clock() << DEBUG_SHIFT;
385 preempt_enable_no_resched_notrace();
386
387 return time;
388}
389EXPORT_SYMBOL_GPL(ring_buffer_time_stamp);
390
391void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
392 int cpu, u64 *ts)
393{
394 /* Just stupid testing the normalize function and deltas */
395 *ts >>= DEBUG_SHIFT;
396}
397EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
398
302/** 399/**
303 * check_pages - integrity check of buffer pages 400 * check_pages - integrity check of buffer pages
304 * @cpu_buffer: CPU buffer with pages to test 401 * @cpu_buffer: CPU buffer with pages to test
305 * 402 *
306 * As a safty measure we check to make sure the data pages have not 403 * As a safety measure we check to make sure the data pages have not
307 * been corrupted. 404 * been corrupted.
308 */ 405 */
309static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) 406static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
@@ -437,6 +534,11 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
437 */ 534 */
438extern int ring_buffer_page_too_big(void); 535extern int ring_buffer_page_too_big(void);
439 536
537#ifdef CONFIG_HOTPLUG_CPU
538static int __cpuinit rb_cpu_notify(struct notifier_block *self,
539 unsigned long action, void *hcpu);
540#endif
541
440/** 542/**
441 * ring_buffer_alloc - allocate a new ring_buffer 543 * ring_buffer_alloc - allocate a new ring_buffer
442 * @size: the size in bytes per cpu that is needed. 544 * @size: the size in bytes per cpu that is needed.
@@ -469,12 +571,23 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
469 571
470 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); 572 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
471 buffer->flags = flags; 573 buffer->flags = flags;
574 buffer->clock = trace_clock_local;
472 575
473 /* need at least two pages */ 576 /* need at least two pages */
474 if (buffer->pages == 1) 577 if (buffer->pages == 1)
475 buffer->pages++; 578 buffer->pages++;
476 579
580 /*
581 * In case of non-hotplug cpu, if the ring-buffer is allocated
582 * in early initcall, it will not be notified of secondary cpus.
583 * In that off case, we need to allocate for all possible cpus.
584 */
585#ifdef CONFIG_HOTPLUG_CPU
586 get_online_cpus();
587 cpumask_copy(buffer->cpumask, cpu_online_mask);
588#else
477 cpumask_copy(buffer->cpumask, cpu_possible_mask); 589 cpumask_copy(buffer->cpumask, cpu_possible_mask);
590#endif
478 buffer->cpus = nr_cpu_ids; 591 buffer->cpus = nr_cpu_ids;
479 592
480 bsize = sizeof(void *) * nr_cpu_ids; 593 bsize = sizeof(void *) * nr_cpu_ids;
@@ -490,6 +603,13 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
490 goto fail_free_buffers; 603 goto fail_free_buffers;
491 } 604 }
492 605
606#ifdef CONFIG_HOTPLUG_CPU
607 buffer->cpu_notify.notifier_call = rb_cpu_notify;
608 buffer->cpu_notify.priority = 0;
609 register_cpu_notifier(&buffer->cpu_notify);
610#endif
611
612 put_online_cpus();
493 mutex_init(&buffer->mutex); 613 mutex_init(&buffer->mutex);
494 614
495 return buffer; 615 return buffer;
@@ -503,6 +623,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
503 623
504 fail_free_cpumask: 624 fail_free_cpumask:
505 free_cpumask_var(buffer->cpumask); 625 free_cpumask_var(buffer->cpumask);
626 put_online_cpus();
506 627
507 fail_free_buffer: 628 fail_free_buffer:
508 kfree(buffer); 629 kfree(buffer);
@@ -519,15 +640,29 @@ ring_buffer_free(struct ring_buffer *buffer)
519{ 640{
520 int cpu; 641 int cpu;
521 642
643 get_online_cpus();
644
645#ifdef CONFIG_HOTPLUG_CPU
646 unregister_cpu_notifier(&buffer->cpu_notify);
647#endif
648
522 for_each_buffer_cpu(buffer, cpu) 649 for_each_buffer_cpu(buffer, cpu)
523 rb_free_cpu_buffer(buffer->buffers[cpu]); 650 rb_free_cpu_buffer(buffer->buffers[cpu]);
524 651
652 put_online_cpus();
653
525 free_cpumask_var(buffer->cpumask); 654 free_cpumask_var(buffer->cpumask);
526 655
527 kfree(buffer); 656 kfree(buffer);
528} 657}
529EXPORT_SYMBOL_GPL(ring_buffer_free); 658EXPORT_SYMBOL_GPL(ring_buffer_free);
530 659
660void ring_buffer_set_clock(struct ring_buffer *buffer,
661 u64 (*clock)(void))
662{
663 buffer->clock = clock;
664}
665
531static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); 666static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
532 667
533static void 668static void
@@ -627,16 +762,15 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
627 return size; 762 return size;
628 763
629 mutex_lock(&buffer->mutex); 764 mutex_lock(&buffer->mutex);
765 get_online_cpus();
630 766
631 nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); 767 nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
632 768
633 if (size < buffer_size) { 769 if (size < buffer_size) {
634 770
635 /* easy case, just free pages */ 771 /* easy case, just free pages */
636 if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) { 772 if (RB_WARN_ON(buffer, nr_pages >= buffer->pages))
637 mutex_unlock(&buffer->mutex); 773 goto out_fail;
638 return -1;
639 }
640 774
641 rm_pages = buffer->pages - nr_pages; 775 rm_pages = buffer->pages - nr_pages;
642 776
@@ -655,10 +789,8 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
655 * add these pages to the cpu_buffers. Otherwise we just free 789 * add these pages to the cpu_buffers. Otherwise we just free
656 * them all and return -ENOMEM; 790 * them all and return -ENOMEM;
657 */ 791 */
658 if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) { 792 if (RB_WARN_ON(buffer, nr_pages <= buffer->pages))
659 mutex_unlock(&buffer->mutex); 793 goto out_fail;
660 return -1;
661 }
662 794
663 new_pages = nr_pages - buffer->pages; 795 new_pages = nr_pages - buffer->pages;
664 796
@@ -683,13 +815,12 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
683 rb_insert_pages(cpu_buffer, &pages, new_pages); 815 rb_insert_pages(cpu_buffer, &pages, new_pages);
684 } 816 }
685 817
686 if (RB_WARN_ON(buffer, !list_empty(&pages))) { 818 if (RB_WARN_ON(buffer, !list_empty(&pages)))
687 mutex_unlock(&buffer->mutex); 819 goto out_fail;
688 return -1;
689 }
690 820
691 out: 821 out:
692 buffer->pages = nr_pages; 822 buffer->pages = nr_pages;
823 put_online_cpus();
693 mutex_unlock(&buffer->mutex); 824 mutex_unlock(&buffer->mutex);
694 825
695 return size; 826 return size;
@@ -699,8 +830,18 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
699 list_del_init(&bpage->list); 830 list_del_init(&bpage->list);
700 free_buffer_page(bpage); 831 free_buffer_page(bpage);
701 } 832 }
833 put_online_cpus();
702 mutex_unlock(&buffer->mutex); 834 mutex_unlock(&buffer->mutex);
703 return -ENOMEM; 835 return -ENOMEM;
836
837 /*
838 * Something went totally wrong, and we are too paranoid
839 * to even clean up the mess.
840 */
841 out_fail:
842 put_online_cpus();
843 mutex_unlock(&buffer->mutex);
844 return -1;
704} 845}
705EXPORT_SYMBOL_GPL(ring_buffer_resize); 846EXPORT_SYMBOL_GPL(ring_buffer_resize);
706 847
@@ -811,7 +952,7 @@ rb_event_index(struct ring_buffer_event *event)
811 return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE); 952 return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE);
812} 953}
813 954
814static inline int 955static int
815rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, 956rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
816 struct ring_buffer_event *event) 957 struct ring_buffer_event *event)
817{ 958{
@@ -825,7 +966,7 @@ rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
825 rb_commit_index(cpu_buffer) == index; 966 rb_commit_index(cpu_buffer) == index;
826} 967}
827 968
828static inline void 969static void
829rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer, 970rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
830 struct ring_buffer_event *event) 971 struct ring_buffer_event *event)
831{ 972{
@@ -850,7 +991,7 @@ rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
850 local_set(&cpu_buffer->commit_page->page->commit, index); 991 local_set(&cpu_buffer->commit_page->page->commit, index);
851} 992}
852 993
853static inline void 994static void
854rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) 995rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
855{ 996{
856 /* 997 /*
@@ -896,7 +1037,7 @@ static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
896 cpu_buffer->reader_page->read = 0; 1037 cpu_buffer->reader_page->read = 0;
897} 1038}
898 1039
899static inline void rb_inc_iter(struct ring_buffer_iter *iter) 1040static void rb_inc_iter(struct ring_buffer_iter *iter)
900{ 1041{
901 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 1042 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
902 1043
@@ -926,7 +1067,7 @@ static inline void rb_inc_iter(struct ring_buffer_iter *iter)
926 * and with this, we can determine what to place into the 1067 * and with this, we can determine what to place into the
927 * data field. 1068 * data field.
928 */ 1069 */
929static inline void 1070static void
930rb_update_event(struct ring_buffer_event *event, 1071rb_update_event(struct ring_buffer_event *event,
931 unsigned type, unsigned length) 1072 unsigned type, unsigned length)
932{ 1073{
@@ -938,15 +1079,11 @@ rb_update_event(struct ring_buffer_event *event,
938 break; 1079 break;
939 1080
940 case RINGBUF_TYPE_TIME_EXTEND: 1081 case RINGBUF_TYPE_TIME_EXTEND:
941 event->len = 1082 event->len = DIV_ROUND_UP(RB_LEN_TIME_EXTEND, RB_ALIGNMENT);
942 (RB_LEN_TIME_EXTEND + (RB_ALIGNMENT-1))
943 >> RB_ALIGNMENT_SHIFT;
944 break; 1083 break;
945 1084
946 case RINGBUF_TYPE_TIME_STAMP: 1085 case RINGBUF_TYPE_TIME_STAMP:
947 event->len = 1086 event->len = DIV_ROUND_UP(RB_LEN_TIME_STAMP, RB_ALIGNMENT);
948 (RB_LEN_TIME_STAMP + (RB_ALIGNMENT-1))
949 >> RB_ALIGNMENT_SHIFT;
950 break; 1087 break;
951 1088
952 case RINGBUF_TYPE_DATA: 1089 case RINGBUF_TYPE_DATA:
@@ -955,16 +1092,14 @@ rb_update_event(struct ring_buffer_event *event,
955 event->len = 0; 1092 event->len = 0;
956 event->array[0] = length; 1093 event->array[0] = length;
957 } else 1094 } else
958 event->len = 1095 event->len = DIV_ROUND_UP(length, RB_ALIGNMENT);
959 (length + (RB_ALIGNMENT-1))
960 >> RB_ALIGNMENT_SHIFT;
961 break; 1096 break;
962 default: 1097 default:
963 BUG(); 1098 BUG();
964 } 1099 }
965} 1100}
966 1101
967static inline unsigned rb_calculate_event_length(unsigned length) 1102static unsigned rb_calculate_event_length(unsigned length)
968{ 1103{
969 struct ring_buffer_event event; /* Used only for sizeof array */ 1104 struct ring_buffer_event event; /* Used only for sizeof array */
970 1105
@@ -990,6 +1125,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
990 struct ring_buffer *buffer = cpu_buffer->buffer; 1125 struct ring_buffer *buffer = cpu_buffer->buffer;
991 struct ring_buffer_event *event; 1126 struct ring_buffer_event *event;
992 unsigned long flags; 1127 unsigned long flags;
1128 bool lock_taken = false;
993 1129
994 commit_page = cpu_buffer->commit_page; 1130 commit_page = cpu_buffer->commit_page;
995 /* we just need to protect against interrupts */ 1131 /* we just need to protect against interrupts */
@@ -1003,7 +1139,30 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1003 struct buffer_page *next_page = tail_page; 1139 struct buffer_page *next_page = tail_page;
1004 1140
1005 local_irq_save(flags); 1141 local_irq_save(flags);
1006 __raw_spin_lock(&cpu_buffer->lock); 1142 /*
1143 * Since the write to the buffer is still not
1144 * fully lockless, we must be careful with NMIs.
1145 * The locks in the writers are taken when a write
1146 * crosses to a new page. The locks protect against
1147 * races with the readers (this will soon be fixed
1148 * with a lockless solution).
1149 *
1150 * Because we can not protect against NMIs, and we
1151 * want to keep traces reentrant, we need to manage
1152 * what happens when we are in an NMI.
1153 *
1154 * NMIs can happen after we take the lock.
1155 * If we are in an NMI, only take the lock
1156 * if it is not already taken. Otherwise
1157 * simply fail.
1158 */
1159 if (unlikely(in_nmi())) {
1160 if (!__raw_spin_trylock(&cpu_buffer->lock))
1161 goto out_reset;
1162 } else
1163 __raw_spin_lock(&cpu_buffer->lock);
1164
1165 lock_taken = true;
1007 1166
1008 rb_inc_page(cpu_buffer, &next_page); 1167 rb_inc_page(cpu_buffer, &next_page);
1009 1168
@@ -1012,7 +1171,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1012 1171
1013 /* we grabbed the lock before incrementing */ 1172 /* we grabbed the lock before incrementing */
1014 if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) 1173 if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
1015 goto out_unlock; 1174 goto out_reset;
1016 1175
1017 /* 1176 /*
1018 * If for some reason, we had an interrupt storm that made 1177 * If for some reason, we had an interrupt storm that made
@@ -1021,12 +1180,12 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1021 */ 1180 */
1022 if (unlikely(next_page == commit_page)) { 1181 if (unlikely(next_page == commit_page)) {
1023 WARN_ON_ONCE(1); 1182 WARN_ON_ONCE(1);
1024 goto out_unlock; 1183 goto out_reset;
1025 } 1184 }
1026 1185
1027 if (next_page == head_page) { 1186 if (next_page == head_page) {
1028 if (!(buffer->flags & RB_FL_OVERWRITE)) 1187 if (!(buffer->flags & RB_FL_OVERWRITE))
1029 goto out_unlock; 1188 goto out_reset;
1030 1189
1031 /* tail_page has not moved yet? */ 1190 /* tail_page has not moved yet? */
1032 if (tail_page == cpu_buffer->tail_page) { 1191 if (tail_page == cpu_buffer->tail_page) {
@@ -1050,7 +1209,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1050 cpu_buffer->tail_page = next_page; 1209 cpu_buffer->tail_page = next_page;
1051 1210
1052 /* reread the time stamp */ 1211 /* reread the time stamp */
1053 *ts = ring_buffer_time_stamp(cpu_buffer->cpu); 1212 *ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu);
1054 cpu_buffer->tail_page->page->time_stamp = *ts; 1213 cpu_buffer->tail_page->page->time_stamp = *ts;
1055 } 1214 }
1056 1215
@@ -1100,12 +1259,13 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1100 1259
1101 return event; 1260 return event;
1102 1261
1103 out_unlock: 1262 out_reset:
1104 /* reset write */ 1263 /* reset write */
1105 if (tail <= BUF_PAGE_SIZE) 1264 if (tail <= BUF_PAGE_SIZE)
1106 local_set(&tail_page->write, tail); 1265 local_set(&tail_page->write, tail);
1107 1266
1108 __raw_spin_unlock(&cpu_buffer->lock); 1267 if (likely(lock_taken))
1268 __raw_spin_unlock(&cpu_buffer->lock);
1109 local_irq_restore(flags); 1269 local_irq_restore(flags);
1110 return NULL; 1270 return NULL;
1111} 1271}
@@ -1192,7 +1352,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1192 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) 1352 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
1193 return NULL; 1353 return NULL;
1194 1354
1195 ts = ring_buffer_time_stamp(cpu_buffer->cpu); 1355 ts = ring_buffer_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
1196 1356
1197 /* 1357 /*
1198 * Only the first commit can update the timestamp. 1358 * Only the first commit can update the timestamp.
@@ -1265,7 +1425,6 @@ static DEFINE_PER_CPU(int, rb_need_resched);
1265 * ring_buffer_lock_reserve - reserve a part of the buffer 1425 * ring_buffer_lock_reserve - reserve a part of the buffer
1266 * @buffer: the ring buffer to reserve from 1426 * @buffer: the ring buffer to reserve from
1267 * @length: the length of the data to reserve (excluding event header) 1427 * @length: the length of the data to reserve (excluding event header)
1268 * @flags: a pointer to save the interrupt flags
1269 * 1428 *
1270 * Returns a reseverd event on the ring buffer to copy directly to. 1429 * Returns a reseverd event on the ring buffer to copy directly to.
1271 * The user of this interface will need to get the body to write into 1430 * The user of this interface will need to get the body to write into
@@ -1278,9 +1437,7 @@ static DEFINE_PER_CPU(int, rb_need_resched);
1278 * If NULL is returned, then nothing has been allocated or locked. 1437 * If NULL is returned, then nothing has been allocated or locked.
1279 */ 1438 */
1280struct ring_buffer_event * 1439struct ring_buffer_event *
1281ring_buffer_lock_reserve(struct ring_buffer *buffer, 1440ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
1282 unsigned long length,
1283 unsigned long *flags)
1284{ 1441{
1285 struct ring_buffer_per_cpu *cpu_buffer; 1442 struct ring_buffer_per_cpu *cpu_buffer;
1286 struct ring_buffer_event *event; 1443 struct ring_buffer_event *event;
@@ -1347,15 +1504,13 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
1347 * ring_buffer_unlock_commit - commit a reserved 1504 * ring_buffer_unlock_commit - commit a reserved
1348 * @buffer: The buffer to commit to 1505 * @buffer: The buffer to commit to
1349 * @event: The event pointer to commit. 1506 * @event: The event pointer to commit.
1350 * @flags: the interrupt flags received from ring_buffer_lock_reserve.
1351 * 1507 *
1352 * This commits the data to the ring buffer, and releases any locks held. 1508 * This commits the data to the ring buffer, and releases any locks held.
1353 * 1509 *
1354 * Must be paired with ring_buffer_lock_reserve. 1510 * Must be paired with ring_buffer_lock_reserve.
1355 */ 1511 */
1356int ring_buffer_unlock_commit(struct ring_buffer *buffer, 1512int ring_buffer_unlock_commit(struct ring_buffer *buffer,
1357 struct ring_buffer_event *event, 1513 struct ring_buffer_event *event)
1358 unsigned long flags)
1359{ 1514{
1360 struct ring_buffer_per_cpu *cpu_buffer; 1515 struct ring_buffer_per_cpu *cpu_buffer;
1361 int cpu = raw_smp_processor_id(); 1516 int cpu = raw_smp_processor_id();
@@ -1438,7 +1593,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
1438} 1593}
1439EXPORT_SYMBOL_GPL(ring_buffer_write); 1594EXPORT_SYMBOL_GPL(ring_buffer_write);
1440 1595
1441static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) 1596static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
1442{ 1597{
1443 struct buffer_page *reader = cpu_buffer->reader_page; 1598 struct buffer_page *reader = cpu_buffer->reader_page;
1444 struct buffer_page *head = cpu_buffer->head_page; 1599 struct buffer_page *head = cpu_buffer->head_page;
@@ -1528,12 +1683,15 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
1528unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) 1683unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
1529{ 1684{
1530 struct ring_buffer_per_cpu *cpu_buffer; 1685 struct ring_buffer_per_cpu *cpu_buffer;
1686 unsigned long ret;
1531 1687
1532 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 1688 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1533 return 0; 1689 return 0;
1534 1690
1535 cpu_buffer = buffer->buffers[cpu]; 1691 cpu_buffer = buffer->buffers[cpu];
1536 return cpu_buffer->entries; 1692 ret = cpu_buffer->entries;
1693
1694 return ret;
1537} 1695}
1538EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); 1696EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
1539 1697
@@ -1545,12 +1703,15 @@ EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
1545unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) 1703unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
1546{ 1704{
1547 struct ring_buffer_per_cpu *cpu_buffer; 1705 struct ring_buffer_per_cpu *cpu_buffer;
1706 unsigned long ret;
1548 1707
1549 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 1708 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1550 return 0; 1709 return 0;
1551 1710
1552 cpu_buffer = buffer->buffers[cpu]; 1711 cpu_buffer = buffer->buffers[cpu];
1553 return cpu_buffer->overrun; 1712 ret = cpu_buffer->overrun;
1713
1714 return ret;
1554} 1715}
1555EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); 1716EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
1556 1717
@@ -1627,9 +1788,14 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
1627 */ 1788 */
1628void ring_buffer_iter_reset(struct ring_buffer_iter *iter) 1789void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
1629{ 1790{
1630 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 1791 struct ring_buffer_per_cpu *cpu_buffer;
1631 unsigned long flags; 1792 unsigned long flags;
1632 1793
1794 if (!iter)
1795 return;
1796
1797 cpu_buffer = iter->cpu_buffer;
1798
1633 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 1799 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
1634 rb_iter_reset(iter); 1800 rb_iter_reset(iter);
1635 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 1801 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
@@ -1864,9 +2030,6 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1864 struct buffer_page *reader; 2030 struct buffer_page *reader;
1865 int nr_loops = 0; 2031 int nr_loops = 0;
1866 2032
1867 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1868 return NULL;
1869
1870 cpu_buffer = buffer->buffers[cpu]; 2033 cpu_buffer = buffer->buffers[cpu];
1871 2034
1872 again: 2035 again:
@@ -1906,7 +2069,8 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1906 case RINGBUF_TYPE_DATA: 2069 case RINGBUF_TYPE_DATA:
1907 if (ts) { 2070 if (ts) {
1908 *ts = cpu_buffer->read_stamp + event->time_delta; 2071 *ts = cpu_buffer->read_stamp + event->time_delta;
1909 ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts); 2072 ring_buffer_normalize_time_stamp(buffer,
2073 cpu_buffer->cpu, ts);
1910 } 2074 }
1911 return event; 2075 return event;
1912 2076
@@ -1967,7 +2131,8 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1967 case RINGBUF_TYPE_DATA: 2131 case RINGBUF_TYPE_DATA:
1968 if (ts) { 2132 if (ts) {
1969 *ts = iter->read_stamp + event->time_delta; 2133 *ts = iter->read_stamp + event->time_delta;
1970 ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts); 2134 ring_buffer_normalize_time_stamp(buffer,
2135 cpu_buffer->cpu, ts);
1971 } 2136 }
1972 return event; 2137 return event;
1973 2138
@@ -1995,6 +2160,9 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1995 struct ring_buffer_event *event; 2160 struct ring_buffer_event *event;
1996 unsigned long flags; 2161 unsigned long flags;
1997 2162
2163 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2164 return NULL;
2165
1998 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2166 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
1999 event = rb_buffer_peek(buffer, cpu, ts); 2167 event = rb_buffer_peek(buffer, cpu, ts);
2000 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2168 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
@@ -2035,24 +2203,31 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2035struct ring_buffer_event * 2203struct ring_buffer_event *
2036ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) 2204ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2037{ 2205{
2038 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; 2206 struct ring_buffer_per_cpu *cpu_buffer;
2039 struct ring_buffer_event *event; 2207 struct ring_buffer_event *event = NULL;
2040 unsigned long flags; 2208 unsigned long flags;
2041 2209
2210 /* might be called in atomic */
2211 preempt_disable();
2212
2042 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2213 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2043 return NULL; 2214 goto out;
2044 2215
2216 cpu_buffer = buffer->buffers[cpu];
2045 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2217 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2046 2218
2047 event = rb_buffer_peek(buffer, cpu, ts); 2219 event = rb_buffer_peek(buffer, cpu, ts);
2048 if (!event) 2220 if (!event)
2049 goto out; 2221 goto out_unlock;
2050 2222
2051 rb_advance_reader(cpu_buffer); 2223 rb_advance_reader(cpu_buffer);
2052 2224
2053 out: 2225 out_unlock:
2054 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2226 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2055 2227
2228 out:
2229 preempt_enable();
2230
2056 return event; 2231 return event;
2057} 2232}
2058EXPORT_SYMBOL_GPL(ring_buffer_consume); 2233EXPORT_SYMBOL_GPL(ring_buffer_consume);
@@ -2232,6 +2407,7 @@ int ring_buffer_empty(struct ring_buffer *buffer)
2232 if (!rb_per_cpu_empty(cpu_buffer)) 2407 if (!rb_per_cpu_empty(cpu_buffer))
2233 return 0; 2408 return 0;
2234 } 2409 }
2410
2235 return 1; 2411 return 1;
2236} 2412}
2237EXPORT_SYMBOL_GPL(ring_buffer_empty); 2413EXPORT_SYMBOL_GPL(ring_buffer_empty);
@@ -2244,12 +2420,16 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty);
2244int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) 2420int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
2245{ 2421{
2246 struct ring_buffer_per_cpu *cpu_buffer; 2422 struct ring_buffer_per_cpu *cpu_buffer;
2423 int ret;
2247 2424
2248 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2425 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2249 return 1; 2426 return 1;
2250 2427
2251 cpu_buffer = buffer->buffers[cpu]; 2428 cpu_buffer = buffer->buffers[cpu];
2252 return rb_per_cpu_empty(cpu_buffer); 2429 ret = rb_per_cpu_empty(cpu_buffer);
2430
2431
2432 return ret;
2253} 2433}
2254EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); 2434EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
2255 2435
@@ -2268,18 +2448,36 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
2268{ 2448{
2269 struct ring_buffer_per_cpu *cpu_buffer_a; 2449 struct ring_buffer_per_cpu *cpu_buffer_a;
2270 struct ring_buffer_per_cpu *cpu_buffer_b; 2450 struct ring_buffer_per_cpu *cpu_buffer_b;
2451 int ret = -EINVAL;
2271 2452
2272 if (!cpumask_test_cpu(cpu, buffer_a->cpumask) || 2453 if (!cpumask_test_cpu(cpu, buffer_a->cpumask) ||
2273 !cpumask_test_cpu(cpu, buffer_b->cpumask)) 2454 !cpumask_test_cpu(cpu, buffer_b->cpumask))
2274 return -EINVAL; 2455 goto out;
2275 2456
2276 /* At least make sure the two buffers are somewhat the same */ 2457 /* At least make sure the two buffers are somewhat the same */
2277 if (buffer_a->pages != buffer_b->pages) 2458 if (buffer_a->pages != buffer_b->pages)
2278 return -EINVAL; 2459 goto out;
2460
2461 ret = -EAGAIN;
2462
2463 if (ring_buffer_flags != RB_BUFFERS_ON)
2464 goto out;
2465
2466 if (atomic_read(&buffer_a->record_disabled))
2467 goto out;
2468
2469 if (atomic_read(&buffer_b->record_disabled))
2470 goto out;
2279 2471
2280 cpu_buffer_a = buffer_a->buffers[cpu]; 2472 cpu_buffer_a = buffer_a->buffers[cpu];
2281 cpu_buffer_b = buffer_b->buffers[cpu]; 2473 cpu_buffer_b = buffer_b->buffers[cpu];
2282 2474
2475 if (atomic_read(&cpu_buffer_a->record_disabled))
2476 goto out;
2477
2478 if (atomic_read(&cpu_buffer_b->record_disabled))
2479 goto out;
2480
2283 /* 2481 /*
2284 * We can't do a synchronize_sched here because this 2482 * We can't do a synchronize_sched here because this
2285 * function can be called in atomic context. 2483 * function can be called in atomic context.
@@ -2298,18 +2496,21 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
2298 atomic_dec(&cpu_buffer_a->record_disabled); 2496 atomic_dec(&cpu_buffer_a->record_disabled);
2299 atomic_dec(&cpu_buffer_b->record_disabled); 2497 atomic_dec(&cpu_buffer_b->record_disabled);
2300 2498
2301 return 0; 2499 ret = 0;
2500out:
2501 return ret;
2302} 2502}
2303EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); 2503EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
2304 2504
2305static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, 2505static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
2306 struct buffer_data_page *bpage) 2506 struct buffer_data_page *bpage,
2507 unsigned int offset)
2307{ 2508{
2308 struct ring_buffer_event *event; 2509 struct ring_buffer_event *event;
2309 unsigned long head; 2510 unsigned long head;
2310 2511
2311 __raw_spin_lock(&cpu_buffer->lock); 2512 __raw_spin_lock(&cpu_buffer->lock);
2312 for (head = 0; head < local_read(&bpage->commit); 2513 for (head = offset; head < local_read(&bpage->commit);
2313 head += rb_event_length(event)) { 2514 head += rb_event_length(event)) {
2314 2515
2315 event = __rb_data_page_index(bpage, head); 2516 event = __rb_data_page_index(bpage, head);
@@ -2340,8 +2541,8 @@ static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
2340 */ 2541 */
2341void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) 2542void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
2342{ 2543{
2343 unsigned long addr;
2344 struct buffer_data_page *bpage; 2544 struct buffer_data_page *bpage;
2545 unsigned long addr;
2345 2546
2346 addr = __get_free_page(GFP_KERNEL); 2547 addr = __get_free_page(GFP_KERNEL);
2347 if (!addr) 2548 if (!addr)
@@ -2349,6 +2550,8 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
2349 2550
2350 bpage = (void *)addr; 2551 bpage = (void *)addr;
2351 2552
2553 rb_init_page(bpage);
2554
2352 return bpage; 2555 return bpage;
2353} 2556}
2354 2557
@@ -2368,6 +2571,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
2368 * ring_buffer_read_page - extract a page from the ring buffer 2571 * ring_buffer_read_page - extract a page from the ring buffer
2369 * @buffer: buffer to extract from 2572 * @buffer: buffer to extract from
2370 * @data_page: the page to use allocated from ring_buffer_alloc_read_page 2573 * @data_page: the page to use allocated from ring_buffer_alloc_read_page
2574 * @len: amount to extract
2371 * @cpu: the cpu of the buffer to extract 2575 * @cpu: the cpu of the buffer to extract
2372 * @full: should the extraction only happen when the page is full. 2576 * @full: should the extraction only happen when the page is full.
2373 * 2577 *
@@ -2377,12 +2581,12 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
2377 * to swap with a page in the ring buffer. 2581 * to swap with a page in the ring buffer.
2378 * 2582 *
2379 * for example: 2583 * for example:
2380 * rpage = ring_buffer_alloc_page(buffer); 2584 * rpage = ring_buffer_alloc_read_page(buffer);
2381 * if (!rpage) 2585 * if (!rpage)
2382 * return error; 2586 * return error;
2383 * ret = ring_buffer_read_page(buffer, &rpage, cpu, 0); 2587 * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
2384 * if (ret) 2588 * if (ret >= 0)
2385 * process_page(rpage); 2589 * process_page(rpage, ret);
2386 * 2590 *
2387 * When @full is set, the function will not return true unless 2591 * When @full is set, the function will not return true unless
2388 * the writer is off the reader page. 2592 * the writer is off the reader page.
@@ -2393,72 +2597,118 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
2393 * responsible for that. 2597 * responsible for that.
2394 * 2598 *
2395 * Returns: 2599 * Returns:
2396 * 1 if data has been transferred 2600 * >=0 if data has been transferred, returns the offset of consumed data.
2397 * 0 if no data has been transferred. 2601 * <0 if no data has been transferred.
2398 */ 2602 */
2399int ring_buffer_read_page(struct ring_buffer *buffer, 2603int ring_buffer_read_page(struct ring_buffer *buffer,
2400 void **data_page, int cpu, int full) 2604 void **data_page, size_t len, int cpu, int full)
2401{ 2605{
2402 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; 2606 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
2403 struct ring_buffer_event *event; 2607 struct ring_buffer_event *event;
2404 struct buffer_data_page *bpage; 2608 struct buffer_data_page *bpage;
2609 struct buffer_page *reader;
2405 unsigned long flags; 2610 unsigned long flags;
2406 int ret = 0; 2611 unsigned int commit;
2612 unsigned int read;
2613 u64 save_timestamp;
2614 int ret = -1;
2615
2616 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2617 goto out;
2618
2619 /*
2620 * If len is not big enough to hold the page header, then
2621 * we can not copy anything.
2622 */
2623 if (len <= BUF_PAGE_HDR_SIZE)
2624 goto out;
2625
2626 len -= BUF_PAGE_HDR_SIZE;
2407 2627
2408 if (!data_page) 2628 if (!data_page)
2409 return 0; 2629 goto out;
2410 2630
2411 bpage = *data_page; 2631 bpage = *data_page;
2412 if (!bpage) 2632 if (!bpage)
2413 return 0; 2633 goto out;
2414 2634
2415 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2635 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2416 2636
2417 /* 2637 reader = rb_get_reader_page(cpu_buffer);
2418 * rb_buffer_peek will get the next ring buffer if 2638 if (!reader)
2419 * the current reader page is empty. 2639 goto out_unlock;
2420 */ 2640
2421 event = rb_buffer_peek(buffer, cpu, NULL); 2641 event = rb_reader_event(cpu_buffer);
2422 if (!event) 2642
2423 goto out; 2643 read = reader->read;
2644 commit = rb_page_commit(reader);
2424 2645
2425 /* check for data */
2426 if (!local_read(&cpu_buffer->reader_page->page->commit))
2427 goto out;
2428 /* 2646 /*
2429 * If the writer is already off of the read page, then simply 2647 * If this page has been partially read or
2430 * switch the read page with the given page. Otherwise 2648 * if len is not big enough to read the rest of the page or
2431 * we need to copy the data from the reader to the writer. 2649 * a writer is still on the page, then
2650 * we must copy the data from the page to the buffer.
2651 * Otherwise, we can simply swap the page with the one passed in.
2432 */ 2652 */
2433 if (cpu_buffer->reader_page == cpu_buffer->commit_page) { 2653 if (read || (len < (commit - read)) ||
2434 unsigned int read = cpu_buffer->reader_page->read; 2654 cpu_buffer->reader_page == cpu_buffer->commit_page) {
2655 struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
2656 unsigned int rpos = read;
2657 unsigned int pos = 0;
2658 unsigned int size;
2435 2659
2436 if (full) 2660 if (full)
2437 goto out; 2661 goto out_unlock;
2438 /* The writer is still on the reader page, we must copy */ 2662
2439 bpage = cpu_buffer->reader_page->page; 2663 if (len > (commit - read))
2440 memcpy(bpage->data, 2664 len = (commit - read);
2441 cpu_buffer->reader_page->page->data + read, 2665
2442 local_read(&bpage->commit) - read); 2666 size = rb_event_length(event);
2443 2667
2444 /* consume what was read */ 2668 if (len < size)
2445 cpu_buffer->reader_page += read; 2669 goto out_unlock;
2670
2671 /* save the current timestamp, since the user will need it */
2672 save_timestamp = cpu_buffer->read_stamp;
2673
2674 /* Need to copy one event at a time */
2675 do {
2676 memcpy(bpage->data + pos, rpage->data + rpos, size);
2677
2678 len -= size;
2679
2680 rb_advance_reader(cpu_buffer);
2681 rpos = reader->read;
2682 pos += size;
2683
2684 event = rb_reader_event(cpu_buffer);
2685 size = rb_event_length(event);
2686 } while (len > size);
2687
2688 /* update bpage */
2689 local_set(&bpage->commit, pos);
2690 bpage->time_stamp = save_timestamp;
2446 2691
2692 /* we copied everything to the beginning */
2693 read = 0;
2447 } else { 2694 } else {
2448 /* swap the pages */ 2695 /* swap the pages */
2449 rb_init_page(bpage); 2696 rb_init_page(bpage);
2450 bpage = cpu_buffer->reader_page->page; 2697 bpage = reader->page;
2451 cpu_buffer->reader_page->page = *data_page; 2698 reader->page = *data_page;
2452 cpu_buffer->reader_page->read = 0; 2699 local_set(&reader->write, 0);
2700 reader->read = 0;
2453 *data_page = bpage; 2701 *data_page = bpage;
2702
2703 /* update the entry counter */
2704 rb_remove_entries(cpu_buffer, bpage, read);
2454 } 2705 }
2455 ret = 1; 2706 ret = read;
2456 2707
2457 /* update the entry counter */ 2708 out_unlock:
2458 rb_remove_entries(cpu_buffer, bpage);
2459 out:
2460 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2709 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2461 2710
2711 out:
2462 return ret; 2712 return ret;
2463} 2713}
2464 2714
@@ -2466,7 +2716,7 @@ static ssize_t
2466rb_simple_read(struct file *filp, char __user *ubuf, 2716rb_simple_read(struct file *filp, char __user *ubuf,
2467 size_t cnt, loff_t *ppos) 2717 size_t cnt, loff_t *ppos)
2468{ 2718{
2469 long *p = filp->private_data; 2719 unsigned long *p = filp->private_data;
2470 char buf[64]; 2720 char buf[64];
2471 int r; 2721 int r;
2472 2722
@@ -2482,9 +2732,9 @@ static ssize_t
2482rb_simple_write(struct file *filp, const char __user *ubuf, 2732rb_simple_write(struct file *filp, const char __user *ubuf,
2483 size_t cnt, loff_t *ppos) 2733 size_t cnt, loff_t *ppos)
2484{ 2734{
2485 long *p = filp->private_data; 2735 unsigned long *p = filp->private_data;
2486 char buf[64]; 2736 char buf[64];
2487 long val; 2737 unsigned long val;
2488 int ret; 2738 int ret;
2489 2739
2490 if (cnt >= sizeof(buf)) 2740 if (cnt >= sizeof(buf))
@@ -2509,7 +2759,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
2509 return cnt; 2759 return cnt;
2510} 2760}
2511 2761
2512static struct file_operations rb_simple_fops = { 2762static const struct file_operations rb_simple_fops = {
2513 .open = tracing_open_generic, 2763 .open = tracing_open_generic,
2514 .read = rb_simple_read, 2764 .read = rb_simple_read,
2515 .write = rb_simple_write, 2765 .write = rb_simple_write,
@@ -2532,3 +2782,42 @@ static __init int rb_init_debugfs(void)
2532} 2782}
2533 2783
2534fs_initcall(rb_init_debugfs); 2784fs_initcall(rb_init_debugfs);
2785
2786#ifdef CONFIG_HOTPLUG_CPU
2787static int __cpuinit rb_cpu_notify(struct notifier_block *self,
2788 unsigned long action, void *hcpu)
2789{
2790 struct ring_buffer *buffer =
2791 container_of(self, struct ring_buffer, cpu_notify);
2792 long cpu = (long)hcpu;
2793
2794 switch (action) {
2795 case CPU_UP_PREPARE:
2796 case CPU_UP_PREPARE_FROZEN:
2797 if (cpu_isset(cpu, *buffer->cpumask))
2798 return NOTIFY_OK;
2799
2800 buffer->buffers[cpu] =
2801 rb_allocate_cpu_buffer(buffer, cpu);
2802 if (!buffer->buffers[cpu]) {
2803 WARN(1, "failed to allocate ring buffer on CPU %ld\n",
2804 cpu);
2805 return NOTIFY_OK;
2806 }
2807 smp_wmb();
2808 cpu_set(cpu, *buffer->cpumask);
2809 break;
2810 case CPU_DOWN_PREPARE:
2811 case CPU_DOWN_PREPARE_FROZEN:
2812 /*
2813 * Do nothing.
2814 * If we were to free the buffer, then the user would
2815 * lose any trace that was in the buffer.
2816 */
2817 break;
2818 default:
2819 break;
2820 }
2821 return NOTIFY_OK;
2822}
2823#endif
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 17bb88d86ac2..c95b7292be70 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -11,32 +11,33 @@
11 * Copyright (C) 2004-2006 Ingo Molnar 11 * Copyright (C) 2004-2006 Ingo Molnar
12 * Copyright (C) 2004 William Lee Irwin III 12 * Copyright (C) 2004 William Lee Irwin III
13 */ 13 */
14#include <linux/ring_buffer.h>
14#include <linux/utsrelease.h> 15#include <linux/utsrelease.h>
16#include <linux/stacktrace.h>
17#include <linux/writeback.h>
15#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
16#include <linux/seq_file.h> 19#include <linux/seq_file.h>
17#include <linux/notifier.h> 20#include <linux/notifier.h>
21#include <linux/irqflags.h>
18#include <linux/debugfs.h> 22#include <linux/debugfs.h>
19#include <linux/pagemap.h> 23#include <linux/pagemap.h>
20#include <linux/hardirq.h> 24#include <linux/hardirq.h>
21#include <linux/linkage.h> 25#include <linux/linkage.h>
22#include <linux/uaccess.h> 26#include <linux/uaccess.h>
27#include <linux/kprobes.h>
23#include <linux/ftrace.h> 28#include <linux/ftrace.h>
24#include <linux/module.h> 29#include <linux/module.h>
25#include <linux/percpu.h> 30#include <linux/percpu.h>
31#include <linux/splice.h>
26#include <linux/kdebug.h> 32#include <linux/kdebug.h>
27#include <linux/ctype.h> 33#include <linux/ctype.h>
28#include <linux/init.h> 34#include <linux/init.h>
29#include <linux/poll.h> 35#include <linux/poll.h>
30#include <linux/gfp.h> 36#include <linux/gfp.h>
31#include <linux/fs.h> 37#include <linux/fs.h>
32#include <linux/kprobes.h>
33#include <linux/writeback.h>
34
35#include <linux/stacktrace.h>
36#include <linux/ring_buffer.h>
37#include <linux/irqflags.h>
38 38
39#include "trace.h" 39#include "trace.h"
40#include "trace_output.h"
40 41
41#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) 42#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE)
42 43
@@ -44,14 +45,25 @@ unsigned long __read_mostly tracing_max_latency;
44unsigned long __read_mostly tracing_thresh; 45unsigned long __read_mostly tracing_thresh;
45 46
46/* 47/*
48 * On boot up, the ring buffer is set to the minimum size, so that
49 * we do not waste memory on systems that are not using tracing.
50 */
51static int ring_buffer_expanded;
52
53/*
47 * We need to change this state when a selftest is running. 54 * We need to change this state when a selftest is running.
48 * A selftest will lurk into the ring-buffer to count the 55 * A selftest will lurk into the ring-buffer to count the
49 * entries inserted during the selftest although some concurrent 56 * entries inserted during the selftest although some concurrent
50 * insertions into the ring-buffer such as ftrace_printk could occurred 57 * insertions into the ring-buffer such as trace_printk could occurred
51 * at the same time, giving false positive or negative results. 58 * at the same time, giving false positive or negative results.
52 */ 59 */
53static bool __read_mostly tracing_selftest_running; 60static bool __read_mostly tracing_selftest_running;
54 61
62/*
63 * If a tracer is running, we do not want to run SELFTEST.
64 */
65static bool __read_mostly tracing_selftest_disabled;
66
55/* For tracers that don't implement custom flags */ 67/* For tracers that don't implement custom flags */
56static struct tracer_opt dummy_tracer_opt[] = { 68static struct tracer_opt dummy_tracer_opt[] = {
57 { } 69 { }
@@ -73,7 +85,7 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
73 * of the tracer is successful. But that is the only place that sets 85 * of the tracer is successful. But that is the only place that sets
74 * this back to zero. 86 * this back to zero.
75 */ 87 */
76int tracing_disabled = 1; 88static int tracing_disabled = 1;
77 89
78static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); 90static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
79 91
@@ -91,6 +103,9 @@ static inline void ftrace_enable_cpu(void)
91 103
92static cpumask_var_t __read_mostly tracing_buffer_mask; 104static cpumask_var_t __read_mostly tracing_buffer_mask;
93 105
106/* Define which cpu buffers are currently read in trace_pipe */
107static cpumask_var_t tracing_reader_cpumask;
108
94#define for_each_tracing_cpu(cpu) \ 109#define for_each_tracing_cpu(cpu) \
95 for_each_cpu(cpu, tracing_buffer_mask) 110 for_each_cpu(cpu, tracing_buffer_mask)
96 111
@@ -109,14 +124,21 @@ static cpumask_var_t __read_mostly tracing_buffer_mask;
109 */ 124 */
110int ftrace_dump_on_oops; 125int ftrace_dump_on_oops;
111 126
112static int tracing_set_tracer(char *buf); 127static int tracing_set_tracer(const char *buf);
128
129#define BOOTUP_TRACER_SIZE 100
130static char bootup_tracer_buf[BOOTUP_TRACER_SIZE] __initdata;
131static char *default_bootup_tracer;
113 132
114static int __init set_ftrace(char *str) 133static int __init set_ftrace(char *str)
115{ 134{
116 tracing_set_tracer(str); 135 strncpy(bootup_tracer_buf, str, BOOTUP_TRACER_SIZE);
136 default_bootup_tracer = bootup_tracer_buf;
137 /* We are using ftrace early, expand it */
138 ring_buffer_expanded = 1;
117 return 1; 139 return 1;
118} 140}
119__setup("ftrace", set_ftrace); 141__setup("ftrace=", set_ftrace);
120 142
121static int __init set_ftrace_dump_on_oops(char *str) 143static int __init set_ftrace_dump_on_oops(char *str)
122{ 144{
@@ -133,13 +155,6 @@ ns2usecs(cycle_t nsec)
133 return nsec; 155 return nsec;
134} 156}
135 157
136cycle_t ftrace_now(int cpu)
137{
138 u64 ts = ring_buffer_time_stamp(cpu);
139 ring_buffer_normalize_time_stamp(cpu, &ts);
140 return ts;
141}
142
143/* 158/*
144 * The global_trace is the descriptor that holds the tracing 159 * The global_trace is the descriptor that holds the tracing
145 * buffers for the live tracing. For each CPU, it contains 160 * buffers for the live tracing. For each CPU, it contains
@@ -156,6 +171,20 @@ static struct trace_array global_trace;
156 171
157static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); 172static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
158 173
174cycle_t ftrace_now(int cpu)
175{
176 u64 ts;
177
178 /* Early boot up does not have a buffer yet */
179 if (!global_trace.buffer)
180 return trace_clock_local();
181
182 ts = ring_buffer_time_stamp(global_trace.buffer, cpu);
183 ring_buffer_normalize_time_stamp(global_trace.buffer, cpu, &ts);
184
185 return ts;
186}
187
159/* 188/*
160 * The max_tr is used to snapshot the global_trace when a maximum 189 * The max_tr is used to snapshot the global_trace when a maximum
161 * latency is reached. Some tracers will use this to store a maximum 190 * latency is reached. Some tracers will use this to store a maximum
@@ -186,9 +215,6 @@ int tracing_is_enabled(void)
186 return tracer_enabled; 215 return tracer_enabled;
187} 216}
188 217
189/* function tracing enabled */
190int ftrace_function_enabled;
191
192/* 218/*
193 * trace_buf_size is the size in bytes that is allocated 219 * trace_buf_size is the size in bytes that is allocated
194 * for a buffer. Note, the number of bytes is always rounded 220 * for a buffer. Note, the number of bytes is always rounded
@@ -229,7 +255,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
229 255
230/* trace_flags holds trace_options default values */ 256/* trace_flags holds trace_options default values */
231unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 257unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
232 TRACE_ITER_ANNOTATE; 258 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO;
233 259
234/** 260/**
235 * trace_wake_up - wake up tasks waiting for trace input 261 * trace_wake_up - wake up tasks waiting for trace input
@@ -280,13 +306,16 @@ static const char *trace_options[] = {
280 "block", 306 "block",
281 "stacktrace", 307 "stacktrace",
282 "sched-tree", 308 "sched-tree",
283 "ftrace_printk", 309 "trace_printk",
284 "ftrace_preempt", 310 "ftrace_preempt",
285 "branch", 311 "branch",
286 "annotate", 312 "annotate",
287 "userstacktrace", 313 "userstacktrace",
288 "sym-userobj", 314 "sym-userobj",
289 "printk-msg-only", 315 "printk-msg-only",
316 "context-info",
317 "latency-format",
318 "global-clock",
290 NULL 319 NULL
291}; 320};
292 321
@@ -326,146 +355,37 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
326 data->rt_priority = tsk->rt_priority; 355 data->rt_priority = tsk->rt_priority;
327 356
328 /* record this tasks comm */ 357 /* record this tasks comm */
329 tracing_record_cmdline(current); 358 tracing_record_cmdline(tsk);
330} 359}
331 360
332/** 361ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
333 * trace_seq_printf - sequence printing of trace information
334 * @s: trace sequence descriptor
335 * @fmt: printf format string
336 *
337 * The tracer may use either sequence operations or its own
338 * copy to user routines. To simplify formating of a trace
339 * trace_seq_printf is used to store strings into a special
340 * buffer (@s). Then the output may be either used by
341 * the sequencer or pulled into another buffer.
342 */
343int
344trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
345{ 362{
346 int len = (PAGE_SIZE - 1) - s->len; 363 int len;
347 va_list ap;
348 int ret; 364 int ret;
349 365
350 if (!len) 366 if (!cnt)
351 return 0;
352
353 va_start(ap, fmt);
354 ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
355 va_end(ap);
356
357 /* If we can't write it all, don't bother writing anything */
358 if (ret >= len)
359 return 0;
360
361 s->len += ret;
362
363 return len;
364}
365
366/**
367 * trace_seq_puts - trace sequence printing of simple string
368 * @s: trace sequence descriptor
369 * @str: simple string to record
370 *
371 * The tracer may use either the sequence operations or its own
372 * copy to user routines. This function records a simple string
373 * into a special buffer (@s) for later retrieval by a sequencer
374 * or other mechanism.
375 */
376static int
377trace_seq_puts(struct trace_seq *s, const char *str)
378{
379 int len = strlen(str);
380
381 if (len > ((PAGE_SIZE - 1) - s->len))
382 return 0;
383
384 memcpy(s->buffer + s->len, str, len);
385 s->len += len;
386
387 return len;
388}
389
390static int
391trace_seq_putc(struct trace_seq *s, unsigned char c)
392{
393 if (s->len >= (PAGE_SIZE - 1))
394 return 0;
395
396 s->buffer[s->len++] = c;
397
398 return 1;
399}
400
401static int
402trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)
403{
404 if (len > ((PAGE_SIZE - 1) - s->len))
405 return 0; 367 return 0;
406 368
407 memcpy(s->buffer + s->len, mem, len); 369 if (s->len <= s->readpos)
408 s->len += len; 370 return -EBUSY;
409
410 return len;
411}
412
413#define MAX_MEMHEX_BYTES 8
414#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
415
416static int
417trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
418{
419 unsigned char hex[HEX_CHARS];
420 unsigned char *data = mem;
421 int i, j;
422
423#ifdef __BIG_ENDIAN
424 for (i = 0, j = 0; i < len; i++) {
425#else
426 for (i = len-1, j = 0; i >= 0; i--) {
427#endif
428 hex[j++] = hex_asc_hi(data[i]);
429 hex[j++] = hex_asc_lo(data[i]);
430 }
431 hex[j++] = ' ';
432
433 return trace_seq_putmem(s, hex, j);
434}
435
436static int
437trace_seq_path(struct trace_seq *s, struct path *path)
438{
439 unsigned char *p;
440 371
441 if (s->len >= (PAGE_SIZE - 1)) 372 len = s->len - s->readpos;
442 return 0; 373 if (cnt > len)
443 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); 374 cnt = len;
444 if (!IS_ERR(p)) { 375 ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
445 p = mangle_path(s->buffer + s->len, p, "\n"); 376 if (ret == cnt)
446 if (p) { 377 return -EFAULT;
447 s->len = p - s->buffer;
448 return 1;
449 }
450 } else {
451 s->buffer[s->len++] = '?';
452 return 1;
453 }
454 378
455 return 0; 379 cnt -= ret;
456}
457 380
458static void 381 s->readpos += cnt;
459trace_seq_reset(struct trace_seq *s) 382 return cnt;
460{
461 s->len = 0;
462 s->readpos = 0;
463} 383}
464 384
465ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) 385ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
466{ 386{
467 int len; 387 int len;
468 int ret; 388 void *ret;
469 389
470 if (s->len <= s->readpos) 390 if (s->len <= s->readpos)
471 return -EBUSY; 391 return -EBUSY;
@@ -473,11 +393,11 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
473 len = s->len - s->readpos; 393 len = s->len - s->readpos;
474 if (cnt > len) 394 if (cnt > len)
475 cnt = len; 395 cnt = len;
476 ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt); 396 ret = memcpy(buf, s->buffer + s->readpos, cnt);
477 if (ret) 397 if (!ret)
478 return -EFAULT; 398 return -EFAULT;
479 399
480 s->readpos += len; 400 s->readpos += cnt;
481 return cnt; 401 return cnt;
482} 402}
483 403
@@ -489,7 +409,7 @@ trace_print_seq(struct seq_file *m, struct trace_seq *s)
489 s->buffer[len] = 0; 409 s->buffer[len] = 0;
490 seq_puts(m, s->buffer); 410 seq_puts(m, s->buffer);
491 411
492 trace_seq_reset(s); 412 trace_seq_init(s);
493} 413}
494 414
495/** 415/**
@@ -543,7 +463,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
543 463
544 ftrace_enable_cpu(); 464 ftrace_enable_cpu();
545 465
546 WARN_ON_ONCE(ret); 466 WARN_ON_ONCE(ret && ret != -EAGAIN);
547 467
548 __update_max_tr(tr, tsk, cpu); 468 __update_max_tr(tr, tsk, cpu);
549 __raw_spin_unlock(&ftrace_max_lock); 469 __raw_spin_unlock(&ftrace_max_lock);
@@ -556,6 +476,8 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
556 * Register a new plugin tracer. 476 * Register a new plugin tracer.
557 */ 477 */
558int register_tracer(struct tracer *type) 478int register_tracer(struct tracer *type)
479__releases(kernel_lock)
480__acquires(kernel_lock)
559{ 481{
560 struct tracer *t; 482 struct tracer *t;
561 int len; 483 int len;
@@ -594,9 +516,12 @@ int register_tracer(struct tracer *type)
594 else 516 else
595 if (!type->flags->opts) 517 if (!type->flags->opts)
596 type->flags->opts = dummy_tracer_opt; 518 type->flags->opts = dummy_tracer_opt;
519 if (!type->wait_pipe)
520 type->wait_pipe = default_wait_pipe;
521
597 522
598#ifdef CONFIG_FTRACE_STARTUP_TEST 523#ifdef CONFIG_FTRACE_STARTUP_TEST
599 if (type->selftest) { 524 if (type->selftest && !tracing_selftest_disabled) {
600 struct tracer *saved_tracer = current_trace; 525 struct tracer *saved_tracer = current_trace;
601 struct trace_array *tr = &global_trace; 526 struct trace_array *tr = &global_trace;
602 int i; 527 int i;
@@ -638,8 +563,26 @@ int register_tracer(struct tracer *type)
638 out: 563 out:
639 tracing_selftest_running = false; 564 tracing_selftest_running = false;
640 mutex_unlock(&trace_types_lock); 565 mutex_unlock(&trace_types_lock);
641 lock_kernel();
642 566
567 if (ret || !default_bootup_tracer)
568 goto out_unlock;
569
570 if (strncmp(default_bootup_tracer, type->name, BOOTUP_TRACER_SIZE))
571 goto out_unlock;
572
573 printk(KERN_INFO "Starting tracer '%s'\n", type->name);
574 /* Do we want this tracer to start on bootup? */
575 tracing_set_tracer(type->name);
576 default_bootup_tracer = NULL;
577 /* disable other selftests, since this will break it. */
578 tracing_selftest_disabled = 1;
579#ifdef CONFIG_FTRACE_STARTUP_TEST
580 printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n",
581 type->name);
582#endif
583
584 out_unlock:
585 lock_kernel();
643 return ret; 586 return ret;
644} 587}
645 588
@@ -658,6 +601,15 @@ void unregister_tracer(struct tracer *type)
658 601
659 found: 602 found:
660 *t = (*t)->next; 603 *t = (*t)->next;
604
605 if (type == current_trace && tracer_enabled) {
606 tracer_enabled = 0;
607 tracing_stop();
608 if (current_trace->stop)
609 current_trace->stop(&global_trace);
610 current_trace = &nop_trace;
611 }
612
661 if (strlen(type->name) != max_tracer_type_len) 613 if (strlen(type->name) != max_tracer_type_len)
662 goto out; 614 goto out;
663 615
@@ -689,19 +641,20 @@ void tracing_reset_online_cpus(struct trace_array *tr)
689} 641}
690 642
691#define SAVED_CMDLINES 128 643#define SAVED_CMDLINES 128
644#define NO_CMDLINE_MAP UINT_MAX
692static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; 645static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
693static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; 646static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
694static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; 647static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
695static int cmdline_idx; 648static int cmdline_idx;
696static DEFINE_SPINLOCK(trace_cmdline_lock); 649static raw_spinlock_t trace_cmdline_lock = __RAW_SPIN_LOCK_UNLOCKED;
697 650
698/* temporary disable recording */ 651/* temporary disable recording */
699atomic_t trace_record_cmdline_disabled __read_mostly; 652static atomic_t trace_record_cmdline_disabled __read_mostly;
700 653
701static void trace_init_cmdlines(void) 654static void trace_init_cmdlines(void)
702{ 655{
703 memset(&map_pid_to_cmdline, -1, sizeof(map_pid_to_cmdline)); 656 memset(&map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(map_pid_to_cmdline));
704 memset(&map_cmdline_to_pid, -1, sizeof(map_cmdline_to_pid)); 657 memset(&map_cmdline_to_pid, NO_CMDLINE_MAP, sizeof(map_cmdline_to_pid));
705 cmdline_idx = 0; 658 cmdline_idx = 0;
706} 659}
707 660
@@ -738,13 +691,12 @@ void tracing_start(void)
738 return; 691 return;
739 692
740 spin_lock_irqsave(&tracing_start_lock, flags); 693 spin_lock_irqsave(&tracing_start_lock, flags);
741 if (--trace_stop_count) 694 if (--trace_stop_count) {
742 goto out; 695 if (trace_stop_count < 0) {
743 696 /* Someone screwed up their debugging */
744 if (trace_stop_count < 0) { 697 WARN_ON_ONCE(1);
745 /* Someone screwed up their debugging */ 698 trace_stop_count = 0;
746 WARN_ON_ONCE(1); 699 }
747 trace_stop_count = 0;
748 goto out; 700 goto out;
749 } 701 }
750 702
@@ -794,8 +746,7 @@ void trace_stop_cmdline_recording(void);
794 746
795static void trace_save_cmdline(struct task_struct *tsk) 747static void trace_save_cmdline(struct task_struct *tsk)
796{ 748{
797 unsigned map; 749 unsigned pid, idx;
798 unsigned idx;
799 750
800 if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) 751 if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT))
801 return; 752 return;
@@ -806,17 +757,24 @@ static void trace_save_cmdline(struct task_struct *tsk)
806 * nor do we want to disable interrupts, 757 * nor do we want to disable interrupts,
807 * so if we miss here, then better luck next time. 758 * so if we miss here, then better luck next time.
808 */ 759 */
809 if (!spin_trylock(&trace_cmdline_lock)) 760 if (!__raw_spin_trylock(&trace_cmdline_lock))
810 return; 761 return;
811 762
812 idx = map_pid_to_cmdline[tsk->pid]; 763 idx = map_pid_to_cmdline[tsk->pid];
813 if (idx >= SAVED_CMDLINES) { 764 if (idx == NO_CMDLINE_MAP) {
814 idx = (cmdline_idx + 1) % SAVED_CMDLINES; 765 idx = (cmdline_idx + 1) % SAVED_CMDLINES;
815 766
816 map = map_cmdline_to_pid[idx]; 767 /*
817 if (map <= PID_MAX_DEFAULT) 768 * Check whether the cmdline buffer at idx has a pid
818 map_pid_to_cmdline[map] = (unsigned)-1; 769 * mapped. We are going to overwrite that entry so we
770 * need to clear the map_pid_to_cmdline. Otherwise we
771 * would read the new comm for the old pid.
772 */
773 pid = map_cmdline_to_pid[idx];
774 if (pid != NO_CMDLINE_MAP)
775 map_pid_to_cmdline[pid] = NO_CMDLINE_MAP;
819 776
777 map_cmdline_to_pid[idx] = tsk->pid;
820 map_pid_to_cmdline[tsk->pid] = idx; 778 map_pid_to_cmdline[tsk->pid] = idx;
821 779
822 cmdline_idx = idx; 780 cmdline_idx = idx;
@@ -824,33 +782,37 @@ static void trace_save_cmdline(struct task_struct *tsk)
824 782
825 memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); 783 memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN);
826 784
827 spin_unlock(&trace_cmdline_lock); 785 __raw_spin_unlock(&trace_cmdline_lock);
828} 786}
829 787
830char *trace_find_cmdline(int pid) 788void trace_find_cmdline(int pid, char comm[])
831{ 789{
832 char *cmdline = "<...>";
833 unsigned map; 790 unsigned map;
834 791
835 if (!pid) 792 if (!pid) {
836 return "<idle>"; 793 strcpy(comm, "<idle>");
794 return;
795 }
837 796
838 if (pid > PID_MAX_DEFAULT) 797 if (pid > PID_MAX_DEFAULT) {
839 goto out; 798 strcpy(comm, "<...>");
799 return;
800 }
840 801
802 __raw_spin_lock(&trace_cmdline_lock);
841 map = map_pid_to_cmdline[pid]; 803 map = map_pid_to_cmdline[pid];
842 if (map >= SAVED_CMDLINES) 804 if (map != NO_CMDLINE_MAP)
843 goto out; 805 strcpy(comm, saved_cmdlines[map]);
844 806 else
845 cmdline = saved_cmdlines[map]; 807 strcpy(comm, "<...>");
846 808
847 out: 809 __raw_spin_unlock(&trace_cmdline_lock);
848 return cmdline;
849} 810}
850 811
851void tracing_record_cmdline(struct task_struct *tsk) 812void tracing_record_cmdline(struct task_struct *tsk)
852{ 813{
853 if (atomic_read(&trace_record_cmdline_disabled)) 814 if (atomic_read(&trace_record_cmdline_disabled) || !tracer_enabled ||
815 !tracing_is_on())
854 return; 816 return;
855 817
856 trace_save_cmdline(tsk); 818 trace_save_cmdline(tsk);
@@ -864,7 +826,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
864 826
865 entry->preempt_count = pc & 0xff; 827 entry->preempt_count = pc & 0xff;
866 entry->pid = (tsk) ? tsk->pid : 0; 828 entry->pid = (tsk) ? tsk->pid : 0;
867 entry->tgid = (tsk) ? tsk->tgid : 0; 829 entry->tgid = (tsk) ? tsk->tgid : 0;
868 entry->flags = 830 entry->flags =
869#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 831#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
870 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 832 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -876,78 +838,114 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
876 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); 838 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
877} 839}
878 840
841struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
842 unsigned char type,
843 unsigned long len,
844 unsigned long flags, int pc)
845{
846 struct ring_buffer_event *event;
847
848 event = ring_buffer_lock_reserve(tr->buffer, len);
849 if (event != NULL) {
850 struct trace_entry *ent = ring_buffer_event_data(event);
851
852 tracing_generic_entry_update(ent, flags, pc);
853 ent->type = type;
854 }
855
856 return event;
857}
858static void ftrace_trace_stack(struct trace_array *tr,
859 unsigned long flags, int skip, int pc);
860static void ftrace_trace_userstack(struct trace_array *tr,
861 unsigned long flags, int pc);
862
863void trace_buffer_unlock_commit(struct trace_array *tr,
864 struct ring_buffer_event *event,
865 unsigned long flags, int pc)
866{
867 ring_buffer_unlock_commit(tr->buffer, event);
868
869 ftrace_trace_stack(tr, flags, 6, pc);
870 ftrace_trace_userstack(tr, flags, pc);
871 trace_wake_up();
872}
873
874struct ring_buffer_event *
875trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
876 unsigned long flags, int pc)
877{
878 return trace_buffer_lock_reserve(&global_trace,
879 type, len, flags, pc);
880}
881
882void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
883 unsigned long flags, int pc)
884{
885 return trace_buffer_unlock_commit(&global_trace, event, flags, pc);
886}
887
879void 888void
880trace_function(struct trace_array *tr, struct trace_array_cpu *data, 889trace_function(struct trace_array *tr,
881 unsigned long ip, unsigned long parent_ip, unsigned long flags, 890 unsigned long ip, unsigned long parent_ip, unsigned long flags,
882 int pc) 891 int pc)
883{ 892{
884 struct ring_buffer_event *event; 893 struct ring_buffer_event *event;
885 struct ftrace_entry *entry; 894 struct ftrace_entry *entry;
886 unsigned long irq_flags;
887 895
888 /* If we are reading the ring buffer, don't trace */ 896 /* If we are reading the ring buffer, don't trace */
889 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 897 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
890 return; 898 return;
891 899
892 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 900 event = trace_buffer_lock_reserve(tr, TRACE_FN, sizeof(*entry),
893 &irq_flags); 901 flags, pc);
894 if (!event) 902 if (!event)
895 return; 903 return;
896 entry = ring_buffer_event_data(event); 904 entry = ring_buffer_event_data(event);
897 tracing_generic_entry_update(&entry->ent, flags, pc);
898 entry->ent.type = TRACE_FN;
899 entry->ip = ip; 905 entry->ip = ip;
900 entry->parent_ip = parent_ip; 906 entry->parent_ip = parent_ip;
901 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 907 ring_buffer_unlock_commit(tr->buffer, event);
902} 908}
903 909
904#ifdef CONFIG_FUNCTION_GRAPH_TRACER 910#ifdef CONFIG_FUNCTION_GRAPH_TRACER
905static void __trace_graph_entry(struct trace_array *tr, 911static void __trace_graph_entry(struct trace_array *tr,
906 struct trace_array_cpu *data,
907 struct ftrace_graph_ent *trace, 912 struct ftrace_graph_ent *trace,
908 unsigned long flags, 913 unsigned long flags,
909 int pc) 914 int pc)
910{ 915{
911 struct ring_buffer_event *event; 916 struct ring_buffer_event *event;
912 struct ftrace_graph_ent_entry *entry; 917 struct ftrace_graph_ent_entry *entry;
913 unsigned long irq_flags;
914 918
915 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 919 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
916 return; 920 return;
917 921
918 event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry), 922 event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_ENT,
919 &irq_flags); 923 sizeof(*entry), flags, pc);
920 if (!event) 924 if (!event)
921 return; 925 return;
922 entry = ring_buffer_event_data(event); 926 entry = ring_buffer_event_data(event);
923 tracing_generic_entry_update(&entry->ent, flags, pc);
924 entry->ent.type = TRACE_GRAPH_ENT;
925 entry->graph_ent = *trace; 927 entry->graph_ent = *trace;
926 ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags); 928 ring_buffer_unlock_commit(global_trace.buffer, event);
927} 929}
928 930
929static void __trace_graph_return(struct trace_array *tr, 931static void __trace_graph_return(struct trace_array *tr,
930 struct trace_array_cpu *data,
931 struct ftrace_graph_ret *trace, 932 struct ftrace_graph_ret *trace,
932 unsigned long flags, 933 unsigned long flags,
933 int pc) 934 int pc)
934{ 935{
935 struct ring_buffer_event *event; 936 struct ring_buffer_event *event;
936 struct ftrace_graph_ret_entry *entry; 937 struct ftrace_graph_ret_entry *entry;
937 unsigned long irq_flags;
938 938
939 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 939 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
940 return; 940 return;
941 941
942 event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry), 942 event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_RET,
943 &irq_flags); 943 sizeof(*entry), flags, pc);
944 if (!event) 944 if (!event)
945 return; 945 return;
946 entry = ring_buffer_event_data(event); 946 entry = ring_buffer_event_data(event);
947 tracing_generic_entry_update(&entry->ent, flags, pc);
948 entry->ent.type = TRACE_GRAPH_RET;
949 entry->ret = *trace; 947 entry->ret = *trace;
950 ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags); 948 ring_buffer_unlock_commit(global_trace.buffer, event);
951} 949}
952#endif 950#endif
953 951
@@ -957,31 +955,23 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
957 int pc) 955 int pc)
958{ 956{
959 if (likely(!atomic_read(&data->disabled))) 957 if (likely(!atomic_read(&data->disabled)))
960 trace_function(tr, data, ip, parent_ip, flags, pc); 958 trace_function(tr, ip, parent_ip, flags, pc);
961} 959}
962 960
963static void ftrace_trace_stack(struct trace_array *tr, 961static void __ftrace_trace_stack(struct trace_array *tr,
964 struct trace_array_cpu *data, 962 unsigned long flags,
965 unsigned long flags, 963 int skip, int pc)
966 int skip, int pc)
967{ 964{
968#ifdef CONFIG_STACKTRACE 965#ifdef CONFIG_STACKTRACE
969 struct ring_buffer_event *event; 966 struct ring_buffer_event *event;
970 struct stack_entry *entry; 967 struct stack_entry *entry;
971 struct stack_trace trace; 968 struct stack_trace trace;
972 unsigned long irq_flags;
973
974 if (!(trace_flags & TRACE_ITER_STACKTRACE))
975 return;
976 969
977 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 970 event = trace_buffer_lock_reserve(tr, TRACE_STACK,
978 &irq_flags); 971 sizeof(*entry), flags, pc);
979 if (!event) 972 if (!event)
980 return; 973 return;
981 entry = ring_buffer_event_data(event); 974 entry = ring_buffer_event_data(event);
982 tracing_generic_entry_update(&entry->ent, flags, pc);
983 entry->ent.type = TRACE_STACK;
984
985 memset(&entry->caller, 0, sizeof(entry->caller)); 975 memset(&entry->caller, 0, sizeof(entry->caller));
986 976
987 trace.nr_entries = 0; 977 trace.nr_entries = 0;
@@ -990,38 +980,43 @@ static void ftrace_trace_stack(struct trace_array *tr,
990 trace.entries = entry->caller; 980 trace.entries = entry->caller;
991 981
992 save_stack_trace(&trace); 982 save_stack_trace(&trace);
993 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 983 ring_buffer_unlock_commit(tr->buffer, event);
994#endif 984#endif
995} 985}
996 986
987static void ftrace_trace_stack(struct trace_array *tr,
988 unsigned long flags,
989 int skip, int pc)
990{
991 if (!(trace_flags & TRACE_ITER_STACKTRACE))
992 return;
993
994 __ftrace_trace_stack(tr, flags, skip, pc);
995}
996
997void __trace_stack(struct trace_array *tr, 997void __trace_stack(struct trace_array *tr,
998 struct trace_array_cpu *data,
999 unsigned long flags, 998 unsigned long flags,
1000 int skip) 999 int skip, int pc)
1001{ 1000{
1002 ftrace_trace_stack(tr, data, flags, skip, preempt_count()); 1001 __ftrace_trace_stack(tr, flags, skip, pc);
1003} 1002}
1004 1003
1005static void ftrace_trace_userstack(struct trace_array *tr, 1004static void ftrace_trace_userstack(struct trace_array *tr,
1006 struct trace_array_cpu *data, 1005 unsigned long flags, int pc)
1007 unsigned long flags, int pc)
1008{ 1006{
1009#ifdef CONFIG_STACKTRACE 1007#ifdef CONFIG_STACKTRACE
1010 struct ring_buffer_event *event; 1008 struct ring_buffer_event *event;
1011 struct userstack_entry *entry; 1009 struct userstack_entry *entry;
1012 struct stack_trace trace; 1010 struct stack_trace trace;
1013 unsigned long irq_flags;
1014 1011
1015 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) 1012 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
1016 return; 1013 return;
1017 1014
1018 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 1015 event = trace_buffer_lock_reserve(tr, TRACE_USER_STACK,
1019 &irq_flags); 1016 sizeof(*entry), flags, pc);
1020 if (!event) 1017 if (!event)
1021 return; 1018 return;
1022 entry = ring_buffer_event_data(event); 1019 entry = ring_buffer_event_data(event);
1023 tracing_generic_entry_update(&entry->ent, flags, pc);
1024 entry->ent.type = TRACE_USER_STACK;
1025 1020
1026 memset(&entry->caller, 0, sizeof(entry->caller)); 1021 memset(&entry->caller, 0, sizeof(entry->caller));
1027 1022
@@ -1031,70 +1026,58 @@ static void ftrace_trace_userstack(struct trace_array *tr,
1031 trace.entries = entry->caller; 1026 trace.entries = entry->caller;
1032 1027
1033 save_stack_trace_user(&trace); 1028 save_stack_trace_user(&trace);
1034 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 1029 ring_buffer_unlock_commit(tr->buffer, event);
1035#endif 1030#endif
1036} 1031}
1037 1032
1038void __trace_userstack(struct trace_array *tr, 1033#ifdef UNUSED
1039 struct trace_array_cpu *data, 1034static void __trace_userstack(struct trace_array *tr, unsigned long flags)
1040 unsigned long flags)
1041{ 1035{
1042 ftrace_trace_userstack(tr, data, flags, preempt_count()); 1036 ftrace_trace_userstack(tr, flags, preempt_count());
1043} 1037}
1038#endif /* UNUSED */
1044 1039
1045static void 1040static void
1046ftrace_trace_special(void *__tr, void *__data, 1041ftrace_trace_special(void *__tr,
1047 unsigned long arg1, unsigned long arg2, unsigned long arg3, 1042 unsigned long arg1, unsigned long arg2, unsigned long arg3,
1048 int pc) 1043 int pc)
1049{ 1044{
1050 struct ring_buffer_event *event; 1045 struct ring_buffer_event *event;
1051 struct trace_array_cpu *data = __data;
1052 struct trace_array *tr = __tr; 1046 struct trace_array *tr = __tr;
1053 struct special_entry *entry; 1047 struct special_entry *entry;
1054 unsigned long irq_flags;
1055 1048
1056 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 1049 event = trace_buffer_lock_reserve(tr, TRACE_SPECIAL,
1057 &irq_flags); 1050 sizeof(*entry), 0, pc);
1058 if (!event) 1051 if (!event)
1059 return; 1052 return;
1060 entry = ring_buffer_event_data(event); 1053 entry = ring_buffer_event_data(event);
1061 tracing_generic_entry_update(&entry->ent, 0, pc);
1062 entry->ent.type = TRACE_SPECIAL;
1063 entry->arg1 = arg1; 1054 entry->arg1 = arg1;
1064 entry->arg2 = arg2; 1055 entry->arg2 = arg2;
1065 entry->arg3 = arg3; 1056 entry->arg3 = arg3;
1066 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 1057 trace_buffer_unlock_commit(tr, event, 0, pc);
1067 ftrace_trace_stack(tr, data, irq_flags, 4, pc);
1068 ftrace_trace_userstack(tr, data, irq_flags, pc);
1069
1070 trace_wake_up();
1071} 1058}
1072 1059
1073void 1060void
1074__trace_special(void *__tr, void *__data, 1061__trace_special(void *__tr, void *__data,
1075 unsigned long arg1, unsigned long arg2, unsigned long arg3) 1062 unsigned long arg1, unsigned long arg2, unsigned long arg3)
1076{ 1063{
1077 ftrace_trace_special(__tr, __data, arg1, arg2, arg3, preempt_count()); 1064 ftrace_trace_special(__tr, arg1, arg2, arg3, preempt_count());
1078} 1065}
1079 1066
1080void 1067void
1081tracing_sched_switch_trace(struct trace_array *tr, 1068tracing_sched_switch_trace(struct trace_array *tr,
1082 struct trace_array_cpu *data,
1083 struct task_struct *prev, 1069 struct task_struct *prev,
1084 struct task_struct *next, 1070 struct task_struct *next,
1085 unsigned long flags, int pc) 1071 unsigned long flags, int pc)
1086{ 1072{
1087 struct ring_buffer_event *event; 1073 struct ring_buffer_event *event;
1088 struct ctx_switch_entry *entry; 1074 struct ctx_switch_entry *entry;
1089 unsigned long irq_flags;
1090 1075
1091 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 1076 event = trace_buffer_lock_reserve(tr, TRACE_CTX,
1092 &irq_flags); 1077 sizeof(*entry), flags, pc);
1093 if (!event) 1078 if (!event)
1094 return; 1079 return;
1095 entry = ring_buffer_event_data(event); 1080 entry = ring_buffer_event_data(event);
1096 tracing_generic_entry_update(&entry->ent, flags, pc);
1097 entry->ent.type = TRACE_CTX;
1098 entry->prev_pid = prev->pid; 1081 entry->prev_pid = prev->pid;
1099 entry->prev_prio = prev->prio; 1082 entry->prev_prio = prev->prio;
1100 entry->prev_state = prev->state; 1083 entry->prev_state = prev->state;
@@ -1102,29 +1085,23 @@ tracing_sched_switch_trace(struct trace_array *tr,
1102 entry->next_prio = next->prio; 1085 entry->next_prio = next->prio;
1103 entry->next_state = next->state; 1086 entry->next_state = next->state;
1104 entry->next_cpu = task_cpu(next); 1087 entry->next_cpu = task_cpu(next);
1105 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 1088 trace_buffer_unlock_commit(tr, event, flags, pc);
1106 ftrace_trace_stack(tr, data, flags, 5, pc);
1107 ftrace_trace_userstack(tr, data, flags, pc);
1108} 1089}
1109 1090
1110void 1091void
1111tracing_sched_wakeup_trace(struct trace_array *tr, 1092tracing_sched_wakeup_trace(struct trace_array *tr,
1112 struct trace_array_cpu *data,
1113 struct task_struct *wakee, 1093 struct task_struct *wakee,
1114 struct task_struct *curr, 1094 struct task_struct *curr,
1115 unsigned long flags, int pc) 1095 unsigned long flags, int pc)
1116{ 1096{
1117 struct ring_buffer_event *event; 1097 struct ring_buffer_event *event;
1118 struct ctx_switch_entry *entry; 1098 struct ctx_switch_entry *entry;
1119 unsigned long irq_flags;
1120 1099
1121 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 1100 event = trace_buffer_lock_reserve(tr, TRACE_WAKE,
1122 &irq_flags); 1101 sizeof(*entry), flags, pc);
1123 if (!event) 1102 if (!event)
1124 return; 1103 return;
1125 entry = ring_buffer_event_data(event); 1104 entry = ring_buffer_event_data(event);
1126 tracing_generic_entry_update(&entry->ent, flags, pc);
1127 entry->ent.type = TRACE_WAKE;
1128 entry->prev_pid = curr->pid; 1105 entry->prev_pid = curr->pid;
1129 entry->prev_prio = curr->prio; 1106 entry->prev_prio = curr->prio;
1130 entry->prev_state = curr->state; 1107 entry->prev_state = curr->state;
@@ -1132,11 +1109,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
1132 entry->next_prio = wakee->prio; 1109 entry->next_prio = wakee->prio;
1133 entry->next_state = wakee->state; 1110 entry->next_state = wakee->state;
1134 entry->next_cpu = task_cpu(wakee); 1111 entry->next_cpu = task_cpu(wakee);
1135 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
1136 ftrace_trace_stack(tr, data, flags, 6, pc);
1137 ftrace_trace_userstack(tr, data, flags, pc);
1138 1112
1139 trace_wake_up(); 1113 ring_buffer_unlock_commit(tr->buffer, event);
1114 ftrace_trace_stack(tr, flags, 6, pc);
1115 ftrace_trace_userstack(tr, flags, pc);
1140} 1116}
1141 1117
1142void 1118void
@@ -1157,66 +1133,7 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1157 data = tr->data[cpu]; 1133 data = tr->data[cpu];
1158 1134
1159 if (likely(atomic_inc_return(&data->disabled) == 1)) 1135 if (likely(atomic_inc_return(&data->disabled) == 1))
1160 ftrace_trace_special(tr, data, arg1, arg2, arg3, pc); 1136 ftrace_trace_special(tr, arg1, arg2, arg3, pc);
1161
1162 atomic_dec(&data->disabled);
1163 local_irq_restore(flags);
1164}
1165
1166#ifdef CONFIG_FUNCTION_TRACER
1167static void
1168function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
1169{
1170 struct trace_array *tr = &global_trace;
1171 struct trace_array_cpu *data;
1172 unsigned long flags;
1173 long disabled;
1174 int cpu, resched;
1175 int pc;
1176
1177 if (unlikely(!ftrace_function_enabled))
1178 return;
1179
1180 pc = preempt_count();
1181 resched = ftrace_preempt_disable();
1182 local_save_flags(flags);
1183 cpu = raw_smp_processor_id();
1184 data = tr->data[cpu];
1185 disabled = atomic_inc_return(&data->disabled);
1186
1187 if (likely(disabled == 1))
1188 trace_function(tr, data, ip, parent_ip, flags, pc);
1189
1190 atomic_dec(&data->disabled);
1191 ftrace_preempt_enable(resched);
1192}
1193
1194static void
1195function_trace_call(unsigned long ip, unsigned long parent_ip)
1196{
1197 struct trace_array *tr = &global_trace;
1198 struct trace_array_cpu *data;
1199 unsigned long flags;
1200 long disabled;
1201 int cpu;
1202 int pc;
1203
1204 if (unlikely(!ftrace_function_enabled))
1205 return;
1206
1207 /*
1208 * Need to use raw, since this must be called before the
1209 * recursive protection is performed.
1210 */
1211 local_irq_save(flags);
1212 cpu = raw_smp_processor_id();
1213 data = tr->data[cpu];
1214 disabled = atomic_inc_return(&data->disabled);
1215
1216 if (likely(disabled == 1)) {
1217 pc = preempt_count();
1218 trace_function(tr, data, ip, parent_ip, flags, pc);
1219 }
1220 1137
1221 atomic_dec(&data->disabled); 1138 atomic_dec(&data->disabled);
1222 local_irq_restore(flags); 1139 local_irq_restore(flags);
@@ -1244,7 +1161,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
1244 disabled = atomic_inc_return(&data->disabled); 1161 disabled = atomic_inc_return(&data->disabled);
1245 if (likely(disabled == 1)) { 1162 if (likely(disabled == 1)) {
1246 pc = preempt_count(); 1163 pc = preempt_count();
1247 __trace_graph_entry(tr, data, trace, flags, pc); 1164 __trace_graph_entry(tr, trace, flags, pc);
1248 } 1165 }
1249 /* Only do the atomic if it is not already set */ 1166 /* Only do the atomic if it is not already set */
1250 if (!test_tsk_trace_graph(current)) 1167 if (!test_tsk_trace_graph(current))
@@ -1270,7 +1187,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
1270 disabled = atomic_inc_return(&data->disabled); 1187 disabled = atomic_inc_return(&data->disabled);
1271 if (likely(disabled == 1)) { 1188 if (likely(disabled == 1)) {
1272 pc = preempt_count(); 1189 pc = preempt_count();
1273 __trace_graph_return(tr, data, trace, flags, pc); 1190 __trace_graph_return(tr, trace, flags, pc);
1274 } 1191 }
1275 if (!trace->depth) 1192 if (!trace->depth)
1276 clear_tsk_trace_graph(current); 1193 clear_tsk_trace_graph(current);
@@ -1279,30 +1196,124 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
1279} 1196}
1280#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 1197#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
1281 1198
1282static struct ftrace_ops trace_ops __read_mostly =
1283{
1284 .func = function_trace_call,
1285};
1286 1199
1287void tracing_start_function_trace(void) 1200/**
1201 * trace_vbprintk - write binary msg to tracing buffer
1202 *
1203 */
1204int trace_vbprintk(unsigned long ip, int depth, const char *fmt, va_list args)
1288{ 1205{
1289 ftrace_function_enabled = 0; 1206 static raw_spinlock_t trace_buf_lock =
1207 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
1208 static u32 trace_buf[TRACE_BUF_SIZE];
1290 1209
1291 if (trace_flags & TRACE_ITER_PREEMPTONLY) 1210 struct ring_buffer_event *event;
1292 trace_ops.func = function_trace_call_preempt_only; 1211 struct trace_array *tr = &global_trace;
1293 else 1212 struct trace_array_cpu *data;
1294 trace_ops.func = function_trace_call; 1213 struct bprint_entry *entry;
1214 unsigned long flags;
1215 int resched;
1216 int cpu, len = 0, size, pc;
1217
1218 if (unlikely(tracing_selftest_running || tracing_disabled))
1219 return 0;
1220
1221 /* Don't pollute graph traces with trace_vprintk internals */
1222 pause_graph_tracing();
1223
1224 pc = preempt_count();
1225 resched = ftrace_preempt_disable();
1226 cpu = raw_smp_processor_id();
1227 data = tr->data[cpu];
1228
1229 if (unlikely(atomic_read(&data->disabled)))
1230 goto out;
1295 1231
1296 register_ftrace_function(&trace_ops); 1232 /* Lockdep uses trace_printk for lock tracing */
1297 ftrace_function_enabled = 1; 1233 local_irq_save(flags);
1234 __raw_spin_lock(&trace_buf_lock);
1235 len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1236
1237 if (len > TRACE_BUF_SIZE || len < 0)
1238 goto out_unlock;
1239
1240 size = sizeof(*entry) + sizeof(u32) * len;
1241 event = trace_buffer_lock_reserve(tr, TRACE_BPRINT, size, flags, pc);
1242 if (!event)
1243 goto out_unlock;
1244 entry = ring_buffer_event_data(event);
1245 entry->ip = ip;
1246 entry->depth = depth;
1247 entry->fmt = fmt;
1248
1249 memcpy(entry->buf, trace_buf, sizeof(u32) * len);
1250 ring_buffer_unlock_commit(tr->buffer, event);
1251
1252out_unlock:
1253 __raw_spin_unlock(&trace_buf_lock);
1254 local_irq_restore(flags);
1255
1256out:
1257 ftrace_preempt_enable(resched);
1258 unpause_graph_tracing();
1259
1260 return len;
1298} 1261}
1262EXPORT_SYMBOL_GPL(trace_vbprintk);
1299 1263
1300void tracing_stop_function_trace(void) 1264int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
1301{ 1265{
1302 ftrace_function_enabled = 0; 1266 static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;
1303 unregister_ftrace_function(&trace_ops); 1267 static char trace_buf[TRACE_BUF_SIZE];
1268
1269 struct ring_buffer_event *event;
1270 struct trace_array *tr = &global_trace;
1271 struct trace_array_cpu *data;
1272 int cpu, len = 0, size, pc;
1273 struct print_entry *entry;
1274 unsigned long irq_flags;
1275
1276 if (tracing_disabled || tracing_selftest_running)
1277 return 0;
1278
1279 pc = preempt_count();
1280 preempt_disable_notrace();
1281 cpu = raw_smp_processor_id();
1282 data = tr->data[cpu];
1283
1284 if (unlikely(atomic_read(&data->disabled)))
1285 goto out;
1286
1287 pause_graph_tracing();
1288 raw_local_irq_save(irq_flags);
1289 __raw_spin_lock(&trace_buf_lock);
1290 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1291
1292 len = min(len, TRACE_BUF_SIZE-1);
1293 trace_buf[len] = 0;
1294
1295 size = sizeof(*entry) + len + 1;
1296 event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, irq_flags, pc);
1297 if (!event)
1298 goto out_unlock;
1299 entry = ring_buffer_event_data(event);
1300 entry->ip = ip;
1301 entry->depth = depth;
1302
1303 memcpy(&entry->buf, trace_buf, len);
1304 entry->buf[len] = 0;
1305 ring_buffer_unlock_commit(tr->buffer, event);
1306
1307 out_unlock:
1308 __raw_spin_unlock(&trace_buf_lock);
1309 raw_local_irq_restore(irq_flags);
1310 unpause_graph_tracing();
1311 out:
1312 preempt_enable_notrace();
1313
1314 return len;
1304} 1315}
1305#endif 1316EXPORT_SYMBOL_GPL(trace_vprintk);
1306 1317
1307enum trace_file_type { 1318enum trace_file_type {
1308 TRACE_FILE_LAT_FMT = 1, 1319 TRACE_FILE_LAT_FMT = 1,
@@ -1345,10 +1356,25 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1345{ 1356{
1346 struct ring_buffer *buffer = iter->tr->buffer; 1357 struct ring_buffer *buffer = iter->tr->buffer;
1347 struct trace_entry *ent, *next = NULL; 1358 struct trace_entry *ent, *next = NULL;
1359 int cpu_file = iter->cpu_file;
1348 u64 next_ts = 0, ts; 1360 u64 next_ts = 0, ts;
1349 int next_cpu = -1; 1361 int next_cpu = -1;
1350 int cpu; 1362 int cpu;
1351 1363
1364 /*
1365 * If we are in a per_cpu trace file, don't bother by iterating over
1366 * all cpu and peek directly.
1367 */
1368 if (cpu_file > TRACE_PIPE_ALL_CPU) {
1369 if (ring_buffer_empty_cpu(buffer, cpu_file))
1370 return NULL;
1371 ent = peek_next_entry(iter, cpu_file, ent_ts);
1372 if (ent_cpu)
1373 *ent_cpu = cpu_file;
1374
1375 return ent;
1376 }
1377
1352 for_each_tracing_cpu(cpu) { 1378 for_each_tracing_cpu(cpu) {
1353 1379
1354 if (ring_buffer_empty_cpu(buffer, cpu)) 1380 if (ring_buffer_empty_cpu(buffer, cpu))
@@ -1376,8 +1402,8 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1376} 1402}
1377 1403
1378/* Find the next real entry, without updating the iterator itself */ 1404/* Find the next real entry, without updating the iterator itself */
1379static struct trace_entry * 1405struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
1380find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) 1406 int *ent_cpu, u64 *ent_ts)
1381{ 1407{
1382 return __find_next_entry(iter, ent_cpu, ent_ts); 1408 return __find_next_entry(iter, ent_cpu, ent_ts);
1383} 1409}
@@ -1426,19 +1452,32 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1426 return ent; 1452 return ent;
1427} 1453}
1428 1454
1455/*
1456 * No necessary locking here. The worst thing which can
1457 * happen is loosing events consumed at the same time
1458 * by a trace_pipe reader.
1459 * Other than that, we don't risk to crash the ring buffer
1460 * because it serializes the readers.
1461 *
1462 * The current tracer is copied to avoid a global locking
1463 * all around.
1464 */
1429static void *s_start(struct seq_file *m, loff_t *pos) 1465static void *s_start(struct seq_file *m, loff_t *pos)
1430{ 1466{
1431 struct trace_iterator *iter = m->private; 1467 struct trace_iterator *iter = m->private;
1468 static struct tracer *old_tracer;
1469 int cpu_file = iter->cpu_file;
1432 void *p = NULL; 1470 void *p = NULL;
1433 loff_t l = 0; 1471 loff_t l = 0;
1434 int cpu; 1472 int cpu;
1435 1473
1474 /* copy the tracer to avoid using a global lock all around */
1436 mutex_lock(&trace_types_lock); 1475 mutex_lock(&trace_types_lock);
1437 1476 if (unlikely(old_tracer != current_trace && current_trace)) {
1438 if (!current_trace || current_trace != iter->trace) { 1477 old_tracer = current_trace;
1439 mutex_unlock(&trace_types_lock); 1478 *iter->trace = *current_trace;
1440 return NULL;
1441 } 1479 }
1480 mutex_unlock(&trace_types_lock);
1442 1481
1443 atomic_inc(&trace_record_cmdline_disabled); 1482 atomic_inc(&trace_record_cmdline_disabled);
1444 1483
@@ -1449,9 +1488,12 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1449 1488
1450 ftrace_disable_cpu(); 1489 ftrace_disable_cpu();
1451 1490
1452 for_each_tracing_cpu(cpu) { 1491 if (cpu_file == TRACE_PIPE_ALL_CPU) {
1453 ring_buffer_iter_reset(iter->buffer_iter[cpu]); 1492 for_each_tracing_cpu(cpu)
1454 } 1493 ring_buffer_iter_reset(iter->buffer_iter[cpu]);
1494 } else
1495 ring_buffer_iter_reset(iter->buffer_iter[cpu_file]);
1496
1455 1497
1456 ftrace_enable_cpu(); 1498 ftrace_enable_cpu();
1457 1499
@@ -1469,155 +1511,6 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1469static void s_stop(struct seq_file *m, void *p) 1511static void s_stop(struct seq_file *m, void *p)
1470{ 1512{
1471 atomic_dec(&trace_record_cmdline_disabled); 1513 atomic_dec(&trace_record_cmdline_disabled);
1472 mutex_unlock(&trace_types_lock);
1473}
1474
1475#ifdef CONFIG_KRETPROBES
1476static inline const char *kretprobed(const char *name)
1477{
1478 static const char tramp_name[] = "kretprobe_trampoline";
1479 int size = sizeof(tramp_name);
1480
1481 if (strncmp(tramp_name, name, size) == 0)
1482 return "[unknown/kretprobe'd]";
1483 return name;
1484}
1485#else
1486static inline const char *kretprobed(const char *name)
1487{
1488 return name;
1489}
1490#endif /* CONFIG_KRETPROBES */
1491
1492static int
1493seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
1494{
1495#ifdef CONFIG_KALLSYMS
1496 char str[KSYM_SYMBOL_LEN];
1497 const char *name;
1498
1499 kallsyms_lookup(address, NULL, NULL, NULL, str);
1500
1501 name = kretprobed(str);
1502
1503 return trace_seq_printf(s, fmt, name);
1504#endif
1505 return 1;
1506}
1507
1508static int
1509seq_print_sym_offset(struct trace_seq *s, const char *fmt,
1510 unsigned long address)
1511{
1512#ifdef CONFIG_KALLSYMS
1513 char str[KSYM_SYMBOL_LEN];
1514 const char *name;
1515
1516 sprint_symbol(str, address);
1517 name = kretprobed(str);
1518
1519 return trace_seq_printf(s, fmt, name);
1520#endif
1521 return 1;
1522}
1523
1524#ifndef CONFIG_64BIT
1525# define IP_FMT "%08lx"
1526#else
1527# define IP_FMT "%016lx"
1528#endif
1529
1530int
1531seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
1532{
1533 int ret;
1534
1535 if (!ip)
1536 return trace_seq_printf(s, "0");
1537
1538 if (sym_flags & TRACE_ITER_SYM_OFFSET)
1539 ret = seq_print_sym_offset(s, "%s", ip);
1540 else
1541 ret = seq_print_sym_short(s, "%s", ip);
1542
1543 if (!ret)
1544 return 0;
1545
1546 if (sym_flags & TRACE_ITER_SYM_ADDR)
1547 ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
1548 return ret;
1549}
1550
1551static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
1552 unsigned long ip, unsigned long sym_flags)
1553{
1554 struct file *file = NULL;
1555 unsigned long vmstart = 0;
1556 int ret = 1;
1557
1558 if (mm) {
1559 const struct vm_area_struct *vma;
1560
1561 down_read(&mm->mmap_sem);
1562 vma = find_vma(mm, ip);
1563 if (vma) {
1564 file = vma->vm_file;
1565 vmstart = vma->vm_start;
1566 }
1567 if (file) {
1568 ret = trace_seq_path(s, &file->f_path);
1569 if (ret)
1570 ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart);
1571 }
1572 up_read(&mm->mmap_sem);
1573 }
1574 if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
1575 ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
1576 return ret;
1577}
1578
1579static int
1580seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
1581 unsigned long sym_flags)
1582{
1583 struct mm_struct *mm = NULL;
1584 int ret = 1;
1585 unsigned int i;
1586
1587 if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
1588 struct task_struct *task;
1589 /*
1590 * we do the lookup on the thread group leader,
1591 * since individual threads might have already quit!
1592 */
1593 rcu_read_lock();
1594 task = find_task_by_vpid(entry->ent.tgid);
1595 if (task)
1596 mm = get_task_mm(task);
1597 rcu_read_unlock();
1598 }
1599
1600 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
1601 unsigned long ip = entry->caller[i];
1602
1603 if (ip == ULONG_MAX || !ret)
1604 break;
1605 if (i && ret)
1606 ret = trace_seq_puts(s, " <- ");
1607 if (!ip) {
1608 if (ret)
1609 ret = trace_seq_puts(s, "??");
1610 continue;
1611 }
1612 if (!ret)
1613 break;
1614 if (ret)
1615 ret = seq_print_user_ip(s, mm, ip, sym_flags);
1616 }
1617
1618 if (mm)
1619 mmput(mm);
1620 return ret;
1621} 1514}
1622 1515
1623static void print_lat_help_header(struct seq_file *m) 1516static void print_lat_help_header(struct seq_file *m)
@@ -1658,11 +1551,11 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1658 total = entries + 1551 total = entries +
1659 ring_buffer_overruns(iter->tr->buffer); 1552 ring_buffer_overruns(iter->tr->buffer);
1660 1553
1661 seq_printf(m, "%s latency trace v1.1.5 on %s\n", 1554 seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
1662 name, UTS_RELEASE); 1555 name, UTS_RELEASE);
1663 seq_puts(m, "-----------------------------------" 1556 seq_puts(m, "# -----------------------------------"
1664 "---------------------------------\n"); 1557 "---------------------------------\n");
1665 seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d |" 1558 seq_printf(m, "# latency: %lu us, #%lu/%lu, CPU#%d |"
1666 " (M:%s VP:%d, KP:%d, SP:%d HP:%d", 1559 " (M:%s VP:%d, KP:%d, SP:%d HP:%d",
1667 nsecs_to_usecs(data->saved_latency), 1560 nsecs_to_usecs(data->saved_latency),
1668 entries, 1561 entries,
@@ -1684,121 +1577,24 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1684#else 1577#else
1685 seq_puts(m, ")\n"); 1578 seq_puts(m, ")\n");
1686#endif 1579#endif
1687 seq_puts(m, " -----------------\n"); 1580 seq_puts(m, "# -----------------\n");
1688 seq_printf(m, " | task: %.16s-%d " 1581 seq_printf(m, "# | task: %.16s-%d "
1689 "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", 1582 "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n",
1690 data->comm, data->pid, data->uid, data->nice, 1583 data->comm, data->pid, data->uid, data->nice,
1691 data->policy, data->rt_priority); 1584 data->policy, data->rt_priority);
1692 seq_puts(m, " -----------------\n"); 1585 seq_puts(m, "# -----------------\n");
1693 1586
1694 if (data->critical_start) { 1587 if (data->critical_start) {
1695 seq_puts(m, " => started at: "); 1588 seq_puts(m, "# => started at: ");
1696 seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags); 1589 seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags);
1697 trace_print_seq(m, &iter->seq); 1590 trace_print_seq(m, &iter->seq);
1698 seq_puts(m, "\n => ended at: "); 1591 seq_puts(m, "\n# => ended at: ");
1699 seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags); 1592 seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags);
1700 trace_print_seq(m, &iter->seq); 1593 trace_print_seq(m, &iter->seq);
1701 seq_puts(m, "\n"); 1594 seq_puts(m, "#\n");
1702 }
1703
1704 seq_puts(m, "\n");
1705}
1706
1707static void
1708lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
1709{
1710 int hardirq, softirq;
1711 char *comm;
1712
1713 comm = trace_find_cmdline(entry->pid);
1714
1715 trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid);
1716 trace_seq_printf(s, "%3d", cpu);
1717 trace_seq_printf(s, "%c%c",
1718 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
1719 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.',
1720 ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'));
1721
1722 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
1723 softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
1724 if (hardirq && softirq) {
1725 trace_seq_putc(s, 'H');
1726 } else {
1727 if (hardirq) {
1728 trace_seq_putc(s, 'h');
1729 } else {
1730 if (softirq)
1731 trace_seq_putc(s, 's');
1732 else
1733 trace_seq_putc(s, '.');
1734 }
1735 } 1595 }
1736 1596
1737 if (entry->preempt_count) 1597 seq_puts(m, "#\n");
1738 trace_seq_printf(s, "%x", entry->preempt_count);
1739 else
1740 trace_seq_puts(s, ".");
1741}
1742
1743unsigned long preempt_mark_thresh = 100;
1744
1745static void
1746lat_print_timestamp(struct trace_seq *s, u64 abs_usecs,
1747 unsigned long rel_usecs)
1748{
1749 trace_seq_printf(s, " %4lldus", abs_usecs);
1750 if (rel_usecs > preempt_mark_thresh)
1751 trace_seq_puts(s, "!: ");
1752 else if (rel_usecs > 1)
1753 trace_seq_puts(s, "+: ");
1754 else
1755 trace_seq_puts(s, " : ");
1756}
1757
1758static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
1759
1760static int task_state_char(unsigned long state)
1761{
1762 int bit = state ? __ffs(state) + 1 : 0;
1763
1764 return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
1765}
1766
1767/*
1768 * The message is supposed to contain an ending newline.
1769 * If the printing stops prematurely, try to add a newline of our own.
1770 */
1771void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter)
1772{
1773 struct trace_entry *ent;
1774 struct trace_field_cont *cont;
1775 bool ok = true;
1776
1777 ent = peek_next_entry(iter, iter->cpu, NULL);
1778 if (!ent || ent->type != TRACE_CONT) {
1779 trace_seq_putc(s, '\n');
1780 return;
1781 }
1782
1783 do {
1784 cont = (struct trace_field_cont *)ent;
1785 if (ok)
1786 ok = (trace_seq_printf(s, "%s", cont->buf) > 0);
1787
1788 ftrace_disable_cpu();
1789
1790 if (iter->buffer_iter[iter->cpu])
1791 ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
1792 else
1793 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
1794
1795 ftrace_enable_cpu();
1796
1797 ent = peek_next_entry(iter, iter->cpu, NULL);
1798 } while (ent && ent->type == TRACE_CONT);
1799
1800 if (!ok)
1801 trace_seq_putc(s, '\n');
1802} 1598}
1803 1599
1804static void test_cpu_buff_start(struct trace_iterator *iter) 1600static void test_cpu_buff_start(struct trace_iterator *iter)
@@ -1818,453 +1614,105 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
1818 trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu); 1614 trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu);
1819} 1615}
1820 1616
1821static enum print_line_t
1822print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
1823{
1824 struct trace_seq *s = &iter->seq;
1825 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
1826 struct trace_entry *next_entry;
1827 unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
1828 struct trace_entry *entry = iter->ent;
1829 unsigned long abs_usecs;
1830 unsigned long rel_usecs;
1831 u64 next_ts;
1832 char *comm;
1833 int S, T;
1834 int i;
1835
1836 if (entry->type == TRACE_CONT)
1837 return TRACE_TYPE_HANDLED;
1838
1839 test_cpu_buff_start(iter);
1840
1841 next_entry = find_next_entry(iter, NULL, &next_ts);
1842 if (!next_entry)
1843 next_ts = iter->ts;
1844 rel_usecs = ns2usecs(next_ts - iter->ts);
1845 abs_usecs = ns2usecs(iter->ts - iter->tr->time_start);
1846
1847 if (verbose) {
1848 comm = trace_find_cmdline(entry->pid);
1849 trace_seq_printf(s, "%16s %5d %3d %d %08x %08x [%08lx]"
1850 " %ld.%03ldms (+%ld.%03ldms): ",
1851 comm,
1852 entry->pid, cpu, entry->flags,
1853 entry->preempt_count, trace_idx,
1854 ns2usecs(iter->ts),
1855 abs_usecs/1000,
1856 abs_usecs % 1000, rel_usecs/1000,
1857 rel_usecs % 1000);
1858 } else {
1859 lat_print_generic(s, entry, cpu);
1860 lat_print_timestamp(s, abs_usecs, rel_usecs);
1861 }
1862 switch (entry->type) {
1863 case TRACE_FN: {
1864 struct ftrace_entry *field;
1865
1866 trace_assign_type(field, entry);
1867
1868 seq_print_ip_sym(s, field->ip, sym_flags);
1869 trace_seq_puts(s, " (");
1870 seq_print_ip_sym(s, field->parent_ip, sym_flags);
1871 trace_seq_puts(s, ")\n");
1872 break;
1873 }
1874 case TRACE_CTX:
1875 case TRACE_WAKE: {
1876 struct ctx_switch_entry *field;
1877
1878 trace_assign_type(field, entry);
1879
1880 T = task_state_char(field->next_state);
1881 S = task_state_char(field->prev_state);
1882 comm = trace_find_cmdline(field->next_pid);
1883 trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
1884 field->prev_pid,
1885 field->prev_prio,
1886 S, entry->type == TRACE_CTX ? "==>" : " +",
1887 field->next_cpu,
1888 field->next_pid,
1889 field->next_prio,
1890 T, comm);
1891 break;
1892 }
1893 case TRACE_SPECIAL: {
1894 struct special_entry *field;
1895
1896 trace_assign_type(field, entry);
1897
1898 trace_seq_printf(s, "# %ld %ld %ld\n",
1899 field->arg1,
1900 field->arg2,
1901 field->arg3);
1902 break;
1903 }
1904 case TRACE_STACK: {
1905 struct stack_entry *field;
1906
1907 trace_assign_type(field, entry);
1908
1909 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
1910 if (i)
1911 trace_seq_puts(s, " <= ");
1912 seq_print_ip_sym(s, field->caller[i], sym_flags);
1913 }
1914 trace_seq_puts(s, "\n");
1915 break;
1916 }
1917 case TRACE_PRINT: {
1918 struct print_entry *field;
1919
1920 trace_assign_type(field, entry);
1921
1922 seq_print_ip_sym(s, field->ip, sym_flags);
1923 trace_seq_printf(s, ": %s", field->buf);
1924 if (entry->flags & TRACE_FLAG_CONT)
1925 trace_seq_print_cont(s, iter);
1926 break;
1927 }
1928 case TRACE_BRANCH: {
1929 struct trace_branch *field;
1930
1931 trace_assign_type(field, entry);
1932
1933 trace_seq_printf(s, "[%s] %s:%s:%d\n",
1934 field->correct ? " ok " : " MISS ",
1935 field->func,
1936 field->file,
1937 field->line);
1938 break;
1939 }
1940 case TRACE_USER_STACK: {
1941 struct userstack_entry *field;
1942
1943 trace_assign_type(field, entry);
1944
1945 seq_print_userip_objs(field, s, sym_flags);
1946 trace_seq_putc(s, '\n');
1947 break;
1948 }
1949 default:
1950 trace_seq_printf(s, "Unknown type %d\n", entry->type);
1951 }
1952 return TRACE_TYPE_HANDLED;
1953}
1954
1955static enum print_line_t print_trace_fmt(struct trace_iterator *iter) 1617static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
1956{ 1618{
1957 struct trace_seq *s = &iter->seq; 1619 struct trace_seq *s = &iter->seq;
1958 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); 1620 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
1959 struct trace_entry *entry; 1621 struct trace_entry *entry;
1960 unsigned long usec_rem; 1622 struct trace_event *event;
1961 unsigned long long t;
1962 unsigned long secs;
1963 char *comm;
1964 int ret;
1965 int S, T;
1966 int i;
1967 1623
1968 entry = iter->ent; 1624 entry = iter->ent;
1969 1625
1970 if (entry->type == TRACE_CONT)
1971 return TRACE_TYPE_HANDLED;
1972
1973 test_cpu_buff_start(iter); 1626 test_cpu_buff_start(iter);
1974 1627
1975 comm = trace_find_cmdline(iter->ent->pid); 1628 event = ftrace_find_event(entry->type);
1976 1629
1977 t = ns2usecs(iter->ts); 1630 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
1978 usec_rem = do_div(t, 1000000ULL); 1631 if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
1979 secs = (unsigned long)t; 1632 if (!trace_print_lat_context(iter))
1980 1633 goto partial;
1981 ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); 1634 } else {
1982 if (!ret) 1635 if (!trace_print_context(iter))
1983 return TRACE_TYPE_PARTIAL_LINE; 1636 goto partial;
1984 ret = trace_seq_printf(s, "[%03d] ", iter->cpu);
1985 if (!ret)
1986 return TRACE_TYPE_PARTIAL_LINE;
1987 ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem);
1988 if (!ret)
1989 return TRACE_TYPE_PARTIAL_LINE;
1990
1991 switch (entry->type) {
1992 case TRACE_FN: {
1993 struct ftrace_entry *field;
1994
1995 trace_assign_type(field, entry);
1996
1997 ret = seq_print_ip_sym(s, field->ip, sym_flags);
1998 if (!ret)
1999 return TRACE_TYPE_PARTIAL_LINE;
2000 if ((sym_flags & TRACE_ITER_PRINT_PARENT) &&
2001 field->parent_ip) {
2002 ret = trace_seq_printf(s, " <-");
2003 if (!ret)
2004 return TRACE_TYPE_PARTIAL_LINE;
2005 ret = seq_print_ip_sym(s,
2006 field->parent_ip,
2007 sym_flags);
2008 if (!ret)
2009 return TRACE_TYPE_PARTIAL_LINE;
2010 }
2011 ret = trace_seq_printf(s, "\n");
2012 if (!ret)
2013 return TRACE_TYPE_PARTIAL_LINE;
2014 break;
2015 }
2016 case TRACE_CTX:
2017 case TRACE_WAKE: {
2018 struct ctx_switch_entry *field;
2019
2020 trace_assign_type(field, entry);
2021
2022 T = task_state_char(field->next_state);
2023 S = task_state_char(field->prev_state);
2024 ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n",
2025 field->prev_pid,
2026 field->prev_prio,
2027 S,
2028 entry->type == TRACE_CTX ? "==>" : " +",
2029 field->next_cpu,
2030 field->next_pid,
2031 field->next_prio,
2032 T);
2033 if (!ret)
2034 return TRACE_TYPE_PARTIAL_LINE;
2035 break;
2036 }
2037 case TRACE_SPECIAL: {
2038 struct special_entry *field;
2039
2040 trace_assign_type(field, entry);
2041
2042 ret = trace_seq_printf(s, "# %ld %ld %ld\n",
2043 field->arg1,
2044 field->arg2,
2045 field->arg3);
2046 if (!ret)
2047 return TRACE_TYPE_PARTIAL_LINE;
2048 break;
2049 }
2050 case TRACE_STACK: {
2051 struct stack_entry *field;
2052
2053 trace_assign_type(field, entry);
2054
2055 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
2056 if (i) {
2057 ret = trace_seq_puts(s, " <= ");
2058 if (!ret)
2059 return TRACE_TYPE_PARTIAL_LINE;
2060 }
2061 ret = seq_print_ip_sym(s, field->caller[i],
2062 sym_flags);
2063 if (!ret)
2064 return TRACE_TYPE_PARTIAL_LINE;
2065 } 1637 }
2066 ret = trace_seq_puts(s, "\n");
2067 if (!ret)
2068 return TRACE_TYPE_PARTIAL_LINE;
2069 break;
2070 } 1638 }
2071 case TRACE_PRINT: {
2072 struct print_entry *field;
2073 1639
2074 trace_assign_type(field, entry); 1640 if (event)
1641 return event->trace(iter, sym_flags);
2075 1642
2076 seq_print_ip_sym(s, field->ip, sym_flags); 1643 if (!trace_seq_printf(s, "Unknown type %d\n", entry->type))
2077 trace_seq_printf(s, ": %s", field->buf); 1644 goto partial;
2078 if (entry->flags & TRACE_FLAG_CONT)
2079 trace_seq_print_cont(s, iter);
2080 break;
2081 }
2082 case TRACE_GRAPH_RET: {
2083 return print_graph_function(iter);
2084 }
2085 case TRACE_GRAPH_ENT: {
2086 return print_graph_function(iter);
2087 }
2088 case TRACE_BRANCH: {
2089 struct trace_branch *field;
2090 1645
2091 trace_assign_type(field, entry);
2092
2093 trace_seq_printf(s, "[%s] %s:%s:%d\n",
2094 field->correct ? " ok " : " MISS ",
2095 field->func,
2096 field->file,
2097 field->line);
2098 break;
2099 }
2100 case TRACE_USER_STACK: {
2101 struct userstack_entry *field;
2102
2103 trace_assign_type(field, entry);
2104
2105 ret = seq_print_userip_objs(field, s, sym_flags);
2106 if (!ret)
2107 return TRACE_TYPE_PARTIAL_LINE;
2108 ret = trace_seq_putc(s, '\n');
2109 if (!ret)
2110 return TRACE_TYPE_PARTIAL_LINE;
2111 break;
2112 }
2113 }
2114 return TRACE_TYPE_HANDLED; 1646 return TRACE_TYPE_HANDLED;
1647partial:
1648 return TRACE_TYPE_PARTIAL_LINE;
2115} 1649}
2116 1650
2117static enum print_line_t print_raw_fmt(struct trace_iterator *iter) 1651static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
2118{ 1652{
2119 struct trace_seq *s = &iter->seq; 1653 struct trace_seq *s = &iter->seq;
2120 struct trace_entry *entry; 1654 struct trace_entry *entry;
2121 int ret; 1655 struct trace_event *event;
2122 int S, T;
2123 1656
2124 entry = iter->ent; 1657 entry = iter->ent;
2125 1658
2126 if (entry->type == TRACE_CONT) 1659 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
2127 return TRACE_TYPE_HANDLED; 1660 if (!trace_seq_printf(s, "%d %d %llu ",
2128 1661 entry->pid, iter->cpu, iter->ts))
2129 ret = trace_seq_printf(s, "%d %d %llu ", 1662 goto partial;
2130 entry->pid, iter->cpu, iter->ts);
2131 if (!ret)
2132 return TRACE_TYPE_PARTIAL_LINE;
2133
2134 switch (entry->type) {
2135 case TRACE_FN: {
2136 struct ftrace_entry *field;
2137
2138 trace_assign_type(field, entry);
2139
2140 ret = trace_seq_printf(s, "%x %x\n",
2141 field->ip,
2142 field->parent_ip);
2143 if (!ret)
2144 return TRACE_TYPE_PARTIAL_LINE;
2145 break;
2146 } 1663 }
2147 case TRACE_CTX:
2148 case TRACE_WAKE: {
2149 struct ctx_switch_entry *field;
2150
2151 trace_assign_type(field, entry);
2152
2153 T = task_state_char(field->next_state);
2154 S = entry->type == TRACE_WAKE ? '+' :
2155 task_state_char(field->prev_state);
2156 ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n",
2157 field->prev_pid,
2158 field->prev_prio,
2159 S,
2160 field->next_cpu,
2161 field->next_pid,
2162 field->next_prio,
2163 T);
2164 if (!ret)
2165 return TRACE_TYPE_PARTIAL_LINE;
2166 break;
2167 }
2168 case TRACE_SPECIAL:
2169 case TRACE_USER_STACK:
2170 case TRACE_STACK: {
2171 struct special_entry *field;
2172
2173 trace_assign_type(field, entry);
2174 1664
2175 ret = trace_seq_printf(s, "# %ld %ld %ld\n", 1665 event = ftrace_find_event(entry->type);
2176 field->arg1, 1666 if (event)
2177 field->arg2, 1667 return event->raw(iter, 0);
2178 field->arg3);
2179 if (!ret)
2180 return TRACE_TYPE_PARTIAL_LINE;
2181 break;
2182 }
2183 case TRACE_PRINT: {
2184 struct print_entry *field;
2185 1668
2186 trace_assign_type(field, entry); 1669 if (!trace_seq_printf(s, "%d ?\n", entry->type))
1670 goto partial;
2187 1671
2188 trace_seq_printf(s, "# %lx %s", field->ip, field->buf);
2189 if (entry->flags & TRACE_FLAG_CONT)
2190 trace_seq_print_cont(s, iter);
2191 break;
2192 }
2193 }
2194 return TRACE_TYPE_HANDLED; 1672 return TRACE_TYPE_HANDLED;
1673partial:
1674 return TRACE_TYPE_PARTIAL_LINE;
2195} 1675}
2196 1676
2197#define SEQ_PUT_FIELD_RET(s, x) \
2198do { \
2199 if (!trace_seq_putmem(s, &(x), sizeof(x))) \
2200 return 0; \
2201} while (0)
2202
2203#define SEQ_PUT_HEX_FIELD_RET(s, x) \
2204do { \
2205 BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \
2206 if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \
2207 return 0; \
2208} while (0)
2209
2210static enum print_line_t print_hex_fmt(struct trace_iterator *iter) 1677static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
2211{ 1678{
2212 struct trace_seq *s = &iter->seq; 1679 struct trace_seq *s = &iter->seq;
2213 unsigned char newline = '\n'; 1680 unsigned char newline = '\n';
2214 struct trace_entry *entry; 1681 struct trace_entry *entry;
2215 int S, T; 1682 struct trace_event *event;
2216 1683
2217 entry = iter->ent; 1684 entry = iter->ent;
2218 1685
2219 if (entry->type == TRACE_CONT) 1686 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
2220 return TRACE_TYPE_HANDLED; 1687 SEQ_PUT_HEX_FIELD_RET(s, entry->pid);
1688 SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
1689 SEQ_PUT_HEX_FIELD_RET(s, iter->ts);
1690 }
2221 1691
2222 SEQ_PUT_HEX_FIELD_RET(s, entry->pid); 1692 event = ftrace_find_event(entry->type);
2223 SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); 1693 if (event) {
2224 SEQ_PUT_HEX_FIELD_RET(s, iter->ts); 1694 enum print_line_t ret = event->hex(iter, 0);
1695 if (ret != TRACE_TYPE_HANDLED)
1696 return ret;
1697 }
2225 1698
2226 switch (entry->type) { 1699 SEQ_PUT_FIELD_RET(s, newline);
2227 case TRACE_FN: {
2228 struct ftrace_entry *field;
2229 1700
2230 trace_assign_type(field, entry); 1701 return TRACE_TYPE_HANDLED;
1702}
2231 1703
2232 SEQ_PUT_HEX_FIELD_RET(s, field->ip); 1704static enum print_line_t print_bprintk_msg_only(struct trace_iterator *iter)
2233 SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip); 1705{
2234 break; 1706 struct trace_seq *s = &iter->seq;
2235 } 1707 struct trace_entry *entry = iter->ent;
2236 case TRACE_CTX: 1708 struct bprint_entry *field;
2237 case TRACE_WAKE: { 1709 int ret;
2238 struct ctx_switch_entry *field;
2239
2240 trace_assign_type(field, entry);
2241
2242 T = task_state_char(field->next_state);
2243 S = entry->type == TRACE_WAKE ? '+' :
2244 task_state_char(field->prev_state);
2245 SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
2246 SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio);
2247 SEQ_PUT_HEX_FIELD_RET(s, S);
2248 SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu);
2249 SEQ_PUT_HEX_FIELD_RET(s, field->next_pid);
2250 SEQ_PUT_HEX_FIELD_RET(s, field->next_prio);
2251 SEQ_PUT_HEX_FIELD_RET(s, T);
2252 break;
2253 }
2254 case TRACE_SPECIAL:
2255 case TRACE_USER_STACK:
2256 case TRACE_STACK: {
2257 struct special_entry *field;
2258 1710
2259 trace_assign_type(field, entry); 1711 trace_assign_type(field, entry);
2260 1712
2261 SEQ_PUT_HEX_FIELD_RET(s, field->arg1); 1713 ret = trace_seq_bprintf(s, field->fmt, field->buf);
2262 SEQ_PUT_HEX_FIELD_RET(s, field->arg2); 1714 if (!ret)
2263 SEQ_PUT_HEX_FIELD_RET(s, field->arg3); 1715 return TRACE_TYPE_PARTIAL_LINE;
2264 break;
2265 }
2266 }
2267 SEQ_PUT_FIELD_RET(s, newline);
2268 1716
2269 return TRACE_TYPE_HANDLED; 1717 return TRACE_TYPE_HANDLED;
2270} 1718}
@@ -2278,13 +1726,10 @@ static enum print_line_t print_printk_msg_only(struct trace_iterator *iter)
2278 1726
2279 trace_assign_type(field, entry); 1727 trace_assign_type(field, entry);
2280 1728
2281 ret = trace_seq_printf(s, field->buf); 1729 ret = trace_seq_printf(s, "%s", field->buf);
2282 if (!ret) 1730 if (!ret)
2283 return TRACE_TYPE_PARTIAL_LINE; 1731 return TRACE_TYPE_PARTIAL_LINE;
2284 1732
2285 if (entry->flags & TRACE_FLAG_CONT)
2286 trace_seq_print_cont(s, iter);
2287
2288 return TRACE_TYPE_HANDLED; 1733 return TRACE_TYPE_HANDLED;
2289} 1734}
2290 1735
@@ -2292,59 +1737,37 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
2292{ 1737{
2293 struct trace_seq *s = &iter->seq; 1738 struct trace_seq *s = &iter->seq;
2294 struct trace_entry *entry; 1739 struct trace_entry *entry;
1740 struct trace_event *event;
2295 1741
2296 entry = iter->ent; 1742 entry = iter->ent;
2297 1743
2298 if (entry->type == TRACE_CONT) 1744 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
2299 return TRACE_TYPE_HANDLED; 1745 SEQ_PUT_FIELD_RET(s, entry->pid);
2300 1746 SEQ_PUT_FIELD_RET(s, iter->cpu);
2301 SEQ_PUT_FIELD_RET(s, entry->pid); 1747 SEQ_PUT_FIELD_RET(s, iter->ts);
2302 SEQ_PUT_FIELD_RET(s, entry->cpu);
2303 SEQ_PUT_FIELD_RET(s, iter->ts);
2304
2305 switch (entry->type) {
2306 case TRACE_FN: {
2307 struct ftrace_entry *field;
2308
2309 trace_assign_type(field, entry);
2310
2311 SEQ_PUT_FIELD_RET(s, field->ip);
2312 SEQ_PUT_FIELD_RET(s, field->parent_ip);
2313 break;
2314 }
2315 case TRACE_CTX: {
2316 struct ctx_switch_entry *field;
2317
2318 trace_assign_type(field, entry);
2319
2320 SEQ_PUT_FIELD_RET(s, field->prev_pid);
2321 SEQ_PUT_FIELD_RET(s, field->prev_prio);
2322 SEQ_PUT_FIELD_RET(s, field->prev_state);
2323 SEQ_PUT_FIELD_RET(s, field->next_pid);
2324 SEQ_PUT_FIELD_RET(s, field->next_prio);
2325 SEQ_PUT_FIELD_RET(s, field->next_state);
2326 break;
2327 } 1748 }
2328 case TRACE_SPECIAL:
2329 case TRACE_USER_STACK:
2330 case TRACE_STACK: {
2331 struct special_entry *field;
2332
2333 trace_assign_type(field, entry);
2334 1749
2335 SEQ_PUT_FIELD_RET(s, field->arg1); 1750 event = ftrace_find_event(entry->type);
2336 SEQ_PUT_FIELD_RET(s, field->arg2); 1751 return event ? event->binary(iter, 0) : TRACE_TYPE_HANDLED;
2337 SEQ_PUT_FIELD_RET(s, field->arg3);
2338 break;
2339 }
2340 }
2341 return 1;
2342} 1752}
2343 1753
2344static int trace_empty(struct trace_iterator *iter) 1754static int trace_empty(struct trace_iterator *iter)
2345{ 1755{
2346 int cpu; 1756 int cpu;
2347 1757
1758 /* If we are looking at one CPU buffer, only check that one */
1759 if (iter->cpu_file != TRACE_PIPE_ALL_CPU) {
1760 cpu = iter->cpu_file;
1761 if (iter->buffer_iter[cpu]) {
1762 if (!ring_buffer_iter_empty(iter->buffer_iter[cpu]))
1763 return 0;
1764 } else {
1765 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu))
1766 return 0;
1767 }
1768 return 1;
1769 }
1770
2348 for_each_tracing_cpu(cpu) { 1771 for_each_tracing_cpu(cpu) {
2349 if (iter->buffer_iter[cpu]) { 1772 if (iter->buffer_iter[cpu]) {
2350 if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) 1773 if (!ring_buffer_iter_empty(iter->buffer_iter[cpu]))
@@ -2368,6 +1791,11 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
2368 return ret; 1791 return ret;
2369 } 1792 }
2370 1793
1794 if (iter->ent->type == TRACE_BPRINT &&
1795 trace_flags & TRACE_ITER_PRINTK &&
1796 trace_flags & TRACE_ITER_PRINTK_MSGONLY)
1797 return print_bprintk_msg_only(iter);
1798
2371 if (iter->ent->type == TRACE_PRINT && 1799 if (iter->ent->type == TRACE_PRINT &&
2372 trace_flags & TRACE_ITER_PRINTK && 1800 trace_flags & TRACE_ITER_PRINTK &&
2373 trace_flags & TRACE_ITER_PRINTK_MSGONLY) 1801 trace_flags & TRACE_ITER_PRINTK_MSGONLY)
@@ -2382,9 +1810,6 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
2382 if (trace_flags & TRACE_ITER_RAW) 1810 if (trace_flags & TRACE_ITER_RAW)
2383 return print_raw_fmt(iter); 1811 return print_raw_fmt(iter);
2384 1812
2385 if (iter->iter_flags & TRACE_FILE_LAT_FMT)
2386 return print_lat_fmt(iter, iter->idx, iter->cpu);
2387
2388 return print_trace_fmt(iter); 1813 return print_trace_fmt(iter);
2389} 1814}
2390 1815
@@ -2426,30 +1851,40 @@ static struct seq_operations tracer_seq_ops = {
2426}; 1851};
2427 1852
2428static struct trace_iterator * 1853static struct trace_iterator *
2429__tracing_open(struct inode *inode, struct file *file, int *ret) 1854__tracing_open(struct inode *inode, struct file *file)
2430{ 1855{
1856 long cpu_file = (long) inode->i_private;
1857 void *fail_ret = ERR_PTR(-ENOMEM);
2431 struct trace_iterator *iter; 1858 struct trace_iterator *iter;
2432 struct seq_file *m; 1859 struct seq_file *m;
2433 int cpu; 1860 int cpu, ret;
2434 1861
2435 if (tracing_disabled) { 1862 if (tracing_disabled)
2436 *ret = -ENODEV; 1863 return ERR_PTR(-ENODEV);
2437 return NULL;
2438 }
2439 1864
2440 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 1865 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
2441 if (!iter) { 1866 if (!iter)
2442 *ret = -ENOMEM; 1867 return ERR_PTR(-ENOMEM);
2443 goto out;
2444 }
2445 1868
1869 /*
1870 * We make a copy of the current tracer to avoid concurrent
1871 * changes on it while we are reading.
1872 */
2446 mutex_lock(&trace_types_lock); 1873 mutex_lock(&trace_types_lock);
1874 iter->trace = kzalloc(sizeof(*iter->trace), GFP_KERNEL);
1875 if (!iter->trace)
1876 goto fail;
1877
1878 if (current_trace)
1879 *iter->trace = *current_trace;
1880
2447 if (current_trace && current_trace->print_max) 1881 if (current_trace && current_trace->print_max)
2448 iter->tr = &max_tr; 1882 iter->tr = &max_tr;
2449 else 1883 else
2450 iter->tr = inode->i_private; 1884 iter->tr = &global_trace;
2451 iter->trace = current_trace;
2452 iter->pos = -1; 1885 iter->pos = -1;
1886 mutex_init(&iter->mutex);
1887 iter->cpu_file = cpu_file;
2453 1888
2454 /* Notify the tracer early; before we stop tracing. */ 1889 /* Notify the tracer early; before we stop tracing. */
2455 if (iter->trace && iter->trace->open) 1890 if (iter->trace && iter->trace->open)
@@ -2459,20 +1894,24 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
2459 if (ring_buffer_overruns(iter->tr->buffer)) 1894 if (ring_buffer_overruns(iter->tr->buffer))
2460 iter->iter_flags |= TRACE_FILE_ANNOTATE; 1895 iter->iter_flags |= TRACE_FILE_ANNOTATE;
2461 1896
1897 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
1898 for_each_tracing_cpu(cpu) {
2462 1899
2463 for_each_tracing_cpu(cpu) { 1900 iter->buffer_iter[cpu] =
2464 1901 ring_buffer_read_start(iter->tr->buffer, cpu);
1902 }
1903 } else {
1904 cpu = iter->cpu_file;
2465 iter->buffer_iter[cpu] = 1905 iter->buffer_iter[cpu] =
2466 ring_buffer_read_start(iter->tr->buffer, cpu); 1906 ring_buffer_read_start(iter->tr->buffer, cpu);
2467
2468 if (!iter->buffer_iter[cpu])
2469 goto fail_buffer;
2470 } 1907 }
2471 1908
2472 /* TODO stop tracer */ 1909 /* TODO stop tracer */
2473 *ret = seq_open(file, &tracer_seq_ops); 1910 ret = seq_open(file, &tracer_seq_ops);
2474 if (*ret) 1911 if (ret < 0) {
1912 fail_ret = ERR_PTR(ret);
2475 goto fail_buffer; 1913 goto fail_buffer;
1914 }
2476 1915
2477 m = file->private_data; 1916 m = file->private_data;
2478 m->private = iter; 1917 m->private = iter;
@@ -2482,7 +1921,6 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
2482 1921
2483 mutex_unlock(&trace_types_lock); 1922 mutex_unlock(&trace_types_lock);
2484 1923
2485 out:
2486 return iter; 1924 return iter;
2487 1925
2488 fail_buffer: 1926 fail_buffer:
@@ -2490,10 +1928,12 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
2490 if (iter->buffer_iter[cpu]) 1928 if (iter->buffer_iter[cpu])
2491 ring_buffer_read_finish(iter->buffer_iter[cpu]); 1929 ring_buffer_read_finish(iter->buffer_iter[cpu]);
2492 } 1930 }
1931 fail:
2493 mutex_unlock(&trace_types_lock); 1932 mutex_unlock(&trace_types_lock);
1933 kfree(iter->trace);
2494 kfree(iter); 1934 kfree(iter);
2495 1935
2496 return ERR_PTR(-ENOMEM); 1936 return fail_ret;
2497} 1937}
2498 1938
2499int tracing_open_generic(struct inode *inode, struct file *filp) 1939int tracing_open_generic(struct inode *inode, struct file *filp)
@@ -2505,7 +1945,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
2505 return 0; 1945 return 0;
2506} 1946}
2507 1947
2508int tracing_release(struct inode *inode, struct file *file) 1948static int tracing_release(struct inode *inode, struct file *file)
2509{ 1949{
2510 struct seq_file *m = (struct seq_file *)file->private_data; 1950 struct seq_file *m = (struct seq_file *)file->private_data;
2511 struct trace_iterator *iter = m->private; 1951 struct trace_iterator *iter = m->private;
@@ -2525,33 +1965,26 @@ int tracing_release(struct inode *inode, struct file *file)
2525 mutex_unlock(&trace_types_lock); 1965 mutex_unlock(&trace_types_lock);
2526 1966
2527 seq_release(inode, file); 1967 seq_release(inode, file);
1968 mutex_destroy(&iter->mutex);
1969 kfree(iter->trace);
2528 kfree(iter); 1970 kfree(iter);
2529 return 0; 1971 return 0;
2530} 1972}
2531 1973
2532static int tracing_open(struct inode *inode, struct file *file) 1974static int tracing_open(struct inode *inode, struct file *file)
2533{ 1975{
2534 int ret;
2535
2536 __tracing_open(inode, file, &ret);
2537
2538 return ret;
2539}
2540
2541static int tracing_lt_open(struct inode *inode, struct file *file)
2542{
2543 struct trace_iterator *iter; 1976 struct trace_iterator *iter;
2544 int ret; 1977 int ret = 0;
2545
2546 iter = __tracing_open(inode, file, &ret);
2547 1978
2548 if (!ret) 1979 iter = __tracing_open(inode, file);
1980 if (IS_ERR(iter))
1981 ret = PTR_ERR(iter);
1982 else if (trace_flags & TRACE_ITER_LATENCY_FMT)
2549 iter->iter_flags |= TRACE_FILE_LAT_FMT; 1983 iter->iter_flags |= TRACE_FILE_LAT_FMT;
2550 1984
2551 return ret; 1985 return ret;
2552} 1986}
2553 1987
2554
2555static void * 1988static void *
2556t_next(struct seq_file *m, void *v, loff_t *pos) 1989t_next(struct seq_file *m, void *v, loff_t *pos)
2557{ 1990{
@@ -2623,21 +2056,14 @@ static int show_traces_open(struct inode *inode, struct file *file)
2623 return ret; 2056 return ret;
2624} 2057}
2625 2058
2626static struct file_operations tracing_fops = { 2059static const struct file_operations tracing_fops = {
2627 .open = tracing_open, 2060 .open = tracing_open,
2628 .read = seq_read, 2061 .read = seq_read,
2629 .llseek = seq_lseek, 2062 .llseek = seq_lseek,
2630 .release = tracing_release, 2063 .release = tracing_release,
2631}; 2064};
2632 2065
2633static struct file_operations tracing_lt_fops = { 2066static const struct file_operations show_traces_fops = {
2634 .open = tracing_lt_open,
2635 .read = seq_read,
2636 .llseek = seq_lseek,
2637 .release = tracing_release,
2638};
2639
2640static struct file_operations show_traces_fops = {
2641 .open = show_traces_open, 2067 .open = show_traces_open,
2642 .read = seq_read, 2068 .read = seq_read,
2643 .release = seq_release, 2069 .release = seq_release,
@@ -2730,7 +2156,7 @@ err_unlock:
2730 return err; 2156 return err;
2731} 2157}
2732 2158
2733static struct file_operations tracing_cpumask_fops = { 2159static const struct file_operations tracing_cpumask_fops = {
2734 .open = tracing_open_generic, 2160 .open = tracing_open_generic,
2735 .read = tracing_cpumask_read, 2161 .read = tracing_cpumask_read,
2736 .write = tracing_cpumask_write, 2162 .write = tracing_cpumask_write,
@@ -2740,57 +2166,62 @@ static ssize_t
2740tracing_trace_options_read(struct file *filp, char __user *ubuf, 2166tracing_trace_options_read(struct file *filp, char __user *ubuf,
2741 size_t cnt, loff_t *ppos) 2167 size_t cnt, loff_t *ppos)
2742{ 2168{
2743 int i; 2169 struct tracer_opt *trace_opts;
2170 u32 tracer_flags;
2171 int len = 0;
2744 char *buf; 2172 char *buf;
2745 int r = 0; 2173 int r = 0;
2746 int len = 0; 2174 int i;
2747 u32 tracer_flags = current_trace->flags->val;
2748 struct tracer_opt *trace_opts = current_trace->flags->opts;
2749 2175
2750 2176
2751 /* calulate max size */ 2177 /* calculate max size */
2752 for (i = 0; trace_options[i]; i++) { 2178 for (i = 0; trace_options[i]; i++) {
2753 len += strlen(trace_options[i]); 2179 len += strlen(trace_options[i]);
2754 len += 3; /* "no" and space */ 2180 len += 3; /* "no" and newline */
2755 } 2181 }
2756 2182
2183 mutex_lock(&trace_types_lock);
2184 tracer_flags = current_trace->flags->val;
2185 trace_opts = current_trace->flags->opts;
2186
2757 /* 2187 /*
2758 * Increase the size with names of options specific 2188 * Increase the size with names of options specific
2759 * of the current tracer. 2189 * of the current tracer.
2760 */ 2190 */
2761 for (i = 0; trace_opts[i].name; i++) { 2191 for (i = 0; trace_opts[i].name; i++) {
2762 len += strlen(trace_opts[i].name); 2192 len += strlen(trace_opts[i].name);
2763 len += 3; /* "no" and space */ 2193 len += 3; /* "no" and newline */
2764 } 2194 }
2765 2195
2766 /* +2 for \n and \0 */ 2196 /* +2 for \n and \0 */
2767 buf = kmalloc(len + 2, GFP_KERNEL); 2197 buf = kmalloc(len + 2, GFP_KERNEL);
2768 if (!buf) 2198 if (!buf) {
2199 mutex_unlock(&trace_types_lock);
2769 return -ENOMEM; 2200 return -ENOMEM;
2201 }
2770 2202
2771 for (i = 0; trace_options[i]; i++) { 2203 for (i = 0; trace_options[i]; i++) {
2772 if (trace_flags & (1 << i)) 2204 if (trace_flags & (1 << i))
2773 r += sprintf(buf + r, "%s ", trace_options[i]); 2205 r += sprintf(buf + r, "%s\n", trace_options[i]);
2774 else 2206 else
2775 r += sprintf(buf + r, "no%s ", trace_options[i]); 2207 r += sprintf(buf + r, "no%s\n", trace_options[i]);
2776 } 2208 }
2777 2209
2778 for (i = 0; trace_opts[i].name; i++) { 2210 for (i = 0; trace_opts[i].name; i++) {
2779 if (tracer_flags & trace_opts[i].bit) 2211 if (tracer_flags & trace_opts[i].bit)
2780 r += sprintf(buf + r, "%s ", 2212 r += sprintf(buf + r, "%s\n",
2781 trace_opts[i].name); 2213 trace_opts[i].name);
2782 else 2214 else
2783 r += sprintf(buf + r, "no%s ", 2215 r += sprintf(buf + r, "no%s\n",
2784 trace_opts[i].name); 2216 trace_opts[i].name);
2785 } 2217 }
2218 mutex_unlock(&trace_types_lock);
2786 2219
2787 r += sprintf(buf + r, "\n");
2788 WARN_ON(r >= len + 2); 2220 WARN_ON(r >= len + 2);
2789 2221
2790 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2222 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2791 2223
2792 kfree(buf); 2224 kfree(buf);
2793
2794 return r; 2225 return r;
2795} 2226}
2796 2227
@@ -2828,6 +2259,34 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
2828 return 0; 2259 return 0;
2829} 2260}
2830 2261
2262static void set_tracer_flags(unsigned int mask, int enabled)
2263{
2264 /* do nothing if flag is already set */
2265 if (!!(trace_flags & mask) == !!enabled)
2266 return;
2267
2268 if (enabled)
2269 trace_flags |= mask;
2270 else
2271 trace_flags &= ~mask;
2272
2273 if (mask == TRACE_ITER_GLOBAL_CLK) {
2274 u64 (*func)(void);
2275
2276 if (enabled)
2277 func = trace_clock_global;
2278 else
2279 func = trace_clock_local;
2280
2281 mutex_lock(&trace_types_lock);
2282 ring_buffer_set_clock(global_trace.buffer, func);
2283
2284 if (max_tr.buffer)
2285 ring_buffer_set_clock(max_tr.buffer, func);
2286 mutex_unlock(&trace_types_lock);
2287 }
2288}
2289
2831static ssize_t 2290static ssize_t
2832tracing_trace_options_write(struct file *filp, const char __user *ubuf, 2291tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2833 size_t cnt, loff_t *ppos) 2292 size_t cnt, loff_t *ppos)
@@ -2855,17 +2314,16 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2855 int len = strlen(trace_options[i]); 2314 int len = strlen(trace_options[i]);
2856 2315
2857 if (strncmp(cmp, trace_options[i], len) == 0) { 2316 if (strncmp(cmp, trace_options[i], len) == 0) {
2858 if (neg) 2317 set_tracer_flags(1 << i, !neg);
2859 trace_flags &= ~(1 << i);
2860 else
2861 trace_flags |= (1 << i);
2862 break; 2318 break;
2863 } 2319 }
2864 } 2320 }
2865 2321
2866 /* If no option could be set, test the specific tracer options */ 2322 /* If no option could be set, test the specific tracer options */
2867 if (!trace_options[i]) { 2323 if (!trace_options[i]) {
2324 mutex_lock(&trace_types_lock);
2868 ret = set_tracer_option(current_trace, cmp, neg); 2325 ret = set_tracer_option(current_trace, cmp, neg);
2326 mutex_unlock(&trace_types_lock);
2869 if (ret) 2327 if (ret)
2870 return ret; 2328 return ret;
2871 } 2329 }
@@ -2875,7 +2333,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2875 return cnt; 2333 return cnt;
2876} 2334}
2877 2335
2878static struct file_operations tracing_iter_fops = { 2336static const struct file_operations tracing_iter_fops = {
2879 .open = tracing_open_generic, 2337 .open = tracing_open_generic,
2880 .read = tracing_trace_options_read, 2338 .read = tracing_trace_options_read,
2881 .write = tracing_trace_options_write, 2339 .write = tracing_trace_options_write,
@@ -2908,7 +2366,7 @@ tracing_readme_read(struct file *filp, char __user *ubuf,
2908 readme_msg, strlen(readme_msg)); 2366 readme_msg, strlen(readme_msg));
2909} 2367}
2910 2368
2911static struct file_operations tracing_readme_fops = { 2369static const struct file_operations tracing_readme_fops = {
2912 .open = tracing_open_generic, 2370 .open = tracing_open_generic,
2913 .read = tracing_readme_read, 2371 .read = tracing_readme_read,
2914}; 2372};
@@ -2930,7 +2388,7 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2930{ 2388{
2931 struct trace_array *tr = filp->private_data; 2389 struct trace_array *tr = filp->private_data;
2932 char buf[64]; 2390 char buf[64];
2933 long val; 2391 unsigned long val;
2934 int ret; 2392 int ret;
2935 2393
2936 if (cnt >= sizeof(buf)) 2394 if (cnt >= sizeof(buf))
@@ -2985,13 +2443,105 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
2985 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2443 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2986} 2444}
2987 2445
2988static int tracing_set_tracer(char *buf) 2446int tracer_init(struct tracer *t, struct trace_array *tr)
2447{
2448 tracing_reset_online_cpus(tr);
2449 return t->init(tr);
2450}
2451
2452static int tracing_resize_ring_buffer(unsigned long size)
2453{
2454 int ret;
2455
2456 /*
2457 * If kernel or user changes the size of the ring buffer
2458 * we use the size that was given, and we can forget about
2459 * expanding it later.
2460 */
2461 ring_buffer_expanded = 1;
2462
2463 ret = ring_buffer_resize(global_trace.buffer, size);
2464 if (ret < 0)
2465 return ret;
2466
2467 ret = ring_buffer_resize(max_tr.buffer, size);
2468 if (ret < 0) {
2469 int r;
2470
2471 r = ring_buffer_resize(global_trace.buffer,
2472 global_trace.entries);
2473 if (r < 0) {
2474 /*
2475 * AARGH! We are left with different
2476 * size max buffer!!!!
2477 * The max buffer is our "snapshot" buffer.
2478 * When a tracer needs a snapshot (one of the
2479 * latency tracers), it swaps the max buffer
2480 * with the saved snap shot. We succeeded to
2481 * update the size of the main buffer, but failed to
2482 * update the size of the max buffer. But when we tried
2483 * to reset the main buffer to the original size, we
2484 * failed there too. This is very unlikely to
2485 * happen, but if it does, warn and kill all
2486 * tracing.
2487 */
2488 WARN_ON(1);
2489 tracing_disabled = 1;
2490 }
2491 return ret;
2492 }
2493
2494 global_trace.entries = size;
2495
2496 return ret;
2497}
2498
2499/**
2500 * tracing_update_buffers - used by tracing facility to expand ring buffers
2501 *
2502 * To save on memory when the tracing is never used on a system with it
2503 * configured in. The ring buffers are set to a minimum size. But once
2504 * a user starts to use the tracing facility, then they need to grow
2505 * to their default size.
2506 *
2507 * This function is to be called when a tracer is about to be used.
2508 */
2509int tracing_update_buffers(void)
2510{
2511 int ret = 0;
2512
2513 mutex_lock(&trace_types_lock);
2514 if (!ring_buffer_expanded)
2515 ret = tracing_resize_ring_buffer(trace_buf_size);
2516 mutex_unlock(&trace_types_lock);
2517
2518 return ret;
2519}
2520
2521struct trace_option_dentry;
2522
2523static struct trace_option_dentry *
2524create_trace_option_files(struct tracer *tracer);
2525
2526static void
2527destroy_trace_option_files(struct trace_option_dentry *topts);
2528
2529static int tracing_set_tracer(const char *buf)
2989{ 2530{
2531 static struct trace_option_dentry *topts;
2990 struct trace_array *tr = &global_trace; 2532 struct trace_array *tr = &global_trace;
2991 struct tracer *t; 2533 struct tracer *t;
2992 int ret = 0; 2534 int ret = 0;
2993 2535
2994 mutex_lock(&trace_types_lock); 2536 mutex_lock(&trace_types_lock);
2537
2538 if (!ring_buffer_expanded) {
2539 ret = tracing_resize_ring_buffer(trace_buf_size);
2540 if (ret < 0)
2541 goto out;
2542 ret = 0;
2543 }
2544
2995 for (t = trace_types; t; t = t->next) { 2545 for (t = trace_types; t; t = t->next) {
2996 if (strcmp(t->name, buf) == 0) 2546 if (strcmp(t->name, buf) == 0)
2997 break; 2547 break;
@@ -3007,9 +2557,14 @@ static int tracing_set_tracer(char *buf)
3007 if (current_trace && current_trace->reset) 2557 if (current_trace && current_trace->reset)
3008 current_trace->reset(tr); 2558 current_trace->reset(tr);
3009 2559
2560 destroy_trace_option_files(topts);
2561
3010 current_trace = t; 2562 current_trace = t;
2563
2564 topts = create_trace_option_files(current_trace);
2565
3011 if (t->init) { 2566 if (t->init) {
3012 ret = t->init(tr); 2567 ret = tracer_init(t, tr);
3013 if (ret) 2568 if (ret)
3014 goto out; 2569 goto out;
3015 } 2570 }
@@ -3072,9 +2627,9 @@ static ssize_t
3072tracing_max_lat_write(struct file *filp, const char __user *ubuf, 2627tracing_max_lat_write(struct file *filp, const char __user *ubuf,
3073 size_t cnt, loff_t *ppos) 2628 size_t cnt, loff_t *ppos)
3074{ 2629{
3075 long *ptr = filp->private_data; 2630 unsigned long *ptr = filp->private_data;
3076 char buf[64]; 2631 char buf[64];
3077 long val; 2632 unsigned long val;
3078 int ret; 2633 int ret;
3079 2634
3080 if (cnt >= sizeof(buf)) 2635 if (cnt >= sizeof(buf))
@@ -3094,54 +2649,96 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
3094 return cnt; 2649 return cnt;
3095} 2650}
3096 2651
3097static atomic_t tracing_reader;
3098
3099static int tracing_open_pipe(struct inode *inode, struct file *filp) 2652static int tracing_open_pipe(struct inode *inode, struct file *filp)
3100{ 2653{
2654 long cpu_file = (long) inode->i_private;
3101 struct trace_iterator *iter; 2655 struct trace_iterator *iter;
2656 int ret = 0;
3102 2657
3103 if (tracing_disabled) 2658 if (tracing_disabled)
3104 return -ENODEV; 2659 return -ENODEV;
3105 2660
3106 /* We only allow for reader of the pipe */ 2661 mutex_lock(&trace_types_lock);
3107 if (atomic_inc_return(&tracing_reader) != 1) { 2662
3108 atomic_dec(&tracing_reader); 2663 /* We only allow one reader per cpu */
3109 return -EBUSY; 2664 if (cpu_file == TRACE_PIPE_ALL_CPU) {
2665 if (!cpumask_empty(tracing_reader_cpumask)) {
2666 ret = -EBUSY;
2667 goto out;
2668 }
2669 cpumask_setall(tracing_reader_cpumask);
2670 } else {
2671 if (!cpumask_test_cpu(cpu_file, tracing_reader_cpumask))
2672 cpumask_set_cpu(cpu_file, tracing_reader_cpumask);
2673 else {
2674 ret = -EBUSY;
2675 goto out;
2676 }
3110 } 2677 }
3111 2678
3112 /* create a buffer to store the information to pass to userspace */ 2679 /* create a buffer to store the information to pass to userspace */
3113 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 2680 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
3114 if (!iter) 2681 if (!iter) {
3115 return -ENOMEM; 2682 ret = -ENOMEM;
2683 goto out;
2684 }
3116 2685
3117 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { 2686 /*
3118 kfree(iter); 2687 * We make a copy of the current tracer to avoid concurrent
3119 return -ENOMEM; 2688 * changes on it while we are reading.
2689 */
2690 iter->trace = kmalloc(sizeof(*iter->trace), GFP_KERNEL);
2691 if (!iter->trace) {
2692 ret = -ENOMEM;
2693 goto fail;
3120 } 2694 }
2695 if (current_trace)
2696 *iter->trace = *current_trace;
3121 2697
3122 mutex_lock(&trace_types_lock); 2698 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
2699 ret = -ENOMEM;
2700 goto fail;
2701 }
3123 2702
3124 /* trace pipe does not show start of buffer */ 2703 /* trace pipe does not show start of buffer */
3125 cpumask_setall(iter->started); 2704 cpumask_setall(iter->started);
3126 2705
2706 iter->cpu_file = cpu_file;
3127 iter->tr = &global_trace; 2707 iter->tr = &global_trace;
3128 iter->trace = current_trace; 2708 mutex_init(&iter->mutex);
3129 filp->private_data = iter; 2709 filp->private_data = iter;
3130 2710
3131 if (iter->trace->pipe_open) 2711 if (iter->trace->pipe_open)
3132 iter->trace->pipe_open(iter); 2712 iter->trace->pipe_open(iter);
2713
2714out:
3133 mutex_unlock(&trace_types_lock); 2715 mutex_unlock(&trace_types_lock);
2716 return ret;
3134 2717
3135 return 0; 2718fail:
2719 kfree(iter->trace);
2720 kfree(iter);
2721 mutex_unlock(&trace_types_lock);
2722 return ret;
3136} 2723}
3137 2724
3138static int tracing_release_pipe(struct inode *inode, struct file *file) 2725static int tracing_release_pipe(struct inode *inode, struct file *file)
3139{ 2726{
3140 struct trace_iterator *iter = file->private_data; 2727 struct trace_iterator *iter = file->private_data;
3141 2728
2729 mutex_lock(&trace_types_lock);
2730
2731 if (iter->cpu_file == TRACE_PIPE_ALL_CPU)
2732 cpumask_clear(tracing_reader_cpumask);
2733 else
2734 cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask);
2735
2736 mutex_unlock(&trace_types_lock);
2737
3142 free_cpumask_var(iter->started); 2738 free_cpumask_var(iter->started);
2739 mutex_destroy(&iter->mutex);
2740 kfree(iter->trace);
3143 kfree(iter); 2741 kfree(iter);
3144 atomic_dec(&tracing_reader);
3145 2742
3146 return 0; 2743 return 0;
3147} 2744}
@@ -3167,67 +2764,57 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)
3167 } 2764 }
3168} 2765}
3169 2766
3170/* 2767
3171 * Consumer reader. 2768void default_wait_pipe(struct trace_iterator *iter)
3172 */
3173static ssize_t
3174tracing_read_pipe(struct file *filp, char __user *ubuf,
3175 size_t cnt, loff_t *ppos)
3176{ 2769{
3177 struct trace_iterator *iter = filp->private_data; 2770 DEFINE_WAIT(wait);
3178 ssize_t sret;
3179 2771
3180 /* return any leftover data */ 2772 prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE);
3181 sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
3182 if (sret != -EBUSY)
3183 return sret;
3184 2773
3185 trace_seq_reset(&iter->seq); 2774 if (trace_empty(iter))
2775 schedule();
3186 2776
3187 mutex_lock(&trace_types_lock); 2777 finish_wait(&trace_wait, &wait);
3188 if (iter->trace->read) { 2778}
3189 sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); 2779
3190 if (sret) 2780/*
3191 goto out; 2781 * This is a make-shift waitqueue.
3192 } 2782 * A tracer might use this callback on some rare cases:
2783 *
2784 * 1) the current tracer might hold the runqueue lock when it wakes up
2785 * a reader, hence a deadlock (sched, function, and function graph tracers)
2786 * 2) the function tracers, trace all functions, we don't want
2787 * the overhead of calling wake_up and friends
2788 * (and tracing them too)
2789 *
2790 * Anyway, this is really very primitive wakeup.
2791 */
2792void poll_wait_pipe(struct trace_iterator *iter)
2793{
2794 set_current_state(TASK_INTERRUPTIBLE);
2795 /* sleep for 100 msecs, and try again. */
2796 schedule_timeout(HZ / 10);
2797}
2798
2799/* Must be called with trace_types_lock mutex held. */
2800static int tracing_wait_pipe(struct file *filp)
2801{
2802 struct trace_iterator *iter = filp->private_data;
3193 2803
3194waitagain:
3195 sret = 0;
3196 while (trace_empty(iter)) { 2804 while (trace_empty(iter)) {
3197 2805
3198 if ((filp->f_flags & O_NONBLOCK)) { 2806 if ((filp->f_flags & O_NONBLOCK)) {
3199 sret = -EAGAIN; 2807 return -EAGAIN;
3200 goto out;
3201 } 2808 }
3202 2809
3203 /* 2810 mutex_unlock(&iter->mutex);
3204 * This is a make-shift waitqueue. The reason we don't use
3205 * an actual wait queue is because:
3206 * 1) we only ever have one waiter
3207 * 2) the tracing, traces all functions, we don't want
3208 * the overhead of calling wake_up and friends
3209 * (and tracing them too)
3210 * Anyway, this is really very primitive wakeup.
3211 */
3212 set_current_state(TASK_INTERRUPTIBLE);
3213 iter->tr->waiter = current;
3214 2811
3215 mutex_unlock(&trace_types_lock); 2812 iter->trace->wait_pipe(iter);
3216 2813
3217 /* sleep for 100 msecs, and try again. */ 2814 mutex_lock(&iter->mutex);
3218 schedule_timeout(HZ/10);
3219 2815
3220 mutex_lock(&trace_types_lock); 2816 if (signal_pending(current))
3221 2817 return -EINTR;
3222 iter->tr->waiter = NULL;
3223
3224 if (signal_pending(current)) {
3225 sret = -EINTR;
3226 goto out;
3227 }
3228
3229 if (iter->trace != current_trace)
3230 goto out;
3231 2818
3232 /* 2819 /*
3233 * We block until we read something and tracing is disabled. 2820 * We block until we read something and tracing is disabled.
@@ -3240,13 +2827,59 @@ waitagain:
3240 */ 2827 */
3241 if (!tracer_enabled && iter->pos) 2828 if (!tracer_enabled && iter->pos)
3242 break; 2829 break;
2830 }
2831
2832 return 1;
2833}
2834
2835/*
2836 * Consumer reader.
2837 */
2838static ssize_t
2839tracing_read_pipe(struct file *filp, char __user *ubuf,
2840 size_t cnt, loff_t *ppos)
2841{
2842 struct trace_iterator *iter = filp->private_data;
2843 static struct tracer *old_tracer;
2844 ssize_t sret;
2845
2846 /* return any leftover data */
2847 sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
2848 if (sret != -EBUSY)
2849 return sret;
2850
2851 trace_seq_init(&iter->seq);
2852
2853 /* copy the tracer to avoid using a global lock all around */
2854 mutex_lock(&trace_types_lock);
2855 if (unlikely(old_tracer != current_trace && current_trace)) {
2856 old_tracer = current_trace;
2857 *iter->trace = *current_trace;
2858 }
2859 mutex_unlock(&trace_types_lock);
3243 2860
3244 continue; 2861 /*
2862 * Avoid more than one consumer on a single file descriptor
2863 * This is just a matter of traces coherency, the ring buffer itself
2864 * is protected.
2865 */
2866 mutex_lock(&iter->mutex);
2867 if (iter->trace->read) {
2868 sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
2869 if (sret)
2870 goto out;
3245 } 2871 }
3246 2872
2873waitagain:
2874 sret = tracing_wait_pipe(filp);
2875 if (sret <= 0)
2876 goto out;
2877
3247 /* stop when tracing is finished */ 2878 /* stop when tracing is finished */
3248 if (trace_empty(iter)) 2879 if (trace_empty(iter)) {
2880 sret = 0;
3249 goto out; 2881 goto out;
2882 }
3250 2883
3251 if (cnt >= PAGE_SIZE) 2884 if (cnt >= PAGE_SIZE)
3252 cnt = PAGE_SIZE - 1; 2885 cnt = PAGE_SIZE - 1;
@@ -3267,8 +2900,8 @@ waitagain:
3267 iter->seq.len = len; 2900 iter->seq.len = len;
3268 break; 2901 break;
3269 } 2902 }
3270 2903 if (ret != TRACE_TYPE_NO_CONSUME)
3271 trace_consume(iter); 2904 trace_consume(iter);
3272 2905
3273 if (iter->seq.len >= cnt) 2906 if (iter->seq.len >= cnt)
3274 break; 2907 break;
@@ -3277,7 +2910,7 @@ waitagain:
3277 /* Now copy what we have to the user */ 2910 /* Now copy what we have to the user */
3278 sret = trace_seq_to_user(&iter->seq, ubuf, cnt); 2911 sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
3279 if (iter->seq.readpos >= iter->seq.len) 2912 if (iter->seq.readpos >= iter->seq.len)
3280 trace_seq_reset(&iter->seq); 2913 trace_seq_init(&iter->seq);
3281 2914
3282 /* 2915 /*
3283 * If there was nothing to send to user, inspite of consuming trace 2916 * If there was nothing to send to user, inspite of consuming trace
@@ -3287,20 +2920,165 @@ waitagain:
3287 goto waitagain; 2920 goto waitagain;
3288 2921
3289out: 2922out:
3290 mutex_unlock(&trace_types_lock); 2923 mutex_unlock(&iter->mutex);
3291 2924
3292 return sret; 2925 return sret;
3293} 2926}
3294 2927
2928static void tracing_pipe_buf_release(struct pipe_inode_info *pipe,
2929 struct pipe_buffer *buf)
2930{
2931 __free_page(buf->page);
2932}
2933
2934static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
2935 unsigned int idx)
2936{
2937 __free_page(spd->pages[idx]);
2938}
2939
2940static struct pipe_buf_operations tracing_pipe_buf_ops = {
2941 .can_merge = 0,
2942 .map = generic_pipe_buf_map,
2943 .unmap = generic_pipe_buf_unmap,
2944 .confirm = generic_pipe_buf_confirm,
2945 .release = tracing_pipe_buf_release,
2946 .steal = generic_pipe_buf_steal,
2947 .get = generic_pipe_buf_get,
2948};
2949
2950static size_t
2951tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
2952{
2953 size_t count;
2954 int ret;
2955
2956 /* Seq buffer is page-sized, exactly what we need. */
2957 for (;;) {
2958 count = iter->seq.len;
2959 ret = print_trace_line(iter);
2960 count = iter->seq.len - count;
2961 if (rem < count) {
2962 rem = 0;
2963 iter->seq.len -= count;
2964 break;
2965 }
2966 if (ret == TRACE_TYPE_PARTIAL_LINE) {
2967 iter->seq.len -= count;
2968 break;
2969 }
2970
2971 trace_consume(iter);
2972 rem -= count;
2973 if (!find_next_entry_inc(iter)) {
2974 rem = 0;
2975 iter->ent = NULL;
2976 break;
2977 }
2978 }
2979
2980 return rem;
2981}
2982
2983static ssize_t tracing_splice_read_pipe(struct file *filp,
2984 loff_t *ppos,
2985 struct pipe_inode_info *pipe,
2986 size_t len,
2987 unsigned int flags)
2988{
2989 struct page *pages[PIPE_BUFFERS];
2990 struct partial_page partial[PIPE_BUFFERS];
2991 struct trace_iterator *iter = filp->private_data;
2992 struct splice_pipe_desc spd = {
2993 .pages = pages,
2994 .partial = partial,
2995 .nr_pages = 0, /* This gets updated below. */
2996 .flags = flags,
2997 .ops = &tracing_pipe_buf_ops,
2998 .spd_release = tracing_spd_release_pipe,
2999 };
3000 static struct tracer *old_tracer;
3001 ssize_t ret;
3002 size_t rem;
3003 unsigned int i;
3004
3005 /* copy the tracer to avoid using a global lock all around */
3006 mutex_lock(&trace_types_lock);
3007 if (unlikely(old_tracer != current_trace && current_trace)) {
3008 old_tracer = current_trace;
3009 *iter->trace = *current_trace;
3010 }
3011 mutex_unlock(&trace_types_lock);
3012
3013 mutex_lock(&iter->mutex);
3014
3015 if (iter->trace->splice_read) {
3016 ret = iter->trace->splice_read(iter, filp,
3017 ppos, pipe, len, flags);
3018 if (ret)
3019 goto out_err;
3020 }
3021
3022 ret = tracing_wait_pipe(filp);
3023 if (ret <= 0)
3024 goto out_err;
3025
3026 if (!iter->ent && !find_next_entry_inc(iter)) {
3027 ret = -EFAULT;
3028 goto out_err;
3029 }
3030
3031 /* Fill as many pages as possible. */
3032 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
3033 pages[i] = alloc_page(GFP_KERNEL);
3034 if (!pages[i])
3035 break;
3036
3037 rem = tracing_fill_pipe_page(rem, iter);
3038
3039 /* Copy the data into the page, so we can start over. */
3040 ret = trace_seq_to_buffer(&iter->seq,
3041 page_address(pages[i]),
3042 iter->seq.len);
3043 if (ret < 0) {
3044 __free_page(pages[i]);
3045 break;
3046 }
3047 partial[i].offset = 0;
3048 partial[i].len = iter->seq.len;
3049
3050 trace_seq_init(&iter->seq);
3051 }
3052
3053 mutex_unlock(&iter->mutex);
3054
3055 spd.nr_pages = i;
3056
3057 return splice_to_pipe(pipe, &spd);
3058
3059out_err:
3060 mutex_unlock(&iter->mutex);
3061
3062 return ret;
3063}
3064
3295static ssize_t 3065static ssize_t
3296tracing_entries_read(struct file *filp, char __user *ubuf, 3066tracing_entries_read(struct file *filp, char __user *ubuf,
3297 size_t cnt, loff_t *ppos) 3067 size_t cnt, loff_t *ppos)
3298{ 3068{
3299 struct trace_array *tr = filp->private_data; 3069 struct trace_array *tr = filp->private_data;
3300 char buf[64]; 3070 char buf[96];
3301 int r; 3071 int r;
3302 3072
3303 r = sprintf(buf, "%lu\n", tr->entries >> 10); 3073 mutex_lock(&trace_types_lock);
3074 if (!ring_buffer_expanded)
3075 r = sprintf(buf, "%lu (expanded: %lu)\n",
3076 tr->entries >> 10,
3077 trace_buf_size >> 10);
3078 else
3079 r = sprintf(buf, "%lu\n", tr->entries >> 10);
3080 mutex_unlock(&trace_types_lock);
3081
3304 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 3082 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
3305} 3083}
3306 3084
@@ -3344,28 +3122,11 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3344 val <<= 10; 3122 val <<= 10;
3345 3123
3346 if (val != global_trace.entries) { 3124 if (val != global_trace.entries) {
3347 ret = ring_buffer_resize(global_trace.buffer, val); 3125 ret = tracing_resize_ring_buffer(val);
3348 if (ret < 0) {
3349 cnt = ret;
3350 goto out;
3351 }
3352
3353 ret = ring_buffer_resize(max_tr.buffer, val);
3354 if (ret < 0) { 3126 if (ret < 0) {
3355 int r;
3356 cnt = ret; 3127 cnt = ret;
3357 r = ring_buffer_resize(global_trace.buffer,
3358 global_trace.entries);
3359 if (r < 0) {
3360 /* AARGH! We are left with different
3361 * size max buffer!!!! */
3362 WARN_ON(1);
3363 tracing_disabled = 1;
3364 }
3365 goto out; 3128 goto out;
3366 } 3129 }
3367
3368 global_trace.entries = val;
3369 } 3130 }
3370 3131
3371 filp->f_pos += cnt; 3132 filp->f_pos += cnt;
@@ -3433,42 +3194,288 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3433 return cnt; 3194 return cnt;
3434} 3195}
3435 3196
3436static struct file_operations tracing_max_lat_fops = { 3197static const struct file_operations tracing_max_lat_fops = {
3437 .open = tracing_open_generic, 3198 .open = tracing_open_generic,
3438 .read = tracing_max_lat_read, 3199 .read = tracing_max_lat_read,
3439 .write = tracing_max_lat_write, 3200 .write = tracing_max_lat_write,
3440}; 3201};
3441 3202
3442static struct file_operations tracing_ctrl_fops = { 3203static const struct file_operations tracing_ctrl_fops = {
3443 .open = tracing_open_generic, 3204 .open = tracing_open_generic,
3444 .read = tracing_ctrl_read, 3205 .read = tracing_ctrl_read,
3445 .write = tracing_ctrl_write, 3206 .write = tracing_ctrl_write,
3446}; 3207};
3447 3208
3448static struct file_operations set_tracer_fops = { 3209static const struct file_operations set_tracer_fops = {
3449 .open = tracing_open_generic, 3210 .open = tracing_open_generic,
3450 .read = tracing_set_trace_read, 3211 .read = tracing_set_trace_read,
3451 .write = tracing_set_trace_write, 3212 .write = tracing_set_trace_write,
3452}; 3213};
3453 3214
3454static struct file_operations tracing_pipe_fops = { 3215static const struct file_operations tracing_pipe_fops = {
3455 .open = tracing_open_pipe, 3216 .open = tracing_open_pipe,
3456 .poll = tracing_poll_pipe, 3217 .poll = tracing_poll_pipe,
3457 .read = tracing_read_pipe, 3218 .read = tracing_read_pipe,
3219 .splice_read = tracing_splice_read_pipe,
3458 .release = tracing_release_pipe, 3220 .release = tracing_release_pipe,
3459}; 3221};
3460 3222
3461static struct file_operations tracing_entries_fops = { 3223static const struct file_operations tracing_entries_fops = {
3462 .open = tracing_open_generic, 3224 .open = tracing_open_generic,
3463 .read = tracing_entries_read, 3225 .read = tracing_entries_read,
3464 .write = tracing_entries_write, 3226 .write = tracing_entries_write,
3465}; 3227};
3466 3228
3467static struct file_operations tracing_mark_fops = { 3229static const struct file_operations tracing_mark_fops = {
3468 .open = tracing_open_generic, 3230 .open = tracing_open_generic,
3469 .write = tracing_mark_write, 3231 .write = tracing_mark_write,
3470}; 3232};
3471 3233
3234struct ftrace_buffer_info {
3235 struct trace_array *tr;
3236 void *spare;
3237 int cpu;
3238 unsigned int read;
3239};
3240
3241static int tracing_buffers_open(struct inode *inode, struct file *filp)
3242{
3243 int cpu = (int)(long)inode->i_private;
3244 struct ftrace_buffer_info *info;
3245
3246 if (tracing_disabled)
3247 return -ENODEV;
3248
3249 info = kzalloc(sizeof(*info), GFP_KERNEL);
3250 if (!info)
3251 return -ENOMEM;
3252
3253 info->tr = &global_trace;
3254 info->cpu = cpu;
3255 info->spare = ring_buffer_alloc_read_page(info->tr->buffer);
3256 /* Force reading ring buffer for first read */
3257 info->read = (unsigned int)-1;
3258 if (!info->spare)
3259 goto out;
3260
3261 filp->private_data = info;
3262
3263 return 0;
3264
3265 out:
3266 kfree(info);
3267 return -ENOMEM;
3268}
3269
3270static ssize_t
3271tracing_buffers_read(struct file *filp, char __user *ubuf,
3272 size_t count, loff_t *ppos)
3273{
3274 struct ftrace_buffer_info *info = filp->private_data;
3275 unsigned int pos;
3276 ssize_t ret;
3277 size_t size;
3278
3279 if (!count)
3280 return 0;
3281
3282 /* Do we have previous read data to read? */
3283 if (info->read < PAGE_SIZE)
3284 goto read;
3285
3286 info->read = 0;
3287
3288 ret = ring_buffer_read_page(info->tr->buffer,
3289 &info->spare,
3290 count,
3291 info->cpu, 0);
3292 if (ret < 0)
3293 return 0;
3294
3295 pos = ring_buffer_page_len(info->spare);
3296
3297 if (pos < PAGE_SIZE)
3298 memset(info->spare + pos, 0, PAGE_SIZE - pos);
3299
3300read:
3301 size = PAGE_SIZE - info->read;
3302 if (size > count)
3303 size = count;
3304
3305 ret = copy_to_user(ubuf, info->spare + info->read, size);
3306 if (ret == size)
3307 return -EFAULT;
3308 size -= ret;
3309
3310 *ppos += size;
3311 info->read += size;
3312
3313 return size;
3314}
3315
3316static int tracing_buffers_release(struct inode *inode, struct file *file)
3317{
3318 struct ftrace_buffer_info *info = file->private_data;
3319
3320 ring_buffer_free_read_page(info->tr->buffer, info->spare);
3321 kfree(info);
3322
3323 return 0;
3324}
3325
3326struct buffer_ref {
3327 struct ring_buffer *buffer;
3328 void *page;
3329 int ref;
3330};
3331
3332static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
3333 struct pipe_buffer *buf)
3334{
3335 struct buffer_ref *ref = (struct buffer_ref *)buf->private;
3336
3337 if (--ref->ref)
3338 return;
3339
3340 ring_buffer_free_read_page(ref->buffer, ref->page);
3341 kfree(ref);
3342 buf->private = 0;
3343}
3344
3345static int buffer_pipe_buf_steal(struct pipe_inode_info *pipe,
3346 struct pipe_buffer *buf)
3347{
3348 return 1;
3349}
3350
3351static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
3352 struct pipe_buffer *buf)
3353{
3354 struct buffer_ref *ref = (struct buffer_ref *)buf->private;
3355
3356 ref->ref++;
3357}
3358
3359/* Pipe buffer operations for a buffer. */
3360static struct pipe_buf_operations buffer_pipe_buf_ops = {
3361 .can_merge = 0,
3362 .map = generic_pipe_buf_map,
3363 .unmap = generic_pipe_buf_unmap,
3364 .confirm = generic_pipe_buf_confirm,
3365 .release = buffer_pipe_buf_release,
3366 .steal = buffer_pipe_buf_steal,
3367 .get = buffer_pipe_buf_get,
3368};
3369
3370/*
3371 * Callback from splice_to_pipe(), if we need to release some pages
3372 * at the end of the spd in case we error'ed out in filling the pipe.
3373 */
3374static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i)
3375{
3376 struct buffer_ref *ref =
3377 (struct buffer_ref *)spd->partial[i].private;
3378
3379 if (--ref->ref)
3380 return;
3381
3382 ring_buffer_free_read_page(ref->buffer, ref->page);
3383 kfree(ref);
3384 spd->partial[i].private = 0;
3385}
3386
3387static ssize_t
3388tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3389 struct pipe_inode_info *pipe, size_t len,
3390 unsigned int flags)
3391{
3392 struct ftrace_buffer_info *info = file->private_data;
3393 struct partial_page partial[PIPE_BUFFERS];
3394 struct page *pages[PIPE_BUFFERS];
3395 struct splice_pipe_desc spd = {
3396 .pages = pages,
3397 .partial = partial,
3398 .flags = flags,
3399 .ops = &buffer_pipe_buf_ops,
3400 .spd_release = buffer_spd_release,
3401 };
3402 struct buffer_ref *ref;
3403 int size, i;
3404 size_t ret;
3405
3406 /*
3407 * We can't seek on a buffer input
3408 */
3409 if (unlikely(*ppos))
3410 return -ESPIPE;
3411
3412
3413 for (i = 0; i < PIPE_BUFFERS && len; i++, len -= size) {
3414 struct page *page;
3415 int r;
3416
3417 ref = kzalloc(sizeof(*ref), GFP_KERNEL);
3418 if (!ref)
3419 break;
3420
3421 ref->buffer = info->tr->buffer;
3422 ref->page = ring_buffer_alloc_read_page(ref->buffer);
3423 if (!ref->page) {
3424 kfree(ref);
3425 break;
3426 }
3427
3428 r = ring_buffer_read_page(ref->buffer, &ref->page,
3429 len, info->cpu, 0);
3430 if (r < 0) {
3431 ring_buffer_free_read_page(ref->buffer,
3432 ref->page);
3433 kfree(ref);
3434 break;
3435 }
3436
3437 /*
3438 * zero out any left over data, this is going to
3439 * user land.
3440 */
3441 size = ring_buffer_page_len(ref->page);
3442 if (size < PAGE_SIZE)
3443 memset(ref->page + size, 0, PAGE_SIZE - size);
3444
3445 page = virt_to_page(ref->page);
3446
3447 spd.pages[i] = page;
3448 spd.partial[i].len = PAGE_SIZE;
3449 spd.partial[i].offset = 0;
3450 spd.partial[i].private = (unsigned long)ref;
3451 spd.nr_pages++;
3452 }
3453
3454 spd.nr_pages = i;
3455
3456 /* did we read anything? */
3457 if (!spd.nr_pages) {
3458 if (flags & SPLICE_F_NONBLOCK)
3459 ret = -EAGAIN;
3460 else
3461 ret = 0;
3462 /* TODO: block */
3463 return ret;
3464 }
3465
3466 ret = splice_to_pipe(pipe, &spd);
3467
3468 return ret;
3469}
3470
3471static const struct file_operations tracing_buffers_fops = {
3472 .open = tracing_buffers_open,
3473 .read = tracing_buffers_read,
3474 .release = tracing_buffers_release,
3475 .splice_read = tracing_buffers_splice_read,
3476 .llseek = no_llseek,
3477};
3478
3472#ifdef CONFIG_DYNAMIC_FTRACE 3479#ifdef CONFIG_DYNAMIC_FTRACE
3473 3480
3474int __weak ftrace_arch_read_dyn_info(char *buf, int size) 3481int __weak ftrace_arch_read_dyn_info(char *buf, int size)
@@ -3500,7 +3507,7 @@ tracing_read_dyn_info(struct file *filp, char __user *ubuf,
3500 return r; 3507 return r;
3501} 3508}
3502 3509
3503static struct file_operations tracing_dyn_info_fops = { 3510static const struct file_operations tracing_dyn_info_fops = {
3504 .open = tracing_open_generic, 3511 .open = tracing_open_generic,
3505 .read = tracing_read_dyn_info, 3512 .read = tracing_read_dyn_info,
3506}; 3513};
@@ -3526,15 +3533,350 @@ struct dentry *tracing_init_dentry(void)
3526 return d_tracer; 3533 return d_tracer;
3527} 3534}
3528 3535
3536static struct dentry *d_percpu;
3537
3538struct dentry *tracing_dentry_percpu(void)
3539{
3540 static int once;
3541 struct dentry *d_tracer;
3542
3543 if (d_percpu)
3544 return d_percpu;
3545
3546 d_tracer = tracing_init_dentry();
3547
3548 if (!d_tracer)
3549 return NULL;
3550
3551 d_percpu = debugfs_create_dir("per_cpu", d_tracer);
3552
3553 if (!d_percpu && !once) {
3554 once = 1;
3555 pr_warning("Could not create debugfs directory 'per_cpu'\n");
3556 return NULL;
3557 }
3558
3559 return d_percpu;
3560}
3561
3562static void tracing_init_debugfs_percpu(long cpu)
3563{
3564 struct dentry *d_percpu = tracing_dentry_percpu();
3565 struct dentry *entry, *d_cpu;
3566 /* strlen(cpu) + MAX(log10(cpu)) + '\0' */
3567 char cpu_dir[7];
3568
3569 if (cpu > 999 || cpu < 0)
3570 return;
3571
3572 sprintf(cpu_dir, "cpu%ld", cpu);
3573 d_cpu = debugfs_create_dir(cpu_dir, d_percpu);
3574 if (!d_cpu) {
3575 pr_warning("Could not create debugfs '%s' entry\n", cpu_dir);
3576 return;
3577 }
3578
3579 /* per cpu trace_pipe */
3580 entry = debugfs_create_file("trace_pipe", 0444, d_cpu,
3581 (void *) cpu, &tracing_pipe_fops);
3582 if (!entry)
3583 pr_warning("Could not create debugfs 'trace_pipe' entry\n");
3584
3585 /* per cpu trace */
3586 entry = debugfs_create_file("trace", 0444, d_cpu,
3587 (void *) cpu, &tracing_fops);
3588 if (!entry)
3589 pr_warning("Could not create debugfs 'trace' entry\n");
3590
3591 entry = debugfs_create_file("trace_pipe_raw", 0444, d_cpu,
3592 (void *) cpu, &tracing_buffers_fops);
3593 if (!entry)
3594 pr_warning("Could not create debugfs 'trace_pipe_raw' entry\n");
3595}
3596
3529#ifdef CONFIG_FTRACE_SELFTEST 3597#ifdef CONFIG_FTRACE_SELFTEST
3530/* Let selftest have access to static functions in this file */ 3598/* Let selftest have access to static functions in this file */
3531#include "trace_selftest.c" 3599#include "trace_selftest.c"
3532#endif 3600#endif
3533 3601
3602struct trace_option_dentry {
3603 struct tracer_opt *opt;
3604 struct tracer_flags *flags;
3605 struct dentry *entry;
3606};
3607
3608static ssize_t
3609trace_options_read(struct file *filp, char __user *ubuf, size_t cnt,
3610 loff_t *ppos)
3611{
3612 struct trace_option_dentry *topt = filp->private_data;
3613 char *buf;
3614
3615 if (topt->flags->val & topt->opt->bit)
3616 buf = "1\n";
3617 else
3618 buf = "0\n";
3619
3620 return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
3621}
3622
3623static ssize_t
3624trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
3625 loff_t *ppos)
3626{
3627 struct trace_option_dentry *topt = filp->private_data;
3628 unsigned long val;
3629 char buf[64];
3630 int ret;
3631
3632 if (cnt >= sizeof(buf))
3633 return -EINVAL;
3634
3635 if (copy_from_user(&buf, ubuf, cnt))
3636 return -EFAULT;
3637
3638 buf[cnt] = 0;
3639
3640 ret = strict_strtoul(buf, 10, &val);
3641 if (ret < 0)
3642 return ret;
3643
3644 ret = 0;
3645 switch (val) {
3646 case 0:
3647 /* do nothing if already cleared */
3648 if (!(topt->flags->val & topt->opt->bit))
3649 break;
3650
3651 mutex_lock(&trace_types_lock);
3652 if (current_trace->set_flag)
3653 ret = current_trace->set_flag(topt->flags->val,
3654 topt->opt->bit, 0);
3655 mutex_unlock(&trace_types_lock);
3656 if (ret)
3657 return ret;
3658 topt->flags->val &= ~topt->opt->bit;
3659 break;
3660 case 1:
3661 /* do nothing if already set */
3662 if (topt->flags->val & topt->opt->bit)
3663 break;
3664
3665 mutex_lock(&trace_types_lock);
3666 if (current_trace->set_flag)
3667 ret = current_trace->set_flag(topt->flags->val,
3668 topt->opt->bit, 1);
3669 mutex_unlock(&trace_types_lock);
3670 if (ret)
3671 return ret;
3672 topt->flags->val |= topt->opt->bit;
3673 break;
3674
3675 default:
3676 return -EINVAL;
3677 }
3678
3679 *ppos += cnt;
3680
3681 return cnt;
3682}
3683
3684
3685static const struct file_operations trace_options_fops = {
3686 .open = tracing_open_generic,
3687 .read = trace_options_read,
3688 .write = trace_options_write,
3689};
3690
3691static ssize_t
3692trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt,
3693 loff_t *ppos)
3694{
3695 long index = (long)filp->private_data;
3696 char *buf;
3697
3698 if (trace_flags & (1 << index))
3699 buf = "1\n";
3700 else
3701 buf = "0\n";
3702
3703 return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
3704}
3705
3706static ssize_t
3707trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
3708 loff_t *ppos)
3709{
3710 long index = (long)filp->private_data;
3711 char buf[64];
3712 unsigned long val;
3713 int ret;
3714
3715 if (cnt >= sizeof(buf))
3716 return -EINVAL;
3717
3718 if (copy_from_user(&buf, ubuf, cnt))
3719 return -EFAULT;
3720
3721 buf[cnt] = 0;
3722
3723 ret = strict_strtoul(buf, 10, &val);
3724 if (ret < 0)
3725 return ret;
3726
3727 switch (val) {
3728 case 0:
3729 trace_flags &= ~(1 << index);
3730 break;
3731 case 1:
3732 trace_flags |= 1 << index;
3733 break;
3734
3735 default:
3736 return -EINVAL;
3737 }
3738
3739 *ppos += cnt;
3740
3741 return cnt;
3742}
3743
3744static const struct file_operations trace_options_core_fops = {
3745 .open = tracing_open_generic,
3746 .read = trace_options_core_read,
3747 .write = trace_options_core_write,
3748};
3749
3750static struct dentry *trace_options_init_dentry(void)
3751{
3752 struct dentry *d_tracer;
3753 static struct dentry *t_options;
3754
3755 if (t_options)
3756 return t_options;
3757
3758 d_tracer = tracing_init_dentry();
3759 if (!d_tracer)
3760 return NULL;
3761
3762 t_options = debugfs_create_dir("options", d_tracer);
3763 if (!t_options) {
3764 pr_warning("Could not create debugfs directory 'options'\n");
3765 return NULL;
3766 }
3767
3768 return t_options;
3769}
3770
3771static void
3772create_trace_option_file(struct trace_option_dentry *topt,
3773 struct tracer_flags *flags,
3774 struct tracer_opt *opt)
3775{
3776 struct dentry *t_options;
3777 struct dentry *entry;
3778
3779 t_options = trace_options_init_dentry();
3780 if (!t_options)
3781 return;
3782
3783 topt->flags = flags;
3784 topt->opt = opt;
3785
3786 entry = debugfs_create_file(opt->name, 0644, t_options, topt,
3787 &trace_options_fops);
3788
3789 topt->entry = entry;
3790
3791}
3792
3793static struct trace_option_dentry *
3794create_trace_option_files(struct tracer *tracer)
3795{
3796 struct trace_option_dentry *topts;
3797 struct tracer_flags *flags;
3798 struct tracer_opt *opts;
3799 int cnt;
3800
3801 if (!tracer)
3802 return NULL;
3803
3804 flags = tracer->flags;
3805
3806 if (!flags || !flags->opts)
3807 return NULL;
3808
3809 opts = flags->opts;
3810
3811 for (cnt = 0; opts[cnt].name; cnt++)
3812 ;
3813
3814 topts = kcalloc(cnt + 1, sizeof(*topts), GFP_KERNEL);
3815 if (!topts)
3816 return NULL;
3817
3818 for (cnt = 0; opts[cnt].name; cnt++)
3819 create_trace_option_file(&topts[cnt], flags,
3820 &opts[cnt]);
3821
3822 return topts;
3823}
3824
3825static void
3826destroy_trace_option_files(struct trace_option_dentry *topts)
3827{
3828 int cnt;
3829
3830 if (!topts)
3831 return;
3832
3833 for (cnt = 0; topts[cnt].opt; cnt++) {
3834 if (topts[cnt].entry)
3835 debugfs_remove(topts[cnt].entry);
3836 }
3837
3838 kfree(topts);
3839}
3840
3841static struct dentry *
3842create_trace_option_core_file(const char *option, long index)
3843{
3844 struct dentry *t_options;
3845 struct dentry *entry;
3846
3847 t_options = trace_options_init_dentry();
3848 if (!t_options)
3849 return NULL;
3850
3851 entry = debugfs_create_file(option, 0644, t_options, (void *)index,
3852 &trace_options_core_fops);
3853
3854 return entry;
3855}
3856
3857static __init void create_trace_options_dir(void)
3858{
3859 struct dentry *t_options;
3860 struct dentry *entry;
3861 int i;
3862
3863 t_options = trace_options_init_dentry();
3864 if (!t_options)
3865 return;
3866
3867 for (i = 0; trace_options[i]; i++) {
3868 entry = create_trace_option_core_file(trace_options[i], i);
3869 if (!entry)
3870 pr_warning("Could not create debugfs %s entry\n",
3871 trace_options[i]);
3872 }
3873}
3874
3534static __init int tracer_init_debugfs(void) 3875static __init int tracer_init_debugfs(void)
3535{ 3876{
3536 struct dentry *d_tracer; 3877 struct dentry *d_tracer;
3537 struct dentry *entry; 3878 struct dentry *entry;
3879 int cpu;
3538 3880
3539 d_tracer = tracing_init_dentry(); 3881 d_tracer = tracing_init_dentry();
3540 3882
@@ -3548,18 +3890,15 @@ static __init int tracer_init_debugfs(void)
3548 if (!entry) 3890 if (!entry)
3549 pr_warning("Could not create debugfs 'trace_options' entry\n"); 3891 pr_warning("Could not create debugfs 'trace_options' entry\n");
3550 3892
3893 create_trace_options_dir();
3894
3551 entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer, 3895 entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
3552 NULL, &tracing_cpumask_fops); 3896 NULL, &tracing_cpumask_fops);
3553 if (!entry) 3897 if (!entry)
3554 pr_warning("Could not create debugfs 'tracing_cpumask' entry\n"); 3898 pr_warning("Could not create debugfs 'tracing_cpumask' entry\n");
3555 3899
3556 entry = debugfs_create_file("latency_trace", 0444, d_tracer,
3557 &global_trace, &tracing_lt_fops);
3558 if (!entry)
3559 pr_warning("Could not create debugfs 'latency_trace' entry\n");
3560
3561 entry = debugfs_create_file("trace", 0444, d_tracer, 3900 entry = debugfs_create_file("trace", 0444, d_tracer,
3562 &global_trace, &tracing_fops); 3901 (void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
3563 if (!entry) 3902 if (!entry)
3564 pr_warning("Could not create debugfs 'trace' entry\n"); 3903 pr_warning("Could not create debugfs 'trace' entry\n");
3565 3904
@@ -3590,8 +3929,8 @@ static __init int tracer_init_debugfs(void)
3590 if (!entry) 3929 if (!entry)
3591 pr_warning("Could not create debugfs 'README' entry\n"); 3930 pr_warning("Could not create debugfs 'README' entry\n");
3592 3931
3593 entry = debugfs_create_file("trace_pipe", 0644, d_tracer, 3932 entry = debugfs_create_file("trace_pipe", 0444, d_tracer,
3594 NULL, &tracing_pipe_fops); 3933 (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);
3595 if (!entry) 3934 if (!entry)
3596 pr_warning("Could not create debugfs " 3935 pr_warning("Could not create debugfs "
3597 "'trace_pipe' entry\n"); 3936 "'trace_pipe' entry\n");
@@ -3619,77 +3958,12 @@ static __init int tracer_init_debugfs(void)
3619#ifdef CONFIG_SYSPROF_TRACER 3958#ifdef CONFIG_SYSPROF_TRACER
3620 init_tracer_sysprof_debugfs(d_tracer); 3959 init_tracer_sysprof_debugfs(d_tracer);
3621#endif 3960#endif
3622 return 0;
3623}
3624
3625int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
3626{
3627 static DEFINE_SPINLOCK(trace_buf_lock);
3628 static char trace_buf[TRACE_BUF_SIZE];
3629
3630 struct ring_buffer_event *event;
3631 struct trace_array *tr = &global_trace;
3632 struct trace_array_cpu *data;
3633 int cpu, len = 0, size, pc;
3634 struct print_entry *entry;
3635 unsigned long irq_flags;
3636
3637 if (tracing_disabled || tracing_selftest_running)
3638 return 0;
3639
3640 pc = preempt_count();
3641 preempt_disable_notrace();
3642 cpu = raw_smp_processor_id();
3643 data = tr->data[cpu];
3644
3645 if (unlikely(atomic_read(&data->disabled)))
3646 goto out;
3647
3648 pause_graph_tracing();
3649 spin_lock_irqsave(&trace_buf_lock, irq_flags);
3650 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
3651
3652 len = min(len, TRACE_BUF_SIZE-1);
3653 trace_buf[len] = 0;
3654
3655 size = sizeof(*entry) + len + 1;
3656 event = ring_buffer_lock_reserve(tr->buffer, size, &irq_flags);
3657 if (!event)
3658 goto out_unlock;
3659 entry = ring_buffer_event_data(event);
3660 tracing_generic_entry_update(&entry->ent, irq_flags, pc);
3661 entry->ent.type = TRACE_PRINT;
3662 entry->ip = ip;
3663 entry->depth = depth;
3664
3665 memcpy(&entry->buf, trace_buf, len);
3666 entry->buf[len] = 0;
3667 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
3668
3669 out_unlock:
3670 spin_unlock_irqrestore(&trace_buf_lock, irq_flags);
3671 unpause_graph_tracing();
3672 out:
3673 preempt_enable_notrace();
3674
3675 return len;
3676}
3677EXPORT_SYMBOL_GPL(trace_vprintk);
3678
3679int __ftrace_printk(unsigned long ip, const char *fmt, ...)
3680{
3681 int ret;
3682 va_list ap;
3683 3961
3684 if (!(trace_flags & TRACE_ITER_PRINTK)) 3962 for_each_tracing_cpu(cpu)
3685 return 0; 3963 tracing_init_debugfs_percpu(cpu);
3686 3964
3687 va_start(ap, fmt); 3965 return 0;
3688 ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
3689 va_end(ap);
3690 return ret;
3691} 3966}
3692EXPORT_SYMBOL_GPL(__ftrace_printk);
3693 3967
3694static int trace_panic_handler(struct notifier_block *this, 3968static int trace_panic_handler(struct notifier_block *this,
3695 unsigned long event, void *unused) 3969 unsigned long event, void *unused)
@@ -3750,7 +4024,7 @@ trace_printk_seq(struct trace_seq *s)
3750 4024
3751 printk(KERN_TRACE "%s", s->buffer); 4025 printk(KERN_TRACE "%s", s->buffer);
3752 4026
3753 trace_seq_reset(s); 4027 trace_seq_init(s);
3754} 4028}
3755 4029
3756void ftrace_dump(void) 4030void ftrace_dump(void)
@@ -3782,8 +4056,10 @@ void ftrace_dump(void)
3782 4056
3783 printk(KERN_TRACE "Dumping ftrace buffer:\n"); 4057 printk(KERN_TRACE "Dumping ftrace buffer:\n");
3784 4058
4059 /* Simulate the iterator */
3785 iter.tr = &global_trace; 4060 iter.tr = &global_trace;
3786 iter.trace = current_trace; 4061 iter.trace = current_trace;
4062 iter.cpu_file = TRACE_PIPE_ALL_CPU;
3787 4063
3788 /* 4064 /*
3789 * We need to stop all tracing on all CPUS to read the 4065 * We need to stop all tracing on all CPUS to read the
@@ -3826,6 +4102,7 @@ void ftrace_dump(void)
3826__init static int tracer_alloc_buffers(void) 4102__init static int tracer_alloc_buffers(void)
3827{ 4103{
3828 struct trace_array_cpu *data; 4104 struct trace_array_cpu *data;
4105 int ring_buf_size;
3829 int i; 4106 int i;
3830 int ret = -ENOMEM; 4107 int ret = -ENOMEM;
3831 4108
@@ -3835,11 +4112,21 @@ __init static int tracer_alloc_buffers(void)
3835 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) 4112 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
3836 goto out_free_buffer_mask; 4113 goto out_free_buffer_mask;
3837 4114
4115 if (!alloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
4116 goto out_free_tracing_cpumask;
4117
4118 /* To save memory, keep the ring buffer size to its minimum */
4119 if (ring_buffer_expanded)
4120 ring_buf_size = trace_buf_size;
4121 else
4122 ring_buf_size = 1;
4123
3838 cpumask_copy(tracing_buffer_mask, cpu_possible_mask); 4124 cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
3839 cpumask_copy(tracing_cpumask, cpu_all_mask); 4125 cpumask_copy(tracing_cpumask, cpu_all_mask);
4126 cpumask_clear(tracing_reader_cpumask);
3840 4127
3841 /* TODO: make the number of buffers hot pluggable with CPUS */ 4128 /* TODO: make the number of buffers hot pluggable with CPUS */
3842 global_trace.buffer = ring_buffer_alloc(trace_buf_size, 4129 global_trace.buffer = ring_buffer_alloc(ring_buf_size,
3843 TRACE_BUFFER_FLAGS); 4130 TRACE_BUFFER_FLAGS);
3844 if (!global_trace.buffer) { 4131 if (!global_trace.buffer) {
3845 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); 4132 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
@@ -3850,7 +4137,7 @@ __init static int tracer_alloc_buffers(void)
3850 4137
3851 4138
3852#ifdef CONFIG_TRACER_MAX_TRACE 4139#ifdef CONFIG_TRACER_MAX_TRACE
3853 max_tr.buffer = ring_buffer_alloc(trace_buf_size, 4140 max_tr.buffer = ring_buffer_alloc(ring_buf_size,
3854 TRACE_BUFFER_FLAGS); 4141 TRACE_BUFFER_FLAGS);
3855 if (!max_tr.buffer) { 4142 if (!max_tr.buffer) {
3856 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); 4143 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
@@ -3871,14 +4158,10 @@ __init static int tracer_alloc_buffers(void)
3871 trace_init_cmdlines(); 4158 trace_init_cmdlines();
3872 4159
3873 register_tracer(&nop_trace); 4160 register_tracer(&nop_trace);
4161 current_trace = &nop_trace;
3874#ifdef CONFIG_BOOT_TRACER 4162#ifdef CONFIG_BOOT_TRACER
3875 register_tracer(&boot_tracer); 4163 register_tracer(&boot_tracer);
3876 current_trace = &boot_tracer;
3877 current_trace->init(&global_trace);
3878#else
3879 current_trace = &nop_trace;
3880#endif 4164#endif
3881
3882 /* All seems OK, enable tracing */ 4165 /* All seems OK, enable tracing */
3883 tracing_disabled = 0; 4166 tracing_disabled = 0;
3884 4167
@@ -3886,14 +4169,38 @@ __init static int tracer_alloc_buffers(void)
3886 &trace_panic_notifier); 4169 &trace_panic_notifier);
3887 4170
3888 register_die_notifier(&trace_die_notifier); 4171 register_die_notifier(&trace_die_notifier);
3889 ret = 0; 4172
4173 return 0;
3890 4174
3891out_free_cpumask: 4175out_free_cpumask:
4176 free_cpumask_var(tracing_reader_cpumask);
4177out_free_tracing_cpumask:
3892 free_cpumask_var(tracing_cpumask); 4178 free_cpumask_var(tracing_cpumask);
3893out_free_buffer_mask: 4179out_free_buffer_mask:
3894 free_cpumask_var(tracing_buffer_mask); 4180 free_cpumask_var(tracing_buffer_mask);
3895out: 4181out:
3896 return ret; 4182 return ret;
3897} 4183}
4184
4185__init static int clear_boot_tracer(void)
4186{
4187 /*
4188 * The default tracer at boot buffer is an init section.
4189 * This function is called in lateinit. If we did not
4190 * find the boot tracer, then clear it out, to prevent
4191 * later registration from accessing the buffer that is
4192 * about to be freed.
4193 */
4194 if (!default_bootup_tracer)
4195 return 0;
4196
4197 printk(KERN_INFO "ftrace bootup tracer '%s' not registered.\n",
4198 default_bootup_tracer);
4199 default_bootup_tracer = NULL;
4200
4201 return 0;
4202}
4203
3898early_initcall(tracer_alloc_buffers); 4204early_initcall(tracer_alloc_buffers);
3899fs_initcall(tracer_init_debugfs); 4205fs_initcall(tracer_init_debugfs);
4206late_initcall(clear_boot_tracer);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 4d3d381bfd95..38276d1638e3 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -9,6 +9,8 @@
9#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
10#include <linux/ftrace.h> 10#include <linux/ftrace.h>
11#include <trace/boot.h> 11#include <trace/boot.h>
12#include <trace/kmemtrace.h>
13#include <trace/power.h>
12 14
13enum trace_type { 15enum trace_type {
14 __TRACE_FIRST_TYPE = 0, 16 __TRACE_FIRST_TYPE = 0,
@@ -16,9 +18,9 @@ enum trace_type {
16 TRACE_FN, 18 TRACE_FN,
17 TRACE_CTX, 19 TRACE_CTX,
18 TRACE_WAKE, 20 TRACE_WAKE,
19 TRACE_CONT,
20 TRACE_STACK, 21 TRACE_STACK,
21 TRACE_PRINT, 22 TRACE_PRINT,
23 TRACE_BPRINT,
22 TRACE_SPECIAL, 24 TRACE_SPECIAL,
23 TRACE_MMIO_RW, 25 TRACE_MMIO_RW,
24 TRACE_MMIO_MAP, 26 TRACE_MMIO_MAP,
@@ -29,9 +31,14 @@ enum trace_type {
29 TRACE_GRAPH_ENT, 31 TRACE_GRAPH_ENT,
30 TRACE_USER_STACK, 32 TRACE_USER_STACK,
31 TRACE_HW_BRANCHES, 33 TRACE_HW_BRANCHES,
34 TRACE_SYSCALL_ENTER,
35 TRACE_SYSCALL_EXIT,
36 TRACE_KMEM_ALLOC,
37 TRACE_KMEM_FREE,
32 TRACE_POWER, 38 TRACE_POWER,
39 TRACE_BLK,
33 40
34 __TRACE_LAST_TYPE 41 __TRACE_LAST_TYPE,
35}; 42};
36 43
37/* 44/*
@@ -42,7 +49,6 @@ enum trace_type {
42 */ 49 */
43struct trace_entry { 50struct trace_entry {
44 unsigned char type; 51 unsigned char type;
45 unsigned char cpu;
46 unsigned char flags; 52 unsigned char flags;
47 unsigned char preempt_count; 53 unsigned char preempt_count;
48 int pid; 54 int pid;
@@ -60,13 +66,13 @@ struct ftrace_entry {
60 66
61/* Function call entry */ 67/* Function call entry */
62struct ftrace_graph_ent_entry { 68struct ftrace_graph_ent_entry {
63 struct trace_entry ent; 69 struct trace_entry ent;
64 struct ftrace_graph_ent graph_ent; 70 struct ftrace_graph_ent graph_ent;
65}; 71};
66 72
67/* Function return entry */ 73/* Function return entry */
68struct ftrace_graph_ret_entry { 74struct ftrace_graph_ret_entry {
69 struct trace_entry ent; 75 struct trace_entry ent;
70 struct ftrace_graph_ret ret; 76 struct ftrace_graph_ret ret;
71}; 77};
72extern struct tracer boot_tracer; 78extern struct tracer boot_tracer;
@@ -112,8 +118,16 @@ struct userstack_entry {
112}; 118};
113 119
114/* 120/*
115 * ftrace_printk entry: 121 * trace_printk entry:
116 */ 122 */
123struct bprint_entry {
124 struct trace_entry ent;
125 unsigned long ip;
126 int depth;
127 const char *fmt;
128 u32 buf[];
129};
130
117struct print_entry { 131struct print_entry {
118 struct trace_entry ent; 132 struct trace_entry ent;
119 unsigned long ip; 133 unsigned long ip;
@@ -170,15 +184,45 @@ struct trace_power {
170 struct power_trace state_data; 184 struct power_trace state_data;
171}; 185};
172 186
187struct kmemtrace_alloc_entry {
188 struct trace_entry ent;
189 enum kmemtrace_type_id type_id;
190 unsigned long call_site;
191 const void *ptr;
192 size_t bytes_req;
193 size_t bytes_alloc;
194 gfp_t gfp_flags;
195 int node;
196};
197
198struct kmemtrace_free_entry {
199 struct trace_entry ent;
200 enum kmemtrace_type_id type_id;
201 unsigned long call_site;
202 const void *ptr;
203};
204
205struct syscall_trace_enter {
206 struct trace_entry ent;
207 int nr;
208 unsigned long args[];
209};
210
211struct syscall_trace_exit {
212 struct trace_entry ent;
213 int nr;
214 unsigned long ret;
215};
216
217
173/* 218/*
174 * trace_flag_type is an enumeration that holds different 219 * trace_flag_type is an enumeration that holds different
175 * states when a trace occurs. These are: 220 * states when a trace occurs. These are:
176 * IRQS_OFF - interrupts were disabled 221 * IRQS_OFF - interrupts were disabled
177 * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags 222 * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags
178 * NEED_RESCED - reschedule is requested 223 * NEED_RESCED - reschedule is requested
179 * HARDIRQ - inside an interrupt handler 224 * HARDIRQ - inside an interrupt handler
180 * SOFTIRQ - inside a softirq handler 225 * SOFTIRQ - inside a softirq handler
181 * CONT - multiple entries hold the trace item
182 */ 226 */
183enum trace_flag_type { 227enum trace_flag_type {
184 TRACE_FLAG_IRQS_OFF = 0x01, 228 TRACE_FLAG_IRQS_OFF = 0x01,
@@ -186,7 +230,6 @@ enum trace_flag_type {
186 TRACE_FLAG_NEED_RESCHED = 0x04, 230 TRACE_FLAG_NEED_RESCHED = 0x04,
187 TRACE_FLAG_HARDIRQ = 0x08, 231 TRACE_FLAG_HARDIRQ = 0x08,
188 TRACE_FLAG_SOFTIRQ = 0x10, 232 TRACE_FLAG_SOFTIRQ = 0x10,
189 TRACE_FLAG_CONT = 0x20,
190}; 233};
191 234
192#define TRACE_BUF_SIZE 1024 235#define TRACE_BUF_SIZE 1024
@@ -198,6 +241,7 @@ enum trace_flag_type {
198 */ 241 */
199struct trace_array_cpu { 242struct trace_array_cpu {
200 atomic_t disabled; 243 atomic_t disabled;
244 void *buffer_page; /* ring buffer spare */
201 245
202 /* these fields get copied into max-trace: */ 246 /* these fields get copied into max-trace: */
203 unsigned long trace_idx; 247 unsigned long trace_idx;
@@ -262,10 +306,10 @@ extern void __ftrace_bad_type(void);
262 do { \ 306 do { \
263 IF_ASSIGN(var, ent, struct ftrace_entry, TRACE_FN); \ 307 IF_ASSIGN(var, ent, struct ftrace_entry, TRACE_FN); \
264 IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \ 308 IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \
265 IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \
266 IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ 309 IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \
267 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ 310 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
268 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ 311 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
312 IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \
269 IF_ASSIGN(var, ent, struct special_entry, 0); \ 313 IF_ASSIGN(var, ent, struct special_entry, 0); \
270 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ 314 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
271 TRACE_MMIO_RW); \ 315 TRACE_MMIO_RW); \
@@ -279,7 +323,15 @@ extern void __ftrace_bad_type(void);
279 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ 323 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
280 TRACE_GRAPH_RET); \ 324 TRACE_GRAPH_RET); \
281 IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ 325 IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
282 IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \ 326 IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
327 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \
328 TRACE_KMEM_ALLOC); \
329 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
330 TRACE_KMEM_FREE); \
331 IF_ASSIGN(var, ent, struct syscall_trace_enter, \
332 TRACE_SYSCALL_ENTER); \
333 IF_ASSIGN(var, ent, struct syscall_trace_exit, \
334 TRACE_SYSCALL_EXIT); \
283 __ftrace_bad_type(); \ 335 __ftrace_bad_type(); \
284 } while (0) 336 } while (0)
285 337
@@ -287,7 +339,8 @@ extern void __ftrace_bad_type(void);
287enum print_line_t { 339enum print_line_t {
288 TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */ 340 TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */
289 TRACE_TYPE_HANDLED = 1, 341 TRACE_TYPE_HANDLED = 1,
290 TRACE_TYPE_UNHANDLED = 2 /* Relay to other output functions */ 342 TRACE_TYPE_UNHANDLED = 2, /* Relay to other output functions */
343 TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */
291}; 344};
292 345
293 346
@@ -297,8 +350,8 @@ enum print_line_t {
297 * flags value in struct tracer_flags. 350 * flags value in struct tracer_flags.
298 */ 351 */
299struct tracer_opt { 352struct tracer_opt {
300 const char *name; /* Will appear on the trace_options file */ 353 const char *name; /* Will appear on the trace_options file */
301 u32 bit; /* Mask assigned in val field in tracer_flags */ 354 u32 bit; /* Mask assigned in val field in tracer_flags */
302}; 355};
303 356
304/* 357/*
@@ -307,28 +360,51 @@ struct tracer_opt {
307 */ 360 */
308struct tracer_flags { 361struct tracer_flags {
309 u32 val; 362 u32 val;
310 struct tracer_opt *opts; 363 struct tracer_opt *opts;
311}; 364};
312 365
313/* Makes more easy to define a tracer opt */ 366/* Makes more easy to define a tracer opt */
314#define TRACER_OPT(s, b) .name = #s, .bit = b 367#define TRACER_OPT(s, b) .name = #s, .bit = b
315 368
316/* 369
317 * A specific tracer, represented by methods that operate on a trace array: 370/**
371 * struct tracer - a specific tracer and its callbacks to interact with debugfs
372 * @name: the name chosen to select it on the available_tracers file
373 * @init: called when one switches to this tracer (echo name > current_tracer)
374 * @reset: called when one switches to another tracer
375 * @start: called when tracing is unpaused (echo 1 > tracing_enabled)
376 * @stop: called when tracing is paused (echo 0 > tracing_enabled)
377 * @open: called when the trace file is opened
378 * @pipe_open: called when the trace_pipe file is opened
379 * @wait_pipe: override how the user waits for traces on trace_pipe
380 * @close: called when the trace file is released
381 * @read: override the default read callback on trace_pipe
382 * @splice_read: override the default splice_read callback on trace_pipe
383 * @selftest: selftest to run on boot (see trace_selftest.c)
384 * @print_headers: override the first lines that describe your columns
385 * @print_line: callback that prints a trace
386 * @set_flag: signals one of your private flags changed (trace_options file)
387 * @flags: your private flags
318 */ 388 */
319struct tracer { 389struct tracer {
320 const char *name; 390 const char *name;
321 /* Your tracer should raise a warning if init fails */
322 int (*init)(struct trace_array *tr); 391 int (*init)(struct trace_array *tr);
323 void (*reset)(struct trace_array *tr); 392 void (*reset)(struct trace_array *tr);
324 void (*start)(struct trace_array *tr); 393 void (*start)(struct trace_array *tr);
325 void (*stop)(struct trace_array *tr); 394 void (*stop)(struct trace_array *tr);
326 void (*open)(struct trace_iterator *iter); 395 void (*open)(struct trace_iterator *iter);
327 void (*pipe_open)(struct trace_iterator *iter); 396 void (*pipe_open)(struct trace_iterator *iter);
397 void (*wait_pipe)(struct trace_iterator *iter);
328 void (*close)(struct trace_iterator *iter); 398 void (*close)(struct trace_iterator *iter);
329 ssize_t (*read)(struct trace_iterator *iter, 399 ssize_t (*read)(struct trace_iterator *iter,
330 struct file *filp, char __user *ubuf, 400 struct file *filp, char __user *ubuf,
331 size_t cnt, loff_t *ppos); 401 size_t cnt, loff_t *ppos);
402 ssize_t (*splice_read)(struct trace_iterator *iter,
403 struct file *filp,
404 loff_t *ppos,
405 struct pipe_inode_info *pipe,
406 size_t len,
407 unsigned int flags);
332#ifdef CONFIG_FTRACE_STARTUP_TEST 408#ifdef CONFIG_FTRACE_STARTUP_TEST
333 int (*selftest)(struct tracer *trace, 409 int (*selftest)(struct tracer *trace,
334 struct trace_array *tr); 410 struct trace_array *tr);
@@ -339,7 +415,8 @@ struct tracer {
339 int (*set_flag)(u32 old_flags, u32 bit, int set); 415 int (*set_flag)(u32 old_flags, u32 bit, int set);
340 struct tracer *next; 416 struct tracer *next;
341 int print_max; 417 int print_max;
342 struct tracer_flags *flags; 418 struct tracer_flags *flags;
419 struct tracer_stat *stats;
343}; 420};
344 421
345struct trace_seq { 422struct trace_seq {
@@ -348,6 +425,16 @@ struct trace_seq {
348 unsigned int readpos; 425 unsigned int readpos;
349}; 426};
350 427
428static inline void
429trace_seq_init(struct trace_seq *s)
430{
431 s->len = 0;
432 s->readpos = 0;
433}
434
435
436#define TRACE_PIPE_ALL_CPU -1
437
351/* 438/*
352 * Trace iterator - used by printout routines who present trace 439 * Trace iterator - used by printout routines who present trace
353 * results to users and which routines might sleep, etc: 440 * results to users and which routines might sleep, etc:
@@ -356,6 +443,8 @@ struct trace_iterator {
356 struct trace_array *tr; 443 struct trace_array *tr;
357 struct tracer *trace; 444 struct tracer *trace;
358 void *private; 445 void *private;
446 int cpu_file;
447 struct mutex mutex;
359 struct ring_buffer_iter *buffer_iter[NR_CPUS]; 448 struct ring_buffer_iter *buffer_iter[NR_CPUS];
360 449
361 /* The below is zeroed out in pipe_read */ 450 /* The below is zeroed out in pipe_read */
@@ -371,6 +460,7 @@ struct trace_iterator {
371 cpumask_var_t started; 460 cpumask_var_t started;
372}; 461};
373 462
463int tracer_init(struct tracer *t, struct trace_array *tr);
374int tracing_is_enabled(void); 464int tracing_is_enabled(void);
375void trace_wake_up(void); 465void trace_wake_up(void);
376void tracing_reset(struct trace_array *tr, int cpu); 466void tracing_reset(struct trace_array *tr, int cpu);
@@ -379,26 +469,48 @@ int tracing_open_generic(struct inode *inode, struct file *filp);
379struct dentry *tracing_init_dentry(void); 469struct dentry *tracing_init_dentry(void);
380void init_tracer_sysprof_debugfs(struct dentry *d_tracer); 470void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
381 471
472struct ring_buffer_event;
473
474struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
475 unsigned char type,
476 unsigned long len,
477 unsigned long flags,
478 int pc);
479void trace_buffer_unlock_commit(struct trace_array *tr,
480 struct ring_buffer_event *event,
481 unsigned long flags, int pc);
482
483struct ring_buffer_event *
484trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
485 unsigned long flags, int pc);
486void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
487 unsigned long flags, int pc);
488
382struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, 489struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
383 struct trace_array_cpu *data); 490 struct trace_array_cpu *data);
491
492struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
493 int *ent_cpu, u64 *ent_ts);
494
384void tracing_generic_entry_update(struct trace_entry *entry, 495void tracing_generic_entry_update(struct trace_entry *entry,
385 unsigned long flags, 496 unsigned long flags,
386 int pc); 497 int pc);
387 498
499void default_wait_pipe(struct trace_iterator *iter);
500void poll_wait_pipe(struct trace_iterator *iter);
501
388void ftrace(struct trace_array *tr, 502void ftrace(struct trace_array *tr,
389 struct trace_array_cpu *data, 503 struct trace_array_cpu *data,
390 unsigned long ip, 504 unsigned long ip,
391 unsigned long parent_ip, 505 unsigned long parent_ip,
392 unsigned long flags, int pc); 506 unsigned long flags, int pc);
393void tracing_sched_switch_trace(struct trace_array *tr, 507void tracing_sched_switch_trace(struct trace_array *tr,
394 struct trace_array_cpu *data,
395 struct task_struct *prev, 508 struct task_struct *prev,
396 struct task_struct *next, 509 struct task_struct *next,
397 unsigned long flags, int pc); 510 unsigned long flags, int pc);
398void tracing_record_cmdline(struct task_struct *tsk); 511void tracing_record_cmdline(struct task_struct *tsk);
399 512
400void tracing_sched_wakeup_trace(struct trace_array *tr, 513void tracing_sched_wakeup_trace(struct trace_array *tr,
401 struct trace_array_cpu *data,
402 struct task_struct *wakee, 514 struct task_struct *wakee,
403 struct task_struct *cur, 515 struct task_struct *cur,
404 unsigned long flags, int pc); 516 unsigned long flags, int pc);
@@ -408,14 +520,12 @@ void trace_special(struct trace_array *tr,
408 unsigned long arg2, 520 unsigned long arg2,
409 unsigned long arg3, int pc); 521 unsigned long arg3, int pc);
410void trace_function(struct trace_array *tr, 522void trace_function(struct trace_array *tr,
411 struct trace_array_cpu *data,
412 unsigned long ip, 523 unsigned long ip,
413 unsigned long parent_ip, 524 unsigned long parent_ip,
414 unsigned long flags, int pc); 525 unsigned long flags, int pc);
415 526
416void trace_graph_return(struct ftrace_graph_ret *trace); 527void trace_graph_return(struct ftrace_graph_ret *trace);
417int trace_graph_entry(struct ftrace_graph_ent *trace); 528int trace_graph_entry(struct ftrace_graph_ent *trace);
418void trace_hw_branch(struct trace_array *tr, u64 from, u64 to);
419 529
420void tracing_start_cmdline_record(void); 530void tracing_start_cmdline_record(void);
421void tracing_stop_cmdline_record(void); 531void tracing_stop_cmdline_record(void);
@@ -434,15 +544,11 @@ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
434void update_max_tr_single(struct trace_array *tr, 544void update_max_tr_single(struct trace_array *tr,
435 struct task_struct *tsk, int cpu); 545 struct task_struct *tsk, int cpu);
436 546
437extern cycle_t ftrace_now(int cpu); 547void __trace_stack(struct trace_array *tr,
548 unsigned long flags,
549 int skip, int pc);
438 550
439#ifdef CONFIG_FUNCTION_TRACER 551extern cycle_t ftrace_now(int cpu);
440void tracing_start_function_trace(void);
441void tracing_stop_function_trace(void);
442#else
443# define tracing_start_function_trace() do { } while (0)
444# define tracing_stop_function_trace() do { } while (0)
445#endif
446 552
447#ifdef CONFIG_CONTEXT_SWITCH_TRACER 553#ifdef CONFIG_CONTEXT_SWITCH_TRACER
448typedef void 554typedef void
@@ -456,10 +562,10 @@ struct tracer_switch_ops {
456 void *private; 562 void *private;
457 struct tracer_switch_ops *next; 563 struct tracer_switch_ops *next;
458}; 564};
459
460char *trace_find_cmdline(int pid);
461#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ 565#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
462 566
567extern void trace_find_cmdline(int pid, char comm[]);
568
463#ifdef CONFIG_DYNAMIC_FTRACE 569#ifdef CONFIG_DYNAMIC_FTRACE
464extern unsigned long ftrace_update_tot_cnt; 570extern unsigned long ftrace_update_tot_cnt;
465#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func 571#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
@@ -469,6 +575,8 @@ extern int DYN_FTRACE_TEST_NAME(void);
469#ifdef CONFIG_FTRACE_STARTUP_TEST 575#ifdef CONFIG_FTRACE_STARTUP_TEST
470extern int trace_selftest_startup_function(struct tracer *trace, 576extern int trace_selftest_startup_function(struct tracer *trace,
471 struct trace_array *tr); 577 struct trace_array *tr);
578extern int trace_selftest_startup_function_graph(struct tracer *trace,
579 struct trace_array *tr);
472extern int trace_selftest_startup_irqsoff(struct tracer *trace, 580extern int trace_selftest_startup_irqsoff(struct tracer *trace,
473 struct trace_array *tr); 581 struct trace_array *tr);
474extern int trace_selftest_startup_preemptoff(struct tracer *trace, 582extern int trace_selftest_startup_preemptoff(struct tracer *trace,
@@ -488,17 +596,10 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
488#endif /* CONFIG_FTRACE_STARTUP_TEST */ 596#endif /* CONFIG_FTRACE_STARTUP_TEST */
489 597
490extern void *head_page(struct trace_array_cpu *data); 598extern void *head_page(struct trace_array_cpu *data);
491extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
492extern void trace_seq_print_cont(struct trace_seq *s,
493 struct trace_iterator *iter);
494
495extern int
496seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
497 unsigned long sym_flags);
498extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
499 size_t cnt);
500extern long ns2usecs(cycle_t nsec); 599extern long ns2usecs(cycle_t nsec);
501extern int 600extern int
601trace_vbprintk(unsigned long ip, int depth, const char *fmt, va_list args);
602extern int
502trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args); 603trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args);
503 604
504extern unsigned long trace_flags; 605extern unsigned long trace_flags;
@@ -580,7 +681,10 @@ enum trace_iterator_flags {
580 TRACE_ITER_ANNOTATE = 0x2000, 681 TRACE_ITER_ANNOTATE = 0x2000,
581 TRACE_ITER_USERSTACKTRACE = 0x4000, 682 TRACE_ITER_USERSTACKTRACE = 0x4000,
582 TRACE_ITER_SYM_USEROBJ = 0x8000, 683 TRACE_ITER_SYM_USEROBJ = 0x8000,
583 TRACE_ITER_PRINTK_MSGONLY = 0x10000 684 TRACE_ITER_PRINTK_MSGONLY = 0x10000,
685 TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */
686 TRACE_ITER_LATENCY_FMT = 0x40000,
687 TRACE_ITER_GLOBAL_CLK = 0x80000,
584}; 688};
585 689
586/* 690/*
@@ -601,12 +705,12 @@ extern struct tracer nop_trace;
601 * preempt_enable (after a disable), a schedule might take place 705 * preempt_enable (after a disable), a schedule might take place
602 * causing an infinite recursion. 706 * causing an infinite recursion.
603 * 707 *
604 * To prevent this, we read the need_recshed flag before 708 * To prevent this, we read the need_resched flag before
605 * disabling preemption. When we want to enable preemption we 709 * disabling preemption. When we want to enable preemption we
606 * check the flag, if it is set, then we call preempt_enable_no_resched. 710 * check the flag, if it is set, then we call preempt_enable_no_resched.
607 * Otherwise, we call preempt_enable. 711 * Otherwise, we call preempt_enable.
608 * 712 *
609 * The rational for doing the above is that if need resched is set 713 * The rational for doing the above is that if need_resched is set
610 * and we have yet to reschedule, we are either in an atomic location 714 * and we have yet to reschedule, we are either in an atomic location
611 * (where we do not need to check for scheduling) or we are inside 715 * (where we do not need to check for scheduling) or we are inside
612 * the scheduler and do not want to resched. 716 * the scheduler and do not want to resched.
@@ -627,7 +731,7 @@ static inline int ftrace_preempt_disable(void)
627 * 731 *
628 * This is a scheduler safe way to enable preemption and not miss 732 * This is a scheduler safe way to enable preemption and not miss
629 * any preemption checks. The disabled saved the state of preemption. 733 * any preemption checks. The disabled saved the state of preemption.
630 * If resched is set, then we were either inside an atomic or 734 * If resched is set, then we are either inside an atomic or
631 * are inside the scheduler (we would have already scheduled 735 * are inside the scheduler (we would have already scheduled
632 * otherwise). In this case, we do not want to call normal 736 * otherwise). In this case, we do not want to call normal
633 * preempt_enable, but preempt_enable_no_resched instead. 737 * preempt_enable, but preempt_enable_no_resched instead.
@@ -664,4 +768,51 @@ static inline void trace_branch_disable(void)
664} 768}
665#endif /* CONFIG_BRANCH_TRACER */ 769#endif /* CONFIG_BRANCH_TRACER */
666 770
771/* set ring buffers to default size if not already done so */
772int tracing_update_buffers(void);
773
774/* trace event type bit fields, not numeric */
775enum {
776 TRACE_EVENT_TYPE_PRINTF = 1,
777 TRACE_EVENT_TYPE_RAW = 2,
778};
779
780struct ftrace_event_call {
781 char *name;
782 char *system;
783 struct dentry *dir;
784 int enabled;
785 int (*regfunc)(void);
786 void (*unregfunc)(void);
787 int id;
788 int (*raw_init)(void);
789 int (*show_format)(struct trace_seq *s);
790};
791
792void event_trace_printk(unsigned long ip, const char *fmt, ...);
793extern struct ftrace_event_call __start_ftrace_events[];
794extern struct ftrace_event_call __stop_ftrace_events[];
795
796extern const char *__start___trace_bprintk_fmt[];
797extern const char *__stop___trace_bprintk_fmt[];
798
799/*
800 * The double __builtin_constant_p is because gcc will give us an error
801 * if we try to allocate the static variable to fmt if it is not a
802 * constant. Even with the outer if statement optimizing out.
803 */
804#define event_trace_printk(ip, fmt, args...) \
805do { \
806 __trace_printk_check_format(fmt, ##args); \
807 tracing_record_cmdline(current); \
808 if (__builtin_constant_p(fmt)) { \
809 static const char *trace_printk_fmt \
810 __attribute__((section("__trace_printk_fmt"))) = \
811 __builtin_constant_p(fmt) ? fmt : NULL; \
812 \
813 __trace_bprintk(ip, trace_printk_fmt, ##args); \
814 } else \
815 __trace_printk(ip, fmt, ##args); \
816} while (0)
817
667#endif /* _LINUX_KERNEL_TRACE_H */ 818#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 366c8c333e13..7a30fc4c3642 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -11,6 +11,7 @@
11#include <linux/kallsyms.h> 11#include <linux/kallsyms.h>
12 12
13#include "trace.h" 13#include "trace.h"
14#include "trace_output.h"
14 15
15static struct trace_array *boot_trace; 16static struct trace_array *boot_trace;
16static bool pre_initcalls_finished; 17static bool pre_initcalls_finished;
@@ -27,13 +28,13 @@ void start_boot_trace(void)
27 28
28void enable_boot_trace(void) 29void enable_boot_trace(void)
29{ 30{
30 if (pre_initcalls_finished) 31 if (boot_trace && pre_initcalls_finished)
31 tracing_start_sched_switch_record(); 32 tracing_start_sched_switch_record();
32} 33}
33 34
34void disable_boot_trace(void) 35void disable_boot_trace(void)
35{ 36{
36 if (pre_initcalls_finished) 37 if (boot_trace && pre_initcalls_finished)
37 tracing_stop_sched_switch_record(); 38 tracing_stop_sched_switch_record();
38} 39}
39 40
@@ -42,6 +43,9 @@ static int boot_trace_init(struct trace_array *tr)
42 int cpu; 43 int cpu;
43 boot_trace = tr; 44 boot_trace = tr;
44 45
46 if (!tr)
47 return 0;
48
45 for_each_cpu(cpu, cpu_possible_mask) 49 for_each_cpu(cpu, cpu_possible_mask)
46 tracing_reset(tr, cpu); 50 tracing_reset(tr, cpu);
47 51
@@ -128,10 +132,9 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
128{ 132{
129 struct ring_buffer_event *event; 133 struct ring_buffer_event *event;
130 struct trace_boot_call *entry; 134 struct trace_boot_call *entry;
131 unsigned long irq_flags;
132 struct trace_array *tr = boot_trace; 135 struct trace_array *tr = boot_trace;
133 136
134 if (!pre_initcalls_finished) 137 if (!tr || !pre_initcalls_finished)
135 return; 138 return;
136 139
137 /* Get its name now since this function could 140 /* Get its name now since this function could
@@ -140,18 +143,13 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
140 sprint_symbol(bt->func, (unsigned long)fn); 143 sprint_symbol(bt->func, (unsigned long)fn);
141 preempt_disable(); 144 preempt_disable();
142 145
143 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 146 event = trace_buffer_lock_reserve(tr, TRACE_BOOT_CALL,
144 &irq_flags); 147 sizeof(*entry), 0, 0);
145 if (!event) 148 if (!event)
146 goto out; 149 goto out;
147 entry = ring_buffer_event_data(event); 150 entry = ring_buffer_event_data(event);
148 tracing_generic_entry_update(&entry->ent, 0, 0);
149 entry->ent.type = TRACE_BOOT_CALL;
150 entry->boot_call = *bt; 151 entry->boot_call = *bt;
151 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 152 trace_buffer_unlock_commit(tr, event, 0, 0);
152
153 trace_wake_up();
154
155 out: 153 out:
156 preempt_enable(); 154 preempt_enable();
157} 155}
@@ -160,27 +158,21 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
160{ 158{
161 struct ring_buffer_event *event; 159 struct ring_buffer_event *event;
162 struct trace_boot_ret *entry; 160 struct trace_boot_ret *entry;
163 unsigned long irq_flags;
164 struct trace_array *tr = boot_trace; 161 struct trace_array *tr = boot_trace;
165 162
166 if (!pre_initcalls_finished) 163 if (!tr || !pre_initcalls_finished)
167 return; 164 return;
168 165
169 sprint_symbol(bt->func, (unsigned long)fn); 166 sprint_symbol(bt->func, (unsigned long)fn);
170 preempt_disable(); 167 preempt_disable();
171 168
172 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 169 event = trace_buffer_lock_reserve(tr, TRACE_BOOT_RET,
173 &irq_flags); 170 sizeof(*entry), 0, 0);
174 if (!event) 171 if (!event)
175 goto out; 172 goto out;
176 entry = ring_buffer_event_data(event); 173 entry = ring_buffer_event_data(event);
177 tracing_generic_entry_update(&entry->ent, 0, 0);
178 entry->ent.type = TRACE_BOOT_RET;
179 entry->boot_ret = *bt; 174 entry->boot_ret = *bt;
180 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 175 trace_buffer_unlock_commit(tr, event, 0, 0);
181
182 trace_wake_up();
183
184 out: 176 out:
185 preempt_enable(); 177 preempt_enable();
186} 178}
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 6c00feb3bac7..ad8c22efff41 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -14,12 +14,17 @@
14#include <linux/hash.h> 14#include <linux/hash.h>
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <asm/local.h> 16#include <asm/local.h>
17
17#include "trace.h" 18#include "trace.h"
19#include "trace_stat.h"
20#include "trace_output.h"
18 21
19#ifdef CONFIG_BRANCH_TRACER 22#ifdef CONFIG_BRANCH_TRACER
20 23
24static struct tracer branch_trace;
21static int branch_tracing_enabled __read_mostly; 25static int branch_tracing_enabled __read_mostly;
22static DEFINE_MUTEX(branch_tracing_mutex); 26static DEFINE_MUTEX(branch_tracing_mutex);
27
23static struct trace_array *branch_tracer; 28static struct trace_array *branch_tracer;
24 29
25static void 30static void
@@ -28,7 +33,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
28 struct trace_array *tr = branch_tracer; 33 struct trace_array *tr = branch_tracer;
29 struct ring_buffer_event *event; 34 struct ring_buffer_event *event;
30 struct trace_branch *entry; 35 struct trace_branch *entry;
31 unsigned long flags, irq_flags; 36 unsigned long flags;
32 int cpu, pc; 37 int cpu, pc;
33 const char *p; 38 const char *p;
34 39
@@ -47,15 +52,13 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
47 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) 52 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
48 goto out; 53 goto out;
49 54
50 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 55 pc = preempt_count();
51 &irq_flags); 56 event = trace_buffer_lock_reserve(tr, TRACE_BRANCH,
57 sizeof(*entry), flags, pc);
52 if (!event) 58 if (!event)
53 goto out; 59 goto out;
54 60
55 pc = preempt_count();
56 entry = ring_buffer_event_data(event); 61 entry = ring_buffer_event_data(event);
57 tracing_generic_entry_update(&entry->ent, flags, pc);
58 entry->ent.type = TRACE_BRANCH;
59 62
60 /* Strip off the path, only save the file */ 63 /* Strip off the path, only save the file */
61 p = f->file + strlen(f->file); 64 p = f->file + strlen(f->file);
@@ -70,7 +73,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
70 entry->line = f->line; 73 entry->line = f->line;
71 entry->correct = val == expect; 74 entry->correct = val == expect;
72 75
73 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 76 ring_buffer_unlock_commit(tr->buffer, event);
74 77
75 out: 78 out:
76 atomic_dec(&tr->data[cpu]->disabled); 79 atomic_dec(&tr->data[cpu]->disabled);
@@ -88,8 +91,6 @@ void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
88 91
89int enable_branch_tracing(struct trace_array *tr) 92int enable_branch_tracing(struct trace_array *tr)
90{ 93{
91 int ret = 0;
92
93 mutex_lock(&branch_tracing_mutex); 94 mutex_lock(&branch_tracing_mutex);
94 branch_tracer = tr; 95 branch_tracer = tr;
95 /* 96 /*
@@ -100,7 +101,7 @@ int enable_branch_tracing(struct trace_array *tr)
100 branch_tracing_enabled++; 101 branch_tracing_enabled++;
101 mutex_unlock(&branch_tracing_mutex); 102 mutex_unlock(&branch_tracing_mutex);
102 103
103 return ret; 104 return 0;
104} 105}
105 106
106void disable_branch_tracing(void) 107void disable_branch_tracing(void)
@@ -128,11 +129,6 @@ static void stop_branch_trace(struct trace_array *tr)
128 129
129static int branch_trace_init(struct trace_array *tr) 130static int branch_trace_init(struct trace_array *tr)
130{ 131{
131 int cpu;
132
133 for_each_online_cpu(cpu)
134 tracing_reset(tr, cpu);
135
136 start_branch_trace(tr); 132 start_branch_trace(tr);
137 return 0; 133 return 0;
138} 134}
@@ -142,22 +138,53 @@ static void branch_trace_reset(struct trace_array *tr)
142 stop_branch_trace(tr); 138 stop_branch_trace(tr);
143} 139}
144 140
145struct tracer branch_trace __read_mostly = 141static enum print_line_t trace_branch_print(struct trace_iterator *iter,
142 int flags)
143{
144 struct trace_branch *field;
145
146 trace_assign_type(field, iter->ent);
147
148 if (trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n",
149 field->correct ? " ok " : " MISS ",
150 field->func,
151 field->file,
152 field->line))
153 return TRACE_TYPE_PARTIAL_LINE;
154
155 return TRACE_TYPE_HANDLED;
156}
157
158
159static struct trace_event trace_branch_event = {
160 .type = TRACE_BRANCH,
161 .trace = trace_branch_print,
162};
163
164static struct tracer branch_trace __read_mostly =
146{ 165{
147 .name = "branch", 166 .name = "branch",
148 .init = branch_trace_init, 167 .init = branch_trace_init,
149 .reset = branch_trace_reset, 168 .reset = branch_trace_reset,
150#ifdef CONFIG_FTRACE_SELFTEST 169#ifdef CONFIG_FTRACE_SELFTEST
151 .selftest = trace_selftest_startup_branch, 170 .selftest = trace_selftest_startup_branch,
152#endif 171#endif /* CONFIG_FTRACE_SELFTEST */
153}; 172};
154 173
155__init static int init_branch_trace(void) 174__init static int init_branch_tracer(void)
156{ 175{
176 int ret;
177
178 ret = register_ftrace_event(&trace_branch_event);
179 if (!ret) {
180 printk(KERN_WARNING "Warning: could not register "
181 "branch events\n");
182 return 1;
183 }
157 return register_tracer(&branch_trace); 184 return register_tracer(&branch_trace);
158} 185}
186device_initcall(init_branch_tracer);
159 187
160device_initcall(init_branch_trace);
161#else 188#else
162static inline 189static inline
163void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) 190void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
@@ -183,66 +210,39 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect)
183} 210}
184EXPORT_SYMBOL(ftrace_likely_update); 211EXPORT_SYMBOL(ftrace_likely_update);
185 212
186struct ftrace_pointer { 213extern unsigned long __start_annotated_branch_profile[];
187 void *start; 214extern unsigned long __stop_annotated_branch_profile[];
188 void *stop;
189 int hit;
190};
191 215
192static void * 216static int annotated_branch_stat_headers(struct seq_file *m)
193t_next(struct seq_file *m, void *v, loff_t *pos)
194{ 217{
195 const struct ftrace_pointer *f = m->private; 218 seq_printf(m, " correct incorrect %% ");
196 struct ftrace_branch_data *p = v; 219 seq_printf(m, " Function "
197 220 " File Line\n"
198 (*pos)++; 221 " ------- --------- - "
199 222 " -------- "
200 if (v == (void *)1) 223 " ---- ----\n");
201 return f->start; 224 return 0;
202
203 ++p;
204
205 if ((void *)p >= (void *)f->stop)
206 return NULL;
207
208 return p;
209} 225}
210 226
211static void *t_start(struct seq_file *m, loff_t *pos) 227static inline long get_incorrect_percent(struct ftrace_branch_data *p)
212{ 228{
213 void *t = (void *)1; 229 long percent;
214 loff_t l = 0;
215
216 for (; t && l < *pos; t = t_next(m, t, &l))
217 ;
218 230
219 return t; 231 if (p->correct) {
220} 232 percent = p->incorrect * 100;
233 percent /= p->correct + p->incorrect;
234 } else
235 percent = p->incorrect ? 100 : -1;
221 236
222static void t_stop(struct seq_file *m, void *p) 237 return percent;
223{
224} 238}
225 239
226static int t_show(struct seq_file *m, void *v) 240static int branch_stat_show(struct seq_file *m, void *v)
227{ 241{
228 const struct ftrace_pointer *fp = m->private;
229 struct ftrace_branch_data *p = v; 242 struct ftrace_branch_data *p = v;
230 const char *f; 243 const char *f;
231 long percent; 244 long percent;
232 245
233 if (v == (void *)1) {
234 if (fp->hit)
235 seq_printf(m, " miss hit %% ");
236 else
237 seq_printf(m, " correct incorrect %% ");
238 seq_printf(m, " Function "
239 " File Line\n"
240 " ------- --------- - "
241 " -------- "
242 " ---- ----\n");
243 return 0;
244 }
245
246 /* Only print the file, not the path */ 246 /* Only print the file, not the path */
247 f = p->file + strlen(p->file); 247 f = p->file + strlen(p->file);
248 while (f >= p->file && *f != '/') 248 while (f >= p->file && *f != '/')
@@ -252,11 +252,7 @@ static int t_show(struct seq_file *m, void *v)
252 /* 252 /*
253 * The miss is overlayed on correct, and hit on incorrect. 253 * The miss is overlayed on correct, and hit on incorrect.
254 */ 254 */
255 if (p->correct) { 255 percent = get_incorrect_percent(p);
256 percent = p->incorrect * 100;
257 percent /= p->correct + p->incorrect;
258 } else
259 percent = p->incorrect ? 100 : -1;
260 256
261 seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); 257 seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect);
262 if (percent < 0) 258 if (percent < 0)
@@ -267,76 +263,118 @@ static int t_show(struct seq_file *m, void *v)
267 return 0; 263 return 0;
268} 264}
269 265
270static struct seq_operations tracing_likely_seq_ops = { 266static void *annotated_branch_stat_start(void)
271 .start = t_start, 267{
272 .next = t_next, 268 return __start_annotated_branch_profile;
273 .stop = t_stop, 269}
274 .show = t_show, 270
271static void *
272annotated_branch_stat_next(void *v, int idx)
273{
274 struct ftrace_branch_data *p = v;
275
276 ++p;
277
278 if ((void *)p >= (void *)__stop_annotated_branch_profile)
279 return NULL;
280
281 return p;
282}
283
284static int annotated_branch_stat_cmp(void *p1, void *p2)
285{
286 struct ftrace_branch_data *a = p1;
287 struct ftrace_branch_data *b = p2;
288
289 long percent_a, percent_b;
290
291 percent_a = get_incorrect_percent(a);
292 percent_b = get_incorrect_percent(b);
293
294 if (percent_a < percent_b)
295 return -1;
296 if (percent_a > percent_b)
297 return 1;
298 else
299 return 0;
300}
301
302static struct tracer_stat annotated_branch_stats = {
303 .name = "branch_annotated",
304 .stat_start = annotated_branch_stat_start,
305 .stat_next = annotated_branch_stat_next,
306 .stat_cmp = annotated_branch_stat_cmp,
307 .stat_headers = annotated_branch_stat_headers,
308 .stat_show = branch_stat_show
275}; 309};
276 310
277static int tracing_branch_open(struct inode *inode, struct file *file) 311__init static int init_annotated_branch_stats(void)
278{ 312{
279 int ret; 313 int ret;
280 314
281 ret = seq_open(file, &tracing_likely_seq_ops); 315 ret = register_stat_tracer(&annotated_branch_stats);
282 if (!ret) { 316 if (!ret) {
283 struct seq_file *m = file->private_data; 317 printk(KERN_WARNING "Warning: could not register "
284 m->private = (void *)inode->i_private; 318 "annotated branches stats\n");
319 return 1;
285 } 320 }
286 321 return 0;
287 return ret;
288} 322}
289 323fs_initcall(init_annotated_branch_stats);
290static const struct file_operations tracing_branch_fops = {
291 .open = tracing_branch_open,
292 .read = seq_read,
293 .llseek = seq_lseek,
294};
295 324
296#ifdef CONFIG_PROFILE_ALL_BRANCHES 325#ifdef CONFIG_PROFILE_ALL_BRANCHES
326
297extern unsigned long __start_branch_profile[]; 327extern unsigned long __start_branch_profile[];
298extern unsigned long __stop_branch_profile[]; 328extern unsigned long __stop_branch_profile[];
299 329
300static const struct ftrace_pointer ftrace_branch_pos = { 330static int all_branch_stat_headers(struct seq_file *m)
301 .start = __start_branch_profile, 331{
302 .stop = __stop_branch_profile, 332 seq_printf(m, " miss hit %% ");
303 .hit = 1, 333 seq_printf(m, " Function "
304}; 334 " File Line\n"
335 " ------- --------- - "
336 " -------- "
337 " ---- ----\n");
338 return 0;
339}
305 340
306#endif /* CONFIG_PROFILE_ALL_BRANCHES */ 341static void *all_branch_stat_start(void)
342{
343 return __start_branch_profile;
344}
307 345
308extern unsigned long __start_annotated_branch_profile[]; 346static void *
309extern unsigned long __stop_annotated_branch_profile[]; 347all_branch_stat_next(void *v, int idx)
348{
349 struct ftrace_branch_data *p = v;
310 350
311static const struct ftrace_pointer ftrace_annotated_branch_pos = { 351 ++p;
312 .start = __start_annotated_branch_profile,
313 .stop = __stop_annotated_branch_profile,
314};
315 352
316static __init int ftrace_branch_init(void) 353 if ((void *)p >= (void *)__stop_branch_profile)
317{ 354 return NULL;
318 struct dentry *d_tracer;
319 struct dentry *entry;
320 355
321 d_tracer = tracing_init_dentry(); 356 return p;
357}
322 358
323 entry = debugfs_create_file("profile_annotated_branch", 0444, d_tracer, 359static struct tracer_stat all_branch_stats = {
324 (void *)&ftrace_annotated_branch_pos, 360 .name = "branch_all",
325 &tracing_branch_fops); 361 .stat_start = all_branch_stat_start,
326 if (!entry) 362 .stat_next = all_branch_stat_next,
327 pr_warning("Could not create debugfs " 363 .stat_headers = all_branch_stat_headers,
328 "'profile_annotatet_branch' entry\n"); 364 .stat_show = branch_stat_show
365};
329 366
330#ifdef CONFIG_PROFILE_ALL_BRANCHES 367__init static int all_annotated_branch_stats(void)
331 entry = debugfs_create_file("profile_branch", 0444, d_tracer, 368{
332 (void *)&ftrace_branch_pos, 369 int ret;
333 &tracing_branch_fops);
334 if (!entry)
335 pr_warning("Could not create debugfs"
336 " 'profile_branch' entry\n");
337#endif
338 370
371 ret = register_stat_tracer(&all_branch_stats);
372 if (!ret) {
373 printk(KERN_WARNING "Warning: could not register "
374 "all branches stats\n");
375 return 1;
376 }
339 return 0; 377 return 0;
340} 378}
341 379fs_initcall(all_annotated_branch_stats);
342device_initcall(ftrace_branch_init); 380#endif /* CONFIG_PROFILE_ALL_BRANCHES */
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
new file mode 100644
index 000000000000..05b176abfd30
--- /dev/null
+++ b/kernel/trace/trace_clock.c
@@ -0,0 +1,108 @@
1/*
2 * tracing clocks
3 *
4 * Copyright (C) 2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5 *
6 * Implements 3 trace clock variants, with differing scalability/precision
7 * tradeoffs:
8 *
9 * - local: CPU-local trace clock
10 * - medium: scalable global clock with some jitter
11 * - global: globally monotonic, serialized clock
12 *
13 * Tracer plugins will chose a default from these clocks.
14 */
15#include <linux/spinlock.h>
16#include <linux/hardirq.h>
17#include <linux/module.h>
18#include <linux/percpu.h>
19#include <linux/sched.h>
20#include <linux/ktime.h>
21
22/*
23 * trace_clock_local(): the simplest and least coherent tracing clock.
24 *
25 * Useful for tracing that does not cross to other CPUs nor
26 * does it go through idle events.
27 */
28u64 notrace trace_clock_local(void)
29{
30 unsigned long flags;
31 u64 clock;
32
33 /*
34 * sched_clock() is an architecture implemented, fast, scalable,
35 * lockless clock. It is not guaranteed to be coherent across
36 * CPUs, nor across CPU idle events.
37 */
38 raw_local_irq_save(flags);
39 clock = sched_clock();
40 raw_local_irq_restore(flags);
41
42 return clock;
43}
44
45/*
46 * trace_clock(): 'inbetween' trace clock. Not completely serialized,
47 * but not completely incorrect when crossing CPUs either.
48 *
49 * This is based on cpu_clock(), which will allow at most ~1 jiffy of
50 * jitter between CPUs. So it's a pretty scalable clock, but there
51 * can be offsets in the trace data.
52 */
53u64 notrace trace_clock(void)
54{
55 return cpu_clock(raw_smp_processor_id());
56}
57
58
59/*
60 * trace_clock_global(): special globally coherent trace clock
61 *
62 * It has higher overhead than the other trace clocks but is still
63 * an order of magnitude faster than GTOD derived hardware clocks.
64 *
65 * Used by plugins that need globally coherent timestamps.
66 */
67
68static u64 prev_trace_clock_time;
69
70static raw_spinlock_t trace_clock_lock ____cacheline_aligned_in_smp =
71 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
72
73u64 notrace trace_clock_global(void)
74{
75 unsigned long flags;
76 int this_cpu;
77 u64 now;
78
79 raw_local_irq_save(flags);
80
81 this_cpu = raw_smp_processor_id();
82 now = cpu_clock(this_cpu);
83 /*
84 * If in an NMI context then dont risk lockups and return the
85 * cpu_clock() time:
86 */
87 if (unlikely(in_nmi()))
88 goto out;
89
90 __raw_spin_lock(&trace_clock_lock);
91
92 /*
93 * TODO: if this happens often then maybe we should reset
94 * my_scd->clock to prev_trace_clock_time+1, to make sure
95 * we start ticking with the local clock from now on?
96 */
97 if ((s64)(now - prev_trace_clock_time) < 0)
98 now = prev_trace_clock_time + 1;
99
100 prev_trace_clock_time = now;
101
102 __raw_spin_unlock(&trace_clock_lock);
103
104 out:
105 raw_local_irq_restore(flags);
106
107 return now;
108}
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
new file mode 100644
index 000000000000..019915063fe6
--- /dev/null
+++ b/kernel/trace/trace_event_types.h
@@ -0,0 +1,175 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM ftrace
3
4/*
5 * We cheat and use the proto type field as the ID
6 * and args as the entry type (minus 'struct')
7 */
8TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore,
9 TRACE_STRUCT(
10 TRACE_FIELD(unsigned long, ip, ip)
11 TRACE_FIELD(unsigned long, parent_ip, parent_ip)
12 ),
13 TP_RAW_FMT(" %lx <-- %lx")
14);
15
16TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT,
17 ftrace_graph_ent_entry, ignore,
18 TRACE_STRUCT(
19 TRACE_FIELD(unsigned long, graph_ent.func, func)
20 TRACE_FIELD(int, graph_ent.depth, depth)
21 ),
22 TP_RAW_FMT("--> %lx (%d)")
23);
24
25TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET,
26 ftrace_graph_ret_entry, ignore,
27 TRACE_STRUCT(
28 TRACE_FIELD(unsigned long, ret.func, func)
29 TRACE_FIELD(int, ret.depth, depth)
30 ),
31 TP_RAW_FMT("<-- %lx (%d)")
32);
33
34TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore,
35 TRACE_STRUCT(
36 TRACE_FIELD(unsigned int, prev_pid, prev_pid)
37 TRACE_FIELD(unsigned char, prev_prio, prev_prio)
38 TRACE_FIELD(unsigned char, prev_state, prev_state)
39 TRACE_FIELD(unsigned int, next_pid, next_pid)
40 TRACE_FIELD(unsigned char, next_prio, next_prio)
41 TRACE_FIELD(unsigned char, next_state, next_state)
42 TRACE_FIELD(unsigned int, next_cpu, next_cpu)
43 ),
44 TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
45);
46
47TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,
48 TRACE_STRUCT(
49 TRACE_FIELD(unsigned int, prev_pid, prev_pid)
50 TRACE_FIELD(unsigned char, prev_prio, prev_prio)
51 TRACE_FIELD(unsigned char, prev_state, prev_state)
52 TRACE_FIELD(unsigned int, next_pid, next_pid)
53 TRACE_FIELD(unsigned char, next_prio, next_prio)
54 TRACE_FIELD(unsigned char, next_state, next_state)
55 TRACE_FIELD(unsigned int, next_cpu, next_cpu)
56 ),
57 TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
58);
59
60TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore,
61 TRACE_STRUCT(
62 TRACE_FIELD(unsigned long, arg1, arg1)
63 TRACE_FIELD(unsigned long, arg2, arg2)
64 TRACE_FIELD(unsigned long, arg3, arg3)
65 ),
66 TP_RAW_FMT("(%08lx) (%08lx) (%08lx)")
67);
68
69/*
70 * Stack-trace entry:
71 */
72
73/* #define FTRACE_STACK_ENTRIES 8 */
74
75TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore,
76 TRACE_STRUCT(
77 TRACE_FIELD(unsigned long, caller[0], stack0)
78 TRACE_FIELD(unsigned long, caller[1], stack1)
79 TRACE_FIELD(unsigned long, caller[2], stack2)
80 TRACE_FIELD(unsigned long, caller[3], stack3)
81 TRACE_FIELD(unsigned long, caller[4], stack4)
82 TRACE_FIELD(unsigned long, caller[5], stack5)
83 TRACE_FIELD(unsigned long, caller[6], stack6)
84 TRACE_FIELD(unsigned long, caller[7], stack7)
85 ),
86 TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
87 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
88);
89
90TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore,
91 TRACE_STRUCT(
92 TRACE_FIELD(unsigned long, caller[0], stack0)
93 TRACE_FIELD(unsigned long, caller[1], stack1)
94 TRACE_FIELD(unsigned long, caller[2], stack2)
95 TRACE_FIELD(unsigned long, caller[3], stack3)
96 TRACE_FIELD(unsigned long, caller[4], stack4)
97 TRACE_FIELD(unsigned long, caller[5], stack5)
98 TRACE_FIELD(unsigned long, caller[6], stack6)
99 TRACE_FIELD(unsigned long, caller[7], stack7)
100 ),
101 TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
102 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
103);
104
105TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore,
106 TRACE_STRUCT(
107 TRACE_FIELD(unsigned long, ip, ip)
108 TRACE_FIELD(unsigned int, depth, depth)
109 TRACE_FIELD(char *, fmt, fmt)
110 TRACE_FIELD_ZERO_CHAR(buf)
111 ),
112 TP_RAW_FMT("%08lx (%d) fmt:%p %s")
113);
114
115TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
116 TRACE_STRUCT(
117 TRACE_FIELD(unsigned long, ip, ip)
118 TRACE_FIELD(unsigned int, depth, depth)
119 TRACE_FIELD_ZERO_CHAR(buf)
120 ),
121 TP_RAW_FMT("%08lx (%d) fmt:%p %s")
122);
123
124TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
125 TRACE_STRUCT(
126 TRACE_FIELD(unsigned int, line, line)
127 TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, func)
128 TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file)
129 TRACE_FIELD(char, correct, correct)
130 ),
131 TP_RAW_FMT("%u:%s:%s (%u)")
132);
133
134TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,
135 TRACE_STRUCT(
136 TRACE_FIELD(u64, from, from)
137 TRACE_FIELD(u64, to, to)
138 ),
139 TP_RAW_FMT("from: %llx to: %llx")
140);
141
142TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
143 TRACE_STRUCT(
144 TRACE_FIELD(ktime_t, state_data.stamp, stamp)
145 TRACE_FIELD(ktime_t, state_data.end, end)
146 TRACE_FIELD(int, state_data.type, type)
147 TRACE_FIELD(int, state_data.state, state)
148 ),
149 TP_RAW_FMT("%llx->%llx type:%u state:%u")
150);
151
152TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore,
153 TRACE_STRUCT(
154 TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
155 TRACE_FIELD(unsigned long, call_site, call_site)
156 TRACE_FIELD(const void *, ptr, ptr)
157 TRACE_FIELD(size_t, bytes_req, bytes_req)
158 TRACE_FIELD(size_t, bytes_alloc, bytes_alloc)
159 TRACE_FIELD(gfp_t, gfp_flags, gfp_flags)
160 TRACE_FIELD(int, node, node)
161 ),
162 TP_RAW_FMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu"
163 " flags:%x node:%d")
164);
165
166TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore,
167 TRACE_STRUCT(
168 TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
169 TRACE_FIELD(unsigned long, call_site, call_site)
170 TRACE_FIELD(const void *, ptr, ptr)
171 ),
172 TP_RAW_FMT("type:%u call_site:%lx ptr:%p")
173);
174
175#undef TRACE_SYSTEM
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
new file mode 100644
index 000000000000..c88227b3b9db
--- /dev/null
+++ b/kernel/trace/trace_events.c
@@ -0,0 +1,604 @@
1/*
2 * event tracer
3 *
4 * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
5 *
6 * - Added format output of fields of the trace point.
7 * This was based off of work by Tom Zanussi <tzanussi@gmail.com>.
8 *
9 */
10
11#include <linux/debugfs.h>
12#include <linux/uaccess.h>
13#include <linux/module.h>
14#include <linux/ctype.h>
15
16#include "trace_output.h"
17
18#define TRACE_SYSTEM "TRACE_SYSTEM"
19
20static DEFINE_MUTEX(event_mutex);
21
22#define events_for_each(event) \
23 for (event = __start_ftrace_events; \
24 (unsigned long)event < (unsigned long)__stop_ftrace_events; \
25 event++)
26
27static void ftrace_clear_events(void)
28{
29 struct ftrace_event_call *call = (void *)__start_ftrace_events;
30
31
32 while ((unsigned long)call < (unsigned long)__stop_ftrace_events) {
33
34 if (call->enabled) {
35 call->enabled = 0;
36 call->unregfunc();
37 }
38 call++;
39 }
40}
41
42static void ftrace_event_enable_disable(struct ftrace_event_call *call,
43 int enable)
44{
45
46 switch (enable) {
47 case 0:
48 if (call->enabled) {
49 call->enabled = 0;
50 call->unregfunc();
51 }
52 break;
53 case 1:
54 if (!call->enabled) {
55 call->enabled = 1;
56 call->regfunc();
57 }
58 break;
59 }
60}
61
62static int ftrace_set_clr_event(char *buf, int set)
63{
64 struct ftrace_event_call *call = __start_ftrace_events;
65 char *event = NULL, *sub = NULL, *match;
66 int ret = -EINVAL;
67
68 /*
69 * The buf format can be <subsystem>:<event-name>
70 * *:<event-name> means any event by that name.
71 * :<event-name> is the same.
72 *
73 * <subsystem>:* means all events in that subsystem
74 * <subsystem>: means the same.
75 *
76 * <name> (no ':') means all events in a subsystem with
77 * the name <name> or any event that matches <name>
78 */
79
80 match = strsep(&buf, ":");
81 if (buf) {
82 sub = match;
83 event = buf;
84 match = NULL;
85
86 if (!strlen(sub) || strcmp(sub, "*") == 0)
87 sub = NULL;
88 if (!strlen(event) || strcmp(event, "*") == 0)
89 event = NULL;
90 }
91
92 mutex_lock(&event_mutex);
93 events_for_each(call) {
94
95 if (!call->name || !call->regfunc)
96 continue;
97
98 if (match &&
99 strcmp(match, call->name) != 0 &&
100 strcmp(match, call->system) != 0)
101 continue;
102
103 if (sub && strcmp(sub, call->system) != 0)
104 continue;
105
106 if (event && strcmp(event, call->name) != 0)
107 continue;
108
109 ftrace_event_enable_disable(call, set);
110
111 ret = 0;
112 }
113 mutex_unlock(&event_mutex);
114
115 return ret;
116}
117
118/* 128 should be much more than enough */
119#define EVENT_BUF_SIZE 127
120
121static ssize_t
122ftrace_event_write(struct file *file, const char __user *ubuf,
123 size_t cnt, loff_t *ppos)
124{
125 size_t read = 0;
126 int i, set = 1;
127 ssize_t ret;
128 char *buf;
129 char ch;
130
131 if (!cnt || cnt < 0)
132 return 0;
133
134 ret = tracing_update_buffers();
135 if (ret < 0)
136 return ret;
137
138 ret = get_user(ch, ubuf++);
139 if (ret)
140 return ret;
141 read++;
142 cnt--;
143
144 /* skip white space */
145 while (cnt && isspace(ch)) {
146 ret = get_user(ch, ubuf++);
147 if (ret)
148 return ret;
149 read++;
150 cnt--;
151 }
152
153 /* Only white space found? */
154 if (isspace(ch)) {
155 file->f_pos += read;
156 ret = read;
157 return ret;
158 }
159
160 buf = kmalloc(EVENT_BUF_SIZE+1, GFP_KERNEL);
161 if (!buf)
162 return -ENOMEM;
163
164 if (cnt > EVENT_BUF_SIZE)
165 cnt = EVENT_BUF_SIZE;
166
167 i = 0;
168 while (cnt && !isspace(ch)) {
169 if (!i && ch == '!')
170 set = 0;
171 else
172 buf[i++] = ch;
173
174 ret = get_user(ch, ubuf++);
175 if (ret)
176 goto out_free;
177 read++;
178 cnt--;
179 }
180 buf[i] = 0;
181
182 file->f_pos += read;
183
184 ret = ftrace_set_clr_event(buf, set);
185 if (ret)
186 goto out_free;
187
188 ret = read;
189
190 out_free:
191 kfree(buf);
192
193 return ret;
194}
195
196static void *
197t_next(struct seq_file *m, void *v, loff_t *pos)
198{
199 struct ftrace_event_call *call = m->private;
200 struct ftrace_event_call *next = call;
201
202 (*pos)++;
203
204 for (;;) {
205 if ((unsigned long)call >= (unsigned long)__stop_ftrace_events)
206 return NULL;
207
208 /*
209 * The ftrace subsystem is for showing formats only.
210 * They can not be enabled or disabled via the event files.
211 */
212 if (call->regfunc)
213 break;
214
215 call++;
216 next = call;
217 }
218
219 m->private = ++next;
220
221 return call;
222}
223
224static void *t_start(struct seq_file *m, loff_t *pos)
225{
226 return t_next(m, NULL, pos);
227}
228
229static void *
230s_next(struct seq_file *m, void *v, loff_t *pos)
231{
232 struct ftrace_event_call *call = m->private;
233 struct ftrace_event_call *next;
234
235 (*pos)++;
236
237 retry:
238 if ((unsigned long)call >= (unsigned long)__stop_ftrace_events)
239 return NULL;
240
241 if (!call->enabled) {
242 call++;
243 goto retry;
244 }
245
246 next = call;
247 m->private = ++next;
248
249 return call;
250}
251
252static void *s_start(struct seq_file *m, loff_t *pos)
253{
254 return s_next(m, NULL, pos);
255}
256
257static int t_show(struct seq_file *m, void *v)
258{
259 struct ftrace_event_call *call = v;
260
261 if (strcmp(call->system, TRACE_SYSTEM) != 0)
262 seq_printf(m, "%s:", call->system);
263 seq_printf(m, "%s\n", call->name);
264
265 return 0;
266}
267
268static void t_stop(struct seq_file *m, void *p)
269{
270}
271
272static int
273ftrace_event_seq_open(struct inode *inode, struct file *file)
274{
275 int ret;
276 const struct seq_operations *seq_ops;
277
278 if ((file->f_mode & FMODE_WRITE) &&
279 !(file->f_flags & O_APPEND))
280 ftrace_clear_events();
281
282 seq_ops = inode->i_private;
283 ret = seq_open(file, seq_ops);
284 if (!ret) {
285 struct seq_file *m = file->private_data;
286
287 m->private = __start_ftrace_events;
288 }
289 return ret;
290}
291
292static ssize_t
293event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
294 loff_t *ppos)
295{
296 struct ftrace_event_call *call = filp->private_data;
297 char *buf;
298
299 if (call->enabled)
300 buf = "1\n";
301 else
302 buf = "0\n";
303
304 return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
305}
306
307static ssize_t
308event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
309 loff_t *ppos)
310{
311 struct ftrace_event_call *call = filp->private_data;
312 char buf[64];
313 unsigned long val;
314 int ret;
315
316 if (cnt >= sizeof(buf))
317 return -EINVAL;
318
319 if (copy_from_user(&buf, ubuf, cnt))
320 return -EFAULT;
321
322 buf[cnt] = 0;
323
324 ret = strict_strtoul(buf, 10, &val);
325 if (ret < 0)
326 return ret;
327
328 ret = tracing_update_buffers();
329 if (ret < 0)
330 return ret;
331
332 switch (val) {
333 case 0:
334 case 1:
335 mutex_lock(&event_mutex);
336 ftrace_event_enable_disable(call, val);
337 mutex_unlock(&event_mutex);
338 break;
339
340 default:
341 return -EINVAL;
342 }
343
344 *ppos += cnt;
345
346 return cnt;
347}
348
349#undef FIELD
350#define FIELD(type, name) \
351 #type, #name, offsetof(typeof(field), name), sizeof(field.name)
352
353static int trace_write_header(struct trace_seq *s)
354{
355 struct trace_entry field;
356
357 /* struct trace_entry */
358 return trace_seq_printf(s,
359 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
360 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
361 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
362 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
363 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
364 "\n",
365 FIELD(unsigned char, type),
366 FIELD(unsigned char, flags),
367 FIELD(unsigned char, preempt_count),
368 FIELD(int, pid),
369 FIELD(int, tgid));
370}
371
372static ssize_t
373event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
374 loff_t *ppos)
375{
376 struct ftrace_event_call *call = filp->private_data;
377 struct trace_seq *s;
378 char *buf;
379 int r;
380
381 if (*ppos)
382 return 0;
383
384 s = kmalloc(sizeof(*s), GFP_KERNEL);
385 if (!s)
386 return -ENOMEM;
387
388 trace_seq_init(s);
389
390 /* If any of the first writes fail, so will the show_format. */
391
392 trace_seq_printf(s, "name: %s\n", call->name);
393 trace_seq_printf(s, "ID: %d\n", call->id);
394 trace_seq_printf(s, "format:\n");
395 trace_write_header(s);
396
397 r = call->show_format(s);
398 if (!r) {
399 /*
400 * ug! The format output is bigger than a PAGE!!
401 */
402 buf = "FORMAT TOO BIG\n";
403 r = simple_read_from_buffer(ubuf, cnt, ppos,
404 buf, strlen(buf));
405 goto out;
406 }
407
408 r = simple_read_from_buffer(ubuf, cnt, ppos,
409 s->buffer, s->len);
410 out:
411 kfree(s);
412 return r;
413}
414
415static const struct seq_operations show_event_seq_ops = {
416 .start = t_start,
417 .next = t_next,
418 .show = t_show,
419 .stop = t_stop,
420};
421
422static const struct seq_operations show_set_event_seq_ops = {
423 .start = s_start,
424 .next = s_next,
425 .show = t_show,
426 .stop = t_stop,
427};
428
429static const struct file_operations ftrace_avail_fops = {
430 .open = ftrace_event_seq_open,
431 .read = seq_read,
432 .llseek = seq_lseek,
433 .release = seq_release,
434};
435
436static const struct file_operations ftrace_set_event_fops = {
437 .open = ftrace_event_seq_open,
438 .read = seq_read,
439 .write = ftrace_event_write,
440 .llseek = seq_lseek,
441 .release = seq_release,
442};
443
444static const struct file_operations ftrace_enable_fops = {
445 .open = tracing_open_generic,
446 .read = event_enable_read,
447 .write = event_enable_write,
448};
449
450static const struct file_operations ftrace_event_format_fops = {
451 .open = tracing_open_generic,
452 .read = event_format_read,
453};
454
455static struct dentry *event_trace_events_dir(void)
456{
457 static struct dentry *d_tracer;
458 static struct dentry *d_events;
459
460 if (d_events)
461 return d_events;
462
463 d_tracer = tracing_init_dentry();
464 if (!d_tracer)
465 return NULL;
466
467 d_events = debugfs_create_dir("events", d_tracer);
468 if (!d_events)
469 pr_warning("Could not create debugfs "
470 "'events' directory\n");
471
472 return d_events;
473}
474
475struct event_subsystem {
476 struct list_head list;
477 const char *name;
478 struct dentry *entry;
479};
480
481static LIST_HEAD(event_subsystems);
482
483static struct dentry *
484event_subsystem_dir(const char *name, struct dentry *d_events)
485{
486 struct event_subsystem *system;
487
488 /* First see if we did not already create this dir */
489 list_for_each_entry(system, &event_subsystems, list) {
490 if (strcmp(system->name, name) == 0)
491 return system->entry;
492 }
493
494 /* need to create new entry */
495 system = kmalloc(sizeof(*system), GFP_KERNEL);
496 if (!system) {
497 pr_warning("No memory to create event subsystem %s\n",
498 name);
499 return d_events;
500 }
501
502 system->entry = debugfs_create_dir(name, d_events);
503 if (!system->entry) {
504 pr_warning("Could not create event subsystem %s\n",
505 name);
506 kfree(system);
507 return d_events;
508 }
509
510 system->name = name;
511 list_add(&system->list, &event_subsystems);
512
513 return system->entry;
514}
515
516static int
517event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
518{
519 struct dentry *entry;
520 int ret;
521
522 /*
523 * If the trace point header did not define TRACE_SYSTEM
524 * then the system would be called "TRACE_SYSTEM".
525 */
526 if (strcmp(call->system, "TRACE_SYSTEM") != 0)
527 d_events = event_subsystem_dir(call->system, d_events);
528
529 if (call->raw_init) {
530 ret = call->raw_init();
531 if (ret < 0) {
532 pr_warning("Could not initialize trace point"
533 " events/%s\n", call->name);
534 return ret;
535 }
536 }
537
538 call->dir = debugfs_create_dir(call->name, d_events);
539 if (!call->dir) {
540 pr_warning("Could not create debugfs "
541 "'%s' directory\n", call->name);
542 return -1;
543 }
544
545 if (call->regfunc) {
546 entry = debugfs_create_file("enable", 0644, call->dir, call,
547 &ftrace_enable_fops);
548 if (!entry)
549 pr_warning("Could not create debugfs "
550 "'%s/enable' entry\n", call->name);
551 }
552
553 /* A trace may not want to export its format */
554 if (!call->show_format)
555 return 0;
556
557 entry = debugfs_create_file("format", 0444, call->dir, call,
558 &ftrace_event_format_fops);
559 if (!entry)
560 pr_warning("Could not create debugfs "
561 "'%s/format' entry\n", call->name);
562
563 return 0;
564}
565
566static __init int event_trace_init(void)
567{
568 struct ftrace_event_call *call = __start_ftrace_events;
569 struct dentry *d_tracer;
570 struct dentry *entry;
571 struct dentry *d_events;
572
573 d_tracer = tracing_init_dentry();
574 if (!d_tracer)
575 return 0;
576
577 entry = debugfs_create_file("available_events", 0444, d_tracer,
578 (void *)&show_event_seq_ops,
579 &ftrace_avail_fops);
580 if (!entry)
581 pr_warning("Could not create debugfs "
582 "'available_events' entry\n");
583
584 entry = debugfs_create_file("set_event", 0644, d_tracer,
585 (void *)&show_set_event_seq_ops,
586 &ftrace_set_event_fops);
587 if (!entry)
588 pr_warning("Could not create debugfs "
589 "'set_event' entry\n");
590
591 d_events = event_trace_events_dir();
592 if (!d_events)
593 return 0;
594
595 events_for_each(call) {
596 /* The linker may leave blanks */
597 if (!call->name)
598 continue;
599 event_create_dir(call, d_events);
600 }
601
602 return 0;
603}
604fs_initcall(event_trace_init);
diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h
new file mode 100644
index 000000000000..38985f9b379c
--- /dev/null
+++ b/kernel/trace/trace_events_stage_1.h
@@ -0,0 +1,39 @@
1/*
2 * Stage 1 of the trace events.
3 *
4 * Override the macros in <trace/trace_event_types.h> to include the following:
5 *
6 * struct ftrace_raw_<call> {
7 * struct trace_entry ent;
8 * <type> <item>;
9 * <type2> <item2>[<len>];
10 * [...]
11 * };
12 *
13 * The <type> <item> is created by the __field(type, item) macro or
14 * the __array(type2, item2, len) macro.
15 * We simply do "type item;", and that will create the fields
16 * in the structure.
17 */
18
19#undef TRACE_FORMAT
20#define TRACE_FORMAT(call, proto, args, fmt)
21
22#undef __array
23#define __array(type, item, len) type item[len];
24
25#undef __field
26#define __field(type, item) type item;
27
28#undef TP_STRUCT__entry
29#define TP_STRUCT__entry(args...) args
30
31#undef TRACE_EVENT
32#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
33 struct ftrace_raw_##name { \
34 struct trace_entry ent; \
35 tstruct \
36 }; \
37 static struct ftrace_event_call event_##name
38
39#include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
new file mode 100644
index 000000000000..5117c43f5c67
--- /dev/null
+++ b/kernel/trace/trace_events_stage_2.h
@@ -0,0 +1,131 @@
1/*
2 * Stage 2 of the trace events.
3 *
4 * Override the macros in <trace/trace_event_types.h> to include the following:
5 *
6 * enum print_line_t
7 * ftrace_raw_output_<call>(struct trace_iterator *iter, int flags)
8 * {
9 * struct trace_seq *s = &iter->seq;
10 * struct ftrace_raw_<call> *field; <-- defined in stage 1
11 * struct trace_entry *entry;
12 * int ret;
13 *
14 * entry = iter->ent;
15 *
16 * if (entry->type != event_<call>.id) {
17 * WARN_ON_ONCE(1);
18 * return TRACE_TYPE_UNHANDLED;
19 * }
20 *
21 * field = (typeof(field))entry;
22 *
23 * ret = trace_seq_printf(s, <TP_printk> "\n");
24 * if (!ret)
25 * return TRACE_TYPE_PARTIAL_LINE;
26 *
27 * return TRACE_TYPE_HANDLED;
28 * }
29 *
30 * This is the method used to print the raw event to the trace
31 * output format. Note, this is not needed if the data is read
32 * in binary.
33 */
34
35#undef __entry
36#define __entry field
37
38#undef TP_printk
39#define TP_printk(fmt, args...) fmt "\n", args
40
41#undef TRACE_EVENT
42#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \
43enum print_line_t \
44ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \
45{ \
46 struct trace_seq *s = &iter->seq; \
47 struct ftrace_raw_##call *field; \
48 struct trace_entry *entry; \
49 int ret; \
50 \
51 entry = iter->ent; \
52 \
53 if (entry->type != event_##call.id) { \
54 WARN_ON_ONCE(1); \
55 return TRACE_TYPE_UNHANDLED; \
56 } \
57 \
58 field = (typeof(field))entry; \
59 \
60 ret = trace_seq_printf(s, #call ": " print); \
61 if (!ret) \
62 return TRACE_TYPE_PARTIAL_LINE; \
63 \
64 return TRACE_TYPE_HANDLED; \
65}
66
67#include <trace/trace_event_types.h>
68
69/*
70 * Setup the showing format of trace point.
71 *
72 * int
73 * ftrace_format_##call(struct trace_seq *s)
74 * {
75 * struct ftrace_raw_##call field;
76 * int ret;
77 *
78 * ret = trace_seq_printf(s, #type " " #item ";"
79 * " offset:%u; size:%u;\n",
80 * offsetof(struct ftrace_raw_##call, item),
81 * sizeof(field.type));
82 *
83 * }
84 */
85
86#undef TP_STRUCT__entry
87#define TP_STRUCT__entry(args...) args
88
89#undef __field
90#define __field(type, item) \
91 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
92 "offset:%u;\tsize:%u;\n", \
93 (unsigned int)offsetof(typeof(field), item), \
94 (unsigned int)sizeof(field.item)); \
95 if (!ret) \
96 return 0;
97
98#undef __array
99#define __array(type, item, len) \
100 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
101 "offset:%u;\tsize:%u;\n", \
102 (unsigned int)offsetof(typeof(field), item), \
103 (unsigned int)sizeof(field.item)); \
104 if (!ret) \
105 return 0;
106
107#undef __entry
108#define __entry "REC"
109
110#undef TP_printk
111#define TP_printk(fmt, args...) "%s, %s\n", #fmt, #args
112
113#undef TP_fast_assign
114#define TP_fast_assign(args...) args
115
116#undef TRACE_EVENT
117#define TRACE_EVENT(call, proto, args, tstruct, func, print) \
118static int \
119ftrace_format_##call(struct trace_seq *s) \
120{ \
121 struct ftrace_raw_##call field; \
122 int ret; \
123 \
124 tstruct; \
125 \
126 trace_seq_printf(s, "\nprint fmt: " print); \
127 \
128 return ret; \
129}
130
131#include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
new file mode 100644
index 000000000000..ae2e323df0c7
--- /dev/null
+++ b/kernel/trace/trace_events_stage_3.h
@@ -0,0 +1,217 @@
1/*
2 * Stage 3 of the trace events.
3 *
4 * Override the macros in <trace/trace_event_types.h> to include the following:
5 *
6 * static void ftrace_event_<call>(proto)
7 * {
8 * event_trace_printk(_RET_IP_, "<call>: " <fmt>);
9 * }
10 *
11 * static int ftrace_reg_event_<call>(void)
12 * {
13 * int ret;
14 *
15 * ret = register_trace_<call>(ftrace_event_<call>);
16 * if (!ret)
17 * pr_info("event trace: Could not activate trace point "
18 * "probe to <call>");
19 * return ret;
20 * }
21 *
22 * static void ftrace_unreg_event_<call>(void)
23 * {
24 * unregister_trace_<call>(ftrace_event_<call>);
25 * }
26 *
27 * For those macros defined with TRACE_FORMAT:
28 *
29 * static struct ftrace_event_call __used
30 * __attribute__((__aligned__(4)))
31 * __attribute__((section("_ftrace_events"))) event_<call> = {
32 * .name = "<call>",
33 * .regfunc = ftrace_reg_event_<call>,
34 * .unregfunc = ftrace_unreg_event_<call>,
35 * }
36 *
37 *
38 * For those macros defined with TRACE_EVENT:
39 *
40 * static struct ftrace_event_call event_<call>;
41 *
42 * static void ftrace_raw_event_<call>(proto)
43 * {
44 * struct ring_buffer_event *event;
45 * struct ftrace_raw_<call> *entry; <-- defined in stage 1
46 * unsigned long irq_flags;
47 * int pc;
48 *
49 * local_save_flags(irq_flags);
50 * pc = preempt_count();
51 *
52 * event = trace_current_buffer_lock_reserve(event_<call>.id,
53 * sizeof(struct ftrace_raw_<call>),
54 * irq_flags, pc);
55 * if (!event)
56 * return;
57 * entry = ring_buffer_event_data(event);
58 *
59 * <assign>; <-- Here we assign the entries by the __field and
60 * __array macros.
61 *
62 * trace_current_buffer_unlock_commit(event, irq_flags, pc);
63 * }
64 *
65 * static int ftrace_raw_reg_event_<call>(void)
66 * {
67 * int ret;
68 *
69 * ret = register_trace_<call>(ftrace_raw_event_<call>);
70 * if (!ret)
71 * pr_info("event trace: Could not activate trace point "
72 * "probe to <call>");
73 * return ret;
74 * }
75 *
76 * static void ftrace_unreg_event_<call>(void)
77 * {
78 * unregister_trace_<call>(ftrace_raw_event_<call>);
79 * }
80 *
81 * static struct trace_event ftrace_event_type_<call> = {
82 * .trace = ftrace_raw_output_<call>, <-- stage 2
83 * };
84 *
85 * static int ftrace_raw_init_event_<call>(void)
86 * {
87 * int id;
88 *
89 * id = register_ftrace_event(&ftrace_event_type_<call>);
90 * if (!id)
91 * return -ENODEV;
92 * event_<call>.id = id;
93 * return 0;
94 * }
95 *
96 * static struct ftrace_event_call __used
97 * __attribute__((__aligned__(4)))
98 * __attribute__((section("_ftrace_events"))) event_<call> = {
99 * .name = "<call>",
100 * .system = "<system>",
101 * .raw_init = ftrace_raw_init_event_<call>,
102 * .regfunc = ftrace_reg_event_<call>,
103 * .unregfunc = ftrace_unreg_event_<call>,
104 * .show_format = ftrace_format_<call>,
105 * }
106 *
107 */
108
109#undef TP_FMT
110#define TP_FMT(fmt, args...) fmt "\n", ##args
111
112#define _TRACE_FORMAT(call, proto, args, fmt) \
113static void ftrace_event_##call(proto) \
114{ \
115 event_trace_printk(_RET_IP_, #call ": " fmt); \
116} \
117 \
118static int ftrace_reg_event_##call(void) \
119{ \
120 int ret; \
121 \
122 ret = register_trace_##call(ftrace_event_##call); \
123 if (ret) \
124 pr_info("event trace: Could not activate trace point " \
125 "probe to " #call "\n"); \
126 return ret; \
127} \
128 \
129static void ftrace_unreg_event_##call(void) \
130{ \
131 unregister_trace_##call(ftrace_event_##call); \
132} \
133
134
135#undef TRACE_FORMAT
136#define TRACE_FORMAT(call, proto, args, fmt) \
137_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \
138static struct ftrace_event_call __used \
139__attribute__((__aligned__(4))) \
140__attribute__((section("_ftrace_events"))) event_##call = { \
141 .name = #call, \
142 .system = __stringify(TRACE_SYSTEM), \
143 .regfunc = ftrace_reg_event_##call, \
144 .unregfunc = ftrace_unreg_event_##call, \
145}
146
147#undef __entry
148#define __entry entry
149
150#undef TRACE_EVENT
151#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \
152 \
153static struct ftrace_event_call event_##call; \
154 \
155static void ftrace_raw_event_##call(proto) \
156{ \
157 struct ring_buffer_event *event; \
158 struct ftrace_raw_##call *entry; \
159 unsigned long irq_flags; \
160 int pc; \
161 \
162 local_save_flags(irq_flags); \
163 pc = preempt_count(); \
164 \
165 event = trace_current_buffer_lock_reserve(event_##call.id, \
166 sizeof(struct ftrace_raw_##call), \
167 irq_flags, pc); \
168 if (!event) \
169 return; \
170 entry = ring_buffer_event_data(event); \
171 \
172 assign; \
173 \
174 trace_current_buffer_unlock_commit(event, irq_flags, pc); \
175} \
176 \
177static int ftrace_raw_reg_event_##call(void) \
178{ \
179 int ret; \
180 \
181 ret = register_trace_##call(ftrace_raw_event_##call); \
182 if (ret) \
183 pr_info("event trace: Could not activate trace point " \
184 "probe to " #call "\n"); \
185 return ret; \
186} \
187 \
188static void ftrace_raw_unreg_event_##call(void) \
189{ \
190 unregister_trace_##call(ftrace_raw_event_##call); \
191} \
192 \
193static struct trace_event ftrace_event_type_##call = { \
194 .trace = ftrace_raw_output_##call, \
195}; \
196 \
197static int ftrace_raw_init_event_##call(void) \
198{ \
199 int id; \
200 \
201 id = register_ftrace_event(&ftrace_event_type_##call); \
202 if (!id) \
203 return -ENODEV; \
204 event_##call.id = id; \
205 return 0; \
206} \
207 \
208static struct ftrace_event_call __used \
209__attribute__((__aligned__(4))) \
210__attribute__((section("_ftrace_events"))) event_##call = { \
211 .name = #call, \
212 .system = __stringify(TRACE_SYSTEM), \
213 .raw_init = ftrace_raw_init_event_##call, \
214 .regfunc = ftrace_raw_reg_event_##call, \
215 .unregfunc = ftrace_raw_unreg_event_##call, \
216 .show_format = ftrace_format_##call, \
217}
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
new file mode 100644
index 000000000000..4d9952d3df50
--- /dev/null
+++ b/kernel/trace/trace_export.c
@@ -0,0 +1,102 @@
1/*
2 * trace_export.c - export basic ftrace utilities to user space
3 *
4 * Copyright (C) 2009 Steven Rostedt <srostedt@redhat.com>
5 */
6#include <linux/stringify.h>
7#include <linux/kallsyms.h>
8#include <linux/seq_file.h>
9#include <linux/debugfs.h>
10#include <linux/uaccess.h>
11#include <linux/ftrace.h>
12#include <linux/module.h>
13#include <linux/init.h>
14#include <linux/fs.h>
15
16#include "trace_output.h"
17
18
19#undef TRACE_STRUCT
20#define TRACE_STRUCT(args...) args
21
22#undef TRACE_FIELD
23#define TRACE_FIELD(type, item, assign) \
24 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
25 "offset:%u;\tsize:%u;\n", \
26 (unsigned int)offsetof(typeof(field), item), \
27 (unsigned int)sizeof(field.item)); \
28 if (!ret) \
29 return 0;
30
31
32#undef TRACE_FIELD_SPECIAL
33#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \
34 ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \
35 "offset:%u;\tsize:%u;\n", \
36 (unsigned int)offsetof(typeof(field), item), \
37 (unsigned int)sizeof(field.item)); \
38 if (!ret) \
39 return 0;
40
41#undef TRACE_FIELD_ZERO_CHAR
42#define TRACE_FIELD_ZERO_CHAR(item) \
43 ret = trace_seq_printf(s, "\tfield: char " #item ";\t" \
44 "offset:%u;\tsize:0;\n", \
45 (unsigned int)offsetof(typeof(field), item)); \
46 if (!ret) \
47 return 0;
48
49
50#undef TP_RAW_FMT
51#define TP_RAW_FMT(args...) args
52
53#undef TRACE_EVENT_FORMAT
54#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
55static int \
56ftrace_format_##call(struct trace_seq *s) \
57{ \
58 struct args field; \
59 int ret; \
60 \
61 tstruct; \
62 \
63 trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \
64 \
65 return ret; \
66}
67
68#include "trace_event_types.h"
69
70#undef TRACE_ZERO_CHAR
71#define TRACE_ZERO_CHAR(arg)
72
73#undef TRACE_FIELD
74#define TRACE_FIELD(type, item, assign)\
75 entry->item = assign;
76
77#undef TRACE_FIELD
78#define TRACE_FIELD(type, item, assign)\
79 entry->item = assign;
80
81#undef TP_CMD
82#define TP_CMD(cmd...) cmd
83
84#undef TRACE_ENTRY
85#define TRACE_ENTRY entry
86
87#undef TRACE_FIELD_SPECIAL
88#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \
89 cmd;
90
91#undef TRACE_EVENT_FORMAT
92#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
93 \
94static struct ftrace_event_call __used \
95__attribute__((__aligned__(4))) \
96__attribute__((section("_ftrace_events"))) event_##call = { \
97 .name = #call, \
98 .id = proto, \
99 .system = __stringify(TRACE_SYSTEM), \
100 .show_format = ftrace_format_##call, \
101}
102#include "trace_event_types.h"
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 9236d7e25a16..c9a0b7df44ff 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -9,6 +9,7 @@
9 * Copyright (C) 2004-2006 Ingo Molnar 9 * Copyright (C) 2004-2006 Ingo Molnar
10 * Copyright (C) 2004 William Lee Irwin III 10 * Copyright (C) 2004 William Lee Irwin III
11 */ 11 */
12#include <linux/ring_buffer.h>
12#include <linux/debugfs.h> 13#include <linux/debugfs.h>
13#include <linux/uaccess.h> 14#include <linux/uaccess.h>
14#include <linux/ftrace.h> 15#include <linux/ftrace.h>
@@ -16,52 +17,388 @@
16 17
17#include "trace.h" 18#include "trace.h"
18 19
19static void start_function_trace(struct trace_array *tr) 20/* function tracing enabled */
21static int ftrace_function_enabled;
22
23static struct trace_array *func_trace;
24
25static void tracing_start_function_trace(void);
26static void tracing_stop_function_trace(void);
27
28static int function_trace_init(struct trace_array *tr)
20{ 29{
30 func_trace = tr;
21 tr->cpu = get_cpu(); 31 tr->cpu = get_cpu();
22 tracing_reset_online_cpus(tr);
23 put_cpu(); 32 put_cpu();
24 33
25 tracing_start_cmdline_record(); 34 tracing_start_cmdline_record();
26 tracing_start_function_trace(); 35 tracing_start_function_trace();
36 return 0;
27} 37}
28 38
29static void stop_function_trace(struct trace_array *tr) 39static void function_trace_reset(struct trace_array *tr)
30{ 40{
31 tracing_stop_function_trace(); 41 tracing_stop_function_trace();
32 tracing_stop_cmdline_record(); 42 tracing_stop_cmdline_record();
33} 43}
34 44
35static int function_trace_init(struct trace_array *tr) 45static void function_trace_start(struct trace_array *tr)
36{ 46{
37 start_function_trace(tr); 47 tracing_reset_online_cpus(tr);
38 return 0;
39} 48}
40 49
41static void function_trace_reset(struct trace_array *tr) 50static void
51function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
52{
53 struct trace_array *tr = func_trace;
54 struct trace_array_cpu *data;
55 unsigned long flags;
56 long disabled;
57 int cpu, resched;
58 int pc;
59
60 if (unlikely(!ftrace_function_enabled))
61 return;
62
63 pc = preempt_count();
64 resched = ftrace_preempt_disable();
65 local_save_flags(flags);
66 cpu = raw_smp_processor_id();
67 data = tr->data[cpu];
68 disabled = atomic_inc_return(&data->disabled);
69
70 if (likely(disabled == 1))
71 trace_function(tr, ip, parent_ip, flags, pc);
72
73 atomic_dec(&data->disabled);
74 ftrace_preempt_enable(resched);
75}
76
77static void
78function_trace_call(unsigned long ip, unsigned long parent_ip)
42{ 79{
43 stop_function_trace(tr); 80 struct trace_array *tr = func_trace;
81 struct trace_array_cpu *data;
82 unsigned long flags;
83 long disabled;
84 int cpu;
85 int pc;
86
87 if (unlikely(!ftrace_function_enabled))
88 return;
89
90 /*
91 * Need to use raw, since this must be called before the
92 * recursive protection is performed.
93 */
94 local_irq_save(flags);
95 cpu = raw_smp_processor_id();
96 data = tr->data[cpu];
97 disabled = atomic_inc_return(&data->disabled);
98
99 if (likely(disabled == 1)) {
100 pc = preempt_count();
101 trace_function(tr, ip, parent_ip, flags, pc);
102 }
103
104 atomic_dec(&data->disabled);
105 local_irq_restore(flags);
44} 106}
45 107
46static void function_trace_start(struct trace_array *tr) 108static void
109function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
47{ 110{
48 tracing_reset_online_cpus(tr); 111 struct trace_array *tr = func_trace;
112 struct trace_array_cpu *data;
113 unsigned long flags;
114 long disabled;
115 int cpu;
116 int pc;
117
118 if (unlikely(!ftrace_function_enabled))
119 return;
120
121 /*
122 * Need to use raw, since this must be called before the
123 * recursive protection is performed.
124 */
125 local_irq_save(flags);
126 cpu = raw_smp_processor_id();
127 data = tr->data[cpu];
128 disabled = atomic_inc_return(&data->disabled);
129
130 if (likely(disabled == 1)) {
131 pc = preempt_count();
132 trace_function(tr, ip, parent_ip, flags, pc);
133 /*
134 * skip over 5 funcs:
135 * __ftrace_trace_stack,
136 * __trace_stack,
137 * function_stack_trace_call
138 * ftrace_list_func
139 * ftrace_call
140 */
141 __trace_stack(tr, flags, 5, pc);
142 }
143
144 atomic_dec(&data->disabled);
145 local_irq_restore(flags);
146}
147
148
149static struct ftrace_ops trace_ops __read_mostly =
150{
151 .func = function_trace_call,
152};
153
154static struct ftrace_ops trace_stack_ops __read_mostly =
155{
156 .func = function_stack_trace_call,
157};
158
159/* Our two options */
160enum {
161 TRACE_FUNC_OPT_STACK = 0x1,
162};
163
164static struct tracer_opt func_opts[] = {
165#ifdef CONFIG_STACKTRACE
166 { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
167#endif
168 { } /* Always set a last empty entry */
169};
170
171static struct tracer_flags func_flags = {
172 .val = 0, /* By default: all flags disabled */
173 .opts = func_opts
174};
175
176static void tracing_start_function_trace(void)
177{
178 ftrace_function_enabled = 0;
179
180 if (trace_flags & TRACE_ITER_PREEMPTONLY)
181 trace_ops.func = function_trace_call_preempt_only;
182 else
183 trace_ops.func = function_trace_call;
184
185 if (func_flags.val & TRACE_FUNC_OPT_STACK)
186 register_ftrace_function(&trace_stack_ops);
187 else
188 register_ftrace_function(&trace_ops);
189
190 ftrace_function_enabled = 1;
191}
192
193static void tracing_stop_function_trace(void)
194{
195 ftrace_function_enabled = 0;
196 /* OK if they are not registered */
197 unregister_ftrace_function(&trace_stack_ops);
198 unregister_ftrace_function(&trace_ops);
199}
200
201static int func_set_flag(u32 old_flags, u32 bit, int set)
202{
203 if (bit == TRACE_FUNC_OPT_STACK) {
204 /* do nothing if already set */
205 if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK))
206 return 0;
207
208 if (set) {
209 unregister_ftrace_function(&trace_ops);
210 register_ftrace_function(&trace_stack_ops);
211 } else {
212 unregister_ftrace_function(&trace_stack_ops);
213 register_ftrace_function(&trace_ops);
214 }
215
216 return 0;
217 }
218
219 return -EINVAL;
49} 220}
50 221
51static struct tracer function_trace __read_mostly = 222static struct tracer function_trace __read_mostly =
52{ 223{
53 .name = "function", 224 .name = "function",
54 .init = function_trace_init, 225 .init = function_trace_init,
55 .reset = function_trace_reset, 226 .reset = function_trace_reset,
56 .start = function_trace_start, 227 .start = function_trace_start,
228 .wait_pipe = poll_wait_pipe,
229 .flags = &func_flags,
230 .set_flag = func_set_flag,
57#ifdef CONFIG_FTRACE_SELFTEST 231#ifdef CONFIG_FTRACE_SELFTEST
58 .selftest = trace_selftest_startup_function, 232 .selftest = trace_selftest_startup_function,
59#endif 233#endif
60}; 234};
61 235
236#ifdef CONFIG_DYNAMIC_FTRACE
237static void
238ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data)
239{
240 long *count = (long *)data;
241
242 if (tracing_is_on())
243 return;
244
245 if (!*count)
246 return;
247
248 if (*count != -1)
249 (*count)--;
250
251 tracing_on();
252}
253
254static void
255ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data)
256{
257 long *count = (long *)data;
258
259 if (!tracing_is_on())
260 return;
261
262 if (!*count)
263 return;
264
265 if (*count != -1)
266 (*count)--;
267
268 tracing_off();
269}
270
271static int
272ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
273 struct ftrace_probe_ops *ops, void *data);
274
275static struct ftrace_probe_ops traceon_probe_ops = {
276 .func = ftrace_traceon,
277 .print = ftrace_trace_onoff_print,
278};
279
280static struct ftrace_probe_ops traceoff_probe_ops = {
281 .func = ftrace_traceoff,
282 .print = ftrace_trace_onoff_print,
283};
284
285static int
286ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
287 struct ftrace_probe_ops *ops, void *data)
288{
289 char str[KSYM_SYMBOL_LEN];
290 long count = (long)data;
291
292 kallsyms_lookup(ip, NULL, NULL, NULL, str);
293 seq_printf(m, "%s:", str);
294
295 if (ops == &traceon_probe_ops)
296 seq_printf(m, "traceon");
297 else
298 seq_printf(m, "traceoff");
299
300 if (count == -1)
301 seq_printf(m, ":unlimited\n");
302 else
303 seq_printf(m, ":count=%ld", count);
304 seq_putc(m, '\n');
305
306 return 0;
307}
308
309static int
310ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param)
311{
312 struct ftrace_probe_ops *ops;
313
314 /* we register both traceon and traceoff to this callback */
315 if (strcmp(cmd, "traceon") == 0)
316 ops = &traceon_probe_ops;
317 else
318 ops = &traceoff_probe_ops;
319
320 unregister_ftrace_function_probe_func(glob, ops);
321
322 return 0;
323}
324
325static int
326ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable)
327{
328 struct ftrace_probe_ops *ops;
329 void *count = (void *)-1;
330 char *number;
331 int ret;
332
333 /* hash funcs only work with set_ftrace_filter */
334 if (!enable)
335 return -EINVAL;
336
337 if (glob[0] == '!')
338 return ftrace_trace_onoff_unreg(glob+1, cmd, param);
339
340 /* we register both traceon and traceoff to this callback */
341 if (strcmp(cmd, "traceon") == 0)
342 ops = &traceon_probe_ops;
343 else
344 ops = &traceoff_probe_ops;
345
346 if (!param)
347 goto out_reg;
348
349 number = strsep(&param, ":");
350
351 if (!strlen(number))
352 goto out_reg;
353
354 /*
355 * We use the callback data field (which is a pointer)
356 * as our counter.
357 */
358 ret = strict_strtoul(number, 0, (unsigned long *)&count);
359 if (ret)
360 return ret;
361
362 out_reg:
363 ret = register_ftrace_function_probe(glob, ops, count);
364
365 return ret;
366}
367
368static struct ftrace_func_command ftrace_traceon_cmd = {
369 .name = "traceon",
370 .func = ftrace_trace_onoff_callback,
371};
372
373static struct ftrace_func_command ftrace_traceoff_cmd = {
374 .name = "traceoff",
375 .func = ftrace_trace_onoff_callback,
376};
377
378static int __init init_func_cmd_traceon(void)
379{
380 int ret;
381
382 ret = register_ftrace_command(&ftrace_traceoff_cmd);
383 if (ret)
384 return ret;
385
386 ret = register_ftrace_command(&ftrace_traceon_cmd);
387 if (ret)
388 unregister_ftrace_command(&ftrace_traceoff_cmd);
389 return ret;
390}
391#else
392static inline int init_func_cmd_traceon(void)
393{
394 return 0;
395}
396#endif /* CONFIG_DYNAMIC_FTRACE */
397
62static __init int init_function_trace(void) 398static __init int init_function_trace(void)
63{ 399{
400 init_func_cmd_traceon();
64 return register_tracer(&function_trace); 401 return register_tracer(&function_trace);
65} 402}
66
67device_initcall(init_function_trace); 403device_initcall(init_function_trace);
404
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 930c08e5b38e..6004ccac2dd7 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * 2 *
3 * Function graph tracer. 3 * Function graph tracer.
4 * Copyright (c) 2008 Frederic Weisbecker <fweisbec@gmail.com> 4 * Copyright (c) 2008-2009 Frederic Weisbecker <fweisbec@gmail.com>
5 * Mostly borrowed from function tracer which 5 * Mostly borrowed from function tracer which
6 * is Copyright (c) Steven Rostedt <srostedt@redhat.com> 6 * is Copyright (c) Steven Rostedt <srostedt@redhat.com>
7 * 7 *
@@ -12,6 +12,7 @@
12#include <linux/fs.h> 12#include <linux/fs.h>
13 13
14#include "trace.h" 14#include "trace.h"
15#include "trace_output.h"
15 16
16#define TRACE_GRAPH_INDENT 2 17#define TRACE_GRAPH_INDENT 2
17 18
@@ -20,9 +21,11 @@
20#define TRACE_GRAPH_PRINT_CPU 0x2 21#define TRACE_GRAPH_PRINT_CPU 0x2
21#define TRACE_GRAPH_PRINT_OVERHEAD 0x4 22#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
22#define TRACE_GRAPH_PRINT_PROC 0x8 23#define TRACE_GRAPH_PRINT_PROC 0x8
24#define TRACE_GRAPH_PRINT_DURATION 0x10
25#define TRACE_GRAPH_PRINT_ABS_TIME 0X20
23 26
24static struct tracer_opt trace_opts[] = { 27static struct tracer_opt trace_opts[] = {
25 /* Display overruns ? */ 28 /* Display overruns? (for self-debug purpose) */
26 { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) }, 29 { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) },
27 /* Display CPU ? */ 30 /* Display CPU ? */
28 { TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) }, 31 { TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) },
@@ -30,26 +33,101 @@ static struct tracer_opt trace_opts[] = {
30 { TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) }, 33 { TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) },
31 /* Display proc name/pid */ 34 /* Display proc name/pid */
32 { TRACER_OPT(funcgraph-proc, TRACE_GRAPH_PRINT_PROC) }, 35 { TRACER_OPT(funcgraph-proc, TRACE_GRAPH_PRINT_PROC) },
36 /* Display duration of execution */
37 { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) },
38 /* Display absolute time of an entry */
39 { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) },
33 { } /* Empty entry */ 40 { } /* Empty entry */
34}; 41};
35 42
36static struct tracer_flags tracer_flags = { 43static struct tracer_flags tracer_flags = {
37 /* Don't display overruns and proc by default */ 44 /* Don't display overruns and proc by default */
38 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD, 45 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
46 TRACE_GRAPH_PRINT_DURATION,
39 .opts = trace_opts 47 .opts = trace_opts
40}; 48};
41 49
42/* pid on the last trace processed */ 50/* pid on the last trace processed */
43static pid_t last_pid[NR_CPUS] = { [0 ... NR_CPUS-1] = -1 };
44 51
45static int graph_trace_init(struct trace_array *tr) 52
53/* Add a function return address to the trace stack on thread info.*/
54int
55ftrace_push_return_trace(unsigned long ret, unsigned long long time,
56 unsigned long func, int *depth)
46{ 57{
47 int cpu, ret; 58 int index;
48 59
49 for_each_online_cpu(cpu) 60 if (!current->ret_stack)
50 tracing_reset(tr, cpu); 61 return -EBUSY;
62
63 /* The return trace stack is full */
64 if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
65 atomic_inc(&current->trace_overrun);
66 return -EBUSY;
67 }
68
69 index = ++current->curr_ret_stack;
70 barrier();
71 current->ret_stack[index].ret = ret;
72 current->ret_stack[index].func = func;
73 current->ret_stack[index].calltime = time;
74 *depth = index;
75
76 return 0;
77}
78
79/* Retrieve a function return address to the trace stack on thread info.*/
80void
81ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
82{
83 int index;
84
85 index = current->curr_ret_stack;
86
87 if (unlikely(index < 0)) {
88 ftrace_graph_stop();
89 WARN_ON(1);
90 /* Might as well panic, otherwise we have no where to go */
91 *ret = (unsigned long)panic;
92 return;
93 }
51 94
52 ret = register_ftrace_graph(&trace_graph_return, 95 *ret = current->ret_stack[index].ret;
96 trace->func = current->ret_stack[index].func;
97 trace->calltime = current->ret_stack[index].calltime;
98 trace->overrun = atomic_read(&current->trace_overrun);
99 trace->depth = index;
100 barrier();
101 current->curr_ret_stack--;
102
103}
104
105/*
106 * Send the trace to the ring-buffer.
107 * @return the original return address.
108 */
109unsigned long ftrace_return_to_handler(void)
110{
111 struct ftrace_graph_ret trace;
112 unsigned long ret;
113
114 ftrace_pop_return_trace(&trace, &ret);
115 trace.rettime = trace_clock_local();
116 ftrace_graph_return(&trace);
117
118 if (unlikely(!ret)) {
119 ftrace_graph_stop();
120 WARN_ON(1);
121 /* Might as well panic. What else to do? */
122 ret = (unsigned long)panic;
123 }
124
125 return ret;
126}
127
128static int graph_trace_init(struct trace_array *tr)
129{
130 int ret = register_ftrace_graph(&trace_graph_return,
53 &trace_graph_entry); 131 &trace_graph_entry);
54 if (ret) 132 if (ret)
55 return ret; 133 return ret;
@@ -112,15 +190,15 @@ print_graph_cpu(struct trace_seq *s, int cpu)
112static enum print_line_t 190static enum print_line_t
113print_graph_proc(struct trace_seq *s, pid_t pid) 191print_graph_proc(struct trace_seq *s, pid_t pid)
114{ 192{
115 int i; 193 char comm[TASK_COMM_LEN];
116 int ret;
117 int len;
118 char comm[8];
119 int spaces = 0;
120 /* sign + log10(MAX_INT) + '\0' */ 194 /* sign + log10(MAX_INT) + '\0' */
121 char pid_str[11]; 195 char pid_str[11];
196 int spaces = 0;
197 int ret;
198 int len;
199 int i;
122 200
123 strncpy(comm, trace_find_cmdline(pid), 7); 201 trace_find_cmdline(pid, comm);
124 comm[7] = '\0'; 202 comm[7] = '\0';
125 sprintf(pid_str, "%d", pid); 203 sprintf(pid_str, "%d", pid);
126 204
@@ -153,17 +231,25 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
153 231
154/* If the pid changed since the last trace, output this event */ 232/* If the pid changed since the last trace, output this event */
155static enum print_line_t 233static enum print_line_t
156verif_pid(struct trace_seq *s, pid_t pid, int cpu) 234verif_pid(struct trace_seq *s, pid_t pid, int cpu, pid_t *last_pids_cpu)
157{ 235{
158 pid_t prev_pid; 236 pid_t prev_pid;
237 pid_t *last_pid;
159 int ret; 238 int ret;
160 239
161 if (last_pid[cpu] != -1 && last_pid[cpu] == pid) 240 if (!last_pids_cpu)
241 return TRACE_TYPE_HANDLED;
242
243 last_pid = per_cpu_ptr(last_pids_cpu, cpu);
244
245 if (*last_pid == pid)
162 return TRACE_TYPE_HANDLED; 246 return TRACE_TYPE_HANDLED;
163 247
164 prev_pid = last_pid[cpu]; 248 prev_pid = *last_pid;
165 last_pid[cpu] = pid; 249 *last_pid = pid;
166 250
251 if (prev_pid == -1)
252 return TRACE_TYPE_HANDLED;
167/* 253/*
168 * Context-switch trace line: 254 * Context-switch trace line:
169 255
@@ -175,34 +261,34 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu)
175 ret = trace_seq_printf(s, 261 ret = trace_seq_printf(s,
176 " ------------------------------------------\n"); 262 " ------------------------------------------\n");
177 if (!ret) 263 if (!ret)
178 TRACE_TYPE_PARTIAL_LINE; 264 return TRACE_TYPE_PARTIAL_LINE;
179 265
180 ret = print_graph_cpu(s, cpu); 266 ret = print_graph_cpu(s, cpu);
181 if (ret == TRACE_TYPE_PARTIAL_LINE) 267 if (ret == TRACE_TYPE_PARTIAL_LINE)
182 TRACE_TYPE_PARTIAL_LINE; 268 return TRACE_TYPE_PARTIAL_LINE;
183 269
184 ret = print_graph_proc(s, prev_pid); 270 ret = print_graph_proc(s, prev_pid);
185 if (ret == TRACE_TYPE_PARTIAL_LINE) 271 if (ret == TRACE_TYPE_PARTIAL_LINE)
186 TRACE_TYPE_PARTIAL_LINE; 272 return TRACE_TYPE_PARTIAL_LINE;
187 273
188 ret = trace_seq_printf(s, " => "); 274 ret = trace_seq_printf(s, " => ");
189 if (!ret) 275 if (!ret)
190 TRACE_TYPE_PARTIAL_LINE; 276 return TRACE_TYPE_PARTIAL_LINE;
191 277
192 ret = print_graph_proc(s, pid); 278 ret = print_graph_proc(s, pid);
193 if (ret == TRACE_TYPE_PARTIAL_LINE) 279 if (ret == TRACE_TYPE_PARTIAL_LINE)
194 TRACE_TYPE_PARTIAL_LINE; 280 return TRACE_TYPE_PARTIAL_LINE;
195 281
196 ret = trace_seq_printf(s, 282 ret = trace_seq_printf(s,
197 "\n ------------------------------------------\n\n"); 283 "\n ------------------------------------------\n\n");
198 if (!ret) 284 if (!ret)
199 TRACE_TYPE_PARTIAL_LINE; 285 return TRACE_TYPE_PARTIAL_LINE;
200 286
201 return ret; 287 return TRACE_TYPE_HANDLED;
202} 288}
203 289
204static bool 290static struct ftrace_graph_ret_entry *
205trace_branch_is_leaf(struct trace_iterator *iter, 291get_return_for_leaf(struct trace_iterator *iter,
206 struct ftrace_graph_ent_entry *curr) 292 struct ftrace_graph_ent_entry *curr)
207{ 293{
208 struct ring_buffer_iter *ring_iter; 294 struct ring_buffer_iter *ring_iter;
@@ -211,65 +297,123 @@ trace_branch_is_leaf(struct trace_iterator *iter,
211 297
212 ring_iter = iter->buffer_iter[iter->cpu]; 298 ring_iter = iter->buffer_iter[iter->cpu];
213 299
214 if (!ring_iter) 300 /* First peek to compare current entry and the next one */
215 return false; 301 if (ring_iter)
216 302 event = ring_buffer_iter_peek(ring_iter, NULL);
217 event = ring_buffer_iter_peek(ring_iter, NULL); 303 else {
304 /* We need to consume the current entry to see the next one */
305 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
306 event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
307 NULL);
308 }
218 309
219 if (!event) 310 if (!event)
220 return false; 311 return NULL;
221 312
222 next = ring_buffer_event_data(event); 313 next = ring_buffer_event_data(event);
223 314
224 if (next->ent.type != TRACE_GRAPH_RET) 315 if (next->ent.type != TRACE_GRAPH_RET)
225 return false; 316 return NULL;
226 317
227 if (curr->ent.pid != next->ent.pid || 318 if (curr->ent.pid != next->ent.pid ||
228 curr->graph_ent.func != next->ret.func) 319 curr->graph_ent.func != next->ret.func)
229 return false; 320 return NULL;
321
322 /* this is a leaf, now advance the iterator */
323 if (ring_iter)
324 ring_buffer_read(ring_iter, NULL);
230 325
231 return true; 326 return next;
327}
328
329/* Signal a overhead of time execution to the output */
330static int
331print_graph_overhead(unsigned long long duration, struct trace_seq *s)
332{
333 /* If duration disappear, we don't need anything */
334 if (!(tracer_flags.val & TRACE_GRAPH_PRINT_DURATION))
335 return 1;
336
337 /* Non nested entry or return */
338 if (duration == -1)
339 return trace_seq_printf(s, " ");
340
341 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
342 /* Duration exceeded 100 msecs */
343 if (duration > 100000ULL)
344 return trace_seq_printf(s, "! ");
345
346 /* Duration exceeded 10 msecs */
347 if (duration > 10000ULL)
348 return trace_seq_printf(s, "+ ");
349 }
350
351 return trace_seq_printf(s, " ");
352}
353
354static int print_graph_abs_time(u64 t, struct trace_seq *s)
355{
356 unsigned long usecs_rem;
357
358 usecs_rem = do_div(t, NSEC_PER_SEC);
359 usecs_rem /= 1000;
360
361 return trace_seq_printf(s, "%5lu.%06lu | ",
362 (unsigned long)t, usecs_rem);
232} 363}
233 364
234static enum print_line_t 365static enum print_line_t
235print_graph_irq(struct trace_seq *s, unsigned long addr, 366print_graph_irq(struct trace_iterator *iter, unsigned long addr,
236 enum trace_type type, int cpu, pid_t pid) 367 enum trace_type type, int cpu, pid_t pid)
237{ 368{
238 int ret; 369 int ret;
370 struct trace_seq *s = &iter->seq;
239 371
240 if (addr < (unsigned long)__irqentry_text_start || 372 if (addr < (unsigned long)__irqentry_text_start ||
241 addr >= (unsigned long)__irqentry_text_end) 373 addr >= (unsigned long)__irqentry_text_end)
242 return TRACE_TYPE_UNHANDLED; 374 return TRACE_TYPE_UNHANDLED;
243 375
244 if (type == TRACE_GRAPH_ENT) { 376 /* Absolute time */
245 ret = trace_seq_printf(s, "==========> | "); 377 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
246 } else { 378 ret = print_graph_abs_time(iter->ts, s);
247 /* Cpu */ 379 if (!ret)
248 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { 380 return TRACE_TYPE_PARTIAL_LINE;
249 ret = print_graph_cpu(s, cpu); 381 }
250 if (ret == TRACE_TYPE_PARTIAL_LINE)
251 return TRACE_TYPE_PARTIAL_LINE;
252 }
253 /* Proc */
254 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
255 ret = print_graph_proc(s, pid);
256 if (ret == TRACE_TYPE_PARTIAL_LINE)
257 return TRACE_TYPE_PARTIAL_LINE;
258 382
259 ret = trace_seq_printf(s, " | "); 383 /* Cpu */
260 if (!ret) 384 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
261 return TRACE_TYPE_PARTIAL_LINE; 385 ret = print_graph_cpu(s, cpu);
262 } 386 if (ret == TRACE_TYPE_PARTIAL_LINE)
387 return TRACE_TYPE_PARTIAL_LINE;
388 }
389 /* Proc */
390 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
391 ret = print_graph_proc(s, pid);
392 if (ret == TRACE_TYPE_PARTIAL_LINE)
393 return TRACE_TYPE_PARTIAL_LINE;
394 ret = trace_seq_printf(s, " | ");
395 if (!ret)
396 return TRACE_TYPE_PARTIAL_LINE;
397 }
263 398
264 /* No overhead */ 399 /* No overhead */
265 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { 400 ret = print_graph_overhead(-1, s);
266 ret = trace_seq_printf(s, " "); 401 if (!ret)
267 if (!ret) 402 return TRACE_TYPE_PARTIAL_LINE;
268 return TRACE_TYPE_PARTIAL_LINE; 403
269 } 404 if (type == TRACE_GRAPH_ENT)
405 ret = trace_seq_printf(s, "==========>");
406 else
407 ret = trace_seq_printf(s, "<==========");
408
409 if (!ret)
410 return TRACE_TYPE_PARTIAL_LINE;
411
412 /* Don't close the duration column if haven't one */
413 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
414 trace_seq_printf(s, " |");
415 ret = trace_seq_printf(s, "\n");
270 416
271 ret = trace_seq_printf(s, "<========== |\n");
272 }
273 if (!ret) 417 if (!ret)
274 return TRACE_TYPE_PARTIAL_LINE; 418 return TRACE_TYPE_PARTIAL_LINE;
275 return TRACE_TYPE_HANDLED; 419 return TRACE_TYPE_HANDLED;
@@ -288,7 +432,7 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
288 sprintf(msecs_str, "%lu", (unsigned long) duration); 432 sprintf(msecs_str, "%lu", (unsigned long) duration);
289 433
290 /* Print msecs */ 434 /* Print msecs */
291 ret = trace_seq_printf(s, msecs_str); 435 ret = trace_seq_printf(s, "%s", msecs_str);
292 if (!ret) 436 if (!ret)
293 return TRACE_TYPE_PARTIAL_LINE; 437 return TRACE_TYPE_PARTIAL_LINE;
294 438
@@ -321,51 +465,33 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
321 465
322} 466}
323 467
324/* Signal a overhead of time execution to the output */
325static int
326print_graph_overhead(unsigned long long duration, struct trace_seq *s)
327{
328 /* Duration exceeded 100 msecs */
329 if (duration > 100000ULL)
330 return trace_seq_printf(s, "! ");
331
332 /* Duration exceeded 10 msecs */
333 if (duration > 10000ULL)
334 return trace_seq_printf(s, "+ ");
335
336 return trace_seq_printf(s, " ");
337}
338
339/* Case of a leaf function on its call entry */ 468/* Case of a leaf function on its call entry */
340static enum print_line_t 469static enum print_line_t
341print_graph_entry_leaf(struct trace_iterator *iter, 470print_graph_entry_leaf(struct trace_iterator *iter,
342 struct ftrace_graph_ent_entry *entry, struct trace_seq *s) 471 struct ftrace_graph_ent_entry *entry,
472 struct ftrace_graph_ret_entry *ret_entry, struct trace_seq *s)
343{ 473{
344 struct ftrace_graph_ret_entry *ret_entry;
345 struct ftrace_graph_ret *graph_ret; 474 struct ftrace_graph_ret *graph_ret;
346 struct ring_buffer_event *event;
347 struct ftrace_graph_ent *call; 475 struct ftrace_graph_ent *call;
348 unsigned long long duration; 476 unsigned long long duration;
349 int ret; 477 int ret;
350 int i; 478 int i;
351 479
352 event = ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
353 ret_entry = ring_buffer_event_data(event);
354 graph_ret = &ret_entry->ret; 480 graph_ret = &ret_entry->ret;
355 call = &entry->graph_ent; 481 call = &entry->graph_ent;
356 duration = graph_ret->rettime - graph_ret->calltime; 482 duration = graph_ret->rettime - graph_ret->calltime;
357 483
358 /* Overhead */ 484 /* Overhead */
359 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { 485 ret = print_graph_overhead(duration, s);
360 ret = print_graph_overhead(duration, s); 486 if (!ret)
361 if (!ret) 487 return TRACE_TYPE_PARTIAL_LINE;
362 return TRACE_TYPE_PARTIAL_LINE;
363 }
364 488
365 /* Duration */ 489 /* Duration */
366 ret = print_graph_duration(duration, s); 490 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
367 if (ret == TRACE_TYPE_PARTIAL_LINE) 491 ret = print_graph_duration(duration, s);
368 return TRACE_TYPE_PARTIAL_LINE; 492 if (ret == TRACE_TYPE_PARTIAL_LINE)
493 return TRACE_TYPE_PARTIAL_LINE;
494 }
369 495
370 /* Function */ 496 /* Function */
371 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 497 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
@@ -394,25 +520,17 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
394 struct ftrace_graph_ent *call = &entry->graph_ent; 520 struct ftrace_graph_ent *call = &entry->graph_ent;
395 521
396 /* No overhead */ 522 /* No overhead */
397 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { 523 ret = print_graph_overhead(-1, s);
398 ret = trace_seq_printf(s, " "); 524 if (!ret)
399 if (!ret) 525 return TRACE_TYPE_PARTIAL_LINE;
400 return TRACE_TYPE_PARTIAL_LINE;
401 }
402 526
403 /* Interrupt */ 527 /* No time */
404 ret = print_graph_irq(s, call->func, TRACE_GRAPH_ENT, cpu, pid); 528 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
405 if (ret == TRACE_TYPE_UNHANDLED) {
406 /* No time */
407 ret = trace_seq_printf(s, " | "); 529 ret = trace_seq_printf(s, " | ");
408 if (!ret) 530 if (!ret)
409 return TRACE_TYPE_PARTIAL_LINE; 531 return TRACE_TYPE_PARTIAL_LINE;
410 } else {
411 if (ret == TRACE_TYPE_PARTIAL_LINE)
412 return TRACE_TYPE_PARTIAL_LINE;
413 } 532 }
414 533
415
416 /* Function */ 534 /* Function */
417 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 535 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
418 ret = trace_seq_printf(s, " "); 536 ret = trace_seq_printf(s, " ");
@@ -428,20 +546,40 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
428 if (!ret) 546 if (!ret)
429 return TRACE_TYPE_PARTIAL_LINE; 547 return TRACE_TYPE_PARTIAL_LINE;
430 548
431 return TRACE_TYPE_HANDLED; 549 /*
550 * we already consumed the current entry to check the next one
551 * and see if this is a leaf.
552 */
553 return TRACE_TYPE_NO_CONSUME;
432} 554}
433 555
434static enum print_line_t 556static enum print_line_t
435print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, 557print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
436 struct trace_iterator *iter, int cpu) 558 struct trace_iterator *iter)
437{ 559{
438 int ret; 560 int ret;
561 int cpu = iter->cpu;
562 pid_t *last_entry = iter->private;
439 struct trace_entry *ent = iter->ent; 563 struct trace_entry *ent = iter->ent;
564 struct ftrace_graph_ent *call = &field->graph_ent;
565 struct ftrace_graph_ret_entry *leaf_ret;
440 566
441 /* Pid */ 567 /* Pid */
442 if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE) 568 if (verif_pid(s, ent->pid, cpu, last_entry) == TRACE_TYPE_PARTIAL_LINE)
569 return TRACE_TYPE_PARTIAL_LINE;
570
571 /* Interrupt */
572 ret = print_graph_irq(iter, call->func, TRACE_GRAPH_ENT, cpu, ent->pid);
573 if (ret == TRACE_TYPE_PARTIAL_LINE)
443 return TRACE_TYPE_PARTIAL_LINE; 574 return TRACE_TYPE_PARTIAL_LINE;
444 575
576 /* Absolute time */
577 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
578 ret = print_graph_abs_time(iter->ts, s);
579 if (!ret)
580 return TRACE_TYPE_PARTIAL_LINE;
581 }
582
445 /* Cpu */ 583 /* Cpu */
446 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { 584 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
447 ret = print_graph_cpu(s, cpu); 585 ret = print_graph_cpu(s, cpu);
@@ -460,8 +598,9 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
460 return TRACE_TYPE_PARTIAL_LINE; 598 return TRACE_TYPE_PARTIAL_LINE;
461 } 599 }
462 600
463 if (trace_branch_is_leaf(iter, field)) 601 leaf_ret = get_return_for_leaf(iter, field);
464 return print_graph_entry_leaf(iter, field, s); 602 if (leaf_ret)
603 return print_graph_entry_leaf(iter, field, leaf_ret, s);
465 else 604 else
466 return print_graph_entry_nested(field, s, iter->ent->pid, cpu); 605 return print_graph_entry_nested(field, s, iter->ent->pid, cpu);
467 606
@@ -469,16 +608,25 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
469 608
470static enum print_line_t 609static enum print_line_t
471print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, 610print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
472 struct trace_entry *ent, int cpu) 611 struct trace_entry *ent, struct trace_iterator *iter)
473{ 612{
474 int i; 613 int i;
475 int ret; 614 int ret;
615 int cpu = iter->cpu;
616 pid_t *last_pid = iter->private, pid = ent->pid;
476 unsigned long long duration = trace->rettime - trace->calltime; 617 unsigned long long duration = trace->rettime - trace->calltime;
477 618
478 /* Pid */ 619 /* Pid */
479 if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE) 620 if (verif_pid(s, pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE)
480 return TRACE_TYPE_PARTIAL_LINE; 621 return TRACE_TYPE_PARTIAL_LINE;
481 622
623 /* Absolute time */
624 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
625 ret = print_graph_abs_time(iter->ts, s);
626 if (!ret)
627 return TRACE_TYPE_PARTIAL_LINE;
628 }
629
482 /* Cpu */ 630 /* Cpu */
483 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { 631 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
484 ret = print_graph_cpu(s, cpu); 632 ret = print_graph_cpu(s, cpu);
@@ -498,16 +646,16 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
498 } 646 }
499 647
500 /* Overhead */ 648 /* Overhead */
501 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { 649 ret = print_graph_overhead(duration, s);
502 ret = print_graph_overhead(duration, s); 650 if (!ret)
503 if (!ret) 651 return TRACE_TYPE_PARTIAL_LINE;
504 return TRACE_TYPE_PARTIAL_LINE;
505 }
506 652
507 /* Duration */ 653 /* Duration */
508 ret = print_graph_duration(duration, s); 654 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
509 if (ret == TRACE_TYPE_PARTIAL_LINE) 655 ret = print_graph_duration(duration, s);
510 return TRACE_TYPE_PARTIAL_LINE; 656 if (ret == TRACE_TYPE_PARTIAL_LINE)
657 return TRACE_TYPE_PARTIAL_LINE;
658 }
511 659
512 /* Closing brace */ 660 /* Closing brace */
513 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { 661 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
@@ -528,7 +676,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
528 return TRACE_TYPE_PARTIAL_LINE; 676 return TRACE_TYPE_PARTIAL_LINE;
529 } 677 }
530 678
531 ret = print_graph_irq(s, trace->func, TRACE_GRAPH_RET, cpu, ent->pid); 679 ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, cpu, pid);
532 if (ret == TRACE_TYPE_PARTIAL_LINE) 680 if (ret == TRACE_TYPE_PARTIAL_LINE)
533 return TRACE_TYPE_PARTIAL_LINE; 681 return TRACE_TYPE_PARTIAL_LINE;
534 682
@@ -536,19 +684,28 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
536} 684}
537 685
538static enum print_line_t 686static enum print_line_t
539print_graph_comment(struct print_entry *trace, struct trace_seq *s, 687print_graph_comment(struct bprint_entry *trace, struct trace_seq *s,
540 struct trace_entry *ent, struct trace_iterator *iter) 688 struct trace_entry *ent, struct trace_iterator *iter)
541{ 689{
542 int i; 690 int i;
543 int ret; 691 int ret;
692 int cpu = iter->cpu;
693 pid_t *last_pid = iter->private;
544 694
545 /* Pid */ 695 /* Pid */
546 if (verif_pid(s, ent->pid, iter->cpu) == TRACE_TYPE_PARTIAL_LINE) 696 if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE)
547 return TRACE_TYPE_PARTIAL_LINE; 697 return TRACE_TYPE_PARTIAL_LINE;
548 698
699 /* Absolute time */
700 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
701 ret = print_graph_abs_time(iter->ts, s);
702 if (!ret)
703 return TRACE_TYPE_PARTIAL_LINE;
704 }
705
549 /* Cpu */ 706 /* Cpu */
550 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { 707 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
551 ret = print_graph_cpu(s, iter->cpu); 708 ret = print_graph_cpu(s, cpu);
552 if (ret == TRACE_TYPE_PARTIAL_LINE) 709 if (ret == TRACE_TYPE_PARTIAL_LINE)
553 return TRACE_TYPE_PARTIAL_LINE; 710 return TRACE_TYPE_PARTIAL_LINE;
554 } 711 }
@@ -565,17 +722,17 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s,
565 } 722 }
566 723
567 /* No overhead */ 724 /* No overhead */
568 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { 725 ret = print_graph_overhead(-1, s);
569 ret = trace_seq_printf(s, " "); 726 if (!ret)
727 return TRACE_TYPE_PARTIAL_LINE;
728
729 /* No time */
730 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
731 ret = trace_seq_printf(s, " | ");
570 if (!ret) 732 if (!ret)
571 return TRACE_TYPE_PARTIAL_LINE; 733 return TRACE_TYPE_PARTIAL_LINE;
572 } 734 }
573 735
574 /* No time */
575 ret = trace_seq_printf(s, " | ");
576 if (!ret)
577 return TRACE_TYPE_PARTIAL_LINE;
578
579 /* Indentation */ 736 /* Indentation */
580 if (trace->depth > 0) 737 if (trace->depth > 0)
581 for (i = 0; i < (trace->depth + 1) * TRACE_GRAPH_INDENT; i++) { 738 for (i = 0; i < (trace->depth + 1) * TRACE_GRAPH_INDENT; i++) {
@@ -585,12 +742,19 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s,
585 } 742 }
586 743
587 /* The comment */ 744 /* The comment */
588 ret = trace_seq_printf(s, "/* %s", trace->buf); 745 ret = trace_seq_printf(s, "/* ");
746 if (!ret)
747 return TRACE_TYPE_PARTIAL_LINE;
748
749 ret = trace_seq_bprintf(s, trace->fmt, trace->buf);
589 if (!ret) 750 if (!ret)
590 return TRACE_TYPE_PARTIAL_LINE; 751 return TRACE_TYPE_PARTIAL_LINE;
591 752
592 if (ent->flags & TRACE_FLAG_CONT) 753 /* Strip ending newline */
593 trace_seq_print_cont(s, iter); 754 if (s->buffer[s->len - 1] == '\n') {
755 s->buffer[s->len - 1] = '\0';
756 s->len--;
757 }
594 758
595 ret = trace_seq_printf(s, " */\n"); 759 ret = trace_seq_printf(s, " */\n");
596 if (!ret) 760 if (!ret)
@@ -610,16 +774,15 @@ print_graph_function(struct trace_iterator *iter)
610 case TRACE_GRAPH_ENT: { 774 case TRACE_GRAPH_ENT: {
611 struct ftrace_graph_ent_entry *field; 775 struct ftrace_graph_ent_entry *field;
612 trace_assign_type(field, entry); 776 trace_assign_type(field, entry);
613 return print_graph_entry(field, s, iter, 777 return print_graph_entry(field, s, iter);
614 iter->cpu);
615 } 778 }
616 case TRACE_GRAPH_RET: { 779 case TRACE_GRAPH_RET: {
617 struct ftrace_graph_ret_entry *field; 780 struct ftrace_graph_ret_entry *field;
618 trace_assign_type(field, entry); 781 trace_assign_type(field, entry);
619 return print_graph_return(&field->ret, s, entry, iter->cpu); 782 return print_graph_return(&field->ret, s, entry, iter);
620 } 783 }
621 case TRACE_PRINT: { 784 case TRACE_BPRINT: {
622 struct print_entry *field; 785 struct bprint_entry *field;
623 trace_assign_type(field, entry); 786 trace_assign_type(field, entry);
624 return print_graph_comment(field, s, entry, iter); 787 return print_graph_comment(field, s, entry, iter);
625 } 788 }
@@ -632,33 +795,64 @@ static void print_graph_headers(struct seq_file *s)
632{ 795{
633 /* 1st line */ 796 /* 1st line */
634 seq_printf(s, "# "); 797 seq_printf(s, "# ");
798 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
799 seq_printf(s, " TIME ");
635 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 800 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
636 seq_printf(s, "CPU "); 801 seq_printf(s, "CPU");
637 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 802 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
638 seq_printf(s, "TASK/PID "); 803 seq_printf(s, " TASK/PID ");
639 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) 804 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
640 seq_printf(s, "OVERHEAD/"); 805 seq_printf(s, " DURATION ");
641 seq_printf(s, "DURATION FUNCTION CALLS\n"); 806 seq_printf(s, " FUNCTION CALLS\n");
642 807
643 /* 2nd line */ 808 /* 2nd line */
644 seq_printf(s, "# "); 809 seq_printf(s, "# ");
810 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
811 seq_printf(s, " | ");
645 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 812 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
646 seq_printf(s, "| "); 813 seq_printf(s, "| ");
647 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 814 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
648 seq_printf(s, "| | "); 815 seq_printf(s, " | | ");
649 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { 816 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
650 seq_printf(s, "| "); 817 seq_printf(s, " | | ");
651 seq_printf(s, "| | | | |\n"); 818 seq_printf(s, " | | | |\n");
652 } else 819}
653 seq_printf(s, " | | | | |\n"); 820
821static void graph_trace_open(struct trace_iterator *iter)
822{
823 /* pid on the last trace processed */
824 pid_t *last_pid = alloc_percpu(pid_t);
825 int cpu;
826
827 if (!last_pid)
828 pr_warning("function graph tracer: not enough memory\n");
829 else
830 for_each_possible_cpu(cpu) {
831 pid_t *pid = per_cpu_ptr(last_pid, cpu);
832 *pid = -1;
833 }
834
835 iter->private = last_pid;
836}
837
838static void graph_trace_close(struct trace_iterator *iter)
839{
840 free_percpu(iter->private);
654} 841}
842
655static struct tracer graph_trace __read_mostly = { 843static struct tracer graph_trace __read_mostly = {
656 .name = "function_graph", 844 .name = "function_graph",
657 .init = graph_trace_init, 845 .open = graph_trace_open,
658 .reset = graph_trace_reset, 846 .close = graph_trace_close,
847 .wait_pipe = poll_wait_pipe,
848 .init = graph_trace_init,
849 .reset = graph_trace_reset,
659 .print_line = print_graph_function, 850 .print_line = print_graph_function,
660 .print_header = print_graph_headers, 851 .print_header = print_graph_headers,
661 .flags = &tracer_flags, 852 .flags = &tracer_flags,
853#ifdef CONFIG_FTRACE_SELFTEST
854 .selftest = trace_selftest_startup_function_graph,
855#endif
662}; 856};
663 857
664static __init int init_graph_trace(void) 858static __init int init_graph_trace(void)
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 649df22d435f..7bfdf4c2347f 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -1,30 +1,53 @@
1/* 1/*
2 * h/w branch tracer for x86 based on bts 2 * h/w branch tracer for x86 based on bts
3 * 3 *
4 * Copyright (C) 2008 Markus Metzger <markus.t.metzger@gmail.com> 4 * Copyright (C) 2008-2009 Intel Corporation.
5 * 5 * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
6 */ 6 */
7 7#include <linux/spinlock.h>
8#include <linux/module.h> 8#include <linux/kallsyms.h>
9#include <linux/fs.h>
10#include <linux/debugfs.h> 9#include <linux/debugfs.h>
11#include <linux/ftrace.h> 10#include <linux/ftrace.h>
12#include <linux/kallsyms.h> 11#include <linux/module.h>
12#include <linux/cpu.h>
13#include <linux/smp.h>
14#include <linux/fs.h>
13 15
14#include <asm/ds.h> 16#include <asm/ds.h>
15 17
16#include "trace.h" 18#include "trace.h"
19#include "trace_output.h"
17 20
18 21
19#define SIZEOF_BTS (1 << 13) 22#define SIZEOF_BTS (1 << 13)
20 23
24/*
25 * The tracer lock protects the below per-cpu tracer array.
26 * It needs to be held to:
27 * - start tracing on all cpus
28 * - stop tracing on all cpus
29 * - start tracing on a single hotplug cpu
30 * - stop tracing on a single hotplug cpu
31 * - read the trace from all cpus
32 * - read the trace from a single cpu
33 */
34static DEFINE_SPINLOCK(bts_tracer_lock);
21static DEFINE_PER_CPU(struct bts_tracer *, tracer); 35static DEFINE_PER_CPU(struct bts_tracer *, tracer);
22static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); 36static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer);
23 37
24#define this_tracer per_cpu(tracer, smp_processor_id()) 38#define this_tracer per_cpu(tracer, smp_processor_id())
25#define this_buffer per_cpu(buffer, smp_processor_id()) 39#define this_buffer per_cpu(buffer, smp_processor_id())
26 40
41static int __read_mostly trace_hw_branches_enabled;
42static struct trace_array *hw_branch_trace __read_mostly;
43
27 44
45/*
46 * Start tracing on the current cpu.
47 * The argument is ignored.
48 *
49 * pre: bts_tracer_lock must be locked.
50 */
28static void bts_trace_start_cpu(void *arg) 51static void bts_trace_start_cpu(void *arg)
29{ 52{
30 if (this_tracer) 53 if (this_tracer)
@@ -42,14 +65,20 @@ static void bts_trace_start_cpu(void *arg)
42 65
43static void bts_trace_start(struct trace_array *tr) 66static void bts_trace_start(struct trace_array *tr)
44{ 67{
45 int cpu; 68 spin_lock(&bts_tracer_lock);
46 69
47 tracing_reset_online_cpus(tr); 70 on_each_cpu(bts_trace_start_cpu, NULL, 1);
71 trace_hw_branches_enabled = 1;
48 72
49 for_each_cpu(cpu, cpu_possible_mask) 73 spin_unlock(&bts_tracer_lock);
50 smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
51} 74}
52 75
76/*
77 * Stop tracing on the current cpu.
78 * The argument is ignored.
79 *
80 * pre: bts_tracer_lock must be locked.
81 */
53static void bts_trace_stop_cpu(void *arg) 82static void bts_trace_stop_cpu(void *arg)
54{ 83{
55 if (this_tracer) { 84 if (this_tracer) {
@@ -60,26 +89,60 @@ static void bts_trace_stop_cpu(void *arg)
60 89
61static void bts_trace_stop(struct trace_array *tr) 90static void bts_trace_stop(struct trace_array *tr)
62{ 91{
63 int cpu; 92 spin_lock(&bts_tracer_lock);
93
94 trace_hw_branches_enabled = 0;
95 on_each_cpu(bts_trace_stop_cpu, NULL, 1);
96
97 spin_unlock(&bts_tracer_lock);
98}
99
100static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
101 unsigned long action, void *hcpu)
102{
103 unsigned int cpu = (unsigned long)hcpu;
64 104
65 for_each_cpu(cpu, cpu_possible_mask) 105 spin_lock(&bts_tracer_lock);
106
107 if (!trace_hw_branches_enabled)
108 goto out;
109
110 switch (action) {
111 case CPU_ONLINE:
112 case CPU_DOWN_FAILED:
113 smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
114 break;
115 case CPU_DOWN_PREPARE:
66 smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); 116 smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1);
117 break;
118 }
119
120 out:
121 spin_unlock(&bts_tracer_lock);
122 return NOTIFY_DONE;
67} 123}
68 124
125static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
126 .notifier_call = bts_hotcpu_handler
127};
128
69static int bts_trace_init(struct trace_array *tr) 129static int bts_trace_init(struct trace_array *tr)
70{ 130{
71 tracing_reset_online_cpus(tr); 131 hw_branch_trace = tr;
132
72 bts_trace_start(tr); 133 bts_trace_start(tr);
73 134
74 return 0; 135 return 0;
75} 136}
76 137
138static void bts_trace_reset(struct trace_array *tr)
139{
140 bts_trace_stop(tr);
141}
142
77static void bts_trace_print_header(struct seq_file *m) 143static void bts_trace_print_header(struct seq_file *m)
78{ 144{
79 seq_puts(m, 145 seq_puts(m, "# CPU# TO <- FROM\n");
80 "# CPU# FROM TO FUNCTION\n");
81 seq_puts(m,
82 "# | | | |\n");
83} 146}
84 147
85static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) 148static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
@@ -87,15 +150,15 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
87 struct trace_entry *entry = iter->ent; 150 struct trace_entry *entry = iter->ent;
88 struct trace_seq *seq = &iter->seq; 151 struct trace_seq *seq = &iter->seq;
89 struct hw_branch_entry *it; 152 struct hw_branch_entry *it;
153 unsigned long symflags = TRACE_ITER_SYM_OFFSET;
90 154
91 trace_assign_type(it, entry); 155 trace_assign_type(it, entry);
92 156
93 if (entry->type == TRACE_HW_BRANCHES) { 157 if (entry->type == TRACE_HW_BRANCHES) {
94 if (trace_seq_printf(seq, "%4d ", entry->cpu) && 158 if (trace_seq_printf(seq, "%4d ", iter->cpu) &&
95 trace_seq_printf(seq, "0x%016llx -> 0x%016llx ", 159 seq_print_ip_sym(seq, it->to, symflags) &&
96 it->from, it->to) && 160 trace_seq_printf(seq, "\t <- ") &&
97 (!it->from || 161 seq_print_ip_sym(seq, it->from, symflags) &&
98 seq_print_ip_sym(seq, it->from, /* sym_flags = */ 0)) &&
99 trace_seq_printf(seq, "\n")) 162 trace_seq_printf(seq, "\n"))
100 return TRACE_TYPE_HANDLED; 163 return TRACE_TYPE_HANDLED;
101 return TRACE_TYPE_PARTIAL_LINE;; 164 return TRACE_TYPE_PARTIAL_LINE;;
@@ -103,26 +166,42 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
103 return TRACE_TYPE_UNHANDLED; 166 return TRACE_TYPE_UNHANDLED;
104} 167}
105 168
106void trace_hw_branch(struct trace_array *tr, u64 from, u64 to) 169void trace_hw_branch(u64 from, u64 to)
107{ 170{
171 struct trace_array *tr = hw_branch_trace;
108 struct ring_buffer_event *event; 172 struct ring_buffer_event *event;
109 struct hw_branch_entry *entry; 173 struct hw_branch_entry *entry;
110 unsigned long irq; 174 unsigned long irq1;
175 int cpu;
111 176
112 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq); 177 if (unlikely(!tr))
113 if (!event)
114 return; 178 return;
179
180 if (unlikely(!trace_hw_branches_enabled))
181 return;
182
183 local_irq_save(irq1);
184 cpu = raw_smp_processor_id();
185 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
186 goto out;
187
188 event = trace_buffer_lock_reserve(tr, TRACE_HW_BRANCHES,
189 sizeof(*entry), 0, 0);
190 if (!event)
191 goto out;
115 entry = ring_buffer_event_data(event); 192 entry = ring_buffer_event_data(event);
116 tracing_generic_entry_update(&entry->ent, 0, from); 193 tracing_generic_entry_update(&entry->ent, 0, from);
117 entry->ent.type = TRACE_HW_BRANCHES; 194 entry->ent.type = TRACE_HW_BRANCHES;
118 entry->ent.cpu = smp_processor_id();
119 entry->from = from; 195 entry->from = from;
120 entry->to = to; 196 entry->to = to;
121 ring_buffer_unlock_commit(tr->buffer, event, irq); 197 trace_buffer_unlock_commit(tr, event, 0, 0);
198
199 out:
200 atomic_dec(&tr->data[cpu]->disabled);
201 local_irq_restore(irq1);
122} 202}
123 203
124static void trace_bts_at(struct trace_array *tr, 204static void trace_bts_at(const struct bts_trace *trace, void *at)
125 const struct bts_trace *trace, void *at)
126{ 205{
127 struct bts_struct bts; 206 struct bts_struct bts;
128 int err = 0; 207 int err = 0;
@@ -137,18 +216,29 @@ static void trace_bts_at(struct trace_array *tr,
137 216
138 switch (bts.qualifier) { 217 switch (bts.qualifier) {
139 case BTS_BRANCH: 218 case BTS_BRANCH:
140 trace_hw_branch(tr, bts.variant.lbr.from, bts.variant.lbr.to); 219 trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to);
141 break; 220 break;
142 } 221 }
143} 222}
144 223
224/*
225 * Collect the trace on the current cpu and write it into the ftrace buffer.
226 *
227 * pre: bts_tracer_lock must be locked
228 */
145static void trace_bts_cpu(void *arg) 229static void trace_bts_cpu(void *arg)
146{ 230{
147 struct trace_array *tr = (struct trace_array *) arg; 231 struct trace_array *tr = (struct trace_array *) arg;
148 const struct bts_trace *trace; 232 const struct bts_trace *trace;
149 unsigned char *at; 233 unsigned char *at;
150 234
151 if (!this_tracer) 235 if (unlikely(!tr))
236 return;
237
238 if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled)))
239 return;
240
241 if (unlikely(!this_tracer))
152 return; 242 return;
153 243
154 ds_suspend_bts(this_tracer); 244 ds_suspend_bts(this_tracer);
@@ -158,11 +248,11 @@ static void trace_bts_cpu(void *arg)
158 248
159 for (at = trace->ds.top; (void *)at < trace->ds.end; 249 for (at = trace->ds.top; (void *)at < trace->ds.end;
160 at += trace->ds.size) 250 at += trace->ds.size)
161 trace_bts_at(tr, trace, at); 251 trace_bts_at(trace, at);
162 252
163 for (at = trace->ds.begin; (void *)at < trace->ds.top; 253 for (at = trace->ds.begin; (void *)at < trace->ds.top;
164 at += trace->ds.size) 254 at += trace->ds.size)
165 trace_bts_at(tr, trace, at); 255 trace_bts_at(trace, at);
166 256
167out: 257out:
168 ds_resume_bts(this_tracer); 258 ds_resume_bts(this_tracer);
@@ -170,26 +260,43 @@ out:
170 260
171static void trace_bts_prepare(struct trace_iterator *iter) 261static void trace_bts_prepare(struct trace_iterator *iter)
172{ 262{
173 int cpu; 263 spin_lock(&bts_tracer_lock);
264
265 on_each_cpu(trace_bts_cpu, iter->tr, 1);
266
267 spin_unlock(&bts_tracer_lock);
268}
269
270static void trace_bts_close(struct trace_iterator *iter)
271{
272 tracing_reset_online_cpus(iter->tr);
273}
274
275void trace_hw_branch_oops(void)
276{
277 spin_lock(&bts_tracer_lock);
278
279 trace_bts_cpu(hw_branch_trace);
174 280
175 for_each_cpu(cpu, cpu_possible_mask) 281 spin_unlock(&bts_tracer_lock);
176 smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1);
177} 282}
178 283
179struct tracer bts_tracer __read_mostly = 284struct tracer bts_tracer __read_mostly =
180{ 285{
181 .name = "hw-branch-tracer", 286 .name = "hw-branch-tracer",
182 .init = bts_trace_init, 287 .init = bts_trace_init,
183 .reset = bts_trace_stop, 288 .reset = bts_trace_reset,
184 .print_header = bts_trace_print_header, 289 .print_header = bts_trace_print_header,
185 .print_line = bts_trace_print_line, 290 .print_line = bts_trace_print_line,
186 .start = bts_trace_start, 291 .start = bts_trace_start,
187 .stop = bts_trace_stop, 292 .stop = bts_trace_stop,
188 .open = trace_bts_prepare 293 .open = trace_bts_prepare,
294 .close = trace_bts_close
189}; 295};
190 296
191__init static int init_bts_trace(void) 297__init static int init_bts_trace(void)
192{ 298{
299 register_hotcpu_notifier(&bts_hotcpu_notifier);
193 return register_tracer(&bts_tracer); 300 return register_tracer(&bts_tracer);
194} 301}
195device_initcall(init_bts_trace); 302device_initcall(init_bts_trace);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 62a78d943534..b923d13e2fad 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * trace irqs off criticall timings 2 * trace irqs off critical timings
3 * 3 *
4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> 4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
5 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> 5 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
@@ -32,6 +32,8 @@ enum {
32 32
33static int trace_type __read_mostly; 33static int trace_type __read_mostly;
34 34
35static int save_lat_flag;
36
35#ifdef CONFIG_PREEMPT_TRACER 37#ifdef CONFIG_PREEMPT_TRACER
36static inline int 38static inline int
37preempt_trace(void) 39preempt_trace(void)
@@ -95,7 +97,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
95 disabled = atomic_inc_return(&data->disabled); 97 disabled = atomic_inc_return(&data->disabled);
96 98
97 if (likely(disabled == 1)) 99 if (likely(disabled == 1))
98 trace_function(tr, data, ip, parent_ip, flags, preempt_count()); 100 trace_function(tr, ip, parent_ip, flags, preempt_count());
99 101
100 atomic_dec(&data->disabled); 102 atomic_dec(&data->disabled);
101} 103}
@@ -153,7 +155,7 @@ check_critical_timing(struct trace_array *tr,
153 if (!report_latency(delta)) 155 if (!report_latency(delta))
154 goto out_unlock; 156 goto out_unlock;
155 157
156 trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc); 158 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
157 159
158 latency = nsecs_to_usecs(delta); 160 latency = nsecs_to_usecs(delta);
159 161
@@ -177,7 +179,7 @@ out:
177 data->critical_sequence = max_sequence; 179 data->critical_sequence = max_sequence;
178 data->preempt_timestamp = ftrace_now(cpu); 180 data->preempt_timestamp = ftrace_now(cpu);
179 tracing_reset(tr, cpu); 181 tracing_reset(tr, cpu);
180 trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc); 182 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
181} 183}
182 184
183static inline void 185static inline void
@@ -210,7 +212,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
210 212
211 local_save_flags(flags); 213 local_save_flags(flags);
212 214
213 trace_function(tr, data, ip, parent_ip, flags, preempt_count()); 215 trace_function(tr, ip, parent_ip, flags, preempt_count());
214 216
215 per_cpu(tracing_cpu, cpu) = 1; 217 per_cpu(tracing_cpu, cpu) = 1;
216 218
@@ -244,7 +246,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
244 atomic_inc(&data->disabled); 246 atomic_inc(&data->disabled);
245 247
246 local_save_flags(flags); 248 local_save_flags(flags);
247 trace_function(tr, data, ip, parent_ip, flags, preempt_count()); 249 trace_function(tr, ip, parent_ip, flags, preempt_count());
248 check_critical_timing(tr, data, parent_ip ? : ip, cpu); 250 check_critical_timing(tr, data, parent_ip ? : ip, cpu);
249 data->critical_start = 0; 251 data->critical_start = 0;
250 atomic_dec(&data->disabled); 252 atomic_dec(&data->disabled);
@@ -353,33 +355,26 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
353} 355}
354#endif /* CONFIG_PREEMPT_TRACER */ 356#endif /* CONFIG_PREEMPT_TRACER */
355 357
356/*
357 * save_tracer_enabled is used to save the state of the tracer_enabled
358 * variable when we disable it when we open a trace output file.
359 */
360static int save_tracer_enabled;
361
362static void start_irqsoff_tracer(struct trace_array *tr) 358static void start_irqsoff_tracer(struct trace_array *tr)
363{ 359{
364 register_ftrace_function(&trace_ops); 360 register_ftrace_function(&trace_ops);
365 if (tracing_is_enabled()) { 361 if (tracing_is_enabled())
366 tracer_enabled = 1; 362 tracer_enabled = 1;
367 save_tracer_enabled = 1; 363 else
368 } else {
369 tracer_enabled = 0; 364 tracer_enabled = 0;
370 save_tracer_enabled = 0;
371 }
372} 365}
373 366
374static void stop_irqsoff_tracer(struct trace_array *tr) 367static void stop_irqsoff_tracer(struct trace_array *tr)
375{ 368{
376 tracer_enabled = 0; 369 tracer_enabled = 0;
377 save_tracer_enabled = 0;
378 unregister_ftrace_function(&trace_ops); 370 unregister_ftrace_function(&trace_ops);
379} 371}
380 372
381static void __irqsoff_tracer_init(struct trace_array *tr) 373static void __irqsoff_tracer_init(struct trace_array *tr)
382{ 374{
375 save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT;
376 trace_flags |= TRACE_ITER_LATENCY_FMT;
377
383 tracing_max_latency = 0; 378 tracing_max_latency = 0;
384 irqsoff_trace = tr; 379 irqsoff_trace = tr;
385 /* make sure that the tracer is visible */ 380 /* make sure that the tracer is visible */
@@ -390,30 +385,19 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
390static void irqsoff_tracer_reset(struct trace_array *tr) 385static void irqsoff_tracer_reset(struct trace_array *tr)
391{ 386{
392 stop_irqsoff_tracer(tr); 387 stop_irqsoff_tracer(tr);
388
389 if (!save_lat_flag)
390 trace_flags &= ~TRACE_ITER_LATENCY_FMT;
393} 391}
394 392
395static void irqsoff_tracer_start(struct trace_array *tr) 393static void irqsoff_tracer_start(struct trace_array *tr)
396{ 394{
397 tracer_enabled = 1; 395 tracer_enabled = 1;
398 save_tracer_enabled = 1;
399} 396}
400 397
401static void irqsoff_tracer_stop(struct trace_array *tr) 398static void irqsoff_tracer_stop(struct trace_array *tr)
402{ 399{
403 tracer_enabled = 0; 400 tracer_enabled = 0;
404 save_tracer_enabled = 0;
405}
406
407static void irqsoff_tracer_open(struct trace_iterator *iter)
408{
409 /* stop the trace while dumping */
410 tracer_enabled = 0;
411}
412
413static void irqsoff_tracer_close(struct trace_iterator *iter)
414{
415 /* restart tracing */
416 tracer_enabled = save_tracer_enabled;
417} 401}
418 402
419#ifdef CONFIG_IRQSOFF_TRACER 403#ifdef CONFIG_IRQSOFF_TRACER
@@ -431,8 +415,6 @@ static struct tracer irqsoff_tracer __read_mostly =
431 .reset = irqsoff_tracer_reset, 415 .reset = irqsoff_tracer_reset,
432 .start = irqsoff_tracer_start, 416 .start = irqsoff_tracer_start,
433 .stop = irqsoff_tracer_stop, 417 .stop = irqsoff_tracer_stop,
434 .open = irqsoff_tracer_open,
435 .close = irqsoff_tracer_close,
436 .print_max = 1, 418 .print_max = 1,
437#ifdef CONFIG_FTRACE_SELFTEST 419#ifdef CONFIG_FTRACE_SELFTEST
438 .selftest = trace_selftest_startup_irqsoff, 420 .selftest = trace_selftest_startup_irqsoff,
@@ -459,8 +441,6 @@ static struct tracer preemptoff_tracer __read_mostly =
459 .reset = irqsoff_tracer_reset, 441 .reset = irqsoff_tracer_reset,
460 .start = irqsoff_tracer_start, 442 .start = irqsoff_tracer_start,
461 .stop = irqsoff_tracer_stop, 443 .stop = irqsoff_tracer_stop,
462 .open = irqsoff_tracer_open,
463 .close = irqsoff_tracer_close,
464 .print_max = 1, 444 .print_max = 1,
465#ifdef CONFIG_FTRACE_SELFTEST 445#ifdef CONFIG_FTRACE_SELFTEST
466 .selftest = trace_selftest_startup_preemptoff, 446 .selftest = trace_selftest_startup_preemptoff,
@@ -489,8 +469,6 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
489 .reset = irqsoff_tracer_reset, 469 .reset = irqsoff_tracer_reset,
490 .start = irqsoff_tracer_start, 470 .start = irqsoff_tracer_start,
491 .stop = irqsoff_tracer_stop, 471 .stop = irqsoff_tracer_stop,
492 .open = irqsoff_tracer_open,
493 .close = irqsoff_tracer_close,
494 .print_max = 1, 472 .print_max = 1,
495#ifdef CONFIG_FTRACE_SELFTEST 473#ifdef CONFIG_FTRACE_SELFTEST
496 .selftest = trace_selftest_startup_preemptirqsoff, 474 .selftest = trace_selftest_startup_preemptirqsoff,
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 80e503ef6136..f095916e477f 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -12,6 +12,7 @@
12#include <asm/atomic.h> 12#include <asm/atomic.h>
13 13
14#include "trace.h" 14#include "trace.h"
15#include "trace_output.h"
15 16
16struct header_iter { 17struct header_iter {
17 struct pci_dev *dev; 18 struct pci_dev *dev;
@@ -183,21 +184,22 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
183 switch (rw->opcode) { 184 switch (rw->opcode) {
184 case MMIO_READ: 185 case MMIO_READ:
185 ret = trace_seq_printf(s, 186 ret = trace_seq_printf(s,
186 "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", 187 "R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
187 rw->width, secs, usec_rem, rw->map_id, 188 rw->width, secs, usec_rem, rw->map_id,
188 (unsigned long long)rw->phys, 189 (unsigned long long)rw->phys,
189 rw->value, rw->pc, 0); 190 rw->value, rw->pc, 0);
190 break; 191 break;
191 case MMIO_WRITE: 192 case MMIO_WRITE:
192 ret = trace_seq_printf(s, 193 ret = trace_seq_printf(s,
193 "W %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", 194 "W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
194 rw->width, secs, usec_rem, rw->map_id, 195 rw->width, secs, usec_rem, rw->map_id,
195 (unsigned long long)rw->phys, 196 (unsigned long long)rw->phys,
196 rw->value, rw->pc, 0); 197 rw->value, rw->pc, 0);
197 break; 198 break;
198 case MMIO_UNKNOWN_OP: 199 case MMIO_UNKNOWN_OP:
199 ret = trace_seq_printf(s, 200 ret = trace_seq_printf(s,
200 "UNKNOWN %lu.%06lu %d 0x%llx %02x,%02x,%02x 0x%lx %d\n", 201 "UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx,"
202 "%02lx 0x%lx %d\n",
201 secs, usec_rem, rw->map_id, 203 secs, usec_rem, rw->map_id,
202 (unsigned long long)rw->phys, 204 (unsigned long long)rw->phys,
203 (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff, 205 (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff,
@@ -229,14 +231,14 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)
229 switch (m->opcode) { 231 switch (m->opcode) {
230 case MMIO_PROBE: 232 case MMIO_PROBE:
231 ret = trace_seq_printf(s, 233 ret = trace_seq_printf(s,
232 "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", 234 "MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
233 secs, usec_rem, m->map_id, 235 secs, usec_rem, m->map_id,
234 (unsigned long long)m->phys, m->virt, m->len, 236 (unsigned long long)m->phys, m->virt, m->len,
235 0UL, 0); 237 0UL, 0);
236 break; 238 break;
237 case MMIO_UNPROBE: 239 case MMIO_UNPROBE:
238 ret = trace_seq_printf(s, 240 ret = trace_seq_printf(s,
239 "UNMAP %lu.%06lu %d 0x%lx %d\n", 241 "UNMAP %u.%06lu %d 0x%lx %d\n",
240 secs, usec_rem, m->map_id, 0UL, 0); 242 secs, usec_rem, m->map_id, 0UL, 0);
241 break; 243 break;
242 default: 244 default:
@@ -255,18 +257,15 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
255 const char *msg = print->buf; 257 const char *msg = print->buf;
256 struct trace_seq *s = &iter->seq; 258 struct trace_seq *s = &iter->seq;
257 unsigned long long t = ns2usecs(iter->ts); 259 unsigned long long t = ns2usecs(iter->ts);
258 unsigned long usec_rem = do_div(t, 1000000ULL); 260 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
259 unsigned secs = (unsigned long)t; 261 unsigned secs = (unsigned long)t;
260 int ret; 262 int ret;
261 263
262 /* The trailing newline must be in the message. */ 264 /* The trailing newline must be in the message. */
263 ret = trace_seq_printf(s, "MARK %lu.%06lu %s", secs, usec_rem, msg); 265 ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg);
264 if (!ret) 266 if (!ret)
265 return TRACE_TYPE_PARTIAL_LINE; 267 return TRACE_TYPE_PARTIAL_LINE;
266 268
267 if (entry->flags & TRACE_FLAG_CONT)
268 trace_seq_print_cont(s, iter);
269
270 return TRACE_TYPE_HANDLED; 269 return TRACE_TYPE_HANDLED;
271} 270}
272 271
@@ -308,21 +307,17 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
308{ 307{
309 struct ring_buffer_event *event; 308 struct ring_buffer_event *event;
310 struct trace_mmiotrace_rw *entry; 309 struct trace_mmiotrace_rw *entry;
311 unsigned long irq_flags; 310 int pc = preempt_count();
312 311
313 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 312 event = trace_buffer_lock_reserve(tr, TRACE_MMIO_RW,
314 &irq_flags); 313 sizeof(*entry), 0, pc);
315 if (!event) { 314 if (!event) {
316 atomic_inc(&dropped_count); 315 atomic_inc(&dropped_count);
317 return; 316 return;
318 } 317 }
319 entry = ring_buffer_event_data(event); 318 entry = ring_buffer_event_data(event);
320 tracing_generic_entry_update(&entry->ent, 0, preempt_count());
321 entry->ent.type = TRACE_MMIO_RW;
322 entry->rw = *rw; 319 entry->rw = *rw;
323 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 320 trace_buffer_unlock_commit(tr, event, 0, pc);
324
325 trace_wake_up();
326} 321}
327 322
328void mmio_trace_rw(struct mmiotrace_rw *rw) 323void mmio_trace_rw(struct mmiotrace_rw *rw)
@@ -338,21 +333,17 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
338{ 333{
339 struct ring_buffer_event *event; 334 struct ring_buffer_event *event;
340 struct trace_mmiotrace_map *entry; 335 struct trace_mmiotrace_map *entry;
341 unsigned long irq_flags; 336 int pc = preempt_count();
342 337
343 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 338 event = trace_buffer_lock_reserve(tr, TRACE_MMIO_MAP,
344 &irq_flags); 339 sizeof(*entry), 0, pc);
345 if (!event) { 340 if (!event) {
346 atomic_inc(&dropped_count); 341 atomic_inc(&dropped_count);
347 return; 342 return;
348 } 343 }
349 entry = ring_buffer_event_data(event); 344 entry = ring_buffer_event_data(event);
350 tracing_generic_entry_update(&entry->ent, 0, preempt_count());
351 entry->ent.type = TRACE_MMIO_MAP;
352 entry->map = *map; 345 entry->map = *map;
353 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 346 trace_buffer_unlock_commit(tr, event, 0, pc);
354
355 trace_wake_up();
356} 347}
357 348
358void mmio_trace_mapping(struct mmiotrace_map *map) 349void mmio_trace_mapping(struct mmiotrace_map *map)
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index b9767acd30ac..9aa84bde23cd 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -47,12 +47,7 @@ static void stop_nop_trace(struct trace_array *tr)
47 47
48static int nop_trace_init(struct trace_array *tr) 48static int nop_trace_init(struct trace_array *tr)
49{ 49{
50 int cpu;
51 ctx_trace = tr; 50 ctx_trace = tr;
52
53 for_each_online_cpu(cpu)
54 tracing_reset(tr, cpu);
55
56 start_nop_trace(tr); 51 start_nop_trace(tr);
57 return 0; 52 return 0;
58} 53}
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
new file mode 100644
index 000000000000..6a4c9dea191e
--- /dev/null
+++ b/kernel/trace/trace_output.c
@@ -0,0 +1,967 @@
1/*
2 * trace_output.c
3 *
4 * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
5 *
6 */
7
8#include <linux/module.h>
9#include <linux/mutex.h>
10#include <linux/ftrace.h>
11
12#include "trace_output.h"
13
14/* must be a power of 2 */
15#define EVENT_HASHSIZE 128
16
17static DEFINE_MUTEX(trace_event_mutex);
18static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
19
20static int next_event_type = __TRACE_LAST_TYPE + 1;
21
22/**
23 * trace_seq_printf - sequence printing of trace information
24 * @s: trace sequence descriptor
25 * @fmt: printf format string
26 *
27 * The tracer may use either sequence operations or its own
28 * copy to user routines. To simplify formating of a trace
29 * trace_seq_printf is used to store strings into a special
30 * buffer (@s). Then the output may be either used by
31 * the sequencer or pulled into another buffer.
32 */
33int
34trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
35{
36 int len = (PAGE_SIZE - 1) - s->len;
37 va_list ap;
38 int ret;
39
40 if (!len)
41 return 0;
42
43 va_start(ap, fmt);
44 ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
45 va_end(ap);
46
47 /* If we can't write it all, don't bother writing anything */
48 if (ret >= len)
49 return 0;
50
51 s->len += ret;
52
53 return len;
54}
55
56int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
57{
58 int len = (PAGE_SIZE - 1) - s->len;
59 int ret;
60
61 if (!len)
62 return 0;
63
64 ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
65
66 /* If we can't write it all, don't bother writing anything */
67 if (ret >= len)
68 return 0;
69
70 s->len += ret;
71
72 return len;
73}
74
75/**
76 * trace_seq_puts - trace sequence printing of simple string
77 * @s: trace sequence descriptor
78 * @str: simple string to record
79 *
80 * The tracer may use either the sequence operations or its own
81 * copy to user routines. This function records a simple string
82 * into a special buffer (@s) for later retrieval by a sequencer
83 * or other mechanism.
84 */
85int trace_seq_puts(struct trace_seq *s, const char *str)
86{
87 int len = strlen(str);
88
89 if (len > ((PAGE_SIZE - 1) - s->len))
90 return 0;
91
92 memcpy(s->buffer + s->len, str, len);
93 s->len += len;
94
95 return len;
96}
97
98int trace_seq_putc(struct trace_seq *s, unsigned char c)
99{
100 if (s->len >= (PAGE_SIZE - 1))
101 return 0;
102
103 s->buffer[s->len++] = c;
104
105 return 1;
106}
107
108int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)
109{
110 if (len > ((PAGE_SIZE - 1) - s->len))
111 return 0;
112
113 memcpy(s->buffer + s->len, mem, len);
114 s->len += len;
115
116 return len;
117}
118
119int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
120{
121 unsigned char hex[HEX_CHARS];
122 unsigned char *data = mem;
123 int i, j;
124
125#ifdef __BIG_ENDIAN
126 for (i = 0, j = 0; i < len; i++) {
127#else
128 for (i = len-1, j = 0; i >= 0; i--) {
129#endif
130 hex[j++] = hex_asc_hi(data[i]);
131 hex[j++] = hex_asc_lo(data[i]);
132 }
133 hex[j++] = ' ';
134
135 return trace_seq_putmem(s, hex, j);
136}
137
138int trace_seq_path(struct trace_seq *s, struct path *path)
139{
140 unsigned char *p;
141
142 if (s->len >= (PAGE_SIZE - 1))
143 return 0;
144 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
145 if (!IS_ERR(p)) {
146 p = mangle_path(s->buffer + s->len, p, "\n");
147 if (p) {
148 s->len = p - s->buffer;
149 return 1;
150 }
151 } else {
152 s->buffer[s->len++] = '?';
153 return 1;
154 }
155
156 return 0;
157}
158
159#ifdef CONFIG_KRETPROBES
160static inline const char *kretprobed(const char *name)
161{
162 static const char tramp_name[] = "kretprobe_trampoline";
163 int size = sizeof(tramp_name);
164
165 if (strncmp(tramp_name, name, size) == 0)
166 return "[unknown/kretprobe'd]";
167 return name;
168}
169#else
170static inline const char *kretprobed(const char *name)
171{
172 return name;
173}
174#endif /* CONFIG_KRETPROBES */
175
176static int
177seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
178{
179#ifdef CONFIG_KALLSYMS
180 char str[KSYM_SYMBOL_LEN];
181 const char *name;
182
183 kallsyms_lookup(address, NULL, NULL, NULL, str);
184
185 name = kretprobed(str);
186
187 return trace_seq_printf(s, fmt, name);
188#endif
189 return 1;
190}
191
192static int
193seq_print_sym_offset(struct trace_seq *s, const char *fmt,
194 unsigned long address)
195{
196#ifdef CONFIG_KALLSYMS
197 char str[KSYM_SYMBOL_LEN];
198 const char *name;
199
200 sprint_symbol(str, address);
201 name = kretprobed(str);
202
203 return trace_seq_printf(s, fmt, name);
204#endif
205 return 1;
206}
207
208#ifndef CONFIG_64BIT
209# define IP_FMT "%08lx"
210#else
211# define IP_FMT "%016lx"
212#endif
213
214int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
215 unsigned long ip, unsigned long sym_flags)
216{
217 struct file *file = NULL;
218 unsigned long vmstart = 0;
219 int ret = 1;
220
221 if (mm) {
222 const struct vm_area_struct *vma;
223
224 down_read(&mm->mmap_sem);
225 vma = find_vma(mm, ip);
226 if (vma) {
227 file = vma->vm_file;
228 vmstart = vma->vm_start;
229 }
230 if (file) {
231 ret = trace_seq_path(s, &file->f_path);
232 if (ret)
233 ret = trace_seq_printf(s, "[+0x%lx]",
234 ip - vmstart);
235 }
236 up_read(&mm->mmap_sem);
237 }
238 if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
239 ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
240 return ret;
241}
242
243int
244seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
245 unsigned long sym_flags)
246{
247 struct mm_struct *mm = NULL;
248 int ret = 1;
249 unsigned int i;
250
251 if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
252 struct task_struct *task;
253 /*
254 * we do the lookup on the thread group leader,
255 * since individual threads might have already quit!
256 */
257 rcu_read_lock();
258 task = find_task_by_vpid(entry->ent.tgid);
259 if (task)
260 mm = get_task_mm(task);
261 rcu_read_unlock();
262 }
263
264 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
265 unsigned long ip = entry->caller[i];
266
267 if (ip == ULONG_MAX || !ret)
268 break;
269 if (i && ret)
270 ret = trace_seq_puts(s, " <- ");
271 if (!ip) {
272 if (ret)
273 ret = trace_seq_puts(s, "??");
274 continue;
275 }
276 if (!ret)
277 break;
278 if (ret)
279 ret = seq_print_user_ip(s, mm, ip, sym_flags);
280 }
281
282 if (mm)
283 mmput(mm);
284 return ret;
285}
286
287int
288seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
289{
290 int ret;
291
292 if (!ip)
293 return trace_seq_printf(s, "0");
294
295 if (sym_flags & TRACE_ITER_SYM_OFFSET)
296 ret = seq_print_sym_offset(s, "%s", ip);
297 else
298 ret = seq_print_sym_short(s, "%s", ip);
299
300 if (!ret)
301 return 0;
302
303 if (sym_flags & TRACE_ITER_SYM_ADDR)
304 ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
305 return ret;
306}
307
308static int
309lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
310{
311 int hardirq, softirq;
312 char comm[TASK_COMM_LEN];
313
314 trace_find_cmdline(entry->pid, comm);
315 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
316 softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
317
318 if (!trace_seq_printf(s, "%8.8s-%-5d %3d%c%c%c",
319 comm, entry->pid, cpu,
320 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
321 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
322 'X' : '.',
323 (entry->flags & TRACE_FLAG_NEED_RESCHED) ?
324 'N' : '.',
325 (hardirq && softirq) ? 'H' :
326 hardirq ? 'h' : softirq ? 's' : '.'))
327 return 0;
328
329 if (entry->preempt_count)
330 return trace_seq_printf(s, "%x", entry->preempt_count);
331 return trace_seq_puts(s, ".");
332}
333
334static unsigned long preempt_mark_thresh = 100;
335
336static int
337lat_print_timestamp(struct trace_seq *s, u64 abs_usecs,
338 unsigned long rel_usecs)
339{
340 return trace_seq_printf(s, " %4lldus%c: ", abs_usecs,
341 rel_usecs > preempt_mark_thresh ? '!' :
342 rel_usecs > 1 ? '+' : ' ');
343}
344
345int trace_print_context(struct trace_iterator *iter)
346{
347 struct trace_seq *s = &iter->seq;
348 struct trace_entry *entry = iter->ent;
349 unsigned long long t = ns2usecs(iter->ts);
350 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
351 unsigned long secs = (unsigned long)t;
352 char comm[TASK_COMM_LEN];
353
354 trace_find_cmdline(entry->pid, comm);
355
356 return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ",
357 comm, entry->pid, iter->cpu, secs, usec_rem);
358}
359
360int trace_print_lat_context(struct trace_iterator *iter)
361{
362 u64 next_ts;
363 int ret;
364 struct trace_seq *s = &iter->seq;
365 struct trace_entry *entry = iter->ent,
366 *next_entry = trace_find_next_entry(iter, NULL,
367 &next_ts);
368 unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
369 unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start);
370 unsigned long rel_usecs;
371
372 if (!next_entry)
373 next_ts = iter->ts;
374 rel_usecs = ns2usecs(next_ts - iter->ts);
375
376 if (verbose) {
377 char comm[TASK_COMM_LEN];
378
379 trace_find_cmdline(entry->pid, comm);
380
381 ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08lx]"
382 " %ld.%03ldms (+%ld.%03ldms): ", comm,
383 entry->pid, iter->cpu, entry->flags,
384 entry->preempt_count, iter->idx,
385 ns2usecs(iter->ts),
386 abs_usecs / USEC_PER_MSEC,
387 abs_usecs % USEC_PER_MSEC,
388 rel_usecs / USEC_PER_MSEC,
389 rel_usecs % USEC_PER_MSEC);
390 } else {
391 ret = lat_print_generic(s, entry, iter->cpu);
392 if (ret)
393 ret = lat_print_timestamp(s, abs_usecs, rel_usecs);
394 }
395
396 return ret;
397}
398
399static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
400
401static int task_state_char(unsigned long state)
402{
403 int bit = state ? __ffs(state) + 1 : 0;
404
405 return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
406}
407
408/**
409 * ftrace_find_event - find a registered event
410 * @type: the type of event to look for
411 *
412 * Returns an event of type @type otherwise NULL
413 */
414struct trace_event *ftrace_find_event(int type)
415{
416 struct trace_event *event;
417 struct hlist_node *n;
418 unsigned key;
419
420 key = type & (EVENT_HASHSIZE - 1);
421
422 hlist_for_each_entry_rcu(event, n, &event_hash[key], node) {
423 if (event->type == type)
424 return event;
425 }
426
427 return NULL;
428}
429
430/**
431 * register_ftrace_event - register output for an event type
432 * @event: the event type to register
433 *
434 * Event types are stored in a hash and this hash is used to
435 * find a way to print an event. If the @event->type is set
436 * then it will use that type, otherwise it will assign a
437 * type to use.
438 *
439 * If you assign your own type, please make sure it is added
440 * to the trace_type enum in trace.h, to avoid collisions
441 * with the dynamic types.
442 *
443 * Returns the event type number or zero on error.
444 */
445int register_ftrace_event(struct trace_event *event)
446{
447 unsigned key;
448 int ret = 0;
449
450 mutex_lock(&trace_event_mutex);
451
452 if (!event->type)
453 event->type = next_event_type++;
454 else if (event->type > __TRACE_LAST_TYPE) {
455 printk(KERN_WARNING "Need to add type to trace.h\n");
456 WARN_ON(1);
457 }
458
459 if (ftrace_find_event(event->type))
460 goto out;
461
462 if (event->trace == NULL)
463 event->trace = trace_nop_print;
464 if (event->raw == NULL)
465 event->raw = trace_nop_print;
466 if (event->hex == NULL)
467 event->hex = trace_nop_print;
468 if (event->binary == NULL)
469 event->binary = trace_nop_print;
470
471 key = event->type & (EVENT_HASHSIZE - 1);
472
473 hlist_add_head_rcu(&event->node, &event_hash[key]);
474
475 ret = event->type;
476 out:
477 mutex_unlock(&trace_event_mutex);
478
479 return ret;
480}
481
482/**
483 * unregister_ftrace_event - remove a no longer used event
484 * @event: the event to remove
485 */
486int unregister_ftrace_event(struct trace_event *event)
487{
488 mutex_lock(&trace_event_mutex);
489 hlist_del(&event->node);
490 mutex_unlock(&trace_event_mutex);
491
492 return 0;
493}
494
495/*
496 * Standard events
497 */
498
499enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags)
500{
501 return TRACE_TYPE_HANDLED;
502}
503
504/* TRACE_FN */
505static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags)
506{
507 struct ftrace_entry *field;
508 struct trace_seq *s = &iter->seq;
509
510 trace_assign_type(field, iter->ent);
511
512 if (!seq_print_ip_sym(s, field->ip, flags))
513 goto partial;
514
515 if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) {
516 if (!trace_seq_printf(s, " <-"))
517 goto partial;
518 if (!seq_print_ip_sym(s,
519 field->parent_ip,
520 flags))
521 goto partial;
522 }
523 if (!trace_seq_printf(s, "\n"))
524 goto partial;
525
526 return TRACE_TYPE_HANDLED;
527
528 partial:
529 return TRACE_TYPE_PARTIAL_LINE;
530}
531
532static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags)
533{
534 struct ftrace_entry *field;
535
536 trace_assign_type(field, iter->ent);
537
538 if (!trace_seq_printf(&iter->seq, "%lx %lx\n",
539 field->ip,
540 field->parent_ip))
541 return TRACE_TYPE_PARTIAL_LINE;
542
543 return TRACE_TYPE_HANDLED;
544}
545
546static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags)
547{
548 struct ftrace_entry *field;
549 struct trace_seq *s = &iter->seq;
550
551 trace_assign_type(field, iter->ent);
552
553 SEQ_PUT_HEX_FIELD_RET(s, field->ip);
554 SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip);
555
556 return TRACE_TYPE_HANDLED;
557}
558
559static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags)
560{
561 struct ftrace_entry *field;
562 struct trace_seq *s = &iter->seq;
563
564 trace_assign_type(field, iter->ent);
565
566 SEQ_PUT_FIELD_RET(s, field->ip);
567 SEQ_PUT_FIELD_RET(s, field->parent_ip);
568
569 return TRACE_TYPE_HANDLED;
570}
571
572static struct trace_event trace_fn_event = {
573 .type = TRACE_FN,
574 .trace = trace_fn_trace,
575 .raw = trace_fn_raw,
576 .hex = trace_fn_hex,
577 .binary = trace_fn_bin,
578};
579
580/* TRACE_CTX an TRACE_WAKE */
581static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
582 char *delim)
583{
584 struct ctx_switch_entry *field;
585 char comm[TASK_COMM_LEN];
586 int S, T;
587
588
589 trace_assign_type(field, iter->ent);
590
591 T = task_state_char(field->next_state);
592 S = task_state_char(field->prev_state);
593 trace_find_cmdline(field->next_pid, comm);
594 if (!trace_seq_printf(&iter->seq,
595 " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
596 field->prev_pid,
597 field->prev_prio,
598 S, delim,
599 field->next_cpu,
600 field->next_pid,
601 field->next_prio,
602 T, comm))
603 return TRACE_TYPE_PARTIAL_LINE;
604
605 return TRACE_TYPE_HANDLED;
606}
607
608static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags)
609{
610 return trace_ctxwake_print(iter, "==>");
611}
612
613static enum print_line_t trace_wake_print(struct trace_iterator *iter,
614 int flags)
615{
616 return trace_ctxwake_print(iter, " +");
617}
618
619static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
620{
621 struct ctx_switch_entry *field;
622 int T;
623
624 trace_assign_type(field, iter->ent);
625
626 if (!S)
627 task_state_char(field->prev_state);
628 T = task_state_char(field->next_state);
629 if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
630 field->prev_pid,
631 field->prev_prio,
632 S,
633 field->next_cpu,
634 field->next_pid,
635 field->next_prio,
636 T))
637 return TRACE_TYPE_PARTIAL_LINE;
638
639 return TRACE_TYPE_HANDLED;
640}
641
642static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags)
643{
644 return trace_ctxwake_raw(iter, 0);
645}
646
647static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags)
648{
649 return trace_ctxwake_raw(iter, '+');
650}
651
652
653static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
654{
655 struct ctx_switch_entry *field;
656 struct trace_seq *s = &iter->seq;
657 int T;
658
659 trace_assign_type(field, iter->ent);
660
661 if (!S)
662 task_state_char(field->prev_state);
663 T = task_state_char(field->next_state);
664
665 SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
666 SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio);
667 SEQ_PUT_HEX_FIELD_RET(s, S);
668 SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu);
669 SEQ_PUT_HEX_FIELD_RET(s, field->next_pid);
670 SEQ_PUT_HEX_FIELD_RET(s, field->next_prio);
671 SEQ_PUT_HEX_FIELD_RET(s, T);
672
673 return TRACE_TYPE_HANDLED;
674}
675
676static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags)
677{
678 return trace_ctxwake_hex(iter, 0);
679}
680
681static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags)
682{
683 return trace_ctxwake_hex(iter, '+');
684}
685
686static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
687 int flags)
688{
689 struct ctx_switch_entry *field;
690 struct trace_seq *s = &iter->seq;
691
692 trace_assign_type(field, iter->ent);
693
694 SEQ_PUT_FIELD_RET(s, field->prev_pid);
695 SEQ_PUT_FIELD_RET(s, field->prev_prio);
696 SEQ_PUT_FIELD_RET(s, field->prev_state);
697 SEQ_PUT_FIELD_RET(s, field->next_pid);
698 SEQ_PUT_FIELD_RET(s, field->next_prio);
699 SEQ_PUT_FIELD_RET(s, field->next_state);
700
701 return TRACE_TYPE_HANDLED;
702}
703
704static struct trace_event trace_ctx_event = {
705 .type = TRACE_CTX,
706 .trace = trace_ctx_print,
707 .raw = trace_ctx_raw,
708 .hex = trace_ctx_hex,
709 .binary = trace_ctxwake_bin,
710};
711
712static struct trace_event trace_wake_event = {
713 .type = TRACE_WAKE,
714 .trace = trace_wake_print,
715 .raw = trace_wake_raw,
716 .hex = trace_wake_hex,
717 .binary = trace_ctxwake_bin,
718};
719
720/* TRACE_SPECIAL */
721static enum print_line_t trace_special_print(struct trace_iterator *iter,
722 int flags)
723{
724 struct special_entry *field;
725
726 trace_assign_type(field, iter->ent);
727
728 if (!trace_seq_printf(&iter->seq, "# %ld %ld %ld\n",
729 field->arg1,
730 field->arg2,
731 field->arg3))
732 return TRACE_TYPE_PARTIAL_LINE;
733
734 return TRACE_TYPE_HANDLED;
735}
736
737static enum print_line_t trace_special_hex(struct trace_iterator *iter,
738 int flags)
739{
740 struct special_entry *field;
741 struct trace_seq *s = &iter->seq;
742
743 trace_assign_type(field, iter->ent);
744
745 SEQ_PUT_HEX_FIELD_RET(s, field->arg1);
746 SEQ_PUT_HEX_FIELD_RET(s, field->arg2);
747 SEQ_PUT_HEX_FIELD_RET(s, field->arg3);
748
749 return TRACE_TYPE_HANDLED;
750}
751
752static enum print_line_t trace_special_bin(struct trace_iterator *iter,
753 int flags)
754{
755 struct special_entry *field;
756 struct trace_seq *s = &iter->seq;
757
758 trace_assign_type(field, iter->ent);
759
760 SEQ_PUT_FIELD_RET(s, field->arg1);
761 SEQ_PUT_FIELD_RET(s, field->arg2);
762 SEQ_PUT_FIELD_RET(s, field->arg3);
763
764 return TRACE_TYPE_HANDLED;
765}
766
767static struct trace_event trace_special_event = {
768 .type = TRACE_SPECIAL,
769 .trace = trace_special_print,
770 .raw = trace_special_print,
771 .hex = trace_special_hex,
772 .binary = trace_special_bin,
773};
774
775/* TRACE_STACK */
776
777static enum print_line_t trace_stack_print(struct trace_iterator *iter,
778 int flags)
779{
780 struct stack_entry *field;
781 struct trace_seq *s = &iter->seq;
782 int i;
783
784 trace_assign_type(field, iter->ent);
785
786 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
787 if (i) {
788 if (!trace_seq_puts(s, " <= "))
789 goto partial;
790
791 if (!seq_print_ip_sym(s, field->caller[i], flags))
792 goto partial;
793 }
794 if (!trace_seq_puts(s, "\n"))
795 goto partial;
796 }
797
798 return TRACE_TYPE_HANDLED;
799
800 partial:
801 return TRACE_TYPE_PARTIAL_LINE;
802}
803
804static struct trace_event trace_stack_event = {
805 .type = TRACE_STACK,
806 .trace = trace_stack_print,
807 .raw = trace_special_print,
808 .hex = trace_special_hex,
809 .binary = trace_special_bin,
810};
811
812/* TRACE_USER_STACK */
813static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
814 int flags)
815{
816 struct userstack_entry *field;
817 struct trace_seq *s = &iter->seq;
818
819 trace_assign_type(field, iter->ent);
820
821 if (!seq_print_userip_objs(field, s, flags))
822 goto partial;
823
824 if (!trace_seq_putc(s, '\n'))
825 goto partial;
826
827 return TRACE_TYPE_HANDLED;
828
829 partial:
830 return TRACE_TYPE_PARTIAL_LINE;
831}
832
833static struct trace_event trace_user_stack_event = {
834 .type = TRACE_USER_STACK,
835 .trace = trace_user_stack_print,
836 .raw = trace_special_print,
837 .hex = trace_special_hex,
838 .binary = trace_special_bin,
839};
840
841/* TRACE_BPRINT */
842static enum print_line_t
843trace_bprint_print(struct trace_iterator *iter, int flags)
844{
845 struct trace_entry *entry = iter->ent;
846 struct trace_seq *s = &iter->seq;
847 struct bprint_entry *field;
848
849 trace_assign_type(field, entry);
850
851 if (!seq_print_ip_sym(s, field->ip, flags))
852 goto partial;
853
854 if (!trace_seq_puts(s, ": "))
855 goto partial;
856
857 if (!trace_seq_bprintf(s, field->fmt, field->buf))
858 goto partial;
859
860 return TRACE_TYPE_HANDLED;
861
862 partial:
863 return TRACE_TYPE_PARTIAL_LINE;
864}
865
866
867static enum print_line_t
868trace_bprint_raw(struct trace_iterator *iter, int flags)
869{
870 struct bprint_entry *field;
871 struct trace_seq *s = &iter->seq;
872
873 trace_assign_type(field, iter->ent);
874
875 if (!trace_seq_printf(s, ": %lx : ", field->ip))
876 goto partial;
877
878 if (!trace_seq_bprintf(s, field->fmt, field->buf))
879 goto partial;
880
881 return TRACE_TYPE_HANDLED;
882
883 partial:
884 return TRACE_TYPE_PARTIAL_LINE;
885}
886
887
888static struct trace_event trace_bprint_event = {
889 .type = TRACE_BPRINT,
890 .trace = trace_bprint_print,
891 .raw = trace_bprint_raw,
892};
893
894/* TRACE_PRINT */
895static enum print_line_t trace_print_print(struct trace_iterator *iter,
896 int flags)
897{
898 struct print_entry *field;
899 struct trace_seq *s = &iter->seq;
900
901 trace_assign_type(field, iter->ent);
902
903 if (!seq_print_ip_sym(s, field->ip, flags))
904 goto partial;
905
906 if (!trace_seq_printf(s, ": %s", field->buf))
907 goto partial;
908
909 return TRACE_TYPE_HANDLED;
910
911 partial:
912 return TRACE_TYPE_PARTIAL_LINE;
913}
914
915static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
916{
917 struct print_entry *field;
918
919 trace_assign_type(field, iter->ent);
920
921 if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf))
922 goto partial;
923
924 return TRACE_TYPE_HANDLED;
925
926 partial:
927 return TRACE_TYPE_PARTIAL_LINE;
928}
929
930static struct trace_event trace_print_event = {
931 .type = TRACE_PRINT,
932 .trace = trace_print_print,
933 .raw = trace_print_raw,
934};
935
936
937static struct trace_event *events[] __initdata = {
938 &trace_fn_event,
939 &trace_ctx_event,
940 &trace_wake_event,
941 &trace_special_event,
942 &trace_stack_event,
943 &trace_user_stack_event,
944 &trace_bprint_event,
945 &trace_print_event,
946 NULL
947};
948
949__init static int init_events(void)
950{
951 struct trace_event *event;
952 int i, ret;
953
954 for (i = 0; events[i]; i++) {
955 event = events[i];
956
957 ret = register_ftrace_event(event);
958 if (!ret) {
959 printk(KERN_WARNING "event %d failed to register\n",
960 event->type);
961 WARN_ON_ONCE(1);
962 }
963 }
964
965 return 0;
966}
967device_initcall(init_events);
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
new file mode 100644
index 000000000000..3b90e6ade1aa
--- /dev/null
+++ b/kernel/trace/trace_output.h
@@ -0,0 +1,63 @@
1#ifndef __TRACE_EVENTS_H
2#define __TRACE_EVENTS_H
3
4#include "trace.h"
5
6typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
7 int flags);
8
9struct trace_event {
10 struct hlist_node node;
11 int type;
12 trace_print_func trace;
13 trace_print_func raw;
14 trace_print_func hex;
15 trace_print_func binary;
16};
17
18extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
19 __attribute__ ((format (printf, 2, 3)));
20extern int
21trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary);
22extern int
23seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
24 unsigned long sym_flags);
25extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
26 size_t cnt);
27int trace_seq_puts(struct trace_seq *s, const char *str);
28int trace_seq_putc(struct trace_seq *s, unsigned char c);
29int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len);
30int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len);
31int trace_seq_path(struct trace_seq *s, struct path *path);
32int seq_print_userip_objs(const struct userstack_entry *entry,
33 struct trace_seq *s, unsigned long sym_flags);
34int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
35 unsigned long ip, unsigned long sym_flags);
36
37int trace_print_context(struct trace_iterator *iter);
38int trace_print_lat_context(struct trace_iterator *iter);
39
40struct trace_event *ftrace_find_event(int type);
41int register_ftrace_event(struct trace_event *event);
42int unregister_ftrace_event(struct trace_event *event);
43
44enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags);
45
46#define MAX_MEMHEX_BYTES 8
47#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
48
49#define SEQ_PUT_FIELD_RET(s, x) \
50do { \
51 if (!trace_seq_putmem(s, &(x), sizeof(x))) \
52 return TRACE_TYPE_PARTIAL_LINE; \
53} while (0)
54
55#define SEQ_PUT_HEX_FIELD_RET(s, x) \
56do { \
57 BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \
58 if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \
59 return TRACE_TYPE_PARTIAL_LINE; \
60} while (0)
61
62#endif
63
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index 7bda248daf55..bae791ebcc51 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -11,15 +11,113 @@
11 11
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/debugfs.h> 13#include <linux/debugfs.h>
14#include <linux/ftrace.h> 14#include <trace/power.h>
15#include <linux/kallsyms.h> 15#include <linux/kallsyms.h>
16#include <linux/module.h> 16#include <linux/module.h>
17 17
18#include "trace.h" 18#include "trace.h"
19#include "trace_output.h"
19 20
20static struct trace_array *power_trace; 21static struct trace_array *power_trace;
21static int __read_mostly trace_power_enabled; 22static int __read_mostly trace_power_enabled;
22 23
24static void probe_power_start(struct power_trace *it, unsigned int type,
25 unsigned int level)
26{
27 if (!trace_power_enabled)
28 return;
29
30 memset(it, 0, sizeof(struct power_trace));
31 it->state = level;
32 it->type = type;
33 it->stamp = ktime_get();
34}
35
36
37static void probe_power_end(struct power_trace *it)
38{
39 struct ring_buffer_event *event;
40 struct trace_power *entry;
41 struct trace_array_cpu *data;
42 struct trace_array *tr = power_trace;
43
44 if (!trace_power_enabled)
45 return;
46
47 preempt_disable();
48 it->end = ktime_get();
49 data = tr->data[smp_processor_id()];
50
51 event = trace_buffer_lock_reserve(tr, TRACE_POWER,
52 sizeof(*entry), 0, 0);
53 if (!event)
54 goto out;
55 entry = ring_buffer_event_data(event);
56 entry->state_data = *it;
57 trace_buffer_unlock_commit(tr, event, 0, 0);
58 out:
59 preempt_enable();
60}
61
62static void probe_power_mark(struct power_trace *it, unsigned int type,
63 unsigned int level)
64{
65 struct ring_buffer_event *event;
66 struct trace_power *entry;
67 struct trace_array_cpu *data;
68 struct trace_array *tr = power_trace;
69
70 if (!trace_power_enabled)
71 return;
72
73 memset(it, 0, sizeof(struct power_trace));
74 it->state = level;
75 it->type = type;
76 it->stamp = ktime_get();
77 preempt_disable();
78 it->end = it->stamp;
79 data = tr->data[smp_processor_id()];
80
81 event = trace_buffer_lock_reserve(tr, TRACE_POWER,
82 sizeof(*entry), 0, 0);
83 if (!event)
84 goto out;
85 entry = ring_buffer_event_data(event);
86 entry->state_data = *it;
87 trace_buffer_unlock_commit(tr, event, 0, 0);
88 out:
89 preempt_enable();
90}
91
92static int tracing_power_register(void)
93{
94 int ret;
95
96 ret = register_trace_power_start(probe_power_start);
97 if (ret) {
98 pr_info("power trace: Couldn't activate tracepoint"
99 " probe to trace_power_start\n");
100 return ret;
101 }
102 ret = register_trace_power_end(probe_power_end);
103 if (ret) {
104 pr_info("power trace: Couldn't activate tracepoint"
105 " probe to trace_power_end\n");
106 goto fail_start;
107 }
108 ret = register_trace_power_mark(probe_power_mark);
109 if (ret) {
110 pr_info("power trace: Couldn't activate tracepoint"
111 " probe to trace_power_mark\n");
112 goto fail_end;
113 }
114 return ret;
115fail_end:
116 unregister_trace_power_end(probe_power_end);
117fail_start:
118 unregister_trace_power_start(probe_power_start);
119 return ret;
120}
23 121
24static void start_power_trace(struct trace_array *tr) 122static void start_power_trace(struct trace_array *tr)
25{ 123{
@@ -31,6 +129,14 @@ static void stop_power_trace(struct trace_array *tr)
31 trace_power_enabled = 0; 129 trace_power_enabled = 0;
32} 130}
33 131
132static void power_trace_reset(struct trace_array *tr)
133{
134 trace_power_enabled = 0;
135 unregister_trace_power_start(probe_power_start);
136 unregister_trace_power_end(probe_power_end);
137 unregister_trace_power_mark(probe_power_mark);
138}
139
34 140
35static int power_trace_init(struct trace_array *tr) 141static int power_trace_init(struct trace_array *tr)
36{ 142{
@@ -38,6 +144,7 @@ static int power_trace_init(struct trace_array *tr)
38 power_trace = tr; 144 power_trace = tr;
39 145
40 trace_power_enabled = 1; 146 trace_power_enabled = 1;
147 tracing_power_register();
41 148
42 for_each_cpu(cpu, cpu_possible_mask) 149 for_each_cpu(cpu, cpu_possible_mask)
43 tracing_reset(tr, cpu); 150 tracing_reset(tr, cpu);
@@ -85,7 +192,7 @@ static struct tracer power_tracer __read_mostly =
85 .init = power_trace_init, 192 .init = power_trace_init,
86 .start = start_power_trace, 193 .start = start_power_trace,
87 .stop = stop_power_trace, 194 .stop = stop_power_trace,
88 .reset = stop_power_trace, 195 .reset = power_trace_reset,
89 .print_line = power_print_line, 196 .print_line = power_print_line,
90}; 197};
91 198
@@ -94,86 +201,3 @@ static int init_power_trace(void)
94 return register_tracer(&power_tracer); 201 return register_tracer(&power_tracer);
95} 202}
96device_initcall(init_power_trace); 203device_initcall(init_power_trace);
97
98void trace_power_start(struct power_trace *it, unsigned int type,
99 unsigned int level)
100{
101 if (!trace_power_enabled)
102 return;
103
104 memset(it, 0, sizeof(struct power_trace));
105 it->state = level;
106 it->type = type;
107 it->stamp = ktime_get();
108}
109EXPORT_SYMBOL_GPL(trace_power_start);
110
111
112void trace_power_end(struct power_trace *it)
113{
114 struct ring_buffer_event *event;
115 struct trace_power *entry;
116 struct trace_array_cpu *data;
117 unsigned long irq_flags;
118 struct trace_array *tr = power_trace;
119
120 if (!trace_power_enabled)
121 return;
122
123 preempt_disable();
124 it->end = ktime_get();
125 data = tr->data[smp_processor_id()];
126
127 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
128 &irq_flags);
129 if (!event)
130 goto out;
131 entry = ring_buffer_event_data(event);
132 tracing_generic_entry_update(&entry->ent, 0, 0);
133 entry->ent.type = TRACE_POWER;
134 entry->state_data = *it;
135 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
136
137 trace_wake_up();
138
139 out:
140 preempt_enable();
141}
142EXPORT_SYMBOL_GPL(trace_power_end);
143
144void trace_power_mark(struct power_trace *it, unsigned int type,
145 unsigned int level)
146{
147 struct ring_buffer_event *event;
148 struct trace_power *entry;
149 struct trace_array_cpu *data;
150 unsigned long irq_flags;
151 struct trace_array *tr = power_trace;
152
153 if (!trace_power_enabled)
154 return;
155
156 memset(it, 0, sizeof(struct power_trace));
157 it->state = level;
158 it->type = type;
159 it->stamp = ktime_get();
160 preempt_disable();
161 it->end = it->stamp;
162 data = tr->data[smp_processor_id()];
163
164 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
165 &irq_flags);
166 if (!event)
167 goto out;
168 entry = ring_buffer_event_data(event);
169 tracing_generic_entry_update(&entry->ent, 0, 0);
170 entry->ent.type = TRACE_POWER;
171 entry->state_data = *it;
172 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
173
174 trace_wake_up();
175
176 out:
177 preempt_enable();
178}
179EXPORT_SYMBOL_GPL(trace_power_mark);
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
new file mode 100644
index 000000000000..486785214e3e
--- /dev/null
+++ b/kernel/trace/trace_printk.c
@@ -0,0 +1,270 @@
1/*
2 * trace binary printk
3 *
4 * Copyright (C) 2008 Lai Jiangshan <laijs@cn.fujitsu.com>
5 *
6 */
7#include <linux/seq_file.h>
8#include <linux/debugfs.h>
9#include <linux/uaccess.h>
10#include <linux/kernel.h>
11#include <linux/ftrace.h>
12#include <linux/string.h>
13#include <linux/module.h>
14#include <linux/marker.h>
15#include <linux/mutex.h>
16#include <linux/ctype.h>
17#include <linux/list.h>
18#include <linux/slab.h>
19#include <linux/fs.h>
20
21#include "trace.h"
22
23#ifdef CONFIG_MODULES
24
25/*
26 * modules trace_printk()'s formats are autosaved in struct trace_bprintk_fmt
27 * which are queued on trace_bprintk_fmt_list.
28 */
29static LIST_HEAD(trace_bprintk_fmt_list);
30
31/* serialize accesses to trace_bprintk_fmt_list */
32static DEFINE_MUTEX(btrace_mutex);
33
34struct trace_bprintk_fmt {
35 struct list_head list;
36 char fmt[0];
37};
38
39static inline struct trace_bprintk_fmt *lookup_format(const char *fmt)
40{
41 struct trace_bprintk_fmt *pos;
42 list_for_each_entry(pos, &trace_bprintk_fmt_list, list) {
43 if (!strcmp(pos->fmt, fmt))
44 return pos;
45 }
46 return NULL;
47}
48
49static
50void hold_module_trace_bprintk_format(const char **start, const char **end)
51{
52 const char **iter;
53
54 mutex_lock(&btrace_mutex);
55 for (iter = start; iter < end; iter++) {
56 struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter);
57 if (tb_fmt) {
58 *iter = tb_fmt->fmt;
59 continue;
60 }
61
62 tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt)
63 + strlen(*iter) + 1, GFP_KERNEL);
64 if (tb_fmt) {
65 list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
66 strcpy(tb_fmt->fmt, *iter);
67 *iter = tb_fmt->fmt;
68 } else
69 *iter = NULL;
70 }
71 mutex_unlock(&btrace_mutex);
72}
73
74static int module_trace_bprintk_format_notify(struct notifier_block *self,
75 unsigned long val, void *data)
76{
77 struct module *mod = data;
78 if (mod->num_trace_bprintk_fmt) {
79 const char **start = mod->trace_bprintk_fmt_start;
80 const char **end = start + mod->num_trace_bprintk_fmt;
81
82 if (val == MODULE_STATE_COMING)
83 hold_module_trace_bprintk_format(start, end);
84 }
85 return 0;
86}
87
88#else /* !CONFIG_MODULES */
89__init static int
90module_trace_bprintk_format_notify(struct notifier_block *self,
91 unsigned long val, void *data)
92{
93 return 0;
94}
95#endif /* CONFIG_MODULES */
96
97
98__initdata_or_module static
99struct notifier_block module_trace_bprintk_format_nb = {
100 .notifier_call = module_trace_bprintk_format_notify,
101};
102
103int __trace_bprintk(unsigned long ip, const char *fmt, ...)
104 {
105 int ret;
106 va_list ap;
107
108 if (unlikely(!fmt))
109 return 0;
110
111 if (!(trace_flags & TRACE_ITER_PRINTK))
112 return 0;
113
114 va_start(ap, fmt);
115 ret = trace_vbprintk(ip, task_curr_ret_stack(current), fmt, ap);
116 va_end(ap);
117 return ret;
118}
119EXPORT_SYMBOL_GPL(__trace_bprintk);
120
121int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap)
122 {
123 if (unlikely(!fmt))
124 return 0;
125
126 if (!(trace_flags & TRACE_ITER_PRINTK))
127 return 0;
128
129 return trace_vbprintk(ip, task_curr_ret_stack(current), fmt, ap);
130}
131EXPORT_SYMBOL_GPL(__ftrace_vbprintk);
132
133int __trace_printk(unsigned long ip, const char *fmt, ...)
134{
135 int ret;
136 va_list ap;
137
138 if (!(trace_flags & TRACE_ITER_PRINTK))
139 return 0;
140
141 va_start(ap, fmt);
142 ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
143 va_end(ap);
144 return ret;
145}
146EXPORT_SYMBOL_GPL(__trace_printk);
147
148int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
149{
150 if (!(trace_flags & TRACE_ITER_PRINTK))
151 return 0;
152
153 return trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
154}
155EXPORT_SYMBOL_GPL(__ftrace_vprintk);
156
157static void *
158t_next(struct seq_file *m, void *v, loff_t *pos)
159{
160 const char **fmt = m->private;
161 const char **next = fmt;
162
163 (*pos)++;
164
165 if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt)
166 return NULL;
167
168 next = fmt;
169 m->private = ++next;
170
171 return fmt;
172}
173
174static void *t_start(struct seq_file *m, loff_t *pos)
175{
176 return t_next(m, NULL, pos);
177}
178
179static int t_show(struct seq_file *m, void *v)
180{
181 const char **fmt = v;
182 const char *str = *fmt;
183 int i;
184
185 seq_printf(m, "0x%lx : \"", (unsigned long)fmt);
186
187 /*
188 * Tabs and new lines need to be converted.
189 */
190 for (i = 0; str[i]; i++) {
191 switch (str[i]) {
192 case '\n':
193 seq_puts(m, "\\n");
194 break;
195 case '\t':
196 seq_puts(m, "\\t");
197 break;
198 case '\\':
199 seq_puts(m, "\\");
200 break;
201 case '"':
202 seq_puts(m, "\\\"");
203 break;
204 default:
205 seq_putc(m, str[i]);
206 }
207 }
208 seq_puts(m, "\"\n");
209
210 return 0;
211}
212
213static void t_stop(struct seq_file *m, void *p)
214{
215}
216
217static const struct seq_operations show_format_seq_ops = {
218 .start = t_start,
219 .next = t_next,
220 .show = t_show,
221 .stop = t_stop,
222};
223
224static int
225ftrace_formats_open(struct inode *inode, struct file *file)
226{
227 int ret;
228
229 ret = seq_open(file, &show_format_seq_ops);
230 if (!ret) {
231 struct seq_file *m = file->private_data;
232
233 m->private = __start___trace_bprintk_fmt;
234 }
235 return ret;
236}
237
238static const struct file_operations ftrace_formats_fops = {
239 .open = ftrace_formats_open,
240 .read = seq_read,
241 .llseek = seq_lseek,
242 .release = seq_release,
243};
244
245static __init int init_trace_printk_function_export(void)
246{
247 struct dentry *d_tracer;
248 struct dentry *entry;
249
250 d_tracer = tracing_init_dentry();
251 if (!d_tracer)
252 return 0;
253
254 entry = debugfs_create_file("printk_formats", 0444, d_tracer,
255 NULL, &ftrace_formats_fops);
256 if (!entry)
257 pr_warning("Could not create debugfs "
258 "'printk_formats' entry\n");
259
260 return 0;
261}
262
263fs_initcall(init_trace_printk_function_export);
264
265static __init int init_trace_printk(void)
266{
267 return register_module_notifier(&module_trace_bprintk_format_nb);
268}
269
270early_initcall(init_trace_printk);
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index df175cb4564f..de35f200abd3 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -18,6 +18,7 @@ static struct trace_array *ctx_trace;
18static int __read_mostly tracer_enabled; 18static int __read_mostly tracer_enabled;
19static int sched_ref; 19static int sched_ref;
20static DEFINE_MUTEX(sched_register_mutex); 20static DEFINE_MUTEX(sched_register_mutex);
21static int sched_stopped;
21 22
22static void 23static void
23probe_sched_switch(struct rq *__rq, struct task_struct *prev, 24probe_sched_switch(struct rq *__rq, struct task_struct *prev,
@@ -28,7 +29,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
28 int cpu; 29 int cpu;
29 int pc; 30 int pc;
30 31
31 if (!sched_ref) 32 if (!sched_ref || sched_stopped)
32 return; 33 return;
33 34
34 tracing_record_cmdline(prev); 35 tracing_record_cmdline(prev);
@@ -43,7 +44,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
43 data = ctx_trace->data[cpu]; 44 data = ctx_trace->data[cpu];
44 45
45 if (likely(!atomic_read(&data->disabled))) 46 if (likely(!atomic_read(&data->disabled)))
46 tracing_sched_switch_trace(ctx_trace, data, prev, next, flags, pc); 47 tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc);
47 48
48 local_irq_restore(flags); 49 local_irq_restore(flags);
49} 50}
@@ -66,7 +67,7 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
66 data = ctx_trace->data[cpu]; 67 data = ctx_trace->data[cpu];
67 68
68 if (likely(!atomic_read(&data->disabled))) 69 if (likely(!atomic_read(&data->disabled)))
69 tracing_sched_wakeup_trace(ctx_trace, data, wakee, current, 70 tracing_sched_wakeup_trace(ctx_trace, wakee, current,
70 flags, pc); 71 flags, pc);
71 72
72 local_irq_restore(flags); 73 local_irq_restore(flags);
@@ -93,7 +94,7 @@ static int tracing_sched_register(void)
93 ret = register_trace_sched_switch(probe_sched_switch); 94 ret = register_trace_sched_switch(probe_sched_switch);
94 if (ret) { 95 if (ret) {
95 pr_info("sched trace: Couldn't activate tracepoint" 96 pr_info("sched trace: Couldn't activate tracepoint"
96 " probe to kernel_sched_schedule\n"); 97 " probe to kernel_sched_switch\n");
97 goto fail_deprobe_wake_new; 98 goto fail_deprobe_wake_new;
98 } 99 }
99 100
@@ -185,12 +186,6 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr)
185 ctx_trace = tr; 186 ctx_trace = tr;
186} 187}
187 188
188static void start_sched_trace(struct trace_array *tr)
189{
190 tracing_reset_online_cpus(tr);
191 tracing_start_sched_switch_record();
192}
193
194static void stop_sched_trace(struct trace_array *tr) 189static void stop_sched_trace(struct trace_array *tr)
195{ 190{
196 tracing_stop_sched_switch_record(); 191 tracing_stop_sched_switch_record();
@@ -199,7 +194,8 @@ static void stop_sched_trace(struct trace_array *tr)
199static int sched_switch_trace_init(struct trace_array *tr) 194static int sched_switch_trace_init(struct trace_array *tr)
200{ 195{
201 ctx_trace = tr; 196 ctx_trace = tr;
202 start_sched_trace(tr); 197 tracing_reset_online_cpus(tr);
198 tracing_start_sched_switch_record();
203 return 0; 199 return 0;
204} 200}
205 201
@@ -211,13 +207,12 @@ static void sched_switch_trace_reset(struct trace_array *tr)
211 207
212static void sched_switch_trace_start(struct trace_array *tr) 208static void sched_switch_trace_start(struct trace_array *tr)
213{ 209{
214 tracing_reset_online_cpus(tr); 210 sched_stopped = 0;
215 tracing_start_sched_switch();
216} 211}
217 212
218static void sched_switch_trace_stop(struct trace_array *tr) 213static void sched_switch_trace_stop(struct trace_array *tr)
219{ 214{
220 tracing_stop_sched_switch(); 215 sched_stopped = 1;
221} 216}
222 217
223static struct tracer sched_switch_trace __read_mostly = 218static struct tracer sched_switch_trace __read_mostly =
@@ -227,6 +222,7 @@ static struct tracer sched_switch_trace __read_mostly =
227 .reset = sched_switch_trace_reset, 222 .reset = sched_switch_trace_reset,
228 .start = sched_switch_trace_start, 223 .start = sched_switch_trace_start,
229 .stop = sched_switch_trace_stop, 224 .stop = sched_switch_trace_stop,
225 .wait_pipe = poll_wait_pipe,
230#ifdef CONFIG_FTRACE_SELFTEST 226#ifdef CONFIG_FTRACE_SELFTEST
231 .selftest = trace_selftest_startup_sched_switch, 227 .selftest = trace_selftest_startup_sched_switch,
232#endif 228#endif
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 42ae1e77b6b3..3c5ad6b2ec84 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -25,12 +25,15 @@ static int __read_mostly tracer_enabled;
25static struct task_struct *wakeup_task; 25static struct task_struct *wakeup_task;
26static int wakeup_cpu; 26static int wakeup_cpu;
27static unsigned wakeup_prio = -1; 27static unsigned wakeup_prio = -1;
28static int wakeup_rt;
28 29
29static raw_spinlock_t wakeup_lock = 30static raw_spinlock_t wakeup_lock =
30 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 31 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
31 32
32static void __wakeup_reset(struct trace_array *tr); 33static void __wakeup_reset(struct trace_array *tr);
33 34
35static int save_lat_flag;
36
34#ifdef CONFIG_FUNCTION_TRACER 37#ifdef CONFIG_FUNCTION_TRACER
35/* 38/*
36 * irqsoff uses its own tracer function to keep the overhead down: 39 * irqsoff uses its own tracer function to keep the overhead down:
@@ -71,7 +74,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
71 if (task_cpu(wakeup_task) != cpu) 74 if (task_cpu(wakeup_task) != cpu)
72 goto unlock; 75 goto unlock;
73 76
74 trace_function(tr, data, ip, parent_ip, flags, pc); 77 trace_function(tr, ip, parent_ip, flags, pc);
75 78
76 unlock: 79 unlock:
77 __raw_spin_unlock(&wakeup_lock); 80 __raw_spin_unlock(&wakeup_lock);
@@ -151,7 +154,8 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
151 if (unlikely(!tracer_enabled || next != wakeup_task)) 154 if (unlikely(!tracer_enabled || next != wakeup_task))
152 goto out_unlock; 155 goto out_unlock;
153 156
154 trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2, flags, pc); 157 trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
158 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
155 159
156 /* 160 /*
157 * usecs conversion is slow so we try to delay the conversion 161 * usecs conversion is slow so we try to delay the conversion
@@ -182,13 +186,10 @@ out:
182 186
183static void __wakeup_reset(struct trace_array *tr) 187static void __wakeup_reset(struct trace_array *tr)
184{ 188{
185 struct trace_array_cpu *data;
186 int cpu; 189 int cpu;
187 190
188 for_each_possible_cpu(cpu) { 191 for_each_possible_cpu(cpu)
189 data = tr->data[cpu];
190 tracing_reset(tr, cpu); 192 tracing_reset(tr, cpu);
191 }
192 193
193 wakeup_cpu = -1; 194 wakeup_cpu = -1;
194 wakeup_prio = -1; 195 wakeup_prio = -1;
@@ -213,6 +214,7 @@ static void wakeup_reset(struct trace_array *tr)
213static void 214static void
214probe_wakeup(struct rq *rq, struct task_struct *p, int success) 215probe_wakeup(struct rq *rq, struct task_struct *p, int success)
215{ 216{
217 struct trace_array_cpu *data;
216 int cpu = smp_processor_id(); 218 int cpu = smp_processor_id();
217 unsigned long flags; 219 unsigned long flags;
218 long disabled; 220 long disabled;
@@ -224,7 +226,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
224 tracing_record_cmdline(p); 226 tracing_record_cmdline(p);
225 tracing_record_cmdline(current); 227 tracing_record_cmdline(current);
226 228
227 if (likely(!rt_task(p)) || 229 if ((wakeup_rt && !rt_task(p)) ||
228 p->prio >= wakeup_prio || 230 p->prio >= wakeup_prio ||
229 p->prio >= current->prio) 231 p->prio >= current->prio)
230 return; 232 return;
@@ -252,9 +254,10 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
252 254
253 local_save_flags(flags); 255 local_save_flags(flags);
254 256
255 wakeup_trace->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu); 257 data = wakeup_trace->data[wakeup_cpu];
256 trace_function(wakeup_trace, wakeup_trace->data[wakeup_cpu], 258 data->preempt_timestamp = ftrace_now(cpu);
257 CALLER_ADDR1, CALLER_ADDR2, flags, pc); 259 tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc);
260 trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
258 261
259out_locked: 262out_locked:
260 __raw_spin_unlock(&wakeup_lock); 263 __raw_spin_unlock(&wakeup_lock);
@@ -262,12 +265,6 @@ out:
262 atomic_dec(&wakeup_trace->data[cpu]->disabled); 265 atomic_dec(&wakeup_trace->data[cpu]->disabled);
263} 266}
264 267
265/*
266 * save_tracer_enabled is used to save the state of the tracer_enabled
267 * variable when we disable it when we open a trace output file.
268 */
269static int save_tracer_enabled;
270
271static void start_wakeup_tracer(struct trace_array *tr) 268static void start_wakeup_tracer(struct trace_array *tr)
272{ 269{
273 int ret; 270 int ret;
@@ -289,7 +286,7 @@ static void start_wakeup_tracer(struct trace_array *tr)
289 ret = register_trace_sched_switch(probe_wakeup_sched_switch); 286 ret = register_trace_sched_switch(probe_wakeup_sched_switch);
290 if (ret) { 287 if (ret) {
291 pr_info("sched trace: Couldn't activate tracepoint" 288 pr_info("sched trace: Couldn't activate tracepoint"
292 " probe to kernel_sched_schedule\n"); 289 " probe to kernel_sched_switch\n");
293 goto fail_deprobe_wake_new; 290 goto fail_deprobe_wake_new;
294 } 291 }
295 292
@@ -306,13 +303,10 @@ static void start_wakeup_tracer(struct trace_array *tr)
306 303
307 register_ftrace_function(&trace_ops); 304 register_ftrace_function(&trace_ops);
308 305
309 if (tracing_is_enabled()) { 306 if (tracing_is_enabled())
310 tracer_enabled = 1; 307 tracer_enabled = 1;
311 save_tracer_enabled = 1; 308 else
312 } else {
313 tracer_enabled = 0; 309 tracer_enabled = 0;
314 save_tracer_enabled = 0;
315 }
316 310
317 return; 311 return;
318fail_deprobe_wake_new: 312fail_deprobe_wake_new:
@@ -324,54 +318,54 @@ fail_deprobe:
324static void stop_wakeup_tracer(struct trace_array *tr) 318static void stop_wakeup_tracer(struct trace_array *tr)
325{ 319{
326 tracer_enabled = 0; 320 tracer_enabled = 0;
327 save_tracer_enabled = 0;
328 unregister_ftrace_function(&trace_ops); 321 unregister_ftrace_function(&trace_ops);
329 unregister_trace_sched_switch(probe_wakeup_sched_switch); 322 unregister_trace_sched_switch(probe_wakeup_sched_switch);
330 unregister_trace_sched_wakeup_new(probe_wakeup); 323 unregister_trace_sched_wakeup_new(probe_wakeup);
331 unregister_trace_sched_wakeup(probe_wakeup); 324 unregister_trace_sched_wakeup(probe_wakeup);
332} 325}
333 326
334static int wakeup_tracer_init(struct trace_array *tr) 327static int __wakeup_tracer_init(struct trace_array *tr)
335{ 328{
329 save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT;
330 trace_flags |= TRACE_ITER_LATENCY_FMT;
331
336 tracing_max_latency = 0; 332 tracing_max_latency = 0;
337 wakeup_trace = tr; 333 wakeup_trace = tr;
338 start_wakeup_tracer(tr); 334 start_wakeup_tracer(tr);
339 return 0; 335 return 0;
340} 336}
341 337
338static int wakeup_tracer_init(struct trace_array *tr)
339{
340 wakeup_rt = 0;
341 return __wakeup_tracer_init(tr);
342}
343
344static int wakeup_rt_tracer_init(struct trace_array *tr)
345{
346 wakeup_rt = 1;
347 return __wakeup_tracer_init(tr);
348}
349
342static void wakeup_tracer_reset(struct trace_array *tr) 350static void wakeup_tracer_reset(struct trace_array *tr)
343{ 351{
344 stop_wakeup_tracer(tr); 352 stop_wakeup_tracer(tr);
345 /* make sure we put back any tasks we are tracing */ 353 /* make sure we put back any tasks we are tracing */
346 wakeup_reset(tr); 354 wakeup_reset(tr);
355
356 if (!save_lat_flag)
357 trace_flags &= ~TRACE_ITER_LATENCY_FMT;
347} 358}
348 359
349static void wakeup_tracer_start(struct trace_array *tr) 360static void wakeup_tracer_start(struct trace_array *tr)
350{ 361{
351 wakeup_reset(tr); 362 wakeup_reset(tr);
352 tracer_enabled = 1; 363 tracer_enabled = 1;
353 save_tracer_enabled = 1;
354} 364}
355 365
356static void wakeup_tracer_stop(struct trace_array *tr) 366static void wakeup_tracer_stop(struct trace_array *tr)
357{ 367{
358 tracer_enabled = 0; 368 tracer_enabled = 0;
359 save_tracer_enabled = 0;
360}
361
362static void wakeup_tracer_open(struct trace_iterator *iter)
363{
364 /* stop the trace while dumping */
365 tracer_enabled = 0;
366}
367
368static void wakeup_tracer_close(struct trace_iterator *iter)
369{
370 /* forget about any processes we were recording */
371 if (save_tracer_enabled) {
372 wakeup_reset(iter->tr);
373 tracer_enabled = 1;
374 }
375} 369}
376 370
377static struct tracer wakeup_tracer __read_mostly = 371static struct tracer wakeup_tracer __read_mostly =
@@ -381,8 +375,20 @@ static struct tracer wakeup_tracer __read_mostly =
381 .reset = wakeup_tracer_reset, 375 .reset = wakeup_tracer_reset,
382 .start = wakeup_tracer_start, 376 .start = wakeup_tracer_start,
383 .stop = wakeup_tracer_stop, 377 .stop = wakeup_tracer_stop,
384 .open = wakeup_tracer_open, 378 .print_max = 1,
385 .close = wakeup_tracer_close, 379#ifdef CONFIG_FTRACE_SELFTEST
380 .selftest = trace_selftest_startup_wakeup,
381#endif
382};
383
384static struct tracer wakeup_rt_tracer __read_mostly =
385{
386 .name = "wakeup_rt",
387 .init = wakeup_rt_tracer_init,
388 .reset = wakeup_tracer_reset,
389 .start = wakeup_tracer_start,
390 .stop = wakeup_tracer_stop,
391 .wait_pipe = poll_wait_pipe,
386 .print_max = 1, 392 .print_max = 1,
387#ifdef CONFIG_FTRACE_SELFTEST 393#ifdef CONFIG_FTRACE_SELFTEST
388 .selftest = trace_selftest_startup_wakeup, 394 .selftest = trace_selftest_startup_wakeup,
@@ -397,6 +403,10 @@ __init static int init_wakeup_tracer(void)
397 if (ret) 403 if (ret)
398 return ret; 404 return ret;
399 405
406 ret = register_tracer(&wakeup_rt_tracer);
407 if (ret)
408 return ret;
409
400 return 0; 410 return 0;
401} 411}
402device_initcall(init_wakeup_tracer); 412device_initcall(init_wakeup_tracer);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index bc8e80a86bca..38856ba78a92 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1,5 +1,6 @@
1/* Include in trace.c */ 1/* Include in trace.c */
2 2
3#include <linux/stringify.h>
3#include <linux/kthread.h> 4#include <linux/kthread.h>
4#include <linux/delay.h> 5#include <linux/delay.h>
5 6
@@ -9,11 +10,12 @@ static inline int trace_valid_entry(struct trace_entry *entry)
9 case TRACE_FN: 10 case TRACE_FN:
10 case TRACE_CTX: 11 case TRACE_CTX:
11 case TRACE_WAKE: 12 case TRACE_WAKE:
12 case TRACE_CONT:
13 case TRACE_STACK: 13 case TRACE_STACK:
14 case TRACE_PRINT: 14 case TRACE_PRINT:
15 case TRACE_SPECIAL: 15 case TRACE_SPECIAL:
16 case TRACE_BRANCH: 16 case TRACE_BRANCH:
17 case TRACE_GRAPH_ENT:
18 case TRACE_GRAPH_RET:
17 return 1; 19 return 1;
18 } 20 }
19 return 0; 21 return 0;
@@ -99,9 +101,6 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret)
99 101
100#ifdef CONFIG_DYNAMIC_FTRACE 102#ifdef CONFIG_DYNAMIC_FTRACE
101 103
102#define __STR(x) #x
103#define STR(x) __STR(x)
104
105/* Test dynamic code modification and ftrace filters */ 104/* Test dynamic code modification and ftrace filters */
106int trace_selftest_startup_dynamic_tracing(struct tracer *trace, 105int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
107 struct trace_array *tr, 106 struct trace_array *tr,
@@ -125,17 +124,17 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
125 func(); 124 func();
126 125
127 /* 126 /*
128 * Some archs *cough*PowerPC*cough* add charachters to the 127 * Some archs *cough*PowerPC*cough* add characters to the
129 * start of the function names. We simply put a '*' to 128 * start of the function names. We simply put a '*' to
130 * accomodate them. 129 * accommodate them.
131 */ 130 */
132 func_name = "*" STR(DYN_FTRACE_TEST_NAME); 131 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
133 132
134 /* filter only on our function */ 133 /* filter only on our function */
135 ftrace_set_filter(func_name, strlen(func_name), 1); 134 ftrace_set_filter(func_name, strlen(func_name), 1);
136 135
137 /* enable tracing */ 136 /* enable tracing */
138 ret = trace->init(tr); 137 ret = tracer_init(trace, tr);
139 if (ret) { 138 if (ret) {
140 warn_failed_init_tracer(trace, ret); 139 warn_failed_init_tracer(trace, ret);
141 goto out; 140 goto out;
@@ -209,7 +208,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
209 ftrace_enabled = 1; 208 ftrace_enabled = 1;
210 tracer_enabled = 1; 209 tracer_enabled = 1;
211 210
212 ret = trace->init(tr); 211 ret = tracer_init(trace, tr);
213 if (ret) { 212 if (ret) {
214 warn_failed_init_tracer(trace, ret); 213 warn_failed_init_tracer(trace, ret);
215 goto out; 214 goto out;
@@ -247,6 +246,54 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
247} 246}
248#endif /* CONFIG_FUNCTION_TRACER */ 247#endif /* CONFIG_FUNCTION_TRACER */
249 248
249
250#ifdef CONFIG_FUNCTION_GRAPH_TRACER
251/*
252 * Pretty much the same than for the function tracer from which the selftest
253 * has been borrowed.
254 */
255int
256trace_selftest_startup_function_graph(struct tracer *trace,
257 struct trace_array *tr)
258{
259 int ret;
260 unsigned long count;
261
262 ret = tracer_init(trace, tr);
263 if (ret) {
264 warn_failed_init_tracer(trace, ret);
265 goto out;
266 }
267
268 /* Sleep for a 1/10 of a second */
269 msleep(100);
270
271 tracing_stop();
272
273 /* check the trace buffer */
274 ret = trace_test_buffer(tr, &count);
275
276 trace->reset(tr);
277 tracing_start();
278
279 if (!ret && !count) {
280 printk(KERN_CONT ".. no entries found ..");
281 ret = -1;
282 goto out;
283 }
284
285 /* Don't test dynamic tracing, the function tracer already did */
286
287out:
288 /* Stop it if we failed */
289 if (ret)
290 ftrace_graph_stop();
291
292 return ret;
293}
294#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
295
296
250#ifdef CONFIG_IRQSOFF_TRACER 297#ifdef CONFIG_IRQSOFF_TRACER
251int 298int
252trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) 299trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
@@ -256,7 +303,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
256 int ret; 303 int ret;
257 304
258 /* start the tracing */ 305 /* start the tracing */
259 ret = trace->init(tr); 306 ret = tracer_init(trace, tr);
260 if (ret) { 307 if (ret) {
261 warn_failed_init_tracer(trace, ret); 308 warn_failed_init_tracer(trace, ret);
262 return ret; 309 return ret;
@@ -268,6 +315,14 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
268 local_irq_disable(); 315 local_irq_disable();
269 udelay(100); 316 udelay(100);
270 local_irq_enable(); 317 local_irq_enable();
318
319 /*
320 * Stop the tracer to avoid a warning subsequent
321 * to buffer flipping failure because tracing_stop()
322 * disables the tr and max buffers, making flipping impossible
323 * in case of parallels max irqs off latencies.
324 */
325 trace->stop(tr);
271 /* stop the tracing. */ 326 /* stop the tracing. */
272 tracing_stop(); 327 tracing_stop();
273 /* check both trace buffers */ 328 /* check both trace buffers */
@@ -310,7 +365,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
310 } 365 }
311 366
312 /* start the tracing */ 367 /* start the tracing */
313 ret = trace->init(tr); 368 ret = tracer_init(trace, tr);
314 if (ret) { 369 if (ret) {
315 warn_failed_init_tracer(trace, ret); 370 warn_failed_init_tracer(trace, ret);
316 return ret; 371 return ret;
@@ -322,6 +377,14 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
322 preempt_disable(); 377 preempt_disable();
323 udelay(100); 378 udelay(100);
324 preempt_enable(); 379 preempt_enable();
380
381 /*
382 * Stop the tracer to avoid a warning subsequent
383 * to buffer flipping failure because tracing_stop()
384 * disables the tr and max buffers, making flipping impossible
385 * in case of parallels max preempt off latencies.
386 */
387 trace->stop(tr);
325 /* stop the tracing. */ 388 /* stop the tracing. */
326 tracing_stop(); 389 tracing_stop();
327 /* check both trace buffers */ 390 /* check both trace buffers */
@@ -364,10 +427,10 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
364 } 427 }
365 428
366 /* start the tracing */ 429 /* start the tracing */
367 ret = trace->init(tr); 430 ret = tracer_init(trace, tr);
368 if (ret) { 431 if (ret) {
369 warn_failed_init_tracer(trace, ret); 432 warn_failed_init_tracer(trace, ret);
370 goto out; 433 goto out_no_start;
371 } 434 }
372 435
373 /* reset the max latency */ 436 /* reset the max latency */
@@ -381,31 +444,35 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
381 /* reverse the order of preempt vs irqs */ 444 /* reverse the order of preempt vs irqs */
382 local_irq_enable(); 445 local_irq_enable();
383 446
447 /*
448 * Stop the tracer to avoid a warning subsequent
449 * to buffer flipping failure because tracing_stop()
450 * disables the tr and max buffers, making flipping impossible
451 * in case of parallels max irqs/preempt off latencies.
452 */
453 trace->stop(tr);
384 /* stop the tracing. */ 454 /* stop the tracing. */
385 tracing_stop(); 455 tracing_stop();
386 /* check both trace buffers */ 456 /* check both trace buffers */
387 ret = trace_test_buffer(tr, NULL); 457 ret = trace_test_buffer(tr, NULL);
388 if (ret) { 458 if (ret)
389 tracing_start();
390 goto out; 459 goto out;
391 }
392 460
393 ret = trace_test_buffer(&max_tr, &count); 461 ret = trace_test_buffer(&max_tr, &count);
394 if (ret) { 462 if (ret)
395 tracing_start();
396 goto out; 463 goto out;
397 }
398 464
399 if (!ret && !count) { 465 if (!ret && !count) {
400 printk(KERN_CONT ".. no entries found .."); 466 printk(KERN_CONT ".. no entries found ..");
401 ret = -1; 467 ret = -1;
402 tracing_start();
403 goto out; 468 goto out;
404 } 469 }
405 470
406 /* do the test by disabling interrupts first this time */ 471 /* do the test by disabling interrupts first this time */
407 tracing_max_latency = 0; 472 tracing_max_latency = 0;
408 tracing_start(); 473 tracing_start();
474 trace->start(tr);
475
409 preempt_disable(); 476 preempt_disable();
410 local_irq_disable(); 477 local_irq_disable();
411 udelay(100); 478 udelay(100);
@@ -413,6 +480,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
413 /* reverse the order of preempt vs irqs */ 480 /* reverse the order of preempt vs irqs */
414 local_irq_enable(); 481 local_irq_enable();
415 482
483 trace->stop(tr);
416 /* stop the tracing. */ 484 /* stop the tracing. */
417 tracing_stop(); 485 tracing_stop();
418 /* check both trace buffers */ 486 /* check both trace buffers */
@@ -428,9 +496,10 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
428 goto out; 496 goto out;
429 } 497 }
430 498
431 out: 499out:
432 trace->reset(tr);
433 tracing_start(); 500 tracing_start();
501out_no_start:
502 trace->reset(tr);
434 tracing_max_latency = save_max; 503 tracing_max_latency = save_max;
435 504
436 return ret; 505 return ret;
@@ -496,7 +565,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
496 wait_for_completion(&isrt); 565 wait_for_completion(&isrt);
497 566
498 /* start the tracing */ 567 /* start the tracing */
499 ret = trace->init(tr); 568 ret = tracer_init(trace, tr);
500 if (ret) { 569 if (ret) {
501 warn_failed_init_tracer(trace, ret); 570 warn_failed_init_tracer(trace, ret);
502 return ret; 571 return ret;
@@ -557,7 +626,7 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
557 int ret; 626 int ret;
558 627
559 /* start the tracing */ 628 /* start the tracing */
560 ret = trace->init(tr); 629 ret = tracer_init(trace, tr);
561 if (ret) { 630 if (ret) {
562 warn_failed_init_tracer(trace, ret); 631 warn_failed_init_tracer(trace, ret);
563 return ret; 632 return ret;
@@ -589,10 +658,10 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
589 int ret; 658 int ret;
590 659
591 /* start the tracing */ 660 /* start the tracing */
592 ret = trace->init(tr); 661 ret = tracer_init(trace, tr);
593 if (ret) { 662 if (ret) {
594 warn_failed_init_tracer(trace, ret); 663 warn_failed_init_tracer(trace, ret);
595 return 0; 664 return ret;
596 } 665 }
597 666
598 /* Sleep for a 1/10 of a second */ 667 /* Sleep for a 1/10 of a second */
@@ -604,6 +673,11 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
604 trace->reset(tr); 673 trace->reset(tr);
605 tracing_start(); 674 tracing_start();
606 675
676 if (!ret && !count) {
677 printk(KERN_CONT ".. no entries found ..");
678 ret = -1;
679 }
680
607 return ret; 681 return ret;
608} 682}
609#endif /* CONFIG_SYSPROF_TRACER */ 683#endif /* CONFIG_SYSPROF_TRACER */
@@ -616,7 +690,7 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
616 int ret; 690 int ret;
617 691
618 /* start the tracing */ 692 /* start the tracing */
619 ret = trace->init(tr); 693 ret = tracer_init(trace, tr);
620 if (ret) { 694 if (ret) {
621 warn_failed_init_tracer(trace, ret); 695 warn_failed_init_tracer(trace, ret);
622 return ret; 696 return ret;
@@ -631,6 +705,11 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
631 trace->reset(tr); 705 trace->reset(tr);
632 tracing_start(); 706 tracing_start();
633 707
708 if (!ret && !count) {
709 printk(KERN_CONT ".. no entries found ..");
710 ret = -1;
711 }
712
634 return ret; 713 return ret;
635} 714}
636#endif /* CONFIG_BRANCH_TRACER */ 715#endif /* CONFIG_BRANCH_TRACER */
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index d0871bc0aca5..c750f65f9661 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -245,16 +245,31 @@ static int trace_lookup_stack(struct seq_file *m, long i)
245#endif 245#endif
246} 246}
247 247
248static void print_disabled(struct seq_file *m)
249{
250 seq_puts(m, "#\n"
251 "# Stack tracer disabled\n"
252 "#\n"
253 "# To enable the stack tracer, either add 'stacktrace' to the\n"
254 "# kernel command line\n"
255 "# or 'echo 1 > /proc/sys/kernel/stack_tracer_enabled'\n"
256 "#\n");
257}
258
248static int t_show(struct seq_file *m, void *v) 259static int t_show(struct seq_file *m, void *v)
249{ 260{
250 long i; 261 long i;
251 int size; 262 int size;
252 263
253 if (v == SEQ_START_TOKEN) { 264 if (v == SEQ_START_TOKEN) {
254 seq_printf(m, " Depth Size Location" 265 seq_printf(m, " Depth Size Location"
255 " (%d entries)\n" 266 " (%d entries)\n"
256 " ----- ---- --------\n", 267 " ----- ---- --------\n",
257 max_stack_trace.nr_entries); 268 max_stack_trace.nr_entries);
269
270 if (!stack_tracer_enabled && !max_stack_size)
271 print_disabled(m);
272
258 return 0; 273 return 0;
259 } 274 }
260 275
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
new file mode 100644
index 000000000000..39310e3434ee
--- /dev/null
+++ b/kernel/trace/trace_stat.c
@@ -0,0 +1,319 @@
1/*
2 * Infrastructure for statistic tracing (histogram output).
3 *
4 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
5 *
6 * Based on the code from trace_branch.c which is
7 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
8 *
9 */
10
11
12#include <linux/list.h>
13#include <linux/debugfs.h>
14#include "trace_stat.h"
15#include "trace.h"
16
17
18/* List of stat entries from a tracer */
19struct trace_stat_list {
20 struct list_head list;
21 void *stat;
22};
23
24/* A stat session is the stats output in one file */
25struct tracer_stat_session {
26 struct list_head session_list;
27 struct tracer_stat *ts;
28 struct list_head stat_list;
29 struct mutex stat_mutex;
30 struct dentry *file;
31};
32
33/* All of the sessions currently in use. Each stat file embed one session */
34static LIST_HEAD(all_stat_sessions);
35static DEFINE_MUTEX(all_stat_sessions_mutex);
36
37/* The root directory for all stat files */
38static struct dentry *stat_dir;
39
40
41static void reset_stat_session(struct tracer_stat_session *session)
42{
43 struct trace_stat_list *node, *next;
44
45 list_for_each_entry_safe(node, next, &session->stat_list, list)
46 kfree(node);
47
48 INIT_LIST_HEAD(&session->stat_list);
49}
50
51static void destroy_session(struct tracer_stat_session *session)
52{
53 debugfs_remove(session->file);
54 reset_stat_session(session);
55 mutex_destroy(&session->stat_mutex);
56 kfree(session);
57}
58
59/*
60 * For tracers that don't provide a stat_cmp callback.
61 * This one will force an immediate insertion on tail of
62 * the list.
63 */
64static int dummy_cmp(void *p1, void *p2)
65{
66 return 1;
67}
68
69/*
70 * Initialize the stat list at each trace_stat file opening.
71 * All of these copies and sorting are required on all opening
72 * since the stats could have changed between two file sessions.
73 */
74static int stat_seq_init(struct tracer_stat_session *session)
75{
76 struct trace_stat_list *iter_entry, *new_entry;
77 struct tracer_stat *ts = session->ts;
78 void *prev_stat;
79 int ret = 0;
80 int i;
81
82 mutex_lock(&session->stat_mutex);
83 reset_stat_session(session);
84
85 if (!ts->stat_cmp)
86 ts->stat_cmp = dummy_cmp;
87
88 /*
89 * The first entry. Actually this is the second, but the first
90 * one (the stat_list head) is pointless.
91 */
92 new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL);
93 if (!new_entry) {
94 ret = -ENOMEM;
95 goto exit;
96 }
97
98 INIT_LIST_HEAD(&new_entry->list);
99
100 list_add(&new_entry->list, &session->stat_list);
101
102 new_entry->stat = ts->stat_start();
103 prev_stat = new_entry->stat;
104
105 /*
106 * Iterate over the tracer stat entries and store them in a sorted
107 * list.
108 */
109 for (i = 1; ; i++) {
110 new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL);
111 if (!new_entry) {
112 ret = -ENOMEM;
113 goto exit_free_list;
114 }
115
116 INIT_LIST_HEAD(&new_entry->list);
117 new_entry->stat = ts->stat_next(prev_stat, i);
118
119 /* End of insertion */
120 if (!new_entry->stat)
121 break;
122
123 list_for_each_entry(iter_entry, &session->stat_list, list) {
124
125 /* Insertion with a descendent sorting */
126 if (ts->stat_cmp(new_entry->stat,
127 iter_entry->stat) > 0) {
128
129 list_add_tail(&new_entry->list,
130 &iter_entry->list);
131 break;
132
133 /* The current smaller value */
134 } else if (list_is_last(&iter_entry->list,
135 &session->stat_list)) {
136 list_add(&new_entry->list, &iter_entry->list);
137 break;
138 }
139 }
140
141 prev_stat = new_entry->stat;
142 }
143exit:
144 mutex_unlock(&session->stat_mutex);
145 return ret;
146
147exit_free_list:
148 reset_stat_session(session);
149 mutex_unlock(&session->stat_mutex);
150 return ret;
151}
152
153
154static void *stat_seq_start(struct seq_file *s, loff_t *pos)
155{
156 struct tracer_stat_session *session = s->private;
157
158 /* Prevent from tracer switch or stat_list modification */
159 mutex_lock(&session->stat_mutex);
160
161 /* If we are in the beginning of the file, print the headers */
162 if (!*pos && session->ts->stat_headers)
163 session->ts->stat_headers(s);
164
165 return seq_list_start(&session->stat_list, *pos);
166}
167
168static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos)
169{
170 struct tracer_stat_session *session = s->private;
171
172 return seq_list_next(p, &session->stat_list, pos);
173}
174
175static void stat_seq_stop(struct seq_file *s, void *p)
176{
177 struct tracer_stat_session *session = s->private;
178 mutex_unlock(&session->stat_mutex);
179}
180
181static int stat_seq_show(struct seq_file *s, void *v)
182{
183 struct tracer_stat_session *session = s->private;
184 struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list);
185
186 return session->ts->stat_show(s, l->stat);
187}
188
189static const struct seq_operations trace_stat_seq_ops = {
190 .start = stat_seq_start,
191 .next = stat_seq_next,
192 .stop = stat_seq_stop,
193 .show = stat_seq_show
194};
195
196/* The session stat is refilled and resorted at each stat file opening */
197static int tracing_stat_open(struct inode *inode, struct file *file)
198{
199 int ret;
200
201 struct tracer_stat_session *session = inode->i_private;
202
203 ret = seq_open(file, &trace_stat_seq_ops);
204 if (!ret) {
205 struct seq_file *m = file->private_data;
206 m->private = session;
207 ret = stat_seq_init(session);
208 }
209
210 return ret;
211}
212
213/*
214 * Avoid consuming memory with our now useless list.
215 */
216static int tracing_stat_release(struct inode *i, struct file *f)
217{
218 struct tracer_stat_session *session = i->i_private;
219
220 mutex_lock(&session->stat_mutex);
221 reset_stat_session(session);
222 mutex_unlock(&session->stat_mutex);
223
224 return 0;
225}
226
227static const struct file_operations tracing_stat_fops = {
228 .open = tracing_stat_open,
229 .read = seq_read,
230 .llseek = seq_lseek,
231 .release = tracing_stat_release
232};
233
234static int tracing_stat_init(void)
235{
236 struct dentry *d_tracing;
237
238 d_tracing = tracing_init_dentry();
239
240 stat_dir = debugfs_create_dir("trace_stat", d_tracing);
241 if (!stat_dir)
242 pr_warning("Could not create debugfs "
243 "'trace_stat' entry\n");
244 return 0;
245}
246
247static int init_stat_file(struct tracer_stat_session *session)
248{
249 if (!stat_dir && tracing_stat_init())
250 return -ENODEV;
251
252 session->file = debugfs_create_file(session->ts->name, 0644,
253 stat_dir,
254 session, &tracing_stat_fops);
255 if (!session->file)
256 return -ENOMEM;
257 return 0;
258}
259
260int register_stat_tracer(struct tracer_stat *trace)
261{
262 struct tracer_stat_session *session, *node, *tmp;
263 int ret;
264
265 if (!trace)
266 return -EINVAL;
267
268 if (!trace->stat_start || !trace->stat_next || !trace->stat_show)
269 return -EINVAL;
270
271 /* Already registered? */
272 mutex_lock(&all_stat_sessions_mutex);
273 list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
274 if (node->ts == trace) {
275 mutex_unlock(&all_stat_sessions_mutex);
276 return -EINVAL;
277 }
278 }
279 mutex_unlock(&all_stat_sessions_mutex);
280
281 /* Init the session */
282 session = kmalloc(sizeof(struct tracer_stat_session), GFP_KERNEL);
283 if (!session)
284 return -ENOMEM;
285
286 session->ts = trace;
287 INIT_LIST_HEAD(&session->session_list);
288 INIT_LIST_HEAD(&session->stat_list);
289 mutex_init(&session->stat_mutex);
290 session->file = NULL;
291
292 ret = init_stat_file(session);
293 if (ret) {
294 destroy_session(session);
295 return ret;
296 }
297
298 /* Register */
299 mutex_lock(&all_stat_sessions_mutex);
300 list_add_tail(&session->session_list, &all_stat_sessions);
301 mutex_unlock(&all_stat_sessions_mutex);
302
303 return 0;
304}
305
306void unregister_stat_tracer(struct tracer_stat *trace)
307{
308 struct tracer_stat_session *node, *tmp;
309
310 mutex_lock(&all_stat_sessions_mutex);
311 list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
312 if (node->ts == trace) {
313 list_del(&node->session_list);
314 destroy_session(node);
315 break;
316 }
317 }
318 mutex_unlock(&all_stat_sessions_mutex);
319}
diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h
new file mode 100644
index 000000000000..202274cf7f3d
--- /dev/null
+++ b/kernel/trace/trace_stat.h
@@ -0,0 +1,31 @@
1#ifndef __TRACE_STAT_H
2#define __TRACE_STAT_H
3
4#include <linux/seq_file.h>
5
6/*
7 * If you want to provide a stat file (one-shot statistics), fill
8 * an iterator with stat_start/stat_next and a stat_show callbacks.
9 * The others callbacks are optional.
10 */
11struct tracer_stat {
12 /* The name of your stat file */
13 const char *name;
14 /* Iteration over statistic entries */
15 void *(*stat_start)(void);
16 void *(*stat_next)(void *prev, int idx);
17 /* Compare two entries for stats sorting */
18 int (*stat_cmp)(void *p1, void *p2);
19 /* Print a stat entry */
20 int (*stat_show)(struct seq_file *s, void *p);
21 /* Print the headers of your stat entries */
22 int (*stat_headers)(struct seq_file *s);
23};
24
25/*
26 * Destroy or create a stat file
27 */
28extern int register_stat_tracer(struct tracer_stat *trace);
29extern void unregister_stat_tracer(struct tracer_stat *trace);
30
31#endif /* __TRACE_STAT_H */
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
new file mode 100644
index 000000000000..a2a3af29c943
--- /dev/null
+++ b/kernel/trace/trace_syscalls.c
@@ -0,0 +1,250 @@
1#include <linux/kernel.h>
2#include <linux/ftrace.h>
3#include <asm/syscall.h>
4
5#include "trace_output.h"
6#include "trace.h"
7
8/* Keep a counter of the syscall tracing users */
9static int refcount;
10
11/* Prevent from races on thread flags toggling */
12static DEFINE_MUTEX(syscall_trace_lock);
13
14/* Option to display the parameters types */
15enum {
16 TRACE_SYSCALLS_OPT_TYPES = 0x1,
17};
18
19static struct tracer_opt syscalls_opts[] = {
20 { TRACER_OPT(syscall_arg_type, TRACE_SYSCALLS_OPT_TYPES) },
21 { }
22};
23
24static struct tracer_flags syscalls_flags = {
25 .val = 0, /* By default: no parameters types */
26 .opts = syscalls_opts
27};
28
29enum print_line_t
30print_syscall_enter(struct trace_iterator *iter, int flags)
31{
32 struct trace_seq *s = &iter->seq;
33 struct trace_entry *ent = iter->ent;
34 struct syscall_trace_enter *trace;
35 struct syscall_metadata *entry;
36 int i, ret, syscall;
37
38 trace_assign_type(trace, ent);
39
40 syscall = trace->nr;
41
42 entry = syscall_nr_to_meta(syscall);
43 if (!entry)
44 goto end;
45
46 ret = trace_seq_printf(s, "%s(", entry->name);
47 if (!ret)
48 return TRACE_TYPE_PARTIAL_LINE;
49
50 for (i = 0; i < entry->nb_args; i++) {
51 /* parameter types */
52 if (syscalls_flags.val & TRACE_SYSCALLS_OPT_TYPES) {
53 ret = trace_seq_printf(s, "%s ", entry->types[i]);
54 if (!ret)
55 return TRACE_TYPE_PARTIAL_LINE;
56 }
57 /* parameter values */
58 ret = trace_seq_printf(s, "%s: %lx%s ", entry->args[i],
59 trace->args[i],
60 i == entry->nb_args - 1 ? ")" : ",");
61 if (!ret)
62 return TRACE_TYPE_PARTIAL_LINE;
63 }
64
65end:
66 trace_seq_printf(s, "\n");
67 return TRACE_TYPE_HANDLED;
68}
69
70enum print_line_t
71print_syscall_exit(struct trace_iterator *iter, int flags)
72{
73 struct trace_seq *s = &iter->seq;
74 struct trace_entry *ent = iter->ent;
75 struct syscall_trace_exit *trace;
76 int syscall;
77 struct syscall_metadata *entry;
78 int ret;
79
80 trace_assign_type(trace, ent);
81
82 syscall = trace->nr;
83
84 entry = syscall_nr_to_meta(syscall);
85 if (!entry) {
86 trace_seq_printf(s, "\n");
87 return TRACE_TYPE_HANDLED;
88 }
89
90 ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
91 trace->ret);
92 if (!ret)
93 return TRACE_TYPE_PARTIAL_LINE;
94
95 return TRACE_TYPE_HANDLED;
96}
97
98void start_ftrace_syscalls(void)
99{
100 unsigned long flags;
101 struct task_struct *g, *t;
102
103 mutex_lock(&syscall_trace_lock);
104
105 /* Don't enable the flag on the tasks twice */
106 if (++refcount != 1)
107 goto unlock;
108
109 arch_init_ftrace_syscalls();
110 read_lock_irqsave(&tasklist_lock, flags);
111
112 do_each_thread(g, t) {
113 set_tsk_thread_flag(t, TIF_SYSCALL_FTRACE);
114 } while_each_thread(g, t);
115
116 read_unlock_irqrestore(&tasklist_lock, flags);
117
118unlock:
119 mutex_unlock(&syscall_trace_lock);
120}
121
122void stop_ftrace_syscalls(void)
123{
124 unsigned long flags;
125 struct task_struct *g, *t;
126
127 mutex_lock(&syscall_trace_lock);
128
129 /* There are perhaps still some users */
130 if (--refcount)
131 goto unlock;
132
133 read_lock_irqsave(&tasklist_lock, flags);
134
135 do_each_thread(g, t) {
136 clear_tsk_thread_flag(t, TIF_SYSCALL_FTRACE);
137 } while_each_thread(g, t);
138
139 read_unlock_irqrestore(&tasklist_lock, flags);
140
141unlock:
142 mutex_unlock(&syscall_trace_lock);
143}
144
145void ftrace_syscall_enter(struct pt_regs *regs)
146{
147 struct syscall_trace_enter *entry;
148 struct syscall_metadata *sys_data;
149 struct ring_buffer_event *event;
150 int size;
151 int syscall_nr;
152
153 syscall_nr = syscall_get_nr(current, regs);
154
155 sys_data = syscall_nr_to_meta(syscall_nr);
156 if (!sys_data)
157 return;
158
159 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
160
161 event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_ENTER, size,
162 0, 0);
163 if (!event)
164 return;
165
166 entry = ring_buffer_event_data(event);
167 entry->nr = syscall_nr;
168 syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
169
170 trace_current_buffer_unlock_commit(event, 0, 0);
171 trace_wake_up();
172}
173
174void ftrace_syscall_exit(struct pt_regs *regs)
175{
176 struct syscall_trace_exit *entry;
177 struct syscall_metadata *sys_data;
178 struct ring_buffer_event *event;
179 int syscall_nr;
180
181 syscall_nr = syscall_get_nr(current, regs);
182
183 sys_data = syscall_nr_to_meta(syscall_nr);
184 if (!sys_data)
185 return;
186
187 event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_EXIT,
188 sizeof(*entry), 0, 0);
189 if (!event)
190 return;
191
192 entry = ring_buffer_event_data(event);
193 entry->nr = syscall_nr;
194 entry->ret = syscall_get_return_value(current, regs);
195
196 trace_current_buffer_unlock_commit(event, 0, 0);
197 trace_wake_up();
198}
199
200static int init_syscall_tracer(struct trace_array *tr)
201{
202 start_ftrace_syscalls();
203
204 return 0;
205}
206
207static void reset_syscall_tracer(struct trace_array *tr)
208{
209 stop_ftrace_syscalls();
210 tracing_reset_online_cpus(tr);
211}
212
213static struct trace_event syscall_enter_event = {
214 .type = TRACE_SYSCALL_ENTER,
215 .trace = print_syscall_enter,
216};
217
218static struct trace_event syscall_exit_event = {
219 .type = TRACE_SYSCALL_EXIT,
220 .trace = print_syscall_exit,
221};
222
223static struct tracer syscall_tracer __read_mostly = {
224 .name = "syscall",
225 .init = init_syscall_tracer,
226 .reset = reset_syscall_tracer,
227 .flags = &syscalls_flags,
228};
229
230__init int register_ftrace_syscalls(void)
231{
232 int ret;
233
234 ret = register_ftrace_event(&syscall_enter_event);
235 if (!ret) {
236 printk(KERN_WARNING "event %d failed to register\n",
237 syscall_enter_event.type);
238 WARN_ON_ONCE(1);
239 }
240
241 ret = register_ftrace_event(&syscall_exit_event);
242 if (!ret) {
243 printk(KERN_WARNING "event %d failed to register\n",
244 syscall_exit_event.type);
245 WARN_ON_ONCE(1);
246 }
247
248 return register_tracer(&syscall_tracer);
249}
250device_initcall(register_ftrace_syscalls);
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index eaca5ad803ff..91fd19c2149f 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -88,7 +88,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
88 } 88 }
89} 89}
90 90
91const static struct stacktrace_ops backtrace_ops = { 91static const struct stacktrace_ops backtrace_ops = {
92 .warning = backtrace_warning, 92 .warning = backtrace_warning,
93 .warning_symbol = backtrace_warning_symbol, 93 .warning_symbol = backtrace_warning_symbol,
94 .stack = backtrace_stack, 94 .stack = backtrace_stack,
@@ -226,15 +226,6 @@ static void stop_stack_timers(void)
226 stop_stack_timer(cpu); 226 stop_stack_timer(cpu);
227} 227}
228 228
229static void start_stack_trace(struct trace_array *tr)
230{
231 mutex_lock(&sample_timer_lock);
232 tracing_reset_online_cpus(tr);
233 start_stack_timers();
234 tracer_enabled = 1;
235 mutex_unlock(&sample_timer_lock);
236}
237
238static void stop_stack_trace(struct trace_array *tr) 229static void stop_stack_trace(struct trace_array *tr)
239{ 230{
240 mutex_lock(&sample_timer_lock); 231 mutex_lock(&sample_timer_lock);
@@ -247,12 +238,18 @@ static int stack_trace_init(struct trace_array *tr)
247{ 238{
248 sysprof_trace = tr; 239 sysprof_trace = tr;
249 240
250 start_stack_trace(tr); 241 tracing_start_cmdline_record();
242
243 mutex_lock(&sample_timer_lock);
244 start_stack_timers();
245 tracer_enabled = 1;
246 mutex_unlock(&sample_timer_lock);
251 return 0; 247 return 0;
252} 248}
253 249
254static void stack_trace_reset(struct trace_array *tr) 250static void stack_trace_reset(struct trace_array *tr)
255{ 251{
252 tracing_stop_cmdline_record();
256 stop_stack_trace(tr); 253 stop_stack_trace(tr);
257} 254}
258 255
@@ -317,7 +314,7 @@ sysprof_sample_write(struct file *filp, const char __user *ubuf,
317 return cnt; 314 return cnt;
318} 315}
319 316
320static struct file_operations sysprof_sample_fops = { 317static const struct file_operations sysprof_sample_fops = {
321 .read = sysprof_sample_read, 318 .read = sysprof_sample_read,
322 .write = sysprof_sample_write, 319 .write = sysprof_sample_write,
323}; 320};
@@ -330,5 +327,5 @@ void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
330 d_tracer, NULL, &sysprof_sample_fops); 327 d_tracer, NULL, &sysprof_sample_fops);
331 if (entry) 328 if (entry)
332 return; 329 return;
333 pr_warning("Could not create debugfs 'dyn_ftrace_total_info' entry\n"); 330 pr_warning("Could not create debugfs 'sysprof_sample_period' entry\n");
334} 331}
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
new file mode 100644
index 000000000000..9ab035b58cf1
--- /dev/null
+++ b/kernel/trace/trace_workqueue.c
@@ -0,0 +1,288 @@
1/*
2 * Workqueue statistical tracer.
3 *
4 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
5 *
6 */
7
8
9#include <trace/workqueue.h>
10#include <linux/list.h>
11#include <linux/percpu.h>
12#include "trace_stat.h"
13#include "trace.h"
14
15
16/* A cpu workqueue thread */
17struct cpu_workqueue_stats {
18 struct list_head list;
19/* Useful to know if we print the cpu headers */
20 bool first_entry;
21 int cpu;
22 pid_t pid;
23/* Can be inserted from interrupt or user context, need to be atomic */
24 atomic_t inserted;
25/*
26 * Don't need to be atomic, works are serialized in a single workqueue thread
27 * on a single CPU.
28 */
29 unsigned int executed;
30};
31
32/* List of workqueue threads on one cpu */
33struct workqueue_global_stats {
34 struct list_head list;
35 spinlock_t lock;
36};
37
38/* Don't need a global lock because allocated before the workqueues, and
39 * never freed.
40 */
41static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat);
42#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu))
43
44/* Insertion of a work */
45static void
46probe_workqueue_insertion(struct task_struct *wq_thread,
47 struct work_struct *work)
48{
49 int cpu = cpumask_first(&wq_thread->cpus_allowed);
50 struct cpu_workqueue_stats *node, *next;
51 unsigned long flags;
52
53 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
54 list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
55 list) {
56 if (node->pid == wq_thread->pid) {
57 atomic_inc(&node->inserted);
58 goto found;
59 }
60 }
61 pr_debug("trace_workqueue: entry not found\n");
62found:
63 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
64}
65
66/* Execution of a work */
67static void
68probe_workqueue_execution(struct task_struct *wq_thread,
69 struct work_struct *work)
70{
71 int cpu = cpumask_first(&wq_thread->cpus_allowed);
72 struct cpu_workqueue_stats *node, *next;
73 unsigned long flags;
74
75 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
76 list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
77 list) {
78 if (node->pid == wq_thread->pid) {
79 node->executed++;
80 goto found;
81 }
82 }
83 pr_debug("trace_workqueue: entry not found\n");
84found:
85 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
86}
87
88/* Creation of a cpu workqueue thread */
89static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
90{
91 struct cpu_workqueue_stats *cws;
92 unsigned long flags;
93
94 WARN_ON(cpu < 0);
95
96 /* Workqueues are sometimes created in atomic context */
97 cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC);
98 if (!cws) {
99 pr_warning("trace_workqueue: not enough memory\n");
100 return;
101 }
102 INIT_LIST_HEAD(&cws->list);
103 cws->cpu = cpu;
104
105 cws->pid = wq_thread->pid;
106
107 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
108 if (list_empty(&workqueue_cpu_stat(cpu)->list))
109 cws->first_entry = true;
110 list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list);
111 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
112}
113
114/* Destruction of a cpu workqueue thread */
115static void probe_workqueue_destruction(struct task_struct *wq_thread)
116{
117 /* Workqueue only execute on one cpu */
118 int cpu = cpumask_first(&wq_thread->cpus_allowed);
119 struct cpu_workqueue_stats *node, *next;
120 unsigned long flags;
121
122 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
123 list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
124 list) {
125 if (node->pid == wq_thread->pid) {
126 list_del(&node->list);
127 kfree(node);
128 goto found;
129 }
130 }
131
132 pr_debug("trace_workqueue: don't find workqueue to destroy\n");
133found:
134 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
135
136}
137
138static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
139{
140 unsigned long flags;
141 struct cpu_workqueue_stats *ret = NULL;
142
143
144 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
145
146 if (!list_empty(&workqueue_cpu_stat(cpu)->list))
147 ret = list_entry(workqueue_cpu_stat(cpu)->list.next,
148 struct cpu_workqueue_stats, list);
149
150 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
151
152 return ret;
153}
154
155static void *workqueue_stat_start(void)
156{
157 int cpu;
158 void *ret = NULL;
159
160 for_each_possible_cpu(cpu) {
161 ret = workqueue_stat_start_cpu(cpu);
162 if (ret)
163 return ret;
164 }
165 return NULL;
166}
167
168static void *workqueue_stat_next(void *prev, int idx)
169{
170 struct cpu_workqueue_stats *prev_cws = prev;
171 int cpu = prev_cws->cpu;
172 unsigned long flags;
173 void *ret = NULL;
174
175 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
176 if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) {
177 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
178 do {
179 cpu = cpumask_next(cpu, cpu_possible_mask);
180 if (cpu >= nr_cpu_ids)
181 return NULL;
182 } while (!(ret = workqueue_stat_start_cpu(cpu)));
183 return ret;
184 }
185 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
186
187 return list_entry(prev_cws->list.next, struct cpu_workqueue_stats,
188 list);
189}
190
191static int workqueue_stat_show(struct seq_file *s, void *p)
192{
193 struct cpu_workqueue_stats *cws = p;
194 unsigned long flags;
195 int cpu = cws->cpu;
196 struct pid *pid;
197 struct task_struct *tsk;
198
199 pid = find_get_pid(cws->pid);
200 if (pid) {
201 tsk = get_pid_task(pid, PIDTYPE_PID);
202 if (tsk) {
203 seq_printf(s, "%3d %6d %6u %s\n", cws->cpu,
204 atomic_read(&cws->inserted), cws->executed,
205 tsk->comm);
206 put_task_struct(tsk);
207 }
208 put_pid(pid);
209 }
210
211 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
212 if (&cws->list == workqueue_cpu_stat(cpu)->list.next)
213 seq_printf(s, "\n");
214 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
215
216 return 0;
217}
218
219static int workqueue_stat_headers(struct seq_file *s)
220{
221 seq_printf(s, "# CPU INSERTED EXECUTED NAME\n");
222 seq_printf(s, "# | | | |\n\n");
223 return 0;
224}
225
226struct tracer_stat workqueue_stats __read_mostly = {
227 .name = "workqueues",
228 .stat_start = workqueue_stat_start,
229 .stat_next = workqueue_stat_next,
230 .stat_show = workqueue_stat_show,
231 .stat_headers = workqueue_stat_headers
232};
233
234
235int __init stat_workqueue_init(void)
236{
237 if (register_stat_tracer(&workqueue_stats)) {
238 pr_warning("Unable to register workqueue stat tracer\n");
239 return 1;
240 }
241
242 return 0;
243}
244fs_initcall(stat_workqueue_init);
245
246/*
247 * Workqueues are created very early, just after pre-smp initcalls.
248 * So we must register our tracepoints at this stage.
249 */
250int __init trace_workqueue_early_init(void)
251{
252 int ret, cpu;
253
254 ret = register_trace_workqueue_insertion(probe_workqueue_insertion);
255 if (ret)
256 goto out;
257
258 ret = register_trace_workqueue_execution(probe_workqueue_execution);
259 if (ret)
260 goto no_insertion;
261
262 ret = register_trace_workqueue_creation(probe_workqueue_creation);
263 if (ret)
264 goto no_execution;
265
266 ret = register_trace_workqueue_destruction(probe_workqueue_destruction);
267 if (ret)
268 goto no_creation;
269
270 for_each_possible_cpu(cpu) {
271 spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
272 INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
273 }
274
275 return 0;
276
277no_creation:
278 unregister_trace_workqueue_creation(probe_workqueue_creation);
279no_execution:
280 unregister_trace_workqueue_execution(probe_workqueue_execution);
281no_insertion:
282 unregister_trace_workqueue_insertion(probe_workqueue_insertion);
283out:
284 pr_warning("trace_workqueue: unable to trace workqueues\n");
285
286 return 1;
287}
288early_initcall(trace_workqueue_early_init);
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 79602740bbb5..1ef5d3a601c7 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -272,12 +272,15 @@ static void disable_tracepoint(struct tracepoint *elem)
272 * 272 *
273 * Updates the probe callback corresponding to a range of tracepoints. 273 * Updates the probe callback corresponding to a range of tracepoints.
274 */ 274 */
275void tracepoint_update_probe_range(struct tracepoint *begin, 275void
276 struct tracepoint *end) 276tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end)
277{ 277{
278 struct tracepoint *iter; 278 struct tracepoint *iter;
279 struct tracepoint_entry *mark_entry; 279 struct tracepoint_entry *mark_entry;
280 280
281 if (!begin)
282 return;
283
281 mutex_lock(&tracepoints_mutex); 284 mutex_lock(&tracepoints_mutex);
282 for (iter = begin; iter < end; iter++) { 285 for (iter = begin; iter < end; iter++) {
283 mark_entry = get_tracepoint(iter->name); 286 mark_entry = get_tracepoint(iter->name);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1f0c509b40d3..e53ee18ef431 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -33,6 +33,7 @@
33#include <linux/kallsyms.h> 33#include <linux/kallsyms.h>
34#include <linux/debug_locks.h> 34#include <linux/debug_locks.h>
35#include <linux/lockdep.h> 35#include <linux/lockdep.h>
36#include <trace/workqueue.h>
36 37
37/* 38/*
38 * The per-CPU workqueue (if single thread, we always use the first 39 * The per-CPU workqueue (if single thread, we always use the first
@@ -125,9 +126,13 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
125 return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); 126 return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK);
126} 127}
127 128
129DEFINE_TRACE(workqueue_insertion);
130
128static void insert_work(struct cpu_workqueue_struct *cwq, 131static void insert_work(struct cpu_workqueue_struct *cwq,
129 struct work_struct *work, struct list_head *head) 132 struct work_struct *work, struct list_head *head)
130{ 133{
134 trace_workqueue_insertion(cwq->thread, work);
135
131 set_wq_data(work, cwq); 136 set_wq_data(work, cwq);
132 /* 137 /*
133 * Ensure that we get the right work->data if we see the 138 * Ensure that we get the right work->data if we see the
@@ -259,6 +264,8 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
259} 264}
260EXPORT_SYMBOL_GPL(queue_delayed_work_on); 265EXPORT_SYMBOL_GPL(queue_delayed_work_on);
261 266
267DEFINE_TRACE(workqueue_execution);
268
262static void run_workqueue(struct cpu_workqueue_struct *cwq) 269static void run_workqueue(struct cpu_workqueue_struct *cwq)
263{ 270{
264 spin_lock_irq(&cwq->lock); 271 spin_lock_irq(&cwq->lock);
@@ -284,7 +291,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
284 */ 291 */
285 struct lockdep_map lockdep_map = work->lockdep_map; 292 struct lockdep_map lockdep_map = work->lockdep_map;
286#endif 293#endif
287 294 trace_workqueue_execution(cwq->thread, work);
288 cwq->current_work = work; 295 cwq->current_work = work;
289 list_del_init(cwq->worklist.next); 296 list_del_init(cwq->worklist.next);
290 spin_unlock_irq(&cwq->lock); 297 spin_unlock_irq(&cwq->lock);
@@ -765,6 +772,8 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
765 return cwq; 772 return cwq;
766} 773}
767 774
775DEFINE_TRACE(workqueue_creation);
776
768static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) 777static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
769{ 778{
770 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 779 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
@@ -787,6 +796,8 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
787 sched_setscheduler_nocheck(p, SCHED_FIFO, &param); 796 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
788 cwq->thread = p; 797 cwq->thread = p;
789 798
799 trace_workqueue_creation(cwq->thread, cpu);
800
790 return 0; 801 return 0;
791} 802}
792 803
@@ -868,6 +879,8 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
868} 879}
869EXPORT_SYMBOL_GPL(__create_workqueue_key); 880EXPORT_SYMBOL_GPL(__create_workqueue_key);
870 881
882DEFINE_TRACE(workqueue_destruction);
883
871static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) 884static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
872{ 885{
873 /* 886 /*
@@ -891,6 +904,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
891 * checks list_empty(), and a "normal" queue_work() can't use 904 * checks list_empty(), and a "normal" queue_work() can't use
892 * a dead CPU. 905 * a dead CPU.
893 */ 906 */
907 trace_workqueue_destruction(cwq->thread);
894 kthread_stop(cwq->thread); 908 kthread_stop(cwq->thread);
895 cwq->thread = NULL; 909 cwq->thread = NULL;
896} 910}
diff --git a/lib/Kconfig b/lib/Kconfig
index 03c2c24b9083..206f36a9efb4 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -2,6 +2,9 @@
2# Library configuration 2# Library configuration
3# 3#
4 4
5config BINARY_PRINTF
6 def_bool n
7
5menu "Library routines" 8menu "Library routines"
6 9
7config BITREVERSE 10config BITREVERSE
@@ -98,6 +101,20 @@ config LZO_DECOMPRESS
98 tristate 101 tristate
99 102
100# 103#
104# These all provide a common interface (hence the apparent duplication with
105# ZLIB_INFLATE; DECOMPRESS_GZIP is just a wrapper.)
106#
107config DECOMPRESS_GZIP
108 select ZLIB_INFLATE
109 tristate
110
111config DECOMPRESS_BZIP2
112 tristate
113
114config DECOMPRESS_LZMA
115 tristate
116
117#
101# Generic allocator support is selected if needed 118# Generic allocator support is selected if needed
102# 119#
103config GENERIC_ALLOCATOR 120config GENERIC_ALLOCATOR
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 1bcf9cd4baa0..a0879b2e8b6b 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -402,7 +402,7 @@ config LOCKDEP
402 bool 402 bool
403 depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT 403 depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT
404 select STACKTRACE 404 select STACKTRACE
405 select FRAME_POINTER if !X86 && !MIPS && !PPC 405 select FRAME_POINTER if !MIPS && !PPC
406 select KALLSYMS 406 select KALLSYMS
407 select KALLSYMS_ALL 407 select KALLSYMS_ALL
408 408
diff --git a/lib/Makefile b/lib/Makefile
index 32b0e64ded27..790de7c25d0d 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -11,7 +11,8 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
11 rbtree.o radix-tree.o dump_stack.o \ 11 rbtree.o radix-tree.o dump_stack.o \
12 idr.o int_sqrt.o extable.o prio_tree.o \ 12 idr.o int_sqrt.o extable.o prio_tree.o \
13 sha1.o irq_regs.o reciprocal_div.o argv_split.o \ 13 sha1.o irq_regs.o reciprocal_div.o argv_split.o \
14 proportions.o prio_heap.o ratelimit.o show_mem.o is_single_threaded.o 14 proportions.o prio_heap.o ratelimit.o show_mem.o \
15 is_single_threaded.o decompress.o
15 16
16lib-$(CONFIG_MMU) += ioremap.o 17lib-$(CONFIG_MMU) += ioremap.o
17lib-$(CONFIG_SMP) += cpumask.o 18lib-$(CONFIG_SMP) += cpumask.o
@@ -65,6 +66,10 @@ obj-$(CONFIG_REED_SOLOMON) += reed_solomon/
65obj-$(CONFIG_LZO_COMPRESS) += lzo/ 66obj-$(CONFIG_LZO_COMPRESS) += lzo/
66obj-$(CONFIG_LZO_DECOMPRESS) += lzo/ 67obj-$(CONFIG_LZO_DECOMPRESS) += lzo/
67 68
69lib-$(CONFIG_DECOMPRESS_GZIP) += decompress_inflate.o
70lib-$(CONFIG_DECOMPRESS_BZIP2) += decompress_bunzip2.o
71lib-$(CONFIG_DECOMPRESS_LZMA) += decompress_unlzma.o
72
68obj-$(CONFIG_TEXTSEARCH) += textsearch.o 73obj-$(CONFIG_TEXTSEARCH) += textsearch.o
69obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o 74obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o
70obj-$(CONFIG_TEXTSEARCH_BM) += ts_bm.o 75obj-$(CONFIG_TEXTSEARCH_BM) += ts_bm.o
diff --git a/lib/decompress.c b/lib/decompress.c
new file mode 100644
index 000000000000..d2842f571674
--- /dev/null
+++ b/lib/decompress.c
@@ -0,0 +1,54 @@
1/*
2 * decompress.c
3 *
4 * Detect the decompression method based on magic number
5 */
6
7#include <linux/decompress/generic.h>
8
9#include <linux/decompress/bunzip2.h>
10#include <linux/decompress/unlzma.h>
11#include <linux/decompress/inflate.h>
12
13#include <linux/types.h>
14#include <linux/string.h>
15
16#ifndef CONFIG_DECOMPRESS_GZIP
17# define gunzip NULL
18#endif
19#ifndef CONFIG_DECOMPRESS_BZIP2
20# define bunzip2 NULL
21#endif
22#ifndef CONFIG_DECOMPRESS_LZMA
23# define unlzma NULL
24#endif
25
26static const struct compress_format {
27 unsigned char magic[2];
28 const char *name;
29 decompress_fn decompressor;
30} compressed_formats[] = {
31 { {037, 0213}, "gzip", gunzip },
32 { {037, 0236}, "gzip", gunzip },
33 { {0x42, 0x5a}, "bzip2", bunzip2 },
34 { {0x5d, 0x00}, "lzma", unlzma },
35 { {0, 0}, NULL, NULL }
36};
37
38decompress_fn decompress_method(const unsigned char *inbuf, int len,
39 const char **name)
40{
41 const struct compress_format *cf;
42
43 if (len < 2)
44 return NULL; /* Need at least this much... */
45
46 for (cf = compressed_formats; cf->name; cf++) {
47 if (!memcmp(inbuf, cf->magic, 2))
48 break;
49
50 }
51 if (name)
52 *name = cf->name;
53 return cf->decompressor;
54}
diff --git a/lib/decompress_bunzip2.c b/lib/decompress_bunzip2.c
new file mode 100644
index 000000000000..5d3ddb5fcfd9
--- /dev/null
+++ b/lib/decompress_bunzip2.c
@@ -0,0 +1,735 @@
1/* vi: set sw = 4 ts = 4: */
2/* Small bzip2 deflate implementation, by Rob Landley (rob@landley.net).
3
4 Based on bzip2 decompression code by Julian R Seward (jseward@acm.org),
5 which also acknowledges contributions by Mike Burrows, David Wheeler,
6 Peter Fenwick, Alistair Moffat, Radford Neal, Ian H. Witten,
7 Robert Sedgewick, and Jon L. Bentley.
8
9 This code is licensed under the LGPLv2:
10 LGPL (http://www.gnu.org/copyleft/lgpl.html
11*/
12
13/*
14 Size and speed optimizations by Manuel Novoa III (mjn3@codepoet.org).
15
16 More efficient reading of Huffman codes, a streamlined read_bunzip()
17 function, and various other tweaks. In (limited) tests, approximately
18 20% faster than bzcat on x86 and about 10% faster on arm.
19
20 Note that about 2/3 of the time is spent in read_unzip() reversing
21 the Burrows-Wheeler transformation. Much of that time is delay
22 resulting from cache misses.
23
24 I would ask that anyone benefiting from this work, especially those
25 using it in commercial products, consider making a donation to my local
26 non-profit hospice organization in the name of the woman I loved, who
27 passed away Feb. 12, 2003.
28
29 In memory of Toni W. Hagan
30
31 Hospice of Acadiana, Inc.
32 2600 Johnston St., Suite 200
33 Lafayette, LA 70503-3240
34
35 Phone (337) 232-1234 or 1-800-738-2226
36 Fax (337) 232-1297
37
38 http://www.hospiceacadiana.com/
39
40 Manuel
41 */
42
43/*
44 Made it fit for running in Linux Kernel by Alain Knaff (alain@knaff.lu)
45*/
46
47
48#ifndef STATIC
49#include <linux/decompress/bunzip2.h>
50#endif /* !STATIC */
51
52#include <linux/decompress/mm.h>
53
54#ifndef INT_MAX
55#define INT_MAX 0x7fffffff
56#endif
57
58/* Constants for Huffman coding */
59#define MAX_GROUPS 6
60#define GROUP_SIZE 50 /* 64 would have been more efficient */
61#define MAX_HUFCODE_BITS 20 /* Longest Huffman code allowed */
62#define MAX_SYMBOLS 258 /* 256 literals + RUNA + RUNB */
63#define SYMBOL_RUNA 0
64#define SYMBOL_RUNB 1
65
66/* Status return values */
67#define RETVAL_OK 0
68#define RETVAL_LAST_BLOCK (-1)
69#define RETVAL_NOT_BZIP_DATA (-2)
70#define RETVAL_UNEXPECTED_INPUT_EOF (-3)
71#define RETVAL_UNEXPECTED_OUTPUT_EOF (-4)
72#define RETVAL_DATA_ERROR (-5)
73#define RETVAL_OUT_OF_MEMORY (-6)
74#define RETVAL_OBSOLETE_INPUT (-7)
75
76/* Other housekeeping constants */
77#define BZIP2_IOBUF_SIZE 4096
78
79/* This is what we know about each Huffman coding group */
80struct group_data {
81 /* We have an extra slot at the end of limit[] for a sentinal value. */
82 int limit[MAX_HUFCODE_BITS+1];
83 int base[MAX_HUFCODE_BITS];
84 int permute[MAX_SYMBOLS];
85 int minLen, maxLen;
86};
87
88/* Structure holding all the housekeeping data, including IO buffers and
89 memory that persists between calls to bunzip */
90struct bunzip_data {
91 /* State for interrupting output loop */
92 int writeCopies, writePos, writeRunCountdown, writeCount, writeCurrent;
93 /* I/O tracking data (file handles, buffers, positions, etc.) */
94 int (*fill)(void*, unsigned int);
95 int inbufCount, inbufPos /*, outbufPos*/;
96 unsigned char *inbuf /*,*outbuf*/;
97 unsigned int inbufBitCount, inbufBits;
98 /* The CRC values stored in the block header and calculated from the
99 data */
100 unsigned int crc32Table[256], headerCRC, totalCRC, writeCRC;
101 /* Intermediate buffer and its size (in bytes) */
102 unsigned int *dbuf, dbufSize;
103 /* These things are a bit too big to go on the stack */
104 unsigned char selectors[32768]; /* nSelectors = 15 bits */
105 struct group_data groups[MAX_GROUPS]; /* Huffman coding tables */
106 int io_error; /* non-zero if we have IO error */
107};
108
109
110/* Return the next nnn bits of input. All reads from the compressed input
111 are done through this function. All reads are big endian */
112static unsigned int INIT get_bits(struct bunzip_data *bd, char bits_wanted)
113{
114 unsigned int bits = 0;
115
116 /* If we need to get more data from the byte buffer, do so.
117 (Loop getting one byte at a time to enforce endianness and avoid
118 unaligned access.) */
119 while (bd->inbufBitCount < bits_wanted) {
120 /* If we need to read more data from file into byte buffer, do
121 so */
122 if (bd->inbufPos == bd->inbufCount) {
123 if (bd->io_error)
124 return 0;
125 bd->inbufCount = bd->fill(bd->inbuf, BZIP2_IOBUF_SIZE);
126 if (bd->inbufCount <= 0) {
127 bd->io_error = RETVAL_UNEXPECTED_INPUT_EOF;
128 return 0;
129 }
130 bd->inbufPos = 0;
131 }
132 /* Avoid 32-bit overflow (dump bit buffer to top of output) */
133 if (bd->inbufBitCount >= 24) {
134 bits = bd->inbufBits&((1 << bd->inbufBitCount)-1);
135 bits_wanted -= bd->inbufBitCount;
136 bits <<= bits_wanted;
137 bd->inbufBitCount = 0;
138 }
139 /* Grab next 8 bits of input from buffer. */
140 bd->inbufBits = (bd->inbufBits << 8)|bd->inbuf[bd->inbufPos++];
141 bd->inbufBitCount += 8;
142 }
143 /* Calculate result */
144 bd->inbufBitCount -= bits_wanted;
145 bits |= (bd->inbufBits >> bd->inbufBitCount)&((1 << bits_wanted)-1);
146
147 return bits;
148}
149
150/* Unpacks the next block and sets up for the inverse burrows-wheeler step. */
151
152static int INIT get_next_block(struct bunzip_data *bd)
153{
154 struct group_data *hufGroup = NULL;
155 int *base = NULL;
156 int *limit = NULL;
157 int dbufCount, nextSym, dbufSize, groupCount, selector,
158 i, j, k, t, runPos, symCount, symTotal, nSelectors,
159 byteCount[256];
160 unsigned char uc, symToByte[256], mtfSymbol[256], *selectors;
161 unsigned int *dbuf, origPtr;
162
163 dbuf = bd->dbuf;
164 dbufSize = bd->dbufSize;
165 selectors = bd->selectors;
166
167 /* Read in header signature and CRC, then validate signature.
168 (last block signature means CRC is for whole file, return now) */
169 i = get_bits(bd, 24);
170 j = get_bits(bd, 24);
171 bd->headerCRC = get_bits(bd, 32);
172 if ((i == 0x177245) && (j == 0x385090))
173 return RETVAL_LAST_BLOCK;
174 if ((i != 0x314159) || (j != 0x265359))
175 return RETVAL_NOT_BZIP_DATA;
176 /* We can add support for blockRandomised if anybody complains.
177 There was some code for this in busybox 1.0.0-pre3, but nobody ever
178 noticed that it didn't actually work. */
179 if (get_bits(bd, 1))
180 return RETVAL_OBSOLETE_INPUT;
181 origPtr = get_bits(bd, 24);
182 if (origPtr > dbufSize)
183 return RETVAL_DATA_ERROR;
184 /* mapping table: if some byte values are never used (encoding things
185 like ascii text), the compression code removes the gaps to have fewer
186 symbols to deal with, and writes a sparse bitfield indicating which
187 values were present. We make a translation table to convert the
188 symbols back to the corresponding bytes. */
189 t = get_bits(bd, 16);
190 symTotal = 0;
191 for (i = 0; i < 16; i++) {
192 if (t&(1 << (15-i))) {
193 k = get_bits(bd, 16);
194 for (j = 0; j < 16; j++)
195 if (k&(1 << (15-j)))
196 symToByte[symTotal++] = (16*i)+j;
197 }
198 }
199 /* How many different Huffman coding groups does this block use? */
200 groupCount = get_bits(bd, 3);
201 if (groupCount < 2 || groupCount > MAX_GROUPS)
202 return RETVAL_DATA_ERROR;
203 /* nSelectors: Every GROUP_SIZE many symbols we select a new
204 Huffman coding group. Read in the group selector list,
205 which is stored as MTF encoded bit runs. (MTF = Move To
206 Front, as each value is used it's moved to the start of the
207 list.) */
208 nSelectors = get_bits(bd, 15);
209 if (!nSelectors)
210 return RETVAL_DATA_ERROR;
211 for (i = 0; i < groupCount; i++)
212 mtfSymbol[i] = i;
213 for (i = 0; i < nSelectors; i++) {
214 /* Get next value */
215 for (j = 0; get_bits(bd, 1); j++)
216 if (j >= groupCount)
217 return RETVAL_DATA_ERROR;
218 /* Decode MTF to get the next selector */
219 uc = mtfSymbol[j];
220 for (; j; j--)
221 mtfSymbol[j] = mtfSymbol[j-1];
222 mtfSymbol[0] = selectors[i] = uc;
223 }
224 /* Read the Huffman coding tables for each group, which code
225 for symTotal literal symbols, plus two run symbols (RUNA,
226 RUNB) */
227 symCount = symTotal+2;
228 for (j = 0; j < groupCount; j++) {
229 unsigned char length[MAX_SYMBOLS], temp[MAX_HUFCODE_BITS+1];
230 int minLen, maxLen, pp;
231 /* Read Huffman code lengths for each symbol. They're
232 stored in a way similar to mtf; record a starting
233 value for the first symbol, and an offset from the
234 previous value for everys symbol after that.
235 (Subtracting 1 before the loop and then adding it
236 back at the end is an optimization that makes the
237 test inside the loop simpler: symbol length 0
238 becomes negative, so an unsigned inequality catches
239 it.) */
240 t = get_bits(bd, 5)-1;
241 for (i = 0; i < symCount; i++) {
242 for (;;) {
243 if (((unsigned)t) > (MAX_HUFCODE_BITS-1))
244 return RETVAL_DATA_ERROR;
245
246 /* If first bit is 0, stop. Else
247 second bit indicates whether to
248 increment or decrement the value.
249 Optimization: grab 2 bits and unget
250 the second if the first was 0. */
251
252 k = get_bits(bd, 2);
253 if (k < 2) {
254 bd->inbufBitCount++;
255 break;
256 }
257 /* Add one if second bit 1, else
258 * subtract 1. Avoids if/else */
259 t += (((k+1)&2)-1);
260 }
261 /* Correct for the initial -1, to get the
262 * final symbol length */
263 length[i] = t+1;
264 }
265 /* Find largest and smallest lengths in this group */
266 minLen = maxLen = length[0];
267
268 for (i = 1; i < symCount; i++) {
269 if (length[i] > maxLen)
270 maxLen = length[i];
271 else if (length[i] < minLen)
272 minLen = length[i];
273 }
274
275 /* Calculate permute[], base[], and limit[] tables from
276 * length[].
277 *
278 * permute[] is the lookup table for converting
279 * Huffman coded symbols into decoded symbols. base[]
280 * is the amount to subtract from the value of a
281 * Huffman symbol of a given length when using
282 * permute[].
283 *
284 * limit[] indicates the largest numerical value a
285 * symbol with a given number of bits can have. This
286 * is how the Huffman codes can vary in length: each
287 * code with a value > limit[length] needs another
288 * bit.
289 */
290 hufGroup = bd->groups+j;
291 hufGroup->minLen = minLen;
292 hufGroup->maxLen = maxLen;
293 /* Note that minLen can't be smaller than 1, so we
294 adjust the base and limit array pointers so we're
295 not always wasting the first entry. We do this
296 again when using them (during symbol decoding).*/
297 base = hufGroup->base-1;
298 limit = hufGroup->limit-1;
299 /* Calculate permute[]. Concurently, initialize
300 * temp[] and limit[]. */
301 pp = 0;
302 for (i = minLen; i <= maxLen; i++) {
303 temp[i] = limit[i] = 0;
304 for (t = 0; t < symCount; t++)
305 if (length[t] == i)
306 hufGroup->permute[pp++] = t;
307 }
308 /* Count symbols coded for at each bit length */
309 for (i = 0; i < symCount; i++)
310 temp[length[i]]++;
311 /* Calculate limit[] (the largest symbol-coding value
312 *at each bit length, which is (previous limit <<
313 *1)+symbols at this level), and base[] (number of
314 *symbols to ignore at each bit length, which is limit
315 *minus the cumulative count of symbols coded for
316 *already). */
317 pp = t = 0;
318 for (i = minLen; i < maxLen; i++) {
319 pp += temp[i];
320 /* We read the largest possible symbol size
321 and then unget bits after determining how
322 many we need, and those extra bits could be
323 set to anything. (They're noise from
324 future symbols.) At each level we're
325 really only interested in the first few
326 bits, so here we set all the trailing
327 to-be-ignored bits to 1 so they don't
328 affect the value > limit[length]
329 comparison. */
330 limit[i] = (pp << (maxLen - i)) - 1;
331 pp <<= 1;
332 base[i+1] = pp-(t += temp[i]);
333 }
334 limit[maxLen+1] = INT_MAX; /* Sentinal value for
335 * reading next sym. */
336 limit[maxLen] = pp+temp[maxLen]-1;
337 base[minLen] = 0;
338 }
339 /* We've finished reading and digesting the block header. Now
340 read this block's Huffman coded symbols from the file and
341 undo the Huffman coding and run length encoding, saving the
342 result into dbuf[dbufCount++] = uc */
343
344 /* Initialize symbol occurrence counters and symbol Move To
345 * Front table */
346 for (i = 0; i < 256; i++) {
347 byteCount[i] = 0;
348 mtfSymbol[i] = (unsigned char)i;
349 }
350 /* Loop through compressed symbols. */
351 runPos = dbufCount = symCount = selector = 0;
352 for (;;) {
353 /* Determine which Huffman coding group to use. */
354 if (!(symCount--)) {
355 symCount = GROUP_SIZE-1;
356 if (selector >= nSelectors)
357 return RETVAL_DATA_ERROR;
358 hufGroup = bd->groups+selectors[selector++];
359 base = hufGroup->base-1;
360 limit = hufGroup->limit-1;
361 }
362 /* Read next Huffman-coded symbol. */
363 /* Note: It is far cheaper to read maxLen bits and
364 back up than it is to read minLen bits and then an
365 additional bit at a time, testing as we go.
366 Because there is a trailing last block (with file
367 CRC), there is no danger of the overread causing an
368 unexpected EOF for a valid compressed file. As a
369 further optimization, we do the read inline
370 (falling back to a call to get_bits if the buffer
371 runs dry). The following (up to got_huff_bits:) is
372 equivalent to j = get_bits(bd, hufGroup->maxLen);
373 */
374 while (bd->inbufBitCount < hufGroup->maxLen) {
375 if (bd->inbufPos == bd->inbufCount) {
376 j = get_bits(bd, hufGroup->maxLen);
377 goto got_huff_bits;
378 }
379 bd->inbufBits =
380 (bd->inbufBits << 8)|bd->inbuf[bd->inbufPos++];
381 bd->inbufBitCount += 8;
382 };
383 bd->inbufBitCount -= hufGroup->maxLen;
384 j = (bd->inbufBits >> bd->inbufBitCount)&
385 ((1 << hufGroup->maxLen)-1);
386got_huff_bits:
387 /* Figure how how many bits are in next symbol and
388 * unget extras */
389 i = hufGroup->minLen;
390 while (j > limit[i])
391 ++i;
392 bd->inbufBitCount += (hufGroup->maxLen - i);
393 /* Huffman decode value to get nextSym (with bounds checking) */
394 if ((i > hufGroup->maxLen)
395 || (((unsigned)(j = (j>>(hufGroup->maxLen-i))-base[i]))
396 >= MAX_SYMBOLS))
397 return RETVAL_DATA_ERROR;
398 nextSym = hufGroup->permute[j];
399 /* We have now decoded the symbol, which indicates
400 either a new literal byte, or a repeated run of the
401 most recent literal byte. First, check if nextSym
402 indicates a repeated run, and if so loop collecting
403 how many times to repeat the last literal. */
404 if (((unsigned)nextSym) <= SYMBOL_RUNB) { /* RUNA or RUNB */
405 /* If this is the start of a new run, zero out
406 * counter */
407 if (!runPos) {
408 runPos = 1;
409 t = 0;
410 }
411 /* Neat trick that saves 1 symbol: instead of
412 or-ing 0 or 1 at each bit position, add 1
413 or 2 instead. For example, 1011 is 1 << 0
414 + 1 << 1 + 2 << 2. 1010 is 2 << 0 + 2 << 1
415 + 1 << 2. You can make any bit pattern
416 that way using 1 less symbol than the basic
417 or 0/1 method (except all bits 0, which
418 would use no symbols, but a run of length 0
419 doesn't mean anything in this context).
420 Thus space is saved. */
421 t += (runPos << nextSym);
422 /* +runPos if RUNA; +2*runPos if RUNB */
423
424 runPos <<= 1;
425 continue;
426 }
427 /* When we hit the first non-run symbol after a run,
428 we now know how many times to repeat the last
429 literal, so append that many copies to our buffer
430 of decoded symbols (dbuf) now. (The last literal
431 used is the one at the head of the mtfSymbol
432 array.) */
433 if (runPos) {
434 runPos = 0;
435 if (dbufCount+t >= dbufSize)
436 return RETVAL_DATA_ERROR;
437
438 uc = symToByte[mtfSymbol[0]];
439 byteCount[uc] += t;
440 while (t--)
441 dbuf[dbufCount++] = uc;
442 }
443 /* Is this the terminating symbol? */
444 if (nextSym > symTotal)
445 break;
446 /* At this point, nextSym indicates a new literal
447 character. Subtract one to get the position in the
448 MTF array at which this literal is currently to be
449 found. (Note that the result can't be -1 or 0,
450 because 0 and 1 are RUNA and RUNB. But another
451 instance of the first symbol in the mtf array,
452 position 0, would have been handled as part of a
453 run above. Therefore 1 unused mtf position minus 2
454 non-literal nextSym values equals -1.) */
455 if (dbufCount >= dbufSize)
456 return RETVAL_DATA_ERROR;
457 i = nextSym - 1;
458 uc = mtfSymbol[i];
459 /* Adjust the MTF array. Since we typically expect to
460 *move only a small number of symbols, and are bound
461 *by 256 in any case, using memmove here would
462 *typically be bigger and slower due to function call
463 *overhead and other assorted setup costs. */
464 do {
465 mtfSymbol[i] = mtfSymbol[i-1];
466 } while (--i);
467 mtfSymbol[0] = uc;
468 uc = symToByte[uc];
469 /* We have our literal byte. Save it into dbuf. */
470 byteCount[uc]++;
471 dbuf[dbufCount++] = (unsigned int)uc;
472 }
473 /* At this point, we've read all the Huffman-coded symbols
474 (and repeated runs) for this block from the input stream,
475 and decoded them into the intermediate buffer. There are
476 dbufCount many decoded bytes in dbuf[]. Now undo the
477 Burrows-Wheeler transform on dbuf. See
478 http://dogma.net/markn/articles/bwt/bwt.htm
479 */
480 /* Turn byteCount into cumulative occurrence counts of 0 to n-1. */
481 j = 0;
482 for (i = 0; i < 256; i++) {
483 k = j+byteCount[i];
484 byteCount[i] = j;
485 j = k;
486 }
487 /* Figure out what order dbuf would be in if we sorted it. */
488 for (i = 0; i < dbufCount; i++) {
489 uc = (unsigned char)(dbuf[i] & 0xff);
490 dbuf[byteCount[uc]] |= (i << 8);
491 byteCount[uc]++;
492 }
493 /* Decode first byte by hand to initialize "previous" byte.
494 Note that it doesn't get output, and if the first three
495 characters are identical it doesn't qualify as a run (hence
496 writeRunCountdown = 5). */
497 if (dbufCount) {
498 if (origPtr >= dbufCount)
499 return RETVAL_DATA_ERROR;
500 bd->writePos = dbuf[origPtr];
501 bd->writeCurrent = (unsigned char)(bd->writePos&0xff);
502 bd->writePos >>= 8;
503 bd->writeRunCountdown = 5;
504 }
505 bd->writeCount = dbufCount;
506
507 return RETVAL_OK;
508}
509
510/* Undo burrows-wheeler transform on intermediate buffer to produce output.
511 If start_bunzip was initialized with out_fd =-1, then up to len bytes of
512 data are written to outbuf. Return value is number of bytes written or
513 error (all errors are negative numbers). If out_fd!=-1, outbuf and len
514 are ignored, data is written to out_fd and return is RETVAL_OK or error.
515*/
516
517static int INIT read_bunzip(struct bunzip_data *bd, char *outbuf, int len)
518{
519 const unsigned int *dbuf;
520 int pos, xcurrent, previous, gotcount;
521
522 /* If last read was short due to end of file, return last block now */
523 if (bd->writeCount < 0)
524 return bd->writeCount;
525
526 gotcount = 0;
527 dbuf = bd->dbuf;
528 pos = bd->writePos;
529 xcurrent = bd->writeCurrent;
530
531 /* We will always have pending decoded data to write into the output
532 buffer unless this is the very first call (in which case we haven't
533 Huffman-decoded a block into the intermediate buffer yet). */
534
535 if (bd->writeCopies) {
536 /* Inside the loop, writeCopies means extra copies (beyond 1) */
537 --bd->writeCopies;
538 /* Loop outputting bytes */
539 for (;;) {
540 /* If the output buffer is full, snapshot
541 * state and return */
542 if (gotcount >= len) {
543 bd->writePos = pos;
544 bd->writeCurrent = xcurrent;
545 bd->writeCopies++;
546 return len;
547 }
548 /* Write next byte into output buffer, updating CRC */
549 outbuf[gotcount++] = xcurrent;
550 bd->writeCRC = (((bd->writeCRC) << 8)
551 ^bd->crc32Table[((bd->writeCRC) >> 24)
552 ^xcurrent]);
553 /* Loop now if we're outputting multiple
554 * copies of this byte */
555 if (bd->writeCopies) {
556 --bd->writeCopies;
557 continue;
558 }
559decode_next_byte:
560 if (!bd->writeCount--)
561 break;
562 /* Follow sequence vector to undo
563 * Burrows-Wheeler transform */
564 previous = xcurrent;
565 pos = dbuf[pos];
566 xcurrent = pos&0xff;
567 pos >>= 8;
568 /* After 3 consecutive copies of the same
569 byte, the 4th is a repeat count. We count
570 down from 4 instead *of counting up because
571 testing for non-zero is faster */
572 if (--bd->writeRunCountdown) {
573 if (xcurrent != previous)
574 bd->writeRunCountdown = 4;
575 } else {
576 /* We have a repeated run, this byte
577 * indicates the count */
578 bd->writeCopies = xcurrent;
579 xcurrent = previous;
580 bd->writeRunCountdown = 5;
581 /* Sometimes there are just 3 bytes
582 * (run length 0) */
583 if (!bd->writeCopies)
584 goto decode_next_byte;
585 /* Subtract the 1 copy we'd output
586 * anyway to get extras */
587 --bd->writeCopies;
588 }
589 }
590 /* Decompression of this block completed successfully */
591 bd->writeCRC = ~bd->writeCRC;
592 bd->totalCRC = ((bd->totalCRC << 1) |
593 (bd->totalCRC >> 31)) ^ bd->writeCRC;
594 /* If this block had a CRC error, force file level CRC error. */
595 if (bd->writeCRC != bd->headerCRC) {
596 bd->totalCRC = bd->headerCRC+1;
597 return RETVAL_LAST_BLOCK;
598 }
599 }
600
601 /* Refill the intermediate buffer by Huffman-decoding next
602 * block of input */
603 /* (previous is just a convenient unused temp variable here) */
604 previous = get_next_block(bd);
605 if (previous) {
606 bd->writeCount = previous;
607 return (previous != RETVAL_LAST_BLOCK) ? previous : gotcount;
608 }
609 bd->writeCRC = 0xffffffffUL;
610 pos = bd->writePos;
611 xcurrent = bd->writeCurrent;
612 goto decode_next_byte;
613}
614
615static int INIT nofill(void *buf, unsigned int len)
616{
617 return -1;
618}
619
620/* Allocate the structure, read file header. If in_fd ==-1, inbuf must contain
621 a complete bunzip file (len bytes long). If in_fd!=-1, inbuf and len are
622 ignored, and data is read from file handle into temporary buffer. */
623static int INIT start_bunzip(struct bunzip_data **bdp, void *inbuf, int len,
624 int (*fill)(void*, unsigned int))
625{
626 struct bunzip_data *bd;
627 unsigned int i, j, c;
628 const unsigned int BZh0 =
629 (((unsigned int)'B') << 24)+(((unsigned int)'Z') << 16)
630 +(((unsigned int)'h') << 8)+(unsigned int)'0';
631
632 /* Figure out how much data to allocate */
633 i = sizeof(struct bunzip_data);
634
635 /* Allocate bunzip_data. Most fields initialize to zero. */
636 bd = *bdp = malloc(i);
637 memset(bd, 0, sizeof(struct bunzip_data));
638 /* Setup input buffer */
639 bd->inbuf = inbuf;
640 bd->inbufCount = len;
641 if (fill != NULL)
642 bd->fill = fill;
643 else
644 bd->fill = nofill;
645
646 /* Init the CRC32 table (big endian) */
647 for (i = 0; i < 256; i++) {
648 c = i << 24;
649 for (j = 8; j; j--)
650 c = c&0x80000000 ? (c << 1)^0x04c11db7 : (c << 1);
651 bd->crc32Table[i] = c;
652 }
653
654 /* Ensure that file starts with "BZh['1'-'9']." */
655 i = get_bits(bd, 32);
656 if (((unsigned int)(i-BZh0-1)) >= 9)
657 return RETVAL_NOT_BZIP_DATA;
658
659 /* Fourth byte (ascii '1'-'9'), indicates block size in units of 100k of
660 uncompressed data. Allocate intermediate buffer for block. */
661 bd->dbufSize = 100000*(i-BZh0);
662
663 bd->dbuf = large_malloc(bd->dbufSize * sizeof(int));
664 return RETVAL_OK;
665}
666
667/* Example usage: decompress src_fd to dst_fd. (Stops at end of bzip2 data,
668 not end of file.) */
669STATIC int INIT bunzip2(unsigned char *buf, int len,
670 int(*fill)(void*, unsigned int),
671 int(*flush)(void*, unsigned int),
672 unsigned char *outbuf,
673 int *pos,
674 void(*error_fn)(char *x))
675{
676 struct bunzip_data *bd;
677 int i = -1;
678 unsigned char *inbuf;
679
680 set_error_fn(error_fn);
681 if (flush)
682 outbuf = malloc(BZIP2_IOBUF_SIZE);
683 else
684 len -= 4; /* Uncompressed size hack active in pre-boot
685 environment */
686 if (!outbuf) {
687 error("Could not allocate output bufer");
688 return -1;
689 }
690 if (buf)
691 inbuf = buf;
692 else
693 inbuf = malloc(BZIP2_IOBUF_SIZE);
694 if (!inbuf) {
695 error("Could not allocate input bufer");
696 goto exit_0;
697 }
698 i = start_bunzip(&bd, inbuf, len, fill);
699 if (!i) {
700 for (;;) {
701 i = read_bunzip(bd, outbuf, BZIP2_IOBUF_SIZE);
702 if (i <= 0)
703 break;
704 if (!flush)
705 outbuf += i;
706 else
707 if (i != flush(outbuf, i)) {
708 i = RETVAL_UNEXPECTED_OUTPUT_EOF;
709 break;
710 }
711 }
712 }
713 /* Check CRC and release memory */
714 if (i == RETVAL_LAST_BLOCK) {
715 if (bd->headerCRC != bd->totalCRC)
716 error("Data integrity error when decompressing.");
717 else
718 i = RETVAL_OK;
719 } else if (i == RETVAL_UNEXPECTED_OUTPUT_EOF) {
720 error("Compressed file ends unexpectedly");
721 }
722 if (bd->dbuf)
723 large_free(bd->dbuf);
724 if (pos)
725 *pos = bd->inbufPos;
726 free(bd);
727 if (!buf)
728 free(inbuf);
729exit_0:
730 if (flush)
731 free(outbuf);
732 return i;
733}
734
735#define decompress bunzip2
diff --git a/lib/decompress_inflate.c b/lib/decompress_inflate.c
new file mode 100644
index 000000000000..839a329b4fc4
--- /dev/null
+++ b/lib/decompress_inflate.c
@@ -0,0 +1,167 @@
1#ifdef STATIC
2/* Pre-boot environment: included */
3
4/* prevent inclusion of _LINUX_KERNEL_H in pre-boot environment: lots
5 * errors about console_printk etc... on ARM */
6#define _LINUX_KERNEL_H
7
8#include "zlib_inflate/inftrees.c"
9#include "zlib_inflate/inffast.c"
10#include "zlib_inflate/inflate.c"
11
12#else /* STATIC */
13/* initramfs et al: linked */
14
15#include <linux/zutil.h>
16
17#include "zlib_inflate/inftrees.h"
18#include "zlib_inflate/inffast.h"
19#include "zlib_inflate/inflate.h"
20
21#include "zlib_inflate/infutil.h"
22
23#endif /* STATIC */
24
25#include <linux/decompress/mm.h>
26
27#define INBUF_LEN (16*1024)
28
29/* Included from initramfs et al code */
30STATIC int INIT gunzip(unsigned char *buf, int len,
31 int(*fill)(void*, unsigned int),
32 int(*flush)(void*, unsigned int),
33 unsigned char *out_buf,
34 int *pos,
35 void(*error_fn)(char *x)) {
36 u8 *zbuf;
37 struct z_stream_s *strm;
38 int rc;
39 size_t out_len;
40
41 set_error_fn(error_fn);
42 rc = -1;
43 if (flush) {
44 out_len = 0x8000; /* 32 K */
45 out_buf = malloc(out_len);
46 } else {
47 out_len = 0x7fffffff; /* no limit */
48 }
49 if (!out_buf) {
50 error("Out of memory while allocating output buffer");
51 goto gunzip_nomem1;
52 }
53
54 if (buf)
55 zbuf = buf;
56 else {
57 zbuf = malloc(INBUF_LEN);
58 len = 0;
59 }
60 if (!zbuf) {
61 error("Out of memory while allocating input buffer");
62 goto gunzip_nomem2;
63 }
64
65 strm = malloc(sizeof(*strm));
66 if (strm == NULL) {
67 error("Out of memory while allocating z_stream");
68 goto gunzip_nomem3;
69 }
70
71 strm->workspace = malloc(flush ? zlib_inflate_workspacesize() :
72 sizeof(struct inflate_state));
73 if (strm->workspace == NULL) {
74 error("Out of memory while allocating workspace");
75 goto gunzip_nomem4;
76 }
77
78 if (len == 0)
79 len = fill(zbuf, INBUF_LEN);
80
81 /* verify the gzip header */
82 if (len < 10 ||
83 zbuf[0] != 0x1f || zbuf[1] != 0x8b || zbuf[2] != 0x08) {
84 if (pos)
85 *pos = 0;
86 error("Not a gzip file");
87 goto gunzip_5;
88 }
89
90 /* skip over gzip header (1f,8b,08... 10 bytes total +
91 * possible asciz filename)
92 */
93 strm->next_in = zbuf + 10;
94 /* skip over asciz filename */
95 if (zbuf[3] & 0x8) {
96 while (strm->next_in[0])
97 strm->next_in++;
98 strm->next_in++;
99 }
100 strm->avail_in = len - (strm->next_in - zbuf);
101
102 strm->next_out = out_buf;
103 strm->avail_out = out_len;
104
105 rc = zlib_inflateInit2(strm, -MAX_WBITS);
106
107 if (!flush) {
108 WS(strm)->inflate_state.wsize = 0;
109 WS(strm)->inflate_state.window = NULL;
110 }
111
112 while (rc == Z_OK) {
113 if (strm->avail_in == 0) {
114 /* TODO: handle case where both pos and fill are set */
115 len = fill(zbuf, INBUF_LEN);
116 if (len < 0) {
117 rc = -1;
118 error("read error");
119 break;
120 }
121 strm->next_in = zbuf;
122 strm->avail_in = len;
123 }
124 rc = zlib_inflate(strm, 0);
125
126 /* Write any data generated */
127 if (flush && strm->next_out > out_buf) {
128 int l = strm->next_out - out_buf;
129 if (l != flush(out_buf, l)) {
130 rc = -1;
131 error("write error");
132 break;
133 }
134 strm->next_out = out_buf;
135 strm->avail_out = out_len;
136 }
137
138 /* after Z_FINISH, only Z_STREAM_END is "we unpacked it all" */
139 if (rc == Z_STREAM_END) {
140 rc = 0;
141 break;
142 } else if (rc != Z_OK) {
143 error("uncompression error");
144 rc = -1;
145 }
146 }
147
148 zlib_inflateEnd(strm);
149 if (pos)
150 /* add + 8 to skip over trailer */
151 *pos = strm->next_in - zbuf+8;
152
153gunzip_5:
154 free(strm->workspace);
155gunzip_nomem4:
156 free(strm);
157gunzip_nomem3:
158 if (!buf)
159 free(zbuf);
160gunzip_nomem2:
161 if (flush)
162 free(out_buf);
163gunzip_nomem1:
164 return rc; /* returns Z_OK (0) if successful */
165}
166
167#define decompress gunzip
diff --git a/lib/decompress_unlzma.c b/lib/decompress_unlzma.c
new file mode 100644
index 000000000000..546f2f4c157e
--- /dev/null
+++ b/lib/decompress_unlzma.c
@@ -0,0 +1,647 @@
1/* Lzma decompressor for Linux kernel. Shamelessly snarfed
2 *from busybox 1.1.1
3 *
4 *Linux kernel adaptation
5 *Copyright (C) 2006 Alain < alain@knaff.lu >
6 *
7 *Based on small lzma deflate implementation/Small range coder
8 *implementation for lzma.
9 *Copyright (C) 2006 Aurelien Jacobs < aurel@gnuage.org >
10 *
11 *Based on LzmaDecode.c from the LZMA SDK 4.22 (http://www.7-zip.org/)
12 *Copyright (C) 1999-2005 Igor Pavlov
13 *
14 *Copyrights of the parts, see headers below.
15 *
16 *
17 *This program is free software; you can redistribute it and/or
18 *modify it under the terms of the GNU Lesser General Public
19 *License as published by the Free Software Foundation; either
20 *version 2.1 of the License, or (at your option) any later version.
21 *
22 *This program is distributed in the hope that it will be useful,
23 *but WITHOUT ANY WARRANTY; without even the implied warranty of
24 *MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
25 *Lesser General Public License for more details.
26 *
27 *You should have received a copy of the GNU Lesser General Public
28 *License along with this library; if not, write to the Free Software
29 *Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
30 */
31
32#ifndef STATIC
33#include <linux/decompress/unlzma.h>
34#endif /* STATIC */
35
36#include <linux/decompress/mm.h>
37
38#define MIN(a, b) (((a) < (b)) ? (a) : (b))
39
40static long long INIT read_int(unsigned char *ptr, int size)
41{
42 int i;
43 long long ret = 0;
44
45 for (i = 0; i < size; i++)
46 ret = (ret << 8) | ptr[size-i-1];
47 return ret;
48}
49
50#define ENDIAN_CONVERT(x) \
51 x = (typeof(x))read_int((unsigned char *)&x, sizeof(x))
52
53
54/* Small range coder implementation for lzma.
55 *Copyright (C) 2006 Aurelien Jacobs < aurel@gnuage.org >
56 *
57 *Based on LzmaDecode.c from the LZMA SDK 4.22 (http://www.7-zip.org/)
58 *Copyright (c) 1999-2005 Igor Pavlov
59 */
60
61#include <linux/compiler.h>
62
63#define LZMA_IOBUF_SIZE 0x10000
64
65struct rc {
66 int (*fill)(void*, unsigned int);
67 uint8_t *ptr;
68 uint8_t *buffer;
69 uint8_t *buffer_end;
70 int buffer_size;
71 uint32_t code;
72 uint32_t range;
73 uint32_t bound;
74};
75
76
77#define RC_TOP_BITS 24
78#define RC_MOVE_BITS 5
79#define RC_MODEL_TOTAL_BITS 11
80
81
82/* Called twice: once at startup and once in rc_normalize() */
83static void INIT rc_read(struct rc *rc)
84{
85 rc->buffer_size = rc->fill((char *)rc->buffer, LZMA_IOBUF_SIZE);
86 if (rc->buffer_size <= 0)
87 error("unexpected EOF");
88 rc->ptr = rc->buffer;
89 rc->buffer_end = rc->buffer + rc->buffer_size;
90}
91
92/* Called once */
93static inline void INIT rc_init(struct rc *rc,
94 int (*fill)(void*, unsigned int),
95 char *buffer, int buffer_size)
96{
97 rc->fill = fill;
98 rc->buffer = (uint8_t *)buffer;
99 rc->buffer_size = buffer_size;
100 rc->buffer_end = rc->buffer + rc->buffer_size;
101 rc->ptr = rc->buffer;
102
103 rc->code = 0;
104 rc->range = 0xFFFFFFFF;
105}
106
107static inline void INIT rc_init_code(struct rc *rc)
108{
109 int i;
110
111 for (i = 0; i < 5; i++) {
112 if (rc->ptr >= rc->buffer_end)
113 rc_read(rc);
114 rc->code = (rc->code << 8) | *rc->ptr++;
115 }
116}
117
118
119/* Called once. TODO: bb_maybe_free() */
120static inline void INIT rc_free(struct rc *rc)
121{
122 free(rc->buffer);
123}
124
125/* Called twice, but one callsite is in inline'd rc_is_bit_0_helper() */
126static void INIT rc_do_normalize(struct rc *rc)
127{
128 if (rc->ptr >= rc->buffer_end)
129 rc_read(rc);
130 rc->range <<= 8;
131 rc->code = (rc->code << 8) | *rc->ptr++;
132}
133static inline void INIT rc_normalize(struct rc *rc)
134{
135 if (rc->range < (1 << RC_TOP_BITS))
136 rc_do_normalize(rc);
137}
138
139/* Called 9 times */
140/* Why rc_is_bit_0_helper exists?
141 *Because we want to always expose (rc->code < rc->bound) to optimizer
142 */
143static inline uint32_t INIT rc_is_bit_0_helper(struct rc *rc, uint16_t *p)
144{
145 rc_normalize(rc);
146 rc->bound = *p * (rc->range >> RC_MODEL_TOTAL_BITS);
147 return rc->bound;
148}
149static inline int INIT rc_is_bit_0(struct rc *rc, uint16_t *p)
150{
151 uint32_t t = rc_is_bit_0_helper(rc, p);
152 return rc->code < t;
153}
154
155/* Called ~10 times, but very small, thus inlined */
156static inline void INIT rc_update_bit_0(struct rc *rc, uint16_t *p)
157{
158 rc->range = rc->bound;
159 *p += ((1 << RC_MODEL_TOTAL_BITS) - *p) >> RC_MOVE_BITS;
160}
161static inline void rc_update_bit_1(struct rc *rc, uint16_t *p)
162{
163 rc->range -= rc->bound;
164 rc->code -= rc->bound;
165 *p -= *p >> RC_MOVE_BITS;
166}
167
168/* Called 4 times in unlzma loop */
169static int INIT rc_get_bit(struct rc *rc, uint16_t *p, int *symbol)
170{
171 if (rc_is_bit_0(rc, p)) {
172 rc_update_bit_0(rc, p);
173 *symbol *= 2;
174 return 0;
175 } else {
176 rc_update_bit_1(rc, p);
177 *symbol = *symbol * 2 + 1;
178 return 1;
179 }
180}
181
182/* Called once */
183static inline int INIT rc_direct_bit(struct rc *rc)
184{
185 rc_normalize(rc);
186 rc->range >>= 1;
187 if (rc->code >= rc->range) {
188 rc->code -= rc->range;
189 return 1;
190 }
191 return 0;
192}
193
194/* Called twice */
195static inline void INIT
196rc_bit_tree_decode(struct rc *rc, uint16_t *p, int num_levels, int *symbol)
197{
198 int i = num_levels;
199
200 *symbol = 1;
201 while (i--)
202 rc_get_bit(rc, p + *symbol, symbol);
203 *symbol -= 1 << num_levels;
204}
205
206
207/*
208 * Small lzma deflate implementation.
209 * Copyright (C) 2006 Aurelien Jacobs < aurel@gnuage.org >
210 *
211 * Based on LzmaDecode.c from the LZMA SDK 4.22 (http://www.7-zip.org/)
212 * Copyright (C) 1999-2005 Igor Pavlov
213 */
214
215
216struct lzma_header {
217 uint8_t pos;
218 uint32_t dict_size;
219 uint64_t dst_size;
220} __attribute__ ((packed)) ;
221
222
223#define LZMA_BASE_SIZE 1846
224#define LZMA_LIT_SIZE 768
225
226#define LZMA_NUM_POS_BITS_MAX 4
227
228#define LZMA_LEN_NUM_LOW_BITS 3
229#define LZMA_LEN_NUM_MID_BITS 3
230#define LZMA_LEN_NUM_HIGH_BITS 8
231
232#define LZMA_LEN_CHOICE 0
233#define LZMA_LEN_CHOICE_2 (LZMA_LEN_CHOICE + 1)
234#define LZMA_LEN_LOW (LZMA_LEN_CHOICE_2 + 1)
235#define LZMA_LEN_MID (LZMA_LEN_LOW \
236 + (1 << (LZMA_NUM_POS_BITS_MAX + LZMA_LEN_NUM_LOW_BITS)))
237#define LZMA_LEN_HIGH (LZMA_LEN_MID \
238 +(1 << (LZMA_NUM_POS_BITS_MAX + LZMA_LEN_NUM_MID_BITS)))
239#define LZMA_NUM_LEN_PROBS (LZMA_LEN_HIGH + (1 << LZMA_LEN_NUM_HIGH_BITS))
240
241#define LZMA_NUM_STATES 12
242#define LZMA_NUM_LIT_STATES 7
243
244#define LZMA_START_POS_MODEL_INDEX 4
245#define LZMA_END_POS_MODEL_INDEX 14
246#define LZMA_NUM_FULL_DISTANCES (1 << (LZMA_END_POS_MODEL_INDEX >> 1))
247
248#define LZMA_NUM_POS_SLOT_BITS 6
249#define LZMA_NUM_LEN_TO_POS_STATES 4
250
251#define LZMA_NUM_ALIGN_BITS 4
252
253#define LZMA_MATCH_MIN_LEN 2
254
255#define LZMA_IS_MATCH 0
256#define LZMA_IS_REP (LZMA_IS_MATCH + (LZMA_NUM_STATES << LZMA_NUM_POS_BITS_MAX))
257#define LZMA_IS_REP_G0 (LZMA_IS_REP + LZMA_NUM_STATES)
258#define LZMA_IS_REP_G1 (LZMA_IS_REP_G0 + LZMA_NUM_STATES)
259#define LZMA_IS_REP_G2 (LZMA_IS_REP_G1 + LZMA_NUM_STATES)
260#define LZMA_IS_REP_0_LONG (LZMA_IS_REP_G2 + LZMA_NUM_STATES)
261#define LZMA_POS_SLOT (LZMA_IS_REP_0_LONG \
262 + (LZMA_NUM_STATES << LZMA_NUM_POS_BITS_MAX))
263#define LZMA_SPEC_POS (LZMA_POS_SLOT \
264 +(LZMA_NUM_LEN_TO_POS_STATES << LZMA_NUM_POS_SLOT_BITS))
265#define LZMA_ALIGN (LZMA_SPEC_POS \
266 + LZMA_NUM_FULL_DISTANCES - LZMA_END_POS_MODEL_INDEX)
267#define LZMA_LEN_CODER (LZMA_ALIGN + (1 << LZMA_NUM_ALIGN_BITS))
268#define LZMA_REP_LEN_CODER (LZMA_LEN_CODER + LZMA_NUM_LEN_PROBS)
269#define LZMA_LITERAL (LZMA_REP_LEN_CODER + LZMA_NUM_LEN_PROBS)
270
271
272struct writer {
273 uint8_t *buffer;
274 uint8_t previous_byte;
275 size_t buffer_pos;
276 int bufsize;
277 size_t global_pos;
278 int(*flush)(void*, unsigned int);
279 struct lzma_header *header;
280};
281
282struct cstate {
283 int state;
284 uint32_t rep0, rep1, rep2, rep3;
285};
286
287static inline size_t INIT get_pos(struct writer *wr)
288{
289 return
290 wr->global_pos + wr->buffer_pos;
291}
292
293static inline uint8_t INIT peek_old_byte(struct writer *wr,
294 uint32_t offs)
295{
296 if (!wr->flush) {
297 int32_t pos;
298 while (offs > wr->header->dict_size)
299 offs -= wr->header->dict_size;
300 pos = wr->buffer_pos - offs;
301 return wr->buffer[pos];
302 } else {
303 uint32_t pos = wr->buffer_pos - offs;
304 while (pos >= wr->header->dict_size)
305 pos += wr->header->dict_size;
306 return wr->buffer[pos];
307 }
308
309}
310
311static inline void INIT write_byte(struct writer *wr, uint8_t byte)
312{
313 wr->buffer[wr->buffer_pos++] = wr->previous_byte = byte;
314 if (wr->flush && wr->buffer_pos == wr->header->dict_size) {
315 wr->buffer_pos = 0;
316 wr->global_pos += wr->header->dict_size;
317 wr->flush((char *)wr->buffer, wr->header->dict_size);
318 }
319}
320
321
322static inline void INIT copy_byte(struct writer *wr, uint32_t offs)
323{
324 write_byte(wr, peek_old_byte(wr, offs));
325}
326
327static inline void INIT copy_bytes(struct writer *wr,
328 uint32_t rep0, int len)
329{
330 do {
331 copy_byte(wr, rep0);
332 len--;
333 } while (len != 0 && wr->buffer_pos < wr->header->dst_size);
334}
335
336static inline void INIT process_bit0(struct writer *wr, struct rc *rc,
337 struct cstate *cst, uint16_t *p,
338 int pos_state, uint16_t *prob,
339 int lc, uint32_t literal_pos_mask) {
340 int mi = 1;
341 rc_update_bit_0(rc, prob);
342 prob = (p + LZMA_LITERAL +
343 (LZMA_LIT_SIZE
344 * (((get_pos(wr) & literal_pos_mask) << lc)
345 + (wr->previous_byte >> (8 - lc))))
346 );
347
348 if (cst->state >= LZMA_NUM_LIT_STATES) {
349 int match_byte = peek_old_byte(wr, cst->rep0);
350 do {
351 int bit;
352 uint16_t *prob_lit;
353
354 match_byte <<= 1;
355 bit = match_byte & 0x100;
356 prob_lit = prob + 0x100 + bit + mi;
357 if (rc_get_bit(rc, prob_lit, &mi)) {
358 if (!bit)
359 break;
360 } else {
361 if (bit)
362 break;
363 }
364 } while (mi < 0x100);
365 }
366 while (mi < 0x100) {
367 uint16_t *prob_lit = prob + mi;
368 rc_get_bit(rc, prob_lit, &mi);
369 }
370 write_byte(wr, mi);
371 if (cst->state < 4)
372 cst->state = 0;
373 else if (cst->state < 10)
374 cst->state -= 3;
375 else
376 cst->state -= 6;
377}
378
379static inline void INIT process_bit1(struct writer *wr, struct rc *rc,
380 struct cstate *cst, uint16_t *p,
381 int pos_state, uint16_t *prob) {
382 int offset;
383 uint16_t *prob_len;
384 int num_bits;
385 int len;
386
387 rc_update_bit_1(rc, prob);
388 prob = p + LZMA_IS_REP + cst->state;
389 if (rc_is_bit_0(rc, prob)) {
390 rc_update_bit_0(rc, prob);
391 cst->rep3 = cst->rep2;
392 cst->rep2 = cst->rep1;
393 cst->rep1 = cst->rep0;
394 cst->state = cst->state < LZMA_NUM_LIT_STATES ? 0 : 3;
395 prob = p + LZMA_LEN_CODER;
396 } else {
397 rc_update_bit_1(rc, prob);
398 prob = p + LZMA_IS_REP_G0 + cst->state;
399 if (rc_is_bit_0(rc, prob)) {
400 rc_update_bit_0(rc, prob);
401 prob = (p + LZMA_IS_REP_0_LONG
402 + (cst->state <<
403 LZMA_NUM_POS_BITS_MAX) +
404 pos_state);
405 if (rc_is_bit_0(rc, prob)) {
406 rc_update_bit_0(rc, prob);
407
408 cst->state = cst->state < LZMA_NUM_LIT_STATES ?
409 9 : 11;
410 copy_byte(wr, cst->rep0);
411 return;
412 } else {
413 rc_update_bit_1(rc, prob);
414 }
415 } else {
416 uint32_t distance;
417
418 rc_update_bit_1(rc, prob);
419 prob = p + LZMA_IS_REP_G1 + cst->state;
420 if (rc_is_bit_0(rc, prob)) {
421 rc_update_bit_0(rc, prob);
422 distance = cst->rep1;
423 } else {
424 rc_update_bit_1(rc, prob);
425 prob = p + LZMA_IS_REP_G2 + cst->state;
426 if (rc_is_bit_0(rc, prob)) {
427 rc_update_bit_0(rc, prob);
428 distance = cst->rep2;
429 } else {
430 rc_update_bit_1(rc, prob);
431 distance = cst->rep3;
432 cst->rep3 = cst->rep2;
433 }
434 cst->rep2 = cst->rep1;
435 }
436 cst->rep1 = cst->rep0;
437 cst->rep0 = distance;
438 }
439 cst->state = cst->state < LZMA_NUM_LIT_STATES ? 8 : 11;
440 prob = p + LZMA_REP_LEN_CODER;
441 }
442
443 prob_len = prob + LZMA_LEN_CHOICE;
444 if (rc_is_bit_0(rc, prob_len)) {
445 rc_update_bit_0(rc, prob_len);
446 prob_len = (prob + LZMA_LEN_LOW
447 + (pos_state <<
448 LZMA_LEN_NUM_LOW_BITS));
449 offset = 0;
450 num_bits = LZMA_LEN_NUM_LOW_BITS;
451 } else {
452 rc_update_bit_1(rc, prob_len);
453 prob_len = prob + LZMA_LEN_CHOICE_2;
454 if (rc_is_bit_0(rc, prob_len)) {
455 rc_update_bit_0(rc, prob_len);
456 prob_len = (prob + LZMA_LEN_MID
457 + (pos_state <<
458 LZMA_LEN_NUM_MID_BITS));
459 offset = 1 << LZMA_LEN_NUM_LOW_BITS;
460 num_bits = LZMA_LEN_NUM_MID_BITS;
461 } else {
462 rc_update_bit_1(rc, prob_len);
463 prob_len = prob + LZMA_LEN_HIGH;
464 offset = ((1 << LZMA_LEN_NUM_LOW_BITS)
465 + (1 << LZMA_LEN_NUM_MID_BITS));
466 num_bits = LZMA_LEN_NUM_HIGH_BITS;
467 }
468 }
469
470 rc_bit_tree_decode(rc, prob_len, num_bits, &len);
471 len += offset;
472
473 if (cst->state < 4) {
474 int pos_slot;
475
476 cst->state += LZMA_NUM_LIT_STATES;
477 prob =
478 p + LZMA_POS_SLOT +
479 ((len <
480 LZMA_NUM_LEN_TO_POS_STATES ? len :
481 LZMA_NUM_LEN_TO_POS_STATES - 1)
482 << LZMA_NUM_POS_SLOT_BITS);
483 rc_bit_tree_decode(rc, prob,
484 LZMA_NUM_POS_SLOT_BITS,
485 &pos_slot);
486 if (pos_slot >= LZMA_START_POS_MODEL_INDEX) {
487 int i, mi;
488 num_bits = (pos_slot >> 1) - 1;
489 cst->rep0 = 2 | (pos_slot & 1);
490 if (pos_slot < LZMA_END_POS_MODEL_INDEX) {
491 cst->rep0 <<= num_bits;
492 prob = p + LZMA_SPEC_POS +
493 cst->rep0 - pos_slot - 1;
494 } else {
495 num_bits -= LZMA_NUM_ALIGN_BITS;
496 while (num_bits--)
497 cst->rep0 = (cst->rep0 << 1) |
498 rc_direct_bit(rc);
499 prob = p + LZMA_ALIGN;
500 cst->rep0 <<= LZMA_NUM_ALIGN_BITS;
501 num_bits = LZMA_NUM_ALIGN_BITS;
502 }
503 i = 1;
504 mi = 1;
505 while (num_bits--) {
506 if (rc_get_bit(rc, prob + mi, &mi))
507 cst->rep0 |= i;
508 i <<= 1;
509 }
510 } else
511 cst->rep0 = pos_slot;
512 if (++(cst->rep0) == 0)
513 return;
514 }
515
516 len += LZMA_MATCH_MIN_LEN;
517
518 copy_bytes(wr, cst->rep0, len);
519}
520
521
522
523STATIC inline int INIT unlzma(unsigned char *buf, int in_len,
524 int(*fill)(void*, unsigned int),
525 int(*flush)(void*, unsigned int),
526 unsigned char *output,
527 int *posp,
528 void(*error_fn)(char *x)
529 )
530{
531 struct lzma_header header;
532 int lc, pb, lp;
533 uint32_t pos_state_mask;
534 uint32_t literal_pos_mask;
535 uint16_t *p;
536 int num_probs;
537 struct rc rc;
538 int i, mi;
539 struct writer wr;
540 struct cstate cst;
541 unsigned char *inbuf;
542 int ret = -1;
543
544 set_error_fn(error_fn);
545 if (!flush)
546 in_len -= 4; /* Uncompressed size hack active in pre-boot
547 environment */
548 if (buf)
549 inbuf = buf;
550 else
551 inbuf = malloc(LZMA_IOBUF_SIZE);
552 if (!inbuf) {
553 error("Could not allocate input bufer");
554 goto exit_0;
555 }
556
557 cst.state = 0;
558 cst.rep0 = cst.rep1 = cst.rep2 = cst.rep3 = 1;
559
560 wr.header = &header;
561 wr.flush = flush;
562 wr.global_pos = 0;
563 wr.previous_byte = 0;
564 wr.buffer_pos = 0;
565
566 rc_init(&rc, fill, inbuf, in_len);
567
568 for (i = 0; i < sizeof(header); i++) {
569 if (rc.ptr >= rc.buffer_end)
570 rc_read(&rc);
571 ((unsigned char *)&header)[i] = *rc.ptr++;
572 }
573
574 if (header.pos >= (9 * 5 * 5))
575 error("bad header");
576
577 mi = 0;
578 lc = header.pos;
579 while (lc >= 9) {
580 mi++;
581 lc -= 9;
582 }
583 pb = 0;
584 lp = mi;
585 while (lp >= 5) {
586 pb++;
587 lp -= 5;
588 }
589 pos_state_mask = (1 << pb) - 1;
590 literal_pos_mask = (1 << lp) - 1;
591
592 ENDIAN_CONVERT(header.dict_size);
593 ENDIAN_CONVERT(header.dst_size);
594
595 if (header.dict_size == 0)
596 header.dict_size = 1;
597
598 if (output)
599 wr.buffer = output;
600 else {
601 wr.bufsize = MIN(header.dst_size, header.dict_size);
602 wr.buffer = large_malloc(wr.bufsize);
603 }
604 if (wr.buffer == NULL)
605 goto exit_1;
606
607 num_probs = LZMA_BASE_SIZE + (LZMA_LIT_SIZE << (lc + lp));
608 p = (uint16_t *) large_malloc(num_probs * sizeof(*p));
609 if (p == 0)
610 goto exit_2;
611 num_probs = LZMA_LITERAL + (LZMA_LIT_SIZE << (lc + lp));
612 for (i = 0; i < num_probs; i++)
613 p[i] = (1 << RC_MODEL_TOTAL_BITS) >> 1;
614
615 rc_init_code(&rc);
616
617 while (get_pos(&wr) < header.dst_size) {
618 int pos_state = get_pos(&wr) & pos_state_mask;
619 uint16_t *prob = p + LZMA_IS_MATCH +
620 (cst.state << LZMA_NUM_POS_BITS_MAX) + pos_state;
621 if (rc_is_bit_0(&rc, prob))
622 process_bit0(&wr, &rc, &cst, p, pos_state, prob,
623 lc, literal_pos_mask);
624 else {
625 process_bit1(&wr, &rc, &cst, p, pos_state, prob);
626 if (cst.rep0 == 0)
627 break;
628 }
629 }
630
631 if (posp)
632 *posp = rc.ptr-rc.buffer;
633 if (wr.flush)
634 wr.flush(wr.buffer, wr.buffer_pos);
635 ret = 0;
636 large_free(p);
637exit_2:
638 if (!output)
639 large_free(wr.buffer);
640exit_1:
641 if (!buf)
642 free(inbuf);
643exit_0:
644 return ret;
645}
646
647#define decompress unlzma
diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
index 280332c1827c..619313ed6c46 100644
--- a/lib/locking-selftest.c
+++ b/lib/locking-selftest.c
@@ -157,11 +157,11 @@ static void init_shared_classes(void)
157#define SOFTIRQ_ENTER() \ 157#define SOFTIRQ_ENTER() \
158 local_bh_disable(); \ 158 local_bh_disable(); \
159 local_irq_disable(); \ 159 local_irq_disable(); \
160 trace_softirq_enter(); \ 160 lockdep_softirq_enter(); \
161 WARN_ON(!in_softirq()); 161 WARN_ON(!in_softirq());
162 162
163#define SOFTIRQ_EXIT() \ 163#define SOFTIRQ_EXIT() \
164 trace_softirq_exit(); \ 164 lockdep_softirq_exit(); \
165 local_irq_enable(); \ 165 local_irq_enable(); \
166 local_bh_enable(); 166 local_bh_enable();
167 167
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 0fbd0121d91d..dc1674377009 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -396,7 +396,38 @@ static noinline char* put_dec(char *buf, unsigned long long num)
396#define SMALL 32 /* Must be 32 == 0x20 */ 396#define SMALL 32 /* Must be 32 == 0x20 */
397#define SPECIAL 64 /* 0x */ 397#define SPECIAL 64 /* 0x */
398 398
399static char *number(char *buf, char *end, unsigned long long num, int base, int size, int precision, int type) 399enum format_type {
400 FORMAT_TYPE_NONE, /* Just a string part */
401 FORMAT_TYPE_WITDH,
402 FORMAT_TYPE_PRECISION,
403 FORMAT_TYPE_CHAR,
404 FORMAT_TYPE_STR,
405 FORMAT_TYPE_PTR,
406 FORMAT_TYPE_PERCENT_CHAR,
407 FORMAT_TYPE_INVALID,
408 FORMAT_TYPE_LONG_LONG,
409 FORMAT_TYPE_ULONG,
410 FORMAT_TYPE_LONG,
411 FORMAT_TYPE_USHORT,
412 FORMAT_TYPE_SHORT,
413 FORMAT_TYPE_UINT,
414 FORMAT_TYPE_INT,
415 FORMAT_TYPE_NRCHARS,
416 FORMAT_TYPE_SIZE_T,
417 FORMAT_TYPE_PTRDIFF
418};
419
420struct printf_spec {
421 enum format_type type;
422 int flags; /* flags to number() */
423 int field_width; /* width of output field */
424 int base;
425 int precision; /* # of digits/chars */
426 int qualifier;
427};
428
429static char *number(char *buf, char *end, unsigned long long num,
430 struct printf_spec spec)
400{ 431{
401 /* we are called with base 8, 10 or 16, only, thus don't need "G..." */ 432 /* we are called with base 8, 10 or 16, only, thus don't need "G..." */
402 static const char digits[16] = "0123456789ABCDEF"; /* "GHIJKLMNOPQRSTUVWXYZ"; */ 433 static const char digits[16] = "0123456789ABCDEF"; /* "GHIJKLMNOPQRSTUVWXYZ"; */
@@ -404,32 +435,32 @@ static char *number(char *buf, char *end, unsigned long long num, int base, int
404 char tmp[66]; 435 char tmp[66];
405 char sign; 436 char sign;
406 char locase; 437 char locase;
407 int need_pfx = ((type & SPECIAL) && base != 10); 438 int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10);
408 int i; 439 int i;
409 440
410 /* locase = 0 or 0x20. ORing digits or letters with 'locase' 441 /* locase = 0 or 0x20. ORing digits or letters with 'locase'
411 * produces same digits or (maybe lowercased) letters */ 442 * produces same digits or (maybe lowercased) letters */
412 locase = (type & SMALL); 443 locase = (spec.flags & SMALL);
413 if (type & LEFT) 444 if (spec.flags & LEFT)
414 type &= ~ZEROPAD; 445 spec.flags &= ~ZEROPAD;
415 sign = 0; 446 sign = 0;
416 if (type & SIGN) { 447 if (spec.flags & SIGN) {
417 if ((signed long long) num < 0) { 448 if ((signed long long) num < 0) {
418 sign = '-'; 449 sign = '-';
419 num = - (signed long long) num; 450 num = - (signed long long) num;
420 size--; 451 spec.field_width--;
421 } else if (type & PLUS) { 452 } else if (spec.flags & PLUS) {
422 sign = '+'; 453 sign = '+';
423 size--; 454 spec.field_width--;
424 } else if (type & SPACE) { 455 } else if (spec.flags & SPACE) {
425 sign = ' '; 456 sign = ' ';
426 size--; 457 spec.field_width--;
427 } 458 }
428 } 459 }
429 if (need_pfx) { 460 if (need_pfx) {
430 size--; 461 spec.field_width--;
431 if (base == 16) 462 if (spec.base == 16)
432 size--; 463 spec.field_width--;
433 } 464 }
434 465
435 /* generate full string in tmp[], in reverse order */ 466 /* generate full string in tmp[], in reverse order */
@@ -441,10 +472,10 @@ static char *number(char *buf, char *end, unsigned long long num, int base, int
441 tmp[i++] = (digits[do_div(num,base)] | locase); 472 tmp[i++] = (digits[do_div(num,base)] | locase);
442 } while (num != 0); 473 } while (num != 0);
443 */ 474 */
444 else if (base != 10) { /* 8 or 16 */ 475 else if (spec.base != 10) { /* 8 or 16 */
445 int mask = base - 1; 476 int mask = spec.base - 1;
446 int shift = 3; 477 int shift = 3;
447 if (base == 16) shift = 4; 478 if (spec.base == 16) shift = 4;
448 do { 479 do {
449 tmp[i++] = (digits[((unsigned char)num) & mask] | locase); 480 tmp[i++] = (digits[((unsigned char)num) & mask] | locase);
450 num >>= shift; 481 num >>= shift;
@@ -454,12 +485,12 @@ static char *number(char *buf, char *end, unsigned long long num, int base, int
454 } 485 }
455 486
456 /* printing 100 using %2d gives "100", not "00" */ 487 /* printing 100 using %2d gives "100", not "00" */
457 if (i > precision) 488 if (i > spec.precision)
458 precision = i; 489 spec.precision = i;
459 /* leading space padding */ 490 /* leading space padding */
460 size -= precision; 491 spec.field_width -= spec.precision;
461 if (!(type & (ZEROPAD+LEFT))) { 492 if (!(spec.flags & (ZEROPAD+LEFT))) {
462 while(--size >= 0) { 493 while(--spec.field_width >= 0) {
463 if (buf < end) 494 if (buf < end)
464 *buf = ' '; 495 *buf = ' ';
465 ++buf; 496 ++buf;
@@ -476,23 +507,23 @@ static char *number(char *buf, char *end, unsigned long long num, int base, int
476 if (buf < end) 507 if (buf < end)
477 *buf = '0'; 508 *buf = '0';
478 ++buf; 509 ++buf;
479 if (base == 16) { 510 if (spec.base == 16) {
480 if (buf < end) 511 if (buf < end)
481 *buf = ('X' | locase); 512 *buf = ('X' | locase);
482 ++buf; 513 ++buf;
483 } 514 }
484 } 515 }
485 /* zero or space padding */ 516 /* zero or space padding */
486 if (!(type & LEFT)) { 517 if (!(spec.flags & LEFT)) {
487 char c = (type & ZEROPAD) ? '0' : ' '; 518 char c = (spec.flags & ZEROPAD) ? '0' : ' ';
488 while (--size >= 0) { 519 while (--spec.field_width >= 0) {
489 if (buf < end) 520 if (buf < end)
490 *buf = c; 521 *buf = c;
491 ++buf; 522 ++buf;
492 } 523 }
493 } 524 }
494 /* hmm even more zero padding? */ 525 /* hmm even more zero padding? */
495 while (i <= --precision) { 526 while (i <= --spec.precision) {
496 if (buf < end) 527 if (buf < end)
497 *buf = '0'; 528 *buf = '0';
498 ++buf; 529 ++buf;
@@ -504,7 +535,7 @@ static char *number(char *buf, char *end, unsigned long long num, int base, int
504 ++buf; 535 ++buf;
505 } 536 }
506 /* trailing space padding */ 537 /* trailing space padding */
507 while (--size >= 0) { 538 while (--spec.field_width >= 0) {
508 if (buf < end) 539 if (buf < end)
509 *buf = ' '; 540 *buf = ' ';
510 ++buf; 541 ++buf;
@@ -512,17 +543,17 @@ static char *number(char *buf, char *end, unsigned long long num, int base, int
512 return buf; 543 return buf;
513} 544}
514 545
515static char *string(char *buf, char *end, char *s, int field_width, int precision, int flags) 546static char *string(char *buf, char *end, char *s, struct printf_spec spec)
516{ 547{
517 int len, i; 548 int len, i;
518 549
519 if ((unsigned long)s < PAGE_SIZE) 550 if ((unsigned long)s < PAGE_SIZE)
520 s = "<NULL>"; 551 s = "<NULL>";
521 552
522 len = strnlen(s, precision); 553 len = strnlen(s, spec.precision);
523 554
524 if (!(flags & LEFT)) { 555 if (!(spec.flags & LEFT)) {
525 while (len < field_width--) { 556 while (len < spec.field_width--) {
526 if (buf < end) 557 if (buf < end)
527 *buf = ' '; 558 *buf = ' ';
528 ++buf; 559 ++buf;
@@ -533,7 +564,7 @@ static char *string(char *buf, char *end, char *s, int field_width, int precisio
533 *buf = *s; 564 *buf = *s;
534 ++buf; ++s; 565 ++buf; ++s;
535 } 566 }
536 while (len < field_width--) { 567 while (len < spec.field_width--) {
537 if (buf < end) 568 if (buf < end)
538 *buf = ' '; 569 *buf = ' ';
539 ++buf; 570 ++buf;
@@ -541,21 +572,24 @@ static char *string(char *buf, char *end, char *s, int field_width, int precisio
541 return buf; 572 return buf;
542} 573}
543 574
544static char *symbol_string(char *buf, char *end, void *ptr, int field_width, int precision, int flags) 575static char *symbol_string(char *buf, char *end, void *ptr,
576 struct printf_spec spec)
545{ 577{
546 unsigned long value = (unsigned long) ptr; 578 unsigned long value = (unsigned long) ptr;
547#ifdef CONFIG_KALLSYMS 579#ifdef CONFIG_KALLSYMS
548 char sym[KSYM_SYMBOL_LEN]; 580 char sym[KSYM_SYMBOL_LEN];
549 sprint_symbol(sym, value); 581 sprint_symbol(sym, value);
550 return string(buf, end, sym, field_width, precision, flags); 582 return string(buf, end, sym, spec);
551#else 583#else
552 field_width = 2*sizeof(void *); 584 spec.field_width = 2*sizeof(void *);
553 flags |= SPECIAL | SMALL | ZEROPAD; 585 spec.flags |= SPECIAL | SMALL | ZEROPAD;
554 return number(buf, end, value, 16, field_width, precision, flags); 586 spec.base = 16;
587 return number(buf, end, value, spec);
555#endif 588#endif
556} 589}
557 590
558static char *resource_string(char *buf, char *end, struct resource *res, int field_width, int precision, int flags) 591static char *resource_string(char *buf, char *end, struct resource *res,
592 struct printf_spec spec)
559{ 593{
560#ifndef IO_RSRC_PRINTK_SIZE 594#ifndef IO_RSRC_PRINTK_SIZE
561#define IO_RSRC_PRINTK_SIZE 4 595#define IO_RSRC_PRINTK_SIZE 4
@@ -564,7 +598,11 @@ static char *resource_string(char *buf, char *end, struct resource *res, int fie
564#ifndef MEM_RSRC_PRINTK_SIZE 598#ifndef MEM_RSRC_PRINTK_SIZE
565#define MEM_RSRC_PRINTK_SIZE 8 599#define MEM_RSRC_PRINTK_SIZE 8
566#endif 600#endif
567 601 struct printf_spec num_spec = {
602 .base = 16,
603 .precision = -1,
604 .flags = SPECIAL | SMALL | ZEROPAD,
605 };
568 /* room for the actual numbers, the two "0x", -, [, ] and the final zero */ 606 /* room for the actual numbers, the two "0x", -, [, ] and the final zero */
569 char sym[4*sizeof(resource_size_t) + 8]; 607 char sym[4*sizeof(resource_size_t) + 8];
570 char *p = sym, *pend = sym + sizeof(sym); 608 char *p = sym, *pend = sym + sizeof(sym);
@@ -576,17 +614,18 @@ static char *resource_string(char *buf, char *end, struct resource *res, int fie
576 size = MEM_RSRC_PRINTK_SIZE; 614 size = MEM_RSRC_PRINTK_SIZE;
577 615
578 *p++ = '['; 616 *p++ = '[';
579 p = number(p, pend, res->start, 16, size, -1, SPECIAL | SMALL | ZEROPAD); 617 num_spec.field_width = size;
618 p = number(p, pend, res->start, num_spec);
580 *p++ = '-'; 619 *p++ = '-';
581 p = number(p, pend, res->end, 16, size, -1, SPECIAL | SMALL | ZEROPAD); 620 p = number(p, pend, res->end, num_spec);
582 *p++ = ']'; 621 *p++ = ']';
583 *p = 0; 622 *p = 0;
584 623
585 return string(buf, end, sym, field_width, precision, flags); 624 return string(buf, end, sym, spec);
586} 625}
587 626
588static char *mac_address_string(char *buf, char *end, u8 *addr, int field_width, 627static char *mac_address_string(char *buf, char *end, u8 *addr,
589 int precision, int flags) 628 struct printf_spec spec)
590{ 629{
591 char mac_addr[6 * 3]; /* (6 * 2 hex digits), 5 colons and trailing zero */ 630 char mac_addr[6 * 3]; /* (6 * 2 hex digits), 5 colons and trailing zero */
592 char *p = mac_addr; 631 char *p = mac_addr;
@@ -594,16 +633,17 @@ static char *mac_address_string(char *buf, char *end, u8 *addr, int field_width,
594 633
595 for (i = 0; i < 6; i++) { 634 for (i = 0; i < 6; i++) {
596 p = pack_hex_byte(p, addr[i]); 635 p = pack_hex_byte(p, addr[i]);
597 if (!(flags & SPECIAL) && i != 5) 636 if (!(spec.flags & SPECIAL) && i != 5)
598 *p++ = ':'; 637 *p++ = ':';
599 } 638 }
600 *p = '\0'; 639 *p = '\0';
640 spec.flags &= ~SPECIAL;
601 641
602 return string(buf, end, mac_addr, field_width, precision, flags & ~SPECIAL); 642 return string(buf, end, mac_addr, spec);
603} 643}
604 644
605static char *ip6_addr_string(char *buf, char *end, u8 *addr, int field_width, 645static char *ip6_addr_string(char *buf, char *end, u8 *addr,
606 int precision, int flags) 646 struct printf_spec spec)
607{ 647{
608 char ip6_addr[8 * 5]; /* (8 * 4 hex digits), 7 colons and trailing zero */ 648 char ip6_addr[8 * 5]; /* (8 * 4 hex digits), 7 colons and trailing zero */
609 char *p = ip6_addr; 649 char *p = ip6_addr;
@@ -612,16 +652,17 @@ static char *ip6_addr_string(char *buf, char *end, u8 *addr, int field_width,
612 for (i = 0; i < 8; i++) { 652 for (i = 0; i < 8; i++) {
613 p = pack_hex_byte(p, addr[2 * i]); 653 p = pack_hex_byte(p, addr[2 * i]);
614 p = pack_hex_byte(p, addr[2 * i + 1]); 654 p = pack_hex_byte(p, addr[2 * i + 1]);
615 if (!(flags & SPECIAL) && i != 7) 655 if (!(spec.flags & SPECIAL) && i != 7)
616 *p++ = ':'; 656 *p++ = ':';
617 } 657 }
618 *p = '\0'; 658 *p = '\0';
659 spec.flags &= ~SPECIAL;
619 660
620 return string(buf, end, ip6_addr, field_width, precision, flags & ~SPECIAL); 661 return string(buf, end, ip6_addr, spec);
621} 662}
622 663
623static char *ip4_addr_string(char *buf, char *end, u8 *addr, int field_width, 664static char *ip4_addr_string(char *buf, char *end, u8 *addr,
624 int precision, int flags) 665 struct printf_spec spec)
625{ 666{
626 char ip4_addr[4 * 4]; /* (4 * 3 decimal digits), 3 dots and trailing zero */ 667 char ip4_addr[4 * 4]; /* (4 * 3 decimal digits), 3 dots and trailing zero */
627 char temp[3]; /* hold each IP quad in reverse order */ 668 char temp[3]; /* hold each IP quad in reverse order */
@@ -637,8 +678,9 @@ static char *ip4_addr_string(char *buf, char *end, u8 *addr, int field_width,
637 *p++ = '.'; 678 *p++ = '.';
638 } 679 }
639 *p = '\0'; 680 *p = '\0';
681 spec.flags &= ~SPECIAL;
640 682
641 return string(buf, end, ip4_addr, field_width, precision, flags & ~SPECIAL); 683 return string(buf, end, ip4_addr, spec);
642} 684}
643 685
644/* 686/*
@@ -663,41 +705,233 @@ static char *ip4_addr_string(char *buf, char *end, u8 *addr, int field_width,
663 * function pointers are really function descriptors, which contain a 705 * function pointers are really function descriptors, which contain a
664 * pointer to the real address. 706 * pointer to the real address.
665 */ 707 */
666static char *pointer(const char *fmt, char *buf, char *end, void *ptr, int field_width, int precision, int flags) 708static char *pointer(const char *fmt, char *buf, char *end, void *ptr,
709 struct printf_spec spec)
667{ 710{
668 if (!ptr) 711 if (!ptr)
669 return string(buf, end, "(null)", field_width, precision, flags); 712 return string(buf, end, "(null)", spec);
670 713
671 switch (*fmt) { 714 switch (*fmt) {
672 case 'F': 715 case 'F':
673 ptr = dereference_function_descriptor(ptr); 716 ptr = dereference_function_descriptor(ptr);
674 /* Fallthrough */ 717 /* Fallthrough */
675 case 'S': 718 case 'S':
676 return symbol_string(buf, end, ptr, field_width, precision, flags); 719 return symbol_string(buf, end, ptr, spec);
677 case 'R': 720 case 'R':
678 return resource_string(buf, end, ptr, field_width, precision, flags); 721 return resource_string(buf, end, ptr, spec);
679 case 'm': 722 case 'm':
680 flags |= SPECIAL; 723 spec.flags |= SPECIAL;
681 /* Fallthrough */ 724 /* Fallthrough */
682 case 'M': 725 case 'M':
683 return mac_address_string(buf, end, ptr, field_width, precision, flags); 726 return mac_address_string(buf, end, ptr, spec);
684 case 'i': 727 case 'i':
685 flags |= SPECIAL; 728 spec.flags |= SPECIAL;
686 /* Fallthrough */ 729 /* Fallthrough */
687 case 'I': 730 case 'I':
688 if (fmt[1] == '6') 731 if (fmt[1] == '6')
689 return ip6_addr_string(buf, end, ptr, field_width, precision, flags); 732 return ip6_addr_string(buf, end, ptr, spec);
690 if (fmt[1] == '4') 733 if (fmt[1] == '4')
691 return ip4_addr_string(buf, end, ptr, field_width, precision, flags); 734 return ip4_addr_string(buf, end, ptr, spec);
692 flags &= ~SPECIAL; 735 spec.flags &= ~SPECIAL;
736 break;
737 }
738 spec.flags |= SMALL;
739 if (spec.field_width == -1) {
740 spec.field_width = 2*sizeof(void *);
741 spec.flags |= ZEROPAD;
742 }
743 spec.base = 16;
744
745 return number(buf, end, (unsigned long) ptr, spec);
746}
747
748/*
749 * Helper function to decode printf style format.
750 * Each call decode a token from the format and return the
751 * number of characters read (or likely the delta where it wants
752 * to go on the next call).
753 * The decoded token is returned through the parameters
754 *
755 * 'h', 'l', or 'L' for integer fields
756 * 'z' support added 23/7/1999 S.H.
757 * 'z' changed to 'Z' --davidm 1/25/99
758 * 't' added for ptrdiff_t
759 *
760 * @fmt: the format string
761 * @type of the token returned
762 * @flags: various flags such as +, -, # tokens..
763 * @field_width: overwritten width
764 * @base: base of the number (octal, hex, ...)
765 * @precision: precision of a number
766 * @qualifier: qualifier of a number (long, size_t, ...)
767 */
768static int format_decode(const char *fmt, struct printf_spec *spec)
769{
770 const char *start = fmt;
771
772 /* we finished early by reading the field width */
773 if (spec->type == FORMAT_TYPE_WITDH) {
774 if (spec->field_width < 0) {
775 spec->field_width = -spec->field_width;
776 spec->flags |= LEFT;
777 }
778 spec->type = FORMAT_TYPE_NONE;
779 goto precision;
780 }
781
782 /* we finished early by reading the precision */
783 if (spec->type == FORMAT_TYPE_PRECISION) {
784 if (spec->precision < 0)
785 spec->precision = 0;
786
787 spec->type = FORMAT_TYPE_NONE;
788 goto qualifier;
789 }
790
791 /* By default */
792 spec->type = FORMAT_TYPE_NONE;
793
794 for (; *fmt ; ++fmt) {
795 if (*fmt == '%')
796 break;
797 }
798
799 /* Return the current non-format string */
800 if (fmt != start || !*fmt)
801 return fmt - start;
802
803 /* Process flags */
804 spec->flags = 0;
805
806 while (1) { /* this also skips first '%' */
807 bool found = true;
808
809 ++fmt;
810
811 switch (*fmt) {
812 case '-': spec->flags |= LEFT; break;
813 case '+': spec->flags |= PLUS; break;
814 case ' ': spec->flags |= SPACE; break;
815 case '#': spec->flags |= SPECIAL; break;
816 case '0': spec->flags |= ZEROPAD; break;
817 default: found = false;
818 }
819
820 if (!found)
821 break;
822 }
823
824 /* get field width */
825 spec->field_width = -1;
826
827 if (isdigit(*fmt))
828 spec->field_width = skip_atoi(&fmt);
829 else if (*fmt == '*') {
830 /* it's the next argument */
831 spec->type = FORMAT_TYPE_WITDH;
832 return ++fmt - start;
833 }
834
835precision:
836 /* get the precision */
837 spec->precision = -1;
838 if (*fmt == '.') {
839 ++fmt;
840 if (isdigit(*fmt)) {
841 spec->precision = skip_atoi(&fmt);
842 if (spec->precision < 0)
843 spec->precision = 0;
844 } else if (*fmt == '*') {
845 /* it's the next argument */
846 spec->type = FORMAT_TYPE_WITDH;
847 return ++fmt - start;
848 }
849 }
850
851qualifier:
852 /* get the conversion qualifier */
853 spec->qualifier = -1;
854 if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L' ||
855 *fmt == 'Z' || *fmt == 'z' || *fmt == 't') {
856 spec->qualifier = *fmt;
857 ++fmt;
858 if (spec->qualifier == 'l' && *fmt == 'l') {
859 spec->qualifier = 'L';
860 ++fmt;
861 }
862 }
863
864 /* default base */
865 spec->base = 10;
866 switch (*fmt) {
867 case 'c':
868 spec->type = FORMAT_TYPE_CHAR;
869 return ++fmt - start;
870
871 case 's':
872 spec->type = FORMAT_TYPE_STR;
873 return ++fmt - start;
874
875 case 'p':
876 spec->type = FORMAT_TYPE_PTR;
877 return fmt - start;
878 /* skip alnum */
879
880 case 'n':
881 spec->type = FORMAT_TYPE_NRCHARS;
882 return ++fmt - start;
883
884 case '%':
885 spec->type = FORMAT_TYPE_PERCENT_CHAR;
886 return ++fmt - start;
887
888 /* integer number formats - set up the flags and "break" */
889 case 'o':
890 spec->base = 8;
693 break; 891 break;
892
893 case 'x':
894 spec->flags |= SMALL;
895
896 case 'X':
897 spec->base = 16;
898 break;
899
900 case 'd':
901 case 'i':
902 spec->flags |= SIGN;
903 case 'u':
904 break;
905
906 default:
907 spec->type = FORMAT_TYPE_INVALID;
908 return fmt - start;
694 } 909 }
695 flags |= SMALL; 910
696 if (field_width == -1) { 911 if (spec->qualifier == 'L')
697 field_width = 2*sizeof(void *); 912 spec->type = FORMAT_TYPE_LONG_LONG;
698 flags |= ZEROPAD; 913 else if (spec->qualifier == 'l') {
914 if (spec->flags & SIGN)
915 spec->type = FORMAT_TYPE_LONG;
916 else
917 spec->type = FORMAT_TYPE_ULONG;
918 } else if (spec->qualifier == 'Z' || spec->qualifier == 'z') {
919 spec->type = FORMAT_TYPE_SIZE_T;
920 } else if (spec->qualifier == 't') {
921 spec->type = FORMAT_TYPE_PTRDIFF;
922 } else if (spec->qualifier == 'h') {
923 if (spec->flags & SIGN)
924 spec->type = FORMAT_TYPE_SHORT;
925 else
926 spec->type = FORMAT_TYPE_USHORT;
927 } else {
928 if (spec->flags & SIGN)
929 spec->type = FORMAT_TYPE_INT;
930 else
931 spec->type = FORMAT_TYPE_UINT;
699 } 932 }
700 return number(buf, end, (unsigned long) ptr, 16, field_width, precision, flags); 933
934 return ++fmt - start;
701} 935}
702 936
703/** 937/**
@@ -726,18 +960,9 @@ static char *pointer(const char *fmt, char *buf, char *end, void *ptr, int field
726int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) 960int vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
727{ 961{
728 unsigned long long num; 962 unsigned long long num;
729 int base;
730 char *str, *end, c; 963 char *str, *end, c;
731 964 int read;
732 int flags; /* flags to number() */ 965 struct printf_spec spec = {0};
733
734 int field_width; /* width of output field */
735 int precision; /* min. # of digits for integers; max
736 number of chars for from string */
737 int qualifier; /* 'h', 'l', or 'L' for integer fields */
738 /* 'z' support added 23/7/1999 S.H. */
739 /* 'z' changed to 'Z' --davidm 1/25/99 */
740 /* 't' added for ptrdiff_t */
741 966
742 /* Reject out-of-range values early. Large positive sizes are 967 /* Reject out-of-range values early. Large positive sizes are
743 used for unknown buffer sizes. */ 968 used for unknown buffer sizes. */
@@ -758,184 +983,144 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
758 size = end - buf; 983 size = end - buf;
759 } 984 }
760 985
761 for (; *fmt ; ++fmt) { 986 while (*fmt) {
762 if (*fmt != '%') { 987 const char *old_fmt = fmt;
763 if (str < end)
764 *str = *fmt;
765 ++str;
766 continue;
767 }
768 988
769 /* process flags */ 989 read = format_decode(fmt, &spec);
770 flags = 0;
771 repeat:
772 ++fmt; /* this also skips first '%' */
773 switch (*fmt) {
774 case '-': flags |= LEFT; goto repeat;
775 case '+': flags |= PLUS; goto repeat;
776 case ' ': flags |= SPACE; goto repeat;
777 case '#': flags |= SPECIAL; goto repeat;
778 case '0': flags |= ZEROPAD; goto repeat;
779 }
780 990
781 /* get field width */ 991 fmt += read;
782 field_width = -1;
783 if (isdigit(*fmt))
784 field_width = skip_atoi(&fmt);
785 else if (*fmt == '*') {
786 ++fmt;
787 /* it's the next argument */
788 field_width = va_arg(args, int);
789 if (field_width < 0) {
790 field_width = -field_width;
791 flags |= LEFT;
792 }
793 }
794 992
795 /* get the precision */ 993 switch (spec.type) {
796 precision = -1; 994 case FORMAT_TYPE_NONE: {
797 if (*fmt == '.') { 995 int copy = read;
798 ++fmt; 996 if (str < end) {
799 if (isdigit(*fmt)) 997 if (copy > end - str)
800 precision = skip_atoi(&fmt); 998 copy = end - str;
801 else if (*fmt == '*') { 999 memcpy(str, old_fmt, copy);
802 ++fmt;
803 /* it's the next argument */
804 precision = va_arg(args, int);
805 } 1000 }
806 if (precision < 0) 1001 str += read;
807 precision = 0; 1002 break;
808 } 1003 }
809 1004
810 /* get the conversion qualifier */ 1005 case FORMAT_TYPE_WITDH:
811 qualifier = -1; 1006 spec.field_width = va_arg(args, int);
812 if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L' || 1007 break;
813 *fmt =='Z' || *fmt == 'z' || *fmt == 't') {
814 qualifier = *fmt;
815 ++fmt;
816 if (qualifier == 'l' && *fmt == 'l') {
817 qualifier = 'L';
818 ++fmt;
819 }
820 }
821 1008
822 /* default base */ 1009 case FORMAT_TYPE_PRECISION:
823 base = 10; 1010 spec.precision = va_arg(args, int);
1011 break;
824 1012
825 switch (*fmt) { 1013 case FORMAT_TYPE_CHAR:
826 case 'c': 1014 if (!(spec.flags & LEFT)) {
827 if (!(flags & LEFT)) { 1015 while (--spec.field_width > 0) {
828 while (--field_width > 0) {
829 if (str < end)
830 *str = ' ';
831 ++str;
832 }
833 }
834 c = (unsigned char) va_arg(args, int);
835 if (str < end)
836 *str = c;
837 ++str;
838 while (--field_width > 0) {
839 if (str < end) 1016 if (str < end)
840 *str = ' '; 1017 *str = ' ';
841 ++str; 1018 ++str;
842 }
843 continue;
844
845 case 's':
846 str = string(str, end, va_arg(args, char *), field_width, precision, flags);
847 continue;
848
849 case 'p':
850 str = pointer(fmt+1, str, end,
851 va_arg(args, void *),
852 field_width, precision, flags);
853 /* Skip all alphanumeric pointer suffixes */
854 while (isalnum(fmt[1]))
855 fmt++;
856 continue;
857
858 case 'n':
859 /* FIXME:
860 * What does C99 say about the overflow case here? */
861 if (qualifier == 'l') {
862 long * ip = va_arg(args, long *);
863 *ip = (str - buf);
864 } else if (qualifier == 'Z' || qualifier == 'z') {
865 size_t * ip = va_arg(args, size_t *);
866 *ip = (str - buf);
867 } else {
868 int * ip = va_arg(args, int *);
869 *ip = (str - buf);
870 }
871 continue;
872 1019
873 case '%': 1020 }
1021 }
1022 c = (unsigned char) va_arg(args, int);
1023 if (str < end)
1024 *str = c;
1025 ++str;
1026 while (--spec.field_width > 0) {
874 if (str < end) 1027 if (str < end)
875 *str = '%'; 1028 *str = ' ';
876 ++str; 1029 ++str;
877 continue; 1030 }
1031 break;
878 1032
879 /* integer number formats - set up the flags and "break" */ 1033 case FORMAT_TYPE_STR:
880 case 'o': 1034 str = string(str, end, va_arg(args, char *), spec);
881 base = 8; 1035 break;
882 break;
883 1036
884 case 'x': 1037 case FORMAT_TYPE_PTR:
885 flags |= SMALL; 1038 str = pointer(fmt+1, str, end, va_arg(args, void *),
886 case 'X': 1039 spec);
887 base = 16; 1040 while (isalnum(*fmt))
888 break; 1041 fmt++;
1042 break;
889 1043
890 case 'd': 1044 case FORMAT_TYPE_PERCENT_CHAR:
891 case 'i': 1045 if (str < end)
892 flags |= SIGN; 1046 *str = '%';
893 case 'u': 1047 ++str;
894 break; 1048 break;
895 1049
896 default: 1050 case FORMAT_TYPE_INVALID:
1051 if (str < end)
1052 *str = '%';
1053 ++str;
1054 if (*fmt) {
897 if (str < end) 1055 if (str < end)
898 *str = '%'; 1056 *str = *fmt;
899 ++str; 1057 ++str;
900 if (*fmt) { 1058 } else {
901 if (str < end) 1059 --fmt;
902 *str = *fmt; 1060 }
903 ++str; 1061 break;
904 } else { 1062
905 --fmt; 1063 case FORMAT_TYPE_NRCHARS: {
906 } 1064 int qualifier = spec.qualifier;
907 continue; 1065
1066 if (qualifier == 'l') {
1067 long *ip = va_arg(args, long *);
1068 *ip = (str - buf);
1069 } else if (qualifier == 'Z' ||
1070 qualifier == 'z') {
1071 size_t *ip = va_arg(args, size_t *);
1072 *ip = (str - buf);
1073 } else {
1074 int *ip = va_arg(args, int *);
1075 *ip = (str - buf);
1076 }
1077 break;
908 } 1078 }
909 if (qualifier == 'L') 1079
910 num = va_arg(args, long long); 1080 default:
911 else if (qualifier == 'l') { 1081 switch (spec.type) {
912 num = va_arg(args, unsigned long); 1082 case FORMAT_TYPE_LONG_LONG:
913 if (flags & SIGN) 1083 num = va_arg(args, long long);
914 num = (signed long) num; 1084 break;
915 } else if (qualifier == 'Z' || qualifier == 'z') { 1085 case FORMAT_TYPE_ULONG:
916 num = va_arg(args, size_t); 1086 num = va_arg(args, unsigned long);
917 } else if (qualifier == 't') { 1087 break;
918 num = va_arg(args, ptrdiff_t); 1088 case FORMAT_TYPE_LONG:
919 } else if (qualifier == 'h') { 1089 num = va_arg(args, long);
920 num = (unsigned short) va_arg(args, int); 1090 break;
921 if (flags & SIGN) 1091 case FORMAT_TYPE_SIZE_T:
922 num = (signed short) num; 1092 num = va_arg(args, size_t);
923 } else { 1093 break;
924 num = va_arg(args, unsigned int); 1094 case FORMAT_TYPE_PTRDIFF:
925 if (flags & SIGN) 1095 num = va_arg(args, ptrdiff_t);
926 num = (signed int) num; 1096 break;
1097 case FORMAT_TYPE_USHORT:
1098 num = (unsigned short) va_arg(args, int);
1099 break;
1100 case FORMAT_TYPE_SHORT:
1101 num = (short) va_arg(args, int);
1102 break;
1103 case FORMAT_TYPE_INT:
1104 num = (int) va_arg(args, int);
1105 break;
1106 default:
1107 num = va_arg(args, unsigned int);
1108 }
1109
1110 str = number(str, end, num, spec);
927 } 1111 }
928 str = number(str, end, num, base,
929 field_width, precision, flags);
930 } 1112 }
1113
931 if (size > 0) { 1114 if (size > 0) {
932 if (str < end) 1115 if (str < end)
933 *str = '\0'; 1116 *str = '\0';
934 else 1117 else
935 end[-1] = '\0'; 1118 end[-1] = '\0';
936 } 1119 }
1120
937 /* the trailing null byte doesn't count towards the total */ 1121 /* the trailing null byte doesn't count towards the total */
938 return str-buf; 1122 return str-buf;
1123
939} 1124}
940EXPORT_SYMBOL(vsnprintf); 1125EXPORT_SYMBOL(vsnprintf);
941 1126
@@ -1058,6 +1243,372 @@ int sprintf(char * buf, const char *fmt, ...)
1058} 1243}
1059EXPORT_SYMBOL(sprintf); 1244EXPORT_SYMBOL(sprintf);
1060 1245
1246#ifdef CONFIG_BINARY_PRINTF
1247/*
1248 * bprintf service:
1249 * vbin_printf() - VA arguments to binary data
1250 * bstr_printf() - Binary data to text string
1251 */
1252
1253/**
1254 * vbin_printf - Parse a format string and place args' binary value in a buffer
1255 * @bin_buf: The buffer to place args' binary value
1256 * @size: The size of the buffer(by words(32bits), not characters)
1257 * @fmt: The format string to use
1258 * @args: Arguments for the format string
1259 *
1260 * The format follows C99 vsnprintf, except %n is ignored, and its argument
1261 * is skiped.
1262 *
1263 * The return value is the number of words(32bits) which would be generated for
1264 * the given input.
1265 *
1266 * NOTE:
1267 * If the return value is greater than @size, the resulting bin_buf is NOT
1268 * valid for bstr_printf().
1269 */
1270int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args)
1271{
1272 struct printf_spec spec = {0};
1273 char *str, *end;
1274 int read;
1275
1276 str = (char *)bin_buf;
1277 end = (char *)(bin_buf + size);
1278
1279#define save_arg(type) \
1280do { \
1281 if (sizeof(type) == 8) { \
1282 unsigned long long value; \
1283 str = PTR_ALIGN(str, sizeof(u32)); \
1284 value = va_arg(args, unsigned long long); \
1285 if (str + sizeof(type) <= end) { \
1286 *(u32 *)str = *(u32 *)&value; \
1287 *(u32 *)(str + 4) = *((u32 *)&value + 1); \
1288 } \
1289 } else { \
1290 unsigned long value; \
1291 str = PTR_ALIGN(str, sizeof(type)); \
1292 value = va_arg(args, int); \
1293 if (str + sizeof(type) <= end) \
1294 *(typeof(type) *)str = (type)value; \
1295 } \
1296 str += sizeof(type); \
1297} while (0)
1298
1299
1300 while (*fmt) {
1301 read = format_decode(fmt, &spec);
1302
1303 fmt += read;
1304
1305 switch (spec.type) {
1306 case FORMAT_TYPE_NONE:
1307 break;
1308
1309 case FORMAT_TYPE_WITDH:
1310 case FORMAT_TYPE_PRECISION:
1311 save_arg(int);
1312 break;
1313
1314 case FORMAT_TYPE_CHAR:
1315 save_arg(char);
1316 break;
1317
1318 case FORMAT_TYPE_STR: {
1319 const char *save_str = va_arg(args, char *);
1320 size_t len;
1321 if ((unsigned long)save_str > (unsigned long)-PAGE_SIZE
1322 || (unsigned long)save_str < PAGE_SIZE)
1323 save_str = "<NULL>";
1324 len = strlen(save_str);
1325 if (str + len + 1 < end)
1326 memcpy(str, save_str, len + 1);
1327 str += len + 1;
1328 break;
1329 }
1330
1331 case FORMAT_TYPE_PTR:
1332 save_arg(void *);
1333 /* skip all alphanumeric pointer suffixes */
1334 while (isalnum(*fmt))
1335 fmt++;
1336 break;
1337
1338 case FORMAT_TYPE_PERCENT_CHAR:
1339 break;
1340
1341 case FORMAT_TYPE_INVALID:
1342 if (!*fmt)
1343 --fmt;
1344 break;
1345
1346 case FORMAT_TYPE_NRCHARS: {
1347 /* skip %n 's argument */
1348 int qualifier = spec.qualifier;
1349 void *skip_arg;
1350 if (qualifier == 'l')
1351 skip_arg = va_arg(args, long *);
1352 else if (qualifier == 'Z' || qualifier == 'z')
1353 skip_arg = va_arg(args, size_t *);
1354 else
1355 skip_arg = va_arg(args, int *);
1356 break;
1357 }
1358
1359 default:
1360 switch (spec.type) {
1361
1362 case FORMAT_TYPE_LONG_LONG:
1363 save_arg(long long);
1364 break;
1365 case FORMAT_TYPE_ULONG:
1366 case FORMAT_TYPE_LONG:
1367 save_arg(unsigned long);
1368 break;
1369 case FORMAT_TYPE_SIZE_T:
1370 save_arg(size_t);
1371 break;
1372 case FORMAT_TYPE_PTRDIFF:
1373 save_arg(ptrdiff_t);
1374 break;
1375 case FORMAT_TYPE_USHORT:
1376 case FORMAT_TYPE_SHORT:
1377 save_arg(short);
1378 break;
1379 default:
1380 save_arg(int);
1381 }
1382 }
1383 }
1384 return (u32 *)(PTR_ALIGN(str, sizeof(u32))) - bin_buf;
1385
1386#undef save_arg
1387}
1388EXPORT_SYMBOL_GPL(vbin_printf);
1389
1390/**
1391 * bstr_printf - Format a string from binary arguments and place it in a buffer
1392 * @buf: The buffer to place the result into
1393 * @size: The size of the buffer, including the trailing null space
1394 * @fmt: The format string to use
1395 * @bin_buf: Binary arguments for the format string
1396 *
1397 * This function like C99 vsnprintf, but the difference is that vsnprintf gets
1398 * arguments from stack, and bstr_printf gets arguments from @bin_buf which is
1399 * a binary buffer that generated by vbin_printf.
1400 *
1401 * The format follows C99 vsnprintf, but has some extensions:
1402 * %pS output the name of a text symbol
1403 * %pF output the name of a function pointer
1404 * %pR output the address range in a struct resource
1405 * %n is ignored
1406 *
1407 * The return value is the number of characters which would
1408 * be generated for the given input, excluding the trailing
1409 * '\0', as per ISO C99. If you want to have the exact
1410 * number of characters written into @buf as return value
1411 * (not including the trailing '\0'), use vscnprintf(). If the
1412 * return is greater than or equal to @size, the resulting
1413 * string is truncated.
1414 */
1415int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf)
1416{
1417 unsigned long long num;
1418 char *str, *end, c;
1419 const char *args = (const char *)bin_buf;
1420
1421 struct printf_spec spec = {0};
1422
1423 if (unlikely((int) size < 0)) {
1424 /* There can be only one.. */
1425 static char warn = 1;
1426 WARN_ON(warn);
1427 warn = 0;
1428 return 0;
1429 }
1430
1431 str = buf;
1432 end = buf + size;
1433
1434#define get_arg(type) \
1435({ \
1436 typeof(type) value; \
1437 if (sizeof(type) == 8) { \
1438 args = PTR_ALIGN(args, sizeof(u32)); \
1439 *(u32 *)&value = *(u32 *)args; \
1440 *((u32 *)&value + 1) = *(u32 *)(args + 4); \
1441 } else { \
1442 args = PTR_ALIGN(args, sizeof(type)); \
1443 value = *(typeof(type) *)args; \
1444 } \
1445 args += sizeof(type); \
1446 value; \
1447})
1448
1449 /* Make sure end is always >= buf */
1450 if (end < buf) {
1451 end = ((void *)-1);
1452 size = end - buf;
1453 }
1454
1455 while (*fmt) {
1456 int read;
1457 const char *old_fmt = fmt;
1458
1459 read = format_decode(fmt, &spec);
1460
1461 fmt += read;
1462
1463 switch (spec.type) {
1464 case FORMAT_TYPE_NONE: {
1465 int copy = read;
1466 if (str < end) {
1467 if (copy > end - str)
1468 copy = end - str;
1469 memcpy(str, old_fmt, copy);
1470 }
1471 str += read;
1472 break;
1473 }
1474
1475 case FORMAT_TYPE_WITDH:
1476 spec.field_width = get_arg(int);
1477 break;
1478
1479 case FORMAT_TYPE_PRECISION:
1480 spec.precision = get_arg(int);
1481 break;
1482
1483 case FORMAT_TYPE_CHAR:
1484 if (!(spec.flags & LEFT)) {
1485 while (--spec.field_width > 0) {
1486 if (str < end)
1487 *str = ' ';
1488 ++str;
1489 }
1490 }
1491 c = (unsigned char) get_arg(char);
1492 if (str < end)
1493 *str = c;
1494 ++str;
1495 while (--spec.field_width > 0) {
1496 if (str < end)
1497 *str = ' ';
1498 ++str;
1499 }
1500 break;
1501
1502 case FORMAT_TYPE_STR: {
1503 const char *str_arg = args;
1504 size_t len = strlen(str_arg);
1505 args += len + 1;
1506 str = string(str, end, (char *)str_arg, spec);
1507 break;
1508 }
1509
1510 case FORMAT_TYPE_PTR:
1511 str = pointer(fmt+1, str, end, get_arg(void *), spec);
1512 while (isalnum(*fmt))
1513 fmt++;
1514 break;
1515
1516 case FORMAT_TYPE_PERCENT_CHAR:
1517 if (str < end)
1518 *str = '%';
1519 ++str;
1520 break;
1521
1522 case FORMAT_TYPE_INVALID:
1523 if (str < end)
1524 *str = '%';
1525 ++str;
1526 if (*fmt) {
1527 if (str < end)
1528 *str = *fmt;
1529 ++str;
1530 } else {
1531 --fmt;
1532 }
1533 break;
1534
1535 case FORMAT_TYPE_NRCHARS:
1536 /* skip */
1537 break;
1538
1539 default:
1540 switch (spec.type) {
1541
1542 case FORMAT_TYPE_LONG_LONG:
1543 num = get_arg(long long);
1544 break;
1545 case FORMAT_TYPE_ULONG:
1546 num = get_arg(unsigned long);
1547 break;
1548 case FORMAT_TYPE_LONG:
1549 num = get_arg(unsigned long);
1550 break;
1551 case FORMAT_TYPE_SIZE_T:
1552 num = get_arg(size_t);
1553 break;
1554 case FORMAT_TYPE_PTRDIFF:
1555 num = get_arg(ptrdiff_t);
1556 break;
1557 case FORMAT_TYPE_USHORT:
1558 num = get_arg(unsigned short);
1559 break;
1560 case FORMAT_TYPE_SHORT:
1561 num = get_arg(short);
1562 break;
1563 case FORMAT_TYPE_UINT:
1564 num = get_arg(unsigned int);
1565 break;
1566 default:
1567 num = get_arg(int);
1568 }
1569
1570 str = number(str, end, num, spec);
1571 }
1572 }
1573
1574 if (size > 0) {
1575 if (str < end)
1576 *str = '\0';
1577 else
1578 end[-1] = '\0';
1579 }
1580
1581#undef get_arg
1582
1583 /* the trailing null byte doesn't count towards the total */
1584 return str - buf;
1585}
1586EXPORT_SYMBOL_GPL(bstr_printf);
1587
1588/**
1589 * bprintf - Parse a format string and place args' binary value in a buffer
1590 * @bin_buf: The buffer to place args' binary value
1591 * @size: The size of the buffer(by words(32bits), not characters)
1592 * @fmt: The format string to use
1593 * @...: Arguments for the format string
1594 *
1595 * The function returns the number of words(u32) written
1596 * into @bin_buf.
1597 */
1598int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...)
1599{
1600 va_list args;
1601 int ret;
1602
1603 va_start(args, fmt);
1604 ret = vbin_printf(bin_buf, size, fmt, args);
1605 va_end(args);
1606 return ret;
1607}
1608EXPORT_SYMBOL_GPL(bprintf);
1609
1610#endif /* CONFIG_BINARY_PRINTF */
1611
1061/** 1612/**
1062 * vsscanf - Unformat a buffer into a list of arguments 1613 * vsscanf - Unformat a buffer into a list of arguments
1063 * @buf: input buffer 1614 * @buf: input buffer
diff --git a/lib/zlib_inflate/inflate.h b/lib/zlib_inflate/inflate.h
index df8a6c92052d..3d17b3d1b21f 100644
--- a/lib/zlib_inflate/inflate.h
+++ b/lib/zlib_inflate/inflate.h
@@ -1,3 +1,6 @@
1#ifndef INFLATE_H
2#define INFLATE_H
3
1/* inflate.h -- internal inflate state definition 4/* inflate.h -- internal inflate state definition
2 * Copyright (C) 1995-2004 Mark Adler 5 * Copyright (C) 1995-2004 Mark Adler
3 * For conditions of distribution and use, see copyright notice in zlib.h 6 * For conditions of distribution and use, see copyright notice in zlib.h
@@ -105,3 +108,4 @@ struct inflate_state {
105 unsigned short work[288]; /* work area for code table building */ 108 unsigned short work[288]; /* work area for code table building */
106 code codes[ENOUGH]; /* space for code tables */ 109 code codes[ENOUGH]; /* space for code tables */
107}; 110};
111#endif
diff --git a/lib/zlib_inflate/inftrees.h b/lib/zlib_inflate/inftrees.h
index 5f5219b1240e..b70b4731ac7a 100644
--- a/lib/zlib_inflate/inftrees.h
+++ b/lib/zlib_inflate/inftrees.h
@@ -1,3 +1,6 @@
1#ifndef INFTREES_H
2#define INFTREES_H
3
1/* inftrees.h -- header to use inftrees.c 4/* inftrees.h -- header to use inftrees.c
2 * Copyright (C) 1995-2005 Mark Adler 5 * Copyright (C) 1995-2005 Mark Adler
3 * For conditions of distribution and use, see copyright notice in zlib.h 6 * For conditions of distribution and use, see copyright notice in zlib.h
@@ -53,3 +56,4 @@ typedef enum {
53extern int zlib_inflate_table (codetype type, unsigned short *lens, 56extern int zlib_inflate_table (codetype type, unsigned short *lens,
54 unsigned codes, code **table, 57 unsigned codes, code **table,
55 unsigned *bits, unsigned short *work); 58 unsigned *bits, unsigned short *work);
59#endif
diff --git a/mm/Makefile b/mm/Makefile
index 72255be57f89..818569b68f46 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -30,6 +30,10 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
30obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 30obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
31obj-$(CONFIG_FS_XIP) += filemap_xip.o 31obj-$(CONFIG_FS_XIP) += filemap_xip.o
32obj-$(CONFIG_MIGRATION) += migrate.o 32obj-$(CONFIG_MIGRATION) += migrate.o
33ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
34obj-$(CONFIG_SMP) += percpu.o
35else
33obj-$(CONFIG_SMP) += allocpercpu.o 36obj-$(CONFIG_SMP) += allocpercpu.o
37endif
34obj-$(CONFIG_QUICKLIST) += quicklist.o 38obj-$(CONFIG_QUICKLIST) += quicklist.o
35obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o 39obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index 4297bc41bfd2..3653c570232b 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -99,45 +99,51 @@ static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
99 __percpu_populate_mask((__pdata), (size), (gfp), &(mask)) 99 __percpu_populate_mask((__pdata), (size), (gfp), &(mask))
100 100
101/** 101/**
102 * percpu_alloc_mask - initial setup of per-cpu data 102 * alloc_percpu - initial setup of per-cpu data
103 * @size: size of per-cpu object 103 * @size: size of per-cpu object
104 * @gfp: may sleep or not etc. 104 * @align: alignment
105 * @mask: populate per-data for cpu's selected through mask bits
106 * 105 *
107 * Populating per-cpu data for all online cpu's would be a typical use case, 106 * Allocate dynamic percpu area. Percpu objects are populated with
108 * which is simplified by the percpu_alloc() wrapper. 107 * zeroed buffers.
109 * Per-cpu objects are populated with zeroed buffers.
110 */ 108 */
111void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask) 109void *__alloc_percpu(size_t size, size_t align)
112{ 110{
113 /* 111 /*
114 * We allocate whole cache lines to avoid false sharing 112 * We allocate whole cache lines to avoid false sharing
115 */ 113 */
116 size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size()); 114 size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size());
117 void *pdata = kzalloc(sz, gfp); 115 void *pdata = kzalloc(sz, GFP_KERNEL);
118 void *__pdata = __percpu_disguise(pdata); 116 void *__pdata = __percpu_disguise(pdata);
119 117
118 /*
119 * Can't easily make larger alignment work with kmalloc. WARN
120 * on it. Larger alignment should only be used for module
121 * percpu sections on SMP for which this path isn't used.
122 */
123 WARN_ON_ONCE(align > __alignof__(unsigned long long));
124
120 if (unlikely(!pdata)) 125 if (unlikely(!pdata))
121 return NULL; 126 return NULL;
122 if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask))) 127 if (likely(!__percpu_populate_mask(__pdata, size, GFP_KERNEL,
128 &cpu_possible_map)))
123 return __pdata; 129 return __pdata;
124 kfree(pdata); 130 kfree(pdata);
125 return NULL; 131 return NULL;
126} 132}
127EXPORT_SYMBOL_GPL(__percpu_alloc_mask); 133EXPORT_SYMBOL_GPL(__alloc_percpu);
128 134
129/** 135/**
130 * percpu_free - final cleanup of per-cpu data 136 * free_percpu - final cleanup of per-cpu data
131 * @__pdata: object to clean up 137 * @__pdata: object to clean up
132 * 138 *
133 * We simply clean up any per-cpu object left. No need for the client to 139 * We simply clean up any per-cpu object left. No need for the client to
134 * track and specify through a bis mask which per-cpu objects are to free. 140 * track and specify through a bis mask which per-cpu objects are to free.
135 */ 141 */
136void percpu_free(void *__pdata) 142void free_percpu(void *__pdata)
137{ 143{
138 if (unlikely(!__pdata)) 144 if (unlikely(!__pdata))
139 return; 145 return;
140 __percpu_depopulate_mask(__pdata, &cpu_possible_map); 146 __percpu_depopulate_mask(__pdata, &cpu_possible_map);
141 kfree(__percpu_disguise(__pdata)); 147 kfree(__percpu_disguise(__pdata));
142} 148}
143EXPORT_SYMBOL_GPL(percpu_free); 149EXPORT_SYMBOL_GPL(free_percpu);
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 51a0ccf61e0e..daf92713f7de 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -382,7 +382,6 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
382 return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); 382 return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
383} 383}
384 384
385#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
386/** 385/**
387 * reserve_bootmem - mark a page range as usable 386 * reserve_bootmem - mark a page range as usable
388 * @addr: starting address of the range 387 * @addr: starting address of the range
@@ -403,7 +402,6 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
403 402
404 return mark_bootmem(start, end, 1, flags); 403 return mark_bootmem(start, end, 1, flags);
405} 404}
406#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
407 405
408static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, 406static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx,
409 unsigned long step) 407 unsigned long step)
@@ -429,8 +427,8 @@ static unsigned long align_off(struct bootmem_data *bdata, unsigned long off,
429} 427}
430 428
431static void * __init alloc_bootmem_core(struct bootmem_data *bdata, 429static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
432 unsigned long size, unsigned long align, 430 unsigned long size, unsigned long align,
433 unsigned long goal, unsigned long limit) 431 unsigned long goal, unsigned long limit)
434{ 432{
435 unsigned long fallback = 0; 433 unsigned long fallback = 0;
436 unsigned long min, max, start, sidx, midx, step; 434 unsigned long min, max, start, sidx, midx, step;
@@ -530,17 +528,34 @@ find_block:
530 return NULL; 528 return NULL;
531} 529}
532 530
531static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
532 unsigned long size, unsigned long align,
533 unsigned long goal, unsigned long limit)
534{
535#ifdef CONFIG_HAVE_ARCH_BOOTMEM
536 bootmem_data_t *p_bdata;
537
538 p_bdata = bootmem_arch_preferred_node(bdata, size, align, goal, limit);
539 if (p_bdata)
540 return alloc_bootmem_core(p_bdata, size, align, goal, limit);
541#endif
542 return NULL;
543}
544
533static void * __init ___alloc_bootmem_nopanic(unsigned long size, 545static void * __init ___alloc_bootmem_nopanic(unsigned long size,
534 unsigned long align, 546 unsigned long align,
535 unsigned long goal, 547 unsigned long goal,
536 unsigned long limit) 548 unsigned long limit)
537{ 549{
538 bootmem_data_t *bdata; 550 bootmem_data_t *bdata;
551 void *region;
539 552
540restart: 553restart:
541 list_for_each_entry(bdata, &bdata_list, list) { 554 region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit);
542 void *region; 555 if (region)
556 return region;
543 557
558 list_for_each_entry(bdata, &bdata_list, list) {
544 if (goal && bdata->node_low_pfn <= PFN_DOWN(goal)) 559 if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
545 continue; 560 continue;
546 if (limit && bdata->node_min_pfn >= PFN_DOWN(limit)) 561 if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
@@ -618,6 +633,10 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
618{ 633{
619 void *ptr; 634 void *ptr;
620 635
636 ptr = alloc_arch_preferred_bootmem(bdata, size, align, goal, limit);
637 if (ptr)
638 return ptr;
639
621 ptr = alloc_bootmem_core(bdata, size, align, goal, limit); 640 ptr = alloc_bootmem_core(bdata, size, align, goal, limit);
622 if (ptr) 641 if (ptr)
623 return ptr; 642 return ptr;
@@ -674,6 +693,10 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
674{ 693{
675 void *ptr; 694 void *ptr;
676 695
696 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
697 if (ptr)
698 return ptr;
699
677 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); 700 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
678 if (ptr) 701 if (ptr)
679 return ptr; 702 return ptr;
diff --git a/mm/filemap.c b/mm/filemap.c
index 23acefe51808..126d3973b3d1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1823,7 +1823,7 @@ static size_t __iovec_copy_from_user_inatomic(char *vaddr,
1823 int copy = min(bytes, iov->iov_len - base); 1823 int copy = min(bytes, iov->iov_len - base);
1824 1824
1825 base = 0; 1825 base = 0;
1826 left = __copy_from_user_inatomic_nocache(vaddr, buf, copy); 1826 left = __copy_from_user_inatomic(vaddr, buf, copy);
1827 copied += copy; 1827 copied += copy;
1828 bytes -= copy; 1828 bytes -= copy;
1829 vaddr += copy; 1829 vaddr += copy;
@@ -1851,8 +1851,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
1851 if (likely(i->nr_segs == 1)) { 1851 if (likely(i->nr_segs == 1)) {
1852 int left; 1852 int left;
1853 char __user *buf = i->iov->iov_base + i->iov_offset; 1853 char __user *buf = i->iov->iov_base + i->iov_offset;
1854 left = __copy_from_user_inatomic_nocache(kaddr + offset, 1854 left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
1855 buf, bytes);
1856 copied = bytes - left; 1855 copied = bytes - left;
1857 } else { 1856 } else {
1858 copied = __iovec_copy_from_user_inatomic(kaddr + offset, 1857 copied = __iovec_copy_from_user_inatomic(kaddr + offset,
@@ -1880,7 +1879,7 @@ size_t iov_iter_copy_from_user(struct page *page,
1880 if (likely(i->nr_segs == 1)) { 1879 if (likely(i->nr_segs == 1)) {
1881 int left; 1880 int left;
1882 char __user *buf = i->iov->iov_base + i->iov_offset; 1881 char __user *buf = i->iov->iov_base + i->iov_offset;
1883 left = __copy_from_user_nocache(kaddr + offset, buf, bytes); 1882 left = __copy_from_user(kaddr + offset, buf, bytes);
1884 copied = bytes - left; 1883 copied = bytes - left;
1885 } else { 1884 } else {
1886 copied = __iovec_copy_from_user_inatomic(kaddr + offset, 1885 copied = __iovec_copy_from_user_inatomic(kaddr + offset,
diff --git a/mm/memory.c b/mm/memory.c
index baa999e87cd2..05fab3bc5b4b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -48,6 +48,8 @@
48#include <linux/rmap.h> 48#include <linux/rmap.h>
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/delayacct.h> 50#include <linux/delayacct.h>
51#include <linux/kprobes.h>
52#include <linux/mutex.h>
51#include <linux/init.h> 53#include <linux/init.h>
52#include <linux/writeback.h> 54#include <linux/writeback.h>
53#include <linux/memcontrol.h> 55#include <linux/memcontrol.h>
@@ -99,6 +101,14 @@ int randomize_va_space __read_mostly =
99 2; 101 2;
100#endif 102#endif
101 103
104/*
105 * mutex protecting text section modification (dynamic code patching).
106 * some users need to sleep (allocating memory...) while they hold this lock.
107 *
108 * NOT exported to modules - patching kernel text is a really delicate matter.
109 */
110DEFINE_MUTEX(text_mutex);
111
102static int __init disable_randmaps(char *s) 112static int __init disable_randmaps(char *s)
103{ 113{
104 randomize_va_space = 0; 114 randomize_va_space = 0;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5c44ed49ca93..a3803ea8c27d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1479,6 +1479,8 @@ __alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
1479 unsigned long did_some_progress; 1479 unsigned long did_some_progress;
1480 unsigned long pages_reclaimed = 0; 1480 unsigned long pages_reclaimed = 0;
1481 1481
1482 lockdep_trace_alloc(gfp_mask);
1483
1482 might_sleep_if(wait); 1484 might_sleep_if(wait);
1483 1485
1484 if (should_fail_alloc_page(gfp_mask, order)) 1486 if (should_fail_alloc_page(gfp_mask, order))
@@ -1578,12 +1580,15 @@ nofail_alloc:
1578 */ 1580 */
1579 cpuset_update_task_memory_state(); 1581 cpuset_update_task_memory_state();
1580 p->flags |= PF_MEMALLOC; 1582 p->flags |= PF_MEMALLOC;
1583
1584 lockdep_set_current_reclaim_state(gfp_mask);
1581 reclaim_state.reclaimed_slab = 0; 1585 reclaim_state.reclaimed_slab = 0;
1582 p->reclaim_state = &reclaim_state; 1586 p->reclaim_state = &reclaim_state;
1583 1587
1584 did_some_progress = try_to_free_pages(zonelist, order, gfp_mask); 1588 did_some_progress = try_to_free_pages(zonelist, order, gfp_mask);
1585 1589
1586 p->reclaim_state = NULL; 1590 p->reclaim_state = NULL;
1591 lockdep_clear_current_reclaim_state();
1587 p->flags &= ~PF_MEMALLOC; 1592 p->flags &= ~PF_MEMALLOC;
1588 1593
1589 cond_resched(); 1594 cond_resched();
diff --git a/mm/percpu.c b/mm/percpu.c
new file mode 100644
index 000000000000..bfe6a3afaf45
--- /dev/null
+++ b/mm/percpu.c
@@ -0,0 +1,1226 @@
1/*
2 * linux/mm/percpu.c - percpu memory allocator
3 *
4 * Copyright (C) 2009 SUSE Linux Products GmbH
5 * Copyright (C) 2009 Tejun Heo <tj@kernel.org>
6 *
7 * This file is released under the GPLv2.
8 *
9 * This is percpu allocator which can handle both static and dynamic
10 * areas. Percpu areas are allocated in chunks in vmalloc area. Each
11 * chunk is consisted of num_possible_cpus() units and the first chunk
12 * is used for static percpu variables in the kernel image (special
13 * boot time alloc/init handling necessary as these areas need to be
14 * brought up before allocation services are running). Unit grows as
15 * necessary and all units grow or shrink in unison. When a chunk is
16 * filled up, another chunk is allocated. ie. in vmalloc area
17 *
18 * c0 c1 c2
19 * ------------------- ------------------- ------------
20 * | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u
21 * ------------------- ...... ------------------- .... ------------
22 *
23 * Allocation is done in offset-size areas of single unit space. Ie,
24 * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
25 * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring
26 * percpu base registers UNIT_SIZE apart.
27 *
28 * There are usually many small percpu allocations many of them as
29 * small as 4 bytes. The allocator organizes chunks into lists
30 * according to free size and tries to allocate from the fullest one.
31 * Each chunk keeps the maximum contiguous area size hint which is
32 * guaranteed to be eqaul to or larger than the maximum contiguous
33 * area in the chunk. This helps the allocator not to iterate the
34 * chunk maps unnecessarily.
35 *
36 * Allocation state in each chunk is kept using an array of integers
37 * on chunk->map. A positive value in the map represents a free
38 * region and negative allocated. Allocation inside a chunk is done
39 * by scanning this map sequentially and serving the first matching
40 * entry. This is mostly copied from the percpu_modalloc() allocator.
41 * Chunks are also linked into a rb tree to ease address to chunk
42 * mapping during free.
43 *
44 * To use this allocator, arch code should do the followings.
45 *
46 * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
47 *
48 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
49 * regular address to percpu pointer and back
50 *
51 * - use pcpu_setup_first_chunk() during percpu area initialization to
52 * setup the first chunk containing the kernel static percpu area
53 */
54
55#include <linux/bitmap.h>
56#include <linux/bootmem.h>
57#include <linux/list.h>
58#include <linux/mm.h>
59#include <linux/module.h>
60#include <linux/mutex.h>
61#include <linux/percpu.h>
62#include <linux/pfn.h>
63#include <linux/rbtree.h>
64#include <linux/slab.h>
65#include <linux/spinlock.h>
66#include <linux/vmalloc.h>
67#include <linux/workqueue.h>
68
69#include <asm/cacheflush.h>
70#include <asm/tlbflush.h>
71
72#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
73#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
74
75struct pcpu_chunk {
76 struct list_head list; /* linked to pcpu_slot lists */
77 struct rb_node rb_node; /* key is chunk->vm->addr */
78 int free_size; /* free bytes in the chunk */
79 int contig_hint; /* max contiguous size hint */
80 struct vm_struct *vm; /* mapped vmalloc region */
81 int map_used; /* # of map entries used */
82 int map_alloc; /* # of map entries allocated */
83 int *map; /* allocation map */
84 bool immutable; /* no [de]population allowed */
85 struct page **page; /* points to page array */
86 struct page *page_ar[]; /* #cpus * UNIT_PAGES */
87};
88
89static int pcpu_unit_pages __read_mostly;
90static int pcpu_unit_size __read_mostly;
91static int pcpu_chunk_size __read_mostly;
92static int pcpu_nr_slots __read_mostly;
93static size_t pcpu_chunk_struct_size __read_mostly;
94
95/* the address of the first chunk which starts with the kernel static area */
96void *pcpu_base_addr __read_mostly;
97EXPORT_SYMBOL_GPL(pcpu_base_addr);
98
99/* optional reserved chunk, only accessible for reserved allocations */
100static struct pcpu_chunk *pcpu_reserved_chunk;
101/* offset limit of the reserved chunk */
102static int pcpu_reserved_chunk_limit;
103
104/*
105 * Synchronization rules.
106 *
107 * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
108 * protects allocation/reclaim paths, chunks and chunk->page arrays.
109 * The latter is a spinlock and protects the index data structures -
110 * chunk slots, rbtree, chunks and area maps in chunks.
111 *
112 * During allocation, pcpu_alloc_mutex is kept locked all the time and
113 * pcpu_lock is grabbed and released as necessary. All actual memory
114 * allocations are done using GFP_KERNEL with pcpu_lock released.
115 *
116 * Free path accesses and alters only the index data structures, so it
117 * can be safely called from atomic context. When memory needs to be
118 * returned to the system, free path schedules reclaim_work which
119 * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be
120 * reclaimed, release both locks and frees the chunks. Note that it's
121 * necessary to grab both locks to remove a chunk from circulation as
122 * allocation path might be referencing the chunk with only
123 * pcpu_alloc_mutex locked.
124 */
125static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */
126static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */
127
128static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
129static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */
130
131/* reclaim work to release fully free chunks, scheduled from free path */
132static void pcpu_reclaim(struct work_struct *work);
133static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim);
134
135static int __pcpu_size_to_slot(int size)
136{
137 int highbit = fls(size); /* size is in bytes */
138 return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
139}
140
141static int pcpu_size_to_slot(int size)
142{
143 if (size == pcpu_unit_size)
144 return pcpu_nr_slots - 1;
145 return __pcpu_size_to_slot(size);
146}
147
148static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
149{
150 if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))
151 return 0;
152
153 return pcpu_size_to_slot(chunk->free_size);
154}
155
156static int pcpu_page_idx(unsigned int cpu, int page_idx)
157{
158 return cpu * pcpu_unit_pages + page_idx;
159}
160
161static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
162 unsigned int cpu, int page_idx)
163{
164 return &chunk->page[pcpu_page_idx(cpu, page_idx)];
165}
166
167static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
168 unsigned int cpu, int page_idx)
169{
170 return (unsigned long)chunk->vm->addr +
171 (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT);
172}
173
174static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
175 int page_idx)
176{
177 return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
178}
179
180/**
181 * pcpu_mem_alloc - allocate memory
182 * @size: bytes to allocate
183 *
184 * Allocate @size bytes. If @size is smaller than PAGE_SIZE,
185 * kzalloc() is used; otherwise, vmalloc() is used. The returned
186 * memory is always zeroed.
187 *
188 * CONTEXT:
189 * Does GFP_KERNEL allocation.
190 *
191 * RETURNS:
192 * Pointer to the allocated area on success, NULL on failure.
193 */
194static void *pcpu_mem_alloc(size_t size)
195{
196 if (size <= PAGE_SIZE)
197 return kzalloc(size, GFP_KERNEL);
198 else {
199 void *ptr = vmalloc(size);
200 if (ptr)
201 memset(ptr, 0, size);
202 return ptr;
203 }
204}
205
206/**
207 * pcpu_mem_free - free memory
208 * @ptr: memory to free
209 * @size: size of the area
210 *
211 * Free @ptr. @ptr should have been allocated using pcpu_mem_alloc().
212 */
213static void pcpu_mem_free(void *ptr, size_t size)
214{
215 if (size <= PAGE_SIZE)
216 kfree(ptr);
217 else
218 vfree(ptr);
219}
220
221/**
222 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
223 * @chunk: chunk of interest
224 * @oslot: the previous slot it was on
225 *
226 * This function is called after an allocation or free changed @chunk.
227 * New slot according to the changed state is determined and @chunk is
228 * moved to the slot. Note that the reserved chunk is never put on
229 * chunk slots.
230 *
231 * CONTEXT:
232 * pcpu_lock.
233 */
234static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
235{
236 int nslot = pcpu_chunk_slot(chunk);
237
238 if (chunk != pcpu_reserved_chunk && oslot != nslot) {
239 if (oslot < nslot)
240 list_move(&chunk->list, &pcpu_slot[nslot]);
241 else
242 list_move_tail(&chunk->list, &pcpu_slot[nslot]);
243 }
244}
245
246static struct rb_node **pcpu_chunk_rb_search(void *addr,
247 struct rb_node **parentp)
248{
249 struct rb_node **p = &pcpu_addr_root.rb_node;
250 struct rb_node *parent = NULL;
251 struct pcpu_chunk *chunk;
252
253 while (*p) {
254 parent = *p;
255 chunk = rb_entry(parent, struct pcpu_chunk, rb_node);
256
257 if (addr < chunk->vm->addr)
258 p = &(*p)->rb_left;
259 else if (addr > chunk->vm->addr)
260 p = &(*p)->rb_right;
261 else
262 break;
263 }
264
265 if (parentp)
266 *parentp = parent;
267 return p;
268}
269
270/**
271 * pcpu_chunk_addr_search - search for chunk containing specified address
272 * @addr: address to search for
273 *
274 * Look for chunk which might contain @addr. More specifically, it
275 * searchs for the chunk with the highest start address which isn't
276 * beyond @addr.
277 *
278 * CONTEXT:
279 * pcpu_lock.
280 *
281 * RETURNS:
282 * The address of the found chunk.
283 */
284static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
285{
286 struct rb_node *n, *parent;
287 struct pcpu_chunk *chunk;
288
289 /* is it in the reserved chunk? */
290 if (pcpu_reserved_chunk) {
291 void *start = pcpu_reserved_chunk->vm->addr;
292
293 if (addr >= start && addr < start + pcpu_reserved_chunk_limit)
294 return pcpu_reserved_chunk;
295 }
296
297 /* nah... search the regular ones */
298 n = *pcpu_chunk_rb_search(addr, &parent);
299 if (!n) {
300 /* no exactly matching chunk, the parent is the closest */
301 n = parent;
302 BUG_ON(!n);
303 }
304 chunk = rb_entry(n, struct pcpu_chunk, rb_node);
305
306 if (addr < chunk->vm->addr) {
307 /* the parent was the next one, look for the previous one */
308 n = rb_prev(n);
309 BUG_ON(!n);
310 chunk = rb_entry(n, struct pcpu_chunk, rb_node);
311 }
312
313 return chunk;
314}
315
316/**
317 * pcpu_chunk_addr_insert - insert chunk into address rb tree
318 * @new: chunk to insert
319 *
320 * Insert @new into address rb tree.
321 *
322 * CONTEXT:
323 * pcpu_lock.
324 */
325static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
326{
327 struct rb_node **p, *parent;
328
329 p = pcpu_chunk_rb_search(new->vm->addr, &parent);
330 BUG_ON(*p);
331 rb_link_node(&new->rb_node, parent, p);
332 rb_insert_color(&new->rb_node, &pcpu_addr_root);
333}
334
335/**
336 * pcpu_extend_area_map - extend area map for allocation
337 * @chunk: target chunk
338 *
339 * Extend area map of @chunk so that it can accomodate an allocation.
340 * A single allocation can split an area into three areas, so this
341 * function makes sure that @chunk->map has at least two extra slots.
342 *
343 * CONTEXT:
344 * pcpu_alloc_mutex, pcpu_lock. pcpu_lock is released and reacquired
345 * if area map is extended.
346 *
347 * RETURNS:
348 * 0 if noop, 1 if successfully extended, -errno on failure.
349 */
350static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
351{
352 int new_alloc;
353 int *new;
354 size_t size;
355
356 /* has enough? */
357 if (chunk->map_alloc >= chunk->map_used + 2)
358 return 0;
359
360 spin_unlock_irq(&pcpu_lock);
361
362 new_alloc = PCPU_DFL_MAP_ALLOC;
363 while (new_alloc < chunk->map_used + 2)
364 new_alloc *= 2;
365
366 new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
367 if (!new) {
368 spin_lock_irq(&pcpu_lock);
369 return -ENOMEM;
370 }
371
372 /*
373 * Acquire pcpu_lock and switch to new area map. Only free
374 * could have happened inbetween, so map_used couldn't have
375 * grown.
376 */
377 spin_lock_irq(&pcpu_lock);
378 BUG_ON(new_alloc < chunk->map_used + 2);
379
380 size = chunk->map_alloc * sizeof(chunk->map[0]);
381 memcpy(new, chunk->map, size);
382
383 /*
384 * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is
385 * one of the first chunks and still using static map.
386 */
387 if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
388 pcpu_mem_free(chunk->map, size);
389
390 chunk->map_alloc = new_alloc;
391 chunk->map = new;
392 return 0;
393}
394
395/**
396 * pcpu_split_block - split a map block
397 * @chunk: chunk of interest
398 * @i: index of map block to split
399 * @head: head size in bytes (can be 0)
400 * @tail: tail size in bytes (can be 0)
401 *
402 * Split the @i'th map block into two or three blocks. If @head is
403 * non-zero, @head bytes block is inserted before block @i moving it
404 * to @i+1 and reducing its size by @head bytes.
405 *
406 * If @tail is non-zero, the target block, which can be @i or @i+1
407 * depending on @head, is reduced by @tail bytes and @tail byte block
408 * is inserted after the target block.
409 *
410 * @chunk->map must have enough free slots to accomodate the split.
411 *
412 * CONTEXT:
413 * pcpu_lock.
414 */
415static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
416 int head, int tail)
417{
418 int nr_extra = !!head + !!tail;
419
420 BUG_ON(chunk->map_alloc < chunk->map_used + nr_extra);
421
422 /* insert new subblocks */
423 memmove(&chunk->map[i + nr_extra], &chunk->map[i],
424 sizeof(chunk->map[0]) * (chunk->map_used - i));
425 chunk->map_used += nr_extra;
426
427 if (head) {
428 chunk->map[i + 1] = chunk->map[i] - head;
429 chunk->map[i++] = head;
430 }
431 if (tail) {
432 chunk->map[i++] -= tail;
433 chunk->map[i] = tail;
434 }
435}
436
437/**
438 * pcpu_alloc_area - allocate area from a pcpu_chunk
439 * @chunk: chunk of interest
440 * @size: wanted size in bytes
441 * @align: wanted align
442 *
443 * Try to allocate @size bytes area aligned at @align from @chunk.
444 * Note that this function only allocates the offset. It doesn't
445 * populate or map the area.
446 *
447 * @chunk->map must have at least two free slots.
448 *
449 * CONTEXT:
450 * pcpu_lock.
451 *
452 * RETURNS:
453 * Allocated offset in @chunk on success, -1 if no matching area is
454 * found.
455 */
456static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
457{
458 int oslot = pcpu_chunk_slot(chunk);
459 int max_contig = 0;
460 int i, off;
461
462 for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) {
463 bool is_last = i + 1 == chunk->map_used;
464 int head, tail;
465
466 /* extra for alignment requirement */
467 head = ALIGN(off, align) - off;
468 BUG_ON(i == 0 && head != 0);
469
470 if (chunk->map[i] < 0)
471 continue;
472 if (chunk->map[i] < head + size) {
473 max_contig = max(chunk->map[i], max_contig);
474 continue;
475 }
476
477 /*
478 * If head is small or the previous block is free,
479 * merge'em. Note that 'small' is defined as smaller
480 * than sizeof(int), which is very small but isn't too
481 * uncommon for percpu allocations.
482 */
483 if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) {
484 if (chunk->map[i - 1] > 0)
485 chunk->map[i - 1] += head;
486 else {
487 chunk->map[i - 1] -= head;
488 chunk->free_size -= head;
489 }
490 chunk->map[i] -= head;
491 off += head;
492 head = 0;
493 }
494
495 /* if tail is small, just keep it around */
496 tail = chunk->map[i] - head - size;
497 if (tail < sizeof(int))
498 tail = 0;
499
500 /* split if warranted */
501 if (head || tail) {
502 pcpu_split_block(chunk, i, head, tail);
503 if (head) {
504 i++;
505 off += head;
506 max_contig = max(chunk->map[i - 1], max_contig);
507 }
508 if (tail)
509 max_contig = max(chunk->map[i + 1], max_contig);
510 }
511
512 /* update hint and mark allocated */
513 if (is_last)
514 chunk->contig_hint = max_contig; /* fully scanned */
515 else
516 chunk->contig_hint = max(chunk->contig_hint,
517 max_contig);
518
519 chunk->free_size -= chunk->map[i];
520 chunk->map[i] = -chunk->map[i];
521
522 pcpu_chunk_relocate(chunk, oslot);
523 return off;
524 }
525
526 chunk->contig_hint = max_contig; /* fully scanned */
527 pcpu_chunk_relocate(chunk, oslot);
528
529 /* tell the upper layer that this chunk has no matching area */
530 return -1;
531}
532
533/**
534 * pcpu_free_area - free area to a pcpu_chunk
535 * @chunk: chunk of interest
536 * @freeme: offset of area to free
537 *
538 * Free area starting from @freeme to @chunk. Note that this function
539 * only modifies the allocation map. It doesn't depopulate or unmap
540 * the area.
541 *
542 * CONTEXT:
543 * pcpu_lock.
544 */
545static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
546{
547 int oslot = pcpu_chunk_slot(chunk);
548 int i, off;
549
550 for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++]))
551 if (off == freeme)
552 break;
553 BUG_ON(off != freeme);
554 BUG_ON(chunk->map[i] > 0);
555
556 chunk->map[i] = -chunk->map[i];
557 chunk->free_size += chunk->map[i];
558
559 /* merge with previous? */
560 if (i > 0 && chunk->map[i - 1] >= 0) {
561 chunk->map[i - 1] += chunk->map[i];
562 chunk->map_used--;
563 memmove(&chunk->map[i], &chunk->map[i + 1],
564 (chunk->map_used - i) * sizeof(chunk->map[0]));
565 i--;
566 }
567 /* merge with next? */
568 if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) {
569 chunk->map[i] += chunk->map[i + 1];
570 chunk->map_used--;
571 memmove(&chunk->map[i + 1], &chunk->map[i + 2],
572 (chunk->map_used - (i + 1)) * sizeof(chunk->map[0]));
573 }
574
575 chunk->contig_hint = max(chunk->map[i], chunk->contig_hint);
576 pcpu_chunk_relocate(chunk, oslot);
577}
578
579/**
580 * pcpu_unmap - unmap pages out of a pcpu_chunk
581 * @chunk: chunk of interest
582 * @page_start: page index of the first page to unmap
583 * @page_end: page index of the last page to unmap + 1
584 * @flush: whether to flush cache and tlb or not
585 *
586 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
587 * If @flush is true, vcache is flushed before unmapping and tlb
588 * after.
589 */
590static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
591 bool flush)
592{
593 unsigned int last = num_possible_cpus() - 1;
594 unsigned int cpu;
595
596 /* unmap must not be done on immutable chunk */
597 WARN_ON(chunk->immutable);
598
599 /*
600 * Each flushing trial can be very expensive, issue flush on
601 * the whole region at once rather than doing it for each cpu.
602 * This could be an overkill but is more scalable.
603 */
604 if (flush)
605 flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
606 pcpu_chunk_addr(chunk, last, page_end));
607
608 for_each_possible_cpu(cpu)
609 unmap_kernel_range_noflush(
610 pcpu_chunk_addr(chunk, cpu, page_start),
611 (page_end - page_start) << PAGE_SHIFT);
612
613 /* ditto as flush_cache_vunmap() */
614 if (flush)
615 flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
616 pcpu_chunk_addr(chunk, last, page_end));
617}
618
619/**
620 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
621 * @chunk: chunk to depopulate
622 * @off: offset to the area to depopulate
623 * @size: size of the area to depopulate in bytes
624 * @flush: whether to flush cache and tlb or not
625 *
626 * For each cpu, depopulate and unmap pages [@page_start,@page_end)
627 * from @chunk. If @flush is true, vcache is flushed before unmapping
628 * and tlb after.
629 *
630 * CONTEXT:
631 * pcpu_alloc_mutex.
632 */
633static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
634 bool flush)
635{
636 int page_start = PFN_DOWN(off);
637 int page_end = PFN_UP(off + size);
638 int unmap_start = -1;
639 int uninitialized_var(unmap_end);
640 unsigned int cpu;
641 int i;
642
643 for (i = page_start; i < page_end; i++) {
644 for_each_possible_cpu(cpu) {
645 struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
646
647 if (!*pagep)
648 continue;
649
650 __free_page(*pagep);
651
652 /*
653 * If it's partial depopulation, it might get
654 * populated or depopulated again. Mark the
655 * page gone.
656 */
657 *pagep = NULL;
658
659 unmap_start = unmap_start < 0 ? i : unmap_start;
660 unmap_end = i + 1;
661 }
662 }
663
664 if (unmap_start >= 0)
665 pcpu_unmap(chunk, unmap_start, unmap_end, flush);
666}
667
668/**
669 * pcpu_map - map pages into a pcpu_chunk
670 * @chunk: chunk of interest
671 * @page_start: page index of the first page to map
672 * @page_end: page index of the last page to map + 1
673 *
674 * For each cpu, map pages [@page_start,@page_end) into @chunk.
675 * vcache is flushed afterwards.
676 */
677static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
678{
679 unsigned int last = num_possible_cpus() - 1;
680 unsigned int cpu;
681 int err;
682
683 /* map must not be done on immutable chunk */
684 WARN_ON(chunk->immutable);
685
686 for_each_possible_cpu(cpu) {
687 err = map_kernel_range_noflush(
688 pcpu_chunk_addr(chunk, cpu, page_start),
689 (page_end - page_start) << PAGE_SHIFT,
690 PAGE_KERNEL,
691 pcpu_chunk_pagep(chunk, cpu, page_start));
692 if (err < 0)
693 return err;
694 }
695
696 /* flush at once, please read comments in pcpu_unmap() */
697 flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
698 pcpu_chunk_addr(chunk, last, page_end));
699 return 0;
700}
701
702/**
703 * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
704 * @chunk: chunk of interest
705 * @off: offset to the area to populate
706 * @size: size of the area to populate in bytes
707 *
708 * For each cpu, populate and map pages [@page_start,@page_end) into
709 * @chunk. The area is cleared on return.
710 *
711 * CONTEXT:
712 * pcpu_alloc_mutex, does GFP_KERNEL allocation.
713 */
714static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
715{
716 const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
717 int page_start = PFN_DOWN(off);
718 int page_end = PFN_UP(off + size);
719 int map_start = -1;
720 int uninitialized_var(map_end);
721 unsigned int cpu;
722 int i;
723
724 for (i = page_start; i < page_end; i++) {
725 if (pcpu_chunk_page_occupied(chunk, i)) {
726 if (map_start >= 0) {
727 if (pcpu_map(chunk, map_start, map_end))
728 goto err;
729 map_start = -1;
730 }
731 continue;
732 }
733
734 map_start = map_start < 0 ? i : map_start;
735 map_end = i + 1;
736
737 for_each_possible_cpu(cpu) {
738 struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
739
740 *pagep = alloc_pages_node(cpu_to_node(cpu),
741 alloc_mask, 0);
742 if (!*pagep)
743 goto err;
744 }
745 }
746
747 if (map_start >= 0 && pcpu_map(chunk, map_start, map_end))
748 goto err;
749
750 for_each_possible_cpu(cpu)
751 memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0,
752 size);
753
754 return 0;
755err:
756 /* likely under heavy memory pressure, give memory back */
757 pcpu_depopulate_chunk(chunk, off, size, true);
758 return -ENOMEM;
759}
760
761static void free_pcpu_chunk(struct pcpu_chunk *chunk)
762{
763 if (!chunk)
764 return;
765 if (chunk->vm)
766 free_vm_area(chunk->vm);
767 pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
768 kfree(chunk);
769}
770
771static struct pcpu_chunk *alloc_pcpu_chunk(void)
772{
773 struct pcpu_chunk *chunk;
774
775 chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL);
776 if (!chunk)
777 return NULL;
778
779 chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
780 chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
781 chunk->map[chunk->map_used++] = pcpu_unit_size;
782 chunk->page = chunk->page_ar;
783
784 chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
785 if (!chunk->vm) {
786 free_pcpu_chunk(chunk);
787 return NULL;
788 }
789
790 INIT_LIST_HEAD(&chunk->list);
791 chunk->free_size = pcpu_unit_size;
792 chunk->contig_hint = pcpu_unit_size;
793
794 return chunk;
795}
796
797/**
798 * pcpu_alloc - the percpu allocator
799 * @size: size of area to allocate in bytes
800 * @align: alignment of area (max PAGE_SIZE)
801 * @reserved: allocate from the reserved chunk if available
802 *
803 * Allocate percpu area of @size bytes aligned at @align.
804 *
805 * CONTEXT:
806 * Does GFP_KERNEL allocation.
807 *
808 * RETURNS:
809 * Percpu pointer to the allocated area on success, NULL on failure.
810 */
811static void *pcpu_alloc(size_t size, size_t align, bool reserved)
812{
813 struct pcpu_chunk *chunk;
814 int slot, off;
815
816 if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
817 WARN(true, "illegal size (%zu) or align (%zu) for "
818 "percpu allocation\n", size, align);
819 return NULL;
820 }
821
822 mutex_lock(&pcpu_alloc_mutex);
823 spin_lock_irq(&pcpu_lock);
824
825 /* serve reserved allocations from the reserved chunk if available */
826 if (reserved && pcpu_reserved_chunk) {
827 chunk = pcpu_reserved_chunk;
828 if (size > chunk->contig_hint ||
829 pcpu_extend_area_map(chunk) < 0)
830 goto fail_unlock;
831 off = pcpu_alloc_area(chunk, size, align);
832 if (off >= 0)
833 goto area_found;
834 goto fail_unlock;
835 }
836
837restart:
838 /* search through normal chunks */
839 for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
840 list_for_each_entry(chunk, &pcpu_slot[slot], list) {
841 if (size > chunk->contig_hint)
842 continue;
843
844 switch (pcpu_extend_area_map(chunk)) {
845 case 0:
846 break;
847 case 1:
848 goto restart; /* pcpu_lock dropped, restart */
849 default:
850 goto fail_unlock;
851 }
852
853 off = pcpu_alloc_area(chunk, size, align);
854 if (off >= 0)
855 goto area_found;
856 }
857 }
858
859 /* hmmm... no space left, create a new chunk */
860 spin_unlock_irq(&pcpu_lock);
861
862 chunk = alloc_pcpu_chunk();
863 if (!chunk)
864 goto fail_unlock_mutex;
865
866 spin_lock_irq(&pcpu_lock);
867 pcpu_chunk_relocate(chunk, -1);
868 pcpu_chunk_addr_insert(chunk);
869 goto restart;
870
871area_found:
872 spin_unlock_irq(&pcpu_lock);
873
874 /* populate, map and clear the area */
875 if (pcpu_populate_chunk(chunk, off, size)) {
876 spin_lock_irq(&pcpu_lock);
877 pcpu_free_area(chunk, off);
878 goto fail_unlock;
879 }
880
881 mutex_unlock(&pcpu_alloc_mutex);
882
883 return __addr_to_pcpu_ptr(chunk->vm->addr + off);
884
885fail_unlock:
886 spin_unlock_irq(&pcpu_lock);
887fail_unlock_mutex:
888 mutex_unlock(&pcpu_alloc_mutex);
889 return NULL;
890}
891
892/**
893 * __alloc_percpu - allocate dynamic percpu area
894 * @size: size of area to allocate in bytes
895 * @align: alignment of area (max PAGE_SIZE)
896 *
897 * Allocate percpu area of @size bytes aligned at @align. Might
898 * sleep. Might trigger writeouts.
899 *
900 * CONTEXT:
901 * Does GFP_KERNEL allocation.
902 *
903 * RETURNS:
904 * Percpu pointer to the allocated area on success, NULL on failure.
905 */
906void *__alloc_percpu(size_t size, size_t align)
907{
908 return pcpu_alloc(size, align, false);
909}
910EXPORT_SYMBOL_GPL(__alloc_percpu);
911
912/**
913 * __alloc_reserved_percpu - allocate reserved percpu area
914 * @size: size of area to allocate in bytes
915 * @align: alignment of area (max PAGE_SIZE)
916 *
917 * Allocate percpu area of @size bytes aligned at @align from reserved
918 * percpu area if arch has set it up; otherwise, allocation is served
919 * from the same dynamic area. Might sleep. Might trigger writeouts.
920 *
921 * CONTEXT:
922 * Does GFP_KERNEL allocation.
923 *
924 * RETURNS:
925 * Percpu pointer to the allocated area on success, NULL on failure.
926 */
927void *__alloc_reserved_percpu(size_t size, size_t align)
928{
929 return pcpu_alloc(size, align, true);
930}
931
932/**
933 * pcpu_reclaim - reclaim fully free chunks, workqueue function
934 * @work: unused
935 *
936 * Reclaim all fully free chunks except for the first one.
937 *
938 * CONTEXT:
939 * workqueue context.
940 */
941static void pcpu_reclaim(struct work_struct *work)
942{
943 LIST_HEAD(todo);
944 struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];
945 struct pcpu_chunk *chunk, *next;
946
947 mutex_lock(&pcpu_alloc_mutex);
948 spin_lock_irq(&pcpu_lock);
949
950 list_for_each_entry_safe(chunk, next, head, list) {
951 WARN_ON(chunk->immutable);
952
953 /* spare the first one */
954 if (chunk == list_first_entry(head, struct pcpu_chunk, list))
955 continue;
956
957 rb_erase(&chunk->rb_node, &pcpu_addr_root);
958 list_move(&chunk->list, &todo);
959 }
960
961 spin_unlock_irq(&pcpu_lock);
962 mutex_unlock(&pcpu_alloc_mutex);
963
964 list_for_each_entry_safe(chunk, next, &todo, list) {
965 pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
966 free_pcpu_chunk(chunk);
967 }
968}
969
970/**
971 * free_percpu - free percpu area
972 * @ptr: pointer to area to free
973 *
974 * Free percpu area @ptr.
975 *
976 * CONTEXT:
977 * Can be called from atomic context.
978 */
979void free_percpu(void *ptr)
980{
981 void *addr = __pcpu_ptr_to_addr(ptr);
982 struct pcpu_chunk *chunk;
983 unsigned long flags;
984 int off;
985
986 if (!ptr)
987 return;
988
989 spin_lock_irqsave(&pcpu_lock, flags);
990
991 chunk = pcpu_chunk_addr_search(addr);
992 off = addr - chunk->vm->addr;
993
994 pcpu_free_area(chunk, off);
995
996 /* if there are more than one fully free chunks, wake up grim reaper */
997 if (chunk->free_size == pcpu_unit_size) {
998 struct pcpu_chunk *pos;
999
1000 list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
1001 if (pos != chunk) {
1002 schedule_work(&pcpu_reclaim_work);
1003 break;
1004 }
1005 }
1006
1007 spin_unlock_irqrestore(&pcpu_lock, flags);
1008}
1009EXPORT_SYMBOL_GPL(free_percpu);
1010
1011/**
1012 * pcpu_setup_first_chunk - initialize the first percpu chunk
1013 * @get_page_fn: callback to fetch page pointer
1014 * @static_size: the size of static percpu area in bytes
1015 * @reserved_size: the size of reserved percpu area in bytes
1016 * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
1017 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
1018 * @base_addr: mapped address, NULL for auto
1019 * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary
1020 *
1021 * Initialize the first percpu chunk which contains the kernel static
1022 * perpcu area. This function is to be called from arch percpu area
1023 * setup path. The first two parameters are mandatory. The rest are
1024 * optional.
1025 *
1026 * @get_page_fn() should return pointer to percpu page given cpu
1027 * number and page number. It should at least return enough pages to
1028 * cover the static area. The returned pages for static area should
1029 * have been initialized with valid data. If @unit_size is specified,
1030 * it can also return pages after the static area. NULL return
1031 * indicates end of pages for the cpu. Note that @get_page_fn() must
1032 * return the same number of pages for all cpus.
1033 *
1034 * @reserved_size, if non-zero, specifies the amount of bytes to
1035 * reserve after the static area in the first chunk. This reserves
1036 * the first chunk such that it's available only through reserved
1037 * percpu allocation. This is primarily used to serve module percpu
1038 * static areas on architectures where the addressing model has
1039 * limited offset range for symbol relocations to guarantee module
1040 * percpu symbols fall inside the relocatable range.
1041 *
1042 * @unit_size, if non-negative, specifies unit size and must be
1043 * aligned to PAGE_SIZE and equal to or larger than @static_size +
1044 * @reserved_size + @dyn_size.
1045 *
1046 * @dyn_size, if non-negative, limits the number of bytes available
1047 * for dynamic allocation in the first chunk. Specifying non-negative
1048 * value make percpu leave alone the area beyond @static_size +
1049 * @reserved_size + @dyn_size.
1050 *
1051 * Non-null @base_addr means that the caller already allocated virtual
1052 * region for the first chunk and mapped it. percpu must not mess
1053 * with the chunk. Note that @base_addr with 0 @unit_size or non-NULL
1054 * @populate_pte_fn doesn't make any sense.
1055 *
1056 * @populate_pte_fn is used to populate the pagetable. NULL means the
1057 * caller already populated the pagetable.
1058 *
1059 * If the first chunk ends up with both reserved and dynamic areas, it
1060 * is served by two chunks - one to serve the core static and reserved
1061 * areas and the other for the dynamic area. They share the same vm
1062 * and page map but uses different area allocation map to stay away
1063 * from each other. The latter chunk is circulated in the chunk slots
1064 * and available for dynamic allocation like any other chunks.
1065 *
1066 * RETURNS:
1067 * The determined pcpu_unit_size which can be used to initialize
1068 * percpu access.
1069 */
1070size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
1071 size_t static_size, size_t reserved_size,
1072 ssize_t unit_size, ssize_t dyn_size,
1073 void *base_addr,
1074 pcpu_populate_pte_fn_t populate_pte_fn)
1075{
1076 static struct vm_struct first_vm;
1077 static int smap[2], dmap[2];
1078 struct pcpu_chunk *schunk, *dchunk = NULL;
1079 unsigned int cpu;
1080 int nr_pages;
1081 int err, i;
1082
1083 /* santiy checks */
1084 BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
1085 ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
1086 BUG_ON(!static_size);
1087 if (unit_size >= 0) {
1088 BUG_ON(unit_size < static_size + reserved_size +
1089 (dyn_size >= 0 ? dyn_size : 0));
1090 BUG_ON(unit_size & ~PAGE_MASK);
1091 } else {
1092 BUG_ON(dyn_size >= 0);
1093 BUG_ON(base_addr);
1094 }
1095 BUG_ON(base_addr && populate_pte_fn);
1096
1097 if (unit_size >= 0)
1098 pcpu_unit_pages = unit_size >> PAGE_SHIFT;
1099 else
1100 pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
1101 PFN_UP(static_size + reserved_size));
1102
1103 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
1104 pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
1105 pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
1106 + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);
1107
1108 if (dyn_size < 0)
1109 dyn_size = pcpu_unit_size - static_size - reserved_size;
1110
1111 /*
1112 * Allocate chunk slots. The additional last slot is for
1113 * empty chunks.
1114 */
1115 pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
1116 pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
1117 for (i = 0; i < pcpu_nr_slots; i++)
1118 INIT_LIST_HEAD(&pcpu_slot[i]);
1119
1120 /*
1121 * Initialize static chunk. If reserved_size is zero, the
1122 * static chunk covers static area + dynamic allocation area
1123 * in the first chunk. If reserved_size is not zero, it
1124 * covers static area + reserved area (mostly used for module
1125 * static percpu allocation).
1126 */
1127 schunk = alloc_bootmem(pcpu_chunk_struct_size);
1128 INIT_LIST_HEAD(&schunk->list);
1129 schunk->vm = &first_vm;
1130 schunk->map = smap;
1131 schunk->map_alloc = ARRAY_SIZE(smap);
1132 schunk->page = schunk->page_ar;
1133
1134 if (reserved_size) {
1135 schunk->free_size = reserved_size;
1136 pcpu_reserved_chunk = schunk; /* not for dynamic alloc */
1137 } else {
1138 schunk->free_size = dyn_size;
1139 dyn_size = 0; /* dynamic area covered */
1140 }
1141 schunk->contig_hint = schunk->free_size;
1142
1143 schunk->map[schunk->map_used++] = -static_size;
1144 if (schunk->free_size)
1145 schunk->map[schunk->map_used++] = schunk->free_size;
1146
1147 pcpu_reserved_chunk_limit = static_size + schunk->free_size;
1148
1149 /* init dynamic chunk if necessary */
1150 if (dyn_size) {
1151 dchunk = alloc_bootmem(sizeof(struct pcpu_chunk));
1152 INIT_LIST_HEAD(&dchunk->list);
1153 dchunk->vm = &first_vm;
1154 dchunk->map = dmap;
1155 dchunk->map_alloc = ARRAY_SIZE(dmap);
1156 dchunk->page = schunk->page_ar; /* share page map with schunk */
1157
1158 dchunk->contig_hint = dchunk->free_size = dyn_size;
1159 dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
1160 dchunk->map[dchunk->map_used++] = dchunk->free_size;
1161 }
1162
1163 /* allocate vm address */
1164 first_vm.flags = VM_ALLOC;
1165 first_vm.size = pcpu_chunk_size;
1166
1167 if (!base_addr)
1168 vm_area_register_early(&first_vm, PAGE_SIZE);
1169 else {
1170 /*
1171 * Pages already mapped. No need to remap into
1172 * vmalloc area. In this case the first chunks can't
1173 * be mapped or unmapped by percpu and are marked
1174 * immutable.
1175 */
1176 first_vm.addr = base_addr;
1177 schunk->immutable = true;
1178 if (dchunk)
1179 dchunk->immutable = true;
1180 }
1181
1182 /* assign pages */
1183 nr_pages = -1;
1184 for_each_possible_cpu(cpu) {
1185 for (i = 0; i < pcpu_unit_pages; i++) {
1186 struct page *page = get_page_fn(cpu, i);
1187
1188 if (!page)
1189 break;
1190 *pcpu_chunk_pagep(schunk, cpu, i) = page;
1191 }
1192
1193 BUG_ON(i < PFN_UP(static_size));
1194
1195 if (nr_pages < 0)
1196 nr_pages = i;
1197 else
1198 BUG_ON(nr_pages != i);
1199 }
1200
1201 /* map them */
1202 if (populate_pte_fn) {
1203 for_each_possible_cpu(cpu)
1204 for (i = 0; i < nr_pages; i++)
1205 populate_pte_fn(pcpu_chunk_addr(schunk,
1206 cpu, i));
1207
1208 err = pcpu_map(schunk, 0, nr_pages);
1209 if (err)
1210 panic("failed to setup static percpu area, err=%d\n",
1211 err);
1212 }
1213
1214 /* link the first chunk in */
1215 if (!dchunk) {
1216 pcpu_chunk_relocate(schunk, -1);
1217 pcpu_chunk_addr_insert(schunk);
1218 } else {
1219 pcpu_chunk_relocate(dchunk, -1);
1220 pcpu_chunk_addr_insert(dchunk);
1221 }
1222
1223 /* we're done */
1224 pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
1225 return pcpu_unit_size;
1226}
diff --git a/mm/slab.c b/mm/slab.c
index 4d00855629c4..9ec66c3e6ee0 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -102,6 +102,7 @@
102#include <linux/cpu.h> 102#include <linux/cpu.h>
103#include <linux/sysctl.h> 103#include <linux/sysctl.h>
104#include <linux/module.h> 104#include <linux/module.h>
105#include <trace/kmemtrace.h>
105#include <linux/rcupdate.h> 106#include <linux/rcupdate.h>
106#include <linux/string.h> 107#include <linux/string.h>
107#include <linux/uaccess.h> 108#include <linux/uaccess.h>
@@ -568,6 +569,14 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
568 569
569#endif 570#endif
570 571
572#ifdef CONFIG_KMEMTRACE
573size_t slab_buffer_size(struct kmem_cache *cachep)
574{
575 return cachep->buffer_size;
576}
577EXPORT_SYMBOL(slab_buffer_size);
578#endif
579
571/* 580/*
572 * Do not go above this order unless 0 objects fit into the slab. 581 * Do not go above this order unless 0 objects fit into the slab.
573 */ 582 */
@@ -3318,6 +3327,8 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3318 unsigned long save_flags; 3327 unsigned long save_flags;
3319 void *ptr; 3328 void *ptr;
3320 3329
3330 lockdep_trace_alloc(flags);
3331
3321 if (slab_should_failslab(cachep, flags)) 3332 if (slab_should_failslab(cachep, flags))
3322 return NULL; 3333 return NULL;
3323 3334
@@ -3394,6 +3405,8 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
3394 unsigned long save_flags; 3405 unsigned long save_flags;
3395 void *objp; 3406 void *objp;
3396 3407
3408 lockdep_trace_alloc(flags);
3409
3397 if (slab_should_failslab(cachep, flags)) 3410 if (slab_should_failslab(cachep, flags))
3398 return NULL; 3411 return NULL;
3399 3412
@@ -3550,10 +3563,23 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3550 */ 3563 */
3551void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3564void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3552{ 3565{
3553 return __cache_alloc(cachep, flags, __builtin_return_address(0)); 3566 void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
3567
3568 kmemtrace_mark_alloc(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret,
3569 obj_size(cachep), cachep->buffer_size, flags);
3570
3571 return ret;
3554} 3572}
3555EXPORT_SYMBOL(kmem_cache_alloc); 3573EXPORT_SYMBOL(kmem_cache_alloc);
3556 3574
3575#ifdef CONFIG_KMEMTRACE
3576void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags)
3577{
3578 return __cache_alloc(cachep, flags, __builtin_return_address(0));
3579}
3580EXPORT_SYMBOL(kmem_cache_alloc_notrace);
3581#endif
3582
3557/** 3583/**
3558 * kmem_ptr_validate - check if an untrusted pointer might be a slab entry. 3584 * kmem_ptr_validate - check if an untrusted pointer might be a slab entry.
3559 * @cachep: the cache we're checking against 3585 * @cachep: the cache we're checking against
@@ -3598,23 +3624,47 @@ out:
3598#ifdef CONFIG_NUMA 3624#ifdef CONFIG_NUMA
3599void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) 3625void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3600{ 3626{
3601 return __cache_alloc_node(cachep, flags, nodeid, 3627 void *ret = __cache_alloc_node(cachep, flags, nodeid,
3602 __builtin_return_address(0)); 3628 __builtin_return_address(0));
3629
3630 kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret,
3631 obj_size(cachep), cachep->buffer_size,
3632 flags, nodeid);
3633
3634 return ret;
3603} 3635}
3604EXPORT_SYMBOL(kmem_cache_alloc_node); 3636EXPORT_SYMBOL(kmem_cache_alloc_node);
3605 3637
3638#ifdef CONFIG_KMEMTRACE
3639void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep,
3640 gfp_t flags,
3641 int nodeid)
3642{
3643 return __cache_alloc_node(cachep, flags, nodeid,
3644 __builtin_return_address(0));
3645}
3646EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
3647#endif
3648
3606static __always_inline void * 3649static __always_inline void *
3607__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) 3650__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
3608{ 3651{
3609 struct kmem_cache *cachep; 3652 struct kmem_cache *cachep;
3653 void *ret;
3610 3654
3611 cachep = kmem_find_general_cachep(size, flags); 3655 cachep = kmem_find_general_cachep(size, flags);
3612 if (unlikely(ZERO_OR_NULL_PTR(cachep))) 3656 if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3613 return cachep; 3657 return cachep;
3614 return kmem_cache_alloc_node(cachep, flags, node); 3658 ret = kmem_cache_alloc_node_notrace(cachep, flags, node);
3659
3660 kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC,
3661 (unsigned long) caller, ret,
3662 size, cachep->buffer_size, flags, node);
3663
3664 return ret;
3615} 3665}
3616 3666
3617#ifdef CONFIG_DEBUG_SLAB 3667#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE)
3618void *__kmalloc_node(size_t size, gfp_t flags, int node) 3668void *__kmalloc_node(size_t size, gfp_t flags, int node)
3619{ 3669{
3620 return __do_kmalloc_node(size, flags, node, 3670 return __do_kmalloc_node(size, flags, node,
@@ -3647,6 +3697,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3647 void *caller) 3697 void *caller)
3648{ 3698{
3649 struct kmem_cache *cachep; 3699 struct kmem_cache *cachep;
3700 void *ret;
3650 3701
3651 /* If you want to save a few bytes .text space: replace 3702 /* If you want to save a few bytes .text space: replace
3652 * __ with kmem_. 3703 * __ with kmem_.
@@ -3656,11 +3707,17 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3656 cachep = __find_general_cachep(size, flags); 3707 cachep = __find_general_cachep(size, flags);
3657 if (unlikely(ZERO_OR_NULL_PTR(cachep))) 3708 if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3658 return cachep; 3709 return cachep;
3659 return __cache_alloc(cachep, flags, caller); 3710 ret = __cache_alloc(cachep, flags, caller);
3711
3712 kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC,
3713 (unsigned long) caller, ret,
3714 size, cachep->buffer_size, flags);
3715
3716 return ret;
3660} 3717}
3661 3718
3662 3719
3663#ifdef CONFIG_DEBUG_SLAB 3720#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE)
3664void *__kmalloc(size_t size, gfp_t flags) 3721void *__kmalloc(size_t size, gfp_t flags)
3665{ 3722{
3666 return __do_kmalloc(size, flags, __builtin_return_address(0)); 3723 return __do_kmalloc(size, flags, __builtin_return_address(0));
@@ -3699,6 +3756,8 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3699 debug_check_no_obj_freed(objp, obj_size(cachep)); 3756 debug_check_no_obj_freed(objp, obj_size(cachep));
3700 __cache_free(cachep, objp); 3757 __cache_free(cachep, objp);
3701 local_irq_restore(flags); 3758 local_irq_restore(flags);
3759
3760 kmemtrace_mark_free(KMEMTRACE_TYPE_CACHE, _RET_IP_, objp);
3702} 3761}
3703EXPORT_SYMBOL(kmem_cache_free); 3762EXPORT_SYMBOL(kmem_cache_free);
3704 3763
@@ -3725,6 +3784,8 @@ void kfree(const void *objp)
3725 debug_check_no_obj_freed(objp, obj_size(c)); 3784 debug_check_no_obj_freed(objp, obj_size(c));
3726 __cache_free(c, (void *)objp); 3785 __cache_free(c, (void *)objp);
3727 local_irq_restore(flags); 3786 local_irq_restore(flags);
3787
3788 kmemtrace_mark_free(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, objp);
3728} 3789}
3729EXPORT_SYMBOL(kfree); 3790EXPORT_SYMBOL(kfree);
3730 3791
diff --git a/mm/slob.c b/mm/slob.c
index 52bc8a2bd9ef..596152926a8d 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -65,6 +65,7 @@
65#include <linux/module.h> 65#include <linux/module.h>
66#include <linux/rcupdate.h> 66#include <linux/rcupdate.h>
67#include <linux/list.h> 67#include <linux/list.h>
68#include <trace/kmemtrace.h>
68#include <asm/atomic.h> 69#include <asm/atomic.h>
69 70
70/* 71/*
@@ -463,27 +464,40 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
463{ 464{
464 unsigned int *m; 465 unsigned int *m;
465 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 466 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
467 void *ret;
468
469 lockdep_trace_alloc(flags);
466 470
467 if (size < PAGE_SIZE - align) { 471 if (size < PAGE_SIZE - align) {
468 if (!size) 472 if (!size)
469 return ZERO_SIZE_PTR; 473 return ZERO_SIZE_PTR;
470 474
471 m = slob_alloc(size + align, gfp, align, node); 475 m = slob_alloc(size + align, gfp, align, node);
476
472 if (!m) 477 if (!m)
473 return NULL; 478 return NULL;
474 *m = size; 479 *m = size;
475 return (void *)m + align; 480 ret = (void *)m + align;
481
482 kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC,
483 _RET_IP_, ret,
484 size, size + align, gfp, node);
476 } else { 485 } else {
477 void *ret; 486 unsigned int order = get_order(size);
478 487
479 ret = slob_new_page(gfp | __GFP_COMP, get_order(size), node); 488 ret = slob_new_page(gfp | __GFP_COMP, order, node);
480 if (ret) { 489 if (ret) {
481 struct page *page; 490 struct page *page;
482 page = virt_to_page(ret); 491 page = virt_to_page(ret);
483 page->private = size; 492 page->private = size;
484 } 493 }
485 return ret; 494
495 kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC,
496 _RET_IP_, ret,
497 size, PAGE_SIZE << order, gfp, node);
486 } 498 }
499
500 return ret;
487} 501}
488EXPORT_SYMBOL(__kmalloc_node); 502EXPORT_SYMBOL(__kmalloc_node);
489 503
@@ -501,6 +515,8 @@ void kfree(const void *block)
501 slob_free(m, *m + align); 515 slob_free(m, *m + align);
502 } else 516 } else
503 put_page(&sp->page); 517 put_page(&sp->page);
518
519 kmemtrace_mark_free(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, block);
504} 520}
505EXPORT_SYMBOL(kfree); 521EXPORT_SYMBOL(kfree);
506 522
@@ -570,10 +586,19 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
570{ 586{
571 void *b; 587 void *b;
572 588
573 if (c->size < PAGE_SIZE) 589 if (c->size < PAGE_SIZE) {
574 b = slob_alloc(c->size, flags, c->align, node); 590 b = slob_alloc(c->size, flags, c->align, node);
575 else 591 kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE,
592 _RET_IP_, b, c->size,
593 SLOB_UNITS(c->size) * SLOB_UNIT,
594 flags, node);
595 } else {
576 b = slob_new_page(flags, get_order(c->size), node); 596 b = slob_new_page(flags, get_order(c->size), node);
597 kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE,
598 _RET_IP_, b, c->size,
599 PAGE_SIZE << get_order(c->size),
600 flags, node);
601 }
577 602
578 if (c->ctor) 603 if (c->ctor)
579 c->ctor(b); 604 c->ctor(b);
@@ -609,6 +634,8 @@ void kmem_cache_free(struct kmem_cache *c, void *b)
609 } else { 634 } else {
610 __kmem_cache_free(b, c->size); 635 __kmem_cache_free(b, c->size);
611 } 636 }
637
638 kmemtrace_mark_free(KMEMTRACE_TYPE_CACHE, _RET_IP_, b);
612} 639}
613EXPORT_SYMBOL(kmem_cache_free); 640EXPORT_SYMBOL(kmem_cache_free);
614 641
diff --git a/mm/slub.c b/mm/slub.c
index 0280eee6cf37..816734ed8aa3 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -16,6 +16,7 @@
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/proc_fs.h> 17#include <linux/proc_fs.h>
18#include <linux/seq_file.h> 18#include <linux/seq_file.h>
19#include <trace/kmemtrace.h>
19#include <linux/cpu.h> 20#include <linux/cpu.h>
20#include <linux/cpuset.h> 21#include <linux/cpuset.h>
21#include <linux/mempolicy.h> 22#include <linux/mempolicy.h>
@@ -1596,6 +1597,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1596 unsigned long flags; 1597 unsigned long flags;
1597 unsigned int objsize; 1598 unsigned int objsize;
1598 1599
1600 lockdep_trace_alloc(gfpflags);
1599 might_sleep_if(gfpflags & __GFP_WAIT); 1601 might_sleep_if(gfpflags & __GFP_WAIT);
1600 1602
1601 if (should_failslab(s->objsize, gfpflags)) 1603 if (should_failslab(s->objsize, gfpflags))
@@ -1623,18 +1625,46 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1623 1625
1624void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) 1626void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
1625{ 1627{
1626 return slab_alloc(s, gfpflags, -1, _RET_IP_); 1628 void *ret = slab_alloc(s, gfpflags, -1, _RET_IP_);
1629
1630 kmemtrace_mark_alloc(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret,
1631 s->objsize, s->size, gfpflags);
1632
1633 return ret;
1627} 1634}
1628EXPORT_SYMBOL(kmem_cache_alloc); 1635EXPORT_SYMBOL(kmem_cache_alloc);
1629 1636
1637#ifdef CONFIG_KMEMTRACE
1638void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags)
1639{
1640 return slab_alloc(s, gfpflags, -1, _RET_IP_);
1641}
1642EXPORT_SYMBOL(kmem_cache_alloc_notrace);
1643#endif
1644
1630#ifdef CONFIG_NUMA 1645#ifdef CONFIG_NUMA
1631void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) 1646void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
1632{ 1647{
1633 return slab_alloc(s, gfpflags, node, _RET_IP_); 1648 void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
1649
1650 kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret,
1651 s->objsize, s->size, gfpflags, node);
1652
1653 return ret;
1634} 1654}
1635EXPORT_SYMBOL(kmem_cache_alloc_node); 1655EXPORT_SYMBOL(kmem_cache_alloc_node);
1636#endif 1656#endif
1637 1657
1658#ifdef CONFIG_KMEMTRACE
1659void *kmem_cache_alloc_node_notrace(struct kmem_cache *s,
1660 gfp_t gfpflags,
1661 int node)
1662{
1663 return slab_alloc(s, gfpflags, node, _RET_IP_);
1664}
1665EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
1666#endif
1667
1638/* 1668/*
1639 * Slow patch handling. This may still be called frequently since objects 1669 * Slow patch handling. This may still be called frequently since objects
1640 * have a longer lifetime than the cpu slabs in most processing loads. 1670 * have a longer lifetime than the cpu slabs in most processing loads.
@@ -1742,6 +1772,8 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
1742 page = virt_to_head_page(x); 1772 page = virt_to_head_page(x);
1743 1773
1744 slab_free(s, page, x, _RET_IP_); 1774 slab_free(s, page, x, _RET_IP_);
1775
1776 kmemtrace_mark_free(KMEMTRACE_TYPE_CACHE, _RET_IP_, x);
1745} 1777}
1746EXPORT_SYMBOL(kmem_cache_free); 1778EXPORT_SYMBOL(kmem_cache_free);
1747 1779
@@ -2475,7 +2507,7 @@ EXPORT_SYMBOL(kmem_cache_destroy);
2475 * Kmalloc subsystem 2507 * Kmalloc subsystem
2476 *******************************************************************/ 2508 *******************************************************************/
2477 2509
2478struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned; 2510struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned;
2479EXPORT_SYMBOL(kmalloc_caches); 2511EXPORT_SYMBOL(kmalloc_caches);
2480 2512
2481static int __init setup_slub_min_order(char *str) 2513static int __init setup_slub_min_order(char *str)
@@ -2537,7 +2569,7 @@ panic:
2537} 2569}
2538 2570
2539#ifdef CONFIG_ZONE_DMA 2571#ifdef CONFIG_ZONE_DMA
2540static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1]; 2572static struct kmem_cache *kmalloc_caches_dma[SLUB_PAGE_SHIFT];
2541 2573
2542static void sysfs_add_func(struct work_struct *w) 2574static void sysfs_add_func(struct work_struct *w)
2543{ 2575{
@@ -2657,8 +2689,9 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2657void *__kmalloc(size_t size, gfp_t flags) 2689void *__kmalloc(size_t size, gfp_t flags)
2658{ 2690{
2659 struct kmem_cache *s; 2691 struct kmem_cache *s;
2692 void *ret;
2660 2693
2661 if (unlikely(size > PAGE_SIZE)) 2694 if (unlikely(size > SLUB_MAX_SIZE))
2662 return kmalloc_large(size, flags); 2695 return kmalloc_large(size, flags);
2663 2696
2664 s = get_slab(size, flags); 2697 s = get_slab(size, flags);
@@ -2666,7 +2699,12 @@ void *__kmalloc(size_t size, gfp_t flags)
2666 if (unlikely(ZERO_OR_NULL_PTR(s))) 2699 if (unlikely(ZERO_OR_NULL_PTR(s)))
2667 return s; 2700 return s;
2668 2701
2669 return slab_alloc(s, flags, -1, _RET_IP_); 2702 ret = slab_alloc(s, flags, -1, _RET_IP_);
2703
2704 kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, ret,
2705 size, s->size, flags);
2706
2707 return ret;
2670} 2708}
2671EXPORT_SYMBOL(__kmalloc); 2709EXPORT_SYMBOL(__kmalloc);
2672 2710
@@ -2685,16 +2723,30 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
2685void *__kmalloc_node(size_t size, gfp_t flags, int node) 2723void *__kmalloc_node(size_t size, gfp_t flags, int node)
2686{ 2724{
2687 struct kmem_cache *s; 2725 struct kmem_cache *s;
2726 void *ret;
2727
2728 if (unlikely(size > SLUB_MAX_SIZE)) {
2729 ret = kmalloc_large_node(size, flags, node);
2688 2730
2689 if (unlikely(size > PAGE_SIZE)) 2731 kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC,
2690 return kmalloc_large_node(size, flags, node); 2732 _RET_IP_, ret,
2733 size, PAGE_SIZE << get_order(size),
2734 flags, node);
2735
2736 return ret;
2737 }
2691 2738
2692 s = get_slab(size, flags); 2739 s = get_slab(size, flags);
2693 2740
2694 if (unlikely(ZERO_OR_NULL_PTR(s))) 2741 if (unlikely(ZERO_OR_NULL_PTR(s)))
2695 return s; 2742 return s;
2696 2743
2697 return slab_alloc(s, flags, node, _RET_IP_); 2744 ret = slab_alloc(s, flags, node, _RET_IP_);
2745
2746 kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, ret,
2747 size, s->size, flags, node);
2748
2749 return ret;
2698} 2750}
2699EXPORT_SYMBOL(__kmalloc_node); 2751EXPORT_SYMBOL(__kmalloc_node);
2700#endif 2752#endif
@@ -2753,6 +2805,8 @@ void kfree(const void *x)
2753 return; 2805 return;
2754 } 2806 }
2755 slab_free(page->slab, page, object, _RET_IP_); 2807 slab_free(page->slab, page, object, _RET_IP_);
2808
2809 kmemtrace_mark_free(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, x);
2756} 2810}
2757EXPORT_SYMBOL(kfree); 2811EXPORT_SYMBOL(kfree);
2758 2812
@@ -2986,7 +3040,7 @@ void __init kmem_cache_init(void)
2986 caches++; 3040 caches++;
2987 } 3041 }
2988 3042
2989 for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) { 3043 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
2990 create_kmalloc_cache(&kmalloc_caches[i], 3044 create_kmalloc_cache(&kmalloc_caches[i],
2991 "kmalloc", 1 << i, GFP_KERNEL); 3045 "kmalloc", 1 << i, GFP_KERNEL);
2992 caches++; 3046 caches++;
@@ -3023,7 +3077,7 @@ void __init kmem_cache_init(void)
3023 slab_state = UP; 3077 slab_state = UP;
3024 3078
3025 /* Provide the correct kmalloc names now that the caches are up */ 3079 /* Provide the correct kmalloc names now that the caches are up */
3026 for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) 3080 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++)
3027 kmalloc_caches[i]. name = 3081 kmalloc_caches[i]. name =
3028 kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); 3082 kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
3029 3083
@@ -3222,8 +3276,9 @@ static struct notifier_block __cpuinitdata slab_notifier = {
3222void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) 3276void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
3223{ 3277{
3224 struct kmem_cache *s; 3278 struct kmem_cache *s;
3279 void *ret;
3225 3280
3226 if (unlikely(size > PAGE_SIZE)) 3281 if (unlikely(size > SLUB_MAX_SIZE))
3227 return kmalloc_large(size, gfpflags); 3282 return kmalloc_large(size, gfpflags);
3228 3283
3229 s = get_slab(size, gfpflags); 3284 s = get_slab(size, gfpflags);
@@ -3231,15 +3286,22 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
3231 if (unlikely(ZERO_OR_NULL_PTR(s))) 3286 if (unlikely(ZERO_OR_NULL_PTR(s)))
3232 return s; 3287 return s;
3233 3288
3234 return slab_alloc(s, gfpflags, -1, caller); 3289 ret = slab_alloc(s, gfpflags, -1, caller);
3290
3291 /* Honor the call site pointer we recieved. */
3292 kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, caller, ret, size,
3293 s->size, gfpflags);
3294
3295 return ret;
3235} 3296}
3236 3297
3237void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, 3298void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3238 int node, unsigned long caller) 3299 int node, unsigned long caller)
3239{ 3300{
3240 struct kmem_cache *s; 3301 struct kmem_cache *s;
3302 void *ret;
3241 3303
3242 if (unlikely(size > PAGE_SIZE)) 3304 if (unlikely(size > SLUB_MAX_SIZE))
3243 return kmalloc_large_node(size, gfpflags, node); 3305 return kmalloc_large_node(size, gfpflags, node);
3244 3306
3245 s = get_slab(size, gfpflags); 3307 s = get_slab(size, gfpflags);
@@ -3247,7 +3309,13 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3247 if (unlikely(ZERO_OR_NULL_PTR(s))) 3309 if (unlikely(ZERO_OR_NULL_PTR(s)))
3248 return s; 3310 return s;
3249 3311
3250 return slab_alloc(s, gfpflags, node, caller); 3312 ret = slab_alloc(s, gfpflags, node, caller);
3313
3314 /* Honor the call site pointer we recieved. */
3315 kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, caller, ret,
3316 size, s->size, gfpflags, node);
3317
3318 return ret;
3251} 3319}
3252 3320
3253#ifdef CONFIG_SLUB_DEBUG 3321#ifdef CONFIG_SLUB_DEBUG
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 520a75980269..af58324c361a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -24,6 +24,7 @@
24#include <linux/radix-tree.h> 24#include <linux/radix-tree.h>
25#include <linux/rcupdate.h> 25#include <linux/rcupdate.h>
26#include <linux/bootmem.h> 26#include <linux/bootmem.h>
27#include <linux/pfn.h>
27 28
28#include <asm/atomic.h> 29#include <asm/atomic.h>
29#include <asm/uaccess.h> 30#include <asm/uaccess.h>
@@ -152,8 +153,8 @@ static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
152 * 153 *
153 * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] 154 * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
154 */ 155 */
155static int vmap_page_range(unsigned long start, unsigned long end, 156static int vmap_page_range_noflush(unsigned long start, unsigned long end,
156 pgprot_t prot, struct page **pages) 157 pgprot_t prot, struct page **pages)
157{ 158{
158 pgd_t *pgd; 159 pgd_t *pgd;
159 unsigned long next; 160 unsigned long next;
@@ -169,13 +170,22 @@ static int vmap_page_range(unsigned long start, unsigned long end,
169 if (err) 170 if (err)
170 break; 171 break;
171 } while (pgd++, addr = next, addr != end); 172 } while (pgd++, addr = next, addr != end);
172 flush_cache_vmap(start, end);
173 173
174 if (unlikely(err)) 174 if (unlikely(err))
175 return err; 175 return err;
176 return nr; 176 return nr;
177} 177}
178 178
179static int vmap_page_range(unsigned long start, unsigned long end,
180 pgprot_t prot, struct page **pages)
181{
182 int ret;
183
184 ret = vmap_page_range_noflush(start, end, prot, pages);
185 flush_cache_vmap(start, end);
186 return ret;
187}
188
179static inline int is_vmalloc_or_module_addr(const void *x) 189static inline int is_vmalloc_or_module_addr(const void *x)
180{ 190{
181 /* 191 /*
@@ -990,6 +1000,32 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro
990} 1000}
991EXPORT_SYMBOL(vm_map_ram); 1001EXPORT_SYMBOL(vm_map_ram);
992 1002
1003/**
1004 * vm_area_register_early - register vmap area early during boot
1005 * @vm: vm_struct to register
1006 * @align: requested alignment
1007 *
1008 * This function is used to register kernel vm area before
1009 * vmalloc_init() is called. @vm->size and @vm->flags should contain
1010 * proper values on entry and other fields should be zero. On return,
1011 * vm->addr contains the allocated address.
1012 *
1013 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
1014 */
1015void __init vm_area_register_early(struct vm_struct *vm, size_t align)
1016{
1017 static size_t vm_init_off __initdata;
1018 unsigned long addr;
1019
1020 addr = ALIGN(VMALLOC_START + vm_init_off, align);
1021 vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START;
1022
1023 vm->addr = (void *)addr;
1024
1025 vm->next = vmlist;
1026 vmlist = vm;
1027}
1028
993void __init vmalloc_init(void) 1029void __init vmalloc_init(void)
994{ 1030{
995 struct vmap_area *va; 1031 struct vmap_area *va;
@@ -1017,6 +1053,58 @@ void __init vmalloc_init(void)
1017 vmap_initialized = true; 1053 vmap_initialized = true;
1018} 1054}
1019 1055
1056/**
1057 * map_kernel_range_noflush - map kernel VM area with the specified pages
1058 * @addr: start of the VM area to map
1059 * @size: size of the VM area to map
1060 * @prot: page protection flags to use
1061 * @pages: pages to map
1062 *
1063 * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size
1064 * specify should have been allocated using get_vm_area() and its
1065 * friends.
1066 *
1067 * NOTE:
1068 * This function does NOT do any cache flushing. The caller is
1069 * responsible for calling flush_cache_vmap() on to-be-mapped areas
1070 * before calling this function.
1071 *
1072 * RETURNS:
1073 * The number of pages mapped on success, -errno on failure.
1074 */
1075int map_kernel_range_noflush(unsigned long addr, unsigned long size,
1076 pgprot_t prot, struct page **pages)
1077{
1078 return vmap_page_range_noflush(addr, addr + size, prot, pages);
1079}
1080
1081/**
1082 * unmap_kernel_range_noflush - unmap kernel VM area
1083 * @addr: start of the VM area to unmap
1084 * @size: size of the VM area to unmap
1085 *
1086 * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size
1087 * specify should have been allocated using get_vm_area() and its
1088 * friends.
1089 *
1090 * NOTE:
1091 * This function does NOT do any cache flushing. The caller is
1092 * responsible for calling flush_cache_vunmap() on to-be-mapped areas
1093 * before calling this function and flush_tlb_kernel_range() after.
1094 */
1095void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
1096{
1097 vunmap_page_range(addr, addr + size);
1098}
1099
1100/**
1101 * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
1102 * @addr: start of the VM area to unmap
1103 * @size: size of the VM area to unmap
1104 *
1105 * Similar to unmap_kernel_range_noflush() but flushes vcache before
1106 * the unmapping and tlb after.
1107 */
1020void unmap_kernel_range(unsigned long addr, unsigned long size) 1108void unmap_kernel_range(unsigned long addr, unsigned long size)
1021{ 1109{
1022 unsigned long end = addr + size; 1110 unsigned long end = addr + size;
@@ -1267,6 +1355,7 @@ EXPORT_SYMBOL(vfree);
1267void vunmap(const void *addr) 1355void vunmap(const void *addr)
1268{ 1356{
1269 BUG_ON(in_interrupt()); 1357 BUG_ON(in_interrupt());
1358 might_sleep();
1270 __vunmap(addr, 0); 1359 __vunmap(addr, 0);
1271} 1360}
1272EXPORT_SYMBOL(vunmap); 1361EXPORT_SYMBOL(vunmap);
@@ -1286,6 +1375,8 @@ void *vmap(struct page **pages, unsigned int count,
1286{ 1375{
1287 struct vm_struct *area; 1376 struct vm_struct *area;
1288 1377
1378 might_sleep();
1379
1289 if (count > num_physpages) 1380 if (count > num_physpages)
1290 return NULL; 1381 return NULL;
1291 1382
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 56ddf41149eb..479e46719394 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1965,6 +1965,8 @@ static int kswapd(void *p)
1965 }; 1965 };
1966 node_to_cpumask_ptr(cpumask, pgdat->node_id); 1966 node_to_cpumask_ptr(cpumask, pgdat->node_id);
1967 1967
1968 lockdep_set_current_reclaim_state(GFP_KERNEL);
1969
1968 if (!cpumask_empty(cpumask)) 1970 if (!cpumask_empty(cpumask))
1969 set_cpus_allowed_ptr(tsk, cpumask); 1971 set_cpus_allowed_ptr(tsk, cpumask);
1970 current->reclaim_state = &reclaim_state; 1972 current->reclaim_state = &reclaim_state;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 743f5542d65a..3a3dad801354 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1375,10 +1375,10 @@ EXPORT_SYMBOL_GPL(snmp_fold_field);
1375int snmp_mib_init(void *ptr[2], size_t mibsize) 1375int snmp_mib_init(void *ptr[2], size_t mibsize)
1376{ 1376{
1377 BUG_ON(ptr == NULL); 1377 BUG_ON(ptr == NULL);
1378 ptr[0] = __alloc_percpu(mibsize); 1378 ptr[0] = __alloc_percpu(mibsize, __alignof__(unsigned long long));
1379 if (!ptr[0]) 1379 if (!ptr[0])
1380 goto err0; 1380 goto err0;
1381 ptr[1] = __alloc_percpu(mibsize); 1381 ptr[1] = __alloc_percpu(mibsize, __alignof__(unsigned long long));
1382 if (!ptr[1]) 1382 if (!ptr[1])
1383 goto err1; 1383 goto err1;
1384 return 0; 1384 return 0;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 97f71153584f..bf895401218f 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -3376,7 +3376,7 @@ int __init ip_rt_init(void)
3376 int rc = 0; 3376 int rc = 0;
3377 3377
3378#ifdef CONFIG_NET_CLS_ROUTE 3378#ifdef CONFIG_NET_CLS_ROUTE
3379 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct)); 3379 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3380 if (!ip_rt_acct) 3380 if (!ip_rt_acct)
3381 panic("IP: failed to allocate ip_rt_acct\n"); 3381 panic("IP: failed to allocate ip_rt_acct\n");
3382#endif 3382#endif
diff --git a/samples/tracepoints/tp-samples-trace.h b/samples/tracepoints/tp-samples-trace.h
index 01724e04c556..dffdc49878af 100644
--- a/samples/tracepoints/tp-samples-trace.h
+++ b/samples/tracepoints/tp-samples-trace.h
@@ -5,9 +5,9 @@
5#include <linux/tracepoint.h> 5#include <linux/tracepoint.h>
6 6
7DECLARE_TRACE(subsys_event, 7DECLARE_TRACE(subsys_event,
8 TPPROTO(struct inode *inode, struct file *file), 8 TP_PROTO(struct inode *inode, struct file *file),
9 TPARGS(inode, file)); 9 TP_ARGS(inode, file));
10DECLARE_TRACE(subsys_eventb, 10DECLARE_TRACE(subsys_eventb,
11 TPPROTO(void), 11 TP_PROTO(void),
12 TPARGS()); 12 TP_ARGS());
13#endif 13#endif
diff --git a/scripts/Makefile.build b/scripts/Makefile.build
index c7de8b39fcf1..39a9642927d3 100644
--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -112,13 +112,13 @@ endif
112# --------------------------------------------------------------------------- 112# ---------------------------------------------------------------------------
113 113
114# Default is built-in, unless we know otherwise 114# Default is built-in, unless we know otherwise
115modkern_cflags := $(CFLAGS_KERNEL) 115modkern_cflags = $(if $(part-of-module), $(CFLAGS_MODULE), $(CFLAGS_KERNEL))
116quiet_modtag := $(empty) $(empty) 116quiet_modtag := $(empty) $(empty)
117 117
118$(real-objs-m) : modkern_cflags := $(CFLAGS_MODULE) 118$(real-objs-m) : part-of-module := y
119$(real-objs-m:.o=.i) : modkern_cflags := $(CFLAGS_MODULE) 119$(real-objs-m:.o=.i) : part-of-module := y
120$(real-objs-m:.o=.s) : modkern_cflags := $(CFLAGS_MODULE) 120$(real-objs-m:.o=.s) : part-of-module := y
121$(real-objs-m:.o=.lst): modkern_cflags := $(CFLAGS_MODULE) 121$(real-objs-m:.o=.lst): part-of-module := y
122 122
123$(real-objs-m) : quiet_modtag := [M] 123$(real-objs-m) : quiet_modtag := [M]
124$(real-objs-m:.o=.i) : quiet_modtag := [M] 124$(real-objs-m:.o=.i) : quiet_modtag := [M]
@@ -205,7 +205,8 @@ endif
205ifdef CONFIG_FTRACE_MCOUNT_RECORD 205ifdef CONFIG_FTRACE_MCOUNT_RECORD
206cmd_record_mcount = perl $(srctree)/scripts/recordmcount.pl "$(ARCH)" \ 206cmd_record_mcount = perl $(srctree)/scripts/recordmcount.pl "$(ARCH)" \
207 "$(if $(CONFIG_64BIT),64,32)" \ 207 "$(if $(CONFIG_64BIT),64,32)" \
208 "$(OBJDUMP)" "$(OBJCOPY)" "$(CC)" "$(LD)" "$(NM)" "$(RM)" "$(MV)" "$(@)"; 208 "$(OBJDUMP)" "$(OBJCOPY)" "$(CC)" "$(LD)" "$(NM)" "$(RM)" "$(MV)" \
209 "$(if $(part-of-module),1,0)" "$(@)";
209endif 210endif
210 211
211define rule_cc_o_c 212define rule_cc_o_c
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index e06365775bdf..3b949a354470 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -186,3 +186,17 @@ quiet_cmd_gzip = GZIP $@
186cmd_gzip = gzip -f -9 < $< > $@ 186cmd_gzip = gzip -f -9 < $< > $@
187 187
188 188
189# Bzip2
190# ---------------------------------------------------------------------------
191
192# Bzip2 does not include size in file... so we have to fake that
193size_append=$(CONFIG_SHELL) $(srctree)/scripts/bin_size
194
195quiet_cmd_bzip2 = BZIP2 $@
196cmd_bzip2 = (bzip2 -9 < $< && $(size_append) $<) > $@ || (rm -f $@ ; false)
197
198# Lzma
199# ---------------------------------------------------------------------------
200
201quiet_cmd_lzma = LZMA $@
202cmd_lzma = (lzma -9 -c $< && $(size_append) $<) >$@ || (rm -f $@ ; false)
diff --git a/scripts/bin_size b/scripts/bin_size
new file mode 100644
index 000000000000..43e1b360cee6
--- /dev/null
+++ b/scripts/bin_size
@@ -0,0 +1,10 @@
1#!/bin/sh
2
3if [ $# = 0 ] ; then
4 echo Usage: $0 file
5fi
6
7size_dec=`stat -c "%s" $1`
8size_hex_echo_string=`printf "%08x" $size_dec |
9 sed 's/\(..\)\(..\)\(..\)\(..\)/\\\\x\4\\\\x\3\\\\x\2\\\\x\1/g'`
10/bin/echo -ne $size_hex_echo_string
diff --git a/scripts/gcc-x86_32-has-stack-protector.sh b/scripts/gcc-x86_32-has-stack-protector.sh
new file mode 100644
index 000000000000..29493dc4528d
--- /dev/null
+++ b/scripts/gcc-x86_32-has-stack-protector.sh
@@ -0,0 +1,8 @@
1#!/bin/sh
2
3echo "int foo(void) { char X[200]; return 3; }" | $* -S -xc -c -O0 -fstack-protector - -o - 2> /dev/null | grep -q "%gs"
4if [ "$?" -eq "0" ] ; then
5 echo y
6else
7 echo n
8fi
diff --git a/scripts/gcc-x86_64-has-stack-protector.sh b/scripts/gcc-x86_64-has-stack-protector.sh
index 325c0a1b03b6..afaec618b395 100644
--- a/scripts/gcc-x86_64-has-stack-protector.sh
+++ b/scripts/gcc-x86_64-has-stack-protector.sh
@@ -1,6 +1,8 @@
1#!/bin/sh 1#!/bin/sh
2 2
3echo "int foo(void) { char X[200]; return 3; }" | $1 -S -xc -c -O0 -mcmodel=kernel -fstack-protector - -o - 2> /dev/null | grep -q "%gs" 3echo "int foo(void) { char X[200]; return 3; }" | $* -S -xc -c -O0 -mcmodel=kernel -fstack-protector - -o - 2> /dev/null | grep -q "%gs"
4if [ "$?" -eq "0" ] ; then 4if [ "$?" -eq "0" ] ; then
5 echo $2 5 echo y
6else
7 echo n
6fi 8fi
diff --git a/scripts/gen_initramfs_list.sh b/scripts/gen_initramfs_list.sh
index 5f3415f28736..3eea8f15131b 100644
--- a/scripts/gen_initramfs_list.sh
+++ b/scripts/gen_initramfs_list.sh
@@ -5,7 +5,7 @@
5# Released under the terms of the GNU GPL 5# Released under the terms of the GNU GPL
6# 6#
7# Generate a cpio packed initramfs. It uses gen_init_cpio to generate 7# Generate a cpio packed initramfs. It uses gen_init_cpio to generate
8# the cpio archive, and gzip to pack it. 8# the cpio archive, and then compresses it.
9# The script may also be used to generate the inputfile used for gen_init_cpio 9# The script may also be used to generate the inputfile used for gen_init_cpio
10# This script assumes that gen_init_cpio is located in usr/ directory 10# This script assumes that gen_init_cpio is located in usr/ directory
11 11
@@ -16,8 +16,8 @@ usage() {
16cat << EOF 16cat << EOF
17Usage: 17Usage:
18$0 [-o <file>] [-u <uid>] [-g <gid>] {-d | <cpio_source>} ... 18$0 [-o <file>] [-u <uid>] [-g <gid>] {-d | <cpio_source>} ...
19 -o <file> Create gzipped initramfs file named <file> using 19 -o <file> Create compressed initramfs file named <file> using
20 gen_init_cpio and gzip 20 gen_init_cpio and compressor depending on the extension
21 -u <uid> User ID to map to user ID 0 (root). 21 -u <uid> User ID to map to user ID 0 (root).
22 <uid> is only meaningful if <cpio_source> is a 22 <uid> is only meaningful if <cpio_source> is a
23 directory. "squash" forces all files to uid 0. 23 directory. "squash" forces all files to uid 0.
@@ -225,6 +225,7 @@ cpio_list=
225output="/dev/stdout" 225output="/dev/stdout"
226output_file="" 226output_file=""
227is_cpio_compressed= 227is_cpio_compressed=
228compr="gzip -9 -f"
228 229
229arg="$1" 230arg="$1"
230case "$arg" in 231case "$arg" in
@@ -233,11 +234,15 @@ case "$arg" in
233 echo "deps_initramfs := \\" 234 echo "deps_initramfs := \\"
234 shift 235 shift
235 ;; 236 ;;
236 "-o") # generate gzipped cpio image named $1 237 "-o") # generate compressed cpio image named $1
237 shift 238 shift
238 output_file="$1" 239 output_file="$1"
239 cpio_list="$(mktemp ${TMPDIR:-/tmp}/cpiolist.XXXXXX)" 240 cpio_list="$(mktemp ${TMPDIR:-/tmp}/cpiolist.XXXXXX)"
240 output=${cpio_list} 241 output=${cpio_list}
242 echo "$output_file" | grep -q "\.gz$" && compr="gzip -9 -f"
243 echo "$output_file" | grep -q "\.bz2$" && compr="bzip2 -9 -f"
244 echo "$output_file" | grep -q "\.lzma$" && compr="lzma -9 -f"
245 echo "$output_file" | grep -q "\.cpio$" && compr="cat"
241 shift 246 shift
242 ;; 247 ;;
243esac 248esac
@@ -274,7 +279,7 @@ while [ $# -gt 0 ]; do
274 esac 279 esac
275done 280done
276 281
277# If output_file is set we will generate cpio archive and gzip it 282# If output_file is set we will generate cpio archive and compress it
278# we are carefull to delete tmp files 283# we are carefull to delete tmp files
279if [ ! -z ${output_file} ]; then 284if [ ! -z ${output_file} ]; then
280 if [ -z ${cpio_file} ]; then 285 if [ -z ${cpio_file} ]; then
@@ -287,7 +292,8 @@ if [ ! -z ${output_file} ]; then
287 if [ "${is_cpio_compressed}" = "compressed" ]; then 292 if [ "${is_cpio_compressed}" = "compressed" ]; then
288 cat ${cpio_tfile} > ${output_file} 293 cat ${cpio_tfile} > ${output_file}
289 else 294 else
290 cat ${cpio_tfile} | gzip -f -9 - > ${output_file} 295 (cat ${cpio_tfile} | ${compr} - > ${output_file}) \
296 || (rm -f ${output_file} ; false)
291 fi 297 fi
292 [ -z ${cpio_file} ] && rm ${cpio_tfile} 298 [ -z ${cpio_file} ] && rm ${cpio_tfile}
293fi 299fi
diff --git a/scripts/headers_check.pl b/scripts/headers_check.pl
index db30fac3083e..56f90a480899 100644
--- a/scripts/headers_check.pl
+++ b/scripts/headers_check.pl
@@ -38,7 +38,7 @@ foreach my $file (@files) {
38 &check_asm_types(); 38 &check_asm_types();
39 &check_sizetypes(); 39 &check_sizetypes();
40 &check_prototypes(); 40 &check_prototypes();
41 &check_config(); 41 # Dropped for now. Too much noise &check_config();
42 } 42 }
43 close FH; 43 close FH;
44} 44}
diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c
index ad2434b26970..6654cbed965b 100644
--- a/scripts/kallsyms.c
+++ b/scripts/kallsyms.c
@@ -500,6 +500,51 @@ static void optimize_token_table(void)
500 optimize_result(); 500 optimize_result();
501} 501}
502 502
503/* guess for "linker script provide" symbol */
504static int may_be_linker_script_provide_symbol(const struct sym_entry *se)
505{
506 const char *symbol = (char *)se->sym + 1;
507 int len = se->len - 1;
508
509 if (len < 8)
510 return 0;
511
512 if (symbol[0] != '_' || symbol[1] != '_')
513 return 0;
514
515 /* __start_XXXXX */
516 if (!memcmp(symbol + 2, "start_", 6))
517 return 1;
518
519 /* __stop_XXXXX */
520 if (!memcmp(symbol + 2, "stop_", 5))
521 return 1;
522
523 /* __end_XXXXX */
524 if (!memcmp(symbol + 2, "end_", 4))
525 return 1;
526
527 /* __XXXXX_start */
528 if (!memcmp(symbol + len - 6, "_start", 6))
529 return 1;
530
531 /* __XXXXX_end */
532 if (!memcmp(symbol + len - 4, "_end", 4))
533 return 1;
534
535 return 0;
536}
537
538static int prefix_underscores_count(const char *str)
539{
540 const char *tail = str;
541
542 while (*tail != '_')
543 tail++;
544
545 return tail - str;
546}
547
503static int compare_symbols(const void *a, const void *b) 548static int compare_symbols(const void *a, const void *b)
504{ 549{
505 const struct sym_entry *sa; 550 const struct sym_entry *sa;
@@ -521,6 +566,18 @@ static int compare_symbols(const void *a, const void *b)
521 if (wa != wb) 566 if (wa != wb)
522 return wa - wb; 567 return wa - wb;
523 568
569 /* sort by "linker script provide" type */
570 wa = may_be_linker_script_provide_symbol(sa);
571 wb = may_be_linker_script_provide_symbol(sb);
572 if (wa != wb)
573 return wa - wb;
574
575 /* sort by the number of prefix underscores */
576 wa = prefix_underscores_count((const char *)sa->sym + 1);
577 wb = prefix_underscores_count((const char *)sb->sym + 1);
578 if (wa != wb)
579 return wa - wb;
580
524 /* sort by initial order, so that other symbols are left undisturbed */ 581 /* sort by initial order, so that other symbols are left undisturbed */
525 return sa->start_pos - sb->start_pos; 582 return sa->start_pos - sb->start_pos;
526} 583}
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index 88921611b22e..7e62303133dc 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -415,8 +415,9 @@ static int parse_elf(struct elf_info *info, const char *filename)
415 const char *secstrings 415 const char *secstrings
416 = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; 416 = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
417 const char *secname; 417 const char *secname;
418 int nobits = sechdrs[i].sh_type == SHT_NOBITS;
418 419
419 if (sechdrs[i].sh_offset > info->size) { 420 if (!nobits && sechdrs[i].sh_offset > info->size) {
420 fatal("%s is truncated. sechdrs[i].sh_offset=%lu > " 421 fatal("%s is truncated. sechdrs[i].sh_offset=%lu > "
421 "sizeof(*hrd)=%zu\n", filename, 422 "sizeof(*hrd)=%zu\n", filename,
422 (unsigned long)sechdrs[i].sh_offset, 423 (unsigned long)sechdrs[i].sh_offset,
@@ -425,6 +426,8 @@ static int parse_elf(struct elf_info *info, const char *filename)
425 } 426 }
426 secname = secstrings + sechdrs[i].sh_name; 427 secname = secstrings + sechdrs[i].sh_name;
427 if (strcmp(secname, ".modinfo") == 0) { 428 if (strcmp(secname, ".modinfo") == 0) {
429 if (nobits)
430 fatal("%s has NOBITS .modinfo\n", filename);
428 info->modinfo = (void *)hdr + sechdrs[i].sh_offset; 431 info->modinfo = (void *)hdr + sechdrs[i].sh_offset;
429 info->modinfo_len = sechdrs[i].sh_size; 432 info->modinfo_len = sechdrs[i].sh_size;
430 } else if (strcmp(secname, "__ksymtab") == 0) 433 } else if (strcmp(secname, "__ksymtab") == 0)
diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl
index fe831412bea9..409596eca124 100755
--- a/scripts/recordmcount.pl
+++ b/scripts/recordmcount.pl
@@ -100,14 +100,19 @@ $P =~ s@.*/@@g;
100 100
101my $V = '0.1'; 101my $V = '0.1';
102 102
103if ($#ARGV < 6) { 103if ($#ARGV < 7) {
104 print "usage: $P arch objdump objcopy cc ld nm rm mv inputfile\n"; 104 print "usage: $P arch bits objdump objcopy cc ld nm rm mv is_module inputfile\n";
105 print "version: $V\n"; 105 print "version: $V\n";
106 exit(1); 106 exit(1);
107} 107}
108 108
109my ($arch, $bits, $objdump, $objcopy, $cc, 109my ($arch, $bits, $objdump, $objcopy, $cc,
110 $ld, $nm, $rm, $mv, $inputfile) = @ARGV; 110 $ld, $nm, $rm, $mv, $is_module, $inputfile) = @ARGV;
111
112# This file refers to mcount and shouldn't be ftraced, so lets' ignore it
113if ($inputfile eq "kernel/trace/ftrace.o") {
114 exit(0);
115}
111 116
112# Acceptable sections to record. 117# Acceptable sections to record.
113my %text_sections = ( 118my %text_sections = (
@@ -201,6 +206,13 @@ if ($arch eq "x86_64") {
201 $alignment = 2; 206 $alignment = 2;
202 $section_type = '%progbits'; 207 $section_type = '%progbits';
203 208
209} elsif ($arch eq "ia64") {
210 $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s_mcount\$";
211 $type = "data8";
212
213 if ($is_module eq "0") {
214 $cc .= " -mconstant-gp";
215 }
204} else { 216} else {
205 die "Arch $arch is not supported with CONFIG_FTRACE_MCOUNT_RECORD"; 217 die "Arch $arch is not supported with CONFIG_FTRACE_MCOUNT_RECORD";
206} 218}
@@ -263,7 +275,6 @@ if (!$found_version) {
263 "\tDisabling local function references.\n"; 275 "\tDisabling local function references.\n";
264} 276}
265 277
266
267# 278#
268# Step 1: find all the local (static functions) and weak symbols. 279# Step 1: find all the local (static functions) and weak symbols.
269# 't' is local, 'w/W' is weak (we never use a weak function) 280# 't' is local, 'w/W' is weak (we never use a weak function)
@@ -331,13 +342,16 @@ sub update_funcs
331# 342#
332# Step 2: find the sections and mcount call sites 343# Step 2: find the sections and mcount call sites
333# 344#
334open(IN, "$objdump -dr $inputfile|") || die "error running $objdump"; 345open(IN, "$objdump -hdr $inputfile|") || die "error running $objdump";
335 346
336my $text; 347my $text;
337 348
349my $read_headers = 1;
350
338while (<IN>) { 351while (<IN>) {
339 # is it a section? 352 # is it a section?
340 if (/$section_regex/) { 353 if (/$section_regex/) {
354 $read_headers = 0;
341 355
342 # Only record text sections that we know are safe 356 # Only record text sections that we know are safe
343 if (defined($text_sections{$1})) { 357 if (defined($text_sections{$1})) {
@@ -371,6 +385,19 @@ while (<IN>) {
371 $ref_func = $text; 385 $ref_func = $text;
372 } 386 }
373 } 387 }
388 } elsif ($read_headers && /$mcount_section/) {
389 #
390 # Somehow the make process can execute this script on an
391 # object twice. If it does, we would duplicate the mcount
392 # section and it will cause the function tracer self test
393 # to fail. Check if the mcount section exists, and if it does,
394 # warn and exit.
395 #
396 print STDERR "ERROR: $mcount_section already in $inputfile\n" .
397 "\tThis may be an indication that your build is corrupted.\n" .
398 "\tDelete $inputfile and try again. If the same object file\n" .
399 "\tstill causes an issue, then disable CONFIG_DYNAMIC_FTRACE.\n";
400 exit(-1);
374 } 401 }
375 402
376 # is this a call site to mcount? If so, record it to print later 403 # is this a call site to mcount? If so, record it to print later
diff --git a/sound/drivers/Kconfig b/sound/drivers/Kconfig
index 0bcf14640fde..84714a65e5c8 100644
--- a/sound/drivers/Kconfig
+++ b/sound/drivers/Kconfig
@@ -33,7 +33,7 @@ if SND_DRIVERS
33 33
34config SND_PCSP 34config SND_PCSP
35 tristate "PC-Speaker support (READ HELP!)" 35 tristate "PC-Speaker support (READ HELP!)"
36 depends on PCSPKR_PLATFORM && X86_PC && HIGH_RES_TIMERS 36 depends on PCSPKR_PLATFORM && X86 && HIGH_RES_TIMERS
37 depends on INPUT 37 depends on INPUT
38 depends on EXPERIMENTAL 38 depends on EXPERIMENTAL
39 select SND_PCM 39 select SND_PCM
diff --git a/usr/Kconfig b/usr/Kconfig
index 86cecb59dd07..43a3a0fe8f29 100644
--- a/usr/Kconfig
+++ b/usr/Kconfig
@@ -44,3 +44,92 @@ config INITRAMFS_ROOT_GID
44 owned by group root in the initial ramdisk image. 44 owned by group root in the initial ramdisk image.
45 45
46 If you are not sure, leave it set to "0". 46 If you are not sure, leave it set to "0".
47
48config RD_GZIP
49 bool "Initial ramdisk compressed using gzip"
50 default y
51 depends on BLK_DEV_INITRD=y
52 select DECOMPRESS_GZIP
53 help
54 Support loading of a gzip encoded initial ramdisk or cpio buffer.
55 If unsure, say Y.
56
57config RD_BZIP2
58 bool "Initial ramdisk compressed using bzip2"
59 default n
60 depends on BLK_DEV_INITRD=y
61 select DECOMPRESS_BZIP2
62 help
63 Support loading of a bzip2 encoded initial ramdisk or cpio buffer
64 If unsure, say N.
65
66config RD_LZMA
67 bool "Initial ramdisk compressed using lzma"
68 default n
69 depends on BLK_DEV_INITRD=y
70 select DECOMPRESS_LZMA
71 help
72 Support loading of a lzma encoded initial ramdisk or cpio buffer
73 If unsure, say N.
74
75choice
76 prompt "Built-in initramfs compression mode"
77 help
78 This setting is only meaningful if the INITRAMFS_SOURCE is
79 set. It decides by which algorithm the INITRAMFS_SOURCE will
80 be compressed.
81 Several compression algorithms are available, which differ
82 in efficiency, compression and decompression speed.
83 Compression speed is only relevant when building a kernel.
84 Decompression speed is relevant at each boot.
85
86 If you have any problems with bzip2 or lzma compressed
87 initramfs, mail me (Alain Knaff) <alain@knaff.lu>.
88
89 High compression options are mostly useful for users who
90 are low on disk space (embedded systems), but for whom ram
91 size matters less.
92
93 If in doubt, select 'gzip'
94
95config INITRAMFS_COMPRESSION_NONE
96 bool "None"
97 help
98 Do not compress the built-in initramfs at all. This may
99 sound wasteful in space, but, you should be aware that the
100 built-in initramfs will be compressed at a later stage
101 anyways along with the rest of the kernel, on those
102 architectures that support this.
103 However, not compressing the initramfs may lead to slightly
104 higher memory consumption during a short time at boot, while
105 both the cpio image and the unpacked filesystem image will
106 be present in memory simultaneously
107
108config INITRAMFS_COMPRESSION_GZIP
109 bool "Gzip"
110 depends on RD_GZIP
111 help
112 The old and tried gzip compression. Its compression ratio is
113 the poorest among the 3 choices; however its speed (both
114 compression and decompression) is the fastest.
115
116config INITRAMFS_COMPRESSION_BZIP2
117 bool "Bzip2"
118 depends on RD_BZIP2
119 help
120 Its compression ratio and speed is intermediate.
121 Decompression speed is slowest among the three. The initramfs
122 size is about 10% smaller with bzip2, in comparison to gzip.
123 Bzip2 uses a large amount of memory. For modern kernels you
124 will need at least 8MB RAM or more for booting.
125
126config INITRAMFS_COMPRESSION_LZMA
127 bool "LZMA"
128 depends on RD_LZMA
129 help
130 The most recent compression algorithm.
131 Its ratio is best, decompression speed is between the other
132 two. Compression is slowest. The initramfs size is about 33%
133 smaller with LZMA in comparison to gzip.
134
135endchoice
diff --git a/usr/Makefile b/usr/Makefile
index 201f27f8cbaf..b84894b3929d 100644
--- a/usr/Makefile
+++ b/usr/Makefile
@@ -6,13 +6,25 @@ klibcdirs:;
6PHONY += klibcdirs 6PHONY += klibcdirs
7 7
8 8
9# No compression
10suffix_$(CONFIG_INITRAMFS_COMPRESSION_NONE) =
11
12# Gzip, but no bzip2
13suffix_$(CONFIG_INITRAMFS_COMPRESSION_GZIP) = .gz
14
15# Bzip2
16suffix_$(CONFIG_INITRAMFS_COMPRESSION_BZIP2) = .bz2
17
18# Lzma
19suffix_$(CONFIG_INITRAMFS_COMPRESSION_LZMA) = .lzma
20
9# Generate builtin.o based on initramfs_data.o 21# Generate builtin.o based on initramfs_data.o
10obj-$(CONFIG_BLK_DEV_INITRD) := initramfs_data.o 22obj-$(CONFIG_BLK_DEV_INITRD) := initramfs_data$(suffix_y).o
11 23
12# initramfs_data.o contains the initramfs_data.cpio.gz image. 24# initramfs_data.o contains the compressed initramfs_data.cpio image.
13# The image is included using .incbin, a dependency which is not 25# The image is included using .incbin, a dependency which is not
14# tracked automatically. 26# tracked automatically.
15$(obj)/initramfs_data.o: $(obj)/initramfs_data.cpio.gz FORCE 27$(obj)/initramfs_data$(suffix_y).o: $(obj)/initramfs_data.cpio$(suffix_y) FORCE
16 28
17##### 29#####
18# Generate the initramfs cpio archive 30# Generate the initramfs cpio archive
@@ -25,28 +37,28 @@ ramfs-args := \
25 $(if $(CONFIG_INITRAMFS_ROOT_UID), -u $(CONFIG_INITRAMFS_ROOT_UID)) \ 37 $(if $(CONFIG_INITRAMFS_ROOT_UID), -u $(CONFIG_INITRAMFS_ROOT_UID)) \
26 $(if $(CONFIG_INITRAMFS_ROOT_GID), -g $(CONFIG_INITRAMFS_ROOT_GID)) 38 $(if $(CONFIG_INITRAMFS_ROOT_GID), -g $(CONFIG_INITRAMFS_ROOT_GID))
27 39
28# .initramfs_data.cpio.gz.d is used to identify all files included 40# .initramfs_data.cpio.d is used to identify all files included
29# in initramfs and to detect if any files are added/removed. 41# in initramfs and to detect if any files are added/removed.
30# Removed files are identified by directory timestamp being updated 42# Removed files are identified by directory timestamp being updated
31# The dependency list is generated by gen_initramfs.sh -l 43# The dependency list is generated by gen_initramfs.sh -l
32ifneq ($(wildcard $(obj)/.initramfs_data.cpio.gz.d),) 44ifneq ($(wildcard $(obj)/.initramfs_data.cpio.d),)
33 include $(obj)/.initramfs_data.cpio.gz.d 45 include $(obj)/.initramfs_data.cpio.d
34endif 46endif
35 47
36quiet_cmd_initfs = GEN $@ 48quiet_cmd_initfs = GEN $@
37 cmd_initfs = $(initramfs) -o $@ $(ramfs-args) $(ramfs-input) 49 cmd_initfs = $(initramfs) -o $@ $(ramfs-args) $(ramfs-input)
38 50
39targets := initramfs_data.cpio.gz 51targets := initramfs_data.cpio.gz initramfs_data.cpio.bz2 initramfs_data.cpio.lzma initramfs_data.cpio
40# do not try to update files included in initramfs 52# do not try to update files included in initramfs
41$(deps_initramfs): ; 53$(deps_initramfs): ;
42 54
43$(deps_initramfs): klibcdirs 55$(deps_initramfs): klibcdirs
44# We rebuild initramfs_data.cpio.gz if: 56# We rebuild initramfs_data.cpio if:
45# 1) Any included file is newer then initramfs_data.cpio.gz 57# 1) Any included file is newer then initramfs_data.cpio
46# 2) There are changes in which files are included (added or deleted) 58# 2) There are changes in which files are included (added or deleted)
47# 3) If gen_init_cpio are newer than initramfs_data.cpio.gz 59# 3) If gen_init_cpio are newer than initramfs_data.cpio
48# 4) arguments to gen_initramfs.sh changes 60# 4) arguments to gen_initramfs.sh changes
49$(obj)/initramfs_data.cpio.gz: $(obj)/gen_init_cpio $(deps_initramfs) klibcdirs 61$(obj)/initramfs_data.cpio$(suffix_y): $(obj)/gen_init_cpio $(deps_initramfs) klibcdirs
50 $(Q)$(initramfs) -l $(ramfs-input) > $(obj)/.initramfs_data.cpio.gz.d 62 $(Q)$(initramfs) -l $(ramfs-input) > $(obj)/.initramfs_data.cpio.d
51 $(call if_changed,initfs) 63 $(call if_changed,initfs)
52 64
diff --git a/usr/initramfs_data.S b/usr/initramfs_data.S
index c2e1ad424f4a..7c6973d8d829 100644
--- a/usr/initramfs_data.S
+++ b/usr/initramfs_data.S
@@ -26,5 +26,5 @@ SECTIONS
26*/ 26*/
27 27
28.section .init.ramfs,"a" 28.section .init.ramfs,"a"
29.incbin "usr/initramfs_data.cpio.gz" 29.incbin "usr/initramfs_data.cpio"
30 30
diff --git a/usr/initramfs_data.bz2.S b/usr/initramfs_data.bz2.S
new file mode 100644
index 000000000000..bc54d090365c
--- /dev/null
+++ b/usr/initramfs_data.bz2.S
@@ -0,0 +1,29 @@
1/*
2 initramfs_data includes the compressed binary that is the
3 filesystem used for early user space.
4 Note: Older versions of "as" (prior to binutils 2.11.90.0.23
5 released on 2001-07-14) dit not support .incbin.
6 If you are forced to use older binutils than that then the
7 following trick can be applied to create the resulting binary:
8
9
10 ld -m elf_i386 --format binary --oformat elf32-i386 -r \
11 -T initramfs_data.scr initramfs_data.cpio.gz -o initramfs_data.o
12 ld -m elf_i386 -r -o built-in.o initramfs_data.o
13
14 initramfs_data.scr looks like this:
15SECTIONS
16{
17 .init.ramfs : { *(.data) }
18}
19
20 The above example is for i386 - the parameters vary from architectures.
21 Eventually look up LDFLAGS_BLOB in an older version of the
22 arch/$(ARCH)/Makefile to see the flags used before .incbin was introduced.
23
24 Using .incbin has the advantage over ld that the correct flags are set
25 in the ELF header, as required by certain architectures.
26*/
27
28.section .init.ramfs,"a"
29.incbin "usr/initramfs_data.cpio.bz2"
diff --git a/usr/initramfs_data.gz.S b/usr/initramfs_data.gz.S
new file mode 100644
index 000000000000..890c8dd1d6bd
--- /dev/null
+++ b/usr/initramfs_data.gz.S
@@ -0,0 +1,29 @@
1/*
2 initramfs_data includes the compressed binary that is the
3 filesystem used for early user space.
4 Note: Older versions of "as" (prior to binutils 2.11.90.0.23
5 released on 2001-07-14) dit not support .incbin.
6 If you are forced to use older binutils than that then the
7 following trick can be applied to create the resulting binary:
8
9
10 ld -m elf_i386 --format binary --oformat elf32-i386 -r \
11 -T initramfs_data.scr initramfs_data.cpio.gz -o initramfs_data.o
12 ld -m elf_i386 -r -o built-in.o initramfs_data.o
13
14 initramfs_data.scr looks like this:
15SECTIONS
16{
17 .init.ramfs : { *(.data) }
18}
19
20 The above example is for i386 - the parameters vary from architectures.
21 Eventually look up LDFLAGS_BLOB in an older version of the
22 arch/$(ARCH)/Makefile to see the flags used before .incbin was introduced.
23
24 Using .incbin has the advantage over ld that the correct flags are set
25 in the ELF header, as required by certain architectures.
26*/
27
28.section .init.ramfs,"a"
29.incbin "usr/initramfs_data.cpio.gz"
diff --git a/usr/initramfs_data.lzma.S b/usr/initramfs_data.lzma.S
new file mode 100644
index 000000000000..e11469e48562
--- /dev/null
+++ b/usr/initramfs_data.lzma.S
@@ -0,0 +1,29 @@
1/*
2 initramfs_data includes the compressed binary that is the
3 filesystem used for early user space.
4 Note: Older versions of "as" (prior to binutils 2.11.90.0.23
5 released on 2001-07-14) dit not support .incbin.
6 If you are forced to use older binutils than that then the
7 following trick can be applied to create the resulting binary:
8
9
10 ld -m elf_i386 --format binary --oformat elf32-i386 -r \
11 -T initramfs_data.scr initramfs_data.cpio.gz -o initramfs_data.o
12 ld -m elf_i386 -r -o built-in.o initramfs_data.o
13
14 initramfs_data.scr looks like this:
15SECTIONS
16{
17 .init.ramfs : { *(.data) }
18}
19
20 The above example is for i386 - the parameters vary from architectures.
21 Eventually look up LDFLAGS_BLOB in an older version of the
22 arch/$(ARCH)/Makefile to see the flags used before .incbin was introduced.
23
24 Using .incbin has the advantage over ld that the correct flags are set
25 in the ELF header, as required by certain architectures.
26*/
27
28.section .init.ramfs,"a"
29.incbin "usr/initramfs_data.cpio.lzma"