author     Ingo Molnar <mingo@kernel.org>  2016-03-16 04:01:55 -0400
committer  Ingo Molnar <mingo@kernel.org>  2016-03-16 04:01:55 -0400
commit     ba4e06d68ea4fd2be401d7226c68941892d6bbaf (patch)
tree       a9a7125a8c88ba543e4fcfb907869b97688dee3c
parent     743146db071c4a828159211a295d12ff4f61752f (diff)
parent     710d60cbf1b312a8075a2158cbfbbd9c66132dcc (diff)
Merge branch 'linus' into x86/urgent, to pick up dependencies for a fix
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r-- Documentation/devicetree/bindings/interrupt-controller/al,alpine-msix.txt | 26
-rw-r--r-- Documentation/devicetree/bindings/interrupt-controller/arm,gic.txt | 1
-rw-r--r-- Documentation/devicetree/bindings/interrupt-controller/marvell,odmi-controller.txt | 44
-rw-r--r-- Documentation/devicetree/bindings/interrupt-controller/mips-gic.txt | 7
-rw-r--r-- Documentation/devicetree/bindings/interrupt-controller/sigma,smp8642-intc.txt | 49
-rw-r--r-- Documentation/kernel-parameters.txt | 13
-rw-r--r-- Documentation/ptp/testptp.c | 6
-rw-r--r-- Documentation/x86/early-microcode.txt | 25
-rw-r--r-- Documentation/x86/x86_64/boot-options.txt | 2
-rw-r--r-- MAINTAINERS | 1
-rw-r--r-- arch/alpha/kernel/smp.c | 2
-rw-r--r-- arch/arc/kernel/smp.c | 2
-rw-r--r-- arch/arm/kernel/smp.c | 2
-rw-r--r-- arch/arm/mach-mvebu/Kconfig | 6
-rw-r--r-- arch/arm64/kernel/smp.c | 2
-rw-r--r-- arch/blackfin/mach-common/smp.c | 2
-rw-r--r-- arch/hexagon/kernel/smp.c | 2
-rw-r--r-- arch/ia64/kernel/smpboot.c | 2
-rw-r--r-- arch/m32r/kernel/smpboot.c | 2
-rw-r--r-- arch/metag/kernel/smp.c | 2
-rw-r--r-- arch/mips/Kconfig | 8
-rw-r--r-- arch/mips/ath79/irq.c | 244
-rw-r--r-- arch/mips/bmips/irq.c | 10
-rw-r--r-- arch/mips/include/asm/mach-ath79/ath79.h | 4
-rw-r--r-- arch/mips/include/asm/smp-ops.h | 5
-rw-r--r-- arch/mips/kernel/Makefile | 1
-rw-r--r-- arch/mips/kernel/smp-cmp.c | 4
-rw-r--r-- arch/mips/kernel/smp-cps.c | 4
-rw-r--r-- arch/mips/kernel/smp-mt.c | 2
-rw-r--r-- arch/mips/kernel/smp.c | 138
-rw-r--r-- arch/mn10300/kernel/smp.c | 2
-rw-r--r-- arch/parisc/kernel/smp.c | 2
-rw-r--r-- arch/powerpc/kernel/smp.c | 2
-rw-r--r-- arch/s390/kernel/smp.c | 2
-rw-r--r-- arch/sh/kernel/smp.c | 2
-rw-r--r-- arch/sparc/kernel/smp_32.c | 2
-rw-r--r-- arch/sparc/kernel/smp_64.c | 2
-rw-r--r-- arch/tile/kernel/smpboot.c | 2
-rw-r--r-- arch/x86/Kconfig | 27
-rw-r--r-- arch/x86/Kconfig.debug | 10
-rw-r--r-- arch/x86/boot/cpuflags.h | 2
-rw-r--r-- arch/x86/boot/mkcpustr.c | 2
-rw-r--r-- arch/x86/boot/tools/build.c | 1
-rw-r--r-- arch/x86/configs/i386_defconfig | 2
-rw-r--r-- arch/x86/crypto/crc32-pclmul_glue.c | 2
-rw-r--r-- arch/x86/crypto/crc32c-intel_glue.c | 2
-rw-r--r-- arch/x86/crypto/crct10dif-pclmul_glue.c | 2
-rw-r--r-- arch/x86/entry/calling.h | 31
-rw-r--r-- arch/x86/entry/common.c | 106
-rw-r--r-- arch/x86/entry/entry_32.S | 268
-rw-r--r-- arch/x86/entry/entry_64.S | 286
-rw-r--r-- arch/x86/entry/entry_64_compat.S | 102
-rw-r--r-- arch/x86/entry/syscall_32.c | 10
-rw-r--r-- arch/x86/entry/syscall_64.c | 13
-rw-r--r-- arch/x86/entry/syscalls/syscall_64.tbl | 20
-rw-r--r-- arch/x86/entry/syscalls/syscalltbl.sh | 58
-rw-r--r-- arch/x86/entry/vdso/vdso2c.h | 7
-rw-r--r-- arch/x86/entry/vdso/vdso32-setup.c | 1
-rw-r--r-- arch/x86/entry/vdso/vdso32/system_call.S | 2
-rw-r--r-- arch/x86/entry/vdso/vma.c | 127
-rw-r--r-- arch/x86/entry/vsyscall/vsyscall_gtod.c | 9
-rw-r--r-- arch/x86/include/asm/alternative.h | 6
-rw-r--r-- arch/x86/include/asm/apic.h | 1
-rw-r--r-- arch/x86/include/asm/arch_hweight.h | 2
-rw-r--r-- arch/x86/include/asm/bitops.h | 36
-rw-r--r-- arch/x86/include/asm/clocksource.h | 9
-rw-r--r-- arch/x86/include/asm/cmpxchg.h | 1
-rw-r--r-- arch/x86/include/asm/cpufeature.h | 448
-rw-r--r-- arch/x86/include/asm/cpufeatures.h | 300
-rw-r--r-- arch/x86/include/asm/desc_defs.h | 23
-rw-r--r-- arch/x86/include/asm/dmi.h | 2
-rw-r--r-- arch/x86/include/asm/fixmap.h | 2
-rw-r--r-- arch/x86/include/asm/fpu/internal.h | 18
-rw-r--r-- arch/x86/include/asm/frame.h | 59
-rw-r--r-- arch/x86/include/asm/imr.h | 2
-rw-r--r-- arch/x86/include/asm/ipi.h | 58
-rw-r--r-- arch/x86/include/asm/irq_work.h | 2
-rw-r--r-- arch/x86/include/asm/mce.h | 1
-rw-r--r-- arch/x86/include/asm/microcode.h | 26
-rw-r--r-- arch/x86/include/asm/microcode_intel.h | 1
-rw-r--r-- arch/x86/include/asm/mmu.h | 3
-rw-r--r-- arch/x86/include/asm/msr-index.h | 7
-rw-r--r-- arch/x86/include/asm/mwait.h | 2
-rw-r--r-- arch/x86/include/asm/processor.h | 8
-rw-r--r-- arch/x86/include/asm/proto.h | 15
-rw-r--r-- arch/x86/include/asm/sighandling.h | 1
-rw-r--r-- arch/x86/include/asm/smap.h | 2
-rw-r--r-- arch/x86/include/asm/smp.h | 1
-rw-r--r-- arch/x86/include/asm/thread_info.h | 9
-rw-r--r-- arch/x86/include/asm/tlbflush.h | 58
-rw-r--r-- arch/x86/include/asm/tsc.h | 2
-rw-r--r-- arch/x86/include/asm/uaccess_64.h | 2
-rw-r--r-- arch/x86/include/asm/vdso.h | 3
-rw-r--r-- arch/x86/include/asm/vgtod.h | 6
-rw-r--r-- arch/x86/include/uapi/asm/sigcontext.h | 32
-rw-r--r-- arch/x86/include/uapi/asm/ucontext.h | 53
-rw-r--r-- arch/x86/kernel/apic/apic_flat_64.c | 2
-rw-r--r-- arch/x86/kernel/apic/apic_numachip.c | 4
-rw-r--r-- arch/x86/kernel/apic/ipi.c | 60
-rw-r--r-- arch/x86/kernel/asm-offsets.c | 1
-rw-r--r-- arch/x86/kernel/asm-offsets_32.c | 7
-rw-r--r-- arch/x86/kernel/asm-offsets_64.c | 10
-rw-r--r-- arch/x86/kernel/cpu/Makefile | 2
-rw-r--r-- arch/x86/kernel/cpu/centaur.c | 2
-rw-r--r-- arch/x86/kernel/cpu/common.c | 55
-rw-r--r-- arch/x86/kernel/cpu/cyrix.c | 1
-rw-r--r-- arch/x86/kernel/cpu/intel.c | 2
-rw-r--r-- arch/x86/kernel/cpu/intel_cacheinfo.c | 2
-rw-r--r-- arch/x86/kernel/cpu/match.c | 2
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce.c | 13
-rw-r--r-- arch/x86/kernel/cpu/microcode/amd.c | 15
-rw-r--r-- arch/x86/kernel/cpu/microcode/core.c | 19
-rw-r--r-- arch/x86/kernel/cpu/microcode/intel.c | 285
-rw-r--r-- arch/x86/kernel/cpu/microcode/intel_lib.c | 58
-rw-r--r-- arch/x86/kernel/cpu/mkcapflags.sh | 6
-rw-r--r-- arch/x86/kernel/cpu/mtrr/main.c | 2
-rw-r--r-- arch/x86/kernel/cpu/transmeta.c | 2
-rw-r--r-- arch/x86/kernel/e820.c | 1
-rw-r--r-- arch/x86/kernel/fpu/core.c | 52
-rw-r--r-- arch/x86/kernel/fpu/init.c | 13
-rw-r--r-- arch/x86/kernel/fpu/xstate.c | 3
-rw-r--r-- arch/x86/kernel/ftrace.c | 11
-rw-r--r-- arch/x86/kernel/head64.c | 14
-rw-r--r-- arch/x86/kernel/head_32.S | 8
-rw-r--r-- arch/x86/kernel/head_64.S | 5
-rw-r--r-- arch/x86/kernel/hpet.c | 1
-rw-r--r-- arch/x86/kernel/mcount_64.S | 14
-rw-r--r-- arch/x86/kernel/msr.c | 2
-rw-r--r-- arch/x86/kernel/process.c | 3
-rw-r--r-- arch/x86/kernel/signal.c | 127
-rw-r--r-- arch/x86/kernel/smpboot.c | 2
-rw-r--r-- arch/x86/kernel/traps.c | 141
-rw-r--r-- arch/x86/kernel/tsc.c | 67
-rw-r--r-- arch/x86/kernel/verify_cpu.S | 2
-rw-r--r-- arch/x86/kernel/vm86_32.c | 2
-rw-r--r-- arch/x86/kernel/vmlinux.lds.S | 11
-rw-r--r-- arch/x86/lib/clear_page_64.S | 2
-rw-r--r-- arch/x86/lib/cmdline.c | 60
-rw-r--r-- arch/x86/lib/copy_page_64.S | 2
-rw-r--r-- arch/x86/lib/copy_user_64.S | 2
-rw-r--r-- arch/x86/lib/memcpy_64.S | 2
-rw-r--r-- arch/x86/lib/memmove_64.S | 2
-rw-r--r-- arch/x86/lib/memset_64.S | 2
-rw-r--r-- arch/x86/mm/dump_pagetables.c | 11
-rw-r--r-- arch/x86/mm/init_32.c | 3
-rw-r--r-- arch/x86/mm/init_64.c | 24
-rw-r--r-- arch/x86/mm/kasan_init_64.c | 17
-rw-r--r-- arch/x86/mm/kmmio.c | 88
-rw-r--r-- arch/x86/mm/mmap.c | 14
-rw-r--r-- arch/x86/mm/numa.c | 67
-rw-r--r-- arch/x86/mm/pageattr.c | 4
-rw-r--r-- arch/x86/mm/pat.c | 4
-rw-r--r-- arch/x86/mm/setup_nx.c | 6
-rw-r--r-- arch/x86/oprofile/op_model_amd.c | 1
-rw-r--r-- arch/x86/platform/geode/alix.c | 14
-rw-r--r-- arch/x86/platform/geode/geos.c | 8
-rw-r--r-- arch/x86/platform/geode/net5501.c | 8
-rw-r--r-- arch/x86/platform/intel-mid/mfld.c | 5
-rw-r--r-- arch/x86/platform/intel-mid/mrfl.c | 5
-rw-r--r-- arch/x86/platform/intel-quark/imr.c | 59
-rw-r--r-- arch/x86/platform/intel-quark/imr_selftest.c | 30
-rw-r--r-- arch/x86/um/asm/barrier.h | 2
-rw-r--r-- arch/x86/um/sys_call_table_32.c | 4
-rw-r--r-- arch/x86/um/sys_call_table_64.c | 7
-rw-r--r-- arch/x86/um/user-offsets.c | 6
-rw-r--r-- arch/x86/xen/smp.c | 2
-rw-r--r-- arch/xtensa/kernel/smp.c | 2
-rw-r--r-- drivers/clocksource/Kconfig | 1
-rw-r--r-- drivers/clocksource/arm_arch_timer.c | 40
-rw-r--r-- drivers/clocksource/arm_global_timer.c | 18
-rw-r--r-- drivers/clocksource/exynos_mct.c | 2
-rw-r--r-- drivers/clocksource/rockchip_timer.c | 21
-rw-r--r-- drivers/clocksource/time-lpc32xx.c | 66
-rw-r--r-- drivers/cpufreq/intel_pstate.c | 2
-rw-r--r-- drivers/irqchip/Kconfig | 28
-rw-r--r-- drivers/irqchip/Makefile | 8
-rw-r--r-- drivers/irqchip/irq-alpine-msi.c | 293
-rw-r--r-- drivers/irqchip/irq-armada-370-xp.c | 156
-rw-r--r-- drivers/irqchip/irq-ath79-cpu.c | 97
-rw-r--r-- drivers/irqchip/irq-ath79-misc.c | 189
-rw-r--r-- drivers/irqchip/irq-atmel-aic-common.c | 14
-rw-r--r-- drivers/irqchip/irq-atmel-aic-common.h | 7
-rw-r--r-- drivers/irqchip/irq-atmel-aic.c | 9
-rw-r--r-- drivers/irqchip/irq-atmel-aic5.c | 9
-rw-r--r-- drivers/irqchip/irq-bcm2836.c | 1
-rw-r--r-- drivers/irqchip/irq-bcm6345-l1.c | 364
-rw-r--r-- drivers/irqchip/irq-gic-realview.c | 44
-rw-r--r-- drivers/irqchip/irq-gic-v2m.c | 14
-rw-r--r-- drivers/irqchip/irq-gic-v3-its.c | 10
-rw-r--r-- drivers/irqchip/irq-gic-v3.c | 349
-rw-r--r-- drivers/irqchip/irq-gic.c | 2
-rw-r--r-- drivers/irqchip/irq-mips-gic.c | 354
-rw-r--r-- drivers/irqchip/irq-mvebu-odmi.c | 236
-rw-r--r-- drivers/irqchip/irq-mxs.c | 2
-rw-r--r-- drivers/irqchip/irq-sunxi-nmi.c | 4
-rw-r--r-- drivers/irqchip/irq-tango.c | 232
-rw-r--r-- drivers/irqchip/irq-ts4800.c | 2
-rw-r--r-- drivers/net/ethernet/intel/Kconfig | 9
-rw-r--r-- drivers/net/ethernet/intel/e1000e/defines.h | 5
-rw-r--r-- drivers/net/ethernet/intel/e1000e/ptp.c | 85
-rw-r--r-- drivers/net/ethernet/intel/e1000e/regs.h | 4
-rw-r--r-- drivers/ptp/ptp_chardev.c | 27
-rw-r--r-- fs/btrfs/disk-io.c | 2
-rw-r--r-- include/linux/clockchips.h | 4
-rw-r--r-- include/linux/clocksource.h | 45
-rw-r--r-- include/linux/compiler.h | 12
-rw-r--r-- include/linux/cpu.h | 27
-rw-r--r-- include/linux/cpuhotplug.h | 93
-rw-r--r-- include/linux/irq.h | 27
-rw-r--r-- include/linux/irqchip/mips-gic.h | 3
-rw-r--r-- include/linux/irqdomain.h | 45
-rw-r--r-- include/linux/mm.h | 2
-rw-r--r-- include/linux/mm_types.h | 22
-rw-r--r-- include/linux/notifier.h | 2
-rw-r--r-- include/linux/pps_kernel.h | 17
-rw-r--r-- include/linux/ptp_clock_kernel.h | 8
-rw-r--r-- include/linux/rcupdate.h | 6
-rw-r--r-- include/linux/srcu.h | 19
-rw-r--r-- include/linux/timekeeper_internal.h | 2
-rw-r--r-- include/linux/timekeeping.h | 58
-rw-r--r-- include/trace/events/cpuhp.h | 66
-rw-r--r-- include/uapi/linux/ptp_clock.h | 13
-rw-r--r-- init/main.c | 16
-rw-r--r-- kernel/cpu.c | 1162
-rw-r--r-- kernel/events/uprobes.c | 1
-rw-r--r-- kernel/irq/Kconfig | 4
-rw-r--r-- kernel/irq/Makefile | 1
-rw-r--r-- kernel/irq/chip.c | 4
-rw-r--r-- kernel/irq/handle.c | 6
-rw-r--r-- kernel/irq/internals.h | 7
-rw-r--r-- kernel/irq/ipi.c | 326
-rw-r--r-- kernel/irq/irqdesc.c | 21
-rw-r--r-- kernel/irq/irqdomain.c | 11
-rw-r--r-- kernel/irq/manage.c | 8
-rw-r--r-- kernel/irq/proc.c | 2
-rw-r--r-- kernel/irq/spurious.c | 4
-rw-r--r-- kernel/rcu/rcutorture.c | 14
-rw-r--r-- kernel/rcu/tiny_plugin.h | 15
-rw-r--r-- kernel/rcu/tree.c | 214
-rw-r--r-- kernel/rcu/tree.h | 42
-rw-r--r-- kernel/rcu/tree_plugin.h | 27
-rw-r--r-- kernel/rcu/update.c | 1
-rw-r--r-- kernel/sched/core.c | 10
-rw-r--r-- kernel/sched/idle.c | 9
-rw-r--r-- kernel/smp.c | 1
-rw-r--r-- kernel/smpboot.c | 6
-rw-r--r-- kernel/smpboot.h | 6
-rw-r--r-- kernel/time/clocksource.c | 52
-rw-r--r-- kernel/time/jiffies.c | 2
-rw-r--r-- kernel/time/timekeeping.c | 286
-rw-r--r-- lib/Kconfig.debug | 13
-rw-r--r-- lib/atomic64_test.c | 2
-rw-r--r-- mm/memory.c | 25
-rw-r--r-- mm/mmap.c | 13
-rwxr-xr-x scripts/checkpatch.pl | 3
-rwxr-xr-x tools/testing/selftests/rcutorture/bin/parse-console.sh | 6
-rw-r--r-- tools/testing/selftests/x86/Makefile | 17
-rw-r--r-- tools/testing/selftests/x86/check_initial_reg_state.c | 109
-rw-r--r-- tools/testing/selftests/x86/ptrace_syscall.c | 132
-rw-r--r-- tools/testing/selftests/x86/sigreturn.c | 230
-rw-r--r-- tools/testing/selftests/x86/syscall_nt.c | 57
261 files changed, 7703 insertions, 2756 deletions
diff --git a/Documentation/devicetree/bindings/interrupt-controller/al,alpine-msix.txt b/Documentation/devicetree/bindings/interrupt-controller/al,alpine-msix.txt
new file mode 100644
index 000000000000..f6f1c14bf99b
--- /dev/null
+++ b/Documentation/devicetree/bindings/interrupt-controller/al,alpine-msix.txt
@@ -0,0 +1,26 @@
1Alpine MSIX controller
2
3See arm,gic-v3.txt for SPI and MSI definitions.
4
5Required properties:
6
7- compatible: should be "al,alpine-msix"
8- reg: physical base address and size of the registers
9- interrupt-parent: specifies the parent interrupt controller.
10- interrupt-controller: identifies the node as an interrupt controller
11- msi-controller: identifies the node as a PCI Message Signaled Interrupt
12 controller
13- al,msi-base-spi: SPI base of the MSI frame
14- al,msi-num-spis: number of SPIs assigned to the MSI frame, relative to SPI0
15
16Example:
17
18msix: msix {
19 compatible = "al,alpine-msix";
20 reg = <0x0 0xfbe00000 0x0 0x100000>;
21 interrupt-parent = <&gic>;
22 interrupt-controller;
23 msi-controller;
24 al,msi-base-spi = <160>;
25 al,msi-num-spis = <160>;
26};
diff --git a/Documentation/devicetree/bindings/interrupt-controller/arm,gic.txt b/Documentation/devicetree/bindings/interrupt-controller/arm,gic.txt
index 5a1cb4bc3dfe..793c20ff8fcc 100644
--- a/Documentation/devicetree/bindings/interrupt-controller/arm,gic.txt
+++ b/Documentation/devicetree/bindings/interrupt-controller/arm,gic.txt
@@ -16,6 +16,7 @@ Main node required properties:
16 "arm,cortex-a15-gic" 16 "arm,cortex-a15-gic"
17 "arm,cortex-a7-gic" 17 "arm,cortex-a7-gic"
18 "arm,cortex-a9-gic" 18 "arm,cortex-a9-gic"
19 "arm,eb11mp-gic"
19 "arm,gic-400" 20 "arm,gic-400"
20 "arm,pl390" 21 "arm,pl390"
21 "arm,tc11mp-gic" 22 "arm,tc11mp-gic"
diff --git a/Documentation/devicetree/bindings/interrupt-controller/marvell,odmi-controller.txt b/Documentation/devicetree/bindings/interrupt-controller/marvell,odmi-controller.txt
new file mode 100644
index 000000000000..8af0a8e613ab
--- /dev/null
+++ b/Documentation/devicetree/bindings/interrupt-controller/marvell,odmi-controller.txt
@@ -0,0 +1,44 @@
1
2* Marvell ODMI for MSI support
3
4Some Marvell SoCs have an On-Die Message Interrupt (ODMI) controller
5which can be used by on-board peripherals for MSI interrupts.
6
7Required properties:
8
9- compatible : The value here should contain:
10
11 "marvell,ap806-odmi-controller", "marvell,odmi-controller".
12
13- interrupt-controller : Identifies the node as an interrupt controller.
14
15- msi-controller : Identifies the node as an MSI controller.
16
17- marvell,odmi-frames : Number of ODMI frames available. Each frame
18 provides a number of events.
19
20- reg : List of register definitions, one for each
21 ODMI frame.
22
23- marvell,spi-base : List of GIC base SPI interrupts, one for each
24 ODMI frame. Those SPI interrupts are 0-based,
25 i.e marvell,spi-base = <128> will use SPI #96.
26 See Documentation/devicetree/bindings/interrupt-controller/arm,gic.txt
27 for details about the GIC Device Tree binding.
28
29- interrupt-parent : Reference to the parent interrupt controller.
30
31Example:
32
33 odmi: odmi@300000 {
34 compatible = "marvell,ap806-odm-controller",
35 "marvell,odmi-controller";
36 interrupt-controller;
37 msi-controller;
38 marvell,odmi-frames = <4>;
39 reg = <0x300000 0x4000>,
40 <0x304000 0x4000>,
41 <0x308000 0x4000>,
42 <0x30C000 0x4000>;
43 marvell,spi-base = <128>, <136>, <144>, <152>;
44 };
diff --git a/Documentation/devicetree/bindings/interrupt-controller/mips-gic.txt b/Documentation/devicetree/bindings/interrupt-controller/mips-gic.txt
index aae4c384ee1f..173595305e26 100644
--- a/Documentation/devicetree/bindings/interrupt-controller/mips-gic.txt
+++ b/Documentation/devicetree/bindings/interrupt-controller/mips-gic.txt
@@ -23,6 +23,12 @@ Optional properties:
23- mti,reserved-cpu-vectors : Specifies the list of CPU interrupt vectors 23- mti,reserved-cpu-vectors : Specifies the list of CPU interrupt vectors
24 to which the GIC may not route interrupts. Valid values are 2 - 7. 24 to which the GIC may not route interrupts. Valid values are 2 - 7.
25 This property is ignored if the CPU is started in EIC mode. 25 This property is ignored if the CPU is started in EIC mode.
26- mti,reserved-ipi-vectors : Specifies the range of GIC interrupts that are
27 reserved for IPIs.
28 It accepts 2 values, the 1st is the starting interrupt and the 2nd is the size
29 of the reserved range.
30 If not specified, the driver will allocate the last 2 * number of VPEs in the
31 system.
26 32
27Required properties for timer sub-node: 33Required properties for timer sub-node:
28- compatible : Should be "mti,gic-timer". 34- compatible : Should be "mti,gic-timer".
@@ -44,6 +50,7 @@ Example:
44 #interrupt-cells = <3>; 50 #interrupt-cells = <3>;
45 51
46 mti,reserved-cpu-vectors = <7>; 52 mti,reserved-cpu-vectors = <7>;
53 mti,reserved-ipi-vectors = <40 8>;
47 54
48 timer { 55 timer {
49 compatible = "mti,gic-timer"; 56 compatible = "mti,gic-timer";
diff --git a/Documentation/devicetree/bindings/interrupt-controller/sigma,smp8642-intc.txt b/Documentation/devicetree/bindings/interrupt-controller/sigma,smp8642-intc.txt
new file mode 100644
index 000000000000..1f441fa0ad40
--- /dev/null
+++ b/Documentation/devicetree/bindings/interrupt-controller/sigma,smp8642-intc.txt
@@ -0,0 +1,49 @@
1Sigma Designs SMP86xx/SMP87xx secondary interrupt controller
2
3Required properties:
4- compatible: should be "sigma,smp8642-intc"
5- reg: physical address of MMIO region
6- ranges: address space mapping of child nodes
7- interrupt-parent: phandle of parent interrupt controller
8- interrupt-controller: boolean
9- #address-cells: should be <1>
10- #size-cells: should be <1>
11
12One child node per control block with properties:
13- reg: address of registers for this control block
14- interrupt-controller: boolean
15- #interrupt-cells: should be <2>, interrupt index and flags per interrupts.txt
16- interrupts: interrupt spec of primary interrupt controller
17
18Example:
19
20interrupt-controller@6e000 {
21 compatible = "sigma,smp8642-intc";
22 reg = <0x6e000 0x400>;
23 ranges = <0x0 0x6e000 0x400>;
24 interrupt-parent = <&gic>;
25 interrupt-controller;
26 #address-cells = <1>;
27 #size-cells = <1>;
28
29 irq0: interrupt-controller@0 {
30 reg = <0x000 0x100>;
31 interrupt-controller;
32 #interrupt-cells = <2>;
33 interrupts = <GIC_SPI 2 IRQ_TYPE_LEVEL_HIGH>;
34 };
35
36 irq1: interrupt-controller@100 {
37 reg = <0x100 0x100>;
38 interrupt-controller;
39 #interrupt-cells = <2>;
40 interrupts = <GIC_SPI 3 IRQ_TYPE_LEVEL_HIGH>;
41 };
42
43 irq2: interrupt-controller@300 {
44 reg = <0x300 0x100>;
45 interrupt-controller;
46 #interrupt-cells = <2>;
47 interrupts = <GIC_SPI 4 IRQ_TYPE_LEVEL_HIGH>;
48 };
49};
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 8ae47a7b4923..4d9ca7d92a20 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -666,7 +666,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
666 666
667 clearcpuid=BITNUM [X86] 667 clearcpuid=BITNUM [X86]
668 Disable CPUID feature X for the kernel. See 668 Disable CPUID feature X for the kernel. See
669 arch/x86/include/asm/cpufeature.h for the valid bit 669 arch/x86/include/asm/cpufeatures.h for the valid bit
670 numbers. Note the Linux specific bits are not necessarily 670 numbers. Note the Linux specific bits are not necessarily
671 stable over kernel options, but the vendor specific 671 stable over kernel options, but the vendor specific
672 ones should be. 672 ones should be.
@@ -1687,6 +1687,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
1687 ip= [IP_PNP] 1687 ip= [IP_PNP]
1688 See Documentation/filesystems/nfs/nfsroot.txt. 1688 See Documentation/filesystems/nfs/nfsroot.txt.
1689 1689
1690 irqaffinity= [SMP] Set the default irq affinity mask
1691 Format:
1692 <cpu number>,...,<cpu number>
1693 or
1694 <cpu number>-<cpu number>
1695 (must be a positive range in ascending order)
1696 or a mixture
1697 <cpu number>,...,<cpu number>-<cpu number>
1698
1690 irqfixup [HW] 1699 irqfixup [HW]
1691 When an interrupt is not handled search all handlers 1700 When an interrupt is not handled search all handlers
1692 for it. Intended to get systems with badly broken 1701 for it. Intended to get systems with badly broken
@@ -2566,6 +2575,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
2566 2575
2567 nointroute [IA-64] 2576 nointroute [IA-64]
2568 2577
2578 noinvpcid [X86] Disable the INVPCID cpu feature.
2579
2569 nojitter [IA-64] Disables jitter checking for ITC timers. 2580 nojitter [IA-64] Disables jitter checking for ITC timers.
2570 2581
2571 no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver 2582 no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver
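
A note on the irqaffinity= format documented in the hunk above: the mask accepts a plain CPU list, a range, or a mixture of the two, so a hypothetical boot line (illustrative only, not taken from this patch) that confines the default IRQ affinity to CPUs 0, 3 and 8-11 would read:

	irqaffinity=0,3,8-11
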
diff --git a/Documentation/ptp/testptp.c b/Documentation/ptp/testptp.c
index 6c6247aaa7b9..d99012f41602 100644
--- a/Documentation/ptp/testptp.c
+++ b/Documentation/ptp/testptp.c
@@ -277,13 +277,15 @@ int main(int argc, char *argv[])
277 " %d external time stamp channels\n" 277 " %d external time stamp channels\n"
278 " %d programmable periodic signals\n" 278 " %d programmable periodic signals\n"
279 " %d pulse per second\n" 279 " %d pulse per second\n"
280 " %d programmable pins\n", 280 " %d programmable pins\n"
281 " %d cross timestamping\n",
281 caps.max_adj, 282 caps.max_adj,
282 caps.n_alarm, 283 caps.n_alarm,
283 caps.n_ext_ts, 284 caps.n_ext_ts,
284 caps.n_per_out, 285 caps.n_per_out,
285 caps.pps, 286 caps.pps,
286 caps.n_pins); 287 caps.n_pins,
288 caps.cross_timestamping);
287 } 289 }
288 } 290 }
289 291
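
For context on the testptp.c hunk above: the value it prints comes from the PTP_CLOCK_GETCAPS ioctl on a /dev/ptpN character device, and cross_timestamping is the member this merge adds to struct ptp_clock_caps in include/uapi/linux/ptp_clock.h. A minimal user-space sketch that queries only that flag might look like this (the device path is an assumption and error handling is reduced to the bare minimum):

	#include <stdio.h>
	#include <string.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/ptp_clock.h>

	int main(void)
	{
		struct ptp_clock_caps caps;
		int fd;

		fd = open("/dev/ptp0", O_RDWR);	/* assumed device node */
		if (fd < 0)
			return 1;

		memset(&caps, 0, sizeof(caps));
		if (ioctl(fd, PTP_CLOCK_GETCAPS, &caps) == 0)
			printf("cross timestamping: %d\n", caps.cross_timestamping);

		close(fd);
		return 0;
	}

On the kernel side, ptp_chardev.c reports the flag when the driver implements the new cross-timestamp callback, which appears to be what the e1000e changes elsewhere in this merge wire up.
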
diff --git a/Documentation/x86/early-microcode.txt b/Documentation/x86/early-microcode.txt
index d62bea6796da..c956d99cf1de 100644
--- a/Documentation/x86/early-microcode.txt
+++ b/Documentation/x86/early-microcode.txt
@@ -40,3 +40,28 @@ cp ../microcode.bin kernel/x86/microcode/GenuineIntel.bin (or AuthenticAMD.bin)
40find . | cpio -o -H newc >../ucode.cpio 40find . | cpio -o -H newc >../ucode.cpio
41cd .. 41cd ..
42cat ucode.cpio /boot/initrd-3.5.0.img >/boot/initrd-3.5.0.ucode.img 42cat ucode.cpio /boot/initrd-3.5.0.img >/boot/initrd-3.5.0.ucode.img
43
44Builtin microcode
45=================
46
47We can also load builtin microcode supplied through the regular firmware
48builtin method CONFIG_FIRMWARE_IN_KERNEL. Here's an example:
49
50CONFIG_FIRMWARE_IN_KERNEL=y
51CONFIG_EXTRA_FIRMWARE="intel-ucode/06-3a-09 amd-ucode/microcode_amd_fam15h.bin"
52CONFIG_EXTRA_FIRMWARE_DIR="/lib/firmware"
53
54This basically means, you have the following tree structure locally:
55
56/lib/firmware/
57|-- amd-ucode
58...
59| |-- microcode_amd_fam15h.bin
60...
61|-- intel-ucode
62...
63| |-- 06-3a-09
64...
65
66so that the build system can find those files and integrate them into
67the final kernel image. The early loader finds them and applies them.
diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
index 68ed3114c363..0965a71f9942 100644
--- a/Documentation/x86/x86_64/boot-options.txt
+++ b/Documentation/x86/x86_64/boot-options.txt
@@ -60,6 +60,8 @@ Machine check
60 threshold to 1. Enabling this may make memory predictive failure 60 threshold to 1. Enabling this may make memory predictive failure
61 analysis less effective if the bios sets thresholds for memory 61 analysis less effective if the bios sets thresholds for memory
62 errors since we will not see details for all errors. 62 errors since we will not see details for all errors.
63 mce=recovery
64 Force-enable recoverable machine check code paths
63 65
64 nomce (for compatibility with i386): same as mce=off 66 nomce (for compatibility with i386): same as mce=off
65 67
diff --git a/MAINTAINERS b/MAINTAINERS
index 2061ea77667c..57adf395a61f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2422,6 +2422,7 @@ F: arch/mips/bmips/*
2422F: arch/mips/include/asm/mach-bmips/* 2422F: arch/mips/include/asm/mach-bmips/*
2423F: arch/mips/kernel/*bmips* 2423F: arch/mips/kernel/*bmips*
2424F: arch/mips/boot/dts/brcm/bcm*.dts* 2424F: arch/mips/boot/dts/brcm/bcm*.dts*
2425F: drivers/irqchip/irq-bcm63*
2425F: drivers/irqchip/irq-bcm7* 2426F: drivers/irqchip/irq-bcm7*
2426F: drivers/irqchip/irq-brcmstb* 2427F: drivers/irqchip/irq-brcmstb*
2427F: include/linux/bcm963xx_nvram.h 2428F: include/linux/bcm963xx_nvram.h
diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
index 2f24447fef92..46bf263c3153 100644
--- a/arch/alpha/kernel/smp.c
+++ b/arch/alpha/kernel/smp.c
@@ -168,7 +168,7 @@ smp_callin(void)
168 cpuid, current, current->active_mm)); 168 cpuid, current, current->active_mm));
169 169
170 preempt_disable(); 170 preempt_disable();
171 cpu_startup_entry(CPUHP_ONLINE); 171 cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
172} 172}
173 173
174/* Wait until hwrpb->txrdy is clear for cpu. Return -1 on timeout. */ 174/* Wait until hwrpb->txrdy is clear for cpu. Return -1 on timeout. */
diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c
index 424e937da5c8..4cb3add77c75 100644
--- a/arch/arc/kernel/smp.c
+++ b/arch/arc/kernel/smp.c
@@ -142,7 +142,7 @@ void start_kernel_secondary(void)
142 142
143 local_irq_enable(); 143 local_irq_enable();
144 preempt_disable(); 144 preempt_disable();
145 cpu_startup_entry(CPUHP_ONLINE); 145 cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
146} 146}
147 147
148/* 148/*
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 37312f6749f3..baee70267f29 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -409,7 +409,7 @@ asmlinkage void secondary_start_kernel(void)
409 /* 409 /*
410 * OK, it's off to the idle thread for us 410 * OK, it's off to the idle thread for us
411 */ 411 */
412 cpu_startup_entry(CPUHP_ONLINE); 412 cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
413} 413}
414 414
415void __init smp_cpus_done(unsigned int max_cpus) 415void __init smp_cpus_done(unsigned int max_cpus)
diff --git a/arch/arm/mach-mvebu/Kconfig b/arch/arm/mach-mvebu/Kconfig
index 64e3d2ce9a07..b003e3afd693 100644
--- a/arch/arm/mach-mvebu/Kconfig
+++ b/arch/arm/mach-mvebu/Kconfig
@@ -3,7 +3,6 @@ menuconfig ARCH_MVEBU
3 depends on ARCH_MULTI_V7 || ARCH_MULTI_V5 3 depends on ARCH_MULTI_V7 || ARCH_MULTI_V5
4 select ARCH_SUPPORTS_BIG_ENDIAN 4 select ARCH_SUPPORTS_BIG_ENDIAN
5 select CLKSRC_MMIO 5 select CLKSRC_MMIO
6 select GENERIC_IRQ_CHIP
7 select PINCTRL 6 select PINCTRL
8 select PLAT_ORION 7 select PLAT_ORION
9 select SOC_BUS 8 select SOC_BUS
@@ -29,6 +28,7 @@ config MACH_ARMADA_370
29 bool "Marvell Armada 370 boards" 28 bool "Marvell Armada 370 boards"
30 depends on ARCH_MULTI_V7 29 depends on ARCH_MULTI_V7
31 select ARMADA_370_CLK 30 select ARMADA_370_CLK
31 select ARMADA_370_XP_IRQ
32 select CPU_PJ4B 32 select CPU_PJ4B
33 select MACH_MVEBU_V7 33 select MACH_MVEBU_V7
34 select PINCTRL_ARMADA_370 34 select PINCTRL_ARMADA_370
@@ -39,6 +39,7 @@ config MACH_ARMADA_370
39config MACH_ARMADA_375 39config MACH_ARMADA_375
40 bool "Marvell Armada 375 boards" 40 bool "Marvell Armada 375 boards"
41 depends on ARCH_MULTI_V7 41 depends on ARCH_MULTI_V7
42 select ARMADA_370_XP_IRQ
42 select ARM_ERRATA_720789 43 select ARM_ERRATA_720789
43 select ARM_ERRATA_753970 44 select ARM_ERRATA_753970
44 select ARM_GIC 45 select ARM_GIC
@@ -58,6 +59,7 @@ config MACH_ARMADA_38X
58 select ARM_ERRATA_720789 59 select ARM_ERRATA_720789
59 select ARM_ERRATA_753970 60 select ARM_ERRATA_753970
60 select ARM_GIC 61 select ARM_GIC
62 select ARMADA_370_XP_IRQ
61 select ARMADA_38X_CLK 63 select ARMADA_38X_CLK
62 select HAVE_ARM_SCU 64 select HAVE_ARM_SCU
63 select HAVE_ARM_TWD if SMP 65 select HAVE_ARM_TWD if SMP
@@ -72,6 +74,7 @@ config MACH_ARMADA_39X
72 bool "Marvell Armada 39x boards" 74 bool "Marvell Armada 39x boards"
73 depends on ARCH_MULTI_V7 75 depends on ARCH_MULTI_V7
74 select ARM_GIC 76 select ARM_GIC
77 select ARMADA_370_XP_IRQ
75 select ARMADA_39X_CLK 78 select ARMADA_39X_CLK
76 select CACHE_L2X0 79 select CACHE_L2X0
77 select HAVE_ARM_SCU 80 select HAVE_ARM_SCU
@@ -86,6 +89,7 @@ config MACH_ARMADA_39X
86config MACH_ARMADA_XP 89config MACH_ARMADA_XP
87 bool "Marvell Armada XP boards" 90 bool "Marvell Armada XP boards"
88 depends on ARCH_MULTI_V7 91 depends on ARCH_MULTI_V7
92 select ARMADA_370_XP_IRQ
89 select ARMADA_XP_CLK 93 select ARMADA_XP_CLK
90 select CPU_PJ4B 94 select CPU_PJ4B
91 select MACH_MVEBU_V7 95 select MACH_MVEBU_V7
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index b1adc51b2c2e..460765799c64 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -195,7 +195,7 @@ asmlinkage void secondary_start_kernel(void)
195 /* 195 /*
196 * OK, it's off to the idle thread for us 196 * OK, it's off to the idle thread for us
197 */ 197 */
198 cpu_startup_entry(CPUHP_ONLINE); 198 cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
199} 199}
200 200
201#ifdef CONFIG_HOTPLUG_CPU 201#ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c
index 0030e21cfceb..23c4ef5f8bdc 100644
--- a/arch/blackfin/mach-common/smp.c
+++ b/arch/blackfin/mach-common/smp.c
@@ -333,7 +333,7 @@ void secondary_start_kernel(void)
333 333
334 /* We are done with local CPU inits, unblock the boot CPU. */ 334 /* We are done with local CPU inits, unblock the boot CPU. */
335 set_cpu_online(cpu, true); 335 set_cpu_online(cpu, true);
336 cpu_startup_entry(CPUHP_ONLINE); 336 cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
337} 337}
338 338
339void __init smp_prepare_boot_cpu(void) 339void __init smp_prepare_boot_cpu(void)
diff --git a/arch/hexagon/kernel/smp.c b/arch/hexagon/kernel/smp.c
index ff759f26b96a..983bae7d2665 100644
--- a/arch/hexagon/kernel/smp.c
+++ b/arch/hexagon/kernel/smp.c
@@ -180,7 +180,7 @@ void start_secondary(void)
180 180
181 local_irq_enable(); 181 local_irq_enable();
182 182
183 cpu_startup_entry(CPUHP_ONLINE); 183 cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
184} 184}
185 185
186 186
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index 0e76fad27975..74fe317477e6 100644
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -454,7 +454,7 @@ start_secondary (void *unused)
454 preempt_disable(); 454 preempt_disable();
455 smp_callin(); 455 smp_callin();
456 456
457 cpu_startup_entry(CPUHP_ONLINE); 457 cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
458 return 0; 458 return 0;
459} 459}
460 460
diff --git a/arch/m32r/kernel/smpboot.c b/arch/m32r/kernel/smpboot.c
index a468467542f4..f98d2f6519d6 100644
--- a/arch/m32r/kernel/smpboot.c
+++ b/arch/m32r/kernel/smpboot.c
@@ -432,7 +432,7 @@ int __init start_secondary(void *unused)
432 */ 432 */
433 local_flush_tlb_all(); 433 local_flush_tlb_all();
434 434
435 cpu_startup_entry(CPUHP_ONLINE); 435 cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
436 return 0; 436 return 0;
437} 437}
438 438
diff --git a/arch/metag/kernel/smp.c b/arch/metag/kernel/smp.c
index c3c6f0864881..bad13232de51 100644
--- a/arch/metag/kernel/smp.c
+++ b/arch/metag/kernel/smp.c
@@ -396,7 +396,7 @@ asmlinkage void secondary_start_kernel(void)
396 /* 396 /*
397 * OK, it's off to the idle thread for us 397 * OK, it's off to the idle thread for us
398 */ 398 */
399 cpu_startup_entry(CPUHP_ONLINE); 399 cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
400} 400}
401 401
402void __init smp_cpus_done(unsigned int max_cpus) 402void __init smp_cpus_done(unsigned int max_cpus)
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index d3da79dda629..a65eacf59918 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -151,6 +151,7 @@ config BMIPS_GENERIC
151 select CSRC_R4K 151 select CSRC_R4K
152 select SYNC_R4K 152 select SYNC_R4K
153 select COMMON_CLK 153 select COMMON_CLK
154 select BCM6345_L1_IRQ
154 select BCM7038_L1_IRQ 155 select BCM7038_L1_IRQ
155 select BCM7120_L2_IRQ 156 select BCM7120_L2_IRQ
156 select BRCMSTB_L2_IRQ 157 select BRCMSTB_L2_IRQ
@@ -2169,7 +2170,6 @@ config MIPS_MT_SMP
2169 select CPU_MIPSR2_IRQ_VI 2170 select CPU_MIPSR2_IRQ_VI
2170 select CPU_MIPSR2_IRQ_EI 2171 select CPU_MIPSR2_IRQ_EI
2171 select SYNC_R4K 2172 select SYNC_R4K
2172 select MIPS_GIC_IPI if MIPS_GIC
2173 select MIPS_MT 2173 select MIPS_MT
2174 select SMP 2174 select SMP
2175 select SMP_UP 2175 select SMP_UP
@@ -2267,7 +2267,6 @@ config MIPS_VPE_APSP_API_MT
2267config MIPS_CMP 2267config MIPS_CMP
2268 bool "MIPS CMP framework support (DEPRECATED)" 2268 bool "MIPS CMP framework support (DEPRECATED)"
2269 depends on SYS_SUPPORTS_MIPS_CMP && !CPU_MIPSR6 2269 depends on SYS_SUPPORTS_MIPS_CMP && !CPU_MIPSR6
2270 select MIPS_GIC_IPI if MIPS_GIC
2271 select SMP 2270 select SMP
2272 select SYNC_R4K 2271 select SYNC_R4K
2273 select SYS_SUPPORTS_SMP 2272 select SYS_SUPPORTS_SMP
@@ -2287,7 +2286,6 @@ config MIPS_CPS
2287 select MIPS_CM 2286 select MIPS_CM
2288 select MIPS_CPC 2287 select MIPS_CPC
2289 select MIPS_CPS_PM if HOTPLUG_CPU 2288 select MIPS_CPS_PM if HOTPLUG_CPU
2290 select MIPS_GIC_IPI if MIPS_GIC
2291 select SMP 2289 select SMP
2292 select SYNC_R4K if (CEVT_R4K || CSRC_R4K) 2290 select SYNC_R4K if (CEVT_R4K || CSRC_R4K)
2293 select SYS_SUPPORTS_HOTPLUG_CPU 2291 select SYS_SUPPORTS_HOTPLUG_CPU
@@ -2305,10 +2303,6 @@ config MIPS_CPS_PM
2305 select MIPS_CPC 2303 select MIPS_CPC
2306 bool 2304 bool
2307 2305
2308config MIPS_GIC_IPI
2309 depends on MIPS_GIC
2310 bool
2311
2312config MIPS_CM 2306config MIPS_CM
2313 bool 2307 bool
2314 2308
diff --git a/arch/mips/ath79/irq.c b/arch/mips/ath79/irq.c
index 511c06560dc1..2dfff1f19004 100644
--- a/arch/mips/ath79/irq.c
+++ b/arch/mips/ath79/irq.c
@@ -26,90 +26,6 @@
26#include "common.h" 26#include "common.h"
27#include "machtypes.h" 27#include "machtypes.h"
28 28
29static void __init ath79_misc_intc_domain_init(
30 struct device_node *node, int irq);
31
32static void ath79_misc_irq_handler(struct irq_desc *desc)
33{
34 struct irq_domain *domain = irq_desc_get_handler_data(desc);
35 void __iomem *base = domain->host_data;
36 u32 pending;
37
38 pending = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_STATUS) &
39 __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE);
40
41 if (!pending) {
42 spurious_interrupt();
43 return;
44 }
45
46 while (pending) {
47 int bit = __ffs(pending);
48
49 generic_handle_irq(irq_linear_revmap(domain, bit));
50 pending &= ~BIT(bit);
51 }
52}
53
54static void ar71xx_misc_irq_unmask(struct irq_data *d)
55{
56 void __iomem *base = irq_data_get_irq_chip_data(d);
57 unsigned int irq = d->hwirq;
58 u32 t;
59
60 t = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE);
61 __raw_writel(t | (1 << irq), base + AR71XX_RESET_REG_MISC_INT_ENABLE);
62
63 /* flush write */
64 __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE);
65}
66
67static void ar71xx_misc_irq_mask(struct irq_data *d)
68{
69 void __iomem *base = irq_data_get_irq_chip_data(d);
70 unsigned int irq = d->hwirq;
71 u32 t;
72
73 t = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE);
74 __raw_writel(t & ~(1 << irq), base + AR71XX_RESET_REG_MISC_INT_ENABLE);
75
76 /* flush write */
77 __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE);
78}
79
80static void ar724x_misc_irq_ack(struct irq_data *d)
81{
82 void __iomem *base = irq_data_get_irq_chip_data(d);
83 unsigned int irq = d->hwirq;
84 u32 t;
85
86 t = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_STATUS);
87 __raw_writel(t & ~(1 << irq), base + AR71XX_RESET_REG_MISC_INT_STATUS);
88
89 /* flush write */
90 __raw_readl(base + AR71XX_RESET_REG_MISC_INT_STATUS);
91}
92
93static struct irq_chip ath79_misc_irq_chip = {
94 .name = "MISC",
95 .irq_unmask = ar71xx_misc_irq_unmask,
96 .irq_mask = ar71xx_misc_irq_mask,
97};
98
99static void __init ath79_misc_irq_init(void)
100{
101 if (soc_is_ar71xx() || soc_is_ar913x())
102 ath79_misc_irq_chip.irq_mask_ack = ar71xx_misc_irq_mask;
103 else if (soc_is_ar724x() ||
104 soc_is_ar933x() ||
105 soc_is_ar934x() ||
106 soc_is_qca955x())
107 ath79_misc_irq_chip.irq_ack = ar724x_misc_irq_ack;
108 else
109 BUG();
110
111 ath79_misc_intc_domain_init(NULL, ATH79_CPU_IRQ(6));
112}
113 29
114static void ar934x_ip2_irq_dispatch(struct irq_desc *desc) 30static void ar934x_ip2_irq_dispatch(struct irq_desc *desc)
115{ 31{
@@ -212,142 +128,12 @@ static void qca955x_irq_init(void)
212 irq_set_chained_handler(ATH79_CPU_IRQ(3), qca955x_ip3_irq_dispatch); 128 irq_set_chained_handler(ATH79_CPU_IRQ(3), qca955x_ip3_irq_dispatch);
213} 129}
214 130
215/*
216 * The IP2/IP3 lines are tied to a PCI/WMAC/USB device. Drivers for
217 * these devices typically allocate coherent DMA memory, however the
218 * DMA controller may still have some unsynchronized data in the FIFO.
219 * Issue a flush in the handlers to ensure that the driver sees
220 * the update.
221 *
222 * This array map the interrupt lines to the DDR write buffer channels.
223 */
224
225static unsigned irq_wb_chan[8] = {
226 -1, -1, -1, -1, -1, -1, -1, -1,
227};
228
229asmlinkage void plat_irq_dispatch(void)
230{
231 unsigned long pending;
232 int irq;
233
234 pending = read_c0_status() & read_c0_cause() & ST0_IM;
235
236 if (!pending) {
237 spurious_interrupt();
238 return;
239 }
240
241 pending >>= CAUSEB_IP;
242 while (pending) {
243 irq = fls(pending) - 1;
244 if (irq < ARRAY_SIZE(irq_wb_chan) && irq_wb_chan[irq] != -1)
245 ath79_ddr_wb_flush(irq_wb_chan[irq]);
246 do_IRQ(MIPS_CPU_IRQ_BASE + irq);
247 pending &= ~BIT(irq);
248 }
249}
250
251static int misc_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hw)
252{
253 irq_set_chip_and_handler(irq, &ath79_misc_irq_chip, handle_level_irq);
254 irq_set_chip_data(irq, d->host_data);
255 return 0;
256}
257
258static const struct irq_domain_ops misc_irq_domain_ops = {
259 .xlate = irq_domain_xlate_onecell,
260 .map = misc_map,
261};
262
263static void __init ath79_misc_intc_domain_init(
264 struct device_node *node, int irq)
265{
266 void __iomem *base = ath79_reset_base;
267 struct irq_domain *domain;
268
269 domain = irq_domain_add_legacy(node, ATH79_MISC_IRQ_COUNT,
270 ATH79_MISC_IRQ_BASE, 0, &misc_irq_domain_ops, base);
271 if (!domain)
272 panic("Failed to add MISC irqdomain");
273
274 /* Disable and clear all interrupts */
275 __raw_writel(0, base + AR71XX_RESET_REG_MISC_INT_ENABLE);
276 __raw_writel(0, base + AR71XX_RESET_REG_MISC_INT_STATUS);
277
278 irq_set_chained_handler_and_data(irq, ath79_misc_irq_handler, domain);
279}
280
281static int __init ath79_misc_intc_of_init(
282 struct device_node *node, struct device_node *parent)
283{
284 int irq;
285
286 irq = irq_of_parse_and_map(node, 0);
287 if (!irq)
288 panic("Failed to get MISC IRQ");
289
290 ath79_misc_intc_domain_init(node, irq);
291 return 0;
292}
293
294static int __init ar7100_misc_intc_of_init(
295 struct device_node *node, struct device_node *parent)
296{
297 ath79_misc_irq_chip.irq_mask_ack = ar71xx_misc_irq_mask;
298 return ath79_misc_intc_of_init(node, parent);
299}
300
301IRQCHIP_DECLARE(ar7100_misc_intc, "qca,ar7100-misc-intc",
302 ar7100_misc_intc_of_init);
303
304static int __init ar7240_misc_intc_of_init(
305 struct device_node *node, struct device_node *parent)
306{
307 ath79_misc_irq_chip.irq_ack = ar724x_misc_irq_ack;
308 return ath79_misc_intc_of_init(node, parent);
309}
310
311IRQCHIP_DECLARE(ar7240_misc_intc, "qca,ar7240-misc-intc",
312 ar7240_misc_intc_of_init);
313
314static int __init ar79_cpu_intc_of_init(
315 struct device_node *node, struct device_node *parent)
316{
317 int err, i, count;
318
319 /* Fill the irq_wb_chan table */
320 count = of_count_phandle_with_args(
321 node, "qca,ddr-wb-channels", "#qca,ddr-wb-channel-cells");
322
323 for (i = 0; i < count; i++) {
324 struct of_phandle_args args;
325 u32 irq = i;
326
327 of_property_read_u32_index(
328 node, "qca,ddr-wb-channel-interrupts", i, &irq);
329 if (irq >= ARRAY_SIZE(irq_wb_chan))
330 continue;
331
332 err = of_parse_phandle_with_args(
333 node, "qca,ddr-wb-channels",
334 "#qca,ddr-wb-channel-cells",
335 i, &args);
336 if (err)
337 return err;
338
339 irq_wb_chan[irq] = args.args[0];
340 pr_info("IRQ: Set flush channel of IRQ%d to %d\n",
341 irq, args.args[0]);
342 }
343
344 return mips_cpu_irq_of_init(node, parent);
345}
346IRQCHIP_DECLARE(ar79_cpu_intc, "qca,ar7100-cpu-intc",
347 ar79_cpu_intc_of_init);
348
349void __init arch_init_irq(void) 131void __init arch_init_irq(void)
350{ 132{
133 unsigned irq_wb_chan2 = -1;
134 unsigned irq_wb_chan3 = -1;
135 bool misc_is_ar71xx;
136
351 if (mips_machtype == ATH79_MACH_GENERIC_OF) { 137 if (mips_machtype == ATH79_MACH_GENERIC_OF) {
352 irqchip_init(); 138 irqchip_init();
353 return; 139 return;
@@ -355,14 +141,26 @@ void __init arch_init_irq(void)
355 141
356 if (soc_is_ar71xx() || soc_is_ar724x() || 142 if (soc_is_ar71xx() || soc_is_ar724x() ||
357 soc_is_ar913x() || soc_is_ar933x()) { 143 soc_is_ar913x() || soc_is_ar933x()) {
358 irq_wb_chan[2] = 3; 144 irq_wb_chan2 = 3;
359 irq_wb_chan[3] = 2; 145 irq_wb_chan3 = 2;
360 } else if (soc_is_ar934x()) { 146 } else if (soc_is_ar934x()) {
361 irq_wb_chan[3] = 2; 147 irq_wb_chan3 = 2;
362 } 148 }
363 149
364 mips_cpu_irq_init(); 150 ath79_cpu_irq_init(irq_wb_chan2, irq_wb_chan3);
365 ath79_misc_irq_init(); 151
152 if (soc_is_ar71xx() || soc_is_ar913x())
153 misc_is_ar71xx = true;
154 else if (soc_is_ar724x() ||
155 soc_is_ar933x() ||
156 soc_is_ar934x() ||
157 soc_is_qca955x())
158 misc_is_ar71xx = false;
159 else
160 BUG();
161 ath79_misc_irq_init(
162 ath79_reset_base + AR71XX_RESET_REG_MISC_INT_STATUS,
163 ATH79_CPU_IRQ(6), ATH79_MISC_IRQ_BASE, misc_is_ar71xx);
366 164
367 if (soc_is_ar934x()) 165 if (soc_is_ar934x())
368 ar934x_ip2_irq_init(); 166 ar934x_ip2_irq_init();
diff --git a/arch/mips/bmips/irq.c b/arch/mips/bmips/irq.c
index e7fc6f9348ba..7efefcf44033 100644
--- a/arch/mips/bmips/irq.c
+++ b/arch/mips/bmips/irq.c
@@ -15,6 +15,12 @@
15#include <asm/irq_cpu.h> 15#include <asm/irq_cpu.h>
16#include <asm/time.h> 16#include <asm/time.h>
17 17
18static const struct of_device_id smp_intc_dt_match[] = {
19 { .compatible = "brcm,bcm7038-l1-intc" },
20 { .compatible = "brcm,bcm6345-l1-intc" },
21 {}
22};
23
18unsigned int get_c0_compare_int(void) 24unsigned int get_c0_compare_int(void)
19{ 25{
20 return CP0_LEGACY_COMPARE_IRQ; 26 return CP0_LEGACY_COMPARE_IRQ;
@@ -24,8 +30,8 @@ void __init arch_init_irq(void)
24{ 30{
25 struct device_node *dn; 31 struct device_node *dn;
26 32
27 /* Only the STB (bcm7038) controller supports SMP IRQ affinity */ 33 /* Only these controllers support SMP IRQ affinity */
28 dn = of_find_compatible_node(NULL, NULL, "brcm,bcm7038-l1-intc"); 34 dn = of_find_matching_node(NULL, smp_intc_dt_match);
29 if (dn) 35 if (dn)
30 of_node_put(dn); 36 of_node_put(dn);
31 else 37 else
diff --git a/arch/mips/include/asm/mach-ath79/ath79.h b/arch/mips/include/asm/mach-ath79/ath79.h
index 2b3487213d1e..441faa92c3cd 100644
--- a/arch/mips/include/asm/mach-ath79/ath79.h
+++ b/arch/mips/include/asm/mach-ath79/ath79.h
@@ -144,4 +144,8 @@ static inline u32 ath79_reset_rr(unsigned reg)
144void ath79_device_reset_set(u32 mask); 144void ath79_device_reset_set(u32 mask);
145void ath79_device_reset_clear(u32 mask); 145void ath79_device_reset_clear(u32 mask);
146 146
147void ath79_cpu_irq_init(unsigned irq_wb_chan2, unsigned irq_wb_chan3);
148void ath79_misc_irq_init(void __iomem *regs, int irq,
149 int irq_base, bool is_ar71xx);
150
147#endif /* __ASM_MACH_ATH79_H */ 151#endif /* __ASM_MACH_ATH79_H */
diff --git a/arch/mips/include/asm/smp-ops.h b/arch/mips/include/asm/smp-ops.h
index 6ba1fb8b11e2..db7c322f057f 100644
--- a/arch/mips/include/asm/smp-ops.h
+++ b/arch/mips/include/asm/smp-ops.h
@@ -44,8 +44,9 @@ static inline void plat_smp_setup(void)
44 mp_ops->smp_setup(); 44 mp_ops->smp_setup();
45} 45}
46 46
47extern void gic_send_ipi_single(int cpu, unsigned int action); 47extern void mips_smp_send_ipi_single(int cpu, unsigned int action);
48extern void gic_send_ipi_mask(const struct cpumask *mask, unsigned int action); 48extern void mips_smp_send_ipi_mask(const struct cpumask *mask,
49 unsigned int action);
49 50
50#else /* !CONFIG_SMP */ 51#else /* !CONFIG_SMP */
51 52
diff --git a/arch/mips/kernel/Makefile b/arch/mips/kernel/Makefile
index 68e2b7db9348..b0988fd62fcc 100644
--- a/arch/mips/kernel/Makefile
+++ b/arch/mips/kernel/Makefile
@@ -52,7 +52,6 @@ obj-$(CONFIG_MIPS_MT_SMP) += smp-mt.o
52obj-$(CONFIG_MIPS_CMP) += smp-cmp.o 52obj-$(CONFIG_MIPS_CMP) += smp-cmp.o
53obj-$(CONFIG_MIPS_CPS) += smp-cps.o cps-vec.o 53obj-$(CONFIG_MIPS_CPS) += smp-cps.o cps-vec.o
54obj-$(CONFIG_MIPS_CPS_NS16550) += cps-vec-ns16550.o 54obj-$(CONFIG_MIPS_CPS_NS16550) += cps-vec-ns16550.o
55obj-$(CONFIG_MIPS_GIC_IPI) += smp-gic.o
56obj-$(CONFIG_MIPS_SPRAM) += spram.o 55obj-$(CONFIG_MIPS_SPRAM) += spram.o
57 56
58obj-$(CONFIG_MIPS_VPE_LOADER) += vpe.o 57obj-$(CONFIG_MIPS_VPE_LOADER) += vpe.o
diff --git a/arch/mips/kernel/smp-cmp.c b/arch/mips/kernel/smp-cmp.c
index d5e0f949dc48..76923349b4fe 100644
--- a/arch/mips/kernel/smp-cmp.c
+++ b/arch/mips/kernel/smp-cmp.c
@@ -149,8 +149,8 @@ void __init cmp_prepare_cpus(unsigned int max_cpus)
149} 149}
150 150
151struct plat_smp_ops cmp_smp_ops = { 151struct plat_smp_ops cmp_smp_ops = {
152 .send_ipi_single = gic_send_ipi_single, 152 .send_ipi_single = mips_smp_send_ipi_single,
153 .send_ipi_mask = gic_send_ipi_mask, 153 .send_ipi_mask = mips_smp_send_ipi_mask,
154 .init_secondary = cmp_init_secondary, 154 .init_secondary = cmp_init_secondary,
155 .smp_finish = cmp_smp_finish, 155 .smp_finish = cmp_smp_finish,
156 .boot_secondary = cmp_boot_secondary, 156 .boot_secondary = cmp_boot_secondary,
diff --git a/arch/mips/kernel/smp-cps.c b/arch/mips/kernel/smp-cps.c
index 2ad4e4c96d61..253e1409338c 100644
--- a/arch/mips/kernel/smp-cps.c
+++ b/arch/mips/kernel/smp-cps.c
@@ -472,8 +472,8 @@ static struct plat_smp_ops cps_smp_ops = {
472 .boot_secondary = cps_boot_secondary, 472 .boot_secondary = cps_boot_secondary,
473 .init_secondary = cps_init_secondary, 473 .init_secondary = cps_init_secondary,
474 .smp_finish = cps_smp_finish, 474 .smp_finish = cps_smp_finish,
475 .send_ipi_single = gic_send_ipi_single, 475 .send_ipi_single = mips_smp_send_ipi_single,
476 .send_ipi_mask = gic_send_ipi_mask, 476 .send_ipi_mask = mips_smp_send_ipi_mask,
477#ifdef CONFIG_HOTPLUG_CPU 477#ifdef CONFIG_HOTPLUG_CPU
478 .cpu_disable = cps_cpu_disable, 478 .cpu_disable = cps_cpu_disable,
479 .cpu_die = cps_cpu_die, 479 .cpu_die = cps_cpu_die,
diff --git a/arch/mips/kernel/smp-mt.c b/arch/mips/kernel/smp-mt.c
index 86311a164ef1..4f9570a57e8d 100644
--- a/arch/mips/kernel/smp-mt.c
+++ b/arch/mips/kernel/smp-mt.c
@@ -121,7 +121,7 @@ static void vsmp_send_ipi_single(int cpu, unsigned int action)
121 121
122#ifdef CONFIG_MIPS_GIC 122#ifdef CONFIG_MIPS_GIC
123 if (gic_present) { 123 if (gic_present) {
124 gic_send_ipi_single(cpu, action); 124 mips_smp_send_ipi_single(cpu, action);
125 return; 125 return;
126 } 126 }
127#endif 127#endif
diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c
index 2b521e07b860..37708d9af638 100644
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -33,12 +33,16 @@
33#include <linux/cpu.h> 33#include <linux/cpu.h>
34#include <linux/err.h> 34#include <linux/err.h>
35#include <linux/ftrace.h> 35#include <linux/ftrace.h>
36#include <linux/irqdomain.h>
37#include <linux/of.h>
38#include <linux/of_irq.h>
36 39
37#include <linux/atomic.h> 40#include <linux/atomic.h>
38#include <asm/cpu.h> 41#include <asm/cpu.h>
39#include <asm/processor.h> 42#include <asm/processor.h>
40#include <asm/idle.h> 43#include <asm/idle.h>
41#include <asm/r4k-timer.h> 44#include <asm/r4k-timer.h>
45#include <asm/mips-cpc.h>
42#include <asm/mmu_context.h> 46#include <asm/mmu_context.h>
43#include <asm/time.h> 47#include <asm/time.h>
44#include <asm/setup.h> 48#include <asm/setup.h>
@@ -79,6 +83,11 @@ static cpumask_t cpu_core_setup_map;
79 83
80cpumask_t cpu_coherent_mask; 84cpumask_t cpu_coherent_mask;
81 85
86#ifdef CONFIG_GENERIC_IRQ_IPI
87static struct irq_desc *call_desc;
88static struct irq_desc *sched_desc;
89#endif
90
82static inline void set_cpu_sibling_map(int cpu) 91static inline void set_cpu_sibling_map(int cpu)
83{ 92{
84 int i; 93 int i;
@@ -146,6 +155,133 @@ void register_smp_ops(struct plat_smp_ops *ops)
146 mp_ops = ops; 155 mp_ops = ops;
147} 156}
148 157
158#ifdef CONFIG_GENERIC_IRQ_IPI
159void mips_smp_send_ipi_single(int cpu, unsigned int action)
160{
161 mips_smp_send_ipi_mask(cpumask_of(cpu), action);
162}
163
164void mips_smp_send_ipi_mask(const struct cpumask *mask, unsigned int action)
165{
166 unsigned long flags;
167 unsigned int core;
168 int cpu;
169
170 local_irq_save(flags);
171
172 switch (action) {
173 case SMP_CALL_FUNCTION:
174 __ipi_send_mask(call_desc, mask);
175 break;
176
177 case SMP_RESCHEDULE_YOURSELF:
178 __ipi_send_mask(sched_desc, mask);
179 break;
180
181 default:
182 BUG();
183 }
184
185 if (mips_cpc_present()) {
186 for_each_cpu(cpu, mask) {
187 core = cpu_data[cpu].core;
188
189 if (core == current_cpu_data.core)
190 continue;
191
192 while (!cpumask_test_cpu(cpu, &cpu_coherent_mask)) {
193 mips_cpc_lock_other(core);
194 write_cpc_co_cmd(CPC_Cx_CMD_PWRUP);
195 mips_cpc_unlock_other();
196 }
197 }
198 }
199
200 local_irq_restore(flags);
201}
202
203
204static irqreturn_t ipi_resched_interrupt(int irq, void *dev_id)
205{
206 scheduler_ipi();
207
208 return IRQ_HANDLED;
209}
210
211static irqreturn_t ipi_call_interrupt(int irq, void *dev_id)
212{
213 generic_smp_call_function_interrupt();
214
215 return IRQ_HANDLED;
216}
217
218static struct irqaction irq_resched = {
219 .handler = ipi_resched_interrupt,
220 .flags = IRQF_PERCPU,
221 .name = "IPI resched"
222};
223
224static struct irqaction irq_call = {
225 .handler = ipi_call_interrupt,
226 .flags = IRQF_PERCPU,
227 .name = "IPI call"
228};
229
230static __init void smp_ipi_init_one(unsigned int virq,
231 struct irqaction *action)
232{
233 int ret;
234
235 irq_set_handler(virq, handle_percpu_irq);
236 ret = setup_irq(virq, action);
237 BUG_ON(ret);
238}
239
240static int __init mips_smp_ipi_init(void)
241{
242 unsigned int call_virq, sched_virq;
243 struct irq_domain *ipidomain;
244 struct device_node *node;
245
246 node = of_irq_find_parent(of_root);
247 ipidomain = irq_find_matching_host(node, DOMAIN_BUS_IPI);
248
249 /*
250 * Some platforms have half DT setup. So if we found irq node but
251 * didn't find an ipidomain, try to search for one that is not in the
252 * DT.
253 */
254 if (node && !ipidomain)
255 ipidomain = irq_find_matching_host(NULL, DOMAIN_BUS_IPI);
256
257 BUG_ON(!ipidomain);
258
259 call_virq = irq_reserve_ipi(ipidomain, cpu_possible_mask);
260 BUG_ON(!call_virq);
261
262 sched_virq = irq_reserve_ipi(ipidomain, cpu_possible_mask);
263 BUG_ON(!sched_virq);
264
265 if (irq_domain_is_ipi_per_cpu(ipidomain)) {
266 int cpu;
267
268 for_each_cpu(cpu, cpu_possible_mask) {
269 smp_ipi_init_one(call_virq + cpu, &irq_call);
270 smp_ipi_init_one(sched_virq + cpu, &irq_resched);
271 }
272 } else {
273 smp_ipi_init_one(call_virq, &irq_call);
274 smp_ipi_init_one(sched_virq, &irq_resched);
275 }
276
277 call_desc = irq_to_desc(call_virq);
278 sched_desc = irq_to_desc(sched_virq);
279
280 return 0;
281}
282early_initcall(mips_smp_ipi_init);
283#endif
284
149/* 285/*
150 * First C code run on the secondary CPUs after being started up by 286 * First C code run on the secondary CPUs after being started up by
151 * the master. 287 * the master.
@@ -192,7 +328,7 @@ asmlinkage void start_secondary(void)
192 WARN_ON_ONCE(!irqs_disabled()); 328 WARN_ON_ONCE(!irqs_disabled());
193 mp_ops->smp_finish(); 329 mp_ops->smp_finish();
194 330
195 cpu_startup_entry(CPUHP_ONLINE); 331 cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
196} 332}
197 333
198static void stop_this_cpu(void *dummy) 334static void stop_this_cpu(void *dummy)
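
To make the new GENERIC_IRQ_IPI path in the hunk above a bit more concrete: the SMP_CALL_FUNCTION IPI registered here backs the kernel's generic cross-call API, so ordinary callers never touch mips_smp_send_ipi_mask() directly. A rough sketch of such a caller follows; the demo_* names are invented for illustration, while smp_call_function_single() is the real core API:

	#include <linux/smp.h>
	#include <linux/printk.h>

	/* Runs on the target CPU, in IPI (hard interrupt) context. */
	static void demo_remote_fn(void *info)
	{
		pr_info("hello from CPU %d\n", smp_processor_id());
	}

	/* Hypothetical caller: run demo_remote_fn() on @cpu and wait for it. */
	static int demo_cross_call(int cpu)
	{
		/*
		 * Via the arch send_ipi hooks this ends up in
		 * mips_smp_send_ipi_mask(SMP_CALL_FUNCTION), which fires the
		 * "IPI call" vector handled by ipi_call_interrupt() above;
		 * the scheduler's resched IPI takes the analogous
		 * "IPI resched" route.
		 */
		return smp_call_function_single(cpu, demo_remote_fn, NULL, 1);
	}
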
diff --git a/arch/mn10300/kernel/smp.c b/arch/mn10300/kernel/smp.c
index f984193718b1..426173c4b0b9 100644
--- a/arch/mn10300/kernel/smp.c
+++ b/arch/mn10300/kernel/smp.c
@@ -675,7 +675,7 @@ int __init start_secondary(void *unused)
675#ifdef CONFIG_GENERIC_CLOCKEVENTS 675#ifdef CONFIG_GENERIC_CLOCKEVENTS
676 init_clockevents(); 676 init_clockevents();
677#endif 677#endif
678 cpu_startup_entry(CPUHP_ONLINE); 678 cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
679 return 0; 679 return 0;
680} 680}
681 681
diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c
index 52e85973a283..c2a9cc55a62f 100644
--- a/arch/parisc/kernel/smp.c
+++ b/arch/parisc/kernel/smp.c
@@ -305,7 +305,7 @@ void __init smp_callin(void)
305 305
306 local_irq_enable(); /* Interrupts have been off until now */ 306 local_irq_enable(); /* Interrupts have been off until now */
307 307
308 cpu_startup_entry(CPUHP_ONLINE); 308 cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
309 309
310 /* NOTREACHED */ 310 /* NOTREACHED */
311 panic("smp_callin() AAAAaaaaahhhh....\n"); 311 panic("smp_callin() AAAAaaaaahhhh....\n");
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index ec9ec2058d2d..cc13d4c83291 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -727,7 +727,7 @@ void start_secondary(void *unused)
727 727
728 local_irq_enable(); 728 local_irq_enable();
729 729
730 cpu_startup_entry(CPUHP_ONLINE); 730 cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
731 731
732 BUG(); 732 BUG();
733} 733}
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 3c65a8eae34d..40a6b4f9c36c 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -798,7 +798,7 @@ static void smp_start_secondary(void *cpuvoid)
798 set_cpu_online(smp_processor_id(), true); 798 set_cpu_online(smp_processor_id(), true);
799 inc_irq_stat(CPU_RST); 799 inc_irq_stat(CPU_RST);
800 local_irq_enable(); 800 local_irq_enable();
801 cpu_startup_entry(CPUHP_ONLINE); 801 cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
802} 802}
803 803
804/* Upping and downing of CPUs */ 804/* Upping and downing of CPUs */
diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c
index de6be008fc01..13f633add29a 100644
--- a/arch/sh/kernel/smp.c
+++ b/arch/sh/kernel/smp.c
@@ -203,7 +203,7 @@ asmlinkage void start_secondary(void)
203 set_cpu_online(cpu, true); 203 set_cpu_online(cpu, true);
204 per_cpu(cpu_state, cpu) = CPU_ONLINE; 204 per_cpu(cpu_state, cpu) = CPU_ONLINE;
205 205
206 cpu_startup_entry(CPUHP_ONLINE); 206 cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
207} 207}
208 208
209extern struct { 209extern struct {
diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c
index b3a5d81b20f0..fb30e7c6a5b1 100644
--- a/arch/sparc/kernel/smp_32.c
+++ b/arch/sparc/kernel/smp_32.c
@@ -364,7 +364,7 @@ static void sparc_start_secondary(void *arg)
364 local_irq_enable(); 364 local_irq_enable();
365 365
366 wmb(); 366 wmb();
367 cpu_startup_entry(CPUHP_ONLINE); 367 cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
368 368
369 /* We should never reach here! */ 369 /* We should never reach here! */
370 BUG(); 370 BUG();
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index 19cd08d18672..8a6151a628ce 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -134,7 +134,7 @@ void smp_callin(void)
134 134
135 local_irq_enable(); 135 local_irq_enable();
136 136
137 cpu_startup_entry(CPUHP_ONLINE); 137 cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
138} 138}
139 139
140void cpu_panic(void) 140void cpu_panic(void)
diff --git a/arch/tile/kernel/smpboot.c b/arch/tile/kernel/smpboot.c
index 20d52a98e171..6c0abaacec33 100644
--- a/arch/tile/kernel/smpboot.c
+++ b/arch/tile/kernel/smpboot.c
@@ -208,7 +208,7 @@ void online_secondary(void)
208 /* Set up tile-timer clock-event device on this cpu */ 208 /* Set up tile-timer clock-event device on this cpu */
209 setup_tile_timer(); 209 setup_tile_timer();
210 210
211 cpu_startup_entry(CPUHP_ONLINE); 211 cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
212} 212}
213 213
214int __cpu_up(unsigned int cpu, struct task_struct *tidle) 214int __cpu_up(unsigned int cpu, struct task_struct *tidle)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b1051057e5b0..8f2e6659281b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1163,22 +1163,23 @@ config MICROCODE
1163 bool "CPU microcode loading support" 1163 bool "CPU microcode loading support"
1164 default y 1164 default y
1165 depends on CPU_SUP_AMD || CPU_SUP_INTEL 1165 depends on CPU_SUP_AMD || CPU_SUP_INTEL
1166 depends on BLK_DEV_INITRD
1167 select FW_LOADER 1166 select FW_LOADER
1168 ---help--- 1167 ---help---
1169
1170 If you say Y here, you will be able to update the microcode on 1168 If you say Y here, you will be able to update the microcode on
1171 certain Intel and AMD processors. The Intel support is for the 1169 Intel and AMD processors. The Intel support is for the IA32 family,
1172 IA32 family, e.g. Pentium Pro, Pentium II, Pentium III, Pentium 4, 1170 e.g. Pentium Pro, Pentium II, Pentium III, Pentium 4, Xeon etc. The
1173 Xeon etc. The AMD support is for families 0x10 and later. You will 1171 AMD support is for families 0x10 and later. You will obviously need
1174 obviously need the actual microcode binary data itself which is not 1172 the actual microcode binary data itself which is not shipped with
1175 shipped with the Linux kernel. 1173 the Linux kernel.
1176 1174
1177 This option selects the general module only, you need to select 1175 The preferred method to load microcode from a detached initrd is described
1178 at least one vendor specific module as well. 1176 in Documentation/x86/early-microcode.txt. For that you need to enable
1179 1177 CONFIG_BLK_DEV_INITRD in order for the loader to be able to scan the
1180 To compile this driver as a module, choose M here: the module 1178 initrd for microcode blobs.
1181 will be called microcode. 1179
1180 In addition, you can build-in the microcode into the kernel. For that you
1181 need to enable FIRMWARE_IN_KERNEL and add the vendor-supplied microcode
1182 to the CONFIG_EXTRA_FIRMWARE config option.
1182 1183
1183config MICROCODE_INTEL 1184config MICROCODE_INTEL
1184 bool "Intel microcode loading support" 1185 bool "Intel microcode loading support"
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 7816b7b276f4..67eec55093a5 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -338,16 +338,6 @@ config DEBUG_IMR_SELFTEST
338 338
339 If unsure say N here. 339 If unsure say N here.
340 340
341config X86_DEBUG_STATIC_CPU_HAS
342 bool "Debug alternatives"
343 depends on DEBUG_KERNEL
344 ---help---
345 This option causes additional code to be generated which
346 fails if static_cpu_has() is used before alternatives have
347 run.
348
349 If unsure, say N.
350
351config X86_DEBUG_FPU 341config X86_DEBUG_FPU
352 bool "Debug the x86 FPU code" 342 bool "Debug the x86 FPU code"
353 depends on DEBUG_KERNEL 343 depends on DEBUG_KERNEL
diff --git a/arch/x86/boot/cpuflags.h b/arch/x86/boot/cpuflags.h
index ea97697e51e4..4cb404fd45ce 100644
--- a/arch/x86/boot/cpuflags.h
+++ b/arch/x86/boot/cpuflags.h
@@ -1,7 +1,7 @@
1#ifndef BOOT_CPUFLAGS_H 1#ifndef BOOT_CPUFLAGS_H
2#define BOOT_CPUFLAGS_H 2#define BOOT_CPUFLAGS_H
3 3
4#include <asm/cpufeature.h> 4#include <asm/cpufeatures.h>
5#include <asm/processor-flags.h> 5#include <asm/processor-flags.h>
6 6
7struct cpu_features { 7struct cpu_features {
diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c
index 637097e66a62..f72498dc90d2 100644
--- a/arch/x86/boot/mkcpustr.c
+++ b/arch/x86/boot/mkcpustr.c
@@ -17,7 +17,7 @@
17 17
18#include "../include/asm/required-features.h" 18#include "../include/asm/required-features.h"
19#include "../include/asm/disabled-features.h" 19#include "../include/asm/disabled-features.h"
20#include "../include/asm/cpufeature.h" 20#include "../include/asm/cpufeatures.h"
21#include "../kernel/cpu/capflags.c" 21#include "../kernel/cpu/capflags.c"
22 22
23int main(void) 23int main(void)
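The <asm/cpufeature.h> to <asm/cpufeatures.h> switches in the hunks above, and in the crc32/crc32c/crct10dif glue files further down, follow a header split: the X86_FEATURE_* bit definitions now live in cpufeatures.h, while cpufeature.h keeps the C helpers built on top of them. A short sketch of which header a caller wants (illustrative, not taken from this diff):

#include <linux/types.h>
#include <asm/cpufeatures.h>	/* X86_FEATURE_* bit numbers; usable from asm and boot code */
#include <asm/cpufeature.h>	/* boot_cpu_has()/static_cpu_has() helpers */

static bool have_pclmulqdq(void)
{
	/*
	 * Tables and assembly only need the constants from cpufeatures.h;
	 * run-time tests like this one still need cpufeature.h.
	 */
	return boot_cpu_has(X86_FEATURE_PCLMULQDQ);
}
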
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index a7661c430cd9..0702d2531bc7 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -49,7 +49,6 @@ typedef unsigned int u32;
49 49
50/* This must be large enough to hold the entire setup */ 50/* This must be large enough to hold the entire setup */
51u8 buf[SETUP_SECT_MAX*512]; 51u8 buf[SETUP_SECT_MAX*512];
52int is_big_kernel;
53 52
54#define PECOFF_RELOC_RESERVE 0x20 53#define PECOFF_RELOC_RESERVE 0x20
55 54
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 028be48c8839..e25a1630320c 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -288,7 +288,7 @@ CONFIG_NLS_ISO8859_1=y
288CONFIG_NLS_UTF8=y 288CONFIG_NLS_UTF8=y
289CONFIG_PRINTK_TIME=y 289CONFIG_PRINTK_TIME=y
290# CONFIG_ENABLE_WARN_DEPRECATED is not set 290# CONFIG_ENABLE_WARN_DEPRECATED is not set
291CONFIG_FRAME_WARN=2048 291CONFIG_FRAME_WARN=1024
292CONFIG_MAGIC_SYSRQ=y 292CONFIG_MAGIC_SYSRQ=y
293# CONFIG_UNUSED_SYMBOLS is not set 293# CONFIG_UNUSED_SYMBOLS is not set
294CONFIG_DEBUG_KERNEL=y 294CONFIG_DEBUG_KERNEL=y
diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c
index 07d2c6c86a54..27226df3f7d8 100644
--- a/arch/x86/crypto/crc32-pclmul_glue.c
+++ b/arch/x86/crypto/crc32-pclmul_glue.c
@@ -33,7 +33,7 @@
33#include <linux/crc32.h> 33#include <linux/crc32.h>
34#include <crypto/internal/hash.h> 34#include <crypto/internal/hash.h>
35 35
36#include <asm/cpufeature.h> 36#include <asm/cpufeatures.h>
37#include <asm/cpu_device_id.h> 37#include <asm/cpu_device_id.h>
38#include <asm/fpu/api.h> 38#include <asm/fpu/api.h>
39 39
diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c
index 0e9871693f24..0857b1a1de3b 100644
--- a/arch/x86/crypto/crc32c-intel_glue.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -30,7 +30,7 @@
30#include <linux/kernel.h> 30#include <linux/kernel.h>
31#include <crypto/internal/hash.h> 31#include <crypto/internal/hash.h>
32 32
33#include <asm/cpufeature.h> 33#include <asm/cpufeatures.h>
34#include <asm/cpu_device_id.h> 34#include <asm/cpu_device_id.h>
35#include <asm/fpu/internal.h> 35#include <asm/fpu/internal.h>
36 36
diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c
index a3fcfc97a311..cd4df9322501 100644
--- a/arch/x86/crypto/crct10dif-pclmul_glue.c
+++ b/arch/x86/crypto/crct10dif-pclmul_glue.c
@@ -30,7 +30,7 @@
30#include <linux/string.h> 30#include <linux/string.h>
31#include <linux/kernel.h> 31#include <linux/kernel.h>
32#include <asm/fpu/api.h> 32#include <asm/fpu/api.h>
33#include <asm/cpufeature.h> 33#include <asm/cpufeatures.h>
34#include <asm/cpu_device_id.h> 34#include <asm/cpu_device_id.h>
35 35
36asmlinkage __u16 crc_t10dif_pcl(__u16 crc, const unsigned char *buf, 36asmlinkage __u16 crc_t10dif_pcl(__u16 crc, const unsigned char *buf,
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index e32206e09868..9a9e5884066c 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -201,37 +201,6 @@ For 32-bit we have the following conventions - kernel is built with
201 .byte 0xf1 201 .byte 0xf1
202 .endm 202 .endm
203 203
204#else /* CONFIG_X86_64 */
205
206/*
207 * For 32bit only simplified versions of SAVE_ALL/RESTORE_ALL. These
208 * are different from the entry_32.S versions in not changing the segment
209 * registers. So only suitable for in kernel use, not when transitioning
210 * from or to user space. The resulting stack frame is not a standard
211 * pt_regs frame. The main use case is calling C code from assembler
212 * when all the registers need to be preserved.
213 */
214
215 .macro SAVE_ALL
216 pushl %eax
217 pushl %ebp
218 pushl %edi
219 pushl %esi
220 pushl %edx
221 pushl %ecx
222 pushl %ebx
223 .endm
224
225 .macro RESTORE_ALL
226 popl %ebx
227 popl %ecx
228 popl %edx
229 popl %esi
230 popl %edi
231 popl %ebp
232 popl %eax
233 .endm
234
235#endif /* CONFIG_X86_64 */ 204#endif /* CONFIG_X86_64 */
236 205
237/* 206/*
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 03663740c866..e79d93d44ecd 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -26,6 +26,7 @@
26#include <asm/traps.h> 26#include <asm/traps.h>
27#include <asm/vdso.h> 27#include <asm/vdso.h>
28#include <asm/uaccess.h> 28#include <asm/uaccess.h>
29#include <asm/cpufeature.h>
29 30
30#define CREATE_TRACE_POINTS 31#define CREATE_TRACE_POINTS
31#include <trace/events/syscalls.h> 32#include <trace/events/syscalls.h>
@@ -44,6 +45,8 @@ __visible void enter_from_user_mode(void)
44 CT_WARN_ON(ct_state() != CONTEXT_USER); 45 CT_WARN_ON(ct_state() != CONTEXT_USER);
45 user_exit(); 46 user_exit();
46} 47}
48#else
49static inline void enter_from_user_mode(void) {}
47#endif 50#endif
48 51
49static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) 52static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
@@ -84,17 +87,6 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
84 87
85 work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY; 88 work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY;
86 89
87#ifdef CONFIG_CONTEXT_TRACKING
88 /*
89 * If TIF_NOHZ is set, we are required to call user_exit() before
90 * doing anything that could touch RCU.
91 */
92 if (work & _TIF_NOHZ) {
93 enter_from_user_mode();
94 work &= ~_TIF_NOHZ;
95 }
96#endif
97
98#ifdef CONFIG_SECCOMP 90#ifdef CONFIG_SECCOMP
99 /* 91 /*
100 * Do seccomp first -- it should minimize exposure of other 92 * Do seccomp first -- it should minimize exposure of other
@@ -171,16 +163,6 @@ long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
171 if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) 163 if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
172 BUG_ON(regs != task_pt_regs(current)); 164 BUG_ON(regs != task_pt_regs(current));
173 165
174 /*
175 * If we stepped into a sysenter/syscall insn, it trapped in
176 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
177 * If user-mode had set TF itself, then it's still clear from
178 * do_debug() and we need to set it again to restore the user
179 * state. If we entered on the slow path, TF was already set.
180 */
181 if (work & _TIF_SINGLESTEP)
182 regs->flags |= X86_EFLAGS_TF;
183
184#ifdef CONFIG_SECCOMP 166#ifdef CONFIG_SECCOMP
185 /* 167 /*
186 * Call seccomp_phase2 before running the other hooks so that 168 * Call seccomp_phase2 before running the other hooks so that
@@ -268,6 +250,7 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
268/* Called with IRQs disabled. */ 250/* Called with IRQs disabled. */
269__visible inline void prepare_exit_to_usermode(struct pt_regs *regs) 251__visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
270{ 252{
253 struct thread_info *ti = pt_regs_to_thread_info(regs);
271 u32 cached_flags; 254 u32 cached_flags;
272 255
273 if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled())) 256 if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
@@ -275,12 +258,22 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
275 258
276 lockdep_sys_exit(); 259 lockdep_sys_exit();
277 260
278 cached_flags = 261 cached_flags = READ_ONCE(ti->flags);
279 READ_ONCE(pt_regs_to_thread_info(regs)->flags);
280 262
281 if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) 263 if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
282 exit_to_usermode_loop(regs, cached_flags); 264 exit_to_usermode_loop(regs, cached_flags);
283 265
266#ifdef CONFIG_COMPAT
267 /*
268 * Compat syscalls set TS_COMPAT. Make sure we clear it before
269 * returning to user mode. We need to clear it *after* signal
270 * handling, because syscall restart has a fixup for compat
271 * syscalls. The fixup is exercised by the ptrace_syscall_32
272 * selftest.
273 */
274 ti->status &= ~TS_COMPAT;
275#endif
276
284 user_enter(); 277 user_enter();
285} 278}
286 279
@@ -332,33 +325,45 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs)
332 if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS)) 325 if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS))
333 syscall_slow_exit_work(regs, cached_flags); 326 syscall_slow_exit_work(regs, cached_flags);
334 327
335#ifdef CONFIG_COMPAT 328 local_irq_disable();
329 prepare_exit_to_usermode(regs);
330}
331
332#ifdef CONFIG_X86_64
333__visible void do_syscall_64(struct pt_regs *regs)
334{
335 struct thread_info *ti = pt_regs_to_thread_info(regs);
336 unsigned long nr = regs->orig_ax;
337
338 enter_from_user_mode();
339 local_irq_enable();
340
341 if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
342 nr = syscall_trace_enter(regs);
343
336 /* 344 /*
337 * Compat syscalls set TS_COMPAT. Make sure we clear it before 345 * NB: Native and x32 syscalls are dispatched from the same
338 * returning to user mode. 346 * table. The only functional difference is the x32 bit in
347 * regs->orig_ax, which changes the behavior of some syscalls.
339 */ 348 */
340 ti->status &= ~TS_COMPAT; 349 if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) {
341#endif 350 regs->ax = sys_call_table[nr & __SYSCALL_MASK](
351 regs->di, regs->si, regs->dx,
352 regs->r10, regs->r8, regs->r9);
353 }
342 354
343 local_irq_disable(); 355 syscall_return_slowpath(regs);
344 prepare_exit_to_usermode(regs);
345} 356}
357#endif
346 358
347#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) 359#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
348/* 360/*
349 * Does a 32-bit syscall. Called with IRQs on and does all entry and 361 * Does a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. Does
350 * exit work and returns with IRQs off. This function is extremely hot 362 * all entry and exit work and returns with IRQs off. This function is
351 * in workloads that use it, and it's usually called from 363 * extremely hot in workloads that use it, and it's usually called from
352 * do_fast_syscall_32, so forcibly inline it to improve performance. 364 * do_fast_syscall_32, so forcibly inline it to improve performance.
353 */ 365 */
354#ifdef CONFIG_X86_32 366static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
355/* 32-bit kernels use a trap gate for INT80, and the asm code calls here. */
356__visible
357#else
358/* 64-bit kernels use do_syscall_32_irqs_off() instead. */
359static
360#endif
361__always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
362{ 367{
363 struct thread_info *ti = pt_regs_to_thread_info(regs); 368 struct thread_info *ti = pt_regs_to_thread_info(regs);
364 unsigned int nr = (unsigned int)regs->orig_ax; 369 unsigned int nr = (unsigned int)regs->orig_ax;
@@ -393,14 +398,13 @@ __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
393 syscall_return_slowpath(regs); 398 syscall_return_slowpath(regs);
394} 399}
395 400
396#ifdef CONFIG_X86_64 401/* Handles int $0x80 */
397/* Handles INT80 on 64-bit kernels */ 402__visible void do_int80_syscall_32(struct pt_regs *regs)
398__visible void do_syscall_32_irqs_off(struct pt_regs *regs)
399{ 403{
404 enter_from_user_mode();
400 local_irq_enable(); 405 local_irq_enable();
401 do_syscall_32_irqs_on(regs); 406 do_syscall_32_irqs_on(regs);
402} 407}
403#endif
404 408
405/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ 409/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
406__visible long do_fast_syscall_32(struct pt_regs *regs) 410__visible long do_fast_syscall_32(struct pt_regs *regs)
@@ -420,12 +424,11 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
420 */ 424 */
421 regs->ip = landing_pad; 425 regs->ip = landing_pad;
422 426
423 /* 427 enter_from_user_mode();
424 * Fetch EBP from where the vDSO stashed it. 428
425 *
426 * WARNING: We are in CONTEXT_USER and RCU isn't paying attention!
427 */
428 local_irq_enable(); 429 local_irq_enable();
430
431 /* Fetch EBP from where the vDSO stashed it. */
429 if ( 432 if (
430#ifdef CONFIG_X86_64 433#ifdef CONFIG_X86_64
431 /* 434 /*
@@ -443,9 +446,6 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
443 /* User code screwed up. */ 446 /* User code screwed up. */
444 local_irq_disable(); 447 local_irq_disable();
445 regs->ax = -EFAULT; 448 regs->ax = -EFAULT;
446#ifdef CONFIG_CONTEXT_TRACKING
447 enter_from_user_mode();
448#endif
449 prepare_exit_to_usermode(regs); 449 prepare_exit_to_usermode(regs);
450 return 0; /* Keep it simple: use IRET. */ 450 return 0; /* Keep it simple: use IRET. */
451 } 451 }
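The new do_syscall_64() above masks off the x32 bit, bounds-checks the syscall number, and dispatches through sys_call_table. A stand-alone toy model of that dispatch shape (user-space C with invented constants; not the kernel's actual types, table, or mask values):

#include <stdio.h>

#define NR_SYSCALLS	4
#define X32_BIT		0x40000000UL	/* toy stand-in for __X32_SYSCALL_BIT */
#define SYSCALL_MASK	(~X32_BIT)	/* toy stand-in for __SYSCALL_MASK    */

typedef long (*sys_call_ptr_t)(long, long, long, long, long, long);

static long sys_nop(long a, long b, long c, long d, long e, long f)
{
	(void)a; (void)b; (void)c; (void)d; (void)e; (void)f;
	return 0;
}

static const sys_call_ptr_t table[NR_SYSCALLS] = { sys_nop, sys_nop, sys_nop, sys_nop };

/* Mirrors the shape of the dispatch: mask, bounds-check, indirect call. */
static long dispatch(unsigned long nr, long args[6])
{
	if ((nr & SYSCALL_MASK) < NR_SYSCALLS)
		return table[nr & SYSCALL_MASK](args[0], args[1], args[2],
						args[3], args[4], args[5]);
	return -38;	/* -ENOSYS */
}

int main(void)
{
	long args[6] = { 0 };

	printf("%ld\n", dispatch(1, args));
	printf("%ld\n", dispatch(X32_BIT | 2, args));	/* x32 bit ignored for the lookup */
	return 0;
}
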
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index bb3e376d0f33..10868aa734dc 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -40,7 +40,7 @@
40#include <asm/processor-flags.h> 40#include <asm/processor-flags.h>
41#include <asm/ftrace.h> 41#include <asm/ftrace.h>
42#include <asm/irq_vectors.h> 42#include <asm/irq_vectors.h>
43#include <asm/cpufeature.h> 43#include <asm/cpufeatures.h>
44#include <asm/alternative-asm.h> 44#include <asm/alternative-asm.h>
45#include <asm/asm.h> 45#include <asm/asm.h>
46#include <asm/smap.h> 46#include <asm/smap.h>
@@ -287,14 +287,64 @@ need_resched:
287END(resume_kernel) 287END(resume_kernel)
288#endif 288#endif
289 289
290 # SYSENTER call handler stub 290GLOBAL(__begin_SYSENTER_singlestep_region)
291/*
292 * All code from here through __end_SYSENTER_singlestep_region is subject
293 * to being single-stepped if a user program sets TF and executes SYSENTER.
294 * There is absolutely nothing that we can do to prevent this from happening
295 * (thanks Intel!). To keep our handling of this situation as simple as
296 * possible, we handle TF just like AC and NT, except that our #DB handler
297 * will ignore all of the single-step traps generated in this range.
298 */
299
300#ifdef CONFIG_XEN
301/*
302 * Xen doesn't set %esp to be precisely what the normal SYSENTER
303 * entry point expects, so fix it up before using the normal path.
304 */
305ENTRY(xen_sysenter_target)
306 addl $5*4, %esp /* remove xen-provided frame */
307 jmp sysenter_past_esp
308#endif
309
310/*
311 * 32-bit SYSENTER entry.
312 *
313 * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
314 * if X86_FEATURE_SEP is available. This is the preferred system call
315 * entry on 32-bit systems.
316 *
317 * The SYSENTER instruction, in principle, should *only* occur in the
318 * vDSO. In practice, a small number of Android devices were shipped
319 * with a copy of Bionic that inlined a SYSENTER instruction. This
320 * never happened in any of Google's Bionic versions -- it only happened
321 * in a narrow range of Intel-provided versions.
322 *
323 * SYSENTER loads SS, ESP, CS, and EIP from previously programmed MSRs.
324 * IF and VM in RFLAGS are cleared (IOW: interrupts are off).
325 * SYSENTER does not save anything on the stack,
326 * and does not save old EIP (!!!), ESP, or EFLAGS.
327 *
328 * To avoid losing track of EFLAGS.VM (and thus potentially corrupting
329 * user and/or vm86 state), we explicitly disable the SYSENTER
330 * instruction in vm86 mode by reprogramming the MSRs.
331 *
332 * Arguments:
333 * eax system call number
334 * ebx arg1
335 * ecx arg2
336 * edx arg3
337 * esi arg4
338 * edi arg5
339 * ebp user stack
340 * 0(%ebp) arg6
341 */
291ENTRY(entry_SYSENTER_32) 342ENTRY(entry_SYSENTER_32)
292 movl TSS_sysenter_sp0(%esp), %esp 343 movl TSS_sysenter_sp0(%esp), %esp
293sysenter_past_esp: 344sysenter_past_esp:
294 pushl $__USER_DS /* pt_regs->ss */ 345 pushl $__USER_DS /* pt_regs->ss */
295 pushl %ebp /* pt_regs->sp (stashed in bp) */ 346 pushl %ebp /* pt_regs->sp (stashed in bp) */
296 pushfl /* pt_regs->flags (except IF = 0) */ 347 pushfl /* pt_regs->flags (except IF = 0) */
297 ASM_CLAC /* Clear AC after saving FLAGS */
298 orl $X86_EFLAGS_IF, (%esp) /* Fix IF */ 348 orl $X86_EFLAGS_IF, (%esp) /* Fix IF */
299 pushl $__USER_CS /* pt_regs->cs */ 349 pushl $__USER_CS /* pt_regs->cs */
300 pushl $0 /* pt_regs->ip = 0 (placeholder) */ 350 pushl $0 /* pt_regs->ip = 0 (placeholder) */
@@ -302,6 +352,29 @@ sysenter_past_esp:
302 SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ 352 SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */
303 353
304 /* 354 /*
355 * SYSENTER doesn't filter flags, so we need to clear NT, AC
356 * and TF ourselves. To save a few cycles, we can check whether
357 * either was set instead of doing an unconditional popfq.
358 * This needs to happen before enabling interrupts so that
359 * we don't get preempted with NT set.
360 *
361 * If TF is set, we will single-step all the way to here -- do_debug
362 * will ignore all the traps. (Yes, this is slow, but so is
363 * single-stepping in general. This allows us to avoid having
 364 * more complicated code to handle the case where a user program
365 * forces us to single-step through the SYSENTER entry code.)
366 *
367 * NB.: .Lsysenter_fix_flags is a label with the code under it moved
368 * out-of-line as an optimization: NT is unlikely to be set in the
369 * majority of the cases and instead of polluting the I$ unnecessarily,
370 * we're keeping that code behind a branch which will predict as
371 * not-taken and therefore its instructions won't be fetched.
372 */
373 testl $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, PT_EFLAGS(%esp)
374 jnz .Lsysenter_fix_flags
375.Lsysenter_flags_fixed:
376
377 /*
305 * User mode is traced as though IRQs are on, and SYSENTER 378 * User mode is traced as though IRQs are on, and SYSENTER
306 * turned them off. 379 * turned them off.
307 */ 380 */
@@ -327,6 +400,15 @@ sysenter_past_esp:
327 popl %eax /* pt_regs->ax */ 400 popl %eax /* pt_regs->ax */
328 401
329 /* 402 /*
403 * Restore all flags except IF. (We restore IF separately because
404 * STI gives a one-instruction window in which we won't be interrupted,
405 * whereas POPF does not.)
406 */
407 addl $PT_EFLAGS-PT_DS, %esp /* point esp at pt_regs->flags */
408 btr $X86_EFLAGS_IF_BIT, (%esp)
409 popfl
410
411 /*
330 * Return back to the vDSO, which will pop ecx and edx. 412 * Return back to the vDSO, which will pop ecx and edx.
331 * Don't bother with DS and ES (they already contain __USER_DS). 413 * Don't bother with DS and ES (they already contain __USER_DS).
332 */ 414 */
@@ -339,28 +421,63 @@ sysenter_past_esp:
339.popsection 421.popsection
340 _ASM_EXTABLE(1b, 2b) 422 _ASM_EXTABLE(1b, 2b)
341 PTGS_TO_GS_EX 423 PTGS_TO_GS_EX
424
425.Lsysenter_fix_flags:
426 pushl $X86_EFLAGS_FIXED
427 popfl
428 jmp .Lsysenter_flags_fixed
429GLOBAL(__end_SYSENTER_singlestep_region)
342ENDPROC(entry_SYSENTER_32) 430ENDPROC(entry_SYSENTER_32)
343 431
344 # system call handler stub 432/*
433 * 32-bit legacy system call entry.
434 *
435 * 32-bit x86 Linux system calls traditionally used the INT $0x80
436 * instruction. INT $0x80 lands here.
437 *
 438 * This entry point can be used by any 32-bit program to perform system calls.
439 * Instances of INT $0x80 can be found inline in various programs and
440 * libraries. It is also used by the vDSO's __kernel_vsyscall
441 * fallback for hardware that doesn't support a faster entry method.
442 * Restarted 32-bit system calls also fall back to INT $0x80
443 * regardless of what instruction was originally used to do the system
444 * call. (64-bit programs can use INT $0x80 as well, but they can
445 * only run on 64-bit kernels and therefore land in
446 * entry_INT80_compat.)
447 *
448 * This is considered a slow path. It is not used by most libc
449 * implementations on modern hardware except during process startup.
450 *
451 * Arguments:
452 * eax system call number
453 * ebx arg1
454 * ecx arg2
455 * edx arg3
456 * esi arg4
457 * edi arg5
458 * ebp arg6
459 */
345ENTRY(entry_INT80_32) 460ENTRY(entry_INT80_32)
346 ASM_CLAC 461 ASM_CLAC
347 pushl %eax /* pt_regs->orig_ax */ 462 pushl %eax /* pt_regs->orig_ax */
348 SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ 463 SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */
349 464
350 /* 465 /*
351 * User mode is traced as though IRQs are on. Unlike the 64-bit 466 * User mode is traced as though IRQs are on, and the interrupt gate
352 * case, INT80 is a trap gate on 32-bit kernels, so interrupts 467 * turned them off.
353 * are already on (unless user code is messing around with iopl).
354 */ 468 */
469 TRACE_IRQS_OFF
355 470
356 movl %esp, %eax 471 movl %esp, %eax
357 call do_syscall_32_irqs_on 472 call do_int80_syscall_32
358.Lsyscall_32_done: 473.Lsyscall_32_done:
359 474
360restore_all: 475restore_all:
361 TRACE_IRQS_IRET 476 TRACE_IRQS_IRET
362restore_all_notrace: 477restore_all_notrace:
363#ifdef CONFIG_X86_ESPFIX32 478#ifdef CONFIG_X86_ESPFIX32
479 ALTERNATIVE "jmp restore_nocheck", "", X86_BUG_ESPFIX
480
364 movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS 481 movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
365 /* 482 /*
366 * Warning: PT_OLDSS(%esp) contains the wrong/random values if we 483 * Warning: PT_OLDSS(%esp) contains the wrong/random values if we
@@ -387,19 +504,6 @@ ENTRY(iret_exc )
387 504
388#ifdef CONFIG_X86_ESPFIX32 505#ifdef CONFIG_X86_ESPFIX32
389ldt_ss: 506ldt_ss:
390#ifdef CONFIG_PARAVIRT
391 /*
392 * The kernel can't run on a non-flat stack if paravirt mode
393 * is active. Rather than try to fixup the high bits of
394 * ESP, bypass this code entirely. This may break DOSemu
395 * and/or Wine support in a paravirt VM, although the option
396 * is still available to implement the setting of the high
397 * 16-bits in the INTERRUPT_RETURN paravirt-op.
398 */
399 cmpl $0, pv_info+PARAVIRT_enabled
400 jne restore_nocheck
401#endif
402
403/* 507/*
404 * Setup and switch to ESPFIX stack 508 * Setup and switch to ESPFIX stack
405 * 509 *
@@ -632,14 +736,6 @@ ENTRY(spurious_interrupt_bug)
632END(spurious_interrupt_bug) 736END(spurious_interrupt_bug)
633 737
634#ifdef CONFIG_XEN 738#ifdef CONFIG_XEN
635/*
636 * Xen doesn't set %esp to be precisely what the normal SYSENTER
637 * entry point expects, so fix it up before using the normal path.
638 */
639ENTRY(xen_sysenter_target)
640 addl $5*4, %esp /* remove xen-provided frame */
641 jmp sysenter_past_esp
642
643ENTRY(xen_hypervisor_callback) 739ENTRY(xen_hypervisor_callback)
644 pushl $-1 /* orig_ax = -1 => not a system call */ 740 pushl $-1 /* orig_ax = -1 => not a system call */
645 SAVE_ALL 741 SAVE_ALL
@@ -939,51 +1035,48 @@ error_code:
939 jmp ret_from_exception 1035 jmp ret_from_exception
940END(page_fault) 1036END(page_fault)
941 1037
942/*
943 * Debug traps and NMI can happen at the one SYSENTER instruction
944 * that sets up the real kernel stack. Check here, since we can't
945 * allow the wrong stack to be used.
946 *
947 * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
948 * already pushed 3 words if it hits on the sysenter instruction:
949 * eflags, cs and eip.
950 *
951 * We just load the right stack, and push the three (known) values
952 * by hand onto the new stack - while updating the return eip past
953 * the instruction that would have done it for sysenter.
954 */
955.macro FIX_STACK offset ok label
956 cmpw $__KERNEL_CS, 4(%esp)
957 jne \ok
958\label:
959 movl TSS_sysenter_sp0 + \offset(%esp), %esp
960 pushfl
961 pushl $__KERNEL_CS
962 pushl $sysenter_past_esp
963.endm
964
965ENTRY(debug) 1038ENTRY(debug)
1039 /*
1040 * #DB can happen at the first instruction of
1041 * entry_SYSENTER_32 or in Xen's SYSENTER prologue. If this
1042 * happens, then we will be running on a very small stack. We
1043 * need to detect this condition and switch to the thread
1044 * stack before calling any C code at all.
1045 *
1046 * If you edit this code, keep in mind that NMIs can happen in here.
1047 */
966 ASM_CLAC 1048 ASM_CLAC
967 cmpl $entry_SYSENTER_32, (%esp)
968 jne debug_stack_correct
969 FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
970debug_stack_correct:
971 pushl $-1 # mark this as an int 1049 pushl $-1 # mark this as an int
972 SAVE_ALL 1050 SAVE_ALL
973 TRACE_IRQS_OFF
974 xorl %edx, %edx # error code 0 1051 xorl %edx, %edx # error code 0
975 movl %esp, %eax # pt_regs pointer 1052 movl %esp, %eax # pt_regs pointer
1053
1054 /* Are we currently on the SYSENTER stack? */
1055 PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
1056 subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
1057 cmpl $SIZEOF_SYSENTER_stack, %ecx
1058 jb .Ldebug_from_sysenter_stack
1059
1060 TRACE_IRQS_OFF
1061 call do_debug
1062 jmp ret_from_exception
1063
1064.Ldebug_from_sysenter_stack:
1065 /* We're on the SYSENTER stack. Switch off. */
1066 movl %esp, %ebp
1067 movl PER_CPU_VAR(cpu_current_top_of_stack), %esp
1068 TRACE_IRQS_OFF
976 call do_debug 1069 call do_debug
1070 movl %ebp, %esp
977 jmp ret_from_exception 1071 jmp ret_from_exception
978END(debug) 1072END(debug)
979 1073
980/* 1074/*
981 * NMI is doubly nasty. It can happen _while_ we're handling 1075 * NMI is doubly nasty. It can happen on the first instruction of
982 * a debug fault, and the debug fault hasn't yet been able to 1076 * entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning
983 * clear up the stack. So we first check whether we got an 1077 * of the #DB handler even if that #DB in turn hit before entry_SYSENTER_32
984 * NMI on the sysenter entry path, but after that we need to 1078 * switched stacks. We handle both conditions by simply checking whether we
985 * check whether we got an NMI on the debug path where the debug 1079 * interrupted kernel code running on the SYSENTER stack.
986 * fault happened on the sysenter path.
987 */ 1080 */
988ENTRY(nmi) 1081ENTRY(nmi)
989 ASM_CLAC 1082 ASM_CLAC
@@ -994,41 +1087,32 @@ ENTRY(nmi)
994 popl %eax 1087 popl %eax
995 je nmi_espfix_stack 1088 je nmi_espfix_stack
996#endif 1089#endif
997 cmpl $entry_SYSENTER_32, (%esp) 1090
998 je nmi_stack_fixup 1091 pushl %eax # pt_regs->orig_ax
999 pushl %eax
1000 movl %esp, %eax
1001 /*
1002 * Do not access memory above the end of our stack page,
1003 * it might not exist.
1004 */
1005 andl $(THREAD_SIZE-1), %eax
1006 cmpl $(THREAD_SIZE-20), %eax
1007 popl %eax
1008 jae nmi_stack_correct
1009 cmpl $entry_SYSENTER_32, 12(%esp)
1010 je nmi_debug_stack_check
1011nmi_stack_correct:
1012 pushl %eax
1013 SAVE_ALL 1092 SAVE_ALL
1014 xorl %edx, %edx # zero error code 1093 xorl %edx, %edx # zero error code
1015 movl %esp, %eax # pt_regs pointer 1094 movl %esp, %eax # pt_regs pointer
1095
1096 /* Are we currently on the SYSENTER stack? */
1097 PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
1098 subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
1099 cmpl $SIZEOF_SYSENTER_stack, %ecx
1100 jb .Lnmi_from_sysenter_stack
1101
1102 /* Not on SYSENTER stack. */
1016 call do_nmi 1103 call do_nmi
1017 jmp restore_all_notrace 1104 jmp restore_all_notrace
1018 1105
1019nmi_stack_fixup: 1106.Lnmi_from_sysenter_stack:
1020 FIX_STACK 12, nmi_stack_correct, 1 1107 /*
1021 jmp nmi_stack_correct 1108 * We're on the SYSENTER stack. Switch off. No one (not even debug)
1022 1109 * is using the thread stack right now, so it's safe for us to use it.
1023nmi_debug_stack_check: 1110 */
1024 cmpw $__KERNEL_CS, 16(%esp) 1111 movl %esp, %ebp
1025 jne nmi_stack_correct 1112 movl PER_CPU_VAR(cpu_current_top_of_stack), %esp
1026 cmpl $debug, (%esp) 1113 call do_nmi
1027 jb nmi_stack_correct 1114 movl %ebp, %esp
1028 cmpl $debug_esp_fix_insn, (%esp) 1115 jmp restore_all_notrace
1029 ja nmi_stack_correct
1030 FIX_STACK 24, nmi_stack_correct, 1
1031 jmp nmi_stack_correct
1032 1116
1033#ifdef CONFIG_X86_ESPFIX32 1117#ifdef CONFIG_X86_ESPFIX32
1034nmi_espfix_stack: 1118nmi_espfix_stack:
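Both the #DB and NMI paths above decide whether they interrupted code running on the tiny per-CPU SYSENTER stack by computing (end of SYSENTER stack) - %esp and comparing it against the stack's size. The same check expressed in C, as a stand-alone sketch with an invented size constant (not the kernel's SIZEOF_SYSENTER_stack):

#include <stdbool.h>
#include <stdint.h>

#define SYSENTER_STACK_SIZE	512	/* illustrative size only */

/*
 * Returns true when sp lies inside the [stack_end - size, stack_end)
 * window, i.e. the interrupted context was still on the small SYSENTER
 * stack and must be moved to the thread stack before calling C code.
 */
static bool on_sysenter_stack(uintptr_t sp, uintptr_t stack_end)
{
	return (stack_end - sp) < SYSENTER_STACK_SIZE;
}
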
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 9d34d3cfceb6..858b555e274b 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -103,6 +103,16 @@ ENDPROC(native_usergs_sysret64)
103/* 103/*
104 * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers. 104 * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
105 * 105 *
106 * This is the only entry point used for 64-bit system calls. The
107 * hardware interface is reasonably well designed and the register to
108 * argument mapping Linux uses fits well with the registers that are
109 * available when SYSCALL is used.
110 *
111 * SYSCALL instructions can be found inlined in libc implementations as
112 * well as some other programs and libraries. There are also a handful
113 * of SYSCALL instructions in the vDSO used, for example, as a
114 * clock_gettimeofday fallback.
115 *
106 * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, 116 * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
107 * then loads new ss, cs, and rip from previously programmed MSRs. 117 * then loads new ss, cs, and rip from previously programmed MSRs.
108 * rflags gets masked by a value from another MSR (so CLD and CLAC 118 * rflags gets masked by a value from another MSR (so CLD and CLAC
@@ -145,17 +155,11 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
145 movq %rsp, PER_CPU_VAR(rsp_scratch) 155 movq %rsp, PER_CPU_VAR(rsp_scratch)
146 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp 156 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
147 157
158 TRACE_IRQS_OFF
159
148 /* Construct struct pt_regs on stack */ 160 /* Construct struct pt_regs on stack */
149 pushq $__USER_DS /* pt_regs->ss */ 161 pushq $__USER_DS /* pt_regs->ss */
150 pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ 162 pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
151 /*
152 * Re-enable interrupts.
153 * We use 'rsp_scratch' as a scratch space, hence irq-off block above
154 * must execute atomically in the face of possible interrupt-driven
155 * task preemption. We must enable interrupts only after we're done
156 * with using rsp_scratch:
157 */
158 ENABLE_INTERRUPTS(CLBR_NONE)
159 pushq %r11 /* pt_regs->flags */ 163 pushq %r11 /* pt_regs->flags */
160 pushq $__USER_CS /* pt_regs->cs */ 164 pushq $__USER_CS /* pt_regs->cs */
161 pushq %rcx /* pt_regs->ip */ 165 pushq %rcx /* pt_regs->ip */
@@ -171,9 +175,21 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
171 pushq %r11 /* pt_regs->r11 */ 175 pushq %r11 /* pt_regs->r11 */
172 sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ 176 sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
173 177
174 testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) 178 /*
175 jnz tracesys 179 * If we need to do entry work or if we guess we'll need to do
180 * exit work, go straight to the slow path.
181 */
182 testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
183 jnz entry_SYSCALL64_slow_path
184
176entry_SYSCALL_64_fastpath: 185entry_SYSCALL_64_fastpath:
186 /*
187 * Easy case: enable interrupts and issue the syscall. If the syscall
188 * needs pt_regs, we'll call a stub that disables interrupts again
189 * and jumps to the slow path.
190 */
191 TRACE_IRQS_ON
192 ENABLE_INTERRUPTS(CLBR_NONE)
177#if __SYSCALL_MASK == ~0 193#if __SYSCALL_MASK == ~0
178 cmpq $__NR_syscall_max, %rax 194 cmpq $__NR_syscall_max, %rax
179#else 195#else
@@ -182,103 +198,56 @@ entry_SYSCALL_64_fastpath:
182#endif 198#endif
183 ja 1f /* return -ENOSYS (already in pt_regs->ax) */ 199 ja 1f /* return -ENOSYS (already in pt_regs->ax) */
184 movq %r10, %rcx 200 movq %r10, %rcx
201
202 /*
203 * This call instruction is handled specially in stub_ptregs_64.
204 * It might end up jumping to the slow path. If it jumps, RAX
205 * and all argument registers are clobbered.
206 */
185 call *sys_call_table(, %rax, 8) 207 call *sys_call_table(, %rax, 8)
208.Lentry_SYSCALL_64_after_fastpath_call:
209
186 movq %rax, RAX(%rsp) 210 movq %rax, RAX(%rsp)
1871: 2111:
188/*
189 * Syscall return path ending with SYSRET (fast path).
190 * Has incompletely filled pt_regs.
191 */
192 LOCKDEP_SYS_EXIT
193 /*
194 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
195 * it is too small to ever cause noticeable irq latency.
196 */
197 DISABLE_INTERRUPTS(CLBR_NONE)
198 212
199 /* 213 /*
200 * We must check ti flags with interrupts (or at least preemption) 214 * If we get here, then we know that pt_regs is clean for SYSRET64.
201 * off because we must *never* return to userspace without 215 * If we see that no exit work is required (which we are required
202 * processing exit work that is enqueued if we're preempted here. 216 * to check with IRQs off), then we can go straight to SYSRET64.
203 * In particular, returning to userspace with any of the one-shot
204 * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is
205 * very bad.
206 */ 217 */
218 DISABLE_INTERRUPTS(CLBR_NONE)
219 TRACE_IRQS_OFF
207 testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) 220 testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
208 jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ 221 jnz 1f
209 222
210 RESTORE_C_REGS_EXCEPT_RCX_R11 223 LOCKDEP_SYS_EXIT
224 TRACE_IRQS_ON /* user mode is traced as IRQs on */
211 movq RIP(%rsp), %rcx 225 movq RIP(%rsp), %rcx
212 movq EFLAGS(%rsp), %r11 226 movq EFLAGS(%rsp), %r11
227 RESTORE_C_REGS_EXCEPT_RCX_R11
213 movq RSP(%rsp), %rsp 228 movq RSP(%rsp), %rsp
214 /*
215 * 64-bit SYSRET restores rip from rcx,
216 * rflags from r11 (but RF and VM bits are forced to 0),
217 * cs and ss are loaded from MSRs.
218 * Restoration of rflags re-enables interrupts.
219 *
220 * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
221 * descriptor is not reinitialized. This means that we should
222 * avoid SYSRET with SS == NULL, which could happen if we schedule,
223 * exit the kernel, and re-enter using an interrupt vector. (All
224 * interrupt entries on x86_64 set SS to NULL.) We prevent that
225 * from happening by reloading SS in __switch_to. (Actually
226 * detecting the failure in 64-bit userspace is tricky but can be
227 * done.)
228 */
229 USERGS_SYSRET64 229 USERGS_SYSRET64
230 230
231GLOBAL(int_ret_from_sys_call_irqs_off) 2311:
232 /*
233 * The fast path looked good when we started, but something changed
234 * along the way and we need to switch to the slow path. Calling
235 * raise(3) will trigger this, for example. IRQs are off.
236 */
232 TRACE_IRQS_ON 237 TRACE_IRQS_ON
233 ENABLE_INTERRUPTS(CLBR_NONE) 238 ENABLE_INTERRUPTS(CLBR_NONE)
234 jmp int_ret_from_sys_call
235
236 /* Do syscall entry tracing */
237tracesys:
238 movq %rsp, %rdi
239 movl $AUDIT_ARCH_X86_64, %esi
240 call syscall_trace_enter_phase1
241 test %rax, %rax
242 jnz tracesys_phase2 /* if needed, run the slow path */
243 RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */
244 movq ORIG_RAX(%rsp), %rax
245 jmp entry_SYSCALL_64_fastpath /* and return to the fast path */
246
247tracesys_phase2:
248 SAVE_EXTRA_REGS 239 SAVE_EXTRA_REGS
249 movq %rsp, %rdi 240 movq %rsp, %rdi
250 movl $AUDIT_ARCH_X86_64, %esi 241 call syscall_return_slowpath /* returns with IRQs disabled */
251 movq %rax, %rdx 242 jmp return_from_SYSCALL_64
252 call syscall_trace_enter_phase2
253
254 /*
255 * Reload registers from stack in case ptrace changed them.
256 * We don't reload %rax because syscall_trace_entry_phase2() returned
257 * the value it wants us to use in the table lookup.
258 */
259 RESTORE_C_REGS_EXCEPT_RAX
260 RESTORE_EXTRA_REGS
261#if __SYSCALL_MASK == ~0
262 cmpq $__NR_syscall_max, %rax
263#else
264 andl $__SYSCALL_MASK, %eax
265 cmpl $__NR_syscall_max, %eax
266#endif
267 ja 1f /* return -ENOSYS (already in pt_regs->ax) */
268 movq %r10, %rcx /* fixup for C */
269 call *sys_call_table(, %rax, 8)
270 movq %rax, RAX(%rsp)
2711:
272 /* Use IRET because user could have changed pt_regs->foo */
273 243
274/* 244entry_SYSCALL64_slow_path:
275 * Syscall return path ending with IRET. 245 /* IRQs are off. */
276 * Has correct iret frame.
277 */
278GLOBAL(int_ret_from_sys_call)
279 SAVE_EXTRA_REGS 246 SAVE_EXTRA_REGS
280 movq %rsp, %rdi 247 movq %rsp, %rdi
281 call syscall_return_slowpath /* returns with IRQs disabled */ 248 call do_syscall_64 /* returns with IRQs disabled */
249
250return_from_SYSCALL_64:
282 RESTORE_EXTRA_REGS 251 RESTORE_EXTRA_REGS
283 TRACE_IRQS_IRETQ /* we're about to change IF */ 252 TRACE_IRQS_IRETQ /* we're about to change IF */
284 253
@@ -355,83 +324,45 @@ opportunistic_sysret_failed:
355 jmp restore_c_regs_and_iret 324 jmp restore_c_regs_and_iret
356END(entry_SYSCALL_64) 325END(entry_SYSCALL_64)
357 326
327ENTRY(stub_ptregs_64)
328 /*
329 * Syscalls marked as needing ptregs land here.
330 * If we are on the fast path, we need to save the extra regs,
331 * which we achieve by trying again on the slow path. If we are on
332 * the slow path, the extra regs are already saved.
333 *
334 * RAX stores a pointer to the C function implementing the syscall.
335 * IRQs are on.
336 */
337 cmpq $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp)
338 jne 1f
358 339
359 .macro FORK_LIKE func
360ENTRY(stub_\func)
361 SAVE_EXTRA_REGS 8
362 jmp sys_\func
363END(stub_\func)
364 .endm
365
366 FORK_LIKE clone
367 FORK_LIKE fork
368 FORK_LIKE vfork
369
370ENTRY(stub_execve)
371 call sys_execve
372return_from_execve:
373 testl %eax, %eax
374 jz 1f
375 /* exec failed, can use fast SYSRET code path in this case */
376 ret
3771:
378 /* must use IRET code path (pt_regs->cs may have changed) */
379 addq $8, %rsp
380 ZERO_EXTRA_REGS
381 movq %rax, RAX(%rsp)
382 jmp int_ret_from_sys_call
383END(stub_execve)
384/*
385 * Remaining execve stubs are only 7 bytes long.
386 * ENTRY() often aligns to 16 bytes, which in this case has no benefits.
387 */
388 .align 8
389GLOBAL(stub_execveat)
390 call sys_execveat
391 jmp return_from_execve
392END(stub_execveat)
393
394#if defined(CONFIG_X86_X32_ABI)
395 .align 8
396GLOBAL(stub_x32_execve)
397 call compat_sys_execve
398 jmp return_from_execve
399END(stub_x32_execve)
400 .align 8
401GLOBAL(stub_x32_execveat)
402 call compat_sys_execveat
403 jmp return_from_execve
404END(stub_x32_execveat)
405#endif
406
407/*
408 * sigreturn is special because it needs to restore all registers on return.
409 * This cannot be done with SYSRET, so use the IRET return path instead.
410 */
411ENTRY(stub_rt_sigreturn)
412 /* 340 /*
413 * SAVE_EXTRA_REGS result is not normally needed: 341 * Called from fast path -- disable IRQs again, pop return address
414 * sigreturn overwrites all pt_regs->GPREGS. 342 * and jump to slow path
415 * But sigreturn can fail (!), and there is no easy way to detect that.
416 * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error,
417 * we SAVE_EXTRA_REGS here.
418 */ 343 */
419 SAVE_EXTRA_REGS 8 344 DISABLE_INTERRUPTS(CLBR_NONE)
420 call sys_rt_sigreturn 345 TRACE_IRQS_OFF
421return_from_stub: 346 popq %rax
422 addq $8, %rsp 347 jmp entry_SYSCALL64_slow_path
423 RESTORE_EXTRA_REGS
424 movq %rax, RAX(%rsp)
425 jmp int_ret_from_sys_call
426END(stub_rt_sigreturn)
427 348
428#ifdef CONFIG_X86_X32_ABI 3491:
429ENTRY(stub_x32_rt_sigreturn) 350 /* Called from C */
430 SAVE_EXTRA_REGS 8 351 jmp *%rax /* called from C */
431 call sys32_x32_rt_sigreturn 352END(stub_ptregs_64)
432 jmp return_from_stub 353
433END(stub_x32_rt_sigreturn) 354.macro ptregs_stub func
434#endif 355ENTRY(ptregs_\func)
356 leaq \func(%rip), %rax
357 jmp stub_ptregs_64
358END(ptregs_\func)
359.endm
360
361/* Instantiate ptregs_stub for each ptregs-using syscall */
362#define __SYSCALL_64_QUAL_(sym)
363#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym
364#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym)
365#include <asm/syscalls_64.h>
435 366
436/* 367/*
437 * A newly forked process directly context switches into this address. 368 * A newly forked process directly context switches into this address.
@@ -439,7 +370,6 @@ END(stub_x32_rt_sigreturn)
439 * rdi: prev task we switched from 370 * rdi: prev task we switched from
440 */ 371 */
441ENTRY(ret_from_fork) 372ENTRY(ret_from_fork)
442
443 LOCK ; btr $TIF_FORK, TI_flags(%r8) 373 LOCK ; btr $TIF_FORK, TI_flags(%r8)
444 374
445 pushq $0x0002 375 pushq $0x0002
@@ -447,28 +377,32 @@ ENTRY(ret_from_fork)
447 377
448 call schedule_tail /* rdi: 'prev' task parameter */ 378 call schedule_tail /* rdi: 'prev' task parameter */
449 379
450 RESTORE_EXTRA_REGS
451
452 testb $3, CS(%rsp) /* from kernel_thread? */ 380 testb $3, CS(%rsp) /* from kernel_thread? */
381 jnz 1f
453 382
454 /* 383 /*
455 * By the time we get here, we have no idea whether our pt_regs, 384 * We came from kernel_thread. This code path is quite twisted, and
456 * ti flags, and ti status came from the 64-bit SYSCALL fast path, 385 * someone should clean it up.
457 * the slow path, or one of the 32-bit compat paths. 386 *
458 * Use IRET code path to return, since it can safely handle 387 * copy_thread_tls stashes the function pointer in RBX and the
459 * all of the above. 388 * parameter to be passed in RBP. The called function is permitted
389 * to call do_execve and thereby jump to user mode.
460 */ 390 */
461 jnz int_ret_from_sys_call 391 movq RBP(%rsp), %rdi
392 call *RBX(%rsp)
393 movl $0, RAX(%rsp)
462 394
463 /* 395 /*
464 * We came from kernel_thread 396 * Fall through as though we're exiting a syscall. This makes a
465 * nb: we depend on RESTORE_EXTRA_REGS above 397 * twisted sort of sense if we just called do_execve.
466 */ 398 */
467 movq %rbp, %rdi 399
468 call *%rbx 4001:
469 movl $0, RAX(%rsp) 401 movq %rsp, %rdi
470 RESTORE_EXTRA_REGS 402 call syscall_return_slowpath /* returns with IRQs disabled */
471 jmp int_ret_from_sys_call 403 TRACE_IRQS_ON /* user mode is traced as IRQS on */
404 SWAPGS
405 jmp restore_regs_and_iret
472END(ret_from_fork) 406END(ret_from_fork)
473 407
474/* 408/*
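For the kernel_thread leg of ret_from_fork above, the comment describes copy_thread_tls() stashing the callback in RBX and its argument in RBP; the callback is invoked and, if it ever returns, the thread exits through the syscall-return path with a fake return value of 0. A simplified C model of that flow (the struct layout and names here are invented for illustration, not the kernel's):

typedef int (*kthread_fn_t)(void *);

struct fork_frame {		/* illustrative, not the real pt_regs layout */
	kthread_fn_t fn;	/* stashed in RBX by copy_thread_tls          */
	void *arg;		/* stashed in RBP by copy_thread_tls          */
};

static long ret_from_fork_kernel_thread(struct fork_frame *f)
{
	f->fn(f->arg);		/* may never return, e.g. if it calls do_execve */
	return 0;		/* otherwise: exit as though a syscall returned 0 */
}
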
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 3c990eeee40b..847f2f0c31e5 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -19,12 +19,21 @@
19 .section .entry.text, "ax" 19 .section .entry.text, "ax"
20 20
21/* 21/*
22 * 32-bit SYSENTER instruction entry. 22 * 32-bit SYSENTER entry.
23 * 23 *
24 * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs. 24 * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
25 * IF and VM in rflags are cleared (IOW: interrupts are off). 25 * on 64-bit kernels running on Intel CPUs.
26 *
27 * The SYSENTER instruction, in principle, should *only* occur in the
28 * vDSO. In practice, a small number of Android devices were shipped
29 * with a copy of Bionic that inlined a SYSENTER instruction. This
30 * never happened in any of Google's Bionic versions -- it only happened
31 * in a narrow range of Intel-provided versions.
32 *
33 * SYSENTER loads SS, RSP, CS, and RIP from previously programmed MSRs.
34 * IF and VM in RFLAGS are cleared (IOW: interrupts are off).
26 * SYSENTER does not save anything on the stack, 35 * SYSENTER does not save anything on the stack,
27 * and does not save old rip (!!!) and rflags. 36 * and does not save old RIP (!!!), RSP, or RFLAGS.
28 * 37 *
29 * Arguments: 38 * Arguments:
30 * eax system call number 39 * eax system call number
@@ -35,10 +44,6 @@
35 * edi arg5 44 * edi arg5
36 * ebp user stack 45 * ebp user stack
37 * 0(%ebp) arg6 46 * 0(%ebp) arg6
38 *
39 * This is purely a fast path. For anything complicated we use the int 0x80
40 * path below. We set up a complete hardware stack frame to share code
41 * with the int 0x80 path.
42 */ 47 */
43ENTRY(entry_SYSENTER_compat) 48ENTRY(entry_SYSENTER_compat)
44 /* Interrupts are off on entry. */ 49 /* Interrupts are off on entry. */
@@ -66,8 +71,6 @@ ENTRY(entry_SYSENTER_compat)
66 */ 71 */
67 pushfq /* pt_regs->flags (except IF = 0) */ 72 pushfq /* pt_regs->flags (except IF = 0) */
68 orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */ 73 orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */
69 ASM_CLAC /* Clear AC after saving FLAGS */
70
71 pushq $__USER32_CS /* pt_regs->cs */ 74 pushq $__USER32_CS /* pt_regs->cs */
72 xorq %r8,%r8 75 xorq %r8,%r8
73 pushq %r8 /* pt_regs->ip = 0 (placeholder) */ 76 pushq %r8 /* pt_regs->ip = 0 (placeholder) */
@@ -90,19 +93,25 @@ ENTRY(entry_SYSENTER_compat)
90 cld 93 cld
91 94
92 /* 95 /*
93 * Sysenter doesn't filter flags, so we need to clear NT 96 * SYSENTER doesn't filter flags, so we need to clear NT and AC
94 * ourselves. To save a few cycles, we can check whether 97 * ourselves. To save a few cycles, we can check whether
95 * NT was set instead of doing an unconditional popfq. 98 * either was set instead of doing an unconditional popfq.
96 * This needs to happen before enabling interrupts so that 99 * This needs to happen before enabling interrupts so that
97 * we don't get preempted with NT set. 100 * we don't get preempted with NT set.
98 * 101 *
102 * If TF is set, we will single-step all the way to here -- do_debug
103 * will ignore all the traps. (Yes, this is slow, but so is
104 * single-stepping in general. This allows us to avoid having
 105 * more complicated code to handle the case where a user program
106 * forces us to single-step through the SYSENTER entry code.)
107 *
99 * NB.: .Lsysenter_fix_flags is a label with the code under it moved 108 * NB.: .Lsysenter_fix_flags is a label with the code under it moved
100 * out-of-line as an optimization: NT is unlikely to be set in the 109 * out-of-line as an optimization: NT is unlikely to be set in the
101 * majority of the cases and instead of polluting the I$ unnecessarily, 110 * majority of the cases and instead of polluting the I$ unnecessarily,
102 * we're keeping that code behind a branch which will predict as 111 * we're keeping that code behind a branch which will predict as
103 * not-taken and therefore its instructions won't be fetched. 112 * not-taken and therefore its instructions won't be fetched.
104 */ 113 */
105 testl $X86_EFLAGS_NT, EFLAGS(%rsp) 114 testl $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, EFLAGS(%rsp)
106 jnz .Lsysenter_fix_flags 115 jnz .Lsysenter_fix_flags
107.Lsysenter_flags_fixed: 116.Lsysenter_flags_fixed:
108 117
@@ -123,20 +132,42 @@ ENTRY(entry_SYSENTER_compat)
123 pushq $X86_EFLAGS_FIXED 132 pushq $X86_EFLAGS_FIXED
124 popfq 133 popfq
125 jmp .Lsysenter_flags_fixed 134 jmp .Lsysenter_flags_fixed
135GLOBAL(__end_entry_SYSENTER_compat)
126ENDPROC(entry_SYSENTER_compat) 136ENDPROC(entry_SYSENTER_compat)
127 137
128/* 138/*
129 * 32-bit SYSCALL instruction entry. 139 * 32-bit SYSCALL entry.
140 *
141 * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
142 * on 64-bit kernels running on AMD CPUs.
143 *
144 * The SYSCALL instruction, in principle, should *only* occur in the
145 * vDSO. In practice, it appears that this really is the case.
146 * As evidence:
147 *
148 * - The calling convention for SYSCALL has changed several times without
149 * anyone noticing.
130 * 150 *
 131 * 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, 151 * - Prior to the in-kernel X86_BUG_SYSRET_SS_ATTRS fixup, any
132 * then loads new ss, cs, and rip from previously programmed MSRs. 152 * user task that did SYSCALL without immediately reloading SS
133 * rflags gets masked by a value from another MSR (so CLD and CLAC 153 * would randomly crash.
134 * are not needed). SYSCALL does not save anything on the stack
135 * and does not change rsp.
136 * 154 *
137 * Note: rflags saving+masking-with-MSR happens only in Long mode 155 * - Most programmers do not directly target AMD CPUs, and the 32-bit
156 * SYSCALL instruction does not exist on Intel CPUs. Even on AMD
157 * CPUs, Linux disables the SYSCALL instruction on 32-bit kernels
158 * because the SYSCALL instruction in legacy/native 32-bit mode (as
159 * opposed to compat mode) is sufficiently poorly designed as to be
160 * essentially unusable.
161 *
162 * 32-bit SYSCALL saves RIP to RCX, clears RFLAGS.RF, then saves
163 * RFLAGS to R11, then loads new SS, CS, and RIP from previously
164 * programmed MSRs. RFLAGS gets masked by a value from another MSR
165 * (so CLD and CLAC are not needed). SYSCALL does not save anything on
166 * the stack and does not change RSP.
167 *
168 * Note: RFLAGS saving+masking-with-MSR happens only in Long mode
138 * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it). 169 * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it).
139 * Don't get confused: rflags saving+masking depends on Long Mode Active bit 170 * Don't get confused: RFLAGS saving+masking depends on Long Mode Active bit
140 * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes 171 * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes
141 * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). 172 * or target CS descriptor's L bit (SYSCALL does not read segment descriptors).
142 * 173 *
@@ -236,7 +267,21 @@ sysret32_from_system_call:
236END(entry_SYSCALL_compat) 267END(entry_SYSCALL_compat)
237 268
238/* 269/*
239 * Emulated IA32 system calls via int 0x80. 270 * 32-bit legacy system call entry.
271 *
272 * 32-bit x86 Linux system calls traditionally used the INT $0x80
273 * instruction. INT $0x80 lands here.
274 *
275 * This entry point can be used by 32-bit and 64-bit programs to perform
276 * 32-bit system calls. Instances of INT $0x80 can be found inline in
277 * various programs and libraries. It is also used by the vDSO's
278 * __kernel_vsyscall fallback for hardware that doesn't support a faster
279 * entry method. Restarted 32-bit system calls also fall back to INT
280 * $0x80 regardless of what instruction was originally used to do the
281 * system call.
282 *
283 * This is considered a slow path. It is not used by most libc
284 * implementations on modern hardware except during process startup.
240 * 285 *
241 * Arguments: 286 * Arguments:
242 * eax system call number 287 * eax system call number
@@ -245,17 +290,8 @@ END(entry_SYSCALL_compat)
245 * edx arg3 290 * edx arg3
246 * esi arg4 291 * esi arg4
247 * edi arg5 292 * edi arg5
248 * ebp arg6 (note: not saved in the stack frame, should not be touched) 293 * ebp arg6
249 *
250 * Notes:
251 * Uses the same stack frame as the x86-64 version.
252 * All registers except eax must be saved (but ptrace may violate that).
253 * Arguments are zero extended. For system calls that want sign extension and
254 * take long arguments a wrapper is needed. Most calls can just be called
255 * directly.
256 * Assumes it is only called from user space and entered with interrupts off.
257 */ 294 */
258
259ENTRY(entry_INT80_compat) 295ENTRY(entry_INT80_compat)
260 /* 296 /*
261 * Interrupts are off on entry. 297 * Interrupts are off on entry.
@@ -300,7 +336,7 @@ ENTRY(entry_INT80_compat)
300 TRACE_IRQS_OFF 336 TRACE_IRQS_OFF
301 337
302 movq %rsp, %rdi 338 movq %rsp, %rdi
303 call do_syscall_32_irqs_off 339 call do_int80_syscall_32
304.Lsyscall_32_done: 340.Lsyscall_32_done:
305 341
306 /* Go back to user mode. */ 342 /* Go back to user mode. */
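The INT $0x80 comments above document the legacy register convention: syscall number in EAX, arguments in EBX, ECX, EDX, ESI, EDI, EBP. A small user-space illustration of that convention (a sketch only; it assumes an i386 build such as gcc -m32 and the i386 __NR_write value of 4):

static long int80_write(int fd, const void *buf, unsigned long len)
{
	long ret;

	/* EAX = syscall number, EBX/ECX/EDX = the first three arguments. */
	asm volatile ("int $0x80"
		      : "=a" (ret)
		      : "a" (4 /* __NR_write on i386 */),
			"b" (fd), "c" (buf), "d" (len)
		      : "memory");
	return ret;
}

int main(void)
{
	int80_write(1, "hello via int $0x80\n", 20);
	return 0;
}
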
diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c
index 9a6649857106..8f895ee13a1c 100644
--- a/arch/x86/entry/syscall_32.c
+++ b/arch/x86/entry/syscall_32.c
@@ -6,17 +6,11 @@
6#include <asm/asm-offsets.h> 6#include <asm/asm-offsets.h>
7#include <asm/syscall.h> 7#include <asm/syscall.h>
8 8
9#ifdef CONFIG_IA32_EMULATION 9#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
10#define SYM(sym, compat) compat
11#else
12#define SYM(sym, compat) sym
13#endif
14
15#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long SYM(sym, compat)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
16#include <asm/syscalls_32.h> 10#include <asm/syscalls_32.h>
17#undef __SYSCALL_I386 11#undef __SYSCALL_I386
18 12
19#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat), 13#define __SYSCALL_I386(nr, sym, qual) [nr] = sym,
20 14
21extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); 15extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
22 16
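syscall_32.c builds its table by expanding the generated syscall list twice: once to declare every handler and once to fill a designated-initializer array, and with this patch the 32-bit side simply ignores the qualifier argument. A self-contained model of that two-pass expansion (the list and handlers below are invented):

#include <stdio.h>

#define SYSCALL_LIST(X)	\
	X(0, demo_read)	\
	X(1, demo_write)

/* Pass 1: declare every handler. */
#define DECLARE(nr, sym)	static long sym(void);
SYSCALL_LIST(DECLARE)
#undef DECLARE

typedef long (*handler_t)(void);

/* Pass 2: fill the table with designated initializers, [nr] = sym. */
#define ENTRY(nr, sym)		[nr] = sym,
static const handler_t table[] = {
	SYSCALL_LIST(ENTRY)
};
#undef ENTRY

static long demo_read(void)  { return 100; }
static long demo_write(void) { return 200; }

int main(void)
{
	printf("%ld %ld\n", table[0](), table[1]());
	return 0;
}
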
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index 41283d22be7a..9dbc5abb6162 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -6,19 +6,14 @@
6#include <asm/asm-offsets.h> 6#include <asm/asm-offsets.h>
7#include <asm/syscall.h> 7#include <asm/syscall.h>
8 8
9#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) 9#define __SYSCALL_64_QUAL_(sym) sym
10#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_##sym
10 11
11#ifdef CONFIG_X86_X32_ABI 12#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long __SYSCALL_64_QUAL_##qual(sym)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
12# define __SYSCALL_X32(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
13#else
14# define __SYSCALL_X32(nr, sym, compat) /* nothing */
15#endif
16
17#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
18#include <asm/syscalls_64.h> 13#include <asm/syscalls_64.h>
19#undef __SYSCALL_64 14#undef __SYSCALL_64
20 15
21#define __SYSCALL_64(nr, sym, compat) [nr] = sym, 16#define __SYSCALL_64(nr, sym, qual) [nr] = __SYSCALL_64_QUAL_##qual(sym),
22 17
23extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); 18extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
24 19
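On 64-bit the qualifier is not ignored: __SYSCALL_64_QUAL_##qual pastes the qualifier token onto a helper macro, so an empty qualifier resolves to the plain sym while "ptregs" resolves to ptregs_##sym, the stub defined in entry_64.S. A tiny stand-alone demonstration of that token-pasting trick with dummy names:

#include <stdio.h>

#define QUAL_(sym)		sym		/* no qualifier: call sym directly */
#define QUAL_ptregs(sym)	ptregs_##sym	/* "ptregs": route through the stub */
#define PICK(sym, qual)		QUAL_##qual(sym)

#define STR2(x)	#x
#define STR(x)	STR2(x)

int main(void)
{
	/* Prints: sys_read ptregs_sys_execve */
	printf("%s %s\n", STR(PICK(sys_read, )), STR(PICK(sys_execve, ptregs)));
	return 0;
}
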
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index dc1040a50bdc..2e5b565adacc 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -21,7 +21,7 @@
2112 common brk sys_brk 2112 common brk sys_brk
2213 64 rt_sigaction sys_rt_sigaction 2213 64 rt_sigaction sys_rt_sigaction
2314 common rt_sigprocmask sys_rt_sigprocmask 2314 common rt_sigprocmask sys_rt_sigprocmask
2415 64 rt_sigreturn stub_rt_sigreturn 2415 64 rt_sigreturn sys_rt_sigreturn/ptregs
2516 64 ioctl sys_ioctl 2516 64 ioctl sys_ioctl
2617 common pread64 sys_pread64 2617 common pread64 sys_pread64
2718 common pwrite64 sys_pwrite64 2718 common pwrite64 sys_pwrite64
@@ -62,10 +62,10 @@
6253 common socketpair sys_socketpair 6253 common socketpair sys_socketpair
6354 64 setsockopt sys_setsockopt 6354 64 setsockopt sys_setsockopt
6455 64 getsockopt sys_getsockopt 6455 64 getsockopt sys_getsockopt
6556 common clone stub_clone 6556 common clone sys_clone/ptregs
6657 common fork stub_fork 6657 common fork sys_fork/ptregs
6758 common vfork stub_vfork 6758 common vfork sys_vfork/ptregs
6859 64 execve stub_execve 6859 64 execve sys_execve/ptregs
6960 common exit sys_exit 6960 common exit sys_exit
7061 common wait4 sys_wait4 7061 common wait4 sys_wait4
7162 common kill sys_kill 7162 common kill sys_kill
@@ -178,7 +178,7 @@
178169 common reboot sys_reboot 178169 common reboot sys_reboot
179170 common sethostname sys_sethostname 179170 common sethostname sys_sethostname
180171 common setdomainname sys_setdomainname 180171 common setdomainname sys_setdomainname
181172 common iopl sys_iopl 181172 common iopl sys_iopl/ptregs
182173 common ioperm sys_ioperm 182173 common ioperm sys_ioperm
183174 64 create_module 183174 64 create_module
184175 common init_module sys_init_module 184175 common init_module sys_init_module
@@ -328,7 +328,7 @@
328319 common memfd_create sys_memfd_create 328319 common memfd_create sys_memfd_create
329320 common kexec_file_load sys_kexec_file_load 329320 common kexec_file_load sys_kexec_file_load
330321 common bpf sys_bpf 330321 common bpf sys_bpf
331322 64 execveat stub_execveat 331322 64 execveat sys_execveat/ptregs
332323 common userfaultfd sys_userfaultfd 332323 common userfaultfd sys_userfaultfd
333324 common membarrier sys_membarrier 333324 common membarrier sys_membarrier
334325 common mlock2 sys_mlock2 334325 common mlock2 sys_mlock2
@@ -339,14 +339,14 @@
339# for native 64-bit operation. 339# for native 64-bit operation.
340# 340#
341512 x32 rt_sigaction compat_sys_rt_sigaction 341512 x32 rt_sigaction compat_sys_rt_sigaction
342513 x32 rt_sigreturn stub_x32_rt_sigreturn 342513 x32 rt_sigreturn sys32_x32_rt_sigreturn
343514 x32 ioctl compat_sys_ioctl 343514 x32 ioctl compat_sys_ioctl
344515 x32 readv compat_sys_readv 344515 x32 readv compat_sys_readv
345516 x32 writev compat_sys_writev 345516 x32 writev compat_sys_writev
346517 x32 recvfrom compat_sys_recvfrom 346517 x32 recvfrom compat_sys_recvfrom
347518 x32 sendmsg compat_sys_sendmsg 347518 x32 sendmsg compat_sys_sendmsg
348519 x32 recvmsg compat_sys_recvmsg 348519 x32 recvmsg compat_sys_recvmsg
349520 x32 execve stub_x32_execve 349520 x32 execve compat_sys_execve/ptregs
350521 x32 ptrace compat_sys_ptrace 350521 x32 ptrace compat_sys_ptrace
351522 x32 rt_sigpending compat_sys_rt_sigpending 351522 x32 rt_sigpending compat_sys_rt_sigpending
352523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait 352523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait
@@ -371,4 +371,4 @@
371542 x32 getsockopt compat_sys_getsockopt 371542 x32 getsockopt compat_sys_getsockopt
372543 x32 io_setup compat_sys_io_setup 372543 x32 io_setup compat_sys_io_setup
373544 x32 io_submit compat_sys_io_submit 373544 x32 io_submit compat_sys_io_submit
374545 x32 execveat stub_x32_execveat 374545 x32 execveat compat_sys_execveat/ptregs
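
To make the new qualifier column concrete, here is how one of the rows above flows through the updated tooling; the expansion is worked out by hand from this patch, not copied from a generated file:

	59  64  execve  sys_execve/ptregs		(syscall_64.tbl row)
	 -> __SYSCALL_64(59, sys_execve, ptregs)	(emitted into asm/syscalls_64.h)
	 -> [59] = ptregs_sys_execve,			(slot in sys_call_table)
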
diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh
index 0e7f8ec071e7..cd3d3015d7df 100644
--- a/arch/x86/entry/syscalls/syscalltbl.sh
+++ b/arch/x86/entry/syscalls/syscalltbl.sh
@@ -3,13 +3,63 @@
3in="$1" 3in="$1"
4out="$2" 4out="$2"
5 5
6syscall_macro() {
7 abi="$1"
8 nr="$2"
9 entry="$3"
10
11 # Entry can be either just a function name or "function/qualifier"
12 real_entry="${entry%%/*}"
13 qualifier="${entry:${#real_entry}}" # Strip the function name
14 qualifier="${qualifier:1}" # Strip the slash, if any
15
16 echo "__SYSCALL_${abi}($nr, $real_entry, $qualifier)"
17}
18
19emit() {
20 abi="$1"
21 nr="$2"
22 entry="$3"
23 compat="$4"
24
25 if [ "$abi" == "64" -a -n "$compat" ]; then
26 echo "a compat entry for a 64-bit syscall makes no sense" >&2
27 exit 1
28 fi
29
30 if [ -z "$compat" ]; then
31 if [ -n "$entry" ]; then
32 syscall_macro "$abi" "$nr" "$entry"
33 fi
34 else
35 echo "#ifdef CONFIG_X86_32"
36 if [ -n "$entry" ]; then
37 syscall_macro "$abi" "$nr" "$entry"
38 fi
39 echo "#else"
40 syscall_macro "$abi" "$nr" "$compat"
41 echo "#endif"
42 fi
43}
44
6grep '^[0-9]' "$in" | sort -n | ( 45grep '^[0-9]' "$in" | sort -n | (
7 while read nr abi name entry compat; do 46 while read nr abi name entry compat; do
8 abi=`echo "$abi" | tr '[a-z]' '[A-Z]'` 47 abi=`echo "$abi" | tr '[a-z]' '[A-Z]'`
9 if [ -n "$compat" ]; then 48 if [ "$abi" == "COMMON" -o "$abi" == "64" ]; then
10 echo "__SYSCALL_${abi}($nr, $entry, $compat)" 49 # COMMON is the same as 64, except that we don't expect X32
11 elif [ -n "$entry" ]; then 50 # programs to use it. Our expectation has nothing to do with
12 echo "__SYSCALL_${abi}($nr, $entry, $entry)" 51 # any generated code, so treat them the same.
52 emit 64 "$nr" "$entry" "$compat"
53 elif [ "$abi" == "X32" ]; then
54 # X32 is equivalent to 64 on an X32-compatible kernel.
55 echo "#ifdef CONFIG_X86_X32_ABI"
56 emit 64 "$nr" "$entry" "$compat"
57 echo "#endif"
58 elif [ "$abi" == "I386" ]; then
59 emit "$abi" "$nr" "$entry" "$compat"
60 else
61 echo "Unknown abi $abi" >&2
62 exit 1
13 fi 63 fi
14 done 64 done
15) > "$out" 65) > "$out"
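
As a quick sanity check on emit() above, an x32 row such as 513 should come out wrapped in the X32 config guard, with an empty qualifier; again derived by hand rather than taken from a build:

	#ifdef CONFIG_X86_X32_ABI
	__SYSCALL_64(513, sys32_x32_rt_sigreturn, )
	#endif
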
diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h
index 3f69326ed545..63a03bb91497 100644
--- a/arch/x86/entry/vdso/vdso2c.h
+++ b/arch/x86/entry/vdso/vdso2c.h
@@ -150,16 +150,9 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
150 } 150 }
151 fprintf(outfile, "\n};\n\n"); 151 fprintf(outfile, "\n};\n\n");
152 152
153 fprintf(outfile, "static struct page *pages[%lu];\n\n",
154 mapping_size / 4096);
155
156 fprintf(outfile, "const struct vdso_image %s = {\n", name); 153 fprintf(outfile, "const struct vdso_image %s = {\n", name);
157 fprintf(outfile, "\t.data = raw_data,\n"); 154 fprintf(outfile, "\t.data = raw_data,\n");
158 fprintf(outfile, "\t.size = %lu,\n", mapping_size); 155 fprintf(outfile, "\t.size = %lu,\n", mapping_size);
159 fprintf(outfile, "\t.text_mapping = {\n");
160 fprintf(outfile, "\t\t.name = \"[vdso]\",\n");
161 fprintf(outfile, "\t\t.pages = pages,\n");
162 fprintf(outfile, "\t},\n");
163 if (alt_sec) { 156 if (alt_sec) {
164 fprintf(outfile, "\t.alt = %lu,\n", 157 fprintf(outfile, "\t.alt = %lu,\n",
165 (unsigned long)GET_LE(&alt_sec->sh_offset)); 158 (unsigned long)GET_LE(&alt_sec->sh_offset));
diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c
index 08a317a9ae4b..7853b53959cd 100644
--- a/arch/x86/entry/vdso/vdso32-setup.c
+++ b/arch/x86/entry/vdso/vdso32-setup.c
@@ -11,7 +11,6 @@
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/mm_types.h> 12#include <linux/mm_types.h>
13 13
14#include <asm/cpufeature.h>
15#include <asm/processor.h> 14#include <asm/processor.h>
16#include <asm/vdso.h> 15#include <asm/vdso.h>
17 16
diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S
index 3a1d9297074b..0109ac6cb79c 100644
--- a/arch/x86/entry/vdso/vdso32/system_call.S
+++ b/arch/x86/entry/vdso/vdso32/system_call.S
@@ -3,7 +3,7 @@
3*/ 3*/
4 4
5#include <asm/dwarf2.h> 5#include <asm/dwarf2.h>
6#include <asm/cpufeature.h> 6#include <asm/cpufeatures.h>
7#include <asm/alternative-asm.h> 7#include <asm/alternative-asm.h>
8 8
9/* 9/*
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index b8f69e264ac4..10f704584922 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -20,6 +20,7 @@
20#include <asm/page.h> 20#include <asm/page.h>
21#include <asm/hpet.h> 21#include <asm/hpet.h>
22#include <asm/desc.h> 22#include <asm/desc.h>
23#include <asm/cpufeature.h>
23 24
24#if defined(CONFIG_X86_64) 25#if defined(CONFIG_X86_64)
25unsigned int __read_mostly vdso64_enabled = 1; 26unsigned int __read_mostly vdso64_enabled = 1;
@@ -27,13 +28,7 @@ unsigned int __read_mostly vdso64_enabled = 1;
27 28
28void __init init_vdso_image(const struct vdso_image *image) 29void __init init_vdso_image(const struct vdso_image *image)
29{ 30{
30 int i;
31 int npages = (image->size) / PAGE_SIZE;
32
33 BUG_ON(image->size % PAGE_SIZE != 0); 31 BUG_ON(image->size % PAGE_SIZE != 0);
34 for (i = 0; i < npages; i++)
35 image->text_mapping.pages[i] =
36 virt_to_page(image->data + i*PAGE_SIZE);
37 32
38 apply_alternatives((struct alt_instr *)(image->data + image->alt), 33 apply_alternatives((struct alt_instr *)(image->data + image->alt),
39 (struct alt_instr *)(image->data + image->alt + 34 (struct alt_instr *)(image->data + image->alt +
@@ -90,18 +85,87 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
90#endif 85#endif
91} 86}
92 87
88static int vdso_fault(const struct vm_special_mapping *sm,
89 struct vm_area_struct *vma, struct vm_fault *vmf)
90{
91 const struct vdso_image *image = vma->vm_mm->context.vdso_image;
92
93 if (!image || (vmf->pgoff << PAGE_SHIFT) >= image->size)
94 return VM_FAULT_SIGBUS;
95
96 vmf->page = virt_to_page(image->data + (vmf->pgoff << PAGE_SHIFT));
97 get_page(vmf->page);
98 return 0;
99}
100
101static const struct vm_special_mapping text_mapping = {
102 .name = "[vdso]",
103 .fault = vdso_fault,
104};
105
106static int vvar_fault(const struct vm_special_mapping *sm,
107 struct vm_area_struct *vma, struct vm_fault *vmf)
108{
109 const struct vdso_image *image = vma->vm_mm->context.vdso_image;
110 long sym_offset;
111 int ret = -EFAULT;
112
113 if (!image)
114 return VM_FAULT_SIGBUS;
115
116 sym_offset = (long)(vmf->pgoff << PAGE_SHIFT) +
117 image->sym_vvar_start;
118
119 /*
120 * Sanity check: a symbol offset of zero means that the page
121 * does not exist for this vdso image, not that the page is at
122 * offset zero relative to the text mapping. This should be
123 * impossible here, because sym_offset should only be zero for
124 * the page past the end of the vvar mapping.
125 */
126 if (sym_offset == 0)
127 return VM_FAULT_SIGBUS;
128
129 if (sym_offset == image->sym_vvar_page) {
130 ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address,
131 __pa_symbol(&__vvar_page) >> PAGE_SHIFT);
132 } else if (sym_offset == image->sym_hpet_page) {
133#ifdef CONFIG_HPET_TIMER
134 if (hpet_address && vclock_was_used(VCLOCK_HPET)) {
135 ret = vm_insert_pfn_prot(
136 vma,
137 (unsigned long)vmf->virtual_address,
138 hpet_address >> PAGE_SHIFT,
139 pgprot_noncached(PAGE_READONLY));
140 }
141#endif
142 } else if (sym_offset == image->sym_pvclock_page) {
143 struct pvclock_vsyscall_time_info *pvti =
144 pvclock_pvti_cpu0_va();
145 if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) {
146 ret = vm_insert_pfn(
147 vma,
148 (unsigned long)vmf->virtual_address,
149 __pa(pvti) >> PAGE_SHIFT);
150 }
151 }
152
153 if (ret == 0 || ret == -EBUSY)
154 return VM_FAULT_NOPAGE;
155
156 return VM_FAULT_SIGBUS;
157}
158
93static int map_vdso(const struct vdso_image *image, bool calculate_addr) 159static int map_vdso(const struct vdso_image *image, bool calculate_addr)
94{ 160{
95 struct mm_struct *mm = current->mm; 161 struct mm_struct *mm = current->mm;
96 struct vm_area_struct *vma; 162 struct vm_area_struct *vma;
97 unsigned long addr, text_start; 163 unsigned long addr, text_start;
98 int ret = 0; 164 int ret = 0;
99 static struct page *no_pages[] = {NULL}; 165 static const struct vm_special_mapping vvar_mapping = {
100 static struct vm_special_mapping vvar_mapping = {
101 .name = "[vvar]", 166 .name = "[vvar]",
102 .pages = no_pages, 167 .fault = vvar_fault,
103 }; 168 };
104 struct pvclock_vsyscall_time_info *pvti;
105 169
106 if (calculate_addr) { 170 if (calculate_addr) {
107 addr = vdso_addr(current->mm->start_stack, 171 addr = vdso_addr(current->mm->start_stack,
@@ -121,6 +185,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
121 185
122 text_start = addr - image->sym_vvar_start; 186 text_start = addr - image->sym_vvar_start;
123 current->mm->context.vdso = (void __user *)text_start; 187 current->mm->context.vdso = (void __user *)text_start;
188 current->mm->context.vdso_image = image;
124 189
125 /* 190 /*
126 * MAYWRITE to allow gdb to COW and set breakpoints 191 * MAYWRITE to allow gdb to COW and set breakpoints
@@ -130,7 +195,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
130 image->size, 195 image->size,
131 VM_READ|VM_EXEC| 196 VM_READ|VM_EXEC|
132 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 197 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
133 &image->text_mapping); 198 &text_mapping);
134 199
135 if (IS_ERR(vma)) { 200 if (IS_ERR(vma)) {
136 ret = PTR_ERR(vma); 201 ret = PTR_ERR(vma);
@@ -140,7 +205,8 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
140 vma = _install_special_mapping(mm, 205 vma = _install_special_mapping(mm,
141 addr, 206 addr,
142 -image->sym_vvar_start, 207 -image->sym_vvar_start,
143 VM_READ|VM_MAYREAD, 208 VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP|
209 VM_PFNMAP,
144 &vvar_mapping); 210 &vvar_mapping);
145 211
146 if (IS_ERR(vma)) { 212 if (IS_ERR(vma)) {
@@ -148,41 +214,6 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
148 goto up_fail; 214 goto up_fail;
149 } 215 }
150 216
151 if (image->sym_vvar_page)
152 ret = remap_pfn_range(vma,
153 text_start + image->sym_vvar_page,
154 __pa_symbol(&__vvar_page) >> PAGE_SHIFT,
155 PAGE_SIZE,
156 PAGE_READONLY);
157
158 if (ret)
159 goto up_fail;
160
161#ifdef CONFIG_HPET_TIMER
162 if (hpet_address && image->sym_hpet_page) {
163 ret = io_remap_pfn_range(vma,
164 text_start + image->sym_hpet_page,
165 hpet_address >> PAGE_SHIFT,
166 PAGE_SIZE,
167 pgprot_noncached(PAGE_READONLY));
168
169 if (ret)
170 goto up_fail;
171 }
172#endif
173
174 pvti = pvclock_pvti_cpu0_va();
175 if (pvti && image->sym_pvclock_page) {
176 ret = remap_pfn_range(vma,
177 text_start + image->sym_pvclock_page,
178 __pa(pvti) >> PAGE_SHIFT,
179 PAGE_SIZE,
180 PAGE_READONLY);
181
182 if (ret)
183 goto up_fail;
184 }
185
186up_fail: 217up_fail:
187 if (ret) 218 if (ret)
188 current->mm->context.vdso = NULL; 219 current->mm->context.vdso = NULL;
@@ -254,7 +285,7 @@ static void vgetcpu_cpu_init(void *arg)
254#ifdef CONFIG_NUMA 285#ifdef CONFIG_NUMA
255 node = cpu_to_node(cpu); 286 node = cpu_to_node(cpu);
256#endif 287#endif
257 if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) 288 if (static_cpu_has(X86_FEATURE_RDTSCP))
258 write_rdtscp_aux((node << 12) | cpu); 289 write_rdtscp_aux((node << 12) | cpu);
259 290
260 /* 291 /*
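
The vvar_fault() arithmetic above is easier to see with numbers. A standalone userspace sketch with made-up page offsets (the real layout comes from the vdso image symbols, so the -3/-2/-1 page values here are assumptions for illustration only):

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

static const long sym_vvar_start   = -3 * (long)PAGE_SIZE;	/* vvar area starts below the text */
static const long sym_vvar_page    = -3 * (long)PAGE_SIZE;
static const long sym_hpet_page    = -2 * (long)PAGE_SIZE;
static const long sym_pvclock_page = -1 * (long)PAGE_SIZE;

static const char *classify(unsigned long pgoff)
{
	long sym_offset = (long)(pgoff << PAGE_SHIFT) + sym_vvar_start;

	if (sym_offset == sym_vvar_page)
		return "vvar data page";
	if (sym_offset == sym_hpet_page)
		return "hpet page";
	if (sym_offset == sym_pvclock_page)
		return "pvclock page";
	return "no such page -> SIGBUS";	/* pgoff 3 gives sym_offset == 0: past the end */
}

int main(void)
{
	unsigned long pgoff;

	for (pgoff = 0; pgoff < 4; pgoff++)
		printf("pgoff %lu -> %s\n", pgoff, classify(pgoff));
	return 0;
}
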
diff --git a/arch/x86/entry/vsyscall/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c
index 51e330416995..0fb3a104ac62 100644
--- a/arch/x86/entry/vsyscall/vsyscall_gtod.c
+++ b/arch/x86/entry/vsyscall/vsyscall_gtod.c
@@ -16,6 +16,8 @@
16#include <asm/vgtod.h> 16#include <asm/vgtod.h>
17#include <asm/vvar.h> 17#include <asm/vvar.h>
18 18
19int vclocks_used __read_mostly;
20
19DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); 21DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
20 22
21void update_vsyscall_tz(void) 23void update_vsyscall_tz(void)
@@ -26,12 +28,17 @@ void update_vsyscall_tz(void)
26 28
27void update_vsyscall(struct timekeeper *tk) 29void update_vsyscall(struct timekeeper *tk)
28{ 30{
31 int vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
29 struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; 32 struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data;
30 33
34 /* Mark the new vclock used. */
35 BUILD_BUG_ON(VCLOCK_MAX >= 32);
36 WRITE_ONCE(vclocks_used, READ_ONCE(vclocks_used) | (1 << vclock_mode));
37
31 gtod_write_begin(vdata); 38 gtod_write_begin(vdata);
32 39
33 /* copy vsyscall data */ 40 /* copy vsyscall data */
34 vdata->vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; 41 vdata->vclock_mode = vclock_mode;
35 vdata->cycle_last = tk->tkr_mono.cycle_last; 42 vdata->cycle_last = tk->tkr_mono.cycle_last;
36 vdata->mask = tk->tkr_mono.mask; 43 vdata->mask = tk->tkr_mono.mask;
37 vdata->mult = tk->tkr_mono.mult; 44 vdata->mult = tk->tkr_mono.mult;
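
The new vclocks_used word is just a bitmask of every vclock mode this boot has ever selected; vvar_fault() above consults it so the HPET and pvclock pages are only mapped when the corresponding mode was actually used. A standalone sketch of that bookkeeping (the real code uses READ_ONCE/WRITE_ONCE and the vclock_was_used() helper added elsewhere in this series):

#include <stdio.h>

#define VCLOCK_NONE	0
#define VCLOCK_TSC	1
#define VCLOCK_HPET	2
#define VCLOCK_PVCLOCK	3
#define VCLOCK_MAX	3	/* must stay below 32 for the bitmask to work */

static unsigned int vclocks_used;

static void mark_vclock_used(int vclock_mode)
{
	vclocks_used |= 1U << vclock_mode;		/* update_vsyscall() side */
}

static int vclock_was_used(int vclock)
{
	return (vclocks_used & (1U << vclock)) != 0;	/* fault-path side */
}

int main(void)
{
	mark_vclock_used(VCLOCK_TSC);
	printf("tsc used: %d, hpet used: %d\n",
	       vclock_was_used(VCLOCK_TSC), vclock_was_used(VCLOCK_HPET));
	return 0;
}
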
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 7bfc85bbb8ff..99afb665a004 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -152,12 +152,6 @@ static inline int alternatives_text_reserved(void *start, void *end)
152 ".popsection" 152 ".popsection"
153 153
154/* 154/*
155 * This must be included *after* the definition of ALTERNATIVE due to
156 * <asm/arch_hweight.h>
157 */
158#include <asm/cpufeature.h>
159
160/*
161 * Alternative instructions for different CPU types or capabilities. 155 * Alternative instructions for different CPU types or capabilities.
162 * 156 *
163 * This allows to use optimized instructions even on generic binary 157 * This allows to use optimized instructions even on generic binary
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index c80f6b6f3da2..0899cfc8dfe8 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -6,7 +6,6 @@
6 6
7#include <asm/alternative.h> 7#include <asm/alternative.h>
8#include <asm/cpufeature.h> 8#include <asm/cpufeature.h>
9#include <asm/processor.h>
10#include <asm/apicdef.h> 9#include <asm/apicdef.h>
11#include <linux/atomic.h> 10#include <linux/atomic.h>
12#include <asm/fixmap.h> 11#include <asm/fixmap.h>
diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h
index 259a7c1ef709..02e799fa43d1 100644
--- a/arch/x86/include/asm/arch_hweight.h
+++ b/arch/x86/include/asm/arch_hweight.h
@@ -1,6 +1,8 @@
1#ifndef _ASM_X86_HWEIGHT_H 1#ifndef _ASM_X86_HWEIGHT_H
2#define _ASM_X86_HWEIGHT_H 2#define _ASM_X86_HWEIGHT_H
3 3
4#include <asm/cpufeatures.h>
5
4#ifdef CONFIG_64BIT 6#ifdef CONFIG_64BIT
5/* popcnt %edi, %eax -- redundant REX prefix for alignment */ 7/* popcnt %edi, %eax -- redundant REX prefix for alignment */
6#define POPCNT32 ".byte 0xf3,0x40,0x0f,0xb8,0xc7" 8#define POPCNT32 ".byte 0xf3,0x40,0x0f,0xb8,0xc7"
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index cfe3b954d5e4..7766d1cf096e 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -91,7 +91,7 @@ set_bit(long nr, volatile unsigned long *addr)
91 * If it's called on the same region of memory simultaneously, the effect 91 * If it's called on the same region of memory simultaneously, the effect
92 * may be that only one operation succeeds. 92 * may be that only one operation succeeds.
93 */ 93 */
94static inline void __set_bit(long nr, volatile unsigned long *addr) 94static __always_inline void __set_bit(long nr, volatile unsigned long *addr)
95{ 95{
96 asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory"); 96 asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory");
97} 97}
@@ -128,13 +128,13 @@ clear_bit(long nr, volatile unsigned long *addr)
128 * clear_bit() is atomic and implies release semantics before the memory 128 * clear_bit() is atomic and implies release semantics before the memory
129 * operation. It can be used for an unlock. 129 * operation. It can be used for an unlock.
130 */ 130 */
131static inline void clear_bit_unlock(long nr, volatile unsigned long *addr) 131static __always_inline void clear_bit_unlock(long nr, volatile unsigned long *addr)
132{ 132{
133 barrier(); 133 barrier();
134 clear_bit(nr, addr); 134 clear_bit(nr, addr);
135} 135}
136 136
137static inline void __clear_bit(long nr, volatile unsigned long *addr) 137static __always_inline void __clear_bit(long nr, volatile unsigned long *addr)
138{ 138{
139 asm volatile("btr %1,%0" : ADDR : "Ir" (nr)); 139 asm volatile("btr %1,%0" : ADDR : "Ir" (nr));
140} 140}
@@ -151,7 +151,7 @@ static inline void __clear_bit(long nr, volatile unsigned long *addr)
151 * No memory barrier is required here, because x86 cannot reorder stores past 151 * No memory barrier is required here, because x86 cannot reorder stores past
152 * older loads. Same principle as spin_unlock. 152 * older loads. Same principle as spin_unlock.
153 */ 153 */
154static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) 154static __always_inline void __clear_bit_unlock(long nr, volatile unsigned long *addr)
155{ 155{
156 barrier(); 156 barrier();
157 __clear_bit(nr, addr); 157 __clear_bit(nr, addr);
@@ -166,7 +166,7 @@ static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr)
166 * If it's called on the same region of memory simultaneously, the effect 166 * If it's called on the same region of memory simultaneously, the effect
167 * may be that only one operation succeeds. 167 * may be that only one operation succeeds.
168 */ 168 */
169static inline void __change_bit(long nr, volatile unsigned long *addr) 169static __always_inline void __change_bit(long nr, volatile unsigned long *addr)
170{ 170{
171 asm volatile("btc %1,%0" : ADDR : "Ir" (nr)); 171 asm volatile("btc %1,%0" : ADDR : "Ir" (nr));
172} 172}
@@ -180,7 +180,7 @@ static inline void __change_bit(long nr, volatile unsigned long *addr)
180 * Note that @nr may be almost arbitrarily large; this function is not 180 * Note that @nr may be almost arbitrarily large; this function is not
181 * restricted to acting on a single-word quantity. 181 * restricted to acting on a single-word quantity.
182 */ 182 */
183static inline void change_bit(long nr, volatile unsigned long *addr) 183static __always_inline void change_bit(long nr, volatile unsigned long *addr)
184{ 184{
185 if (IS_IMMEDIATE(nr)) { 185 if (IS_IMMEDIATE(nr)) {
186 asm volatile(LOCK_PREFIX "xorb %1,%0" 186 asm volatile(LOCK_PREFIX "xorb %1,%0"
@@ -201,7 +201,7 @@ static inline void change_bit(long nr, volatile unsigned long *addr)
201 * This operation is atomic and cannot be reordered. 201 * This operation is atomic and cannot be reordered.
202 * It also implies a memory barrier. 202 * It also implies a memory barrier.
203 */ 203 */
204static inline int test_and_set_bit(long nr, volatile unsigned long *addr) 204static __always_inline int test_and_set_bit(long nr, volatile unsigned long *addr)
205{ 205{
206 GEN_BINARY_RMWcc(LOCK_PREFIX "bts", *addr, "Ir", nr, "%0", "c"); 206 GEN_BINARY_RMWcc(LOCK_PREFIX "bts", *addr, "Ir", nr, "%0", "c");
207} 207}
@@ -228,7 +228,7 @@ test_and_set_bit_lock(long nr, volatile unsigned long *addr)
228 * If two examples of this operation race, one can appear to succeed 228 * If two examples of this operation race, one can appear to succeed
229 * but actually fail. You must protect multiple accesses with a lock. 229 * but actually fail. You must protect multiple accesses with a lock.
230 */ 230 */
231static inline int __test_and_set_bit(long nr, volatile unsigned long *addr) 231static __always_inline int __test_and_set_bit(long nr, volatile unsigned long *addr)
232{ 232{
233 int oldbit; 233 int oldbit;
234 234
@@ -247,7 +247,7 @@ static inline int __test_and_set_bit(long nr, volatile unsigned long *addr)
247 * This operation is atomic and cannot be reordered. 247 * This operation is atomic and cannot be reordered.
248 * It also implies a memory barrier. 248 * It also implies a memory barrier.
249 */ 249 */
250static inline int test_and_clear_bit(long nr, volatile unsigned long *addr) 250static __always_inline int test_and_clear_bit(long nr, volatile unsigned long *addr)
251{ 251{
252 GEN_BINARY_RMWcc(LOCK_PREFIX "btr", *addr, "Ir", nr, "%0", "c"); 252 GEN_BINARY_RMWcc(LOCK_PREFIX "btr", *addr, "Ir", nr, "%0", "c");
253} 253}
@@ -268,7 +268,7 @@ static inline int test_and_clear_bit(long nr, volatile unsigned long *addr)
268 * accessed from a hypervisor on the same CPU if running in a VM: don't change 268 * accessed from a hypervisor on the same CPU if running in a VM: don't change
269 * this without also updating arch/x86/kernel/kvm.c 269 * this without also updating arch/x86/kernel/kvm.c
270 */ 270 */
271static inline int __test_and_clear_bit(long nr, volatile unsigned long *addr) 271static __always_inline int __test_and_clear_bit(long nr, volatile unsigned long *addr)
272{ 272{
273 int oldbit; 273 int oldbit;
274 274
@@ -280,7 +280,7 @@ static inline int __test_and_clear_bit(long nr, volatile unsigned long *addr)
280} 280}
281 281
282/* WARNING: non atomic and it can be reordered! */ 282/* WARNING: non atomic and it can be reordered! */
283static inline int __test_and_change_bit(long nr, volatile unsigned long *addr) 283static __always_inline int __test_and_change_bit(long nr, volatile unsigned long *addr)
284{ 284{
285 int oldbit; 285 int oldbit;
286 286
@@ -300,7 +300,7 @@ static inline int __test_and_change_bit(long nr, volatile unsigned long *addr)
300 * This operation is atomic and cannot be reordered. 300 * This operation is atomic and cannot be reordered.
301 * It also implies a memory barrier. 301 * It also implies a memory barrier.
302 */ 302 */
303static inline int test_and_change_bit(long nr, volatile unsigned long *addr) 303static __always_inline int test_and_change_bit(long nr, volatile unsigned long *addr)
304{ 304{
305 GEN_BINARY_RMWcc(LOCK_PREFIX "btc", *addr, "Ir", nr, "%0", "c"); 305 GEN_BINARY_RMWcc(LOCK_PREFIX "btc", *addr, "Ir", nr, "%0", "c");
306} 306}
@@ -311,7 +311,7 @@ static __always_inline int constant_test_bit(long nr, const volatile unsigned lo
311 (addr[nr >> _BITOPS_LONG_SHIFT])) != 0; 311 (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
312} 312}
313 313
314static inline int variable_test_bit(long nr, volatile const unsigned long *addr) 314static __always_inline int variable_test_bit(long nr, volatile const unsigned long *addr)
315{ 315{
316 int oldbit; 316 int oldbit;
317 317
@@ -343,7 +343,7 @@ static int test_bit(int nr, const volatile unsigned long *addr);
343 * 343 *
344 * Undefined if no bit exists, so code should check against 0 first. 344 * Undefined if no bit exists, so code should check against 0 first.
345 */ 345 */
346static inline unsigned long __ffs(unsigned long word) 346static __always_inline unsigned long __ffs(unsigned long word)
347{ 347{
348 asm("rep; bsf %1,%0" 348 asm("rep; bsf %1,%0"
349 : "=r" (word) 349 : "=r" (word)
@@ -357,7 +357,7 @@ static inline unsigned long __ffs(unsigned long word)
357 * 357 *
358 * Undefined if no zero exists, so code should check against ~0UL first. 358 * Undefined if no zero exists, so code should check against ~0UL first.
359 */ 359 */
360static inline unsigned long ffz(unsigned long word) 360static __always_inline unsigned long ffz(unsigned long word)
361{ 361{
362 asm("rep; bsf %1,%0" 362 asm("rep; bsf %1,%0"
363 : "=r" (word) 363 : "=r" (word)
@@ -371,7 +371,7 @@ static inline unsigned long ffz(unsigned long word)
371 * 371 *
372 * Undefined if no set bit exists, so code should check against 0 first. 372 * Undefined if no set bit exists, so code should check against 0 first.
373 */ 373 */
374static inline unsigned long __fls(unsigned long word) 374static __always_inline unsigned long __fls(unsigned long word)
375{ 375{
376 asm("bsr %1,%0" 376 asm("bsr %1,%0"
377 : "=r" (word) 377 : "=r" (word)
@@ -393,7 +393,7 @@ static inline unsigned long __fls(unsigned long word)
393 * set bit if value is nonzero. The first (least significant) bit 393 * set bit if value is nonzero. The first (least significant) bit
394 * is at position 1. 394 * is at position 1.
395 */ 395 */
396static inline int ffs(int x) 396static __always_inline int ffs(int x)
397{ 397{
398 int r; 398 int r;
399 399
@@ -434,7 +434,7 @@ static inline int ffs(int x)
434 * set bit if value is nonzero. The last (most significant) bit is 434 * set bit if value is nonzero. The last (most significant) bit is
435 * at position 32. 435 * at position 32.
436 */ 436 */
437static inline int fls(int x) 437static __always_inline int fls(int x)
438{ 438{
439 int r; 439 int r;
440 440
diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h
index eda81dc0f4ae..d194266acb28 100644
--- a/arch/x86/include/asm/clocksource.h
+++ b/arch/x86/include/asm/clocksource.h
@@ -3,10 +3,11 @@
3#ifndef _ASM_X86_CLOCKSOURCE_H 3#ifndef _ASM_X86_CLOCKSOURCE_H
4#define _ASM_X86_CLOCKSOURCE_H 4#define _ASM_X86_CLOCKSOURCE_H
5 5
6#define VCLOCK_NONE 0 /* No vDSO clock available. */ 6#define VCLOCK_NONE 0 /* No vDSO clock available. */
7#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ 7#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */
8#define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */ 8#define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */
9#define VCLOCK_PVCLOCK 3 /* vDSO should use vread_pvclock. */ 9#define VCLOCK_PVCLOCK 3 /* vDSO should use vread_pvclock. */
10#define VCLOCK_MAX 3
10 11
11struct arch_clocksource_data { 12struct arch_clocksource_data {
12 int vclock_mode; 13 int vclock_mode;
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index ad19841eddfe..9733361fed6f 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -2,6 +2,7 @@
2#define ASM_X86_CMPXCHG_H 2#define ASM_X86_CMPXCHG_H
3 3
4#include <linux/compiler.h> 4#include <linux/compiler.h>
5#include <asm/cpufeatures.h>
5#include <asm/alternative.h> /* Provides LOCK_PREFIX */ 6#include <asm/alternative.h> /* Provides LOCK_PREFIX */
6 7
7/* 8/*
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 7ad8c9464297..68e4e8258b84 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -1,288 +1,7 @@
1/*
2 * Defines x86 CPU feature bits
3 */
4#ifndef _ASM_X86_CPUFEATURE_H 1#ifndef _ASM_X86_CPUFEATURE_H
5#define _ASM_X86_CPUFEATURE_H 2#define _ASM_X86_CPUFEATURE_H
6 3
7#ifndef _ASM_X86_REQUIRED_FEATURES_H 4#include <asm/processor.h>
8#include <asm/required-features.h>
9#endif
10
11#ifndef _ASM_X86_DISABLED_FEATURES_H
12#include <asm/disabled-features.h>
13#endif
14
15#define NCAPINTS 16 /* N 32-bit words worth of info */
16#define NBUGINTS 1 /* N 32-bit bug flags */
17
18/*
19 * Note: If the comment begins with a quoted string, that string is used
20 * in /proc/cpuinfo instead of the macro name. If the string is "",
21 * this feature bit is not displayed in /proc/cpuinfo at all.
22 */
23
24/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */
25#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */
26#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */
27#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */
28#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */
29#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */
30#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */
31#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */
32#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */
33#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */
34#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */
35#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */
36#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */
37#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */
38#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */
39#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */
40 /* (plus FCMOVcc, FCOMI with FPU) */
41#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */
42#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */
43#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */
44#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */
45#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */
46#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */
47#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */
48#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */
49#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */
50#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */
51#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */
52#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */
53#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */
54#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */
55#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */
56
57/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */
58/* Don't duplicate feature flags which are redundant with Intel! */
59#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */
60#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */
61#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */
62#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */
63#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */
64#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */
65#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */
66#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */
67#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */
68#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! */
69
70/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */
71#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */
72#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */
73#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */
74
75/* Other features, Linux-defined mapping, word 3 */
76/* This range is used for feature bits which conflict or are synthesized */
77#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */
78#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */
79#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */
80#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */
81/* cpu types for specific tunings: */
82#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */
83#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */
84#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */
85#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */
86#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */
87#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */
88/* free, was #define X86_FEATURE_FXSAVE_LEAK ( 3*32+10) * "" FXSAVE leaks FOP/FIP/FOP */
89#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */
90#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */
91#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */
92#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */
93#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */
94#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */
95#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */
96#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */
97/* free, was #define X86_FEATURE_11AP ( 3*32+19) * "" Bad local APIC aka 11AP */
98#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */
99#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */
100#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */
101#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */
102#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */
103/* free, was #define X86_FEATURE_CLFLUSH_MONITOR ( 3*32+25) * "" clflush reqd with monitor */
104#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */
105#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */
106#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */
107#define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */
108#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
109
110/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
111#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */
112#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */
113#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */
114#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */
115#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */
116#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */
117#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */
118#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */
119#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */
120#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */
121#define X86_FEATURE_CID ( 4*32+10) /* Context ID */
122#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */
123#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */
124#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */
125#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */
126#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */
127#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */
128#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */
129#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */
130#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */
131#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */
132#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */
133#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */
134#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */
135#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */
136#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
137#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */
138#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */
139#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */
140#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */
141#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */
142
143/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
144#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */
145#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */
146#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */
147#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */
148#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */
149#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */
150#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */
151#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */
152#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */
153#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */
154
155/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */
156#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */
157#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */
158#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */
159#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */
160#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */
161#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */
162#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */
163#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */
164#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */
165#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */
166#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */
167#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */
168#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */
169#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */
170#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */
171#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */
172#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */
173#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */
174#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */
175#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */
176#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */
177#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */
178#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */
179#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */
180#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */
181
182/*
183 * Auxiliary flags: Linux defined - For features scattered in various
184 * CPUID levels like 0x6, 0xA etc, word 7.
185 *
186 * Reuse free bits when adding new feature flags!
187 */
188
189#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
190#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
191
192#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
193#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
194
195#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
196
197/* Virtualization flags: Linux defined, word 8 */
198#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
199#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */
200#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */
201#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */
202#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */
203
204#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */
205#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */
206
207
208/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
209#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
210#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */
211#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */
212#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */
213#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */
214#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */
215#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */
216#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */
217#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */
218#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */
219#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */
220#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */
221#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */
222#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */
223#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */
224#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */
225#define X86_FEATURE_PCOMMIT ( 9*32+22) /* PCOMMIT instruction */
226#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
227#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */
228#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */
229#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */
230#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */
231#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */
232
233/* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */
234#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */
235#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */
236#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */
237#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */
238
239/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */
240#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */
241
242/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */
243#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */
244
245/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */
246#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */
247
248/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */
249#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
250#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */
251#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */
252#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */
253#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */
254#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */
255#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */
256#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */
257#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */
258#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */
259
260/* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */
261#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */
262#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */
263#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */
264#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */
265#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */
266#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */
267#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */
268#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */
269#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */
270#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */
271
272/*
273 * BUG word(s)
274 */
275#define X86_BUG(x) (NCAPINTS*32 + (x))
276
277#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */
278#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */
279#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */
280#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */
281#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */
282#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */
283#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
284#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
285#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
286 5
287#if defined(__KERNEL__) && !defined(__ASSEMBLY__) 6#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
288 7
@@ -369,8 +88,7 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
369 * is not relevant. 88 * is not relevant.
370 */ 89 */
371#define cpu_feature_enabled(bit) \ 90#define cpu_feature_enabled(bit) \
372 (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : \ 91 (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : static_cpu_has(bit))
373 cpu_has(&boot_cpu_data, bit))
374 92
375#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) 93#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit)
376 94
@@ -406,106 +124,19 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
406#define cpu_has_osxsave boot_cpu_has(X86_FEATURE_OSXSAVE) 124#define cpu_has_osxsave boot_cpu_has(X86_FEATURE_OSXSAVE)
407#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) 125#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
408/* 126/*
409 * Do not add any more of those clumsy macros - use static_cpu_has_safe() for 127 * Do not add any more of those clumsy macros - use static_cpu_has() for
410 * fast paths and boot_cpu_has() otherwise! 128 * fast paths and boot_cpu_has() otherwise!
411 */ 129 */
412 130
413#if __GNUC__ >= 4 && defined(CONFIG_X86_FAST_FEATURE_TESTS) 131#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS)
414extern void warn_pre_alternatives(void);
415extern bool __static_cpu_has_safe(u16 bit);
416
417/* 132/*
418 * Static testing of CPU features. Used the same as boot_cpu_has(). 133 * Static testing of CPU features. Used the same as boot_cpu_has().
419 * These are only valid after alternatives have run, but will statically 134 * These will statically patch the target code for additional
420 * patch the target code for additional performance. 135 * performance.
421 */ 136 */
422static __always_inline __pure bool __static_cpu_has(u16 bit) 137static __always_inline __pure bool _static_cpu_has(u16 bit)
423{
424#ifdef CC_HAVE_ASM_GOTO
425
426#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS
427
428 /*
429 * Catch too early usage of this before alternatives
430 * have run.
431 */
432 asm_volatile_goto("1: jmp %l[t_warn]\n"
433 "2:\n"
434 ".section .altinstructions,\"a\"\n"
435 " .long 1b - .\n"
436 " .long 0\n" /* no replacement */
437 " .word %P0\n" /* 1: do replace */
438 " .byte 2b - 1b\n" /* source len */
439 " .byte 0\n" /* replacement len */
440 " .byte 0\n" /* pad len */
441 ".previous\n"
442 /* skipping size check since replacement size = 0 */
443 : : "i" (X86_FEATURE_ALWAYS) : : t_warn);
444
445#endif
446
447 asm_volatile_goto("1: jmp %l[t_no]\n"
448 "2:\n"
449 ".section .altinstructions,\"a\"\n"
450 " .long 1b - .\n"
451 " .long 0\n" /* no replacement */
452 " .word %P0\n" /* feature bit */
453 " .byte 2b - 1b\n" /* source len */
454 " .byte 0\n" /* replacement len */
455 " .byte 0\n" /* pad len */
456 ".previous\n"
457 /* skipping size check since replacement size = 0 */
458 : : "i" (bit) : : t_no);
459 return true;
460 t_no:
461 return false;
462
463#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS
464 t_warn:
465 warn_pre_alternatives();
466 return false;
467#endif
468
469#else /* CC_HAVE_ASM_GOTO */
470
471 u8 flag;
472 /* Open-coded due to __stringify() in ALTERNATIVE() */
473 asm volatile("1: movb $0,%0\n"
474 "2:\n"
475 ".section .altinstructions,\"a\"\n"
476 " .long 1b - .\n"
477 " .long 3f - .\n"
478 " .word %P1\n" /* feature bit */
479 " .byte 2b - 1b\n" /* source len */
480 " .byte 4f - 3f\n" /* replacement len */
481 " .byte 0\n" /* pad len */
482 ".previous\n"
483 ".section .discard,\"aw\",@progbits\n"
484 " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
485 ".previous\n"
486 ".section .altinstr_replacement,\"ax\"\n"
487 "3: movb $1,%0\n"
488 "4:\n"
489 ".previous\n"
490 : "=qm" (flag) : "i" (bit));
491 return flag;
492
493#endif /* CC_HAVE_ASM_GOTO */
494}
495
496#define static_cpu_has(bit) \
497( \
498 __builtin_constant_p(boot_cpu_has(bit)) ? \
499 boot_cpu_has(bit) : \
500 __builtin_constant_p(bit) ? \
501 __static_cpu_has(bit) : \
502 boot_cpu_has(bit) \
503)
504
505static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
506{ 138{
507#ifdef CC_HAVE_ASM_GOTO 139 asm_volatile_goto("1: jmp 6f\n"
508 asm_volatile_goto("1: jmp %l[t_dynamic]\n"
509 "2:\n" 140 "2:\n"
510 ".skip -(((5f-4f) - (2b-1b)) > 0) * " 141 ".skip -(((5f-4f) - (2b-1b)) > 0) * "
511 "((5f-4f) - (2b-1b)),0x90\n" 142 "((5f-4f) - (2b-1b)),0x90\n"
@@ -530,66 +161,34 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
530 " .byte 0\n" /* repl len */ 161 " .byte 0\n" /* repl len */
531 " .byte 0\n" /* pad len */ 162 " .byte 0\n" /* pad len */
532 ".previous\n" 163 ".previous\n"
533 : : "i" (bit), "i" (X86_FEATURE_ALWAYS) 164 ".section .altinstr_aux,\"ax\"\n"
534 : : t_dynamic, t_no); 165 "6:\n"
166 " testb %[bitnum],%[cap_byte]\n"
167 " jnz %l[t_yes]\n"
168 " jmp %l[t_no]\n"
169 ".previous\n"
170 : : "i" (bit), "i" (X86_FEATURE_ALWAYS),
171 [bitnum] "i" (1 << (bit & 7)),
172 [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3])
173 : : t_yes, t_no);
174 t_yes:
535 return true; 175 return true;
536 t_no: 176 t_no:
537 return false; 177 return false;
538 t_dynamic:
539 return __static_cpu_has_safe(bit);
540#else
541 u8 flag;
542 /* Open-coded due to __stringify() in ALTERNATIVE() */
543 asm volatile("1: movb $2,%0\n"
544 "2:\n"
545 ".section .altinstructions,\"a\"\n"
546 " .long 1b - .\n" /* src offset */
547 " .long 3f - .\n" /* repl offset */
548 " .word %P2\n" /* always replace */
549 " .byte 2b - 1b\n" /* source len */
550 " .byte 4f - 3f\n" /* replacement len */
551 " .byte 0\n" /* pad len */
552 ".previous\n"
553 ".section .discard,\"aw\",@progbits\n"
554 " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
555 ".previous\n"
556 ".section .altinstr_replacement,\"ax\"\n"
557 "3: movb $0,%0\n"
558 "4:\n"
559 ".previous\n"
560 ".section .altinstructions,\"a\"\n"
561 " .long 1b - .\n" /* src offset */
562 " .long 5f - .\n" /* repl offset */
563 " .word %P1\n" /* feature bit */
564 " .byte 4b - 3b\n" /* src len */
565 " .byte 6f - 5f\n" /* repl len */
566 " .byte 0\n" /* pad len */
567 ".previous\n"
568 ".section .discard,\"aw\",@progbits\n"
569 " .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */
570 ".previous\n"
571 ".section .altinstr_replacement,\"ax\"\n"
572 "5: movb $1,%0\n"
573 "6:\n"
574 ".previous\n"
575 : "=qm" (flag)
576 : "i" (bit), "i" (X86_FEATURE_ALWAYS));
577 return (flag == 2 ? __static_cpu_has_safe(bit) : flag);
578#endif /* CC_HAVE_ASM_GOTO */
579} 178}
580 179
581#define static_cpu_has_safe(bit) \ 180#define static_cpu_has(bit) \
582( \ 181( \
583 __builtin_constant_p(boot_cpu_has(bit)) ? \ 182 __builtin_constant_p(boot_cpu_has(bit)) ? \
584 boot_cpu_has(bit) : \ 183 boot_cpu_has(bit) : \
585 _static_cpu_has_safe(bit) \ 184 _static_cpu_has(bit) \
586) 185)
587#else 186#else
588/* 187/*
589 * gcc 3.x is too stupid to do the static test; fall back to dynamic. 188 * Fall back to dynamic for gcc versions which don't support asm goto. Should be
189 * a minority now anyway.
590 */ 190 */
591#define static_cpu_has(bit) boot_cpu_has(bit) 191#define static_cpu_has(bit) boot_cpu_has(bit)
592#define static_cpu_has_safe(bit) boot_cpu_has(bit)
593#endif 192#endif
594 193
595#define cpu_has_bug(c, bit) cpu_has(c, (bit)) 194#define cpu_has_bug(c, bit) cpu_has(c, (bit))
@@ -597,7 +196,6 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
597#define clear_cpu_bug(c, bit) clear_cpu_cap(c, (bit)) 196#define clear_cpu_bug(c, bit) clear_cpu_cap(c, (bit))
598 197
599#define static_cpu_has_bug(bit) static_cpu_has((bit)) 198#define static_cpu_has_bug(bit) static_cpu_has((bit))
600#define static_cpu_has_bug_safe(bit) static_cpu_has_safe((bit))
601#define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit)) 199#define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit))
602 200
603#define MAX_CPU_FEATURES (NCAPINTS * 32) 201#define MAX_CPU_FEATURES (NCAPINTS * 32)
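
Written as plain C, the dynamic test performed by the new .altinstr_aux stub (testb %[bitnum],%[cap_byte]) is just a byte-wise probe of the capability bitmap. The array and feature value below are stand-ins for boot_cpu_data.x86_capability and an X86_FEATURE_* constant, so this is a model of the slow path only, not of the patched fast path:

#include <stdio.h>

typedef unsigned short u16;

static unsigned int x86_capability[16];		/* NCAPINTS 32-bit words */

static int dynamic_cpu_has(u16 bit)
{
	/* byte-wise view of the bitmap, as the testb above uses (little-endian, as on x86) */
	const char *caps = (const char *)x86_capability;

	return (caps[bit >> 3] & (1 << (bit & 7))) != 0;
}

int main(void)
{
	u16 rdtscp = 1 * 32 + 27;	/* X86_FEATURE_RDTSCP: word 1, bit 27 */

	x86_capability[1] |= 1U << 27;	/* pretend CPUID reported RDTSCP */
	printf("rdtscp: %d\n", dynamic_cpu_has(rdtscp));
	return 0;
}
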
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
new file mode 100644
index 000000000000..074b7604bd51
--- /dev/null
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -0,0 +1,300 @@
1#ifndef _ASM_X86_CPUFEATURES_H
2#define _ASM_X86_CPUFEATURES_H
3
4#ifndef _ASM_X86_REQUIRED_FEATURES_H
5#include <asm/required-features.h>
6#endif
7
8#ifndef _ASM_X86_DISABLED_FEATURES_H
9#include <asm/disabled-features.h>
10#endif
11
12/*
13 * Defines x86 CPU feature bits
14 */
15#define NCAPINTS 16 /* N 32-bit words worth of info */
16#define NBUGINTS 1 /* N 32-bit bug flags */
17
18/*
19 * Note: If the comment begins with a quoted string, that string is used
20 * in /proc/cpuinfo instead of the macro name. If the string is "",
21 * this feature bit is not displayed in /proc/cpuinfo at all.
22 */
23
24/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */
25#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */
26#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */
27#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */
28#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */
29#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */
30#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */
31#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */
32#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */
33#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */
34#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */
35#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */
36#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */
37#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */
38#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */
39#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */
40 /* (plus FCMOVcc, FCOMI with FPU) */
41#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */
42#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */
43#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */
44#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */
45#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */
46#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */
47#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */
48#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */
49#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */
50#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */
51#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */
52#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */
53#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */
54#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */
55#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */
56
57/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */
58/* Don't duplicate feature flags which are redundant with Intel! */
59#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */
60#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */
61#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */
62#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */
63#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */
64#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */
65#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */
66#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */
67#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */
68#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! */
69
70/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */
71#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */
72#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */
73#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */
74
75/* Other features, Linux-defined mapping, word 3 */
76/* This range is used for feature bits which conflict or are synthesized */
77#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */
78#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */
79#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */
80#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */
81/* cpu types for specific tunings: */
82#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */
83#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */
84#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */
85#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */
86#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */
87#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */
88#define X86_FEATURE_ART ( 3*32+10) /* Platform has always running timer (ART) */
89#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */
90#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */
91#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */
92#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */
93#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */
94#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */
95#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */
96#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */
97/* free, was #define X86_FEATURE_11AP ( 3*32+19) * "" Bad local APIC aka 11AP */
98#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */
99#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */
100#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */
101#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */
102#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */
103/* free, was #define X86_FEATURE_CLFLUSH_MONITOR ( 3*32+25) * "" clflush reqd with monitor */
104#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */
105#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */
106#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */
107#define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */
108#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
109#define X86_FEATURE_MCE_RECOVERY ( 3*32+31) /* cpu has recoverable machine checks */
110
111/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
112#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */
113#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */
114#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */
115#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */
116#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */
117#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */
118#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */
119#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */
120#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */
121#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */
122#define X86_FEATURE_CID ( 4*32+10) /* Context ID */
123#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */
124#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */
125#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */
126#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */
127#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */
128#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */
129#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */
130#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */
131#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */
132#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */
133#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */
134#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */
135#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */
136#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */
137#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
138#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */
139#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */
140#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */
141#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */
142#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */
143
144/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
145#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */
146#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */
147#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */
148#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */
149#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */
150#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */
151#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */
152#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */
153#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */
154#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */
155
156/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */
157#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */
158#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */
159#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */
160#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */
161#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */
162#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */
163#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */
164#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */
165#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */
166#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */
167#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */
168#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */
169#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */
170#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */
171#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */
172#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */
173#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */
174#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */
175#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */
176#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */
177#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */
178#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */
179#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */
180#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */
181#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */
182
183/*
184 * Auxiliary flags: Linux defined - For features scattered in various
185 * CPUID levels like 0x6, 0xA etc, word 7.
186 *
187 * Reuse free bits when adding new feature flags!
188 */
189
190#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
191#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
192
193#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
194#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
195
196#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
197
198/* Virtualization flags: Linux defined, word 8 */
199#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
200#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */
201#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */
202#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */
203#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */
204
205#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */
206#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */
207
208
209/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
210#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
211#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */
212#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */
213#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */
214#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */
215#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */
216#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */
217#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */
218#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */
219#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */
220#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */
221#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */
222#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */
223#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */
224#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */
225#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */
226#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */
227#define X86_FEATURE_PCOMMIT ( 9*32+22) /* PCOMMIT instruction */
228#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
229#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */
230#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */
231#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */
232#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */
233#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */
234#define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */
235#define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */
236
237/* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */
238#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */
239#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */
240#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */
241#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */
242
243/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */
244#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */
245
246/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */
247#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */
248
249/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */
250#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */
251
252/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */
253#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
254#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */
255#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */
256#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */
257#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */
258#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */
259#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */
260#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */
261#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */
262#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */
263
264/* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */
265#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */
266#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */
267#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */
268#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */
269#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */
270#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */
271#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */
272#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */
273#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */
274#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */
275#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */
276
277/*
278 * BUG word(s)
279 */
280#define X86_BUG(x) (NCAPINTS*32 + (x))
281
282#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */
283#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */
284#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */
285#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */
286#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */
287#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */
288#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
289#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
290#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
291
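
As a quick sanity check on the layout (a worked example, assuming the bug word sits directly after the capability words, as the cpu_has_bug()/boot_cpu_has_bug() wrappers in cpufeature.h suggest): with NCAPINTS = 16, X86_BUG_F00F = X86_BUG(0) = 16*32 + 0 = bit 512, i.e. word 16, bit 0 -- the first bit of the single NBUGINTS word, so the existing word/bit test machinery works unchanged for bug flags.
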
292#ifdef CONFIG_X86_32
293/*
294 * 64-bit kernels don't use X86_BUG_ESPFIX. Make the define conditional
295 * to avoid confusion.
296 */
297#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */
298#endif
299
300#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h
index 278441f39856..eb5deb42484d 100644
--- a/arch/x86/include/asm/desc_defs.h
+++ b/arch/x86/include/asm/desc_defs.h
@@ -98,4 +98,27 @@ struct desc_ptr {
98 98
99#endif /* !__ASSEMBLY__ */ 99#endif /* !__ASSEMBLY__ */
100 100
101/* Access rights as returned by LAR */
102#define AR_TYPE_RODATA (0 * (1 << 9))
103#define AR_TYPE_RWDATA (1 * (1 << 9))
104#define AR_TYPE_RODATA_EXPDOWN (2 * (1 << 9))
105#define AR_TYPE_RWDATA_EXPDOWN (3 * (1 << 9))
106#define AR_TYPE_XOCODE (4 * (1 << 9))
107#define AR_TYPE_XRCODE (5 * (1 << 9))
108#define AR_TYPE_XOCODE_CONF (6 * (1 << 9))
109#define AR_TYPE_XRCODE_CONF (7 * (1 << 9))
110#define AR_TYPE_MASK (7 * (1 << 9))
111
112#define AR_DPL0 (0 * (1 << 13))
113#define AR_DPL3 (3 * (1 << 13))
114#define AR_DPL_MASK (3 * (1 << 13))
115
116#define AR_A (1 << 8) /* "Accessed" */
117#define AR_S (1 << 12) /* If clear, "System" segment */
118#define AR_P (1 << 15) /* "Present" */
119#define AR_AVL (1 << 20) /* "AVaiLable" (no HW effect) */
120#define AR_L (1 << 21) /* "Long mode" for code segments */
121#define AR_DB (1 << 22) /* D/B, effect depends on type */
122#define AR_G (1 << 23) /* "Granularity" (limit in pages) */
123
101#endif /* _ASM_X86_DESC_DEFS_H */ 124#endif /* _ASM_X86_DESC_DEFS_H */
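
The AR_* constants added above describe the access-rights word as returned by the LAR instruction, so callers can mask and compare instead of open-coding shifts. A minimal sketch of how they compose (the helper below is hypothetical, not part of the patch):

    /* Hypothetical helper: is 'ar' a present, user (DPL 3), execute/read code segment? */
    static inline bool ar_is_user_xrcode(unsigned int ar)
    {
            return (ar & AR_P) &&                           /* present                    */
                   (ar & AR_S) &&                           /* code/data, not system      */
                   (ar & AR_DPL_MASK) == AR_DPL3 &&         /* user privilege level       */
                   (ar & AR_TYPE_MASK) == AR_TYPE_XRCODE;   /* executable + readable code */
    }
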
diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h
index 535192f6bfad..3c69fed215c5 100644
--- a/arch/x86/include/asm/dmi.h
+++ b/arch/x86/include/asm/dmi.h
@@ -15,7 +15,7 @@ static __always_inline __init void *dmi_alloc(unsigned len)
15/* Use early IO mappings for DMI because it's initialized early */ 15/* Use early IO mappings for DMI because it's initialized early */
16#define dmi_early_remap early_ioremap 16#define dmi_early_remap early_ioremap
17#define dmi_early_unmap early_iounmap 17#define dmi_early_unmap early_iounmap
18#define dmi_remap ioremap 18#define dmi_remap ioremap_cache
19#define dmi_unmap iounmap 19#define dmi_unmap iounmap
20 20
21#endif /* _ASM_X86_DMI_H */ 21#endif /* _ASM_X86_DMI_H */
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 6d7d0e52ed5a..8554f960e21b 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -138,7 +138,7 @@ extern void reserve_top_address(unsigned long reserve);
138extern int fixmaps_set; 138extern int fixmaps_set;
139 139
140extern pte_t *kmap_pte; 140extern pte_t *kmap_pte;
141extern pgprot_t kmap_prot; 141#define kmap_prot PAGE_KERNEL
142extern pte_t *pkmap_page_table; 142extern pte_t *pkmap_page_table;
143 143
144void __native_set_fixmap(enum fixed_addresses idx, pte_t pte); 144void __native_set_fixmap(enum fixed_addresses idx, pte_t pte);
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 0fd440df63f1..a2124343edf5 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -17,6 +17,7 @@
17#include <asm/user.h> 17#include <asm/user.h>
18#include <asm/fpu/api.h> 18#include <asm/fpu/api.h>
19#include <asm/fpu/xstate.h> 19#include <asm/fpu/xstate.h>
20#include <asm/cpufeature.h>
20 21
21/* 22/*
22 * High level FPU state handling functions: 23 * High level FPU state handling functions:
@@ -58,22 +59,22 @@ extern u64 fpu__get_supported_xfeatures_mask(void);
58 */ 59 */
59static __always_inline __pure bool use_eager_fpu(void) 60static __always_inline __pure bool use_eager_fpu(void)
60{ 61{
61 return static_cpu_has_safe(X86_FEATURE_EAGER_FPU); 62 return static_cpu_has(X86_FEATURE_EAGER_FPU);
62} 63}
63 64
64static __always_inline __pure bool use_xsaveopt(void) 65static __always_inline __pure bool use_xsaveopt(void)
65{ 66{
66 return static_cpu_has_safe(X86_FEATURE_XSAVEOPT); 67 return static_cpu_has(X86_FEATURE_XSAVEOPT);
67} 68}
68 69
69static __always_inline __pure bool use_xsave(void) 70static __always_inline __pure bool use_xsave(void)
70{ 71{
71 return static_cpu_has_safe(X86_FEATURE_XSAVE); 72 return static_cpu_has(X86_FEATURE_XSAVE);
72} 73}
73 74
74static __always_inline __pure bool use_fxsr(void) 75static __always_inline __pure bool use_fxsr(void)
75{ 76{
76 return static_cpu_has_safe(X86_FEATURE_FXSR); 77 return static_cpu_has(X86_FEATURE_FXSR);
77} 78}
78 79
79/* 80/*
@@ -300,7 +301,7 @@ static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate)
300 301
301 WARN_ON(system_state != SYSTEM_BOOTING); 302 WARN_ON(system_state != SYSTEM_BOOTING);
302 303
303 if (static_cpu_has_safe(X86_FEATURE_XSAVES)) 304 if (static_cpu_has(X86_FEATURE_XSAVES))
304 XSTATE_OP(XSAVES, xstate, lmask, hmask, err); 305 XSTATE_OP(XSAVES, xstate, lmask, hmask, err);
305 else 306 else
306 XSTATE_OP(XSAVE, xstate, lmask, hmask, err); 307 XSTATE_OP(XSAVE, xstate, lmask, hmask, err);
@@ -322,7 +323,7 @@ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate)
322 323
323 WARN_ON(system_state != SYSTEM_BOOTING); 324 WARN_ON(system_state != SYSTEM_BOOTING);
324 325
325 if (static_cpu_has_safe(X86_FEATURE_XSAVES)) 326 if (static_cpu_has(X86_FEATURE_XSAVES))
326 XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); 327 XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
327 else 328 else
328 XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); 329 XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
@@ -460,7 +461,7 @@ static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate)
460 * pending. Clear the x87 state here by setting it to fixed values. 461 * pending. Clear the x87 state here by setting it to fixed values.
461 * "m" is a random variable that should be in L1. 462 * "m" is a random variable that should be in L1.
462 */ 463 */
463 if (unlikely(static_cpu_has_bug_safe(X86_BUG_FXSAVE_LEAK))) { 464 if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) {
464 asm volatile( 465 asm volatile(
465 "fnclex\n\t" 466 "fnclex\n\t"
466 "emms\n\t" 467 "emms\n\t"
@@ -589,7 +590,8 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu)
589 * If the task has used the math, pre-load the FPU on xsave processors 590 * If the task has used the math, pre-load the FPU on xsave processors
590 * or if the past 5 consecutive context-switches used math. 591 * or if the past 5 consecutive context-switches used math.
591 */ 592 */
592 fpu.preload = new_fpu->fpstate_active && 593 fpu.preload = static_cpu_has(X86_FEATURE_FPU) &&
594 new_fpu->fpstate_active &&
593 (use_eager_fpu() || new_fpu->counter > 5); 595 (use_eager_fpu() || new_fpu->counter > 5);
594 596
595 if (old_fpu->fpregs_active) { 597 if (old_fpu->fpregs_active) {
diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h
index 793179cf8e21..6e4d170726b7 100644
--- a/arch/x86/include/asm/frame.h
+++ b/arch/x86/include/asm/frame.h
@@ -1,23 +1,44 @@
1#ifdef __ASSEMBLY__ 1#ifndef _ASM_X86_FRAME_H
2#define _ASM_X86_FRAME_H
2 3
3#include <asm/asm.h> 4#include <asm/asm.h>
4 5
5/* The annotation hides the frame from the unwinder and makes it look 6/*
6 like a ordinary ebp save/restore. This avoids some special cases for 7 * These are stack frame creation macros. They should be used by every
7 frame pointer later */ 8 * callable non-leaf asm function to make kernel stack traces more reliable.
9 */
10
8#ifdef CONFIG_FRAME_POINTER 11#ifdef CONFIG_FRAME_POINTER
9 .macro FRAME 12
10 __ASM_SIZE(push,) %__ASM_REG(bp) 13#ifdef __ASSEMBLY__
11 __ASM_SIZE(mov) %__ASM_REG(sp), %__ASM_REG(bp) 14
12 .endm 15.macro FRAME_BEGIN
13 .macro ENDFRAME 16 push %_ASM_BP
14 __ASM_SIZE(pop,) %__ASM_REG(bp) 17 _ASM_MOV %_ASM_SP, %_ASM_BP
15 .endm 18.endm
16#else 19
17 .macro FRAME 20.macro FRAME_END
18 .endm 21 pop %_ASM_BP
19 .macro ENDFRAME 22.endm
20 .endm 23
21#endif 24#else /* !__ASSEMBLY__ */
22 25
23#endif /* __ASSEMBLY__ */ 26#define FRAME_BEGIN \
27 "push %" _ASM_BP "\n" \
28 _ASM_MOV "%" _ASM_SP ", %" _ASM_BP "\n"
29
30#define FRAME_END "pop %" _ASM_BP "\n"
31
32#endif /* __ASSEMBLY__ */
33
34#define FRAME_OFFSET __ASM_SEL(4, 8)
35
36#else /* !CONFIG_FRAME_POINTER */
37
38#define FRAME_BEGIN
39#define FRAME_END
40#define FRAME_OFFSET 0
41
42#endif /* CONFIG_FRAME_POINTER */
43
44#endif /* _ASM_X86_FRAME_H */
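
In .S files FRAME_BEGIN/FRAME_END are used as assembler macros; from C they expand to strings meant to be pasted into an inline-asm template. A minimal sketch with a placeholder asm body (illustrative only, not part of the patch):

    /* Illustrative only: bracket a non-leaf inline-asm sequence with a frame. */
    static inline void emit_framed_asm(void)
    {
            asm volatile(FRAME_BEGIN                  /* push %bp; mov %sp, %bp */
                         "# non-leaf asm body goes here\n\t"
                         FRAME_END                    /* pop %bp */
                         ::: "memory");
    }

When CONFIG_FRAME_POINTER is off, both macros (and FRAME_OFFSET) collapse to nothing, so callers pay no cost; with it on, FRAME_OFFSET (4 on 32-bit, 8 on 64-bit) is the extra stack the saved frame pointer occupies.
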
diff --git a/arch/x86/include/asm/imr.h b/arch/x86/include/asm/imr.h
index cd2ce4068441..ebea2c9d2cdc 100644
--- a/arch/x86/include/asm/imr.h
+++ b/arch/x86/include/asm/imr.h
@@ -53,7 +53,7 @@
53#define IMR_MASK (IMR_ALIGN - 1) 53#define IMR_MASK (IMR_ALIGN - 1)
54 54
55int imr_add_range(phys_addr_t base, size_t size, 55int imr_add_range(phys_addr_t base, size_t size,
56 unsigned int rmask, unsigned int wmask, bool lock); 56 unsigned int rmask, unsigned int wmask);
57 57
58int imr_remove_range(phys_addr_t base, size_t size); 58int imr_remove_range(phys_addr_t base, size_t size);
59 59
diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h
index cfc9a0d2d07c..a4fe16e42b7b 100644
--- a/arch/x86/include/asm/ipi.h
+++ b/arch/x86/include/asm/ipi.h
@@ -57,67 +57,13 @@ static inline void __xapic_wait_icr_idle(void)
57 cpu_relax(); 57 cpu_relax();
58} 58}
59 59
60static inline void 60void __default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest);
61__default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest)
62{
63 /*
64 * Subtle. In the case of the 'never do double writes' workaround
65 * we have to lock out interrupts to be safe. As we don't care
66 * of the value read we use an atomic rmw access to avoid costly
67 * cli/sti. Otherwise we use an even cheaper single atomic write
68 * to the APIC.
69 */
70 unsigned int cfg;
71
72 /*
73 * Wait for idle.
74 */
75 __xapic_wait_icr_idle();
76
77 /*
78 * No need to touch the target chip field
79 */
80 cfg = __prepare_ICR(shortcut, vector, dest);
81
82 /*
83 * Send the IPI. The write to APIC_ICR fires this off.
84 */
85 native_apic_mem_write(APIC_ICR, cfg);
86}
87 61
88/* 62/*
89 * This is used to send an IPI with no shorthand notation (the destination is 63 * This is used to send an IPI with no shorthand notation (the destination is
90 * specified in bits 56 to 63 of the ICR). 64 * specified in bits 56 to 63 of the ICR).
91 */ 65 */
92static inline void 66void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest);
93 __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest)
94{
95 unsigned long cfg;
96
97 /*
98 * Wait for idle.
99 */
100 if (unlikely(vector == NMI_VECTOR))
101 safe_apic_wait_icr_idle();
102 else
103 __xapic_wait_icr_idle();
104
105 /*
106 * prepare target chip field
107 */
108 cfg = __prepare_ICR2(mask);
109 native_apic_mem_write(APIC_ICR2, cfg);
110
111 /*
112 * program the ICR
113 */
114 cfg = __prepare_ICR(0, vector, dest);
115
116 /*
117 * Send the IPI. The write to APIC_ICR fires this off.
118 */
119 native_apic_mem_write(APIC_ICR, cfg);
120}
121 67
122extern void default_send_IPI_single(int cpu, int vector); 68extern void default_send_IPI_single(int cpu, int vector);
123extern void default_send_IPI_single_phys(int cpu, int vector); 69extern void default_send_IPI_single_phys(int cpu, int vector);
diff --git a/arch/x86/include/asm/irq_work.h b/arch/x86/include/asm/irq_work.h
index 78162f8e248b..d0afb05c84fc 100644
--- a/arch/x86/include/asm/irq_work.h
+++ b/arch/x86/include/asm/irq_work.h
@@ -1,7 +1,7 @@
1#ifndef _ASM_IRQ_WORK_H 1#ifndef _ASM_IRQ_WORK_H
2#define _ASM_IRQ_WORK_H 2#define _ASM_IRQ_WORK_H
3 3
4#include <asm/processor.h> 4#include <asm/cpufeature.h>
5 5
6static inline bool arch_irq_work_has_interrupt(void) 6static inline bool arch_irq_work_has_interrupt(void)
7{ 7{
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index cfff34172be0..92b6f651fa4f 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -135,6 +135,7 @@ struct mca_config {
135 bool ignore_ce; 135 bool ignore_ce;
136 bool disabled; 136 bool disabled;
137 bool ser; 137 bool ser;
138 bool recovery;
138 bool bios_cmci_threshold; 139 bool bios_cmci_threshold;
139 u8 banks; 140 u8 banks;
140 s8 bootlog; 141 s8 bootlog;
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 1e1b07a5a738..9d3a96c4da78 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -3,6 +3,7 @@
3 3
4#include <asm/cpu.h> 4#include <asm/cpu.h>
5#include <linux/earlycpio.h> 5#include <linux/earlycpio.h>
6#include <linux/initrd.h>
6 7
7#define native_rdmsr(msr, val1, val2) \ 8#define native_rdmsr(msr, val1, val2) \
8do { \ 9do { \
@@ -143,4 +144,29 @@ static inline void reload_early_microcode(void) { }
143static inline bool 144static inline bool
144get_builtin_firmware(struct cpio_data *cd, const char *name) { return false; } 145get_builtin_firmware(struct cpio_data *cd, const char *name) { return false; }
145#endif 146#endif
147
148static inline unsigned long get_initrd_start(void)
149{
150#ifdef CONFIG_BLK_DEV_INITRD
151 return initrd_start;
152#else
153 return 0;
154#endif
155}
156
157static inline unsigned long get_initrd_start_addr(void)
158{
159#ifdef CONFIG_BLK_DEV_INITRD
160#ifdef CONFIG_X86_32
161 unsigned long *initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start);
162
163 return (unsigned long)__pa_nodebug(*initrd_start_p);
164#else
165 return get_initrd_start();
166#endif
167#else /* CONFIG_BLK_DEV_INITRD */
168 return 0;
169#endif
170}
171
146#endif /* _ASM_X86_MICROCODE_H */ 172#endif /* _ASM_X86_MICROCODE_H */
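
The 32-bit path of get_initrd_start_addr() goes through __pa_nodebug(), presumably because the early microcode loader can run before paging is enabled, so both the initrd_start variable and the value it holds must be reached through physical addresses. A hedged sketch of a caller (hypothetical, for illustration only):

    /* Hypothetical early-boot caller: locate the initrd before scanning it
     * for a microcode blob.  Returns silently if no initrd is configured.
     */
    static void __init scan_initrd_for_microcode(void)
    {
            unsigned long start = get_initrd_start_addr();

            if (!start)
                    return;
            /* ... walk the cpio archive at 'start' looking for microcode ... */
    }
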
diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h
index 8559b0102ea1..603417f8dd6c 100644
--- a/arch/x86/include/asm/microcode_intel.h
+++ b/arch/x86/include/asm/microcode_intel.h
@@ -40,7 +40,6 @@ struct extended_sigtable {
40#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) 40#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
41#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) 41#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable))
42#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) 42#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature))
43#define DWSIZE (sizeof(u32))
44 43
45#define get_totalsize(mc) \ 44#define get_totalsize(mc) \
46 (((struct microcode_intel *)mc)->hdr.datasize ? \ 45 (((struct microcode_intel *)mc)->hdr.datasize ? \
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 55234d5e7160..1ea0baef1175 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -19,7 +19,8 @@ typedef struct {
19#endif 19#endif
20 20
21 struct mutex lock; 21 struct mutex lock;
22 void __user *vdso; 22 void __user *vdso; /* vdso base address */
23 const struct vdso_image *vdso_image; /* vdso image in use */
23 24
24 atomic_t perf_rdpmc_allowed; /* nonzero if rdpmc is allowed */ 25 atomic_t perf_rdpmc_allowed; /* nonzero if rdpmc is allowed */
25} mm_context_t; 26} mm_context_t;
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index b05402ef3b84..984ab75bf621 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -1,7 +1,12 @@
1#ifndef _ASM_X86_MSR_INDEX_H 1#ifndef _ASM_X86_MSR_INDEX_H
2#define _ASM_X86_MSR_INDEX_H 2#define _ASM_X86_MSR_INDEX_H
3 3
4/* CPU model specific register (MSR) numbers */ 4/*
5 * CPU model specific register (MSR) numbers.
6 *
7 * Do not add new entries to this file unless the definitions are shared
8 * between multiple compilation units.
9 */
5 10
6/* x86-64 specific MSRs */ 11/* x86-64 specific MSRs */
7#define MSR_EFER 0xc0000080 /* extended feature register */ 12#define MSR_EFER 0xc0000080 /* extended feature register */
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index c70689b5e5aa..0deeb2d26df7 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -3,6 +3,8 @@
3 3
4#include <linux/sched.h> 4#include <linux/sched.h>
5 5
6#include <asm/cpufeature.h>
7
6#define MWAIT_SUBSTATE_MASK 0xf 8#define MWAIT_SUBSTATE_MASK 0xf
7#define MWAIT_CSTATE_MASK 0xf 9#define MWAIT_CSTATE_MASK 0xf
8#define MWAIT_SUBSTATE_SIZE 4 10#define MWAIT_SUBSTATE_SIZE 4
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 813384ef811a..983738ac014c 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -13,7 +13,7 @@ struct vm86;
13#include <asm/types.h> 13#include <asm/types.h>
14#include <uapi/asm/sigcontext.h> 14#include <uapi/asm/sigcontext.h>
15#include <asm/current.h> 15#include <asm/current.h>
16#include <asm/cpufeature.h> 16#include <asm/cpufeatures.h>
17#include <asm/page.h> 17#include <asm/page.h>
18#include <asm/pgtable_types.h> 18#include <asm/pgtable_types.h>
19#include <asm/percpu.h> 19#include <asm/percpu.h>
@@ -24,7 +24,6 @@ struct vm86;
24#include <asm/fpu/types.h> 24#include <asm/fpu/types.h>
25 25
26#include <linux/personality.h> 26#include <linux/personality.h>
27#include <linux/cpumask.h>
28#include <linux/cache.h> 27#include <linux/cache.h>
29#include <linux/threads.h> 28#include <linux/threads.h>
30#include <linux/math64.h> 29#include <linux/math64.h>
@@ -300,10 +299,13 @@ struct tss_struct {
300 */ 299 */
301 unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; 300 unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
302 301
302#ifdef CONFIG_X86_32
303 /* 303 /*
304 * Space for the temporary SYSENTER stack: 304 * Space for the temporary SYSENTER stack.
305 */ 305 */
306 unsigned long SYSENTER_stack_canary;
306 unsigned long SYSENTER_stack[64]; 307 unsigned long SYSENTER_stack[64];
308#endif
307 309
308} ____cacheline_aligned; 310} ____cacheline_aligned;
309 311
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index a4a77286cb1d..9b9b30b19441 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -7,12 +7,23 @@
7 7
8void syscall_init(void); 8void syscall_init(void);
9 9
10#ifdef CONFIG_X86_64
10void entry_SYSCALL_64(void); 11void entry_SYSCALL_64(void);
11void entry_SYSCALL_compat(void); 12#endif
13
14#ifdef CONFIG_X86_32
12void entry_INT80_32(void); 15void entry_INT80_32(void);
13void entry_INT80_compat(void);
14void entry_SYSENTER_32(void); 16void entry_SYSENTER_32(void);
17void __begin_SYSENTER_singlestep_region(void);
18void __end_SYSENTER_singlestep_region(void);
19#endif
20
21#ifdef CONFIG_IA32_EMULATION
15void entry_SYSENTER_compat(void); 22void entry_SYSENTER_compat(void);
23void __end_entry_SYSENTER_compat(void);
24void entry_SYSCALL_compat(void);
25void entry_INT80_compat(void);
26#endif
16 27
17void x86_configure_nx(void); 28void x86_configure_nx(void);
18void x86_report_nx(void); 29void x86_report_nx(void);
diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h
index 89db46752a8f..452c88b8ad06 100644
--- a/arch/x86/include/asm/sighandling.h
+++ b/arch/x86/include/asm/sighandling.h
@@ -13,7 +13,6 @@
13 X86_EFLAGS_CF | X86_EFLAGS_RF) 13 X86_EFLAGS_CF | X86_EFLAGS_RF)
14 14
15void signal_fault(struct pt_regs *regs, void __user *frame, char *where); 15void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
16int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc);
17int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, 16int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
18 struct pt_regs *regs, unsigned long mask); 17 struct pt_regs *regs, unsigned long mask);
19 18
diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h
index ba665ebd17bb..db333300bd4b 100644
--- a/arch/x86/include/asm/smap.h
+++ b/arch/x86/include/asm/smap.h
@@ -15,7 +15,7 @@
15 15
16#include <linux/stringify.h> 16#include <linux/stringify.h>
17#include <asm/nops.h> 17#include <asm/nops.h>
18#include <asm/cpufeature.h> 18#include <asm/cpufeatures.h>
19 19
20/* "Raw" instruction opcodes */ 20/* "Raw" instruction opcodes */
21#define __ASM_CLAC .byte 0x0f,0x01,0xca 21#define __ASM_CLAC .byte 0x0f,0x01,0xca
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index dfcf0727623b..20a3de5cb3b0 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -16,7 +16,6 @@
16#endif 16#endif
17#include <asm/thread_info.h> 17#include <asm/thread_info.h>
18#include <asm/cpumask.h> 18#include <asm/cpumask.h>
19#include <asm/cpufeature.h>
20 19
21extern int smp_num_siblings; 20extern int smp_num_siblings;
22extern unsigned int num_processors; 21extern unsigned int num_processors;
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index c7b551028740..82866697fcf1 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -49,7 +49,7 @@
49 */ 49 */
50#ifndef __ASSEMBLY__ 50#ifndef __ASSEMBLY__
51struct task_struct; 51struct task_struct;
52#include <asm/processor.h> 52#include <asm/cpufeature.h>
53#include <linux/atomic.h> 53#include <linux/atomic.h>
54 54
55struct thread_info { 55struct thread_info {
@@ -134,10 +134,13 @@ struct thread_info {
134#define _TIF_ADDR32 (1 << TIF_ADDR32) 134#define _TIF_ADDR32 (1 << TIF_ADDR32)
135#define _TIF_X32 (1 << TIF_X32) 135#define _TIF_X32 (1 << TIF_X32)
136 136
137/* work to do in syscall_trace_enter() */ 137/*
138 * work to do in syscall_trace_enter(). Also includes TIF_NOHZ for
139 * enter_from_user_mode()
140 */
138#define _TIF_WORK_SYSCALL_ENTRY \ 141#define _TIF_WORK_SYSCALL_ENTRY \
139 (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \ 142 (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \
140 _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT | \ 143 _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \
141 _TIF_NOHZ) 144 _TIF_NOHZ)
142 145
143/* work to do on any return to user space */ 146/* work to do on any return to user space */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 6df2029405a3..c24b4224d439 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -5,8 +5,57 @@
5#include <linux/sched.h> 5#include <linux/sched.h>
6 6
7#include <asm/processor.h> 7#include <asm/processor.h>
8#include <asm/cpufeature.h>
8#include <asm/special_insns.h> 9#include <asm/special_insns.h>
9 10
11static inline void __invpcid(unsigned long pcid, unsigned long addr,
12 unsigned long type)
13{
14 struct { u64 d[2]; } desc = { { pcid, addr } };
15
16 /*
17 * The memory clobber is because the whole point is to invalidate
18 * stale TLB entries and, especially if we're flushing global
19 * mappings, we don't want the compiler to reorder any subsequent
20 * memory accesses before the TLB flush.
21 *
22 * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and
23 * invpcid (%rcx), %rax in long mode.
24 */
25 asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01"
26 : : "m" (desc), "a" (type), "c" (&desc) : "memory");
27}
28
29#define INVPCID_TYPE_INDIV_ADDR 0
30#define INVPCID_TYPE_SINGLE_CTXT 1
31#define INVPCID_TYPE_ALL_INCL_GLOBAL 2
32#define INVPCID_TYPE_ALL_NON_GLOBAL 3
33
34/* Flush all mappings for a given pcid and addr, not including globals. */
35static inline void invpcid_flush_one(unsigned long pcid,
36 unsigned long addr)
37{
38 __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR);
39}
40
41/* Flush all mappings for a given PCID, not including globals. */
42static inline void invpcid_flush_single_context(unsigned long pcid)
43{
44 __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT);
45}
46
47/* Flush all mappings, including globals, for all PCIDs. */
48static inline void invpcid_flush_all(void)
49{
50 __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL);
51}
52
53/* Flush all mappings for all PCIDs except globals. */
54static inline void invpcid_flush_all_nonglobals(void)
55{
56 __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL);
57}
58
10#ifdef CONFIG_PARAVIRT 59#ifdef CONFIG_PARAVIRT
11#include <asm/paravirt.h> 60#include <asm/paravirt.h>
12#else 61#else
@@ -104,6 +153,15 @@ static inline void __native_flush_tlb_global(void)
104{ 153{
105 unsigned long flags; 154 unsigned long flags;
106 155
156 if (static_cpu_has(X86_FEATURE_INVPCID)) {
157 /*
158 * Using INVPCID is considerably faster than a pair of writes
159 * to CR4 sandwiched inside an IRQ flag save/restore.
160 */
161 invpcid_flush_all();
162 return;
163 }
164
107 /* 165 /*
108 * Read-modify-write to CR4 - protect it from preemption and 166 * Read-modify-write to CR4 - protect it from preemption and
109 * from interrupts. (Use the raw variant because this code can 167 * from interrupts. (Use the raw variant because this code can
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 6d7c5479bcea..174c4212780a 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -29,6 +29,8 @@ static inline cycles_t get_cycles(void)
29 return rdtsc(); 29 return rdtsc();
30} 30}
31 31
32extern struct system_counterval_t convert_art_to_tsc(cycle_t art);
33
32extern void tsc_init(void); 34extern void tsc_init(void);
33extern void mark_tsc_unstable(char *reason); 35extern void mark_tsc_unstable(char *reason);
34extern int unsynchronized_tsc(void); 36extern int unsynchronized_tsc(void);
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index b89c34c4019b..307698688fa1 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -8,7 +8,7 @@
8#include <linux/errno.h> 8#include <linux/errno.h>
9#include <linux/lockdep.h> 9#include <linux/lockdep.h>
10#include <asm/alternative.h> 10#include <asm/alternative.h>
11#include <asm/cpufeature.h> 11#include <asm/cpufeatures.h>
12#include <asm/page.h> 12#include <asm/page.h>
13 13
14/* 14/*
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index deabaf9759b6..43dc55be524e 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -13,9 +13,6 @@ struct vdso_image {
13 void *data; 13 void *data;
14 unsigned long size; /* Always a multiple of PAGE_SIZE */ 14 unsigned long size; /* Always a multiple of PAGE_SIZE */
15 15
16 /* text_mapping.pages is big enough for data/size page pointers */
17 struct vm_special_mapping text_mapping;
18
19 unsigned long alt, alt_len; 16 unsigned long alt, alt_len;
20 17
21 long sym_vvar_start; /* Negative offset to the vvar area */ 18 long sym_vvar_start; /* Negative offset to the vvar area */
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index f556c4843aa1..e728699db774 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -37,6 +37,12 @@ struct vsyscall_gtod_data {
37}; 37};
38extern struct vsyscall_gtod_data vsyscall_gtod_data; 38extern struct vsyscall_gtod_data vsyscall_gtod_data;
39 39
40extern int vclocks_used;
41static inline bool vclock_was_used(int vclock)
42{
43 return READ_ONCE(vclocks_used) & (1 << vclock);
44}
45
40static inline unsigned gtod_read_begin(const struct vsyscall_gtod_data *s) 46static inline unsigned gtod_read_begin(const struct vsyscall_gtod_data *s)
41{ 47{
42 unsigned ret; 48 unsigned ret;
diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h
index d485232f1e9f..62d4111c1c54 100644
--- a/arch/x86/include/uapi/asm/sigcontext.h
+++ b/arch/x86/include/uapi/asm/sigcontext.h
@@ -256,7 +256,7 @@ struct sigcontext_64 {
256 __u16 cs; 256 __u16 cs;
257 __u16 gs; 257 __u16 gs;
258 __u16 fs; 258 __u16 fs;
259 __u16 __pad0; 259 __u16 ss;
260 __u64 err; 260 __u64 err;
261 __u64 trapno; 261 __u64 trapno;
262 __u64 oldmask; 262 __u64 oldmask;
@@ -341,9 +341,37 @@ struct sigcontext {
341 __u64 rip; 341 __u64 rip;
342 __u64 eflags; /* RFLAGS */ 342 __u64 eflags; /* RFLAGS */
343 __u16 cs; 343 __u16 cs;
344
345 /*
346 * Prior to 2.5.64 ("[PATCH] x86-64 updates for 2.5.64-bk3"),
347 * Linux saved and restored fs and gs in these slots. This
348 * was counterproductive, as fsbase and gsbase were never
349 * saved, so arch_prctl was presumably unreliable.
350 *
351 * These slots should never be reused without extreme caution:
352 *
353 * - Some DOSEMU versions stash fs and gs in these slots manually,
354 * thus overwriting anything the kernel expects to be preserved
355 * in these slots.
356 *
357 * - If these slots are ever needed for any other purpose,
358 * there is some risk that very old 64-bit binaries could get
359 * confused. I doubt that many such binaries still work,
360 * though, since the same patch in 2.5.64 also removed the
361 * 64-bit set_thread_area syscall, so it appears that there
362 * is no TLS API beyond modify_ldt that works in both pre-
363 * and post-2.5.64 kernels.
364 *
365 * If the kernel ever adds explicit fs, gs, fsbase, and gsbase
366 * save/restore, it will most likely need to be opt-in and use
367 * different context slots.
368 */
344 __u16 gs; 369 __u16 gs;
345 __u16 fs; 370 __u16 fs;
346 __u16 __pad0; 371 union {
372 __u16 ss; /* If UC_SIGCONTEXT_SS */
373 __u16 __pad0; /* Alias name for old (!UC_SIGCONTEXT_SS) user-space */
374 };
347 __u64 err; 375 __u64 err;
348 __u64 trapno; 376 __u64 trapno;
349 __u64 oldmask; 377 __u64 oldmask;
diff --git a/arch/x86/include/uapi/asm/ucontext.h b/arch/x86/include/uapi/asm/ucontext.h
index b7c29c8017f2..e3d1ec90616e 100644
--- a/arch/x86/include/uapi/asm/ucontext.h
+++ b/arch/x86/include/uapi/asm/ucontext.h
@@ -1,11 +1,54 @@
1#ifndef _ASM_X86_UCONTEXT_H 1#ifndef _ASM_X86_UCONTEXT_H
2#define _ASM_X86_UCONTEXT_H 2#define _ASM_X86_UCONTEXT_H
3 3
4#define UC_FP_XSTATE 0x1 /* indicates the presence of extended state 4/*
5 * information in the memory layout pointed 5 * Indicates the presence of extended state information in the memory
6 * by the fpstate pointer in the ucontext's 6 * layout pointed by the fpstate pointer in the ucontext's sigcontext
7 * sigcontext struct (uc_mcontext). 7 * struct (uc_mcontext).
8 */ 8 */
9#define UC_FP_XSTATE 0x1
10
11#ifdef __x86_64__
12/*
13 * UC_SIGCONTEXT_SS will be set when delivering 64-bit or x32 signals on
14 * kernels that save SS in the sigcontext. All kernels that set
15 * UC_SIGCONTEXT_SS will correctly restore at least the low 32 bits of esp
16 * regardless of SS (i.e. they implement espfix).
17 *
18 * Kernels that set UC_SIGCONTEXT_SS will also set UC_STRICT_RESTORE_SS
19 * when delivering a signal that came from 64-bit code.
20 *
21 * Sigreturn restores SS as follows:
22 *
23 * if (saved SS is valid || UC_STRICT_RESTORE_SS is set ||
24 * saved CS is not 64-bit)
25 * new SS = saved SS (will fail IRET and signal if invalid)
26 * else
27 * new SS = a flat 32-bit data segment
28 *
29 * This behavior serves three purposes:
30 *
31 * - Legacy programs that construct a 64-bit sigcontext from scratch
32 * with zero or garbage in the SS slot (e.g. old CRIU) and call
33 * sigreturn will still work.
34 *
35 * - Old DOSEMU versions sometimes catch a signal from a segmented
36 * context, delete the old SS segment (with modify_ldt), and change
37 * the saved CS to a 64-bit segment. These DOSEMU versions expect
38 * sigreturn to send them back to 64-bit mode without killing them,
39 * despite the fact that the SS selector when the signal was raised is
40 * no longer valid. UC_STRICT_RESTORE_SS will be clear, so the kernel
41 * will fix up SS for these DOSEMU versions.
42 *
43 * - Old and new programs that catch a signal and return without
44 * modifying the saved context will end up in exactly the state they
45 * started in, even if they were running in a segmented context when
 46 * the signal was raised. Old kernels would lose track of the
47 * previous SS value.
48 */
49#define UC_SIGCONTEXT_SS 0x2
50#define UC_STRICT_RESTORE_SS 0x4
51#endif
9 52
10#include <asm-generic/ucontext.h> 53#include <asm-generic/ucontext.h>
11 54
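
From user space the new bits are visible in uc_flags of the ucontext passed to an SA_SIGINFO handler. A minimal user-space sketch (illustrative only; the fallback defines simply mirror the values added above, for libcs that do not expose them yet):

    /* User-space illustration, not part of the patch. */
    #include <signal.h>
    #include <stdio.h>
    #include <ucontext.h>

    #ifndef UC_SIGCONTEXT_SS
    # define UC_SIGCONTEXT_SS      0x2
    # define UC_STRICT_RESTORE_SS  0x4
    #endif

    static void handler(int sig, siginfo_t *info, void *ctx_void)
    {
            ucontext_t *ctx = ctx_void;

            if (ctx->uc_flags & UC_SIGCONTEXT_SS)
                    puts("kernel saved SS in the sigcontext");
            if (ctx->uc_flags & UC_STRICT_RESTORE_SS)
                    puts("signal came from 64-bit code; sigreturn restores SS as saved");
    }

A handler like this would be installed with sigaction() and SA_SIGINFO so that the third argument is the ucontext.
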
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 9968f30cca3e..76f89e2b245a 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -53,7 +53,7 @@ void flat_init_apic_ldr(void)
53 apic_write(APIC_LDR, val); 53 apic_write(APIC_LDR, val);
54} 54}
55 55
56static inline void _flat_send_IPI_mask(unsigned long mask, int vector) 56static void _flat_send_IPI_mask(unsigned long mask, int vector)
57{ 57{
58 unsigned long flags; 58 unsigned long flags;
59 59
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index c80c02c6ec49..ab5c2c685a3c 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -30,7 +30,7 @@ static unsigned int numachip1_get_apic_id(unsigned long x)
30 unsigned long value; 30 unsigned long value;
31 unsigned int id = (x >> 24) & 0xff; 31 unsigned int id = (x >> 24) & 0xff;
32 32
33 if (static_cpu_has_safe(X86_FEATURE_NODEID_MSR)) { 33 if (static_cpu_has(X86_FEATURE_NODEID_MSR)) {
34 rdmsrl(MSR_FAM10H_NODE_ID, value); 34 rdmsrl(MSR_FAM10H_NODE_ID, value);
35 id |= (value << 2) & 0xff00; 35 id |= (value << 2) & 0xff00;
36 } 36 }
@@ -178,7 +178,7 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
178 this_cpu_write(cpu_llc_id, node); 178 this_cpu_write(cpu_llc_id, node);
179 179
180 /* Account for nodes per socket in multi-core-module processors */ 180 /* Account for nodes per socket in multi-core-module processors */
181 if (static_cpu_has_safe(X86_FEATURE_NODEID_MSR)) { 181 if (static_cpu_has(X86_FEATURE_NODEID_MSR)) {
182 rdmsrl(MSR_FAM10H_NODE_ID, val); 182 rdmsrl(MSR_FAM10H_NODE_ID, val);
183 nodes = ((val >> 3) & 7) + 1; 183 nodes = ((val >> 3) & 7) + 1;
184 } 184 }
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index eb45fc9b6124..28bde88b0085 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -18,6 +18,66 @@
18#include <asm/proto.h> 18#include <asm/proto.h>
19#include <asm/ipi.h> 19#include <asm/ipi.h>
20 20
21void __default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest)
22{
23 /*
24 * Subtle. In the case of the 'never do double writes' workaround
25 * we have to lock out interrupts to be safe. As we don't care
26 * of the value read we use an atomic rmw access to avoid costly
27 * cli/sti. Otherwise we use an even cheaper single atomic write
28 * to the APIC.
29 */
30 unsigned int cfg;
31
32 /*
33 * Wait for idle.
34 */
35 __xapic_wait_icr_idle();
36
37 /*
38 * No need to touch the target chip field
39 */
40 cfg = __prepare_ICR(shortcut, vector, dest);
41
42 /*
43 * Send the IPI. The write to APIC_ICR fires this off.
44 */
45 native_apic_mem_write(APIC_ICR, cfg);
46}
47
48/*
49 * This is used to send an IPI with no shorthand notation (the destination is
50 * specified in bits 56 to 63 of the ICR).
51 */
52void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest)
53{
54 unsigned long cfg;
55
56 /*
57 * Wait for idle.
58 */
59 if (unlikely(vector == NMI_VECTOR))
60 safe_apic_wait_icr_idle();
61 else
62 __xapic_wait_icr_idle();
63
64 /*
65 * prepare target chip field
66 */
67 cfg = __prepare_ICR2(mask);
68 native_apic_mem_write(APIC_ICR2, cfg);
69
70 /*
71 * program the ICR
72 */
73 cfg = __prepare_ICR(0, vector, dest);
74
75 /*
76 * Send the IPI. The write to APIC_ICR fires this off.
77 */
78 native_apic_mem_write(APIC_ICR, cfg);
79}
80
21void default_send_IPI_single_phys(int cpu, int vector) 81void default_send_IPI_single_phys(int cpu, int vector)
22{ 82{
23 unsigned long flags; 83 unsigned long flags;
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 84a7524b202c..5c042466f274 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -59,7 +59,6 @@ void common(void) {
59 59
60#ifdef CONFIG_PARAVIRT 60#ifdef CONFIG_PARAVIRT
61 BLANK(); 61 BLANK();
62 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
63 OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops); 62 OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
64 OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops); 63 OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
65 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); 64 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 6ce39025f467..ecdc1d217dc0 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -7,7 +7,7 @@
7#include <linux/lguest.h> 7#include <linux/lguest.h>
8#include "../../../drivers/lguest/lg.h" 8#include "../../../drivers/lguest/lg.h"
9 9
10#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, 10#define __SYSCALL_I386(nr, sym, qual) [nr] = 1,
11static char syscalls[] = { 11static char syscalls[] = {
12#include <asm/syscalls_32.h> 12#include <asm/syscalls_32.h>
13}; 13};
@@ -52,6 +52,11 @@ void foo(void)
52 DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - 52 DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
53 offsetofend(struct tss_struct, SYSENTER_stack)); 53 offsetofend(struct tss_struct, SYSENTER_stack));
54 54
55 /* Offset from cpu_tss to SYSENTER_stack */
56 OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack);
57 /* Size of SYSENTER_stack */
58 DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack));
59
55#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) 60#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
56 BLANK(); 61 BLANK();
57 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); 62 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index f2edafb5f24e..d875f97d4e0b 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -4,17 +4,11 @@
4 4
5#include <asm/ia32.h> 5#include <asm/ia32.h>
6 6
7#define __SYSCALL_64(nr, sym, compat) [nr] = 1, 7#define __SYSCALL_64(nr, sym, qual) [nr] = 1,
8#define __SYSCALL_COMMON(nr, sym, compat) [nr] = 1,
9#ifdef CONFIG_X86_X32_ABI
10# define __SYSCALL_X32(nr, sym, compat) [nr] = 1,
11#else
12# define __SYSCALL_X32(nr, sym, compat) /* nothing */
13#endif
14static char syscalls_64[] = { 8static char syscalls_64[] = {
15#include <asm/syscalls_64.h> 9#include <asm/syscalls_64.h>
16}; 10};
17#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, 11#define __SYSCALL_I386(nr, sym, qual) [nr] = 1,
18static char syscalls_ia32[] = { 12static char syscalls_ia32[] = {
19#include <asm/syscalls_32.h> 13#include <asm/syscalls_32.h>
20}; 14};
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 7a60424d63fa..0d373d7affc8 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -42,7 +42,7 @@ ifdef CONFIG_X86_FEATURE_NAMES
42quiet_cmd_mkcapflags = MKCAP $@ 42quiet_cmd_mkcapflags = MKCAP $@
43 cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $< $@ 43 cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $< $@
44 44
45cpufeature = $(src)/../../include/asm/cpufeature.h 45cpufeature = $(src)/../../include/asm/cpufeatures.h
46 46
47targets += capflags.c 47targets += capflags.c
48$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE 48$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index ce197bb7c129..1661d8ec9280 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -1,7 +1,7 @@
1#include <linux/bitops.h> 1#include <linux/bitops.h>
2#include <linux/kernel.h> 2#include <linux/kernel.h>
3 3
4#include <asm/processor.h> 4#include <asm/cpufeature.h>
5#include <asm/e820.h> 5#include <asm/e820.h>
6#include <asm/mtrr.h> 6#include <asm/mtrr.h>
7#include <asm/msr.h> 7#include <asm/msr.h>
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 81cf716f6f97..249461f95851 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -162,6 +162,22 @@ static int __init x86_mpx_setup(char *s)
162} 162}
163__setup("nompx", x86_mpx_setup); 163__setup("nompx", x86_mpx_setup);
164 164
165static int __init x86_noinvpcid_setup(char *s)
166{
167 /* noinvpcid doesn't accept parameters */
168 if (s)
169 return -EINVAL;
170
171 /* do not emit a message if the feature is not present */
172 if (!boot_cpu_has(X86_FEATURE_INVPCID))
173 return 0;
174
175 setup_clear_cpu_cap(X86_FEATURE_INVPCID);
176 pr_info("noinvpcid: INVPCID feature disabled\n");
177 return 0;
178}
179early_param("noinvpcid", x86_noinvpcid_setup);
180
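
With this in place, INVPCID can be disabled from the boot command line (for example by appending "noinvpcid" to the kernel parameters); the capability bit is cleared early, before alternatives are applied, so code such as the static_cpu_has(X86_FEATURE_INVPCID) fast path added to tlbflush.h above falls back to the CR4-based global flush.
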
165#ifdef CONFIG_X86_32 181#ifdef CONFIG_X86_32
166static int cachesize_override = -1; 182static int cachesize_override = -1;
167static int disable_x86_serial_nr = 1; 183static int disable_x86_serial_nr = 1;
@@ -801,6 +817,31 @@ static void detect_nopl(struct cpuinfo_x86 *c)
801#else 817#else
802 set_cpu_cap(c, X86_FEATURE_NOPL); 818 set_cpu_cap(c, X86_FEATURE_NOPL);
803#endif 819#endif
820
821 /*
822 * ESPFIX is a strange bug. All real CPUs have it. Paravirt
823 * systems that run Linux at CPL > 0 may or may not have the
824 * issue, but, even if they have the issue, there's absolutely
825 * nothing we can do about it because we can't use the real IRET
826 * instruction.
827 *
828 * NB: For the time being, only 32-bit kernels support
829 * X86_BUG_ESPFIX as such. 64-bit kernels directly choose
830 * whether to apply espfix using paravirt hooks. If any
831 * non-paravirt system ever shows up that does *not* have the
832 * ESPFIX issue, we can change this.
833 */
834#ifdef CONFIG_X86_32
835#ifdef CONFIG_PARAVIRT
836 do {
837 extern void native_iret(void);
838 if (pv_cpu_ops.iret == native_iret)
839 set_cpu_bug(c, X86_BUG_ESPFIX);
840 } while (0);
841#else
842 set_cpu_bug(c, X86_BUG_ESPFIX);
843#endif
844#endif
804} 845}
805 846
806static void generic_identify(struct cpuinfo_x86 *c) 847static void generic_identify(struct cpuinfo_x86 *c)
@@ -1475,20 +1516,6 @@ void cpu_init(void)
1475} 1516}
1476#endif 1517#endif
1477 1518
1478#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS
1479void warn_pre_alternatives(void)
1480{
1481 WARN(1, "You're using static_cpu_has before alternatives have run!\n");
1482}
1483EXPORT_SYMBOL_GPL(warn_pre_alternatives);
1484#endif
1485
1486inline bool __static_cpu_has_safe(u16 bit)
1487{
1488 return boot_cpu_has(bit);
1489}
1490EXPORT_SYMBOL_GPL(__static_cpu_has_safe);
1491
1492static void bsp_resume(void) 1519static void bsp_resume(void)
1493{ 1520{
1494 if (this_cpu->c_bsp_resume) 1521 if (this_cpu->c_bsp_resume)
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 187bb583d0df..6adef9cac23e 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -8,6 +8,7 @@
8#include <linux/timer.h> 8#include <linux/timer.h>
9#include <asm/pci-direct.h> 9#include <asm/pci-direct.h>
10#include <asm/tsc.h> 10#include <asm/tsc.h>
11#include <asm/cpufeature.h>
11 12
12#include "cpu.h" 13#include "cpu.h"
13 14
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 38766c2b5b00..1f7fdb91a818 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -8,7 +8,7 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/uaccess.h> 9#include <linux/uaccess.h>
10 10
11#include <asm/processor.h> 11#include <asm/cpufeature.h>
12#include <asm/pgtable.h> 12#include <asm/pgtable.h>
13#include <asm/msr.h> 13#include <asm/msr.h>
14#include <asm/bugs.h> 14#include <asm/bugs.h>
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 6ed779efff26..de6626c18e42 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -14,7 +14,7 @@
14#include <linux/sysfs.h> 14#include <linux/sysfs.h>
15#include <linux/pci.h> 15#include <linux/pci.h>
16 16
17#include <asm/processor.h> 17#include <asm/cpufeature.h>
18#include <asm/amd_nb.h> 18#include <asm/amd_nb.h>
19#include <asm/smp.h> 19#include <asm/smp.h>
20 20
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
index afa9f0d487ea..fbb5e90557a5 100644
--- a/arch/x86/kernel/cpu/match.c
+++ b/arch/x86/kernel/cpu/match.c
@@ -1,5 +1,5 @@
1#include <asm/cpu_device_id.h> 1#include <asm/cpu_device_id.h>
2#include <asm/processor.h> 2#include <asm/cpufeature.h>
3#include <linux/cpu.h> 3#include <linux/cpu.h>
4#include <linux/module.h> 4#include <linux/module.h>
5#include <linux/slab.h> 5#include <linux/slab.h>
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 524f2a8492d7..f0c921b03e42 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1578,6 +1578,17 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1578 1578
1579 if (c->x86 == 6 && c->x86_model == 45) 1579 if (c->x86 == 6 && c->x86_model == 45)
1580 quirk_no_way_out = quirk_sandybridge_ifu; 1580 quirk_no_way_out = quirk_sandybridge_ifu;
1581 /*
1582 * MCG_CAP.MCG_SER_P is necessary but not sufficient to know
1583 * whether this processor will actually generate recoverable
1584 * machine checks. Check to see if this is an E7 model Xeon.
1585 * We can't do a model number check because E5 and E7 use the
1586 * same model number. E5 doesn't support recovery, E7 does.
1587 */
1588 if (mca_cfg.recovery || (mca_cfg.ser &&
1589 !strncmp(c->x86_model_id,
1590 "Intel(R) Xeon(R) CPU E7-", 24)))
1591 set_cpu_cap(c, X86_FEATURE_MCE_RECOVERY);
1581 } 1592 }
1582 if (cfg->monarch_timeout < 0) 1593 if (cfg->monarch_timeout < 0)
1583 cfg->monarch_timeout = 0; 1594 cfg->monarch_timeout = 0;
@@ -2030,6 +2041,8 @@ static int __init mcheck_enable(char *str)
2030 cfg->bootlog = (str[0] == 'b'); 2041 cfg->bootlog = (str[0] == 'b');
2031 else if (!strcmp(str, "bios_cmci_threshold")) 2042 else if (!strcmp(str, "bios_cmci_threshold"))
2032 cfg->bios_cmci_threshold = true; 2043 cfg->bios_cmci_threshold = true;
2044 else if (!strcmp(str, "recovery"))
2045 cfg->recovery = true;
2033 else if (isdigit(str[0])) { 2046 else if (isdigit(str[0])) {
2034 if (get_option(&str, &cfg->tolerant) == 2) 2047 if (get_option(&str, &cfg->tolerant) == 2)
2035 get_option(&str, &(cfg->monarch_timeout)); 2048 get_option(&str, &(cfg->monarch_timeout));
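
The quirk above keys off the brand string because E5 and E7 parts share model numbers; the strncmp() length of 24 covers exactly the "Intel(R) Xeon(R) CPU E7-" prefix. A standalone illustration of that prefix test (the sample brand strings are made up):

    /*
     * Standalone illustration of the brand-string prefix test added above;
     * the prefix and the length 24 mirror the patch.
     */
    #include <stdio.h>
    #include <string.h>

    static int is_e7(const char *model_id)
    {
            return !strncmp(model_id, "Intel(R) Xeon(R) CPU E7-", 24);
    }

    int main(void)
    {
            printf("%d\n", is_e7("Intel(R) Xeon(R) CPU E7-8890 v3"));   /* 1 */
            printf("%d\n", is_e7("Intel(R) Xeon(R) CPU E5-2690 v3"));   /* 0 */
            return 0;
    }
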
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 75d3aab5f7b2..8581963894c7 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -431,10 +431,6 @@ int __init save_microcode_in_initrd_amd(void)
431 else 431 else
432 container = cont_va; 432 container = cont_va;
433 433
434 if (ucode_new_rev)
435 pr_info("microcode: updated early to new patch_level=0x%08x\n",
436 ucode_new_rev);
437
438 eax = cpuid_eax(0x00000001); 434 eax = cpuid_eax(0x00000001);
439 eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); 435 eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
440 436
@@ -469,8 +465,7 @@ void reload_ucode_amd(void)
469 if (mc && rev < mc->hdr.patch_id) { 465 if (mc && rev < mc->hdr.patch_id) {
470 if (!__apply_microcode_amd(mc)) { 466 if (!__apply_microcode_amd(mc)) {
471 ucode_new_rev = mc->hdr.patch_id; 467 ucode_new_rev = mc->hdr.patch_id;
472 pr_info("microcode: reload patch_level=0x%08x\n", 468 pr_info("reload patch_level=0x%08x\n", ucode_new_rev);
473 ucode_new_rev);
474 } 469 }
475 } 470 }
476} 471}
@@ -793,15 +788,13 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover)
793 return -EINVAL; 788 return -EINVAL;
794 } 789 }
795 790
796 patch->data = kzalloc(patch_size, GFP_KERNEL); 791 patch->data = kmemdup(fw + SECTION_HDR_SIZE, patch_size, GFP_KERNEL);
797 if (!patch->data) { 792 if (!patch->data) {
798 pr_err("Patch data allocation failure.\n"); 793 pr_err("Patch data allocation failure.\n");
799 kfree(patch); 794 kfree(patch);
800 return -EINVAL; 795 return -EINVAL;
801 } 796 }
802 797
803 /* All looks ok, copy patch... */
804 memcpy(patch->data, fw + SECTION_HDR_SIZE, patch_size);
805 INIT_LIST_HEAD(&patch->plist); 798 INIT_LIST_HEAD(&patch->plist);
806 patch->patch_id = mc_hdr->patch_id; 799 patch->patch_id = mc_hdr->patch_id;
807 patch->equiv_cpu = proc_id; 800 patch->equiv_cpu = proc_id;
@@ -957,6 +950,10 @@ struct microcode_ops * __init init_amd_microcode(void)
957 return NULL; 950 return NULL;
958 } 951 }
959 952
953 if (ucode_new_rev)
954 pr_info_once("microcode updated early to new patch_level=0x%08x\n",
955 ucode_new_rev);
956
960 return &microcode_amd_ops; 957 return &microcode_amd_ops;
961} 958}
962 959
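
Several hunks in this file replace a kzalloc()+memcpy() pair with kmemdup(), which allocates and copies in one call. A userspace sketch of the same idea, with plain malloc()/memcpy() standing in for the kernel allocator:

    /*
     * Illustrative only: what kmemdup() boils down to, as a userspace sketch.
     */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static void *memdup(const void *src, size_t len)
    {
            void *p = malloc(len);

            if (p)
                    memcpy(p, src, len);
            return p;
    }

    int main(void)
    {
            const char patch[] = "fake microcode patch body";
            char *copy = memdup(patch, sizeof(patch));

            if (copy)
                    printf("%s\n", copy);
            free(copy);
            return 0;
    }
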
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index faec7120c508..ac360bfbbdb6 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -43,16 +43,8 @@
43#define MICROCODE_VERSION "2.01" 43#define MICROCODE_VERSION "2.01"
44 44
45static struct microcode_ops *microcode_ops; 45static struct microcode_ops *microcode_ops;
46
47static bool dis_ucode_ldr; 46static bool dis_ucode_ldr;
48 47
49static int __init disable_loader(char *str)
50{
51 dis_ucode_ldr = true;
52 return 1;
53}
54__setup("dis_ucode_ldr", disable_loader);
55
56/* 48/*
57 * Synchronization. 49 * Synchronization.
58 * 50 *
@@ -81,15 +73,16 @@ struct cpu_info_ctx {
81 73
82static bool __init check_loader_disabled_bsp(void) 74static bool __init check_loader_disabled_bsp(void)
83{ 75{
76 static const char *__dis_opt_str = "dis_ucode_ldr";
77
84#ifdef CONFIG_X86_32 78#ifdef CONFIG_X86_32
85 const char *cmdline = (const char *)__pa_nodebug(boot_command_line); 79 const char *cmdline = (const char *)__pa_nodebug(boot_command_line);
86 const char *opt = "dis_ucode_ldr"; 80 const char *option = (const char *)__pa_nodebug(__dis_opt_str);
87 const char *option = (const char *)__pa_nodebug(opt);
88 bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr); 81 bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr);
89 82
90#else /* CONFIG_X86_64 */ 83#else /* CONFIG_X86_64 */
91 const char *cmdline = boot_command_line; 84 const char *cmdline = boot_command_line;
92 const char *option = "dis_ucode_ldr"; 85 const char *option = __dis_opt_str;
93 bool *res = &dis_ucode_ldr; 86 bool *res = &dis_ucode_ldr;
94#endif 87#endif
95 88
@@ -479,7 +472,7 @@ static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw)
479 enum ucode_state ustate; 472 enum ucode_state ustate;
480 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 473 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
481 474
482 if (uci && uci->valid) 475 if (uci->valid)
483 return UCODE_OK; 476 return UCODE_OK;
484 477
485 if (collect_cpu_info(cpu)) 478 if (collect_cpu_info(cpu))
@@ -630,7 +623,7 @@ int __init microcode_init(void)
630 struct cpuinfo_x86 *c = &boot_cpu_data; 623 struct cpuinfo_x86 *c = &boot_cpu_data;
631 int error; 624 int error;
632 625
633 if (paravirt_enabled() || dis_ucode_ldr) 626 if (dis_ucode_ldr)
634 return -EINVAL; 627 return -EINVAL;
635 628
636 if (c->x86_vendor == X86_VENDOR_INTEL) 629 if (c->x86_vendor == X86_VENDOR_INTEL)
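
The "if (uci && uci->valid)" simplification above works because uci is computed as ucode_cpu_info + cpu, i.e. pointer arithmetic into a static array, so it can never be NULL. A tiny standalone demonstration (the array size and struct layout are stand-ins):

    /*
     * Why the "uci && ..." check was dead code: pointer arithmetic into a
     * static array never yields NULL.
     */
    #include <stdio.h>

    struct ucode_cpu_info_sketch { int valid; };
    static struct ucode_cpu_info_sketch ucode_cpu_info[4];

    int main(void)
    {
            for (int cpu = 0; cpu < 4; cpu++) {
                    struct ucode_cpu_info_sketch *uci = ucode_cpu_info + cpu;

                    printf("cpu %d: uci=%p valid=%d\n", cpu, (void *)uci, uci->valid);
            }
            return 0;
    }
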
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index ee81c544ee0d..cbb3cf09b065 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -39,9 +39,15 @@
39#include <asm/setup.h> 39#include <asm/setup.h>
40#include <asm/msr.h> 40#include <asm/msr.h>
41 41
42static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; 42/*
 43 * Temporary storage for microcode blob pointers. We note here the pointers to
 44 * microcode blobs we got from whatever storage (detached initrd, builtin).
 45 * Later on, we put those into the final storage, mc_saved_data.mc_saved.
46 */
47static unsigned long mc_tmp_ptrs[MAX_UCODE_COUNT];
48
43static struct mc_saved_data { 49static struct mc_saved_data {
44 unsigned int mc_saved_count; 50 unsigned int num_saved;
45 struct microcode_intel **mc_saved; 51 struct microcode_intel **mc_saved;
46} mc_saved_data; 52} mc_saved_data;
47 53
@@ -78,53 +84,50 @@ load_microcode_early(struct microcode_intel **saved,
78} 84}
79 85
80static inline void 86static inline void
81copy_initrd_ptrs(struct microcode_intel **mc_saved, unsigned long *initrd, 87copy_ptrs(struct microcode_intel **mc_saved, unsigned long *mc_ptrs,
82 unsigned long off, int num_saved) 88 unsigned long off, int num_saved)
83{ 89{
84 int i; 90 int i;
85 91
86 for (i = 0; i < num_saved; i++) 92 for (i = 0; i < num_saved; i++)
87 mc_saved[i] = (struct microcode_intel *)(initrd[i] + off); 93 mc_saved[i] = (struct microcode_intel *)(mc_ptrs[i] + off);
88} 94}
89 95
90#ifdef CONFIG_X86_32 96#ifdef CONFIG_X86_32
91static void 97static void
92microcode_phys(struct microcode_intel **mc_saved_tmp, 98microcode_phys(struct microcode_intel **mc_saved_tmp, struct mc_saved_data *mcs)
93 struct mc_saved_data *mc_saved_data)
94{ 99{
95 int i; 100 int i;
96 struct microcode_intel ***mc_saved; 101 struct microcode_intel ***mc_saved;
97 102
98 mc_saved = (struct microcode_intel ***) 103 mc_saved = (struct microcode_intel ***)__pa_nodebug(&mcs->mc_saved);
99 __pa_nodebug(&mc_saved_data->mc_saved); 104
100 for (i = 0; i < mc_saved_data->mc_saved_count; i++) { 105 for (i = 0; i < mcs->num_saved; i++) {
101 struct microcode_intel *p; 106 struct microcode_intel *p;
102 107
103 p = *(struct microcode_intel **) 108 p = *(struct microcode_intel **)__pa_nodebug(mcs->mc_saved + i);
104 __pa_nodebug(mc_saved_data->mc_saved + i);
105 mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p); 109 mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p);
106 } 110 }
107} 111}
108#endif 112#endif
109 113
110static enum ucode_state 114static enum ucode_state
111load_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, 115load_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs,
112 unsigned long initrd_start, struct ucode_cpu_info *uci) 116 unsigned long offset, struct ucode_cpu_info *uci)
113{ 117{
114 struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; 118 struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
115 unsigned int count = mc_saved_data->mc_saved_count; 119 unsigned int count = mcs->num_saved;
116 120
117 if (!mc_saved_data->mc_saved) { 121 if (!mcs->mc_saved) {
118 copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count); 122 copy_ptrs(mc_saved_tmp, mc_ptrs, offset, count);
119 123
120 return load_microcode_early(mc_saved_tmp, count, uci); 124 return load_microcode_early(mc_saved_tmp, count, uci);
121 } else { 125 } else {
122#ifdef CONFIG_X86_32 126#ifdef CONFIG_X86_32
123 microcode_phys(mc_saved_tmp, mc_saved_data); 127 microcode_phys(mc_saved_tmp, mcs);
124 return load_microcode_early(mc_saved_tmp, count, uci); 128 return load_microcode_early(mc_saved_tmp, count, uci);
125#else 129#else
126 return load_microcode_early(mc_saved_data->mc_saved, 130 return load_microcode_early(mcs->mc_saved, count, uci);
127 count, uci);
128#endif 131#endif
129 } 132 }
130} 133}
@@ -175,25 +178,25 @@ matching_model_microcode(struct microcode_header_intel *mc_header,
175} 178}
176 179
177static int 180static int
178save_microcode(struct mc_saved_data *mc_saved_data, 181save_microcode(struct mc_saved_data *mcs,
179 struct microcode_intel **mc_saved_src, 182 struct microcode_intel **mc_saved_src,
180 unsigned int mc_saved_count) 183 unsigned int num_saved)
181{ 184{
182 int i, j; 185 int i, j;
183 struct microcode_intel **saved_ptr; 186 struct microcode_intel **saved_ptr;
184 int ret; 187 int ret;
185 188
186 if (!mc_saved_count) 189 if (!num_saved)
187 return -EINVAL; 190 return -EINVAL;
188 191
189 /* 192 /*
190 * Copy new microcode data. 193 * Copy new microcode data.
191 */ 194 */
192 saved_ptr = kcalloc(mc_saved_count, sizeof(struct microcode_intel *), GFP_KERNEL); 195 saved_ptr = kcalloc(num_saved, sizeof(struct microcode_intel *), GFP_KERNEL);
193 if (!saved_ptr) 196 if (!saved_ptr)
194 return -ENOMEM; 197 return -ENOMEM;
195 198
196 for (i = 0; i < mc_saved_count; i++) { 199 for (i = 0; i < num_saved; i++) {
197 struct microcode_header_intel *mc_hdr; 200 struct microcode_header_intel *mc_hdr;
198 struct microcode_intel *mc; 201 struct microcode_intel *mc;
199 unsigned long size; 202 unsigned long size;
@@ -207,20 +210,18 @@ save_microcode(struct mc_saved_data *mc_saved_data,
207 mc_hdr = &mc->hdr; 210 mc_hdr = &mc->hdr;
208 size = get_totalsize(mc_hdr); 211 size = get_totalsize(mc_hdr);
209 212
210 saved_ptr[i] = kmalloc(size, GFP_KERNEL); 213 saved_ptr[i] = kmemdup(mc, size, GFP_KERNEL);
211 if (!saved_ptr[i]) { 214 if (!saved_ptr[i]) {
212 ret = -ENOMEM; 215 ret = -ENOMEM;
213 goto err; 216 goto err;
214 } 217 }
215
216 memcpy(saved_ptr[i], mc, size);
217 } 218 }
218 219
219 /* 220 /*
220 * Point to newly saved microcode. 221 * Point to newly saved microcode.
221 */ 222 */
222 mc_saved_data->mc_saved = saved_ptr; 223 mcs->mc_saved = saved_ptr;
223 mc_saved_data->mc_saved_count = mc_saved_count; 224 mcs->num_saved = num_saved;
224 225
225 return 0; 226 return 0;
226 227
@@ -284,22 +285,20 @@ static unsigned int _save_mc(struct microcode_intel **mc_saved,
284 * BSP can stay in the platform. 285 * BSP can stay in the platform.
285 */ 286 */
286static enum ucode_state __init 287static enum ucode_state __init
287get_matching_model_microcode(int cpu, unsigned long start, 288get_matching_model_microcode(unsigned long start, void *data, size_t size,
288 void *data, size_t size, 289 struct mc_saved_data *mcs, unsigned long *mc_ptrs,
289 struct mc_saved_data *mc_saved_data,
290 unsigned long *mc_saved_in_initrd,
291 struct ucode_cpu_info *uci) 290 struct ucode_cpu_info *uci)
292{ 291{
293 u8 *ucode_ptr = data; 292 struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
294 unsigned int leftover = size; 293 struct microcode_header_intel *mc_header;
294 unsigned int num_saved = mcs->num_saved;
295 enum ucode_state state = UCODE_OK; 295 enum ucode_state state = UCODE_OK;
296 unsigned int leftover = size;
297 u8 *ucode_ptr = data;
296 unsigned int mc_size; 298 unsigned int mc_size;
297 struct microcode_header_intel *mc_header;
298 struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
299 unsigned int mc_saved_count = mc_saved_data->mc_saved_count;
300 int i; 299 int i;
301 300
302 while (leftover && mc_saved_count < ARRAY_SIZE(mc_saved_tmp)) { 301 while (leftover && num_saved < ARRAY_SIZE(mc_saved_tmp)) {
303 302
304 if (leftover < sizeof(mc_header)) 303 if (leftover < sizeof(mc_header))
305 break; 304 break;
@@ -318,32 +317,31 @@ get_matching_model_microcode(int cpu, unsigned long start,
318 * the platform, we need to find and save microcode patches 317 * the platform, we need to find and save microcode patches
319 * with the same family and model as the BSP. 318 * with the same family and model as the BSP.
320 */ 319 */
321 if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != 320 if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != UCODE_OK) {
322 UCODE_OK) {
323 ucode_ptr += mc_size; 321 ucode_ptr += mc_size;
324 continue; 322 continue;
325 } 323 }
326 324
327 mc_saved_count = _save_mc(mc_saved_tmp, ucode_ptr, mc_saved_count); 325 num_saved = _save_mc(mc_saved_tmp, ucode_ptr, num_saved);
328 326
329 ucode_ptr += mc_size; 327 ucode_ptr += mc_size;
330 } 328 }
331 329
332 if (leftover) { 330 if (leftover) {
333 state = UCODE_ERROR; 331 state = UCODE_ERROR;
334 goto out; 332 return state;
335 } 333 }
336 334
337 if (mc_saved_count == 0) { 335 if (!num_saved) {
338 state = UCODE_NFOUND; 336 state = UCODE_NFOUND;
339 goto out; 337 return state;
340 } 338 }
341 339
342 for (i = 0; i < mc_saved_count; i++) 340 for (i = 0; i < num_saved; i++)
343 mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start; 341 mc_ptrs[i] = (unsigned long)mc_saved_tmp[i] - start;
342
343 mcs->num_saved = num_saved;
344 344
345 mc_saved_data->mc_saved_count = mc_saved_count;
346out:
347 return state; 345 return state;
348} 346}
349 347
@@ -373,7 +371,7 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci)
373 native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); 371 native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
374 csig.pf = 1 << ((val[1] >> 18) & 7); 372 csig.pf = 1 << ((val[1] >> 18) & 7);
375 } 373 }
376 native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); 374 native_wrmsrl(MSR_IA32_UCODE_REV, 0);
377 375
378 /* As documented in the SDM: Do a CPUID 1 here */ 376 /* As documented in the SDM: Do a CPUID 1 here */
379 sync_core(); 377 sync_core();
@@ -396,11 +394,11 @@ static void show_saved_mc(void)
396 unsigned int sig, pf, rev, total_size, data_size, date; 394 unsigned int sig, pf, rev, total_size, data_size, date;
397 struct ucode_cpu_info uci; 395 struct ucode_cpu_info uci;
398 396
399 if (mc_saved_data.mc_saved_count == 0) { 397 if (!mc_saved_data.num_saved) {
400 pr_debug("no microcode data saved.\n"); 398 pr_debug("no microcode data saved.\n");
401 return; 399 return;
402 } 400 }
403 pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count); 401 pr_debug("Total microcode saved: %d\n", mc_saved_data.num_saved);
404 402
405 collect_cpu_info_early(&uci); 403 collect_cpu_info_early(&uci);
406 404
@@ -409,7 +407,7 @@ static void show_saved_mc(void)
409 rev = uci.cpu_sig.rev; 407 rev = uci.cpu_sig.rev;
410 pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev); 408 pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev);
411 409
412 for (i = 0; i < mc_saved_data.mc_saved_count; i++) { 410 for (i = 0; i < mc_saved_data.num_saved; i++) {
413 struct microcode_header_intel *mc_saved_header; 411 struct microcode_header_intel *mc_saved_header;
414 struct extended_sigtable *ext_header; 412 struct extended_sigtable *ext_header;
415 int ext_sigcount; 413 int ext_sigcount;
@@ -465,7 +463,7 @@ int save_mc_for_early(u8 *mc)
465{ 463{
466 struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; 464 struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
467 unsigned int mc_saved_count_init; 465 unsigned int mc_saved_count_init;
468 unsigned int mc_saved_count; 466 unsigned int num_saved;
469 struct microcode_intel **mc_saved; 467 struct microcode_intel **mc_saved;
470 int ret = 0; 468 int ret = 0;
471 int i; 469 int i;
@@ -476,23 +474,23 @@ int save_mc_for_early(u8 *mc)
476 */ 474 */
477 mutex_lock(&x86_cpu_microcode_mutex); 475 mutex_lock(&x86_cpu_microcode_mutex);
478 476
479 mc_saved_count_init = mc_saved_data.mc_saved_count; 477 mc_saved_count_init = mc_saved_data.num_saved;
480 mc_saved_count = mc_saved_data.mc_saved_count; 478 num_saved = mc_saved_data.num_saved;
481 mc_saved = mc_saved_data.mc_saved; 479 mc_saved = mc_saved_data.mc_saved;
482 480
483 if (mc_saved && mc_saved_count) 481 if (mc_saved && num_saved)
484 memcpy(mc_saved_tmp, mc_saved, 482 memcpy(mc_saved_tmp, mc_saved,
485 mc_saved_count * sizeof(struct microcode_intel *)); 483 num_saved * sizeof(struct microcode_intel *));
486 /* 484 /*
 487 * Save the microcode patch mc in mc_saved_tmp structure if it's a newer 485
488 * version. 486 * version.
489 */ 487 */
490 mc_saved_count = _save_mc(mc_saved_tmp, mc, mc_saved_count); 488 num_saved = _save_mc(mc_saved_tmp, mc, num_saved);
491 489
492 /* 490 /*
 493 * Save the mc_saved_tmp in global mc_saved_data. 491
494 */ 492 */
495 ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count); 493 ret = save_microcode(&mc_saved_data, mc_saved_tmp, num_saved);
496 if (ret) { 494 if (ret) {
497 pr_err("Cannot save microcode patch.\n"); 495 pr_err("Cannot save microcode patch.\n");
498 goto out; 496 goto out;
@@ -536,7 +534,7 @@ static bool __init load_builtin_intel_microcode(struct cpio_data *cp)
536 534
537static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; 535static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin";
538static __init enum ucode_state 536static __init enum ucode_state
539scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, 537scan_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs,
540 unsigned long start, unsigned long size, 538 unsigned long start, unsigned long size,
541 struct ucode_cpu_info *uci) 539 struct ucode_cpu_info *uci)
542{ 540{
@@ -551,14 +549,18 @@ scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
551 cd.data = NULL; 549 cd.data = NULL;
552 cd.size = 0; 550 cd.size = 0;
553 551
554 cd = find_cpio_data(p, (void *)start, size, &offset); 552 /* try built-in microcode if no initrd */
555 if (!cd.data) { 553 if (!size) {
556 if (!load_builtin_intel_microcode(&cd)) 554 if (!load_builtin_intel_microcode(&cd))
557 return UCODE_ERROR; 555 return UCODE_ERROR;
556 } else {
557 cd = find_cpio_data(p, (void *)start, size, &offset);
558 if (!cd.data)
559 return UCODE_ERROR;
558 } 560 }
559 561
560 return get_matching_model_microcode(0, start, cd.data, cd.size, 562 return get_matching_model_microcode(start, cd.data, cd.size,
561 mc_saved_data, initrd, uci); 563 mcs, mc_ptrs, uci);
562} 564}
563 565
564/* 566/*
@@ -567,14 +569,11 @@ scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
567static void 569static void
568print_ucode_info(struct ucode_cpu_info *uci, unsigned int date) 570print_ucode_info(struct ucode_cpu_info *uci, unsigned int date)
569{ 571{
570 int cpu = smp_processor_id(); 572 pr_info_once("microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n",
571 573 uci->cpu_sig.rev,
572 pr_info("CPU%d microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n", 574 date & 0xffff,
573 cpu, 575 date >> 24,
574 uci->cpu_sig.rev, 576 (date >> 16) & 0xff);
575 date & 0xffff,
576 date >> 24,
577 (date >> 16) & 0xff);
578} 577}
579 578
580#ifdef CONFIG_X86_32 579#ifdef CONFIG_X86_32
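
The messages above unpack the microcode header's date field as year in the low 16 bits, month in the top byte and day in bits 16-23, printed with %x because the fields are stored as packed BCD. A standalone sketch of that decoding (the sample value is invented):

    /*
     * Standalone sketch of the date decoding used in the messages above.
     */
    #include <stdio.h>

    static void print_date(unsigned int date)
    {
            printf("%04x-%02x-%02x\n",
                   date & 0xffff,               /* year  */
                   date >> 24,                  /* month */
                   (date >> 16) & 0xff);        /* day   */
    }

    int main(void)
    {
            print_date(0x03112016);             /* prints 2016-03-11 */
            return 0;
    }
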
@@ -603,19 +602,19 @@ void show_ucode_info_early(void)
603 */ 602 */
604static void print_ucode(struct ucode_cpu_info *uci) 603static void print_ucode(struct ucode_cpu_info *uci)
605{ 604{
606 struct microcode_intel *mc_intel; 605 struct microcode_intel *mc;
607 int *delay_ucode_info_p; 606 int *delay_ucode_info_p;
608 int *current_mc_date_p; 607 int *current_mc_date_p;
609 608
610 mc_intel = uci->mc; 609 mc = uci->mc;
611 if (mc_intel == NULL) 610 if (!mc)
612 return; 611 return;
613 612
614 delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info); 613 delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info);
615 current_mc_date_p = (int *)__pa_nodebug(&current_mc_date); 614 current_mc_date_p = (int *)__pa_nodebug(&current_mc_date);
616 615
617 *delay_ucode_info_p = 1; 616 *delay_ucode_info_p = 1;
618 *current_mc_date_p = mc_intel->hdr.date; 617 *current_mc_date_p = mc->hdr.date;
619} 618}
620#else 619#else
621 620
@@ -630,37 +629,35 @@ static inline void flush_tlb_early(void)
630 629
631static inline void print_ucode(struct ucode_cpu_info *uci) 630static inline void print_ucode(struct ucode_cpu_info *uci)
632{ 631{
633 struct microcode_intel *mc_intel; 632 struct microcode_intel *mc;
634 633
635 mc_intel = uci->mc; 634 mc = uci->mc;
636 if (mc_intel == NULL) 635 if (!mc)
637 return; 636 return;
638 637
639 print_ucode_info(uci, mc_intel->hdr.date); 638 print_ucode_info(uci, mc->hdr.date);
640} 639}
641#endif 640#endif
642 641
643static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) 642static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
644{ 643{
645 struct microcode_intel *mc_intel; 644 struct microcode_intel *mc;
646 unsigned int val[2]; 645 unsigned int val[2];
647 646
648 mc_intel = uci->mc; 647 mc = uci->mc;
649 if (mc_intel == NULL) 648 if (!mc)
650 return 0; 649 return 0;
651 650
652 /* write microcode via MSR 0x79 */ 651 /* write microcode via MSR 0x79 */
653 native_wrmsr(MSR_IA32_UCODE_WRITE, 652 native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
654 (unsigned long) mc_intel->bits, 653 native_wrmsrl(MSR_IA32_UCODE_REV, 0);
655 (unsigned long) mc_intel->bits >> 16 >> 16);
656 native_wrmsr(MSR_IA32_UCODE_REV, 0, 0);
657 654
658 /* As documented in the SDM: Do a CPUID 1 here */ 655 /* As documented in the SDM: Do a CPUID 1 here */
659 sync_core(); 656 sync_core();
660 657
661 /* get the current revision from MSR 0x8B */ 658 /* get the current revision from MSR 0x8B */
662 native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); 659 native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
663 if (val[1] != mc_intel->hdr.rev) 660 if (val[1] != mc->hdr.rev)
664 return -1; 661 return -1;
665 662
666#ifdef CONFIG_X86_64 663#ifdef CONFIG_X86_64
@@ -672,25 +669,26 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
672 if (early) 669 if (early)
673 print_ucode(uci); 670 print_ucode(uci);
674 else 671 else
675 print_ucode_info(uci, mc_intel->hdr.date); 672 print_ucode_info(uci, mc->hdr.date);
676 673
677 return 0; 674 return 0;
678} 675}
679 676
680/* 677/*
681 * This function converts microcode patch offsets previously stored in 678 * This function converts microcode patch offsets previously stored in
682 * mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data. 679 * mc_tmp_ptrs to pointers and stores the pointers in mc_saved_data.
683 */ 680 */
684int __init save_microcode_in_initrd_intel(void) 681int __init save_microcode_in_initrd_intel(void)
685{ 682{
686 unsigned int count = mc_saved_data.mc_saved_count; 683 unsigned int count = mc_saved_data.num_saved;
687 struct microcode_intel *mc_saved[MAX_UCODE_COUNT]; 684 struct microcode_intel *mc_saved[MAX_UCODE_COUNT];
688 int ret = 0; 685 int ret = 0;
689 686
690 if (count == 0) 687 if (!count)
691 return ret; 688 return ret;
692 689
693 copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, initrd_start, count); 690 copy_ptrs(mc_saved, mc_tmp_ptrs, get_initrd_start(), count);
691
694 ret = save_microcode(&mc_saved_data, mc_saved, count); 692 ret = save_microcode(&mc_saved_data, mc_saved, count);
695 if (ret) 693 if (ret)
696 pr_err("Cannot save microcode patches from initrd.\n"); 694 pr_err("Cannot save microcode patches from initrd.\n");
@@ -701,8 +699,7 @@ int __init save_microcode_in_initrd_intel(void)
701} 699}
702 700
703static void __init 701static void __init
704_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, 702_load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *mc_ptrs,
705 unsigned long *initrd,
706 unsigned long start, unsigned long size) 703 unsigned long start, unsigned long size)
707{ 704{
708 struct ucode_cpu_info uci; 705 struct ucode_cpu_info uci;
@@ -710,11 +707,11 @@ _load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data,
710 707
711 collect_cpu_info_early(&uci); 708 collect_cpu_info_early(&uci);
712 709
713 ret = scan_microcode(mc_saved_data, initrd, start, size, &uci); 710 ret = scan_microcode(mcs, mc_ptrs, start, size, &uci);
714 if (ret != UCODE_OK) 711 if (ret != UCODE_OK)
715 return; 712 return;
716 713
717 ret = load_microcode(mc_saved_data, initrd, start, &uci); 714 ret = load_microcode(mcs, mc_ptrs, start, &uci);
718 if (ret != UCODE_OK) 715 if (ret != UCODE_OK)
719 return; 716 return;
720 717
@@ -728,53 +725,49 @@ void __init load_ucode_intel_bsp(void)
728 struct boot_params *p; 725 struct boot_params *p;
729 726
730 p = (struct boot_params *)__pa_nodebug(&boot_params); 727 p = (struct boot_params *)__pa_nodebug(&boot_params);
731 start = p->hdr.ramdisk_image;
732 size = p->hdr.ramdisk_size; 728 size = p->hdr.ramdisk_size;
733 729
734 _load_ucode_intel_bsp( 730 /*
735 (struct mc_saved_data *)__pa_nodebug(&mc_saved_data), 731 * Set start only if we have an initrd image. We cannot use initrd_start
736 (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), 732 * because it is not set that early yet.
737 start, size); 733 */
734 start = (size ? p->hdr.ramdisk_image : 0);
735
736 _load_ucode_intel_bsp((struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
737 (unsigned long *)__pa_nodebug(&mc_tmp_ptrs),
738 start, size);
738#else 739#else
739 start = boot_params.hdr.ramdisk_image + PAGE_OFFSET;
740 size = boot_params.hdr.ramdisk_size; 740 size = boot_params.hdr.ramdisk_size;
741 start = (size ? boot_params.hdr.ramdisk_image + PAGE_OFFSET : 0);
741 742
742 _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size); 743 _load_ucode_intel_bsp(&mc_saved_data, mc_tmp_ptrs, start, size);
743#endif 744#endif
744} 745}
745 746
746void load_ucode_intel_ap(void) 747void load_ucode_intel_ap(void)
747{ 748{
748 struct mc_saved_data *mc_saved_data_p; 749 unsigned long *mcs_tmp_p;
750 struct mc_saved_data *mcs_p;
749 struct ucode_cpu_info uci; 751 struct ucode_cpu_info uci;
750 unsigned long *mc_saved_in_initrd_p;
751 unsigned long initrd_start_addr;
752 enum ucode_state ret; 752 enum ucode_state ret;
753#ifdef CONFIG_X86_32 753#ifdef CONFIG_X86_32
754 unsigned long *initrd_start_p;
755 754
756 mc_saved_in_initrd_p = 755 mcs_tmp_p = (unsigned long *)__pa_nodebug(mc_tmp_ptrs);
757 (unsigned long *)__pa_nodebug(mc_saved_in_initrd); 756 mcs_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data);
758 mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data);
759 initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start);
760 initrd_start_addr = (unsigned long)__pa_nodebug(*initrd_start_p);
761#else 757#else
762 mc_saved_data_p = &mc_saved_data; 758 mcs_tmp_p = mc_tmp_ptrs;
763 mc_saved_in_initrd_p = mc_saved_in_initrd; 759 mcs_p = &mc_saved_data;
764 initrd_start_addr = initrd_start;
765#endif 760#endif
766 761
767 /* 762 /*
768 * If there is no valid ucode previously saved in memory, no need to 763 * If there is no valid ucode previously saved in memory, no need to
769 * update ucode on this AP. 764 * update ucode on this AP.
770 */ 765 */
771 if (mc_saved_data_p->mc_saved_count == 0) 766 if (!mcs_p->num_saved)
772 return; 767 return;
773 768
774 collect_cpu_info_early(&uci); 769 collect_cpu_info_early(&uci);
775 ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, 770 ret = load_microcode(mcs_p, mcs_tmp_p, get_initrd_start_addr(), &uci);
776 initrd_start_addr, &uci);
777
778 if (ret != UCODE_OK) 771 if (ret != UCODE_OK)
779 return; 772 return;
780 773
@@ -786,13 +779,13 @@ void reload_ucode_intel(void)
786 struct ucode_cpu_info uci; 779 struct ucode_cpu_info uci;
787 enum ucode_state ret; 780 enum ucode_state ret;
788 781
789 if (!mc_saved_data.mc_saved_count) 782 if (!mc_saved_data.num_saved)
790 return; 783 return;
791 784
792 collect_cpu_info_early(&uci); 785 collect_cpu_info_early(&uci);
793 786
794 ret = load_microcode_early(mc_saved_data.mc_saved, 787 ret = load_microcode_early(mc_saved_data.mc_saved,
795 mc_saved_data.mc_saved_count, &uci); 788 mc_saved_data.num_saved, &uci);
796 if (ret != UCODE_OK) 789 if (ret != UCODE_OK)
797 return; 790 return;
798 791
@@ -825,7 +818,7 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
825 * return 0 - no update found 818 * return 0 - no update found
826 * return 1 - found update 819 * return 1 - found update
827 */ 820 */
828static int get_matching_mc(struct microcode_intel *mc_intel, int cpu) 821static int get_matching_mc(struct microcode_intel *mc, int cpu)
829{ 822{
830 struct cpu_signature cpu_sig; 823 struct cpu_signature cpu_sig;
831 unsigned int csig, cpf, crev; 824 unsigned int csig, cpf, crev;
@@ -836,39 +829,36 @@ static int get_matching_mc(struct microcode_intel *mc_intel, int cpu)
836 cpf = cpu_sig.pf; 829 cpf = cpu_sig.pf;
837 crev = cpu_sig.rev; 830 crev = cpu_sig.rev;
838 831
839 return has_newer_microcode(mc_intel, csig, cpf, crev); 832 return has_newer_microcode(mc, csig, cpf, crev);
840} 833}
841 834
842static int apply_microcode_intel(int cpu) 835static int apply_microcode_intel(int cpu)
843{ 836{
844 struct microcode_intel *mc_intel; 837 struct microcode_intel *mc;
845 struct ucode_cpu_info *uci; 838 struct ucode_cpu_info *uci;
839 struct cpuinfo_x86 *c;
846 unsigned int val[2]; 840 unsigned int val[2];
847 int cpu_num = raw_smp_processor_id();
848 struct cpuinfo_x86 *c = &cpu_data(cpu_num);
849
850 uci = ucode_cpu_info + cpu;
851 mc_intel = uci->mc;
852 841
853 /* We should bind the task to the CPU */ 842 /* We should bind the task to the CPU */
854 BUG_ON(cpu_num != cpu); 843 if (WARN_ON(raw_smp_processor_id() != cpu))
844 return -1;
855 845
856 if (mc_intel == NULL) 846 uci = ucode_cpu_info + cpu;
847 mc = uci->mc;
848 if (!mc)
857 return 0; 849 return 0;
858 850
859 /* 851 /*
860 * Microcode on this CPU could be updated earlier. Only apply the 852 * Microcode on this CPU could be updated earlier. Only apply the
861 * microcode patch in mc_intel when it is newer than the one on this 853 * microcode patch in mc when it is newer than the one on this
862 * CPU. 854 * CPU.
863 */ 855 */
864 if (get_matching_mc(mc_intel, cpu) == 0) 856 if (!get_matching_mc(mc, cpu))
865 return 0; 857 return 0;
866 858
867 /* write microcode via MSR 0x79 */ 859 /* write microcode via MSR 0x79 */
868 wrmsr(MSR_IA32_UCODE_WRITE, 860 wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
869 (unsigned long) mc_intel->bits, 861 wrmsrl(MSR_IA32_UCODE_REV, 0);
870 (unsigned long) mc_intel->bits >> 16 >> 16);
871 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
872 862
873 /* As documented in the SDM: Do a CPUID 1 here */ 863 /* As documented in the SDM: Do a CPUID 1 here */
874 sync_core(); 864 sync_core();
@@ -876,16 +866,19 @@ static int apply_microcode_intel(int cpu)
876 /* get the current revision from MSR 0x8B */ 866 /* get the current revision from MSR 0x8B */
877 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); 867 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
878 868
879 if (val[1] != mc_intel->hdr.rev) { 869 if (val[1] != mc->hdr.rev) {
880 pr_err("CPU%d update to revision 0x%x failed\n", 870 pr_err("CPU%d update to revision 0x%x failed\n",
881 cpu_num, mc_intel->hdr.rev); 871 cpu, mc->hdr.rev);
882 return -1; 872 return -1;
883 } 873 }
874
884 pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n", 875 pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n",
885 cpu_num, val[1], 876 cpu, val[1],
886 mc_intel->hdr.date & 0xffff, 877 mc->hdr.date & 0xffff,
887 mc_intel->hdr.date >> 24, 878 mc->hdr.date >> 24,
888 (mc_intel->hdr.date >> 16) & 0xff); 879 (mc->hdr.date >> 16) & 0xff);
880
881 c = &cpu_data(cpu);
889 882
890 uci->cpu_sig.rev = val[1]; 883 uci->cpu_sig.rev = val[1];
891 c->microcode = val[1]; 884 c->microcode = val[1];
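
Both apply paths above switch from wrmsr(msr, lo, hi) to wrmsrl(msr, val), so the caller no longer splits the 64-bit buffer address by hand; the old ">> 16 >> 16" existed because shifting a 32-bit unsigned long by 32 in one step is undefined on 32-bit builds. A standalone sketch of the split and rejoin arithmetic (the sample value is arbitrary):

    /*
     * The lo/hi split that wrmsr() needs versus the single 64-bit value that
     * wrmsrl() takes.
     */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t val = 0x1122334455667788ULL;       /* e.g. a buffer address */
            uint32_t lo  = (uint32_t)val;
            uint32_t hi  = (uint32_t)(val >> 16 >> 16); /* the old two-step shift */

            printf("lo=0x%08x hi=0x%08x\n", lo, hi);
            printf("rejoined=0x%016llx\n",
                   (unsigned long long)(((uint64_t)hi << 32) | lo));
            return 0;
    }
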
diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c
index b96896bcbdaf..2ce1a7dc45b7 100644
--- a/arch/x86/kernel/cpu/microcode/intel_lib.c
+++ b/arch/x86/kernel/cpu/microcode/intel_lib.c
@@ -49,7 +49,7 @@ int microcode_sanity_check(void *mc, int print_err)
49 unsigned long total_size, data_size, ext_table_size; 49 unsigned long total_size, data_size, ext_table_size;
50 struct microcode_header_intel *mc_header = mc; 50 struct microcode_header_intel *mc_header = mc;
51 struct extended_sigtable *ext_header = NULL; 51 struct extended_sigtable *ext_header = NULL;
52 int sum, orig_sum, ext_sigcount = 0, i; 52 u32 sum, orig_sum, ext_sigcount = 0, i;
53 struct extended_signature *ext_sig; 53 struct extended_signature *ext_sig;
54 54
55 total_size = get_totalsize(mc_header); 55 total_size = get_totalsize(mc_header);
@@ -57,69 +57,85 @@ int microcode_sanity_check(void *mc, int print_err)
57 57
58 if (data_size + MC_HEADER_SIZE > total_size) { 58 if (data_size + MC_HEADER_SIZE > total_size) {
59 if (print_err) 59 if (print_err)
60 pr_err("error! Bad data size in microcode data file\n"); 60 pr_err("Error: bad microcode data file size.\n");
61 return -EINVAL; 61 return -EINVAL;
62 } 62 }
63 63
64 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { 64 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
65 if (print_err) 65 if (print_err)
66 pr_err("error! Unknown microcode update format\n"); 66 pr_err("Error: invalid/unknown microcode update format.\n");
67 return -EINVAL; 67 return -EINVAL;
68 } 68 }
69
69 ext_table_size = total_size - (MC_HEADER_SIZE + data_size); 70 ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
70 if (ext_table_size) { 71 if (ext_table_size) {
72 u32 ext_table_sum = 0;
73 u32 *ext_tablep;
74
71 if ((ext_table_size < EXT_HEADER_SIZE) 75 if ((ext_table_size < EXT_HEADER_SIZE)
72 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { 76 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
73 if (print_err) 77 if (print_err)
74 pr_err("error! Small exttable size in microcode data file\n"); 78 pr_err("Error: truncated extended signature table.\n");
75 return -EINVAL; 79 return -EINVAL;
76 } 80 }
81
77 ext_header = mc + MC_HEADER_SIZE + data_size; 82 ext_header = mc + MC_HEADER_SIZE + data_size;
78 if (ext_table_size != exttable_size(ext_header)) { 83 if (ext_table_size != exttable_size(ext_header)) {
79 if (print_err) 84 if (print_err)
80 pr_err("error! Bad exttable size in microcode data file\n"); 85 pr_err("Error: extended signature table size mismatch.\n");
81 return -EFAULT; 86 return -EFAULT;
82 } 87 }
88
83 ext_sigcount = ext_header->count; 89 ext_sigcount = ext_header->count;
84 }
85 90
86 /* check extended table checksum */ 91 /*
87 if (ext_table_size) { 92 * Check extended table checksum: the sum of all dwords that
88 int ext_table_sum = 0; 93 * comprise a valid table must be 0.
89 int *ext_tablep = (int *)ext_header; 94 */
95 ext_tablep = (u32 *)ext_header;
90 96
91 i = ext_table_size / DWSIZE; 97 i = ext_table_size / sizeof(u32);
92 while (i--) 98 while (i--)
93 ext_table_sum += ext_tablep[i]; 99 ext_table_sum += ext_tablep[i];
100
94 if (ext_table_sum) { 101 if (ext_table_sum) {
95 if (print_err) 102 if (print_err)
96 pr_warn("aborting, bad extended signature table checksum\n"); 103 pr_warn("Bad extended signature table checksum, aborting.\n");
97 return -EINVAL; 104 return -EINVAL;
98 } 105 }
99 } 106 }
100 107
101 /* calculate the checksum */ 108 /*
109 * Calculate the checksum of update data and header. The checksum of
110 * valid update data and header including the extended signature table
111 * must be 0.
112 */
102 orig_sum = 0; 113 orig_sum = 0;
103 i = (MC_HEADER_SIZE + data_size) / DWSIZE; 114 i = (MC_HEADER_SIZE + data_size) / sizeof(u32);
104 while (i--) 115 while (i--)
105 orig_sum += ((int *)mc)[i]; 116 orig_sum += ((u32 *)mc)[i];
117
106 if (orig_sum) { 118 if (orig_sum) {
107 if (print_err) 119 if (print_err)
108 pr_err("aborting, bad checksum\n"); 120 pr_err("Bad microcode data checksum, aborting.\n");
109 return -EINVAL; 121 return -EINVAL;
110 } 122 }
123
111 if (!ext_table_size) 124 if (!ext_table_size)
112 return 0; 125 return 0;
113 /* check extended signature checksum */ 126
127 /*
128 * Check extended signature checksum: 0 => valid.
129 */
114 for (i = 0; i < ext_sigcount; i++) { 130 for (i = 0; i < ext_sigcount; i++) {
115 ext_sig = (void *)ext_header + EXT_HEADER_SIZE + 131 ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
116 EXT_SIGNATURE_SIZE * i; 132 EXT_SIGNATURE_SIZE * i;
117 sum = orig_sum 133
118 - (mc_header->sig + mc_header->pf + mc_header->cksum) 134 sum = (mc_header->sig + mc_header->pf + mc_header->cksum) -
119 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); 135 (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
120 if (sum) { 136 if (sum) {
121 if (print_err) 137 if (print_err)
122 pr_err("aborting, bad checksum\n"); 138 pr_err("Bad extended signature checksum, aborting.\n");
123 return -EINVAL; 139 return -EINVAL;
124 } 140 }
125 } 141 }
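
The reworked microcode_sanity_check() above spells out the rule the checksums rely on: summing every 32-bit word of a valid update (header, data and extended table) must give zero, because each piece carries a fix-up word. A standalone sketch of that dword-sum check on a made-up buffer:

    /*
     * The "sum of all 32-bit words must be zero" rule, on invented data.
     */
    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>

    static uint32_t dword_sum(const uint32_t *p, size_t ndwords)
    {
            uint32_t sum = 0;

            while (ndwords--)
                    sum += p[ndwords];
            return sum;
    }

    int main(void)
    {
            uint32_t buf[4] = { 0x11111111, 0x22222222, 0x33333333, 0 };

            buf[3] = (uint32_t)-(buf[0] + buf[1] + buf[2]);     /* fix-up word */
            printf("checksum = 0x%08x\n",
                   (unsigned int)dword_sum(buf, 4));            /* prints 0 */
            return 0;
    }
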
diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh
index 3f20710a5b23..6988c74409a8 100644
--- a/arch/x86/kernel/cpu/mkcapflags.sh
+++ b/arch/x86/kernel/cpu/mkcapflags.sh
@@ -1,6 +1,6 @@
1#!/bin/sh 1#!/bin/sh
2# 2#
3# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeature.h 3# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeatures.h
4# 4#
5 5
6IN=$1 6IN=$1
@@ -49,8 +49,8 @@ dump_array()
49trap 'rm "$OUT"' EXIT 49trap 'rm "$OUT"' EXIT
50 50
51( 51(
52 echo "#ifndef _ASM_X86_CPUFEATURE_H" 52 echo "#ifndef _ASM_X86_CPUFEATURES_H"
53 echo "#include <asm/cpufeature.h>" 53 echo "#include <asm/cpufeatures.h>"
54 echo "#endif" 54 echo "#endif"
55 echo "" 55 echo ""
56 56
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index ba80d68f683e..10f8d4796240 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -47,7 +47,7 @@
47#include <linux/smp.h> 47#include <linux/smp.h>
48#include <linux/syscore_ops.h> 48#include <linux/syscore_ops.h>
49 49
50#include <asm/processor.h> 50#include <asm/cpufeature.h>
51#include <asm/e820.h> 51#include <asm/e820.h>
52#include <asm/mtrr.h> 52#include <asm/mtrr.h>
53#include <asm/msr.h> 53#include <asm/msr.h>
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index e3b4d1841175..34178564be2a 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -1,6 +1,6 @@
1#include <linux/kernel.h> 1#include <linux/kernel.h>
2#include <linux/mm.h> 2#include <linux/mm.h>
3#include <asm/processor.h> 3#include <asm/cpufeature.h>
4#include <asm/msr.h> 4#include <asm/msr.h>
5#include "cpu.h" 5#include "cpu.h"
6 6
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 837365f10912..621b501f8935 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -24,6 +24,7 @@
24#include <asm/e820.h> 24#include <asm/e820.h>
25#include <asm/proto.h> 25#include <asm/proto.h>
26#include <asm/setup.h> 26#include <asm/setup.h>
27#include <asm/cpufeature.h>
27 28
28/* 29/*
29 * The e820 map is the map that gets modified e.g. with command line parameters 30 * The e820 map is the map that gets modified e.g. with command line parameters
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index d5804adfa6da..0b1b9abd4d5f 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -114,6 +114,10 @@ void __kernel_fpu_begin(void)
114 kernel_fpu_disable(); 114 kernel_fpu_disable();
115 115
116 if (fpu->fpregs_active) { 116 if (fpu->fpregs_active) {
117 /*
118 * Ignore return value -- we don't care if reg state
119 * is clobbered.
120 */
117 copy_fpregs_to_fpstate(fpu); 121 copy_fpregs_to_fpstate(fpu);
118 } else { 122 } else {
119 this_cpu_write(fpu_fpregs_owner_ctx, NULL); 123 this_cpu_write(fpu_fpregs_owner_ctx, NULL);
@@ -189,8 +193,12 @@ void fpu__save(struct fpu *fpu)
189 193
190 preempt_disable(); 194 preempt_disable();
191 if (fpu->fpregs_active) { 195 if (fpu->fpregs_active) {
192 if (!copy_fpregs_to_fpstate(fpu)) 196 if (!copy_fpregs_to_fpstate(fpu)) {
193 fpregs_deactivate(fpu); 197 if (use_eager_fpu())
198 copy_kernel_to_fpregs(&fpu->state);
199 else
200 fpregs_deactivate(fpu);
201 }
194 } 202 }
195 preempt_enable(); 203 preempt_enable();
196} 204}
@@ -223,14 +231,15 @@ void fpstate_init(union fpregs_state *state)
223} 231}
224EXPORT_SYMBOL_GPL(fpstate_init); 232EXPORT_SYMBOL_GPL(fpstate_init);
225 233
226/* 234int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
227 * Copy the current task's FPU state to a new task's FPU context.
228 *
229 * In both the 'eager' and the 'lazy' case we save hardware registers
230 * directly to the destination buffer.
231 */
232static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu)
233{ 235{
236 dst_fpu->counter = 0;
237 dst_fpu->fpregs_active = 0;
238 dst_fpu->last_cpu = -1;
239
240 if (!src_fpu->fpstate_active || !cpu_has_fpu)
241 return 0;
242
234 WARN_ON_FPU(src_fpu != &current->thread.fpu); 243 WARN_ON_FPU(src_fpu != &current->thread.fpu);
235 244
236 /* 245 /*
@@ -243,10 +252,9 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu)
243 /* 252 /*
244 * Save current FPU registers directly into the child 253 * Save current FPU registers directly into the child
245 * FPU context, without any memory-to-memory copying. 254 * FPU context, without any memory-to-memory copying.
246 * 255 * In lazy mode, if the FPU context isn't loaded into
247 * If the FPU context got destroyed in the process (FNSAVE 256 * fpregs, CR0.TS will be set and do_device_not_available
248 * done on old CPUs) then copy it back into the source 257 * will load the FPU context.
249 * context and mark the current task for lazy restore.
250 * 258 *
251 * We have to do all this with preemption disabled, 259 * We have to do all this with preemption disabled,
252 * mostly because of the FNSAVE case, because in that 260 * mostly because of the FNSAVE case, because in that
@@ -259,19 +267,13 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu)
259 preempt_disable(); 267 preempt_disable();
260 if (!copy_fpregs_to_fpstate(dst_fpu)) { 268 if (!copy_fpregs_to_fpstate(dst_fpu)) {
261 memcpy(&src_fpu->state, &dst_fpu->state, xstate_size); 269 memcpy(&src_fpu->state, &dst_fpu->state, xstate_size);
262 fpregs_deactivate(src_fpu); 270
271 if (use_eager_fpu())
272 copy_kernel_to_fpregs(&src_fpu->state);
273 else
274 fpregs_deactivate(src_fpu);
263 } 275 }
264 preempt_enable(); 276 preempt_enable();
265}
266
267int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
268{
269 dst_fpu->counter = 0;
270 dst_fpu->fpregs_active = 0;
271 dst_fpu->last_cpu = -1;
272
273 if (src_fpu->fpstate_active && cpu_has_fpu)
274 fpu_copy(dst_fpu, src_fpu);
275 277
276 return 0; 278 return 0;
277} 279}
@@ -425,7 +427,7 @@ void fpu__clear(struct fpu *fpu)
425{ 427{
426 WARN_ON_FPU(fpu != &current->thread.fpu); /* Almost certainly an anomaly */ 428 WARN_ON_FPU(fpu != &current->thread.fpu); /* Almost certainly an anomaly */
427 429
428 if (!use_eager_fpu()) { 430 if (!use_eager_fpu() || !static_cpu_has(X86_FEATURE_FPU)) {
429 /* FPU state will be reallocated lazily at the first use. */ 431 /* FPU state will be reallocated lazily at the first use. */
430 fpu__drop(fpu); 432 fpu__drop(fpu);
431 } else { 433 } else {
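
The fpu__save()/fpu__copy() changes above adjust what happens when copy_fpregs_to_fpstate() reports that the save clobbered the live registers (the FNSAVE case): in eager mode the registers are reloaded from the just-saved state, in lazy mode they are deactivated and reloaded on the next fault. A standalone sketch of just that branch structure (not kernel code; names and the printed messages are stand-ins):

    /*
     * Illustrative control flow only: the eager/lazy split after a
     * register-clobbering save.
     */
    #include <stdio.h>
    #include <stdbool.h>

    static bool eager_fpu = true;               /* the patched default */

    static void fpu_save_sketch(bool regs_clobbered_by_save)
    {
            if (!regs_clobbered_by_save) {
                    printf("registers still valid, nothing more to do\n");
                    return;
            }
            if (eager_fpu)
                    printf("reload registers from the just-saved state\n");
            else
                    printf("deactivate fpregs; reload lazily on next use\n");
    }

    int main(void)
    {
            fpu_save_sketch(false);
            fpu_save_sketch(true);
            return 0;
    }
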
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index bd08fb77073d..54c86fffbf9f 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -262,7 +262,10 @@ static void __init fpu__init_system_xstate_size_legacy(void)
262 * not only saved the restores along the way, but we also have the 262 * not only saved the restores along the way, but we also have the
263 * FPU ready to be used for the original task. 263 * FPU ready to be used for the original task.
264 * 264 *
265 * 'eager' switching is used on modern CPUs, there we switch the FPU 265 * 'lazy' is deprecated because it's almost never a performance win
266 * and it's much more complicated than 'eager'.
267 *
268 * 'eager' switching is by default on all CPUs, there we switch the FPU
266 * state during every context switch, regardless of whether the task 269 * state during every context switch, regardless of whether the task
267 * has used FPU instructions in that time slice or not. This is done 270 * has used FPU instructions in that time slice or not. This is done
268 * because modern FPU context saving instructions are able to optimize 271 * because modern FPU context saving instructions are able to optimize
@@ -273,7 +276,7 @@ static void __init fpu__init_system_xstate_size_legacy(void)
273 * to use 'eager' restores, if we detect that a task is using the FPU 276 * to use 'eager' restores, if we detect that a task is using the FPU
274 * frequently. See the fpu->counter logic in fpu/internal.h for that. ] 277 * frequently. See the fpu->counter logic in fpu/internal.h for that. ]
275 */ 278 */
276static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO; 279static enum { ENABLE, DISABLE } eagerfpu = ENABLE;
277 280
278/* 281/*
279 * Find supported xfeatures based on cpu features and command-line input. 282 * Find supported xfeatures based on cpu features and command-line input.
@@ -344,15 +347,9 @@ static void __init fpu__init_system_ctx_switch(void)
344 */ 347 */
345static void __init fpu__init_parse_early_param(void) 348static void __init fpu__init_parse_early_param(void)
346{ 349{
347 /*
348 * No need to check "eagerfpu=auto" again, since it is the
349 * initial default.
350 */
351 if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) { 350 if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) {
352 eagerfpu = DISABLE; 351 eagerfpu = DISABLE;
353 fpu__clear_eager_fpu_features(); 352 fpu__clear_eager_fpu_features();
354 } else if (cmdline_find_option_bool(boot_command_line, "eagerfpu=on")) {
355 eagerfpu = ENABLE;
356 } 353 }
357 354
358 if (cmdline_find_option_bool(boot_command_line, "no387")) 355 if (cmdline_find_option_bool(boot_command_line, "no387"))
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index d425cda5ae6d..6e8354f5a593 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -51,6 +51,9 @@ void fpu__xstate_clear_all_cpu_caps(void)
51 setup_clear_cpu_cap(X86_FEATURE_AVX512PF); 51 setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
52 setup_clear_cpu_cap(X86_FEATURE_AVX512ER); 52 setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
53 setup_clear_cpu_cap(X86_FEATURE_AVX512CD); 53 setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
54 setup_clear_cpu_cap(X86_FEATURE_AVX512DQ);
55 setup_clear_cpu_cap(X86_FEATURE_AVX512BW);
56 setup_clear_cpu_cap(X86_FEATURE_AVX512VL);
54 setup_clear_cpu_cap(X86_FEATURE_MPX); 57 setup_clear_cpu_cap(X86_FEATURE_MPX);
55 setup_clear_cpu_cap(X86_FEATURE_XGETBV1); 58 setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
56} 59}
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 05c9e3f5b6d7..702547ce33c9 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -697,9 +697,8 @@ static inline void tramp_free(void *tramp) { }
697#endif 697#endif
698 698
699/* Defined as markers to the end of the ftrace default trampolines */ 699/* Defined as markers to the end of the ftrace default trampolines */
700extern void ftrace_caller_end(void);
701extern void ftrace_regs_caller_end(void); 700extern void ftrace_regs_caller_end(void);
702extern void ftrace_return(void); 701extern void ftrace_epilogue(void);
703extern void ftrace_caller_op_ptr(void); 702extern void ftrace_caller_op_ptr(void);
704extern void ftrace_regs_caller_op_ptr(void); 703extern void ftrace_regs_caller_op_ptr(void);
705 704
@@ -746,7 +745,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
746 op_offset = (unsigned long)ftrace_regs_caller_op_ptr; 745 op_offset = (unsigned long)ftrace_regs_caller_op_ptr;
747 } else { 746 } else {
748 start_offset = (unsigned long)ftrace_caller; 747 start_offset = (unsigned long)ftrace_caller;
749 end_offset = (unsigned long)ftrace_caller_end; 748 end_offset = (unsigned long)ftrace_epilogue;
750 op_offset = (unsigned long)ftrace_caller_op_ptr; 749 op_offset = (unsigned long)ftrace_caller_op_ptr;
751 } 750 }
752 751
@@ -754,7 +753,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
754 753
755 /* 754 /*
756 * Allocate enough size to store the ftrace_caller code, 755 * Allocate enough size to store the ftrace_caller code,
757 * the jmp to ftrace_return, as well as the address of 756 * the jmp to ftrace_epilogue, as well as the address of
758 * the ftrace_ops this trampoline is used for. 757 * the ftrace_ops this trampoline is used for.
759 */ 758 */
760 trampoline = alloc_tramp(size + MCOUNT_INSN_SIZE + sizeof(void *)); 759 trampoline = alloc_tramp(size + MCOUNT_INSN_SIZE + sizeof(void *));
@@ -772,8 +771,8 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
772 771
773 ip = (unsigned long)trampoline + size; 772 ip = (unsigned long)trampoline + size;
774 773
775 /* The trampoline ends with a jmp to ftrace_return */ 774 /* The trampoline ends with a jmp to ftrace_epilogue */
776 jmp = ftrace_jmp_replace(ip, (unsigned long)ftrace_return); 775 jmp = ftrace_jmp_replace(ip, (unsigned long)ftrace_epilogue);
777 memcpy(trampoline + size, jmp, MCOUNT_INSN_SIZE); 776 memcpy(trampoline + size, jmp, MCOUNT_INSN_SIZE);
778 777
779 /* 778 /*
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 2c0f3407bd1f..1f4422d5c8d0 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -40,13 +40,8 @@ pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
40/* Wipe all early page tables except for the kernel symbol map */ 40/* Wipe all early page tables except for the kernel symbol map */
41static void __init reset_early_page_tables(void) 41static void __init reset_early_page_tables(void)
42{ 42{
43 unsigned long i; 43 memset(early_level4_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));
44
45 for (i = 0; i < PTRS_PER_PGD-1; i++)
46 early_level4_pgt[i].pgd = 0;
47
48 next_early_pgt = 0; 44 next_early_pgt = 0;
49
50 write_cr3(__pa_nodebug(early_level4_pgt)); 45 write_cr3(__pa_nodebug(early_level4_pgt));
51} 46}
52 47
@@ -54,7 +49,6 @@ static void __init reset_early_page_tables(void)
54int __init early_make_pgtable(unsigned long address) 49int __init early_make_pgtable(unsigned long address)
55{ 50{
56 unsigned long physaddr = address - __PAGE_OFFSET; 51 unsigned long physaddr = address - __PAGE_OFFSET;
57 unsigned long i;
58 pgdval_t pgd, *pgd_p; 52 pgdval_t pgd, *pgd_p;
59 pudval_t pud, *pud_p; 53 pudval_t pud, *pud_p;
60 pmdval_t pmd, *pmd_p; 54 pmdval_t pmd, *pmd_p;
@@ -81,8 +75,7 @@ again:
81 } 75 }
82 76
83 pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++]; 77 pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
84 for (i = 0; i < PTRS_PER_PUD; i++) 78 memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
85 pud_p[i] = 0;
86 *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; 79 *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
87 } 80 }
88 pud_p += pud_index(address); 81 pud_p += pud_index(address);
@@ -97,8 +90,7 @@ again:
97 } 90 }
98 91
99 pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++]; 92 pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++];
100 for (i = 0; i < PTRS_PER_PMD; i++) 93 memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
101 pmd_p[i] = 0;
102 *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; 94 *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
103 } 95 }
104 pmd = (physaddr & PMD_MASK) + early_pmd_flags; 96 pmd = (physaddr & PMD_MASK) + early_pmd_flags;
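
The head64.c hunks above replace open-coded zeroing loops over page-table entries with memset() using sizeof(entry) * number-of-entries. A standalone check that the two forms produce the same result (the table size here is a stand-in for PTRS_PER_PUD/PTRS_PER_PMD):

    /*
     * Loop-vs-memset equivalence for zeroing an array of entries.
     */
    #include <stdio.h>
    #include <string.h>

    #define PTRS_PER_TABLE 8

    int main(void)
    {
            unsigned long a[PTRS_PER_TABLE], b[PTRS_PER_TABLE];

            for (int i = 0; i < PTRS_PER_TABLE; i++) {
                    a[i] = 0;                   /* the old explicit loop */
                    b[i] = 0xdeadbeef;          /* dirty b first */
            }
            memset(b, 0, sizeof(*b) * PTRS_PER_TABLE);  /* the new form */

            printf("identical: %d\n", !memcmp(a, b, sizeof(a)));
            return 0;
    }
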
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 6bc9ae24b6d2..54cdbd2003fe 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -19,7 +19,7 @@
19#include <asm/setup.h> 19#include <asm/setup.h>
20#include <asm/processor-flags.h> 20#include <asm/processor-flags.h>
21#include <asm/msr-index.h> 21#include <asm/msr-index.h>
22#include <asm/cpufeature.h> 22#include <asm/cpufeatures.h>
23#include <asm/percpu.h> 23#include <asm/percpu.h>
24#include <asm/nops.h> 24#include <asm/nops.h>
25#include <asm/bootparam.h> 25#include <asm/bootparam.h>
@@ -389,6 +389,12 @@ default_entry:
389 /* Make changes effective */ 389 /* Make changes effective */
390 wrmsr 390 wrmsr
391 391
392 /*
393 * And make sure that all the mappings we set up have NX set from
394 * the beginning.
395 */
396 orl $(1 << (_PAGE_BIT_NX - 32)), pa(__supported_pte_mask + 4)
397
392enable_paging: 398enable_paging:
393 399
394/* 400/*
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index ffdc0e860390..22fbf9df61bb 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -38,7 +38,6 @@
38#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) 38#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
39 39
40L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET) 40L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET)
41L3_PAGE_OFFSET = pud_index(__PAGE_OFFSET)
42L4_START_KERNEL = pgd_index(__START_KERNEL_map) 41L4_START_KERNEL = pgd_index(__START_KERNEL_map)
43L3_START_KERNEL = pud_index(__START_KERNEL_map) 42L3_START_KERNEL = pud_index(__START_KERNEL_map)
44 43
@@ -76,9 +75,7 @@ startup_64:
76 subq $_text - __START_KERNEL_map, %rbp 75 subq $_text - __START_KERNEL_map, %rbp
77 76
78 /* Is the address not 2M aligned? */ 77 /* Is the address not 2M aligned? */
79 movq %rbp, %rax 78 testl $~PMD_PAGE_MASK, %ebp
80 andl $~PMD_PAGE_MASK, %eax
81 testl %eax, %eax
82 jnz bad_address 79 jnz bad_address
83 80
84 /* 81 /*
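The rewritten alignment check folds three instructions into one: a 2 MiB-aligned value has its low 21 bits clear, so testing only the low 32 bits of the load delta against ~PMD_PAGE_MASK is enough. A user-space sketch of the same test, with PMD_PAGE_SIZE assumed to be 2 MiB as on x86_64:

	#include <assert.h>
	#include <stdint.h>

	#define PMD_PAGE_SIZE	(1UL << 21)		/* 2 MiB */
	#define PMD_PAGE_MASK	(~(PMD_PAGE_SIZE - 1))

	/* Mirrors "testl $~PMD_PAGE_MASK, %ebp": non-zero if addr is not 2 MiB aligned. */
	static int not_2m_aligned(uint64_t addr)
	{
		/* The offending bits are bits 0..20, so the low 32 bits suffice. */
		return (uint32_t)addr & ~(uint32_t)PMD_PAGE_MASK;
	}

	int main(void)
	{
		assert(!not_2m_aligned(0x200000));
		assert(not_2m_aligned(0x201000));
		return 0;
	}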
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index b8e6ff5cd5d0..be0ebbb6d1d1 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -12,6 +12,7 @@
12#include <linux/pm.h> 12#include <linux/pm.h>
13#include <linux/io.h> 13#include <linux/io.h>
14 14
15#include <asm/cpufeature.h>
15#include <asm/irqdomain.h> 16#include <asm/irqdomain.h>
16#include <asm/fixmap.h> 17#include <asm/fixmap.h>
17#include <asm/hpet.h> 18#include <asm/hpet.h>
diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S
index 87e1762e2bca..ed48a9f465f8 100644
--- a/arch/x86/kernel/mcount_64.S
+++ b/arch/x86/kernel/mcount_64.S
@@ -168,12 +168,14 @@ GLOBAL(ftrace_call)
168 restore_mcount_regs 168 restore_mcount_regs
169 169
170 /* 170 /*
171 * The copied trampoline must call ftrace_return as it 171 * The copied trampoline must call ftrace_epilogue as it
172 * still may need to call the function graph tracer. 172 * still may need to call the function graph tracer.
173 *
174 * The code up to this label is copied into trampolines so
175 * think twice before adding any new code or changing the
176 * layout here.
173 */ 177 */
174GLOBAL(ftrace_caller_end) 178GLOBAL(ftrace_epilogue)
175
176GLOBAL(ftrace_return)
177 179
178#ifdef CONFIG_FUNCTION_GRAPH_TRACER 180#ifdef CONFIG_FUNCTION_GRAPH_TRACER
179GLOBAL(ftrace_graph_call) 181GLOBAL(ftrace_graph_call)
@@ -244,14 +246,14 @@ GLOBAL(ftrace_regs_call)
244 popfq 246 popfq
245 247
246 /* 248 /*
247 * As this jmp to ftrace_return can be a short jump 249 * As this jmp to ftrace_epilogue can be a short jump
248 * it must not be copied into the trampoline. 250 * it must not be copied into the trampoline.
249 * The trampoline will add the code to jump 251 * The trampoline will add the code to jump
250 * to the return. 252 * to the return.
251 */ 253 */
252GLOBAL(ftrace_regs_caller_end) 254GLOBAL(ftrace_regs_caller_end)
253 255
254 jmp ftrace_return 256 jmp ftrace_epilogue
255 257
256END(ftrace_regs_caller) 258END(ftrace_regs_caller)
257 259
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 64f9616f93f1..7f3550acde1b 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -40,7 +40,7 @@
40#include <linux/uaccess.h> 40#include <linux/uaccess.h>
41#include <linux/gfp.h> 41#include <linux/gfp.h>
42 42
43#include <asm/processor.h> 43#include <asm/cpufeature.h>
44#include <asm/msr.h> 44#include <asm/msr.h>
45 45
46static struct class *msr_class; 46static struct class *msr_class;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 9decee2bfdbe..2915d54e9dd5 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -57,6 +57,9 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
57 */ 57 */
58 .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, 58 .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 },
59#endif 59#endif
60#ifdef CONFIG_X86_32
61 .SYSENTER_stack_canary = STACK_END_MAGIC,
62#endif
60}; 63};
61EXPORT_PER_CPU_SYMBOL(cpu_tss); 64EXPORT_PER_CPU_SYMBOL(cpu_tss);
62 65
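The new initializer plants STACK_END_MAGIC just below the SYSENTER stack; the do_debug() hunk later in this patch warns if that value ever changes. A toy user-space sketch of the same init-then-check pattern (the struct layout here is illustrative, not the kernel's):

	#include <assert.h>

	#define STACK_END_MAGIC 0x57AC6E9DUL	/* the kernel's magic value */

	/* Illustrative layout: canary below a downward-growing stack region. */
	struct toy_tss {
		unsigned long SYSENTER_stack_canary;
		char SYSENTER_stack[512];
	};

	int main(void)
	{
		struct toy_tss tss = { .SYSENTER_stack_canary = STACK_END_MAGIC };

		/* ... entry code would use tss.SYSENTER_stack here ... */

		/* The check added to do_debug() boils down to this comparison. */
		assert(tss.SYSENTER_stack_canary == STACK_END_MAGIC);
		return 0;
	}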
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index cb6282c3638f..548ddf7d6fd2 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -61,7 +61,38 @@
61 regs->seg = GET_SEG(seg) | 3; \ 61 regs->seg = GET_SEG(seg) | 3; \
62} while (0) 62} while (0)
63 63
64int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) 64#ifdef CONFIG_X86_64
65/*
66 * If regs->ss will cause an IRET fault, change it. Otherwise leave it
67 * alone. Using this generally makes no sense unless
68 * user_64bit_mode(regs) would return true.
69 */
70static void force_valid_ss(struct pt_regs *regs)
71{
72 u32 ar;
73 asm volatile ("lar %[old_ss], %[ar]\n\t"
74 "jz 1f\n\t" /* If invalid: */
75 "xorl %[ar], %[ar]\n\t" /* set ar = 0 */
76 "1:"
77 : [ar] "=r" (ar)
78 : [old_ss] "rm" ((u16)regs->ss));
79
80 /*
81 * For a valid 64-bit user context, we need DPL 3, type
82 * read-write data or read-write exp-down data, and S and P
83 * set. We can't use VERW because VERW doesn't check the
84 * P bit.
85 */
86 ar &= AR_DPL_MASK | AR_S | AR_P | AR_TYPE_MASK;
87 if (ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA) &&
88 ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN))
89 regs->ss = __USER_DS;
90}
91#endif
92
93static int restore_sigcontext(struct pt_regs *regs,
94 struct sigcontext __user *sc,
95 unsigned long uc_flags)
65{ 96{
66 unsigned long buf_val; 97 unsigned long buf_val;
67 void __user *buf; 98 void __user *buf;
@@ -94,15 +125,18 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc)
94 COPY(r15); 125 COPY(r15);
95#endif /* CONFIG_X86_64 */ 126#endif /* CONFIG_X86_64 */
96 127
97#ifdef CONFIG_X86_32
98 COPY_SEG_CPL3(cs); 128 COPY_SEG_CPL3(cs);
99 COPY_SEG_CPL3(ss); 129 COPY_SEG_CPL3(ss);
100#else /* !CONFIG_X86_32 */ 130
101 /* Kernel saves and restores only the CS segment register on signals, 131#ifdef CONFIG_X86_64
102 * which is the bare minimum needed to allow mixed 32/64-bit code. 132 /*
103 * App's signal handler can save/restore other segments if needed. */ 133 * Fix up SS if needed for the benefit of old DOSEMU and
104 COPY_SEG_CPL3(cs); 134 * CRIU.
105#endif /* CONFIG_X86_32 */ 135 */
136 if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) &&
137 user_64bit_mode(regs)))
138 force_valid_ss(regs);
139#endif
106 140
107 get_user_ex(tmpflags, &sc->flags); 141 get_user_ex(tmpflags, &sc->flags);
108 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); 142 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
@@ -165,6 +199,7 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
165 put_user_ex(regs->cs, &sc->cs); 199 put_user_ex(regs->cs, &sc->cs);
166 put_user_ex(0, &sc->gs); 200 put_user_ex(0, &sc->gs);
167 put_user_ex(0, &sc->fs); 201 put_user_ex(0, &sc->fs);
202 put_user_ex(regs->ss, &sc->ss);
168#endif /* CONFIG_X86_32 */ 203#endif /* CONFIG_X86_32 */
169 204
170 put_user_ex(fpstate, &sc->fpstate); 205 put_user_ex(fpstate, &sc->fpstate);
@@ -403,6 +438,21 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
403 return 0; 438 return 0;
404} 439}
405#else /* !CONFIG_X86_32 */ 440#else /* !CONFIG_X86_32 */
441static unsigned long frame_uc_flags(struct pt_regs *regs)
442{
443 unsigned long flags;
444
445 if (cpu_has_xsave)
446 flags = UC_FP_XSTATE | UC_SIGCONTEXT_SS;
447 else
448 flags = UC_SIGCONTEXT_SS;
449
450 if (likely(user_64bit_mode(regs)))
451 flags |= UC_STRICT_RESTORE_SS;
452
453 return flags;
454}
455
406static int __setup_rt_frame(int sig, struct ksignal *ksig, 456static int __setup_rt_frame(int sig, struct ksignal *ksig,
407 sigset_t *set, struct pt_regs *regs) 457 sigset_t *set, struct pt_regs *regs)
408{ 458{
@@ -422,10 +472,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
422 472
423 put_user_try { 473 put_user_try {
424 /* Create the ucontext. */ 474 /* Create the ucontext. */
425 if (cpu_has_xsave) 475 put_user_ex(frame_uc_flags(regs), &frame->uc.uc_flags);
426 put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
427 else
428 put_user_ex(0, &frame->uc.uc_flags);
429 put_user_ex(0, &frame->uc.uc_link); 476 put_user_ex(0, &frame->uc.uc_link);
430 save_altstack_ex(&frame->uc.uc_stack, regs->sp); 477 save_altstack_ex(&frame->uc.uc_stack, regs->sp);
431 478
@@ -459,10 +506,28 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
459 506
460 regs->sp = (unsigned long)frame; 507 regs->sp = (unsigned long)frame;
461 508
462 /* Set up the CS register to run signal handlers in 64-bit mode, 509 /*
463 even if the handler happens to be interrupting 32-bit code. */ 510 * Set up the CS and SS registers to run signal handlers in
511 * 64-bit mode, even if the handler happens to be interrupting
512 * 32-bit or 16-bit code.
513 *
514 * SS is subtle. In 64-bit mode, we don't need any particular
515 * SS descriptor, but we do need SS to be valid. It's possible
516 * that the old SS is entirely bogus -- this can happen if the
517 * signal we're trying to deliver is #GP or #SS caused by a bad
 518 * SS value. We also have a compatibility issue here: DOSEMU
519 * relies on the contents of the SS register indicating the
520 * SS value at the time of the signal, even though that code in
521 * DOSEMU predates sigreturn's ability to restore SS. (DOSEMU
522 * avoids relying on sigreturn to restore SS; instead it uses
523 * a trampoline.) So we do our best: if the old SS was valid,
524 * we keep it. Otherwise we replace it.
525 */
464 regs->cs = __USER_CS; 526 regs->cs = __USER_CS;
465 527
528 if (unlikely(regs->ss != __USER_DS))
529 force_valid_ss(regs);
530
466 return 0; 531 return 0;
467} 532}
468#endif /* CONFIG_X86_32 */ 533#endif /* CONFIG_X86_32 */
@@ -489,10 +554,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig,
489 554
490 put_user_try { 555 put_user_try {
491 /* Create the ucontext. */ 556 /* Create the ucontext. */
492 if (cpu_has_xsave) 557 put_user_ex(frame_uc_flags(regs), &frame->uc.uc_flags);
493 put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
494 else
495 put_user_ex(0, &frame->uc.uc_flags);
496 put_user_ex(0, &frame->uc.uc_link); 558 put_user_ex(0, &frame->uc.uc_link);
497 compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp); 559 compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp);
498 put_user_ex(0, &frame->uc.uc__pad0); 560 put_user_ex(0, &frame->uc.uc__pad0);
@@ -554,7 +616,11 @@ asmlinkage unsigned long sys_sigreturn(void)
554 616
555 set_current_blocked(&set); 617 set_current_blocked(&set);
556 618
557 if (restore_sigcontext(regs, &frame->sc)) 619 /*
620 * x86_32 has no uc_flags bits relevant to restore_sigcontext.
621 * Save a few cycles by skipping the __get_user.
622 */
623 if (restore_sigcontext(regs, &frame->sc, 0))
558 goto badframe; 624 goto badframe;
559 return regs->ax; 625 return regs->ax;
560 626
@@ -570,16 +636,19 @@ asmlinkage long sys_rt_sigreturn(void)
570 struct pt_regs *regs = current_pt_regs(); 636 struct pt_regs *regs = current_pt_regs();
571 struct rt_sigframe __user *frame; 637 struct rt_sigframe __user *frame;
572 sigset_t set; 638 sigset_t set;
639 unsigned long uc_flags;
573 640
574 frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); 641 frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
575 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) 642 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
576 goto badframe; 643 goto badframe;
577 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) 644 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
578 goto badframe; 645 goto badframe;
646 if (__get_user(uc_flags, &frame->uc.uc_flags))
647 goto badframe;
579 648
580 set_current_blocked(&set); 649 set_current_blocked(&set);
581 650
582 if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) 651 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
583 goto badframe; 652 goto badframe;
584 653
585 if (restore_altstack(&frame->uc.uc_stack)) 654 if (restore_altstack(&frame->uc.uc_stack))
@@ -692,12 +761,15 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
692 761
693static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) 762static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
694{ 763{
695#if defined(CONFIG_X86_32) || !defined(CONFIG_X86_64) 764#ifdef CONFIG_X86_64
765 if (is_ia32_task())
766 return __NR_ia32_restart_syscall;
767#endif
768#ifdef CONFIG_X86_X32_ABI
769 return __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT);
770#else
696 return __NR_restart_syscall; 771 return __NR_restart_syscall;
697#else /* !CONFIG_X86_32 && CONFIG_X86_64 */ 772#endif
698 return test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall :
699 __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT);
700#endif /* CONFIG_X86_32 || !CONFIG_X86_64 */
701} 773}
702 774
703/* 775/*
@@ -763,6 +835,7 @@ asmlinkage long sys32_x32_rt_sigreturn(void)
763 struct pt_regs *regs = current_pt_regs(); 835 struct pt_regs *regs = current_pt_regs();
764 struct rt_sigframe_x32 __user *frame; 836 struct rt_sigframe_x32 __user *frame;
765 sigset_t set; 837 sigset_t set;
838 unsigned long uc_flags;
766 839
767 frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); 840 frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8);
768 841
@@ -770,10 +843,12 @@ asmlinkage long sys32_x32_rt_sigreturn(void)
770 goto badframe; 843 goto badframe;
771 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) 844 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
772 goto badframe; 845 goto badframe;
846 if (__get_user(uc_flags, &frame->uc.uc_flags))
847 goto badframe;
773 848
774 set_current_blocked(&set); 849 set_current_blocked(&set);
775 850
776 if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) 851 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
777 goto badframe; 852 goto badframe;
778 853
779 if (compat_restore_altstack(&frame->uc.uc_stack)) 854 if (compat_restore_altstack(&frame->uc.uc_stack))
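On the user-space side, the new uc_flags bits let a signal handler (or a tool such as CRIU) tell whether the kernel saved SS and whether sigreturn will restore it strictly. A hedged sketch of probing those bits from a handler; the UC_* values are copied from the x86 uapi ucontext.h added by this series and defined locally because libc headers may not carry them:

	#define _GNU_SOURCE
	#include <signal.h>
	#include <stdio.h>
	#include <ucontext.h>

	#define UC_FP_XSTATE		0x1
	#define UC_SIGCONTEXT_SS	0x2
	#define UC_STRICT_RESTORE_SS	0x4

	static void handler(int sig, siginfo_t *info, void *ctx)
	{
		ucontext_t *uc = ctx;

		(void)sig;
		(void)info;
		/* printf() is not async-signal-safe; fine for a throwaway demo only. */
		printf("uc_flags=%#lx (SS saved: %s, strict SS restore: %s)\n",
		       (unsigned long)uc->uc_flags,
		       (uc->uc_flags & UC_SIGCONTEXT_SS) ? "yes" : "no",
		       (uc->uc_flags & UC_STRICT_RESTORE_SS) ? "yes" : "no");
	}

	int main(void)
	{
		struct sigaction sa = { .sa_sigaction = handler, .sa_flags = SA_SIGINFO };

		sigaction(SIGUSR1, &sa, NULL);
		raise(SIGUSR1);
		return 0;
	}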
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 3bf1e0b5f827..643dbdccf4bc 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -256,7 +256,7 @@ static void notrace start_secondary(void *unused)
256 x86_cpuinit.setup_percpu_clockev(); 256 x86_cpuinit.setup_percpu_clockev();
257 257
258 wmb(); 258 wmb();
259 cpu_startup_entry(CPUHP_ONLINE); 259 cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
260} 260}
261 261
262int topology_update_package_map(unsigned int apicid, unsigned int cpu) 262int topology_update_package_map(unsigned int apicid, unsigned int cpu)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 211c11c7bba4..06cbe25861f1 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -83,30 +83,16 @@ gate_desc idt_table[NR_VECTORS] __page_aligned_bss;
83DECLARE_BITMAP(used_vectors, NR_VECTORS); 83DECLARE_BITMAP(used_vectors, NR_VECTORS);
84EXPORT_SYMBOL_GPL(used_vectors); 84EXPORT_SYMBOL_GPL(used_vectors);
85 85
86static inline void conditional_sti(struct pt_regs *regs) 86static inline void cond_local_irq_enable(struct pt_regs *regs)
87{ 87{
88 if (regs->flags & X86_EFLAGS_IF) 88 if (regs->flags & X86_EFLAGS_IF)
89 local_irq_enable(); 89 local_irq_enable();
90} 90}
91 91
92static inline void preempt_conditional_sti(struct pt_regs *regs) 92static inline void cond_local_irq_disable(struct pt_regs *regs)
93{
94 preempt_count_inc();
95 if (regs->flags & X86_EFLAGS_IF)
96 local_irq_enable();
97}
98
99static inline void conditional_cli(struct pt_regs *regs)
100{
101 if (regs->flags & X86_EFLAGS_IF)
102 local_irq_disable();
103}
104
105static inline void preempt_conditional_cli(struct pt_regs *regs)
106{ 93{
107 if (regs->flags & X86_EFLAGS_IF) 94 if (regs->flags & X86_EFLAGS_IF)
108 local_irq_disable(); 95 local_irq_disable();
109 preempt_count_dec();
110} 96}
111 97
112void ist_enter(struct pt_regs *regs) 98void ist_enter(struct pt_regs *regs)
@@ -262,7 +248,6 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
262 tsk->thread.error_code = error_code; 248 tsk->thread.error_code = error_code;
263 tsk->thread.trap_nr = trapnr; 249 tsk->thread.trap_nr = trapnr;
264 250
265#ifdef CONFIG_X86_64
266 if (show_unhandled_signals && unhandled_signal(tsk, signr) && 251 if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
267 printk_ratelimit()) { 252 printk_ratelimit()) {
268 pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx", 253 pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx",
@@ -271,7 +256,6 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
271 print_vma_addr(" in ", regs->ip); 256 print_vma_addr(" in ", regs->ip);
272 pr_cont("\n"); 257 pr_cont("\n");
273 } 258 }
274#endif
275 259
276 force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk); 260 force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk);
277} 261}
@@ -286,7 +270,7 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
286 270
287 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != 271 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
288 NOTIFY_STOP) { 272 NOTIFY_STOP) {
289 conditional_sti(regs); 273 cond_local_irq_enable(regs);
290 do_trap(trapnr, signr, str, regs, error_code, 274 do_trap(trapnr, signr, str, regs, error_code,
291 fill_trap_info(regs, signr, trapnr, &info)); 275 fill_trap_info(regs, signr, trapnr, &info));
292 } 276 }
@@ -368,7 +352,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
368 if (notify_die(DIE_TRAP, "bounds", regs, error_code, 352 if (notify_die(DIE_TRAP, "bounds", regs, error_code,
369 X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP) 353 X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
370 return; 354 return;
371 conditional_sti(regs); 355 cond_local_irq_enable(regs);
372 356
373 if (!user_mode(regs)) 357 if (!user_mode(regs))
374 die("bounds", regs, error_code); 358 die("bounds", regs, error_code);
@@ -443,7 +427,7 @@ do_general_protection(struct pt_regs *regs, long error_code)
443 struct task_struct *tsk; 427 struct task_struct *tsk;
444 428
445 RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); 429 RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
446 conditional_sti(regs); 430 cond_local_irq_enable(regs);
447 431
448 if (v8086_mode(regs)) { 432 if (v8086_mode(regs)) {
449 local_irq_enable(); 433 local_irq_enable();
@@ -517,9 +501,11 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
517 * as we may switch to the interrupt stack. 501 * as we may switch to the interrupt stack.
518 */ 502 */
519 debug_stack_usage_inc(); 503 debug_stack_usage_inc();
520 preempt_conditional_sti(regs); 504 preempt_disable();
505 cond_local_irq_enable(regs);
521 do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL); 506 do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL);
522 preempt_conditional_cli(regs); 507 cond_local_irq_disable(regs);
508 preempt_enable_no_resched();
523 debug_stack_usage_dec(); 509 debug_stack_usage_dec();
524exit: 510exit:
525 ist_exit(regs); 511 ist_exit(regs);
@@ -571,6 +557,29 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
571NOKPROBE_SYMBOL(fixup_bad_iret); 557NOKPROBE_SYMBOL(fixup_bad_iret);
572#endif 558#endif
573 559
560static bool is_sysenter_singlestep(struct pt_regs *regs)
561{
562 /*
563 * We don't try for precision here. If we're anywhere in the region of
564 * code that can be single-stepped in the SYSENTER entry path, then
565 * assume that this is a useless single-step trap due to SYSENTER
566 * being invoked with TF set. (We don't know in advance exactly
567 * which instructions will be hit because BTF could plausibly
568 * be set.)
569 */
570#ifdef CONFIG_X86_32
571 return (regs->ip - (unsigned long)__begin_SYSENTER_singlestep_region) <
572 (unsigned long)__end_SYSENTER_singlestep_region -
573 (unsigned long)__begin_SYSENTER_singlestep_region;
574#elif defined(CONFIG_IA32_EMULATION)
575 return (regs->ip - (unsigned long)entry_SYSENTER_compat) <
576 (unsigned long)__end_entry_SYSENTER_compat -
577 (unsigned long)entry_SYSENTER_compat;
578#else
579 return false;
580#endif
581}
582
574/* 583/*
575 * Our handling of the processor debug registers is non-trivial. 584 * Our handling of the processor debug registers is non-trivial.
576 * We do not clear them on entry and exit from the kernel. Therefore 585 * We do not clear them on entry and exit from the kernel. Therefore
@@ -605,11 +614,42 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
605 ist_enter(regs); 614 ist_enter(regs);
606 615
607 get_debugreg(dr6, 6); 616 get_debugreg(dr6, 6);
617 /*
618 * The Intel SDM says:
619 *
620 * Certain debug exceptions may clear bits 0-3. The remaining
621 * contents of the DR6 register are never cleared by the
622 * processor. To avoid confusion in identifying debug
623 * exceptions, debug handlers should clear the register before
624 * returning to the interrupted task.
625 *
626 * Keep it simple: clear DR6 immediately.
627 */
628 set_debugreg(0, 6);
608 629
609 /* Filter out all the reserved bits which are preset to 1 */ 630 /* Filter out all the reserved bits which are preset to 1 */
610 dr6 &= ~DR6_RESERVED; 631 dr6 &= ~DR6_RESERVED;
611 632
612 /* 633 /*
634 * The SDM says "The processor clears the BTF flag when it
635 * generates a debug exception." Clear TIF_BLOCKSTEP to keep
636 * TIF_BLOCKSTEP in sync with the hardware BTF flag.
637 */
638 clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);
639
640 if (unlikely(!user_mode(regs) && (dr6 & DR_STEP) &&
641 is_sysenter_singlestep(regs))) {
642 dr6 &= ~DR_STEP;
643 if (!dr6)
644 goto exit;
645 /*
646 * else we might have gotten a single-step trap and hit a
647 * watchpoint at the same time, in which case we should fall
648 * through and handle the watchpoint.
649 */
650 }
651
652 /*
613 * If dr6 has no reason to give us about the origin of this trap, 653 * If dr6 has no reason to give us about the origin of this trap,
614 * then it's very likely the result of an icebp/int01 trap. 654 * then it's very likely the result of an icebp/int01 trap.
615 * User wants a sigtrap for that. 655 * User wants a sigtrap for that.
@@ -617,18 +657,10 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
617 if (!dr6 && user_mode(regs)) 657 if (!dr6 && user_mode(regs))
618 user_icebp = 1; 658 user_icebp = 1;
619 659
620 /* Catch kmemcheck conditions first of all! */ 660 /* Catch kmemcheck conditions! */
621 if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) 661 if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
622 goto exit; 662 goto exit;
623 663
624 /* DR6 may or may not be cleared by the CPU */
625 set_debugreg(0, 6);
626
627 /*
628 * The processor cleared BTF, so don't mark that we need it set.
629 */
630 clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);
631
632 /* Store the virtualized DR6 value */ 664 /* Store the virtualized DR6 value */
633 tsk->thread.debugreg6 = dr6; 665 tsk->thread.debugreg6 = dr6;
634 666
@@ -648,24 +680,25 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
648 debug_stack_usage_inc(); 680 debug_stack_usage_inc();
649 681
650 /* It's safe to allow irq's after DR6 has been saved */ 682 /* It's safe to allow irq's after DR6 has been saved */
651 preempt_conditional_sti(regs); 683 preempt_disable();
684 cond_local_irq_enable(regs);
652 685
653 if (v8086_mode(regs)) { 686 if (v8086_mode(regs)) {
654 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 687 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
655 X86_TRAP_DB); 688 X86_TRAP_DB);
656 preempt_conditional_cli(regs); 689 cond_local_irq_disable(regs);
690 preempt_enable_no_resched();
657 debug_stack_usage_dec(); 691 debug_stack_usage_dec();
658 goto exit; 692 goto exit;
659 } 693 }
660 694
661 /* 695 if (WARN_ON_ONCE((dr6 & DR_STEP) && !user_mode(regs))) {
662 * Single-stepping through system calls: ignore any exceptions in 696 /*
663 * kernel space, but re-enable TF when returning to user mode. 697 * Historical junk that used to handle SYSENTER single-stepping.
664 * 698 * This should be unreachable now. If we survive for a while
665 * We already checked v86 mode above, so we can check for kernel mode 699 * without anyone hitting this warning, we'll turn this into
666 * by just checking the CPL of CS. 700 * an oops.
667 */ 701 */
668 if ((dr6 & DR_STEP) && !user_mode(regs)) {
669 tsk->thread.debugreg6 &= ~DR_STEP; 702 tsk->thread.debugreg6 &= ~DR_STEP;
670 set_tsk_thread_flag(tsk, TIF_SINGLESTEP); 703 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
671 regs->flags &= ~X86_EFLAGS_TF; 704 regs->flags &= ~X86_EFLAGS_TF;
@@ -673,10 +706,19 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
673 si_code = get_si_code(tsk->thread.debugreg6); 706 si_code = get_si_code(tsk->thread.debugreg6);
674 if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) 707 if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
675 send_sigtrap(tsk, regs, error_code, si_code); 708 send_sigtrap(tsk, regs, error_code, si_code);
676 preempt_conditional_cli(regs); 709 cond_local_irq_disable(regs);
710 preempt_enable_no_resched();
677 debug_stack_usage_dec(); 711 debug_stack_usage_dec();
678 712
679exit: 713exit:
714#if defined(CONFIG_X86_32)
715 /*
716 * This is the most likely code path that involves non-trivial use
717 * of the SYSENTER stack. Check that we haven't overrun it.
718 */
719 WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC,
720 "Overran or corrupted SYSENTER stack\n");
721#endif
680 ist_exit(regs); 722 ist_exit(regs);
681} 723}
682NOKPROBE_SYMBOL(do_debug); 724NOKPROBE_SYMBOL(do_debug);
@@ -696,7 +738,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
696 738
697 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP) 739 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP)
698 return; 740 return;
699 conditional_sti(regs); 741 cond_local_irq_enable(regs);
700 742
701 if (!user_mode(regs)) { 743 if (!user_mode(regs)) {
702 if (!fixup_exception(regs, trapnr)) { 744 if (!fixup_exception(regs, trapnr)) {
@@ -743,20 +785,19 @@ do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
743dotraplinkage void 785dotraplinkage void
744do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) 786do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
745{ 787{
746 conditional_sti(regs); 788 cond_local_irq_enable(regs);
747} 789}
748 790
749dotraplinkage void 791dotraplinkage void
750do_device_not_available(struct pt_regs *regs, long error_code) 792do_device_not_available(struct pt_regs *regs, long error_code)
751{ 793{
752 RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); 794 RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
753 BUG_ON(use_eager_fpu());
754 795
755#ifdef CONFIG_MATH_EMULATION 796#ifdef CONFIG_MATH_EMULATION
756 if (read_cr0() & X86_CR0_EM) { 797 if (!boot_cpu_has(X86_FEATURE_FPU) && (read_cr0() & X86_CR0_EM)) {
757 struct math_emu_info info = { }; 798 struct math_emu_info info = { };
758 799
759 conditional_sti(regs); 800 cond_local_irq_enable(regs);
760 801
761 info.regs = regs; 802 info.regs = regs;
762 math_emulate(&info); 803 math_emulate(&info);
@@ -765,7 +806,7 @@ do_device_not_available(struct pt_regs *regs, long error_code)
765#endif 806#endif
766 fpu__restore(&current->thread.fpu); /* interrupts still off */ 807 fpu__restore(&current->thread.fpu); /* interrupts still off */
767#ifdef CONFIG_X86_32 808#ifdef CONFIG_X86_32
768 conditional_sti(regs); 809 cond_local_irq_enable(regs);
769#endif 810#endif
770} 811}
771NOKPROBE_SYMBOL(do_device_not_available); 812NOKPROBE_SYMBOL(do_device_not_available);
@@ -868,7 +909,7 @@ void __init trap_init(void)
868#endif 909#endif
869 910
870#ifdef CONFIG_X86_32 911#ifdef CONFIG_X86_32
871 set_system_trap_gate(IA32_SYSCALL_VECTOR, entry_INT80_32); 912 set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_32);
872 set_bit(IA32_SYSCALL_VECTOR, used_vectors); 913 set_bit(IA32_SYSCALL_VECTOR, used_vectors);
873#endif 914#endif
874 915
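is_sysenter_singlestep() above relies on the classic single-comparison range check: with unsigned arithmetic, (x - lo) < (hi - lo) is true exactly when lo <= x < hi, because values below lo wrap around to very large numbers. A minimal demonstration:

	#include <assert.h>

	static int in_range(unsigned long x, unsigned long lo, unsigned long hi)
	{
		return (x - lo) < (hi - lo);
	}

	int main(void)
	{
		assert(in_range(15, 10, 20));
		assert(!in_range(9, 10, 20));	/* below lo: wraps to a huge value */
		assert(!in_range(20, 10, 20));	/* hi itself is excluded */
		return 0;
	}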
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 3d743da828d3..56380440d862 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -43,6 +43,11 @@ static DEFINE_STATIC_KEY_FALSE(__use_tsc);
43 43
44int tsc_clocksource_reliable; 44int tsc_clocksource_reliable;
45 45
46static u32 art_to_tsc_numerator;
47static u32 art_to_tsc_denominator;
48static u64 art_to_tsc_offset;
49struct clocksource *art_related_clocksource;
50
46/* 51/*
47 * Use a ring-buffer like data structure, where a writer advances the head by 52 * Use a ring-buffer like data structure, where a writer advances the head by
48 * writing a new data entry and a reader advances the tail when it observes a 53 * writing a new data entry and a reader advances the tail when it observes a
@@ -964,6 +969,37 @@ core_initcall(cpufreq_tsc);
964 969
965#endif /* CONFIG_CPU_FREQ */ 970#endif /* CONFIG_CPU_FREQ */
966 971
972#define ART_CPUID_LEAF (0x15)
973#define ART_MIN_DENOMINATOR (1)
974
975
976/*
977 * If ART is present detect the numerator:denominator to convert to TSC
978 */
979static void detect_art(void)
980{
981 unsigned int unused[2];
982
983 if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF)
984 return;
985
986 cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator,
987 &art_to_tsc_numerator, unused, unused+1);
988
989 /* Don't enable ART in a VM, non-stop TSC required */
990 if (boot_cpu_has(X86_FEATURE_HYPERVISOR) ||
991 !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ||
992 art_to_tsc_denominator < ART_MIN_DENOMINATOR)
993 return;
994
995 if (rdmsrl_safe(MSR_IA32_TSC_ADJUST, &art_to_tsc_offset))
996 return;
997
998 /* Make this sticky over multiple CPU init calls */
999 setup_force_cpu_cap(X86_FEATURE_ART);
1000}
1001
1002
967/* clocksource code */ 1003/* clocksource code */
968 1004
969static struct clocksource clocksource_tsc; 1005static struct clocksource clocksource_tsc;
@@ -1071,6 +1107,25 @@ int unsynchronized_tsc(void)
1071 return 0; 1107 return 0;
1072} 1108}
1073 1109
1110/*
1111 * Convert ART to TSC given numerator/denominator found in detect_art()
1112 */
1113struct system_counterval_t convert_art_to_tsc(cycle_t art)
1114{
1115 u64 tmp, res, rem;
1116
1117 rem = do_div(art, art_to_tsc_denominator);
1118
1119 res = art * art_to_tsc_numerator;
1120 tmp = rem * art_to_tsc_numerator;
1121
1122 do_div(tmp, art_to_tsc_denominator);
1123 res += tmp + art_to_tsc_offset;
1124
1125 return (struct system_counterval_t) {.cs = art_related_clocksource,
1126 .cycles = res};
1127}
1128EXPORT_SYMBOL(convert_art_to_tsc);
1074 1129
1075static void tsc_refine_calibration_work(struct work_struct *work); 1130static void tsc_refine_calibration_work(struct work_struct *work);
1076static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work); 1131static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
@@ -1142,6 +1197,8 @@ static void tsc_refine_calibration_work(struct work_struct *work)
1142 (unsigned long)tsc_khz % 1000); 1197 (unsigned long)tsc_khz % 1000);
1143 1198
1144out: 1199out:
1200 if (boot_cpu_has(X86_FEATURE_ART))
1201 art_related_clocksource = &clocksource_tsc;
1145 clocksource_register_khz(&clocksource_tsc, tsc_khz); 1202 clocksource_register_khz(&clocksource_tsc, tsc_khz);
1146} 1203}
1147 1204
@@ -1235,6 +1292,8 @@ void __init tsc_init(void)
1235 mark_tsc_unstable("TSCs unsynchronized"); 1292 mark_tsc_unstable("TSCs unsynchronized");
1236 1293
1237 check_system_tsc_reliable(); 1294 check_system_tsc_reliable();
1295
1296 detect_art();
1238} 1297}
1239 1298
1240#ifdef CONFIG_SMP 1299#ifdef CONFIG_SMP
@@ -1246,14 +1305,14 @@ void __init tsc_init(void)
1246 */ 1305 */
1247unsigned long calibrate_delay_is_known(void) 1306unsigned long calibrate_delay_is_known(void)
1248{ 1307{
1249 int i, cpu = smp_processor_id(); 1308 int sibling, cpu = smp_processor_id();
1250 1309
1251 if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC)) 1310 if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC))
1252 return 0; 1311 return 0;
1253 1312
1254 for_each_online_cpu(i) 1313 sibling = cpumask_any_but(topology_core_cpumask(cpu), cpu);
1255 if (cpu_data(i).phys_proc_id == cpu_data(cpu).phys_proc_id) 1314 if (sibling < nr_cpu_ids)
1256 return cpu_data(i).loops_per_jiffy; 1315 return cpu_data(sibling).loops_per_jiffy;
1257 return 0; 1316 return 0;
1258} 1317}
1259#endif 1318#endif
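convert_art_to_tsc() avoids overflowing 64 bits by splitting art into quotient and remainder before multiplying by the CPUID 0x15 numerator. A user-space sketch of the same split, with made-up numerator/denominator/offset values and a 128-bit cross-check (a compiler extension) for inputs that do not overflow:

	#include <assert.h>
	#include <stdint.h>

	/* Hypothetical ratio and offset, purely for illustration. */
	static const uint64_t numerator = 192, denominator = 2, offset = 1000;

	/* Same split as convert_art_to_tsc(): art * N / D + offset without a
	 * 64-bit overflow in the intermediate product. */
	static uint64_t art_to_tsc(uint64_t art)
	{
		uint64_t quot = art / denominator;
		uint64_t rem  = art % denominator;

		return quot * numerator + (rem * numerator) / denominator + offset;
	}

	int main(void)
	{
		uint64_t art = 123456789;

		assert(art_to_tsc(art) ==
		       (uint64_t)((unsigned __int128)art * numerator / denominator + offset));
		return 0;
	}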
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index 07efb35ee4bc..014ea59aa153 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -30,7 +30,7 @@
30 * appropriately. Either display a message or halt. 30 * appropriately. Either display a message or halt.
31 */ 31 */
32 32
33#include <asm/cpufeature.h> 33#include <asm/cpufeatures.h>
34#include <asm/msr-index.h> 34#include <asm/msr-index.h>
35 35
36verify_cpu: 36verify_cpu:
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index e574b8546518..3dce1ca0a653 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -362,7 +362,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
362 /* make room for real-mode segments */ 362 /* make room for real-mode segments */
363 tsk->thread.sp0 += 16; 363 tsk->thread.sp0 += 16;
364 364
365 if (static_cpu_has_safe(X86_FEATURE_SEP)) 365 if (static_cpu_has(X86_FEATURE_SEP))
366 tsk->thread.sysenter_cs = 0; 366 tsk->thread.sysenter_cs = 0;
367 367
368 load_sp0(tss, &tsk->thread); 368 load_sp0(tss, &tsk->thread);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index fe133b710bef..5af9958cbdb6 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -192,6 +192,17 @@ SECTIONS
192 :init 192 :init
193#endif 193#endif
194 194
195 /*
196 * Section for code used exclusively before alternatives are run. All
197 * references to such code must be patched out by alternatives, normally
198 * by using X86_FEATURE_ALWAYS CPU feature bit.
199 *
200 * See static_cpu_has() for an example.
201 */
202 .altinstr_aux : AT(ADDR(.altinstr_aux) - LOAD_OFFSET) {
203 *(.altinstr_aux)
204 }
205
195 INIT_DATA_SECTION(16) 206 INIT_DATA_SECTION(16)
196 207
197 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { 208 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index a2fe51b00cce..65be7cfaf947 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,5 +1,5 @@
1#include <linux/linkage.h> 1#include <linux/linkage.h>
2#include <asm/cpufeature.h> 2#include <asm/cpufeatures.h>
3#include <asm/alternative-asm.h> 3#include <asm/alternative-asm.h>
4 4
5/* 5/*
diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c
index 422db000d727..5cc78bf57232 100644
--- a/arch/x86/lib/cmdline.c
+++ b/arch/x86/lib/cmdline.c
@@ -21,12 +21,16 @@ static inline int myisspace(u8 c)
21 * @option: option string to look for 21 * @option: option string to look for
22 * 22 *
23 * Returns the position of that @option (starts counting with 1) 23 * Returns the position of that @option (starts counting with 1)
24 * or 0 on not found. 24 * or 0 on not found. @option will only be found if it is found
25 * as an entire word in @cmdline. For instance, if @option="car"
26 * then a cmdline which contains "cart" will not match.
25 */ 27 */
26int cmdline_find_option_bool(const char *cmdline, const char *option) 28static int
29__cmdline_find_option_bool(const char *cmdline, int max_cmdline_size,
30 const char *option)
27{ 31{
28 char c; 32 char c;
29 int len, pos = 0, wstart = 0; 33 int pos = 0, wstart = 0;
30 const char *opptr = NULL; 34 const char *opptr = NULL;
31 enum { 35 enum {
32 st_wordstart = 0, /* Start of word/after whitespace */ 36 st_wordstart = 0, /* Start of word/after whitespace */
@@ -37,11 +41,11 @@ int cmdline_find_option_bool(const char *cmdline, const char *option)
37 if (!cmdline) 41 if (!cmdline)
38 return -1; /* No command line */ 42 return -1; /* No command line */
39 43
40 len = min_t(int, strlen(cmdline), COMMAND_LINE_SIZE); 44 /*
41 if (!len) 45 * This 'pos' check ensures we do not overrun
42 return 0; 46 * a non-NULL-terminated 'cmdline'
43 47 */
44 while (len--) { 48 while (pos < max_cmdline_size) {
45 c = *(char *)cmdline++; 49 c = *(char *)cmdline++;
46 pos++; 50 pos++;
47 51
@@ -58,18 +62,35 @@ int cmdline_find_option_bool(const char *cmdline, const char *option)
58 /* fall through */ 62 /* fall through */
59 63
60 case st_wordcmp: 64 case st_wordcmp:
61 if (!*opptr) 65 if (!*opptr) {
66 /*
67 * We matched all the way to the end of the
68 * option we were looking for. If the
69 * command-line has a space _or_ ends, then
70 * we matched!
71 */
62 if (!c || myisspace(c)) 72 if (!c || myisspace(c))
63 return wstart; 73 return wstart;
64 else 74 /*
65 state = st_wordskip; 75 * We hit the end of the option, but _not_
66 else if (!c) 76 * the end of a word on the cmdline. Not
77 * a match.
78 */
79 } else if (!c) {
80 /*
81 * Hit the NULL terminator on the end of
82 * cmdline.
83 */
67 return 0; 84 return 0;
68 else if (c != *opptr++) 85 } else if (c == *opptr++) {
69 state = st_wordskip; 86 /*
70 else if (!len) /* last word and is matching */ 87 * We are currently matching, so continue
71 return wstart; 88 * to the next character on the cmdline.
72 break; 89 */
90 break;
91 }
92 state = st_wordskip;
93 /* fall through */
73 94
74 case st_wordskip: 95 case st_wordskip:
75 if (!c) 96 if (!c)
@@ -82,3 +103,8 @@ int cmdline_find_option_bool(const char *cmdline, const char *option)
82 103
83 return 0; /* Buffer overrun */ 104 return 0; /* Buffer overrun */
84} 105}
106
107int cmdline_find_option_bool(const char *cmdline, const char *option)
108{
109 return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option);
110}
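The rewritten matcher only accepts @option when it is delimited by whitespace or the ends of the command line, so "car" no longer matches inside "cart". A simplified user-space model of that whole-word rule (it returns a plain found/not-found flag rather than the kernel helper's character position):

	#include <assert.h>
	#include <string.h>

	static int has_option_word(const char *cmdline, const char *option)
	{
		size_t len = strlen(option);
		const char *p = cmdline;

		while ((p = strstr(p, option)) != NULL) {
			int at_word_start = (p == cmdline) || (p[-1] == ' ');
			int at_word_end   = (p[len] == '\0' || p[len] == ' ');

			if (at_word_start && at_word_end)
				return 1;
			p++;
		}
		return 0;
	}

	int main(void)
	{
		assert(has_option_word("root=/dev/sda1 quiet car", "car"));
		assert(!has_option_word("root=/dev/sda1 cart quiet", "car"));	/* "cart" != "car" */
		return 0;
	}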
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 009f98216b7e..24ef1c2104d4 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -1,7 +1,7 @@
1/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */ 1/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
2 2
3#include <linux/linkage.h> 3#include <linux/linkage.h>
4#include <asm/cpufeature.h> 4#include <asm/cpufeatures.h>
5#include <asm/alternative-asm.h> 5#include <asm/alternative-asm.h>
6 6
7/* 7/*
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 27f89c79a44b..2b0ef26da0bd 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -10,7 +10,7 @@
10#include <asm/current.h> 10#include <asm/current.h>
11#include <asm/asm-offsets.h> 11#include <asm/asm-offsets.h>
12#include <asm/thread_info.h> 12#include <asm/thread_info.h>
13#include <asm/cpufeature.h> 13#include <asm/cpufeatures.h>
14#include <asm/alternative-asm.h> 14#include <asm/alternative-asm.h>
15#include <asm/asm.h> 15#include <asm/asm.h>
16#include <asm/smap.h> 16#include <asm/smap.h>
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 7d37641ada5b..cbb8ee5830ff 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -1,7 +1,7 @@
1/* Copyright 2002 Andi Kleen */ 1/* Copyright 2002 Andi Kleen */
2 2
3#include <linux/linkage.h> 3#include <linux/linkage.h>
4#include <asm/cpufeature.h> 4#include <asm/cpufeatures.h>
5#include <asm/alternative-asm.h> 5#include <asm/alternative-asm.h>
6 6
7/* 7/*
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index ca2afdd6d98e..90ce01bee00c 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -6,7 +6,7 @@
6 * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com> 6 * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
7 */ 7 */
8#include <linux/linkage.h> 8#include <linux/linkage.h>
9#include <asm/cpufeature.h> 9#include <asm/cpufeatures.h>
10#include <asm/alternative-asm.h> 10#include <asm/alternative-asm.h>
11 11
12#undef memmove 12#undef memmove
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 2661fad05827..c9c81227ea37 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -1,7 +1,7 @@
1/* Copyright 2002 Andi Kleen, SuSE Labs */ 1/* Copyright 2002 Andi Kleen, SuSE Labs */
2 2
3#include <linux/linkage.h> 3#include <linux/linkage.h>
4#include <asm/cpufeature.h> 4#include <asm/cpufeatures.h>
5#include <asm/alternative-asm.h> 5#include <asm/alternative-asm.h>
6 6
7.weak memset 7.weak memset
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 4a6f1d9b5106..99bfb192803f 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -358,20 +358,19 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
358#define pgd_none(a) pud_none(__pud(pgd_val(a))) 358#define pgd_none(a) pud_none(__pud(pgd_val(a)))
359#endif 359#endif
360 360
361#ifdef CONFIG_X86_64
362static inline bool is_hypervisor_range(int idx) 361static inline bool is_hypervisor_range(int idx)
363{ 362{
363#ifdef CONFIG_X86_64
364 /* 364 /*
365 * ffff800000000000 - ffff87ffffffffff is reserved for 365 * ffff800000000000 - ffff87ffffffffff is reserved for
366 * the hypervisor. 366 * the hypervisor.
367 */ 367 */
368 return paravirt_enabled() && 368 return (idx >= pgd_index(__PAGE_OFFSET) - 16) &&
369 (idx >= pgd_index(__PAGE_OFFSET) - 16) && 369 (idx < pgd_index(__PAGE_OFFSET));
370 (idx < pgd_index(__PAGE_OFFSET));
371}
372#else 370#else
373static inline bool is_hypervisor_range(int idx) { return false; } 371 return false;
374#endif 372#endif
373}
375 374
376static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, 375static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
377 bool checkwx) 376 bool checkwx)
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 2ebfbaf61142..bd7a9b9e2e14 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -388,7 +388,6 @@ repeat:
388} 388}
389 389
390pte_t *kmap_pte; 390pte_t *kmap_pte;
391pgprot_t kmap_prot;
392 391
393static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr) 392static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)
394{ 393{
@@ -405,8 +404,6 @@ static void __init kmap_init(void)
405 */ 404 */
406 kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); 405 kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
407 kmap_pte = kmap_get_fixmap_pte(kmap_vstart); 406 kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
408
409 kmap_prot = PAGE_KERNEL;
410} 407}
411 408
412#ifdef CONFIG_HIGHMEM 409#ifdef CONFIG_HIGHMEM
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a40b755c67e3..214afda97911 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -53,6 +53,7 @@
53#include <asm/numa.h> 53#include <asm/numa.h>
54#include <asm/cacheflush.h> 54#include <asm/cacheflush.h>
55#include <asm/init.h> 55#include <asm/init.h>
56#include <asm/uv/uv.h>
56#include <asm/setup.h> 57#include <asm/setup.h>
57 58
58#include "mm_internal.h" 59#include "mm_internal.h"
@@ -1203,26 +1204,13 @@ int kern_addr_valid(unsigned long addr)
1203 1204
1204static unsigned long probe_memory_block_size(void) 1205static unsigned long probe_memory_block_size(void)
1205{ 1206{
1206 /* start from 2g */ 1207 unsigned long bz = MIN_MEMORY_BLOCK_SIZE;
1207 unsigned long bz = 1UL<<31;
1208 1208
1209 if (totalram_pages >= (64ULL << (30 - PAGE_SHIFT))) { 1209 /* if system is UV or has 64GB of RAM or more, use large blocks */
1210 pr_info("Using 2GB memory block size for large-memory system\n"); 1210 if (is_uv_system() || ((max_pfn << PAGE_SHIFT) >= (64UL << 30)))
1211 return 2UL * 1024 * 1024 * 1024; 1211 bz = 2UL << 30; /* 2GB */
1212 }
1213
1214 /* less than 64g installed */
1215 if ((max_pfn << PAGE_SHIFT) < (16UL << 32))
1216 return MIN_MEMORY_BLOCK_SIZE;
1217
1218 /* get the tail size */
1219 while (bz > MIN_MEMORY_BLOCK_SIZE) {
1220 if (!((max_pfn << PAGE_SHIFT) & (bz - 1)))
1221 break;
1222 bz >>= 1;
1223 }
1224 1212
1225 printk(KERN_DEBUG "memory block size : %ldMB\n", bz >> 20); 1213 pr_info("x86/mm: Memory block size: %ldMB\n", bz >> 20);
1226 1214
1227 return bz; 1215 return bz;
1228} 1216}
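The simplified policy picks 2 GiB memory blocks on UV systems or on machines with 64 GiB of RAM or more, and the minimum section size otherwise. A sketch of the decision with is_uv_system() replaced by a plain flag and MIN_MEMORY_BLOCK_SIZE assumed to be 128 MiB (the x86_64 section size):

	#include <stdio.h>

	#define PAGE_SHIFT		12
	#define MIN_MEMORY_BLOCK_SIZE	(1ULL << 27)	/* assumed: 128 MiB x86_64 sections */

	static unsigned long long block_size(unsigned long long max_pfn, int is_uv)
	{
		unsigned long long bz = MIN_MEMORY_BLOCK_SIZE;

		/* UV systems and boxes with >= 64 GiB of RAM use large blocks. */
		if (is_uv || ((max_pfn << PAGE_SHIFT) >= (64ULL << 30)))
			bz = 2ULL << 30;	/* 2 GiB */

		return bz;
	}

	int main(void)
	{
		printf("32 GiB:  %llu MiB blocks\n", block_size((32ULL << 30) >> PAGE_SHIFT, 0) >> 20);
		printf("128 GiB: %llu MiB blocks\n", block_size((128ULL << 30) >> PAGE_SHIFT, 0) >> 20);
		return 0;
	}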
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index d470cf219a2d..1b1110fa0057 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -120,11 +120,22 @@ void __init kasan_init(void)
120 kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), 120 kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
121 (void *)KASAN_SHADOW_END); 121 (void *)KASAN_SHADOW_END);
122 122
123 memset(kasan_zero_page, 0, PAGE_SIZE);
124
125 load_cr3(init_level4_pgt); 123 load_cr3(init_level4_pgt);
126 __flush_tlb_all(); 124 __flush_tlb_all();
127 init_task.kasan_depth = 0;
128 125
126 /*
127 * kasan_zero_page has been used as early shadow memory, thus it may
128 * contain some garbage. Now we can clear and write protect it, since
129 * after the TLB flush no one should write to it.
130 */
131 memset(kasan_zero_page, 0, PAGE_SIZE);
132 for (i = 0; i < PTRS_PER_PTE; i++) {
133 pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO);
134 set_pte(&kasan_zero_pte[i], pte);
135 }
136 /* Flush TLBs again to be sure that write protection applied. */
137 __flush_tlb_all();
138
139 init_task.kasan_depth = 0;
129 pr_info("KernelAddressSanitizer initialized\n"); 140 pr_info("KernelAddressSanitizer initialized\n");
130} 141}
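The reordered KASAN init clears kasan_zero_page only after the final page tables are live, then maps it read-only so later writes would fault. A rough user-space analogue of that clear-then-write-protect step using mmap()/mprotect():

	#include <assert.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		long page = sysconf(_SC_PAGESIZE);
		char *zero_page = mmap(NULL, page, PROT_READ | PROT_WRITE,
				       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		assert(zero_page != MAP_FAILED);

		memset(zero_page, 0, page);				/* clear possible garbage */
		assert(mprotect(zero_page, page, PROT_READ) == 0);	/* write-protect it */

		/* zero_page[0] = 1; would now fault, like a write through the RO PTEs. */
		return 0;
	}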
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 637ab34ed632..ddb2244b06a1 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -33,7 +33,7 @@
33struct kmmio_fault_page { 33struct kmmio_fault_page {
34 struct list_head list; 34 struct list_head list;
35 struct kmmio_fault_page *release_next; 35 struct kmmio_fault_page *release_next;
36 unsigned long page; /* location of the fault page */ 36 unsigned long addr; /* the requested address */
37 pteval_t old_presence; /* page presence prior to arming */ 37 pteval_t old_presence; /* page presence prior to arming */
38 bool armed; 38 bool armed;
39 39
@@ -70,9 +70,16 @@ unsigned int kmmio_count;
70static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE]; 70static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
71static LIST_HEAD(kmmio_probes); 71static LIST_HEAD(kmmio_probes);
72 72
73static struct list_head *kmmio_page_list(unsigned long page) 73static struct list_head *kmmio_page_list(unsigned long addr)
74{ 74{
75 return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)]; 75 unsigned int l;
76 pte_t *pte = lookup_address(addr, &l);
77
78 if (!pte)
79 return NULL;
80 addr &= page_level_mask(l);
81
82 return &kmmio_page_table[hash_long(addr, KMMIO_PAGE_HASH_BITS)];
76} 83}
77 84
78/* Accessed per-cpu */ 85/* Accessed per-cpu */
@@ -98,15 +105,19 @@ static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
98} 105}
99 106
100/* You must be holding RCU read lock. */ 107/* You must be holding RCU read lock. */
101static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) 108static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long addr)
102{ 109{
103 struct list_head *head; 110 struct list_head *head;
104 struct kmmio_fault_page *f; 111 struct kmmio_fault_page *f;
112 unsigned int l;
113 pte_t *pte = lookup_address(addr, &l);
105 114
106 page &= PAGE_MASK; 115 if (!pte)
107 head = kmmio_page_list(page); 116 return NULL;
117 addr &= page_level_mask(l);
118 head = kmmio_page_list(addr);
108 list_for_each_entry_rcu(f, head, list) { 119 list_for_each_entry_rcu(f, head, list) {
109 if (f->page == page) 120 if (f->addr == addr)
110 return f; 121 return f;
111 } 122 }
112 return NULL; 123 return NULL;
@@ -137,10 +148,10 @@ static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
137static int clear_page_presence(struct kmmio_fault_page *f, bool clear) 148static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
138{ 149{
139 unsigned int level; 150 unsigned int level;
140 pte_t *pte = lookup_address(f->page, &level); 151 pte_t *pte = lookup_address(f->addr, &level);
141 152
142 if (!pte) { 153 if (!pte) {
143 pr_err("no pte for page 0x%08lx\n", f->page); 154 pr_err("no pte for addr 0x%08lx\n", f->addr);
144 return -1; 155 return -1;
145 } 156 }
146 157
@@ -156,7 +167,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
156 return -1; 167 return -1;
157 } 168 }
158 169
159 __flush_tlb_one(f->page); 170 __flush_tlb_one(f->addr);
160 return 0; 171 return 0;
161} 172}
162 173
@@ -176,12 +187,12 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
176 int ret; 187 int ret;
177 WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n")); 188 WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n"));
178 if (f->armed) { 189 if (f->armed) {
179 pr_warning("double-arm: page 0x%08lx, ref %d, old %d\n", 190 pr_warning("double-arm: addr 0x%08lx, ref %d, old %d\n",
180 f->page, f->count, !!f->old_presence); 191 f->addr, f->count, !!f->old_presence);
181 } 192 }
182 ret = clear_page_presence(f, true); 193 ret = clear_page_presence(f, true);
183 WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming 0x%08lx failed.\n"), 194 WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming at 0x%08lx failed.\n"),
184 f->page); 195 f->addr);
185 f->armed = true; 196 f->armed = true;
186 return ret; 197 return ret;
187} 198}
@@ -191,7 +202,7 @@ static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
191{ 202{
192 int ret = clear_page_presence(f, false); 203 int ret = clear_page_presence(f, false);
193 WARN_ONCE(ret < 0, 204 WARN_ONCE(ret < 0,
194 KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page); 205 KERN_ERR "kmmio disarming at 0x%08lx failed.\n", f->addr);
195 f->armed = false; 206 f->armed = false;
196} 207}
197 208
@@ -215,6 +226,12 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
215 struct kmmio_context *ctx; 226 struct kmmio_context *ctx;
216 struct kmmio_fault_page *faultpage; 227 struct kmmio_fault_page *faultpage;
217 int ret = 0; /* default to fault not handled */ 228 int ret = 0; /* default to fault not handled */
229 unsigned long page_base = addr;
230 unsigned int l;
231 pte_t *pte = lookup_address(addr, &l);
232 if (!pte)
233 return -EINVAL;
234 page_base &= page_level_mask(l);
218 235
219 /* 236 /*
220 * Preemption is now disabled to prevent process switch during 237 * Preemption is now disabled to prevent process switch during
@@ -227,7 +244,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
227 preempt_disable(); 244 preempt_disable();
228 rcu_read_lock(); 245 rcu_read_lock();
229 246
230 faultpage = get_kmmio_fault_page(addr); 247 faultpage = get_kmmio_fault_page(page_base);
231 if (!faultpage) { 248 if (!faultpage) {
232 /* 249 /*
233 * Either this page fault is not caused by kmmio, or 250 * Either this page fault is not caused by kmmio, or
@@ -239,7 +256,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
239 256
240 ctx = &get_cpu_var(kmmio_ctx); 257 ctx = &get_cpu_var(kmmio_ctx);
241 if (ctx->active) { 258 if (ctx->active) {
242 if (addr == ctx->addr) { 259 if (page_base == ctx->addr) {
243 /* 260 /*
244 * A second fault on the same page means some other 261 * A second fault on the same page means some other
245 * condition needs handling by do_page_fault(), the 262 * condition needs handling by do_page_fault(), the
@@ -267,9 +284,9 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
267 ctx->active++; 284 ctx->active++;
268 285
269 ctx->fpage = faultpage; 286 ctx->fpage = faultpage;
270 ctx->probe = get_kmmio_probe(addr); 287 ctx->probe = get_kmmio_probe(page_base);
271 ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); 288 ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
272 ctx->addr = addr; 289 ctx->addr = page_base;
273 290
274 if (ctx->probe && ctx->probe->pre_handler) 291 if (ctx->probe && ctx->probe->pre_handler)
275 ctx->probe->pre_handler(ctx->probe, regs, addr); 292 ctx->probe->pre_handler(ctx->probe, regs, addr);
@@ -354,12 +371,11 @@ out:
354} 371}
355 372
356/* You must be holding kmmio_lock. */ 373/* You must be holding kmmio_lock. */
357static int add_kmmio_fault_page(unsigned long page) 374static int add_kmmio_fault_page(unsigned long addr)
358{ 375{
359 struct kmmio_fault_page *f; 376 struct kmmio_fault_page *f;
360 377
361 page &= PAGE_MASK; 378 f = get_kmmio_fault_page(addr);
362 f = get_kmmio_fault_page(page);
363 if (f) { 379 if (f) {
364 if (!f->count) 380 if (!f->count)
365 arm_kmmio_fault_page(f); 381 arm_kmmio_fault_page(f);
@@ -372,26 +388,25 @@ static int add_kmmio_fault_page(unsigned long page)
372 return -1; 388 return -1;
373 389
374 f->count = 1; 390 f->count = 1;
375 f->page = page; 391 f->addr = addr;
376 392
377 if (arm_kmmio_fault_page(f)) { 393 if (arm_kmmio_fault_page(f)) {
378 kfree(f); 394 kfree(f);
379 return -1; 395 return -1;
380 } 396 }
381 397
382 list_add_rcu(&f->list, kmmio_page_list(f->page)); 398 list_add_rcu(&f->list, kmmio_page_list(f->addr));
383 399
384 return 0; 400 return 0;
385} 401}
386 402
387/* You must be holding kmmio_lock. */ 403/* You must be holding kmmio_lock. */
388static void release_kmmio_fault_page(unsigned long page, 404static void release_kmmio_fault_page(unsigned long addr,
389 struct kmmio_fault_page **release_list) 405 struct kmmio_fault_page **release_list)
390{ 406{
391 struct kmmio_fault_page *f; 407 struct kmmio_fault_page *f;
392 408
393 page &= PAGE_MASK; 409 f = get_kmmio_fault_page(addr);
394 f = get_kmmio_fault_page(page);
395 if (!f) 410 if (!f)
396 return; 411 return;
397 412
@@ -420,18 +435,27 @@ int register_kmmio_probe(struct kmmio_probe *p)
420 int ret = 0; 435 int ret = 0;
421 unsigned long size = 0; 436 unsigned long size = 0;
422 const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); 437 const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
438 unsigned int l;
439 pte_t *pte;
423 440
424 spin_lock_irqsave(&kmmio_lock, flags); 441 spin_lock_irqsave(&kmmio_lock, flags);
425 if (get_kmmio_probe(p->addr)) { 442 if (get_kmmio_probe(p->addr)) {
426 ret = -EEXIST; 443 ret = -EEXIST;
427 goto out; 444 goto out;
428 } 445 }
446
447 pte = lookup_address(p->addr, &l);
448 if (!pte) {
449 ret = -EINVAL;
450 goto out;
451 }
452
429 kmmio_count++; 453 kmmio_count++;
430 list_add_rcu(&p->list, &kmmio_probes); 454 list_add_rcu(&p->list, &kmmio_probes);
431 while (size < size_lim) { 455 while (size < size_lim) {
432 if (add_kmmio_fault_page(p->addr + size)) 456 if (add_kmmio_fault_page(p->addr + size))
433 pr_err("Unable to set page fault.\n"); 457 pr_err("Unable to set page fault.\n");
434 size += PAGE_SIZE; 458 size += page_level_size(l);
435 } 459 }
436out: 460out:
437 spin_unlock_irqrestore(&kmmio_lock, flags); 461 spin_unlock_irqrestore(&kmmio_lock, flags);
@@ -506,11 +530,17 @@ void unregister_kmmio_probe(struct kmmio_probe *p)
506 const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); 530 const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
507 struct kmmio_fault_page *release_list = NULL; 531 struct kmmio_fault_page *release_list = NULL;
508 struct kmmio_delayed_release *drelease; 532 struct kmmio_delayed_release *drelease;
533 unsigned int l;
534 pte_t *pte;
535
536 pte = lookup_address(p->addr, &l);
537 if (!pte)
538 return;
509 539
510 spin_lock_irqsave(&kmmio_lock, flags); 540 spin_lock_irqsave(&kmmio_lock, flags);
511 while (size < size_lim) { 541 while (size < size_lim) {
512 release_kmmio_fault_page(p->addr + size, &release_list); 542 release_kmmio_fault_page(p->addr + size, &release_list);
513 size += PAGE_SIZE; 543 size += page_level_size(l);
514 } 544 }
515 list_del_rcu(&p->list); 545 list_del_rcu(&p->list);
516 kmmio_count--; 546 kmmio_count--;
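The kmmio rework keys fault pages by the base address of whatever page size actually maps them (4 KiB, 2 MiB or 1 GiB), instead of always masking with PAGE_MASK. A tiny sketch of that rounding, parameterised by the page-size shift that lookup_address()/page_level_mask() would report:

	#include <assert.h>
	#include <stdint.h>

	static uint64_t page_base(uint64_t addr, unsigned int shift)
	{
		return addr & ~((1ULL << shift) - 1);
	}

	int main(void)
	{
		uint64_t addr = 0xffffc90000203456ULL;

		assert(page_base(addr, 12) == 0xffffc90000203000ULL);	/* 4 KiB mapping */
		assert(page_base(addr, 21) == 0xffffc90000200000ULL);	/* 2 MiB mapping */
		return 0;
	}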
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 72bb52f93c3d..d2dc0438d654 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -94,18 +94,6 @@ static unsigned long mmap_base(unsigned long rnd)
94} 94}
95 95
96/* 96/*
97 * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64
98 * does, but not when emulating X86_32
99 */
100static unsigned long mmap_legacy_base(unsigned long rnd)
101{
102 if (mmap_is_ia32())
103 return TASK_UNMAPPED_BASE;
104 else
105 return TASK_UNMAPPED_BASE + rnd;
106}
107
108/*
109 * This function, called very early during the creation of a new 97 * This function, called very early during the creation of a new
110 * process VM image, sets up which VM layout function to use: 98 * process VM image, sets up which VM layout function to use:
111 */ 99 */
@@ -116,7 +104,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
116 if (current->flags & PF_RANDOMIZE) 104 if (current->flags & PF_RANDOMIZE)
117 random_factor = arch_mmap_rnd(); 105 random_factor = arch_mmap_rnd();
118 106
119 mm->mmap_legacy_base = mmap_legacy_base(random_factor); 107 mm->mmap_legacy_base = TASK_UNMAPPED_BASE + random_factor;
120 108
121 if (mmap_is_legacy()) { 109 if (mmap_is_legacy()) {
122 mm->mmap_base = mm->mmap_legacy_base; 110 mm->mmap_base = mm->mmap_legacy_base;
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index d04f8094bc23..f70c1ff46125 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -465,46 +465,67 @@ static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
465 return true; 465 return true;
466} 466}
467 467
468/*
469 * Mark all currently memblock-reserved physical memory (which covers the
470 * kernel's own memory ranges) as hot-unswappable.
471 */
468static void __init numa_clear_kernel_node_hotplug(void) 472static void __init numa_clear_kernel_node_hotplug(void)
469{ 473{
470 int i, nid; 474 nodemask_t reserved_nodemask = NODE_MASK_NONE;
471 nodemask_t numa_kernel_nodes = NODE_MASK_NONE; 475 struct memblock_region *mb_region;
472 phys_addr_t start, end; 476 int i;
473 struct memblock_region *r;
474 477
475 /* 478 /*
479 * We have to do some preprocessing of memblock regions, to
480 * make them suitable for reservation.
481 *
476 * At this time, all memory regions reserved by memblock are 482 * At this time, all memory regions reserved by memblock are
477 * used by the kernel. Set the nid in memblock.reserved will 483 * used by the kernel, but those regions are not split up
478 * mark out all the nodes the kernel resides in. 484 * along node boundaries yet, and don't necessarily have their
485 * node ID set yet either.
486 *
487 * So iterate over all memory known to the x86 architecture,
488 * and use those ranges to set the nid in memblock.reserved.
489 * This will split up the memblock regions along node
490 * boundaries and will set the node IDs as well.
479 */ 491 */
480 for (i = 0; i < numa_meminfo.nr_blks; i++) { 492 for (i = 0; i < numa_meminfo.nr_blks; i++) {
481 struct numa_memblk *mb = &numa_meminfo.blk[i]; 493 struct numa_memblk *mb = numa_meminfo.blk + i;
494 int ret;
482 495
483 memblock_set_node(mb->start, mb->end - mb->start, 496 ret = memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid);
484 &memblock.reserved, mb->nid); 497 WARN_ON_ONCE(ret);
485 } 498 }
486 499
487 /* 500 /*
488 * Mark all kernel nodes. 501 * Now go over all reserved memblock regions, to construct a
502 * node mask of all kernel reserved memory areas.
489 * 503 *
490 * When booting with mem=nn[kMG] or in a kdump kernel, numa_meminfo 504 * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
491 * may not include all the memblock.reserved memory ranges because 505 * numa_meminfo might not include all memblock.reserved
492 * trim_snb_memory() reserves specific pages for Sandy Bridge graphics. 506 * memory ranges, because quirks such as trim_snb_memory()
507 * reserve specific pages for Sandy Bridge graphics. ]
493 */ 508 */
494 for_each_memblock(reserved, r) 509 for_each_memblock(reserved, mb_region) {
495 if (r->nid != MAX_NUMNODES) 510 if (mb_region->nid != MAX_NUMNODES)
496 node_set(r->nid, numa_kernel_nodes); 511 node_set(mb_region->nid, reserved_nodemask);
512 }
497 513
498 /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */ 514 /*
515 * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory
516 * belonging to the reserved node mask.
517 *
518 * Note that this will include memory regions that reside
519 * on nodes that contain kernel memory - entire nodes
520 * become hot-unpluggable:
521 */
499 for (i = 0; i < numa_meminfo.nr_blks; i++) { 522 for (i = 0; i < numa_meminfo.nr_blks; i++) {
500 nid = numa_meminfo.blk[i].nid; 523 struct numa_memblk *mb = numa_meminfo.blk + i;
501 if (!node_isset(nid, numa_kernel_nodes))
502 continue;
503 524
504 start = numa_meminfo.blk[i].start; 525 if (!node_isset(mb->nid, reserved_nodemask))
505 end = numa_meminfo.blk[i].end; 526 continue;
506 527
507 memblock_clear_hotplug(start, end - start); 528 memblock_clear_hotplug(mb->start, mb->end - mb->start);
508 } 529 }
509} 530}
510 531
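The rewritten numa_clear_kernel_node_hotplug() above works in three passes: push node IDs into memblock.reserved, collect the node IDs of all reserved (kernel-used) regions into a nodemask, then clear MEMBLOCK_HOTPLUG on every numa_meminfo block belonging to such a node. A compressed sketch of that flow, using only the helpers already shown in the hunk:

	nodemask_t reserved_nodes = NODE_MASK_NONE;
	struct memblock_region *r;
	int i;

	/* 1) split memblock.reserved along node boundaries, tag node IDs */
	for (i = 0; i < numa_meminfo.nr_blks; i++) {
		struct numa_memblk *mb = numa_meminfo.blk + i;

		WARN_ON_ONCE(memblock_set_node(mb->start, mb->end - mb->start,
					       &memblock.reserved, mb->nid));
	}

	/* 2) record every node that owns reserved (kernel) memory */
	for_each_memblock(reserved, r)
		if (r->nid != MAX_NUMNODES)
			node_set(r->nid, reserved_nodes);

	/* 3) whole nodes holding kernel memory become non-hotpluggable */
	for (i = 0; i < numa_meminfo.nr_blks; i++) {
		struct numa_memblk *mb = numa_meminfo.blk + i;

		if (node_isset(mb->nid, reserved_nodes))
			memblock_clear_hotplug(mb->start, mb->end - mb->start);
	}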
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 1c37e650acac..007ebe2d8157 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -1128,8 +1128,10 @@ static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
1128 /* 1128 /*
1129 * Ignore all non primary paths. 1129 * Ignore all non primary paths.
1130 */ 1130 */
1131 if (!primary) 1131 if (!primary) {
1132 cpa->numpages = 1;
1132 return 0; 1133 return 0;
1134 }
1133 1135
1134 /* 1136 /*
1135 * Ignore the NULL PTE for kernel identity mapping, as it is expected 1137 * Ignore the NULL PTE for kernel identity mapping, as it is expected
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index f4ae536b0914..04e2e7144bee 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -943,7 +943,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
943 return -EINVAL; 943 return -EINVAL;
944 } 944 }
945 945
946 *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | 946 *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
947 cachemode2protval(pcm)); 947 cachemode2protval(pcm));
948 948
949 return 0; 949 return 0;
@@ -959,7 +959,7 @@ int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
959 959
960 /* Set prot based on lookup */ 960 /* Set prot based on lookup */
961 pcm = lookup_memtype(pfn_t_to_phys(pfn)); 961 pcm = lookup_memtype(pfn_t_to_phys(pfn));
962 *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | 962 *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
963 cachemode2protval(pcm)); 963 cachemode2protval(pcm));
964 964
965 return 0; 965 return 0;
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
index 92e2eacb3321..8bea84724a7d 100644
--- a/arch/x86/mm/setup_nx.c
+++ b/arch/x86/mm/setup_nx.c
@@ -4,6 +4,7 @@
4 4
5#include <asm/pgtable.h> 5#include <asm/pgtable.h>
6#include <asm/proto.h> 6#include <asm/proto.h>
7#include <asm/cpufeature.h>
7 8
8static int disable_nx; 9static int disable_nx;
9 10
@@ -31,9 +32,8 @@ early_param("noexec", noexec_setup);
31 32
32void x86_configure_nx(void) 33void x86_configure_nx(void)
33{ 34{
34 if (boot_cpu_has(X86_FEATURE_NX) && !disable_nx) 35 /* If disable_nx is set, clear NX on all new mappings going forward. */
35 __supported_pte_mask |= _PAGE_NX; 36 if (disable_nx)
36 else
37 __supported_pte_mask &= ~_PAGE_NX; 37 __supported_pte_mask &= ~_PAGE_NX;
38} 38}
39 39
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 50d86c0e9ba4..660a83c8287b 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -24,7 +24,6 @@
24#include <asm/nmi.h> 24#include <asm/nmi.h>
25#include <asm/apic.h> 25#include <asm/apic.h>
26#include <asm/processor.h> 26#include <asm/processor.h>
27#include <asm/cpufeature.h>
28 27
29#include "op_x86_model.h" 28#include "op_x86_model.h"
30#include "op_counter.h" 29#include "op_counter.h"
diff --git a/arch/x86/platform/geode/alix.c b/arch/x86/platform/geode/alix.c
index 76b6632d3143..1865c196f136 100644
--- a/arch/x86/platform/geode/alix.c
+++ b/arch/x86/platform/geode/alix.c
@@ -21,7 +21,7 @@
21#include <linux/init.h> 21#include <linux/init.h>
22#include <linux/io.h> 22#include <linux/io.h>
23#include <linux/string.h> 23#include <linux/string.h>
24#include <linux/module.h> 24#include <linux/moduleparam.h>
25#include <linux/leds.h> 25#include <linux/leds.h>
26#include <linux/platform_device.h> 26#include <linux/platform_device.h>
27#include <linux/gpio.h> 27#include <linux/gpio.h>
@@ -35,6 +35,11 @@
35#define BIOS_SIGNATURE_COREBOOT 0x500 35#define BIOS_SIGNATURE_COREBOOT 0x500
36#define BIOS_REGION_SIZE 0x10000 36#define BIOS_REGION_SIZE 0x10000
37 37
38/*
39 * This driver is not modular, but to keep back compatibility
40 * with existing use cases, continuing with module_param is
41 * the easiest way forward.
42 */
38static bool force = 0; 43static bool force = 0;
39module_param(force, bool, 0444); 44module_param(force, bool, 0444);
40/* FIXME: Award bios is not automatically detected as Alix platform */ 45/* FIXME: Award bios is not automatically detected as Alix platform */
@@ -192,9 +197,4 @@ static int __init alix_init(void)
192 197
193 return 0; 198 return 0;
194} 199}
195 200device_initcall(alix_init);
196module_init(alix_init);
197
198MODULE_AUTHOR("Ed Wildgoose <kernel@wildgooses.com>");
199MODULE_DESCRIPTION("PCEngines ALIX System Setup");
200MODULE_LICENSE("GPL");
diff --git a/arch/x86/platform/geode/geos.c b/arch/x86/platform/geode/geos.c
index aa733fba2471..4fcdb91318a0 100644
--- a/arch/x86/platform/geode/geos.c
+++ b/arch/x86/platform/geode/geos.c
@@ -19,7 +19,6 @@
19#include <linux/init.h> 19#include <linux/init.h>
20#include <linux/io.h> 20#include <linux/io.h>
21#include <linux/string.h> 21#include <linux/string.h>
22#include <linux/module.h>
23#include <linux/leds.h> 22#include <linux/leds.h>
24#include <linux/platform_device.h> 23#include <linux/platform_device.h>
25#include <linux/gpio.h> 24#include <linux/gpio.h>
@@ -120,9 +119,4 @@ static int __init geos_init(void)
120 119
121 return 0; 120 return 0;
122} 121}
123 122device_initcall(geos_init);
124module_init(geos_init);
125
126MODULE_AUTHOR("Philip Prindeville <philipp@redfish-solutions.com>");
127MODULE_DESCRIPTION("Traverse Technologies Geos System Setup");
128MODULE_LICENSE("GPL");
diff --git a/arch/x86/platform/geode/net5501.c b/arch/x86/platform/geode/net5501.c
index 927e38c0089f..a2f6b982a729 100644
--- a/arch/x86/platform/geode/net5501.c
+++ b/arch/x86/platform/geode/net5501.c
@@ -20,7 +20,6 @@
20#include <linux/init.h> 20#include <linux/init.h>
21#include <linux/io.h> 21#include <linux/io.h>
22#include <linux/string.h> 22#include <linux/string.h>
23#include <linux/module.h>
24#include <linux/leds.h> 23#include <linux/leds.h>
25#include <linux/platform_device.h> 24#include <linux/platform_device.h>
26#include <linux/gpio.h> 25#include <linux/gpio.h>
@@ -146,9 +145,4 @@ static int __init net5501_init(void)
146 145
147 return 0; 146 return 0;
148} 147}
149 148device_initcall(net5501_init);
150module_init(net5501_init);
151
152MODULE_AUTHOR("Philip Prindeville <philipp@redfish-solutions.com>");
153MODULE_DESCRIPTION("Soekris net5501 System Setup");
154MODULE_LICENSE("GPL");
diff --git a/arch/x86/platform/intel-mid/mfld.c b/arch/x86/platform/intel-mid/mfld.c
index 23381d2174ae..1eb47b6298c2 100644
--- a/arch/x86/platform/intel-mid/mfld.c
+++ b/arch/x86/platform/intel-mid/mfld.c
@@ -52,10 +52,7 @@ static unsigned long __init mfld_calibrate_tsc(void)
52 /* mark tsc clocksource as reliable */ 52 /* mark tsc clocksource as reliable */
53 set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE); 53 set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE);
54 54
55 if (fast_calibrate) 55 return fast_calibrate;
56 return fast_calibrate;
57
58 return 0;
59} 56}
60 57
61static void __init penwell_arch_setup(void) 58static void __init penwell_arch_setup(void)
diff --git a/arch/x86/platform/intel-mid/mrfl.c b/arch/x86/platform/intel-mid/mrfl.c
index aaca91753d32..bd1adc621781 100644
--- a/arch/x86/platform/intel-mid/mrfl.c
+++ b/arch/x86/platform/intel-mid/mrfl.c
@@ -81,10 +81,7 @@ static unsigned long __init tangier_calibrate_tsc(void)
81 /* mark tsc clocksource as reliable */ 81 /* mark tsc clocksource as reliable */
82 set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE); 82 set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE);
83 83
84 if (fast_calibrate) 84 return fast_calibrate;
85 return fast_calibrate;
86
87 return 0;
88} 85}
89 86
90static void __init tangier_arch_setup(void) 87static void __init tangier_arch_setup(void)
diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c
index bfadcd0f4944..17d6d2296e4d 100644
--- a/arch/x86/platform/intel-quark/imr.c
+++ b/arch/x86/platform/intel-quark/imr.c
@@ -1,5 +1,5 @@
1/** 1/**
2 * imr.c 2 * imr.c -- Intel Isolated Memory Region driver
3 * 3 *
4 * Copyright(c) 2013 Intel Corporation. 4 * Copyright(c) 2013 Intel Corporation.
5 * Copyright(c) 2015 Bryan O'Donoghue <pure.logic@nexus-software.ie> 5 * Copyright(c) 2015 Bryan O'Donoghue <pure.logic@nexus-software.ie>
@@ -31,7 +31,6 @@
31#include <linux/debugfs.h> 31#include <linux/debugfs.h>
32#include <linux/init.h> 32#include <linux/init.h>
33#include <linux/mm.h> 33#include <linux/mm.h>
34#include <linux/module.h>
35#include <linux/types.h> 34#include <linux/types.h>
36 35
37struct imr_device { 36struct imr_device {
@@ -135,11 +134,9 @@ static int imr_read(struct imr_device *idev, u32 imr_id, struct imr_regs *imr)
135 * @idev: pointer to imr_device structure. 134 * @idev: pointer to imr_device structure.
136 * @imr_id: IMR entry to write. 135 * @imr_id: IMR entry to write.
137 * @imr: IMR structure representing address and access masks. 136 * @imr: IMR structure representing address and access masks.
138 * @lock: indicates if the IMR lock bit should be applied.
139 * @return: 0 on success or error code passed from mbi_iosf on failure. 137 * @return: 0 on success or error code passed from mbi_iosf on failure.
140 */ 138 */
141static int imr_write(struct imr_device *idev, u32 imr_id, 139static int imr_write(struct imr_device *idev, u32 imr_id, struct imr_regs *imr)
142 struct imr_regs *imr, bool lock)
143{ 140{
144 unsigned long flags; 141 unsigned long flags;
145 u32 reg = imr_id * IMR_NUM_REGS + idev->reg_base; 142 u32 reg = imr_id * IMR_NUM_REGS + idev->reg_base;
@@ -163,15 +160,6 @@ static int imr_write(struct imr_device *idev, u32 imr_id,
163 if (ret) 160 if (ret)
164 goto failed; 161 goto failed;
165 162
166 /* Lock bit must be set separately to addr_lo address bits. */
167 if (lock) {
168 imr->addr_lo |= IMR_LOCK;
169 ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE,
170 reg - IMR_NUM_REGS, imr->addr_lo);
171 if (ret)
172 goto failed;
173 }
174
175 local_irq_restore(flags); 163 local_irq_restore(flags);
176 return 0; 164 return 0;
177failed: 165failed:
@@ -270,17 +258,6 @@ static int imr_debugfs_register(struct imr_device *idev)
270} 258}
271 259
272/** 260/**
273 * imr_debugfs_unregister - unregister debugfs hooks.
274 *
275 * @idev: pointer to imr_device structure.
276 * @return:
277 */
278static void imr_debugfs_unregister(struct imr_device *idev)
279{
280 debugfs_remove(idev->file);
281}
282
283/**
284 * imr_check_params - check passed address range IMR alignment and non-zero size 261 * imr_check_params - check passed address range IMR alignment and non-zero size
285 * 262 *
286 * @base: base address of intended IMR. 263 * @base: base address of intended IMR.
@@ -334,11 +311,10 @@ static inline int imr_address_overlap(phys_addr_t addr, struct imr_regs *imr)
334 * @size: physical size of region in bytes must be aligned to 1KiB. 311 * @size: physical size of region in bytes must be aligned to 1KiB.
335 * @read_mask: read access mask. 312 * @read_mask: read access mask.
336 * @write_mask: write access mask. 313 * @write_mask: write access mask.
337 * @lock: indicates whether or not to permanently lock this region.
338 * @return: zero on success or negative value indicating error. 314 * @return: zero on success or negative value indicating error.
339 */ 315 */
340int imr_add_range(phys_addr_t base, size_t size, 316int imr_add_range(phys_addr_t base, size_t size,
341 unsigned int rmask, unsigned int wmask, bool lock) 317 unsigned int rmask, unsigned int wmask)
342{ 318{
343 phys_addr_t end; 319 phys_addr_t end;
344 unsigned int i; 320 unsigned int i;
@@ -411,7 +387,7 @@ int imr_add_range(phys_addr_t base, size_t size,
411 imr.rmask = rmask; 387 imr.rmask = rmask;
412 imr.wmask = wmask; 388 imr.wmask = wmask;
413 389
414 ret = imr_write(idev, reg, &imr, lock); 390 ret = imr_write(idev, reg, &imr);
415 if (ret < 0) { 391 if (ret < 0) {
416 /* 392 /*
417 * In the highly unlikely event iosf_mbi_write failed 393 * In the highly unlikely event iosf_mbi_write failed
@@ -422,7 +398,7 @@ int imr_add_range(phys_addr_t base, size_t size,
422 imr.addr_hi = 0; 398 imr.addr_hi = 0;
423 imr.rmask = IMR_READ_ACCESS_ALL; 399 imr.rmask = IMR_READ_ACCESS_ALL;
424 imr.wmask = IMR_WRITE_ACCESS_ALL; 400 imr.wmask = IMR_WRITE_ACCESS_ALL;
425 imr_write(idev, reg, &imr, false); 401 imr_write(idev, reg, &imr);
426 } 402 }
427failed: 403failed:
428 mutex_unlock(&idev->lock); 404 mutex_unlock(&idev->lock);
@@ -518,7 +494,7 @@ static int __imr_remove_range(int reg, phys_addr_t base, size_t size)
518 imr.rmask = IMR_READ_ACCESS_ALL; 494 imr.rmask = IMR_READ_ACCESS_ALL;
519 imr.wmask = IMR_WRITE_ACCESS_ALL; 495 imr.wmask = IMR_WRITE_ACCESS_ALL;
520 496
521 ret = imr_write(idev, reg, &imr, false); 497 ret = imr_write(idev, reg, &imr);
522 498
523failed: 499failed:
524 mutex_unlock(&idev->lock); 500 mutex_unlock(&idev->lock);
@@ -599,7 +575,7 @@ static void __init imr_fixup_memmap(struct imr_device *idev)
599 * We don't round up @size since it is already PAGE_SIZE aligned. 575 * We don't round up @size since it is already PAGE_SIZE aligned.
600 * See vmlinux.lds.S for details. 576 * See vmlinux.lds.S for details.
601 */ 577 */
602 ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false); 578 ret = imr_add_range(base, size, IMR_CPU, IMR_CPU);
603 if (ret < 0) { 579 if (ret < 0) {
604 pr_err("unable to setup IMR for kernel: %zu KiB (%lx - %lx)\n", 580 pr_err("unable to setup IMR for kernel: %zu KiB (%lx - %lx)\n",
605 size / 1024, start, end); 581 size / 1024, start, end);
@@ -614,7 +590,6 @@ static const struct x86_cpu_id imr_ids[] __initconst = {
614 { X86_VENDOR_INTEL, 5, 9 }, /* Intel Quark SoC X1000. */ 590 { X86_VENDOR_INTEL, 5, 9 }, /* Intel Quark SoC X1000. */
615 {} 591 {}
616}; 592};
617MODULE_DEVICE_TABLE(x86cpu, imr_ids);
618 593
619/** 594/**
620 * imr_init - entry point for IMR driver. 595 * imr_init - entry point for IMR driver.
@@ -640,22 +615,4 @@ static int __init imr_init(void)
640 imr_fixup_memmap(idev); 615 imr_fixup_memmap(idev);
641 return 0; 616 return 0;
642} 617}
643 618device_initcall(imr_init);
644/**
645 * imr_exit - exit point for IMR code.
646 *
647 * Deregisters debugfs, leave IMR state as-is.
648 *
649 * return:
650 */
651static void __exit imr_exit(void)
652{
653 imr_debugfs_unregister(&imr_dev);
654}
655
656module_init(imr_init);
657module_exit(imr_exit);
658
659MODULE_AUTHOR("Bryan O'Donoghue <pure.logic@nexus-software.ie>");
660MODULE_DESCRIPTION("Intel Isolated Memory Region driver");
661MODULE_LICENSE("Dual BSD/GPL");
diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c
index 278e4da4222f..f5bad40936ac 100644
--- a/arch/x86/platform/intel-quark/imr_selftest.c
+++ b/arch/x86/platform/intel-quark/imr_selftest.c
@@ -1,5 +1,5 @@
1/** 1/**
2 * imr_selftest.c 2 * imr_selftest.c -- Intel Isolated Memory Region self-test driver
3 * 3 *
4 * Copyright(c) 2013 Intel Corporation. 4 * Copyright(c) 2013 Intel Corporation.
5 * Copyright(c) 2015 Bryan O'Donoghue <pure.logic@nexus-software.ie> 5 * Copyright(c) 2015 Bryan O'Donoghue <pure.logic@nexus-software.ie>
@@ -15,7 +15,6 @@
15#include <asm/imr.h> 15#include <asm/imr.h>
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/mm.h> 17#include <linux/mm.h>
18#include <linux/module.h>
19#include <linux/types.h> 18#include <linux/types.h>
20 19
21#define SELFTEST KBUILD_MODNAME ": " 20#define SELFTEST KBUILD_MODNAME ": "
@@ -61,30 +60,30 @@ static void __init imr_self_test(void)
61 int ret; 60 int ret;
62 61
63 /* Test zero zero. */ 62 /* Test zero zero. */
64 ret = imr_add_range(0, 0, 0, 0, false); 63 ret = imr_add_range(0, 0, 0, 0);
65 imr_self_test_result(ret < 0, "zero sized IMR\n"); 64 imr_self_test_result(ret < 0, "zero sized IMR\n");
66 65
67 /* Test exact overlap. */ 66 /* Test exact overlap. */
68 ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false); 67 ret = imr_add_range(base, size, IMR_CPU, IMR_CPU);
69 imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size)); 68 imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size));
70 69
71 /* Test overlap with base inside of existing. */ 70 /* Test overlap with base inside of existing. */
72 base += size - IMR_ALIGN; 71 base += size - IMR_ALIGN;
73 ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false); 72 ret = imr_add_range(base, size, IMR_CPU, IMR_CPU);
74 imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size)); 73 imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size));
75 74
76 /* Test overlap with end inside of existing. */ 75 /* Test overlap with end inside of existing. */
77 base -= size + IMR_ALIGN * 2; 76 base -= size + IMR_ALIGN * 2;
78 ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false); 77 ret = imr_add_range(base, size, IMR_CPU, IMR_CPU);
79 imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size)); 78 imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size));
80 79
81 /* Test that a 1 KiB IMR @ zero with read/write all will bomb out. */ 80 /* Test that a 1 KiB IMR @ zero with read/write all will bomb out. */
82 ret = imr_add_range(0, IMR_ALIGN, IMR_READ_ACCESS_ALL, 81 ret = imr_add_range(0, IMR_ALIGN, IMR_READ_ACCESS_ALL,
83 IMR_WRITE_ACCESS_ALL, false); 82 IMR_WRITE_ACCESS_ALL);
84 imr_self_test_result(ret < 0, "1KiB IMR @ 0x00000000 - access-all\n"); 83 imr_self_test_result(ret < 0, "1KiB IMR @ 0x00000000 - access-all\n");
85 84
86 /* Test that a 1 KiB IMR @ zero with CPU only will work. */ 85 /* Test that a 1 KiB IMR @ zero with CPU only will work. */
87 ret = imr_add_range(0, IMR_ALIGN, IMR_CPU, IMR_CPU, false); 86 ret = imr_add_range(0, IMR_ALIGN, IMR_CPU, IMR_CPU);
88 imr_self_test_result(ret >= 0, "1KiB IMR @ 0x00000000 - cpu-access\n"); 87 imr_self_test_result(ret >= 0, "1KiB IMR @ 0x00000000 - cpu-access\n");
89 if (ret >= 0) { 88 if (ret >= 0) {
90 ret = imr_remove_range(0, IMR_ALIGN); 89 ret = imr_remove_range(0, IMR_ALIGN);
@@ -93,8 +92,7 @@ static void __init imr_self_test(void)
93 92
94 /* Test 2 KiB works. */ 93 /* Test 2 KiB works. */
95 size = IMR_ALIGN * 2; 94 size = IMR_ALIGN * 2;
96 ret = imr_add_range(0, size, IMR_READ_ACCESS_ALL, 95 ret = imr_add_range(0, size, IMR_READ_ACCESS_ALL, IMR_WRITE_ACCESS_ALL);
97 IMR_WRITE_ACCESS_ALL, false);
98 imr_self_test_result(ret >= 0, "2KiB IMR @ 0x00000000\n"); 96 imr_self_test_result(ret >= 0, "2KiB IMR @ 0x00000000\n");
99 if (ret >= 0) { 97 if (ret >= 0) {
100 ret = imr_remove_range(0, size); 98 ret = imr_remove_range(0, size);
@@ -106,7 +104,6 @@ static const struct x86_cpu_id imr_ids[] __initconst = {
106 { X86_VENDOR_INTEL, 5, 9 }, /* Intel Quark SoC X1000. */ 104 { X86_VENDOR_INTEL, 5, 9 }, /* Intel Quark SoC X1000. */
107 {} 105 {}
108}; 106};
109MODULE_DEVICE_TABLE(x86cpu, imr_ids);
110 107
111/** 108/**
112 * imr_self_test_init - entry point for IMR driver. 109 * imr_self_test_init - entry point for IMR driver.
@@ -125,13 +122,4 @@ static int __init imr_self_test_init(void)
125 * 122 *
126 * return: 123 * return:
127 */ 124 */
128static void __exit imr_self_test_exit(void) 125device_initcall(imr_self_test_init);
129{
130}
131
132module_init(imr_self_test_init);
133module_exit(imr_self_test_exit);
134
135MODULE_AUTHOR("Bryan O'Donoghue <pure.logic@nexus-software.ie>");
136MODULE_DESCRIPTION("Intel Isolated Memory Region self-test driver");
137MODULE_LICENSE("Dual BSD/GPL");
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h
index 174781a404ff..00c319048d52 100644
--- a/arch/x86/um/asm/barrier.h
+++ b/arch/x86/um/asm/barrier.h
@@ -3,7 +3,7 @@
3 3
4#include <asm/asm.h> 4#include <asm/asm.h>
5#include <asm/segment.h> 5#include <asm/segment.h>
6#include <asm/cpufeature.h> 6#include <asm/cpufeatures.h>
7#include <asm/cmpxchg.h> 7#include <asm/cmpxchg.h>
8#include <asm/nops.h> 8#include <asm/nops.h>
9 9
diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c
index 439c0994b696..bfce503dffae 100644
--- a/arch/x86/um/sys_call_table_32.c
+++ b/arch/x86/um/sys_call_table_32.c
@@ -25,11 +25,11 @@
25 25
26#define old_mmap sys_old_mmap 26#define old_mmap sys_old_mmap
27 27
28#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; 28#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
29#include <asm/syscalls_32.h> 29#include <asm/syscalls_32.h>
30 30
31#undef __SYSCALL_I386 31#undef __SYSCALL_I386
32#define __SYSCALL_I386(nr, sym, compat) [ nr ] = sym, 32#define __SYSCALL_I386(nr, sym, qual) [ nr ] = sym,
33 33
34extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); 34extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
35 35
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index b74ea6c2c0e7..f306413d3eb6 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -35,14 +35,11 @@
35#define stub_execveat sys_execveat 35#define stub_execveat sys_execveat
36#define stub_rt_sigreturn sys_rt_sigreturn 36#define stub_rt_sigreturn sys_rt_sigreturn
37 37
38#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) 38#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
39#define __SYSCALL_X32(nr, sym, compat) /* Not supported */
40
41#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
42#include <asm/syscalls_64.h> 39#include <asm/syscalls_64.h>
43 40
44#undef __SYSCALL_64 41#undef __SYSCALL_64
45#define __SYSCALL_64(nr, sym, compat) [ nr ] = sym, 42#define __SYSCALL_64(nr, sym, qual) [ nr ] = sym,
46 43
47extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); 44extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
48 45
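The UML syscall tables above rely on the usual x-macro trick: __SYSCALL_64() is first defined to emit extern declarations while the generated header is included, then redefined to emit designated initializers for the table itself (the separate COMMON/X32 variants disappear now that the generated header carries a qualifier argument instead). A toy illustration of the same pattern, with a hypothetical SYSCALL_LIST standing in for the generated asm/syscalls_64.h:

	/* Hypothetical generated list -- in the kernel this content comes
	 * from the syscall tables via asm/syscalls_64.h. */
	#define SYSCALL_LIST(X)		\
		X(0, sys_read, 64)	\
		X(1, sys_write, 64)

	/* Pass 1: declare every entry point. */
	#define __SYSCALL_64(nr, sym, qual) extern long sym(void);
	SYSCALL_LIST(__SYSCALL_64)
	#undef __SYSCALL_64

	/* Pass 2: build the table, indexed by syscall number. */
	#define __SYSCALL_64(nr, sym, qual) [nr] = sym,
	static long (*const sys_call_table[])(void) = {
		SYSCALL_LIST(__SYSCALL_64)
	};
	#undef __SYSCALL_64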
diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c
index ce7e3607a870..470564bbd08e 100644
--- a/arch/x86/um/user-offsets.c
+++ b/arch/x86/um/user-offsets.c
@@ -9,14 +9,12 @@
9#include <asm/types.h> 9#include <asm/types.h>
10 10
11#ifdef __i386__ 11#ifdef __i386__
12#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, 12#define __SYSCALL_I386(nr, sym, qual) [nr] = 1,
13static char syscalls[] = { 13static char syscalls[] = {
14#include <asm/syscalls_32.h> 14#include <asm/syscalls_32.h>
15}; 15};
16#else 16#else
17#define __SYSCALL_64(nr, sym, compat) [nr] = 1, 17#define __SYSCALL_64(nr, sym, qual) [nr] = 1,
18#define __SYSCALL_COMMON(nr, sym, compat) [nr] = 1,
19#define __SYSCALL_X32(nr, sym, compat) /* Not supported */
20static char syscalls[] = { 18static char syscalls[] = {
21#include <asm/syscalls_64.h> 19#include <asm/syscalls_64.h>
22}; 20};
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 3f4ebf0261f2..3c6d17fd423a 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -112,7 +112,7 @@ asmlinkage __visible void cpu_bringup_and_idle(int cpu)
112 xen_pvh_secondary_vcpu_init(cpu); 112 xen_pvh_secondary_vcpu_init(cpu);
113#endif 113#endif
114 cpu_bringup(); 114 cpu_bringup();
115 cpu_startup_entry(CPUHP_ONLINE); 115 cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
116} 116}
117 117
118static void xen_smp_intr_free(unsigned int cpu) 118static void xen_smp_intr_free(unsigned int cpu)
diff --git a/arch/xtensa/kernel/smp.c b/arch/xtensa/kernel/smp.c
index 4d02e38514f5..fc4ad21a5ed4 100644
--- a/arch/xtensa/kernel/smp.c
+++ b/arch/xtensa/kernel/smp.c
@@ -157,7 +157,7 @@ void secondary_start_kernel(void)
157 157
158 complete(&cpu_running); 158 complete(&cpu_running);
159 159
160 cpu_startup_entry(CPUHP_ONLINE); 160 cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
161} 161}
162 162
163static void mx_cpu_start(void *p) 163static void mx_cpu_start(void *p)
diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig
index 33db7406c0e2..c346be650892 100644
--- a/drivers/clocksource/Kconfig
+++ b/drivers/clocksource/Kconfig
@@ -160,6 +160,7 @@ config CLKSRC_EFM32
160config CLKSRC_LPC32XX 160config CLKSRC_LPC32XX
161 bool "Clocksource for LPC32XX" if COMPILE_TEST 161 bool "Clocksource for LPC32XX" if COMPILE_TEST
162 depends on GENERIC_CLOCKEVENTS && HAS_IOMEM 162 depends on GENERIC_CLOCKEVENTS && HAS_IOMEM
163 depends on ARM
163 select CLKSRC_MMIO 164 select CLKSRC_MMIO
164 select CLKSRC_OF 165 select CLKSRC_OF
165 help 166 help
diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c
index c64d543d64bf..f0dd9d42bc7b 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -32,6 +32,14 @@
32#define CNTTIDR 0x08 32#define CNTTIDR 0x08
33#define CNTTIDR_VIRT(n) (BIT(1) << ((n) * 4)) 33#define CNTTIDR_VIRT(n) (BIT(1) << ((n) * 4))
34 34
35#define CNTACR(n) (0x40 + ((n) * 4))
36#define CNTACR_RPCT BIT(0)
37#define CNTACR_RVCT BIT(1)
38#define CNTACR_RFRQ BIT(2)
39#define CNTACR_RVOFF BIT(3)
40#define CNTACR_RWVT BIT(4)
41#define CNTACR_RWPT BIT(5)
42
35#define CNTVCT_LO 0x08 43#define CNTVCT_LO 0x08
36#define CNTVCT_HI 0x0c 44#define CNTVCT_HI 0x0c
37#define CNTFRQ 0x10 45#define CNTFRQ 0x10
@@ -266,10 +274,12 @@ static void __arch_timer_setup(unsigned type,
266 if (arch_timer_use_virtual) { 274 if (arch_timer_use_virtual) {
267 clk->irq = arch_timer_ppi[VIRT_PPI]; 275 clk->irq = arch_timer_ppi[VIRT_PPI];
268 clk->set_state_shutdown = arch_timer_shutdown_virt; 276 clk->set_state_shutdown = arch_timer_shutdown_virt;
277 clk->set_state_oneshot_stopped = arch_timer_shutdown_virt;
269 clk->set_next_event = arch_timer_set_next_event_virt; 278 clk->set_next_event = arch_timer_set_next_event_virt;
270 } else { 279 } else {
271 clk->irq = arch_timer_ppi[PHYS_SECURE_PPI]; 280 clk->irq = arch_timer_ppi[PHYS_SECURE_PPI];
272 clk->set_state_shutdown = arch_timer_shutdown_phys; 281 clk->set_state_shutdown = arch_timer_shutdown_phys;
282 clk->set_state_oneshot_stopped = arch_timer_shutdown_phys;
273 clk->set_next_event = arch_timer_set_next_event_phys; 283 clk->set_next_event = arch_timer_set_next_event_phys;
274 } 284 }
275 } else { 285 } else {
@@ -279,10 +289,12 @@ static void __arch_timer_setup(unsigned type,
279 clk->cpumask = cpu_all_mask; 289 clk->cpumask = cpu_all_mask;
280 if (arch_timer_mem_use_virtual) { 290 if (arch_timer_mem_use_virtual) {
281 clk->set_state_shutdown = arch_timer_shutdown_virt_mem; 291 clk->set_state_shutdown = arch_timer_shutdown_virt_mem;
292 clk->set_state_oneshot_stopped = arch_timer_shutdown_virt_mem;
282 clk->set_next_event = 293 clk->set_next_event =
283 arch_timer_set_next_event_virt_mem; 294 arch_timer_set_next_event_virt_mem;
284 } else { 295 } else {
285 clk->set_state_shutdown = arch_timer_shutdown_phys_mem; 296 clk->set_state_shutdown = arch_timer_shutdown_phys_mem;
297 clk->set_state_oneshot_stopped = arch_timer_shutdown_phys_mem;
286 clk->set_next_event = 298 clk->set_next_event =
287 arch_timer_set_next_event_phys_mem; 299 arch_timer_set_next_event_phys_mem;
288 } 300 }
@@ -757,7 +769,6 @@ static void __init arch_timer_mem_init(struct device_node *np)
757 } 769 }
758 770
759 cnttidr = readl_relaxed(cntctlbase + CNTTIDR); 771 cnttidr = readl_relaxed(cntctlbase + CNTTIDR);
760 iounmap(cntctlbase);
761 772
762 /* 773 /*
763 * Try to find a virtual capable frame. Otherwise fall back to a 774 * Try to find a virtual capable frame. Otherwise fall back to a
@@ -765,20 +776,31 @@ static void __init arch_timer_mem_init(struct device_node *np)
765 */ 776 */
766 for_each_available_child_of_node(np, frame) { 777 for_each_available_child_of_node(np, frame) {
767 int n; 778 int n;
779 u32 cntacr;
768 780
769 if (of_property_read_u32(frame, "frame-number", &n)) { 781 if (of_property_read_u32(frame, "frame-number", &n)) {
770 pr_err("arch_timer: Missing frame-number\n"); 782 pr_err("arch_timer: Missing frame-number\n");
771 of_node_put(best_frame);
772 of_node_put(frame); 783 of_node_put(frame);
773 return; 784 goto out;
774 } 785 }
775 786
776 if (cnttidr & CNTTIDR_VIRT(n)) { 787 /* Try enabling everything, and see what sticks */
788 cntacr = CNTACR_RFRQ | CNTACR_RWPT | CNTACR_RPCT |
789 CNTACR_RWVT | CNTACR_RVOFF | CNTACR_RVCT;
790 writel_relaxed(cntacr, cntctlbase + CNTACR(n));
791 cntacr = readl_relaxed(cntctlbase + CNTACR(n));
792
793 if ((cnttidr & CNTTIDR_VIRT(n)) &&
794 !(~cntacr & (CNTACR_RWVT | CNTACR_RVCT))) {
777 of_node_put(best_frame); 795 of_node_put(best_frame);
778 best_frame = frame; 796 best_frame = frame;
779 arch_timer_mem_use_virtual = true; 797 arch_timer_mem_use_virtual = true;
780 break; 798 break;
781 } 799 }
800
801 if (~cntacr & (CNTACR_RWPT | CNTACR_RPCT))
802 continue;
803
782 of_node_put(best_frame); 804 of_node_put(best_frame);
783 best_frame = of_node_get(frame); 805 best_frame = of_node_get(frame);
784 } 806 }
@@ -786,24 +808,26 @@ static void __init arch_timer_mem_init(struct device_node *np)
786 base = arch_counter_base = of_iomap(best_frame, 0); 808 base = arch_counter_base = of_iomap(best_frame, 0);
787 if (!base) { 809 if (!base) {
788 pr_err("arch_timer: Can't map frame's registers\n"); 810 pr_err("arch_timer: Can't map frame's registers\n");
789 of_node_put(best_frame); 811 goto out;
790 return;
791 } 812 }
792 813
793 if (arch_timer_mem_use_virtual) 814 if (arch_timer_mem_use_virtual)
794 irq = irq_of_parse_and_map(best_frame, 1); 815 irq = irq_of_parse_and_map(best_frame, 1);
795 else 816 else
796 irq = irq_of_parse_and_map(best_frame, 0); 817 irq = irq_of_parse_and_map(best_frame, 0);
797 of_node_put(best_frame); 818
798 if (!irq) { 819 if (!irq) {
799 pr_err("arch_timer: Frame missing %s irq", 820 pr_err("arch_timer: Frame missing %s irq",
800 arch_timer_mem_use_virtual ? "virt" : "phys"); 821 arch_timer_mem_use_virtual ? "virt" : "phys");
801 return; 822 goto out;
802 } 823 }
803 824
804 arch_timer_detect_rate(base, np); 825 arch_timer_detect_rate(base, np);
805 arch_timer_mem_register(base, irq); 826 arch_timer_mem_register(base, irq);
806 arch_timer_common_init(); 827 arch_timer_common_init();
828out:
829 iounmap(cntctlbase);
830 of_node_put(best_frame);
807} 831}
808CLOCKSOURCE_OF_DECLARE(armv7_arch_timer_mem, "arm,armv7-timer-mem", 832CLOCKSOURCE_OF_DECLARE(armv7_arch_timer_mem, "arm,armv7-timer-mem",
809 arch_timer_mem_init); 833 arch_timer_mem_init);
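The frame-selection change above probes each memory-mapped timer frame by writing every access bit into CNTACR and reading the register back: bits that do not stick mean the frame cannot provide that view of the counter. Roughly, for a single frame n (a sketch using only the accessors and masks from the hunk):

	u32 cntacr;

	/* Request all access rights for frame n, then see which ones stuck. */
	cntacr = CNTACR_RFRQ | CNTACR_RWPT | CNTACR_RPCT |
		 CNTACR_RWVT | CNTACR_RVOFF | CNTACR_RVCT;
	writel_relaxed(cntacr, cntctlbase + CNTACR(n));
	cntacr = readl_relaxed(cntctlbase + CNTACR(n));

	if ((cnttidr & CNTTIDR_VIRT(n)) &&
	    !(~cntacr & (CNTACR_RWVT | CNTACR_RVCT))) {
		/* virtual timer and counter usable: best possible frame */
	} else if (!(~cntacr & (CNTACR_RWPT | CNTACR_RPCT))) {
		/* physical timer and counter usable: acceptable fallback */
	}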
diff --git a/drivers/clocksource/arm_global_timer.c b/drivers/clocksource/arm_global_timer.c
index d189d8cb69f7..9df0d1699d22 100644
--- a/drivers/clocksource/arm_global_timer.c
+++ b/drivers/clocksource/arm_global_timer.c
@@ -16,6 +16,7 @@
16#include <linux/clockchips.h> 16#include <linux/clockchips.h>
17#include <linux/cpu.h> 17#include <linux/cpu.h>
18#include <linux/clk.h> 18#include <linux/clk.h>
19#include <linux/delay.h>
19#include <linux/err.h> 20#include <linux/err.h>
20#include <linux/io.h> 21#include <linux/io.h>
21#include <linux/of.h> 22#include <linux/of.h>
@@ -174,6 +175,7 @@ static int gt_clockevents_init(struct clock_event_device *clk)
174 clk->set_state_shutdown = gt_clockevent_shutdown; 175 clk->set_state_shutdown = gt_clockevent_shutdown;
175 clk->set_state_periodic = gt_clockevent_set_periodic; 176 clk->set_state_periodic = gt_clockevent_set_periodic;
176 clk->set_state_oneshot = gt_clockevent_shutdown; 177 clk->set_state_oneshot = gt_clockevent_shutdown;
178 clk->set_state_oneshot_stopped = gt_clockevent_shutdown;
177 clk->set_next_event = gt_clockevent_set_next_event; 179 clk->set_next_event = gt_clockevent_set_next_event;
178 clk->cpumask = cpumask_of(cpu); 180 clk->cpumask = cpumask_of(cpu);
179 clk->rating = 300; 181 clk->rating = 300;
@@ -221,6 +223,21 @@ static u64 notrace gt_sched_clock_read(void)
221} 223}
222#endif 224#endif
223 225
226static unsigned long gt_read_long(void)
227{
228 return readl_relaxed(gt_base + GT_COUNTER0);
229}
230
231static struct delay_timer gt_delay_timer = {
232 .read_current_timer = gt_read_long,
233};
234
235static void __init gt_delay_timer_init(void)
236{
237 gt_delay_timer.freq = gt_clk_rate;
238 register_current_timer_delay(&gt_delay_timer);
239}
240
224static void __init gt_clocksource_init(void) 241static void __init gt_clocksource_init(void)
225{ 242{
226 writel(0, gt_base + GT_CONTROL); 243 writel(0, gt_base + GT_CONTROL);
@@ -317,6 +334,7 @@ static void __init global_timer_of_register(struct device_node *np)
317 /* Immediately configure the timer on the boot CPU */ 334 /* Immediately configure the timer on the boot CPU */
318 gt_clocksource_init(); 335 gt_clocksource_init();
319 gt_clockevents_init(this_cpu_ptr(gt_evt)); 336 gt_clockevents_init(this_cpu_ptr(gt_evt));
337 gt_delay_timer_init();
320 338
321 return; 339 return;
322 340
diff --git a/drivers/clocksource/exynos_mct.c b/drivers/clocksource/exynos_mct.c
index ff44082a0827..be09bc0b5e26 100644
--- a/drivers/clocksource/exynos_mct.c
+++ b/drivers/clocksource/exynos_mct.c
@@ -313,6 +313,7 @@ static struct clock_event_device mct_comp_device = {
313 .set_state_periodic = mct_set_state_periodic, 313 .set_state_periodic = mct_set_state_periodic,
314 .set_state_shutdown = mct_set_state_shutdown, 314 .set_state_shutdown = mct_set_state_shutdown,
315 .set_state_oneshot = mct_set_state_shutdown, 315 .set_state_oneshot = mct_set_state_shutdown,
316 .set_state_oneshot_stopped = mct_set_state_shutdown,
316 .tick_resume = mct_set_state_shutdown, 317 .tick_resume = mct_set_state_shutdown,
317}; 318};
318 319
@@ -452,6 +453,7 @@ static int exynos4_local_timer_setup(struct mct_clock_event_device *mevt)
452 evt->set_state_periodic = set_state_periodic; 453 evt->set_state_periodic = set_state_periodic;
453 evt->set_state_shutdown = set_state_shutdown; 454 evt->set_state_shutdown = set_state_shutdown;
454 evt->set_state_oneshot = set_state_shutdown; 455 evt->set_state_oneshot = set_state_shutdown;
456 evt->set_state_oneshot_stopped = set_state_shutdown;
455 evt->tick_resume = set_state_shutdown; 457 evt->tick_resume = set_state_shutdown;
456 evt->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT; 458 evt->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT;
457 evt->rating = 450; 459 evt->rating = 450;
diff --git a/drivers/clocksource/rockchip_timer.c b/drivers/clocksource/rockchip_timer.c
index 8c77a529d0d4..b991b288c803 100644
--- a/drivers/clocksource/rockchip_timer.c
+++ b/drivers/clocksource/rockchip_timer.c
@@ -122,23 +122,23 @@ static void __init rk_timer_init(struct device_node *np)
122 pclk = of_clk_get_by_name(np, "pclk"); 122 pclk = of_clk_get_by_name(np, "pclk");
123 if (IS_ERR(pclk)) { 123 if (IS_ERR(pclk)) {
124 pr_err("Failed to get pclk for '%s'\n", TIMER_NAME); 124 pr_err("Failed to get pclk for '%s'\n", TIMER_NAME);
125 return; 125 goto out_unmap;
126 } 126 }
127 127
128 if (clk_prepare_enable(pclk)) { 128 if (clk_prepare_enable(pclk)) {
129 pr_err("Failed to enable pclk for '%s'\n", TIMER_NAME); 129 pr_err("Failed to enable pclk for '%s'\n", TIMER_NAME);
130 return; 130 goto out_unmap;
131 } 131 }
132 132
133 timer_clk = of_clk_get_by_name(np, "timer"); 133 timer_clk = of_clk_get_by_name(np, "timer");
134 if (IS_ERR(timer_clk)) { 134 if (IS_ERR(timer_clk)) {
135 pr_err("Failed to get timer clock for '%s'\n", TIMER_NAME); 135 pr_err("Failed to get timer clock for '%s'\n", TIMER_NAME);
136 return; 136 goto out_timer_clk;
137 } 137 }
138 138
139 if (clk_prepare_enable(timer_clk)) { 139 if (clk_prepare_enable(timer_clk)) {
140 pr_err("Failed to enable timer clock\n"); 140 pr_err("Failed to enable timer clock\n");
141 return; 141 goto out_timer_clk;
142 } 142 }
143 143
144 bc_timer.freq = clk_get_rate(timer_clk); 144 bc_timer.freq = clk_get_rate(timer_clk);
@@ -146,7 +146,7 @@ static void __init rk_timer_init(struct device_node *np)
146 irq = irq_of_parse_and_map(np, 0); 146 irq = irq_of_parse_and_map(np, 0);
147 if (!irq) { 147 if (!irq) {
148 pr_err("Failed to map interrupts for '%s'\n", TIMER_NAME); 148 pr_err("Failed to map interrupts for '%s'\n", TIMER_NAME);
149 return; 149 goto out_irq;
150 } 150 }
151 151
152 ce->name = TIMER_NAME; 152 ce->name = TIMER_NAME;
@@ -164,10 +164,19 @@ static void __init rk_timer_init(struct device_node *np)
164 ret = request_irq(irq, rk_timer_interrupt, IRQF_TIMER, TIMER_NAME, ce); 164 ret = request_irq(irq, rk_timer_interrupt, IRQF_TIMER, TIMER_NAME, ce);
165 if (ret) { 165 if (ret) {
166 pr_err("Failed to initialize '%s': %d\n", TIMER_NAME, ret); 166 pr_err("Failed to initialize '%s': %d\n", TIMER_NAME, ret);
167 return; 167 goto out_irq;
168 } 168 }
169 169
170 clockevents_config_and_register(ce, bc_timer.freq, 1, UINT_MAX); 170 clockevents_config_and_register(ce, bc_timer.freq, 1, UINT_MAX);
171
172 return;
173
174out_irq:
175 clk_disable_unprepare(timer_clk);
176out_timer_clk:
177 clk_disable_unprepare(pclk);
178out_unmap:
179 iounmap(bc_timer.base);
171} 180}
172 181
173CLOCKSOURCE_OF_DECLARE(rk_timer, "rockchip,rk3288-timer", rk_timer_init); 182CLOCKSOURCE_OF_DECLARE(rk_timer, "rockchip,rk3288-timer", rk_timer_init);
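The rk_timer_init() fix above replaces the bare early returns with the conventional goto-unwind pattern, so clocks are disabled and the mapping released on every failure path rather than leaked. A generic sketch of that pattern; the resource names are purely illustrative:

	static int setup_example(void)		/* illustrative only */
	{
		int err;

		err = acquire_a();		/* e.g. iomap or clk enable */
		if (err)
			return err;

		err = acquire_b();
		if (err)
			goto out_release_a;	/* undo only what already succeeded */

		err = acquire_c();
		if (err)
			goto out_release_b;

		return 0;

	out_release_b:
		release_b();
	out_release_a:
		release_a();
		return err;
	}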
diff --git a/drivers/clocksource/time-lpc32xx.c b/drivers/clocksource/time-lpc32xx.c
index 1316876b487a..daae61e8c820 100644
--- a/drivers/clocksource/time-lpc32xx.c
+++ b/drivers/clocksource/time-lpc32xx.c
@@ -18,6 +18,7 @@
18#include <linux/clk.h> 18#include <linux/clk.h>
19#include <linux/clockchips.h> 19#include <linux/clockchips.h>
20#include <linux/clocksource.h> 20#include <linux/clocksource.h>
21#include <linux/delay.h>
21#include <linux/interrupt.h> 22#include <linux/interrupt.h>
22#include <linux/irq.h> 23#include <linux/irq.h>
23#include <linux/kernel.h> 24#include <linux/kernel.h>
@@ -43,6 +44,7 @@
43struct lpc32xx_clock_event_ddata { 44struct lpc32xx_clock_event_ddata {
44 struct clock_event_device evtdev; 45 struct clock_event_device evtdev;
45 void __iomem *base; 46 void __iomem *base;
47 u32 ticks_per_jiffy;
46}; 48};
47 49
48/* Needed for the sched clock */ 50/* Needed for the sched clock */
@@ -53,6 +55,15 @@ static u64 notrace lpc32xx_read_sched_clock(void)
53 return readl(clocksource_timer_counter); 55 return readl(clocksource_timer_counter);
54} 56}
55 57
58static unsigned long lpc32xx_delay_timer_read(void)
59{
60 return readl(clocksource_timer_counter);
61}
62
63static struct delay_timer lpc32xx_delay_timer = {
64 .read_current_timer = lpc32xx_delay_timer_read,
65};
66
56static int lpc32xx_clkevt_next_event(unsigned long delta, 67static int lpc32xx_clkevt_next_event(unsigned long delta,
57 struct clock_event_device *evtdev) 68 struct clock_event_device *evtdev)
58{ 69{
@@ -60,14 +71,13 @@ static int lpc32xx_clkevt_next_event(unsigned long delta,
60 container_of(evtdev, struct lpc32xx_clock_event_ddata, evtdev); 71 container_of(evtdev, struct lpc32xx_clock_event_ddata, evtdev);
61 72
62 /* 73 /*
63 * Place timer in reset and program the delta in the prescale 74 * Place timer in reset and program the delta in the match
64 * register (PR). When the prescale counter matches the value 75 * channel 0 (MR0). When the timer counter matches the value
65 * in PR the counter register is incremented and the compare 76 * in MR0 register the match will trigger an interrupt.
66 * match will trigger. After setup the timer is released from 77 * After setup the timer is released from reset and enabled.
67 * reset and enabled.
68 */ 78 */
69 writel_relaxed(LPC32XX_TIMER_TCR_CRST, ddata->base + LPC32XX_TIMER_TCR); 79 writel_relaxed(LPC32XX_TIMER_TCR_CRST, ddata->base + LPC32XX_TIMER_TCR);
70 writel_relaxed(delta, ddata->base + LPC32XX_TIMER_PR); 80 writel_relaxed(delta, ddata->base + LPC32XX_TIMER_MR0);
71 writel_relaxed(LPC32XX_TIMER_TCR_CEN, ddata->base + LPC32XX_TIMER_TCR); 81 writel_relaxed(LPC32XX_TIMER_TCR_CEN, ddata->base + LPC32XX_TIMER_TCR);
72 82
73 return 0; 83 return 0;
@@ -86,11 +96,39 @@ static int lpc32xx_clkevt_shutdown(struct clock_event_device *evtdev)
86 96
87static int lpc32xx_clkevt_oneshot(struct clock_event_device *evtdev) 97static int lpc32xx_clkevt_oneshot(struct clock_event_device *evtdev)
88{ 98{
99 struct lpc32xx_clock_event_ddata *ddata =
100 container_of(evtdev, struct lpc32xx_clock_event_ddata, evtdev);
101
89 /* 102 /*
90 * When using oneshot, we must also disable the timer 103 * When using oneshot, we must also disable the timer
91 * to wait for the first call to set_next_event(). 104 * to wait for the first call to set_next_event().
92 */ 105 */
93 return lpc32xx_clkevt_shutdown(evtdev); 106 writel_relaxed(0, ddata->base + LPC32XX_TIMER_TCR);
107
108 /* Enable interrupt, reset on match and stop on match (MCR). */
109 writel_relaxed(LPC32XX_TIMER_MCR_MR0I | LPC32XX_TIMER_MCR_MR0R |
110 LPC32XX_TIMER_MCR_MR0S, ddata->base + LPC32XX_TIMER_MCR);
111 return 0;
112}
113
114static int lpc32xx_clkevt_periodic(struct clock_event_device *evtdev)
115{
116 struct lpc32xx_clock_event_ddata *ddata =
117 container_of(evtdev, struct lpc32xx_clock_event_ddata, evtdev);
118
119 /* Enable interrupt and reset on match. */
120 writel_relaxed(LPC32XX_TIMER_MCR_MR0I | LPC32XX_TIMER_MCR_MR0R,
121 ddata->base + LPC32XX_TIMER_MCR);
122
123 /*
124 * Place timer in reset and program the delta in the match
125 * channel 0 (MR0).
126 */
127 writel_relaxed(LPC32XX_TIMER_TCR_CRST, ddata->base + LPC32XX_TIMER_TCR);
128 writel_relaxed(ddata->ticks_per_jiffy, ddata->base + LPC32XX_TIMER_MR0);
129 writel_relaxed(LPC32XX_TIMER_TCR_CEN, ddata->base + LPC32XX_TIMER_TCR);
130
131 return 0;
94} 132}
95 133
96static irqreturn_t lpc32xx_clock_event_handler(int irq, void *dev_id) 134static irqreturn_t lpc32xx_clock_event_handler(int irq, void *dev_id)
@@ -108,11 +146,13 @@ static irqreturn_t lpc32xx_clock_event_handler(int irq, void *dev_id)
108static struct lpc32xx_clock_event_ddata lpc32xx_clk_event_ddata = { 146static struct lpc32xx_clock_event_ddata lpc32xx_clk_event_ddata = {
109 .evtdev = { 147 .evtdev = {
110 .name = "lpc3220 clockevent", 148 .name = "lpc3220 clockevent",
111 .features = CLOCK_EVT_FEAT_ONESHOT, 149 .features = CLOCK_EVT_FEAT_ONESHOT |
150 CLOCK_EVT_FEAT_PERIODIC,
112 .rating = 300, 151 .rating = 300,
113 .set_next_event = lpc32xx_clkevt_next_event, 152 .set_next_event = lpc32xx_clkevt_next_event,
114 .set_state_shutdown = lpc32xx_clkevt_shutdown, 153 .set_state_shutdown = lpc32xx_clkevt_shutdown,
115 .set_state_oneshot = lpc32xx_clkevt_oneshot, 154 .set_state_oneshot = lpc32xx_clkevt_oneshot,
155 .set_state_periodic = lpc32xx_clkevt_periodic,
116 }, 156 },
117}; 157};
118 158
@@ -162,6 +202,8 @@ static int __init lpc32xx_clocksource_init(struct device_node *np)
162 } 202 }
163 203
164 clocksource_timer_counter = base + LPC32XX_TIMER_TC; 204 clocksource_timer_counter = base + LPC32XX_TIMER_TC;
205 lpc32xx_delay_timer.freq = rate;
206 register_current_timer_delay(&lpc32xx_delay_timer);
165 sched_clock_register(lpc32xx_read_sched_clock, 32, rate); 207 sched_clock_register(lpc32xx_read_sched_clock, 32, rate);
166 208
167 return 0; 209 return 0;
@@ -210,18 +252,16 @@ static int __init lpc32xx_clockevent_init(struct device_node *np)
210 252
211 /* 253 /*
212 * Disable timer and clear any pending interrupt (IR) on match 254 * Disable timer and clear any pending interrupt (IR) on match
213 * channel 0 (MR0). Configure a compare match value of 1 on MR0 255 * channel 0 (MR0). Clear the prescaler as it's not used.
214 * and enable interrupt, reset on match and stop on match (MCR).
215 */ 256 */
216 writel_relaxed(0, base + LPC32XX_TIMER_TCR); 257 writel_relaxed(0, base + LPC32XX_TIMER_TCR);
258 writel_relaxed(0, base + LPC32XX_TIMER_PR);
217 writel_relaxed(0, base + LPC32XX_TIMER_CTCR); 259 writel_relaxed(0, base + LPC32XX_TIMER_CTCR);
218 writel_relaxed(LPC32XX_TIMER_IR_MR0INT, base + LPC32XX_TIMER_IR); 260 writel_relaxed(LPC32XX_TIMER_IR_MR0INT, base + LPC32XX_TIMER_IR);
219 writel_relaxed(1, base + LPC32XX_TIMER_MR0);
220 writel_relaxed(LPC32XX_TIMER_MCR_MR0I | LPC32XX_TIMER_MCR_MR0R |
221 LPC32XX_TIMER_MCR_MR0S, base + LPC32XX_TIMER_MCR);
222 261
223 rate = clk_get_rate(clk); 262 rate = clk_get_rate(clk);
224 lpc32xx_clk_event_ddata.base = base; 263 lpc32xx_clk_event_ddata.base = base;
264 lpc32xx_clk_event_ddata.ticks_per_jiffy = DIV_ROUND_CLOSEST(rate, HZ);
225 clockevents_config_and_register(&lpc32xx_clk_event_ddata.evtdev, 265 clockevents_config_and_register(&lpc32xx_clk_event_ddata.evtdev,
226 rate, 1, -1); 266 rate, 1, -1);
227 267
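Periodic mode in the LPC32xx change above is built on match channel 0: the match register is preloaded with one jiffy worth of ticks, and MCR is set to interrupt and auto-reload on match (MR0I | MR0R) without stopping the counter. A condensed sketch, assuming the register layout used in the hunk:

	/* One match period per scheduler tick. */
	ddata->ticks_per_jiffy = DIV_ROUND_CLOSEST(rate, HZ);

	/* Periodic: interrupt and reset the counter on every MR0 match. */
	writel_relaxed(LPC32XX_TIMER_MCR_MR0I | LPC32XX_TIMER_MCR_MR0R,
		       ddata->base + LPC32XX_TIMER_MCR);

	/* Hold the counter in reset while programming the match value ... */
	writel_relaxed(LPC32XX_TIMER_TCR_CRST, ddata->base + LPC32XX_TIMER_TCR);
	writel_relaxed(ddata->ticks_per_jiffy, ddata->base + LPC32XX_TIMER_MR0);
	/* ... then let it run. */
	writel_relaxed(LPC32XX_TIMER_TCR_CEN, ddata->base + LPC32XX_TIMER_TCR);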
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index cd83d477e32d..3a4b39afc0ab 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -1431,7 +1431,7 @@ static int __init intel_pstate_init(void)
1431 if (!all_cpu_data) 1431 if (!all_cpu_data)
1432 return -ENOMEM; 1432 return -ENOMEM;
1433 1433
1434 if (static_cpu_has_safe(X86_FEATURE_HWP) && !no_hwp) { 1434 if (static_cpu_has(X86_FEATURE_HWP) && !no_hwp) {
1435 pr_info("intel_pstate: HWP enabled\n"); 1435 pr_info("intel_pstate: HWP enabled\n");
1436 hwp_active++; 1436 hwp_active++;
1437 } 1437 }
diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig
index fb50911b3940..7e8c441ff2de 100644
--- a/drivers/irqchip/Kconfig
+++ b/drivers/irqchip/Kconfig
@@ -60,6 +60,17 @@ config ARM_VIC_NR
60 The maximum number of VICs available in the system, for 60 The maximum number of VICs available in the system, for
61 power management. 61 power management.
62 62
63config ARMADA_370_XP_IRQ
64 bool
65 select GENERIC_IRQ_CHIP
66 select PCI_MSI_IRQ_DOMAIN if PCI_MSI
67
68config ALPINE_MSI
69 bool
70 depends on PCI && PCI_MSI
71 select GENERIC_IRQ_CHIP
72 select PCI_MSI_IRQ_DOMAIN
73
63config ATMEL_AIC_IRQ 74config ATMEL_AIC_IRQ
64 bool 75 bool
65 select GENERIC_IRQ_CHIP 76 select GENERIC_IRQ_CHIP
@@ -78,6 +89,11 @@ config I8259
78 bool 89 bool
79 select IRQ_DOMAIN 90 select IRQ_DOMAIN
80 91
92config BCM6345_L1_IRQ
93 bool
94 select GENERIC_IRQ_CHIP
95 select IRQ_DOMAIN
96
81config BCM7038_L1_IRQ 97config BCM7038_L1_IRQ
82 bool 98 bool
83 select GENERIC_IRQ_CHIP 99 select GENERIC_IRQ_CHIP
@@ -151,6 +167,11 @@ config ST_IRQCHIP
151 help 167 help
152 Enables SysCfg Controlled IRQs on STi based platforms. 168 Enables SysCfg Controlled IRQs on STi based platforms.
153 169
170config TANGO_IRQ
171 bool
172 select IRQ_DOMAIN
173 select GENERIC_IRQ_CHIP
174
154config TB10X_IRQC 175config TB10X_IRQC
155 bool 176 bool
156 select IRQ_DOMAIN 177 select IRQ_DOMAIN
@@ -160,6 +181,7 @@ config TS4800_IRQ
160 tristate "TS-4800 IRQ controller" 181 tristate "TS-4800 IRQ controller"
161 select IRQ_DOMAIN 182 select IRQ_DOMAIN
162 depends on HAS_IOMEM 183 depends on HAS_IOMEM
184 depends on SOC_IMX51 || COMPILE_TEST
163 help 185 help
164 Support for the TS-4800 FPGA IRQ controller 186 Support for the TS-4800 FPGA IRQ controller
165 187
@@ -193,6 +215,8 @@ config KEYSTONE_IRQ
193 215
194config MIPS_GIC 216config MIPS_GIC
195 bool 217 bool
218 select GENERIC_IRQ_IPI
219 select IRQ_DOMAIN_HIERARCHY
196 select MIPS_CM 220 select MIPS_CM
197 221
198config INGENIC_IRQ 222config INGENIC_IRQ
@@ -218,3 +242,7 @@ config IRQ_MXS
218 def_bool y if MACH_ASM9260 || ARCH_MXS 242 def_bool y if MACH_ASM9260 || ARCH_MXS
219 select IRQ_DOMAIN 243 select IRQ_DOMAIN
220 select STMP_DEVICE 244 select STMP_DEVICE
245
246config MVEBU_ODMI
247 bool
248 select GENERIC_MSI_IRQ_DOMAIN
diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile
index 18caacb60d58..b03cfcbbac6b 100644
--- a/drivers/irqchip/Makefile
+++ b/drivers/irqchip/Makefile
@@ -1,11 +1,13 @@
1obj-$(CONFIG_IRQCHIP) += irqchip.o 1obj-$(CONFIG_IRQCHIP) += irqchip.o
2 2
3obj-$(CONFIG_ALPINE_MSI) += irq-alpine-msi.o
4obj-$(CONFIG_ATH79) += irq-ath79-cpu.o
5obj-$(CONFIG_ATH79) += irq-ath79-misc.o
3obj-$(CONFIG_ARCH_BCM2835) += irq-bcm2835.o 6obj-$(CONFIG_ARCH_BCM2835) += irq-bcm2835.o
4obj-$(CONFIG_ARCH_BCM2835) += irq-bcm2836.o 7obj-$(CONFIG_ARCH_BCM2835) += irq-bcm2836.o
5obj-$(CONFIG_ARCH_EXYNOS) += exynos-combiner.o 8obj-$(CONFIG_ARCH_EXYNOS) += exynos-combiner.o
6obj-$(CONFIG_ARCH_HIP04) += irq-hip04.o 9obj-$(CONFIG_ARCH_HIP04) += irq-hip04.o
7obj-$(CONFIG_ARCH_MMP) += irq-mmp.o 10obj-$(CONFIG_ARCH_MMP) += irq-mmp.o
8obj-$(CONFIG_ARCH_MVEBU) += irq-armada-370-xp.o
9obj-$(CONFIG_IRQ_MXS) += irq-mxs.o 11obj-$(CONFIG_IRQ_MXS) += irq-mxs.o
10obj-$(CONFIG_ARCH_TEGRA) += irq-tegra.o 12obj-$(CONFIG_ARCH_TEGRA) += irq-tegra.o
11obj-$(CONFIG_ARCH_S3C24XX) += irq-s3c24xx.o 13obj-$(CONFIG_ARCH_S3C24XX) += irq-s3c24xx.o
@@ -28,6 +30,7 @@ obj-$(CONFIG_ARM_GIC_V3_ITS) += irq-gic-v3-its.o irq-gic-v3-its-pci-msi.o irq-g
28obj-$(CONFIG_HISILICON_IRQ_MBIGEN) += irq-mbigen.o 30obj-$(CONFIG_HISILICON_IRQ_MBIGEN) += irq-mbigen.o
29obj-$(CONFIG_ARM_NVIC) += irq-nvic.o 31obj-$(CONFIG_ARM_NVIC) += irq-nvic.o
30obj-$(CONFIG_ARM_VIC) += irq-vic.o 32obj-$(CONFIG_ARM_VIC) += irq-vic.o
33obj-$(CONFIG_ARMADA_370_XP_IRQ) += irq-armada-370-xp.o
31obj-$(CONFIG_ATMEL_AIC_IRQ) += irq-atmel-aic-common.o irq-atmel-aic.o 34obj-$(CONFIG_ATMEL_AIC_IRQ) += irq-atmel-aic-common.o irq-atmel-aic.o
32obj-$(CONFIG_ATMEL_AIC5_IRQ) += irq-atmel-aic-common.o irq-atmel-aic5.o 35obj-$(CONFIG_ATMEL_AIC5_IRQ) += irq-atmel-aic-common.o irq-atmel-aic5.o
33obj-$(CONFIG_I8259) += irq-i8259.o 36obj-$(CONFIG_I8259) += irq-i8259.o
@@ -40,12 +43,14 @@ obj-$(CONFIG_VERSATILE_FPGA_IRQ) += irq-versatile-fpga.o
40obj-$(CONFIG_ARCH_NSPIRE) += irq-zevio.o 43obj-$(CONFIG_ARCH_NSPIRE) += irq-zevio.o
41obj-$(CONFIG_ARCH_VT8500) += irq-vt8500.o 44obj-$(CONFIG_ARCH_VT8500) += irq-vt8500.o
42obj-$(CONFIG_ST_IRQCHIP) += irq-st.o 45obj-$(CONFIG_ST_IRQCHIP) += irq-st.o
46obj-$(CONFIG_TANGO_IRQ) += irq-tango.o
43obj-$(CONFIG_TB10X_IRQC) += irq-tb10x.o 47obj-$(CONFIG_TB10X_IRQC) += irq-tb10x.o
44obj-$(CONFIG_TS4800_IRQ) += irq-ts4800.o 48obj-$(CONFIG_TS4800_IRQ) += irq-ts4800.o
45obj-$(CONFIG_XTENSA) += irq-xtensa-pic.o 49obj-$(CONFIG_XTENSA) += irq-xtensa-pic.o
46obj-$(CONFIG_XTENSA_MX) += irq-xtensa-mx.o 50obj-$(CONFIG_XTENSA_MX) += irq-xtensa-mx.o
47obj-$(CONFIG_IRQ_CROSSBAR) += irq-crossbar.o 51obj-$(CONFIG_IRQ_CROSSBAR) += irq-crossbar.o
48obj-$(CONFIG_SOC_VF610) += irq-vf610-mscm-ir.o 52obj-$(CONFIG_SOC_VF610) += irq-vf610-mscm-ir.o
53obj-$(CONFIG_BCM6345_L1_IRQ) += irq-bcm6345-l1.o
49obj-$(CONFIG_BCM7038_L1_IRQ) += irq-bcm7038-l1.o 54obj-$(CONFIG_BCM7038_L1_IRQ) += irq-bcm7038-l1.o
50obj-$(CONFIG_BCM7120_L2_IRQ) += irq-bcm7120-l2.o 55obj-$(CONFIG_BCM7120_L2_IRQ) += irq-bcm7120-l2.o
51obj-$(CONFIG_BRCMSTB_L2_IRQ) += irq-brcmstb-l2.o 56obj-$(CONFIG_BRCMSTB_L2_IRQ) += irq-brcmstb-l2.o
@@ -59,3 +64,4 @@ obj-$(CONFIG_ARCH_SA1100) += irq-sa11x0.o
59obj-$(CONFIG_INGENIC_IRQ) += irq-ingenic.o 64obj-$(CONFIG_INGENIC_IRQ) += irq-ingenic.o
60obj-$(CONFIG_IMX_GPCV2) += irq-imx-gpcv2.o 65obj-$(CONFIG_IMX_GPCV2) += irq-imx-gpcv2.o
61obj-$(CONFIG_PIC32_EVIC) += irq-pic32-evic.o 66obj-$(CONFIG_PIC32_EVIC) += irq-pic32-evic.o
67obj-$(CONFIG_MVEBU_ODMI) += irq-mvebu-odmi.o
diff --git a/drivers/irqchip/irq-alpine-msi.c b/drivers/irqchip/irq-alpine-msi.c
new file mode 100644
index 000000000000..25384255b30f
--- /dev/null
+++ b/drivers/irqchip/irq-alpine-msi.c
@@ -0,0 +1,293 @@
1/*
2 * Annapurna Labs MSIX support services
3 *
4 * Copyright (C) 2016, Amazon.com, Inc. or its affiliates. All Rights Reserved.
5 *
6 * Antoine Tenart <antoine.tenart@free-electrons.com>
7 *
8 * This file is licensed under the terms of the GNU General Public
9 * License version 2. This program is licensed "as is" without any
10 * warranty of any kind, whether express or implied.
11 */
12
13#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14
15#include <linux/irqchip.h>
16#include <linux/irqchip/arm-gic.h>
17#include <linux/msi.h>
18#include <linux/of.h>
19#include <linux/of_address.h>
20#include <linux/of_irq.h>
21#include <linux/of_pci.h>
22#include <linux/pci.h>
23#include <linux/slab.h>
24
25#include <asm/irq.h>
26#include <asm-generic/msi.h>
27
28/* MSIX message address format: local GIC target */
29#define ALPINE_MSIX_SPI_TARGET_CLUSTER0 BIT(16)
30
31struct alpine_msix_data {
32 spinlock_t msi_map_lock;
33 phys_addr_t addr;
34 u32 spi_first; /* The SGI number that MSIs start */
35 u32 num_spis; /* The number of SGIs for MSIs */
36 unsigned long *msi_map;
37};
38
39static void alpine_msix_mask_msi_irq(struct irq_data *d)
40{
41 pci_msi_mask_irq(d);
42 irq_chip_mask_parent(d);
43}
44
45static void alpine_msix_unmask_msi_irq(struct irq_data *d)
46{
47 pci_msi_unmask_irq(d);
48 irq_chip_unmask_parent(d);
49}
50
51static struct irq_chip alpine_msix_irq_chip = {
52 .name = "MSIx",
53 .irq_mask = alpine_msix_mask_msi_irq,
54 .irq_unmask = alpine_msix_unmask_msi_irq,
55 .irq_eoi = irq_chip_eoi_parent,
56 .irq_set_affinity = irq_chip_set_affinity_parent,
57};
58
59static int alpine_msix_allocate_sgi(struct alpine_msix_data *priv, int num_req)
60{
61 int first;
62
63 spin_lock(&priv->msi_map_lock);
64
65 first = bitmap_find_next_zero_area(priv->msi_map, priv->num_spis, 0,
66 num_req, 0);
67 if (first >= priv->num_spis) {
68 spin_unlock(&priv->msi_map_lock);
69 return -ENOSPC;
70 }
71
72 bitmap_set(priv->msi_map, first, num_req);
73
74 spin_unlock(&priv->msi_map_lock);
75
76 return priv->spi_first + first;
77}
78
79static void alpine_msix_free_sgi(struct alpine_msix_data *priv, unsigned sgi,
80 int num_req)
81{
82 int first = sgi - priv->spi_first;
83
84 spin_lock(&priv->msi_map_lock);
85
86 bitmap_clear(priv->msi_map, first, num_req);
87
88 spin_unlock(&priv->msi_map_lock);
89}
90
91static void alpine_msix_compose_msi_msg(struct irq_data *data,
92 struct msi_msg *msg)
93{
94 struct alpine_msix_data *priv = irq_data_get_irq_chip_data(data);
95 phys_addr_t msg_addr = priv->addr;
96
97 msg_addr |= (data->hwirq << 3);
98
99 msg->address_hi = upper_32_bits(msg_addr);
100 msg->address_lo = lower_32_bits(msg_addr);
101 msg->data = 0;
102}
103
104static struct msi_domain_info alpine_msix_domain_info = {
105 .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
106 MSI_FLAG_PCI_MSIX,
107 .chip = &alpine_msix_irq_chip,
108};
109
110static struct irq_chip middle_irq_chip = {
111 .name = "alpine_msix_middle",
112 .irq_mask = irq_chip_mask_parent,
113 .irq_unmask = irq_chip_unmask_parent,
114 .irq_eoi = irq_chip_eoi_parent,
115 .irq_set_affinity = irq_chip_set_affinity_parent,
116 .irq_compose_msi_msg = alpine_msix_compose_msi_msg,
117};
118
119static int alpine_msix_gic_domain_alloc(struct irq_domain *domain,
120 unsigned int virq, int sgi)
121{
122 struct irq_fwspec fwspec;
123 struct irq_data *d;
124 int ret;
125
126 if (!is_of_node(domain->parent->fwnode))
127 return -EINVAL;
128
129 fwspec.fwnode = domain->parent->fwnode;
130 fwspec.param_count = 3;
131 fwspec.param[0] = 0;
132 fwspec.param[1] = sgi;
133 fwspec.param[2] = IRQ_TYPE_EDGE_RISING;
134
135 ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &fwspec);
136 if (ret)
137 return ret;
138
139 d = irq_domain_get_irq_data(domain->parent, virq);
140 d->chip->irq_set_type(d, IRQ_TYPE_EDGE_RISING);
141
142 return 0;
143}
144
145static int alpine_msix_middle_domain_alloc(struct irq_domain *domain,
146 unsigned int virq,
147 unsigned int nr_irqs, void *args)
148{
149 struct alpine_msix_data *priv = domain->host_data;
150 int sgi, err, i;
151
152 sgi = alpine_msix_allocate_sgi(priv, nr_irqs);
153 if (sgi < 0)
154 return sgi;
155
156 for (i = 0; i < nr_irqs; i++) {
157 err = alpine_msix_gic_domain_alloc(domain, virq + i, sgi + i);
158 if (err)
159 goto err_sgi;
160
161 irq_domain_set_hwirq_and_chip(domain, virq + i, sgi + i,
162 &middle_irq_chip, priv);
163 }
164
165 return 0;
166
167err_sgi:
168 while (--i >= 0)
169 irq_domain_free_irqs_parent(domain, virq, i);
170 alpine_msix_free_sgi(priv, sgi, nr_irqs);
171 return err;
172}
173
174static void alpine_msix_middle_domain_free(struct irq_domain *domain,
175 unsigned int virq,
176 unsigned int nr_irqs)
177{
178 struct irq_data *d = irq_domain_get_irq_data(domain, virq);
179 struct alpine_msix_data *priv = irq_data_get_irq_chip_data(d);
180
181 irq_domain_free_irqs_parent(domain, virq, nr_irqs);
182 alpine_msix_free_sgi(priv, d->hwirq, nr_irqs);
183}
184
185static const struct irq_domain_ops alpine_msix_middle_domain_ops = {
186 .alloc = alpine_msix_middle_domain_alloc,
187 .free = alpine_msix_middle_domain_free,
188};
189
190static int alpine_msix_init_domains(struct alpine_msix_data *priv,
191 struct device_node *node)
192{
193 struct irq_domain *middle_domain, *msi_domain, *gic_domain;
194 struct device_node *gic_node;
195
196 gic_node = of_irq_find_parent(node);
197 if (!gic_node) {
198 pr_err("Failed to find the GIC node\n");
199 return -ENODEV;
200 }
201
202 gic_domain = irq_find_host(gic_node);
203 if (!gic_domain) {
204 pr_err("Failed to find the GIC domain\n");
205 return -ENXIO;
206 }
207
208 middle_domain = irq_domain_add_tree(NULL,
209 &alpine_msix_middle_domain_ops,
210 priv);
211 if (!middle_domain) {
212 pr_err("Failed to create the MSIX middle domain\n");
213 return -ENOMEM;
214 }
215
216 middle_domain->parent = gic_domain;
217
218 msi_domain = pci_msi_create_irq_domain(of_node_to_fwnode(node),
219 &alpine_msix_domain_info,
220 middle_domain);
221 if (!msi_domain) {
222 pr_err("Failed to create MSI domain\n");
223 irq_domain_remove(middle_domain);
224 return -ENOMEM;
225 }
226
227 return 0;
228}
229
230static int alpine_msix_init(struct device_node *node,
231 struct device_node *parent)
232{
233 struct alpine_msix_data *priv;
234 struct resource res;
235 int ret;
236
237 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
238 if (!priv)
239 return -ENOMEM;
240
241 spin_lock_init(&priv->msi_map_lock);
242
243 ret = of_address_to_resource(node, 0, &res);
244 if (ret) {
245 pr_err("Failed to allocate resource\n");
246 goto err_priv;
247 }
248
249 /*
250 * The 20 least significant bits of addr provide direct information
251 * regarding the interrupt destination.
252 *
253 * To select the primary GIC as the target GIC, bits [18:17] must be set
254 * to 0x0. In this case, bit 16 (SPI_TARGET_CLUSTER0) must be set.
255 */
256 priv->addr = res.start & GENMASK_ULL(63,20);
257 priv->addr |= ALPINE_MSIX_SPI_TARGET_CLUSTER0;
258
259 if (of_property_read_u32(node, "al,msi-base-spi", &priv->spi_first)) {
260 pr_err("Unable to parse MSI base\n");
261 ret = -EINVAL;
262 goto err_priv;
263 }
264
265 if (of_property_read_u32(node, "al,msi-num-spis", &priv->num_spis)) {
266 pr_err("Unable to parse MSI numbers\n");
267 ret = -EINVAL;
268 goto err_priv;
269 }
270
271 priv->msi_map = kzalloc(sizeof(*priv->msi_map) * BITS_TO_LONGS(priv->num_spis),
272 GFP_KERNEL);
273 if (!priv->msi_map) {
274 ret = -ENOMEM;
275 goto err_priv;
276 }
277
278 pr_debug("Registering %d msixs, starting at %d\n",
279 priv->num_spis, priv->spi_first);
280
281 ret = alpine_msix_init_domains(priv, node);
282 if (ret)
283 goto err_map;
284
285 return 0;
286
287err_map:
288 kfree(priv->msi_map);
289err_priv:
290 kfree(priv);
291 return ret;
292}
293IRQCHIP_DECLARE(alpine_msix, "al,alpine-msix", alpine_msix_init);
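For readers unfamiliar with the scheme above: alpine_msix_compose_msi_msg() puts the allocated SPI number into the low bits of the doorbell address (hwirq shifted left by 3) rather than into the MSI data word, with bit 16 selecting the primary GIC cluster. Below is a minimal userspace sketch of that encoding, for illustration only; the register base and SPI number are made-up example values, not taken from the patch.

/*
 * Illustration only, not part of the patch: how the Alpine MSI-X message
 * address is built from the DT resource base and the allocated SPI (hwirq),
 * mirroring alpine_msix_compose_msi_msg(). 0xfbe00000 and SPI 96 are
 * hypothetical example values.
 */
#include <stdint.h>
#include <stdio.h>

#define ALPINE_MSIX_SPI_TARGET_CLUSTER0 (1u << 16)

int main(void)
{
	uint64_t res_start = 0xfbe00000ull;		/* hypothetical reg base */
	uint64_t addr = res_start & ~((1ull << 20) - 1);	/* GENMASK_ULL(63, 20) */
	uint32_t hwirq = 96;				/* example SPI number */
	uint64_t msg_addr;

	addr |= ALPINE_MSIX_SPI_TARGET_CLUSTER0;	/* bits [18:17] = 0, bit 16 = 1 */
	msg_addr = addr | ((uint64_t)hwirq << 3);	/* hwirq selects the SPI */

	printf("address_hi=0x%08x address_lo=0x%08x data=0\n",
	       (uint32_t)(msg_addr >> 32), (uint32_t)msg_addr);
	return 0;
}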
diff --git a/drivers/irqchip/irq-armada-370-xp.c b/drivers/irqchip/irq-armada-370-xp.c
index 3f3a8c3d2175..e7dc6cbda2a1 100644
--- a/drivers/irqchip/irq-armada-370-xp.c
+++ b/drivers/irqchip/irq-armada-370-xp.c
@@ -71,6 +71,7 @@ static u32 doorbell_mask_reg;
 static int parent_irq;
 #ifdef CONFIG_PCI_MSI
 static struct irq_domain *armada_370_xp_msi_domain;
+static struct irq_domain *armada_370_xp_msi_inner_domain;
 static DECLARE_BITMAP(msi_used, PCI_MSI_DOORBELL_NR);
 static DEFINE_MUTEX(msi_used_lock);
 static phys_addr_t msi_doorbell_addr;
@@ -115,127 +116,102 @@ static void armada_370_xp_irq_unmask(struct irq_data *d)
115 116
116#ifdef CONFIG_PCI_MSI 117#ifdef CONFIG_PCI_MSI
117 118
118static int armada_370_xp_alloc_msi(void) 119static struct irq_chip armada_370_xp_msi_irq_chip = {
119{ 120 .name = "MPIC MSI",
120 int hwirq; 121 .irq_mask = pci_msi_mask_irq,
122 .irq_unmask = pci_msi_unmask_irq,
123};
121 124
122 mutex_lock(&msi_used_lock); 125static struct msi_domain_info armada_370_xp_msi_domain_info = {
123 hwirq = find_first_zero_bit(&msi_used, PCI_MSI_DOORBELL_NR); 126 .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
124 if (hwirq >= PCI_MSI_DOORBELL_NR) 127 MSI_FLAG_MULTI_PCI_MSI),
125 hwirq = -ENOSPC; 128 .chip = &armada_370_xp_msi_irq_chip,
126 else 129};
127 set_bit(hwirq, msi_used);
128 mutex_unlock(&msi_used_lock);
129 130
130 return hwirq; 131static void armada_370_xp_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
132{
133 msg->address_lo = lower_32_bits(msi_doorbell_addr);
134 msg->address_hi = upper_32_bits(msi_doorbell_addr);
135 msg->data = 0xf00 | (data->hwirq + PCI_MSI_DOORBELL_START);
131} 136}
132 137
133static void armada_370_xp_free_msi(int hwirq) 138static int armada_370_xp_msi_set_affinity(struct irq_data *irq_data,
139 const struct cpumask *mask, bool force)
134{ 140{
135 mutex_lock(&msi_used_lock); 141 return -EINVAL;
136 if (!test_bit(hwirq, msi_used))
137 pr_err("trying to free unused MSI#%d\n", hwirq);
138 else
139 clear_bit(hwirq, msi_used);
140 mutex_unlock(&msi_used_lock);
141} 142}
142 143
143static int armada_370_xp_setup_msi_irq(struct msi_controller *chip, 144static struct irq_chip armada_370_xp_msi_bottom_irq_chip = {
144 struct pci_dev *pdev, 145 .name = "MPIC MSI",
145 struct msi_desc *desc) 146 .irq_compose_msi_msg = armada_370_xp_compose_msi_msg,
146{ 147 .irq_set_affinity = armada_370_xp_msi_set_affinity,
147 struct msi_msg msg; 148};
148 int virq, hwirq;
149 149
150 /* We support MSI, but not MSI-X */ 150static int armada_370_xp_msi_alloc(struct irq_domain *domain, unsigned int virq,
151 if (desc->msi_attrib.is_msix) 151 unsigned int nr_irqs, void *args)
152 return -EINVAL; 152{
153 int hwirq, i;
153 154
154 hwirq = armada_370_xp_alloc_msi(); 155 mutex_lock(&msi_used_lock);
155 if (hwirq < 0)
156 return hwirq;
157 156
158 virq = irq_create_mapping(armada_370_xp_msi_domain, hwirq); 157 hwirq = bitmap_find_next_zero_area(msi_used, PCI_MSI_DOORBELL_NR,
159 if (!virq) { 158 0, nr_irqs, 0);
160 armada_370_xp_free_msi(hwirq); 159 if (hwirq >= PCI_MSI_DOORBELL_NR) {
161 return -EINVAL; 160 mutex_unlock(&msi_used_lock);
161 return -ENOSPC;
162 } 162 }
163 163
164 irq_set_msi_desc(virq, desc); 164 bitmap_set(msi_used, hwirq, nr_irqs);
165 165 mutex_unlock(&msi_used_lock);
166 msg.address_lo = msi_doorbell_addr;
167 msg.address_hi = 0;
168 msg.data = 0xf00 | (hwirq + 16);
169
170 pci_write_msi_msg(virq, &msg);
171 return 0;
172}
173 166
174static void armada_370_xp_teardown_msi_irq(struct msi_controller *chip, 167 for (i = 0; i < nr_irqs; i++) {
175 unsigned int irq) 168 irq_domain_set_info(domain, virq + i, hwirq + i,
176{ 169 &armada_370_xp_msi_bottom_irq_chip,
177 struct irq_data *d = irq_get_irq_data(irq); 170 domain->host_data, handle_simple_irq,
178 unsigned long hwirq = d->hwirq; 171 NULL, NULL);
172 }
179 173
180 irq_dispose_mapping(irq); 174 return hwirq;
181 armada_370_xp_free_msi(hwirq);
182} 175}
183 176
184static struct irq_chip armada_370_xp_msi_irq_chip = { 177static void armada_370_xp_msi_free(struct irq_domain *domain,
185 .name = "armada_370_xp_msi_irq", 178 unsigned int virq, unsigned int nr_irqs)
186 .irq_enable = pci_msi_unmask_irq,
187 .irq_disable = pci_msi_mask_irq,
188 .irq_mask = pci_msi_mask_irq,
189 .irq_unmask = pci_msi_unmask_irq,
190};
191
192static int armada_370_xp_msi_map(struct irq_domain *domain, unsigned int virq,
193 irq_hw_number_t hw)
194{ 179{
195 irq_set_chip_and_handler(virq, &armada_370_xp_msi_irq_chip, 180 struct irq_data *d = irq_domain_get_irq_data(domain, virq);
196 handle_simple_irq);
197 181
198 return 0; 182 mutex_lock(&msi_used_lock);
183 bitmap_clear(msi_used, d->hwirq, nr_irqs);
184 mutex_unlock(&msi_used_lock);
199} 185}
200 186
201static const struct irq_domain_ops armada_370_xp_msi_irq_ops = { 187static const struct irq_domain_ops armada_370_xp_msi_domain_ops = {
202 .map = armada_370_xp_msi_map, 188 .alloc = armada_370_xp_msi_alloc,
189 .free = armada_370_xp_msi_free,
203}; 190};
204 191
205static int armada_370_xp_msi_init(struct device_node *node, 192static int armada_370_xp_msi_init(struct device_node *node,
206 phys_addr_t main_int_phys_base) 193 phys_addr_t main_int_phys_base)
207{ 194{
208 struct msi_controller *msi_chip;
209 u32 reg; 195 u32 reg;
210 int ret;
211 196
212 msi_doorbell_addr = main_int_phys_base + 197 msi_doorbell_addr = main_int_phys_base +
213 ARMADA_370_XP_SW_TRIG_INT_OFFS; 198 ARMADA_370_XP_SW_TRIG_INT_OFFS;
214 199
215 msi_chip = kzalloc(sizeof(*msi_chip), GFP_KERNEL); 200 armada_370_xp_msi_inner_domain =
216 if (!msi_chip) 201 irq_domain_add_linear(NULL, PCI_MSI_DOORBELL_NR,
202 &armada_370_xp_msi_domain_ops, NULL);
203 if (!armada_370_xp_msi_inner_domain)
217 return -ENOMEM; 204 return -ENOMEM;
218 205
219 msi_chip->setup_irq = armada_370_xp_setup_msi_irq;
220 msi_chip->teardown_irq = armada_370_xp_teardown_msi_irq;
221 msi_chip->of_node = node;
222
223 armada_370_xp_msi_domain = 206 armada_370_xp_msi_domain =
224 irq_domain_add_linear(NULL, PCI_MSI_DOORBELL_NR, 207 pci_msi_create_irq_domain(of_node_to_fwnode(node),
225 &armada_370_xp_msi_irq_ops, 208 &armada_370_xp_msi_domain_info,
226 NULL); 209 armada_370_xp_msi_inner_domain);
227 if (!armada_370_xp_msi_domain) { 210 if (!armada_370_xp_msi_domain) {
228 kfree(msi_chip); 211 irq_domain_remove(armada_370_xp_msi_inner_domain);
229 return -ENOMEM; 212 return -ENOMEM;
230 } 213 }
231 214
232 ret = of_pci_msi_chip_add(msi_chip);
233 if (ret < 0) {
234 irq_domain_remove(armada_370_xp_msi_domain);
235 kfree(msi_chip);
236 return ret;
237 }
238
239 reg = readl(per_cpu_int_base + ARMADA_370_XP_IN_DRBEL_MSK_OFFS) 215 reg = readl(per_cpu_int_base + ARMADA_370_XP_IN_DRBEL_MSK_OFFS)
240 | PCI_MSI_DOORBELL_MASK; 216 | PCI_MSI_DOORBELL_MASK;
241 217
@@ -280,7 +256,7 @@ static int armada_xp_set_affinity(struct irq_data *d,
 #endif
 
 static struct irq_chip armada_370_xp_irq_chip = {
-	.name		= "armada_370_xp_irq",
+	.name		= "MPIC",
 	.irq_mask	= armada_370_xp_irq_mask,
 	.irq_mask_ack	= armada_370_xp_irq_mask,
 	.irq_unmask	= armada_370_xp_irq_unmask,
@@ -427,12 +403,12 @@ static void armada_370_xp_handle_msi_irq(struct pt_regs *regs, bool is_chained)
 			continue;
 
 		if (is_chained) {
-			irq = irq_find_mapping(armada_370_xp_msi_domain,
-					       msinr - 16);
+			irq = irq_find_mapping(armada_370_xp_msi_inner_domain,
+					       msinr - PCI_MSI_DOORBELL_START);
 			generic_handle_irq(irq);
 		} else {
-			irq = msinr - 16;
-			handle_domain_irq(armada_370_xp_msi_domain,
+			irq = msinr - PCI_MSI_DOORBELL_START;
+			handle_domain_irq(armada_370_xp_msi_inner_domain,
 					  irq, regs);
 		}
 	}
@@ -604,8 +580,8 @@ static int __init armada_370_xp_mpic_of_init(struct device_node *node,
 	armada_370_xp_mpic_domain =
 		irq_domain_add_linear(node, nr_irqs,
 				&armada_370_xp_mpic_irq_ops, NULL);
-
 	BUG_ON(!armada_370_xp_mpic_domain);
+	armada_370_xp_mpic_domain->bus_token = DOMAIN_BUS_WIRED;
 
 	/* Setup for the boot CPU */
 	armada_xp_mpic_perf_init();
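The MSI rework in this file replaces the old one-at-a-time allocator with a search for a contiguous run of free doorbell slots (needed for MSI_FLAG_MULTI_PCI_MSI) and encodes each hardware IRQ into the MSI data word as 0xf00 | (hwirq + PCI_MSI_DOORBELL_START). A standalone sketch of that idea follows; the helper below is a naive stand-in for bitmap_find_next_zero_area(), and the doorbell size and request count are invented example values.

/*
 * Illustration only: contiguous doorbell-slot allocation plus the MSI data
 * encoding used by armada_370_xp_compose_msi_msg(). Sizes are examples.
 */
#include <stdio.h>

#define DOORBELL_NR	16
#define DOORBELL_START	16

static int find_zero_area(const unsigned int *used, int size, int nr)
{
	for (int start = 0; start + nr <= size; start++) {
		int free = 1;

		for (int i = 0; i < nr; i++)
			if (used[start + i])
				free = 0;
		if (free)
			return start;
	}
	return -1;
}

int main(void)
{
	unsigned int used[DOORBELL_NR] = { 0 };
	int hwirq = find_zero_area(used, DOORBELL_NR, 4);	/* 4 MSIs requested */

	if (hwirq < 0)
		return 1;
	for (int i = 0; i < 4; i++) {
		used[hwirq + i] = 1;	/* bitmap_set() equivalent */
		printf("hwirq %d -> MSI data 0x%x\n", hwirq + i,
		       0xf00 | (hwirq + i + DOORBELL_START));
	}
	return 0;
}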
diff --git a/drivers/irqchip/irq-ath79-cpu.c b/drivers/irqchip/irq-ath79-cpu.c
new file mode 100644
index 000000000000..befe93c5a51a
--- /dev/null
+++ b/drivers/irqchip/irq-ath79-cpu.c
@@ -0,0 +1,97 @@
1/*
2 * Atheros AR71xx/AR724x/AR913x specific interrupt handling
3 *
4 * Copyright (C) 2015 Alban Bedel <albeu@free.fr>
5 * Copyright (C) 2010-2011 Jaiganesh Narayanan <jnarayanan@atheros.com>
6 * Copyright (C) 2008-2011 Gabor Juhos <juhosg@openwrt.org>
7 * Copyright (C) 2008 Imre Kaloz <kaloz@openwrt.org>
8 *
9 * Parts of this file are based on Atheros' 2.6.15/2.6.31 BSP
10 *
11 * This program is free software; you can redistribute it and/or modify it
12 * under the terms of the GNU General Public License version 2 as published
13 * by the Free Software Foundation.
14 */
15
16#include <linux/interrupt.h>
17#include <linux/irqchip.h>
18#include <linux/of.h>
19
20#include <asm/irq_cpu.h>
21#include <asm/mach-ath79/ath79.h>
22
23/*
24 * The IP2/IP3 lines are tied to a PCI/WMAC/USB device. Drivers for
25 * these devices typically allocate coherent DMA memory, however the
26 * DMA controller may still have some unsynchronized data in the FIFO.
27 * Issue a flush in the handlers to ensure that the driver sees
28 * the update.
29 *
30 * This array maps the interrupt lines to the DDR write buffer channels.
31 */
32
33static unsigned irq_wb_chan[8] = {
34 -1, -1, -1, -1, -1, -1, -1, -1,
35};
36
37asmlinkage void plat_irq_dispatch(void)
38{
39 unsigned long pending;
40 int irq;
41
42 pending = read_c0_status() & read_c0_cause() & ST0_IM;
43
44 if (!pending) {
45 spurious_interrupt();
46 return;
47 }
48
49 pending >>= CAUSEB_IP;
50 while (pending) {
51 irq = fls(pending) - 1;
52 if (irq < ARRAY_SIZE(irq_wb_chan) && irq_wb_chan[irq] != -1)
53 ath79_ddr_wb_flush(irq_wb_chan[irq]);
54 do_IRQ(MIPS_CPU_IRQ_BASE + irq);
55 pending &= ~BIT(irq);
56 }
57}
58
59static int __init ar79_cpu_intc_of_init(
60 struct device_node *node, struct device_node *parent)
61{
62 int err, i, count;
63
64 /* Fill the irq_wb_chan table */
65 count = of_count_phandle_with_args(
66 node, "qca,ddr-wb-channels", "#qca,ddr-wb-channel-cells");
67
68 for (i = 0; i < count; i++) {
69 struct of_phandle_args args;
70 u32 irq = i;
71
72 of_property_read_u32_index(
73 node, "qca,ddr-wb-channel-interrupts", i, &irq);
74 if (irq >= ARRAY_SIZE(irq_wb_chan))
75 continue;
76
77 err = of_parse_phandle_with_args(
78 node, "qca,ddr-wb-channels",
79 "#qca,ddr-wb-channel-cells",
80 i, &args);
81 if (err)
82 return err;
83
84 irq_wb_chan[irq] = args.args[0];
85 }
86
87 return mips_cpu_irq_of_init(node, parent);
88}
89IRQCHIP_DECLARE(ar79_cpu_intc, "qca,ar7100-cpu-intc",
90 ar79_cpu_intc_of_init);
91
92void __init ath79_cpu_irq_init(unsigned irq_wb_chan2, unsigned irq_wb_chan3)
93{
94 irq_wb_chan[2] = irq_wb_chan2;
95 irq_wb_chan[3] = irq_wb_chan3;
96 mips_cpu_irq_init();
97}
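plat_irq_dispatch() above services the highest pending CPU interrupt line first: it ANDs c0_status with c0_cause, shifts out the IM field (CAUSEB_IP is 8), and walks the remaining bits with fls(), flushing the matching DDR write-buffer channel before calling do_IRQ(). A small userspace sketch of that loop, with an invented pending mask and a local stand-in for fls():

/* Illustration only: dispatch order of the pending-bit loop above. */
#include <stdio.h>

static int fls32(unsigned long x)
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

int main(void)
{
	unsigned long pending = 0x8c00 >> 8;	/* pretend IP2, IP3 and IP7 are pending */

	while (pending) {
		int irq = fls32(pending) - 1;	/* highest line first */

		printf("do_IRQ(MIPS_CPU_IRQ_BASE + %d)\n", irq);
		pending &= ~(1ul << irq);
	}
	return 0;
}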
diff --git a/drivers/irqchip/irq-ath79-misc.c b/drivers/irqchip/irq-ath79-misc.c
new file mode 100644
index 000000000000..aa7290784636
--- /dev/null
+++ b/drivers/irqchip/irq-ath79-misc.c
@@ -0,0 +1,189 @@
1/*
2 * Atheros AR71xx/AR724x/AR913x MISC interrupt controller
3 *
4 * Copyright (C) 2015 Alban Bedel <albeu@free.fr>
5 * Copyright (C) 2010-2011 Jaiganesh Narayanan <jnarayanan@atheros.com>
6 * Copyright (C) 2008-2011 Gabor Juhos <juhosg@openwrt.org>
7 * Copyright (C) 2008 Imre Kaloz <kaloz@openwrt.org>
8 *
9 * Parts of this file are based on Atheros' 2.6.15/2.6.31 BSP
10 *
11 * This program is free software; you can redistribute it and/or modify it
12 * under the terms of the GNU General Public License version 2 as published
13 * by the Free Software Foundation.
14 */
15
16#include <linux/irqchip.h>
17#include <linux/irqchip/chained_irq.h>
18#include <linux/of_address.h>
19#include <linux/of_irq.h>
20
21#define AR71XX_RESET_REG_MISC_INT_STATUS 0
22#define AR71XX_RESET_REG_MISC_INT_ENABLE 4
23
24#define ATH79_MISC_IRQ_COUNT 32
25
26static void ath79_misc_irq_handler(struct irq_desc *desc)
27{
28 struct irq_domain *domain = irq_desc_get_handler_data(desc);
29 struct irq_chip *chip = irq_desc_get_chip(desc);
30 void __iomem *base = domain->host_data;
31 u32 pending;
32
33 chained_irq_enter(chip, desc);
34
35 pending = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_STATUS) &
36 __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE);
37
38 if (!pending) {
39 spurious_interrupt();
40 chained_irq_exit(chip, desc);
41 return;
42 }
43
44 while (pending) {
45 int bit = __ffs(pending);
46
47 generic_handle_irq(irq_linear_revmap(domain, bit));
48 pending &= ~BIT(bit);
49 }
50
51 chained_irq_exit(chip, desc);
52}
53
54static void ar71xx_misc_irq_unmask(struct irq_data *d)
55{
56 void __iomem *base = irq_data_get_irq_chip_data(d);
57 unsigned int irq = d->hwirq;
58 u32 t;
59
60 t = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE);
61 __raw_writel(t | BIT(irq), base + AR71XX_RESET_REG_MISC_INT_ENABLE);
62
63 /* flush write */
64 __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE);
65}
66
67static void ar71xx_misc_irq_mask(struct irq_data *d)
68{
69 void __iomem *base = irq_data_get_irq_chip_data(d);
70 unsigned int irq = d->hwirq;
71 u32 t;
72
73 t = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE);
74 __raw_writel(t & ~BIT(irq), base + AR71XX_RESET_REG_MISC_INT_ENABLE);
75
76 /* flush write */
77 __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE);
78}
79
80static void ar724x_misc_irq_ack(struct irq_data *d)
81{
82 void __iomem *base = irq_data_get_irq_chip_data(d);
83 unsigned int irq = d->hwirq;
84 u32 t;
85
86 t = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_STATUS);
87 __raw_writel(t & ~BIT(irq), base + AR71XX_RESET_REG_MISC_INT_STATUS);
88
89 /* flush write */
90 __raw_readl(base + AR71XX_RESET_REG_MISC_INT_STATUS);
91}
92
93static struct irq_chip ath79_misc_irq_chip = {
94 .name = "MISC",
95 .irq_unmask = ar71xx_misc_irq_unmask,
96 .irq_mask = ar71xx_misc_irq_mask,
97};
98
99static int misc_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hw)
100{
101 irq_set_chip_and_handler(irq, &ath79_misc_irq_chip, handle_level_irq);
102 irq_set_chip_data(irq, d->host_data);
103 return 0;
104}
105
106static const struct irq_domain_ops misc_irq_domain_ops = {
107 .xlate = irq_domain_xlate_onecell,
108 .map = misc_map,
109};
110
111static void __init ath79_misc_intc_domain_init(
112 struct irq_domain *domain, int irq)
113{
114 void __iomem *base = domain->host_data;
115
116 /* Disable and clear all interrupts */
117 __raw_writel(0, base + AR71XX_RESET_REG_MISC_INT_ENABLE);
118 __raw_writel(0, base + AR71XX_RESET_REG_MISC_INT_STATUS);
119
120 irq_set_chained_handler_and_data(irq, ath79_misc_irq_handler, domain);
121}
122
123static int __init ath79_misc_intc_of_init(
124 struct device_node *node, struct device_node *parent)
125{
126 struct irq_domain *domain;
127 void __iomem *base;
128 int irq;
129
130 irq = irq_of_parse_and_map(node, 0);
131 if (!irq) {
132 pr_err("Failed to get MISC IRQ\n");
133 return -EINVAL;
134 }
135
136 base = of_iomap(node, 0);
137 if (!base) {
138 pr_err("Failed to get MISC IRQ registers\n");
139 return -ENOMEM;
140 }
141
142 domain = irq_domain_add_linear(node, ATH79_MISC_IRQ_COUNT,
143 &misc_irq_domain_ops, base);
144 if (!domain) {
145 pr_err("Failed to add MISC irqdomain\n");
146 return -EINVAL;
147 }
148
149 ath79_misc_intc_domain_init(domain, irq);
150 return 0;
151}
152
153static int __init ar7100_misc_intc_of_init(
154 struct device_node *node, struct device_node *parent)
155{
156 ath79_misc_irq_chip.irq_mask_ack = ar71xx_misc_irq_mask;
157 return ath79_misc_intc_of_init(node, parent);
158}
159
160IRQCHIP_DECLARE(ar7100_misc_intc, "qca,ar7100-misc-intc",
161 ar7100_misc_intc_of_init);
162
163static int __init ar7240_misc_intc_of_init(
164 struct device_node *node, struct device_node *parent)
165{
166 ath79_misc_irq_chip.irq_ack = ar724x_misc_irq_ack;
167 return ath79_misc_intc_of_init(node, parent);
168}
169
170IRQCHIP_DECLARE(ar7240_misc_intc, "qca,ar7240-misc-intc",
171 ar7240_misc_intc_of_init);
172
173void __init ath79_misc_irq_init(void __iomem *regs, int irq,
174 int irq_base, bool is_ar71xx)
175{
176 struct irq_domain *domain;
177
178 if (is_ar71xx)
179 ath79_misc_irq_chip.irq_mask_ack = ar71xx_misc_irq_mask;
180 else
181 ath79_misc_irq_chip.irq_ack = ar724x_misc_irq_ack;
182
183 domain = irq_domain_add_legacy(NULL, ATH79_MISC_IRQ_COUNT,
184 irq_base, 0, &misc_irq_domain_ops, regs);
185 if (!domain)
186 panic("Failed to create MISC irqdomain");
187
188 ath79_misc_intc_domain_init(domain, irq);
189}
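The mask/unmask helpers above use the usual read-modify-write pattern on the MISC enable register, followed by a read-back so the MMIO write is posted before the handler returns. A plain-C sketch of that pattern, for illustration only; an ordinary array stands in for the ioremap()ed register block used by the driver.

/* Illustration only: RMW-plus-flush on a fake enable register. */
#include <stdint.h>
#include <stdio.h>

#define REG_MISC_INT_STATUS 0
#define REG_MISC_INT_ENABLE 1

static volatile uint32_t regs[2];	/* stand-in for the MMIO block */

static void misc_unmask(unsigned int irq)
{
	uint32_t t = regs[REG_MISC_INT_ENABLE];

	regs[REG_MISC_INT_ENABLE] = t | (1u << irq);
	(void)regs[REG_MISC_INT_ENABLE];	/* read back to post the write */
}

static void misc_mask(unsigned int irq)
{
	uint32_t t = regs[REG_MISC_INT_ENABLE];

	regs[REG_MISC_INT_ENABLE] = t & ~(1u << irq);
	(void)regs[REG_MISC_INT_ENABLE];
}

int main(void)
{
	misc_unmask(3);
	misc_unmask(5);
	misc_mask(3);
	printf("enable mask now 0x%08x\n", regs[REG_MISC_INT_ENABLE]);
	return 0;
}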
diff --git a/drivers/irqchip/irq-atmel-aic-common.c b/drivers/irqchip/irq-atmel-aic-common.c
index 37199b9b2cfa..28b26c80f4cf 100644
--- a/drivers/irqchip/irq-atmel-aic-common.c
+++ b/drivers/irqchip/irq-atmel-aic-common.c
@@ -80,16 +80,10 @@ int aic_common_set_type(struct irq_data *d, unsigned type, unsigned *val)
 	return 0;
 }
 
-int aic_common_set_priority(int priority, unsigned *val)
+void aic_common_set_priority(int priority, unsigned *val)
 {
-	if (priority < AT91_AIC_IRQ_MIN_PRIORITY ||
-	    priority > AT91_AIC_IRQ_MAX_PRIORITY)
-		return -EINVAL;
-
 	*val &= ~AT91_AIC_PRIOR;
 	*val |= priority;
-
-	return 0;
 }
 
 int aic_common_irq_domain_xlate(struct irq_domain *d,
@@ -193,7 +187,7 @@ void __init aic_common_rtt_irq_fixup(struct device_node *root)
 	}
 }
 
-void __init aic_common_irq_fixup(const struct of_device_id *matches)
+static void __init aic_common_irq_fixup(const struct of_device_id *matches)
 {
 	struct device_node *root = of_find_node_by_path("/");
 	const struct of_device_id *match;
@@ -214,7 +208,8 @@ void __init aic_common_irq_fixup(const struct of_device_id *matches)
 
 struct irq_domain *__init aic_common_of_init(struct device_node *node,
 					     const struct irq_domain_ops *ops,
-					     const char *name, int nirqs)
+					     const char *name, int nirqs,
+					     const struct of_device_id *matches)
 {
 	struct irq_chip_generic *gc;
 	struct irq_domain *domain;
@@ -264,6 +259,7 @@ struct irq_domain *__init aic_common_of_init(struct device_node *node,
 	}
 
 	aic_common_ext_irq_of_init(domain);
+	aic_common_irq_fixup(matches);
 
 	return domain;
 
diff --git a/drivers/irqchip/irq-atmel-aic-common.h b/drivers/irqchip/irq-atmel-aic-common.h
index 603f0a9d5411..af60376d50de 100644
--- a/drivers/irqchip/irq-atmel-aic-common.h
+++ b/drivers/irqchip/irq-atmel-aic-common.h
@@ -19,7 +19,7 @@
 
 int aic_common_set_type(struct irq_data *d, unsigned type, unsigned *val);
 
-int aic_common_set_priority(int priority, unsigned *val);
+void aic_common_set_priority(int priority, unsigned *val);
 
 int aic_common_irq_domain_xlate(struct irq_domain *d,
 				struct device_node *ctrlr,
@@ -30,12 +30,11 @@ int aic_common_irq_domain_xlate(struct irq_domain *d,
 
 struct irq_domain *__init aic_common_of_init(struct device_node *node,
 					     const struct irq_domain_ops *ops,
-					     const char *name, int nirqs);
+					     const char *name, int nirqs,
+					     const struct of_device_id *matches);
 
 void __init aic_common_rtc_irq_fixup(struct device_node *root);
 
 void __init aic_common_rtt_irq_fixup(struct device_node *root);
 
-void __init aic_common_irq_fixup(const struct of_device_id *matches);
-
 #endif /* __IRQ_ATMEL_AIC_COMMON_H */
diff --git a/drivers/irqchip/irq-atmel-aic.c b/drivers/irqchip/irq-atmel-aic.c
index 8a0c7f288198..112e17c2768b 100644
--- a/drivers/irqchip/irq-atmel-aic.c
+++ b/drivers/irqchip/irq-atmel-aic.c
@@ -196,9 +196,8 @@ static int aic_irq_domain_xlate(struct irq_domain *d,
 
 	irq_gc_lock(gc);
 	smr = irq_reg_readl(gc, AT91_AIC_SMR(*out_hwirq));
-	ret = aic_common_set_priority(intspec[2], &smr);
-	if (!ret)
-		irq_reg_writel(gc, smr, AT91_AIC_SMR(*out_hwirq));
+	aic_common_set_priority(intspec[2], &smr);
+	irq_reg_writel(gc, smr, AT91_AIC_SMR(*out_hwirq));
 	irq_gc_unlock(gc);
 
 	return ret;
@@ -248,12 +247,10 @@ static int __init aic_of_init(struct device_node *node,
 		return -EEXIST;
 
 	domain = aic_common_of_init(node, &aic_irq_ops, "atmel-aic",
-				    NR_AIC_IRQS);
+				    NR_AIC_IRQS, aic_irq_fixups);
 	if (IS_ERR(domain))
 		return PTR_ERR(domain);
 
-	aic_common_irq_fixup(aic_irq_fixups);
-
 	aic_domain = domain;
 	gc = irq_get_domain_generic_chip(domain, 0);
 
diff --git a/drivers/irqchip/irq-atmel-aic5.c b/drivers/irqchip/irq-atmel-aic5.c
index 62bb840c613f..4f0d068e1abe 100644
--- a/drivers/irqchip/irq-atmel-aic5.c
+++ b/drivers/irqchip/irq-atmel-aic5.c
@@ -272,9 +272,8 @@ static int aic5_irq_domain_xlate(struct irq_domain *d,
 	irq_gc_lock(bgc);
 	irq_reg_writel(bgc, *out_hwirq, AT91_AIC5_SSR);
 	smr = irq_reg_readl(bgc, AT91_AIC5_SMR);
-	ret = aic_common_set_priority(intspec[2], &smr);
-	if (!ret)
-		irq_reg_writel(bgc, intspec[2] | smr, AT91_AIC5_SMR);
+	aic_common_set_priority(intspec[2], &smr);
+	irq_reg_writel(bgc, smr, AT91_AIC5_SMR);
 	irq_gc_unlock(bgc);
 
 	return ret;
@@ -312,12 +311,10 @@ static int __init aic5_of_init(struct device_node *node,
 		return -EEXIST;
 
 	domain = aic_common_of_init(node, &aic5_irq_ops, "atmel-aic5",
-				    nirqs);
+				    nirqs, aic5_irq_fixups);
 	if (IS_ERR(domain))
 		return PTR_ERR(domain);
 
-	aic_common_irq_fixup(aic5_irq_fixups);
-
 	aic5_domain = domain;
 	nchips = aic5_domain->revmap_size / 32;
 	for (i = 0; i < nchips; i++) {
diff --git a/drivers/irqchip/irq-bcm2836.c b/drivers/irqchip/irq-bcm2836.c
index 963065a0d774..b6e950d4782a 100644
--- a/drivers/irqchip/irq-bcm2836.c
+++ b/drivers/irqchip/irq-bcm2836.c
@@ -229,7 +229,6 @@ int __init bcm2836_smp_boot_secondary(unsigned int cpu,
 	unsigned long secondary_startup_phys =
 		(unsigned long)virt_to_phys((void *)secondary_startup);
 
-	dsb();
 	writel(secondary_startup_phys,
 	       intc.base + LOCAL_MAILBOX3_SET0 + 16 * cpu);
 
diff --git a/drivers/irqchip/irq-bcm6345-l1.c b/drivers/irqchip/irq-bcm6345-l1.c
new file mode 100644
index 000000000000..b844c89a9506
--- /dev/null
+++ b/drivers/irqchip/irq-bcm6345-l1.c
@@ -0,0 +1,364 @@
1/*
2 * Broadcom BCM6345 style Level 1 interrupt controller driver
3 *
4 * Copyright (C) 2014 Broadcom Corporation
5 * Copyright 2015 Simon Arlott
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * This is based on the BCM7038 (which supports SMP) but with a single
12 * enable register instead of separate mask/set/clear registers.
13 *
14 * The BCM3380 has a similar mask/status register layout, but each pair
15 * of words is at separate locations (and SMP is not supported).
16 *
17 * ENABLE/STATUS words are packed next to each other for each CPU:
18 *
19 * BCM6368:
20 * 0x1000_0020: CPU0_W0_ENABLE
21 * 0x1000_0024: CPU0_W1_ENABLE
22 * 0x1000_0028: CPU0_W0_STATUS IRQs 31-63
23 * 0x1000_002c: CPU0_W1_STATUS IRQs 0-31
24 * 0x1000_0030: CPU1_W0_ENABLE
25 * 0x1000_0034: CPU1_W1_ENABLE
26 * 0x1000_0038: CPU1_W0_STATUS IRQs 31-63
27 * 0x1000_003c: CPU1_W1_STATUS IRQs 0-31
28 *
29 * BCM63168:
30 * 0x1000_0020: CPU0_W0_ENABLE
31 * 0x1000_0024: CPU0_W1_ENABLE
32 * 0x1000_0028: CPU0_W2_ENABLE
33 * 0x1000_002c: CPU0_W3_ENABLE
34 * 0x1000_0030: CPU0_W0_STATUS IRQs 96-127
35 * 0x1000_0034: CPU0_W1_STATUS IRQs 64-95
36 * 0x1000_0038: CPU0_W2_STATUS IRQs 32-63
37 * 0x1000_003c: CPU0_W3_STATUS IRQs 0-31
38 * 0x1000_0040: CPU1_W0_ENABLE
39 * 0x1000_0044: CPU1_W1_ENABLE
40 * 0x1000_0048: CPU1_W2_ENABLE
41 * 0x1000_004c: CPU1_W3_ENABLE
42 * 0x1000_0050: CPU1_W0_STATUS IRQs 96-127
43 * 0x1000_0054: CPU1_W1_STATUS IRQs 64-95
44 * 0x1000_0058: CPU1_W2_STATUS IRQs 32-63
45 * 0x1000_005c: CPU1_W3_STATUS IRQs 0-31
46 *
47 * IRQs are numbered in CPU native endian order
48 * (which is big-endian in these examples)
49 */
50
51#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
52
53#include <linux/bitops.h>
54#include <linux/cpumask.h>
55#include <linux/kconfig.h>
56#include <linux/kernel.h>
57#include <linux/init.h>
58#include <linux/interrupt.h>
59#include <linux/io.h>
60#include <linux/ioport.h>
61#include <linux/irq.h>
62#include <linux/irqdomain.h>
63#include <linux/module.h>
64#include <linux/of.h>
65#include <linux/of_irq.h>
66#include <linux/of_address.h>
67#include <linux/of_platform.h>
68#include <linux/platform_device.h>
69#include <linux/slab.h>
70#include <linux/smp.h>
71#include <linux/types.h>
72#include <linux/irqchip.h>
73#include <linux/irqchip/chained_irq.h>
74
75#define IRQS_PER_WORD 32
76#define REG_BYTES_PER_IRQ_WORD (sizeof(u32) * 2)
77
78struct bcm6345_l1_cpu;
79
80struct bcm6345_l1_chip {
81 raw_spinlock_t lock;
82 unsigned int n_words;
83 struct irq_domain *domain;
84 struct cpumask cpumask;
85 struct bcm6345_l1_cpu *cpus[NR_CPUS];
86};
87
88struct bcm6345_l1_cpu {
89 void __iomem *map_base;
90 unsigned int parent_irq;
91 u32 enable_cache[];
92};
93
94static inline unsigned int reg_enable(struct bcm6345_l1_chip *intc,
95 unsigned int word)
96{
97#ifdef __BIG_ENDIAN
98 return (1 * intc->n_words - word - 1) * sizeof(u32);
99#else
100 return (0 * intc->n_words + word) * sizeof(u32);
101#endif
102}
103
104static inline unsigned int reg_status(struct bcm6345_l1_chip *intc,
105 unsigned int word)
106{
107#ifdef __BIG_ENDIAN
108 return (2 * intc->n_words - word - 1) * sizeof(u32);
109#else
110 return (1 * intc->n_words + word) * sizeof(u32);
111#endif
112}
113
114static inline unsigned int cpu_for_irq(struct bcm6345_l1_chip *intc,
115 struct irq_data *d)
116{
117 return cpumask_first_and(&intc->cpumask, irq_data_get_affinity_mask(d));
118}
119
120static void bcm6345_l1_irq_handle(struct irq_desc *desc)
121{
122 struct bcm6345_l1_chip *intc = irq_desc_get_handler_data(desc);
123 struct bcm6345_l1_cpu *cpu;
124 struct irq_chip *chip = irq_desc_get_chip(desc);
125 unsigned int idx;
126
127#ifdef CONFIG_SMP
128 cpu = intc->cpus[cpu_logical_map(smp_processor_id())];
129#else
130 cpu = intc->cpus[0];
131#endif
132
133 chained_irq_enter(chip, desc);
134
135 for (idx = 0; idx < intc->n_words; idx++) {
136 int base = idx * IRQS_PER_WORD;
137 unsigned long pending;
138 irq_hw_number_t hwirq;
139 unsigned int irq;
140
141 pending = __raw_readl(cpu->map_base + reg_status(intc, idx));
142 pending &= __raw_readl(cpu->map_base + reg_enable(intc, idx));
143
144 for_each_set_bit(hwirq, &pending, IRQS_PER_WORD) {
145 irq = irq_linear_revmap(intc->domain, base + hwirq);
146 if (irq)
147 do_IRQ(irq);
148 else
149 spurious_interrupt();
150 }
151 }
152
153 chained_irq_exit(chip, desc);
154}
155
156static inline void __bcm6345_l1_unmask(struct irq_data *d)
157{
158 struct bcm6345_l1_chip *intc = irq_data_get_irq_chip_data(d);
159 u32 word = d->hwirq / IRQS_PER_WORD;
160 u32 mask = BIT(d->hwirq % IRQS_PER_WORD);
161 unsigned int cpu_idx = cpu_for_irq(intc, d);
162
163 intc->cpus[cpu_idx]->enable_cache[word] |= mask;
164 __raw_writel(intc->cpus[cpu_idx]->enable_cache[word],
165 intc->cpus[cpu_idx]->map_base + reg_enable(intc, word));
166}
167
168static inline void __bcm6345_l1_mask(struct irq_data *d)
169{
170 struct bcm6345_l1_chip *intc = irq_data_get_irq_chip_data(d);
171 u32 word = d->hwirq / IRQS_PER_WORD;
172 u32 mask = BIT(d->hwirq % IRQS_PER_WORD);
173 unsigned int cpu_idx = cpu_for_irq(intc, d);
174
175 intc->cpus[cpu_idx]->enable_cache[word] &= ~mask;
176 __raw_writel(intc->cpus[cpu_idx]->enable_cache[word],
177 intc->cpus[cpu_idx]->map_base + reg_enable(intc, word));
178}
179
180static void bcm6345_l1_unmask(struct irq_data *d)
181{
182 struct bcm6345_l1_chip *intc = irq_data_get_irq_chip_data(d);
183 unsigned long flags;
184
185 raw_spin_lock_irqsave(&intc->lock, flags);
186 __bcm6345_l1_unmask(d);
187 raw_spin_unlock_irqrestore(&intc->lock, flags);
188}
189
190static void bcm6345_l1_mask(struct irq_data *d)
191{
192 struct bcm6345_l1_chip *intc = irq_data_get_irq_chip_data(d);
193 unsigned long flags;
194
195 raw_spin_lock_irqsave(&intc->lock, flags);
196 __bcm6345_l1_mask(d);
197 raw_spin_unlock_irqrestore(&intc->lock, flags);
198}
199
200static int bcm6345_l1_set_affinity(struct irq_data *d,
201 const struct cpumask *dest,
202 bool force)
203{
204 struct bcm6345_l1_chip *intc = irq_data_get_irq_chip_data(d);
205 u32 word = d->hwirq / IRQS_PER_WORD;
206 u32 mask = BIT(d->hwirq % IRQS_PER_WORD);
207 unsigned int old_cpu = cpu_for_irq(intc, d);
208 unsigned int new_cpu;
209 struct cpumask valid;
210 unsigned long flags;
211 bool enabled;
212
213 if (!cpumask_and(&valid, &intc->cpumask, dest))
214 return -EINVAL;
215
216 new_cpu = cpumask_any_and(&valid, cpu_online_mask);
217 if (new_cpu >= nr_cpu_ids)
218 return -EINVAL;
219
220 dest = cpumask_of(new_cpu);
221
222 raw_spin_lock_irqsave(&intc->lock, flags);
223 if (old_cpu != new_cpu) {
224 enabled = intc->cpus[old_cpu]->enable_cache[word] & mask;
225 if (enabled)
226 __bcm6345_l1_mask(d);
227 cpumask_copy(irq_data_get_affinity_mask(d), dest);
228 if (enabled)
229 __bcm6345_l1_unmask(d);
230 } else {
231 cpumask_copy(irq_data_get_affinity_mask(d), dest);
232 }
233 raw_spin_unlock_irqrestore(&intc->lock, flags);
234
235 return IRQ_SET_MASK_OK_NOCOPY;
236}
237
238static int __init bcm6345_l1_init_one(struct device_node *dn,
239 unsigned int idx,
240 struct bcm6345_l1_chip *intc)
241{
242 struct resource res;
243 resource_size_t sz;
244 struct bcm6345_l1_cpu *cpu;
245 unsigned int i, n_words;
246
247 if (of_address_to_resource(dn, idx, &res))
248 return -EINVAL;
249 sz = resource_size(&res);
250 n_words = sz / REG_BYTES_PER_IRQ_WORD;
251
252 if (!intc->n_words)
253 intc->n_words = n_words;
254 else if (intc->n_words != n_words)
255 return -EINVAL;
256
257 cpu = intc->cpus[idx] = kzalloc(sizeof(*cpu) + n_words * sizeof(u32),
258 GFP_KERNEL);
259 if (!cpu)
260 return -ENOMEM;
261
262 cpu->map_base = ioremap(res.start, sz);
263 if (!cpu->map_base)
264 return -ENOMEM;
265
266 for (i = 0; i < n_words; i++) {
267 cpu->enable_cache[i] = 0;
268 __raw_writel(0, cpu->map_base + reg_enable(intc, i));
269 }
270
271 cpu->parent_irq = irq_of_parse_and_map(dn, idx);
272 if (!cpu->parent_irq) {
273 pr_err("failed to map parent interrupt %d\n", cpu->parent_irq);
274 return -EINVAL;
275 }
276 irq_set_chained_handler_and_data(cpu->parent_irq,
277 bcm6345_l1_irq_handle, intc);
278
279 return 0;
280}
281
282static struct irq_chip bcm6345_l1_irq_chip = {
283 .name = "bcm6345-l1",
284 .irq_mask = bcm6345_l1_mask,
285 .irq_unmask = bcm6345_l1_unmask,
286 .irq_set_affinity = bcm6345_l1_set_affinity,
287};
288
289static int bcm6345_l1_map(struct irq_domain *d, unsigned int virq,
290 irq_hw_number_t hw_irq)
291{
292 irq_set_chip_and_handler(virq,
293 &bcm6345_l1_irq_chip, handle_percpu_irq);
294 irq_set_chip_data(virq, d->host_data);
295 return 0;
296}
297
298static const struct irq_domain_ops bcm6345_l1_domain_ops = {
299 .xlate = irq_domain_xlate_onecell,
300 .map = bcm6345_l1_map,
301};
302
303static int __init bcm6345_l1_of_init(struct device_node *dn,
304 struct device_node *parent)
305{
306 struct bcm6345_l1_chip *intc;
307 unsigned int idx;
308 int ret;
309
310 intc = kzalloc(sizeof(*intc), GFP_KERNEL);
311 if (!intc)
312 return -ENOMEM;
313
314 for_each_possible_cpu(idx) {
315 ret = bcm6345_l1_init_one(dn, idx, intc);
316 if (ret)
317 pr_err("failed to init intc L1 for cpu %d: %d\n",
318 idx, ret);
319 else
320 cpumask_set_cpu(idx, &intc->cpumask);
321 }
322
323 if (!cpumask_weight(&intc->cpumask)) {
324 ret = -ENODEV;
325 goto out_free;
326 }
327
328 raw_spin_lock_init(&intc->lock);
329
330 intc->domain = irq_domain_add_linear(dn, IRQS_PER_WORD * intc->n_words,
331 &bcm6345_l1_domain_ops,
332 intc);
333 if (!intc->domain) {
334 ret = -ENOMEM;
335 goto out_unmap;
336 }
337
338 pr_info("registered BCM6345 L1 intc (IRQs: %d)\n",
339 IRQS_PER_WORD * intc->n_words);
340 for_each_cpu(idx, &intc->cpumask) {
341 struct bcm6345_l1_cpu *cpu = intc->cpus[idx];
342
343 pr_info(" CPU%u at MMIO 0x%p (irq = %d)\n", idx,
344 cpu->map_base, cpu->parent_irq);
345 }
346
347 return 0;
348
349out_unmap:
350 for_each_possible_cpu(idx) {
351 struct bcm6345_l1_cpu *cpu = intc->cpus[idx];
352
353 if (cpu) {
354 if (cpu->map_base)
355 iounmap(cpu->map_base);
356 kfree(cpu);
357 }
358 }
359out_free:
360 kfree(intc);
361 return ret;
362}
363
364IRQCHIP_DECLARE(bcm6345_l1, "brcm,bcm6345-l1-intc", bcm6345_l1_of_init);
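reg_enable() and reg_status() above turn a driver word index into a byte offset inside the per-CPU register block; on big-endian parts the word order is reversed, so driver word 0 (hwirqs 0-31) lands in the W1 registers of the BCM6368 layout shown in the header comment. A standalone sketch of the same arithmetic, for illustration only; the endianness flag is a stand-in for the driver's compile-time __BIG_ENDIAN test.

/* Illustration only: byte offsets produced by reg_enable()/reg_status(). */
#include <stdio.h>

static unsigned int reg_enable(unsigned int n_words, unsigned int word, int big_endian)
{
	if (big_endian)
		return (1 * n_words - word - 1) * 4;
	return (0 * n_words + word) * 4;
}

static unsigned int reg_status(unsigned int n_words, unsigned int word, int big_endian)
{
	if (big_endian)
		return (2 * n_words - word - 1) * 4;
	return (1 * n_words + word) * 4;
}

int main(void)
{
	/* BCM6368: two words per CPU, big-endian; word 0 maps to the W1 registers */
	for (unsigned int w = 0; w < 2; w++)
		printf("word %u: enable @+0x%02x, status @+0x%02x\n",
		       w, reg_enable(2, w, 1), reg_status(2, w, 1));
	return 0;
}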
diff --git a/drivers/irqchip/irq-gic-realview.c b/drivers/irqchip/irq-gic-realview.c
index aa46eb280a7f..54c296401525 100644
--- a/drivers/irqchip/irq-gic-realview.c
+++ b/drivers/irqchip/irq-gic-realview.c
@@ -10,7 +10,8 @@
 #include <linux/irqchip/arm-gic.h>
 
 #define REALVIEW_SYS_LOCK_OFFSET	0x20
-#define REALVIEW_PB11MP_SYS_PLD_CTRL1	0x74
+#define REALVIEW_SYS_PLD_CTRL1		0x74
+#define REALVIEW_EB_REVB_SYS_PLD_CTRL1	0xD8
 #define VERSATILE_LOCK_VAL		0xA05F
 #define PLD_INTMODE_MASK		BIT(22)|BIT(23)|BIT(24)
 #define PLD_INTMODE_LEGACY		0x0
@@ -18,26 +19,57 @@
 #define PLD_INTMODE_NEW_NO_DCC		BIT(23)
 #define PLD_INTMODE_FIQ_ENABLE		BIT(24)
 
+/* For some reason RealView EB Rev B moved this register */
+static const struct of_device_id syscon_pldset_of_match[] = {
+	{
+		.compatible = "arm,realview-eb11mp-revb-syscon",
+		.data = (void *)REALVIEW_EB_REVB_SYS_PLD_CTRL1,
+	},
+	{
+		.compatible = "arm,realview-eb11mp-revc-syscon",
+		.data = (void *)REALVIEW_SYS_PLD_CTRL1,
+	},
+	{
+		.compatible = "arm,realview-eb-syscon",
+		.data = (void *)REALVIEW_SYS_PLD_CTRL1,
+	},
+	{
+		.compatible = "arm,realview-pb11mp-syscon",
+		.data = (void *)REALVIEW_SYS_PLD_CTRL1,
+	},
+	{},
+};
+
 static int __init
 realview_gic_of_init(struct device_node *node, struct device_node *parent)
 {
 	static struct regmap *map;
+	struct device_node *np;
+	const struct of_device_id *gic_id;
+	u32 pld1_ctrl;
+
+	np = of_find_matching_node_and_match(NULL, syscon_pldset_of_match,
+					     &gic_id);
+	if (!np)
+		return -ENODEV;
+	pld1_ctrl = (u32)gic_id->data;
 
 	/* The PB11MPCore GIC needs to be configured in the syscon */
-	map = syscon_regmap_lookup_by_compatible("arm,realview-pb11mp-syscon");
+	map = syscon_node_to_regmap(np);
 	if (!IS_ERR(map)) {
 		/* new irq mode with no DCC */
 		regmap_write(map, REALVIEW_SYS_LOCK_OFFSET,
 			     VERSATILE_LOCK_VAL);
-		regmap_update_bits(map, REALVIEW_PB11MP_SYS_PLD_CTRL1,
+		regmap_update_bits(map, pld1_ctrl,
 				   PLD_INTMODE_NEW_NO_DCC,
 				   PLD_INTMODE_MASK);
 		regmap_write(map, REALVIEW_SYS_LOCK_OFFSET, 0x0000);
-		pr_info("TC11MP GIC: set up interrupt controller to NEW mode, no DCC\n");
+		pr_info("RealView GIC: set up interrupt controller to NEW mode, no DCC\n");
 	} else {
-		pr_err("TC11MP GIC setup: could not find syscon\n");
-		return -ENXIO;
+		pr_err("RealView GIC setup: could not find syscon\n");
+		return -ENODEV;
 	}
 	return gic_of_init(node, parent);
 }
 IRQCHIP_DECLARE(armtc11mp_gic, "arm,tc11mp-gic", realview_gic_of_init);
+IRQCHIP_DECLARE(armeb11mp_gic, "arm,eb11mp-gic", realview_gic_of_init);
diff --git a/drivers/irqchip/irq-gic-v2m.c b/drivers/irqchip/irq-gic-v2m.c
index c779f83e511d..28f047c61baa 100644
--- a/drivers/irqchip/irq-gic-v2m.c
+++ b/drivers/irqchip/irq-gic-v2m.c
@@ -92,18 +92,6 @@ static struct msi_domain_info gicv2m_msi_domain_info = {
 	.chip	= &gicv2m_msi_irq_chip,
 };
 
-static int gicv2m_set_affinity(struct irq_data *irq_data,
-			       const struct cpumask *mask, bool force)
-{
-	int ret;
-
-	ret = irq_chip_set_affinity_parent(irq_data, mask, force);
-	if (ret == IRQ_SET_MASK_OK)
-		ret = IRQ_SET_MASK_OK_DONE;
-
-	return ret;
-}
-
 static void gicv2m_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
 {
 	struct v2m_data *v2m = irq_data_get_irq_chip_data(data);
@@ -122,7 +110,7 @@ static struct irq_chip gicv2m_irq_chip = {
 	.irq_mask		= irq_chip_mask_parent,
 	.irq_unmask		= irq_chip_unmask_parent,
 	.irq_eoi		= irq_chip_eoi_parent,
-	.irq_set_affinity	= gicv2m_set_affinity,
+	.irq_set_affinity	= irq_chip_set_affinity_parent,
 	.irq_compose_msi_msg	= gicv2m_compose_msi_msg,
 };
 
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 43dfd15c1dd2..39261798c59f 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -103,7 +103,6 @@ struct its_device {
 
 static LIST_HEAD(its_nodes);
 static DEFINE_SPINLOCK(its_lock);
-static struct device_node *gic_root_node;
 static struct rdists *gic_rdists;
 
 #define gic_data_rdist()	(raw_cpu_ptr(gic_rdists->rdist))
@@ -671,7 +670,7 @@ static int its_chunk_to_lpi(int chunk)
 	return (chunk << IRQS_PER_CHUNK_SHIFT) + 8192;
 }
 
-static int its_lpi_init(u32 id_bits)
+static int __init its_lpi_init(u32 id_bits)
 {
 	lpi_chunks = its_lpi_to_chunk(1UL << id_bits);
 
@@ -1430,7 +1429,8 @@ static void its_enable_quirks(struct its_node *its)
 	gic_enable_quirks(iidr, its_quirks, its);
 }
 
-static int its_probe(struct device_node *node, struct irq_domain *parent)
+static int __init its_probe(struct device_node *node,
+			    struct irq_domain *parent)
 {
 	struct resource res;
 	struct its_node *its;
@@ -1591,7 +1591,7 @@ static struct of_device_id its_device_id[] = {
 	{},
 };
 
-int its_init(struct device_node *node, struct rdists *rdists,
+int __init its_init(struct device_node *node, struct rdists *rdists,
 	     struct irq_domain *parent_domain)
 {
 	struct device_node *np;
@@ -1607,8 +1607,6 @@ int its_init(struct device_node *node, struct rdists *rdists,
 	}
 
 	gic_rdists = rdists;
-	gic_root_node = node;
-
 	its_alloc_lpi_tables();
 	its_lpi_init(rdists->id_bits);
 
diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index d7be6ddc34f6..5b7d3c2129d8 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -15,10 +15,12 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <linux/acpi.h>
 #include <linux/cpu.h>
 #include <linux/cpu_pm.h>
 #include <linux/delay.h>
 #include <linux/interrupt.h>
+#include <linux/irqdomain.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
 #include <linux/of_irq.h>
@@ -38,6 +40,7 @@
 struct redist_region {
 	void __iomem		*redist_base;
 	phys_addr_t		phys_base;
+	bool			single_redist;
 };
 
 struct gic_chip_data {
@@ -434,6 +437,9 @@ static int gic_populate_rdist(void)
 			return 0;
 		}
 
+		if (gic_data.redist_regions[i].single_redist)
+			break;
+
 		if (gic_data.redist_stride) {
 			ptr += gic_data.redist_stride;
 		} else {
@@ -634,7 +640,7 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val,
 	else
 		gic_dist_wait_for_rwp();
 
-	return IRQ_SET_MASK_OK;
+	return IRQ_SET_MASK_OK_DONE;
 }
 #else
 #define gic_set_affinity	NULL
@@ -764,6 +770,15 @@ static int gic_irq_domain_translate(struct irq_domain *d,
 		return 0;
 	}
 
+	if (is_fwnode_irqchip(fwspec->fwnode)) {
+		if(fwspec->param_count != 2)
+			return -EINVAL;
+
+		*hwirq = fwspec->param[0];
+		*type = fwspec->param[1];
+		return 0;
+	}
+
 	return -EINVAL;
 }
 
@@ -811,17 +826,88 @@ static void gicv3_enable_quirks(void)
 #endif
 }
 
+static int __init gic_init_bases(void __iomem *dist_base,
+				 struct redist_region *rdist_regs,
+				 u32 nr_redist_regions,
+				 u64 redist_stride,
+				 struct fwnode_handle *handle)
+{
+	struct device_node *node;
+	u32 typer;
+	int gic_irqs;
+	int err;
+
+	if (!is_hyp_mode_available())
+		static_key_slow_dec(&supports_deactivate);
+
+	if (static_key_true(&supports_deactivate))
+		pr_info("GIC: Using split EOI/Deactivate mode\n");
+
+	gic_data.dist_base = dist_base;
+	gic_data.redist_regions = rdist_regs;
+	gic_data.nr_redist_regions = nr_redist_regions;
+	gic_data.redist_stride = redist_stride;
+
+	gicv3_enable_quirks();
+
+	/*
+	 * Find out how many interrupts are supported.
+	 * The GIC only supports up to 1020 interrupt sources (SGI+PPI+SPI)
+	 */
+	typer = readl_relaxed(gic_data.dist_base + GICD_TYPER);
+	gic_data.rdists.id_bits = GICD_TYPER_ID_BITS(typer);
+	gic_irqs = GICD_TYPER_IRQS(typer);
+	if (gic_irqs > 1020)
+		gic_irqs = 1020;
+	gic_data.irq_nr = gic_irqs;
+
+	gic_data.domain = irq_domain_create_tree(handle, &gic_irq_domain_ops,
+						 &gic_data);
+	gic_data.rdists.rdist = alloc_percpu(typeof(*gic_data.rdists.rdist));
+
+	if (WARN_ON(!gic_data.domain) || WARN_ON(!gic_data.rdists.rdist)) {
+		err = -ENOMEM;
+		goto out_free;
+	}
+
+	set_handle_irq(gic_handle_irq);
+
+	node = to_of_node(handle);
+	if (IS_ENABLED(CONFIG_ARM_GIC_V3_ITS) && gic_dist_supports_lpis() &&
+	    node) /* Temp hack to prevent ITS init for ACPI */
+		its_init(node, &gic_data.rdists, gic_data.domain);
+
+	gic_smp_init();
+	gic_dist_init();
+	gic_cpu_init();
+	gic_cpu_pm_init();
+
+	return 0;
+
+out_free:
+	if (gic_data.domain)
+		irq_domain_remove(gic_data.domain);
+	free_percpu(gic_data.rdists.rdist);
+	return err;
+}
+
+static int __init gic_validate_dist_version(void __iomem *dist_base)
+{
+	u32 reg = readl_relaxed(dist_base + GICD_PIDR2) & GIC_PIDR2_ARCH_MASK;
+
+	if (reg != GIC_PIDR2_ARCH_GICv3 && reg != GIC_PIDR2_ARCH_GICv4)
+		return -ENODEV;
+
+	return 0;
+}
+
 static int __init gic_of_init(struct device_node *node, struct device_node *parent)
 {
 	void __iomem *dist_base;
 	struct redist_region *rdist_regs;
 	u64 redist_stride;
 	u32 nr_redist_regions;
-	u32 typer;
-	u32 reg;
-	int gic_irqs;
-	int err;
-	int i;
+	int err, i;
 
 	dist_base = of_iomap(node, 0);
 	if (!dist_base) {
@@ -830,11 +916,10 @@ static int __init gic_of_init(struct device_node *node, struct device_node *pare
 		return -ENXIO;
 	}
 
-	reg = readl_relaxed(dist_base + GICD_PIDR2) & GIC_PIDR2_ARCH_MASK;
-	if (reg != GIC_PIDR2_ARCH_GICv3 && reg != GIC_PIDR2_ARCH_GICv4) {
+	err = gic_validate_dist_version(dist_base);
+	if (err) {
 		pr_err("%s: no distributor detected, giving up\n",
 			node->full_name);
-		err = -ENODEV;
 		goto out_unmap_dist;
 	}
 
@@ -865,63 +950,229 @@ static int __init gic_of_init(struct device_node *node, struct device_node *pare
865 if (of_property_read_u64(node, "redistributor-stride", &redist_stride)) 950 if (of_property_read_u64(node, "redistributor-stride", &redist_stride))
866 redist_stride = 0; 951 redist_stride = 0;
867 952
868 if (!is_hyp_mode_available()) 953 err = gic_init_bases(dist_base, rdist_regs, nr_redist_regions,
869 static_key_slow_dec(&supports_deactivate); 954 redist_stride, &node->fwnode);
955 if (!err)
956 return 0;
870 957
871 if (static_key_true(&supports_deactivate)) 958out_unmap_rdist:
872 pr_info("GIC: Using split EOI/Deactivate mode\n"); 959 for (i = 0; i < nr_redist_regions; i++)
960 if (rdist_regs[i].redist_base)
961 iounmap(rdist_regs[i].redist_base);
962 kfree(rdist_regs);
963out_unmap_dist:
964 iounmap(dist_base);
965 return err;
966}
873 967
874 gic_data.dist_base = dist_base; 968IRQCHIP_DECLARE(gic_v3, "arm,gic-v3", gic_of_init);
875 gic_data.redist_regions = rdist_regs;
876 gic_data.nr_redist_regions = nr_redist_regions;
877 gic_data.redist_stride = redist_stride;
878 969
879 gicv3_enable_quirks(); 970#ifdef CONFIG_ACPI
971static void __iomem *dist_base;
972static struct redist_region *redist_regs __initdata;
973static u32 nr_redist_regions __initdata;
974static bool single_redist;
975
976static void __init
977gic_acpi_register_redist(phys_addr_t phys_base, void __iomem *redist_base)
978{
979 static int count = 0;
980
981 redist_regs[count].phys_base = phys_base;
982 redist_regs[count].redist_base = redist_base;
983 redist_regs[count].single_redist = single_redist;
984 count++;
985}
986
987static int __init
988gic_acpi_parse_madt_redist(struct acpi_subtable_header *header,
989 const unsigned long end)
990{
991 struct acpi_madt_generic_redistributor *redist =
992 (struct acpi_madt_generic_redistributor *)header;
993 void __iomem *redist_base;
994
995 redist_base = ioremap(redist->base_address, redist->length);
996 if (!redist_base) {
997 pr_err("Couldn't map GICR region @%llx\n", redist->base_address);
998 return -ENOMEM;
999 }
1000
1001 gic_acpi_register_redist(redist->base_address, redist_base);
1002 return 0;
1003}
1004
1005static int __init
1006gic_acpi_parse_madt_gicc(struct acpi_subtable_header *header,
1007 const unsigned long end)
1008{
1009 struct acpi_madt_generic_interrupt *gicc =
1010 (struct acpi_madt_generic_interrupt *)header;
1011 u32 reg = readl_relaxed(dist_base + GICD_PIDR2) & GIC_PIDR2_ARCH_MASK;
1012 u32 size = reg == GIC_PIDR2_ARCH_GICv4 ? SZ_64K * 4 : SZ_64K * 2;
1013 void __iomem *redist_base;
1014
1015 redist_base = ioremap(gicc->gicr_base_address, size);
1016 if (!redist_base)
1017 return -ENOMEM;
1018
1019 gic_acpi_register_redist(gicc->gicr_base_address, redist_base);
1020 return 0;
1021}
1022
1023static int __init gic_acpi_collect_gicr_base(void)
1024{
1025 acpi_tbl_entry_handler redist_parser;
1026 enum acpi_madt_type type;
1027
1028 if (single_redist) {
1029 type = ACPI_MADT_TYPE_GENERIC_INTERRUPT;
1030 redist_parser = gic_acpi_parse_madt_gicc;
1031 } else {
1032 type = ACPI_MADT_TYPE_GENERIC_REDISTRIBUTOR;
1033 redist_parser = gic_acpi_parse_madt_redist;
1034 }
1035
1036 /* Collect redistributor base addresses in GICR entries */
1037 if (acpi_table_parse_madt(type, redist_parser, 0) > 0)
1038 return 0;
1039
1040 pr_info("No valid GICR entries exist\n");
1041 return -ENODEV;
1042}
1043
1044static int __init gic_acpi_match_gicr(struct acpi_subtable_header *header,
1045 const unsigned long end)
1046{
1047 /* Subtable presence means that redist exists, that's it */
1048 return 0;
1049}
1050
1051static int __init gic_acpi_match_gicc(struct acpi_subtable_header *header,
1052 const unsigned long end)
1053{
1054 struct acpi_madt_generic_interrupt *gicc =
1055 (struct acpi_madt_generic_interrupt *)header;
880 1056
881 /* 1057 /*
882 * Find out how many interrupts are supported. 1058 * If GICC is enabled and has valid gicr base address, then it means
883 * The GIC only supports up to 1020 interrupt sources (SGI+PPI+SPI) 1059 * GICR base is presented via GICC
884 */ 1060 */
885 typer = readl_relaxed(gic_data.dist_base + GICD_TYPER); 1061 if ((gicc->flags & ACPI_MADT_ENABLED) && gicc->gicr_base_address)
886 gic_data.rdists.id_bits = GICD_TYPER_ID_BITS(typer); 1062 return 0;
887 gic_irqs = GICD_TYPER_IRQS(typer);
888 if (gic_irqs > 1020)
889 gic_irqs = 1020;
890 gic_data.irq_nr = gic_irqs;
891 1063
892 gic_data.domain = irq_domain_add_tree(node, &gic_irq_domain_ops, 1064 return -ENODEV;
893 &gic_data); 1065}
894 gic_data.rdists.rdist = alloc_percpu(typeof(*gic_data.rdists.rdist));
895 1066
896 if (WARN_ON(!gic_data.domain) || WARN_ON(!gic_data.rdists.rdist)) { 1067static int __init gic_acpi_count_gicr_regions(void)
1068{
1069 int count;
1070
1071 /*
1072 * Count how many redistributor regions we have. It is not allowed
1073 * to mix redistributor descriptions: GICR and GICC subtables have to be
1074 * mutually exclusive.
1075 */
1076 count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_REDISTRIBUTOR,
1077 gic_acpi_match_gicr, 0);
1078 if (count > 0) {
1079 single_redist = false;
1080 return count;
1081 }
1082
1083 count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT,
1084 gic_acpi_match_gicc, 0);
1085 if (count > 0)
1086 single_redist = true;
1087
1088 return count;
1089}
1090
1091static bool __init acpi_validate_gic_table(struct acpi_subtable_header *header,
1092 struct acpi_probe_entry *ape)
1093{
1094 struct acpi_madt_generic_distributor *dist;
1095 int count;
1096
1097 dist = (struct acpi_madt_generic_distributor *)header;
1098 if (dist->version != ape->driver_data)
1099 return false;
1100
1101 /* We need to do that exercise anyway, the sooner the better */
1102 count = gic_acpi_count_gicr_regions();
1103 if (count <= 0)
1104 return false;
1105
1106 nr_redist_regions = count;
1107 return true;
1108}
1109
1110#define ACPI_GICV3_DIST_MEM_SIZE (SZ_64K)
1111
1112static int __init
1113gic_acpi_init(struct acpi_subtable_header *header, const unsigned long end)
1114{
1115 struct acpi_madt_generic_distributor *dist;
1116 struct fwnode_handle *domain_handle;
1117 int i, err;
1118
1119 /* Get distributor base address */
1120 dist = (struct acpi_madt_generic_distributor *)header;
1121 dist_base = ioremap(dist->base_address, ACPI_GICV3_DIST_MEM_SIZE);
1122 if (!dist_base) {
1123 pr_err("Unable to map GICD registers\n");
1124 return -ENOMEM;
1125 }
1126
1127 err = gic_validate_dist_version(dist_base);
1128 if (err) {
1129 pr_err("No distributor detected at @%p, giving up", dist_base);
1130 goto out_dist_unmap;
1131 }
1132
1133 redist_regs = kzalloc(sizeof(*redist_regs) * nr_redist_regions,
1134 GFP_KERNEL);
1135 if (!redist_regs) {
897 err = -ENOMEM; 1136 err = -ENOMEM;
898 goto out_free; 1137 goto out_dist_unmap;
899 } 1138 }
900 1139
901 set_handle_irq(gic_handle_irq); 1140 err = gic_acpi_collect_gicr_base();
1141 if (err)
1142 goto out_redist_unmap;
902 1143
903 if (IS_ENABLED(CONFIG_ARM_GIC_V3_ITS) && gic_dist_supports_lpis()) 1144 domain_handle = irq_domain_alloc_fwnode(dist_base);
904 its_init(node, &gic_data.rdists, gic_data.domain); 1145 if (!domain_handle) {
1146 err = -ENOMEM;
1147 goto out_redist_unmap;
1148 }
905 1149
906 gic_smp_init(); 1150 err = gic_init_bases(dist_base, redist_regs, nr_redist_regions, 0,
907 gic_dist_init(); 1151 domain_handle);
908 gic_cpu_init(); 1152 if (err)
909 gic_cpu_pm_init(); 1153 goto out_fwhandle_free;
910 1154
1155 acpi_set_irq_model(ACPI_IRQ_MODEL_GIC, domain_handle);
911 return 0; 1156 return 0;
912 1157
913out_free: 1158out_fwhandle_free:
914 if (gic_data.domain) 1159 irq_domain_free_fwnode(domain_handle);
915 irq_domain_remove(gic_data.domain); 1160out_redist_unmap:
916 free_percpu(gic_data.rdists.rdist);
917out_unmap_rdist:
918 for (i = 0; i < nr_redist_regions; i++) 1161 for (i = 0; i < nr_redist_regions; i++)
919 if (rdist_regs[i].redist_base) 1162 if (redist_regs[i].redist_base)
920 iounmap(rdist_regs[i].redist_base); 1163 iounmap(redist_regs[i].redist_base);
921 kfree(rdist_regs); 1164 kfree(redist_regs);
922out_unmap_dist: 1165out_dist_unmap:
923 iounmap(dist_base); 1166 iounmap(dist_base);
924 return err; 1167 return err;
925} 1168}
926 1169IRQCHIP_ACPI_DECLARE(gic_v3, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR,
927IRQCHIP_DECLARE(gic_v3, "arm,gic-v3", gic_of_init); 1170 acpi_validate_gic_table, ACPI_MADT_GIC_VERSION_V3,
1171 gic_acpi_init);
1172IRQCHIP_ACPI_DECLARE(gic_v4, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR,
1173 acpi_validate_gic_table, ACPI_MADT_GIC_VERSION_V4,
1174 gic_acpi_init);
1175IRQCHIP_ACPI_DECLARE(gic_v3_or_v4, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR,
1176 acpi_validate_gic_table, ACPI_MADT_GIC_VERSION_NONE,
1177 gic_acpi_init);
1178#endif
diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c
index 8f9ebf714e2b..282344b95ec2 100644
--- a/drivers/irqchip/irq-gic.c
+++ b/drivers/irqchip/irq-gic.c
@@ -319,7 +319,7 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val,
319 writel_relaxed(val | bit, reg); 319 writel_relaxed(val | bit, reg);
320 raw_spin_unlock_irqrestore(&irq_controller_lock, flags); 320 raw_spin_unlock_irqrestore(&irq_controller_lock, flags);
321 321
322 return IRQ_SET_MASK_OK; 322 return IRQ_SET_MASK_OK_DONE;
323} 323}
324#endif 324#endif
325 325
diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c
index 9e17ef27a183..94a30da0cfac 100644
--- a/drivers/irqchip/irq-mips-gic.c
+++ b/drivers/irqchip/irq-mips-gic.c
@@ -29,16 +29,32 @@ struct gic_pcpu_mask {
29 DECLARE_BITMAP(pcpu_mask, GIC_MAX_INTRS); 29 DECLARE_BITMAP(pcpu_mask, GIC_MAX_INTRS);
30}; 30};
31 31
32struct gic_irq_spec {
33 enum {
34 GIC_DEVICE,
35 GIC_IPI
36 } type;
37
38 union {
39 struct cpumask *ipimask;
40 unsigned int hwirq;
41 };
42};
43
32static unsigned long __gic_base_addr; 44static unsigned long __gic_base_addr;
45
33static void __iomem *gic_base; 46static void __iomem *gic_base;
34static struct gic_pcpu_mask pcpu_masks[NR_CPUS]; 47static struct gic_pcpu_mask pcpu_masks[NR_CPUS];
35static DEFINE_SPINLOCK(gic_lock); 48static DEFINE_SPINLOCK(gic_lock);
36static struct irq_domain *gic_irq_domain; 49static struct irq_domain *gic_irq_domain;
50static struct irq_domain *gic_dev_domain;
51static struct irq_domain *gic_ipi_domain;
37static int gic_shared_intrs; 52static int gic_shared_intrs;
38static int gic_vpes; 53static int gic_vpes;
39static unsigned int gic_cpu_pin; 54static unsigned int gic_cpu_pin;
40static unsigned int timer_cpu_pin; 55static unsigned int timer_cpu_pin;
41static struct irq_chip gic_level_irq_controller, gic_edge_irq_controller; 56static struct irq_chip gic_level_irq_controller, gic_edge_irq_controller;
57DECLARE_BITMAP(ipi_resrv, GIC_MAX_INTRS);
42 58
43static void __gic_irq_dispatch(void); 59static void __gic_irq_dispatch(void);
44 60
@@ -264,9 +280,11 @@ static void gic_bind_eic_interrupt(int irq, int set)
264 GIC_VPE_EIC_SS(irq), set); 280 GIC_VPE_EIC_SS(irq), set);
265} 281}
266 282
267void gic_send_ipi(unsigned int intr) 283static void gic_send_ipi(struct irq_data *d, unsigned int cpu)
268{ 284{
269 gic_write(GIC_REG(SHARED, GIC_SH_WEDGE), GIC_SH_WEDGE_SET(intr)); 285 irq_hw_number_t hwirq = GIC_HWIRQ_TO_SHARED(irqd_to_hwirq(d));
286
287 gic_write(GIC_REG(SHARED, GIC_SH_WEDGE), GIC_SH_WEDGE_SET(hwirq));
270} 288}
271 289
272int gic_get_c0_compare_int(void) 290int gic_get_c0_compare_int(void)
@@ -449,7 +467,7 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *cpumask,
449 gic_map_to_vpe(irq, mips_cm_vp_id(cpumask_first(&tmp))); 467 gic_map_to_vpe(irq, mips_cm_vp_id(cpumask_first(&tmp)));
450 468
451 /* Update the pcpu_masks */ 469 /* Update the pcpu_masks */
452 for (i = 0; i < NR_CPUS; i++) 470 for (i = 0; i < gic_vpes; i++)
453 clear_bit(irq, pcpu_masks[i].pcpu_mask); 471 clear_bit(irq, pcpu_masks[i].pcpu_mask);
454 set_bit(irq, pcpu_masks[cpumask_first(&tmp)].pcpu_mask); 472 set_bit(irq, pcpu_masks[cpumask_first(&tmp)].pcpu_mask);
455 473
@@ -479,6 +497,7 @@ static struct irq_chip gic_edge_irq_controller = {
479#ifdef CONFIG_SMP 497#ifdef CONFIG_SMP
480 .irq_set_affinity = gic_set_affinity, 498 .irq_set_affinity = gic_set_affinity,
481#endif 499#endif
500 .ipi_send_single = gic_send_ipi,
482}; 501};
483 502
484static void gic_handle_local_int(bool chained) 503static void gic_handle_local_int(bool chained)
@@ -572,83 +591,6 @@ static void gic_irq_dispatch(struct irq_desc *desc)
572 gic_handle_shared_int(true); 591 gic_handle_shared_int(true);
573} 592}
574 593
575#ifdef CONFIG_MIPS_GIC_IPI
576static int gic_resched_int_base;
577static int gic_call_int_base;
578
579unsigned int plat_ipi_resched_int_xlate(unsigned int cpu)
580{
581 return gic_resched_int_base + cpu;
582}
583
584unsigned int plat_ipi_call_int_xlate(unsigned int cpu)
585{
586 return gic_call_int_base + cpu;
587}
588
589static irqreturn_t ipi_resched_interrupt(int irq, void *dev_id)
590{
591 scheduler_ipi();
592
593 return IRQ_HANDLED;
594}
595
596static irqreturn_t ipi_call_interrupt(int irq, void *dev_id)
597{
598 generic_smp_call_function_interrupt();
599
600 return IRQ_HANDLED;
601}
602
603static struct irqaction irq_resched = {
604 .handler = ipi_resched_interrupt,
605 .flags = IRQF_PERCPU,
606 .name = "IPI resched"
607};
608
609static struct irqaction irq_call = {
610 .handler = ipi_call_interrupt,
611 .flags = IRQF_PERCPU,
612 .name = "IPI call"
613};
614
615static __init void gic_ipi_init_one(unsigned int intr, int cpu,
616 struct irqaction *action)
617{
618 int virq = irq_create_mapping(gic_irq_domain,
619 GIC_SHARED_TO_HWIRQ(intr));
620 int i;
621
622 gic_map_to_vpe(intr, mips_cm_vp_id(cpu));
623 for (i = 0; i < NR_CPUS; i++)
624 clear_bit(intr, pcpu_masks[i].pcpu_mask);
625 set_bit(intr, pcpu_masks[cpu].pcpu_mask);
626
627 irq_set_irq_type(virq, IRQ_TYPE_EDGE_RISING);
628
629 irq_set_handler(virq, handle_percpu_irq);
630 setup_irq(virq, action);
631}
632
633static __init void gic_ipi_init(void)
634{
635 int i;
636
637 /* Use last 2 * NR_CPUS interrupts as IPIs */
638 gic_resched_int_base = gic_shared_intrs - nr_cpu_ids;
639 gic_call_int_base = gic_resched_int_base - nr_cpu_ids;
640
641 for (i = 0; i < nr_cpu_ids; i++) {
642 gic_ipi_init_one(gic_call_int_base + i, i, &irq_call);
643 gic_ipi_init_one(gic_resched_int_base + i, i, &irq_resched);
644 }
645}
646#else
647static inline void gic_ipi_init(void)
648{
649}
650#endif
651
652static void __init gic_basic_init(void) 594static void __init gic_basic_init(void)
653{ 595{
654 unsigned int i; 596 unsigned int i;
@@ -753,19 +695,21 @@ static int gic_local_irq_domain_map(struct irq_domain *d, unsigned int virq,
753} 695}
754 696
755static int gic_shared_irq_domain_map(struct irq_domain *d, unsigned int virq, 697static int gic_shared_irq_domain_map(struct irq_domain *d, unsigned int virq,
756 irq_hw_number_t hw) 698 irq_hw_number_t hw, unsigned int vpe)
757{ 699{
758 int intr = GIC_HWIRQ_TO_SHARED(hw); 700 int intr = GIC_HWIRQ_TO_SHARED(hw);
759 unsigned long flags; 701 unsigned long flags;
702 int i;
760 703
761 irq_set_chip_and_handler(virq, &gic_level_irq_controller, 704 irq_set_chip_and_handler(virq, &gic_level_irq_controller,
762 handle_level_irq); 705 handle_level_irq);
763 706
764 spin_lock_irqsave(&gic_lock, flags); 707 spin_lock_irqsave(&gic_lock, flags);
765 gic_map_to_pin(intr, gic_cpu_pin); 708 gic_map_to_pin(intr, gic_cpu_pin);
766 /* Map to VPE 0 by default */ 709 gic_map_to_vpe(intr, vpe);
767 gic_map_to_vpe(intr, 0); 710 for (i = 0; i < gic_vpes; i++)
768 set_bit(intr, pcpu_masks[0].pcpu_mask); 711 clear_bit(intr, pcpu_masks[i].pcpu_mask);
712 set_bit(intr, pcpu_masks[vpe].pcpu_mask);
769 spin_unlock_irqrestore(&gic_lock, flags); 713 spin_unlock_irqrestore(&gic_lock, flags);
770 714
771 return 0; 715 return 0;
@@ -776,10 +720,93 @@ static int gic_irq_domain_map(struct irq_domain *d, unsigned int virq,
776{ 720{
777 if (GIC_HWIRQ_TO_LOCAL(hw) < GIC_NUM_LOCAL_INTRS) 721 if (GIC_HWIRQ_TO_LOCAL(hw) < GIC_NUM_LOCAL_INTRS)
778 return gic_local_irq_domain_map(d, virq, hw); 722 return gic_local_irq_domain_map(d, virq, hw);
779 return gic_shared_irq_domain_map(d, virq, hw); 723 return gic_shared_irq_domain_map(d, virq, hw, 0);
780} 724}
781 725
782static int gic_irq_domain_xlate(struct irq_domain *d, struct device_node *ctrlr, 726static int gic_irq_domain_alloc(struct irq_domain *d, unsigned int virq,
727 unsigned int nr_irqs, void *arg)
728{
729 struct gic_irq_spec *spec = arg;
730 irq_hw_number_t hwirq, base_hwirq;
731 int cpu, ret, i;
732
733 if (spec->type == GIC_DEVICE) {
734 /* verify that it doesn't conflict with an IPI irq */
735 if (test_bit(spec->hwirq, ipi_resrv))
736 return -EBUSY;
737 } else {
738 base_hwirq = find_first_bit(ipi_resrv, gic_shared_intrs);
739 if (base_hwirq == gic_shared_intrs) {
740 return -ENOMEM;
741 }
742
743 /* check that we have enough space */
744 for (i = base_hwirq; i < nr_irqs; i++) {
745 if (!test_bit(i, ipi_resrv))
746 return -EBUSY;
747 }
748 bitmap_clear(ipi_resrv, base_hwirq, nr_irqs);
749
750 /* map the hwirq for each cpu consecutively */
751 i = 0;
752 for_each_cpu(cpu, spec->ipimask) {
753 hwirq = GIC_SHARED_TO_HWIRQ(base_hwirq + i);
754
755 ret = irq_domain_set_hwirq_and_chip(d, virq + i, hwirq,
756 &gic_edge_irq_controller,
757 NULL);
758 if (ret)
759 goto error;
760
761 ret = gic_shared_irq_domain_map(d, virq + i, hwirq, cpu);
762 if (ret)
763 goto error;
764
765 i++;
766 }
767
768 /*
769 * tell the parent about the base hwirq we allocated so it can
770 * set its own domain data
771 */
772 spec->hwirq = base_hwirq;
773 }
774
775 return 0;
776error:
777 bitmap_set(ipi_resrv, base_hwirq, nr_irqs);
778 return ret;
779}
780
781void gic_irq_domain_free(struct irq_domain *d, unsigned int virq,
782 unsigned int nr_irqs)
783{
784 irq_hw_number_t base_hwirq;
785 struct irq_data *data;
786
787 data = irq_get_irq_data(virq);
788 if (!data)
789 return;
790
791 base_hwirq = GIC_HWIRQ_TO_SHARED(irqd_to_hwirq(data));
792 bitmap_set(ipi_resrv, base_hwirq, nr_irqs);
793}
794
795int gic_irq_domain_match(struct irq_domain *d, struct device_node *node,
796 enum irq_domain_bus_token bus_token)
797{
798 /* this domain shouldn't be accessed directly */
799 return 0;
800}
801
802static const struct irq_domain_ops gic_irq_domain_ops = {
803 .map = gic_irq_domain_map,
804 .alloc = gic_irq_domain_alloc,
805 .free = gic_irq_domain_free,
806 .match = gic_irq_domain_match,
807};
808
809static int gic_dev_domain_xlate(struct irq_domain *d, struct device_node *ctrlr,
783 const u32 *intspec, unsigned int intsize, 810 const u32 *intspec, unsigned int intsize,
784 irq_hw_number_t *out_hwirq, 811 irq_hw_number_t *out_hwirq,
785 unsigned int *out_type) 812 unsigned int *out_type)
@@ -798,9 +825,130 @@ static int gic_irq_domain_xlate(struct irq_domain *d, struct device_node *ctrlr,
798 return 0; 825 return 0;
799} 826}
800 827
801static const struct irq_domain_ops gic_irq_domain_ops = { 828static int gic_dev_domain_alloc(struct irq_domain *d, unsigned int virq,
802 .map = gic_irq_domain_map, 829 unsigned int nr_irqs, void *arg)
803 .xlate = gic_irq_domain_xlate, 830{
831 struct irq_fwspec *fwspec = arg;
832 struct gic_irq_spec spec = {
833 .type = GIC_DEVICE,
834 .hwirq = fwspec->param[1],
835 };
836 int i, ret;
837 bool is_shared = fwspec->param[0] == GIC_SHARED;
838
839 if (is_shared) {
840 ret = irq_domain_alloc_irqs_parent(d, virq, nr_irqs, &spec);
841 if (ret)
842 return ret;
843 }
844
845 for (i = 0; i < nr_irqs; i++) {
846 irq_hw_number_t hwirq;
847
848 if (is_shared)
849 hwirq = GIC_SHARED_TO_HWIRQ(spec.hwirq + i);
850 else
851 hwirq = GIC_LOCAL_TO_HWIRQ(spec.hwirq + i);
852
853 ret = irq_domain_set_hwirq_and_chip(d, virq + i,
854 hwirq,
855 &gic_level_irq_controller,
856 NULL);
857 if (ret)
858 return ret;
859 }
860
861 return 0;
862}
863
864void gic_dev_domain_free(struct irq_domain *d, unsigned int virq,
865 unsigned int nr_irqs)
866{
867 /* no real allocation is done for dev irqs, so no need to free anything */
868 return;
869}
870
871static struct irq_domain_ops gic_dev_domain_ops = {
872 .xlate = gic_dev_domain_xlate,
873 .alloc = gic_dev_domain_alloc,
874 .free = gic_dev_domain_free,
875};
876
877static int gic_ipi_domain_xlate(struct irq_domain *d, struct device_node *ctrlr,
878 const u32 *intspec, unsigned int intsize,
879 irq_hw_number_t *out_hwirq,
880 unsigned int *out_type)
881{
882 /*
883 * There's nothing to translate here. hwirq is dynamically allocated and
884 * the irq type is always edge triggered.
885 */
886 *out_hwirq = 0;
887 *out_type = IRQ_TYPE_EDGE_RISING;
888
889 return 0;
890}
891
892static int gic_ipi_domain_alloc(struct irq_domain *d, unsigned int virq,
893 unsigned int nr_irqs, void *arg)
894{
895 struct cpumask *ipimask = arg;
896 struct gic_irq_spec spec = {
897 .type = GIC_IPI,
898 .ipimask = ipimask
899 };
900 int ret, i;
901
902 ret = irq_domain_alloc_irqs_parent(d, virq, nr_irqs, &spec);
903 if (ret)
904 return ret;
905
906 /* the parent should have set spec.hwirq to the base_hwirq it allocated */
907 for (i = 0; i < nr_irqs; i++) {
908 ret = irq_domain_set_hwirq_and_chip(d, virq + i,
909 GIC_SHARED_TO_HWIRQ(spec.hwirq + i),
910 &gic_edge_irq_controller,
911 NULL);
912 if (ret)
913 goto error;
914
915 ret = irq_set_irq_type(virq + i, IRQ_TYPE_EDGE_RISING);
916 if (ret)
917 goto error;
918 }
919
920 return 0;
921error:
922 irq_domain_free_irqs_parent(d, virq, nr_irqs);
923 return ret;
924}
925
926void gic_ipi_domain_free(struct irq_domain *d, unsigned int virq,
927 unsigned int nr_irqs)
928{
929 irq_domain_free_irqs_parent(d, virq, nr_irqs);
930}
931
932int gic_ipi_domain_match(struct irq_domain *d, struct device_node *node,
933 enum irq_domain_bus_token bus_token)
934{
935 bool is_ipi;
936
937 switch (bus_token) {
938 case DOMAIN_BUS_IPI:
939 is_ipi = d->bus_token == bus_token;
940 return to_of_node(d->fwnode) == node && is_ipi;
941 break;
942 default:
943 return 0;
944 }
945}
946
947static struct irq_domain_ops gic_ipi_domain_ops = {
948 .xlate = gic_ipi_domain_xlate,
949 .alloc = gic_ipi_domain_alloc,
950 .free = gic_ipi_domain_free,
951 .match = gic_ipi_domain_match,
804}; 952};
805 953
806static void __init __gic_init(unsigned long gic_base_addr, 954static void __init __gic_init(unsigned long gic_base_addr,
@@ -809,6 +957,7 @@ static void __init __gic_init(unsigned long gic_base_addr,
809 struct device_node *node) 957 struct device_node *node)
810{ 958{
811 unsigned int gicconfig; 959 unsigned int gicconfig;
960 unsigned int v[2];
812 961
813 __gic_base_addr = gic_base_addr; 962 __gic_base_addr = gic_base_addr;
814 963
@@ -864,9 +1013,32 @@ static void __init __gic_init(unsigned long gic_base_addr,
864 if (!gic_irq_domain) 1013 if (!gic_irq_domain)
865 panic("Failed to add GIC IRQ domain"); 1014 panic("Failed to add GIC IRQ domain");
866 1015
867 gic_basic_init(); 1016 gic_dev_domain = irq_domain_add_hierarchy(gic_irq_domain, 0,
1017 GIC_NUM_LOCAL_INTRS + gic_shared_intrs,
1018 node, &gic_dev_domain_ops, NULL);
1019 if (!gic_dev_domain)
1020 panic("Failed to add GIC DEV domain");
1021
1022 gic_ipi_domain = irq_domain_add_hierarchy(gic_irq_domain,
1023 IRQ_DOMAIN_FLAG_IPI_PER_CPU,
1024 GIC_NUM_LOCAL_INTRS + gic_shared_intrs,
1025 node, &gic_ipi_domain_ops, NULL);
1026 if (!gic_ipi_domain)
1027 panic("Failed to add GIC IPI domain");
868 1028
869 gic_ipi_init(); 1029 gic_ipi_domain->bus_token = DOMAIN_BUS_IPI;
1030
1031 if (node &&
1032 !of_property_read_u32_array(node, "mti,reserved-ipi-vectors", v, 2)) {
1033 bitmap_set(ipi_resrv, v[0], v[1]);
1034 } else {
1035 /* Make the last 2 * gic_vpes available for IPIs */
1036 bitmap_set(ipi_resrv,
1037 gic_shared_intrs - 2 * gic_vpes,
1038 2 * gic_vpes);
1039 }
1040
1041 gic_basic_init();
870} 1042}
871 1043
872void __init gic_init(unsigned long gic_base_addr, 1044void __init gic_init(unsigned long gic_base_addr,
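As a worked example of the IPI reservation above (illustrative numbers, not from this patch): with gic_shared_intrs = 256, gic_vpes = 4 and no "mti,reserved-ipi-vectors" property, __gic_init() reserves the last 2 * gic_vpes shared interrupts, i.e. hwirqs 248-255, in ipi_resrv; gic_ipi_domain_alloc() then carves consecutive entries out of that bitmap, one per CPU in the requested mask, and reports the base hwirq back to the IPI domain through spec.hwirq.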
diff --git a/drivers/irqchip/irq-mvebu-odmi.c b/drivers/irqchip/irq-mvebu-odmi.c
new file mode 100644
index 000000000000..b4d367868dbb
--- /dev/null
+++ b/drivers/irqchip/irq-mvebu-odmi.c
@@ -0,0 +1,236 @@
1/*
2 * Copyright (C) 2016 Marvell
3 *
4 * Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
5 *
6 * This file is licensed under the terms of the GNU General Public
7 * License version 2. This program is licensed "as is" without any
8 * warranty of any kind, whether express or implied.
9 */
10
11#define pr_fmt(fmt) "GIC-ODMI: " fmt
12
13#include <linux/irq.h>
14#include <linux/irqchip.h>
15#include <linux/irqdomain.h>
16#include <linux/kernel.h>
17#include <linux/msi.h>
18#include <linux/of_address.h>
19#include <linux/slab.h>
20#include <dt-bindings/interrupt-controller/arm-gic.h>
21
22#define GICP_ODMIN_SET 0x40
23#define GICP_ODMI_INT_NUM_SHIFT 12
24#define GICP_ODMIN_GM_EP_R0 0x110
25#define GICP_ODMIN_GM_EP_R1 0x114
26#define GICP_ODMIN_GM_EA_R0 0x108
27#define GICP_ODMIN_GM_EA_R1 0x118
28
29/*
30 * We don't support the group events, so we simply have 8 interrupts
31 * per frame.
32 */
33#define NODMIS_SHIFT 3
34#define NODMIS_PER_FRAME (1 << NODMIS_SHIFT)
35#define NODMIS_MASK (NODMIS_PER_FRAME - 1)
36
37struct odmi_data {
38 struct resource res;
39 void __iomem *base;
40 unsigned int spi_base;
41};
42
43static struct odmi_data *odmis;
44static unsigned long *odmis_bm;
45static unsigned int odmis_count;
46
47/* Protects odmis_bm */
48static DEFINE_SPINLOCK(odmis_bm_lock);
49
50static void odmi_compose_msi_msg(struct irq_data *d, struct msi_msg *msg)
51{
52 struct odmi_data *odmi;
53 phys_addr_t addr;
54 unsigned int odmin;
55
56 if (WARN_ON(d->hwirq >= odmis_count * NODMIS_PER_FRAME))
57 return;
58
59 odmi = &odmis[d->hwirq >> NODMIS_SHIFT];
60 odmin = d->hwirq & NODMIS_MASK;
61
62 addr = odmi->res.start + GICP_ODMIN_SET;
63
64 msg->address_hi = upper_32_bits(addr);
65 msg->address_lo = lower_32_bits(addr);
66 msg->data = odmin << GICP_ODMI_INT_NUM_SHIFT;
67}
68
69static struct irq_chip odmi_irq_chip = {
70 .name = "ODMI",
71 .irq_mask = irq_chip_mask_parent,
72 .irq_unmask = irq_chip_unmask_parent,
73 .irq_eoi = irq_chip_eoi_parent,
74 .irq_set_affinity = irq_chip_set_affinity_parent,
75 .irq_compose_msi_msg = odmi_compose_msi_msg,
76};
77
78static int odmi_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
79 unsigned int nr_irqs, void *args)
80{
81 struct odmi_data *odmi = NULL;
82 struct irq_fwspec fwspec;
83 struct irq_data *d;
84 unsigned int hwirq, odmin;
85 int ret;
86
87 spin_lock(&odmis_bm_lock);
88 hwirq = find_first_zero_bit(odmis_bm, NODMIS_PER_FRAME * odmis_count);
89 if (hwirq >= NODMIS_PER_FRAME * odmis_count) {
90 spin_unlock(&odmis_bm_lock);
91 return -ENOSPC;
92 }
93
94 __set_bit(hwirq, odmis_bm);
95 spin_unlock(&odmis_bm_lock);
96
97 odmi = &odmis[hwirq >> NODMIS_SHIFT];
98 odmin = hwirq & NODMIS_MASK;
99
100 fwspec.fwnode = domain->parent->fwnode;
101 fwspec.param_count = 3;
102 fwspec.param[0] = GIC_SPI;
103 fwspec.param[1] = odmi->spi_base - 32 + odmin;
104 fwspec.param[2] = IRQ_TYPE_EDGE_RISING;
105
106 ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &fwspec);
107 if (ret) {
108 pr_err("Cannot allocate parent IRQ\n");
109 spin_lock(&odmis_bm_lock);
110 __clear_bit(odmin, odmis_bm);
111 spin_unlock(&odmis_bm_lock);
112 return ret;
113 }
114
115 /* Configure the interrupt line to be edge */
116 d = irq_domain_get_irq_data(domain->parent, virq);
117 d->chip->irq_set_type(d, IRQ_TYPE_EDGE_RISING);
118
119 irq_domain_set_hwirq_and_chip(domain, virq, hwirq,
120 &odmi_irq_chip, NULL);
121
122 return 0;
123}
124
125static void odmi_irq_domain_free(struct irq_domain *domain,
126 unsigned int virq, unsigned int nr_irqs)
127{
128 struct irq_data *d = irq_domain_get_irq_data(domain, virq);
129
130 if (d->hwirq >= odmis_count * NODMIS_PER_FRAME) {
131 pr_err("Failed to teardown msi. Invalid hwirq %lu\n", d->hwirq);
132 return;
133 }
134
135 irq_domain_free_irqs_parent(domain, virq, nr_irqs);
136
137 /* Actually free the MSI */
138 spin_lock(&odmis_bm_lock);
139 __clear_bit(d->hwirq, odmis_bm);
140 spin_unlock(&odmis_bm_lock);
141}
142
143static const struct irq_domain_ops odmi_domain_ops = {
144 .alloc = odmi_irq_domain_alloc,
145 .free = odmi_irq_domain_free,
146};
147
148static struct irq_chip odmi_msi_irq_chip = {
149 .name = "ODMI",
150};
151
152static struct msi_domain_ops odmi_msi_ops = {
153};
154
155static struct msi_domain_info odmi_msi_domain_info = {
156 .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS),
157 .ops = &odmi_msi_ops,
158 .chip = &odmi_msi_irq_chip,
159};
160
161static int __init mvebu_odmi_init(struct device_node *node,
162 struct device_node *parent)
163{
164 struct irq_domain *inner_domain, *plat_domain;
165 int ret, i;
166
167 if (of_property_read_u32(node, "marvell,odmi-frames", &odmis_count))
168 return -EINVAL;
169
170 odmis = kcalloc(odmis_count, sizeof(struct odmi_data), GFP_KERNEL);
171 if (!odmis)
172 return -ENOMEM;
173
174 odmis_bm = kcalloc(BITS_TO_LONGS(odmis_count * NODMIS_PER_FRAME),
175 sizeof(long), GFP_KERNEL);
176 if (!odmis_bm) {
177 ret = -ENOMEM;
178 goto err_alloc;
179 }
180
181 for (i = 0; i < odmis_count; i++) {
182 struct odmi_data *odmi = &odmis[i];
183
184 ret = of_address_to_resource(node, i, &odmi->res);
185 if (ret)
186 goto err_unmap;
187
188 odmi->base = of_io_request_and_map(node, i, "odmi");
189 if (IS_ERR(odmi->base)) {
190 ret = PTR_ERR(odmi->base);
191 goto err_unmap;
192 }
193
194 if (of_property_read_u32_index(node, "marvell,spi-base",
195 i, &odmi->spi_base)) {
196 ret = -EINVAL;
197 goto err_unmap;
198 }
199 }
200
201 inner_domain = irq_domain_create_linear(of_node_to_fwnode(node),
202 odmis_count * NODMIS_PER_FRAME,
203 &odmi_domain_ops, NULL);
204 if (!inner_domain) {
205 ret = -ENOMEM;
206 goto err_unmap;
207 }
208
209 inner_domain->parent = irq_find_host(parent);
210
211 plat_domain = platform_msi_create_irq_domain(of_node_to_fwnode(node),
212 &odmi_msi_domain_info,
213 inner_domain);
214 if (!plat_domain) {
215 ret = -ENOMEM;
216 goto err_remove_inner;
217 }
218
219 return 0;
220
221err_remove_inner:
222 irq_domain_remove(inner_domain);
223err_unmap:
224 for (i = 0; i < odmis_count; i++) {
225 struct odmi_data *odmi = &odmis[i];
226
227 if (odmi->base && !IS_ERR(odmi->base))
228 iounmap(odmis[i].base);
229 }
230 kfree(odmis_bm);
231err_alloc:
232 kfree(odmis);
233 return ret;
234}
235
236IRQCHIP_DECLARE(mvebu_odmi, "marvell,odmi-controller", mvebu_odmi_init);
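To make the frame/index arithmetic in this driver concrete (illustrative values only): with NODMIS_SHIFT = 3, hwirq 13 selects frame 13 >> 3 = 1 and ODMI line 13 & 7 = 5 within it; odmi_compose_msi_msg() then points the MSI doorbell at that frame's GICP_ODMIN_SET register (odmi->res.start + 0x40) with msg->data = 5 << 12 = 0x5000, while the allocator wires the line to SPI odmi->spi_base - 32 + 5 in the parent GIC.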
diff --git a/drivers/irqchip/irq-mxs.c b/drivers/irqchip/irq-mxs.c
index efe50845939d..17304705f2cf 100644
--- a/drivers/irqchip/irq-mxs.c
+++ b/drivers/irqchip/irq-mxs.c
@@ -183,7 +183,7 @@ static void __iomem * __init icoll_init_iobase(struct device_node *np)
183 void __iomem *icoll_base; 183 void __iomem *icoll_base;
184 184
185 icoll_base = of_io_request_and_map(np, 0, np->name); 185 icoll_base = of_io_request_and_map(np, 0, np->name);
186 if (!icoll_base) 186 if (IS_ERR(icoll_base))
187 panic("%s: unable to map resource", np->full_name); 187 panic("%s: unable to map resource", np->full_name);
188 return icoll_base; 188 return icoll_base;
189} 189}
diff --git a/drivers/irqchip/irq-sunxi-nmi.c b/drivers/irqchip/irq-sunxi-nmi.c
index 0820f67cc9a7..668730c5cb66 100644
--- a/drivers/irqchip/irq-sunxi-nmi.c
+++ b/drivers/irqchip/irq-sunxi-nmi.c
@@ -160,9 +160,9 @@ static int __init sunxi_sc_nmi_irq_init(struct device_node *node,
160 160
161 gc = irq_get_domain_generic_chip(domain, 0); 161 gc = irq_get_domain_generic_chip(domain, 0);
162 gc->reg_base = of_io_request_and_map(node, 0, of_node_full_name(node)); 162 gc->reg_base = of_io_request_and_map(node, 0, of_node_full_name(node));
163 if (!gc->reg_base) { 163 if (IS_ERR(gc->reg_base)) {
164 pr_err("unable to map resource\n"); 164 pr_err("unable to map resource\n");
165 ret = -ENOMEM; 165 ret = PTR_ERR(gc->reg_base);
166 goto fail_irqd_remove; 166 goto fail_irqd_remove;
167 } 167 }
168 168
diff --git a/drivers/irqchip/irq-tango.c b/drivers/irqchip/irq-tango.c
new file mode 100644
index 000000000000..bdbb5c0ff7fe
--- /dev/null
+++ b/drivers/irqchip/irq-tango.c
@@ -0,0 +1,232 @@
1/*
2 * Copyright (C) 2014 Mans Rullgard <mans@mansr.com>
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of the GNU General Public License as published by the
6 * Free Software Foundation; either version 2 of the License, or (at your
7 * option) any later version.
8 */
9
10#include <linux/init.h>
11#include <linux/irq.h>
12#include <linux/irqchip.h>
13#include <linux/irqchip/chained_irq.h>
14#include <linux/ioport.h>
15#include <linux/io.h>
16#include <linux/of_address.h>
17#include <linux/of_irq.h>
18#include <linux/slab.h>
19
20#define IRQ0_CTL_BASE 0x0000
21#define IRQ1_CTL_BASE 0x0100
22#define EDGE_CTL_BASE 0x0200
23#define IRQ2_CTL_BASE 0x0300
24
25#define IRQ_CTL_HI 0x18
26#define EDGE_CTL_HI 0x20
27
28#define IRQ_STATUS 0x00
29#define IRQ_RAWSTAT 0x04
30#define IRQ_EN_SET 0x08
31#define IRQ_EN_CLR 0x0c
32#define IRQ_SOFT_SET 0x10
33#define IRQ_SOFT_CLR 0x14
34
35#define EDGE_STATUS 0x00
36#define EDGE_RAWSTAT 0x04
37#define EDGE_CFG_RISE 0x08
38#define EDGE_CFG_FALL 0x0c
39#define EDGE_CFG_RISE_SET 0x10
40#define EDGE_CFG_RISE_CLR 0x14
41#define EDGE_CFG_FALL_SET 0x18
42#define EDGE_CFG_FALL_CLR 0x1c
43
44struct tangox_irq_chip {
45 void __iomem *base;
46 unsigned long ctl;
47};
48
49static inline u32 intc_readl(struct tangox_irq_chip *chip, int reg)
50{
51 return readl_relaxed(chip->base + reg);
52}
53
54static inline void intc_writel(struct tangox_irq_chip *chip, int reg, u32 val)
55{
56 writel_relaxed(val, chip->base + reg);
57}
58
59static void tangox_dispatch_irqs(struct irq_domain *dom, unsigned int status,
60 int base)
61{
62 unsigned int hwirq;
63 unsigned int virq;
64
65 while (status) {
66 hwirq = __ffs(status);
67 virq = irq_find_mapping(dom, base + hwirq);
68 if (virq)
69 generic_handle_irq(virq);
70 status &= ~BIT(hwirq);
71 }
72}
73
74static void tangox_irq_handler(struct irq_desc *desc)
75{
76 struct irq_domain *dom = irq_desc_get_handler_data(desc);
77 struct irq_chip *host_chip = irq_desc_get_chip(desc);
78 struct tangox_irq_chip *chip = dom->host_data;
79 unsigned int status_lo, status_hi;
80
81 chained_irq_enter(host_chip, desc);
82
83 status_lo = intc_readl(chip, chip->ctl + IRQ_STATUS);
84 status_hi = intc_readl(chip, chip->ctl + IRQ_CTL_HI + IRQ_STATUS);
85
86 tangox_dispatch_irqs(dom, status_lo, 0);
87 tangox_dispatch_irqs(dom, status_hi, 32);
88
89 chained_irq_exit(host_chip, desc);
90}
91
92static int tangox_irq_set_type(struct irq_data *d, unsigned int flow_type)
93{
94 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
95 struct tangox_irq_chip *chip = gc->domain->host_data;
96 struct irq_chip_regs *regs = &gc->chip_types[0].regs;
97
98 switch (flow_type & IRQ_TYPE_SENSE_MASK) {
99 case IRQ_TYPE_EDGE_RISING:
100 intc_writel(chip, regs->type + EDGE_CFG_RISE_SET, d->mask);
101 intc_writel(chip, regs->type + EDGE_CFG_FALL_CLR, d->mask);
102 break;
103
104 case IRQ_TYPE_EDGE_FALLING:
105 intc_writel(chip, regs->type + EDGE_CFG_RISE_CLR, d->mask);
106 intc_writel(chip, regs->type + EDGE_CFG_FALL_SET, d->mask);
107 break;
108
109 case IRQ_TYPE_LEVEL_HIGH:
110 intc_writel(chip, regs->type + EDGE_CFG_RISE_CLR, d->mask);
111 intc_writel(chip, regs->type + EDGE_CFG_FALL_CLR, d->mask);
112 break;
113
114 case IRQ_TYPE_LEVEL_LOW:
115 intc_writel(chip, regs->type + EDGE_CFG_RISE_SET, d->mask);
116 intc_writel(chip, regs->type + EDGE_CFG_FALL_SET, d->mask);
117 break;
118
119 default:
120 pr_err("Invalid trigger mode %x for IRQ %d\n",
121 flow_type, d->irq);
122 return -EINVAL;
123 }
124
125 return irq_setup_alt_chip(d, flow_type);
126}
127
128static void __init tangox_irq_init_chip(struct irq_chip_generic *gc,
129 unsigned long ctl_offs,
130 unsigned long edge_offs)
131{
132 struct tangox_irq_chip *chip = gc->domain->host_data;
133 struct irq_chip_type *ct = gc->chip_types;
134 unsigned long ctl_base = chip->ctl + ctl_offs;
135 unsigned long edge_base = EDGE_CTL_BASE + edge_offs;
136 int i;
137
138 gc->reg_base = chip->base;
139 gc->unused = 0;
140
141 for (i = 0; i < 2; i++) {
142 ct[i].chip.irq_ack = irq_gc_ack_set_bit;
143 ct[i].chip.irq_mask = irq_gc_mask_disable_reg;
144 ct[i].chip.irq_mask_ack = irq_gc_mask_disable_reg_and_ack;
145 ct[i].chip.irq_unmask = irq_gc_unmask_enable_reg;
146 ct[i].chip.irq_set_type = tangox_irq_set_type;
147 ct[i].chip.name = gc->domain->name;
148
149 ct[i].regs.enable = ctl_base + IRQ_EN_SET;
150 ct[i].regs.disable = ctl_base + IRQ_EN_CLR;
151 ct[i].regs.ack = edge_base + EDGE_RAWSTAT;
152 ct[i].regs.type = edge_base;
153 }
154
155 ct[0].type = IRQ_TYPE_LEVEL_MASK;
156 ct[0].handler = handle_level_irq;
157
158 ct[1].type = IRQ_TYPE_EDGE_BOTH;
159 ct[1].handler = handle_edge_irq;
160
161 intc_writel(chip, ct->regs.disable, 0xffffffff);
162 intc_writel(chip, ct->regs.ack, 0xffffffff);
163}
164
165static void __init tangox_irq_domain_init(struct irq_domain *dom)
166{
167 struct irq_chip_generic *gc;
168 int i;
169
170 for (i = 0; i < 2; i++) {
171 gc = irq_get_domain_generic_chip(dom, i * 32);
172 tangox_irq_init_chip(gc, i * IRQ_CTL_HI, i * EDGE_CTL_HI);
173 }
174}
175
176static int __init tangox_irq_init(void __iomem *base, struct resource *baseres,
177 struct device_node *node)
178{
179 struct tangox_irq_chip *chip;
180 struct irq_domain *dom;
181 struct resource res;
182 int irq;
183 int err;
184
185 irq = irq_of_parse_and_map(node, 0);
186 if (!irq)
187 panic("%s: failed to get IRQ", node->name);
188
189 err = of_address_to_resource(node, 0, &res);
190 if (err)
191 panic("%s: failed to get address", node->name);
192
193 chip = kzalloc(sizeof(*chip), GFP_KERNEL);
194 chip->ctl = res.start - baseres->start;
195 chip->base = base;
196
197 dom = irq_domain_add_linear(node, 64, &irq_generic_chip_ops, chip);
198 if (!dom)
199 panic("%s: failed to create irqdomain", node->name);
200
201 err = irq_alloc_domain_generic_chips(dom, 32, 2, node->name,
202 handle_level_irq, 0, 0, 0);
203 if (err)
204 panic("%s: failed to allocate irqchip", node->name);
205
206 tangox_irq_domain_init(dom);
207
208 irq_set_chained_handler(irq, tangox_irq_handler);
209 irq_set_handler_data(irq, dom);
210
211 return 0;
212}
213
214static int __init tangox_of_irq_init(struct device_node *node,
215 struct device_node *parent)
216{
217 struct device_node *c;
218 struct resource res;
219 void __iomem *base;
220
221 base = of_iomap(node, 0);
222 if (!base)
223 panic("%s: of_iomap failed", node->name);
224
225 of_address_to_resource(node, 0, &res);
226
227 for_each_child_of_node(node, c)
228 tangox_irq_init(base, &res, c);
229
230 return 0;
231}
232IRQCHIP_DECLARE(tangox_intc, "sigma,smp8642-intc", tangox_of_irq_init);
diff --git a/drivers/irqchip/irq-ts4800.c b/drivers/irqchip/irq-ts4800.c
index 4192bdcd2734..2325fb3c482b 100644
--- a/drivers/irqchip/irq-ts4800.c
+++ b/drivers/irqchip/irq-ts4800.c
@@ -59,7 +59,7 @@ static int ts4800_irqdomain_map(struct irq_domain *d, unsigned int irq,
59 return 0; 59 return 0;
60} 60}
61 61
62struct irq_domain_ops ts4800_ic_ops = { 62static const struct irq_domain_ops ts4800_ic_ops = {
63 .map = ts4800_irqdomain_map, 63 .map = ts4800_irqdomain_map,
64 .xlate = irq_domain_xlate_onecell, 64 .xlate = irq_domain_xlate_onecell,
65}; 65};
diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig
index fa593dd3efe1..3772f3ac956e 100644
--- a/drivers/net/ethernet/intel/Kconfig
+++ b/drivers/net/ethernet/intel/Kconfig
@@ -83,6 +83,15 @@ config E1000E
83 To compile this driver as a module, choose M here. The module 83 To compile this driver as a module, choose M here. The module
84 will be called e1000e. 84 will be called e1000e.
85 85
86config E1000E_HWTS
87 bool "Support HW cross-timestamp on PCH devices"
88 default y
89 depends on E1000E && X86
90 ---help---
91 Say Y to enable hardware supported cross-timestamping on PCH
92 devices. The cross-timestamp is available through the PTP clock
93 driver precise cross-timestamp ioctl (PTP_SYS_OFFSET_PRECISE).
94
86config IGB 95config IGB
87 tristate "Intel(R) 82575/82576 PCI-Express Gigabit Ethernet support" 96 tristate "Intel(R) 82575/82576 PCI-Express Gigabit Ethernet support"
88 depends on PCI 97 depends on PCI
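For reference, a minimal user-space sketch of the PTP_SYS_OFFSET_PRECISE ioctl mentioned in the help text above; the /dev/ptp0 path is an assumption, and the uapi header in use must already carry struct ptp_sys_offset_precise:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/ptp_clock.h>

int main(void)
{
	struct ptp_sys_offset_precise off = { 0 };
	int fd = open("/dev/ptp0", O_RDWR);	/* assumed PHC device node */

	if (fd < 0 || ioctl(fd, PTP_SYS_OFFSET_PRECISE, &off)) {
		perror("PTP_SYS_OFFSET_PRECISE");
		return 1;
	}
	printf("device   %lld.%09u\n", off.device.sec, off.device.nsec);
	printf("realtime %lld.%09u\n", off.sys_realtime.sec, off.sys_realtime.nsec);
	printf("monoraw  %lld.%09u\n", off.sys_monoraw.sec, off.sys_monoraw.nsec);
	return 0;
}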
diff --git a/drivers/net/ethernet/intel/e1000e/defines.h b/drivers/net/ethernet/intel/e1000e/defines.h
index f7c7804d79e5..0641c0098738 100644
--- a/drivers/net/ethernet/intel/e1000e/defines.h
+++ b/drivers/net/ethernet/intel/e1000e/defines.h
@@ -528,6 +528,11 @@
528#define E1000_RXCW_C 0x20000000 /* Receive config */ 528#define E1000_RXCW_C 0x20000000 /* Receive config */
529#define E1000_RXCW_SYNCH 0x40000000 /* Receive config synch */ 529#define E1000_RXCW_SYNCH 0x40000000 /* Receive config synch */
530 530
531/* HH Time Sync */
532#define E1000_TSYNCTXCTL_MAX_ALLOWED_DLY_MASK 0x0000F000 /* max delay */
533#define E1000_TSYNCTXCTL_SYNC_COMP 0x40000000 /* sync complete */
534#define E1000_TSYNCTXCTL_START_SYNC 0x80000000 /* initiate sync */
535
531#define E1000_TSYNCTXCTL_VALID 0x00000001 /* Tx timestamp valid */ 536#define E1000_TSYNCTXCTL_VALID 0x00000001 /* Tx timestamp valid */
532#define E1000_TSYNCTXCTL_ENABLED 0x00000010 /* enable Tx timestamping */ 537#define E1000_TSYNCTXCTL_ENABLED 0x00000010 /* enable Tx timestamping */
533 538
diff --git a/drivers/net/ethernet/intel/e1000e/ptp.c b/drivers/net/ethernet/intel/e1000e/ptp.c
index 25a0ad5102d6..e2ff3ef75d5d 100644
--- a/drivers/net/ethernet/intel/e1000e/ptp.c
+++ b/drivers/net/ethernet/intel/e1000e/ptp.c
@@ -26,6 +26,12 @@
26 26
27#include "e1000.h" 27#include "e1000.h"
28 28
29#ifdef CONFIG_E1000E_HWTS
30#include <linux/clocksource.h>
31#include <linux/ktime.h>
32#include <asm/tsc.h>
33#endif
34
29/** 35/**
30 * e1000e_phc_adjfreq - adjust the frequency of the hardware clock 36 * e1000e_phc_adjfreq - adjust the frequency of the hardware clock
31 * @ptp: ptp clock structure 37 * @ptp: ptp clock structure
@@ -98,6 +104,78 @@ static int e1000e_phc_adjtime(struct ptp_clock_info *ptp, s64 delta)
98 return 0; 104 return 0;
99} 105}
100 106
107#ifdef CONFIG_E1000E_HWTS
108#define MAX_HW_WAIT_COUNT (3)
109
110/**
111 * e1000e_phc_get_syncdevicetime - Callback given to timekeeping code that reads system/device registers
112 * @device: current device time
113 * @system: system counter value read synchronously with device time
114 * @ctx: context provided by timekeeping code
115 *
116 * Read device and system (ART) clock simultaneously and return the corrected
117 * clock values in ns.
118 **/
119static int e1000e_phc_get_syncdevicetime(ktime_t *device,
120 struct system_counterval_t *system,
121 void *ctx)
122{
123 struct e1000_adapter *adapter = (struct e1000_adapter *)ctx;
124 struct e1000_hw *hw = &adapter->hw;
125 unsigned long flags;
126 int i;
127 u32 tsync_ctrl;
128 cycle_t dev_cycles;
129 cycle_t sys_cycles;
130
131 tsync_ctrl = er32(TSYNCTXCTL);
132 tsync_ctrl |= E1000_TSYNCTXCTL_START_SYNC |
133 E1000_TSYNCTXCTL_MAX_ALLOWED_DLY_MASK;
134 ew32(TSYNCTXCTL, tsync_ctrl);
135 for (i = 0; i < MAX_HW_WAIT_COUNT; ++i) {
136 udelay(1);
137 tsync_ctrl = er32(TSYNCTXCTL);
138 if (tsync_ctrl & E1000_TSYNCTXCTL_SYNC_COMP)
139 break;
140 }
141
142 if (i == MAX_HW_WAIT_COUNT)
143 return -ETIMEDOUT;
144
145 dev_cycles = er32(SYSSTMPH);
146 dev_cycles <<= 32;
147 dev_cycles |= er32(SYSSTMPL);
148 spin_lock_irqsave(&adapter->systim_lock, flags);
149 *device = ns_to_ktime(timecounter_cyc2time(&adapter->tc, dev_cycles));
150 spin_unlock_irqrestore(&adapter->systim_lock, flags);
151
152 sys_cycles = er32(PLTSTMPH);
153 sys_cycles <<= 32;
154 sys_cycles |= er32(PLTSTMPL);
155 *system = convert_art_to_tsc(sys_cycles);
156
157 return 0;
158}
159
160/**
161 * e1000e_phc_getcrosststamp - Reads the current system/device cross timestamp
162 * @ptp: ptp clock structure
163 * @cts: structure containing timestamp
164 *
165 * Read device and system (ART) clock simultaneously and return the scaled
166 * clock values in ns.
167 **/
168static int e1000e_phc_getcrosststamp(struct ptp_clock_info *ptp,
169 struct system_device_crosststamp *xtstamp)
170{
171 struct e1000_adapter *adapter = container_of(ptp, struct e1000_adapter,
172 ptp_clock_info);
173
174 return get_device_system_crosststamp(e1000e_phc_get_syncdevicetime,
175 adapter, NULL, xtstamp);
176}
177#endif/*CONFIG_E1000E_HWTS*/
178
101/** 179/**
102 * e1000e_phc_gettime - Reads the current time from the hardware clock 180 * e1000e_phc_gettime - Reads the current time from the hardware clock
103 * @ptp: ptp clock structure 181 * @ptp: ptp clock structure
@@ -236,6 +314,13 @@ void e1000e_ptp_init(struct e1000_adapter *adapter)
236 break; 314 break;
237 } 315 }
238 316
317#ifdef CONFIG_E1000E_HWTS
318 /* CPU must have ART and GBe must be from Sunrise Point or greater */
319 if (hw->mac.type >= e1000_pch_spt && boot_cpu_has(X86_FEATURE_ART))
320 adapter->ptp_clock_info.getcrosststamp =
321 e1000e_phc_getcrosststamp;
322#endif/*CONFIG_E1000E_HWTS*/
323
239 INIT_DELAYED_WORK(&adapter->systim_overflow_work, 324 INIT_DELAYED_WORK(&adapter->systim_overflow_work,
240 e1000e_systim_overflow_work); 325 e1000e_systim_overflow_work);
241 326
diff --git a/drivers/net/ethernet/intel/e1000e/regs.h b/drivers/net/ethernet/intel/e1000e/regs.h
index 1d5e0b77062a..0cb4d365e5ad 100644
--- a/drivers/net/ethernet/intel/e1000e/regs.h
+++ b/drivers/net/ethernet/intel/e1000e/regs.h
@@ -245,6 +245,10 @@
245#define E1000_SYSTIML 0x0B600 /* System time register Low - RO */ 245#define E1000_SYSTIML 0x0B600 /* System time register Low - RO */
246#define E1000_SYSTIMH 0x0B604 /* System time register High - RO */ 246#define E1000_SYSTIMH 0x0B604 /* System time register High - RO */
247#define E1000_TIMINCA 0x0B608 /* Increment attributes register - RW */ 247#define E1000_TIMINCA 0x0B608 /* Increment attributes register - RW */
248#define E1000_SYSSTMPL 0x0B648 /* HH Timesync system stamp low register */
249#define E1000_SYSSTMPH 0x0B64C /* HH Timesync system stamp hi register */
250#define E1000_PLTSTMPL 0x0B640 /* HH Timesync platform stamp low register */
251#define E1000_PLTSTMPH 0x0B644 /* HH Timesync platform stamp hi register */
248#define E1000_RXMTRL 0x0B634 /* Time sync Rx EtherType and Msg Type - RW */ 252#define E1000_RXMTRL 0x0B634 /* Time sync Rx EtherType and Msg Type - RW */
249#define E1000_RXUDP 0x0B638 /* Time Sync Rx UDP Port - RW */ 253#define E1000_RXUDP 0x0B638 /* Time Sync Rx UDP Port - RW */
250 254
diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c
index da7bae991552..579fd65299a0 100644
--- a/drivers/ptp/ptp_chardev.c
+++ b/drivers/ptp/ptp_chardev.c
@@ -22,6 +22,7 @@
22#include <linux/poll.h> 22#include <linux/poll.h>
23#include <linux/sched.h> 23#include <linux/sched.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/timekeeping.h>
25 26
26#include "ptp_private.h" 27#include "ptp_private.h"
27 28
@@ -120,11 +121,13 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
120 struct ptp_clock_caps caps; 121 struct ptp_clock_caps caps;
121 struct ptp_clock_request req; 122 struct ptp_clock_request req;
122 struct ptp_sys_offset *sysoff = NULL; 123 struct ptp_sys_offset *sysoff = NULL;
124 struct ptp_sys_offset_precise precise_offset;
123 struct ptp_pin_desc pd; 125 struct ptp_pin_desc pd;
124 struct ptp_clock *ptp = container_of(pc, struct ptp_clock, clock); 126 struct ptp_clock *ptp = container_of(pc, struct ptp_clock, clock);
125 struct ptp_clock_info *ops = ptp->info; 127 struct ptp_clock_info *ops = ptp->info;
126 struct ptp_clock_time *pct; 128 struct ptp_clock_time *pct;
127 struct timespec64 ts; 129 struct timespec64 ts;
130 struct system_device_crosststamp xtstamp;
128 int enable, err = 0; 131 int enable, err = 0;
129 unsigned int i, pin_index; 132 unsigned int i, pin_index;
130 133
@@ -138,6 +141,7 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
138 caps.n_per_out = ptp->info->n_per_out; 141 caps.n_per_out = ptp->info->n_per_out;
139 caps.pps = ptp->info->pps; 142 caps.pps = ptp->info->pps;
140 caps.n_pins = ptp->info->n_pins; 143 caps.n_pins = ptp->info->n_pins;
144 caps.cross_timestamping = ptp->info->getcrosststamp != NULL;
141 if (copy_to_user((void __user *)arg, &caps, sizeof(caps))) 145 if (copy_to_user((void __user *)arg, &caps, sizeof(caps)))
142 err = -EFAULT; 146 err = -EFAULT;
143 break; 147 break;
@@ -180,6 +184,29 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
180 err = ops->enable(ops, &req, enable); 184 err = ops->enable(ops, &req, enable);
181 break; 185 break;
182 186
187 case PTP_SYS_OFFSET_PRECISE:
188 if (!ptp->info->getcrosststamp) {
189 err = -EOPNOTSUPP;
190 break;
191 }
192 err = ptp->info->getcrosststamp(ptp->info, &xtstamp);
193 if (err)
194 break;
195
196 ts = ktime_to_timespec64(xtstamp.device);
197 precise_offset.device.sec = ts.tv_sec;
198 precise_offset.device.nsec = ts.tv_nsec;
199 ts = ktime_to_timespec64(xtstamp.sys_realtime);
200 precise_offset.sys_realtime.sec = ts.tv_sec;
201 precise_offset.sys_realtime.nsec = ts.tv_nsec;
202 ts = ktime_to_timespec64(xtstamp.sys_monoraw);
203 precise_offset.sys_monoraw.sec = ts.tv_sec;
204 precise_offset.sys_monoraw.nsec = ts.tv_nsec;
205 if (copy_to_user((void __user *)arg, &precise_offset,
206 sizeof(precise_offset)))
207 err = -EFAULT;
208 break;
209
183 case PTP_SYS_OFFSET: 210 case PTP_SYS_OFFSET:
184 sysoff = kmalloc(sizeof(*sysoff), GFP_KERNEL); 211 sysoff = kmalloc(sizeof(*sysoff), GFP_KERNEL);
185 if (!sysoff) { 212 if (!sysoff) {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4545e2e2ad45..5699bbc23feb 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -931,7 +931,7 @@ static int check_async_write(struct inode *inode, unsigned long bio_flags)
931 if (bio_flags & EXTENT_BIO_TREE_LOG) 931 if (bio_flags & EXTENT_BIO_TREE_LOG)
932 return 0; 932 return 0;
933#ifdef CONFIG_X86 933#ifdef CONFIG_X86
934 if (static_cpu_has_safe(X86_FEATURE_XMM4_2)) 934 if (static_cpu_has(X86_FEATURE_XMM4_2))
935 return 0; 935 return 0;
936#endif 936#endif
937 return 1; 937 return 1;
diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index bdcf358dfce2..0d442e34c349 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -190,9 +190,9 @@ extern void clockevents_config_and_register(struct clock_event_device *dev,
190extern int clockevents_update_freq(struct clock_event_device *ce, u32 freq); 190extern int clockevents_update_freq(struct clock_event_device *ce, u32 freq);
191 191
192static inline void 192static inline void
193clockevents_calc_mult_shift(struct clock_event_device *ce, u32 freq, u32 minsec) 193clockevents_calc_mult_shift(struct clock_event_device *ce, u32 freq, u32 maxsec)
194{ 194{
195 return clocks_calc_mult_shift(&ce->mult, &ce->shift, NSEC_PER_SEC, freq, minsec); 195 return clocks_calc_mult_shift(&ce->mult, &ce->shift, NSEC_PER_SEC, freq, maxsec);
196} 196}
197 197
198extern void clockevents_suspend(void); 198extern void clockevents_suspend(void);
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 6013021a3b39..a307bf62974f 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -118,6 +118,23 @@ struct clocksource {
118/* simplify initialization of mask field */ 118/* simplify initialization of mask field */
119#define CLOCKSOURCE_MASK(bits) (cycle_t)((bits) < 64 ? ((1ULL<<(bits))-1) : -1) 119#define CLOCKSOURCE_MASK(bits) (cycle_t)((bits) < 64 ? ((1ULL<<(bits))-1) : -1)
120 120
121static inline u32 clocksource_freq2mult(u32 freq, u32 shift_constant, u64 from)
122{
123 /* freq = cyc/from
124 * mult/2^shift = ns/cyc
125 * mult = ns/cyc * 2^shift
126 * mult = from/freq * 2^shift
127 * mult = from * 2^shift / freq
128 * mult = (from<<shift) / freq
129 */
130 u64 tmp = ((u64)from) << shift_constant;
131
132 tmp += freq/2; /* round for do_div */
133 do_div(tmp, freq);
134
135 return (u32)tmp;
136}
137
121/** 138/**
122 * clocksource_khz2mult - calculates mult from khz and shift 139 * clocksource_khz2mult - calculates mult from khz and shift
123 * @khz: Clocksource frequency in KHz 140 * @khz: Clocksource frequency in KHz
@@ -128,19 +145,7 @@ struct clocksource {
128 */ 145 */
129static inline u32 clocksource_khz2mult(u32 khz, u32 shift_constant) 146static inline u32 clocksource_khz2mult(u32 khz, u32 shift_constant)
130{ 147{
131 /* khz = cyc/(Million ns) 148 return clocksource_freq2mult(khz, shift_constant, NSEC_PER_MSEC);
132 * mult/2^shift = ns/cyc
133 * mult = ns/cyc * 2^shift
134 * mult = 1Million/khz * 2^shift
135 * mult = 1000000 * 2^shift / khz
136 * mult = (1000000<<shift) / khz
137 */
138 u64 tmp = ((u64)1000000) << shift_constant;
139
140 tmp += khz/2; /* round for do_div */
141 do_div(tmp, khz);
142
143 return (u32)tmp;
144} 149}
145 150
146/** 151/**
@@ -154,19 +159,7 @@ static inline u32 clocksource_khz2mult(u32 khz, u32 shift_constant)
154 */ 159 */
155static inline u32 clocksource_hz2mult(u32 hz, u32 shift_constant) 160static inline u32 clocksource_hz2mult(u32 hz, u32 shift_constant)
156{ 161{
157 /* hz = cyc/(Billion ns) 162 return clocksource_freq2mult(hz, shift_constant, NSEC_PER_SEC);
158 * mult/2^shift = ns/cyc
159 * mult = ns/cyc * 2^shift
160 * mult = 1Billion/hz * 2^shift
161 * mult = 1000000000 * 2^shift / hz
162 * mult = (1000000000<<shift) / hz
163 */
164 u64 tmp = ((u64)1000000000) << shift_constant;
165
166 tmp += hz/2; /* round for do_div */
167 do_div(tmp, hz);
168
169 return (u32)tmp;
170} 163}
171 164
172/** 165/**
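A quick user-space sketch of the arithmetic that clocksource_freq2mult() factors out of the two helpers above, using made-up numbers (a 19.2 MHz counter and shift = 20) rather than anything from this patch:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t freq = 19200000;		/* hypothetical 19.2 MHz counter */
	uint32_t shift = 20;
	uint64_t tmp = 1000000000ULL << shift;	/* NSEC_PER_SEC << shift */

	tmp += freq / 2;			/* round before dividing */
	uint32_t mult = (uint32_t)(tmp / freq);	/* 54613333 */

	/* cycles -> ns is then (cyc * mult) >> shift; one second's worth: */
	uint64_t ns = ((uint64_t)freq * mult) >> shift;

	printf("mult=%u shift=%u -> %llu ns per %u cycles\n",
	       mult, shift, (unsigned long long)ns, freq);
	return 0;
}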
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index a27f4f17c382..b5ff9881bef8 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -20,12 +20,14 @@
20# define __pmem __attribute__((noderef, address_space(5))) 20# define __pmem __attribute__((noderef, address_space(5)))
21#ifdef CONFIG_SPARSE_RCU_POINTER 21#ifdef CONFIG_SPARSE_RCU_POINTER
22# define __rcu __attribute__((noderef, address_space(4))) 22# define __rcu __attribute__((noderef, address_space(4)))
23#else 23#else /* CONFIG_SPARSE_RCU_POINTER */
24# define __rcu 24# define __rcu
25#endif 25#endif /* CONFIG_SPARSE_RCU_POINTER */
26# define __private __attribute__((noderef))
26extern void __chk_user_ptr(const volatile void __user *); 27extern void __chk_user_ptr(const volatile void __user *);
27extern void __chk_io_ptr(const volatile void __iomem *); 28extern void __chk_io_ptr(const volatile void __iomem *);
28#else 29# define ACCESS_PRIVATE(p, member) (*((typeof((p)->member) __force *) &(p)->member))
30#else /* __CHECKER__ */
29# define __user 31# define __user
30# define __kernel 32# define __kernel
31# define __safe 33# define __safe
@@ -44,7 +46,9 @@ extern void __chk_io_ptr(const volatile void __iomem *);
44# define __percpu 46# define __percpu
45# define __rcu 47# define __rcu
46# define __pmem 48# define __pmem
47#endif 49# define __private
50# define ACCESS_PRIVATE(p, member) ((p)->member)
51#endif /* __CHECKER__ */
48 52
49/* Indirect macros required for expanded argument pasting, eg. __LINE__. */ 53/* Indirect macros required for expanded argument pasting, eg. __LINE__. */
50#define ___PASTE(a,b) a##b 54#define ___PASTE(a,b) a##b
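A short sketch of how the new __private annotation and ACCESS_PRIVATE() accessor are meant to be used (hypothetical structure, not taken from this patch); under sparse the direct dereference would warn, the accessor would not:

struct foo {
	int		nr;			/* normal, freely accessible */
	unsigned long	__private state;	/* internal: sparse-checked */
};

static void foo_set_busy(struct foo *f)
{
	ACCESS_PRIVATE(f, state) |= 1UL;	/* ok: force-casts the noderef away */
	/* f->state |= 1UL;  <- would trigger a sparse address-space warning */
}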
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index d2ca8c38f9c4..f9b1fab4388a 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -16,6 +16,7 @@
16#include <linux/node.h> 16#include <linux/node.h>
17#include <linux/compiler.h> 17#include <linux/compiler.h>
18#include <linux/cpumask.h> 18#include <linux/cpumask.h>
19#include <linux/cpuhotplug.h>
19 20
20struct device; 21struct device;
21struct device_node; 22struct device_node;
@@ -27,6 +28,9 @@ struct cpu {
27 struct device dev; 28 struct device dev;
28}; 29};
29 30
31extern void boot_cpu_init(void);
32extern void boot_cpu_state_init(void);
33
30extern int register_cpu(struct cpu *cpu, int num); 34extern int register_cpu(struct cpu *cpu, int num);
31extern struct device *get_cpu_device(unsigned cpu); 35extern struct device *get_cpu_device(unsigned cpu);
32extern bool cpu_is_hotpluggable(unsigned cpu); 36extern bool cpu_is_hotpluggable(unsigned cpu);
@@ -74,7 +78,7 @@ enum {
74 /* migration should happen before other stuff but after perf */ 78 /* migration should happen before other stuff but after perf */
75 CPU_PRI_PERF = 20, 79 CPU_PRI_PERF = 20,
76 CPU_PRI_MIGRATION = 10, 80 CPU_PRI_MIGRATION = 10,
77 CPU_PRI_SMPBOOT = 9, 81
78 /* bring up workqueues before normal notifiers and down after */ 82 /* bring up workqueues before normal notifiers and down after */
79 CPU_PRI_WORKQUEUE_UP = 5, 83 CPU_PRI_WORKQUEUE_UP = 5,
80 CPU_PRI_WORKQUEUE_DOWN = -5, 84 CPU_PRI_WORKQUEUE_DOWN = -5,
@@ -97,9 +101,7 @@ enum {
97 * Called on the new cpu, just before 101 * Called on the new cpu, just before
98 * enabling interrupts. Must not sleep, 102 * enabling interrupts. Must not sleep,
99 * must not fail */ 103 * must not fail */
100#define CPU_DYING_IDLE 0x000B /* CPU (unsigned)v dying, reached 104#define CPU_BROKEN 0x000B /* CPU (unsigned)v did not die properly,
101 * idle loop. */
102#define CPU_BROKEN 0x000C /* CPU (unsigned)v did not die properly,
103 * perhaps due to preemption. */ 105 * perhaps due to preemption. */
104 106
105/* Used for CPU hotplug events occurring while tasks are frozen due to a suspend 107/* Used for CPU hotplug events occurring while tasks are frozen due to a suspend
@@ -118,6 +120,7 @@ enum {
118 120
119 121
120#ifdef CONFIG_SMP 122#ifdef CONFIG_SMP
123extern bool cpuhp_tasks_frozen;
121/* Need to know about CPUs going up/down? */ 124/* Need to know about CPUs going up/down? */
122#if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) 125#if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE)
123#define cpu_notifier(fn, pri) { \ 126#define cpu_notifier(fn, pri) { \
@@ -167,7 +170,6 @@ static inline void __unregister_cpu_notifier(struct notifier_block *nb)
167} 170}
168#endif 171#endif
169 172
170void smpboot_thread_init(void);
171int cpu_up(unsigned int cpu); 173int cpu_up(unsigned int cpu);
172void notify_cpu_starting(unsigned int cpu); 174void notify_cpu_starting(unsigned int cpu);
173extern void cpu_maps_update_begin(void); 175extern void cpu_maps_update_begin(void);
@@ -177,6 +179,7 @@ extern void cpu_maps_update_done(void);
177#define cpu_notifier_register_done cpu_maps_update_done 179#define cpu_notifier_register_done cpu_maps_update_done
178 180
179#else /* CONFIG_SMP */ 181#else /* CONFIG_SMP */
182#define cpuhp_tasks_frozen 0
180 183
181#define cpu_notifier(fn, pri) do { (void)(fn); } while (0) 184#define cpu_notifier(fn, pri) do { (void)(fn); } while (0)
182#define __cpu_notifier(fn, pri) do { (void)(fn); } while (0) 185#define __cpu_notifier(fn, pri) do { (void)(fn); } while (0)
@@ -215,10 +218,6 @@ static inline void cpu_notifier_register_done(void)
215{ 218{
216} 219}
217 220
218static inline void smpboot_thread_init(void)
219{
220}
221
222#endif /* CONFIG_SMP */ 221#endif /* CONFIG_SMP */
223extern struct bus_type cpu_subsys; 222extern struct bus_type cpu_subsys;
224 223
@@ -265,11 +264,6 @@ static inline int disable_nonboot_cpus(void) { return 0; }
265static inline void enable_nonboot_cpus(void) {} 264static inline void enable_nonboot_cpus(void) {}
266#endif /* !CONFIG_PM_SLEEP_SMP */ 265#endif /* !CONFIG_PM_SLEEP_SMP */
267 266
268enum cpuhp_state {
269 CPUHP_OFFLINE,
270 CPUHP_ONLINE,
271};
272
273void cpu_startup_entry(enum cpuhp_state state); 267void cpu_startup_entry(enum cpuhp_state state);
274 268
275void cpu_idle_poll_ctrl(bool enable); 269void cpu_idle_poll_ctrl(bool enable);
@@ -280,14 +274,15 @@ void arch_cpu_idle_enter(void);
280void arch_cpu_idle_exit(void); 274void arch_cpu_idle_exit(void);
281void arch_cpu_idle_dead(void); 275void arch_cpu_idle_dead(void);
282 276
283DECLARE_PER_CPU(bool, cpu_dead_idle);
284
285int cpu_report_state(int cpu); 277int cpu_report_state(int cpu);
286int cpu_check_up_prepare(int cpu); 278int cpu_check_up_prepare(int cpu);
287void cpu_set_state_online(int cpu); 279void cpu_set_state_online(int cpu);
288#ifdef CONFIG_HOTPLUG_CPU 280#ifdef CONFIG_HOTPLUG_CPU
289bool cpu_wait_death(unsigned int cpu, int seconds); 281bool cpu_wait_death(unsigned int cpu, int seconds);
290bool cpu_report_death(void); 282bool cpu_report_death(void);
283void cpuhp_report_idle_dead(void);
284#else
285static inline void cpuhp_report_idle_dead(void) { }
291#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 286#endif /* #ifdef CONFIG_HOTPLUG_CPU */
292 287
293#endif /* _LINUX_CPU_H_ */ 288#endif /* _LINUX_CPU_H_ */
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
new file mode 100644
index 000000000000..5d68e15e46b7
--- /dev/null
+++ b/include/linux/cpuhotplug.h
@@ -0,0 +1,93 @@
1#ifndef __CPUHOTPLUG_H
2#define __CPUHOTPLUG_H
3
4enum cpuhp_state {
5 CPUHP_OFFLINE,
6 CPUHP_CREATE_THREADS,
7 CPUHP_NOTIFY_PREPARE,
8 CPUHP_BRINGUP_CPU,
9 CPUHP_AP_IDLE_DEAD,
10 CPUHP_AP_OFFLINE,
11 CPUHP_AP_NOTIFY_STARTING,
12 CPUHP_AP_ONLINE,
13 CPUHP_TEARDOWN_CPU,
14 CPUHP_AP_ONLINE_IDLE,
15 CPUHP_AP_SMPBOOT_THREADS,
16 CPUHP_AP_NOTIFY_ONLINE,
17 CPUHP_AP_ONLINE_DYN,
18 CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 30,
19 CPUHP_ONLINE,
20};
21
22int __cpuhp_setup_state(enum cpuhp_state state, const char *name, bool invoke,
23 int (*startup)(unsigned int cpu),
24 int (*teardown)(unsigned int cpu));
25
26/**
27 * cpuhp_setup_state - Setup hotplug state callbacks and invoke the callbacks
28 * @state: The state for which the calls are installed
29 * @name: Name of the callback (will be used in debug output)
30 * @startup: startup callback function
31 * @teardown: teardown callback function
32 *
33 * Installs the callback functions and invokes the startup callback on
34 * the present cpus which have already reached the @state.
35 */
36static inline int cpuhp_setup_state(enum cpuhp_state state,
37 const char *name,
38 int (*startup)(unsigned int cpu),
39 int (*teardown)(unsigned int cpu))
40{
41 return __cpuhp_setup_state(state, name, true, startup, teardown);
42}
43
44/**
45 * cpuhp_setup_state_nocalls - Setup hotplug state callbacks without calling the
46 * callbacks
47 * @state: The state for which the calls are installed
48 * @name: Name of the callback.
49 * @startup: startup callback function
50 * @teardown: teardown callback function
51 *
52 * Same as @cpuhp_setup_state except that no calls are invoked
53 * during installation of this callback. NOP if SMP=n or HOTPLUG_CPU=n.
54 */
55static inline int cpuhp_setup_state_nocalls(enum cpuhp_state state,
56 const char *name,
57 int (*startup)(unsigned int cpu),
58 int (*teardown)(unsigned int cpu))
59{
60 return __cpuhp_setup_state(state, name, false, startup, teardown);
61}
62
63void __cpuhp_remove_state(enum cpuhp_state state, bool invoke);
64
65/**
66 * cpuhp_remove_state - Remove hotplug state callbacks and invoke the teardown
67 * @state: The state for which the calls are removed
68 *
69 * Removes the callback functions and invokes the teardown callback on
70 * the present cpus which have already reached the @state.
71 */
72static inline void cpuhp_remove_state(enum cpuhp_state state)
73{
74 __cpuhp_remove_state(state, true);
75}
76
77/**
78 * cpuhp_remove_state_nocalls - Remove hotplug state callbacks without invoking
79 * teardown
80 * @state: The state for which the calls are removed
81 */
82static inline void cpuhp_remove_state_nocalls(enum cpuhp_state state)
83{
84 __cpuhp_remove_state(state, false);
85}
86
87#ifdef CONFIG_SMP
88void cpuhp_online_idle(enum cpuhp_state state);
89#else
90static inline void cpuhp_online_idle(enum cpuhp_state state) { }
91#endif
92
93#endif
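
A minimal sketch of how a driver might use this new interface (all mydrv_* names are hypothetical; it assumes the dynamic CPUHP_AP_ONLINE_DYN range is usable for runtime registration):

#include <linux/cpuhotplug.h>
#include <linux/printk.h>

static int mydrv_cpu_online(unsigned int cpu)
{
	pr_info("mydrv: cpu %u coming online\n", cpu);
	return 0;
}

static int mydrv_cpu_offline(unsigned int cpu)
{
	pr_info("mydrv: cpu %u going offline\n", cpu);
	return 0;
}

static int __init mydrv_init(void)
{
	int ret;

	/*
	 * Request a slot in the dynamic AP range; the startup callback is
	 * also invoked for CPUs that are already online.  For the dynamic
	 * range the return value may be the allocated state (> 0).
	 */
	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mydrv:online",
				mydrv_cpu_online, mydrv_cpu_offline);
	return ret < 0 ? ret : 0;
}

The teardown path would use cpuhp_remove_state() (or the _nocalls variant) with the same state.
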
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 3c1c96786248..c4de62348ff2 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -133,17 +133,23 @@ struct irq_domain;
133 * Use accessor functions to deal with it 133 * Use accessor functions to deal with it
134 * @node: node index useful for balancing 134 * @node: node index useful for balancing
135 * @handler_data: per-IRQ data for the irq_chip methods 135 * @handler_data: per-IRQ data for the irq_chip methods
136 * @affinity: IRQ affinity on SMP 136 * @affinity: IRQ affinity on SMP. If this is an IPI
137 * related irq, then this is the mask of the
138 * CPUs to which an IPI can be sent.
137 * @msi_desc: MSI descriptor 139 * @msi_desc: MSI descriptor
140 * @ipi_offset: Offset of first IPI target cpu in @affinity. Optional.
138 */ 141 */
139struct irq_common_data { 142struct irq_common_data {
140 unsigned int state_use_accessors; 143 unsigned int __private state_use_accessors;
141#ifdef CONFIG_NUMA 144#ifdef CONFIG_NUMA
142 unsigned int node; 145 unsigned int node;
143#endif 146#endif
144 void *handler_data; 147 void *handler_data;
145 struct msi_desc *msi_desc; 148 struct msi_desc *msi_desc;
146 cpumask_var_t affinity; 149 cpumask_var_t affinity;
150#ifdef CONFIG_GENERIC_IRQ_IPI
151 unsigned int ipi_offset;
152#endif
147}; 153};
148 154
149/** 155/**
@@ -208,7 +214,7 @@ enum {
208 IRQD_FORWARDED_TO_VCPU = (1 << 20), 214 IRQD_FORWARDED_TO_VCPU = (1 << 20),
209}; 215};
210 216
211#define __irqd_to_state(d) ((d)->common->state_use_accessors) 217#define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
212 218
213static inline bool irqd_is_setaffinity_pending(struct irq_data *d) 219static inline bool irqd_is_setaffinity_pending(struct irq_data *d)
214{ 220{
@@ -299,6 +305,8 @@ static inline void irqd_clr_forwarded_to_vcpu(struct irq_data *d)
299 __irqd_to_state(d) &= ~IRQD_FORWARDED_TO_VCPU; 305 __irqd_to_state(d) &= ~IRQD_FORWARDED_TO_VCPU;
300} 306}
301 307
308#undef __irqd_to_state
309
302static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) 310static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
303{ 311{
304 return d->hwirq; 312 return d->hwirq;
@@ -341,6 +349,8 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
341 * @irq_get_irqchip_state: return the internal state of an interrupt 349 * @irq_get_irqchip_state: return the internal state of an interrupt
342 * @irq_set_irqchip_state: set the internal state of a interrupt 350 * @irq_set_irqchip_state: set the internal state of a interrupt
343 * @irq_set_vcpu_affinity: optional to target a vCPU in a virtual machine 351 * @irq_set_vcpu_affinity: optional to target a vCPU in a virtual machine
352 * @ipi_send_single: send a single IPI to a destination cpu
353 * @ipi_send_mask: send an IPI to destination cpus in cpumask
344 * @flags: chip specific flags 354 * @flags: chip specific flags
345 */ 355 */
346struct irq_chip { 356struct irq_chip {
@@ -385,6 +395,9 @@ struct irq_chip {
385 395
386 int (*irq_set_vcpu_affinity)(struct irq_data *data, void *vcpu_info); 396 int (*irq_set_vcpu_affinity)(struct irq_data *data, void *vcpu_info);
387 397
398 void (*ipi_send_single)(struct irq_data *data, unsigned int cpu);
399 void (*ipi_send_mask)(struct irq_data *data, const struct cpumask *dest);
400
388 unsigned long flags; 401 unsigned long flags;
389}; 402};
390 403
@@ -934,4 +947,12 @@ static inline u32 irq_reg_readl(struct irq_chip_generic *gc,
934 return readl(gc->reg_base + reg_offset); 947 return readl(gc->reg_base + reg_offset);
935} 948}
936 949
950/* Contrary to Linux irqs, for hardware irqs the irq number 0 is valid */
951#define INVALID_HWIRQ (~0UL)
952irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu);
953int __ipi_send_single(struct irq_desc *desc, unsigned int cpu);
954int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest);
955int ipi_send_single(unsigned int virq, unsigned int cpu);
956int ipi_send_mask(unsigned int virq, const struct cpumask *dest);
957
937#endif /* _LINUX_IRQ_H */ 958#endif /* _LINUX_IRQ_H */
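
For orientation, a minimal sketch of how arch SMP code could use the new generic IPI send helpers once an IPI virq has been reserved (the *_sketch functions and call_func_ipi_virq are illustrative, not part of this patch):

#include <linux/irq.h>
#include <linux/cpumask.h>

/* Hypothetical: virq obtained earlier from irq_reserve_ipi(). */
static unsigned int call_func_ipi_virq;

static void send_call_function_single_ipi_sketch(int cpu)
{
	ipi_send_single(call_func_ipi_virq, cpu);
}

static void send_call_function_ipi_mask_sketch(const struct cpumask *mask)
{
	ipi_send_mask(call_func_ipi_virq, mask);
}

Both helpers end up in the irqchip's ->ipi_send_single()/->ipi_send_mask() callbacks declared above.
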
diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h
index ce824db48d64..80f89e4a29ac 100644
--- a/include/linux/irqchip/mips-gic.h
+++ b/include/linux/irqchip/mips-gic.h
@@ -261,9 +261,6 @@ extern void gic_write_compare(cycle_t cnt);
261extern void gic_write_cpu_compare(cycle_t cnt, int cpu); 261extern void gic_write_cpu_compare(cycle_t cnt, int cpu);
262extern void gic_start_count(void); 262extern void gic_start_count(void);
263extern void gic_stop_count(void); 263extern void gic_stop_count(void);
264extern void gic_send_ipi(unsigned int intr);
265extern unsigned int plat_ipi_call_int_xlate(unsigned int);
266extern unsigned int plat_ipi_resched_int_xlate(unsigned int);
267extern int gic_get_c0_compare_int(void); 264extern int gic_get_c0_compare_int(void);
268extern int gic_get_c0_perfcount_int(void); 265extern int gic_get_c0_perfcount_int(void);
269extern int gic_get_c0_fdc_int(void); 266extern int gic_get_c0_fdc_int(void);
diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 04579d9fbce4..ed48594e96d2 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -74,6 +74,7 @@ enum irq_domain_bus_token {
74 DOMAIN_BUS_PCI_MSI, 74 DOMAIN_BUS_PCI_MSI,
75 DOMAIN_BUS_PLATFORM_MSI, 75 DOMAIN_BUS_PLATFORM_MSI,
76 DOMAIN_BUS_NEXUS, 76 DOMAIN_BUS_NEXUS,
77 DOMAIN_BUS_IPI,
77}; 78};
78 79
79/** 80/**
@@ -172,6 +173,12 @@ enum {
172 /* Core calls alloc/free recursive through the domain hierarchy. */ 173 /* Core calls alloc/free recursive through the domain hierarchy. */
173 IRQ_DOMAIN_FLAG_AUTO_RECURSIVE = (1 << 1), 174 IRQ_DOMAIN_FLAG_AUTO_RECURSIVE = (1 << 1),
174 175
176 /* Irq domain is an IPI domain with virq per cpu */
177 IRQ_DOMAIN_FLAG_IPI_PER_CPU = (1 << 2),
178
179 /* Irq domain is an IPI domain with single virq */
180 IRQ_DOMAIN_FLAG_IPI_SINGLE = (1 << 3),
181
175 /* 182 /*
176 * Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved 183 * Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved
177 * for implementation specific purposes and ignored by the 184 * for implementation specific purposes and ignored by the
@@ -206,6 +213,8 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
206extern struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode, 213extern struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode,
207 enum irq_domain_bus_token bus_token); 214 enum irq_domain_bus_token bus_token);
208extern void irq_set_default_host(struct irq_domain *host); 215extern void irq_set_default_host(struct irq_domain *host);
216extern int irq_domain_alloc_descs(int virq, unsigned int nr_irqs,
217 irq_hw_number_t hwirq, int node);
209 218
210static inline struct fwnode_handle *of_node_to_fwnode(struct device_node *node) 219static inline struct fwnode_handle *of_node_to_fwnode(struct device_node *node)
211{ 220{
@@ -335,6 +344,11 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d, struct device_node *ctrlr,
335 const u32 *intspec, unsigned int intsize, 344 const u32 *intspec, unsigned int intsize,
336 irq_hw_number_t *out_hwirq, unsigned int *out_type); 345 irq_hw_number_t *out_hwirq, unsigned int *out_type);
337 346
347/* IPI functions */
348unsigned int irq_reserve_ipi(struct irq_domain *domain,
349 const struct cpumask *dest);
350void irq_destroy_ipi(unsigned int irq);
351
338/* V2 interfaces to support hierarchy IRQ domains. */ 352/* V2 interfaces to support hierarchy IRQ domains. */
339extern struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, 353extern struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
340 unsigned int virq); 354 unsigned int virq);
@@ -400,6 +414,22 @@ static inline bool irq_domain_is_hierarchy(struct irq_domain *domain)
400{ 414{
401 return domain->flags & IRQ_DOMAIN_FLAG_HIERARCHY; 415 return domain->flags & IRQ_DOMAIN_FLAG_HIERARCHY;
402} 416}
417
418static inline bool irq_domain_is_ipi(struct irq_domain *domain)
419{
420 return domain->flags &
421 (IRQ_DOMAIN_FLAG_IPI_PER_CPU | IRQ_DOMAIN_FLAG_IPI_SINGLE);
422}
423
424static inline bool irq_domain_is_ipi_per_cpu(struct irq_domain *domain)
425{
426 return domain->flags & IRQ_DOMAIN_FLAG_IPI_PER_CPU;
427}
428
429static inline bool irq_domain_is_ipi_single(struct irq_domain *domain)
430{
431 return domain->flags & IRQ_DOMAIN_FLAG_IPI_SINGLE;
432}
403#else /* CONFIG_IRQ_DOMAIN_HIERARCHY */ 433#else /* CONFIG_IRQ_DOMAIN_HIERARCHY */
404static inline void irq_domain_activate_irq(struct irq_data *data) { } 434static inline void irq_domain_activate_irq(struct irq_data *data) { }
405static inline void irq_domain_deactivate_irq(struct irq_data *data) { } 435static inline void irq_domain_deactivate_irq(struct irq_data *data) { }
@@ -413,6 +443,21 @@ static inline bool irq_domain_is_hierarchy(struct irq_domain *domain)
413{ 443{
414 return false; 444 return false;
415} 445}
446
447static inline bool irq_domain_is_ipi(struct irq_domain *domain)
448{
449 return false;
450}
451
452static inline bool irq_domain_is_ipi_per_cpu(struct irq_domain *domain)
453{
454 return false;
455}
456
457static inline bool irq_domain_is_ipi_single(struct irq_domain *domain)
458{
459 return false;
460}
416#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ 461#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */
417 462
418#else /* CONFIG_IRQ_DOMAIN */ 463#else /* CONFIG_IRQ_DOMAIN */
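
A small sketch of how a platform could reserve IPIs from such a domain (reserve_smp_ipi() and the error convention are assumptions based on the unsigned return type of irq_reserve_ipi()):

#include <linux/irqdomain.h>
#include <linux/cpumask.h>

static unsigned int reserve_smp_ipi(struct irq_domain *ipi_domain)
{
	if (!irq_domain_is_ipi(ipi_domain))
		return 0;

	/*
	 * Reserve virq(s) that can target every possible CPU; per-cpu IPI
	 * domains hand out one virq per CPU, single-virq domains just one.
	 */
	return irq_reserve_ipi(ipi_domain, cpu_possible_mask);
}

The reservation is undone with irq_destroy_ipi().
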
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2b6e22782699..3579d1e2fe3a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2139,6 +2139,8 @@ int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
2139int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); 2139int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
2140int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, 2140int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2141 unsigned long pfn); 2141 unsigned long pfn);
2142int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
2143 unsigned long pfn, pgprot_t pgprot);
2142int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, 2144int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
2143 pfn_t pfn); 2145 pfn_t pfn);
2144int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); 2146int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);
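
vm_insert_pfn_prot() behaves like vm_insert_pfn() but lets the caller override the protection bits for that one insertion. A hedged sketch of a driver fault handler using it (mydev_fault and the placeholder pfn are hypothetical; the fault signature matches this kernel's struct vm_operations_struct):

#include <linux/mm.h>

static int mydev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	unsigned long pfn = 0;	/* placeholder: device PFN for vmf->pgoff */
	pgprot_t prot = pgprot_noncached(vma->vm_page_prot);
	int ret;

	/* Insert the PFN with a caching attribute differing from the VMA's. */
	ret = vm_insert_pfn_prot(vma, (unsigned long)vmf->virtual_address,
				 pfn, prot);
	if (ret == -ENOMEM)
		return VM_FAULT_OOM;
	if (ret < 0 && ret != -EBUSY)
		return VM_FAULT_SIGBUS;
	return VM_FAULT_NOPAGE;
}
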
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 624b78b848b8..944b2b37313b 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -566,10 +566,26 @@ static inline void clear_tlb_flush_pending(struct mm_struct *mm)
566} 566}
567#endif 567#endif
568 568
569struct vm_special_mapping 569struct vm_fault;
570{ 570
571 const char *name; 571struct vm_special_mapping {
572 const char *name; /* The name, e.g. "[vdso]". */
573
574 /*
575 * If .fault is not provided, this points to a
576 * NULL-terminated array of pages that back the special mapping.
577 *
578 * This must not be NULL unless .fault is provided.
579 */
572 struct page **pages; 580 struct page **pages;
581
582 /*
583 * If non-NULL, then this is called to resolve page faults
584 * on the special mapping. If used, .pages is not checked.
585 */
586 int (*fault)(const struct vm_special_mapping *sm,
587 struct vm_area_struct *vma,
588 struct vm_fault *vmf);
573}; 589};
574 590
575enum tlb_flush_reason { 591enum tlb_flush_reason {
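
With the new .fault member, a special mapping can compute its backing page at fault time instead of providing a fixed .pages array. A hedged sketch (the myspecial_* names are hypothetical):

#include <linux/mm.h>
#include <linux/mm_types.h>

static int myspecial_fault(const struct vm_special_mapping *sm,
			   struct vm_area_struct *vma,
			   struct vm_fault *vmf)
{
	struct page *page = NULL;  /* placeholder: look up page for vmf->pgoff */

	if (!page)
		return VM_FAULT_SIGBUS;

	get_page(page);
	vmf->page = page;
	return 0;
}

static const struct vm_special_mapping myspecial_mapping = {
	.name  = "[myspecial]",
	.fault = myspecial_fault,  /* .pages is not consulted when .fault is set */
};

Such a mapping would then be installed with _install_special_mapping() as before.
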
diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index d14a4c362465..4149868de4e6 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -47,6 +47,8 @@
47 * runtime initialization. 47 * runtime initialization.
48 */ 48 */
49 49
50struct notifier_block;
51
50typedef int (*notifier_fn_t)(struct notifier_block *nb, 52typedef int (*notifier_fn_t)(struct notifier_block *nb,
51 unsigned long action, void *data); 53 unsigned long action, void *data);
52 54
diff --git a/include/linux/pps_kernel.h b/include/linux/pps_kernel.h
index 54bf1484d41f..35ac903956c7 100644
--- a/include/linux/pps_kernel.h
+++ b/include/linux/pps_kernel.h
@@ -111,22 +111,17 @@ static inline void timespec_to_pps_ktime(struct pps_ktime *kt,
111 kt->nsec = ts.tv_nsec; 111 kt->nsec = ts.tv_nsec;
112} 112}
113 113
114#ifdef CONFIG_NTP_PPS
115
116static inline void pps_get_ts(struct pps_event_time *ts) 114static inline void pps_get_ts(struct pps_event_time *ts)
117{ 115{
118 ktime_get_raw_and_real_ts64(&ts->ts_raw, &ts->ts_real); 116 struct system_time_snapshot snap;
119}
120 117
121#else /* CONFIG_NTP_PPS */ 118 ktime_get_snapshot(&snap);
122 119 ts->ts_real = ktime_to_timespec64(snap.real);
123static inline void pps_get_ts(struct pps_event_time *ts) 120#ifdef CONFIG_NTP_PPS
124{ 121 ts->ts_raw = ktime_to_timespec64(snap.raw);
125 ktime_get_real_ts64(&ts->ts_real); 122#endif
126} 123}
127 124
128#endif /* CONFIG_NTP_PPS */
129
130/* Subtract known time delay from PPS event time(s) */ 125/* Subtract known time delay from PPS event time(s) */
131static inline void pps_sub_ts(struct pps_event_time *ts, struct timespec64 delta) 126static inline void pps_sub_ts(struct pps_event_time *ts, struct timespec64 delta)
132{ 127{
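
The reworked pps_get_ts() now takes one correlated snapshot and fills the raw timestamp only when CONFIG_NTP_PPS is enabled. A hedged sketch of the usual call site, a PPS capture interrupt (mydev and the IRQ wiring are hypothetical):

#include <linux/interrupt.h>
#include <linux/pps_kernel.h>

struct mydev {
	struct pps_device *pps;	/* registered earlier with pps_register_source() */
};

static irqreturn_t mydev_pps_irq(int irq, void *data)
{
	struct mydev *mydev = data;
	struct pps_event_time ts;

	pps_get_ts(&ts);	/* capture the event time as early as possible */
	pps_event(mydev->pps, &ts, PPS_CAPTUREASSERT, NULL);
	return IRQ_HANDLED;
}
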
diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h
index b8b73066d137..6b15e168148a 100644
--- a/include/linux/ptp_clock_kernel.h
+++ b/include/linux/ptp_clock_kernel.h
@@ -38,6 +38,7 @@ struct ptp_clock_request {
38 }; 38 };
39}; 39};
40 40
41struct system_device_crosststamp;
41/** 42/**
42 * struct ptp_clock_info - decribes a PTP hardware clock 43 * struct ptp_clock_info - decribes a PTP hardware clock
43 * 44 *
@@ -67,6 +68,11 @@ struct ptp_clock_request {
67 * @gettime64: Reads the current time from the hardware clock. 68 * @gettime64: Reads the current time from the hardware clock.
68 * parameter ts: Holds the result. 69 * parameter ts: Holds the result.
69 * 70 *
71 * @getcrosststamp: Reads the current time from the hardware clock and
72 * system clock simultaneously.
73 * parameter cts: Contains timestamp (device,system) pair,
74 * where system time is realtime and monotonic.
75 *
70 * @settime64: Set the current time on the hardware clock. 76 * @settime64: Set the current time on the hardware clock.
71 * parameter ts: Time value to set. 77 * parameter ts: Time value to set.
72 * 78 *
@@ -105,6 +111,8 @@ struct ptp_clock_info {
105 int (*adjfreq)(struct ptp_clock_info *ptp, s32 delta); 111 int (*adjfreq)(struct ptp_clock_info *ptp, s32 delta);
106 int (*adjtime)(struct ptp_clock_info *ptp, s64 delta); 112 int (*adjtime)(struct ptp_clock_info *ptp, s64 delta);
107 int (*gettime64)(struct ptp_clock_info *ptp, struct timespec64 *ts); 113 int (*gettime64)(struct ptp_clock_info *ptp, struct timespec64 *ts);
114 int (*getcrosststamp)(struct ptp_clock_info *ptp,
115 struct system_device_crosststamp *cts);
108 int (*settime64)(struct ptp_clock_info *p, const struct timespec64 *ts); 116 int (*settime64)(struct ptp_clock_info *p, const struct timespec64 *ts);
109 int (*enable)(struct ptp_clock_info *ptp, 117 int (*enable)(struct ptp_clock_info *ptp,
110 struct ptp_clock_request *request, int on); 118 struct ptp_clock_request *request, int on);
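
A hedged sketch of a driver wiring up the new op: the callback handed to get_device_system_crosststamp() is meant to read the device clock and the system counter from a hardware latch; here it is only a stub (the mydev_* names are hypothetical):

#include <linux/ptp_clock_kernel.h>
#include <linux/timekeeping.h>

static int mydev_get_syncdevicetime(ktime_t *device_time,
				    struct system_counterval_t *system,
				    void *ctx)
{
	/* Placeholder: fill *device_time and *system from the hardware latch. */
	return -EOPNOTSUPP;
}

static int mydev_getcrosststamp(struct ptp_clock_info *ptp,
				struct system_device_crosststamp *cts)
{
	/* No snapshot history is passed here; drivers may supply one. */
	return get_device_system_crosststamp(mydev_get_syncdevicetime,
					     ptp, NULL, cts);
}
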
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 14e6f47ee16f..2657aff2725b 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -332,9 +332,7 @@ void rcu_init(void);
332void rcu_sched_qs(void); 332void rcu_sched_qs(void);
333void rcu_bh_qs(void); 333void rcu_bh_qs(void);
334void rcu_check_callbacks(int user); 334void rcu_check_callbacks(int user);
335struct notifier_block; 335void rcu_report_dead(unsigned int cpu);
336int rcu_cpu_notify(struct notifier_block *self,
337 unsigned long action, void *hcpu);
338 336
339#ifndef CONFIG_TINY_RCU 337#ifndef CONFIG_TINY_RCU
340void rcu_end_inkernel_boot(void); 338void rcu_end_inkernel_boot(void);
@@ -360,8 +358,6 @@ void rcu_user_exit(void);
360#else 358#else
361static inline void rcu_user_enter(void) { } 359static inline void rcu_user_enter(void) { }
362static inline void rcu_user_exit(void) { } 360static inline void rcu_user_exit(void) { }
363static inline void rcu_user_hooks_switch(struct task_struct *prev,
364 struct task_struct *next) { }
365#endif /* CONFIG_NO_HZ_FULL */ 361#endif /* CONFIG_NO_HZ_FULL */
366 362
367#ifdef CONFIG_RCU_NOCB_CPU 363#ifdef CONFIG_RCU_NOCB_CPU
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index f5f80c5643ac..dc8eb63c6568 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -99,8 +99,23 @@ void process_srcu(struct work_struct *work);
99 } 99 }
100 100
101/* 101/*
102 * define and init a srcu struct at build time. 102 * Define and initialize a srcu struct at build time.
103 * dont't call init_srcu_struct() nor cleanup_srcu_struct() on it. 103 * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it.
104 *
105 * Note that although DEFINE_STATIC_SRCU() hides the name from other
106 * files, the per-CPU variable rules nevertheless require that the
107 * chosen name be globally unique. These rules also prohibit use of
108 * DEFINE_STATIC_SRCU() within a function. If these rules are too
109 * restrictive, declare the srcu_struct manually. For example, in
110 * each file:
111 *
112 * static struct srcu_struct my_srcu;
113 *
114 * Then, before the first use of each my_srcu, manually initialize it:
115 *
116 * init_srcu_struct(&my_srcu);
117 *
118 * See include/linux/percpu-defs.h for the rules on per-CPU variables.
104 */ 119 */
105#define __DEFINE_SRCU(name, is_static) \ 120#define __DEFINE_SRCU(name, is_static) \
106 static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\ 121 static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
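
For reference, a minimal sketch of the build-time variant together with the surrounding read/update API (nothing here is specific to this patch):

#include <linux/srcu.h>

DEFINE_STATIC_SRCU(my_srcu);	/* per the comment above, the name must be globally unique */

static void reader(void)
{
	int idx;

	idx = srcu_read_lock(&my_srcu);
	/* ... dereference SRCU-protected data; sleeping is allowed ... */
	srcu_read_unlock(&my_srcu, idx);
}

static void updater(void)
{
	/* ... unpublish the old data ... */
	synchronize_srcu(&my_srcu);
	/* ... no pre-existing reader can still hold a reference; free it ... */
}
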
diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index 25247220b4b7..e88005459035 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -50,6 +50,7 @@ struct tk_read_base {
50 * @offs_tai: Offset clock monotonic -> clock tai 50 * @offs_tai: Offset clock monotonic -> clock tai
51 * @tai_offset: The current UTC to TAI offset in seconds 51 * @tai_offset: The current UTC to TAI offset in seconds
52 * @clock_was_set_seq: The sequence number of clock was set events 52 * @clock_was_set_seq: The sequence number of clock was set events
53 * @cs_was_changed_seq: The sequence number of clocksource change events
53 * @next_leap_ktime: CLOCK_MONOTONIC time value of a pending leap-second 54 * @next_leap_ktime: CLOCK_MONOTONIC time value of a pending leap-second
54 * @raw_time: Monotonic raw base time in timespec64 format 55 * @raw_time: Monotonic raw base time in timespec64 format
55 * @cycle_interval: Number of clock cycles in one NTP interval 56 * @cycle_interval: Number of clock cycles in one NTP interval
@@ -91,6 +92,7 @@ struct timekeeper {
91 ktime_t offs_tai; 92 ktime_t offs_tai;
92 s32 tai_offset; 93 s32 tai_offset;
93 unsigned int clock_was_set_seq; 94 unsigned int clock_was_set_seq;
95 u8 cs_was_changed_seq;
94 ktime_t next_leap_ktime; 96 ktime_t next_leap_ktime;
95 struct timespec64 raw_time; 97 struct timespec64 raw_time;
96 98
diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index ec89d846324c..96f37bee3bc1 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -267,6 +267,64 @@ extern void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw,
267 struct timespec64 *ts_real); 267 struct timespec64 *ts_real);
268 268
269/* 269/*
270 * struct system_time_snapshot - simultaneous raw/real time capture with
271 * counter value
272 * @cycles: Clocksource counter value to produce the system times
273 * @real: Realtime system time
274 * @raw: Monotonic raw system time
275 * @clock_was_set_seq: The sequence number of clock was set events
276 * @cs_was_changed_seq: The sequence number of clocksource change events
277 */
278struct system_time_snapshot {
279 cycle_t cycles;
280 ktime_t real;
281 ktime_t raw;
282 unsigned int clock_was_set_seq;
283 u8 cs_was_changed_seq;
284};
285
286/*
287 * struct system_device_crosststamp - system/device cross-timestamp
288 * (synchronized capture)
289 * @device: Device time
290 * @sys_realtime: Realtime simultaneous with device time
291 * @sys_monoraw: Monotonic raw simultaneous with device time
292 */
293struct system_device_crosststamp {
294 ktime_t device;
295 ktime_t sys_realtime;
296 ktime_t sys_monoraw;
297};
298
299/*
300 * struct system_counterval_t - system counter value with the pointer to the
301 * corresponding clocksource
302 * @cycles: System counter value
303 * @cs: Clocksource corresponding to system counter value. Used by
304 * timekeeping code to verify comparability of two cycle values
305 */
306struct system_counterval_t {
307 cycle_t cycles;
308 struct clocksource *cs;
309};
310
311/*
312 * Get cross timestamp between system clock and device clock
313 */
314extern int get_device_system_crosststamp(
315 int (*get_time_fn)(ktime_t *device_time,
316 struct system_counterval_t *system_counterval,
317 void *ctx),
318 void *ctx,
319 struct system_time_snapshot *history,
320 struct system_device_crosststamp *xtstamp);
321
322/*
323 * Simultaneously snapshot realtime and monotonic raw clocks
324 */
325extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot);
326
327/*
270 * Persistent clock related interfaces 328 * Persistent clock related interfaces
271 */ 329 */
272extern int persistent_clock_is_local; 330extern int persistent_clock_is_local;
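
A hedged sketch of using the snapshot interface on its own (sample_clocks() is illustrative): the sequence numbers let a caller detect clock steps or clocksource changes between two samples.

#include <linux/printk.h>
#include <linux/timekeeping.h>

static void sample_clocks(struct system_time_snapshot *prev)
{
	struct system_time_snapshot snap;

	ktime_get_snapshot(&snap);	/* correlated real/raw/cycles sample */

	if (snap.cs_was_changed_seq != prev->cs_was_changed_seq)
		pr_info("clocksource changed since the last sample\n");
	if (snap.clock_was_set_seq != prev->clock_was_set_seq)
		pr_info("CLOCK_REALTIME was set since the last sample\n");

	*prev = snap;
}
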
diff --git a/include/trace/events/cpuhp.h b/include/trace/events/cpuhp.h
new file mode 100644
index 000000000000..a72bd93ec7e5
--- /dev/null
+++ b/include/trace/events/cpuhp.h
@@ -0,0 +1,66 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM cpuhp
3
4#if !defined(_TRACE_CPUHP_H) || defined(TRACE_HEADER_MULTI_READ)
5#define _TRACE_CPUHP_H
6
7#include <linux/tracepoint.h>
8
9TRACE_EVENT(cpuhp_enter,
10
11 TP_PROTO(unsigned int cpu,
12 int target,
13 int idx,
14 int (*fun)(unsigned int)),
15
16 TP_ARGS(cpu, target, idx, fun),
17
18 TP_STRUCT__entry(
19 __field( unsigned int, cpu )
20 __field( int, target )
21 __field( int, idx )
22 __field( void *, fun )
23 ),
24
25 TP_fast_assign(
26 __entry->cpu = cpu;
27 __entry->target = target;
28 __entry->idx = idx;
29 __entry->fun = fun;
30 ),
31
32 TP_printk("cpu: %04u target: %3d step: %3d (%pf)",
33 __entry->cpu, __entry->target, __entry->idx, __entry->fun)
34);
35
36TRACE_EVENT(cpuhp_exit,
37
38 TP_PROTO(unsigned int cpu,
39 int state,
40 int idx,
41 int ret),
42
43 TP_ARGS(cpu, state, idx, ret),
44
45 TP_STRUCT__entry(
46 __field( unsigned int, cpu )
47 __field( int, state )
48 __field( int, idx )
49 __field( int, ret )
50 ),
51
52 TP_fast_assign(
53 __entry->cpu = cpu;
54 __entry->state = state;
55 __entry->idx = idx;
56 __entry->ret = ret;
57 ),
58
59 TP_printk(" cpu: %04u state: %3d step: %3d ret: %d",
60 __entry->cpu, __entry->state, __entry->idx, __entry->ret)
61);
62
63#endif
64
65/* This part must be outside protection */
66#include <trace/define_trace.h>
diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h
index f0b7bfe5da92..ac6dded80ffa 100644
--- a/include/uapi/linux/ptp_clock.h
+++ b/include/uapi/linux/ptp_clock.h
@@ -51,7 +51,9 @@ struct ptp_clock_caps {
51 int n_per_out; /* Number of programmable periodic signals. */ 51 int n_per_out; /* Number of programmable periodic signals. */
52 int pps; /* Whether the clock supports a PPS callback. */ 52 int pps; /* Whether the clock supports a PPS callback. */
53 int n_pins; /* Number of input/output pins. */ 53 int n_pins; /* Number of input/output pins. */
54 int rsv[14]; /* Reserved for future use. */ 54 /* Whether the clock supports precise system-device cross timestamps */
55 int cross_timestamping;
56 int rsv[13]; /* Reserved for future use. */
55}; 57};
56 58
57struct ptp_extts_request { 59struct ptp_extts_request {
@@ -81,6 +83,13 @@ struct ptp_sys_offset {
81 struct ptp_clock_time ts[2 * PTP_MAX_SAMPLES + 1]; 83 struct ptp_clock_time ts[2 * PTP_MAX_SAMPLES + 1];
82}; 84};
83 85
86struct ptp_sys_offset_precise {
87 struct ptp_clock_time device;
88 struct ptp_clock_time sys_realtime;
89 struct ptp_clock_time sys_monoraw;
90 unsigned int rsv[4]; /* Reserved for future use. */
91};
92
84enum ptp_pin_function { 93enum ptp_pin_function {
85 PTP_PF_NONE, 94 PTP_PF_NONE,
86 PTP_PF_EXTTS, 95 PTP_PF_EXTTS,
@@ -124,6 +133,8 @@ struct ptp_pin_desc {
124#define PTP_SYS_OFFSET _IOW(PTP_CLK_MAGIC, 5, struct ptp_sys_offset) 133#define PTP_SYS_OFFSET _IOW(PTP_CLK_MAGIC, 5, struct ptp_sys_offset)
125#define PTP_PIN_GETFUNC _IOWR(PTP_CLK_MAGIC, 6, struct ptp_pin_desc) 134#define PTP_PIN_GETFUNC _IOWR(PTP_CLK_MAGIC, 6, struct ptp_pin_desc)
126#define PTP_PIN_SETFUNC _IOW(PTP_CLK_MAGIC, 7, struct ptp_pin_desc) 135#define PTP_PIN_SETFUNC _IOW(PTP_CLK_MAGIC, 7, struct ptp_pin_desc)
136#define PTP_SYS_OFFSET_PRECISE \
137 _IOWR(PTP_CLK_MAGIC, 8, struct ptp_sys_offset_precise)
127 138
128struct ptp_extts_event { 139struct ptp_extts_event {
129 struct ptp_clock_time t; /* Time event occured. */ 140 struct ptp_clock_time t; /* Time event occured. */
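
From userspace, the new ioctl is only useful when ptp_clock_caps reports cross_timestamping; a hedged sketch of querying it on /dev/ptp0 (device path and error handling are illustrative):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/ptp_clock.h>

int main(void)
{
	struct ptp_sys_offset_precise soff = { 0 };
	int fd = open("/dev/ptp0", O_RDWR);

	if (fd < 0)
		return 1;
	if (ioctl(fd, PTP_SYS_OFFSET_PRECISE, &soff) == 0)
		printf("device %lld.%09u system %lld.%09u\n",
		       (long long)soff.device.sec, soff.device.nsec,
		       (long long)soff.sys_realtime.sec, soff.sys_realtime.nsec);
	close(fd);
	return 0;
}
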
diff --git a/init/main.c b/init/main.c
index 7c27de4577ed..8dc93df20f7f 100644
--- a/init/main.c
+++ b/init/main.c
@@ -385,7 +385,6 @@ static noinline void __init_refok rest_init(void)
385 int pid; 385 int pid;
386 386
387 rcu_scheduler_starting(); 387 rcu_scheduler_starting();
388 smpboot_thread_init();
389 /* 388 /*
390 * We need to spawn init first so that it obtains pid 1, however 389 * We need to spawn init first so that it obtains pid 1, however
391 * the init task will end up wanting to create kthreads, which, if 390 * the init task will end up wanting to create kthreads, which, if
@@ -449,20 +448,6 @@ void __init parse_early_param(void)
449 done = 1; 448 done = 1;
450} 449}
451 450
452/*
453 * Activate the first processor.
454 */
455
456static void __init boot_cpu_init(void)
457{
458 int cpu = smp_processor_id();
459 /* Mark the boot cpu "present", "online" etc for SMP and UP case */
460 set_cpu_online(cpu, true);
461 set_cpu_active(cpu, true);
462 set_cpu_present(cpu, true);
463 set_cpu_possible(cpu, true);
464}
465
466void __init __weak smp_setup_processor_id(void) 451void __init __weak smp_setup_processor_id(void)
467{ 452{
468} 453}
@@ -522,6 +507,7 @@ asmlinkage __visible void __init start_kernel(void)
522 setup_command_line(command_line); 507 setup_command_line(command_line);
523 setup_nr_cpu_ids(); 508 setup_nr_cpu_ids();
524 setup_per_cpu_areas(); 509 setup_per_cpu_areas();
510 boot_cpu_state_init();
525 smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ 511 smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
526 512
527 build_all_zonelists(NULL, NULL); 513 build_all_zonelists(NULL, NULL);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 5b9d39633ce9..6ea42e8da861 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -22,13 +22,88 @@
22#include <linux/lockdep.h> 22#include <linux/lockdep.h>
23#include <linux/tick.h> 23#include <linux/tick.h>
24#include <linux/irq.h> 24#include <linux/irq.h>
25#include <linux/smpboot.h>
26
25#include <trace/events/power.h> 27#include <trace/events/power.h>
28#define CREATE_TRACE_POINTS
29#include <trace/events/cpuhp.h>
26 30
27#include "smpboot.h" 31#include "smpboot.h"
28 32
33/**
34 * cpuhp_cpu_state - Per cpu hotplug state storage
35 * @state: The current cpu state
36 * @target: The target state
37 * @thread: Pointer to the hotplug thread
38 * @should_run: Thread should execute
39 * @cb_state:	The state for a single callback (install/uninstall)
40 * @cb: Single callback function (install/uninstall)
41 * @result: Result of the operation
42 * @done: Signal completion to the issuer of the task
43 */
44struct cpuhp_cpu_state {
45 enum cpuhp_state state;
46 enum cpuhp_state target;
47#ifdef CONFIG_SMP
48 struct task_struct *thread;
49 bool should_run;
50 enum cpuhp_state cb_state;
51 int (*cb)(unsigned int cpu);
52 int result;
53 struct completion done;
54#endif
55};
56
57static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state);
58
59/**
60 * cpuhp_step - Hotplug state machine step
61 * @name: Name of the step
62 * @startup: Startup function of the step
63 * @teardown: Teardown function of the step
64 * @skip_onerr: Do not invoke the functions on error rollback
65 * Will go away once the notifiers are gone
66 * @cant_stop: Bringup/teardown can't be stopped at this step
67 */
68struct cpuhp_step {
69 const char *name;
70 int (*startup)(unsigned int cpu);
71 int (*teardown)(unsigned int cpu);
72 bool skip_onerr;
73 bool cant_stop;
74};
75
76static DEFINE_MUTEX(cpuhp_state_mutex);
77static struct cpuhp_step cpuhp_bp_states[];
78static struct cpuhp_step cpuhp_ap_states[];
79
80/**
81 * cpuhp_invoke_callback - Invoke the callbacks for a given state
82 * @cpu: The cpu for which the callback should be invoked
83 * @step: The step in the state machine
84 * @cb: The callback function to invoke
85 *
86 * Called from cpu hotplug and from the state registration machinery
87 */
88static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state step,
89 int (*cb)(unsigned int))
90{
91 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
92 int ret = 0;
93
94 if (cb) {
95 trace_cpuhp_enter(cpu, st->target, step, cb);
96 ret = cb(cpu);
97 trace_cpuhp_exit(cpu, st->state, step, ret);
98 }
99 return ret;
100}
101
29#ifdef CONFIG_SMP 102#ifdef CONFIG_SMP
30/* Serializes the updates to cpu_online_mask, cpu_present_mask */ 103/* Serializes the updates to cpu_online_mask, cpu_present_mask */
31static DEFINE_MUTEX(cpu_add_remove_lock); 104static DEFINE_MUTEX(cpu_add_remove_lock);
105bool cpuhp_tasks_frozen;
106EXPORT_SYMBOL_GPL(cpuhp_tasks_frozen);
32 107
33/* 108/*
34 * The following two APIs (cpu_maps_update_begin/done) must be used when 109 * The following two APIs (cpu_maps_update_begin/done) must be used when
@@ -207,31 +282,281 @@ int __register_cpu_notifier(struct notifier_block *nb)
207 return raw_notifier_chain_register(&cpu_chain, nb); 282 return raw_notifier_chain_register(&cpu_chain, nb);
208} 283}
209 284
210static int __cpu_notify(unsigned long val, void *v, int nr_to_call, 285static int __cpu_notify(unsigned long val, unsigned int cpu, int nr_to_call,
211 int *nr_calls) 286 int *nr_calls)
212{ 287{
288 unsigned long mod = cpuhp_tasks_frozen ? CPU_TASKS_FROZEN : 0;
289 void *hcpu = (void *)(long)cpu;
290
213 int ret; 291 int ret;
214 292
215 ret = __raw_notifier_call_chain(&cpu_chain, val, v, nr_to_call, 293 ret = __raw_notifier_call_chain(&cpu_chain, val | mod, hcpu, nr_to_call,
216 nr_calls); 294 nr_calls);
217 295
218 return notifier_to_errno(ret); 296 return notifier_to_errno(ret);
219} 297}
220 298
221static int cpu_notify(unsigned long val, void *v) 299static int cpu_notify(unsigned long val, unsigned int cpu)
222{ 300{
223 return __cpu_notify(val, v, -1, NULL); 301 return __cpu_notify(val, cpu, -1, NULL);
224} 302}
225 303
226#ifdef CONFIG_HOTPLUG_CPU 304/* Notifier wrappers for transitioning to state machine */
305static int notify_prepare(unsigned int cpu)
306{
307 int nr_calls = 0;
308 int ret;
309
310 ret = __cpu_notify(CPU_UP_PREPARE, cpu, -1, &nr_calls);
311 if (ret) {
312 nr_calls--;
313 printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n",
314 __func__, cpu);
315 __cpu_notify(CPU_UP_CANCELED, cpu, nr_calls, NULL);
316 }
317 return ret;
318}
319
320static int notify_online(unsigned int cpu)
321{
322 cpu_notify(CPU_ONLINE, cpu);
323 return 0;
324}
325
326static int notify_starting(unsigned int cpu)
327{
328 cpu_notify(CPU_STARTING, cpu);
329 return 0;
330}
331
332static int bringup_wait_for_ap(unsigned int cpu)
333{
334 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
335
336 wait_for_completion(&st->done);
337 return st->result;
338}
339
340static int bringup_cpu(unsigned int cpu)
341{
342 struct task_struct *idle = idle_thread_get(cpu);
343 int ret;
344
345 /* Arch-specific enabling code. */
346 ret = __cpu_up(cpu, idle);
347 if (ret) {
348 cpu_notify(CPU_UP_CANCELED, cpu);
349 return ret;
350 }
351 ret = bringup_wait_for_ap(cpu);
352 BUG_ON(!cpu_online(cpu));
353 return ret;
354}
355
356/*
357 * Hotplug state machine related functions
358 */
359static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st,
360 struct cpuhp_step *steps)
361{
362 for (st->state++; st->state < st->target; st->state++) {
363 struct cpuhp_step *step = steps + st->state;
364
365 if (!step->skip_onerr)
366 cpuhp_invoke_callback(cpu, st->state, step->startup);
367 }
368}
369
370static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
371 struct cpuhp_step *steps, enum cpuhp_state target)
372{
373 enum cpuhp_state prev_state = st->state;
374 int ret = 0;
375
376 for (; st->state > target; st->state--) {
377 struct cpuhp_step *step = steps + st->state;
378
379 ret = cpuhp_invoke_callback(cpu, st->state, step->teardown);
380 if (ret) {
381 st->target = prev_state;
382 undo_cpu_down(cpu, st, steps);
383 break;
384 }
385 }
386 return ret;
387}
388
389static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st,
390 struct cpuhp_step *steps)
391{
392 for (st->state--; st->state > st->target; st->state--) {
393 struct cpuhp_step *step = steps + st->state;
394
395 if (!step->skip_onerr)
396 cpuhp_invoke_callback(cpu, st->state, step->teardown);
397 }
398}
399
400static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
401 struct cpuhp_step *steps, enum cpuhp_state target)
402{
403 enum cpuhp_state prev_state = st->state;
404 int ret = 0;
405
406 while (st->state < target) {
407 struct cpuhp_step *step;
408
409 st->state++;
410 step = steps + st->state;
411 ret = cpuhp_invoke_callback(cpu, st->state, step->startup);
412 if (ret) {
413 st->target = prev_state;
414 undo_cpu_up(cpu, st, steps);
415 break;
416 }
417 }
418 return ret;
419}
420
421/*
422 * The cpu hotplug threads manage the bringup and teardown of the cpus
423 */
424static void cpuhp_create(unsigned int cpu)
425{
426 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
427
428 init_completion(&st->done);
429}
430
431static int cpuhp_should_run(unsigned int cpu)
432{
433 struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
434
435 return st->should_run;
436}
437
438/* Execute the teardown callbacks. Used to be CPU_DOWN_PREPARE */
439static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st)
440{
441 enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU);
442
443 return cpuhp_down_callbacks(cpu, st, cpuhp_ap_states, target);
444}
445
446/* Execute the online startup callbacks. Used to be CPU_ONLINE */
447static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st)
448{
449 return cpuhp_up_callbacks(cpu, st, cpuhp_ap_states, st->target);
450}
451
452/*
453 * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke
454 * callbacks when a state gets [un]installed at runtime.
455 */
456static void cpuhp_thread_fun(unsigned int cpu)
457{
458 struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
459 int ret = 0;
460
461 /*
462 * Paired with the mb() in cpuhp_kick_ap_work and
463 * cpuhp_invoke_ap_callback, so the work set is consistently visible.
464 */
465 smp_mb();
466 if (!st->should_run)
467 return;
468
469 st->should_run = false;
470
471 /* Single callback invocation for [un]install ? */
472 if (st->cb) {
473 if (st->cb_state < CPUHP_AP_ONLINE) {
474 local_irq_disable();
475 ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb);
476 local_irq_enable();
477 } else {
478 ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb);
479 }
480 } else {
481 /* Cannot happen .... */
482 BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE);
483
484 /* Regular hotplug work */
485 if (st->state < st->target)
486 ret = cpuhp_ap_online(cpu, st);
487 else if (st->state > st->target)
488 ret = cpuhp_ap_offline(cpu, st);
489 }
490 st->result = ret;
491 complete(&st->done);
492}
227 493
228static void cpu_notify_nofail(unsigned long val, void *v) 494/* Invoke a single callback on a remote cpu */
495static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state,
496 int (*cb)(unsigned int))
229{ 497{
230 BUG_ON(cpu_notify(val, v)); 498 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
499
500 if (!cpu_online(cpu))
501 return 0;
502
503 st->cb_state = state;
504 st->cb = cb;
505 /*
506 * Make sure the above stores are visible before should_run becomes
507 * true. Paired with the mb() above in cpuhp_thread_fun()
508 */
509 smp_mb();
510 st->should_run = true;
511 wake_up_process(st->thread);
512 wait_for_completion(&st->done);
513 return st->result;
231} 514}
515
516/* Regular hotplug invocation of the AP hotplug thread */
517static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st)
518{
519 st->result = 0;
520 st->cb = NULL;
521 /*
522 * Make sure the above stores are visible before should_run becomes
523 * true. Paired with the mb() above in cpuhp_thread_fun()
524 */
525 smp_mb();
526 st->should_run = true;
527 wake_up_process(st->thread);
528}
529
530static int cpuhp_kick_ap_work(unsigned int cpu)
531{
532 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
533 enum cpuhp_state state = st->state;
534
535 trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work);
536 __cpuhp_kick_ap_work(st);
537 wait_for_completion(&st->done);
538 trace_cpuhp_exit(cpu, st->state, state, st->result);
539 return st->result;
540}
541
542static struct smp_hotplug_thread cpuhp_threads = {
543 .store = &cpuhp_state.thread,
544 .create = &cpuhp_create,
545 .thread_should_run = cpuhp_should_run,
546 .thread_fn = cpuhp_thread_fun,
547 .thread_comm = "cpuhp/%u",
548 .selfparking = true,
549};
550
551void __init cpuhp_threads_init(void)
552{
553 BUG_ON(smpboot_register_percpu_thread(&cpuhp_threads));
554 kthread_unpark(this_cpu_read(cpuhp_state.thread));
555}
556
557#ifdef CONFIG_HOTPLUG_CPU
232EXPORT_SYMBOL(register_cpu_notifier); 558EXPORT_SYMBOL(register_cpu_notifier);
233EXPORT_SYMBOL(__register_cpu_notifier); 559EXPORT_SYMBOL(__register_cpu_notifier);
234
235void unregister_cpu_notifier(struct notifier_block *nb) 560void unregister_cpu_notifier(struct notifier_block *nb)
236{ 561{
237 cpu_maps_update_begin(); 562 cpu_maps_update_begin();
@@ -311,57 +636,60 @@ static inline void check_for_tasks(int dead_cpu)
311 read_unlock(&tasklist_lock); 636 read_unlock(&tasklist_lock);
312} 637}
313 638
314struct take_cpu_down_param { 639static void cpu_notify_nofail(unsigned long val, unsigned int cpu)
315 unsigned long mod; 640{
316 void *hcpu; 641 BUG_ON(cpu_notify(val, cpu));
317}; 642}
643
644static int notify_down_prepare(unsigned int cpu)
645{
646 int err, nr_calls = 0;
647
648 err = __cpu_notify(CPU_DOWN_PREPARE, cpu, -1, &nr_calls);
649 if (err) {
650 nr_calls--;
651 __cpu_notify(CPU_DOWN_FAILED, cpu, nr_calls, NULL);
652 pr_warn("%s: attempt to take down CPU %u failed\n",
653 __func__, cpu);
654 }
655 return err;
656}
657
658static int notify_dying(unsigned int cpu)
659{
660 cpu_notify(CPU_DYING, cpu);
661 return 0;
662}
318 663
319/* Take this CPU down. */ 664/* Take this CPU down. */
320static int take_cpu_down(void *_param) 665static int take_cpu_down(void *_param)
321{ 666{
322 struct take_cpu_down_param *param = _param; 667 struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
323 int err; 668 enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE);
669 int err, cpu = smp_processor_id();
324 670
325 /* Ensure this CPU doesn't handle any more interrupts. */ 671 /* Ensure this CPU doesn't handle any more interrupts. */
326 err = __cpu_disable(); 672 err = __cpu_disable();
327 if (err < 0) 673 if (err < 0)
328 return err; 674 return err;
329 675
330 cpu_notify(CPU_DYING | param->mod, param->hcpu); 676 /* Invoke the former CPU_DYING callbacks */
677 for (; st->state > target; st->state--) {
678 struct cpuhp_step *step = cpuhp_ap_states + st->state;
679
680 cpuhp_invoke_callback(cpu, st->state, step->teardown);
681 }
331 /* Give up timekeeping duties */ 682 /* Give up timekeeping duties */
332 tick_handover_do_timer(); 683 tick_handover_do_timer();
333 /* Park the stopper thread */ 684 /* Park the stopper thread */
334 stop_machine_park((long)param->hcpu); 685 stop_machine_park(cpu);
335 return 0; 686 return 0;
336} 687}
337 688
338/* Requires cpu_add_remove_lock to be held */ 689static int takedown_cpu(unsigned int cpu)
339static int _cpu_down(unsigned int cpu, int tasks_frozen)
340{ 690{
341 int err, nr_calls = 0; 691 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
342 void *hcpu = (void *)(long)cpu; 692 int err;
343 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
344 struct take_cpu_down_param tcd_param = {
345 .mod = mod,
346 .hcpu = hcpu,
347 };
348
349 if (num_online_cpus() == 1)
350 return -EBUSY;
351
352 if (!cpu_online(cpu))
353 return -EINVAL;
354
355 cpu_hotplug_begin();
356
357 err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
358 if (err) {
359 nr_calls--;
360 __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
361 pr_warn("%s: attempt to take down CPU %u failed\n",
362 __func__, cpu);
363 goto out_release;
364 }
365 693
366 /* 694 /*
367 * By now we've cleared cpu_active_mask, wait for all preempt-disabled 695 * By now we've cleared cpu_active_mask, wait for all preempt-disabled
@@ -378,6 +706,8 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
378 else 706 else
379 synchronize_rcu(); 707 synchronize_rcu();
380 708
709 /* Park the smpboot threads */
710 kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread);
381 smpboot_park_threads(cpu); 711 smpboot_park_threads(cpu);
382 712
383 /* 713 /*
@@ -389,12 +719,12 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
389 /* 719 /*
390 * So now all preempt/rcu users must observe !cpu_active(). 720 * So now all preempt/rcu users must observe !cpu_active().
391 */ 721 */
392 err = stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 722 err = stop_machine(take_cpu_down, NULL, cpumask_of(cpu));
393 if (err) { 723 if (err) {
394 /* CPU didn't die: tell everyone. Can't complain. */ 724 /* CPU didn't die: tell everyone. Can't complain. */
395 cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); 725 cpu_notify_nofail(CPU_DOWN_FAILED, cpu);
396 irq_unlock_sparse(); 726 irq_unlock_sparse();
397 goto out_release; 727 return err;
398 } 728 }
399 BUG_ON(cpu_online(cpu)); 729 BUG_ON(cpu_online(cpu));
400 730
@@ -405,10 +735,8 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
405 * 735 *
406 * Wait for the stop thread to go away. 736 * Wait for the stop thread to go away.
407 */ 737 */
408 while (!per_cpu(cpu_dead_idle, cpu)) 738 wait_for_completion(&st->done);
409 cpu_relax(); 739 BUG_ON(st->state != CPUHP_AP_IDLE_DEAD);
410 smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */
411 per_cpu(cpu_dead_idle, cpu) = false;
412 740
413 /* Interrupts are moved away from the dying cpu, reenable alloc/free */ 741 /* Interrupts are moved away from the dying cpu, reenable alloc/free */
414 irq_unlock_sparse(); 742 irq_unlock_sparse();
@@ -417,20 +745,104 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
417 /* This actually kills the CPU. */ 745 /* This actually kills the CPU. */
418 __cpu_die(cpu); 746 __cpu_die(cpu);
419 747
420 /* CPU is completely dead: tell everyone. Too late to complain. */
421 tick_cleanup_dead_cpu(cpu); 748 tick_cleanup_dead_cpu(cpu);
422 cpu_notify_nofail(CPU_DEAD | mod, hcpu); 749 return 0;
750}
423 751
752static int notify_dead(unsigned int cpu)
753{
754 cpu_notify_nofail(CPU_DEAD, cpu);
424 check_for_tasks(cpu); 755 check_for_tasks(cpu);
756 return 0;
757}
425 758
426out_release: 759static void cpuhp_complete_idle_dead(void *arg)
760{
761 struct cpuhp_cpu_state *st = arg;
762
763 complete(&st->done);
764}
765
766void cpuhp_report_idle_dead(void)
767{
768 struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
769
770 BUG_ON(st->state != CPUHP_AP_OFFLINE);
771 rcu_report_dead(smp_processor_id());
772 st->state = CPUHP_AP_IDLE_DEAD;
773 /*
774 * We cannot call complete after rcu_report_dead() so we delegate it
775 * to an online cpu.
776 */
777 smp_call_function_single(cpumask_first(cpu_online_mask),
778 cpuhp_complete_idle_dead, st, 0);
779}
780
781#else
782#define notify_down_prepare NULL
783#define takedown_cpu NULL
784#define notify_dead NULL
785#define notify_dying NULL
786#endif
787
788#ifdef CONFIG_HOTPLUG_CPU
789
790/* Requires cpu_add_remove_lock to be held */
791static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
792 enum cpuhp_state target)
793{
794 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
795 int prev_state, ret = 0;
796 bool hasdied = false;
797
798 if (num_online_cpus() == 1)
799 return -EBUSY;
800
801 if (!cpu_present(cpu))
802 return -EINVAL;
803
804 cpu_hotplug_begin();
805
806 cpuhp_tasks_frozen = tasks_frozen;
807
808 prev_state = st->state;
809 st->target = target;
810 /*
811 * If the current CPU state is in the range of the AP hotplug thread,
812 * then we need to kick the thread.
813 */
814 if (st->state > CPUHP_TEARDOWN_CPU) {
815 ret = cpuhp_kick_ap_work(cpu);
816 /*
817 * The AP side has done the error rollback already. Just
818 * return the error code..
819 */
820 if (ret)
821 goto out;
822
823 /*
824 * We might have stopped still in the range of the AP hotplug
825 * thread. Nothing to do anymore.
826 */
827 if (st->state > CPUHP_TEARDOWN_CPU)
828 goto out;
829 }
830 /*
831 * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need
832 * to do the further cleanups.
833 */
834 ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target);
835
836 hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE;
837out:
427 cpu_hotplug_done(); 838 cpu_hotplug_done();
428 if (!err) 839 /* This post dead nonsense must die */
429 cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu); 840 if (!ret && hasdied)
430 return err; 841 cpu_notify_nofail(CPU_POST_DEAD, cpu);
842 return ret;
431} 843}
432 844
433int cpu_down(unsigned int cpu) 845static int do_cpu_down(unsigned int cpu, enum cpuhp_state target)
434{ 846{
435 int err; 847 int err;
436 848
@@ -441,100 +853,131 @@ int cpu_down(unsigned int cpu)
441 goto out; 853 goto out;
442 } 854 }
443 855
444 err = _cpu_down(cpu, 0); 856 err = _cpu_down(cpu, 0, target);
445 857
446out: 858out:
447 cpu_maps_update_done(); 859 cpu_maps_update_done();
448 return err; 860 return err;
449} 861}
862int cpu_down(unsigned int cpu)
863{
864 return do_cpu_down(cpu, CPUHP_OFFLINE);
865}
450EXPORT_SYMBOL(cpu_down); 866EXPORT_SYMBOL(cpu_down);
451#endif /*CONFIG_HOTPLUG_CPU*/ 867#endif /*CONFIG_HOTPLUG_CPU*/
452 868
453/* 869/**
454 * Unpark per-CPU smpboot kthreads at CPU-online time. 870 * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
871 * @cpu: cpu that just started
872 *
873 * This function calls the cpu_chain notifiers with CPU_STARTING.
874 * It must be called by the arch code on the new cpu, before the new cpu
875 * enables interrupts and before the "boot" cpu returns from __cpu_up().
455 */ 876 */
456static int smpboot_thread_call(struct notifier_block *nfb, 877void notify_cpu_starting(unsigned int cpu)
457 unsigned long action, void *hcpu)
458{ 878{
459 int cpu = (long)hcpu; 879 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
460 880 enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
461 switch (action & ~CPU_TASKS_FROZEN) {
462 881
463 case CPU_DOWN_FAILED: 882 while (st->state < target) {
464 case CPU_ONLINE: 883 struct cpuhp_step *step;
465 smpboot_unpark_threads(cpu);
466 break;
467 884
468 default: 885 st->state++;
469 break; 886 step = cpuhp_ap_states + st->state;
887 cpuhp_invoke_callback(cpu, st->state, step->startup);
470 } 888 }
471
472 return NOTIFY_OK;
473} 889}
474 890
475static struct notifier_block smpboot_thread_notifier = { 891/*
476 .notifier_call = smpboot_thread_call, 892 * Called from the idle task. We need to set active here, so we can kick off
477 .priority = CPU_PRI_SMPBOOT, 893 * the stopper thread and unpark the smpboot threads. If the target state is
478}; 894 * beyond CPUHP_AP_ONLINE_IDLE we kick cpuhp thread and let it bring up the
479 895 * cpu further.
480void smpboot_thread_init(void) 896 */
897void cpuhp_online_idle(enum cpuhp_state state)
481{ 898{
482 register_cpu_notifier(&smpboot_thread_notifier); 899 struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
900 unsigned int cpu = smp_processor_id();
901
902 /* Happens for the boot cpu */
903 if (state != CPUHP_AP_ONLINE_IDLE)
904 return;
905
906 st->state = CPUHP_AP_ONLINE_IDLE;
907
908 /* The cpu is marked online, set it active now */
909 set_cpu_active(cpu, true);
910 /* Unpark the stopper thread and the hotplug thread of this cpu */
911 stop_machine_unpark(cpu);
912 kthread_unpark(st->thread);
913
914 /* Should we go further up ? */
915 if (st->target > CPUHP_AP_ONLINE_IDLE)
916 __cpuhp_kick_ap_work(st);
917 else
918 complete(&st->done);
483} 919}
484 920
485/* Requires cpu_add_remove_lock to be held */ 921/* Requires cpu_add_remove_lock to be held */
486static int _cpu_up(unsigned int cpu, int tasks_frozen) 922static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
487{ 923{
488 int ret, nr_calls = 0; 924 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
489 void *hcpu = (void *)(long)cpu;
490 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
491 struct task_struct *idle; 925 struct task_struct *idle;
926 int ret = 0;
492 927
493 cpu_hotplug_begin(); 928 cpu_hotplug_begin();
494 929
495 if (cpu_online(cpu) || !cpu_present(cpu)) { 930 if (!cpu_present(cpu)) {
496 ret = -EINVAL; 931 ret = -EINVAL;
497 goto out; 932 goto out;
498 } 933 }
499 934
500 idle = idle_thread_get(cpu); 935 /*
501 if (IS_ERR(idle)) { 936 * The caller of do_cpu_up might have raced with another
502 ret = PTR_ERR(idle); 937 * caller. Ignore it for now.
503 goto out; 938 */
504 } 939 if (st->state >= target)
505
506 ret = smpboot_create_threads(cpu);
507 if (ret)
508 goto out; 940 goto out;
509 941
510 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); 942 if (st->state == CPUHP_OFFLINE) {
511 if (ret) { 943 /* Let it fail before we try to bring the cpu up */
512 nr_calls--; 944 idle = idle_thread_get(cpu);
513 pr_warn("%s: attempt to bring up CPU %u failed\n", 945 if (IS_ERR(idle)) {
514 __func__, cpu); 946 ret = PTR_ERR(idle);
515 goto out_notify; 947 goto out;
948 }
516 } 949 }
517 950
518 /* Arch-specific enabling code. */ 951 cpuhp_tasks_frozen = tasks_frozen;
519 ret = __cpu_up(cpu, idle);
520
521 if (ret != 0)
522 goto out_notify;
523 BUG_ON(!cpu_online(cpu));
524 952
525 /* Now call notifier in preparation. */ 953 st->target = target;
526 cpu_notify(CPU_ONLINE | mod, hcpu); 954 /*
955 * If the current CPU state is in the range of the AP hotplug thread,
956 * then we need to kick the thread once more.
957 */
958 if (st->state > CPUHP_BRINGUP_CPU) {
959 ret = cpuhp_kick_ap_work(cpu);
960 /*
961 * The AP side has done the error rollback already. Just
962 * return the error code..
963 */
964 if (ret)
965 goto out;
966 }
527 967
528out_notify: 968 /*
529 if (ret != 0) 969 * Try to reach the target state. We max out on the BP at
530 __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); 970 * CPUHP_BRINGUP_CPU. After that the AP hotplug thread is
971 * responsible for bringing it up to the target state.
972 */
973 target = min((int)target, CPUHP_BRINGUP_CPU);
974 ret = cpuhp_up_callbacks(cpu, st, cpuhp_bp_states, target);
531out: 975out:
532 cpu_hotplug_done(); 976 cpu_hotplug_done();
533
534 return ret; 977 return ret;
535} 978}
536 979
537int cpu_up(unsigned int cpu) 980static int do_cpu_up(unsigned int cpu, enum cpuhp_state target)
538{ 981{
539 int err = 0; 982 int err = 0;
540 983
@@ -558,12 +1001,16 @@ int cpu_up(unsigned int cpu)
558 goto out; 1001 goto out;
559 } 1002 }
560 1003
561 err = _cpu_up(cpu, 0); 1004 err = _cpu_up(cpu, 0, target);
562
563out: 1005out:
564 cpu_maps_update_done(); 1006 cpu_maps_update_done();
565 return err; 1007 return err;
566} 1008}
1009
1010int cpu_up(unsigned int cpu)
1011{
1012 return do_cpu_up(cpu, CPUHP_ONLINE);
1013}
567EXPORT_SYMBOL_GPL(cpu_up); 1014EXPORT_SYMBOL_GPL(cpu_up);
568 1015
569#ifdef CONFIG_PM_SLEEP_SMP 1016#ifdef CONFIG_PM_SLEEP_SMP
@@ -586,7 +1033,7 @@ int disable_nonboot_cpus(void)
586 if (cpu == first_cpu) 1033 if (cpu == first_cpu)
587 continue; 1034 continue;
588 trace_suspend_resume(TPS("CPU_OFF"), cpu, true); 1035 trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
589 error = _cpu_down(cpu, 1); 1036 error = _cpu_down(cpu, 1, CPUHP_OFFLINE);
590 trace_suspend_resume(TPS("CPU_OFF"), cpu, false); 1037 trace_suspend_resume(TPS("CPU_OFF"), cpu, false);
591 if (!error) 1038 if (!error)
592 cpumask_set_cpu(cpu, frozen_cpus); 1039 cpumask_set_cpu(cpu, frozen_cpus);
@@ -636,7 +1083,7 @@ void enable_nonboot_cpus(void)
636 1083
637 for_each_cpu(cpu, frozen_cpus) { 1084 for_each_cpu(cpu, frozen_cpus) {
638 trace_suspend_resume(TPS("CPU_ON"), cpu, true); 1085 trace_suspend_resume(TPS("CPU_ON"), cpu, true);
639 error = _cpu_up(cpu, 1); 1086 error = _cpu_up(cpu, 1, CPUHP_ONLINE);
640 trace_suspend_resume(TPS("CPU_ON"), cpu, false); 1087 trace_suspend_resume(TPS("CPU_ON"), cpu, false);
641 if (!error) { 1088 if (!error) {
642 pr_info("CPU%d is up\n", cpu); 1089 pr_info("CPU%d is up\n", cpu);
@@ -709,26 +1156,463 @@ core_initcall(cpu_hotplug_pm_sync_init);
709 1156
710#endif /* CONFIG_PM_SLEEP_SMP */ 1157#endif /* CONFIG_PM_SLEEP_SMP */
711 1158
1159#endif /* CONFIG_SMP */
1160
1161/* Boot processor state steps */
1162static struct cpuhp_step cpuhp_bp_states[] = {
1163 [CPUHP_OFFLINE] = {
1164 .name = "offline",
1165 .startup = NULL,
1166 .teardown = NULL,
1167 },
1168#ifdef CONFIG_SMP
1169 [CPUHP_CREATE_THREADS]= {
1170 .name = "threads:create",
1171 .startup = smpboot_create_threads,
1172 .teardown = NULL,
1173 .cant_stop = true,
1174 },
1175 /*
1176 * Preparatory and dead notifiers. Will be replaced once the notifiers
1177 * are converted to states.
1178 */
1179 [CPUHP_NOTIFY_PREPARE] = {
1180 .name = "notify:prepare",
1181 .startup = notify_prepare,
1182 .teardown = notify_dead,
1183 .skip_onerr = true,
1184 .cant_stop = true,
1185 },
1186 /* Kicks the plugged cpu into life */
1187 [CPUHP_BRINGUP_CPU] = {
1188 .name = "cpu:bringup",
1189 .startup = bringup_cpu,
1190 .teardown = NULL,
1191 .cant_stop = true,
1192 },
1193 /*
1194 * Handled on the control processor until the plugged processor manages
1195 * this itself.
1196 */
1197 [CPUHP_TEARDOWN_CPU] = {
1198 .name = "cpu:teardown",
1199 .startup = NULL,
1200 .teardown = takedown_cpu,
1201 .cant_stop = true,
1202 },
1203#endif
1204};
1205
1206/* Application processor state steps */
1207static struct cpuhp_step cpuhp_ap_states[] = {
1208#ifdef CONFIG_SMP
1209 /* Final state before CPU kills itself */
1210 [CPUHP_AP_IDLE_DEAD] = {
1211 .name = "idle:dead",
1212 },
1213 /*
1214 * Last state before CPU enters the idle loop to die. Transient state
1215 * for synchronization.
1216 */
1217 [CPUHP_AP_OFFLINE] = {
1218 .name = "ap:offline",
1219 .cant_stop = true,
1220 },
1221 /*
1222 * Low level startup/teardown notifiers. Run with interrupts
1223 * disabled. Will be removed once the notifiers are converted to
1224 * states.
1225 */
1226 [CPUHP_AP_NOTIFY_STARTING] = {
1227 .name = "notify:starting",
1228 .startup = notify_starting,
1229 .teardown = notify_dying,
1230 .skip_onerr = true,
1231 .cant_stop = true,
1232 },
1233 /* Entry state on starting. Interrupts enabled from here on. Transient
1234 * state for synchronization */
1235 [CPUHP_AP_ONLINE] = {
1236 .name = "ap:online",
1237 },
1238 /* Handle smpboot threads park/unpark */
1239 [CPUHP_AP_SMPBOOT_THREADS] = {
1240 .name = "smpboot:threads",
1241 .startup = smpboot_unpark_threads,
1242 .teardown = NULL,
1243 },
1244 /*
1245 * Online/down_prepare notifiers. Will be removed once the notifiers
1246 * are converted to states.
1247 */
1248 [CPUHP_AP_NOTIFY_ONLINE] = {
1249 .name = "notify:online",
1250 .startup = notify_online,
1251 .teardown = notify_down_prepare,
1252 },
1253#endif
1254 /*
1255 * The dynamically registered state space is here
1256 */
1257
1258 /* CPU is fully up and running. */
1259 [CPUHP_ONLINE] = {
1260 .name = "online",
1261 .startup = NULL,
1262 .teardown = NULL,
1263 },
1264};
1265
1266/* Sanity check for callbacks */
1267static int cpuhp_cb_check(enum cpuhp_state state)
1268{
1269 if (state <= CPUHP_OFFLINE || state >= CPUHP_ONLINE)
1270 return -EINVAL;
1271 return 0;
1272}
1273
1274static bool cpuhp_is_ap_state(enum cpuhp_state state)
1275{
1276 /*
1277 * The extra check for CPUHP_TEARDOWN_CPU is only for documentation
1278 * purposes as that state is handled explicitly in cpu_down.
1279 */
1280 return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU;
1281}
1282
1283static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
1284{
1285 struct cpuhp_step *sp;
1286
1287 sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states;
1288 return sp + state;
1289}
1290
1291static void cpuhp_store_callbacks(enum cpuhp_state state,
1292 const char *name,
1293 int (*startup)(unsigned int cpu),
1294 int (*teardown)(unsigned int cpu))
1295{
1296 /* (Un)Install the callbacks for further cpu hotplug operations */
1297 struct cpuhp_step *sp;
1298
1299 mutex_lock(&cpuhp_state_mutex);
1300 sp = cpuhp_get_step(state);
1301 sp->startup = startup;
1302 sp->teardown = teardown;
1303 sp->name = name;
1304 mutex_unlock(&cpuhp_state_mutex);
1305}
1306
1307static void *cpuhp_get_teardown_cb(enum cpuhp_state state)
1308{
1309 return cpuhp_get_step(state)->teardown;
1310}
1311
1312/*
1313 * Call the startup/teardown function for a step either on the AP or
1314 * on the current CPU.
1315 */
1316static int cpuhp_issue_call(int cpu, enum cpuhp_state state,
1317 int (*cb)(unsigned int), bool bringup)
1318{
1319 int ret;
1320
1321 if (!cb)
1322 return 0;
1323 /*
1324 * The non-AP-bound callbacks can fail on bringup. On teardown,
1325 * e.g. module removal, we crash for now.
1326 */
1327#ifdef CONFIG_SMP
1328 if (cpuhp_is_ap_state(state))
1329 ret = cpuhp_invoke_ap_callback(cpu, state, cb);
1330 else
1331 ret = cpuhp_invoke_callback(cpu, state, cb);
1332#else
1333 ret = cpuhp_invoke_callback(cpu, state, cb);
1334#endif
1335 BUG_ON(ret && !bringup);
1336 return ret;
1337}
1338
1339/*
1340 * Called from __cpuhp_setup_state on a recoverable failure.
1341 *
1342 * Note: The teardown callbacks for rollback are not allowed to fail!
1343 */
1344static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state,
1345 int (*teardown)(unsigned int cpu))
1346{
1347 int cpu;
1348
1349 if (!teardown)
1350 return;
1351
1352 /* Roll back the already executed steps on the other cpus */
1353 for_each_present_cpu(cpu) {
1354 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
1355 int cpustate = st->state;
1356
1357 if (cpu >= failedcpu)
1358 break;
1359
1360 /* Did we invoke the startup call on that cpu ? */
1361 if (cpustate >= state)
1362 cpuhp_issue_call(cpu, state, teardown, false);
1363 }
1364}
1365
1366/*
1367 * Returns a free slot for dynamic state assignment in the online range. The
1368 * states are protected by the cpuhp_state_mutex and an empty slot is identified
1369 * by having no name assigned.
1370 */
1371static int cpuhp_reserve_state(enum cpuhp_state state)
1372{
1373 enum cpuhp_state i;
1374
1375 mutex_lock(&cpuhp_state_mutex);
1376 for (i = CPUHP_AP_ONLINE_DYN; i <= CPUHP_AP_ONLINE_DYN_END; i++) {
1377 if (cpuhp_ap_states[i].name)
1378 continue;
1379
1380 cpuhp_ap_states[i].name = "Reserved";
1381 mutex_unlock(&cpuhp_state_mutex);
1382 return i;
1383 }
1384 mutex_unlock(&cpuhp_state_mutex);
1385 WARN(1, "No more dynamic states available for CPU hotplug\n");
1386 return -ENOSPC;
1387}
1388
712/** 1389/**
713 * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers 1390 * __cpuhp_setup_state - Set up the callbacks for a hotplug machine state
714 * @cpu: cpu that just started 1391 * @state: The state to setup
1392 * @invoke: If true, the startup function is invoked for cpus where
1393 * cpu state >= @state
1394 * @startup: startup callback function
1395 * @teardown: teardown callback function
715 * 1396 *
716 * This function calls the cpu_chain notifiers with CPU_STARTING. 1397 * Returns 0 on success (the reserved state for CPUHP_AP_ONLINE_DYN), otherwise a negative error code
717 * It must be called by the arch code on the new cpu, before the new cpu
718 * enables interrupts and before the "boot" cpu returns from __cpu_up().
719 */ 1398 */
720void notify_cpu_starting(unsigned int cpu) 1399int __cpuhp_setup_state(enum cpuhp_state state,
1400 const char *name, bool invoke,
1401 int (*startup)(unsigned int cpu),
1402 int (*teardown)(unsigned int cpu))
721{ 1403{
722 unsigned long val = CPU_STARTING; 1404 int cpu, ret = 0;
1405 int dyn_state = 0;
723 1406
724#ifdef CONFIG_PM_SLEEP_SMP 1407 if (cpuhp_cb_check(state) || !name)
725 if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus)) 1408 return -EINVAL;
726 val = CPU_STARTING_FROZEN; 1409
727#endif /* CONFIG_PM_SLEEP_SMP */ 1410 get_online_cpus();
728 cpu_notify(val, (void *)(long)cpu); 1411
1412 /* currently assignments for the ONLINE state are possible */
1413 if (state == CPUHP_AP_ONLINE_DYN) {
1414 dyn_state = 1;
1415 ret = cpuhp_reserve_state(state);
1416 if (ret < 0)
1417 goto out;
1418 state = ret;
1419 }
1420
1421 cpuhp_store_callbacks(state, name, startup, teardown);
1422
1423 if (!invoke || !startup)
1424 goto out;
1425
1426 /*
1427 * Try to call the startup callback for each present cpu
1428 * depending on the hotplug state of the cpu.
1429 */
1430 for_each_present_cpu(cpu) {
1431 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
1432 int cpustate = st->state;
1433
1434 if (cpustate < state)
1435 continue;
1436
1437 ret = cpuhp_issue_call(cpu, state, startup, true);
1438 if (ret) {
1439 cpuhp_rollback_install(cpu, state, teardown);
1440 cpuhp_store_callbacks(state, NULL, NULL, NULL);
1441 goto out;
1442 }
1443 }
1444out:
1445 put_online_cpus();
1446 if (!ret && dyn_state)
1447 return state;
1448 return ret;
729} 1449}
1450EXPORT_SYMBOL(__cpuhp_setup_state);
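For orientation, a minimal hypothetical caller of the new interface could look like the sketch below. The callback names, the "mydrv:online" string and the module boilerplate are illustrative only and not part of this patch; the declarations are assumed to come from <linux/cpuhotplug.h> and <linux/cpu.h>.

#include <linux/cpu.h>
#include <linux/cpuhotplug.h>	/* assumed home of enum cpuhp_state */
#include <linux/module.h>
#include <linux/printk.h>

static enum cpuhp_state my_state;	/* dynamically reserved state */

/* Runs on every CPU that comes (or already is) online. */
static int my_cpu_online(unsigned int cpu)
{
	pr_info("mydrv: setting up on cpu%u\n", cpu);
	return 0;
}

/* Runs on every online CPU before it is taken down. */
static int my_cpu_prepare_down(unsigned int cpu)
{
	pr_info("mydrv: tearing down on cpu%u\n", cpu);
	return 0;
}

static int __init mydrv_init(void)
{
	int ret;

	/* invoke=true calls my_cpu_online() on all CPUs that are already up */
	ret = __cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mydrv:online", true,
				  my_cpu_online, my_cpu_prepare_down);
	if (ret < 0)
		return ret;
	my_state = ret;		/* dynamic requests return the reserved state */
	return 0;
}
module_init(mydrv_init);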
730 1451
731#endif /* CONFIG_SMP */ 1452/**
1453 * __cpuhp_remove_state - Remove the callbacks for a hotplug machine state
1454 * @state: The state to remove
1455 * @invoke: If true, the teardown function is invoked for cpus where
1456 * cpu state >= @state
1457 *
1458 * The teardown callback is currently not allowed to fail. Think
1459 * about module removal!
1460 */
1461void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
1462{
1463 int (*teardown)(unsigned int cpu) = cpuhp_get_teardown_cb(state);
1464 int cpu;
1465
1466 BUG_ON(cpuhp_cb_check(state));
1467
1468 get_online_cpus();
1469
1470 if (!invoke || !teardown)
1471 goto remove;
1472
1473 /*
1474 * Call the teardown callback for each present cpu depending
1475 * on the hotplug state of the cpu. This function is not
1476 * allowed to fail currently!
1477 */
1478 for_each_present_cpu(cpu) {
1479 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
1480 int cpustate = st->state;
1481
1482 if (cpustate >= state)
1483 cpuhp_issue_call(cpu, state, teardown, false);
1484 }
1485remove:
1486 cpuhp_store_callbacks(state, NULL, NULL, NULL);
1487 put_online_cpus();
1488}
1489EXPORT_SYMBOL(__cpuhp_remove_state);
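Continuing the hypothetical module from the setup sketch above, the matching exit path hands the dynamic slot back:

static void __exit mydrv_exit(void)
{
	/* invoke=true runs my_cpu_prepare_down() on all online CPUs first */
	__cpuhp_remove_state(my_state, true);
}
module_exit(mydrv_exit);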
1490
1491#if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU)
1492static ssize_t show_cpuhp_state(struct device *dev,
1493 struct device_attribute *attr, char *buf)
1494{
1495 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
1496
1497 return sprintf(buf, "%d\n", st->state);
1498}
1499static DEVICE_ATTR(state, 0444, show_cpuhp_state, NULL);
1500
1501static ssize_t write_cpuhp_target(struct device *dev,
1502 struct device_attribute *attr,
1503 const char *buf, size_t count)
1504{
1505 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
1506 struct cpuhp_step *sp;
1507 int target, ret;
1508
1509 ret = kstrtoint(buf, 10, &target);
1510 if (ret)
1511 return ret;
1512
1513#ifdef CONFIG_CPU_HOTPLUG_STATE_CONTROL
1514 if (target < CPUHP_OFFLINE || target > CPUHP_ONLINE)
1515 return -EINVAL;
1516#else
1517 if (target != CPUHP_OFFLINE && target != CPUHP_ONLINE)
1518 return -EINVAL;
1519#endif
1520
1521 ret = lock_device_hotplug_sysfs();
1522 if (ret)
1523 return ret;
1524
1525 mutex_lock(&cpuhp_state_mutex);
1526 sp = cpuhp_get_step(target);
1527 ret = !sp->name || sp->cant_stop ? -EINVAL : 0;
1528 mutex_unlock(&cpuhp_state_mutex);
1529 if (ret)
1530 return ret;
1531
1532 if (st->state < target)
1533 ret = do_cpu_up(dev->id, target);
1534 else
1535 ret = do_cpu_down(dev->id, target);
1536
1537 unlock_device_hotplug();
1538 return ret ? ret : count;
1539}
1540
1541static ssize_t show_cpuhp_target(struct device *dev,
1542 struct device_attribute *attr, char *buf)
1543{
1544 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
1545
1546 return sprintf(buf, "%d\n", st->target);
1547}
1548static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target);
1549
1550static struct attribute *cpuhp_cpu_attrs[] = {
1551 &dev_attr_state.attr,
1552 &dev_attr_target.attr,
1553 NULL
1554};
1555
1556static struct attribute_group cpuhp_cpu_attr_group = {
1557 .attrs = cpuhp_cpu_attrs,
1558 .name = "hotplug",
1559 NULL
1560};
1561
1562static ssize_t show_cpuhp_states(struct device *dev,
1563 struct device_attribute *attr, char *buf)
1564{
1565 ssize_t cur, res = 0;
1566 int i;
1567
1568 mutex_lock(&cpuhp_state_mutex);
1569 for (i = CPUHP_OFFLINE; i <= CPUHP_ONLINE; i++) {
1570 struct cpuhp_step *sp = cpuhp_get_step(i);
1571
1572 if (sp->name) {
1573 cur = sprintf(buf, "%3d: %s\n", i, sp->name);
1574 buf += cur;
1575 res += cur;
1576 }
1577 }
1578 mutex_unlock(&cpuhp_state_mutex);
1579 return res;
1580}
1581static DEVICE_ATTR(states, 0444, show_cpuhp_states, NULL);
1582
1583static struct attribute *cpuhp_cpu_root_attrs[] = {
1584 &dev_attr_states.attr,
1585 NULL
1586};
1587
1588static struct attribute_group cpuhp_cpu_root_attr_group = {
1589 .attrs = cpuhp_cpu_root_attrs,
1590 .name = "hotplug",
1591 NULL
1592};
1593
1594static int __init cpuhp_sysfs_init(void)
1595{
1596 int cpu, ret;
1597
1598 ret = sysfs_create_group(&cpu_subsys.dev_root->kobj,
1599 &cpuhp_cpu_root_attr_group);
1600 if (ret)
1601 return ret;
1602
1603 for_each_possible_cpu(cpu) {
1604 struct device *dev = get_cpu_device(cpu);
1605
1606 if (!dev)
1607 continue;
1608 ret = sysfs_create_group(&dev->kobj, &cpuhp_cpu_attr_group);
1609 if (ret)
1610 return ret;
1611 }
1612 return 0;
1613}
1614device_initcall(cpuhp_sysfs_init);
1615#endif
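The attribute groups registered above surface as /sys/devices/system/cpu/cpuN/hotplug/{state,target} plus a global /sys/devices/system/cpu/hotplug/states list. A small user-space sketch (illustrative only, error handling kept minimal) that dumps them:

#include <stdio.h>

int main(void)
{
	char buf[256];
	FILE *f;

	f = fopen("/sys/devices/system/cpu/cpu1/hotplug/state", "r");
	if (f) {
		if (fgets(buf, sizeof(buf), f))
			printf("cpu1 state:  %s", buf);
		fclose(f);
	}

	f = fopen("/sys/devices/system/cpu/cpu1/hotplug/target", "r");
	if (f) {
		if (fgets(buf, sizeof(buf), f))
			printf("cpu1 target: %s", buf);
		fclose(f);
	}

	/* Numeric state -> name mapping produced by show_cpuhp_states() */
	f = fopen("/sys/devices/system/cpu/hotplug/states", "r");
	if (f) {
		while (fgets(buf, sizeof(buf), f))
			fputs(buf, stdout);
		fclose(f);
	}
	return 0;
}

Writing to the target file moves the CPU toward that state; without CONFIG_CPU_HOTPLUG_STATE_CONTROL only the fully offline and fully online targets are accepted, as enforced in write_cpuhp_target() above.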
732 1616
733/* 1617/*
734 * cpu_bit_bitmap[] is a special, "compressed" data structure that 1618 * cpu_bit_bitmap[] is a special, "compressed" data structure that
@@ -789,3 +1673,25 @@ void init_cpu_online(const struct cpumask *src)
789{ 1673{
790 cpumask_copy(&__cpu_online_mask, src); 1674 cpumask_copy(&__cpu_online_mask, src);
791} 1675}
1676
1677/*
1678 * Activate the first processor.
1679 */
1680void __init boot_cpu_init(void)
1681{
1682 int cpu = smp_processor_id();
1683
1684 /* Mark the boot cpu "present", "online" etc for SMP and UP case */
1685 set_cpu_online(cpu, true);
1686 set_cpu_active(cpu, true);
1687 set_cpu_present(cpu, true);
1688 set_cpu_possible(cpu, true);
1689}
1690
1691/*
1692 * Must be called _AFTER_ setting up the per_cpu areas
1693 */
1694void __init boot_cpu_state_init(void)
1695{
1696 per_cpu_ptr(&cpuhp_state, smp_processor_id())->state = CPUHP_ONLINE;
1697}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 0167679182c0..5f6ce931f1ea 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1178,6 +1178,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
1178 goto free_area; 1178 goto free_area;
1179 1179
1180 area->xol_mapping.name = "[uprobes]"; 1180 area->xol_mapping.name = "[uprobes]";
1181 area->xol_mapping.fault = NULL;
1181 area->xol_mapping.pages = area->pages; 1182 area->xol_mapping.pages = area->pages;
1182 area->pages[0] = alloc_page(GFP_HIGHUSER); 1183 area->pages[0] = alloc_page(GFP_HIGHUSER);
1183 if (!area->pages[0]) 1184 if (!area->pages[0])
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 3b48dab80164..3bbfd6a9c475 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -64,6 +64,10 @@ config IRQ_DOMAIN_HIERARCHY
64 bool 64 bool
65 select IRQ_DOMAIN 65 select IRQ_DOMAIN
66 66
67# Generic IRQ IPI support
68config GENERIC_IRQ_IPI
69 bool
70
67# Generic MSI interrupt support 71# Generic MSI interrupt support
68config GENERIC_MSI_IRQ 72config GENERIC_MSI_IRQ
69 bool 73 bool
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 2fc9cbdf35b6..2ee42e95a3ce 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -8,3 +8,4 @@ obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
8obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o 8obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
9obj-$(CONFIG_PM_SLEEP) += pm.o 9obj-$(CONFIG_PM_SLEEP) += pm.o
10obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o 10obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
11obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 5797909f4e5b..2f9f2b0e79f2 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -961,6 +961,7 @@ void irq_chip_mask_parent(struct irq_data *data)
961 data = data->parent_data; 961 data = data->parent_data;
962 data->chip->irq_mask(data); 962 data->chip->irq_mask(data);
963} 963}
964EXPORT_SYMBOL_GPL(irq_chip_mask_parent);
964 965
965/** 966/**
966 * irq_chip_unmask_parent - Unmask the parent interrupt 967 * irq_chip_unmask_parent - Unmask the parent interrupt
@@ -971,6 +972,7 @@ void irq_chip_unmask_parent(struct irq_data *data)
971 data = data->parent_data; 972 data = data->parent_data;
972 data->chip->irq_unmask(data); 973 data->chip->irq_unmask(data);
973} 974}
975EXPORT_SYMBOL_GPL(irq_chip_unmask_parent);
974 976
975/** 977/**
976 * irq_chip_eoi_parent - Invoke EOI on the parent interrupt 978 * irq_chip_eoi_parent - Invoke EOI on the parent interrupt
@@ -981,6 +983,7 @@ void irq_chip_eoi_parent(struct irq_data *data)
981 data = data->parent_data; 983 data = data->parent_data;
982 data->chip->irq_eoi(data); 984 data->chip->irq_eoi(data);
983} 985}
986EXPORT_SYMBOL_GPL(irq_chip_eoi_parent);
984 987
985/** 988/**
986 * irq_chip_set_affinity_parent - Set affinity on the parent interrupt 989 * irq_chip_set_affinity_parent - Set affinity on the parent interrupt
@@ -1016,6 +1019,7 @@ int irq_chip_set_type_parent(struct irq_data *data, unsigned int type)
1016 1019
1017 return -ENOSYS; 1020 return -ENOSYS;
1018} 1021}
1022EXPORT_SYMBOL_GPL(irq_chip_set_type_parent);
1019 1023
1020/** 1024/**
1021 * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware 1025 * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 57bff7857e87..a15b5485b446 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -136,10 +136,9 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
136{ 136{
137 irqreturn_t retval = IRQ_NONE; 137 irqreturn_t retval = IRQ_NONE;
138 unsigned int flags = 0, irq = desc->irq_data.irq; 138 unsigned int flags = 0, irq = desc->irq_data.irq;
139 struct irqaction *action = desc->action; 139 struct irqaction *action;
140 140
141 /* action might have become NULL since we dropped the lock */ 141 for_each_action_of_desc(desc, action) {
142 while (action) {
143 irqreturn_t res; 142 irqreturn_t res;
144 143
145 trace_irq_handler_entry(irq, action); 144 trace_irq_handler_entry(irq, action);
@@ -173,7 +172,6 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
173 } 172 }
174 173
175 retval |= res; 174 retval |= res;
176 action = action->next;
177 } 175 }
178 176
179 add_interrupt_randomness(irq, flags); 177 add_interrupt_randomness(irq, flags);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index fcab63c66905..09be2c903c6d 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -131,6 +131,9 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc)
131#define IRQ_GET_DESC_CHECK_GLOBAL (_IRQ_DESC_CHECK) 131#define IRQ_GET_DESC_CHECK_GLOBAL (_IRQ_DESC_CHECK)
132#define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU) 132#define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU)
133 133
134#define for_each_action_of_desc(desc, act) \
135 for (act = desc->act; act; act = act->next)
136
134struct irq_desc * 137struct irq_desc *
135__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, 138__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
136 unsigned int check); 139 unsigned int check);
@@ -160,6 +163,8 @@ irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags)
160 __irq_put_desc_unlock(desc, flags, false); 163 __irq_put_desc_unlock(desc, flags, false);
161} 164}
162 165
166#define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
167
163/* 168/*
164 * Manipulation functions for irq_data.state 169 * Manipulation functions for irq_data.state
165 */ 170 */
@@ -188,6 +193,8 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
188 return __irqd_to_state(d) & mask; 193 return __irqd_to_state(d) & mask;
189} 194}
190 195
196#undef __irqd_to_state
197
191static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) 198static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc)
192{ 199{
193 __this_cpu_inc(*desc->kstat_irqs); 200 __this_cpu_inc(*desc->kstat_irqs);
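All the action-list walks converted by this patch follow the same shape; as a hypothetical illustration (count_shared_actions() is not part of the patch), a walk over the shared handlers of a descriptor now reads:

/* Assumes kernel/irq/internals.h is included and desc->lock is held,
 * as in the converted callers.
 */
static unsigned int count_shared_actions(struct irq_desc *desc)
{
	struct irqaction *action;
	unsigned int n = 0;

	for_each_action_of_desc(desc, action)
		n++;
	return n;
}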
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
new file mode 100644
index 000000000000..c37f34b00a11
--- /dev/null
+++ b/kernel/irq/ipi.c
@@ -0,0 +1,326 @@
1/*
2 * linux/kernel/irq/ipi.c
3 *
4 * Copyright (C) 2015 Imagination Technologies Ltd
5 * Author: Qais Yousef <qais.yousef@imgtec.com>
6 *
7 * This file contains driver APIs to the IPI subsystem.
8 */
9
10#define pr_fmt(fmt) "genirq/ipi: " fmt
11
12#include <linux/irqdomain.h>
13#include <linux/irq.h>
14
15/**
16 * irq_reserve_ipi() - Set up an IPI to a destination cpumask
17 * @domain: IPI domain
18 * @dest: cpumask of cpus which can receive the IPI
19 *
20 * Allocate a virq that can be used to send IPI to any CPU in dest mask.
21 *
22 * Returns the Linux irq number on success and 0 on failure
23 */
24unsigned int irq_reserve_ipi(struct irq_domain *domain,
25 const struct cpumask *dest)
26{
27 unsigned int nr_irqs, offset;
28 struct irq_data *data;
29 int virq, i;
30
31	if (!domain || !irq_domain_is_ipi(domain)) {
32 pr_warn("Reservation on a non IPI domain\n");
33 return 0;
34 }
35
36 if (!cpumask_subset(dest, cpu_possible_mask)) {
37 pr_warn("Reservation is not in possible_cpu_mask\n");
38 return 0;
39 }
40
41 nr_irqs = cpumask_weight(dest);
42 if (!nr_irqs) {
43 pr_warn("Reservation for empty destination mask\n");
44 return 0;
45 }
46
47 if (irq_domain_is_ipi_single(domain)) {
48 /*
49 * If the underlying implementation uses a single HW irq on
50 * all cpus then we only need a single Linux irq number for
51 * it. We have no restrictions vs. the destination mask. The
52 * underlying implementation can deal with holes nicely.
53 */
54 nr_irqs = 1;
55 offset = 0;
56 } else {
57 unsigned int next;
58
59 /*
60 * The IPI requires a separate HW irq on each CPU. We require
61 * that the destination mask is consecutive. If an
62 * implementation needs to support holes, it can reserve
63 * several IPI ranges.
64 */
65 offset = cpumask_first(dest);
66 /*
67 * Find a hole and if found look for another set bit after the
68 * hole. For now we don't support this scenario.
69 */
70 next = cpumask_next_zero(offset, dest);
71 if (next < nr_cpu_ids)
72 next = cpumask_next(next, dest);
73 if (next < nr_cpu_ids) {
74 pr_warn("Destination mask has holes\n");
75 return 0;
76 }
77 }
78
79 virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE);
80 if (virq <= 0) {
81 pr_warn("Can't reserve IPI, failed to alloc descs\n");
82 return 0;
83 }
84
85 virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE,
86 (void *) dest, true);
87
88 if (virq <= 0) {
89 pr_warn("Can't reserve IPI, failed to alloc hw irqs\n");
90 goto free_descs;
91 }
92
93 for (i = 0; i < nr_irqs; i++) {
94 data = irq_get_irq_data(virq + i);
95 cpumask_copy(data->common->affinity, dest);
96 data->common->ipi_offset = offset;
97 }
98 return virq;
99
100free_descs:
101 irq_free_descs(virq, nr_irqs);
102 return 0;
103}
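A hypothetical caller sketch (names are illustrative, not part of this patch): an irqchip driver that created an IPI-capable domain reserves one virq covering all possible CPUs.

/* my_ipi_domain must have been created with one of the IPI domain flags
 * so that irq_domain_is_ipi() accepts it; 0 from irq_reserve_ipi() means
 * failure, any other value is the base Linux irq number.
 */
static unsigned int my_ipi_virq;

static int my_reserve_smp_ipi(struct irq_domain *my_ipi_domain)
{
	my_ipi_virq = irq_reserve_ipi(my_ipi_domain, cpu_possible_mask);
	return my_ipi_virq ? 0 : -ENODEV;
}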
104
105/**
106 * irq_destroy_ipi() - unreserve an IPI that was previously allocated
107 * @irq: linux irq number to be destroyed
108 *
109 * Return the IPIs allocated with irq_reserve_ipi() to the system destroying
110 * all virqs associated with them.
111 */
112void irq_destroy_ipi(unsigned int irq)
113{
114 struct irq_data *data = irq_get_irq_data(irq);
115 struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL;
116 struct irq_domain *domain;
117 unsigned int nr_irqs;
118
119 if (!irq || !data || !ipimask)
120 return;
121
122 domain = data->domain;
123 if (WARN_ON(domain == NULL))
124 return;
125
126 if (!irq_domain_is_ipi(domain)) {
127 pr_warn("Trying to destroy a non IPI domain!\n");
128 return;
129 }
130
131 if (irq_domain_is_ipi_per_cpu(domain))
132 nr_irqs = cpumask_weight(ipimask);
133 else
134 nr_irqs = 1;
135
136 irq_domain_free_irqs(irq, nr_irqs);
137}
138
139/**
140 * ipi_get_hwirq - Get the hwirq associated with an IPI to a cpu
141 * @irq: linux irq number
142 * @cpu: the target cpu
143 *
144 * When dealing with coprocessor IPIs, we need to inform the coprocessor of
145 * the hwirq it needs to use to receive and send IPIs.
146 *
147 * Returns hwirq value on success and INVALID_HWIRQ on failure.
148 */
149irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu)
150{
151 struct irq_data *data = irq_get_irq_data(irq);
152 struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL;
153
154 if (!data || !ipimask || cpu > nr_cpu_ids)
155 return INVALID_HWIRQ;
156
157 if (!cpumask_test_cpu(cpu, ipimask))
158 return INVALID_HWIRQ;
159
160 /*
161 * Get the real hardware irq number if the underlying implementation
162 * uses a separate irq per cpu. If the underlying implementation uses
163 * a single hardware irq for all cpus then the IPI send mechanism
164 * needs to take care of the cpu destinations.
165 */
166 if (irq_domain_is_ipi_per_cpu(data->domain))
167 data = irq_get_irq_data(irq + cpu - data->common->ipi_offset);
168
169 return data ? irqd_to_hwirq(data) : INVALID_HWIRQ;
170}
171EXPORT_SYMBOL_GPL(ipi_get_hwirq);
172
173static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data,
174 const struct cpumask *dest, unsigned int cpu)
175{
176 struct cpumask *ipimask = irq_data_get_affinity_mask(data);
177
178 if (!chip || !ipimask)
179 return -EINVAL;
180
181 if (!chip->ipi_send_single && !chip->ipi_send_mask)
182 return -EINVAL;
183
184 if (cpu > nr_cpu_ids)
185 return -EINVAL;
186
187 if (dest) {
188 if (!cpumask_subset(dest, ipimask))
189 return -EINVAL;
190 } else {
191 if (!cpumask_test_cpu(cpu, ipimask))
192 return -EINVAL;
193 }
194 return 0;
195}
196
197/**
198 * __ipi_send_single - send an IPI to a target Linux SMP CPU
199 * @desc: pointer to irq_desc of the IRQ
200 * @cpu: destination CPU, must be in the destination mask passed to
201 * irq_reserve_ipi()
202 *
203 * This function is for architecture or core code to speed up IPI sending. Not
204 * usable from driver code.
205 *
206 * Returns zero on success and negative error number on failure.
207 */
208int __ipi_send_single(struct irq_desc *desc, unsigned int cpu)
209{
210 struct irq_data *data = irq_desc_get_irq_data(desc);
211 struct irq_chip *chip = irq_data_get_irq_chip(data);
212
213#ifdef DEBUG
214 /*
215 * Minimise the overhead by omitting the checks for Linux SMP IPIs.
216 * Since the callers should be arch or core code which is generally
217 * trusted, only check for errors when debugging.
218 */
219 if (WARN_ON_ONCE(ipi_send_verify(chip, data, NULL, cpu)))
220 return -EINVAL;
221#endif
222 if (!chip->ipi_send_single) {
223 chip->ipi_send_mask(data, cpumask_of(cpu));
224 return 0;
225 }
226
227 /* FIXME: Store this information in irqdata flags */
228 if (irq_domain_is_ipi_per_cpu(data->domain) &&
229 cpu != data->common->ipi_offset) {
230 /* use the correct data for that cpu */
231 unsigned irq = data->irq + cpu - data->common->ipi_offset;
232
233 data = irq_get_irq_data(irq);
234 }
235 chip->ipi_send_single(data, cpu);
236 return 0;
237}
238
239/**
240 * __ipi_send_mask - send an IPI to target Linux SMP CPU(s)
241 * @desc: pointer to irq_desc of the IRQ
242 * @dest: dest CPU(s), must be a subset of the mask passed to
243 * irq_reserve_ipi()
244 *
245 * This function is for architecture or core code to speed up IPI sending. Not
246 * usable from driver code.
247 *
248 * Returns zero on success and negative error number on failure.
249 */
250int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest)
251{
252 struct irq_data *data = irq_desc_get_irq_data(desc);
253 struct irq_chip *chip = irq_data_get_irq_chip(data);
254 unsigned int cpu;
255
256#ifdef DEBUG
257 /*
258 * Minimise the overhead by omitting the checks for Linux SMP IPIs.
259 * Since the callers should be arch or core code which is generally
260 * trusted, only check for errors when debugging.
261 */
262 if (WARN_ON_ONCE(ipi_send_verify(chip, data, dest, 0)))
263 return -EINVAL;
264#endif
265 if (chip->ipi_send_mask) {
266 chip->ipi_send_mask(data, dest);
267 return 0;
268 }
269
270 if (irq_domain_is_ipi_per_cpu(data->domain)) {
271 unsigned int base = data->irq;
272
273 for_each_cpu(cpu, dest) {
274 unsigned irq = base + cpu - data->common->ipi_offset;
275
276 data = irq_get_irq_data(irq);
277 chip->ipi_send_single(data, cpu);
278 }
279 } else {
280 for_each_cpu(cpu, dest)
281 chip->ipi_send_single(data, cpu);
282 }
283 return 0;
284}
285
286/**
287 * ipi_send_single - Send an IPI to a single CPU
288 * @virq: linux irq number from irq_reserve_ipi()
289 * @cpu: destination CPU, must be in the destination mask passed to
290 * irq_reserve_ipi()
291 *
292 * Returns zero on success and negative error number on failure.
293 */
294int ipi_send_single(unsigned int virq, unsigned int cpu)
295{
296 struct irq_desc *desc = irq_to_desc(virq);
297 struct irq_data *data = desc ? irq_desc_get_irq_data(desc) : NULL;
298 struct irq_chip *chip = data ? irq_data_get_irq_chip(data) : NULL;
299
300 if (WARN_ON_ONCE(ipi_send_verify(chip, data, NULL, cpu)))
301 return -EINVAL;
302
303 return __ipi_send_single(desc, cpu);
304}
305EXPORT_SYMBOL_GPL(ipi_send_single);
306
307/**
308 * ipi_send_mask - Send an IPI to target CPU(s)
309 * @virq: linux irq number from irq_reserve_ipi()
310 * @dest: dest CPU(s), must be a subset of the mask passed to
311 * irq_reserve_ipi()
312 *
313 * Returns zero on success and negative error number on failure.
314 */
315int ipi_send_mask(unsigned int virq, const struct cpumask *dest)
316{
317 struct irq_desc *desc = irq_to_desc(virq);
318 struct irq_data *data = desc ? irq_desc_get_irq_data(desc) : NULL;
319 struct irq_chip *chip = data ? irq_data_get_irq_chip(data) : NULL;
320
321 if (WARN_ON_ONCE(ipi_send_verify(chip, data, dest, 0)))
322 return -EINVAL;
323
324 return __ipi_send_mask(desc, dest);
325}
326EXPORT_SYMBOL_GPL(ipi_send_mask);
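And the sending side, continuing the hypothetical virq reserved in the sketch after irq_reserve_ipi() above:

/* Illustrative senders; ipi_send_single()/ipi_send_mask() perform the
 * chip and mask validation via ipi_send_verify().
 */
static void my_kick_cpu(unsigned int cpu)
{
	ipi_send_single(my_ipi_virq, cpu);
}

static void my_kick_cpus(const struct cpumask *mask)
{
	ipi_send_mask(my_ipi_virq, mask);
}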
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 0409da0bcc33..0ccd028817d7 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -24,10 +24,27 @@
24static struct lock_class_key irq_desc_lock_class; 24static struct lock_class_key irq_desc_lock_class;
25 25
26#if defined(CONFIG_SMP) 26#if defined(CONFIG_SMP)
27static int __init irq_affinity_setup(char *str)
28{
29 zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
30 cpulist_parse(str, irq_default_affinity);
31 /*
32 * Set at least the boot cpu. We don't want to end up with
33 * bug reports caused by random command line masks
34 */
35 cpumask_set_cpu(smp_processor_id(), irq_default_affinity);
36 return 1;
37}
38__setup("irqaffinity=", irq_affinity_setup);
39
27static void __init init_irq_default_affinity(void) 40static void __init init_irq_default_affinity(void)
28{ 41{
29 alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); 42#ifdef CONFIG_CPUMASK_OFFSTACK
30 cpumask_setall(irq_default_affinity); 43 if (!irq_default_affinity)
44 zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
45#endif
46 if (cpumask_empty(irq_default_affinity))
47 cpumask_setall(irq_default_affinity);
31} 48}
32#else 49#else
33static void __init init_irq_default_affinity(void) 50static void __init init_irq_default_affinity(void)
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 3e56d2f03e24..3a519a01118b 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -23,8 +23,6 @@ static DEFINE_MUTEX(irq_domain_mutex);
23static DEFINE_MUTEX(revmap_trees_mutex); 23static DEFINE_MUTEX(revmap_trees_mutex);
24static struct irq_domain *irq_default_domain; 24static struct irq_domain *irq_default_domain;
25 25
26static int irq_domain_alloc_descs(int virq, unsigned int nr_irqs,
27 irq_hw_number_t hwirq, int node);
28static void irq_domain_check_hierarchy(struct irq_domain *domain); 26static void irq_domain_check_hierarchy(struct irq_domain *domain);
29 27
30struct irqchip_fwid { 28struct irqchip_fwid {
@@ -840,8 +838,8 @@ const struct irq_domain_ops irq_domain_simple_ops = {
840}; 838};
841EXPORT_SYMBOL_GPL(irq_domain_simple_ops); 839EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
842 840
843static int irq_domain_alloc_descs(int virq, unsigned int cnt, 841int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq,
844 irq_hw_number_t hwirq, int node) 842 int node)
845{ 843{
846 unsigned int hint; 844 unsigned int hint;
847 845
@@ -895,6 +893,7 @@ struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent,
895 893
896 return domain; 894 return domain;
897} 895}
896EXPORT_SYMBOL_GPL(irq_domain_create_hierarchy);
898 897
899static void irq_domain_insert_irq(int virq) 898static void irq_domain_insert_irq(int virq)
900{ 899{
@@ -1045,6 +1044,7 @@ int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq,
1045 1044
1046 return 0; 1045 return 0;
1047} 1046}
1047EXPORT_SYMBOL_GPL(irq_domain_set_hwirq_and_chip);
1048 1048
1049/** 1049/**
1050 * irq_domain_set_info - Set the complete data for a @virq in @domain 1050 * irq_domain_set_info - Set the complete data for a @virq in @domain
@@ -1078,6 +1078,7 @@ void irq_domain_reset_irq_data(struct irq_data *irq_data)
1078 irq_data->chip = &no_irq_chip; 1078 irq_data->chip = &no_irq_chip;
1079 irq_data->chip_data = NULL; 1079 irq_data->chip_data = NULL;
1080} 1080}
1081EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data);
1081 1082
1082/** 1083/**
1083 * irq_domain_free_irqs_common - Clear irq_data and free the parent 1084 * irq_domain_free_irqs_common - Clear irq_data and free the parent
@@ -1275,6 +1276,7 @@ int irq_domain_alloc_irqs_parent(struct irq_domain *domain,
1275 nr_irqs, arg); 1276 nr_irqs, arg);
1276 return -ENOSYS; 1277 return -ENOSYS;
1277} 1278}
1279EXPORT_SYMBOL_GPL(irq_domain_alloc_irqs_parent);
1278 1280
1279/** 1281/**
1280 * irq_domain_free_irqs_parent - Free interrupts from parent domain 1282 * irq_domain_free_irqs_parent - Free interrupts from parent domain
@@ -1292,6 +1294,7 @@ void irq_domain_free_irqs_parent(struct irq_domain *domain,
1292 irq_domain_free_irqs_recursive(domain->parent, irq_base, 1294 irq_domain_free_irqs_recursive(domain->parent, irq_base,
1293 nr_irqs); 1295 nr_irqs);
1294} 1296}
1297EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent);
1295 1298
1296/** 1299/**
1297 * irq_domain_activate_irq - Call domain_ops->activate recursively to activate 1300 * irq_domain_activate_irq - Call domain_ops->activate recursively to activate
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 841187239adc..3ddd2297ee95 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -144,13 +144,11 @@ int irq_can_set_affinity(unsigned int irq)
144 */ 144 */
145void irq_set_thread_affinity(struct irq_desc *desc) 145void irq_set_thread_affinity(struct irq_desc *desc)
146{ 146{
147 struct irqaction *action = desc->action; 147 struct irqaction *action;
148 148
149 while (action) { 149 for_each_action_of_desc(desc, action)
150 if (action->thread) 150 if (action->thread)
151 set_bit(IRQTF_AFFINITY, &action->thread_flags); 151 set_bit(IRQTF_AFFINITY, &action->thread_flags);
152 action = action->next;
153 }
154} 152}
155 153
156#ifdef CONFIG_GENERIC_PENDING_IRQ 154#ifdef CONFIG_GENERIC_PENDING_IRQ
@@ -994,7 +992,7 @@ void irq_wake_thread(unsigned int irq, void *dev_id)
994 return; 992 return;
995 993
996 raw_spin_lock_irqsave(&desc->lock, flags); 994 raw_spin_lock_irqsave(&desc->lock, flags);
997 for (action = desc->action; action; action = action->next) { 995 for_each_action_of_desc(desc, action) {
998 if (action->dev_id == dev_id) { 996 if (action->dev_id == dev_id) {
999 if (action->thread) 997 if (action->thread)
1000 __irq_wake_thread(desc, action); 998 __irq_wake_thread(desc, action);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index a2c02fd5d6d0..4e1b94726818 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -291,7 +291,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action)
291 int ret = 1; 291 int ret = 1;
292 292
293 raw_spin_lock_irqsave(&desc->lock, flags); 293 raw_spin_lock_irqsave(&desc->lock, flags);
294 for (action = desc->action ; action; action = action->next) { 294 for_each_action_of_desc(desc, action) {
295 if ((action != new_action) && action->name && 295 if ((action != new_action) && action->name &&
296 !strcmp(new_action->name, action->name)) { 296 !strcmp(new_action->name, action->name)) {
297 ret = 0; 297 ret = 0;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 32144175458d..5707f97a3e6a 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -211,14 +211,12 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
211 * desc->lock here. See synchronize_irq(). 211 * desc->lock here. See synchronize_irq().
212 */ 212 */
213 raw_spin_lock_irqsave(&desc->lock, flags); 213 raw_spin_lock_irqsave(&desc->lock, flags);
214 action = desc->action; 214 for_each_action_of_desc(desc, action) {
215 while (action) {
216 printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler); 215 printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler);
217 if (action->thread_fn) 216 if (action->thread_fn)
218 printk(KERN_CONT " threaded [<%p>] %pf", 217 printk(KERN_CONT " threaded [<%p>] %pf",
219 action->thread_fn, action->thread_fn); 218 action->thread_fn, action->thread_fn);
220 printk(KERN_CONT "\n"); 219 printk(KERN_CONT "\n");
221 action = action->next;
222 } 220 }
223 raw_spin_unlock_irqrestore(&desc->lock, flags); 221 raw_spin_unlock_irqrestore(&desc->lock, flags);
224} 222}
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index d2988d047d66..65ae0e5c35da 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -932,12 +932,14 @@ rcu_torture_writer(void *arg)
932 int nsynctypes = 0; 932 int nsynctypes = 0;
933 933
934 VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); 934 VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
935 pr_alert("%s" TORTURE_FLAG 935 if (!can_expedite) {
936 " Grace periods expedited from boot/sysfs for %s,\n", 936 pr_alert("%s" TORTURE_FLAG
937 torture_type, cur_ops->name); 937 " Grace periods expedited from boot/sysfs for %s,\n",
938 pr_alert("%s" TORTURE_FLAG 938 torture_type, cur_ops->name);
939 " Testing of dynamic grace-period expediting diabled.\n", 939 pr_alert("%s" TORTURE_FLAG
940 torture_type); 940 " Disabled dynamic grace-period expediting.\n",
941 torture_type);
942 }
941 943
942 /* Initialize synctype[] array. If none set, take default. */ 944 /* Initialize synctype[] array. If none set, take default. */
943 if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1) 945 if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1)
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index e492a5253e0f..196f0302e2f4 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -23,7 +23,7 @@
23 */ 23 */
24 24
25#include <linux/kthread.h> 25#include <linux/kthread.h>
26#include <linux/module.h> 26#include <linux/init.h>
27#include <linux/debugfs.h> 27#include <linux/debugfs.h>
28#include <linux/seq_file.h> 28#include <linux/seq_file.h>
29 29
@@ -122,18 +122,7 @@ free_out:
122 debugfs_remove_recursive(rcudir); 122 debugfs_remove_recursive(rcudir);
123 return 1; 123 return 1;
124} 124}
125 125device_initcall(rcutiny_trace_init);
126static void __exit rcutiny_trace_cleanup(void)
127{
128 debugfs_remove_recursive(rcudir);
129}
130
131module_init(rcutiny_trace_init);
132module_exit(rcutiny_trace_cleanup);
133
134MODULE_AUTHOR("Paul E. McKenney");
135MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
136MODULE_LICENSE("GPL");
137 126
138static void check_cpu_stall(struct rcu_ctrlblk *rcp) 127static void check_cpu_stall(struct rcu_ctrlblk *rcp)
139{ 128{
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 9fd5b628a88d..9a535a86e732 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -108,7 +108,6 @@ RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
108RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); 108RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
109 109
110static struct rcu_state *const rcu_state_p; 110static struct rcu_state *const rcu_state_p;
111static struct rcu_data __percpu *const rcu_data_p;
112LIST_HEAD(rcu_struct_flavors); 111LIST_HEAD(rcu_struct_flavors);
113 112
114/* Dump rcu_node combining tree at boot to verify correct setup. */ 113/* Dump rcu_node combining tree at boot to verify correct setup. */
@@ -1083,13 +1082,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
1083 rcu_sysidle_check_cpu(rdp, isidle, maxj); 1082 rcu_sysidle_check_cpu(rdp, isidle, maxj);
1084 if ((rdp->dynticks_snap & 0x1) == 0) { 1083 if ((rdp->dynticks_snap & 0x1) == 0) {
1085 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); 1084 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
1086 return 1;
1087 } else {
1088 if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, 1085 if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4,
1089 rdp->mynode->gpnum)) 1086 rdp->mynode->gpnum))
1090 WRITE_ONCE(rdp->gpwrap, true); 1087 WRITE_ONCE(rdp->gpwrap, true);
1091 return 0; 1088 return 1;
1092 } 1089 }
1090 return 0;
1093} 1091}
1094 1092
1095/* 1093/*
@@ -1173,15 +1171,16 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
1173 smp_mb(); /* ->cond_resched_completed before *rcrmp. */ 1171 smp_mb(); /* ->cond_resched_completed before *rcrmp. */
1174 WRITE_ONCE(*rcrmp, 1172 WRITE_ONCE(*rcrmp,
1175 READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask); 1173 READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask);
1176 resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
1177 rdp->rsp->jiffies_resched += 5; /* Enable beating. */
1178 } else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
1179 /* Time to beat on that CPU again! */
1180 resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
1181 rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
1182 } 1174 }
1175 rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
1183 } 1176 }
1184 1177
1178 /* And if it has been a really long time, kick the CPU as well. */
1179 if (ULONG_CMP_GE(jiffies,
1180 rdp->rsp->gp_start + 2 * jiffies_till_sched_qs) ||
1181 ULONG_CMP_GE(jiffies, rdp->rsp->gp_start + jiffies_till_sched_qs))
1182 resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
1183
1185 return 0; 1184 return 0;
1186} 1185}
1187 1186
@@ -1246,7 +1245,7 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
1246 if (rnp->qsmask & (1UL << cpu)) 1245 if (rnp->qsmask & (1UL << cpu))
1247 dump_cpu_task(rnp->grplo + cpu); 1246 dump_cpu_task(rnp->grplo + cpu);
1248 } 1247 }
1249 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1248 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1250 } 1249 }
1251} 1250}
1252 1251
@@ -1266,12 +1265,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
1266 raw_spin_lock_irqsave_rcu_node(rnp, flags); 1265 raw_spin_lock_irqsave_rcu_node(rnp, flags);
1267 delta = jiffies - READ_ONCE(rsp->jiffies_stall); 1266 delta = jiffies - READ_ONCE(rsp->jiffies_stall);
1268 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { 1267 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
1269 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1268 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1270 return; 1269 return;
1271 } 1270 }
1272 WRITE_ONCE(rsp->jiffies_stall, 1271 WRITE_ONCE(rsp->jiffies_stall,
1273 jiffies + 3 * rcu_jiffies_till_stall_check() + 3); 1272 jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
1274 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1273 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1275 1274
1276 /* 1275 /*
1277 * OK, time to rat on our buddy... 1276 * OK, time to rat on our buddy...
@@ -1292,7 +1291,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
1292 ndetected++; 1291 ndetected++;
1293 } 1292 }
1294 } 1293 }
1295 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1294 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1296 } 1295 }
1297 1296
1298 print_cpu_stall_info_end(); 1297 print_cpu_stall_info_end();
@@ -1357,7 +1356,7 @@ static void print_cpu_stall(struct rcu_state *rsp)
1357 if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) 1356 if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall)))
1358 WRITE_ONCE(rsp->jiffies_stall, 1357 WRITE_ONCE(rsp->jiffies_stall,
1359 jiffies + 3 * rcu_jiffies_till_stall_check() + 3); 1358 jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
1360 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1359 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1361 1360
1362 /* 1361 /*
1363 * Attempt to revive the RCU machinery by forcing a context switch. 1362 * Attempt to revive the RCU machinery by forcing a context switch.
@@ -1595,7 +1594,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1595 } 1594 }
1596unlock_out: 1595unlock_out:
1597 if (rnp != rnp_root) 1596 if (rnp != rnp_root)
1598 raw_spin_unlock(&rnp_root->lock); 1597 raw_spin_unlock_rcu_node(rnp_root);
1599out: 1598out:
1600 if (c_out != NULL) 1599 if (c_out != NULL)
1601 *c_out = c; 1600 *c_out = c;
@@ -1814,7 +1813,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
1814 return; 1813 return;
1815 } 1814 }
1816 needwake = __note_gp_changes(rsp, rnp, rdp); 1815 needwake = __note_gp_changes(rsp, rnp, rdp);
1817 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1816 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1818 if (needwake) 1817 if (needwake)
1819 rcu_gp_kthread_wake(rsp); 1818 rcu_gp_kthread_wake(rsp);
1820} 1819}
@@ -1839,7 +1838,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
1839 raw_spin_lock_irq_rcu_node(rnp); 1838 raw_spin_lock_irq_rcu_node(rnp);
1840 if (!READ_ONCE(rsp->gp_flags)) { 1839 if (!READ_ONCE(rsp->gp_flags)) {
1841 /* Spurious wakeup, tell caller to go back to sleep. */ 1840 /* Spurious wakeup, tell caller to go back to sleep. */
1842 raw_spin_unlock_irq(&rnp->lock); 1841 raw_spin_unlock_irq_rcu_node(rnp);
1843 return false; 1842 return false;
1844 } 1843 }
1845 WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */ 1844 WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */
@@ -1849,7 +1848,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
1849 * Grace period already in progress, don't start another. 1848 * Grace period already in progress, don't start another.
1850 * Not supposed to be able to happen. 1849 * Not supposed to be able to happen.
1851 */ 1850 */
1852 raw_spin_unlock_irq(&rnp->lock); 1851 raw_spin_unlock_irq_rcu_node(rnp);
1853 return false; 1852 return false;
1854 } 1853 }
1855 1854
@@ -1858,7 +1857,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
1858 /* Record GP times before starting GP, hence smp_store_release(). */ 1857 /* Record GP times before starting GP, hence smp_store_release(). */
1859 smp_store_release(&rsp->gpnum, rsp->gpnum + 1); 1858 smp_store_release(&rsp->gpnum, rsp->gpnum + 1);
1860 trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); 1859 trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));
1861 raw_spin_unlock_irq(&rnp->lock); 1860 raw_spin_unlock_irq_rcu_node(rnp);
1862 1861
1863 /* 1862 /*
1864 * Apply per-leaf buffered online and offline operations to the 1863 * Apply per-leaf buffered online and offline operations to the
@@ -1872,7 +1871,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
1872 if (rnp->qsmaskinit == rnp->qsmaskinitnext && 1871 if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
1873 !rnp->wait_blkd_tasks) { 1872 !rnp->wait_blkd_tasks) {
1874 /* Nothing to do on this leaf rcu_node structure. */ 1873 /* Nothing to do on this leaf rcu_node structure. */
1875 raw_spin_unlock_irq(&rnp->lock); 1874 raw_spin_unlock_irq_rcu_node(rnp);
1876 continue; 1875 continue;
1877 } 1876 }
1878 1877
@@ -1906,7 +1905,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
1906 rcu_cleanup_dead_rnp(rnp); 1905 rcu_cleanup_dead_rnp(rnp);
1907 } 1906 }
1908 1907
1909 raw_spin_unlock_irq(&rnp->lock); 1908 raw_spin_unlock_irq_rcu_node(rnp);
1910 } 1909 }
1911 1910
1912 /* 1911 /*
@@ -1937,7 +1936,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
1937 trace_rcu_grace_period_init(rsp->name, rnp->gpnum, 1936 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
1938 rnp->level, rnp->grplo, 1937 rnp->level, rnp->grplo,
1939 rnp->grphi, rnp->qsmask); 1938 rnp->grphi, rnp->qsmask);
1940 raw_spin_unlock_irq(&rnp->lock); 1939 raw_spin_unlock_irq_rcu_node(rnp);
1941 cond_resched_rcu_qs(); 1940 cond_resched_rcu_qs();
1942 WRITE_ONCE(rsp->gp_activity, jiffies); 1941 WRITE_ONCE(rsp->gp_activity, jiffies);
1943 } 1942 }
@@ -1995,7 +1994,7 @@ static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time)
1995 raw_spin_lock_irq_rcu_node(rnp); 1994 raw_spin_lock_irq_rcu_node(rnp);
1996 WRITE_ONCE(rsp->gp_flags, 1995 WRITE_ONCE(rsp->gp_flags,
1997 READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS); 1996 READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS);
1998 raw_spin_unlock_irq(&rnp->lock); 1997 raw_spin_unlock_irq_rcu_node(rnp);
1999 } 1998 }
2000} 1999}
2001 2000
@@ -2025,7 +2024,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
2025 * safe for us to drop the lock in order to mark the grace 2024 * safe for us to drop the lock in order to mark the grace
2026 * period as completed in all of the rcu_node structures. 2025 * period as completed in all of the rcu_node structures.
2027 */ 2026 */
2028 raw_spin_unlock_irq(&rnp->lock); 2027 raw_spin_unlock_irq_rcu_node(rnp);
2029 2028
2030 /* 2029 /*
2031 * Propagate new ->completed value to rcu_node structures so 2030 * Propagate new ->completed value to rcu_node structures so
@@ -2047,7 +2046,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
2047 /* smp_mb() provided by prior unlock-lock pair. */ 2046 /* smp_mb() provided by prior unlock-lock pair. */
2048 nocb += rcu_future_gp_cleanup(rsp, rnp); 2047 nocb += rcu_future_gp_cleanup(rsp, rnp);
2049 sq = rcu_nocb_gp_get(rnp); 2048 sq = rcu_nocb_gp_get(rnp);
2050 raw_spin_unlock_irq(&rnp->lock); 2049 raw_spin_unlock_irq_rcu_node(rnp);
2051 rcu_nocb_gp_cleanup(sq); 2050 rcu_nocb_gp_cleanup(sq);
2052 cond_resched_rcu_qs(); 2051 cond_resched_rcu_qs();
2053 WRITE_ONCE(rsp->gp_activity, jiffies); 2052 WRITE_ONCE(rsp->gp_activity, jiffies);
@@ -2070,7 +2069,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
2070 READ_ONCE(rsp->gpnum), 2069 READ_ONCE(rsp->gpnum),
2071 TPS("newreq")); 2070 TPS("newreq"));
2072 } 2071 }
2073 raw_spin_unlock_irq(&rnp->lock); 2072 raw_spin_unlock_irq_rcu_node(rnp);
2074} 2073}
2075 2074
2076/* 2075/*
@@ -2236,18 +2235,20 @@ static bool rcu_start_gp(struct rcu_state *rsp)
2236} 2235}
2237 2236
2238/* 2237/*
2239 * Report a full set of quiescent states to the specified rcu_state 2238 * Report a full set of quiescent states to the specified rcu_state data
2240 * data structure. This involves cleaning up after the prior grace 2239 * structure. Invoke rcu_gp_kthread_wake() to awaken the grace-period
2241 * period and letting rcu_start_gp() start up the next grace period 2240 * kthread if another grace period is required. Whether we wake
2242 * if one is needed. Note that the caller must hold rnp->lock, which 2241 * the grace-period kthread or it awakens itself for the next round
2243 * is released before return. 2242 * of quiescent-state forcing, that kthread will clean up after the
2243 * just-completed grace period. Note that the caller must hold rnp->lock,
2244 * which is released before return.
2244 */ 2245 */
2245static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) 2246static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
2246 __releases(rcu_get_root(rsp)->lock) 2247 __releases(rcu_get_root(rsp)->lock)
2247{ 2248{
2248 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 2249 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
2249 WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); 2250 WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
2250 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); 2251 raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags);
2251 swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ 2252 swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */
2252} 2253}
2253 2254
@@ -2277,7 +2278,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
2277 * Our bit has already been cleared, or the 2278 * Our bit has already been cleared, or the
2278 * relevant grace period is already over, so done. 2279 * relevant grace period is already over, so done.
2279 */ 2280 */
2280 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2281 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2281 return; 2282 return;
2282 } 2283 }
2283 WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ 2284 WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
@@ -2289,7 +2290,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
2289 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { 2290 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
2290 2291
2291 /* Other bits still set at this level, so done. */ 2292 /* Other bits still set at this level, so done. */
2292 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2293 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2293 return; 2294 return;
2294 } 2295 }
2295 mask = rnp->grpmask; 2296 mask = rnp->grpmask;
@@ -2299,7 +2300,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
2299 2300
2300 break; 2301 break;
2301 } 2302 }
2302 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2303 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2303 rnp_c = rnp; 2304 rnp_c = rnp;
2304 rnp = rnp->parent; 2305 rnp = rnp->parent;
2305 raw_spin_lock_irqsave_rcu_node(rnp, flags); 2306 raw_spin_lock_irqsave_rcu_node(rnp, flags);
@@ -2331,7 +2332,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
2331 2332
2332 if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || 2333 if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p ||
2333 rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { 2334 rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
2334 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2335 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2335 return; /* Still need more quiescent states! */ 2336 return; /* Still need more quiescent states! */
2336 } 2337 }
2337 2338
@@ -2348,19 +2349,14 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
2348 /* Report up the rest of the hierarchy, tracking current ->gpnum. */ 2349 /* Report up the rest of the hierarchy, tracking current ->gpnum. */
2349 gps = rnp->gpnum; 2350 gps = rnp->gpnum;
2350 mask = rnp->grpmask; 2351 mask = rnp->grpmask;
2351 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 2352 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
2352 raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. */ 2353 raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. */
2353 rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags); 2354 rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags);
2354} 2355}
2355 2356
2356/* 2357/*
2357 * Record a quiescent state for the specified CPU to that CPU's rcu_data 2358 * Record a quiescent state for the specified CPU to that CPU's rcu_data
2358 * structure. This must be either called from the specified CPU, or 2359 * structure. This must be called from the specified CPU.
2359 * called when the specified CPU is known to be offline (and when it is
2360 * also known that no other CPU is concurrently trying to help the offline
2361 * CPU). The lastcomp argument is used to make sure we are still in the
2362 * grace period of interest. We don't want to end the current grace period
2363 * based on quiescent states detected in an earlier grace period!
2364 */ 2360 */
2365static void 2361static void
2366rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) 2362rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
@@ -2385,14 +2381,14 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
2385 */ 2381 */
2386 rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ 2382 rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */
2387 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); 2383 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
2388 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2384 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2389 return; 2385 return;
2390 } 2386 }
2391 mask = rdp->grpmask; 2387 mask = rdp->grpmask;
2392 if ((rnp->qsmask & mask) == 0) { 2388 if ((rnp->qsmask & mask) == 0) {
2393 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2389 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2394 } else { 2390 } else {
2395 rdp->core_needs_qs = 0; 2391 rdp->core_needs_qs = false;
2396 2392
2397 /* 2393 /*
2398 * This GP can't end until cpu checks in, so all of our 2394 * This GP can't end until cpu checks in, so all of our
@@ -2601,36 +2597,15 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
2601 rnp->qsmaskinit &= ~mask; 2597 rnp->qsmaskinit &= ~mask;
2602 rnp->qsmask &= ~mask; 2598 rnp->qsmask &= ~mask;
2603 if (rnp->qsmaskinit) { 2599 if (rnp->qsmaskinit) {
2604 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 2600 raw_spin_unlock_rcu_node(rnp);
2601 /* irqs remain disabled. */
2605 return; 2602 return;
2606 } 2603 }
2607 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 2604 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
2608 } 2605 }
2609} 2606}
2610 2607
2611/* 2608/*
2612 * The CPU is exiting the idle loop into the arch_cpu_idle_dead()
2613 * function. We now remove it from the rcu_node tree's ->qsmaskinit
2614 * bit masks.
2615 */
2616static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
2617{
2618 unsigned long flags;
2619 unsigned long mask;
2620 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2621 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
2622
2623 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
2624 return;
2625
2626 /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
2627 mask = rdp->grpmask;
2628 raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
2629 rnp->qsmaskinitnext &= ~mask;
2630 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2631}
2632
2633/*
2634 * The CPU has been completely removed, and some other CPU is reporting 2609 * The CPU has been completely removed, and some other CPU is reporting
2635 * this fact from process context. Do the remainder of the cleanup, 2610 * this fact from process context. Do the remainder of the cleanup,
2636 * including orphaning the outgoing CPU's RCU callbacks, and also 2611 * including orphaning the outgoing CPU's RCU callbacks, and also
@@ -2861,7 +2836,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
2861 rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags); 2836 rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags);
2862 } else { 2837 } else {
2863 /* Nothing to do here, so just drop the lock. */ 2838 /* Nothing to do here, so just drop the lock. */
2864 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2839 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2865 } 2840 }
2866 } 2841 }
2867} 2842}
@@ -2897,11 +2872,11 @@ static void force_quiescent_state(struct rcu_state *rsp)
2897 raw_spin_unlock(&rnp_old->fqslock); 2872 raw_spin_unlock(&rnp_old->fqslock);
2898 if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 2873 if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
2899 rsp->n_force_qs_lh++; 2874 rsp->n_force_qs_lh++;
2900 raw_spin_unlock_irqrestore(&rnp_old->lock, flags); 2875 raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
2901 return; /* Someone beat us to it. */ 2876 return; /* Someone beat us to it. */
2902 } 2877 }
2903 WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); 2878 WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
2904 raw_spin_unlock_irqrestore(&rnp_old->lock, flags); 2879 raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
2905 swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ 2880 swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */
2906} 2881}
2907 2882
@@ -2927,7 +2902,7 @@ __rcu_process_callbacks(struct rcu_state *rsp)
2927 if (cpu_needs_another_gp(rsp, rdp)) { 2902 if (cpu_needs_another_gp(rsp, rdp)) {
2928 raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. */ 2903 raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. */
2929 needwake = rcu_start_gp(rsp); 2904 needwake = rcu_start_gp(rsp);
2930 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); 2905 raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags);
2931 if (needwake) 2906 if (needwake)
2932 rcu_gp_kthread_wake(rsp); 2907 rcu_gp_kthread_wake(rsp);
2933 } else { 2908 } else {
@@ -3018,7 +2993,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
3018 2993
3019 raw_spin_lock_rcu_node(rnp_root); 2994 raw_spin_lock_rcu_node(rnp_root);
3020 needwake = rcu_start_gp(rsp); 2995 needwake = rcu_start_gp(rsp);
3021 raw_spin_unlock(&rnp_root->lock); 2996 raw_spin_unlock_rcu_node(rnp_root);
3022 if (needwake) 2997 if (needwake)
3023 rcu_gp_kthread_wake(rsp); 2998 rcu_gp_kthread_wake(rsp);
3024 } else { 2999 } else {
@@ -3438,14 +3413,14 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
3438 rcu_for_each_leaf_node(rsp, rnp) { 3413 rcu_for_each_leaf_node(rsp, rnp) {
3439 raw_spin_lock_irqsave_rcu_node(rnp, flags); 3414 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3440 if (rnp->expmaskinit == rnp->expmaskinitnext) { 3415 if (rnp->expmaskinit == rnp->expmaskinitnext) {
3441 raw_spin_unlock_irqrestore(&rnp->lock, flags); 3416 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3442 continue; /* No new CPUs, nothing to do. */ 3417 continue; /* No new CPUs, nothing to do. */
3443 } 3418 }
3444 3419
3445 /* Update this node's mask, track old value for propagation. */ 3420 /* Update this node's mask, track old value for propagation. */
3446 oldmask = rnp->expmaskinit; 3421 oldmask = rnp->expmaskinit;
3447 rnp->expmaskinit = rnp->expmaskinitnext; 3422 rnp->expmaskinit = rnp->expmaskinitnext;
3448 raw_spin_unlock_irqrestore(&rnp->lock, flags); 3423 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3449 3424
3450 /* If was already nonzero, nothing to propagate. */ 3425 /* If was already nonzero, nothing to propagate. */
3451 if (oldmask) 3426 if (oldmask)
@@ -3460,7 +3435,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
3460 if (rnp_up->expmaskinit) 3435 if (rnp_up->expmaskinit)
3461 done = true; 3436 done = true;
3462 rnp_up->expmaskinit |= mask; 3437 rnp_up->expmaskinit |= mask;
3463 raw_spin_unlock_irqrestore(&rnp_up->lock, flags); 3438 raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags);
3464 if (done) 3439 if (done)
3465 break; 3440 break;
3466 mask = rnp_up->grpmask; 3441 mask = rnp_up->grpmask;
@@ -3483,7 +3458,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
3483 raw_spin_lock_irqsave_rcu_node(rnp, flags); 3458 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3484 WARN_ON_ONCE(rnp->expmask); 3459 WARN_ON_ONCE(rnp->expmask);
3485 rnp->expmask = rnp->expmaskinit; 3460 rnp->expmask = rnp->expmaskinit;
3486 raw_spin_unlock_irqrestore(&rnp->lock, flags); 3461 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3487 } 3462 }
3488} 3463}
3489 3464
@@ -3524,11 +3499,11 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
3524 if (!rnp->expmask) 3499 if (!rnp->expmask)
3525 rcu_initiate_boost(rnp, flags); 3500 rcu_initiate_boost(rnp, flags);
3526 else 3501 else
3527 raw_spin_unlock_irqrestore(&rnp->lock, flags); 3502 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3528 break; 3503 break;
3529 } 3504 }
3530 if (rnp->parent == NULL) { 3505 if (rnp->parent == NULL) {
3531 raw_spin_unlock_irqrestore(&rnp->lock, flags); 3506 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3532 if (wake) { 3507 if (wake) {
3533 smp_mb(); /* EGP done before wake_up(). */ 3508 smp_mb(); /* EGP done before wake_up(). */
3534 swake_up(&rsp->expedited_wq); 3509 swake_up(&rsp->expedited_wq);
@@ -3536,7 +3511,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
3536 break; 3511 break;
3537 } 3512 }
3538 mask = rnp->grpmask; 3513 mask = rnp->grpmask;
3539 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ 3514 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */
3540 rnp = rnp->parent; 3515 rnp = rnp->parent;
3541 raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ 3516 raw_spin_lock_rcu_node(rnp); /* irqs already disabled */
3542 WARN_ON_ONCE(!(rnp->expmask & mask)); 3517 WARN_ON_ONCE(!(rnp->expmask & mask));
@@ -3571,7 +3546,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
3571 3546
3572 raw_spin_lock_irqsave_rcu_node(rnp, flags); 3547 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3573 if (!(rnp->expmask & mask)) { 3548 if (!(rnp->expmask & mask)) {
3574 raw_spin_unlock_irqrestore(&rnp->lock, flags); 3549 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3575 return; 3550 return;
3576 } 3551 }
3577 rnp->expmask &= ~mask; 3552 rnp->expmask &= ~mask;
@@ -3732,7 +3707,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
3732 */ 3707 */
3733 if (rcu_preempt_has_tasks(rnp)) 3708 if (rcu_preempt_has_tasks(rnp))
3734 rnp->exp_tasks = rnp->blkd_tasks.next; 3709 rnp->exp_tasks = rnp->blkd_tasks.next;
3735 raw_spin_unlock_irqrestore(&rnp->lock, flags); 3710 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3736 3711
3737 /* IPI the remaining CPUs for expedited quiescent state. */ 3712 /* IPI the remaining CPUs for expedited quiescent state. */
3738 mask = 1; 3713 mask = 1;
@@ -3749,7 +3724,7 @@ retry_ipi:
3749 raw_spin_lock_irqsave_rcu_node(rnp, flags); 3724 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3750 if (cpu_online(cpu) && 3725 if (cpu_online(cpu) &&
3751 (rnp->expmask & mask)) { 3726 (rnp->expmask & mask)) {
3752 raw_spin_unlock_irqrestore(&rnp->lock, flags); 3727 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3753 schedule_timeout_uninterruptible(1); 3728 schedule_timeout_uninterruptible(1);
3754 if (cpu_online(cpu) && 3729 if (cpu_online(cpu) &&
3755 (rnp->expmask & mask)) 3730 (rnp->expmask & mask))
@@ -3758,7 +3733,7 @@ retry_ipi:
3758 } 3733 }
3759 if (!(rnp->expmask & mask)) 3734 if (!(rnp->expmask & mask))
3760 mask_ofl_ipi &= ~mask; 3735 mask_ofl_ipi &= ~mask;
3761 raw_spin_unlock_irqrestore(&rnp->lock, flags); 3736 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3762 } 3737 }
3763 /* Report quiescent states for those that went offline. */ 3738 /* Report quiescent states for those that went offline. */
3764 mask_ofl_test |= mask_ofl_ipi; 3739 mask_ofl_test |= mask_ofl_ipi;
@@ -4165,7 +4140,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
4165 return; 4140 return;
4166 raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */ 4141 raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */
4167 rnp->qsmaskinit |= mask; 4142 rnp->qsmaskinit |= mask;
4168 raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ 4143 raw_spin_unlock_rcu_node(rnp); /* Interrupts remain disabled. */
4169 } 4144 }
4170} 4145}
4171 4146
@@ -4189,7 +4164,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
4189 rdp->rsp = rsp; 4164 rdp->rsp = rsp;
4190 mutex_init(&rdp->exp_funnel_mutex); 4165 mutex_init(&rdp->exp_funnel_mutex);
4191 rcu_boot_init_nocb_percpu_data(rdp); 4166 rcu_boot_init_nocb_percpu_data(rdp);
4192 raw_spin_unlock_irqrestore(&rnp->lock, flags); 4167 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
4193} 4168}
4194 4169
4195/* 4170/*
@@ -4217,7 +4192,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
4217 rcu_sysidle_init_percpu_data(rdp->dynticks); 4192 rcu_sysidle_init_percpu_data(rdp->dynticks);
4218 atomic_set(&rdp->dynticks->dynticks, 4193 atomic_set(&rdp->dynticks->dynticks,
4219 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); 4194 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
4220 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 4195 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
4221 4196
4222 /* 4197 /*
4223 * Add CPU to leaf rcu_node pending-online bitmask. Any needed 4198 * Add CPU to leaf rcu_node pending-online bitmask. Any needed
@@ -4238,7 +4213,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
4238 rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); 4213 rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu);
4239 rdp->core_needs_qs = false; 4214 rdp->core_needs_qs = false;
4240 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); 4215 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
4241 raw_spin_unlock_irqrestore(&rnp->lock, flags); 4216 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
4242} 4217}
4243 4218
4244static void rcu_prepare_cpu(int cpu) 4219static void rcu_prepare_cpu(int cpu)
@@ -4249,6 +4224,46 @@ static void rcu_prepare_cpu(int cpu)
4249 rcu_init_percpu_data(cpu, rsp); 4224 rcu_init_percpu_data(cpu, rsp);
4250} 4225}
4251 4226
4227#ifdef CONFIG_HOTPLUG_CPU
4228/*
4229 * The CPU is exiting the idle loop into the arch_cpu_idle_dead()
4230 * function. We now remove it from the rcu_node tree's ->qsmaskinit
4231 * bit masks.
4235 */
4236static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
4237{
4238 unsigned long flags;
4239 unsigned long mask;
4240 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
4241 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
4242
4243 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
4244 return;
4245
4246 /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
4247 mask = rdp->grpmask;
4248 raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
4249 rnp->qsmaskinitnext &= ~mask;
4250 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
4251}
4252
4253void rcu_report_dead(unsigned int cpu)
4254{
4255 struct rcu_state *rsp;
4256
4257 /* QS for any half-done expedited RCU-sched GP. */
4258 preempt_disable();
4259 rcu_report_exp_rdp(&rcu_sched_state,
4260 this_cpu_ptr(rcu_sched_state.rda), true);
4261 preempt_enable();
4262 for_each_rcu_flavor(rsp)
4263 rcu_cleanup_dying_idle_cpu(cpu, rsp);
4264}
4265#endif
4266
4252/* 4267/*
4253 * Handle CPU online/offline notification events. 4268 * Handle CPU online/offline notification events.
4254 */ 4269 */
@@ -4280,17 +4295,6 @@ int rcu_cpu_notify(struct notifier_block *self,
4280 for_each_rcu_flavor(rsp) 4295 for_each_rcu_flavor(rsp)
4281 rcu_cleanup_dying_cpu(rsp); 4296 rcu_cleanup_dying_cpu(rsp);
4282 break; 4297 break;
4283 case CPU_DYING_IDLE:
4284 /* QS for any half-done expedited RCU-sched GP. */
4285 preempt_disable();
4286 rcu_report_exp_rdp(&rcu_sched_state,
4287 this_cpu_ptr(rcu_sched_state.rda), true);
4288 preempt_enable();
4289
4290 for_each_rcu_flavor(rsp) {
4291 rcu_cleanup_dying_idle_cpu(cpu, rsp);
4292 }
4293 break;
4294 case CPU_DEAD: 4298 case CPU_DEAD:
4295 case CPU_DEAD_FROZEN: 4299 case CPU_DEAD_FROZEN:
4296 case CPU_UP_CANCELED: 4300 case CPU_UP_CANCELED:
@@ -4360,7 +4364,7 @@ static int __init rcu_spawn_gp_kthread(void)
4360 sp.sched_priority = kthread_prio; 4364 sp.sched_priority = kthread_prio;
4361 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); 4365 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
4362 } 4366 }
4363 raw_spin_unlock_irqrestore(&rnp->lock, flags); 4367 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
4364 wake_up_process(t); 4368 wake_up_process(t);
4365 } 4369 }
4366 rcu_spawn_nocb_kthreads(); 4370 rcu_spawn_nocb_kthreads();
@@ -4451,8 +4455,8 @@ static void __init rcu_init_one(struct rcu_state *rsp)
4451 cpustride *= levelspread[i]; 4455 cpustride *= levelspread[i];
4452 rnp = rsp->level[i]; 4456 rnp = rsp->level[i];
4453 for (j = 0; j < levelcnt[i]; j++, rnp++) { 4457 for (j = 0; j < levelcnt[i]; j++, rnp++) {
4454 raw_spin_lock_init(&rnp->lock); 4458 raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock));
4455 lockdep_set_class_and_name(&rnp->lock, 4459 lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock),
4456 &rcu_node_class[i], buf[i]); 4460 &rcu_node_class[i], buf[i]);
4457 raw_spin_lock_init(&rnp->fqslock); 4461 raw_spin_lock_init(&rnp->fqslock);
4458 lockdep_set_class_and_name(&rnp->fqslock, 4462 lockdep_set_class_and_name(&rnp->fqslock,
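
The new rcu_report_dead() above replaces the CPU_DYING_IDLE notifier case removed later in this hunk. A minimal sketch of the expected call site on the dying CPU's idle path, assuming the declaration of rcu_report_dead() is visible to the caller; the real caller lives in the CPU-hotplug core, which this diff only reaches indirectly via cpuhp_report_idle_dead():

#include <linux/cpu.h>
#include <linux/smp.h>

/*
 * Sketch only: runs on the outgoing CPU, with interrupts disabled,
 * just before the CPU parks itself for good.
 */
static void sketch_report_idle_dead(void)
{
	rcu_report_dead(smp_processor_id());	/* RCU stops tracking this CPU */
	arch_cpu_idle_dead();			/* hand the CPU to architecture code */
}
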
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index bbd235d0e71f..df668c0f9e64 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -150,8 +150,9 @@ struct rcu_dynticks {
150 * Definition for node within the RCU grace-period-detection hierarchy. 150 * Definition for node within the RCU grace-period-detection hierarchy.
151 */ 151 */
152struct rcu_node { 152struct rcu_node {
153 raw_spinlock_t lock; /* Root rcu_node's lock protects some */ 153 raw_spinlock_t __private lock; /* Root rcu_node's lock protects */
154 /* rcu_state fields as well as following. */ 154 /* some rcu_state fields as well as */
155 /* following. */
155 unsigned long gpnum; /* Current grace period for this node. */ 156 unsigned long gpnum; /* Current grace period for this node. */
156 /* This will either be equal to or one */ 157 /* This will either be equal to or one */
157 /* behind the root rcu_node's gpnum. */ 158 /* behind the root rcu_node's gpnum. */
@@ -682,7 +683,7 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
682#endif /* #else #ifdef CONFIG_PPC */ 683#endif /* #else #ifdef CONFIG_PPC */
683 684
684/* 685/*
685 * Wrappers for the rcu_node::lock acquire. 686 * Wrappers for the rcu_node::lock acquire and release.
686 * 687 *
687 * Because the rcu_nodes form a tree, the tree traversal locking will observe 688 * Because the rcu_nodes form a tree, the tree traversal locking will observe
688 * different lock values, this in turn means that an UNLOCK of one level 689 * different lock values, this in turn means that an UNLOCK of one level
@@ -691,29 +692,48 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
691 * 692 *
692 * In order to restore full ordering between tree levels, augment the regular 693 * In order to restore full ordering between tree levels, augment the regular
693 * lock acquire functions with smp_mb__after_unlock_lock(). 694 * lock acquire functions with smp_mb__after_unlock_lock().
695 *
 696 * Because ->lock of struct rcu_node is a __private field, one should use these 697 * wrappers rather than calling raw_spin_{lock,unlock}* on ->lock directly.
694 */ 698 */
695static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp) 699static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp)
696{ 700{
697 raw_spin_lock(&rnp->lock); 701 raw_spin_lock(&ACCESS_PRIVATE(rnp, lock));
698 smp_mb__after_unlock_lock(); 702 smp_mb__after_unlock_lock();
699} 703}
700 704
705static inline void raw_spin_unlock_rcu_node(struct rcu_node *rnp)
706{
707 raw_spin_unlock(&ACCESS_PRIVATE(rnp, lock));
708}
709
701static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp) 710static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp)
702{ 711{
703 raw_spin_lock_irq(&rnp->lock); 712 raw_spin_lock_irq(&ACCESS_PRIVATE(rnp, lock));
704 smp_mb__after_unlock_lock(); 713 smp_mb__after_unlock_lock();
705} 714}
706 715
707#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \ 716static inline void raw_spin_unlock_irq_rcu_node(struct rcu_node *rnp)
708do { \ 717{
709 typecheck(unsigned long, flags); \ 718 raw_spin_unlock_irq(&ACCESS_PRIVATE(rnp, lock));
710 raw_spin_lock_irqsave(&(rnp)->lock, flags); \ 719}
711 smp_mb__after_unlock_lock(); \ 720
721#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \
722do { \
723 typecheck(unsigned long, flags); \
724 raw_spin_lock_irqsave(&ACCESS_PRIVATE(rnp, lock), flags); \
725 smp_mb__after_unlock_lock(); \
726} while (0)
727
728#define raw_spin_unlock_irqrestore_rcu_node(rnp, flags) \
729do { \
730 typecheck(unsigned long, flags); \
731 raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(rnp, lock), flags); \
712} while (0) 732} while (0)
713 733
714static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp) 734static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp)
715{ 735{
716 bool locked = raw_spin_trylock(&rnp->lock); 736 bool locked = raw_spin_trylock(&ACCESS_PRIVATE(rnp, lock));
717 737
718 if (locked) 738 if (locked)
719 smp_mb__after_unlock_lock(); 739 smp_mb__after_unlock_lock();
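
The wrapper family above exists so that every acquisition of an rcu_node ->lock is paired with smp_mb__after_unlock_lock() and so that the now-__private field is never touched directly. A short sketch of the pattern the rest of this patch converts callers to; the qsmask test is only an example condition:

static void sketch_update_rnp(struct rcu_node *rnp)
{
	unsigned long flags;

	raw_spin_lock_irqsave_rcu_node(rnp, flags); /* lock + smp_mb__after_unlock_lock() */
	if (!rnp->qsmask) {
		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
		return;
	}
	/* ... modify state protected by rnp->lock ... */
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
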
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 080bd202d360..efdf7b61ce12 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -235,7 +235,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
235 rnp->gp_tasks = &t->rcu_node_entry; 235 rnp->gp_tasks = &t->rcu_node_entry;
236 if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) 236 if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
237 rnp->exp_tasks = &t->rcu_node_entry; 237 rnp->exp_tasks = &t->rcu_node_entry;
238 raw_spin_unlock(&rnp->lock); /* rrupts remain disabled. */ 238 raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */
239 239
240 /* 240 /*
241 * Report the quiescent state for the expedited GP. This expedited 241 * Report the quiescent state for the expedited GP. This expedited
@@ -489,7 +489,7 @@ void rcu_read_unlock_special(struct task_struct *t)
489 !!rnp->gp_tasks); 489 !!rnp->gp_tasks);
490 rcu_report_unblock_qs_rnp(rcu_state_p, rnp, flags); 490 rcu_report_unblock_qs_rnp(rcu_state_p, rnp, flags);
491 } else { 491 } else {
492 raw_spin_unlock_irqrestore(&rnp->lock, flags); 492 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
493 } 493 }
494 494
495 /* Unboost if we were boosted. */ 495 /* Unboost if we were boosted. */
@@ -518,14 +518,14 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
518 518
519 raw_spin_lock_irqsave_rcu_node(rnp, flags); 519 raw_spin_lock_irqsave_rcu_node(rnp, flags);
520 if (!rcu_preempt_blocked_readers_cgp(rnp)) { 520 if (!rcu_preempt_blocked_readers_cgp(rnp)) {
521 raw_spin_unlock_irqrestore(&rnp->lock, flags); 521 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
522 return; 522 return;
523 } 523 }
524 t = list_entry(rnp->gp_tasks->prev, 524 t = list_entry(rnp->gp_tasks->prev,
525 struct task_struct, rcu_node_entry); 525 struct task_struct, rcu_node_entry);
526 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) 526 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
527 sched_show_task(t); 527 sched_show_task(t);
528 raw_spin_unlock_irqrestore(&rnp->lock, flags); 528 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
529} 529}
530 530
531/* 531/*
@@ -807,7 +807,6 @@ void exit_rcu(void)
807#else /* #ifdef CONFIG_PREEMPT_RCU */ 807#else /* #ifdef CONFIG_PREEMPT_RCU */
808 808
809static struct rcu_state *const rcu_state_p = &rcu_sched_state; 809static struct rcu_state *const rcu_state_p = &rcu_sched_state;
810static struct rcu_data __percpu *const rcu_data_p = &rcu_sched_data;
811 810
812/* 811/*
813 * Tell them what RCU they are running. 812 * Tell them what RCU they are running.
@@ -991,7 +990,7 @@ static int rcu_boost(struct rcu_node *rnp)
991 * might exit their RCU read-side critical sections on their own. 990 * might exit their RCU read-side critical sections on their own.
992 */ 991 */
993 if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) { 992 if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
994 raw_spin_unlock_irqrestore(&rnp->lock, flags); 993 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
995 return 0; 994 return 0;
996 } 995 }
997 996
@@ -1028,7 +1027,7 @@ static int rcu_boost(struct rcu_node *rnp)
1028 */ 1027 */
1029 t = container_of(tb, struct task_struct, rcu_node_entry); 1028 t = container_of(tb, struct task_struct, rcu_node_entry);
1030 rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); 1029 rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
1031 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1030 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1032 /* Lock only for side effect: boosts task t's priority. */ 1031 /* Lock only for side effect: boosts task t's priority. */
1033 rt_mutex_lock(&rnp->boost_mtx); 1032 rt_mutex_lock(&rnp->boost_mtx);
1034 rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */ 1033 rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */
@@ -1088,7 +1087,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1088 1087
1089 if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { 1088 if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
1090 rnp->n_balk_exp_gp_tasks++; 1089 rnp->n_balk_exp_gp_tasks++;
1091 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1090 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1092 return; 1091 return;
1093 } 1092 }
1094 if (rnp->exp_tasks != NULL || 1093 if (rnp->exp_tasks != NULL ||
@@ -1098,13 +1097,13 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1098 ULONG_CMP_GE(jiffies, rnp->boost_time))) { 1097 ULONG_CMP_GE(jiffies, rnp->boost_time))) {
1099 if (rnp->exp_tasks == NULL) 1098 if (rnp->exp_tasks == NULL)
1100 rnp->boost_tasks = rnp->gp_tasks; 1099 rnp->boost_tasks = rnp->gp_tasks;
1101 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1100 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1102 t = rnp->boost_kthread_task; 1101 t = rnp->boost_kthread_task;
1103 if (t) 1102 if (t)
1104 rcu_wake_cond(t, rnp->boost_kthread_status); 1103 rcu_wake_cond(t, rnp->boost_kthread_status);
1105 } else { 1104 } else {
1106 rcu_initiate_boost_trace(rnp); 1105 rcu_initiate_boost_trace(rnp);
1107 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1106 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1108 } 1107 }
1109} 1108}
1110 1109
@@ -1172,7 +1171,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1172 return PTR_ERR(t); 1171 return PTR_ERR(t);
1173 raw_spin_lock_irqsave_rcu_node(rnp, flags); 1172 raw_spin_lock_irqsave_rcu_node(rnp, flags);
1174 rnp->boost_kthread_task = t; 1173 rnp->boost_kthread_task = t;
1175 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1174 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1176 sp.sched_priority = kthread_prio; 1175 sp.sched_priority = kthread_prio;
1177 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); 1176 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1178 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ 1177 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
@@ -1308,7 +1307,7 @@ static void rcu_prepare_kthreads(int cpu)
1308static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) 1307static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1309 __releases(rnp->lock) 1308 __releases(rnp->lock)
1310{ 1309{
1311 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1310 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1312} 1311}
1313 1312
1314static void invoke_rcu_callbacks_kthread(void) 1313static void invoke_rcu_callbacks_kthread(void)
@@ -1559,7 +1558,7 @@ static void rcu_prepare_for_idle(void)
1559 rnp = rdp->mynode; 1558 rnp = rdp->mynode;
1560 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ 1559 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
1561 needwake = rcu_accelerate_cbs(rsp, rnp, rdp); 1560 needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
1562 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1561 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
1563 if (needwake) 1562 if (needwake)
1564 rcu_gp_kthread_wake(rsp); 1563 rcu_gp_kthread_wake(rsp);
1565 } 1564 }
@@ -2064,7 +2063,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2064 2063
2065 raw_spin_lock_irqsave_rcu_node(rnp, flags); 2064 raw_spin_lock_irqsave_rcu_node(rnp, flags);
2066 needwake = rcu_start_future_gp(rnp, rdp, &c); 2065 needwake = rcu_start_future_gp(rnp, rdp, &c);
2067 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2066 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2068 if (needwake) 2067 if (needwake)
2069 rcu_gp_kthread_wake(rdp->rsp); 2068 rcu_gp_kthread_wake(rdp->rsp);
2070 2069
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 76b94e19430b..ca828b41c938 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -128,6 +128,7 @@ bool rcu_gp_is_normal(void)
128{ 128{
129 return READ_ONCE(rcu_normal); 129 return READ_ONCE(rcu_normal);
130} 130}
131EXPORT_SYMBOL_GPL(rcu_gp_is_normal);
131 132
132static atomic_t rcu_expedited_nesting = 133static atomic_t rcu_expedited_nesting =
133 ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0); 134 ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0);
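
Exporting rcu_gp_is_normal() lets modular code check whether expedited grace periods are currently suppressed. A hedged sketch of such a check; the message text is made up:

#include <linux/kernel.h>
#include <linux/rcupdate.h>

static void sketch_report_gp_mode(void)
{
	if (rcu_gp_is_normal())
		pr_info("expedited RCU grace periods are disabled\n");
}
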
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e5725b931bee..ea8f49ae0062 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5434,16 +5434,6 @@ static int sched_cpu_active(struct notifier_block *nfb,
5434 set_cpu_rq_start_time(); 5434 set_cpu_rq_start_time();
5435 return NOTIFY_OK; 5435 return NOTIFY_OK;
5436 5436
5437 case CPU_ONLINE:
5438 /*
5439 * At this point a starting CPU has marked itself as online via
5440 * set_cpu_online(). But it might not yet have marked itself
5441 * as active, which is essential from here on.
5442 */
5443 set_cpu_active(cpu, true);
5444 stop_machine_unpark(cpu);
5445 return NOTIFY_OK;
5446
5447 case CPU_DOWN_FAILED: 5437 case CPU_DOWN_FAILED:
5448 set_cpu_active(cpu, true); 5438 set_cpu_active(cpu, true);
5449 return NOTIFY_OK; 5439 return NOTIFY_OK;
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 544a7133cbd1..bd12c6c714ec 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -4,6 +4,7 @@
4#include <linux/sched.h> 4#include <linux/sched.h>
5#include <linux/cpu.h> 5#include <linux/cpu.h>
6#include <linux/cpuidle.h> 6#include <linux/cpuidle.h>
7#include <linux/cpuhotplug.h>
7#include <linux/tick.h> 8#include <linux/tick.h>
8#include <linux/mm.h> 9#include <linux/mm.h>
9#include <linux/stackprotector.h> 10#include <linux/stackprotector.h>
@@ -193,8 +194,6 @@ exit_idle:
193 rcu_idle_exit(); 194 rcu_idle_exit();
194} 195}
195 196
196DEFINE_PER_CPU(bool, cpu_dead_idle);
197
198/* 197/*
199 * Generic idle loop implementation 198 * Generic idle loop implementation
200 * 199 *
@@ -221,10 +220,7 @@ static void cpu_idle_loop(void)
221 rmb(); 220 rmb();
222 221
223 if (cpu_is_offline(smp_processor_id())) { 222 if (cpu_is_offline(smp_processor_id())) {
224 rcu_cpu_notify(NULL, CPU_DYING_IDLE, 223 cpuhp_report_idle_dead();
225 (void *)(long)smp_processor_id());
226 smp_mb(); /* all activity before dead. */
227 this_cpu_write(cpu_dead_idle, true);
228 arch_cpu_idle_dead(); 224 arch_cpu_idle_dead();
229 } 225 }
230 226
@@ -291,5 +287,6 @@ void cpu_startup_entry(enum cpuhp_state state)
291 boot_init_stack_canary(); 287 boot_init_stack_canary();
292#endif 288#endif
293 arch_cpu_idle_prepare(); 289 arch_cpu_idle_prepare();
290 cpuhp_online_idle(state);
294 cpu_idle_loop(); 291 cpu_idle_loop();
295} 292}
diff --git a/kernel/smp.c b/kernel/smp.c
index 300d29391e07..74165443c240 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -568,6 +568,7 @@ void __init smp_init(void)
568 unsigned int cpu; 568 unsigned int cpu;
569 569
570 idle_threads_init(); 570 idle_threads_init();
571 cpuhp_threads_init();
571 572
572 /* FIXME: This should be done in userspace --RR */ 573 /* FIXME: This should be done in userspace --RR */
573 for_each_present_cpu(cpu) { 574 for_each_present_cpu(cpu) {
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index d264f59bff56..13bc43d1fb22 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -226,7 +226,7 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp
226 kthread_unpark(tsk); 226 kthread_unpark(tsk);
227} 227}
228 228
229void smpboot_unpark_threads(unsigned int cpu) 229int smpboot_unpark_threads(unsigned int cpu)
230{ 230{
231 struct smp_hotplug_thread *cur; 231 struct smp_hotplug_thread *cur;
232 232
@@ -235,6 +235,7 @@ void smpboot_unpark_threads(unsigned int cpu)
235 if (cpumask_test_cpu(cpu, cur->cpumask)) 235 if (cpumask_test_cpu(cpu, cur->cpumask))
236 smpboot_unpark_thread(cur, cpu); 236 smpboot_unpark_thread(cur, cpu);
237 mutex_unlock(&smpboot_threads_lock); 237 mutex_unlock(&smpboot_threads_lock);
238 return 0;
238} 239}
239 240
240static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu) 241static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
@@ -245,7 +246,7 @@ static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
245 kthread_park(tsk); 246 kthread_park(tsk);
246} 247}
247 248
248void smpboot_park_threads(unsigned int cpu) 249int smpboot_park_threads(unsigned int cpu)
249{ 250{
250 struct smp_hotplug_thread *cur; 251 struct smp_hotplug_thread *cur;
251 252
@@ -253,6 +254,7 @@ void smpboot_park_threads(unsigned int cpu)
253 list_for_each_entry_reverse(cur, &hotplug_threads, list) 254 list_for_each_entry_reverse(cur, &hotplug_threads, list)
254 smpboot_park_thread(cur, cpu); 255 smpboot_park_thread(cur, cpu);
255 mutex_unlock(&smpboot_threads_lock); 256 mutex_unlock(&smpboot_threads_lock);
257 return 0;
256} 258}
257 259
258static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) 260static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
diff --git a/kernel/smpboot.h b/kernel/smpboot.h
index 72415a0eb955..485b81cfab34 100644
--- a/kernel/smpboot.h
+++ b/kernel/smpboot.h
@@ -14,7 +14,9 @@ static inline void idle_threads_init(void) { }
14#endif 14#endif
15 15
16int smpboot_create_threads(unsigned int cpu); 16int smpboot_create_threads(unsigned int cpu);
17void smpboot_park_threads(unsigned int cpu); 17int smpboot_park_threads(unsigned int cpu);
18void smpboot_unpark_threads(unsigned int cpu); 18int smpboot_unpark_threads(unsigned int cpu);
19
20void __init cpuhp_threads_init(void);
19 21
20#endif 22#endif
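
smpboot_park_threads() and smpboot_unpark_threads() now return int, matching the calling convention of the hotplug state machine this series introduces. A sketch of the direct wiring this enables; the struct below is a stand-in for illustration, not the real cpuhp step table:

#include "smpboot.h"	/* kernel-internal header shown above */

struct hotplug_step_sketch {
	int (*startup)(unsigned int cpu);
	int (*teardown)(unsigned int cpu);
};

static const struct hotplug_step_sketch smpboot_threads_step = {
	.startup	= smpboot_unpark_threads,
	.teardown	= smpboot_park_threads,
};
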
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 664de539299b..56ece145a814 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -323,13 +323,42 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
323 /* cs is a watchdog. */ 323 /* cs is a watchdog. */
324 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) 324 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
325 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; 325 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
326 }
327 spin_unlock_irqrestore(&watchdog_lock, flags);
328}
329
330static void clocksource_select_watchdog(bool fallback)
331{
332 struct clocksource *cs, *old_wd;
333 unsigned long flags;
334
335 spin_lock_irqsave(&watchdog_lock, flags);
336 /* save current watchdog */
337 old_wd = watchdog;
338 if (fallback)
339 watchdog = NULL;
340
341 list_for_each_entry(cs, &clocksource_list, list) {
342 /* cs is a clocksource to be watched. */
343 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY)
344 continue;
345
346 /* Skip current if we were requested for a fallback. */
347 if (fallback && cs == old_wd)
348 continue;
349
326 /* Pick the best watchdog. */ 350 /* Pick the best watchdog. */
327 if (!watchdog || cs->rating > watchdog->rating) { 351 if (!watchdog || cs->rating > watchdog->rating)
328 watchdog = cs; 352 watchdog = cs;
329 /* Reset watchdog cycles */
330 clocksource_reset_watchdog();
331 }
332 } 353 }
354 /* If we failed to find a fallback restore the old one. */
355 if (!watchdog)
356 watchdog = old_wd;
357
358 /* If we changed the watchdog we need to reset cycles. */
359 if (watchdog != old_wd)
360 clocksource_reset_watchdog();
361
333 /* Check if the watchdog timer needs to be started. */ 362 /* Check if the watchdog timer needs to be started. */
334 clocksource_start_watchdog(); 363 clocksource_start_watchdog();
335 spin_unlock_irqrestore(&watchdog_lock, flags); 364 spin_unlock_irqrestore(&watchdog_lock, flags);
@@ -404,6 +433,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
404 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; 433 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
405} 434}
406 435
436static void clocksource_select_watchdog(bool fallback) { }
407static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } 437static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
408static inline void clocksource_resume_watchdog(void) { } 438static inline void clocksource_resume_watchdog(void) { }
409static inline int __clocksource_watchdog_kthread(void) { return 0; } 439static inline int __clocksource_watchdog_kthread(void) { return 0; }
@@ -736,6 +766,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
736 clocksource_enqueue(cs); 766 clocksource_enqueue(cs);
737 clocksource_enqueue_watchdog(cs); 767 clocksource_enqueue_watchdog(cs);
738 clocksource_select(); 768 clocksource_select();
769 clocksource_select_watchdog(false);
739 mutex_unlock(&clocksource_mutex); 770 mutex_unlock(&clocksource_mutex);
740 return 0; 771 return 0;
741} 772}
@@ -758,6 +789,7 @@ void clocksource_change_rating(struct clocksource *cs, int rating)
758 mutex_lock(&clocksource_mutex); 789 mutex_lock(&clocksource_mutex);
759 __clocksource_change_rating(cs, rating); 790 __clocksource_change_rating(cs, rating);
760 clocksource_select(); 791 clocksource_select();
792 clocksource_select_watchdog(false);
761 mutex_unlock(&clocksource_mutex); 793 mutex_unlock(&clocksource_mutex);
762} 794}
763EXPORT_SYMBOL(clocksource_change_rating); 795EXPORT_SYMBOL(clocksource_change_rating);
@@ -767,12 +799,12 @@ EXPORT_SYMBOL(clocksource_change_rating);
767 */ 799 */
768static int clocksource_unbind(struct clocksource *cs) 800static int clocksource_unbind(struct clocksource *cs)
769{ 801{
770 /* 802 if (clocksource_is_watchdog(cs)) {
771 * I really can't convince myself to support this on hardware 803 /* Select and try to install a replacement watchdog. */
772 * designed by lobotomized monkeys. 804 clocksource_select_watchdog(true);
773 */ 805 if (clocksource_is_watchdog(cs))
774 if (clocksource_is_watchdog(cs)) 806 return -EBUSY;
775 return -EBUSY; 807 }
776 808
777 if (cs == curr_clocksource) { 809 if (cs == curr_clocksource) {
778 /* Select and try to install a replacement clock source */ 810 /* Select and try to install a replacement clock source */
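
With clocksource_select_watchdog() in place, the watchdog is re-evaluated whenever a clocksource is registered, re-rated, or unbound, instead of being frozen at the first candidate. A sketch of a registration that participates in that selection; the hardware read and the 32768 Hz rate are placeholders:

#include <linux/clocksource.h>
#include <linux/init.h>

static cycle_t sketch_cs_read(struct clocksource *cs)
{
	return 0;	/* would read a free-running hardware counter */
}

static struct clocksource sketch_cs = {
	.name	= "sketch",
	.rating	= 300,
	.read	= sketch_cs_read,
	.mask	= CLOCKSOURCE_MASK(32),
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,	/* no MUST_VERIFY: may become the watchdog */
};

static int __init sketch_cs_init(void)
{
	/* Registration runs clocksource_select_watchdog(false) under clocksource_mutex. */
	return clocksource_register_hz(&sketch_cs, 32768);
}
device_initcall(sketch_cs_init);
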
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 347fecf86a3f..555e21f7b966 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -68,7 +68,7 @@ static struct clocksource clocksource_jiffies = {
68 .name = "jiffies", 68 .name = "jiffies",
69 .rating = 1, /* lowest valid rating*/ 69 .rating = 1, /* lowest valid rating*/
70 .read = jiffies_read, 70 .read = jiffies_read,
71 .mask = 0xffffffff, /*32bits*/ 71 .mask = CLOCKSOURCE_MASK(32),
72 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ 72 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
73 .shift = JIFFIES_SHIFT, 73 .shift = JIFFIES_SHIFT,
74 .max_cycles = 10, 74 .max_cycles = 10,
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 34b4cedfa80d..9c629bbed572 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -233,6 +233,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
233 u64 tmp, ntpinterval; 233 u64 tmp, ntpinterval;
234 struct clocksource *old_clock; 234 struct clocksource *old_clock;
235 235
236 ++tk->cs_was_changed_seq;
236 old_clock = tk->tkr_mono.clock; 237 old_clock = tk->tkr_mono.clock;
237 tk->tkr_mono.clock = clock; 238 tk->tkr_mono.clock = clock;
238 tk->tkr_mono.read = clock->read; 239 tk->tkr_mono.read = clock->read;
@@ -298,17 +299,34 @@ u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset;
298static inline u32 arch_gettimeoffset(void) { return 0; } 299static inline u32 arch_gettimeoffset(void) { return 0; }
299#endif 300#endif
300 301
302static inline s64 timekeeping_delta_to_ns(struct tk_read_base *tkr,
303 cycle_t delta)
304{
305 s64 nsec;
306
307 nsec = delta * tkr->mult + tkr->xtime_nsec;
308 nsec >>= tkr->shift;
309
310 /* If arch requires, add in get_arch_timeoffset() */
311 return nsec + arch_gettimeoffset();
312}
313
301static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) 314static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
302{ 315{
303 cycle_t delta; 316 cycle_t delta;
304 s64 nsec;
305 317
306 delta = timekeeping_get_delta(tkr); 318 delta = timekeeping_get_delta(tkr);
319 return timekeeping_delta_to_ns(tkr, delta);
320}
307 321
308 nsec = (delta * tkr->mult + tkr->xtime_nsec) >> tkr->shift; 322static inline s64 timekeeping_cycles_to_ns(struct tk_read_base *tkr,
323 cycle_t cycles)
324{
325 cycle_t delta;
309 326
310 /* If arch requires, add in get_arch_timeoffset() */ 327 /* calculate the delta since the last update_wall_time */
311 return nsec + arch_gettimeoffset(); 328 delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask);
329 return timekeeping_delta_to_ns(tkr, delta);
312} 330}
313 331
314/** 332/**
@@ -857,44 +875,262 @@ time64_t __ktime_get_real_seconds(void)
857 return tk->xtime_sec; 875 return tk->xtime_sec;
858} 876}
859 877
878/**
879 * ktime_get_snapshot - snapshots the realtime/monotonic raw clocks with counter
880 * @systime_snapshot: pointer to struct receiving the system time snapshot
881 */
882void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
883{
884 struct timekeeper *tk = &tk_core.timekeeper;
885 unsigned long seq;
886 ktime_t base_raw;
887 ktime_t base_real;
888 s64 nsec_raw;
889 s64 nsec_real;
890 cycle_t now;
860 891
861#ifdef CONFIG_NTP_PPS 892 WARN_ON_ONCE(timekeeping_suspended);
893
894 do {
895 seq = read_seqcount_begin(&tk_core.seq);
896
897 now = tk->tkr_mono.read(tk->tkr_mono.clock);
898 systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq;
899 systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq;
900 base_real = ktime_add(tk->tkr_mono.base,
901 tk_core.timekeeper.offs_real);
902 base_raw = tk->tkr_raw.base;
903 nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now);
904 nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now);
905 } while (read_seqcount_retry(&tk_core.seq, seq));
906
907 systime_snapshot->cycles = now;
908 systime_snapshot->real = ktime_add_ns(base_real, nsec_real);
909 systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw);
910}
911EXPORT_SYMBOL_GPL(ktime_get_snapshot);
912
913/* Scale base by mult/div checking for overflow */
914static int scale64_check_overflow(u64 mult, u64 div, u64 *base)
915{
916 u64 tmp, rem;
917
918 tmp = div64_u64_rem(*base, div, &rem);
919
920 if (((int)sizeof(u64)*8 - fls64(mult) < fls64(tmp)) ||
921 ((int)sizeof(u64)*8 - fls64(mult) < fls64(rem)))
922 return -EOVERFLOW;
923 tmp *= mult;
924 rem *= mult;
925
926 do_div(rem, div);
927 *base = tmp + rem;
928 return 0;
929}
862 930
863/** 931/**
864 * ktime_get_raw_and_real_ts64 - get day and raw monotonic time in timespec format 932 * adjust_historical_crosststamp - adjust crosstimestamp previous to current interval
865 * @ts_raw: pointer to the timespec to be set to raw monotonic time 933 * @history: Snapshot representing start of history
866 * @ts_real: pointer to the timespec to be set to the time of day 934 * @partial_history_cycles: Cycle offset into history (fractional part)
935 * @total_history_cycles: Total history length in cycles
936 * @discontinuity: True indicates clock was set on history period
937 * @ts: Cross timestamp that should be adjusted using
938 * partial/total ratio
867 * 939 *
868 * This function reads both the time of day and raw monotonic time at the 940 * Helper function used by get_device_system_crosststamp() to correct the
869 * same time atomically and stores the resulting timestamps in timespec 941 * crosstimestamp corresponding to the start of the current interval to the
870 * format. 942 * system counter value (timestamp point) provided by the driver. The
943 * total_history_* quantities are the total history starting at the provided
944 * reference point and ending at the start of the current interval. The cycle
945 * count between the driver timestamp point and the start of the current
946 * interval is partial_history_cycles.
871 */ 947 */
872void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, struct timespec64 *ts_real) 948static int adjust_historical_crosststamp(struct system_time_snapshot *history,
949 cycle_t partial_history_cycles,
950 cycle_t total_history_cycles,
951 bool discontinuity,
952 struct system_device_crosststamp *ts)
873{ 953{
874 struct timekeeper *tk = &tk_core.timekeeper; 954 struct timekeeper *tk = &tk_core.timekeeper;
875 unsigned long seq; 955 u64 corr_raw, corr_real;
876 s64 nsecs_raw, nsecs_real; 956 bool interp_forward;
957 int ret;
877 958
878 WARN_ON_ONCE(timekeeping_suspended); 959 if (total_history_cycles == 0 || partial_history_cycles == 0)
960 return 0;
961
962 /* Interpolate shortest distance from beginning or end of history */
963 interp_forward = partial_history_cycles > total_history_cycles/2 ?
964 true : false;
965 partial_history_cycles = interp_forward ?
966 total_history_cycles - partial_history_cycles :
967 partial_history_cycles;
968
969 /*
970 * Scale the monotonic raw time delta by:
971 * partial_history_cycles / total_history_cycles
972 */
973 corr_raw = (u64)ktime_to_ns(
974 ktime_sub(ts->sys_monoraw, history->raw));
975 ret = scale64_check_overflow(partial_history_cycles,
976 total_history_cycles, &corr_raw);
977 if (ret)
978 return ret;
979
980 /*
981 * If there is a discontinuity in the history, scale monotonic raw
982 * correction by:
983 * mult(real)/mult(raw) yielding the realtime correction
984 * Otherwise, calculate the realtime correction similar to monotonic
985 * raw calculation
986 */
987 if (discontinuity) {
988 corr_real = mul_u64_u32_div
989 (corr_raw, tk->tkr_mono.mult, tk->tkr_raw.mult);
990 } else {
991 corr_real = (u64)ktime_to_ns(
992 ktime_sub(ts->sys_realtime, history->real));
993 ret = scale64_check_overflow(partial_history_cycles,
994 total_history_cycles, &corr_real);
995 if (ret)
996 return ret;
997 }
998
 999 /* Fixup monotonic raw and realtime values */
1000 if (interp_forward) {
1001 ts->sys_monoraw = ktime_add_ns(history->raw, corr_raw);
1002 ts->sys_realtime = ktime_add_ns(history->real, corr_real);
1003 } else {
1004 ts->sys_monoraw = ktime_sub_ns(ts->sys_monoraw, corr_raw);
1005 ts->sys_realtime = ktime_sub_ns(ts->sys_realtime, corr_real);
1006 }
1007
1008 return 0;
1009}
1010
1011/*
1012 * cycle_between - true if test occurs chronologically between before and after
1013 */
1014static bool cycle_between(cycle_t before, cycle_t test, cycle_t after)
1015{
1016 if (test > before && test < after)
1017 return true;
1018 if (test < before && before > after)
1019 return true;
1020 return false;
1021}
1022
1023/**
1024 * get_device_system_crosststamp - Synchronously capture system/device timestamp
1025 * @get_time_fn: Callback to get simultaneous device time and
1026 * system counter from the device driver
1027 * @ctx: Context passed to get_time_fn()
1028 * @history_begin: Historical reference point used to interpolate system
1029 * time when counter provided by the driver is before the current interval
1030 * @xtstamp: Receives simultaneously captured system and device time
1031 *
1032 * Reads a timestamp from a device and correlates it to system time
1033 */
1034int get_device_system_crosststamp(int (*get_time_fn)
1035 (ktime_t *device_time,
1036 struct system_counterval_t *sys_counterval,
1037 void *ctx),
1038 void *ctx,
1039 struct system_time_snapshot *history_begin,
1040 struct system_device_crosststamp *xtstamp)
1041{
1042 struct system_counterval_t system_counterval;
1043 struct timekeeper *tk = &tk_core.timekeeper;
1044 cycle_t cycles, now, interval_start;
1045 unsigned int clock_was_set_seq = 0;
1046 ktime_t base_real, base_raw;
1047 s64 nsec_real, nsec_raw;
1048 u8 cs_was_changed_seq;
1049 unsigned long seq;
1050 bool do_interp;
1051 int ret;
879 1052
880 do { 1053 do {
881 seq = read_seqcount_begin(&tk_core.seq); 1054 seq = read_seqcount_begin(&tk_core.seq);
1055 /*
1056 * Try to synchronously capture device time and a system
1057 * counter value calling back into the device driver
1058 */
1059 ret = get_time_fn(&xtstamp->device, &system_counterval, ctx);
1060 if (ret)
1061 return ret;
1062
1063 /*
1064 * Verify that the clocksource associated with the captured
1065 * system counter value is the same as the currently installed
1066 * timekeeper clocksource
1067 */
1068 if (tk->tkr_mono.clock != system_counterval.cs)
1069 return -ENODEV;
1070 cycles = system_counterval.cycles;
882 1071
883 *ts_raw = tk->raw_time; 1072 /*
884 ts_real->tv_sec = tk->xtime_sec; 1073 * Check whether the system counter value provided by the
885 ts_real->tv_nsec = 0; 1074 * device driver is on the current timekeeping interval.
1075 */
1076 now = tk->tkr_mono.read(tk->tkr_mono.clock);
1077 interval_start = tk->tkr_mono.cycle_last;
1078 if (!cycle_between(interval_start, cycles, now)) {
1079 clock_was_set_seq = tk->clock_was_set_seq;
1080 cs_was_changed_seq = tk->cs_was_changed_seq;
1081 cycles = interval_start;
1082 do_interp = true;
1083 } else {
1084 do_interp = false;
1085 }
886 1086
887 nsecs_raw = timekeeping_get_ns(&tk->tkr_raw); 1087 base_real = ktime_add(tk->tkr_mono.base,
888 nsecs_real = timekeeping_get_ns(&tk->tkr_mono); 1088 tk_core.timekeeper.offs_real);
1089 base_raw = tk->tkr_raw.base;
889 1090
1091 nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono,
1092 system_counterval.cycles);
1093 nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw,
1094 system_counterval.cycles);
890 } while (read_seqcount_retry(&tk_core.seq, seq)); 1095 } while (read_seqcount_retry(&tk_core.seq, seq));
891 1096
892 timespec64_add_ns(ts_raw, nsecs_raw); 1097 xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real);
893 timespec64_add_ns(ts_real, nsecs_real); 1098 xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw);
894}
895EXPORT_SYMBOL(ktime_get_raw_and_real_ts64);
896 1099
897#endif /* CONFIG_NTP_PPS */ 1100 /*
1101 * Interpolate if necessary, adjusting back from the start of the
1102 * current interval
1103 */
1104 if (do_interp) {
1105 cycle_t partial_history_cycles, total_history_cycles;
1106 bool discontinuity;
1107
1108 /*
1109 * Check that the counter value occurs after the provided
1110 * history reference and that the history doesn't cross a
1111 * clocksource change
1112 */
1113 if (!history_begin ||
1114 !cycle_between(history_begin->cycles,
1115 system_counterval.cycles, cycles) ||
1116 history_begin->cs_was_changed_seq != cs_was_changed_seq)
1117 return -EINVAL;
1118 partial_history_cycles = cycles - system_counterval.cycles;
1119 total_history_cycles = cycles - history_begin->cycles;
1120 discontinuity =
1121 history_begin->clock_was_set_seq != clock_was_set_seq;
1122
1123 ret = adjust_historical_crosststamp(history_begin,
1124 partial_history_cycles,
1125 total_history_cycles,
1126 discontinuity, xtstamp);
1127 if (ret)
1128 return ret;
1129 }
1130
1131 return 0;
1132}
1133EXPORT_SYMBOL_GPL(get_device_system_crosststamp);
898 1134
899/** 1135/**
900 * do_gettimeofday - Returns the time of day in a timeval 1136 * do_gettimeofday - Returns the time of day in a timeval
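
get_device_system_crosststamp() above is the consumer-facing entry point: a driver supplies a callback that latches its own device clock together with a system counter value. A sketch of that callback and call, assuming the usual declarations in <linux/timekeeping.h>; the hardware latch, the zero values and the NULL clocksource are placeholders a real driver must fill in:

#include <linux/ktime.h>
#include <linux/timekeeping.h>

static int sketch_get_time(ktime_t *device_time,
			   struct system_counterval_t *sys_counterval,
			   void *ctx)
{
	/* Atomically latch device time and the system counter in hardware. */
	*device_time = ns_to_ktime(0);		/* placeholder device timestamp */
	sys_counterval->cycles = 0;		/* placeholder latched counter value */
	sys_counterval->cs = NULL;		/* must be the current timekeeping clocksource */
	return 0;
}

static int sketch_crosststamp(struct system_device_crosststamp *xt)
{
	/*
	 * No history snapshot is passed, so the counter value must fall
	 * inside the current timekeeping interval; otherwise -EINVAL.
	 */
	return get_device_system_crosststamp(sketch_get_time, NULL, NULL, xt);
}
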
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 8bfd1aca7a3d..f28f7fad452f 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1442,6 +1442,19 @@ config DEBUG_BLOCK_EXT_DEVT
1442 1442
1443 Say N if you are unsure. 1443 Say N if you are unsure.
1444 1444
1445config CPU_HOTPLUG_STATE_CONTROL
1446 bool "Enable CPU hotplug state control"
1447 depends on DEBUG_KERNEL
1448 depends on HOTPLUG_CPU
1449 default n
1450 help
1451 Allows writing steps between "offline" and "online" to the CPU's
1452 sysfs target file so states can be stepped one at a time. This is a debug
1453 option for now as the hotplug machinery cannot be stopped and
1454 restarted at arbitrary points yet.
1455
1456 Say N if you are unsure.
1457
1445config NOTIFIER_ERROR_INJECTION 1458config NOTIFIER_ERROR_INJECTION
1446 tristate "Notifier error injection" 1459 tristate "Notifier error injection"
1447 depends on DEBUG_KERNEL 1460 depends on DEBUG_KERNEL
diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c
index d62de8bf022d..123481814320 100644
--- a/lib/atomic64_test.c
+++ b/lib/atomic64_test.c
@@ -17,7 +17,7 @@
17#include <linux/atomic.h> 17#include <linux/atomic.h>
18 18
19#ifdef CONFIG_X86 19#ifdef CONFIG_X86
20#include <asm/processor.h> /* for boot_cpu_has below */ 20#include <asm/cpufeature.h> /* for boot_cpu_has below */
21#endif 21#endif
22 22
23#define TEST(bit, op, c_op, val) \ 23#define TEST(bit, op, c_op, val) \
diff --git a/mm/memory.c b/mm/memory.c
index 8132787ae4d5..906d8e3b42c0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1551,8 +1551,29 @@ out:
1551int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, 1551int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1552 unsigned long pfn) 1552 unsigned long pfn)
1553{ 1553{
1554 return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
1555}
1556EXPORT_SYMBOL(vm_insert_pfn);
1557
1558/**
1559 * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot
1560 * @vma: user vma to map to
1561 * @addr: target user address of this page
1562 * @pfn: source kernel pfn
1563 * @pgprot: pgprot flags for the inserted page
1564 *
1565 * This is exactly like vm_insert_pfn, except that it allows drivers to
 1566 * override pgprot on a per-page basis.
1567 *
1568 * This only makes sense for IO mappings, and it makes no sense for
1569 * cow mappings. In general, using multiple vmas is preferable;
1570 * vm_insert_pfn_prot should only be used if using multiple VMAs is
1571 * impractical.
1572 */
1573int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
1574 unsigned long pfn, pgprot_t pgprot)
1575{
1554 int ret; 1576 int ret;
1555 pgprot_t pgprot = vma->vm_page_prot;
1556 /* 1577 /*
1557 * Technically, architectures with pte_special can avoid all these 1578 * Technically, architectures with pte_special can avoid all these
1558 * restrictions (same for remap_pfn_range). However we would like 1579 * restrictions (same for remap_pfn_range). However we would like
@@ -1574,7 +1595,7 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1574 1595
1575 return ret; 1596 return ret;
1576} 1597}
1577EXPORT_SYMBOL(vm_insert_pfn); 1598EXPORT_SYMBOL(vm_insert_pfn_prot);
1578 1599
1579int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, 1600int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1580 pfn_t pfn) 1601 pfn_t pfn)
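
vm_insert_pfn_prot() exists for IO mappings whose drivers need per-page protection bits that differ from vma->vm_page_prot. A sketch of a driver mmap helper using it; the write-combining choice and the single-page mapping are assumptions for illustration:

#include <linux/mm.h>
#include <asm/pgtable.h>	/* pgprot_writecombine() */

static int sketch_map_one_pfn(struct vm_area_struct *vma, unsigned long pfn)
{
	pgprot_t prot = pgprot_writecombine(vma->vm_page_prot);

	return vm_insert_pfn_prot(vma, vma->vm_start, pfn, prot);
}
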
diff --git a/mm/mmap.c b/mm/mmap.c
index 76d1ec29149b..90e3b869a8b9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3066,11 +3066,16 @@ static int special_mapping_fault(struct vm_area_struct *vma,
3066 pgoff_t pgoff; 3066 pgoff_t pgoff;
3067 struct page **pages; 3067 struct page **pages;
3068 3068
3069 if (vma->vm_ops == &legacy_special_mapping_vmops) 3069 if (vma->vm_ops == &legacy_special_mapping_vmops) {
3070 pages = vma->vm_private_data; 3070 pages = vma->vm_private_data;
3071 else 3071 } else {
3072 pages = ((struct vm_special_mapping *)vma->vm_private_data)-> 3072 struct vm_special_mapping *sm = vma->vm_private_data;
3073 pages; 3073
3074 if (sm->fault)
3075 return sm->fault(sm, vma, vmf);
3076
3077 pages = sm->pages;
3078 }
3074 3079
3075 for (pgoff = vmf->pgoff; pgoff && *pages; ++pages) 3080 for (pgoff = vmf->pgoff; pgoff && *pages; ++pages)
3076 pgoff--; 3081 pgoff--;
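
The mm/mmap.c hunk above lets a vm_special_mapping resolve faults through its own callback before falling back to the static page array. A sketch of a mapping using the new hook; the prototype mirrors the call site above (check mm_types.h for the exact qualifiers), and the SIGBUS result is a placeholder for real page-selection logic:

#include <linux/mm.h>
#include <linux/mm_types.h>

static int sketch_special_fault(struct vm_special_mapping *sm,
				struct vm_area_struct *vma,
				struct vm_fault *vmf)
{
	/* Choose or allocate the backing page for vmf->pgoff here. */
	return VM_FAULT_SIGBUS;
}

static struct vm_special_mapping sketch_mapping = {
	.name	= "[sketch]",
	.fault	= sketch_special_fault,
};
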
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 0147c91fa549..874132b26d23 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -269,7 +269,8 @@ our $Sparse = qr{
269 __init_refok| 269 __init_refok|
270 __kprobes| 270 __kprobes|
271 __ref| 271 __ref|
272 __rcu 272 __rcu|
273 __private
273 }x; 274 }x;
274our $InitAttributePrefix = qr{__(?:mem|cpu|dev|net_|)}; 275our $InitAttributePrefix = qr{__(?:mem|cpu|dev|net_|)};
275our $InitAttributeData = qr{$InitAttributePrefix(?:initdata\b)}; 276our $InitAttributeData = qr{$InitAttributePrefix(?:initdata\b)};
diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh
index 844787a0d7be..5eb49b7f864c 100755
--- a/tools/testing/selftests/rcutorture/bin/parse-console.sh
+++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh
@@ -33,7 +33,7 @@ if grep -Pq '\x00' < $file
33then 33then
34 print_warning Console output contains nul bytes, old qemu still running? 34 print_warning Console output contains nul bytes, old qemu still running?
35fi 35fi
36egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|Stall ended before state dump start' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $1.diags 36egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $1.diags
37if test -s $1.diags 37if test -s $1.diags
38then 38then
39 print_warning Assertion failure in $file $title 39 print_warning Assertion failure in $file $title
@@ -64,10 +64,12 @@ then
64 then 64 then
65 summary="$summary lockdep: $n_badness" 65 summary="$summary lockdep: $n_badness"
66 fi 66 fi
67 n_stalls=`egrep -c 'detected stalls on CPUs/tasks:|Stall ended before state dump start' $1` 67 n_stalls=`egrep -c 'detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state' $1`
68 if test "$n_stalls" -ne 0 68 if test "$n_stalls" -ne 0
69 then 69 then
70 summary="$summary Stalls: $n_stalls" 70 summary="$summary Stalls: $n_stalls"
71 fi 71 fi
72 print_warning Summary: $summary 72 print_warning Summary: $summary
73else
74 rm $1.diags
73fi 75fi
diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index d0c473f65850..d5ce7d7aae3e 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -4,15 +4,16 @@ include ../lib.mk
4 4
5.PHONY: all all_32 all_64 warn_32bit_failure clean 5.PHONY: all all_32 all_64 warn_32bit_failure clean
6 6
7TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall 7TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall \
8TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn test_syscall_vdso unwind_vdso \ 8 check_initial_reg_state sigreturn ldt_gdt
9TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \
9 test_FCMOV test_FCOMI test_FISTTP \ 10 test_FCMOV test_FCOMI test_FISTTP \
10 ldt_gdt \
11 vdso_restorer 11 vdso_restorer
12 12
13TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY) 13TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY)
14TARGETS_C_64BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_64BIT_ONLY)
14BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32) 15BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32)
15BINARIES_64 := $(TARGETS_C_BOTHBITS:%=%_64) 16BINARIES_64 := $(TARGETS_C_64BIT_ALL:%=%_64)
16 17
17CFLAGS := -O2 -g -std=gnu99 -pthread -Wall 18CFLAGS := -O2 -g -std=gnu99 -pthread -Wall
18 19
@@ -40,7 +41,7 @@ clean:
40$(TARGETS_C_32BIT_ALL:%=%_32): %_32: %.c 41$(TARGETS_C_32BIT_ALL:%=%_32): %_32: %.c
41 $(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl -lm 42 $(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl -lm
42 43
43$(TARGETS_C_BOTHBITS:%=%_64): %_64: %.c 44$(TARGETS_C_64BIT_ALL:%=%_64): %_64: %.c
44 $(CC) -m64 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl 45 $(CC) -m64 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl
45 46
46# x86_64 users should be encouraged to install 32-bit libraries 47# x86_64 users should be encouraged to install 32-bit libraries
@@ -65,3 +66,9 @@ endif
65sysret_ss_attrs_64: thunks.S 66sysret_ss_attrs_64: thunks.S
66ptrace_syscall_32: raw_syscall_helper_32.S 67ptrace_syscall_32: raw_syscall_helper_32.S
67test_syscall_vdso_32: thunks_32.S 68test_syscall_vdso_32: thunks_32.S
69
70# check_initial_reg_state is special: it needs a custom entry, and it
71# needs to be static so that its interpreter doesn't destroy its initial
72# state.
73check_initial_reg_state_32: CFLAGS += -Wl,-ereal_start -static
74check_initial_reg_state_64: CFLAGS += -Wl,-ereal_start -static
diff --git a/tools/testing/selftests/x86/check_initial_reg_state.c b/tools/testing/selftests/x86/check_initial_reg_state.c
new file mode 100644
index 000000000000..6aaed9b85baf
--- /dev/null
+++ b/tools/testing/selftests/x86/check_initial_reg_state.c
@@ -0,0 +1,109 @@
1/*
2 * check_initial_reg_state.c - check that execve sets the correct state
3 * Copyright (c) 2014-2016 Andrew Lutomirski
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 */
14
15#define _GNU_SOURCE
16
17#include <stdio.h>
18
19unsigned long ax, bx, cx, dx, si, di, bp, sp, flags;
20unsigned long r8, r9, r10, r11, r12, r13, r14, r15;
21
22asm (
23 ".pushsection .text\n\t"
24 ".type real_start, @function\n\t"
25 ".global real_start\n\t"
26 "real_start:\n\t"
27#ifdef __x86_64__
28 "mov %rax, ax\n\t"
29 "mov %rbx, bx\n\t"
30 "mov %rcx, cx\n\t"
31 "mov %rdx, dx\n\t"
32 "mov %rsi, si\n\t"
33 "mov %rdi, di\n\t"
34 "mov %rbp, bp\n\t"
35 "mov %rsp, sp\n\t"
36 "mov %r8, r8\n\t"
37 "mov %r9, r9\n\t"
38 "mov %r10, r10\n\t"
39 "mov %r11, r11\n\t"
40 "mov %r12, r12\n\t"
41 "mov %r13, r13\n\t"
42 "mov %r14, r14\n\t"
43 "mov %r15, r15\n\t"
44 "pushfq\n\t"
45 "popq flags\n\t"
46#else
47 "mov %eax, ax\n\t"
48 "mov %ebx, bx\n\t"
49 "mov %ecx, cx\n\t"
50 "mov %edx, dx\n\t"
51 "mov %esi, si\n\t"
52 "mov %edi, di\n\t"
53 "mov %ebp, bp\n\t"
54 "mov %esp, sp\n\t"
55 "pushfl\n\t"
56 "popl flags\n\t"
57#endif
58 "jmp _start\n\t"
59 ".size real_start, . - real_start\n\t"
60 ".popsection");
61
62int main()
63{
64 int nerrs = 0;
65
66 if (sp == 0) {
67 printf("[FAIL]\tTest was built incorrectly\n");
68 return 1;
69 }
70
71 if (ax || bx || cx || dx || si || di || bp
72#ifdef __x86_64__
73 || r8 || r9 || r10 || r11 || r12 || r13 || r14 || r15
74#endif
75 ) {
76 printf("[FAIL]\tAll GPRs except SP should be 0\n");
77#define SHOW(x) printf("\t" #x " = 0x%lx\n", x);
78 SHOW(ax);
79 SHOW(bx);
80 SHOW(cx);
81 SHOW(dx);
82 SHOW(si);
83 SHOW(di);
84 SHOW(bp);
85 SHOW(sp);
86#ifdef __x86_64__
87 SHOW(r8);
88 SHOW(r9);
89 SHOW(r10);
90 SHOW(r11);
91 SHOW(r12);
92 SHOW(r13);
93 SHOW(r14);
94 SHOW(r15);
95#endif
96 nerrs++;
97 } else {
98 printf("[OK]\tAll GPRs except SP are 0\n");
99 }
100
101 if (flags != 0x202) {
102 printf("[FAIL]\tFLAGS is 0x%lx, but it should be 0x202\n", flags);
103 nerrs++;
104 } else {
105 printf("[OK]\tFLAGS is 0x202\n");
106 }
107
108 return nerrs ? 1 : 0;
109}
diff --git a/tools/testing/selftests/x86/ptrace_syscall.c b/tools/testing/selftests/x86/ptrace_syscall.c
index 5105b49cd8aa..421456784bc6 100644
--- a/tools/testing/selftests/x86/ptrace_syscall.c
+++ b/tools/testing/selftests/x86/ptrace_syscall.c
@@ -103,6 +103,17 @@ static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
103 err(1, "sigaction"); 103 err(1, "sigaction");
104} 104}
105 105
106static void setsigign(int sig, int flags)
107{
108 struct sigaction sa;
109 memset(&sa, 0, sizeof(sa));
110 sa.sa_sigaction = (void *)SIG_IGN;
111 sa.sa_flags = flags;
112 sigemptyset(&sa.sa_mask);
113 if (sigaction(sig, &sa, 0))
114 err(1, "sigaction");
115}
116
106static void clearhandler(int sig) 117static void clearhandler(int sig)
107{ 118{
108 struct sigaction sa; 119 struct sigaction sa;
@@ -187,7 +198,7 @@ static void test_ptrace_syscall_restart(void)
187 198
188 printf("[RUN]\tSYSEMU\n"); 199 printf("[RUN]\tSYSEMU\n");
189 if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0) 200 if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0)
190 err(1, "PTRACE_SYSCALL"); 201 err(1, "PTRACE_SYSEMU");
191 wait_trap(chld); 202 wait_trap(chld);
192 203
193 if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0) 204 if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
@@ -218,7 +229,7 @@ static void test_ptrace_syscall_restart(void)
218 err(1, "PTRACE_SETREGS"); 229 err(1, "PTRACE_SETREGS");
219 230
220 if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0) 231 if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0)
221 err(1, "PTRACE_SYSCALL"); 232 err(1, "PTRACE_SYSEMU");
222 wait_trap(chld); 233 wait_trap(chld);
223 234
224 if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0) 235 if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
@@ -250,7 +261,7 @@ static void test_ptrace_syscall_restart(void)
250 err(1, "PTRACE_SETREGS"); 261 err(1, "PTRACE_SETREGS");
251 262
252 if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0) 263 if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0)
253 err(1, "PTRACE_SYSCALL"); 264 err(1, "PTRACE_SYSEMU");
254 wait_trap(chld); 265 wait_trap(chld);
255 266
256 if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0) 267 if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
@@ -277,6 +288,119 @@ static void test_ptrace_syscall_restart(void)
277 } 288 }
278} 289}
279 290
291static void test_restart_under_ptrace(void)
292{
293 printf("[RUN]\tkernel syscall restart under ptrace\n");
294 pid_t chld = fork();
295 if (chld < 0)
296 err(1, "fork");
297
298 if (chld == 0) {
299 if (ptrace(PTRACE_TRACEME, 0, 0, 0) != 0)
300 err(1, "PTRACE_TRACEME");
301
302 printf("\tChild will take a nap until signaled\n");
303 setsigign(SIGUSR1, SA_RESTART);
304 raise(SIGSTOP);
305
306 syscall(SYS_pause, 0, 0, 0, 0, 0, 0);
307 _exit(0);
308 }
309
310 int status;
311
312 /* Wait for SIGSTOP. */
313 if (waitpid(chld, &status, 0) != chld || !WIFSTOPPED(status))
314 err(1, "waitpid");
315
316 struct user_regs_struct regs;
317
318 printf("[RUN]\tSYSCALL\n");
319 if (ptrace(PTRACE_SYSCALL, chld, 0, 0) != 0)
320 err(1, "PTRACE_SYSCALL");
321 wait_trap(chld);
322
323 /* We should be stopped at pause(2) entry. */
324
325 if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
326 err(1, "PTRACE_GETREGS");
327
328 if (regs.user_syscall_nr != SYS_pause ||
329 regs.user_arg0 != 0 || regs.user_arg1 != 0 ||
330 regs.user_arg2 != 0 || regs.user_arg3 != 0 ||
331 regs.user_arg4 != 0 || regs.user_arg5 != 0) {
332 printf("[FAIL]\tInitial args are wrong (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", (unsigned long)regs.user_syscall_nr, (unsigned long)regs.user_arg0, (unsigned long)regs.user_arg1, (unsigned long)regs.user_arg2, (unsigned long)regs.user_arg3, (unsigned long)regs.user_arg4, (unsigned long)regs.user_arg5);
333 nerrs++;
334 } else {
335 printf("[OK]\tInitial nr and args are correct\n");
336 }
337
338 /* Interrupt it. */
339 kill(chld, SIGUSR1);
340
341 /* Advance. We should be stopped at exit. */
342 printf("[RUN]\tSYSCALL\n");
343 if (ptrace(PTRACE_SYSCALL, chld, 0, 0) != 0)
344 err(1, "PTRACE_SYSCALL");
345 wait_trap(chld);
346
347 if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
348 err(1, "PTRACE_GETREGS");
349
350 if (regs.user_syscall_nr != SYS_pause ||
351 regs.user_arg0 != 0 || regs.user_arg1 != 0 ||
352 regs.user_arg2 != 0 || regs.user_arg3 != 0 ||
353 regs.user_arg4 != 0 || regs.user_arg5 != 0) {
354 printf("[FAIL]\tArgs after SIGUSR1 are wrong (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", (unsigned long)regs.user_syscall_nr, (unsigned long)regs.user_arg0, (unsigned long)regs.user_arg1, (unsigned long)regs.user_arg2, (unsigned long)regs.user_arg3, (unsigned long)regs.user_arg4, (unsigned long)regs.user_arg5);
355 nerrs++;
356 } else {
357 printf("[OK]\tArgs after SIGUSR1 are correct (ax = %ld)\n",
358 (long)regs.user_ax);
359 }
360
361 /* Poke the regs back in. This must not break anything. */
362 if (ptrace(PTRACE_SETREGS, chld, 0, &regs) != 0)
363 err(1, "PTRACE_SETREGS");
364
365 /* Catch the (ignored) SIGUSR1. */
366 if (ptrace(PTRACE_CONT, chld, 0, 0) != 0)
367 err(1, "PTRACE_CONT");
368 if (waitpid(chld, &status, 0) != chld)
369 err(1, "waitpid");
370 if (!WIFSTOPPED(status)) {
371		printf("[FAIL]\tChild was not stopped for SIGUSR1 (status = 0x%x)\n", status);
372 nerrs++;
373 } else {
374 printf("[OK]\tChild got SIGUSR1\n");
375 }
376
377 /* The next event should be pause(2) again. */
378 printf("[RUN]\tStep again\n");
379 if (ptrace(PTRACE_SYSCALL, chld, 0, 0) != 0)
380 err(1, "PTRACE_SYSCALL");
381 wait_trap(chld);
382
383 /* We should be stopped at pause(2) entry. */
384
385 if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
386 err(1, "PTRACE_GETREGS");
387
388 if (regs.user_syscall_nr != SYS_pause ||
389 regs.user_arg0 != 0 || regs.user_arg1 != 0 ||
390 regs.user_arg2 != 0 || regs.user_arg3 != 0 ||
391 regs.user_arg4 != 0 || regs.user_arg5 != 0) {
392 printf("[FAIL]\tpause did not restart (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", (unsigned long)regs.user_syscall_nr, (unsigned long)regs.user_arg0, (unsigned long)regs.user_arg1, (unsigned long)regs.user_arg2, (unsigned long)regs.user_arg3, (unsigned long)regs.user_arg4, (unsigned long)regs.user_arg5);
393 nerrs++;
394 } else {
395 printf("[OK]\tpause(2) restarted correctly\n");
396 }
397
398 /* Kill it. */
399 kill(chld, SIGKILL);
400 if (waitpid(chld, &status, 0) != chld)
401 err(1, "waitpid");
402}
403
280int main() 404int main()
281{ 405{
282 printf("[RUN]\tCheck int80 return regs\n"); 406 printf("[RUN]\tCheck int80 return regs\n");
@@ -290,5 +414,7 @@ int main()
290 414
291 test_ptrace_syscall_restart(); 415 test_ptrace_syscall_restart();
292 416
417 test_restart_under_ptrace();
418
293 return 0; 419 return 0;
294} 420}
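The new test_restart_under_ptrace() builds on the standard PTRACE_SYSCALL dance: the child calls PTRACE_TRACEME and raises SIGSTOP, and the parent then resumes it with PTRACE_SYSCALL, stopping at each syscall entry/exit to inspect registers. A minimal, hedged sketch of that pattern (x86_64 only; it reads orig_rax/rdi directly instead of the user_syscall_nr/user_arg* macros this test defines elsewhere, and it observes getpid(2) rather than pause(2)):

#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <err.h>
#include <sys/ptrace.h>
#include <sys/user.h>
#include <sys/wait.h>
#include <sys/syscall.h>

int main(void)
{
	pid_t chld = fork();
	if (chld < 0)
		err(1, "fork");

	if (chld == 0) {
		if (ptrace(PTRACE_TRACEME, 0, 0, 0) != 0)
			err(1, "PTRACE_TRACEME");
		raise(SIGSTOP);			/* hand control to the parent */
		syscall(SYS_getpid);		/* the syscall we want to observe */
		_exit(0);
	}

	int status;

	/* Wait for SIGSTOP. */
	if (waitpid(chld, &status, 0) != chld || !WIFSTOPPED(status))
		err(1, "waitpid");

	/* Resume until the next syscall entry. */
	if (ptrace(PTRACE_SYSCALL, chld, 0, 0) != 0)
		err(1, "PTRACE_SYSCALL");
	if (waitpid(chld, &status, 0) != chld || !WIFSTOPPED(status))
		err(1, "waitpid");

	struct user_regs_struct regs;

	if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
		err(1, "PTRACE_GETREGS");

	printf("stopped at syscall nr %llu (arg0 = %llu)\n",
	       (unsigned long long)regs.orig_rax,
	       (unsigned long long)regs.rdi);

	kill(chld, SIGKILL);
	waitpid(chld, &status, 0);
	return 0;
}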
diff --git a/tools/testing/selftests/x86/sigreturn.c b/tools/testing/selftests/x86/sigreturn.c
index b5aa1bab7416..8a577e7070c6 100644
--- a/tools/testing/selftests/x86/sigreturn.c
+++ b/tools/testing/selftests/x86/sigreturn.c
@@ -54,6 +54,37 @@
54#include <sys/ptrace.h> 54#include <sys/ptrace.h>
55#include <sys/user.h> 55#include <sys/user.h>
56 56
57/* Pull in AR_xyz defines. */
58typedef unsigned int u32;
59typedef unsigned short u16;
60#include "../../../../arch/x86/include/asm/desc_defs.h"
61
62/*
63 * Copied from asm/ucontext.h, as asm/ucontext.h conflicts badly with the glibc
64 * headers.
65 */
66#ifdef __x86_64__
67/*
68 * UC_SIGCONTEXT_SS will be set when delivering 64-bit or x32 signals on
69 * kernels that save SS in the sigcontext. All kernels that set
70 * UC_SIGCONTEXT_SS will correctly restore at least the low 32 bits of esp
71 * regardless of SS (i.e. they implement espfix).
72 *
73 * Kernels that set UC_SIGCONTEXT_SS will also set UC_STRICT_RESTORE_SS
74 * when delivering a signal that came from 64-bit code.
75 *
76 * Sigreturn restores SS as follows:
77 *
78 * if (saved SS is valid || UC_STRICT_RESTORE_SS is set ||
79 * saved CS is not 64-bit)
80 * new SS = saved SS (will fail IRET and signal if invalid)
81 * else
82 * new SS = a flat 32-bit data segment
83 */
84#define UC_SIGCONTEXT_SS 0x2
85#define UC_STRICT_RESTORE_SS 0x4
86#endif
87
57/* 88/*
58 * In principle, this test can run on Linux emulation layers (e.g. 89 * In principle, this test can run on Linux emulation layers (e.g.
59 * Illumos "LX branded zones"). Solaris-based kernels reserve LDT 90 * Illumos "LX branded zones"). Solaris-based kernels reserve LDT
@@ -267,6 +298,9 @@ static gregset_t initial_regs, requested_regs, resulting_regs;
267/* Instructions for the SIGUSR1 handler. */ 298/* Instructions for the SIGUSR1 handler. */
268static volatile unsigned short sig_cs, sig_ss; 299static volatile unsigned short sig_cs, sig_ss;
269static volatile sig_atomic_t sig_trapped, sig_err, sig_trapno; 300static volatile sig_atomic_t sig_trapped, sig_err, sig_trapno;
301#ifdef __x86_64__
302static volatile sig_atomic_t sig_corrupt_final_ss;
303#endif
270 304
271/* Abstractions for some 32-bit vs 64-bit differences. */ 305/* Abstractions for some 32-bit vs 64-bit differences. */
272#ifdef __x86_64__ 306#ifdef __x86_64__
@@ -305,9 +339,105 @@ static greg_t *csptr(ucontext_t *ctx)
305} 339}
306#endif 340#endif
307 341
342/*
343 * Checks a given selector for its code bitness or returns -1 if it's not
344 * a usable code segment selector.
345 */
346int cs_bitness(unsigned short cs)
347{
348 uint32_t valid = 0, ar;
349 asm ("lar %[cs], %[ar]\n\t"
350 "jnz 1f\n\t"
351 "mov $1, %[valid]\n\t"
352 "1:"
353 : [ar] "=r" (ar), [valid] "+rm" (valid)
354 : [cs] "r" (cs));
355
356 if (!valid)
357 return -1;
358
359 bool db = (ar & (1 << 22));
360 bool l = (ar & (1 << 21));
361
362 if (!(ar & (1<<11)))
363 return -1; /* Not code. */
364
365 if (l && !db)
366 return 64;
367 else if (!l && db)
368 return 32;
369 else if (!l && !db)
370 return 16;
371 else
372 return -1; /* Unknown bitness. */
373}
374
375/*
376 * Checks whether a given selector is a present, writable data segment that
377 * can legally be loaded into SS; returns false otherwise.
378 */
379bool is_valid_ss(unsigned short cs)
380{
381 uint32_t valid = 0, ar;
382 asm ("lar %[cs], %[ar]\n\t"
383 "jnz 1f\n\t"
384 "mov $1, %[valid]\n\t"
385 "1:"
386 : [ar] "=r" (ar), [valid] "+rm" (valid)
387 : [cs] "r" (cs));
388
389 if (!valid)
390 return false;
391
392 if ((ar & AR_TYPE_MASK) != AR_TYPE_RWDATA &&
393 (ar & AR_TYPE_MASK) != AR_TYPE_RWDATA_EXPDOWN)
394 return false;
395
396 return (ar & AR_P);
397}
398
308/* Number of errors in the current test case. */ 399/* Number of errors in the current test case. */
309static volatile sig_atomic_t nerrs; 400static volatile sig_atomic_t nerrs;
310 401
402static void validate_signal_ss(int sig, ucontext_t *ctx)
403{
404#ifdef __x86_64__
405 bool was_64bit = (cs_bitness(*csptr(ctx)) == 64);
406
407 if (!(ctx->uc_flags & UC_SIGCONTEXT_SS)) {
408 printf("[FAIL]\tUC_SIGCONTEXT_SS was not set\n");
409 nerrs++;
410
411 /*
412 * This happens on Linux 4.1. The rest will fail, too, so
413 * return now to reduce the noise.
414 */
415 return;
416 }
417
418 /* UC_STRICT_RESTORE_SS is set iff we came from 64-bit mode. */
419 if (!!(ctx->uc_flags & UC_STRICT_RESTORE_SS) != was_64bit) {
420 printf("[FAIL]\tUC_STRICT_RESTORE_SS was wrong in signal %d\n",
421 sig);
422 nerrs++;
423 }
424
425 if (is_valid_ss(*ssptr(ctx))) {
426 /*
427 * DOSEMU was written before 64-bit sigcontext had SS, and
428 * it tries to figure out the signal source SS by looking at
429 * the physical register. Make sure that keeps working.
430 */
431 unsigned short hw_ss;
432 asm ("mov %%ss, %0" : "=rm" (hw_ss));
433 if (hw_ss != *ssptr(ctx)) {
434 printf("[FAIL]\tHW SS didn't match saved SS\n");
435 nerrs++;
436 }
437 }
438#endif
439}
440
311/* 441/*
312 * SIGUSR1 handler. Sets CS and SS as requested and points IP to the 442 * SIGUSR1 handler. Sets CS and SS as requested and points IP to the
313 * int3 trampoline. Sets SP to a large known value so that we can see 443 * int3 trampoline. Sets SP to a large known value so that we can see
@@ -317,6 +447,8 @@ static void sigusr1(int sig, siginfo_t *info, void *ctx_void)
317{ 447{
318 ucontext_t *ctx = (ucontext_t*)ctx_void; 448 ucontext_t *ctx = (ucontext_t*)ctx_void;
319 449
450 validate_signal_ss(sig, ctx);
451
320 memcpy(&initial_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t)); 452 memcpy(&initial_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t));
321 453
322 *csptr(ctx) = sig_cs; 454 *csptr(ctx) = sig_cs;
@@ -334,13 +466,16 @@ static void sigusr1(int sig, siginfo_t *info, void *ctx_void)
334} 466}
335 467
336/* 468/*
337 * Called after a successful sigreturn. Restores our state so that 469 * Called after a successful sigreturn (via int3) or from a failed
338 * the original raise(SIGUSR1) returns. 470 * sigreturn (directly by kernel). Restores our state so that the
471 * original raise(SIGUSR1) returns.
339 */ 472 */
340static void sigtrap(int sig, siginfo_t *info, void *ctx_void) 473static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
341{ 474{
342 ucontext_t *ctx = (ucontext_t*)ctx_void; 475 ucontext_t *ctx = (ucontext_t*)ctx_void;
343 476
477 validate_signal_ss(sig, ctx);
478
344 sig_err = ctx->uc_mcontext.gregs[REG_ERR]; 479 sig_err = ctx->uc_mcontext.gregs[REG_ERR];
345 sig_trapno = ctx->uc_mcontext.gregs[REG_TRAPNO]; 480 sig_trapno = ctx->uc_mcontext.gregs[REG_TRAPNO];
346 481
@@ -358,41 +493,62 @@ static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
358 memcpy(&resulting_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t)); 493 memcpy(&resulting_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t));
359 memcpy(&ctx->uc_mcontext.gregs, &initial_regs, sizeof(gregset_t)); 494 memcpy(&ctx->uc_mcontext.gregs, &initial_regs, sizeof(gregset_t));
360 495
496#ifdef __x86_64__
497 if (sig_corrupt_final_ss) {
498 if (ctx->uc_flags & UC_STRICT_RESTORE_SS) {
499 printf("[FAIL]\tUC_STRICT_RESTORE_SS was set inappropriately\n");
500 nerrs++;
501 } else {
502 /*
503 * DOSEMU transitions from 32-bit to 64-bit mode by
504 * adjusting sigcontext, and it requires that this work
505 * even if the saved SS is bogus.
506 */
507 printf("\tCorrupting SS on return to 64-bit mode\n");
508 *ssptr(ctx) = 0;
509 }
510 }
511#endif
512
361 sig_trapped = sig; 513 sig_trapped = sig;
362} 514}
363 515
364/* 516#ifdef __x86_64__
365 * Checks a given selector for its code bitness or returns -1 if it's not 517/* Tests recovery if !UC_STRICT_RESTORE_SS */
366 * a usable code segment selector. 518static void sigusr2(int sig, siginfo_t *info, void *ctx_void)
367 */
368int cs_bitness(unsigned short cs)
369{ 519{
370 uint32_t valid = 0, ar; 520 ucontext_t *ctx = (ucontext_t*)ctx_void;
371 asm ("lar %[cs], %[ar]\n\t"
372 "jnz 1f\n\t"
373 "mov $1, %[valid]\n\t"
374 "1:"
375 : [ar] "=r" (ar), [valid] "+rm" (valid)
376 : [cs] "r" (cs));
377 521
378 if (!valid) 522 if (!(ctx->uc_flags & UC_STRICT_RESTORE_SS)) {
379 return -1; 523 printf("[FAIL]\traise(2) didn't set UC_STRICT_RESTORE_SS\n");
524 nerrs++;
525 return; /* We can't do the rest. */
526 }
380 527
381 bool db = (ar & (1 << 22)); 528 ctx->uc_flags &= ~UC_STRICT_RESTORE_SS;
382 bool l = (ar & (1 << 21)); 529 *ssptr(ctx) = 0;
383 530
384 if (!(ar & (1<<11))) 531 /* Return. The kernel should recover without sending another signal. */
385 return -1; /* Not code. */ 532}
386 533
387 if (l && !db) 534static int test_nonstrict_ss(void)
388 return 64; 535{
389 else if (!l && db) 536 clearhandler(SIGUSR1);
390 return 32; 537 clearhandler(SIGTRAP);
391 else if (!l && !db) 538 clearhandler(SIGSEGV);
392 return 16; 539 clearhandler(SIGILL);
393 else 540 sethandler(SIGUSR2, sigusr2, 0);
394 return -1; /* Unknown bitness. */ 541
542 nerrs = 0;
543
544 printf("[RUN]\tClear UC_STRICT_RESTORE_SS and corrupt SS\n");
545 raise(SIGUSR2);
546 if (!nerrs)
547 printf("[OK]\tIt worked\n");
548
549 return nerrs;
395} 550}
551#endif
396 552
397/* Finds a usable code segment of the requested bitness. */ 553/* Finds a usable code segment of the requested bitness. */
398int find_cs(int bitness) 554int find_cs(int bitness)
@@ -576,6 +732,12 @@ static int test_bad_iret(int cs_bits, unsigned short ss, int force_cs)
576 errdesc, strsignal(sig_trapped)); 732 errdesc, strsignal(sig_trapped));
577 return 0; 733 return 0;
578 } else { 734 } else {
735 /*
736 * This also implicitly tests UC_STRICT_RESTORE_SS:
737 * We check that these signals set UC_STRICT_RESTORE_SS and,
738 * if UC_STRICT_RESTORE_SS doesn't cause strict behavior,
739 * then we won't get SIGSEGV.
740 */
579 printf("[FAIL]\tDid not get SIGSEGV\n"); 741 printf("[FAIL]\tDid not get SIGSEGV\n");
580 return 1; 742 return 1;
581 } 743 }
@@ -632,6 +794,14 @@ int main()
632 GDT3(gdt_data16_idx)); 794 GDT3(gdt_data16_idx));
633 } 795 }
634 796
797#ifdef __x86_64__
798 /* Nasty ABI case: check SS corruption handling. */
799 sig_corrupt_final_ss = 1;
800 total_nerrs += test_valid_sigreturn(32, false, -1);
801 total_nerrs += test_valid_sigreturn(32, true, -1);
802 sig_corrupt_final_ss = 0;
803#endif
804
635 /* 805 /*
636 * We're done testing valid sigreturn cases. Now we test states 806 * We're done testing valid sigreturn cases. Now we test states
637 * for which sigreturn itself will succeed but the subsequent 807 * for which sigreturn itself will succeed but the subsequent
@@ -680,5 +850,9 @@ int main()
680 if (gdt_npdata32_idx) 850 if (gdt_npdata32_idx)
681 test_bad_iret(32, GDT3(gdt_npdata32_idx), -1); 851 test_bad_iret(32, GDT3(gdt_npdata32_idx), -1);
682 852
853#ifdef __x86_64__
854 total_nerrs += test_nonstrict_ss();
855#endif
856
683 return total_nerrs ? 1 : 0; 857 return total_nerrs ? 1 : 0;
684} 858}
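The sigreturn.c changes above hinge on two new uc_flags bits, UC_SIGCONTEXT_SS and UC_STRICT_RESTORE_SS, which let 64-bit user code tell whether the kernel saves and restores SS in the sigcontext. A hedged, stand-alone sketch of the detection idiom (x86_64 only; the two flag values come from the hunk above, while the handler name and output strings are illustrative):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <signal.h>
#include <ucontext.h>
#include <err.h>

#define UC_SIGCONTEXT_SS	0x2	/* values defined in the patch above */
#define UC_STRICT_RESTORE_SS	0x4

static volatile int saw_ss_flags;

static void handler(int sig, siginfo_t *si, void *ctx_void)
{
	ucontext_t *ctx = ctx_void;

	/* Kernels with the new ABI set UC_SIGCONTEXT_SS on 64-bit signals. */
	saw_ss_flags = ctx->uc_flags & (UC_SIGCONTEXT_SS | UC_STRICT_RESTORE_SS);
}

int main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = handler;
	sa.sa_flags = SA_SIGINFO;
	sigemptyset(&sa.sa_mask);
	if (sigaction(SIGUSR1, &sa, 0))
		err(1, "sigaction");

	raise(SIGUSR1);

	if (saw_ss_flags & UC_SIGCONTEXT_SS)
		printf("kernel saves SS in sigcontext%s\n",
		       (saw_ss_flags & UC_STRICT_RESTORE_SS) ?
		       " (strict SS restore requested)" : "");
	else
		printf("old sigcontext ABI: SS is not saved\n");

	return 0;
}

On kernels predating this series neither bit is set, which is exactly the Linux 4.1 case the validate_signal_ss() helper above calls out.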
diff --git a/tools/testing/selftests/x86/syscall_nt.c b/tools/testing/selftests/x86/syscall_nt.c
index 60c06af4646a..43fcab367fb0 100644
--- a/tools/testing/selftests/x86/syscall_nt.c
+++ b/tools/testing/selftests/x86/syscall_nt.c
@@ -17,6 +17,9 @@
17 17
18#include <stdio.h> 18#include <stdio.h>
19#include <unistd.h> 19#include <unistd.h>
20#include <string.h>
21#include <signal.h>
22#include <err.h>
20#include <sys/syscall.h> 23#include <sys/syscall.h>
21#include <asm/processor-flags.h> 24#include <asm/processor-flags.h>
22 25
@@ -26,6 +29,8 @@
26# define WIDTH "l" 29# define WIDTH "l"
27#endif 30#endif
28 31
32static unsigned int nerrs;
33
29static unsigned long get_eflags(void) 34static unsigned long get_eflags(void)
30{ 35{
31 unsigned long eflags; 36 unsigned long eflags;
@@ -39,16 +44,52 @@ static void set_eflags(unsigned long eflags)
39 : : "rm" (eflags) : "flags"); 44 : : "rm" (eflags) : "flags");
40} 45}
41 46
42int main() 47static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
48 int flags)
43{ 49{
44 printf("[RUN]\tSet NT and issue a syscall\n"); 50 struct sigaction sa;
45 set_eflags(get_eflags() | X86_EFLAGS_NT); 51 memset(&sa, 0, sizeof(sa));
52 sa.sa_sigaction = handler;
53 sa.sa_flags = SA_SIGINFO | flags;
54 sigemptyset(&sa.sa_mask);
55 if (sigaction(sig, &sa, 0))
56 err(1, "sigaction");
57}
58
59static void sigtrap(int sig, siginfo_t *si, void *ctx_void)
60{
61}
62
63static void do_it(unsigned long extraflags)
64{
65 unsigned long flags;
66
67 set_eflags(get_eflags() | extraflags);
46 syscall(SYS_getpid); 68 syscall(SYS_getpid);
47 if (get_eflags() & X86_EFLAGS_NT) { 69 flags = get_eflags();
48 printf("[OK]\tThe syscall worked and NT is still set\n"); 70 if ((flags & extraflags) == extraflags) {
49 return 0; 71 printf("[OK]\tThe syscall worked and flags are still set\n");
50 } else { 72 } else {
51 printf("[FAIL]\tThe syscall worked but NT was cleared\n"); 73 printf("[FAIL]\tThe syscall worked but flags were cleared (flags = 0x%lx but expected 0x%lx set)\n",
52 return 1; 74 flags, extraflags);
75 nerrs++;
53 } 76 }
54} 77}
78
79int main(void)
80{
81 printf("[RUN]\tSet NT and issue a syscall\n");
82 do_it(X86_EFLAGS_NT);
83
84 /*
85 * Now try it again with TF set -- TF forces returns via IRET in all
86 * cases except non-ptregs-using 64-bit full fast path syscalls.
87 */
88
89 sethandler(SIGTRAP, sigtrap, 0);
90
91 printf("[RUN]\tSet NT|TF and issue a syscall\n");
92 do_it(X86_EFLAGS_NT | X86_EFLAGS_TF);
93
94 return nerrs == 0 ? 0 : 1;
95}