aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-05-05 14:36:44 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2017-05-05 14:36:44 -0400
commit7246f60068840847bdcf595be5f0b5ca632736e0 (patch)
treefd9a963a03c2655f3ba9d1ced3c87a2775f5b166
parente579dde654fc2c6b0d3e4b77a9a4b2d2405c510e (diff)
parent700b7eadd5625d22b8235fb21259b3d7d564c000 (diff)
Merge tag 'powerpc-4.12-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux
Pull powerpc updates from Michael Ellerman: "Highlights include: - Larger virtual address space on 64-bit server CPUs. By default we use a 128TB virtual address space, but a process can request access to the full 512TB by passing a hint to mmap(). - Support for the new Power9 "XIVE" interrupt controller. - TLB flushing optimisations for the radix MMU on Power9. - Support for CAPI cards on Power9, using the "Coherent Accelerator Interface Architecture 2.0". - The ability to configure the mmap randomisation limits at build and runtime. - Several small fixes and cleanups to the kprobes code, as well as support for KPROBES_ON_FTRACE. - Major improvements to handling of system reset interrupts, correctly treating them as NMIs, giving them a dedicated stack and using a new hypervisor call to trigger them, all of which should aid debugging and robustness. - Many fixes and other minor enhancements. Thanks to: Alastair D'Silva, Alexey Kardashevskiy, Alistair Popple, Andrew Donnellan, Aneesh Kumar K.V, Anshuman Khandual, Anton Blanchard, Balbir Singh, Ben Hutchings, Benjamin Herrenschmidt, Bhupesh Sharma, Chris Packham, Christian Zigotzky, Christophe Leroy, Christophe Lombard, Daniel Axtens, David Gibson, Gautham R. Shenoy, Gavin Shan, Geert Uytterhoeven, Guilherme G. Piccoli, Hamish Martin, Hari Bathini, Kees Cook, Laurent Dufour, Madhavan Srinivasan, Mahesh J Salgaonkar, Mahesh Salgaonkar, Masami Hiramatsu, Matt Brown, Matthew R. Ochs, Michael Neuling, Naveen N. Rao, Nicholas Piggin, Oliver O'Halloran, Pan Xinhui, Paul Mackerras, Rashmica Gupta, Russell Currey, Sukadev Bhattiprolu, Thadeu Lima de Souza Cascardo, Tobin C. Harding, Tyrel Datwyler, Uma Krishnan, Vaibhav Jain, Vipin K Parashar, Yang Shi" * tag 'powerpc-4.12-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux: (214 commits) powerpc/64s: Power9 has no LPCR[VRMASD] field so don't set it powerpc/powernv: Fix TCE kill on NVLink2 powerpc/mm/radix: Drop support for CPUs without lockless tlbie powerpc/book3s/mce: Move add_taint() later in virtual mode powerpc/sysfs: Move #ifdef CONFIG_HOTPLUG_CPU out of the function body powerpc/smp: Document irq enable/disable after migrating IRQs powerpc/mpc52xx: Don't select user-visible RTAS_PROC powerpc/powernv: Document cxl dependency on special case in pnv_eeh_reset() powerpc/eeh: Clean up and document event handling functions powerpc/eeh: Avoid use after free in eeh_handle_special_event() cxl: Mask slice error interrupts after first occurrence cxl: Route eeh events to all drivers in cxl_pci_error_detected() cxl: Force context lock during EEH flow powerpc/64: Allow CONFIG_RELOCATABLE if COMPILE_TEST powerpc/xmon: Teach xmon oops about radix vectors powerpc/mm/hash: Fix off-by-one in comment about kernel contexts ids powerpc/pseries: Enable VFIO powerpc/powernv: Fix iommu table size calculation hook for small tables powerpc/powernv: Check kzalloc() return value in pnv_pci_table_alloc powerpc: Add arch/powerpc/tools directory ...
-rw-r--r--Documentation/features/debug/kprobes-on-ftrace/arch-support.txt2
-rw-r--r--Documentation/powerpc/cxl.txt15
-rw-r--r--Documentation/powerpc/firmware-assisted-dump.txt34
-rw-r--r--MAINTAINERS3
-rw-r--r--arch/powerpc/Kconfig66
-rw-r--r--arch/powerpc/Makefile13
-rw-r--r--arch/powerpc/Makefile.postlink34
-rw-r--r--arch/powerpc/configs/powernv_defconfig6
-rw-r--r--arch/powerpc/configs/ppc64_defconfig6
-rw-r--r--arch/powerpc/configs/pseries_defconfig6
-rw-r--r--arch/powerpc/include/asm/asm-prototypes.h4
-rw-r--r--arch/powerpc/include/asm/bitops.h8
-rw-r--r--arch/powerpc/include/asm/book3s/64/hash-4k.h2
-rw-r--r--arch/powerpc/include/asm/book3s/64/hash-64k.h10
-rw-r--r--arch/powerpc/include/asm/book3s/64/hash.h16
-rw-r--r--arch/powerpc/include/asm/book3s/64/hugetlb.h2
-rw-r--r--arch/powerpc/include/asm/book3s/64/mmu-hash.h200
-rw-r--r--arch/powerpc/include/asm/book3s/64/mmu.h9
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgtable.h58
-rw-r--r--arch/powerpc/include/asm/book3s/64/radix.h8
-rw-r--r--arch/powerpc/include/asm/code-patching.h41
-rw-r--r--arch/powerpc/include/asm/cpuidle.h33
-rw-r--r--arch/powerpc/include/asm/cputable.h5
-rw-r--r--arch/powerpc/include/asm/dbell.h40
-rw-r--r--arch/powerpc/include/asm/debug.h2
-rw-r--r--arch/powerpc/include/asm/debugfs.h17
-rw-r--r--arch/powerpc/include/asm/exception-64s.h87
-rw-r--r--arch/powerpc/include/asm/feature-fixups.h3
-rw-r--r--arch/powerpc/include/asm/head-64.h1
-rw-r--r--arch/powerpc/include/asm/hvcall.h10
-rw-r--r--arch/powerpc/include/asm/io.h102
-rw-r--r--arch/powerpc/include/asm/iommu.h12
-rw-r--r--arch/powerpc/include/asm/kprobes.h63
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_64.h2
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_asm.h2
-rw-r--r--arch/powerpc/include/asm/kvm_ppc.h10
-rw-r--r--arch/powerpc/include/asm/mce.h94
-rw-r--r--arch/powerpc/include/asm/mmu-book3e.h5
-rw-r--r--arch/powerpc/include/asm/mmu.h19
-rw-r--r--arch/powerpc/include/asm/mmu_context.h24
-rw-r--r--arch/powerpc/include/asm/nohash/64/pgtable.h5
-rw-r--r--arch/powerpc/include/asm/opal-api.h77
-rw-r--r--arch/powerpc/include/asm/opal.h41
-rw-r--r--arch/powerpc/include/asm/paca.h38
-rw-r--r--arch/powerpc/include/asm/page_64.h14
-rw-r--r--arch/powerpc/include/asm/perf_event_server.h3
-rw-r--r--arch/powerpc/include/asm/powernv.h22
-rw-r--r--arch/powerpc/include/asm/ppc-opcode.h2
-rw-r--r--arch/powerpc/include/asm/processor.h41
-rw-r--r--arch/powerpc/include/asm/reg.h4
-rw-r--r--arch/powerpc/include/asm/sections.h2
-rw-r--r--arch/powerpc/include/asm/smp.h21
-rw-r--r--arch/powerpc/include/asm/syscalls.h4
-rw-r--r--arch/powerpc/include/asm/thread_info.h10
-rw-r--r--arch/powerpc/include/asm/xics.h2
-rw-r--r--arch/powerpc/include/asm/xive-regs.h97
-rw-r--r--arch/powerpc/include/asm/xive.h163
-rw-r--r--arch/powerpc/include/asm/xmon.h2
-rw-r--r--arch/powerpc/include/uapi/asm/mman.h16
-rw-r--r--arch/powerpc/kernel/Makefile12
-rw-r--r--arch/powerpc/kernel/asm-offsets.c9
-rw-r--r--arch/powerpc/kernel/cpu_setup_power.S36
-rw-r--r--arch/powerpc/kernel/dbell.c58
-rw-r--r--arch/powerpc/kernel/eeh.c3
-rw-r--r--arch/powerpc/kernel/eeh_driver.c55
-rw-r--r--arch/powerpc/kernel/entry_32.S107
-rw-r--r--arch/powerpc/kernel/entry_64.S380
-rw-r--r--arch/powerpc/kernel/exceptions-64s.S186
-rw-r--r--arch/powerpc/kernel/fadump.c36
-rw-r--r--arch/powerpc/kernel/head_32.S16
-rw-r--r--arch/powerpc/kernel/head_64.S3
-rw-r--r--arch/powerpc/kernel/idle_book3s.S285
-rw-r--r--arch/powerpc/kernel/iommu.c54
-rw-r--r--arch/powerpc/kernel/irq.c41
-rw-r--r--arch/powerpc/kernel/kprobes-ftrace.c104
-rw-r--r--arch/powerpc/kernel/kprobes.c214
-rw-r--r--arch/powerpc/kernel/mce.c18
-rw-r--r--arch/powerpc/kernel/mce_power.c780
-rw-r--r--arch/powerpc/kernel/optprobes.c6
-rw-r--r--arch/powerpc/kernel/paca.c21
-rw-r--r--arch/powerpc/kernel/prom.c1
-rw-r--r--arch/powerpc/kernel/prom_init.c2
-rw-r--r--arch/powerpc/kernel/setup-common.c11
-rw-r--r--arch/powerpc/kernel/setup_64.c9
-rw-r--r--arch/powerpc/kernel/smp.c299
-rw-r--r--arch/powerpc/kernel/stacktrace.c9
-rw-r--r--arch/powerpc/kernel/swsusp.c1
-rw-r--r--arch/powerpc/kernel/syscalls.c16
-rw-r--r--arch/powerpc/kernel/sysfs.c12
-rw-r--r--arch/powerpc/kernel/trace/Makefile29
-rw-r--r--arch/powerpc/kernel/trace/ftrace.c (renamed from arch/powerpc/kernel/ftrace.c)1
-rw-r--r--arch/powerpc/kernel/trace/ftrace_32.S118
-rw-r--r--arch/powerpc/kernel/trace/ftrace_64.S85
-rw-r--r--arch/powerpc/kernel/trace/ftrace_64_mprofile.S272
-rw-r--r--arch/powerpc/kernel/trace/ftrace_64_pg.S68
-rw-r--r--arch/powerpc/kernel/trace/trace_clock.c (renamed from arch/powerpc/kernel/trace_clock.c)0
-rw-r--r--arch/powerpc/kernel/traps.c27
-rw-r--r--arch/powerpc/kernel/vmlinux.lds.S2
-rw-r--r--arch/powerpc/kvm/book3s.c8
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_host.c10
-rw-r--r--arch/powerpc/kvm/book3s_hv.c18
-rw-r--r--arch/powerpc/kvm/book3s_hv_builtin.c29
-rw-r--r--arch/powerpc/kvm/book3s_hv_rm_xics.c5
-rw-r--r--arch/powerpc/kvm/book3s_xics.c5
-rw-r--r--arch/powerpc/lib/code-patching.c4
-rw-r--r--arch/powerpc/lib/sstep.c82
-rw-r--r--arch/powerpc/mm/dump_hashpagetable.c2
-rw-r--r--arch/powerpc/mm/dump_linuxpagetables.c106
-rw-r--r--arch/powerpc/mm/fault.c84
-rw-r--r--arch/powerpc/mm/hash_low_32.S2
-rw-r--r--arch/powerpc/mm/hash_utils_64.c26
-rw-r--r--arch/powerpc/mm/hugetlbpage-book3e.c7
-rw-r--r--arch/powerpc/mm/hugetlbpage-radix.c11
-rw-r--r--arch/powerpc/mm/hugetlbpage.c18
-rw-r--r--arch/powerpc/mm/init_64.c4
-rw-r--r--arch/powerpc/mm/mmap.c53
-rw-r--r--arch/powerpc/mm/mmu_context_book3s64.c116
-rw-r--r--arch/powerpc/mm/mmu_context_iommu.c43
-rw-r--r--arch/powerpc/mm/mmu_context_nohash.c5
-rw-r--r--arch/powerpc/mm/numa.c7
-rw-r--r--arch/powerpc/mm/slb.c4
-rw-r--r--arch/powerpc/mm/slb_low.S82
-rw-r--r--arch/powerpc/mm/slice.c258
-rw-r--r--arch/powerpc/mm/subpage-prot.c3
-rw-r--r--arch/powerpc/mm/tlb-radix.c93
-rw-r--r--arch/powerpc/mm/tlb_nohash.c2
-rw-r--r--arch/powerpc/perf/core-book3s.c8
-rw-r--r--arch/powerpc/perf/isa207-common.c82
-rw-r--r--arch/powerpc/perf/isa207-common.h26
-rw-r--r--arch/powerpc/perf/power8-events-list.h6
-rw-r--r--arch/powerpc/perf/power8-pmu.c4
-rw-r--r--arch/powerpc/perf/power9-pmu.c2
-rw-r--r--arch/powerpc/platforms/44x/sam440ep.c2
-rw-r--r--arch/powerpc/platforms/52xx/Kconfig1
-rw-r--r--arch/powerpc/platforms/85xx/smp.c12
-rw-r--r--arch/powerpc/platforms/86xx/mpc86xx_smp.c1
-rw-r--r--arch/powerpc/platforms/Kconfig.cputype14
-rw-r--r--arch/powerpc/platforms/cell/axon_msi.c2
-rw-r--r--arch/powerpc/platforms/cell/interrupt.c2
-rw-r--r--arch/powerpc/platforms/cell/pervasive.c11
-rw-r--r--arch/powerpc/platforms/chrp/smp.c1
-rw-r--r--arch/powerpc/platforms/pasemi/idle.c11
-rw-r--r--arch/powerpc/platforms/powermac/smp.c3
-rw-r--r--arch/powerpc/platforms/powernv/Kconfig3
-rw-r--r--arch/powerpc/platforms/powernv/eeh-powernv.c7
-rw-r--r--arch/powerpc/platforms/powernv/idle.c109
-rw-r--r--arch/powerpc/platforms/powernv/npu-dma.c470
-rw-r--r--arch/powerpc/platforms/powernv/opal-lpc.c3
-rw-r--r--arch/powerpc/platforms/powernv/opal-sensor.c4
-rw-r--r--arch/powerpc/platforms/powernv/opal-wrappers.S71
-rw-r--r--arch/powerpc/platforms/powernv/opal-xscom.c27
-rw-r--r--arch/powerpc/platforms/powernv/opal.c79
-rw-r--r--arch/powerpc/platforms/powernv/pci-ioda.c71
-rw-r--r--arch/powerpc/platforms/powernv/pci.c6
-rw-r--r--arch/powerpc/platforms/powernv/pci.h17
-rw-r--r--arch/powerpc/platforms/powernv/powernv.h2
-rw-r--r--arch/powerpc/platforms/powernv/rng.c2
-rw-r--r--arch/powerpc/platforms/powernv/setup.c19
-rw-r--r--arch/powerpc/platforms/powernv/smp.c107
-rw-r--r--arch/powerpc/platforms/ps3/smp.c4
-rw-r--r--arch/powerpc/platforms/pseries/Kconfig3
-rw-r--r--arch/powerpc/platforms/pseries/dlpar.c1
-rw-r--r--arch/powerpc/platforms/pseries/dtl.c3
-rw-r--r--arch/powerpc/platforms/pseries/hvCall_inst.c10
-rw-r--r--arch/powerpc/platforms/pseries/iommu.c43
-rw-r--r--arch/powerpc/platforms/pseries/lpar.c61
-rw-r--r--arch/powerpc/platforms/pseries/ras.c4
-rw-r--r--arch/powerpc/platforms/pseries/setup.c4
-rw-r--r--arch/powerpc/platforms/pseries/smp.c49
-rw-r--r--arch/powerpc/platforms/pseries/vio.c2
-rw-r--r--arch/powerpc/sysdev/Kconfig1
-rw-r--r--arch/powerpc/sysdev/Makefile1
-rw-r--r--arch/powerpc/sysdev/scom.c3
-rw-r--r--arch/powerpc/sysdev/xics/icp-hv.c2
-rw-r--r--arch/powerpc/sysdev/xics/icp-native.c20
-rw-r--r--arch/powerpc/sysdev/xics/icp-opal.c2
-rw-r--r--arch/powerpc/sysdev/xics/xics-common.c6
-rw-r--r--arch/powerpc/sysdev/xive/Kconfig11
-rw-r--r--arch/powerpc/sysdev/xive/Makefile4
-rw-r--r--arch/powerpc/sysdev/xive/common.c1302
-rw-r--r--arch/powerpc/sysdev/xive/native.c640
-rw-r--r--arch/powerpc/sysdev/xive/xive-internal.h62
-rwxr-xr-xarch/powerpc/tools/gcc-check-mprofile-kernel.sh (renamed from arch/powerpc/scripts/gcc-check-mprofile-kernel.sh)0
-rwxr-xr-xarch/powerpc/tools/relocs_check.sh (renamed from arch/powerpc/relocs_check.sh)0
-rw-r--r--arch/powerpc/xmon/xmon.c241
-rw-r--r--drivers/misc/cxl/api.c17
-rw-r--r--drivers/misc/cxl/context.c68
-rw-r--r--drivers/misc/cxl/cxl.h263
-rw-r--r--drivers/misc/cxl/debugfs.c41
-rw-r--r--drivers/misc/cxl/fault.c136
-rw-r--r--drivers/misc/cxl/file.c15
-rw-r--r--drivers/misc/cxl/guest.c10
-rw-r--r--drivers/misc/cxl/hcalls.c6
-rw-r--r--drivers/misc/cxl/irq.c53
-rw-r--r--drivers/misc/cxl/main.c12
-rw-r--r--drivers/misc/cxl/native.c339
-rw-r--r--drivers/misc/cxl/pci.c409
-rw-r--r--drivers/misc/cxl/trace.h43
-rw-r--r--drivers/of/base.c31
-rw-r--r--drivers/pcmcia/electra_cf.c4
-rw-r--r--drivers/vfio/vfio_iommu_spapr_tce.c2
-rw-r--r--include/linux/kprobes.h1
-rw-r--r--include/linux/of.h3
-rw-r--r--include/uapi/linux/perf_event.h16
-rw-r--r--kernel/kprobes.c32
-rw-r--r--tools/include/uapi/linux/perf_event.h16
-rw-r--r--tools/testing/selftests/powerpc/Makefile1
-rw-r--r--tools/testing/selftests/powerpc/cache_shape/.gitignore1
-rw-r--r--tools/testing/selftests/powerpc/cache_shape/Makefile10
-rw-r--r--tools/testing/selftests/powerpc/cache_shape/cache_shape.c125
-rw-r--r--tools/testing/selftests/powerpc/include/utils.h6
-rw-r--r--tools/testing/selftests/powerpc/utils.c53
212 files changed, 8698 insertions, 2790 deletions
diff --git a/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt b/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt
index f9133a921d5a..1e84be3c142e 100644
--- a/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt
+++ b/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt
@@ -26,7 +26,7 @@
26 | nios2: | TODO | 26 | nios2: | TODO |
27 | openrisc: | TODO | 27 | openrisc: | TODO |
28 | parisc: | TODO | 28 | parisc: | TODO |
29 | powerpc: | TODO | 29 | powerpc: | ok |
30 | s390: | TODO | 30 | s390: | TODO |
31 | score: | TODO | 31 | score: | TODO |
32 | sh: | TODO | 32 | sh: | TODO |
diff --git a/Documentation/powerpc/cxl.txt b/Documentation/powerpc/cxl.txt
index d5506ba0fef7..c5e8d5098ed3 100644
--- a/Documentation/powerpc/cxl.txt
+++ b/Documentation/powerpc/cxl.txt
@@ -21,7 +21,7 @@ Introduction
21Hardware overview 21Hardware overview
22================= 22=================
23 23
24 POWER8 FPGA 24 POWER8/9 FPGA
25 +----------+ +---------+ 25 +----------+ +---------+
26 | | | | 26 | | | |
27 | CPU | | AFU | 27 | CPU | | AFU |
@@ -34,7 +34,7 @@ Hardware overview
34 | | CAPP |<------>| | 34 | | CAPP |<------>| |
35 +---+------+ PCIE +---------+ 35 +---+------+ PCIE +---------+
36 36
37 The POWER8 chip has a Coherently Attached Processor Proxy (CAPP) 37 The POWER8/9 chip has a Coherently Attached Processor Proxy (CAPP)
38 unit which is part of the PCIe Host Bridge (PHB). This is managed 38 unit which is part of the PCIe Host Bridge (PHB). This is managed
39 by Linux by calls into OPAL. Linux doesn't directly program the 39 by Linux by calls into OPAL. Linux doesn't directly program the
40 CAPP. 40 CAPP.
@@ -59,6 +59,17 @@ Hardware overview
59 the fault. The context to which this fault is serviced is based on 59 the fault. The context to which this fault is serviced is based on
60 who owns that acceleration function. 60 who owns that acceleration function.
61 61
62 POWER8 <-----> PSL Version 8 is compliant to the CAIA Version 1.0.
63 POWER9 <-----> PSL Version 9 is compliant to the CAIA Version 2.0.
64 This PSL Version 9 provides new features such as:
65 * Interaction with the nest MMU on the P9 chip.
66 * Native DMA support.
67 * Supports sending ASB_Notify messages for host thread wakeup.
68 * Supports Atomic operations.
69 * ....
70
71 Cards with a PSL9 won't work on a POWER8 system and cards with a
72 PSL8 won't work on a POWER9 system.
62 73
63AFU Modes 74AFU Modes
64========= 75=========
diff --git a/Documentation/powerpc/firmware-assisted-dump.txt b/Documentation/powerpc/firmware-assisted-dump.txt
index 3007bc98af28..19b1e3d09a19 100644
--- a/Documentation/powerpc/firmware-assisted-dump.txt
+++ b/Documentation/powerpc/firmware-assisted-dump.txt
@@ -105,21 +105,21 @@ memory is held.
105 105
106If there is no waiting dump data, then only the memory required 106If there is no waiting dump data, then only the memory required
107to hold CPU state, HPTE region, boot memory dump and elfcore 107to hold CPU state, HPTE region, boot memory dump and elfcore
108header, is reserved at the top of memory (see Fig. 1). This area 108header, is usually reserved at an offset greater than boot memory
109is *not* released: this region will be kept permanently reserved, 109size (see Fig. 1). This area is *not* released: this region will
110so that it can act as a receptacle for a copy of the boot memory 110be kept permanently reserved, so that it can act as a receptacle
111content in addition to CPU state and HPTE region, in the case a 111for a copy of the boot memory content in addition to CPU state
112crash does occur. 112and HPTE region, in the case a crash does occur.
113 113
114 o Memory Reservation during first kernel 114 o Memory Reservation during first kernel
115 115
116 Low memory Top of memory 116 Low memory Top of memory
117 0 boot memory size | 117 0 boot memory size |
118 | | |<--Reserved dump area -->| 118 | | |<--Reserved dump area -->| |
119 V V | Permanent Reservation V 119 V V | Permanent Reservation | V
120 +-----------+----------/ /----------+---+----+-----------+----+ 120 +-----------+----------/ /---+---+----+-----------+----+------+
121 | | |CPU|HPTE| DUMP |ELF | 121 | | |CPU|HPTE| DUMP |ELF | |
122 +-----------+----------/ /----------+---+----+-----------+----+ 122 +-----------+----------/ /---+---+----+-----------+----+------+
123 | ^ 123 | ^
124 | | 124 | |
125 \ / 125 \ /
@@ -135,12 +135,12 @@ crash does occur.
135 0 boot memory size | 135 0 boot memory size |
136 | |<------------- Reserved dump area ----------- -->| 136 | |<------------- Reserved dump area ----------- -->|
137 V V V 137 V V V
138 +-----------+----------/ /----------+---+----+-----------+----+ 138 +-----------+----------/ /---+---+----+-----------+----+------+
139 | | |CPU|HPTE| DUMP |ELF | 139 | | |CPU|HPTE| DUMP |ELF | |
140 +-----------+----------/ /----------+---+----+-----------+----+ 140 +-----------+----------/ /---+---+----+-----------+----+------+
141 | | 141 | |
142 V V 142 V V
143 Used by second /proc/vmcore 143 Used by second /proc/vmcore
144 kernel to boot 144 kernel to boot
145 Fig. 2 145 Fig. 2
146 146
diff --git a/MAINTAINERS b/MAINTAINERS
index 5ee3125f8341..b1f9b45f061f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5310,6 +5310,7 @@ M: Scott Wood <oss@buserror.net>
5310L: linuxppc-dev@lists.ozlabs.org 5310L: linuxppc-dev@lists.ozlabs.org
5311L: linux-arm-kernel@lists.infradead.org 5311L: linux-arm-kernel@lists.infradead.org
5312S: Maintained 5312S: Maintained
5313F: Documentation/devicetree/bindings/powerpc/fsl/
5313F: drivers/soc/fsl/ 5314F: drivers/soc/fsl/
5314F: include/linux/fsl/ 5315F: include/linux/fsl/
5315 5316
@@ -7568,7 +7569,7 @@ Q: http://patchwork.ozlabs.org/project/linuxppc-dev/list/
7568T: git git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git 7569T: git git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git
7569S: Supported 7570S: Supported
7570F: Documentation/ABI/stable/sysfs-firmware-opal-* 7571F: Documentation/ABI/stable/sysfs-firmware-opal-*
7571F: Documentation/devicetree/bindings/powerpc/opal/ 7572F: Documentation/devicetree/bindings/powerpc/
7572F: Documentation/devicetree/bindings/rtc/rtc-opal.txt 7573F: Documentation/devicetree/bindings/rtc/rtc-opal.txt
7573F: Documentation/devicetree/bindings/i2c/i2c-opal.txt 7574F: Documentation/devicetree/bindings/i2c/i2c-opal.txt
7574F: Documentation/powerpc/ 7575F: Documentation/powerpc/
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 053382616533..f07f727cbfd2 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -22,6 +22,48 @@ config MMU
22 bool 22 bool
23 default y 23 default y
24 24
25config ARCH_MMAP_RND_BITS_MAX
26 # On Book3S 64, the default virtual address space for 64-bit processes
27 # is 2^47 (128TB). As a maximum, allow randomisation to consume up to
28 # 32T of address space (2^45), which should ensure a reasonable gap
29 # between bottom-up and top-down allocations for applications that
30 # consume "normal" amounts of address space. Book3S 64 only supports 64K
31 # and 4K page sizes.
32 default 29 if PPC_BOOK3S_64 && PPC_64K_PAGES # 29 = 45 (32T) - 16 (64K)
33 default 33 if PPC_BOOK3S_64 # 33 = 45 (32T) - 12 (4K)
34 #
35 # On all other 64-bit platforms (currently only Book3E), the virtual
36 # address space is 2^46 (64TB). Allow randomisation to consume up to 16T
37 # of address space (2^44). Only 4K page sizes are supported.
38 default 32 if 64BIT # 32 = 44 (16T) - 12 (4K)
39 #
40 # For 32-bit, use the compat values, as they're the same.
41 default ARCH_MMAP_RND_COMPAT_BITS_MAX
42
43config ARCH_MMAP_RND_BITS_MIN
44 # Allow randomisation to consume up to 1GB of address space (2^30).
45 default 14 if 64BIT && PPC_64K_PAGES # 14 = 30 (1GB) - 16 (64K)
46 default 18 if 64BIT # 18 = 30 (1GB) - 12 (4K)
47 #
48 # For 32-bit, use the compat values, as they're the same.
49 default ARCH_MMAP_RND_COMPAT_BITS_MIN
50
51config ARCH_MMAP_RND_COMPAT_BITS_MAX
52 # Total virtual address space for 32-bit processes is 2^31 (2GB).
53 # Allow randomisation to consume up to 512MB of address space (2^29).
54 default 11 if PPC_256K_PAGES # 11 = 29 (512MB) - 18 (256K)
55 default 13 if PPC_64K_PAGES # 13 = 29 (512MB) - 16 (64K)
56 default 15 if PPC_16K_PAGES # 15 = 29 (512MB) - 14 (16K)
57 default 17 # 17 = 29 (512MB) - 12 (4K)
58
59config ARCH_MMAP_RND_COMPAT_BITS_MIN
60 # Total virtual address space for 32-bit processes is 2^31 (2GB).
61 # Allow randomisation to consume up to 8MB of address space (2^23).
62 default 5 if PPC_256K_PAGES # 5 = 23 (8MB) - 18 (256K)
63 default 7 if PPC_64K_PAGES # 7 = 23 (8MB) - 16 (64K)
64 default 9 if PPC_16K_PAGES # 9 = 23 (8MB) - 14 (16K)
65 default 11 # 11 = 23 (8MB) - 12 (4K)
66
25config HAVE_SETUP_PER_CPU_AREA 67config HAVE_SETUP_PER_CPU_AREA
26 def_bool PPC64 68 def_bool PPC64
27 69
@@ -38,6 +80,11 @@ config NR_IRQS
38 /proc/interrupts. If you configure your system to have too few, 80 /proc/interrupts. If you configure your system to have too few,
39 drivers will fail to load or worse - handle with care. 81 drivers will fail to load or worse - handle with care.
40 82
83config NMI_IPI
84 bool
85 depends on SMP && (DEBUGGER || KEXEC_CORE)
86 default y
87
41config STACKTRACE_SUPPORT 88config STACKTRACE_SUPPORT
42 bool 89 bool
43 default y 90 default y
@@ -119,6 +166,8 @@ config PPC
119 select HAVE_ARCH_AUDITSYSCALL 166 select HAVE_ARCH_AUDITSYSCALL
120 select HAVE_ARCH_JUMP_LABEL 167 select HAVE_ARCH_JUMP_LABEL
121 select HAVE_ARCH_KGDB 168 select HAVE_ARCH_KGDB
169 select HAVE_ARCH_MMAP_RND_BITS
170 select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
122 select HAVE_ARCH_SECCOMP_FILTER 171 select HAVE_ARCH_SECCOMP_FILTER
123 select HAVE_ARCH_TRACEHOOK 172 select HAVE_ARCH_TRACEHOOK
124 select HAVE_CBPF_JIT if !PPC64 173 select HAVE_CBPF_JIT if !PPC64
@@ -141,6 +190,7 @@ config PPC
141 select HAVE_IRQ_EXIT_ON_IRQ_STACK 190 select HAVE_IRQ_EXIT_ON_IRQ_STACK
142 select HAVE_KERNEL_GZIP 191 select HAVE_KERNEL_GZIP
143 select HAVE_KPROBES 192 select HAVE_KPROBES
193 select HAVE_KPROBES_ON_FTRACE
144 select HAVE_KRETPROBES 194 select HAVE_KRETPROBES
145 select HAVE_LIVEPATCH if HAVE_DYNAMIC_FTRACE_WITH_REGS 195 select HAVE_LIVEPATCH if HAVE_DYNAMIC_FTRACE_WITH_REGS
146 select HAVE_MEMBLOCK 196 select HAVE_MEMBLOCK
@@ -489,7 +539,7 @@ config KEXEC_FILE
489 539
490config RELOCATABLE 540config RELOCATABLE
491 bool "Build a relocatable kernel" 541 bool "Build a relocatable kernel"
492 depends on (PPC64 && !COMPILE_TEST) || (FLATMEM && (44x || FSL_BOOKE)) 542 depends on PPC64 || (FLATMEM && (44x || FSL_BOOKE))
493 select NONSTATIC_KERNEL 543 select NONSTATIC_KERNEL
494 select MODULE_REL_CRCS if MODVERSIONS 544 select MODULE_REL_CRCS if MODVERSIONS
495 help 545 help
@@ -523,7 +573,7 @@ config RELOCATABLE_TEST
523config CRASH_DUMP 573config CRASH_DUMP
524 bool "Build a kdump crash kernel" 574 bool "Build a kdump crash kernel"
525 depends on PPC64 || 6xx || FSL_BOOKE || (44x && !SMP) 575 depends on PPC64 || 6xx || FSL_BOOKE || (44x && !SMP)
526 select RELOCATABLE if (PPC64 && !COMPILE_TEST) || 44x || FSL_BOOKE 576 select RELOCATABLE if PPC64 || 44x || FSL_BOOKE
527 help 577 help
528 Build a kernel suitable for use as a kdump capture kernel. 578 Build a kernel suitable for use as a kdump capture kernel.
529 The same kernel binary can be used as production kernel and dump 579 The same kernel binary can be used as production kernel and dump
@@ -585,7 +635,7 @@ config ARCH_SPARSEMEM_ENABLE
585 635
586config ARCH_SPARSEMEM_DEFAULT 636config ARCH_SPARSEMEM_DEFAULT
587 def_bool y 637 def_bool y
588 depends on (SMP && PPC_PSERIES) || PPC_PS3 638 depends on PPC_BOOK3S_64
589 639
590config SYS_SUPPORTS_HUGETLBFS 640config SYS_SUPPORTS_HUGETLBFS
591 bool 641 bool
@@ -677,6 +727,16 @@ config PPC_256K_PAGES
677 727
678endchoice 728endchoice
679 729
730config THREAD_SHIFT
731 int "Thread shift" if EXPERT
732 range 13 15
733 default "15" if PPC_256K_PAGES
734 default "14" if PPC64
735 default "13"
736 help
737 Used to define the stack size. The default is almost always what you
738 want. Only change this if you know what you are doing.
739
680config FORCE_MAX_ZONEORDER 740config FORCE_MAX_ZONEORDER
681 int "Maximum zone order" 741 int "Maximum zone order"
682 range 8 9 if PPC64 && PPC_64K_PAGES 742 range 8 9 if PPC64 && PPC_64K_PAGES
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 19b0d1a81959..3e0f0e1fadef 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -136,7 +136,7 @@ CFLAGS-$(CONFIG_GENERIC_CPU) += -mcpu=powerpc64
136endif 136endif
137 137
138ifdef CONFIG_MPROFILE_KERNEL 138ifdef CONFIG_MPROFILE_KERNEL
139 ifeq ($(shell $(srctree)/arch/powerpc/scripts/gcc-check-mprofile-kernel.sh $(CC) -I$(srctree)/include -D__KERNEL__),OK) 139 ifeq ($(shell $(srctree)/arch/powerpc/tools/gcc-check-mprofile-kernel.sh $(CC) -I$(srctree)/include -D__KERNEL__),OK)
140 CC_FLAGS_FTRACE := -pg -mprofile-kernel 140 CC_FLAGS_FTRACE := -pg -mprofile-kernel
141 KBUILD_CPPFLAGS += -DCC_USING_MPROFILE_KERNEL 141 KBUILD_CPPFLAGS += -DCC_USING_MPROFILE_KERNEL
142 else 142 else
@@ -274,17 +274,6 @@ PHONY += $(BOOT_TARGETS1) $(BOOT_TARGETS2)
274 274
275boot := arch/$(ARCH)/boot 275boot := arch/$(ARCH)/boot
276 276
277ifeq ($(CONFIG_RELOCATABLE),y)
278quiet_cmd_relocs_check = CALL $<
279 cmd_relocs_check = $(CONFIG_SHELL) $< "$(OBJDUMP)" "$(obj)/vmlinux"
280
281PHONY += relocs_check
282relocs_check: arch/powerpc/relocs_check.sh vmlinux
283 $(call cmd,relocs_check)
284
285zImage: relocs_check
286endif
287
288$(BOOT_TARGETS1): vmlinux 277$(BOOT_TARGETS1): vmlinux
289 $(Q)$(MAKE) $(build)=$(boot) $(patsubst %,$(boot)/%,$@) 278 $(Q)$(MAKE) $(build)=$(boot) $(patsubst %,$(boot)/%,$@)
290$(BOOT_TARGETS2): vmlinux 279$(BOOT_TARGETS2): vmlinux
diff --git a/arch/powerpc/Makefile.postlink b/arch/powerpc/Makefile.postlink
new file mode 100644
index 000000000000..3c22d64b2de9
--- /dev/null
+++ b/arch/powerpc/Makefile.postlink
@@ -0,0 +1,34 @@
1# ===========================================================================
2# Post-link powerpc pass
3# ===========================================================================
4#
5# 1. Check that vmlinux relocations look sane
6
7PHONY := __archpost
8__archpost:
9
10include include/config/auto.conf
11include scripts/Kbuild.include
12
13quiet_cmd_relocs_check = CHKREL $@
14 cmd_relocs_check = $(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/relocs_check.sh "$(OBJDUMP)" "$@"
15
16# `@true` prevents complaint when there is nothing to be done
17
18vmlinux: FORCE
19 @true
20ifdef CONFIG_RELOCATABLE
21 $(call if_changed,relocs_check)
22endif
23
24%.ko: FORCE
25 @true
26
27clean:
28 @true
29
30PHONY += FORCE clean
31
32FORCE:
33
34.PHONY: $(PHONY)
diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig
index ac8b8332ed82..0695ce047d56 100644
--- a/arch/powerpc/configs/powernv_defconfig
+++ b/arch/powerpc/configs/powernv_defconfig
@@ -33,7 +33,7 @@ CONFIG_BLK_DEV_INITRD=y
33CONFIG_BPF_SYSCALL=y 33CONFIG_BPF_SYSCALL=y
34# CONFIG_COMPAT_BRK is not set 34# CONFIG_COMPAT_BRK is not set
35CONFIG_PROFILING=y 35CONFIG_PROFILING=y
36CONFIG_OPROFILE=y 36CONFIG_OPROFILE=m
37CONFIG_KPROBES=y 37CONFIG_KPROBES=y
38CONFIG_JUMP_LABEL=y 38CONFIG_JUMP_LABEL=y
39CONFIG_MODULES=y 39CONFIG_MODULES=y
@@ -261,7 +261,7 @@ CONFIG_NILFS2_FS=m
261CONFIG_AUTOFS4_FS=m 261CONFIG_AUTOFS4_FS=m
262CONFIG_FUSE_FS=m 262CONFIG_FUSE_FS=m
263CONFIG_OVERLAY_FS=m 263CONFIG_OVERLAY_FS=m
264CONFIG_ISO9660_FS=m 264CONFIG_ISO9660_FS=y
265CONFIG_UDF_FS=m 265CONFIG_UDF_FS=m
266CONFIG_MSDOS_FS=y 266CONFIG_MSDOS_FS=y
267CONFIG_VFAT_FS=m 267CONFIG_VFAT_FS=m
@@ -306,7 +306,7 @@ CONFIG_CRYPTO_TEST=m
306CONFIG_CRYPTO_CCM=m 306CONFIG_CRYPTO_CCM=m
307CONFIG_CRYPTO_PCBC=m 307CONFIG_CRYPTO_PCBC=m
308CONFIG_CRYPTO_HMAC=y 308CONFIG_CRYPTO_HMAC=y
309CONFIG_CRYPT_CRC32C_VPMSUM=m 309CONFIG_CRYPTO_CRC32C_VPMSUM=m
310CONFIG_CRYPTO_MD5_PPC=m 310CONFIG_CRYPTO_MD5_PPC=m
311CONFIG_CRYPTO_MICHAEL_MIC=m 311CONFIG_CRYPTO_MICHAEL_MIC=m
312CONFIG_CRYPTO_SHA256=y 312CONFIG_CRYPTO_SHA256=y
diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig
index f2e03f032041..5175028c56ce 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -19,7 +19,7 @@ CONFIG_BLK_DEV_INITRD=y
19CONFIG_BPF_SYSCALL=y 19CONFIG_BPF_SYSCALL=y
20# CONFIG_COMPAT_BRK is not set 20# CONFIG_COMPAT_BRK is not set
21CONFIG_PROFILING=y 21CONFIG_PROFILING=y
22CONFIG_OPROFILE=y 22CONFIG_OPROFILE=m
23CONFIG_KPROBES=y 23CONFIG_KPROBES=y
24CONFIG_JUMP_LABEL=y 24CONFIG_JUMP_LABEL=y
25CONFIG_MODULES=y 25CONFIG_MODULES=y
@@ -290,7 +290,7 @@ CONFIG_NILFS2_FS=m
290CONFIG_AUTOFS4_FS=m 290CONFIG_AUTOFS4_FS=m
291CONFIG_FUSE_FS=m 291CONFIG_FUSE_FS=m
292CONFIG_OVERLAY_FS=m 292CONFIG_OVERLAY_FS=m
293CONFIG_ISO9660_FS=m 293CONFIG_ISO9660_FS=y
294CONFIG_UDF_FS=m 294CONFIG_UDF_FS=m
295CONFIG_MSDOS_FS=y 295CONFIG_MSDOS_FS=y
296CONFIG_VFAT_FS=m 296CONFIG_VFAT_FS=m
@@ -339,7 +339,7 @@ CONFIG_PPC_EARLY_DEBUG=y
339CONFIG_CRYPTO_TEST=m 339CONFIG_CRYPTO_TEST=m
340CONFIG_CRYPTO_PCBC=m 340CONFIG_CRYPTO_PCBC=m
341CONFIG_CRYPTO_HMAC=y 341CONFIG_CRYPTO_HMAC=y
342CONFIG_CRYPT_CRC32C_VPMSUM=m 342CONFIG_CRYPTO_CRC32C_VPMSUM=m
343CONFIG_CRYPTO_MD5_PPC=m 343CONFIG_CRYPTO_MD5_PPC=m
344CONFIG_CRYPTO_MICHAEL_MIC=m 344CONFIG_CRYPTO_MICHAEL_MIC=m
345CONFIG_CRYPTO_SHA256=y 345CONFIG_CRYPTO_SHA256=y
diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig
index 4ff68b752618..1a61aa20dfba 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -34,7 +34,7 @@ CONFIG_BLK_DEV_INITRD=y
34CONFIG_BPF_SYSCALL=y 34CONFIG_BPF_SYSCALL=y
35# CONFIG_COMPAT_BRK is not set 35# CONFIG_COMPAT_BRK is not set
36CONFIG_PROFILING=y 36CONFIG_PROFILING=y
37CONFIG_OPROFILE=y 37CONFIG_OPROFILE=m
38CONFIG_KPROBES=y 38CONFIG_KPROBES=y
39CONFIG_JUMP_LABEL=y 39CONFIG_JUMP_LABEL=y
40CONFIG_MODULES=y 40CONFIG_MODULES=y
@@ -259,7 +259,7 @@ CONFIG_NILFS2_FS=m
259CONFIG_AUTOFS4_FS=m 259CONFIG_AUTOFS4_FS=m
260CONFIG_FUSE_FS=m 260CONFIG_FUSE_FS=m
261CONFIG_OVERLAY_FS=m 261CONFIG_OVERLAY_FS=m
262CONFIG_ISO9660_FS=m 262CONFIG_ISO9660_FS=y
263CONFIG_UDF_FS=m 263CONFIG_UDF_FS=m
264CONFIG_MSDOS_FS=y 264CONFIG_MSDOS_FS=y
265CONFIG_VFAT_FS=m 265CONFIG_VFAT_FS=m
@@ -303,7 +303,7 @@ CONFIG_XMON=y
303CONFIG_CRYPTO_TEST=m 303CONFIG_CRYPTO_TEST=m
304CONFIG_CRYPTO_PCBC=m 304CONFIG_CRYPTO_PCBC=m
305CONFIG_CRYPTO_HMAC=y 305CONFIG_CRYPTO_HMAC=y
306CONFIG_CRYPT_CRC32C_VPMSUM=m 306CONFIG_CRYPTO_CRC32C_VPMSUM=m
307CONFIG_CRYPTO_MD5_PPC=m 307CONFIG_CRYPTO_MD5_PPC=m
308CONFIG_CRYPTO_MICHAEL_MIC=m 308CONFIG_CRYPTO_MICHAEL_MIC=m
309CONFIG_CRYPTO_SHA256=y 309CONFIG_CRYPTO_SHA256=y
diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index f6c5264287e5..7330150bfe34 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -17,6 +17,8 @@
17#include <asm/checksum.h> 17#include <asm/checksum.h>
18#include <linux/uaccess.h> 18#include <linux/uaccess.h>
19#include <asm/epapr_hcalls.h> 19#include <asm/epapr_hcalls.h>
20#include <asm/dcr.h>
21#include <asm/mmu_context.h>
20 22
21#include <uapi/asm/ucontext.h> 23#include <uapi/asm/ucontext.h>
22 24
@@ -120,6 +122,8 @@ extern s64 __ashrdi3(s64, int);
120extern int __cmpdi2(s64, s64); 122extern int __cmpdi2(s64, s64);
121extern int __ucmpdi2(u64, u64); 123extern int __ucmpdi2(u64, u64);
122 124
125/* tracing */
123void _mcount(void); 126void _mcount(void);
127unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip);
124 128
125#endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */ 129#endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */
diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h
index bc5fdfd22788..33a24fdd7958 100644
--- a/arch/powerpc/include/asm/bitops.h
+++ b/arch/powerpc/include/asm/bitops.h
@@ -55,6 +55,14 @@
55#define PPC_BITEXTRACT(bits, ppc_bit, dst_bit) \ 55#define PPC_BITEXTRACT(bits, ppc_bit, dst_bit) \
56 ((((bits) >> PPC_BITLSHIFT(ppc_bit)) & 1) << (dst_bit)) 56 ((((bits) >> PPC_BITLSHIFT(ppc_bit)) & 1) << (dst_bit))
57 57
58#define PPC_BITLSHIFT32(be) (32 - 1 - (be))
59#define PPC_BIT32(bit) (1UL << PPC_BITLSHIFT32(bit))
60#define PPC_BITMASK32(bs, be) ((PPC_BIT32(bs) - PPC_BIT32(be))|PPC_BIT32(bs))
61
62#define PPC_BITLSHIFT8(be) (8 - 1 - (be))
63#define PPC_BIT8(bit) (1UL << PPC_BITLSHIFT8(bit))
64#define PPC_BITMASK8(bs, be) ((PPC_BIT8(bs) - PPC_BIT8(be))|PPC_BIT8(bs))
65
58#include <asm/barrier.h> 66#include <asm/barrier.h>
59 67
60/* Macro for generating the ***_bits() functions */ 68/* Macro for generating the ***_bits() functions */
diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index 0c4e470571ca..b4b5e6b671ca 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -8,7 +8,7 @@
8#define H_PTE_INDEX_SIZE 9 8#define H_PTE_INDEX_SIZE 9
9#define H_PMD_INDEX_SIZE 7 9#define H_PMD_INDEX_SIZE 7
10#define H_PUD_INDEX_SIZE 9 10#define H_PUD_INDEX_SIZE 9
11#define H_PGD_INDEX_SIZE 9 11#define H_PGD_INDEX_SIZE 12
12 12
13#ifndef __ASSEMBLY__ 13#ifndef __ASSEMBLY__
14#define H_PTE_TABLE_SIZE (sizeof(pte_t) << H_PTE_INDEX_SIZE) 14#define H_PTE_TABLE_SIZE (sizeof(pte_t) << H_PTE_INDEX_SIZE)
diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index f3dd21efa2ea..214219dff87c 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -4,10 +4,14 @@
4#define H_PTE_INDEX_SIZE 8 4#define H_PTE_INDEX_SIZE 8
5#define H_PMD_INDEX_SIZE 5 5#define H_PMD_INDEX_SIZE 5
6#define H_PUD_INDEX_SIZE 5 6#define H_PUD_INDEX_SIZE 5
7#define H_PGD_INDEX_SIZE 12 7#define H_PGD_INDEX_SIZE 15
8 8
9#define H_PAGE_COMBO 0x00001000 /* this is a combo 4k page */ 9/*
10#define H_PAGE_4K_PFN 0x00002000 /* PFN is for a single 4k page */ 10 * 64k aligned address free up few of the lower bits of RPN for us
11 * We steal that here. For more deatils look at pte_pfn/pfn_pte()
12 */
13#define H_PAGE_COMBO _RPAGE_RPN0 /* this is a combo 4k page */
14#define H_PAGE_4K_PFN _RPAGE_RPN1 /* PFN is for a single 4k page */
11/* 15/*
12 * We need to differentiate between explicit huge page and THP huge 16 * We need to differentiate between explicit huge page and THP huge
13 * page, since THP huge page also need to track real subpage details 17 * page, since THP huge page also need to track real subpage details
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index f7b721bbf918..4e957b027fe0 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -6,19 +6,13 @@
6 * Common bits between 4K and 64K pages in a linux-style PTE. 6 * Common bits between 4K and 64K pages in a linux-style PTE.
7 * Additional bits may be defined in pgtable-hash64-*.h 7 * Additional bits may be defined in pgtable-hash64-*.h
8 * 8 *
9 * Note: We only support user read/write permissions. Supervisor always
10 * have full read/write to pages above PAGE_OFFSET (pages below that
11 * always use the user access permissions).
12 *
13 * We could create separate kernel read-only if we used the 3 PP bits
14 * combinations that newer processors provide but we currently don't.
15 */ 9 */
16#define H_PAGE_BUSY 0x00800 /* software: PTE & hash are busy */
17#define H_PTE_NONE_MASK _PAGE_HPTEFLAGS 10#define H_PTE_NONE_MASK _PAGE_HPTEFLAGS
18#define H_PAGE_F_GIX_SHIFT 57 11#define H_PAGE_F_GIX_SHIFT 56
19#define H_PAGE_F_GIX (7ul << 57) /* HPTE index within HPTEG */ 12#define H_PAGE_BUSY _RPAGE_RSV1 /* software: PTE & hash are busy */
20#define H_PAGE_F_SECOND (1ul << 60) /* HPTE is in 2ndary HPTEG */ 13#define H_PAGE_F_SECOND _RPAGE_RSV2 /* HPTE is in 2ndary HPTEG */
21#define H_PAGE_HASHPTE (1ul << 61) /* PTE has associated HPTE */ 14#define H_PAGE_F_GIX (_RPAGE_RSV3 | _RPAGE_RSV4 | _RPAGE_RPN44)
15#define H_PAGE_HASHPTE _RPAGE_RPN43 /* PTE has associated HPTE */
22 16
23#ifdef CONFIG_PPC_64K_PAGES 17#ifdef CONFIG_PPC_64K_PAGES
24#include <asm/book3s/64/hash-64k.h> 18#include <asm/book3s/64/hash-64k.h>
diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h
index c62f14d0bec1..6666cd366596 100644
--- a/arch/powerpc/include/asm/book3s/64/hugetlb.h
+++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h
@@ -46,7 +46,7 @@ static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
46 */ 46 */
47 VM_WARN_ON(page_shift == mmu_psize_defs[MMU_PAGE_1G].shift); 47 VM_WARN_ON(page_shift == mmu_psize_defs[MMU_PAGE_1G].shift);
48 if (page_shift == mmu_psize_defs[MMU_PAGE_2M].shift) 48 if (page_shift == mmu_psize_defs[MMU_PAGE_2M].shift)
49 return __pte(pte_val(entry) | _PAGE_LARGE); 49 return __pte(pte_val(entry) | R_PAGE_LARGE);
50 else 50 else
51 return entry; 51 return entry;
52} 52}
diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 52d8d1e4b772..6981a52b3887 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -39,6 +39,7 @@
39 39
40/* Bits in the SLB VSID word */ 40/* Bits in the SLB VSID word */
41#define SLB_VSID_SHIFT 12 41#define SLB_VSID_SHIFT 12
42#define SLB_VSID_SHIFT_256M SLB_VSID_SHIFT
42#define SLB_VSID_SHIFT_1T 24 43#define SLB_VSID_SHIFT_1T 24
43#define SLB_VSID_SSIZE_SHIFT 62 44#define SLB_VSID_SSIZE_SHIFT 62
44#define SLB_VSID_B ASM_CONST(0xc000000000000000) 45#define SLB_VSID_B ASM_CONST(0xc000000000000000)
@@ -408,7 +409,7 @@ static inline unsigned long hpt_vpn(unsigned long ea,
408static inline unsigned long hpt_hash(unsigned long vpn, 409static inline unsigned long hpt_hash(unsigned long vpn,
409 unsigned int shift, int ssize) 410 unsigned int shift, int ssize)
410{ 411{
411 int mask; 412 unsigned long mask;
412 unsigned long hash, vsid; 413 unsigned long hash, vsid;
413 414
414 /* VPN_SHIFT can be atmost 12 */ 415 /* VPN_SHIFT can be atmost 12 */
@@ -491,13 +492,14 @@ extern void slb_set_size(u16 size);
491 * We first generate a 37-bit "proto-VSID". Proto-VSIDs are generated 492 * We first generate a 37-bit "proto-VSID". Proto-VSIDs are generated
492 * from mmu context id and effective segment id of the address. 493 * from mmu context id and effective segment id of the address.
493 * 494 *
494 * For user processes max context id is limited to ((1ul << 19) - 5) 495 * For user processes max context id is limited to MAX_USER_CONTEXT.
495 * for kernel space, we use the top 4 context ids to map address as below 496
497 * For kernel space, we use context ids 1-4 to map addresses as below:
496 * NOTE: each context only support 64TB now. 498 * NOTE: each context only support 64TB now.
497 * 0x7fffc - [ 0xc000000000000000 - 0xc0003fffffffffff ] 499 * 0x00001 - [ 0xc000000000000000 - 0xc0003fffffffffff ]
498 * 0x7fffd - [ 0xd000000000000000 - 0xd0003fffffffffff ] 500 * 0x00002 - [ 0xd000000000000000 - 0xd0003fffffffffff ]
499 * 0x7fffe - [ 0xe000000000000000 - 0xe0003fffffffffff ] 501 * 0x00003 - [ 0xe000000000000000 - 0xe0003fffffffffff ]
500 * 0x7ffff - [ 0xf000000000000000 - 0xf0003fffffffffff ] 502 * 0x00004 - [ 0xf000000000000000 - 0xf0003fffffffffff ]
501 * 503 *
502 * The proto-VSIDs are then scrambled into real VSIDs with the 504 * The proto-VSIDs are then scrambled into real VSIDs with the
503 * multiplicative hash: 505 * multiplicative hash:
@@ -511,20 +513,28 @@ extern void slb_set_size(u16 size);
511 * robust scattering in the hash table (at least based on some initial 513 * robust scattering in the hash table (at least based on some initial
512 * results). 514 * results).
513 * 515 *
514 * We also consider VSID 0 special. We use VSID 0 for slb entries mapping 516 * We use VSID 0 to indicate an invalid VSID. The means we can't use context id
515 * bad address. This enables us to consolidate bad address handling in 517 * 0, because a context id of 0 and an EA of 0 gives a proto-VSID of 0, which
516 * hash_page. 518 * will produce a VSID of 0.
517 * 519 *
518 * We also need to avoid the last segment of the last context, because that 520 * We also need to avoid the last segment of the last context, because that
519 * would give a protovsid of 0x1fffffffff. That will result in a VSID 0 521 * would give a protovsid of 0x1fffffffff. That will result in a VSID 0
520 * because of the modulo operation in vsid scramble. But the vmemmap 522 * because of the modulo operation in vsid scramble.
521 * (which is what uses region 0xf) will never be close to 64TB in size
522 * (it's 56 bytes per page of system memory).
523 */ 523 */
524 524
525/*
526 * Max Va bits we support as of now is 68 bits. We want 19 bit
527 * context ID.
528 * Restrictions:
529 * GPU has restrictions of not able to access beyond 128TB
530 * (47 bit effective address). We also cannot do more than 20bit PID.
531 * For p4 and p5 which can only do 65 bit VA, we restrict our CONTEXT_BITS
532 * to 16 bits (ie, we can only have 2^16 pids at the same time).
533 */
534#define VA_BITS 68
525#define CONTEXT_BITS 19 535#define CONTEXT_BITS 19
526#define ESID_BITS 18 536#define ESID_BITS (VA_BITS - (SID_SHIFT + CONTEXT_BITS))
527#define ESID_BITS_1T 6 537#define ESID_BITS_1T (VA_BITS - (SID_SHIFT_1T + CONTEXT_BITS))
528 538
529#define ESID_BITS_MASK ((1 << ESID_BITS) - 1) 539#define ESID_BITS_MASK ((1 << ESID_BITS) - 1)
530#define ESID_BITS_1T_MASK ((1 << ESID_BITS_1T) - 1) 540#define ESID_BITS_1T_MASK ((1 << ESID_BITS_1T) - 1)
@@ -532,63 +542,70 @@ extern void slb_set_size(u16 size);
532/* 542/*
533 * 256MB segment 543 * 256MB segment
534 * The proto-VSID space has 2^(CONTEX_BITS + ESID_BITS) - 1 segments 544 * The proto-VSID space has 2^(CONTEX_BITS + ESID_BITS) - 1 segments
535 * available for user + kernel mapping. The top 4 contexts are used for 545 * available for user + kernel mapping. VSID 0 is reserved as invalid, contexts
536 * kernel mapping. Each segment contains 2^28 bytes. Each 546 * 1-4 are used for kernel mapping. Each segment contains 2^28 bytes. Each
537 * context maps 2^46 bytes (64TB) so we can support 2^19-1 contexts 547 * context maps 2^49 bytes (512TB).
538 * (19 == 37 + 28 - 46). 548 *
549 * We also need to avoid the last segment of the last context, because that
550 * would give a protovsid of 0x1fffffffff. That will result in a VSID 0
551 * because of the modulo operation in vsid scramble.
552 */
553#define MAX_USER_CONTEXT ((ASM_CONST(1) << CONTEXT_BITS) - 2)
554#define MIN_USER_CONTEXT (5)
555
556/* Would be nice to use KERNEL_REGION_ID here */
557#define KERNEL_REGION_CONTEXT_OFFSET (0xc - 1)
558
559/*
560 * For platforms that support on 65bit VA we limit the context bits
539 */ 561 */
540#define MAX_USER_CONTEXT ((ASM_CONST(1) << CONTEXT_BITS) - 5) 562#define MAX_USER_CONTEXT_65BIT_VA ((ASM_CONST(1) << (65 - (SID_SHIFT + ESID_BITS))) - 2)
541 563
542/* 564/*
543 * This should be computed such that protovosid * vsid_mulitplier 565 * This should be computed such that protovosid * vsid_mulitplier
544 * doesn't overflow 64 bits. It should also be co-prime to vsid_modulus 566 * doesn't overflow 64 bits. The vsid_mutliplier should also be
567 * co-prime to vsid_modulus. We also need to make sure that number
568 * of bits in multiplied result (dividend) is less than twice the number of
569 * protovsid bits for our modulus optmization to work.
570 *
571 * The below table shows the current values used.
572 * |-------+------------+----------------------+------------+-------------------|
573 * | | Prime Bits | proto VSID_BITS_65VA | Total Bits | 2* prot VSID_BITS |
574 * |-------+------------+----------------------+------------+-------------------|
575 * | 1T | 24 | 25 | 49 | 50 |
576 * |-------+------------+----------------------+------------+-------------------|
577 * | 256MB | 24 | 37 | 61 | 74 |
578 * |-------+------------+----------------------+------------+-------------------|
579 *
580 * |-------+------------+----------------------+------------+--------------------|
581 * | | Prime Bits | proto VSID_BITS_68VA | Total Bits | 2* proto VSID_BITS |
582 * |-------+------------+----------------------+------------+--------------------|
583 * | 1T | 24 | 28 | 52 | 56 |
584 * |-------+------------+----------------------+------------+--------------------|
585 * | 256MB | 24 | 40 | 64 | 80 |
586 * |-------+------------+----------------------+------------+--------------------|
587 *
545 */ 588 */
546#define VSID_MULTIPLIER_256M ASM_CONST(12538073) /* 24-bit prime */ 589#define VSID_MULTIPLIER_256M ASM_CONST(12538073) /* 24-bit prime */
547#define VSID_BITS_256M (CONTEXT_BITS + ESID_BITS) 590#define VSID_BITS_256M (VA_BITS - SID_SHIFT)
548#define VSID_MODULUS_256M ((1UL<<VSID_BITS_256M)-1) 591#define VSID_BITS_65_256M (65 - SID_SHIFT)
592/*
593 * Modular multiplicative inverse of VSID_MULTIPLIER under modulo VSID_MODULUS
594 */
595#define VSID_MULINV_256M ASM_CONST(665548017062)
549 596
550#define VSID_MULTIPLIER_1T ASM_CONST(12538073) /* 24-bit prime */ 597#define VSID_MULTIPLIER_1T ASM_CONST(12538073) /* 24-bit prime */
551#define VSID_BITS_1T (CONTEXT_BITS + ESID_BITS_1T) 598#define VSID_BITS_1T (VA_BITS - SID_SHIFT_1T)
552#define VSID_MODULUS_1T ((1UL<<VSID_BITS_1T)-1) 599#define VSID_BITS_65_1T (65 - SID_SHIFT_1T)
553 600#define VSID_MULINV_1T ASM_CONST(209034062)
554 601
602/* 1TB VSID reserved for VRMA */
603#define VRMA_VSID 0x1ffffffUL
555#define USER_VSID_RANGE (1UL << (ESID_BITS + SID_SHIFT)) 604#define USER_VSID_RANGE (1UL << (ESID_BITS + SID_SHIFT))
556 605
557/*
558 * This macro generates asm code to compute the VSID scramble
559 * function. Used in slb_allocate() and do_stab_bolted. The function
560 * computed is: (protovsid*VSID_MULTIPLIER) % VSID_MODULUS
561 *
562 * rt = register containing the proto-VSID and into which the
563 * VSID will be stored
564 * rx = scratch register (clobbered)
565 *
566 * - rt and rx must be different registers
567 * - The answer will end up in the low VSID_BITS bits of rt. The higher
568 * bits may contain other garbage, so you may need to mask the
569 * result.
570 */
571#define ASM_VSID_SCRAMBLE(rt, rx, size) \
572 lis rx,VSID_MULTIPLIER_##size@h; \
573 ori rx,rx,VSID_MULTIPLIER_##size@l; \
574 mulld rt,rt,rx; /* rt = rt * MULTIPLIER */ \
575 \
576 srdi rx,rt,VSID_BITS_##size; \
577 clrldi rt,rt,(64-VSID_BITS_##size); \
578 add rt,rt,rx; /* add high and low bits */ \
579 /* NOTE: explanation based on VSID_BITS_##size = 36 \
580 * Now, r3 == VSID (mod 2^36-1), and lies between 0 and \
581 * 2^36-1+2^28-1. That in particular means that if r3 >= \
582 * 2^36-1, then r3+1 has the 2^36 bit set. So, if r3+1 has \
583 * the bit clear, r3 already has the answer we want, if it \
584 * doesn't, the answer is the low 36 bits of r3+1. So in all \
585 * cases the answer is the low 36 bits of (r3 + ((r3+1) >> 36))*/\
586 addi rx,rt,1; \
587 srdi rx,rx,VSID_BITS_##size; /* extract 2^VSID_BITS bit */ \
588 add rt,rt,rx
589
590/* 4 bits per slice and we have one slice per 1TB */ 606/* 4 bits per slice and we have one slice per 1TB */
591#define SLICE_ARRAY_SIZE (H_PGTABLE_RANGE >> 41) 607#define SLICE_ARRAY_SIZE (H_PGTABLE_RANGE >> 41)
608#define TASK_SLICE_ARRAY_SZ(x) ((x)->context.addr_limit >> 41)
592 609
593#ifndef __ASSEMBLY__ 610#ifndef __ASSEMBLY__
594 611
@@ -634,7 +651,7 @@ static inline void subpage_prot_init_new_context(struct mm_struct *mm) { }
634#define vsid_scramble(protovsid, size) \ 651#define vsid_scramble(protovsid, size) \
635 ((((protovsid) * VSID_MULTIPLIER_##size) % VSID_MODULUS_##size)) 652 ((((protovsid) * VSID_MULTIPLIER_##size) % VSID_MODULUS_##size))
636 653
637#else /* 1 */ 654/* simplified form avoiding mod operation */
638#define vsid_scramble(protovsid, size) \ 655#define vsid_scramble(protovsid, size) \
639 ({ \ 656 ({ \
640 unsigned long x; \ 657 unsigned long x; \
@@ -642,6 +659,21 @@ static inline void subpage_prot_init_new_context(struct mm_struct *mm) { }
642 x = (x >> VSID_BITS_##size) + (x & VSID_MODULUS_##size); \ 659 x = (x >> VSID_BITS_##size) + (x & VSID_MODULUS_##size); \
643 (x + ((x+1) >> VSID_BITS_##size)) & VSID_MODULUS_##size; \ 660 (x + ((x+1) >> VSID_BITS_##size)) & VSID_MODULUS_##size; \
644 }) 661 })
662
663#else /* 1 */
664static inline unsigned long vsid_scramble(unsigned long protovsid,
665 unsigned long vsid_multiplier, int vsid_bits)
666{
667 unsigned long vsid;
668 unsigned long vsid_modulus = ((1UL << vsid_bits) - 1);
669 /*
670 * We have same multipler for both 256 and 1T segements now
671 */
672 vsid = protovsid * vsid_multiplier;
673 vsid = (vsid >> vsid_bits) + (vsid & vsid_modulus);
674 return (vsid + ((vsid + 1) >> vsid_bits)) & vsid_modulus;
675}
676
645#endif /* 1 */ 677#endif /* 1 */
646 678
647/* Returns the segment size indicator for a user address */ 679/* Returns the segment size indicator for a user address */
@@ -656,36 +688,56 @@ static inline int user_segment_size(unsigned long addr)
656static inline unsigned long get_vsid(unsigned long context, unsigned long ea, 688static inline unsigned long get_vsid(unsigned long context, unsigned long ea,
657 int ssize) 689 int ssize)
658{ 690{
691 unsigned long va_bits = VA_BITS;
692 unsigned long vsid_bits;
693 unsigned long protovsid;
694
659 /* 695 /*
660 * Bad address. We return VSID 0 for that 696 * Bad address. We return VSID 0 for that
661 */ 697 */
662 if ((ea & ~REGION_MASK) >= H_PGTABLE_RANGE) 698 if ((ea & ~REGION_MASK) >= H_PGTABLE_RANGE)
663 return 0; 699 return 0;
664 700
665 if (ssize == MMU_SEGSIZE_256M) 701 if (!mmu_has_feature(MMU_FTR_68_BIT_VA))
666 return vsid_scramble((context << ESID_BITS) 702 va_bits = 65;
667 | ((ea >> SID_SHIFT) & ESID_BITS_MASK), 256M); 703
668 return vsid_scramble((context << ESID_BITS_1T) 704 if (ssize == MMU_SEGSIZE_256M) {
669 | ((ea >> SID_SHIFT_1T) & ESID_BITS_1T_MASK), 1T); 705 vsid_bits = va_bits - SID_SHIFT;
706 protovsid = (context << ESID_BITS) |
707 ((ea >> SID_SHIFT) & ESID_BITS_MASK);
708 return vsid_scramble(protovsid, VSID_MULTIPLIER_256M, vsid_bits);
709 }
710 /* 1T segment */
711 vsid_bits = va_bits - SID_SHIFT_1T;
712 protovsid = (context << ESID_BITS_1T) |
713 ((ea >> SID_SHIFT_1T) & ESID_BITS_1T_MASK);
714 return vsid_scramble(protovsid, VSID_MULTIPLIER_1T, vsid_bits);
670} 715}
671 716
672/* 717/*
673 * This is only valid for addresses >= PAGE_OFFSET 718 * This is only valid for addresses >= PAGE_OFFSET
674 *
675 * For kernel space, we use the top 4 context ids to map address as below
676 * 0x7fffc - [ 0xc000000000000000 - 0xc0003fffffffffff ]
677 * 0x7fffd - [ 0xd000000000000000 - 0xd0003fffffffffff ]
678 * 0x7fffe - [ 0xe000000000000000 - 0xe0003fffffffffff ]
679 * 0x7ffff - [ 0xf000000000000000 - 0xf0003fffffffffff ]
680 */ 719 */
681static inline unsigned long get_kernel_vsid(unsigned long ea, int ssize) 720static inline unsigned long get_kernel_vsid(unsigned long ea, int ssize)
682{ 721{
683 unsigned long context; 722 unsigned long context;
684 723
724 if (!is_kernel_addr(ea))
725 return 0;
726
685 /* 727 /*
686 * kernel take the top 4 context from the available range 728 * For kernel space, we use context ids 1-4 to map the address space as
729 * below:
730 *
731 * 0x00001 - [ 0xc000000000000000 - 0xc0003fffffffffff ]
732 * 0x00002 - [ 0xd000000000000000 - 0xd0003fffffffffff ]
733 * 0x00003 - [ 0xe000000000000000 - 0xe0003fffffffffff ]
734 * 0x00004 - [ 0xf000000000000000 - 0xf0003fffffffffff ]
735 *
736 * So we can compute the context from the region (top nibble) by
737 * subtracting 11, or 0xc - 1.
687 */ 738 */
688 context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1; 739 context = (ea >> 60) - KERNEL_REGION_CONTEXT_OFFSET;
740
689 return get_vsid(context, ea, ssize); 741 return get_vsid(context, ea, ssize);
690} 742}
691 743
diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index 805d4105e9bb..77529a3e3811 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -65,6 +65,8 @@ extern struct patb_entry *partition_tb;
65 * MAX_USER_CONTEXT * 16 bytes of space. 65 * MAX_USER_CONTEXT * 16 bytes of space.
66 */ 66 */
67#define PRTB_SIZE_SHIFT (CONTEXT_BITS + 4) 67#define PRTB_SIZE_SHIFT (CONTEXT_BITS + 4)
68#define PRTB_ENTRIES (1ul << CONTEXT_BITS)
69
68/* 70/*
69 * Power9 currently only support 64K partition table size. 71 * Power9 currently only support 64K partition table size.
70 */ 72 */
@@ -73,13 +75,20 @@ extern struct patb_entry *partition_tb;
73typedef unsigned long mm_context_id_t; 75typedef unsigned long mm_context_id_t;
74struct spinlock; 76struct spinlock;
75 77
78/* Maximum possible number of NPUs in a system. */
79#define NV_MAX_NPUS 8
80
76typedef struct { 81typedef struct {
77 mm_context_id_t id; 82 mm_context_id_t id;
78 u16 user_psize; /* page size index */ 83 u16 user_psize; /* page size index */
79 84
85 /* NPU NMMU context */
86 struct npu_context *npu_context;
87
80#ifdef CONFIG_PPC_MM_SLICES 88#ifdef CONFIG_PPC_MM_SLICES
81 u64 low_slices_psize; /* SLB page size encodings */ 89 u64 low_slices_psize; /* SLB page size encodings */
82 unsigned char high_slices_psize[SLICE_ARRAY_SIZE]; 90 unsigned char high_slices_psize[SLICE_ARRAY_SIZE];
91 unsigned long addr_limit;
83#else 92#else
84 u16 sllp; /* SLB page size encoding */ 93 u16 sllp; /* SLB page size encoding */
85#endif 94#endif
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 8f4d41936e5a..85bc9875c3be 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -13,6 +13,7 @@
13#define _PAGE_BIT_SWAP_TYPE 0 13#define _PAGE_BIT_SWAP_TYPE 0
14 14
15#define _PAGE_RO 0 15#define _PAGE_RO 0
16#define _PAGE_SHARED 0
16 17
17#define _PAGE_EXEC 0x00001 /* execute permission */ 18#define _PAGE_EXEC 0x00001 /* execute permission */
18#define _PAGE_WRITE 0x00002 /* write access allowed */ 19#define _PAGE_WRITE 0x00002 /* write access allowed */
@@ -37,21 +38,47 @@
37#define _RPAGE_RSV3 0x0400000000000000UL 38#define _RPAGE_RSV3 0x0400000000000000UL
38#define _RPAGE_RSV4 0x0200000000000000UL 39#define _RPAGE_RSV4 0x0200000000000000UL
39 40
40#ifdef CONFIG_MEM_SOFT_DIRTY 41#define _PAGE_PTE 0x4000000000000000UL /* distinguishes PTEs from pointers */
41#define _PAGE_SOFT_DIRTY _RPAGE_SW3 /* software: software dirty tracking */ 42#define _PAGE_PRESENT 0x8000000000000000UL /* pte contains a translation */
42#else
43#define _PAGE_SOFT_DIRTY 0x00000
44#endif
45#define _PAGE_SPECIAL _RPAGE_SW2 /* software: special page */
46 43
47/* 44/*
48 * For P9 DD1 only, we need to track whether the pte's huge. 45 * Top and bottom bits of RPN which can be used by hash
46 * translation mode, because we expect them to be zero
47 * otherwise.
49 */ 48 */
50#define _PAGE_LARGE _RPAGE_RSV1 49#define _RPAGE_RPN0 0x01000
50#define _RPAGE_RPN1 0x02000
51#define _RPAGE_RPN44 0x0100000000000000UL
52#define _RPAGE_RPN43 0x0080000000000000UL
53#define _RPAGE_RPN42 0x0040000000000000UL
54#define _RPAGE_RPN41 0x0020000000000000UL
55
56/* Max physical address bit as per radix table */
57#define _RPAGE_PA_MAX 57
51 58
59/*
60 * Max physical address bit we will use for now.
61 *
62 * This is mostly a hardware limitation and for now Power9 has
63 * a 51 bit limit.
64 *
65 * This is different from the number of physical bit required to address
66 * the last byte of memory. That is defined by MAX_PHYSMEM_BITS.
67 * MAX_PHYSMEM_BITS is a linux limitation imposed by the maximum
68 * number of sections we can support (SECTIONS_SHIFT).
69 *
70 * This is different from Radix page table limitation above and
71 * should always be less than that. The limit is done such that
72 * we can overload the bits between _RPAGE_PA_MAX and _PAGE_PA_MAX
73 * for hash linux page table specific bits.
74 *
75 * In order to be compatible with future hardware generations we keep
76 * some offsets and limit this for now to 53
77 */
78#define _PAGE_PA_MAX 53
52 79
53#define _PAGE_PTE (1ul << 62) /* distinguishes PTEs from pointers */ 80#define _PAGE_SOFT_DIRTY _RPAGE_SW3 /* software: software dirty tracking */
54#define _PAGE_PRESENT (1ul << 63) /* pte contains a translation */ 81#define _PAGE_SPECIAL _RPAGE_SW2 /* software: special page */
55/* 82/*
56 * Drivers request for cache inhibited pte mapping using _PAGE_NO_CACHE 83 * Drivers request for cache inhibited pte mapping using _PAGE_NO_CACHE
57 * Instead of fixing all of them, add an alternate define which 84 * Instead of fixing all of them, add an alternate define which
@@ -59,10 +86,11 @@
59 */ 86 */
60#define _PAGE_NO_CACHE _PAGE_TOLERANT 87#define _PAGE_NO_CACHE _PAGE_TOLERANT
61/* 88/*
62 * We support 57 bit real address in pte. Clear everything above 57, and 89 * We support _RPAGE_PA_MAX bit real address in pte. On the linux side
63 * every thing below PAGE_SHIFT; 90 * we are limited by _PAGE_PA_MAX. Clear everything above _PAGE_PA_MAX
91 * and every thing below PAGE_SHIFT;
64 */ 92 */
65#define PTE_RPN_MASK (((1UL << 57) - 1) & (PAGE_MASK)) 93#define PTE_RPN_MASK (((1UL << _PAGE_PA_MAX) - 1) & (PAGE_MASK))
66/* 94/*
67 * set of bits not changed in pmd_modify. Even though we have hash specific bits 95 * set of bits not changed in pmd_modify. Even though we have hash specific bits
68 * in here, on radix we expect them to be zero. 96 * in here, on radix we expect them to be zero.
@@ -205,10 +233,6 @@ extern unsigned long __pte_frag_nr;
205extern unsigned long __pte_frag_size_shift; 233extern unsigned long __pte_frag_size_shift;
206#define PTE_FRAG_SIZE_SHIFT __pte_frag_size_shift 234#define PTE_FRAG_SIZE_SHIFT __pte_frag_size_shift
207#define PTE_FRAG_SIZE (1UL << PTE_FRAG_SIZE_SHIFT) 235#define PTE_FRAG_SIZE (1UL << PTE_FRAG_SIZE_SHIFT)
208/*
209 * Pgtable size used by swapper, init in asm code
210 */
211#define MAX_PGD_TABLE_SIZE (sizeof(pgd_t) << RADIX_PGD_INDEX_SIZE)
212 236
213#define PTRS_PER_PTE (1 << PTE_INDEX_SIZE) 237#define PTRS_PER_PTE (1 << PTE_INDEX_SIZE)
214#define PTRS_PER_PMD (1 << PMD_INDEX_SIZE) 238#define PTRS_PER_PMD (1 << PMD_INDEX_SIZE)
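To see what the new PTE_RPN_MASK works out to, here is a minimal user-space sketch; the 64K PAGE_SHIFT and the sample pte value are assumptions for illustration, not taken from the patch.

/* Minimal sketch of the new PTE_RPN_MASK; PAGE_SHIFT = 16 (64K pages) and
 * the sample pte value are assumptions, used only for illustration.
 */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	16
#define PAGE_MASK	(~((1UL << PAGE_SHIFT) - 1))
#define _PAGE_PA_MAX	53

/* Mirrors the patched definition: keep bits PAGE_SHIFT.._PAGE_PA_MAX-1 */
#define PTE_RPN_MASK	(((1UL << _PAGE_PA_MAX) - 1) & (PAGE_MASK))

int main(void)
{
	uint64_t pte = 0x800000000a5b0007ULL;	/* hypothetical pte value */
	uint64_t frame = pte & PTE_RPN_MASK;	/* physical frame bits of the pte */

	printf("PTE_RPN_MASK = 0x%016llx\n", (unsigned long long)PTE_RPN_MASK);
	printf("frame addr   = 0x%016llx\n", (unsigned long long)frame);
	return 0;
}

With those assumptions the mask evaluates to 0x001fffffffff0000, i.e. bits 16 through 52, which is the point of moving from the hard-coded 57 to _PAGE_PA_MAX: the bits between 53 and 57 become free for hash-specific software use.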
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index 9e0bb7cd6e22..ac16d1943022 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -11,6 +11,12 @@
11#include <asm/book3s/64/radix-4k.h> 11#include <asm/book3s/64/radix-4k.h>
12#endif 12#endif
13 13
14/*
15 * For P9 DD1 only, we need to track whether the pte's huge.
16 */
17#define R_PAGE_LARGE _RPAGE_RSV1
18
19
14#ifndef __ASSEMBLY__ 20#ifndef __ASSEMBLY__
15#include <asm/book3s/64/tlbflush-radix.h> 21#include <asm/book3s/64/tlbflush-radix.h>
16#include <asm/cpu_has_feature.h> 22#include <asm/cpu_has_feature.h>
@@ -252,7 +258,7 @@ static inline int radix__pmd_trans_huge(pmd_t pmd)
252static inline pmd_t radix__pmd_mkhuge(pmd_t pmd) 258static inline pmd_t radix__pmd_mkhuge(pmd_t pmd)
253{ 259{
254 if (cpu_has_feature(CPU_FTR_POWER9_DD1)) 260 if (cpu_has_feature(CPU_FTR_POWER9_DD1))
255 return __pmd(pmd_val(pmd) | _PAGE_PTE | _PAGE_LARGE); 261 return __pmd(pmd_val(pmd) | _PAGE_PTE | R_PAGE_LARGE);
256 return __pmd(pmd_val(pmd) | _PAGE_PTE); 262 return __pmd(pmd_val(pmd) | _PAGE_PTE);
257} 263}
258static inline void radix__pmdp_huge_split_prepare(struct vm_area_struct *vma, 264static inline void radix__pmdp_huge_split_prepare(struct vm_area_struct *vma,
diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h
index 8ab937771068..abef812de7f8 100644
--- a/arch/powerpc/include/asm/code-patching.h
+++ b/arch/powerpc/include/asm/code-patching.h
@@ -12,6 +12,8 @@
12 12
13#include <asm/types.h> 13#include <asm/types.h>
14#include <asm/ppc-opcode.h> 14#include <asm/ppc-opcode.h>
15#include <linux/string.h>
16#include <linux/kallsyms.h>
15 17
16/* Flags for create_branch: 18/* Flags for create_branch:
17 * "b" == create_branch(addr, target, 0); 19 * "b" == create_branch(addr, target, 0);
@@ -99,6 +101,45 @@ static inline unsigned long ppc_global_function_entry(void *func)
99#endif 101#endif
100} 102}
101 103
104/*
105 * Wrapper around kallsyms_lookup() to return function entry address:
106 * - For ABIv1, we lookup the dot variant.
107 * - For ABIv2, we return the local entry point.
108 */
109static inline unsigned long ppc_kallsyms_lookup_name(const char *name)
110{
111 unsigned long addr;
112#ifdef PPC64_ELF_ABI_v1
113 /* check for dot variant */
114 char dot_name[1 + KSYM_NAME_LEN];
115 bool dot_appended = false;
116
117 if (strnlen(name, KSYM_NAME_LEN) >= KSYM_NAME_LEN)
118 return 0;
119
120 if (name[0] != '.') {
121 dot_name[0] = '.';
122 dot_name[1] = '\0';
123 strlcat(dot_name, name, sizeof(dot_name));
124 dot_appended = true;
125 } else {
126 dot_name[0] = '\0';
127 strlcat(dot_name, name, sizeof(dot_name));
128 }
129 addr = kallsyms_lookup_name(dot_name);
130 if (!addr && dot_appended)
131 /* Let's try the original non-dot symbol lookup */
132 addr = kallsyms_lookup_name(name);
133#elif defined(PPC64_ELF_ABI_v2)
134 addr = kallsyms_lookup_name(name);
135 if (addr)
136 addr = ppc_function_entry((void *)addr);
137#else
138 addr = kallsyms_lookup_name(name);
139#endif
140 return addr;
141}
142
102#ifdef CONFIG_PPC64 143#ifdef CONFIG_PPC64
103/* 144/*
104 * Some instruction encodings commonly used in dynamic ftracing 145 * Some instruction encodings commonly used in dynamic ftracing
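A hypothetical in-kernel caller of the new wrapper (the symbol name below is made up) would simply do:

	/* Hypothetical usage; "my_probe_target" is a made-up symbol name. */
	unsigned long entry = ppc_kallsyms_lookup_name("my_probe_target");

	if (!entry)
		pr_err("my_probe_target: symbol not found\n");

On ABIv1 this resolves through the dot-prefixed text symbol rather than the function descriptor; on ABIv2 it returns the local entry point, as the comment above describes.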
diff --git a/arch/powerpc/include/asm/cpuidle.h b/arch/powerpc/include/asm/cpuidle.h
index 155731557c9b..52586f9956bb 100644
--- a/arch/powerpc/include/asm/cpuidle.h
+++ b/arch/powerpc/include/asm/cpuidle.h
@@ -2,13 +2,39 @@
2#define _ASM_POWERPC_CPUIDLE_H 2#define _ASM_POWERPC_CPUIDLE_H
3 3
4#ifdef CONFIG_PPC_POWERNV 4#ifdef CONFIG_PPC_POWERNV
5/* Used in powernv idle state management */ 5/* Thread state used in powernv idle state management */
6#define PNV_THREAD_RUNNING 0 6#define PNV_THREAD_RUNNING 0
7#define PNV_THREAD_NAP 1 7#define PNV_THREAD_NAP 1
8#define PNV_THREAD_SLEEP 2 8#define PNV_THREAD_SLEEP 2
9#define PNV_THREAD_WINKLE 3 9#define PNV_THREAD_WINKLE 3
10#define PNV_CORE_IDLE_LOCK_BIT 0x100 10
11#define PNV_CORE_IDLE_THREAD_BITS 0x0FF 11/*
12 * Core state used in powernv idle for POWER8.
13 *
14 * The lock bit synchronizes updates to the state, as well as parts of the
15 * sleep/wake code (see kernel/idle_book3s.S).
16 *
17 * Bottom 8 bits track the idle state of each thread. Bit is cleared before
18 * the thread executes an idle instruction (nap/sleep/winkle).
19 *
20 * Then there is winkle tracking. A core does not lose complete state
21 * until every thread is in winkle. So the winkle count field counts the
22 * number of threads in winkle (small window of false positives is okay
23 * around the sleep/wake, so long as there are no false negatives).
24 *
25 * When the winkle count reaches 8 (the COUNT_ALL_BIT becomes set), then
26 * the THREAD_WINKLE_BITS are set, which indicate which threads have not
27 * yet woken from the winkle state.
28 */
29#define PNV_CORE_IDLE_LOCK_BIT 0x10000000
30
31#define PNV_CORE_IDLE_WINKLE_COUNT 0x00010000
32#define PNV_CORE_IDLE_WINKLE_COUNT_ALL_BIT 0x00080000
33#define PNV_CORE_IDLE_WINKLE_COUNT_BITS 0x000F0000
34#define PNV_CORE_IDLE_THREAD_WINKLE_BITS_SHIFT 8
35#define PNV_CORE_IDLE_THREAD_WINKLE_BITS 0x0000FF00
36
37#define PNV_CORE_IDLE_THREAD_BITS 0x000000FF
12 38
13/* 39/*
14 * ============================ NOTE ================================= 40 * ============================ NOTE =================================
@@ -46,6 +72,7 @@ extern u32 pnv_fastsleep_workaround_at_exit[];
46 72
47extern u64 pnv_first_deep_stop_state; 73extern u64 pnv_first_deep_stop_state;
48 74
75unsigned long pnv_cpu_offline(unsigned int cpu);
49int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags); 76int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags);
50static inline void report_invalid_psscr_val(u64 psscr_val, int err) 77static inline void report_invalid_psscr_val(u64 psscr_val, int err)
51{ 78{
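As a rough illustration of the new layout, the sketch below decodes a hypothetical snapshot of the POWER8 core idle state word using only the masks defined above; the snapshot value is invented.

/* Decode a hypothetical snapshot of the POWER8 core idle state word.
 * User-space sketch; the snapshot value is invented.
 */
#include <stdio.h>
#include <stdint.h>

#define PNV_CORE_IDLE_LOCK_BIT			0x10000000
#define PNV_CORE_IDLE_WINKLE_COUNT		0x00010000
#define PNV_CORE_IDLE_WINKLE_COUNT_BITS		0x000F0000
#define PNV_CORE_IDLE_THREAD_WINKLE_BITS_SHIFT	8
#define PNV_CORE_IDLE_THREAD_WINKLE_BITS	0x0000FF00
#define PNV_CORE_IDLE_THREAD_BITS		0x000000FF

int main(void)
{
	uint32_t state = 0x000300f8;	/* hypothetical snapshot of the word */

	printf("lock held:          %s\n",
	       (state & PNV_CORE_IDLE_LOCK_BIT) ? "yes" : "no");
	printf("threads in winkle:  %u\n",
	       (unsigned int)((state & PNV_CORE_IDLE_WINKLE_COUNT_BITS)
			      / PNV_CORE_IDLE_WINKLE_COUNT));
	printf("winkle wake mask:   0x%02x\n",
	       (unsigned int)((state & PNV_CORE_IDLE_THREAD_WINKLE_BITS)
			      >> PNV_CORE_IDLE_THREAD_WINKLE_BITS_SHIFT));
	printf("thread run bits:    0x%02x\n",
	       (unsigned int)(state & PNV_CORE_IDLE_THREAD_BITS));
	return 0;
}

The winkle count field increments in units of PNV_CORE_IDLE_WINKLE_COUNT, which is why dividing by that constant recovers the number of threads currently in winkle.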
diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index ab68d0ee7725..1f6847b107e4 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -471,10 +471,11 @@ enum {
471 CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \ 471 CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
472 CPU_FTR_DSCR | CPU_FTR_SAO | \ 472 CPU_FTR_DSCR | CPU_FTR_SAO | \
473 CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \ 473 CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
474 CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \ 474 CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \
475 CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_DAWR | \ 475 CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_DAWR | \
476 CPU_FTR_ARCH_207S | CPU_FTR_TM_COMP | CPU_FTR_ARCH_300) 476 CPU_FTR_ARCH_207S | CPU_FTR_TM_COMP | CPU_FTR_ARCH_300)
477#define CPU_FTRS_POWER9_DD1 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD1) 477#define CPU_FTRS_POWER9_DD1 ((CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD1) & \
478 (~CPU_FTR_SAO))
478#define CPU_FTRS_CELL (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ 479#define CPU_FTRS_CELL (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
479 CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \ 480 CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
480 CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \ 481 CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \
diff --git a/arch/powerpc/include/asm/dbell.h b/arch/powerpc/include/asm/dbell.h
index 378167377065..f70cbfe0ec04 100644
--- a/arch/powerpc/include/asm/dbell.h
+++ b/arch/powerpc/include/asm/dbell.h
@@ -35,33 +35,53 @@ enum ppc_dbell {
35#ifdef CONFIG_PPC_BOOK3S 35#ifdef CONFIG_PPC_BOOK3S
36 36
37#define PPC_DBELL_MSGTYPE PPC_DBELL_SERVER 37#define PPC_DBELL_MSGTYPE PPC_DBELL_SERVER
38#define SPRN_DOORBELL_CPUTAG SPRN_TIR
39#define PPC_DBELL_TAG_MASK 0x7f
40 38
41static inline void _ppc_msgsnd(u32 msg) 39static inline void _ppc_msgsnd(u32 msg)
42{ 40{
43 if (cpu_has_feature(CPU_FTR_HVMODE)) 41 __asm__ __volatile__ (ASM_FTR_IFSET(PPC_MSGSND(%1), PPC_MSGSNDP(%1), %0)
44 __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg)); 42 : : "i" (CPU_FTR_HVMODE), "r" (msg));
45 else 43}
46 __asm__ __volatile__ (PPC_MSGSNDP(%0) : : "r" (msg)); 44
45/* sync before sending message */
46static inline void ppc_msgsnd_sync(void)
47{
48 __asm__ __volatile__ ("sync" : : : "memory");
49}
50
51/* sync after taking message interrupt */
52static inline void ppc_msgsync(void)
53{
54 /* sync is not required when taking messages from the same core */
55 __asm__ __volatile__ (ASM_FTR_IFSET(PPC_MSGSYNC " ; lwsync", "", %0)
56 : : "i" (CPU_FTR_HVMODE|CPU_FTR_ARCH_300));
47} 57}
48 58
49#else /* CONFIG_PPC_BOOK3S */ 59#else /* CONFIG_PPC_BOOK3S */
50 60
51#define PPC_DBELL_MSGTYPE PPC_DBELL 61#define PPC_DBELL_MSGTYPE PPC_DBELL
52#define SPRN_DOORBELL_CPUTAG SPRN_PIR
53#define PPC_DBELL_TAG_MASK 0x3fff
54 62
55static inline void _ppc_msgsnd(u32 msg) 63static inline void _ppc_msgsnd(u32 msg)
56{ 64{
57 __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg)); 65 __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
58} 66}
59 67
68/* sync before sending message */
69static inline void ppc_msgsnd_sync(void)
70{
71 __asm__ __volatile__ ("sync" : : : "memory");
72}
73
74/* sync after taking message interrupt */
75static inline void ppc_msgsync(void)
76{
77}
78
60#endif /* CONFIG_PPC_BOOK3S */ 79#endif /* CONFIG_PPC_BOOK3S */
61 80
62extern void doorbell_cause_ipi(int cpu, unsigned long data); 81extern void doorbell_global_ipi(int cpu);
82extern void doorbell_core_ipi(int cpu);
83extern int doorbell_try_core_ipi(int cpu);
63extern void doorbell_exception(struct pt_regs *regs); 84extern void doorbell_exception(struct pt_regs *regs);
64extern void doorbell_setup_this_cpu(void);
65 85
66static inline void ppc_msgsnd(enum ppc_dbell type, u32 flags, u32 tag) 86static inline void ppc_msgsnd(enum ppc_dbell type, u32 flags, u32 tag)
67{ 87{
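To show how the new helpers compose, here is a hedged sketch of a doorbell IPI send path; it is not the actual doorbell.c implementation, and using get_hard_smp_processor_id() as the msgsnd tag source is an assumption.

/* Sketch only, not the real doorbell.c code; the tag source is assumed. */
static void example_doorbell_send(int cpu)
{
	u32 tag = get_hard_smp_processor_id(cpu);

	ppc_msgsnd_sync();			/* order prior stores before msgsnd */
	ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag);
}

On the receive side the interrupt path would call ppc_msgsync() before consuming anything the sender published; per the feature section above that expands to msgsync + lwsync only when both CPU_FTR_HVMODE and CPU_FTR_ARCH_300 are set, and is a no-op otherwise.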
diff --git a/arch/powerpc/include/asm/debug.h b/arch/powerpc/include/asm/debug.h
index 86308f177f2d..5d5af3fddfd8 100644
--- a/arch/powerpc/include/asm/debug.h
+++ b/arch/powerpc/include/asm/debug.h
@@ -8,8 +8,6 @@
8 8
9struct pt_regs; 9struct pt_regs;
10 10
11extern struct dentry *powerpc_debugfs_root;
12
13#if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC_CORE) 11#if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC_CORE)
14 12
15extern int (*__debugger)(struct pt_regs *regs); 13extern int (*__debugger)(struct pt_regs *regs);
diff --git a/arch/powerpc/include/asm/debugfs.h b/arch/powerpc/include/asm/debugfs.h
new file mode 100644
index 000000000000..4f3b39f3e3d2
--- /dev/null
+++ b/arch/powerpc/include/asm/debugfs.h
@@ -0,0 +1,17 @@
1#ifndef _ASM_POWERPC_DEBUGFS_H
2#define _ASM_POWERPC_DEBUGFS_H
3
4/*
5 * Copyright 2017, Michael Ellerman, IBM Corporation.
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 */
12
13#include <linux/debugfs.h>
14
15extern struct dentry *powerpc_debugfs_root;
16
17#endif /* _ASM_POWERPC_DEBUGFS_H */
diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index ed3beadd2cc5..183d73b6ed99 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -167,17 +167,14 @@ BEGIN_FTR_SECTION_NESTED(943) \
167 std ra,offset(r13); \ 167 std ra,offset(r13); \
168END_FTR_SECTION_NESTED(ftr,ftr,943) 168END_FTR_SECTION_NESTED(ftr,ftr,943)
169 169
170#define EXCEPTION_PROLOG_0_PACA(area) \ 170#define EXCEPTION_PROLOG_0(area) \
171 GET_PACA(r13); \
171 std r9,area+EX_R9(r13); /* save r9 */ \ 172 std r9,area+EX_R9(r13); /* save r9 */ \
172 OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR); \ 173 OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR); \
173 HMT_MEDIUM; \ 174 HMT_MEDIUM; \
174 std r10,area+EX_R10(r13); /* save r10 - r12 */ \ 175 std r10,area+EX_R10(r13); /* save r10 - r12 */ \
175 OPT_GET_SPR(r10, SPRN_CFAR, CPU_FTR_CFAR) 176 OPT_GET_SPR(r10, SPRN_CFAR, CPU_FTR_CFAR)
176 177
177#define EXCEPTION_PROLOG_0(area) \
178 GET_PACA(r13); \
179 EXCEPTION_PROLOG_0_PACA(area)
180
181#define __EXCEPTION_PROLOG_1(area, extra, vec) \ 178#define __EXCEPTION_PROLOG_1(area, extra, vec) \
182 OPT_SAVE_REG_TO_PACA(area+EX_PPR, r9, CPU_FTR_HAS_PPR); \ 179 OPT_SAVE_REG_TO_PACA(area+EX_PPR, r9, CPU_FTR_HAS_PPR); \
183 OPT_SAVE_REG_TO_PACA(area+EX_CFAR, r10, CPU_FTR_CFAR); \ 180 OPT_SAVE_REG_TO_PACA(area+EX_CFAR, r10, CPU_FTR_CFAR); \
@@ -203,17 +200,26 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
203#define EXCEPTION_PROLOG_PSERIES_1(label, h) \ 200#define EXCEPTION_PROLOG_PSERIES_1(label, h) \
204 __EXCEPTION_PROLOG_PSERIES_1(label, h) 201 __EXCEPTION_PROLOG_PSERIES_1(label, h)
205 202
203/* _NORI variant keeps MSR_RI clear */
204#define __EXCEPTION_PROLOG_PSERIES_1_NORI(label, h) \
205 ld r10,PACAKMSR(r13); /* get MSR value for kernel */ \
206 xori r10,r10,MSR_RI; /* Clear MSR_RI */ \
207 mfspr r11,SPRN_##h##SRR0; /* save SRR0 */ \
208 LOAD_HANDLER(r12,label) \
209 mtspr SPRN_##h##SRR0,r12; \
210 mfspr r12,SPRN_##h##SRR1; /* and SRR1 */ \
211 mtspr SPRN_##h##SRR1,r10; \
212 h##rfid; \
213 b . /* prevent speculative execution */
214
215#define EXCEPTION_PROLOG_PSERIES_1_NORI(label, h) \
216 __EXCEPTION_PROLOG_PSERIES_1_NORI(label, h)
217
206#define EXCEPTION_PROLOG_PSERIES(area, label, h, extra, vec) \ 218#define EXCEPTION_PROLOG_PSERIES(area, label, h, extra, vec) \
207 EXCEPTION_PROLOG_0(area); \ 219 EXCEPTION_PROLOG_0(area); \
208 EXCEPTION_PROLOG_1(area, extra, vec); \ 220 EXCEPTION_PROLOG_1(area, extra, vec); \
209 EXCEPTION_PROLOG_PSERIES_1(label, h); 221 EXCEPTION_PROLOG_PSERIES_1(label, h);
210 222
211/* Have the PACA in r13 already */
212#define EXCEPTION_PROLOG_PSERIES_PACA(area, label, h, extra, vec) \
213 EXCEPTION_PROLOG_0_PACA(area); \
214 EXCEPTION_PROLOG_1(area, extra, vec); \
215 EXCEPTION_PROLOG_PSERIES_1(label, h);
216
217#define __KVMTEST(h, n) \ 223#define __KVMTEST(h, n) \
218 lbz r10,HSTATE_IN_GUEST(r13); \ 224 lbz r10,HSTATE_IN_GUEST(r13); \
219 cmpwi r10,0; \ 225 cmpwi r10,0; \
@@ -256,11 +262,6 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
256 ld r9,area+EX_R9(r13); \ 262 ld r9,area+EX_R9(r13); \
257 bctr 263 bctr
258 264
259#define BRANCH_TO_KVM(reg, label) \
260 __LOAD_FAR_HANDLER(reg, label); \
261 mtctr reg; \
262 bctr
263
264#else 265#else
265#define BRANCH_TO_COMMON(reg, label) \ 266#define BRANCH_TO_COMMON(reg, label) \
266 b label 267 b label
@@ -268,15 +269,18 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
268#define BRANCH_LINK_TO_FAR(label) \ 269#define BRANCH_LINK_TO_FAR(label) \
269 bl label 270 bl label
270 271
271#define BRANCH_TO_KVM(reg, label) \
272 b label
273
274#define __BRANCH_TO_KVM_EXIT(area, label) \ 272#define __BRANCH_TO_KVM_EXIT(area, label) \
275 ld r9,area+EX_R9(r13); \ 273 ld r9,area+EX_R9(r13); \
276 b label 274 b label
277 275
278#endif 276#endif
279 277
278/* Do not enable RI */
279#define EXCEPTION_PROLOG_PSERIES_NORI(area, label, h, extra, vec) \
280 EXCEPTION_PROLOG_0(area); \
281 EXCEPTION_PROLOG_1(area, extra, vec); \
282 EXCEPTION_PROLOG_PSERIES_1_NORI(label, h);
283
280 284
281#define __KVM_HANDLER(area, h, n) \ 285#define __KVM_HANDLER(area, h, n) \
282 BEGIN_FTR_SECTION_NESTED(947) \ 286 BEGIN_FTR_SECTION_NESTED(947) \
@@ -325,6 +329,15 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
325 329
326#define NOTEST(n) 330#define NOTEST(n)
327 331
332#define EXCEPTION_PROLOG_COMMON_1() \
333 std r9,_CCR(r1); /* save CR in stackframe */ \
334 std r11,_NIP(r1); /* save SRR0 in stackframe */ \
335 std r12,_MSR(r1); /* save SRR1 in stackframe */ \
336 std r10,0(r1); /* make stack chain pointer */ \
337 std r0,GPR0(r1); /* save r0 in stackframe */ \
338 std r10,GPR1(r1); /* save r1 in stackframe */ \
339
340
328/* 341/*
329 * The common exception prolog is used for all except a few exceptions 342 * The common exception prolog is used for all except a few exceptions
330 * such as a segment miss on a kernel address. We have to be prepared 343 * such as a segment miss on a kernel address. We have to be prepared
@@ -349,12 +362,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
349 addi r3,r13,area; /* r3 -> where regs are saved*/ \ 362 addi r3,r13,area; /* r3 -> where regs are saved*/ \
350 RESTORE_CTR(r1, area); \ 363 RESTORE_CTR(r1, area); \
351 b bad_stack; \ 364 b bad_stack; \
3523: std r9,_CCR(r1); /* save CR in stackframe */ \ 3653: EXCEPTION_PROLOG_COMMON_1(); \
353 std r11,_NIP(r1); /* save SRR0 in stackframe */ \
354 std r12,_MSR(r1); /* save SRR1 in stackframe */ \
355 std r10,0(r1); /* make stack chain pointer */ \
356 std r0,GPR0(r1); /* save r0 in stackframe */ \
357 std r10,GPR1(r1); /* save r1 in stackframe */ \
358 beq 4f; /* if from kernel mode */ \ 366 beq 4f; /* if from kernel mode */ \
359 ACCOUNT_CPU_USER_ENTRY(r13, r9, r10); \ 367 ACCOUNT_CPU_USER_ENTRY(r13, r9, r10); \
360 SAVE_PPR(area, r9, r10); \ 368 SAVE_PPR(area, r9, r10); \
@@ -522,7 +530,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
522 530
523#define MASKABLE_RELON_EXCEPTION_HV_OOL(vec, label) \ 531#define MASKABLE_RELON_EXCEPTION_HV_OOL(vec, label) \
524 EXCEPTION_PROLOG_1(PACA_EXGEN, SOFTEN_TEST_HV, vec); \ 532 EXCEPTION_PROLOG_1(PACA_EXGEN, SOFTEN_TEST_HV, vec); \
525 EXCEPTION_PROLOG_PSERIES_1(label, EXC_HV) 533 EXCEPTION_RELON_PROLOG_PSERIES_1(label, EXC_HV)
526 534
527/* 535/*
528 * Our exception common code can be passed various "additions" 536 * Our exception common code can be passed various "additions"
@@ -547,26 +555,39 @@ BEGIN_FTR_SECTION \
547 beql ppc64_runlatch_on_trampoline; \ 555 beql ppc64_runlatch_on_trampoline; \
548END_FTR_SECTION_IFSET(CPU_FTR_CTRL) 556END_FTR_SECTION_IFSET(CPU_FTR_CTRL)
549 557
550#define EXCEPTION_COMMON(trap, label, hdlr, ret, additions) \ 558#define EXCEPTION_COMMON(area, trap, label, hdlr, ret, additions) \
551 EXCEPTION_PROLOG_COMMON(trap, PACA_EXGEN); \ 559 EXCEPTION_PROLOG_COMMON(trap, area); \
552 /* Volatile regs are potentially clobbered here */ \ 560 /* Volatile regs are potentially clobbered here */ \
553 additions; \ 561 additions; \
554 addi r3,r1,STACK_FRAME_OVERHEAD; \ 562 addi r3,r1,STACK_FRAME_OVERHEAD; \
555 bl hdlr; \ 563 bl hdlr; \
556 b ret 564 b ret
557 565
566/*
567 * Exception where stack is already set in r1, r1 is saved in r10, and it
568 * continues rather than returns.
569 */
570#define EXCEPTION_COMMON_NORET_STACK(area, trap, label, hdlr, additions) \
571 EXCEPTION_PROLOG_COMMON_1(); \
572 EXCEPTION_PROLOG_COMMON_2(area); \
573 EXCEPTION_PROLOG_COMMON_3(trap); \
574 /* Volatile regs are potentially clobbered here */ \
575 additions; \
576 addi r3,r1,STACK_FRAME_OVERHEAD; \
577 bl hdlr
578
558#define STD_EXCEPTION_COMMON(trap, label, hdlr) \ 579#define STD_EXCEPTION_COMMON(trap, label, hdlr) \
559 EXCEPTION_COMMON(trap, label, hdlr, ret_from_except, \ 580 EXCEPTION_COMMON(PACA_EXGEN, trap, label, hdlr, \
560 ADD_NVGPRS;ADD_RECONCILE) 581 ret_from_except, ADD_NVGPRS;ADD_RECONCILE)
561 582
562/* 583/*
563 * Like STD_EXCEPTION_COMMON, but for exceptions that can occur 584 * Like STD_EXCEPTION_COMMON, but for exceptions that can occur
564 * in the idle task and therefore need the special idle handling 585 * in the idle task and therefore need the special idle handling
565 * (finish nap and runlatch) 586 * (finish nap and runlatch)
566 */ 587 */
567#define STD_EXCEPTION_COMMON_ASYNC(trap, label, hdlr) \ 588#define STD_EXCEPTION_COMMON_ASYNC(trap, label, hdlr) \
568 EXCEPTION_COMMON(trap, label, hdlr, ret_from_except_lite, \ 589 EXCEPTION_COMMON(PACA_EXGEN, trap, label, hdlr, \
569 FINISH_NAP;ADD_RECONCILE;RUNLATCH_ON) 590 ret_from_except_lite, FINISH_NAP;ADD_RECONCILE;RUNLATCH_ON)
570 591
571/* 592/*
572 * When the idle code in power4_idle puts the CPU into NAP mode, 593 * When the idle code in power4_idle puts the CPU into NAP mode,
diff --git a/arch/powerpc/include/asm/feature-fixups.h b/arch/powerpc/include/asm/feature-fixups.h
index ddf54f5bbdd1..2de2319b99e2 100644
--- a/arch/powerpc/include/asm/feature-fixups.h
+++ b/arch/powerpc/include/asm/feature-fixups.h
@@ -66,6 +66,9 @@ label##5: \
66#define END_FTR_SECTION(msk, val) \ 66#define END_FTR_SECTION(msk, val) \
67 END_FTR_SECTION_NESTED(msk, val, 97) 67 END_FTR_SECTION_NESTED(msk, val, 97)
68 68
69#define END_FTR_SECTION_NESTED_IFSET(msk, label) \
70 END_FTR_SECTION_NESTED((msk), (msk), label)
71
69#define END_FTR_SECTION_IFSET(msk) END_FTR_SECTION((msk), (msk)) 72#define END_FTR_SECTION_IFSET(msk) END_FTR_SECTION((msk), (msk))
70#define END_FTR_SECTION_IFCLR(msk) END_FTR_SECTION((msk), 0) 73#define END_FTR_SECTION_IFCLR(msk) END_FTR_SECTION((msk), 0)
71 74
diff --git a/arch/powerpc/include/asm/head-64.h b/arch/powerpc/include/asm/head-64.h
index 5067048daad4..86eb87382031 100644
--- a/arch/powerpc/include/asm/head-64.h
+++ b/arch/powerpc/include/asm/head-64.h
@@ -213,6 +213,7 @@ name:
213 USE_TEXT_SECTION(); \ 213 USE_TEXT_SECTION(); \
214 .balign IFETCH_ALIGN_BYTES; \ 214 .balign IFETCH_ALIGN_BYTES; \
215 .global name; \ 215 .global name; \
216 _ASM_NOKPROBE_SYMBOL(name); \
216 DEFINE_FIXED_SYMBOL(name); \ 217 DEFINE_FIXED_SYMBOL(name); \
217name: 218name:
218 219
diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index 3cc12a86ef5d..d73755fafbb0 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -377,16 +377,6 @@ long plpar_hcall_raw(unsigned long opcode, unsigned long *retbuf, ...);
377long plpar_hcall9(unsigned long opcode, unsigned long *retbuf, ...); 377long plpar_hcall9(unsigned long opcode, unsigned long *retbuf, ...);
378long plpar_hcall9_raw(unsigned long opcode, unsigned long *retbuf, ...); 378long plpar_hcall9_raw(unsigned long opcode, unsigned long *retbuf, ...);
379 379
380/* For hcall instrumentation. One structure per-hcall, per-CPU */
381struct hcall_stats {
382 unsigned long num_calls; /* number of calls (on this CPU) */
383 unsigned long tb_total; /* total wall time (mftb) of calls. */
384 unsigned long purr_total; /* total cpu time (PURR) of calls. */
385 unsigned long tb_start;
386 unsigned long purr_start;
387};
388#define HCALL_STAT_ARRAY_SIZE ((MAX_HCALL_OPCODE >> 2) + 1)
389
390struct hvcall_mpp_data { 380struct hvcall_mpp_data {
391 unsigned long entitled_mem; 381 unsigned long entitled_mem;
392 unsigned long mapped_mem; 382 unsigned long mapped_mem;
diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index 5ed292431b5b..422f99cf9924 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -25,8 +25,6 @@ extern struct pci_dev *isa_bridge_pcidev;
25#endif 25#endif
26 26
27#include <linux/device.h> 27#include <linux/device.h>
28#include <linux/io.h>
29
30#include <linux/compiler.h> 28#include <linux/compiler.h>
31#include <asm/page.h> 29#include <asm/page.h>
32#include <asm/byteorder.h> 30#include <asm/byteorder.h>
@@ -192,24 +190,8 @@ DEF_MMIO_OUT_D(out_le32, 32, stw);
192 190
193#endif /* __BIG_ENDIAN */ 191#endif /* __BIG_ENDIAN */
194 192
195/*
196 * Cache inhibitied accessors for use in real mode, you don't want to use these
197 * unless you know what you're doing.
198 *
199 * NB. These use the cpu byte ordering.
200 */
201DEF_MMIO_OUT_X(out_rm8, 8, stbcix);
202DEF_MMIO_OUT_X(out_rm16, 16, sthcix);
203DEF_MMIO_OUT_X(out_rm32, 32, stwcix);
204DEF_MMIO_IN_X(in_rm8, 8, lbzcix);
205DEF_MMIO_IN_X(in_rm16, 16, lhzcix);
206DEF_MMIO_IN_X(in_rm32, 32, lwzcix);
207
208#ifdef __powerpc64__ 193#ifdef __powerpc64__
209 194
210DEF_MMIO_OUT_X(out_rm64, 64, stdcix);
211DEF_MMIO_IN_X(in_rm64, 64, ldcix);
212
213#ifdef __BIG_ENDIAN__ 195#ifdef __BIG_ENDIAN__
214DEF_MMIO_OUT_D(out_be64, 64, std); 196DEF_MMIO_OUT_D(out_be64, 64, std);
215DEF_MMIO_IN_D(in_be64, 64, ld); 197DEF_MMIO_IN_D(in_be64, 64, ld);
@@ -242,35 +224,6 @@ static inline void out_be64(volatile u64 __iomem *addr, u64 val)
242#endif 224#endif
243#endif /* __powerpc64__ */ 225#endif /* __powerpc64__ */
244 226
245
246/*
247 * Simple Cache inhibited accessors
248 * Unlike the DEF_MMIO_* macros, these don't include any h/w memory
249 * barriers, callers need to manage memory barriers on their own.
250 * These can only be used in hypervisor real mode.
251 */
252
253static inline u32 _lwzcix(unsigned long addr)
254{
255 u32 ret;
256
257 __asm__ __volatile__("lwzcix %0,0, %1"
258 : "=r" (ret) : "r" (addr) : "memory");
259 return ret;
260}
261
262static inline void _stbcix(u64 addr, u8 val)
263{
264 __asm__ __volatile__("stbcix %0,0,%1"
265 : : "r" (val), "r" (addr) : "memory");
266}
267
268static inline void _stwcix(u64 addr, u32 val)
269{
270 __asm__ __volatile__("stwcix %0,0,%1"
271 : : "r" (val), "r" (addr) : "memory");
272}
273
274/* 227/*
275 * Low level IO stream instructions are defined out of line for now 228 * Low level IO stream instructions are defined out of line for now
276 */ 229 */
@@ -417,15 +370,64 @@ static inline void __raw_writeq(unsigned long v, volatile void __iomem *addr)
417} 370}
418 371
419/* 372/*
420 * Real mode version of the above. stdcix is only supposed to be used 373 * Real mode versions of the above. Those instructions are only supposed
421 * in hypervisor real mode as per the architecture spec. 374 * to be used in hypervisor real mode as per the architecture spec.
422 */ 375 */
376static inline void __raw_rm_writeb(u8 val, volatile void __iomem *paddr)
377{
378 __asm__ __volatile__("stbcix %0,0,%1"
379 : : "r" (val), "r" (paddr) : "memory");
380}
381
382static inline void __raw_rm_writew(u16 val, volatile void __iomem *paddr)
383{
384 __asm__ __volatile__("sthcix %0,0,%1"
385 : : "r" (val), "r" (paddr) : "memory");
386}
387
388static inline void __raw_rm_writel(u32 val, volatile void __iomem *paddr)
389{
390 __asm__ __volatile__("stwcix %0,0,%1"
391 : : "r" (val), "r" (paddr) : "memory");
392}
393
423static inline void __raw_rm_writeq(u64 val, volatile void __iomem *paddr) 394static inline void __raw_rm_writeq(u64 val, volatile void __iomem *paddr)
424{ 395{
425 __asm__ __volatile__("stdcix %0,0,%1" 396 __asm__ __volatile__("stdcix %0,0,%1"
426 : : "r" (val), "r" (paddr) : "memory"); 397 : : "r" (val), "r" (paddr) : "memory");
427} 398}
428 399
400static inline u8 __raw_rm_readb(volatile void __iomem *paddr)
401{
402 u8 ret;
403 __asm__ __volatile__("lbzcix %0,0, %1"
404 : "=r" (ret) : "r" (paddr) : "memory");
405 return ret;
406}
407
408static inline u16 __raw_rm_readw(volatile void __iomem *paddr)
409{
410 u16 ret;
411 __asm__ __volatile__("lhzcix %0,0, %1"
412 : "=r" (ret) : "r" (paddr) : "memory");
413 return ret;
414}
415
416static inline u32 __raw_rm_readl(volatile void __iomem *paddr)
417{
418 u32 ret;
419 __asm__ __volatile__("lwzcix %0,0, %1"
420 : "=r" (ret) : "r" (paddr) : "memory");
421 return ret;
422}
423
424static inline u64 __raw_rm_readq(volatile void __iomem *paddr)
425{
426 u64 ret;
427 __asm__ __volatile__("ldcix %0,0, %1"
428 : "=r" (ret) : "r" (paddr) : "memory");
429 return ret;
430}
429#endif /* __powerpc64__ */ 431#endif /* __powerpc64__ */
430 432
431/* 433/*
@@ -757,6 +759,8 @@ extern void __iomem *ioremap_prot(phys_addr_t address, unsigned long size,
757extern void __iomem *ioremap_wc(phys_addr_t address, unsigned long size); 759extern void __iomem *ioremap_wc(phys_addr_t address, unsigned long size);
758#define ioremap_nocache(addr, size) ioremap((addr), (size)) 760#define ioremap_nocache(addr, size) ioremap((addr), (size))
759#define ioremap_uc(addr, size) ioremap((addr), (size)) 761#define ioremap_uc(addr, size) ioremap((addr), (size))
762#define ioremap_cache(addr, size) \
763 ioremap_prot((addr), (size), pgprot_val(PAGE_KERNEL))
760 764
761extern void iounmap(volatile void __iomem *addr); 765extern void iounmap(volatile void __iomem *addr);
762 766
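A hypothetical helper built purely from the real-mode accessors above might look like this; it is valid only from hypervisor real mode and the pointer must be a real (physical) address.

/* Hypothetical helper; callable only from hypervisor real mode. */
static inline u64 example_rm_set_bits(void __iomem *paddr, u64 bits)
{
	u64 old = __raw_rm_readq(paddr);

	__raw_rm_writeq(old | bits, paddr);
	return old;
}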
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 2c1d50792944..d96142572e6d 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -64,6 +64,11 @@ struct iommu_table_ops {
64 long index, 64 long index,
65 unsigned long *hpa, 65 unsigned long *hpa,
66 enum dma_data_direction *direction); 66 enum dma_data_direction *direction);
67 /* Real mode */
68 int (*exchange_rm)(struct iommu_table *tbl,
69 long index,
70 unsigned long *hpa,
71 enum dma_data_direction *direction);
67#endif 72#endif
68 void (*clear)(struct iommu_table *tbl, 73 void (*clear)(struct iommu_table *tbl,
69 long index, long npages); 74 long index, long npages);
@@ -114,6 +119,7 @@ struct iommu_table {
114 struct list_head it_group_list;/* List of iommu_table_group_link */ 119 struct list_head it_group_list;/* List of iommu_table_group_link */
115 unsigned long *it_userspace; /* userspace view of the table */ 120 unsigned long *it_userspace; /* userspace view of the table */
116 struct iommu_table_ops *it_ops; 121 struct iommu_table_ops *it_ops;
122 struct kref it_kref;
117}; 123};
118 124
119#define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \ 125#define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
@@ -146,8 +152,8 @@ static inline void *get_iommu_table_base(struct device *dev)
146 152
147extern int dma_iommu_dma_supported(struct device *dev, u64 mask); 153extern int dma_iommu_dma_supported(struct device *dev, u64 mask);
148 154
149/* Frees table for an individual device node */ 155extern struct iommu_table *iommu_tce_table_get(struct iommu_table *tbl);
150extern void iommu_free_table(struct iommu_table *tbl, const char *node_name); 156extern int iommu_tce_table_put(struct iommu_table *tbl);
151 157
152/* Initializes an iommu_table based in values set in the passed-in 158/* Initializes an iommu_table based in values set in the passed-in
153 * structure 159 * structure
@@ -208,6 +214,8 @@ extern void iommu_del_device(struct device *dev);
208extern int __init tce_iommu_bus_notifier_init(void); 214extern int __init tce_iommu_bus_notifier_init(void);
209extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry, 215extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
210 unsigned long *hpa, enum dma_data_direction *direction); 216 unsigned long *hpa, enum dma_data_direction *direction);
217extern long iommu_tce_xchg_rm(struct iommu_table *tbl, unsigned long entry,
218 unsigned long *hpa, enum dma_data_direction *direction);
211#else 219#else
212static inline void iommu_register_group(struct iommu_table_group *table_group, 220static inline void iommu_register_group(struct iommu_table_group *table_group,
213 int pci_domain_number, 221 int pci_domain_number,
diff --git a/arch/powerpc/include/asm/kprobes.h b/arch/powerpc/include/asm/kprobes.h
index 0503c98b2117..a83821f33ea3 100644
--- a/arch/powerpc/include/asm/kprobes.h
+++ b/arch/powerpc/include/asm/kprobes.h
@@ -61,59 +61,6 @@ extern kprobe_opcode_t optprobe_template_end[];
61#define MAX_OPTINSN_SIZE (optprobe_template_end - optprobe_template_entry) 61#define MAX_OPTINSN_SIZE (optprobe_template_end - optprobe_template_entry)
62#define RELATIVEJUMP_SIZE sizeof(kprobe_opcode_t) /* 4 bytes */ 62#define RELATIVEJUMP_SIZE sizeof(kprobe_opcode_t) /* 4 bytes */
63 63
64#ifdef PPC64_ELF_ABI_v2
65/* PPC64 ABIv2 needs local entry point */
66#define kprobe_lookup_name(name, addr) \
67{ \
68 addr = (kprobe_opcode_t *)kallsyms_lookup_name(name); \
69 if (addr) \
70 addr = (kprobe_opcode_t *)ppc_function_entry(addr); \
71}
72#elif defined(PPC64_ELF_ABI_v1)
73/*
74 * 64bit powerpc ABIv1 uses function descriptors:
75 * - Check for the dot variant of the symbol first.
76 * - If that fails, try looking up the symbol provided.
77 *
78 * This ensures we always get to the actual symbol and not the descriptor.
79 * Also handle <module:symbol> format.
80 */
81#define kprobe_lookup_name(name, addr) \
82{ \
83 char dot_name[MODULE_NAME_LEN + 1 + KSYM_NAME_LEN]; \
84 const char *modsym; \
85 bool dot_appended = false; \
86 if ((modsym = strchr(name, ':')) != NULL) { \
87 modsym++; \
88 if (*modsym != '\0' && *modsym != '.') { \
89 /* Convert to <module:.symbol> */ \
90 strncpy(dot_name, name, modsym - name); \
91 dot_name[modsym - name] = '.'; \
92 dot_name[modsym - name + 1] = '\0'; \
93 strncat(dot_name, modsym, \
94 sizeof(dot_name) - (modsym - name) - 2);\
95 dot_appended = true; \
96 } else { \
97 dot_name[0] = '\0'; \
98 strncat(dot_name, name, sizeof(dot_name) - 1); \
99 } \
100 } else if (name[0] != '.') { \
101 dot_name[0] = '.'; \
102 dot_name[1] = '\0'; \
103 strncat(dot_name, name, KSYM_NAME_LEN - 2); \
104 dot_appended = true; \
105 } else { \
106 dot_name[0] = '\0'; \
107 strncat(dot_name, name, KSYM_NAME_LEN - 1); \
108 } \
109 addr = (kprobe_opcode_t *)kallsyms_lookup_name(dot_name); \
110 if (!addr && dot_appended) { \
111 /* Let's try the original non-dot symbol lookup */ \
112 addr = (kprobe_opcode_t *)kallsyms_lookup_name(name); \
113 } \
114}
115#endif
116
117#define flush_insn_slot(p) do { } while (0) 64#define flush_insn_slot(p) do { } while (0)
118#define kretprobe_blacklist_size 0 65#define kretprobe_blacklist_size 0
119 66
@@ -156,6 +103,16 @@ extern int kprobe_exceptions_notify(struct notifier_block *self,
156extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr); 103extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
157extern int kprobe_handler(struct pt_regs *regs); 104extern int kprobe_handler(struct pt_regs *regs);
158extern int kprobe_post_handler(struct pt_regs *regs); 105extern int kprobe_post_handler(struct pt_regs *regs);
106#ifdef CONFIG_KPROBES_ON_FTRACE
107extern int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
108 struct kprobe_ctlblk *kcb);
109#else
110static inline int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
111 struct kprobe_ctlblk *kcb)
112{
113 return 0;
114}
115#endif
159#else 116#else
160static inline int kprobe_handler(struct pt_regs *regs) { return 0; } 117static inline int kprobe_handler(struct pt_regs *regs) { return 0; }
161static inline int kprobe_post_handler(struct pt_regs *regs) { return 0; } 118static inline int kprobe_post_handler(struct pt_regs *regs) { return 0; }
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index d9b48f5bb606..d55c7f881ce7 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -49,8 +49,6 @@ static inline bool kvm_is_radix(struct kvm *kvm)
49#define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */ 49#define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */
50#endif 50#endif
51 51
52#define VRMA_VSID 0x1ffffffUL /* 1TB VSID reserved for VRMA */
53
54/* 52/*
55 * We use a lock bit in HPTE dword 0 to synchronize updates and 53 * We use a lock bit in HPTE dword 0 to synchronize updates and
56 * accesses to each HPTE, and another bit to indicate non-present 54 * accesses to each HPTE, and another bit to indicate non-present
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index d318d432caa9..0593d9479f74 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -110,7 +110,7 @@ struct kvmppc_host_state {
110 u8 ptid; 110 u8 ptid;
111 struct kvm_vcpu *kvm_vcpu; 111 struct kvm_vcpu *kvm_vcpu;
112 struct kvmppc_vcore *kvm_vcore; 112 struct kvmppc_vcore *kvm_vcore;
113 unsigned long xics_phys; 113 void __iomem *xics_phys;
114 u32 saved_xirr; 114 u32 saved_xirr;
115 u64 dabr; 115 u64 dabr;
116 u64 host_mmcr[7]; /* MMCR 0,1,A, SIAR, SDAR, MMCR2, SIER */ 116 u64 host_mmcr[7]; /* MMCR 0,1,A, SIAR, SDAR, MMCR2, SIER */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index dd11c4c8c56a..c3877992eff9 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -409,7 +409,7 @@ struct openpic;
409extern void kvm_cma_reserve(void) __init; 409extern void kvm_cma_reserve(void) __init;
410static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) 410static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
411{ 411{
412 paca[cpu].kvm_hstate.xics_phys = addr; 412 paca[cpu].kvm_hstate.xics_phys = (void __iomem *)addr;
413} 413}
414 414
415static inline u32 kvmppc_get_xics_latch(void) 415static inline u32 kvmppc_get_xics_latch(void)
@@ -478,8 +478,6 @@ extern void kvmppc_free_host_rm_ops(void);
478extern void kvmppc_free_pimap(struct kvm *kvm); 478extern void kvmppc_free_pimap(struct kvm *kvm);
479extern int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall); 479extern int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall);
480extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu); 480extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu);
481extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server);
482extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args);
483extern int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd); 481extern int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd);
484extern u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu); 482extern u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu);
485extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval); 483extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
@@ -507,12 +505,6 @@ static inline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
507static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu) 505static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
508 { return 0; } 506 { return 0; }
509static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { } 507static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { }
510static inline int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu,
511 unsigned long server)
512 { return -EINVAL; }
513static inline int kvm_vm_ioctl_xics_irq(struct kvm *kvm,
514 struct kvm_irq_level *args)
515 { return -ENOTTY; }
516static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd) 508static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
517 { return 0; } 509 { return 0; }
518#endif 510#endif
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index ed62efe01e49..81eff8631434 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -24,97 +24,6 @@
24 24
25#include <linux/bitops.h> 25#include <linux/bitops.h>
26 26
27/*
28 * Machine Check bits on power7 and power8
29 */
30#define P7_SRR1_MC_LOADSTORE(srr1) ((srr1) & PPC_BIT(42)) /* P8 too */
31
32/* SRR1 bits for machine check (On Power7 and Power8) */
33#define P7_SRR1_MC_IFETCH(srr1) ((srr1) & PPC_BITMASK(43, 45)) /* P8 too */
34
35#define P7_SRR1_MC_IFETCH_UE (0x1 << PPC_BITLSHIFT(45)) /* P8 too */
36#define P7_SRR1_MC_IFETCH_SLB_PARITY (0x2 << PPC_BITLSHIFT(45)) /* P8 too */
37#define P7_SRR1_MC_IFETCH_SLB_MULTIHIT (0x3 << PPC_BITLSHIFT(45)) /* P8 too */
38#define P7_SRR1_MC_IFETCH_SLB_BOTH (0x4 << PPC_BITLSHIFT(45))
39#define P7_SRR1_MC_IFETCH_TLB_MULTIHIT (0x5 << PPC_BITLSHIFT(45)) /* P8 too */
40#define P7_SRR1_MC_IFETCH_UE_TLB_RELOAD (0x6 << PPC_BITLSHIFT(45)) /* P8 too */
41#define P7_SRR1_MC_IFETCH_UE_IFU_INTERNAL (0x7 << PPC_BITLSHIFT(45))
42
43/* SRR1 bits for machine check (On Power8) */
44#define P8_SRR1_MC_IFETCH_ERAT_MULTIHIT (0x4 << PPC_BITLSHIFT(45))
45
46/* DSISR bits for machine check (On Power7 and Power8) */
47#define P7_DSISR_MC_UE (PPC_BIT(48)) /* P8 too */
48#define P7_DSISR_MC_UE_TABLEWALK (PPC_BIT(49)) /* P8 too */
49#define P7_DSISR_MC_ERAT_MULTIHIT (PPC_BIT(52)) /* P8 too */
50#define P7_DSISR_MC_TLB_MULTIHIT_MFTLB (PPC_BIT(53)) /* P8 too */
51#define P7_DSISR_MC_SLB_PARITY_MFSLB (PPC_BIT(55)) /* P8 too */
52#define P7_DSISR_MC_SLB_MULTIHIT (PPC_BIT(56)) /* P8 too */
53#define P7_DSISR_MC_SLB_MULTIHIT_PARITY (PPC_BIT(57)) /* P8 too */
54
55/*
56 * DSISR bits for machine check (Power8) in addition to above.
57 * Secondary DERAT Multihit
58 */
59#define P8_DSISR_MC_ERAT_MULTIHIT_SEC (PPC_BIT(54))
60
61/* SLB error bits */
62#define P7_DSISR_MC_SLB_ERRORS (P7_DSISR_MC_ERAT_MULTIHIT | \
63 P7_DSISR_MC_SLB_PARITY_MFSLB | \
64 P7_DSISR_MC_SLB_MULTIHIT | \
65 P7_DSISR_MC_SLB_MULTIHIT_PARITY)
66
67#define P8_DSISR_MC_SLB_ERRORS (P7_DSISR_MC_SLB_ERRORS | \
68 P8_DSISR_MC_ERAT_MULTIHIT_SEC)
69
70/*
71 * Machine Check bits on power9
72 */
73#define P9_SRR1_MC_LOADSTORE(srr1) (((srr1) >> PPC_BITLSHIFT(42)) & 1)
74
75#define P9_SRR1_MC_IFETCH(srr1) ( \
76 PPC_BITEXTRACT(srr1, 45, 0) | \
77 PPC_BITEXTRACT(srr1, 44, 1) | \
78 PPC_BITEXTRACT(srr1, 43, 2) | \
79 PPC_BITEXTRACT(srr1, 36, 3) )
80
81/* 0 is reserved */
82#define P9_SRR1_MC_IFETCH_UE 1
83#define P9_SRR1_MC_IFETCH_SLB_PARITY 2
84#define P9_SRR1_MC_IFETCH_SLB_MULTIHIT 3
85#define P9_SRR1_MC_IFETCH_ERAT_MULTIHIT 4
86#define P9_SRR1_MC_IFETCH_TLB_MULTIHIT 5
87#define P9_SRR1_MC_IFETCH_UE_TLB_RELOAD 6
88/* 7 is reserved */
89#define P9_SRR1_MC_IFETCH_LINK_TIMEOUT 8
90#define P9_SRR1_MC_IFETCH_LINK_TABLEWALK_TIMEOUT 9
91/* 10 ? */
92#define P9_SRR1_MC_IFETCH_RA 11
93#define P9_SRR1_MC_IFETCH_RA_TABLEWALK 12
94#define P9_SRR1_MC_IFETCH_RA_ASYNC_STORE 13
95#define P9_SRR1_MC_IFETCH_LINK_ASYNC_STORE_TIMEOUT 14
96#define P9_SRR1_MC_IFETCH_RA_TABLEWALK_FOREIGN 15
97
98/* DSISR bits for machine check (On Power9) */
99#define P9_DSISR_MC_UE (PPC_BIT(48))
100#define P9_DSISR_MC_UE_TABLEWALK (PPC_BIT(49))
101#define P9_DSISR_MC_LINK_LOAD_TIMEOUT (PPC_BIT(50))
102#define P9_DSISR_MC_LINK_TABLEWALK_TIMEOUT (PPC_BIT(51))
103#define P9_DSISR_MC_ERAT_MULTIHIT (PPC_BIT(52))
104#define P9_DSISR_MC_TLB_MULTIHIT_MFTLB (PPC_BIT(53))
105#define P9_DSISR_MC_USER_TLBIE (PPC_BIT(54))
106#define P9_DSISR_MC_SLB_PARITY_MFSLB (PPC_BIT(55))
107#define P9_DSISR_MC_SLB_MULTIHIT_MFSLB (PPC_BIT(56))
108#define P9_DSISR_MC_RA_LOAD (PPC_BIT(57))
109#define P9_DSISR_MC_RA_TABLEWALK (PPC_BIT(58))
110#define P9_DSISR_MC_RA_TABLEWALK_FOREIGN (PPC_BIT(59))
111#define P9_DSISR_MC_RA_FOREIGN (PPC_BIT(60))
112
113/* SLB error bits */
114#define P9_DSISR_MC_SLB_ERRORS (P9_DSISR_MC_ERAT_MULTIHIT | \
115 P9_DSISR_MC_SLB_PARITY_MFSLB | \
116 P9_DSISR_MC_SLB_MULTIHIT_MFSLB)
117
118enum MCE_Version { 27enum MCE_Version {
119 MCE_V1 = 1, 28 MCE_V1 = 1,
120}; 29};
@@ -298,7 +207,8 @@ extern void save_mce_event(struct pt_regs *regs, long handled,
298extern int get_mce_event(struct machine_check_event *mce, bool release); 207extern int get_mce_event(struct machine_check_event *mce, bool release);
299extern void release_mce_event(void); 208extern void release_mce_event(void);
300extern void machine_check_queue_event(void); 209extern void machine_check_queue_event(void);
301extern void machine_check_print_event_info(struct machine_check_event *evt); 210extern void machine_check_print_event_info(struct machine_check_event *evt,
211 bool user_mode);
302extern uint64_t get_mce_fault_addr(struct machine_check_event *evt); 212extern uint64_t get_mce_fault_addr(struct machine_check_event *evt);
303 213
304#endif /* __ASM_PPC64_MCE_H__ */ 214#endif /* __ASM_PPC64_MCE_H__ */
diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h
index b62a8d43a06c..7ca8d8e80ffa 100644
--- a/arch/powerpc/include/asm/mmu-book3e.h
+++ b/arch/powerpc/include/asm/mmu-book3e.h
@@ -229,11 +229,6 @@ typedef struct {
229 unsigned int id; 229 unsigned int id;
230 unsigned int active; 230 unsigned int active;
231 unsigned long vdso_base; 231 unsigned long vdso_base;
232#ifdef CONFIG_PPC_MM_SLICES
233 u64 low_slices_psize; /* SLB page size encodings */
234 u64 high_slices_psize; /* 4 bits per slice for now */
235 u16 user_psize; /* page size index */
236#endif
237#ifdef CONFIG_PPC_64K_PAGES 232#ifdef CONFIG_PPC_64K_PAGES
238 /* for 4K PTE fragment support */ 233 /* for 4K PTE fragment support */
239 void *pte_frag; 234 void *pte_frag;
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index 065e762fae85..78260409dc9c 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -29,6 +29,10 @@
29 */ 29 */
30 30
31/* 31/*
32 * Support for 68 bit VA space. We added that from ISA 2.05
33 */
34#define MMU_FTR_68_BIT_VA ASM_CONST(0x00002000)
35/*
32 * Kernel read only support. 36 * Kernel read only support.
33 * We added the ppp value 0b110 in ISA 2.04. 37 * We added the ppp value 0b110 in ISA 2.04.
34 */ 38 */
@@ -109,10 +113,10 @@
109#define MMU_FTRS_POWER4 MMU_FTRS_DEFAULT_HPTE_ARCH_V2 113#define MMU_FTRS_POWER4 MMU_FTRS_DEFAULT_HPTE_ARCH_V2
110#define MMU_FTRS_PPC970 MMU_FTRS_POWER4 | MMU_FTR_TLBIE_CROP_VA 114#define MMU_FTRS_PPC970 MMU_FTRS_POWER4 | MMU_FTR_TLBIE_CROP_VA
111#define MMU_FTRS_POWER5 MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE 115#define MMU_FTRS_POWER5 MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE
112#define MMU_FTRS_POWER6 MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO 116#define MMU_FTRS_POWER6 MMU_FTRS_POWER5 | MMU_FTR_KERNEL_RO | MMU_FTR_68_BIT_VA
113#define MMU_FTRS_POWER7 MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO 117#define MMU_FTRS_POWER7 MMU_FTRS_POWER6
114#define MMU_FTRS_POWER8 MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO 118#define MMU_FTRS_POWER8 MMU_FTRS_POWER6
115#define MMU_FTRS_POWER9 MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO 119#define MMU_FTRS_POWER9 MMU_FTRS_POWER6
116#define MMU_FTRS_CELL MMU_FTRS_DEFAULT_HPTE_ARCH_V2 | \ 120#define MMU_FTRS_CELL MMU_FTRS_DEFAULT_HPTE_ARCH_V2 | \
117 MMU_FTR_CI_LARGE_PAGE 121 MMU_FTR_CI_LARGE_PAGE
118#define MMU_FTRS_PA6T MMU_FTRS_DEFAULT_HPTE_ARCH_V2 | \ 122#define MMU_FTRS_PA6T MMU_FTRS_DEFAULT_HPTE_ARCH_V2 | \
@@ -136,7 +140,7 @@ enum {
136 MMU_FTR_NO_SLBIE_B | MMU_FTR_16M_PAGE | MMU_FTR_TLBIEL | 140 MMU_FTR_NO_SLBIE_B | MMU_FTR_16M_PAGE | MMU_FTR_TLBIEL |
137 MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_CI_LARGE_PAGE | 141 MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_CI_LARGE_PAGE |
138 MMU_FTR_1T_SEGMENT | MMU_FTR_TLBIE_CROP_VA | 142 MMU_FTR_1T_SEGMENT | MMU_FTR_TLBIE_CROP_VA |
139 MMU_FTR_KERNEL_RO | 143 MMU_FTR_KERNEL_RO | MMU_FTR_68_BIT_VA |
140#ifdef CONFIG_PPC_RADIX_MMU 144#ifdef CONFIG_PPC_RADIX_MMU
141 MMU_FTR_TYPE_RADIX | 145 MMU_FTR_TYPE_RADIX |
142#endif 146#endif
@@ -290,7 +294,10 @@ static inline bool early_radix_enabled(void)
290#define MMU_PAGE_16G 14 294#define MMU_PAGE_16G 14
291#define MMU_PAGE_64G 15 295#define MMU_PAGE_64G 15
292 296
293/* N.B. we need to change the type of hpte_page_sizes if this gets to be > 16 */ 297/*
298 * N.B. we need to change the type of hpte_page_sizes if this gets to be > 16
 299 * Also we need to change the type of mm_context.low/high_slices_psize.
300 */
294#define MMU_PAGE_COUNT 16 301#define MMU_PAGE_COUNT 16
295 302
296#ifdef CONFIG_PPC_BOOK3S_64 303#ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index ecf9885ab660..da7e9432fa8f 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -29,10 +29,14 @@ extern void mm_iommu_init(struct mm_struct *mm);
29extern void mm_iommu_cleanup(struct mm_struct *mm); 29extern void mm_iommu_cleanup(struct mm_struct *mm);
30extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm, 30extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm,
31 unsigned long ua, unsigned long size); 31 unsigned long ua, unsigned long size);
32extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(
33 struct mm_struct *mm, unsigned long ua, unsigned long size);
32extern struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm, 34extern struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
33 unsigned long ua, unsigned long entries); 35 unsigned long ua, unsigned long entries);
34extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, 36extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
35 unsigned long ua, unsigned long *hpa); 37 unsigned long ua, unsigned long *hpa);
38extern long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
39 unsigned long ua, unsigned long *hpa);
36extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem); 40extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem);
37extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem); 41extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem);
38#endif 42#endif
@@ -51,7 +55,8 @@ static inline void switch_mmu_context(struct mm_struct *prev,
51 return switch_slb(tsk, next); 55 return switch_slb(tsk, next);
52} 56}
53 57
54extern int __init_new_context(void); 58extern int hash__alloc_context_id(void);
59extern void hash__reserve_context_id(int id);
55extern void __destroy_context(int context_id); 60extern void __destroy_context(int context_id);
56static inline void mmu_context_init(void) { } 61static inline void mmu_context_init(void) { }
57#else 62#else
@@ -70,8 +75,9 @@ extern void drop_cop(unsigned long acop, struct mm_struct *mm);
70 * switch_mm is the entry point called from the architecture independent 75 * switch_mm is the entry point called from the architecture independent
71 * code in kernel/sched/core.c 76 * code in kernel/sched/core.c
72 */ 77 */
73static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, 78static inline void switch_mm_irqs_off(struct mm_struct *prev,
74 struct task_struct *tsk) 79 struct mm_struct *next,
80 struct task_struct *tsk)
75{ 81{
76 /* Mark this context has been used on the new CPU */ 82 /* Mark this context has been used on the new CPU */
77 if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(next))) 83 if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(next)))
@@ -110,6 +116,18 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
110 switch_mmu_context(prev, next, tsk); 116 switch_mmu_context(prev, next, tsk);
111} 117}
112 118
119static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
120 struct task_struct *tsk)
121{
122 unsigned long flags;
123
124 local_irq_save(flags);
125 switch_mm_irqs_off(prev, next, tsk);
126 local_irq_restore(flags);
127}
128#define switch_mm_irqs_off switch_mm_irqs_off
129
130
113#define deactivate_mm(tsk,mm) do { } while (0) 131#define deactivate_mm(tsk,mm) do { } while (0)
114 132
115/* 133/*
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
index c7f927e67d14..f0ff384d4ca5 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -88,11 +88,6 @@
88#include <asm/nohash/pte-book3e.h> 88#include <asm/nohash/pte-book3e.h>
89#include <asm/pte-common.h> 89#include <asm/pte-common.h>
90 90
91#ifdef CONFIG_PPC_MM_SLICES
92#define HAVE_ARCH_UNMAPPED_AREA
93#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
94#endif /* CONFIG_PPC_MM_SLICES */
95
96#ifndef __ASSEMBLY__ 91#ifndef __ASSEMBLY__
97/* pte_clear moved to later in this file */ 92/* pte_clear moved to later in this file */
98 93
diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h
index a0aa285869b5..cb3e6242a78c 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -40,6 +40,8 @@
40#define OPAL_I2C_ARBT_LOST -22 40#define OPAL_I2C_ARBT_LOST -22
41#define OPAL_I2C_NACK_RCVD -23 41#define OPAL_I2C_NACK_RCVD -23
42#define OPAL_I2C_STOP_ERR -24 42#define OPAL_I2C_STOP_ERR -24
43#define OPAL_XIVE_PROVISIONING -31
44#define OPAL_XIVE_FREE_ACTIVE -32
43 45
44/* API Tokens (in r0) */ 46/* API Tokens (in r0) */
45#define OPAL_INVALID_CALL -1 47#define OPAL_INVALID_CALL -1
@@ -168,7 +170,27 @@
168#define OPAL_INT_SET_MFRR 125 170#define OPAL_INT_SET_MFRR 125
169#define OPAL_PCI_TCE_KILL 126 171#define OPAL_PCI_TCE_KILL 126
170#define OPAL_NMMU_SET_PTCR 127 172#define OPAL_NMMU_SET_PTCR 127
171#define OPAL_LAST 127 173#define OPAL_XIVE_RESET 128
174#define OPAL_XIVE_GET_IRQ_INFO 129
175#define OPAL_XIVE_GET_IRQ_CONFIG 130
176#define OPAL_XIVE_SET_IRQ_CONFIG 131
177#define OPAL_XIVE_GET_QUEUE_INFO 132
178#define OPAL_XIVE_SET_QUEUE_INFO 133
179#define OPAL_XIVE_DONATE_PAGE 134
180#define OPAL_XIVE_ALLOCATE_VP_BLOCK 135
181#define OPAL_XIVE_FREE_VP_BLOCK 136
182#define OPAL_XIVE_GET_VP_INFO 137
183#define OPAL_XIVE_SET_VP_INFO 138
184#define OPAL_XIVE_ALLOCATE_IRQ 139
185#define OPAL_XIVE_FREE_IRQ 140
186#define OPAL_XIVE_SYNC 141
187#define OPAL_XIVE_DUMP 142
188#define OPAL_XIVE_RESERVED3 143
189#define OPAL_XIVE_RESERVED4 144
190#define OPAL_NPU_INIT_CONTEXT 146
191#define OPAL_NPU_DESTROY_CONTEXT 147
192#define OPAL_NPU_MAP_LPAR 148
193#define OPAL_LAST 148
172 194
173/* Device tree flags */ 195/* Device tree flags */
174 196
@@ -928,6 +950,59 @@ enum {
928 OPAL_PCI_TCE_KILL_ALL, 950 OPAL_PCI_TCE_KILL_ALL,
929}; 951};
930 952
953/* The xive operation mode indicates the active "API" and
954 * corresponds to the "mode" parameter of the opal_xive_reset()
955 * call
956 */
957enum {
958 OPAL_XIVE_MODE_EMU = 0,
959 OPAL_XIVE_MODE_EXPL = 1,
960};
961
962/* Flags for OPAL_XIVE_GET_IRQ_INFO */
963enum {
964 OPAL_XIVE_IRQ_TRIGGER_PAGE = 0x00000001,
965 OPAL_XIVE_IRQ_STORE_EOI = 0x00000002,
966 OPAL_XIVE_IRQ_LSI = 0x00000004,
967 OPAL_XIVE_IRQ_SHIFT_BUG = 0x00000008,
968 OPAL_XIVE_IRQ_MASK_VIA_FW = 0x00000010,
969 OPAL_XIVE_IRQ_EOI_VIA_FW = 0x00000020,
970};
971
972/* Flags for OPAL_XIVE_GET/SET_QUEUE_INFO */
973enum {
974 OPAL_XIVE_EQ_ENABLED = 0x00000001,
975 OPAL_XIVE_EQ_ALWAYS_NOTIFY = 0x00000002,
976 OPAL_XIVE_EQ_ESCALATE = 0x00000004,
977};
978
979/* Flags for OPAL_XIVE_GET/SET_VP_INFO */
980enum {
981 OPAL_XIVE_VP_ENABLED = 0x00000001,
982};
983
984/* "Any chip" replacement for chip ID for allocation functions */
985enum {
986 OPAL_XIVE_ANY_CHIP = 0xffffffff,
987};
988
989/* Xive sync options */
990enum {
 991 /* These bits are cumulative, arg is a girq */
992 XIVE_SYNC_EAS = 0x00000001, /* Sync irq source */
993 XIVE_SYNC_QUEUE = 0x00000002, /* Sync irq target */
994};
995
996/* Dump options */
997enum {
998 XIVE_DUMP_TM_HYP = 0,
999 XIVE_DUMP_TM_POOL = 1,
1000 XIVE_DUMP_TM_OS = 2,
1001 XIVE_DUMP_TM_USER = 3,
1002 XIVE_DUMP_VP = 4,
1003 XIVE_DUMP_EMU_STATE = 5,
1004};
1005
931#endif /* __ASSEMBLY__ */ 1006#endif /* __ASSEMBLY__ */
932 1007
933#endif /* __OPAL_API_H */ 1008#endif /* __OPAL_API_H */
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 1ff03a6da76e..588fb1c23af9 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -29,6 +29,11 @@ extern struct device_node *opal_node;
29 29
30/* API functions */ 30/* API functions */
31int64_t opal_invalid_call(void); 31int64_t opal_invalid_call(void);
32int64_t opal_npu_destroy_context(uint64_t phb_id, uint64_t pid, uint64_t bdf);
33int64_t opal_npu_init_context(uint64_t phb_id, int pasid, uint64_t msr,
34 uint64_t bdf);
35int64_t opal_npu_map_lpar(uint64_t phb_id, uint64_t bdf, uint64_t lparid,
36 uint64_t lpcr);
32int64_t opal_console_write(int64_t term_number, __be64 *length, 37int64_t opal_console_write(int64_t term_number, __be64 *length,
33 const uint8_t *buffer); 38 const uint8_t *buffer);
34int64_t opal_console_read(int64_t term_number, __be64 *length, 39int64_t opal_console_read(int64_t term_number, __be64 *length,
@@ -226,6 +231,42 @@ int64_t opal_pci_tce_kill(uint64_t phb_id, uint32_t kill_type,
226 uint32_t pe_num, uint32_t tce_size, 231 uint32_t pe_num, uint32_t tce_size,
227 uint64_t dma_addr, uint32_t npages); 232 uint64_t dma_addr, uint32_t npages);
228int64_t opal_nmmu_set_ptcr(uint64_t chip_id, uint64_t ptcr); 233int64_t opal_nmmu_set_ptcr(uint64_t chip_id, uint64_t ptcr);
234int64_t opal_xive_reset(uint64_t version);
235int64_t opal_xive_get_irq_info(uint32_t girq,
236 __be64 *out_flags,
237 __be64 *out_eoi_page,
238 __be64 *out_trig_page,
239 __be32 *out_esb_shift,
240 __be32 *out_src_chip);
241int64_t opal_xive_get_irq_config(uint32_t girq, __be64 *out_vp,
242 uint8_t *out_prio, __be32 *out_lirq);
243int64_t opal_xive_set_irq_config(uint32_t girq, uint64_t vp, uint8_t prio,
244 uint32_t lirq);
245int64_t opal_xive_get_queue_info(uint64_t vp, uint32_t prio,
246 __be64 *out_qpage,
247 __be64 *out_qsize,
248 __be64 *out_qeoi_page,
249 __be32 *out_escalate_irq,
250 __be64 *out_qflags);
251int64_t opal_xive_set_queue_info(uint64_t vp, uint32_t prio,
252 uint64_t qpage,
253 uint64_t qsize,
254 uint64_t qflags);
255int64_t opal_xive_donate_page(uint32_t chip_id, uint64_t addr);
256int64_t opal_xive_alloc_vp_block(uint32_t alloc_order);
257int64_t opal_xive_free_vp_block(uint64_t vp);
258int64_t opal_xive_get_vp_info(uint64_t vp,
259 __be64 *out_flags,
260 __be64 *out_cam_value,
261 __be64 *out_report_cl_pair,
262 __be32 *out_chip_id);
263int64_t opal_xive_set_vp_info(uint64_t vp,
264 uint64_t flags,
265 uint64_t report_cl_pair);
266int64_t opal_xive_allocate_irq(uint32_t chip_id);
267int64_t opal_xive_free_irq(uint32_t girq);
268int64_t opal_xive_sync(uint32_t type, uint32_t id);
269int64_t opal_xive_dump(uint32_t type, uint32_t id);
229 270
230/* Internal functions */ 271/* Internal functions */
231extern int early_init_dt_scan_opal(unsigned long node, const char *uname, 272extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
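As a rough illustration of the new OPAL XIVE calls above (not part of the patch): a backend wrapper around opal_xive_get_irq_info() might convert the big-endian OPAL outputs into the xive_irq_data structure added later in this series. The function name and error handling below are illustrative only.

/* Sketch only: query OPAL for one interrupt's ESB pages and cache them.
 * Assumes <asm/opal.h> for the prototype above and <asm/xive.h> for
 * struct xive_irq_data; the -EINVAL mapping is simplified.
 */
static int example_populate_irq(u32 hw_irq, struct xive_irq_data *xd)
{
	__be64 flags, eoi_page, trig_page;
	__be32 esb_shift, src_chip;
	s64 rc;

	rc = opal_xive_get_irq_info(hw_irq, &flags, &eoi_page, &trig_page,
				    &esb_shift, &src_chip);
	if (rc)
		return -EINVAL;

	xd->flags     = be64_to_cpu(flags);
	xd->eoi_page  = be64_to_cpu(eoi_page);
	xd->trig_page = be64_to_cpu(trig_page);
	xd->esb_shift = be32_to_cpu(esb_shift);
	xd->src_chip  = be32_to_cpu(src_chip);
	return 0;
}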
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 708c3e592eeb..1c09f8fe2ee8 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -99,7 +99,6 @@ struct paca_struct {
99 */ 99 */
100 /* used for most interrupts/exceptions */ 100 /* used for most interrupts/exceptions */
101 u64 exgen[13] __attribute__((aligned(0x80))); 101 u64 exgen[13] __attribute__((aligned(0x80)));
102 u64 exmc[13]; /* used for machine checks */
103 u64 exslb[13]; /* used for SLB/segment table misses 102 u64 exslb[13]; /* used for SLB/segment table misses
104 * on the linear mapping */ 103 * on the linear mapping */
105 /* SLB related definitions */ 104 /* SLB related definitions */
@@ -139,6 +138,7 @@ struct paca_struct {
139#ifdef CONFIG_PPC_MM_SLICES 138#ifdef CONFIG_PPC_MM_SLICES
140 u64 mm_ctx_low_slices_psize; 139 u64 mm_ctx_low_slices_psize;
141 unsigned char mm_ctx_high_slices_psize[SLICE_ARRAY_SIZE]; 140 unsigned char mm_ctx_high_slices_psize[SLICE_ARRAY_SIZE];
141 unsigned long addr_limit;
142#else 142#else
143 u16 mm_ctx_user_psize; 143 u16 mm_ctx_user_psize;
144 u16 mm_ctx_sllp; 144 u16 mm_ctx_sllp;
@@ -172,17 +172,31 @@ struct paca_struct {
172 u8 thread_mask; 172 u8 thread_mask;
173 /* Mask to denote subcore sibling threads */ 173 /* Mask to denote subcore sibling threads */
174 u8 subcore_sibling_mask; 174 u8 subcore_sibling_mask;
175 /*
176 * Pointer to an array which contains pointer
177 * to the sibling threads' paca.
178 */
179 struct paca_struct **thread_sibling_pacas;
175#endif 180#endif
176 181
182#ifdef CONFIG_PPC_STD_MMU_64
183 /* Non-maskable exceptions that are not performance critical */
184 u64 exnmi[13]; /* used for system reset (nmi) */
185 u64 exmc[13]; /* used for machine checks */
186#endif
177#ifdef CONFIG_PPC_BOOK3S_64 187#ifdef CONFIG_PPC_BOOK3S_64
178 /* Exclusive emergency stack pointer for machine check exception. */ 188 /* Exclusive stacks for system reset and machine check exception. */
189 void *nmi_emergency_sp;
179 void *mc_emergency_sp; 190 void *mc_emergency_sp;
191
192 u16 in_nmi; /* In nmi handler */
193
180 /* 194 /*
181 * Flag to check whether we are in machine check early handler 195 * Flag to check whether we are in machine check early handler
182 * and already using emergency stack. 196 * and already using emergency stack.
183 */ 197 */
184 u16 in_mce; 198 u16 in_mce;
185 u8 hmi_event_available; /* HMI event is available */ 199 u8 hmi_event_available; /* HMI event is available */
186#endif 200#endif
187 201
188 /* Stuff for accurate time accounting */ 202 /* Stuff for accurate time accounting */
@@ -206,23 +220,7 @@ struct paca_struct {
206#endif 220#endif
207}; 221};
208 222
209#ifdef CONFIG_PPC_BOOK3S 223extern void copy_mm_to_paca(struct mm_struct *mm);
210static inline void copy_mm_to_paca(mm_context_t *context)
211{
212 get_paca()->mm_ctx_id = context->id;
213#ifdef CONFIG_PPC_MM_SLICES
214 get_paca()->mm_ctx_low_slices_psize = context->low_slices_psize;
215 memcpy(&get_paca()->mm_ctx_high_slices_psize,
216 &context->high_slices_psize, SLICE_ARRAY_SIZE);
217#else
218 get_paca()->mm_ctx_user_psize = context->user_psize;
219 get_paca()->mm_ctx_sllp = context->sllp;
220#endif
221}
222#else
223static inline void copy_mm_to_paca(mm_context_t *context){}
224#endif
225
226extern struct paca_struct *paca; 224extern struct paca_struct *paca;
227extern void initialise_paca(struct paca_struct *new_paca, int cpu); 225extern void initialise_paca(struct paca_struct *new_paca, int cpu);
228extern void setup_paca(struct paca_struct *new_paca); 226extern void setup_paca(struct paca_struct *new_paca);
diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h
index 3e83d2a20b6f..c4d9654bd637 100644
--- a/arch/powerpc/include/asm/page_64.h
+++ b/arch/powerpc/include/asm/page_64.h
@@ -98,21 +98,7 @@ extern u64 ppc64_pft_size;
98#define GET_LOW_SLICE_INDEX(addr) ((addr) >> SLICE_LOW_SHIFT) 98#define GET_LOW_SLICE_INDEX(addr) ((addr) >> SLICE_LOW_SHIFT)
99#define GET_HIGH_SLICE_INDEX(addr) ((addr) >> SLICE_HIGH_SHIFT) 99#define GET_HIGH_SLICE_INDEX(addr) ((addr) >> SLICE_HIGH_SHIFT)
100 100
101/*
102 * 1 bit per slice and we have one slice per 1TB
103 * Right now we support only 64TB.
104 * IF we change this we will have to change the type
105 * of high_slices
106 */
107#define SLICE_MASK_SIZE 8
108
109#ifndef __ASSEMBLY__ 101#ifndef __ASSEMBLY__
110
111struct slice_mask {
112 u16 low_slices;
113 u64 high_slices;
114};
115
116struct mm_struct; 102struct mm_struct;
117 103
118extern unsigned long slice_get_unmapped_area(unsigned long addr, 104extern unsigned long slice_get_unmapped_area(unsigned long addr,
diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h
index ae0a23091a9b..723bf48e7494 100644
--- a/arch/powerpc/include/asm/perf_event_server.h
+++ b/arch/powerpc/include/asm/perf_event_server.h
@@ -38,6 +38,9 @@ struct power_pmu {
38 unsigned long *valp); 38 unsigned long *valp);
39 int (*get_alternatives)(u64 event_id, unsigned int flags, 39 int (*get_alternatives)(u64 event_id, unsigned int flags,
40 u64 alt[]); 40 u64 alt[]);
41 void (*get_mem_data_src)(union perf_mem_data_src *dsrc,
42 u32 flags, struct pt_regs *regs);
43 void (*get_mem_weight)(u64 *weight);
41 u64 (*bhrb_filter_map)(u64 branch_sample_type); 44 u64 (*bhrb_filter_map)(u64 branch_sample_type);
42 void (*config_bhrb)(u64 pmu_bhrb_filter); 45 void (*config_bhrb)(u64 pmu_bhrb_filter);
43 void (*disable_pmc)(unsigned int pmc, unsigned long mmcr[]); 46 void (*disable_pmc)(unsigned int pmc, unsigned long mmcr[]);
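The two new callbacks let a PMU back-end annotate samples with a memory data source and an access weight. A minimal, hedged sketch of a get_mem_data_src implementation follows; a real back-end would derive the values from the sampled SIER/MMCRA state, which is not shown here.

static void example_get_mem_data_src(union perf_mem_data_src *dsrc,
				     u32 flags, struct pt_regs *regs)
{
	/* Illustrative only: report a load that hit in the L1 cache,
	 * using the generic perf_mem_data_src encoding from
	 * <uapi/linux/perf_event.h>.
	 */
	dsrc->val = 0;
	dsrc->mem_op = PERF_MEM_OP_LOAD;
	dsrc->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
}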
diff --git a/arch/powerpc/include/asm/powernv.h b/arch/powerpc/include/asm/powernv.h
index 0e9c2402dd20..f62797702300 100644
--- a/arch/powerpc/include/asm/powernv.h
+++ b/arch/powerpc/include/asm/powernv.h
@@ -11,9 +11,31 @@
11#define _ASM_POWERNV_H 11#define _ASM_POWERNV_H
12 12
13#ifdef CONFIG_PPC_POWERNV 13#ifdef CONFIG_PPC_POWERNV
14#define NPU2_WRITE 1
14extern void powernv_set_nmmu_ptcr(unsigned long ptcr); 15extern void powernv_set_nmmu_ptcr(unsigned long ptcr);
16extern struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
17 unsigned long flags,
18 struct npu_context *(*cb)(struct npu_context *, void *),
19 void *priv);
20extern void pnv_npu2_destroy_context(struct npu_context *context,
21 struct pci_dev *gpdev);
22extern int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea,
23 unsigned long *flags, unsigned long *status,
24 int count);
15#else 25#else
16static inline void powernv_set_nmmu_ptcr(unsigned long ptcr) { } 26static inline void powernv_set_nmmu_ptcr(unsigned long ptcr) { }
27static inline struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
28 unsigned long flags,
29 struct npu_context *(*cb)(struct npu_context *, void *),
30 void *priv) { return ERR_PTR(-ENODEV); }
31static inline void pnv_npu2_destroy_context(struct npu_context *context,
32 struct pci_dev *gpdev) { }
33
34static inline int pnv_npu2_handle_fault(struct npu_context *context,
35 uintptr_t *ea, unsigned long *flags,
36 unsigned long *status, int count) {
37 return -ENODEV;
38}
17#endif 39#endif
18 40
19#endif /* _ASM_POWERNV_H */ 41#endif /* _ASM_POWERNV_H */
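The pnv_npu2_* entry points form a small context lifecycle for NPU2-attached devices: init with an optional release callback, fault handling while the context is live, then destroy. A hedged usage sketch (function and callback names are hypothetical, not from this patch):

/* Illustrative lifecycle only; real users live in GPU/NVLink2 drivers. */
static struct npu_context *example_release_cb(struct npu_context *ctx,
					      void *priv)
{
	/* driver-specific cleanup when the context is torn down */
	return ctx;
}

static int example_npu2_attach(struct pci_dev *gpdev)
{
	struct npu_context *ctx;

	ctx = pnv_npu2_init_context(gpdev, 0, example_release_cb, NULL);
	if (IS_ERR(ctx))
		return PTR_ERR(ctx);	/* -ENODEV on non-powernv builds */

	/* ... pnv_npu2_handle_fault() would be called from the driver's
	 * fault path while the context is live ...
	 */

	pnv_npu2_destroy_context(ctx, gpdev);
	return 0;
}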
diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index e7d6d86563ee..142d78d645f4 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -161,6 +161,7 @@
161#define PPC_INST_MFTMR 0x7c0002dc 161#define PPC_INST_MFTMR 0x7c0002dc
162#define PPC_INST_MSGSND 0x7c00019c 162#define PPC_INST_MSGSND 0x7c00019c
163#define PPC_INST_MSGCLR 0x7c0001dc 163#define PPC_INST_MSGCLR 0x7c0001dc
164#define PPC_INST_MSGSYNC 0x7c0006ec
164#define PPC_INST_MSGSNDP 0x7c00011c 165#define PPC_INST_MSGSNDP 0x7c00011c
165#define PPC_INST_MTTMR 0x7c0003dc 166#define PPC_INST_MTTMR 0x7c0003dc
166#define PPC_INST_NOP 0x60000000 167#define PPC_INST_NOP 0x60000000
@@ -345,6 +346,7 @@
345 ___PPC_RB(b) | __PPC_EH(eh)) 346 ___PPC_RB(b) | __PPC_EH(eh))
346#define PPC_MSGSND(b) stringify_in_c(.long PPC_INST_MSGSND | \ 347#define PPC_MSGSND(b) stringify_in_c(.long PPC_INST_MSGSND | \
347 ___PPC_RB(b)) 348 ___PPC_RB(b))
349#define PPC_MSGSYNC stringify_in_c(.long PPC_INST_MSGSYNC)
348#define PPC_MSGCLR(b) stringify_in_c(.long PPC_INST_MSGCLR | \ 350#define PPC_MSGCLR(b) stringify_in_c(.long PPC_INST_MSGCLR | \
349 ___PPC_RB(b)) 351 ___PPC_RB(b))
350#define PPC_MSGSNDP(b) stringify_in_c(.long PPC_INST_MSGSNDP | \ 352#define PPC_MSGSNDP(b) stringify_in_c(.long PPC_INST_MSGSNDP | \
diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index e0fecbcea2a2..a4b1d8d6b793 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -102,11 +102,25 @@ void release_thread(struct task_struct *);
102#endif 102#endif
103 103
104#ifdef CONFIG_PPC64 104#ifdef CONFIG_PPC64
105/* 64-bit user address space is 46-bits (64TB user VM) */ 105/*
106#define TASK_SIZE_USER64 (0x0000400000000000UL) 106 * 64-bit user address space can have multiple limits
 107 * For now, the supported values are:
108 */
109#define TASK_SIZE_64TB (0x0000400000000000UL)
110#define TASK_SIZE_128TB (0x0000800000000000UL)
111#define TASK_SIZE_512TB (0x0002000000000000UL)
112
113#ifdef CONFIG_PPC_BOOK3S_64
114/*
115 * Max value currently used:
116 */
117#define TASK_SIZE_USER64 TASK_SIZE_512TB
118#else
119#define TASK_SIZE_USER64 TASK_SIZE_64TB
120#endif
107 121
108/* 122/*
109 * 32-bit user address space is 4GB - 1 page 123 * 32-bit user address space is 4GB - 1 page
110 * (this 1 page is needed so referencing of 0xFFFFFFFF generates EFAULT 124 * (this 1 page is needed so referencing of 0xFFFFFFFF generates EFAULT
111 */ 125 */
112#define TASK_SIZE_USER32 (0x0000000100000000UL - (1*PAGE_SIZE)) 126#define TASK_SIZE_USER32 (0x0000000100000000UL - (1*PAGE_SIZE))
@@ -114,26 +128,37 @@ void release_thread(struct task_struct *);
114#define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_32BIT) ? \ 128#define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_32BIT) ? \
115 TASK_SIZE_USER32 : TASK_SIZE_USER64) 129 TASK_SIZE_USER32 : TASK_SIZE_USER64)
116#define TASK_SIZE TASK_SIZE_OF(current) 130#define TASK_SIZE TASK_SIZE_OF(current)
117
118/* This decides where the kernel will search for a free chunk of vm 131/* This decides where the kernel will search for a free chunk of vm
119 * space during mmap's. 132 * space during mmap's.
120 */ 133 */
121#define TASK_UNMAPPED_BASE_USER32 (PAGE_ALIGN(TASK_SIZE_USER32 / 4)) 134#define TASK_UNMAPPED_BASE_USER32 (PAGE_ALIGN(TASK_SIZE_USER32 / 4))
122#define TASK_UNMAPPED_BASE_USER64 (PAGE_ALIGN(TASK_SIZE_USER64 / 4)) 135#define TASK_UNMAPPED_BASE_USER64 (PAGE_ALIGN(TASK_SIZE_128TB / 4))
123 136
124#define TASK_UNMAPPED_BASE ((is_32bit_task()) ? \ 137#define TASK_UNMAPPED_BASE ((is_32bit_task()) ? \
125 TASK_UNMAPPED_BASE_USER32 : TASK_UNMAPPED_BASE_USER64 ) 138 TASK_UNMAPPED_BASE_USER32 : TASK_UNMAPPED_BASE_USER64 )
126#endif 139#endif
127 140
141/*
142 * Initial task size value for user applications. For book3s 64 we start
143 * with 128TB and conditionally enable upto 512TB
144 */
145#ifdef CONFIG_PPC_BOOK3S_64
146#define DEFAULT_MAP_WINDOW ((is_32bit_task()) ? \
147 TASK_SIZE_USER32 : TASK_SIZE_128TB)
148#else
149#define DEFAULT_MAP_WINDOW TASK_SIZE
150#endif
151
128#ifdef __powerpc64__ 152#ifdef __powerpc64__
129 153
130#define STACK_TOP_USER64 TASK_SIZE_USER64 154/* Limit stack to 128TB */
155#define STACK_TOP_USER64 TASK_SIZE_128TB
131#define STACK_TOP_USER32 TASK_SIZE_USER32 156#define STACK_TOP_USER32 TASK_SIZE_USER32
132 157
133#define STACK_TOP (is_32bit_task() ? \ 158#define STACK_TOP (is_32bit_task() ? \
134 STACK_TOP_USER32 : STACK_TOP_USER64) 159 STACK_TOP_USER32 : STACK_TOP_USER64)
135 160
136#define STACK_TOP_MAX STACK_TOP_USER64 161#define STACK_TOP_MAX TASK_SIZE_USER64
137 162
138#else /* __powerpc64__ */ 163#else /* __powerpc64__ */
139 164
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index fc879fd6bdae..d4f653c9259a 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -310,6 +310,7 @@
310#define SPRN_PMCR 0x374 /* Power Management Control Register */ 310#define SPRN_PMCR 0x374 /* Power Management Control Register */
311 311
312/* HFSCR and FSCR bit numbers are the same */ 312/* HFSCR and FSCR bit numbers are the same */
313#define FSCR_SCV_LG 12 /* Enable System Call Vectored */
313#define FSCR_MSGP_LG 10 /* Enable MSGP */ 314#define FSCR_MSGP_LG 10 /* Enable MSGP */
314#define FSCR_TAR_LG 8 /* Enable Target Address Register */ 315#define FSCR_TAR_LG 8 /* Enable Target Address Register */
315#define FSCR_EBB_LG 7 /* Enable Event Based Branching */ 316#define FSCR_EBB_LG 7 /* Enable Event Based Branching */
@@ -320,6 +321,7 @@
320#define FSCR_VECVSX_LG 1 /* Enable VMX/VSX */ 321#define FSCR_VECVSX_LG 1 /* Enable VMX/VSX */
321#define FSCR_FP_LG 0 /* Enable Floating Point */ 322#define FSCR_FP_LG 0 /* Enable Floating Point */
322#define SPRN_FSCR 0x099 /* Facility Status & Control Register */ 323#define SPRN_FSCR 0x099 /* Facility Status & Control Register */
324#define FSCR_SCV __MASK(FSCR_SCV_LG)
323#define FSCR_TAR __MASK(FSCR_TAR_LG) 325#define FSCR_TAR __MASK(FSCR_TAR_LG)
324#define FSCR_EBB __MASK(FSCR_EBB_LG) 326#define FSCR_EBB __MASK(FSCR_EBB_LG)
325#define FSCR_DSCR __MASK(FSCR_DSCR_LG) 327#define FSCR_DSCR __MASK(FSCR_DSCR_LG)
@@ -365,6 +367,7 @@
365#define LPCR_MER_SH 11 367#define LPCR_MER_SH 11
366#define LPCR_GTSE ASM_CONST(0x0000000000000400) /* Guest Translation Shootdown Enable */ 368#define LPCR_GTSE ASM_CONST(0x0000000000000400) /* Guest Translation Shootdown Enable */
367#define LPCR_TC ASM_CONST(0x0000000000000200) /* Translation control */ 369#define LPCR_TC ASM_CONST(0x0000000000000200) /* Translation control */
370#define LPCR_HEIC ASM_CONST(0x0000000000000010) /* Hypervisor External Interrupt Control */
368#define LPCR_LPES 0x0000000c 371#define LPCR_LPES 0x0000000c
369#define LPCR_LPES0 ASM_CONST(0x0000000000000008) /* LPAR Env selector 0 */ 372#define LPCR_LPES0 ASM_CONST(0x0000000000000008) /* LPAR Env selector 0 */
370#define LPCR_LPES1 ASM_CONST(0x0000000000000004) /* LPAR Env selector 1 */ 373#define LPCR_LPES1 ASM_CONST(0x0000000000000004) /* LPAR Env selector 1 */
@@ -656,6 +659,7 @@
656#define SRR1_ISI_PROT 0x08000000 /* ISI: Other protection fault */ 659#define SRR1_ISI_PROT 0x08000000 /* ISI: Other protection fault */
657#define SRR1_WAKEMASK 0x00380000 /* reason for wakeup */ 660#define SRR1_WAKEMASK 0x00380000 /* reason for wakeup */
658#define SRR1_WAKEMASK_P8 0x003c0000 /* reason for wakeup on POWER8 and 9 */ 661#define SRR1_WAKEMASK_P8 0x003c0000 /* reason for wakeup on POWER8 and 9 */
662#define SRR1_WAKEMCE_RESVD 0x003c0000 /* Unused/reserved value used by MCE wakeup to indicate cause to idle wakeup handler */
659#define SRR1_WAKESYSERR 0x00300000 /* System error */ 663#define SRR1_WAKESYSERR 0x00300000 /* System error */
660#define SRR1_WAKEEE 0x00200000 /* External interrupt */ 664#define SRR1_WAKEEE 0x00200000 /* External interrupt */
661#define SRR1_WAKEHVI 0x00240000 /* Hypervisor Virtualization Interrupt (P9) */ 665#define SRR1_WAKEHVI 0x00240000 /* Hypervisor Virtualization Interrupt (P9) */
diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h
index 7dc006b58369..7902d6358854 100644
--- a/arch/powerpc/include/asm/sections.h
+++ b/arch/powerpc/include/asm/sections.h
@@ -6,6 +6,8 @@
6#include <linux/uaccess.h> 6#include <linux/uaccess.h>
7#include <asm-generic/sections.h> 7#include <asm-generic/sections.h>
8 8
9extern char __head_end[];
10
9#ifdef __powerpc64__ 11#ifdef __powerpc64__
10 12
11extern char __start_interrupts[]; 13extern char __start_interrupts[];
diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 32db16d2e7ad..ebddb2111d87 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -40,10 +40,12 @@ extern int cpu_to_chip_id(int cpu);
40struct smp_ops_t { 40struct smp_ops_t {
41 void (*message_pass)(int cpu, int msg); 41 void (*message_pass)(int cpu, int msg);
42#ifdef CONFIG_PPC_SMP_MUXED_IPI 42#ifdef CONFIG_PPC_SMP_MUXED_IPI
43 void (*cause_ipi)(int cpu, unsigned long data); 43 void (*cause_ipi)(int cpu);
44#endif 44#endif
45 int (*cause_nmi_ipi)(int cpu);
45 void (*probe)(void); 46 void (*probe)(void);
46 int (*kick_cpu)(int nr); 47 int (*kick_cpu)(int nr);
48 int (*prepare_cpu)(int nr);
47 void (*setup_cpu)(int nr); 49 void (*setup_cpu)(int nr);
48 void (*bringup_done)(void); 50 void (*bringup_done)(void);
49 void (*take_timebase)(void); 51 void (*take_timebase)(void);
@@ -61,7 +63,6 @@ extern void smp_generic_take_timebase(void);
61DECLARE_PER_CPU(unsigned int, cpu_pvr); 63DECLARE_PER_CPU(unsigned int, cpu_pvr);
62 64
63#ifdef CONFIG_HOTPLUG_CPU 65#ifdef CONFIG_HOTPLUG_CPU
64extern void migrate_irqs(void);
65int generic_cpu_disable(void); 66int generic_cpu_disable(void);
66void generic_cpu_die(unsigned int cpu); 67void generic_cpu_die(unsigned int cpu);
67void generic_set_cpu_dead(unsigned int cpu); 68void generic_set_cpu_dead(unsigned int cpu);
@@ -112,23 +113,31 @@ extern int cpu_to_core_id(int cpu);
112 * 113 *
113 * Make sure this matches openpic_request_IPIs in open_pic.c, or what shows up 114 * Make sure this matches openpic_request_IPIs in open_pic.c, or what shows up
114 * in /proc/interrupts will be wrong!!! --Troy */ 115 * in /proc/interrupts will be wrong!!! --Troy */
115#define PPC_MSG_CALL_FUNCTION 0 116#define PPC_MSG_CALL_FUNCTION 0
116#define PPC_MSG_RESCHEDULE 1 117#define PPC_MSG_RESCHEDULE 1
117#define PPC_MSG_TICK_BROADCAST 2 118#define PPC_MSG_TICK_BROADCAST 2
118#define PPC_MSG_DEBUGGER_BREAK 3 119#define PPC_MSG_NMI_IPI 3
119 120
120/* This is only used by the powernv kernel */ 121/* This is only used by the powernv kernel */
121#define PPC_MSG_RM_HOST_ACTION 4 122#define PPC_MSG_RM_HOST_ACTION 4
122 123
124#define NMI_IPI_ALL_OTHERS -2
125
126#ifdef CONFIG_NMI_IPI
127extern int smp_handle_nmi_ipi(struct pt_regs *regs);
128#else
129static inline int smp_handle_nmi_ipi(struct pt_regs *regs) { return 0; }
130#endif
131
123/* for irq controllers that have dedicated ipis per message (4) */ 132/* for irq controllers that have dedicated ipis per message (4) */
124extern int smp_request_message_ipi(int virq, int message); 133extern int smp_request_message_ipi(int virq, int message);
125extern const char *smp_ipi_name[]; 134extern const char *smp_ipi_name[];
126 135
127/* for irq controllers with only a single ipi */ 136/* for irq controllers with only a single ipi */
128extern void smp_muxed_ipi_set_data(int cpu, unsigned long data);
129extern void smp_muxed_ipi_message_pass(int cpu, int msg); 137extern void smp_muxed_ipi_message_pass(int cpu, int msg);
130extern void smp_muxed_ipi_set_message(int cpu, int msg); 138extern void smp_muxed_ipi_set_message(int cpu, int msg);
131extern irqreturn_t smp_ipi_demux(void); 139extern irqreturn_t smp_ipi_demux(void);
140extern irqreturn_t smp_ipi_demux_relaxed(void);
132 141
133void smp_init_pSeries(void); 142void smp_init_pSeries(void);
134void smp_init_cell(void); 143void smp_init_cell(void);
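smp_handle_nmi_ipi() is the hand-off point between the low-level system reset path and the NMI IPI layer. A hedged sketch of how an exception handler might use it (the surrounding function is illustrative):

/* Sketch: let the NMI IPI layer claim a system reset if it was the sender;
 * return 1 if handled so the caller can skip further processing.
 */
static int example_system_reset(struct pt_regs *regs)
{
	if (smp_handle_nmi_ipi(regs))
		return 1;

	return 0;	/* not an NMI IPI; continue normal handling */
}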
diff --git a/arch/powerpc/include/asm/syscalls.h b/arch/powerpc/include/asm/syscalls.h
index 23be8f1e7e64..16fab6898240 100644
--- a/arch/powerpc/include/asm/syscalls.h
+++ b/arch/powerpc/include/asm/syscalls.h
@@ -8,10 +8,10 @@
8 8
9struct rtas_args; 9struct rtas_args;
10 10
11asmlinkage unsigned long sys_mmap(unsigned long addr, size_t len, 11asmlinkage long sys_mmap(unsigned long addr, size_t len,
12 unsigned long prot, unsigned long flags, 12 unsigned long prot, unsigned long flags,
13 unsigned long fd, off_t offset); 13 unsigned long fd, off_t offset);
14asmlinkage unsigned long sys_mmap2(unsigned long addr, size_t len, 14asmlinkage long sys_mmap2(unsigned long addr, size_t len,
15 unsigned long prot, unsigned long flags, 15 unsigned long prot, unsigned long flags,
16 unsigned long fd, unsigned long pgoff); 16 unsigned long fd, unsigned long pgoff);
17asmlinkage long ppc64_personality(unsigned long personality); 17asmlinkage long ppc64_personality(unsigned long personality);
diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
index 6fc6464f7421..a941cc6fc3e9 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -10,15 +10,7 @@
10 10
11#ifdef __KERNEL__ 11#ifdef __KERNEL__
12 12
13/* We have 8k stacks on ppc32 and 16k on ppc64 */ 13#define THREAD_SHIFT CONFIG_THREAD_SHIFT
14
15#if defined(CONFIG_PPC64)
16#define THREAD_SHIFT 14
17#elif defined(CONFIG_PPC_256K_PAGES)
18#define THREAD_SHIFT 15
19#else
20#define THREAD_SHIFT 13
21#endif
22 14
23#define THREAD_SIZE (1 << THREAD_SHIFT) 15#define THREAD_SIZE (1 << THREAD_SHIFT)
24 16
diff --git a/arch/powerpc/include/asm/xics.h b/arch/powerpc/include/asm/xics.h
index e0b9e576905a..7ce2c3ac2964 100644
--- a/arch/powerpc/include/asm/xics.h
+++ b/arch/powerpc/include/asm/xics.h
@@ -57,7 +57,7 @@ struct icp_ops {
57 void (*teardown_cpu)(void); 57 void (*teardown_cpu)(void);
58 void (*flush_ipi)(void); 58 void (*flush_ipi)(void);
59#ifdef CONFIG_SMP 59#ifdef CONFIG_SMP
60 void (*cause_ipi)(int cpu, unsigned long data); 60 void (*cause_ipi)(int cpu);
61 irq_handler_t ipi_action; 61 irq_handler_t ipi_action;
62#endif 62#endif
63}; 63};
diff --git a/arch/powerpc/include/asm/xive-regs.h b/arch/powerpc/include/asm/xive-regs.h
new file mode 100644
index 000000000000..1d3f2be5ae39
--- /dev/null
+++ b/arch/powerpc/include/asm/xive-regs.h
@@ -0,0 +1,97 @@
1/*
2 * Copyright 2016,2017 IBM Corporation.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 */
9#ifndef _ASM_POWERPC_XIVE_REGS_H
10#define _ASM_POWERPC_XIVE_REGS_H
11
12/*
13 * Thread Management (aka "TM") registers
14 */
15
16/* TM register offsets */
17#define TM_QW0_USER 0x000 /* All rings */
18#define TM_QW1_OS 0x010 /* Ring 0..2 */
19#define TM_QW2_HV_POOL 0x020 /* Ring 0..1 */
20#define TM_QW3_HV_PHYS 0x030 /* Ring 0..1 */
21
22/* Byte offsets inside a QW QW0 QW1 QW2 QW3 */
23#define TM_NSR 0x0 /* + + - + */
24#define TM_CPPR 0x1 /* - + - + */
25#define TM_IPB 0x2 /* - + + + */
26#define TM_LSMFB 0x3 /* - + + + */
27#define TM_ACK_CNT 0x4 /* - + - - */
28#define TM_INC 0x5 /* - + - + */
29#define TM_AGE 0x6 /* - + - + */
30#define TM_PIPR 0x7 /* - + - + */
31
32#define TM_WORD0 0x0
33#define TM_WORD1 0x4
34
35/*
36 * QW word 2 contains the valid bit at the top and other fields
37 * depending on the QW.
38 */
39#define TM_WORD2 0x8
40#define TM_QW0W2_VU PPC_BIT32(0)
41#define TM_QW0W2_LOGIC_SERV PPC_BITMASK32(1,31) // XX 2,31 ?
42#define TM_QW1W2_VO PPC_BIT32(0)
43#define TM_QW1W2_OS_CAM PPC_BITMASK32(8,31)
44#define TM_QW2W2_VP PPC_BIT32(0)
45#define TM_QW2W2_POOL_CAM PPC_BITMASK32(8,31)
46#define TM_QW3W2_VT PPC_BIT32(0)
47#define TM_QW3W2_LP PPC_BIT32(6)
48#define TM_QW3W2_LE PPC_BIT32(7)
49#define TM_QW3W2_T PPC_BIT32(31)
50
51/*
52 * In addition to normal loads to "peek" and writes (only when invalid)
 53 * using 4- and 8-byte accesses, the above registers support these
54 * "special" byte operations:
55 *
56 * - Byte load from QW0[NSR] - User level NSR (EBB)
57 * - Byte store to QW0[NSR] - User level NSR (EBB)
58 * - Byte load/store to QW1[CPPR] and QW3[CPPR] - CPPR access
59 * - Byte load from QW3[TM_WORD2] - Read VT||00000||LP||LE on thrd 0
60 * otherwise VT||0000000
61 * - Byte store to QW3[TM_WORD2] - Set VT bit (and LP/LE if present)
62 *
 63 * Then we have all these "special" CI ops at these offsets that trigger
64 * all sorts of side effects:
65 */
66#define TM_SPC_ACK_EBB 0x800 /* Load8 ack EBB to reg*/
67#define TM_SPC_ACK_OS_REG 0x810 /* Load16 ack OS irq to reg */
68#define TM_SPC_PUSH_USR_CTX 0x808 /* Store32 Push/Validate user context */
69#define TM_SPC_PULL_USR_CTX 0x808 /* Load32 Pull/Invalidate user context */
70#define TM_SPC_SET_OS_PENDING 0x812 /* Store8 Set OS irq pending bit */
71#define TM_SPC_PULL_OS_CTX 0x818 /* Load32/Load64 Pull/Invalidate OS context to reg */
72#define TM_SPC_PULL_POOL_CTX 0x828 /* Load32/Load64 Pull/Invalidate Pool context to reg*/
73#define TM_SPC_ACK_HV_REG 0x830 /* Load16 ack HV irq to reg */
74#define TM_SPC_PULL_USR_CTX_OL 0xc08 /* Store8 Pull/Inval usr ctx to odd line */
75#define TM_SPC_ACK_OS_EL 0xc10 /* Store8 ack OS irq to even line */
76#define TM_SPC_ACK_HV_POOL_EL 0xc20 /* Store8 ack HV evt pool to even line */
77#define TM_SPC_ACK_HV_EL 0xc30 /* Store8 ack HV irq to even line */
78/* XXX more... */
79
80/* NSR fields for the various QW ack types */
81#define TM_QW0_NSR_EB PPC_BIT8(0)
82#define TM_QW1_NSR_EO PPC_BIT8(0)
83#define TM_QW3_NSR_HE PPC_BITMASK8(0,1)
84#define TM_QW3_NSR_HE_NONE 0
85#define TM_QW3_NSR_HE_POOL 1
86#define TM_QW3_NSR_HE_PHYS 2
87#define TM_QW3_NSR_HE_LSI 3
88#define TM_QW3_NSR_I PPC_BIT8(2)
89#define TM_QW3_NSR_GRP_LVL PPC_BIT8(3,7)
90
 91/* Utilities to manipulate these (originally from OPAL) */
92#define MASK_TO_LSH(m) (__builtin_ffsl(m) - 1)
93#define GETFIELD(m, v) (((v) & (m)) >> MASK_TO_LSH(m))
94#define SETFIELD(m, v, val) \
95 (((v) & ~(m)) | ((((typeof(v))(val)) << MASK_TO_LSH(m)) & (m)))
96
97#endif /* _ASM_POWERPC_XIVE_REGS_H */
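The GETFIELD/SETFIELD helpers work on the mask-style defines above (IBM bit numbering). A small hedged example of decoding the 2-bit HE field from a QW3 NSR byte, e.g. after one of the ack loads; the helper name is illustrative:

/* Sketch: nsr would come from an ack load such as TM_SPC_ACK_HV_REG. */
static inline bool example_nsr_is_phys(u8 nsr)
{
	return GETFIELD(TM_QW3_NSR_HE, nsr) == TM_QW3_NSR_HE_PHYS;
}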
diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
new file mode 100644
index 000000000000..3cdbeaeac397
--- /dev/null
+++ b/arch/powerpc/include/asm/xive.h
@@ -0,0 +1,163 @@
1/*
2 * Copyright 2016,2017 IBM Corporation.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 */
9#ifndef _ASM_POWERPC_XIVE_H
10#define _ASM_POWERPC_XIVE_H
11
12#define XIVE_INVALID_VP 0xffffffff
13
14#ifdef CONFIG_PPC_XIVE
15
16/*
17 * Thread Interrupt Management Area (TIMA)
18 *
19 * This is a global MMIO region divided in 4 pages of varying access
20 * permissions, providing access to per-cpu interrupt management
21 * functions. It always identifies the CPU doing the access based
 22 * on the PowerBus initiator ID, so we always access via the
23 * same offset regardless of where the code is executing
24 */
25extern void __iomem *xive_tima;
26
27/*
28 * Offset in the TM area of our current execution level (provided by
29 * the backend)
30 */
31extern u32 xive_tima_offset;
32
33/*
 34 * Per-irq data (irq_get_handler_data for normal IRQs); IPIs
35 * have it stored in the xive_cpu structure. We also cache
36 * for normal interrupts the current target CPU.
37 *
38 * This structure is setup by the backend for each interrupt.
39 */
40struct xive_irq_data {
41 u64 flags;
42 u64 eoi_page;
43 void __iomem *eoi_mmio;
44 u64 trig_page;
45 void __iomem *trig_mmio;
46 u32 esb_shift;
47 int src_chip;
48
49 /* Setup/used by frontend */
50 int target;
51 bool saved_p;
52};
53#define XIVE_IRQ_FLAG_STORE_EOI 0x01
54#define XIVE_IRQ_FLAG_LSI 0x02
55#define XIVE_IRQ_FLAG_SHIFT_BUG 0x04
56#define XIVE_IRQ_FLAG_MASK_FW 0x08
57#define XIVE_IRQ_FLAG_EOI_FW 0x10
58
59#define XIVE_INVALID_CHIP_ID -1
60
61/* A queue tracking structure in a CPU */
62struct xive_q {
63 __be32 *qpage;
64 u32 msk;
65 u32 idx;
66 u32 toggle;
67 u64 eoi_phys;
68 u32 esc_irq;
69 atomic_t count;
70 atomic_t pending_count;
71};
72
73/*
74 * "magic" Event State Buffer (ESB) MMIO offsets.
75 *
76 * Each interrupt source has a 2-bit state machine called ESB
77 * which can be controlled by MMIO. It's made of 2 bits, P and
78 * Q. P indicates that an interrupt is pending (has been sent
79 * to a queue and is waiting for an EOI). Q indicates that the
80 * interrupt has been triggered while pending.
81 *
82 * This acts as a coalescing mechanism in order to guarantee
83 * that a given interrupt only occurs at most once in a queue.
84 *
85 * When doing an EOI, the Q bit will indicate if the interrupt
86 * needs to be re-triggered.
87 *
 88 * The following offsets into the ESB MMIO allow reading or
 89 * manipulating the PQ bits. They must be used with an 8-byte
90 * load instruction. They all return the previous state of the
91 * interrupt (atomically).
92 *
93 * Additionally, some ESB pages support doing an EOI via a
94 * store at 0 and some ESBs support doing a trigger via a
95 * separate trigger page.
96 */
97#define XIVE_ESB_GET 0x800
98#define XIVE_ESB_SET_PQ_00 0xc00
99#define XIVE_ESB_SET_PQ_01 0xd00
100#define XIVE_ESB_SET_PQ_10 0xe00
101#define XIVE_ESB_SET_PQ_11 0xf00
102#define XIVE_ESB_MASK XIVE_ESB_SET_PQ_01
103
104#define XIVE_ESB_VAL_P 0x2
105#define XIVE_ESB_VAL_Q 0x1
106
107/* Global enable flags for the XIVE support */
108extern bool __xive_enabled;
109
110static inline bool xive_enabled(void) { return __xive_enabled; }
111
112extern bool xive_native_init(void);
113extern void xive_smp_probe(void);
114extern int xive_smp_prepare_cpu(unsigned int cpu);
115extern void xive_smp_setup_cpu(void);
116extern void xive_smp_disable_cpu(void);
117extern void xive_kexec_teardown_cpu(int secondary);
118extern void xive_shutdown(void);
119extern void xive_flush_interrupt(void);
120
121/* xmon hook */
122extern void xmon_xive_do_dump(int cpu);
123
124/* APIs used by KVM */
125extern u32 xive_native_default_eq_shift(void);
126extern u32 xive_native_alloc_vp_block(u32 max_vcpus);
127extern void xive_native_free_vp_block(u32 vp_base);
128extern int xive_native_populate_irq_data(u32 hw_irq,
129 struct xive_irq_data *data);
130extern void xive_cleanup_irq_data(struct xive_irq_data *xd);
131extern u32 xive_native_alloc_irq(void);
132extern void xive_native_free_irq(u32 irq);
133extern int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq);
134
135extern int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio,
136 __be32 *qpage, u32 order, bool can_escalate);
137extern void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio);
138
139extern bool __xive_irq_trigger(struct xive_irq_data *xd);
140extern bool __xive_irq_retrigger(struct xive_irq_data *xd);
141extern void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd);
142
143extern bool is_xive_irq(struct irq_chip *chip);
144
145#else
146
147static inline bool xive_enabled(void) { return false; }
148
149static inline bool xive_native_init(void) { return false; }
150static inline void xive_smp_probe(void) { }
 151static inline int xive_smp_prepare_cpu(unsigned int cpu) { return -EINVAL; }
152static inline void xive_smp_setup_cpu(void) { }
153static inline void xive_smp_disable_cpu(void) { }
154static inline void xive_kexec_teardown_cpu(int secondary) { }
155static inline void xive_shutdown(void) { }
156static inline void xive_flush_interrupt(void) { }
157
158static inline u32 xive_native_alloc_vp_block(u32 max_vcpus) { return XIVE_INVALID_VP; }
159static inline void xive_native_free_vp_block(u32 vp_base) { }
160
161#endif
162
163#endif /* _ASM_POWERPC_XIVE_H */
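The ESB offsets above are used with MMIO loads that atomically return the previous PQ state while optionally changing it. A hedged sketch of masking a source by driving PQ to 01 through its ESB page; the accessor choice and function name are illustrative:

/* Sketch: set PQ=01 (masked) and inspect the previous state. */
static void example_mask_source(struct xive_irq_data *xd)
{
	u8 pq = (u8)in_be64(xd->eoi_mmio + XIVE_ESB_SET_PQ_01);

	if (pq & XIVE_ESB_VAL_Q) {
		/* The source fired while pending; a real driver would
		 * remember this so it can re-trigger on unmask.
		 */
	}
}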
diff --git a/arch/powerpc/include/asm/xmon.h b/arch/powerpc/include/asm/xmon.h
index 5eb8e599e5cc..eb42a0c6e1d9 100644
--- a/arch/powerpc/include/asm/xmon.h
+++ b/arch/powerpc/include/asm/xmon.h
@@ -29,5 +29,7 @@ static inline void xmon_register_spus(struct list_head *list) { };
29extern int cpus_are_in_xmon(void); 29extern int cpus_are_in_xmon(void);
30#endif 30#endif
31 31
32extern void xmon_printf(const char *format, ...);
33
32#endif /* __KERNEL __ */ 34#endif /* __KERNEL __ */
33#endif /* __ASM_POWERPC_XMON_H */ 35#endif /* __ASM_POWERPC_XMON_H */
diff --git a/arch/powerpc/include/uapi/asm/mman.h b/arch/powerpc/include/uapi/asm/mman.h
index 03c06ba7464f..ab45cc2f3101 100644
--- a/arch/powerpc/include/uapi/asm/mman.h
+++ b/arch/powerpc/include/uapi/asm/mman.h
@@ -29,4 +29,20 @@
29#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ 29#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */
30#define MAP_HUGETLB 0x40000 /* create a huge page mapping */ 30#define MAP_HUGETLB 0x40000 /* create a huge page mapping */
31 31
32/*
 33 * When MAP_HUGETLB is set, bits [26:31] of the flags argument to mmap(2)
34 * encode the log2 of the huge page size. A value of zero indicates that the
35 * default huge page size should be used. To use a non-default huge page size,
36 * one of these defines can be used, or the size can be encoded by hand. Note
37 * that on most systems only a subset, or possibly none, of these sizes will be
38 * available.
39 */
40#define MAP_HUGE_512KB (19 << MAP_HUGE_SHIFT) /* 512KB HugeTLB Page */
41#define MAP_HUGE_1MB (20 << MAP_HUGE_SHIFT) /* 1MB HugeTLB Page */
42#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) /* 2MB HugeTLB Page */
43#define MAP_HUGE_8MB (23 << MAP_HUGE_SHIFT) /* 8MB HugeTLB Page */
44#define MAP_HUGE_16MB (24 << MAP_HUGE_SHIFT) /* 16MB HugeTLB Page */
45#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) /* 1GB HugeTLB Page */
46#define MAP_HUGE_16GB (34 << MAP_HUGE_SHIFT) /* 16GB HugeTLB Page */
47
32#endif /* _UAPI_ASM_POWERPC_MMAN_H */ 48#endif /* _UAPI_ASM_POWERPC_MMAN_H */
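Since the huge page size is encoded in the upper flag bits, a user-space caller can pick one of the new defines directly. A hedged sketch using MAP_HUGE_16MB; whether a 16MB hugetlb pool exists depends on the system configuration:

#include <stdio.h>
#include <sys/mman.h>
#include <linux/mman.h>			/* MAP_HUGE_SHIFT */

#ifndef MAP_HUGE_16MB
#define MAP_HUGE_16MB (24 << MAP_HUGE_SHIFT)	/* mirror of the define above */
#endif

int main(void)
{
	size_t len = 2 * (16UL << 20);		/* two 16MB huge pages */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_16MB,
		       -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");			/* e.g. no 16MB hugetlb pages reserved */
		return 1;
	}
	munmap(p, len);
	return 0;
}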
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 811f441a125f..b9db46ae545b 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -25,8 +25,6 @@ CFLAGS_REMOVE_cputable.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
25CFLAGS_REMOVE_prom_init.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) 25CFLAGS_REMOVE_prom_init.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
26CFLAGS_REMOVE_btext.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) 26CFLAGS_REMOVE_btext.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
27CFLAGS_REMOVE_prom.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) 27CFLAGS_REMOVE_prom.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
28# do not trace tracer code
29CFLAGS_REMOVE_ftrace.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
30# timers used by tracing 28# timers used by tracing
31CFLAGS_REMOVE_time.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) 29CFLAGS_REMOVE_time.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
32endif 30endif
@@ -97,6 +95,7 @@ obj-$(CONFIG_BOOTX_TEXT) += btext.o
97obj-$(CONFIG_SMP) += smp.o 95obj-$(CONFIG_SMP) += smp.o
98obj-$(CONFIG_KPROBES) += kprobes.o 96obj-$(CONFIG_KPROBES) += kprobes.o
99obj-$(CONFIG_OPTPROBES) += optprobes.o optprobes_head.o 97obj-$(CONFIG_OPTPROBES) += optprobes.o optprobes_head.o
98obj-$(CONFIG_KPROBES_ON_FTRACE) += kprobes-ftrace.o
100obj-$(CONFIG_UPROBES) += uprobes.o 99obj-$(CONFIG_UPROBES) += uprobes.o
101obj-$(CONFIG_PPC_UDBG_16550) += legacy_serial.o udbg_16550.o 100obj-$(CONFIG_PPC_UDBG_16550) += legacy_serial.o udbg_16550.o
102obj-$(CONFIG_STACKTRACE) += stacktrace.o 101obj-$(CONFIG_STACKTRACE) += stacktrace.o
@@ -118,10 +117,7 @@ obj64-$(CONFIG_AUDIT) += compat_audit.o
118 117
119obj-$(CONFIG_PPC_IO_WORKAROUNDS) += io-workarounds.o 118obj-$(CONFIG_PPC_IO_WORKAROUNDS) += io-workarounds.o
120 119
121obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o 120obj-y += trace/
122obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
123obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
124obj-$(CONFIG_TRACING) += trace_clock.o
125 121
126ifneq ($(CONFIG_PPC_INDIRECT_PIO),y) 122ifneq ($(CONFIG_PPC_INDIRECT_PIO),y)
127obj-y += iomap.o 123obj-y += iomap.o
@@ -142,14 +138,14 @@ obj-$(CONFIG_KVM_GUEST) += kvm.o kvm_emul.o
142# Disable GCOV & sanitizers in odd or sensitive code 138# Disable GCOV & sanitizers in odd or sensitive code
143GCOV_PROFILE_prom_init.o := n 139GCOV_PROFILE_prom_init.o := n
144UBSAN_SANITIZE_prom_init.o := n 140UBSAN_SANITIZE_prom_init.o := n
145GCOV_PROFILE_ftrace.o := n
146UBSAN_SANITIZE_ftrace.o := n
147GCOV_PROFILE_machine_kexec_64.o := n 141GCOV_PROFILE_machine_kexec_64.o := n
148UBSAN_SANITIZE_machine_kexec_64.o := n 142UBSAN_SANITIZE_machine_kexec_64.o := n
149GCOV_PROFILE_machine_kexec_32.o := n 143GCOV_PROFILE_machine_kexec_32.o := n
150UBSAN_SANITIZE_machine_kexec_32.o := n 144UBSAN_SANITIZE_machine_kexec_32.o := n
151GCOV_PROFILE_kprobes.o := n 145GCOV_PROFILE_kprobes.o := n
152UBSAN_SANITIZE_kprobes.o := n 146UBSAN_SANITIZE_kprobes.o := n
147GCOV_PROFILE_kprobes-ftrace.o := n
148UBSAN_SANITIZE_kprobes-ftrace.o := n
153UBSAN_SANITIZE_vdso.o := n 149UBSAN_SANITIZE_vdso.o := n
154 150
155extra-$(CONFIG_PPC_FPU) += fpu.o 151extra-$(CONFIG_PPC_FPU) += fpu.o
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 4367e7df51a1..439c257dec4a 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -185,6 +185,7 @@ int main(void)
185#ifdef CONFIG_PPC_MM_SLICES 185#ifdef CONFIG_PPC_MM_SLICES
186 OFFSET(PACALOWSLICESPSIZE, paca_struct, mm_ctx_low_slices_psize); 186 OFFSET(PACALOWSLICESPSIZE, paca_struct, mm_ctx_low_slices_psize);
187 OFFSET(PACAHIGHSLICEPSIZE, paca_struct, mm_ctx_high_slices_psize); 187 OFFSET(PACAHIGHSLICEPSIZE, paca_struct, mm_ctx_high_slices_psize);
188 DEFINE(PACA_ADDR_LIMIT, offsetof(struct paca_struct, addr_limit));
188 DEFINE(MMUPSIZEDEFSIZE, sizeof(struct mmu_psize_def)); 189 DEFINE(MMUPSIZEDEFSIZE, sizeof(struct mmu_psize_def));
189#endif /* CONFIG_PPC_MM_SLICES */ 190#endif /* CONFIG_PPC_MM_SLICES */
190#endif 191#endif
@@ -219,6 +220,7 @@ int main(void)
219 OFFSET(PACA_EXGEN, paca_struct, exgen); 220 OFFSET(PACA_EXGEN, paca_struct, exgen);
220 OFFSET(PACA_EXMC, paca_struct, exmc); 221 OFFSET(PACA_EXMC, paca_struct, exmc);
221 OFFSET(PACA_EXSLB, paca_struct, exslb); 222 OFFSET(PACA_EXSLB, paca_struct, exslb);
223 OFFSET(PACA_EXNMI, paca_struct, exnmi);
222 OFFSET(PACALPPACAPTR, paca_struct, lppaca_ptr); 224 OFFSET(PACALPPACAPTR, paca_struct, lppaca_ptr);
223 OFFSET(PACA_SLBSHADOWPTR, paca_struct, slb_shadow_ptr); 225 OFFSET(PACA_SLBSHADOWPTR, paca_struct, slb_shadow_ptr);
224 OFFSET(SLBSHADOW_STACKVSID, slb_shadow, save_area[SLB_NUM_BOLTED - 1].vsid); 226 OFFSET(SLBSHADOW_STACKVSID, slb_shadow, save_area[SLB_NUM_BOLTED - 1].vsid);
@@ -232,7 +234,9 @@ int main(void)
232 OFFSET(PACAEMERGSP, paca_struct, emergency_sp); 234 OFFSET(PACAEMERGSP, paca_struct, emergency_sp);
233#ifdef CONFIG_PPC_BOOK3S_64 235#ifdef CONFIG_PPC_BOOK3S_64
234 OFFSET(PACAMCEMERGSP, paca_struct, mc_emergency_sp); 236 OFFSET(PACAMCEMERGSP, paca_struct, mc_emergency_sp);
237 OFFSET(PACA_NMI_EMERG_SP, paca_struct, nmi_emergency_sp);
235 OFFSET(PACA_IN_MCE, paca_struct, in_mce); 238 OFFSET(PACA_IN_MCE, paca_struct, in_mce);
239 OFFSET(PACA_IN_NMI, paca_struct, in_nmi);
236#endif 240#endif
237 OFFSET(PACAHWCPUID, paca_struct, hw_cpu_id); 241 OFFSET(PACAHWCPUID, paca_struct, hw_cpu_id);
238 OFFSET(PACAKEXECSTATE, paca_struct, kexec_state); 242 OFFSET(PACAKEXECSTATE, paca_struct, kexec_state);
@@ -399,8 +403,8 @@ int main(void)
399 DEFINE(BUG_ENTRY_SIZE, sizeof(struct bug_entry)); 403 DEFINE(BUG_ENTRY_SIZE, sizeof(struct bug_entry));
400#endif 404#endif
401 405
402#ifdef MAX_PGD_TABLE_SIZE 406#ifdef CONFIG_PPC_BOOK3S_64
403 DEFINE(PGD_TABLE_SIZE, MAX_PGD_TABLE_SIZE); 407 DEFINE(PGD_TABLE_SIZE, (sizeof(pgd_t) << max(RADIX_PGD_INDEX_SIZE, H_PGD_INDEX_SIZE)));
404#else 408#else
405 DEFINE(PGD_TABLE_SIZE, PGD_TABLE_SIZE); 409 DEFINE(PGD_TABLE_SIZE, PGD_TABLE_SIZE);
406#endif 410#endif
@@ -727,6 +731,7 @@ int main(void)
727 OFFSET(PACA_THREAD_IDLE_STATE, paca_struct, thread_idle_state); 731 OFFSET(PACA_THREAD_IDLE_STATE, paca_struct, thread_idle_state);
728 OFFSET(PACA_THREAD_MASK, paca_struct, thread_mask); 732 OFFSET(PACA_THREAD_MASK, paca_struct, thread_mask);
729 OFFSET(PACA_SUBCORE_SIBLING_MASK, paca_struct, subcore_sibling_mask); 733 OFFSET(PACA_SUBCORE_SIBLING_MASK, paca_struct, subcore_sibling_mask);
734 OFFSET(PACA_SIBLING_PACA_PTRS, paca_struct, thread_sibling_pacas);
730#endif 735#endif
731 736
732 DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER); 737 DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER);
diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S
index 7fe8c79e6937..10cb2896b2ae 100644
--- a/arch/powerpc/kernel/cpu_setup_power.S
+++ b/arch/powerpc/kernel/cpu_setup_power.S
@@ -29,7 +29,8 @@ _GLOBAL(__setup_cpu_power7)
29 li r0,0 29 li r0,0
30 mtspr SPRN_LPID,r0 30 mtspr SPRN_LPID,r0
31 mfspr r3,SPRN_LPCR 31 mfspr r3,SPRN_LPCR
32 bl __init_LPCR 32 li r4,(LPCR_LPES1 >> LPCR_LPES_SH)
33 bl __init_LPCR_ISA206
33 bl __init_tlb_power7 34 bl __init_tlb_power7
34 mtlr r11 35 mtlr r11
35 blr 36 blr
@@ -42,7 +43,8 @@ _GLOBAL(__restore_cpu_power7)
42 li r0,0 43 li r0,0
43 mtspr SPRN_LPID,r0 44 mtspr SPRN_LPID,r0
44 mfspr r3,SPRN_LPCR 45 mfspr r3,SPRN_LPCR
45 bl __init_LPCR 46 li r4,(LPCR_LPES1 >> LPCR_LPES_SH)
47 bl __init_LPCR_ISA206
46 bl __init_tlb_power7 48 bl __init_tlb_power7
47 mtlr r11 49 mtlr r11
48 blr 50 blr
@@ -59,7 +61,8 @@ _GLOBAL(__setup_cpu_power8)
59 mtspr SPRN_LPID,r0 61 mtspr SPRN_LPID,r0
60 mfspr r3,SPRN_LPCR 62 mfspr r3,SPRN_LPCR
61 ori r3, r3, LPCR_PECEDH 63 ori r3, r3, LPCR_PECEDH
62 bl __init_LPCR 64 li r4,0 /* LPES = 0 */
65 bl __init_LPCR_ISA206
63 bl __init_HFSCR 66 bl __init_HFSCR
64 bl __init_tlb_power8 67 bl __init_tlb_power8
65 bl __init_PMU_HV 68 bl __init_PMU_HV
@@ -80,7 +83,8 @@ _GLOBAL(__restore_cpu_power8)
80 mtspr SPRN_LPID,r0 83 mtspr SPRN_LPID,r0
81 mfspr r3,SPRN_LPCR 84 mfspr r3,SPRN_LPCR
82 ori r3, r3, LPCR_PECEDH 85 ori r3, r3, LPCR_PECEDH
83 bl __init_LPCR 86 li r4,0 /* LPES = 0 */
87 bl __init_LPCR_ISA206
84 bl __init_HFSCR 88 bl __init_HFSCR
85 bl __init_tlb_power8 89 bl __init_tlb_power8
86 bl __init_PMU_HV 90 bl __init_PMU_HV
@@ -99,11 +103,12 @@ _GLOBAL(__setup_cpu_power9)
99 mtspr SPRN_PSSCR,r0 103 mtspr SPRN_PSSCR,r0
100 mtspr SPRN_LPID,r0 104 mtspr SPRN_LPID,r0
101 mfspr r3,SPRN_LPCR 105 mfspr r3,SPRN_LPCR
102 LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE) 106 LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE | LPCR_HEIC)
103 or r3, r3, r4 107 or r3, r3, r4
104 LOAD_REG_IMMEDIATE(r4, LPCR_UPRT | LPCR_HR) 108 LOAD_REG_IMMEDIATE(r4, LPCR_UPRT | LPCR_HR)
105 andc r3, r3, r4 109 andc r3, r3, r4
106 bl __init_LPCR 110 li r4,0 /* LPES = 0 */
111 bl __init_LPCR_ISA300
107 bl __init_HFSCR 112 bl __init_HFSCR
108 bl __init_tlb_power9 113 bl __init_tlb_power9
109 bl __init_PMU_HV 114 bl __init_PMU_HV
@@ -122,11 +127,12 @@ _GLOBAL(__restore_cpu_power9)
122 mtspr SPRN_PSSCR,r0 127 mtspr SPRN_PSSCR,r0
123 mtspr SPRN_LPID,r0 128 mtspr SPRN_LPID,r0
124 mfspr r3,SPRN_LPCR 129 mfspr r3,SPRN_LPCR
125 LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE) 130 LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE | LPCR_HEIC)
126 or r3, r3, r4 131 or r3, r3, r4
127 LOAD_REG_IMMEDIATE(r4, LPCR_UPRT | LPCR_HR) 132 LOAD_REG_IMMEDIATE(r4, LPCR_UPRT | LPCR_HR)
128 andc r3, r3, r4 133 andc r3, r3, r4
129 bl __init_LPCR 134 li r4,0 /* LPES = 0 */
135 bl __init_LPCR_ISA300
130 bl __init_HFSCR 136 bl __init_HFSCR
131 bl __init_tlb_power9 137 bl __init_tlb_power9
132 bl __init_PMU_HV 138 bl __init_PMU_HV
@@ -144,9 +150,9 @@ __init_hvmode_206:
144 std r5,CPU_SPEC_FEATURES(r4) 150 std r5,CPU_SPEC_FEATURES(r4)
145 blr 151 blr
146 152
147__init_LPCR: 153__init_LPCR_ISA206:
148 /* Setup a sane LPCR: 154 /* Setup a sane LPCR:
149 * Called with initial LPCR in R3 155 * Called with initial LPCR in R3 and desired LPES 2-bit value in R4
150 * 156 *
151 * LPES = 0b01 (HSRR0/1 used for 0x500) 157 * LPES = 0b01 (HSRR0/1 used for 0x500)
152 * PECE = 0b111 158 * PECE = 0b111
@@ -157,16 +163,18 @@ __init_LPCR:
157 * 163 *
158 * Other bits untouched for now 164 * Other bits untouched for now
159 */ 165 */
160 li r5,1 166 li r5,0x10
161 rldimi r3,r5, LPCR_LPES_SH, 64-LPCR_LPES_SH-2 167 rldimi r3,r5, LPCR_VRMASD_SH, 64-LPCR_VRMASD_SH-5
168
169 /* POWER9 has no VRMASD */
170__init_LPCR_ISA300:
171 rldimi r3,r4, LPCR_LPES_SH, 64-LPCR_LPES_SH-2
162 ori r3,r3,(LPCR_PECE0|LPCR_PECE1|LPCR_PECE2) 172 ori r3,r3,(LPCR_PECE0|LPCR_PECE1|LPCR_PECE2)
163 li r5,4 173 li r5,4
164 rldimi r3,r5, LPCR_DPFD_SH, 64-LPCR_DPFD_SH-3 174 rldimi r3,r5, LPCR_DPFD_SH, 64-LPCR_DPFD_SH-3
165 clrrdi r3,r3,1 /* clear HDICE */ 175 clrrdi r3,r3,1 /* clear HDICE */
166 li r5,4 176 li r5,4
167 rldimi r3,r5, LPCR_VC_SH, 0 177 rldimi r3,r5, LPCR_VC_SH, 0
168 li r5,0x10
169 rldimi r3,r5, LPCR_VRMASD_SH, 64-LPCR_VRMASD_SH-5
170 mtspr SPRN_LPCR,r3 178 mtspr SPRN_LPCR,r3
171 isync 179 isync
172 blr 180 blr
diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c
index 2128f3a96c32..b6fe883b1016 100644
--- a/arch/powerpc/kernel/dbell.c
+++ b/arch/powerpc/kernel/dbell.c
@@ -20,18 +20,60 @@
20#include <asm/kvm_ppc.h> 20#include <asm/kvm_ppc.h>
21 21
22#ifdef CONFIG_SMP 22#ifdef CONFIG_SMP
23void doorbell_setup_this_cpu(void) 23
24/*
25 * Doorbells must only be used if CPU_FTR_DBELL is available.
26 * msgsnd is used in HV, and msgsndp is used in !HV.
27 *
28 * These should be used by platform code that is aware of restrictions.
29 * Other arch code should use ->cause_ipi.
30 *
31 * doorbell_global_ipi() sends a dbell to any target CPU.
32 * Must be used only by architectures that address msgsnd target
33 * by PIR/get_hard_smp_processor_id.
34 */
35void doorbell_global_ipi(int cpu)
24{ 36{
25 unsigned long tag = mfspr(SPRN_DOORBELL_CPUTAG) & PPC_DBELL_TAG_MASK; 37 u32 tag = get_hard_smp_processor_id(cpu);
26 38
27 smp_muxed_ipi_set_data(smp_processor_id(), tag); 39 kvmppc_set_host_ipi(cpu, 1);
40 /* Order previous accesses vs. msgsnd, which is treated as a store */
41 ppc_msgsnd_sync();
42 ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag);
28} 43}
29 44
30void doorbell_cause_ipi(int cpu, unsigned long data) 45/*
46 * doorbell_core_ipi() sends a dbell to a target CPU in the same core.
47 * Must be used only by architectures that address msgsnd target
48 * by TIR/cpu_thread_in_core.
49 */
50void doorbell_core_ipi(int cpu)
31{ 51{
52 u32 tag = cpu_thread_in_core(cpu);
53
54 kvmppc_set_host_ipi(cpu, 1);
32 /* Order previous accesses vs. msgsnd, which is treated as a store */ 55 /* Order previous accesses vs. msgsnd, which is treated as a store */
33 mb(); 56 ppc_msgsnd_sync();
34 ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, data); 57 ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag);
58}
59
60/*
61 * Attempt to cause a core doorbell if destination is on the same core.
62 * Returns 1 on success, 0 on failure.
63 */
64int doorbell_try_core_ipi(int cpu)
65{
66 int this_cpu = get_cpu();
67 int ret = 0;
68
69 if (cpumask_test_cpu(cpu, cpu_sibling_mask(this_cpu))) {
70 doorbell_core_ipi(cpu);
71 ret = 1;
72 }
73
74 put_cpu();
75
76 return ret;
35} 77}
36 78
37void doorbell_exception(struct pt_regs *regs) 79void doorbell_exception(struct pt_regs *regs)
@@ -40,12 +82,14 @@ void doorbell_exception(struct pt_regs *regs)
40 82
41 irq_enter(); 83 irq_enter();
42 84
85 ppc_msgsync();
86
43 may_hard_irq_enable(); 87 may_hard_irq_enable();
44 88
45 kvmppc_set_host_ipi(smp_processor_id(), 0); 89 kvmppc_set_host_ipi(smp_processor_id(), 0);
46 __this_cpu_inc(irq_stat.doorbell_irqs); 90 __this_cpu_inc(irq_stat.doorbell_irqs);
47 91
48 smp_ipi_demux(); 92 smp_ipi_demux_relaxed(); /* already performed the barrier */
49 93
50 irq_exit(); 94 irq_exit();
51 set_irq_regs(old_regs); 95 set_irq_regs(old_regs);
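Following the comment above, platform code that knows its doorbell addressing can prefer a core doorbell and fall back to its interrupt controller. A hedged sketch of a ->cause_ipi hook along those lines; the fallback function is hypothetical:

/* Sketch: try a msgsndp doorbell to a sibling thread first, otherwise
 * fall back to the platform interrupt controller IPI.
 */
static void example_cause_ipi(int cpu)
{
	if (cpu_has_feature(CPU_FTR_DBELL) && doorbell_try_core_ipi(cpu))
		return;

	my_ic_cause_ipi(cpu);	/* hypothetical controller-specific IPI */
}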
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 9de7f79e702b..63992b2d8e15 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -22,7 +22,6 @@
22 */ 22 */
23 23
24#include <linux/delay.h> 24#include <linux/delay.h>
25#include <linux/debugfs.h>
26#include <linux/sched.h> 25#include <linux/sched.h>
27#include <linux/init.h> 26#include <linux/init.h>
28#include <linux/list.h> 27#include <linux/list.h>
@@ -37,7 +36,7 @@
37#include <linux/of.h> 36#include <linux/of.h>
38 37
39#include <linux/atomic.h> 38#include <linux/atomic.h>
40#include <asm/debug.h> 39#include <asm/debugfs.h>
41#include <asm/eeh.h> 40#include <asm/eeh.h>
42#include <asm/eeh_event.h> 41#include <asm/eeh_event.h>
43#include <asm/io.h> 42#include <asm/io.h>
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index b94887165a10..c405c79e50cd 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -724,7 +724,16 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
724 */ 724 */
725#define MAX_WAIT_FOR_RECOVERY 300 725#define MAX_WAIT_FOR_RECOVERY 300
726 726
727static void eeh_handle_normal_event(struct eeh_pe *pe) 727/**
728 * eeh_handle_normal_event - Handle EEH events on a specific PE
729 * @pe: EEH PE
730 *
731 * Attempts to recover the given PE. If recovery fails or the PE has failed
732 * too many times, remove the PE.
733 *
734 * Returns true if @pe should no longer be used, else false.
735 */
736static bool eeh_handle_normal_event(struct eeh_pe *pe)
728{ 737{
729 struct pci_bus *frozen_bus; 738 struct pci_bus *frozen_bus;
730 struct eeh_dev *edev, *tmp; 739 struct eeh_dev *edev, *tmp;
@@ -736,13 +745,18 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
736 if (!frozen_bus) { 745 if (!frozen_bus) {
737 pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n", 746 pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n",
738 __func__, pe->phb->global_number, pe->addr); 747 __func__, pe->phb->global_number, pe->addr);
739 return; 748 return false;
740 } 749 }
741 750
742 eeh_pe_update_time_stamp(pe); 751 eeh_pe_update_time_stamp(pe);
743 pe->freeze_count++; 752 pe->freeze_count++;
744 if (pe->freeze_count > eeh_max_freezes) 753 if (pe->freeze_count > eeh_max_freezes) {
745 goto excess_failures; 754 pr_err("EEH: PHB#%x-PE#%x has failed %d times in the\n"
755 "last hour and has been permanently disabled.\n",
756 pe->phb->global_number, pe->addr,
757 pe->freeze_count);
758 goto hard_fail;
759 }
746 pr_warn("EEH: This PCI device has failed %d times in the last hour\n", 760 pr_warn("EEH: This PCI device has failed %d times in the last hour\n",
747 pe->freeze_count); 761 pe->freeze_count);
748 762
@@ -870,27 +884,18 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
870 pr_info("EEH: Notify device driver to resume\n"); 884 pr_info("EEH: Notify device driver to resume\n");
871 eeh_pe_dev_traverse(pe, eeh_report_resume, NULL); 885 eeh_pe_dev_traverse(pe, eeh_report_resume, NULL);
872 886
873 return; 887 return false;
874 888
875excess_failures: 889hard_fail:
876 /* 890 /*
877 * About 90% of all real-life EEH failures in the field 891 * About 90% of all real-life EEH failures in the field
878 * are due to poorly seated PCI cards. Only 10% or so are 892 * are due to poorly seated PCI cards. Only 10% or so are
879 * due to actual, failed cards. 893 * due to actual, failed cards.
880 */ 894 */
881 pr_err("EEH: PHB#%x-PE#%x has failed %d times in the\n"
882 "last hour and has been permanently disabled.\n"
883 "Please try reseating or replacing it.\n",
884 pe->phb->global_number, pe->addr,
885 pe->freeze_count);
886 goto perm_error;
887
888hard_fail:
889 pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n" 895 pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n"
890 "Please try reseating or replacing it\n", 896 "Please try reseating or replacing it\n",
891 pe->phb->global_number, pe->addr); 897 pe->phb->global_number, pe->addr);
892 898
893perm_error:
894 eeh_slot_error_detail(pe, EEH_LOG_PERM); 899 eeh_slot_error_detail(pe, EEH_LOG_PERM);
895 900
896 /* Notify all devices that they're about to go down. */ 901 /* Notify all devices that they're about to go down. */
@@ -915,10 +920,21 @@ perm_error:
915 pci_lock_rescan_remove(); 920 pci_lock_rescan_remove();
916 pci_hp_remove_devices(frozen_bus); 921 pci_hp_remove_devices(frozen_bus);
917 pci_unlock_rescan_remove(); 922 pci_unlock_rescan_remove();
923
924 /* The passed PE should no longer be used */
925 return true;
918 } 926 }
919 } 927 }
928 return false;
920} 929}
921 930
931/**
932 * eeh_handle_special_event - Handle EEH events without a specific failing PE
933 *
934 * Called when an EEH event is detected but can't be narrowed down to a
935 * specific PE. Iterates through possible failures and handles them as
936 * necessary.
937 */
922static void eeh_handle_special_event(void) 938static void eeh_handle_special_event(void)
923{ 939{
924 struct eeh_pe *pe, *phb_pe; 940 struct eeh_pe *pe, *phb_pe;
@@ -982,7 +998,14 @@ static void eeh_handle_special_event(void)
982 */ 998 */
983 if (rc == EEH_NEXT_ERR_FROZEN_PE || 999 if (rc == EEH_NEXT_ERR_FROZEN_PE ||
984 rc == EEH_NEXT_ERR_FENCED_PHB) { 1000 rc == EEH_NEXT_ERR_FENCED_PHB) {
985 eeh_handle_normal_event(pe); 1001 /*
1002 * eeh_handle_normal_event() can make the PE stale if it
1003 * determines that the PE cannot possibly be recovered.
1004 * Don't modify the PE state if that's the case.
1005 */
1006 if (eeh_handle_normal_event(pe))
1007 continue;
1008
986 eeh_pe_state_clear(pe, EEH_PE_RECOVERING); 1009 eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
987 } else { 1010 } else {
988 pci_lock_rescan_remove(); 1011 pci_lock_rescan_remove();
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index a38600949f3a..8587059ad848 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -31,7 +31,6 @@
31#include <asm/ppc_asm.h> 31#include <asm/ppc_asm.h>
32#include <asm/asm-offsets.h> 32#include <asm/asm-offsets.h>
33#include <asm/unistd.h> 33#include <asm/unistd.h>
34#include <asm/ftrace.h>
35#include <asm/ptrace.h> 34#include <asm/ptrace.h>
36#include <asm/export.h> 35#include <asm/export.h>
37 36
@@ -1315,109 +1314,3 @@ machine_check_in_rtas:
1315 /* XXX load up BATs and panic */ 1314 /* XXX load up BATs and panic */
1316 1315
1317#endif /* CONFIG_PPC_RTAS */ 1316#endif /* CONFIG_PPC_RTAS */
1318
1319#ifdef CONFIG_FUNCTION_TRACER
1320#ifdef CONFIG_DYNAMIC_FTRACE
1321_GLOBAL(mcount)
1322_GLOBAL(_mcount)
1323 /*
1324 * It is required that _mcount on PPC32 must preserve the
1325 * link register. But we have r0 to play with. We use r0
1326 * to push the return address back to the caller of mcount
1327 * into the ctr register, restore the link register and
1328 * then jump back using the ctr register.
1329 */
1330 mflr r0
1331 mtctr r0
1332 lwz r0, 4(r1)
1333 mtlr r0
1334 bctr
1335
1336_GLOBAL(ftrace_caller)
1337 MCOUNT_SAVE_FRAME
1338 /* r3 ends up with link register */
1339 subi r3, r3, MCOUNT_INSN_SIZE
1340.globl ftrace_call
1341ftrace_call:
1342 bl ftrace_stub
1343 nop
1344#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1345.globl ftrace_graph_call
1346ftrace_graph_call:
1347 b ftrace_graph_stub
1348_GLOBAL(ftrace_graph_stub)
1349#endif
1350 MCOUNT_RESTORE_FRAME
1351 /* old link register ends up in ctr reg */
1352 bctr
1353#else
1354_GLOBAL(mcount)
1355_GLOBAL(_mcount)
1356
1357 MCOUNT_SAVE_FRAME
1358
1359 subi r3, r3, MCOUNT_INSN_SIZE
1360 LOAD_REG_ADDR(r5, ftrace_trace_function)
1361 lwz r5,0(r5)
1362
1363 mtctr r5
1364 bctrl
1365 nop
1366
1367#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1368 b ftrace_graph_caller
1369#endif
1370 MCOUNT_RESTORE_FRAME
1371 bctr
1372#endif
1373EXPORT_SYMBOL(_mcount)
1374
1375_GLOBAL(ftrace_stub)
1376 blr
1377
1378#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1379_GLOBAL(ftrace_graph_caller)
1380 /* load r4 with local address */
1381 lwz r4, 44(r1)
1382 subi r4, r4, MCOUNT_INSN_SIZE
1383
1384 /* Grab the LR out of the caller stack frame */
1385 lwz r3,52(r1)
1386
1387 bl prepare_ftrace_return
1388 nop
1389
1390 /*
1391 * prepare_ftrace_return gives us the address we divert to.
1392 * Change the LR in the callers stack frame to this.
1393 */
1394 stw r3,52(r1)
1395
1396 MCOUNT_RESTORE_FRAME
1397 /* old link register ends up in ctr reg */
1398 bctr
1399
1400_GLOBAL(return_to_handler)
1401 /* need to save return values */
1402 stwu r1, -32(r1)
1403 stw r3, 20(r1)
1404 stw r4, 16(r1)
1405 stw r31, 12(r1)
1406 mr r31, r1
1407
1408 bl ftrace_return_to_handler
1409 nop
1410
1411 /* return value has real return address */
1412 mtlr r3
1413
1414 lwz r3, 20(r1)
1415 lwz r4, 16(r1)
1416 lwz r31,12(r1)
1417 lwz r1, 0(r1)
1418
1419 /* Jump back to real return address */
1420 blr
1421#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
1422
1423#endif /* CONFIG_FUNCTION_TRACER */
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 767ef6d68c9e..bfbad08a1207 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -20,7 +20,6 @@
20 20
21#include <linux/errno.h> 21#include <linux/errno.h>
22#include <linux/err.h> 22#include <linux/err.h>
23#include <linux/magic.h>
24#include <asm/unistd.h> 23#include <asm/unistd.h>
25#include <asm/processor.h> 24#include <asm/processor.h>
26#include <asm/page.h> 25#include <asm/page.h>
@@ -33,7 +32,6 @@
33#include <asm/bug.h> 32#include <asm/bug.h>
34#include <asm/ptrace.h> 33#include <asm/ptrace.h>
35#include <asm/irqflags.h> 34#include <asm/irqflags.h>
36#include <asm/ftrace.h>
37#include <asm/hw_irq.h> 35#include <asm/hw_irq.h>
38#include <asm/context_tracking.h> 36#include <asm/context_tracking.h>
39#include <asm/tm.h> 37#include <asm/tm.h>
@@ -1173,381 +1171,3 @@ _GLOBAL(enter_prom)
1173 ld r0,16(r1) 1171 ld r0,16(r1)
1174 mtlr r0 1172 mtlr r0
1175 blr 1173 blr
1176
1177#ifdef CONFIG_FUNCTION_TRACER
1178#ifdef CONFIG_DYNAMIC_FTRACE
1179_GLOBAL(mcount)
1180_GLOBAL(_mcount)
1181EXPORT_SYMBOL(_mcount)
1182 mflr r12
1183 mtctr r12
1184 mtlr r0
1185 bctr
1186
1187#ifndef CC_USING_MPROFILE_KERNEL
1188_GLOBAL_TOC(ftrace_caller)
1189 /* Taken from output of objdump from lib64/glibc */
1190 mflr r3
1191 ld r11, 0(r1)
1192 stdu r1, -112(r1)
1193 std r3, 128(r1)
1194 ld r4, 16(r11)
1195 subi r3, r3, MCOUNT_INSN_SIZE
1196.globl ftrace_call
1197ftrace_call:
1198 bl ftrace_stub
1199 nop
1200#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1201.globl ftrace_graph_call
1202ftrace_graph_call:
1203 b ftrace_graph_stub
1204_GLOBAL(ftrace_graph_stub)
1205#endif
1206 ld r0, 128(r1)
1207 mtlr r0
1208 addi r1, r1, 112
1209
1210#else /* CC_USING_MPROFILE_KERNEL */
1211/*
1212 *
1213 * ftrace_caller() is the function that replaces _mcount() when ftrace is
1214 * active.
1215 *
1216 * We arrive here after a function A calls function B, and we are the trace
1217 * function for B. When we enter r1 points to A's stack frame, B has not yet
1218 * had a chance to allocate one yet.
1219 *
1220 * Additionally r2 may point either to the TOC for A, or B, depending on
1221 * whether B did a TOC setup sequence before calling us.
1222 *
1223 * On entry the LR points back to the _mcount() call site, and r0 holds the
1224 * saved LR as it was on entry to B, ie. the original return address at the
1225 * call site in A.
1226 *
1227 * Our job is to save the register state into a struct pt_regs (on the stack)
1228 * and then arrange for the ftrace function to be called.
1229 */
1230_GLOBAL(ftrace_caller)
1231 /* Save the original return address in A's stack frame */
1232 std r0,LRSAVE(r1)
1233
1234 /* Create our stack frame + pt_regs */
1235 stdu r1,-SWITCH_FRAME_SIZE(r1)
1236
1237 /* Save all gprs to pt_regs */
1238 SAVE_8GPRS(0,r1)
1239 SAVE_8GPRS(8,r1)
1240 SAVE_8GPRS(16,r1)
1241 SAVE_8GPRS(24,r1)
1242
1243 /* Load special regs for save below */
1244 mfmsr r8
1245 mfctr r9
1246 mfxer r10
1247 mfcr r11
1248
1249 /* Get the _mcount() call site out of LR */
1250 mflr r7
1251 /* Save it as pt_regs->nip & pt_regs->link */
1252 std r7, _NIP(r1)
1253 std r7, _LINK(r1)
1254
1255 /* Save callee's TOC in the ABI compliant location */
1256 std r2, 24(r1)
1257 ld r2,PACATOC(r13) /* get kernel TOC in r2 */
1258
1259 addis r3,r2,function_trace_op@toc@ha
1260 addi r3,r3,function_trace_op@toc@l
1261 ld r5,0(r3)
1262
1263#ifdef CONFIG_LIVEPATCH
1264 mr r14,r7 /* remember old NIP */
1265#endif
1266 /* Calculate ip from nip-4 into r3 for call below */
1267 subi r3, r7, MCOUNT_INSN_SIZE
1268
1269 /* Put the original return address in r4 as parent_ip */
1270 mr r4, r0
1271
1272 /* Save special regs */
1273 std r8, _MSR(r1)
1274 std r9, _CTR(r1)
1275 std r10, _XER(r1)
1276 std r11, _CCR(r1)
1277
1278 /* Load &pt_regs in r6 for call below */
1279 addi r6, r1 ,STACK_FRAME_OVERHEAD
1280
1281 /* ftrace_call(r3, r4, r5, r6) */
1282.globl ftrace_call
1283ftrace_call:
1284 bl ftrace_stub
1285 nop
1286
1287 /* Load ctr with the possibly modified NIP */
1288 ld r3, _NIP(r1)
1289 mtctr r3
1290#ifdef CONFIG_LIVEPATCH
1291 cmpd r14,r3 /* has NIP been altered? */
1292#endif
1293
1294 /* Restore gprs */
1295 REST_8GPRS(0,r1)
1296 REST_8GPRS(8,r1)
1297 REST_8GPRS(16,r1)
1298 REST_8GPRS(24,r1)
1299
1300 /* Restore callee's TOC */
1301 ld r2, 24(r1)
1302
1303 /* Pop our stack frame */
1304 addi r1, r1, SWITCH_FRAME_SIZE
1305
1306 /* Restore original LR for return to B */
1307 ld r0, LRSAVE(r1)
1308 mtlr r0
1309
1310#ifdef CONFIG_LIVEPATCH
1311 /* Based on the cmpd above, if the NIP was altered handle livepatch */
1312 bne- livepatch_handler
1313#endif
1314
1315#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1316 stdu r1, -112(r1)
1317.globl ftrace_graph_call
1318ftrace_graph_call:
1319 b ftrace_graph_stub
1320_GLOBAL(ftrace_graph_stub)
1321 addi r1, r1, 112
1322#endif
1323
1324 ld r0,LRSAVE(r1) /* restore callee's lr at _mcount site */
1325 mtlr r0
1326 bctr /* jump after _mcount site */
1327#endif /* CC_USING_MPROFILE_KERNEL */
1328
1329_GLOBAL(ftrace_stub)
1330 blr
1331
1332#ifdef CONFIG_LIVEPATCH
1333 /*
1334 * This function runs in the mcount context, between two functions. As
1335 * such it can only clobber registers which are volatile and used in
1336 * function linkage.
1337 *
1338 * We get here when a function A, calls another function B, but B has
1339 * been live patched with a new function C.
1340 *
1341 * On entry:
1342 * - we have no stack frame and can not allocate one
1343 * - LR points back to the original caller (in A)
1344 * - CTR holds the new NIP in C
1345 * - r0 & r12 are free
1346 *
1347 * r0 can't be used as the base register for a DS-form load or store, so
1348 * we temporarily shuffle r1 (stack pointer) into r0 and then put it back.
1349 */
1350livepatch_handler:
1351 CURRENT_THREAD_INFO(r12, r1)
1352
1353 /* Save stack pointer into r0 */
1354 mr r0, r1
1355
1356 /* Allocate 3 x 8 bytes */
1357 ld r1, TI_livepatch_sp(r12)
1358 addi r1, r1, 24
1359 std r1, TI_livepatch_sp(r12)
1360
1361 /* Save toc & real LR on livepatch stack */
1362 std r2, -24(r1)
1363 mflr r12
1364 std r12, -16(r1)
1365
1366 /* Store stack end marker */
1367 lis r12, STACK_END_MAGIC@h
1368 ori r12, r12, STACK_END_MAGIC@l
1369 std r12, -8(r1)
1370
1371 /* Restore real stack pointer */
1372 mr r1, r0
1373
1374 /* Put ctr in r12 for global entry and branch there */
1375 mfctr r12
1376 bctrl
1377
1378 /*
1379 * Now we are returning from the patched function to the original
1380 * caller A. We are free to use r0 and r12, and we can use r2 until we
1381 * restore it.
1382 */
1383
1384 CURRENT_THREAD_INFO(r12, r1)
1385
1386 /* Save stack pointer into r0 */
1387 mr r0, r1
1388
1389 ld r1, TI_livepatch_sp(r12)
1390
1391 /* Check stack marker hasn't been trashed */
1392 lis r2, STACK_END_MAGIC@h
1393 ori r2, r2, STACK_END_MAGIC@l
1394 ld r12, -8(r1)
13951: tdne r12, r2
1396 EMIT_BUG_ENTRY 1b, __FILE__, __LINE__ - 1, 0
1397
1398 /* Restore LR & toc from livepatch stack */
1399 ld r12, -16(r1)
1400 mtlr r12
1401 ld r2, -24(r1)
1402
1403 /* Pop livepatch stack frame */
1404 CURRENT_THREAD_INFO(r12, r0)
1405 subi r1, r1, 24
1406 std r1, TI_livepatch_sp(r12)
1407
1408 /* Restore real stack pointer */
1409 mr r1, r0
1410
1411 /* Return to original caller of live patched function */
1412 blr
1413#endif
1414
1415
1416#else
1417_GLOBAL_TOC(_mcount)
1418EXPORT_SYMBOL(_mcount)
1419 /* Taken from output of objdump from lib64/glibc */
1420 mflr r3
1421 ld r11, 0(r1)
1422 stdu r1, -112(r1)
1423 std r3, 128(r1)
1424 ld r4, 16(r11)
1425
1426 subi r3, r3, MCOUNT_INSN_SIZE
1427 LOAD_REG_ADDR(r5,ftrace_trace_function)
1428 ld r5,0(r5)
1429 ld r5,0(r5)
1430 mtctr r5
1431 bctrl
1432 nop
1433
1434
1435#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1436 b ftrace_graph_caller
1437#endif
1438 ld r0, 128(r1)
1439 mtlr r0
1440 addi r1, r1, 112
1441_GLOBAL(ftrace_stub)
1442 blr
1443
1444#endif /* CONFIG_DYNAMIC_FTRACE */
1445
1446#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1447#ifndef CC_USING_MPROFILE_KERNEL
1448_GLOBAL(ftrace_graph_caller)
1449 /* load r4 with local address */
1450 ld r4, 128(r1)
1451 subi r4, r4, MCOUNT_INSN_SIZE
1452
1453 /* Grab the LR out of the caller stack frame */
1454 ld r11, 112(r1)
1455 ld r3, 16(r11)
1456
1457 bl prepare_ftrace_return
1458 nop
1459
1460 /*
1461 * prepare_ftrace_return gives us the address we divert to.
1462 * Change the LR in the callers stack frame to this.
1463 */
1464 ld r11, 112(r1)
1465 std r3, 16(r11)
1466
1467 ld r0, 128(r1)
1468 mtlr r0
1469 addi r1, r1, 112
1470 blr
1471
1472#else /* CC_USING_MPROFILE_KERNEL */
1473_GLOBAL(ftrace_graph_caller)
1474 /* with -mprofile-kernel, parameter regs are still alive at _mcount */
1475 std r10, 104(r1)
1476 std r9, 96(r1)
1477 std r8, 88(r1)
1478 std r7, 80(r1)
1479 std r6, 72(r1)
1480 std r5, 64(r1)
1481 std r4, 56(r1)
1482 std r3, 48(r1)
1483
1484 /* Save callee's TOC in the ABI compliant location */
1485 std r2, 24(r1)
1486 ld r2, PACATOC(r13) /* get kernel TOC in r2 */
1487
1488 mfctr r4 /* ftrace_caller has moved local addr here */
1489 std r4, 40(r1)
1490 mflr r3 /* ftrace_caller has restored LR from stack */
1491 subi r4, r4, MCOUNT_INSN_SIZE
1492
1493 bl prepare_ftrace_return
1494 nop
1495
1496 /*
1497 * prepare_ftrace_return gives us the address we divert to.
1498 * Change the LR to this.
1499 */
1500 mtlr r3
1501
1502 ld r0, 40(r1)
1503 mtctr r0
1504 ld r10, 104(r1)
1505 ld r9, 96(r1)
1506 ld r8, 88(r1)
1507 ld r7, 80(r1)
1508 ld r6, 72(r1)
1509 ld r5, 64(r1)
1510 ld r4, 56(r1)
1511 ld r3, 48(r1)
1512
1513 /* Restore callee's TOC */
1514 ld r2, 24(r1)
1515
1516 addi r1, r1, 112
1517 mflr r0
1518 std r0, LRSAVE(r1)
1519 bctr
1520#endif /* CC_USING_MPROFILE_KERNEL */
1521
1522_GLOBAL(return_to_handler)
1523 /* need to save return values */
1524 std r4, -32(r1)
1525 std r3, -24(r1)
1526 /* save TOC */
1527 std r2, -16(r1)
1528 std r31, -8(r1)
1529 mr r31, r1
1530 stdu r1, -112(r1)
1531
1532 /*
1533 * We might be called from a module.
1534 * Switch to our TOC to run inside the core kernel.
1535 */
1536 ld r2, PACATOC(r13)
1537
1538 bl ftrace_return_to_handler
1539 nop
1540
1541 /* return value has real return address */
1542 mtlr r3
1543
1544 ld r1, 0(r1)
1545 ld r4, -32(r1)
1546 ld r3, -24(r1)
1547 ld r2, -16(r1)
1548 ld r31, -8(r1)
1549
1550 /* Jump back to real return address */
1551 blr
1552#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
1553#endif /* CONFIG_FUNCTION_TRACER */
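The livepatch_handler comment block above describes a small per-thread scratch stack (TI_livepatch_sp) that holds the caller's TOC, the real return address and a STACK_END_MAGIC marker while the replacement function runs. Below is a rough C rendering of that bookkeeping, assuming livepatch_sp behaves like a simple upward-growing byte cursor in thread_info; the struct and helper names are invented for illustration, and the real logic is the frameless assembly shown above.

/* Illustrative only: a simplified thread_info with a byte-addressed cursor. */
struct thread_info_sketch {
	unsigned long livepatch_sp;		/* TI_livepatch_sp in the asm */
};

struct lp_frame {
	unsigned long toc;			/* caller's r2 */
	unsigned long lr;			/* real return address back into A */
	unsigned long magic;			/* STACK_END_MAGIC, detects a trashed entry */
};

/* Push TOC and LR before branching to the replacement function C. */
static void lp_push(struct thread_info_sketch *ti, unsigned long toc, unsigned long lr)
{
	struct lp_frame *f = (struct lp_frame *)ti->livepatch_sp;

	f->toc = toc;
	f->lr = lr;
	f->magic = STACK_END_MAGIC;
	ti->livepatch_sp += sizeof(*f);		/* the "3 x 8 bytes" in the asm */
}

/* Pop on return from C: verify the marker, restore TOC, return the saved LR. */
static unsigned long lp_pop(struct thread_info_sketch *ti, unsigned long *toc)
{
	struct lp_frame *f = (struct lp_frame *)ti->livepatch_sp - 1;

	BUG_ON(f->magic != STACK_END_MAGIC);	/* the tdne + EMIT_BUG_ENTRY check */
	*toc = f->toc;
	ti->livepatch_sp -= sizeof(*f);
	return f->lr;
}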
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 6353019966e6..a9312b52fe6f 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -116,9 +116,11 @@ EXC_VIRT_NONE(0x4000, 0x100)
116 116
117EXC_REAL_BEGIN(system_reset, 0x100, 0x100) 117EXC_REAL_BEGIN(system_reset, 0x100, 0x100)
118 SET_SCRATCH0(r13) 118 SET_SCRATCH0(r13)
119 GET_PACA(r13) 119 /*
 120 clrrdi r13,r13,1 /* Last bit of HSPRG0 is set if waking from winkle */ 120 * MSR_RI is not enabled, because PACA_EXNMI and the NMI stack are
121 EXCEPTION_PROLOG_PSERIES_PACA(PACA_EXGEN, system_reset_common, EXC_STD, 121 * being used, so a nested NMI exception would corrupt it.
122 */
123 EXCEPTION_PROLOG_PSERIES_NORI(PACA_EXNMI, system_reset_common, EXC_STD,
122 IDLETEST, 0x100) 124 IDLETEST, 0x100)
123 125
124EXC_REAL_END(system_reset, 0x100, 0x100) 126EXC_REAL_END(system_reset, 0x100, 0x100)
@@ -126,34 +128,37 @@ EXC_VIRT_NONE(0x4100, 0x100)
126 128
127#ifdef CONFIG_PPC_P7_NAP 129#ifdef CONFIG_PPC_P7_NAP
128EXC_COMMON_BEGIN(system_reset_idle_common) 130EXC_COMMON_BEGIN(system_reset_idle_common)
129BEGIN_FTR_SECTION 131 b pnv_powersave_wakeup
130 GET_PACA(r13) /* Restore HSPRG0 to get the winkle bit in r13 */ 132#endif
131END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
132 bl pnv_restore_hyp_resource
133 133
134 li r0,PNV_THREAD_RUNNING 134EXC_COMMON_BEGIN(system_reset_common)
135 stb r0,PACA_THREAD_IDLE_STATE(r13) /* Clear thread state */ 135 /*
136 * Increment paca->in_nmi then enable MSR_RI. SLB or MCE will be able
137 * to recover, but nested NMI will notice in_nmi and not recover
138 * because of the use of the NMI stack. in_nmi reentrancy is tested in
139 * system_reset_exception.
140 */
141 lhz r10,PACA_IN_NMI(r13)
142 addi r10,r10,1
143 sth r10,PACA_IN_NMI(r13)
144 li r10,MSR_RI
145 mtmsrd r10,1
136 146
137#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE 147 mr r10,r1
138 li r0,KVM_HWTHREAD_IN_KERNEL 148 ld r1,PACA_NMI_EMERG_SP(r13)
139 stb r0,HSTATE_HWTHREAD_STATE(r13) 149 subi r1,r1,INT_FRAME_SIZE
140 /* Order setting hwthread_state vs. testing hwthread_req */ 150 EXCEPTION_COMMON_NORET_STACK(PACA_EXNMI, 0x100,
141 sync 151 system_reset, system_reset_exception,
142 lbz r0,HSTATE_HWTHREAD_REQ(r13) 152 ADD_NVGPRS;ADD_RECONCILE)
143 cmpwi r0,0
144 beq 1f
145 BRANCH_TO_KVM(r10, kvm_start_guest)
1461:
147#endif
148 153
149 /* Return SRR1 from power7_nap() */ 154 /*
150 mfspr r3,SPRN_SRR1 155 * The stack is no longer in use, decrement in_nmi.
151 blt cr3,2f 156 */
152 b pnv_wakeup_loss 157 lhz r10,PACA_IN_NMI(r13)
1532: b pnv_wakeup_noloss 158 subi r10,r10,1
154#endif 159 sth r10,PACA_IN_NMI(r13)
155 160
156EXC_COMMON(system_reset_common, 0x100, system_reset_exception) 161 b ret_from_except
157 162
158#ifdef CONFIG_PPC_PSERIES 163#ifdef CONFIG_PPC_PSERIES
159/* 164/*
@@ -161,8 +166,9 @@ EXC_COMMON(system_reset_common, 0x100, system_reset_exception)
161 */ 166 */
162TRAMP_REAL_BEGIN(system_reset_fwnmi) 167TRAMP_REAL_BEGIN(system_reset_fwnmi)
163 SET_SCRATCH0(r13) /* save r13 */ 168 SET_SCRATCH0(r13) /* save r13 */
164 EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD, 169 /* See comment at system_reset exception */
165 NOTEST, 0x100) 170 EXCEPTION_PROLOG_PSERIES_NORI(PACA_EXNMI, system_reset_common,
171 EXC_STD, NOTEST, 0x100)
166#endif /* CONFIG_PPC_PSERIES */ 172#endif /* CONFIG_PPC_PSERIES */
167 173
168 174
@@ -172,14 +178,6 @@ EXC_REAL_BEGIN(machine_check, 0x200, 0x100)
172 * vector 178 * vector
173 */ 179 */
174 SET_SCRATCH0(r13) /* save r13 */ 180 SET_SCRATCH0(r13) /* save r13 */
175 /*
176 * Running native on arch 2.06 or later, we may wakeup from winkle
177 * inside machine check. If yes, then last bit of HSPRG0 would be set
178 * to 1. Hence clear it unconditionally.
179 */
180 GET_PACA(r13)
181 clrrdi r13,r13,1
182 SET_PACA(r13)
183 EXCEPTION_PROLOG_0(PACA_EXMC) 181 EXCEPTION_PROLOG_0(PACA_EXMC)
184BEGIN_FTR_SECTION 182BEGIN_FTR_SECTION
185 b machine_check_powernv_early 183 b machine_check_powernv_early
@@ -212,6 +210,12 @@ BEGIN_FTR_SECTION
212 * NOTE: We are here with MSR_ME=0 (off), which means we risk a 210 * NOTE: We are here with MSR_ME=0 (off), which means we risk a
213 * checkstop if we get another machine check exception before we do 211 * checkstop if we get another machine check exception before we do
214 * rfid with MSR_ME=1. 212 * rfid with MSR_ME=1.
213 *
214 * This interrupt can wake directly from idle. If that is the case,
215 * the machine check is handled then the idle wakeup code is called
216 * to restore state. In that case, the POWER9 DD1 idle PACA workaround
217 * is not applied in the early machine check code, which will cause
218 * bugs.
215 */ 219 */
216 mr r11,r1 /* Save r1 */ 220 mr r11,r1 /* Save r1 */
217 lhz r10,PACA_IN_MCE(r13) 221 lhz r10,PACA_IN_MCE(r13)
@@ -268,20 +272,11 @@ machine_check_fwnmi:
268machine_check_pSeries_0: 272machine_check_pSeries_0:
269 EXCEPTION_PROLOG_1(PACA_EXMC, KVMTEST_PR, 0x200) 273 EXCEPTION_PROLOG_1(PACA_EXMC, KVMTEST_PR, 0x200)
270 /* 274 /*
271 * The following is essentially EXCEPTION_PROLOG_PSERIES_1 with the 275 * MSR_RI is not enabled, because PACA_EXMC is being used, so a
272 * difference that MSR_RI is not enabled, because PACA_EXMC is being 276 * nested machine check corrupts it. machine_check_common enables
273 * used, so nested machine check corrupts it. machine_check_common 277 * MSR_RI.
274 * enables MSR_RI.
275 */ 278 */
276 ld r10,PACAKMSR(r13) 279 EXCEPTION_PROLOG_PSERIES_1_NORI(machine_check_common, EXC_STD)
277 xori r10,r10,MSR_RI
278 mfspr r11,SPRN_SRR0
279 LOAD_HANDLER(r12, machine_check_common)
280 mtspr SPRN_SRR0,r12
281 mfspr r12,SPRN_SRR1
282 mtspr SPRN_SRR1,r10
283 rfid
284 b . /* prevent speculative execution */
285 280
286TRAMP_KVM_SKIP(PACA_EXMC, 0x200) 281TRAMP_KVM_SKIP(PACA_EXMC, 0x200)
287 282
@@ -340,6 +335,37 @@ EXC_COMMON_BEGIN(machine_check_common)
340 /* restore original r1. */ \ 335 /* restore original r1. */ \
341 ld r1,GPR1(r1) 336 ld r1,GPR1(r1)
342 337
338#ifdef CONFIG_PPC_P7_NAP
339/*
340 * This is an idle wakeup. Low level machine check has already been
341 * done. Queue the event then call the idle code to do the wake up.
342 */
343EXC_COMMON_BEGIN(machine_check_idle_common)
344 bl machine_check_queue_event
345
346 /*
347 * We have not used any non-volatile GPRs here, and as a rule
348 * most exception code including machine check does not.
349 * Therefore PACA_NAPSTATELOST does not need to be set. Idle
350 * wakeup will restore volatile registers.
351 *
352 * Load the original SRR1 into r3 for pnv_powersave_wakeup_mce.
353 *
354 * Then decrement MCE nesting after finishing with the stack.
355 */
356 ld r3,_MSR(r1)
357
358 lhz r11,PACA_IN_MCE(r13)
359 subi r11,r11,1
360 sth r11,PACA_IN_MCE(r13)
361
362 /* Turn off the RI bit because SRR1 is used by idle wakeup code. */
363 /* Recoverability could be improved by reducing the use of SRR1. */
364 li r11,0
365 mtmsrd r11,1
366
367 b pnv_powersave_wakeup_mce
368#endif
343 /* 369 /*
344 * Handle machine check early in real mode. We come here with 370 * Handle machine check early in real mode. We come here with
345 * ME=1, MMU (IR=0 and DR=0) off and using MC emergency stack. 371 * ME=1, MMU (IR=0 and DR=0) off and using MC emergency stack.
@@ -352,6 +378,7 @@ EXC_COMMON_BEGIN(machine_check_handle_early)
352 bl machine_check_early 378 bl machine_check_early
353 std r3,RESULT(r1) /* Save result */ 379 std r3,RESULT(r1) /* Save result */
354 ld r12,_MSR(r1) 380 ld r12,_MSR(r1)
381
355#ifdef CONFIG_PPC_P7_NAP 382#ifdef CONFIG_PPC_P7_NAP
356 /* 383 /*
357 * Check if thread was in power saving mode. We come here when any 384 * Check if thread was in power saving mode. We come here when any
@@ -362,48 +389,14 @@ EXC_COMMON_BEGIN(machine_check_handle_early)
362 * 389 *
363 * Go back to nap/sleep/winkle mode again if (b) is true. 390 * Go back to nap/sleep/winkle mode again if (b) is true.
364 */ 391 */
365 rlwinm. r11,r12,47-31,30,31 /* Was it in power saving mode? */ 392 BEGIN_FTR_SECTION
366 beq 4f /* No, it wasn;t */ 393 rlwinm. r11,r12,47-31,30,31
367 /* Thread was in power saving mode. Go back to nap again. */ 394 beq- 4f
368 cmpwi r11,2 395 BRANCH_TO_COMMON(r10, machine_check_idle_common)
369 blt 3f
370 /* Supervisor/Hypervisor state loss */
371 li r0,1
372 stb r0,PACA_NAPSTATELOST(r13)
3733: bl machine_check_queue_event
374 MACHINE_CHECK_HANDLER_WINDUP
375 GET_PACA(r13)
376 ld r1,PACAR1(r13)
377 /*
378 * Check what idle state this CPU was in and go back to same mode
379 * again.
380 */
381 lbz r3,PACA_THREAD_IDLE_STATE(r13)
382 cmpwi r3,PNV_THREAD_NAP
383 bgt 10f
384 IDLE_STATE_ENTER_SEQ_NORET(PPC_NAP)
385 /* No return */
38610:
387 cmpwi r3,PNV_THREAD_SLEEP
388 bgt 2f
389 IDLE_STATE_ENTER_SEQ_NORET(PPC_SLEEP)
390 /* No return */
391
3922:
393 /*
394 * Go back to winkle. Please note that this thread was woken up in
395 * machine check from winkle and have not restored the per-subcore
396 * state. Hence before going back to winkle, set last bit of HSPRG0
397 * to 1. This will make sure that if this thread gets woken up
398 * again at reset vector 0x100 then it will get chance to restore
399 * the subcore state.
400 */
401 ori r13,r13,1
402 SET_PACA(r13)
403 IDLE_STATE_ENTER_SEQ_NORET(PPC_WINKLE)
404 /* No return */
4054: 3964:
397 END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
406#endif 398#endif
399
407 /* 400 /*
408 * Check if we are coming from hypervisor userspace. If yes then we 401 * Check if we are coming from hypervisor userspace. If yes then we
409 * continue in host kernel in V mode to deliver the MC event. 402 * continue in host kernel in V mode to deliver the MC event.
@@ -968,17 +961,12 @@ EXC_VIRT_NONE(0x4e60, 0x20)
968TRAMP_KVM_HV(PACA_EXGEN, 0xe60) 961TRAMP_KVM_HV(PACA_EXGEN, 0xe60)
969TRAMP_REAL_BEGIN(hmi_exception_early) 962TRAMP_REAL_BEGIN(hmi_exception_early)
970 EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_HV, 0xe60) 963 EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_HV, 0xe60)
971 mr r10,r1 /* Save r1 */ 964 mr r10,r1 /* Save r1 */
972 ld r1,PACAEMERGSP(r13) /* Use emergency stack */ 965 ld r1,PACAEMERGSP(r13) /* Use emergency stack for realmode */
973 subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ 966 subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */
974 std r9,_CCR(r1) /* save CR in stackframe */
975 mfspr r11,SPRN_HSRR0 /* Save HSRR0 */ 967 mfspr r11,SPRN_HSRR0 /* Save HSRR0 */
976 std r11,_NIP(r1) /* save HSRR0 in stackframe */ 968 mfspr r12,SPRN_HSRR1 /* Save HSRR1 */
977 mfspr r12,SPRN_HSRR1 /* Save SRR1 */ 969 EXCEPTION_PROLOG_COMMON_1()
978 std r12,_MSR(r1) /* save SRR1 in stackframe */
979 std r10,0(r1) /* make stack chain pointer */
980 std r0,GPR0(r1) /* save r0 in stackframe */
981 std r10,GPR1(r1) /* save r1 in stackframe */
982 EXCEPTION_PROLOG_COMMON_2(PACA_EXGEN) 970 EXCEPTION_PROLOG_COMMON_2(PACA_EXGEN)
983 EXCEPTION_PROLOG_COMMON_3(0xe60) 971 EXCEPTION_PROLOG_COMMON_3(0xe60)
984 addi r3,r1,STACK_FRAME_OVERHEAD 972 addi r3,r1,STACK_FRAME_OVERHEAD
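The system reset prologue above now runs on a dedicated NMI emergency stack and tracks nesting in the PACA, so a system reset that arrives while one is already being handled can be detected instead of silently trampling that stack. A conceptual C view of the counter, with field names inferred from the asm offsets (PACA_IN_NMI, PACA_NMI_EMERG_SP); the real accounting has to run in assembly before any C code does.

static void nmi_stack_enter(struct paca_struct *paca)
{
	paca->in_nmi++;		/* system_reset_exception() tests this for re-entry */
	/* ...switch r1 to the dedicated NMI emergency stack (PACA_NMI_EMERG_SP)... */
}

static void nmi_stack_exit(struct paca_struct *paca)
{
	/* The NMI stack is no longer in use, allow another NMI to take it. */
	paca->in_nmi--;
}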
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 8ff0dd4e77a7..243dbef7e926 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -30,17 +30,16 @@
30#include <linux/string.h> 30#include <linux/string.h>
31#include <linux/memblock.h> 31#include <linux/memblock.h>
32#include <linux/delay.h> 32#include <linux/delay.h>
33#include <linux/debugfs.h>
34#include <linux/seq_file.h> 33#include <linux/seq_file.h>
35#include <linux/crash_dump.h> 34#include <linux/crash_dump.h>
36#include <linux/kobject.h> 35#include <linux/kobject.h>
37#include <linux/sysfs.h> 36#include <linux/sysfs.h>
38 37
38#include <asm/debugfs.h>
39#include <asm/page.h> 39#include <asm/page.h>
40#include <asm/prom.h> 40#include <asm/prom.h>
41#include <asm/rtas.h> 41#include <asm/rtas.h>
42#include <asm/fadump.h> 42#include <asm/fadump.h>
43#include <asm/debug.h>
44#include <asm/setup.h> 43#include <asm/setup.h>
45 44
46static struct fw_dump fw_dump; 45static struct fw_dump fw_dump;
@@ -319,15 +318,34 @@ int __init fadump_reserve_mem(void)
319 pr_debug("fadumphdr_addr = %p\n", 318 pr_debug("fadumphdr_addr = %p\n",
320 (void *) fw_dump.fadumphdr_addr); 319 (void *) fw_dump.fadumphdr_addr);
321 } else { 320 } else {
322 /* Reserve the memory at the top of memory. */
323 size = get_fadump_area_size(); 321 size = get_fadump_area_size();
324 base = memory_boundary - size; 322
325 memblock_reserve(base, size); 323 /*
326 printk(KERN_INFO "Reserved %ldMB of memory at %ldMB " 324 * Reserve memory at an offset closer to bottom of the RAM to
327 "for firmware-assisted dump\n", 325 * minimize the impact of memory hot-remove operation. We can't
328 (unsigned long)(size >> 20), 326 * use memblock_find_in_range() here since it doesn't allocate
329 (unsigned long)(base >> 20)); 327 * from bottom to top.
328 */
329 for (base = fw_dump.boot_memory_size;
330 base <= (memory_boundary - size);
331 base += size) {
332 if (memblock_is_region_memory(base, size) &&
333 !memblock_is_region_reserved(base, size))
334 break;
335 }
336 if ((base > (memory_boundary - size)) ||
337 memblock_reserve(base, size)) {
338 pr_err("Failed to reserve memory\n");
339 return 0;
340 }
341
342 pr_info("Reserved %ldMB of memory at %ldMB for firmware-"
343 "assisted dump (System RAM: %ldMB)\n",
344 (unsigned long)(size >> 20),
345 (unsigned long)(base >> 20),
346 (unsigned long)(memblock_phys_mem_size() >> 20));
330 } 347 }
348
331 fw_dump.reserve_dump_area_start = base; 349 fw_dump.reserve_dump_area_start = base;
332 fw_dump.reserve_dump_area_size = size; 350 fw_dump.reserve_dump_area_size = size;
333 return 1; 351 return 1;
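The reservation loop above probes candidate bases bottom-up in size-sized steps, starting just above the preserved boot memory, and stops at the first range that is memory and not already reserved. A standalone, user-space illustration of the probing pattern with made-up sizes (none of these values come from the patch):

#include <stdio.h>

int main(void)
{
	unsigned long boot_memory_size = 256UL << 20;	/* assumed 256 MB */
	unsigned long size            = 512UL << 20;	/* assumed dump area size */
	unsigned long memory_boundary = 8UL << 30;	/* assumed 8 GB of RAM */
	unsigned long base;

	/* Same stride as the kernel loop: base, base + size, base + 2*size, ... */
	for (base = boot_memory_size; base <= memory_boundary - size; base += size)
		printf("probe [%lu MB, %lu MB)\n", base >> 20, (base + size) >> 20);

	return 0;
}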
diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index 1607be7c0ef2..e22734278458 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -735,11 +735,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_NEED_DTLB_SW_LRU)
735 EXCEPTION(0x2c00, Trap_2c, unknown_exception, EXC_XFER_EE) 735 EXCEPTION(0x2c00, Trap_2c, unknown_exception, EXC_XFER_EE)
736 EXCEPTION(0x2d00, Trap_2d, unknown_exception, EXC_XFER_EE) 736 EXCEPTION(0x2d00, Trap_2d, unknown_exception, EXC_XFER_EE)
737 EXCEPTION(0x2e00, Trap_2e, unknown_exception, EXC_XFER_EE) 737 EXCEPTION(0x2e00, Trap_2e, unknown_exception, EXC_XFER_EE)
738 EXCEPTION(0x2f00, MOLTrampoline, unknown_exception, EXC_XFER_EE_LITE) 738 EXCEPTION(0x2f00, Trap_2f, unknown_exception, EXC_XFER_EE)
739
740 .globl mol_trampoline
741 .set mol_trampoline, i0x2f00
742 EXPORT_SYMBOL(mol_trampoline)
743 739
744 . = 0x3000 740 . = 0x3000
745 741
@@ -1278,16 +1274,6 @@ EXPORT_SYMBOL(empty_zero_page)
1278swapper_pg_dir: 1274swapper_pg_dir:
1279 .space PGD_TABLE_SIZE 1275 .space PGD_TABLE_SIZE
1280 1276
1281 .globl intercept_table
1282intercept_table:
1283 .long 0, 0, i0x200, i0x300, i0x400, 0, i0x600, i0x700
1284 .long i0x800, 0, 0, 0, 0, i0xd00, 0, 0
1285 .long 0, 0, 0, i0x1300, 0, 0, 0, 0
1286 .long 0, 0, 0, 0, 0, 0, 0, 0
1287 .long 0, 0, 0, 0, 0, 0, 0, 0
1288 .long 0, 0, 0, 0, 0, 0, 0, 0
1289EXPORT_SYMBOL(intercept_table)
1290
1291/* Room for two PTE pointers, usually the kernel and current user pointers 1277/* Room for two PTE pointers, usually the kernel and current user pointers
1292 * to their respective root page table. 1278 * to their respective root page table.
1293 */ 1279 */
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index 1dc5eae2ced3..0ddc602b33a4 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -949,7 +949,8 @@ start_here_multiplatform:
949 LOAD_REG_ADDR(r3,init_thread_union) 949 LOAD_REG_ADDR(r3,init_thread_union)
950 950
951 /* set up a stack pointer */ 951 /* set up a stack pointer */
952 addi r1,r3,THREAD_SIZE 952 LOAD_REG_IMMEDIATE(r1,THREAD_SIZE)
953 add r1,r3,r1
953 li r0,0 954 li r0,0
954 stdu r0,-STACK_FRAME_OVERHEAD(r1) 955 stdu r0,-STACK_FRAME_OVERHEAD(r1)
955 956
diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index 6fd08219248d..07d4e0ad60db 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -20,6 +20,7 @@
20#include <asm/kvm_book3s_asm.h> 20#include <asm/kvm_book3s_asm.h>
21#include <asm/opal.h> 21#include <asm/opal.h>
22#include <asm/cpuidle.h> 22#include <asm/cpuidle.h>
23#include <asm/exception-64s.h>
23#include <asm/book3s/64/mmu-hash.h> 24#include <asm/book3s/64/mmu-hash.h>
24#include <asm/mmu.h> 25#include <asm/mmu.h>
25 26
@@ -94,12 +95,12 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
94core_idle_lock_held: 95core_idle_lock_held:
95 HMT_LOW 96 HMT_LOW
963: lwz r15,0(r14) 973: lwz r15,0(r14)
97 andi. r15,r15,PNV_CORE_IDLE_LOCK_BIT 98 andis. r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
98 bne 3b 99 bne 3b
99 HMT_MEDIUM 100 HMT_MEDIUM
100 lwarx r15,0,r14 101 lwarx r15,0,r14
101 andi. r9,r15,PNV_CORE_IDLE_LOCK_BIT 102 andis. r9,r15,PNV_CORE_IDLE_LOCK_BIT@h
102 bne core_idle_lock_held 103 bne- core_idle_lock_held
103 blr 104 blr
104 105
105/* 106/*
@@ -113,7 +114,7 @@ core_idle_lock_held:
113 * 114 *
114 * Address to 'rfid' to in r5 115 * Address to 'rfid' to in r5
115 */ 116 */
116_GLOBAL(pnv_powersave_common) 117pnv_powersave_common:
117 /* Use r3 to pass state nap/sleep/winkle */ 118 /* Use r3 to pass state nap/sleep/winkle */
118 /* NAP is a state loss, we create a regs frame on the 119 /* NAP is a state loss, we create a regs frame on the
119 * stack, fill it up with the state we care about and 120 * stack, fill it up with the state we care about and
@@ -188,8 +189,8 @@ pnv_enter_arch207_idle_mode:
188 /* The following store to HSTATE_HWTHREAD_STATE(r13) */ 189 /* The following store to HSTATE_HWTHREAD_STATE(r13) */
189 /* MUST occur in real mode, i.e. with the MMU off, */ 190 /* MUST occur in real mode, i.e. with the MMU off, */
190 /* and the MMU must stay off until we clear this flag */ 191 /* and the MMU must stay off until we clear this flag */
191 /* and test HSTATE_HWTHREAD_REQ(r13) in the system */ 192 /* and test HSTATE_HWTHREAD_REQ(r13) in */
192 /* reset interrupt vector in exceptions-64s.S. */ 193 /* pnv_powersave_wakeup in this file. */
193 /* The reason is that another thread can switch the */ 194 /* The reason is that another thread can switch the */
194 /* MMU to a guest context whenever this flag is set */ 195 /* MMU to a guest context whenever this flag is set */
195 /* to KVM_HWTHREAD_IN_IDLE, and if the MMU was on, */ 196 /* to KVM_HWTHREAD_IN_IDLE, and if the MMU was on, */
@@ -209,15 +210,20 @@ pnv_enter_arch207_idle_mode:
209 /* Sleep or winkle */ 210 /* Sleep or winkle */
210 lbz r7,PACA_THREAD_MASK(r13) 211 lbz r7,PACA_THREAD_MASK(r13)
211 ld r14,PACA_CORE_IDLE_STATE_PTR(r13) 212 ld r14,PACA_CORE_IDLE_STATE_PTR(r13)
213 li r5,0
214 beq cr3,3f
215 lis r5,PNV_CORE_IDLE_WINKLE_COUNT@h
2163:
212lwarx_loop1: 217lwarx_loop1:
213 lwarx r15,0,r14 218 lwarx r15,0,r14
214 219
215 andi. r9,r15,PNV_CORE_IDLE_LOCK_BIT 220 andis. r9,r15,PNV_CORE_IDLE_LOCK_BIT@h
216 bnel core_idle_lock_held 221 bnel- core_idle_lock_held
217 222
223 add r15,r15,r5 /* Add if winkle */
218 andc r15,r15,r7 /* Clear thread bit */ 224 andc r15,r15,r7 /* Clear thread bit */
219 225
220 andi. r15,r15,PNV_CORE_IDLE_THREAD_BITS 226 andi. r9,r15,PNV_CORE_IDLE_THREAD_BITS
221 227
222/* 228/*
223 * If cr0 = 0, then current thread is the last thread of the core entering 229 * If cr0 = 0, then current thread is the last thread of the core entering
@@ -240,7 +246,7 @@ common_enter: /* common code for all the threads entering sleep or winkle */
240 IDLE_STATE_ENTER_SEQ_NORET(PPC_SLEEP) 246 IDLE_STATE_ENTER_SEQ_NORET(PPC_SLEEP)
241 247
242fastsleep_workaround_at_entry: 248fastsleep_workaround_at_entry:
243 ori r15,r15,PNV_CORE_IDLE_LOCK_BIT 249 oris r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
244 stwcx. r15,0,r14 250 stwcx. r15,0,r14
245 bne- lwarx_loop1 251 bne- lwarx_loop1
246 isync 252 isync
@@ -250,10 +256,10 @@ fastsleep_workaround_at_entry:
250 li r4,1 256 li r4,1
251 bl opal_config_cpu_idle_state 257 bl opal_config_cpu_idle_state
252 258
253 /* Clear Lock bit */ 259 /* Unlock */
254 li r0,0 260 xoris r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
255 lwsync 261 lwsync
256 stw r0,0(r14) 262 stw r15,0(r14)
257 b common_enter 263 b common_enter
258 264
259enter_winkle: 265enter_winkle:
@@ -301,8 +307,8 @@ power_enter_stop:
301 307
302lwarx_loop_stop: 308lwarx_loop_stop:
303 lwarx r15,0,r14 309 lwarx r15,0,r14
304 andi. r9,r15,PNV_CORE_IDLE_LOCK_BIT 310 andis. r9,r15,PNV_CORE_IDLE_LOCK_BIT@h
305 bnel core_idle_lock_held 311 bnel- core_idle_lock_held
306 andc r15,r15,r7 /* Clear thread bit */ 312 andc r15,r15,r7 /* Clear thread bit */
307 313
308 stwcx. r15,0,r14 314 stwcx. r15,0,r14
@@ -375,17 +381,113 @@ _GLOBAL(power9_idle_stop)
375 li r4,1 381 li r4,1
376 b pnv_powersave_common 382 b pnv_powersave_common
377 /* No return */ 383 /* No return */
384
378/* 385/*
379 * Called from reset vector. Check whether we have woken up with 386 * On waking up from stop 0,1,2 with ESL=1 on POWER9 DD1,
380 * hypervisor state loss. If yes, restore hypervisor state and return 387 * HSPRG0 will be set to the HSPRG0 value of one of the
381 * back to reset vector. 388 * threads in this core. Thus the value we have in r13
389 * may not be this thread's paca pointer.
390 *
391 * Fortunately, the TIR remains invariant. Since this thread's
 392 * paca pointer is recorded in all its siblings' pacas, we can
393 * correctly recover this thread's paca pointer if we
394 * know the index of this thread in the core.
395 *
396 * This index can be obtained from the TIR.
382 * 397 *
 383 * r13 - Contents of HSPRG0 398 * i.e., thread's position in the core = TIR.
399 * If this value is i, then this thread's paca is
400 * paca->thread_sibling_pacas[i].
401 */
402power9_dd1_recover_paca:
403 mfspr r4, SPRN_TIR
404 /*
405 * Since each entry in thread_sibling_pacas is 8 bytes
406 * we need to left-shift by 3 bits. Thus r4 = i * 8
407 */
408 sldi r4, r4, 3
409 /* Get &paca->thread_sibling_pacas[0] in r5 */
410 ld r5, PACA_SIBLING_PACA_PTRS(r13)
411 /* Load paca->thread_sibling_pacas[i] into r13 */
412 ldx r13, r4, r5
413 SET_PACA(r13)
414 /*
415 * Indicate that we have lost NVGPR state
416 * which needs to be restored from the stack.
417 */
418 li r3, 1
 419 stb r3,PACA_NAPSTATELOST(r13)
420 blr
421
422/*
423 * Called from machine check handler for powersave wakeups.
424 * Low level machine check processing has already been done. Now just
425 * go through the wake up path to get everything in order.
426 *
427 * r3 - The original SRR1 value.
428 * Original SRR[01] have been clobbered.
429 * MSR_RI is clear.
430 */
431.global pnv_powersave_wakeup_mce
432pnv_powersave_wakeup_mce:
433 /* Set cr3 for pnv_powersave_wakeup */
434 rlwinm r11,r3,47-31,30,31
435 cmpwi cr3,r11,2
436
437 /*
438 * Now put the original SRR1 with SRR1_WAKEMCE_RESVD as the wake
439 * reason into SRR1, which allows reuse of the system reset wakeup
440 * code without being mistaken for another type of wakeup.
441 */
442 oris r3,r3,SRR1_WAKEMCE_RESVD@h
443 mtspr SPRN_SRR1,r3
444
445 b pnv_powersave_wakeup
446
447/*
448 * Called from reset vector for powersave wakeups.
384 * cr3 - set to gt if waking up with partial/complete hypervisor state loss 449 * cr3 - set to gt if waking up with partial/complete hypervisor state loss
385 */ 450 */
386_GLOBAL(pnv_restore_hyp_resource) 451.global pnv_powersave_wakeup
452pnv_powersave_wakeup:
453 ld r2, PACATOC(r13)
454
387BEGIN_FTR_SECTION 455BEGIN_FTR_SECTION
388 ld r2,PACATOC(r13); 456BEGIN_FTR_SECTION_NESTED(70)
457 bl power9_dd1_recover_paca
458END_FTR_SECTION_NESTED_IFSET(CPU_FTR_POWER9_DD1, 70)
459 bl pnv_restore_hyp_resource_arch300
460FTR_SECTION_ELSE
461 bl pnv_restore_hyp_resource_arch207
462ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
463
464 li r0,PNV_THREAD_RUNNING
465 stb r0,PACA_THREAD_IDLE_STATE(r13) /* Clear thread state */
466
467#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
468 li r0,KVM_HWTHREAD_IN_KERNEL
469 stb r0,HSTATE_HWTHREAD_STATE(r13)
470 /* Order setting hwthread_state vs. testing hwthread_req */
471 sync
472 lbz r0,HSTATE_HWTHREAD_REQ(r13)
473 cmpwi r0,0
474 beq 1f
475 b kvm_start_guest
4761:
477#endif
478
479 /* Return SRR1 from power7_nap() */
480 mfspr r3,SPRN_SRR1
481 blt cr3,pnv_wakeup_noloss
482 b pnv_wakeup_loss
483
484/*
485 * Check whether we have woken up with hypervisor state loss.
486 * If yes, restore hypervisor state and return back to link.
487 *
488 * cr3 - set to gt if waking up with partial/complete hypervisor state loss
489 */
490pnv_restore_hyp_resource_arch300:
389 /* 491 /*
390 * POWER ISA 3. Use PSSCR to determine if we 492 * POWER ISA 3. Use PSSCR to determine if we
391 * are waking up from deep idle state 493 * are waking up from deep idle state
@@ -400,31 +502,19 @@ BEGIN_FTR_SECTION
400 */ 502 */
401 rldicl r5,r5,4,60 503 rldicl r5,r5,4,60
402 cmpd cr4,r5,r4 504 cmpd cr4,r5,r4
403 bge cr4,pnv_wakeup_tb_loss 505 bge cr4,pnv_wakeup_tb_loss /* returns to caller */
404 /*
405 * Waking up without hypervisor state loss. Return to
406 * reset vector
407 */
408 blr
409 506
410END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) 507 blr /* Waking up without hypervisor state loss. */
411 508
509/* Same calling convention as arch300 */
510pnv_restore_hyp_resource_arch207:
412 /* 511 /*
413 * POWER ISA 2.07 or less. 512 * POWER ISA 2.07 or less.
414 * Check if last bit of HSPGR0 is set. This indicates whether we are 513 * Check if we slept with sleep or winkle.
415 * waking up from winkle.
416 */ 514 */
417 clrldi r5,r13,63 515 lbz r4,PACA_THREAD_IDLE_STATE(r13)
418 clrrdi r13,r13,1 516 cmpwi cr2,r4,PNV_THREAD_NAP
419 517 bgt cr2,pnv_wakeup_tb_loss /* Either sleep or Winkle */
420 /* Now that we are sure r13 is corrected, load TOC */
421 ld r2,PACATOC(r13);
422 cmpwi cr4,r5,1
423 mtspr SPRN_HSPRG0,r13
424
425 lbz r0,PACA_THREAD_IDLE_STATE(r13)
426 cmpwi cr2,r0,PNV_THREAD_NAP
427 bgt cr2,pnv_wakeup_tb_loss /* Either sleep or Winkle */
428 518
429 /* 519 /*
430 * We fall through here if PACA_THREAD_IDLE_STATE shows we are waking 520 * We fall through here if PACA_THREAD_IDLE_STATE shows we are waking
@@ -433,8 +523,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
433 */ 523 */
434 bgt cr3,. 524 bgt cr3,.
435 525
436 blr /* Return back to System Reset vector from where 526 blr /* Waking up without hypervisor state loss */
437 pnv_restore_hyp_resource was invoked */
438 527
439/* 528/*
440 * Called if waking up from idle state which can cause either partial or 529 * Called if waking up from idle state which can cause either partial or
@@ -444,9 +533,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
444 * 533 *
445 * r13 - PACA 534 * r13 - PACA
446 * cr3 - gt if waking up with partial/complete hypervisor state loss 535 * cr3 - gt if waking up with partial/complete hypervisor state loss
536 *
537 * If ISA300:
447 * cr4 - gt or eq if waking up from complete hypervisor state loss. 538 * cr4 - gt or eq if waking up from complete hypervisor state loss.
539 *
540 * If ISA207:
541 * r4 - PACA_THREAD_IDLE_STATE
448 */ 542 */
449_GLOBAL(pnv_wakeup_tb_loss) 543pnv_wakeup_tb_loss:
450 ld r1,PACAR1(r13) 544 ld r1,PACAR1(r13)
451 /* 545 /*
452 * Before entering any idle state, the NVGPRs are saved in the stack. 546 * Before entering any idle state, the NVGPRs are saved in the stack.
@@ -473,18 +567,19 @@ _GLOBAL(pnv_wakeup_tb_loss)
473 * is required to return back to reset vector after hypervisor state 567 * is required to return back to reset vector after hypervisor state
474 * restore is complete. 568 * restore is complete.
475 */ 569 */
570 mr r18,r4
476 mflr r17 571 mflr r17
477 mfspr r16,SPRN_SRR1 572 mfspr r16,SPRN_SRR1
478BEGIN_FTR_SECTION 573BEGIN_FTR_SECTION
479 CHECK_HMI_INTERRUPT 574 CHECK_HMI_INTERRUPT
480END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) 575END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
481 576
482 lbz r7,PACA_THREAD_MASK(r13)
483 ld r14,PACA_CORE_IDLE_STATE_PTR(r13) 577 ld r14,PACA_CORE_IDLE_STATE_PTR(r13)
484lwarx_loop2: 578 lbz r7,PACA_THREAD_MASK(r13)
485 lwarx r15,0,r14 579
486 andi. r9,r15,PNV_CORE_IDLE_LOCK_BIT
487 /* 580 /*
581 * Take the core lock to synchronize against other threads.
582 *
488 * Lock bit is set in one of the 2 cases- 583 * Lock bit is set in one of the 2 cases-
489 * a. In the sleep/winkle enter path, the last thread is executing 584 * a. In the sleep/winkle enter path, the last thread is executing
490 * fastsleep workaround code. 585 * fastsleep workaround code.
@@ -492,23 +587,93 @@ lwarx_loop2:
492 * workaround undo code or resyncing timebase or restoring context 587 * workaround undo code or resyncing timebase or restoring context
493 * In either case loop until the lock bit is cleared. 588 * In either case loop until the lock bit is cleared.
494 */ 589 */
495 bnel core_idle_lock_held 5901:
591 lwarx r15,0,r14
592 andis. r9,r15,PNV_CORE_IDLE_LOCK_BIT@h
593 bnel- core_idle_lock_held
594 oris r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
595 stwcx. r15,0,r14
596 bne- 1b
597 isync
496 598
497 cmpwi cr2,r15,0 599 andi. r9,r15,PNV_CORE_IDLE_THREAD_BITS
600 cmpwi cr2,r9,0
498 601
499 /* 602 /*
500 * At this stage 603 * At this stage
501 * cr2 - eq if first thread to wakeup in core 604 * cr2 - eq if first thread to wakeup in core
502 * cr3- gt if waking up with partial/complete hypervisor state loss 605 * cr3- gt if waking up with partial/complete hypervisor state loss
606 * ISA300:
503 * cr4 - gt or eq if waking up from complete hypervisor state loss. 607 * cr4 - gt or eq if waking up from complete hypervisor state loss.
504 */ 608 */
505 609
506 ori r15,r15,PNV_CORE_IDLE_LOCK_BIT
507 stwcx. r15,0,r14
508 bne- lwarx_loop2
509 isync
510
511BEGIN_FTR_SECTION 610BEGIN_FTR_SECTION
611 /*
612 * Were we in winkle?
613 * If yes, check if all threads were in winkle, decrement our
614 * winkle count, set all thread winkle bits if all were in winkle.
615 * Check if our thread has a winkle bit set, and set cr4 accordingly
616 * (to match ISA300, above). Pseudo-code for core idle state
617 * transitions for ISA207 is as follows (everything happens atomically
618 * due to store conditional and/or lock bit):
619 *
620 * nap_idle() { }
621 * nap_wake() { }
622 *
623 * sleep_idle()
624 * {
625 * core_idle_state &= ~thread_in_core
626 * }
627 *
628 * sleep_wake()
629 * {
630 * bool first_in_core, first_in_subcore;
631 *
632 * first_in_core = (core_idle_state & IDLE_THREAD_BITS) == 0;
633 * first_in_subcore = (core_idle_state & SUBCORE_SIBLING_MASK) == 0;
634 *
635 * core_idle_state |= thread_in_core;
636 * }
637 *
638 * winkle_idle()
639 * {
640 * core_idle_state &= ~thread_in_core;
641 * core_idle_state += 1 << WINKLE_COUNT_SHIFT;
642 * }
643 *
644 * winkle_wake()
645 * {
646 * bool first_in_core, first_in_subcore, winkle_state_lost;
647 *
648 * first_in_core = (core_idle_state & IDLE_THREAD_BITS) == 0;
649 * first_in_subcore = (core_idle_state & SUBCORE_SIBLING_MASK) == 0;
650 *
651 * core_idle_state |= thread_in_core;
652 *
 653 * if ((core_idle_state & WINKLE_MASK) == (8 << WINKLE_COUNT_SHIFT))
654 * core_idle_state |= THREAD_WINKLE_BITS;
655 * core_idle_state -= 1 << WINKLE_COUNT_SHIFT;
656 *
657 * winkle_state_lost = core_idle_state &
658 * (thread_in_core << WINKLE_THREAD_SHIFT);
659 * core_idle_state &= ~(thread_in_core << WINKLE_THREAD_SHIFT);
660 * }
661 *
662 */
663 cmpwi r18,PNV_THREAD_WINKLE
664 bne 2f
665 andis. r9,r15,PNV_CORE_IDLE_WINKLE_COUNT_ALL_BIT@h
666 subis r15,r15,PNV_CORE_IDLE_WINKLE_COUNT@h
667 beq 2f
668 ori r15,r15,PNV_CORE_IDLE_THREAD_WINKLE_BITS /* all were winkle */
6692:
670 /* Shift thread bit to winkle mask, then test if this thread is set,
671 * and remove it from the winkle bits */
672 slwi r8,r7,8
673 and r8,r8,r15
674 andc r15,r15,r8
675 cmpwi cr4,r8,1 /* cr4 will be gt if our bit is set, lt if not */
676
512 lbz r4,PACA_SUBCORE_SIBLING_MASK(r13) 677 lbz r4,PACA_SUBCORE_SIBLING_MASK(r13)
513 and r4,r4,r15 678 and r4,r4,r15
514 cmpwi r4,0 /* Check if first in subcore */ 679 cmpwi r4,0 /* Check if first in subcore */
@@ -593,7 +758,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
593 mtspr SPRN_WORC,r4 758 mtspr SPRN_WORC,r4
594 759
595clear_lock: 760clear_lock:
596 andi. r15,r15,PNV_CORE_IDLE_THREAD_BITS 761 xoris r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
597 lwsync 762 lwsync
598 stw r15,0(r14) 763 stw r15,0(r14)
599 764
@@ -651,8 +816,7 @@ hypervisor_state_restored:
651 816
652 mtspr SPRN_SRR1,r16 817 mtspr SPRN_SRR1,r16
653 mtlr r17 818 mtlr r17
654 blr /* Return back to System Reset vector from where 819 blr /* return to pnv_powersave_wakeup */
655 pnv_restore_hyp_resource was invoked */
656 820
657fastsleep_workaround_at_exit: 821fastsleep_workaround_at_exit:
658 li r3,1 822 li r3,1
@@ -664,7 +828,8 @@ fastsleep_workaround_at_exit:
664 * R3 here contains the value that will be returned to the caller 828 * R3 here contains the value that will be returned to the caller
665 * of power7_nap. 829 * of power7_nap.
666 */ 830 */
667_GLOBAL(pnv_wakeup_loss) 831.global pnv_wakeup_loss
832pnv_wakeup_loss:
668 ld r1,PACAR1(r13) 833 ld r1,PACAR1(r13)
669BEGIN_FTR_SECTION 834BEGIN_FTR_SECTION
670 CHECK_HMI_INTERRUPT 835 CHECK_HMI_INTERRUPT
@@ -684,7 +849,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
684 * R3 here contains the value that will be returned to the caller 849 * R3 here contains the value that will be returned to the caller
685 * of power7_nap. 850 * of power7_nap.
686 */ 851 */
687_GLOBAL(pnv_wakeup_noloss) 852pnv_wakeup_noloss:
688 lbz r0,PACA_NAPSTATELOST(r13) 853 lbz r0,PACA_NAPSTATELOST(r13)
689 cmpwi r0,0 854 cmpwi r0,0
690 bne pnv_wakeup_loss 855 bne pnv_wakeup_loss
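The long pseudo-code comment above (nap/sleep/winkle idle and wake transitions) describes how the shared per-core word now packs three things: a lock bit in the upper halfword (hence the switch to the andis./oris./xoris. @h forms), the per-thread "in idle" bits, and a counter of threads currently in winkle. A compact C sketch of the winkle bookkeeping; the mask values are illustrative stand-ins, not the real PNV_CORE_IDLE_* constants, and locking is omitted.

#define IDLE_LOCK_BIT		(1u << 28)	/* upper 16 bits, so the asm uses the @h forms */
#define IDLE_WINKLE_COUNT	(1u << 16)	/* one increment per thread in winkle */
#define IDLE_WINKLE_COUNT_MASK	0x000f0000u	/* holds 0..8 */
#define IDLE_THREAD_WINKLE_BITS	0x0000ff00u	/* "winkle state lost" bit per thread */

static unsigned int winkle_idle(unsigned int state, unsigned int thread)
{
	state &= ~thread;			/* this thread is no longer running */
	state += IDLE_WINKLE_COUNT;		/* remember one more winkled thread */
	return state;
}

static unsigned int winkle_wake(unsigned int state, unsigned int thread, int *state_lost)
{
	/* If every thread was in winkle, all of them lost their winkle state. */
	if ((state & IDLE_WINKLE_COUNT_MASK) == 8 * IDLE_WINKLE_COUNT)
		state |= IDLE_THREAD_WINKLE_BITS;
	state -= IDLE_WINKLE_COUNT;

	*state_lost = !!(state & (thread << 8));	/* was our winkle bit set? */
	state &= ~(thread << 8);			/* clear it either way */
	state |= thread;				/* we are running again */
	return state;
}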
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 5f202a566ec5..5a3231fedf08 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -711,13 +711,16 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
711 return tbl; 711 return tbl;
712} 712}
713 713
714void iommu_free_table(struct iommu_table *tbl, const char *node_name) 714static void iommu_table_free(struct kref *kref)
715{ 715{
716 unsigned long bitmap_sz; 716 unsigned long bitmap_sz;
717 unsigned int order; 717 unsigned int order;
718 struct iommu_table *tbl;
718 719
719 if (!tbl) 720 tbl = container_of(kref, struct iommu_table, it_kref);
720 return; 721
722 if (tbl->it_ops->free)
723 tbl->it_ops->free(tbl);
721 724
722 if (!tbl->it_map) { 725 if (!tbl->it_map) {
723 kfree(tbl); 726 kfree(tbl);
@@ -733,7 +736,7 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
733 736
734 /* verify that table contains no entries */ 737 /* verify that table contains no entries */
735 if (!bitmap_empty(tbl->it_map, tbl->it_size)) 738 if (!bitmap_empty(tbl->it_map, tbl->it_size))
736 pr_warn("%s: Unexpected TCEs for %s\n", __func__, node_name); 739 pr_warn("%s: Unexpected TCEs\n", __func__);
737 740
738 /* calculate bitmap size in bytes */ 741 /* calculate bitmap size in bytes */
739 bitmap_sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long); 742 bitmap_sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
@@ -746,6 +749,24 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
746 kfree(tbl); 749 kfree(tbl);
747} 750}
748 751
752struct iommu_table *iommu_tce_table_get(struct iommu_table *tbl)
753{
754 if (kref_get_unless_zero(&tbl->it_kref))
755 return tbl;
756
757 return NULL;
758}
759EXPORT_SYMBOL_GPL(iommu_tce_table_get);
760
761int iommu_tce_table_put(struct iommu_table *tbl)
762{
763 if (WARN_ON(!tbl))
764 return 0;
765
766 return kref_put(&tbl->it_kref, iommu_table_free);
767}
768EXPORT_SYMBOL_GPL(iommu_tce_table_put);
769
749/* Creates TCEs for a user provided buffer. The user buffer must be 770/* Creates TCEs for a user provided buffer. The user buffer must be
750 * contiguous real kernel storage (not vmalloc). The address passed here 771 * contiguous real kernel storage (not vmalloc). The address passed here
751 * comprises a page address and offset into that page. The dma_addr_t 772 * comprises a page address and offset into that page. The dma_addr_t
@@ -1004,6 +1025,31 @@ long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
1004} 1025}
1005EXPORT_SYMBOL_GPL(iommu_tce_xchg); 1026EXPORT_SYMBOL_GPL(iommu_tce_xchg);
1006 1027
1028#ifdef CONFIG_PPC_BOOK3S_64
1029long iommu_tce_xchg_rm(struct iommu_table *tbl, unsigned long entry,
1030 unsigned long *hpa, enum dma_data_direction *direction)
1031{
1032 long ret;
1033
1034 ret = tbl->it_ops->exchange_rm(tbl, entry, hpa, direction);
1035
1036 if (!ret && ((*direction == DMA_FROM_DEVICE) ||
1037 (*direction == DMA_BIDIRECTIONAL))) {
1038 struct page *pg = realmode_pfn_to_page(*hpa >> PAGE_SHIFT);
1039
1040 if (likely(pg)) {
1041 SetPageDirty(pg);
1042 } else {
1043 tbl->it_ops->exchange_rm(tbl, entry, hpa, direction);
1044 ret = -EFAULT;
1045 }
1046 }
1047
1048 return ret;
1049}
1050EXPORT_SYMBOL_GPL(iommu_tce_xchg_rm);
1051#endif
1052
1007int iommu_take_ownership(struct iommu_table *tbl) 1053int iommu_take_ownership(struct iommu_table *tbl)
1008{ 1054{
1009 unsigned long flags, i, sz = (tbl->it_size + 7) >> 3; 1055 unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
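With the hunks above, iommu_free_table() becomes a kref release function (iommu_table_free()), and iommu_tce_table_get()/iommu_tce_table_put() are the public reference-counting entry points. A short usage sketch for a hypothetical second user of a table; it assumes the table's it_kref was initialised when the table was created.

static int example_attach(struct iommu_table *tbl)
{
	/* Take a reference; fails if the last reference is already gone. */
	if (!iommu_tce_table_get(tbl))
		return -EBUSY;

	/* ... program TCEs, hand the table to a consumer, etc. ... */
	return 0;
}

static void example_detach(struct iommu_table *tbl)
{
	/* Drop the reference; the final put calls iommu_table_free(). */
	iommu_tce_table_put(tbl);
}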
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index a018f5cae899..5c291df30fe3 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -65,7 +65,6 @@
65#include <asm/machdep.h> 65#include <asm/machdep.h>
66#include <asm/udbg.h> 66#include <asm/udbg.h>
67#include <asm/smp.h> 67#include <asm/smp.h>
68#include <asm/debug.h>
69#include <asm/livepatch.h> 68#include <asm/livepatch.h>
70#include <asm/asm-prototypes.h> 69#include <asm/asm-prototypes.h>
71 70
@@ -442,46 +441,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
442 return sum; 441 return sum;
443} 442}
444 443
445#ifdef CONFIG_HOTPLUG_CPU
446void migrate_irqs(void)
447{
448 struct irq_desc *desc;
449 unsigned int irq;
450 static int warned;
451 cpumask_var_t mask;
452 const struct cpumask *map = cpu_online_mask;
453
454 alloc_cpumask_var(&mask, GFP_KERNEL);
455
456 for_each_irq_desc(irq, desc) {
457 struct irq_data *data;
458 struct irq_chip *chip;
459
460 data = irq_desc_get_irq_data(desc);
461 if (irqd_is_per_cpu(data))
462 continue;
463
464 chip = irq_data_get_irq_chip(data);
465
466 cpumask_and(mask, irq_data_get_affinity_mask(data), map);
467 if (cpumask_any(mask) >= nr_cpu_ids) {
468 pr_warn("Breaking affinity for irq %i\n", irq);
469 cpumask_copy(mask, map);
470 }
471 if (chip->irq_set_affinity)
472 chip->irq_set_affinity(data, mask, true);
473 else if (desc->action && !(warned++))
474 pr_err("Cannot set affinity for irq %i\n", irq);
475 }
476
477 free_cpumask_var(mask);
478
479 local_irq_enable();
480 mdelay(1);
481 local_irq_disable();
482}
483#endif
484
485static inline void check_stack_overflow(void) 444static inline void check_stack_overflow(void)
486{ 445{
487#ifdef CONFIG_DEBUG_STACKOVERFLOW 446#ifdef CONFIG_DEBUG_STACKOVERFLOW
diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c
new file mode 100644
index 000000000000..6c089d9757c9
--- /dev/null
+++ b/arch/powerpc/kernel/kprobes-ftrace.c
@@ -0,0 +1,104 @@
1/*
2 * Dynamic Ftrace based Kprobes Optimization
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) Hitachi Ltd., 2012
19 * Copyright 2016 Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
20 * IBM Corporation
21 */
22#include <linux/kprobes.h>
23#include <linux/ptrace.h>
24#include <linux/hardirq.h>
25#include <linux/preempt.h>
26#include <linux/ftrace.h>
27
28static nokprobe_inline
29int __skip_singlestep(struct kprobe *p, struct pt_regs *regs,
30 struct kprobe_ctlblk *kcb, unsigned long orig_nip)
31{
32 /*
33 * Emulate singlestep (and also recover regs->nip)
34 * as if there is a nop
35 */
36 regs->nip = (unsigned long)p->addr + MCOUNT_INSN_SIZE;
37 if (unlikely(p->post_handler)) {
38 kcb->kprobe_status = KPROBE_HIT_SSDONE;
39 p->post_handler(p, regs, 0);
40 }
41 __this_cpu_write(current_kprobe, NULL);
42 if (orig_nip)
43 regs->nip = orig_nip;
44 return 1;
45}
46
47int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
48 struct kprobe_ctlblk *kcb)
49{
50 if (kprobe_ftrace(p))
51 return __skip_singlestep(p, regs, kcb, 0);
52 else
53 return 0;
54}
55NOKPROBE_SYMBOL(skip_singlestep);
56
57/* Ftrace callback handler for kprobes */
58void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip,
59 struct ftrace_ops *ops, struct pt_regs *regs)
60{
61 struct kprobe *p;
62 struct kprobe_ctlblk *kcb;
63 unsigned long flags;
64
65 /* Disable irq for emulating a breakpoint and avoiding preempt */
66 local_irq_save(flags);
67 hard_irq_disable();
68
69 p = get_kprobe((kprobe_opcode_t *)nip);
70 if (unlikely(!p) || kprobe_disabled(p))
71 goto end;
72
73 kcb = get_kprobe_ctlblk();
74 if (kprobe_running()) {
75 kprobes_inc_nmissed_count(p);
76 } else {
77 unsigned long orig_nip = regs->nip;
78
79 /*
80 * On powerpc, NIP is *before* this instruction for the
81 * pre handler
82 */
83 regs->nip -= MCOUNT_INSN_SIZE;
84
85 __this_cpu_write(current_kprobe, p);
86 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
87 if (!p->pre_handler || !p->pre_handler(p, regs))
88 __skip_singlestep(p, regs, kcb, orig_nip);
89 /*
90 * If pre_handler returns !0, it sets regs->nip and
91 * resets current kprobe.
92 */
93 }
94end:
95 local_irq_restore(flags);
96}
97NOKPROBE_SYMBOL(kprobe_ftrace_handler);
98
99int arch_prepare_kprobe_ftrace(struct kprobe *p)
100{
101 p->ainsn.insn = NULL;
102 p->ainsn.boostable = -1;
103 return 0;
104}
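With KPROBES_ON_FTRACE, a kprobe placed on a function's ftrace location is dispatched through kprobe_ftrace_handler() above rather than via a trap instruction, and the "single step" is emulated by __skip_singlestep(). A minimal module-style usage sketch; the probed symbol is only an example, and nothing here relies on anything beyond the standard kprobes API.

#include <linux/module.h>
#include <linux/kprobes.h>

static int example_pre(struct kprobe *p, struct pt_regs *regs)
{
	pr_info("kprobe hit at %pS\n", (void *)regs->nip);
	return 0;	/* 0: let the ftrace-based path emulate the probed instruction */
}

static struct kprobe example_kp = {
	.symbol_name	= "_do_fork",	/* example target only */
	.pre_handler	= example_pre,
};

static int __init example_init(void)
{
	return register_kprobe(&example_kp);
}

static void __exit example_exit(void)
{
	unregister_kprobe(&example_kp);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");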
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index fce05a38851c..160ae0fa7d0d 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -35,6 +35,7 @@
35#include <asm/code-patching.h> 35#include <asm/code-patching.h>
36#include <asm/cacheflush.h> 36#include <asm/cacheflush.h>
37#include <asm/sstep.h> 37#include <asm/sstep.h>
38#include <asm/sections.h>
38#include <linux/uaccess.h> 39#include <linux/uaccess.h>
39 40
40DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; 41DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
@@ -42,7 +43,86 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
 
 struct kretprobe_blackpoint kretprobe_blacklist[] = {{NULL, NULL}};
 
-int __kprobes arch_prepare_kprobe(struct kprobe *p)
+bool arch_within_kprobe_blacklist(unsigned long addr)
+{
+	return  (addr >= (unsigned long)__kprobes_text_start &&
+		 addr < (unsigned long)__kprobes_text_end) ||
+		(addr >= (unsigned long)_stext &&
+		 addr < (unsigned long)__head_end);
+}
+
+kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset)
+{
+	kprobe_opcode_t *addr;
+
+#ifdef PPC64_ELF_ABI_v2
+	/* PPC64 ABIv2 needs local entry point */
+	addr = (kprobe_opcode_t *)kallsyms_lookup_name(name);
+	if (addr && !offset) {
+#ifdef CONFIG_KPROBES_ON_FTRACE
+		unsigned long faddr;
+		/*
+		 * Per livepatch.h, ftrace location is always within the first
+		 * 16 bytes of a function on powerpc with -mprofile-kernel.
+		 */
+		faddr = ftrace_location_range((unsigned long)addr,
+					      (unsigned long)addr + 16);
+		if (faddr)
+			addr = (kprobe_opcode_t *)faddr;
+		else
+#endif
+			addr = (kprobe_opcode_t *)ppc_function_entry(addr);
+	}
+#elif defined(PPC64_ELF_ABI_v1)
+	/*
+	 * 64bit powerpc ABIv1 uses function descriptors:
+	 * - Check for the dot variant of the symbol first.
+	 * - If that fails, try looking up the symbol provided.
+	 *
+	 * This ensures we always get to the actual symbol and not
+	 * the descriptor.
+	 *
+	 * Also handle <module:symbol> format.
+	 */
+	char dot_name[MODULE_NAME_LEN + 1 + KSYM_NAME_LEN];
+	const char *modsym;
+	bool dot_appended = false;
+	if ((modsym = strchr(name, ':')) != NULL) {
+		modsym++;
+		if (*modsym != '\0' && *modsym != '.') {
+			/* Convert to <module:.symbol> */
+			strncpy(dot_name, name, modsym - name);
+			dot_name[modsym - name] = '.';
+			dot_name[modsym - name + 1] = '\0';
+			strncat(dot_name, modsym,
+				sizeof(dot_name) - (modsym - name) - 2);
+			dot_appended = true;
+		} else {
+			dot_name[0] = '\0';
+			strncat(dot_name, name, sizeof(dot_name) - 1);
+		}
+	} else if (name[0] != '.') {
+		dot_name[0] = '.';
+		dot_name[1] = '\0';
+		strncat(dot_name, name, KSYM_NAME_LEN - 2);
+		dot_appended = true;
+	} else {
+		dot_name[0] = '\0';
+		strncat(dot_name, name, KSYM_NAME_LEN - 1);
+	}
+	addr = (kprobe_opcode_t *)kallsyms_lookup_name(dot_name);
+	if (!addr && dot_appended) {
+		/* Let's try the original non-dot symbol lookup */
+		addr = (kprobe_opcode_t *)kallsyms_lookup_name(name);
+	}
+#else
+	addr = (kprobe_opcode_t *)kallsyms_lookup_name(name);
+#endif
+
+	return addr;
+}
+
+int arch_prepare_kprobe(struct kprobe *p)
 {
 	int ret = 0;
 	kprobe_opcode_t insn = *p->addr;
@@ -74,30 +154,34 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)
 	p->ainsn.boostable = 0;
 	return ret;
 }
+NOKPROBE_SYMBOL(arch_prepare_kprobe);
 
-void __kprobes arch_arm_kprobe(struct kprobe *p)
+void arch_arm_kprobe(struct kprobe *p)
 {
 	*p->addr = BREAKPOINT_INSTRUCTION;
 	flush_icache_range((unsigned long) p->addr,
 			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
 }
+NOKPROBE_SYMBOL(arch_arm_kprobe);
 
-void __kprobes arch_disarm_kprobe(struct kprobe *p)
+void arch_disarm_kprobe(struct kprobe *p)
 {
 	*p->addr = p->opcode;
 	flush_icache_range((unsigned long) p->addr,
 			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
 }
+NOKPROBE_SYMBOL(arch_disarm_kprobe);
 
-void __kprobes arch_remove_kprobe(struct kprobe *p)
+void arch_remove_kprobe(struct kprobe *p)
 {
 	if (p->ainsn.insn) {
 		free_insn_slot(p->ainsn.insn, 0);
 		p->ainsn.insn = NULL;
 	}
 }
+NOKPROBE_SYMBOL(arch_remove_kprobe);
 
-static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
+static nokprobe_inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
 {
 	enable_single_step(regs);
 
@@ -110,37 +194,80 @@ static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
110 regs->nip = (unsigned long)p->ainsn.insn; 194 regs->nip = (unsigned long)p->ainsn.insn;
111} 195}
112 196
113static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) 197static nokprobe_inline void save_previous_kprobe(struct kprobe_ctlblk *kcb)
114{ 198{
115 kcb->prev_kprobe.kp = kprobe_running(); 199 kcb->prev_kprobe.kp = kprobe_running();
116 kcb->prev_kprobe.status = kcb->kprobe_status; 200 kcb->prev_kprobe.status = kcb->kprobe_status;
117 kcb->prev_kprobe.saved_msr = kcb->kprobe_saved_msr; 201 kcb->prev_kprobe.saved_msr = kcb->kprobe_saved_msr;
118} 202}
119 203
120static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) 204static nokprobe_inline void restore_previous_kprobe(struct kprobe_ctlblk *kcb)
121{ 205{
122 __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp); 206 __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
123 kcb->kprobe_status = kcb->prev_kprobe.status; 207 kcb->kprobe_status = kcb->prev_kprobe.status;
124 kcb->kprobe_saved_msr = kcb->prev_kprobe.saved_msr; 208 kcb->kprobe_saved_msr = kcb->prev_kprobe.saved_msr;
125} 209}
126 210
127static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, 211static nokprobe_inline void set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
128 struct kprobe_ctlblk *kcb) 212 struct kprobe_ctlblk *kcb)
129{ 213{
130 __this_cpu_write(current_kprobe, p); 214 __this_cpu_write(current_kprobe, p);
131 kcb->kprobe_saved_msr = regs->msr; 215 kcb->kprobe_saved_msr = regs->msr;
132} 216}
133 217
134void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, 218bool arch_function_offset_within_entry(unsigned long offset)
135 struct pt_regs *regs) 219{
220#ifdef PPC64_ELF_ABI_v2
221#ifdef CONFIG_KPROBES_ON_FTRACE
222 return offset <= 16;
223#else
224 return offset <= 8;
225#endif
226#else
227 return !offset;
228#endif
229}
230
231void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
136{ 232{
137 ri->ret_addr = (kprobe_opcode_t *)regs->link; 233 ri->ret_addr = (kprobe_opcode_t *)regs->link;
138 234
139 /* Replace the return addr with trampoline addr */ 235 /* Replace the return addr with trampoline addr */
140 regs->link = (unsigned long)kretprobe_trampoline; 236 regs->link = (unsigned long)kretprobe_trampoline;
141} 237}
238NOKPROBE_SYMBOL(arch_prepare_kretprobe);
142 239
143int __kprobes kprobe_handler(struct pt_regs *regs) 240int try_to_emulate(struct kprobe *p, struct pt_regs *regs)
241{
242 int ret;
243 unsigned int insn = *p->ainsn.insn;
244
245 /* regs->nip is also adjusted if emulate_step returns 1 */
246 ret = emulate_step(regs, insn);
247 if (ret > 0) {
248 /*
249 * Once this instruction has been boosted
250 * successfully, set the boostable flag
251 */
252 if (unlikely(p->ainsn.boostable == 0))
253 p->ainsn.boostable = 1;
254 } else if (ret < 0) {
255 /*
256 * We don't allow kprobes on mtmsr(d)/rfi(d), etc.
257 * So, we should never get here... but, its still
258 * good to catch them, just in case...
259 */
260 printk("Can't step on instruction %x\n", insn);
261 BUG();
262 } else if (ret == 0)
263 /* This instruction can't be boosted */
264 p->ainsn.boostable = -1;
265
266 return ret;
267}
268NOKPROBE_SYMBOL(try_to_emulate);
269
270int kprobe_handler(struct pt_regs *regs)
144{ 271{
145 struct kprobe *p; 272 struct kprobe *p;
146 int ret = 0; 273 int ret = 0;
@@ -177,10 +304,17 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
177 */ 304 */
178 save_previous_kprobe(kcb); 305 save_previous_kprobe(kcb);
179 set_current_kprobe(p, regs, kcb); 306 set_current_kprobe(p, regs, kcb);
180 kcb->kprobe_saved_msr = regs->msr;
181 kprobes_inc_nmissed_count(p); 307 kprobes_inc_nmissed_count(p);
182 prepare_singlestep(p, regs); 308 prepare_singlestep(p, regs);
183 kcb->kprobe_status = KPROBE_REENTER; 309 kcb->kprobe_status = KPROBE_REENTER;
310 if (p->ainsn.boostable >= 0) {
311 ret = try_to_emulate(p, regs);
312
313 if (ret > 0) {
314 restore_previous_kprobe(kcb);
315 return 1;
316 }
317 }
184 return 1; 318 return 1;
185 } else { 319 } else {
186 if (*addr != BREAKPOINT_INSTRUCTION) { 320 if (*addr != BREAKPOINT_INSTRUCTION) {
@@ -197,7 +331,9 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
197 } 331 }
198 p = __this_cpu_read(current_kprobe); 332 p = __this_cpu_read(current_kprobe);
199 if (p->break_handler && p->break_handler(p, regs)) { 333 if (p->break_handler && p->break_handler(p, regs)) {
200 goto ss_probe; 334 if (!skip_singlestep(p, regs, kcb))
335 goto ss_probe;
336 ret = 1;
201 } 337 }
202 } 338 }
203 goto no_kprobe; 339 goto no_kprobe;
@@ -235,18 +371,9 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
235 371
236ss_probe: 372ss_probe:
237 if (p->ainsn.boostable >= 0) { 373 if (p->ainsn.boostable >= 0) {
238 unsigned int insn = *p->ainsn.insn; 374 ret = try_to_emulate(p, regs);
239 375
240 /* regs->nip is also adjusted if emulate_step returns 1 */
241 ret = emulate_step(regs, insn);
242 if (ret > 0) { 376 if (ret > 0) {
243 /*
244 * Once this instruction has been boosted
245 * successfully, set the boostable flag
246 */
247 if (unlikely(p->ainsn.boostable == 0))
248 p->ainsn.boostable = 1;
249
250 if (p->post_handler) 377 if (p->post_handler)
251 p->post_handler(p, regs, 0); 378 p->post_handler(p, regs, 0);
252 379
@@ -254,17 +381,7 @@ ss_probe:
254 reset_current_kprobe(); 381 reset_current_kprobe();
255 preempt_enable_no_resched(); 382 preempt_enable_no_resched();
256 return 1; 383 return 1;
257 } else if (ret < 0) { 384 }
258 /*
259 * We don't allow kprobes on mtmsr(d)/rfi(d), etc.
260 * So, we should never get here... but, its still
261 * good to catch them, just in case...
262 */
263 printk("Can't step on instruction %x\n", insn);
264 BUG();
265 } else if (ret == 0)
266 /* This instruction can't be boosted */
267 p->ainsn.boostable = -1;
268 } 385 }
269 prepare_singlestep(p, regs); 386 prepare_singlestep(p, regs);
270 kcb->kprobe_status = KPROBE_HIT_SS; 387 kcb->kprobe_status = KPROBE_HIT_SS;
@@ -274,6 +391,7 @@ no_kprobe:
274 preempt_enable_no_resched(); 391 preempt_enable_no_resched();
275 return ret; 392 return ret;
276} 393}
394NOKPROBE_SYMBOL(kprobe_handler);
277 395
278/* 396/*
279 * Function return probe trampoline: 397 * Function return probe trampoline:
@@ -291,8 +409,7 @@ asm(".global kretprobe_trampoline\n"
291/* 409/*
292 * Called when the probe at kretprobe trampoline is hit 410 * Called when the probe at kretprobe trampoline is hit
293 */ 411 */
294static int __kprobes trampoline_probe_handler(struct kprobe *p, 412static int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
295 struct pt_regs *regs)
296{ 413{
297 struct kretprobe_instance *ri = NULL; 414 struct kretprobe_instance *ri = NULL;
298 struct hlist_head *head, empty_rp; 415 struct hlist_head *head, empty_rp;
@@ -361,6 +478,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p,
361 */ 478 */
362 return 1; 479 return 1;
363} 480}
481NOKPROBE_SYMBOL(trampoline_probe_handler);
364 482
365/* 483/*
366 * Called after single-stepping. p->addr is the address of the 484 * Called after single-stepping. p->addr is the address of the
@@ -370,7 +488,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p,
370 * single-stepped a copy of the instruction. The address of this 488 * single-stepped a copy of the instruction. The address of this
371 * copy is p->ainsn.insn. 489 * copy is p->ainsn.insn.
372 */ 490 */
373int __kprobes kprobe_post_handler(struct pt_regs *regs) 491int kprobe_post_handler(struct pt_regs *regs)
374{ 492{
375 struct kprobe *cur = kprobe_running(); 493 struct kprobe *cur = kprobe_running();
376 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); 494 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
@@ -410,8 +528,9 @@ out:
410 528
411 return 1; 529 return 1;
412} 530}
531NOKPROBE_SYMBOL(kprobe_post_handler);
413 532
414int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) 533int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
415{ 534{
416 struct kprobe *cur = kprobe_running(); 535 struct kprobe *cur = kprobe_running();
417 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); 536 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
@@ -474,13 +593,15 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
474 } 593 }
475 return 0; 594 return 0;
476} 595}
596NOKPROBE_SYMBOL(kprobe_fault_handler);
477 597
478unsigned long arch_deref_entry_point(void *entry) 598unsigned long arch_deref_entry_point(void *entry)
479{ 599{
480 return ppc_global_function_entry(entry); 600 return ppc_global_function_entry(entry);
481} 601}
602NOKPROBE_SYMBOL(arch_deref_entry_point);
482 603
483int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) 604int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
484{ 605{
485 struct jprobe *jp = container_of(p, struct jprobe, kp); 606 struct jprobe *jp = container_of(p, struct jprobe, kp);
486 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); 607 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
@@ -497,17 +618,20 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
497 618
498 return 1; 619 return 1;
499} 620}
621NOKPROBE_SYMBOL(setjmp_pre_handler);
500 622
501void __used __kprobes jprobe_return(void) 623void __used jprobe_return(void)
502{ 624{
503 asm volatile("trap" ::: "memory"); 625 asm volatile("trap" ::: "memory");
504} 626}
627NOKPROBE_SYMBOL(jprobe_return);
505 628
506static void __used __kprobes jprobe_return_end(void) 629static void __used jprobe_return_end(void)
507{ 630{
508}; 631}
632NOKPROBE_SYMBOL(jprobe_return_end);
509 633
510int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) 634int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
511{ 635{
512 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); 636 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
513 637
@@ -520,6 +644,7 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
520 preempt_enable_no_resched(); 644 preempt_enable_no_resched();
521 return 1; 645 return 1;
522} 646}
647NOKPROBE_SYMBOL(longjmp_break_handler);
523 648
524static struct kprobe trampoline_p = { 649static struct kprobe trampoline_p = {
525 .addr = (kprobe_opcode_t *) &kretprobe_trampoline, 650 .addr = (kprobe_opcode_t *) &kretprobe_trampoline,
@@ -531,10 +656,11 @@ int __init arch_init_kprobes(void)
531 return register_kprobe(&trampoline_p); 656 return register_kprobe(&trampoline_p);
532} 657}
533 658
534int __kprobes arch_trampoline_kprobe(struct kprobe *p) 659int arch_trampoline_kprobe(struct kprobe *p)
535{ 660{
536 if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline) 661 if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline)
537 return 1; 662 return 1;
538 663
539 return 0; 664 return 0;
540} 665}
666NOKPROBE_SYMBOL(arch_trampoline_kprobe);
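
Note: the ELFv1 dot-symbol handling in kprobe_lookup_name() above is easy to exercise in isolation. The standalone userspace sketch below reproduces only the name-mangling step (the kallsyms lookups are omitted); the helper name make_dot_name() and the buffer sizes are made up for illustration and are not kernel API.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define MODULE_NAME_LEN 56	/* illustrative sizes, not the kernel's exact values */
#define KSYM_NAME_LEN   128

/*
 * Build the ABIv1 "dot" form of a symbol name, handling the
 * <module:symbol> format, mirroring the logic in the diff above.
 * Returns true if a dot was prepended, so a caller would know to
 * retry the undecorated name when the dotted lookup fails.
 */
static bool make_dot_name(const char *name, char *dot_name, size_t len)
{
	const char *modsym = strchr(name, ':');
	bool dot_appended = false;

	if (modsym) {
		modsym++;
		if (*modsym != '\0' && *modsym != '.') {
			/* Convert <module:symbol> to <module:.symbol> */
			size_t modlen = modsym - name;

			memcpy(dot_name, name, modlen);
			dot_name[modlen] = '.';
			dot_name[modlen + 1] = '\0';
			strncat(dot_name, modsym, len - modlen - 2);
			dot_appended = true;
		} else {
			dot_name[0] = '\0';
			strncat(dot_name, name, len - 1);
		}
	} else if (name[0] != '.') {
		dot_name[0] = '.';
		dot_name[1] = '\0';
		strncat(dot_name, name, len - 2);
		dot_appended = true;
	} else {
		dot_name[0] = '\0';
		strncat(dot_name, name, len - 1);
	}
	return dot_appended;
}

int main(void)
{
	char buf[MODULE_NAME_LEN + 1 + KSYM_NAME_LEN];
	const char *tests[] = { "schedule", ".schedule", "ext4:ext4_sync_fs" };

	for (size_t i = 0; i < sizeof(tests) / sizeof(tests[0]); i++) {
		bool appended = make_dot_name(tests[i], buf, sizeof(buf));

		printf("%-20s -> %-22s (dot appended: %s)\n",
		       tests[i], buf, appended ? "yes" : "no");
	}
	return 0;
}
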
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index a1475e6aef3a..5f9eada3519b 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -221,6 +221,8 @@ static void machine_check_process_queued_event(struct irq_work *work)
 {
 	int index;
 
+	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
 	/*
 	 * For now just print it to console.
 	 * TODO: log this error event to FSP or nvram.
@@ -228,12 +230,13 @@ static void machine_check_process_queued_event(struct irq_work *work)
 	while (__this_cpu_read(mce_queue_count) > 0) {
 		index = __this_cpu_read(mce_queue_count) - 1;
 		machine_check_print_event_info(
-				this_cpu_ptr(&mce_event_queue[index]));
+				this_cpu_ptr(&mce_event_queue[index]), false);
 		__this_cpu_dec(mce_queue_count);
 	}
 }
 
-void machine_check_print_event_info(struct machine_check_event *evt)
+void machine_check_print_event_info(struct machine_check_event *evt,
+				    bool user_mode)
 {
 	const char *level, *sevstr, *subtype;
 	static const char *mc_ue_types[] = {
@@ -310,7 +313,16 @@ void machine_check_print_event_info(struct machine_check_event *evt)
 
 	printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
 		evt->disposition == MCE_DISPOSITION_RECOVERED ?
-		"Recovered" : "[Not recovered");
+		"Recovered" : "Not recovered");
+
+	if (user_mode) {
+		printk("%s NIP: [%016llx] PID: %d Comm: %s\n", level,
+			evt->srr0, current->pid, current->comm);
+	} else {
+		printk("%s NIP [%016llx]: %pS\n", level, evt->srr0,
+			(void *)evt->srr0);
+	}
+
 	printk("%s Initiator: %s\n", level,
 		evt->initiator == MCE_INITIATOR_CPU ? "CPU" : "Unknown");
 	switch (evt->error_type) {
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index 763d6f58caa8..f913139bb0c2 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -72,10 +72,14 @@ void __flush_tlb_power8(unsigned int action)
 
 void __flush_tlb_power9(unsigned int action)
 {
+	unsigned int num_sets;
+
 	if (radix_enabled())
-		flush_tlb_206(POWER9_TLB_SETS_RADIX, action);
+		num_sets = POWER9_TLB_SETS_RADIX;
+	else
+		num_sets = POWER9_TLB_SETS_HASH;
 
-	flush_tlb_206(POWER9_TLB_SETS_HASH, action);
+	flush_tlb_206(num_sets, action);
 }
 
 
@@ -147,159 +151,365 @@ static int mce_flush(int what)
147 return 0; 151 return 0;
148} 152}
149 153
150static int mce_handle_flush_derrors(uint64_t dsisr, uint64_t slb, uint64_t tlb, uint64_t erat) 154#define SRR1_MC_LOADSTORE(srr1) ((srr1) & PPC_BIT(42))
151{ 155
152 if ((dsisr & slb) && mce_flush(MCE_FLUSH_SLB)) 156struct mce_ierror_table {
153 dsisr &= ~slb; 157 unsigned long srr1_mask;
154 if ((dsisr & erat) && mce_flush(MCE_FLUSH_ERAT)) 158 unsigned long srr1_value;
155 dsisr &= ~erat; 159 bool nip_valid; /* nip is a valid indicator of faulting address */
156 if ((dsisr & tlb) && mce_flush(MCE_FLUSH_TLB)) 160 unsigned int error_type;
157 dsisr &= ~tlb; 161 unsigned int error_subtype;
158 /* Any other errors we don't understand? */ 162 unsigned int initiator;
159 if (dsisr) 163 unsigned int severity;
160 return 0; 164};
161 return 1; 165
162} 166static const struct mce_ierror_table mce_p7_ierror_table[] = {
163 167{ 0x00000000001c0000, 0x0000000000040000, true,
164static long mce_handle_derror(uint64_t dsisr, uint64_t slb_error_bits) 168 MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH,
169 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
170{ 0x00000000001c0000, 0x0000000000080000, true,
171 MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
172 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
173{ 0x00000000001c0000, 0x00000000000c0000, true,
174 MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
175 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
176{ 0x00000000001c0000, 0x0000000000100000, true,
177 MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_INDETERMINATE, /* BOTH */
178 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
179{ 0x00000000001c0000, 0x0000000000140000, true,
180 MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT,
181 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
182{ 0x00000000001c0000, 0x0000000000180000, true,
183 MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH,
184 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
185{ 0x00000000001c0000, 0x00000000001c0000, true,
186 MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH,
187 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
188{ 0, 0, 0, 0, 0, 0 } };
189
190static const struct mce_ierror_table mce_p8_ierror_table[] = {
191{ 0x00000000081c0000, 0x0000000000040000, true,
192 MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH,
193 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
194{ 0x00000000081c0000, 0x0000000000080000, true,
195 MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
196 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
197{ 0x00000000081c0000, 0x00000000000c0000, true,
198 MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
199 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
200{ 0x00000000081c0000, 0x0000000000100000, true,
201 MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT,
202 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
203{ 0x00000000081c0000, 0x0000000000140000, true,
204 MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT,
205 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
206{ 0x00000000081c0000, 0x0000000000180000, true,
207 MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH,
208 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
209{ 0x00000000081c0000, 0x00000000001c0000, true,
210 MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH,
211 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
212{ 0x00000000081c0000, 0x0000000008000000, true,
213 MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT,
214 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
215{ 0x00000000081c0000, 0x0000000008040000, true,
216 MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT,
217 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
218{ 0, 0, 0, 0, 0, 0 } };
219
220static const struct mce_ierror_table mce_p9_ierror_table[] = {
221{ 0x00000000081c0000, 0x0000000000040000, true,
222 MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH,
223 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
224{ 0x00000000081c0000, 0x0000000000080000, true,
225 MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
226 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
227{ 0x00000000081c0000, 0x00000000000c0000, true,
228 MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
229 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
230{ 0x00000000081c0000, 0x0000000000100000, true,
231 MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT,
232 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
233{ 0x00000000081c0000, 0x0000000000140000, true,
234 MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT,
235 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
236{ 0x00000000081c0000, 0x0000000000180000, true,
237 MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH,
238 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
239{ 0x00000000081c0000, 0x0000000008000000, true,
240 MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT,
241 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
242{ 0x00000000081c0000, 0x0000000008040000, true,
243 MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT,
244 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
245{ 0x00000000081c0000, 0x00000000080c0000, true,
246 MCE_ERROR_TYPE_RA, MCE_RA_ERROR_IFETCH,
247 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
248{ 0x00000000081c0000, 0x0000000008100000, true,
249 MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH,
250 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
251{ 0x00000000081c0000, 0x0000000008140000, false,
252 MCE_ERROR_TYPE_RA, MCE_RA_ERROR_STORE,
253 MCE_INITIATOR_CPU, MCE_SEV_FATAL, }, /* ASYNC is fatal */
254{ 0x00000000081c0000, 0x0000000008180000, false,
255 MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_STORE_TIMEOUT,
256 MCE_INITIATOR_CPU, MCE_SEV_FATAL, }, /* ASYNC is fatal */
257{ 0x00000000081c0000, 0x00000000081c0000, true,
258 MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH_FOREIGN,
259 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
260{ 0, 0, 0, 0, 0, 0 } };
261
262struct mce_derror_table {
263 unsigned long dsisr_value;
264 bool dar_valid; /* dar is a valid indicator of faulting address */
265 unsigned int error_type;
266 unsigned int error_subtype;
267 unsigned int initiator;
268 unsigned int severity;
269};
270
271static const struct mce_derror_table mce_p7_derror_table[] = {
272{ 0x00008000, false,
273 MCE_ERROR_TYPE_UE, MCE_UE_ERROR_LOAD_STORE,
274 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
275{ 0x00004000, true,
276 MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE,
277 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
278{ 0x00000800, true,
279 MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT,
280 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
281{ 0x00000400, true,
282 MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT,
283 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
284{ 0x00000100, true,
285 MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
286 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
287{ 0x00000080, true,
288 MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
289 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
290{ 0x00000040, true,
291 MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_INDETERMINATE, /* BOTH */
292 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
293{ 0, false, 0, 0, 0, 0 } };
294
295static const struct mce_derror_table mce_p8_derror_table[] = {
296{ 0x00008000, false,
297 MCE_ERROR_TYPE_UE, MCE_UE_ERROR_LOAD_STORE,
298 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
299{ 0x00004000, true,
300 MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE,
301 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
302{ 0x00002000, true,
303 MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_LOAD_TIMEOUT,
304 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
305{ 0x00001000, true,
306 MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT,
307 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
308{ 0x00000800, true,
309 MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT,
310 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
311{ 0x00000400, true,
312 MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT,
313 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
314{ 0x00000200, true,
315 MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, /* SECONDARY ERAT */
316 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
317{ 0x00000100, true,
318 MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
319 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
320{ 0x00000080, true,
321 MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
322 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
323{ 0, false, 0, 0, 0, 0 } };
324
325static const struct mce_derror_table mce_p9_derror_table[] = {
326{ 0x00008000, false,
327 MCE_ERROR_TYPE_UE, MCE_UE_ERROR_LOAD_STORE,
328 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
329{ 0x00004000, true,
330 MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE,
331 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
332{ 0x00002000, true,
333 MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_LOAD_TIMEOUT,
334 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
335{ 0x00001000, true,
336 MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT,
337 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
338{ 0x00000800, true,
339 MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT,
340 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
341{ 0x00000400, true,
342 MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT,
343 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
344{ 0x00000200, false,
345 MCE_ERROR_TYPE_USER, MCE_USER_ERROR_TLBIE,
346 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
347{ 0x00000100, true,
348 MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
349 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
350{ 0x00000080, true,
351 MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
352 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
353{ 0x00000040, true,
354 MCE_ERROR_TYPE_RA, MCE_RA_ERROR_LOAD,
355 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
356{ 0x00000020, false,
357 MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE,
358 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
359{ 0x00000010, false,
360 MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN,
361 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
362{ 0x00000008, false,
363 MCE_ERROR_TYPE_RA, MCE_RA_ERROR_LOAD_STORE_FOREIGN,
364 MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
365{ 0, false, 0, 0, 0, 0 } };
366
367static int mce_handle_ierror(struct pt_regs *regs,
368 const struct mce_ierror_table table[],
369 struct mce_error_info *mce_err, uint64_t *addr)
165{ 370{
166 long handled = 1; 371 uint64_t srr1 = regs->msr;
372 int handled = 0;
373 int i;
374
375 *addr = 0;
376
377 for (i = 0; table[i].srr1_mask; i++) {
378 if ((srr1 & table[i].srr1_mask) != table[i].srr1_value)
379 continue;
380
381 /* attempt to correct the error */
382 switch (table[i].error_type) {
383 case MCE_ERROR_TYPE_SLB:
384 handled = mce_flush(MCE_FLUSH_SLB);
385 break;
386 case MCE_ERROR_TYPE_ERAT:
387 handled = mce_flush(MCE_FLUSH_ERAT);
388 break;
389 case MCE_ERROR_TYPE_TLB:
390 handled = mce_flush(MCE_FLUSH_TLB);
391 break;
392 }
167 393
168 /* 394 /* now fill in mce_error_info */
169 * flush and reload SLBs for SLB errors and flush TLBs for TLB errors. 395 mce_err->error_type = table[i].error_type;
170 * reset the error bits whenever we handle them so that at the end 396 switch (table[i].error_type) {
171 * we can check whether we handled all of them or not. 397 case MCE_ERROR_TYPE_UE:
172 * */ 398 mce_err->u.ue_error_type = table[i].error_subtype;
173#ifdef CONFIG_PPC_STD_MMU_64 399 break;
174 if (dsisr & slb_error_bits) { 400 case MCE_ERROR_TYPE_SLB:
175 flush_and_reload_slb(); 401 mce_err->u.slb_error_type = table[i].error_subtype;
176 /* reset error bits */ 402 break;
177 dsisr &= ~(slb_error_bits); 403 case MCE_ERROR_TYPE_ERAT:
178 } 404 mce_err->u.erat_error_type = table[i].error_subtype;
179 if (dsisr & P7_DSISR_MC_TLB_MULTIHIT_MFTLB) { 405 break;
180 if (cur_cpu_spec && cur_cpu_spec->flush_tlb) 406 case MCE_ERROR_TYPE_TLB:
181 cur_cpu_spec->flush_tlb(TLB_INVAL_SCOPE_GLOBAL); 407 mce_err->u.tlb_error_type = table[i].error_subtype;
182 /* reset error bits */ 408 break;
183 dsisr &= ~P7_DSISR_MC_TLB_MULTIHIT_MFTLB; 409 case MCE_ERROR_TYPE_USER:
410 mce_err->u.user_error_type = table[i].error_subtype;
411 break;
412 case MCE_ERROR_TYPE_RA:
413 mce_err->u.ra_error_type = table[i].error_subtype;
414 break;
415 case MCE_ERROR_TYPE_LINK:
416 mce_err->u.link_error_type = table[i].error_subtype;
417 break;
418 }
419 mce_err->severity = table[i].severity;
420 mce_err->initiator = table[i].initiator;
421 if (table[i].nip_valid)
422 *addr = regs->nip;
423 return handled;
184 } 424 }
185#endif
186 /* Any other errors we don't understand? */
187 if (dsisr & 0xffffffffUL)
188 handled = 0;
189 425
190 return handled; 426 mce_err->error_type = MCE_ERROR_TYPE_UNKNOWN;
191} 427 mce_err->severity = MCE_SEV_ERROR_SYNC;
428 mce_err->initiator = MCE_INITIATOR_CPU;
192 429
193static long mce_handle_derror_p7(uint64_t dsisr) 430 return 0;
194{
195 return mce_handle_derror(dsisr, P7_DSISR_MC_SLB_ERRORS);
196} 431}
197 432
198static long mce_handle_common_ierror(uint64_t srr1) 433static int mce_handle_derror(struct pt_regs *regs,
434 const struct mce_derror_table table[],
435 struct mce_error_info *mce_err, uint64_t *addr)
199{ 436{
200 long handled = 0; 437 uint64_t dsisr = regs->dsisr;
201 438 int handled = 0;
202 switch (P7_SRR1_MC_IFETCH(srr1)) { 439 int found = 0;
203 case 0: 440 int i;
204 break; 441
205#ifdef CONFIG_PPC_STD_MMU_64 442 *addr = 0;
206 case P7_SRR1_MC_IFETCH_SLB_PARITY: 443
207 case P7_SRR1_MC_IFETCH_SLB_MULTIHIT: 444 for (i = 0; table[i].dsisr_value; i++) {
208 /* flush and reload SLBs for SLB errors. */ 445 if (!(dsisr & table[i].dsisr_value))
209 flush_and_reload_slb(); 446 continue;
210 handled = 1; 447
211 break; 448 /* attempt to correct the error */
212 case P7_SRR1_MC_IFETCH_TLB_MULTIHIT: 449 switch (table[i].error_type) {
213 if (cur_cpu_spec && cur_cpu_spec->flush_tlb) { 450 case MCE_ERROR_TYPE_SLB:
214 cur_cpu_spec->flush_tlb(TLB_INVAL_SCOPE_GLOBAL); 451 if (mce_flush(MCE_FLUSH_SLB))
215 handled = 1; 452 handled = 1;
453 break;
454 case MCE_ERROR_TYPE_ERAT:
455 if (mce_flush(MCE_FLUSH_ERAT))
456 handled = 1;
457 break;
458 case MCE_ERROR_TYPE_TLB:
459 if (mce_flush(MCE_FLUSH_TLB))
460 handled = 1;
461 break;
216 } 462 }
217 break;
218#endif
219 default:
220 break;
221 }
222 463
223 return handled; 464 /*
224} 465 * Attempt to handle multiple conditions, but only return
225 466 * one. Ensure uncorrectable errors are first in the table
226static long mce_handle_ierror_p7(uint64_t srr1) 467 * to match.
227{ 468 */
228 long handled = 0; 469 if (found)
229 470 continue;
230 handled = mce_handle_common_ierror(srr1); 471
472 /* now fill in mce_error_info */
473 mce_err->error_type = table[i].error_type;
474 switch (table[i].error_type) {
475 case MCE_ERROR_TYPE_UE:
476 mce_err->u.ue_error_type = table[i].error_subtype;
477 break;
478 case MCE_ERROR_TYPE_SLB:
479 mce_err->u.slb_error_type = table[i].error_subtype;
480 break;
481 case MCE_ERROR_TYPE_ERAT:
482 mce_err->u.erat_error_type = table[i].error_subtype;
483 break;
484 case MCE_ERROR_TYPE_TLB:
485 mce_err->u.tlb_error_type = table[i].error_subtype;
486 break;
487 case MCE_ERROR_TYPE_USER:
488 mce_err->u.user_error_type = table[i].error_subtype;
489 break;
490 case MCE_ERROR_TYPE_RA:
491 mce_err->u.ra_error_type = table[i].error_subtype;
492 break;
493 case MCE_ERROR_TYPE_LINK:
494 mce_err->u.link_error_type = table[i].error_subtype;
495 break;
496 }
497 mce_err->severity = table[i].severity;
498 mce_err->initiator = table[i].initiator;
499 if (table[i].dar_valid)
500 *addr = regs->dar;
231 501
232#ifdef CONFIG_PPC_STD_MMU_64 502 found = 1;
233 if (P7_SRR1_MC_IFETCH(srr1) == P7_SRR1_MC_IFETCH_SLB_BOTH) {
234 flush_and_reload_slb();
235 handled = 1;
236 } 503 }
237#endif
238 return handled;
239}
240 504
241static void mce_get_common_ierror(struct mce_error_info *mce_err, uint64_t srr1) 505 if (found)
242{ 506 return handled;
243 switch (P7_SRR1_MC_IFETCH(srr1)) {
244 case P7_SRR1_MC_IFETCH_SLB_PARITY:
245 mce_err->error_type = MCE_ERROR_TYPE_SLB;
246 mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY;
247 break;
248 case P7_SRR1_MC_IFETCH_SLB_MULTIHIT:
249 mce_err->error_type = MCE_ERROR_TYPE_SLB;
250 mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT;
251 break;
252 case P7_SRR1_MC_IFETCH_TLB_MULTIHIT:
253 mce_err->error_type = MCE_ERROR_TYPE_TLB;
254 mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT;
255 break;
256 case P7_SRR1_MC_IFETCH_UE:
257 case P7_SRR1_MC_IFETCH_UE_IFU_INTERNAL:
258 mce_err->error_type = MCE_ERROR_TYPE_UE;
259 mce_err->u.ue_error_type = MCE_UE_ERROR_IFETCH;
260 break;
261 case P7_SRR1_MC_IFETCH_UE_TLB_RELOAD:
262 mce_err->error_type = MCE_ERROR_TYPE_UE;
263 mce_err->u.ue_error_type =
264 MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH;
265 break;
266 }
267}
268 507
269static void mce_get_ierror_p7(struct mce_error_info *mce_err, uint64_t srr1) 508 mce_err->error_type = MCE_ERROR_TYPE_UNKNOWN;
270{ 509 mce_err->severity = MCE_SEV_ERROR_SYNC;
271 mce_get_common_ierror(mce_err, srr1); 510 mce_err->initiator = MCE_INITIATOR_CPU;
272 if (P7_SRR1_MC_IFETCH(srr1) == P7_SRR1_MC_IFETCH_SLB_BOTH) {
273 mce_err->error_type = MCE_ERROR_TYPE_SLB;
274 mce_err->u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE;
275 }
276}
277 511
278static void mce_get_derror_p7(struct mce_error_info *mce_err, uint64_t dsisr) 512 return 0;
279{
280 if (dsisr & P7_DSISR_MC_UE) {
281 mce_err->error_type = MCE_ERROR_TYPE_UE;
282 mce_err->u.ue_error_type = MCE_UE_ERROR_LOAD_STORE;
283 } else if (dsisr & P7_DSISR_MC_UE_TABLEWALK) {
284 mce_err->error_type = MCE_ERROR_TYPE_UE;
285 mce_err->u.ue_error_type =
286 MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE;
287 } else if (dsisr & P7_DSISR_MC_ERAT_MULTIHIT) {
288 mce_err->error_type = MCE_ERROR_TYPE_ERAT;
289 mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
290 } else if (dsisr & P7_DSISR_MC_SLB_MULTIHIT) {
291 mce_err->error_type = MCE_ERROR_TYPE_SLB;
292 mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT;
293 } else if (dsisr & P7_DSISR_MC_SLB_PARITY_MFSLB) {
294 mce_err->error_type = MCE_ERROR_TYPE_SLB;
295 mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY;
296 } else if (dsisr & P7_DSISR_MC_TLB_MULTIHIT_MFTLB) {
297 mce_err->error_type = MCE_ERROR_TYPE_TLB;
298 mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT;
299 } else if (dsisr & P7_DSISR_MC_SLB_MULTIHIT_PARITY) {
300 mce_err->error_type = MCE_ERROR_TYPE_SLB;
301 mce_err->u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE;
302 }
303} 513}
304 514
305static long mce_handle_ue_error(struct pt_regs *regs) 515static long mce_handle_ue_error(struct pt_regs *regs)
@@ -320,292 +530,42 @@ static long mce_handle_ue_error(struct pt_regs *regs)
320 return handled; 530 return handled;
321} 531}
322 532
323long __machine_check_early_realmode_p7(struct pt_regs *regs) 533static long mce_handle_error(struct pt_regs *regs,
534 const struct mce_derror_table dtable[],
535 const struct mce_ierror_table itable[])
324{ 536{
325 uint64_t srr1, nip, addr; 537 struct mce_error_info mce_err = { 0 };
326 long handled = 1; 538 uint64_t addr;
327 struct mce_error_info mce_error_info = { 0 }; 539 uint64_t srr1 = regs->msr;
328 540 long handled;
329 mce_error_info.severity = MCE_SEV_ERROR_SYNC;
330 mce_error_info.initiator = MCE_INITIATOR_CPU;
331
332 srr1 = regs->msr;
333 nip = regs->nip;
334 541
335 /* 542 if (SRR1_MC_LOADSTORE(srr1))
336 * Handle memory errors depending whether this was a load/store or 543 handled = mce_handle_derror(regs, dtable, &mce_err, &addr);
337 * ifetch exception. Also, populate the mce error_type and 544 else
338 * type-specific error_type from either SRR1 or DSISR, depending 545 handled = mce_handle_ierror(regs, itable, &mce_err, &addr);
339 * whether this was a load/store or ifetch exception
340 */
341 if (P7_SRR1_MC_LOADSTORE(srr1)) {
342 handled = mce_handle_derror_p7(regs->dsisr);
343 mce_get_derror_p7(&mce_error_info, regs->dsisr);
344 addr = regs->dar;
345 } else {
346 handled = mce_handle_ierror_p7(srr1);
347 mce_get_ierror_p7(&mce_error_info, srr1);
348 addr = regs->nip;
349 }
350 546
351 /* Handle UE error. */ 547 if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE)
352 if (mce_error_info.error_type == MCE_ERROR_TYPE_UE)
353 handled = mce_handle_ue_error(regs); 548 handled = mce_handle_ue_error(regs);
354 549
355 save_mce_event(regs, handled, &mce_error_info, nip, addr); 550 save_mce_event(regs, handled, &mce_err, regs->nip, addr);
356 return handled;
357}
358
359static void mce_get_ierror_p8(struct mce_error_info *mce_err, uint64_t srr1)
360{
361 mce_get_common_ierror(mce_err, srr1);
362 if (P7_SRR1_MC_IFETCH(srr1) == P8_SRR1_MC_IFETCH_ERAT_MULTIHIT) {
363 mce_err->error_type = MCE_ERROR_TYPE_ERAT;
364 mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
365 }
366}
367
368static void mce_get_derror_p8(struct mce_error_info *mce_err, uint64_t dsisr)
369{
370 mce_get_derror_p7(mce_err, dsisr);
371 if (dsisr & P8_DSISR_MC_ERAT_MULTIHIT_SEC) {
372 mce_err->error_type = MCE_ERROR_TYPE_ERAT;
373 mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
374 }
375}
376
377static long mce_handle_ierror_p8(uint64_t srr1)
378{
379 long handled = 0;
380 551
381 handled = mce_handle_common_ierror(srr1);
382
383#ifdef CONFIG_PPC_STD_MMU_64
384 if (P7_SRR1_MC_IFETCH(srr1) == P8_SRR1_MC_IFETCH_ERAT_MULTIHIT) {
385 flush_and_reload_slb();
386 handled = 1;
387 }
388#endif
389 return handled; 552 return handled;
390} 553}
391 554
392static long mce_handle_derror_p8(uint64_t dsisr) 555long __machine_check_early_realmode_p7(struct pt_regs *regs)
393{
394 return mce_handle_derror(dsisr, P8_DSISR_MC_SLB_ERRORS);
395}
396
397long __machine_check_early_realmode_p8(struct pt_regs *regs)
398{
399 uint64_t srr1, nip, addr;
400 long handled = 1;
401 struct mce_error_info mce_error_info = { 0 };
402
403 mce_error_info.severity = MCE_SEV_ERROR_SYNC;
404 mce_error_info.initiator = MCE_INITIATOR_CPU;
405
406 srr1 = regs->msr;
407 nip = regs->nip;
408
409 if (P7_SRR1_MC_LOADSTORE(srr1)) {
410 handled = mce_handle_derror_p8(regs->dsisr);
411 mce_get_derror_p8(&mce_error_info, regs->dsisr);
412 addr = regs->dar;
413 } else {
414 handled = mce_handle_ierror_p8(srr1);
415 mce_get_ierror_p8(&mce_error_info, srr1);
416 addr = regs->nip;
417 }
418
419 /* Handle UE error. */
420 if (mce_error_info.error_type == MCE_ERROR_TYPE_UE)
421 handled = mce_handle_ue_error(regs);
422
423 save_mce_event(regs, handled, &mce_error_info, nip, addr);
424 return handled;
425}
426
427static int mce_handle_derror_p9(struct pt_regs *regs)
428{
429 uint64_t dsisr = regs->dsisr;
430
431 return mce_handle_flush_derrors(dsisr,
432 P9_DSISR_MC_SLB_PARITY_MFSLB |
433 P9_DSISR_MC_SLB_MULTIHIT_MFSLB,
434
435 P9_DSISR_MC_TLB_MULTIHIT_MFTLB,
436
437 P9_DSISR_MC_ERAT_MULTIHIT);
438}
439
440static int mce_handle_ierror_p9(struct pt_regs *regs)
441{ 556{
442 uint64_t srr1 = regs->msr; 557 /* P7 DD1 leaves top bits of DSISR undefined */
558 regs->dsisr &= 0x0000ffff;
443 559
444 switch (P9_SRR1_MC_IFETCH(srr1)) { 560 return mce_handle_error(regs, mce_p7_derror_table, mce_p7_ierror_table);
445 case P9_SRR1_MC_IFETCH_SLB_PARITY:
446 case P9_SRR1_MC_IFETCH_SLB_MULTIHIT:
447 return mce_flush(MCE_FLUSH_SLB);
448 case P9_SRR1_MC_IFETCH_TLB_MULTIHIT:
449 return mce_flush(MCE_FLUSH_TLB);
450 case P9_SRR1_MC_IFETCH_ERAT_MULTIHIT:
451 return mce_flush(MCE_FLUSH_ERAT);
452 default:
453 return 0;
454 }
455} 561}
456 562
457static void mce_get_derror_p9(struct pt_regs *regs, 563long __machine_check_early_realmode_p8(struct pt_regs *regs)
458 struct mce_error_info *mce_err, uint64_t *addr)
459{
460 uint64_t dsisr = regs->dsisr;
461
462 mce_err->severity = MCE_SEV_ERROR_SYNC;
463 mce_err->initiator = MCE_INITIATOR_CPU;
464
465 if (dsisr & P9_DSISR_MC_USER_TLBIE)
466 *addr = regs->nip;
467 else
468 *addr = regs->dar;
469
470 if (dsisr & P9_DSISR_MC_UE) {
471 mce_err->error_type = MCE_ERROR_TYPE_UE;
472 mce_err->u.ue_error_type = MCE_UE_ERROR_LOAD_STORE;
473 } else if (dsisr & P9_DSISR_MC_UE_TABLEWALK) {
474 mce_err->error_type = MCE_ERROR_TYPE_UE;
475 mce_err->u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE;
476 } else if (dsisr & P9_DSISR_MC_LINK_LOAD_TIMEOUT) {
477 mce_err->error_type = MCE_ERROR_TYPE_LINK;
478 mce_err->u.link_error_type = MCE_LINK_ERROR_LOAD_TIMEOUT;
479 } else if (dsisr & P9_DSISR_MC_LINK_TABLEWALK_TIMEOUT) {
480 mce_err->error_type = MCE_ERROR_TYPE_LINK;
481 mce_err->u.link_error_type = MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT;
482 } else if (dsisr & P9_DSISR_MC_ERAT_MULTIHIT) {
483 mce_err->error_type = MCE_ERROR_TYPE_ERAT;
484 mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
485 } else if (dsisr & P9_DSISR_MC_TLB_MULTIHIT_MFTLB) {
486 mce_err->error_type = MCE_ERROR_TYPE_TLB;
487 mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT;
488 } else if (dsisr & P9_DSISR_MC_USER_TLBIE) {
489 mce_err->error_type = MCE_ERROR_TYPE_USER;
490 mce_err->u.user_error_type = MCE_USER_ERROR_TLBIE;
491 } else if (dsisr & P9_DSISR_MC_SLB_PARITY_MFSLB) {
492 mce_err->error_type = MCE_ERROR_TYPE_SLB;
493 mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY;
494 } else if (dsisr & P9_DSISR_MC_SLB_MULTIHIT_MFSLB) {
495 mce_err->error_type = MCE_ERROR_TYPE_SLB;
496 mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT;
497 } else if (dsisr & P9_DSISR_MC_RA_LOAD) {
498 mce_err->error_type = MCE_ERROR_TYPE_RA;
499 mce_err->u.ra_error_type = MCE_RA_ERROR_LOAD;
500 } else if (dsisr & P9_DSISR_MC_RA_TABLEWALK) {
501 mce_err->error_type = MCE_ERROR_TYPE_RA;
502 mce_err->u.ra_error_type = MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE;
503 } else if (dsisr & P9_DSISR_MC_RA_TABLEWALK_FOREIGN) {
504 mce_err->error_type = MCE_ERROR_TYPE_RA;
505 mce_err->u.ra_error_type = MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN;
506 } else if (dsisr & P9_DSISR_MC_RA_FOREIGN) {
507 mce_err->error_type = MCE_ERROR_TYPE_RA;
508 mce_err->u.ra_error_type = MCE_RA_ERROR_LOAD_STORE_FOREIGN;
509 }
510}
511
512static void mce_get_ierror_p9(struct pt_regs *regs,
513 struct mce_error_info *mce_err, uint64_t *addr)
514{ 564{
515 uint64_t srr1 = regs->msr; 565 return mce_handle_error(regs, mce_p8_derror_table, mce_p8_ierror_table);
516
517 switch (P9_SRR1_MC_IFETCH(srr1)) {
518 case P9_SRR1_MC_IFETCH_RA_ASYNC_STORE:
519 case P9_SRR1_MC_IFETCH_LINK_ASYNC_STORE_TIMEOUT:
520 mce_err->severity = MCE_SEV_FATAL;
521 break;
522 default:
523 mce_err->severity = MCE_SEV_ERROR_SYNC;
524 break;
525 }
526
527 mce_err->initiator = MCE_INITIATOR_CPU;
528
529 *addr = regs->nip;
530
531 switch (P9_SRR1_MC_IFETCH(srr1)) {
532 case P9_SRR1_MC_IFETCH_UE:
533 mce_err->error_type = MCE_ERROR_TYPE_UE;
534 mce_err->u.ue_error_type = MCE_UE_ERROR_IFETCH;
535 break;
536 case P9_SRR1_MC_IFETCH_SLB_PARITY:
537 mce_err->error_type = MCE_ERROR_TYPE_SLB;
538 mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY;
539 break;
540 case P9_SRR1_MC_IFETCH_SLB_MULTIHIT:
541 mce_err->error_type = MCE_ERROR_TYPE_SLB;
542 mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT;
543 break;
544 case P9_SRR1_MC_IFETCH_ERAT_MULTIHIT:
545 mce_err->error_type = MCE_ERROR_TYPE_ERAT;
546 mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
547 break;
548 case P9_SRR1_MC_IFETCH_TLB_MULTIHIT:
549 mce_err->error_type = MCE_ERROR_TYPE_TLB;
550 mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT;
551 break;
552 case P9_SRR1_MC_IFETCH_UE_TLB_RELOAD:
553 mce_err->error_type = MCE_ERROR_TYPE_UE;
554 mce_err->u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH;
555 break;
556 case P9_SRR1_MC_IFETCH_LINK_TIMEOUT:
557 mce_err->error_type = MCE_ERROR_TYPE_LINK;
558 mce_err->u.link_error_type = MCE_LINK_ERROR_IFETCH_TIMEOUT;
559 break;
560 case P9_SRR1_MC_IFETCH_LINK_TABLEWALK_TIMEOUT:
561 mce_err->error_type = MCE_ERROR_TYPE_LINK;
562 mce_err->u.link_error_type = MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT;
563 break;
564 case P9_SRR1_MC_IFETCH_RA:
565 mce_err->error_type = MCE_ERROR_TYPE_RA;
566 mce_err->u.ra_error_type = MCE_RA_ERROR_IFETCH;
567 break;
568 case P9_SRR1_MC_IFETCH_RA_TABLEWALK:
569 mce_err->error_type = MCE_ERROR_TYPE_RA;
570 mce_err->u.ra_error_type = MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH;
571 break;
572 case P9_SRR1_MC_IFETCH_RA_ASYNC_STORE:
573 mce_err->error_type = MCE_ERROR_TYPE_RA;
574 mce_err->u.ra_error_type = MCE_RA_ERROR_STORE;
575 break;
576 case P9_SRR1_MC_IFETCH_LINK_ASYNC_STORE_TIMEOUT:
577 mce_err->error_type = MCE_ERROR_TYPE_LINK;
578 mce_err->u.link_error_type = MCE_LINK_ERROR_STORE_TIMEOUT;
579 break;
580 case P9_SRR1_MC_IFETCH_RA_TABLEWALK_FOREIGN:
581 mce_err->error_type = MCE_ERROR_TYPE_RA;
582 mce_err->u.ra_error_type = MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH_FOREIGN;
583 break;
584 default:
585 break;
586 }
587} 566}
588 567
589long __machine_check_early_realmode_p9(struct pt_regs *regs) 568long __machine_check_early_realmode_p9(struct pt_regs *regs)
590{ 569{
591 uint64_t nip, addr; 570 return mce_handle_error(regs, mce_p9_derror_table, mce_p9_ierror_table);
592 long handled;
593 struct mce_error_info mce_error_info = { 0 };
594
595 nip = regs->nip;
596
597 if (P9_SRR1_MC_LOADSTORE(regs->msr)) {
598 handled = mce_handle_derror_p9(regs);
599 mce_get_derror_p9(regs, &mce_error_info, &addr);
600 } else {
601 handled = mce_handle_ierror_p9(regs);
602 mce_get_ierror_p9(regs, &mce_error_info, &addr);
603 }
604
605 /* Handle UE error. */
606 if (mce_error_info.error_type == MCE_ERROR_TYPE_UE)
607 handled = mce_handle_ue_error(regs);
608
609 save_mce_event(regs, handled, &mce_error_info, nip, addr);
610 return handled;
611} 571}
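
Note: the mce_power.c rework above replaces the per-CPU if/else decoders with table-driven matching on masked SRR1/DSISR values. The toy userspace program below shows only the lookup pattern; it reproduces three rows of the POWER9 instruction-side table from the diff and stubs out the flush/recovery actions entirely.

#include <stdint.h>
#include <stdio.h>

/*
 * Minimal model of the table-driven machine-check decoding: each row
 * matches a masked SRR1 value and names the error.  Only a few rows are
 * reproduced here, and no recovery is attempted.
 */
struct ierror_row {
	uint64_t srr1_mask;
	uint64_t srr1_value;
	const char *desc;
};

static const struct ierror_row p9_ierror[] = {
	{ 0x00000000081c0000ULL, 0x0000000000080000ULL, "SLB parity" },
	{ 0x00000000081c0000ULL, 0x00000000000c0000ULL, "SLB multihit" },
	{ 0x00000000081c0000ULL, 0x0000000000140000ULL, "TLB multihit" },
	{ 0, 0, NULL }	/* sentinel row, as in the kernel tables */
};

static const char *decode_ierror(uint64_t srr1)
{
	for (int i = 0; p9_ierror[i].srr1_mask; i++) {
		if ((srr1 & p9_ierror[i].srr1_mask) != p9_ierror[i].srr1_value)
			continue;
		return p9_ierror[i].desc;
	}
	return "unknown";
}

int main(void)
{
	/* A fabricated SRR1 image with the SLB-multihit pattern set. */
	uint64_t srr1 = 0x00000000000c0000ULL;

	printf("SRR1 %#018llx decodes as: %s\n",
	       (unsigned long long)srr1, decode_ierror(srr1));
	return 0;
}
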
diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c
index 2282bf4e63cd..ec60ed0d4aad 100644
--- a/arch/powerpc/kernel/optprobes.c
+++ b/arch/powerpc/kernel/optprobes.c
@@ -243,10 +243,10 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
 	/*
 	 * 2. branch to optimized_callback() and emulate_step()
 	 */
-	kprobe_lookup_name("optimized_callback", op_callback_addr);
-	kprobe_lookup_name("emulate_step", emulate_step_addr);
+	op_callback_addr = (kprobe_opcode_t *)ppc_kallsyms_lookup_name("optimized_callback");
+	emulate_step_addr = (kprobe_opcode_t *)ppc_kallsyms_lookup_name("emulate_step");
 	if (!op_callback_addr || !emulate_step_addr) {
-		WARN(1, "kprobe_lookup_name() failed\n");
+		WARN(1, "Unable to lookup optimized_callback()/emulate_step()\n");
 		goto error;
 	}
 
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index dfc479df9634..8d63627e067f 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -245,3 +245,24 @@ void __init free_unused_pacas(void)
 
 	free_lppacas();
 }
+
+void copy_mm_to_paca(struct mm_struct *mm)
+{
+#ifdef CONFIG_PPC_BOOK3S
+	mm_context_t *context = &mm->context;
+
+	get_paca()->mm_ctx_id = context->id;
+#ifdef CONFIG_PPC_MM_SLICES
+	VM_BUG_ON(!mm->context.addr_limit);
+	get_paca()->addr_limit = mm->context.addr_limit;
+	get_paca()->mm_ctx_low_slices_psize = context->low_slices_psize;
+	memcpy(&get_paca()->mm_ctx_high_slices_psize,
+	       &context->high_slices_psize, TASK_SLICE_ARRAY_SZ(mm));
+#else /* CONFIG_PPC_MM_SLICES */
+	get_paca()->mm_ctx_user_psize = context->user_psize;
+	get_paca()->mm_ctx_sllp = context->sllp;
+#endif
+#else /* CONFIG_PPC_BOOK3S */
+	return;
+#endif
+}
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index f5d399e46193..d2f0afeae5a0 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -55,7 +55,6 @@
 #include <asm/kexec.h>
 #include <asm/opal.h>
 #include <asm/fadump.h>
-#include <asm/debug.h>
 #include <asm/epapr_hcalls.h>
 #include <asm/firmware.h>
 
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 1c1b44ec7642..dd8a04f3053a 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -815,7 +815,7 @@ struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = {
 	.virt_base = cpu_to_be32(0xffffffff),
 	.virt_size = cpu_to_be32(0xffffffff),
 	.load_base = cpu_to_be32(0xffffffff),
-	.min_rma = cpu_to_be32(256),		/* 256MB min RMA */
+	.min_rma = cpu_to_be32(512),		/* 512MB min RMA */
 	.min_load = cpu_to_be32(0xffffffff),	/* full client load */
 	.min_rma_percent = 0,	/* min RMA percentage of total RAM */
 	.max_pft_size = 48,	/* max log_2(hash table size) */
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 4697da895133..5c10b5925ac2 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -31,11 +31,11 @@
 #include <linux/unistd.h>
 #include <linux/serial.h>
 #include <linux/serial_8250.h>
-#include <linux/debugfs.h>
 #include <linux/percpu.h>
 #include <linux/memblock.h>
 #include <linux/of_platform.h>
 #include <linux/hugetlb.h>
+#include <asm/debugfs.h>
 #include <asm/io.h>
 #include <asm/paca.h>
 #include <asm/prom.h>
@@ -920,6 +920,15 @@ void __init setup_arch(char **cmdline_p)
 	init_mm.end_code = (unsigned long) _etext;
 	init_mm.end_data = (unsigned long) _edata;
 	init_mm.brk = klimit;
+
+#ifdef CONFIG_PPC_MM_SLICES
+#ifdef CONFIG_PPC64
+	init_mm.context.addr_limit = TASK_SIZE_128TB;
+#else
+#error "context.addr_limit not initialized."
+#endif
+#endif
+
 #ifdef CONFIG_PPC_64K_PAGES
 	init_mm.context.pte_frag = NULL;
 #endif
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index f997154dfc41..0d4dcaeaafcb 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -230,8 +230,8 @@ static void cpu_ready_for_interrupts(void)
 	 * If we are not in hypervisor mode the job is done once for
 	 * the whole partition in configure_exceptions().
 	 */
-	if (early_cpu_has_feature(CPU_FTR_HVMODE) &&
-	    early_cpu_has_feature(CPU_FTR_ARCH_207S)) {
+	if (cpu_has_feature(CPU_FTR_HVMODE) &&
+	    cpu_has_feature(CPU_FTR_ARCH_207S)) {
 		unsigned long lpcr = mfspr(SPRN_LPCR);
 		mtspr(SPRN_LPCR, lpcr | LPCR_AIL_3);
 	}
@@ -637,6 +637,11 @@ void __init emergency_stack_init(void)
 		paca[i].emergency_sp = (void *)ti + THREAD_SIZE;
 
 #ifdef CONFIG_PPC_BOOK3S_64
+		/* emergency stack for NMI exception handling. */
+		ti = __va(memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit));
+		klp_init_thread_info(ti);
+		paca[i].nmi_emergency_sp = (void *)ti + THREAD_SIZE;
+
 		/* emergency stack for machine check exception handling. */
 		ti = __va(memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit));
 		klp_init_thread_info(ti);
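
Note: the new NMI emergency stack above follows the same pattern as the existing machine-check stack: a THREAD_SIZE-aligned block is carved out per CPU and the saved stack pointer is set to its top, since powerpc stacks grow downward. A plain userspace illustration of that arithmetic, with aligned_alloc standing in for memblock_alloc_base and an illustrative THREAD_SIZE:

#include <stdio.h>
#include <stdlib.h>

#define THREAD_SIZE (16 * 1024)	/* illustrative; the real value depends on config */

int main(void)
{
	/* Stand-in for memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit). */
	void *ti = aligned_alloc(THREAD_SIZE, THREAD_SIZE);

	if (!ti)
		return 1;

	/* The usable stack pointer starts at the top of the block. */
	void *emergency_sp = (char *)ti + THREAD_SIZE;

	printf("stack block at %p, initial sp %p\n", ti, emergency_sp);
	free(ti);
	return 0;
}
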
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index d68ed1f004a3..df2a41647d8e 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -39,6 +39,7 @@
39#include <asm/irq.h> 39#include <asm/irq.h>
40#include <asm/hw_irq.h> 40#include <asm/hw_irq.h>
41#include <asm/kvm_ppc.h> 41#include <asm/kvm_ppc.h>
42#include <asm/dbell.h>
42#include <asm/page.h> 43#include <asm/page.h>
43#include <asm/pgtable.h> 44#include <asm/pgtable.h>
44#include <asm/prom.h> 45#include <asm/prom.h>
@@ -86,8 +87,6 @@ volatile unsigned int cpu_callin_map[NR_CPUS];
86 87
87int smt_enabled_at_boot = 1; 88int smt_enabled_at_boot = 1;
88 89
89static void (*crash_ipi_function_ptr)(struct pt_regs *) = NULL;
90
91/* 90/*
92 * Returns 1 if the specified cpu should be brought up during boot. 91 * Returns 1 if the specified cpu should be brought up during boot.
93 * Used to inhibit booting threads if they've been disabled or 92 * Used to inhibit booting threads if they've been disabled or
@@ -158,32 +157,33 @@ static irqreturn_t tick_broadcast_ipi_action(int irq, void *data)
158 return IRQ_HANDLED; 157 return IRQ_HANDLED;
159} 158}
160 159
161static irqreturn_t debug_ipi_action(int irq, void *data) 160#ifdef CONFIG_NMI_IPI
161static irqreturn_t nmi_ipi_action(int irq, void *data)
162{ 162{
163 if (crash_ipi_function_ptr) { 163 smp_handle_nmi_ipi(get_irq_regs());
164 crash_ipi_function_ptr(get_irq_regs());
165 return IRQ_HANDLED;
166 }
167
168#ifdef CONFIG_DEBUGGER
169 debugger_ipi(get_irq_regs());
170#endif /* CONFIG_DEBUGGER */
171
172 return IRQ_HANDLED; 164 return IRQ_HANDLED;
173} 165}
166#endif
174 167
175static irq_handler_t smp_ipi_action[] = { 168static irq_handler_t smp_ipi_action[] = {
176 [PPC_MSG_CALL_FUNCTION] = call_function_action, 169 [PPC_MSG_CALL_FUNCTION] = call_function_action,
177 [PPC_MSG_RESCHEDULE] = reschedule_action, 170 [PPC_MSG_RESCHEDULE] = reschedule_action,
178 [PPC_MSG_TICK_BROADCAST] = tick_broadcast_ipi_action, 171 [PPC_MSG_TICK_BROADCAST] = tick_broadcast_ipi_action,
179 [PPC_MSG_DEBUGGER_BREAK] = debug_ipi_action, 172#ifdef CONFIG_NMI_IPI
173 [PPC_MSG_NMI_IPI] = nmi_ipi_action,
174#endif
180}; 175};
181 176
177/*
178 * The NMI IPI is a fallback and not truly non-maskable. It is simpler
179 * than going through the call function infrastructure, and strongly
180 * serialized, so it is more appropriate for debugging.
181 */
182const char *smp_ipi_name[] = { 182const char *smp_ipi_name[] = {
183 [PPC_MSG_CALL_FUNCTION] = "ipi call function", 183 [PPC_MSG_CALL_FUNCTION] = "ipi call function",
184 [PPC_MSG_RESCHEDULE] = "ipi reschedule", 184 [PPC_MSG_RESCHEDULE] = "ipi reschedule",
185 [PPC_MSG_TICK_BROADCAST] = "ipi tick-broadcast", 185 [PPC_MSG_TICK_BROADCAST] = "ipi tick-broadcast",
186 [PPC_MSG_DEBUGGER_BREAK] = "ipi debugger", 186 [PPC_MSG_NMI_IPI] = "nmi ipi",
187}; 187};
188 188
189/* optional function to request ipi, for controllers with >= 4 ipis */ 189/* optional function to request ipi, for controllers with >= 4 ipis */
@@ -191,14 +191,13 @@ int smp_request_message_ipi(int virq, int msg)
191{ 191{
192 int err; 192 int err;
193 193
194 if (msg < 0 || msg > PPC_MSG_DEBUGGER_BREAK) { 194 if (msg < 0 || msg > PPC_MSG_NMI_IPI)
195 return -EINVAL; 195 return -EINVAL;
196 } 196#ifndef CONFIG_NMI_IPI
197#if !defined(CONFIG_DEBUGGER) && !defined(CONFIG_KEXEC_CORE) 197 if (msg == PPC_MSG_NMI_IPI)
198 if (msg == PPC_MSG_DEBUGGER_BREAK) {
199 return 1; 198 return 1;
200 }
201#endif 199#endif
200
202 err = request_irq(virq, smp_ipi_action[msg], 201 err = request_irq(virq, smp_ipi_action[msg],
203 IRQF_PERCPU | IRQF_NO_THREAD | IRQF_NO_SUSPEND, 202 IRQF_PERCPU | IRQF_NO_THREAD | IRQF_NO_SUSPEND,
204 smp_ipi_name[msg], NULL); 203 smp_ipi_name[msg], NULL);
@@ -211,17 +210,9 @@ int smp_request_message_ipi(int virq, int msg)
211#ifdef CONFIG_PPC_SMP_MUXED_IPI 210#ifdef CONFIG_PPC_SMP_MUXED_IPI
212struct cpu_messages { 211struct cpu_messages {
213 long messages; /* current messages */ 212 long messages; /* current messages */
214 unsigned long data; /* data for cause ipi */
215}; 213};
216static DEFINE_PER_CPU_SHARED_ALIGNED(struct cpu_messages, ipi_message); 214static DEFINE_PER_CPU_SHARED_ALIGNED(struct cpu_messages, ipi_message);
217 215
218void smp_muxed_ipi_set_data(int cpu, unsigned long data)
219{
220 struct cpu_messages *info = &per_cpu(ipi_message, cpu);
221
222 info->data = data;
223}
224
225void smp_muxed_ipi_set_message(int cpu, int msg) 216void smp_muxed_ipi_set_message(int cpu, int msg)
226{ 217{
227 struct cpu_messages *info = &per_cpu(ipi_message, cpu); 218 struct cpu_messages *info = &per_cpu(ipi_message, cpu);
@@ -236,14 +227,13 @@ void smp_muxed_ipi_set_message(int cpu, int msg)
236 227
237void smp_muxed_ipi_message_pass(int cpu, int msg) 228void smp_muxed_ipi_message_pass(int cpu, int msg)
238{ 229{
239 struct cpu_messages *info = &per_cpu(ipi_message, cpu);
240
241 smp_muxed_ipi_set_message(cpu, msg); 230 smp_muxed_ipi_set_message(cpu, msg);
231
242 /* 232 /*
243 * cause_ipi functions are required to include a full barrier 233 * cause_ipi functions are required to include a full barrier
244 * before doing whatever causes the IPI. 234 * before doing whatever causes the IPI.
245 */ 235 */
246 smp_ops->cause_ipi(cpu, info->data); 236 smp_ops->cause_ipi(cpu);
247} 237}
248 238
249#ifdef __BIG_ENDIAN__ 239#ifdef __BIG_ENDIAN__
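
[Editor's note: the hunk above drops the per-CPU "data" cookie, so smp_ops->cause_ipi() now takes only the target CPU, and the retained comment requires cause_ipi implementations to issue a full barrier before raising the interrupt, so the message bit set by smp_muxed_ipi_set_message() is visible to the target before it demuxes. The following is a hedged userspace analogue of that send path using C11 atomics; ipi_messages[], poke_cpu() and muxed_ipi_message_pass() are illustrative names, not the kernel symbols.]

#include <stdatomic.h>

#define NR_CPUS 8
#define IPI_MESSAGE(msg) (1UL << (msg))

static _Atomic unsigned long ipi_messages[NR_CPUS];

/* Stand-in for smp_ops->cause_ipi(): the real hook pokes the interrupt
 * controller; here it is a no-op. */
static void poke_cpu(int cpu) { (void)cpu; }

static void muxed_ipi_message_pass(int cpu, int msg)
{
        /* Set the message bit for the target CPU... */
        atomic_fetch_or_explicit(&ipi_messages[cpu], IPI_MESSAGE(msg),
                                 memory_order_relaxed);
        /* ...then a full barrier before raising the "interrupt", mirroring
         * the ordering the kernel comment demands of cause_ipi callers. */
        atomic_thread_fence(memory_order_seq_cst);
        poke_cpu(cpu);
}

int main(void)
{
        muxed_ipi_message_pass(1, 2);   /* e.g. a tick-broadcast message */
        return 0;
}
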
@@ -254,11 +244,18 @@ void smp_muxed_ipi_message_pass(int cpu, int msg)
254 244
255irqreturn_t smp_ipi_demux(void) 245irqreturn_t smp_ipi_demux(void)
256{ 246{
257 struct cpu_messages *info = this_cpu_ptr(&ipi_message);
258 unsigned long all;
259
260 mb(); /* order any irq clear */ 247 mb(); /* order any irq clear */
261 248
249 return smp_ipi_demux_relaxed();
250}
251
252/* sync-free variant. Callers should ensure synchronization */
253irqreturn_t smp_ipi_demux_relaxed(void)
254{
255 struct cpu_messages *info;
256 unsigned long all;
257
258 info = this_cpu_ptr(&ipi_message);
262 do { 259 do {
263 all = xchg(&info->messages, 0); 260 all = xchg(&info->messages, 0);
264#if defined(CONFIG_KVM_XICS) && defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE) 261#if defined(CONFIG_KVM_XICS) && defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE)
@@ -278,8 +275,10 @@ irqreturn_t smp_ipi_demux(void)
278 scheduler_ipi(); 275 scheduler_ipi();
279 if (all & IPI_MESSAGE(PPC_MSG_TICK_BROADCAST)) 276 if (all & IPI_MESSAGE(PPC_MSG_TICK_BROADCAST))
280 tick_broadcast_ipi_handler(); 277 tick_broadcast_ipi_handler();
281 if (all & IPI_MESSAGE(PPC_MSG_DEBUGGER_BREAK)) 278#ifdef CONFIG_NMI_IPI
282 debug_ipi_action(0, NULL); 279 if (all & IPI_MESSAGE(PPC_MSG_NMI_IPI))
280 nmi_ipi_action(0, NULL);
281#endif
283 } while (info->messages); 282 } while (info->messages);
284 283
285 return IRQ_HANDLED; 284 return IRQ_HANDLED;
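
[Editor's note: the receive side above drains every pending message bit with one atomic xchg and loops in case new bits arrive while handling the batch; splitting out smp_ipi_demux_relaxed() lets callers that already ordered the interrupt clear skip the mb(). A minimal userspace sketch of that drain loop follows; the PPC_MSG_* values and ipi_demux() name are reused only for illustration, the handlers are stand-in printouts.]

#include <stdatomic.h>
#include <stdio.h>

#define PPC_MSG_CALL_FUNCTION   0
#define PPC_MSG_RESCHEDULE      1
#define PPC_MSG_TICK_BROADCAST  2
#define PPC_MSG_NMI_IPI         3
#define IPI_MESSAGE(msg) (1UL << (msg))

static _Atomic unsigned long messages;  /* stand-in for this CPU's cpu_messages */

/* Take every pending message in one atomic swap, then loop in case new
 * bits were set while the previous batch was being handled. */
static void ipi_demux(void)
{
        unsigned long all;

        do {
                all = atomic_exchange(&messages, 0);
                if (all & IPI_MESSAGE(PPC_MSG_CALL_FUNCTION))
                        puts("call function");
                if (all & IPI_MESSAGE(PPC_MSG_RESCHEDULE))
                        puts("reschedule");
                if (all & IPI_MESSAGE(PPC_MSG_TICK_BROADCAST))
                        puts("tick broadcast");
                if (all & IPI_MESSAGE(PPC_MSG_NMI_IPI))
                        puts("nmi ipi");
        } while (atomic_load(&messages));
}

int main(void)
{
        atomic_fetch_or(&messages, IPI_MESSAGE(PPC_MSG_RESCHEDULE));
        atomic_fetch_or(&messages, IPI_MESSAGE(PPC_MSG_NMI_IPI));
        ipi_demux();
        return 0;
}
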
@@ -316,6 +315,187 @@ void arch_send_call_function_ipi_mask(const struct cpumask *mask)
316 do_message_pass(cpu, PPC_MSG_CALL_FUNCTION); 315 do_message_pass(cpu, PPC_MSG_CALL_FUNCTION);
317} 316}
318 317
318#ifdef CONFIG_NMI_IPI
319
320/*
321 * "NMI IPI" system.
322 *
323 * NMI IPIs may not be recoverable, so should not be used as ongoing part of
324 * a running system. They can be used for crash, debug, halt/reboot, etc.
325 *
326 * NMI IPIs are globally single threaded. No more than one in progress at
327 * any time.
328 *
329 * The IPI call waits with interrupts disabled until all targets enter the
330 * NMI handler, then the call returns.
331 *
332 * No new NMI can be initiated until targets exit the handler.
333 *
334 * The IPI call may time out without all targets entering the NMI handler.
335 * In that case, there is some logic to recover (and ignore subsequent
336 * NMI interrupts that may eventually be raised), but the platform interrupt
337 * handler may not be able to distinguish this from other exception causes,
338 * which may cause a crash.
339 */
340
341static atomic_t __nmi_ipi_lock = ATOMIC_INIT(0);
342static struct cpumask nmi_ipi_pending_mask;
343static int nmi_ipi_busy_count = 0;
344static void (*nmi_ipi_function)(struct pt_regs *) = NULL;
345
346static void nmi_ipi_lock_start(unsigned long *flags)
347{
348 raw_local_irq_save(*flags);
349 hard_irq_disable();
350 while (atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1) {
351 raw_local_irq_restore(*flags);
352 cpu_relax();
353 raw_local_irq_save(*flags);
354 hard_irq_disable();
355 }
356}
357
358static void nmi_ipi_lock(void)
359{
360 while (atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1)
361 cpu_relax();
362}
363
364static void nmi_ipi_unlock(void)
365{
366 smp_mb();
367 WARN_ON(atomic_read(&__nmi_ipi_lock) != 1);
368 atomic_set(&__nmi_ipi_lock, 0);
369}
370
371static void nmi_ipi_unlock_end(unsigned long *flags)
372{
373 nmi_ipi_unlock();
374 raw_local_irq_restore(*flags);
375}
376
377/*
378 * Platform NMI handler calls this to ack
379 */
380int smp_handle_nmi_ipi(struct pt_regs *regs)
381{
382 void (*fn)(struct pt_regs *);
383 unsigned long flags;
384 int me = raw_smp_processor_id();
385 int ret = 0;
386
387 /*
388 * Unexpected NMIs are possible here because the interrupt may not
389 * be able to distinguish NMI IPIs from other types of NMIs, or
390 * because the caller may have timed out.
391 */
392 nmi_ipi_lock_start(&flags);
393 if (!nmi_ipi_busy_count)
394 goto out;
395 if (!cpumask_test_cpu(me, &nmi_ipi_pending_mask))
396 goto out;
397
398 fn = nmi_ipi_function;
399 if (!fn)
400 goto out;
401
402 cpumask_clear_cpu(me, &nmi_ipi_pending_mask);
403 nmi_ipi_busy_count++;
404 nmi_ipi_unlock();
405
406 ret = 1;
407
408 fn(regs);
409
410 nmi_ipi_lock();
411 nmi_ipi_busy_count--;
412out:
413 nmi_ipi_unlock_end(&flags);
414
415 return ret;
416}
417
418static void do_smp_send_nmi_ipi(int cpu)
419{
420 if (smp_ops->cause_nmi_ipi && smp_ops->cause_nmi_ipi(cpu))
421 return;
422
423 if (cpu >= 0) {
424 do_message_pass(cpu, PPC_MSG_NMI_IPI);
425 } else {
426 int c;
427
428 for_each_online_cpu(c) {
429 if (c == raw_smp_processor_id())
430 continue;
431 do_message_pass(c, PPC_MSG_NMI_IPI);
432 }
433 }
434}
435
436/*
437 * - cpu is the target CPU (must not be this CPU), or NMI_IPI_ALL_OTHERS.
438 * - fn is the target callback function.
439 * - delay_us > 0 is the delay before giving up waiting for targets to
440 * enter the handler, == 0 specifies indefinite delay.
441 */
442static int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us)
443{
444 unsigned long flags;
445 int me = raw_smp_processor_id();
446 int ret = 1;
447
448 BUG_ON(cpu == me);
449 BUG_ON(cpu < 0 && cpu != NMI_IPI_ALL_OTHERS);
450
451 if (unlikely(!smp_ops))
452 return 0;
453
454 /* Take the nmi_ipi_busy count/lock with interrupts hard disabled */
455 nmi_ipi_lock_start(&flags);
456 while (nmi_ipi_busy_count) {
457 nmi_ipi_unlock_end(&flags);
458 cpu_relax();
459 nmi_ipi_lock_start(&flags);
460 }
461
462 nmi_ipi_function = fn;
463
464 if (cpu < 0) {
465 /* ALL_OTHERS */
466 cpumask_copy(&nmi_ipi_pending_mask, cpu_online_mask);
467 cpumask_clear_cpu(me, &nmi_ipi_pending_mask);
468 } else {
469 /* cpumask starts clear */
470 cpumask_set_cpu(cpu, &nmi_ipi_pending_mask);
471 }
472 nmi_ipi_busy_count++;
473 nmi_ipi_unlock();
474
475 do_smp_send_nmi_ipi(cpu);
476
477 while (!cpumask_empty(&nmi_ipi_pending_mask)) {
478 udelay(1);
479 if (delay_us) {
480 delay_us--;
481 if (!delay_us)
482 break;
483 }
484 }
485
486 nmi_ipi_lock();
487 if (!cpumask_empty(&nmi_ipi_pending_mask)) {
488 /* Could not gather all CPUs */
489 ret = 0;
490 cpumask_clear(&nmi_ipi_pending_mask);
491 }
492 nmi_ipi_busy_count--;
493 nmi_ipi_unlock_end(&flags);
494
495 return ret;
496}
497#endif /* CONFIG_NMI_IPI */
498
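
[Editor's note: the CONFIG_NMI_IPI block above builds the machinery out of a cmpxchg-based lock taken with interrupts hard disabled, a pending cpumask, a busy count, and a sender that waits, optionally with a microsecond timeout, for every target to enter the handler. Below is a hedged userspace sketch of the same rendezvous idea using C11 atomics and a bitmask in place of a cpumask; all names (nmi_lock_acquire, send_nmi_all_others, poke_cpu, ...) are illustrative and the main() only exercises the timeout path, since no real target threads exist here.]

#include <stdatomic.h>
#include <stdbool.h>
#include <unistd.h>

#define NCPU 4

static atomic_int nmi_lock;                 /* 0 = free, 1 = held (cmpxchg lock) */
static _Atomic unsigned long nmi_pending;   /* bit per "CPU" still to check in */
static void (*nmi_fn)(int cpu);

static void nmi_lock_acquire(void)
{
        int expected = 0;
        while (!atomic_compare_exchange_weak(&nmi_lock, &expected, 1))
                expected = 0;   /* lost the race; spin (the kernel also
                                 * briefly re-enables interrupts here) */
}

static void nmi_lock_release(void)
{
        atomic_store(&nmi_lock, 0);
}

/* Target side: ack our pending bit and run the callback, ignoring stale or
 * unexpected NMIs (cf. smp_handle_nmi_ipi()). */
static int handle_nmi(int cpu)
{
        nmi_lock_acquire();
        if (!(atomic_load(&nmi_pending) & (1UL << cpu))) {
                nmi_lock_release();
                return 0;
        }
        atomic_fetch_and(&nmi_pending, ~(1UL << cpu));
        nmi_lock_release();
        nmi_fn(cpu);
        return 1;
}

/* Sender side: publish the callback and pending mask, poke the targets, then
 * wait up to delay_us microseconds for everyone to check in
 * (cf. smp_send_nmi_ipi()). poke_cpu() is a stand-in for the real IPI. */
static void poke_cpu(int cpu) { (void)cpu; }

static bool send_nmi_all_others(int me, void (*fn)(int), unsigned long delay_us)
{
        nmi_lock_acquire();
        nmi_fn = fn;
        atomic_store(&nmi_pending, ((1UL << NCPU) - 1) & ~(1UL << me));
        nmi_lock_release();

        for (int cpu = 0; cpu < NCPU; cpu++)
                if (cpu != me)
                        poke_cpu(cpu);

        while (atomic_load(&nmi_pending)) {
                usleep(1);
                if (delay_us && --delay_us == 0)
                        break;
        }

        nmi_lock_acquire();
        bool ok = (atomic_load(&nmi_pending) == 0);
        atomic_store(&nmi_pending, 0);  /* give up on stragglers, as the
                                         * kernel does on timeout */
        nmi_lock_release();
        return ok;
}

static void callback(int cpu) { (void)cpu; }

int main(void)
{
        /* With no target threads, this times out and reports failure. */
        return send_nmi_all_others(0, callback, 1000) ? 0 : 1;
}
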
319#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST 499#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
320void tick_broadcast(const struct cpumask *mask) 500void tick_broadcast(const struct cpumask *mask)
321{ 501{
@@ -326,29 +506,22 @@ void tick_broadcast(const struct cpumask *mask)
326} 506}
327#endif 507#endif
328 508
329#if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC_CORE) 509#ifdef CONFIG_DEBUGGER
330void smp_send_debugger_break(void) 510void debugger_ipi_callback(struct pt_regs *regs)
331{ 511{
332 int cpu; 512 debugger_ipi(regs);
333 int me = raw_smp_processor_id(); 513}
334
335 if (unlikely(!smp_ops))
336 return;
337 514
338 for_each_online_cpu(cpu) 515void smp_send_debugger_break(void)
339 if (cpu != me) 516{
340 do_message_pass(cpu, PPC_MSG_DEBUGGER_BREAK); 517 smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, debugger_ipi_callback, 1000000);
341} 518}
342#endif 519#endif
343 520
344#ifdef CONFIG_KEXEC_CORE 521#ifdef CONFIG_KEXEC_CORE
345void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *)) 522void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *))
346{ 523{
347 crash_ipi_function_ptr = crash_ipi_callback; 524 smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, crash_ipi_callback, 1000000);
348 if (crash_ipi_callback) {
349 mb();
350 smp_send_debugger_break();
351 }
352} 525}
353#endif 526#endif
354 527
@@ -439,7 +612,21 @@ int generic_cpu_disable(void)
439#ifdef CONFIG_PPC64 612#ifdef CONFIG_PPC64
440 vdso_data->processorCount--; 613 vdso_data->processorCount--;
441#endif 614#endif
442 migrate_irqs(); 615 /* Update affinity of all IRQs previously aimed at this CPU */
616 irq_migrate_all_off_this_cpu();
617
618 /*
619 * Depending on the details of the interrupt controller, it's possible
620 * that one of the interrupts we just migrated away from this CPU is
621 * actually already pending on this CPU. If we leave it in that state
622 * the interrupt will never be EOI'ed, and will never fire again. So
623 * temporarily enable interrupts here, to allow any pending interrupt to
624 * be received (and EOI'ed), before we take this CPU offline.
625 */
626 local_irq_enable();
627 mdelay(1);
628 local_irq_disable();
629
443 return 0; 630 return 0;
444} 631}
445 632
@@ -521,6 +708,16 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle)
521 708
522 cpu_idle_thread_init(cpu, tidle); 709 cpu_idle_thread_init(cpu, tidle);
523 710
711 /*
712 * The platform might need to allocate resources prior to bringing
713 * up the CPU
714 */
715 if (smp_ops->prepare_cpu) {
716 rc = smp_ops->prepare_cpu(cpu);
717 if (rc)
718 return rc;
719 }
720
524 /* Make sure callin-map entry is 0 (can be leftover a CPU 721 /* Make sure callin-map entry is 0 (can be leftover a CPU
525 * hotplug 722 * hotplug
526 */ 723 */
diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c
index 66711958493c..d534ed901538 100644
--- a/arch/powerpc/kernel/stacktrace.c
+++ b/arch/powerpc/kernel/stacktrace.c
@@ -59,7 +59,14 @@ EXPORT_SYMBOL_GPL(save_stack_trace);
59 59
60void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) 60void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
61{ 61{
62 save_context_stack(trace, tsk->thread.ksp, tsk, 0); 62 unsigned long sp;
63
64 if (tsk == current)
65 sp = current_stack_pointer();
66 else
67 sp = tsk->thread.ksp;
68
69 save_context_stack(trace, sp, tsk, 0);
63} 70}
64EXPORT_SYMBOL_GPL(save_stack_trace_tsk); 71EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
65 72
diff --git a/arch/powerpc/kernel/swsusp.c b/arch/powerpc/kernel/swsusp.c
index 6ae9bd5086a4..0050b2d2ff7a 100644
--- a/arch/powerpc/kernel/swsusp.c
+++ b/arch/powerpc/kernel/swsusp.c
@@ -10,6 +10,7 @@
10 */ 10 */
11 11
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/suspend.h>
13#include <asm/current.h> 14#include <asm/current.h>
14#include <asm/mmu_context.h> 15#include <asm/mmu_context.h>
15#include <asm/switch_to.h> 16#include <asm/switch_to.h>
diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c
index de04c9fbb5cd..a877bf8269fe 100644
--- a/arch/powerpc/kernel/syscalls.c
+++ b/arch/powerpc/kernel/syscalls.c
@@ -42,11 +42,11 @@
42#include <asm/unistd.h> 42#include <asm/unistd.h>
43#include <asm/asm-prototypes.h> 43#include <asm/asm-prototypes.h>
44 44
45static inline unsigned long do_mmap2(unsigned long addr, size_t len, 45static inline long do_mmap2(unsigned long addr, size_t len,
46 unsigned long prot, unsigned long flags, 46 unsigned long prot, unsigned long flags,
47 unsigned long fd, unsigned long off, int shift) 47 unsigned long fd, unsigned long off, int shift)
48{ 48{
49 unsigned long ret = -EINVAL; 49 long ret = -EINVAL;
50 50
51 if (!arch_validate_prot(prot)) 51 if (!arch_validate_prot(prot))
52 goto out; 52 goto out;
@@ -62,16 +62,16 @@ out:
62 return ret; 62 return ret;
63} 63}
64 64
65unsigned long sys_mmap2(unsigned long addr, size_t len, 65SYSCALL_DEFINE6(mmap2, unsigned long, addr, size_t, len,
66 unsigned long prot, unsigned long flags, 66 unsigned long, prot, unsigned long, flags,
67 unsigned long fd, unsigned long pgoff) 67 unsigned long, fd, unsigned long, pgoff)
68{ 68{
69 return do_mmap2(addr, len, prot, flags, fd, pgoff, PAGE_SHIFT-12); 69 return do_mmap2(addr, len, prot, flags, fd, pgoff, PAGE_SHIFT-12);
70} 70}
71 71
72unsigned long sys_mmap(unsigned long addr, size_t len, 72SYSCALL_DEFINE6(mmap, unsigned long, addr, size_t, len,
73 unsigned long prot, unsigned long flags, 73 unsigned long, prot, unsigned long, flags,
74 unsigned long fd, off_t offset) 74 unsigned long, fd, off_t, offset)
75{ 75{
76 return do_mmap2(addr, len, prot, flags, fd, offset, PAGE_SHIFT); 76 return do_mmap2(addr, len, prot, flags, fd, offset, PAGE_SHIFT);
77} 77}
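
[Editor's note: besides the SYSCALL_DEFINE6 conversion and the switch to a signed long return so error values are not squeezed into an unsigned type, the interesting parameter above is "shift": mmap2 passes its offset in 4 KiB units, mmap in bytes, and do_mmap2() reduces both to a page offset after an alignment check. A hedged worked example of that arithmetic follows; it is plain userspace C, and PAGE_SHIFT of 16 (64 KiB pages, common on ppc64) is only an assumption for the numbers.]

#include <stdio.h>

#define PAGE_SHIFT 16                    /* assume a 64K-page kernel */

/* do_mmap2() rejects offsets not aligned to 1 << shift and then converts
 * them to a page offset with off >> shift (sketch of the same check). */
static long page_offset(unsigned long off, int shift)
{
        if (off & ((1UL << shift) - 1))
                return -1;               /* -EINVAL in the kernel */
        return (long)(off >> shift);
}

int main(void)
{
        unsigned long byte_off = 3UL << 20;              /* 3 MiB into the file */

        /* sys_mmap passes a byte offset, shift = PAGE_SHIFT */
        printf("mmap : pgoff = %ld\n", page_offset(byte_off, PAGE_SHIFT));

        /* sys_mmap2 passes the offset in 4K units, shift = PAGE_SHIFT - 12 */
        printf("mmap2: pgoff = %ld\n",
               page_offset(byte_off >> 12, PAGE_SHIFT - 12));

        /* Both print 48: 3 MiB is the 48th 64 KiB page. */
        return 0;
}
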
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index c1fb255a60d6..4437c70c7c2b 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -710,6 +710,10 @@ static int register_cpu_online(unsigned int cpu)
710 struct device_attribute *attrs, *pmc_attrs; 710 struct device_attribute *attrs, *pmc_attrs;
711 int i, nattrs; 711 int i, nattrs;
712 712
713 /* For cpus present at boot a reference was already grabbed in register_cpu() */
714 if (!s->of_node)
715 s->of_node = of_get_cpu_node(cpu, NULL);
716
713#ifdef CONFIG_PPC64 717#ifdef CONFIG_PPC64
714 if (cpu_has_feature(CPU_FTR_SMT)) 718 if (cpu_has_feature(CPU_FTR_SMT))
715 device_create_file(s, &dev_attr_smt_snooze_delay); 719 device_create_file(s, &dev_attr_smt_snooze_delay);
@@ -785,9 +789,9 @@ static int register_cpu_online(unsigned int cpu)
785 return 0; 789 return 0;
786} 790}
787 791
792#ifdef CONFIG_HOTPLUG_CPU
788static int unregister_cpu_online(unsigned int cpu) 793static int unregister_cpu_online(unsigned int cpu)
789{ 794{
790#ifdef CONFIG_HOTPLUG_CPU
791 struct cpu *c = &per_cpu(cpu_devices, cpu); 795 struct cpu *c = &per_cpu(cpu_devices, cpu);
792 struct device *s = &c->dev; 796 struct device *s = &c->dev;
793 struct device_attribute *attrs, *pmc_attrs; 797 struct device_attribute *attrs, *pmc_attrs;
@@ -864,9 +868,13 @@ static int unregister_cpu_online(unsigned int cpu)
864 } 868 }
865#endif 869#endif
866 cacheinfo_cpu_offline(cpu); 870 cacheinfo_cpu_offline(cpu);
867#endif /* CONFIG_HOTPLUG_CPU */ 871 of_node_put(s->of_node);
872 s->of_node = NULL;
868 return 0; 873 return 0;
869} 874}
875#else /* !CONFIG_HOTPLUG_CPU */
876#define unregister_cpu_online NULL
877#endif
870 878
871#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE 879#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
872ssize_t arch_cpu_probe(const char *buf, size_t count) 880ssize_t arch_cpu_probe(const char *buf, size_t count)
diff --git a/arch/powerpc/kernel/trace/Makefile b/arch/powerpc/kernel/trace/Makefile
new file mode 100644
index 000000000000..729dffc5f7bc
--- /dev/null
+++ b/arch/powerpc/kernel/trace/Makefile
@@ -0,0 +1,29 @@
1#
2# Makefile for the powerpc trace subsystem
3#
4
5subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
6
7ifdef CONFIG_FUNCTION_TRACER
8# do not trace tracer code
9CFLAGS_REMOVE_ftrace.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
10endif
11
12obj32-$(CONFIG_FUNCTION_TRACER) += ftrace_32.o
13obj64-$(CONFIG_FUNCTION_TRACER) += ftrace_64.o
14ifdef CONFIG_MPROFILE_KERNEL
15obj64-$(CONFIG_FUNCTION_TRACER) += ftrace_64_mprofile.o
16else
17obj64-$(CONFIG_FUNCTION_TRACER) += ftrace_64_pg.o
18endif
19obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
20obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
21obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
22obj-$(CONFIG_TRACING) += trace_clock.o
23
24obj-$(CONFIG_PPC64) += $(obj64-y)
25obj-$(CONFIG_PPC32) += $(obj32-y)
26
27# Disable GCOV & sanitizers in odd or sensitive code
28GCOV_PROFILE_ftrace.o := n
29UBSAN_SANITIZE_ftrace.o := n
diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c
index 5c9f50c1aa99..32509de6ce4c 100644
--- a/arch/powerpc/kernel/ftrace.c
+++ b/arch/powerpc/kernel/trace/ftrace.c
@@ -21,6 +21,7 @@
21#include <linux/init.h> 21#include <linux/init.h>
22#include <linux/list.h> 22#include <linux/list.h>
23 23
24#include <asm/asm-prototypes.h>
24#include <asm/cacheflush.h> 25#include <asm/cacheflush.h>
25#include <asm/code-patching.h> 26#include <asm/code-patching.h>
26#include <asm/ftrace.h> 27#include <asm/ftrace.h>
diff --git a/arch/powerpc/kernel/trace/ftrace_32.S b/arch/powerpc/kernel/trace/ftrace_32.S
new file mode 100644
index 000000000000..afef2c076282
--- /dev/null
+++ b/arch/powerpc/kernel/trace/ftrace_32.S
@@ -0,0 +1,118 @@
1/*
2 * Split from entry_32.S
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 */
9
10#include <linux/magic.h>
11#include <asm/reg.h>
12#include <asm/ppc_asm.h>
13#include <asm/asm-offsets.h>
14#include <asm/ftrace.h>
15#include <asm/export.h>
16
17#ifdef CONFIG_DYNAMIC_FTRACE
18_GLOBAL(mcount)
19_GLOBAL(_mcount)
20 /*
21 * It is required that _mcount on PPC32 must preserve the
22 * link register. But we have r0 to play with. We use r0
23 * to push the return address back to the caller of mcount
24 * into the ctr register, restore the link register and
25 * then jump back using the ctr register.
26 */
27 mflr r0
28 mtctr r0
29 lwz r0, 4(r1)
30 mtlr r0
31 bctr
32
33_GLOBAL(ftrace_caller)
34 MCOUNT_SAVE_FRAME
35 /* r3 ends up with link register */
36 subi r3, r3, MCOUNT_INSN_SIZE
37.globl ftrace_call
38ftrace_call:
39 bl ftrace_stub
40 nop
41#ifdef CONFIG_FUNCTION_GRAPH_TRACER
42.globl ftrace_graph_call
43ftrace_graph_call:
44 b ftrace_graph_stub
45_GLOBAL(ftrace_graph_stub)
46#endif
47 MCOUNT_RESTORE_FRAME
48 /* old link register ends up in ctr reg */
49 bctr
50#else
51_GLOBAL(mcount)
52_GLOBAL(_mcount)
53
54 MCOUNT_SAVE_FRAME
55
56 subi r3, r3, MCOUNT_INSN_SIZE
57 LOAD_REG_ADDR(r5, ftrace_trace_function)
58 lwz r5,0(r5)
59
60 mtctr r5
61 bctrl
62 nop
63
64#ifdef CONFIG_FUNCTION_GRAPH_TRACER
65 b ftrace_graph_caller
66#endif
67 MCOUNT_RESTORE_FRAME
68 bctr
69#endif
70EXPORT_SYMBOL(_mcount)
71
72_GLOBAL(ftrace_stub)
73 blr
74
75#ifdef CONFIG_FUNCTION_GRAPH_TRACER
76_GLOBAL(ftrace_graph_caller)
77 /* load r4 with local address */
78 lwz r4, 44(r1)
79 subi r4, r4, MCOUNT_INSN_SIZE
80
81 /* Grab the LR out of the caller stack frame */
82 lwz r3,52(r1)
83
84 bl prepare_ftrace_return
85 nop
86
87 /*
88 * prepare_ftrace_return gives us the address we divert to.
89 * Change the LR in the callers stack frame to this.
90 */
91 stw r3,52(r1)
92
93 MCOUNT_RESTORE_FRAME
94 /* old link register ends up in ctr reg */
95 bctr
96
97_GLOBAL(return_to_handler)
98 /* need to save return values */
99 stwu r1, -32(r1)
100 stw r3, 20(r1)
101 stw r4, 16(r1)
102 stw r31, 12(r1)
103 mr r31, r1
104
105 bl ftrace_return_to_handler
106 nop
107
108 /* return value has real return address */
109 mtlr r3
110
111 lwz r3, 20(r1)
112 lwz r4, 16(r1)
113 lwz r31,12(r1)
114 lwz r1, 0(r1)
115
116 /* Jump back to real return address */
117 blr
118#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/powerpc/kernel/trace/ftrace_64.S b/arch/powerpc/kernel/trace/ftrace_64.S
new file mode 100644
index 000000000000..e5ccea19821e
--- /dev/null
+++ b/arch/powerpc/kernel/trace/ftrace_64.S
@@ -0,0 +1,85 @@
1/*
2 * Split from entry_64.S
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 */
9
10#include <linux/magic.h>
11#include <asm/ppc_asm.h>
12#include <asm/asm-offsets.h>
13#include <asm/ftrace.h>
14#include <asm/ppc-opcode.h>
15#include <asm/export.h>
16
17#ifdef CONFIG_DYNAMIC_FTRACE
18_GLOBAL(mcount)
19_GLOBAL(_mcount)
20EXPORT_SYMBOL(_mcount)
21 mflr r12
22 mtctr r12
23 mtlr r0
24 bctr
25
26#else /* CONFIG_DYNAMIC_FTRACE */
27_GLOBAL_TOC(_mcount)
28EXPORT_SYMBOL(_mcount)
29 /* Taken from output of objdump from lib64/glibc */
30 mflr r3
31 ld r11, 0(r1)
32 stdu r1, -112(r1)
33 std r3, 128(r1)
34 ld r4, 16(r11)
35
36 subi r3, r3, MCOUNT_INSN_SIZE
37 LOAD_REG_ADDR(r5,ftrace_trace_function)
38 ld r5,0(r5)
39 ld r5,0(r5)
40 mtctr r5
41 bctrl
42 nop
43
44#ifdef CONFIG_FUNCTION_GRAPH_TRACER
45 b ftrace_graph_caller
46#endif
47 ld r0, 128(r1)
48 mtlr r0
49 addi r1, r1, 112
50_GLOBAL(ftrace_stub)
51 blr
52#endif /* CONFIG_DYNAMIC_FTRACE */
53
54#ifdef CONFIG_FUNCTION_GRAPH_TRACER
55_GLOBAL(return_to_handler)
56 /* need to save return values */
57 std r4, -32(r1)
58 std r3, -24(r1)
59 /* save TOC */
60 std r2, -16(r1)
61 std r31, -8(r1)
62 mr r31, r1
63 stdu r1, -112(r1)
64
65 /*
66 * We might be called from a module.
67 * Switch to our TOC to run inside the core kernel.
68 */
69 ld r2, PACATOC(r13)
70
71 bl ftrace_return_to_handler
72 nop
73
74 /* return value has real return address */
75 mtlr r3
76
77 ld r1, 0(r1)
78 ld r4, -32(r1)
79 ld r3, -24(r1)
80 ld r2, -16(r1)
81 ld r31, -8(r1)
82
83 /* Jump back to real return address */
84 blr
85#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S
new file mode 100644
index 000000000000..7c933a99f5d5
--- /dev/null
+++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S
@@ -0,0 +1,272 @@
1/*
2 * Split from ftrace_64.S
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 */
9
10#include <linux/magic.h>
11#include <asm/ppc_asm.h>
12#include <asm/asm-offsets.h>
13#include <asm/ftrace.h>
14#include <asm/ppc-opcode.h>
15#include <asm/export.h>
16#include <asm/thread_info.h>
17#include <asm/bug.h>
18#include <asm/ptrace.h>
19
20#ifdef CONFIG_DYNAMIC_FTRACE
21/*
22 *
23 * ftrace_caller() is the function that replaces _mcount() when ftrace is
24 * active.
25 *
26 * We arrive here after a function A calls function B, and we are the trace
27 * function for B. When we enter r1 points to A's stack frame, B has not yet
28 * had a chance to allocate one yet.
29 *
30 * Additionally r2 may point either to the TOC for A, or B, depending on
31 * whether B did a TOC setup sequence before calling us.
32 *
33 * On entry the LR points back to the _mcount() call site, and r0 holds the
34 * saved LR as it was on entry to B, ie. the original return address at the
35 * call site in A.
36 *
37 * Our job is to save the register state into a struct pt_regs (on the stack)
38 * and then arrange for the ftrace function to be called.
39 */
40_GLOBAL(ftrace_caller)
41 /* Save the original return address in A's stack frame */
42 std r0,LRSAVE(r1)
43
44 /* Create our stack frame + pt_regs */
45 stdu r1,-SWITCH_FRAME_SIZE(r1)
46
47 /* Save all gprs to pt_regs */
48 SAVE_8GPRS(0,r1)
49 SAVE_8GPRS(8,r1)
50 SAVE_8GPRS(16,r1)
51 SAVE_8GPRS(24,r1)
52
53 /* Load special regs for save below */
54 mfmsr r8
55 mfctr r9
56 mfxer r10
57 mfcr r11
58
59 /* Get the _mcount() call site out of LR */
60 mflr r7
61 /* Save it as pt_regs->nip */
62 std r7, _NIP(r1)
63 /* Save the read LR in pt_regs->link */
64 std r0, _LINK(r1)
65
66 /* Save callee's TOC in the ABI compliant location */
67 std r2, 24(r1)
68 ld r2,PACATOC(r13) /* get kernel TOC in r2 */
69
70 addis r3,r2,function_trace_op@toc@ha
71 addi r3,r3,function_trace_op@toc@l
72 ld r5,0(r3)
73
74#ifdef CONFIG_LIVEPATCH
75 mr r14,r7 /* remember old NIP */
76#endif
77 /* Calculate ip from nip-4 into r3 for call below */
78 subi r3, r7, MCOUNT_INSN_SIZE
79
80 /* Put the original return address in r4 as parent_ip */
81 mr r4, r0
82
83 /* Save special regs */
84 std r8, _MSR(r1)
85 std r9, _CTR(r1)
86 std r10, _XER(r1)
87 std r11, _CCR(r1)
88
89 /* Load &pt_regs in r6 for call below */
90 addi r6, r1 ,STACK_FRAME_OVERHEAD
91
92 /* ftrace_call(r3, r4, r5, r6) */
93.globl ftrace_call
94ftrace_call:
95 bl ftrace_stub
96 nop
97
98 /* Load ctr with the possibly modified NIP */
99 ld r3, _NIP(r1)
100 mtctr r3
101#ifdef CONFIG_LIVEPATCH
102 cmpd r14,r3 /* has NIP been altered? */
103#endif
104
105 /* Restore gprs */
106 REST_8GPRS(0,r1)
107 REST_8GPRS(8,r1)
108 REST_8GPRS(16,r1)
109 REST_8GPRS(24,r1)
110
111 /* Restore possibly modified LR */
112 ld r0, _LINK(r1)
113 mtlr r0
114
115 /* Restore callee's TOC */
116 ld r2, 24(r1)
117
118 /* Pop our stack frame */
119 addi r1, r1, SWITCH_FRAME_SIZE
120
121#ifdef CONFIG_LIVEPATCH
122 /* Based on the cmpd above, if the NIP was altered handle livepatch */
123 bne- livepatch_handler
124#endif
125
126#ifdef CONFIG_FUNCTION_GRAPH_TRACER
127.globl ftrace_graph_call
128ftrace_graph_call:
129 b ftrace_graph_stub
130_GLOBAL(ftrace_graph_stub)
131#endif
132
133 bctr /* jump after _mcount site */
134
135_GLOBAL(ftrace_stub)
136 blr
137
138#ifdef CONFIG_LIVEPATCH
139 /*
140 * This function runs in the mcount context, between two functions. As
141 * such it can only clobber registers which are volatile and used in
142 * function linkage.
143 *
144 * We get here when a function A, calls another function B, but B has
145 * been live patched with a new function C.
146 *
147 * On entry:
148 * - we have no stack frame and can not allocate one
149 * - LR points back to the original caller (in A)
150 * - CTR holds the new NIP in C
151 * - r0 & r12 are free
152 *
153 * r0 can't be used as the base register for a DS-form load or store, so
154 * we temporarily shuffle r1 (stack pointer) into r0 and then put it back.
155 */
156livepatch_handler:
157 CURRENT_THREAD_INFO(r12, r1)
158
159 /* Save stack pointer into r0 */
160 mr r0, r1
161
162 /* Allocate 3 x 8 bytes */
163 ld r1, TI_livepatch_sp(r12)
164 addi r1, r1, 24
165 std r1, TI_livepatch_sp(r12)
166
167 /* Save toc & real LR on livepatch stack */
168 std r2, -24(r1)
169 mflr r12
170 std r12, -16(r1)
171
172 /* Store stack end marker */
173 lis r12, STACK_END_MAGIC@h
174 ori r12, r12, STACK_END_MAGIC@l
175 std r12, -8(r1)
176
177 /* Restore real stack pointer */
178 mr r1, r0
179
180 /* Put ctr in r12 for global entry and branch there */
181 mfctr r12
182 bctrl
183
184 /*
185 * Now we are returning from the patched function to the original
186 * caller A. We are free to use r0 and r12, and we can use r2 until we
187 * restore it.
188 */
189
190 CURRENT_THREAD_INFO(r12, r1)
191
192 /* Save stack pointer into r0 */
193 mr r0, r1
194
195 ld r1, TI_livepatch_sp(r12)
196
197 /* Check stack marker hasn't been trashed */
198 lis r2, STACK_END_MAGIC@h
199 ori r2, r2, STACK_END_MAGIC@l
200 ld r12, -8(r1)
201 1: tdne r12, r2
202 EMIT_BUG_ENTRY 1b, __FILE__, __LINE__ - 1, 0
203
204 /* Restore LR & toc from livepatch stack */
205 ld r12, -16(r1)
206 mtlr r12
207 ld r2, -24(r1)
208
209 /* Pop livepatch stack frame */
210 CURRENT_THREAD_INFO(r12, r0)
211 subi r1, r1, 24
212 std r1, TI_livepatch_sp(r12)
213
214 /* Restore real stack pointer */
215 mr r1, r0
216
217 /* Return to original caller of live patched function */
218 blr
219#endif /* CONFIG_LIVEPATCH */
220
221#endif /* CONFIG_DYNAMIC_FTRACE */
222
223#ifdef CONFIG_FUNCTION_GRAPH_TRACER
224_GLOBAL(ftrace_graph_caller)
225 stdu r1, -112(r1)
226 /* with -mprofile-kernel, parameter regs are still alive at _mcount */
227 std r10, 104(r1)
228 std r9, 96(r1)
229 std r8, 88(r1)
230 std r7, 80(r1)
231 std r6, 72(r1)
232 std r5, 64(r1)
233 std r4, 56(r1)
234 std r3, 48(r1)
235
236 /* Save callee's TOC in the ABI compliant location */
237 std r2, 24(r1)
238 ld r2, PACATOC(r13) /* get kernel TOC in r2 */
239
240 mfctr r4 /* ftrace_caller has moved local addr here */
241 std r4, 40(r1)
242 mflr r3 /* ftrace_caller has restored LR from stack */
243 subi r4, r4, MCOUNT_INSN_SIZE
244
245 bl prepare_ftrace_return
246 nop
247
248 /*
249 * prepare_ftrace_return gives us the address we divert to.
250 * Change the LR to this.
251 */
252 mtlr r3
253
254 ld r0, 40(r1)
255 mtctr r0
256 ld r10, 104(r1)
257 ld r9, 96(r1)
258 ld r8, 88(r1)
259 ld r7, 80(r1)
260 ld r6, 72(r1)
261 ld r5, 64(r1)
262 ld r4, 56(r1)
263 ld r3, 48(r1)
264
265 /* Restore callee's TOC */
266 ld r2, 24(r1)
267
268 addi r1, r1, 112
269 mflr r0
270 std r0, LRSAVE(r1)
271 bctr
272#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/powerpc/kernel/trace/ftrace_64_pg.S b/arch/powerpc/kernel/trace/ftrace_64_pg.S
new file mode 100644
index 000000000000..f095358da96e
--- /dev/null
+++ b/arch/powerpc/kernel/trace/ftrace_64_pg.S
@@ -0,0 +1,68 @@
1/*
2 * Split from ftrace_64.S
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 */
9
10#include <linux/magic.h>
11#include <asm/ppc_asm.h>
12#include <asm/asm-offsets.h>
13#include <asm/ftrace.h>
14#include <asm/ppc-opcode.h>
15#include <asm/export.h>
16
17#ifdef CONFIG_DYNAMIC_FTRACE
18_GLOBAL_TOC(ftrace_caller)
19 /* Taken from output of objdump from lib64/glibc */
20 mflr r3
21 ld r11, 0(r1)
22 stdu r1, -112(r1)
23 std r3, 128(r1)
24 ld r4, 16(r11)
25 subi r3, r3, MCOUNT_INSN_SIZE
26.globl ftrace_call
27ftrace_call:
28 bl ftrace_stub
29 nop
30#ifdef CONFIG_FUNCTION_GRAPH_TRACER
31.globl ftrace_graph_call
32ftrace_graph_call:
33 b ftrace_graph_stub
34_GLOBAL(ftrace_graph_stub)
35#endif
36 ld r0, 128(r1)
37 mtlr r0
38 addi r1, r1, 112
39
40_GLOBAL(ftrace_stub)
41 blr
42#endif /* CONFIG_DYNAMIC_FTRACE */
43
44#ifdef CONFIG_FUNCTION_GRAPH_TRACER
45_GLOBAL(ftrace_graph_caller)
46 /* load r4 with local address */
47 ld r4, 128(r1)
48 subi r4, r4, MCOUNT_INSN_SIZE
49
50 /* Grab the LR out of the caller stack frame */
51 ld r11, 112(r1)
52 ld r3, 16(r11)
53
54 bl prepare_ftrace_return
55 nop
56
57 /*
58 * prepare_ftrace_return gives us the address we divert to.
59 * Change the LR in the callers stack frame to this.
60 */
61 ld r11, 112(r1)
62 std r3, 16(r11)
63
64 ld r0, 128(r1)
65 mtlr r0
66 addi r1, r1, 112
67 blr
68#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/powerpc/kernel/trace_clock.c b/arch/powerpc/kernel/trace/trace_clock.c
index 49170690946d..49170690946d 100644
--- a/arch/powerpc/kernel/trace_clock.c
+++ b/arch/powerpc/kernel/trace/trace_clock.c
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index ff365f9de27a..d4e545d27ef9 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -35,13 +35,13 @@
35#include <linux/backlight.h> 35#include <linux/backlight.h>
36#include <linux/bug.h> 36#include <linux/bug.h>
37#include <linux/kdebug.h> 37#include <linux/kdebug.h>
38#include <linux/debugfs.h>
39#include <linux/ratelimit.h> 38#include <linux/ratelimit.h>
40#include <linux/context_tracking.h> 39#include <linux/context_tracking.h>
41 40
42#include <asm/emulated_ops.h> 41#include <asm/emulated_ops.h>
43#include <asm/pgtable.h> 42#include <asm/pgtable.h>
44#include <linux/uaccess.h> 43#include <linux/uaccess.h>
44#include <asm/debugfs.h>
45#include <asm/io.h> 45#include <asm/io.h>
46#include <asm/machdep.h> 46#include <asm/machdep.h>
47#include <asm/rtas.h> 47#include <asm/rtas.h>
@@ -279,18 +279,35 @@ void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr)
279 279
280void system_reset_exception(struct pt_regs *regs) 280void system_reset_exception(struct pt_regs *regs)
281{ 281{
282 /*
283 * Avoid crashes in case of nested NMI exceptions. Recoverability
284 * is determined by RI and in_nmi
285 */
286 bool nested = in_nmi();
287 if (!nested)
288 nmi_enter();
289
282 /* See if any machine dependent calls */ 290 /* See if any machine dependent calls */
283 if (ppc_md.system_reset_exception) { 291 if (ppc_md.system_reset_exception) {
284 if (ppc_md.system_reset_exception(regs)) 292 if (ppc_md.system_reset_exception(regs))
285 return; 293 goto out;
286 } 294 }
287 295
288 die("System Reset", regs, SIGABRT); 296 die("System Reset", regs, SIGABRT);
289 297
298out:
299#ifdef CONFIG_PPC_BOOK3S_64
300 BUG_ON(get_paca()->in_nmi == 0);
301 if (get_paca()->in_nmi > 1)
302 panic("Unrecoverable nested System Reset");
303#endif
290 /* Must die if the interrupt is not recoverable */ 304 /* Must die if the interrupt is not recoverable */
291 if (!(regs->msr & MSR_RI)) 305 if (!(regs->msr & MSR_RI))
292 panic("Unrecoverable System Reset"); 306 panic("Unrecoverable System Reset");
293 307
308 if (!nested)
309 nmi_exit();
310
294 /* What should we do here? We could issue a shutdown or hard reset. */ 311 /* What should we do here? We could issue a shutdown or hard reset. */
295} 312}
296 313
@@ -306,8 +323,6 @@ long machine_check_early(struct pt_regs *regs)
306 323
307 __this_cpu_inc(irq_stat.mce_exceptions); 324 __this_cpu_inc(irq_stat.mce_exceptions);
308 325
309 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
310
311 if (cur_cpu_spec && cur_cpu_spec->machine_check_early) 326 if (cur_cpu_spec && cur_cpu_spec->machine_check_early)
312 handled = cur_cpu_spec->machine_check_early(regs); 327 handled = cur_cpu_spec->machine_check_early(regs);
313 return handled; 328 return handled;
@@ -741,6 +756,8 @@ void machine_check_exception(struct pt_regs *regs)
741 756
742 __this_cpu_inc(irq_stat.mce_exceptions); 757 __this_cpu_inc(irq_stat.mce_exceptions);
743 758
759 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
760
744 /* See if any machine dependent calls. In theory, we would want 761 /* See if any machine dependent calls. In theory, we would want
745 * to call the CPU first, and call the ppc_md. one if the CPU 762 * to call the CPU first, and call the ppc_md. one if the CPU
746 * one returns a positive number. However there is existing code 763 * one returns a positive number. However there is existing code
@@ -1440,6 +1457,8 @@ void facility_unavailable_exception(struct pt_regs *regs)
1440 [FSCR_TM_LG] = "TM", 1457 [FSCR_TM_LG] = "TM",
1441 [FSCR_EBB_LG] = "EBB", 1458 [FSCR_EBB_LG] = "EBB",
1442 [FSCR_TAR_LG] = "TAR", 1459 [FSCR_TAR_LG] = "TAR",
1460 [FSCR_MSGP_LG] = "MSGP",
1461 [FSCR_SCV_LG] = "SCV",
1443 }; 1462 };
1444 char *facility = "unknown"; 1463 char *facility = "unknown";
1445 u64 value; 1464 u64 value;
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 1c24c894c908..2f793be3d2b1 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -77,6 +77,8 @@ SECTIONS
77#endif 77#endif
78 } :kernel 78 } :kernel
79 79
80 __head_end = .;
81
80 /* 82 /*
81 * If the build dies here, it's likely code in head_64.S is referencing 83 * If the build dies here, it's likely code in head_64.S is referencing
82 * labels it can't reach, and the linker inserting stubs without the 84 * labels it can't reach, and the linker inserting stubs without the
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index b6b5c185bd92..aedacefd961d 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -20,6 +20,10 @@
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/miscdevice.h> 22#include <linux/miscdevice.h>
23#include <linux/gfp.h>
24#include <linux/sched.h>
25#include <linux/vmalloc.h>
26#include <linux/highmem.h>
23 27
24#include <asm/reg.h> 28#include <asm/reg.h>
25#include <asm/cputable.h> 29#include <asm/cputable.h>
@@ -31,10 +35,6 @@
31#include <asm/kvm_book3s.h> 35#include <asm/kvm_book3s.h>
32#include <asm/mmu_context.h> 36#include <asm/mmu_context.h>
33#include <asm/page.h> 37#include <asm/page.h>
34#include <linux/gfp.h>
35#include <linux/sched.h>
36#include <linux/vmalloc.h>
37#include <linux/highmem.h>
38 38
39#include "book3s.h" 39#include "book3s.h"
40#include "trace.h" 40#include "trace.h"
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index a587e8f4fd26..74b0153780e3 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -229,6 +229,7 @@ void kvmppc_mmu_unmap_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte)
229 229
230static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) 230static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
231{ 231{
232 unsigned long vsid_bits = VSID_BITS_65_256M;
232 struct kvmppc_sid_map *map; 233 struct kvmppc_sid_map *map;
233 struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); 234 struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
234 u16 sid_map_mask; 235 u16 sid_map_mask;
@@ -257,7 +258,12 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
257 kvmppc_mmu_pte_flush(vcpu, 0, 0); 258 kvmppc_mmu_pte_flush(vcpu, 0, 0);
258 kvmppc_mmu_flush_segments(vcpu); 259 kvmppc_mmu_flush_segments(vcpu);
259 } 260 }
260 map->host_vsid = vsid_scramble(vcpu_book3s->proto_vsid_next++, 256M); 261
262 if (mmu_has_feature(MMU_FTR_68_BIT_VA))
263 vsid_bits = VSID_BITS_256M;
264
265 map->host_vsid = vsid_scramble(vcpu_book3s->proto_vsid_next++,
266 VSID_MULTIPLIER_256M, vsid_bits);
261 267
262 map->guest_vsid = gvsid; 268 map->guest_vsid = gvsid;
263 map->valid = true; 269 map->valid = true;
@@ -390,7 +396,7 @@ int kvmppc_mmu_init(struct kvm_vcpu *vcpu)
390 struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); 396 struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
391 int err; 397 int err;
392 398
393 err = __init_new_context(); 399 err = hash__alloc_context_id();
394 if (err < 0) 400 if (err < 0)
395 return -1; 401 return -1;
396 vcpu3s->context_id[0] = err; 402 vcpu3s->context_id[0] = err;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 1ec86d9e2a82..fadb75abfe37 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -35,6 +35,15 @@
35#include <linux/srcu.h> 35#include <linux/srcu.h>
36#include <linux/miscdevice.h> 36#include <linux/miscdevice.h>
37#include <linux/debugfs.h> 37#include <linux/debugfs.h>
38#include <linux/gfp.h>
39#include <linux/vmalloc.h>
40#include <linux/highmem.h>
41#include <linux/hugetlb.h>
42#include <linux/kvm_irqfd.h>
43#include <linux/irqbypass.h>
44#include <linux/module.h>
45#include <linux/compiler.h>
46#include <linux/of.h>
38 47
39#include <asm/reg.h> 48#include <asm/reg.h>
40#include <asm/cputable.h> 49#include <asm/cputable.h>
@@ -58,15 +67,6 @@
58#include <asm/mmu.h> 67#include <asm/mmu.h>
59#include <asm/opal.h> 68#include <asm/opal.h>
60#include <asm/xics.h> 69#include <asm/xics.h>
61#include <linux/gfp.h>
62#include <linux/vmalloc.h>
63#include <linux/highmem.h>
64#include <linux/hugetlb.h>
65#include <linux/kvm_irqfd.h>
66#include <linux/irqbypass.h>
67#include <linux/module.h>
68#include <linux/compiler.h>
69#include <linux/of.h>
70 70
71#include "book3s.h" 71#include "book3s.h"
72 72
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 4d6c64b3041c..a752e29977e0 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -23,6 +23,7 @@
23#include <asm/kvm_book3s.h> 23#include <asm/kvm_book3s.h>
24#include <asm/archrandom.h> 24#include <asm/archrandom.h>
25#include <asm/xics.h> 25#include <asm/xics.h>
26#include <asm/xive.h>
26#include <asm/dbell.h> 27#include <asm/dbell.h>
27#include <asm/cputhreads.h> 28#include <asm/cputhreads.h>
28#include <asm/io.h> 29#include <asm/io.h>
@@ -193,12 +194,6 @@ long kvmppc_h_random(struct kvm_vcpu *vcpu)
193 return H_HARDWARE; 194 return H_HARDWARE;
194} 195}
195 196
196static inline void rm_writeb(unsigned long paddr, u8 val)
197{
198 __asm__ __volatile__("stbcix %0,0,%1"
199 : : "r" (val), "r" (paddr) : "memory");
200}
201
202/* 197/*
203 * Send an interrupt or message to another CPU. 198 * Send an interrupt or message to another CPU.
204 * The caller needs to include any barrier needed to order writes 199 * The caller needs to include any barrier needed to order writes
@@ -206,7 +201,7 @@ static inline void rm_writeb(unsigned long paddr, u8 val)
206 */ 201 */
207void kvmhv_rm_send_ipi(int cpu) 202void kvmhv_rm_send_ipi(int cpu)
208{ 203{
209 unsigned long xics_phys; 204 void __iomem *xics_phys;
210 unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); 205 unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
211 206
212 /* On POWER9 we can use msgsnd for any destination cpu. */ 207 /* On POWER9 we can use msgsnd for any destination cpu. */
@@ -224,10 +219,14 @@ void kvmhv_rm_send_ipi(int cpu)
224 return; 219 return;
225 } 220 }
226 221
222 /* We should never reach this */
223 if (WARN_ON_ONCE(xive_enabled()))
224 return;
225
227 /* Else poke the target with an IPI */ 226 /* Else poke the target with an IPI */
228 xics_phys = paca[cpu].kvm_hstate.xics_phys; 227 xics_phys = paca[cpu].kvm_hstate.xics_phys;
229 if (xics_phys) 228 if (xics_phys)
230 rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY); 229 __raw_rm_writeb(IPI_PRIORITY, xics_phys + XICS_MFRR);
231 else 230 else
232 opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY); 231 opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY);
233} 232}
@@ -386,6 +385,9 @@ long kvmppc_read_intr(void)
386 long rc; 385 long rc;
387 bool again; 386 bool again;
388 387
388 if (xive_enabled())
389 return 1;
390
389 do { 391 do {
390 again = false; 392 again = false;
391 rc = kvmppc_read_one_intr(&again); 393 rc = kvmppc_read_one_intr(&again);
@@ -397,7 +399,7 @@ long kvmppc_read_intr(void)
397 399
398static long kvmppc_read_one_intr(bool *again) 400static long kvmppc_read_one_intr(bool *again)
399{ 401{
400 unsigned long xics_phys; 402 void __iomem *xics_phys;
401 u32 h_xirr; 403 u32 h_xirr;
402 __be32 xirr; 404 __be32 xirr;
403 u32 xisr; 405 u32 xisr;
@@ -415,7 +417,7 @@ static long kvmppc_read_one_intr(bool *again)
415 if (!xics_phys) 417 if (!xics_phys)
416 rc = opal_int_get_xirr(&xirr, false); 418 rc = opal_int_get_xirr(&xirr, false);
417 else 419 else
418 xirr = _lwzcix(xics_phys + XICS_XIRR); 420 xirr = __raw_rm_readl(xics_phys + XICS_XIRR);
419 if (rc < 0) 421 if (rc < 0)
420 return 1; 422 return 1;
421 423
@@ -445,8 +447,8 @@ static long kvmppc_read_one_intr(bool *again)
445 if (xisr == XICS_IPI) { 447 if (xisr == XICS_IPI) {
446 rc = 0; 448 rc = 0;
447 if (xics_phys) { 449 if (xics_phys) {
448 _stbcix(xics_phys + XICS_MFRR, 0xff); 450 __raw_rm_writeb(0xff, xics_phys + XICS_MFRR);
449 _stwcix(xics_phys + XICS_XIRR, xirr); 451 __raw_rm_writel(xirr, xics_phys + XICS_XIRR);
450 } else { 452 } else {
451 opal_int_set_mfrr(hard_smp_processor_id(), 0xff); 453 opal_int_set_mfrr(hard_smp_processor_id(), 0xff);
452 rc = opal_int_eoi(h_xirr); 454 rc = opal_int_eoi(h_xirr);
@@ -471,7 +473,8 @@ static long kvmppc_read_one_intr(bool *again)
471 * we need to resend that IPI, bummer 473 * we need to resend that IPI, bummer
472 */ 474 */
473 if (xics_phys) 475 if (xics_phys)
474 _stbcix(xics_phys + XICS_MFRR, IPI_PRIORITY); 476 __raw_rm_writeb(IPI_PRIORITY,
477 xics_phys + XICS_MFRR);
475 else 478 else
476 opal_int_set_mfrr(hard_smp_processor_id(), 479 opal_int_set_mfrr(hard_smp_processor_id(),
477 IPI_PRIORITY); 480 IPI_PRIORITY);
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index e78542d99cd6..ffde4507ddfd 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -16,7 +16,6 @@
16#include <asm/kvm_ppc.h> 16#include <asm/kvm_ppc.h>
17#include <asm/hvcall.h> 17#include <asm/hvcall.h>
18#include <asm/xics.h> 18#include <asm/xics.h>
19#include <asm/debug.h>
20#include <asm/synch.h> 19#include <asm/synch.h>
21#include <asm/cputhreads.h> 20#include <asm/cputhreads.h>
22#include <asm/pgtable.h> 21#include <asm/pgtable.h>
@@ -766,7 +765,7 @@ unsigned long eoi_rc;
766 765
767static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again) 766static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again)
768{ 767{
769 unsigned long xics_phys; 768 void __iomem *xics_phys;
770 int64_t rc; 769 int64_t rc;
771 770
772 rc = pnv_opal_pci_msi_eoi(c, hwirq); 771 rc = pnv_opal_pci_msi_eoi(c, hwirq);
@@ -779,7 +778,7 @@ static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again)
779 /* EOI it */ 778 /* EOI it */
780 xics_phys = local_paca->kvm_hstate.xics_phys; 779 xics_phys = local_paca->kvm_hstate.xics_phys;
781 if (xics_phys) { 780 if (xics_phys) {
782 _stwcix(xics_phys + XICS_XIRR, xirr); 781 __raw_rm_writel(xirr, xics_phys + XICS_XIRR);
783 } else { 782 } else {
784 rc = opal_int_eoi(be32_to_cpu(xirr)); 783 rc = opal_int_eoi(be32_to_cpu(xirr));
785 *again = rc > 0; 784 *again = rc > 0;
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index e48803e2918d..459b72cb617a 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -19,10 +19,9 @@
19#include <asm/kvm_ppc.h> 19#include <asm/kvm_ppc.h>
20#include <asm/hvcall.h> 20#include <asm/hvcall.h>
21#include <asm/xics.h> 21#include <asm/xics.h>
22#include <asm/debug.h> 22#include <asm/debugfs.h>
23#include <asm/time.h> 23#include <asm/time.h>
24 24
25#include <linux/debugfs.h>
26#include <linux/seq_file.h> 25#include <linux/seq_file.h>
27 26
28#include "book3s_xics.h" 27#include "book3s_xics.h"
@@ -1084,7 +1083,7 @@ static struct kvmppc_ics *kvmppc_xics_create_ics(struct kvm *kvm,
1084 return xics->ics[icsid]; 1083 return xics->ics[icsid];
1085} 1084}
1086 1085
1087int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server_num) 1086static int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server_num)
1088{ 1087{
1089 struct kvmppc_icp *icp; 1088 struct kvmppc_icp *icp;
1090 1089
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 0d3002b7e2b4..500b0f6a0b64 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -8,6 +8,7 @@
8 */ 8 */
9 9
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/kprobes.h>
11#include <linux/vmalloc.h> 12#include <linux/vmalloc.h>
12#include <linux/init.h> 13#include <linux/init.h>
13#include <linux/mm.h> 14#include <linux/mm.h>
@@ -59,7 +60,7 @@ bool is_offset_in_branch_range(long offset)
59 * Helper to check if a given instruction is a conditional branch 60 * Helper to check if a given instruction is a conditional branch
60 * Derived from the conditional checks in analyse_instr() 61 * Derived from the conditional checks in analyse_instr()
61 */ 62 */
62bool __kprobes is_conditional_branch(unsigned int instr) 63bool is_conditional_branch(unsigned int instr)
63{ 64{
64 unsigned int opcode = instr >> 26; 65 unsigned int opcode = instr >> 26;
65 66
@@ -75,6 +76,7 @@ bool __kprobes is_conditional_branch(unsigned int instr)
75 } 76 }
76 return false; 77 return false;
77} 78}
79NOKPROBE_SYMBOL(is_conditional_branch);
78 80
79unsigned int create_branch(const unsigned int *addr, 81unsigned int create_branch(const unsigned int *addr,
80 unsigned long target, int flags) 82 unsigned long target, int flags)
diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 9c542ec70c5b..33117f8a0882 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -49,7 +49,8 @@ extern int do_stxvd2x(int rn, unsigned long ea);
49/* 49/*
50 * Emulate the truncation of 64 bit values in 32-bit mode. 50 * Emulate the truncation of 64 bit values in 32-bit mode.
51 */ 51 */
52static unsigned long truncate_if_32bit(unsigned long msr, unsigned long val) 52static nokprobe_inline unsigned long truncate_if_32bit(unsigned long msr,
53 unsigned long val)
53{ 54{
54#ifdef __powerpc64__ 55#ifdef __powerpc64__
55 if ((msr & MSR_64BIT) == 0) 56 if ((msr & MSR_64BIT) == 0)
@@ -61,7 +62,7 @@ static unsigned long truncate_if_32bit(unsigned long msr, unsigned long val)
61/* 62/*
62 * Determine whether a conditional branch instruction would branch. 63 * Determine whether a conditional branch instruction would branch.
63 */ 64 */
64static int __kprobes branch_taken(unsigned int instr, struct pt_regs *regs) 65static nokprobe_inline int branch_taken(unsigned int instr, struct pt_regs *regs)
65{ 66{
66 unsigned int bo = (instr >> 21) & 0x1f; 67 unsigned int bo = (instr >> 21) & 0x1f;
67 unsigned int bi; 68 unsigned int bi;
@@ -81,8 +82,7 @@ static int __kprobes branch_taken(unsigned int instr, struct pt_regs *regs)
81 return 1; 82 return 1;
82} 83}
83 84
84 85static nokprobe_inline long address_ok(struct pt_regs *regs, unsigned long ea, int nb)
85static long __kprobes address_ok(struct pt_regs *regs, unsigned long ea, int nb)
86{ 86{
87 if (!user_mode(regs)) 87 if (!user_mode(regs))
88 return 1; 88 return 1;
@@ -92,7 +92,7 @@ static long __kprobes address_ok(struct pt_regs *regs, unsigned long ea, int nb)
92/* 92/*
93 * Calculate effective address for a D-form instruction 93 * Calculate effective address for a D-form instruction
94 */ 94 */
95static unsigned long __kprobes dform_ea(unsigned int instr, struct pt_regs *regs) 95static nokprobe_inline unsigned long dform_ea(unsigned int instr, struct pt_regs *regs)
96{ 96{
97 int ra; 97 int ra;
98 unsigned long ea; 98 unsigned long ea;
@@ -109,7 +109,7 @@ static unsigned long __kprobes dform_ea(unsigned int instr, struct pt_regs *regs
109/* 109/*
110 * Calculate effective address for a DS-form instruction 110 * Calculate effective address for a DS-form instruction
111 */ 111 */
112static unsigned long __kprobes dsform_ea(unsigned int instr, struct pt_regs *regs) 112static nokprobe_inline unsigned long dsform_ea(unsigned int instr, struct pt_regs *regs)
113{ 113{
114 int ra; 114 int ra;
115 unsigned long ea; 115 unsigned long ea;
@@ -126,8 +126,8 @@ static unsigned long __kprobes dsform_ea(unsigned int instr, struct pt_regs *reg
126/* 126/*
127 * Calculate effective address for an X-form instruction 127 * Calculate effective address for an X-form instruction
128 */ 128 */
129static unsigned long __kprobes xform_ea(unsigned int instr, 129static nokprobe_inline unsigned long xform_ea(unsigned int instr,
130 struct pt_regs *regs) 130 struct pt_regs *regs)
131{ 131{
132 int ra, rb; 132 int ra, rb;
133 unsigned long ea; 133 unsigned long ea;
@@ -145,33 +145,33 @@ static unsigned long __kprobes xform_ea(unsigned int instr,
145 * Return the largest power of 2, not greater than sizeof(unsigned long), 145 * Return the largest power of 2, not greater than sizeof(unsigned long),
146 * such that x is a multiple of it. 146 * such that x is a multiple of it.
147 */ 147 */
148static inline unsigned long max_align(unsigned long x) 148static nokprobe_inline unsigned long max_align(unsigned long x)
149{ 149{
150 x |= sizeof(unsigned long); 150 x |= sizeof(unsigned long);
151 return x & -x; /* isolates rightmost bit */ 151 return x & -x; /* isolates rightmost bit */
152} 152}
153 153
154 154
155static inline unsigned long byterev_2(unsigned long x) 155static nokprobe_inline unsigned long byterev_2(unsigned long x)
156{ 156{
157 return ((x >> 8) & 0xff) | ((x & 0xff) << 8); 157 return ((x >> 8) & 0xff) | ((x & 0xff) << 8);
158} 158}
159 159
160static inline unsigned long byterev_4(unsigned long x) 160static nokprobe_inline unsigned long byterev_4(unsigned long x)
161{ 161{
162 return ((x >> 24) & 0xff) | ((x >> 8) & 0xff00) | 162 return ((x >> 24) & 0xff) | ((x >> 8) & 0xff00) |
163 ((x & 0xff00) << 8) | ((x & 0xff) << 24); 163 ((x & 0xff00) << 8) | ((x & 0xff) << 24);
164} 164}
165 165
166#ifdef __powerpc64__ 166#ifdef __powerpc64__
167static inline unsigned long byterev_8(unsigned long x) 167static nokprobe_inline unsigned long byterev_8(unsigned long x)
168{ 168{
169 return (byterev_4(x) << 32) | byterev_4(x >> 32); 169 return (byterev_4(x) << 32) | byterev_4(x >> 32);
170} 170}
171#endif 171#endif
172 172
173static int __kprobes read_mem_aligned(unsigned long *dest, unsigned long ea, 173static nokprobe_inline int read_mem_aligned(unsigned long *dest,
174 int nb) 174 unsigned long ea, int nb)
175{ 175{
176 int err = 0; 176 int err = 0;
177 unsigned long x = 0; 177 unsigned long x = 0;
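
[Editor's note: the hunk above only swaps __kprobes for nokprobe_inline on these helpers; their bodies are unchanged. Because max_align()'s one-liner ("x |= sizeof(unsigned long); return x & -x") is easy to misread, here is a hedged standalone demo of what it and byterev_4() compute, with the expressions copied only so the program is self-contained; values shown assume a 64-bit build.]

#include <stdio.h>

static unsigned long max_align(unsigned long x)
{
        x |= sizeof(unsigned long);
        return x & -x;                  /* isolates the lowest set bit */
}

static unsigned long byterev_4(unsigned long x)
{
        return ((x >> 24) & 0xff) | ((x >> 8) & 0xff00) |
               ((x & 0xff00) << 8) | ((x & 0xff) << 24);
}

int main(void)
{
        /* max_align(n): largest power of two, capped at
         * sizeof(unsigned long), that divides n: 6 -> 2, 8 -> 8, 12 -> 4. */
        printf("max_align(6)=%lu max_align(8)=%lu max_align(12)=%lu\n",
               max_align(6), max_align(8), max_align(12));

        /* byterev_4 swaps the byte order of a 32-bit value. */
        printf("byterev_4(0x12345678)=0x%lx\n", byterev_4(0x12345678));
        return 0;
}
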
@@ -197,8 +197,8 @@ static int __kprobes read_mem_aligned(unsigned long *dest, unsigned long ea,
197 return err; 197 return err;
198} 198}
199 199
200static int __kprobes read_mem_unaligned(unsigned long *dest, unsigned long ea, 200static nokprobe_inline int read_mem_unaligned(unsigned long *dest,
201 int nb, struct pt_regs *regs) 201 unsigned long ea, int nb, struct pt_regs *regs)
202{ 202{
203 int err; 203 int err;
204 unsigned long x, b, c; 204 unsigned long x, b, c;
@@ -248,7 +248,7 @@ static int __kprobes read_mem_unaligned(unsigned long *dest, unsigned long ea,
248 * Read memory at address ea for nb bytes, return 0 for success 248 * Read memory at address ea for nb bytes, return 0 for success
249 * or -EFAULT if an error occurred. 249 * or -EFAULT if an error occurred.
250 */ 250 */
251static int __kprobes read_mem(unsigned long *dest, unsigned long ea, int nb, 251static int read_mem(unsigned long *dest, unsigned long ea, int nb,
252 struct pt_regs *regs) 252 struct pt_regs *regs)
253{ 253{
254 if (!address_ok(regs, ea, nb)) 254 if (!address_ok(regs, ea, nb))
@@ -257,9 +257,10 @@ static int __kprobes read_mem(unsigned long *dest, unsigned long ea, int nb,
257 return read_mem_aligned(dest, ea, nb); 257 return read_mem_aligned(dest, ea, nb);
258 return read_mem_unaligned(dest, ea, nb, regs); 258 return read_mem_unaligned(dest, ea, nb, regs);
259} 259}
260NOKPROBE_SYMBOL(read_mem);
260 261
261static int __kprobes write_mem_aligned(unsigned long val, unsigned long ea, 262static nokprobe_inline int write_mem_aligned(unsigned long val,
262 int nb) 263 unsigned long ea, int nb)
263{ 264{
264 int err = 0; 265 int err = 0;
265 266
@@ -282,8 +283,8 @@ static int __kprobes write_mem_aligned(unsigned long val, unsigned long ea,
282 return err; 283 return err;
283} 284}
284 285
285static int __kprobes write_mem_unaligned(unsigned long val, unsigned long ea, 286static nokprobe_inline int write_mem_unaligned(unsigned long val,
286 int nb, struct pt_regs *regs) 287 unsigned long ea, int nb, struct pt_regs *regs)
287{ 288{
288 int err; 289 int err;
289 unsigned long c; 290 unsigned long c;
@@ -325,7 +326,7 @@ static int __kprobes write_mem_unaligned(unsigned long val, unsigned long ea,
325 * Write memory at address ea for nb bytes, return 0 for success 326 * Write memory at address ea for nb bytes, return 0 for success
326 * or -EFAULT if an error occurred. 327 * or -EFAULT if an error occurred.
327 */ 328 */
328static int __kprobes write_mem(unsigned long val, unsigned long ea, int nb, 329static int write_mem(unsigned long val, unsigned long ea, int nb,
329 struct pt_regs *regs) 330 struct pt_regs *regs)
330{ 331{
331 if (!address_ok(regs, ea, nb)) 332 if (!address_ok(regs, ea, nb))
@@ -334,13 +335,14 @@ static int __kprobes write_mem(unsigned long val, unsigned long ea, int nb,
334 return write_mem_aligned(val, ea, nb); 335 return write_mem_aligned(val, ea, nb);
335 return write_mem_unaligned(val, ea, nb, regs); 336 return write_mem_unaligned(val, ea, nb, regs);
336} 337}
338NOKPROBE_SYMBOL(write_mem);
337 339
338#ifdef CONFIG_PPC_FPU 340#ifdef CONFIG_PPC_FPU
339/* 341/*
340 * Check the address and alignment, and call func to do the actual 342 * Check the address and alignment, and call func to do the actual
341 * load or store. 343 * load or store.
342 */ 344 */
343static int __kprobes do_fp_load(int rn, int (*func)(int, unsigned long), 345static int do_fp_load(int rn, int (*func)(int, unsigned long),
344 unsigned long ea, int nb, 346 unsigned long ea, int nb,
345 struct pt_regs *regs) 347 struct pt_regs *regs)
346{ 348{
@@ -380,8 +382,9 @@ static int __kprobes do_fp_load(int rn, int (*func)(int, unsigned long),
380 return err; 382 return err;
381 return (*func)(rn, ptr); 383 return (*func)(rn, ptr);
382} 384}
385NOKPROBE_SYMBOL(do_fp_load);
383 386
384static int __kprobes do_fp_store(int rn, int (*func)(int, unsigned long), 387static int do_fp_store(int rn, int (*func)(int, unsigned long),
385 unsigned long ea, int nb, 388 unsigned long ea, int nb,
386 struct pt_regs *regs) 389 struct pt_regs *regs)
387{ 390{
@@ -425,11 +428,12 @@ static int __kprobes do_fp_store(int rn, int (*func)(int, unsigned long),
425 } 428 }
426 return err; 429 return err;
427} 430}
431NOKPROBE_SYMBOL(do_fp_store);
428#endif 432#endif
429 433
430#ifdef CONFIG_ALTIVEC 434#ifdef CONFIG_ALTIVEC
431/* For Altivec/VMX, no need to worry about alignment */ 435/* For Altivec/VMX, no need to worry about alignment */
432static int __kprobes do_vec_load(int rn, int (*func)(int, unsigned long), 436static nokprobe_inline int do_vec_load(int rn, int (*func)(int, unsigned long),
433 unsigned long ea, struct pt_regs *regs) 437 unsigned long ea, struct pt_regs *regs)
434{ 438{
435 if (!address_ok(regs, ea & ~0xfUL, 16)) 439 if (!address_ok(regs, ea & ~0xfUL, 16))
@@ -437,7 +441,7 @@ static int __kprobes do_vec_load(int rn, int (*func)(int, unsigned long),
437 return (*func)(rn, ea); 441 return (*func)(rn, ea);
438} 442}
439 443
440static int __kprobes do_vec_store(int rn, int (*func)(int, unsigned long), 444static nokprobe_inline int do_vec_store(int rn, int (*func)(int, unsigned long),
441 unsigned long ea, struct pt_regs *regs) 445 unsigned long ea, struct pt_regs *regs)
442{ 446{
443 if (!address_ok(regs, ea & ~0xfUL, 16)) 447 if (!address_ok(regs, ea & ~0xfUL, 16))
@@ -447,7 +451,7 @@ static int __kprobes do_vec_store(int rn, int (*func)(int, unsigned long),
447#endif /* CONFIG_ALTIVEC */ 451#endif /* CONFIG_ALTIVEC */
448 452
449#ifdef CONFIG_VSX 453#ifdef CONFIG_VSX
450static int __kprobes do_vsx_load(int rn, int (*func)(int, unsigned long), 454static nokprobe_inline int do_vsx_load(int rn, int (*func)(int, unsigned long),
451 unsigned long ea, struct pt_regs *regs) 455 unsigned long ea, struct pt_regs *regs)
452{ 456{
453 int err; 457 int err;
@@ -465,7 +469,7 @@ static int __kprobes do_vsx_load(int rn, int (*func)(int, unsigned long),
465 return err; 469 return err;
466} 470}
467 471
468static int __kprobes do_vsx_store(int rn, int (*func)(int, unsigned long), 472static nokprobe_inline int do_vsx_store(int rn, int (*func)(int, unsigned long),
469 unsigned long ea, struct pt_regs *regs) 473 unsigned long ea, struct pt_regs *regs)
470{ 474{
471 int err; 475 int err;
@@ -522,7 +526,7 @@ static int __kprobes do_vsx_store(int rn, int (*func)(int, unsigned long),
522 : "=r" (err) \ 526 : "=r" (err) \
523 : "r" (addr), "i" (-EFAULT), "0" (err)) 527 : "r" (addr), "i" (-EFAULT), "0" (err))
524 528
525static void __kprobes set_cr0(struct pt_regs *regs, int rd) 529static nokprobe_inline void set_cr0(struct pt_regs *regs, int rd)
526{ 530{
527 long val = regs->gpr[rd]; 531 long val = regs->gpr[rd];
528 532
@@ -539,7 +543,7 @@ static void __kprobes set_cr0(struct pt_regs *regs, int rd)
539 regs->ccr |= 0x20000000; 543 regs->ccr |= 0x20000000;
540} 544}
541 545
542static void __kprobes add_with_carry(struct pt_regs *regs, int rd, 546static nokprobe_inline void add_with_carry(struct pt_regs *regs, int rd,
543 unsigned long val1, unsigned long val2, 547 unsigned long val1, unsigned long val2,
544 unsigned long carry_in) 548 unsigned long carry_in)
545{ 549{
@@ -560,7 +564,7 @@ static void __kprobes add_with_carry(struct pt_regs *regs, int rd,
560 regs->xer &= ~XER_CA; 564 regs->xer &= ~XER_CA;
561} 565}
562 566
563static void __kprobes do_cmp_signed(struct pt_regs *regs, long v1, long v2, 567static nokprobe_inline void do_cmp_signed(struct pt_regs *regs, long v1, long v2,
564 int crfld) 568 int crfld)
565{ 569{
566 unsigned int crval, shift; 570 unsigned int crval, shift;
@@ -576,7 +580,7 @@ static void __kprobes do_cmp_signed(struct pt_regs *regs, long v1, long v2,
576 regs->ccr = (regs->ccr & ~(0xf << shift)) | (crval << shift); 580 regs->ccr = (regs->ccr & ~(0xf << shift)) | (crval << shift);
577} 581}
578 582
579static void __kprobes do_cmp_unsigned(struct pt_regs *regs, unsigned long v1, 583static nokprobe_inline void do_cmp_unsigned(struct pt_regs *regs, unsigned long v1,
580 unsigned long v2, int crfld) 584 unsigned long v2, int crfld)
581{ 585{
582 unsigned int crval, shift; 586 unsigned int crval, shift;
@@ -592,7 +596,7 @@ static void __kprobes do_cmp_unsigned(struct pt_regs *regs, unsigned long v1,
592 regs->ccr = (regs->ccr & ~(0xf << shift)) | (crval << shift); 596 regs->ccr = (regs->ccr & ~(0xf << shift)) | (crval << shift);
593} 597}
594 598
595static int __kprobes trap_compare(long v1, long v2) 599static nokprobe_inline int trap_compare(long v1, long v2)
596{ 600{
597 int ret = 0; 601 int ret = 0;
598 602
@@ -631,7 +635,7 @@ static int __kprobes trap_compare(long v1, long v2)
631 * Returns 1 if the instruction has been executed, or 0 if not. 635 * Returns 1 if the instruction has been executed, or 0 if not.
632 * Sets *op to indicate what the instruction does. 636 * Sets *op to indicate what the instruction does.
633 */ 637 */
634int __kprobes analyse_instr(struct instruction_op *op, struct pt_regs *regs, 638int analyse_instr(struct instruction_op *op, struct pt_regs *regs,
635 unsigned int instr) 639 unsigned int instr)
636{ 640{
637 unsigned int opcode, ra, rb, rd, spr, u; 641 unsigned int opcode, ra, rb, rd, spr, u;
@@ -1692,6 +1696,7 @@ int __kprobes analyse_instr(struct instruction_op *op, struct pt_regs *regs,
1692#endif 1696#endif
1693} 1697}
1694EXPORT_SYMBOL_GPL(analyse_instr); 1698EXPORT_SYMBOL_GPL(analyse_instr);
1699NOKPROBE_SYMBOL(analyse_instr);
1695 1700
1696/* 1701/*
1697 * For PPC32 we always use stwu with r1 to change the stack pointer. 1702 * For PPC32 we always use stwu with r1 to change the stack pointer.
@@ -1701,7 +1706,7 @@ EXPORT_SYMBOL_GPL(analyse_instr);
1701 * don't emulate the real store operation. We will do real store 1706 * don't emulate the real store operation. We will do real store
1702 * operation safely in exception return code by checking this flag. 1707 * operation safely in exception return code by checking this flag.
1703 */ 1708 */
1704static __kprobes int handle_stack_update(unsigned long ea, struct pt_regs *regs) 1709static nokprobe_inline int handle_stack_update(unsigned long ea, struct pt_regs *regs)
1705{ 1710{
1706#ifdef CONFIG_PPC32 1711#ifdef CONFIG_PPC32
1707 /* 1712 /*
@@ -1721,7 +1726,7 @@ static __kprobes int handle_stack_update(unsigned long ea, struct pt_regs *regs)
1721 return 0; 1726 return 0;
1722} 1727}
1723 1728
1724static __kprobes void do_signext(unsigned long *valp, int size) 1729static nokprobe_inline void do_signext(unsigned long *valp, int size)
1725{ 1730{
1726 switch (size) { 1731 switch (size) {
1727 case 2: 1732 case 2:
@@ -1733,7 +1738,7 @@ static __kprobes void do_signext(unsigned long *valp, int size)
1733 } 1738 }
1734} 1739}
1735 1740
1736static __kprobes void do_byterev(unsigned long *valp, int size) 1741static nokprobe_inline void do_byterev(unsigned long *valp, int size)
1737{ 1742{
1738 switch (size) { 1743 switch (size) {
1739 case 2: 1744 case 2:
@@ -1757,7 +1762,7 @@ static __kprobes void do_byterev(unsigned long *valp, int size)
1757 * or -1 if the instruction is one that should not be stepped, 1762 * or -1 if the instruction is one that should not be stepped,
1758 * such as an rfid, or a mtmsrd that would clear MSR_RI. 1763 * such as an rfid, or a mtmsrd that would clear MSR_RI.
1759 */ 1764 */
1760int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr) 1765int emulate_step(struct pt_regs *regs, unsigned int instr)
1761{ 1766{
1762 struct instruction_op op; 1767 struct instruction_op op;
1763 int r, err, size; 1768 int r, err, size;
@@ -1988,3 +1993,4 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
1988 regs->nip = truncate_if_32bit(regs->msr, regs->nip + 4); 1993 regs->nip = truncate_if_32bit(regs->msr, regs->nip + 4);
1989 return 1; 1994 return 1;
1990} 1995}
1996NOKPROBE_SYMBOL(emulate_step);
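
The hunks above replace sstep.c's old __kprobes section annotations with the newer blacklisting interfaces: purely internal helpers become nokprobe_inline, while entry points such as analyse_instr() and emulate_step() keep normal linkage and are blacklisted with NOKPROBE_SYMBOL() after their definition. A minimal sketch of the resulting pattern (function names here are illustrative, not taken from the patch):

#include <linux/kprobes.h>

/* Internal helper: forced inline, so it never exists as a separate symbol
 * that a kprobe could land on. */
static nokprobe_inline int helper_sketch(unsigned long val)
{
	return val != 0;
}

/* Entry point: normal linkage, but recorded in the kprobes blacklist so it
 * can never be probed and recurse into itself. */
int entry_sketch(unsigned long val)
{
	return helper_sketch(val);
}
NOKPROBE_SYMBOL(entry_sketch);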
diff --git a/arch/powerpc/mm/dump_hashpagetable.c b/arch/powerpc/mm/dump_hashpagetable.c
index d979709a0239..c6b900f54c07 100644
--- a/arch/powerpc/mm/dump_hashpagetable.c
+++ b/arch/powerpc/mm/dump_hashpagetable.c
@@ -468,7 +468,7 @@ static void walk_linearmapping(struct pg_state *st)
468 unsigned long psize = 1 << mmu_psize_defs[mmu_linear_psize].shift; 468 unsigned long psize = 1 << mmu_psize_defs[mmu_linear_psize].shift;
469 469
470 for (addr = PAGE_OFFSET; addr < PAGE_OFFSET + 470 for (addr = PAGE_OFFSET; addr < PAGE_OFFSET +
471 memblock_phys_mem_size(); addr += psize) 471 memblock_end_of_DRAM(); addr += psize)
472 hpte_find(st, addr, mmu_linear_psize); 472 hpte_find(st, addr, mmu_linear_psize);
473} 473}
474 474
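
The loop bound change above matters on machines whose physical memory has holes: memblock_phys_mem_size() is the sum of the present regions, while memblock_end_of_DRAM() is the address just past the last region, so only the latter guarantees the walk reaches every linearly mapped page. A standalone illustration with an invented layout (the numbers are made up, not from the patch):

#include <stdio.h>

int main(void)
{
	/* Invented layout: 768 MB at 0, a hole, then 256 MB starting at 1 GB. */
	unsigned long long phys_mem_size = 0x30000000ULL + 0x10000000ULL; /* ~ memblock_phys_mem_size() */
	unsigned long long end_of_dram   = 0x40000000ULL + 0x10000000ULL; /* ~ memblock_end_of_DRAM()    */

	/* The old bound stops 256 MB short of the last mapped page. */
	printf("old walk ends at PAGE_OFFSET + 0x%llx, new walk at PAGE_OFFSET + 0x%llx\n",
	       phys_mem_size, end_of_dram);
	return 0;
}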
diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/dump_linuxpagetables.c
index 49abaf4dc8e3..d659345a98d6 100644
--- a/arch/powerpc/mm/dump_linuxpagetables.c
+++ b/arch/powerpc/mm/dump_linuxpagetables.c
@@ -26,6 +26,10 @@
26#include <asm/page.h> 26#include <asm/page.h>
27#include <asm/pgalloc.h> 27#include <asm/pgalloc.h>
28 28
29#ifdef CONFIG_PPC32
30#define KERN_VIRT_START 0
31#endif
32
29/* 33/*
30 * To visualise what is happening, 34 * To visualise what is happening,
31 * 35 *
@@ -56,6 +60,8 @@ struct pg_state {
56 struct seq_file *seq; 60 struct seq_file *seq;
57 const struct addr_marker *marker; 61 const struct addr_marker *marker;
58 unsigned long start_address; 62 unsigned long start_address;
63 unsigned long start_pa;
64 unsigned long last_pa;
59 unsigned int level; 65 unsigned int level;
60 u64 current_flags; 66 u64 current_flags;
61}; 67};
@@ -69,6 +75,7 @@ static struct addr_marker address_markers[] = {
69 { 0, "Start of kernel VM" }, 75 { 0, "Start of kernel VM" },
70 { 0, "vmalloc() Area" }, 76 { 0, "vmalloc() Area" },
71 { 0, "vmalloc() End" }, 77 { 0, "vmalloc() End" },
78#ifdef CONFIG_PPC64
72 { 0, "isa I/O start" }, 79 { 0, "isa I/O start" },
73 { 0, "isa I/O end" }, 80 { 0, "isa I/O end" },
74 { 0, "phb I/O start" }, 81 { 0, "phb I/O start" },
@@ -76,6 +83,20 @@ static struct addr_marker address_markers[] = {
76 { 0, "I/O remap start" }, 83 { 0, "I/O remap start" },
77 { 0, "I/O remap end" }, 84 { 0, "I/O remap end" },
78 { 0, "vmemmap start" }, 85 { 0, "vmemmap start" },
86#else
87 { 0, "Early I/O remap start" },
88 { 0, "Early I/O remap end" },
89#ifdef CONFIG_NOT_COHERENT_CACHE
90 { 0, "Consistent mem start" },
91 { 0, "Consistent mem end" },
92#endif
93#ifdef CONFIG_HIGHMEM
94 { 0, "Highmem PTEs start" },
95 { 0, "Highmem PTEs end" },
96#endif
97 { 0, "Fixmap start" },
98 { 0, "Fixmap end" },
99#endif
79 { -1, NULL }, 100 { -1, NULL },
80}; 101};
81 102
@@ -100,8 +121,13 @@ static const struct flag_info flag_array[] = {
100 .set = "user", 121 .set = "user",
101 .clear = " ", 122 .clear = " ",
102 }, { 123 }, {
124#if _PAGE_RO == 0
103 .mask = _PAGE_RW, 125 .mask = _PAGE_RW,
104 .val = _PAGE_RW, 126 .val = _PAGE_RW,
127#else
128 .mask = _PAGE_RO,
129 .val = 0,
130#endif
105 .set = "rw", 131 .set = "rw",
106 .clear = "ro", 132 .clear = "ro",
107 }, { 133 }, {
@@ -154,11 +180,24 @@ static const struct flag_info flag_array[] = {
154 .clear = " ", 180 .clear = " ",
155 }, { 181 }, {
156#endif 182#endif
183#ifndef CONFIG_PPC_BOOK3S_64
157 .mask = _PAGE_NO_CACHE, 184 .mask = _PAGE_NO_CACHE,
158 .val = _PAGE_NO_CACHE, 185 .val = _PAGE_NO_CACHE,
159 .set = "no cache", 186 .set = "no cache",
160 .clear = " ", 187 .clear = " ",
161 }, { 188 }, {
189#else
190 .mask = _PAGE_NON_IDEMPOTENT,
191 .val = _PAGE_NON_IDEMPOTENT,
192 .set = "non-idempotent",
193 .clear = " ",
194 }, {
195 .mask = _PAGE_TOLERANT,
196 .val = _PAGE_TOLERANT,
197 .set = "tolerant",
198 .clear = " ",
199 }, {
200#endif
162#ifdef CONFIG_PPC_BOOK3S_64 201#ifdef CONFIG_PPC_BOOK3S_64
163 .mask = H_PAGE_BUSY, 202 .mask = H_PAGE_BUSY,
164 .val = H_PAGE_BUSY, 203 .val = H_PAGE_BUSY,
@@ -188,6 +227,10 @@ static const struct flag_info flag_array[] = {
188 .mask = _PAGE_SPECIAL, 227 .mask = _PAGE_SPECIAL,
189 .val = _PAGE_SPECIAL, 228 .val = _PAGE_SPECIAL,
190 .set = "special", 229 .set = "special",
230 }, {
231 .mask = _PAGE_SHARED,
232 .val = _PAGE_SHARED,
233 .set = "shared",
191 } 234 }
192}; 235};
193 236
@@ -252,7 +295,14 @@ static void dump_addr(struct pg_state *st, unsigned long addr)
252 const char *unit = units; 295 const char *unit = units;
253 unsigned long delta; 296 unsigned long delta;
254 297
255 seq_printf(st->seq, "0x%016lx-0x%016lx ", st->start_address, addr-1); 298#ifdef CONFIG_PPC64
299 seq_printf(st->seq, "0x%016lx-0x%016lx ", st->start_address, addr-1);
300 seq_printf(st->seq, "0x%016lx ", st->start_pa);
301#else
302 seq_printf(st->seq, "0x%08lx-0x%08lx ", st->start_address, addr - 1);
303 seq_printf(st->seq, "0x%08lx ", st->start_pa);
304#endif
305
256 delta = (addr - st->start_address) >> 10; 306 delta = (addr - st->start_address) >> 10;
257 /* Work out what appropriate unit to use */ 307 /* Work out what appropriate unit to use */
258 while (!(delta & 1023) && unit[1]) { 308 while (!(delta & 1023) && unit[1]) {
@@ -267,11 +317,15 @@ static void note_page(struct pg_state *st, unsigned long addr,
267 unsigned int level, u64 val) 317 unsigned int level, u64 val)
268{ 318{
269 u64 flag = val & pg_level[level].mask; 319 u64 flag = val & pg_level[level].mask;
320 u64 pa = val & PTE_RPN_MASK;
321
270 /* At first no level is set */ 322 /* At first no level is set */
271 if (!st->level) { 323 if (!st->level) {
272 st->level = level; 324 st->level = level;
273 st->current_flags = flag; 325 st->current_flags = flag;
274 st->start_address = addr; 326 st->start_address = addr;
327 st->start_pa = pa;
328 st->last_pa = pa;
275 seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); 329 seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
276 /* 330 /*
277 * Dump the section of virtual memory when: 331 * Dump the section of virtual memory when:
@@ -279,9 +333,11 @@ static void note_page(struct pg_state *st, unsigned long addr,
279 * - we change levels in the tree. 333 * - we change levels in the tree.
280 * - the address is in a different section of memory and is thus 334 * - the address is in a different section of memory and is thus
281 * used for a different purpose, regardless of the flags. 335 * used for a different purpose, regardless of the flags.
336 * - the pa of this page is not adjacent to the last inspected page
282 */ 337 */
283 } else if (flag != st->current_flags || level != st->level || 338 } else if (flag != st->current_flags || level != st->level ||
284 addr >= st->marker[1].start_address) { 339 addr >= st->marker[1].start_address ||
340 pa != st->last_pa + PAGE_SIZE) {
285 341
286 /* Check the PTE flags */ 342 /* Check the PTE flags */
287 if (st->current_flags) { 343 if (st->current_flags) {
@@ -305,8 +361,12 @@ static void note_page(struct pg_state *st, unsigned long addr,
305 seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); 361 seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
306 } 362 }
307 st->start_address = addr; 363 st->start_address = addr;
364 st->start_pa = pa;
365 st->last_pa = pa;
308 st->current_flags = flag; 366 st->current_flags = flag;
309 st->level = level; 367 st->level = level;
368 } else {
369 st->last_pa = pa;
310 } 370 }
311} 371}
312 372
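
Together with the new start_pa/last_pa fields, note_page() above now closes the current range whenever the backing physical pages stop being contiguous, not only when the flags, level or marker section change. A condensed sketch of that test (struct and helper names simplified, not the kernel's):

struct pg_state_sketch {
	unsigned long start_address, start_pa, last_pa;
	unsigned int level;
	unsigned long current_flags;
};

/* Returns true when the dumped range must be flushed and a new one started. */
static int range_must_break(const struct pg_state_sketch *st, unsigned long addr,
			    unsigned int level, unsigned long flag,
			    unsigned long pa, unsigned long next_marker,
			    unsigned long page_size)
{
	return flag != st->current_flags ||
	       level != st->level ||
	       addr >= next_marker ||
	       pa != st->last_pa + page_size;	/* new: physical discontinuity */
}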
@@ -377,20 +437,38 @@ static void walk_pagetables(struct pg_state *st)
377 437
378static void populate_markers(void) 438static void populate_markers(void)
379{ 439{
380 address_markers[0].start_address = PAGE_OFFSET; 440 int i = 0;
381 address_markers[1].start_address = VMALLOC_START; 441
382 address_markers[2].start_address = VMALLOC_END; 442 address_markers[i++].start_address = PAGE_OFFSET;
383 address_markers[3].start_address = ISA_IO_BASE; 443 address_markers[i++].start_address = VMALLOC_START;
384 address_markers[4].start_address = ISA_IO_END; 444 address_markers[i++].start_address = VMALLOC_END;
385 address_markers[5].start_address = PHB_IO_BASE; 445#ifdef CONFIG_PPC64
386 address_markers[6].start_address = PHB_IO_END; 446 address_markers[i++].start_address = ISA_IO_BASE;
387 address_markers[7].start_address = IOREMAP_BASE; 447 address_markers[i++].start_address = ISA_IO_END;
388 address_markers[8].start_address = IOREMAP_END; 448 address_markers[i++].start_address = PHB_IO_BASE;
449 address_markers[i++].start_address = PHB_IO_END;
450 address_markers[i++].start_address = IOREMAP_BASE;
451 address_markers[i++].start_address = IOREMAP_END;
389#ifdef CONFIG_PPC_STD_MMU_64 452#ifdef CONFIG_PPC_STD_MMU_64
390 address_markers[9].start_address = H_VMEMMAP_BASE; 453 address_markers[i++].start_address = H_VMEMMAP_BASE;
391#else 454#else
392 address_markers[9].start_address = VMEMMAP_BASE; 455 address_markers[i++].start_address = VMEMMAP_BASE;
456#endif
457#else /* !CONFIG_PPC64 */
458 address_markers[i++].start_address = ioremap_bot;
459 address_markers[i++].start_address = IOREMAP_TOP;
460#ifdef CONFIG_NOT_COHERENT_CACHE
461 address_markers[i++].start_address = IOREMAP_TOP;
462 address_markers[i++].start_address = IOREMAP_TOP +
463 CONFIG_CONSISTENT_SIZE;
464#endif
465#ifdef CONFIG_HIGHMEM
466 address_markers[i++].start_address = PKMAP_BASE;
467 address_markers[i++].start_address = PKMAP_ADDR(LAST_PKMAP);
393#endif 468#endif
469 address_markers[i++].start_address = FIXADDR_START;
470 address_markers[i++].start_address = FIXADDR_TOP;
471#endif /* CONFIG_PPC64 */
394} 472}
395 473
396static int ptdump_show(struct seq_file *m, void *v) 474static int ptdump_show(struct seq_file *m, void *v)
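
populate_markers() above drops the hard-coded array indices in favour of a running index, so the 32-bit-only and 64-bit-only markers can be compiled in or out without renumbering everything after them. A stripped-down sketch of the idiom (marker names and the EXAMPLE_64BIT guard are invented):

struct marker_sketch { unsigned long start_address; const char *name; };

static struct marker_sketch markers[] = {
	{ 0, "Start of kernel VM" },
	{ 0, "vmalloc() Area" },
#ifdef EXAMPLE_64BIT
	{ 0, "isa I/O start" },
#endif
	{ -1UL, NULL },			/* terminator */
};

static void populate_sketch(unsigned long page_offset,
			    unsigned long vmalloc_start,
			    unsigned long isa_io_base)
{
	int i = 0;

	/* Every assignment advances i, so conditional entries slot in without
	 * disturbing the indices of the entries that follow them. */
	markers[i++].start_address = page_offset;
	markers[i++].start_address = vmalloc_start;
#ifdef EXAMPLE_64BIT
	markers[i++].start_address = isa_io_base;
#endif
}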
@@ -435,7 +513,7 @@ static int ptdump_init(void)
435 513
436 populate_markers(); 514 populate_markers();
437 build_pgtable_complete_mask(); 515 build_pgtable_complete_mask();
438 debugfs_file = debugfs_create_file("kernel_pagetables", 0400, NULL, 516 debugfs_file = debugfs_create_file("kernel_page_tables", 0400, NULL,
439 NULL, &ptdump_fops); 517 NULL, &ptdump_fops);
440 return debugfs_file ? 0 : -ENOMEM; 518 return debugfs_file ? 0 : -ENOMEM;
441} 519}
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 51def8a515be..3a7d580fdc59 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -120,8 +120,6 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address,
120 siginfo_t info; 120 siginfo_t info;
121 unsigned int lsb = 0; 121 unsigned int lsb = 0;
122 122
123 up_read(&current->mm->mmap_sem);
124
125 if (!user_mode(regs)) 123 if (!user_mode(regs))
126 return MM_FAULT_ERR(SIGBUS); 124 return MM_FAULT_ERR(SIGBUS);
127 125
@@ -154,13 +152,6 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
154 * continue the pagefault. 152 * continue the pagefault.
155 */ 153 */
156 if (fatal_signal_pending(current)) { 154 if (fatal_signal_pending(current)) {
157 /*
158 * If we have retry set, the mmap semaphore will have
159 * alrady been released in __lock_page_or_retry(). Else
160 * we release it now.
161 */
162 if (!(fault & VM_FAULT_RETRY))
163 up_read(&current->mm->mmap_sem);
164 /* Coming from kernel, we need to deal with uaccess fixups */ 155 /* Coming from kernel, we need to deal with uaccess fixups */
165 if (user_mode(regs)) 156 if (user_mode(regs))
166 return MM_FAULT_RETURN; 157 return MM_FAULT_RETURN;
@@ -173,8 +164,6 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
173 164
174 /* Out of memory */ 165 /* Out of memory */
175 if (fault & VM_FAULT_OOM) { 166 if (fault & VM_FAULT_OOM) {
176 up_read(&current->mm->mmap_sem);
177
178 /* 167 /*
179 * We ran out of memory, or some other thing happened to us that 168 * We ran out of memory, or some other thing happened to us that
180 * made us unable to handle the page fault gracefully. 169 * made us unable to handle the page fault gracefully.
@@ -298,7 +287,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
298 * can result in fault, which will cause a deadlock when called with 287 * can result in fault, which will cause a deadlock when called with
299 * mmap_sem held 288 * mmap_sem held
300 */ 289 */
301 if (user_mode(regs)) 290 if (!is_exec && user_mode(regs))
302 store_update_sp = store_updates_sp(regs); 291 store_update_sp = store_updates_sp(regs);
303 292
304 if (user_mode(regs)) 293 if (user_mode(regs))
@@ -458,9 +447,30 @@ good_area:
458 * the fault. 447 * the fault.
459 */ 448 */
460 fault = handle_mm_fault(vma, address, flags); 449 fault = handle_mm_fault(vma, address, flags);
450
451 /*
452 * Handle the retry right now, the mmap_sem has been released in that
453 * case.
454 */
455 if (unlikely(fault & VM_FAULT_RETRY)) {
456 /* We retry only once */
457 if (flags & FAULT_FLAG_ALLOW_RETRY) {
458 /*
459 * Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
460 * of starvation.
461 */
462 flags &= ~FAULT_FLAG_ALLOW_RETRY;
463 flags |= FAULT_FLAG_TRIED;
464 if (!fatal_signal_pending(current))
465 goto retry;
466 }
467 /* We will enter mm_fault_error() below */
468 } else
469 up_read(&current->mm->mmap_sem);
470
461 if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) { 471 if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
462 if (fault & VM_FAULT_SIGSEGV) 472 if (fault & VM_FAULT_SIGSEGV)
463 goto bad_area; 473 goto bad_area_nosemaphore;
464 rc = mm_fault_error(regs, address, fault); 474 rc = mm_fault_error(regs, address, fault);
465 if (rc >= MM_FAULT_RETURN) 475 if (rc >= MM_FAULT_RETURN)
466 goto bail; 476 goto bail;
@@ -469,41 +479,29 @@ good_area:
469 } 479 }
470 480
471 /* 481 /*
472 * Major/minor page fault accounting is only done on the 482 * Major/minor page fault accounting.
473 * initial attempt. If we go through a retry, it is extremely
474 * likely that the page will be found in page cache at that point.
475 */ 483 */
476 if (flags & FAULT_FLAG_ALLOW_RETRY) { 484 if (fault & VM_FAULT_MAJOR) {
477 if (fault & VM_FAULT_MAJOR) { 485 current->maj_flt++;
478 current->maj_flt++; 486 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
479 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 487 regs, address);
480 regs, address);
481#ifdef CONFIG_PPC_SMLPAR 488#ifdef CONFIG_PPC_SMLPAR
482 if (firmware_has_feature(FW_FEATURE_CMO)) { 489 if (firmware_has_feature(FW_FEATURE_CMO)) {
483 u32 page_ins; 490 u32 page_ins;
484 491
485 preempt_disable(); 492 preempt_disable();
486 page_ins = be32_to_cpu(get_lppaca()->page_ins); 493 page_ins = be32_to_cpu(get_lppaca()->page_ins);
487 page_ins += 1 << PAGE_FACTOR; 494 page_ins += 1 << PAGE_FACTOR;
488 get_lppaca()->page_ins = cpu_to_be32(page_ins); 495 get_lppaca()->page_ins = cpu_to_be32(page_ins);
489 preempt_enable(); 496 preempt_enable();
490 }
491#endif /* CONFIG_PPC_SMLPAR */
492 } else {
493 current->min_flt++;
494 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
495 regs, address);
496 }
497 if (fault & VM_FAULT_RETRY) {
498 /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
499 * of starvation. */
500 flags &= ~FAULT_FLAG_ALLOW_RETRY;
501 flags |= FAULT_FLAG_TRIED;
502 goto retry;
503 } 497 }
498#endif /* CONFIG_PPC_SMLPAR */
499 } else {
500 current->min_flt++;
501 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
502 regs, address);
504 } 503 }
505 504
506 up_read(&mm->mmap_sem);
507 goto bail; 505 goto bail;
508 506
509bad_area: 507bad_area:
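
The fault.c hunks above centralise the mmap_sem release: handle_mm_fault() drops the semaphore itself when it returns VM_FAULT_RETRY, so do_page_fault() now retries at most once and only releases the semaphore on the non-retry path, while do_sigbus() and mm_fault_error() no longer touch it at all. A condensed sketch of the resulting flow, reassembled from the hunks (not the complete handler):

retry:
	fault = handle_mm_fault(vma, address, flags);

	if (unlikely(fault & VM_FAULT_RETRY)) {
		/* mmap_sem was already released by the core mm code. */
		if (flags & FAULT_FLAG_ALLOW_RETRY) {
			/* Retry only once to avoid starvation. */
			flags &= ~FAULT_FLAG_ALLOW_RETRY;
			flags |= FAULT_FLAG_TRIED;
			if (!fatal_signal_pending(current))
				goto retry;
		}
		/* fall through to mm_fault_error() with the semaphore dropped */
	} else {
		up_read(&current->mm->mmap_sem);
	}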
diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S
index 09cc50c8dace..6f962e5cb5e1 100644
--- a/arch/powerpc/mm/hash_low_32.S
+++ b/arch/powerpc/mm/hash_low_32.S
@@ -31,10 +31,8 @@
31#ifdef CONFIG_SMP 31#ifdef CONFIG_SMP
32 .section .bss 32 .section .bss
33 .align 2 33 .align 2
34 .globl mmu_hash_lock
35mmu_hash_lock: 34mmu_hash_lock:
36 .space 4 35 .space 4
37EXPORT_SYMBOL(mmu_hash_lock)
38#endif /* CONFIG_SMP */ 36#endif /* CONFIG_SMP */
39 37
40/* 38/*
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index c554768b1fa2..f2095ce9d4b0 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -35,9 +35,8 @@
35#include <linux/memblock.h> 35#include <linux/memblock.h>
36#include <linux/context_tracking.h> 36#include <linux/context_tracking.h>
37#include <linux/libfdt.h> 37#include <linux/libfdt.h>
38#include <linux/debugfs.h>
39 38
40#include <asm/debug.h> 39#include <asm/debugfs.h>
41#include <asm/processor.h> 40#include <asm/processor.h>
42#include <asm/pgtable.h> 41#include <asm/pgtable.h>
43#include <asm/mmu.h> 42#include <asm/mmu.h>
@@ -927,11 +926,6 @@ static void __init htab_initialize(void)
927 } 926 }
928#endif /* CONFIG_DEBUG_PAGEALLOC */ 927#endif /* CONFIG_DEBUG_PAGEALLOC */
929 928
930 /* On U3 based machines, we need to reserve the DART area and
931 * _NOT_ map it to avoid cache paradoxes as it's remapped non
932 * cacheable later on
933 */
934
935 /* create bolted the linear mapping in the hash table */ 929 /* create bolted the linear mapping in the hash table */
936 for_each_memblock(memory, reg) { 930 for_each_memblock(memory, reg) {
937 base = (unsigned long)__va(reg->base); 931 base = (unsigned long)__va(reg->base);
@@ -981,6 +975,19 @@ void __init hash__early_init_devtree(void)
981 975
982void __init hash__early_init_mmu(void) 976void __init hash__early_init_mmu(void)
983{ 977{
978 /*
979 * We have code in __hash_page_64K() and elsewhere, which assumes it can
980 * do the following:
981 * new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & (H_PAGE_F_SECOND | H_PAGE_F_GIX);
982 *
983 * Where the slot number is between 0-15, and values of 8-15 indicate
984 * the secondary bucket. For that code to work H_PAGE_F_SECOND and
985 * H_PAGE_F_GIX must occupy four contiguous bits in the PTE, and
986 * H_PAGE_F_SECOND must be placed above H_PAGE_F_GIX. Assert that here
987 * with a BUILD_BUG_ON().
988 */
989 BUILD_BUG_ON(H_PAGE_F_SECOND != (1ul << (H_PAGE_F_GIX_SHIFT + 3)));
990
984 htab_init_page_sizes(); 991 htab_init_page_sizes();
985 992
986 /* 993 /*
@@ -1120,7 +1127,7 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
1120 copro_flush_all_slbs(mm); 1127 copro_flush_all_slbs(mm);
1121 if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) { 1128 if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) {
1122 1129
1123 copy_mm_to_paca(&mm->context); 1130 copy_mm_to_paca(mm);
1124 slb_flush_and_rebolt(); 1131 slb_flush_and_rebolt();
1125 } 1132 }
1126} 1133}
@@ -1192,7 +1199,7 @@ static void check_paca_psize(unsigned long ea, struct mm_struct *mm,
1192{ 1199{
1193 if (user_region) { 1200 if (user_region) {
1194 if (psize != get_paca_psize(ea)) { 1201 if (psize != get_paca_psize(ea)) {
1195 copy_mm_to_paca(&mm->context); 1202 copy_mm_to_paca(mm);
1196 slb_flush_and_rebolt(); 1203 slb_flush_and_rebolt();
1197 } 1204 }
1198 } else if (get_paca()->vmalloc_sllp != 1205 } else if (get_paca()->vmalloc_sllp !=
@@ -1855,5 +1862,4 @@ static int __init hash64_debugfs(void)
1855 return 0; 1862 return 0;
1856} 1863}
1857machine_device_initcall(pseries, hash64_debugfs); 1864machine_device_initcall(pseries, hash64_debugfs);
1858
1859#endif /* CONFIG_DEBUG_FS */ 1865#endif /* CONFIG_DEBUG_FS */
diff --git a/arch/powerpc/mm/hugetlbpage-book3e.c b/arch/powerpc/mm/hugetlbpage-book3e.c
index 83a8be791e06..bfe4e8526b2d 100644
--- a/arch/powerpc/mm/hugetlbpage-book3e.c
+++ b/arch/powerpc/mm/hugetlbpage-book3e.c
@@ -148,16 +148,9 @@ void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea,
148 148
149 mm = vma->vm_mm; 149 mm = vma->vm_mm;
150 150
151#ifdef CONFIG_PPC_MM_SLICES
152 psize = get_slice_psize(mm, ea);
153 tsize = mmu_get_tsize(psize);
154 shift = mmu_psize_defs[psize].shift;
155#else
156 psize = vma_mmu_pagesize(vma); 151 psize = vma_mmu_pagesize(vma);
157 shift = __ilog2(psize); 152 shift = __ilog2(psize);
158 tsize = shift - 10; 153 tsize = shift - 10;
159#endif
160
161 /* 154 /*
162 * We can't be interrupted while we're setting up the MAS 155 * We can't be interrupted while we're setting up the MAS
163 * registers or after we've confirmed that no tlb exists. 156 * registers or after we've confirmed that no tlb exists.
diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c
index 35254a678456..6575b9aabef4 100644
--- a/arch/powerpc/mm/hugetlbpage-radix.c
+++ b/arch/powerpc/mm/hugetlbpage-radix.c
@@ -50,9 +50,12 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
50 struct hstate *h = hstate_file(file); 50 struct hstate *h = hstate_file(file);
51 struct vm_unmapped_area_info info; 51 struct vm_unmapped_area_info info;
52 52
53 if (unlikely(addr > mm->context.addr_limit && addr < TASK_SIZE))
54 mm->context.addr_limit = TASK_SIZE;
55
53 if (len & ~huge_page_mask(h)) 56 if (len & ~huge_page_mask(h))
54 return -EINVAL; 57 return -EINVAL;
55 if (len > TASK_SIZE) 58 if (len > mm->task_size)
56 return -ENOMEM; 59 return -ENOMEM;
57 60
58 if (flags & MAP_FIXED) { 61 if (flags & MAP_FIXED) {
@@ -64,7 +67,7 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
64 if (addr) { 67 if (addr) {
65 addr = ALIGN(addr, huge_page_size(h)); 68 addr = ALIGN(addr, huge_page_size(h));
66 vma = find_vma(mm, addr); 69 vma = find_vma(mm, addr);
67 if (TASK_SIZE - len >= addr && 70 if (mm->task_size - len >= addr &&
68 (!vma || addr + len <= vma->vm_start)) 71 (!vma || addr + len <= vma->vm_start))
69 return addr; 72 return addr;
70 } 73 }
@@ -78,5 +81,9 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
78 info.high_limit = current->mm->mmap_base; 81 info.high_limit = current->mm->mmap_base;
79 info.align_mask = PAGE_MASK & ~huge_page_mask(h); 82 info.align_mask = PAGE_MASK & ~huge_page_mask(h);
80 info.align_offset = 0; 83 info.align_offset = 0;
84
85 if (addr > DEFAULT_MAP_WINDOW)
86 info.high_limit += mm->context.addr_limit - DEFAULT_MAP_WINDOW;
87
81 return vm_unmapped_area(&info); 88 return vm_unmapped_area(&info);
82} 89}
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 8c3389cbcd12..a4f33de4008e 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -753,6 +753,24 @@ static int __init add_huge_page_size(unsigned long long size)
753 if ((mmu_psize = shift_to_mmu_psize(shift)) < 0) 753 if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
754 return -EINVAL; 754 return -EINVAL;
755 755
756#ifdef CONFIG_PPC_BOOK3S_64
757 /*
758 * We need to make sure that for different page sizes reported by
759 * firmware we only add hugetlb support for page sizes that can be
760 * supported by linux page table layout.
761 * For now we have
762 * Radix: 2M
763 * Hash: 16M and 16G
764 */
765 if (radix_enabled()) {
766 if (mmu_psize != MMU_PAGE_2M)
767 return -EINVAL;
768 } else {
769 if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G)
770 return -EINVAL;
771 }
772#endif
773
756 BUG_ON(mmu_psize_defs[mmu_psize].shift != shift); 774 BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
757 775
758 /* Return if huge page size has already been setup */ 776 /* Return if huge page size has already been setup */
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index c22f207aa656..ec84b31c6c86 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -71,10 +71,6 @@
71#if H_PGTABLE_RANGE > USER_VSID_RANGE 71#if H_PGTABLE_RANGE > USER_VSID_RANGE
72#warning Limited user VSID range means pagetable space is wasted 72#warning Limited user VSID range means pagetable space is wasted
73#endif 73#endif
74
75#if (TASK_SIZE_USER64 < H_PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE)
76#warning TASK_SIZE is smaller than it needs to be.
77#endif
78#endif /* CONFIG_PPC_STD_MMU_64 */ 74#endif /* CONFIG_PPC_STD_MMU_64 */
79 75
80phys_addr_t memstart_addr = ~0; 76phys_addr_t memstart_addr = ~0;
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
index a5d9ef59debe..9dbd2a733d6b 100644
--- a/arch/powerpc/mm/mmap.c
+++ b/arch/powerpc/mm/mmap.c
@@ -59,13 +59,14 @@ static inline int mmap_is_legacy(void)
59 59
60unsigned long arch_mmap_rnd(void) 60unsigned long arch_mmap_rnd(void)
61{ 61{
62 unsigned long rnd; 62 unsigned long shift, rnd;
63 63
64 /* 8MB for 32bit, 1GB for 64bit */ 64 shift = mmap_rnd_bits;
65#ifdef CONFIG_COMPAT
65 if (is_32bit_task()) 66 if (is_32bit_task())
66 rnd = get_random_long() % (1<<(23-PAGE_SHIFT)); 67 shift = mmap_rnd_compat_bits;
67 else 68#endif
68 rnd = get_random_long() % (1UL<<(30-PAGE_SHIFT)); 69 rnd = get_random_long() % (1ul << shift);
69 70
70 return rnd << PAGE_SHIFT; 71 return rnd << PAGE_SHIFT;
71} 72}
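
arch_mmap_rnd() above now takes its entropy width from the generic mmap_rnd_bits / mmap_rnd_compat_bits knobs (set at build time via CONFIG_ARCH_MMAP_RND_BITS* and adjustable through the vm.mmap_rnd_bits sysctls) instead of the hard-coded 8 MB and 1 GB windows. A worked example with assumed values, showing the old windows remain reachable:

/* With 64K pages (PAGE_SHIFT = 16), the randomisation window is
 * (1UL << shift) << PAGE_SHIFT.  Assuming mmap_rnd_bits = 14 and
 * mmap_rnd_compat_bits = 7 (values chosen for illustration only):
 *
 *	64-bit task:  (1UL << 14) << 16 = 1 GB   (matches the old fixed window)
 *	32-bit task:  (1UL << 7)  << 16 = 8 MB   (matches the old fixed window)
 *
 * Larger CONFIG_ARCH_MMAP_RND_BITS values simply widen the window. */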
@@ -79,7 +80,7 @@ static inline unsigned long mmap_base(unsigned long rnd)
79 else if (gap > MAX_GAP) 80 else if (gap > MAX_GAP)
80 gap = MAX_GAP; 81 gap = MAX_GAP;
81 82
82 return PAGE_ALIGN(TASK_SIZE - gap - rnd); 83 return PAGE_ALIGN(DEFAULT_MAP_WINDOW - gap - rnd);
83} 84}
84 85
85#ifdef CONFIG_PPC_RADIX_MMU 86#ifdef CONFIG_PPC_RADIX_MMU
@@ -97,7 +98,11 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr,
97 struct vm_area_struct *vma; 98 struct vm_area_struct *vma;
98 struct vm_unmapped_area_info info; 99 struct vm_unmapped_area_info info;
99 100
100 if (len > TASK_SIZE - mmap_min_addr) 101 if (unlikely(addr > mm->context.addr_limit &&
102 mm->context.addr_limit != TASK_SIZE))
103 mm->context.addr_limit = TASK_SIZE;
104
105 if (len > mm->task_size - mmap_min_addr)
101 return -ENOMEM; 106 return -ENOMEM;
102 107
103 if (flags & MAP_FIXED) 108 if (flags & MAP_FIXED)
@@ -106,7 +111,7 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr,
106 if (addr) { 111 if (addr) {
107 addr = PAGE_ALIGN(addr); 112 addr = PAGE_ALIGN(addr);
108 vma = find_vma(mm, addr); 113 vma = find_vma(mm, addr);
109 if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && 114 if (mm->task_size - len >= addr && addr >= mmap_min_addr &&
110 (!vma || addr + len <= vma->vm_start)) 115 (!vma || addr + len <= vma->vm_start))
111 return addr; 116 return addr;
112 } 117 }
@@ -114,8 +119,13 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr,
114 info.flags = 0; 119 info.flags = 0;
115 info.length = len; 120 info.length = len;
116 info.low_limit = mm->mmap_base; 121 info.low_limit = mm->mmap_base;
117 info.high_limit = TASK_SIZE;
118 info.align_mask = 0; 122 info.align_mask = 0;
123
124 if (unlikely(addr > DEFAULT_MAP_WINDOW))
125 info.high_limit = mm->context.addr_limit;
126 else
127 info.high_limit = DEFAULT_MAP_WINDOW;
128
119 return vm_unmapped_area(&info); 129 return vm_unmapped_area(&info);
120} 130}
121 131
@@ -131,8 +141,12 @@ radix__arch_get_unmapped_area_topdown(struct file *filp,
131 unsigned long addr = addr0; 141 unsigned long addr = addr0;
132 struct vm_unmapped_area_info info; 142 struct vm_unmapped_area_info info;
133 143
144 if (unlikely(addr > mm->context.addr_limit &&
145 mm->context.addr_limit != TASK_SIZE))
146 mm->context.addr_limit = TASK_SIZE;
147
134 /* requested length too big for entire address space */ 148 /* requested length too big for entire address space */
135 if (len > TASK_SIZE - mmap_min_addr) 149 if (len > mm->task_size - mmap_min_addr)
136 return -ENOMEM; 150 return -ENOMEM;
137 151
138 if (flags & MAP_FIXED) 152 if (flags & MAP_FIXED)
@@ -142,7 +156,7 @@ radix__arch_get_unmapped_area_topdown(struct file *filp,
142 if (addr) { 156 if (addr) {
143 addr = PAGE_ALIGN(addr); 157 addr = PAGE_ALIGN(addr);
144 vma = find_vma(mm, addr); 158 vma = find_vma(mm, addr);
145 if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && 159 if (mm->task_size - len >= addr && addr >= mmap_min_addr &&
146 (!vma || addr + len <= vma->vm_start)) 160 (!vma || addr + len <= vma->vm_start))
147 return addr; 161 return addr;
148 } 162 }
@@ -152,7 +166,14 @@ radix__arch_get_unmapped_area_topdown(struct file *filp,
152 info.low_limit = max(PAGE_SIZE, mmap_min_addr); 166 info.low_limit = max(PAGE_SIZE, mmap_min_addr);
153 info.high_limit = mm->mmap_base; 167 info.high_limit = mm->mmap_base;
154 info.align_mask = 0; 168 info.align_mask = 0;
169
170 if (addr > DEFAULT_MAP_WINDOW)
171 info.high_limit += mm->context.addr_limit - DEFAULT_MAP_WINDOW;
172
155 addr = vm_unmapped_area(&info); 173 addr = vm_unmapped_area(&info);
174 if (!(addr & ~PAGE_MASK))
175 return addr;
176 VM_BUG_ON(addr != -ENOMEM);
156 177
157 /* 178 /*
158 * A failed mmap() very likely causes application failure, 179 * A failed mmap() very likely causes application failure,
@@ -160,15 +181,7 @@ radix__arch_get_unmapped_area_topdown(struct file *filp,
160 * can happen with large stack limits and large mmap() 181 * can happen with large stack limits and large mmap()
161 * allocations. 182 * allocations.
162 */ 183 */
163 if (addr & ~PAGE_MASK) { 184 return radix__arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
164 VM_BUG_ON(addr != -ENOMEM);
165 info.flags = 0;
166 info.low_limit = TASK_UNMAPPED_BASE;
167 info.high_limit = TASK_SIZE;
168 addr = vm_unmapped_area(&info);
169 }
170
171 return addr;
172} 185}
173 186
174static void radix__arch_pick_mmap_layout(struct mm_struct *mm, 187static void radix__arch_pick_mmap_layout(struct mm_struct *mm,
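
Both unmapped-area hunks above implement the 128TB/512TB policy from the pull description: searches stay below DEFAULT_MAP_WINDOW unless the caller passed an address hint above it, in which case the limit grows to the per-mm addr_limit. A condensed sketch of the bound selection in the bottom-up path (the top-down path instead adds the extra span on top of its mmap_base limit):

/* hint_addr is the address the caller passed to mmap(); DEFAULT_MAP_WINDOW
 * is 128TB on book3s64 and addr_limit may extend to the full 512TB. */
if (unlikely(hint_addr > DEFAULT_MAP_WINDOW))
	info.high_limit = mm->context.addr_limit;	/* opted in to large VA */
else
	info.high_limit = DEFAULT_MAP_WINDOW;		/* default 128TB window */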
diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c
index 73bf6e14c3aa..c6dca2ae78ef 100644
--- a/arch/powerpc/mm/mmu_context_book3s64.c
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -30,17 +30,16 @@
30static DEFINE_SPINLOCK(mmu_context_lock); 30static DEFINE_SPINLOCK(mmu_context_lock);
31static DEFINE_IDA(mmu_context_ida); 31static DEFINE_IDA(mmu_context_ida);
32 32
33int __init_new_context(void) 33static int alloc_context_id(int min_id, int max_id)
34{ 34{
35 int index; 35 int index, err;
36 int err;
37 36
38again: 37again:
39 if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL)) 38 if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL))
40 return -ENOMEM; 39 return -ENOMEM;
41 40
42 spin_lock(&mmu_context_lock); 41 spin_lock(&mmu_context_lock);
43 err = ida_get_new_above(&mmu_context_ida, 1, &index); 42 err = ida_get_new_above(&mmu_context_ida, min_id, &index);
44 spin_unlock(&mmu_context_lock); 43 spin_unlock(&mmu_context_lock);
45 44
46 if (err == -EAGAIN) 45 if (err == -EAGAIN)
@@ -48,7 +47,7 @@ again:
48 else if (err) 47 else if (err)
49 return err; 48 return err;
50 49
51 if (index > MAX_USER_CONTEXT) { 50 if (index > max_id) {
52 spin_lock(&mmu_context_lock); 51 spin_lock(&mmu_context_lock);
53 ida_remove(&mmu_context_ida, index); 52 ida_remove(&mmu_context_ida, index);
54 spin_unlock(&mmu_context_lock); 53 spin_unlock(&mmu_context_lock);
@@ -57,48 +56,105 @@ again:
57 56
58 return index; 57 return index;
59} 58}
60EXPORT_SYMBOL_GPL(__init_new_context); 59
61static int radix__init_new_context(struct mm_struct *mm, int index) 60void hash__reserve_context_id(int id)
61{
62 int rc, result = 0;
63
64 do {
65 if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL))
66 break;
67
68 spin_lock(&mmu_context_lock);
69 rc = ida_get_new_above(&mmu_context_ida, id, &result);
70 spin_unlock(&mmu_context_lock);
71 } while (rc == -EAGAIN);
72
73 WARN(result != id, "mmu: Failed to reserve context id %d (rc %d)\n", id, result);
74}
75
76int hash__alloc_context_id(void)
77{
78 unsigned long max;
79
80 if (mmu_has_feature(MMU_FTR_68_BIT_VA))
81 max = MAX_USER_CONTEXT;
82 else
83 max = MAX_USER_CONTEXT_65BIT_VA;
84
85 return alloc_context_id(MIN_USER_CONTEXT, max);
86}
87EXPORT_SYMBOL_GPL(hash__alloc_context_id);
88
89static int hash__init_new_context(struct mm_struct *mm)
90{
91 int index;
92
93 index = hash__alloc_context_id();
94 if (index < 0)
95 return index;
96
97 /*
98 * We do switch_slb() early in fork, even before we setup the
99 * mm->context.addr_limit. Default to max task size so that we copy the
100 * default values to paca which will help us to handle slb miss early.
101 */
102 mm->context.addr_limit = TASK_SIZE_128TB;
103
104 /*
105 * The old code would re-promote on fork, we don't do that when using
106 * slices as it could cause problem promoting slices that have been
107 * forced down to 4K.
108 *
109 * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check
110 * explicitly against context.id == 0. This ensures that we properly
111 * initialize context slice details for newly allocated mm's (which will
112 * have id == 0) and don't alter context slice inherited via fork (which
113 * will have id != 0).
114 *
115 * We should not be calling init_new_context() on init_mm. Hence a
116 * check against 0 is OK.
117 */
118 if (mm->context.id == 0)
119 slice_set_user_psize(mm, mmu_virtual_psize);
120
121 subpage_prot_init_new_context(mm);
122
123 return index;
124}
125
126static int radix__init_new_context(struct mm_struct *mm)
62{ 127{
63 unsigned long rts_field; 128 unsigned long rts_field;
129 int index;
130
131 index = alloc_context_id(1, PRTB_ENTRIES - 1);
132 if (index < 0)
133 return index;
64 134
65 /* 135 /*
66 * set the process table entry, 136 * set the process table entry,
67 */ 137 */
68 rts_field = radix__get_tree_size(); 138 rts_field = radix__get_tree_size();
69 process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE); 139 process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE);
70 return 0; 140
141 mm->context.npu_context = NULL;
142
143 return index;
71} 144}
72 145
73int init_new_context(struct task_struct *tsk, struct mm_struct *mm) 146int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
74{ 147{
75 int index; 148 int index;
76 149
77 index = __init_new_context(); 150 if (radix_enabled())
151 index = radix__init_new_context(mm);
152 else
153 index = hash__init_new_context(mm);
154
78 if (index < 0) 155 if (index < 0)
79 return index; 156 return index;
80 157
81 if (radix_enabled()) {
82 radix__init_new_context(mm, index);
83 } else {
84
85 /* The old code would re-promote on fork, we don't do that
86 * when using slices as it could cause problem promoting slices
87 * that have been forced down to 4K
88 *
89 * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check
90 * explicitly against context.id == 0. This ensures that we
91 * properly initialize context slice details for newly allocated
92 * mm's (which will have id == 0) and don't alter context slice
93 * inherited via fork (which will have id != 0).
94 *
95 * We should not be calling init_new_context() on init_mm. Hence a
96 * check against 0 is ok.
97 */
98 if (mm->context.id == 0)
99 slice_set_user_psize(mm, mmu_virtual_psize);
100 subpage_prot_init_new_context(mm);
101 }
102 mm->context.id = index; 158 mm->context.id = index;
103#ifdef CONFIG_PPC_ICSWX 159#ifdef CONFIG_PPC_ICSWX
104 mm->context.cop_lockp = kmalloc(sizeof(spinlock_t), GFP_KERNEL); 160 mm->context.cop_lockp = kmalloc(sizeof(spinlock_t), GFP_KERNEL);
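
init_new_context() above is now split per MMU type: the hash path allocates its context id between MIN_USER_CONTEXT and a maximum that depends on MMU_FTR_68_BIT_VA, while the radix path allocates a process-table slot in [1, PRTB_ENTRIES - 1]; both share one IDA-backed allocator. A condensed sketch of that allocator loop, genericised from the hunk (parameter names are illustrative):

static int alloc_context_id_sketch(struct ida *ida, spinlock_t *lock,
				   int min_id, int max_id)
{
	int index, err;

again:
	if (!ida_pre_get(ida, GFP_KERNEL))
		return -ENOMEM;

	spin_lock(lock);
	err = ida_get_new_above(ida, min_id, &index);
	spin_unlock(lock);

	if (err == -EAGAIN)
		goto again;		/* lost a race, preallocate and retry */
	else if (err)
		return err;

	if (index > max_id) {		/* past the caller's limit: give it back */
		spin_lock(lock);
		ida_remove(ida, index);
		spin_unlock(lock);
		return -ENOMEM;
	}

	return index;
}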
diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
index 497130c5c742..e0a2d8e806ed 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -81,7 +81,7 @@ struct page *new_iommu_non_cma_page(struct page *page, unsigned long private,
81 gfp_t gfp_mask = GFP_USER; 81 gfp_t gfp_mask = GFP_USER;
82 struct page *new_page; 82 struct page *new_page;
83 83
84 if (PageHuge(page) || PageTransHuge(page) || PageCompound(page)) 84 if (PageCompound(page))
85 return NULL; 85 return NULL;
86 86
87 if (PageHighMem(page)) 87 if (PageHighMem(page))
@@ -100,7 +100,7 @@ static int mm_iommu_move_page_from_cma(struct page *page)
100 LIST_HEAD(cma_migrate_pages); 100 LIST_HEAD(cma_migrate_pages);
101 101
102 /* Ignore huge pages for now */ 102 /* Ignore huge pages for now */
103 if (PageHuge(page) || PageTransHuge(page) || PageCompound(page)) 103 if (PageCompound(page))
104 return -EBUSY; 104 return -EBUSY;
105 105
106 lru_add_drain(); 106 lru_add_drain();
@@ -314,6 +314,25 @@ struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm,
314} 314}
315EXPORT_SYMBOL_GPL(mm_iommu_lookup); 315EXPORT_SYMBOL_GPL(mm_iommu_lookup);
316 316
317struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm,
318 unsigned long ua, unsigned long size)
319{
320 struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
321
322 list_for_each_entry_lockless(mem, &mm->context.iommu_group_mem_list,
323 next) {
324 if ((mem->ua <= ua) &&
325 (ua + size <= mem->ua +
326 (mem->entries << PAGE_SHIFT))) {
327 ret = mem;
328 break;
329 }
330 }
331
332 return ret;
333}
334EXPORT_SYMBOL_GPL(mm_iommu_lookup_rm);
335
317struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm, 336struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
318 unsigned long ua, unsigned long entries) 337 unsigned long ua, unsigned long entries)
319{ 338{
@@ -345,6 +364,26 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
345} 364}
346EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa); 365EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa);
347 366
367long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
368 unsigned long ua, unsigned long *hpa)
369{
370 const long entry = (ua - mem->ua) >> PAGE_SHIFT;
371 void *va = &mem->hpas[entry];
372 unsigned long *pa;
373
374 if (entry >= mem->entries)
375 return -EFAULT;
376
377 pa = (void *) vmalloc_to_phys(va);
378 if (!pa)
379 return -EFAULT;
380
381 *hpa = *pa | (ua & ~PAGE_MASK);
382
383 return 0;
384}
385EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa_rm);
386
348long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem) 387long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem)
349{ 388{
350 if (atomic64_inc_not_zero(&mem->mapped)) 389 if (atomic64_inc_not_zero(&mem->mapped))
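
The two *_rm helpers added above give real-mode code, which cannot take faults or sleep, a lockless way to translate a preregistered userspace address into a host physical address. A hedged usage sketch; the wrapper and its error handling are illustrative, only the two mm_iommu_*_rm calls come from the patch:

static long ua_to_hpa_realmode_sketch(struct mm_struct *mm, unsigned long ua,
				      unsigned long size, unsigned long *hpa)
{
	struct mm_iommu_table_group_mem_t *mem;

	/* Lockless walk of the preregistered-memory list. */
	mem = mm_iommu_lookup_rm(mm, ua, size);
	if (!mem)
		return -EFAULT;

	/* Translate within the region; fills *hpa on success. */
	return mm_iommu_ua_to_hpa_rm(mem, ua, hpa);
}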
diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
index c491f2c8f2b9..4554d6527682 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -333,11 +333,6 @@ int init_new_context(struct task_struct *t, struct mm_struct *mm)
333 333
334 mm->context.id = MMU_NO_CONTEXT; 334 mm->context.id = MMU_NO_CONTEXT;
335 mm->context.active = 0; 335 mm->context.active = 0;
336
337#ifdef CONFIG_PPC_MM_SLICES
338 slice_set_user_psize(mm, mmu_virtual_psize);
339#endif
340
341 return 0; 336 return 0;
342} 337}
343 338
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 9befaee237d6..371792e4418f 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -875,13 +875,6 @@ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
875 void *nd; 875 void *nd;
876 int tnid; 876 int tnid;
877 877
878 if (spanned_pages)
879 pr_info("Initmem setup node %d [mem %#010Lx-%#010Lx]\n",
880 nid, start_pfn << PAGE_SHIFT,
881 (end_pfn << PAGE_SHIFT) - 1);
882 else
883 pr_info("Initmem setup node %d\n", nid);
884
885 nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid); 878 nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
886 nd = __va(nd_pa); 879 nd = __va(nd_pa);
887 880
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index 5e01b2ece1d0..654a0d7ba0e7 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -131,7 +131,7 @@ static void __slb_flush_and_rebolt(void)
131 "slbmte %2,%3\n" 131 "slbmte %2,%3\n"
132 "isync" 132 "isync"
133 :: "r"(mk_vsid_data(VMALLOC_START, mmu_kernel_ssize, vflags)), 133 :: "r"(mk_vsid_data(VMALLOC_START, mmu_kernel_ssize, vflags)),
134 "r"(mk_esid_data(VMALLOC_START, mmu_kernel_ssize, 1)), 134 "r"(mk_esid_data(VMALLOC_START, mmu_kernel_ssize, VMALLOC_INDEX)),
135 "r"(ksp_vsid_data), 135 "r"(ksp_vsid_data),
136 "r"(ksp_esid_data) 136 "r"(ksp_esid_data)
137 : "memory"); 137 : "memory");
@@ -229,7 +229,7 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
229 asm volatile("slbie %0" : : "r" (slbie_data)); 229 asm volatile("slbie %0" : : "r" (slbie_data));
230 230
231 get_paca()->slb_cache_ptr = 0; 231 get_paca()->slb_cache_ptr = 0;
232 copy_mm_to_paca(&mm->context); 232 copy_mm_to_paca(mm);
233 233
234 /* 234 /*
235 * preload some userspace segments into the SLB. 235 * preload some userspace segments into the SLB.
diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
index a85e06ea6c20..1519617aab36 100644
--- a/arch/powerpc/mm/slb_low.S
+++ b/arch/powerpc/mm/slb_low.S
@@ -23,6 +23,48 @@
23#include <asm/pgtable.h> 23#include <asm/pgtable.h>
24#include <asm/firmware.h> 24#include <asm/firmware.h>
25 25
26/*
27 * This macro generates asm code to compute the VSID scramble
28 * function. Used in slb_allocate() and do_stab_bolted. The function
29 * computed is: (protovsid*VSID_MULTIPLIER) % VSID_MODULUS
30 *
31 * rt = register containing the proto-VSID and into which the
32 * VSID will be stored
33 * rx = scratch register (clobbered)
34 * rf = flags
35 *
36 * - rt and rx must be different registers
37 * - The answer will end up in the low VSID_BITS bits of rt. The higher
38 * bits may contain other garbage, so you may need to mask the
39 * result.
40 */
41#define ASM_VSID_SCRAMBLE(rt, rx, rf, size) \
42 lis rx,VSID_MULTIPLIER_##size@h; \
43 ori rx,rx,VSID_MULTIPLIER_##size@l; \
44 mulld rt,rt,rx; /* rt = rt * MULTIPLIER */ \
45/* \
46 * powermac get slb fault before feature fixup, so make 65 bit part \
47 * the default part of feature fixup \
48 */ \
49BEGIN_MMU_FTR_SECTION \
50 srdi rx,rt,VSID_BITS_65_##size; \
51 clrldi rt,rt,(64-VSID_BITS_65_##size); \
52 add rt,rt,rx; \
53 addi rx,rt,1; \
54 srdi rx,rx,VSID_BITS_65_##size; \
55 add rt,rt,rx; \
56 rldimi rf,rt,SLB_VSID_SHIFT_##size,(64 - (SLB_VSID_SHIFT_##size + VSID_BITS_65_##size)); \
57MMU_FTR_SECTION_ELSE \
58 srdi rx,rt,VSID_BITS_##size; \
59 clrldi rt,rt,(64-VSID_BITS_##size); \
60 add rt,rt,rx; /* add high and low bits */ \
61 addi rx,rt,1; \
62 srdi rx,rx,VSID_BITS_##size; /* extract 2^VSID_BITS bit */ \
63 add rt,rt,rx; \
64 rldimi rf,rt,SLB_VSID_SHIFT_##size,(64 - (SLB_VSID_SHIFT_##size + VSID_BITS_##size)); \
65ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_68_BIT_VA)
66
67
26/* void slb_allocate_realmode(unsigned long ea); 68/* void slb_allocate_realmode(unsigned long ea);
27 * 69 *
28 * Create an SLB entry for the given EA (user or kernel). 70 * Create an SLB entry for the given EA (user or kernel).
@@ -45,13 +87,6 @@ _GLOBAL(slb_allocate_realmode)
45 /* r3 = address, r10 = esid, cr7 = <> PAGE_OFFSET */ 87 /* r3 = address, r10 = esid, cr7 = <> PAGE_OFFSET */
46 blt cr7,0f /* user or kernel? */ 88 blt cr7,0f /* user or kernel? */
47 89
48 /* kernel address: proto-VSID = ESID */
49 /* WARNING - MAGIC: we don't use the VSID 0xfffffffff, but
50 * this code will generate the protoVSID 0xfffffffff for the
51 * top segment. That's ok, the scramble below will translate
52 * it to VSID 0, which is reserved as a bad VSID - one which
53 * will never have any pages in it. */
54
55 /* Check if hitting the linear mapping or some other kernel space 90 /* Check if hitting the linear mapping or some other kernel space
56 */ 91 */
57 bne cr7,1f 92 bne cr7,1f
@@ -63,12 +98,10 @@ _GLOBAL(slb_allocate_realmode)
63slb_miss_kernel_load_linear: 98slb_miss_kernel_load_linear:
64 li r11,0 99 li r11,0
65 /* 100 /*
66 * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1 101 * context = (ea >> 60) - (0xc - 1)
67 * r9 = region id. 102 * r9 = region id.
68 */ 103 */
69 addis r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha 104 subi r9,r9,KERNEL_REGION_CONTEXT_OFFSET
70 addi r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l
71
72 105
73BEGIN_FTR_SECTION 106BEGIN_FTR_SECTION
74 b .Lslb_finish_load 107 b .Lslb_finish_load
@@ -77,9 +110,9 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
77 110
781: 1111:
79#ifdef CONFIG_SPARSEMEM_VMEMMAP 112#ifdef CONFIG_SPARSEMEM_VMEMMAP
80 /* Check virtual memmap region. To be patches at kernel boot */
81 cmpldi cr0,r9,0xf 113 cmpldi cr0,r9,0xf
82 bne 1f 114 bne 1f
115/* Check virtual memmap region. To be patched at kernel boot */
83.globl slb_miss_kernel_load_vmemmap 116.globl slb_miss_kernel_load_vmemmap
84slb_miss_kernel_load_vmemmap: 117slb_miss_kernel_load_vmemmap:
85 li r11,0 118 li r11,0
@@ -102,11 +135,10 @@ slb_miss_kernel_load_io:
102 li r11,0 135 li r11,0
1036: 1366:
104 /* 137 /*
105 * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1 138 * context = (ea >> 60) - (0xc - 1)
106 * r9 = region id. 139 * r9 = region id.
107 */ 140 */
108 addis r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha 141 subi r9,r9,KERNEL_REGION_CONTEXT_OFFSET
109 addi r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l
110 142
111BEGIN_FTR_SECTION 143BEGIN_FTR_SECTION
112 b .Lslb_finish_load 144 b .Lslb_finish_load
@@ -117,7 +149,13 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
117 * For userspace addresses, make sure this is region 0. 149 * For userspace addresses, make sure this is region 0.
118 */ 150 */
119 cmpdi r9, 0 151 cmpdi r9, 0
120 bne 8f 152 bne- 8f
153 /*
154 * user space make sure we are within the allowed limit
155 */
156 ld r11,PACA_ADDR_LIMIT(r13)
157 cmpld r3,r11
158 bge- 8f
121 159
122 /* when using slices, we extract the psize off the slice bitmaps 160 /* when using slices, we extract the psize off the slice bitmaps
123 * and then we need to get the sllp encoding off the mmu_psize_defs 161 * and then we need to get the sllp encoding off the mmu_psize_defs
@@ -189,13 +227,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
189 */ 227 */
190.Lslb_finish_load: 228.Lslb_finish_load:
191 rldimi r10,r9,ESID_BITS,0 229 rldimi r10,r9,ESID_BITS,0
192 ASM_VSID_SCRAMBLE(r10,r9,256M) 230 ASM_VSID_SCRAMBLE(r10,r9,r11,256M)
193 /*
194 * bits above VSID_BITS_256M need to be ignored from r10
195 * also combine VSID and flags
196 */
197 rldimi r11,r10,SLB_VSID_SHIFT,(64 - (SLB_VSID_SHIFT + VSID_BITS_256M))
198
199 /* r3 = EA, r11 = VSID data */ 231 /* r3 = EA, r11 = VSID data */
200 /* 232 /*
201 * Find a slot, round robin. Previously we tried to find a 233 * Find a slot, round robin. Previously we tried to find a
@@ -259,12 +291,12 @@ slb_compare_rr_to_size:
259.Lslb_finish_load_1T: 291.Lslb_finish_load_1T:
260 srdi r10,r10,(SID_SHIFT_1T - SID_SHIFT) /* get 1T ESID */ 292 srdi r10,r10,(SID_SHIFT_1T - SID_SHIFT) /* get 1T ESID */
261 rldimi r10,r9,ESID_BITS_1T,0 293 rldimi r10,r9,ESID_BITS_1T,0
262 ASM_VSID_SCRAMBLE(r10,r9,1T) 294 ASM_VSID_SCRAMBLE(r10,r9,r11,1T)
263 /* 295 /*
264 * bits above VSID_BITS_1T need to be ignored from r10 296 * bits above VSID_BITS_1T need to be ignored from r10
265 * also combine VSID and flags 297 * also combine VSID and flags
266 */ 298 */
267 rldimi r11,r10,SLB_VSID_SHIFT_1T,(64 - (SLB_VSID_SHIFT_1T + VSID_BITS_1T)) 299
268 li r10,MMU_SEGSIZE_1T 300 li r10,MMU_SEGSIZE_1T
269 rldimi r11,r10,SLB_VSID_SSIZE_SHIFT,0 /* insert segment size */ 301 rldimi r11,r10,SLB_VSID_SSIZE_SHIFT,0 /* insert segment size */
270 302
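
The ASM_VSID_SCRAMBLE macro introduced at the top of this file computes (protovsid * VSID_MULTIPLIER) % VSID_MODULUS without a divide: because the modulus is 2^VSID_BITS - 1, the product's high bits can be folded back into the low bits with a single carry correction, and the 65-bit-VA and 68-bit-VA widths are selected by the MMU feature section. A C rendering of the same arithmetic (the constants are parameters here, not the kernel's values):

static unsigned long vsid_scramble_sketch(unsigned long protovsid,
					  unsigned long multiplier,
					  unsigned int vsid_bits)
{
	unsigned long x = protovsid * multiplier;
	unsigned long low_mask = (1UL << vsid_bits) - 1;

	x = (x >> vsid_bits) + (x & low_mask);	/* fold high part into low bits */
	x += (x + 1) >> vsid_bits;		/* carry correction for mod 2^n - 1 */

	return x & low_mask;			/* low VSID_BITS bits hold the VSID */
}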
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 2b27458902ee..966b9fccfa66 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -36,38 +36,29 @@
36#include <asm/copro.h> 36#include <asm/copro.h>
37#include <asm/hugetlb.h> 37#include <asm/hugetlb.h>
38 38
39/* some sanity checks */
40#if (H_PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE
41#error H_PGTABLE_RANGE exceeds slice_mask high_slices size
42#endif
43
44static DEFINE_SPINLOCK(slice_convert_lock); 39static DEFINE_SPINLOCK(slice_convert_lock);
45 40/*
41 * One bit per slice. We have lower slices which cover 256MB segments
42 * upto 4G range. That gets us 16 low slices. For the rest we track slices
43 * in 1TB size.
44 */
45struct slice_mask {
46 u64 low_slices;
47 DECLARE_BITMAP(high_slices, SLICE_NUM_HIGH);
48};
46 49
47#ifdef DEBUG 50#ifdef DEBUG
48int _slice_debug = 1; 51int _slice_debug = 1;
49 52
50static void slice_print_mask(const char *label, struct slice_mask mask) 53static void slice_print_mask(const char *label, struct slice_mask mask)
51{ 54{
52 char *p, buf[16 + 3 + 64 + 1];
53 int i;
54
55 if (!_slice_debug) 55 if (!_slice_debug)
56 return; 56 return;
57 p = buf; 57 pr_devel("%s low_slice: %*pbl\n", label, (int)SLICE_NUM_LOW, &mask.low_slices);
58 for (i = 0; i < SLICE_NUM_LOW; i++) 58 pr_devel("%s high_slice: %*pbl\n", label, (int)SLICE_NUM_HIGH, mask.high_slices);
59 *(p++) = (mask.low_slices & (1 << i)) ? '1' : '0';
60 *(p++) = ' ';
61 *(p++) = '-';
62 *(p++) = ' ';
63 for (i = 0; i < SLICE_NUM_HIGH; i++)
64 *(p++) = (mask.high_slices & (1ul << i)) ? '1' : '0';
65 *(p++) = 0;
66
67 printk(KERN_DEBUG "%s:%s\n", label, buf);
68} 59}
69 60
70#define slice_dbg(fmt...) do { if (_slice_debug) pr_debug(fmt); } while(0) 61#define slice_dbg(fmt...) do { if (_slice_debug) pr_devel(fmt); } while (0)
71 62
72#else 63#else
73 64
@@ -76,25 +67,28 @@ static void slice_print_mask(const char *label, struct slice_mask mask) {}
76 67
77#endif 68#endif
78 69
79static struct slice_mask slice_range_to_mask(unsigned long start, 70static void slice_range_to_mask(unsigned long start, unsigned long len,
80 unsigned long len) 71 struct slice_mask *ret)
81{ 72{
82 unsigned long end = start + len - 1; 73 unsigned long end = start + len - 1;
83 struct slice_mask ret = { 0, 0 }; 74
75 ret->low_slices = 0;
76 bitmap_zero(ret->high_slices, SLICE_NUM_HIGH);
84 77
85 if (start < SLICE_LOW_TOP) { 78 if (start < SLICE_LOW_TOP) {
86 unsigned long mend = min(end, SLICE_LOW_TOP); 79 unsigned long mend = min(end, (SLICE_LOW_TOP - 1));
87 unsigned long mstart = min(start, SLICE_LOW_TOP);
88 80
89 ret.low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1)) 81 ret->low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1))
90 - (1u << GET_LOW_SLICE_INDEX(mstart)); 82 - (1u << GET_LOW_SLICE_INDEX(start));
91 } 83 }
92 84
93 if ((start + len) > SLICE_LOW_TOP) 85 if ((start + len) > SLICE_LOW_TOP) {
94 ret.high_slices = (1ul << (GET_HIGH_SLICE_INDEX(end) + 1)) 86 unsigned long start_index = GET_HIGH_SLICE_INDEX(start);
95 - (1ul << GET_HIGH_SLICE_INDEX(start)); 87 unsigned long align_end = ALIGN(end, (1UL << SLICE_HIGH_SHIFT));
88 unsigned long count = GET_HIGH_SLICE_INDEX(align_end) - start_index;
96 89
97 return ret; 90 bitmap_set(ret->high_slices, start_index, count);
91 }
98} 92}
99 93
100static int slice_area_is_free(struct mm_struct *mm, unsigned long addr, 94static int slice_area_is_free(struct mm_struct *mm, unsigned long addr,
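
With the 512TB address space the high-slice mask no longer fits in a u64, so struct slice_mask above grows a bitmap and slice_range_to_mask() now fills a caller-supplied mask in place instead of returning one by value. A condensed sketch of the new representation and the high-slice fill (names abbreviated from the hunk):

struct slice_mask_sketch {
	u64 low_slices;					/* one bit per 256MB slice below 4GB */
	DECLARE_BITMAP(high_slices, SLICE_NUM_HIGH);	/* one bit per 1TB slice above       */
};

static void high_range_to_mask_sketch(struct slice_mask_sketch *ret,
				      unsigned long start, unsigned long end)
{
	unsigned long start_index = GET_HIGH_SLICE_INDEX(start);
	unsigned long align_end = ALIGN(end, 1UL << SLICE_HIGH_SHIFT);

	/* Set one bit for every 1TB slice the [start, end] range touches. */
	bitmap_set(ret->high_slices, start_index,
		   GET_HIGH_SLICE_INDEX(align_end) - start_index);
}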
@@ -128,53 +122,60 @@ static int slice_high_has_vma(struct mm_struct *mm, unsigned long slice)
128 return !slice_area_is_free(mm, start, end - start); 122 return !slice_area_is_free(mm, start, end - start);
129} 123}
130 124
131static struct slice_mask slice_mask_for_free(struct mm_struct *mm) 125static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret)
132{ 126{
133 struct slice_mask ret = { 0, 0 };
134 unsigned long i; 127 unsigned long i;
135 128
129 ret->low_slices = 0;
130 bitmap_zero(ret->high_slices, SLICE_NUM_HIGH);
131
136 for (i = 0; i < SLICE_NUM_LOW; i++) 132 for (i = 0; i < SLICE_NUM_LOW; i++)
137 if (!slice_low_has_vma(mm, i)) 133 if (!slice_low_has_vma(mm, i))
138 ret.low_slices |= 1u << i; 134 ret->low_slices |= 1u << i;
139 135
140 if (mm->task_size <= SLICE_LOW_TOP) 136 if (mm->task_size <= SLICE_LOW_TOP)
141 return ret; 137 return;
142 138
143 for (i = 0; i < SLICE_NUM_HIGH; i++) 139 for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++)
144 if (!slice_high_has_vma(mm, i)) 140 if (!slice_high_has_vma(mm, i))
145 ret.high_slices |= 1ul << i; 141 __set_bit(i, ret->high_slices);
146
147 return ret;
148} 142}
149 143
150static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize) 144static void slice_mask_for_size(struct mm_struct *mm, int psize, struct slice_mask *ret)
151{ 145{
152 unsigned char *hpsizes; 146 unsigned char *hpsizes;
153 int index, mask_index; 147 int index, mask_index;
154 struct slice_mask ret = { 0, 0 };
155 unsigned long i; 148 unsigned long i;
156 u64 lpsizes; 149 u64 lpsizes;
157 150
151 ret->low_slices = 0;
152 bitmap_zero(ret->high_slices, SLICE_NUM_HIGH);
153
158 lpsizes = mm->context.low_slices_psize; 154 lpsizes = mm->context.low_slices_psize;
159 for (i = 0; i < SLICE_NUM_LOW; i++) 155 for (i = 0; i < SLICE_NUM_LOW; i++)
160 if (((lpsizes >> (i * 4)) & 0xf) == psize) 156 if (((lpsizes >> (i * 4)) & 0xf) == psize)
161 ret.low_slices |= 1u << i; 157 ret->low_slices |= 1u << i;
162 158
163 hpsizes = mm->context.high_slices_psize; 159 hpsizes = mm->context.high_slices_psize;
164 for (i = 0; i < SLICE_NUM_HIGH; i++) { 160 for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++) {
165 mask_index = i & 0x1; 161 mask_index = i & 0x1;
166 index = i >> 1; 162 index = i >> 1;
167 if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == psize) 163 if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == psize)
168 ret.high_slices |= 1ul << i; 164 __set_bit(i, ret->high_slices);
169 } 165 }
170
171 return ret;
172} 166}
173 167
174static int slice_check_fit(struct slice_mask mask, struct slice_mask available) 168static int slice_check_fit(struct mm_struct *mm,
169 struct slice_mask mask, struct slice_mask available)
175{ 170{
171 DECLARE_BITMAP(result, SLICE_NUM_HIGH);
172 unsigned long slice_count = GET_HIGH_SLICE_INDEX(mm->context.addr_limit);
173
174 bitmap_and(result, mask.high_slices,
175 available.high_slices, slice_count);
176
176 return (mask.low_slices & available.low_slices) == mask.low_slices && 177 return (mask.low_slices & available.low_slices) == mask.low_slices &&
177 (mask.high_slices & available.high_slices) == mask.high_slices; 178 bitmap_equal(result, mask.high_slices, slice_count);
178} 179}
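
With the high slices in a bitmap, slice_check_fit compares them with bitmap_and/bitmap_equal, and only up to the mm's current addr_limit; the underlying test is unchanged: every slice requested in mask must also be set in available. The subset test on plain words (a sketch with made-up values):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool fits(uint64_t mask, uint64_t available)
{
	return (mask & available) == mask;	/* mask is a subset of available */
}

int main(void)
{
	printf("%d\n", fits(0x0c, 0x0f));	/* 1: slices 2-3 are available   */
	printf("%d\n", fits(0x18, 0x0f));	/* 0: slice 4 is not available   */
	return 0;
}
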
179 180
180static void slice_flush_segments(void *parm) 181static void slice_flush_segments(void *parm)
@@ -185,7 +186,7 @@ static void slice_flush_segments(void *parm)
185 if (mm != current->active_mm) 186 if (mm != current->active_mm)
186 return; 187 return;
187 188
188 copy_mm_to_paca(&current->active_mm->context); 189 copy_mm_to_paca(current->active_mm);
189 190
190 local_irq_save(flags); 191 local_irq_save(flags);
191 slb_flush_and_rebolt(); 192 slb_flush_and_rebolt();
@@ -218,18 +219,18 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz
218 mm->context.low_slices_psize = lpsizes; 219 mm->context.low_slices_psize = lpsizes;
219 220
220 hpsizes = mm->context.high_slices_psize; 221 hpsizes = mm->context.high_slices_psize;
221 for (i = 0; i < SLICE_NUM_HIGH; i++) { 222 for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++) {
222 mask_index = i & 0x1; 223 mask_index = i & 0x1;
223 index = i >> 1; 224 index = i >> 1;
224 if (mask.high_slices & (1ul << i)) 225 if (test_bit(i, mask.high_slices))
225 hpsizes[index] = (hpsizes[index] & 226 hpsizes[index] = (hpsizes[index] &
226 ~(0xf << (mask_index * 4))) | 227 ~(0xf << (mask_index * 4))) |
227 (((unsigned long)psize) << (mask_index * 4)); 228 (((unsigned long)psize) << (mask_index * 4));
228 } 229 }
229 230
230 slice_dbg(" lsps=%lx, hsps=%lx\n", 231 slice_dbg(" lsps=%lx, hsps=%lx\n",
231 mm->context.low_slices_psize, 232 (unsigned long)mm->context.low_slices_psize,
232 mm->context.high_slices_psize); 233 (unsigned long)mm->context.high_slices_psize);
233 234
234 spin_unlock_irqrestore(&slice_convert_lock, flags); 235 spin_unlock_irqrestore(&slice_convert_lock, flags);
235 236
@@ -257,14 +258,14 @@ static bool slice_scan_available(unsigned long addr,
257 slice = GET_HIGH_SLICE_INDEX(addr); 258 slice = GET_HIGH_SLICE_INDEX(addr);
258 *boundary_addr = (slice + end) ? 259 *boundary_addr = (slice + end) ?
259 ((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP; 260 ((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP;
260 return !!(available.high_slices & (1ul << slice)); 261 return !!test_bit(slice, available.high_slices);
261 } 262 }
262} 263}
263 264
264static unsigned long slice_find_area_bottomup(struct mm_struct *mm, 265static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
265 unsigned long len, 266 unsigned long len,
266 struct slice_mask available, 267 struct slice_mask available,
267 int psize) 268 int psize, unsigned long high_limit)
268{ 269{
269 int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); 270 int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
270 unsigned long addr, found, next_end; 271 unsigned long addr, found, next_end;
@@ -276,7 +277,10 @@ static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
276 info.align_offset = 0; 277 info.align_offset = 0;
277 278
278 addr = TASK_UNMAPPED_BASE; 279 addr = TASK_UNMAPPED_BASE;
279 while (addr < TASK_SIZE) { 280 /*
 281 * Check up to the allowed maximum value for this mmap request
282 */
283 while (addr < high_limit) {
280 info.low_limit = addr; 284 info.low_limit = addr;
281 if (!slice_scan_available(addr, available, 1, &addr)) 285 if (!slice_scan_available(addr, available, 1, &addr))
282 continue; 286 continue;
@@ -288,8 +292,8 @@ static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
288 * Check if we need to reduce the range, or if we can 292 * Check if we need to reduce the range, or if we can
289 * extend it to cover the next available slice. 293 * extend it to cover the next available slice.
290 */ 294 */
291 if (addr >= TASK_SIZE) 295 if (addr >= high_limit)
292 addr = TASK_SIZE; 296 addr = high_limit;
293 else if (slice_scan_available(addr, available, 1, &next_end)) { 297 else if (slice_scan_available(addr, available, 1, &next_end)) {
294 addr = next_end; 298 addr = next_end;
295 goto next_slice; 299 goto next_slice;
@@ -307,7 +311,7 @@ static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
307static unsigned long slice_find_area_topdown(struct mm_struct *mm, 311static unsigned long slice_find_area_topdown(struct mm_struct *mm,
308 unsigned long len, 312 unsigned long len,
309 struct slice_mask available, 313 struct slice_mask available,
310 int psize) 314 int psize, unsigned long high_limit)
311{ 315{
312 int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); 316 int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
313 unsigned long addr, found, prev; 317 unsigned long addr, found, prev;
@@ -319,6 +323,15 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm,
319 info.align_offset = 0; 323 info.align_offset = 0;
320 324
321 addr = mm->mmap_base; 325 addr = mm->mmap_base;
326 /*
 327 * If we are trying to allocate above DEFAULT_MAP_WINDOW,
 328 * add the difference to mmap_base. Apply this only for
 329 * requests whose high_limit is above
 330 * DEFAULT_MAP_WINDOW.
331 */
332 if (high_limit > DEFAULT_MAP_WINDOW)
333 addr += mm->context.addr_limit - DEFAULT_MAP_WINDOW;
334
322 while (addr > PAGE_SIZE) { 335 while (addr > PAGE_SIZE) {
323 info.high_limit = addr; 336 info.high_limit = addr;
324 if (!slice_scan_available(addr - 1, available, 0, &addr)) 337 if (!slice_scan_available(addr - 1, available, 0, &addr))
@@ -350,29 +363,38 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm,
350 * can happen with large stack limits and large mmap() 363 * can happen with large stack limits and large mmap()
351 * allocations. 364 * allocations.
352 */ 365 */
353 return slice_find_area_bottomup(mm, len, available, psize); 366 return slice_find_area_bottomup(mm, len, available, psize, high_limit);
354} 367}
355 368
356 369
357static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len, 370static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len,
358 struct slice_mask mask, int psize, 371 struct slice_mask mask, int psize,
359 int topdown) 372 int topdown, unsigned long high_limit)
360{ 373{
361 if (topdown) 374 if (topdown)
362 return slice_find_area_topdown(mm, len, mask, psize); 375 return slice_find_area_topdown(mm, len, mask, psize, high_limit);
363 else 376 else
364 return slice_find_area_bottomup(mm, len, mask, psize); 377 return slice_find_area_bottomup(mm, len, mask, psize, high_limit);
365} 378}
366 379
367#define or_mask(dst, src) do { \ 380static inline void slice_or_mask(struct slice_mask *dst, struct slice_mask *src)
368 (dst).low_slices |= (src).low_slices; \ 381{
369 (dst).high_slices |= (src).high_slices; \ 382 DECLARE_BITMAP(result, SLICE_NUM_HIGH);
370} while (0) 383
384 dst->low_slices |= src->low_slices;
385 bitmap_or(result, dst->high_slices, src->high_slices, SLICE_NUM_HIGH);
386 bitmap_copy(dst->high_slices, result, SLICE_NUM_HIGH);
387}
371 388
372#define andnot_mask(dst, src) do { \ 389static inline void slice_andnot_mask(struct slice_mask *dst, struct slice_mask *src)
373 (dst).low_slices &= ~(src).low_slices; \ 390{
374 (dst).high_slices &= ~(src).high_slices; \ 391 DECLARE_BITMAP(result, SLICE_NUM_HIGH);
375} while (0) 392
393 dst->low_slices &= ~src->low_slices;
394
395 bitmap_andnot(result, dst->high_slices, src->high_slices, SLICE_NUM_HIGH);
396 bitmap_copy(dst->high_slices, result, SLICE_NUM_HIGH);
397}
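
The old or_mask/andnot_mask macros become helpers that combine the low word directly and use bitmap_or/bitmap_andnot for the high slices, staging the result in a temporary bitmap before copying it back into dst. The same split-mask combine spelled out on plain types (a sketch, not the kernel helpers; the word count is arbitrary):

#include <stdint.h>
#include <stdio.h>

#define HIGH_WORDS 8	/* enough words for 512 one-TB slices, assumed */

struct mask {
	uint64_t low;
	uint64_t high[HIGH_WORDS];
};

static void mask_or(struct mask *dst, const struct mask *src)
{
	int i;

	dst->low |= src->low;
	for (i = 0; i < HIGH_WORDS; i++)
		dst->high[i] |= src->high[i];
}

static void mask_andnot(struct mask *dst, const struct mask *src)
{
	int i;

	dst->low &= ~src->low;
	for (i = 0; i < HIGH_WORDS; i++)
		dst->high[i] &= ~src->high[i];
}

int main(void)
{
	struct mask a = { 0x0f, { 0x3 } };
	struct mask b = { 0x05, { 0x1 } };

	mask_or(&a, &b);	/* a unchanged here: b is a subset of a */
	mask_andnot(&a, &b);	/* clear b's slices out of a            */
	printf("low=%#llx high0=%#llx\n",
	       (unsigned long long)a.low, (unsigned long long)a.high[0]);
	return 0;
}
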
376 398
377#ifdef CONFIG_PPC_64K_PAGES 399#ifdef CONFIG_PPC_64K_PAGES
378#define MMU_PAGE_BASE MMU_PAGE_64K 400#define MMU_PAGE_BASE MMU_PAGE_64K
@@ -384,14 +406,43 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
384 unsigned long flags, unsigned int psize, 406 unsigned long flags, unsigned int psize,
385 int topdown) 407 int topdown)
386{ 408{
387 struct slice_mask mask = {0, 0}; 409 struct slice_mask mask;
388 struct slice_mask good_mask; 410 struct slice_mask good_mask;
389 struct slice_mask potential_mask = {0,0} /* silence stupid warning */; 411 struct slice_mask potential_mask;
390 struct slice_mask compat_mask = {0, 0}; 412 struct slice_mask compat_mask;
391 int fixed = (flags & MAP_FIXED); 413 int fixed = (flags & MAP_FIXED);
392 int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); 414 int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
393 struct mm_struct *mm = current->mm; 415 struct mm_struct *mm = current->mm;
394 unsigned long newaddr; 416 unsigned long newaddr;
417 unsigned long high_limit;
418
419 /*
 420 * Check if we need to expand the slice area.
421 */
422 if (unlikely(addr > mm->context.addr_limit &&
423 mm->context.addr_limit != TASK_SIZE)) {
424 mm->context.addr_limit = TASK_SIZE;
425 on_each_cpu(slice_flush_segments, mm, 1);
426 }
427 /*
 428 * This mmap request can allocate up to 512TB.
429 */
430 if (addr > DEFAULT_MAP_WINDOW)
431 high_limit = mm->context.addr_limit;
432 else
433 high_limit = DEFAULT_MAP_WINDOW;
434 /*
 435 * Initialize the different slice masks.
436 */
437 mask.low_slices = 0;
438 bitmap_zero(mask.high_slices, SLICE_NUM_HIGH);
439
 440 /* silence a spurious compiler warning */
441 potential_mask.low_slices = 0;
442 bitmap_zero(potential_mask.high_slices, SLICE_NUM_HIGH);
443
444 compat_mask.low_slices = 0;
445 bitmap_zero(compat_mask.high_slices, SLICE_NUM_HIGH);
395 446
396 /* Sanity checks */ 447 /* Sanity checks */
397 BUG_ON(mm->task_size == 0); 448 BUG_ON(mm->task_size == 0);
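
slice_get_unmapped_area now threads a high_limit into the area search: a hint at or below DEFAULT_MAP_WINDOW keeps the search inside the default window, while a hint above it opens the search up to the mm's addr_limit, growing addr_limit to TASK_SIZE first if the hint lies beyond it. Just the limit-selection policy, as a sketch (the 128TB/512TB constants are assumptions for illustration):

#include <stdint.h>
#include <stdio.h>

#define DEFAULT_MAP_WINDOW (128ULL << 40)	/* 128TB, assumed */

static uint64_t pick_high_limit(uint64_t hint_addr, uint64_t addr_limit)
{
	/* only a hint above the default window unlocks the full range */
	return hint_addr > DEFAULT_MAP_WINDOW ? addr_limit : DEFAULT_MAP_WINDOW;
}

int main(void)
{
	uint64_t addr_limit = 512ULL << 40;	/* 512TB, assumed */

	printf("%#llx\n", (unsigned long long)pick_high_limit(0, addr_limit));
	printf("%#llx\n", (unsigned long long)pick_high_limit(200ULL << 40, addr_limit));
	return 0;
}
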
@@ -423,7 +474,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
423 /* First make up a "good" mask of slices that have the right size 474 /* First make up a "good" mask of slices that have the right size
424 * already 475 * already
425 */ 476 */
426 good_mask = slice_mask_for_size(mm, psize); 477 slice_mask_for_size(mm, psize, &good_mask);
427 slice_print_mask(" good_mask", good_mask); 478 slice_print_mask(" good_mask", good_mask);
428 479
429 /* 480 /*
@@ -448,22 +499,22 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
448#ifdef CONFIG_PPC_64K_PAGES 499#ifdef CONFIG_PPC_64K_PAGES
449 /* If we support combo pages, we can allow 64k pages in 4k slices */ 500 /* If we support combo pages, we can allow 64k pages in 4k slices */
450 if (psize == MMU_PAGE_64K) { 501 if (psize == MMU_PAGE_64K) {
451 compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K); 502 slice_mask_for_size(mm, MMU_PAGE_4K, &compat_mask);
452 if (fixed) 503 if (fixed)
453 or_mask(good_mask, compat_mask); 504 slice_or_mask(&good_mask, &compat_mask);
454 } 505 }
455#endif 506#endif
456 507
457 /* First check hint if it's valid or if we have MAP_FIXED */ 508 /* First check hint if it's valid or if we have MAP_FIXED */
458 if (addr != 0 || fixed) { 509 if (addr != 0 || fixed) {
459 /* Build a mask for the requested range */ 510 /* Build a mask for the requested range */
460 mask = slice_range_to_mask(addr, len); 511 slice_range_to_mask(addr, len, &mask);
461 slice_print_mask(" mask", mask); 512 slice_print_mask(" mask", mask);
462 513
463 /* Check if we fit in the good mask. If we do, we just return, 514 /* Check if we fit in the good mask. If we do, we just return,
464 * nothing else to do 515 * nothing else to do
465 */ 516 */
466 if (slice_check_fit(mask, good_mask)) { 517 if (slice_check_fit(mm, mask, good_mask)) {
467 slice_dbg(" fits good !\n"); 518 slice_dbg(" fits good !\n");
468 return addr; 519 return addr;
469 } 520 }
@@ -471,7 +522,8 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
471 /* Now let's see if we can find something in the existing 522 /* Now let's see if we can find something in the existing
472 * slices for that size 523 * slices for that size
473 */ 524 */
474 newaddr = slice_find_area(mm, len, good_mask, psize, topdown); 525 newaddr = slice_find_area(mm, len, good_mask,
526 psize, topdown, high_limit);
475 if (newaddr != -ENOMEM) { 527 if (newaddr != -ENOMEM) {
476 /* Found within the good mask, we don't have to setup, 528 /* Found within the good mask, we don't have to setup,
477 * we thus return directly 529 * we thus return directly
@@ -484,11 +536,11 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
484 /* We don't fit in the good mask, check what other slices are 536 /* We don't fit in the good mask, check what other slices are
485 * empty and thus can be converted 537 * empty and thus can be converted
486 */ 538 */
487 potential_mask = slice_mask_for_free(mm); 539 slice_mask_for_free(mm, &potential_mask);
488 or_mask(potential_mask, good_mask); 540 slice_or_mask(&potential_mask, &good_mask);
489 slice_print_mask(" potential", potential_mask); 541 slice_print_mask(" potential", potential_mask);
490 542
491 if ((addr != 0 || fixed) && slice_check_fit(mask, potential_mask)) { 543 if ((addr != 0 || fixed) && slice_check_fit(mm, mask, potential_mask)) {
492 slice_dbg(" fits potential !\n"); 544 slice_dbg(" fits potential !\n");
493 goto convert; 545 goto convert;
494 } 546 }
@@ -503,7 +555,8 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
503 * anywhere in the good area. 555 * anywhere in the good area.
504 */ 556 */
505 if (addr) { 557 if (addr) {
506 addr = slice_find_area(mm, len, good_mask, psize, topdown); 558 addr = slice_find_area(mm, len, good_mask,
559 psize, topdown, high_limit);
507 if (addr != -ENOMEM) { 560 if (addr != -ENOMEM) {
508 slice_dbg(" found area at 0x%lx\n", addr); 561 slice_dbg(" found area at 0x%lx\n", addr);
509 return addr; 562 return addr;
@@ -513,28 +566,29 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
513 /* Now let's see if we can find something in the existing slices 566 /* Now let's see if we can find something in the existing slices
514 * for that size plus free slices 567 * for that size plus free slices
515 */ 568 */
516 addr = slice_find_area(mm, len, potential_mask, psize, topdown); 569 addr = slice_find_area(mm, len, potential_mask,
570 psize, topdown, high_limit);
517 571
518#ifdef CONFIG_PPC_64K_PAGES 572#ifdef CONFIG_PPC_64K_PAGES
519 if (addr == -ENOMEM && psize == MMU_PAGE_64K) { 573 if (addr == -ENOMEM && psize == MMU_PAGE_64K) {
520 /* retry the search with 4k-page slices included */ 574 /* retry the search with 4k-page slices included */
521 or_mask(potential_mask, compat_mask); 575 slice_or_mask(&potential_mask, &compat_mask);
522 addr = slice_find_area(mm, len, potential_mask, psize, 576 addr = slice_find_area(mm, len, potential_mask,
523 topdown); 577 psize, topdown, high_limit);
524 } 578 }
525#endif 579#endif
526 580
527 if (addr == -ENOMEM) 581 if (addr == -ENOMEM)
528 return -ENOMEM; 582 return -ENOMEM;
529 583
530 mask = slice_range_to_mask(addr, len); 584 slice_range_to_mask(addr, len, &mask);
531 slice_dbg(" found potential area at 0x%lx\n", addr); 585 slice_dbg(" found potential area at 0x%lx\n", addr);
532 slice_print_mask(" mask", mask); 586 slice_print_mask(" mask", mask);
533 587
534 convert: 588 convert:
535 andnot_mask(mask, good_mask); 589 slice_andnot_mask(&mask, &good_mask);
536 andnot_mask(mask, compat_mask); 590 slice_andnot_mask(&mask, &compat_mask);
537 if (mask.low_slices || mask.high_slices) { 591 if (mask.low_slices || !bitmap_empty(mask.high_slices, SLICE_NUM_HIGH)) {
538 slice_convert(mm, mask, psize); 592 slice_convert(mm, mask, psize);
539 if (psize > MMU_PAGE_BASE) 593 if (psize > MMU_PAGE_BASE)
540 on_each_cpu(slice_flush_segments, mm, 1); 594 on_each_cpu(slice_flush_segments, mm, 1);
@@ -649,8 +703,8 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
649 703
650 704
651 slice_dbg(" lsps=%lx, hsps=%lx\n", 705 slice_dbg(" lsps=%lx, hsps=%lx\n",
652 mm->context.low_slices_psize, 706 (unsigned long)mm->context.low_slices_psize,
653 mm->context.high_slices_psize); 707 (unsigned long)mm->context.high_slices_psize);
654 708
655 bail: 709 bail:
656 spin_unlock_irqrestore(&slice_convert_lock, flags); 710 spin_unlock_irqrestore(&slice_convert_lock, flags);
@@ -659,9 +713,11 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
659void slice_set_range_psize(struct mm_struct *mm, unsigned long start, 713void slice_set_range_psize(struct mm_struct *mm, unsigned long start,
660 unsigned long len, unsigned int psize) 714 unsigned long len, unsigned int psize)
661{ 715{
662 struct slice_mask mask = slice_range_to_mask(start, len); 716 struct slice_mask mask;
663 717
664 VM_BUG_ON(radix_enabled()); 718 VM_BUG_ON(radix_enabled());
719
720 slice_range_to_mask(start, len, &mask);
665 slice_convert(mm, mask, psize); 721 slice_convert(mm, mask, psize);
666} 722}
667 723
@@ -694,14 +750,14 @@ int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
694 if (radix_enabled()) 750 if (radix_enabled())
695 return 0; 751 return 0;
696 752
697 mask = slice_range_to_mask(addr, len); 753 slice_range_to_mask(addr, len, &mask);
698 available = slice_mask_for_size(mm, psize); 754 slice_mask_for_size(mm, psize, &available);
699#ifdef CONFIG_PPC_64K_PAGES 755#ifdef CONFIG_PPC_64K_PAGES
700 /* We need to account for 4k slices too */ 756 /* We need to account for 4k slices too */
701 if (psize == MMU_PAGE_64K) { 757 if (psize == MMU_PAGE_64K) {
702 struct slice_mask compat_mask; 758 struct slice_mask compat_mask;
703 compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K); 759 slice_mask_for_size(mm, MMU_PAGE_4K, &compat_mask);
704 or_mask(available, compat_mask); 760 slice_or_mask(&available, &compat_mask);
705 } 761 }
706#endif 762#endif
707 763
@@ -711,6 +767,6 @@ int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
711 slice_print_mask(" mask", mask); 767 slice_print_mask(" mask", mask);
712 slice_print_mask(" available", available); 768 slice_print_mask(" available", available);
713#endif 769#endif
714 return !slice_check_fit(mask, available); 770 return !slice_check_fit(mm, mask, available);
715} 771}
716#endif 772#endif
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index 94210940112f..e94fbd4c8845 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -197,7 +197,8 @@ long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
197 197
198 /* Check parameters */ 198 /* Check parameters */
199 if ((addr & ~PAGE_MASK) || (len & ~PAGE_MASK) || 199 if ((addr & ~PAGE_MASK) || (len & ~PAGE_MASK) ||
200 addr >= TASK_SIZE || len >= TASK_SIZE || addr + len > TASK_SIZE) 200 addr >= mm->task_size || len >= mm->task_size ||
201 addr + len > mm->task_size)
201 return -EINVAL; 202 return -EINVAL;
202 203
203 if (is_hugepage_only_range(mm, addr, len)) 204 if (is_hugepage_only_range(mm, addr, len))
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 952713d6cf04..02e71402fdd3 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -17,7 +17,6 @@
17#include <asm/tlb.h> 17#include <asm/tlb.h>
18#include <asm/tlbflush.h> 18#include <asm/tlbflush.h>
19 19
20static DEFINE_RAW_SPINLOCK(native_tlbie_lock);
21 20
22#define RIC_FLUSH_TLB 0 21#define RIC_FLUSH_TLB 0
23#define RIC_FLUSH_PWC 1 22#define RIC_FLUSH_PWC 1
@@ -34,10 +33,8 @@ static inline void __tlbiel_pid(unsigned long pid, int set,
34 prs = 1; /* process scoped */ 33 prs = 1; /* process scoped */
 35 r = 1; /* radix format */ 34 r = 1; /* radix format */
36 35
37 asm volatile("ptesync": : :"memory");
38 asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) 36 asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
39 : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); 37 : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
40 asm volatile("ptesync": : :"memory");
41} 38}
42 39
43/* 40/*
@@ -47,9 +44,33 @@ static inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
47{ 44{
48 int set; 45 int set;
49 46
50 for (set = 0; set < POWER9_TLB_SETS_RADIX ; set++) { 47 asm volatile("ptesync": : :"memory");
48
49 /*
50 * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
51 * also flush the entire Page Walk Cache.
52 */
53 __tlbiel_pid(pid, 0, ric);
54
55 if (ric == RIC_FLUSH_ALL)
56 /* For the remaining sets, just flush the TLB */
57 ric = RIC_FLUSH_TLB;
58
59 for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
51 __tlbiel_pid(pid, set, ric); 60 __tlbiel_pid(pid, set, ric);
52 } 61
62 asm volatile("ptesync": : :"memory");
63 asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
64}
65
66static inline void tlbiel_pwc(unsigned long pid)
67{
68 asm volatile("ptesync": : :"memory");
69
70 /* For PWC flush, we don't look at set number */
71 __tlbiel_pid(pid, 0, RIC_FLUSH_PWC);
72
73 asm volatile("ptesync": : :"memory");
53 asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); 74 asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
54} 75}
55 76
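
_tlbiel_pid no longer issues the full flush for every TLB set on RIC_FLUSH_ALL: the page walk cache is not tracked per set, so set 0 takes the full flush and the remaining sets only need RIC_FLUSH_TLB, with the ptesync barriers hoisted out of __tlbiel_pid to bracket the whole sequence. The loop structure with the tlbiel itself stubbed out (a sketch, not the kernel code; the set count is an assumption):

#include <stdio.h>

#define RIC_FLUSH_TLB 0
#define RIC_FLUSH_PWC 1
#define RIC_FLUSH_ALL 2
#define TLB_SETS      128	/* POWER9_TLB_SETS_RADIX, assumed */

static void tlbiel_set(unsigned long pid, int set, int ric)
{
	printf("tlbiel pid=%lu set=%d ric=%d\n", pid, set, ric);
}

static void tlbiel_pid(unsigned long pid, int ric)
{
	int set;

	/* set 0 carries the PWC flush (if any); later sets only flush the TLB */
	tlbiel_set(pid, 0, ric);
	if (ric == RIC_FLUSH_ALL)
		ric = RIC_FLUSH_TLB;
	for (set = 1; set < TLB_SETS; set++)
		tlbiel_set(pid, set, ric);
}

int main(void)
{
	tlbiel_pid(1, RIC_FLUSH_ALL);
	return 0;
}
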
@@ -129,12 +150,18 @@ void radix__local_flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
129{ 150{
130 unsigned long pid; 151 unsigned long pid;
131 struct mm_struct *mm = tlb->mm; 152 struct mm_struct *mm = tlb->mm;
153 /*
154 * If we are doing a full mm flush, we will do a tlb flush
155 * with RIC_FLUSH_ALL later.
156 */
157 if (tlb->fullmm)
158 return;
132 159
133 preempt_disable(); 160 preempt_disable();
134 161
135 pid = mm->context.id; 162 pid = mm->context.id;
136 if (pid != MMU_NO_CONTEXT) 163 if (pid != MMU_NO_CONTEXT)
137 _tlbiel_pid(pid, RIC_FLUSH_PWC); 164 tlbiel_pwc(pid);
138 165
139 preempt_enable(); 166 preempt_enable();
140} 167}
@@ -175,15 +202,9 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
175 if (unlikely(pid == MMU_NO_CONTEXT)) 202 if (unlikely(pid == MMU_NO_CONTEXT))
176 goto no_context; 203 goto no_context;
177 204
178 if (!mm_is_thread_local(mm)) { 205 if (!mm_is_thread_local(mm))
179 int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
180
181 if (lock_tlbie)
182 raw_spin_lock(&native_tlbie_lock);
183 _tlbie_pid(pid, RIC_FLUSH_ALL); 206 _tlbie_pid(pid, RIC_FLUSH_ALL);
184 if (lock_tlbie) 207 else
185 raw_spin_unlock(&native_tlbie_lock);
186 } else
187 _tlbiel_pid(pid, RIC_FLUSH_ALL); 208 _tlbiel_pid(pid, RIC_FLUSH_ALL);
188no_context: 209no_context:
189 preempt_enable(); 210 preempt_enable();
@@ -195,22 +216,22 @@ void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
195 unsigned long pid; 216 unsigned long pid;
196 struct mm_struct *mm = tlb->mm; 217 struct mm_struct *mm = tlb->mm;
197 218
219 /*
220 * If we are doing a full mm flush, we will do a tlb flush
221 * with RIC_FLUSH_ALL later.
222 */
223 if (tlb->fullmm)
224 return;
198 preempt_disable(); 225 preempt_disable();
199 226
200 pid = mm->context.id; 227 pid = mm->context.id;
201 if (unlikely(pid == MMU_NO_CONTEXT)) 228 if (unlikely(pid == MMU_NO_CONTEXT))
202 goto no_context; 229 goto no_context;
203 230
204 if (!mm_is_thread_local(mm)) { 231 if (!mm_is_thread_local(mm))
205 int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
206
207 if (lock_tlbie)
208 raw_spin_lock(&native_tlbie_lock);
209 _tlbie_pid(pid, RIC_FLUSH_PWC); 232 _tlbie_pid(pid, RIC_FLUSH_PWC);
210 if (lock_tlbie) 233 else
211 raw_spin_unlock(&native_tlbie_lock); 234 tlbiel_pwc(pid);
212 } else
213 _tlbiel_pid(pid, RIC_FLUSH_PWC);
214no_context: 235no_context:
215 preempt_enable(); 236 preempt_enable();
216} 237}
@@ -226,15 +247,9 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
226 pid = mm ? mm->context.id : 0; 247 pid = mm ? mm->context.id : 0;
227 if (unlikely(pid == MMU_NO_CONTEXT)) 248 if (unlikely(pid == MMU_NO_CONTEXT))
228 goto bail; 249 goto bail;
229 if (!mm_is_thread_local(mm)) { 250 if (!mm_is_thread_local(mm))
230 int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
231
232 if (lock_tlbie)
233 raw_spin_lock(&native_tlbie_lock);
234 _tlbie_va(vmaddr, pid, ap, RIC_FLUSH_TLB); 251 _tlbie_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
235 if (lock_tlbie) 252 else
236 raw_spin_unlock(&native_tlbie_lock);
237 } else
238 _tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB); 253 _tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
239bail: 254bail:
240 preempt_enable(); 255 preempt_enable();
@@ -255,13 +270,7 @@ EXPORT_SYMBOL(radix__flush_tlb_page);
255 270
256void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end) 271void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
257{ 272{
258 int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
259
260 if (lock_tlbie)
261 raw_spin_lock(&native_tlbie_lock);
262 _tlbie_pid(0, RIC_FLUSH_ALL); 273 _tlbie_pid(0, RIC_FLUSH_ALL);
263 if (lock_tlbie)
264 raw_spin_unlock(&native_tlbie_lock);
265} 274}
266EXPORT_SYMBOL(radix__flush_tlb_kernel_range); 275EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
267 276
@@ -323,7 +332,6 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
323 unsigned long addr; 332 unsigned long addr;
324 int local = mm_is_thread_local(mm); 333 int local = mm_is_thread_local(mm);
325 unsigned long ap = mmu_get_ap(psize); 334 unsigned long ap = mmu_get_ap(psize);
326 int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
327 unsigned long page_size = 1UL << mmu_psize_defs[psize].shift; 335 unsigned long page_size = 1UL << mmu_psize_defs[psize].shift;
328 336
329 337
@@ -344,13 +352,8 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
344 352
345 if (local) 353 if (local)
346 _tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB); 354 _tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
347 else { 355 else
348 if (lock_tlbie)
349 raw_spin_lock(&native_tlbie_lock);
350 _tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); 356 _tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
351 if (lock_tlbie)
352 raw_spin_unlock(&native_tlbie_lock);
353 }
354 } 357 }
355err_out: 358err_out:
356 preempt_enable(); 359 preempt_enable();
@@ -437,7 +440,7 @@ void radix__flush_tlb_pte_p9_dd1(unsigned long old_pte, struct mm_struct *mm,
437 return; 440 return;
438 } 441 }
439 442
440 if (old_pte & _PAGE_LARGE) 443 if (old_pte & R_PAGE_LARGE)
441 radix__flush_tlb_page_psize(mm, address, MMU_PAGE_2M); 444 radix__flush_tlb_page_psize(mm, address, MMU_PAGE_2M);
442 else 445 else
443 radix__flush_tlb_page_psize(mm, address, mmu_virtual_psize); 446 radix__flush_tlb_page_psize(mm, address, mmu_virtual_psize);
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index ba28fcb98597..bfc4a0869609 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -770,7 +770,7 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base,
770 * avoid going over total available memory just in case... 770 * avoid going over total available memory just in case...
771 */ 771 */
772#ifdef CONFIG_PPC_FSL_BOOK3E 772#ifdef CONFIG_PPC_FSL_BOOK3E
773 if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { 773 if (early_mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
774 unsigned long linear_sz; 774 unsigned long linear_sz;
775 unsigned int num_cams; 775 unsigned int num_cams;
776 776
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 2ff13249f87a..6c2d4168daec 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2049,6 +2049,14 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
2049 data.br_stack = &cpuhw->bhrb_stack; 2049 data.br_stack = &cpuhw->bhrb_stack;
2050 } 2050 }
2051 2051
2052 if (event->attr.sample_type & PERF_SAMPLE_DATA_SRC &&
2053 ppmu->get_mem_data_src)
2054 ppmu->get_mem_data_src(&data.data_src, ppmu->flags, regs);
2055
2056 if (event->attr.sample_type & PERF_SAMPLE_WEIGHT &&
2057 ppmu->get_mem_weight)
2058 ppmu->get_mem_weight(&data.weight);
2059
2052 if (perf_event_overflow(event, &data, regs)) 2060 if (perf_event_overflow(event, &data, regs))
2053 power_pmu_stop(event, 0); 2061 power_pmu_stop(event, 0);
2054 } 2062 }
diff --git a/arch/powerpc/perf/isa207-common.c b/arch/powerpc/perf/isa207-common.c
index cd951fd231c4..8125160be7bc 100644
--- a/arch/powerpc/perf/isa207-common.c
+++ b/arch/powerpc/perf/isa207-common.c
@@ -148,6 +148,88 @@ static bool is_thresh_cmp_valid(u64 event)
148 return true; 148 return true;
149} 149}
150 150
151static inline u64 isa207_find_source(u64 idx, u32 sub_idx)
152{
153 u64 ret = PERF_MEM_NA;
154
155 switch(idx) {
156 case 0:
157 /* Nothing to do */
158 break;
159 case 1:
160 ret = PH(LVL, L1);
161 break;
162 case 2:
163 ret = PH(LVL, L2);
164 break;
165 case 3:
166 ret = PH(LVL, L3);
167 break;
168 case 4:
169 if (sub_idx <= 1)
170 ret = PH(LVL, LOC_RAM);
171 else if (sub_idx > 1 && sub_idx <= 2)
172 ret = PH(LVL, REM_RAM1);
173 else
174 ret = PH(LVL, REM_RAM2);
175 ret |= P(SNOOP, HIT);
176 break;
177 case 5:
178 ret = PH(LVL, REM_CCE1);
179 if ((sub_idx == 0) || (sub_idx == 2) || (sub_idx == 4))
180 ret |= P(SNOOP, HIT);
181 else if ((sub_idx == 1) || (sub_idx == 3) || (sub_idx == 5))
182 ret |= P(SNOOP, HITM);
183 break;
184 case 6:
185 ret = PH(LVL, REM_CCE2);
186 if ((sub_idx == 0) || (sub_idx == 2))
187 ret |= P(SNOOP, HIT);
188 else if ((sub_idx == 1) || (sub_idx == 3))
189 ret |= P(SNOOP, HITM);
190 break;
191 case 7:
192 ret = PM(LVL, L1);
193 break;
194 }
195
196 return ret;
197}
198
199void isa207_get_mem_data_src(union perf_mem_data_src *dsrc, u32 flags,
200 struct pt_regs *regs)
201{
202 u64 idx;
203 u32 sub_idx;
204 u64 sier;
205 u64 val;
206
207 /* Skip if no SIER support */
208 if (!(flags & PPMU_HAS_SIER)) {
209 dsrc->val = 0;
210 return;
211 }
212
213 sier = mfspr(SPRN_SIER);
214 val = (sier & ISA207_SIER_TYPE_MASK) >> ISA207_SIER_TYPE_SHIFT;
215 if (val == 1 || val == 2) {
216 idx = (sier & ISA207_SIER_LDST_MASK) >> ISA207_SIER_LDST_SHIFT;
217 sub_idx = (sier & ISA207_SIER_DATA_SRC_MASK) >> ISA207_SIER_DATA_SRC_SHIFT;
218
219 dsrc->val = isa207_find_source(idx, sub_idx);
220 dsrc->val |= (val == 1) ? P(OP, LOAD) : P(OP, STORE);
221 }
222}
223
224void isa207_get_mem_weight(u64 *weight)
225{
226 u64 mmcra = mfspr(SPRN_MMCRA);
227 u64 exp = MMCRA_THR_CTR_EXP(mmcra);
228 u64 mantissa = MMCRA_THR_CTR_MANT(mmcra);
229
230 *weight = mantissa << (2 * exp);
231}
232
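
isa207_get_mem_weight derives the perf sample weight from the MMCRA thresholding counter as mantissa << (2 * exponent), using the THR_CTR shifts and masks added to isa207-common.h further down. A worked example on a made-up MMCRA value (constants copied from that hunk):

#include <stdint.h>
#include <stdio.h>

#define THR_CTR_MANT_SHIFT 19
#define THR_CTR_MANT_MASK  0x7Ful
#define THR_CTR_EXP_SHIFT  27
#define THR_CTR_EXP_MASK   0x7ul

int main(void)
{
	uint64_t mmcra = (2ull << THR_CTR_EXP_SHIFT) | (0x21ull << THR_CTR_MANT_SHIFT);
	uint64_t exp = (mmcra >> THR_CTR_EXP_SHIFT) & THR_CTR_EXP_MASK;
	uint64_t mantissa = (mmcra >> THR_CTR_MANT_SHIFT) & THR_CTR_MANT_MASK;

	/* weight = mantissa << (2 * exp): 0x21 << 4 = 528 */
	printf("weight = %llu\n", (unsigned long long)(mantissa << (2 * exp)));
	return 0;
}
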
151int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp) 233int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp)
152{ 234{
153 unsigned int unit, pmc, cache, ebb; 235 unsigned int unit, pmc, cache, ebb;
diff --git a/arch/powerpc/perf/isa207-common.h b/arch/powerpc/perf/isa207-common.h
index 899210f14ee4..8acbe6e802c7 100644
--- a/arch/powerpc/perf/isa207-common.h
+++ b/arch/powerpc/perf/isa207-common.h
@@ -248,6 +248,15 @@
248#define MMCRA_SDAR_MODE_TLB (1ull << MMCRA_SDAR_MODE_SHIFT) 248#define MMCRA_SDAR_MODE_TLB (1ull << MMCRA_SDAR_MODE_SHIFT)
249#define MMCRA_SDAR_MODE_NO_UPDATES ~(0x3ull << MMCRA_SDAR_MODE_SHIFT) 249#define MMCRA_SDAR_MODE_NO_UPDATES ~(0x3ull << MMCRA_SDAR_MODE_SHIFT)
250#define MMCRA_IFM_SHIFT 30 250#define MMCRA_IFM_SHIFT 30
251#define MMCRA_THR_CTR_MANT_SHIFT 19
252#define MMCRA_THR_CTR_MANT_MASK 0x7Ful
253#define MMCRA_THR_CTR_MANT(v) (((v) >> MMCRA_THR_CTR_MANT_SHIFT) &\
254 MMCRA_THR_CTR_MANT_MASK)
255
256#define MMCRA_THR_CTR_EXP_SHIFT 27
257#define MMCRA_THR_CTR_EXP_MASK 0x7ul
258#define MMCRA_THR_CTR_EXP(v) (((v) >> MMCRA_THR_CTR_EXP_SHIFT) &\
259 MMCRA_THR_CTR_EXP_MASK)
251 260
252/* MMCR1 Threshold Compare bit constant for power9 */ 261/* MMCR1 Threshold Compare bit constant for power9 */
253#define p9_MMCRA_THR_CMP_SHIFT 45 262#define p9_MMCRA_THR_CMP_SHIFT 45
@@ -260,6 +269,19 @@
260#define MAX_ALT 2 269#define MAX_ALT 2
261#define MAX_PMU_COUNTERS 6 270#define MAX_PMU_COUNTERS 6
262 271
272#define ISA207_SIER_TYPE_SHIFT 15
273#define ISA207_SIER_TYPE_MASK (0x7ull << ISA207_SIER_TYPE_SHIFT)
274
275#define ISA207_SIER_LDST_SHIFT 1
276#define ISA207_SIER_LDST_MASK (0x7ull << ISA207_SIER_LDST_SHIFT)
277
278#define ISA207_SIER_DATA_SRC_SHIFT 53
279#define ISA207_SIER_DATA_SRC_MASK (0x7ull << ISA207_SIER_DATA_SRC_SHIFT)
280
281#define P(a, b) PERF_MEM_S(a, b)
282#define PH(a, b) (P(LVL, HIT) | P(a, b))
283#define PM(a, b) (P(LVL, MISS) | P(a, b))
284
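
These SIER field definitions are what isa207_get_mem_data_src uses to pull the sample type, the load/store index and the data-source sub-index out of SIER before isa207_find_source turns them into a generic perf_mem_data_src encoding via the P/PH/PM helpers. Field extraction on a hypothetical SIER value (shifts and masks mirror the definitions above):

#include <stdint.h>
#include <stdio.h>

#define SIER_TYPE_SHIFT     15
#define SIER_TYPE_MASK      (0x7ull << SIER_TYPE_SHIFT)
#define SIER_LDST_SHIFT     1
#define SIER_LDST_MASK      (0x7ull << SIER_LDST_SHIFT)
#define SIER_DATA_SRC_SHIFT 53
#define SIER_DATA_SRC_MASK  (0x7ull << SIER_DATA_SRC_SHIFT)

int main(void)
{
	/* hypothetical SIER: type=1 (load), idx=2 (an L2 hit), sub_idx=0 */
	uint64_t sier = (1ull << SIER_TYPE_SHIFT) | (2ull << SIER_LDST_SHIFT);

	uint64_t type    = (sier & SIER_TYPE_MASK) >> SIER_TYPE_SHIFT;
	uint64_t idx     = (sier & SIER_LDST_MASK) >> SIER_LDST_SHIFT;
	uint64_t sub_idx = (sier & SIER_DATA_SRC_MASK) >> SIER_DATA_SRC_SHIFT;

	printf("type=%llu idx=%llu sub_idx=%llu\n",
	       (unsigned long long)type, (unsigned long long)idx,
	       (unsigned long long)sub_idx);
	return 0;
}
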
263int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp); 285int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp);
264int isa207_compute_mmcr(u64 event[], int n_ev, 286int isa207_compute_mmcr(u64 event[], int n_ev,
265 unsigned int hwc[], unsigned long mmcr[], 287 unsigned int hwc[], unsigned long mmcr[],
@@ -267,6 +289,8 @@ int isa207_compute_mmcr(u64 event[], int n_ev,
267void isa207_disable_pmc(unsigned int pmc, unsigned long mmcr[]); 289void isa207_disable_pmc(unsigned int pmc, unsigned long mmcr[]);
268int isa207_get_alternatives(u64 event, u64 alt[], 290int isa207_get_alternatives(u64 event, u64 alt[],
269 const unsigned int ev_alt[][MAX_ALT], int size); 291 const unsigned int ev_alt[][MAX_ALT], int size);
270 292void isa207_get_mem_data_src(union perf_mem_data_src *dsrc, u32 flags,
293 struct pt_regs *regs);
294void isa207_get_mem_weight(u64 *weight);
271 295
272#endif 296#endif
diff --git a/arch/powerpc/perf/power8-events-list.h b/arch/powerpc/perf/power8-events-list.h
index 3a2e6e8ebb92..0f1d184627cc 100644
--- a/arch/powerpc/perf/power8-events-list.h
+++ b/arch/powerpc/perf/power8-events-list.h
@@ -89,3 +89,9 @@ EVENT(PM_MRK_FILT_MATCH, 0x2013c)
89EVENT(PM_MRK_FILT_MATCH_ALT, 0x3012e) 89EVENT(PM_MRK_FILT_MATCH_ALT, 0x3012e)
90/* Alternate event code for PM_LD_MISS_L1 */ 90/* Alternate event code for PM_LD_MISS_L1 */
91EVENT(PM_LD_MISS_L1_ALT, 0x400f0) 91EVENT(PM_LD_MISS_L1_ALT, 0x400f0)
92/*
93 * Memory Access Event -- mem_access
94 * Primary PMU event used here is PM_MRK_INST_CMPL, along with
95 * Random Load/Store Facility Sampling (RIS) in Random sampling mode (MMCRA[SM]).
96 */
97EVENT(MEM_ACCESS, 0x10401e0)
diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c
index ce15b19a7962..5463516e369b 100644
--- a/arch/powerpc/perf/power8-pmu.c
+++ b/arch/powerpc/perf/power8-pmu.c
@@ -90,6 +90,7 @@ GENERIC_EVENT_ATTR(branch-instructions, PM_BRU_FIN);
90GENERIC_EVENT_ATTR(branch-misses, PM_BR_MPRED_CMPL); 90GENERIC_EVENT_ATTR(branch-misses, PM_BR_MPRED_CMPL);
91GENERIC_EVENT_ATTR(cache-references, PM_LD_REF_L1); 91GENERIC_EVENT_ATTR(cache-references, PM_LD_REF_L1);
92GENERIC_EVENT_ATTR(cache-misses, PM_LD_MISS_L1); 92GENERIC_EVENT_ATTR(cache-misses, PM_LD_MISS_L1);
93GENERIC_EVENT_ATTR(mem_access, MEM_ACCESS);
93 94
94CACHE_EVENT_ATTR(L1-dcache-load-misses, PM_LD_MISS_L1); 95CACHE_EVENT_ATTR(L1-dcache-load-misses, PM_LD_MISS_L1);
95CACHE_EVENT_ATTR(L1-dcache-loads, PM_LD_REF_L1); 96CACHE_EVENT_ATTR(L1-dcache-loads, PM_LD_REF_L1);
@@ -120,6 +121,7 @@ static struct attribute *power8_events_attr[] = {
120 GENERIC_EVENT_PTR(PM_BR_MPRED_CMPL), 121 GENERIC_EVENT_PTR(PM_BR_MPRED_CMPL),
121 GENERIC_EVENT_PTR(PM_LD_REF_L1), 122 GENERIC_EVENT_PTR(PM_LD_REF_L1),
122 GENERIC_EVENT_PTR(PM_LD_MISS_L1), 123 GENERIC_EVENT_PTR(PM_LD_MISS_L1),
124 GENERIC_EVENT_PTR(MEM_ACCESS),
123 125
124 CACHE_EVENT_PTR(PM_LD_MISS_L1), 126 CACHE_EVENT_PTR(PM_LD_MISS_L1),
125 CACHE_EVENT_PTR(PM_LD_REF_L1), 127 CACHE_EVENT_PTR(PM_LD_REF_L1),
@@ -325,6 +327,8 @@ static struct power_pmu power8_pmu = {
325 .bhrb_filter_map = power8_bhrb_filter_map, 327 .bhrb_filter_map = power8_bhrb_filter_map,
326 .get_constraint = isa207_get_constraint, 328 .get_constraint = isa207_get_constraint,
327 .get_alternatives = power8_get_alternatives, 329 .get_alternatives = power8_get_alternatives,
330 .get_mem_data_src = isa207_get_mem_data_src,
331 .get_mem_weight = isa207_get_mem_weight,
328 .disable_pmc = isa207_disable_pmc, 332 .disable_pmc = isa207_disable_pmc,
329 .flags = PPMU_HAS_SIER | PPMU_ARCH_207S, 333 .flags = PPMU_HAS_SIER | PPMU_ARCH_207S,
330 .n_generic = ARRAY_SIZE(power8_generic_events), 334 .n_generic = ARRAY_SIZE(power8_generic_events),
diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c
index 7f6582708e06..018f8e90ac35 100644
--- a/arch/powerpc/perf/power9-pmu.c
+++ b/arch/powerpc/perf/power9-pmu.c
@@ -427,6 +427,8 @@ static struct power_pmu power9_pmu = {
427 .bhrb_filter_map = power9_bhrb_filter_map, 427 .bhrb_filter_map = power9_bhrb_filter_map,
428 .get_constraint = isa207_get_constraint, 428 .get_constraint = isa207_get_constraint,
429 .get_alternatives = power9_get_alternatives, 429 .get_alternatives = power9_get_alternatives,
430 .get_mem_data_src = isa207_get_mem_data_src,
431 .get_mem_weight = isa207_get_mem_weight,
430 .disable_pmc = isa207_disable_pmc, 432 .disable_pmc = isa207_disable_pmc,
431 .flags = PPMU_HAS_SIER | PPMU_ARCH_207S, 433 .flags = PPMU_HAS_SIER | PPMU_ARCH_207S,
432 .n_generic = ARRAY_SIZE(power9_generic_events), 434 .n_generic = ARRAY_SIZE(power9_generic_events),
diff --git a/arch/powerpc/platforms/44x/sam440ep.c b/arch/powerpc/platforms/44x/sam440ep.c
index 688ffeab0699..55fed5e4de14 100644
--- a/arch/powerpc/platforms/44x/sam440ep.c
+++ b/arch/powerpc/platforms/44x/sam440ep.c
@@ -70,7 +70,7 @@ static struct i2c_board_info sam440ep_rtc_info = {
70 .irq = -1, 70 .irq = -1,
71}; 71};
72 72
73static int sam440ep_setup_rtc(void) 73static int __init sam440ep_setup_rtc(void)
74{ 74{
75 return i2c_register_board_info(0, &sam440ep_rtc_info, 1); 75 return i2c_register_board_info(0, &sam440ep_rtc_info, 1);
76} 76}
diff --git a/arch/powerpc/platforms/52xx/Kconfig b/arch/powerpc/platforms/52xx/Kconfig
index b625a2c6f4f2..e4c745981912 100644
--- a/arch/powerpc/platforms/52xx/Kconfig
+++ b/arch/powerpc/platforms/52xx/Kconfig
@@ -33,7 +33,6 @@ config PPC_EFIKA
33 bool "bPlan Efika 5k2. MPC5200B based computer" 33 bool "bPlan Efika 5k2. MPC5200B based computer"
34 depends on PPC_MPC52xx 34 depends on PPC_MPC52xx
35 select PPC_RTAS 35 select PPC_RTAS
36 select RTAS_PROC
37 select PPC_NATIVE 36 select PPC_NATIVE
38 37
39config PPC_LITE5200 38config PPC_LITE5200
diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c
index 078097a0b09d..f51fd35f4618 100644
--- a/arch/powerpc/platforms/85xx/smp.c
+++ b/arch/powerpc/platforms/85xx/smp.c
@@ -344,6 +344,7 @@ done:
344} 344}
345 345
346struct smp_ops_t smp_85xx_ops = { 346struct smp_ops_t smp_85xx_ops = {
347 .cause_nmi_ipi = NULL,
347 .kick_cpu = smp_85xx_kick_cpu, 348 .kick_cpu = smp_85xx_kick_cpu,
348 .cpu_bootable = smp_generic_cpu_bootable, 349 .cpu_bootable = smp_generic_cpu_bootable,
349#ifdef CONFIG_HOTPLUG_CPU 350#ifdef CONFIG_HOTPLUG_CPU
@@ -461,16 +462,9 @@ static void mpc85xx_smp_machine_kexec(struct kimage *image)
461} 462}
462#endif /* CONFIG_KEXEC_CORE */ 463#endif /* CONFIG_KEXEC_CORE */
463 464
464static void smp_85xx_basic_setup(int cpu_nr)
465{
466 if (cpu_has_feature(CPU_FTR_DBELL))
467 doorbell_setup_this_cpu();
468}
469
470static void smp_85xx_setup_cpu(int cpu_nr) 465static void smp_85xx_setup_cpu(int cpu_nr)
471{ 466{
472 mpic_setup_this_cpu(); 467 mpic_setup_this_cpu();
473 smp_85xx_basic_setup(cpu_nr);
474} 468}
475 469
476void __init mpc85xx_smp_init(void) 470void __init mpc85xx_smp_init(void)
@@ -484,7 +478,7 @@ void __init mpc85xx_smp_init(void)
484 smp_85xx_ops.setup_cpu = smp_85xx_setup_cpu; 478 smp_85xx_ops.setup_cpu = smp_85xx_setup_cpu;
485 smp_85xx_ops.message_pass = smp_mpic_message_pass; 479 smp_85xx_ops.message_pass = smp_mpic_message_pass;
486 } else 480 } else
487 smp_85xx_ops.setup_cpu = smp_85xx_basic_setup; 481 smp_85xx_ops.setup_cpu = NULL;
488 482
489 if (cpu_has_feature(CPU_FTR_DBELL)) { 483 if (cpu_has_feature(CPU_FTR_DBELL)) {
490 /* 484 /*
@@ -492,7 +486,7 @@ void __init mpc85xx_smp_init(void)
492 * smp_muxed_ipi_message_pass 486 * smp_muxed_ipi_message_pass
493 */ 487 */
494 smp_85xx_ops.message_pass = NULL; 488 smp_85xx_ops.message_pass = NULL;
495 smp_85xx_ops.cause_ipi = doorbell_cause_ipi; 489 smp_85xx_ops.cause_ipi = doorbell_global_ipi;
496 smp_85xx_ops.probe = NULL; 490 smp_85xx_ops.probe = NULL;
497 } 491 }
498 492
diff --git a/arch/powerpc/platforms/86xx/mpc86xx_smp.c b/arch/powerpc/platforms/86xx/mpc86xx_smp.c
index af09baee22cb..020e84a47a32 100644
--- a/arch/powerpc/platforms/86xx/mpc86xx_smp.c
+++ b/arch/powerpc/platforms/86xx/mpc86xx_smp.c
@@ -105,6 +105,7 @@ smp_86xx_setup_cpu(int cpu_nr)
105 105
106 106
107struct smp_ops_t smp_86xx_ops = { 107struct smp_ops_t smp_86xx_ops = {
108 .cause_nmi_ipi = NULL,
108 .message_pass = smp_mpic_message_pass, 109 .message_pass = smp_mpic_message_pass,
109 .probe = smp_mpic_probe, 110 .probe = smp_mpic_probe,
110 .kick_cpu = smp_86xx_kick_cpu, 111 .kick_cpu = smp_86xx_kick_cpu,
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 99b0ae8acb78..684e886eaae4 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -279,7 +279,8 @@ config PPC_ICSWX
279 279
280 This option enables kernel support for the PowerPC Initiate 280 This option enables kernel support for the PowerPC Initiate
281 Coprocessor Store Word (icswx) coprocessor instruction on POWER7 281 Coprocessor Store Word (icswx) coprocessor instruction on POWER7
282 or newer processors. 282 and POWER8 processors. POWER9 uses new copy/paste instructions
283 to invoke the coprocessor.
283 284
284 This option is only useful if you have a processor that supports 285 This option is only useful if you have a processor that supports
285 the icswx coprocessor instruction. It does not have any effect 286 the icswx coprocessor instruction. It does not have any effect
@@ -359,7 +360,7 @@ config PPC_BOOK3E_MMU
359 360
360config PPC_MM_SLICES 361config PPC_MM_SLICES
361 bool 362 bool
362 default y if (!PPC_FSL_BOOK3E && PPC64 && HUGETLB_PAGE) || (PPC_STD_MMU_64 && PPC_64K_PAGES) 363 default y if PPC_STD_MMU_64
363 default n 364 default n
364 365
365config PPC_HAVE_PMU_SUPPORT 366config PPC_HAVE_PMU_SUPPORT
@@ -371,9 +372,16 @@ config PPC_PERF_CTRS
371 help 372 help
372 This enables the powerpc-specific perf_event back-end. 373 This enables the powerpc-specific perf_event back-end.
373 374
375config FORCE_SMP
376 # Allow platforms to force SMP=y by selecting this
377 bool
378 default n
379 select SMP
380
374config SMP 381config SMP
375 depends on PPC_BOOK3S || PPC_BOOK3E || FSL_BOOKE || PPC_47x 382 depends on PPC_BOOK3S || PPC_BOOK3E || FSL_BOOKE || PPC_47x
376 bool "Symmetric multi-processing support" 383 select GENERIC_IRQ_MIGRATION
384 bool "Symmetric multi-processing support" if !FORCE_SMP
377 ---help--- 385 ---help---
378 This enables support for systems with more than one CPU. If you have 386 This enables support for systems with more than one CPU. If you have
379 a system with only one CPU, say N. If you have a system with more 387 a system with only one CPU, say N. If you have a system with more
diff --git a/arch/powerpc/platforms/cell/axon_msi.c b/arch/powerpc/platforms/cell/axon_msi.c
index 8b55c5f19d4c..8d3ae2cc52bf 100644
--- a/arch/powerpc/platforms/cell/axon_msi.c
+++ b/arch/powerpc/platforms/cell/axon_msi.c
@@ -15,9 +15,9 @@
15#include <linux/msi.h> 15#include <linux/msi.h>
16#include <linux/export.h> 16#include <linux/export.h>
17#include <linux/of_platform.h> 17#include <linux/of_platform.h>
18#include <linux/debugfs.h>
19#include <linux/slab.h> 18#include <linux/slab.h>
20 19
20#include <asm/debugfs.h>
21#include <asm/dcr.h> 21#include <asm/dcr.h>
22#include <asm/machdep.h> 22#include <asm/machdep.h>
23#include <asm/prom.h> 23#include <asm/prom.h>
diff --git a/arch/powerpc/platforms/cell/interrupt.c b/arch/powerpc/platforms/cell/interrupt.c
index a6bbbaba14a3..871d38479a25 100644
--- a/arch/powerpc/platforms/cell/interrupt.c
+++ b/arch/powerpc/platforms/cell/interrupt.c
@@ -211,7 +211,7 @@ void iic_request_IPIs(void)
211 iic_request_ipi(PPC_MSG_CALL_FUNCTION); 211 iic_request_ipi(PPC_MSG_CALL_FUNCTION);
212 iic_request_ipi(PPC_MSG_RESCHEDULE); 212 iic_request_ipi(PPC_MSG_RESCHEDULE);
213 iic_request_ipi(PPC_MSG_TICK_BROADCAST); 213 iic_request_ipi(PPC_MSG_TICK_BROADCAST);
214 iic_request_ipi(PPC_MSG_DEBUGGER_BREAK); 214 iic_request_ipi(PPC_MSG_NMI_IPI);
215} 215}
216 216
217#endif /* CONFIG_SMP */ 217#endif /* CONFIG_SMP */
diff --git a/arch/powerpc/platforms/cell/pervasive.c b/arch/powerpc/platforms/cell/pervasive.c
index e7d075077cb0..a88944db9fc3 100644
--- a/arch/powerpc/platforms/cell/pervasive.c
+++ b/arch/powerpc/platforms/cell/pervasive.c
@@ -88,11 +88,14 @@ static void cbe_power_save(void)
88static int cbe_system_reset_exception(struct pt_regs *regs) 88static int cbe_system_reset_exception(struct pt_regs *regs)
89{ 89{
90 switch (regs->msr & SRR1_WAKEMASK) { 90 switch (regs->msr & SRR1_WAKEMASK) {
91 case SRR1_WAKEEE:
92 do_IRQ(regs);
93 break;
94 case SRR1_WAKEDEC: 91 case SRR1_WAKEDEC:
95 timer_interrupt(regs); 92 set_dec(1);
93 case SRR1_WAKEEE:
94 /*
95 * Handle these when interrupts get re-enabled and we take
96 * them as regular exceptions. We are in an NMI context
97 * and can't handle these here.
98 */
96 break; 99 break;
97 case SRR1_WAKEMT: 100 case SRR1_WAKEMT:
98 return cbe_sysreset_hack(); 101 return cbe_sysreset_hack();
diff --git a/arch/powerpc/platforms/chrp/smp.c b/arch/powerpc/platforms/chrp/smp.c
index b6c9a0dcc924..14515040f7cd 100644
--- a/arch/powerpc/platforms/chrp/smp.c
+++ b/arch/powerpc/platforms/chrp/smp.c
@@ -44,6 +44,7 @@ static void smp_chrp_setup_cpu(int cpu_nr)
44 44
45/* CHRP with openpic */ 45/* CHRP with openpic */
46struct smp_ops_t chrp_smp_ops = { 46struct smp_ops_t chrp_smp_ops = {
47 .cause_nmi_ipi = NULL,
47 .message_pass = smp_mpic_message_pass, 48 .message_pass = smp_mpic_message_pass,
48 .probe = smp_mpic_probe, 49 .probe = smp_mpic_probe,
49 .kick_cpu = smp_chrp_kick_cpu, 50 .kick_cpu = smp_chrp_kick_cpu,
diff --git a/arch/powerpc/platforms/pasemi/idle.c b/arch/powerpc/platforms/pasemi/idle.c
index 75b296bc51af..44e0d9226f0a 100644
--- a/arch/powerpc/platforms/pasemi/idle.c
+++ b/arch/powerpc/platforms/pasemi/idle.c
@@ -53,11 +53,14 @@ static int pasemi_system_reset_exception(struct pt_regs *regs)
53 regs->nip = regs->link; 53 regs->nip = regs->link;
54 54
55 switch (regs->msr & SRR1_WAKEMASK) { 55 switch (regs->msr & SRR1_WAKEMASK) {
56 case SRR1_WAKEEE:
57 do_IRQ(regs);
58 break;
59 case SRR1_WAKEDEC: 56 case SRR1_WAKEDEC:
60 timer_interrupt(regs); 57 set_dec(1);
58 case SRR1_WAKEEE:
59 /*
60 * Handle these when interrupts get re-enabled and we take
61 * them as regular exceptions. We are in an NMI context
62 * and can't handle these here.
63 */
61 break; 64 break;
62 default: 65 default:
63 /* do system reset */ 66 /* do system reset */
diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c
index 746ca7321b03..2cd99eb30762 100644
--- a/arch/powerpc/platforms/powermac/smp.c
+++ b/arch/powerpc/platforms/powermac/smp.c
@@ -172,7 +172,7 @@ static irqreturn_t psurge_ipi_intr(int irq, void *d)
172 return IRQ_HANDLED; 172 return IRQ_HANDLED;
173} 173}
174 174
175static void smp_psurge_cause_ipi(int cpu, unsigned long data) 175static void smp_psurge_cause_ipi(int cpu)
176{ 176{
177 psurge_set_ipi(cpu); 177 psurge_set_ipi(cpu);
178} 178}
@@ -447,6 +447,7 @@ void __init smp_psurge_give_timebase(void)
447struct smp_ops_t psurge_smp_ops = { 447struct smp_ops_t psurge_smp_ops = {
448 .message_pass = NULL, /* Use smp_muxed_ipi_message_pass */ 448 .message_pass = NULL, /* Use smp_muxed_ipi_message_pass */
449 .cause_ipi = smp_psurge_cause_ipi, 449 .cause_ipi = smp_psurge_cause_ipi,
450 .cause_nmi_ipi = NULL,
450 .probe = smp_psurge_probe, 451 .probe = smp_psurge_probe,
451 .kick_cpu = smp_psurge_kick_cpu, 452 .kick_cpu = smp_psurge_kick_cpu,
452 .setup_cpu = smp_psurge_setup_cpu, 453 .setup_cpu = smp_psurge_setup_cpu,
diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
index 3a07e4dcf97c..6a6f4ef46b9e 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -4,6 +4,7 @@ config PPC_POWERNV
4 select PPC_NATIVE 4 select PPC_NATIVE
5 select PPC_XICS 5 select PPC_XICS
6 select PPC_ICP_NATIVE 6 select PPC_ICP_NATIVE
7 select PPC_XIVE_NATIVE
7 select PPC_P7_NAP 8 select PPC_P7_NAP
8 select PCI 9 select PCI
9 select PCI_MSI 10 select PCI_MSI
@@ -19,6 +20,8 @@ config PPC_POWERNV
19 select CPU_FREQ_GOV_ONDEMAND 20 select CPU_FREQ_GOV_ONDEMAND
20 select CPU_FREQ_GOV_CONSERVATIVE 21 select CPU_FREQ_GOV_CONSERVATIVE
21 select PPC_DOORBELL 22 select PPC_DOORBELL
23 select MMU_NOTIFIER
24 select FORCE_SMP
22 default y 25 default y
23 26
24config OPAL_PRD 27config OPAL_PRD
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 6fb5522acd70..d2f19821d71d 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -1102,6 +1102,13 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option)
1102 return -EIO; 1102 return -EIO;
1103 } 1103 }
1104 1104
1105 /*
1106 * If dealing with the root bus (or the bus underneath the
1107 * root port), we reset the bus underneath the root port.
1108 *
1109 * The cxl driver depends on this behaviour for bi-modal card
1110 * switching.
1111 */
1105 if (pci_is_root_bus(bus) || 1112 if (pci_is_root_bus(bus) ||
1106 pci_is_root_bus(bus->parent)) 1113 pci_is_root_bus(bus->parent))
1107 return pnv_eeh_root_reset(hose, option); 1114 return pnv_eeh_root_reset(hose, option);
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 4ee837e6391a..445f30a2c5ef 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -53,19 +53,6 @@ static int pnv_save_sprs_for_deep_states(void)
53 uint64_t pir = get_hard_smp_processor_id(cpu); 53 uint64_t pir = get_hard_smp_processor_id(cpu);
54 uint64_t hsprg0_val = (uint64_t)&paca[cpu]; 54 uint64_t hsprg0_val = (uint64_t)&paca[cpu];
55 55
56 if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
57 /*
58 * HSPRG0 is used to store the cpu's pointer to paca.
59 * Hence last 3 bits are guaranteed to be 0. Program
60 * slw to restore HSPRG0 with 63rd bit set, so that
61 * when a thread wakes up at 0x100 we can use this bit
62 * to distinguish between fastsleep and deep winkle.
63 * This is not necessary with stop/psscr since PLS
64 * field of psscr indicates which state we are waking
65 * up from.
66 */
67 hsprg0_val |= 1;
68 }
69 rc = opal_slw_set_reg(pir, SPRN_HSPRG0, hsprg0_val); 56 rc = opal_slw_set_reg(pir, SPRN_HSPRG0, hsprg0_val);
70 if (rc != 0) 57 if (rc != 0)
71 return rc; 58 return rc;
@@ -122,9 +109,12 @@ static void pnv_alloc_idle_core_states(void)
122 for (i = 0; i < nr_cores; i++) { 109 for (i = 0; i < nr_cores; i++) {
123 int first_cpu = i * threads_per_core; 110 int first_cpu = i * threads_per_core;
124 int node = cpu_to_node(first_cpu); 111 int node = cpu_to_node(first_cpu);
112 size_t paca_ptr_array_size;
125 113
126 core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node); 114 core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node);
127 *core_idle_state = PNV_CORE_IDLE_THREAD_BITS; 115 *core_idle_state = PNV_CORE_IDLE_THREAD_BITS;
116 paca_ptr_array_size = (threads_per_core *
117 sizeof(struct paca_struct *));
128 118
129 for (j = 0; j < threads_per_core; j++) { 119 for (j = 0; j < threads_per_core; j++) {
130 int cpu = first_cpu + j; 120 int cpu = first_cpu + j;
@@ -132,6 +122,11 @@ static void pnv_alloc_idle_core_states(void)
132 paca[cpu].core_idle_state_ptr = core_idle_state; 122 paca[cpu].core_idle_state_ptr = core_idle_state;
133 paca[cpu].thread_idle_state = PNV_THREAD_RUNNING; 123 paca[cpu].thread_idle_state = PNV_THREAD_RUNNING;
134 paca[cpu].thread_mask = 1 << j; 124 paca[cpu].thread_mask = 1 << j;
125 if (!cpu_has_feature(CPU_FTR_POWER9_DD1))
126 continue;
127 paca[cpu].thread_sibling_pacas =
128 kmalloc_node(paca_ptr_array_size,
129 GFP_KERNEL, node);
135 } 130 }
136 } 131 }
137 132
@@ -147,7 +142,6 @@ u32 pnv_get_supported_cpuidle_states(void)
147} 142}
148EXPORT_SYMBOL_GPL(pnv_get_supported_cpuidle_states); 143EXPORT_SYMBOL_GPL(pnv_get_supported_cpuidle_states);
149 144
150
151static void pnv_fastsleep_workaround_apply(void *info) 145static void pnv_fastsleep_workaround_apply(void *info)
152 146
153{ 147{
@@ -241,8 +235,9 @@ static DEVICE_ATTR(fastsleep_workaround_applyonce, 0600,
241 * The default stop state that will be used by ppc_md.power_save 235 * The default stop state that will be used by ppc_md.power_save
242 * function on platforms that support stop instruction. 236 * function on platforms that support stop instruction.
243 */ 237 */
244u64 pnv_default_stop_val; 238static u64 pnv_default_stop_val;
245u64 pnv_default_stop_mask; 239static u64 pnv_default_stop_mask;
240static bool default_stop_found;
246 241
247/* 242/*
248 * Used for ppc_md.power_save which needs a function with no parameters 243 * Used for ppc_md.power_save which needs a function with no parameters
@@ -262,8 +257,42 @@ u64 pnv_first_deep_stop_state = MAX_STOP_STATE;
262 * psscr value and mask of the deepest stop idle state. 257 * psscr value and mask of the deepest stop idle state.
263 * Used when a cpu is offlined. 258 * Used when a cpu is offlined.
264 */ 259 */
265u64 pnv_deepest_stop_psscr_val; 260static u64 pnv_deepest_stop_psscr_val;
266u64 pnv_deepest_stop_psscr_mask; 261static u64 pnv_deepest_stop_psscr_mask;
262static bool deepest_stop_found;
263
264/*
265 * pnv_cpu_offline: A function that puts the CPU into the deepest
266 * available platform idle state on a CPU-Offline.
267 */
268unsigned long pnv_cpu_offline(unsigned int cpu)
269{
270 unsigned long srr1;
271
272 u32 idle_states = pnv_get_supported_cpuidle_states();
273
274 if (cpu_has_feature(CPU_FTR_ARCH_300) && deepest_stop_found) {
275 srr1 = power9_idle_stop(pnv_deepest_stop_psscr_val,
276 pnv_deepest_stop_psscr_mask);
277 } else if (idle_states & OPAL_PM_WINKLE_ENABLED) {
278 srr1 = power7_winkle();
279 } else if ((idle_states & OPAL_PM_SLEEP_ENABLED) ||
280 (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
281 srr1 = power7_sleep();
282 } else if (idle_states & OPAL_PM_NAP_ENABLED) {
283 srr1 = power7_nap(1);
284 } else {
285 /* This is the fallback method. We emulate snooze */
286 while (!generic_check_cpu_restart(cpu)) {
287 HMT_low();
288 HMT_very_low();
289 }
290 srr1 = 0;
291 HMT_medium();
292 }
293
294 return srr1;
295}
267 296
268/* 297/*
269 * Power ISA 3.0 idle initialization. 298 * Power ISA 3.0 idle initialization.
@@ -352,7 +381,6 @@ static int __init pnv_power9_idle_init(struct device_node *np, u32 *flags,
352 u32 *residency_ns = NULL; 381 u32 *residency_ns = NULL;
353 u64 max_residency_ns = 0; 382 u64 max_residency_ns = 0;
354 int rc = 0, i; 383 int rc = 0, i;
355 bool default_stop_found = false, deepest_stop_found = false;
356 384
357 psscr_val = kcalloc(dt_idle_states, sizeof(*psscr_val), GFP_KERNEL); 385 psscr_val = kcalloc(dt_idle_states, sizeof(*psscr_val), GFP_KERNEL);
358 psscr_mask = kcalloc(dt_idle_states, sizeof(*psscr_mask), GFP_KERNEL); 386 psscr_mask = kcalloc(dt_idle_states, sizeof(*psscr_mask), GFP_KERNEL);
@@ -432,21 +460,24 @@ static int __init pnv_power9_idle_init(struct device_node *np, u32 *flags,
432 } 460 }
433 } 461 }
434 462
435 if (!default_stop_found) { 463 if (unlikely(!default_stop_found)) {
436 pnv_default_stop_val = PSSCR_HV_DEFAULT_VAL; 464 pr_warn("cpuidle-powernv: No suitable default stop state found. Disabling platform idle.\n");
437 pnv_default_stop_mask = PSSCR_HV_DEFAULT_MASK; 465 } else {
438 pr_warn("Setting default stop psscr val=0x%016llx,mask=0x%016llx\n", 466 ppc_md.power_save = power9_idle;
467 pr_info("cpuidle-powernv: Default stop: psscr = 0x%016llx,mask=0x%016llx\n",
439 pnv_default_stop_val, pnv_default_stop_mask); 468 pnv_default_stop_val, pnv_default_stop_mask);
440 } 469 }
441 470
442 if (!deepest_stop_found) { 471 if (unlikely(!deepest_stop_found)) {
443 pnv_deepest_stop_psscr_val = PSSCR_HV_DEFAULT_VAL; 472 pr_warn("cpuidle-powernv: No suitable stop state for CPU-Hotplug. Offlined CPUs will busy wait");
444 pnv_deepest_stop_psscr_mask = PSSCR_HV_DEFAULT_MASK; 473 } else {
445 pr_warn("Setting default stop psscr val=0x%016llx,mask=0x%016llx\n", 474 pr_info("cpuidle-powernv: Deepest stop: psscr = 0x%016llx,mask=0x%016llx\n",
446 pnv_deepest_stop_psscr_val, 475 pnv_deepest_stop_psscr_val,
447 pnv_deepest_stop_psscr_mask); 476 pnv_deepest_stop_psscr_mask);
448 } 477 }
449 478
479 pr_info("cpuidle-powernv: Requested Level (RL) value of first deep stop = 0x%llx\n",
480 pnv_first_deep_stop_state);
450out: 481out:
451 kfree(psscr_val); 482 kfree(psscr_val);
452 kfree(psscr_mask); 483 kfree(psscr_mask);
@@ -524,10 +555,30 @@ static int __init pnv_init_idle_states(void)
524 555
525 pnv_alloc_idle_core_states(); 556 pnv_alloc_idle_core_states();
526 557
558 /*
 559 * For each CPU, record its PACA address in each of its
560 * sibling thread's PACA at the slot corresponding to this
561 * CPU's index in the core.
562 */
563 if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
564 int cpu;
565
566 pr_info("powernv: idle: Saving PACA pointers of all CPUs in their thread sibling PACA\n");
567 for_each_possible_cpu(cpu) {
568 int base_cpu = cpu_first_thread_sibling(cpu);
569 int idx = cpu_thread_in_core(cpu);
570 int i;
571
572 for (i = 0; i < threads_per_core; i++) {
573 int j = base_cpu + i;
574
575 paca[j].thread_sibling_pacas[idx] = &paca[cpu];
576 }
577 }
578 }
579
527 if (supported_cpuidle_states & OPAL_PM_NAP_ENABLED) 580 if (supported_cpuidle_states & OPAL_PM_NAP_ENABLED)
528 ppc_md.power_save = power7_idle; 581 ppc_md.power_save = power7_idle;
529 else if (supported_cpuidle_states & OPAL_PM_STOP_INST_FAST)
530 ppc_md.power_save = power9_idle;
531 582
532out: 583out:
533 return 0; 584 return 0;
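For reference, a standalone sketch (not part of the commit) of the DD1 thread_sibling_pacas cross-linking added above, assuming a flat topology in which cpu_first_thread_sibling() and cpu_thread_in_core() reduce to simple modular arithmetic:

/*
 * Standalone illustration of the cross-linking loop in
 * pnv_init_idle_states(): every thread in a core ends up with an
 * identical table, indexed by thread number within the core.
 */
#include <stdio.h>

#define NR_CPUS			8
#define THREADS_PER_CORE	4

struct fake_paca {
	struct fake_paca *thread_sibling_pacas[THREADS_PER_CORE];
} paca[NR_CPUS];

int main(void)
{
	int cpu, i;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		int base_cpu = cpu - (cpu % THREADS_PER_CORE);	/* cpu_first_thread_sibling() */
		int idx = cpu % THREADS_PER_CORE;		/* cpu_thread_in_core() */

		/* Record this CPU's PACA in every sibling's table at slot idx */
		for (i = 0; i < THREADS_PER_CORE; i++)
			paca[base_cpu + i].thread_sibling_pacas[idx] = &paca[cpu];
	}

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		for (i = 0; i < THREADS_PER_CORE; i++)
			printf("cpu %d slot %d -> paca of cpu %ld\n", cpu, i,
			       (long)(paca[cpu].thread_sibling_pacas[i] - paca));
	return 0;
}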
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index 1c383f38031d..067defeea691 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -9,11 +9,20 @@
9 * License as published by the Free Software Foundation. 9 * License as published by the Free Software Foundation.
10 */ 10 */
11 11
12#include <linux/slab.h>
13#include <linux/mmu_notifier.h>
14#include <linux/mmu_context.h>
15#include <linux/of.h>
12#include <linux/export.h> 16#include <linux/export.h>
13#include <linux/pci.h> 17#include <linux/pci.h>
14#include <linux/memblock.h> 18#include <linux/memblock.h>
15#include <linux/iommu.h> 19#include <linux/iommu.h>
16 20
21#include <asm/tlb.h>
22#include <asm/powernv.h>
23#include <asm/reg.h>
24#include <asm/opal.h>
25#include <asm/io.h>
17#include <asm/iommu.h> 26#include <asm/iommu.h>
18#include <asm/pnv-pci.h> 27#include <asm/pnv-pci.h>
19#include <asm/msi_bitmap.h> 28#include <asm/msi_bitmap.h>
@@ -22,6 +31,8 @@
22#include "powernv.h" 31#include "powernv.h"
23#include "pci.h" 32#include "pci.h"
24 33
34#define npu_to_phb(x) container_of(x, struct pnv_phb, npu)
35
25/* 36/*
26 * Other types of TCE cache invalidation are not functional in the 37 * Other types of TCE cache invalidation are not functional in the
27 * hardware. 38 * hardware.
@@ -37,6 +48,12 @@ struct pci_dev *pnv_pci_get_gpu_dev(struct pci_dev *npdev)
37 struct device_node *dn; 48 struct device_node *dn;
38 struct pci_dev *gpdev; 49 struct pci_dev *gpdev;
39 50
51 if (WARN_ON(!npdev))
52 return NULL;
53
54 if (WARN_ON(!npdev->dev.of_node))
55 return NULL;
56
 40 /* Get associated PCI device */ 57 /* Get associated PCI device */
41 dn = of_parse_phandle(npdev->dev.of_node, "ibm,gpu", 0); 58 dn = of_parse_phandle(npdev->dev.of_node, "ibm,gpu", 0);
42 if (!dn) 59 if (!dn)
@@ -55,6 +72,12 @@ struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index)
55 struct device_node *dn; 72 struct device_node *dn;
56 struct pci_dev *npdev; 73 struct pci_dev *npdev;
57 74
75 if (WARN_ON(!gpdev))
76 return NULL;
77
78 if (WARN_ON(!gpdev->dev.of_node))
79 return NULL;
80
 58 /* Get associated PCI device */ 81 /* Get associated PCI device */
59 dn = of_parse_phandle(gpdev->dev.of_node, "ibm,npu", index); 82 dn = of_parse_phandle(gpdev->dev.of_node, "ibm,npu", index);
60 if (!dn) 83 if (!dn)
@@ -180,7 +203,7 @@ long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
180 pe_err(npe, "Failed to configure TCE table, err %lld\n", rc); 203 pe_err(npe, "Failed to configure TCE table, err %lld\n", rc);
181 return rc; 204 return rc;
182 } 205 }
183 pnv_pci_phb3_tce_invalidate_entire(phb, false); 206 pnv_pci_ioda2_tce_invalidate_entire(phb, false);
184 207
185 /* Add the table to the list so its TCE cache will get invalidated */ 208 /* Add the table to the list so its TCE cache will get invalidated */
186 pnv_pci_link_table_and_group(phb->hose->node, num, 209 pnv_pci_link_table_and_group(phb->hose->node, num,
@@ -204,7 +227,7 @@ long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num)
204 pe_err(npe, "Unmapping failed, ret = %lld\n", rc); 227 pe_err(npe, "Unmapping failed, ret = %lld\n", rc);
205 return rc; 228 return rc;
206 } 229 }
207 pnv_pci_phb3_tce_invalidate_entire(phb, false); 230 pnv_pci_ioda2_tce_invalidate_entire(phb, false);
208 231
209 pnv_pci_unlink_table_and_group(npe->table_group.tables[num], 232 pnv_pci_unlink_table_and_group(npe->table_group.tables[num],
210 &npe->table_group); 233 &npe->table_group);
@@ -270,7 +293,7 @@ static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe)
270 0 /* bypass base */, top); 293 0 /* bypass base */, top);
271 294
272 if (rc == OPAL_SUCCESS) 295 if (rc == OPAL_SUCCESS)
273 pnv_pci_phb3_tce_invalidate_entire(phb, false); 296 pnv_pci_ioda2_tce_invalidate_entire(phb, false);
274 297
275 return rc; 298 return rc;
276} 299}
@@ -334,7 +357,7 @@ void pnv_npu_take_ownership(struct pnv_ioda_pe *npe)
334 pe_err(npe, "Failed to disable bypass, err %lld\n", rc); 357 pe_err(npe, "Failed to disable bypass, err %lld\n", rc);
335 return; 358 return;
336 } 359 }
337 pnv_pci_phb3_tce_invalidate_entire(npe->phb, false); 360 pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false);
338} 361}
339 362
340struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe) 363struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
@@ -359,3 +382,442 @@ struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
359 382
360 return gpe; 383 return gpe;
361} 384}
385
386/* Maximum number of nvlinks per npu */
387#define NV_MAX_LINKS 6
388
389/* Maximum index of npu2 hosts in the system. Always < NV_MAX_NPUS */
390static int max_npu2_index;
391
392struct npu_context {
393 struct mm_struct *mm;
394 struct pci_dev *npdev[NV_MAX_NPUS][NV_MAX_LINKS];
395 struct mmu_notifier mn;
396 struct kref kref;
397
398 /* Callback to stop translation requests on a given GPU */
399 struct npu_context *(*release_cb)(struct npu_context *, void *);
400
401 /*
402 * Private pointer passed to the above callback for usage by
403 * device drivers.
404 */
405 void *priv;
406};
407
408/*
409 * Find a free MMIO ATSD register and mark it in use. Return -ENOSPC
410 * if none are available.
411 */
412static int get_mmio_atsd_reg(struct npu *npu)
413{
414 int i;
415
416 for (i = 0; i < npu->mmio_atsd_count; i++) {
417 if (!test_and_set_bit(i, &npu->mmio_atsd_usage))
418 return i;
419 }
420
421 return -ENOSPC;
422}
423
424static void put_mmio_atsd_reg(struct npu *npu, int reg)
425{
426 clear_bit(reg, &npu->mmio_atsd_usage);
427}
428
429/* MMIO ATSD register offsets */
430#define XTS_ATSD_AVA 1
431#define XTS_ATSD_STAT 2
432
433static int mmio_launch_invalidate(struct npu *npu, unsigned long launch,
434 unsigned long va)
435{
436 int mmio_atsd_reg;
437
438 do {
439 mmio_atsd_reg = get_mmio_atsd_reg(npu);
440 cpu_relax();
441 } while (mmio_atsd_reg < 0);
442
443 __raw_writeq(cpu_to_be64(va),
444 npu->mmio_atsd_regs[mmio_atsd_reg] + XTS_ATSD_AVA);
445 eieio();
446 __raw_writeq(cpu_to_be64(launch), npu->mmio_atsd_regs[mmio_atsd_reg]);
447
448 return mmio_atsd_reg;
449}
450
451static int mmio_invalidate_pid(struct npu *npu, unsigned long pid)
452{
453 unsigned long launch;
454
455 /* IS set to invalidate matching PID */
456 launch = PPC_BIT(12);
457
458 /* PRS set to process-scoped */
459 launch |= PPC_BIT(13);
460
461 /* AP */
462 launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
463
464 /* PID */
465 launch |= pid << PPC_BITLSHIFT(38);
466
467 /* Invalidating the entire process doesn't use a va */
468 return mmio_launch_invalidate(npu, launch, 0);
469}
470
471static int mmio_invalidate_va(struct npu *npu, unsigned long va,
472 unsigned long pid)
473{
474 unsigned long launch;
475
476 /* IS set to invalidate target VA */
477 launch = 0;
478
479 /* PRS set to process scoped */
480 launch |= PPC_BIT(13);
481
482 /* AP */
483 launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
484
485 /* PID */
486 launch |= pid << PPC_BITLSHIFT(38);
487
488 return mmio_launch_invalidate(npu, launch, va);
489}
490
491#define mn_to_npu_context(x) container_of(x, struct npu_context, mn)
492
493/*
494 * Invalidate either a single address or an entire PID depending on
495 * the value of va.
496 */
497static void mmio_invalidate(struct npu_context *npu_context, int va,
498 unsigned long address)
499{
500 int i, j, reg;
501 struct npu *npu;
502 struct pnv_phb *nphb;
503 struct pci_dev *npdev;
504 struct {
505 struct npu *npu;
506 int reg;
507 } mmio_atsd_reg[NV_MAX_NPUS];
508 unsigned long pid = npu_context->mm->context.id;
509
510 /*
511 * Loop over all the NPUs this process is active on and launch
512 * an invalidate.
513 */
514 for (i = 0; i <= max_npu2_index; i++) {
515 mmio_atsd_reg[i].reg = -1;
516 for (j = 0; j < NV_MAX_LINKS; j++) {
517 npdev = npu_context->npdev[i][j];
518 if (!npdev)
519 continue;
520
521 nphb = pci_bus_to_host(npdev->bus)->private_data;
522 npu = &nphb->npu;
523 mmio_atsd_reg[i].npu = npu;
524
525 if (va)
526 mmio_atsd_reg[i].reg =
527 mmio_invalidate_va(npu, address, pid);
528 else
529 mmio_atsd_reg[i].reg =
530 mmio_invalidate_pid(npu, pid);
531
532 /*
533 * The NPU hardware forwards the shootdown to all GPUs
534 * so we only have to launch one shootdown per NPU.
535 */
536 break;
537 }
538 }
539
540 /*
541 * Unfortunately the nest mmu does not support flushing specific
542 * addresses so we have to flush the whole mm.
543 */
544 flush_tlb_mm(npu_context->mm);
545
546 /* Wait for all invalidations to complete */
547 for (i = 0; i <= max_npu2_index; i++) {
548 if (mmio_atsd_reg[i].reg < 0)
549 continue;
550
551 /* Wait for completion */
552 npu = mmio_atsd_reg[i].npu;
553 reg = mmio_atsd_reg[i].reg;
554 while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT))
555 cpu_relax();
556 put_mmio_atsd_reg(npu, reg);
557 }
558}
559
560static void pnv_npu2_mn_release(struct mmu_notifier *mn,
561 struct mm_struct *mm)
562{
563 struct npu_context *npu_context = mn_to_npu_context(mn);
564
565 /* Call into device driver to stop requests to the NMMU */
566 if (npu_context->release_cb)
567 npu_context->release_cb(npu_context, npu_context->priv);
568
569 /*
570 * There should be no more translation requests for this PID, but we
571 * need to ensure any entries for it are removed from the TLB.
572 */
573 mmio_invalidate(npu_context, 0, 0);
574}
575
576static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn,
577 struct mm_struct *mm,
578 unsigned long address,
579 pte_t pte)
580{
581 struct npu_context *npu_context = mn_to_npu_context(mn);
582
583 mmio_invalidate(npu_context, 1, address);
584}
585
586static void pnv_npu2_mn_invalidate_page(struct mmu_notifier *mn,
587 struct mm_struct *mm,
588 unsigned long address)
589{
590 struct npu_context *npu_context = mn_to_npu_context(mn);
591
592 mmio_invalidate(npu_context, 1, address);
593}
594
595static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
596 struct mm_struct *mm,
597 unsigned long start, unsigned long end)
598{
599 struct npu_context *npu_context = mn_to_npu_context(mn);
600 unsigned long address;
601
602 for (address = start; address <= end; address += PAGE_SIZE)
603 mmio_invalidate(npu_context, 1, address);
604}
605
606static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
607 .release = pnv_npu2_mn_release,
608 .change_pte = pnv_npu2_mn_change_pte,
609 .invalidate_page = pnv_npu2_mn_invalidate_page,
610 .invalidate_range = pnv_npu2_mn_invalidate_range,
611};
612
613/*
614 * Call into OPAL to setup the nmmu context for the current task in
 615 * the NPU. This must be called to set up the context tables before the
 616 * GPU issues ATRs. gpdev should be a pointer to the PCIe GPU device.
617 *
618 * A release callback should be registered to allow a device driver to
619 * be notified that it should not launch any new translation requests
620 * as the final TLB invalidate is about to occur.
621 *
 622 * Returns an error if no contexts are currently available, or an
 623 * npu_context which should be passed to pnv_npu2_handle_fault().
624 *
625 * mmap_sem must be held in write mode.
626 */
627struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
628 unsigned long flags,
629 struct npu_context *(*cb)(struct npu_context *, void *),
630 void *priv)
631{
632 int rc;
633 u32 nvlink_index;
634 struct device_node *nvlink_dn;
635 struct mm_struct *mm = current->mm;
636 struct pnv_phb *nphb;
637 struct npu *npu;
638 struct npu_context *npu_context;
639
640 /*
641 * At present we don't support GPUs connected to multiple NPUs and I'm
642 * not sure the hardware does either.
643 */
644 struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
645
646 if (!firmware_has_feature(FW_FEATURE_OPAL))
647 return ERR_PTR(-ENODEV);
648
649 if (!npdev)
650 /* No nvlink associated with this GPU device */
651 return ERR_PTR(-ENODEV);
652
653 if (!mm) {
654 /* kernel thread contexts are not supported */
655 return ERR_PTR(-EINVAL);
656 }
657
658 nphb = pci_bus_to_host(npdev->bus)->private_data;
659 npu = &nphb->npu;
660
661 /*
662 * Setup the NPU context table for a particular GPU. These need to be
663 * per-GPU as we need the tables to filter ATSDs when there are no
664 * active contexts on a particular GPU.
665 */
666 rc = opal_npu_init_context(nphb->opal_id, mm->context.id, flags,
667 PCI_DEVID(gpdev->bus->number, gpdev->devfn));
668 if (rc < 0)
669 return ERR_PTR(-ENOSPC);
670
671 /*
672 * We store the npu pci device so we can more easily get at the
673 * associated npus.
674 */
675 npu_context = mm->context.npu_context;
676 if (!npu_context) {
677 npu_context = kzalloc(sizeof(struct npu_context), GFP_KERNEL);
678 if (!npu_context)
679 return ERR_PTR(-ENOMEM);
680
681 mm->context.npu_context = npu_context;
682 npu_context->mm = mm;
683 npu_context->mn.ops = &nv_nmmu_notifier_ops;
684 __mmu_notifier_register(&npu_context->mn, mm);
685 kref_init(&npu_context->kref);
686 } else {
687 kref_get(&npu_context->kref);
688 }
689
690 npu_context->release_cb = cb;
691 npu_context->priv = priv;
692 nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
693 if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
694 &nvlink_index)))
695 return ERR_PTR(-ENODEV);
696 npu_context->npdev[npu->index][nvlink_index] = npdev;
697
698 return npu_context;
699}
700EXPORT_SYMBOL(pnv_npu2_init_context);
701
702static void pnv_npu2_release_context(struct kref *kref)
703{
704 struct npu_context *npu_context =
705 container_of(kref, struct npu_context, kref);
706
707 npu_context->mm->context.npu_context = NULL;
708 mmu_notifier_unregister(&npu_context->mn,
709 npu_context->mm);
710
711 kfree(npu_context);
712}
713
714void pnv_npu2_destroy_context(struct npu_context *npu_context,
715 struct pci_dev *gpdev)
716{
717 struct pnv_phb *nphb, *phb;
718 struct npu *npu;
719 struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
720 struct device_node *nvlink_dn;
721 u32 nvlink_index;
722
723 if (WARN_ON(!npdev))
724 return;
725
726 if (!firmware_has_feature(FW_FEATURE_OPAL))
727 return;
728
729 nphb = pci_bus_to_host(npdev->bus)->private_data;
730 npu = &nphb->npu;
731 phb = pci_bus_to_host(gpdev->bus)->private_data;
732 nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
733 if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
734 &nvlink_index)))
735 return;
736 npu_context->npdev[npu->index][nvlink_index] = NULL;
737 opal_npu_destroy_context(phb->opal_id, npu_context->mm->context.id,
738 PCI_DEVID(gpdev->bus->number, gpdev->devfn));
739 kref_put(&npu_context->kref, pnv_npu2_release_context);
740}
741EXPORT_SYMBOL(pnv_npu2_destroy_context);
742
743/*
744 * Assumes mmap_sem is held for the contexts associated mm.
745 */
746int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea,
747 unsigned long *flags, unsigned long *status, int count)
748{
749 u64 rc = 0, result = 0;
750 int i, is_write;
751 struct page *page[1];
752
 753 /* mmap_sem should be held so the mm_struct must be present */
754 struct mm_struct *mm = context->mm;
755
756 if (!firmware_has_feature(FW_FEATURE_OPAL))
757 return -ENODEV;
758
759 WARN_ON(!rwsem_is_locked(&mm->mmap_sem));
760
761 for (i = 0; i < count; i++) {
762 is_write = flags[i] & NPU2_WRITE;
763 rc = get_user_pages_remote(NULL, mm, ea[i], 1,
764 is_write ? FOLL_WRITE : 0,
765 page, NULL, NULL);
766
767 /*
768 * To support virtualised environments we will have to do an
769 * access to the page to ensure it gets faulted into the
770 * hypervisor. For the moment virtualisation is not supported in
771 * other areas so leave the access out.
772 */
773 if (rc != 1) {
774 status[i] = rc;
775 result = -EFAULT;
776 continue;
777 }
778
779 status[i] = 0;
780 put_page(page[0]);
781 }
782
783 return result;
784}
785EXPORT_SYMBOL(pnv_npu2_handle_fault);
786
787int pnv_npu2_init(struct pnv_phb *phb)
788{
789 unsigned int i;
790 u64 mmio_atsd;
791 struct device_node *dn;
792 struct pci_dev *gpdev;
793 static int npu_index;
794 uint64_t rc = 0;
795
796 for_each_child_of_node(phb->hose->dn, dn) {
797 gpdev = pnv_pci_get_gpu_dev(get_pci_dev(dn));
798 if (gpdev) {
799 rc = opal_npu_map_lpar(phb->opal_id,
800 PCI_DEVID(gpdev->bus->number, gpdev->devfn),
801 0, 0);
802 if (rc)
803 dev_err(&gpdev->dev,
804 "Error %lld mapping device to LPAR\n",
805 rc);
806 }
807 }
808
809 for (i = 0; !of_property_read_u64_index(phb->hose->dn, "ibm,mmio-atsd",
810 i, &mmio_atsd); i++)
811 phb->npu.mmio_atsd_regs[i] = ioremap(mmio_atsd, 32);
812
813 pr_info("NPU%lld: Found %d MMIO ATSD registers", phb->opal_id, i);
814 phb->npu.mmio_atsd_count = i;
815 phb->npu.mmio_atsd_usage = 0;
816 npu_index++;
817 if (WARN_ON(npu_index >= NV_MAX_NPUS))
818 return -ENOSPC;
819 max_npu2_index = npu_index;
820 phb->npu.index = npu_index;
821
822 return 0;
823}
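A hedged usage sketch (not part of the commit) of the context API exported above. The pnv_npu2_* calls, the mmap_sem locking rules and the NPU2_WRITE fault flag come from the code in this file; gpu_stop_translations(), gpu_bind_process() and the 0 passed as the init flags are illustrative assumptions:

/* Release callback: the driver must stop the device issuing new ATRs */
static struct npu_context *gpu_release_cb(struct npu_context *ctx, void *priv)
{
	gpu_stop_translations(priv);		/* hypothetical driver helper */
	return ctx;
}

static int gpu_bind_process(struct pci_dev *gpdev, void *drvdata)
{
	struct npu_context *ctx;
	uintptr_t ea[1] = { 0 };		/* faulting effective address */
	unsigned long fault_flags[1] = { NPU2_WRITE };
	unsigned long status[1];

	/* init_context registers an mmu_notifier, so mmap_sem is held for write */
	down_write(&current->mm->mmap_sem);
	ctx = pnv_npu2_init_context(gpdev, 0 /* flags: assumption */,
				    gpu_release_cb, drvdata);
	up_write(&current->mm->mmap_sem);
	if (IS_ERR(ctx))
		return PTR_ERR(ctx);

	/* Later, service a batch of device page faults under mmap_sem */
	down_read(&current->mm->mmap_sem);
	pnv_npu2_handle_fault(ctx, ea, fault_flags, status, 1);
	up_read(&current->mm->mmap_sem);

	pnv_npu2_destroy_context(ctx, gpdev);
	return 0;
}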
diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c
index a91d7876fae2..6c7ad1d8b32e 100644
--- a/arch/powerpc/platforms/powernv/opal-lpc.c
+++ b/arch/powerpc/platforms/powernv/opal-lpc.c
@@ -12,7 +12,6 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/of.h> 13#include <linux/of.h>
14#include <linux/bug.h> 14#include <linux/bug.h>
15#include <linux/debugfs.h>
16#include <linux/io.h> 15#include <linux/io.h>
17#include <linux/slab.h> 16#include <linux/slab.h>
18 17
@@ -21,7 +20,7 @@
21#include <asm/opal.h> 20#include <asm/opal.h>
22#include <asm/prom.h> 21#include <asm/prom.h>
23#include <linux/uaccess.h> 22#include <linux/uaccess.h>
24#include <asm/debug.h> 23#include <asm/debugfs.h>
25#include <asm/isa-bridge.h> 24#include <asm/isa-bridge.h>
26 25
27static int opal_lpc_chip_id = -1; 26static int opal_lpc_chip_id = -1;
diff --git a/arch/powerpc/platforms/powernv/opal-sensor.c b/arch/powerpc/platforms/powernv/opal-sensor.c
index 308efd170c27..aa267f120033 100644
--- a/arch/powerpc/platforms/powernv/opal-sensor.c
+++ b/arch/powerpc/platforms/powernv/opal-sensor.c
@@ -64,6 +64,10 @@ int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data)
64 *sensor_data = be32_to_cpu(data); 64 *sensor_data = be32_to_cpu(data);
65 break; 65 break;
66 66
67 case OPAL_WRONG_STATE:
68 ret = -EIO;
69 break;
70
67 default: 71 default:
68 ret = opal_error_code(ret); 72 ret = opal_error_code(ret);
69 break; 73 break;
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index da8a0f7a035c..f620572f891f 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -50,21 +50,13 @@ END_FTR_SECTION(0, 1); \
50#define OPAL_BRANCH(LABEL) 50#define OPAL_BRANCH(LABEL)
51#endif 51#endif
52 52
53/* TODO: 53/*
54 * 54 * DO_OPAL_CALL assumes:
55 * - Trace irqs in/off (needs saving/restoring all args, argh...) 55 * r0 = opal call token
56 * - Get r11 feed up by Dave so I can have better register usage 56 * r12 = msr
57 * LR has been saved
57 */ 58 */
58 59#define DO_OPAL_CALL() \
59#define OPAL_CALL(name, token) \
60 _GLOBAL_TOC(name); \
61 mfmsr r12; \
62 mflr r0; \
63 andi. r11,r12,MSR_IR|MSR_DR; \
64 std r0,PPC_LR_STKOFF(r1); \
65 li r0,token; \
66 beq opal_real_call; \
67 OPAL_BRANCH(opal_tracepoint_entry) \
68 mfcr r11; \ 60 mfcr r11; \
69 stw r11,8(r1); \ 61 stw r11,8(r1); \
70 li r11,0; \ 62 li r11,0; \
@@ -83,6 +75,18 @@ END_FTR_SECTION(0, 1); \
83 mtspr SPRN_HSRR0,r12; \ 75 mtspr SPRN_HSRR0,r12; \
84 hrfid 76 hrfid
85 77
78#define OPAL_CALL(name, token) \
79 _GLOBAL_TOC(name); \
80 mfmsr r12; \
81 mflr r0; \
82 andi. r11,r12,MSR_IR|MSR_DR; \
83 std r0,PPC_LR_STKOFF(r1); \
84 li r0,token; \
85 beq opal_real_call; \
86 OPAL_BRANCH(opal_tracepoint_entry) \
87 DO_OPAL_CALL()
88
89
86opal_return: 90opal_return:
87 /* 91 /*
88 * Fixup endian on OPAL return... we should be able to simplify 92 * Fixup endian on OPAL return... we should be able to simplify
@@ -148,26 +152,13 @@ opal_tracepoint_entry:
148 ld r8,STK_REG(R29)(r1) 152 ld r8,STK_REG(R29)(r1)
149 ld r9,STK_REG(R30)(r1) 153 ld r9,STK_REG(R30)(r1)
150 ld r10,STK_REG(R31)(r1) 154 ld r10,STK_REG(R31)(r1)
155
156 /* setup LR so we return via tracepoint_return */
151 LOAD_REG_ADDR(r11,opal_tracepoint_return) 157 LOAD_REG_ADDR(r11,opal_tracepoint_return)
152 mfcr r12
153 std r11,16(r1) 158 std r11,16(r1)
154 stw r12,8(r1) 159
155 li r11,0
156 mfmsr r12 160 mfmsr r12
157 ori r11,r11,MSR_EE 161 DO_OPAL_CALL()
158 std r12,PACASAVEDMSR(r13)
159 andc r12,r12,r11
160 mtmsrd r12,1
161 LOAD_REG_ADDR(r11,opal_return)
162 mtlr r11
163 li r11,MSR_DR|MSR_IR|MSR_LE
164 andc r12,r12,r11
165 mtspr SPRN_HSRR1,r12
166 LOAD_REG_ADDR(r11,opal)
167 ld r12,8(r11)
168 ld r2,0(r11)
169 mtspr SPRN_HSRR0,r12
170 hrfid
171 162
172opal_tracepoint_return: 163opal_tracepoint_return:
173 std r3,STK_REG(R31)(r1) 164 std r3,STK_REG(R31)(r1)
@@ -301,3 +292,21 @@ OPAL_CALL(opal_int_eoi, OPAL_INT_EOI);
301OPAL_CALL(opal_int_set_mfrr, OPAL_INT_SET_MFRR); 292OPAL_CALL(opal_int_set_mfrr, OPAL_INT_SET_MFRR);
302OPAL_CALL(opal_pci_tce_kill, OPAL_PCI_TCE_KILL); 293OPAL_CALL(opal_pci_tce_kill, OPAL_PCI_TCE_KILL);
303OPAL_CALL(opal_nmmu_set_ptcr, OPAL_NMMU_SET_PTCR); 294OPAL_CALL(opal_nmmu_set_ptcr, OPAL_NMMU_SET_PTCR);
295OPAL_CALL(opal_xive_reset, OPAL_XIVE_RESET);
296OPAL_CALL(opal_xive_get_irq_info, OPAL_XIVE_GET_IRQ_INFO);
297OPAL_CALL(opal_xive_get_irq_config, OPAL_XIVE_GET_IRQ_CONFIG);
298OPAL_CALL(opal_xive_set_irq_config, OPAL_XIVE_SET_IRQ_CONFIG);
299OPAL_CALL(opal_xive_get_queue_info, OPAL_XIVE_GET_QUEUE_INFO);
300OPAL_CALL(opal_xive_set_queue_info, OPAL_XIVE_SET_QUEUE_INFO);
301OPAL_CALL(opal_xive_donate_page, OPAL_XIVE_DONATE_PAGE);
302OPAL_CALL(opal_xive_alloc_vp_block, OPAL_XIVE_ALLOCATE_VP_BLOCK);
303OPAL_CALL(opal_xive_free_vp_block, OPAL_XIVE_FREE_VP_BLOCK);
304OPAL_CALL(opal_xive_allocate_irq, OPAL_XIVE_ALLOCATE_IRQ);
305OPAL_CALL(opal_xive_free_irq, OPAL_XIVE_FREE_IRQ);
306OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO);
307OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO);
308OPAL_CALL(opal_xive_sync, OPAL_XIVE_SYNC);
309OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP);
310OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT);
311OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT);
312OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR);
diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c
index d0ac535cf5d7..28651fb25417 100644
--- a/arch/powerpc/platforms/powernv/opal-xscom.c
+++ b/arch/powerpc/platforms/powernv/opal-xscom.c
@@ -73,25 +73,32 @@ static int opal_xscom_err_xlate(int64_t rc)
73 73
74static u64 opal_scom_unmangle(u64 addr) 74static u64 opal_scom_unmangle(u64 addr)
75{ 75{
76 u64 tmp;
77
76 /* 78 /*
77 * XSCOM indirect addresses have the top bit set. Additionally 79 * XSCOM addresses use the top nibble to set indirect mode and
78 * the rest of the top 3 nibbles is always 0. 80 * its form. Bits 4-11 are always 0.
79 * 81 *
80 * Because the debugfs interface uses signed offsets and shifts 82 * Because the debugfs interface uses signed offsets and shifts
81 * the address left by 3, we basically cannot use the top 4 bits 83 * the address left by 3, we basically cannot use the top 4 bits
82 * of the 64-bit address, and thus cannot use the indirect bit. 84 * of the 64-bit address, and thus cannot use the indirect bit.
83 * 85 *
84 * To deal with that, we support the indirect bit being in bit 86 * To deal with that, we support the indirect bits being in
85 * 4 (IBM notation) instead of bit 0 in this API, we do the 87 * bits 4-7 (IBM notation) instead of bit 0-3 in this API, we
86 * conversion here. To leave room for further xscom address 88 * do the conversion here.
87 * expansion, we only clear out the top byte
88 * 89 *
 89 * For in-kernel use, we also support the real indirect bit, so 90 * For in-kernel use, we don't need to do this mangling. In-
 90 * we test for any of the top 5 bits 91 * kernel callers won't have bits 4-7 set.
91 * 92 *
93 * So:
94 * debugfs will always set 0-3 = 0 and clear 4-7
95 * kernel will always clear 0-3 = 0 and set 4-7
92 */ 96 */
93 if (addr & (0x1full << 59)) 97 tmp = addr;
94 addr = (addr & ~(0xffull << 56)) | (1ull << 63); 98 tmp &= 0x0f00000000000000;
99 addr &= 0xf0ffffffffffffff;
100 addr |= tmp << 4;
101
95 return addr; 102 return addr;
96} 103}
97 104
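A standalone sketch (not part of the commit) showing the effect of the reworked unmangling on two arbitrary sample addresses, one with the indirect form supplied debugfs-style in IBM bits 4-7 and one already using the top nibble:

#include <stdio.h>
#include <stdint.h>

static uint64_t scom_unmangle(uint64_t addr)
{
	uint64_t tmp = addr & 0x0f00000000000000ULL;	/* IBM bits 4-7 */

	addr &= 0xf0ffffffffffffffULL;			/* keep the top nibble */
	addr |= tmp << 4;				/* move bits 4-7 into 0-3 */
	return addr;
}

int main(void)
{
	/* debugfs-style address: prints 8000000012345678 */
	printf("%016llx\n", (unsigned long long)scom_unmangle(0x0800000012345678ULL));
	/* in-kernel style address: passes through unchanged */
	printf("%016llx\n", (unsigned long long)scom_unmangle(0x8000000012345678ULL));
	return 0;
}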
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index e0f856bfbfe8..7925a9d72cca 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -435,7 +435,7 @@ int opal_machine_check(struct pt_regs *regs)
435 evt.version); 435 evt.version);
436 return 0; 436 return 0;
437 } 437 }
438 machine_check_print_event_info(&evt); 438 machine_check_print_event_info(&evt, user_mode(regs));
439 439
440 if (opal_recover_mce(regs, &evt)) 440 if (opal_recover_mce(regs, &evt))
441 return 1; 441 return 1;
@@ -595,6 +595,80 @@ static void opal_export_symmap(void)
595 pr_warn("Error %d creating OPAL symbols file\n", rc); 595 pr_warn("Error %d creating OPAL symbols file\n", rc);
596} 596}
597 597
598static ssize_t export_attr_read(struct file *fp, struct kobject *kobj,
599 struct bin_attribute *bin_attr, char *buf,
600 loff_t off, size_t count)
601{
602 return memory_read_from_buffer(buf, count, &off, bin_attr->private,
603 bin_attr->size);
604}
605
606/*
607 * opal_export_attrs: creates a sysfs node for each property listed in
608 * the device-tree under /ibm,opal/firmware/exports/
609 * All new sysfs nodes are created under /opal/exports/.
610 * This allows for reserved memory regions (e.g. HDAT) to be read.
611 * The new sysfs nodes are only readable by root.
612 */
613static void opal_export_attrs(void)
614{
615 struct bin_attribute *attr;
616 struct device_node *np;
617 struct property *prop;
618 struct kobject *kobj;
619 u64 vals[2];
620 int rc;
621
622 np = of_find_node_by_path("/ibm,opal/firmware/exports");
623 if (!np)
624 return;
625
626 /* Create new 'exports' directory - /sys/firmware/opal/exports */
627 kobj = kobject_create_and_add("exports", opal_kobj);
628 if (!kobj) {
629 pr_warn("kobject_create_and_add() of exports failed\n");
630 return;
631 }
632
633 for_each_property_of_node(np, prop) {
634 if (!strcmp(prop->name, "name") || !strcmp(prop->name, "phandle"))
635 continue;
636
637 if (of_property_read_u64_array(np, prop->name, &vals[0], 2))
638 continue;
639
640 attr = kzalloc(sizeof(*attr), GFP_KERNEL);
641
642 if (attr == NULL) {
643 pr_warn("Failed kmalloc for bin_attribute!");
644 continue;
645 }
646
647 sysfs_bin_attr_init(attr);
648 attr->attr.name = kstrdup(prop->name, GFP_KERNEL);
649 attr->attr.mode = 0400;
650 attr->read = export_attr_read;
651 attr->private = __va(vals[0]);
652 attr->size = vals[1];
653
654 if (attr->attr.name == NULL) {
655 pr_warn("Failed kstrdup for bin_attribute attr.name");
656 kfree(attr);
657 continue;
658 }
659
660 rc = sysfs_create_bin_file(kobj, attr);
661 if (rc) {
662 pr_warn("Error %d creating OPAL sysfs exports/%s file\n",
663 rc, prop->name);
664 kfree(attr->attr.name);
665 kfree(attr);
666 }
667 }
668
669 of_node_put(np);
670}
671
598static void __init opal_dump_region_init(void) 672static void __init opal_dump_region_init(void)
599{ 673{
600 void *addr; 674 void *addr;
@@ -733,6 +807,9 @@ static int __init opal_init(void)
733 opal_msglog_sysfs_init(); 807 opal_msglog_sysfs_init();
734 } 808 }
735 809
810 /* Export all properties */
811 opal_export_attrs();
812
736 /* Initialize platform devices: IPMI backend, PRD & flash interface */ 813 /* Initialize platform devices: IPMI backend, PRD & flash interface */
737 opal_pdev_init("ibm,opal-ipmi"); 814 opal_pdev_init("ibm,opal-ipmi");
738 opal_pdev_init("ibm,opal-flash"); 815 opal_pdev_init("ibm,opal-flash");
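A userspace sketch (not part of the commit) that reads one of the new root-only files under /sys/firmware/opal/exports/. The HDAT name is only the example given in the comment above; the entries actually present depend on what firmware lists in the device tree:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	/* Each export file maps a firmware-reserved memory region, mode 0400 */
	FILE *f = fopen("/sys/firmware/opal/exports/HDAT", "rb");
	unsigned char buf[64];
	size_t n;

	if (!f) {
		perror("fopen");
		return EXIT_FAILURE;
	}

	n = fread(buf, 1, sizeof(buf), f);
	printf("read %zu bytes of the exported region\n", n);
	fclose(f);
	return EXIT_SUCCESS;
}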
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index e36738291c32..6fdbd383f676 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -14,7 +14,6 @@
14#include <linux/kernel.h> 14#include <linux/kernel.h>
15#include <linux/pci.h> 15#include <linux/pci.h>
16#include <linux/crash_dump.h> 16#include <linux/crash_dump.h>
17#include <linux/debugfs.h>
18#include <linux/delay.h> 17#include <linux/delay.h>
19#include <linux/string.h> 18#include <linux/string.h>
20#include <linux/init.h> 19#include <linux/init.h>
@@ -38,7 +37,7 @@
38#include <asm/iommu.h> 37#include <asm/iommu.h>
39#include <asm/tce.h> 38#include <asm/tce.h>
40#include <asm/xics.h> 39#include <asm/xics.h>
41#include <asm/debug.h> 40#include <asm/debugfs.h>
42#include <asm/firmware.h> 41#include <asm/firmware.h>
43#include <asm/pnv-pci.h> 42#include <asm/pnv-pci.h>
44#include <asm/mmzone.h> 43#include <asm/mmzone.h>
@@ -1262,6 +1261,8 @@ static void pnv_pci_ioda_setup_PEs(void)
1262 /* PE#0 is needed for error reporting */ 1261 /* PE#0 is needed for error reporting */
1263 pnv_ioda_reserve_pe(phb, 0); 1262 pnv_ioda_reserve_pe(phb, 0);
1264 pnv_ioda_setup_npu_PEs(hose->bus); 1263 pnv_ioda_setup_npu_PEs(hose->bus);
1264 if (phb->model == PNV_PHB_MODEL_NPU2)
1265 pnv_npu2_init(phb);
1265 } 1266 }
1266 } 1267 }
1267} 1268}
@@ -1424,8 +1425,7 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
1424 iommu_group_put(pe->table_group.group); 1425 iommu_group_put(pe->table_group.group);
1425 BUG_ON(pe->table_group.group); 1426 BUG_ON(pe->table_group.group);
1426 } 1427 }
1427 pnv_pci_ioda2_table_free_pages(tbl); 1428 iommu_tce_table_put(tbl);
1428 iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
1429} 1429}
1430 1430
1431static void pnv_ioda_release_vf_PE(struct pci_dev *pdev) 1431static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
@@ -1860,6 +1860,17 @@ static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index,
1860 1860
1861 return ret; 1861 return ret;
1862} 1862}
1863
1864static int pnv_ioda1_tce_xchg_rm(struct iommu_table *tbl, long index,
1865 unsigned long *hpa, enum dma_data_direction *direction)
1866{
1867 long ret = pnv_tce_xchg(tbl, index, hpa, direction);
1868
1869 if (!ret)
1870 pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, true);
1871
1872 return ret;
1873}
1863#endif 1874#endif
1864 1875
1865static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index, 1876static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
@@ -1874,6 +1885,7 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = {
1874 .set = pnv_ioda1_tce_build, 1885 .set = pnv_ioda1_tce_build,
1875#ifdef CONFIG_IOMMU_API 1886#ifdef CONFIG_IOMMU_API
1876 .exchange = pnv_ioda1_tce_xchg, 1887 .exchange = pnv_ioda1_tce_xchg,
1888 .exchange_rm = pnv_ioda1_tce_xchg_rm,
1877#endif 1889#endif
1878 .clear = pnv_ioda1_tce_free, 1890 .clear = pnv_ioda1_tce_free,
1879 .get = pnv_tce_get, 1891 .get = pnv_tce_get,
@@ -1883,7 +1895,7 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = {
1883#define PHB3_TCE_KILL_INVAL_PE PPC_BIT(1) 1895#define PHB3_TCE_KILL_INVAL_PE PPC_BIT(1)
1884#define PHB3_TCE_KILL_INVAL_ONE PPC_BIT(2) 1896#define PHB3_TCE_KILL_INVAL_ONE PPC_BIT(2)
1885 1897
1886void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm) 1898static void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
1887{ 1899{
1888 __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(phb, rm); 1900 __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(phb, rm);
1889 const unsigned long val = PHB3_TCE_KILL_INVAL_ALL; 1901 const unsigned long val = PHB3_TCE_KILL_INVAL_ALL;
@@ -1948,7 +1960,7 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
1948{ 1960{
1949 struct iommu_table_group_link *tgl; 1961 struct iommu_table_group_link *tgl;
1950 1962
1951 list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) { 1963 list_for_each_entry_lockless(tgl, &tbl->it_group_list, next) {
1952 struct pnv_ioda_pe *pe = container_of(tgl->table_group, 1964 struct pnv_ioda_pe *pe = container_of(tgl->table_group,
1953 struct pnv_ioda_pe, table_group); 1965 struct pnv_ioda_pe, table_group);
1954 struct pnv_phb *phb = pe->phb; 1966 struct pnv_phb *phb = pe->phb;
@@ -1979,6 +1991,14 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
1979 } 1991 }
1980} 1992}
1981 1993
1994void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
1995{
1996 if (phb->model == PNV_PHB_MODEL_NPU || phb->model == PNV_PHB_MODEL_PHB3)
1997 pnv_pci_phb3_tce_invalidate_entire(phb, rm);
1998 else
1999 opal_pci_tce_kill(phb->opal_id, OPAL_PCI_TCE_KILL, 0, 0, 0, 0);
2000}
2001
1982static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index, 2002static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
1983 long npages, unsigned long uaddr, 2003 long npages, unsigned long uaddr,
1984 enum dma_data_direction direction, 2004 enum dma_data_direction direction,
@@ -2004,6 +2024,17 @@ static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index,
2004 2024
2005 return ret; 2025 return ret;
2006} 2026}
2027
2028static int pnv_ioda2_tce_xchg_rm(struct iommu_table *tbl, long index,
2029 unsigned long *hpa, enum dma_data_direction *direction)
2030{
2031 long ret = pnv_tce_xchg(tbl, index, hpa, direction);
2032
2033 if (!ret)
2034 pnv_pci_ioda2_tce_invalidate(tbl, index, 1, true);
2035
2036 return ret;
2037}
2007#endif 2038#endif
2008 2039
2009static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index, 2040static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
@@ -2017,13 +2048,13 @@ static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
2017static void pnv_ioda2_table_free(struct iommu_table *tbl) 2048static void pnv_ioda2_table_free(struct iommu_table *tbl)
2018{ 2049{
2019 pnv_pci_ioda2_table_free_pages(tbl); 2050 pnv_pci_ioda2_table_free_pages(tbl);
2020 iommu_free_table(tbl, "pnv");
2021} 2051}
2022 2052
2023static struct iommu_table_ops pnv_ioda2_iommu_ops = { 2053static struct iommu_table_ops pnv_ioda2_iommu_ops = {
2024 .set = pnv_ioda2_tce_build, 2054 .set = pnv_ioda2_tce_build,
2025#ifdef CONFIG_IOMMU_API 2055#ifdef CONFIG_IOMMU_API
2026 .exchange = pnv_ioda2_tce_xchg, 2056 .exchange = pnv_ioda2_tce_xchg,
2057 .exchange_rm = pnv_ioda2_tce_xchg_rm,
2027#endif 2058#endif
2028 .clear = pnv_ioda2_tce_free, 2059 .clear = pnv_ioda2_tce_free,
2029 .get = pnv_tce_get, 2060 .get = pnv_tce_get,
@@ -2128,6 +2159,9 @@ static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
2128 2159
2129found: 2160found:
2130 tbl = pnv_pci_table_alloc(phb->hose->node); 2161 tbl = pnv_pci_table_alloc(phb->hose->node);
2162 if (WARN_ON(!tbl))
2163 return;
2164
2131 iommu_register_group(&pe->table_group, phb->hose->global_number, 2165 iommu_register_group(&pe->table_group, phb->hose->global_number,
2132 pe->pe_number); 2166 pe->pe_number);
2133 pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group); 2167 pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
@@ -2203,7 +2237,7 @@ found:
2203 __free_pages(tce_mem, get_order(tce32_segsz * segs)); 2237 __free_pages(tce_mem, get_order(tce32_segsz * segs));
2204 if (tbl) { 2238 if (tbl) {
2205 pnv_pci_unlink_table_and_group(tbl, &pe->table_group); 2239 pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
2206 iommu_free_table(tbl, "pnv"); 2240 iommu_tce_table_put(tbl);
2207 } 2241 }
2208} 2242}
2209 2243
@@ -2293,16 +2327,16 @@ static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group,
2293 if (!tbl) 2327 if (!tbl)
2294 return -ENOMEM; 2328 return -ENOMEM;
2295 2329
2330 tbl->it_ops = &pnv_ioda2_iommu_ops;
2331
2296 ret = pnv_pci_ioda2_table_alloc_pages(nid, 2332 ret = pnv_pci_ioda2_table_alloc_pages(nid,
2297 bus_offset, page_shift, window_size, 2333 bus_offset, page_shift, window_size,
2298 levels, tbl); 2334 levels, tbl);
2299 if (ret) { 2335 if (ret) {
2300 iommu_free_table(tbl, "pnv"); 2336 iommu_tce_table_put(tbl);
2301 return ret; 2337 return ret;
2302 } 2338 }
2303 2339
2304 tbl->it_ops = &pnv_ioda2_iommu_ops;
2305
2306 *ptbl = tbl; 2340 *ptbl = tbl;
2307 2341
2308 return 0; 2342 return 0;
@@ -2343,7 +2377,7 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
2343 if (rc) { 2377 if (rc) {
2344 pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n", 2378 pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n",
2345 rc); 2379 rc);
2346 pnv_ioda2_table_free(tbl); 2380 iommu_tce_table_put(tbl);
2347 return rc; 2381 return rc;
2348 } 2382 }
2349 2383
@@ -2414,7 +2448,8 @@ static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
2414 2448
2415 tce_table_size /= direct_table_size; 2449 tce_table_size /= direct_table_size;
2416 tce_table_size <<= 3; 2450 tce_table_size <<= 3;
2417 tce_table_size = _ALIGN_UP(tce_table_size, direct_table_size); 2451 tce_table_size = max_t(unsigned long,
2452 tce_table_size, direct_table_size);
2418 } 2453 }
2419 2454
2420 return bytes; 2455 return bytes;
@@ -2431,7 +2466,7 @@ static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
2431 pnv_pci_ioda2_unset_window(&pe->table_group, 0); 2466 pnv_pci_ioda2_unset_window(&pe->table_group, 0);
2432 if (pe->pbus) 2467 if (pe->pbus)
2433 pnv_ioda_setup_bus_dma(pe, pe->pbus, false); 2468 pnv_ioda_setup_bus_dma(pe, pe->pbus, false);
2434 pnv_ioda2_table_free(tbl); 2469 iommu_tce_table_put(tbl);
2435} 2470}
2436 2471
2437static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group) 2472static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
@@ -2735,9 +2770,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
2735 if (rc) 2770 if (rc)
2736 return; 2771 return;
2737 2772
2738 if (pe->flags & PNV_IODA_PE_DEV) 2773 if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
2739 iommu_add_device(&pe->pdev->dev);
2740 else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
2741 pnv_ioda_setup_bus_dma(pe, pe->pbus, true); 2774 pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
2742} 2775}
2743 2776
@@ -3406,7 +3439,7 @@ static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe)
3406 } 3439 }
3407 3440
3408 free_pages(tbl->it_base, get_order(tbl->it_size << 3)); 3441 free_pages(tbl->it_base, get_order(tbl->it_size << 3));
3409 iommu_free_table(tbl, "pnv"); 3442 iommu_tce_table_put(tbl);
3410} 3443}
3411 3444
3412static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe) 3445static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
@@ -3433,7 +3466,7 @@ static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
3433 } 3466 }
3434 3467
3435 pnv_pci_ioda2_table_free_pages(tbl); 3468 pnv_pci_ioda2_table_free_pages(tbl);
3436 iommu_free_table(tbl, "pnv"); 3469 iommu_tce_table_put(tbl);
3437} 3470}
3438 3471
3439static void pnv_ioda_free_pe_seg(struct pnv_ioda_pe *pe, 3472static void pnv_ioda_free_pe_seg(struct pnv_ioda_pe *pe,
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index eb835e977e33..935ccb249a8a 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -758,7 +758,7 @@ void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
758 758
759unsigned long pnv_tce_get(struct iommu_table *tbl, long index) 759unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
760{ 760{
761 return *(pnv_tce(tbl, index - tbl->it_offset)); 761 return be64_to_cpu(*(pnv_tce(tbl, index - tbl->it_offset)));
762} 762}
763 763
764struct iommu_table *pnv_pci_table_alloc(int nid) 764struct iommu_table *pnv_pci_table_alloc(int nid)
@@ -766,7 +766,11 @@ struct iommu_table *pnv_pci_table_alloc(int nid)
766 struct iommu_table *tbl; 766 struct iommu_table *tbl;
767 767
768 tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, nid); 768 tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, nid);
769 if (!tbl)
770 return NULL;
771
769 INIT_LIST_HEAD_RCU(&tbl->it_group_list); 772 INIT_LIST_HEAD_RCU(&tbl->it_group_list);
773 kref_init(&tbl->it_kref);
770 774
771 return tbl; 775 return tbl;
772} 776}
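The iommu_tce_table_put() calls in the hunks above pair with the kref_init() added here. The helper itself is introduced elsewhere in this series, so the following is only an assumption about its shape, a kref_put() that frees the table on the last reference:

void iommu_tce_table_put(struct iommu_table *tbl)
{
	if (WARN_ON(!tbl))
		return;

	/* Assumed release path: iommu_table_free() as the kref release callback */
	kref_put(&tbl->it_kref, iommu_table_free);
}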
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index e1d3e5526b54..18c8a2fa03b8 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -7,6 +7,9 @@
7 7
8struct pci_dn; 8struct pci_dn;
9 9
10/* Maximum possible number of ATSD MMIO registers per NPU */
11#define NV_NMMU_ATSD_REGS 8
12
10enum pnv_phb_type { 13enum pnv_phb_type {
11 PNV_PHB_IODA1 = 0, 14 PNV_PHB_IODA1 = 0,
12 PNV_PHB_IODA2 = 1, 15 PNV_PHB_IODA2 = 1,
@@ -174,6 +177,16 @@ struct pnv_phb {
174 struct OpalIoP7IOCErrorData hub_diag; 177 struct OpalIoP7IOCErrorData hub_diag;
175 } diag; 178 } diag;
176 179
180 /* Nvlink2 data */
181 struct npu {
182 int index;
183 __be64 *mmio_atsd_regs[NV_NMMU_ATSD_REGS];
184 unsigned int mmio_atsd_count;
185
186 /* Bitmask for MMIO register usage */
187 unsigned long mmio_atsd_usage;
188 } npu;
189
177#ifdef CONFIG_CXL_BASE 190#ifdef CONFIG_CXL_BASE
178 struct cxl_afu *cxl_afu; 191 struct cxl_afu *cxl_afu;
179#endif 192#endif
@@ -229,14 +242,14 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
229 242
230/* Nvlink functions */ 243/* Nvlink functions */
231extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass); 244extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);
232extern void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm); 245extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
233extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe); 246extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe);
234extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num, 247extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
235 struct iommu_table *tbl); 248 struct iommu_table *tbl);
236extern long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num); 249extern long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num);
237extern void pnv_npu_take_ownership(struct pnv_ioda_pe *npe); 250extern void pnv_npu_take_ownership(struct pnv_ioda_pe *npe);
238extern void pnv_npu_release_ownership(struct pnv_ioda_pe *npe); 251extern void pnv_npu_release_ownership(struct pnv_ioda_pe *npe);
239 252extern int pnv_npu2_init(struct pnv_phb *phb);
240 253
241/* cxl functions */ 254/* cxl functions */
242extern bool pnv_cxl_enable_device_hook(struct pci_dev *dev); 255extern bool pnv_cxl_enable_device_hook(struct pci_dev *dev);
diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h
index 613052232475..6dbc0a1da1f6 100644
--- a/arch/powerpc/platforms/powernv/powernv.h
+++ b/arch/powerpc/platforms/powernv/powernv.h
@@ -18,8 +18,6 @@ static inline void pnv_pci_shutdown(void) { }
18#endif 18#endif
19 19
20extern u32 pnv_get_supported_cpuidle_states(void); 20extern u32 pnv_get_supported_cpuidle_states(void);
21extern u64 pnv_deepest_stop_psscr_val;
22extern u64 pnv_deepest_stop_psscr_mask;
23 21
24extern void pnv_lpc_init(void); 22extern void pnv_lpc_init(void);
25 23
diff --git a/arch/powerpc/platforms/powernv/rng.c b/arch/powerpc/platforms/powernv/rng.c
index 5dcbdea1afac..1a9d84371a4d 100644
--- a/arch/powerpc/platforms/powernv/rng.c
+++ b/arch/powerpc/platforms/powernv/rng.c
@@ -62,7 +62,7 @@ int powernv_get_random_real_mode(unsigned long *v)
62 62
63 rng = raw_cpu_read(powernv_rng); 63 rng = raw_cpu_read(powernv_rng);
64 64
65 *v = rng_whiten(rng, in_rm64(rng->regs_real)); 65 *v = rng_whiten(rng, __raw_rm_readq(rng->regs_real));
66 66
67 return 1; 67 return 1;
68} 68}
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index d50c7d99baaf..2dc7e5fb86c3 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -32,6 +32,7 @@
32#include <asm/machdep.h> 32#include <asm/machdep.h>
33#include <asm/firmware.h> 33#include <asm/firmware.h>
34#include <asm/xics.h> 34#include <asm/xics.h>
35#include <asm/xive.h>
35#include <asm/opal.h> 36#include <asm/opal.h>
36#include <asm/kexec.h> 37#include <asm/kexec.h>
37#include <asm/smp.h> 38#include <asm/smp.h>
@@ -76,7 +77,9 @@ static void __init pnv_init(void)
76 77
77static void __init pnv_init_IRQ(void) 78static void __init pnv_init_IRQ(void)
78{ 79{
79 xics_init(); 80 /* Try using a XIVE if available, otherwise use a XICS */
81 if (!xive_native_init())
82 xics_init();
80 83
81 WARN_ON(!ppc_md.get_irq); 84 WARN_ON(!ppc_md.get_irq);
82} 85}
@@ -95,6 +98,10 @@ static void pnv_show_cpuinfo(struct seq_file *m)
95 else 98 else
96 seq_printf(m, "firmware\t: BML\n"); 99 seq_printf(m, "firmware\t: BML\n");
97 of_node_put(root); 100 of_node_put(root);
101 if (radix_enabled())
102 seq_printf(m, "MMU\t\t: Radix\n");
103 else
104 seq_printf(m, "MMU\t\t: Hash\n");
98} 105}
99 106
100static void pnv_prepare_going_down(void) 107static void pnv_prepare_going_down(void)
@@ -218,10 +225,12 @@ static void pnv_kexec_wait_secondaries_down(void)
218 225
219static void pnv_kexec_cpu_down(int crash_shutdown, int secondary) 226static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
220{ 227{
221 xics_kexec_teardown_cpu(secondary); 228 if (xive_enabled())
229 xive_kexec_teardown_cpu(secondary);
230 else
231 xics_kexec_teardown_cpu(secondary);
222 232
223 /* On OPAL, we return all CPUs to firmware */ 233 /* On OPAL, we return all CPUs to firmware */
224
225 if (!firmware_has_feature(FW_FEATURE_OPAL)) 234 if (!firmware_has_feature(FW_FEATURE_OPAL))
226 return; 235 return;
227 236
@@ -237,6 +246,10 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
237 /* Primary waits for the secondaries to have reached OPAL */ 246 /* Primary waits for the secondaries to have reached OPAL */
238 pnv_kexec_wait_secondaries_down(); 247 pnv_kexec_wait_secondaries_down();
239 248
249 /* Switch XIVE back to emulation mode */
250 if (xive_enabled())
251 xive_shutdown();
252
240 /* 253 /*
241 * We might be running as little-endian - now that interrupts 254 * We might be running as little-endian - now that interrupts
242 * are disabled, reset the HILE bit to big-endian so we don't 255 * are disabled, reset the HILE bit to big-endian so we don't
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 8b67e1eefb5c..4aff754b6f2c 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -29,12 +29,14 @@
29#include <asm/vdso_datapage.h> 29#include <asm/vdso_datapage.h>
30#include <asm/cputhreads.h> 30#include <asm/cputhreads.h>
31#include <asm/xics.h> 31#include <asm/xics.h>
32#include <asm/xive.h>
32#include <asm/opal.h> 33#include <asm/opal.h>
33#include <asm/runlatch.h> 34#include <asm/runlatch.h>
34#include <asm/code-patching.h> 35#include <asm/code-patching.h>
35#include <asm/dbell.h> 36#include <asm/dbell.h>
36#include <asm/kvm_ppc.h> 37#include <asm/kvm_ppc.h>
37#include <asm/ppc-opcode.h> 38#include <asm/ppc-opcode.h>
39#include <asm/cpuidle.h>
38 40
39#include "powernv.h" 41#include "powernv.h"
40 42
@@ -47,13 +49,10 @@
47 49
48static void pnv_smp_setup_cpu(int cpu) 50static void pnv_smp_setup_cpu(int cpu)
49{ 51{
50 if (cpu != boot_cpuid) 52 if (xive_enabled())
53 xive_smp_setup_cpu();
54 else if (cpu != boot_cpuid)
51 xics_setup_cpu(); 55 xics_setup_cpu();
52
53#ifdef CONFIG_PPC_DOORBELL
54 if (cpu_has_feature(CPU_FTR_DBELL))
55 doorbell_setup_this_cpu();
56#endif
57} 56}
58 57
59static int pnv_smp_kick_cpu(int nr) 58static int pnv_smp_kick_cpu(int nr)
@@ -132,7 +131,10 @@ static int pnv_smp_cpu_disable(void)
132 vdso_data->processorCount--; 131 vdso_data->processorCount--;
133 if (cpu == boot_cpuid) 132 if (cpu == boot_cpuid)
134 boot_cpuid = cpumask_any(cpu_online_mask); 133 boot_cpuid = cpumask_any(cpu_online_mask);
135 xics_migrate_irqs_away(); 134 if (xive_enabled())
135 xive_smp_disable_cpu();
136 else
137 xics_migrate_irqs_away();
136 return 0; 138 return 0;
137} 139}
138 140
@@ -140,7 +142,6 @@ static void pnv_smp_cpu_kill_self(void)
140{ 142{
141 unsigned int cpu; 143 unsigned int cpu;
142 unsigned long srr1, wmask; 144 unsigned long srr1, wmask;
143 u32 idle_states;
144 145
145 /* Standard hot unplug procedure */ 146 /* Standard hot unplug procedure */
146 local_irq_disable(); 147 local_irq_disable();
@@ -155,8 +156,6 @@ static void pnv_smp_cpu_kill_self(void)
155 if (cpu_has_feature(CPU_FTR_ARCH_207S)) 156 if (cpu_has_feature(CPU_FTR_ARCH_207S))
156 wmask = SRR1_WAKEMASK_P8; 157 wmask = SRR1_WAKEMASK_P8;
157 158
158 idle_states = pnv_get_supported_cpuidle_states();
159
160 /* We don't want to take decrementer interrupts while we are offline, 159 /* We don't want to take decrementer interrupts while we are offline,
161 * so clear LPCR:PECE1. We keep PECE2 (and LPCR_PECE_HVEE on P9) 160 * so clear LPCR:PECE1. We keep PECE2 (and LPCR_PECE_HVEE on P9)
162 * enabled as to let IPIs in. 161 * enabled as to let IPIs in.
@@ -184,19 +183,7 @@ static void pnv_smp_cpu_kill_self(void)
184 kvmppc_set_host_ipi(cpu, 0); 183 kvmppc_set_host_ipi(cpu, 0);
185 184
186 ppc64_runlatch_off(); 185 ppc64_runlatch_off();
187 186 srr1 = pnv_cpu_offline(cpu);
188 if (cpu_has_feature(CPU_FTR_ARCH_300)) {
189 srr1 = power9_idle_stop(pnv_deepest_stop_psscr_val,
190 pnv_deepest_stop_psscr_mask);
191 } else if (idle_states & OPAL_PM_WINKLE_ENABLED) {
192 srr1 = power7_winkle();
193 } else if ((idle_states & OPAL_PM_SLEEP_ENABLED) ||
194 (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
195 srr1 = power7_sleep();
196 } else {
197 srr1 = power7_nap(1);
198 }
199
200 ppc64_runlatch_on(); 187 ppc64_runlatch_on();
201 188
202 /* 189 /*
@@ -213,9 +200,12 @@ static void pnv_smp_cpu_kill_self(void)
213 if (((srr1 & wmask) == SRR1_WAKEEE) || 200 if (((srr1 & wmask) == SRR1_WAKEEE) ||
214 ((srr1 & wmask) == SRR1_WAKEHVI) || 201 ((srr1 & wmask) == SRR1_WAKEHVI) ||
215 (local_paca->irq_happened & PACA_IRQ_EE)) { 202 (local_paca->irq_happened & PACA_IRQ_EE)) {
216 if (cpu_has_feature(CPU_FTR_ARCH_300)) 203 if (cpu_has_feature(CPU_FTR_ARCH_300)) {
217 icp_opal_flush_interrupt(); 204 if (xive_enabled())
218 else 205 xive_flush_interrupt();
206 else
207 icp_opal_flush_interrupt();
208 } else
219 icp_native_flush_interrupt(); 209 icp_native_flush_interrupt();
220 } else if ((srr1 & wmask) == SRR1_WAKEHDBELL) { 210 } else if ((srr1 & wmask) == SRR1_WAKEHDBELL) {
221 unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); 211 unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
@@ -252,10 +242,69 @@ static int pnv_cpu_bootable(unsigned int nr)
252 return smp_generic_cpu_bootable(nr); 242 return smp_generic_cpu_bootable(nr);
253} 243}
254 244
245static int pnv_smp_prepare_cpu(int cpu)
246{
247 if (xive_enabled())
248 return xive_smp_prepare_cpu(cpu);
249 return 0;
250}
251
252/* Cause IPI as setup by the interrupt controller (xics or xive) */
253static void (*ic_cause_ipi)(int cpu);
254
255static void pnv_cause_ipi(int cpu)
256{
257 if (doorbell_try_core_ipi(cpu))
258 return;
259
260 ic_cause_ipi(cpu);
261}
262
263static void pnv_p9_dd1_cause_ipi(int cpu)
264{
265 int this_cpu = get_cpu();
266
267 /*
268 * POWER9 DD1 has a global addressed msgsnd, but for now we restrict
269 * IPIs to same core, because it requires additional synchronization
270 * for inter-core doorbells which we do not implement.
271 */
272 if (cpumask_test_cpu(cpu, cpu_sibling_mask(this_cpu)))
273 doorbell_global_ipi(cpu);
274 else
275 ic_cause_ipi(cpu);
276
277 put_cpu();
278}
279
280static void __init pnv_smp_probe(void)
281{
282 if (xive_enabled())
283 xive_smp_probe();
284 else
285 xics_smp_probe();
286
287 if (cpu_has_feature(CPU_FTR_DBELL)) {
288 ic_cause_ipi = smp_ops->cause_ipi;
289 WARN_ON(!ic_cause_ipi);
290
291 if (cpu_has_feature(CPU_FTR_ARCH_300)) {
292 if (cpu_has_feature(CPU_FTR_POWER9_DD1))
293 smp_ops->cause_ipi = pnv_p9_dd1_cause_ipi;
294 else
295 smp_ops->cause_ipi = doorbell_global_ipi;
296 } else {
297 smp_ops->cause_ipi = pnv_cause_ipi;
298 }
299 }
300}
301
255static struct smp_ops_t pnv_smp_ops = { 302static struct smp_ops_t pnv_smp_ops = {
256 .message_pass = smp_muxed_ipi_message_pass, 303 .message_pass = NULL, /* Use smp_muxed_ipi_message_pass */
257 .cause_ipi = NULL, /* Filled at runtime by xics_smp_probe() */ 304 .cause_ipi = NULL, /* Filled at runtime by pnv_smp_probe() */
258 .probe = xics_smp_probe, 305 .cause_nmi_ipi = NULL,
306 .probe = pnv_smp_probe,
307 .prepare_cpu = pnv_smp_prepare_cpu,
259 .kick_cpu = pnv_smp_kick_cpu, 308 .kick_cpu = pnv_smp_kick_cpu,
260 .setup_cpu = pnv_smp_setup_cpu, 309 .setup_cpu = pnv_smp_setup_cpu,
261 .cpu_bootable = pnv_cpu_bootable, 310 .cpu_bootable = pnv_cpu_bootable,
diff --git a/arch/powerpc/platforms/ps3/smp.c b/arch/powerpc/platforms/ps3/smp.c
index 60154d08debf..1d1ad5df106f 100644
--- a/arch/powerpc/platforms/ps3/smp.c
+++ b/arch/powerpc/platforms/ps3/smp.c
@@ -77,7 +77,7 @@ static void __init ps3_smp_probe(void)
77 BUILD_BUG_ON(PPC_MSG_CALL_FUNCTION != 0); 77 BUILD_BUG_ON(PPC_MSG_CALL_FUNCTION != 0);
78 BUILD_BUG_ON(PPC_MSG_RESCHEDULE != 1); 78 BUILD_BUG_ON(PPC_MSG_RESCHEDULE != 1);
79 BUILD_BUG_ON(PPC_MSG_TICK_BROADCAST != 2); 79 BUILD_BUG_ON(PPC_MSG_TICK_BROADCAST != 2);
80 BUILD_BUG_ON(PPC_MSG_DEBUGGER_BREAK != 3); 80 BUILD_BUG_ON(PPC_MSG_NMI_IPI != 3);
81 81
82 for (i = 0; i < MSG_COUNT; i++) { 82 for (i = 0; i < MSG_COUNT; i++) {
83 result = ps3_event_receive_port_setup(cpu, &virqs[i]); 83 result = ps3_event_receive_port_setup(cpu, &virqs[i]);
@@ -96,7 +96,7 @@ static void __init ps3_smp_probe(void)
96 ps3_register_ipi_irq(cpu, virqs[i]); 96 ps3_register_ipi_irq(cpu, virqs[i]);
97 } 97 }
98 98
99 ps3_register_ipi_debug_brk(cpu, virqs[PPC_MSG_DEBUGGER_BREAK]); 99 ps3_register_ipi_debug_brk(cpu, virqs[PPC_MSG_NMI_IPI]);
100 100
101 DBG(" <- %s:%d: (%d)\n", __func__, __LINE__, cpu); 101 DBG(" <- %s:%d: (%d)\n", __func__, __LINE__, cpu);
102 } 102 }
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index 30ec04f1c67c..913c54e23eea 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -17,9 +17,10 @@ config PPC_PSERIES
17 select PPC_UDBG_16550 17 select PPC_UDBG_16550
18 select PPC_NATIVE 18 select PPC_NATIVE
19 select PPC_DOORBELL 19 select PPC_DOORBELL
20 select HOTPLUG_CPU if SMP 20 select HOTPLUG_CPU
21 select ARCH_RANDOM 21 select ARCH_RANDOM
22 select PPC_DOORBELL 22 select PPC_DOORBELL
23 select FORCE_SMP
23 default y 24 default y
24 25
25config PPC_SPLPAR 26config PPC_SPLPAR
diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c
index 193e052fa0dd..bda18d8e1674 100644
--- a/arch/powerpc/platforms/pseries/dlpar.c
+++ b/arch/powerpc/platforms/pseries/dlpar.c
@@ -288,7 +288,6 @@ int dlpar_detach_node(struct device_node *dn)
288 if (rc) 288 if (rc)
289 return rc; 289 return rc;
290 290
291 of_node_put(dn); /* Must decrement the refcount */
292 return 0; 291 return 0;
293} 292}
294 293
diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c
index 6b04e3f0f982..18014cdeb590 100644
--- a/arch/powerpc/platforms/pseries/dtl.c
+++ b/arch/powerpc/platforms/pseries/dtl.c
@@ -21,13 +21,12 @@
21 */ 21 */
22 22
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/debugfs.h>
25#include <linux/spinlock.h> 24#include <linux/spinlock.h>
26#include <asm/smp.h> 25#include <asm/smp.h>
27#include <linux/uaccess.h> 26#include <linux/uaccess.h>
28#include <asm/firmware.h> 27#include <asm/firmware.h>
29#include <asm/lppaca.h> 28#include <asm/lppaca.h>
30#include <asm/debug.h> 29#include <asm/debugfs.h>
31#include <asm/plpar_wrappers.h> 30#include <asm/plpar_wrappers.h>
32#include <asm/machdep.h> 31#include <asm/machdep.h>
33 32
diff --git a/arch/powerpc/platforms/pseries/hvCall_inst.c b/arch/powerpc/platforms/pseries/hvCall_inst.c
index f02ec3ab428c..957ae347b0b3 100644
--- a/arch/powerpc/platforms/pseries/hvCall_inst.c
+++ b/arch/powerpc/platforms/pseries/hvCall_inst.c
@@ -29,6 +29,16 @@
29#include <asm/trace.h> 29#include <asm/trace.h>
30#include <asm/machdep.h> 30#include <asm/machdep.h>
31 31
32/* For hcall instrumentation. One structure per-hcall, per-CPU */
33struct hcall_stats {
34 unsigned long num_calls; /* number of calls (on this CPU) */
35 unsigned long tb_total; /* total wall time (mftb) of calls. */
36 unsigned long purr_total; /* total cpu time (PURR) of calls. */
37 unsigned long tb_start;
38 unsigned long purr_start;
39};
40#define HCALL_STAT_ARRAY_SIZE ((MAX_HCALL_OPCODE >> 2) + 1)
41
32DEFINE_PER_CPU(struct hcall_stats[HCALL_STAT_ARRAY_SIZE], hcall_stats); 42DEFINE_PER_CPU(struct hcall_stats[HCALL_STAT_ARRAY_SIZE], hcall_stats);
33 43
34/* 44/*
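
PAPR hcall opcodes are multiples of four, which is why the per-CPU stats array above is sized and indexed by opcode >> 2. A standalone sketch of that mapping; EXAMPLE_MAX_HCALL_OPCODE is a made-up value standing in for MAX_HCALL_OPCODE from hvcall.h.

#include <stdio.h>

/* Made-up maximum; the kernel's MAX_HCALL_OPCODE lives in hvcall.h */
#define EXAMPLE_MAX_HCALL_OPCODE 0x450
#define HCALL_STAT_ARRAY_SIZE ((EXAMPLE_MAX_HCALL_OPCODE >> 2) + 1)

int main(void)
{
	/* hcall opcodes are multiples of 4, so opcode >> 2 is a dense index */
	unsigned long opcodes[] = { 0x04, 0x08, 0x450 };

	for (int i = 0; i < 3; i++)
		printf("opcode 0x%lx -> hcall_stats[%lu] of %d\n",
		       opcodes[i], opcodes[i] >> 2, HCALL_STAT_ARRAY_SIZE);
	return 0;
}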
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 4d757eaa46bf..8374adee27e3 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -74,6 +74,7 @@ static struct iommu_table_group *iommu_pseries_alloc_group(int node)
74 goto fail_exit; 74 goto fail_exit;
75 75
76 INIT_LIST_HEAD_RCU(&tbl->it_group_list); 76 INIT_LIST_HEAD_RCU(&tbl->it_group_list);
77 kref_init(&tbl->it_kref);
77 tgl->table_group = table_group; 78 tgl->table_group = table_group;
78 list_add_rcu(&tgl->next, &tbl->it_group_list); 79 list_add_rcu(&tgl->next, &tbl->it_group_list);
79 80
@@ -115,7 +116,7 @@ static void iommu_pseries_free_group(struct iommu_table_group *table_group,
115 BUG_ON(table_group->group); 116 BUG_ON(table_group->group);
116 } 117 }
117#endif 118#endif
118 iommu_free_table(tbl, node_name); 119 iommu_tce_table_put(tbl);
119 120
120 kfree(table_group); 121 kfree(table_group);
121} 122}
@@ -550,6 +551,7 @@ static void iommu_table_setparms(struct pci_controller *phb,
550static void iommu_table_setparms_lpar(struct pci_controller *phb, 551static void iommu_table_setparms_lpar(struct pci_controller *phb,
551 struct device_node *dn, 552 struct device_node *dn,
552 struct iommu_table *tbl, 553 struct iommu_table *tbl,
554 struct iommu_table_group *table_group,
553 const __be32 *dma_window) 555 const __be32 *dma_window)
554{ 556{
555 unsigned long offset, size; 557 unsigned long offset, size;
@@ -563,6 +565,9 @@ static void iommu_table_setparms_lpar(struct pci_controller *phb,
563 tbl->it_type = TCE_PCI; 565 tbl->it_type = TCE_PCI;
564 tbl->it_offset = offset >> tbl->it_page_shift; 566 tbl->it_offset = offset >> tbl->it_page_shift;
565 tbl->it_size = size >> tbl->it_page_shift; 567 tbl->it_size = size >> tbl->it_page_shift;
568
569 table_group->tce32_start = offset;
570 table_group->tce32_size = size;
566} 571}
567 572
568struct iommu_table_ops iommu_table_pseries_ops = { 573struct iommu_table_ops iommu_table_pseries_ops = {
@@ -651,8 +656,38 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
651 pr_debug("ISA/IDE, window size is 0x%llx\n", pci->phb->dma_window_size); 656 pr_debug("ISA/IDE, window size is 0x%llx\n", pci->phb->dma_window_size);
652} 657}
653 658
659#ifdef CONFIG_IOMMU_API
660static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned
661 long *tce, enum dma_data_direction *direction)
662{
663 long rc;
664 unsigned long ioba = (unsigned long) index << tbl->it_page_shift;
665 unsigned long flags, oldtce = 0;
666 u64 proto_tce = iommu_direction_to_tce_perm(*direction);
667 unsigned long newtce = *tce | proto_tce;
668
669 spin_lock_irqsave(&tbl->large_pool.lock, flags);
670
671 rc = plpar_tce_get((u64)tbl->it_index, ioba, &oldtce);
672 if (!rc)
673 rc = plpar_tce_put((u64)tbl->it_index, ioba, newtce);
674
675 if (!rc) {
676 *direction = iommu_tce_direction(oldtce);
677 *tce = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
678 }
679
680 spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
681
682 return rc;
683}
684#endif
685
654struct iommu_table_ops iommu_table_lpar_multi_ops = { 686struct iommu_table_ops iommu_table_lpar_multi_ops = {
655 .set = tce_buildmulti_pSeriesLP, 687 .set = tce_buildmulti_pSeriesLP,
688#ifdef CONFIG_IOMMU_API
689 .exchange = tce_exchange_pseries,
690#endif
656 .clear = tce_freemulti_pSeriesLP, 691 .clear = tce_freemulti_pSeriesLP,
657 .get = tce_get_pSeriesLP 692 .get = tce_get_pSeriesLP
658}; 693};
@@ -689,7 +724,8 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
689 if (!ppci->table_group) { 724 if (!ppci->table_group) {
690 ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node); 725 ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node);
691 tbl = ppci->table_group->tables[0]; 726 tbl = ppci->table_group->tables[0];
692 iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window); 727 iommu_table_setparms_lpar(ppci->phb, pdn, tbl,
728 ppci->table_group, dma_window);
693 tbl->it_ops = &iommu_table_lpar_multi_ops; 729 tbl->it_ops = &iommu_table_lpar_multi_ops;
694 iommu_init_table(tbl, ppci->phb->node); 730 iommu_init_table(tbl, ppci->phb->node);
695 iommu_register_group(ppci->table_group, 731 iommu_register_group(ppci->table_group,
@@ -1143,7 +1179,8 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
1143 if (!pci->table_group) { 1179 if (!pci->table_group) {
1144 pci->table_group = iommu_pseries_alloc_group(pci->phb->node); 1180 pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
1145 tbl = pci->table_group->tables[0]; 1181 tbl = pci->table_group->tables[0];
1146 iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window); 1182 iommu_table_setparms_lpar(pci->phb, pdn, tbl,
1183 pci->table_group, dma_window);
1147 tbl->it_ops = &iommu_table_lpar_multi_ops; 1184 tbl->it_ops = &iommu_table_lpar_multi_ops;
1148 iommu_init_table(tbl, pci->phb->node); 1185 iommu_init_table(tbl, pci->phb->node);
1149 iommu_register_group(pci->table_group, 1186 iommu_register_group(pci->table_group,
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 8b1fe895daa3..6541d0b03e4c 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -958,3 +958,64 @@ int h_get_mpp_x(struct hvcall_mpp_x_data *mpp_x_data)
958 958
959 return rc; 959 return rc;
960} 960}
961
962static unsigned long vsid_unscramble(unsigned long vsid, int ssize)
963{
964 unsigned long protovsid;
965 unsigned long va_bits = VA_BITS;
966 unsigned long modinv, vsid_modulus;
967 unsigned long max_mod_inv, tmp_modinv;
968
969 if (!mmu_has_feature(MMU_FTR_68_BIT_VA))
970 va_bits = 65;
971
972 if (ssize == MMU_SEGSIZE_256M) {
973 modinv = VSID_MULINV_256M;
974 vsid_modulus = ((1UL << (va_bits - SID_SHIFT)) - 1);
975 } else {
976 modinv = VSID_MULINV_1T;
977 vsid_modulus = ((1UL << (va_bits - SID_SHIFT_1T)) - 1);
978 }
979
980 /*
981 * vsid outside our range.
982 */
983 if (vsid >= vsid_modulus)
984 return 0;
985
986 /*
987 * If modinv is the modular multiplicative inverse of (x % vsid_modulus)
988 * and vsid = (protovsid * x) % vsid_modulus, then we say:
989 * protovsid = (vsid * modinv) % vsid_modulus
990 */
991
992 /* Check if (vsid * modinv) overflow (63 bits) */
993 max_mod_inv = 0x7fffffffffffffffull / vsid;
994 if (modinv < max_mod_inv)
995 return (vsid * modinv) % vsid_modulus;
996
997 tmp_modinv = modinv/max_mod_inv;
998 modinv %= max_mod_inv;
999
1000 protovsid = (((vsid * max_mod_inv) % vsid_modulus) * tmp_modinv) % vsid_modulus;
1001 protovsid = (protovsid + vsid * modinv) % vsid_modulus;
1002
1003 return protovsid;
1004}
1005
1006static int __init reserve_vrma_context_id(void)
1007{
1008 unsigned long protovsid;
1009
1010 /*
1011 * Reserve context ids which map to reserved virtual addresses. For now
1012 * we only reserve the context id which maps to the VRMA VSID. We ignore
1013 * the addresses in "ibm,adjunct-virtual-addresses" because we don't
1014 * enable adjunct support via the "ibm,client-architecture-support"
1015 * interface.
1016 */
1017 protovsid = vsid_unscramble(VRMA_VSID, MMU_SEGSIZE_1T);
1018 hash__reserve_context_id(protovsid >> ESID_BITS_1T);
1019 return 0;
1020}
1021machine_device_initcall(pseries, reserve_vrma_context_id);
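
vsid_unscramble() has to compute (vsid * modinv) % vsid_modulus without 128-bit arithmetic, so when the product could exceed 63 bits it splits modinv against max_mod_inv and reduces in two steps. Below is a self-contained sketch of the same splitting trick, checked against __int128 (a GCC/clang extension); the vsid, inverse and 40-bit modulus are arbitrary example values, not the kernel's VSID constants.

#include <stdio.h>

static unsigned long mulmod_split(unsigned long vsid, unsigned long modinv,
				  unsigned long modulus)
{
	unsigned long max_mod_inv, tmp_modinv, protovsid;

	/* Largest multiplier that cannot push vsid * x past 63 bits */
	max_mod_inv = 0x7fffffffffffffffUL / vsid;
	if (modinv < max_mod_inv)
		return (vsid * modinv) % modulus;

	/* Split modinv = tmp_modinv * max_mod_inv + (modinv % max_mod_inv) */
	tmp_modinv = modinv / max_mod_inv;
	modinv %= max_mod_inv;

	protovsid = (((vsid * max_mod_inv) % modulus) * tmp_modinv) % modulus;
	return (protovsid + vsid * modinv) % modulus;
}

int main(void)
{
	unsigned long modulus = (1UL << 40) - 1;	/* example values only */
	unsigned long vsid = 0xF123456789UL;		/* must be < modulus */
	unsigned long modinv = 0x8000000321UL;

	printf("split : 0x%lx\n", mulmod_split(vsid, modinv, modulus));
	printf("int128: 0x%lx\n",
	       (unsigned long)(((unsigned __int128)vsid * modinv) % modulus));
	return 0;
}

Both lines print the same value: vsid*modinv = vsid*max_mod_inv*tmp_modinv + vsid*(modinv % max_mod_inv), and with a modulus this small every partial product stays below 2^63.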
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index 904a677208d1..bb70b26334f0 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -386,6 +386,10 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
386 } 386 }
387 fwnmi_release_errinfo(); 387 fwnmi_release_errinfo();
388 } 388 }
389
390 if (smp_handle_nmi_ipi(regs))
391 return 1;
392
389 return 0; /* need to perform reset */ 393 return 0; /* need to perform reset */
390} 394}
391 395
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index b4d362ed03a1..b5d86426e97b 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -87,6 +87,10 @@ static void pSeries_show_cpuinfo(struct seq_file *m)
87 model = of_get_property(root, "model", NULL); 87 model = of_get_property(root, "model", NULL);
88 seq_printf(m, "machine\t\t: CHRP %s\n", model); 88 seq_printf(m, "machine\t\t: CHRP %s\n", model);
89 of_node_put(root); 89 of_node_put(root);
90 if (radix_enabled())
91 seq_printf(m, "MMU\t\t: Radix\n");
92 else
93 seq_printf(m, "MMU\t\t: Hash\n");
90} 94}
91 95
92/* Initialize firmware assisted non-maskable interrupts if 96/* Initialize firmware assisted non-maskable interrupts if
diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c
index f6f83aeccaaa..52ca6b311d44 100644
--- a/arch/powerpc/platforms/pseries/smp.c
+++ b/arch/powerpc/platforms/pseries/smp.c
@@ -55,11 +55,6 @@
55 */ 55 */
56static cpumask_var_t of_spin_mask; 56static cpumask_var_t of_spin_mask;
57 57
58/*
59 * If we multiplex IPI mechanisms, store the appropriate XICS IPI mechanism here
60 */
61static void (*xics_cause_ipi)(int cpu, unsigned long data);
62
63/* Query where a cpu is now. Return codes #defined in plpar_wrappers.h */ 58/* Query where a cpu is now. Return codes #defined in plpar_wrappers.h */
64int smp_query_cpu_stopped(unsigned int pcpu) 59int smp_query_cpu_stopped(unsigned int pcpu)
65{ 60{
@@ -143,8 +138,6 @@ static void smp_setup_cpu(int cpu)
143{ 138{
144 if (cpu != boot_cpuid) 139 if (cpu != boot_cpuid)
145 xics_setup_cpu(); 140 xics_setup_cpu();
146 if (cpu_has_feature(CPU_FTR_DBELL))
147 doorbell_setup_this_cpu();
148 141
149 if (firmware_has_feature(FW_FEATURE_SPLPAR)) 142 if (firmware_has_feature(FW_FEATURE_SPLPAR))
150 vpa_init(cpu); 143 vpa_init(cpu);
@@ -187,28 +180,50 @@ static int smp_pSeries_kick_cpu(int nr)
187 return 0; 180 return 0;
188} 181}
189 182
190/* Only used on systems that support multiple IPI mechanisms */ 183static void smp_pseries_cause_ipi(int cpu)
191static void pSeries_cause_ipi_mux(int cpu, unsigned long data)
192{ 184{
193 if (cpumask_test_cpu(cpu, cpu_sibling_mask(smp_processor_id()))) 185 /* POWER9 should not use this handler */
194 doorbell_cause_ipi(cpu, data); 186 if (doorbell_try_core_ipi(cpu))
195 else 187 return;
196 xics_cause_ipi(cpu, data); 188
189 icp_ops->cause_ipi(cpu);
190}
191
192static int pseries_cause_nmi_ipi(int cpu)
193{
194 int hwcpu;
195
196 if (cpu == NMI_IPI_ALL_OTHERS) {
197 hwcpu = H_SIGNAL_SYS_RESET_ALL_OTHERS;
198 } else {
199 if (cpu < 0) {
200 WARN_ONCE(true, "incorrect cpu parameter %d", cpu);
201 return 0;
202 }
203
204 hwcpu = get_hard_smp_processor_id(cpu);
205 }
206
207 if (plapr_signal_sys_reset(hwcpu) == H_SUCCESS)
208 return 1;
209
210 return 0;
197} 211}
198 212
199static __init void pSeries_smp_probe(void) 213static __init void pSeries_smp_probe(void)
200{ 214{
201 xics_smp_probe(); 215 xics_smp_probe();
202 216
203 if (cpu_has_feature(CPU_FTR_DBELL)) { 217 if (cpu_has_feature(CPU_FTR_DBELL))
204 xics_cause_ipi = smp_ops->cause_ipi; 218 smp_ops->cause_ipi = smp_pseries_cause_ipi;
205 smp_ops->cause_ipi = pSeries_cause_ipi_mux; 219 else
206 } 220 smp_ops->cause_ipi = icp_ops->cause_ipi;
207} 221}
208 222
209static struct smp_ops_t pseries_smp_ops = { 223static struct smp_ops_t pseries_smp_ops = {
210 .message_pass = NULL, /* Use smp_muxed_ipi_message_pass */ 224 .message_pass = NULL, /* Use smp_muxed_ipi_message_pass */
211 .cause_ipi = NULL, /* Filled at runtime by pSeries_smp_probe() */ 225 .cause_ipi = NULL, /* Filled at runtime by pSeries_smp_probe() */
226 .cause_nmi_ipi = pseries_cause_nmi_ipi,
212 .probe = pSeries_smp_probe, 227 .probe = pSeries_smp_probe,
213 .kick_cpu = smp_pSeries_kick_cpu, 228 .kick_cpu = smp_pSeries_kick_cpu,
214 .setup_cpu = smp_setup_cpu, 229 .setup_cpu = smp_setup_cpu,
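
Both the pseries and powernv smp_ops above leave message_pass NULL so the generic smp_muxed_ipi_message_pass() path is used: the sender records which message type is pending for the target and then calls cause_ipi() once, and the receiver demuxes. A userspace sketch of that split; names are illustrative and a plain (non-atomic) bitmask stands in for the kernel's per-CPU message word.

#include <stdio.h>

enum { MSG_CALL_FUNCTION, MSG_RESCHEDULE, MSG_TICK_BROADCAST, MSG_NMI_IPI, MSG_COUNT };

static unsigned long pending[4];	/* per-"cpu" pending-message bits */

/* In the kernel this callback is what probe() fills in at boot */
static void cause_ipi(int cpu)
{
	printf("ring doorbell/ICP for cpu %d\n", cpu);
}

static void muxed_ipi_message_pass(int cpu, int msg)
{
	pending[cpu] |= 1UL << msg;	/* record the message... */
	cause_ipi(cpu);			/* ...then send one physical IPI */
}

static void ipi_demux(int cpu)		/* runs on the target cpu */
{
	for (int msg = 0; msg < MSG_COUNT; msg++)
		if (pending[cpu] & (1UL << msg))
			printf("cpu %d handles message %d\n", cpu, msg);
	pending[cpu] = 0;
}

int main(void)
{
	muxed_ipi_message_pass(1, MSG_RESCHEDULE);
	muxed_ipi_message_pass(1, MSG_NMI_IPI);
	ipi_demux(1);
	return 0;
}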
diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c
index 720493932486..28b09fd797ec 100644
--- a/arch/powerpc/platforms/pseries/vio.c
+++ b/arch/powerpc/platforms/pseries/vio.c
@@ -1318,7 +1318,7 @@ static void vio_dev_release(struct device *dev)
1318 struct iommu_table *tbl = get_iommu_table_base(dev); 1318 struct iommu_table *tbl = get_iommu_table_base(dev);
1319 1319
1320 if (tbl) 1320 if (tbl)
1321 iommu_free_table(tbl, of_node_full_name(dev->of_node)); 1321 iommu_tce_table_put(tbl);
1322 of_node_put(dev->of_node); 1322 of_node_put(dev->of_node);
1323 kfree(to_vio_dev(dev)); 1323 kfree(to_vio_dev(dev));
1324} 1324}
diff --git a/arch/powerpc/sysdev/Kconfig b/arch/powerpc/sysdev/Kconfig
index 52dc165c0efb..caf882e749dc 100644
--- a/arch/powerpc/sysdev/Kconfig
+++ b/arch/powerpc/sysdev/Kconfig
@@ -28,6 +28,7 @@ config PPC_MSI_BITMAP
28 default y if PPC_POWERNV 28 default y if PPC_POWERNV
29 29
30source "arch/powerpc/sysdev/xics/Kconfig" 30source "arch/powerpc/sysdev/xics/Kconfig"
31source "arch/powerpc/sysdev/xive/Kconfig"
31 32
32config PPC_SCOM 33config PPC_SCOM
33 bool 34 bool
diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile
index a254824719f1..c0ae11d4f62f 100644
--- a/arch/powerpc/sysdev/Makefile
+++ b/arch/powerpc/sysdev/Makefile
@@ -71,5 +71,6 @@ obj-$(CONFIG_PPC_EARLY_DEBUG_MEMCONS) += udbg_memcons.o
71subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror 71subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
72 72
73obj-$(CONFIG_PPC_XICS) += xics/ 73obj-$(CONFIG_PPC_XICS) += xics/
74obj-$(CONFIG_PPC_XIVE) += xive/
74 75
75obj-$(CONFIG_GE_FPGA) += ge/ 76obj-$(CONFIG_GE_FPGA) += ge/
diff --git a/arch/powerpc/sysdev/scom.c b/arch/powerpc/sysdev/scom.c
index d0e9f178a324..76ea32c1b664 100644
--- a/arch/powerpc/sysdev/scom.c
+++ b/arch/powerpc/sysdev/scom.c
@@ -19,10 +19,9 @@
19 */ 19 */
20 20
21#include <linux/kernel.h> 21#include <linux/kernel.h>
22#include <linux/debugfs.h>
23#include <linux/slab.h> 22#include <linux/slab.h>
24#include <linux/export.h> 23#include <linux/export.h>
25#include <asm/debug.h> 24#include <asm/debugfs.h>
26#include <asm/prom.h> 25#include <asm/prom.h>
27#include <asm/scom.h> 26#include <asm/scom.h>
28#include <linux/uaccess.h> 27#include <linux/uaccess.h>
diff --git a/arch/powerpc/sysdev/xics/icp-hv.c b/arch/powerpc/sysdev/xics/icp-hv.c
index e7fa26c4ff73..bbc839a98c41 100644
--- a/arch/powerpc/sysdev/xics/icp-hv.c
+++ b/arch/powerpc/sysdev/xics/icp-hv.c
@@ -138,7 +138,7 @@ static void icp_hv_set_cpu_priority(unsigned char cppr)
138 138
139#ifdef CONFIG_SMP 139#ifdef CONFIG_SMP
140 140
141static void icp_hv_cause_ipi(int cpu, unsigned long data) 141static void icp_hv_cause_ipi(int cpu)
142{ 142{
143 icp_hv_set_qirr(cpu, IPI_PRIORITY); 143 icp_hv_set_qirr(cpu, IPI_PRIORITY);
144} 144}
diff --git a/arch/powerpc/sysdev/xics/icp-native.c b/arch/powerpc/sysdev/xics/icp-native.c
index 8a6a043e239b..2bfb9968d562 100644
--- a/arch/powerpc/sysdev/xics/icp-native.c
+++ b/arch/powerpc/sysdev/xics/icp-native.c
@@ -143,19 +143,9 @@ static unsigned int icp_native_get_irq(void)
143 143
144#ifdef CONFIG_SMP 144#ifdef CONFIG_SMP
145 145
146static void icp_native_cause_ipi(int cpu, unsigned long data) 146static void icp_native_cause_ipi(int cpu)
147{ 147{
148 kvmppc_set_host_ipi(cpu, 1); 148 kvmppc_set_host_ipi(cpu, 1);
149#ifdef CONFIG_PPC_DOORBELL
150 if (cpu_has_feature(CPU_FTR_DBELL)) {
151 if (cpumask_test_cpu(cpu, cpu_sibling_mask(get_cpu()))) {
152 doorbell_cause_ipi(cpu, data);
153 put_cpu();
154 return;
155 }
156 put_cpu();
157 }
158#endif
159 icp_native_set_qirr(cpu, IPI_PRIORITY); 149 icp_native_set_qirr(cpu, IPI_PRIORITY);
160} 150}
161 151
@@ -168,15 +158,15 @@ void icp_native_cause_ipi_rm(int cpu)
168 * Need the physical address of the XICS to be 158 * Need the physical address of the XICS to be
169 * previously saved in kvm_hstate in the paca. 159 * previously saved in kvm_hstate in the paca.
170 */ 160 */
171 unsigned long xics_phys; 161 void __iomem *xics_phys;
172 162
173 /* 163 /*
174 * Just like the cause_ipi functions, it is required to 164 * Just like the cause_ipi functions, it is required to
175 * include a full barrier (out8 includes a sync) before 165 * include a full barrier before causing the IPI.
176 * causing the IPI.
177 */ 166 */
178 xics_phys = paca[cpu].kvm_hstate.xics_phys; 167 xics_phys = paca[cpu].kvm_hstate.xics_phys;
179 out_rm8((u8 *)(xics_phys + XICS_MFRR), IPI_PRIORITY); 168 mb();
169 __raw_rm_writeb(IPI_PRIORITY, xics_phys + XICS_MFRR);
180} 170}
181#endif 171#endif
182 172
diff --git a/arch/powerpc/sysdev/xics/icp-opal.c b/arch/powerpc/sysdev/xics/icp-opal.c
index b53f80f0b4d8..c71d2ea42627 100644
--- a/arch/powerpc/sysdev/xics/icp-opal.c
+++ b/arch/powerpc/sysdev/xics/icp-opal.c
@@ -126,7 +126,7 @@ static void icp_opal_eoi(struct irq_data *d)
126 126
127#ifdef CONFIG_SMP 127#ifdef CONFIG_SMP
128 128
129static void icp_opal_cause_ipi(int cpu, unsigned long data) 129static void icp_opal_cause_ipi(int cpu)
130{ 130{
131 int hw_cpu = get_hard_smp_processor_id(cpu); 131 int hw_cpu = get_hard_smp_processor_id(cpu);
132 132
diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c
index 23efe4e42172..ffe138b8b9dc 100644
--- a/arch/powerpc/sysdev/xics/xics-common.c
+++ b/arch/powerpc/sysdev/xics/xics-common.c
@@ -143,11 +143,11 @@ static void xics_request_ipi(void)
143 143
144void __init xics_smp_probe(void) 144void __init xics_smp_probe(void)
145{ 145{
146 /* Setup cause_ipi callback based on which ICP is used */
147 smp_ops->cause_ipi = icp_ops->cause_ipi;
148
149 /* Register all the IPIs */ 146 /* Register all the IPIs */
150 xics_request_ipi(); 147 xics_request_ipi();
148
149 /* Setup cause_ipi callback based on which ICP is used */
150 smp_ops->cause_ipi = icp_ops->cause_ipi;
151} 151}
152 152
153#endif /* CONFIG_SMP */ 153#endif /* CONFIG_SMP */
diff --git a/arch/powerpc/sysdev/xive/Kconfig b/arch/powerpc/sysdev/xive/Kconfig
new file mode 100644
index 000000000000..12ccd7373d2f
--- /dev/null
+++ b/arch/powerpc/sysdev/xive/Kconfig
@@ -0,0 +1,11 @@
1config PPC_XIVE
2 bool
3 default n
4 select PPC_SMP_MUXED_IPI
5 select HARDIRQS_SW_RESEND
6
7config PPC_XIVE_NATIVE
8 bool
9 default n
10 select PPC_XIVE
11 depends on PPC_POWERNV
diff --git a/arch/powerpc/sysdev/xive/Makefile b/arch/powerpc/sysdev/xive/Makefile
new file mode 100644
index 000000000000..3fab303fc169
--- /dev/null
+++ b/arch/powerpc/sysdev/xive/Makefile
@@ -0,0 +1,4 @@
1subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
2
3obj-y += common.o
4obj-$(CONFIG_PPC_XIVE_NATIVE) += native.o
diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
new file mode 100644
index 000000000000..6a98efb14264
--- /dev/null
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -0,0 +1,1302 @@
1/*
2 * Copyright 2016,2017 IBM Corporation.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 */
9
10#define pr_fmt(fmt) "xive: " fmt
11
12#include <linux/types.h>
13#include <linux/threads.h>
14#include <linux/kernel.h>
15#include <linux/irq.h>
16#include <linux/debugfs.h>
17#include <linux/smp.h>
18#include <linux/interrupt.h>
19#include <linux/seq_file.h>
20#include <linux/init.h>
21#include <linux/cpu.h>
22#include <linux/of.h>
23#include <linux/slab.h>
24#include <linux/spinlock.h>
25#include <linux/msi.h>
26
27#include <asm/prom.h>
28#include <asm/io.h>
29#include <asm/smp.h>
30#include <asm/machdep.h>
31#include <asm/irq.h>
32#include <asm/errno.h>
33#include <asm/xive.h>
34#include <asm/xive-regs.h>
35#include <asm/xmon.h>
36
37#include "xive-internal.h"
38
39#undef DEBUG_FLUSH
40#undef DEBUG_ALL
41
42#ifdef DEBUG_ALL
43#define DBG_VERBOSE(fmt...) pr_devel(fmt)
44#else
45#define DBG_VERBOSE(fmt...) do { } while(0)
46#endif
47
48bool __xive_enabled;
49bool xive_cmdline_disabled;
50
51/* We use only one priority for now */
52static u8 xive_irq_priority;
53
54/* TIMA */
55void __iomem *xive_tima;
56u32 xive_tima_offset;
57
58/* Backend ops */
59static const struct xive_ops *xive_ops;
60
61/* Our global interrupt domain */
62static struct irq_domain *xive_irq_domain;
63
64#ifdef CONFIG_SMP
65/* The IPIs all use the same logical irq number */
66static u32 xive_ipi_irq;
67#endif
68
69/* Xive state for each CPU */
70static DEFINE_PER_CPU(struct xive_cpu *, xive_cpu);
71
72/*
73 * A "disabled" interrupt should never fire; to catch problems
74 * we set its logical number to this
75 */
76#define XIVE_BAD_IRQ 0x7fffffff
77#define XIVE_MAX_IRQ (XIVE_BAD_IRQ - 1)
78
79/* An invalid CPU target */
80#define XIVE_INVALID_TARGET (-1)
81
82/*
83 * Read the next entry in a queue, return its content if it's valid
84 * or 0 if there is no new entry.
85 *
86 * The queue pointer is moved forward unless "just_peek" is set
87 */
88static u32 xive_read_eq(struct xive_q *q, bool just_peek)
89{
90 u32 cur;
91
92 if (!q->qpage)
93 return 0;
94 cur = be32_to_cpup(q->qpage + q->idx);
95
96 /* Check valid bit (31) vs current toggle polarity */
97 if ((cur >> 31) == q->toggle)
98 return 0;
99
100 /* If consuming from the queue ... */
101 if (!just_peek) {
102 /* Next entry */
103 q->idx = (q->idx + 1) & q->msk;
104
105 /* Wrap around: flip valid toggle */
106 if (q->idx == 0)
107 q->toggle ^= 1;
108 }
109 /* Mask out the valid bit (31) */
110 return cur & 0x7fffffff;
111}
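
The generation/toggle scheme used by xive_read_eq() can be modelled with a tiny standalone ring: the producer stamps each entry's top bit with a generation it flips on wrap, and the consumer treats an entry as new only while that bit differs from its own toggle. Queue size and payloads below are illustrative; the real event queue is written by hardware.

#include <stdint.h>
#include <stdio.h>

#define QSHIFT	3
#define QSIZE	(1u << QSHIFT)
#define QMASK	(QSIZE - 1)

static uint32_t qpage[QSIZE];
static unsigned int prod_idx, cons_idx;
static uint32_t prod_gen = 1, cons_toggle;	/* start out of phase */

static void produce(uint32_t data)
{
	qpage[prod_idx] = (prod_gen << 31) | (data & 0x7fffffff);
	prod_idx = (prod_idx + 1) & QMASK;
	if (prod_idx == 0)			/* wrapped: flip generation */
		prod_gen ^= 1;
}

static uint32_t consume(void)			/* mirrors xive_read_eq(..., false) */
{
	uint32_t cur = qpage[cons_idx];

	if ((cur >> 31) == cons_toggle)		/* same polarity: nothing new */
		return 0;
	cons_idx = (cons_idx + 1) & QMASK;
	if (cons_idx == 0)			/* wrapped: flip expected polarity */
		cons_toggle ^= 1;
	return cur & 0x7fffffff;
}

int main(void)
{
	uint32_t i, v;

	for (i = 1; i <= 6; i++)
		produce(i);
	while ((v = consume()) != 0)
		printf("got %u\n", v);		/* 1..6 */
	for (i = 7; i <= 10; i++)		/* producer wraps in here */
		produce(i);
	while ((v = consume()) != 0)
		printf("got %u\n", v);		/* 7..10, read across the wrap */
	return 0;
}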
112
113/*
114 * Scans all the queues that may have interrupts in them
115 * (based on "pending_prio") in priority order until an
116 * interrupt is found or all the queues are empty.
117 *
118 * Then updates the CPPR (Current Processor Priority
119 * Register) based on the most favored interrupt found
120 * (0xff if none) and returns what was found (0 if none).
121 *
122 * If just_peek is set, return the most favored pending
123 * interrupt if any but don't update the queue pointers.
124 *
125 * Note: This function can operate generically on any number
126 * of queues (up to 8). The current implementation of the XIVE
127 * driver only uses a single queue however.
128 *
129 * Note2: This will also "flush" the "pending_count" of a queue
130 * into the "count" when that queue is observed to be empty.
131 * This is used to keep track of the number of interrupts
132 * targeting a queue. When an interrupt is moved away from
133 * a queue, we only decrement that queue count once the queue
134 * has been observed empty to avoid races.
135 */
136static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek)
137{
138 u32 irq = 0;
139 u8 prio;
140
141 /* Find highest pending priority */
142 while (xc->pending_prio != 0) {
143 struct xive_q *q;
144
145 prio = ffs(xc->pending_prio) - 1;
146 DBG_VERBOSE("scan_irq: trying prio %d\n", prio);
147
148 /* Try to fetch */
149 irq = xive_read_eq(&xc->queue[prio], just_peek);
150
151 /* Found something ? That's it */
152 if (irq)
153 break;
154
155 /* Clear pending bits */
156 xc->pending_prio &= ~(1 << prio);
157
158 /*
159 * Check if the queue count needs adjusting due to
160 * interrupts being moved away. See description of
161 * xive_dec_target_count()
162 */
163 q = &xc->queue[prio];
164 if (atomic_read(&q->pending_count)) {
165 int p = atomic_xchg(&q->pending_count, 0);
166 if (p) {
167 WARN_ON(p > atomic_read(&q->count));
168 atomic_sub(p, &q->count);
169 }
170 }
171 }
172
173 /* If nothing was found, set CPPR to 0xff */
174 if (irq == 0)
175 prio = 0xff;
176
177 /* Update HW CPPR to match if necessary */
178 if (prio != xc->cppr) {
179 DBG_VERBOSE("scan_irq: adjusting CPPR to %d\n", prio);
180 xc->cppr = prio;
181 out_8(xive_tima + xive_tima_offset + TM_CPPR, prio);
182 }
183
184 return irq;
185}
186
187/*
188 * This is used to perform the magic loads from an ESB
189 * described in xive.h
190 */
191static u8 xive_poke_esb(struct xive_irq_data *xd, u32 offset)
192{
193 u64 val;
194
195 /* Handle HW errata */
196 if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG)
197 offset |= offset << 4;
198
199 val = in_be64(xd->eoi_mmio + offset);
200
201 return (u8)val;
202}
203
204#ifdef CONFIG_XMON
205static void xive_dump_eq(const char *name, struct xive_q *q)
206{
207 u32 i0, i1, idx;
208
209 if (!q->qpage)
210 return;
211 idx = q->idx;
212 i0 = be32_to_cpup(q->qpage + idx);
213 idx = (idx + 1) & q->msk;
214 i1 = be32_to_cpup(q->qpage + idx);
215 xmon_printf(" %s Q T=%d %08x %08x ...\n", name,
216 q->toggle, i0, i1);
217}
218
219void xmon_xive_do_dump(int cpu)
220{
221 struct xive_cpu *xc = per_cpu(xive_cpu, cpu);
222
223 xmon_printf("XIVE state for CPU %d:\n", cpu);
224 xmon_printf(" pp=%02x cppr=%02x\n", xc->pending_prio, xc->cppr);
225 xive_dump_eq("IRQ", &xc->queue[xive_irq_priority]);
226#ifdef CONFIG_SMP
227 {
228 u64 val = xive_poke_esb(&xc->ipi_data, XIVE_ESB_GET);
229 xmon_printf(" IPI state: %x:%c%c\n", xc->hw_ipi,
230 val & XIVE_ESB_VAL_P ? 'P' : 'p',
231 val & XIVE_ESB_VAL_Q ? 'Q' : 'q');
232 }
233#endif
234}
235#endif /* CONFIG_XMON */
236
237static unsigned int xive_get_irq(void)
238{
239 struct xive_cpu *xc = __this_cpu_read(xive_cpu);
240 u32 irq;
241
242 /*
243 * This can be called either as a result of a HW interrupt or
244 * as a "replay" because EOI decided there was still something
245 * in one of the queues.
246 *
247 * First we perform an ACK cycle in order to update our mask
248 * of pending priorities. This will also have the effect of
249 * updating the CPPR to the most favored pending interrupts.
250 *
251 * In the future, if we have a way to differentiate a first
252 * entry (on HW interrupt) from a replay triggered by EOI,
253 * we could skip this on replays unless the soft-mask tells us
254 * that a new HW interrupt occurred.
255 */
256 xive_ops->update_pending(xc);
257
258 DBG_VERBOSE("get_irq: pending=%02x\n", xc->pending_prio);
259
260 /* Scan our queue(s) for interrupts */
261 irq = xive_scan_interrupts(xc, false);
262
263 DBG_VERBOSE("get_irq: got irq 0x%x, new pending=0x%02x\n",
264 irq, xc->pending_prio);
265
266 /* Return pending interrupt if any */
267 if (irq == XIVE_BAD_IRQ)
268 return 0;
269 return irq;
270}
271
272/*
273 * After EOI'ing an interrupt, we need to re-check the queue
274 * to see if another interrupt is pending since multiple
275 * interrupts can coalesce into a single notification to the
276 * CPU.
277 *
278 * If we find that there is indeed more in there, we call
279 * force_external_irq_replay() to make Linux synthesize an
280 * external interrupt on the next call to local_irq_restore().
281 */
282static void xive_do_queue_eoi(struct xive_cpu *xc)
283{
284 if (xive_scan_interrupts(xc, true) != 0) {
285 DBG_VERBOSE("eoi: pending=0x%02x\n", xc->pending_prio);
286 force_external_irq_replay();
287 }
288}
289
290/*
291 * EOI an interrupt at the source. There are several methods
292 * to do this depending on the HW version and source type
293 */
294void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd)
295{
296 /* If the XIVE supports the new "store EOI" facility, use it */
297 if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI)
298 out_be64(xd->eoi_mmio, 0);
299 else if (hw_irq && xd->flags & XIVE_IRQ_FLAG_EOI_FW) {
300 /*
301 * The FW told us to call it. This happens for some
302 * interrupt sources that need additional HW whacking
303 * beyond the ESB manipulation. For example LPC interrupts
304 * on P9 DD1.0 need a latch to be cleared in the LPC bridge
305 * itself. The Firmware will take care of it.
306 */
307 if (WARN_ON_ONCE(!xive_ops->eoi))
308 return;
309 xive_ops->eoi(hw_irq);
310 } else {
311 u8 eoi_val;
312
313 /*
314 * Otherwise for EOI, we use the special MMIO that does
315 * a clear of both P and Q and returns the old Q,
316 * except for LSIs where we use the "EOI cycle" special
317 * load.
318 *
319 * This allows us to then do a re-trigger if Q was set
320 * rather than synthesizing an interrupt in software
321 *
322 * For LSIs, using the HW EOI cycle works around a problem
323 * on P9 DD1 PHBs where the other ESB accesses don't work
324 * properly.
325 */
326 if (xd->flags & XIVE_IRQ_FLAG_LSI)
327 in_be64(xd->eoi_mmio);
328 else {
329 eoi_val = xive_poke_esb(xd, XIVE_ESB_SET_PQ_00);
330 DBG_VERBOSE("eoi_val=%x\n", eoi_val);
331
332 /* Re-trigger if needed */
333 if ((eoi_val & XIVE_ESB_VAL_Q) && xd->trig_mmio)
334 out_be64(xd->trig_mmio, 0);
335 }
336 }
337}
338
339/* irq_chip eoi callback */
340static void xive_irq_eoi(struct irq_data *d)
341{
342 struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
343 struct xive_cpu *xc = __this_cpu_read(xive_cpu);
344
345 DBG_VERBOSE("eoi_irq: irq=%d [0x%lx] pending=%02x\n",
346 d->irq, irqd_to_hwirq(d), xc->pending_prio);
347
348 /* EOI the source if it hasn't been disabled */
349 if (!irqd_irq_disabled(d))
350 xive_do_source_eoi(irqd_to_hwirq(d), xd);
351
352 /*
353 * Clear saved_p to indicate that it's no longer occupying
354 * a queue slot on the target queue
355 */
356 xd->saved_p = false;
357
358 /* Check for more work in the queue */
359 xive_do_queue_eoi(xc);
360}
361
362/*
363 * Helper used to mask and unmask an interrupt source. This
364 * is only called for normal interrupts that do not require
365 * masking/unmasking via firmware.
366 */
367static void xive_do_source_set_mask(struct xive_irq_data *xd,
368 bool mask)
369{
370 u64 val;
371
372 /*
373 * If the interrupt had P set, it may be in a queue.
374 *
375 * We need to make sure we don't re-enable it until it
376 * has been fetched from that queue and EOId. We keep
377 * a copy of that P state and use it to restore the
378 * ESB accordingly on unmask.
379 */
380 if (mask) {
381 val = xive_poke_esb(xd, XIVE_ESB_SET_PQ_01);
382 xd->saved_p = !!(val & XIVE_ESB_VAL_P);
383 } else if (xd->saved_p)
384 xive_poke_esb(xd, XIVE_ESB_SET_PQ_10);
385 else
386 xive_poke_esb(xd, XIVE_ESB_SET_PQ_00);
387}
388
389/*
390 * Try to choose "cpu" as a new interrupt target. Increments
391 * the queue accounting for that target if it's not already
392 * full.
393 */
394static bool xive_try_pick_target(int cpu)
395{
396 struct xive_cpu *xc = per_cpu(xive_cpu, cpu);
397 struct xive_q *q = &xc->queue[xive_irq_priority];
398 int max;
399
400 /*
401 * Calculate max number of interrupts in that queue.
402 *
403 * We leave a gap of 1 just in case...
404 */
405 max = (q->msk + 1) - 1;
406 return !!atomic_add_unless(&q->count, 1, max);
407}
408
409/*
410 * Un-account an interrupt for a target CPU. We don't directly
411 * decrement q->count since the interrupt might still be present
412 * in the queue.
413 *
414 * Instead increment a separate counter "pending_count" which
415 * will be subtracted from "count" later when that CPU observes
416 * the queue to be empty.
417 */
418static void xive_dec_target_count(int cpu)
419{
420 struct xive_cpu *xc = per_cpu(xive_cpu, cpu);
421 struct xive_q *q = &xc->queue[xive_irq_priority];
422
423 if (unlikely(WARN_ON(cpu < 0 || !xc))) {
424 pr_err("%s: cpu=%d xc=%p\n", __func__, cpu, xc);
425 return;
426 }
427
428 /*
429 * We increment the "pending count" which will be used
430 * to decrement the target queue count whenever it's next
431 * processed and found empty. This ensures that we don't
432 * decrement while we still have the interrupt there
433 * occupying a slot.
434 */
435 atomic_inc(&q->pending_count);
436}
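
The deferred accounting described above (and flushed when xive_scan_interrupts() finds the queue empty) can be sketched standalone with C11 atomics; the function names are illustrative, not the kernel's.

#include <stdatomic.h>
#include <stdio.h>

static atomic_int count;		/* interrupts believed to target this queue */
static atomic_int pending_count;	/* targets removed but maybe still queued */

static void target_added(void)		/* cf. xive_try_pick_target() */
{
	atomic_fetch_add(&count, 1);
}

static void target_removed(void)	/* cf. xive_dec_target_count() */
{
	atomic_fetch_add(&pending_count, 1);
}

static void queue_seen_empty(void)	/* cf. the flush in xive_scan_interrupts() */
{
	int p = atomic_exchange(&pending_count, 0);

	if (p)
		atomic_fetch_sub(&count, p);
}

int main(void)
{
	target_added();
	target_added();
	target_removed();				/* not subtracted yet... */
	printf("count=%d\n", atomic_load(&count));	/* 2 */
	queue_seen_empty();				/* ...only once the queue drained */
	printf("count=%d\n", atomic_load(&count));	/* 1 */
	return 0;
}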
437
438/* Find a tentative CPU target in a CPU mask */
439static int xive_find_target_in_mask(const struct cpumask *mask,
440 unsigned int fuzz)
441{
442 int cpu, first, num, i;
443
444 /* Pick up a starting point CPU in the mask based on fuzz */
445 num = cpumask_weight(mask);
446 first = fuzz % num;
447
448 /* Locate it */
449 cpu = cpumask_first(mask);
450 for (i = 0; i < first && cpu < nr_cpu_ids; i++)
451 cpu = cpumask_next(cpu, mask);
452
453 /* Sanity check */
454 if (WARN_ON(cpu >= nr_cpu_ids))
455 cpu = cpumask_first(cpu_online_mask);
456
457 /* Remember first one to handle wrap-around */
458 first = cpu;
459
460 /*
461 * Now go through the entire mask until we find a valid
462 * target.
463 */
464 for (;;) {
465 /*
466 * We re-check online as the fallback case passes us
467 * an untested affinity mask
468 */
469 if (cpu_online(cpu) && xive_try_pick_target(cpu))
470 return cpu;
471 cpu = cpumask_next(cpu, mask);
472 if (cpu == first)
473 break;
474 /* Wrap around */
475 if (cpu >= nr_cpu_ids)
476 cpu = cpumask_first(mask);
477 }
478 return -1;
479}
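
The fuzz-based spreading above amounts to: start at the (fuzz % weight)-th CPU of the mask, then walk the mask with wrap-around until a usable CPU is found. A small standalone sketch using a plain bitmask and the GCC popcount builtin; usable() stands in for the online and queue-not-full checks.

#include <stdio.h>

#define NCPUS 8

static int usable(int cpu)		/* stands in for online + queue-not-full */
{
	return cpu != 3;		/* pretend CPU 3 is full */
}

static int find_target_in_mask(unsigned int mask, unsigned int fuzz)
{
	int weight = __builtin_popcount(mask);
	int skip = fuzz % weight, first, cpu;

	/* Locate the starting point: the skip'th set bit of the mask */
	for (cpu = 0; cpu < NCPUS; cpu++)
		if ((mask & (1u << cpu)) && skip-- == 0)
			break;
	first = cpu;

	do {				/* walk the mask, wrapping around once */
		if ((mask & (1u << cpu)) && usable(cpu))
			return cpu;
		cpu = (cpu + 1) % NCPUS;
	} while (cpu != first);

	return -1;
}

int main(void)
{
	unsigned int mask = 0x5A;	/* CPUs 1, 3, 4 and 6 */

	for (unsigned int fuzz = 0; fuzz < 4; fuzz++)
		printf("fuzz=%u -> cpu %d\n", fuzz, find_target_in_mask(mask, fuzz));
	return 0;
}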
480
481/*
482 * Pick a target CPU for an interrupt. This is done at
483 * startup or if the affinity is changed in a way that
484 * invalidates the current target.
485 */
486static int xive_pick_irq_target(struct irq_data *d,
487 const struct cpumask *affinity)
488{
489 static unsigned int fuzz;
490 struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
491 cpumask_var_t mask;
492 int cpu = -1;
493
494 /*
495 * If we have chip IDs, first we try to build a mask of
496 * CPUs matching the source chip and find a target in there
497 */
498 if (xd->src_chip != XIVE_INVALID_CHIP_ID &&
499 zalloc_cpumask_var(&mask, GFP_ATOMIC)) {
500 /* Build a mask of matching chip IDs */
501 for_each_cpu_and(cpu, affinity, cpu_online_mask) {
502 struct xive_cpu *xc = per_cpu(xive_cpu, cpu);
503 if (xc->chip_id == xd->src_chip)
504 cpumask_set_cpu(cpu, mask);
505 }
506 /* Try to find a target */
507 if (cpumask_empty(mask))
508 cpu = -1;
509 else
510 cpu = xive_find_target_in_mask(mask, fuzz++);
511 free_cpumask_var(mask);
512 if (cpu >= 0)
513 return cpu;
514 fuzz--;
515 }
516
517 /* No chip IDs, fallback to using the affinity mask */
518 return xive_find_target_in_mask(affinity, fuzz++);
519}
520
521static unsigned int xive_irq_startup(struct irq_data *d)
522{
523 struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
524 unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
525 int target, rc;
526
527 pr_devel("xive_irq_startup: irq %d [0x%x] data @%p\n",
528 d->irq, hw_irq, d);
529
530#ifdef CONFIG_PCI_MSI
531 /*
532 * The generic MSI code returns with the interrupt disabled on the
533 * card, using the MSI mask bits. Firmware doesn't appear to unmask
534 * at that level, so we do it here by hand.
535 */
536 if (irq_data_get_msi_desc(d))
537 pci_msi_unmask_irq(d);
538#endif
539
540 /* Pick a target */
541 target = xive_pick_irq_target(d, irq_data_get_affinity_mask(d));
542 if (target == XIVE_INVALID_TARGET) {
543 /* Try again breaking affinity */
544 target = xive_pick_irq_target(d, cpu_online_mask);
545 if (target == XIVE_INVALID_TARGET)
546 return -ENXIO;
547 pr_warn("irq %d started with broken affinity\n", d->irq);
548 }
549
550 /* Sanity check */
551 if (WARN_ON(target == XIVE_INVALID_TARGET ||
552 target >= nr_cpu_ids))
553 target = smp_processor_id();
554
555 xd->target = target;
556
557 /*
558 * Configure the logical number to be the Linux IRQ number
559 * and set the target queue
560 */
561 rc = xive_ops->configure_irq(hw_irq,
562 get_hard_smp_processor_id(target),
563 xive_irq_priority, d->irq);
564 if (rc)
565 return rc;
566
567 /* Unmask the ESB */
568 xive_do_source_set_mask(xd, false);
569
570 return 0;
571}
572
573static void xive_irq_shutdown(struct irq_data *d)
574{
575 struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
576 unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
577
578 pr_devel("xive_irq_shutdown: irq %d [0x%x] data @%p\n",
579 d->irq, hw_irq, d);
580
581 if (WARN_ON(xd->target == XIVE_INVALID_TARGET))
582 return;
583
584 /* Mask the interrupt at the source */
585 xive_do_source_set_mask(xd, true);
586
587 /*
588 * The above may have set saved_p. We clear it, otherwise it
589 * will prevent re-enabling later on. It is ok to forget the
590 * fact that the interrupt might be in a queue because we are
591 * accounting that already in xive_dec_target_count() and will
592 * be re-routing it to a new queue with proper accounting when
593 * it's started up again
594 */
595 xd->saved_p = false;
596
597 /*
598 * Mask the interrupt in HW in the IVT/EAS and set the number
599 * to be the "bad" IRQ number
600 */
601 xive_ops->configure_irq(hw_irq,
602 get_hard_smp_processor_id(xd->target),
603 0xff, XIVE_BAD_IRQ);
604
605 xive_dec_target_count(xd->target);
606 xd->target = XIVE_INVALID_TARGET;
607}
608
609static void xive_irq_unmask(struct irq_data *d)
610{
611 struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
612
613 pr_devel("xive_irq_unmask: irq %d data @%p\n", d->irq, xd);
614
615 /*
616 * This is a workaround for PCI LSI problems on P9, for
617 * these, we call FW to set the mask. The problems might
618 * be fixed by P9 DD2.0, if that is the case, firmware
619 * will no longer set that flag.
620 */
621 if (xd->flags & XIVE_IRQ_FLAG_MASK_FW) {
622 unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
623 xive_ops->configure_irq(hw_irq,
624 get_hard_smp_processor_id(xd->target),
625 xive_irq_priority, d->irq);
626 return;
627 }
628
629 xive_do_source_set_mask(xd, false);
630}
631
632static void xive_irq_mask(struct irq_data *d)
633{
634 struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
635
636 pr_devel("xive_irq_mask: irq %d data @%p\n", d->irq, xd);
637
638 /*
639 * This is a workaround for PCI LSI problems on P9, for
640 * these, we call OPAL to set the mask. The problems might
641 * be fixed by P9 DD2.0, if that is the case, firmware
642 * will no longer set that flag.
643 */
644 if (xd->flags & XIVE_IRQ_FLAG_MASK_FW) {
645 unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
646 xive_ops->configure_irq(hw_irq,
647 get_hard_smp_processor_id(xd->target),
648 0xff, d->irq);
649 return;
650 }
651
652 xive_do_source_set_mask(xd, true);
653}
654
655static int xive_irq_set_affinity(struct irq_data *d,
656 const struct cpumask *cpumask,
657 bool force)
658{
659 struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
660 unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
661 u32 target, old_target;
662 int rc = 0;
663
664 pr_devel("xive_irq_set_affinity: irq %d\n", d->irq);
665
666 /* Is this valid ? */
667 if (cpumask_any_and(cpumask, cpu_online_mask) >= nr_cpu_ids)
668 return -EINVAL;
669
670 /*
671 * If existing target is already in the new mask, and is
672 * online then do nothing.
673 */
674 if (xd->target != XIVE_INVALID_TARGET &&
675 cpu_online(xd->target) &&
676 cpumask_test_cpu(xd->target, cpumask))
677 return IRQ_SET_MASK_OK;
678
679 /* Pick a new target */
680 target = xive_pick_irq_target(d, cpumask);
681
682 /* No target found */
683 if (target == XIVE_INVALID_TARGET)
684 return -ENXIO;
685
686 /* Sanity check */
687 if (WARN_ON(target >= nr_cpu_ids))
688 target = smp_processor_id();
689
690 old_target = xd->target;
691
692 rc = xive_ops->configure_irq(hw_irq,
693 get_hard_smp_processor_id(target),
694 xive_irq_priority, d->irq);
695 if (rc < 0) {
696 pr_err("Error %d reconfiguring irq %d\n", rc, d->irq);
697 return rc;
698 }
699
700 pr_devel(" target: 0x%x\n", target);
701 xd->target = target;
702
703 /* Give up previous target */
704 if (old_target != XIVE_INVALID_TARGET)
705 xive_dec_target_count(old_target);
706
707 return IRQ_SET_MASK_OK;
708}
709
710static int xive_irq_set_type(struct irq_data *d, unsigned int flow_type)
711{
712 struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
713
714 /*
715 * We only support these. This has no real effect other than setting
716 * the corresponding descriptor bits, but those will in turn
717 * affect the resend function when re-enabling an edge interrupt.
718 *
719 * Set the default to edge as explained in map().
720 */
721 if (flow_type == IRQ_TYPE_DEFAULT || flow_type == IRQ_TYPE_NONE)
722 flow_type = IRQ_TYPE_EDGE_RISING;
723
724 if (flow_type != IRQ_TYPE_EDGE_RISING &&
725 flow_type != IRQ_TYPE_LEVEL_LOW)
726 return -EINVAL;
727
728 irqd_set_trigger_type(d, flow_type);
729
730 /*
731 * Double check it matches what the FW thinks
732 *
733 * NOTE: We don't know yet if the PAPR interface will provide
734 * the LSI vs MSI information apart from the device-tree so
735 * this check might have to move into an optional backend call
736 * that is specific to the native backend
737 */
738 if ((flow_type == IRQ_TYPE_LEVEL_LOW) !=
739 !!(xd->flags & XIVE_IRQ_FLAG_LSI)) {
740 pr_warn("Interrupt %d (HW 0x%x) type mismatch, Linux says %s, FW says %s\n",
741 d->irq, (u32)irqd_to_hwirq(d),
742 (flow_type == IRQ_TYPE_LEVEL_LOW) ? "Level" : "Edge",
743 (xd->flags & XIVE_IRQ_FLAG_LSI) ? "Level" : "Edge");
744 }
745
746 return IRQ_SET_MASK_OK_NOCOPY;
747}
748
749static int xive_irq_retrigger(struct irq_data *d)
750{
751 struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
752
753 /* This should be only for MSIs */
754 if (WARN_ON(xd->flags & XIVE_IRQ_FLAG_LSI))
755 return 0;
756
757 /*
758 * To perform a retrigger, we first set the PQ bits to
759 * 11, then perform an EOI.
760 */
761 xive_poke_esb(xd, XIVE_ESB_SET_PQ_11);
762
763 /*
764 * Note: We pass "0" to the hw_irq argument in order to
765 * avoid calling into the backend EOI code which we don't
766 * want to do in the case of a re-trigger. Backends typically
767 * only do EOI for LSIs anyway.
768 */
769 xive_do_source_eoi(0, xd);
770
771 return 1;
772}
773
774static struct irq_chip xive_irq_chip = {
775 .name = "XIVE-IRQ",
776 .irq_startup = xive_irq_startup,
777 .irq_shutdown = xive_irq_shutdown,
778 .irq_eoi = xive_irq_eoi,
779 .irq_mask = xive_irq_mask,
780 .irq_unmask = xive_irq_unmask,
781 .irq_set_affinity = xive_irq_set_affinity,
782 .irq_set_type = xive_irq_set_type,
783 .irq_retrigger = xive_irq_retrigger,
784};
785
786bool is_xive_irq(struct irq_chip *chip)
787{
788 return chip == &xive_irq_chip;
789}
790
791void xive_cleanup_irq_data(struct xive_irq_data *xd)
792{
793 if (xd->eoi_mmio) {
794 iounmap(xd->eoi_mmio);
795 if (xd->eoi_mmio == xd->trig_mmio)
796 xd->trig_mmio = NULL;
797 xd->eoi_mmio = NULL;
798 }
799 if (xd->trig_mmio) {
800 iounmap(xd->trig_mmio);
801 xd->trig_mmio = NULL;
802 }
803}
804
805static int xive_irq_alloc_data(unsigned int virq, irq_hw_number_t hw)
806{
807 struct xive_irq_data *xd;
808 int rc;
809
810 xd = kzalloc(sizeof(struct xive_irq_data), GFP_KERNEL);
811 if (!xd)
812 return -ENOMEM;
813 rc = xive_ops->populate_irq_data(hw, xd);
814 if (rc) {
815 kfree(xd);
816 return rc;
817 }
818 xd->target = XIVE_INVALID_TARGET;
819 irq_set_handler_data(virq, xd);
820
821 return 0;
822}
823
824static void xive_irq_free_data(unsigned int virq)
825{
826 struct xive_irq_data *xd = irq_get_handler_data(virq);
827
828 if (!xd)
829 return;
830 irq_set_handler_data(virq, NULL);
831 xive_cleanup_irq_data(xd);
832 kfree(xd);
833}
834
835#ifdef CONFIG_SMP
836
837static void xive_cause_ipi(int cpu)
838{
839 struct xive_cpu *xc;
840 struct xive_irq_data *xd;
841
842 xc = per_cpu(xive_cpu, cpu);
843
844 DBG_VERBOSE("IPI CPU %d -> %d (HW IRQ 0x%x)\n",
845 smp_processor_id(), cpu, xc->hw_ipi);
846
847 xd = &xc->ipi_data;
848 if (WARN_ON(!xd->trig_mmio))
849 return;
850 out_be64(xd->trig_mmio, 0);
851}
852
853static irqreturn_t xive_muxed_ipi_action(int irq, void *dev_id)
854{
855 return smp_ipi_demux();
856}
857
858static void xive_ipi_eoi(struct irq_data *d)
859{
860 struct xive_cpu *xc = __this_cpu_read(xive_cpu);
861
862 /* Handle possible race with unplug and drop stale IPIs */
863 if (!xc)
864 return;
865 xive_do_source_eoi(xc->hw_ipi, &xc->ipi_data);
866 xive_do_queue_eoi(xc);
867}
868
869static void xive_ipi_do_nothing(struct irq_data *d)
870{
871 /*
872 * Nothing to do, we never mask/unmask IPIs, but the callback
873 * has to exist for the struct irq_chip.
874 */
875}
876
877static struct irq_chip xive_ipi_chip = {
878 .name = "XIVE-IPI",
879 .irq_eoi = xive_ipi_eoi,
880 .irq_mask = xive_ipi_do_nothing,
881 .irq_unmask = xive_ipi_do_nothing,
882};
883
884static void __init xive_request_ipi(void)
885{
886 unsigned int virq;
887
888 /*
889 * Initialization failed, move on, we might manage to
890 * reach the point where we display our errors before
891 * the system falls apart
892 */
893 if (!xive_irq_domain)
894 return;
895
896 /* Initialize it */
897 virq = irq_create_mapping(xive_irq_domain, 0);
898 xive_ipi_irq = virq;
899
900 WARN_ON(request_irq(virq, xive_muxed_ipi_action,
901 IRQF_PERCPU | IRQF_NO_THREAD, "IPI", NULL));
902}
903
904static int xive_setup_cpu_ipi(unsigned int cpu)
905{
906 struct xive_cpu *xc;
907 int rc;
908
909 pr_debug("Setting up IPI for CPU %d\n", cpu);
910
911 xc = per_cpu(xive_cpu, cpu);
912
913 /* Check if we are already setup */
914 if (xc->hw_ipi != 0)
915 return 0;
916
917 /* Grab an IPI from the backend, this will populate xc->hw_ipi */
918 if (xive_ops->get_ipi(cpu, xc))
919 return -EIO;
920
921 /*
922 * Populate the IRQ data in the xive_cpu structure and
923 * configure the HW / enable the IPIs.
924 */
925 rc = xive_ops->populate_irq_data(xc->hw_ipi, &xc->ipi_data);
926 if (rc) {
927 pr_err("Failed to populate IPI data on CPU %d\n", cpu);
928 return -EIO;
929 }
930 rc = xive_ops->configure_irq(xc->hw_ipi,
931 get_hard_smp_processor_id(cpu),
932 xive_irq_priority, xive_ipi_irq);
933 if (rc) {
934 pr_err("Failed to map IPI CPU %d\n", cpu);
935 return -EIO;
936 }
937 pr_devel("CPU %d HW IPI %x, virq %d, trig_mmio=%p\n", cpu,
938 xc->hw_ipi, xive_ipi_irq, xc->ipi_data.trig_mmio);
939
940 /* Unmask it */
941 xive_do_source_set_mask(&xc->ipi_data, false);
942
943 return 0;
944}
945
946static void xive_cleanup_cpu_ipi(unsigned int cpu, struct xive_cpu *xc)
947{
948 /* Disable the IPI and free the IRQ data */
949
950 /* Already cleaned up ? */
951 if (xc->hw_ipi == 0)
952 return;
953
954 /* Mask the IPI */
955 xive_do_source_set_mask(&xc->ipi_data, true);
956
957 /*
958 * Note: We don't call xive_cleanup_irq_data() to free
959 * the mappings as this is called from an IPI on kexec
960 * which is not a safe environment to call iounmap()
961 */
962
963 /* Deconfigure/mask in the backend */
964 xive_ops->configure_irq(xc->hw_ipi, hard_smp_processor_id(),
965 0xff, xive_ipi_irq);
966
967 /* Free the IPIs in the backend */
968 xive_ops->put_ipi(cpu, xc);
969}
970
971void __init xive_smp_probe(void)
972{
973 smp_ops->cause_ipi = xive_cause_ipi;
974
975 /* Register the IPI */
976 xive_request_ipi();
977
978 /* Allocate and setup IPI for the boot CPU */
979 xive_setup_cpu_ipi(smp_processor_id());
980}
981
982#endif /* CONFIG_SMP */
983
984static int xive_irq_domain_map(struct irq_domain *h, unsigned int virq,
985 irq_hw_number_t hw)
986{
987 int rc;
988
989 /*
990 * Mark interrupts as edge sensitive by default so that resend
991 * actually works. Will fix that up below if needed.
992 */
993 irq_clear_status_flags(virq, IRQ_LEVEL);
994
995#ifdef CONFIG_SMP
996 /* IPIs are special and come up with HW number 0 */
997 if (hw == 0) {
998 /*
999 * IPIs are marked per-cpu. We use separate HW interrupts under
1000 * the hood but associated with the same "linux" interrupt
1001 */
1002 irq_set_chip_and_handler(virq, &xive_ipi_chip,
1003 handle_percpu_irq);
1004 return 0;
1005 }
1006#endif
1007
1008 rc = xive_irq_alloc_data(virq, hw);
1009 if (rc)
1010 return rc;
1011
1012 irq_set_chip_and_handler(virq, &xive_irq_chip, handle_fasteoi_irq);
1013
1014 return 0;
1015}
1016
1017static void xive_irq_domain_unmap(struct irq_domain *d, unsigned int virq)
1018{
1019 struct irq_data *data = irq_get_irq_data(virq);
1020 unsigned int hw_irq;
1021
1022 /* XXX Assign BAD number */
1023 if (!data)
1024 return;
1025 hw_irq = (unsigned int)irqd_to_hwirq(data);
1026 if (hw_irq)
1027 xive_irq_free_data(virq);
1028}
1029
1030static int xive_irq_domain_xlate(struct irq_domain *h, struct device_node *ct,
1031 const u32 *intspec, unsigned int intsize,
1032 irq_hw_number_t *out_hwirq, unsigned int *out_flags)
1033
1034{
1035 *out_hwirq = intspec[0];
1036
1037 /*
1038 * If intsize is at least 2, we look for the type in the second cell;
1039 * we assume the LSB indicates a level interrupt.
1040 */
1041 if (intsize > 1) {
1042 if (intspec[1] & 1)
1043 *out_flags = IRQ_TYPE_LEVEL_LOW;
1044 else
1045 *out_flags = IRQ_TYPE_EDGE_RISING;
1046 } else
1047 *out_flags = IRQ_TYPE_LEVEL_LOW;
1048
1049 return 0;
1050}
1051
1052static int xive_irq_domain_match(struct irq_domain *h, struct device_node *node,
1053 enum irq_domain_bus_token bus_token)
1054{
1055 return xive_ops->match(node);
1056}
1057
1058static const struct irq_domain_ops xive_irq_domain_ops = {
1059 .match = xive_irq_domain_match,
1060 .map = xive_irq_domain_map,
1061 .unmap = xive_irq_domain_unmap,
1062 .xlate = xive_irq_domain_xlate,
1063};
1064
1065static void __init xive_init_host(void)
1066{
1067 xive_irq_domain = irq_domain_add_nomap(NULL, XIVE_MAX_IRQ,
1068 &xive_irq_domain_ops, NULL);
1069 if (WARN_ON(xive_irq_domain == NULL))
1070 return;
1071 irq_set_default_host(xive_irq_domain);
1072}
1073
1074static void xive_cleanup_cpu_queues(unsigned int cpu, struct xive_cpu *xc)
1075{
1076 if (xc->queue[xive_irq_priority].qpage)
1077 xive_ops->cleanup_queue(cpu, xc, xive_irq_priority);
1078}
1079
1080static int xive_setup_cpu_queues(unsigned int cpu, struct xive_cpu *xc)
1081{
1082 int rc = 0;
1083
1084 /* We set up one queue for now, with a 64k page */
1085 if (!xc->queue[xive_irq_priority].qpage)
1086 rc = xive_ops->setup_queue(cpu, xc, xive_irq_priority);
1087
1088 return rc;
1089}
1090
1091static int xive_prepare_cpu(unsigned int cpu)
1092{
1093 struct xive_cpu *xc;
1094
1095 xc = per_cpu(xive_cpu, cpu);
1096 if (!xc) {
1097 struct device_node *np;
1098
1099 xc = kzalloc_node(sizeof(struct xive_cpu),
1100 GFP_KERNEL, cpu_to_node(cpu));
1101 if (!xc)
1102 return -ENOMEM;
1103 np = of_get_cpu_node(cpu, NULL);
1104 if (np)
1105 xc->chip_id = of_get_ibm_chip_id(np);
1106 of_node_put(np);
1107
1108 per_cpu(xive_cpu, cpu) = xc;
1109 }
1110
1111 /* Setup EQs if not already */
1112 return xive_setup_cpu_queues(cpu, xc);
1113}
1114
1115static void xive_setup_cpu(void)
1116{
1117 struct xive_cpu *xc = __this_cpu_read(xive_cpu);
1118
1119 /* Debug: Dump the TM state */
1120 pr_devel("CPU %d [HW 0x%02x] VT=%02x\n",
1121 smp_processor_id(), hard_smp_processor_id(),
1122 in_8(xive_tima + xive_tima_offset + TM_WORD2));
1123
1124 /* The backend might have additional things to do */
1125 if (xive_ops->setup_cpu)
1126 xive_ops->setup_cpu(smp_processor_id(), xc);
1127
1128 /* Set CPPR to 0xff to enable flow of interrupts */
1129 xc->cppr = 0xff;
1130 out_8(xive_tima + xive_tima_offset + TM_CPPR, 0xff);
1131}
1132
1133#ifdef CONFIG_SMP
1134void xive_smp_setup_cpu(void)
1135{
1136 pr_devel("SMP setup CPU %d\n", smp_processor_id());
1137
1138 /* This will have already been done on the boot CPU */
1139 if (smp_processor_id() != boot_cpuid)
1140 xive_setup_cpu();
1141
1142}
1143
1144int xive_smp_prepare_cpu(unsigned int cpu)
1145{
1146 int rc;
1147
1148 /* Allocate per-CPU data and queues */
1149 rc = xive_prepare_cpu(cpu);
1150 if (rc)
1151 return rc;
1152
1153 /* Allocate and setup IPI for the new CPU */
1154 return xive_setup_cpu_ipi(cpu);
1155}
1156
1157#ifdef CONFIG_HOTPLUG_CPU
1158static void xive_flush_cpu_queue(unsigned int cpu, struct xive_cpu *xc)
1159{
1160 u32 irq;
1161
1162 /* We assume local irqs are disabled */
1163 WARN_ON(!irqs_disabled());
1164
1165 /* Check what's already in the CPU queue */
1166 while ((irq = xive_scan_interrupts(xc, false)) != 0) {
1167 /*
1168 * We need to re-route that interrupt to its new destination.
1169 * First get and lock the descriptor
1170 */
1171 struct irq_desc *desc = irq_to_desc(irq);
1172 struct irq_data *d = irq_desc_get_irq_data(desc);
1173 struct xive_irq_data *xd;
1174 unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
1175
1176 /*
1177 * Ignore anything that isn't a XIVE irq and ignore
1178 * IPIs, which can just be dropped.
1179 */
1180 if (d->domain != xive_irq_domain || hw_irq == 0)
1181 continue;
1182
1183 /*
1184 * The IRQ should have already been re-routed, it's just a
1185 * stale entry in the old queue, so re-trigger it in order to make
1186 * it reach its new destination.
1187 */
1188#ifdef DEBUG_FLUSH
1189 pr_info("CPU %d: Got irq %d while offline, re-sending...\n",
1190 cpu, irq);
1191#endif
1192 raw_spin_lock(&desc->lock);
1193 xd = irq_desc_get_handler_data(desc);
1194
1195 /*
1196 * For LSIs, we EOI, this will cause a resend if it's
1197 * still asserted. Otherwise do an MSI retrigger.
1198 */
1199 if (xd->flags & XIVE_IRQ_FLAG_LSI)
1200 xive_do_source_eoi(irqd_to_hwirq(d), xd);
1201 else
1202 xive_irq_retrigger(d);
1203
1204 raw_spin_unlock(&desc->lock);
1205 }
1206}
1207
1208void xive_smp_disable_cpu(void)
1209{
1210 struct xive_cpu *xc = __this_cpu_read(xive_cpu);
1211 unsigned int cpu = smp_processor_id();
1212
1213 /* Migrate interrupts away from the CPU */
1214 irq_migrate_all_off_this_cpu();
1215
1216 /* Set CPPR to 0 to disable flow of interrupts */
1217 xc->cppr = 0;
1218 out_8(xive_tima + xive_tima_offset + TM_CPPR, 0);
1219
1220 /* Flush everything still in the queue */
1221 xive_flush_cpu_queue(cpu, xc);
1222
1223 /* Re-enable CPPR */
1224 xc->cppr = 0xff;
1225 out_8(xive_tima + xive_tima_offset + TM_CPPR, 0xff);
1226}
1227
1228void xive_flush_interrupt(void)
1229{
1230 struct xive_cpu *xc = __this_cpu_read(xive_cpu);
1231 unsigned int cpu = smp_processor_id();
1232
1233 /* Called if an interrupt occurs while the CPU is hot unplugged */
1234 xive_flush_cpu_queue(cpu, xc);
1235}
1236
1237#endif /* CONFIG_HOTPLUG_CPU */
1238
1239#endif /* CONFIG_SMP */
1240
1241void xive_kexec_teardown_cpu(int secondary)
1242{
1243 struct xive_cpu *xc = __this_cpu_read(xive_cpu);
1244 unsigned int cpu = smp_processor_id();
1245
1246 /* Set CPPR to 0 to disable flow of interrupts */
1247 xc->cppr = 0;
1248 out_8(xive_tima + xive_tima_offset + TM_CPPR, 0);
1249
1250 /* Backend cleanup if any */
1251 if (xive_ops->teardown_cpu)
1252 xive_ops->teardown_cpu(cpu, xc);
1253
1254#ifdef CONFIG_SMP
1255 /* Get rid of IPI */
1256 xive_cleanup_cpu_ipi(cpu, xc);
1257#endif
1258
1259 /* Disable and free the queues */
1260 xive_cleanup_cpu_queues(cpu, xc);
1261}
1262
1263void xive_shutdown(void)
1264{
1265 xive_ops->shutdown();
1266}
1267
1268bool xive_core_init(const struct xive_ops *ops, void __iomem *area, u32 offset,
1269 u8 max_prio)
1270{
1271 xive_tima = area;
1272 xive_tima_offset = offset;
1273 xive_ops = ops;
1274 xive_irq_priority = max_prio;
1275
1276 ppc_md.get_irq = xive_get_irq;
1277 __xive_enabled = true;
1278
1279 pr_devel("Initializing host..\n");
1280 xive_init_host();
1281
1282 pr_devel("Initializing boot CPU..\n");
1283
1284 /* Allocate per-CPU data and queues */
1285 xive_prepare_cpu(smp_processor_id());
1286
1287 /* Get ready for interrupts */
1288 xive_setup_cpu();
1289
1290	pr_info("Interrupt handling initialized with %s backend\n",
1291 xive_ops->name);
1292 pr_info("Using priority %d for all interrupts\n", max_prio);
1293
1294 return true;
1295}
1296
1297static int __init xive_off(char *arg)
1298{
1299 xive_cmdline_disabled = true;
1300 return 0;
1301}
1302__setup("xive=off", xive_off);
diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c
new file mode 100644
index 000000000000..1a726229a427
--- /dev/null
+++ b/arch/powerpc/sysdev/xive/native.c
@@ -0,0 +1,640 @@
1/*
2 * Copyright 2016,2017 IBM Corporation.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 */
9
10#define pr_fmt(fmt) "xive: " fmt
11
12#include <linux/types.h>
13#include <linux/irq.h>
14#include <linux/debugfs.h>
15#include <linux/smp.h>
16#include <linux/interrupt.h>
17#include <linux/seq_file.h>
18#include <linux/init.h>
19#include <linux/of.h>
20#include <linux/slab.h>
21#include <linux/spinlock.h>
22#include <linux/delay.h>
23#include <linux/cpumask.h>
24#include <linux/mm.h>
25
26#include <asm/prom.h>
27#include <asm/io.h>
28#include <asm/smp.h>
29#include <asm/irq.h>
30#include <asm/errno.h>
31#include <asm/xive.h>
32#include <asm/xive-regs.h>
33#include <asm/opal.h>
34
35#include "xive-internal.h"
36
37
38static u32 xive_provision_size;
39static u32 *xive_provision_chips;
40static u32 xive_provision_chip_count;
41static u32 xive_queue_shift;
42static u32 xive_pool_vps = XIVE_INVALID_VP;
43static struct kmem_cache *xive_provision_cache;
44
45int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data)
46{
47 __be64 flags, eoi_page, trig_page;
48 __be32 esb_shift, src_chip;
49 u64 opal_flags;
50 s64 rc;
51
52 memset(data, 0, sizeof(*data));
53
54 rc = opal_xive_get_irq_info(hw_irq, &flags, &eoi_page, &trig_page,
55 &esb_shift, &src_chip);
56 if (rc) {
57 pr_err("opal_xive_get_irq_info(0x%x) returned %lld\n",
58 hw_irq, rc);
59 return -EINVAL;
60 }
61
62 opal_flags = be64_to_cpu(flags);
63 if (opal_flags & OPAL_XIVE_IRQ_STORE_EOI)
64 data->flags |= XIVE_IRQ_FLAG_STORE_EOI;
65 if (opal_flags & OPAL_XIVE_IRQ_LSI)
66 data->flags |= XIVE_IRQ_FLAG_LSI;
67 if (opal_flags & OPAL_XIVE_IRQ_SHIFT_BUG)
68 data->flags |= XIVE_IRQ_FLAG_SHIFT_BUG;
69 if (opal_flags & OPAL_XIVE_IRQ_MASK_VIA_FW)
70 data->flags |= XIVE_IRQ_FLAG_MASK_FW;
71 if (opal_flags & OPAL_XIVE_IRQ_EOI_VIA_FW)
72 data->flags |= XIVE_IRQ_FLAG_EOI_FW;
73 data->eoi_page = be64_to_cpu(eoi_page);
74 data->trig_page = be64_to_cpu(trig_page);
75 data->esb_shift = be32_to_cpu(esb_shift);
76 data->src_chip = be32_to_cpu(src_chip);
77
78 data->eoi_mmio = ioremap(data->eoi_page, 1u << data->esb_shift);
79 if (!data->eoi_mmio) {
80 pr_err("Failed to map EOI page for irq 0x%x\n", hw_irq);
81 return -ENOMEM;
82 }
83
84 if (!data->trig_page)
85 return 0;
86 if (data->trig_page == data->eoi_page) {
87 data->trig_mmio = data->eoi_mmio;
88 return 0;
89 }
90
91 data->trig_mmio = ioremap(data->trig_page, 1u << data->esb_shift);
92 if (!data->trig_mmio) {
93 pr_err("Failed to map trigger page for irq 0x%x\n", hw_irq);
94 return -ENOMEM;
95 }
96 return 0;
97}
98
99int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq)
100{
101 s64 rc;
102
103 for (;;) {
104 rc = opal_xive_set_irq_config(hw_irq, target, prio, sw_irq);
105 if (rc != OPAL_BUSY)
106 break;
107 msleep(1);
108 }
109 return rc == 0 ? 0 : -ENXIO;
110}
111
112/* This can be called multiple times to change a queue configuration */
113int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio,
114 __be32 *qpage, u32 order, bool can_escalate)
115{
116 s64 rc = 0;
117 __be64 qeoi_page_be;
118 __be32 esc_irq_be;
119 u64 flags, qpage_phys;
120
121 /* If there's an actual queue page, clean it */
122 if (order) {
123 if (WARN_ON(!qpage))
124 return -EINVAL;
125 qpage_phys = __pa(qpage);
126 } else
127 qpage_phys = 0;
128
129 /* Initialize the rest of the fields */
130 q->msk = order ? ((1u << (order - 2)) - 1) : 0;
131 q->idx = 0;
132 q->toggle = 0;
133
134 rc = opal_xive_get_queue_info(vp_id, prio, NULL, NULL,
135 &qeoi_page_be,
136 &esc_irq_be,
137 NULL);
138 if (rc) {
139 pr_err("Error %lld getting queue info prio %d\n", rc, prio);
140 rc = -EIO;
141 goto fail;
142 }
143 q->eoi_phys = be64_to_cpu(qeoi_page_be);
144
145 /* Default flags */
146 flags = OPAL_XIVE_EQ_ALWAYS_NOTIFY | OPAL_XIVE_EQ_ENABLED;
147
148	/* Escalation needed? */
149 if (can_escalate) {
150 q->esc_irq = be32_to_cpu(esc_irq_be);
151 flags |= OPAL_XIVE_EQ_ESCALATE;
152 }
153
154 /* Configure and enable the queue in HW */
155 for (;;) {
156 rc = opal_xive_set_queue_info(vp_id, prio, qpage_phys, order, flags);
157 if (rc != OPAL_BUSY)
158 break;
159 msleep(1);
160 }
161 if (rc) {
162 pr_err("Error %lld setting queue for prio %d\n", rc, prio);
163 rc = -EIO;
164 } else {
165 /*
166 * KVM code requires all of the above to be visible before
167 * q->qpage is set due to how it manages IPI EOIs
168 */
169 wmb();
170 q->qpage = qpage;
171 }
172fail:
173 return rc;
174}
175
176static void __xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio)
177{
178 s64 rc;
179
180 /* Disable the queue in HW */
181 for (;;) {
182 rc = opal_xive_set_queue_info(vp_id, prio, 0, 0, 0);
183 if (rc != OPAL_BUSY)
184 break;
185 msleep(1);
186 }
187 if (rc)
188 pr_err("Error %lld disabling queue for prio %d\n", rc, prio);
189}
190
191void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio)
192{
193 __xive_native_disable_queue(vp_id, q, prio);
194}
195
196static int xive_native_setup_queue(unsigned int cpu, struct xive_cpu *xc, u8 prio)
197{
198 struct xive_q *q = &xc->queue[prio];
199 unsigned int alloc_order;
200 struct page *pages;
201 __be32 *qpage;
202
203 alloc_order = (xive_queue_shift > PAGE_SHIFT) ?
204 (xive_queue_shift - PAGE_SHIFT) : 0;
205 pages = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, alloc_order);
206 if (!pages)
207 return -ENOMEM;
208 qpage = (__be32 *)page_address(pages);
209 memset(qpage, 0, 1 << xive_queue_shift);
210 return xive_native_configure_queue(get_hard_smp_processor_id(cpu),
211 q, prio, qpage, xive_queue_shift, false);
212}
213
214static void xive_native_cleanup_queue(unsigned int cpu, struct xive_cpu *xc, u8 prio)
215{
216 struct xive_q *q = &xc->queue[prio];
217 unsigned int alloc_order;
218
219 /*
220 * We use the variant with no iounmap as this is called on exec
221 * from an IPI and iounmap isn't safe
222 */
223 __xive_native_disable_queue(get_hard_smp_processor_id(cpu), q, prio);
224 alloc_order = (xive_queue_shift > PAGE_SHIFT) ?
225 (xive_queue_shift - PAGE_SHIFT) : 0;
226 free_pages((unsigned long)q->qpage, alloc_order);
227 q->qpage = NULL;
228}
229
230static bool xive_native_match(struct device_node *node)
231{
232 return of_device_is_compatible(node, "ibm,opal-xive-vc");
233}
234
235#ifdef CONFIG_SMP
236static int xive_native_get_ipi(unsigned int cpu, struct xive_cpu *xc)
237{
238 struct device_node *np;
239	unsigned int chip_id = 0;	/* default to chip 0 if unknown */
240 s64 irq;
241
242 /* Find the chip ID */
243 np = of_get_cpu_node(cpu, NULL);
244 if (np) {
245 if (of_property_read_u32(np, "ibm,chip-id", &chip_id) < 0)
246 chip_id = 0;
247 }
248
249 /* Allocate an IPI and populate info about it */
250 for (;;) {
251 irq = opal_xive_allocate_irq(chip_id);
252 if (irq == OPAL_BUSY) {
253 msleep(1);
254 continue;
255 }
256 if (irq < 0) {
257 pr_err("Failed to allocate IPI on CPU %d\n", cpu);
258 return -ENXIO;
259 }
260 xc->hw_ipi = irq;
261 break;
262 }
263 return 0;
264}
265
266u32 xive_native_alloc_irq(void)
267{
268 s64 rc;
269
270 for (;;) {
271 rc = opal_xive_allocate_irq(OPAL_XIVE_ANY_CHIP);
272 if (rc != OPAL_BUSY)
273 break;
274 msleep(1);
275 }
276 if (rc < 0)
277 return 0;
278 return rc;
279}
280
281void xive_native_free_irq(u32 irq)
282{
283 for (;;) {
284 s64 rc = opal_xive_free_irq(irq);
285 if (rc != OPAL_BUSY)
286 break;
287 msleep(1);
288 }
289}
290
291static void xive_native_put_ipi(unsigned int cpu, struct xive_cpu *xc)
292{
293 s64 rc;
294
295 /* Free the IPI */
296 if (!xc->hw_ipi)
297 return;
298 for (;;) {
299 rc = opal_xive_free_irq(xc->hw_ipi);
300 if (rc == OPAL_BUSY) {
301 msleep(1);
302 continue;
303 }
304 xc->hw_ipi = 0;
305 break;
306 }
307}
308#endif /* CONFIG_SMP */
309
310static void xive_native_shutdown(void)
311{
312 /* Switch the XIVE to emulation mode */
313 opal_xive_reset(OPAL_XIVE_MODE_EMU);
314}
315
316/*
317 * Perform an "ack" cycle on the current thread, thus
318 * grabbing the pending active priorities and updating
319 * the CPPR to the most favored one.
320 */
321static void xive_native_update_pending(struct xive_cpu *xc)
322{
323 u8 he, cppr;
324 u16 ack;
325
326	/* Perform the acknowledge hypervisor-to-register cycle */
327 ack = be16_to_cpu(__raw_readw(xive_tima + TM_SPC_ACK_HV_REG));
328
329 /* Synchronize subsequent queue accesses */
330 mb();
331
332 /*
333 * Grab the CPPR and the "HE" field which indicates the source
334 * of the hypervisor interrupt (if any)
335 */
336 cppr = ack & 0xff;
337 he = GETFIELD(TM_QW3_NSR_HE, (ack >> 8));
338 switch(he) {
339 case TM_QW3_NSR_HE_NONE: /* Nothing to see here */
340 break;
341 case TM_QW3_NSR_HE_PHYS: /* Physical thread interrupt */
342 if (cppr == 0xff)
343 return;
344 /* Mark the priority pending */
345 xc->pending_prio |= 1 << cppr;
346
347 /*
348 * A new interrupt should never have a CPPR less favored
349 * than our current one.
350 */
351 if (cppr >= xc->cppr)
352 pr_err("CPU %d odd ack CPPR, got %d at %d\n",
353 smp_processor_id(), cppr, xc->cppr);
354
355 /* Update our idea of what the CPPR is */
356 xc->cppr = cppr;
357 break;
358 case TM_QW3_NSR_HE_POOL: /* HV Pool interrupt (unused) */
359 case TM_QW3_NSR_HE_LSI: /* Legacy FW LSI (unused) */
360 pr_err("CPU %d got unexpected interrupt type HE=%d\n",
361 smp_processor_id(), he);
362 return;
363 }
364}
365
366static void xive_native_eoi(u32 hw_irq)
367{
368 /*
369 * Not normally used except if specific interrupts need
370 * a workaround on EOI.
371 */
372 opal_int_eoi(hw_irq);
373}
374
375static void xive_native_setup_cpu(unsigned int cpu, struct xive_cpu *xc)
376{
377 s64 rc;
378 u32 vp;
379 __be64 vp_cam_be;
380 u64 vp_cam;
381
382 if (xive_pool_vps == XIVE_INVALID_VP)
383 return;
384
385 /* Enable the pool VP */
386 vp = xive_pool_vps + get_hard_smp_processor_id(cpu);
387 pr_debug("CPU %d setting up pool VP 0x%x\n", cpu, vp);
388 for (;;) {
389 rc = opal_xive_set_vp_info(vp, OPAL_XIVE_VP_ENABLED, 0);
390 if (rc != OPAL_BUSY)
391 break;
392 msleep(1);
393 }
394 if (rc) {
395 pr_err("Failed to enable pool VP on CPU %d\n", cpu);
396 return;
397 }
398
399	/* Grab its CAM value */
400 rc = opal_xive_get_vp_info(vp, NULL, &vp_cam_be, NULL, NULL);
401 if (rc) {
402 pr_err("Failed to get pool VP info CPU %d\n", cpu);
403 return;
404 }
405 vp_cam = be64_to_cpu(vp_cam_be);
406
407 pr_debug("VP CAM = %llx\n", vp_cam);
408
409 /* Push it on the CPU (set LSMFB to 0xff to skip backlog scan) */
410 pr_debug("(Old HW value: %08x)\n",
411 in_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD2));
412 out_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD0, 0xff);
413 out_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD2,
414 TM_QW2W2_VP | vp_cam);
415 pr_debug("(New HW value: %08x)\n",
416 in_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD2));
417}
418
419static void xive_native_teardown_cpu(unsigned int cpu, struct xive_cpu *xc)
420{
421 s64 rc;
422 u32 vp;
423
424 if (xive_pool_vps == XIVE_INVALID_VP)
425 return;
426
427 /* Pull the pool VP from the CPU */
428 in_be64(xive_tima + TM_SPC_PULL_POOL_CTX);
429
430 /* Disable it */
431 vp = xive_pool_vps + get_hard_smp_processor_id(cpu);
432 for (;;) {
433 rc = opal_xive_set_vp_info(vp, 0, 0);
434 if (rc != OPAL_BUSY)
435 break;
436 msleep(1);
437 }
438}
439
440static void xive_native_sync_source(u32 hw_irq)
441{
442 opal_xive_sync(XIVE_SYNC_EAS, hw_irq);
443}
444
445static const struct xive_ops xive_native_ops = {
446 .populate_irq_data = xive_native_populate_irq_data,
447 .configure_irq = xive_native_configure_irq,
448 .setup_queue = xive_native_setup_queue,
449 .cleanup_queue = xive_native_cleanup_queue,
450 .match = xive_native_match,
451 .shutdown = xive_native_shutdown,
452 .update_pending = xive_native_update_pending,
453 .eoi = xive_native_eoi,
454 .setup_cpu = xive_native_setup_cpu,
455 .teardown_cpu = xive_native_teardown_cpu,
456 .sync_source = xive_native_sync_source,
457#ifdef CONFIG_SMP
458 .get_ipi = xive_native_get_ipi,
459 .put_ipi = xive_native_put_ipi,
460#endif /* CONFIG_SMP */
461 .name = "native",
462};
463
464static bool xive_parse_provisioning(struct device_node *np)
465{
466 int rc;
467
468 if (of_property_read_u32(np, "ibm,xive-provision-page-size",
469 &xive_provision_size) < 0)
470 return true;
471 rc = of_property_count_elems_of_size(np, "ibm,xive-provision-chips", 4);
472 if (rc < 0) {
473 pr_err("Error %d getting provision chips array\n", rc);
474 return false;
475 }
476 xive_provision_chip_count = rc;
477 if (rc == 0)
478 return true;
479
480 xive_provision_chips = kzalloc(4 * xive_provision_chip_count,
481 GFP_KERNEL);
482 if (WARN_ON(!xive_provision_chips))
483 return false;
484
485 rc = of_property_read_u32_array(np, "ibm,xive-provision-chips",
486 xive_provision_chips,
487 xive_provision_chip_count);
488 if (rc < 0) {
489 pr_err("Error %d reading provision chips array\n", rc);
490 return false;
491 }
492
493 xive_provision_cache = kmem_cache_create("xive-provision",
494 xive_provision_size,
495 xive_provision_size,
496 0, NULL);
497 if (!xive_provision_cache) {
498 pr_err("Failed to allocate provision cache\n");
499 return false;
500 }
501 return true;
502}
503
504u32 xive_native_default_eq_shift(void)
505{
506 return xive_queue_shift;
507}
508
509bool xive_native_init(void)
510{
511 struct device_node *np;
512 struct resource r;
513 void __iomem *tima;
514 struct property *prop;
515 u8 max_prio = 7;
516 const __be32 *p;
517 u32 val;
518 s64 rc;
519
520 if (xive_cmdline_disabled)
521 return false;
522
523 pr_devel("xive_native_init()\n");
524 np = of_find_compatible_node(NULL, NULL, "ibm,opal-xive-pe");
525 if (!np) {
526		pr_devel("not found!\n");
527 return false;
528 }
529 pr_devel("Found %s\n", np->full_name);
530
531 /* Resource 1 is HV window */
532 if (of_address_to_resource(np, 1, &r)) {
533 pr_err("Failed to get thread mgmnt area resource\n");
534 return false;
535 }
536 tima = ioremap(r.start, resource_size(&r));
537 if (!tima) {
538 pr_err("Failed to map thread mgmnt area\n");
539 return false;
540 }
541
542 /* Read number of priorities */
543 if (of_property_read_u32(np, "ibm,xive-#priorities", &val) == 0)
544 max_prio = val - 1;
545
546 /* Iterate the EQ sizes and pick one */
547 of_property_for_each_u32(np, "ibm,xive-eq-sizes", prop, p, val) {
548 xive_queue_shift = val;
549 if (val == PAGE_SHIFT)
550 break;
551 }
552
553 /* Grab size of provisioning pages */
554 xive_parse_provisioning(np);
555
556 /* Switch the XIVE to exploitation mode */
557 rc = opal_xive_reset(OPAL_XIVE_MODE_EXPL);
558 if (rc) {
559 pr_err("Switch to exploitation mode failed with error %lld\n", rc);
560 return false;
561 }
562
563 /* Initialize XIVE core with our backend */
564 if (!xive_core_init(&xive_native_ops, tima, TM_QW3_HV_PHYS,
565 max_prio)) {
566 opal_xive_reset(OPAL_XIVE_MODE_EMU);
567 return false;
568 }
569 pr_info("Using %dkB queues\n", 1 << (xive_queue_shift - 10));
570 return true;
571}
572
573static bool xive_native_provision_pages(void)
574{
575 u32 i;
576 void *p;
577
578 for (i = 0; i < xive_provision_chip_count; i++) {
579 u32 chip = xive_provision_chips[i];
580
581 /*
582 * XXX TODO: Try to make the allocation local to the node where
583 * the chip resides.
584 */
585 p = kmem_cache_alloc(xive_provision_cache, GFP_KERNEL);
586 if (!p) {
587 pr_err("Failed to allocate provisioning page\n");
588 return false;
589 }
590 opal_xive_donate_page(chip, __pa(p));
591 }
592 return true;
593}
594
595u32 xive_native_alloc_vp_block(u32 max_vcpus)
596{
597 s64 rc;
598 u32 order;
599
600 order = fls(max_vcpus) - 1;
601 if (max_vcpus > (1 << order))
602 order++;
603
604 pr_info("VP block alloc, for max VCPUs %d use order %d\n",
605 max_vcpus, order);
606
607 for (;;) {
608 rc = opal_xive_alloc_vp_block(order);
609 switch (rc) {
610 case OPAL_BUSY:
611 msleep(1);
612 break;
613 case OPAL_XIVE_PROVISIONING:
614 if (!xive_native_provision_pages())
615 return XIVE_INVALID_VP;
616 break;
617 default:
618 if (rc < 0) {
619 pr_err("OPAL failed to allocate VCPUs order %d, err %lld\n",
620 order, rc);
621 return XIVE_INVALID_VP;
622 }
623 return rc;
624 }
625 }
626}
627EXPORT_SYMBOL_GPL(xive_native_alloc_vp_block);
628
629void xive_native_free_vp_block(u32 vp_base)
630{
631 s64 rc;
632
633 if (vp_base == XIVE_INVALID_VP)
634 return;
635
636 rc = opal_xive_free_vp_block(vp_base);
637 if (rc < 0)
638 pr_warn("OPAL error %lld freeing VP block\n", rc);
639}
640EXPORT_SYMBOL_GPL(xive_native_free_vp_block);
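Note that every OPAL entry point used in native.c above (opal_xive_set_irq_config, opal_xive_set_queue_info, opal_xive_allocate_irq, opal_xive_free_irq, opal_xive_set_vp_info, opal_xive_alloc_vp_block) is driven through the same retry-on-OPAL_BUSY loop. A minimal sketch of that shape, for illustration only and not part of the patch (the helper name and its single void * argument are assumptions), would be:

static s64 opal_busy_retry(s64 (*opal_call)(void *arg), void *arg)
{
	s64 rc;

	for (;;) {
		rc = opal_call(arg);		/* issue the firmware call */
		if (rc != OPAL_BUSY)		/* done: success or a hard error */
			break;
		msleep(1);			/* firmware busy, back off briefly */
	}
	return rc;
}

The per-call wrappers above then map the final OPAL status onto an errno (for example -ENXIO or -EIO) before returning to the core code.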
diff --git a/arch/powerpc/sysdev/xive/xive-internal.h b/arch/powerpc/sysdev/xive/xive-internal.h
new file mode 100644
index 000000000000..d07ef2d29caf
--- /dev/null
+++ b/arch/powerpc/sysdev/xive/xive-internal.h
@@ -0,0 +1,62 @@
1/*
2 * Copyright 2016,2017 IBM Corporation.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 */
9#ifndef __XIVE_INTERNAL_H
10#define __XIVE_INTERNAL_H
11
12/* Each CPU carries one of these with various per-CPU state */
13struct xive_cpu {
14#ifdef CONFIG_SMP
15 /* HW irq number and data of IPI */
16 u32 hw_ipi;
17 struct xive_irq_data ipi_data;
18#endif /* CONFIG_SMP */
19
20 int chip_id;
21
22	/* Queue data. Only one is populated */
23#define XIVE_MAX_QUEUES 8
24 struct xive_q queue[XIVE_MAX_QUEUES];
25
26 /*
27 * Pending mask. Each bit corresponds to a priority that
28 * potentially has pending interrupts.
29 */
30 u8 pending_prio;
31
32 /* Cache of HW CPPR */
33 u8 cppr;
34};
35
36/* Backend ops */
37struct xive_ops {
38 int (*populate_irq_data)(u32 hw_irq, struct xive_irq_data *data);
39 int (*configure_irq)(u32 hw_irq, u32 target, u8 prio, u32 sw_irq);
40 int (*setup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio);
41 void (*cleanup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio);
42 void (*setup_cpu)(unsigned int cpu, struct xive_cpu *xc);
43 void (*teardown_cpu)(unsigned int cpu, struct xive_cpu *xc);
44 bool (*match)(struct device_node *np);
45 void (*shutdown)(void);
46
47 void (*update_pending)(struct xive_cpu *xc);
48 void (*eoi)(u32 hw_irq);
49 void (*sync_source)(u32 hw_irq);
50#ifdef CONFIG_SMP
51 int (*get_ipi)(unsigned int cpu, struct xive_cpu *xc);
52 void (*put_ipi)(unsigned int cpu, struct xive_cpu *xc);
53#endif
54 const char *name;
55};
56
57bool xive_core_init(const struct xive_ops *ops, void __iomem *area, u32 offset,
58 u8 max_prio);
59
60extern bool xive_cmdline_disabled;
61
62#endif /* __XIVE_INTERNAL_H */
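struct xive_ops is the whole contract between the common core and a platform backend; native.c above fills it with the OPAL-backed callbacks and hands it to xive_core_init() together with the TIMA mapping. A reduced, hypothetical registration sketch (the my_* names are placeholders for illustration, not part of the patch) looks like:

static const struct xive_ops my_backend_ops = {
	.populate_irq_data	= my_populate_irq_data,	/* fill a xive_irq_data from firmware */
	.configure_irq		= my_configure_irq,	/* set target/prio for a HW irq */
	.setup_queue		= my_setup_queue,	/* allocate and enable an event queue */
	.cleanup_queue		= my_cleanup_queue,	/* disable and free an event queue */
	.match			= my_match,		/* device-tree probe */
	.shutdown		= my_shutdown,		/* revert to firmware/emulation mode */
	.update_pending		= my_update_pending,	/* ack cycle, refresh pending_prio/cppr */
	.eoi			= my_eoi,		/* firmware-assisted EOI, when needed */
	.setup_cpu		= my_setup_cpu,
	.teardown_cpu		= my_teardown_cpu,
	.sync_source		= my_sync_source,
#ifdef CONFIG_SMP
	.get_ipi		= my_get_ipi,
	.put_ipi		= my_put_ipi,
#endif
	.name			= "my-backend",
};

static bool __init my_backend_init(void __iomem *tima, u32 offset, u8 max_prio)
{
	/* Register with the core; from here on ppc_md.get_irq points at xive_get_irq */
	return xive_core_init(&my_backend_ops, tima, offset, max_prio);
}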
diff --git a/arch/powerpc/scripts/gcc-check-mprofile-kernel.sh b/arch/powerpc/tools/gcc-check-mprofile-kernel.sh
index c658d8cf760b..c658d8cf760b 100755
--- a/arch/powerpc/scripts/gcc-check-mprofile-kernel.sh
+++ b/arch/powerpc/tools/gcc-check-mprofile-kernel.sh
diff --git a/arch/powerpc/relocs_check.sh b/arch/powerpc/tools/relocs_check.sh
index ec2d5c835170..ec2d5c835170 100755
--- a/arch/powerpc/relocs_check.sh
+++ b/arch/powerpc/tools/relocs_check.sh
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 16321ad9e70c..f11f65634aab 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -29,7 +29,9 @@
29#include <linux/nmi.h> 29#include <linux/nmi.h>
30#include <linux/ctype.h> 30#include <linux/ctype.h>
31 31
32#include <asm/debugfs.h>
32#include <asm/ptrace.h> 33#include <asm/ptrace.h>
34#include <asm/smp.h>
33#include <asm/string.h> 35#include <asm/string.h>
34#include <asm/prom.h> 36#include <asm/prom.h>
35#include <asm/machdep.h> 37#include <asm/machdep.h>
@@ -48,7 +50,7 @@
48#include <asm/reg.h> 50#include <asm/reg.h>
49#include <asm/debug.h> 51#include <asm/debug.h>
50#include <asm/hw_breakpoint.h> 52#include <asm/hw_breakpoint.h>
51 53#include <asm/xive.h>
52#include <asm/opal.h> 54#include <asm/opal.h>
53#include <asm/firmware.h> 55#include <asm/firmware.h>
54 56
@@ -76,6 +78,7 @@ static int xmon_gate;
76#endif /* CONFIG_SMP */ 78#endif /* CONFIG_SMP */
77 79
78static unsigned long in_xmon __read_mostly = 0; 80static unsigned long in_xmon __read_mostly = 0;
81static int xmon_on = IS_ENABLED(CONFIG_XMON_DEFAULT);
79 82
80static unsigned long adrs; 83static unsigned long adrs;
81static int size = 1; 84static int size = 1;
@@ -184,8 +187,6 @@ static void dump_tlb_44x(void);
184static void dump_tlb_book3e(void); 187static void dump_tlb_book3e(void);
185#endif 188#endif
186 189
187static int xmon_no_auto_backtrace;
188
189#ifdef CONFIG_PPC64 190#ifdef CONFIG_PPC64
190#define REG "%.16lx" 191#define REG "%.16lx"
191#else 192#else
@@ -232,7 +233,13 @@ Commands:\n\
232 "\ 233 "\
233 dr dump stream of raw bytes\n\ 234 dr dump stream of raw bytes\n\
234 dt dump the tracing buffers (uses printk)\n\ 235 dt dump the tracing buffers (uses printk)\n\
235 e print exception information\n\ 236"
237#ifdef CONFIG_PPC_POWERNV
238" dx# dump xive on CPU #\n\
239 dxi# dump xive irq state #\n\
240 dxa dump xive on all CPUs\n"
241#endif
242" e print exception information\n\
236 f flush cache\n\ 243 f flush cache\n\
237 la lookup symbol+offset of specified address\n\ 244 la lookup symbol+offset of specified address\n\
238 ls lookup address of specified symbol\n\ 245 ls lookup address of specified symbol\n\
@@ -411,7 +418,22 @@ int cpus_are_in_xmon(void)
411{ 418{
412 return !cpumask_empty(&cpus_in_xmon); 419 return !cpumask_empty(&cpus_in_xmon);
413} 420}
414#endif 421
422static bool wait_for_other_cpus(int ncpus)
423{
424 unsigned long timeout;
425
426 /* We wait for 2s, which is a metric "little while" */
427 for (timeout = 20000; timeout != 0; --timeout) {
428 if (cpumask_weight(&cpus_in_xmon) >= ncpus)
429 return true;
430 udelay(100);
431 barrier();
432 }
433
434 return false;
435}
436#endif /* CONFIG_SMP */
415 437
416static inline int unrecoverable_excp(struct pt_regs *regs) 438static inline int unrecoverable_excp(struct pt_regs *regs)
417{ 439{
@@ -433,7 +455,6 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
433#ifdef CONFIG_SMP 455#ifdef CONFIG_SMP
434 int cpu; 456 int cpu;
435 int secondary; 457 int secondary;
436 unsigned long timeout;
437#endif 458#endif
438 459
439 local_irq_save(flags); 460 local_irq_save(flags);
@@ -520,13 +541,17 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
520 xmon_owner = cpu; 541 xmon_owner = cpu;
521 mb(); 542 mb();
522 if (ncpus > 1) { 543 if (ncpus > 1) {
523 smp_send_debugger_break(); 544 /*
524 /* wait for other cpus to come in */ 545 * A system reset (trap == 0x100) can be triggered on
525 for (timeout = 100000000; timeout != 0; --timeout) { 546 * all CPUs, so when we come in via 0x100 try waiting
526 if (cpumask_weight(&cpus_in_xmon) >= ncpus) 547 * for the other CPUs to come in before we send the
527 break; 548 * debugger break (IPI). This is similar to
528 barrier(); 549 * crash_kexec_secondary().
529 } 550 */
551 if (TRAP(regs) != 0x100 || !wait_for_other_cpus(ncpus))
552 smp_send_debugger_break();
553
554 wait_for_other_cpus(ncpus);
530 } 555 }
531 remove_bpts(); 556 remove_bpts();
532 disable_surveillance(); 557 disable_surveillance();
@@ -884,10 +909,7 @@ cmds(struct pt_regs *excp)
884 last_cmd = NULL; 909 last_cmd = NULL;
885 xmon_regs = excp; 910 xmon_regs = excp;
886 911
887 if (!xmon_no_auto_backtrace) { 912 xmon_show_stack(excp->gpr[1], excp->link, excp->nip);
888 xmon_no_auto_backtrace = 1;
889 xmon_show_stack(excp->gpr[1], excp->link, excp->nip);
890 }
891 913
892 for(;;) { 914 for(;;) {
893#ifdef CONFIG_SMP 915#ifdef CONFIG_SMP
@@ -1347,9 +1369,19 @@ const char *getvecname(unsigned long vec)
1347 case 0x100: ret = "(System Reset)"; break; 1369 case 0x100: ret = "(System Reset)"; break;
1348 case 0x200: ret = "(Machine Check)"; break; 1370 case 0x200: ret = "(Machine Check)"; break;
1349 case 0x300: ret = "(Data Access)"; break; 1371 case 0x300: ret = "(Data Access)"; break;
1350 case 0x380: ret = "(Data SLB Access)"; break; 1372 case 0x380:
1373 if (radix_enabled())
1374 ret = "(Data Access Out of Range)";
1375 else
1376 ret = "(Data SLB Access)";
1377 break;
1351 case 0x400: ret = "(Instruction Access)"; break; 1378 case 0x400: ret = "(Instruction Access)"; break;
1352 case 0x480: ret = "(Instruction SLB Access)"; break; 1379 case 0x480:
1380 if (radix_enabled())
1381 ret = "(Instruction Access Out of Range)";
1382 else
1383 ret = "(Instruction SLB Access)";
1384 break;
1353 case 0x500: ret = "(Hardware Interrupt)"; break; 1385 case 0x500: ret = "(Hardware Interrupt)"; break;
1354 case 0x600: ret = "(Alignment)"; break; 1386 case 0x600: ret = "(Alignment)"; break;
1355 case 0x700: ret = "(Program Check)"; break; 1387 case 0x700: ret = "(Program Check)"; break;
@@ -2231,7 +2263,9 @@ static void dump_one_paca(int cpu)
2231 DUMP(p, kernel_msr, "lx"); 2263 DUMP(p, kernel_msr, "lx");
2232 DUMP(p, emergency_sp, "p"); 2264 DUMP(p, emergency_sp, "p");
2233#ifdef CONFIG_PPC_BOOK3S_64 2265#ifdef CONFIG_PPC_BOOK3S_64
2266 DUMP(p, nmi_emergency_sp, "p");
2234 DUMP(p, mc_emergency_sp, "p"); 2267 DUMP(p, mc_emergency_sp, "p");
2268 DUMP(p, in_nmi, "x");
2235 DUMP(p, in_mce, "x"); 2269 DUMP(p, in_mce, "x");
2236 DUMP(p, hmi_event_available, "x"); 2270 DUMP(p, hmi_event_available, "x");
2237#endif 2271#endif
@@ -2338,6 +2372,81 @@ static void dump_pacas(void)
2338} 2372}
2339#endif 2373#endif
2340 2374
2375#ifdef CONFIG_PPC_POWERNV
2376static void dump_one_xive(int cpu)
2377{
2378 unsigned int hwid = get_hard_smp_processor_id(cpu);
2379
2380 opal_xive_dump(XIVE_DUMP_TM_HYP, hwid);
2381 opal_xive_dump(XIVE_DUMP_TM_POOL, hwid);
2382 opal_xive_dump(XIVE_DUMP_TM_OS, hwid);
2383 opal_xive_dump(XIVE_DUMP_TM_USER, hwid);
2384 opal_xive_dump(XIVE_DUMP_VP, hwid);
2385 opal_xive_dump(XIVE_DUMP_EMU_STATE, hwid);
2386
2387 if (setjmp(bus_error_jmp) != 0) {
2388 catch_memory_errors = 0;
2389 printf("*** Error dumping xive on cpu %d\n", cpu);
2390 return;
2391 }
2392
2393 catch_memory_errors = 1;
2394 sync();
2395 xmon_xive_do_dump(cpu);
2396 sync();
2397 __delay(200);
2398 catch_memory_errors = 0;
2399}
2400
2401static void dump_all_xives(void)
2402{
2403 int cpu;
2404
2405 if (num_possible_cpus() == 0) {
2406 printf("No possible cpus, use 'dx #' to dump individual cpus\n");
2407 return;
2408 }
2409
2410 for_each_possible_cpu(cpu)
2411 dump_one_xive(cpu);
2412}
2413
2414static void dump_one_xive_irq(u32 num)
2415{
2416 s64 rc;
2417 __be64 vp;
2418 u8 prio;
2419 __be32 lirq;
2420
2421 rc = opal_xive_get_irq_config(num, &vp, &prio, &lirq);
2422 xmon_printf("IRQ 0x%x config: vp=0x%llx prio=%d lirq=0x%x (rc=%lld)\n",
2423 num, be64_to_cpu(vp), prio, be32_to_cpu(lirq), rc);
2424}
2425
2426static void dump_xives(void)
2427{
2428 unsigned long num;
2429 int c;
2430
2431 c = inchar();
2432 if (c == 'a') {
2433 dump_all_xives();
2434 return;
2435 } else if (c == 'i') {
2436 if (scanhex(&num))
2437 dump_one_xive_irq(num);
2438 return;
2439 }
2440
2441 termch = c; /* Put c back, it wasn't 'a' */
2442
2443 if (scanhex(&num))
2444 dump_one_xive(num);
2445 else
2446 dump_one_xive(xmon_owner);
2447}
2448#endif /* CONFIG_PPC_POWERNV */
2449
2341static void dump_by_size(unsigned long addr, long count, int size) 2450static void dump_by_size(unsigned long addr, long count, int size)
2342{ 2451{
2343 unsigned char temp[16]; 2452 unsigned char temp[16];
@@ -2386,6 +2495,14 @@ dump(void)
2386 return; 2495 return;
2387 } 2496 }
2388#endif 2497#endif
2498#ifdef CONFIG_PPC_POWERNV
2499 if (c == 'x') {
2500 xmon_start_pagination();
2501 dump_xives();
2502 xmon_end_pagination();
2503 return;
2504 }
2505#endif
2389 2506
2390 if (c == '\n') 2507 if (c == '\n')
2391 termch = c; 2508 termch = c;
@@ -3070,23 +3187,28 @@ void dump_segments(void)
3070 for (i = 0; i < mmu_slb_size; i++) { 3187 for (i = 0; i < mmu_slb_size; i++) {
3071 asm volatile("slbmfee %0,%1" : "=r" (esid) : "r" (i)); 3188 asm volatile("slbmfee %0,%1" : "=r" (esid) : "r" (i));
3072 asm volatile("slbmfev %0,%1" : "=r" (vsid) : "r" (i)); 3189 asm volatile("slbmfev %0,%1" : "=r" (vsid) : "r" (i));
3073 if (esid || vsid) { 3190
3074 printf("%02d %016lx %016lx", i, esid, vsid); 3191 if (!esid && !vsid)
3075 if (esid & SLB_ESID_V) { 3192 continue;
3076 llp = vsid & SLB_VSID_LLP; 3193
3077 if (vsid & SLB_VSID_B_1T) { 3194 printf("%02d %016lx %016lx", i, esid, vsid);
3078 printf(" 1T ESID=%9lx VSID=%13lx LLP:%3lx \n", 3195
3079 GET_ESID_1T(esid), 3196 if (!(esid & SLB_ESID_V)) {
3080 (vsid & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T, 3197 printf("\n");
3081 llp); 3198 continue;
3082 } else { 3199 }
3083 printf(" 256M ESID=%9lx VSID=%13lx LLP:%3lx \n", 3200
3084 GET_ESID(esid), 3201 llp = vsid & SLB_VSID_LLP;
3085 (vsid & ~SLB_VSID_B) >> SLB_VSID_SHIFT, 3202 if (vsid & SLB_VSID_B_1T) {
3086 llp); 3203 printf(" 1T ESID=%9lx VSID=%13lx LLP:%3lx \n",
3087 } 3204 GET_ESID_1T(esid),
3088 } else 3205 (vsid & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T,
3089 printf("\n"); 3206 llp);
3207 } else {
3208 printf(" 256M ESID=%9lx VSID=%13lx LLP:%3lx \n",
3209 GET_ESID(esid),
3210 (vsid & ~SLB_VSID_B) >> SLB_VSID_SHIFT,
3211 llp);
3090 } 3212 }
3091 } 3213 }
3092} 3214}
@@ -3302,6 +3424,8 @@ static void sysrq_handle_xmon(int key)
3302 /* ensure xmon is enabled */ 3424 /* ensure xmon is enabled */
3303 xmon_init(1); 3425 xmon_init(1);
3304 debugger(get_irq_regs()); 3426 debugger(get_irq_regs());
3427 if (!xmon_on)
3428 xmon_init(0);
3305} 3429}
3306 3430
3307static struct sysrq_key_op sysrq_xmon_op = { 3431static struct sysrq_key_op sysrq_xmon_op = {
@@ -3315,10 +3439,37 @@ static int __init setup_xmon_sysrq(void)
3315 register_sysrq_key('x', &sysrq_xmon_op); 3439 register_sysrq_key('x', &sysrq_xmon_op);
3316 return 0; 3440 return 0;
3317} 3441}
3318__initcall(setup_xmon_sysrq); 3442device_initcall(setup_xmon_sysrq);
3319#endif /* CONFIG_MAGIC_SYSRQ */ 3443#endif /* CONFIG_MAGIC_SYSRQ */
3320 3444
3321static int __initdata xmon_early, xmon_off; 3445#ifdef CONFIG_DEBUG_FS
3446static int xmon_dbgfs_set(void *data, u64 val)
3447{
3448 xmon_on = !!val;
3449 xmon_init(xmon_on);
3450
3451 return 0;
3452}
3453
3454static int xmon_dbgfs_get(void *data, u64 *val)
3455{
3456 *val = xmon_on;
3457 return 0;
3458}
3459
3460DEFINE_SIMPLE_ATTRIBUTE(xmon_dbgfs_ops, xmon_dbgfs_get,
3461 xmon_dbgfs_set, "%llu\n");
3462
3463static int __init setup_xmon_dbgfs(void)
3464{
3465 debugfs_create_file("xmon", 0600, powerpc_debugfs_root, NULL,
3466 &xmon_dbgfs_ops);
3467 return 0;
3468}
3469device_initcall(setup_xmon_dbgfs);
3470#endif /* CONFIG_DEBUG_FS */
3471
3472static int xmon_early __initdata;
3322 3473
3323static int __init early_parse_xmon(char *p) 3474static int __init early_parse_xmon(char *p)
3324{ 3475{
@@ -3326,12 +3477,12 @@ static int __init early_parse_xmon(char *p)
3326 /* just "xmon" is equivalent to "xmon=early" */ 3477 /* just "xmon" is equivalent to "xmon=early" */
3327 xmon_init(1); 3478 xmon_init(1);
3328 xmon_early = 1; 3479 xmon_early = 1;
3329 } else if (strncmp(p, "on", 2) == 0) 3480 xmon_on = 1;
3481 } else if (strncmp(p, "on", 2) == 0) {
3330 xmon_init(1); 3482 xmon_init(1);
3331 else if (strncmp(p, "off", 3) == 0) 3483 xmon_on = 1;
3332 xmon_off = 1; 3484 } else if (strncmp(p, "off", 3) == 0)
3333 else if (strncmp(p, "nobt", 4) == 0) 3485 xmon_on = 0;
3334 xmon_no_auto_backtrace = 1;
3335 else 3486 else
3336 return 1; 3487 return 1;
3337 3488
@@ -3341,10 +3492,8 @@ early_param("xmon", early_parse_xmon);
3341 3492
3342void __init xmon_setup(void) 3493void __init xmon_setup(void)
3343{ 3494{
3344#ifdef CONFIG_XMON_DEFAULT 3495 if (xmon_on)
3345 if (!xmon_off)
3346 xmon_init(1); 3496 xmon_init(1);
3347#endif
3348 if (xmon_early) 3497 if (xmon_early)
3349 debugger(NULL); 3498 debugger(NULL);
3350} 3499}
diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c
index bcc030eacab7..1a138c83f877 100644
--- a/drivers/misc/cxl/api.c
+++ b/drivers/misc/cxl/api.c
@@ -14,6 +14,7 @@
14#include <linux/msi.h> 14#include <linux/msi.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/mount.h> 16#include <linux/mount.h>
17#include <linux/sched/mm.h>
17 18
18#include "cxl.h" 19#include "cxl.h"
19 20
@@ -321,19 +322,29 @@ int cxl_start_context(struct cxl_context *ctx, u64 wed,
321 322
322 if (task) { 323 if (task) {
323 ctx->pid = get_task_pid(task, PIDTYPE_PID); 324 ctx->pid = get_task_pid(task, PIDTYPE_PID);
324 ctx->glpid = get_task_pid(task->group_leader, PIDTYPE_PID);
325 kernel = false; 325 kernel = false;
326 ctx->real_mode = false; 326 ctx->real_mode = false;
327
328 /* acquire a reference to the task's mm */
329 ctx->mm = get_task_mm(current);
330
331 /* ensure this mm_struct can't be freed */
332 cxl_context_mm_count_get(ctx);
333
334 /* decrement the use count */
335 if (ctx->mm)
336 mmput(ctx->mm);
327 } 337 }
328 338
329 cxl_ctx_get(); 339 cxl_ctx_get();
330 340
331 if ((rc = cxl_ops->attach_process(ctx, kernel, wed, 0))) { 341 if ((rc = cxl_ops->attach_process(ctx, kernel, wed, 0))) {
332 put_pid(ctx->glpid);
333 put_pid(ctx->pid); 342 put_pid(ctx->pid);
334 ctx->glpid = ctx->pid = NULL; 343 ctx->pid = NULL;
335 cxl_adapter_context_put(ctx->afu->adapter); 344 cxl_adapter_context_put(ctx->afu->adapter);
336 cxl_ctx_put(); 345 cxl_ctx_put();
346 if (task)
347 cxl_context_mm_count_put(ctx);
337 goto out; 348 goto out;
338 } 349 }
339 350
diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c
index 062bf6ca2625..4472ce11f98d 100644
--- a/drivers/misc/cxl/context.c
+++ b/drivers/misc/cxl/context.c
@@ -17,6 +17,7 @@
17#include <linux/debugfs.h> 17#include <linux/debugfs.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/idr.h> 19#include <linux/idr.h>
20#include <linux/sched/mm.h>
20#include <asm/cputable.h> 21#include <asm/cputable.h>
21#include <asm/current.h> 22#include <asm/current.h>
22#include <asm/copro.h> 23#include <asm/copro.h>
@@ -38,23 +39,26 @@ int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master)
38{ 39{
39 int i; 40 int i;
40 41
41 spin_lock_init(&ctx->sste_lock);
42 ctx->afu = afu; 42 ctx->afu = afu;
43 ctx->master = master; 43 ctx->master = master;
44 ctx->pid = ctx->glpid = NULL; /* Set in start work ioctl */ 44 ctx->pid = NULL; /* Set in start work ioctl */
45 mutex_init(&ctx->mapping_lock); 45 mutex_init(&ctx->mapping_lock);
46 ctx->mapping = NULL; 46 ctx->mapping = NULL;
47 47
48 /* 48 if (cxl_is_psl8(afu)) {
49 * Allocate the segment table before we put it in the IDR so that we 49 spin_lock_init(&ctx->sste_lock);
50 * can always access it when dereferenced from IDR. For the same 50
51 * reason, the segment table is only destroyed after the context is 51 /*
52 * removed from the IDR. Access to this in the IOCTL is protected by 52 * Allocate the segment table before we put it in the IDR so that we
 53 * Linux filesystem semantics (can't IOCTL until open is complete). 53 * can always access it when dereferenced from IDR. For the same
54 */ 54 * reason, the segment table is only destroyed after the context is
55 i = cxl_alloc_sst(ctx); 55 * removed from the IDR. Access to this in the IOCTL is protected by
 56 if (i) 56 * Linux filesystem semantics (can't IOCTL until open is complete).
57 return i; 57 */
58 i = cxl_alloc_sst(ctx);
59 if (i)
60 return i;
61 }
58 62
59 INIT_WORK(&ctx->fault_work, cxl_handle_fault); 63 INIT_WORK(&ctx->fault_work, cxl_handle_fault);
60 64
@@ -184,13 +188,26 @@ int cxl_context_iomap(struct cxl_context *ctx, struct vm_area_struct *vma)
184 if (ctx->afu->current_mode == CXL_MODE_DEDICATED) { 188 if (ctx->afu->current_mode == CXL_MODE_DEDICATED) {
185 if (start + len > ctx->afu->adapter->ps_size) 189 if (start + len > ctx->afu->adapter->ps_size)
186 return -EINVAL; 190 return -EINVAL;
191
192 if (cxl_is_psl9(ctx->afu)) {
193 /*
194 * Make sure there is a valid problem state
195 * area space for this AFU.
196 */
197 if (ctx->master && !ctx->afu->psa) {
198 pr_devel("AFU doesn't support mmio space\n");
199 return -EINVAL;
200 }
201
202 /* Can't mmap until the AFU is enabled */
203 if (!ctx->afu->enabled)
204 return -EBUSY;
205 }
187 } else { 206 } else {
188 if (start + len > ctx->psn_size) 207 if (start + len > ctx->psn_size)
189 return -EINVAL; 208 return -EINVAL;
190 }
191 209
192 if (ctx->afu->current_mode != CXL_MODE_DEDICATED) { 210 /* Make sure there is a valid per process space for this AFU */
193 /* make sure there is a valid per process space for this AFU */
194 if ((ctx->master && !ctx->afu->psa) || (!ctx->afu->pp_psa)) { 211 if ((ctx->master && !ctx->afu->psa) || (!ctx->afu->pp_psa)) {
195 pr_devel("AFU doesn't support mmio space\n"); 212 pr_devel("AFU doesn't support mmio space\n");
196 return -EINVAL; 213 return -EINVAL;
@@ -242,12 +259,16 @@ int __detach_context(struct cxl_context *ctx)
242 259
243 /* release the reference to the group leader and mm handling pid */ 260 /* release the reference to the group leader and mm handling pid */
244 put_pid(ctx->pid); 261 put_pid(ctx->pid);
245 put_pid(ctx->glpid);
246 262
247 cxl_ctx_put(); 263 cxl_ctx_put();
248 264
249 /* Decrease the attached context count on the adapter */ 265 /* Decrease the attached context count on the adapter */
250 cxl_adapter_context_put(ctx->afu->adapter); 266 cxl_adapter_context_put(ctx->afu->adapter);
267
268 /* Decrease the mm count on the context */
269 cxl_context_mm_count_put(ctx);
270 ctx->mm = NULL;
271
251 return 0; 272 return 0;
252} 273}
253 274
@@ -303,7 +324,8 @@ static void reclaim_ctx(struct rcu_head *rcu)
303{ 324{
304 struct cxl_context *ctx = container_of(rcu, struct cxl_context, rcu); 325 struct cxl_context *ctx = container_of(rcu, struct cxl_context, rcu);
305 326
306 free_page((u64)ctx->sstp); 327 if (cxl_is_psl8(ctx->afu))
328 free_page((u64)ctx->sstp);
307 if (ctx->ff_page) 329 if (ctx->ff_page)
308 __free_page(ctx->ff_page); 330 __free_page(ctx->ff_page);
309 ctx->sstp = NULL; 331 ctx->sstp = NULL;
@@ -325,3 +347,15 @@ void cxl_context_free(struct cxl_context *ctx)
325 mutex_unlock(&ctx->afu->contexts_lock); 347 mutex_unlock(&ctx->afu->contexts_lock);
326 call_rcu(&ctx->rcu, reclaim_ctx); 348 call_rcu(&ctx->rcu, reclaim_ctx);
327} 349}
350
351void cxl_context_mm_count_get(struct cxl_context *ctx)
352{
353 if (ctx->mm)
354 atomic_inc(&ctx->mm->mm_count);
355}
356
357void cxl_context_mm_count_put(struct cxl_context *ctx)
358{
359 if (ctx->mm)
360 mmdrop(ctx->mm);
361}
diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index 79e60ec70bd3..c8568ea7c518 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -63,7 +63,7 @@ typedef struct {
63/* Memory maps. Ref CXL Appendix A */ 63/* Memory maps. Ref CXL Appendix A */
64 64
65/* PSL Privilege 1 Memory Map */ 65/* PSL Privilege 1 Memory Map */
66/* Configuration and Control area */ 66/* Configuration and Control area - CAIA 1&2 */
67static const cxl_p1_reg_t CXL_PSL_CtxTime = {0x0000}; 67static const cxl_p1_reg_t CXL_PSL_CtxTime = {0x0000};
68static const cxl_p1_reg_t CXL_PSL_ErrIVTE = {0x0008}; 68static const cxl_p1_reg_t CXL_PSL_ErrIVTE = {0x0008};
69static const cxl_p1_reg_t CXL_PSL_KEY1 = {0x0010}; 69static const cxl_p1_reg_t CXL_PSL_KEY1 = {0x0010};
@@ -73,7 +73,7 @@ static const cxl_p1_reg_t CXL_PSL_Control = {0x0020};
73static const cxl_p1_reg_t CXL_PSL_DLCNTL = {0x0060}; 73static const cxl_p1_reg_t CXL_PSL_DLCNTL = {0x0060};
74static const cxl_p1_reg_t CXL_PSL_DLADDR = {0x0068}; 74static const cxl_p1_reg_t CXL_PSL_DLADDR = {0x0068};
75 75
76/* PSL Lookaside Buffer Management Area */ 76/* PSL Lookaside Buffer Management Area - CAIA 1 */
77static const cxl_p1_reg_t CXL_PSL_LBISEL = {0x0080}; 77static const cxl_p1_reg_t CXL_PSL_LBISEL = {0x0080};
78static const cxl_p1_reg_t CXL_PSL_SLBIE = {0x0088}; 78static const cxl_p1_reg_t CXL_PSL_SLBIE = {0x0088};
79static const cxl_p1_reg_t CXL_PSL_SLBIA = {0x0090}; 79static const cxl_p1_reg_t CXL_PSL_SLBIA = {0x0090};
@@ -82,7 +82,7 @@ static const cxl_p1_reg_t CXL_PSL_TLBIA = {0x00A8};
82static const cxl_p1_reg_t CXL_PSL_AFUSEL = {0x00B0}; 82static const cxl_p1_reg_t CXL_PSL_AFUSEL = {0x00B0};
83 83
84/* 0x00C0:7EFF Implementation dependent area */ 84/* 0x00C0:7EFF Implementation dependent area */
85/* PSL registers */ 85/* PSL registers - CAIA 1 */
86static const cxl_p1_reg_t CXL_PSL_FIR1 = {0x0100}; 86static const cxl_p1_reg_t CXL_PSL_FIR1 = {0x0100};
87static const cxl_p1_reg_t CXL_PSL_FIR2 = {0x0108}; 87static const cxl_p1_reg_t CXL_PSL_FIR2 = {0x0108};
88static const cxl_p1_reg_t CXL_PSL_Timebase = {0x0110}; 88static const cxl_p1_reg_t CXL_PSL_Timebase = {0x0110};
@@ -98,61 +98,83 @@ static const cxl_p1_reg_t CXL_XSL_Timebase = {0x0100};
98static const cxl_p1_reg_t CXL_XSL_TB_CTLSTAT = {0x0108}; 98static const cxl_p1_reg_t CXL_XSL_TB_CTLSTAT = {0x0108};
99static const cxl_p1_reg_t CXL_XSL_FEC = {0x0158}; 99static const cxl_p1_reg_t CXL_XSL_FEC = {0x0158};
100static const cxl_p1_reg_t CXL_XSL_DSNCTL = {0x0168}; 100static const cxl_p1_reg_t CXL_XSL_DSNCTL = {0x0168};
101/* PSL registers - CAIA 2 */
102static const cxl_p1_reg_t CXL_PSL9_CONTROL = {0x0020};
103static const cxl_p1_reg_t CXL_XSL9_DSNCTL = {0x0168};
104static const cxl_p1_reg_t CXL_PSL9_FIR1 = {0x0300};
105static const cxl_p1_reg_t CXL_PSL9_FIR2 = {0x0308};
106static const cxl_p1_reg_t CXL_PSL9_Timebase = {0x0310};
107static const cxl_p1_reg_t CXL_PSL9_DEBUG = {0x0320};
108static const cxl_p1_reg_t CXL_PSL9_FIR_CNTL = {0x0348};
109static const cxl_p1_reg_t CXL_PSL9_DSNDCTL = {0x0350};
110static const cxl_p1_reg_t CXL_PSL9_TB_CTLSTAT = {0x0340};
111static const cxl_p1_reg_t CXL_PSL9_TRACECFG = {0x0368};
112static const cxl_p1_reg_t CXL_PSL9_APCDEDALLOC = {0x0378};
113static const cxl_p1_reg_t CXL_PSL9_APCDEDTYPE = {0x0380};
114static const cxl_p1_reg_t CXL_PSL9_TNR_ADDR = {0x0388};
115static const cxl_p1_reg_t CXL_PSL9_GP_CT = {0x0398};
116static const cxl_p1_reg_t CXL_XSL9_IERAT = {0x0588};
117static const cxl_p1_reg_t CXL_XSL9_ILPP = {0x0590};
118
101/* 0x7F00:7FFF Reserved PCIe MSI-X Pending Bit Array area */ 119/* 0x7F00:7FFF Reserved PCIe MSI-X Pending Bit Array area */
102/* 0x8000:FFFF Reserved PCIe MSI-X Table Area */ 120/* 0x8000:FFFF Reserved PCIe MSI-X Table Area */
103 121
104/* PSL Slice Privilege 1 Memory Map */ 122/* PSL Slice Privilege 1 Memory Map */
105/* Configuration Area */ 123/* Configuration Area - CAIA 1&2 */
106static const cxl_p1n_reg_t CXL_PSL_SR_An = {0x00}; 124static const cxl_p1n_reg_t CXL_PSL_SR_An = {0x00};
107static const cxl_p1n_reg_t CXL_PSL_LPID_An = {0x08}; 125static const cxl_p1n_reg_t CXL_PSL_LPID_An = {0x08};
108static const cxl_p1n_reg_t CXL_PSL_AMBAR_An = {0x10}; 126static const cxl_p1n_reg_t CXL_PSL_AMBAR_An = {0x10};
109static const cxl_p1n_reg_t CXL_PSL_SPOffset_An = {0x18}; 127static const cxl_p1n_reg_t CXL_PSL_SPOffset_An = {0x18};
110static const cxl_p1n_reg_t CXL_PSL_ID_An = {0x20}; 128static const cxl_p1n_reg_t CXL_PSL_ID_An = {0x20};
111static const cxl_p1n_reg_t CXL_PSL_SERR_An = {0x28}; 129static const cxl_p1n_reg_t CXL_PSL_SERR_An = {0x28};
112/* Memory Management and Lookaside Buffer Management */ 130/* Memory Management and Lookaside Buffer Management - CAIA 1*/
113static const cxl_p1n_reg_t CXL_PSL_SDR_An = {0x30}; 131static const cxl_p1n_reg_t CXL_PSL_SDR_An = {0x30};
132/* Memory Management and Lookaside Buffer Management - CAIA 1&2 */
114static const cxl_p1n_reg_t CXL_PSL_AMOR_An = {0x38}; 133static const cxl_p1n_reg_t CXL_PSL_AMOR_An = {0x38};
115/* Pointer Area */ 134/* Pointer Area - CAIA 1&2 */
116static const cxl_p1n_reg_t CXL_HAURP_An = {0x80}; 135static const cxl_p1n_reg_t CXL_HAURP_An = {0x80};
117static const cxl_p1n_reg_t CXL_PSL_SPAP_An = {0x88}; 136static const cxl_p1n_reg_t CXL_PSL_SPAP_An = {0x88};
118static const cxl_p1n_reg_t CXL_PSL_LLCMD_An = {0x90}; 137static const cxl_p1n_reg_t CXL_PSL_LLCMD_An = {0x90};
119/* Control Area */ 138/* Control Area - CAIA 1&2 */
120static const cxl_p1n_reg_t CXL_PSL_SCNTL_An = {0xA0}; 139static const cxl_p1n_reg_t CXL_PSL_SCNTL_An = {0xA0};
121static const cxl_p1n_reg_t CXL_PSL_CtxTime_An = {0xA8}; 140static const cxl_p1n_reg_t CXL_PSL_CtxTime_An = {0xA8};
122static const cxl_p1n_reg_t CXL_PSL_IVTE_Offset_An = {0xB0}; 141static const cxl_p1n_reg_t CXL_PSL_IVTE_Offset_An = {0xB0};
123static const cxl_p1n_reg_t CXL_PSL_IVTE_Limit_An = {0xB8}; 142static const cxl_p1n_reg_t CXL_PSL_IVTE_Limit_An = {0xB8};
124/* 0xC0:FF Implementation Dependent Area */ 143/* 0xC0:FF Implementation Dependent Area - CAIA 1&2 */
125static const cxl_p1n_reg_t CXL_PSL_FIR_SLICE_An = {0xC0}; 144static const cxl_p1n_reg_t CXL_PSL_FIR_SLICE_An = {0xC0};
126static const cxl_p1n_reg_t CXL_AFU_DEBUG_An = {0xC8}; 145static const cxl_p1n_reg_t CXL_AFU_DEBUG_An = {0xC8};
146/* 0xC0:FF Implementation Dependent Area - CAIA 1 */
127static const cxl_p1n_reg_t CXL_PSL_APCALLOC_A = {0xD0}; 147static const cxl_p1n_reg_t CXL_PSL_APCALLOC_A = {0xD0};
128static const cxl_p1n_reg_t CXL_PSL_COALLOC_A = {0xD8}; 148static const cxl_p1n_reg_t CXL_PSL_COALLOC_A = {0xD8};
129static const cxl_p1n_reg_t CXL_PSL_RXCTL_A = {0xE0}; 149static const cxl_p1n_reg_t CXL_PSL_RXCTL_A = {0xE0};
130static const cxl_p1n_reg_t CXL_PSL_SLICE_TRACE = {0xE8}; 150static const cxl_p1n_reg_t CXL_PSL_SLICE_TRACE = {0xE8};
131 151
132/* PSL Slice Privilege 2 Memory Map */ 152/* PSL Slice Privilege 2 Memory Map */
133/* Configuration and Control Area */ 153/* Configuration and Control Area - CAIA 1&2 */
134static const cxl_p2n_reg_t CXL_PSL_PID_TID_An = {0x000}; 154static const cxl_p2n_reg_t CXL_PSL_PID_TID_An = {0x000};
135static const cxl_p2n_reg_t CXL_CSRP_An = {0x008}; 155static const cxl_p2n_reg_t CXL_CSRP_An = {0x008};
156/* Configuration and Control Area - CAIA 1 */
136static const cxl_p2n_reg_t CXL_AURP0_An = {0x010}; 157static const cxl_p2n_reg_t CXL_AURP0_An = {0x010};
137static const cxl_p2n_reg_t CXL_AURP1_An = {0x018}; 158static const cxl_p2n_reg_t CXL_AURP1_An = {0x018};
138static const cxl_p2n_reg_t CXL_SSTP0_An = {0x020}; 159static const cxl_p2n_reg_t CXL_SSTP0_An = {0x020};
139static const cxl_p2n_reg_t CXL_SSTP1_An = {0x028}; 160static const cxl_p2n_reg_t CXL_SSTP1_An = {0x028};
161/* Configuration and Control Area - CAIA 1 */
140static const cxl_p2n_reg_t CXL_PSL_AMR_An = {0x030}; 162static const cxl_p2n_reg_t CXL_PSL_AMR_An = {0x030};
141/* Segment Lookaside Buffer Management */ 163/* Segment Lookaside Buffer Management - CAIA 1 */
142static const cxl_p2n_reg_t CXL_SLBIE_An = {0x040}; 164static const cxl_p2n_reg_t CXL_SLBIE_An = {0x040};
143static const cxl_p2n_reg_t CXL_SLBIA_An = {0x048}; 165static const cxl_p2n_reg_t CXL_SLBIA_An = {0x048};
144static const cxl_p2n_reg_t CXL_SLBI_Select_An = {0x050}; 166static const cxl_p2n_reg_t CXL_SLBI_Select_An = {0x050};
145/* Interrupt Registers */ 167/* Interrupt Registers - CAIA 1&2 */
146static const cxl_p2n_reg_t CXL_PSL_DSISR_An = {0x060}; 168static const cxl_p2n_reg_t CXL_PSL_DSISR_An = {0x060};
147static const cxl_p2n_reg_t CXL_PSL_DAR_An = {0x068}; 169static const cxl_p2n_reg_t CXL_PSL_DAR_An = {0x068};
148static const cxl_p2n_reg_t CXL_PSL_DSR_An = {0x070}; 170static const cxl_p2n_reg_t CXL_PSL_DSR_An = {0x070};
149static const cxl_p2n_reg_t CXL_PSL_TFC_An = {0x078}; 171static const cxl_p2n_reg_t CXL_PSL_TFC_An = {0x078};
150static const cxl_p2n_reg_t CXL_PSL_PEHandle_An = {0x080}; 172static const cxl_p2n_reg_t CXL_PSL_PEHandle_An = {0x080};
151static const cxl_p2n_reg_t CXL_PSL_ErrStat_An = {0x088}; 173static const cxl_p2n_reg_t CXL_PSL_ErrStat_An = {0x088};
152/* AFU Registers */ 174/* AFU Registers - CAIA 1&2 */
153static const cxl_p2n_reg_t CXL_AFU_Cntl_An = {0x090}; 175static const cxl_p2n_reg_t CXL_AFU_Cntl_An = {0x090};
154static const cxl_p2n_reg_t CXL_AFU_ERR_An = {0x098}; 176static const cxl_p2n_reg_t CXL_AFU_ERR_An = {0x098};
155/* Work Element Descriptor */ 177/* Work Element Descriptor - CAIA 1&2 */
156static const cxl_p2n_reg_t CXL_PSL_WED_An = {0x0A0}; 178static const cxl_p2n_reg_t CXL_PSL_WED_An = {0x0A0};
157/* 0x0C0:FFF Implementation Dependent Area */ 179/* 0x0C0:FFF Implementation Dependent Area */
158 180
@@ -179,6 +201,10 @@ static const cxl_p2n_reg_t CXL_PSL_WED_An = {0x0A0};
179#define CXL_PSL_SR_An_SF MSR_SF /* 64bit */ 201#define CXL_PSL_SR_An_SF MSR_SF /* 64bit */
180#define CXL_PSL_SR_An_TA (1ull << (63-1)) /* Tags active, GA1: 0 */ 202#define CXL_PSL_SR_An_TA (1ull << (63-1)) /* Tags active, GA1: 0 */
181#define CXL_PSL_SR_An_HV MSR_HV /* Hypervisor, GA1: 0 */ 203#define CXL_PSL_SR_An_HV MSR_HV /* Hypervisor, GA1: 0 */
204#define CXL_PSL_SR_An_XLAT_hpt (0ull << (63-6))/* Hashed page table (HPT) mode */
205#define CXL_PSL_SR_An_XLAT_roh (2ull << (63-6))/* Radix on HPT mode */
206#define CXL_PSL_SR_An_XLAT_ror (3ull << (63-6))/* Radix on Radix mode */
207#define CXL_PSL_SR_An_BOT (1ull << (63-10)) /* Use the in-memory segment table */
182#define CXL_PSL_SR_An_PR MSR_PR /* Problem state, GA1: 1 */ 208#define CXL_PSL_SR_An_PR MSR_PR /* Problem state, GA1: 1 */
183#define CXL_PSL_SR_An_ISL (1ull << (63-53)) /* Ignore Segment Large Page */ 209#define CXL_PSL_SR_An_ISL (1ull << (63-53)) /* Ignore Segment Large Page */
184#define CXL_PSL_SR_An_TC (1ull << (63-54)) /* Page Table secondary hash */ 210#define CXL_PSL_SR_An_TC (1ull << (63-54)) /* Page Table secondary hash */
@@ -202,6 +228,24 @@ static const cxl_p2n_reg_t CXL_PSL_WED_An = {0x0A0};
202#define CXL_PSL_SERR_An_llcmdto (1ull << (63-6)) 228#define CXL_PSL_SERR_An_llcmdto (1ull << (63-6))
203#define CXL_PSL_SERR_An_afupar (1ull << (63-7)) 229#define CXL_PSL_SERR_An_afupar (1ull << (63-7))
204#define CXL_PSL_SERR_An_afudup (1ull << (63-8)) 230#define CXL_PSL_SERR_An_afudup (1ull << (63-8))
231#define CXL_PSL_SERR_An_IRQS ( \
232 CXL_PSL_SERR_An_afuto | CXL_PSL_SERR_An_afudis | CXL_PSL_SERR_An_afuov | \
233 CXL_PSL_SERR_An_badsrc | CXL_PSL_SERR_An_badctx | CXL_PSL_SERR_An_llcmdis | \
234 CXL_PSL_SERR_An_llcmdto | CXL_PSL_SERR_An_afupar | CXL_PSL_SERR_An_afudup)
235#define CXL_PSL_SERR_An_afuto_mask (1ull << (63-32))
236#define CXL_PSL_SERR_An_afudis_mask (1ull << (63-33))
237#define CXL_PSL_SERR_An_afuov_mask (1ull << (63-34))
238#define CXL_PSL_SERR_An_badsrc_mask (1ull << (63-35))
239#define CXL_PSL_SERR_An_badctx_mask (1ull << (63-36))
240#define CXL_PSL_SERR_An_llcmdis_mask (1ull << (63-37))
241#define CXL_PSL_SERR_An_llcmdto_mask (1ull << (63-38))
242#define CXL_PSL_SERR_An_afupar_mask (1ull << (63-39))
243#define CXL_PSL_SERR_An_afudup_mask (1ull << (63-40))
244#define CXL_PSL_SERR_An_IRQ_MASKS ( \
245 CXL_PSL_SERR_An_afuto_mask | CXL_PSL_SERR_An_afudis_mask | CXL_PSL_SERR_An_afuov_mask | \
246 CXL_PSL_SERR_An_badsrc_mask | CXL_PSL_SERR_An_badctx_mask | CXL_PSL_SERR_An_llcmdis_mask | \
247 CXL_PSL_SERR_An_llcmdto_mask | CXL_PSL_SERR_An_afupar_mask | CXL_PSL_SERR_An_afudup_mask)
248
205#define CXL_PSL_SERR_An_AE (1ull << (63-30)) 249#define CXL_PSL_SERR_An_AE (1ull << (63-30))
206 250
207/****** CXL_PSL_SCNTL_An ****************************************************/ 251/****** CXL_PSL_SCNTL_An ****************************************************/
@@ -257,7 +301,7 @@ static const cxl_p2n_reg_t CXL_PSL_WED_An = {0x0A0};
257#define CXL_SSTP1_An_STVA_L_MASK (~((1ull << (63-55))-1)) 301#define CXL_SSTP1_An_STVA_L_MASK (~((1ull << (63-55))-1))
258#define CXL_SSTP1_An_V (1ull << (63-63)) 302#define CXL_SSTP1_An_V (1ull << (63-63))
259 303
260/****** CXL_PSL_SLBIE_[An] **************************************************/ 304/****** CXL_PSL_SLBIE_[An] - CAIA 1 **************************************************/
261/* write: */ 305/* write: */
262#define CXL_SLBIE_C PPC_BIT(36) /* Class */ 306#define CXL_SLBIE_C PPC_BIT(36) /* Class */
263#define CXL_SLBIE_SS PPC_BITMASK(37, 38) /* Segment Size */ 307#define CXL_SLBIE_SS PPC_BITMASK(37, 38) /* Segment Size */
@@ -267,10 +311,10 @@ static const cxl_p2n_reg_t CXL_PSL_WED_An = {0x0A0};
267#define CXL_SLBIE_MAX PPC_BITMASK(24, 31) 311#define CXL_SLBIE_MAX PPC_BITMASK(24, 31)
268#define CXL_SLBIE_PENDING PPC_BITMASK(56, 63) 312#define CXL_SLBIE_PENDING PPC_BITMASK(56, 63)
269 313
270/****** Common to all CXL_TLBIA/SLBIA_[An] **********************************/ 314/****** Common to all CXL_TLBIA/SLBIA_[An] - CAIA 1 **********************************/
271#define CXL_TLB_SLB_P (1ull) /* Pending (read) */ 315#define CXL_TLB_SLB_P (1ull) /* Pending (read) */
272 316
273/****** Common to all CXL_TLB/SLB_IA/IE_[An] registers **********************/ 317/****** Common to all CXL_TLB/SLB_IA/IE_[An] registers - CAIA 1 **********************/
274#define CXL_TLB_SLB_IQ_ALL (0ull) /* Inv qualifier */ 318#define CXL_TLB_SLB_IQ_ALL (0ull) /* Inv qualifier */
275#define CXL_TLB_SLB_IQ_LPID (1ull) /* Inv qualifier */ 319#define CXL_TLB_SLB_IQ_LPID (1ull) /* Inv qualifier */
276#define CXL_TLB_SLB_IQ_LPIDPID (3ull) /* Inv qualifier */ 320#define CXL_TLB_SLB_IQ_LPIDPID (3ull) /* Inv qualifier */
@@ -278,7 +322,7 @@ static const cxl_p2n_reg_t CXL_PSL_WED_An = {0x0A0};
278/****** CXL_PSL_AFUSEL ******************************************************/ 322/****** CXL_PSL_AFUSEL ******************************************************/
279#define CXL_PSL_AFUSEL_A (1ull << (63-55)) /* Adapter wide invalidates affect all AFUs */ 323#define CXL_PSL_AFUSEL_A (1ull << (63-55)) /* Adapter wide invalidates affect all AFUs */
280 324
281/****** CXL_PSL_DSISR_An ****************************************************/ 325/****** CXL_PSL_DSISR_An - CAIA 1 ****************************************************/
282#define CXL_PSL_DSISR_An_DS (1ull << (63-0)) /* Segment not found */ 326#define CXL_PSL_DSISR_An_DS (1ull << (63-0)) /* Segment not found */
283#define CXL_PSL_DSISR_An_DM (1ull << (63-1)) /* PTE not found (See also: M) or protection fault */ 327#define CXL_PSL_DSISR_An_DM (1ull << (63-1)) /* PTE not found (See also: M) or protection fault */
284#define CXL_PSL_DSISR_An_ST (1ull << (63-2)) /* Segment Table PTE not found */ 328#define CXL_PSL_DSISR_An_ST (1ull << (63-2)) /* Segment Table PTE not found */
@@ -295,12 +339,39 @@ static const cxl_p2n_reg_t CXL_PSL_WED_An = {0x0A0};
295#define CXL_PSL_DSISR_An_S DSISR_ISSTORE /* Access was afu_wr or afu_zero */ 339#define CXL_PSL_DSISR_An_S DSISR_ISSTORE /* Access was afu_wr or afu_zero */
296#define CXL_PSL_DSISR_An_K DSISR_KEYFAULT /* Access not permitted by virtual page class key protection */ 340#define CXL_PSL_DSISR_An_K DSISR_KEYFAULT /* Access not permitted by virtual page class key protection */
297 341
342/****** CXL_PSL_DSISR_An - CAIA 2 ****************************************************/
343#define CXL_PSL9_DSISR_An_TF (1ull << (63-3)) /* Translation fault */
344#define CXL_PSL9_DSISR_An_PE (1ull << (63-4)) /* PSL Error (implementation specific) */
345#define CXL_PSL9_DSISR_An_AE (1ull << (63-5)) /* AFU Error */
346#define CXL_PSL9_DSISR_An_OC (1ull << (63-6)) /* OS Context Warning */
347#define CXL_PSL9_DSISR_An_S (1ull << (63-38)) /* TF for a write operation */
348#define CXL_PSL9_DSISR_PENDING (CXL_PSL9_DSISR_An_TF | CXL_PSL9_DSISR_An_PE | CXL_PSL9_DSISR_An_AE | CXL_PSL9_DSISR_An_OC)
349/*
350 * NOTE: Bits 56:63 (Checkout Response Status) are valid when DSISR_An[TF] = 1
351 * Status (0:7) Encoding
352 */
353#define CXL_PSL9_DSISR_An_CO_MASK 0x00000000000000ffULL
354#define CXL_PSL9_DSISR_An_SF 0x0000000000000080ULL /* Segment Fault 0b10000000 */
355#define CXL_PSL9_DSISR_An_PF_SLR 0x0000000000000088ULL /* PTE not found (Single Level Radix) 0b10001000 */
356#define CXL_PSL9_DSISR_An_PF_RGC 0x000000000000008CULL /* PTE not found (Radix Guest (child)) 0b10001100 */
357#define CXL_PSL9_DSISR_An_PF_RGP 0x0000000000000090ULL /* PTE not found (Radix Guest (parent)) 0b10010000 */
358#define CXL_PSL9_DSISR_An_PF_HRH 0x0000000000000094ULL /* PTE not found (HPT/Radix Host) 0b10010100 */
359#define CXL_PSL9_DSISR_An_PF_STEG 0x000000000000009CULL /* PTE not found (STEG VA) 0b10011100 */
360
298/****** CXL_PSL_TFC_An ******************************************************/ 361/****** CXL_PSL_TFC_An ******************************************************/
299#define CXL_PSL_TFC_An_A (1ull << (63-28)) /* Acknowledge non-translation fault */ 362#define CXL_PSL_TFC_An_A (1ull << (63-28)) /* Acknowledge non-translation fault */
300#define CXL_PSL_TFC_An_C (1ull << (63-29)) /* Continue (abort transaction) */ 363#define CXL_PSL_TFC_An_C (1ull << (63-29)) /* Continue (abort transaction) */
301#define CXL_PSL_TFC_An_AE (1ull << (63-30)) /* Restart PSL with address error */ 364#define CXL_PSL_TFC_An_AE (1ull << (63-30)) /* Restart PSL with address error */
302#define CXL_PSL_TFC_An_R (1ull << (63-31)) /* Restart PSL transaction */ 365#define CXL_PSL_TFC_An_R (1ull << (63-31)) /* Restart PSL transaction */
303 366
367/****** CXL_XSL9_IERAT_ERAT - CAIA 2 **********************************/
368#define CXL_XSL9_IERAT_MLPID (1ull << (63-0)) /* Match LPID */
369#define CXL_XSL9_IERAT_MPID (1ull << (63-1)) /* Match PID */
370#define CXL_XSL9_IERAT_PRS (1ull << (63-4)) /* PRS bit for Radix invalidations */
371#define CXL_XSL9_IERAT_INVR (1ull << (63-3)) /* Invalidate Radix */
372#define CXL_XSL9_IERAT_IALL (1ull << (63-8)) /* Invalidate All */
373#define CXL_XSL9_IERAT_IINPROG (1ull << (63-63)) /* Invalidate in progress */
374
304/* cxl_process_element->software_status */ 375/* cxl_process_element->software_status */
305#define CXL_PE_SOFTWARE_STATE_V (1ul << (31 - 0)) /* Valid */ 376#define CXL_PE_SOFTWARE_STATE_V (1ul << (31 - 0)) /* Valid */
306#define CXL_PE_SOFTWARE_STATE_C (1ul << (31 - 29)) /* Complete */ 377#define CXL_PE_SOFTWARE_STATE_C (1ul << (31 - 29)) /* Complete */
@@ -482,8 +553,6 @@ struct cxl_context {
482 unsigned int sst_size, sst_lru; 553 unsigned int sst_size, sst_lru;
483 554
484 wait_queue_head_t wq; 555 wait_queue_head_t wq;
485 /* pid of the group leader associated with the pid */
486 struct pid *glpid;
487 /* use mm context associated with this pid for ds faults */ 556 /* use mm context associated with this pid for ds faults */
488 struct pid *pid; 557 struct pid *pid;
489 spinlock_t lock; /* Protects pending_irq_mask, pending_fault and fault_addr */ 558 spinlock_t lock; /* Protects pending_irq_mask, pending_fault and fault_addr */
@@ -551,15 +620,27 @@ struct cxl_context {
551 * CX4 only: 620 * CX4 only:
552 */ 621 */
553 struct list_head extra_irq_contexts; 622 struct list_head extra_irq_contexts;
623
624 struct mm_struct *mm;
554}; 625};
555 626
627struct cxl_irq_info;
628
556struct cxl_service_layer_ops { 629struct cxl_service_layer_ops {
557 int (*adapter_regs_init)(struct cxl *adapter, struct pci_dev *dev); 630 int (*adapter_regs_init)(struct cxl *adapter, struct pci_dev *dev);
631 int (*invalidate_all)(struct cxl *adapter);
558 int (*afu_regs_init)(struct cxl_afu *afu); 632 int (*afu_regs_init)(struct cxl_afu *afu);
633 int (*sanitise_afu_regs)(struct cxl_afu *afu);
559 int (*register_serr_irq)(struct cxl_afu *afu); 634 int (*register_serr_irq)(struct cxl_afu *afu);
560 void (*release_serr_irq)(struct cxl_afu *afu); 635 void (*release_serr_irq)(struct cxl_afu *afu);
561 void (*debugfs_add_adapter_sl_regs)(struct cxl *adapter, struct dentry *dir); 636 irqreturn_t (*handle_interrupt)(int irq, struct cxl_context *ctx, struct cxl_irq_info *irq_info);
562 void (*debugfs_add_afu_sl_regs)(struct cxl_afu *afu, struct dentry *dir); 637 irqreturn_t (*fail_irq)(struct cxl_afu *afu, struct cxl_irq_info *irq_info);
638 int (*activate_dedicated_process)(struct cxl_afu *afu);
639 int (*attach_afu_directed)(struct cxl_context *ctx, u64 wed, u64 amr);
640 int (*attach_dedicated_process)(struct cxl_context *ctx, u64 wed, u64 amr);
641 void (*update_dedicated_ivtes)(struct cxl_context *ctx);
642 void (*debugfs_add_adapter_regs)(struct cxl *adapter, struct dentry *dir);
643 void (*debugfs_add_afu_regs)(struct cxl_afu *afu, struct dentry *dir);
563 void (*psl_irq_dump_registers)(struct cxl_context *ctx); 644 void (*psl_irq_dump_registers)(struct cxl_context *ctx);
564 void (*err_irq_dump_registers)(struct cxl *adapter); 645 void (*err_irq_dump_registers)(struct cxl *adapter);
565 void (*debugfs_stop_trace)(struct cxl *adapter); 646 void (*debugfs_stop_trace)(struct cxl *adapter);
@@ -641,25 +722,38 @@ int cxl_pci_reset(struct cxl *adapter);
641void cxl_pci_release_afu(struct device *dev); 722void cxl_pci_release_afu(struct device *dev);
642ssize_t cxl_pci_read_adapter_vpd(struct cxl *adapter, void *buf, size_t len); 723ssize_t cxl_pci_read_adapter_vpd(struct cxl *adapter, void *buf, size_t len);
643 724
644/* common == phyp + powernv */ 725/* common == phyp + powernv - CAIA 1&2 */
645struct cxl_process_element_common { 726struct cxl_process_element_common {
646 __be32 tid; 727 __be32 tid;
647 __be32 pid; 728 __be32 pid;
648 __be64 csrp; 729 __be64 csrp;
649 __be64 aurp0; 730 union {
650 __be64 aurp1; 731 struct {
651 __be64 sstp0; 732 __be64 aurp0;
652 __be64 sstp1; 733 __be64 aurp1;
734 __be64 sstp0;
735 __be64 sstp1;
736 } psl8; /* CAIA 1 */
737 struct {
738 u8 reserved2[8];
739 u8 reserved3[8];
740 u8 reserved4[8];
741 u8 reserved5[8];
742 } psl9; /* CAIA 2 */
743 } u;
653 __be64 amr; 744 __be64 amr;
654 u8 reserved3[4]; 745 u8 reserved6[4];
655 __be64 wed; 746 __be64 wed;
656} __packed; 747} __packed;
657 748
658/* just powernv */ 749/* just powernv - CAIA 1&2 */
659struct cxl_process_element { 750struct cxl_process_element {
660 __be64 sr; 751 __be64 sr;
661 __be64 SPOffset; 752 __be64 SPOffset;
662 __be64 sdr; 753 union {
754 __be64 sdr; /* CAIA 1 */
755 u8 reserved1[8]; /* CAIA 2 */
756 } u;
663 __be64 haurp; 757 __be64 haurp;
664 __be32 ctxtime; 758 __be32 ctxtime;
665 __be16 ivte_offsets[4]; 759 __be16 ivte_offsets[4];
@@ -739,6 +833,39 @@ static inline u64 cxl_p2n_read(struct cxl_afu *afu, cxl_p2n_reg_t reg)
739 return ~0ULL; 833 return ~0ULL;
740} 834}
741 835
836static inline bool cxl_is_power8(void)
837{
838 if ((pvr_version_is(PVR_POWER8E)) ||
839 (pvr_version_is(PVR_POWER8NVL)) ||
840 (pvr_version_is(PVR_POWER8)))
841 return true;
842 return false;
843}
844
845static inline bool cxl_is_power9(void)
846{
847 /* intermediate solution */
848 if (!cxl_is_power8() &&
849 (cpu_has_feature(CPU_FTRS_POWER9) ||
850 cpu_has_feature(CPU_FTR_POWER9_DD1)))
851 return true;
852 return false;
853}
854
855static inline bool cxl_is_psl8(struct cxl_afu *afu)
856{
857 if (afu->adapter->caia_major == 1)
858 return true;
859 return false;
860}
861
862static inline bool cxl_is_psl9(struct cxl_afu *afu)
863{
864 if (afu->adapter->caia_major == 2)
865 return true;
866 return false;
867}
868
742ssize_t cxl_pci_afu_read_err_buffer(struct cxl_afu *afu, char *buf, 869ssize_t cxl_pci_afu_read_err_buffer(struct cxl_afu *afu, char *buf,
743 loff_t off, size_t count); 870 loff_t off, size_t count);
744 871
@@ -765,7 +892,6 @@ int cxl_update_properties(struct device_node *dn, struct property *new_prop);
765 892
766void cxl_remove_adapter_nr(struct cxl *adapter); 893void cxl_remove_adapter_nr(struct cxl *adapter);
767 894
768int cxl_alloc_spa(struct cxl_afu *afu);
769void cxl_release_spa(struct cxl_afu *afu); 895void cxl_release_spa(struct cxl_afu *afu);
770 896
771dev_t cxl_get_dev(void); 897dev_t cxl_get_dev(void);
@@ -803,6 +929,15 @@ int afu_register_irqs(struct cxl_context *ctx, u32 count);
803void afu_release_irqs(struct cxl_context *ctx, void *cookie); 929void afu_release_irqs(struct cxl_context *ctx, void *cookie);
804void afu_irq_name_free(struct cxl_context *ctx); 930void afu_irq_name_free(struct cxl_context *ctx);
805 931
932int cxl_attach_afu_directed_psl9(struct cxl_context *ctx, u64 wed, u64 amr);
933int cxl_attach_afu_directed_psl8(struct cxl_context *ctx, u64 wed, u64 amr);
934int cxl_activate_dedicated_process_psl9(struct cxl_afu *afu);
935int cxl_activate_dedicated_process_psl8(struct cxl_afu *afu);
936int cxl_attach_dedicated_process_psl9(struct cxl_context *ctx, u64 wed, u64 amr);
937int cxl_attach_dedicated_process_psl8(struct cxl_context *ctx, u64 wed, u64 amr);
938void cxl_update_dedicated_ivtes_psl9(struct cxl_context *ctx);
939void cxl_update_dedicated_ivtes_psl8(struct cxl_context *ctx);
940
806#ifdef CONFIG_DEBUG_FS 941#ifdef CONFIG_DEBUG_FS
807 942
808int cxl_debugfs_init(void); 943int cxl_debugfs_init(void);
@@ -811,10 +946,13 @@ int cxl_debugfs_adapter_add(struct cxl *adapter);
811void cxl_debugfs_adapter_remove(struct cxl *adapter); 946void cxl_debugfs_adapter_remove(struct cxl *adapter);
812int cxl_debugfs_afu_add(struct cxl_afu *afu); 947int cxl_debugfs_afu_add(struct cxl_afu *afu);
813void cxl_debugfs_afu_remove(struct cxl_afu *afu); 948void cxl_debugfs_afu_remove(struct cxl_afu *afu);
814void cxl_stop_trace(struct cxl *cxl); 949void cxl_stop_trace_psl9(struct cxl *cxl);
815void cxl_debugfs_add_adapter_psl_regs(struct cxl *adapter, struct dentry *dir); 950void cxl_stop_trace_psl8(struct cxl *cxl);
816void cxl_debugfs_add_adapter_xsl_regs(struct cxl *adapter, struct dentry *dir); 951void cxl_debugfs_add_adapter_regs_psl9(struct cxl *adapter, struct dentry *dir);
817void cxl_debugfs_add_afu_psl_regs(struct cxl_afu *afu, struct dentry *dir); 952void cxl_debugfs_add_adapter_regs_psl8(struct cxl *adapter, struct dentry *dir);
953void cxl_debugfs_add_adapter_regs_xsl(struct cxl *adapter, struct dentry *dir);
954void cxl_debugfs_add_afu_regs_psl9(struct cxl_afu *afu, struct dentry *dir);
955void cxl_debugfs_add_afu_regs_psl8(struct cxl_afu *afu, struct dentry *dir);
818 956
819#else /* CONFIG_DEBUG_FS */ 957#else /* CONFIG_DEBUG_FS */
820 958
@@ -845,21 +983,34 @@ static inline void cxl_debugfs_afu_remove(struct cxl_afu *afu)
845{ 983{
846} 984}
847 985
848static inline void cxl_stop_trace(struct cxl *cxl) 986static inline void cxl_stop_trace_psl9(struct cxl *cxl)
849{ 987{
850} 988}
851 989
852static inline void cxl_debugfs_add_adapter_psl_regs(struct cxl *adapter, 990static inline void cxl_stop_trace_psl8(struct cxl *cxl)
991{
992}
993
994static inline void cxl_debugfs_add_adapter_regs_psl9(struct cxl *adapter,
853 struct dentry *dir) 995 struct dentry *dir)
854{ 996{
855} 997}
856 998
857static inline void cxl_debugfs_add_adapter_xsl_regs(struct cxl *adapter, 999static inline void cxl_debugfs_add_adapter_regs_psl8(struct cxl *adapter,
858 struct dentry *dir) 1000 struct dentry *dir)
859{ 1001{
860} 1002}
861 1003
862static inline void cxl_debugfs_add_afu_psl_regs(struct cxl_afu *afu, struct dentry *dir) 1004static inline void cxl_debugfs_add_adapter_regs_xsl(struct cxl *adapter,
1005 struct dentry *dir)
1006{
1007}
1008
1009static inline void cxl_debugfs_add_afu_regs_psl9(struct cxl_afu *afu, struct dentry *dir)
1010{
1011}
1012
1013static inline void cxl_debugfs_add_afu_regs_psl8(struct cxl_afu *afu, struct dentry *dir)
863{ 1014{
864} 1015}
865 1016
@@ -888,27 +1039,15 @@ int __detach_context(struct cxl_context *ctx);
888/* 1039/*
889 * This must match the layout of the H_COLLECT_CA_INT_INFO retbuf defined 1040 * This must match the layout of the H_COLLECT_CA_INT_INFO retbuf defined
890 * in PAPR. 1041 * in PAPR.
891 * A word about endianness: a pointer to this structure is passed when 1042 * Field pid_tid is now 'reserved' because it is no longer used on bare-metal.
892 * calling the hcall. However, it is not a block of memory filled up by 1043 * In a guest environment, PSL_PID_An is located in the upper 32 bits and
893 * the hypervisor. The return values are found in registers, and copied 1044 * PSL_TID_An in the lower 32 bits.
894 * one by one when returning from the hcall. See the end of the call to
895 * plpar_hcall9() in hvCall.S
896 * As a consequence:
897 * - we don't need to do any endianness conversion
898 * - the pid and tid are an exception. They are 32-bit values returned in
899 * the same 64-bit register. So we do need to worry about byte ordering.
900 */ 1045 */
901struct cxl_irq_info { 1046struct cxl_irq_info {
902 u64 dsisr; 1047 u64 dsisr;
903 u64 dar; 1048 u64 dar;
904 u64 dsr; 1049 u64 dsr;
905#ifndef CONFIG_CPU_LITTLE_ENDIAN 1050 u64 reserved;
906 u32 pid;
907 u32 tid;
908#else
909 u32 tid;
910 u32 pid;
911#endif
912 u64 afu_err; 1051 u64 afu_err;
913 u64 errstat; 1052 u64 errstat;
914 u64 proc_handle; 1053 u64 proc_handle;
@@ -916,19 +1055,23 @@ struct cxl_irq_info {
916}; 1055};
917 1056
918void cxl_assign_psn_space(struct cxl_context *ctx); 1057void cxl_assign_psn_space(struct cxl_context *ctx);
919irqreturn_t cxl_irq(int irq, struct cxl_context *ctx, struct cxl_irq_info *irq_info); 1058int cxl_invalidate_all_psl9(struct cxl *adapter);
1059int cxl_invalidate_all_psl8(struct cxl *adapter);
1060irqreturn_t cxl_irq_psl9(int irq, struct cxl_context *ctx, struct cxl_irq_info *irq_info);
1061irqreturn_t cxl_irq_psl8(int irq, struct cxl_context *ctx, struct cxl_irq_info *irq_info);
1062irqreturn_t cxl_fail_irq_psl(struct cxl_afu *afu, struct cxl_irq_info *irq_info);
920int cxl_register_one_irq(struct cxl *adapter, irq_handler_t handler, 1063int cxl_register_one_irq(struct cxl *adapter, irq_handler_t handler,
921 void *cookie, irq_hw_number_t *dest_hwirq, 1064 void *cookie, irq_hw_number_t *dest_hwirq,
922 unsigned int *dest_virq, const char *name); 1065 unsigned int *dest_virq, const char *name);
923 1066
924int cxl_check_error(struct cxl_afu *afu); 1067int cxl_check_error(struct cxl_afu *afu);
925int cxl_afu_slbia(struct cxl_afu *afu); 1068int cxl_afu_slbia(struct cxl_afu *afu);
926int cxl_tlb_slb_invalidate(struct cxl *adapter);
927int cxl_data_cache_flush(struct cxl *adapter); 1069int cxl_data_cache_flush(struct cxl *adapter);
928int cxl_afu_disable(struct cxl_afu *afu); 1070int cxl_afu_disable(struct cxl_afu *afu);
929int cxl_psl_purge(struct cxl_afu *afu); 1071int cxl_psl_purge(struct cxl_afu *afu);
930 1072
931void cxl_native_psl_irq_dump_regs(struct cxl_context *ctx); 1073void cxl_native_irq_dump_regs_psl9(struct cxl_context *ctx);
1074void cxl_native_irq_dump_regs_psl8(struct cxl_context *ctx);
932void cxl_native_err_irq_dump_regs(struct cxl *adapter); 1075void cxl_native_err_irq_dump_regs(struct cxl *adapter);
933int cxl_pci_vphb_add(struct cxl_afu *afu); 1076int cxl_pci_vphb_add(struct cxl_afu *afu);
934void cxl_pci_vphb_remove(struct cxl_afu *afu); 1077void cxl_pci_vphb_remove(struct cxl_afu *afu);
@@ -1024,4 +1167,10 @@ int cxl_adapter_context_lock(struct cxl *adapter);
1024/* Unlock the contexts-lock if taken. Warn and force unlock otherwise */ 1167/* Unlock the contexts-lock if taken. Warn and force unlock otherwise */
1025void cxl_adapter_context_unlock(struct cxl *adapter); 1168void cxl_adapter_context_unlock(struct cxl *adapter);
1026 1169
1170/* Increases the reference count to "struct mm_struct" */
1171void cxl_context_mm_count_get(struct cxl_context *ctx);
1172
1173/* Decrements the reference count to "struct mm_struct" */
1174void cxl_context_mm_count_put(struct cxl_context *ctx);
1175
1027#endif 1176#endif
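The thrust of the cxl.h changes above is that everything CAIA-version specific now goes through cxl_service_layer_ops, with _psl8/_psl9 variants of each callback. A minimal sketch of how the CAIA 2 (PSL9) variants could be wired together is below; the real table lives in pci.c and is not part of this hunk, so the name psl9_ops, the const qualifier and the exact set of callbacks filled in are assumptions for illustration only.

static const struct cxl_service_layer_ops psl9_ops = {
	/* CAIA 2 / PSL9 flavours of the per-service-layer callbacks */
	.invalidate_all			= cxl_invalidate_all_psl9,
	.handle_interrupt		= cxl_irq_psl9,
	.fail_irq			= cxl_fail_irq_psl,
	.activate_dedicated_process	= cxl_activate_dedicated_process_psl9,
	.attach_afu_directed		= cxl_attach_afu_directed_psl9,
	.attach_dedicated_process	= cxl_attach_dedicated_process_psl9,
	.update_dedicated_ivtes		= cxl_update_dedicated_ivtes_psl9,
	.debugfs_add_adapter_regs	= cxl_debugfs_add_adapter_regs_psl9,
	.debugfs_add_afu_regs		= cxl_debugfs_add_afu_regs_psl9,
	.psl_irq_dump_registers		= cxl_native_irq_dump_regs_psl9,
	.debugfs_stop_trace		= cxl_stop_trace_psl9,
};

Generic code then only ever dispatches through the table, e.g. afu->adapter->native->sl_ops->handle_interrupt(irq, ctx, &irq_info), which is exactly what the native.c and debugfs.c hunks further down switch to.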
diff --git a/drivers/misc/cxl/debugfs.c b/drivers/misc/cxl/debugfs.c
index 9c06ac8fa5ac..eae9d749f967 100644
--- a/drivers/misc/cxl/debugfs.c
+++ b/drivers/misc/cxl/debugfs.c
@@ -15,7 +15,13 @@
15 15
16static struct dentry *cxl_debugfs; 16static struct dentry *cxl_debugfs;
17 17
18void cxl_stop_trace(struct cxl *adapter) 18void cxl_stop_trace_psl9(struct cxl *adapter)
19{
20 /* Stop the trace */
21 cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x4480000000000000ULL);
22}
23
24void cxl_stop_trace_psl8(struct cxl *adapter)
19{ 25{
20 int slice; 26 int slice;
21 27
@@ -53,7 +59,15 @@ static struct dentry *debugfs_create_io_x64(const char *name, umode_t mode,
53 (void __force *)value, &fops_io_x64); 59 (void __force *)value, &fops_io_x64);
54} 60}
55 61
56void cxl_debugfs_add_adapter_psl_regs(struct cxl *adapter, struct dentry *dir) 62void cxl_debugfs_add_adapter_regs_psl9(struct cxl *adapter, struct dentry *dir)
63{
64 debugfs_create_io_x64("fir1", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_PSL9_FIR1));
65 debugfs_create_io_x64("fir2", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_PSL9_FIR2));
66 debugfs_create_io_x64("fir_cntl", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_PSL9_FIR_CNTL));
67 debugfs_create_io_x64("trace", S_IRUSR | S_IWUSR, dir, _cxl_p1_addr(adapter, CXL_PSL9_TRACECFG));
68}
69
70void cxl_debugfs_add_adapter_regs_psl8(struct cxl *adapter, struct dentry *dir)
57{ 71{
58 debugfs_create_io_x64("fir1", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_PSL_FIR1)); 72 debugfs_create_io_x64("fir1", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_PSL_FIR1));
59 debugfs_create_io_x64("fir2", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_PSL_FIR2)); 73 debugfs_create_io_x64("fir2", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_PSL_FIR2));
@@ -61,7 +75,7 @@ void cxl_debugfs_add_adapter_psl_regs(struct cxl *adapter, struct dentry *dir)
61 debugfs_create_io_x64("trace", S_IRUSR | S_IWUSR, dir, _cxl_p1_addr(adapter, CXL_PSL_TRACE)); 75 debugfs_create_io_x64("trace", S_IRUSR | S_IWUSR, dir, _cxl_p1_addr(adapter, CXL_PSL_TRACE));
62} 76}
63 77
64void cxl_debugfs_add_adapter_xsl_regs(struct cxl *adapter, struct dentry *dir) 78void cxl_debugfs_add_adapter_regs_xsl(struct cxl *adapter, struct dentry *dir)
65{ 79{
66 debugfs_create_io_x64("fec", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_XSL_FEC)); 80 debugfs_create_io_x64("fec", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_XSL_FEC));
67} 81}
@@ -82,8 +96,8 @@ int cxl_debugfs_adapter_add(struct cxl *adapter)
82 96
83 debugfs_create_io_x64("err_ivte", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_PSL_ErrIVTE)); 97 debugfs_create_io_x64("err_ivte", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_PSL_ErrIVTE));
84 98
85 if (adapter->native->sl_ops->debugfs_add_adapter_sl_regs) 99 if (adapter->native->sl_ops->debugfs_add_adapter_regs)
86 adapter->native->sl_ops->debugfs_add_adapter_sl_regs(adapter, dir); 100 adapter->native->sl_ops->debugfs_add_adapter_regs(adapter, dir);
87 return 0; 101 return 0;
88} 102}
89 103
@@ -92,8 +106,16 @@ void cxl_debugfs_adapter_remove(struct cxl *adapter)
92 debugfs_remove_recursive(adapter->debugfs); 106 debugfs_remove_recursive(adapter->debugfs);
93} 107}
94 108
95void cxl_debugfs_add_afu_psl_regs(struct cxl_afu *afu, struct dentry *dir) 109void cxl_debugfs_add_afu_regs_psl9(struct cxl_afu *afu, struct dentry *dir)
96{ 110{
111 debugfs_create_io_x64("serr", S_IRUSR, dir, _cxl_p1n_addr(afu, CXL_PSL_SERR_An));
112}
113
114void cxl_debugfs_add_afu_regs_psl8(struct cxl_afu *afu, struct dentry *dir)
115{
116 debugfs_create_io_x64("sstp0", S_IRUSR, dir, _cxl_p2n_addr(afu, CXL_SSTP0_An));
117 debugfs_create_io_x64("sstp1", S_IRUSR, dir, _cxl_p2n_addr(afu, CXL_SSTP1_An));
118
97 debugfs_create_io_x64("fir", S_IRUSR, dir, _cxl_p1n_addr(afu, CXL_PSL_FIR_SLICE_An)); 119 debugfs_create_io_x64("fir", S_IRUSR, dir, _cxl_p1n_addr(afu, CXL_PSL_FIR_SLICE_An));
98 debugfs_create_io_x64("serr", S_IRUSR, dir, _cxl_p1n_addr(afu, CXL_PSL_SERR_An)); 120 debugfs_create_io_x64("serr", S_IRUSR, dir, _cxl_p1n_addr(afu, CXL_PSL_SERR_An));
99 debugfs_create_io_x64("afu_debug", S_IRUSR, dir, _cxl_p1n_addr(afu, CXL_AFU_DEBUG_An)); 121 debugfs_create_io_x64("afu_debug", S_IRUSR, dir, _cxl_p1n_addr(afu, CXL_AFU_DEBUG_An));
@@ -117,12 +139,11 @@ int cxl_debugfs_afu_add(struct cxl_afu *afu)
117 debugfs_create_io_x64("sr", S_IRUSR, dir, _cxl_p1n_addr(afu, CXL_PSL_SR_An)); 139 debugfs_create_io_x64("sr", S_IRUSR, dir, _cxl_p1n_addr(afu, CXL_PSL_SR_An));
118 debugfs_create_io_x64("dsisr", S_IRUSR, dir, _cxl_p2n_addr(afu, CXL_PSL_DSISR_An)); 140 debugfs_create_io_x64("dsisr", S_IRUSR, dir, _cxl_p2n_addr(afu, CXL_PSL_DSISR_An));
119 debugfs_create_io_x64("dar", S_IRUSR, dir, _cxl_p2n_addr(afu, CXL_PSL_DAR_An)); 141 debugfs_create_io_x64("dar", S_IRUSR, dir, _cxl_p2n_addr(afu, CXL_PSL_DAR_An));
120 debugfs_create_io_x64("sstp0", S_IRUSR, dir, _cxl_p2n_addr(afu, CXL_SSTP0_An)); 142
121 debugfs_create_io_x64("sstp1", S_IRUSR, dir, _cxl_p2n_addr(afu, CXL_SSTP1_An));
122 debugfs_create_io_x64("err_status", S_IRUSR, dir, _cxl_p2n_addr(afu, CXL_PSL_ErrStat_An)); 143 debugfs_create_io_x64("err_status", S_IRUSR, dir, _cxl_p2n_addr(afu, CXL_PSL_ErrStat_An));
123 144
124 if (afu->adapter->native->sl_ops->debugfs_add_afu_sl_regs) 145 if (afu->adapter->native->sl_ops->debugfs_add_afu_regs)
125 afu->adapter->native->sl_ops->debugfs_add_afu_sl_regs(afu, dir); 146 afu->adapter->native->sl_ops->debugfs_add_afu_regs(afu, dir);
126 147
127 return 0; 148 return 0;
128} 149}
diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c
index 2fa015c05561..5344448f514e 100644
--- a/drivers/misc/cxl/fault.c
+++ b/drivers/misc/cxl/fault.c
@@ -146,108 +146,67 @@ static void cxl_handle_page_fault(struct cxl_context *ctx,
146 return cxl_ack_ae(ctx); 146 return cxl_ack_ae(ctx);
147 } 147 }
148 148
149 /* 149 if (!radix_enabled()) {
150 * update_mmu_cache() will not have loaded the hash since current->trap 150 /*
151 * is not a 0x400 or 0x300, so just call hash_page_mm() here. 151 * update_mmu_cache() will not have loaded the hash since current->trap
152 */ 152 * is not a 0x400 or 0x300, so just call hash_page_mm() here.
153 access = _PAGE_PRESENT | _PAGE_READ; 153 */
154 if (dsisr & CXL_PSL_DSISR_An_S) 154 access = _PAGE_PRESENT | _PAGE_READ;
155 access |= _PAGE_WRITE; 155 if (dsisr & CXL_PSL_DSISR_An_S)
156 156 access |= _PAGE_WRITE;
157 access |= _PAGE_PRIVILEGED; 157
158 if ((!ctx->kernel) || (REGION_ID(dar) == USER_REGION_ID)) 158 access |= _PAGE_PRIVILEGED;
159 access &= ~_PAGE_PRIVILEGED; 159 if ((!ctx->kernel) || (REGION_ID(dar) == USER_REGION_ID))
160 160 access &= ~_PAGE_PRIVILEGED;
161 if (dsisr & DSISR_NOHPTE) 161
162 inv_flags |= HPTE_NOHPTE_UPDATE; 162 if (dsisr & DSISR_NOHPTE)
163 163 inv_flags |= HPTE_NOHPTE_UPDATE;
164 local_irq_save(flags); 164
165 hash_page_mm(mm, dar, access, 0x300, inv_flags); 165 local_irq_save(flags);
166 local_irq_restore(flags); 166 hash_page_mm(mm, dar, access, 0x300, inv_flags);
167 167 local_irq_restore(flags);
168 }
168 pr_devel("Page fault successfully handled for pe: %i!\n", ctx->pe); 169 pr_devel("Page fault successfully handled for pe: %i!\n", ctx->pe);
169 cxl_ops->ack_irq(ctx, CXL_PSL_TFC_An_R, 0); 170 cxl_ops->ack_irq(ctx, CXL_PSL_TFC_An_R, 0);
170} 171}
171 172
172/* 173/*
173 * Returns the mm_struct corresponding to the context ctx via ctx->pid 174 * Returns the mm_struct corresponding to the context ctx.
174 * In case the task has exited we use the task group leader accessible 175 * mm_users == 0, the context may be in the process of being closed.
175 * via ctx->glpid to find the next task in the thread group that has a
176 * valid mm_struct associated with it. If a task with valid mm_struct
177 * is found the ctx->pid is updated to use the task struct for subsequent
178 * translations. In case no valid mm_struct is found in the task group to
179 * service the fault a NULL is returned.
180 */ 176 */
181static struct mm_struct *get_mem_context(struct cxl_context *ctx) 177static struct mm_struct *get_mem_context(struct cxl_context *ctx)
182{ 178{
183 struct task_struct *task = NULL; 179 if (ctx->mm == NULL)
184 struct mm_struct *mm = NULL;
185 struct pid *old_pid = ctx->pid;
186
187 if (old_pid == NULL) {
188 pr_warn("%s: Invalid context for pe=%d\n",
189 __func__, ctx->pe);
190 return NULL; 180 return NULL;
191 }
192 181
193 task = get_pid_task(old_pid, PIDTYPE_PID); 182 if (!atomic_inc_not_zero(&ctx->mm->mm_users))
183 return NULL;
194 184
195 /* 185 return ctx->mm;
196 * pid_alive may look racy but this saves us from costly 186}
197 * get_task_mm when the task is a zombie. In worst case
198 * we may think a task is alive, which is about to die
199 * but get_task_mm will return NULL.
200 */
201 if (task != NULL && pid_alive(task))
202 mm = get_task_mm(task);
203 187
204 /* release the task struct that was taken earlier */ 188static bool cxl_is_segment_miss(struct cxl_context *ctx, u64 dsisr)
205 if (task) 189{
206 put_task_struct(task); 190 if ((cxl_is_psl8(ctx->afu)) && (dsisr & CXL_PSL_DSISR_An_DS))
207 else 191 return true;
208 pr_devel("%s: Context owning pid=%i for pe=%i dead\n",
209 __func__, pid_nr(old_pid), ctx->pe);
210
211 /*
212 * If we couldn't find the mm context then use the group
213 * leader to iterate over the task group and find a task
214 * that gives us mm_struct.
215 */
216 if (unlikely(mm == NULL && ctx->glpid != NULL)) {
217
218 rcu_read_lock();
219 task = pid_task(ctx->glpid, PIDTYPE_PID);
220 if (task)
221 do {
222 mm = get_task_mm(task);
223 if (mm) {
224 ctx->pid = get_task_pid(task,
225 PIDTYPE_PID);
226 break;
227 }
228 task = next_thread(task);
229 } while (task && !thread_group_leader(task));
230 rcu_read_unlock();
231
232 /* check if we switched pid */
233 if (ctx->pid != old_pid) {
234 if (mm)
235 pr_devel("%s:pe=%i switch pid %i->%i\n",
236 __func__, ctx->pe, pid_nr(old_pid),
237 pid_nr(ctx->pid));
238 else
239 pr_devel("%s:Cannot find mm for pid=%i\n",
240 __func__, pid_nr(old_pid));
241
242 /* drop the reference to older pid */
243 put_pid(old_pid);
244 }
245 }
246 192
247 return mm; 193 return false;
248} 194}
249 195
196static bool cxl_is_page_fault(struct cxl_context *ctx, u64 dsisr)
197{
198 if ((cxl_is_psl8(ctx->afu)) && (dsisr & CXL_PSL_DSISR_An_DM))
199 return true;
200
201 if ((cxl_is_psl9(ctx->afu)) &&
202 ((dsisr & CXL_PSL9_DSISR_An_CO_MASK) &
203 (CXL_PSL9_DSISR_An_PF_SLR | CXL_PSL9_DSISR_An_PF_RGC |
204 CXL_PSL9_DSISR_An_PF_RGP | CXL_PSL9_DSISR_An_PF_HRH |
205 CXL_PSL9_DSISR_An_PF_STEG)))
206 return true;
250 207
208 return false;
209}
251 210
252void cxl_handle_fault(struct work_struct *fault_work) 211void cxl_handle_fault(struct work_struct *fault_work)
253{ 212{
@@ -282,7 +241,6 @@ void cxl_handle_fault(struct work_struct *fault_work)
282 if (!ctx->kernel) { 241 if (!ctx->kernel) {
283 242
284 mm = get_mem_context(ctx); 243 mm = get_mem_context(ctx);
285 /* indicates all the thread in task group have exited */
286 if (mm == NULL) { 244 if (mm == NULL) {
287 pr_devel("%s: unable to get mm for pe=%d pid=%i\n", 245 pr_devel("%s: unable to get mm for pe=%d pid=%i\n",
288 __func__, ctx->pe, pid_nr(ctx->pid)); 246 __func__, ctx->pe, pid_nr(ctx->pid));
@@ -294,9 +252,9 @@ void cxl_handle_fault(struct work_struct *fault_work)
294 } 252 }
295 } 253 }
296 254
297 if (dsisr & CXL_PSL_DSISR_An_DS) 255 if (cxl_is_segment_miss(ctx, dsisr))
298 cxl_handle_segment_miss(ctx, mm, dar); 256 cxl_handle_segment_miss(ctx, mm, dar);
299 else if (dsisr & CXL_PSL_DSISR_An_DM) 257 else if (cxl_is_page_fault(ctx, dsisr))
300 cxl_handle_page_fault(ctx, mm, dsisr, dar); 258 cxl_handle_page_fault(ctx, mm, dsisr, dar);
301 else 259 else
302 WARN(1, "cxl_handle_fault has nothing to handle\n"); 260 WARN(1, "cxl_handle_fault has nothing to handle\n");
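As a worked example of the CAIA 2 fault decode above (a standalone sketch, not part of the patch; the DSISR value is invented and the defines are copied from the cxl.h hunk): a translation fault whose checkout-response status byte is 0x88 takes the page-fault branch, because 0x88 is the "PTE not found (Single Level Radix)" encoding.

#include <stdint.h>
#include <stdio.h>

#define CXL_PSL9_DSISR_An_TF      (1ull << (63 - 3))
#define CXL_PSL9_DSISR_An_CO_MASK 0x00000000000000ffULL
#define CXL_PSL9_DSISR_An_PF_SLR  0x0000000000000088ULL
#define CXL_PSL9_DSISR_An_PF_RGC  0x000000000000008CULL
#define CXL_PSL9_DSISR_An_PF_RGP  0x0000000000000090ULL
#define CXL_PSL9_DSISR_An_PF_HRH  0x0000000000000094ULL
#define CXL_PSL9_DSISR_An_PF_STEG 0x000000000000009CULL

int main(void)
{
	/* Hypothetical DSISR: TF set, status = PTE not found (Single Level Radix) */
	uint64_t dsisr = CXL_PSL9_DSISR_An_TF | CXL_PSL9_DSISR_An_PF_SLR;

	/* cxl_irq_psl9() only tests TF and defers the rest to the fault worker */
	printf("schedule fault work: %d\n", !!(dsisr & CXL_PSL9_DSISR_An_TF));

	/* cxl_is_page_fault() then matches the checkout response status byte */
	printf("page fault path: %d\n",
	       !!((dsisr & CXL_PSL9_DSISR_An_CO_MASK) &
		  (CXL_PSL9_DSISR_An_PF_SLR | CXL_PSL9_DSISR_An_PF_RGC |
		   CXL_PSL9_DSISR_An_PF_RGP | CXL_PSL9_DSISR_An_PF_HRH |
		   CXL_PSL9_DSISR_An_PF_STEG)));
	return 0;
}

Both lines print 1. A segment miss, by contrast, can only be reported on PSL8 (CXL_PSL_DSISR_An_DS), which is why cxl_is_segment_miss() checks cxl_is_psl8() first.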
diff --git a/drivers/misc/cxl/file.c b/drivers/misc/cxl/file.c
index e7139c76f961..17b433f1ce23 100644
--- a/drivers/misc/cxl/file.c
+++ b/drivers/misc/cxl/file.c
@@ -18,6 +18,7 @@
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/mm.h> 19#include <linux/mm.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/sched/mm.h>
21#include <asm/cputable.h> 22#include <asm/cputable.h>
22#include <asm/current.h> 23#include <asm/current.h>
23#include <asm/copro.h> 24#include <asm/copro.h>
@@ -216,8 +217,16 @@ static long afu_ioctl_start_work(struct cxl_context *ctx,
216 * process is still accessible. 217 * process is still accessible.
217 */ 218 */
218 ctx->pid = get_task_pid(current, PIDTYPE_PID); 219 ctx->pid = get_task_pid(current, PIDTYPE_PID);
219 ctx->glpid = get_task_pid(current->group_leader, PIDTYPE_PID);
220 220
221 /* acquire a reference to the task's mm */
222 ctx->mm = get_task_mm(current);
223
224 /* ensure this mm_struct can't be freed */
225 cxl_context_mm_count_get(ctx);
226
227 /* decrement the use count */
228 if (ctx->mm)
229 mmput(ctx->mm);
221 230
222 trace_cxl_attach(ctx, work.work_element_descriptor, work.num_interrupts, amr); 231 trace_cxl_attach(ctx, work.work_element_descriptor, work.num_interrupts, amr);
223 232
@@ -225,9 +234,9 @@ static long afu_ioctl_start_work(struct cxl_context *ctx,
225 amr))) { 234 amr))) {
226 afu_release_irqs(ctx, ctx); 235 afu_release_irqs(ctx, ctx);
227 cxl_adapter_context_put(ctx->afu->adapter); 236 cxl_adapter_context_put(ctx->afu->adapter);
228 put_pid(ctx->glpid);
229 put_pid(ctx->pid); 237 put_pid(ctx->pid);
230 ctx->glpid = ctx->pid = NULL; 238 ctx->pid = NULL;
239 cxl_context_mm_count_put(ctx);
231 goto out; 240 goto out;
232 } 241 }
233 242
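The start_work path above takes the task's mm with get_task_mm(), pins the mm_struct itself with cxl_context_mm_count_get(), and then drops the mm_users reference again with mmput(). A minimal sketch of what the two helpers declared in cxl.h are expected to do follows; the real definitions live in context.c, which is not part of this hunk, so treat the bodies below as an assumption.

#include <linux/sched/mm.h>	/* mmgrab()/mmdrop() */
#include "cxl.h"

/* Pin mm_count so the mm_struct cannot be freed while a context points at it */
void cxl_context_mm_count_get(struct cxl_context *ctx)
{
	if (ctx->mm)
		mmgrab(ctx->mm);
}

/* Drop the mm_count reference taken above */
void cxl_context_mm_count_put(struct cxl_context *ctx)
{
	if (ctx->mm)
		mmdrop(ctx->mm);
}

Holding only mm_count means the context does not keep the address space alive on its own (mm_users may drop to zero at exit), which is why get_mem_context() in the fault.c hunk re-checks mm_users with atomic_inc_not_zero() before touching the mm.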
diff --git a/drivers/misc/cxl/guest.c b/drivers/misc/cxl/guest.c
index e04bc4ddfd74..f58b4b6c79f2 100644
--- a/drivers/misc/cxl/guest.c
+++ b/drivers/misc/cxl/guest.c
@@ -169,7 +169,7 @@ static irqreturn_t guest_psl_irq(int irq, void *data)
169 return IRQ_HANDLED; 169 return IRQ_HANDLED;
170 } 170 }
171 171
172 rc = cxl_irq(irq, ctx, &irq_info); 172 rc = cxl_irq_psl8(irq, ctx, &irq_info);
173 return rc; 173 return rc;
174} 174}
175 175
@@ -551,13 +551,13 @@ static int attach_afu_directed(struct cxl_context *ctx, u64 wed, u64 amr)
551 elem->common.tid = cpu_to_be32(0); /* Unused */ 551 elem->common.tid = cpu_to_be32(0); /* Unused */
552 elem->common.pid = cpu_to_be32(pid); 552 elem->common.pid = cpu_to_be32(pid);
553 elem->common.csrp = cpu_to_be64(0); /* disable */ 553 elem->common.csrp = cpu_to_be64(0); /* disable */
554 elem->common.aurp0 = cpu_to_be64(0); /* disable */ 554 elem->common.u.psl8.aurp0 = cpu_to_be64(0); /* disable */
555 elem->common.aurp1 = cpu_to_be64(0); /* disable */ 555 elem->common.u.psl8.aurp1 = cpu_to_be64(0); /* disable */
556 556
557 cxl_prefault(ctx, wed); 557 cxl_prefault(ctx, wed);
558 558
559 elem->common.sstp0 = cpu_to_be64(ctx->sstp0); 559 elem->common.u.psl8.sstp0 = cpu_to_be64(ctx->sstp0);
560 elem->common.sstp1 = cpu_to_be64(ctx->sstp1); 560 elem->common.u.psl8.sstp1 = cpu_to_be64(ctx->sstp1);
561 561
562 /* 562 /*
563 * Ensure we have at least one interrupt allocated to take faults for 563 * Ensure we have at least one interrupt allocated to take faults for
diff --git a/drivers/misc/cxl/hcalls.c b/drivers/misc/cxl/hcalls.c
index d6d11f4056d7..9b8bb0f80c3b 100644
--- a/drivers/misc/cxl/hcalls.c
+++ b/drivers/misc/cxl/hcalls.c
@@ -413,9 +413,9 @@ long cxl_h_collect_int_info(u64 unit_address, u64 process_token,
413 413
414 switch (rc) { 414 switch (rc) {
415 case H_SUCCESS: /* The interrupt info is returned in return registers. */ 415 case H_SUCCESS: /* The interrupt info is returned in return registers. */
416 pr_devel("dsisr:%#llx, dar:%#llx, dsr:%#llx, pid:%u, tid:%u, afu_err:%#llx, errstat:%#llx\n", 416 pr_devel("dsisr:%#llx, dar:%#llx, dsr:%#llx, pid_tid:%#llx, afu_err:%#llx, errstat:%#llx\n",
417 info->dsisr, info->dar, info->dsr, info->pid, 417 info->dsisr, info->dar, info->dsr, info->reserved,
418 info->tid, info->afu_err, info->errstat); 418 info->afu_err, info->errstat);
419 return 0; 419 return 0;
420 case H_PARAMETER: /* An incorrect parameter was supplied. */ 420 case H_PARAMETER: /* An incorrect parameter was supplied. */
421 return -EINVAL; 421 return -EINVAL;
diff --git a/drivers/misc/cxl/irq.c b/drivers/misc/cxl/irq.c
index 1a402bbed687..ce08a9f22308 100644
--- a/drivers/misc/cxl/irq.c
+++ b/drivers/misc/cxl/irq.c
@@ -34,7 +34,58 @@ static irqreturn_t schedule_cxl_fault(struct cxl_context *ctx, u64 dsisr, u64 da
34 return IRQ_HANDLED; 34 return IRQ_HANDLED;
35} 35}
36 36
37irqreturn_t cxl_irq(int irq, struct cxl_context *ctx, struct cxl_irq_info *irq_info) 37irqreturn_t cxl_irq_psl9(int irq, struct cxl_context *ctx, struct cxl_irq_info *irq_info)
38{
39 u64 dsisr, dar;
40
41 dsisr = irq_info->dsisr;
42 dar = irq_info->dar;
43
44 trace_cxl_psl9_irq(ctx, irq, dsisr, dar);
45
46 pr_devel("CXL interrupt %i for afu pe: %i DSISR: %#llx DAR: %#llx\n", irq, ctx->pe, dsisr, dar);
47
48 if (dsisr & CXL_PSL9_DSISR_An_TF) {
49 pr_devel("CXL interrupt: Scheduling translation fault handling for later (pe: %i)\n", ctx->pe);
50 return schedule_cxl_fault(ctx, dsisr, dar);
51 }
52
53 if (dsisr & CXL_PSL9_DSISR_An_PE)
54 return cxl_ops->handle_psl_slice_error(ctx, dsisr,
55 irq_info->errstat);
56 if (dsisr & CXL_PSL9_DSISR_An_AE) {
57 pr_devel("CXL interrupt: AFU Error 0x%016llx\n", irq_info->afu_err);
58
59 if (ctx->pending_afu_err) {
60 /*
61 * This shouldn't happen - the PSL treats these errors
62 * as fatal and will have reset the AFU, so there's not
63 * much point buffering multiple AFU errors.
64 * OTOH if we DO ever see a storm of these come in it's
65 * probably best that we log them somewhere:
66 */
67 dev_err_ratelimited(&ctx->afu->dev, "CXL AFU Error undelivered to pe %i: 0x%016llx\n",
68 ctx->pe, irq_info->afu_err);
69 } else {
70 spin_lock(&ctx->lock);
71 ctx->afu_err = irq_info->afu_err;
72 ctx->pending_afu_err = 1;
73 spin_unlock(&ctx->lock);
74
75 wake_up_all(&ctx->wq);
76 }
77
78 cxl_ops->ack_irq(ctx, CXL_PSL_TFC_An_A, 0);
79 return IRQ_HANDLED;
80 }
81 if (dsisr & CXL_PSL9_DSISR_An_OC)
82 pr_devel("CXL interrupt: OS Context Warning\n");
83
84 WARN(1, "Unhandled CXL PSL IRQ\n");
85 return IRQ_HANDLED;
86}
87
88irqreturn_t cxl_irq_psl8(int irq, struct cxl_context *ctx, struct cxl_irq_info *irq_info)
38{ 89{
39 u64 dsisr, dar; 90 u64 dsisr, dar;
40 91
diff --git a/drivers/misc/cxl/main.c b/drivers/misc/cxl/main.c
index b0b6ed31918e..1703655072b1 100644
--- a/drivers/misc/cxl/main.c
+++ b/drivers/misc/cxl/main.c
@@ -59,16 +59,10 @@ int cxl_afu_slbia(struct cxl_afu *afu)
59 59
60static inline void _cxl_slbia(struct cxl_context *ctx, struct mm_struct *mm) 60static inline void _cxl_slbia(struct cxl_context *ctx, struct mm_struct *mm)
61{ 61{
62 struct task_struct *task;
63 unsigned long flags; 62 unsigned long flags;
64 if (!(task = get_pid_task(ctx->pid, PIDTYPE_PID))) {
65 pr_devel("%s unable to get task %i\n",
66 __func__, pid_nr(ctx->pid));
67 return;
68 }
69 63
70 if (task->mm != mm) 64 if (ctx->mm != mm)
71 goto out_put; 65 return;
72 66
73 pr_devel("%s matched mm - card: %i afu: %i pe: %i\n", __func__, 67 pr_devel("%s matched mm - card: %i afu: %i pe: %i\n", __func__,
74 ctx->afu->adapter->adapter_num, ctx->afu->slice, ctx->pe); 68 ctx->afu->adapter->adapter_num, ctx->afu->slice, ctx->pe);
@@ -79,8 +73,6 @@ static inline void _cxl_slbia(struct cxl_context *ctx, struct mm_struct *mm)
79 spin_unlock_irqrestore(&ctx->sste_lock, flags); 73 spin_unlock_irqrestore(&ctx->sste_lock, flags);
80 mb(); 74 mb();
81 cxl_afu_slbia(ctx->afu); 75 cxl_afu_slbia(ctx->afu);
82out_put:
83 put_task_struct(task);
84} 76}
85 77
86static inline void cxl_slbia_core(struct mm_struct *mm) 78static inline void cxl_slbia_core(struct mm_struct *mm)
diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c
index 7ae710585267..871a2f09c718 100644
--- a/drivers/misc/cxl/native.c
+++ b/drivers/misc/cxl/native.c
@@ -95,12 +95,23 @@ int cxl_afu_disable(struct cxl_afu *afu)
95/* This will disable as well as reset */ 95/* This will disable as well as reset */
96static int native_afu_reset(struct cxl_afu *afu) 96static int native_afu_reset(struct cxl_afu *afu)
97{ 97{
98 int rc;
99 u64 serr;
100
98 pr_devel("AFU reset request\n"); 101 pr_devel("AFU reset request\n");
99 102
100 return afu_control(afu, CXL_AFU_Cntl_An_RA, 0, 103 rc = afu_control(afu, CXL_AFU_Cntl_An_RA, 0,
101 CXL_AFU_Cntl_An_RS_Complete | CXL_AFU_Cntl_An_ES_Disabled, 104 CXL_AFU_Cntl_An_RS_Complete | CXL_AFU_Cntl_An_ES_Disabled,
102 CXL_AFU_Cntl_An_RS_MASK | CXL_AFU_Cntl_An_ES_MASK, 105 CXL_AFU_Cntl_An_RS_MASK | CXL_AFU_Cntl_An_ES_MASK,
103 false); 106 false);
107
108 /* Re-enable any masked interrupts */
109 serr = cxl_p1n_read(afu, CXL_PSL_SERR_An);
110 serr &= ~CXL_PSL_SERR_An_IRQ_MASKS;
111 cxl_p1n_write(afu, CXL_PSL_SERR_An, serr);
112
113
114 return rc;
104} 115}
105 116
106static int native_afu_check_and_enable(struct cxl_afu *afu) 117static int native_afu_check_and_enable(struct cxl_afu *afu)
@@ -120,6 +131,7 @@ int cxl_psl_purge(struct cxl_afu *afu)
120 u64 AFU_Cntl = cxl_p2n_read(afu, CXL_AFU_Cntl_An); 131 u64 AFU_Cntl = cxl_p2n_read(afu, CXL_AFU_Cntl_An);
121 u64 dsisr, dar; 132 u64 dsisr, dar;
122 u64 start, end; 133 u64 start, end;
134 u64 trans_fault = 0x0ULL;
123 unsigned long timeout = jiffies + (HZ * CXL_TIMEOUT); 135 unsigned long timeout = jiffies + (HZ * CXL_TIMEOUT);
124 int rc = 0; 136 int rc = 0;
125 137
@@ -127,6 +139,11 @@ int cxl_psl_purge(struct cxl_afu *afu)
127 139
128 pr_devel("PSL purge request\n"); 140 pr_devel("PSL purge request\n");
129 141
142 if (cxl_is_psl8(afu))
143 trans_fault = CXL_PSL_DSISR_TRANS;
144 if (cxl_is_psl9(afu))
145 trans_fault = CXL_PSL9_DSISR_An_TF;
146
130 if (!cxl_ops->link_ok(afu->adapter, afu)) { 147 if (!cxl_ops->link_ok(afu->adapter, afu)) {
131 dev_warn(&afu->dev, "PSL Purge called with link down, ignoring\n"); 148 dev_warn(&afu->dev, "PSL Purge called with link down, ignoring\n");
132 rc = -EIO; 149 rc = -EIO;
@@ -155,13 +172,17 @@ int cxl_psl_purge(struct cxl_afu *afu)
155 } 172 }
156 173
157 dsisr = cxl_p2n_read(afu, CXL_PSL_DSISR_An); 174 dsisr = cxl_p2n_read(afu, CXL_PSL_DSISR_An);
158 pr_devel_ratelimited("PSL purging... PSL_CNTL: 0x%016llx PSL_DSISR: 0x%016llx\n", PSL_CNTL, dsisr); 175 pr_devel_ratelimited("PSL purging... PSL_CNTL: 0x%016llx PSL_DSISR: 0x%016llx\n",
159 if (dsisr & CXL_PSL_DSISR_TRANS) { 176 PSL_CNTL, dsisr);
177
178 if (dsisr & trans_fault) {
160 dar = cxl_p2n_read(afu, CXL_PSL_DAR_An); 179 dar = cxl_p2n_read(afu, CXL_PSL_DAR_An);
161 dev_notice(&afu->dev, "PSL purge terminating pending translation, DSISR: 0x%016llx, DAR: 0x%016llx\n", dsisr, dar); 180 dev_notice(&afu->dev, "PSL purge terminating pending translation, DSISR: 0x%016llx, DAR: 0x%016llx\n",
181 dsisr, dar);
162 cxl_p2n_write(afu, CXL_PSL_TFC_An, CXL_PSL_TFC_An_AE); 182 cxl_p2n_write(afu, CXL_PSL_TFC_An, CXL_PSL_TFC_An_AE);
163 } else if (dsisr) { 183 } else if (dsisr) {
164 dev_notice(&afu->dev, "PSL purge acknowledging pending non-translation fault, DSISR: 0x%016llx\n", dsisr); 184 dev_notice(&afu->dev, "PSL purge acknowledging pending non-translation fault, DSISR: 0x%016llx\n",
185 dsisr);
165 cxl_p2n_write(afu, CXL_PSL_TFC_An, CXL_PSL_TFC_An_A); 186 cxl_p2n_write(afu, CXL_PSL_TFC_An, CXL_PSL_TFC_An_A);
166 } else { 187 } else {
167 cpu_relax(); 188 cpu_relax();
@@ -196,7 +217,7 @@ static int spa_max_procs(int spa_size)
196 return ((spa_size / 8) - 96) / 17; 217 return ((spa_size / 8) - 96) / 17;
197} 218}
198 219
199int cxl_alloc_spa(struct cxl_afu *afu) 220static int cxl_alloc_spa(struct cxl_afu *afu, int mode)
200{ 221{
201 unsigned spa_size; 222 unsigned spa_size;
202 223
@@ -209,7 +230,8 @@ int cxl_alloc_spa(struct cxl_afu *afu)
209 if (spa_size > 0x100000) { 230 if (spa_size > 0x100000) {
210 dev_warn(&afu->dev, "num_of_processes too large for the SPA, limiting to %i (0x%x)\n", 231 dev_warn(&afu->dev, "num_of_processes too large for the SPA, limiting to %i (0x%x)\n",
211 afu->native->spa_max_procs, afu->native->spa_size); 232 afu->native->spa_max_procs, afu->native->spa_size);
212 afu->num_procs = afu->native->spa_max_procs; 233 if (mode != CXL_MODE_DEDICATED)
234 afu->num_procs = afu->native->spa_max_procs;
213 break; 235 break;
214 } 236 }
215 237
@@ -258,7 +280,37 @@ void cxl_release_spa(struct cxl_afu *afu)
258 } 280 }
259} 281}
260 282
261int cxl_tlb_slb_invalidate(struct cxl *adapter) 283/*
284 * Invalidation of all ERAT entries is no longer required by CAIA2. Use
285 * only for debugging.
286 */
287int cxl_invalidate_all_psl9(struct cxl *adapter)
288{
289 unsigned long timeout = jiffies + (HZ * CXL_TIMEOUT);
290 u64 ierat;
291
292 pr_devel("CXL adapter - invalidation of all ERAT entries\n");
293
294 /* Invalidates all ERAT entries for Radix or HPT */
295 ierat = CXL_XSL9_IERAT_IALL;
296 if (radix_enabled())
297 ierat |= CXL_XSL9_IERAT_INVR;
298 cxl_p1_write(adapter, CXL_XSL9_IERAT, ierat);
299
300 while (cxl_p1_read(adapter, CXL_XSL9_IERAT) & CXL_XSL9_IERAT_IINPROG) {
301 if (time_after_eq(jiffies, timeout)) {
302 dev_warn(&adapter->dev,
303 "WARNING: CXL adapter invalidation of all ERAT entries timed out!\n");
304 return -EBUSY;
305 }
306 if (!cxl_ops->link_ok(adapter, NULL))
307 return -EIO;
308 cpu_relax();
309 }
310 return 0;
311}
312
313int cxl_invalidate_all_psl8(struct cxl *adapter)
262{ 314{
263 unsigned long timeout = jiffies + (HZ * CXL_TIMEOUT); 315 unsigned long timeout = jiffies + (HZ * CXL_TIMEOUT);
264 316
@@ -466,7 +518,8 @@ static int remove_process_element(struct cxl_context *ctx)
466 518
467 if (!rc) 519 if (!rc)
468 ctx->pe_inserted = false; 520 ctx->pe_inserted = false;
469 slb_invalid(ctx); 521 if (cxl_is_power8())
522 slb_invalid(ctx);
470 pr_devel("%s Remove pe: %i finished\n", __func__, ctx->pe); 523 pr_devel("%s Remove pe: %i finished\n", __func__, ctx->pe);
471 mutex_unlock(&ctx->afu->native->spa_mutex); 524 mutex_unlock(&ctx->afu->native->spa_mutex);
472 525
@@ -493,13 +546,14 @@ static int activate_afu_directed(struct cxl_afu *afu)
493 546
494 afu->num_procs = afu->max_procs_virtualised; 547 afu->num_procs = afu->max_procs_virtualised;
495 if (afu->native->spa == NULL) { 548 if (afu->native->spa == NULL) {
496 if (cxl_alloc_spa(afu)) 549 if (cxl_alloc_spa(afu, CXL_MODE_DIRECTED))
497 return -ENOMEM; 550 return -ENOMEM;
498 } 551 }
499 attach_spa(afu); 552 attach_spa(afu);
500 553
501 cxl_p1n_write(afu, CXL_PSL_SCNTL_An, CXL_PSL_SCNTL_An_PM_AFU); 554 cxl_p1n_write(afu, CXL_PSL_SCNTL_An, CXL_PSL_SCNTL_An_PM_AFU);
502 cxl_p1n_write(afu, CXL_PSL_AMOR_An, 0xFFFFFFFFFFFFFFFFULL); 555 if (cxl_is_power8())
556 cxl_p1n_write(afu, CXL_PSL_AMOR_An, 0xFFFFFFFFFFFFFFFFULL);
503 cxl_p1n_write(afu, CXL_PSL_ID_An, CXL_PSL_ID_An_F | CXL_PSL_ID_An_L); 557 cxl_p1n_write(afu, CXL_PSL_ID_An, CXL_PSL_ID_An_F | CXL_PSL_ID_An_L);
504 558
505 afu->current_mode = CXL_MODE_DIRECTED; 559 afu->current_mode = CXL_MODE_DIRECTED;
@@ -542,10 +596,19 @@ static u64 calculate_sr(struct cxl_context *ctx)
542 sr |= (mfmsr() & MSR_SF) | CXL_PSL_SR_An_HV; 596 sr |= (mfmsr() & MSR_SF) | CXL_PSL_SR_An_HV;
543 } else { 597 } else {
544 sr |= CXL_PSL_SR_An_PR | CXL_PSL_SR_An_R; 598 sr |= CXL_PSL_SR_An_PR | CXL_PSL_SR_An_R;
545 sr &= ~(CXL_PSL_SR_An_HV); 599 if (radix_enabled())
600 sr |= CXL_PSL_SR_An_HV;
601 else
602 sr &= ~(CXL_PSL_SR_An_HV);
546 if (!test_tsk_thread_flag(current, TIF_32BIT)) 603 if (!test_tsk_thread_flag(current, TIF_32BIT))
547 sr |= CXL_PSL_SR_An_SF; 604 sr |= CXL_PSL_SR_An_SF;
548 } 605 }
606 if (cxl_is_psl9(ctx->afu)) {
607 if (radix_enabled())
608 sr |= CXL_PSL_SR_An_XLAT_ror;
609 else
610 sr |= CXL_PSL_SR_An_XLAT_hpt;
611 }
549 return sr; 612 return sr;
550} 613}
551 614
@@ -578,7 +641,71 @@ static void update_ivtes_directed(struct cxl_context *ctx)
578 WARN_ON(add_process_element(ctx)); 641 WARN_ON(add_process_element(ctx));
579} 642}
580 643
581static int attach_afu_directed(struct cxl_context *ctx, u64 wed, u64 amr) 644static int process_element_entry_psl9(struct cxl_context *ctx, u64 wed, u64 amr)
645{
646 u32 pid;
647
648 cxl_assign_psn_space(ctx);
649
650 ctx->elem->ctxtime = 0; /* disable */
651 ctx->elem->lpid = cpu_to_be32(mfspr(SPRN_LPID));
652 ctx->elem->haurp = 0; /* disable */
653
654 if (ctx->kernel)
655 pid = 0;
656 else {
657 if (ctx->mm == NULL) {
658 pr_devel("%s: unable to get mm for pe=%d pid=%i\n",
659 __func__, ctx->pe, pid_nr(ctx->pid));
660 return -EINVAL;
661 }
662 pid = ctx->mm->context.id;
663 }
664
665 ctx->elem->common.tid = 0;
666 ctx->elem->common.pid = cpu_to_be32(pid);
667
668 ctx->elem->sr = cpu_to_be64(calculate_sr(ctx));
669
670 ctx->elem->common.csrp = 0; /* disable */
671
672 cxl_prefault(ctx, wed);
673
674 /*
675 * Ensure we have the multiplexed PSL interrupt set up to take faults
676 * for kernel contexts that may not have allocated any AFU IRQs at all:
677 */
678 if (ctx->irqs.range[0] == 0) {
679 ctx->irqs.offset[0] = ctx->afu->native->psl_hwirq;
680 ctx->irqs.range[0] = 1;
681 }
682
683 ctx->elem->common.amr = cpu_to_be64(amr);
684 ctx->elem->common.wed = cpu_to_be64(wed);
685
686 return 0;
687}
688
689int cxl_attach_afu_directed_psl9(struct cxl_context *ctx, u64 wed, u64 amr)
690{
691 int result;
692
693 /* fill the process element entry */
694 result = process_element_entry_psl9(ctx, wed, amr);
695 if (result)
696 return result;
697
698 update_ivtes_directed(ctx);
699
700 /* first guy needs to enable */
701 result = cxl_ops->afu_check_and_enable(ctx->afu);
702 if (result)
703 return result;
704
705 return add_process_element(ctx);
706}
707
708int cxl_attach_afu_directed_psl8(struct cxl_context *ctx, u64 wed, u64 amr)
582{ 709{
583 u32 pid; 710 u32 pid;
584 int result; 711 int result;
@@ -588,7 +715,7 @@ static int attach_afu_directed(struct cxl_context *ctx, u64 wed, u64 amr)
588 ctx->elem->ctxtime = 0; /* disable */ 715 ctx->elem->ctxtime = 0; /* disable */
589 ctx->elem->lpid = cpu_to_be32(mfspr(SPRN_LPID)); 716 ctx->elem->lpid = cpu_to_be32(mfspr(SPRN_LPID));
590 ctx->elem->haurp = 0; /* disable */ 717 ctx->elem->haurp = 0; /* disable */
591 ctx->elem->sdr = cpu_to_be64(mfspr(SPRN_SDR1)); 718 ctx->elem->u.sdr = cpu_to_be64(mfspr(SPRN_SDR1));
592 719
593 pid = current->pid; 720 pid = current->pid;
594 if (ctx->kernel) 721 if (ctx->kernel)
@@ -599,13 +726,13 @@ static int attach_afu_directed(struct cxl_context *ctx, u64 wed, u64 amr)
599 ctx->elem->sr = cpu_to_be64(calculate_sr(ctx)); 726 ctx->elem->sr = cpu_to_be64(calculate_sr(ctx));
600 727
601 ctx->elem->common.csrp = 0; /* disable */ 728 ctx->elem->common.csrp = 0; /* disable */
602 ctx->elem->common.aurp0 = 0; /* disable */ 729 ctx->elem->common.u.psl8.aurp0 = 0; /* disable */
603 ctx->elem->common.aurp1 = 0; /* disable */ 730 ctx->elem->common.u.psl8.aurp1 = 0; /* disable */
604 731
605 cxl_prefault(ctx, wed); 732 cxl_prefault(ctx, wed);
606 733
607 ctx->elem->common.sstp0 = cpu_to_be64(ctx->sstp0); 734 ctx->elem->common.u.psl8.sstp0 = cpu_to_be64(ctx->sstp0);
608 ctx->elem->common.sstp1 = cpu_to_be64(ctx->sstp1); 735 ctx->elem->common.u.psl8.sstp1 = cpu_to_be64(ctx->sstp1);
609 736
610 /* 737 /*
611 * Ensure we have the multiplexed PSL interrupt set up to take faults 738 * Ensure we have the multiplexed PSL interrupt set up to take faults
@@ -671,7 +798,33 @@ static int deactivate_afu_directed(struct cxl_afu *afu)
671 return 0; 798 return 0;
672} 799}
673 800
674static int activate_dedicated_process(struct cxl_afu *afu) 801int cxl_activate_dedicated_process_psl9(struct cxl_afu *afu)
802{
803 dev_info(&afu->dev, "Activating dedicated process mode\n");
804
805 /*
806 * If XSL is set to dedicated mode (set in PSL_SCNTL reg), the
807 * XSL and AFU are programmed to work with a single context.
808 * The context information should be configured in the SPA area
809 * index 0 (so PSL_SPAP must be configured before enabling the
810 * AFU).
811 */
812 afu->num_procs = 1;
813 if (afu->native->spa == NULL) {
814 if (cxl_alloc_spa(afu, CXL_MODE_DEDICATED))
815 return -ENOMEM;
816 }
817 attach_spa(afu);
818
819 cxl_p1n_write(afu, CXL_PSL_SCNTL_An, CXL_PSL_SCNTL_An_PM_Process);
820 cxl_p1n_write(afu, CXL_PSL_ID_An, CXL_PSL_ID_An_F | CXL_PSL_ID_An_L);
821
822 afu->current_mode = CXL_MODE_DEDICATED;
823
824 return cxl_chardev_d_afu_add(afu);
825}
826
827int cxl_activate_dedicated_process_psl8(struct cxl_afu *afu)
675{ 828{
676 dev_info(&afu->dev, "Activating dedicated process mode\n"); 829 dev_info(&afu->dev, "Activating dedicated process mode\n");
677 830
@@ -694,7 +847,17 @@ static int activate_dedicated_process(struct cxl_afu *afu)
694 return cxl_chardev_d_afu_add(afu); 847 return cxl_chardev_d_afu_add(afu);
695} 848}
696 849
697static void update_ivtes_dedicated(struct cxl_context *ctx) 850void cxl_update_dedicated_ivtes_psl9(struct cxl_context *ctx)
851{
852 int r;
853
854 for (r = 0; r < CXL_IRQ_RANGES; r++) {
855 ctx->elem->ivte_offsets[r] = cpu_to_be16(ctx->irqs.offset[r]);
856 ctx->elem->ivte_ranges[r] = cpu_to_be16(ctx->irqs.range[r]);
857 }
858}
859
860void cxl_update_dedicated_ivtes_psl8(struct cxl_context *ctx)
698{ 861{
699 struct cxl_afu *afu = ctx->afu; 862 struct cxl_afu *afu = ctx->afu;
700 863
@@ -710,7 +873,27 @@ static void update_ivtes_dedicated(struct cxl_context *ctx)
710 ((u64)ctx->irqs.range[3] & 0xffff)); 873 ((u64)ctx->irqs.range[3] & 0xffff));
711} 874}
712 875
713static int attach_dedicated(struct cxl_context *ctx, u64 wed, u64 amr) 876int cxl_attach_dedicated_process_psl9(struct cxl_context *ctx, u64 wed, u64 amr)
877{
878 struct cxl_afu *afu = ctx->afu;
879 int result;
880
881 /* fill the process element entry */
882 result = process_element_entry_psl9(ctx, wed, amr);
883 if (result)
884 return result;
885
886 if (ctx->afu->adapter->native->sl_ops->update_dedicated_ivtes)
887 afu->adapter->native->sl_ops->update_dedicated_ivtes(ctx);
888
889 result = cxl_ops->afu_reset(afu);
890 if (result)
891 return result;
892
893 return afu_enable(afu);
894}
895
896int cxl_attach_dedicated_process_psl8(struct cxl_context *ctx, u64 wed, u64 amr)
714{ 897{
715 struct cxl_afu *afu = ctx->afu; 898 struct cxl_afu *afu = ctx->afu;
716 u64 pid; 899 u64 pid;
@@ -728,7 +911,8 @@ static int attach_dedicated(struct cxl_context *ctx, u64 wed, u64 amr)
728 911
729 cxl_prefault(ctx, wed); 912 cxl_prefault(ctx, wed);
730 913
731 update_ivtes_dedicated(ctx); 914 if (ctx->afu->adapter->native->sl_ops->update_dedicated_ivtes)
915 afu->adapter->native->sl_ops->update_dedicated_ivtes(ctx);
732 916
733 cxl_p2n_write(afu, CXL_PSL_AMR_An, amr); 917 cxl_p2n_write(afu, CXL_PSL_AMR_An, amr);
734 918
@@ -778,8 +962,9 @@ static int native_afu_activate_mode(struct cxl_afu *afu, int mode)
778 962
779 if (mode == CXL_MODE_DIRECTED) 963 if (mode == CXL_MODE_DIRECTED)
780 return activate_afu_directed(afu); 964 return activate_afu_directed(afu);
781 if (mode == CXL_MODE_DEDICATED) 965 if ((mode == CXL_MODE_DEDICATED) &&
782 return activate_dedicated_process(afu); 966 (afu->adapter->native->sl_ops->activate_dedicated_process))
967 return afu->adapter->native->sl_ops->activate_dedicated_process(afu);
783 968
784 return -EINVAL; 969 return -EINVAL;
785} 970}
@@ -793,11 +978,13 @@ static int native_attach_process(struct cxl_context *ctx, bool kernel,
793 } 978 }
794 979
795 ctx->kernel = kernel; 980 ctx->kernel = kernel;
796 if (ctx->afu->current_mode == CXL_MODE_DIRECTED) 981 if ((ctx->afu->current_mode == CXL_MODE_DIRECTED) &&
797 return attach_afu_directed(ctx, wed, amr); 982 (ctx->afu->adapter->native->sl_ops->attach_afu_directed))
983 return ctx->afu->adapter->native->sl_ops->attach_afu_directed(ctx, wed, amr);
798 984
799 if (ctx->afu->current_mode == CXL_MODE_DEDICATED) 985 if ((ctx->afu->current_mode == CXL_MODE_DEDICATED) &&
800 return attach_dedicated(ctx, wed, amr); 986 (ctx->afu->adapter->native->sl_ops->attach_dedicated_process))
987 return ctx->afu->adapter->native->sl_ops->attach_dedicated_process(ctx, wed, amr);
801 988
802 return -EINVAL; 989 return -EINVAL;
803} 990}
@@ -830,8 +1017,9 @@ static void native_update_ivtes(struct cxl_context *ctx)
830{ 1017{
831 if (ctx->afu->current_mode == CXL_MODE_DIRECTED) 1018 if (ctx->afu->current_mode == CXL_MODE_DIRECTED)
832 return update_ivtes_directed(ctx); 1019 return update_ivtes_directed(ctx);
833 if (ctx->afu->current_mode == CXL_MODE_DEDICATED) 1020 if ((ctx->afu->current_mode == CXL_MODE_DEDICATED) &&
834 return update_ivtes_dedicated(ctx); 1021 (ctx->afu->adapter->native->sl_ops->update_dedicated_ivtes))
1022 return ctx->afu->adapter->native->sl_ops->update_dedicated_ivtes(ctx);
835 WARN(1, "native_update_ivtes: Bad mode\n"); 1023 WARN(1, "native_update_ivtes: Bad mode\n");
836} 1024}
837 1025
@@ -859,8 +1047,6 @@ static int native_detach_process(struct cxl_context *ctx)
859 1047
860static int native_get_irq_info(struct cxl_afu *afu, struct cxl_irq_info *info) 1048static int native_get_irq_info(struct cxl_afu *afu, struct cxl_irq_info *info)
861{ 1049{
862 u64 pidtid;
863
864 /* If the adapter has gone away, we can't get any meaningful 1050 /* If the adapter has gone away, we can't get any meaningful
865 * information. 1051 * information.
866 */ 1052 */
@@ -869,10 +1055,8 @@ static int native_get_irq_info(struct cxl_afu *afu, struct cxl_irq_info *info)
869 1055
870 info->dsisr = cxl_p2n_read(afu, CXL_PSL_DSISR_An); 1056 info->dsisr = cxl_p2n_read(afu, CXL_PSL_DSISR_An);
871 info->dar = cxl_p2n_read(afu, CXL_PSL_DAR_An); 1057 info->dar = cxl_p2n_read(afu, CXL_PSL_DAR_An);
872 info->dsr = cxl_p2n_read(afu, CXL_PSL_DSR_An); 1058 if (cxl_is_power8())
873 pidtid = cxl_p2n_read(afu, CXL_PSL_PID_TID_An); 1059 info->dsr = cxl_p2n_read(afu, CXL_PSL_DSR_An);
874 info->pid = pidtid >> 32;
875 info->tid = pidtid & 0xffffffff;
876 info->afu_err = cxl_p2n_read(afu, CXL_AFU_ERR_An); 1060 info->afu_err = cxl_p2n_read(afu, CXL_AFU_ERR_An);
877 info->errstat = cxl_p2n_read(afu, CXL_PSL_ErrStat_An); 1061 info->errstat = cxl_p2n_read(afu, CXL_PSL_ErrStat_An);
878 info->proc_handle = 0; 1062 info->proc_handle = 0;
@@ -880,7 +1064,22 @@ static int native_get_irq_info(struct cxl_afu *afu, struct cxl_irq_info *info)
880 return 0; 1064 return 0;
881} 1065}
882 1066
883void cxl_native_psl_irq_dump_regs(struct cxl_context *ctx) 1067void cxl_native_irq_dump_regs_psl9(struct cxl_context *ctx)
1068{
1069 u64 fir1, fir2, serr;
1070
1071 fir1 = cxl_p1_read(ctx->afu->adapter, CXL_PSL9_FIR1);
1072 fir2 = cxl_p1_read(ctx->afu->adapter, CXL_PSL9_FIR2);
1073
1074 dev_crit(&ctx->afu->dev, "PSL_FIR1: 0x%016llx\n", fir1);
1075 dev_crit(&ctx->afu->dev, "PSL_FIR2: 0x%016llx\n", fir2);
1076 if (ctx->afu->adapter->native->sl_ops->register_serr_irq) {
1077 serr = cxl_p1n_read(ctx->afu, CXL_PSL_SERR_An);
1078 cxl_afu_decode_psl_serr(ctx->afu, serr);
1079 }
1080}
1081
1082void cxl_native_irq_dump_regs_psl8(struct cxl_context *ctx)
884{ 1083{
885 u64 fir1, fir2, fir_slice, serr, afu_debug; 1084 u64 fir1, fir2, fir_slice, serr, afu_debug;
886 1085
@@ -916,9 +1115,20 @@ static irqreturn_t native_handle_psl_slice_error(struct cxl_context *ctx,
916 return cxl_ops->ack_irq(ctx, 0, errstat); 1115 return cxl_ops->ack_irq(ctx, 0, errstat);
917} 1116}
918 1117
919static irqreturn_t fail_psl_irq(struct cxl_afu *afu, struct cxl_irq_info *irq_info) 1118static bool cxl_is_translation_fault(struct cxl_afu *afu, u64 dsisr)
1119{
1120 if ((cxl_is_psl8(afu)) && (dsisr & CXL_PSL_DSISR_TRANS))
1121 return true;
1122
1123 if ((cxl_is_psl9(afu)) && (dsisr & CXL_PSL9_DSISR_An_TF))
1124 return true;
1125
1126 return false;
1127}
1128
1129irqreturn_t cxl_fail_irq_psl(struct cxl_afu *afu, struct cxl_irq_info *irq_info)
920{ 1130{
921 if (irq_info->dsisr & CXL_PSL_DSISR_TRANS) 1131 if (cxl_is_translation_fault(afu, irq_info->dsisr))
922 cxl_p2n_write(afu, CXL_PSL_TFC_An, CXL_PSL_TFC_An_AE); 1132 cxl_p2n_write(afu, CXL_PSL_TFC_An, CXL_PSL_TFC_An_AE);
923 else 1133 else
924 cxl_p2n_write(afu, CXL_PSL_TFC_An, CXL_PSL_TFC_An_A); 1134 cxl_p2n_write(afu, CXL_PSL_TFC_An, CXL_PSL_TFC_An_A);
@@ -932,7 +1142,7 @@ static irqreturn_t native_irq_multiplexed(int irq, void *data)
932 struct cxl_context *ctx; 1142 struct cxl_context *ctx;
933 struct cxl_irq_info irq_info; 1143 struct cxl_irq_info irq_info;
934 u64 phreg = cxl_p2n_read(afu, CXL_PSL_PEHandle_An); 1144 u64 phreg = cxl_p2n_read(afu, CXL_PSL_PEHandle_An);
935 int ph, ret; 1145 int ph, ret = IRQ_HANDLED, res;
936 1146
937 /* check if eeh kicked in while the interrupt was in flight */ 1147 /* check if eeh kicked in while the interrupt was in flight */
938 if (unlikely(phreg == ~0ULL)) { 1148 if (unlikely(phreg == ~0ULL)) {
@@ -943,15 +1153,18 @@ static irqreturn_t native_irq_multiplexed(int irq, void *data)
943 } 1153 }
944 /* Mask the pe-handle from register value */ 1154 /* Mask the pe-handle from register value */
945 ph = phreg & 0xffff; 1155 ph = phreg & 0xffff;
946 if ((ret = native_get_irq_info(afu, &irq_info))) { 1156 if ((res = native_get_irq_info(afu, &irq_info))) {
947 WARN(1, "Unable to get CXL IRQ Info: %i\n", ret); 1157 WARN(1, "Unable to get CXL IRQ Info: %i\n", res);
948 return fail_psl_irq(afu, &irq_info); 1158 if (afu->adapter->native->sl_ops->fail_irq)
1159 return afu->adapter->native->sl_ops->fail_irq(afu, &irq_info);
1160 return ret;
949 } 1161 }
950 1162
951 rcu_read_lock(); 1163 rcu_read_lock();
952 ctx = idr_find(&afu->contexts_idr, ph); 1164 ctx = idr_find(&afu->contexts_idr, ph);
953 if (ctx) { 1165 if (ctx) {
954 ret = cxl_irq(irq, ctx, &irq_info); 1166 if (afu->adapter->native->sl_ops->handle_interrupt)
1167 ret = afu->adapter->native->sl_ops->handle_interrupt(irq, ctx, &irq_info);
955 rcu_read_unlock(); 1168 rcu_read_unlock();
956 return ret; 1169 return ret;
957 } 1170 }
@@ -961,7 +1174,9 @@ static irqreturn_t native_irq_multiplexed(int irq, void *data)
961 " %016llx\n(Possible AFU HW issue - was a term/remove acked" 1174 " %016llx\n(Possible AFU HW issue - was a term/remove acked"
962 " with outstanding transactions?)\n", ph, irq_info.dsisr, 1175 " with outstanding transactions?)\n", ph, irq_info.dsisr,
963 irq_info.dar); 1176 irq_info.dar);
964 return fail_psl_irq(afu, &irq_info); 1177 if (afu->adapter->native->sl_ops->fail_irq)
1178 ret = afu->adapter->native->sl_ops->fail_irq(afu, &irq_info);
1179 return ret;
965} 1180}
966 1181
967static void native_irq_wait(struct cxl_context *ctx) 1182static void native_irq_wait(struct cxl_context *ctx)
@@ -979,7 +1194,11 @@ static void native_irq_wait(struct cxl_context *ctx)
979 if (ph != ctx->pe) 1194 if (ph != ctx->pe)
980 return; 1195 return;
981 dsisr = cxl_p2n_read(ctx->afu, CXL_PSL_DSISR_An); 1196 dsisr = cxl_p2n_read(ctx->afu, CXL_PSL_DSISR_An);
982 if ((dsisr & CXL_PSL_DSISR_PENDING) == 0) 1197 if (cxl_is_psl8(ctx->afu) &&
1198 ((dsisr & CXL_PSL_DSISR_PENDING) == 0))
1199 return;
1200 if (cxl_is_psl9(ctx->afu) &&
1201 ((dsisr & CXL_PSL9_DSISR_PENDING) == 0))
983 return; 1202 return;
984 /* 1203 /*
985 * We are waiting for the workqueue to process our 1204 * We are waiting for the workqueue to process our
@@ -996,25 +1215,33 @@ static void native_irq_wait(struct cxl_context *ctx)
996static irqreturn_t native_slice_irq_err(int irq, void *data) 1215static irqreturn_t native_slice_irq_err(int irq, void *data)
997{ 1216{
998 struct cxl_afu *afu = data; 1217 struct cxl_afu *afu = data;
999 u64 fir_slice, errstat, serr, afu_debug, afu_error, dsisr; 1218 u64 errstat, serr, afu_error, dsisr;
1219 u64 fir_slice, afu_debug, irq_mask;
1000 1220
1001 /* 1221 /*
1002 * slice err interrupt is only used with full PSL (no XSL) 1222 * slice err interrupt is only used with full PSL (no XSL)
1003 */ 1223 */
1004 serr = cxl_p1n_read(afu, CXL_PSL_SERR_An); 1224 serr = cxl_p1n_read(afu, CXL_PSL_SERR_An);
1005 fir_slice = cxl_p1n_read(afu, CXL_PSL_FIR_SLICE_An);
1006 errstat = cxl_p2n_read(afu, CXL_PSL_ErrStat_An); 1225 errstat = cxl_p2n_read(afu, CXL_PSL_ErrStat_An);
1007 afu_debug = cxl_p1n_read(afu, CXL_AFU_DEBUG_An);
1008 afu_error = cxl_p2n_read(afu, CXL_AFU_ERR_An); 1226 afu_error = cxl_p2n_read(afu, CXL_AFU_ERR_An);
1009 dsisr = cxl_p2n_read(afu, CXL_PSL_DSISR_An); 1227 dsisr = cxl_p2n_read(afu, CXL_PSL_DSISR_An);
1010 cxl_afu_decode_psl_serr(afu, serr); 1228 cxl_afu_decode_psl_serr(afu, serr);
1011 dev_crit(&afu->dev, "PSL_FIR_SLICE_An: 0x%016llx\n", fir_slice); 1229
1230 if (cxl_is_power8()) {
1231 fir_slice = cxl_p1n_read(afu, CXL_PSL_FIR_SLICE_An);
1232 afu_debug = cxl_p1n_read(afu, CXL_AFU_DEBUG_An);
1233 dev_crit(&afu->dev, "PSL_FIR_SLICE_An: 0x%016llx\n", fir_slice);
1234 dev_crit(&afu->dev, "CXL_PSL_AFU_DEBUG_An: 0x%016llx\n", afu_debug);
1235 }
1012 dev_crit(&afu->dev, "CXL_PSL_ErrStat_An: 0x%016llx\n", errstat); 1236 dev_crit(&afu->dev, "CXL_PSL_ErrStat_An: 0x%016llx\n", errstat);
1013 dev_crit(&afu->dev, "CXL_PSL_AFU_DEBUG_An: 0x%016llx\n", afu_debug);
1014 dev_crit(&afu->dev, "AFU_ERR_An: 0x%.16llx\n", afu_error); 1237 dev_crit(&afu->dev, "AFU_ERR_An: 0x%.16llx\n", afu_error);
1015 dev_crit(&afu->dev, "PSL_DSISR_An: 0x%.16llx\n", dsisr); 1238 dev_crit(&afu->dev, "PSL_DSISR_An: 0x%.16llx\n", dsisr);
1016 1239
1240 /* mask off the IRQ so it won't retrigger until the AFU is reset */
1241 irq_mask = (serr & CXL_PSL_SERR_An_IRQS) >> 32;
1242 serr |= irq_mask;
1017 cxl_p1n_write(afu, CXL_PSL_SERR_An, serr); 1243 cxl_p1n_write(afu, CXL_PSL_SERR_An, serr);
1244 dev_info(&afu->dev, "Further such interrupts will be masked until the AFU is reset\n");
1018 1245
1019 return IRQ_HANDLED; 1246 return IRQ_HANDLED;
1020} 1247}
@@ -1103,7 +1330,15 @@ int cxl_native_register_serr_irq(struct cxl_afu *afu)
1103 } 1330 }
1104 1331
1105 serr = cxl_p1n_read(afu, CXL_PSL_SERR_An); 1332 serr = cxl_p1n_read(afu, CXL_PSL_SERR_An);
1106 serr = (serr & 0x00ffffffffff0000ULL) | (afu->serr_hwirq & 0xffff); 1333 if (cxl_is_power8())
1334 serr = (serr & 0x00ffffffffff0000ULL) | (afu->serr_hwirq & 0xffff);
1335 if (cxl_is_power9()) {
1336 /*
1337 * By default, all errors are masked. So don't set all masks.
1338	 * Slice errors will be transferred.
1339 */
1340 serr = (serr & ~0xff0000007fffffffULL) | (afu->serr_hwirq & 0xffff);
1341 }
1107 cxl_p1n_write(afu, CXL_PSL_SERR_An, serr); 1342 cxl_p1n_write(afu, CXL_PSL_SERR_An, serr);
1108 1343
1109 return 0; 1344 return 0;
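
The hunks above route interrupt handling and failure paths through optional per-service-layer callbacks (handle_interrupt, fail_irq), each guarded by a NULL check so a back end may omit hooks it does not need. A minimal, self-contained sketch of that guarded-dispatch pattern — struct demo_sl_ops and demo_dispatch are illustrative names only, not symbols from the driver:

	#include <linux/interrupt.h>	/* irqreturn_t, IRQ_HANDLED */

	/* Sketch: optional per-back-end callback table. */
	struct demo_sl_ops {
		irqreturn_t (*handle_interrupt)(int irq, void *data);
	};

	static irqreturn_t demo_dispatch(const struct demo_sl_ops *ops,
					 int irq, void *data)
	{
		irqreturn_t ret = IRQ_HANDLED;

		/* Only call the hook when the service layer provides one. */
		if (ops->handle_interrupt)
			ret = ops->handle_interrupt(irq, data);
		return ret;
	}

Guarding each call keeps the multiplexed IRQ handler generic while letting the PSL8, PSL9 and XSL back ends supply only the hooks they actually implement.
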
diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index b27ea98b781f..6dc1ee5b92c9 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -60,7 +60,7 @@
60#define CXL_VSEC_PROTOCOL_MASK 0xe0 60#define CXL_VSEC_PROTOCOL_MASK 0xe0
61#define CXL_VSEC_PROTOCOL_1024TB 0x80 61#define CXL_VSEC_PROTOCOL_1024TB 0x80
62#define CXL_VSEC_PROTOCOL_512TB 0x40 62#define CXL_VSEC_PROTOCOL_512TB 0x40
63#define CXL_VSEC_PROTOCOL_256TB 0x20 /* Power 8 uses this */ 63#define CXL_VSEC_PROTOCOL_256TB 0x20 /* Power 8/9 uses this */
64#define CXL_VSEC_PROTOCOL_ENABLE 0x01 64#define CXL_VSEC_PROTOCOL_ENABLE 0x01
65 65
66#define CXL_READ_VSEC_PSL_REVISION(dev, vsec, dest) \ 66#define CXL_READ_VSEC_PSL_REVISION(dev, vsec, dest) \
@@ -123,6 +123,8 @@ static const struct pci_device_id cxl_pci_tbl[] = {
123 { PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x044b), }, 123 { PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x044b), },
124 { PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x04cf), }, 124 { PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x04cf), },
125 { PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x0601), }, 125 { PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x0601), },
126 { PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x0623), },
127 { PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x0628), },
126 { PCI_DEVICE_CLASS(0x120000, ~0), }, 128 { PCI_DEVICE_CLASS(0x120000, ~0), },
127 129
128 { } 130 { }
@@ -324,38 +326,59 @@ static void dump_afu_descriptor(struct cxl_afu *afu)
324#undef show_reg 326#undef show_reg
325} 327}
326 328
327#define CAPP_UNIT0_ID 0xBA 329#define P8_CAPP_UNIT0_ID 0xBA
328#define CAPP_UNIT1_ID 0XBE 330#define P8_CAPP_UNIT1_ID 0XBE
331#define P9_CAPP_UNIT0_ID 0xC0
332#define P9_CAPP_UNIT1_ID 0xE0
329 333
330static u64 get_capp_unit_id(struct device_node *np) 334static int get_phb_index(struct device_node *np, u32 *phb_index)
331{ 335{
332 u32 phb_index; 336 if (of_property_read_u32(np, "ibm,phb-index", phb_index))
337 return -ENODEV;
338 return 0;
339}
333 340
341static u64 get_capp_unit_id(struct device_node *np, u32 phb_index)
342{
334 /* 343 /*
335 * For chips other than POWER8NVL, we only have CAPP 0, 344 * POWER 8:
336 * irrespective of which PHB is used. 345 * - For chips other than POWER8NVL, we only have CAPP 0,
346 * irrespective of which PHB is used.
347 * - For POWER8NVL, assume CAPP 0 is attached to PHB0 and
348 * CAPP 1 is attached to PHB1.
337 */ 349 */
338 if (!pvr_version_is(PVR_POWER8NVL)) 350 if (cxl_is_power8()) {
339 return CAPP_UNIT0_ID; 351 if (!pvr_version_is(PVR_POWER8NVL))
352 return P8_CAPP_UNIT0_ID;
353
354 if (phb_index == 0)
355 return P8_CAPP_UNIT0_ID;
356
357 if (phb_index == 1)
358 return P8_CAPP_UNIT1_ID;
359 }
340 360
341 /* 361 /*
342 * For POWER8NVL, assume CAPP 0 is attached to PHB0 and 362 * POWER 9:
343 * CAPP 1 is attached to PHB1. 363 * PEC0 (PHB0). Capp ID = CAPP0 (0b1100_0000)
364 * PEC1 (PHB1 - PHB2). No capi mode
365 * PEC2 (PHB3 - PHB4 - PHB5): Capi mode on PHB3 only. Capp ID = CAPP1 (0b1110_0000)
344 */ 366 */
345 if (of_property_read_u32(np, "ibm,phb-index", &phb_index)) 367 if (cxl_is_power9()) {
346 return 0; 368 if (phb_index == 0)
369 return P9_CAPP_UNIT0_ID;
347 370
348 if (phb_index == 0) 371 if (phb_index == 3)
349 return CAPP_UNIT0_ID; 372 return P9_CAPP_UNIT1_ID;
350 373 }
351 if (phb_index == 1)
352 return CAPP_UNIT1_ID;
353 374
354 return 0; 375 return 0;
355} 376}
356 377
357static int calc_capp_routing(struct pci_dev *dev, u64 *chipid, u64 *capp_unit_id) 378static int calc_capp_routing(struct pci_dev *dev, u64 *chipid,
379 u32 *phb_index, u64 *capp_unit_id)
358{ 380{
381 int rc;
359 struct device_node *np; 382 struct device_node *np;
360 const __be32 *prop; 383 const __be32 *prop;
361 384
@@ -366,8 +389,16 @@ static int calc_capp_routing(struct pci_dev *dev, u64 *chipid, u64 *capp_unit_id
366 np = of_get_next_parent(np); 389 np = of_get_next_parent(np);
367 if (!np) 390 if (!np)
368 return -ENODEV; 391 return -ENODEV;
392
369 *chipid = be32_to_cpup(prop); 393 *chipid = be32_to_cpup(prop);
370 *capp_unit_id = get_capp_unit_id(np); 394
395 rc = get_phb_index(np, phb_index);
396 if (rc) {
397 pr_err("cxl: invalid phb index\n");
398 return rc;
399 }
400
401 *capp_unit_id = get_capp_unit_id(np, *phb_index);
371 of_node_put(np); 402 of_node_put(np);
372 if (!*capp_unit_id) { 403 if (!*capp_unit_id) {
373 pr_err("cxl: invalid capp unit id\n"); 404 pr_err("cxl: invalid capp unit id\n");
@@ -377,14 +408,104 @@ static int calc_capp_routing(struct pci_dev *dev, u64 *chipid, u64 *capp_unit_id
377 return 0; 408 return 0;
378} 409}
379 410
380static int init_implementation_adapter_psl_regs(struct cxl *adapter, struct pci_dev *dev) 411static int init_implementation_adapter_regs_psl9(struct cxl *adapter, struct pci_dev *dev)
412{
413 u64 xsl_dsnctl, psl_fircntl;
414 u64 chipid;
415 u32 phb_index;
416 u64 capp_unit_id;
417 int rc;
418
419 rc = calc_capp_routing(dev, &chipid, &phb_index, &capp_unit_id);
420 if (rc)
421 return rc;
422
423 /*
424 * CAPI Identifier bits [0:7]
425 * bit 61:60 MSI bits --> 0
426 * bit 59 TVT selector --> 0
427 */
428
429 /*
430 * Tell XSL where to route data to.
431 * The field chipid should match the PHB CAPI_CMPM register
432 */
433 xsl_dsnctl = ((u64)0x2 << (63-7)); /* Bit 57 */
434 xsl_dsnctl |= (capp_unit_id << (63-15));
435
 436	/* nMMU_ID Defaults to: b'000001001' */
437 xsl_dsnctl |= ((u64)0x09 << (63-28));
438
439 if (cxl_is_power9() && !cpu_has_feature(CPU_FTR_POWER9_DD1)) {
440 /*
441 * Used to identify CAPI packets which should be sorted into
442 * the Non-Blocking queues by the PHB. This field should match
443 * the PHB PBL_NBW_CMPM register
444 * nbwind=0x03, bits [57:58], must include capi indicator.
445 * Not supported on P9 DD1.
446 */
447 xsl_dsnctl |= ((u64)0x03 << (63-47));
448
449 /*
450 * Upper 16b address bits of ASB_Notify messages sent to the
 451	 * system. Need to match the PHB's ASN Compare/Mask Register.
452 * Not supported on P9 DD1.
453 */
454 xsl_dsnctl |= ((u64)0x04 << (63-55));
455 }
456
457 cxl_p1_write(adapter, CXL_XSL9_DSNCTL, xsl_dsnctl);
458
459 /* Set fir_cntl to recommended value for production env */
460 psl_fircntl = (0x2ULL << (63-3)); /* ce_report */
461 psl_fircntl |= (0x1ULL << (63-6)); /* FIR_report */
462 psl_fircntl |= 0x1ULL; /* ce_thresh */
463 cxl_p1_write(adapter, CXL_PSL9_FIR_CNTL, psl_fircntl);
464
465 /* vccredits=0x1 pcklat=0x4 */
466 cxl_p1_write(adapter, CXL_PSL9_DSNDCTL, 0x0000000000001810ULL);
467
468 /*
469 * For debugging with trace arrays.
470 * Configure RX trace 0 segmented mode.
471 * Configure CT trace 0 segmented mode.
472 * Configure LA0 trace 0 segmented mode.
473 * Configure LA1 trace 0 segmented mode.
474 */
475 cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8040800080000000ULL);
476 cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8040800080000003ULL);
477 cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8040800080000005ULL);
478 cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8040800080000006ULL);
479
480 /*
481 * A response to an ASB_Notify request is returned by the
482 * system as an MMIO write to the address defined in
483 * the PSL_TNR_ADDR register
484 */
485 /* PSL_TNR_ADDR */
486
487 /* NORST */
488 cxl_p1_write(adapter, CXL_PSL9_DEBUG, 0x8000000000000000ULL);
489
490 /* allocate the apc machines */
491 cxl_p1_write(adapter, CXL_PSL9_APCDEDTYPE, 0x40000003FFFF0000ULL);
492
493 /* Disable vc dd1 fix */
494 if ((cxl_is_power9() && cpu_has_feature(CPU_FTR_POWER9_DD1)))
495 cxl_p1_write(adapter, CXL_PSL9_GP_CT, 0x0400000000000001ULL);
496
497 return 0;
498}
499
500static int init_implementation_adapter_regs_psl8(struct cxl *adapter, struct pci_dev *dev)
381{ 501{
382 u64 psl_dsnctl, psl_fircntl; 502 u64 psl_dsnctl, psl_fircntl;
383 u64 chipid; 503 u64 chipid;
504 u32 phb_index;
384 u64 capp_unit_id; 505 u64 capp_unit_id;
385 int rc; 506 int rc;
386 507
387 rc = calc_capp_routing(dev, &chipid, &capp_unit_id); 508 rc = calc_capp_routing(dev, &chipid, &phb_index, &capp_unit_id);
388 if (rc) 509 if (rc)
389 return rc; 510 return rc;
390 511
@@ -409,14 +530,15 @@ static int init_implementation_adapter_psl_regs(struct cxl *adapter, struct pci_
409 return 0; 530 return 0;
410} 531}
411 532
412static int init_implementation_adapter_xsl_regs(struct cxl *adapter, struct pci_dev *dev) 533static int init_implementation_adapter_regs_xsl(struct cxl *adapter, struct pci_dev *dev)
413{ 534{
414 u64 xsl_dsnctl; 535 u64 xsl_dsnctl;
415 u64 chipid; 536 u64 chipid;
537 u32 phb_index;
416 u64 capp_unit_id; 538 u64 capp_unit_id;
417 int rc; 539 int rc;
418 540
419 rc = calc_capp_routing(dev, &chipid, &capp_unit_id); 541 rc = calc_capp_routing(dev, &chipid, &phb_index, &capp_unit_id);
420 if (rc) 542 if (rc)
421 return rc; 543 return rc;
422 544
@@ -434,7 +556,13 @@ static int init_implementation_adapter_xsl_regs(struct cxl *adapter, struct pci_
434/* For the PSL this is a multiple for 0 < n <= 7: */ 556/* For the PSL this is a multiple for 0 < n <= 7: */
435#define PSL_2048_250MHZ_CYCLES 1 557#define PSL_2048_250MHZ_CYCLES 1
436 558
437static void write_timebase_ctrl_psl(struct cxl *adapter) 559static void write_timebase_ctrl_psl9(struct cxl *adapter)
560{
561 cxl_p1_write(adapter, CXL_PSL9_TB_CTLSTAT,
562 TBSYNC_CNT(2 * PSL_2048_250MHZ_CYCLES));
563}
564
565static void write_timebase_ctrl_psl8(struct cxl *adapter)
438{ 566{
439 cxl_p1_write(adapter, CXL_PSL_TB_CTLSTAT, 567 cxl_p1_write(adapter, CXL_PSL_TB_CTLSTAT,
440 TBSYNC_CNT(2 * PSL_2048_250MHZ_CYCLES)); 568 TBSYNC_CNT(2 * PSL_2048_250MHZ_CYCLES));
@@ -455,7 +583,12 @@ static void write_timebase_ctrl_xsl(struct cxl *adapter)
455 TBSYNC_CNT(XSL_4000_CLOCKS)); 583 TBSYNC_CNT(XSL_4000_CLOCKS));
456} 584}
457 585
458static u64 timebase_read_psl(struct cxl *adapter) 586static u64 timebase_read_psl9(struct cxl *adapter)
587{
588 return cxl_p1_read(adapter, CXL_PSL9_Timebase);
589}
590
591static u64 timebase_read_psl8(struct cxl *adapter)
459{ 592{
460 return cxl_p1_read(adapter, CXL_PSL_Timebase); 593 return cxl_p1_read(adapter, CXL_PSL_Timebase);
461} 594}
@@ -513,7 +646,12 @@ static void cxl_setup_psl_timebase(struct cxl *adapter, struct pci_dev *dev)
513 return; 646 return;
514} 647}
515 648
516static int init_implementation_afu_psl_regs(struct cxl_afu *afu) 649static int init_implementation_afu_regs_psl9(struct cxl_afu *afu)
650{
651 return 0;
652}
653
654static int init_implementation_afu_regs_psl8(struct cxl_afu *afu)
517{ 655{
518 /* read/write masks for this slice */ 656 /* read/write masks for this slice */
519 cxl_p1n_write(afu, CXL_PSL_APCALLOC_A, 0xFFFFFFFEFEFEFEFEULL); 657 cxl_p1n_write(afu, CXL_PSL_APCALLOC_A, 0xFFFFFFFEFEFEFEFEULL);
@@ -611,7 +749,7 @@ static int setup_cxl_bars(struct pci_dev *dev)
611 /* 749 /*
612 * BAR 4/5 has a special meaning for CXL and must be programmed with a 750 * BAR 4/5 has a special meaning for CXL and must be programmed with a
613 * special value corresponding to the CXL protocol address range. 751 * special value corresponding to the CXL protocol address range.
614 * For POWER 8 that means bits 48:49 must be set to 10 752 * For POWER 8/9 that means bits 48:49 must be set to 10
615 */ 753 */
616 pci_write_config_dword(dev, PCI_BASE_ADDRESS_4, 0x00000000); 754 pci_write_config_dword(dev, PCI_BASE_ADDRESS_4, 0x00000000);
617 pci_write_config_dword(dev, PCI_BASE_ADDRESS_5, 0x00020000); 755 pci_write_config_dword(dev, PCI_BASE_ADDRESS_5, 0x00020000);
@@ -968,7 +1106,7 @@ static int cxl_afu_descriptor_looks_ok(struct cxl_afu *afu)
968 } 1106 }
969 1107
970 if (afu->pp_psa && (afu->pp_size < PAGE_SIZE)) 1108 if (afu->pp_psa && (afu->pp_size < PAGE_SIZE))
971 dev_warn(&afu->dev, "AFU uses < PAGE_SIZE per-process PSA!"); 1109 dev_warn(&afu->dev, "AFU uses pp_size(%#016llx) < PAGE_SIZE per-process PSA!\n", afu->pp_size);
972 1110
973 for (i = 0; i < afu->crs_num; i++) { 1111 for (i = 0; i < afu->crs_num; i++) {
974 rc = cxl_ops->afu_cr_read32(afu, i, 0, &val); 1112 rc = cxl_ops->afu_cr_read32(afu, i, 0, &val);
@@ -996,7 +1134,53 @@ static int cxl_afu_descriptor_looks_ok(struct cxl_afu *afu)
996 return 0; 1134 return 0;
997} 1135}
998 1136
999static int sanitise_afu_regs(struct cxl_afu *afu) 1137static int sanitise_afu_regs_psl9(struct cxl_afu *afu)
1138{
1139 u64 reg;
1140
1141 /*
1142 * Clear out any regs that contain either an IVTE or address or may be
1143 * waiting on an acknowledgment to try to be a bit safer as we bring
1144 * it online
1145 */
1146 reg = cxl_p2n_read(afu, CXL_AFU_Cntl_An);
1147 if ((reg & CXL_AFU_Cntl_An_ES_MASK) != CXL_AFU_Cntl_An_ES_Disabled) {
1148 dev_warn(&afu->dev, "WARNING: AFU was not disabled: %#016llx\n", reg);
1149 if (cxl_ops->afu_reset(afu))
1150 return -EIO;
1151 if (cxl_afu_disable(afu))
1152 return -EIO;
1153 if (cxl_psl_purge(afu))
1154 return -EIO;
1155 }
1156 cxl_p1n_write(afu, CXL_PSL_SPAP_An, 0x0000000000000000);
1157 cxl_p1n_write(afu, CXL_PSL_AMBAR_An, 0x0000000000000000);
1158 reg = cxl_p2n_read(afu, CXL_PSL_DSISR_An);
1159 if (reg) {
1160 dev_warn(&afu->dev, "AFU had pending DSISR: %#016llx\n", reg);
1161 if (reg & CXL_PSL9_DSISR_An_TF)
1162 cxl_p2n_write(afu, CXL_PSL_TFC_An, CXL_PSL_TFC_An_AE);
1163 else
1164 cxl_p2n_write(afu, CXL_PSL_TFC_An, CXL_PSL_TFC_An_A);
1165 }
1166 if (afu->adapter->native->sl_ops->register_serr_irq) {
1167 reg = cxl_p1n_read(afu, CXL_PSL_SERR_An);
1168 if (reg) {
1169 if (reg & ~0x000000007fffffff)
1170 dev_warn(&afu->dev, "AFU had pending SERR: %#016llx\n", reg);
1171 cxl_p1n_write(afu, CXL_PSL_SERR_An, reg & ~0xffff);
1172 }
1173 }
1174 reg = cxl_p2n_read(afu, CXL_PSL_ErrStat_An);
1175 if (reg) {
1176 dev_warn(&afu->dev, "AFU had pending error status: %#016llx\n", reg);
1177 cxl_p2n_write(afu, CXL_PSL_ErrStat_An, reg);
1178 }
1179
1180 return 0;
1181}
1182
1183static int sanitise_afu_regs_psl8(struct cxl_afu *afu)
1000{ 1184{
1001 u64 reg; 1185 u64 reg;
1002 1186
@@ -1102,8 +1286,11 @@ static int pci_configure_afu(struct cxl_afu *afu, struct cxl *adapter, struct pc
1102 if ((rc = pci_map_slice_regs(afu, adapter, dev))) 1286 if ((rc = pci_map_slice_regs(afu, adapter, dev)))
1103 return rc; 1287 return rc;
1104 1288
1105 if ((rc = sanitise_afu_regs(afu))) 1289 if (adapter->native->sl_ops->sanitise_afu_regs) {
1106 goto err1; 1290 rc = adapter->native->sl_ops->sanitise_afu_regs(afu);
1291 if (rc)
1292 goto err1;
1293 }
1107 1294
1108 /* We need to reset the AFU before we can read the AFU descriptor */ 1295 /* We need to reset the AFU before we can read the AFU descriptor */
1109 if ((rc = cxl_ops->afu_reset(afu))) 1296 if ((rc = cxl_ops->afu_reset(afu)))
@@ -1248,8 +1435,13 @@ int cxl_pci_reset(struct cxl *adapter)
1248 1435
1249 dev_info(&dev->dev, "CXL reset\n"); 1436 dev_info(&dev->dev, "CXL reset\n");
1250 1437
1251 /* the adapter is about to be reset, so ignore errors */ 1438 /*
1252 cxl_data_cache_flush(adapter); 1439 * The adapter is about to be reset, so ignore errors.
1440 * Not supported on P9 DD1
1441 */
1442 if ((cxl_is_power8()) ||
1443 ((cxl_is_power9() && !cpu_has_feature(CPU_FTR_POWER9_DD1))))
1444 cxl_data_cache_flush(adapter);
1253 1445
1254 /* pcie_warm_reset requests a fundamental pci reset which includes a 1446 /* pcie_warm_reset requests a fundamental pci reset which includes a
1255 * PERST assert/deassert. PERST triggers a loading of the image 1447 * PERST assert/deassert. PERST triggers a loading of the image
@@ -1332,6 +1524,7 @@ static int cxl_read_vsec(struct cxl *adapter, struct pci_dev *dev)
1332 CXL_READ_VSEC_IMAGE_STATE(dev, vsec, &image_state); 1524 CXL_READ_VSEC_IMAGE_STATE(dev, vsec, &image_state);
1333 adapter->user_image_loaded = !!(image_state & CXL_VSEC_USER_IMAGE_LOADED); 1525 adapter->user_image_loaded = !!(image_state & CXL_VSEC_USER_IMAGE_LOADED);
1334 adapter->perst_select_user = !!(image_state & CXL_VSEC_USER_IMAGE_LOADED); 1526 adapter->perst_select_user = !!(image_state & CXL_VSEC_USER_IMAGE_LOADED);
1527 adapter->perst_loads_image = !!(image_state & CXL_VSEC_PERST_LOADS_IMAGE);
1335 1528
1336 CXL_READ_VSEC_NAFUS(dev, vsec, &adapter->slices); 1529 CXL_READ_VSEC_NAFUS(dev, vsec, &adapter->slices);
1337 CXL_READ_VSEC_AFU_DESC_OFF(dev, vsec, &afu_desc_off); 1530 CXL_READ_VSEC_AFU_DESC_OFF(dev, vsec, &afu_desc_off);
@@ -1378,6 +1571,17 @@ static void cxl_fixup_malformed_tlp(struct cxl *adapter, struct pci_dev *dev)
1378 pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, data); 1571 pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, data);
1379} 1572}
1380 1573
1574static bool cxl_compatible_caia_version(struct cxl *adapter)
1575{
1576 if (cxl_is_power8() && (adapter->caia_major == 1))
1577 return true;
1578
1579 if (cxl_is_power9() && (adapter->caia_major == 2))
1580 return true;
1581
1582 return false;
1583}
1584
1381static int cxl_vsec_looks_ok(struct cxl *adapter, struct pci_dev *dev) 1585static int cxl_vsec_looks_ok(struct cxl *adapter, struct pci_dev *dev)
1382{ 1586{
1383 if (adapter->vsec_status & CXL_STATUS_SECOND_PORT) 1587 if (adapter->vsec_status & CXL_STATUS_SECOND_PORT)
@@ -1388,6 +1592,12 @@ static int cxl_vsec_looks_ok(struct cxl *adapter, struct pci_dev *dev)
1388 return -EINVAL; 1592 return -EINVAL;
1389 } 1593 }
1390 1594
1595 if (!cxl_compatible_caia_version(adapter)) {
1596 dev_info(&dev->dev, "Ignoring card. PSL type is not supported (caia version: %d)\n",
1597 adapter->caia_major);
1598 return -ENODEV;
1599 }
1600
1391 if (!adapter->slices) { 1601 if (!adapter->slices) {
1392 /* Once we support dynamic reprogramming we can use the card if 1602 /* Once we support dynamic reprogramming we can use the card if
1393 * it supports loadable AFUs */ 1603 * it supports loadable AFUs */
@@ -1431,9 +1641,19 @@ static void cxl_release_adapter(struct device *dev)
1431 1641
1432static int sanitise_adapter_regs(struct cxl *adapter) 1642static int sanitise_adapter_regs(struct cxl *adapter)
1433{ 1643{
1644 int rc = 0;
1645
1434 /* Clear PSL tberror bit by writing 1 to it */ 1646 /* Clear PSL tberror bit by writing 1 to it */
1435 cxl_p1_write(adapter, CXL_PSL_ErrIVTE, CXL_PSL_ErrIVTE_tberror); 1647 cxl_p1_write(adapter, CXL_PSL_ErrIVTE, CXL_PSL_ErrIVTE_tberror);
1436 return cxl_tlb_slb_invalidate(adapter); 1648
1649 if (adapter->native->sl_ops->invalidate_all) {
1650 /* do not invalidate ERAT entries when not reloading on PERST */
1651 if (cxl_is_power9() && (adapter->perst_loads_image))
1652 return 0;
1653 rc = adapter->native->sl_ops->invalidate_all(adapter);
1654 }
1655
1656 return rc;
1437} 1657}
1438 1658
1439/* This should contain *only* operations that can safely be done in 1659/* This should contain *only* operations that can safely be done in
@@ -1496,8 +1716,6 @@ static int cxl_configure_adapter(struct cxl *adapter, struct pci_dev *dev)
1496 if ((rc = cxl_native_register_psl_err_irq(adapter))) 1716 if ((rc = cxl_native_register_psl_err_irq(adapter)))
1497 goto err; 1717 goto err;
1498 1718
1499 /* Release the context lock as adapter is configured */
1500 cxl_adapter_context_unlock(adapter);
1501 return 0; 1719 return 0;
1502 1720
1503err: 1721err:
@@ -1516,25 +1734,65 @@ static void cxl_deconfigure_adapter(struct cxl *adapter)
1516 pci_disable_device(pdev); 1734 pci_disable_device(pdev);
1517} 1735}
1518 1736
1519static const struct cxl_service_layer_ops psl_ops = { 1737static const struct cxl_service_layer_ops psl9_ops = {
1520 .adapter_regs_init = init_implementation_adapter_psl_regs, 1738 .adapter_regs_init = init_implementation_adapter_regs_psl9,
1521 .afu_regs_init = init_implementation_afu_psl_regs, 1739 .invalidate_all = cxl_invalidate_all_psl9,
1740 .afu_regs_init = init_implementation_afu_regs_psl9,
1741 .sanitise_afu_regs = sanitise_afu_regs_psl9,
1522 .register_serr_irq = cxl_native_register_serr_irq, 1742 .register_serr_irq = cxl_native_register_serr_irq,
1523 .release_serr_irq = cxl_native_release_serr_irq, 1743 .release_serr_irq = cxl_native_release_serr_irq,
1524 .debugfs_add_adapter_sl_regs = cxl_debugfs_add_adapter_psl_regs, 1744 .handle_interrupt = cxl_irq_psl9,
1525 .debugfs_add_afu_sl_regs = cxl_debugfs_add_afu_psl_regs, 1745 .fail_irq = cxl_fail_irq_psl,
1526 .psl_irq_dump_registers = cxl_native_psl_irq_dump_regs, 1746 .activate_dedicated_process = cxl_activate_dedicated_process_psl9,
1747 .attach_afu_directed = cxl_attach_afu_directed_psl9,
1748 .attach_dedicated_process = cxl_attach_dedicated_process_psl9,
1749 .update_dedicated_ivtes = cxl_update_dedicated_ivtes_psl9,
1750 .debugfs_add_adapter_regs = cxl_debugfs_add_adapter_regs_psl9,
1751 .debugfs_add_afu_regs = cxl_debugfs_add_afu_regs_psl9,
1752 .psl_irq_dump_registers = cxl_native_irq_dump_regs_psl9,
1527 .err_irq_dump_registers = cxl_native_err_irq_dump_regs, 1753 .err_irq_dump_registers = cxl_native_err_irq_dump_regs,
1528 .debugfs_stop_trace = cxl_stop_trace, 1754 .debugfs_stop_trace = cxl_stop_trace_psl9,
1529 .write_timebase_ctrl = write_timebase_ctrl_psl, 1755 .write_timebase_ctrl = write_timebase_ctrl_psl9,
1530 .timebase_read = timebase_read_psl, 1756 .timebase_read = timebase_read_psl9,
1757 .capi_mode = OPAL_PHB_CAPI_MODE_CAPI,
1758 .needs_reset_before_disable = true,
1759};
1760
1761static const struct cxl_service_layer_ops psl8_ops = {
1762 .adapter_regs_init = init_implementation_adapter_regs_psl8,
1763 .invalidate_all = cxl_invalidate_all_psl8,
1764 .afu_regs_init = init_implementation_afu_regs_psl8,
1765 .sanitise_afu_regs = sanitise_afu_regs_psl8,
1766 .register_serr_irq = cxl_native_register_serr_irq,
1767 .release_serr_irq = cxl_native_release_serr_irq,
1768 .handle_interrupt = cxl_irq_psl8,
1769 .fail_irq = cxl_fail_irq_psl,
1770 .activate_dedicated_process = cxl_activate_dedicated_process_psl8,
1771 .attach_afu_directed = cxl_attach_afu_directed_psl8,
1772 .attach_dedicated_process = cxl_attach_dedicated_process_psl8,
1773 .update_dedicated_ivtes = cxl_update_dedicated_ivtes_psl8,
1774 .debugfs_add_adapter_regs = cxl_debugfs_add_adapter_regs_psl8,
1775 .debugfs_add_afu_regs = cxl_debugfs_add_afu_regs_psl8,
1776 .psl_irq_dump_registers = cxl_native_irq_dump_regs_psl8,
1777 .err_irq_dump_registers = cxl_native_err_irq_dump_regs,
1778 .debugfs_stop_trace = cxl_stop_trace_psl8,
1779 .write_timebase_ctrl = write_timebase_ctrl_psl8,
1780 .timebase_read = timebase_read_psl8,
1531 .capi_mode = OPAL_PHB_CAPI_MODE_CAPI, 1781 .capi_mode = OPAL_PHB_CAPI_MODE_CAPI,
1532 .needs_reset_before_disable = true, 1782 .needs_reset_before_disable = true,
1533}; 1783};
1534 1784
1535static const struct cxl_service_layer_ops xsl_ops = { 1785static const struct cxl_service_layer_ops xsl_ops = {
1536 .adapter_regs_init = init_implementation_adapter_xsl_regs, 1786 .adapter_regs_init = init_implementation_adapter_regs_xsl,
1537 .debugfs_add_adapter_sl_regs = cxl_debugfs_add_adapter_xsl_regs, 1787 .invalidate_all = cxl_invalidate_all_psl8,
1788 .sanitise_afu_regs = sanitise_afu_regs_psl8,
1789 .handle_interrupt = cxl_irq_psl8,
1790 .fail_irq = cxl_fail_irq_psl,
1791 .activate_dedicated_process = cxl_activate_dedicated_process_psl8,
1792 .attach_afu_directed = cxl_attach_afu_directed_psl8,
1793 .attach_dedicated_process = cxl_attach_dedicated_process_psl8,
1794 .update_dedicated_ivtes = cxl_update_dedicated_ivtes_psl8,
1795 .debugfs_add_adapter_regs = cxl_debugfs_add_adapter_regs_xsl,
1538 .write_timebase_ctrl = write_timebase_ctrl_xsl, 1796 .write_timebase_ctrl = write_timebase_ctrl_xsl,
1539 .timebase_read = timebase_read_xsl, 1797 .timebase_read = timebase_read_xsl,
1540 .capi_mode = OPAL_PHB_CAPI_MODE_DMA, 1798 .capi_mode = OPAL_PHB_CAPI_MODE_DMA,
@@ -1548,8 +1806,13 @@ static void set_sl_ops(struct cxl *adapter, struct pci_dev *dev)
1548 adapter->native->sl_ops = &xsl_ops; 1806 adapter->native->sl_ops = &xsl_ops;
1549 adapter->min_pe = 1; /* Workaround for CX-4 hardware bug */ 1807 adapter->min_pe = 1; /* Workaround for CX-4 hardware bug */
1550 } else { 1808 } else {
1551 dev_info(&dev->dev, "Device uses a PSL\n"); 1809 if (cxl_is_power8()) {
1552 adapter->native->sl_ops = &psl_ops; 1810 dev_info(&dev->dev, "Device uses a PSL8\n");
1811 adapter->native->sl_ops = &psl8_ops;
1812 } else {
1813 dev_info(&dev->dev, "Device uses a PSL9\n");
1814 adapter->native->sl_ops = &psl9_ops;
1815 }
1553 } 1816 }
1554} 1817}
1555 1818
@@ -1596,6 +1859,9 @@ static struct cxl *cxl_pci_init_adapter(struct pci_dev *dev)
1596 if ((rc = cxl_sysfs_adapter_add(adapter))) 1859 if ((rc = cxl_sysfs_adapter_add(adapter)))
1597 goto err_put1; 1860 goto err_put1;
1598 1861
1862 /* Release the context lock as adapter is configured */
1863 cxl_adapter_context_unlock(adapter);
1864
1599 return adapter; 1865 return adapter;
1600 1866
1601err_put1: 1867err_put1:
@@ -1619,8 +1885,13 @@ static void cxl_pci_remove_adapter(struct cxl *adapter)
1619 cxl_sysfs_adapter_remove(adapter); 1885 cxl_sysfs_adapter_remove(adapter);
1620 cxl_debugfs_adapter_remove(adapter); 1886 cxl_debugfs_adapter_remove(adapter);
1621 1887
1622 /* Flush adapter datacache as its about to be removed */ 1888 /*
1623	/* Flush adapter datacache as its about to be removed */		1889	 * Flush adapter datacache as it's about to be removed.
1890 * Not supported on P9 DD1.
1891 */
1892 if ((cxl_is_power8()) ||
1893 ((cxl_is_power9() && !cpu_has_feature(CPU_FTR_POWER9_DD1))))
1894 cxl_data_cache_flush(adapter);
1624 1895
1625 cxl_deconfigure_adapter(adapter); 1896 cxl_deconfigure_adapter(adapter);
1626 1897
@@ -1704,6 +1975,11 @@ static int cxl_probe(struct pci_dev *dev, const struct pci_device_id *id)
1704 return -ENODEV; 1975 return -ENODEV;
1705 } 1976 }
1706 1977
1978 if (cxl_is_power9() && !radix_enabled()) {
1979 dev_info(&dev->dev, "Only Radix mode supported\n");
1980 return -ENODEV;
1981 }
1982
1707 if (cxl_verbose) 1983 if (cxl_verbose)
1708 dump_cxl_config_space(dev); 1984 dump_cxl_config_space(dev);
1709 1985
@@ -1781,7 +2057,7 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
1781{ 2057{
1782 struct cxl *adapter = pci_get_drvdata(pdev); 2058 struct cxl *adapter = pci_get_drvdata(pdev);
1783 struct cxl_afu *afu; 2059 struct cxl_afu *afu;
1784 pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET; 2060 pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET, afu_result;
1785 int i; 2061 int i;
1786 2062
1787 /* At this point, we could still have an interrupt pending. 2063 /* At this point, we could still have an interrupt pending.
@@ -1885,16 +2161,26 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
1885 for (i = 0; i < adapter->slices; i++) { 2161 for (i = 0; i < adapter->slices; i++) {
1886 afu = adapter->afu[i]; 2162 afu = adapter->afu[i];
1887 2163
1888 result = cxl_vphb_error_detected(afu, state); 2164 afu_result = cxl_vphb_error_detected(afu, state);
1889
1890 /* Only continue if everyone agrees on NEED_RESET */
1891 if (result != PCI_ERS_RESULT_NEED_RESET)
1892 return result;
1893 2165
1894 cxl_context_detach_all(afu); 2166 cxl_context_detach_all(afu);
1895 cxl_ops->afu_deactivate_mode(afu, afu->current_mode); 2167 cxl_ops->afu_deactivate_mode(afu, afu->current_mode);
1896 pci_deconfigure_afu(afu); 2168 pci_deconfigure_afu(afu);
2169
2170 /* Disconnect trumps all, NONE trumps NEED_RESET */
2171 if (afu_result == PCI_ERS_RESULT_DISCONNECT)
2172 result = PCI_ERS_RESULT_DISCONNECT;
2173 else if ((afu_result == PCI_ERS_RESULT_NONE) &&
2174 (result == PCI_ERS_RESULT_NEED_RESET))
2175 result = PCI_ERS_RESULT_NONE;
1897 } 2176 }
2177
2178 /* should take the context lock here */
2179 if (cxl_adapter_context_lock(adapter) != 0)
2180 dev_warn(&adapter->dev,
2181 "Couldn't take context lock with %d active-contexts\n",
2182 atomic_read(&adapter->contexts_num));
2183
1898 cxl_deconfigure_adapter(adapter); 2184 cxl_deconfigure_adapter(adapter);
1899 2185
1900 return result; 2186 return result;
@@ -1913,6 +2199,13 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev)
1913 if (cxl_configure_adapter(adapter, pdev)) 2199 if (cxl_configure_adapter(adapter, pdev))
1914 goto err; 2200 goto err;
1915 2201
2202 /*
2203 * Unlock context activation for the adapter. Ideally this should be
2204 * done in cxl_pci_resume but cxlflash module tries to activate the
2205 * master context as part of slot_reset callback.
2206 */
2207 cxl_adapter_context_unlock(adapter);
2208
1916 for (i = 0; i < adapter->slices; i++) { 2209 for (i = 0; i < adapter->slices; i++) {
1917 afu = adapter->afu[i]; 2210 afu = adapter->afu[i];
1918 2211
diff --git a/drivers/misc/cxl/trace.h b/drivers/misc/cxl/trace.h
index 751d6119683e..b8e300af0e55 100644
--- a/drivers/misc/cxl/trace.h
+++ b/drivers/misc/cxl/trace.h
@@ -17,6 +17,15 @@
17 17
18#include "cxl.h" 18#include "cxl.h"
19 19
20#define dsisr_psl9_flags(flags) \
21 __print_flags(flags, "|", \
22 { CXL_PSL9_DSISR_An_CO_MASK, "FR" }, \
23 { CXL_PSL9_DSISR_An_TF, "TF" }, \
24 { CXL_PSL9_DSISR_An_PE, "PE" }, \
25 { CXL_PSL9_DSISR_An_AE, "AE" }, \
26 { CXL_PSL9_DSISR_An_OC, "OC" }, \
27 { CXL_PSL9_DSISR_An_S, "S" })
28
20#define DSISR_FLAGS \ 29#define DSISR_FLAGS \
21 { CXL_PSL_DSISR_An_DS, "DS" }, \ 30 { CXL_PSL_DSISR_An_DS, "DS" }, \
22 { CXL_PSL_DSISR_An_DM, "DM" }, \ 31 { CXL_PSL_DSISR_An_DM, "DM" }, \
@@ -154,6 +163,40 @@ TRACE_EVENT(cxl_afu_irq,
154 ) 163 )
155); 164);
156 165
166TRACE_EVENT(cxl_psl9_irq,
167 TP_PROTO(struct cxl_context *ctx, int irq, u64 dsisr, u64 dar),
168
169 TP_ARGS(ctx, irq, dsisr, dar),
170
171 TP_STRUCT__entry(
172 __field(u8, card)
173 __field(u8, afu)
174 __field(u16, pe)
175 __field(int, irq)
176 __field(u64, dsisr)
177 __field(u64, dar)
178 ),
179
180 TP_fast_assign(
181 __entry->card = ctx->afu->adapter->adapter_num;
182 __entry->afu = ctx->afu->slice;
183 __entry->pe = ctx->pe;
184 __entry->irq = irq;
185 __entry->dsisr = dsisr;
186 __entry->dar = dar;
187 ),
188
189 TP_printk("afu%i.%i pe=%i irq=%i dsisr=0x%016llx dsisr=%s dar=0x%016llx",
190 __entry->card,
191 __entry->afu,
192 __entry->pe,
193 __entry->irq,
194 __entry->dsisr,
195 dsisr_psl9_flags(__entry->dsisr),
196 __entry->dar
197 )
198);
199
157TRACE_EVENT(cxl_psl_irq, 200TRACE_EVENT(cxl_psl_irq,
158 TP_PROTO(struct cxl_context *ctx, int irq, u64 dsisr, u64 dar), 201 TP_PROTO(struct cxl_context *ctx, int irq, u64 dsisr, u64 dar),
159 202
diff --git a/drivers/of/base.c b/drivers/of/base.c
index d7c4629a3a2d..0ea16bd3c8f1 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -1213,6 +1213,37 @@ int of_property_read_u32_index(const struct device_node *np,
1213EXPORT_SYMBOL_GPL(of_property_read_u32_index); 1213EXPORT_SYMBOL_GPL(of_property_read_u32_index);
1214 1214
1215/** 1215/**
1216 * of_property_read_u64_index - Find and read a u64 from a multi-value property.
1217 *
1218 * @np: device node from which the property value is to be read.
1219 * @propname: name of the property to be searched.
1220 * @index: index of the u64 in the list of values
1221 * @out_value: pointer to return value, modified only if no error.
1222 *
	1223 * Search for a property in a device node and read the nth 64-bit value from
	1224 * it. Returns 0 on success, -EINVAL if the property does not exist,
	1225 * -ENODATA if the property does not have a value, and -EOVERFLOW if the
1226 * property data isn't large enough.
1227 *
1228 * The out_value is modified only if a valid u64 value can be decoded.
1229 */
1230int of_property_read_u64_index(const struct device_node *np,
1231 const char *propname,
1232 u32 index, u64 *out_value)
1233{
1234 const u64 *val = of_find_property_value_of_size(np, propname,
1235 ((index + 1) * sizeof(*out_value)),
1236 0, NULL);
1237
1238 if (IS_ERR(val))
1239 return PTR_ERR(val);
1240
1241 *out_value = be64_to_cpup(((__be64 *)val) + index);
1242 return 0;
1243}
1244EXPORT_SYMBOL_GPL(of_property_read_u64_index);
1245
1246/**
1216 * of_property_read_variable_u8_array - Find and read an array of u8 from a 1247 * of_property_read_variable_u8_array - Find and read an array of u8 from a
1217 * property, with bounds on the minimum and maximum array size. 1248 * property, with bounds on the minimum and maximum array size.
1218 * 1249 *
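
The new of_property_read_u64_index() helper added above mirrors its u32 counterpart. A short usage sketch — the "example,ranges" property name and the wrapper function are hypothetical; only the helper's signature comes from the patch:

	#include <linux/of.h>

	/* Read the second 64-bit cell of a hypothetical "example,ranges" property. */
	static u64 demo_read_second_range(struct device_node *np)
	{
		u64 val;

		if (of_property_read_u64_index(np, "example,ranges", 1, &val))
			return 0;	/* property missing, too short, or unreadable */
		return val;
	}

Like of_property_read_u32_index(), the value is only written on success, and the cells are decoded from big-endian device-tree representation.
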
diff --git a/drivers/pcmcia/electra_cf.c b/drivers/pcmcia/electra_cf.c
index 4d7bc3f4124a..c6fe2a4a7a6a 100644
--- a/drivers/pcmcia/electra_cf.c
+++ b/drivers/pcmcia/electra_cf.c
@@ -207,7 +207,7 @@ static int electra_cf_probe(struct platform_device *ofdev)
207 return -ENOMEM; 207 return -ENOMEM;
208 208
209 setup_timer(&cf->timer, electra_cf_timer, (unsigned long)cf); 209 setup_timer(&cf->timer, electra_cf_timer, (unsigned long)cf);
210 cf->irq = NO_IRQ; 210 cf->irq = 0;
211 211
212 cf->ofdev = ofdev; 212 cf->ofdev = ofdev;
213 cf->mem_phys = mem.start; 213 cf->mem_phys = mem.start;
@@ -313,7 +313,7 @@ fail3:
313fail2: 313fail2:
314 release_mem_region(cf->mem_phys, cf->mem_size); 314 release_mem_region(cf->mem_phys, cf->mem_size);
315fail1: 315fail1:
316 if (cf->irq != NO_IRQ) 316 if (cf->irq)
317 free_irq(cf->irq, cf); 317 free_irq(cf->irq, cf);
318 318
319 if (cf->io_virt) 319 if (cf->io_virt)
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 78dca1aa6410..63112c36ab2d 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -685,7 +685,7 @@ static void tce_iommu_free_table(struct tce_container *container,
685 unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT; 685 unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
686 686
687 tce_iommu_userspace_view_free(tbl, container->mm); 687 tce_iommu_userspace_view_free(tbl, container->mm);
688 tbl->it_ops->free(tbl); 688 iommu_tce_table_put(tbl);
689 decrement_locked_vm(container->mm, pages); 689 decrement_locked_vm(container->mm, pages);
690} 690}
691 691
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 47e4da5b4fa2..30f90c1a0aaf 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -381,6 +381,7 @@ static inline struct kprobe_ctlblk *get_kprobe_ctlblk(void)
381 return this_cpu_ptr(&kprobe_ctlblk); 381 return this_cpu_ptr(&kprobe_ctlblk);
382} 382}
383 383
384kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset);
384int register_kprobe(struct kprobe *p); 385int register_kprobe(struct kprobe *p);
385void unregister_kprobe(struct kprobe *p); 386void unregister_kprobe(struct kprobe *p);
386int register_kprobes(struct kprobe **kps, int num); 387int register_kprobes(struct kprobe **kps, int num);
diff --git a/include/linux/of.h b/include/linux/of.h
index e5d4225fda35..50fcdb54087f 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -294,6 +294,9 @@ extern int of_property_count_elems_of_size(const struct device_node *np,
294extern int of_property_read_u32_index(const struct device_node *np, 294extern int of_property_read_u32_index(const struct device_node *np,
295 const char *propname, 295 const char *propname,
296 u32 index, u32 *out_value); 296 u32 index, u32 *out_value);
297extern int of_property_read_u64_index(const struct device_node *np,
298 const char *propname,
299 u32 index, u64 *out_value);
297extern int of_property_read_variable_u8_array(const struct device_node *np, 300extern int of_property_read_variable_u8_array(const struct device_node *np,
298 const char *propname, u8 *out_values, 301 const char *propname, u8 *out_values,
299 size_t sz_min, size_t sz_max); 302 size_t sz_min, size_t sz_max);
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index d09a9cd021b1..b1c0b187acfe 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -922,6 +922,7 @@ enum perf_callchain_context {
922#define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */ 922#define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */
923#define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */ 923#define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */
924 924
925#if defined(__LITTLE_ENDIAN_BITFIELD)
925union perf_mem_data_src { 926union perf_mem_data_src {
926 __u64 val; 927 __u64 val;
927 struct { 928 struct {
@@ -933,6 +934,21 @@ union perf_mem_data_src {
933 mem_rsvd:31; 934 mem_rsvd:31;
934 }; 935 };
935}; 936};
937#elif defined(__BIG_ENDIAN_BITFIELD)
938union perf_mem_data_src {
939 __u64 val;
940 struct {
941 __u64 mem_rsvd:31,
942 mem_dtlb:7, /* tlb access */
943 mem_lock:2, /* lock instr */
944 mem_snoop:5, /* snoop mode */
945 mem_lvl:14, /* memory hierarchy level */
946 mem_op:5; /* type of opcode */
947 };
948};
949#else
950#error "Unknown endianness"
951#endif
936 952
937/* type of opcode (load/store/prefetch,code) */ 953/* type of opcode (load/store/prefetch,code) */
938#define PERF_MEM_OP_NA 0x01 /* not available */ 954#define PERF_MEM_OP_NA 0x01 /* not available */
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index d733479a10ee..7367e0ec6f81 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -58,15 +58,6 @@
58#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) 58#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
59 59
60 60
61/*
62 * Some oddball architectures like 64bit powerpc have function descriptors
63 * so this must be overridable.
64 */
65#ifndef kprobe_lookup_name
66#define kprobe_lookup_name(name, addr) \
67 addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name)))
68#endif
69
70static int kprobes_initialized; 61static int kprobes_initialized;
71static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; 62static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
72static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; 63static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
@@ -81,6 +72,12 @@ static struct {
81 raw_spinlock_t lock ____cacheline_aligned_in_smp; 72 raw_spinlock_t lock ____cacheline_aligned_in_smp;
82} kretprobe_table_locks[KPROBE_TABLE_SIZE]; 73} kretprobe_table_locks[KPROBE_TABLE_SIZE];
83 74
75kprobe_opcode_t * __weak kprobe_lookup_name(const char *name,
76 unsigned int __unused)
77{
78 return ((kprobe_opcode_t *)(kallsyms_lookup_name(name)));
79}
80
84static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) 81static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
85{ 82{
86 return &(kretprobe_table_locks[hash].lock); 83 return &(kretprobe_table_locks[hash].lock);
@@ -746,13 +743,20 @@ static void kill_optimized_kprobe(struct kprobe *p)
746 arch_remove_optimized_kprobe(op); 743 arch_remove_optimized_kprobe(op);
747} 744}
748 745
746static inline
747void __prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
748{
749 if (!kprobe_ftrace(p))
750 arch_prepare_optimized_kprobe(op, p);
751}
752
749/* Try to prepare optimized instructions */ 753/* Try to prepare optimized instructions */
750static void prepare_optimized_kprobe(struct kprobe *p) 754static void prepare_optimized_kprobe(struct kprobe *p)
751{ 755{
752 struct optimized_kprobe *op; 756 struct optimized_kprobe *op;
753 757
754 op = container_of(p, struct optimized_kprobe, kp); 758 op = container_of(p, struct optimized_kprobe, kp);
755 arch_prepare_optimized_kprobe(op, p); 759 __prepare_optimized_kprobe(op, p);
756} 760}
757 761
758/* Allocate new optimized_kprobe and try to prepare optimized instructions */ 762/* Allocate new optimized_kprobe and try to prepare optimized instructions */
@@ -766,7 +770,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
766 770
767 INIT_LIST_HEAD(&op->list); 771 INIT_LIST_HEAD(&op->list);
768 op->kp.addr = p->addr; 772 op->kp.addr = p->addr;
769 arch_prepare_optimized_kprobe(op, p); 773 __prepare_optimized_kprobe(op, p);
770 774
771 return &op->kp; 775 return &op->kp;
772} 776}
@@ -1398,7 +1402,7 @@ static kprobe_opcode_t *_kprobe_addr(kprobe_opcode_t *addr,
1398 goto invalid; 1402 goto invalid;
1399 1403
1400 if (symbol_name) { 1404 if (symbol_name) {
1401 kprobe_lookup_name(symbol_name, addr); 1405 addr = kprobe_lookup_name(symbol_name, offset);
1402 if (!addr) 1406 if (!addr)
1403 return ERR_PTR(-ENOENT); 1407 return ERR_PTR(-ENOENT);
1404 } 1408 }
@@ -2218,8 +2222,8 @@ static int __init init_kprobes(void)
2218 if (kretprobe_blacklist_size) { 2222 if (kretprobe_blacklist_size) {
2219 /* lookup the function address from its name */ 2223 /* lookup the function address from its name */
2220 for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { 2224 for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
2221 kprobe_lookup_name(kretprobe_blacklist[i].name, 2225 kretprobe_blacklist[i].addr =
2222 kretprobe_blacklist[i].addr); 2226 kprobe_lookup_name(kretprobe_blacklist[i].name, 0);
2223 if (!kretprobe_blacklist[i].addr) 2227 if (!kretprobe_blacklist[i].addr)
2224 printk("kretprobe: lookup failed: %s\n", 2228 printk("kretprobe: lookup failed: %s\n",
2225 kretprobe_blacklist[i].name); 2229 kretprobe_blacklist[i].name);
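
With kprobe_lookup_name() now a __weak function taking (name, offset) instead of a macro, an architecture can supply its own definition. A minimal sketch of such an override, assuming a plain kallsyms lookup is adequate for that architecture — a real port (the function-descriptor case called out in the removed comment, e.g. 64-bit powerpc) would adjust the returned address for descriptors or local entry points:

	#include <linux/kallsyms.h>
	#include <linux/kprobes.h>

	/* Hypothetical arch-specific override of the generic __weak helper. */
	kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset)
	{
		/* offset is unused here; an arch may use it to pick the correct
		 * entry point when the probe lands past the function prologue. */
		return (kprobe_opcode_t *)kallsyms_lookup_name(name);
	}

The generic __weak body added in kernel/kprobes.c above does exactly this, so an override is only needed where the raw kallsyms address is not directly probeable.
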
diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index d09a9cd021b1..b1c0b187acfe 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -922,6 +922,7 @@ enum perf_callchain_context {
922#define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */ 922#define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */
923#define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */ 923#define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */
924 924
925#if defined(__LITTLE_ENDIAN_BITFIELD)
925union perf_mem_data_src { 926union perf_mem_data_src {
926 __u64 val; 927 __u64 val;
927 struct { 928 struct {
@@ -933,6 +934,21 @@ union perf_mem_data_src {
933 mem_rsvd:31; 934 mem_rsvd:31;
934 }; 935 };
935}; 936};
937#elif defined(__BIG_ENDIAN_BITFIELD)
938union perf_mem_data_src {
939 __u64 val;
940 struct {
941 __u64 mem_rsvd:31,
942 mem_dtlb:7, /* tlb access */
943 mem_lock:2, /* lock instr */
944 mem_snoop:5, /* snoop mode */
945 mem_lvl:14, /* memory hierarchy level */
946 mem_op:5; /* type of opcode */
947 };
948};
949#else
950#error "Unknown endianness"
951#endif
936 952
937/* type of opcode (load/store/prefetch,code) */ 953/* type of opcode (load/store/prefetch,code) */
938#define PERF_MEM_OP_NA 0x01 /* not available */ 954#define PERF_MEM_OP_NA 0x01 /* not available */
diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile
index 2132ff8eb4e7..03e1617367d3 100644
--- a/tools/testing/selftests/powerpc/Makefile
+++ b/tools/testing/selftests/powerpc/Makefile
@@ -14,6 +14,7 @@ export CFLAGS
14 14
15SUB_DIRS = alignment \ 15SUB_DIRS = alignment \
16 benchmarks \ 16 benchmarks \
17 cache_shape \
17 copyloops \ 18 copyloops \
18 context_switch \ 19 context_switch \
19 dscr \ 20 dscr \
diff --git a/tools/testing/selftests/powerpc/cache_shape/.gitignore b/tools/testing/selftests/powerpc/cache_shape/.gitignore
new file mode 100644
index 000000000000..ec1848434be5
--- /dev/null
+++ b/tools/testing/selftests/powerpc/cache_shape/.gitignore
@@ -0,0 +1 @@
cache_shape
diff --git a/tools/testing/selftests/powerpc/cache_shape/Makefile b/tools/testing/selftests/powerpc/cache_shape/Makefile
new file mode 100644
index 000000000000..b24485ab30e2
--- /dev/null
+++ b/tools/testing/selftests/powerpc/cache_shape/Makefile
@@ -0,0 +1,10 @@
1TEST_PROGS := cache_shape
2
3all: $(TEST_PROGS)
4
5$(TEST_PROGS): ../harness.c ../utils.c
6
7include ../../lib.mk
8
9clean:
10 rm -f $(TEST_PROGS) *.o
diff --git a/tools/testing/selftests/powerpc/cache_shape/cache_shape.c b/tools/testing/selftests/powerpc/cache_shape/cache_shape.c
new file mode 100644
index 000000000000..29ec07eba7f9
--- /dev/null
+++ b/tools/testing/selftests/powerpc/cache_shape/cache_shape.c
@@ -0,0 +1,125 @@
1/*
2 * Copyright 2017, Michael Ellerman, IBM Corp.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 */
9
10#include <elf.h>
11#include <errno.h>
12#include <fcntl.h>
13#include <link.h>
14#include <stdio.h>
15#include <stdlib.h>
16#include <string.h>
17#include <sys/stat.h>
18#include <sys/types.h>
19#include <sys/wait.h>
20#include <unistd.h>
21
22#include "utils.h"
23
24#ifndef AT_L1I_CACHESIZE
25#define AT_L1I_CACHESIZE 40
26#define AT_L1I_CACHEGEOMETRY 41
27#define AT_L1D_CACHESIZE 42
28#define AT_L1D_CACHEGEOMETRY 43
29#define AT_L2_CACHESIZE 44
30#define AT_L2_CACHEGEOMETRY 45
31#define AT_L3_CACHESIZE 46
32#define AT_L3_CACHEGEOMETRY 47
33#endif
34
35static void print_size(const char *label, uint32_t val)
36{
37 printf("%s cache size: %#10x %10dB %10dK\n", label, val, val, val / 1024);
38}
39
40static void print_geo(const char *label, uint32_t val)
41{
42 uint16_t assoc;
43
44 printf("%s line size: %#10x ", label, val & 0xFFFF);
45
46 assoc = val >> 16;
47 if (assoc)
48 printf("%u-way", assoc);
49 else
50 printf("fully");
51
52 printf(" associative\n");
53}
54
55static int test_cache_shape()
56{
57 static char buffer[4096];
58 ElfW(auxv_t) *p;
59 int found;
60
61 FAIL_IF(read_auxv(buffer, sizeof(buffer)));
62
63 found = 0;
64
65 p = find_auxv_entry(AT_L1I_CACHESIZE, buffer);
66 if (p) {
67 found++;
68 print_size("L1I ", (uint32_t)p->a_un.a_val);
69 }
70
71 p = find_auxv_entry(AT_L1I_CACHEGEOMETRY, buffer);
72 if (p) {
73 found++;
74 print_geo("L1I ", (uint32_t)p->a_un.a_val);
75 }
76
77 p = find_auxv_entry(AT_L1D_CACHESIZE, buffer);
78 if (p) {
79 found++;
80 print_size("L1D ", (uint32_t)p->a_un.a_val);
81 }
82
83 p = find_auxv_entry(AT_L1D_CACHEGEOMETRY, buffer);
84 if (p) {
85 found++;
86 print_geo("L1D ", (uint32_t)p->a_un.a_val);
87 }
88
89 p = find_auxv_entry(AT_L2_CACHESIZE, buffer);
90 if (p) {
91 found++;
92 print_size("L2 ", (uint32_t)p->a_un.a_val);
93 }
94
95 p = find_auxv_entry(AT_L2_CACHEGEOMETRY, buffer);
96 if (p) {
97 found++;
98 print_geo("L2 ", (uint32_t)p->a_un.a_val);
99 }
100
101 p = find_auxv_entry(AT_L3_CACHESIZE, buffer);
102 if (p) {
103 found++;
104 print_size("L3 ", (uint32_t)p->a_un.a_val);
105 }
106
107 p = find_auxv_entry(AT_L3_CACHEGEOMETRY, buffer);
108 if (p) {
109 found++;
110 print_geo("L3 ", (uint32_t)p->a_un.a_val);
111 }
112
113 /* If we found none we're probably on a system where they don't exist */
114 SKIP_IF(found == 0);
115
116 /* But if we found any, we expect to find them all */
117 FAIL_IF(found != 8);
118
119 return 0;
120}
121
122int main(void)
123{
124 return test_harness(test_cache_shape, "cache_shape");
125}
diff --git a/tools/testing/selftests/powerpc/include/utils.h b/tools/testing/selftests/powerpc/include/utils.h
index 53405e8a52ab..735815b3ad7f 100644
--- a/tools/testing/selftests/powerpc/include/utils.h
+++ b/tools/testing/selftests/powerpc/include/utils.h
@@ -24,7 +24,11 @@ typedef uint8_t u8;
24 24
25void test_harness_set_timeout(uint64_t time); 25void test_harness_set_timeout(uint64_t time);
26int test_harness(int (test_function)(void), char *name); 26int test_harness(int (test_function)(void), char *name);
27extern void *get_auxv_entry(int type); 27
28int read_auxv(char *buf, ssize_t buf_size);
29void *find_auxv_entry(int type, char *auxv);
30void *get_auxv_entry(int type);
31
28int pick_online_cpu(void); 32int pick_online_cpu(void);
29 33
30static inline bool have_hwcap(unsigned long ftr) 34static inline bool have_hwcap(unsigned long ftr)
diff --git a/tools/testing/selftests/powerpc/utils.c b/tools/testing/selftests/powerpc/utils.c
index dcf74184bfd0..d46916867a6f 100644
--- a/tools/testing/selftests/powerpc/utils.c
+++ b/tools/testing/selftests/powerpc/utils.c
@@ -19,45 +19,64 @@
19 19
20static char auxv[4096]; 20static char auxv[4096];
21 21
22void *get_auxv_entry(int type) 22int read_auxv(char *buf, ssize_t buf_size)
23{ 23{
24 ElfW(auxv_t) *p;
25 void *result;
26 ssize_t num; 24 ssize_t num;
27 int fd; 25 int rc, fd;
28 26
29 fd = open("/proc/self/auxv", O_RDONLY); 27 fd = open("/proc/self/auxv", O_RDONLY);
30 if (fd == -1) { 28 if (fd == -1) {
31 perror("open"); 29 perror("open");
32 return NULL; 30 return -errno;
33 } 31 }
34 32
35 result = NULL; 33 num = read(fd, buf, buf_size);
36
37 num = read(fd, auxv, sizeof(auxv));
38 if (num < 0) { 34 if (num < 0) {
39 perror("read"); 35 perror("read");
36 rc = -EIO;
40 goto out; 37 goto out;
41 } 38 }
42 39
43 if (num > sizeof(auxv)) { 40 if (num > buf_size) {
44 printf("Overflowed auxv buffer\n"); 41 printf("overflowed auxv buffer\n");
42 rc = -EOVERFLOW;
45 goto out; 43 goto out;
46 } 44 }
47 45
46 rc = 0;
47out:
48 close(fd);
49 return rc;
50}
51
52void *find_auxv_entry(int type, char *auxv)
53{
54 ElfW(auxv_t) *p;
55
48 p = (ElfW(auxv_t) *)auxv; 56 p = (ElfW(auxv_t) *)auxv;
49 57
50 while (p->a_type != AT_NULL) { 58 while (p->a_type != AT_NULL) {
51 if (p->a_type == type) { 59 if (p->a_type == type)
52 result = (void *)p->a_un.a_val; 60 return p;
53 break;
54 }
55 61
56 p++; 62 p++;
57 } 63 }
58out: 64
59 close(fd); 65 return NULL;
60 return result; 66}
67
68void *get_auxv_entry(int type)
69{
70 ElfW(auxv_t) *p;
71
72 if (read_auxv(auxv, sizeof(auxv)))
73 return NULL;
74
75 p = find_auxv_entry(type, auxv);
76 if (p)
77 return (void *)p->a_un.a_val;
78
79 return NULL;
61} 80}
62 81
63int pick_online_cpu(void) 82int pick_online_cpu(void)