author	Linus Torvalds <torvalds@linux-foundation.org>	2019-09-20 14:48:06 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2019-09-20 14:48:06 -0400
commit	45824fc0da6e46cc5d563105e1eaaf3098a686f9 (patch)
tree	8e57c1f18104ed5f0d74d9eed9dc0365b3c137b8
parent	8c2b418c3f95a488f5226870eee68574d323f0f8 (diff)
parent	d9101bfa6adc831bda8836c4d774820553c14942 (diff)
Merge tag 'powerpc-5.4-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux
Pull powerpc updates from Michael Ellerman:
 "This is a bit late, partly due to me travelling, and partly due to a
  power outage knocking out some of my test systems *while* I was
  travelling.

   - Initial support for running on a system with an Ultravisor, which
     is software that runs below the hypervisor and protects guests
     against some attacks by the hypervisor.

   - Support for building the kernel to run as a "Secure Virtual
     Machine", ie. as a guest capable of running on a system with an
     Ultravisor.

   - Some changes to our DMA code on bare metal, to allow devices with
     medium sized DMA masks (> 32 && < 59 bits) to use more than 2GB of
     DMA space.

   - Support for firmware assisted crash dumps on bare metal (powernv).

   - Two series fixing bugs in and refactoring our PCI EEH code.

   - A large series refactoring our exception entry code to use gas
     macros, both to make it more readable and also enable some future
     optimisations.

  As well as many cleanups and other minor features & fixups.

  Thanks to: Adam Zerella, Alexey Kardashevskiy, Alistair Popple, Andrew
  Donnellan, Aneesh Kumar K.V, Anju T Sudhakar, Anshuman Khandual,
  Balbir Singh, Benjamin Herrenschmidt, Cédric Le Goater, Christophe
  JAILLET, Christophe Leroy, Christopher M. Riedl, Christoph Hellwig,
  Claudio Carvalho, Daniel Axtens, David Gibson, David Hildenbrand,
  Desnes A. Nunes do Rosario, Ganesh Goudar, Gautham R. Shenoy, Greg
  Kurz, Guerney Hunt, Gustavo Romero, Halil Pasic, Hari Bathini, Joakim
  Tjernlund, Jonathan Neuschafer, Jordan Niethe, Leonardo Bras, Lianbo
  Jiang, Madhavan Srinivasan, Mahesh Salgaonkar, Mahesh Salgaonkar,
  Masahiro Yamada, Maxiwell S. Garcia, Michael Anderson, Nathan
  Chancellor, Nathan Lynch, Naveen N. Rao, Nicholas Piggin, Oliver
  O'Halloran, Qian Cai, Ram Pai, Ravi Bangoria, Reza Arbab, Ryan Grimm,
  Sam Bobroff, Santosh Sivaraj, Segher Boessenkool, Sukadev
  Bhattiprolu, Thiago Bauermann, Thiago Jung Bauermann, Thomas
  Gleixner, Tom Lendacky, Vasant Hegde"

* tag 'powerpc-5.4-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux: (264 commits)
  powerpc/mm/mce: Keep irqs disabled during lockless page table walk
  powerpc: Use ftrace_graph_ret_addr() when unwinding
  powerpc/ftrace: Enable HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
  ftrace: Look up the address of return_to_handler() using helpers
  powerpc: dump kernel log before carrying out fadump or kdump
  docs: powerpc: Add missing documentation reference
  powerpc/xmon: Fix output of XIVE IPI
  powerpc/xmon: Improve output of XIVE interrupts
  powerpc/mm/radix: remove useless kernel messages
  powerpc/fadump: support holes in kernel boot memory area
  powerpc/fadump: remove RMA_START and RMA_END macros
  powerpc/fadump: update documentation about option to release opalcore
  powerpc/fadump: consider f/w load area
  powerpc/opalcore: provide an option to invalidate /sys/firmware/opal/core file
  powerpc/opalcore: export /sys/firmware/opal/core for analysing opal crashes
  powerpc/fadump: update documentation about CONFIG_PRESERVE_FA_DUMP
  powerpc/fadump: add support to preserve crash data on FADUMP disabled kernel
  powerpc/fadump: improve how crashed kernel's memory is reserved
  powerpc/fadump: consider reserved ranges while releasing memory
  powerpc/fadump: make crash memory ranges array allocation generic
  ...
-rw-r--r--Documentation/ABI/testing/sysfs-devices-system-cpu10
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt28
-rw-r--r--Documentation/powerpc/elfnote.rst41
-rw-r--r--Documentation/powerpc/firmware-assisted-dump.rst220
-rw-r--r--Documentation/powerpc/index.rst2
-rw-r--r--Documentation/powerpc/ultravisor.rst1054
-rw-r--r--arch/Kconfig3
-rw-r--r--arch/powerpc/Kconfig43
-rw-r--r--arch/powerpc/Makefile1
-rw-r--r--arch/powerpc/boot/main.c41
-rw-r--r--arch/powerpc/boot/ops.h2
-rwxr-xr-xarch/powerpc/boot/wrapper24
-rw-r--r--arch/powerpc/boot/zImage.lds.S8
-rw-r--r--arch/powerpc/configs/pmac32_defconfig1
-rw-r--r--arch/powerpc/configs/powernv_defconfig2
-rw-r--r--arch/powerpc/configs/ppc40x_defconfig1
-rw-r--r--arch/powerpc/configs/ppc64_defconfig1
-rw-r--r--arch/powerpc/configs/pseries_defconfig1
-rw-r--r--arch/powerpc/configs/skiroot_defconfig1
-rw-r--r--arch/powerpc/include/asm/asm-prototypes.h14
-rw-r--r--arch/powerpc/include/asm/book3s/32/pgtable.h18
-rw-r--r--arch/powerpc/include/asm/book3s/64/mmu.h4
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgtable.h8
-rw-r--r--arch/powerpc/include/asm/book3s/64/radix.h3
-rw-r--r--arch/powerpc/include/asm/book3s/64/tlbflush-radix.h12
-rw-r--r--arch/powerpc/include/asm/book3s/64/tlbflush.h9
-rw-r--r--arch/powerpc/include/asm/book3s/pgtable.h11
-rw-r--r--arch/powerpc/include/asm/bug.h8
-rw-r--r--arch/powerpc/include/asm/cputable.h20
-rw-r--r--arch/powerpc/include/asm/current.h3
-rw-r--r--arch/powerpc/include/asm/eeh.h40
-rw-r--r--arch/powerpc/include/asm/elfnote.h24
-rw-r--r--arch/powerpc/include/asm/fadump-internal.h169
-rw-r--r--arch/powerpc/include/asm/fadump.h194
-rw-r--r--arch/powerpc/include/asm/firmware.h5
-rw-r--r--arch/powerpc/include/asm/ftrace.h2
-rw-r--r--arch/powerpc/include/asm/futex.h3
-rw-r--r--arch/powerpc/include/asm/head-64.h41
-rw-r--r--arch/powerpc/include/asm/hugetlb.h3
-rw-r--r--arch/powerpc/include/asm/io-workarounds.h20
-rw-r--r--arch/powerpc/include/asm/io.h16
-rw-r--r--arch/powerpc/include/asm/iommu.h28
-rw-r--r--arch/powerpc/include/asm/kvm_host.h1
-rw-r--r--arch/powerpc/include/asm/machdep.h7
-rw-r--r--arch/powerpc/include/asm/mce.h10
-rw-r--r--arch/powerpc/include/asm/mem_encrypt.h26
-rw-r--r--arch/powerpc/include/asm/mmu.h2
-rw-r--r--arch/powerpc/include/asm/nohash/32/pgtable.h18
-rw-r--r--arch/powerpc/include/asm/nohash/64/pgtable.h1
-rw-r--r--arch/powerpc/include/asm/nohash/pgtable.h13
-rw-r--r--arch/powerpc/include/asm/opal-api.h45
-rw-r--r--arch/powerpc/include/asm/opal.h7
-rw-r--r--arch/powerpc/include/asm/page.h14
-rw-r--r--arch/powerpc/include/asm/page_32.h4
-rw-r--r--arch/powerpc/include/asm/pci-bridge.h1
-rw-r--r--arch/powerpc/include/asm/pgtable.h14
-rw-r--r--arch/powerpc/include/asm/plpar_wrappers.h6
-rw-r--r--arch/powerpc/include/asm/ppc-pci.h7
-rw-r--r--arch/powerpc/include/asm/ppc4xx_ocm.h31
-rw-r--r--arch/powerpc/include/asm/ppc_asm.h80
-rw-r--r--arch/powerpc/include/asm/ptrace.h6
-rw-r--r--arch/powerpc/include/asm/reg.h3
-rw-r--r--arch/powerpc/include/asm/scom.h154
-rw-r--r--arch/powerpc/include/asm/sections.h11
-rw-r--r--arch/powerpc/include/asm/setjmp.h4
-rw-r--r--arch/powerpc/include/asm/spinlock.h62
-rw-r--r--arch/powerpc/include/asm/string.h2
-rw-r--r--arch/powerpc/include/asm/svm.h31
-rw-r--r--arch/powerpc/include/asm/time.h6
-rw-r--r--arch/powerpc/include/asm/timex.h34
-rw-r--r--arch/powerpc/include/asm/uaccess.h14
-rw-r--r--arch/powerpc/include/asm/ultravisor-api.h33
-rw-r--r--arch/powerpc/include/asm/ultravisor.h49
-rw-r--r--arch/powerpc/include/asm/xive.h1
-rw-r--r--arch/powerpc/kernel/.gitignore1
-rw-r--r--arch/powerpc/kernel/Makefile23
-rw-r--r--arch/powerpc/kernel/asm-offsets.c1
-rw-r--r--arch/powerpc/kernel/cputable.c6
-rw-r--r--arch/powerpc/kernel/dma-iommu.c11
-rw-r--r--arch/powerpc/kernel/eeh.c281
-rw-r--r--arch/powerpc/kernel/eeh_cache.c37
-rw-r--r--arch/powerpc/kernel/eeh_dev.c2
-rw-r--r--arch/powerpc/kernel/eeh_driver.c280
-rw-r--r--arch/powerpc/kernel/eeh_event.c34
-rw-r--r--arch/powerpc/kernel/eeh_pe.c145
-rw-r--r--arch/powerpc/kernel/entry_32.S40
-rw-r--r--arch/powerpc/kernel/entry_64.S21
-rw-r--r--arch/powerpc/kernel/exceptions-64e.S22
-rw-r--r--arch/powerpc/kernel/exceptions-64s.S1634
-rw-r--r--arch/powerpc/kernel/fadump.c1340
-rw-r--r--arch/powerpc/kernel/head_32.S55
-rw-r--r--arch/powerpc/kernel/head_32.h21
-rw-r--r--arch/powerpc/kernel/head_64.S8
-rw-r--r--arch/powerpc/kernel/head_8xx.S28
-rw-r--r--arch/powerpc/kernel/hw_breakpoint.c77
-rw-r--r--arch/powerpc/kernel/io-workarounds.c13
-rw-r--r--arch/powerpc/kernel/iommu.c97
-rw-r--r--arch/powerpc/kernel/kvm.c58
-rw-r--r--arch/powerpc/kernel/kvm_emul.S16
-rw-r--r--arch/powerpc/kernel/machine_kexec_64.c9
-rw-r--r--arch/powerpc/kernel/mce.c71
-rw-r--r--arch/powerpc/kernel/mce_power.c50
-rw-r--r--arch/powerpc/kernel/misc_32.S36
-rw-r--r--arch/powerpc/kernel/note.S40
-rw-r--r--arch/powerpc/kernel/paca.c52
-rw-r--r--arch/powerpc/kernel/pci-common.c4
-rw-r--r--arch/powerpc/kernel/pci-hotplug.c7
-rw-r--r--arch/powerpc/kernel/pci_32.c4
-rw-r--r--arch/powerpc/kernel/pci_64.c12
-rw-r--r--arch/powerpc/kernel/pci_dn.c21
-rw-r--r--arch/powerpc/kernel/pci_of_scan.c66
-rw-r--r--arch/powerpc/kernel/process.c28
-rw-r--r--arch/powerpc/kernel/prom.c8
-rw-r--r--arch/powerpc/kernel/prom_init.c98
-rw-r--r--arch/powerpc/kernel/rtas.c15
-rw-r--r--arch/powerpc/kernel/security.c19
-rw-r--r--arch/powerpc/kernel/setup-common.c8
-rw-r--r--arch/powerpc/kernel/setup_32.c2
-rw-r--r--arch/powerpc/kernel/stacktrace.c2
-rw-r--r--arch/powerpc/kernel/sysfs.c20
-rw-r--r--arch/powerpc/kernel/trace/ftrace.c5
-rw-r--r--arch/powerpc/kernel/trace/ftrace_32.S1
-rw-r--r--arch/powerpc/kernel/trace/ftrace_64_mprofile.S1
-rw-r--r--arch/powerpc/kernel/trace/ftrace_64_pg.S1
-rw-r--r--arch/powerpc/kernel/traps.c1
-rw-r--r--arch/powerpc/kernel/ucall.S14
-rw-r--r--arch/powerpc/kernel/vdso.c22
-rw-r--r--arch/powerpc/kernel/vdso32/datapage.S2
-rw-r--r--arch/powerpc/kernel/vdso32/vdso32.lds.S4
-rw-r--r--arch/powerpc/kvm/book3s_64_vio.c29
-rw-r--r--arch/powerpc/kvm/book3s_64_vio_hv.c38
-rw-r--r--arch/powerpc/kvm/book3s_hv.c6
-rw-r--r--arch/powerpc/kvm/book3s_hv_nested.c4
-rw-r--r--arch/powerpc/kvm/book3s_hv_rmhandlers.S39
-rw-r--r--arch/powerpc/lib/Makefile4
-rw-r--r--arch/powerpc/lib/locks.c6
-rw-r--r--arch/powerpc/lib/memcpy_mcsafe_64.S242
-rw-r--r--arch/powerpc/mm/Makefile2
-rw-r--r--arch/powerpc/mm/book3s32/mmu.c60
-rw-r--r--arch/powerpc/mm/book3s64/hash_utils.c88
-rw-r--r--arch/powerpc/mm/book3s64/pgtable.c121
-rw-r--r--arch/powerpc/mm/book3s64/radix_pgtable.c80
-rw-r--r--arch/powerpc/mm/book3s64/radix_tlb.c303
-rw-r--r--arch/powerpc/mm/dma-noncoherent.c312
-rw-r--r--arch/powerpc/mm/ioremap.c99
-rw-r--r--arch/powerpc/mm/ioremap_32.c92
-rw-r--r--arch/powerpc/mm/ioremap_64.c113
-rw-r--r--arch/powerpc/mm/kasan/kasan_init_32.c23
-rw-r--r--arch/powerpc/mm/mem.c66
-rw-r--r--arch/powerpc/mm/mmu_decl.h7
-rw-r--r--arch/powerpc/mm/nohash/book3e_hugetlbpage.c16
-rw-r--r--arch/powerpc/mm/nohash/tlb.c2
-rw-r--r--arch/powerpc/mm/pgtable_32.c155
-rw-r--r--arch/powerpc/mm/pgtable_64.c203
-rw-r--r--arch/powerpc/mm/ptdump/bats.c2
-rw-r--r--arch/powerpc/mm/ptdump/hashpagetable.c24
-rw-r--r--arch/powerpc/mm/ptdump/ptdump.c37
-rw-r--r--arch/powerpc/perf/imc-pmu.c29
-rw-r--r--arch/powerpc/platforms/44x/Kconfig8
-rw-r--r--arch/powerpc/platforms/4xx/Makefile1
-rw-r--r--arch/powerpc/platforms/4xx/ocm.c390
-rw-r--r--arch/powerpc/platforms/Kconfig3
-rw-r--r--arch/powerpc/platforms/Kconfig.cputype16
-rw-r--r--arch/powerpc/platforms/cell/iommu.c2
-rw-r--r--arch/powerpc/platforms/pasemi/iommu.c2
-rw-r--r--arch/powerpc/platforms/powernv/Kconfig5
-rw-r--r--arch/powerpc/platforms/powernv/Makefile6
-rw-r--r--arch/powerpc/platforms/powernv/eeh-powernv.c97
-rw-r--r--arch/powerpc/platforms/powernv/idle.c6
-rw-r--r--arch/powerpc/platforms/powernv/npu-dma.c101
-rw-r--r--arch/powerpc/platforms/powernv/opal-call.c5
-rw-r--r--arch/powerpc/platforms/powernv/opal-core.c636
-rw-r--r--arch/powerpc/platforms/powernv/opal-fadump.c716
-rw-r--r--arch/powerpc/platforms/powernv/opal-fadump.h146
-rw-r--r--arch/powerpc/platforms/powernv/opal-imc.c12
-rw-r--r--arch/powerpc/platforms/powernv/opal-msglog.c57
-rw-r--r--arch/powerpc/platforms/powernv/opal-prd.c8
-rw-r--r--arch/powerpc/platforms/powernv/opal-xscom.c213
-rw-r--r--arch/powerpc/platforms/powernv/opal.c42
-rw-r--r--arch/powerpc/platforms/powernv/pci-ioda-tce.c38
-rw-r--r--arch/powerpc/platforms/powernv/pci-ioda.c98
-rw-r--r--arch/powerpc/platforms/powernv/pci.c3
-rw-r--r--arch/powerpc/platforms/powernv/pci.h2
-rw-r--r--arch/powerpc/platforms/powernv/powernv.h5
-rw-r--r--arch/powerpc/platforms/powernv/setup.c9
-rw-r--r--arch/powerpc/platforms/powernv/ultravisor.c69
-rw-r--r--arch/powerpc/platforms/ps3/spu.c10
-rw-r--r--arch/powerpc/platforms/pseries/Kconfig14
-rw-r--r--arch/powerpc/platforms/pseries/Makefile2
-rw-r--r--arch/powerpc/platforms/pseries/eeh_pseries.c68
-rw-r--r--arch/powerpc/platforms/pseries/hotplug-memory.c26
-rw-r--r--arch/powerpc/platforms/pseries/iommu.c24
-rw-r--r--arch/powerpc/platforms/pseries/lpar.c20
-rw-r--r--arch/powerpc/platforms/pseries/mobility.c9
-rw-r--r--arch/powerpc/platforms/pseries/pci.c3
-rw-r--r--arch/powerpc/platforms/pseries/ras.c460
-rw-r--r--arch/powerpc/platforms/pseries/rtas-fadump.c550
-rw-r--r--arch/powerpc/platforms/pseries/rtas-fadump.h114
-rw-r--r--arch/powerpc/platforms/pseries/setup.c32
-rw-r--r--arch/powerpc/platforms/pseries/smp.c3
-rw-r--r--arch/powerpc/platforms/pseries/svm.c85
-rw-r--r--arch/powerpc/platforms/pseries/vio.c2
-rw-r--r--arch/powerpc/sysdev/Kconfig7
-rw-r--r--arch/powerpc/sysdev/Makefile2
-rw-r--r--arch/powerpc/sysdev/dart_iommu.c2
-rw-r--r--arch/powerpc/sysdev/scom.c223
-rw-r--r--arch/powerpc/sysdev/xive/common.c59
-rw-r--r--arch/powerpc/sysdev/xive/native.c26
-rw-r--r--arch/powerpc/sysdev/xive/spapr.c57
-rw-r--r--arch/powerpc/sysdev/xive/xive-internal.h2
-rw-r--r--arch/powerpc/xmon/xmon.c51
-rw-r--r--arch/s390/Kconfig4
-rw-r--r--arch/s390/include/asm/mem_encrypt.h5
-rw-r--r--arch/s390/mm/init.c7
-rw-r--r--arch/x86/Kconfig4
-rw-r--r--arch/x86/include/asm/mem_encrypt.h10
-rw-r--r--arch/x86/kernel/crash_dump_64.c5
-rw-r--r--arch/x86/mm/mem_encrypt.c2
-rw-r--r--drivers/misc/cxl/main.c4
-rw-r--r--drivers/misc/ocxl/main.c4
-rw-r--r--drivers/pci/hotplug/pnv_php.c59
-rw-r--r--drivers/pci/hotplug/rpaphp_core.c18
-rw-r--r--drivers/vfio/vfio_iommu_spapr_tce.c18
-rw-r--r--fs/proc/vmcore.c8
-rw-r--r--include/linux/crash_dump.h14
-rw-r--r--include/linux/extable.h2
-rw-r--r--include/linux/mem_encrypt.h15
-rw-r--r--kernel/dma/mapping.c8
-rw-r--r--kernel/dma/swiotlb.c3
-rw-r--r--kernel/extable.c11
-rw-r--r--kernel/trace/fgraph.c4
-rw-r--r--tools/testing/selftests/powerpc/Makefile1
-rw-r--r--tools/testing/selftests/powerpc/copyloops/.gitignore9
-rw-r--r--tools/testing/selftests/powerpc/copyloops/Makefile7
-rw-r--r--tools/testing/selftests/powerpc/copyloops/asm/export.h1
l---------tools/testing/selftests/powerpc/copyloops/memcpy_mcsafe_64.S1
-rw-r--r--tools/testing/selftests/powerpc/eeh/Makefile9
-rwxr-xr-xtools/testing/selftests/powerpc/eeh/eeh-basic.sh82
-rwxr-xr-xtools/testing/selftests/powerpc/eeh/eeh-functions.sh76
-rw-r--r--tools/testing/selftests/powerpc/ptrace/.gitignore3
-rw-r--r--tools/testing/selftests/powerpc/security/.gitignore1
-rw-r--r--tools/testing/selftests/powerpc/stringloops/.gitignore5
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-signal-context-chk-fpu.c49
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-signal-context-chk-gpr.c59
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vmx.c74
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vsx.c130
-rw-r--r--tools/testing/selftests/powerpc/tm/tm.h3
247 files changed, 9731 insertions, 5443 deletions
diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index 5f7d7b14fa44..06d0931119cc 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -562,3 +562,13 @@ Description: Umwait control
 		or C0.2 state. The time is an unsigned 32-bit number.
 		Note that a value of zero means there is no limit.
 		Low order two bits must be zero.
+
+What:		/sys/devices/system/cpu/svm
+Date:		August 2019
+Contact:	Linux kernel mailing list <linux-kernel@vger.kernel.org>
+		Linux for PowerPC mailing list <linuxppc-dev@ozlabs.org>
+Description:	Secure Virtual Machine
+
+		If 1, it means the system is using the Protected Execution
+		Facility in POWER9 and newer processors. i.e., it is a Secure
+		Virtual Machine.
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 782e9072407b..d3814789304f 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -860,6 +860,10 @@
 	disable_radix	[PPC]
 			Disable RADIX MMU mode on POWER9
 
+	disable_tlbie	[PPC]
+			Disable TLBIE instruction. Currently does not work
+			with KVM, with HASH MMU, or with coherent accelerators.
+
 	disable_cpu_apicid= [X86,APIC,SMP]
 			Format: <int>
 			The number of initial APIC ID for the
@@ -4641,6 +4645,11 @@
 			/sys/power/pm_test). Only available when CONFIG_PM_DEBUG
 			is set. Default value is 5.
 
+	svm=		[PPC]
+			Format: { on | off | y | n | 1 | 0 }
+			This parameter controls use of the Protected
+			Execution Facility on pSeries.
+
 	swapaccount=[0|1]
 			[KNL] Enable accounting of swap in memory resource
 			controller if no parameter or 1 is given or disable
@@ -5326,3 +5335,22 @@
 			A hex value specifying bitmask with supplemental xhci
 			host controller quirks. Meaning of each bit can be
 			consulted in header drivers/usb/host/xhci.h.
+
+	xmon		[PPC]
+			Format: { early | on | rw | ro | off }
+			Controls if xmon debugger is enabled. Default is off.
+			Passing only "xmon" is equivalent to "xmon=early".
+			early	Call xmon as early as possible on boot; xmon
+				debugger is called from setup_arch().
+			on	xmon debugger hooks will be installed so xmon
+				is only called on a kernel crash. Default mode,
+				i.e. either "ro" or "rw" mode, is controlled
+				with CONFIG_XMON_DEFAULT_RO_MODE.
+			rw	xmon debugger hooks will be installed so xmon
+				is called only on a kernel crash, mode is write,
+				meaning SPR registers, memory and, other data
+				can be written using xmon commands.
+			ro	same as "rw" option above but SPR registers,
+				memory, and other data can't be written using
+				xmon commands.
+			off	xmon is disabled.
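
Purely as an illustration (not part of the patch): these are ordinary boot options, so a pSeries guest intended to run as a Secure Virtual Machine with the read-only debugger enabled might boot with something like "svm=on xmon=ro" appended to its command line, and a system avoiding the TLBIE instruction might add "disable_tlbie", subject to the KVM/HASH MMU/accelerator restrictions noted above.
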
diff --git a/Documentation/powerpc/elfnote.rst b/Documentation/powerpc/elfnote.rst
new file mode 100644
index 000000000000..06602248621c
--- /dev/null
+++ b/Documentation/powerpc/elfnote.rst
@@ -0,0 +1,41 @@
+==========================
+ELF Note PowerPC Namespace
+==========================
+
+The PowerPC namespace in an ELF Note of the kernel binary is used to store
+capabilities and information which can be used by a bootloader or userland.
+
+Types and Descriptors
+---------------------
+
+The types to be used with the "PowerPC" namesapce are defined in [#f1]_.
+
+	1) PPC_ELFNOTE_CAPABILITIES
+
+Define the capabilities supported/required by the kernel. This type uses a
+bitmap as "descriptor" field. Each bit is described below:
+
+- Ultravisor-capable bit (PowerNV only).
+
+.. code-block:: c
+
+	#define PPCCAP_ULTRAVISOR_BIT	(1 << 0)
+
+Indicate that the powerpc kernel binary knows how to run in an
+ultravisor-enabled system.
+
+In an ultravisor-enabled system, some machine resources are now controlled
+by the ultravisor. If the kernel is not ultravisor-capable, but it ends up
+being run on a machine with ultravisor, the kernel will probably crash
+trying to access ultravisor resources. For instance, it may crash in early
+boot trying to set the partition table entry 0.
+
+In an ultravisor-enabled system, a bootloader could warn the user or prevent
+the kernel from being run if the PowerPC ultravisor capability doesn't exist
+or the Ultravisor-capable bit is not set.
+
+References
+----------
+
+.. [#f1] arch/powerpc/include/asm/elfnote.h
+
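
As a rough sketch of the capability check this note enables (illustrative, not code from the patch), a bootloader that has already located the descriptor word of a PPC_ELFNOTE_CAPABILITIES note in the "PowerPC" namespace could test the bit as below; finding and parsing the note itself is elided::

	#include <stdbool.h>
	#include <stdint.h>

	/* Bit defined by this patch in arch/powerpc/include/asm/elfnote.h */
	#define PPCCAP_ULTRAVISOR_BIT	(1 << 0)

	/* Hypothetical bootloader-side helper: takes the capabilities
	 * bitmap read from the note's descriptor field. */
	static bool kernel_is_ultravisor_capable(uint32_t cap_bitmap)
	{
		return (cap_bitmap & PPCCAP_ULTRAVISOR_BIT) != 0;
	}
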
diff --git a/Documentation/powerpc/firmware-assisted-dump.rst b/Documentation/powerpc/firmware-assisted-dump.rst
index 9ca12830a48e..0455a78486d5 100644
--- a/Documentation/powerpc/firmware-assisted-dump.rst
+++ b/Documentation/powerpc/firmware-assisted-dump.rst
@@ -9,18 +9,18 @@ a crashed system, and to do so from a fully-reset system, and
 to minimize the total elapsed time until the system is back
 in production use.
 
-- Firmware assisted dump (fadump) infrastructure is intended to replace
+- Firmware-Assisted Dump (FADump) infrastructure is intended to replace
   the existing phyp assisted dump.
 - Fadump uses the same firmware interfaces and memory reservation model
   as phyp assisted dump.
-- Unlike phyp dump, fadump exports the memory dump through /proc/vmcore
+- Unlike phyp dump, FADump exports the memory dump through /proc/vmcore
   in the ELF format in the same way as kdump. This helps us reuse the
   kdump infrastructure for dump capture and filtering.
 - Unlike phyp dump, userspace tool does not need to refer any sysfs
   interface while reading /proc/vmcore.
-- Unlike phyp dump, fadump allows user to release all the memory reserved
+- Unlike phyp dump, FADump allows user to release all the memory reserved
   for dump, with a single operation of echo 1 > /sys/kernel/fadump_release_mem.
-- Once enabled through kernel boot parameter, fadump can be
+- Once enabled through kernel boot parameter, FADump can be
   started/stopped through /sys/kernel/fadump_registered interface (see
   sysfs files section below) and can be easily integrated with kdump
   service start/stop init scripts.
@@ -34,7 +34,7 @@ dump offers several strong, practical advantages:
   in a clean, consistent state.
 - Once the dump is copied out, the memory that held the dump
   is immediately available to the running kernel. And therefore,
-  unlike kdump, fadump doesn't need a 2nd reboot to get back
+  unlike kdump, FADump doesn't need a 2nd reboot to get back
   the system to the production configuration.
 
 The above can only be accomplished by coordination with,
@@ -46,10 +46,9 @@ as follows:
   These registered sections of memory are reserved by the first
   kernel during early boot.
 
-- When a system crashes, the Power firmware will save
-  the low memory (boot memory of size larger of 5% of system RAM
-  or 256MB) of RAM to the previous registered region. It will
-  also save system registers, and hardware PTE's.
+- When system crashes, the Power firmware will copy the registered
+  low memory regions (boot memory) from source to destination area.
+  It will also save hardware PTE's.
 
   NOTE:
       The term 'boot memory' means size of the low memory chunk
@@ -61,9 +60,9 @@ as follows:
       the default calculated size. Use this option if default
       boot memory size is not sufficient for second kernel to
       boot successfully. For syntax of crashkernel= parameter,
-      refer to Documentation/admin-guide/kdump/kdump.rst. If any offset is
-      provided in crashkernel= parameter, it will be ignored
-      as fadump uses a predefined offset to reserve memory
+      refer to Documentation/admin-guide/kdump/kdump.rst. If any
+      offset is provided in crashkernel= parameter, it will be
+      ignored as FADump uses a predefined offset to reserve memory
       for boot memory dump preservation in case of a crash.
 
 - After the low memory (boot memory) area has been saved, the
@@ -71,13 +70,15 @@ as follows:
   *not* clear the RAM. It will then launch the bootloader, as
   normal.
 
-- The freshly booted kernel will notice that there is a new
-  node (ibm,dump-kernel) in the device tree, indicating that
+- The freshly booted kernel will notice that there is a new node
+  (rtas/ibm,kernel-dump on pSeries or ibm,opal/dump/mpipl-boot
+  on OPAL platform) in the device tree, indicating that
   there is crash data available from a previous boot. During
   the early boot OS will reserve rest of the memory above
   boot memory size effectively booting with restricted memory
-  size. This will make sure that the second kernel will not
-  touch any of the dump memory area.
+  size. This will make sure that this kernel (also, referred
+  to as second kernel or capture kernel) will not touch any
+  of the dump memory area.
 
 - User-space tools will read /proc/vmcore to obtain the contents
   of memory, which holds the previous crashed kernel dump in ELF
@@ -94,8 +95,30 @@ as follows:
     # echo 1 > /sys/kernel/fadump_release_mem
 
 Please note that the firmware-assisted dump feature
-is only available on Power6 and above systems with recent
-firmware versions.
+is only available on POWER6 and above systems on pSeries
+(PowerVM) platform and POWER9 and above systems with OP940
+or later firmware versions on PowerNV (OPAL) platform.
+Note that, OPAL firmware exports ibm,opal/dump node when
+FADump is supported on PowerNV platform.
+
+On OPAL based machines, system first boots into an intermittent
+kernel (referred to as petitboot kernel) before booting into the
+capture kernel. This kernel would have minimal kernel and/or
+userspace support to process crash data. Such kernel needs to
+preserve previously crash'ed kernel's memory for the subsequent
+capture kernel boot to process this crash data. Kernel config
+option CONFIG_PRESERVE_FA_DUMP has to be enabled on such kernel
+to ensure that crash data is preserved to process later.
+
+-- On OPAL based machines (PowerNV), if the kernel is build with
+   CONFIG_OPAL_CORE=y, OPAL memory at the time of crash is also
+   exported as /sys/firmware/opal/core file. This procfs file is
+   helpful in debugging OPAL crashes with GDB. The kernel memory
+   used for exporting this procfs file can be released by echo'ing
+   '1' to /sys/kernel/fadump_release_opalcore node.
+
+   e.g.
+     # echo 1 > /sys/kernel/fadump_release_opalcore
 
 Implementation details:
 -----------------------
@@ -110,72 +133,95 @@ that are run. If there is dump data, then the
 /sys/kernel/fadump_release_mem file is created, and the reserved
 memory is held.
 
-If there is no waiting dump data, then only the memory required
-to hold CPU state, HPTE region, boot memory dump and elfcore
-header, is usually reserved at an offset greater than boot memory
-size (see Fig. 1). This area is *not* released: this region will
-be kept permanently reserved, so that it can act as a receptacle
-for a copy of the boot memory content in addition to CPU state
-and HPTE region, in the case a crash does occur. Since this reserved
-memory area is used only after the system crash, there is no point in
-blocking this significant chunk of memory from production kernel.
-Hence, the implementation uses the Linux kernel's Contiguous Memory
-Allocator (CMA) for memory reservation if CMA is configured for kernel.
-With CMA reservation this memory will be available for applications to
-use it, while kernel is prevented from using it. With this fadump will
-still be able to capture all of the kernel memory and most of the user
-space memory except the user pages that were present in CMA region::
+If there is no waiting dump data, then only the memory required to
+hold CPU state, HPTE region, boot memory dump, FADump header and
+elfcore header, is usually reserved at an offset greater than boot
+memory size (see Fig. 1). This area is *not* released: this region
+will be kept permanently reserved, so that it can act as a receptacle
+for a copy of the boot memory content in addition to CPU state and
+HPTE region, in the case a crash does occur.
+
+Since this reserved memory area is used only after the system crash,
+there is no point in blocking this significant chunk of memory from
+production kernel. Hence, the implementation uses the Linux kernel's
+Contiguous Memory Allocator (CMA) for memory reservation if CMA is
+configured for kernel. With CMA reservation this memory will be
+available for applications to use it, while kernel is prevented from
+using it. With this FADump will still be able to capture all of the
+kernel memory and most of the user space memory except the user pages
+that were present in CMA region::
 
   o Memory Reservation during first kernel
 
-  Low memory                                         Top of memory
-  0      boot memory size                                       |
-  |           |                |<--Reserved dump area -->|      |
-  V           V                |   Permanent Reservation |      V
-  +-----------+----------/ /---+---+----+-----------+----+------+
-  |           |                |CPU|HPTE|  DUMP     |ELF |      |
-  +-----------+----------/ /---+---+----+-----------+----+------+
-        |                                           ^
-        |                                           |
-        \                                           /
-         -------------------------------------------
-          Boot memory content gets transferred to
-          reserved area by firmware at the time of
-          crash
+  Low memory                                              Top of memory
+  0      boot memory size      |<--- Reserved dump area --->|        |
+  |           |                |     Permanent Reservation  |        |
+  V           V                |                            |        V
+  +-----------+-----/ /---+---+----+-------+-----+-----+----+--+
+  |           |           |///|////|  DUMP | HDR | ELF |////|  |
+  +-----------+-----/ /---+---+----+-------+-----+-----+----+--+
+        |                   ^    ^      ^      ^           ^
+        |                   |    |      |      |           |
+        \                  CPU  HPTE    /      |           |
+         ------------------------------        |           |
+            Boot memory content gets transferred           |
+            to reserved area by firmware at the            |
+            time of crash.                                 |
+                                        FADump Header      |
+                                         (meta area)       |
+                                                            |
+                                                            |
+            Metadata: This area holds a metadata struture whose
+            address is registered with f/w and retrieved in the
+            second kernel after crash, on platforms that support
+            tags (OPAL). Having such structure with info needed
+            to process the crashdump eases dump capture process.
+
                    Fig. 1
 
+
   o Memory Reservation during second kernel after crash
 
   Low memory                                        Top of memory
   0      boot memory size                                       |
-  |           |<------------- Reserved dump area ----------- -->|
-  V           V                                                  V
-  +-----------+----------/ /---+---+----+-----------+----+------+
-  |           |                |CPU|HPTE|  DUMP     |ELF |      |
-  +-----------+----------/ /---+---+----+-----------+----+------+
+  |           |<------------ Crash preserved area ------------>|
+  V           V           |<--- Reserved dump area --->|       |
+  +-----------+-----/ /---+---+----+-------+-----+-----+----+--+
+  |           |           |///|////|  DUMP | HDR | ELF |////|  |
+  +-----------+-----/ /---+---+----+-------+-----+-----+----+--+
        |                                                 |
        V                                                 V
    Used by second                                    /proc/vmcore
   kernel to boot
+
+      +---+
+      |///| -> Regions (CPU, HPTE & Metadata) marked like this in the above
+      +---+    figures are not always present. For example, OPAL platform
+               does not have CPU & HPTE regions while Metadata region is
+               not supported on pSeries currently.
+
                    Fig. 2
 
-Currently the dump will be copied from /proc/vmcore to a
-a new file upon user intervention. The dump data available through
-/proc/vmcore will be in ELF format. Hence the existing kdump
-infrastructure (kdump scripts) to save the dump works fine with
-minor modifications.
+
+Currently the dump will be copied from /proc/vmcore to a new file upon
+user intervention. The dump data available through /proc/vmcore will be
+in ELF format. Hence the existing kdump infrastructure (kdump scripts)
+to save the dump works fine with minor modifications. KDump scripts on
+major Distro releases have already been modified to work seemlessly (no
+user intervention in saving the dump) when FADump is used, instead of
+KDump, as dump mechanism.
 
 The tools to examine the dump will be same as the ones
 used for kdump.
 
-How to enable firmware-assisted dump (fadump):
+How to enable firmware-assisted dump (FADump):
 ----------------------------------------------
 
 1. Set config option CONFIG_FA_DUMP=y and build kernel.
 2. Boot into linux kernel with 'fadump=on' kernel cmdline option.
-   By default, fadump reserved memory will be initialized as CMA area.
+   By default, FADump reserved memory will be initialized as CMA area.
    Alternatively, user can boot linux kernel with 'fadump=nocma' to
-   prevent fadump to use CMA.
+   prevent FADump to use CMA.
 3. Optionally, user can also set 'crashkernel=' kernel cmdline
    to specify size of the memory to reserve for boot memory dump
    preservation.
@@ -201,29 +247,29 @@ the control files and debugfs file to display memory reserved region.
 Here is the list of files under kernel sysfs:
 
  /sys/kernel/fadump_enabled
-    This is used to display the fadump status.
+    This is used to display the FADump status.
 
-    - 0 = fadump is disabled
-    - 1 = fadump is enabled
+    - 0 = FADump is disabled
+    - 1 = FADump is enabled
 
     This interface can be used by kdump init scripts to identify if
-    fadump is enabled in the kernel and act accordingly.
+    FADump is enabled in the kernel and act accordingly.
 
  /sys/kernel/fadump_registered
-    This is used to display the fadump registration status as well
-    as to control (start/stop) the fadump registration.
+    This is used to display the FADump registration status as well
+    as to control (start/stop) the FADump registration.
 
-    - 0 = fadump is not registered.
-    - 1 = fadump is registered and ready to handle system crash.
+    - 0 = FADump is not registered.
+    - 1 = FADump is registered and ready to handle system crash.
 
-    To register fadump echo 1 > /sys/kernel/fadump_registered and
+    To register FADump echo 1 > /sys/kernel/fadump_registered and
     echo 0 > /sys/kernel/fadump_registered for un-register and stop the
-    fadump. Once the fadump is un-registered, the system crash will not
+    FADump. Once the FADump is un-registered, the system crash will not
     be handled and vmcore will not be captured. This interface can be
     easily integrated with kdump service start/stop.
 
  /sys/kernel/fadump_release_mem
-    This file is available only when fadump is active during
+    This file is available only when FADump is active during
     second kernel. This is used to release the reserved memory
     region that are held for saving crash dump. To release the
     reserved memory echo 1 to it::
@@ -237,25 +283,38 @@ Here is the list of files under kernel sysfs:
     enhanced to use this interface to release the memory reserved for
     dump and continue without 2nd reboot.
 
+ /sys/kernel/fadump_release_opalcore
+
+    This file is available only on OPAL based machines when FADump is
+    active during capture kernel. This is used to release the memory
+    used by the kernel to export /sys/firmware/opal/core file. To
+    release this memory, echo '1' to it:
+
+    echo 1 > /sys/kernel/fadump_release_opalcore
+
 Here is the list of files under powerpc debugfs:
 (Assuming debugfs is mounted on /sys/kernel/debug directory.)
 
  /sys/kernel/debug/powerpc/fadump_region
-    This file shows the reserved memory regions if fadump is
+    This file shows the reserved memory regions if FADump is
     enabled otherwise this file is empty. The output format
     is::
 
       <region>: [<start>-<end>] <reserved-size> bytes, Dumped: <dump-size>
 
+    and for kernel DUMP region is:
+
+    DUMP: Src: <src-addr>, Dest: <dest-addr>, Size: <size>, Dumped: # bytes
+
     e.g.
-    Contents when fadump is registered during first kernel::
+    Contents when FADump is registered during first kernel::
 
       # cat /sys/kernel/debug/powerpc/fadump_region
       CPU : [0x0000006ffb0000-0x0000006fff001f] 0x40020 bytes, Dumped: 0x0
       HPTE: [0x0000006fff0020-0x0000006fff101f] 0x1000 bytes, Dumped: 0x0
       DUMP: [0x0000006fff1020-0x0000007fff101f] 0x10000000 bytes, Dumped: 0x0
 
-    Contents when fadump is active during second kernel::
+    Contents when FADump is active during second kernel::
 
       # cat /sys/kernel/debug/powerpc/fadump_region
       CPU : [0x0000006ffb0000-0x0000006fff001f] 0x40020 bytes, Dumped: 0x40020
@@ -263,6 +322,7 @@ Here is the list of files under powerpc debugfs:
       DUMP: [0x0000006fff1020-0x0000007fff101f] 0x10000000 bytes, Dumped: 0x10000000
           : [0x00000010000000-0x0000006ffaffff] 0x5ffb0000 bytes, Dumped: 0x5ffb0000
 
+
 NOTE:
     Please refer to Documentation/filesystems/debugfs.txt on
     how to mount the debugfs filesystem.
@@ -273,7 +333,7 @@ TODO:
  - Need to come up with the better approach to find out more
    accurate boot memory size that is required for a kernel to
    boot successfully when booted with restricted memory.
- - The fadump implementation introduces a fadump crash info structure
+ - The FADump implementation introduces a FADump crash info structure
    in the scratch area before the ELF core header. The idea of introducing
    this structure is to pass some important crash info data to the second
    kernel which will help second kernel to populate ELF core header with
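
The FADump sysfs workflow described in the documentation above boils down to writing '1' or '0' to a handful of files. A minimal sketch in C (illustrative only; the paths are the ones documented above and error handling is reduced to a return code)::

	#include <stdio.h>

	/* Write a short string such as "1" to a sysfs file, e.g.
	 * /sys/kernel/fadump_registered or /sys/kernel/fadump_release_mem. */
	static int fadump_sysfs_write(const char *path, const char *val)
	{
		FILE *f = fopen(path, "w");

		if (!f)
			return -1;
		if (fputs(val, f) == EOF) {
			fclose(f);
			return -1;
		}
		return fclose(f) == 0 ? 0 : -1;
	}

	/* e.g. fadump_sysfs_write("/sys/kernel/fadump_registered", "1"); */
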
diff --git a/Documentation/powerpc/index.rst b/Documentation/powerpc/index.rst
index 549b1cdd77ae..db7b6a880f52 100644
--- a/Documentation/powerpc/index.rst
+++ b/Documentation/powerpc/index.rst
@@ -15,6 +15,7 @@ powerpc
    dawr-power9
    dscr
    eeh-pci-error-recovery
+   elfnote
    firmware-assisted-dump
    hvcs
    isa-versions
@@ -25,6 +26,7 @@ powerpc
    qe_firmware
    syscall64-abi
    transactional_memory
+   ultravisor
 
 .. only:: subproject and html
 
diff --git a/Documentation/powerpc/ultravisor.rst b/Documentation/powerpc/ultravisor.rst
new file mode 100644
index 000000000000..730854f73830
--- /dev/null
+++ b/Documentation/powerpc/ultravisor.rst
@@ -0,0 +1,1054 @@
1.. SPDX-License-Identifier: GPL-2.0
2.. _ultravisor:
3
4============================
5Protected Execution Facility
6============================
7
8.. contents::
9 :depth: 3
10
11Protected Execution Facility
12############################
13
14 Protected Execution Facility (PEF) is an architectural change for
15 POWER 9 that enables Secure Virtual Machines (SVMs). DD2.3 chips
16 (PVR=0x004e1203) or greater will be PEF-capable. A new ISA release
17 will include the PEF RFC02487 changes.
18
19 When enabled, PEF adds a new higher privileged mode, called Ultravisor
20 mode, to POWER architecture. Along with the new mode there is new
21 firmware called the Protected Execution Ultravisor (or Ultravisor
22 for short). Ultravisor mode is the highest privileged mode in POWER
23 architecture.
24
25 +------------------+
26 | Privilege States |
27 +==================+
28 | Problem |
29 +------------------+
30 | Supervisor |
31 +------------------+
32 | Hypervisor |
33 +------------------+
34 | Ultravisor |
35 +------------------+
36
37 PEF protects SVMs from the hypervisor, privileged users, and other
38 VMs in the system. SVMs are protected while at rest and can only be
39 executed by an authorized machine. All virtual machines utilize
40 hypervisor services. The Ultravisor filters calls between the SVMs
41 and the hypervisor to assure that information does not accidentally
42 leak. All hypercalls except H_RANDOM are reflected to the hypervisor.
43 H_RANDOM is not reflected to prevent the hypervisor from influencing
44 random values in the SVM.
45
46 To support this there is a refactoring of the ownership of resources
47 in the CPU. Some of the resources which were previously hypervisor
48 privileged are now ultravisor privileged.
49
50Hardware
51========
52
53 The hardware changes include the following:
54
55 * There is a new bit in the MSR that determines whether the current
56 process is running in secure mode, MSR(S) bit 41. MSR(S)=1, process
57 is in secure mode, MSR(s)=0 process is in normal mode.
58
59 * The MSR(S) bit can only be set by the Ultravisor.
60
61 * HRFID cannot be used to set the MSR(S) bit. If the hypervisor needs
62 to return to a SVM it must use an ultracall. It can determine if
63 the VM it is returning to is secure.
64
65 * There is a new Ultravisor privileged register, SMFCTRL, which has an
66 enable/disable bit SMFCTRL(E).
67
68 * The privilege of a process is now determined by three MSR bits,
69 MSR(S, HV, PR). In each of the tables below the modes are listed
70 from least privilege to highest privilege. The higher privilege
71 modes can access all the resources of the lower privilege modes.
72
73 **Secure Mode MSR Settings**
74
75 +---+---+---+---------------+
76 | S | HV| PR|Privilege |
77 +===+===+===+===============+
78 | 1 | 0 | 1 | Problem |
79 +---+---+---+---------------+
80 | 1 | 0 | 0 | Privileged(OS)|
81 +---+---+---+---------------+
82 | 1 | 1 | 0 | Ultravisor |
83 +---+---+---+---------------+
84 | 1 | 1 | 1 | Reserved |
85 +---+---+---+---------------+
86
87 **Normal Mode MSR Settings**
88
89 +---+---+---+---------------+
90 | S | HV| PR|Privilege |
91 +===+===+===+===============+
92 | 0 | 0 | 1 | Problem |
93 +---+---+---+---------------+
94 | 0 | 0 | 0 | Privileged(OS)|
95 +---+---+---+---------------+
96 | 0 | 1 | 0 | Hypervisor |
97 +---+---+---+---------------+
98 | 0 | 1 | 1 | Problem (Host)|
99 +---+---+---+---------------+
100
101 * Memory is partitioned into secure and normal memory. Only processes
102 that are running in secure mode can access secure memory.
103
104 * The hardware does not allow anything that is not running secure to
105 access secure memory. This means that the Hypervisor cannot access
106 the memory of the SVM without using an ultracall (asking the
107 Ultravisor). The Ultravisor will only allow the hypervisor to see
108 the SVM memory encrypted.
109
110 * I/O systems are not allowed to directly address secure memory. This
111 limits the SVMs to virtual I/O only.
112
113 * The architecture allows the SVM to share pages of memory with the
114 hypervisor that are not protected with encryption. However, this
115 sharing must be initiated by the SVM.
116
117 * When a process is running in secure mode all hypercalls
118 (syscall lev=1) go to the Ultravisor.
119
120 * When a process is in secure mode all interrupts go to the
121 Ultravisor.
122
123 * The following resources have become Ultravisor privileged and
124 require an Ultravisor interface to manipulate:
125
126 * Processor configurations registers (SCOMs).
127
128 * Stop state information.
129
130 * The debug registers CIABR, DAWR, and DAWRX when SMFCTRL(D) is set.
131 If SMFCTRL(D) is not set they do not work in secure mode. When set,
132 reading and writing requires an Ultravisor call, otherwise that
133 will cause a Hypervisor Emulation Assistance interrupt.
134
135 * PTCR and partition table entries (partition table is in secure
136 memory). An attempt to write to PTCR will cause a Hypervisor
137 Emulation Assitance interrupt.
138
139 * LDBAR (LD Base Address Register) and IMC (In-Memory Collection)
140 non-architected registers. An attempt to write to them will cause a
141 Hypervisor Emulation Assistance interrupt.
142
143 * Paging for an SVM, sharing of memory with Hypervisor for an SVM.
144 (Including Virtual Processor Area (VPA) and virtual I/O).
145
146
147Software/Microcode
148==================
149
150 The software changes include:
151
152 * SVMs are created from normal VM using (open source) tooling supplied
153 by IBM.
154
155 * All SVMs start as normal VMs and utilize an ultracall, UV_ESM
156 (Enter Secure Mode), to make the transition.
157
158 * When the UV_ESM ultracall is made the Ultravisor copies the VM into
159 secure memory, decrypts the verification information, and checks the
160 integrity of the SVM. If the integrity check passes the Ultravisor
161 passes control in secure mode.
162
163 * The verification information includes the pass phrase for the
164 encrypted disk associated with the SVM. This pass phrase is given
165 to the SVM when requested.
166
167 * The Ultravisor is not involved in protecting the encrypted disk of
168 the SVM while at rest.
169
170 * For external interrupts the Ultravisor saves the state of the SVM,
171 and reflects the interrupt to the hypervisor for processing.
172 For hypercalls, the Ultravisor inserts neutral state into all
173 registers not needed for the hypercall then reflects the call to
174 the hypervisor for processing. The H_RANDOM hypercall is performed
175 by the Ultravisor and not reflected.
176
177 * For virtual I/O to work bounce buffering must be done.
178
179 * The Ultravisor uses AES (IAPM) for protection of SVM memory. IAPM
180 is a mode of AES that provides integrity and secrecy concurrently.
181
182 * The movement of data between normal and secure pages is coordinated
183 with the Ultravisor by a new HMM plug-in in the Hypervisor.
184
185 The Ultravisor offers new services to the hypervisor and SVMs. These
186 are accessed through ultracalls.
187
188Terminology
189===========
190
191 * Hypercalls: special system calls used to request services from
192 Hypervisor.
193
194 * Normal memory: Memory that is accessible to Hypervisor.
195
196 * Normal page: Page backed by normal memory and available to
197 Hypervisor.
198
199 * Shared page: A page backed by normal memory and available to both
200 the Hypervisor/QEMU and the SVM (i.e page has mappings in SVM and
201 Hypervisor/QEMU).
202
203 * Secure memory: Memory that is accessible only to Ultravisor and
204 SVMs.
205
206 * Secure page: Page backed by secure memory and only available to
207 Ultravisor and SVM.
208
209 * SVM: Secure Virtual Machine.
210
211 * Ultracalls: special system calls used to request services from
212 Ultravisor.
213
214
215Ultravisor calls API
216####################
217
218 This section describes Ultravisor calls (ultracalls) needed to
219 support Secure Virtual Machines (SVM)s and Paravirtualized KVM. The
220 ultracalls allow the SVMs and Hypervisor to request services from the
221 Ultravisor such as accessing a register or memory region that can only
222 be accessed when running in Ultravisor-privileged mode.
223
224 The specific service needed from an ultracall is specified in register
225 R3 (the first parameter to the ultracall). Other parameters to the
226 ultracall, if any, are specified in registers R4 through R12.
227
228 Return value of all ultracalls is in register R3. Other output values
229 from the ultracall, if any, are returned in registers R4 through R12.
230 The only exception to this register usage is the ``UV_RETURN``
231 ultracall described below.
232
233 Each ultracall returns specific error codes, applicable in the context
234 of the ultracall. However, like with the PowerPC Architecture Platform
235 Reference (PAPR), if no specific error code is defined for a
236 particular situation, then the ultracall will fallback to an erroneous
237 parameter-position based code. i.e U_PARAMETER, U_P2, U_P3 etc
238 depending on the ultracall parameter that may have caused the error.
239
240 Some ultracalls involve transferring a page of data between Ultravisor
241 and Hypervisor. Secure pages that are transferred from secure memory
242 to normal memory may be encrypted using dynamically generated keys.
243 When the secure pages are transferred back to secure memory, they may
244 be decrypted using the same dynamically generated keys. Generation and
245 management of these keys will be covered in a separate document.
246
247 For now this only covers ultracalls currently implemented and being
248 used by Hypervisor and SVMs but others can be added here when it
249 makes sense.
250
251 The full specification for all hypercalls/ultracalls will eventually
252 be made available in the public/OpenPower version of the PAPR
253 specification.
254
255 .. note::
256
257 If PEF is not enabled, the ultracalls will be redirected to the
258 Hypervisor which must handle/fail the calls.
259
260Ultracalls used by Hypervisor
261=============================
262
263 This section describes the virtual memory management ultracalls used
264 by the Hypervisor to manage SVMs.
265
266UV_PAGE_OUT
267-----------
268
269 Encrypt and move the contents of a page from secure memory to normal
270 memory.
271
272Syntax
273~~~~~~
274
275.. code-block:: c
276
277 uint64_t ultracall(const uint64_t UV_PAGE_OUT,
278 uint16_t lpid, /* LPAR ID */
279 uint64_t dest_ra, /* real address of destination page */
280 uint64_t src_gpa, /* source guest-physical-address */
281 uint8_t flags, /* flags */
282 uint64_t order) /* page size order */
283
284Return values
285~~~~~~~~~~~~~
286
287 One of the following values:
288
289 * U_SUCCESS on success.
290 * U_PARAMETER if ``lpid`` is invalid.
291 * U_P2 if ``dest_ra`` is invalid.
292 * U_P3 if the ``src_gpa`` address is invalid.
293 * U_P4 if any bit in the ``flags`` is unrecognized
294 * U_P5 if the ``order`` parameter is unsupported.
295 * U_FUNCTION if functionality is not supported.
296 * U_BUSY if page cannot be currently paged-out.
297
298Description
299~~~~~~~~~~~
300
301 Encrypt the contents of a secure-page and make it available to
302 Hypervisor in a normal page.
303
304 By default, the source page is unmapped from the SVM's partition-
305 scoped page table. But the Hypervisor can provide a hint to the
306 Ultravisor to retain the page mapping by setting the ``UV_SNAPSHOT``
307 flag in ``flags`` parameter.
308
309 If the source page is already a shared page the call returns
310 U_SUCCESS, without doing anything.
311
312Use cases
313~~~~~~~~~
314
315 #. QEMU attempts to access an address belonging to the SVM but the
316 page frame for that address is not mapped into QEMU's address
317 space. In this case, the Hypervisor will allocate a page frame,
318 map it into QEMU's address space and issue the ``UV_PAGE_OUT``
319 call to retrieve the encrypted contents of the page.
320
321 #. When Ultravisor runs low on secure memory and it needs to page-out
322 an LRU page. In this case, Ultravisor will issue the
323 ``H_SVM_PAGE_OUT`` hypercall to the Hypervisor. The Hypervisor will
324 then allocate a normal page and issue the ``UV_PAGE_OUT`` ultracall
325 and the Ultravisor will encrypt and move the contents of the secure
326 page into the normal page.
327
328 #. When Hypervisor accesses SVM data, the Hypervisor requests the
329 Ultravisor to transfer the corresponding page into a insecure page,
330 which the Hypervisor can access. The data in the normal page will
331 be encrypted though.
332
333UV_PAGE_IN
334----------
335
336 Move the contents of a page from normal memory to secure memory.
337
338Syntax
339~~~~~~
340
341.. code-block:: c
342
343 uint64_t ultracall(const uint64_t UV_PAGE_IN,
344 uint16_t lpid, /* the LPAR ID */
345 uint64_t src_ra, /* source real address of page */
346 uint64_t dest_gpa, /* destination guest physical address */
347 uint64_t flags, /* flags */
348 uint64_t order) /* page size order */
349
350Return values
351~~~~~~~~~~~~~
352
353 One of the following values:
354
355 * U_SUCCESS on success.
356 * U_BUSY if page cannot be currently paged-in.
357 * U_FUNCTION if functionality is not supported
358 * U_PARAMETER if ``lpid`` is invalid.
359 * U_P2 if ``src_ra`` is invalid.
360 * U_P3 if the ``dest_gpa`` address is invalid.
361 * U_P4 if any bit in the ``flags`` is unrecognized
362 * U_P5 if the ``order`` parameter is unsupported.
363
364Description
365~~~~~~~~~~~
366
367 Move the contents of the page identified by ``src_ra`` from normal
368 memory to secure memory and map it to the guest physical address
369 ``dest_gpa``.
370
 371 If ``dest_gpa`` refers to a shared address, map the page into the
 372 partition-scoped page-table of the SVM. If ``dest_gpa`` is not shared,
 373 copy the contents of the page into the corresponding secure page.
 374 Depending on the context, decrypt the page before it is copied.
375
376 The caller provides the attributes of the page through the ``flags``
377 parameter. Valid values for ``flags`` are:
378
379 * CACHE_INHIBITED
380 * CACHE_ENABLED
381 * WRITE_PROTECTION
382
 383 The Hypervisor must pin the page in memory before making the
 384 ``UV_PAGE_IN`` ultracall.
385
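 As an illustration (an assumption, not part of the specification), the
 Hypervisor side could wrap this ultracall the same way as UV_PAGE_OUT,
 again using the ``ucall_norets()`` prototype:

.. code-block:: c

    /* Illustrative sketch only. */
    static inline long uv_page_in(u64 lpid, u64 src_ra, u64 dest_gpa,
                                  u64 flags, u64 order)
    {
            return ucall_norets(UV_PAGE_IN, lpid, src_ra, dest_gpa,
                                flags, order);
    }
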
386Use cases
387~~~~~~~~~
388
 389 #. When a normal VM switches to secure mode, all its pages residing
 390 in normal memory are moved into secure memory.
 391
 392 #. When an SVM requests to share a page with the Hypervisor, the
 393 Hypervisor allocates a page and informs the Ultravisor.
394
 395 #. When an SVM accesses a secure page that has been paged-out, the
 396 Ultravisor invokes the Hypervisor to locate the page. After
 397 locating the page, the Hypervisor uses ``UV_PAGE_IN`` to make the
 398 page available to the Ultravisor.
399
400UV_PAGE_INVAL
401-------------
402
403 Invalidate the Ultravisor mapping of a page.
404
405Syntax
406~~~~~~
407
408.. code-block:: c
409
410 uint64_t ultracall(const uint64_t UV_PAGE_INVAL,
411 uint16_t lpid, /* the LPAR ID */
412 uint64_t guest_pa, /* destination guest-physical-address */
413 uint64_t order) /* page size order */
414
415Return values
416~~~~~~~~~~~~~
417
418 One of the following values:
419
420 * U_SUCCESS on success.
421 * U_PARAMETER if ``lpid`` is invalid.
422 * U_P2 if ``guest_pa`` is invalid (or corresponds to a secure
423 page mapping).
424 * U_P3 if the ``order`` is invalid.
425 * U_FUNCTION if functionality is not supported.
426 * U_BUSY if page cannot be currently invalidated.
427
428Description
429~~~~~~~~~~~
430
 431 This ultracall informs the Ultravisor that the page mapping in the
 432 Hypervisor corresponding to the given guest physical address has been
 433 invalidated and that the Ultravisor should not access the page. If the
 434 specified ``guest_pa`` corresponds to a secure page, the Ultravisor
 435 will ignore the attempt to invalidate the page and return U_P2.
436
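 A hedged, illustrative sketch of a possible Hypervisor-side wrapper
 (the helper name is an assumption):

.. code-block:: c

    /* Illustrative sketch only. */
    static inline long uv_page_inval(u64 lpid, u64 guest_pa, u64 order)
    {
            return ucall_norets(UV_PAGE_INVAL, lpid, guest_pa, order);
    }
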
437Use cases
438~~~~~~~~~
439
 440 #. When a shared page is unmapped from QEMU's page table, possibly
 441 because it is paged out to disk, the Ultravisor needs to know that
 442 it must no longer access the page either.
443
444
445UV_WRITE_PATE
446-------------
447
448 Validate and write the partition table entry (PATE) for a given
449 partition.
450
451Syntax
452~~~~~~
453
454.. code-block:: c
455
456 uint64_t ultracall(const uint64_t UV_WRITE_PATE,
457 uint32_t lpid, /* the LPAR ID */
 458 uint64_t dw0, /* the first double word to write */
459 uint64_t dw1) /* the second double word to write */
460
461Return values
462~~~~~~~~~~~~~
463
464 One of the following values:
465
466 * U_SUCCESS on success.
467 * U_BUSY if PATE cannot be currently written to.
468 * U_FUNCTION if functionality is not supported.
469 * U_PARAMETER if ``lpid`` is invalid.
470 * U_P2 if ``dw0`` is invalid.
 471 * U_P3 if ``dw1`` is invalid.
472 * U_PERMISSION if the Hypervisor is attempting to change the PATE
473 of a secure virtual machine or if called from a
474 context other than Hypervisor.
475
476Description
477~~~~~~~~~~~
478
 479 Validate and write the partition table entry (PATE) for the given
 480 LPID. If the LPID is already allocated and initialized, this call
 481 updates its partition table entry.
482
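 For illustration, a Hypervisor-side wrapper might look like the sketch
 below (helper name assumed, not mandated by this document):

.. code-block:: c

    /* Illustrative sketch only: write both doublewords of the PATE. */
    static inline long uv_write_pate(u64 lpid, u64 dw0, u64 dw1)
    {
            return ucall_norets(UV_WRITE_PATE, lpid, dw0, dw1);
    }
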
483Use cases
484~~~~~~~~~
485
 486 #. The partition table resides in secure memory and its entries,
 487 called PATEs (Partition Table Entries), point to the partition-
 488 scoped page tables for the Hypervisor as well as each of the
 489 virtual machines (both secure and normal). The Hypervisor
 490 operates in partition 0 and its partition-scoped page tables
 491 reside in normal memory.
492
493 #. This ultracall allows the Hypervisor to register the partition-
494 scoped and process-scoped page table entries for the Hypervisor
495 and other partitions (virtual machines) with the Ultravisor.
496
497 #. If the value of the PATE for an existing partition (VM) changes,
498 the TLB cache for the partition is flushed.
499
 500 #. The Hypervisor is responsible for allocating LPIDs. The LPID and
 501 its PATE are registered together. The Hypervisor manages
 502 the PATEs for a normal VM and can change a PATE at
 503 any time. The Ultravisor manages the PATEs for an SVM and the
 504 Hypervisor is not allowed to modify them.
505
506UV_RETURN
507---------
508
509 Return control from the Hypervisor back to the Ultravisor after
 510 processing a hypercall or interrupt that was forwarded (aka
511 *reflected*) to the Hypervisor.
512
513Syntax
514~~~~~~
515
516.. code-block:: c
517
518 uint64_t ultracall(const uint64_t UV_RETURN)
519
520Return values
521~~~~~~~~~~~~~
522
 523 This call never returns to the Hypervisor on success. It returns
 524 U_INVALID if the ultracall is not made from a Hypervisor context.
525
526Description
527~~~~~~~~~~~
528
 529 When an SVM makes a hypercall or incurs some other exception, the
 530 Ultravisor usually forwards (aka *reflects*) the exception to the
 531 Hypervisor. After processing the exception, the Hypervisor uses the
 532 ``UV_RETURN`` ultracall to return control back to the SVM.
533
534 The expected register state on entry to this ultracall is:
535
536 * Non-volatile registers are restored to their original values.
 537 * If returning from a hypercall, register R0 contains the return
 538 value (**unlike other ultracalls**) and registers R4 through R12
 539 contain any output values of the hypercall.
 540 * R3 contains the ultracall number, i.e. UV_RETURN.
541 * If returning with a synthesized interrupt, R2 contains the
542 synthesized interrupt number.
543
544Use cases
545~~~~~~~~~
546
 547 #. The Ultravisor relies on the Hypervisor to provide several services
 548 to the SVM, such as processing hypercalls and other exceptions. After
 549 processing the exception, the Hypervisor uses UV_RETURN to return
 550 control back to the Ultravisor.
551
 552 #. The Hypervisor has to use this ultracall to return control to the SVM.
553
554
555UV_REGISTER_MEM_SLOT
556--------------------
557
558 Register an SVM address-range with specified properties.
559
560Syntax
561~~~~~~
562
563.. code-block:: c
564
565 uint64_t ultracall(const uint64_t UV_REGISTER_MEM_SLOT,
566 uint64_t lpid, /* LPAR ID of the SVM */
567 uint64_t start_gpa, /* start guest physical address */
568 uint64_t size, /* size of address range in bytes */
 569 uint64_t flags, /* reserved for future expansion */
570 uint16_t slotid) /* slot identifier */
571
572Return values
573~~~~~~~~~~~~~
574
575 One of the following values:
576
577 * U_SUCCESS on success.
578 * U_PARAMETER if ``lpid`` is invalid.
579 * U_P2 if ``start_gpa`` is invalid.
580 * U_P3 if ``size`` is invalid.
581 * U_P4 if any bit in the ``flags`` is unrecognized.
582 * U_P5 if the ``slotid`` parameter is unsupported.
583 * U_PERMISSION if called from context other than Hypervisor.
584 * U_FUNCTION if functionality is not supported.
585
586
587Description
588~~~~~~~~~~~
589
590 Register a memory range for an SVM. The memory range starts at the
591 guest physical address ``start_gpa`` and is ``size`` bytes long.
592
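 A hedged sketch of how the Hypervisor might register a slot while
 iterating over its memory slots (helper name assumed):

.. code-block:: c

    /* Illustrative sketch only. */
    static inline long uv_register_mem_slot(u64 lpid, u64 start_gpa,
                                            u64 size, u64 flags, u64 slotid)
    {
            return ucall_norets(UV_REGISTER_MEM_SLOT, lpid, start_gpa,
                                size, flags, slotid);
    }
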
593Use cases
594~~~~~~~~~
595
596
 597 #. When a virtual machine goes secure, all the memory slots managed by
 598 the Hypervisor move into secure memory. The Hypervisor iterates
 599 through each of the memory slots and registers the slot with the
 600 Ultravisor. The Hypervisor may discard some slots, such as those used
 601 for firmware (SLOF).
602
603 #. When new memory is hot-plugged, a new memory slot gets registered.
604
605
606UV_UNREGISTER_MEM_SLOT
607----------------------
608
609 Unregister an SVM address-range that was previously registered using
610 UV_REGISTER_MEM_SLOT.
611
612Syntax
613~~~~~~
614
615.. code-block:: c
616
617 uint64_t ultracall(const uint64_t UV_UNREGISTER_MEM_SLOT,
618 uint64_t lpid, /* LPAR ID of the SVM */
619 uint64_t slotid) /* reservation slotid */
620
621Return values
622~~~~~~~~~~~~~
623
624 One of the following values:
625
626 * U_SUCCESS on success.
627 * U_FUNCTION if functionality is not supported.
628 * U_PARAMETER if ``lpid`` is invalid.
629 * U_P2 if ``slotid`` is invalid.
630 * U_PERMISSION if called from context other than Hypervisor.
631
632Description
633~~~~~~~~~~~
634
635 Release the memory slot identified by ``slotid`` and free any
636 resources allocated towards the reservation.
637
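 An illustrative counterpart to the registration sketch above (helper
 name assumed):

.. code-block:: c

    /* Illustrative sketch only. */
    static inline long uv_unregister_mem_slot(u64 lpid, u64 slotid)
    {
            return ucall_norets(UV_UNREGISTER_MEM_SLOT, lpid, slotid);
    }
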
638Use cases
639~~~~~~~~~
640
641 #. Memory hot-remove.
642
643
644UV_SVM_TERMINATE
645----------------
646
647 Terminate an SVM and release its resources.
648
649Syntax
650~~~~~~
651
652.. code-block:: c
653
654 uint64_t ultracall(const uint64_t UV_SVM_TERMINATE,
 655 uint64_t lpid) /* LPAR ID of the SVM */
656
657Return values
658~~~~~~~~~~~~~
659
660 One of the following values:
661
662 * U_SUCCESS on success.
663 * U_FUNCTION if functionality is not supported.
664 * U_PARAMETER if ``lpid`` is invalid.
665 * U_INVALID if VM is not secure.
666 * U_PERMISSION if not called from a Hypervisor context.
667
668Description
669~~~~~~~~~~~
670
671 Terminate an SVM and release all its resources.
672
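 An illustrative sketch of the Hypervisor-side call (helper name
 assumed):

.. code-block:: c

    /* Illustrative sketch only. */
    static inline long uv_svm_terminate(u64 lpid)
    {
            return ucall_norets(UV_SVM_TERMINATE, lpid);
    }
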
673Use cases
674~~~~~~~~~
675
676 #. Called by Hypervisor when terminating an SVM.
677
678
679Ultracalls used by SVM
680======================
681
682UV_SHARE_PAGE
683-------------
684
685 Share a set of guest physical pages with the Hypervisor.
686
687Syntax
688~~~~~~
689
690.. code-block:: c
691
692 uint64_t ultracall(const uint64_t UV_SHARE_PAGE,
693 uint64_t gfn, /* guest page frame number */
694 uint64_t num) /* number of pages of size PAGE_SIZE */
695
696Return values
697~~~~~~~~~~~~~
698
699 One of the following values:
700
701 * U_SUCCESS on success.
702 * U_FUNCTION if functionality is not supported.
703 * U_INVALID if the VM is not secure.
704 * U_PARAMETER if ``gfn`` is invalid.
705 * U_P2 if ``num`` is invalid.
706
707Description
708~~~~~~~~~~~
709
710 Share the ``num`` pages starting at guest physical frame number ``gfn``
 711 with the Hypervisor. Assume the page size is PAGE_SIZE bytes. Zero the
712 pages before returning.
713
714 If the address is already backed by a secure page, unmap the page and
715 back it with an insecure page, with the help of the Hypervisor. If it
716 is not backed by any page yet, mark the PTE as insecure and back it
717 with an insecure page when the address is accessed. If it is already
718 backed by an insecure page, zero the page and return.
719
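 For illustration only, an SVM kernel could issue the call through a
 thin wrapper such as the sketch below (helper name assumed); the
 ``gfn``/``num`` arguments match the syntax above:

.. code-block:: c

    /* Illustrative sketch only: share num PAGE_SIZE pages starting at gfn. */
    static inline long uv_share_page(u64 gfn, u64 num)
    {
            return ucall_norets(UV_SHARE_PAGE, gfn, num);
    }
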
720Use cases
721~~~~~~~~~
722
723 #. The Hypervisor cannot access the SVM pages since they are backed by
 724 secure pages. Hence an SVM must explicitly ask the Ultravisor for
 725 pages it can share with the Hypervisor.
726
727 #. Shared pages are needed to support virtio and Virtual Processor Area
728 (VPA) in SVMs.
729
730
731UV_UNSHARE_PAGE
732---------------
733
734 Restore a shared SVM page to its initial state.
735
736Syntax
737~~~~~~
738
739.. code-block:: c
740
741 uint64_t ultracall(const uint64_t UV_UNSHARE_PAGE,
742 uint64_t gfn, /* guest page frame number */
 743 uint64_t num) /* number of pages of size PAGE_SIZE */
744
745Return values
746~~~~~~~~~~~~~
747
748 One of the following values:
749
750 * U_SUCCESS on success.
751 * U_FUNCTION if functionality is not supported.
752 * U_INVALID if VM is not secure.
753 * U_PARAMETER if ``gfn`` is invalid.
754 * U_P2 if ``num`` is invalid.
755
756Description
757~~~~~~~~~~~
758
759 Stop sharing ``num`` pages starting at ``gfn`` with the Hypervisor.
760 Assume that the page size is PAGE_SIZE. Zero the pages before
761 returning.
762
763 If the address is already backed by an insecure page, unmap the page
 764 and back it with a secure page. Inform the Hypervisor to release its
 765 reference to the shared page. If the address is not backed by a page
 766 yet, mark the PTE as secure and back it with a secure page when that
 767 address is accessed. If it is already backed by a secure page, zero
 768 the page and return.
769
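 The corresponding illustrative sketch for unsharing (helper name
 assumed):

.. code-block:: c

    /* Illustrative sketch only. */
    static inline long uv_unshare_page(u64 gfn, u64 num)
    {
            return ucall_norets(UV_UNSHARE_PAGE, gfn, num);
    }
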
770Use cases
771~~~~~~~~~
772
773 #. The SVM may decide to unshare a page from the Hypervisor.
774
775
776UV_UNSHARE_ALL_PAGES
777--------------------
778
779 Unshare all pages the SVM has shared with Hypervisor.
780
781Syntax
782~~~~~~
783
784.. code-block:: c
785
786 uint64_t ultracall(const uint64_t UV_UNSHARE_ALL_PAGES)
787
788Return values
789~~~~~~~~~~~~~
790
791 One of the following values:
792
793 * U_SUCCESS on success.
794 * U_FUNCTION if functionality is not supported.
 795 * U_INVALID if VM is not secure.
796
797Description
798~~~~~~~~~~~
799
800 Unshare all shared pages from the Hypervisor. All unshared pages are
801 zeroed on return. Only pages explicitly shared by the SVM with the
802 Hypervisor (using UV_SHARE_PAGE ultracall) are unshared. Ultravisor
803 may internally share some pages with the Hypervisor without explicit
804 request from the SVM. These pages will not be unshared by this
805 ultracall.
806
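 A hedged sketch, for example as part of a kexec or SVM reset path
 (helper name assumed):

.. code-block:: c

    /* Illustrative sketch only: no arguments besides the ultracall number. */
    static inline long uv_unshare_all_pages(void)
    {
            return ucall_norets(UV_UNSHARE_ALL_PAGES);
    }
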
807Use cases
808~~~~~~~~~
809
810 #. This call is needed when ``kexec`` is used to boot a different
811 kernel. It may also be needed during SVM reset.
812
813UV_ESM
814------
815
816 Secure the virtual machine (*enter secure mode*).
817
818Syntax
819~~~~~~
820
821.. code-block:: c
822
823 uint64_t ultracall(const uint64_t UV_ESM,
824 uint64_t esm_blob_addr, /* location of the ESM blob */
 825 uint64_t fdt) /* Flattened device tree */
826
827Return values
828~~~~~~~~~~~~~
829
830 One of the following values:
831
832 * U_SUCCESS on success (including if VM is already secure).
833 * U_FUNCTION if functionality is not supported.
834 * U_INVALID if VM is not secure.
835 * U_PARAMETER if ``esm_blob_addr`` is invalid.
836 * U_P2 if ``fdt`` is invalid.
837 * U_PERMISSION if any integrity checks fail.
 838 * U_RETRY if there is insufficient memory to create the SVM.
 839 * U_NO_KEY if the symmetric key is unavailable.
840
841Description
842~~~~~~~~~~~
843
844 Secure the virtual machine. On successful completion, return
845 control to the virtual machine at the address specified in the
846 ESM blob.
847
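 For illustration only (an assumption, not part of the specification),
 a guest could request the switch early in boot roughly as sketched
 below, where ``esm_blob_addr`` and ``fdt`` are the addresses described
 in the syntax above:

.. code-block:: c

    /* Illustrative sketch only; a real caller runs very early in boot. */
    static long enter_secure_mode(u64 esm_blob_addr, u64 fdt)
    {
            return ucall_norets(UV_ESM, esm_blob_addr, fdt);
    }
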
848Use cases
849~~~~~~~~~
850
851 #. A normal virtual machine can choose to switch to a secure mode.
852
853Hypervisor Calls API
854####################
855
856 This document describes the Hypervisor calls (hypercalls) that are
857 needed to support the Ultravisor. Hypercalls are services provided by
 858 the Hypervisor to virtual machines and the Ultravisor.
859
860 Register usage for these hypercalls is identical to that of the other
861 hypercalls defined in the Power Architecture Platform Reference (PAPR)
 862 document, i.e. on input, register R3 identifies the specific service
863 that is being requested and registers R4 through R11 contain
864 additional parameters to the hypercall, if any. On output, register
865 R3 contains the return value and registers R4 through R9 contain any
866 other output values from the hypercall.
867
868 This document only covers hypercalls currently implemented/planned
869 for Ultravisor usage but others can be added here when it makes sense.
870
871 The full specification for all hypercalls/ultracalls will eventually
872 be made available in the public/OpenPower version of the PAPR
873 specification.
874
875Hypervisor calls to support Ultravisor
876======================================
877
 878 The following hypercalls are needed to support the Ultravisor.
879
880H_SVM_INIT_START
881----------------
882
883 Begin the process of converting a normal virtual machine into an SVM.
884
885Syntax
886~~~~~~
887
888.. code-block:: c
889
890 uint64_t hypercall(const uint64_t H_SVM_INIT_START)
891
892Return values
893~~~~~~~~~~~~~
894
895 One of the following values:
896
897 * H_SUCCESS on success.
898
899Description
900~~~~~~~~~~~
901
902 Initiate the process of securing a virtual machine. This involves
903 coordinating with the Ultravisor, using ultracalls, to allocate
 904 resources in the Ultravisor for the new SVM, transferring the VM's
 905 pages from normal to secure memory, etc. When the process is
 906 completed, the Ultravisor issues the ``H_SVM_INIT_DONE`` hypercall.
907
908Use cases
909~~~~~~~~~
910
 911 #. The Ultravisor uses this hypercall to inform the Hypervisor that a
 912 VM has initiated the process of switching to secure mode.
913
914
915H_SVM_INIT_DONE
916---------------
917
918 Complete the process of securing an SVM.
919
920Syntax
921~~~~~~
922
923.. code-block:: c
924
925 uint64_t hypercall(const uint64_t H_SVM_INIT_DONE)
926
927Return values
928~~~~~~~~~~~~~
929
930 One of the following values:
931
932 * H_SUCCESS on success.
933 * H_UNSUPPORTED if called from the wrong context (e.g.
934 from an SVM or before an H_SVM_INIT_START
935 hypercall).
936
937Description
938~~~~~~~~~~~
939
940 Complete the process of securing a virtual machine. This call must
 941 be made after a prior ``H_SVM_INIT_START`` hypercall.
942
943Use cases
944~~~~~~~~~
945
 946 On successfully securing a virtual machine, the Ultravisor informs
 947 the Hypervisor about it. The Hypervisor can use this call to finish
 948 setting up its internal state for this virtual machine.
949
950
951H_SVM_PAGE_IN
952-------------
953
954 Move the contents of a page from normal memory to secure memory.
955
956Syntax
957~~~~~~
958
959.. code-block:: c
960
961 uint64_t hypercall(const uint64_t H_SVM_PAGE_IN,
962 uint64_t guest_pa, /* guest-physical-address */
963 uint64_t flags, /* flags */
964 uint64_t order) /* page size order */
965
966Return values
967~~~~~~~~~~~~~
968
969 One of the following values:
970
971 * H_SUCCESS on success.
972 * H_PARAMETER if ``guest_pa`` is invalid.
973 * H_P2 if ``flags`` is invalid.
974 * H_P3 if ``order`` of page is invalid.
975
976Description
977~~~~~~~~~~~
978
 979 Retrieve the content of the page belonging to the VM at the
 980 specified guest physical address.
981
 982 The only valid values for ``flags`` are:
 983
 984 * H_PAGE_IN_SHARED, which indicates that the page is to be shared
 985 with the Ultravisor.
 986
 987 * H_PAGE_IN_NONSHARED, which indicates that the Ultravisor is no
 988 longer interested in the page (applicable if the page is shared).
989
990 The ``order`` parameter must correspond to the configured page size.
991
992Use cases
993~~~~~~~~~
994
995 #. When a normal VM becomes a secure VM (using the UV_ESM ultracall),
 996 the Ultravisor uses this hypercall to move the contents of each page of
997 the VM from normal memory to secure memory.
998
 999 #. The Ultravisor uses this hypercall to ask the Hypervisor to provide a
 1000 page in normal memory that can be shared between the SVM and the Hypervisor.
1001
 1002 #. The Ultravisor uses this hypercall to page in a paged-out page. This
 1003 can happen when the SVM touches a paged-out page.
1004
 1005 #. If the SVM wants to disable sharing of pages with the Hypervisor,
 1006 it can inform the Ultravisor to do so. The Ultravisor will then use
 1007 this hypercall to inform the Hypervisor that it has released access
 1008 to the normal page.
1009
1010H_SVM_PAGE_OUT
1011---------------
1012
1013 Move the contents of the page to normal memory.
1014
1015Syntax
1016~~~~~~
1017
1018.. code-block:: c
1019
1020 uint64_t hypercall(const uint64_t H_SVM_PAGE_OUT,
1021 uint64_t guest_pa, /* guest-physical-address */
1022 uint64_t flags, /* flags (currently none) */
1023 uint64_t order) /* page size order */
1024
1025Return values
1026~~~~~~~~~~~~~
1027
1028 One of the following values:
1029
1030 * H_SUCCESS on success.
1031 * H_PARAMETER if ``guest_pa`` is invalid.
1032 * H_P2 if ``flags`` is invalid.
1033 * H_P3 if ``order`` is invalid.
1034
1035Description
1036~~~~~~~~~~~
1037
1038 Move the contents of the page identified by ``guest_pa`` to normal
1039 memory.
1040
1041 Currently ``flags`` is unused and must be set to 0. The ``order``
1042 parameter must correspond to the configured page size.
1043
1044Use cases
1045~~~~~~~~~
1046
 1047 #. If the Ultravisor is running low on secure pages, it can move the
 1048 contents of some secure pages into normal pages using this
1049 hypercall. The content will be encrypted.
1050
1051References
1052##########
1053
1054- `Supporting Protected Computing on IBM Power Architecture <https://developer.ibm.com/articles/l-support-protected-computing/>`_
diff --git a/arch/Kconfig b/arch/Kconfig
index f2a3dc80d46b..0fcf8ec1e098 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -946,6 +946,9 @@ config RELR
946 well as compatible NM and OBJCOPY utilities (llvm-nm and llvm-objcopy 946 well as compatible NM and OBJCOPY utilities (llvm-nm and llvm-objcopy
947 are compatible). 947 are compatible).
948 948
949config ARCH_HAS_MEM_ENCRYPT
950 bool
951
949source "kernel/gcov/Kconfig" 952source "kernel/gcov/Kconfig"
950 953
951source "scripts/gcc-plugins/Kconfig" 954source "scripts/gcc-plugins/Kconfig"
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 19ee5f155a08..3e56c9c2f16e 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -128,14 +128,15 @@ config PPC
128 select ARCH_HAS_HUGEPD if HUGETLB_PAGE 128 select ARCH_HAS_HUGEPD if HUGETLB_PAGE
129 select ARCH_HAS_MMIOWB if PPC64 129 select ARCH_HAS_MMIOWB if PPC64
130 select ARCH_HAS_PHYS_TO_DMA 130 select ARCH_HAS_PHYS_TO_DMA
131 select ARCH_HAS_PMEM_API if PPC64 131 select ARCH_HAS_PMEM_API
132 select ARCH_HAS_PTE_DEVMAP if PPC_BOOK3S_64 132 select ARCH_HAS_PTE_DEVMAP if PPC_BOOK3S_64
133 select ARCH_HAS_PTE_SPECIAL 133 select ARCH_HAS_PTE_SPECIAL
134 select ARCH_HAS_MEMBARRIER_CALLBACKS 134 select ARCH_HAS_MEMBARRIER_CALLBACKS
135 select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE && PPC64 135 select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE && PPC_BOOK3S_64
136 select ARCH_HAS_STRICT_KERNEL_RWX if ((PPC_BOOK3S_64 || PPC32) && !RELOCATABLE && !HIBERNATION) 136 select ARCH_HAS_STRICT_KERNEL_RWX if ((PPC_BOOK3S_64 || PPC32) && !RELOCATABLE && !HIBERNATION)
137 select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST 137 select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
138 select ARCH_HAS_UACCESS_FLUSHCACHE if PPC64 138 select ARCH_HAS_UACCESS_FLUSHCACHE
139 select ARCH_HAS_UACCESS_MCSAFE if PPC64
139 select ARCH_HAS_UBSAN_SANITIZE_ALL 140 select ARCH_HAS_UBSAN_SANITIZE_ALL
140 select ARCH_HAVE_NMI_SAFE_CMPXCHG 141 select ARCH_HAVE_NMI_SAFE_CMPXCHG
141 select ARCH_KEEP_MEMBLOCK 142 select ARCH_KEEP_MEMBLOCK
@@ -183,6 +184,7 @@ config PPC
183 select HAVE_STACKPROTECTOR if PPC64 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r13) 184 select HAVE_STACKPROTECTOR if PPC64 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r13)
184 select HAVE_STACKPROTECTOR if PPC32 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r2) 185 select HAVE_STACKPROTECTOR if PPC32 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r2)
185 select HAVE_CONTEXT_TRACKING if PPC64 186 select HAVE_CONTEXT_TRACKING if PPC64
187 select HAVE_COPY_THREAD_TLS
186 select HAVE_DEBUG_KMEMLEAK 188 select HAVE_DEBUG_KMEMLEAK
187 select HAVE_DEBUG_STACKOVERFLOW 189 select HAVE_DEBUG_STACKOVERFLOW
188 select HAVE_DYNAMIC_FTRACE 190 select HAVE_DYNAMIC_FTRACE
@@ -568,7 +570,7 @@ config CRASH_DUMP
568 570
569config FA_DUMP 571config FA_DUMP
570 bool "Firmware-assisted dump" 572 bool "Firmware-assisted dump"
571 depends on PPC64 && PPC_RTAS 573 depends on PPC64 && (PPC_RTAS || PPC_POWERNV)
572 select CRASH_CORE 574 select CRASH_CORE
573 select CRASH_DUMP 575 select CRASH_DUMP
574 help 576 help
@@ -579,7 +581,26 @@ config FA_DUMP
579 is meant to be a kdump replacement offering robustness and 581 is meant to be a kdump replacement offering robustness and
580 speed not possible without system firmware assistance. 582 speed not possible without system firmware assistance.
581 583
582 If unsure, say "N" 584 If unsure, say "y". Only special kernels like petitboot may
585 need to say "N" here.
586
587config PRESERVE_FA_DUMP
588 bool "Preserve Firmware-assisted dump"
589 depends on PPC64 && PPC_POWERNV && !FA_DUMP
590 help
591 On a kernel with FA_DUMP disabled, this option helps to preserve
592 crash data from a previously crash'ed kernel. Useful when the next
593 memory preserving kernel boot would process this crash data.
594 Petitboot kernel is the typical usecase for this option.
595
596config OPAL_CORE
597 bool "Export OPAL memory as /sys/firmware/opal/core"
598 depends on PPC64 && PPC_POWERNV
599 help
600 This option uses the MPIPL support in firmware to provide an
601 ELF core of OPAL memory after a crash. The ELF core is exported
602 as /sys/firmware/opal/core file which is helpful in debugging
603 OPAL crashes using GDB.
583 604
584config IRQ_ALL_CPUS 605config IRQ_ALL_CPUS
585 bool "Distribute interrupts on all CPUs by default" 606 bool "Distribute interrupts on all CPUs by default"
@@ -1140,18 +1161,6 @@ config TASK_SIZE
1140 default "0x80000000" if PPC_8xx 1161 default "0x80000000" if PPC_8xx
1141 default "0xc0000000" 1162 default "0xc0000000"
1142 1163
1143config CONSISTENT_SIZE_BOOL
1144 bool "Set custom consistent memory pool size"
1145 depends on ADVANCED_OPTIONS && NOT_COHERENT_CACHE
1146 help
1147 This option allows you to set the size of the
1148 consistent memory pool. This pool of virtual memory
1149 is used to make consistent memory allocations.
1150
1151config CONSISTENT_SIZE
1152 hex "Size of consistent memory pool" if CONSISTENT_SIZE_BOOL
1153 default "0x00200000" if NOT_COHERENT_CACHE
1154
1155config PIN_TLB 1164config PIN_TLB
1156 bool "Pinned Kernel TLBs (860 ONLY)" 1165 bool "Pinned Kernel TLBs (860 ONLY)"
1157 depends on ADVANCED_OPTIONS && PPC_8xx && \ 1166 depends on ADVANCED_OPTIONS && PPC_8xx && \
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 37990dd105dc..83522c9fc7b6 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -110,7 +110,6 @@ ifeq ($(HAS_BIARCH),y)
110KBUILD_CFLAGS += -m$(BITS) 110KBUILD_CFLAGS += -m$(BITS)
111KBUILD_AFLAGS += -m$(BITS) -Wl,-a$(BITS) 111KBUILD_AFLAGS += -m$(BITS) -Wl,-a$(BITS)
112KBUILD_LDFLAGS += -m elf$(BITS)$(LDEMULATION) 112KBUILD_LDFLAGS += -m elf$(BITS)$(LDEMULATION)
113KBUILD_ARFLAGS += --target=elf$(BITS)-$(GNUTARGET)
114endif 113endif
115 114
116cflags-$(CONFIG_STACKPROTECTOR) += -mstack-protector-guard=tls 115cflags-$(CONFIG_STACKPROTECTOR) += -mstack-protector-guard=tls
diff --git a/arch/powerpc/boot/main.c b/arch/powerpc/boot/main.c
index 102cc546444d..a9d209135975 100644
--- a/arch/powerpc/boot/main.c
+++ b/arch/powerpc/boot/main.c
@@ -146,6 +146,46 @@ static struct addr_range prep_initrd(struct addr_range vmlinux, void *chosen,
146 return (struct addr_range){(void *)initrd_addr, initrd_size}; 146 return (struct addr_range){(void *)initrd_addr, initrd_size};
147} 147}
148 148
149#ifdef __powerpc64__
150static void prep_esm_blob(struct addr_range vmlinux, void *chosen)
151{
152 unsigned long esm_blob_addr, esm_blob_size;
153
154 /* Do we have an ESM (Enter Secure Mode) blob? */
155 if (_esm_blob_end <= _esm_blob_start)
156 return;
157
158 printf("Attached ESM blob at 0x%p-0x%p\n\r",
159 _esm_blob_start, _esm_blob_end);
160 esm_blob_addr = (unsigned long)_esm_blob_start;
161 esm_blob_size = _esm_blob_end - _esm_blob_start;
162
163 /*
164 * If the ESM blob is too low it will be clobbered when the
165 * kernel relocates to its final location. In this case,
166 * allocate a safer place and move it.
167 */
168 if (esm_blob_addr < vmlinux.size) {
169 void *old_addr = (void *)esm_blob_addr;
170
171 printf("Allocating 0x%lx bytes for esm_blob ...\n\r",
172 esm_blob_size);
173 esm_blob_addr = (unsigned long)malloc(esm_blob_size);
174 if (!esm_blob_addr)
175 fatal("Can't allocate memory for ESM blob !\n\r");
176 printf("Relocating ESM blob 0x%lx <- 0x%p (0x%lx bytes)\n\r",
177 esm_blob_addr, old_addr, esm_blob_size);
178 memmove((void *)esm_blob_addr, old_addr, esm_blob_size);
179 }
180
181 /* Tell the kernel ESM blob address via device tree. */
182 setprop_val(chosen, "linux,esm-blob-start", (u32)(esm_blob_addr));
183 setprop_val(chosen, "linux,esm-blob-end", (u32)(esm_blob_addr + esm_blob_size));
184}
185#else
186static inline void prep_esm_blob(struct addr_range vmlinux, void *chosen) { }
187#endif
188
149/* A buffer that may be edited by tools operating on a zImage binary so as to 189/* A buffer that may be edited by tools operating on a zImage binary so as to
150 * edit the command line passed to vmlinux (by setting /chosen/bootargs). 190 * edit the command line passed to vmlinux (by setting /chosen/bootargs).
151 * The buffer is put in it's own section so that tools may locate it easier. 191 * The buffer is put in it's own section so that tools may locate it easier.
@@ -214,6 +254,7 @@ void start(void)
214 vmlinux = prep_kernel(); 254 vmlinux = prep_kernel();
215 initrd = prep_initrd(vmlinux, chosen, 255 initrd = prep_initrd(vmlinux, chosen,
216 loader_info.initrd_addr, loader_info.initrd_size); 256 loader_info.initrd_addr, loader_info.initrd_size);
257 prep_esm_blob(vmlinux, chosen);
217 prep_cmdline(chosen); 258 prep_cmdline(chosen);
218 259
219 printf("Finalizing device tree..."); 260 printf("Finalizing device tree...");
diff --git a/arch/powerpc/boot/ops.h b/arch/powerpc/boot/ops.h
index cd043726ed88..e0606766480f 100644
--- a/arch/powerpc/boot/ops.h
+++ b/arch/powerpc/boot/ops.h
@@ -251,6 +251,8 @@ extern char _initrd_start[];
251extern char _initrd_end[]; 251extern char _initrd_end[];
252extern char _dtb_start[]; 252extern char _dtb_start[];
253extern char _dtb_end[]; 253extern char _dtb_end[];
254extern char _esm_blob_start[];
255extern char _esm_blob_end[];
254 256
255static inline __attribute__((const)) 257static inline __attribute__((const))
256int __ilog2_u32(u32 n) 258int __ilog2_u32(u32 n)
diff --git a/arch/powerpc/boot/wrapper b/arch/powerpc/boot/wrapper
index 5148ac271f28..ed6266367bc0 100755
--- a/arch/powerpc/boot/wrapper
+++ b/arch/powerpc/boot/wrapper
@@ -13,6 +13,7 @@
13# -i initrd specify initrd file 13# -i initrd specify initrd file
14# -d devtree specify device-tree blob 14# -d devtree specify device-tree blob
15# -s tree.dts specify device-tree source file (needs dtc installed) 15# -s tree.dts specify device-tree source file (needs dtc installed)
16# -e esm_blob specify ESM blob for secure images
16# -c cache $kernel.strip.gz (use if present & newer, else make) 17# -c cache $kernel.strip.gz (use if present & newer, else make)
17# -C prefix specify command prefix for cross-building tools 18# -C prefix specify command prefix for cross-building tools
18# (strip, objcopy, ld) 19# (strip, objcopy, ld)
@@ -37,6 +38,7 @@ platform=of
37initrd= 38initrd=
38dtb= 39dtb=
39dts= 40dts=
41esm_blob=
40cacheit= 42cacheit=
41binary= 43binary=
42compression=.gz 44compression=.gz
@@ -60,9 +62,9 @@ tmpdir=.
60 62
61usage() { 63usage() {
62 echo 'Usage: wrapper [-o output] [-p platform] [-i initrd]' >&2 64 echo 'Usage: wrapper [-o output] [-p platform] [-i initrd]' >&2
63 echo ' [-d devtree] [-s tree.dts] [-c] [-C cross-prefix]' >&2 65 echo ' [-d devtree] [-s tree.dts] [-e esm_blob]' >&2
64 echo ' [-D datadir] [-W workingdir] [-Z (gz|xz|none)]' >&2 66 echo ' [-c] [-C cross-prefix] [-D datadir] [-W workingdir]' >&2
65 echo ' [--no-compression] [vmlinux]' >&2 67 echo ' [-Z (gz|xz|none)] [--no-compression] [vmlinux]' >&2
66 exit 1 68 exit 1
67} 69}
68 70
@@ -105,6 +107,11 @@ while [ "$#" -gt 0 ]; do
105 [ "$#" -gt 0 ] || usage 107 [ "$#" -gt 0 ] || usage
106 dtb="$1" 108 dtb="$1"
107 ;; 109 ;;
110 -e)
111 shift
112 [ "$#" -gt 0 ] || usage
113 esm_blob="$1"
114 ;;
108 -s) 115 -s)
109 shift 116 shift
110 [ "$#" -gt 0 ] || usage 117 [ "$#" -gt 0 ] || usage
@@ -218,9 +225,16 @@ objflags=-S
218tmp=$tmpdir/zImage.$$.o 225tmp=$tmpdir/zImage.$$.o
219ksection=.kernel:vmlinux.strip 226ksection=.kernel:vmlinux.strip
220isection=.kernel:initrd 227isection=.kernel:initrd
228esection=.kernel:esm_blob
221link_address='0x400000' 229link_address='0x400000'
222make_space=y 230make_space=y
223 231
232
233if [ -n "$esm_blob" -a "$platform" != "pseries" ]; then
234 echo "ESM blob not support on non-pseries platforms" >&2
235 exit 1
236fi
237
224case "$platform" in 238case "$platform" in
225of) 239of)
226 platformo="$object/of.o $object/epapr.o" 240 platformo="$object/of.o $object/epapr.o"
@@ -477,6 +491,10 @@ if [ -n "$dtb" ]; then
477 fi 491 fi
478fi 492fi
479 493
494if [ -n "$esm_blob" ]; then
495 addsec $tmp "$esm_blob" $esection
496fi
497
480if [ "$platform" != "miboot" ]; then 498if [ "$platform" != "miboot" ]; then
481 if [ -n "$link_address" ] ; then 499 if [ -n "$link_address" ] ; then
482 text_start="-Ttext $link_address" 500 text_start="-Ttext $link_address"
diff --git a/arch/powerpc/boot/zImage.lds.S b/arch/powerpc/boot/zImage.lds.S
index 4ac1e36edfe7..a21f3a76e06f 100644
--- a/arch/powerpc/boot/zImage.lds.S
+++ b/arch/powerpc/boot/zImage.lds.S
@@ -68,6 +68,14 @@ SECTIONS
68 _initrd_end = .; 68 _initrd_end = .;
69 } 69 }
70 70
71 . = ALIGN(4096);
72 .kernel:esm_blob :
73 {
74 _esm_blob_start = .;
75 *(.kernel:esm_blob)
76 _esm_blob_end = .;
77 }
78
71#ifdef CONFIG_PPC64_BOOT_WRAPPER 79#ifdef CONFIG_PPC64_BOOT_WRAPPER
72 . = ALIGN(256); 80 . = ALIGN(256);
73 .got : 81 .got :
diff --git a/arch/powerpc/configs/pmac32_defconfig b/arch/powerpc/configs/pmac32_defconfig
index 7e6654848531..4e6e95f92646 100644
--- a/arch/powerpc/configs/pmac32_defconfig
+++ b/arch/powerpc/configs/pmac32_defconfig
@@ -20,7 +20,6 @@ CONFIG_CPU_FREQ=y
20CONFIG_CPU_FREQ_GOV_POWERSAVE=y 20CONFIG_CPU_FREQ_GOV_POWERSAVE=y
21CONFIG_CPU_FREQ_GOV_USERSPACE=y 21CONFIG_CPU_FREQ_GOV_USERSPACE=y
22CONFIG_CPU_FREQ_PMAC=y 22CONFIG_CPU_FREQ_PMAC=y
23CONFIG_PPC601_SYNC_FIX=y
24CONFIG_GEN_RTC=y 23CONFIG_GEN_RTC=y
25CONFIG_HIGHMEM=y 24CONFIG_HIGHMEM=y
26CONFIG_BINFMT_MISC=m 25CONFIG_BINFMT_MISC=m
diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig
index 34219d555e8a..6658cceb928c 100644
--- a/arch/powerpc/configs/powernv_defconfig
+++ b/arch/powerpc/configs/powernv_defconfig
@@ -38,7 +38,7 @@ CONFIG_MODULE_UNLOAD=y
38CONFIG_MODVERSIONS=y 38CONFIG_MODVERSIONS=y
39CONFIG_MODULE_SRCVERSION_ALL=y 39CONFIG_MODULE_SRCVERSION_ALL=y
40CONFIG_PARTITION_ADVANCED=y 40CONFIG_PARTITION_ADVANCED=y
41CONFIG_SCOM_DEBUGFS=y 41# CONFIG_SCOM_DEBUGFS is not set
42CONFIG_OPAL_PRD=y 42CONFIG_OPAL_PRD=y
43CONFIG_PPC_MEMTRACE=y 43CONFIG_PPC_MEMTRACE=y
44# CONFIG_PPC_PSERIES is not set 44# CONFIG_PPC_PSERIES is not set
diff --git a/arch/powerpc/configs/ppc40x_defconfig b/arch/powerpc/configs/ppc40x_defconfig
index 8f136b52198b..a5f683aed328 100644
--- a/arch/powerpc/configs/ppc40x_defconfig
+++ b/arch/powerpc/configs/ppc40x_defconfig
@@ -84,4 +84,3 @@ CONFIG_CRYPTO_ECB=y
84CONFIG_CRYPTO_PCBC=y 84CONFIG_CRYPTO_PCBC=y
85CONFIG_CRYPTO_MD5=y 85CONFIG_CRYPTO_MD5=y
86CONFIG_CRYPTO_DES=y 86CONFIG_CRYPTO_DES=y
87CONFIG_PPC4xx_OCM=y
diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig
index dc83fefa04f7..b250e6f5a7ca 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -29,6 +29,7 @@ CONFIG_DTL=y
29CONFIG_SCANLOG=m 29CONFIG_SCANLOG=m
30CONFIG_PPC_SMLPAR=y 30CONFIG_PPC_SMLPAR=y
31CONFIG_IBMEBUS=y 31CONFIG_IBMEBUS=y
32CONFIG_PPC_SVM=y
32CONFIG_PPC_MAPLE=y 33CONFIG_PPC_MAPLE=y
33CONFIG_PPC_PASEMI=y 34CONFIG_PPC_PASEMI=y
34CONFIG_PPC_PASEMI_IOMMU=y 35CONFIG_PPC_PASEMI_IOMMU=y
diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig
index 38abc9c1770a..26126b4d4de3 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -42,6 +42,7 @@ CONFIG_DTL=y
42CONFIG_SCANLOG=m 42CONFIG_SCANLOG=m
43CONFIG_PPC_SMLPAR=y 43CONFIG_PPC_SMLPAR=y
44CONFIG_IBMEBUS=y 44CONFIG_IBMEBUS=y
45CONFIG_PPC_SVM=y
45# CONFIG_PPC_PMAC is not set 46# CONFIG_PPC_PMAC is not set
46CONFIG_RTAS_FLASH=m 47CONFIG_RTAS_FLASH=m
47CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y 48CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig
index 557b530b2f70..1253482a67c0 100644
--- a/arch/powerpc/configs/skiroot_defconfig
+++ b/arch/powerpc/configs/skiroot_defconfig
@@ -213,6 +213,7 @@ CONFIG_IPMI_WATCHDOG=y
213CONFIG_HW_RANDOM=y 213CONFIG_HW_RANDOM=y
214CONFIG_TCG_TPM=y 214CONFIG_TCG_TPM=y
215CONFIG_TCG_TIS_I2C_NUVOTON=y 215CONFIG_TCG_TIS_I2C_NUVOTON=y
216# CONFIG_DEVPORT is not set
216CONFIG_I2C=y 217CONFIG_I2C=y
217# CONFIG_I2C_COMPAT is not set 218# CONFIG_I2C_COMPAT is not set
218CONFIG_I2C_CHARDEV=y 219CONFIG_I2C_CHARDEV=y
diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index ec1c97a8e8cb..8561498e653c 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -15,6 +15,7 @@
15#include <asm/epapr_hcalls.h> 15#include <asm/epapr_hcalls.h>
16#include <asm/dcr.h> 16#include <asm/dcr.h>
17#include <asm/mmu_context.h> 17#include <asm/mmu_context.h>
18#include <asm/ultravisor-api.h>
18 19
19#include <uapi/asm/ucontext.h> 20#include <uapi/asm/ucontext.h>
20 21
@@ -34,6 +35,16 @@ extern struct static_key hcall_tracepoint_key;
34void __trace_hcall_entry(unsigned long opcode, unsigned long *args); 35void __trace_hcall_entry(unsigned long opcode, unsigned long *args);
35void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf); 36void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf);
36 37
38/* Ultravisor */
39#if defined(CONFIG_PPC_POWERNV) || defined(CONFIG_PPC_SVM)
40long ucall_norets(unsigned long opcode, ...);
41#else
42static inline long ucall_norets(unsigned long opcode, ...)
43{
44 return U_NOT_AVAILABLE;
45}
46#endif
47
37/* OPAL */ 48/* OPAL */
38int64_t __opal_call(int64_t a0, int64_t a1, int64_t a2, int64_t a3, 49int64_t __opal_call(int64_t a0, int64_t a1, int64_t a2, int64_t a3,
39 int64_t a4, int64_t a5, int64_t a6, int64_t a7, 50 int64_t a4, int64_t a5, int64_t a6, int64_t a7,
@@ -123,7 +134,8 @@ extern int __ucmpdi2(u64, u64);
123 134
124/* tracing */ 135/* tracing */
125void _mcount(void); 136void _mcount(void);
126unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip); 137unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip,
138 unsigned long sp);
127 139
128void pnv_power9_force_smt4_catch(void); 140void pnv_power9_force_smt4_catch(void);
129void pnv_power9_force_smt4_release(void); 141void pnv_power9_force_smt4_release(void);
diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h
index 838de59f6754..0796533d37dd 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -148,23 +148,21 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot);
148 */ 148 */
149#include <asm/fixmap.h> 149#include <asm/fixmap.h>
150 150
151#ifdef CONFIG_HIGHMEM
152#define KVIRT_TOP PKMAP_BASE
153#else
154#define KVIRT_TOP FIXADDR_START
155#endif
156
157/* 151/*
158 * ioremap_bot starts at that address. Early ioremaps move down from there, 152 * ioremap_bot starts at that address. Early ioremaps move down from there,
159 * until mem_init() at which point this becomes the top of the vmalloc 153 * until mem_init() at which point this becomes the top of the vmalloc
160 * and ioremap space 154 * and ioremap space
161 */ 155 */
162#ifdef CONFIG_NOT_COHERENT_CACHE 156#ifdef CONFIG_HIGHMEM
163#define IOREMAP_TOP ((KVIRT_TOP - CONFIG_CONSISTENT_SIZE) & PAGE_MASK) 157#define IOREMAP_TOP PKMAP_BASE
164#else 158#else
165#define IOREMAP_TOP KVIRT_TOP 159#define IOREMAP_TOP FIXADDR_START
166#endif 160#endif
167 161
162/* PPC32 shares vmalloc area with ioremap */
163#define IOREMAP_START VMALLOC_START
164#define IOREMAP_END VMALLOC_END
165
168/* 166/*
169 * Just any arbitrary offset to the start of the vmalloc VM area: the 167 * Just any arbitrary offset to the start of the vmalloc VM area: the
170 * current 16MB value just means that there will be a 64MB "hole" after the 168 * current 16MB value just means that there will be a 64MB "hole" after the
@@ -201,8 +199,6 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot);
201#include <linux/sched.h> 199#include <linux/sched.h>
202#include <linux/threads.h> 200#include <linux/threads.h>
203 201
204extern unsigned long ioremap_bot;
205
206/* Bits to mask out from a PGD to get to the PUD page */ 202/* Bits to mask out from a PGD to get to the PUD page */
207#define PGD_MASKED_BITS 0 203#define PGD_MASKED_BITS 0
208 204
diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index 23b83d3593e2..bb3deb76c951 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -206,7 +206,6 @@ extern int mmu_io_psize;
206void mmu_early_init_devtree(void); 206void mmu_early_init_devtree(void);
207void hash__early_init_devtree(void); 207void hash__early_init_devtree(void);
208void radix__early_init_devtree(void); 208void radix__early_init_devtree(void);
209extern void radix_init_native(void);
210extern void hash__early_init_mmu(void); 209extern void hash__early_init_mmu(void);
211extern void radix__early_init_mmu(void); 210extern void radix__early_init_mmu(void);
212static inline void early_init_mmu(void) 211static inline void early_init_mmu(void)
@@ -238,9 +237,6 @@ static inline void setup_initial_memory_limit(phys_addr_t first_memblock_base,
238 first_memblock_size); 237 first_memblock_size);
239} 238}
240 239
241extern int (*register_process_table)(unsigned long base, unsigned long page_size,
242 unsigned long tbl_size);
243
244#ifdef CONFIG_PPC_PSERIES 240#ifdef CONFIG_PPC_PSERIES
245extern void radix_init_pseries(void); 241extern void radix_init_pseries(void);
246#else 242#else
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 8308f32e9782..b01624e5c467 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -289,7 +289,6 @@ extern unsigned long __kernel_io_end;
289#define KERN_IO_END __kernel_io_end 289#define KERN_IO_END __kernel_io_end
290 290
291extern struct page *vmemmap; 291extern struct page *vmemmap;
292extern unsigned long ioremap_bot;
293extern unsigned long pci_io_base; 292extern unsigned long pci_io_base;
294#endif /* __ASSEMBLY__ */ 293#endif /* __ASSEMBLY__ */
295 294
@@ -317,6 +316,7 @@ extern unsigned long pci_io_base;
317#define PHB_IO_BASE (ISA_IO_END) 316#define PHB_IO_BASE (ISA_IO_END)
318#define PHB_IO_END (KERN_IO_START + FULL_IO_SIZE) 317#define PHB_IO_END (KERN_IO_START + FULL_IO_SIZE)
319#define IOREMAP_BASE (PHB_IO_END) 318#define IOREMAP_BASE (PHB_IO_END)
319#define IOREMAP_START (ioremap_bot)
320#define IOREMAP_END (KERN_IO_END) 320#define IOREMAP_END (KERN_IO_END)
321 321
322/* Advertise special mapping type for AGP */ 322/* Advertise special mapping type for AGP */
@@ -608,8 +608,10 @@ static inline bool pte_access_permitted(pte_t pte, bool write)
608 */ 608 */
609static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) 609static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot)
610{ 610{
611 return __pte((((pte_basic_t)(pfn) << PAGE_SHIFT) & PTE_RPN_MASK) | 611 VM_BUG_ON(pfn >> (64 - PAGE_SHIFT));
612 pgprot_val(pgprot)); 612 VM_BUG_ON((pfn << PAGE_SHIFT) & ~PTE_RPN_MASK);
613
614 return __pte(((pte_basic_t)pfn << PAGE_SHIFT) | pgprot_val(pgprot));
613} 615}
614 616
615static inline unsigned long pte_pfn(pte_t pte) 617static inline unsigned long pte_pfn(pte_t pte)
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index e04a839cb5b9..574eca33f893 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -266,9 +266,6 @@ extern void radix__vmemmap_remove_mapping(unsigned long start,
266extern int radix__map_kernel_page(unsigned long ea, unsigned long pa, 266extern int radix__map_kernel_page(unsigned long ea, unsigned long pa,
267 pgprot_t flags, unsigned int psz); 267 pgprot_t flags, unsigned int psz);
268 268
269extern int radix__ioremap_range(unsigned long ea, phys_addr_t pa,
270 unsigned long size, pgprot_t prot, int nid);
271
272static inline unsigned long radix__get_tree_size(void) 269static inline unsigned long radix__get_tree_size(void)
273{ 270{
274 unsigned long rts_field; 271 unsigned long rts_field;
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
index 05147cecb8df..4ce795d30377 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
@@ -17,8 +17,8 @@ extern void radix__flush_tlb_lpid_page(unsigned int lpid,
17 unsigned long addr, 17 unsigned long addr,
18 unsigned long page_size); 18 unsigned long page_size);
19extern void radix__flush_pwc_lpid(unsigned int lpid); 19extern void radix__flush_pwc_lpid(unsigned int lpid);
20extern void radix__flush_tlb_lpid(unsigned int lpid); 20extern void radix__flush_all_lpid(unsigned int lpid);
21extern void radix__local_flush_tlb_lpid_guest(unsigned int lpid); 21extern void radix__flush_all_lpid_guest(unsigned int lpid);
22#else 22#else
23static inline void radix__tlbiel_all(unsigned int action) { WARN_ON(1); }; 23static inline void radix__tlbiel_all(unsigned int action) { WARN_ON(1); };
24static inline void radix__flush_tlb_lpid_page(unsigned int lpid, 24static inline void radix__flush_tlb_lpid_page(unsigned int lpid,
@@ -31,11 +31,7 @@ static inline void radix__flush_pwc_lpid(unsigned int lpid)
31{ 31{
32 WARN_ON(1); 32 WARN_ON(1);
33} 33}
34static inline void radix__flush_tlb_lpid(unsigned int lpid) 34static inline void radix__flush_all_lpid(unsigned int lpid)
35{
36 WARN_ON(1);
37}
38static inline void radix__local_flush_tlb_lpid_guest(unsigned int lpid)
39{ 35{
40 WARN_ON(1); 36 WARN_ON(1);
41} 37}
@@ -73,6 +69,4 @@ extern void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr);
73extern void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr); 69extern void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr);
74extern void radix__flush_tlb_all(void); 70extern void radix__flush_tlb_all(void);
75 71
76extern void radix__local_flush_tlb_lpid(unsigned int lpid);
77
78#endif 72#endif
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h
index ebf572ea621e..7aa8195b6cff 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
@@ -162,4 +162,13 @@ static inline void flush_tlb_pgtable(struct mmu_gather *tlb, unsigned long addre
162 162
163 radix__flush_tlb_pwc(tlb, address); 163 radix__flush_tlb_pwc(tlb, address);
164} 164}
165
166extern bool tlbie_capable;
167extern bool tlbie_enabled;
168
169static inline bool cputlb_use_tlbie(void)
170{
171 return tlbie_enabled;
172}
173
165#endif /* _ASM_POWERPC_BOOK3S_64_TLBFLUSH_H */ 174#endif /* _ASM_POWERPC_BOOK3S_64_TLBFLUSH_H */
diff --git a/arch/powerpc/include/asm/book3s/pgtable.h b/arch/powerpc/include/asm/book3s/pgtable.h
index 6436b65ac7bc..0e1263455d73 100644
--- a/arch/powerpc/include/asm/book3s/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/pgtable.h
@@ -26,5 +26,16 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
26 unsigned long size, pgprot_t vma_prot); 26 unsigned long size, pgprot_t vma_prot);
27#define __HAVE_PHYS_MEM_ACCESS_PROT 27#define __HAVE_PHYS_MEM_ACCESS_PROT
28 28
29/*
30 * This gets called at the end of handling a page fault, when
31 * the kernel has put a new PTE into the page table for the process.
32 * We use it to ensure coherency between the i-cache and d-cache
33 * for the page which has just been mapped in.
34 * On machines which use an MMU hash table, we use this to put a
35 * corresponding HPTE into the hash table ahead of time, instead of
36 * waiting for the inevitable extra hash-table miss exception.
37 */
38void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep);
39
29#endif /* __ASSEMBLY__ */ 40#endif /* __ASSEMBLY__ */
30#endif 41#endif
diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h
index fed7e6241349..f47e6ff6554d 100644
--- a/arch/powerpc/include/asm/bug.h
+++ b/arch/powerpc/include/asm/bug.h
@@ -5,14 +5,6 @@
5 5
6#include <asm/asm-compat.h> 6#include <asm/asm-compat.h>
7 7
8/*
9 * Define an illegal instr to trap on the bug.
10 * We don't use 0 because that marks the end of a function
11 * in the ELF ABI. That's "Boo Boo" in case you wonder...
12 */
13#define BUG_OPCODE .long 0x00b00b00 /* For asm */
14#define BUG_ILLEGAL_INSTR "0x00b00b00" /* For BUG macro */
15
16#ifdef CONFIG_BUG 8#ifdef CONFIG_BUG
17 9
18#ifdef __ASSEMBLY__ 10#ifdef __ASSEMBLY__
diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index d05f0c28e515..a1ebcbc3931f 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -145,12 +145,10 @@ static inline void cpu_feature_keys_init(void) { }
145 145
146/* Definitions for features that only exist on 32-bit chips */ 146/* Definitions for features that only exist on 32-bit chips */
147#ifdef CONFIG_PPC32 147#ifdef CONFIG_PPC32
148#define CPU_FTR_601 ASM_CONST(0x00001000)
149#define CPU_FTR_L2CR ASM_CONST(0x00002000) 148#define CPU_FTR_L2CR ASM_CONST(0x00002000)
150#define CPU_FTR_SPEC7450 ASM_CONST(0x00004000) 149#define CPU_FTR_SPEC7450 ASM_CONST(0x00004000)
151#define CPU_FTR_TAU ASM_CONST(0x00008000) 150#define CPU_FTR_TAU ASM_CONST(0x00008000)
152#define CPU_FTR_CAN_DOZE ASM_CONST(0x00010000) 151#define CPU_FTR_CAN_DOZE ASM_CONST(0x00010000)
153#define CPU_FTR_USE_RTC ASM_CONST(0x00020000)
154#define CPU_FTR_L3CR ASM_CONST(0x00040000) 152#define CPU_FTR_L3CR ASM_CONST(0x00040000)
155#define CPU_FTR_L3_DISABLE_NAP ASM_CONST(0x00080000) 153#define CPU_FTR_L3_DISABLE_NAP ASM_CONST(0x00080000)
156#define CPU_FTR_NAP_DISABLE_L2_PR ASM_CONST(0x00100000) 154#define CPU_FTR_NAP_DISABLE_L2_PR ASM_CONST(0x00100000)
@@ -160,14 +158,12 @@ static inline void cpu_feature_keys_init(void) { }
160#define CPU_FTR_NEED_COHERENT ASM_CONST(0x01000000) 158#define CPU_FTR_NEED_COHERENT ASM_CONST(0x01000000)
161#define CPU_FTR_NO_BTIC ASM_CONST(0x02000000) 159#define CPU_FTR_NO_BTIC ASM_CONST(0x02000000)
162#define CPU_FTR_PPC_LE ASM_CONST(0x04000000) 160#define CPU_FTR_PPC_LE ASM_CONST(0x04000000)
163#define CPU_FTR_UNIFIED_ID_CACHE ASM_CONST(0x08000000)
164#define CPU_FTR_SPE ASM_CONST(0x10000000) 161#define CPU_FTR_SPE ASM_CONST(0x10000000)
165#define CPU_FTR_NEED_PAIRED_STWCX ASM_CONST(0x20000000) 162#define CPU_FTR_NEED_PAIRED_STWCX ASM_CONST(0x20000000)
166#define CPU_FTR_INDEXED_DCR ASM_CONST(0x40000000) 163#define CPU_FTR_INDEXED_DCR ASM_CONST(0x40000000)
167 164
168#else /* CONFIG_PPC32 */ 165#else /* CONFIG_PPC32 */
169/* Define these to 0 for the sake of tests in common code */ 166/* Define these to 0 for the sake of tests in common code */
170#define CPU_FTR_601 (0)
171#define CPU_FTR_PPC_LE (0) 167#define CPU_FTR_PPC_LE (0)
172#endif 168#endif
173 169
@@ -294,8 +290,8 @@ static inline void cpu_feature_keys_init(void) { }
294#define CPU_FTR_MAYBE_CAN_NAP 0 290#define CPU_FTR_MAYBE_CAN_NAP 0
295#endif 291#endif
296 292
297#define CPU_FTRS_PPC601 (CPU_FTR_COMMON | CPU_FTR_601 | \ 293#define CPU_FTRS_PPC601 (CPU_FTR_COMMON | \
298 CPU_FTR_COHERENT_ICACHE | CPU_FTR_UNIFIED_ID_CACHE | CPU_FTR_USE_RTC) 294 CPU_FTR_COHERENT_ICACHE)
299#define CPU_FTRS_603 (CPU_FTR_COMMON | CPU_FTR_MAYBE_CAN_DOZE | \ 295#define CPU_FTRS_603 (CPU_FTR_COMMON | CPU_FTR_MAYBE_CAN_DOZE | \
300 CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_PPC_LE | CPU_FTR_NOEXECUTE) 296 CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_PPC_LE | CPU_FTR_NOEXECUTE)
301#define CPU_FTRS_604 (CPU_FTR_COMMON | CPU_FTR_PPC_LE) 297#define CPU_FTRS_604 (CPU_FTR_COMMON | CPU_FTR_PPC_LE)
@@ -386,7 +382,7 @@ static inline void cpu_feature_keys_init(void) { }
386#define CPU_FTRS_47X (CPU_FTRS_440x6) 382#define CPU_FTRS_47X (CPU_FTRS_440x6)
387#define CPU_FTRS_E200 (CPU_FTR_SPE_COMP | \ 383#define CPU_FTRS_E200 (CPU_FTR_SPE_COMP | \
388 CPU_FTR_NODSISRALIGN | CPU_FTR_COHERENT_ICACHE | \ 384 CPU_FTR_NODSISRALIGN | CPU_FTR_COHERENT_ICACHE | \
389 CPU_FTR_UNIFIED_ID_CACHE | CPU_FTR_NOEXECUTE | \ 385 CPU_FTR_NOEXECUTE | \
390 CPU_FTR_DEBUG_LVL_EXC) 386 CPU_FTR_DEBUG_LVL_EXC)
391#define CPU_FTRS_E500 (CPU_FTR_MAYBE_CAN_DOZE | \ 387#define CPU_FTRS_E500 (CPU_FTR_MAYBE_CAN_DOZE | \
392 CPU_FTR_SPE_COMP | CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_NODSISRALIGN | \ 388 CPU_FTR_SPE_COMP | CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_NODSISRALIGN | \
@@ -498,7 +494,9 @@ static inline void cpu_feature_keys_init(void) { }
498#else 494#else
499enum { 495enum {
500 CPU_FTRS_POSSIBLE = 496 CPU_FTRS_POSSIBLE =
501#ifdef CONFIG_PPC_BOOK3S_32 497#ifdef CONFIG_PPC_BOOK3S_601
498 CPU_FTRS_PPC601 |
499#elif defined(CONFIG_PPC_BOOK3S_32)
502 CPU_FTRS_PPC601 | CPU_FTRS_603 | CPU_FTRS_604 | CPU_FTRS_740_NOTAU | 500 CPU_FTRS_PPC601 | CPU_FTRS_603 | CPU_FTRS_604 | CPU_FTRS_740_NOTAU |
503 CPU_FTRS_740 | CPU_FTRS_750 | CPU_FTRS_750FX1 | 501 CPU_FTRS_740 | CPU_FTRS_750 | CPU_FTRS_750FX1 |
504 CPU_FTRS_750FX2 | CPU_FTRS_750FX | CPU_FTRS_750GX | 502 CPU_FTRS_750FX2 | CPU_FTRS_750FX | CPU_FTRS_750GX |
@@ -574,8 +572,10 @@ enum {
574#else 572#else
575enum { 573enum {
576 CPU_FTRS_ALWAYS = 574 CPU_FTRS_ALWAYS =
577#ifdef CONFIG_PPC_BOOK3S_32 575#ifdef CONFIG_PPC_BOOK3S_601
578 CPU_FTRS_PPC601 & CPU_FTRS_603 & CPU_FTRS_604 & CPU_FTRS_740_NOTAU & 576 CPU_FTRS_PPC601 &
577#elif defined(CONFIG_PPC_BOOK3S_32)
578 CPU_FTRS_603 & CPU_FTRS_604 & CPU_FTRS_740_NOTAU &
579 CPU_FTRS_740 & CPU_FTRS_750 & CPU_FTRS_750FX1 & 579 CPU_FTRS_740 & CPU_FTRS_750 & CPU_FTRS_750FX1 &
580 CPU_FTRS_750FX2 & CPU_FTRS_750FX & CPU_FTRS_750GX & 580 CPU_FTRS_750FX2 & CPU_FTRS_750FX & CPU_FTRS_750GX &
581 CPU_FTRS_7400_NOTAU & CPU_FTRS_7400 & CPU_FTRS_7450_20 & 581 CPU_FTRS_7400_NOTAU & CPU_FTRS_7400 & CPU_FTRS_7450_20 &
diff --git a/arch/powerpc/include/asm/current.h b/arch/powerpc/include/asm/current.h
index 297827b76169..bbfb94800415 100644
--- a/arch/powerpc/include/asm/current.h
+++ b/arch/powerpc/include/asm/current.h
@@ -16,7 +16,8 @@ static inline struct task_struct *get_current(void)
16{ 16{
17 struct task_struct *task; 17 struct task_struct *task;
18 18
19 __asm__ __volatile__("ld %0,%1(13)" 19 /* get_current can be cached by the compiler, so no volatile */
20 asm ("ld %0,%1(13)"
20 : "=r" (task) 21 : "=r" (task)
21 : "i" (offsetof(struct paca_struct, __current))); 22 : "i" (offsetof(struct paca_struct, __current)));
22 23
diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 8aa7c76c2130..6f9b2a12540a 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -88,6 +88,19 @@ struct eeh_pe {
88 struct list_head child_list; /* List of PEs below this PE */ 88 struct list_head child_list; /* List of PEs below this PE */
89 struct list_head child; /* Memb. child_list/eeh_phb_pe */ 89 struct list_head child; /* Memb. child_list/eeh_phb_pe */
90 struct list_head edevs; /* List of eeh_dev in this PE */ 90 struct list_head edevs; /* List of eeh_dev in this PE */
91
92#ifdef CONFIG_STACKTRACE
93 /*
94 * Saved stack trace. When we find a PE freeze in eeh_dev_check_failure
95 * the stack trace is saved here so we can print it in the recovery
96 * thread if it turns out to due to a real problem rather than
97 * a hot-remove.
98 *
99 * A max of 64 entries might be overkill, but it also might not be.
100 */
101 unsigned long stack_trace[64];
102 int trace_entries;
103#endif /* CONFIG_STACKTRACE */
91}; 104};
92 105
93#define eeh_pe_for_each_dev(pe, edev, tmp) \ 106#define eeh_pe_for_each_dev(pe, edev, tmp) \
@@ -121,6 +134,8 @@ static inline bool eeh_pe_passed(struct eeh_pe *pe)
121struct eeh_dev { 134struct eeh_dev {
122 int mode; /* EEH mode */ 135 int mode; /* EEH mode */
123 int class_code; /* Class code of the device */ 136 int class_code; /* Class code of the device */
137 int bdfn; /* bdfn of device (for cfg ops) */
138 struct pci_controller *controller;
124 int pe_config_addr; /* PE config address */ 139 int pe_config_addr; /* PE config address */
125 u32 config_space[16]; /* Saved PCI config space */ 140 u32 config_space[16]; /* Saved PCI config space */
126 int pcix_cap; /* Saved PCIx capability */ 141 int pcix_cap; /* Saved PCIx capability */
@@ -136,6 +151,17 @@ struct eeh_dev {
136 struct pci_dev *physfn; /* Associated SRIOV PF */ 151 struct pci_dev *physfn; /* Associated SRIOV PF */
137}; 152};
138 153
154/* "fmt" must be a simple literal string */
155#define EEH_EDEV_PRINT(level, edev, fmt, ...) \
156 pr_##level("PCI %04x:%02x:%02x.%x#%04x: EEH: " fmt, \
157 (edev)->controller->global_number, PCI_BUSNO((edev)->bdfn), \
158 PCI_SLOT((edev)->bdfn), PCI_FUNC((edev)->bdfn), \
159 ((edev)->pe ? (edev)->pe_config_addr : 0xffff), ##__VA_ARGS__)
160#define eeh_edev_dbg(edev, fmt, ...) EEH_EDEV_PRINT(debug, (edev), fmt, ##__VA_ARGS__)
161#define eeh_edev_info(edev, fmt, ...) EEH_EDEV_PRINT(info, (edev), fmt, ##__VA_ARGS__)
162#define eeh_edev_warn(edev, fmt, ...) EEH_EDEV_PRINT(warn, (edev), fmt, ##__VA_ARGS__)
163#define eeh_edev_err(edev, fmt, ...) EEH_EDEV_PRINT(err, (edev), fmt, ##__VA_ARGS__)
164
139static inline struct pci_dn *eeh_dev_to_pdn(struct eeh_dev *edev) 165static inline struct pci_dn *eeh_dev_to_pdn(struct eeh_dev *edev)
140{ 166{
141 return edev ? edev->pdn : NULL; 167 return edev ? edev->pdn : NULL;
@@ -247,7 +273,7 @@ static inline bool eeh_state_active(int state)
247 == (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE); 273 == (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
248} 274}
249 275
250typedef void *(*eeh_edev_traverse_func)(struct eeh_dev *edev, void *flag); 276typedef void (*eeh_edev_traverse_func)(struct eeh_dev *edev, void *flag);
251typedef void *(*eeh_pe_traverse_func)(struct eeh_pe *pe, void *flag); 277typedef void *(*eeh_pe_traverse_func)(struct eeh_pe *pe, void *flag);
252void eeh_set_pe_aux_size(int size); 278void eeh_set_pe_aux_size(int size);
253int eeh_phb_pe_create(struct pci_controller *phb); 279int eeh_phb_pe_create(struct pci_controller *phb);
@@ -261,20 +287,20 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev);
261void eeh_pe_update_time_stamp(struct eeh_pe *pe); 287void eeh_pe_update_time_stamp(struct eeh_pe *pe);
262void *eeh_pe_traverse(struct eeh_pe *root, 288void *eeh_pe_traverse(struct eeh_pe *root,
263 eeh_pe_traverse_func fn, void *flag); 289 eeh_pe_traverse_func fn, void *flag);
264void *eeh_pe_dev_traverse(struct eeh_pe *root, 290void eeh_pe_dev_traverse(struct eeh_pe *root,
265 eeh_edev_traverse_func fn, void *flag); 291 eeh_edev_traverse_func fn, void *flag);
266void eeh_pe_restore_bars(struct eeh_pe *pe); 292void eeh_pe_restore_bars(struct eeh_pe *pe);
267const char *eeh_pe_loc_get(struct eeh_pe *pe); 293const char *eeh_pe_loc_get(struct eeh_pe *pe);
268struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe); 294struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe);
269 295
270struct eeh_dev *eeh_dev_init(struct pci_dn *pdn); 296struct eeh_dev *eeh_dev_init(struct pci_dn *pdn);
271void eeh_dev_phb_init_dynamic(struct pci_controller *phb); 297void eeh_dev_phb_init_dynamic(struct pci_controller *phb);
272void eeh_probe_devices(void); 298void eeh_show_enabled(void);
273int __init eeh_ops_register(struct eeh_ops *ops); 299int __init eeh_ops_register(struct eeh_ops *ops);
274int __exit eeh_ops_unregister(const char *name); 300int __exit eeh_ops_unregister(const char *name);
275int eeh_check_failure(const volatile void __iomem *token); 301int eeh_check_failure(const volatile void __iomem *token);
276int eeh_dev_check_failure(struct eeh_dev *edev); 302int eeh_dev_check_failure(struct eeh_dev *edev);
277void eeh_addr_cache_build(void); 303void eeh_addr_cache_init(void);
278void eeh_add_device_early(struct pci_dn *); 304void eeh_add_device_early(struct pci_dn *);
279void eeh_add_device_tree_early(struct pci_dn *); 305void eeh_add_device_tree_early(struct pci_dn *);
280void eeh_add_device_late(struct pci_dev *); 306void eeh_add_device_late(struct pci_dev *);
@@ -316,7 +342,7 @@ static inline bool eeh_enabled(void)
316 return false; 342 return false;
317} 343}
318 344
319static inline void eeh_probe_devices(void) { } 345static inline void eeh_show_enabled(void) { }
320 346
321static inline void *eeh_dev_init(struct pci_dn *pdn, void *data) 347static inline void *eeh_dev_init(struct pci_dn *pdn, void *data)
322{ 348{
@@ -332,7 +358,7 @@ static inline int eeh_check_failure(const volatile void __iomem *token)
332 358
333#define eeh_dev_check_failure(x) (0) 359#define eeh_dev_check_failure(x) (0)
334 360
335static inline void eeh_addr_cache_build(void) { } 361static inline void eeh_addr_cache_init(void) { }
336 362
337static inline void eeh_add_device_early(struct pci_dn *pdn) { } 363static inline void eeh_add_device_early(struct pci_dn *pdn) { }
338 364
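
A minimal usage sketch for the per-device logging helpers added above; the function and messages are invented for illustration, and it assumes an eeh_dev that is already bound to a PHB controller:

static void example_report_freeze(struct eeh_dev *edev)
{
	/* prints "PCI dddd:bb:dd.f#PE: EEH: ..." at the chosen log level */
	eeh_edev_warn(edev, "device frozen, scheduling recovery\n");
	eeh_edev_dbg(edev, "saved %d config space dwords\n", 16);
}
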
diff --git a/arch/powerpc/include/asm/elfnote.h b/arch/powerpc/include/asm/elfnote.h
new file mode 100644
index 000000000000..a201b6e9ae44
--- /dev/null
+++ b/arch/powerpc/include/asm/elfnote.h
@@ -0,0 +1,24 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * PowerPC ELF notes.
4 *
5 * Copyright 2019, IBM Corporation
6 */
7
8#ifndef __ASM_POWERPC_ELFNOTE_H__
9#define __ASM_POWERPC_ELFNOTE_H__
10
11/*
12 * These note types should live in a SHT_NOTE segment and have
13 * "PowerPC" in the name field.
14 */
15
16/*
17 * The capabilities supported/required by this kernel (bitmap).
18 *
19 * This type uses a bitmap as "desc" field. Each bit is described
20 * in arch/powerpc/kernel/note.S
21 */
22#define PPC_ELFNOTE_CAPABILITIES 1
23
24#endif /* __ASM_POWERPC_ELFNOTE_H__ */
diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h
new file mode 100644
index 000000000000..c814a2b55389
--- /dev/null
+++ b/arch/powerpc/include/asm/fadump-internal.h
@@ -0,0 +1,169 @@
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * Firmware-Assisted Dump internal code.
4 *
5 * Copyright 2011, Mahesh Salgaonkar, IBM Corporation.
6 * Copyright 2019, Hari Bathini, IBM Corporation.
7 */
8
9#ifndef _ASM_POWERPC_FADUMP_INTERNAL_H
10#define _ASM_POWERPC_FADUMP_INTERNAL_H
11
12/* Maximum number of memory regions kernel supports */
13#define FADUMP_MAX_MEM_REGS 128
14
15#ifndef CONFIG_PRESERVE_FA_DUMP
16
17/* The upper limit percentage for user specified boot memory size (25%) */
18#define MAX_BOOT_MEM_RATIO 4
19
20#define memblock_num_regions(memblock_type) (memblock.memblock_type.cnt)
21
22/* Alignment per CMA requirement. */
23#define FADUMP_CMA_ALIGNMENT (PAGE_SIZE << \
24 max_t(unsigned long, MAX_ORDER - 1, \
25 pageblock_order))
26
27/* FAD commands */
28#define FADUMP_REGISTER 1
29#define FADUMP_UNREGISTER 2
30#define FADUMP_INVALIDATE 3
31
32/*
 33 * Copy the ASCII values of the first 8 characters of a string into a u64
 34 * variable at their respective byte positions.
35 * e.g.
36 * The string "FADMPINF" will be converted into 0x4641444d50494e46
37 */
38static inline u64 fadump_str_to_u64(const char *str)
39{
40 u64 val = 0;
41 int i;
42
43 for (i = 0; i < sizeof(val); i++)
44 val = (*str) ? (val << 8) | *str++ : val << 8;
45 return val;
46}
47
48#define FADUMP_CPU_UNKNOWN (~((u32)0))
49
50#define FADUMP_CRASH_INFO_MAGIC fadump_str_to_u64("FADMPINF")
51
52/* fadump crash info structure */
53struct fadump_crash_info_header {
54 u64 magic_number;
55 u64 elfcorehdr_addr;
56 u32 crashing_cpu;
57 struct pt_regs regs;
58 struct cpumask online_mask;
59};
60
61struct fadump_memory_range {
62 u64 base;
63 u64 size;
64};
65
66/* fadump memory ranges info */
67struct fadump_mrange_info {
68 char name[16];
69 struct fadump_memory_range *mem_ranges;
70 u32 mem_ranges_sz;
71 u32 mem_range_cnt;
72 u32 max_mem_ranges;
73};
74
75/* Platform specific callback functions */
76struct fadump_ops;
77
78/* Firmware-assisted dump configuration details. */
79struct fw_dump {
80 unsigned long reserve_dump_area_start;
81 unsigned long reserve_dump_area_size;
82 /* cmd line option during boot */
83 unsigned long reserve_bootvar;
84
85 unsigned long cpu_state_data_size;
86 u64 cpu_state_dest_vaddr;
87 u32 cpu_state_data_version;
88 u32 cpu_state_entry_size;
89
90 unsigned long hpte_region_size;
91
92 unsigned long boot_memory_size;
93 u64 boot_mem_dest_addr;
94 u64 boot_mem_addr[FADUMP_MAX_MEM_REGS];
95 u64 boot_mem_sz[FADUMP_MAX_MEM_REGS];
96 u64 boot_mem_top;
97 u64 boot_mem_regs_cnt;
98
99 unsigned long fadumphdr_addr;
100 unsigned long cpu_notes_buf_vaddr;
101 unsigned long cpu_notes_buf_size;
102
103 /*
104 * Maximum size supported by firmware to copy from source to
105 * destination address per entry.
106 */
107 u64 max_copy_size;
108 u64 kernel_metadata;
109
110 int ibm_configure_kernel_dump;
111
112 unsigned long fadump_enabled:1;
113 unsigned long fadump_supported:1;
114 unsigned long dump_active:1;
115 unsigned long dump_registered:1;
116 unsigned long nocma:1;
117
118 struct fadump_ops *ops;
119};
120
121struct fadump_ops {
122 u64 (*fadump_init_mem_struct)(struct fw_dump *fadump_conf);
123 u64 (*fadump_get_metadata_size)(void);
124 int (*fadump_setup_metadata)(struct fw_dump *fadump_conf);
125 u64 (*fadump_get_bootmem_min)(void);
126 int (*fadump_register)(struct fw_dump *fadump_conf);
127 int (*fadump_unregister)(struct fw_dump *fadump_conf);
128 int (*fadump_invalidate)(struct fw_dump *fadump_conf);
129 void (*fadump_cleanup)(struct fw_dump *fadump_conf);
130 int (*fadump_process)(struct fw_dump *fadump_conf);
131 void (*fadump_region_show)(struct fw_dump *fadump_conf,
132 struct seq_file *m);
133 void (*fadump_trigger)(struct fadump_crash_info_header *fdh,
134 const char *msg);
135};
136
137/* Helper functions */
138s32 fadump_setup_cpu_notes_buf(u32 num_cpus);
139void fadump_free_cpu_notes_buf(void);
140u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs);
141void fadump_update_elfcore_header(char *bufp);
142bool is_fadump_boot_mem_contiguous(void);
143bool is_fadump_reserved_mem_contiguous(void);
144
145#else /* !CONFIG_PRESERVE_FA_DUMP */
146
147/* Firmware-assisted dump configuration details. */
148struct fw_dump {
149 u64 boot_mem_top;
150 u64 dump_active;
151};
152
153#endif /* CONFIG_PRESERVE_FA_DUMP */
154
155#ifdef CONFIG_PPC_PSERIES
156extern void rtas_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node);
157#else
158static inline void
159rtas_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) { }
160#endif
161
162#ifdef CONFIG_PPC_POWERNV
163extern void opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node);
164#else
165static inline void
166opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) { }
167#endif
168
169#endif /* _ASM_POWERPC_FADUMP_INTERNAL_H */
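
A quick illustration (not from the patch) of the fadump_str_to_u64() helper defined above: each of the first eight characters fills the next lower byte of the result, and shorter strings are zero-padded on the right:

static void example_fadump_magic(void)
{
	u64 magic = fadump_str_to_u64("FADMPINF");	/* 0x4641444d50494e46 */
	u64 short_tag = fadump_str_to_u64("OPAL");	/* 0x4f50414c00000000 */

	pr_debug("magic=0x%llx short_tag=0x%llx\n", magic, short_tag);
}
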
diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h
index 17d9b6acaf63..526a6a647312 100644
--- a/arch/powerpc/include/asm/fadump.h
+++ b/arch/powerpc/include/asm/fadump.h
@@ -6,196 +6,14 @@
6 * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> 6 * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
7 */ 7 */
8 8
9#ifndef __PPC64_FA_DUMP_H__ 9#ifndef _ASM_POWERPC_FADUMP_H
10#define __PPC64_FA_DUMP_H__ 10#define _ASM_POWERPC_FADUMP_H
11 11
12#ifdef CONFIG_FA_DUMP 12#ifdef CONFIG_FA_DUMP
13 13
14/*
15 * The RMA region will be saved for later dumping when kernel crashes.
16 * RMA is Real Mode Area, the first block of logical memory address owned
17 * by logical partition, containing the storage that may be accessed with
18 * translate off.
19 */
20#define RMA_START 0x0
21#define RMA_END (ppc64_rma_size)
22
23/*
24 * On some Power systems where RMO is 128MB, it still requires minimum of
25 * 256MB for kernel to boot successfully. When kdump infrastructure is
26 * configured to save vmcore over network, we run into OOM issue while
27 * loading modules related to network setup. Hence we need aditional 64M
28 * of memory to avoid OOM issue.
29 */
30#define MIN_BOOT_MEM (((RMA_END < (0x1UL << 28)) ? (0x1UL << 28) : RMA_END) \
31 + (0x1UL << 26))
32
33/* The upper limit percentage for user specified boot memory size (25%) */
34#define MAX_BOOT_MEM_RATIO 4
35
36#define memblock_num_regions(memblock_type) (memblock.memblock_type.cnt)
37
38/* Alignement per CMA requirement. */
39#define FADUMP_CMA_ALIGNMENT (PAGE_SIZE << \
40 max_t(unsigned long, MAX_ORDER - 1, pageblock_order))
41
42/* Firmware provided dump sections */
43#define FADUMP_CPU_STATE_DATA 0x0001
44#define FADUMP_HPTE_REGION 0x0002
45#define FADUMP_REAL_MODE_REGION 0x0011
46
47/* Dump request flag */
48#define FADUMP_REQUEST_FLAG 0x00000001
49
50/* FAD commands */
51#define FADUMP_REGISTER 1
52#define FADUMP_UNREGISTER 2
53#define FADUMP_INVALIDATE 3
54
55/* Dump status flag */
56#define FADUMP_ERROR_FLAG 0x2000
57
58#define FADUMP_CPU_ID_MASK ((1UL << 32) - 1)
59
60#define CPU_UNKNOWN (~((u32)0))
61
62/* Utility macros */
63#define SKIP_TO_NEXT_CPU(reg_entry) \
64({ \
65 while (be64_to_cpu(reg_entry->reg_id) != REG_ID("CPUEND")) \
66 reg_entry++; \
67 reg_entry++; \
68})
69
70extern int crashing_cpu; 14extern int crashing_cpu;
71 15
72/* Kernel Dump section info */
73struct fadump_section {
74 __be32 request_flag;
75 __be16 source_data_type;
76 __be16 error_flags;
77 __be64 source_address;
78 __be64 source_len;
79 __be64 bytes_dumped;
80 __be64 destination_address;
81};
82
83/* ibm,configure-kernel-dump header. */
84struct fadump_section_header {
85 __be32 dump_format_version;
86 __be16 dump_num_sections;
87 __be16 dump_status_flag;
88 __be32 offset_first_dump_section;
89
90 /* Fields for disk dump option. */
91 __be32 dd_block_size;
92 __be64 dd_block_offset;
93 __be64 dd_num_blocks;
94 __be32 dd_offset_disk_path;
95
96 /* Maximum time allowed to prevent an automatic dump-reboot. */
97 __be32 max_time_auto;
98};
99
100/*
101 * Firmware Assisted dump memory structure. This structure is required for
102 * registering future kernel dump with power firmware through rtas call.
103 *
104 * No disk dump option. Hence disk dump path string section is not included.
105 */
106struct fadump_mem_struct {
107 struct fadump_section_header header;
108
109 /* Kernel dump sections */
110 struct fadump_section cpu_state_data;
111 struct fadump_section hpte_region;
112 struct fadump_section rmr_region;
113};
114
115/* Firmware-assisted dump configuration details. */
116struct fw_dump {
117 unsigned long cpu_state_data_size;
118 unsigned long hpte_region_size;
119 unsigned long boot_memory_size;
120 unsigned long reserve_dump_area_start;
121 unsigned long reserve_dump_area_size;
122 /* cmd line option during boot */
123 unsigned long reserve_bootvar;
124
125 unsigned long fadumphdr_addr;
126 unsigned long cpu_notes_buf;
127 unsigned long cpu_notes_buf_size;
128
129 int ibm_configure_kernel_dump;
130
131 unsigned long fadump_enabled:1;
132 unsigned long fadump_supported:1;
133 unsigned long dump_active:1;
134 unsigned long dump_registered:1;
135 unsigned long nocma:1;
136};
137
138/*
139 * Copy the ascii values for first 8 characters from a string into u64
140 * variable at their respective indexes.
141 * e.g.
142 * The string "FADMPINF" will be converted into 0x4641444d50494e46
143 */
144static inline u64 str_to_u64(const char *str)
145{
146 u64 val = 0;
147 int i;
148
149 for (i = 0; i < sizeof(val); i++)
150 val = (*str) ? (val << 8) | *str++ : val << 8;
151 return val;
152}
153#define STR_TO_HEX(x) str_to_u64(x)
154#define REG_ID(x) str_to_u64(x)
155
156#define FADUMP_CRASH_INFO_MAGIC STR_TO_HEX("FADMPINF")
157#define REGSAVE_AREA_MAGIC STR_TO_HEX("REGSAVE")
158
159/* The firmware-assisted dump format.
160 *
161 * The register save area is an area in the partition's memory used to preserve
162 * the register contents (CPU state data) for the active CPUs during a firmware
163 * assisted dump. The dump format contains register save area header followed
164 * by register entries. Each list of registers for a CPU starts with
165 * "CPUSTRT" and ends with "CPUEND".
166 */
167
168/* Register save area header. */
169struct fadump_reg_save_area_header {
170 __be64 magic_number;
171 __be32 version;
172 __be32 num_cpu_offset;
173};
174
175/* Register entry. */
176struct fadump_reg_entry {
177 __be64 reg_id;
178 __be64 reg_value;
179};
180
181/* fadump crash info structure */
182struct fadump_crash_info_header {
183 u64 magic_number;
184 u64 elfcorehdr_addr;
185 u32 crashing_cpu;
186 struct pt_regs regs;
187 struct cpumask online_mask;
188};
189
190struct fad_crash_memory_ranges {
191 unsigned long long base;
192 unsigned long long size;
193};
194
195extern int is_fadump_memory_area(u64 addr, ulong size); 16extern int is_fadump_memory_area(u64 addr, ulong size);
196extern int early_init_dt_scan_fw_dump(unsigned long node,
197 const char *uname, int depth, void *data);
198extern int fadump_reserve_mem(void);
199extern int setup_fadump(void); 17extern int setup_fadump(void);
200extern int is_fadump_active(void); 18extern int is_fadump_active(void);
201extern int should_fadump_crash(void); 19extern int should_fadump_crash(void);
@@ -207,5 +25,11 @@ static inline int is_fadump_active(void) { return 0; }
207static inline int should_fadump_crash(void) { return 0; } 25static inline int should_fadump_crash(void) { return 0; }
208static inline void crash_fadump(struct pt_regs *regs, const char *str) { } 26static inline void crash_fadump(struct pt_regs *regs, const char *str) { }
209static inline void fadump_cleanup(void) { } 27static inline void fadump_cleanup(void) { }
28#endif /* !CONFIG_FA_DUMP */
29
30#if defined(CONFIG_FA_DUMP) || defined(CONFIG_PRESERVE_FA_DUMP)
31extern int early_init_dt_scan_fw_dump(unsigned long node, const char *uname,
32 int depth, void *data);
33extern int fadump_reserve_mem(void);
210#endif 34#endif
211#endif 35#endif /* _ASM_POWERPC_FADUMP_H */
diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h
index faeca8b76c8c..b3e214a97f3a 100644
--- a/arch/powerpc/include/asm/firmware.h
+++ b/arch/powerpc/include/asm/firmware.h
@@ -50,6 +50,7 @@
50#define FW_FEATURE_DRC_INFO ASM_CONST(0x0000000800000000) 50#define FW_FEATURE_DRC_INFO ASM_CONST(0x0000000800000000)
51#define FW_FEATURE_BLOCK_REMOVE ASM_CONST(0x0000001000000000) 51#define FW_FEATURE_BLOCK_REMOVE ASM_CONST(0x0000001000000000)
52#define FW_FEATURE_PAPR_SCM ASM_CONST(0x0000002000000000) 52#define FW_FEATURE_PAPR_SCM ASM_CONST(0x0000002000000000)
53#define FW_FEATURE_ULTRAVISOR ASM_CONST(0x0000004000000000)
53 54
54#ifndef __ASSEMBLY__ 55#ifndef __ASSEMBLY__
55 56
@@ -68,9 +69,9 @@ enum {
68 FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN | 69 FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN |
69 FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRMEM_V2 | 70 FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRMEM_V2 |
70 FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE | 71 FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE |
71 FW_FEATURE_PAPR_SCM, 72 FW_FEATURE_PAPR_SCM | FW_FEATURE_ULTRAVISOR,
72 FW_FEATURE_PSERIES_ALWAYS = 0, 73 FW_FEATURE_PSERIES_ALWAYS = 0,
73 FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL, 74 FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL | FW_FEATURE_ULTRAVISOR,
74 FW_FEATURE_POWERNV_ALWAYS = 0, 75 FW_FEATURE_POWERNV_ALWAYS = 0,
75 FW_FEATURE_PS3_POSSIBLE = FW_FEATURE_LPAR | FW_FEATURE_PS3_LV1, 76 FW_FEATURE_PS3_POSSIBLE = FW_FEATURE_LPAR | FW_FEATURE_PS3_LV1,
76 FW_FEATURE_PS3_ALWAYS = FW_FEATURE_LPAR | FW_FEATURE_PS3_LV1, 77 FW_FEATURE_PS3_ALWAYS = FW_FEATURE_LPAR | FW_FEATURE_PS3_LV1,
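
A hedged sketch of how platform code can gate on the new feature bit at runtime using the existing firmware_has_feature() accessor; the message text is made up:

static void example_check_ultravisor(void)
{
	if (firmware_has_feature(FW_FEATURE_ULTRAVISOR))
		pr_info("firmware: running below an ultravisor\n");
}
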
diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h
index 3dfb80b86561..f54a08a2cd70 100644
--- a/arch/powerpc/include/asm/ftrace.h
+++ b/arch/powerpc/include/asm/ftrace.h
@@ -8,6 +8,8 @@
8#define MCOUNT_ADDR ((unsigned long)(_mcount)) 8#define MCOUNT_ADDR ((unsigned long)(_mcount))
9#define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ 9#define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */
10 10
11#define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
12
11#ifdef __ASSEMBLY__ 13#ifdef __ASSEMBLY__
12 14
 13/* Based off of objdump output from glibc */ 15/* Based off of objdump output from glibc */
diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h
index 3a6aa57b9d90..eea28ca679db 100644
--- a/arch/powerpc/include/asm/futex.h
+++ b/arch/powerpc/include/asm/futex.h
@@ -60,8 +60,7 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
60 60
61 pagefault_enable(); 61 pagefault_enable();
62 62
63 if (!ret) 63 *oval = oldval;
64 *oval = oldval;
65 64
66 prevent_write_to_user(uaddr, sizeof(*uaddr)); 65 prevent_write_to_user(uaddr, sizeof(*uaddr));
67 return ret; 66 return ret;
diff --git a/arch/powerpc/include/asm/head-64.h b/arch/powerpc/include/asm/head-64.h
index a466765709a9..2dabcf668292 100644
--- a/arch/powerpc/include/asm/head-64.h
+++ b/arch/powerpc/include/asm/head-64.h
@@ -169,47 +169,6 @@ name:
169 169
170#define ABS_ADDR(label) (label - fs_label + fs_start) 170#define ABS_ADDR(label) (label - fs_label + fs_start)
171 171
172#define EXC_REAL_BEGIN(name, start, size) \
173 FIXED_SECTION_ENTRY_BEGIN_LOCATION(real_vectors, exc_real_##start##_##name, start, size)
174
175#define EXC_REAL_END(name, start, size) \
176 FIXED_SECTION_ENTRY_END_LOCATION(real_vectors, exc_real_##start##_##name, start, size)
177
178#define EXC_VIRT_BEGIN(name, start, size) \
179 FIXED_SECTION_ENTRY_BEGIN_LOCATION(virt_vectors, exc_virt_##start##_##name, start, size)
180
181#define EXC_VIRT_END(name, start, size) \
182 FIXED_SECTION_ENTRY_END_LOCATION(virt_vectors, exc_virt_##start##_##name, start, size)
183
184#define EXC_COMMON_BEGIN(name) \
185 USE_TEXT_SECTION(); \
186 .balign IFETCH_ALIGN_BYTES; \
187 .global name; \
188 _ASM_NOKPROBE_SYMBOL(name); \
189 DEFINE_FIXED_SYMBOL(name); \
190name:
191
192#define TRAMP_REAL_BEGIN(name) \
193 FIXED_SECTION_ENTRY_BEGIN(real_trampolines, name)
194
195#define TRAMP_VIRT_BEGIN(name) \
196 FIXED_SECTION_ENTRY_BEGIN(virt_trampolines, name)
197
198#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
199#define TRAMP_KVM_BEGIN(name) \
200 TRAMP_VIRT_BEGIN(name)
201#else
202#define TRAMP_KVM_BEGIN(name)
203#endif
204
205#define EXC_REAL_NONE(start, size) \
206 FIXED_SECTION_ENTRY_BEGIN_LOCATION(real_vectors, exc_real_##start##_##unused, start, size); \
207 FIXED_SECTION_ENTRY_END_LOCATION(real_vectors, exc_real_##start##_##unused, start, size)
208
209#define EXC_VIRT_NONE(start, size) \
210 FIXED_SECTION_ENTRY_BEGIN_LOCATION(virt_vectors, exc_virt_##start##_##unused, start, size); \
211 FIXED_SECTION_ENTRY_END_LOCATION(virt_vectors, exc_virt_##start##_##unused, start, size)
212
213#endif /* __ASSEMBLY__ */ 172#endif /* __ASSEMBLY__ */
214 173
215#endif /* _ASM_POWERPC_HEAD_64_H */ 174#endif /* _ASM_POWERPC_HEAD_64_H */
diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index 20a101046cff..bd6504c28c2f 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -31,9 +31,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
31 return 0; 31 return 0;
32} 32}
33 33
34void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea,
35 pte_t pte);
36
37#define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE 34#define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE
38void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, 35void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
39 unsigned long end, unsigned long floor, 36 unsigned long end, unsigned long floor,
diff --git a/arch/powerpc/include/asm/io-workarounds.h b/arch/powerpc/include/asm/io-workarounds.h
index 01567ea4ceaf..3cce499fbe27 100644
--- a/arch/powerpc/include/asm/io-workarounds.h
+++ b/arch/powerpc/include/asm/io-workarounds.h
@@ -8,6 +8,7 @@
8#ifndef _IO_WORKAROUNDS_H 8#ifndef _IO_WORKAROUNDS_H
9#define _IO_WORKAROUNDS_H 9#define _IO_WORKAROUNDS_H
10 10
11#ifdef CONFIG_PPC_IO_WORKAROUNDS
11#include <linux/io.h> 12#include <linux/io.h>
12#include <asm/pci-bridge.h> 13#include <asm/pci-bridge.h>
13 14
@@ -32,4 +33,23 @@ extern int spiderpci_iowa_init(struct iowa_bus *, void *);
32#define SPIDER_PCI_DUMMY_READ 0x0810 33#define SPIDER_PCI_DUMMY_READ 0x0810
33#define SPIDER_PCI_DUMMY_READ_BASE 0x0814 34#define SPIDER_PCI_DUMMY_READ_BASE 0x0814
34 35
36#endif
37
38#if defined(CONFIG_PPC_IO_WORKAROUNDS) && defined(CONFIG_PPC_INDIRECT_MMIO)
39extern bool io_workaround_inited;
40
41static inline bool iowa_is_active(void)
42{
43 return unlikely(io_workaround_inited);
44}
45#else
46static inline bool iowa_is_active(void)
47{
48 return false;
49}
50#endif
51
52void __iomem *iowa_ioremap(phys_addr_t addr, unsigned long size,
53 pgprot_t prot, void *caller);
54
35#endif /* _IO_WORKAROUNDS_H */ 55#endif /* _IO_WORKAROUNDS_H */
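
Illustrative only: the shape of the fast-path test mapping code can now make with iowa_is_active() before taking the workaround path; the function name is invented and the normal fallback is omitted:

static void __iomem *example_ioremap(phys_addr_t addr, unsigned long size,
				     pgprot_t prot, void *caller)
{
	if (iowa_is_active())
		return iowa_ioremap(addr, size, prot, caller);

	return NULL;	/* the regular ioremap path would run here */
}
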
diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index 23e5d5d16c7e..a63ec938636d 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -705,16 +705,9 @@ static inline void iosync(void)
705 * create hand-made mappings for use only by the PCI code and cannot 705 * create hand-made mappings for use only by the PCI code and cannot
706 * currently be hooked. Must be page aligned. 706 * currently be hooked. Must be page aligned.
707 * 707 *
708 * * __ioremap is the low level implementation used by ioremap and
709 * ioremap_prot and cannot be hooked (but can be used by a hook on one
710 * of the previous ones)
711 *
712 * * __ioremap_caller is the same as above but takes an explicit caller 708 * * __ioremap_caller is the same as above but takes an explicit caller
713 * reference rather than using __builtin_return_address(0) 709 * reference rather than using __builtin_return_address(0)
714 * 710 *
715 * * __iounmap, is the low level implementation used by iounmap and cannot
716 * be hooked (but can be used by a hook on iounmap)
717 *
718 */ 711 */
719extern void __iomem *ioremap(phys_addr_t address, unsigned long size); 712extern void __iomem *ioremap(phys_addr_t address, unsigned long size);
720extern void __iomem *ioremap_prot(phys_addr_t address, unsigned long size, 713extern void __iomem *ioremap_prot(phys_addr_t address, unsigned long size,
@@ -729,13 +722,14 @@ void __iomem *ioremap_coherent(phys_addr_t address, unsigned long size);
729 722
730extern void iounmap(volatile void __iomem *addr); 723extern void iounmap(volatile void __iomem *addr);
731 724
732extern void __iomem *__ioremap(phys_addr_t, unsigned long size, 725int early_ioremap_range(unsigned long ea, phys_addr_t pa,
733 unsigned long flags); 726 unsigned long size, pgprot_t prot);
727void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size,
728 pgprot_t prot, void *caller);
729
734extern void __iomem *__ioremap_caller(phys_addr_t, unsigned long size, 730extern void __iomem *__ioremap_caller(phys_addr_t, unsigned long size,
735 pgprot_t prot, void *caller); 731 pgprot_t prot, void *caller);
736 732
737extern void __iounmap(volatile void __iomem *addr);
738
739extern void __iomem * __ioremap_at(phys_addr_t pa, void *ea, 733extern void __iomem * __ioremap_at(phys_addr_t pa, void *ea,
740 unsigned long size, pgprot_t prot); 734 unsigned long size, pgprot_t prot);
741extern void __iounmap_at(void *ea, unsigned long size); 735extern void __iounmap_at(void *ea, unsigned long size);
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 18d342b815e4..350101e11ddb 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -48,15 +48,16 @@ struct iommu_table_ops {
48 * returns old TCE and DMA direction mask. 48 * returns old TCE and DMA direction mask.
49 * @tce is a physical address. 49 * @tce is a physical address.
50 */ 50 */
51 int (*exchange)(struct iommu_table *tbl, 51 int (*xchg_no_kill)(struct iommu_table *tbl,
52 long index, 52 long index,
53 unsigned long *hpa, 53 unsigned long *hpa,
54 enum dma_data_direction *direction); 54 enum dma_data_direction *direction,
55 /* Real mode */ 55 bool realmode);
56 int (*exchange_rm)(struct iommu_table *tbl, 56
57 long index, 57 void (*tce_kill)(struct iommu_table *tbl,
58 unsigned long *hpa, 58 unsigned long index,
59 enum dma_data_direction *direction); 59 unsigned long pages,
60 bool realmode);
60 61
61 __be64 *(*useraddrptr)(struct iommu_table *tbl, long index, bool alloc); 62 __be64 *(*useraddrptr)(struct iommu_table *tbl, long index, bool alloc);
62#endif 63#endif
@@ -111,6 +112,8 @@ struct iommu_table {
111 struct iommu_table_ops *it_ops; 112 struct iommu_table_ops *it_ops;
112 struct kref it_kref; 113 struct kref it_kref;
113 int it_nid; 114 int it_nid;
115 unsigned long it_reserved_start; /* Start of not-DMA-able (MMIO) area */
116 unsigned long it_reserved_end;
114}; 117};
115 118
116#define IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry) \ 119#define IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry) \
@@ -149,8 +152,9 @@ extern int iommu_tce_table_put(struct iommu_table *tbl);
149/* Initializes an iommu_table based in values set in the passed-in 152/* Initializes an iommu_table based in values set in the passed-in
150 * structure 153 * structure
151 */ 154 */
152extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, 155extern struct iommu_table *iommu_init_table(struct iommu_table *tbl,
153 int nid); 156 int nid, unsigned long res_start, unsigned long res_end);
157
154#define IOMMU_TABLE_GROUP_MAX_TABLES 2 158#define IOMMU_TABLE_GROUP_MAX_TABLES 2
155 159
156struct iommu_table_group; 160struct iommu_table_group;
@@ -206,6 +210,12 @@ extern void iommu_del_device(struct device *dev);
206extern long iommu_tce_xchg(struct mm_struct *mm, struct iommu_table *tbl, 210extern long iommu_tce_xchg(struct mm_struct *mm, struct iommu_table *tbl,
207 unsigned long entry, unsigned long *hpa, 211 unsigned long entry, unsigned long *hpa,
208 enum dma_data_direction *direction); 212 enum dma_data_direction *direction);
213extern long iommu_tce_xchg_no_kill(struct mm_struct *mm,
214 struct iommu_table *tbl,
215 unsigned long entry, unsigned long *hpa,
216 enum dma_data_direction *direction);
217extern void iommu_tce_kill(struct iommu_table *tbl,
218 unsigned long entry, unsigned long pages);
209#else 219#else
210static inline void iommu_register_group(struct iommu_table_group *table_group, 220static inline void iommu_register_group(struct iommu_table_group *table_group,
211 int pci_domain_number, 221 int pci_domain_number,
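
A sketch (error handling omitted, name invented) of the calling pattern the split API above is designed for: update each TCE with the _no_kill variant, then issue a single TCE-cache invalidation for the whole range:

static void example_clear_tces(struct mm_struct *mm, struct iommu_table *tbl,
			       unsigned long entry, unsigned long pages)
{
	unsigned long i;

	for (i = 0; i < pages; i++) {
		unsigned long hpa = 0;
		enum dma_data_direction dir = DMA_NONE;

		/* clears the entry and returns the previous mapping */
		iommu_tce_xchg_no_kill(mm, tbl, entry + i, &hpa, &dir);
	}
	/* one invalidation for the whole range instead of one per entry */
	iommu_tce_kill(tbl, entry, pages);
}
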
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 6fb5fb4779e0..6fe6ad64cba5 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -297,6 +297,7 @@ struct kvm_arch {
297 cpumask_t cpu_in_guest; 297 cpumask_t cpu_in_guest;
298 u8 radix; 298 u8 radix;
299 u8 fwnmi_enabled; 299 u8 fwnmi_enabled;
300 u8 secure_guest;
300 bool threads_indep; 301 bool threads_indep;
301 bool nested_enable; 302 bool nested_enable;
302 pgd_t *pgtable; 303 pgd_t *pgtable;
diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index c43d6eca9edd..7bcb64444a39 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -3,9 +3,6 @@
3#define _ASM_POWERPC_MACHDEP_H 3#define _ASM_POWERPC_MACHDEP_H
4#ifdef __KERNEL__ 4#ifdef __KERNEL__
5 5
6/*
7 */
8
9#include <linux/seq_file.h> 6#include <linux/seq_file.h>
10#include <linux/init.h> 7#include <linux/init.h>
11#include <linux/dma-mapping.h> 8#include <linux/dma-mapping.h>
@@ -31,10 +28,6 @@ struct pci_host_bridge;
31struct machdep_calls { 28struct machdep_calls {
32 char *name; 29 char *name;
33#ifdef CONFIG_PPC64 30#ifdef CONFIG_PPC64
34 void __iomem * (*ioremap)(phys_addr_t addr, unsigned long size,
35 pgprot_t prot, void *caller);
36 void (*iounmap)(volatile void __iomem *token);
37
38#ifdef CONFIG_PM 31#ifdef CONFIG_PM
39 void (*iommu_save)(void); 32 void (*iommu_save)(void);
40 void (*iommu_restore)(void); 33 void (*iommu_restore)(void);
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index a4c6a74ad2fb..6a6ddaabdb34 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -30,6 +30,10 @@ enum MCE_Disposition {
30enum MCE_Initiator { 30enum MCE_Initiator {
31 MCE_INITIATOR_UNKNOWN = 0, 31 MCE_INITIATOR_UNKNOWN = 0,
32 MCE_INITIATOR_CPU = 1, 32 MCE_INITIATOR_CPU = 1,
33 MCE_INITIATOR_PCI = 2,
34 MCE_INITIATOR_ISA = 3,
35 MCE_INITIATOR_MEMORY= 4,
36 MCE_INITIATOR_POWERMGM = 5,
33}; 37};
34 38
35enum MCE_ErrorType { 39enum MCE_ErrorType {
@@ -41,6 +45,8 @@ enum MCE_ErrorType {
41 MCE_ERROR_TYPE_USER = 5, 45 MCE_ERROR_TYPE_USER = 5,
42 MCE_ERROR_TYPE_RA = 6, 46 MCE_ERROR_TYPE_RA = 6,
43 MCE_ERROR_TYPE_LINK = 7, 47 MCE_ERROR_TYPE_LINK = 7,
48 MCE_ERROR_TYPE_DCACHE = 8,
49 MCE_ERROR_TYPE_ICACHE = 9,
44}; 50};
45 51
46enum MCE_ErrorClass { 52enum MCE_ErrorClass {
@@ -122,7 +128,8 @@ struct machine_check_event {
122 enum MCE_UeErrorType ue_error_type:8; 128 enum MCE_UeErrorType ue_error_type:8;
123 u8 effective_address_provided; 129 u8 effective_address_provided;
124 u8 physical_address_provided; 130 u8 physical_address_provided;
125 u8 reserved_1[5]; 131 u8 ignore_event;
132 u8 reserved_1[4];
126 u64 effective_address; 133 u64 effective_address;
127 u64 physical_address; 134 u64 physical_address;
128 u8 reserved_2[8]; 135 u8 reserved_2[8];
@@ -193,6 +200,7 @@ struct mce_error_info {
193 enum MCE_Initiator initiator:8; 200 enum MCE_Initiator initiator:8;
194 enum MCE_ErrorClass error_class:8; 201 enum MCE_ErrorClass error_class:8;
195 bool sync_error; 202 bool sync_error;
203 bool ignore_event;
196}; 204};
197 205
198#define MAX_MC_EVT 100 206#define MAX_MC_EVT 100
diff --git a/arch/powerpc/include/asm/mem_encrypt.h b/arch/powerpc/include/asm/mem_encrypt.h
new file mode 100644
index 000000000000..ba9dab07c1be
--- /dev/null
+++ b/arch/powerpc/include/asm/mem_encrypt.h
@@ -0,0 +1,26 @@
1/* SPDX-License-Identifier: GPL-2.0+ */
2/*
3 * SVM helper functions
4 *
5 * Copyright 2018 IBM Corporation
6 */
7
8#ifndef _ASM_POWERPC_MEM_ENCRYPT_H
9#define _ASM_POWERPC_MEM_ENCRYPT_H
10
11#include <asm/svm.h>
12
13static inline bool mem_encrypt_active(void)
14{
15 return is_secure_guest();
16}
17
18static inline bool force_dma_unencrypted(struct device *dev)
19{
20 return is_secure_guest();
21}
22
23int set_memory_encrypted(unsigned long addr, int numpages);
24int set_memory_decrypted(unsigned long addr, int numpages);
25
26#endif /* _ASM_POWERPC_MEM_ENCRYPT_H */
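
An illustration of how these helpers are meant to be used together: a secure guest shares (decrypts) pages with the hypervisor before handing them out for I/O, and does nothing on a normal guest. The function name and error handling are made up:

static int example_share_with_hypervisor(unsigned long vaddr, int numpages)
{
	if (!mem_encrypt_active())
		return 0;	/* not a secure VM, nothing to do */

	return set_memory_decrypted(vaddr, numpages);
}
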
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index ba94ce8c22d7..0699cfeeb8c9 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -257,7 +257,7 @@ extern void radix__mmu_cleanup_all(void);
257/* Functions for creating and updating partition table on POWER9 */ 257/* Functions for creating and updating partition table on POWER9 */
258extern void mmu_partition_table_init(void); 258extern void mmu_partition_table_init(void);
259extern void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0, 259extern void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
260 unsigned long dw1); 260 unsigned long dw1, bool flush);
261#endif /* CONFIG_PPC64 */ 261#endif /* CONFIG_PPC64 */
262 262
263struct mm_struct; 263struct mm_struct;
diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h
index 0284f8f5305f..552b96eef0c8 100644
--- a/arch/powerpc/include/asm/nohash/32/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@@ -11,8 +11,6 @@
11#include <asm/mmu.h> /* For sub-arch specific PPC_PIN_SIZE */ 11#include <asm/mmu.h> /* For sub-arch specific PPC_PIN_SIZE */
12#include <asm/asm-405.h> 12#include <asm/asm-405.h>
13 13
14extern unsigned long ioremap_bot;
15
16#ifdef CONFIG_44x 14#ifdef CONFIG_44x
17extern int icache_44x_need_flush; 15extern int icache_44x_need_flush;
18#endif 16#endif
@@ -78,23 +76,21 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot);
78 */ 76 */
79#include <asm/fixmap.h> 77#include <asm/fixmap.h>
80 78
81#ifdef CONFIG_HIGHMEM
82#define KVIRT_TOP PKMAP_BASE
83#else
84#define KVIRT_TOP FIXADDR_START
85#endif
86
87/* 79/*
88 * ioremap_bot starts at that address. Early ioremaps move down from there, 80 * ioremap_bot starts at that address. Early ioremaps move down from there,
89 * until mem_init() at which point this becomes the top of the vmalloc 81 * until mem_init() at which point this becomes the top of the vmalloc
90 * and ioremap space 82 * and ioremap space
91 */ 83 */
92#ifdef CONFIG_NOT_COHERENT_CACHE 84#ifdef CONFIG_HIGHMEM
93#define IOREMAP_TOP ((KVIRT_TOP - CONFIG_CONSISTENT_SIZE) & PAGE_MASK) 85#define IOREMAP_TOP PKMAP_BASE
94#else 86#else
95#define IOREMAP_TOP KVIRT_TOP 87#define IOREMAP_TOP FIXADDR_START
96#endif 88#endif
97 89
90/* PPC32 shares vmalloc area with ioremap */
91#define IOREMAP_START VMALLOC_START
92#define IOREMAP_END VMALLOC_END
93
98/* 94/*
99 * Just any arbitrary offset to the start of the vmalloc VM area: the 95 * Just any arbitrary offset to the start of the vmalloc VM area: the
100 * current 16MB value just means that there will be a 64MB "hole" after the 96 * current 16MB value just means that there will be a 64MB "hole" after the
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
index b9f66cf15c31..9a33b8bd842d 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -53,6 +53,7 @@
53#define PHB_IO_BASE (ISA_IO_END) 53#define PHB_IO_BASE (ISA_IO_END)
54#define PHB_IO_END (KERN_IO_START + FULL_IO_SIZE) 54#define PHB_IO_END (KERN_IO_START + FULL_IO_SIZE)
55#define IOREMAP_BASE (PHB_IO_END) 55#define IOREMAP_BASE (PHB_IO_END)
56#define IOREMAP_START (ioremap_bot)
56#define IOREMAP_END (KERN_VIRT_START + KERN_VIRT_SIZE) 57#define IOREMAP_END (KERN_VIRT_START + KERN_VIRT_SIZE)
57 58
58 59
diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h
index 1ca1c1864b32..7fed9dc0f147 100644
--- a/arch/powerpc/include/asm/nohash/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/pgtable.h
@@ -293,5 +293,18 @@ static inline int pgd_huge(pgd_t pgd)
293#define is_hugepd(hpd) (hugepd_ok(hpd)) 293#define is_hugepd(hpd) (hugepd_ok(hpd))
294#endif 294#endif
295 295
296/*
297 * This gets called at the end of handling a page fault, when
298 * the kernel has put a new PTE into the page table for the process.
299 * We use it to ensure coherency between the i-cache and d-cache
300 * for the page which has just been mapped in.
301 */
302#if defined(CONFIG_PPC_FSL_BOOK3E) && defined(CONFIG_HUGETLB_PAGE)
303void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep);
304#else
305static inline
306void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) {}
307#endif
308
296#endif /* __ASSEMBLY__ */ 309#endif /* __ASSEMBLY__ */
297#endif 310#endif
diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h
index 383242eb0dea..378e3997845a 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -208,7 +208,10 @@
208#define OPAL_HANDLE_HMI2 166 208#define OPAL_HANDLE_HMI2 166
209#define OPAL_NX_COPROC_INIT 167 209#define OPAL_NX_COPROC_INIT 167
210#define OPAL_XIVE_GET_VP_STATE 170 210#define OPAL_XIVE_GET_VP_STATE 170
211#define OPAL_LAST 170 211#define OPAL_MPIPL_UPDATE 173
212#define OPAL_MPIPL_REGISTER_TAG 174
213#define OPAL_MPIPL_QUERY_TAG 175
214#define OPAL_LAST 175
212 215
213#define QUIESCE_HOLD 1 /* Spin all calls at entry */ 216#define QUIESCE_HOLD 1 /* Spin all calls at entry */
214#define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */ 217#define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */
@@ -453,6 +456,7 @@ enum opal_msg_type {
453 OPAL_MSG_DPO = 5, 456 OPAL_MSG_DPO = 5,
454 OPAL_MSG_PRD = 6, 457 OPAL_MSG_PRD = 6,
455 OPAL_MSG_OCC = 7, 458 OPAL_MSG_OCC = 7,
459 OPAL_MSG_PRD2 = 8,
456 OPAL_MSG_TYPE_MAX, 460 OPAL_MSG_TYPE_MAX,
457}; 461};
458 462
@@ -1059,6 +1063,7 @@ enum {
1059 OPAL_REBOOT_NORMAL = 0, 1063 OPAL_REBOOT_NORMAL = 0,
1060 OPAL_REBOOT_PLATFORM_ERROR = 1, 1064 OPAL_REBOOT_PLATFORM_ERROR = 1,
1061 OPAL_REBOOT_FULL_IPL = 2, 1065 OPAL_REBOOT_FULL_IPL = 2,
1066 OPAL_REBOOT_MPIPL = 3,
1062}; 1067};
1063 1068
1064/* Argument to OPAL_PCI_TCE_KILL */ 1069/* Argument to OPAL_PCI_TCE_KILL */
@@ -1135,6 +1140,44 @@ enum {
1135#define OPAL_PCI_P2P_LOAD 0x2 1140#define OPAL_PCI_P2P_LOAD 0x2
1136#define OPAL_PCI_P2P_STORE 0x4 1141#define OPAL_PCI_P2P_STORE 0x4
1137 1142
1143/* MPIPL update operations */
1144enum opal_mpipl_ops {
1145 OPAL_MPIPL_ADD_RANGE = 0,
1146 OPAL_MPIPL_REMOVE_RANGE = 1,
1147 OPAL_MPIPL_REMOVE_ALL = 2,
1148 OPAL_MPIPL_FREE_PRESERVED_MEMORY = 3,
1149};
1150
 1151/* Each tag points to a metadata area. The kernel will
 1152 * use the tag to retrieve the metadata value.
1153 */
1154enum opal_mpipl_tags {
1155 OPAL_MPIPL_TAG_CPU = 0,
1156 OPAL_MPIPL_TAG_OPAL = 1,
1157 OPAL_MPIPL_TAG_KERNEL = 2,
1158 OPAL_MPIPL_TAG_BOOT_MEM = 3,
1159};
1160
1161/* Preserved memory details */
1162struct opal_mpipl_region {
1163 __be64 src;
1164 __be64 dest;
1165 __be64 size;
1166};
1167
1168/* Structure version */
1169#define OPAL_MPIPL_VERSION 0x01
1170
1171struct opal_mpipl_fadump {
1172 u8 version;
1173 u8 reserved[7];
1174 __be32 crashing_pir; /* OPAL crashing CPU PIR */
1175 __be32 cpu_data_version;
1176 __be32 cpu_data_size;
1177 __be32 region_cnt;
1178 struct opal_mpipl_region region[];
1179} __packed;
1180
1138#endif /* __ASSEMBLY__ */ 1181#endif /* __ASSEMBLY__ */
1139 1182
1140#endif /* __OPAL_API_H */ 1183#endif /* __OPAL_API_H */
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 57bd029c715e..a0cf8fba4d12 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -39,6 +39,7 @@ int64_t opal_npu_spa_clear_cache(uint64_t phb_id, uint32_t bdfn,
39 uint64_t PE_handle); 39 uint64_t PE_handle);
40int64_t opal_npu_tl_set(uint64_t phb_id, uint32_t bdfn, long cap, 40int64_t opal_npu_tl_set(uint64_t phb_id, uint32_t bdfn, long cap,
41 uint64_t rate_phys, uint32_t size); 41 uint64_t rate_phys, uint32_t size);
42
42int64_t opal_console_write(int64_t term_number, __be64 *length, 43int64_t opal_console_write(int64_t term_number, __be64 *length,
43 const uint8_t *buffer); 44 const uint8_t *buffer);
44int64_t opal_console_read(int64_t term_number, __be64 *length, 45int64_t opal_console_read(int64_t term_number, __be64 *length,
@@ -272,7 +273,7 @@ int64_t opal_xive_get_vp_info(uint64_t vp,
272int64_t opal_xive_set_vp_info(uint64_t vp, 273int64_t opal_xive_set_vp_info(uint64_t vp,
273 uint64_t flags, 274 uint64_t flags,
274 uint64_t report_cl_pair); 275 uint64_t report_cl_pair);
275int64_t opal_xive_allocate_irq(uint32_t chip_id); 276int64_t opal_xive_allocate_irq_raw(uint32_t chip_id);
276int64_t opal_xive_free_irq(uint32_t girq); 277int64_t opal_xive_free_irq(uint32_t girq);
277int64_t opal_xive_sync(uint32_t type, uint32_t id); 278int64_t opal_xive_sync(uint32_t type, uint32_t id);
278int64_t opal_xive_dump(uint32_t type, uint32_t id); 279int64_t opal_xive_dump(uint32_t type, uint32_t id);
@@ -297,6 +298,10 @@ int opal_sensor_group_clear(u32 group_hndl, int token);
297int opal_sensor_group_enable(u32 group_hndl, int token, bool enable); 298int opal_sensor_group_enable(u32 group_hndl, int token, bool enable);
298int opal_nx_coproc_init(uint32_t chip_id, uint32_t ct); 299int opal_nx_coproc_init(uint32_t chip_id, uint32_t ct);
299 300
301s64 opal_mpipl_update(enum opal_mpipl_ops op, u64 src, u64 dest, u64 size);
302s64 opal_mpipl_register_tag(enum opal_mpipl_tags tag, u64 addr);
303s64 opal_mpipl_query_tag(enum opal_mpipl_tags tag, u64 *addr);
304
300s64 opal_signal_system_reset(s32 cpu); 305s64 opal_signal_system_reset(s32 cpu);
301s64 opal_quiesce(u64 shutdown_type, s32 cpu); 306s64 opal_quiesce(u64 shutdown_type, s32 cpu);
302 307
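
An illustrative sequence (name and error handling simplified) showing how the new MPIPL wrappers fit together: preserve a source range across the next crash-initiated IPL, then register a tag so the capture kernel can locate the metadata:

static int example_mpipl_setup(u64 src, u64 dest, u64 size, u64 fdh_addr)
{
	if (opal_mpipl_update(OPAL_MPIPL_ADD_RANGE, src, dest, size) != OPAL_SUCCESS)
		return -EIO;

	if (opal_mpipl_register_tag(OPAL_MPIPL_TAG_KERNEL, fdh_addr) != OPAL_SUCCESS)
		return -EIO;

	return 0;
}
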
diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index 0d52f57fca04..c8bb14ff4713 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -215,9 +215,19 @@ static inline bool pfn_valid(unsigned long pfn)
215/* 215/*
216 * gcc miscompiles (unsigned long)(&static_var) - PAGE_OFFSET 216 * gcc miscompiles (unsigned long)(&static_var) - PAGE_OFFSET
217 * with -mcmodel=medium, so we use & and | instead of - and + on 64-bit. 217 * with -mcmodel=medium, so we use & and | instead of - and + on 64-bit.
218 * This also results in better code generation.
218 */ 219 */
219#define __va(x) ((void *)(unsigned long)((phys_addr_t)(x) | PAGE_OFFSET)) 220#define __va(x) \
220#define __pa(x) ((unsigned long)(x) & 0x0fffffffffffffffUL) 221({ \
222 VIRTUAL_BUG_ON((unsigned long)(x) >= PAGE_OFFSET); \
223 (void *)(unsigned long)((phys_addr_t)(x) | PAGE_OFFSET); \
224})
225
226#define __pa(x) \
227({ \
228 VIRTUAL_BUG_ON((unsigned long)(x) < PAGE_OFFSET); \
229 (unsigned long)(x) & 0x0fffffffffffffffUL; \
230})
221 231
222#else /* 32-bit, non book E */ 232#else /* 32-bit, non book E */
223#define __va(x) ((void *)(unsigned long)((phys_addr_t)(x) + PAGE_OFFSET - MEMORY_START)) 233#define __va(x) ((void *)(unsigned long)((phys_addr_t)(x) + PAGE_OFFSET - MEMORY_START))
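
A short sketch of what the added assertions catch, assuming VIRTUAL_BUG_ON() is a debug-only check that compiles away in normal builds:

static void example_va_pa_checks(void *linear_addr)
{
	phys_addr_t pa = __pa(linear_addr);	/* ok: a linear-map address */
	void *va = __va(pa);			/* ok: a real address below PAGE_OFFSET */

	/* __pa() on a user pointer or an already-physical value (below
	 * PAGE_OFFSET), or __va() on something that is already a kernel
	 * virtual address, would now trip the assertions. */
	(void)va;
}
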
diff --git a/arch/powerpc/include/asm/page_32.h b/arch/powerpc/include/asm/page_32.h
index 683dfbc67ca8..d64dfe3ac712 100644
--- a/arch/powerpc/include/asm/page_32.h
+++ b/arch/powerpc/include/asm/page_32.h
@@ -40,6 +40,8 @@ typedef unsigned long long pte_basic_t;
40typedef unsigned long pte_basic_t; 40typedef unsigned long pte_basic_t;
41#endif 41#endif
42 42
43#include <asm/bug.h>
44
43/* 45/*
44 * Clear page using the dcbz instruction, which doesn't cause any 46 * Clear page using the dcbz instruction, which doesn't cause any
45 * memory traffic (except to write out any cache lines which get 47 * memory traffic (except to write out any cache lines which get
@@ -49,6 +51,8 @@ static inline void clear_page(void *addr)
49{ 51{
50 unsigned int i; 52 unsigned int i;
51 53
54 WARN_ON((unsigned long)addr & (L1_CACHE_BYTES - 1));
55
52 for (i = 0; i < PAGE_SIZE / L1_CACHE_BYTES; i++, addr += L1_CACHE_BYTES) 56 for (i = 0; i < PAGE_SIZE / L1_CACHE_BYTES; i++, addr += L1_CACHE_BYTES)
53 dcbz(addr); 57 dcbz(addr);
54} 58}
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 8dad1fdf4bd2..ea6ec65970ef 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -183,6 +183,7 @@ struct iommu_table;
183struct pci_dn { 183struct pci_dn {
184 int flags; 184 int flags;
185#define PCI_DN_FLAG_IOV_VF 0x01 185#define PCI_DN_FLAG_IOV_VF 0x01
186#define PCI_DN_FLAG_DEAD 0x02 /* Device has been hot-removed */
186 187
187 int busno; /* pci bus number */ 188 int busno; /* pci bus number */
188 int devfn; /* pci device and function number */ 189 int devfn; /* pci device and function number */
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index c58ba7963688..8b7865a2d576 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -68,6 +68,8 @@ extern pgd_t swapper_pg_dir[];
68 68
69extern void paging_init(void); 69extern void paging_init(void);
70 70
71extern unsigned long ioremap_bot;
72
71/* 73/*
72 * kern_addr_valid is intended to indicate whether an address is a valid 74 * kern_addr_valid is intended to indicate whether an address is a valid
73 * kernel address. Most 32-bit archs define it as always true (like this) 75 * kernel address. Most 32-bit archs define it as always true (like this)
@@ -77,18 +79,6 @@ extern void paging_init(void);
77 79
78#include <asm-generic/pgtable.h> 80#include <asm-generic/pgtable.h>
79 81
80
81/*
82 * This gets called at the end of handling a page fault, when
83 * the kernel has put a new PTE into the page table for the process.
84 * We use it to ensure coherency between the i-cache and d-cache
85 * for the page which has just been mapped in.
86 * On machines which use an MMU hash table, we use this to put a
87 * corresponding HPTE into the hash table ahead of time, instead of
88 * waiting for the inevitable extra hash-table miss exception.
89 */
90extern void update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t *);
91
92#ifndef CONFIG_TRANSPARENT_HUGEPAGE 82#ifndef CONFIG_TRANSPARENT_HUGEPAGE
93#define pmd_large(pmd) 0 83#define pmd_large(pmd) 0
94#endif 84#endif
diff --git a/arch/powerpc/include/asm/plpar_wrappers.h b/arch/powerpc/include/asm/plpar_wrappers.h
index cff5a411e595..4497c8afb573 100644
--- a/arch/powerpc/include/asm/plpar_wrappers.h
+++ b/arch/powerpc/include/asm/plpar_wrappers.h
@@ -340,6 +340,12 @@ static inline long plpar_set_ciabr(unsigned long ciabr)
340{ 340{
341 return 0; 341 return 0;
342} 342}
343
344static inline long plpar_pte_read_4(unsigned long flags, unsigned long ptex,
345 unsigned long *ptes)
346{
347 return 0;
348}
343#endif /* CONFIG_PPC_PSERIES */ 349#endif /* CONFIG_PPC_PSERIES */
344 350
345#endif /* _ASM_POWERPC_PLPAR_WRAPPERS_H */ 351#endif /* _ASM_POWERPC_PLPAR_WRAPPERS_H */
diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h
index cec2d6409515..7f4be5a05eb3 100644
--- a/arch/powerpc/include/asm/ppc-pci.h
+++ b/arch/powerpc/include/asm/ppc-pci.h
@@ -62,11 +62,6 @@ void eeh_pe_dev_mode_mark(struct eeh_pe *pe, int mode);
62void eeh_sysfs_add_device(struct pci_dev *pdev); 62void eeh_sysfs_add_device(struct pci_dev *pdev);
63void eeh_sysfs_remove_device(struct pci_dev *pdev); 63void eeh_sysfs_remove_device(struct pci_dev *pdev);
64 64
65static inline const char *eeh_pci_name(struct pci_dev *pdev)
66{
67 return pdev ? pci_name(pdev) : "<null>";
68}
69
70static inline const char *eeh_driver_name(struct pci_dev *pdev) 65static inline const char *eeh_driver_name(struct pci_dev *pdev)
71{ 66{
72 return (pdev && pdev->driver) ? pdev->driver->name : "<null>"; 67 return (pdev && pdev->driver) ? pdev->driver->name : "<null>";
@@ -74,6 +69,8 @@ static inline const char *eeh_driver_name(struct pci_dev *pdev)
74 69
75#endif /* CONFIG_EEH */ 70#endif /* CONFIG_EEH */
76 71
72#define PCI_BUSNO(bdfn) ((bdfn >> 8) & 0xff)
73
77#else /* CONFIG_PCI */ 74#else /* CONFIG_PCI */
78static inline void init_pci_config_tokens(void) { } 75static inline void init_pci_config_tokens(void) { }
79#endif /* !CONFIG_PCI */ 76#endif /* !CONFIG_PCI */
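
Illustrative: the new PCI_BUSNO() pairs with the existing PCI_SLOT()/PCI_FUNC() macros to decode a 16-bit bdfn, which is what the EEH log helpers earlier in this series do; the function is invented:

static void example_print_bdfn(u16 bdfn)
{
	pr_info("%04x -> %02x:%02x.%x\n", bdfn,
		PCI_BUSNO(bdfn), PCI_SLOT(bdfn), PCI_FUNC(bdfn));
}
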
diff --git a/arch/powerpc/include/asm/ppc4xx_ocm.h b/arch/powerpc/include/asm/ppc4xx_ocm.h
deleted file mode 100644
index fc4db6dcde84..000000000000
--- a/arch/powerpc/include/asm/ppc4xx_ocm.h
+++ /dev/null
@@ -1,31 +0,0 @@
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * PowerPC 4xx OCM memory allocation support
4 *
5 * (C) Copyright 2009, Applied Micro Circuits Corporation
6 * Victor Gallardo (vgallardo@amcc.com)
7 *
8 * See file CREDITS for list of people who contributed to this
9 * project.
10 */
11
12#ifndef __ASM_POWERPC_PPC4XX_OCM_H__
13#define __ASM_POWERPC_PPC4XX_OCM_H__
14
15#define PPC4XX_OCM_NON_CACHED 0
16#define PPC4XX_OCM_CACHED 1
17
18#if defined(CONFIG_PPC4xx_OCM)
19
20void *ppc4xx_ocm_alloc(phys_addr_t *phys, int size, int align,
21 int flags, const char *owner);
22void ppc4xx_ocm_free(const void *virt);
23
24#else
25
26#define ppc4xx_ocm_alloc(phys, size, align, flags, owner) NULL
27#define ppc4xx_ocm_free(addr) ((void)0)
28
29#endif /* CONFIG_PPC4xx_OCM */
30
31#endif /* __ASM_POWERPC_PPC4XX_OCM_H__ */
diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
index e0637730a8e7..6b03dff61a05 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -311,18 +311,48 @@ n:
311 addis reg,reg,(name - 0b)@ha; \ 311 addis reg,reg,(name - 0b)@ha; \
312 addi reg,reg,(name - 0b)@l; 312 addi reg,reg,(name - 0b)@l;
313 313
314#ifdef __powerpc64__ 314#if defined(__powerpc64__) && defined(HAVE_AS_ATHIGH)
315#ifdef HAVE_AS_ATHIGH
316#define __AS_ATHIGH high 315#define __AS_ATHIGH high
317#else 316#else
318#define __AS_ATHIGH h 317#define __AS_ATHIGH h
319#endif 318#endif
320#define LOAD_REG_IMMEDIATE(reg,expr) \ 319
321 lis reg,(expr)@highest; \ 320.macro __LOAD_REG_IMMEDIATE_32 r, x
322 ori reg,reg,(expr)@higher; \ 321 .if (\x) >= 0x8000 || (\x) < -0x8000
323 rldicr reg,reg,32,31; \ 322 lis \r, (\x)@__AS_ATHIGH
324 oris reg,reg,(expr)@__AS_ATHIGH; \ 323 .if (\x) & 0xffff != 0
325 ori reg,reg,(expr)@l; 324 ori \r, \r, (\x)@l
325 .endif
326 .else
327 li \r, (\x)@l
328 .endif
329.endm
330
331.macro __LOAD_REG_IMMEDIATE r, x
332 .if (\x) >= 0x80000000 || (\x) < -0x80000000
333 __LOAD_REG_IMMEDIATE_32 \r, (\x) >> 32
334 sldi \r, \r, 32
335 .if (\x) & 0xffff0000 != 0
336 oris \r, \r, (\x)@__AS_ATHIGH
337 .endif
338 .if (\x) & 0xffff != 0
339 ori \r, \r, (\x)@l
340 .endif
341 .else
342 __LOAD_REG_IMMEDIATE_32 \r, \x
343 .endif
344.endm
345
346#ifdef __powerpc64__
347
348#define LOAD_REG_IMMEDIATE(reg, expr) __LOAD_REG_IMMEDIATE reg, expr
349
350#define LOAD_REG_IMMEDIATE_SYM(reg, tmp, expr) \
351 lis tmp, (expr)@highest; \
352 lis reg, (expr)@__AS_ATHIGH; \
353 ori tmp, tmp, (expr)@higher; \
354 ori reg, reg, (expr)@l; \
355 rldimi reg, tmp, 32, 0
326 356
327#define LOAD_REG_ADDR(reg,name) \ 357#define LOAD_REG_ADDR(reg,name) \
328 ld reg,name@got(r2) 358 ld reg,name@got(r2)
@@ -335,11 +365,13 @@ n:
335 365
336#else /* 32-bit */ 366#else /* 32-bit */
337 367
338#define LOAD_REG_IMMEDIATE(reg,expr) \ 368#define LOAD_REG_IMMEDIATE(reg, expr) __LOAD_REG_IMMEDIATE_32 reg, expr
369
370#define LOAD_REG_IMMEDIATE_SYM(reg,expr) \
339 lis reg,(expr)@ha; \ 371 lis reg,(expr)@ha; \
340 addi reg,reg,(expr)@l; 372 addi reg,reg,(expr)@l;
341 373
342#define LOAD_REG_ADDR(reg,name) LOAD_REG_IMMEDIATE(reg, name) 374#define LOAD_REG_ADDR(reg,name) LOAD_REG_IMMEDIATE_SYM(reg, name)
343 375
344#define LOAD_REG_ADDRBASE(reg, name) lis reg,name@ha 376#define LOAD_REG_ADDRBASE(reg, name) lis reg,name@ha
345#define ADDROFF(name) name@l 377#define ADDROFF(name) name@l
@@ -351,19 +383,9 @@ n:
351 383
352/* various errata or part fixups */ 384/* various errata or part fixups */
353#ifdef CONFIG_PPC601_SYNC_FIX 385#ifdef CONFIG_PPC601_SYNC_FIX
354#define SYNC \ 386#define SYNC sync; isync
355BEGIN_FTR_SECTION \ 387#define SYNC_601 sync
356 sync; \ 388#define ISYNC_601 isync
357 isync; \
358END_FTR_SECTION_IFSET(CPU_FTR_601)
359#define SYNC_601 \
360BEGIN_FTR_SECTION \
361 sync; \
362END_FTR_SECTION_IFSET(CPU_FTR_601)
363#define ISYNC_601 \
364BEGIN_FTR_SECTION \
365 isync; \
366END_FTR_SECTION_IFSET(CPU_FTR_601)
367#else 389#else
368#define SYNC 390#define SYNC
369#define SYNC_601 391#define SYNC_601
@@ -389,15 +411,11 @@ END_FTR_SECTION_NESTED(CPU_FTR_CELL_TB_BUG, CPU_FTR_CELL_TB_BUG, 96)
389#define MFTBU(dest) mfspr dest, SPRN_TBRU 411#define MFTBU(dest) mfspr dest, SPRN_TBRU
390#endif 412#endif
391 413
392#ifndef CONFIG_SMP
393#define TLBSYNC
394#else /* CONFIG_SMP */
395/* tlbsync is not implemented on 601 */ 414/* tlbsync is not implemented on 601 */
396#define TLBSYNC \ 415#if !defined(CONFIG_SMP) || defined(CONFIG_PPC_BOOK3S_601)
397BEGIN_FTR_SECTION \ 416#define TLBSYNC
398 tlbsync; \ 417#else
399 sync; \ 418#define TLBSYNC tlbsync; sync
400END_FTR_SECTION_IFCLR(CPU_FTR_601)
401#endif 419#endif
402 420
403#ifdef CONFIG_PPC64 421#ifdef CONFIG_PPC64
diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h
index feee1b21bbd5..ee3ada66deb5 100644
--- a/arch/powerpc/include/asm/ptrace.h
+++ b/arch/powerpc/include/asm/ptrace.h
@@ -203,7 +203,11 @@ do { \
203#endif /* __powerpc64__ */ 203#endif /* __powerpc64__ */
204 204
205#define arch_has_single_step() (1) 205#define arch_has_single_step() (1)
206#define arch_has_block_step() (!cpu_has_feature(CPU_FTR_601)) 206#ifndef CONFIG_BOOK3S_601
207#define arch_has_block_step() (true)
208#else
209#define arch_has_block_step() (false)
210#endif
207#define ARCH_HAS_USER_SINGLE_STEP_REPORT 211#define ARCH_HAS_USER_SINGLE_STEP_REPORT
208 212
209/* 213/*
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 10caa145f98b..ec3714cf0989 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -38,6 +38,7 @@
38#define MSR_TM_LG 32 /* Trans Mem Available */ 38#define MSR_TM_LG 32 /* Trans Mem Available */
39#define MSR_VEC_LG 25 /* Enable AltiVec */ 39#define MSR_VEC_LG 25 /* Enable AltiVec */
40#define MSR_VSX_LG 23 /* Enable VSX */ 40#define MSR_VSX_LG 23 /* Enable VSX */
41#define MSR_S_LG 22 /* Secure state */
41#define MSR_POW_LG 18 /* Enable Power Management */ 42#define MSR_POW_LG 18 /* Enable Power Management */
42#define MSR_WE_LG 18 /* Wait State Enable */ 43#define MSR_WE_LG 18 /* Wait State Enable */
43#define MSR_TGPR_LG 17 /* TLB Update registers in use */ 44#define MSR_TGPR_LG 17 /* TLB Update registers in use */
@@ -71,11 +72,13 @@
71#define MSR_SF __MASK(MSR_SF_LG) /* Enable 64 bit mode */ 72#define MSR_SF __MASK(MSR_SF_LG) /* Enable 64 bit mode */
72#define MSR_ISF __MASK(MSR_ISF_LG) /* Interrupt 64b mode valid on 630 */ 73#define MSR_ISF __MASK(MSR_ISF_LG) /* Interrupt 64b mode valid on 630 */
73#define MSR_HV __MASK(MSR_HV_LG) /* Hypervisor state */ 74#define MSR_HV __MASK(MSR_HV_LG) /* Hypervisor state */
75#define MSR_S __MASK(MSR_S_LG) /* Secure state */
74#else 76#else
75/* so tests for these bits fail on 32-bit */ 77/* so tests for these bits fail on 32-bit */
76#define MSR_SF 0 78#define MSR_SF 0
77#define MSR_ISF 0 79#define MSR_ISF 0
78#define MSR_HV 0 80#define MSR_HV 0
81#define MSR_S 0
79#endif 82#endif
80 83
81/* 84/*
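
A hedged sketch: the new secure-state bit can be tested with the existing mfmsr() accessor, and because MSR_S is defined as 0 on 32-bit the test folds away there:

static bool example_running_secure(void)
{
	return !!(mfmsr() & MSR_S);
}
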
diff --git a/arch/powerpc/include/asm/scom.h b/arch/powerpc/include/asm/scom.h
deleted file mode 100644
index 08c44396e54a..000000000000
--- a/arch/powerpc/include/asm/scom.h
+++ /dev/null
@@ -1,154 +0,0 @@
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * Copyright 2010 Benjamin Herrenschmidt, IBM Corp
4 * <benh@kernel.crashing.org>
5 * and David Gibson, IBM Corporation.
6 */
7
8#ifndef _ASM_POWERPC_SCOM_H
9#define _ASM_POWERPC_SCOM_H
10
11#ifdef __KERNEL__
12#ifndef __ASSEMBLY__
13#ifdef CONFIG_PPC_SCOM
14
15/*
16 * The SCOM bus is a sideband bus used for accessing various internal
17 * registers of the processor or the chipset. The implementation details
18 * differ between processors and platforms, and the access method as
19 * well.
20 *
 21 * This API allows "mapping" ranges of SCOM register numbers associated
 22 * with a given SCOM controller. The latter must be represented by a
 23 * device node, though some implementations might support NULL if there
 24 * is no possible ambiguity.
 25 *
 26 * Then, scom_read/scom_write can be used to access registers inside
27 * that range. The argument passed is a register number relative to
28 * the beginning of the range mapped.
29 */
30
31typedef void *scom_map_t;
32
33/* Value for an invalid SCOM map */
34#define SCOM_MAP_INVALID (NULL)
35
36/* The scom_controller data structure is what the platform passes
37 * to the core code in scom_init, it provides the actual implementation
38 * of all the SCOM functions
39 */
40struct scom_controller {
41 scom_map_t (*map)(struct device_node *ctrl_dev, u64 reg, u64 count);
42 void (*unmap)(scom_map_t map);
43
44 int (*read)(scom_map_t map, u64 reg, u64 *value);
45 int (*write)(scom_map_t map, u64 reg, u64 value);
46};
47
48extern const struct scom_controller *scom_controller;
49
50/**
51 * scom_init - Initialize the SCOM backend, called by the platform
52 * @controller: The platform SCOM controller
53 */
54static inline void scom_init(const struct scom_controller *controller)
55{
56 scom_controller = controller;
57}
58
59/**
 60 * scom_map_ok - Test if a SCOM mapping is successful
61 * @map: The result of scom_map to test
62 */
63static inline int scom_map_ok(scom_map_t map)
64{
65 return map != SCOM_MAP_INVALID;
66}
67
68/**
69 * scom_map - Map a block of SCOM registers
70 * @ctrl_dev: Device node of the SCOM controller
71 * some implementations allow NULL here
72 * @reg: first SCOM register to map
73 * @count: Number of SCOM registers to map
74 */
75
76static inline scom_map_t scom_map(struct device_node *ctrl_dev,
77 u64 reg, u64 count)
78{
79 return scom_controller->map(ctrl_dev, reg, count);
80}
81
82/**
83 * scom_find_parent - Find the SCOM controller for a device
84 * @dev: OF node of the device
85 *
86 * This is not meant for general usage, but in combination with
 87 * scom_map() allows mapping registers not represented by the
 88 * device's own scom-reg property. Useful for applying HW workarounds
89 * on things not properly represented in the device-tree for example.
90 */
91struct device_node *scom_find_parent(struct device_node *dev);
92
93
94/**
95 * scom_map_device - Map a device's block of SCOM registers
96 * @dev: OF node of the device
97 * @index: Register bank index (index in "scom-reg" property)
98 *
99 * This function will use the device-tree binding for SCOM which
100 * is to follow "scom-parent" properties until it finds a node with
101 * a "scom-controller" property to find the controller. It will then
102 * use the "scom-reg" property which is made of reg/count pairs,
103 * each of them having a size defined by the controller's #scom-cells
104 * property
105 */
106extern scom_map_t scom_map_device(struct device_node *dev, int index);
107
108
109/**
110 * scom_unmap - Unmap a block of SCOM registers
111 * @map: Result of scom_map is to be unmapped
112 */
113static inline void scom_unmap(scom_map_t map)
114{
115 if (scom_map_ok(map))
116 scom_controller->unmap(map);
117}
118
119/**
120 * scom_read - Read a SCOM register
121 * @map: Result of scom_map
122 * @reg: Register index within that map
123 * @value: Updated with the value read
124 *
125 * Returns 0 (success) or a negative error code
126 */
127static inline int scom_read(scom_map_t map, u64 reg, u64 *value)
128{
129 int rc;
130
131 rc = scom_controller->read(map, reg, value);
132 if (rc)
133 *value = 0xfffffffffffffffful;
134 return rc;
135}
136
137/**
138 * scom_write - Write to a SCOM register
139 * @map: Result of scom_map
140 * @reg: Register index within that map
141 * @value: Value to write
142 *
143 * Returns 0 (success) or a negative error code
144 */
145static inline int scom_write(scom_map_t map, u64 reg, u64 value)
146{
147 return scom_controller->write(map, reg, value);
148}
149
150
151#endif /* CONFIG_PPC_SCOM */
152#endif /* __ASSEMBLY__ */
153#endif /* __KERNEL__ */
154#endif /* _ASM_POWERPC_SCOM_H */
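
For reference, a minimal sketch of how the interface removed above was used, assuming a platform that registered a scom_controller and a device node carrying a "scom-reg" property (illustrative only, not taken from this patch):

/* Illustrative only: read one SCOM register from a device's first
 * "scom-reg" bank using the interface deleted above. */
static int example_read_scom(struct device_node *dev, u64 *val)
{
	scom_map_t map = scom_map_device(dev, 0);	/* bank 0 of "scom-reg" */
	int rc;

	if (!scom_map_ok(map))
		return -ENODEV;

	rc = scom_read(map, 0, val);	/* register 0 within the mapped range */
	scom_unmap(map);
	return rc;
}
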
diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h
index 4a1664a8658d..5a9b6eb651b6 100644
--- a/arch/powerpc/include/asm/sections.h
+++ b/arch/powerpc/include/asm/sections.h
@@ -61,17 +61,6 @@ static inline int overlaps_kernel_text(unsigned long start, unsigned long end)
61 (unsigned long)_stext < end; 61 (unsigned long)_stext < end;
62} 62}
63 63
64static inline int overlaps_kvm_tmp(unsigned long start, unsigned long end)
65{
66#ifdef CONFIG_KVM_GUEST
67 extern char kvm_tmp[];
68 return start < (unsigned long)kvm_tmp &&
69 (unsigned long)&kvm_tmp[1024 * 1024] < end;
70#else
71 return 0;
72#endif
73}
74
75#ifdef PPC64_ELF_ABI_v1 64#ifdef PPC64_ELF_ABI_v1
76 65
77#define HAVE_DEREFERENCE_FUNCTION_DESCRIPTOR 1 66#define HAVE_DEREFERENCE_FUNCTION_DESCRIPTOR 1
diff --git a/arch/powerpc/include/asm/setjmp.h b/arch/powerpc/include/asm/setjmp.h
index d995061f5f86..e9f81bb3f83b 100644
--- a/arch/powerpc/include/asm/setjmp.h
+++ b/arch/powerpc/include/asm/setjmp.h
@@ -7,7 +7,7 @@
7 7
8#define JMP_BUF_LEN 23 8#define JMP_BUF_LEN 23
9 9
10extern long setjmp(long *); 10extern long setjmp(long *) __attribute__((returns_twice));
11extern void longjmp(long *, long); 11extern void longjmp(long *, long) __attribute__((noreturn));
12 12
13#endif /* _ASM_POWERPC_SETJMP_H */ 13#endif /* _ASM_POWERPC_SETJMP_H */
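
The attributes matter because the kernel uses setjmp()/longjmp() for fault recovery (xmon and crash paths): returns_twice stops the compiler from assuming setjmp() returns only once, and noreturn lets it treat code after longjmp() as unreachable. A minimal sketch of that pattern, with recover_buf being a name made up for illustration:

/* Illustrative only: probe an address that may fault; the fault handler
 * is assumed to longjmp() back through recover_buf. */
static long recover_buf[JMP_BUF_LEN];

static int example_probe(volatile unsigned long *addr, unsigned long *val)
{
	if (setjmp(recover_buf) != 0)
		return -EFAULT;		/* second return, via longjmp() */

	*val = *addr;			/* access that may fault */
	return 0;
}
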
diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index a47f827bc5f1..e9a960e28f3c 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -101,15 +101,43 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock)
101 101
102#if defined(CONFIG_PPC_SPLPAR) 102#if defined(CONFIG_PPC_SPLPAR)
103/* We only yield to the hypervisor if we are in shared processor mode */ 103/* We only yield to the hypervisor if we are in shared processor mode */
104#define SHARED_PROCESSOR (lppaca_shared_proc(local_paca->lppaca_ptr)) 104void splpar_spin_yield(arch_spinlock_t *lock);
105extern void __spin_yield(arch_spinlock_t *lock); 105void splpar_rw_yield(arch_rwlock_t *lock);
106extern void __rw_yield(arch_rwlock_t *lock);
107#else /* SPLPAR */ 106#else /* SPLPAR */
108#define __spin_yield(x) barrier() 107static inline void splpar_spin_yield(arch_spinlock_t *lock) {};
109#define __rw_yield(x) barrier() 108static inline void splpar_rw_yield(arch_rwlock_t *lock) {};
110#define SHARED_PROCESSOR 0
111#endif 109#endif
112 110
111static inline bool is_shared_processor(void)
112{
113/*
114 * LPPACA is only available on Pseries so guard anything LPPACA related to
115 * allow other platforms (which include this common header) to compile.
116 */
117#ifdef CONFIG_PPC_PSERIES
118 return (IS_ENABLED(CONFIG_PPC_SPLPAR) &&
119 lppaca_shared_proc(local_paca->lppaca_ptr));
120#else
121 return false;
122#endif
123}
124
125static inline void spin_yield(arch_spinlock_t *lock)
126{
127 if (is_shared_processor())
128 splpar_spin_yield(lock);
129 else
130 barrier();
131}
132
133static inline void rw_yield(arch_rwlock_t *lock)
134{
135 if (is_shared_processor())
136 splpar_rw_yield(lock);
137 else
138 barrier();
139}
140
113static inline void arch_spin_lock(arch_spinlock_t *lock) 141static inline void arch_spin_lock(arch_spinlock_t *lock)
114{ 142{
115 while (1) { 143 while (1) {
@@ -117,8 +145,8 @@ static inline void arch_spin_lock(arch_spinlock_t *lock)
117 break; 145 break;
118 do { 146 do {
119 HMT_low(); 147 HMT_low();
120 if (SHARED_PROCESSOR) 148 if (is_shared_processor())
121 __spin_yield(lock); 149 splpar_spin_yield(lock);
122 } while (unlikely(lock->slock != 0)); 150 } while (unlikely(lock->slock != 0));
123 HMT_medium(); 151 HMT_medium();
124 } 152 }
@@ -136,8 +164,8 @@ void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
136 local_irq_restore(flags); 164 local_irq_restore(flags);
137 do { 165 do {
138 HMT_low(); 166 HMT_low();
139 if (SHARED_PROCESSOR) 167 if (is_shared_processor())
140 __spin_yield(lock); 168 splpar_spin_yield(lock);
141 } while (unlikely(lock->slock != 0)); 169 } while (unlikely(lock->slock != 0));
142 HMT_medium(); 170 HMT_medium();
143 local_irq_restore(flags_dis); 171 local_irq_restore(flags_dis);
@@ -226,8 +254,8 @@ static inline void arch_read_lock(arch_rwlock_t *rw)
226 break; 254 break;
227 do { 255 do {
228 HMT_low(); 256 HMT_low();
229 if (SHARED_PROCESSOR) 257 if (is_shared_processor())
230 __rw_yield(rw); 258 splpar_rw_yield(rw);
231 } while (unlikely(rw->lock < 0)); 259 } while (unlikely(rw->lock < 0));
232 HMT_medium(); 260 HMT_medium();
233 } 261 }
@@ -240,8 +268,8 @@ static inline void arch_write_lock(arch_rwlock_t *rw)
240 break; 268 break;
241 do { 269 do {
242 HMT_low(); 270 HMT_low();
243 if (SHARED_PROCESSOR) 271 if (is_shared_processor())
244 __rw_yield(rw); 272 splpar_rw_yield(rw);
245 } while (unlikely(rw->lock != 0)); 273 } while (unlikely(rw->lock != 0));
246 HMT_medium(); 274 HMT_medium();
247 } 275 }
@@ -281,9 +309,9 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
281 rw->lock = 0; 309 rw->lock = 0;
282} 310}
283 311
284#define arch_spin_relax(lock) __spin_yield(lock) 312#define arch_spin_relax(lock) spin_yield(lock)
285#define arch_read_relax(lock) __rw_yield(lock) 313#define arch_read_relax(lock) rw_yield(lock)
286#define arch_write_relax(lock) __rw_yield(lock) 314#define arch_write_relax(lock) rw_yield(lock)
287 315
288/* See include/linux/spinlock.h */ 316/* See include/linux/spinlock.h */
289#define smp_mb__after_spinlock() smp_mb() 317#define smp_mb__after_spinlock() smp_mb()
diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h
index 9bf6dffb4090..b72692702f35 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -53,7 +53,9 @@ void *__memmove(void *to, const void *from, __kernel_size_t n);
53#ifndef CONFIG_KASAN 53#ifndef CONFIG_KASAN
54#define __HAVE_ARCH_MEMSET32 54#define __HAVE_ARCH_MEMSET32
55#define __HAVE_ARCH_MEMSET64 55#define __HAVE_ARCH_MEMSET64
56#define __HAVE_ARCH_MEMCPY_MCSAFE
56 57
58extern int memcpy_mcsafe(void *dst, const void *src, __kernel_size_t sz);
57extern void *__memset16(uint16_t *, uint16_t v, __kernel_size_t); 59extern void *__memset16(uint16_t *, uint16_t v, __kernel_size_t);
58extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t); 60extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t);
59extern void *__memset64(uint64_t *, uint64_t v, __kernel_size_t); 61extern void *__memset64(uint64_t *, uint64_t v, __kernel_size_t);
diff --git a/arch/powerpc/include/asm/svm.h b/arch/powerpc/include/asm/svm.h
new file mode 100644
index 000000000000..85580b30aba4
--- /dev/null
+++ b/arch/powerpc/include/asm/svm.h
@@ -0,0 +1,31 @@
1/* SPDX-License-Identifier: GPL-2.0+ */
2/*
3 * SVM helper functions
4 *
5 * Copyright 2018 Anshuman Khandual, IBM Corporation.
6 */
7
8#ifndef _ASM_POWERPC_SVM_H
9#define _ASM_POWERPC_SVM_H
10
11#ifdef CONFIG_PPC_SVM
12
13static inline bool is_secure_guest(void)
14{
15 return mfmsr() & MSR_S;
16}
17
18void dtl_cache_ctor(void *addr);
19#define get_dtl_cache_ctor() (is_secure_guest() ? dtl_cache_ctor : NULL)
20
21#else /* CONFIG_PPC_SVM */
22
23static inline bool is_secure_guest(void)
24{
25 return false;
26}
27
28#define get_dtl_cache_ctor() NULL
29
30#endif /* CONFIG_PPC_SVM */
31#endif /* _ASM_POWERPC_SVM_H */
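
A minimal sketch of the intended caller pattern, assuming the ucall wrappers added in asm/ultravisor.h further down (illustrative, not part of this hunk): a secure guest checks is_secure_guest() before issuing ultravisor-specific calls.

#include <asm/svm.h>
#include <asm/ultravisor.h>

/* Illustrative only: before kexec'ing a new kernel, a secure VM returns
 * any pages it had shared with the hypervisor. */
static void example_prepare_kexec(void)
{
	if (is_secure_guest())
		uv_unshare_all_pages();
}
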
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 54f4ec1f9fab..08dbe3e6831c 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -41,11 +41,7 @@ struct div_result {
41 41
42/* Accessor functions for the timebase (RTC on 601) registers. */ 42/* Accessor functions for the timebase (RTC on 601) registers. */
43/* If one day CONFIG_POWER is added just define __USE_RTC as 1 */ 43/* If one day CONFIG_POWER is added just define __USE_RTC as 1 */
44#ifdef CONFIG_PPC_BOOK3S_32 44#define __USE_RTC() (IS_ENABLED(CONFIG_PPC_BOOK3S_601))
45#define __USE_RTC() (cpu_has_feature(CPU_FTR_USE_RTC))
46#else
47#define __USE_RTC() 0
48#endif
49 45
50#ifdef CONFIG_PPC64 46#ifdef CONFIG_PPC64
51 47
diff --git a/arch/powerpc/include/asm/timex.h b/arch/powerpc/include/asm/timex.h
index 926b9f91a3ef..d2d2c4bd8435 100644
--- a/arch/powerpc/include/asm/timex.h
+++ b/arch/powerpc/include/asm/timex.h
@@ -17,38 +17,10 @@ typedef unsigned long cycles_t;
17 17
18static inline cycles_t get_cycles(void) 18static inline cycles_t get_cycles(void)
19{ 19{
 20#ifdef __powerpc64__ 20 if (IS_ENABLED(CONFIG_PPC_BOOK3S_601))
21 return 0;
22
21 return mftb(); 23 return mftb();
22#else
23 cycles_t ret;
24
25 /*
26 * For the "cycle" counter we use the timebase lower half.
27 * Currently only used on SMP.
28 */
29
30 ret = 0;
31
32 __asm__ __volatile__(
33#ifdef CONFIG_PPC_8xx
34 "97: mftb %0\n"
35#else
36 "97: mfspr %0, %2\n"
37#endif
38 "99:\n"
39 ".section __ftr_fixup,\"a\"\n"
40 ".align 2\n"
41 "98:\n"
42 " .long %1\n"
43 " .long 0\n"
44 " .long 97b-98b\n"
45 " .long 99b-98b\n"
46 " .long 0\n"
47 " .long 0\n"
48 ".previous"
49 : "=r" (ret) : "i" (CPU_FTR_601), "i" (SPRN_TBRL));
50 return ret;
51#endif
52} 24}
53 25
54#endif /* __KERNEL__ */ 26#endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
index 8b03eb44e876..15002b51ff18 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -387,6 +387,20 @@ static inline unsigned long raw_copy_to_user(void __user *to,
387 return ret; 387 return ret;
388} 388}
389 389
390static __always_inline unsigned long __must_check
391copy_to_user_mcsafe(void __user *to, const void *from, unsigned long n)
392{
393 if (likely(check_copy_size(from, n, true))) {
394 if (access_ok(to, n)) {
395 allow_write_to_user(to, n);
396 n = memcpy_mcsafe((void *)to, from, n);
397 prevent_write_to_user(to, n);
398 }
399 }
400
401 return n;
402}
403
390extern unsigned long __clear_user(void __user *addr, unsigned long size); 404extern unsigned long __clear_user(void __user *addr, unsigned long size);
391 405
392static inline unsigned long clear_user(void __user *addr, unsigned long size) 406static inline unsigned long clear_user(void __user *addr, unsigned long size)
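
A sketch of a likely caller (names are illustrative): like copy_to_user(), copy_to_user_mcsafe() returns the number of bytes left uncopied, so a machine check while reading the source shows up as a short copy instead of taking the kernel down.

/* Illustrative only: copy from possibly-poisoned persistent memory to
 * user space. */
static ssize_t example_pmem_read(void __user *ubuf, void *src, size_t len)
{
	unsigned long left = copy_to_user_mcsafe(ubuf, src, len);

	if (left == len)
		return -EFAULT;
	return len - left;
}
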
diff --git a/arch/powerpc/include/asm/ultravisor-api.h b/arch/powerpc/include/asm/ultravisor-api.h
new file mode 100644
index 000000000000..4fcda1d5793d
--- /dev/null
+++ b/arch/powerpc/include/asm/ultravisor-api.h
@@ -0,0 +1,33 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * Ultravisor API.
4 *
5 * Copyright 2019, IBM Corporation.
6 *
7 */
8#ifndef _ASM_POWERPC_ULTRAVISOR_API_H
9#define _ASM_POWERPC_ULTRAVISOR_API_H
10
11#include <asm/hvcall.h>
12
13/* Return codes */
14#define U_BUSY H_BUSY
15#define U_FUNCTION H_FUNCTION
16#define U_NOT_AVAILABLE H_NOT_AVAILABLE
17#define U_P2 H_P2
18#define U_P3 H_P3
19#define U_P4 H_P4
20#define U_P5 H_P5
21#define U_PARAMETER H_PARAMETER
22#define U_PERMISSION H_PERMISSION
23#define U_SUCCESS H_SUCCESS
24
25/* opcodes */
26#define UV_WRITE_PATE 0xF104
27#define UV_RETURN 0xF11C
28#define UV_ESM 0xF110
29#define UV_SHARE_PAGE 0xF130
30#define UV_UNSHARE_PAGE 0xF134
31#define UV_UNSHARE_ALL_PAGES 0xF140
32
33#endif /* _ASM_POWERPC_ULTRAVISOR_API_H */
diff --git a/arch/powerpc/include/asm/ultravisor.h b/arch/powerpc/include/asm/ultravisor.h
new file mode 100644
index 000000000000..b1bc2e043ed4
--- /dev/null
+++ b/arch/powerpc/include/asm/ultravisor.h
@@ -0,0 +1,49 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * Ultravisor definitions
4 *
5 * Copyright 2019, IBM Corporation.
6 *
7 */
8#ifndef _ASM_POWERPC_ULTRAVISOR_H
9#define _ASM_POWERPC_ULTRAVISOR_H
10
11#include <asm/asm-prototypes.h>
12#include <asm/ultravisor-api.h>
13#include <asm/firmware.h>
14
15int early_init_dt_scan_ultravisor(unsigned long node, const char *uname,
16 int depth, void *data);
17
18/*
19 * In ultravisor enabled systems, PTCR becomes ultravisor privileged only for
20 * writing and an attempt to write to it will cause a Hypervisor Emulation
21 * Assistance interrupt.
22 */
23static inline void set_ptcr_when_no_uv(u64 val)
24{
25 if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
26 mtspr(SPRN_PTCR, val);
27}
28
29static inline int uv_register_pate(u64 lpid, u64 dw0, u64 dw1)
30{
31 return ucall_norets(UV_WRITE_PATE, lpid, dw0, dw1);
32}
33
34static inline int uv_share_page(u64 pfn, u64 npages)
35{
36 return ucall_norets(UV_SHARE_PAGE, pfn, npages);
37}
38
39static inline int uv_unshare_page(u64 pfn, u64 npages)
40{
41 return ucall_norets(UV_UNSHARE_PAGE, pfn, npages);
42}
43
44static inline int uv_unshare_all_pages(void)
45{
46 return ucall_norets(UV_UNSHARE_ALL_PAGES);
47}
48
49#endif /* _ASM_POWERPC_ULTRAVISOR_H */
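
A simplified sketch of why both helpers exist (condensed from the partition-table code, details assumed): with an ultravisor the partition table and PTCR are ultravisor-owned, so new entries are registered with a ucall, while on bare metal the kernel still points the PTCR at its own copy of the table.

/* Illustrative only: install a partition-table entry for an LPID. */
static void example_set_pate(u64 lpid, u64 dw0, u64 dw1)
{
	/* the normal-memory copy is kept for the Nest MMU */
	partition_tb[lpid].patb0 = cpu_to_be64(dw0);
	partition_tb[lpid].patb1 = cpu_to_be64(dw1);

	if (firmware_has_feature(FW_FEATURE_ULTRAVISOR))
		uv_register_pate(lpid, dw0, dw1);
}

/* Illustrative only: done once at MMU init; a no-op under an ultravisor. */
static void example_ptcr_init(void)
{
	set_ptcr_when_no_uv(__pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
}
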
diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
index 818989e11678..24cdf97376c4 100644
--- a/arch/powerpc/include/asm/xive.h
+++ b/arch/powerpc/include/asm/xive.h
@@ -99,6 +99,7 @@ extern void xive_flush_interrupt(void);
99 99
100/* xmon hook */ 100/* xmon hook */
101extern void xmon_xive_do_dump(int cpu); 101extern void xmon_xive_do_dump(int cpu);
102extern int xmon_xive_get_irq_config(u32 hw_irq, struct irq_data *d);
102 103
103/* APIs used by KVM */ 104/* APIs used by KVM */
104extern u32 xive_native_default_eq_shift(void); 105extern u32 xive_native_default_eq_shift(void);
diff --git a/arch/powerpc/kernel/.gitignore b/arch/powerpc/kernel/.gitignore
index c5f676c3c224..67ebd3003c05 100644
--- a/arch/powerpc/kernel/.gitignore
+++ b/arch/powerpc/kernel/.gitignore
@@ -1 +1,2 @@
1prom_init_check
1vmlinux.lds 2vmlinux.lds
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 56dfa7a2a6f2..a7ca8fe62368 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -52,7 +52,7 @@ obj-y := cputable.o ptrace.o syscalls.o \
52 of_platform.o prom_parse.o 52 of_platform.o prom_parse.o
53obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \ 53obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \
54 signal_64.o ptrace32.o \ 54 signal_64.o ptrace32.o \
55 paca.o nvram_64.o firmware.o 55 paca.o nvram_64.o firmware.o note.o
56obj-$(CONFIG_VDSO32) += vdso32/ 56obj-$(CONFIG_VDSO32) += vdso32/
57obj-$(CONFIG_PPC_WATCHDOG) += watchdog.o 57obj-$(CONFIG_PPC_WATCHDOG) += watchdog.o
58obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 58obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
@@ -78,7 +78,9 @@ obj-$(CONFIG_EEH) += eeh.o eeh_pe.o eeh_dev.o eeh_cache.o \
78 eeh_driver.o eeh_event.o eeh_sysfs.o 78 eeh_driver.o eeh_event.o eeh_sysfs.o
79obj-$(CONFIG_GENERIC_TBSYNC) += smp-tbsync.o 79obj-$(CONFIG_GENERIC_TBSYNC) += smp-tbsync.o
80obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 80obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
81obj-$(CONFIG_FA_DUMP) += fadump.o 81ifneq ($(CONFIG_FA_DUMP)$(CONFIG_PRESERVE_FA_DUMP),)
82obj-y += fadump.o
83endif
82ifdef CONFIG_PPC32 84ifdef CONFIG_PPC32
83obj-$(CONFIG_E500) += idle_e500.o 85obj-$(CONFIG_E500) += idle_e500.o
84endif 86endif
@@ -155,6 +157,9 @@ endif
155 157
156obj-$(CONFIG_EPAPR_PARAVIRT) += epapr_paravirt.o epapr_hcalls.o 158obj-$(CONFIG_EPAPR_PARAVIRT) += epapr_paravirt.o epapr_hcalls.o
157obj-$(CONFIG_KVM_GUEST) += kvm.o kvm_emul.o 159obj-$(CONFIG_KVM_GUEST) += kvm.o kvm_emul.o
160ifneq ($(CONFIG_PPC_POWERNV)$(CONFIG_PPC_SVM),)
161obj-y += ucall.o
162endif
158 163
159# Disable GCOV, KCOV & sanitizers in odd or sensitive code 164# Disable GCOV, KCOV & sanitizers in odd or sensitive code
160GCOV_PROFILE_prom_init.o := n 165GCOV_PROFILE_prom_init.o := n
@@ -184,15 +189,13 @@ extra-$(CONFIG_ALTIVEC) += vector.o
184extra-$(CONFIG_PPC64) += entry_64.o 189extra-$(CONFIG_PPC64) += entry_64.o
185extra-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE) += prom_init.o 190extra-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE) += prom_init.o
186 191
187ifdef CONFIG_PPC_OF_BOOT_TRAMPOLINE 192extra-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE) += prom_init_check
188$(obj)/built-in.a: prom_init_check
189 193
190quiet_cmd_prom_init_check = CALL $< 194quiet_cmd_prom_init_check = PROMCHK $@
191 cmd_prom_init_check = $(CONFIG_SHELL) $< "$(NM)" "$(obj)/prom_init.o" 195 cmd_prom_init_check = $(CONFIG_SHELL) $< "$(NM)" $(obj)/prom_init.o; touch $@
192 196
193PHONY += prom_init_check 197$(obj)/prom_init_check: $(src)/prom_init_check.sh $(obj)/prom_init.o FORCE
194prom_init_check: $(src)/prom_init_check.sh $(obj)/prom_init.o 198 $(call if_changed,prom_init_check)
195 $(call cmd,prom_init_check) 199targets += prom_init_check
196endif
197 200
198clean-files := vmlinux.lds 201clean-files := vmlinux.lds
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 4ccb6b3a7fbd..484f54dab247 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -506,6 +506,7 @@ int main(void)
506 OFFSET(KVM_VRMA_SLB_V, kvm, arch.vrma_slb_v); 506 OFFSET(KVM_VRMA_SLB_V, kvm, arch.vrma_slb_v);
507 OFFSET(KVM_RADIX, kvm, arch.radix); 507 OFFSET(KVM_RADIX, kvm, arch.radix);
508 OFFSET(KVM_FWNMI, kvm, arch.fwnmi_enabled); 508 OFFSET(KVM_FWNMI, kvm, arch.fwnmi_enabled);
509 OFFSET(KVM_SECURE_GUEST, kvm, arch.secure_guest);
509 OFFSET(VCPU_DSISR, kvm_vcpu, arch.shregs.dsisr); 510 OFFSET(VCPU_DSISR, kvm_vcpu, arch.shregs.dsisr);
510 OFFSET(VCPU_DAR, kvm_vcpu, arch.shregs.dar); 511 OFFSET(VCPU_DAR, kvm_vcpu, arch.shregs.dar);
511 OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr); 512 OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr);
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index bfe5f4a2886b..e745abc5457a 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -569,7 +569,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
569#endif /* CONFIG_PPC_BOOK3S_64 */ 569#endif /* CONFIG_PPC_BOOK3S_64 */
570 570
571#ifdef CONFIG_PPC32 571#ifdef CONFIG_PPC32
572#ifdef CONFIG_PPC_BOOK3S_32 572#ifdef CONFIG_PPC_BOOK3S_601
573 { /* 601 */ 573 { /* 601 */
574 .pvr_mask = 0xffff0000, 574 .pvr_mask = 0xffff0000,
575 .pvr_value = 0x00010000, 575 .pvr_value = 0x00010000,
@@ -583,6 +583,8 @@ static struct cpu_spec __initdata cpu_specs[] = {
583 .machine_check = machine_check_generic, 583 .machine_check = machine_check_generic,
584 .platform = "ppc601", 584 .platform = "ppc601",
585 }, 585 },
586#endif /* CONFIG_PPC_BOOK3S_601 */
587#ifdef CONFIG_PPC_BOOK3S_6xx
586 { /* 603 */ 588 { /* 603 */
587 .pvr_mask = 0xffff0000, 589 .pvr_mask = 0xffff0000,
588 .pvr_value = 0x00030000, 590 .pvr_value = 0x00030000,
@@ -1212,7 +1214,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
1212 .machine_check = machine_check_generic, 1214 .machine_check = machine_check_generic,
1213 .platform = "ppc603", 1215 .platform = "ppc603",
1214 }, 1216 },
1215#endif /* CONFIG_PPC_BOOK3S_32 */ 1217#endif /* CONFIG_PPC_BOOK3S_6xx */
1216#ifdef CONFIG_PPC_8xx 1218#ifdef CONFIG_PPC_8xx
1217 { /* 8xx */ 1219 { /* 8xx */
1218 .pvr_mask = 0xffff0000, 1220 .pvr_mask = 0xffff0000,
diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c
index 2f5a53874f6d..e486d1d78de2 100644
--- a/arch/powerpc/kernel/dma-iommu.c
+++ b/arch/powerpc/kernel/dma-iommu.c
@@ -122,18 +122,17 @@ int dma_iommu_dma_supported(struct device *dev, u64 mask)
122{ 122{
123 struct iommu_table *tbl = get_iommu_table_base(dev); 123 struct iommu_table *tbl = get_iommu_table_base(dev);
124 124
125 if (!tbl) {
126 dev_info(dev, "Warning: IOMMU dma not supported: mask 0x%08llx"
127 ", table unavailable\n", mask);
128 return 0;
129 }
130
131 if (dev_is_pci(dev) && dma_iommu_bypass_supported(dev, mask)) { 125 if (dev_is_pci(dev) && dma_iommu_bypass_supported(dev, mask)) {
132 dev->archdata.iommu_bypass = true; 126 dev->archdata.iommu_bypass = true;
133 dev_dbg(dev, "iommu: 64-bit OK, using fixed ops\n"); 127 dev_dbg(dev, "iommu: 64-bit OK, using fixed ops\n");
134 return 1; 128 return 1;
135 } 129 }
136 130
131 if (!tbl) {
132 dev_err(dev, "Warning: IOMMU dma not supported: mask 0x%08llx, table unavailable\n", mask);
133 return 0;
134 }
135
137 if (tbl->it_offset > (mask >> tbl->it_page_shift)) { 136 if (tbl->it_offset > (mask >> tbl->it_page_shift)) {
138 dev_info(dev, "Warning: IOMMU offset too big for device mask\n"); 137 dev_info(dev, "Warning: IOMMU offset too big for device mask\n");
139 dev_info(dev, "mask: 0x%08llx, table offset: 0x%08lx\n", 138 dev_info(dev, "mask: 0x%08llx, table offset: 0x%08lx\n",
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index c0e4b73191f3..0a91dee51245 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -150,6 +150,16 @@ static int __init eeh_setup(char *str)
150} 150}
151__setup("eeh=", eeh_setup); 151__setup("eeh=", eeh_setup);
152 152
153void eeh_show_enabled(void)
154{
155 if (eeh_has_flag(EEH_FORCE_DISABLED))
156 pr_info("EEH: Recovery disabled by kernel parameter.\n");
157 else if (eeh_has_flag(EEH_ENABLED))
158 pr_info("EEH: Capable adapter found: recovery enabled.\n");
159 else
160 pr_info("EEH: No capable adapters found: recovery disabled.\n");
161}
162
153/* 163/*
154 * This routine captures assorted PCI configuration space data 164 * This routine captures assorted PCI configuration space data
155 * for the indicated PCI device, and puts them into a buffer 165 * for the indicated PCI device, and puts them into a buffer
@@ -410,11 +420,9 @@ static int eeh_phb_check_failure(struct eeh_pe *pe)
410 eeh_pe_mark_isolated(phb_pe); 420 eeh_pe_mark_isolated(phb_pe);
411 eeh_serialize_unlock(flags); 421 eeh_serialize_unlock(flags);
412 422
413 pr_err("EEH: PHB#%x failure detected, location: %s\n", 423 pr_debug("EEH: PHB#%x failure detected, location: %s\n",
414 phb_pe->phb->global_number, eeh_pe_loc_get(phb_pe)); 424 phb_pe->phb->global_number, eeh_pe_loc_get(phb_pe));
415 dump_stack();
416 eeh_send_failure_event(phb_pe); 425 eeh_send_failure_event(phb_pe);
417
418 return 1; 426 return 1;
419out: 427out:
420 eeh_serialize_unlock(flags); 428 eeh_serialize_unlock(flags);
@@ -441,7 +449,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
441 unsigned long flags; 449 unsigned long flags;
442 struct device_node *dn; 450 struct device_node *dn;
443 struct pci_dev *dev; 451 struct pci_dev *dev;
444 struct eeh_pe *pe, *parent_pe, *phb_pe; 452 struct eeh_pe *pe, *parent_pe;
445 int rc = 0; 453 int rc = 0;
446 const char *location = NULL; 454 const char *location = NULL;
447 455
@@ -460,8 +468,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
460 /* Access to IO BARs might get this far and still not want checking. */ 468 /* Access to IO BARs might get this far and still not want checking. */
461 if (!pe) { 469 if (!pe) {
462 eeh_stats.ignored_check++; 470 eeh_stats.ignored_check++;
463 pr_debug("EEH: Ignored check for %s\n", 471 eeh_edev_dbg(edev, "Ignored check\n");
464 eeh_pci_name(dev));
465 return 0; 472 return 0;
466 } 473 }
467 474
@@ -501,12 +508,11 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
501 if (dn) 508 if (dn)
502 location = of_get_property(dn, "ibm,loc-code", 509 location = of_get_property(dn, "ibm,loc-code",
503 NULL); 510 NULL);
504 printk(KERN_ERR "EEH: %d reads ignored for recovering device at " 511 eeh_edev_err(edev, "%d reads ignored for recovering device at location=%s driver=%s\n",
505 "location=%s driver=%s pci addr=%s\n",
506 pe->check_count, 512 pe->check_count,
507 location ? location : "unknown", 513 location ? location : "unknown",
508 eeh_driver_name(dev), eeh_pci_name(dev)); 514 eeh_driver_name(dev));
509 printk(KERN_ERR "EEH: Might be infinite loop in %s driver\n", 515 eeh_edev_err(edev, "Might be infinite loop in %s driver\n",
510 eeh_driver_name(dev)); 516 eeh_driver_name(dev));
511 dump_stack(); 517 dump_stack();
512 } 518 }
@@ -573,13 +579,8 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
573 * a stack trace will help the device-driver authors figure 579 * a stack trace will help the device-driver authors figure
574 * out what happened. So print that out. 580 * out what happened. So print that out.
575 */ 581 */
576 phb_pe = eeh_phb_pe_get(pe->phb); 582 pr_debug("EEH: %s: Frozen PHB#%x-PE#%x detected\n",
577 pr_err("EEH: Frozen PHB#%x-PE#%x detected\n", 583 __func__, pe->phb->global_number, pe->addr);
578 pe->phb->global_number, pe->addr);
579 pr_err("EEH: PE location: %s, PHB location: %s\n",
580 eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe));
581 dump_stack();
582
583 eeh_send_failure_event(pe); 584 eeh_send_failure_event(pe);
584 585
585 return 1; 586 return 1;
@@ -697,7 +698,7 @@ int eeh_pci_enable(struct eeh_pe *pe, int function)
697 return rc; 698 return rc;
698} 699}
699 700
700static void *eeh_disable_and_save_dev_state(struct eeh_dev *edev, 701static void eeh_disable_and_save_dev_state(struct eeh_dev *edev,
701 void *userdata) 702 void *userdata)
702{ 703{
703 struct pci_dev *pdev = eeh_dev_to_pci_dev(edev); 704 struct pci_dev *pdev = eeh_dev_to_pci_dev(edev);
@@ -708,7 +709,7 @@ static void *eeh_disable_and_save_dev_state(struct eeh_dev *edev,
708 * state for the specified device 709 * state for the specified device
709 */ 710 */
710 if (!pdev || pdev == dev) 711 if (!pdev || pdev == dev)
711 return NULL; 712 return;
712 713
713 /* Ensure we have D0 power state */ 714 /* Ensure we have D0 power state */
714 pci_set_power_state(pdev, PCI_D0); 715 pci_set_power_state(pdev, PCI_D0);
@@ -721,18 +722,16 @@ static void *eeh_disable_and_save_dev_state(struct eeh_dev *edev,
721 * interrupt from the device 722 * interrupt from the device
722 */ 723 */
723 pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE); 724 pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);
724
725 return NULL;
726} 725}
727 726
728static void *eeh_restore_dev_state(struct eeh_dev *edev, void *userdata) 727static void eeh_restore_dev_state(struct eeh_dev *edev, void *userdata)
729{ 728{
730 struct pci_dn *pdn = eeh_dev_to_pdn(edev); 729 struct pci_dn *pdn = eeh_dev_to_pdn(edev);
731 struct pci_dev *pdev = eeh_dev_to_pci_dev(edev); 730 struct pci_dev *pdev = eeh_dev_to_pci_dev(edev);
732 struct pci_dev *dev = userdata; 731 struct pci_dev *dev = userdata;
733 732
734 if (!pdev) 733 if (!pdev)
735 return NULL; 734 return;
736 735
737 /* Apply customization from firmware */ 736 /* Apply customization from firmware */
738 if (pdn && eeh_ops->restore_config) 737 if (pdn && eeh_ops->restore_config)
@@ -741,8 +740,6 @@ static void *eeh_restore_dev_state(struct eeh_dev *edev, void *userdata)
741 /* The caller should restore state for the specified device */ 740 /* The caller should restore state for the specified device */
742 if (pdev != dev) 741 if (pdev != dev)
743 pci_restore_state(pdev); 742 pci_restore_state(pdev);
744
745 return NULL;
746} 743}
747 744
748int eeh_restore_vf_config(struct pci_dn *pdn) 745int eeh_restore_vf_config(struct pci_dn *pdn)
@@ -868,7 +865,7 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat
868 * the indicated device and its children so that the bunch of the 865 * the indicated device and its children so that the bunch of the
869 * devices could be reset properly. 866 * devices could be reset properly.
870 */ 867 */
871static void *eeh_set_dev_freset(struct eeh_dev *edev, void *flag) 868static void eeh_set_dev_freset(struct eeh_dev *edev, void *flag)
872{ 869{
873 struct pci_dev *dev; 870 struct pci_dev *dev;
874 unsigned int *freset = (unsigned int *)flag; 871 unsigned int *freset = (unsigned int *)flag;
@@ -876,8 +873,6 @@ static void *eeh_set_dev_freset(struct eeh_dev *edev, void *flag)
876 dev = eeh_dev_to_pci_dev(edev); 873 dev = eeh_dev_to_pci_dev(edev);
877 if (dev) 874 if (dev)
878 *freset |= dev->needs_freset; 875 *freset |= dev->needs_freset;
879
880 return NULL;
881} 876}
882 877
883static void eeh_pe_refreeze_passed(struct eeh_pe *root) 878static void eeh_pe_refreeze_passed(struct eeh_pe *root)
@@ -1063,23 +1058,6 @@ static struct notifier_block eeh_reboot_nb = {
1063 .notifier_call = eeh_reboot_notifier, 1058 .notifier_call = eeh_reboot_notifier,
1064}; 1059};
1065 1060
1066void eeh_probe_devices(void)
1067{
1068 struct pci_controller *hose, *tmp;
1069 struct pci_dn *pdn;
1070
1071 /* Enable EEH for all adapters */
1072 list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
1073 pdn = hose->pci_data;
1074 traverse_pci_dn(pdn, eeh_ops->probe, NULL);
1075 }
1076 if (eeh_enabled())
1077 pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n");
1078 else
1079 pr_info("EEH: No capable adapters found\n");
1080
1081}
1082
1083/** 1061/**
1084 * eeh_init - EEH initialization 1062 * eeh_init - EEH initialization
1085 * 1063 *
@@ -1120,6 +1098,8 @@ static int eeh_init(void)
1120 list_for_each_entry_safe(hose, tmp, &hose_list, list_node) 1098 list_for_each_entry_safe(hose, tmp, &hose_list, list_node)
1121 eeh_dev_phb_init_dynamic(hose); 1099 eeh_dev_phb_init_dynamic(hose);
1122 1100
1101 eeh_addr_cache_init();
1102
1123 /* Initialize EEH event */ 1103 /* Initialize EEH event */
1124 return eeh_event_init(); 1104 return eeh_event_init();
1125} 1105}
@@ -1190,15 +1170,14 @@ void eeh_add_device_late(struct pci_dev *dev)
1190 struct pci_dn *pdn; 1170 struct pci_dn *pdn;
1191 struct eeh_dev *edev; 1171 struct eeh_dev *edev;
1192 1172
1193 if (!dev || !eeh_enabled()) 1173 if (!dev)
1194 return; 1174 return;
1195 1175
1196 pr_debug("EEH: Adding device %s\n", pci_name(dev));
1197
1198 pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn); 1176 pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn);
1199 edev = pdn_to_eeh_dev(pdn); 1177 edev = pdn_to_eeh_dev(pdn);
1178 eeh_edev_dbg(edev, "Adding device\n");
1200 if (edev->pdev == dev) { 1179 if (edev->pdev == dev) {
1201 pr_debug("EEH: Already referenced !\n"); 1180 eeh_edev_dbg(edev, "Device already referenced!\n");
1202 return; 1181 return;
1203 } 1182 }
1204 1183
@@ -1246,6 +1225,8 @@ void eeh_add_device_tree_late(struct pci_bus *bus)
1246{ 1225{
1247 struct pci_dev *dev; 1226 struct pci_dev *dev;
1248 1227
1228 if (eeh_has_flag(EEH_FORCE_DISABLED))
1229 return;
1249 list_for_each_entry(dev, &bus->devices, bus_list) { 1230 list_for_each_entry(dev, &bus->devices, bus_list) {
1250 eeh_add_device_late(dev); 1231 eeh_add_device_late(dev);
1251 if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { 1232 if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
@@ -1299,10 +1280,10 @@ void eeh_remove_device(struct pci_dev *dev)
1299 edev = pci_dev_to_eeh_dev(dev); 1280 edev = pci_dev_to_eeh_dev(dev);
1300 1281
1301 /* Unregister the device with the EEH/PCI address search system */ 1282 /* Unregister the device with the EEH/PCI address search system */
1302 pr_debug("EEH: Removing device %s\n", pci_name(dev)); 1283 dev_dbg(&dev->dev, "EEH: Removing device\n");
1303 1284
1304 if (!edev || !edev->pdev || !edev->pe) { 1285 if (!edev || !edev->pdev || !edev->pe) {
1305 pr_debug("EEH: Not referenced !\n"); 1286 dev_dbg(&dev->dev, "EEH: Device not referenced!\n");
1306 return; 1287 return;
1307 } 1288 }
1308 1289
@@ -1890,6 +1871,198 @@ static const struct file_operations eeh_force_recover_fops = {
1890 .llseek = no_llseek, 1871 .llseek = no_llseek,
1891 .write = eeh_force_recover_write, 1872 .write = eeh_force_recover_write,
1892}; 1873};
1874
1875static ssize_t eeh_debugfs_dev_usage(struct file *filp,
1876 char __user *user_buf,
1877 size_t count, loff_t *ppos)
1878{
1879 static const char usage[] = "input format: <domain>:<bus>:<dev>.<fn>\n";
1880
1881 return simple_read_from_buffer(user_buf, count, ppos,
1882 usage, sizeof(usage) - 1);
1883}
1884
1885static ssize_t eeh_dev_check_write(struct file *filp,
1886 const char __user *user_buf,
1887 size_t count, loff_t *ppos)
1888{
1889 uint32_t domain, bus, dev, fn;
1890 struct pci_dev *pdev;
1891 struct eeh_dev *edev;
1892 char buf[20];
1893 int ret;
1894
1895 memset(buf, 0, sizeof(buf));
1896 ret = simple_write_to_buffer(buf, sizeof(buf)-1, ppos, user_buf, count);
1897 if (!ret)
1898 return -EFAULT;
1899
1900 ret = sscanf(buf, "%x:%x:%x.%x", &domain, &bus, &dev, &fn);
1901 if (ret != 4) {
1902 pr_err("%s: expected 4 args, got %d\n", __func__, ret);
1903 return -EINVAL;
1904 }
1905
1906 pdev = pci_get_domain_bus_and_slot(domain, bus, (dev << 3) | fn);
1907 if (!pdev)
1908 return -ENODEV;
1909
1910 edev = pci_dev_to_eeh_dev(pdev);
1911 if (!edev) {
1912 pci_err(pdev, "No eeh_dev for this device!\n");
1913 pci_dev_put(pdev);
1914 return -ENODEV;
1915 }
1916
1917 ret = eeh_dev_check_failure(edev);
1918 pci_info(pdev, "eeh_dev_check_failure(%04x:%02x:%02x.%01x) = %d\n",
1919 domain, bus, dev, fn, ret);
1920
1921 pci_dev_put(pdev);
1922
1923 return count;
1924}
1925
1926static const struct file_operations eeh_dev_check_fops = {
1927 .open = simple_open,
1928 .llseek = no_llseek,
1929 .write = eeh_dev_check_write,
1930 .read = eeh_debugfs_dev_usage,
1931};
1932
1933static int eeh_debugfs_break_device(struct pci_dev *pdev)
1934{
1935 struct resource *bar = NULL;
1936 void __iomem *mapped;
1937 u16 old, bit;
1938 int i, pos;
1939
1940 /* Do we have an MMIO BAR to disable? */
1941 for (i = 0; i <= PCI_STD_RESOURCE_END; i++) {
1942 struct resource *r = &pdev->resource[i];
1943
1944 if (!r->flags || !r->start)
1945 continue;
1946 if (r->flags & IORESOURCE_IO)
1947 continue;
1948 if (r->flags & IORESOURCE_UNSET)
1949 continue;
1950
1951 bar = r;
1952 break;
1953 }
1954
1955 if (!bar) {
1956 pci_err(pdev, "Unable to find Memory BAR to cause EEH with\n");
1957 return -ENXIO;
1958 }
1959
1960 pci_err(pdev, "Going to break: %pR\n", bar);
1961
1962 if (pdev->is_virtfn) {
 1963#ifndef CONFIG_PCI_IOV
1964 return -ENXIO;
1965#else
1966 /*
1967 * VFs don't have a per-function COMMAND register, so the best
1968 * we can do is clear the Memory Space Enable bit in the PF's
1969 * SRIOV control reg.
1970 *
 1971 * Unfortunately, this requires that we have a PF (i.e. it doesn't
1972 * work for a passed-through VF) and it has the potential side
1973 * effect of also causing an EEH on every other VF under the
1974 * PF. Oh well.
1975 */
1976 pdev = pdev->physfn;
1977 if (!pdev)
1978 return -ENXIO; /* passed through VFs have no PF */
1979
1980 pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_SRIOV);
1981 pos += PCI_SRIOV_CTRL;
1982 bit = PCI_SRIOV_CTRL_MSE;
 1983#endif /* !CONFIG_PCI_IOV */
1984 } else {
1985 bit = PCI_COMMAND_MEMORY;
1986 pos = PCI_COMMAND;
1987 }
1988
1989 /*
1990 * Process here is:
1991 *
1992 * 1. Disable Memory space.
1993 *
1994 * 2. Perform an MMIO to the device. This should result in an error
1995 * (CA / UR) being raised by the device which results in an EEH
 1996 * PE freeze. Using the in_8() accessor skips the EEH detection hook,
 1997 * so the EEH detection machinery won't be triggered here. This is to
 1998 * match the usual behaviour of EEH, where the HW will asynchronously
 1999 * freeze a PE and it's up to the kernel to notice and deal with it.
2001 *
2002 * 3. Turn Memory space back on. This is more important for VFs
 2003 * since recovery will probably fail if we don't. For normal
 2004 * devices the COMMAND register is reset as a part of
 2005 * re-initialising the device.
2006 *
2007 * Breaking stuff is the point so who cares if it's racy ;)
2008 */
2009 pci_read_config_word(pdev, pos, &old);
2010
2011 mapped = ioremap(bar->start, PAGE_SIZE);
2012 if (!mapped) {
2013 pci_err(pdev, "Unable to map MMIO BAR %pR\n", bar);
2014 return -ENXIO;
2015 }
2016
2017 pci_write_config_word(pdev, pos, old & ~bit);
2018 in_8(mapped);
2019 pci_write_config_word(pdev, pos, old);
2020
2021 iounmap(mapped);
2022
2023 return 0;
2024}
2025
2026static ssize_t eeh_dev_break_write(struct file *filp,
2027 const char __user *user_buf,
2028 size_t count, loff_t *ppos)
2029{
2030 uint32_t domain, bus, dev, fn;
2031 struct pci_dev *pdev;
2032 char buf[20];
2033 int ret;
2034
2035 memset(buf, 0, sizeof(buf));
2036 ret = simple_write_to_buffer(buf, sizeof(buf)-1, ppos, user_buf, count);
2037 if (!ret)
2038 return -EFAULT;
2039
2040 ret = sscanf(buf, "%x:%x:%x.%x", &domain, &bus, &dev, &fn);
2041 if (ret != 4) {
2042 pr_err("%s: expected 4 args, got %d\n", __func__, ret);
2043 return -EINVAL;
2044 }
2045
2046 pdev = pci_get_domain_bus_and_slot(domain, bus, (dev << 3) | fn);
2047 if (!pdev)
2048 return -ENODEV;
2049
2050 ret = eeh_debugfs_break_device(pdev);
2051 pci_dev_put(pdev);
2052
2053 if (ret < 0)
2054 return ret;
2055
2056 return count;
2057}
2058
2059static const struct file_operations eeh_dev_break_fops = {
2060 .open = simple_open,
2061 .llseek = no_llseek,
2062 .write = eeh_dev_break_write,
2063 .read = eeh_debugfs_dev_usage,
2064};
2065
1893#endif 2066#endif
1894 2067
1895static int __init eeh_init_proc(void) 2068static int __init eeh_init_proc(void)
@@ -1905,6 +2078,12 @@ static int __init eeh_init_proc(void)
1905 debugfs_create_bool("eeh_disable_recovery", 0600, 2078 debugfs_create_bool("eeh_disable_recovery", 0600,
1906 powerpc_debugfs_root, 2079 powerpc_debugfs_root,
1907 &eeh_debugfs_no_recover); 2080 &eeh_debugfs_no_recover);
2081 debugfs_create_file_unsafe("eeh_dev_check", 0600,
2082 powerpc_debugfs_root, NULL,
2083 &eeh_dev_check_fops);
2084 debugfs_create_file_unsafe("eeh_dev_break", 0600,
2085 powerpc_debugfs_root, NULL,
2086 &eeh_dev_break_fops);
1908 debugfs_create_file_unsafe("eeh_force_recover", 0600, 2087 debugfs_create_file_unsafe("eeh_force_recover", 0600,
1909 powerpc_debugfs_root, NULL, 2088 powerpc_debugfs_root, NULL,
1910 &eeh_force_recover_fops); 2089 &eeh_force_recover_fops);
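
The usage blurb above documents the expected input format; a user-space sketch of exercising the new files (the debugfs mount point and PCI address are assumptions for illustration):

/* Illustrative only: ask the kernel to run eeh_dev_check_failure() on
 * 0000:01:00.0. Writing the same string to eeh_dev_break instead would
 * deliberately freeze the device's PE to test recovery. */
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/kernel/debug/powerpc/eeh_dev_check", O_WRONLY);

	if (fd < 0)
		return 1;
	if (write(fd, "0000:01:00.0", 12) != 12) {
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}
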
diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c
index 05ffd32b3416..cf11277ebd02 100644
--- a/arch/powerpc/kernel/eeh_cache.c
+++ b/arch/powerpc/kernel/eeh_cache.c
@@ -148,8 +148,8 @@ eeh_addr_cache_insert(struct pci_dev *dev, resource_size_t alo,
148 piar->pcidev = dev; 148 piar->pcidev = dev;
149 piar->flags = flags; 149 piar->flags = flags;
150 150
151 pr_debug("PIAR: insert range=[%pap:%pap] dev=%s\n", 151 eeh_edev_dbg(piar->edev, "PIAR: insert range=[%pap:%pap]\n",
152 &alo, &ahi, pci_name(dev)); 152 &alo, &ahi);
153 153
154 rb_link_node(&piar->rb_node, parent, p); 154 rb_link_node(&piar->rb_node, parent, p);
155 rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root); 155 rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root);
@@ -229,8 +229,8 @@ restart:
229 piar = rb_entry(n, struct pci_io_addr_range, rb_node); 229 piar = rb_entry(n, struct pci_io_addr_range, rb_node);
230 230
231 if (piar->pcidev == dev) { 231 if (piar->pcidev == dev) {
232 pr_debug("PIAR: remove range=[%pap:%pap] dev=%s\n", 232 eeh_edev_dbg(piar->edev, "PIAR: remove range=[%pap:%pap]\n",
233 &piar->addr_lo, &piar->addr_hi, pci_name(dev)); 233 &piar->addr_lo, &piar->addr_hi);
234 rb_erase(n, &pci_io_addr_cache_root.rb_root); 234 rb_erase(n, &pci_io_addr_cache_root.rb_root);
235 kfree(piar); 235 kfree(piar);
236 goto restart; 236 goto restart;
@@ -258,37 +258,14 @@ void eeh_addr_cache_rmv_dev(struct pci_dev *dev)
258} 258}
259 259
260/** 260/**
261 * eeh_addr_cache_build - Build a cache of I/O addresses 261 * eeh_addr_cache_init - Initialize a cache of I/O addresses
262 * 262 *
263 * Build a cache of pci i/o addresses. This cache will be used to 263 * Initialize a cache of pci i/o addresses. This cache will be used to
264 * find the pci device that corresponds to a given address. 264 * find the pci device that corresponds to a given address.
265 * This routine scans all pci busses to build the cache.
266 * Must be run late in boot process, after the pci controllers
267 * have been scanned for devices (after all device resources are known).
268 */ 265 */
269void eeh_addr_cache_build(void) 266void eeh_addr_cache_init(void)
270{ 267{
271 struct pci_dn *pdn;
272 struct eeh_dev *edev;
273 struct pci_dev *dev = NULL;
274
275 spin_lock_init(&pci_io_addr_cache_root.piar_lock); 268 spin_lock_init(&pci_io_addr_cache_root.piar_lock);
276
277 for_each_pci_dev(dev) {
278 pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn);
279 if (!pdn)
280 continue;
281
282 edev = pdn_to_eeh_dev(pdn);
283 if (!edev)
284 continue;
285
286 dev->dev.archdata.edev = edev;
287 edev->pdev = dev;
288
289 eeh_addr_cache_insert_dev(dev);
290 eeh_sysfs_add_device(dev);
291 }
292} 269}
293 270
294static int eeh_addr_cache_show(struct seq_file *s, void *v) 271static int eeh_addr_cache_show(struct seq_file *s, void *v)
diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c
index c4317c452d98..7370185c7a05 100644
--- a/arch/powerpc/kernel/eeh_dev.c
+++ b/arch/powerpc/kernel/eeh_dev.c
@@ -47,6 +47,8 @@ struct eeh_dev *eeh_dev_init(struct pci_dn *pdn)
47 /* Associate EEH device with OF node */ 47 /* Associate EEH device with OF node */
48 pdn->edev = edev; 48 pdn->edev = edev;
49 edev->pdn = pdn; 49 edev->pdn = pdn;
50 edev->bdfn = (pdn->busno << 8) | pdn->devfn;
51 edev->controller = pdn->phb;
50 52
51 return edev; 53 return edev;
52} 54}
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 89623962c727..d9279d0ee9f5 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -27,6 +27,7 @@
27#include <linux/irq.h> 27#include <linux/irq.h>
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/pci.h> 29#include <linux/pci.h>
30#include <linux/pci_hotplug.h>
30#include <asm/eeh.h> 31#include <asm/eeh.h>
31#include <asm/eeh_event.h> 32#include <asm/eeh_event.h>
32#include <asm/ppc-pci.h> 33#include <asm/ppc-pci.h>
@@ -81,23 +82,6 @@ static const char *pci_ers_result_name(enum pci_ers_result result)
81 } 82 }
82}; 83};
83 84
84static __printf(2, 3) void eeh_edev_info(const struct eeh_dev *edev,
85 const char *fmt, ...)
86{
87 struct va_format vaf;
88 va_list args;
89
90 va_start(args, fmt);
91
92 vaf.fmt = fmt;
93 vaf.va = &args;
94
95 printk(KERN_INFO "EEH: PE#%x (PCI %s): %pV\n", edev->pe_config_addr,
96 edev->pdev ? dev_name(&edev->pdev->dev) : "none", &vaf);
97
98 va_end(args);
99}
100
101static enum pci_ers_result pci_ers_merge_result(enum pci_ers_result old, 85static enum pci_ers_result pci_ers_merge_result(enum pci_ers_result old,
102 enum pci_ers_result new) 86 enum pci_ers_result new)
103{ 87{
@@ -113,8 +97,16 @@ static bool eeh_dev_removed(struct eeh_dev *edev)
113 97
114static bool eeh_edev_actionable(struct eeh_dev *edev) 98static bool eeh_edev_actionable(struct eeh_dev *edev)
115{ 99{
116 return (edev->pdev && !eeh_dev_removed(edev) && 100 if (!edev->pdev)
117 !eeh_pe_passed(edev->pe)); 101 return false;
102 if (edev->pdev->error_state == pci_channel_io_perm_failure)
103 return false;
104 if (eeh_dev_removed(edev))
105 return false;
106 if (eeh_pe_passed(edev->pe))
107 return false;
108
109 return true;
118} 110}
119 111
120/** 112/**
@@ -214,12 +206,12 @@ static void eeh_enable_irq(struct eeh_dev *edev)
214 } 206 }
215} 207}
216 208
217static void *eeh_dev_save_state(struct eeh_dev *edev, void *userdata) 209static void eeh_dev_save_state(struct eeh_dev *edev, void *userdata)
218{ 210{
219 struct pci_dev *pdev; 211 struct pci_dev *pdev;
220 212
221 if (!edev) 213 if (!edev)
222 return NULL; 214 return;
223 215
224 /* 216 /*
225 * We cannot access the config space on some adapters. 217 * We cannot access the config space on some adapters.
@@ -229,14 +221,13 @@ static void *eeh_dev_save_state(struct eeh_dev *edev, void *userdata)
229 * device is created. 221 * device is created.
230 */ 222 */
231 if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) 223 if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED))
232 return NULL; 224 return;
233 225
234 pdev = eeh_dev_to_pci_dev(edev); 226 pdev = eeh_dev_to_pci_dev(edev);
235 if (!pdev) 227 if (!pdev)
236 return NULL; 228 return;
237 229
238 pci_save_state(pdev); 230 pci_save_state(pdev);
239 return NULL;
240} 231}
241 232
242static void eeh_set_channel_state(struct eeh_pe *root, enum pci_channel_state s) 233static void eeh_set_channel_state(struct eeh_pe *root, enum pci_channel_state s)
@@ -274,20 +265,27 @@ static void eeh_set_irq_state(struct eeh_pe *root, bool enable)
274} 265}
275 266
276typedef enum pci_ers_result (*eeh_report_fn)(struct eeh_dev *, 267typedef enum pci_ers_result (*eeh_report_fn)(struct eeh_dev *,
268 struct pci_dev *,
277 struct pci_driver *); 269 struct pci_driver *);
278static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn, 270static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn,
279 enum pci_ers_result *result) 271 enum pci_ers_result *result)
280{ 272{
273 struct pci_dev *pdev;
281 struct pci_driver *driver; 274 struct pci_driver *driver;
282 enum pci_ers_result new_result; 275 enum pci_ers_result new_result;
283 276
284 if (!edev->pdev) { 277 pci_lock_rescan_remove();
278 pdev = edev->pdev;
279 if (pdev)
280 get_device(&pdev->dev);
281 pci_unlock_rescan_remove();
282 if (!pdev) {
285 eeh_edev_info(edev, "no device"); 283 eeh_edev_info(edev, "no device");
286 return; 284 return;
287 } 285 }
288 device_lock(&edev->pdev->dev); 286 device_lock(&pdev->dev);
289 if (eeh_edev_actionable(edev)) { 287 if (eeh_edev_actionable(edev)) {
290 driver = eeh_pcid_get(edev->pdev); 288 driver = eeh_pcid_get(pdev);
291 289
292 if (!driver) 290 if (!driver)
293 eeh_edev_info(edev, "no driver"); 291 eeh_edev_info(edev, "no driver");
@@ -296,7 +294,7 @@ static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn,
296 else if (edev->mode & EEH_DEV_NO_HANDLER) 294 else if (edev->mode & EEH_DEV_NO_HANDLER)
297 eeh_edev_info(edev, "driver bound too late"); 295 eeh_edev_info(edev, "driver bound too late");
298 else { 296 else {
299 new_result = fn(edev, driver); 297 new_result = fn(edev, pdev, driver);
300 eeh_edev_info(edev, "%s driver reports: '%s'", 298 eeh_edev_info(edev, "%s driver reports: '%s'",
301 driver->name, 299 driver->name,
302 pci_ers_result_name(new_result)); 300 pci_ers_result_name(new_result));
@@ -305,12 +303,15 @@ static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn,
305 new_result); 303 new_result);
306 } 304 }
307 if (driver) 305 if (driver)
308 eeh_pcid_put(edev->pdev); 306 eeh_pcid_put(pdev);
309 } else { 307 } else {
310 eeh_edev_info(edev, "not actionable (%d,%d,%d)", !!edev->pdev, 308 eeh_edev_info(edev, "not actionable (%d,%d,%d)", !!pdev,
311 !eeh_dev_removed(edev), !eeh_pe_passed(edev->pe)); 309 !eeh_dev_removed(edev), !eeh_pe_passed(edev->pe));
312 } 310 }
313 device_unlock(&edev->pdev->dev); 311 device_unlock(&pdev->dev);
312 if (edev->pdev != pdev)
313 eeh_edev_warn(edev, "Device changed during processing!\n");
314 put_device(&pdev->dev);
314} 315}
315 316
316static void eeh_pe_report(const char *name, struct eeh_pe *root, 317static void eeh_pe_report(const char *name, struct eeh_pe *root,
@@ -337,20 +338,20 @@ static void eeh_pe_report(const char *name, struct eeh_pe *root,
337 * Report an EEH error to each device driver. 338 * Report an EEH error to each device driver.
338 */ 339 */
339static enum pci_ers_result eeh_report_error(struct eeh_dev *edev, 340static enum pci_ers_result eeh_report_error(struct eeh_dev *edev,
341 struct pci_dev *pdev,
340 struct pci_driver *driver) 342 struct pci_driver *driver)
341{ 343{
342 enum pci_ers_result rc; 344 enum pci_ers_result rc;
343 struct pci_dev *dev = edev->pdev;
344 345
345 if (!driver->err_handler->error_detected) 346 if (!driver->err_handler->error_detected)
346 return PCI_ERS_RESULT_NONE; 347 return PCI_ERS_RESULT_NONE;
347 348
348 eeh_edev_info(edev, "Invoking %s->error_detected(IO frozen)", 349 eeh_edev_info(edev, "Invoking %s->error_detected(IO frozen)",
349 driver->name); 350 driver->name);
350 rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen); 351 rc = driver->err_handler->error_detected(pdev, pci_channel_io_frozen);
351 352
352 edev->in_error = true; 353 edev->in_error = true;
353 pci_uevent_ers(dev, PCI_ERS_RESULT_NONE); 354 pci_uevent_ers(pdev, PCI_ERS_RESULT_NONE);
354 return rc; 355 return rc;
355} 356}
356 357
@@ -363,12 +364,13 @@ static enum pci_ers_result eeh_report_error(struct eeh_dev *edev,
363 * are now enabled. 364 * are now enabled.
364 */ 365 */
365static enum pci_ers_result eeh_report_mmio_enabled(struct eeh_dev *edev, 366static enum pci_ers_result eeh_report_mmio_enabled(struct eeh_dev *edev,
367 struct pci_dev *pdev,
366 struct pci_driver *driver) 368 struct pci_driver *driver)
367{ 369{
368 if (!driver->err_handler->mmio_enabled) 370 if (!driver->err_handler->mmio_enabled)
369 return PCI_ERS_RESULT_NONE; 371 return PCI_ERS_RESULT_NONE;
370 eeh_edev_info(edev, "Invoking %s->mmio_enabled()", driver->name); 372 eeh_edev_info(edev, "Invoking %s->mmio_enabled()", driver->name);
371 return driver->err_handler->mmio_enabled(edev->pdev); 373 return driver->err_handler->mmio_enabled(pdev);
372} 374}
373 375
374/** 376/**
@@ -382,20 +384,21 @@ static enum pci_ers_result eeh_report_mmio_enabled(struct eeh_dev *edev,
382 * driver can work again while the device is recovered. 384 * driver can work again while the device is recovered.
383 */ 385 */
384static enum pci_ers_result eeh_report_reset(struct eeh_dev *edev, 386static enum pci_ers_result eeh_report_reset(struct eeh_dev *edev,
387 struct pci_dev *pdev,
385 struct pci_driver *driver) 388 struct pci_driver *driver)
386{ 389{
387 if (!driver->err_handler->slot_reset || !edev->in_error) 390 if (!driver->err_handler->slot_reset || !edev->in_error)
388 return PCI_ERS_RESULT_NONE; 391 return PCI_ERS_RESULT_NONE;
389 eeh_edev_info(edev, "Invoking %s->slot_reset()", driver->name); 392 eeh_edev_info(edev, "Invoking %s->slot_reset()", driver->name);
390 return driver->err_handler->slot_reset(edev->pdev); 393 return driver->err_handler->slot_reset(pdev);
391} 394}
392 395
393static void *eeh_dev_restore_state(struct eeh_dev *edev, void *userdata) 396static void eeh_dev_restore_state(struct eeh_dev *edev, void *userdata)
394{ 397{
395 struct pci_dev *pdev; 398 struct pci_dev *pdev;
396 399
397 if (!edev) 400 if (!edev)
398 return NULL; 401 return;
399 402
400 /* 403 /*
401 * The content in the config space isn't saved because 404 * The content in the config space isn't saved because
@@ -407,15 +410,14 @@ static void *eeh_dev_restore_state(struct eeh_dev *edev, void *userdata)
407 if (list_is_last(&edev->entry, &edev->pe->edevs)) 410 if (list_is_last(&edev->entry, &edev->pe->edevs))
408 eeh_pe_restore_bars(edev->pe); 411 eeh_pe_restore_bars(edev->pe);
409 412
410 return NULL; 413 return;
411 } 414 }
412 415
413 pdev = eeh_dev_to_pci_dev(edev); 416 pdev = eeh_dev_to_pci_dev(edev);
414 if (!pdev) 417 if (!pdev)
415 return NULL; 418 return;
416 419
417 pci_restore_state(pdev); 420 pci_restore_state(pdev);
418 return NULL;
419} 421}
420 422
421/** 423/**
@@ -428,13 +430,14 @@ static void *eeh_dev_restore_state(struct eeh_dev *edev, void *userdata)
428 * to make the recovered device work again. 430 * to make the recovered device work again.
429 */ 431 */
430static enum pci_ers_result eeh_report_resume(struct eeh_dev *edev, 432static enum pci_ers_result eeh_report_resume(struct eeh_dev *edev,
433 struct pci_dev *pdev,
431 struct pci_driver *driver) 434 struct pci_driver *driver)
432{ 435{
433 if (!driver->err_handler->resume || !edev->in_error) 436 if (!driver->err_handler->resume || !edev->in_error)
434 return PCI_ERS_RESULT_NONE; 437 return PCI_ERS_RESULT_NONE;
435 438
436 eeh_edev_info(edev, "Invoking %s->resume()", driver->name); 439 eeh_edev_info(edev, "Invoking %s->resume()", driver->name);
437 driver->err_handler->resume(edev->pdev); 440 driver->err_handler->resume(pdev);
438 441
439 pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_RECOVERED); 442 pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_RECOVERED);
440#ifdef CONFIG_PCI_IOV 443#ifdef CONFIG_PCI_IOV
@@ -453,6 +456,7 @@ static enum pci_ers_result eeh_report_resume(struct eeh_dev *edev,
453 * dead, and that no further recovery attempts will be made on it. 456 * dead, and that no further recovery attempts will be made on it.
454 */ 457 */
455static enum pci_ers_result eeh_report_failure(struct eeh_dev *edev, 458static enum pci_ers_result eeh_report_failure(struct eeh_dev *edev,
459 struct pci_dev *pdev,
456 struct pci_driver *driver) 460 struct pci_driver *driver)
457{ 461{
458 enum pci_ers_result rc; 462 enum pci_ers_result rc;
@@ -462,10 +466,10 @@ static enum pci_ers_result eeh_report_failure(struct eeh_dev *edev,
462 466
463 eeh_edev_info(edev, "Invoking %s->error_detected(permanent failure)", 467 eeh_edev_info(edev, "Invoking %s->error_detected(permanent failure)",
464 driver->name); 468 driver->name);
465 rc = driver->err_handler->error_detected(edev->pdev, 469 rc = driver->err_handler->error_detected(pdev,
466 pci_channel_io_perm_failure); 470 pci_channel_io_perm_failure);
467 471
468 pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_DISCONNECT); 472 pci_uevent_ers(pdev, PCI_ERS_RESULT_DISCONNECT);
469 return rc; 473 return rc;
470} 474}
471 475
@@ -473,12 +477,9 @@ static void *eeh_add_virt_device(struct eeh_dev *edev)
473{ 477{
474 struct pci_driver *driver; 478 struct pci_driver *driver;
475 struct pci_dev *dev = eeh_dev_to_pci_dev(edev); 479 struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
476 struct pci_dn *pdn = eeh_dev_to_pdn(edev);
477 480
478 if (!(edev->physfn)) { 481 if (!(edev->physfn)) {
479 pr_warn("%s: EEH dev %04x:%02x:%02x.%01x not for VF\n", 482 eeh_edev_warn(edev, "Not for VF\n");
480 __func__, pdn->phb->global_number, pdn->busno,
481 PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
482 return NULL; 483 return NULL;
483 } 484 }
484 485
@@ -492,12 +493,12 @@ static void *eeh_add_virt_device(struct eeh_dev *edev)
492 } 493 }
493 494
494#ifdef CONFIG_PCI_IOV 495#ifdef CONFIG_PCI_IOV
495 pci_iov_add_virtfn(edev->physfn, pdn->vf_index); 496 pci_iov_add_virtfn(edev->physfn, eeh_dev_to_pdn(edev)->vf_index);
496#endif 497#endif
497 return NULL; 498 return NULL;
498} 499}
499 500
500static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) 501static void eeh_rmv_device(struct eeh_dev *edev, void *userdata)
501{ 502{
502 struct pci_driver *driver; 503 struct pci_driver *driver;
503 struct pci_dev *dev = eeh_dev_to_pci_dev(edev); 504 struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
@@ -512,7 +513,7 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata)
512 */ 513 */
513 if (!eeh_edev_actionable(edev) || 514 if (!eeh_edev_actionable(edev) ||
514 (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) 515 (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE))
515 return NULL; 516 return;
516 517
517 if (rmv_data) { 518 if (rmv_data) {
518 driver = eeh_pcid_get(dev); 519 driver = eeh_pcid_get(dev);
@@ -521,7 +522,7 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata)
521 driver->err_handler->error_detected && 522 driver->err_handler->error_detected &&
522 driver->err_handler->slot_reset) { 523 driver->err_handler->slot_reset) {
523 eeh_pcid_put(dev); 524 eeh_pcid_put(dev);
524 return NULL; 525 return;
525 } 526 }
526 eeh_pcid_put(dev); 527 eeh_pcid_put(dev);
527 } 528 }
@@ -554,8 +555,6 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata)
554 pci_stop_and_remove_bus_device(dev); 555 pci_stop_and_remove_bus_device(dev);
555 pci_unlock_rescan_remove(); 556 pci_unlock_rescan_remove();
556 } 557 }
557
558 return NULL;
559} 558}
560 559
561static void *eeh_pe_detach_dev(struct eeh_pe *pe, void *userdata) 560static void *eeh_pe_detach_dev(struct eeh_pe *pe, void *userdata)
@@ -744,6 +743,99 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
744 */ 743 */
745#define MAX_WAIT_FOR_RECOVERY 300 744#define MAX_WAIT_FOR_RECOVERY 300
746 745
746
747/* Walks the PE tree after processing an event to remove any stale PEs.
748 *
749 * NB: This needs to be recursive to ensure the leaf PEs get removed
750 * before their parents do. Although this is possible to do without
751 * recursion, we don't since this is easier to read and we need to
752 * guarantee the leaf nodes will be handled first.
753 */
754static void eeh_pe_cleanup(struct eeh_pe *pe)
755{
756 struct eeh_pe *child_pe, *tmp;
757
758 list_for_each_entry_safe(child_pe, tmp, &pe->child_list, child)
759 eeh_pe_cleanup(child_pe);
760
761 if (pe->state & EEH_PE_KEEP)
762 return;
763
764 if (!(pe->state & EEH_PE_INVALID))
765 return;
766
767 if (list_empty(&pe->edevs) && list_empty(&pe->child_list)) {
768 list_del(&pe->child);
769 kfree(pe);
770 }
771}
772
773/**
774 * eeh_slot_presence_check - Check if a device is still present in a slot
775 * @pdev: pci_dev to check
776 *
777 * This function may return a false positive if we can't determine the slot's
778 * presence state. This might happen for PCIe slots if the PE containing
779 * the upstream bridge is also frozen, or the bridge is part of the same PE
780 * as the device.
781 *
782 * This shouldn't happen often, but you might see it if you hotplug a PCIe
783 * switch.
784 */
785static bool eeh_slot_presence_check(struct pci_dev *pdev)
786{
787 const struct hotplug_slot_ops *ops;
788 struct pci_slot *slot;
789 u8 state;
790 int rc;
791
792 if (!pdev)
793 return false;
794
795 if (pdev->error_state == pci_channel_io_perm_failure)
796 return false;
797
798 slot = pdev->slot;
799 if (!slot || !slot->hotplug)
800 return true;
801
802 ops = slot->hotplug->ops;
803 if (!ops || !ops->get_adapter_status)
804 return true;
805
806 /* set the attention indicator while we've got the slot ops */
807 if (ops->set_attention_status)
808 ops->set_attention_status(slot->hotplug, 1);
809
810 rc = ops->get_adapter_status(slot->hotplug, &state);
811 if (rc)
812 return true;
813
814 return !!state;
815}
816
817static void eeh_clear_slot_attention(struct pci_dev *pdev)
818{
819 const struct hotplug_slot_ops *ops;
820 struct pci_slot *slot;
821
822 if (!pdev)
823 return;
824
825 if (pdev->error_state == pci_channel_io_perm_failure)
826 return;
827
828 slot = pdev->slot;
829 if (!slot || !slot->hotplug)
830 return;
831
832 ops = slot->hotplug->ops;
833 if (!ops || !ops->set_attention_status)
834 return;
835
836 ops->set_attention_status(slot->hotplug, 0);
837}
838
747/** 839/**
748 * eeh_handle_normal_event - Handle EEH events on a specific PE 840 * eeh_handle_normal_event - Handle EEH events on a specific PE
749 * @pe: EEH PE - which should not be used after we return, as it may 841 * @pe: EEH PE - which should not be used after we return, as it may
@@ -774,6 +866,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
774 enum pci_ers_result result = PCI_ERS_RESULT_NONE; 866 enum pci_ers_result result = PCI_ERS_RESULT_NONE;
775 struct eeh_rmv_data rmv_data = 867 struct eeh_rmv_data rmv_data =
776 {LIST_HEAD_INIT(rmv_data.removed_vf_list), 0}; 868 {LIST_HEAD_INIT(rmv_data.removed_vf_list), 0};
869 int devices = 0;
777 870
778 bus = eeh_pe_bus_get(pe); 871 bus = eeh_pe_bus_get(pe);
779 if (!bus) { 872 if (!bus) {
@@ -782,7 +875,59 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
782 return; 875 return;
783 } 876 }
784 877
785 eeh_pe_state_mark(pe, EEH_PE_RECOVERING); 878 /*
879 * When devices are hot-removed we might get an EEH due to
880 * a driver attempting to touch the MMIO space of a removed
881 * device. In this case we don't have a device to recover
882 * so suppress the event if we can't find any present devices.
883 *
884 * The hotplug driver should take care of tearing down the
885 * device itself.
886 */
887 eeh_for_each_pe(pe, tmp_pe)
888 eeh_pe_for_each_dev(tmp_pe, edev, tmp)
889 if (eeh_slot_presence_check(edev->pdev))
890 devices++;
891
892 if (!devices) {
893 pr_debug("EEH: Frozen PHB#%x-PE#%x is empty!\n",
894 pe->phb->global_number, pe->addr);
895 goto out; /* nothing to recover */
896 }
897
898 /* Log the event */
899 if (pe->type & EEH_PE_PHB) {
900 pr_err("EEH: PHB#%x failure detected, location: %s\n",
901 pe->phb->global_number, eeh_pe_loc_get(pe));
902 } else {
903 struct eeh_pe *phb_pe = eeh_phb_pe_get(pe->phb);
904
905 pr_err("EEH: Frozen PHB#%x-PE#%x detected\n",
906 pe->phb->global_number, pe->addr);
907 pr_err("EEH: PE location: %s, PHB location: %s\n",
908 eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe));
909 }
910
911#ifdef CONFIG_STACKTRACE
912 /*
913 * Print the saved stack trace now that we've verified there's
914 * something to recover.
915 */
916 if (pe->trace_entries) {
917 void **ptrs = (void **) pe->stack_trace;
918 int i;
919
920 pr_err("EEH: Frozen PHB#%x-PE#%x detected\n",
921 pe->phb->global_number, pe->addr);
922
923 /* FIXME: Use the same format as dump_stack() */
924 pr_err("EEH: Call Trace:\n");
925 for (i = 0; i < pe->trace_entries; i++)
926 pr_err("EEH: [%pK] %pS\n", ptrs[i], ptrs[i]);
927
928 pe->trace_entries = 0;
929 }
930#endif /* CONFIG_STACKTRACE */
786 931
787 eeh_pe_update_time_stamp(pe); 932 eeh_pe_update_time_stamp(pe);
788 pe->freeze_count++; 933 pe->freeze_count++;
@@ -793,6 +938,10 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
793 result = PCI_ERS_RESULT_DISCONNECT; 938 result = PCI_ERS_RESULT_DISCONNECT;
794 } 939 }
795 940
941 eeh_for_each_pe(pe, tmp_pe)
942 eeh_pe_for_each_dev(tmp_pe, edev, tmp)
943 edev->mode &= ~EEH_DEV_NO_HANDLER;
944
796 /* Walk the various device drivers attached to this slot through 945 /* Walk the various device drivers attached to this slot through
797 * a reset sequence, giving each an opportunity to do what it needs 946 * a reset sequence, giving each an opportunity to do what it needs
798 * to accomplish the reset. Each child gets a report of the 947 * to accomplish the reset. Each child gets a report of the
@@ -969,6 +1118,19 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
969 return; 1118 return;
970 } 1119 }
971 } 1120 }
1121
1122out:
1123 /*
1124 * Clean up any PEs without devices. While marked as EEH_PE_RECOVERING
1125 * we don't want to modify the PE tree structure, so we do it here.
1126 */
1127 eeh_pe_cleanup(pe);
1128
1129 /* clear the slot attention LED for all recovered devices */
1130 eeh_for_each_pe(pe, tmp_pe)
1131 eeh_pe_for_each_dev(tmp_pe, edev, tmp)
1132 eeh_clear_slot_attention(edev->pdev);
1133
972 eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); 1134 eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
973} 1135}
974 1136
@@ -981,7 +1143,8 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
981 */ 1143 */
982void eeh_handle_special_event(void) 1144void eeh_handle_special_event(void)
983{ 1145{
984 struct eeh_pe *pe, *phb_pe; 1146 struct eeh_pe *pe, *phb_pe, *tmp_pe;
1147 struct eeh_dev *edev, *tmp_edev;
985 struct pci_bus *bus; 1148 struct pci_bus *bus;
986 struct pci_controller *hose; 1149 struct pci_controller *hose;
987 unsigned long flags; 1150 unsigned long flags;
@@ -1040,6 +1203,7 @@ void eeh_handle_special_event(void)
1040 */ 1203 */
1041 if (rc == EEH_NEXT_ERR_FROZEN_PE || 1204 if (rc == EEH_NEXT_ERR_FROZEN_PE ||
1042 rc == EEH_NEXT_ERR_FENCED_PHB) { 1205 rc == EEH_NEXT_ERR_FENCED_PHB) {
1206 eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
1043 eeh_handle_normal_event(pe); 1207 eeh_handle_normal_event(pe);
1044 } else { 1208 } else {
1045 pci_lock_rescan_remove(); 1209 pci_lock_rescan_remove();
@@ -1050,6 +1214,10 @@ void eeh_handle_special_event(void)
1050 (phb_pe->state & EEH_PE_RECOVERING)) 1214 (phb_pe->state & EEH_PE_RECOVERING))
1051 continue; 1215 continue;
1052 1216
1217 eeh_for_each_pe(pe, tmp_pe)
1218 eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev)
1219 edev->mode &= ~EEH_DEV_NO_HANDLER;
1220
1053 /* Notify all devices to be down */ 1221 /* Notify all devices to be down */
1054 eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); 1222 eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
1055 eeh_set_channel_state(pe, pci_channel_io_perm_failure); 1223 eeh_set_channel_state(pe, pci_channel_io_perm_failure);
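The eeh_driver.c hunks above gate recovery on physical slot presence: before anything is logged or reset, every device in the PE is checked with eeh_slot_presence_check() and the event is dropped if nothing is present. A minimal sketch of that walk, not part of the patch (the helper name pe_has_present_devices() is invented here for illustration):

	/* Illustrative sketch only: mirrors the presence walk done in
	 * eeh_handle_normal_event() before recovery starts. */
	static bool pe_has_present_devices(struct eeh_pe *pe)
	{
		struct eeh_pe *tmp_pe;
		struct eeh_dev *edev, *tmp;
		int devices = 0;

		eeh_for_each_pe(pe, tmp_pe)
			eeh_pe_for_each_dev(tmp_pe, edev, tmp)
				if (eeh_slot_presence_check(edev->pdev))
					devices++;

		return devices != 0;
	}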
diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c
index 64cfbe41174b..a7a8dc182efb 100644
--- a/arch/powerpc/kernel/eeh_event.c
+++ b/arch/powerpc/kernel/eeh_event.c
@@ -40,7 +40,6 @@ static int eeh_event_handler(void * dummy)
40{ 40{
41 unsigned long flags; 41 unsigned long flags;
42 struct eeh_event *event; 42 struct eeh_event *event;
43 struct eeh_pe *pe;
44 43
45 while (!kthread_should_stop()) { 44 while (!kthread_should_stop()) {
46 if (wait_for_completion_interruptible(&eeh_eventlist_event)) 45 if (wait_for_completion_interruptible(&eeh_eventlist_event))
@@ -59,19 +58,10 @@ static int eeh_event_handler(void * dummy)
59 continue; 58 continue;
60 59
61 /* We might have event without binding PE */ 60 /* We might have event without binding PE */
62 pe = event->pe; 61 if (event->pe)
63 if (pe) { 62 eeh_handle_normal_event(event->pe);
64 if (pe->type & EEH_PE_PHB) 63 else
65 pr_info("EEH: Detected error on PHB#%x\n",
66 pe->phb->global_number);
67 else
68 pr_info("EEH: Detected PCI bus error on "
69 "PHB#%x-PE#%x\n",
70 pe->phb->global_number, pe->addr);
71 eeh_handle_normal_event(pe);
72 } else {
73 eeh_handle_special_event(); 64 eeh_handle_special_event();
74 }
75 65
76 kfree(event); 66 kfree(event);
77 } 67 }
@@ -121,6 +111,24 @@ int __eeh_send_failure_event(struct eeh_pe *pe)
121 } 111 }
122 event->pe = pe; 112 event->pe = pe;
123 113
114 /*
115 * Mark the PE as recovering before inserting it in the queue.
116 * This prevents the PE from being free()ed by a hotplug driver
117 * while the PE is sitting in the event queue.
118 */
119 if (pe) {
120#ifdef CONFIG_STACKTRACE
121 /*
122 * Save the current stack trace so we can dump it from the
123 * event handler thread.
124 */
125 pe->trace_entries = stack_trace_save(pe->stack_trace,
126 ARRAY_SIZE(pe->stack_trace), 0);
127#endif /* CONFIG_STACKTRACE */
128
129 eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
130 }
131
124 /* We may or may not be called in an interrupt context */ 132 /* We may or may not be called in an interrupt context */
125 spin_lock_irqsave(&eeh_eventlist_lock, flags); 133 spin_lock_irqsave(&eeh_eventlist_lock, flags);
126 list_add(&event->list, &eeh_eventlist); 134 list_add(&event->list, &eeh_eventlist);
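In eeh_event.c the PE is now pinned before the event is queued: the stack trace is captured and EEH_PE_RECOVERING is set while the submitter still owns the PE, so a hotplug-driven free cannot race with the queued event. A condensed sketch of that ordering (not the patch itself; the wrapper name queue_eeh_event() is invented and error handling is omitted):

	/* Illustrative sketch only: ordering now used by __eeh_send_failure_event(). */
	static void queue_eeh_event(struct eeh_event *event, struct eeh_pe *pe)
	{
		unsigned long flags;

	#ifdef CONFIG_STACKTRACE
		/* Save the stack now; it is printed later from the event handler thread. */
		pe->trace_entries = stack_trace_save(pe->stack_trace,
						     ARRAY_SIZE(pe->stack_trace), 0);
	#endif
		/* Mark the PE before it becomes visible on the event list. */
		eeh_pe_state_mark(pe, EEH_PE_RECOVERING);

		spin_lock_irqsave(&eeh_eventlist_lock, flags);
		list_add(&event->list, &eeh_eventlist);
		spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
	}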
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index 854cef7b18f4..177852e39a25 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -231,29 +231,22 @@ void *eeh_pe_traverse(struct eeh_pe *root,
231 * The function is used to traverse the devices of the specified 231 * The function is used to traverse the devices of the specified
232 * PE and its child PEs. 232 * PE and its child PEs.
233 */ 233 */
234void *eeh_pe_dev_traverse(struct eeh_pe *root, 234void eeh_pe_dev_traverse(struct eeh_pe *root,
235 eeh_edev_traverse_func fn, void *flag) 235 eeh_edev_traverse_func fn, void *flag)
236{ 236{
237 struct eeh_pe *pe; 237 struct eeh_pe *pe;
238 struct eeh_dev *edev, *tmp; 238 struct eeh_dev *edev, *tmp;
239 void *ret;
240 239
241 if (!root) { 240 if (!root) {
242 pr_warn("%s: Invalid PE %p\n", 241 pr_warn("%s: Invalid PE %p\n",
243 __func__, root); 242 __func__, root);
244 return NULL; 243 return;
245 } 244 }
246 245
247 /* Traverse root PE */ 246 /* Traverse root PE */
248 eeh_for_each_pe(root, pe) { 247 eeh_for_each_pe(root, pe)
249 eeh_pe_for_each_dev(pe, edev, tmp) { 248 eeh_pe_for_each_dev(pe, edev, tmp)
250 ret = fn(edev, flag); 249 fn(edev, flag);
251 if (ret)
252 return ret;
253 }
254 }
255
256 return NULL;
257} 250}
258 251
259/** 252/**
@@ -379,8 +372,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
379 372
380 /* Check if the PE number is valid */ 373 /* Check if the PE number is valid */
381 if (!eeh_has_flag(EEH_VALID_PE_ZERO) && !edev->pe_config_addr) { 374 if (!eeh_has_flag(EEH_VALID_PE_ZERO) && !edev->pe_config_addr) {
382 pr_err("%s: Invalid PE#0 for edev 0x%x on PHB#%x\n", 375 eeh_edev_err(edev, "PE#0 is invalid for this PHB!\n");
383 __func__, config_addr, pdn->phb->global_number);
384 return -EINVAL; 376 return -EINVAL;
385 } 377 }
386 378
@@ -391,42 +383,34 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
391 * components. 383 * components.
392 */ 384 */
393 pe = eeh_pe_get(pdn->phb, edev->pe_config_addr, config_addr); 385 pe = eeh_pe_get(pdn->phb, edev->pe_config_addr, config_addr);
394 if (pe && !(pe->type & EEH_PE_INVALID)) { 386 if (pe) {
395 /* Mark the PE as type of PCI bus */ 387 if (pe->type & EEH_PE_INVALID) {
396 pe->type = EEH_PE_BUS; 388 list_add_tail(&edev->entry, &pe->edevs);
397 edev->pe = pe; 389 edev->pe = pe;
398 390 /*
399 /* Put the edev to PE */ 391 * We get here because of PCI hotplug caused by
400 list_add_tail(&edev->entry, &pe->edevs); 392 * EEH recovery. We need to clear EEH_PE_INVALID up to the top.
401 pr_debug("EEH: Add %04x:%02x:%02x.%01x to Bus PE#%x\n", 393 */
402 pdn->phb->global_number, 394 parent = pe;
403 pdn->busno, 395 while (parent) {
404 PCI_SLOT(pdn->devfn), 396 if (!(parent->type & EEH_PE_INVALID))
405 PCI_FUNC(pdn->devfn), 397 break;
406 pe->addr); 398 parent->type &= ~EEH_PE_INVALID;
407 return 0; 399 parent = parent->parent;
408 } else if (pe && (pe->type & EEH_PE_INVALID)) { 400 }
409 list_add_tail(&edev->entry, &pe->edevs); 401
410 edev->pe = pe; 402 eeh_edev_dbg(edev,
411 /* 403 "Added to device PE (parent: PE#%x)\n",
412 * We're running to here because of PCI hotplug caused by 404 pe->parent->addr);
413 * EEH recovery. We need clear EEH_PE_INVALID until the top. 405 } else {
414 */ 406 /* Mark the PE as type of PCI bus */
415 parent = pe; 407 pe->type = EEH_PE_BUS;
416 while (parent) { 408 edev->pe = pe;
417 if (!(parent->type & EEH_PE_INVALID))
418 break;
419 parent->type &= ~EEH_PE_INVALID;
420 parent = parent->parent;
421 }
422 409
423 pr_debug("EEH: Add %04x:%02x:%02x.%01x to Device " 410 /* Put the edev to PE */
424 "PE#%x, Parent PE#%x\n", 411 list_add_tail(&edev->entry, &pe->edevs);
425 pdn->phb->global_number, 412 eeh_edev_dbg(edev, "Added to bus PE\n");
426 pdn->busno, 413 }
427 PCI_SLOT(pdn->devfn),
428 PCI_FUNC(pdn->devfn),
429 pe->addr, pe->parent->addr);
430 return 0; 414 return 0;
431 } 415 }
432 416
@@ -468,13 +452,8 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
468 list_add_tail(&pe->child, &parent->child_list); 452 list_add_tail(&pe->child, &parent->child_list);
469 list_add_tail(&edev->entry, &pe->edevs); 453 list_add_tail(&edev->entry, &pe->edevs);
470 edev->pe = pe; 454 edev->pe = pe;
471 pr_debug("EEH: Add %04x:%02x:%02x.%01x to " 455 eeh_edev_dbg(edev, "Added to device PE (parent: PE#%x)\n",
472 "Device PE#%x, Parent PE#%x\n", 456 pe->parent->addr);
473 pdn->phb->global_number,
474 pdn->busno,
475 PCI_SLOT(pdn->devfn),
476 PCI_FUNC(pdn->devfn),
477 pe->addr, pe->parent->addr);
478 457
479 return 0; 458 return 0;
480} 459}
@@ -491,16 +470,12 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
491int eeh_rmv_from_parent_pe(struct eeh_dev *edev) 470int eeh_rmv_from_parent_pe(struct eeh_dev *edev)
492{ 471{
493 struct eeh_pe *pe, *parent, *child; 472 struct eeh_pe *pe, *parent, *child;
473 bool keep, recover;
494 int cnt; 474 int cnt;
495 struct pci_dn *pdn = eeh_dev_to_pdn(edev);
496 475
497 pe = eeh_dev_to_pe(edev); 476 pe = eeh_dev_to_pe(edev);
498 if (!pe) { 477 if (!pe) {
499 pr_debug("%s: No PE found for device %04x:%02x:%02x.%01x\n", 478 eeh_edev_dbg(edev, "No PE found for device.\n");
500 __func__, pdn->phb->global_number,
501 pdn->busno,
502 PCI_SLOT(pdn->devfn),
503 PCI_FUNC(pdn->devfn));
504 return -EEXIST; 479 return -EEXIST;
505 } 480 }
506 481
@@ -516,10 +491,21 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev)
516 */ 491 */
517 while (1) { 492 while (1) {
518 parent = pe->parent; 493 parent = pe->parent;
494
495 /* PHB PEs should never be removed */
519 if (pe->type & EEH_PE_PHB) 496 if (pe->type & EEH_PE_PHB)
520 break; 497 break;
521 498
522 if (!(pe->state & EEH_PE_KEEP)) { 499 /*
500 * XXX: KEEP is set while resetting a PE. I don't think it's
501 * ever set without RECOVERING also being set. I could
502 * be wrong though so catch that with a WARN.
503 */
504 keep = !!(pe->state & EEH_PE_KEEP);
505 recover = !!(pe->state & EEH_PE_RECOVERING);
506 WARN_ON(keep && !recover);
507
508 if (!keep && !recover) {
523 if (list_empty(&pe->edevs) && 509 if (list_empty(&pe->edevs) &&
524 list_empty(&pe->child_list)) { 510 list_empty(&pe->child_list)) {
525 list_del(&pe->child); 511 list_del(&pe->child);
@@ -528,6 +514,15 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev)
528 break; 514 break;
529 } 515 }
530 } else { 516 } else {
517 /*
518 * Mark the PE as invalid. At the end of the recovery
519 * process any invalid PEs will be garbage collected.
520 *
521 * We need to delay the free()ing of them since we can
522 * remove edevs while traversing the PE tree, which
523 * might trigger the removal of a PE and we can't
524 * deal with that (yet).
525 */
531 if (list_empty(&pe->edevs)) { 526 if (list_empty(&pe->edevs)) {
532 cnt = 0; 527 cnt = 0;
533 list_for_each_entry(child, &pe->child_list, child) { 528 list_for_each_entry(child, &pe->child_list, child) {
@@ -623,13 +618,11 @@ void eeh_pe_mark_isolated(struct eeh_pe *root)
623} 618}
624EXPORT_SYMBOL_GPL(eeh_pe_mark_isolated); 619EXPORT_SYMBOL_GPL(eeh_pe_mark_isolated);
625 620
626static void *__eeh_pe_dev_mode_mark(struct eeh_dev *edev, void *flag) 621static void __eeh_pe_dev_mode_mark(struct eeh_dev *edev, void *flag)
627{ 622{
628 int mode = *((int *)flag); 623 int mode = *((int *)flag);
629 624
630 edev->mode |= mode; 625 edev->mode |= mode;
631
632 return NULL;
633} 626}
634 627
635/** 628/**
@@ -717,17 +710,13 @@ static void eeh_bridge_check_link(struct eeh_dev *edev)
717 if (!(edev->mode & (EEH_DEV_ROOT_PORT | EEH_DEV_DS_PORT))) 710 if (!(edev->mode & (EEH_DEV_ROOT_PORT | EEH_DEV_DS_PORT)))
718 return; 711 return;
719 712
720 pr_debug("%s: Check PCIe link for %04x:%02x:%02x.%01x ...\n", 713 eeh_edev_dbg(edev, "Checking PCIe link...\n");
721 __func__, pdn->phb->global_number,
722 pdn->busno,
723 PCI_SLOT(pdn->devfn),
724 PCI_FUNC(pdn->devfn));
725 714
726 /* Check slot status */ 715 /* Check slot status */
727 cap = edev->pcie_cap; 716 cap = edev->pcie_cap;
728 eeh_ops->read_config(pdn, cap + PCI_EXP_SLTSTA, 2, &val); 717 eeh_ops->read_config(pdn, cap + PCI_EXP_SLTSTA, 2, &val);
729 if (!(val & PCI_EXP_SLTSTA_PDS)) { 718 if (!(val & PCI_EXP_SLTSTA_PDS)) {
730 pr_debug(" No card in the slot (0x%04x) !\n", val); 719 eeh_edev_dbg(edev, "No card in the slot (0x%04x) !\n", val);
731 return; 720 return;
732 } 721 }
733 722
@@ -736,7 +725,7 @@ static void eeh_bridge_check_link(struct eeh_dev *edev)
736 if (val & PCI_EXP_SLTCAP_PCP) { 725 if (val & PCI_EXP_SLTCAP_PCP) {
737 eeh_ops->read_config(pdn, cap + PCI_EXP_SLTCTL, 2, &val); 726 eeh_ops->read_config(pdn, cap + PCI_EXP_SLTCTL, 2, &val);
738 if (val & PCI_EXP_SLTCTL_PCC) { 727 if (val & PCI_EXP_SLTCTL_PCC) {
739 pr_debug(" In power-off state, power it on ...\n"); 728 eeh_edev_dbg(edev, "In power-off state, power it on ...\n");
740 val &= ~(PCI_EXP_SLTCTL_PCC | PCI_EXP_SLTCTL_PIC); 729 val &= ~(PCI_EXP_SLTCTL_PCC | PCI_EXP_SLTCTL_PIC);
741 val |= (0x0100 & PCI_EXP_SLTCTL_PIC); 730 val |= (0x0100 & PCI_EXP_SLTCTL_PIC);
742 eeh_ops->write_config(pdn, cap + PCI_EXP_SLTCTL, 2, val); 731 eeh_ops->write_config(pdn, cap + PCI_EXP_SLTCTL, 2, val);
@@ -752,7 +741,7 @@ static void eeh_bridge_check_link(struct eeh_dev *edev)
752 /* Check link */ 741 /* Check link */
753 eeh_ops->read_config(pdn, cap + PCI_EXP_LNKCAP, 4, &val); 742 eeh_ops->read_config(pdn, cap + PCI_EXP_LNKCAP, 4, &val);
754 if (!(val & PCI_EXP_LNKCAP_DLLLARC)) { 743 if (!(val & PCI_EXP_LNKCAP_DLLLARC)) {
755 pr_debug(" No link reporting capability (0x%08x) \n", val); 744 eeh_edev_dbg(edev, "No link reporting capability (0x%08x) \n", val);
756 msleep(1000); 745 msleep(1000);
757 return; 746 return;
758 } 747 }
@@ -769,10 +758,10 @@ static void eeh_bridge_check_link(struct eeh_dev *edev)
769 } 758 }
770 759
771 if (val & PCI_EXP_LNKSTA_DLLLA) 760 if (val & PCI_EXP_LNKSTA_DLLLA)
772 pr_debug(" Link up (%s)\n", 761 eeh_edev_dbg(edev, "Link up (%s)\n",
773 (val & PCI_EXP_LNKSTA_CLS_2_5GB) ? "2.5GB" : "5GB"); 762 (val & PCI_EXP_LNKSTA_CLS_2_5GB) ? "2.5GB" : "5GB");
774 else 763 else
775 pr_debug(" Link not ready (0x%04x)\n", val); 764 eeh_edev_dbg(edev, "Link not ready (0x%04x)\n", val);
776} 765}
777 766
778#define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF)) 767#define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF))
@@ -852,7 +841,7 @@ static void eeh_restore_device_bars(struct eeh_dev *edev)
852 * the expansion ROM base address, the latency timer, and etc. 841 * the expansion ROM base address, the latency timer, and etc.
853 * from the saved values in the device node. 842 * from the saved values in the device node.
854 */ 843 */
855static void *eeh_restore_one_device_bars(struct eeh_dev *edev, void *flag) 844static void eeh_restore_one_device_bars(struct eeh_dev *edev, void *flag)
856{ 845{
857 struct pci_dn *pdn = eeh_dev_to_pdn(edev); 846 struct pci_dn *pdn = eeh_dev_to_pdn(edev);
858 847
@@ -864,8 +853,6 @@ static void *eeh_restore_one_device_bars(struct eeh_dev *edev, void *flag)
864 853
865 if (eeh_ops->restore_config && pdn) 854 if (eeh_ops->restore_config && pdn)
866 eeh_ops->restore_config(pdn); 855 eeh_ops->restore_config(pdn);
867
868 return NULL;
869} 856}
870 857
871/** 858/**
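The eeh_pe.c hunks above drop the void * return from the traversal callbacks, since no caller ever aborted the walk early. A minimal sketch, not from the patch, of what an eeh_edev_traverse_func looks like after the change (the callback name is illustrative):

	/* Illustrative sketch only: traversal callbacks now return void. */
	static void eeh_dev_set_mode(struct eeh_dev *edev, void *flag)
	{
		int mode = *((int *)flag);

		edev->mode |= mode;
	}

	/* A walk no longer checks a return value:
	 *	eeh_pe_dev_traverse(pe, eeh_dev_set_mode, &mode);
	 */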
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 54fab22c9a43..d60908ea37fb 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -230,7 +230,7 @@ transfer_to_handler_cont:
230 */ 230 */
231 lis r12,reenable_mmu@h 231 lis r12,reenable_mmu@h
232 ori r12,r12,reenable_mmu@l 232 ori r12,r12,reenable_mmu@l
233 LOAD_MSR_KERNEL(r0, MSR_KERNEL) 233 LOAD_REG_IMMEDIATE(r0, MSR_KERNEL)
234 mtspr SPRN_SRR0,r12 234 mtspr SPRN_SRR0,r12
235 mtspr SPRN_SRR1,r0 235 mtspr SPRN_SRR1,r0
236 SYNC 236 SYNC
@@ -304,7 +304,7 @@ stack_ovf:
304 addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD 304 addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD
305 lis r9,StackOverflow@ha 305 lis r9,StackOverflow@ha
306 addi r9,r9,StackOverflow@l 306 addi r9,r9,StackOverflow@l
307 LOAD_MSR_KERNEL(r10,MSR_KERNEL) 307 LOAD_REG_IMMEDIATE(r10,MSR_KERNEL)
308#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS) 308#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS)
309 mtspr SPRN_NRI, r0 309 mtspr SPRN_NRI, r0
310#endif 310#endif
@@ -324,7 +324,7 @@ trace_syscall_entry_irq_off:
324 bl trace_hardirqs_on 324 bl trace_hardirqs_on
325 325
326 /* Now enable for real */ 326 /* Now enable for real */
327 LOAD_MSR_KERNEL(r10, MSR_KERNEL | MSR_EE) 327 LOAD_REG_IMMEDIATE(r10, MSR_KERNEL | MSR_EE)
328 mtmsr r10 328 mtmsr r10
329 329
330 REST_GPR(0, r1) 330 REST_GPR(0, r1)
@@ -394,7 +394,7 @@ ret_from_syscall:
394#endif 394#endif
395 mr r6,r3 395 mr r6,r3
396 /* disable interrupts so current_thread_info()->flags can't change */ 396 /* disable interrupts so current_thread_info()->flags can't change */
397 LOAD_MSR_KERNEL(r10,MSR_KERNEL) /* doesn't include MSR_EE */ 397 LOAD_REG_IMMEDIATE(r10,MSR_KERNEL) /* doesn't include MSR_EE */
398 /* Note: We don't bother telling lockdep about it */ 398 /* Note: We don't bother telling lockdep about it */
399 SYNC 399 SYNC
400 MTMSRD(r10) 400 MTMSRD(r10)
@@ -777,11 +777,19 @@ fast_exception_return:
7771: lis r3,exc_exit_restart_end@ha 7771: lis r3,exc_exit_restart_end@ha
778 addi r3,r3,exc_exit_restart_end@l 778 addi r3,r3,exc_exit_restart_end@l
779 cmplw r12,r3 779 cmplw r12,r3
780#ifdef CONFIG_PPC_BOOK3S_601
781 bge 2b
782#else
780 bge 3f 783 bge 3f
784#endif
781 lis r4,exc_exit_restart@ha 785 lis r4,exc_exit_restart@ha
782 addi r4,r4,exc_exit_restart@l 786 addi r4,r4,exc_exit_restart@l
783 cmplw r12,r4 787 cmplw r12,r4
788#ifdef CONFIG_PPC_BOOK3S_601
789 blt 2b
790#else
784 blt 3f 791 blt 3f
792#endif
785 lis r3,fee_restarts@ha 793 lis r3,fee_restarts@ha
786 tophys(r3,r3) 794 tophys(r3,r3)
787 lwz r5,fee_restarts@l(r3) 795 lwz r5,fee_restarts@l(r3)
@@ -800,9 +808,6 @@ fee_restarts:
800/* aargh, we don't know which trap this is */ 808/* aargh, we don't know which trap this is */
801/* but the 601 doesn't implement the RI bit, so assume it's OK */ 809/* but the 601 doesn't implement the RI bit, so assume it's OK */
8023: 8103:
803BEGIN_FTR_SECTION
804 b 2b
805END_FTR_SECTION_IFSET(CPU_FTR_601)
806 li r10,-1 811 li r10,-1
807 stw r10,_TRAP(r11) 812 stw r10,_TRAP(r11)
808 addi r3,r1,STACK_FRAME_OVERHEAD 813 addi r3,r1,STACK_FRAME_OVERHEAD
@@ -824,7 +829,7 @@ ret_from_except:
824 * can't change between when we test it and when we return 829 * can't change between when we test it and when we return
825 * from the interrupt. */ 830 * from the interrupt. */
826 /* Note: We don't bother telling lockdep about it */ 831 /* Note: We don't bother telling lockdep about it */
827 LOAD_MSR_KERNEL(r10,MSR_KERNEL) 832 LOAD_REG_IMMEDIATE(r10,MSR_KERNEL)
828 SYNC /* Some chip revs have problems here... */ 833 SYNC /* Some chip revs have problems here... */
829 MTMSRD(r10) /* disable interrupts */ 834 MTMSRD(r10) /* disable interrupts */
830 835
@@ -991,7 +996,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX)
991 * can restart the exception exit path at the label 996 * can restart the exception exit path at the label
992 * exc_exit_restart below. -- paulus 997 * exc_exit_restart below. -- paulus
993 */ 998 */
994 LOAD_MSR_KERNEL(r10,MSR_KERNEL & ~MSR_RI) 999 LOAD_REG_IMMEDIATE(r10,MSR_KERNEL & ~MSR_RI)
995 SYNC 1000 SYNC
996 MTMSRD(r10) /* clear the RI bit */ 1001 MTMSRD(r10) /* clear the RI bit */
997 .globl exc_exit_restart 1002 .globl exc_exit_restart
@@ -1066,7 +1071,7 @@ exc_exit_restart_end:
1066 REST_NVGPRS(r1); \ 1071 REST_NVGPRS(r1); \
1067 lwz r3,_MSR(r1); \ 1072 lwz r3,_MSR(r1); \
1068 andi. r3,r3,MSR_PR; \ 1073 andi. r3,r3,MSR_PR; \
1069 LOAD_MSR_KERNEL(r10,MSR_KERNEL); \ 1074 LOAD_REG_IMMEDIATE(r10,MSR_KERNEL); \
1070 bne user_exc_return; \ 1075 bne user_exc_return; \
1071 lwz r0,GPR0(r1); \ 1076 lwz r0,GPR0(r1); \
1072 lwz r2,GPR2(r1); \ 1077 lwz r2,GPR2(r1); \
@@ -1236,7 +1241,7 @@ recheck:
1236 * neither. Those disable/enable cycles used to peek at 1241 * neither. Those disable/enable cycles used to peek at
1237 * TI_FLAGS aren't advertised. 1242 * TI_FLAGS aren't advertised.
1238 */ 1243 */
1239 LOAD_MSR_KERNEL(r10,MSR_KERNEL) 1244 LOAD_REG_IMMEDIATE(r10,MSR_KERNEL)
1240 SYNC 1245 SYNC
1241 MTMSRD(r10) /* disable interrupts */ 1246 MTMSRD(r10) /* disable interrupts */
1242 lwz r9,TI_FLAGS(r2) 1247 lwz r9,TI_FLAGS(r2)
@@ -1270,11 +1275,19 @@ nonrecoverable:
1270 lis r10,exc_exit_restart_end@ha 1275 lis r10,exc_exit_restart_end@ha
1271 addi r10,r10,exc_exit_restart_end@l 1276 addi r10,r10,exc_exit_restart_end@l
1272 cmplw r12,r10 1277 cmplw r12,r10
1278#ifdef CONFIG_PPC_BOOK3S_601
1279 bgelr
1280#else
1273 bge 3f 1281 bge 3f
1282#endif
1274 lis r11,exc_exit_restart@ha 1283 lis r11,exc_exit_restart@ha
1275 addi r11,r11,exc_exit_restart@l 1284 addi r11,r11,exc_exit_restart@l
1276 cmplw r12,r11 1285 cmplw r12,r11
1286#ifdef CONFIG_PPC_BOOK3S_601
1287 bltlr
1288#else
1277 blt 3f 1289 blt 3f
1290#endif
1278 lis r10,ee_restarts@ha 1291 lis r10,ee_restarts@ha
1279 lwz r12,ee_restarts@l(r10) 1292 lwz r12,ee_restarts@l(r10)
1280 addi r12,r12,1 1293 addi r12,r12,1
@@ -1283,9 +1296,6 @@ nonrecoverable:
1283 blr 1296 blr
12843: /* OK, we can't recover, kill this process */ 12973: /* OK, we can't recover, kill this process */
1285 /* but the 601 doesn't implement the RI bit, so assume it's OK */ 1298 /* but the 601 doesn't implement the RI bit, so assume it's OK */
1286BEGIN_FTR_SECTION
1287 blr
1288END_FTR_SECTION_IFSET(CPU_FTR_601)
1289 lwz r3,_TRAP(r1) 1299 lwz r3,_TRAP(r1)
1290 andi. r0,r3,1 1300 andi. r0,r3,1
1291 beq 5f 1301 beq 5f
@@ -1329,7 +1339,7 @@ _GLOBAL(enter_rtas)
1329 lwz r4,RTASBASE(r4) 1339 lwz r4,RTASBASE(r4)
1330 mfmsr r9 1340 mfmsr r9
1331 stw r9,8(r1) 1341 stw r9,8(r1)
1332 LOAD_MSR_KERNEL(r0,MSR_KERNEL) 1342 LOAD_REG_IMMEDIATE(r0,MSR_KERNEL)
1333 SYNC /* disable interrupts so SRR0/1 */ 1343 SYNC /* disable interrupts so SRR0/1 */
1334 MTMSRD(r0) /* don't get trashed */ 1344 MTMSRD(r0) /* don't get trashed */
1335 li r9,MSR_KERNEL & ~(MSR_IR|MSR_DR) 1345 li r9,MSR_KERNEL & ~(MSR_IR|MSR_DR)
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 0a0b5310f54a..6467bdab8d40 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -69,24 +69,20 @@ BEGIN_FTR_SECTION
69 bne .Ltabort_syscall 69 bne .Ltabort_syscall
70END_FTR_SECTION_IFSET(CPU_FTR_TM) 70END_FTR_SECTION_IFSET(CPU_FTR_TM)
71#endif 71#endif
72 andi. r10,r12,MSR_PR
73 mr r10,r1 72 mr r10,r1
74 addi r1,r1,-INT_FRAME_SIZE
75 beq- 1f
76 ld r1,PACAKSAVE(r13) 73 ld r1,PACAKSAVE(r13)
771: std r10,0(r1) 74 std r10,0(r1)
78 std r11,_NIP(r1) 75 std r11,_NIP(r1)
79 std r12,_MSR(r1) 76 std r12,_MSR(r1)
80 std r0,GPR0(r1) 77 std r0,GPR0(r1)
81 std r10,GPR1(r1) 78 std r10,GPR1(r1)
82 beq 2f /* if from kernel mode */
83#ifdef CONFIG_PPC_FSL_BOOK3E 79#ifdef CONFIG_PPC_FSL_BOOK3E
84START_BTB_FLUSH_SECTION 80START_BTB_FLUSH_SECTION
85 BTB_FLUSH(r10) 81 BTB_FLUSH(r10)
86END_BTB_FLUSH_SECTION 82END_BTB_FLUSH_SECTION
87#endif 83#endif
88 ACCOUNT_CPU_USER_ENTRY(r13, r10, r11) 84 ACCOUNT_CPU_USER_ENTRY(r13, r10, r11)
892: std r2,GPR2(r1) 85 std r2,GPR2(r1)
90 std r3,GPR3(r1) 86 std r3,GPR3(r1)
91 mfcr r2 87 mfcr r2
92 std r4,GPR4(r1) 88 std r4,GPR4(r1)
@@ -122,14 +118,13 @@ END_BTB_FLUSH_SECTION
122 118
123#if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(CONFIG_PPC_SPLPAR) 119#if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(CONFIG_PPC_SPLPAR)
124BEGIN_FW_FTR_SECTION 120BEGIN_FW_FTR_SECTION
125 beq 33f 121 /* see if there are any DTL entries to process */
126 /* if from user, see if there are any DTL entries to process */
127 ld r10,PACALPPACAPTR(r13) /* get ptr to VPA */ 122 ld r10,PACALPPACAPTR(r13) /* get ptr to VPA */
128 ld r11,PACA_DTL_RIDX(r13) /* get log read index */ 123 ld r11,PACA_DTL_RIDX(r13) /* get log read index */
129 addi r10,r10,LPPACA_DTLIDX 124 addi r10,r10,LPPACA_DTLIDX
130 LDX_BE r10,0,r10 /* get log write index */ 125 LDX_BE r10,0,r10 /* get log write index */
131 cmpd cr1,r11,r10 126 cmpd r11,r10
132 beq+ cr1,33f 127 beq+ 33f
133 bl accumulate_stolen_time 128 bl accumulate_stolen_time
134 REST_GPR(0,r1) 129 REST_GPR(0,r1)
135 REST_4GPRS(3,r1) 130 REST_4GPRS(3,r1)
@@ -203,6 +198,7 @@ system_call: /* label this so stack traces look sane */
203 mtctr r12 198 mtctr r12
204 bctrl /* Call handler */ 199 bctrl /* Call handler */
205 200
201 /* syscall_exit can exit to kernel mode, via ret_from_kernel_thread */
206.Lsyscall_exit: 202.Lsyscall_exit:
207 std r3,RESULT(r1) 203 std r3,RESULT(r1)
208 204
@@ -216,11 +212,6 @@ system_call: /* label this so stack traces look sane */
216 ld r12, PACA_THREAD_INFO(r13) 212 ld r12, PACA_THREAD_INFO(r13)
217 213
218 ld r8,_MSR(r1) 214 ld r8,_MSR(r1)
219#ifdef CONFIG_PPC_BOOK3S
220 /* No MSR:RI on BookE */
221 andi. r10,r8,MSR_RI
222 beq- .Lunrecov_restore
223#endif
224 215
225/* 216/*
226 * This is a few instructions into the actual syscall exit path (which actually 217 * This is a few instructions into the actual syscall exit path (which actually
diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S
index 1cfb3da4a84a..829950b96d29 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -750,12 +750,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
750 ld r15,PACATOC(r13) 750 ld r15,PACATOC(r13)
751 ld r14,interrupt_base_book3e@got(r15) 751 ld r14,interrupt_base_book3e@got(r15)
752 ld r15,__end_interrupts@got(r15) 752 ld r15,__end_interrupts@got(r15)
753#else
754 LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e)
755 LOAD_REG_IMMEDIATE(r15,__end_interrupts)
756#endif
757 cmpld cr0,r10,r14 753 cmpld cr0,r10,r14
758 cmpld cr1,r10,r15 754 cmpld cr1,r10,r15
755#else
756 LOAD_REG_IMMEDIATE_SYM(r14, r15, interrupt_base_book3e)
757 cmpld cr0, r10, r14
758 LOAD_REG_IMMEDIATE_SYM(r14, r15, __end_interrupts)
759 cmpld cr1, r10, r14
760#endif
759 blt+ cr0,1f 761 blt+ cr0,1f
760 bge+ cr1,1f 762 bge+ cr1,1f
761 763
@@ -820,12 +822,14 @@ kernel_dbg_exc:
820 ld r15,PACATOC(r13) 822 ld r15,PACATOC(r13)
821 ld r14,interrupt_base_book3e@got(r15) 823 ld r14,interrupt_base_book3e@got(r15)
822 ld r15,__end_interrupts@got(r15) 824 ld r15,__end_interrupts@got(r15)
823#else
824 LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e)
825 LOAD_REG_IMMEDIATE(r15,__end_interrupts)
826#endif
827 cmpld cr0,r10,r14 825 cmpld cr0,r10,r14
828 cmpld cr1,r10,r15 826 cmpld cr1,r10,r15
827#else
828 LOAD_REG_IMMEDIATE_SYM(r14, r15, interrupt_base_book3e)
829 cmpld cr0, r10, r14
830 LOAD_REG_IMMEDIATE_SYM(r14, r15,__end_interrupts)
831 cmpld cr1, r10, r14
832#endif
829 blt+ cr0,1f 833 blt+ cr0,1f
830 bge+ cr1,1f 834 bge+ cr1,1f
831 835
@@ -1449,7 +1453,7 @@ a2_tlbinit_code_start:
1449a2_tlbinit_after_linear_map: 1453a2_tlbinit_after_linear_map:
1450 1454
1451 /* Now we branch the new virtual address mapped by this entry */ 1455 /* Now we branch the new virtual address mapped by this entry */
1452 LOAD_REG_IMMEDIATE(r3,1f) 1456 LOAD_REG_IMMEDIATE_SYM(r3, r5, 1f)
1453 mtctr r3 1457 mtctr r3
1454 bctr 1458 bctr
1455 1459
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 6ba3cc2ef8ab..d0018dd17e0a 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -44,6 +44,58 @@
44#endif 44#endif
45 45
46/* 46/*
47 * Following are fixed section helper macros.
48 *
49 * EXC_REAL_BEGIN/END - real, unrelocated exception vectors
50 * EXC_VIRT_BEGIN/END - virt (AIL), unrelocated exception vectors
51 * TRAMP_REAL_BEGIN - real, unrelocated helpers (virt may call these)
52 * TRAMP_VIRT_BEGIN - virt, unreloc helpers (in practice, real can use)
53 * TRAMP_KVM_BEGIN - KVM handlers, these are put into real, unrelocated
54 * EXC_COMMON - After switching to virtual, relocated mode.
55 */
56
57#define EXC_REAL_BEGIN(name, start, size) \
58 FIXED_SECTION_ENTRY_BEGIN_LOCATION(real_vectors, exc_real_##start##_##name, start, size)
59
60#define EXC_REAL_END(name, start, size) \
61 FIXED_SECTION_ENTRY_END_LOCATION(real_vectors, exc_real_##start##_##name, start, size)
62
63#define EXC_VIRT_BEGIN(name, start, size) \
64 FIXED_SECTION_ENTRY_BEGIN_LOCATION(virt_vectors, exc_virt_##start##_##name, start, size)
65
66#define EXC_VIRT_END(name, start, size) \
67 FIXED_SECTION_ENTRY_END_LOCATION(virt_vectors, exc_virt_##start##_##name, start, size)
68
69#define EXC_COMMON_BEGIN(name) \
70 USE_TEXT_SECTION(); \
71 .balign IFETCH_ALIGN_BYTES; \
72 .global name; \
73 _ASM_NOKPROBE_SYMBOL(name); \
74 DEFINE_FIXED_SYMBOL(name); \
75name:
76
77#define TRAMP_REAL_BEGIN(name) \
78 FIXED_SECTION_ENTRY_BEGIN(real_trampolines, name)
79
80#define TRAMP_VIRT_BEGIN(name) \
81 FIXED_SECTION_ENTRY_BEGIN(virt_trampolines, name)
82
83#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
84#define TRAMP_KVM_BEGIN(name) \
85 TRAMP_VIRT_BEGIN(name)
86#else
87#define TRAMP_KVM_BEGIN(name)
88#endif
89
90#define EXC_REAL_NONE(start, size) \
91 FIXED_SECTION_ENTRY_BEGIN_LOCATION(real_vectors, exc_real_##start##_##unused, start, size); \
92 FIXED_SECTION_ENTRY_END_LOCATION(real_vectors, exc_real_##start##_##unused, start, size)
93
94#define EXC_VIRT_NONE(start, size) \
95 FIXED_SECTION_ENTRY_BEGIN_LOCATION(virt_vectors, exc_virt_##start##_##unused, start, size); \
96 FIXED_SECTION_ENTRY_END_LOCATION(virt_vectors, exc_virt_##start##_##unused, start, size)
97
98/*
47 * We're short on space and time in the exception prolog, so we can't 99 * We're short on space and time in the exception prolog, so we can't
48 * use the normal LOAD_REG_IMMEDIATE macro to load the address of label. 100 * use the normal LOAD_REG_IMMEDIATE macro to load the address of label.
49 * Instead we get the base of the kernel from paca->kernelbase and or in the low 101 * Instead we get the base of the kernel from paca->kernelbase and or in the low
@@ -68,6 +120,7 @@
68 addis reg,reg,(ABS_ADDR(label))@h 120 addis reg,reg,(ABS_ADDR(label))@h
69 121
70/* Exception register prefixes */ 122/* Exception register prefixes */
123#define EXC_HV_OR_STD 2 /* depends on HVMODE */
71#define EXC_HV 1 124#define EXC_HV 1
72#define EXC_STD 0 125#define EXC_STD 0
73 126
@@ -127,126 +180,6 @@ BEGIN_FTR_SECTION_NESTED(943) \
127 std ra,offset(r13); \ 180 std ra,offset(r13); \
128END_FTR_SECTION_NESTED(ftr,ftr,943) 181END_FTR_SECTION_NESTED(ftr,ftr,943)
129 182
130.macro EXCEPTION_PROLOG_0 area
131 SET_SCRATCH0(r13) /* save r13 */
132 GET_PACA(r13)
133 std r9,\area\()+EX_R9(r13) /* save r9 */
134 OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR)
135 HMT_MEDIUM
136 std r10,\area\()+EX_R10(r13) /* save r10 - r12 */
137 OPT_GET_SPR(r10, SPRN_CFAR, CPU_FTR_CFAR)
138.endm
139
140.macro EXCEPTION_PROLOG_1 hsrr, area, kvm, vec, dar, dsisr, bitmask
141 OPT_SAVE_REG_TO_PACA(\area\()+EX_PPR, r9, CPU_FTR_HAS_PPR)
142 OPT_SAVE_REG_TO_PACA(\area\()+EX_CFAR, r10, CPU_FTR_CFAR)
143 INTERRUPT_TO_KERNEL
144 SAVE_CTR(r10, \area\())
145 mfcr r9
146 .if \kvm
147 KVMTEST \hsrr \vec
148 .endif
149 .if \bitmask
150 lbz r10,PACAIRQSOFTMASK(r13)
151 andi. r10,r10,\bitmask
152 /* Associate vector numbers with bits in paca->irq_happened */
153 .if \vec == 0x500 || \vec == 0xea0
154 li r10,PACA_IRQ_EE
155 .elseif \vec == 0x900
156 li r10,PACA_IRQ_DEC
157 .elseif \vec == 0xa00 || \vec == 0xe80
158 li r10,PACA_IRQ_DBELL
159 .elseif \vec == 0xe60
160 li r10,PACA_IRQ_HMI
161 .elseif \vec == 0xf00
162 li r10,PACA_IRQ_PMI
163 .else
164 .abort "Bad maskable vector"
165 .endif
166
167 .if \hsrr
168 bne masked_Hinterrupt
169 .else
170 bne masked_interrupt
171 .endif
172 .endif
173
174 std r11,\area\()+EX_R11(r13)
175 std r12,\area\()+EX_R12(r13)
176
177 /*
178 * DAR/DSISR, SCRATCH0 must be read before setting MSR[RI],
179 * because a d-side MCE will clobber those registers so is
180 * not recoverable if they are live.
181 */
182 GET_SCRATCH0(r10)
183 std r10,\area\()+EX_R13(r13)
184 .if \dar
185 mfspr r10,SPRN_DAR
186 std r10,\area\()+EX_DAR(r13)
187 .endif
188 .if \dsisr
189 mfspr r10,SPRN_DSISR
190 stw r10,\area\()+EX_DSISR(r13)
191 .endif
192.endm
193
194.macro EXCEPTION_PROLOG_2_REAL label, hsrr, set_ri
195 ld r10,PACAKMSR(r13) /* get MSR value for kernel */
196 .if ! \set_ri
197 xori r10,r10,MSR_RI /* Clear MSR_RI */
198 .endif
199 .if \hsrr
200 mfspr r11,SPRN_HSRR0 /* save HSRR0 */
201 mfspr r12,SPRN_HSRR1 /* and HSRR1 */
202 mtspr SPRN_HSRR1,r10
203 .else
204 mfspr r11,SPRN_SRR0 /* save SRR0 */
205 mfspr r12,SPRN_SRR1 /* and SRR1 */
206 mtspr SPRN_SRR1,r10
207 .endif
208 LOAD_HANDLER(r10, \label\())
209 .if \hsrr
210 mtspr SPRN_HSRR0,r10
211 HRFI_TO_KERNEL
212 .else
213 mtspr SPRN_SRR0,r10
214 RFI_TO_KERNEL
215 .endif
216 b . /* prevent speculative execution */
217.endm
218
219.macro EXCEPTION_PROLOG_2_VIRT label, hsrr
220#ifdef CONFIG_RELOCATABLE
221 .if \hsrr
222 mfspr r11,SPRN_HSRR0 /* save HSRR0 */
223 .else
224 mfspr r11,SPRN_SRR0 /* save SRR0 */
225 .endif
226 LOAD_HANDLER(r12, \label\())
227 mtctr r12
228 .if \hsrr
229 mfspr r12,SPRN_HSRR1 /* and HSRR1 */
230 .else
231 mfspr r12,SPRN_SRR1 /* and SRR1 */
232 .endif
233 li r10,MSR_RI
234 mtmsrd r10,1 /* Set RI (EE=0) */
235 bctr
236#else
237 .if \hsrr
238 mfspr r11,SPRN_HSRR0 /* save HSRR0 */
239 mfspr r12,SPRN_HSRR1 /* and HSRR1 */
240 .else
241 mfspr r11,SPRN_SRR0 /* save SRR0 */
242 mfspr r12,SPRN_SRR1 /* and SRR1 */
243 .endif
244 li r10,MSR_RI
245 mtmsrd r10,1 /* Set RI (EE=0) */
246 b \label
247#endif
248.endm
249
250/* 183/*
251 * Branch to label using its 0xC000 address. This results in instruction 184 * Branch to label using its 0xC000 address. This results in instruction
252 * address suitable for MSR[IR]=0 or 1, which allows relocation to be turned 185 * address suitable for MSR[IR]=0 or 1, which allows relocation to be turned
@@ -260,6 +193,11 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
260 mtctr reg; \ 193 mtctr reg; \
261 bctr 194 bctr
262 195
196.macro INT_KVM_HANDLER name, vec, hsrr, area, skip
197 TRAMP_KVM_BEGIN(\name\()_kvm)
198 KVM_HANDLER \vec, \hsrr, \area, \skip
199.endm
200
263#ifdef CONFIG_KVM_BOOK3S_64_HANDLER 201#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
264#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE 202#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
265/* 203/*
@@ -272,17 +210,13 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
272#define kvmppc_interrupt kvmppc_interrupt_pr 210#define kvmppc_interrupt kvmppc_interrupt_pr
273#endif 211#endif
274 212
275.macro KVMTEST hsrr, n 213.macro KVMTEST name, hsrr, n
276 lbz r10,HSTATE_IN_GUEST(r13) 214 lbz r10,HSTATE_IN_GUEST(r13)
277 cmpwi r10,0 215 cmpwi r10,0
278 .if \hsrr 216 bne \name\()_kvm
279 bne do_kvm_H\n
280 .else
281 bne do_kvm_\n
282 .endif
283.endm 217.endm
284 218
285.macro KVM_HANDLER area, hsrr, n, skip 219.macro KVM_HANDLER vec, hsrr, area, skip
286 .if \skip 220 .if \skip
287 cmpwi r10,KVM_GUEST_MODE_SKIP 221 cmpwi r10,KVM_GUEST_MODE_SKIP
288 beq 89f 222 beq 89f
@@ -301,10 +235,16 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948)
301 std r12,HSTATE_SCRATCH0(r13) 235 std r12,HSTATE_SCRATCH0(r13)
302 sldi r12,r9,32 236 sldi r12,r9,32
303 /* HSRR variants have the 0x2 bit added to their trap number */ 237 /* HSRR variants have the 0x2 bit added to their trap number */
304 .if \hsrr 238 .if \hsrr == EXC_HV_OR_STD
305 ori r12,r12,(\n + 0x2) 239 BEGIN_FTR_SECTION
240 ori r12,r12,(\vec + 0x2)
241 FTR_SECTION_ELSE
242 ori r12,r12,(\vec)
243 ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
244 .elseif \hsrr
245 ori r12,r12,(\vec + 0x2)
306 .else 246 .else
307 ori r12,r12,(\n) 247 ori r12,r12,(\vec)
308 .endif 248 .endif
309 249
310#ifdef CONFIG_RELOCATABLE 250#ifdef CONFIG_RELOCATABLE
@@ -329,7 +269,13 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948)
32989: mtocrf 0x80,r9 26989: mtocrf 0x80,r9
330 ld r9,\area+EX_R9(r13) 270 ld r9,\area+EX_R9(r13)
331 ld r10,\area+EX_R10(r13) 271 ld r10,\area+EX_R10(r13)
332 .if \hsrr 272 .if \hsrr == EXC_HV_OR_STD
273 BEGIN_FTR_SECTION
274 b kvmppc_skip_Hinterrupt
275 FTR_SECTION_ELSE
276 b kvmppc_skip_interrupt
277 ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
278 .elseif \hsrr
333 b kvmppc_skip_Hinterrupt 279 b kvmppc_skip_Hinterrupt
334 .else 280 .else
335 b kvmppc_skip_interrupt 281 b kvmppc_skip_interrupt
@@ -338,88 +284,328 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948)
338.endm 284.endm
339 285
340#else 286#else
341.macro KVMTEST hsrr, n 287.macro KVMTEST name, hsrr, n
288.endm
289.macro KVM_HANDLER name, vec, hsrr, area, skip
342.endm 290.endm
343.macro KVM_HANDLER area, hsrr, n, skip 291#endif
292
293.macro INT_SAVE_SRR_AND_JUMP label, hsrr, set_ri
294 ld r10,PACAKMSR(r13) /* get MSR value for kernel */
295 .if ! \set_ri
296 xori r10,r10,MSR_RI /* Clear MSR_RI */
297 .endif
298 .if \hsrr == EXC_HV_OR_STD
299 BEGIN_FTR_SECTION
300 mfspr r11,SPRN_HSRR0 /* save HSRR0 */
301 mfspr r12,SPRN_HSRR1 /* and HSRR1 */
302 mtspr SPRN_HSRR1,r10
303 FTR_SECTION_ELSE
304 mfspr r11,SPRN_SRR0 /* save SRR0 */
305 mfspr r12,SPRN_SRR1 /* and SRR1 */
306 mtspr SPRN_SRR1,r10
307 ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
308 .elseif \hsrr
309 mfspr r11,SPRN_HSRR0 /* save HSRR0 */
310 mfspr r12,SPRN_HSRR1 /* and HSRR1 */
311 mtspr SPRN_HSRR1,r10
312 .else
313 mfspr r11,SPRN_SRR0 /* save SRR0 */
314 mfspr r12,SPRN_SRR1 /* and SRR1 */
315 mtspr SPRN_SRR1,r10
316 .endif
317 LOAD_HANDLER(r10, \label\())
318 .if \hsrr == EXC_HV_OR_STD
319 BEGIN_FTR_SECTION
320 mtspr SPRN_HSRR0,r10
321 HRFI_TO_KERNEL
322 FTR_SECTION_ELSE
323 mtspr SPRN_SRR0,r10
324 RFI_TO_KERNEL
325 ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
326 .elseif \hsrr
327 mtspr SPRN_HSRR0,r10
328 HRFI_TO_KERNEL
329 .else
330 mtspr SPRN_SRR0,r10
331 RFI_TO_KERNEL
332 .endif
333 b . /* prevent speculative execution */
344.endm 334.endm
335
336/* INT_SAVE_SRR_AND_JUMP works for real or virt, this is faster but virt only */
337.macro INT_VIRT_SAVE_SRR_AND_JUMP label, hsrr
338#ifdef CONFIG_RELOCATABLE
339 .if \hsrr == EXC_HV_OR_STD
340 BEGIN_FTR_SECTION
341 mfspr r11,SPRN_HSRR0 /* save HSRR0 */
342 FTR_SECTION_ELSE
343 mfspr r11,SPRN_SRR0 /* save SRR0 */
344 ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
345 .elseif \hsrr
346 mfspr r11,SPRN_HSRR0 /* save HSRR0 */
347 .else
348 mfspr r11,SPRN_SRR0 /* save SRR0 */
349 .endif
350 LOAD_HANDLER(r12, \label\())
351 mtctr r12
352 .if \hsrr == EXC_HV_OR_STD
353 BEGIN_FTR_SECTION
354 mfspr r12,SPRN_HSRR1 /* and HSRR1 */
355 FTR_SECTION_ELSE
356 mfspr r12,SPRN_SRR1 /* and SRR1 */
357 ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
358 .elseif \hsrr
359 mfspr r12,SPRN_HSRR1 /* and HSRR1 */
360 .else
362 mfspr r12,SPRN_SRR1 /* and SRR1 */
362 .endif
363 li r10,MSR_RI
364 mtmsrd r10,1 /* Set RI (EE=0) */
365 bctr
366#else
367 .if \hsrr == EXC_HV_OR_STD
368 BEGIN_FTR_SECTION
369 mfspr r11,SPRN_HSRR0 /* save HSRR0 */
370 mfspr r12,SPRN_HSRR1 /* and HSRR1 */
371 FTR_SECTION_ELSE
372 mfspr r11,SPRN_SRR0 /* save SRR0 */
373 mfspr r12,SPRN_SRR1 /* and SRR1 */
374 ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
375 .elseif \hsrr
376 mfspr r11,SPRN_HSRR0 /* save HSRR0 */
377 mfspr r12,SPRN_HSRR1 /* and HSRR1 */
378 .else
379 mfspr r11,SPRN_SRR0 /* save SRR0 */
380 mfspr r12,SPRN_SRR1 /* and SRR1 */
381 .endif
382 li r10,MSR_RI
383 mtmsrd r10,1 /* Set RI (EE=0) */
384 b \label
345#endif 385#endif
386.endm
346 387
347#define EXCEPTION_PROLOG_COMMON_1() \ 388/*
348 std r9,_CCR(r1); /* save CR in stackframe */ \ 389 * This is the BOOK3S interrupt entry code macro.
349 std r11,_NIP(r1); /* save SRR0 in stackframe */ \ 390 *
350 std r12,_MSR(r1); /* save SRR1 in stackframe */ \ 391 * This can result in one of several things happening:
351 std r10,0(r1); /* make stack chain pointer */ \ 392 * - Branch to the _common handler, relocated, in virtual mode.
352 std r0,GPR0(r1); /* save r0 in stackframe */ \ 393 * These are normal interrupts (synchronous and asynchronous) handled by
353 std r10,GPR1(r1); /* save r1 in stackframe */ \ 394 * the kernel.
354 395 * - Branch to KVM, relocated but real mode interrupts remain in real mode.
355/* Save original regs values from save area to stack frame. */ 396 * These occur when HSTATE_IN_GUEST is set. The interrupt may be caused by
356#define EXCEPTION_PROLOG_COMMON_2(area) \ 397 * / intended for host or guest kernel, but KVM must always be involved
357 ld r9,area+EX_R9(r13); /* move r9, r10 to stackframe */ \ 398 * because the machine state is set for guest execution.
358 ld r10,area+EX_R10(r13); \ 399 * - Branch to the masked handler, unrelocated.
359 std r9,GPR9(r1); \ 400 * These occur when maskable asynchronous interrupts are taken with the
360 std r10,GPR10(r1); \ 401 * irq_soft_mask set.
361 ld r9,area+EX_R11(r13); /* move r11 - r13 to stackframe */ \ 402 * - Branch to an "early" handler in real mode but relocated.
362 ld r10,area+EX_R12(r13); \ 403 * This is done if early=1. MCE and HMI use these to handle errors in real
363 ld r11,area+EX_R13(r13); \ 404 * mode.
364 std r9,GPR11(r1); \ 405 * - Fall through and continue executing in real, unrelocated mode.
365 std r10,GPR12(r1); \ 406 * This is done if early=2.
366 std r11,GPR13(r1); \ 407 */
367BEGIN_FTR_SECTION_NESTED(66); \ 408.macro INT_HANDLER name, vec, ool=0, early=0, virt=0, hsrr=0, area=PACA_EXGEN, ri=1, dar=0, dsisr=0, bitmask=0, kvm=0
368 ld r10,area+EX_CFAR(r13); \ 409 SET_SCRATCH0(r13) /* save r13 */
369 std r10,ORIG_GPR3(r1); \ 410 GET_PACA(r13)
370END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66); \ 411 std r9,\area\()+EX_R9(r13) /* save r9 */
371 GET_CTR(r10, area); \ 412 OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR)
372 std r10,_CTR(r1); 413 HMT_MEDIUM
373 414 std r10,\area\()+EX_R10(r13) /* save r10 - r12 */
374#define EXCEPTION_PROLOG_COMMON_3(trap) \ 415 OPT_GET_SPR(r10, SPRN_CFAR, CPU_FTR_CFAR)
375 std r2,GPR2(r1); /* save r2 in stackframe */ \ 416 .if \ool
376 SAVE_4GPRS(3, r1); /* save r3 - r6 in stackframe */ \ 417 .if !\virt
377 SAVE_2GPRS(7, r1); /* save r7, r8 in stackframe */ \ 418 b tramp_real_\name
378 mflr r9; /* Get LR, later save to stack */ \ 419 .pushsection .text
379 ld r2,PACATOC(r13); /* get kernel TOC into r2 */ \ 420 TRAMP_REAL_BEGIN(tramp_real_\name)
380 std r9,_LINK(r1); \ 421 .else
381 lbz r10,PACAIRQSOFTMASK(r13); \ 422 b tramp_virt_\name
382 mfspr r11,SPRN_XER; /* save XER in stackframe */ \ 423 .pushsection .text
383 std r10,SOFTE(r1); \ 424 TRAMP_VIRT_BEGIN(tramp_virt_\name)
384 std r11,_XER(r1); \ 425 .endif
385 li r9,(trap)+1; \ 426 .endif
386 std r9,_TRAP(r1); /* set trap number */ \ 427
387 li r10,0; \ 428 OPT_SAVE_REG_TO_PACA(\area\()+EX_PPR, r9, CPU_FTR_HAS_PPR)
388 ld r11,exception_marker@toc(r2); \ 429 OPT_SAVE_REG_TO_PACA(\area\()+EX_CFAR, r10, CPU_FTR_CFAR)
389 std r10,RESULT(r1); /* clear regs->result */ \ 430 INTERRUPT_TO_KERNEL
390 std r11,STACK_FRAME_OVERHEAD-16(r1); /* mark the frame */ 431 SAVE_CTR(r10, \area\())
432 mfcr r9
433 .if \kvm
434 KVMTEST \name \hsrr \vec
435 .endif
436 .if \bitmask
437 lbz r10,PACAIRQSOFTMASK(r13)
438 andi. r10,r10,\bitmask
439 /* Associate vector numbers with bits in paca->irq_happened */
440 .if \vec == 0x500 || \vec == 0xea0
441 li r10,PACA_IRQ_EE
442 .elseif \vec == 0x900
443 li r10,PACA_IRQ_DEC
444 .elseif \vec == 0xa00 || \vec == 0xe80
445 li r10,PACA_IRQ_DBELL
446 .elseif \vec == 0xe60
447 li r10,PACA_IRQ_HMI
448 .elseif \vec == 0xf00
449 li r10,PACA_IRQ_PMI
450 .else
451 .abort "Bad maskable vector"
452 .endif
453
454 .if \hsrr == EXC_HV_OR_STD
455 BEGIN_FTR_SECTION
456 bne masked_Hinterrupt
457 FTR_SECTION_ELSE
458 bne masked_interrupt
459 ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
460 .elseif \hsrr
461 bne masked_Hinterrupt
462 .else
463 bne masked_interrupt
464 .endif
465 .endif
466
467 std r11,\area\()+EX_R11(r13)
468 std r12,\area\()+EX_R12(r13)
469
470 /*
471 * DAR/DSISR, SCRATCH0 must be read before setting MSR[RI],
472 * because a d-side MCE will clobber those registers so is
473 * not recoverable if they are live.
474 */
475 GET_SCRATCH0(r10)
476 std r10,\area\()+EX_R13(r13)
477 .if \dar
478 .if \hsrr
479 mfspr r10,SPRN_HDAR
480 .else
481 mfspr r10,SPRN_DAR
482 .endif
483 std r10,\area\()+EX_DAR(r13)
484 .endif
485 .if \dsisr
486 .if \hsrr
487 mfspr r10,SPRN_HDSISR
488 .else
489 mfspr r10,SPRN_DSISR
490 .endif
491 stw r10,\area\()+EX_DSISR(r13)
492 .endif
493
494 .if \early == 2
495 /* nothing more */
496 .elseif \early
497 mfctr r10 /* save ctr, even for !RELOCATABLE */
498 BRANCH_TO_C000(r11, \name\()_early_common)
499 .elseif !\virt
500 INT_SAVE_SRR_AND_JUMP \name\()_common, \hsrr, \ri
501 .else
502 INT_VIRT_SAVE_SRR_AND_JUMP \name\()_common, \hsrr
503 .endif
504 .if \ool
505 .popsection
506 .endif
507.endm
391 508
392/* 509/*
393 * On entry r13 points to the paca, r9-r13 are saved in the paca, 510 * On entry r13 points to the paca, r9-r13 are saved in the paca,
394 * r9 contains the saved CR, r11 and r12 contain the saved SRR0 and 511 * r9 contains the saved CR, r11 and r12 contain the saved SRR0 and
395 * SRR1, and relocation is on. 512 * SRR1, and relocation is on.
513 *
514 * If stack=0, then the stack is already set in r1, and r1 is saved in r10.
515 * PPR save and CPU accounting is not done for the !stack case (XXX why not?)
396 */ 516 */
397#define EXCEPTION_COMMON(area, trap) \ 517.macro INT_COMMON vec, area, stack, kaup, reconcile, dar, dsisr
398 andi. r10,r12,MSR_PR; /* See if coming from user */ \ 518 .if \stack
399 mr r10,r1; /* Save r1 */ \ 519 andi. r10,r12,MSR_PR /* See if coming from user */
400 subi r1,r1,INT_FRAME_SIZE; /* alloc frame on kernel stack */ \ 520 mr r10,r1 /* Save r1 */
401 beq- 1f; \ 521 subi r1,r1,INT_FRAME_SIZE /* alloc frame on kernel stack */
402 ld r1,PACAKSAVE(r13); /* kernel stack to use */ \ 522 beq- 100f
4031: tdgei r1,-INT_FRAME_SIZE; /* trap if r1 is in userspace */ \ 523 ld r1,PACAKSAVE(r13) /* kernel stack to use */
404 EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0; \ 524100: tdgei r1,-INT_FRAME_SIZE /* trap if r1 is in userspace */
4053: EXCEPTION_PROLOG_COMMON_1(); \ 525 EMIT_BUG_ENTRY 100b,__FILE__,__LINE__,0
406 kuap_save_amr_and_lock r9, r10, cr1, cr0; \ 526 .endif
407 beq 4f; /* if from kernel mode */ \ 527
408 ACCOUNT_CPU_USER_ENTRY(r13, r9, r10); \ 528 std r9,_CCR(r1) /* save CR in stackframe */
409 SAVE_PPR(area, r9); \ 529 std r11,_NIP(r1) /* save SRR0 in stackframe */
4104: EXCEPTION_PROLOG_COMMON_2(area); \ 530 std r12,_MSR(r1) /* save SRR1 in stackframe */
411 EXCEPTION_PROLOG_COMMON_3(trap); \ 531 std r10,0(r1) /* make stack chain pointer */
532 std r0,GPR0(r1) /* save r0 in stackframe */
533 std r10,GPR1(r1) /* save r1 in stackframe */
534
535 .if \stack
536 .if \kaup
537 kuap_save_amr_and_lock r9, r10, cr1, cr0
538 .endif
539 beq 101f /* if from kernel mode */
540 ACCOUNT_CPU_USER_ENTRY(r13, r9, r10)
541 SAVE_PPR(\area, r9)
542101:
543 .else
544 .if \kaup
545 kuap_save_amr_and_lock r9, r10, cr1
546 .endif
547 .endif
548
549 /* Save original regs values from save area to stack frame. */
550 ld r9,\area+EX_R9(r13) /* move r9, r10 to stackframe */
551 ld r10,\area+EX_R10(r13)
552 std r9,GPR9(r1)
553 std r10,GPR10(r1)
554 ld r9,\area+EX_R11(r13) /* move r11 - r13 to stackframe */
555 ld r10,\area+EX_R12(r13)
556 ld r11,\area+EX_R13(r13)
557 std r9,GPR11(r1)
558 std r10,GPR12(r1)
559 std r11,GPR13(r1)
560 .if \dar
561 .if \dar == 2
562 ld r10,_NIP(r1)
563 .else
564 ld r10,\area+EX_DAR(r13)
565 .endif
566 std r10,_DAR(r1)
567 .endif
568 .if \dsisr
569 .if \dsisr == 2
570 ld r10,_MSR(r1)
571 lis r11,DSISR_SRR1_MATCH_64S@h
572 and r10,r10,r11
573 .else
574 lwz r10,\area+EX_DSISR(r13)
575 .endif
576 std r10,_DSISR(r1)
577 .endif
578BEGIN_FTR_SECTION_NESTED(66)
579 ld r10,\area+EX_CFAR(r13)
580 std r10,ORIG_GPR3(r1)
581END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66)
582 GET_CTR(r10, \area)
583 std r10,_CTR(r1)
584 std r2,GPR2(r1) /* save r2 in stackframe */
585 SAVE_4GPRS(3, r1) /* save r3 - r6 in stackframe */
586 SAVE_2GPRS(7, r1) /* save r7, r8 in stackframe */
587 mflr r9 /* Get LR, later save to stack */
588 ld r2,PACATOC(r13) /* get kernel TOC into r2 */
589 std r9,_LINK(r1)
590 lbz r10,PACAIRQSOFTMASK(r13)
591 mfspr r11,SPRN_XER /* save XER in stackframe */
592 std r10,SOFTE(r1)
593 std r11,_XER(r1)
594 li r9,(\vec)+1
595 std r9,_TRAP(r1) /* set trap number */
596 li r10,0
597 ld r11,exception_marker@toc(r2)
598 std r10,RESULT(r1) /* clear regs->result */
599 std r11,STACK_FRAME_OVERHEAD-16(r1) /* mark the frame */
600
601 .if \stack
412 ACCOUNT_STOLEN_TIME 602 ACCOUNT_STOLEN_TIME
603 .endif
413 604
414/* 605 .if \reconcile
415 * Exception where stack is already set in r1, r1 is saved in r10. 606 RECONCILE_IRQ_STATE(r10, r11)
416 * PPR save and CPU accounting is not done (for some reason). 607 .endif
417 */ 608.endm
418#define EXCEPTION_COMMON_STACK(area, trap) \
419 EXCEPTION_PROLOG_COMMON_1(); \
420 kuap_save_amr_and_lock r9, r10, cr1; \
421 EXCEPTION_PROLOG_COMMON_2(area); \
422 EXCEPTION_PROLOG_COMMON_3(trap)
423 609
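INT_COMMON takes its flags positionally: vec, area, stack, kaup, reconcile, dar, dsisr. Read against the macro body above, stack=1 switches to the kernel stack and does the PPR/CPU accounting, kaup=1 runs kuap_save_amr_and_lock, reconcile=1 runs RECONCILE_IRQ_STATE, and dar/dsisr copy DAR/DSISR into the stack frame, with the value 2 deriving them from the saved NIP/MSR for instruction-side faults. Two annotated invocations, copied from later in this patch as a sketch:

	INT_COMMON 0x300, PACA_EXGEN, 1, 1, 1, 1, 1	/* data access: full entry, DAR and DSISR saved */
	INT_COMMON 0x100, PACA_EXNMI, 0, 1, 0, 0, 0	/* system reset: stack already set up on the NMI emergency stack */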
424/* 610/*
425 * Restore all registers including H/SRR0/1 saved in a stack frame of a 611 * Restore all registers including H/SRR0/1 saved in a stack frame of a
@@ -428,6 +614,9 @@ END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66); \
428.macro EXCEPTION_RESTORE_REGS hsrr 614.macro EXCEPTION_RESTORE_REGS hsrr
429 /* Move original SRR0 and SRR1 into the respective regs */ 615 /* Move original SRR0 and SRR1 into the respective regs */
430 ld r9,_MSR(r1) 616 ld r9,_MSR(r1)
617 .if \hsrr == EXC_HV_OR_STD
618 .error "EXC_HV_OR_STD Not implemented for EXCEPTION_RESTORE_REGS"
619 .endif
431 .if \hsrr 620 .if \hsrr
432 mtspr SPRN_HSRR1,r9 621 mtspr SPRN_HSRR1,r9
433 .else 622 .else
@@ -481,219 +670,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
481#define FINISH_NAP 670#define FINISH_NAP
482#endif 671#endif
483 672
484/*
485 * Following are the BOOK3S exception handler helper macros.
486 * Handlers come in a number of types, and each type has a number of varieties.
487 *
488 * EXC_REAL_* - real, unrelocated exception vectors
489 * EXC_VIRT_* - virt (AIL), unrelocated exception vectors
490 * TRAMP_REAL_* - real, unrelocated helpers (virt can call these)
491 * TRAMP_VIRT_* - virt, unreloc helpers (in practice, real can use)
492 * TRAMP_KVM - KVM handlers that get put into real, unrelocated
493 * EXC_COMMON - virt, relocated common handlers
494 *
495 * The EXC handlers are given a name, and branch to name_common, or the
496 * appropriate KVM or masking function. Vector handler varieties are as
497 * follows:
498 *
499 * EXC_{REAL|VIRT}_BEGIN/END - used to open-code the exception
500 *
501 * EXC_{REAL|VIRT} - standard exception
502 *
503 * EXC_{REAL|VIRT}_suffix
504 * where _suffix is:
505 * - _MASKABLE - maskable exception
506 * - _OOL - out of line with trampoline to common handler
507 * - _HV - HV exception
508 *
509 * There can be combinations, e.g., EXC_VIRT_OOL_MASKABLE_HV
510 *
511 * KVM handlers come in the following varieties:
512 * TRAMP_KVM
513 * TRAMP_KVM_SKIP
514 * TRAMP_KVM_HV
515 * TRAMP_KVM_HV_SKIP
516 *
517 * COMMON handlers come in the following varieties:
518 * EXC_COMMON_BEGIN/END - used to open-code the handler
519 * EXC_COMMON
520 * EXC_COMMON_ASYNC
521 *
522 * TRAMP_REAL and TRAMP_VIRT can be used with BEGIN/END. KVM
523 * and OOL handlers are implemented as types of TRAMP and TRAMP_VIRT handlers.
524 */
525
526#define __EXC_REAL(name, start, size, area) \
527 EXC_REAL_BEGIN(name, start, size); \
528 EXCEPTION_PROLOG_0 area ; \
529 EXCEPTION_PROLOG_1 EXC_STD, area, 1, start, 0, 0, 0 ; \
530 EXCEPTION_PROLOG_2_REAL name##_common, EXC_STD, 1 ; \
531 EXC_REAL_END(name, start, size)
532
533#define EXC_REAL(name, start, size) \
534 __EXC_REAL(name, start, size, PACA_EXGEN)
535
536#define __EXC_VIRT(name, start, size, realvec, area) \
537 EXC_VIRT_BEGIN(name, start, size); \
538 EXCEPTION_PROLOG_0 area ; \
539 EXCEPTION_PROLOG_1 EXC_STD, area, 0, realvec, 0, 0, 0; \
540 EXCEPTION_PROLOG_2_VIRT name##_common, EXC_STD ; \
541 EXC_VIRT_END(name, start, size)
542
543#define EXC_VIRT(name, start, size, realvec) \
544 __EXC_VIRT(name, start, size, realvec, PACA_EXGEN)
545
546#define EXC_REAL_MASKABLE(name, start, size, bitmask) \
547 EXC_REAL_BEGIN(name, start, size); \
548 EXCEPTION_PROLOG_0 PACA_EXGEN ; \
549 EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, start, 0, 0, bitmask ; \
550 EXCEPTION_PROLOG_2_REAL name##_common, EXC_STD, 1 ; \
551 EXC_REAL_END(name, start, size)
552
553#define EXC_VIRT_MASKABLE(name, start, size, realvec, bitmask) \
554 EXC_VIRT_BEGIN(name, start, size); \
555 EXCEPTION_PROLOG_0 PACA_EXGEN ; \
556 EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 0, realvec, 0, 0, bitmask ; \
557 EXCEPTION_PROLOG_2_VIRT name##_common, EXC_STD ; \
558 EXC_VIRT_END(name, start, size)
559
560#define EXC_REAL_HV(name, start, size) \
561 EXC_REAL_BEGIN(name, start, size); \
562 EXCEPTION_PROLOG_0 PACA_EXGEN; \
563 EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, start, 0, 0, 0 ; \
564 EXCEPTION_PROLOG_2_REAL name##_common, EXC_HV, 1 ; \
565 EXC_REAL_END(name, start, size)
566
567#define EXC_VIRT_HV(name, start, size, realvec) \
568 EXC_VIRT_BEGIN(name, start, size); \
569 EXCEPTION_PROLOG_0 PACA_EXGEN; \
570 EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, realvec, 0, 0, 0 ; \
571 EXCEPTION_PROLOG_2_VIRT name##_common, EXC_HV ; \
572 EXC_VIRT_END(name, start, size)
573
574#define __EXC_REAL_OOL(name, start, size) \
575 EXC_REAL_BEGIN(name, start, size); \
576 EXCEPTION_PROLOG_0 PACA_EXGEN ; \
577 b tramp_real_##name ; \
578 EXC_REAL_END(name, start, size)
579
580#define __TRAMP_REAL_OOL(name, vec) \
581 TRAMP_REAL_BEGIN(tramp_real_##name); \
582 EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, vec, 0, 0, 0 ; \
583 EXCEPTION_PROLOG_2_REAL name##_common, EXC_STD, 1
584
585#define EXC_REAL_OOL(name, start, size) \
586 __EXC_REAL_OOL(name, start, size); \
587 __TRAMP_REAL_OOL(name, start)
588
589#define __EXC_REAL_OOL_MASKABLE(name, start, size) \
590 __EXC_REAL_OOL(name, start, size)
591
592#define __TRAMP_REAL_OOL_MASKABLE(name, vec, bitmask) \
593 TRAMP_REAL_BEGIN(tramp_real_##name); \
594 EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, vec, 0, 0, bitmask ; \
595 EXCEPTION_PROLOG_2_REAL name##_common, EXC_STD, 1
596
597#define EXC_REAL_OOL_MASKABLE(name, start, size, bitmask) \
598 __EXC_REAL_OOL_MASKABLE(name, start, size); \
599 __TRAMP_REAL_OOL_MASKABLE(name, start, bitmask)
600
601#define __EXC_REAL_OOL_HV(name, start, size) \
602 __EXC_REAL_OOL(name, start, size)
603
604#define __TRAMP_REAL_OOL_HV(name, vec) \
605 TRAMP_REAL_BEGIN(tramp_real_##name); \
606 EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, vec, 0, 0, 0 ; \
607 EXCEPTION_PROLOG_2_REAL name##_common, EXC_HV, 1
608
609#define EXC_REAL_OOL_HV(name, start, size) \
610 __EXC_REAL_OOL_HV(name, start, size); \
611 __TRAMP_REAL_OOL_HV(name, start)
612
613#define __EXC_REAL_OOL_MASKABLE_HV(name, start, size) \
614 __EXC_REAL_OOL(name, start, size)
615
616#define __TRAMP_REAL_OOL_MASKABLE_HV(name, vec, bitmask) \
617 TRAMP_REAL_BEGIN(tramp_real_##name); \
618 EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, vec, 0, 0, bitmask ; \
619 EXCEPTION_PROLOG_2_REAL name##_common, EXC_HV, 1
620
621#define EXC_REAL_OOL_MASKABLE_HV(name, start, size, bitmask) \
622 __EXC_REAL_OOL_MASKABLE_HV(name, start, size); \
623 __TRAMP_REAL_OOL_MASKABLE_HV(name, start, bitmask)
624
625#define __EXC_VIRT_OOL(name, start, size) \
626 EXC_VIRT_BEGIN(name, start, size); \
627 EXCEPTION_PROLOG_0 PACA_EXGEN ; \
628 b tramp_virt_##name; \
629 EXC_VIRT_END(name, start, size)
630
631#define __TRAMP_VIRT_OOL(name, realvec) \
632 TRAMP_VIRT_BEGIN(tramp_virt_##name); \
633 EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 0, vec, 0, 0, 0 ; \
634 EXCEPTION_PROLOG_2_VIRT name##_common, EXC_STD
635
636#define EXC_VIRT_OOL(name, start, size, realvec) \
637 __EXC_VIRT_OOL(name, start, size); \
638 __TRAMP_VIRT_OOL(name, realvec)
639
640#define __EXC_VIRT_OOL_MASKABLE(name, start, size) \
641 __EXC_VIRT_OOL(name, start, size)
642
643#define __TRAMP_VIRT_OOL_MASKABLE(name, realvec, bitmask) \
644 TRAMP_VIRT_BEGIN(tramp_virt_##name); \
645 EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 0, realvec, 0, 0, bitmask ; \
646 EXCEPTION_PROLOG_2_REAL name##_common, EXC_STD, 1
647
648#define EXC_VIRT_OOL_MASKABLE(name, start, size, realvec, bitmask) \
649 __EXC_VIRT_OOL_MASKABLE(name, start, size); \
650 __TRAMP_VIRT_OOL_MASKABLE(name, realvec, bitmask)
651
652#define __EXC_VIRT_OOL_HV(name, start, size) \
653 __EXC_VIRT_OOL(name, start, size)
654
655#define __TRAMP_VIRT_OOL_HV(name, realvec) \
656 TRAMP_VIRT_BEGIN(tramp_virt_##name); \
657 EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, realvec, 0, 0, 0 ; \
658 EXCEPTION_PROLOG_2_VIRT name##_common, EXC_HV
659
660#define EXC_VIRT_OOL_HV(name, start, size, realvec) \
661 __EXC_VIRT_OOL_HV(name, start, size); \
662 __TRAMP_VIRT_OOL_HV(name, realvec)
663
664#define __EXC_VIRT_OOL_MASKABLE_HV(name, start, size) \
665 __EXC_VIRT_OOL(name, start, size)
666
667#define __TRAMP_VIRT_OOL_MASKABLE_HV(name, realvec, bitmask) \
668 TRAMP_VIRT_BEGIN(tramp_virt_##name); \
669 EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, realvec, 0, 0, bitmask ; \
670 EXCEPTION_PROLOG_2_VIRT name##_common, EXC_HV
671
672#define EXC_VIRT_OOL_MASKABLE_HV(name, start, size, realvec, bitmask) \
673 __EXC_VIRT_OOL_MASKABLE_HV(name, start, size); \
674 __TRAMP_VIRT_OOL_MASKABLE_HV(name, realvec, bitmask)
675
676#define TRAMP_KVM(area, n) \
677 TRAMP_KVM_BEGIN(do_kvm_##n); \
678 KVM_HANDLER area, EXC_STD, n, 0
679
680#define TRAMP_KVM_SKIP(area, n) \
681 TRAMP_KVM_BEGIN(do_kvm_##n); \
682 KVM_HANDLER area, EXC_STD, n, 1
683
684#define TRAMP_KVM_HV(area, n) \
685 TRAMP_KVM_BEGIN(do_kvm_H##n); \
686 KVM_HANDLER area, EXC_HV, n, 0
687
688#define TRAMP_KVM_HV_SKIP(area, n) \
689 TRAMP_KVM_BEGIN(do_kvm_H##n); \
690 KVM_HANDLER area, EXC_HV, n, 1
691
692#define EXC_COMMON(name, realvec, hdlr) \ 673#define EXC_COMMON(name, realvec, hdlr) \
693 EXC_COMMON_BEGIN(name); \ 674 EXC_COMMON_BEGIN(name); \
694 EXCEPTION_COMMON(PACA_EXGEN, realvec); \ 675 INT_COMMON realvec, PACA_EXGEN, 1, 1, 1, 0, 0 ; \
695 bl save_nvgprs; \ 676 bl save_nvgprs; \
696 RECONCILE_IRQ_STATE(r10, r11); \
697 addi r3,r1,STACK_FRAME_OVERHEAD; \ 677 addi r3,r1,STACK_FRAME_OVERHEAD; \
698 bl hdlr; \ 678 bl hdlr; \
699 b ret_from_except 679 b ret_from_except
@@ -704,9 +684,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
704 */ 684 */
705#define EXC_COMMON_ASYNC(name, realvec, hdlr) \ 685#define EXC_COMMON_ASYNC(name, realvec, hdlr) \
706 EXC_COMMON_BEGIN(name); \ 686 EXC_COMMON_BEGIN(name); \
707 EXCEPTION_COMMON(PACA_EXGEN, realvec); \ 687 INT_COMMON realvec, PACA_EXGEN, 1, 1, 1, 0, 0 ; \
708 FINISH_NAP; \ 688 FINISH_NAP; \
709 RECONCILE_IRQ_STATE(r10, r11); \
710 RUNLATCH_ON; \ 689 RUNLATCH_ON; \
711 addi r3,r1,STACK_FRAME_OVERHEAD; \ 690 addi r3,r1,STACK_FRAME_OVERHEAD; \
712 bl hdlr; \ 691 bl hdlr; \
@@ -836,9 +815,7 @@ BEGIN_FTR_SECTION
836END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) 815END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
837#endif 816#endif
838 817
839 EXCEPTION_PROLOG_0 PACA_EXNMI 818 INT_HANDLER system_reset, 0x100, area=PACA_EXNMI, ri=0, kvm=1
840 EXCEPTION_PROLOG_1 EXC_STD, PACA_EXNMI, 1, 0x100, 0, 0, 0
841 EXCEPTION_PROLOG_2_REAL system_reset_common, EXC_STD, 0
842 /* 819 /*
843 * MSR_RI is not enabled, because PACA_EXNMI and the nmi stack are 820 * MSR_RI is not enabled, because PACA_EXNMI and the nmi stack are
844 * being used, so a nested NMI exception would corrupt it. 821 * being used, so a nested NMI exception would corrupt it.
@@ -850,9 +827,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
850 * be dangerous anyway. 827 * be dangerous anyway.
851 */ 828 */
852EXC_REAL_END(system_reset, 0x100, 0x100) 829EXC_REAL_END(system_reset, 0x100, 0x100)
853
854EXC_VIRT_NONE(0x4100, 0x100) 830EXC_VIRT_NONE(0x4100, 0x100)
855TRAMP_KVM(PACA_EXNMI, 0x100) 831INT_KVM_HANDLER system_reset 0x100, EXC_STD, PACA_EXNMI, 0
856 832
857#ifdef CONFIG_PPC_P7_NAP 833#ifdef CONFIG_PPC_P7_NAP
858TRAMP_REAL_BEGIN(system_reset_idle_wake) 834TRAMP_REAL_BEGIN(system_reset_idle_wake)
@@ -868,9 +844,7 @@ TRAMP_REAL_BEGIN(system_reset_idle_wake)
868 */ 844 */
869TRAMP_REAL_BEGIN(system_reset_fwnmi) 845TRAMP_REAL_BEGIN(system_reset_fwnmi)
870 /* See comment at system_reset exception, don't turn on RI */ 846 /* See comment at system_reset exception, don't turn on RI */
871 EXCEPTION_PROLOG_0 PACA_EXNMI 847 INT_HANDLER system_reset, 0x100, area=PACA_EXNMI, ri=0
872 EXCEPTION_PROLOG_1 EXC_STD, PACA_EXNMI, 0, 0x100, 0, 0, 0
873 EXCEPTION_PROLOG_2_REAL system_reset_common, EXC_STD, 0
874 848
875#endif /* CONFIG_PPC_PSERIES */ 849#endif /* CONFIG_PPC_PSERIES */
876 850
@@ -890,7 +864,7 @@ EXC_COMMON_BEGIN(system_reset_common)
890 mr r10,r1 864 mr r10,r1
891 ld r1,PACA_NMI_EMERG_SP(r13) 865 ld r1,PACA_NMI_EMERG_SP(r13)
892 subi r1,r1,INT_FRAME_SIZE 866 subi r1,r1,INT_FRAME_SIZE
893 EXCEPTION_COMMON_STACK(PACA_EXNMI, 0x100) 867 INT_COMMON 0x100, PACA_EXNMI, 0, 1, 0, 0, 0
894 bl save_nvgprs 868 bl save_nvgprs
895 /* 869 /*
896 * Set IRQS_ALL_DISABLED unconditionally so arch_irqs_disabled does 870 * Set IRQS_ALL_DISABLED unconditionally so arch_irqs_disabled does
@@ -933,26 +907,39 @@ EXC_COMMON_BEGIN(system_reset_common)
933 907
934 908
935EXC_REAL_BEGIN(machine_check, 0x200, 0x100) 909EXC_REAL_BEGIN(machine_check, 0x200, 0x100)
936 /* This is moved out of line as it can be patched by FW, but 910 INT_HANDLER machine_check, 0x200, early=1, area=PACA_EXMC, dar=1, dsisr=1
937 * some code path might still want to branch into the original 911 /*
938 * vector 912 * MSR_RI is not enabled, because PACA_EXMC is being used, so a
913 * nested machine check corrupts it. machine_check_common enables
914 * MSR_RI.
939 */ 915 */
940 EXCEPTION_PROLOG_0 PACA_EXMC
941BEGIN_FTR_SECTION
942 b machine_check_common_early
943FTR_SECTION_ELSE
944 b machine_check_pSeries_0
945ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
946EXC_REAL_END(machine_check, 0x200, 0x100) 916EXC_REAL_END(machine_check, 0x200, 0x100)
947EXC_VIRT_NONE(0x4200, 0x100) 917EXC_VIRT_NONE(0x4200, 0x100)
948TRAMP_REAL_BEGIN(machine_check_common_early) 918
949 EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 0, 0x200, 0, 0, 0 919#ifdef CONFIG_PPC_PSERIES
920TRAMP_REAL_BEGIN(machine_check_fwnmi)
921 /* See comment at machine_check exception, don't turn on RI */
922 INT_HANDLER machine_check, 0x200, early=1, area=PACA_EXMC, dar=1, dsisr=1
923#endif
924
925INT_KVM_HANDLER machine_check 0x200, EXC_STD, PACA_EXMC, 1
926
927#define MACHINE_CHECK_HANDLER_WINDUP \
928 /* Clear MSR_RI before setting SRR0 and SRR1. */\
929 li r9,0; \
930 mtmsrd r9,1; /* Clear MSR_RI */ \
931 /* Decrement paca->in_mce now RI is clear. */ \
932 lhz r12,PACA_IN_MCE(r13); \
933 subi r12,r12,1; \
934 sth r12,PACA_IN_MCE(r13); \
935 EXCEPTION_RESTORE_REGS EXC_STD
936
937EXC_COMMON_BEGIN(machine_check_early_common)
938 mtctr r10 /* Restore ctr */
939 mfspr r11,SPRN_SRR0
940 mfspr r12,SPRN_SRR1
941
950 /* 942 /*
951 * Register contents:
952 * R13 = PACA
953 * R9 = CR
954 * Original R9 to R13 is saved on PACA_EXMC
955 *
956 * Switch to mc_emergency stack and handle re-entrancy (we limit 943 * Switch to mc_emergency stack and handle re-entrancy (we limit
957 * the nested MCE up to level 4 to avoid stack overflow). 944 * the nested MCE up to level 4 to avoid stack overflow).
958 * Save MCE registers srr1, srr0, dar and dsisr and then set ME=1 945 * Save MCE registers srr1, srr0, dar and dsisr and then set ME=1
@@ -973,103 +960,127 @@ TRAMP_REAL_BEGIN(machine_check_common_early)
973 * the machine check is handled then the idle wakeup code is called 960 * the machine check is handled then the idle wakeup code is called
974 * to restore state. 961 * to restore state.
975 */ 962 */
976 mr r11,r1 /* Save r1 */
977 lhz r10,PACA_IN_MCE(r13) 963 lhz r10,PACA_IN_MCE(r13)
978 cmpwi r10,0 /* Are we in nested machine check */ 964 cmpwi r10,0 /* Are we in nested machine check */
979 bne 0f /* Yes, we are. */ 965 cmpwi cr1,r10,MAX_MCE_DEPTH /* Are we at maximum nesting */
980 /* First machine check entry */
981 ld r1,PACAMCEMERGSP(r13) /* Use MC emergency stack */
9820: subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */
983 addi r10,r10,1 /* increment paca->in_mce */ 966 addi r10,r10,1 /* increment paca->in_mce */
984 sth r10,PACA_IN_MCE(r13) 967 sth r10,PACA_IN_MCE(r13)
985 /* Limit nested MCE to level 4 to avoid stack overflow */ 968
986 cmpwi r10,MAX_MCE_DEPTH 969 mr r10,r1 /* Save r1 */
987 bgt 2f /* Check if we hit limit of 4 */ 970 bne 1f
988 std r11,GPR1(r1) /* Save r1 on the stack. */ 971 /* First machine check entry */
989 std r11,0(r1) /* make stack chain pointer */ 972 ld r1,PACAMCEMERGSP(r13) /* Use MC emergency stack */
990 mfspr r11,SPRN_SRR0 /* Save SRR0 */ 9731: /* Limit nested MCE to level 4 to avoid stack overflow */
991 std r11,_NIP(r1) 974 bgt cr1,unrecoverable_mce /* Check if we hit limit of 4 */
992 mfspr r11,SPRN_SRR1 /* Save SRR1 */ 975 subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */
993 std r11,_MSR(r1) 976
994 mfspr r11,SPRN_DAR /* Save DAR */
995 std r11,_DAR(r1)
996 mfspr r11,SPRN_DSISR /* Save DSISR */
997 std r11,_DSISR(r1)
998 std r9,_CCR(r1) /* Save CR in stackframe */
999 /* We don't touch AMR here, we never go to virtual mode */ 977 /* We don't touch AMR here, we never go to virtual mode */
1000 /* Save r9 through r13 from EXMC save area to stack frame. */ 978 INT_COMMON 0x200, PACA_EXMC, 0, 0, 0, 1, 1
1001 EXCEPTION_PROLOG_COMMON_2(PACA_EXMC) 979
1002 mfmsr r11 /* get MSR value */
1003BEGIN_FTR_SECTION 980BEGIN_FTR_SECTION
1004 ori r11,r11,MSR_ME /* turn on ME bit */ 981 bl enable_machine_check
1005END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) 982END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
1006 ori r11,r11,MSR_RI /* turn on RI bit */ 983 li r10,MSR_RI
1007 LOAD_HANDLER(r12, machine_check_handle_early) 984 mtmsrd r10,1
10081: mtspr SPRN_SRR0,r12 985
1009 mtspr SPRN_SRR1,r11 986 bl save_nvgprs
1010 RFI_TO_KERNEL 987 addi r3,r1,STACK_FRAME_OVERHEAD
1011 b . /* prevent speculative execution */ 988 bl machine_check_early
10122: 989 std r3,RESULT(r1) /* Save result */
1013 /* Stack overflow. Stay on emergency stack and panic. 990 ld r12,_MSR(r1)
1014 * Keep the ME bit off while panic-ing, so that if we hit
1015 * another machine check we checkstop.
1016 */
1017 addi r1,r1,INT_FRAME_SIZE /* go back to previous stack frame */
1018 ld r11,PACAKMSR(r13)
1019 LOAD_HANDLER(r12, unrecover_mce)
1020 li r10,MSR_ME
1021 andc r11,r11,r10 /* Turn off MSR_ME */
1022 b 1b
1023 b . /* prevent speculative execution */
1024 991
1025TRAMP_REAL_BEGIN(machine_check_pSeries) 992#ifdef CONFIG_PPC_P7_NAP
1026 .globl machine_check_fwnmi 993 /*
1027machine_check_fwnmi: 994 * Check if thread was in power saving mode. We come here when any
1028 EXCEPTION_PROLOG_0 PACA_EXMC 995 * of the following is true:
996 * a. thread wasn't in power saving mode
997 * b. thread was in power saving mode with no state loss,
998 * supervisor state loss or hypervisor state loss.
999 *
1000 * Go back to nap/sleep/winkle mode again if (b) is true.
1001 */
1029BEGIN_FTR_SECTION 1002BEGIN_FTR_SECTION
1030 b machine_check_common_early 1003 rlwinm. r11,r12,47-31,30,31
1031END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE) 1004 bne machine_check_idle_common
1032machine_check_pSeries_0: 1005END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
1033 EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 1, 0x200, 1, 1, 0 1006#endif
1007
1008#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
1034 /* 1009 /*
1035 * MSR_RI is not enabled, because PACA_EXMC is being used, so a 1010 * Check if we are coming from guest. If yes, then run the normal
1036 * nested machine check corrupts it. machine_check_common enables 1011 * exception handler which will take the
1037 * MSR_RI. 1012 * machine_check_kvm->kvmppc_interrupt branch to deliver the MC event
1013 * to guest.
1038 */ 1014 */
1039 EXCEPTION_PROLOG_2_REAL machine_check_common, EXC_STD, 0 1015 lbz r11,HSTATE_IN_GUEST(r13)
1016 cmpwi r11,0 /* Check if coming from guest */
1017 bne mce_deliver /* continue if we are. */
1018#endif
1040 1019
1041TRAMP_KVM_SKIP(PACA_EXMC, 0x200) 1020 /*
1021 * Check if we are coming from userspace. If yes, then run the normal
1022 * exception handler which will deliver the MC event to this kernel.
1023 */
1024 andi. r11,r12,MSR_PR /* See if coming from user. */
1025 bne mce_deliver /* continue in V mode if we are. */
1026
1027 /*
1028 * At this point we are coming from kernel context.
1029 * Queue up the MCE event and return from the interrupt.
1030 * But before that, check if this is an un-recoverable exception.
1031 * If yes, then stay on emergency stack and panic.
1032 */
1033 andi. r11,r12,MSR_RI
1034 beq unrecoverable_mce
1035
1036 /*
1037 * Check if we have successfully handled/recovered from error, if not
1038 * then stay on emergency stack and panic.
1039 */
1040 ld r3,RESULT(r1) /* Load result */
1041 cmpdi r3,0 /* see if we handled MCE successfully */
1042 beq unrecoverable_mce /* if !handled then panic */
1043
1044 /*
1045 * Return from MC interrupt.
1046 * Queue up the MCE event so that we can log it later, while
1047 * returning from kernel or opal call.
1048 */
1049 bl machine_check_queue_event
1050 MACHINE_CHECK_HANDLER_WINDUP
1051 RFI_TO_KERNEL
1052
1053mce_deliver:
1054 /*
1055 * This is a host user or guest MCE. Restore all registers, then
1056 * run the "late" handler. For host user, this will run the
1057 * machine_check_exception handler in virtual mode like a normal
1058 * interrupt handler. For guest, this will trigger the KVM test
1059 * and branch to the KVM interrupt similarly to other interrupts.
1060 */
1061BEGIN_FTR_SECTION
1062 ld r10,ORIG_GPR3(r1)
1063 mtspr SPRN_CFAR,r10
1064END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
1065 MACHINE_CHECK_HANDLER_WINDUP
1066 /* See comment at machine_check exception, don't turn on RI */
1067 INT_HANDLER machine_check, 0x200, area=PACA_EXMC, ri=0, dar=1, dsisr=1, kvm=1
1042 1068
1043EXC_COMMON_BEGIN(machine_check_common) 1069EXC_COMMON_BEGIN(machine_check_common)
1044 /* 1070 /*
1045 * Machine check is different because we use a different 1071 * Machine check is different because we use a different
1046 * save area: PACA_EXMC instead of PACA_EXGEN. 1072 * save area: PACA_EXMC instead of PACA_EXGEN.
1047 */ 1073 */
1048 EXCEPTION_COMMON(PACA_EXMC, 0x200) 1074 INT_COMMON 0x200, PACA_EXMC, 1, 1, 1, 1, 1
1049 FINISH_NAP 1075 FINISH_NAP
1050 RECONCILE_IRQ_STATE(r10, r11)
1051 ld r3,PACA_EXMC+EX_DAR(r13)
1052 lwz r4,PACA_EXMC+EX_DSISR(r13)
1053 /* Enable MSR_RI when finished with PACA_EXMC */ 1076 /* Enable MSR_RI when finished with PACA_EXMC */
1054 li r10,MSR_RI 1077 li r10,MSR_RI
1055 mtmsrd r10,1 1078 mtmsrd r10,1
1056 std r3,_DAR(r1)
1057 std r4,_DSISR(r1)
1058 bl save_nvgprs 1079 bl save_nvgprs
1059 addi r3,r1,STACK_FRAME_OVERHEAD 1080 addi r3,r1,STACK_FRAME_OVERHEAD
1060 bl machine_check_exception 1081 bl machine_check_exception
1061 b ret_from_except 1082 b ret_from_except
1062 1083
1063#define MACHINE_CHECK_HANDLER_WINDUP \
1064 /* Clear MSR_RI before setting SRR0 and SRR1. */\
1065 li r9,0; \
1066 mtmsrd r9,1; /* Clear MSR_RI */ \
1067 /* Decrement paca->in_mce now RI is clear. */ \
1068 lhz r12,PACA_IN_MCE(r13); \
1069 subi r12,r12,1; \
1070 sth r12,PACA_IN_MCE(r13); \
1071 EXCEPTION_RESTORE_REGS EXC_STD
1072
1073#ifdef CONFIG_PPC_P7_NAP 1084#ifdef CONFIG_PPC_P7_NAP
1074/* 1085/*
1075 * This is an idle wakeup. Low level machine check has already been 1086 * This is an idle wakeup. Low level machine check has already been
@@ -1101,72 +1112,8 @@ EXC_COMMON_BEGIN(machine_check_idle_common)
1101 bltlr cr1 /* no state loss, return to idle caller */ 1112 bltlr cr1 /* no state loss, return to idle caller */
1102 b idle_return_gpr_loss 1113 b idle_return_gpr_loss
1103#endif 1114#endif
1104 /*
1105 * Handle machine check early in real mode. We come here with
1106 * ME=1, MMU (IR=0 and DR=0) off and using MC emergency stack.
1107 */
1108EXC_COMMON_BEGIN(machine_check_handle_early)
1109 std r0,GPR0(r1) /* Save r0 */
1110 EXCEPTION_PROLOG_COMMON_3(0x200)
1111 bl save_nvgprs
1112 addi r3,r1,STACK_FRAME_OVERHEAD
1113 bl machine_check_early
1114 std r3,RESULT(r1) /* Save result */
1115 ld r12,_MSR(r1)
1116BEGIN_FTR_SECTION
1117 b 4f
1118END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)
1119 1115
1120#ifdef CONFIG_PPC_P7_NAP 1116EXC_COMMON_BEGIN(unrecoverable_mce)
1121 /*
1122 * Check if thread was in power saving mode. We come here when any
1123 * of the following is true:
1124 * a. thread wasn't in power saving mode
1125 * b. thread was in power saving mode with no state loss,
1126 * supervisor state loss or hypervisor state loss.
1127 *
1128 * Go back to nap/sleep/winkle mode again if (b) is true.
1129 */
1130BEGIN_FTR_SECTION
1131 rlwinm. r11,r12,47-31,30,31
1132 bne machine_check_idle_common
1133END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
1134#endif
1135
1136 /*
1137 * Check if we are coming from hypervisor userspace. If yes then we
1138 * continue in host kernel in V mode to deliver the MC event.
1139 */
1140 rldicl. r11,r12,4,63 /* See if MC hit while in HV mode. */
1141 beq 5f
11424: andi. r11,r12,MSR_PR /* See if coming from user. */
1143 bne 9f /* continue in V mode if we are. */
1144
11455:
1146#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
1147BEGIN_FTR_SECTION
1148 /*
1149 * We are coming from kernel context. Check if we are coming from
1150 * guest. If yes, then we can continue. We will fall through
1151 * do_kvm_200->kvmppc_interrupt to deliver the MC event to guest.
1152 */
1153 lbz r11,HSTATE_IN_GUEST(r13)
1154 cmpwi r11,0 /* Check if coming from guest */
1155 bne 9f /* continue if we are. */
1156END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
1157#endif
1158 /*
1159 * At this point we are not sure about what context we come from.
1160 * Queue up the MCE event and return from the interrupt.
1161 * But before that, check if this is an un-recoverable exception.
1162 * If yes, then stay on emergency stack and panic.
1163 */
1164 andi. r11,r12,MSR_RI
1165 bne 2f
11661: mfspr r11,SPRN_SRR0
1167 LOAD_HANDLER(r10,unrecover_mce)
1168 mtspr SPRN_SRR0,r10
1169 ld r10,PACAKMSR(r13)
1170 /* 1117 /*
1171 * We are going down. But there are chances that we might get hit by 1118 * We are going down. But there are chances that we might get hit by
1172 * another MCE during panic path and we may run into unstable state 1119 * another MCE during panic path and we may run into unstable state
@@ -1174,84 +1121,36 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
1174 * when another MCE is hit during panic path, system will checkstop 1121 * when another MCE is hit during panic path, system will checkstop
1175 * and hypervisor will get restarted cleanly by SP. 1122 * and hypervisor will get restarted cleanly by SP.
1176 */ 1123 */
1177 li r3,MSR_ME
1178 andc r10,r10,r3 /* Turn off MSR_ME */
1179 mtspr SPRN_SRR1,r10
1180 RFI_TO_KERNEL
1181 b .
11822:
1183 /*
1184 * Check if we have successfully handled/recovered from error, if not
1185 * then stay on emergency stack and panic.
1186 */
1187 ld r3,RESULT(r1) /* Load result */
1188 cmpdi r3,0 /* see if we handled MCE successfully */
1189
1190 beq 1b /* if !handled then panic */
1191BEGIN_FTR_SECTION 1124BEGIN_FTR_SECTION
1192 /* 1125 li r10,0 /* clear MSR_RI */
1193 * Return from MC interrupt. 1126 mtmsrd r10,1
1194 * Queue up the MCE event so that we can log it later, while 1127 bl disable_machine_check
1195 * returning from kernel or opal call. 1128END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
1196 */
1197 bl machine_check_queue_event
1198 MACHINE_CHECK_HANDLER_WINDUP
1199 RFI_TO_USER_OR_KERNEL
1200FTR_SECTION_ELSE
1201 /*
1202 * pSeries: Return from MC interrupt. Before that stay on emergency
1203 * stack and call machine_check_exception to log the MCE event.
1204 */
1205 LOAD_HANDLER(r10,mce_return)
1206 mtspr SPRN_SRR0,r10
1207 ld r10,PACAKMSR(r13) 1129 ld r10,PACAKMSR(r13)
1208 mtspr SPRN_SRR1,r10 1130 li r3,MSR_ME
1209 RFI_TO_KERNEL 1131 andc r10,r10,r3
1210 b . 1132 mtmsrd r10
1211ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
12129:
1213 /* Deliver the machine check to host kernel in V mode. */
1214 MACHINE_CHECK_HANDLER_WINDUP
1215 EXCEPTION_PROLOG_0 PACA_EXMC
1216 b machine_check_pSeries_0
1217 1133
1218EXC_COMMON_BEGIN(unrecover_mce)
1219 /* Invoke machine_check_exception to print MCE event and panic. */ 1134 /* Invoke machine_check_exception to print MCE event and panic. */
1220 addi r3,r1,STACK_FRAME_OVERHEAD 1135 addi r3,r1,STACK_FRAME_OVERHEAD
1221 bl machine_check_exception 1136 bl machine_check_exception
1137
1222 /* 1138 /*
1223 * We will not reach here. Even if we did, there is no way out. Call 1139 * We will not reach here. Even if we did, there is no way out.
1224 * unrecoverable_exception and die. 1140 * Call unrecoverable_exception and die.
1225 */ 1141 */
12261: addi r3,r1,STACK_FRAME_OVERHEAD
1227 bl unrecoverable_exception
1228 b 1b
1229
1230EXC_COMMON_BEGIN(mce_return)
1231 /* Invoke machine_check_exception to print MCE event and return. */
1232 addi r3,r1,STACK_FRAME_OVERHEAD 1142 addi r3,r1,STACK_FRAME_OVERHEAD
1233 bl machine_check_exception 1143 bl unrecoverable_exception
1234 MACHINE_CHECK_HANDLER_WINDUP
1235 RFI_TO_KERNEL
1236 b . 1144 b .
1237 1145
1146
1238EXC_REAL_BEGIN(data_access, 0x300, 0x80) 1147EXC_REAL_BEGIN(data_access, 0x300, 0x80)
1239 EXCEPTION_PROLOG_0 PACA_EXGEN 1148 INT_HANDLER data_access, 0x300, ool=1, dar=1, dsisr=1, kvm=1
1240 b tramp_real_data_access
1241EXC_REAL_END(data_access, 0x300, 0x80) 1149EXC_REAL_END(data_access, 0x300, 0x80)
1242
1243TRAMP_REAL_BEGIN(tramp_real_data_access)
1244 EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, 0x300, 1, 1, 0
1245 EXCEPTION_PROLOG_2_REAL data_access_common, EXC_STD, 1
1246
1247EXC_VIRT_BEGIN(data_access, 0x4300, 0x80) 1150EXC_VIRT_BEGIN(data_access, 0x4300, 0x80)
1248 EXCEPTION_PROLOG_0 PACA_EXGEN 1151 INT_HANDLER data_access, 0x300, virt=1, dar=1, dsisr=1
1249 EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 0, 0x300, 1, 1, 0
1250EXCEPTION_PROLOG_2_VIRT data_access_common, EXC_STD
1251EXC_VIRT_END(data_access, 0x4300, 0x80) 1152EXC_VIRT_END(data_access, 0x4300, 0x80)
1252 1153INT_KVM_HANDLER data_access, 0x300, EXC_STD, PACA_EXGEN, 1
1253TRAMP_KVM_SKIP(PACA_EXGEN, 0x300)
1254
1255EXC_COMMON_BEGIN(data_access_common) 1154EXC_COMMON_BEGIN(data_access_common)
1256 /* 1155 /*
1257 * Here r13 points to the paca, r9 contains the saved CR, 1156 * Here r13 points to the paca, r9 contains the saved CR,
@@ -1259,15 +1158,12 @@ EXC_COMMON_BEGIN(data_access_common)
1259 * r9 - r13 are saved in paca->exgen. 1158 * r9 - r13 are saved in paca->exgen.
1260 * EX_DAR and EX_DSISR have saved DAR/DSISR 1159 * EX_DAR and EX_DSISR have saved DAR/DSISR
1261 */ 1160 */
1262 EXCEPTION_COMMON(PACA_EXGEN, 0x300) 1161 INT_COMMON 0x300, PACA_EXGEN, 1, 1, 1, 1, 1
1263 RECONCILE_IRQ_STATE(r10, r11) 1162 ld r4,_DAR(r1)
1264 ld r12,_MSR(r1) 1163 ld r5,_DSISR(r1)
1265 ld r3,PACA_EXGEN+EX_DAR(r13)
1266 lwz r4,PACA_EXGEN+EX_DSISR(r13)
1267 li r5,0x300
1268 std r3,_DAR(r1)
1269 std r4,_DSISR(r1)
1270BEGIN_MMU_FTR_SECTION 1164BEGIN_MMU_FTR_SECTION
1165 ld r6,_MSR(r1)
1166 li r3,0x300
1271 b do_hash_page /* Try to handle as hpte fault */ 1167 b do_hash_page /* Try to handle as hpte fault */
1272MMU_FTR_SECTION_ELSE 1168MMU_FTR_SECTION_ELSE
1273 b handle_page_fault 1169 b handle_page_fault
@@ -1275,26 +1171,15 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
1275 1171
1276 1172
1277EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80) 1173EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80)
1278 EXCEPTION_PROLOG_0 PACA_EXSLB 1174 INT_HANDLER data_access_slb, 0x380, ool=1, area=PACA_EXSLB, dar=1, kvm=1
1279 b tramp_real_data_access_slb
1280EXC_REAL_END(data_access_slb, 0x380, 0x80) 1175EXC_REAL_END(data_access_slb, 0x380, 0x80)
1281
1282TRAMP_REAL_BEGIN(tramp_real_data_access_slb)
1283 EXCEPTION_PROLOG_1 EXC_STD, PACA_EXSLB, 1, 0x380, 1, 0, 0
1284 EXCEPTION_PROLOG_2_REAL data_access_slb_common, EXC_STD, 1
1285
1286EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) 1176EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80)
1287 EXCEPTION_PROLOG_0 PACA_EXSLB 1177 INT_HANDLER data_access_slb, 0x380, virt=1, area=PACA_EXSLB, dar=1
1288 EXCEPTION_PROLOG_1 EXC_STD, PACA_EXSLB, 0, 0x380, 1, 0, 0
1289 EXCEPTION_PROLOG_2_VIRT data_access_slb_common, EXC_STD
1290EXC_VIRT_END(data_access_slb, 0x4380, 0x80) 1178EXC_VIRT_END(data_access_slb, 0x4380, 0x80)
1291 1179INT_KVM_HANDLER data_access_slb, 0x380, EXC_STD, PACA_EXSLB, 1
1292TRAMP_KVM_SKIP(PACA_EXSLB, 0x380)
1293
1294EXC_COMMON_BEGIN(data_access_slb_common) 1180EXC_COMMON_BEGIN(data_access_slb_common)
1295 EXCEPTION_COMMON(PACA_EXSLB, 0x380) 1181 INT_COMMON 0x380, PACA_EXSLB, 1, 1, 0, 1, 0
1296 ld r4,PACA_EXSLB+EX_DAR(r13) 1182 ld r4,_DAR(r1)
1297 std r4,_DAR(r1)
1298 addi r3,r1,STACK_FRAME_OVERHEAD 1183 addi r3,r1,STACK_FRAME_OVERHEAD
1299BEGIN_MMU_FTR_SECTION 1184BEGIN_MMU_FTR_SECTION
1300 /* HPT case, do SLB fault */ 1185 /* HPT case, do SLB fault */
@@ -1317,33 +1202,36 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
1317 b ret_from_except 1202 b ret_from_except
1318 1203
1319 1204
1320EXC_REAL(instruction_access, 0x400, 0x80) 1205EXC_REAL_BEGIN(instruction_access, 0x400, 0x80)
1321EXC_VIRT(instruction_access, 0x4400, 0x80, 0x400) 1206 INT_HANDLER instruction_access, 0x400, kvm=1
1322TRAMP_KVM(PACA_EXGEN, 0x400) 1207EXC_REAL_END(instruction_access, 0x400, 0x80)
1323 1208EXC_VIRT_BEGIN(instruction_access, 0x4400, 0x80)
1209 INT_HANDLER instruction_access, 0x400, virt=1
1210EXC_VIRT_END(instruction_access, 0x4400, 0x80)
1211INT_KVM_HANDLER instruction_access, 0x400, EXC_STD, PACA_EXGEN, 0
1324EXC_COMMON_BEGIN(instruction_access_common) 1212EXC_COMMON_BEGIN(instruction_access_common)
1325 EXCEPTION_COMMON(PACA_EXGEN, 0x400) 1213 INT_COMMON 0x400, PACA_EXGEN, 1, 1, 1, 2, 2
1326 RECONCILE_IRQ_STATE(r10, r11) 1214 ld r4,_DAR(r1)
1327 ld r12,_MSR(r1) 1215 ld r5,_DSISR(r1)
1328 ld r3,_NIP(r1)
1329 andis. r4,r12,DSISR_SRR1_MATCH_64S@h
1330 li r5,0x400
1331 std r3,_DAR(r1)
1332 std r4,_DSISR(r1)
1333BEGIN_MMU_FTR_SECTION 1216BEGIN_MMU_FTR_SECTION
1217 ld r6,_MSR(r1)
1218 li r3,0x400
1334 b do_hash_page /* Try to handle as hpte fault */ 1219 b do_hash_page /* Try to handle as hpte fault */
1335MMU_FTR_SECTION_ELSE 1220MMU_FTR_SECTION_ELSE
1336 b handle_page_fault 1221 b handle_page_fault
1337ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) 1222ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
1338 1223
1339 1224
1340__EXC_REAL(instruction_access_slb, 0x480, 0x80, PACA_EXSLB) 1225EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x80)
1341__EXC_VIRT(instruction_access_slb, 0x4480, 0x80, 0x480, PACA_EXSLB) 1226 INT_HANDLER instruction_access_slb, 0x480, area=PACA_EXSLB, kvm=1
1342TRAMP_KVM(PACA_EXSLB, 0x480) 1227EXC_REAL_END(instruction_access_slb, 0x480, 0x80)
1343 1228EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80)
1229 INT_HANDLER instruction_access_slb, 0x480, virt=1, area=PACA_EXSLB
1230EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80)
1231INT_KVM_HANDLER instruction_access_slb, 0x480, EXC_STD, PACA_EXSLB, 0
1344EXC_COMMON_BEGIN(instruction_access_slb_common) 1232EXC_COMMON_BEGIN(instruction_access_slb_common)
1345 EXCEPTION_COMMON(PACA_EXSLB, 0x480) 1233 INT_COMMON 0x480, PACA_EXSLB, 1, 1, 0, 2, 0
1346 ld r4,_NIP(r1) 1234 ld r4,_DAR(r1)
1347 addi r3,r1,STACK_FRAME_OVERHEAD 1235 addi r3,r1,STACK_FRAME_OVERHEAD
1348BEGIN_MMU_FTR_SECTION 1236BEGIN_MMU_FTR_SECTION
1349 /* HPT case, do SLB fault */ 1237 /* HPT case, do SLB fault */
@@ -1359,69 +1247,44 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
1359 std r3,RESULT(r1) 1247 std r3,RESULT(r1)
1360 bl save_nvgprs 1248 bl save_nvgprs
1361 RECONCILE_IRQ_STATE(r10, r11) 1249 RECONCILE_IRQ_STATE(r10, r11)
1362 ld r4,_NIP(r1) 1250 ld r4,_DAR(r1)
1363 ld r5,RESULT(r1) 1251 ld r5,RESULT(r1)
1364 addi r3,r1,STACK_FRAME_OVERHEAD 1252 addi r3,r1,STACK_FRAME_OVERHEAD
1365 bl do_bad_slb_fault 1253 bl do_bad_slb_fault
1366 b ret_from_except 1254 b ret_from_except
1367 1255
1368
1369EXC_REAL_BEGIN(hardware_interrupt, 0x500, 0x100) 1256EXC_REAL_BEGIN(hardware_interrupt, 0x500, 0x100)
1370 EXCEPTION_PROLOG_0 PACA_EXGEN 1257 INT_HANDLER hardware_interrupt, 0x500, hsrr=EXC_HV_OR_STD, bitmask=IRQS_DISABLED, kvm=1
1371BEGIN_FTR_SECTION
1372 EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, 0x500, 0, 0, IRQS_DISABLED
1373 EXCEPTION_PROLOG_2_REAL hardware_interrupt_common, EXC_HV, 1
1374FTR_SECTION_ELSE
1375 EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, 0x500, 0, 0, IRQS_DISABLED
1376 EXCEPTION_PROLOG_2_REAL hardware_interrupt_common, EXC_STD, 1
1377ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
1378EXC_REAL_END(hardware_interrupt, 0x500, 0x100) 1258EXC_REAL_END(hardware_interrupt, 0x500, 0x100)
1379
1380EXC_VIRT_BEGIN(hardware_interrupt, 0x4500, 0x100) 1259EXC_VIRT_BEGIN(hardware_interrupt, 0x4500, 0x100)
1381 EXCEPTION_PROLOG_0 PACA_EXGEN 1260 INT_HANDLER hardware_interrupt, 0x500, virt=1, hsrr=EXC_HV_OR_STD, bitmask=IRQS_DISABLED, kvm=1
1382BEGIN_FTR_SECTION
1383 EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, 0x500, 0, 0, IRQS_DISABLED
1384 EXCEPTION_PROLOG_2_VIRT hardware_interrupt_common, EXC_HV
1385FTR_SECTION_ELSE
1386 EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, 0x500, 0, 0, IRQS_DISABLED
1387 EXCEPTION_PROLOG_2_VIRT hardware_interrupt_common, EXC_STD
1388ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
1389EXC_VIRT_END(hardware_interrupt, 0x4500, 0x100) 1261EXC_VIRT_END(hardware_interrupt, 0x4500, 0x100)
1390 1262INT_KVM_HANDLER hardware_interrupt, 0x500, EXC_HV_OR_STD, PACA_EXGEN, 0
1391TRAMP_KVM(PACA_EXGEN, 0x500)
1392TRAMP_KVM_HV(PACA_EXGEN, 0x500)
1393EXC_COMMON_ASYNC(hardware_interrupt_common, 0x500, do_IRQ) 1263EXC_COMMON_ASYNC(hardware_interrupt_common, 0x500, do_IRQ)
1394 1264
1395 1265
1396EXC_REAL_BEGIN(alignment, 0x600, 0x100) 1266EXC_REAL_BEGIN(alignment, 0x600, 0x100)
1397 EXCEPTION_PROLOG_0 PACA_EXGEN 1267 INT_HANDLER alignment, 0x600, dar=1, dsisr=1, kvm=1
1398 EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, 0x600, 1, 1, 0
1399 EXCEPTION_PROLOG_2_REAL alignment_common, EXC_STD, 1
1400EXC_REAL_END(alignment, 0x600, 0x100) 1268EXC_REAL_END(alignment, 0x600, 0x100)
1401
1402EXC_VIRT_BEGIN(alignment, 0x4600, 0x100) 1269EXC_VIRT_BEGIN(alignment, 0x4600, 0x100)
1403 EXCEPTION_PROLOG_0 PACA_EXGEN 1270 INT_HANDLER alignment, 0x600, virt=1, dar=1, dsisr=1
1404 EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 0, 0x600, 1, 1, 0
1405 EXCEPTION_PROLOG_2_VIRT alignment_common, EXC_STD
1406EXC_VIRT_END(alignment, 0x4600, 0x100) 1271EXC_VIRT_END(alignment, 0x4600, 0x100)
1407 1272INT_KVM_HANDLER alignment, 0x600, EXC_STD, PACA_EXGEN, 0
1408TRAMP_KVM(PACA_EXGEN, 0x600)
1409EXC_COMMON_BEGIN(alignment_common) 1273EXC_COMMON_BEGIN(alignment_common)
1410 EXCEPTION_COMMON(PACA_EXGEN, 0x600) 1274 INT_COMMON 0x600, PACA_EXGEN, 1, 1, 1, 1, 1
1411 ld r3,PACA_EXGEN+EX_DAR(r13)
1412 lwz r4,PACA_EXGEN+EX_DSISR(r13)
1413 std r3,_DAR(r1)
1414 std r4,_DSISR(r1)
1415 bl save_nvgprs 1275 bl save_nvgprs
1416 RECONCILE_IRQ_STATE(r10, r11)
1417 addi r3,r1,STACK_FRAME_OVERHEAD 1276 addi r3,r1,STACK_FRAME_OVERHEAD
1418 bl alignment_exception 1277 bl alignment_exception
1419 b ret_from_except 1278 b ret_from_except
1420 1279
1421 1280
1422EXC_REAL(program_check, 0x700, 0x100) 1281EXC_REAL_BEGIN(program_check, 0x700, 0x100)
1423EXC_VIRT(program_check, 0x4700, 0x100, 0x700) 1282 INT_HANDLER program_check, 0x700, kvm=1
1424TRAMP_KVM(PACA_EXGEN, 0x700) 1283EXC_REAL_END(program_check, 0x700, 0x100)
1284EXC_VIRT_BEGIN(program_check, 0x4700, 0x100)
1285 INT_HANDLER program_check, 0x700, virt=1
1286EXC_VIRT_END(program_check, 0x4700, 0x100)
1287INT_KVM_HANDLER program_check, 0x700, EXC_STD, PACA_EXGEN, 0
1425EXC_COMMON_BEGIN(program_check_common) 1288EXC_COMMON_BEGIN(program_check_common)
1426 /* 1289 /*
1427 * It's possible to receive a TM Bad Thing type program check with 1290 * It's possible to receive a TM Bad Thing type program check with
@@ -1447,27 +1310,33 @@ EXC_COMMON_BEGIN(program_check_common)
1447 mr r10,r1 /* Save r1 */ 1310 mr r10,r1 /* Save r1 */
1448 ld r1,PACAEMERGSP(r13) /* Use emergency stack */ 1311 ld r1,PACAEMERGSP(r13) /* Use emergency stack */
1449 subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ 1312 subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */
1450 b 3f /* Jump into the macro !! */ 1313 INT_COMMON 0x700, PACA_EXGEN, 0, 1, 1, 0, 0
1314 b 3f
14512: 13152:
1452 EXCEPTION_COMMON(PACA_EXGEN, 0x700) 1316 INT_COMMON 0x700, PACA_EXGEN, 1, 1, 1, 0, 0
13173:
1453 bl save_nvgprs 1318 bl save_nvgprs
1454 RECONCILE_IRQ_STATE(r10, r11)
1455 addi r3,r1,STACK_FRAME_OVERHEAD 1319 addi r3,r1,STACK_FRAME_OVERHEAD
1456 bl program_check_exception 1320 bl program_check_exception
1457 b ret_from_except 1321 b ret_from_except
1458 1322
1459 1323
1460EXC_REAL(fp_unavailable, 0x800, 0x100) 1324EXC_REAL_BEGIN(fp_unavailable, 0x800, 0x100)
1461EXC_VIRT(fp_unavailable, 0x4800, 0x100, 0x800) 1325 INT_HANDLER fp_unavailable, 0x800, kvm=1
1462TRAMP_KVM(PACA_EXGEN, 0x800) 1326EXC_REAL_END(fp_unavailable, 0x800, 0x100)
1327EXC_VIRT_BEGIN(fp_unavailable, 0x4800, 0x100)
1328 INT_HANDLER fp_unavailable, 0x800, virt=1
1329EXC_VIRT_END(fp_unavailable, 0x4800, 0x100)
1330INT_KVM_HANDLER fp_unavailable, 0x800, EXC_STD, PACA_EXGEN, 0
1463EXC_COMMON_BEGIN(fp_unavailable_common) 1331EXC_COMMON_BEGIN(fp_unavailable_common)
1464 EXCEPTION_COMMON(PACA_EXGEN, 0x800) 1332 INT_COMMON 0x800, PACA_EXGEN, 1, 1, 0, 0, 0
1465 bne 1f /* if from user, just load it up */ 1333 bne 1f /* if from user, just load it up */
1466 bl save_nvgprs 1334 bl save_nvgprs
1467 RECONCILE_IRQ_STATE(r10, r11) 1335 RECONCILE_IRQ_STATE(r10, r11)
1468 addi r3,r1,STACK_FRAME_OVERHEAD 1336 addi r3,r1,STACK_FRAME_OVERHEAD
1469 bl kernel_fp_unavailable_exception 1337 bl kernel_fp_unavailable_exception
1470 BUG_OPCODE 13380: trap
1339 EMIT_BUG_ENTRY 0b, __FILE__, __LINE__, 0
14711: 13401:
1472#ifdef CONFIG_PPC_TRANSACTIONAL_MEM 1341#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
1473BEGIN_FTR_SECTION 1342BEGIN_FTR_SECTION
@@ -1490,21 +1359,33 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM)
1490#endif 1359#endif
1491 1360
1492 1361
1493EXC_REAL_OOL_MASKABLE(decrementer, 0x900, 0x80, IRQS_DISABLED) 1362EXC_REAL_BEGIN(decrementer, 0x900, 0x80)
1494EXC_VIRT_MASKABLE(decrementer, 0x4900, 0x80, 0x900, IRQS_DISABLED) 1363 INT_HANDLER decrementer, 0x900, ool=1, bitmask=IRQS_DISABLED, kvm=1
1495TRAMP_KVM(PACA_EXGEN, 0x900) 1364EXC_REAL_END(decrementer, 0x900, 0x80)
1365EXC_VIRT_BEGIN(decrementer, 0x4900, 0x80)
1366 INT_HANDLER decrementer, 0x900, virt=1, bitmask=IRQS_DISABLED
1367EXC_VIRT_END(decrementer, 0x4900, 0x80)
1368INT_KVM_HANDLER decrementer, 0x900, EXC_STD, PACA_EXGEN, 0
1496EXC_COMMON_ASYNC(decrementer_common, 0x900, timer_interrupt) 1369EXC_COMMON_ASYNC(decrementer_common, 0x900, timer_interrupt)
1497 1370
1498 1371
1499EXC_REAL_HV(hdecrementer, 0x980, 0x80) 1372EXC_REAL_BEGIN(hdecrementer, 0x980, 0x80)
1500EXC_VIRT_HV(hdecrementer, 0x4980, 0x80, 0x980) 1373 INT_HANDLER hdecrementer, 0x980, hsrr=EXC_HV, kvm=1
1501TRAMP_KVM_HV(PACA_EXGEN, 0x980) 1374EXC_REAL_END(hdecrementer, 0x980, 0x80)
1375EXC_VIRT_BEGIN(hdecrementer, 0x4980, 0x80)
1376 INT_HANDLER hdecrementer, 0x980, virt=1, hsrr=EXC_HV, kvm=1
1377EXC_VIRT_END(hdecrementer, 0x4980, 0x80)
1378INT_KVM_HANDLER hdecrementer, 0x980, EXC_HV, PACA_EXGEN, 0
1502EXC_COMMON(hdecrementer_common, 0x980, hdec_interrupt) 1379EXC_COMMON(hdecrementer_common, 0x980, hdec_interrupt)
1503 1380
1504 1381
1505EXC_REAL_MASKABLE(doorbell_super, 0xa00, 0x100, IRQS_DISABLED) 1382EXC_REAL_BEGIN(doorbell_super, 0xa00, 0x100)
1506EXC_VIRT_MASKABLE(doorbell_super, 0x4a00, 0x100, 0xa00, IRQS_DISABLED) 1383 INT_HANDLER doorbell_super, 0xa00, bitmask=IRQS_DISABLED, kvm=1
1507TRAMP_KVM(PACA_EXGEN, 0xa00) 1384EXC_REAL_END(doorbell_super, 0xa00, 0x100)
1385EXC_VIRT_BEGIN(doorbell_super, 0x4a00, 0x100)
1386 INT_HANDLER doorbell_super, 0xa00, virt=1, bitmask=IRQS_DISABLED
1387EXC_VIRT_END(doorbell_super, 0x4a00, 0x100)
1388INT_KVM_HANDLER doorbell_super, 0xa00, EXC_STD, PACA_EXGEN, 0
1508#ifdef CONFIG_PPC_DOORBELL 1389#ifdef CONFIG_PPC_DOORBELL
1509EXC_COMMON_ASYNC(doorbell_super_common, 0xa00, doorbell_exception) 1390EXC_COMMON_ASYNC(doorbell_super_common, 0xa00, doorbell_exception)
1510#else 1391#else
@@ -1512,17 +1393,13 @@ EXC_COMMON_ASYNC(doorbell_super_common, 0xa00, unknown_exception)
1512#endif 1393#endif
1513 1394
1514 1395
1515EXC_REAL(trap_0b, 0xb00, 0x100) 1396EXC_REAL_NONE(0xb00, 0x100)
1516EXC_VIRT(trap_0b, 0x4b00, 0x100, 0xb00) 1397EXC_VIRT_NONE(0x4b00, 0x100)
1517TRAMP_KVM(PACA_EXGEN, 0xb00)
1518EXC_COMMON(trap_0b_common, 0xb00, unknown_exception)
1519 1398
1520/* 1399/*
1521 * system call / hypercall (0xc00, 0x4c00) 1400 * system call / hypercall (0xc00, 0x4c00)
1522 * 1401 *
1523 * The system call exception is invoked with "sc 0" and does not alter HV bit. 1402 * The system call exception is invoked with "sc 0" and does not alter HV bit.
1524 * There is support for kernel code to invoke system calls but there are no
1525 * in-tree users.
1526 * 1403 *
1527 * The hypercall is invoked with "sc 1" and sets HV=1. 1404 * The hypercall is invoked with "sc 1" and sets HV=1.
1528 * 1405 *
@@ -1567,7 +1444,7 @@ EXC_COMMON(trap_0b_common, 0xb00, unknown_exception)
1567 GET_PACA(r13) 1444 GET_PACA(r13)
1568 std r10,PACA_EXGEN+EX_R10(r13) 1445 std r10,PACA_EXGEN+EX_R10(r13)
1569 INTERRUPT_TO_KERNEL 1446 INTERRUPT_TO_KERNEL
1570 KVMTEST EXC_STD 0xc00 /* uses r10, branch to do_kvm_0xc00_system_call */ 1447 KVMTEST system_call EXC_STD 0xc00 /* uses r10, branch to system_call_kvm */
1571 mfctr r9 1448 mfctr r9
1572#else 1449#else
1573 mr r9,r13 1450 mr r9,r13
@@ -1621,7 +1498,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)
1621EXC_REAL_BEGIN(system_call, 0xc00, 0x100) 1498EXC_REAL_BEGIN(system_call, 0xc00, 0x100)
1622 SYSTEM_CALL 0 1499 SYSTEM_CALL 0
1623EXC_REAL_END(system_call, 0xc00, 0x100) 1500EXC_REAL_END(system_call, 0xc00, 0x100)
1624
1625EXC_VIRT_BEGIN(system_call, 0x4c00, 0x100) 1501EXC_VIRT_BEGIN(system_call, 0x4c00, 0x100)
1626 SYSTEM_CALL 1 1502 SYSTEM_CALL 1
1627EXC_VIRT_END(system_call, 0x4c00, 0x100) 1503EXC_VIRT_END(system_call, 0x4c00, 0x100)
@@ -1634,7 +1510,7 @@ EXC_VIRT_END(system_call, 0x4c00, 0x100)
1634 * ctr = orig r13 1510 * ctr = orig r13
1635 * orig r10 saved in PACA 1511 * orig r10 saved in PACA
1636 */ 1512 */
1637TRAMP_KVM_BEGIN(do_kvm_0xc00) 1513TRAMP_KVM_BEGIN(system_call_kvm)
1638 /* 1514 /*
1639 * Save the PPR (on systems that support it) before changing to 1515 * Save the PPR (on systems that support it) before changing to
1640 * HMT_MEDIUM. That allows the KVM code to save that value into the 1516 * HMT_MEDIUM. That allows the KVM code to save that value into the
@@ -1647,32 +1523,33 @@ TRAMP_KVM_BEGIN(do_kvm_0xc00)
1647 SET_SCRATCH0(r10) 1523 SET_SCRATCH0(r10)
1648 std r9,PACA_EXGEN+EX_R9(r13) 1524 std r9,PACA_EXGEN+EX_R9(r13)
1649 mfcr r9 1525 mfcr r9
1650 KVM_HANDLER PACA_EXGEN, EXC_STD, 0xc00, 0 1526 KVM_HANDLER 0xc00, EXC_STD, PACA_EXGEN, 0
1651#endif 1527#endif
1652 1528
1653 1529
1654EXC_REAL(single_step, 0xd00, 0x100) 1530EXC_REAL_BEGIN(single_step, 0xd00, 0x100)
1655EXC_VIRT(single_step, 0x4d00, 0x100, 0xd00) 1531 INT_HANDLER single_step, 0xd00, kvm=1
1656TRAMP_KVM(PACA_EXGEN, 0xd00) 1532EXC_REAL_END(single_step, 0xd00, 0x100)
1533EXC_VIRT_BEGIN(single_step, 0x4d00, 0x100)
1534 INT_HANDLER single_step, 0xd00, virt=1
1535EXC_VIRT_END(single_step, 0x4d00, 0x100)
1536INT_KVM_HANDLER single_step, 0xd00, EXC_STD, PACA_EXGEN, 0
1657EXC_COMMON(single_step_common, 0xd00, single_step_exception) 1537EXC_COMMON(single_step_common, 0xd00, single_step_exception)
1658 1538
1659EXC_REAL_OOL_HV(h_data_storage, 0xe00, 0x20) 1539
1660EXC_VIRT_OOL_HV(h_data_storage, 0x4e00, 0x20, 0xe00) 1540EXC_REAL_BEGIN(h_data_storage, 0xe00, 0x20)
1661TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0xe00) 1541 INT_HANDLER h_data_storage, 0xe00, ool=1, hsrr=EXC_HV, dar=1, dsisr=1, kvm=1
1542EXC_REAL_END(h_data_storage, 0xe00, 0x20)
1543EXC_VIRT_BEGIN(h_data_storage, 0x4e00, 0x20)
1544 INT_HANDLER h_data_storage, 0xe00, ool=1, virt=1, hsrr=EXC_HV, dar=1, dsisr=1, kvm=1
1545EXC_VIRT_END(h_data_storage, 0x4e00, 0x20)
1546INT_KVM_HANDLER h_data_storage, 0xe00, EXC_HV, PACA_EXGEN, 1
1662EXC_COMMON_BEGIN(h_data_storage_common) 1547EXC_COMMON_BEGIN(h_data_storage_common)
1663 mfspr r10,SPRN_HDAR 1548 INT_COMMON 0xe00, PACA_EXGEN, 1, 1, 1, 1, 1
1664 std r10,PACA_EXGEN+EX_DAR(r13)
1665 mfspr r10,SPRN_HDSISR
1666 stw r10,PACA_EXGEN+EX_DSISR(r13)
1667 EXCEPTION_COMMON(PACA_EXGEN, 0xe00)
1668 bl save_nvgprs 1549 bl save_nvgprs
1669 RECONCILE_IRQ_STATE(r10, r11)
1670 addi r3,r1,STACK_FRAME_OVERHEAD 1550 addi r3,r1,STACK_FRAME_OVERHEAD
1671BEGIN_MMU_FTR_SECTION 1551BEGIN_MMU_FTR_SECTION
1672 ld r4,PACA_EXGEN+EX_DAR(r13) 1552 ld r4,_DAR(r1)
1673 lwz r5,PACA_EXGEN+EX_DSISR(r13)
1674 std r4,_DAR(r1)
1675 std r5,_DSISR(r1)
1676 li r5,SIGSEGV 1553 li r5,SIGSEGV
1677 bl bad_page_fault 1554 bl bad_page_fault
1678MMU_FTR_SECTION_ELSE 1555MMU_FTR_SECTION_ELSE
@@ -1681,15 +1558,23 @@ ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_TYPE_RADIX)
1681 b ret_from_except 1558 b ret_from_except
1682 1559
1683 1560
1684EXC_REAL_OOL_HV(h_instr_storage, 0xe20, 0x20) 1561EXC_REAL_BEGIN(h_instr_storage, 0xe20, 0x20)
1685EXC_VIRT_OOL_HV(h_instr_storage, 0x4e20, 0x20, 0xe20) 1562 INT_HANDLER h_instr_storage, 0xe20, ool=1, hsrr=EXC_HV, kvm=1
1686TRAMP_KVM_HV(PACA_EXGEN, 0xe20) 1563EXC_REAL_END(h_instr_storage, 0xe20, 0x20)
1564EXC_VIRT_BEGIN(h_instr_storage, 0x4e20, 0x20)
1565 INT_HANDLER h_instr_storage, 0xe20, ool=1, virt=1, hsrr=EXC_HV, kvm=1
1566EXC_VIRT_END(h_instr_storage, 0x4e20, 0x20)
1567INT_KVM_HANDLER h_instr_storage, 0xe20, EXC_HV, PACA_EXGEN, 0
1687EXC_COMMON(h_instr_storage_common, 0xe20, unknown_exception) 1568EXC_COMMON(h_instr_storage_common, 0xe20, unknown_exception)
1688 1569
1689 1570
1690EXC_REAL_OOL_HV(emulation_assist, 0xe40, 0x20) 1571EXC_REAL_BEGIN(emulation_assist, 0xe40, 0x20)
1691EXC_VIRT_OOL_HV(emulation_assist, 0x4e40, 0x20, 0xe40) 1572 INT_HANDLER emulation_assist, 0xe40, ool=1, hsrr=EXC_HV, kvm=1
1692TRAMP_KVM_HV(PACA_EXGEN, 0xe40) 1573EXC_REAL_END(emulation_assist, 0xe40, 0x20)
1574EXC_VIRT_BEGIN(emulation_assist, 0x4e40, 0x20)
1575 INT_HANDLER emulation_assist, 0xe40, ool=1, virt=1, hsrr=EXC_HV, kvm=1
1576EXC_VIRT_END(emulation_assist, 0x4e40, 0x20)
1577INT_KVM_HANDLER emulation_assist, 0xe40, EXC_HV, PACA_EXGEN, 0
1693EXC_COMMON(emulation_assist_common, 0xe40, emulation_assist_interrupt) 1578EXC_COMMON(emulation_assist_common, 0xe40, emulation_assist_interrupt)
1694 1579
1695 1580
@@ -1699,16 +1584,10 @@ EXC_COMMON(emulation_assist_common, 0xe40, emulation_assist_interrupt)
1699 * mode. 1584 * mode.
1700 */ 1585 */
1701EXC_REAL_BEGIN(hmi_exception, 0xe60, 0x20) 1586EXC_REAL_BEGIN(hmi_exception, 0xe60, 0x20)
1702 EXCEPTION_PROLOG_0 PACA_EXGEN 1587 INT_HANDLER hmi_exception, 0xe60, ool=1, early=1, hsrr=EXC_HV, ri=0, kvm=1
1703 b hmi_exception_early
1704EXC_REAL_END(hmi_exception, 0xe60, 0x20) 1588EXC_REAL_END(hmi_exception, 0xe60, 0x20)
1705EXC_VIRT_NONE(0x4e60, 0x20) 1589EXC_VIRT_NONE(0x4e60, 0x20)
1706TRAMP_KVM_HV(PACA_EXGEN, 0xe60) 1590INT_KVM_HANDLER hmi_exception, 0xe60, EXC_HV, PACA_EXGEN, 0
1707TRAMP_REAL_BEGIN(hmi_exception_early)
1708 EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, 0xe60, 0, 0, 0
1709 mfctr r10 /* save ctr, even for !RELOCATABLE */
1710 BRANCH_TO_C000(r11, hmi_exception_early_common)
1711
1712EXC_COMMON_BEGIN(hmi_exception_early_common) 1591EXC_COMMON_BEGIN(hmi_exception_early_common)
1713 mtctr r10 /* Restore ctr */ 1592 mtctr r10 /* Restore ctr */
1714 mfspr r11,SPRN_HSRR0 /* Save HSRR0 */ 1593 mfspr r11,SPRN_HSRR0 /* Save HSRR0 */
@@ -1716,10 +1595,10 @@ EXC_COMMON_BEGIN(hmi_exception_early_common)
1716 mr r10,r1 /* Save r1 */ 1595 mr r10,r1 /* Save r1 */
1717 ld r1,PACAEMERGSP(r13) /* Use emergency stack for realmode */ 1596 ld r1,PACAEMERGSP(r13) /* Use emergency stack for realmode */
1718 subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ 1597 subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */
1719 EXCEPTION_PROLOG_COMMON_1() 1598
1720 /* We don't touch AMR here, we never go to virtual mode */ 1599 /* We don't touch AMR here, we never go to virtual mode */
1721 EXCEPTION_PROLOG_COMMON_2(PACA_EXGEN) 1600 INT_COMMON 0xe60, PACA_EXGEN, 0, 0, 0, 0, 0
1722 EXCEPTION_PROLOG_COMMON_3(0xe60) 1601
1723 addi r3,r1,STACK_FRAME_OVERHEAD 1602 addi r3,r1,STACK_FRAME_OVERHEAD
1724 bl hmi_exception_realmode 1603 bl hmi_exception_realmode
1725 cmpdi cr0,r3,0 1604 cmpdi cr0,r3,0
@@ -1734,23 +1613,25 @@ EXC_COMMON_BEGIN(hmi_exception_early_common)
1734 * firmware. 1613 * firmware.
1735 */ 1614 */
1736 EXCEPTION_RESTORE_REGS EXC_HV 1615 EXCEPTION_RESTORE_REGS EXC_HV
1737 EXCEPTION_PROLOG_0 PACA_EXGEN 1616 INT_HANDLER hmi_exception, 0xe60, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1
1738 EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, 0xe60, 0, 0, IRQS_DISABLED
1739 EXCEPTION_PROLOG_2_REAL hmi_exception_common, EXC_HV, 1
1740 1617
1741EXC_COMMON_BEGIN(hmi_exception_common) 1618EXC_COMMON_BEGIN(hmi_exception_common)
1742 EXCEPTION_COMMON(PACA_EXGEN, 0xe60) 1619 INT_COMMON 0xe60, PACA_EXGEN, 1, 1, 1, 0, 0
1743 FINISH_NAP 1620 FINISH_NAP
1744 bl save_nvgprs
1745 RECONCILE_IRQ_STATE(r10, r11)
1746 RUNLATCH_ON 1621 RUNLATCH_ON
1622 bl save_nvgprs
1747 addi r3,r1,STACK_FRAME_OVERHEAD 1623 addi r3,r1,STACK_FRAME_OVERHEAD
1748 bl handle_hmi_exception 1624 bl handle_hmi_exception
1749 b ret_from_except 1625 b ret_from_except
1750 1626
1751EXC_REAL_OOL_MASKABLE_HV(h_doorbell, 0xe80, 0x20, IRQS_DISABLED) 1627
1752EXC_VIRT_OOL_MASKABLE_HV(h_doorbell, 0x4e80, 0x20, 0xe80, IRQS_DISABLED) 1628EXC_REAL_BEGIN(h_doorbell, 0xe80, 0x20)
1753TRAMP_KVM_HV(PACA_EXGEN, 0xe80) 1629 INT_HANDLER h_doorbell, 0xe80, ool=1, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1
1630EXC_REAL_END(h_doorbell, 0xe80, 0x20)
1631EXC_VIRT_BEGIN(h_doorbell, 0x4e80, 0x20)
1632 INT_HANDLER h_doorbell, 0xe80, ool=1, virt=1, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1
1633EXC_VIRT_END(h_doorbell, 0x4e80, 0x20)
1634INT_KVM_HANDLER h_doorbell, 0xe80, EXC_HV, PACA_EXGEN, 0
1754#ifdef CONFIG_PPC_DOORBELL 1635#ifdef CONFIG_PPC_DOORBELL
1755EXC_COMMON_ASYNC(h_doorbell_common, 0xe80, doorbell_exception) 1636EXC_COMMON_ASYNC(h_doorbell_common, 0xe80, doorbell_exception)
1756#else 1637#else
@@ -1758,9 +1639,13 @@ EXC_COMMON_ASYNC(h_doorbell_common, 0xe80, unknown_exception)
1758#endif 1639#endif
1759 1640
1760 1641
1761EXC_REAL_OOL_MASKABLE_HV(h_virt_irq, 0xea0, 0x20, IRQS_DISABLED) 1642EXC_REAL_BEGIN(h_virt_irq, 0xea0, 0x20)
1762EXC_VIRT_OOL_MASKABLE_HV(h_virt_irq, 0x4ea0, 0x20, 0xea0, IRQS_DISABLED) 1643 INT_HANDLER h_virt_irq, 0xea0, ool=1, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1
1763TRAMP_KVM_HV(PACA_EXGEN, 0xea0) 1644EXC_REAL_END(h_virt_irq, 0xea0, 0x20)
1645EXC_VIRT_BEGIN(h_virt_irq, 0x4ea0, 0x20)
1646 INT_HANDLER h_virt_irq, 0xea0, ool=1, virt=1, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1
1647EXC_VIRT_END(h_virt_irq, 0x4ea0, 0x20)
1648INT_KVM_HANDLER h_virt_irq, 0xea0, EXC_HV, PACA_EXGEN, 0
1764EXC_COMMON_ASYNC(h_virt_irq_common, 0xea0, do_IRQ) 1649EXC_COMMON_ASYNC(h_virt_irq_common, 0xea0, do_IRQ)
1765 1650
1766 1651
@@ -1770,17 +1655,25 @@ EXC_REAL_NONE(0xee0, 0x20)
1770EXC_VIRT_NONE(0x4ee0, 0x20) 1655EXC_VIRT_NONE(0x4ee0, 0x20)
1771 1656
1772 1657
1773EXC_REAL_OOL_MASKABLE(performance_monitor, 0xf00, 0x20, IRQS_PMI_DISABLED) 1658EXC_REAL_BEGIN(performance_monitor, 0xf00, 0x20)
1774EXC_VIRT_OOL_MASKABLE(performance_monitor, 0x4f00, 0x20, 0xf00, IRQS_PMI_DISABLED) 1659 INT_HANDLER performance_monitor, 0xf00, ool=1, bitmask=IRQS_PMI_DISABLED, kvm=1
1775TRAMP_KVM(PACA_EXGEN, 0xf00) 1660EXC_REAL_END(performance_monitor, 0xf00, 0x20)
1661EXC_VIRT_BEGIN(performance_monitor, 0x4f00, 0x20)
1662 INT_HANDLER performance_monitor, 0xf00, ool=1, virt=1, bitmask=IRQS_PMI_DISABLED
1663EXC_VIRT_END(performance_monitor, 0x4f00, 0x20)
1664INT_KVM_HANDLER performance_monitor, 0xf00, EXC_STD, PACA_EXGEN, 0
1776EXC_COMMON_ASYNC(performance_monitor_common, 0xf00, performance_monitor_exception) 1665EXC_COMMON_ASYNC(performance_monitor_common, 0xf00, performance_monitor_exception)
1777 1666
1778 1667
1779EXC_REAL_OOL(altivec_unavailable, 0xf20, 0x20) 1668EXC_REAL_BEGIN(altivec_unavailable, 0xf20, 0x20)
1780EXC_VIRT_OOL(altivec_unavailable, 0x4f20, 0x20, 0xf20) 1669 INT_HANDLER altivec_unavailable, 0xf20, ool=1, kvm=1
1781TRAMP_KVM(PACA_EXGEN, 0xf20) 1670EXC_REAL_END(altivec_unavailable, 0xf20, 0x20)
1671EXC_VIRT_BEGIN(altivec_unavailable, 0x4f20, 0x20)
1672 INT_HANDLER altivec_unavailable, 0xf20, ool=1, virt=1
1673EXC_VIRT_END(altivec_unavailable, 0x4f20, 0x20)
1674INT_KVM_HANDLER altivec_unavailable, 0xf20, EXC_STD, PACA_EXGEN, 0
1782EXC_COMMON_BEGIN(altivec_unavailable_common) 1675EXC_COMMON_BEGIN(altivec_unavailable_common)
1783 EXCEPTION_COMMON(PACA_EXGEN, 0xf20) 1676 INT_COMMON 0xf20, PACA_EXGEN, 1, 1, 0, 0, 0
1784#ifdef CONFIG_ALTIVEC 1677#ifdef CONFIG_ALTIVEC
1785BEGIN_FTR_SECTION 1678BEGIN_FTR_SECTION
1786 beq 1f 1679 beq 1f
@@ -1813,11 +1706,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
1813 b ret_from_except 1706 b ret_from_except
1814 1707
1815 1708
1816EXC_REAL_OOL(vsx_unavailable, 0xf40, 0x20) 1709EXC_REAL_BEGIN(vsx_unavailable, 0xf40, 0x20)
1817EXC_VIRT_OOL(vsx_unavailable, 0x4f40, 0x20, 0xf40) 1710 INT_HANDLER vsx_unavailable, 0xf40, ool=1, kvm=1
1818TRAMP_KVM(PACA_EXGEN, 0xf40) 1711EXC_REAL_END(vsx_unavailable, 0xf40, 0x20)
1712EXC_VIRT_BEGIN(vsx_unavailable, 0x4f40, 0x20)
1713 INT_HANDLER vsx_unavailable, 0xf40, ool=1, virt=1
1714EXC_VIRT_END(vsx_unavailable, 0x4f40, 0x20)
1715INT_KVM_HANDLER vsx_unavailable, 0xf40, EXC_STD, PACA_EXGEN, 0
1819EXC_COMMON_BEGIN(vsx_unavailable_common) 1716EXC_COMMON_BEGIN(vsx_unavailable_common)
1820 EXCEPTION_COMMON(PACA_EXGEN, 0xf40) 1717 INT_COMMON 0xf40, PACA_EXGEN, 1, 1, 0, 0, 0
1821#ifdef CONFIG_VSX 1718#ifdef CONFIG_VSX
1822BEGIN_FTR_SECTION 1719BEGIN_FTR_SECTION
1823 beq 1f 1720 beq 1f
@@ -1849,15 +1746,23 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
1849 b ret_from_except 1746 b ret_from_except
1850 1747
1851 1748
1852EXC_REAL_OOL(facility_unavailable, 0xf60, 0x20) 1749EXC_REAL_BEGIN(facility_unavailable, 0xf60, 0x20)
1853EXC_VIRT_OOL(facility_unavailable, 0x4f60, 0x20, 0xf60) 1750 INT_HANDLER facility_unavailable, 0xf60, ool=1, kvm=1
1854TRAMP_KVM(PACA_EXGEN, 0xf60) 1751EXC_REAL_END(facility_unavailable, 0xf60, 0x20)
1752EXC_VIRT_BEGIN(facility_unavailable, 0x4f60, 0x20)
1753 INT_HANDLER facility_unavailable, 0xf60, ool=1, virt=1
1754EXC_VIRT_END(facility_unavailable, 0x4f60, 0x20)
1755INT_KVM_HANDLER facility_unavailable, 0xf60, EXC_STD, PACA_EXGEN, 0
1855EXC_COMMON(facility_unavailable_common, 0xf60, facility_unavailable_exception) 1756EXC_COMMON(facility_unavailable_common, 0xf60, facility_unavailable_exception)
1856 1757
1857 1758
1858EXC_REAL_OOL_HV(h_facility_unavailable, 0xf80, 0x20) 1759EXC_REAL_BEGIN(h_facility_unavailable, 0xf80, 0x20)
1859EXC_VIRT_OOL_HV(h_facility_unavailable, 0x4f80, 0x20, 0xf80) 1760 INT_HANDLER h_facility_unavailable, 0xf80, ool=1, hsrr=EXC_HV, kvm=1
1860TRAMP_KVM_HV(PACA_EXGEN, 0xf80) 1761EXC_REAL_END(h_facility_unavailable, 0xf80, 0x20)
1762EXC_VIRT_BEGIN(h_facility_unavailable, 0x4f80, 0x20)
1763 INT_HANDLER h_facility_unavailable, 0xf80, ool=1, virt=1, hsrr=EXC_HV, kvm=1
1764EXC_VIRT_END(h_facility_unavailable, 0x4f80, 0x20)
1765INT_KVM_HANDLER h_facility_unavailable, 0xf80, EXC_HV, PACA_EXGEN, 0
1861EXC_COMMON(h_facility_unavailable_common, 0xf80, facility_unavailable_exception) 1766EXC_COMMON(h_facility_unavailable_common, 0xf80, facility_unavailable_exception)
1862 1767
1863 1768
@@ -1874,9 +1779,11 @@ EXC_REAL_NONE(0x1100, 0x100)
1874EXC_VIRT_NONE(0x5100, 0x100) 1779EXC_VIRT_NONE(0x5100, 0x100)
1875 1780
1876#ifdef CONFIG_CBE_RAS 1781#ifdef CONFIG_CBE_RAS
1877EXC_REAL_HV(cbe_system_error, 0x1200, 0x100) 1782EXC_REAL_BEGIN(cbe_system_error, 0x1200, 0x100)
1783 INT_HANDLER cbe_system_error, 0x1200, ool=1, hsrr=EXC_HV, kvm=1
1784EXC_REAL_END(cbe_system_error, 0x1200, 0x100)
1878EXC_VIRT_NONE(0x5200, 0x100) 1785EXC_VIRT_NONE(0x5200, 0x100)
1879TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1200) 1786INT_KVM_HANDLER cbe_system_error, 0x1200, EXC_HV, PACA_EXGEN, 1
1880EXC_COMMON(cbe_system_error_common, 0x1200, cbe_system_error_exception) 1787EXC_COMMON(cbe_system_error_common, 0x1200, cbe_system_error_exception)
1881#else /* CONFIG_CBE_RAS */ 1788#else /* CONFIG_CBE_RAS */
1882EXC_REAL_NONE(0x1200, 0x100) 1789EXC_REAL_NONE(0x1200, 0x100)
@@ -1884,37 +1791,43 @@ EXC_VIRT_NONE(0x5200, 0x100)
1884#endif 1791#endif
1885 1792
1886 1793
1887EXC_REAL(instruction_breakpoint, 0x1300, 0x100) 1794EXC_REAL_BEGIN(instruction_breakpoint, 0x1300, 0x100)
1888EXC_VIRT(instruction_breakpoint, 0x5300, 0x100, 0x1300) 1795 INT_HANDLER instruction_breakpoint, 0x1300, kvm=1
1889TRAMP_KVM_SKIP(PACA_EXGEN, 0x1300) 1796EXC_REAL_END(instruction_breakpoint, 0x1300, 0x100)
1797EXC_VIRT_BEGIN(instruction_breakpoint, 0x5300, 0x100)
1798 INT_HANDLER instruction_breakpoint, 0x1300, virt=1
1799EXC_VIRT_END(instruction_breakpoint, 0x5300, 0x100)
1800INT_KVM_HANDLER instruction_breakpoint, 0x1300, EXC_STD, PACA_EXGEN, 1
1890EXC_COMMON(instruction_breakpoint_common, 0x1300, instruction_breakpoint_exception) 1801EXC_COMMON(instruction_breakpoint_common, 0x1300, instruction_breakpoint_exception)
1891 1802
1803
1892EXC_REAL_NONE(0x1400, 0x100) 1804EXC_REAL_NONE(0x1400, 0x100)
1893EXC_VIRT_NONE(0x5400, 0x100) 1805EXC_VIRT_NONE(0x5400, 0x100)
1894 1806
1895EXC_REAL_BEGIN(denorm_exception_hv, 0x1500, 0x100) 1807EXC_REAL_BEGIN(denorm_exception_hv, 0x1500, 0x100)
1896 EXCEPTION_PROLOG_0 PACA_EXGEN 1808 INT_HANDLER denorm_exception_hv, 0x1500, early=2, hsrr=EXC_HV
1897 EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 0, 0x1500, 0, 0, 0
1898
1899#ifdef CONFIG_PPC_DENORMALISATION 1809#ifdef CONFIG_PPC_DENORMALISATION
1900 mfspr r10,SPRN_HSRR1 1810 mfspr r10,SPRN_HSRR1
1901 andis. r10,r10,(HSRR1_DENORM)@h /* denorm? */ 1811 andis. r10,r10,(HSRR1_DENORM)@h /* denorm? */
1902 bne+ denorm_assist 1812 bne+ denorm_assist
1903#endif 1813#endif
1904 1814 KVMTEST denorm_exception_hv, EXC_HV 0x1500
1905 KVMTEST EXC_HV 0x1500 1815 INT_SAVE_SRR_AND_JUMP denorm_common, EXC_HV, 1
1906 EXCEPTION_PROLOG_2_REAL denorm_common, EXC_HV, 1
1907EXC_REAL_END(denorm_exception_hv, 0x1500, 0x100) 1816EXC_REAL_END(denorm_exception_hv, 0x1500, 0x100)
1908 1817
1909#ifdef CONFIG_PPC_DENORMALISATION 1818#ifdef CONFIG_PPC_DENORMALISATION
1910EXC_VIRT_BEGIN(denorm_exception, 0x5500, 0x100) 1819EXC_VIRT_BEGIN(denorm_exception, 0x5500, 0x100)
1911 b exc_real_0x1500_denorm_exception_hv 1820 INT_HANDLER denorm_exception, 0x1500, 0, 2, 1, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 0
1821 mfspr r10,SPRN_HSRR1
1822 andis. r10,r10,(HSRR1_DENORM)@h /* denorm? */
1823 bne+ denorm_assist
1824 INT_VIRT_SAVE_SRR_AND_JUMP denorm_common, EXC_HV
1912EXC_VIRT_END(denorm_exception, 0x5500, 0x100) 1825EXC_VIRT_END(denorm_exception, 0x5500, 0x100)
1913#else 1826#else
1914EXC_VIRT_NONE(0x5500, 0x100) 1827EXC_VIRT_NONE(0x5500, 0x100)
1915#endif 1828#endif
1916 1829
1917TRAMP_KVM_HV(PACA_EXGEN, 0x1500) 1830INT_KVM_HANDLER denorm_exception_hv, 0x1500, EXC_HV, PACA_EXGEN, 0
1918 1831
1919#ifdef CONFIG_PPC_DENORMALISATION 1832#ifdef CONFIG_PPC_DENORMALISATION
1920TRAMP_REAL_BEGIN(denorm_assist) 1833TRAMP_REAL_BEGIN(denorm_assist)
@@ -1989,9 +1902,11 @@ EXC_COMMON(denorm_common, 0x1500, unknown_exception)
1989 1902
1990 1903
1991#ifdef CONFIG_CBE_RAS 1904#ifdef CONFIG_CBE_RAS
1992EXC_REAL_HV(cbe_maintenance, 0x1600, 0x100) 1905EXC_REAL_BEGIN(cbe_maintenance, 0x1600, 0x100)
1906 INT_HANDLER cbe_maintenance, 0x1600, ool=1, hsrr=EXC_HV, kvm=1
1907EXC_REAL_END(cbe_maintenance, 0x1600, 0x100)
1993EXC_VIRT_NONE(0x5600, 0x100) 1908EXC_VIRT_NONE(0x5600, 0x100)
1994TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1600) 1909INT_KVM_HANDLER cbe_maintenance, 0x1600, EXC_HV, PACA_EXGEN, 1
1995EXC_COMMON(cbe_maintenance_common, 0x1600, cbe_maintenance_exception) 1910EXC_COMMON(cbe_maintenance_common, 0x1600, cbe_maintenance_exception)
1996#else /* CONFIG_CBE_RAS */ 1911#else /* CONFIG_CBE_RAS */
1997EXC_REAL_NONE(0x1600, 0x100) 1912EXC_REAL_NONE(0x1600, 0x100)
@@ -1999,9 +1914,13 @@ EXC_VIRT_NONE(0x5600, 0x100)
1999#endif 1914#endif
2000 1915
2001 1916
2002EXC_REAL(altivec_assist, 0x1700, 0x100) 1917EXC_REAL_BEGIN(altivec_assist, 0x1700, 0x100)
2003EXC_VIRT(altivec_assist, 0x5700, 0x100, 0x1700) 1918 INT_HANDLER altivec_assist, 0x1700, kvm=1
2004TRAMP_KVM(PACA_EXGEN, 0x1700) 1919EXC_REAL_END(altivec_assist, 0x1700, 0x100)
1920EXC_VIRT_BEGIN(altivec_assist, 0x5700, 0x100)
1921 INT_HANDLER altivec_assist, 0x1700, virt=1
1922EXC_VIRT_END(altivec_assist, 0x5700, 0x100)
1923INT_KVM_HANDLER altivec_assist, 0x1700, EXC_STD, PACA_EXGEN, 0
2005#ifdef CONFIG_ALTIVEC 1924#ifdef CONFIG_ALTIVEC
2006EXC_COMMON(altivec_assist_common, 0x1700, altivec_assist_exception) 1925EXC_COMMON(altivec_assist_common, 0x1700, altivec_assist_exception)
2007#else 1926#else
@@ -2010,15 +1929,18 @@ EXC_COMMON(altivec_assist_common, 0x1700, unknown_exception)
2010 1929
2011 1930
2012#ifdef CONFIG_CBE_RAS 1931#ifdef CONFIG_CBE_RAS
2013EXC_REAL_HV(cbe_thermal, 0x1800, 0x100) 1932EXC_REAL_BEGIN(cbe_thermal, 0x1800, 0x100)
1933 INT_HANDLER cbe_thermal, 0x1800, ool=1, hsrr=EXC_HV, kvm=1
1934EXC_REAL_END(cbe_thermal, 0x1800, 0x100)
2014EXC_VIRT_NONE(0x5800, 0x100) 1935EXC_VIRT_NONE(0x5800, 0x100)
2015TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1800) 1936INT_KVM_HANDLER cbe_thermal, 0x1800, EXC_HV, PACA_EXGEN, 1
2016EXC_COMMON(cbe_thermal_common, 0x1800, cbe_thermal_exception) 1937EXC_COMMON(cbe_thermal_common, 0x1800, cbe_thermal_exception)
2017#else /* CONFIG_CBE_RAS */ 1938#else /* CONFIG_CBE_RAS */
2018EXC_REAL_NONE(0x1800, 0x100) 1939EXC_REAL_NONE(0x1800, 0x100)
2019EXC_VIRT_NONE(0x5800, 0x100) 1940EXC_VIRT_NONE(0x5800, 0x100)
2020#endif 1941#endif
2021 1942
1943
2022#ifdef CONFIG_PPC_WATCHDOG 1944#ifdef CONFIG_PPC_WATCHDOG
2023 1945
2024#define MASKED_DEC_HANDLER_LABEL 3f 1946#define MASKED_DEC_HANDLER_LABEL 3f
@@ -2028,7 +1950,7 @@ EXC_VIRT_NONE(0x5800, 0x100)
2028 std r12,PACA_EXGEN+EX_R12(r13); \ 1950 std r12,PACA_EXGEN+EX_R12(r13); \
2029 GET_SCRATCH0(r10); \ 1951 GET_SCRATCH0(r10); \
2030 std r10,PACA_EXGEN+EX_R13(r13); \ 1952 std r10,PACA_EXGEN+EX_R13(r13); \
2031 EXCEPTION_PROLOG_2_REAL soft_nmi_common, _H, 1 1953 INT_SAVE_SRR_AND_JUMP soft_nmi_common, _H, 1
2032 1954
2033/* 1955/*
2034 * Branch to soft_nmi_interrupt using the emergency stack. The emergency 1956 * Branch to soft_nmi_interrupt using the emergency stack. The emergency
@@ -2043,9 +1965,8 @@ EXC_COMMON_BEGIN(soft_nmi_common)
2043 mr r10,r1 1965 mr r10,r1
2044 ld r1,PACAEMERGSP(r13) 1966 ld r1,PACAEMERGSP(r13)
2045 subi r1,r1,INT_FRAME_SIZE 1967 subi r1,r1,INT_FRAME_SIZE
2046 EXCEPTION_COMMON_STACK(PACA_EXGEN, 0x900) 1968 INT_COMMON 0x900, PACA_EXGEN, 0, 1, 1, 0, 0
2047 bl save_nvgprs 1969 bl save_nvgprs
2048 RECONCILE_IRQ_STATE(r10, r11)
2049 addi r3,r1,STACK_FRAME_OVERHEAD 1970 addi r3,r1,STACK_FRAME_OVERHEAD
2050 bl soft_nmi_interrupt 1971 bl soft_nmi_interrupt
2051 b ret_from_except 1972 b ret_from_except
@@ -2302,6 +2223,35 @@ CLOSE_FIXED_SECTION(virt_trampolines);
2302 2223
2303USE_TEXT_SECTION() 2224USE_TEXT_SECTION()
2304 2225
2226/* MSR[RI] should be clear because this uses SRR[01] */
2227enable_machine_check:
2228 mflr r0
2229 bcl 20,31,$+4
22300: mflr r3
2231 addi r3,r3,(1f - 0b)
2232 mtspr SPRN_SRR0,r3
2233 mfmsr r3
2234 ori r3,r3,MSR_ME
2235 mtspr SPRN_SRR1,r3
2236 RFI_TO_KERNEL
22371: mtlr r0
2238 blr
2239
2240/* MSR[RI] should be clear because this uses SRR[01] */
2241disable_machine_check:
2242 mflr r0
2243 bcl 20,31,$+4
22440: mflr r3
2245 addi r3,r3,(1f - 0b)
2246 mtspr SPRN_SRR0,r3
2247 mfmsr r3
2248 li r4,MSR_ME
2249 andc r3,r3,r4
2250 mtspr SPRN_SRR1,r3
2251 RFI_TO_KERNEL
22521: mtlr r0
2253 blr
2254
2305/* 2255/*
2306 * Hash table stuff 2256 * Hash table stuff
2307 */ 2257 */
@@ -2310,7 +2260,7 @@ do_hash_page:
2310#ifdef CONFIG_PPC_BOOK3S_64 2260#ifdef CONFIG_PPC_BOOK3S_64
2311 lis r0,(DSISR_BAD_FAULT_64S | DSISR_DABRMATCH | DSISR_KEYFAULT)@h 2261 lis r0,(DSISR_BAD_FAULT_64S | DSISR_DABRMATCH | DSISR_KEYFAULT)@h
2312 ori r0,r0,DSISR_BAD_FAULT_64S@l 2262 ori r0,r0,DSISR_BAD_FAULT_64S@l
2313 and. r0,r4,r0 /* weird error? */ 2263 and. r0,r5,r0 /* weird error? */
2314 bne- handle_page_fault /* if not, try to insert a HPTE */ 2264 bne- handle_page_fault /* if not, try to insert a HPTE */
2315 ld r11, PACA_THREAD_INFO(r13) 2265 ld r11, PACA_THREAD_INFO(r13)
2316 lwz r0,TI_PREEMPT(r11) /* If we're in an "NMI" */ 2266 lwz r0,TI_PREEMPT(r11) /* If we're in an "NMI" */
@@ -2318,15 +2268,13 @@ do_hash_page:
2318 bne 77f /* then don't call hash_page now */ 2268 bne 77f /* then don't call hash_page now */
2319 2269
2320 /* 2270 /*
2321 * r3 contains the faulting address 2271 * r3 contains the trap number
2322 * r4 msr 2272 * r4 contains the faulting address
2323 * r5 contains the trap number 2273 * r5 contains dsisr
2324 * r6 contains dsisr 2274 * r6 msr
2325 * 2275 *
2326 * at return r3 = 0 for success, 1 for page fault, negative for error 2276 * at return r3 = 0 for success, 1 for page fault, negative for error
2327 */ 2277 */
2328 mr r4,r12
2329 ld r6,_DSISR(r1)
2330 bl __hash_page /* build HPTE if possible */ 2278 bl __hash_page /* build HPTE if possible */
2331 cmpdi r3,0 /* see if __hash_page succeeded */ 2279 cmpdi r3,0 /* see if __hash_page succeeded */
2332 2280
@@ -2336,16 +2284,15 @@ do_hash_page:
2336 /* Error */ 2284 /* Error */
2337 blt- 13f 2285 blt- 13f
2338 2286
2339 /* Reload DSISR into r4 for the DABR check below */ 2287 /* Reload DAR/DSISR into r4/r5 for the DABR check below */
2340 ld r4,_DSISR(r1) 2288 ld r4,_DAR(r1)
2289 ld r5,_DSISR(r1)
2341#endif /* CONFIG_PPC_BOOK3S_64 */ 2290#endif /* CONFIG_PPC_BOOK3S_64 */
2342 2291
2343/* Here we have a page fault that hash_page can't handle. */ 2292/* Here we have a page fault that hash_page can't handle. */
2344handle_page_fault: 2293handle_page_fault:
234511: andis. r0,r4,DSISR_DABRMATCH@h 229411: andis. r0,r5,DSISR_DABRMATCH@h
2346 bne- handle_dabr_fault 2295 bne- handle_dabr_fault
2347 ld r4,_DAR(r1)
2348 ld r5,_DSISR(r1)
2349 addi r3,r1,STACK_FRAME_OVERHEAD 2296 addi r3,r1,STACK_FRAME_OVERHEAD
2350 bl do_page_fault 2297 bl do_page_fault
2351 cmpdi r3,0 2298 cmpdi r3,0
@@ -2353,7 +2300,7 @@ handle_page_fault:
2353 bl save_nvgprs 2300 bl save_nvgprs
2354 mr r5,r3 2301 mr r5,r3
2355 addi r3,r1,STACK_FRAME_OVERHEAD 2302 addi r3,r1,STACK_FRAME_OVERHEAD
2356 lwz r4,_DAR(r1) 2303 ld r4,_DAR(r1)
2357 bl bad_page_fault 2304 bl bad_page_fault
2358 b ret_from_except 2305 b ret_from_except
2359 2306
@@ -2392,7 +2339,6 @@ handle_dabr_fault:
2392 * the access, or panic if there isn't a handler. 2339 * the access, or panic if there isn't a handler.
2393 */ 2340 */
239477: bl save_nvgprs 234177: bl save_nvgprs
2395 mr r4,r3
2396 addi r3,r1,STACK_FRAME_OVERHEAD 2342 addi r3,r1,STACK_FRAME_OVERHEAD
2397 li r5,SIGSEGV 2343 li r5,SIGSEGV
2398 bl bad_page_fault 2344 bl bad_page_fault
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 4eab97292cc2..ed59855430b9 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -28,24 +28,22 @@
28#include <asm/debugfs.h> 28#include <asm/debugfs.h>
29#include <asm/page.h> 29#include <asm/page.h>
30#include <asm/prom.h> 30#include <asm/prom.h>
31#include <asm/rtas.h>
32#include <asm/fadump.h> 31#include <asm/fadump.h>
32#include <asm/fadump-internal.h>
33#include <asm/setup.h> 33#include <asm/setup.h>
34 34
35static struct fw_dump fw_dump; 35static struct fw_dump fw_dump;
36static struct fadump_mem_struct fdm;
37static const struct fadump_mem_struct *fdm_active;
38#ifdef CONFIG_CMA
39static struct cma *fadump_cma;
40#endif
41 36
37static void __init fadump_reserve_crash_area(u64 base);
38
39#ifndef CONFIG_PRESERVE_FA_DUMP
42static DEFINE_MUTEX(fadump_mutex); 40static DEFINE_MUTEX(fadump_mutex);
43struct fad_crash_memory_ranges *crash_memory_ranges; 41struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0 };
44int crash_memory_ranges_size; 42struct fadump_mrange_info reserved_mrange_info = { "reserved", NULL, 0, 0, 0 };
45int crash_mem_ranges;
46int max_crash_mem_ranges;
47 43
48#ifdef CONFIG_CMA 44#ifdef CONFIG_CMA
45static struct cma *fadump_cma;
46
49/* 47/*
50 * fadump_cma_init() - Initialize CMA area from a fadump reserved memory 48 * fadump_cma_init() - Initialize CMA area from a fadump reserved memory
51 * 49 *
@@ -107,84 +105,45 @@ static int __init fadump_cma_init(void) { return 1; }
107#endif /* CONFIG_CMA */ 105#endif /* CONFIG_CMA */
108 106
109/* Scan the Firmware Assisted dump configuration details. */ 107/* Scan the Firmware Assisted dump configuration details. */
110int __init early_init_dt_scan_fw_dump(unsigned long node, 108int __init early_init_dt_scan_fw_dump(unsigned long node, const char *uname,
111 const char *uname, int depth, void *data) 109 int depth, void *data)
112{ 110{
113 const __be32 *sections; 111 if (depth != 1)
114 int i, num_sections;
115 int size;
116 const __be32 *token;
117
118 if (depth != 1 || strcmp(uname, "rtas") != 0)
119 return 0; 112 return 0;
120 113
121 /* 114 if (strcmp(uname, "rtas") == 0) {
122 * Check if Firmware Assisted dump is supported. if yes, check 115 rtas_fadump_dt_scan(&fw_dump, node);
123 * if dump has been initiated on last reboot.
124 */
125 token = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL);
126 if (!token)
127 return 1; 116 return 1;
117 }
128 118
129 fw_dump.fadump_supported = 1; 119 if (strcmp(uname, "ibm,opal") == 0) {
130 fw_dump.ibm_configure_kernel_dump = be32_to_cpu(*token); 120 opal_fadump_dt_scan(&fw_dump, node);
131
132 /*
133 * The 'ibm,kernel-dump' rtas node is present only if there is
134 * dump data waiting for us.
135 */
136 fdm_active = of_get_flat_dt_prop(node, "ibm,kernel-dump", NULL);
137 if (fdm_active)
138 fw_dump.dump_active = 1;
139
140 /* Get the sizes required to store dump data for the firmware provided
141 * dump sections.
142 * For each dump section type supported, a 32bit cell which defines
143 * the ID of a supported section followed by two 32 bit cells which
 144 * gives the size of the section in bytes.
145 */
146 sections = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump-sizes",
147 &size);
148
149 if (!sections)
150 return 1; 121 return 1;
151
152 num_sections = size / (3 * sizeof(u32));
153
154 for (i = 0; i < num_sections; i++, sections += 3) {
155 u32 type = (u32)of_read_number(sections, 1);
156
157 switch (type) {
158 case FADUMP_CPU_STATE_DATA:
159 fw_dump.cpu_state_data_size =
160 of_read_ulong(&sections[1], 2);
161 break;
162 case FADUMP_HPTE_REGION:
163 fw_dump.hpte_region_size =
164 of_read_ulong(&sections[1], 2);
165 break;
166 }
167 } 122 }
168 123
169 return 1; 124 return 0;
170} 125}
171 126
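The rewritten scan routine above no longer parses the RTAS properties itself; it hands the flattened device-tree node to a platform backend (rtas_fadump_dt_scan() for the "rtas" node, opal_fadump_dt_scan() for "ibm,opal"), and the rest of this file then works through an operations table hung off fw_dump (fadump_get_bootmem_min(), fadump_setup_metadata(), fadump_trigger(), seen further down). The standalone C below is only an illustrative sketch of that dispatch pattern; the real operations structure lives in fadump-internal.h and has more members, and the field names and the 256 MB value here are placeholders taken from calls visible in this diff, not kernel definitions.

#include <stdio.h>

struct fw_dump_sketch;

/* Illustrative subset of a per-platform fadump operations table. */
struct fadump_ops_sketch {
    unsigned long (*fadump_get_bootmem_min)(void);
    int (*fadump_setup_metadata)(struct fw_dump_sketch *fadump_conf);
};

struct fw_dump_sketch {
    const struct fadump_ops_sketch *ops;
};

/* Stand-ins for what a platform backend (e.g. RTAS) might provide. */
static unsigned long rtas_get_bootmem_min(void) { return 256UL << 20; }
static int rtas_setup_metadata(struct fw_dump_sketch *f) { (void)f; return 0; }

static const struct fadump_ops_sketch rtas_fadump_ops_sketch = {
    .fadump_get_bootmem_min = rtas_get_bootmem_min,
    .fadump_setup_metadata  = rtas_setup_metadata,
};

int main(void)
{
    struct fw_dump_sketch fw_dump = { .ops = &rtas_fadump_ops_sketch };

    /* Generic code only ever goes through fw_dump.ops, as fadump.c now does. */
    printf("bootmem min: %lu MB\n",
           fw_dump.ops->fadump_get_bootmem_min() >> 20);
    return 0;
}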
172/* 127/*
173 * If fadump is registered, check if the memory provided 128 * If fadump is registered, check if the memory provided
174 * falls within boot memory area and reserved memory area. 129 * falls within boot memory area and reserved memory area.
175 */ 130 */
176int is_fadump_memory_area(u64 addr, ulong size) 131int is_fadump_memory_area(u64 addr, unsigned long size)
177{ 132{
178 u64 d_start = fw_dump.reserve_dump_area_start; 133 u64 d_start, d_end;
179 u64 d_end = d_start + fw_dump.reserve_dump_area_size;
180 134
181 if (!fw_dump.dump_registered) 135 if (!fw_dump.dump_registered)
182 return 0; 136 return 0;
183 137
138 if (!size)
139 return 0;
140
141 d_start = fw_dump.reserve_dump_area_start;
142 d_end = d_start + fw_dump.reserve_dump_area_size;
184 if (((addr + size) > d_start) && (addr <= d_end)) 143 if (((addr + size) > d_start) && (addr <= d_end))
185 return 1; 144 return 1;
186 145
187 return (addr + size) > RMA_START && addr <= fw_dump.boot_memory_size; 146 return (addr <= fw_dump.boot_mem_top);
188} 147}
189 148
190int should_fadump_crash(void) 149int should_fadump_crash(void)
@@ -200,31 +159,29 @@ int is_fadump_active(void)
200} 159}
201 160
202/* 161/*
203 * Returns 1, if there are no holes in boot memory area, 162 * Returns true, if there are no holes in memory area between d_start to d_end,
204 * 0 otherwise. 163 * false otherwise.
205 */ 164 */
206static int is_boot_memory_area_contiguous(void) 165static bool is_fadump_mem_area_contiguous(u64 d_start, u64 d_end)
207{ 166{
208 struct memblock_region *reg; 167 struct memblock_region *reg;
209 unsigned long tstart, tend; 168 bool ret = false;
210 unsigned long start_pfn = PHYS_PFN(RMA_START); 169 u64 start, end;
211 unsigned long end_pfn = PHYS_PFN(RMA_START + fw_dump.boot_memory_size);
212 unsigned int ret = 0;
213 170
214 for_each_memblock(memory, reg) { 171 for_each_memblock(memory, reg) {
215 tstart = max(start_pfn, memblock_region_memory_base_pfn(reg)); 172 start = max_t(u64, d_start, reg->base);
216 tend = min(end_pfn, memblock_region_memory_end_pfn(reg)); 173 end = min_t(u64, d_end, (reg->base + reg->size));
217 if (tstart < tend) { 174 if (d_start < end) {
218 /* Memory hole from start_pfn to tstart */ 175 /* Memory hole from d_start to start */
219 if (tstart > start_pfn) 176 if (start > d_start)
220 break; 177 break;
221 178
222 if (tend == end_pfn) { 179 if (end == d_end) {
223 ret = 1; 180 ret = true;
224 break; 181 break;
225 } 182 }
226 183
227 start_pfn = tend + 1; 184 d_start = end + 1;
228 } 185 }
229 } 186 }
230 187
@@ -232,37 +189,45 @@ static int is_boot_memory_area_contiguous(void)
232} 189}
233 190
234/* 191/*
235 * Returns true, if there are no holes in reserved memory area, 192 * Returns true, if there are no holes in boot memory area,
236 * false otherwise. 193 * false otherwise.
237 */ 194 */
238static bool is_reserved_memory_area_contiguous(void) 195bool is_fadump_boot_mem_contiguous(void)
239{ 196{
240 struct memblock_region *reg; 197 unsigned long d_start, d_end;
241 unsigned long start, end; 198 bool ret = false;
242 unsigned long d_start = fw_dump.reserve_dump_area_start; 199 int i;
243 unsigned long d_end = d_start + fw_dump.reserve_dump_area_size;
244
245 for_each_memblock(memory, reg) {
246 start = max(d_start, (unsigned long)reg->base);
247 end = min(d_end, (unsigned long)(reg->base + reg->size));
248 if (d_start < end) {
249 /* Memory hole from d_start to start */
250 if (start > d_start)
251 break;
252 200
253 if (end == d_end) 201 for (i = 0; i < fw_dump.boot_mem_regs_cnt; i++) {
254 return true; 202 d_start = fw_dump.boot_mem_addr[i];
203 d_end = d_start + fw_dump.boot_mem_sz[i];
255 204
256 d_start = end + 1; 205 ret = is_fadump_mem_area_contiguous(d_start, d_end);
257 } 206 if (!ret)
207 break;
258 } 208 }
259 209
260 return false; 210 return ret;
211}
212
213/*
214 * Returns true, if there are no holes in reserved memory area,
215 * false otherwise.
216 */
217bool is_fadump_reserved_mem_contiguous(void)
218{
219 u64 d_start, d_end;
220
221 d_start = fw_dump.reserve_dump_area_start;
222 d_end = d_start + fw_dump.reserve_dump_area_size;
223 return is_fadump_mem_area_contiguous(d_start, d_end);
261} 224}
262 225
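is_fadump_mem_area_contiguous() above walks the registered memory blocks and reports a hole as soon as a block starts past the point the walk has reached. The sketch below reproduces the same walk in standalone C over a made-up memory map with a deliberate hole; the block list and addresses are purely illustrative.

#include <stdio.h>
#include <stdbool.h>

struct block_sketch { unsigned long long base, size; };

/* Hypothetical system memory map with a hole at [0x2000, 0x3000). */
static const struct block_sketch blocks[] = {
    { 0x0000, 0x2000 },
    { 0x3000, 0x1000 },
};

static bool area_contiguous(unsigned long long d_start, unsigned long long d_end)
{
    bool ret = false;

    for (unsigned int i = 0; i < sizeof(blocks) / sizeof(blocks[0]); i++) {
        unsigned long long start = blocks[i].base;
        unsigned long long end = blocks[i].base + blocks[i].size;

        /* Clamp the block to the area of interest, as the kernel code does. */
        if (start < d_start)
            start = d_start;
        if (end > d_end)
            end = d_end;

        if (d_start < end) {
            if (start > d_start)        /* hole before this block */
                break;
            if (end == d_end) {         /* reached the end: no holes */
                ret = true;
                break;
            }
            d_start = end + 1;
        }
    }
    return ret;
}

int main(void)
{
    printf("[0x0,0x2000) contiguous: %d\n", area_contiguous(0x0, 0x2000)); /* 1 */
    printf("[0x0,0x4000) contiguous: %d\n", area_contiguous(0x0, 0x4000)); /* 0 */
    return 0;
}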
263/* Print firmware assisted dump configurations for debugging purpose. */ 226/* Print firmware assisted dump configurations for debugging purpose. */
264static void fadump_show_config(void) 227static void fadump_show_config(void)
265{ 228{
229 int i;
230
266 pr_debug("Support for firmware-assisted dump (fadump): %s\n", 231 pr_debug("Support for firmware-assisted dump (fadump): %s\n",
267 (fw_dump.fadump_supported ? "present" : "no support")); 232 (fw_dump.fadump_supported ? "present" : "no support"));
268 233
@@ -276,62 +241,13 @@ static void fadump_show_config(void)
276 pr_debug("Dump section sizes:\n"); 241 pr_debug("Dump section sizes:\n");
277 pr_debug(" CPU state data size: %lx\n", fw_dump.cpu_state_data_size); 242 pr_debug(" CPU state data size: %lx\n", fw_dump.cpu_state_data_size);
278 pr_debug(" HPTE region size : %lx\n", fw_dump.hpte_region_size); 243 pr_debug(" HPTE region size : %lx\n", fw_dump.hpte_region_size);
279 pr_debug("Boot memory size : %lx\n", fw_dump.boot_memory_size); 244 pr_debug(" Boot memory size : %lx\n", fw_dump.boot_memory_size);
280} 245 pr_debug(" Boot memory top : %llx\n", fw_dump.boot_mem_top);
281 246 pr_debug("Boot memory regions cnt: %llx\n", fw_dump.boot_mem_regs_cnt);
282static unsigned long init_fadump_mem_struct(struct fadump_mem_struct *fdm, 247 for (i = 0; i < fw_dump.boot_mem_regs_cnt; i++) {
283 unsigned long addr) 248 pr_debug("[%03d] base = %llx, size = %llx\n", i,
284{ 249 fw_dump.boot_mem_addr[i], fw_dump.boot_mem_sz[i]);
285 if (!fdm) 250 }
286 return 0;
287
288 memset(fdm, 0, sizeof(struct fadump_mem_struct));
289 addr = addr & PAGE_MASK;
290
291 fdm->header.dump_format_version = cpu_to_be32(0x00000001);
292 fdm->header.dump_num_sections = cpu_to_be16(3);
293 fdm->header.dump_status_flag = 0;
294 fdm->header.offset_first_dump_section =
295 cpu_to_be32((u32)offsetof(struct fadump_mem_struct, cpu_state_data));
296
297 /*
298 * Fields for disk dump option.
299 * We are not using disk dump option, hence set these fields to 0.
300 */
301 fdm->header.dd_block_size = 0;
302 fdm->header.dd_block_offset = 0;
303 fdm->header.dd_num_blocks = 0;
304 fdm->header.dd_offset_disk_path = 0;
305
306 /* set 0 to disable an automatic dump-reboot. */
307 fdm->header.max_time_auto = 0;
308
309 /* Kernel dump sections */
310 /* cpu state data section. */
311 fdm->cpu_state_data.request_flag = cpu_to_be32(FADUMP_REQUEST_FLAG);
312 fdm->cpu_state_data.source_data_type = cpu_to_be16(FADUMP_CPU_STATE_DATA);
313 fdm->cpu_state_data.source_address = 0;
314 fdm->cpu_state_data.source_len = cpu_to_be64(fw_dump.cpu_state_data_size);
315 fdm->cpu_state_data.destination_address = cpu_to_be64(addr);
316 addr += fw_dump.cpu_state_data_size;
317
318 /* hpte region section */
319 fdm->hpte_region.request_flag = cpu_to_be32(FADUMP_REQUEST_FLAG);
320 fdm->hpte_region.source_data_type = cpu_to_be16(FADUMP_HPTE_REGION);
321 fdm->hpte_region.source_address = 0;
322 fdm->hpte_region.source_len = cpu_to_be64(fw_dump.hpte_region_size);
323 fdm->hpte_region.destination_address = cpu_to_be64(addr);
324 addr += fw_dump.hpte_region_size;
325
326 /* RMA region section */
327 fdm->rmr_region.request_flag = cpu_to_be32(FADUMP_REQUEST_FLAG);
328 fdm->rmr_region.source_data_type = cpu_to_be16(FADUMP_REAL_MODE_REGION);
329 fdm->rmr_region.source_address = cpu_to_be64(RMA_START);
330 fdm->rmr_region.source_len = cpu_to_be64(fw_dump.boot_memory_size);
331 fdm->rmr_region.destination_address = cpu_to_be64(addr);
332 addr += fw_dump.boot_memory_size;
333
334 return addr;
335} 251}
336 252
337/** 253/**
@@ -349,10 +265,10 @@ static unsigned long init_fadump_mem_struct(struct fadump_mem_struct *fdm,
349 * that is required for a kernel to boot successfully. 265 * that is required for a kernel to boot successfully.
350 * 266 *
351 */ 267 */
352static inline unsigned long fadump_calculate_reserve_size(void) 268static inline u64 fadump_calculate_reserve_size(void)
353{ 269{
270 u64 base, size, bootmem_min;
354 int ret; 271 int ret;
355 unsigned long long base, size;
356 272
357 if (fw_dump.reserve_bootvar) 273 if (fw_dump.reserve_bootvar)
358 pr_warn("'fadump_reserve_mem=' parameter is deprecated in favor of 'crashkernel=' parameter.\n"); 274 pr_warn("'fadump_reserve_mem=' parameter is deprecated in favor of 'crashkernel=' parameter.\n");
@@ -402,7 +318,8 @@ static inline unsigned long fadump_calculate_reserve_size(void)
402 if (memory_limit && size > memory_limit) 318 if (memory_limit && size > memory_limit)
403 size = memory_limit; 319 size = memory_limit;
404 320
405 return (size > MIN_BOOT_MEM ? size : MIN_BOOT_MEM); 321 bootmem_min = fw_dump.ops->fadump_get_bootmem_min();
322 return (size > bootmem_min ? size : bootmem_min);
406} 323}
407 324
408/* 325/*
@@ -423,57 +340,136 @@ static unsigned long get_fadump_area_size(void)
423 size += sizeof(struct elf_phdr) * (memblock_num_regions(memory) + 2); 340 size += sizeof(struct elf_phdr) * (memblock_num_regions(memory) + 2);
424 341
425 size = PAGE_ALIGN(size); 342 size = PAGE_ALIGN(size);
343
344 /* This is to hold kernel metadata on platforms that support it */
345 size += (fw_dump.ops->fadump_get_metadata_size ?
346 fw_dump.ops->fadump_get_metadata_size() : 0);
426 return size; 347 return size;
427} 348}
428 349
429static void __init fadump_reserve_crash_area(unsigned long base, 350static int __init add_boot_mem_region(unsigned long rstart,
430 unsigned long size) 351 unsigned long rsize)
352{
353 int i = fw_dump.boot_mem_regs_cnt++;
354
355 if (fw_dump.boot_mem_regs_cnt > FADUMP_MAX_MEM_REGS) {
356 fw_dump.boot_mem_regs_cnt = FADUMP_MAX_MEM_REGS;
357 return 0;
358 }
359
360 pr_debug("Added boot memory range[%d] [%#016lx-%#016lx)\n",
361 i, rstart, (rstart + rsize));
362 fw_dump.boot_mem_addr[i] = rstart;
363 fw_dump.boot_mem_sz[i] = rsize;
364 return 1;
365}
366
367/*
368 * Firmware usually has a hard limit on the data it can copy per region.
369 * Honour that by splitting a memory range into multiple regions.
370 */
371static int __init add_boot_mem_regions(unsigned long mstart,
372 unsigned long msize)
431{ 373{
374 unsigned long rstart, rsize, max_size;
375 int ret = 1;
376
377 rstart = mstart;
378 max_size = fw_dump.max_copy_size ? fw_dump.max_copy_size : msize;
379 while (msize) {
380 if (msize > max_size)
381 rsize = max_size;
382 else
383 rsize = msize;
384
385 ret = add_boot_mem_region(rstart, rsize);
386 if (!ret)
387 break;
388
389 msize -= rsize;
390 rstart += rsize;
391 }
392
393 return ret;
394}
395
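As the comment above notes, firmware may cap how much data it copies per region, so add_boot_mem_regions() chops one memory range into several pieces no larger than max_copy_size. The following is a rough standalone illustration of that splitting loop; the 1 GB cap and the 2.5 GB range are arbitrary example values, not figures taken from any firmware.

#include <stdio.h>

/* Split [mstart, mstart + msize) into chunks of at most max_copy_size
 * bytes, mirroring the loop in the patch above. */
static void split_region(unsigned long long mstart, unsigned long long msize,
                         unsigned long long max_copy_size)
{
    unsigned long long rstart = mstart;

    while (msize) {
        unsigned long long rsize = (msize > max_copy_size) ?
                                   max_copy_size : msize;

        printf("region [%#llx-%#llx)\n", rstart, rstart + rsize);
        msize -= rsize;
        rstart += rsize;
    }
}

int main(void)
{
    /* A hypothetical 2.5 GB range with a 1 GB per-copy limit splits
     * into 1 GB + 1 GB + 0.5 GB regions. */
    split_region(0x0, 0xA0000000ULL, 0x40000000ULL);
    return 0;
}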
396static int __init fadump_get_boot_mem_regions(void)
397{
398 unsigned long base, size, cur_size, hole_size, last_end;
399 unsigned long mem_size = fw_dump.boot_memory_size;
432 struct memblock_region *reg; 400 struct memblock_region *reg;
433 unsigned long mstart, mend, msize; 401 int ret = 1;
402
403 fw_dump.boot_mem_regs_cnt = 0;
434 404
405 last_end = 0;
406 hole_size = 0;
407 cur_size = 0;
435 for_each_memblock(memory, reg) { 408 for_each_memblock(memory, reg) {
436 mstart = max_t(unsigned long, base, reg->base); 409 base = reg->base;
437 mend = reg->base + reg->size; 410 size = reg->size;
438 mend = min(base + size, mend); 411 hole_size += (base - last_end);
439 412
440 if (mstart < mend) { 413 if ((cur_size + size) >= mem_size) {
441 msize = mend - mstart; 414 size = (mem_size - cur_size);
442 memblock_reserve(mstart, msize); 415 ret = add_boot_mem_regions(base, size);
443 pr_info("Reserved %ldMB of memory at %#016lx for saving crash dump\n", 416 break;
444 (msize >> 20), mstart);
445 } 417 }
418
419 mem_size -= size;
420 cur_size += size;
421 ret = add_boot_mem_regions(base, size);
422 if (!ret)
423 break;
424
425 last_end = base + size;
446 } 426 }
427 fw_dump.boot_mem_top = PAGE_ALIGN(fw_dump.boot_memory_size + hole_size);
428
429 return ret;
447} 430}
448 431
449int __init fadump_reserve_mem(void) 432int __init fadump_reserve_mem(void)
450{ 433{
451 unsigned long base, size, memory_boundary; 434 u64 base, size, mem_boundary, bootmem_min, align = PAGE_SIZE;
435 bool is_memblock_bottom_up = memblock_bottom_up();
436 int ret = 1;
452 437
453 if (!fw_dump.fadump_enabled) 438 if (!fw_dump.fadump_enabled)
454 return 0; 439 return 0;
455 440
456 if (!fw_dump.fadump_supported) { 441 if (!fw_dump.fadump_supported) {
457 printk(KERN_INFO "Firmware-assisted dump is not supported on" 442 pr_info("Firmware-Assisted Dump is not supported on this hardware\n");
458 " this hardware\n"); 443 goto error_out;
459 fw_dump.fadump_enabled = 0;
460 return 0;
461 } 444 }
445
462 /* 446 /*
463 * Initialize boot memory size 447 * Initialize boot memory size
464 * If dump is active then we have already calculated the size during 448 * If dump is active then we have already calculated the size during
465 * first kernel. 449 * first kernel.
466 */ 450 */
467 if (fdm_active) 451 if (!fw_dump.dump_active) {
468 fw_dump.boot_memory_size = be64_to_cpu(fdm_active->rmr_region.source_len); 452 fw_dump.boot_memory_size =
469 else { 453 PAGE_ALIGN(fadump_calculate_reserve_size());
470 fw_dump.boot_memory_size = fadump_calculate_reserve_size();
471#ifdef CONFIG_CMA 454#ifdef CONFIG_CMA
472 if (!fw_dump.nocma) 455 if (!fw_dump.nocma) {
456 align = FADUMP_CMA_ALIGNMENT;
473 fw_dump.boot_memory_size = 457 fw_dump.boot_memory_size =
474 ALIGN(fw_dump.boot_memory_size, 458 ALIGN(fw_dump.boot_memory_size, align);
475 FADUMP_CMA_ALIGNMENT); 459 }
476#endif 460#endif
461
462 bootmem_min = fw_dump.ops->fadump_get_bootmem_min();
463 if (fw_dump.boot_memory_size < bootmem_min) {
464 pr_err("Can't enable fadump with boot memory size (0x%lx) less than 0x%llx\n",
465 fw_dump.boot_memory_size, bootmem_min);
466 goto error_out;
467 }
468
469 if (!fadump_get_boot_mem_regions()) {
470 pr_err("Too many holes in boot memory area to enable fadump\n");
471 goto error_out;
472 }
477 } 473 }
478 474
479 /* 475 /*
@@ -493,10 +489,13 @@ int __init fadump_reserve_mem(void)
493 " dump, now %#016llx\n", memory_limit); 489 " dump, now %#016llx\n", memory_limit);
494 } 490 }
495 if (memory_limit) 491 if (memory_limit)
496 memory_boundary = memory_limit; 492 mem_boundary = memory_limit;
497 else 493 else
498 memory_boundary = memblock_end_of_DRAM(); 494 mem_boundary = memblock_end_of_DRAM();
499 495
496 base = fw_dump.boot_mem_top;
497 size = get_fadump_area_size();
498 fw_dump.reserve_dump_area_size = size;
500 if (fw_dump.dump_active) { 499 if (fw_dump.dump_active) {
501 pr_info("Firmware-assisted dump is active.\n"); 500 pr_info("Firmware-assisted dump is active.\n");
502 501
@@ -510,58 +509,55 @@ int __init fadump_reserve_mem(void)
510#endif 509#endif
511 /* 510 /*
512 * If last boot has crashed then reserve all the memory 511 * If last boot has crashed then reserve all the memory
513 * above boot_memory_size so that we don't touch it until 512 * above boot memory size so that we don't touch it until
514 * dump is written to disk by userspace tool. This memory 513 * dump is written to disk by userspace tool. This memory
515 * will be released for general use once the dump is saved. 514 * can be released for general use by invalidating fadump.
516 */ 515 */
517 base = fw_dump.boot_memory_size; 516 fadump_reserve_crash_area(base);
518 size = memory_boundary - base;
519 fadump_reserve_crash_area(base, size);
520
521 fw_dump.fadumphdr_addr =
522 be64_to_cpu(fdm_active->rmr_region.destination_address) +
523 be64_to_cpu(fdm_active->rmr_region.source_len);
524 pr_debug("fadumphdr_addr = %pa\n", &fw_dump.fadumphdr_addr);
525 fw_dump.reserve_dump_area_start = base;
526 fw_dump.reserve_dump_area_size = size;
527 } else {
528 size = get_fadump_area_size();
529 517
518 pr_debug("fadumphdr_addr = %#016lx\n", fw_dump.fadumphdr_addr);
519 pr_debug("Reserve dump area start address: 0x%lx\n",
520 fw_dump.reserve_dump_area_start);
521 } else {
530 /* 522 /*
531 * Reserve memory at an offset closer to bottom of the RAM to 523 * Reserve memory at an offset closer to bottom of the RAM to
532 * minimize the impact of memory hot-remove operation. We can't 524 * minimize the impact of memory hot-remove operation.
533 * use memblock_find_in_range() here since it doesn't allocate
534 * from bottom to top.
535 */ 525 */
536 for (base = fw_dump.boot_memory_size; 526 memblock_set_bottom_up(true);
537 base <= (memory_boundary - size); 527 base = memblock_find_in_range(base, mem_boundary, size, align);
538 base += size) { 528
539 if (memblock_is_region_memory(base, size) && 529 /* Restore the previous allocation mode */
540 !memblock_is_region_reserved(base, size)) 530 memblock_set_bottom_up(is_memblock_bottom_up);
541 break; 531
532 if (!base) {
533 pr_err("Failed to find memory chunk for reservation!\n");
534 goto error_out;
542 } 535 }
543 if ((base > (memory_boundary - size)) || 536 fw_dump.reserve_dump_area_start = base;
544 memblock_reserve(base, size)) { 537
545 pr_err("Failed to reserve memory\n"); 538 /*
546 return 0; 539 * Calculate the kernel metadata address and register it with
540 * f/w if the platform supports.
541 */
542 if (fw_dump.ops->fadump_setup_metadata &&
543 (fw_dump.ops->fadump_setup_metadata(&fw_dump) < 0))
544 goto error_out;
545
546 if (memblock_reserve(base, size)) {
547 pr_err("Failed to reserve memory!\n");
548 goto error_out;
547 } 549 }
548 550
549 pr_info("Reserved %ldMB of memory at %ldMB for firmware-" 551 pr_info("Reserved %lldMB of memory at %#016llx (System RAM: %lldMB)\n",
550 "assisted dump (System RAM: %ldMB)\n", 552 (size >> 20), base, (memblock_phys_mem_size() >> 20));
551 (unsigned long)(size >> 20),
552 (unsigned long)(base >> 20),
553 (unsigned long)(memblock_phys_mem_size() >> 20));
554 553
555 fw_dump.reserve_dump_area_start = base; 554 ret = fadump_cma_init();
556 fw_dump.reserve_dump_area_size = size;
557 return fadump_cma_init();
558 } 555 }
559 return 1;
560}
561 556
562unsigned long __init arch_reserved_kernel_pages(void) 557 return ret;
563{ 558error_out:
564 return memblock_reserved_size() / PAGE_SIZE; 559 fw_dump.fadump_enabled = 0;
560 return 0;
565} 561}
566 562
567/* Look for fadump= cmdline option. */ 563/* Look for fadump= cmdline option. */
@@ -596,61 +592,6 @@ static int __init early_fadump_reserve_mem(char *p)
596} 592}
597early_param("fadump_reserve_mem", early_fadump_reserve_mem); 593early_param("fadump_reserve_mem", early_fadump_reserve_mem);
598 594
599static int register_fw_dump(struct fadump_mem_struct *fdm)
600{
601 int rc, err;
602 unsigned int wait_time;
603
604 pr_debug("Registering for firmware-assisted kernel dump...\n");
605
606 /* TODO: Add upper time limit for the delay */
607 do {
608 rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL,
609 FADUMP_REGISTER, fdm,
610 sizeof(struct fadump_mem_struct));
611
612 wait_time = rtas_busy_delay_time(rc);
613 if (wait_time)
614 mdelay(wait_time);
615
616 } while (wait_time);
617
618 err = -EIO;
619 switch (rc) {
620 default:
621 pr_err("Failed to register. Unknown Error(%d).\n", rc);
622 break;
623 case -1:
624 printk(KERN_ERR "Failed to register firmware-assisted kernel"
625 " dump. Hardware Error(%d).\n", rc);
626 break;
627 case -3:
628 if (!is_boot_memory_area_contiguous())
629 pr_err("Can't have holes in boot memory area while registering fadump\n");
630 else if (!is_reserved_memory_area_contiguous())
631 pr_err("Can't have holes in reserved memory area while"
632 " registering fadump\n");
633
634 printk(KERN_ERR "Failed to register firmware-assisted kernel"
635 " dump. Parameter Error(%d).\n", rc);
636 err = -EINVAL;
637 break;
638 case -9:
639 printk(KERN_ERR "firmware-assisted kernel dump is already "
640 " registered.");
641 fw_dump.dump_registered = 1;
642 err = -EEXIST;
643 break;
644 case 0:
645 printk(KERN_INFO "firmware-assisted kernel dump registration"
646 " is successful\n");
647 fw_dump.dump_registered = 1;
648 err = 0;
649 break;
650 }
651 return err;
652}
653
654void crash_fadump(struct pt_regs *regs, const char *str) 595void crash_fadump(struct pt_regs *regs, const char *str)
655{ 596{
656 struct fadump_crash_info_header *fdh = NULL; 597 struct fadump_crash_info_header *fdh = NULL;
@@ -693,71 +634,10 @@ void crash_fadump(struct pt_regs *regs, const char *str)
693 634
694 fdh->online_mask = *cpu_online_mask; 635 fdh->online_mask = *cpu_online_mask;
695 636
696 /* Call ibm,os-term rtas call to trigger firmware assisted dump */ 637 fw_dump.ops->fadump_trigger(fdh, str);
697 rtas_os_term((char *)str);
698}
699
700#define GPR_MASK 0xffffff0000000000
701static inline int fadump_gpr_index(u64 id)
702{
703 int i = -1;
704 char str[3];
705
706 if ((id & GPR_MASK) == REG_ID("GPR")) {
707 /* get the digits at the end */
708 id &= ~GPR_MASK;
709 id >>= 24;
710 str[2] = '\0';
711 str[1] = id & 0xff;
712 str[0] = (id >> 8) & 0xff;
713 sscanf(str, "%d", &i);
714 if (i > 31)
715 i = -1;
716 }
717 return i;
718}
719
720static inline void fadump_set_regval(struct pt_regs *regs, u64 reg_id,
721 u64 reg_val)
722{
723 int i;
724
725 i = fadump_gpr_index(reg_id);
726 if (i >= 0)
727 regs->gpr[i] = (unsigned long)reg_val;
728 else if (reg_id == REG_ID("NIA"))
729 regs->nip = (unsigned long)reg_val;
730 else if (reg_id == REG_ID("MSR"))
731 regs->msr = (unsigned long)reg_val;
732 else if (reg_id == REG_ID("CTR"))
733 regs->ctr = (unsigned long)reg_val;
734 else if (reg_id == REG_ID("LR"))
735 regs->link = (unsigned long)reg_val;
736 else if (reg_id == REG_ID("XER"))
737 regs->xer = (unsigned long)reg_val;
738 else if (reg_id == REG_ID("CR"))
739 regs->ccr = (unsigned long)reg_val;
740 else if (reg_id == REG_ID("DAR"))
741 regs->dar = (unsigned long)reg_val;
742 else if (reg_id == REG_ID("DSISR"))
743 regs->dsisr = (unsigned long)reg_val;
744}
745
746static struct fadump_reg_entry*
747fadump_read_registers(struct fadump_reg_entry *reg_entry, struct pt_regs *regs)
748{
749 memset(regs, 0, sizeof(struct pt_regs));
750
751 while (be64_to_cpu(reg_entry->reg_id) != REG_ID("CPUEND")) {
752 fadump_set_regval(regs, be64_to_cpu(reg_entry->reg_id),
753 be64_to_cpu(reg_entry->reg_value));
754 reg_entry++;
755 }
756 reg_entry++;
757 return reg_entry;
758} 638}
759 639
760static u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs) 640u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs)
761{ 641{
762 struct elf_prstatus prstatus; 642 struct elf_prstatus prstatus;
763 643
@@ -772,7 +652,7 @@ static u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs)
772 return buf; 652 return buf;
773} 653}
774 654
775static void fadump_update_elfcore_header(char *bufp) 655void fadump_update_elfcore_header(char *bufp)
776{ 656{
777 struct elfhdr *elf; 657 struct elfhdr *elf;
778 struct elf_phdr *phdr; 658 struct elf_phdr *phdr;
@@ -784,7 +664,7 @@ static void fadump_update_elfcore_header(char *bufp)
784 phdr = (struct elf_phdr *)bufp; 664 phdr = (struct elf_phdr *)bufp;
785 665
786 if (phdr->p_type == PT_NOTE) { 666 if (phdr->p_type == PT_NOTE) {
787 phdr->p_paddr = fw_dump.cpu_notes_buf; 667 phdr->p_paddr = __pa(fw_dump.cpu_notes_buf_vaddr);
788 phdr->p_offset = phdr->p_paddr; 668 phdr->p_offset = phdr->p_paddr;
789 phdr->p_filesz = fw_dump.cpu_notes_buf_size; 669 phdr->p_filesz = fw_dump.cpu_notes_buf_size;
790 phdr->p_memsz = fw_dump.cpu_notes_buf_size; 670 phdr->p_memsz = fw_dump.cpu_notes_buf_size;
@@ -792,228 +672,100 @@ static void fadump_update_elfcore_header(char *bufp)
792 return; 672 return;
793} 673}
794 674
795static void *fadump_cpu_notes_buf_alloc(unsigned long size) 675static void *fadump_alloc_buffer(unsigned long size)
796{ 676{
797 void *vaddr; 677 unsigned long count, i;
798 struct page *page; 678 struct page *page;
799 unsigned long order, count, i; 679 void *vaddr;
800 680
801 order = get_order(size); 681 vaddr = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
802 vaddr = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
803 if (!vaddr) 682 if (!vaddr)
804 return NULL; 683 return NULL;
805 684
806 count = 1 << order; 685 count = PAGE_ALIGN(size) / PAGE_SIZE;
807 page = virt_to_page(vaddr); 686 page = virt_to_page(vaddr);
808 for (i = 0; i < count; i++) 687 for (i = 0; i < count; i++)
809 SetPageReserved(page + i); 688 mark_page_reserved(page + i);
810 return vaddr; 689 return vaddr;
811} 690}
812 691
813static void fadump_cpu_notes_buf_free(unsigned long vaddr, unsigned long size) 692static void fadump_free_buffer(unsigned long vaddr, unsigned long size)
814{ 693{
815 struct page *page; 694 free_reserved_area((void *)vaddr, (void *)(vaddr + size), -1, NULL);
816 unsigned long order, count, i;
817
818 order = get_order(size);
819 count = 1 << order;
820 page = virt_to_page(vaddr);
821 for (i = 0; i < count; i++)
822 ClearPageReserved(page + i);
823 __free_pages(page, order);
824} 695}
825 696
826/* 697s32 fadump_setup_cpu_notes_buf(u32 num_cpus)
827 * Read CPU state dump data and convert it into ELF notes.
828 * The CPU dump starts with magic number "REGSAVE". NumCpusOffset should be
829 * used to access the data to allow for additional fields to be added without
830 * affecting compatibility. Each list of registers for a CPU starts with
831 * "CPUSTRT" and ends with "CPUEND". Each register entry is of 16 bytes,
832 * 8 Byte ASCII identifier and 8 Byte register value. The register entry
833 * with identifier "CPUSTRT" and "CPUEND" contains 4 byte cpu id as part
834 * of register value. For more details refer to PAPR document.
835 *
836 * Only for the crashing cpu we ignore the CPU dump data and get exact
837 * state from fadump crash info structure populated by first kernel at the
838 * time of crash.
839 */
840static int __init fadump_build_cpu_notes(const struct fadump_mem_struct *fdm)
841{ 698{
842 struct fadump_reg_save_area_header *reg_header;
843 struct fadump_reg_entry *reg_entry;
844 struct fadump_crash_info_header *fdh = NULL;
845 void *vaddr;
846 unsigned long addr;
847 u32 num_cpus, *note_buf;
848 struct pt_regs regs;
849 int i, rc = 0, cpu = 0;
850
851 if (!fdm->cpu_state_data.bytes_dumped)
852 return -EINVAL;
853
854 addr = be64_to_cpu(fdm->cpu_state_data.destination_address);
855 vaddr = __va(addr);
856
857 reg_header = vaddr;
858 if (be64_to_cpu(reg_header->magic_number) != REGSAVE_AREA_MAGIC) {
859 printk(KERN_ERR "Unable to read register save area.\n");
860 return -ENOENT;
861 }
862 pr_debug("--------CPU State Data------------\n");
863 pr_debug("Magic Number: %llx\n", be64_to_cpu(reg_header->magic_number));
864 pr_debug("NumCpuOffset: %x\n", be32_to_cpu(reg_header->num_cpu_offset));
865
866 vaddr += be32_to_cpu(reg_header->num_cpu_offset);
867 num_cpus = be32_to_cpu(*((__be32 *)(vaddr)));
868 pr_debug("NumCpus : %u\n", num_cpus);
869 vaddr += sizeof(u32);
870 reg_entry = (struct fadump_reg_entry *)vaddr;
871
872 /* Allocate buffer to hold cpu crash notes. */ 699 /* Allocate buffer to hold cpu crash notes. */
873 fw_dump.cpu_notes_buf_size = num_cpus * sizeof(note_buf_t); 700 fw_dump.cpu_notes_buf_size = num_cpus * sizeof(note_buf_t);
874 fw_dump.cpu_notes_buf_size = PAGE_ALIGN(fw_dump.cpu_notes_buf_size); 701 fw_dump.cpu_notes_buf_size = PAGE_ALIGN(fw_dump.cpu_notes_buf_size);
875 note_buf = fadump_cpu_notes_buf_alloc(fw_dump.cpu_notes_buf_size); 702 fw_dump.cpu_notes_buf_vaddr =
876 if (!note_buf) { 703 (unsigned long)fadump_alloc_buffer(fw_dump.cpu_notes_buf_size);
877 printk(KERN_ERR "Failed to allocate 0x%lx bytes for " 704 if (!fw_dump.cpu_notes_buf_vaddr) {
878 "cpu notes buffer\n", fw_dump.cpu_notes_buf_size); 705 pr_err("Failed to allocate %ld bytes for CPU notes buffer\n",
706 fw_dump.cpu_notes_buf_size);
879 return -ENOMEM; 707 return -ENOMEM;
880 } 708 }
881 fw_dump.cpu_notes_buf = __pa(note_buf);
882
883 pr_debug("Allocated buffer for cpu notes of size %ld at %p\n",
884 (num_cpus * sizeof(note_buf_t)), note_buf);
885 709
886 if (fw_dump.fadumphdr_addr) 710 pr_debug("Allocated buffer for cpu notes of size %ld at 0x%lx\n",
887 fdh = __va(fw_dump.fadumphdr_addr); 711 fw_dump.cpu_notes_buf_size,
888 712 fw_dump.cpu_notes_buf_vaddr);
889 for (i = 0; i < num_cpus; i++) {
890 if (be64_to_cpu(reg_entry->reg_id) != REG_ID("CPUSTRT")) {
891 printk(KERN_ERR "Unable to read CPU state data\n");
892 rc = -ENOENT;
893 goto error_out;
894 }
895 /* Lower 4 bytes of reg_value contains logical cpu id */
896 cpu = be64_to_cpu(reg_entry->reg_value) & FADUMP_CPU_ID_MASK;
897 if (fdh && !cpumask_test_cpu(cpu, &fdh->online_mask)) {
898 SKIP_TO_NEXT_CPU(reg_entry);
899 continue;
900 }
901 pr_debug("Reading register data for cpu %d...\n", cpu);
902 if (fdh && fdh->crashing_cpu == cpu) {
903 regs = fdh->regs;
904 note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
905 SKIP_TO_NEXT_CPU(reg_entry);
906 } else {
907 reg_entry++;
908 reg_entry = fadump_read_registers(reg_entry, &regs);
909 note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
910 }
911 }
912 final_note(note_buf);
913
914 if (fdh) {
915 pr_debug("Updating elfcore header (%llx) with cpu notes\n",
916 fdh->elfcorehdr_addr);
917 fadump_update_elfcore_header((char *)__va(fdh->elfcorehdr_addr));
918 }
919 return 0; 713 return 0;
920
921error_out:
922 fadump_cpu_notes_buf_free((unsigned long)__va(fw_dump.cpu_notes_buf),
923 fw_dump.cpu_notes_buf_size);
924 fw_dump.cpu_notes_buf = 0;
925 fw_dump.cpu_notes_buf_size = 0;
926 return rc;
927
928} 714}
929 715
930/* 716void fadump_free_cpu_notes_buf(void)
931 * Validate and process the dump data stored by firmware before exporting
932 * it through '/proc/vmcore'.
933 */
934static int __init process_fadump(const struct fadump_mem_struct *fdm_active)
935{ 717{
936 struct fadump_crash_info_header *fdh; 718 if (!fw_dump.cpu_notes_buf_vaddr)
937 int rc = 0; 719 return;
938
939 if (!fdm_active || !fw_dump.fadumphdr_addr)
940 return -EINVAL;
941
942 /* Check if the dump data is valid. */
943 if ((be16_to_cpu(fdm_active->header.dump_status_flag) == FADUMP_ERROR_FLAG) ||
944 (fdm_active->cpu_state_data.error_flags != 0) ||
945 (fdm_active->rmr_region.error_flags != 0)) {
946 printk(KERN_ERR "Dump taken by platform is not valid\n");
947 return -EINVAL;
948 }
949 if ((fdm_active->rmr_region.bytes_dumped !=
950 fdm_active->rmr_region.source_len) ||
951 !fdm_active->cpu_state_data.bytes_dumped) {
952 printk(KERN_ERR "Dump taken by platform is incomplete\n");
953 return -EINVAL;
954 }
955
956 /* Validate the fadump crash info header */
957 fdh = __va(fw_dump.fadumphdr_addr);
958 if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) {
959 printk(KERN_ERR "Crash info header is not valid.\n");
960 return -EINVAL;
961 }
962
963 rc = fadump_build_cpu_notes(fdm_active);
964 if (rc)
965 return rc;
966
967 /*
968 * We are done validating dump info and elfcore header is now ready
969 * to be exported. set elfcorehdr_addr so that vmcore module will
970 * export the elfcore header through '/proc/vmcore'.
971 */
972 elfcorehdr_addr = fdh->elfcorehdr_addr;
973 720
974 return 0; 721 fadump_free_buffer(fw_dump.cpu_notes_buf_vaddr,
722 fw_dump.cpu_notes_buf_size);
723 fw_dump.cpu_notes_buf_vaddr = 0;
724 fw_dump.cpu_notes_buf_size = 0;
975} 725}
976 726
977static void free_crash_memory_ranges(void) 727static void fadump_free_mem_ranges(struct fadump_mrange_info *mrange_info)
978{ 728{
979 kfree(crash_memory_ranges); 729 kfree(mrange_info->mem_ranges);
980 crash_memory_ranges = NULL; 730 mrange_info->mem_ranges = NULL;
981 crash_memory_ranges_size = 0; 731 mrange_info->mem_ranges_sz = 0;
982 max_crash_mem_ranges = 0; 732 mrange_info->max_mem_ranges = 0;
983} 733}
984 734
985/* 735/*
986 * Allocate or reallocate crash memory ranges array in incremental units 736 * Allocate or reallocate mem_ranges array in incremental units
987 * of PAGE_SIZE. 737 * of PAGE_SIZE.
988 */ 738 */
989static int allocate_crash_memory_ranges(void) 739static int fadump_alloc_mem_ranges(struct fadump_mrange_info *mrange_info)
990{ 740{
991 struct fad_crash_memory_ranges *new_array; 741 struct fadump_memory_range *new_array;
992 u64 new_size; 742 u64 new_size;
993 743
994 new_size = crash_memory_ranges_size + PAGE_SIZE; 744 new_size = mrange_info->mem_ranges_sz + PAGE_SIZE;
995 pr_debug("Allocating %llu bytes of memory for crash memory ranges\n", 745 pr_debug("Allocating %llu bytes of memory for %s memory ranges\n",
996 new_size); 746 new_size, mrange_info->name);
997 747
998 new_array = krealloc(crash_memory_ranges, new_size, GFP_KERNEL); 748 new_array = krealloc(mrange_info->mem_ranges, new_size, GFP_KERNEL);
999 if (new_array == NULL) { 749 if (new_array == NULL) {
1000 pr_err("Insufficient memory for setting up crash memory ranges\n"); 750 pr_err("Insufficient memory for setting up %s memory ranges\n",
1001 free_crash_memory_ranges(); 751 mrange_info->name);
752 fadump_free_mem_ranges(mrange_info);
1002 return -ENOMEM; 753 return -ENOMEM;
1003 } 754 }
1004 755
1005 crash_memory_ranges = new_array; 756 mrange_info->mem_ranges = new_array;
1006 crash_memory_ranges_size = new_size; 757 mrange_info->mem_ranges_sz = new_size;
1007 max_crash_mem_ranges = (new_size / 758 mrange_info->max_mem_ranges = (new_size /
1008 sizeof(struct fad_crash_memory_ranges)); 759 sizeof(struct fadump_memory_range));
1009 return 0; 760 return 0;
1010} 761}
1011 762
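fadump_alloc_mem_ranges() above grows the range array one page at a time with krealloc(). Assuming a 4 KB page and a range record that is just a 64-bit base plus a 64-bit size (only those two fields are visible in this diff; the real struct fadump_memory_range is defined elsewhere), each growth step adds room for 256 more entries. A rough userspace sketch of that growth pattern:

#include <stdlib.h>
#include <stdio.h>

#define SKETCH_PAGE_SIZE 4096UL

struct range_sketch { unsigned long long base, size; };

struct mrange_info_sketch {
    struct range_sketch *mem_ranges;
    unsigned long mem_ranges_sz;
    unsigned long max_mem_ranges;
};

/* Grow the array by one page, as fadump_alloc_mem_ranges() does. */
static int grow_ranges(struct mrange_info_sketch *mi)
{
    unsigned long new_size = mi->mem_ranges_sz + SKETCH_PAGE_SIZE;
    void *p = realloc(mi->mem_ranges, new_size);

    if (!p)
        return -1;
    mi->mem_ranges = p;
    mi->mem_ranges_sz = new_size;
    mi->max_mem_ranges = new_size / sizeof(struct range_sketch);
    return 0;
}

int main(void)
{
    struct mrange_info_sketch mi = { 0 };

    grow_ranges(&mi);
    printf("%lu ranges fit per page\n", mi.max_mem_ranges); /* 256 */
    free(mi.mem_ranges);
    return 0;
}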
1012static inline int fadump_add_crash_memory(unsigned long long base, 763static inline int fadump_add_mem_range(struct fadump_mrange_info *mrange_info,
1013 unsigned long long end) 764 u64 base, u64 end)
1014{ 765{
1015 u64 start, size; 766 struct fadump_memory_range *mem_ranges = mrange_info->mem_ranges;
1016 bool is_adjacent = false; 767 bool is_adjacent = false;
768 u64 start, size;
1017 769
1018 if (base == end) 770 if (base == end)
1019 return 0; 771 return 0;
@@ -1022,38 +774,41 @@ static inline int fadump_add_crash_memory(unsigned long long base,
1022 * Fold adjacent memory ranges to bring down the memory ranges/ 774 * Fold adjacent memory ranges to bring down the memory ranges/
1023 * PT_LOAD segments count. 775 * PT_LOAD segments count.
1024 */ 776 */
1025 if (crash_mem_ranges) { 777 if (mrange_info->mem_range_cnt) {
1026 start = crash_memory_ranges[crash_mem_ranges - 1].base; 778 start = mem_ranges[mrange_info->mem_range_cnt - 1].base;
1027 size = crash_memory_ranges[crash_mem_ranges - 1].size; 779 size = mem_ranges[mrange_info->mem_range_cnt - 1].size;
1028 780
1029 if ((start + size) == base) 781 if ((start + size) == base)
1030 is_adjacent = true; 782 is_adjacent = true;
1031 } 783 }
1032 if (!is_adjacent) { 784 if (!is_adjacent) {
1033 /* resize the array on reaching the limit */ 785 /* resize the array on reaching the limit */
1034 if (crash_mem_ranges == max_crash_mem_ranges) { 786 if (mrange_info->mem_range_cnt == mrange_info->max_mem_ranges) {
1035 int ret; 787 int ret;
1036 788
1037 ret = allocate_crash_memory_ranges(); 789 ret = fadump_alloc_mem_ranges(mrange_info);
1038 if (ret) 790 if (ret)
1039 return ret; 791 return ret;
792
793 /* Update to the new resized array */
794 mem_ranges = mrange_info->mem_ranges;
1040 } 795 }
1041 796
1042 start = base; 797 start = base;
1043 crash_memory_ranges[crash_mem_ranges].base = start; 798 mem_ranges[mrange_info->mem_range_cnt].base = start;
1044 crash_mem_ranges++; 799 mrange_info->mem_range_cnt++;
1045 } 800 }
1046 801
1047 crash_memory_ranges[crash_mem_ranges - 1].size = (end - start); 802 mem_ranges[mrange_info->mem_range_cnt - 1].size = (end - start);
1048 pr_debug("crash_memory_range[%d] [%#016llx-%#016llx], %#llx bytes\n", 803 pr_debug("%s_memory_range[%d] [%#016llx-%#016llx], %#llx bytes\n",
1049 (crash_mem_ranges - 1), start, end - 1, (end - start)); 804 mrange_info->name, (mrange_info->mem_range_cnt - 1),
805 start, end - 1, (end - start));
1050 return 0; 806 return 0;
1051} 807}
1052 808
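fadump_add_mem_range() above folds a new range into the previous entry when the two are contiguous, keeping the PT_LOAD segment count down. Below is a standalone sketch of just that folding decision (the array-growth handling is omitted, and the fixed-size array and example addresses are purely illustrative):

#include <stdio.h>

struct range_sketch { unsigned long long base, size; };

static struct range_sketch ranges[16];
static int range_cnt;

/* Add [base, end), merging with the previous range when adjacent. */
static void add_range(unsigned long long base, unsigned long long end)
{
    if (base == end)
        return;

    if (range_cnt &&
        ranges[range_cnt - 1].base + ranges[range_cnt - 1].size == base) {
        /* Adjacent: extend the previous entry instead of adding one. */
        ranges[range_cnt - 1].size = end - ranges[range_cnt - 1].base;
        return;
    }

    ranges[range_cnt].base = base;
    ranges[range_cnt].size = end - base;
    range_cnt++;
}

int main(void)
{
    add_range(0x0, 0x1000);
    add_range(0x1000, 0x3000);  /* contiguous: folds into the first entry */
    add_range(0x8000, 0x9000);  /* not adjacent: new entry */

    for (int i = 0; i < range_cnt; i++)
        printf("[%#llx-%#llx)\n", ranges[i].base,
               ranges[i].base + ranges[i].size);
    return 0;   /* prints two ranges, not three */
}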
1053static int fadump_exclude_reserved_area(unsigned long long start, 809static int fadump_exclude_reserved_area(u64 start, u64 end)
1054 unsigned long long end)
1055{ 810{
1056 unsigned long long ra_start, ra_end; 811 u64 ra_start, ra_end;
1057 int ret = 0; 812 int ret = 0;
1058 813
1059 ra_start = fw_dump.reserve_dump_area_start; 814 ra_start = fw_dump.reserve_dump_area_start;
@@ -1061,18 +816,22 @@ static int fadump_exclude_reserved_area(unsigned long long start,
1061 816
1062 if ((ra_start < end) && (ra_end > start)) { 817 if ((ra_start < end) && (ra_end > start)) {
1063 if ((start < ra_start) && (end > ra_end)) { 818 if ((start < ra_start) && (end > ra_end)) {
1064 ret = fadump_add_crash_memory(start, ra_start); 819 ret = fadump_add_mem_range(&crash_mrange_info,
820 start, ra_start);
1065 if (ret) 821 if (ret)
1066 return ret; 822 return ret;
1067 823
1068 ret = fadump_add_crash_memory(ra_end, end); 824 ret = fadump_add_mem_range(&crash_mrange_info,
825 ra_end, end);
1069 } else if (start < ra_start) { 826 } else if (start < ra_start) {
1070 ret = fadump_add_crash_memory(start, ra_start); 827 ret = fadump_add_mem_range(&crash_mrange_info,
828 start, ra_start);
1071 } else if (ra_end < end) { 829 } else if (ra_end < end) {
1072 ret = fadump_add_crash_memory(ra_end, end); 830 ret = fadump_add_mem_range(&crash_mrange_info,
831 ra_end, end);
1073 } 832 }
1074 } else 833 } else
1075 ret = fadump_add_crash_memory(start, end); 834 ret = fadump_add_mem_range(&crash_mrange_info, start, end);
1076 835
1077 return ret; 836 return ret;
1078} 837}
@@ -1117,36 +876,36 @@ static int fadump_init_elfcore_header(char *bufp)
1117static int fadump_setup_crash_memory_ranges(void) 876static int fadump_setup_crash_memory_ranges(void)
1118{ 877{
1119 struct memblock_region *reg; 878 struct memblock_region *reg;
1120 unsigned long long start, end; 879 u64 start, end;
1121 int ret; 880 int i, ret;
1122 881
1123 pr_debug("Setup crash memory ranges.\n"); 882 pr_debug("Setup crash memory ranges.\n");
1124 crash_mem_ranges = 0; 883 crash_mrange_info.mem_range_cnt = 0;
1125 884
1126 /* 885 /*
1127 * add the first memory chunk (RMA_START through boot_memory_size) as 886 * Boot memory region(s) registered with firmware are moved to
1128 * a separate memory chunk. The reason is, at the time crash firmware 887 * different location at the time of crash. Create separate program
1129 * will move the content of this memory chunk to different location 888 * header(s) for this memory chunk(s) with the correct offset.
1130 * specified during fadump registration. We need to create a separate
1131 * program header for this chunk with the correct offset.
1132 */ 889 */
1133 ret = fadump_add_crash_memory(RMA_START, fw_dump.boot_memory_size); 890 for (i = 0; i < fw_dump.boot_mem_regs_cnt; i++) {
1134 if (ret) 891 start = fw_dump.boot_mem_addr[i];
1135 return ret; 892 end = start + fw_dump.boot_mem_sz[i];
893 ret = fadump_add_mem_range(&crash_mrange_info, start, end);
894 if (ret)
895 return ret;
896 }
1136 897
1137 for_each_memblock(memory, reg) { 898 for_each_memblock(memory, reg) {
1138 start = (unsigned long long)reg->base; 899 start = (u64)reg->base;
1139 end = start + (unsigned long long)reg->size; 900 end = start + (u64)reg->size;
1140 901
1141 /* 902 /*
1142 * skip the first memory chunk that is already added (RMA_START 903 * skip the memory chunk that is already added
1143 * through boot_memory_size). This logic needs a relook if and 904 * (0 through boot_memory_top).
1144 * when RMA_START changes to a non-zero value.
1145 */ 905 */
1146 BUILD_BUG_ON(RMA_START != 0); 906 if (start < fw_dump.boot_mem_top) {
1147 if (start < fw_dump.boot_memory_size) { 907 if (end > fw_dump.boot_mem_top)
1148 if (end > fw_dump.boot_memory_size) 908 start = fw_dump.boot_mem_top;
1149 start = fw_dump.boot_memory_size;
1150 else 909 else
1151 continue; 910 continue;
1152 } 911 }
@@ -1167,17 +926,35 @@ static int fadump_setup_crash_memory_ranges(void)
1167 */ 926 */
1168static inline unsigned long fadump_relocate(unsigned long paddr) 927static inline unsigned long fadump_relocate(unsigned long paddr)
1169{ 928{
1170 if (paddr > RMA_START && paddr < fw_dump.boot_memory_size) 929 unsigned long raddr, rstart, rend, rlast, hole_size;
1171 return be64_to_cpu(fdm.rmr_region.destination_address) + paddr; 930 int i;
1172 else 931
1173 return paddr; 932 hole_size = 0;
933 rlast = 0;
934 raddr = paddr;
935 for (i = 0; i < fw_dump.boot_mem_regs_cnt; i++) {
936 rstart = fw_dump.boot_mem_addr[i];
937 rend = rstart + fw_dump.boot_mem_sz[i];
938 hole_size += (rstart - rlast);
939
940 if (paddr >= rstart && paddr < rend) {
941 raddr += fw_dump.boot_mem_dest_addr - hole_size;
942 break;
943 }
944
945 rlast = rend;
946 }
947
948 pr_debug("vmcoreinfo: paddr = 0x%lx, raddr = 0x%lx\n", paddr, raddr);
949 return raddr;
1174} 950}
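
The relocation above can be pictured as: walk the boot memory regions in order, keep a running total of the holes between them, and shift an address that falls inside a region into the destination area minus the accumulated hole size. A small self-contained sketch, where the struct and names are placeholders rather than the kernel's:

#include <stdio.h>

typedef unsigned long long u64;

struct region { u64 addr, size; };

/* Simplified model of fadump_relocate(): map paddr into the dump
 * destination area, discounting gaps between boot memory regions. */
static u64 relocate(u64 paddr, u64 dest_addr, const struct region *r, int n)
{
	u64 hole_size = 0, rlast = 0;
	int i;

	for (i = 0; i < n; i++) {
		u64 rstart = r[i].addr;
		u64 rend = rstart + r[i].size;

		hole_size += rstart - rlast;	/* gap since the previous region */
		if (paddr >= rstart && paddr < rend)
			return paddr + dest_addr - hole_size;
		rlast = rend;
	}
	return paddr;	/* not in boot memory: address is unchanged */
}

int main(void)
{
	struct region regs[] = { { 0x0, 0x10000000 }, { 0x40000000, 0x10000000 } };

	/* 0x40001000 sits after a 0x30000000 hole, so it lands at
	 * dest_addr + (0x40001000 - 0x30000000) = 0x90001000. */
	printf("%#llx\n", relocate(0x40001000ULL, 0x80000000ULL, regs, 2));
	return 0;
}
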
1175 951
1176static int fadump_create_elfcore_headers(char *bufp) 952static int fadump_create_elfcore_headers(char *bufp)
1177{ 953{
1178 struct elfhdr *elf; 954 unsigned long long raddr, offset;
1179 struct elf_phdr *phdr; 955 struct elf_phdr *phdr;
1180 int i; 956 struct elfhdr *elf;
957 int i, j;
1181 958
1182 fadump_init_elfcore_header(bufp); 959 fadump_init_elfcore_header(bufp);
1183 elf = (struct elfhdr *)bufp; 960 elf = (struct elfhdr *)bufp;
@@ -1220,12 +997,14 @@ static int fadump_create_elfcore_headers(char *bufp)
1220 (elf->e_phnum)++; 997 (elf->e_phnum)++;
1221 998
1222 /* setup PT_LOAD sections. */ 999 /* setup PT_LOAD sections. */
1223 1000 j = 0;
1224 for (i = 0; i < crash_mem_ranges; i++) { 1001 offset = 0;
1225 unsigned long long mbase, msize; 1002 raddr = fw_dump.boot_mem_addr[0];
1226 mbase = crash_memory_ranges[i].base; 1003 for (i = 0; i < crash_mrange_info.mem_range_cnt; i++) {
1227 msize = crash_memory_ranges[i].size; 1004 u64 mbase, msize;
1228 1005
1006 mbase = crash_mrange_info.mem_ranges[i].base;
1007 msize = crash_mrange_info.mem_ranges[i].size;
1229 if (!msize) 1008 if (!msize)
1230 continue; 1009 continue;
1231 1010
@@ -1235,13 +1014,17 @@ static int fadump_create_elfcore_headers(char *bufp)
1235 phdr->p_flags = PF_R|PF_W|PF_X; 1014 phdr->p_flags = PF_R|PF_W|PF_X;
1236 phdr->p_offset = mbase; 1015 phdr->p_offset = mbase;
1237 1016
1238 if (mbase == RMA_START) { 1017 if (mbase == raddr) {
1239 /* 1018 /*
1240 * The entire RMA region will be moved by firmware 1019 * The entire real memory region will be moved by
1241 * to the specified destination_address. Hence set 1020 * firmware to the specified destination_address.
1242 * the correct offset. 1021 * Hence set the correct offset.
1243 */ 1022 */
1244 phdr->p_offset = be64_to_cpu(fdm.rmr_region.destination_address); 1023 phdr->p_offset = fw_dump.boot_mem_dest_addr + offset;
1024 if (j < (fw_dump.boot_mem_regs_cnt - 1)) {
1025 offset += fw_dump.boot_mem_sz[j];
1026 raddr = fw_dump.boot_mem_addr[++j];
1027 }
1245 } 1028 }
1246 1029
1247 phdr->p_paddr = mbase; 1030 phdr->p_paddr = mbase;
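
In other words, a PT_LOAD segment that starts at one of the boot memory regions has its file offset redirected to the relocated copy of that region, while every other segment is read in place. A hedged sketch of just that decision; the helper name and parameters are illustrative only:

typedef unsigned long long u64;

/*
 * Pick the ELF file offset for one PT_LOAD range. Boot memory was copied
 * by firmware to dest_addr, so a segment beginning at the current boot
 * memory region (mbase == raddr) points at the relocated copy at
 * dest_addr plus the cumulative size of earlier boot regions; everything
 * else maps 1:1.
 */
static u64 phdr_file_offset(u64 mbase, u64 raddr, u64 dest_addr, u64 region_off)
{
	if (mbase == raddr)
		return dest_addr + region_off;
	return mbase;
}
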
@@ -1263,7 +1046,6 @@ static unsigned long init_fadump_header(unsigned long addr)
1263 if (!addr) 1046 if (!addr)
1264 return 0; 1047 return 0;
1265 1048
1266 fw_dump.fadumphdr_addr = addr;
1267 fdh = __va(addr); 1049 fdh = __va(addr);
1268 addr += sizeof(struct fadump_crash_info_header); 1050 addr += sizeof(struct fadump_crash_info_header);
1269 1051
@@ -1271,7 +1053,7 @@ static unsigned long init_fadump_header(unsigned long addr)
1271 fdh->magic_number = FADUMP_CRASH_INFO_MAGIC; 1053 fdh->magic_number = FADUMP_CRASH_INFO_MAGIC;
1272 fdh->elfcorehdr_addr = addr; 1054 fdh->elfcorehdr_addr = addr;
1273 /* We will set the crashing cpu id in crash_fadump() during crash. */ 1055 /* We will set the crashing cpu id in crash_fadump() during crash. */
1274 fdh->crashing_cpu = CPU_UNKNOWN; 1056 fdh->crashing_cpu = FADUMP_CPU_UNKNOWN;
1275 1057
1276 return addr; 1058 return addr;
1277} 1059}
@@ -1293,7 +1075,8 @@ static int register_fadump(void)
1293 if (ret) 1075 if (ret)
1294 return ret; 1076 return ret;
1295 1077
1296 addr = be64_to_cpu(fdm.rmr_region.destination_address) + be64_to_cpu(fdm.rmr_region.source_len); 1078 addr = fw_dump.fadumphdr_addr;
1079
1297 /* Initialize fadump crash info header. */ 1080 /* Initialize fadump crash info header. */
1298 addr = init_fadump_header(addr); 1081 addr = init_fadump_header(addr);
1299 vaddr = __va(addr); 1082 vaddr = __va(addr);
@@ -1302,74 +1085,27 @@ static int register_fadump(void)
1302 fadump_create_elfcore_headers(vaddr); 1085 fadump_create_elfcore_headers(vaddr);
1303 1086
1304 /* register the future kernel dump with firmware. */ 1087 /* register the future kernel dump with firmware. */
1305 return register_fw_dump(&fdm); 1088 pr_debug("Registering for firmware-assisted kernel dump...\n");
1306} 1089 return fw_dump.ops->fadump_register(&fw_dump);
1307
1308static int fadump_unregister_dump(struct fadump_mem_struct *fdm)
1309{
1310 int rc = 0;
1311 unsigned int wait_time;
1312
1313 pr_debug("Un-register firmware-assisted dump\n");
1314
1315 /* TODO: Add upper time limit for the delay */
1316 do {
1317 rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL,
1318 FADUMP_UNREGISTER, fdm,
1319 sizeof(struct fadump_mem_struct));
1320
1321 wait_time = rtas_busy_delay_time(rc);
1322 if (wait_time)
1323 mdelay(wait_time);
1324 } while (wait_time);
1325
1326 if (rc) {
1327 printk(KERN_ERR "Failed to un-register firmware-assisted dump."
1328 " unexpected error(%d).\n", rc);
1329 return rc;
1330 }
1331 fw_dump.dump_registered = 0;
1332 return 0;
1333}
1334
1335static int fadump_invalidate_dump(const struct fadump_mem_struct *fdm)
1336{
1337 int rc = 0;
1338 unsigned int wait_time;
1339
1340 pr_debug("Invalidating firmware-assisted dump registration\n");
1341
1342 /* TODO: Add upper time limit for the delay */
1343 do {
1344 rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL,
1345 FADUMP_INVALIDATE, fdm,
1346 sizeof(struct fadump_mem_struct));
1347
1348 wait_time = rtas_busy_delay_time(rc);
1349 if (wait_time)
1350 mdelay(wait_time);
1351 } while (wait_time);
1352
1353 if (rc) {
1354 pr_err("Failed to invalidate firmware-assisted dump registration. Unexpected error (%d).\n", rc);
1355 return rc;
1356 }
1357 fw_dump.dump_active = 0;
1358 fdm_active = NULL;
1359 return 0;
1360} 1090}
1361 1091
1362void fadump_cleanup(void) 1092void fadump_cleanup(void)
1363{ 1093{
1094 if (!fw_dump.fadump_supported)
1095 return;
1096
1364 /* Invalidate the registration only if dump is active. */ 1097 /* Invalidate the registration only if dump is active. */
1365 if (fw_dump.dump_active) { 1098 if (fw_dump.dump_active) {
1366 /* pass the same memory dump structure provided by platform */ 1099 pr_debug("Invalidating firmware-assisted dump registration\n");
1367 fadump_invalidate_dump(fdm_active); 1100 fw_dump.ops->fadump_invalidate(&fw_dump);
1368 } else if (fw_dump.dump_registered) { 1101 } else if (fw_dump.dump_registered) {
1369 /* Un-register Firmware-assisted dump if it was registered. */ 1102 /* Un-register Firmware-assisted dump if it was registered. */
1370 fadump_unregister_dump(&fdm); 1103 fw_dump.ops->fadump_unregister(&fw_dump);
1371 free_crash_memory_ranges(); 1104 fadump_free_mem_ranges(&crash_mrange_info);
1372 } 1105 }
1106
1107 if (fw_dump.ops->fadump_cleanup)
1108 fw_dump.ops->fadump_cleanup(&fw_dump);
1373} 1109}
1374 1110
1375static void fadump_free_reserved_memory(unsigned long start_pfn, 1111static void fadump_free_reserved_memory(unsigned long start_pfn,
@@ -1394,90 +1130,197 @@ static void fadump_free_reserved_memory(unsigned long start_pfn,
1394/* 1130/*
1395 * Skip memory holes and free memory that was actually reserved. 1131 * Skip memory holes and free memory that was actually reserved.
1396 */ 1132 */
1397static void fadump_release_reserved_area(unsigned long start, unsigned long end) 1133static void fadump_release_reserved_area(u64 start, u64 end)
1398{ 1134{
1135 u64 tstart, tend, spfn, epfn;
1399 struct memblock_region *reg; 1136 struct memblock_region *reg;
1400 unsigned long tstart, tend;
1401 unsigned long start_pfn = PHYS_PFN(start);
1402 unsigned long end_pfn = PHYS_PFN(end);
1403 1137
1138 spfn = PHYS_PFN(start);
1139 epfn = PHYS_PFN(end);
1404 for_each_memblock(memory, reg) { 1140 for_each_memblock(memory, reg) {
1405 tstart = max(start_pfn, memblock_region_memory_base_pfn(reg)); 1141 tstart = max_t(u64, spfn, memblock_region_memory_base_pfn(reg));
1406 tend = min(end_pfn, memblock_region_memory_end_pfn(reg)); 1142 tend = min_t(u64, epfn, memblock_region_memory_end_pfn(reg));
1407 if (tstart < tend) { 1143 if (tstart < tend) {
1408 fadump_free_reserved_memory(tstart, tend); 1144 fadump_free_reserved_memory(tstart, tend);
1409 1145
1410 if (tend == end_pfn) 1146 if (tend == epfn)
1411 break; 1147 break;
1412 1148
1413 start_pfn = tend + 1; 1149 spfn = tend;
1414 } 1150 }
1415 } 1151 }
1416} 1152}
1417 1153
1418/* 1154/*
1419 * Release the memory that was reserved in early boot to preserve the memory 1155 * Sort the mem ranges in-place and merge adjacent ranges
1420 * contents. The released memory will be available for general use. 1156 * to minimize the memory ranges count.
1421 */ 1157 */
1422static void fadump_release_memory(unsigned long begin, unsigned long end) 1158static void sort_and_merge_mem_ranges(struct fadump_mrange_info *mrange_info)
1423{ 1159{
1424 unsigned long ra_start, ra_end; 1160 struct fadump_memory_range *mem_ranges;
1161 struct fadump_memory_range tmp_range;
1162 u64 base, size;
1163 int i, j, idx;
1164
1165 if (!reserved_mrange_info.mem_range_cnt)
1166 return;
1167
1168 /* Sort the memory ranges */
1169 mem_ranges = mrange_info->mem_ranges;
1170 for (i = 0; i < mrange_info->mem_range_cnt; i++) {
1171 idx = i;
1172 for (j = (i + 1); j < mrange_info->mem_range_cnt; j++) {
1173 if (mem_ranges[idx].base > mem_ranges[j].base)
1174 idx = j;
1175 }
1176 if (idx != i) {
1177 tmp_range = mem_ranges[idx];
1178 mem_ranges[idx] = mem_ranges[i];
1179 mem_ranges[i] = tmp_range;
1180 }
1181 }
1182
1183 /* Merge adjacent reserved ranges */
1184 idx = 0;
1185 for (i = 1; i < mrange_info->mem_range_cnt; i++) {
1186 base = mem_ranges[i-1].base;
1187 size = mem_ranges[i-1].size;
1188 if (mem_ranges[i].base == (base + size))
1189 mem_ranges[idx].size += mem_ranges[i].size;
1190 else {
1191 idx++;
1192 if (i == idx)
1193 continue;
1194
1195 mem_ranges[idx] = mem_ranges[i];
1196 }
1197 }
1198 mrange_info->mem_range_cnt = idx + 1;
1199}
1200
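
A standalone sketch of the sort-and-merge step above: selection-sort the ranges by base address, then coalesce any range that starts exactly where the previous one ends. This is a simplified model, not the kernel code:

#include <stdio.h>

typedef unsigned long long u64;
struct range { u64 base, size; };

/* Returns the new range count after sorting and merging in place. */
static int sort_and_merge(struct range *r, int n)
{
	int i, j, idx;

	for (i = 0; i < n; i++) {		/* selection sort by base */
		idx = i;
		for (j = i + 1; j < n; j++)
			if (r[j].base < r[idx].base)
				idx = j;
		if (idx != i) {
			struct range tmp = r[idx];
			r[idx] = r[i];
			r[i] = tmp;
		}
	}

	idx = 0;
	for (i = 1; i < n; i++) {		/* merge adjacent ranges */
		if (r[i].base == r[idx].base + r[idx].size)
			r[idx].size += r[i].size;
		else
			r[++idx] = r[i];
	}
	return n ? idx + 1 : 0;
}

int main(void)
{
	struct range r[] = { {0x3000, 0x1000}, {0x0, 0x1000}, {0x1000, 0x2000} };
	int n = sort_and_merge(r, 3), i;

	for (i = 0; i < n; i++)		/* prints a single merged range */
		printf("[%#llx + %#llx]\n", r[i].base, r[i].size);
	return 0;
}
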
1201/*
1202 * Scan reserved-ranges to consider them while reserving/releasing
1203 * memory for FADump.
1204 */
1205static inline int fadump_scan_reserved_mem_ranges(void)
1206{
1207 struct device_node *root;
1208 const __be32 *prop;
1209 int len, ret = -1;
1210 unsigned long i;
1211
1212 root = of_find_node_by_path("/");
1213 if (!root)
1214 return ret;
1215
1216 prop = of_get_property(root, "reserved-ranges", &len);
1217 if (!prop)
1218 return ret;
1219
1220 /*
1221 * Each reserved range is an (address,size) pair, 2 cells each,
1222 * totalling 4 cells per range.
1223 */
1224 for (i = 0; i < len / (sizeof(*prop) * 4); i++) {
1225 u64 base, size;
1226
1227 base = of_read_number(prop + (i * 4) + 0, 2);
1228 size = of_read_number(prop + (i * 4) + 2, 2);
1229
1230 if (size) {
1231 ret = fadump_add_mem_range(&reserved_mrange_info,
1232 base, base + size);
1233 if (ret < 0) {
1234 pr_warn("some reserved ranges are ignored!\n");
1235 break;
1236 }
1237 }
1238 }
1239
1240 return ret;
1241}
1242
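
Each "reserved-ranges" entry is four 32-bit cells: a 64-bit base followed by a 64-bit size, both big-endian, which is what the two of_read_number(..., 2) calls above decode. A userspace-style sketch of that decoding; the property contents are made up for illustration:

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>	/* ntohl()/htonl() for big-endian cells */

typedef uint64_t u64;

/* Read a 64-bit value from two big-endian 32-bit cells, the same layout
 * of_read_number(prop, 2) decodes for each (address, size) pair. */
static u64 read_cells(const uint32_t *cells)
{
	return ((u64)ntohl(cells[0]) << 32) | ntohl(cells[1]);
}

int main(void)
{
	/* One hypothetical reserved-ranges entry: base 0x30000000, size 0x10000000 */
	uint32_t prop[4] = { htonl(0x0), htonl(0x30000000),
			     htonl(0x0), htonl(0x10000000) };

	printf("base=%#llx size=%#llx\n",
	       (unsigned long long)read_cells(&prop[0]),
	       (unsigned long long)read_cells(&prop[2]));
	return 0;
}
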
1243/*
1244 * Release the memory that was reserved during early boot to preserve the
1245 * crash'ed kernel's memory contents except reserved dump area (permanent
1246 * reservation) and reserved ranges used by F/W. The released memory will
1247 * be available for general use.
1248 */
1249static void fadump_release_memory(u64 begin, u64 end)
1250{
1251 u64 ra_start, ra_end, tstart;
1252 int i, ret;
1253
1254 fadump_scan_reserved_mem_ranges();
1425 1255
1426 ra_start = fw_dump.reserve_dump_area_start; 1256 ra_start = fw_dump.reserve_dump_area_start;
1427 ra_end = ra_start + fw_dump.reserve_dump_area_size; 1257 ra_end = ra_start + fw_dump.reserve_dump_area_size;
1428 1258
1429 /* 1259 /*
1430 * exclude the dump reserve area. Will reuse it for next 1260 * Add reserved dump area to reserved ranges list
1431 * fadump registration. 1261 * and exclude all these ranges while releasing memory.
1432 */ 1262 */
1433 if (begin < ra_end && end > ra_start) { 1263 ret = fadump_add_mem_range(&reserved_mrange_info, ra_start, ra_end);
1434 if (begin < ra_start) 1264 if (ret != 0) {
1435 fadump_release_reserved_area(begin, ra_start); 1265 /*
1436 if (end > ra_end) 1266 * Not enough memory to setup reserved ranges but the system is
1437 fadump_release_reserved_area(ra_end, end); 1267 * running shortage of memory. So, release all the memory except
1438 } else 1268 * Reserved dump area (reused for next fadump registration).
1439 fadump_release_reserved_area(begin, end); 1269 */
1270 if (begin < ra_end && end > ra_start) {
1271 if (begin < ra_start)
1272 fadump_release_reserved_area(begin, ra_start);
1273 if (end > ra_end)
1274 fadump_release_reserved_area(ra_end, end);
1275 } else
1276 fadump_release_reserved_area(begin, end);
1277
1278 return;
1279 }
1280
1281 /* Get the reserved ranges list in order first. */
1282 sort_and_merge_mem_ranges(&reserved_mrange_info);
1283
1284 /* Exclude reserved ranges and release remaining memory */
1285 tstart = begin;
1286 for (i = 0; i < reserved_mrange_info.mem_range_cnt; i++) {
1287 ra_start = reserved_mrange_info.mem_ranges[i].base;
1288 ra_end = ra_start + reserved_mrange_info.mem_ranges[i].size;
1289
1290 if (tstart >= ra_end)
1291 continue;
1292
1293 if (tstart < ra_start)
1294 fadump_release_reserved_area(tstart, ra_start);
1295 tstart = ra_end;
1296 }
1297
1298 if (tstart < end)
1299 fadump_release_reserved_area(tstart, end);
1440} 1300}
1441 1301
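
The release loop above reads as: with the reserved ranges sorted and merged, walk them in order and release every gap that falls inside [begin, end). A compact standalone sketch of that walk; the names are illustrative, not the kernel's:

#include <stdio.h>

typedef unsigned long long u64;
struct range { u64 base, size; };

static void release(u64 start, u64 end)
{
	printf("release [%#llx - %#llx)\n", start, end);
}

/* Release [begin, end) except the sorted, merged reserved ranges in resv[]. */
static void release_excluding(u64 begin, u64 end, const struct range *resv, int n)
{
	u64 tstart = begin;
	int i;

	for (i = 0; i < n; i++) {
		u64 ra_start = resv[i].base;
		u64 ra_end = ra_start + resv[i].size;

		if (tstart >= ra_end)
			continue;		/* already past this reserved range */
		if (tstart < ra_start)
			release(tstart, ra_start);
		tstart = ra_end;
	}
	if (tstart < end)
		release(tstart, end);
}

int main(void)
{
	struct range resv[] = { { 0x1000, 0x1000 }, { 0x4000, 0x1000 } };

	release_excluding(0x0, 0x8000, resv, 2);	/* releases the three gaps */
	return 0;
}
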
1442static void fadump_invalidate_release_mem(void) 1302static void fadump_invalidate_release_mem(void)
1443{ 1303{
1444 unsigned long reserved_area_start, reserved_area_end;
1445 unsigned long destination_address;
1446
1447 mutex_lock(&fadump_mutex); 1304 mutex_lock(&fadump_mutex);
1448 if (!fw_dump.dump_active) { 1305 if (!fw_dump.dump_active) {
1449 mutex_unlock(&fadump_mutex); 1306 mutex_unlock(&fadump_mutex);
1450 return; 1307 return;
1451 } 1308 }
1452 1309
1453 destination_address = be64_to_cpu(fdm_active->cpu_state_data.destination_address);
1454 fadump_cleanup(); 1310 fadump_cleanup();
1455 mutex_unlock(&fadump_mutex); 1311 mutex_unlock(&fadump_mutex);
1456 1312
1313 fadump_release_memory(fw_dump.boot_mem_top, memblock_end_of_DRAM());
1314 fadump_free_cpu_notes_buf();
1315
1457 /* 1316 /*
1458 * Save the current reserved memory bounds we will require them 1317 * Setup kernel metadata and initialize the kernel dump
1459 * later for releasing the memory for general use. 1318 * memory structure for FADump re-registration.
1460 */
1461 reserved_area_start = fw_dump.reserve_dump_area_start;
1462 reserved_area_end = reserved_area_start +
1463 fw_dump.reserve_dump_area_size;
1464 /*
1465 * Setup reserve_dump_area_start and its size so that we can
1466 * reuse this reserved memory for Re-registration.
1467 */ 1319 */
1468 fw_dump.reserve_dump_area_start = destination_address; 1320 if (fw_dump.ops->fadump_setup_metadata &&
1469 fw_dump.reserve_dump_area_size = get_fadump_area_size(); 1321 (fw_dump.ops->fadump_setup_metadata(&fw_dump) < 0))
1470 1322 pr_warn("Failed to setup kernel metadata!\n");
1471 fadump_release_memory(reserved_area_start, reserved_area_end); 1323 fw_dump.ops->fadump_init_mem_struct(&fw_dump);
1472 if (fw_dump.cpu_notes_buf) {
1473 fadump_cpu_notes_buf_free(
1474 (unsigned long)__va(fw_dump.cpu_notes_buf),
1475 fw_dump.cpu_notes_buf_size);
1476 fw_dump.cpu_notes_buf = 0;
1477 fw_dump.cpu_notes_buf_size = 0;
1478 }
1479 /* Initialize the kernel dump memory structure for FAD registration. */
1480 init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start);
1481} 1324}
1482 1325
1483static ssize_t fadump_release_memory_store(struct kobject *kobj, 1326static ssize_t fadump_release_memory_store(struct kobject *kobj,
@@ -1528,7 +1371,7 @@ static ssize_t fadump_register_store(struct kobject *kobj,
1528 int ret = 0; 1371 int ret = 0;
1529 int input = -1; 1372 int input = -1;
1530 1373
1531 if (!fw_dump.fadump_enabled || fdm_active) 1374 if (!fw_dump.fadump_enabled || fw_dump.dump_active)
1532 return -EPERM; 1375 return -EPERM;
1533 1376
1534 if (kstrtoint(buf, 0, &input)) 1377 if (kstrtoint(buf, 0, &input))
@@ -1541,13 +1384,15 @@ static ssize_t fadump_register_store(struct kobject *kobj,
1541 if (fw_dump.dump_registered == 0) { 1384 if (fw_dump.dump_registered == 0) {
1542 goto unlock_out; 1385 goto unlock_out;
1543 } 1386 }
1387
1544 /* Un-register Firmware-assisted dump */ 1388 /* Un-register Firmware-assisted dump */
1545 fadump_unregister_dump(&fdm); 1389 pr_debug("Un-register firmware-assisted dump\n");
1390 fw_dump.ops->fadump_unregister(&fw_dump);
1546 break; 1391 break;
1547 case 1: 1392 case 1:
1548 if (fw_dump.dump_registered == 1) { 1393 if (fw_dump.dump_registered == 1) {
1549 /* Un-register Firmware-assisted dump */ 1394 /* Un-register Firmware-assisted dump */
1550 fadump_unregister_dump(&fdm); 1395 fw_dump.ops->fadump_unregister(&fw_dump);
1551 } 1396 }
1552 /* Register Firmware-assisted dump */ 1397 /* Register Firmware-assisted dump */
1553 ret = register_fadump(); 1398 ret = register_fadump();
@@ -1564,62 +1409,12 @@ unlock_out:
1564 1409
1565static int fadump_region_show(struct seq_file *m, void *private) 1410static int fadump_region_show(struct seq_file *m, void *private)
1566{ 1411{
1567 const struct fadump_mem_struct *fdm_ptr;
1568
1569 if (!fw_dump.fadump_enabled) 1412 if (!fw_dump.fadump_enabled)
1570 return 0; 1413 return 0;
1571 1414
1572 mutex_lock(&fadump_mutex); 1415 mutex_lock(&fadump_mutex);
1573 if (fdm_active) 1416 fw_dump.ops->fadump_region_show(&fw_dump, m);
1574 fdm_ptr = fdm_active; 1417 mutex_unlock(&fadump_mutex);
1575 else {
1576 mutex_unlock(&fadump_mutex);
1577 fdm_ptr = &fdm;
1578 }
1579
1580 seq_printf(m,
1581 "CPU : [%#016llx-%#016llx] %#llx bytes, "
1582 "Dumped: %#llx\n",
1583 be64_to_cpu(fdm_ptr->cpu_state_data.destination_address),
1584 be64_to_cpu(fdm_ptr->cpu_state_data.destination_address) +
1585 be64_to_cpu(fdm_ptr->cpu_state_data.source_len) - 1,
1586 be64_to_cpu(fdm_ptr->cpu_state_data.source_len),
1587 be64_to_cpu(fdm_ptr->cpu_state_data.bytes_dumped));
1588 seq_printf(m,
1589 "HPTE: [%#016llx-%#016llx] %#llx bytes, "
1590 "Dumped: %#llx\n",
1591 be64_to_cpu(fdm_ptr->hpte_region.destination_address),
1592 be64_to_cpu(fdm_ptr->hpte_region.destination_address) +
1593 be64_to_cpu(fdm_ptr->hpte_region.source_len) - 1,
1594 be64_to_cpu(fdm_ptr->hpte_region.source_len),
1595 be64_to_cpu(fdm_ptr->hpte_region.bytes_dumped));
1596 seq_printf(m,
1597 "DUMP: [%#016llx-%#016llx] %#llx bytes, "
1598 "Dumped: %#llx\n",
1599 be64_to_cpu(fdm_ptr->rmr_region.destination_address),
1600 be64_to_cpu(fdm_ptr->rmr_region.destination_address) +
1601 be64_to_cpu(fdm_ptr->rmr_region.source_len) - 1,
1602 be64_to_cpu(fdm_ptr->rmr_region.source_len),
1603 be64_to_cpu(fdm_ptr->rmr_region.bytes_dumped));
1604
1605 if (!fdm_active ||
1606 (fw_dump.reserve_dump_area_start ==
1607 be64_to_cpu(fdm_ptr->cpu_state_data.destination_address)))
1608 goto out;
1609
1610 /* Dump is active. Show reserved memory region. */
1611 seq_printf(m,
1612 " : [%#016llx-%#016llx] %#llx bytes, "
1613 "Dumped: %#llx\n",
1614 (unsigned long long)fw_dump.reserve_dump_area_start,
1615 be64_to_cpu(fdm_ptr->cpu_state_data.destination_address) - 1,
1616 be64_to_cpu(fdm_ptr->cpu_state_data.destination_address) -
1617 fw_dump.reserve_dump_area_start,
1618 be64_to_cpu(fdm_ptr->cpu_state_data.destination_address) -
1619 fw_dump.reserve_dump_area_start);
1620out:
1621 if (fdm_active)
1622 mutex_unlock(&fadump_mutex);
1623 return 0; 1418 return 0;
1624} 1419}
1625 1420
@@ -1690,14 +1485,77 @@ int __init setup_fadump(void)
1690 * if dump process fails then invalidate the registration 1485 * if dump process fails then invalidate the registration
1691 * and release memory before proceeding for re-registration. 1486 * and release memory before proceeding for re-registration.
1692 */ 1487 */
1693 if (process_fadump(fdm_active) < 0) 1488 if (fw_dump.ops->fadump_process(&fw_dump) < 0)
1694 fadump_invalidate_release_mem(); 1489 fadump_invalidate_release_mem();
1695 } 1490 }
1696 /* Initialize the kernel dump memory structure for FAD registration. */ 1491 /* Initialize the kernel dump memory structure for FAD registration. */
1697 else if (fw_dump.reserve_dump_area_size) 1492 else if (fw_dump.reserve_dump_area_size)
1698 init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start); 1493 fw_dump.ops->fadump_init_mem_struct(&fw_dump);
1494
1699 fadump_init_files(); 1495 fadump_init_files();
1700 1496
1701 return 1; 1497 return 1;
1702} 1498}
1703subsys_initcall(setup_fadump); 1499subsys_initcall(setup_fadump);
1500#else /* !CONFIG_PRESERVE_FA_DUMP */
1501
1502/* Scan the Firmware Assisted dump configuration details. */
1503int __init early_init_dt_scan_fw_dump(unsigned long node, const char *uname,
1504 int depth, void *data)
1505{
1506 if ((depth != 1) || (strcmp(uname, "ibm,opal") != 0))
1507 return 0;
1508
1509 opal_fadump_dt_scan(&fw_dump, node);
1510 return 1;
1511}
1512
1513/*
1514 * When dump is active but PRESERVE_FA_DUMP is enabled on the kernel,
1515 * preserve crash data. The subsequent memory preserving kernel boot
1516 * is likely to process this crash data.
1517 */
1518int __init fadump_reserve_mem(void)
1519{
1520 if (fw_dump.dump_active) {
1521 /*
1522 * If last boot has crashed then reserve all the memory
1523 * above boot memory to preserve crash data.
1524 */
1525 pr_info("Preserving crash data for processing in next boot.\n");
1526 fadump_reserve_crash_area(fw_dump.boot_mem_top);
1527 } else
1528 pr_debug("FADump-aware kernel..\n");
1529
1530 return 1;
1531}
1532#endif /* CONFIG_PRESERVE_FA_DUMP */
1533
1534/* Preserve everything above the base address */
1535static void __init fadump_reserve_crash_area(u64 base)
1536{
1537 struct memblock_region *reg;
1538 u64 mstart, msize;
1539
1540 for_each_memblock(memory, reg) {
1541 mstart = reg->base;
1542 msize = reg->size;
1543
1544 if ((mstart + msize) < base)
1545 continue;
1546
1547 if (mstart < base) {
1548 msize -= (base - mstart);
1549 mstart = base;
1550 }
1551
1552 pr_info("Reserving %lluMB of memory at %#016llx for preserving crash data",
1553 (msize >> 20), mstart);
1554 memblock_reserve(mstart, msize);
1555 }
1556}
1557
1558unsigned long __init arch_reserved_kernel_pages(void)
1559{
1560 return memblock_reserved_size() / PAGE_SIZE;
1561}
diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index f255e22184b4..4a24f8f026c7 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -34,7 +34,16 @@
34 34
35#include "head_32.h" 35#include "head_32.h"
36 36
37/* 601 only have IBAT; cr0.eq is set on 601 when using this macro */ 37/* 601 only have IBAT */
38#ifdef CONFIG_PPC_BOOK3S_601
39#define LOAD_BAT(n, reg, RA, RB) \
40 li RA,0; \
41 mtspr SPRN_IBAT##n##U,RA; \
42 lwz RA,(n*16)+0(reg); \
43 lwz RB,(n*16)+4(reg); \
44 mtspr SPRN_IBAT##n##U,RA; \
45 mtspr SPRN_IBAT##n##L,RB
46#else
38#define LOAD_BAT(n, reg, RA, RB) \ 47#define LOAD_BAT(n, reg, RA, RB) \
39 /* see the comment for clear_bats() -- Cort */ \ 48 /* see the comment for clear_bats() -- Cort */ \
40 li RA,0; \ 49 li RA,0; \
@@ -44,12 +53,11 @@
44 lwz RB,(n*16)+4(reg); \ 53 lwz RB,(n*16)+4(reg); \
45 mtspr SPRN_IBAT##n##U,RA; \ 54 mtspr SPRN_IBAT##n##U,RA; \
46 mtspr SPRN_IBAT##n##L,RB; \ 55 mtspr SPRN_IBAT##n##L,RB; \
47 beq 1f; \
48 lwz RA,(n*16)+8(reg); \ 56 lwz RA,(n*16)+8(reg); \
49 lwz RB,(n*16)+12(reg); \ 57 lwz RB,(n*16)+12(reg); \
50 mtspr SPRN_DBAT##n##U,RA; \ 58 mtspr SPRN_DBAT##n##U,RA; \
51 mtspr SPRN_DBAT##n##L,RB; \ 59 mtspr SPRN_DBAT##n##L,RB
521: 60#endif
53 61
54 __HEAD 62 __HEAD
55 .stabs "arch/powerpc/kernel/",N_SO,0,0,0f 63 .stabs "arch/powerpc/kernel/",N_SO,0,0,0f
@@ -557,9 +565,9 @@ DataStoreTLBMiss:
557 cmplw 0,r1,r3 565 cmplw 0,r1,r3
558 mfspr r2, SPRN_SPRG_PGDIR 566 mfspr r2, SPRN_SPRG_PGDIR
559#ifdef CONFIG_SWAP 567#ifdef CONFIG_SWAP
560 li r1, _PAGE_RW | _PAGE_PRESENT | _PAGE_ACCESSED 568 li r1, _PAGE_RW | _PAGE_DIRTY | _PAGE_PRESENT | _PAGE_ACCESSED
561#else 569#else
562 li r1, _PAGE_RW | _PAGE_PRESENT 570 li r1, _PAGE_RW | _PAGE_DIRTY | _PAGE_PRESENT
563#endif 571#endif
564 bge- 112f 572 bge- 112f
565 lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ 573 lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */
@@ -820,9 +828,6 @@ load_up_mmu:
820 828
821/* Load the BAT registers with the values set up by MMU_init. 829/* Load the BAT registers with the values set up by MMU_init.
822 MMU_init takes care of whether we're on a 601 or not. */ 830 MMU_init takes care of whether we're on a 601 or not. */
823 mfpvr r3
824 srwi r3,r3,16
825 cmpwi r3,1
826 lis r3,BATS@ha 831 lis r3,BATS@ha
827 addi r3,r3,BATS@l 832 addi r3,r3,BATS@l
828 tophys(r3,r3) 833 tophys(r3,r3)
@@ -897,9 +902,11 @@ start_here:
897 bl machine_init 902 bl machine_init
898 bl __save_cpu_setup 903 bl __save_cpu_setup
899 bl MMU_init 904 bl MMU_init
905#ifdef CONFIG_KASAN
900BEGIN_MMU_FTR_SECTION 906BEGIN_MMU_FTR_SECTION
901 bl MMU_init_hw_patch 907 bl MMU_init_hw_patch
902END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) 908END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE)
909#endif
903 910
904/* 911/*
905 * Go back to running unmapped so we can load up new values 912 * Go back to running unmapped so we can load up new values
@@ -996,11 +1003,8 @@ EXPORT_SYMBOL(switch_mmu_context)
996 */ 1003 */
997clear_bats: 1004clear_bats:
998 li r10,0 1005 li r10,0
999 mfspr r9,SPRN_PVR
1000 rlwinm r9,r9,16,16,31 /* r9 = 1 for 601, 4 for 604 */
1001 cmpwi r9, 1
1002 beq 1f
1003 1006
1007#ifndef CONFIG_PPC_BOOK3S_601
1004 mtspr SPRN_DBAT0U,r10 1008 mtspr SPRN_DBAT0U,r10
1005 mtspr SPRN_DBAT0L,r10 1009 mtspr SPRN_DBAT0L,r10
1006 mtspr SPRN_DBAT1U,r10 1010 mtspr SPRN_DBAT1U,r10
@@ -1009,7 +1013,7 @@ clear_bats:
1009 mtspr SPRN_DBAT2L,r10 1013 mtspr SPRN_DBAT2L,r10
1010 mtspr SPRN_DBAT3U,r10 1014 mtspr SPRN_DBAT3U,r10
1011 mtspr SPRN_DBAT3L,r10 1015 mtspr SPRN_DBAT3L,r10
10121: 1016#endif
1013 mtspr SPRN_IBAT0U,r10 1017 mtspr SPRN_IBAT0U,r10
1014 mtspr SPRN_IBAT0L,r10 1018 mtspr SPRN_IBAT0L,r10
1015 mtspr SPRN_IBAT1U,r10 1019 mtspr SPRN_IBAT1U,r10
@@ -1104,10 +1108,7 @@ mmu_off:
1104 */ 1108 */
1105initial_bats: 1109initial_bats:
1106 lis r11,PAGE_OFFSET@h 1110 lis r11,PAGE_OFFSET@h
1107 mfspr r9,SPRN_PVR 1111#ifdef CONFIG_PPC_BOOK3S_601
1108 rlwinm r9,r9,16,16,31 /* r9 = 1 for 601, 4 for 604 */
1109 cmpwi 0,r9,1
1110 bne 4f
1111 ori r11,r11,4 /* set up BAT registers for 601 */ 1112 ori r11,r11,4 /* set up BAT registers for 601 */
1112 li r8,0x7f /* valid, block length = 8MB */ 1113 li r8,0x7f /* valid, block length = 8MB */
1113 mtspr SPRN_IBAT0U,r11 /* N.B. 601 has valid bit in */ 1114 mtspr SPRN_IBAT0U,r11 /* N.B. 601 has valid bit in */
@@ -1120,10 +1121,8 @@ initial_bats:
1120 addis r8,r8,0x800000@h 1121 addis r8,r8,0x800000@h
1121 mtspr SPRN_IBAT2U,r11 1122 mtspr SPRN_IBAT2U,r11
1122 mtspr SPRN_IBAT2L,r8 1123 mtspr SPRN_IBAT2L,r8
1123 isync 1124#else
1124 blr 1125 tophys(r8,r11)
1125
11264: tophys(r8,r11)
1127#ifdef CONFIG_SMP 1126#ifdef CONFIG_SMP
1128 ori r8,r8,0x12 /* R/W access, M=1 */ 1127 ori r8,r8,0x12 /* R/W access, M=1 */
1129#else 1128#else
@@ -1135,10 +1134,10 @@ initial_bats:
1135 mtspr SPRN_DBAT0U,r11 /* bit in upper BAT register */ 1134 mtspr SPRN_DBAT0U,r11 /* bit in upper BAT register */
1136 mtspr SPRN_IBAT0L,r8 1135 mtspr SPRN_IBAT0L,r8
1137 mtspr SPRN_IBAT0U,r11 1136 mtspr SPRN_IBAT0U,r11
1137#endif
1138 isync 1138 isync
1139 blr 1139 blr
1140 1140
1141
1142#ifdef CONFIG_BOOTX_TEXT 1141#ifdef CONFIG_BOOTX_TEXT
1143setup_disp_bat: 1142setup_disp_bat:
1144 /* 1143 /*
@@ -1153,15 +1152,13 @@ setup_disp_bat:
1153 beqlr 1152 beqlr
1154 lwz r11,0(r8) 1153 lwz r11,0(r8)
1155 lwz r8,4(r8) 1154 lwz r8,4(r8)
1156 mfspr r9,SPRN_PVR 1155#ifndef CONFIG_PPC_BOOK3S_601
1157 rlwinm r9,r9,16,16,31 /* r9 = 1 for 601, 4 for 604 */
1158 cmpwi 0,r9,1
1159 beq 1f
1160 mtspr SPRN_DBAT3L,r8 1156 mtspr SPRN_DBAT3L,r8
1161 mtspr SPRN_DBAT3U,r11 1157 mtspr SPRN_DBAT3U,r11
1162 blr 1158#else
11631: mtspr SPRN_IBAT3L,r8 1159 mtspr SPRN_IBAT3L,r8
1164 mtspr SPRN_IBAT3U,r11 1160 mtspr SPRN_IBAT3U,r11
1161#endif
1165 blr 1162 blr
1166#endif /* CONFIG_BOOTX_TEXT */ 1163#endif /* CONFIG_BOOTX_TEXT */
1167 1164
diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h
index 4a692553651f..8abc7783dbe5 100644
--- a/arch/powerpc/kernel/head_32.h
+++ b/arch/powerpc/kernel/head_32.h
@@ -5,19 +5,6 @@
5#include <asm/ptrace.h> /* for STACK_FRAME_REGS_MARKER */ 5#include <asm/ptrace.h> /* for STACK_FRAME_REGS_MARKER */
6 6
7/* 7/*
8 * MSR_KERNEL is > 0x8000 on 4xx/Book-E since it include MSR_CE.
9 */
10.macro __LOAD_MSR_KERNEL r, x
11.if \x >= 0x8000
12 lis \r, (\x)@h
13 ori \r, \r, (\x)@l
14.else
15 li \r, (\x)
16.endif
17.endm
18#define LOAD_MSR_KERNEL(r, x) __LOAD_MSR_KERNEL r, x
19
20/*
21 * Exception entry code. This code runs with address translation 8 * Exception entry code. This code runs with address translation
22 * turned off, i.e. using physical addresses. 9 * turned off, i.e. using physical addresses.
23 * We assume sprg3 has the physical address of the current 10 * We assume sprg3 has the physical address of the current
@@ -92,7 +79,7 @@
92#ifdef CONFIG_40x 79#ifdef CONFIG_40x
93 rlwinm r9,r9,0,14,12 /* clear MSR_WE (necessary?) */ 80 rlwinm r9,r9,0,14,12 /* clear MSR_WE (necessary?) */
94#else 81#else
95 LOAD_MSR_KERNEL(r10, MSR_KERNEL & ~(MSR_IR|MSR_DR)) /* can take exceptions */ 82 LOAD_REG_IMMEDIATE(r10, MSR_KERNEL & ~(MSR_IR|MSR_DR)) /* can take exceptions */
96 MTMSRD(r10) /* (except for mach check in rtas) */ 83 MTMSRD(r10) /* (except for mach check in rtas) */
97#endif 84#endif
98 lis r10,STACK_FRAME_REGS_MARKER@ha /* exception frame marker */ 85 lis r10,STACK_FRAME_REGS_MARKER@ha /* exception frame marker */
@@ -140,10 +127,10 @@
140 * otherwise we might risk taking an interrupt before we tell lockdep 127 * otherwise we might risk taking an interrupt before we tell lockdep
141 * they are enabled. 128 * they are enabled.
142 */ 129 */
143 LOAD_MSR_KERNEL(r10, MSR_KERNEL) 130 LOAD_REG_IMMEDIATE(r10, MSR_KERNEL)
144 rlwimi r10, r9, 0, MSR_EE 131 rlwimi r10, r9, 0, MSR_EE
145#else 132#else
146 LOAD_MSR_KERNEL(r10, MSR_KERNEL | MSR_EE) 133 LOAD_REG_IMMEDIATE(r10, MSR_KERNEL | MSR_EE)
147#endif 134#endif
148#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS) 135#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS)
149 mtspr SPRN_NRI, r0 136 mtspr SPRN_NRI, r0
@@ -187,7 +174,7 @@ label:
187#define EXC_XFER_TEMPLATE(hdlr, trap, msr, tfer, ret) \ 174#define EXC_XFER_TEMPLATE(hdlr, trap, msr, tfer, ret) \
188 li r10,trap; \ 175 li r10,trap; \
189 stw r10,_TRAP(r11); \ 176 stw r10,_TRAP(r11); \
190 LOAD_MSR_KERNEL(r10, msr); \ 177 LOAD_REG_IMMEDIATE(r10, msr); \
191 bl tfer; \ 178 bl tfer; \
192 .long hdlr; \ 179 .long hdlr; \
193 .long ret 180 .long ret
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index 91d297e696dd..ad79fddb974d 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -182,7 +182,8 @@ __secondary_hold:
182 isync 182 isync
183 bctr 183 bctr
184#else 184#else
185 BUG_OPCODE 1850: trap
186 EMIT_BUG_ENTRY 0b, __FILE__, __LINE__, 0
186#endif 187#endif
187CLOSE_FIXED_SECTION(first_256B) 188CLOSE_FIXED_SECTION(first_256B)
188 189
@@ -635,7 +636,7 @@ __after_prom_start:
635 sub r5,r5,r11 636 sub r5,r5,r11
636#else 637#else
637 /* just copy interrupts */ 638 /* just copy interrupts */
638 LOAD_REG_IMMEDIATE(r5, FIXED_SYMBOL_ABS_ADDR(__end_interrupts)) 639 LOAD_REG_IMMEDIATE_SYM(r5, r11, FIXED_SYMBOL_ABS_ADDR(__end_interrupts))
639#endif 640#endif
640 b 5f 641 b 5f
6413: 6423:
@@ -998,7 +999,8 @@ start_here_common:
998 bl start_kernel 999 bl start_kernel
999 1000
1000 /* Not reached */ 1001 /* Not reached */
1001 BUG_OPCODE 1002 trap
1003 EMIT_BUG_ENTRY 0b, __FILE__, __LINE__, 0
1002 1004
1003/* 1005/*
1004 * We put a few things here that have to be page-aligned. 1006 * We put a few things here that have to be page-aligned.
diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 5ab9178c2347..19f583e18402 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -15,6 +15,7 @@
15 */ 15 */
16 16
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/magic.h>
18#include <asm/processor.h> 19#include <asm/processor.h>
19#include <asm/page.h> 20#include <asm/page.h>
20#include <asm/mmu.h> 21#include <asm/mmu.h>
@@ -574,8 +575,6 @@ InstructionBreakpoint:
574 * by decoding the registers used by the dcbx instruction and adding them. 575 * by decoding the registers used by the dcbx instruction and adding them.
575 * DAR is set to the calculated address. 576 * DAR is set to the calculated address.
576 */ 577 */
577 /* define if you don't want to use self modifying code */
578#define NO_SELF_MODIFYING_CODE
579FixupDAR:/* Entry point for dcbx workaround. */ 578FixupDAR:/* Entry point for dcbx workaround. */
580 mtspr SPRN_M_TW, r10 579 mtspr SPRN_M_TW, r10
581 /* fetch instruction from memory. */ 580 /* fetch instruction from memory. */
@@ -639,27 +638,6 @@ FixupDAR:/* Entry point for dcbx workaround. */
639 rlwinm r10, r10,0,7,5 /* Clear store bit for buggy dcbst insn */ 638 rlwinm r10, r10,0,7,5 /* Clear store bit for buggy dcbst insn */
640 mtspr SPRN_DSISR, r10 639 mtspr SPRN_DSISR, r10
641142: /* continue, it was a dcbx, dcbi instruction. */ 640142: /* continue, it was a dcbx, dcbi instruction. */
642#ifndef NO_SELF_MODIFYING_CODE
643 andis. r10,r11,0x1f /* test if reg RA is r0 */
644 li r10,modified_instr@l
645 dcbtst r0,r10 /* touch for store */
646 rlwinm r11,r11,0,0,20 /* Zero lower 10 bits */
647 oris r11,r11,640 /* Transform instr. to a "add r10,RA,RB" */
648 ori r11,r11,532
649 stw r11,0(r10) /* store add/and instruction */
650 dcbf 0,r10 /* flush new instr. to memory. */
651 icbi 0,r10 /* invalidate instr. cache line */
652 mfspr r11, SPRN_SPRG_SCRATCH1 /* restore r11 */
653 mfspr r10, SPRN_SPRG_SCRATCH0 /* restore r10 */
654 isync /* Wait until new instr is loaded from memory */
655modified_instr:
656 .space 4 /* this is where the add instr. is stored */
657 bne+ 143f
658 subf r10,r0,r10 /* r10=r10-r0, only if reg RA is r0 */
659143: mtdar r10 /* store faulting EA in DAR */
660 mfspr r10,SPRN_M_TW
661 b DARFixed /* Go back to normal TLB handling */
662#else
663 mfctr r10 641 mfctr r10
664 mtdar r10 /* save ctr reg in DAR */ 642 mtdar r10 /* save ctr reg in DAR */
665 rlwinm r10, r11, 24, 24, 28 /* offset into jump table for reg RB */ 643 rlwinm r10, r11, 24, 24, 28 /* offset into jump table for reg RB */
@@ -723,7 +701,6 @@ modified_instr:
723 add r10, r10, r11 /* add it */ 701 add r10, r10, r11 /* add it */
724 mfctr r11 /* restore r11 */ 702 mfctr r11 /* restore r11 */
725 b 151b 703 b 151b
726#endif
727 704
728/* 705/*
729 * This is where the main kernel code starts. 706 * This is where the main kernel code starts.
@@ -741,6 +718,9 @@ start_here:
741 /* stack */ 718 /* stack */
742 lis r1,init_thread_union@ha 719 lis r1,init_thread_union@ha
743 addi r1,r1,init_thread_union@l 720 addi r1,r1,init_thread_union@l
721 lis r0, STACK_END_MAGIC@h
722 ori r0, r0, STACK_END_MAGIC@l
723 stw r0, 0(r1)
744 li r0,0 724 li r0,0
745 stwu r0,THREAD_SIZE-STACK_FRAME_OVERHEAD(r1) 725 stwu r0,THREAD_SIZE-STACK_FRAME_OVERHEAD(r1)
746 726
diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c
index c8d1fa2e9d53..1007ec36b4cb 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -195,18 +195,63 @@ void thread_change_pc(struct task_struct *tsk, struct pt_regs *regs)
195 tsk->thread.last_hit_ubp = NULL; 195 tsk->thread.last_hit_ubp = NULL;
196} 196}
197 197
198static bool is_larx_stcx_instr(struct pt_regs *regs, unsigned int instr)
199{
200 int ret, type;
201 struct instruction_op op;
202
203 ret = analyse_instr(&op, regs, instr);
204 type = GETTYPE(op.type);
205 return (!ret && (type == LARX || type == STCX));
206}
207
198/* 208/*
199 * Handle debug exception notifications. 209 * Handle debug exception notifications.
200 */ 210 */
211static bool stepping_handler(struct pt_regs *regs, struct perf_event *bp,
212 unsigned long addr)
213{
214 unsigned int instr = 0;
215
216 if (__get_user_inatomic(instr, (unsigned int *)regs->nip))
217 goto fail;
218
219 if (is_larx_stcx_instr(regs, instr)) {
220 printk_ratelimited("Breakpoint hit on instruction that can't be emulated."
221 " Breakpoint at 0x%lx will be disabled.\n", addr);
222 goto disable;
223 }
224
225 /* Do not emulate user-space instructions, instead single-step them */
226 if (user_mode(regs)) {
227 current->thread.last_hit_ubp = bp;
228 regs->msr |= MSR_SE;
229 return false;
230 }
231
232 if (!emulate_step(regs, instr))
233 goto fail;
234
235 return true;
236
237fail:
238 /*
239 * We've failed in reliably handling the hw-breakpoint. Unregister
240 * it and throw a warning message to let the user know about it.
241 */
242 WARN(1, "Unable to handle hardware breakpoint. Breakpoint at "
243 "0x%lx will be disabled.", addr);
244
245disable:
246 perf_event_disable_inatomic(bp);
247 return false;
248}
249
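
The larx/stcx check matters because load-and-reserve / store-conditional pairs rely on a hardware reservation that is lost when the sequence is interrupted: single-stepping or emulating the instructions inside such a loop makes the store-conditional fail every time, so the breakpointed code could never make progress. Illustration only, a classic lwarx/stwcx. retry loop written as GCC inline asm for powerpc (not taken from the kernel sources):

/* Atomically add v to *p and return the old value; sketch for illustration. */
static inline int atomic_add_return_sketch(int *p, int v)
{
	int old, tmp;

	__asm__ __volatile__(
"1:	lwarx	%0,0,%3\n"	/* load word and reserve */
"	add	%1,%0,%4\n"
"	stwcx.	%1,0,%3\n"	/* store only if the reservation is still held */
"	bne-	1b\n"		/* reservation lost: retry the whole sequence */
	: "=&r" (old), "=&r" (tmp), "+m" (*p)
	: "r" (p), "r" (v)
	: "cc", "memory");

	return old;
}
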
201int hw_breakpoint_handler(struct die_args *args) 250int hw_breakpoint_handler(struct die_args *args)
202{ 251{
203 int rc = NOTIFY_STOP; 252 int rc = NOTIFY_STOP;
204 struct perf_event *bp; 253 struct perf_event *bp;
205 struct pt_regs *regs = args->regs; 254 struct pt_regs *regs = args->regs;
206#ifndef CONFIG_PPC_8xx
207 int stepped = 1;
208 unsigned int instr;
209#endif
210 struct arch_hw_breakpoint *info; 255 struct arch_hw_breakpoint *info;
211 unsigned long dar = regs->dar; 256 unsigned long dar = regs->dar;
212 257
@@ -251,32 +296,10 @@ int hw_breakpoint_handler(struct die_args *args)
251 (dar - bp->attr.bp_addr < bp->attr.bp_len))) 296 (dar - bp->attr.bp_addr < bp->attr.bp_len)))
252 info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ; 297 info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ;
253 298
254#ifndef CONFIG_PPC_8xx 299 if (!IS_ENABLED(CONFIG_PPC_8xx) && !stepping_handler(regs, bp, info->address))
255 /* Do not emulate user-space instructions, instead single-step them */
256 if (user_mode(regs)) {
257 current->thread.last_hit_ubp = bp;
258 regs->msr |= MSR_SE;
259 goto out; 300 goto out;
260 }
261
262 stepped = 0;
263 instr = 0;
264 if (!__get_user_inatomic(instr, (unsigned int *) regs->nip))
265 stepped = emulate_step(regs, instr);
266 301
267 /* 302 /*
268 * emulate_step() could not execute it. We've failed in reliably
269 * handling the hw-breakpoint. Unregister it and throw a warning
270 * message to let the user know about it.
271 */
272 if (!stepped) {
273 WARN(1, "Unable to handle hardware breakpoint. Breakpoint at "
274 "0x%lx will be disabled.", info->address);
275 perf_event_disable_inatomic(bp);
276 goto out;
277 }
278#endif
279 /*
280 * As a policy, the callback is invoked in a 'trigger-after-execute' 303 * As a policy, the callback is invoked in a 'trigger-after-execute'
281 * fashion 304 * fashion
282 */ 305 */
diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c
index fbd2d0007c52..0276bc8c8969 100644
--- a/arch/powerpc/kernel/io-workarounds.c
+++ b/arch/powerpc/kernel/io-workarounds.c
@@ -149,8 +149,8 @@ static const struct ppc_pci_io iowa_pci_io = {
149}; 149};
150 150
151#ifdef CONFIG_PPC_INDIRECT_MMIO 151#ifdef CONFIG_PPC_INDIRECT_MMIO
152static void __iomem *iowa_ioremap(phys_addr_t addr, unsigned long size, 152void __iomem *iowa_ioremap(phys_addr_t addr, unsigned long size,
153 pgprot_t prot, void *caller) 153 pgprot_t prot, void *caller)
154{ 154{
155 struct iowa_bus *bus; 155 struct iowa_bus *bus;
156 void __iomem *res = __ioremap_caller(addr, size, prot, caller); 156 void __iomem *res = __ioremap_caller(addr, size, prot, caller);
@@ -163,20 +163,17 @@ static void __iomem *iowa_ioremap(phys_addr_t addr, unsigned long size,
163 } 163 }
164 return res; 164 return res;
165} 165}
166#else /* CONFIG_PPC_INDIRECT_MMIO */
167#define iowa_ioremap NULL
168#endif /* !CONFIG_PPC_INDIRECT_MMIO */ 166#endif /* !CONFIG_PPC_INDIRECT_MMIO */
169 167
168bool io_workaround_inited;
169
170/* Enable IO workaround */ 170/* Enable IO workaround */
171static void io_workaround_init(void) 171static void io_workaround_init(void)
172{ 172{
173 static int io_workaround_inited;
174
175 if (io_workaround_inited) 173 if (io_workaround_inited)
176 return; 174 return;
177 ppc_pci_io = iowa_pci_io; 175 ppc_pci_io = iowa_pci_io;
178 ppc_md.ioremap = iowa_ioremap; 176 io_workaround_inited = true;
179 io_workaround_inited = 1;
180} 177}
181 178
182/* Register new bus to support workaround */ 179/* Register new bus to support workaround */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 0a67ce9f827e..9704f3f76e63 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -633,11 +633,54 @@ static void iommu_table_clear(struct iommu_table *tbl)
633#endif 633#endif
634} 634}
635 635
636static void iommu_table_reserve_pages(struct iommu_table *tbl,
637 unsigned long res_start, unsigned long res_end)
638{
639 int i;
640
641 WARN_ON_ONCE(res_end < res_start);
642 /*
643 * Reserve page 0 so it will not be used for any mappings.
644 * This avoids buggy drivers that consider page 0 to be invalid
645 * to crash the machine or even lose data.
646 */
647 if (tbl->it_offset == 0)
648 set_bit(0, tbl->it_map);
649
650 tbl->it_reserved_start = res_start;
651 tbl->it_reserved_end = res_end;
652
653 /* Check if res_start..res_end isn't empty and overlaps the table */
654 if (res_start && res_end &&
655 (tbl->it_offset + tbl->it_size < res_start ||
656 res_end < tbl->it_offset))
657 return;
658
659 for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
660 set_bit(i - tbl->it_offset, tbl->it_map);
661}
662
663static void iommu_table_release_pages(struct iommu_table *tbl)
664{
665 int i;
666
667 /*
668 * In case we have reserved the first bit, we should not emit
669 * the warning below.
670 */
671 if (tbl->it_offset == 0)
672 clear_bit(0, tbl->it_map);
673
674 for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
675 clear_bit(i - tbl->it_offset, tbl->it_map);
676}
677
636/* 678/*
637 * Build a iommu_table structure. This contains a bit map which 679 * Build a iommu_table structure. This contains a bit map which
638 * is used to manage allocation of the tce space. 680 * is used to manage allocation of the tce space.
639 */ 681 */
640struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid) 682struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid,
683 unsigned long res_start, unsigned long res_end)
641{ 684{
642 unsigned long sz; 685 unsigned long sz;
643 static int welcomed = 0; 686 static int welcomed = 0;
@@ -656,13 +699,7 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
656 tbl->it_map = page_address(page); 699 tbl->it_map = page_address(page);
657 memset(tbl->it_map, 0, sz); 700 memset(tbl->it_map, 0, sz);
658 701
659 /* 702 iommu_table_reserve_pages(tbl, res_start, res_end);
660 * Reserve page 0 so it will not be used for any mappings.
661 * This avoids buggy drivers that consider page 0 to be invalid
662 * to crash the machine or even lose data.
663 */
664 if (tbl->it_offset == 0)
665 set_bit(0, tbl->it_map);
666 703
667 /* We only split the IOMMU table if we have 1GB or more of space */ 704 /* We only split the IOMMU table if we have 1GB or more of space */
668 if ((tbl->it_size << tbl->it_page_shift) >= (1UL * 1024 * 1024 * 1024)) 705 if ((tbl->it_size << tbl->it_page_shift) >= (1UL * 1024 * 1024 * 1024))
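
With this change the caller tells iommu_init_table() which TCE window, if any, must stay unmapped, and the table code marks those entries in its allocation bitmap. A simplified standalone sketch of that reservation step, with a local bitmap helper and the window assumed to lie within the table; this is not the kernel implementation:

#include <stdio.h>
#include <limits.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

static void set_bit_simple(unsigned long nr, unsigned long *map)
{
	map[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

/* Mark [res_start, res_end) as reserved in a table covering TCE entries
 * [it_offset, it_offset + it_size). */
static void reserve_window(unsigned long *map, unsigned long it_offset,
			   unsigned long it_size,
			   unsigned long res_start, unsigned long res_end)
{
	unsigned long i;

	if (it_offset == 0)
		set_bit_simple(0, map);		/* keep entry 0 unusable */

	/* skip an empty window or one that misses the table entirely */
	if (res_start >= res_end ||
	    res_end <= it_offset || res_start >= it_offset + it_size)
		return;

	for (i = res_start; i < res_end; i++)
		set_bit_simple(i - it_offset, map);
}

int main(void)
{
	unsigned long map[4] = { 0 };

	reserve_window(map, 0, 128, 8, 16);	/* reserve entries 8..15 */
	printf("%#lx\n", map[0]);		/* prints 0xff01 */
	return 0;
}
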
@@ -714,12 +751,7 @@ static void iommu_table_free(struct kref *kref)
714 return; 751 return;
715 } 752 }
716 753
717 /* 754 iommu_table_release_pages(tbl);
718 * In case we have reserved the first bit, we should not emit
719 * the warning below.
720 */
721 if (tbl->it_offset == 0)
722 clear_bit(0, tbl->it_map);
723 755
724 /* verify that table contains no entries */ 756 /* verify that table contains no entries */
725 if (!bitmap_empty(tbl->it_map, tbl->it_size)) 757 if (!bitmap_empty(tbl->it_map, tbl->it_size))
@@ -981,29 +1013,32 @@ int iommu_tce_check_gpa(unsigned long page_shift, unsigned long gpa)
981} 1013}
982EXPORT_SYMBOL_GPL(iommu_tce_check_gpa); 1014EXPORT_SYMBOL_GPL(iommu_tce_check_gpa);
983 1015
984long iommu_tce_xchg(struct mm_struct *mm, struct iommu_table *tbl, 1016extern long iommu_tce_xchg_no_kill(struct mm_struct *mm,
1017 struct iommu_table *tbl,
985 unsigned long entry, unsigned long *hpa, 1018 unsigned long entry, unsigned long *hpa,
986 enum dma_data_direction *direction) 1019 enum dma_data_direction *direction)
987{ 1020{
988 long ret; 1021 long ret;
989 unsigned long size = 0; 1022 unsigned long size = 0;
990 1023
991 ret = tbl->it_ops->exchange(tbl, entry, hpa, direction); 1024 ret = tbl->it_ops->xchg_no_kill(tbl, entry, hpa, direction, false);
992
993 if (!ret && ((*direction == DMA_FROM_DEVICE) || 1025 if (!ret && ((*direction == DMA_FROM_DEVICE) ||
994 (*direction == DMA_BIDIRECTIONAL)) && 1026 (*direction == DMA_BIDIRECTIONAL)) &&
995 !mm_iommu_is_devmem(mm, *hpa, tbl->it_page_shift, 1027 !mm_iommu_is_devmem(mm, *hpa, tbl->it_page_shift,
996 &size)) 1028 &size))
997 SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT)); 1029 SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT));
998 1030
999 /* if (unlikely(ret))
1000 pr_err("iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx ret=%d\n",
1001 __func__, hwaddr, entry << tbl->it_page_shift,
1002 hwaddr, ret); */
1003
1004 return ret; 1031 return ret;
1005} 1032}
1006EXPORT_SYMBOL_GPL(iommu_tce_xchg); 1033EXPORT_SYMBOL_GPL(iommu_tce_xchg_no_kill);
1034
1035void iommu_tce_kill(struct iommu_table *tbl,
1036 unsigned long entry, unsigned long pages)
1037{
1038 if (tbl->it_ops->tce_kill)
1039 tbl->it_ops->tce_kill(tbl, entry, pages, false);
1040}
1041EXPORT_SYMBOL_GPL(iommu_tce_kill);
1007 1042
1008int iommu_take_ownership(struct iommu_table *tbl) 1043int iommu_take_ownership(struct iommu_table *tbl)
1009{ 1044{
@@ -1017,22 +1052,21 @@ int iommu_take_ownership(struct iommu_table *tbl)
1017 * requires exchange() callback defined so if it is not 1052 * requires exchange() callback defined so if it is not
1018 * implemented, we disallow taking ownership over the table. 1053 * implemented, we disallow taking ownership over the table.
1019 */ 1054 */
1020 if (!tbl->it_ops->exchange) 1055 if (!tbl->it_ops->xchg_no_kill)
1021 return -EINVAL; 1056 return -EINVAL;
1022 1057
1023 spin_lock_irqsave(&tbl->large_pool.lock, flags); 1058 spin_lock_irqsave(&tbl->large_pool.lock, flags);
1024 for (i = 0; i < tbl->nr_pools; i++) 1059 for (i = 0; i < tbl->nr_pools; i++)
1025 spin_lock(&tbl->pools[i].lock); 1060 spin_lock(&tbl->pools[i].lock);
1026 1061
1027 if (tbl->it_offset == 0) 1062 iommu_table_release_pages(tbl);
1028 clear_bit(0, tbl->it_map);
1029 1063
1030 if (!bitmap_empty(tbl->it_map, tbl->it_size)) { 1064 if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
1031 pr_err("iommu_tce: it_map is not empty"); 1065 pr_err("iommu_tce: it_map is not empty");
1032 ret = -EBUSY; 1066 ret = -EBUSY;
1033 /* Restore bit#0 set by iommu_init_table() */ 1067 /* Undo iommu_table_release_pages, i.e. restore bit#0, etc */
1034 if (tbl->it_offset == 0) 1068 iommu_table_reserve_pages(tbl, tbl->it_reserved_start,
1035 set_bit(0, tbl->it_map); 1069 tbl->it_reserved_end);
1036 } else { 1070 } else {
1037 memset(tbl->it_map, 0xff, sz); 1071 memset(tbl->it_map, 0xff, sz);
1038 } 1072 }
@@ -1055,9 +1089,8 @@ void iommu_release_ownership(struct iommu_table *tbl)
1055 1089
1056 memset(tbl->it_map, 0, sz); 1090 memset(tbl->it_map, 0, sz);
1057 1091
1058 /* Restore bit#0 set by iommu_init_table() */ 1092 iommu_table_reserve_pages(tbl, tbl->it_reserved_start,
1059 if (tbl->it_offset == 0) 1093 tbl->it_reserved_end);
1060 set_bit(0, tbl->it_map);
1061 1094
1062 for (i = 0; i < tbl->nr_pools; i++) 1095 for (i = 0; i < tbl->nr_pools; i++)
1063 spin_unlock(&tbl->pools[i].lock); 1096 spin_unlock(&tbl->pools[i].lock);
diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index b7b3a5e4e224..617eba82531c 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -64,16 +64,17 @@
64#define KVM_INST_MTSRIN 0x7c0001e4 64#define KVM_INST_MTSRIN 0x7c0001e4
65 65
66static bool kvm_patching_worked = true; 66static bool kvm_patching_worked = true;
67char kvm_tmp[1024 * 1024]; 67extern char kvm_tmp[];
68extern char kvm_tmp_end[];
68static int kvm_tmp_index; 69static int kvm_tmp_index;
69 70
70static inline void kvm_patch_ins(u32 *inst, u32 new_inst) 71static void __init kvm_patch_ins(u32 *inst, u32 new_inst)
71{ 72{
72 *inst = new_inst; 73 *inst = new_inst;
73 flush_icache_range((ulong)inst, (ulong)inst + 4); 74 flush_icache_range((ulong)inst, (ulong)inst + 4);
74} 75}
75 76
76static void kvm_patch_ins_ll(u32 *inst, long addr, u32 rt) 77static void __init kvm_patch_ins_ll(u32 *inst, long addr, u32 rt)
77{ 78{
78#ifdef CONFIG_64BIT 79#ifdef CONFIG_64BIT
79 kvm_patch_ins(inst, KVM_INST_LD | rt | (addr & 0x0000fffc)); 80 kvm_patch_ins(inst, KVM_INST_LD | rt | (addr & 0x0000fffc));
@@ -82,7 +83,7 @@ static void kvm_patch_ins_ll(u32 *inst, long addr, u32 rt)
82#endif 83#endif
83} 84}
84 85
85static void kvm_patch_ins_ld(u32 *inst, long addr, u32 rt) 86static void __init kvm_patch_ins_ld(u32 *inst, long addr, u32 rt)
86{ 87{
87#ifdef CONFIG_64BIT 88#ifdef CONFIG_64BIT
88 kvm_patch_ins(inst, KVM_INST_LD | rt | (addr & 0x0000fffc)); 89 kvm_patch_ins(inst, KVM_INST_LD | rt | (addr & 0x0000fffc));
@@ -91,12 +92,12 @@ static void kvm_patch_ins_ld(u32 *inst, long addr, u32 rt)
91#endif 92#endif
92} 93}
93 94
94static void kvm_patch_ins_lwz(u32 *inst, long addr, u32 rt) 95static void __init kvm_patch_ins_lwz(u32 *inst, long addr, u32 rt)
95{ 96{
96 kvm_patch_ins(inst, KVM_INST_LWZ | rt | (addr & 0x0000ffff)); 97 kvm_patch_ins(inst, KVM_INST_LWZ | rt | (addr & 0x0000ffff));
97} 98}
98 99
99static void kvm_patch_ins_std(u32 *inst, long addr, u32 rt) 100static void __init kvm_patch_ins_std(u32 *inst, long addr, u32 rt)
100{ 101{
101#ifdef CONFIG_64BIT 102#ifdef CONFIG_64BIT
102 kvm_patch_ins(inst, KVM_INST_STD | rt | (addr & 0x0000fffc)); 103 kvm_patch_ins(inst, KVM_INST_STD | rt | (addr & 0x0000fffc));
@@ -105,17 +106,17 @@ static void kvm_patch_ins_std(u32 *inst, long addr, u32 rt)
105#endif 106#endif
106} 107}
107 108
108static void kvm_patch_ins_stw(u32 *inst, long addr, u32 rt) 109static void __init kvm_patch_ins_stw(u32 *inst, long addr, u32 rt)
109{ 110{
110 kvm_patch_ins(inst, KVM_INST_STW | rt | (addr & 0x0000fffc)); 111 kvm_patch_ins(inst, KVM_INST_STW | rt | (addr & 0x0000fffc));
111} 112}
112 113
113static void kvm_patch_ins_nop(u32 *inst) 114static void __init kvm_patch_ins_nop(u32 *inst)
114{ 115{
115 kvm_patch_ins(inst, KVM_INST_NOP); 116 kvm_patch_ins(inst, KVM_INST_NOP);
116} 117}
117 118
118static void kvm_patch_ins_b(u32 *inst, int addr) 119static void __init kvm_patch_ins_b(u32 *inst, int addr)
119{ 120{
120#if defined(CONFIG_RELOCATABLE) && defined(CONFIG_PPC_BOOK3S) 121#if defined(CONFIG_RELOCATABLE) && defined(CONFIG_PPC_BOOK3S)
121 /* On relocatable kernels interrupts handlers and our code 122 /* On relocatable kernels interrupts handlers and our code
@@ -128,11 +129,11 @@ static void kvm_patch_ins_b(u32 *inst, int addr)
128 kvm_patch_ins(inst, KVM_INST_B | (addr & KVM_INST_B_MASK)); 129 kvm_patch_ins(inst, KVM_INST_B | (addr & KVM_INST_B_MASK));
129} 130}
130 131
131static u32 *kvm_alloc(int len) 132static u32 * __init kvm_alloc(int len)
132{ 133{
133 u32 *p; 134 u32 *p;
134 135
135 if ((kvm_tmp_index + len) > ARRAY_SIZE(kvm_tmp)) { 136 if ((kvm_tmp_index + len) > (kvm_tmp_end - kvm_tmp)) {
136 printk(KERN_ERR "KVM: No more space (%d + %d)\n", 137 printk(KERN_ERR "KVM: No more space (%d + %d)\n",
137 kvm_tmp_index, len); 138 kvm_tmp_index, len);
138 kvm_patching_worked = false; 139 kvm_patching_worked = false;
@@ -151,7 +152,7 @@ extern u32 kvm_emulate_mtmsrd_orig_ins_offs;
151extern u32 kvm_emulate_mtmsrd_len; 152extern u32 kvm_emulate_mtmsrd_len;
152extern u32 kvm_emulate_mtmsrd[]; 153extern u32 kvm_emulate_mtmsrd[];
153 154
154static void kvm_patch_ins_mtmsrd(u32 *inst, u32 rt) 155static void __init kvm_patch_ins_mtmsrd(u32 *inst, u32 rt)
155{ 156{
156 u32 *p; 157 u32 *p;
157 int distance_start; 158 int distance_start;
@@ -204,7 +205,7 @@ extern u32 kvm_emulate_mtmsr_orig_ins_offs;
204extern u32 kvm_emulate_mtmsr_len; 205extern u32 kvm_emulate_mtmsr_len;
205extern u32 kvm_emulate_mtmsr[]; 206extern u32 kvm_emulate_mtmsr[];
206 207
207static void kvm_patch_ins_mtmsr(u32 *inst, u32 rt) 208static void __init kvm_patch_ins_mtmsr(u32 *inst, u32 rt)
208{ 209{
209 u32 *p; 210 u32 *p;
210 int distance_start; 211 int distance_start;
@@ -265,7 +266,7 @@ extern u32 kvm_emulate_wrtee_orig_ins_offs;
265extern u32 kvm_emulate_wrtee_len; 266extern u32 kvm_emulate_wrtee_len;
266extern u32 kvm_emulate_wrtee[]; 267extern u32 kvm_emulate_wrtee[];
267 268
268static void kvm_patch_ins_wrtee(u32 *inst, u32 rt, int imm_one) 269static void __init kvm_patch_ins_wrtee(u32 *inst, u32 rt, int imm_one)
269{ 270{
270 u32 *p; 271 u32 *p;
271 int distance_start; 272 int distance_start;
@@ -322,7 +323,7 @@ extern u32 kvm_emulate_wrteei_0_branch_offs;
322extern u32 kvm_emulate_wrteei_0_len; 323extern u32 kvm_emulate_wrteei_0_len;
323extern u32 kvm_emulate_wrteei_0[]; 324extern u32 kvm_emulate_wrteei_0[];
324 325
325static void kvm_patch_ins_wrteei_0(u32 *inst) 326static void __init kvm_patch_ins_wrteei_0(u32 *inst)
326{ 327{
327 u32 *p; 328 u32 *p;
328 int distance_start; 329 int distance_start;
@@ -363,7 +364,7 @@ extern u32 kvm_emulate_mtsrin_orig_ins_offs;
363extern u32 kvm_emulate_mtsrin_len; 364extern u32 kvm_emulate_mtsrin_len;
364extern u32 kvm_emulate_mtsrin[]; 365extern u32 kvm_emulate_mtsrin[];
365 366
366static void kvm_patch_ins_mtsrin(u32 *inst, u32 rt, u32 rb) 367static void __init kvm_patch_ins_mtsrin(u32 *inst, u32 rt, u32 rb)
367{ 368{
368 u32 *p; 369 u32 *p;
369 int distance_start; 370 int distance_start;
@@ -399,7 +400,7 @@ static void kvm_patch_ins_mtsrin(u32 *inst, u32 rt, u32 rb)
399 400
400#endif 401#endif
401 402
402static void kvm_map_magic_page(void *data) 403static void __init kvm_map_magic_page(void *data)
403{ 404{
404 u32 *features = data; 405 u32 *features = data;
405 406
@@ -414,7 +415,7 @@ static void kvm_map_magic_page(void *data)
414 *features = out[0]; 415 *features = out[0];
415} 416}
416 417
417static void kvm_check_ins(u32 *inst, u32 features) 418static void __init kvm_check_ins(u32 *inst, u32 features)
418{ 419{
419 u32 _inst = *inst; 420 u32 _inst = *inst;
420 u32 inst_no_rt = _inst & ~KVM_MASK_RT; 421 u32 inst_no_rt = _inst & ~KVM_MASK_RT;
@@ -658,7 +659,7 @@ static void kvm_check_ins(u32 *inst, u32 features)
658extern u32 kvm_template_start[]; 659extern u32 kvm_template_start[];
659extern u32 kvm_template_end[]; 660extern u32 kvm_template_end[];
660 661
661static void kvm_use_magic_page(void) 662static void __init kvm_use_magic_page(void)
662{ 663{
663 u32 *p; 664 u32 *p;
664 u32 *start, *end; 665 u32 *start, *end;
@@ -699,25 +700,13 @@ static void kvm_use_magic_page(void)
699 kvm_patching_worked ? "worked" : "failed"); 700 kvm_patching_worked ? "worked" : "failed");
700} 701}
701 702
702static __init void kvm_free_tmp(void)
703{
704 /*
705 * Inform kmemleak about the hole in the .bss section since the
706 * corresponding pages will be unmapped with DEBUG_PAGEALLOC=y.
707 */
708 kmemleak_free_part(&kvm_tmp[kvm_tmp_index],
709 ARRAY_SIZE(kvm_tmp) - kvm_tmp_index);
710 free_reserved_area(&kvm_tmp[kvm_tmp_index],
711 &kvm_tmp[ARRAY_SIZE(kvm_tmp)], -1, NULL);
712}
713
714static int __init kvm_guest_init(void) 703static int __init kvm_guest_init(void)
715{ 704{
716 if (!kvm_para_available()) 705 if (!kvm_para_available())
717 goto free_tmp; 706 return 0;
718 707
719 if (!epapr_paravirt_enabled) 708 if (!epapr_paravirt_enabled)
720 goto free_tmp; 709 return 0;
721 710
722 if (kvm_para_has_feature(KVM_FEATURE_MAGIC_PAGE)) 711 if (kvm_para_has_feature(KVM_FEATURE_MAGIC_PAGE))
723 kvm_use_magic_page(); 712 kvm_use_magic_page();
@@ -727,9 +716,6 @@ static int __init kvm_guest_init(void)
727 powersave_nap = 1; 716 powersave_nap = 1;
728#endif 717#endif
729 718
730free_tmp:
731 kvm_free_tmp();
732
733 return 0; 719 return 0;
734} 720}
735 721
diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S
index eb2568f583ae..7af6f8b50c5d 100644
--- a/arch/powerpc/kernel/kvm_emul.S
+++ b/arch/powerpc/kernel/kvm_emul.S
@@ -192,6 +192,8 @@ kvm_emulate_mtmsr_orig_ins_offs:
192kvm_emulate_mtmsr_len: 192kvm_emulate_mtmsr_len:
193 .long (kvm_emulate_mtmsr_end - kvm_emulate_mtmsr) / 4 193 .long (kvm_emulate_mtmsr_end - kvm_emulate_mtmsr) / 4
194 194
195#ifdef CONFIG_BOOKE
196
195/* also used for wrteei 1 */ 197/* also used for wrteei 1 */
196.global kvm_emulate_wrtee 198.global kvm_emulate_wrtee
197kvm_emulate_wrtee: 199kvm_emulate_wrtee:
@@ -285,6 +287,10 @@ kvm_emulate_wrteei_0_branch_offs:
285kvm_emulate_wrteei_0_len: 287kvm_emulate_wrteei_0_len:
286 .long (kvm_emulate_wrteei_0_end - kvm_emulate_wrteei_0) / 4 288 .long (kvm_emulate_wrteei_0_end - kvm_emulate_wrteei_0) / 4
287 289
290#endif /* CONFIG_BOOKE */
291
292#ifdef CONFIG_PPC_BOOK3S_32
293
288.global kvm_emulate_mtsrin 294.global kvm_emulate_mtsrin
289kvm_emulate_mtsrin: 295kvm_emulate_mtsrin:
290 296
@@ -334,5 +340,15 @@ kvm_emulate_mtsrin_orig_ins_offs:
334kvm_emulate_mtsrin_len: 340kvm_emulate_mtsrin_len:
335 .long (kvm_emulate_mtsrin_end - kvm_emulate_mtsrin) / 4 341 .long (kvm_emulate_mtsrin_end - kvm_emulate_mtsrin) / 4
336 342
343#endif /* CONFIG_PPC_BOOK3S_32 */
344
345 .balign 4
346 .global kvm_tmp
347kvm_tmp:
348 .space (64 * 1024)
349
350.global kvm_tmp_end
351kvm_tmp_end:
352
337.global kvm_template_end 353.global kvm_template_end
338kvm_template_end: 354kvm_template_end:
diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c
index 18481b0e2788..04a7cba58eff 100644
--- a/arch/powerpc/kernel/machine_kexec_64.c
+++ b/arch/powerpc/kernel/machine_kexec_64.c
@@ -29,6 +29,8 @@
29#include <asm/smp.h> 29#include <asm/smp.h>
30#include <asm/hw_breakpoint.h> 30#include <asm/hw_breakpoint.h>
31#include <asm/asm-prototypes.h> 31#include <asm/asm-prototypes.h>
32#include <asm/svm.h>
33#include <asm/ultravisor.h>
32 34
33int default_machine_kexec_prepare(struct kimage *image) 35int default_machine_kexec_prepare(struct kimage *image)
34{ 36{
@@ -327,6 +329,13 @@ void default_machine_kexec(struct kimage *image)
327#ifdef CONFIG_PPC_PSERIES 329#ifdef CONFIG_PPC_PSERIES
328 kexec_paca.lppaca_ptr = NULL; 330 kexec_paca.lppaca_ptr = NULL;
329#endif 331#endif
332
333 if (is_secure_guest() && !(image->preserve_context ||
334 image->type == KEXEC_TYPE_CRASH)) {
335 uv_unshare_all_pages();
336 printk("kexec: Unshared all shared pages.\n");
337 }
338
330 paca_ptrs[kexec_paca.paca_index] = &kexec_paca; 339 paca_ptrs[kexec_paca.paca_index] = &kexec_paca;
331 340
332 setup_paca(&kexec_paca); 341 setup_paca(&kexec_paca);
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index b18df633eae9..34c1001e9e8b 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -33,13 +33,18 @@ static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT],
33 mce_ue_event_queue); 33 mce_ue_event_queue);
34 34
35static void machine_check_process_queued_event(struct irq_work *work); 35static void machine_check_process_queued_event(struct irq_work *work);
36void machine_check_ue_event(struct machine_check_event *evt); 36static void machine_check_ue_irq_work(struct irq_work *work);
37static void machine_check_ue_event(struct machine_check_event *evt);
37static void machine_process_ue_event(struct work_struct *work); 38static void machine_process_ue_event(struct work_struct *work);
38 39
39static struct irq_work mce_event_process_work = { 40static struct irq_work mce_event_process_work = {
40 .func = machine_check_process_queued_event, 41 .func = machine_check_process_queued_event,
41}; 42};
42 43
44static struct irq_work mce_ue_event_irq_work = {
45 .func = machine_check_ue_irq_work,
46};
47
43DECLARE_WORK(mce_ue_event_work, machine_process_ue_event); 48DECLARE_WORK(mce_ue_event_work, machine_process_ue_event);
44 49
45static void mce_set_error_info(struct machine_check_event *mce, 50static void mce_set_error_info(struct machine_check_event *mce,
@@ -144,6 +149,7 @@ void save_mce_event(struct pt_regs *regs, long handled,
144 if (phys_addr != ULONG_MAX) { 149 if (phys_addr != ULONG_MAX) {
145 mce->u.ue_error.physical_address_provided = true; 150 mce->u.ue_error.physical_address_provided = true;
146 mce->u.ue_error.physical_address = phys_addr; 151 mce->u.ue_error.physical_address = phys_addr;
152 mce->u.ue_error.ignore_event = mce_err->ignore_event;
147 machine_check_ue_event(mce); 153 machine_check_ue_event(mce);
148 } 154 }
149 } 155 }
@@ -199,11 +205,15 @@ void release_mce_event(void)
199 get_mce_event(NULL, true); 205 get_mce_event(NULL, true);
200} 206}
201 207
208static void machine_check_ue_irq_work(struct irq_work *work)
209{
210 schedule_work(&mce_ue_event_work);
211}
202 212
203/* 213/*
204 * Queue up the MCE event which then can be handled later. 214 * Queue up the MCE event which then can be handled later.
205 */ 215 */
206void machine_check_ue_event(struct machine_check_event *evt) 216static void machine_check_ue_event(struct machine_check_event *evt)
207{ 217{
208 int index; 218 int index;
209 219
@@ -216,7 +226,7 @@ void machine_check_ue_event(struct machine_check_event *evt)
216 memcpy(this_cpu_ptr(&mce_ue_event_queue[index]), evt, sizeof(*evt)); 226 memcpy(this_cpu_ptr(&mce_ue_event_queue[index]), evt, sizeof(*evt));
217 227
218 /* Queue work to process this event later. */ 228 /* Queue work to process this event later. */
219 schedule_work(&mce_ue_event_work); 229 irq_work_queue(&mce_ue_event_irq_work);
220} 230}
221 231
222/* 232/*
@@ -257,8 +267,17 @@ static void machine_process_ue_event(struct work_struct *work)
257 /* 267 /*
258 * This should probably queued elsewhere, but 268 * This should probably queued elsewhere, but
259 * oh! well 269 * oh! well
270 *
 271 * Don't report this machine check because the caller has
 272 * asked us to ignore the event; it has a fixup handler which
 273 * will do the appropriate error handling and reporting.
260 */ 274 */
261 if (evt->error_type == MCE_ERROR_TYPE_UE) { 275 if (evt->error_type == MCE_ERROR_TYPE_UE) {
276 if (evt->u.ue_error.ignore_event) {
277 __this_cpu_dec(mce_ue_count);
278 continue;
279 }
280
262 if (evt->u.ue_error.physical_address_provided) { 281 if (evt->u.ue_error.physical_address_provided) {
263 unsigned long pfn; 282 unsigned long pfn;
264 283
@@ -292,6 +311,12 @@ static void machine_check_process_queued_event(struct irq_work *work)
292 while (__this_cpu_read(mce_queue_count) > 0) { 311 while (__this_cpu_read(mce_queue_count) > 0) {
293 index = __this_cpu_read(mce_queue_count) - 1; 312 index = __this_cpu_read(mce_queue_count) - 1;
294 evt = this_cpu_ptr(&mce_event_queue[index]); 313 evt = this_cpu_ptr(&mce_event_queue[index]);
314
315 if (evt->error_type == MCE_ERROR_TYPE_UE &&
316 evt->u.ue_error.ignore_event) {
317 __this_cpu_dec(mce_queue_count);
318 continue;
319 }
295 machine_check_print_event_info(evt, false, false); 320 machine_check_print_event_info(evt, false, false);
296 __this_cpu_dec(mce_queue_count); 321 __this_cpu_dec(mce_queue_count);
297 } 322 }
@@ -300,7 +325,7 @@ static void machine_check_process_queued_event(struct irq_work *work)
300void machine_check_print_event_info(struct machine_check_event *evt, 325void machine_check_print_event_info(struct machine_check_event *evt,
301 bool user_mode, bool in_guest) 326 bool user_mode, bool in_guest)
302{ 327{
303 const char *level, *sevstr, *subtype, *err_type; 328 const char *level, *sevstr, *subtype, *err_type, *initiator;
304 uint64_t ea = 0, pa = 0; 329 uint64_t ea = 0, pa = 0;
305 int n = 0; 330 int n = 0;
306 char dar_str[50]; 331 char dar_str[50];
@@ -385,6 +410,28 @@ void machine_check_print_event_info(struct machine_check_event *evt,
385 break; 410 break;
386 } 411 }
387 412
413 switch(evt->initiator) {
414 case MCE_INITIATOR_CPU:
415 initiator = "CPU";
416 break;
417 case MCE_INITIATOR_PCI:
418 initiator = "PCI";
419 break;
420 case MCE_INITIATOR_ISA:
421 initiator = "ISA";
422 break;
423 case MCE_INITIATOR_MEMORY:
424 initiator = "Memory";
425 break;
426 case MCE_INITIATOR_POWERMGM:
427 initiator = "Power Management";
428 break;
429 case MCE_INITIATOR_UNKNOWN:
430 default:
431 initiator = "Unknown";
432 break;
433 }
434
388 switch (evt->error_type) { 435 switch (evt->error_type) {
389 case MCE_ERROR_TYPE_UE: 436 case MCE_ERROR_TYPE_UE:
390 err_type = "UE"; 437 err_type = "UE";
@@ -451,6 +498,14 @@ void machine_check_print_event_info(struct machine_check_event *evt,
451 if (evt->u.link_error.effective_address_provided) 498 if (evt->u.link_error.effective_address_provided)
452 ea = evt->u.link_error.effective_address; 499 ea = evt->u.link_error.effective_address;
453 break; 500 break;
501 case MCE_ERROR_TYPE_DCACHE:
502 err_type = "D-Cache";
503 subtype = "Unknown";
504 break;
505 case MCE_ERROR_TYPE_ICACHE:
506 err_type = "I-Cache";
507 subtype = "Unknown";
508 break;
454 default: 509 default:
455 case MCE_ERROR_TYPE_UNKNOWN: 510 case MCE_ERROR_TYPE_UNKNOWN:
456 err_type = "Unknown"; 511 err_type = "Unknown";
@@ -483,9 +538,17 @@ void machine_check_print_event_info(struct machine_check_event *evt,
483 level, evt->cpu, evt->srr0, (void *)evt->srr0, pa_str); 538 level, evt->cpu, evt->srr0, (void *)evt->srr0, pa_str);
484 } 539 }
485 540
541 printk("%sMCE: CPU%d: Initiator %s\n", level, evt->cpu, initiator);
542
486 subtype = evt->error_class < ARRAY_SIZE(mc_error_class) ? 543 subtype = evt->error_class < ARRAY_SIZE(mc_error_class) ?
487 mc_error_class[evt->error_class] : "Unknown"; 544 mc_error_class[evt->error_class] : "Unknown";
488 printk("%sMCE: CPU%d: %s\n", level, evt->cpu, subtype); 545 printk("%sMCE: CPU%d: %s\n", level, evt->cpu, subtype);
546
547#ifdef CONFIG_PPC_BOOK3S_64
548 /* Display faulty slb contents for SLB errors. */
549 if (evt->error_type == MCE_ERROR_TYPE_SLB)
550 slb_dump_contents(local_paca->mce_faulty_slbs);
551#endif
489} 552}
490EXPORT_SYMBOL_GPL(machine_check_print_event_info); 553EXPORT_SYMBOL_GPL(machine_check_print_event_info);
491 554
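The machine_check_ue_event() hunk above stops calling schedule_work() directly: the machine check handler runs in an NMI-like context where the workqueue API is not safe to call, so it now only queues an irq_work, and the irq_work callback does the schedule_work() once interrupts are enabled again. A minimal sketch of that two-stage deferral pattern (the example_* names below are illustrative, not from the tree):

#include <linux/irq_work.h>
#include <linux/workqueue.h>

static void example_event_work_fn(struct work_struct *work)
{
        /* Process context: may sleep, take mutexes, write to the log, etc. */
}
static DECLARE_WORK(example_event_work, example_event_work_fn);

static void example_event_irq_work_fn(struct irq_work *work)
{
        /* Runs with interrupts enabled shortly after the exception
         * returns, so calling schedule_work() is safe here. */
        schedule_work(&example_event_work);
}
static struct irq_work example_event_irq_work = {
        .func = example_event_irq_work_fn,
};

void example_nmi_level_handler(void)
{
        /* NMI-like context: only queue the irq_work and return. */
        irq_work_queue(&example_event_irq_work);
}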
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index a814d2dfb5b0..1cbf7f1a4e3d 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -11,6 +11,7 @@
11 11
12#include <linux/types.h> 12#include <linux/types.h>
13#include <linux/ptrace.h> 13#include <linux/ptrace.h>
14#include <linux/extable.h>
14#include <asm/mmu.h> 15#include <asm/mmu.h>
15#include <asm/mce.h> 16#include <asm/mce.h>
16#include <asm/machdep.h> 17#include <asm/machdep.h>
@@ -18,6 +19,7 @@
18#include <asm/pte-walk.h> 19#include <asm/pte-walk.h>
19#include <asm/sstep.h> 20#include <asm/sstep.h>
20#include <asm/exception-64s.h> 21#include <asm/exception-64s.h>
22#include <asm/extable.h>
21 23
22/* 24/*
23 * Convert an address related to an mm to a PFN. NOTE: we are in real 25 * Convert an address related to an mm to a PFN. NOTE: we are in real
@@ -26,7 +28,8 @@
26unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr) 28unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr)
27{ 29{
28 pte_t *ptep; 30 pte_t *ptep;
29 unsigned long flags; 31 unsigned int shift;
32 unsigned long pfn, flags;
30 struct mm_struct *mm; 33 struct mm_struct *mm;
31 34
32 if (user_mode(regs)) 35 if (user_mode(regs))
@@ -35,14 +38,23 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr)
35 mm = &init_mm; 38 mm = &init_mm;
36 39
37 local_irq_save(flags); 40 local_irq_save(flags);
38 if (mm == current->mm) 41 ptep = __find_linux_pte(mm->pgd, addr, NULL, &shift);
39 ptep = find_current_mm_pte(mm->pgd, addr, NULL, NULL); 42
40 else 43 if (!ptep || pte_special(*ptep)) {
41 ptep = find_init_mm_pte(addr, NULL); 44 pfn = ULONG_MAX;
45 goto out;
46 }
47
48 if (shift <= PAGE_SHIFT)
49 pfn = pte_pfn(*ptep);
50 else {
51 unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
52 pfn = pte_pfn(__pte(pte_val(*ptep) | (addr & rpnmask)));
53 }
54
55out:
42 local_irq_restore(flags); 56 local_irq_restore(flags);
43 if (!ptep || pte_special(*ptep)) 57 return pfn;
44 return ULONG_MAX;
45 return pte_pfn(*ptep);
46} 58}
47 59
48/* flush SLBs and reload */ 60/* flush SLBs and reload */
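The reworked addr_to_pfn() above switches to __find_linux_pte() and uses the returned shift so that addresses covered by hugepage mappings resolve to the right 4K pfn: the PTE carries the pfn of the start of the huge page, and the address bits between PAGE_SHIFT and shift select the subpage, which the kernel folds in by OR-ing (addr & rpnmask) into the PTE value before pte_pfn(). A standalone sketch of the equivalent arithmetic, with made-up values purely for illustration:

#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)

int main(void)
{
        unsigned long addr     = 0xc000000001234567UL; /* faulting address */
        unsigned int shift     = 24;                    /* 16MB hugepage mapping */
        unsigned long base_pfn = 0x80000;               /* pfn of the hugepage start */
        unsigned long rpnmask  = (1UL << shift) - PAGE_SIZE;

        /* pfn of the 4K page within the hugepage that contains addr */
        unsigned long pfn = base_pfn + ((addr & rpnmask) >> PAGE_SHIFT);

        printf("pfn = 0x%lx\n", pfn);   /* prints 0x80234 */
        return 0;
}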
@@ -344,7 +356,7 @@ static const struct mce_derror_table mce_p9_derror_table[] = {
344 MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, 356 MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true },
345{ 0, false, 0, 0, 0, 0, 0 } }; 357{ 0, false, 0, 0, 0, 0, 0 } };
346 358
347static int mce_find_instr_ea_and_pfn(struct pt_regs *regs, uint64_t *addr, 359static int mce_find_instr_ea_and_phys(struct pt_regs *regs, uint64_t *addr,
348 uint64_t *phys_addr) 360 uint64_t *phys_addr)
349{ 361{
350 /* 362 /*
@@ -397,6 +409,8 @@ static int mce_handle_ierror(struct pt_regs *regs,
397 /* attempt to correct the error */ 409 /* attempt to correct the error */
398 switch (table[i].error_type) { 410 switch (table[i].error_type) {
399 case MCE_ERROR_TYPE_SLB: 411 case MCE_ERROR_TYPE_SLB:
412 if (local_paca->in_mce == 1)
413 slb_save_contents(local_paca->mce_faulty_slbs);
400 handled = mce_flush(MCE_FLUSH_SLB); 414 handled = mce_flush(MCE_FLUSH_SLB);
401 break; 415 break;
402 case MCE_ERROR_TYPE_ERAT: 416 case MCE_ERROR_TYPE_ERAT:
@@ -482,6 +496,8 @@ static int mce_handle_derror(struct pt_regs *regs,
482 /* attempt to correct the error */ 496 /* attempt to correct the error */
483 switch (table[i].error_type) { 497 switch (table[i].error_type) {
484 case MCE_ERROR_TYPE_SLB: 498 case MCE_ERROR_TYPE_SLB:
499 if (local_paca->in_mce == 1)
500 slb_save_contents(local_paca->mce_faulty_slbs);
485 if (mce_flush(MCE_FLUSH_SLB)) 501 if (mce_flush(MCE_FLUSH_SLB))
486 handled = 1; 502 handled = 1;
487 break; 503 break;
@@ -541,7 +557,8 @@ static int mce_handle_derror(struct pt_regs *regs,
541 * kernel/exception-64s.h 557 * kernel/exception-64s.h
542 */ 558 */
543 if (get_paca()->in_mce < MAX_MCE_DEPTH) 559 if (get_paca()->in_mce < MAX_MCE_DEPTH)
544 mce_find_instr_ea_and_pfn(regs, addr, phys_addr); 560 mce_find_instr_ea_and_phys(regs, addr,
561 phys_addr);
545 } 562 }
546 found = 1; 563 found = 1;
547 } 564 }
@@ -558,9 +575,18 @@ static int mce_handle_derror(struct pt_regs *regs,
558 return 0; 575 return 0;
559} 576}
560 577
561static long mce_handle_ue_error(struct pt_regs *regs) 578static long mce_handle_ue_error(struct pt_regs *regs,
579 struct mce_error_info *mce_err)
562{ 580{
563 long handled = 0; 581 long handled = 0;
582 const struct exception_table_entry *entry;
583
584 entry = search_kernel_exception_table(regs->nip);
585 if (entry) {
586 mce_err->ignore_event = true;
587 regs->nip = extable_fixup(entry);
588 return 1;
589 }
564 590
565 /* 591 /*
566 * On specific SCOM read via MMIO we may get a machine check 592 * On specific SCOM read via MMIO we may get a machine check
@@ -593,7 +619,7 @@ static long mce_handle_error(struct pt_regs *regs,
593 &phys_addr); 619 &phys_addr);
594 620
595 if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE) 621 if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE)
596 handled = mce_handle_ue_error(regs); 622 handled = mce_handle_ue_error(regs, &mce_err);
597 623
598 save_mce_event(regs, handled, &mce_err, regs->nip, addr, phys_addr); 624 save_mce_event(regs, handled, &mce_err, regs->nip, addr, phys_addr);
599 625
diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
index fe4bd321730e..82df4b09e79f 100644
--- a/arch/powerpc/kernel/misc_32.S
+++ b/arch/powerpc/kernel/misc_32.S
@@ -292,22 +292,20 @@ _GLOBAL(flush_instruction_cache)
292 iccci 0,r3 292 iccci 0,r3
293#endif 293#endif
294#elif defined(CONFIG_FSL_BOOKE) 294#elif defined(CONFIG_FSL_BOOKE)
295BEGIN_FTR_SECTION 295#ifdef CONFIG_E200
296 mfspr r3,SPRN_L1CSR0 296 mfspr r3,SPRN_L1CSR0
297 ori r3,r3,L1CSR0_CFI|L1CSR0_CLFC 297 ori r3,r3,L1CSR0_CFI|L1CSR0_CLFC
298 /* msync; isync recommended here */ 298 /* msync; isync recommended here */
299 mtspr SPRN_L1CSR0,r3 299 mtspr SPRN_L1CSR0,r3
300 isync 300 isync
301 blr 301 blr
302END_FTR_SECTION_IFSET(CPU_FTR_UNIFIED_ID_CACHE) 302#endif
303 mfspr r3,SPRN_L1CSR1 303 mfspr r3,SPRN_L1CSR1
304 ori r3,r3,L1CSR1_ICFI|L1CSR1_ICLFR 304 ori r3,r3,L1CSR1_ICFI|L1CSR1_ICLFR
305 mtspr SPRN_L1CSR1,r3 305 mtspr SPRN_L1CSR1,r3
306#elif defined(CONFIG_PPC_BOOK3S_601)
307 blr /* for 601, do nothing */
306#else 308#else
307 mfspr r3,SPRN_PVR
308 rlwinm r3,r3,16,16,31
309 cmpwi 0,r3,1
310 beqlr /* for 601, do nothing */
311 /* 603/604 processor - use invalidate-all bit in HID0 */ 309 /* 603/604 processor - use invalidate-all bit in HID0 */
312 mfspr r3,SPRN_HID0 310 mfspr r3,SPRN_HID0
313 ori r3,r3,HID0_ICFI 311 ori r3,r3,HID0_ICFI
@@ -326,10 +324,10 @@ EXPORT_SYMBOL(flush_instruction_cache)
326 * flush_icache_range(unsigned long start, unsigned long stop) 324 * flush_icache_range(unsigned long start, unsigned long stop)
327 */ 325 */
328_GLOBAL(flush_icache_range) 326_GLOBAL(flush_icache_range)
329BEGIN_FTR_SECTION 327#if defined(CONFIG_PPC_BOOK3S_601) || defined(CONFIG_E200)
330 PURGE_PREFETCHED_INS 328 PURGE_PREFETCHED_INS
331 blr /* for 601, do nothing */ 329 blr /* for 601 and e200, do nothing */
332END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) 330#else
333 rlwinm r3,r3,0,0,31 - L1_CACHE_SHIFT 331 rlwinm r3,r3,0,0,31 - L1_CACHE_SHIFT
334 subf r4,r3,r4 332 subf r4,r3,r4
335 addi r4,r4,L1_CACHE_BYTES - 1 333 addi r4,r4,L1_CACHE_BYTES - 1
@@ -355,6 +353,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
355 sync /* additional sync needed on g4 */ 353 sync /* additional sync needed on g4 */
356 isync 354 isync
357 blr 355 blr
356#endif
358_ASM_NOKPROBE_SYMBOL(flush_icache_range) 357_ASM_NOKPROBE_SYMBOL(flush_icache_range)
359EXPORT_SYMBOL(flush_icache_range) 358EXPORT_SYMBOL(flush_icache_range)
360 359
@@ -362,15 +361,15 @@ EXPORT_SYMBOL(flush_icache_range)
362 * Flush a particular page from the data cache to RAM. 361 * Flush a particular page from the data cache to RAM.
363 * Note: this is necessary because the instruction cache does *not* 362 * Note: this is necessary because the instruction cache does *not*
364 * snoop from the data cache. 363 * snoop from the data cache.
365 * This is a no-op on the 601 which has a unified cache. 364 * This is a no-op on the 601 and e200 which have a unified cache.
366 * 365 *
367 * void __flush_dcache_icache(void *page) 366 * void __flush_dcache_icache(void *page)
368 */ 367 */
369_GLOBAL(__flush_dcache_icache) 368_GLOBAL(__flush_dcache_icache)
370BEGIN_FTR_SECTION 369#if defined(CONFIG_PPC_BOOK3S_601) || defined(CONFIG_E200)
371 PURGE_PREFETCHED_INS 370 PURGE_PREFETCHED_INS
372 blr 371 blr
373END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) 372#else
374 rlwinm r3,r3,0,0,31-PAGE_SHIFT /* Get page base address */ 373 rlwinm r3,r3,0,0,31-PAGE_SHIFT /* Get page base address */
375 li r4,PAGE_SIZE/L1_CACHE_BYTES /* Number of lines in a page */ 374 li r4,PAGE_SIZE/L1_CACHE_BYTES /* Number of lines in a page */
376 mtctr r4 375 mtctr r4
@@ -398,6 +397,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_44x)
398 sync 397 sync
399 isync 398 isync
400 blr 399 blr
400#endif
401 401
402#ifndef CONFIG_BOOKE 402#ifndef CONFIG_BOOKE
403/* 403/*
@@ -409,10 +409,10 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_44x)
409 * void __flush_dcache_icache_phys(unsigned long physaddr) 409 * void __flush_dcache_icache_phys(unsigned long physaddr)
410 */ 410 */
411_GLOBAL(__flush_dcache_icache_phys) 411_GLOBAL(__flush_dcache_icache_phys)
412BEGIN_FTR_SECTION 412#if defined(CONFIG_PPC_BOOK3S_601) || defined(CONFIG_E200)
413 PURGE_PREFETCHED_INS 413 PURGE_PREFETCHED_INS
414 blr /* for 601, do nothing */ 414 blr /* for 601 and e200, do nothing */
415END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) 415#else
416 mfmsr r10 416 mfmsr r10
417 rlwinm r0,r10,0,28,26 /* clear DR */ 417 rlwinm r0,r10,0,28,26 /* clear DR */
418 mtmsr r0 418 mtmsr r0
@@ -433,6 +433,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
433 mtmsr r10 /* restore DR */ 433 mtmsr r10 /* restore DR */
434 isync 434 isync
435 blr 435 blr
436#endif
436#endif /* CONFIG_BOOKE */ 437#endif /* CONFIG_BOOKE */
437 438
438/* 439/*
@@ -452,7 +453,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
452 stwu r9,16(r3) 453 stwu r9,16(r3)
453 454
454_GLOBAL(copy_page) 455_GLOBAL(copy_page)
456 rlwinm r5, r3, 0, L1_CACHE_BYTES - 1
455 addi r3,r3,-4 457 addi r3,r3,-4
458
4590: twnei r5, 0 /* WARN if r3 is not cache aligned */
460 EMIT_BUG_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING
461
456 addi r4,r4,-4 462 addi r4,r4,-4
457 463
458 li r5,4 464 li r5,4
diff --git a/arch/powerpc/kernel/note.S b/arch/powerpc/kernel/note.S
new file mode 100644
index 000000000000..bcdad15395dd
--- /dev/null
+++ b/arch/powerpc/kernel/note.S
@@ -0,0 +1,40 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * PowerPC ELF notes.
4 *
5 * Copyright 2019, IBM Corporation
6 */
7
8#include <linux/elfnote.h>
9#include <asm/elfnote.h>
10
11/*
12 * Ultravisor-capable bit (PowerNV only).
13 *
14 * Bit 0 indicates that the powerpc kernel binary knows how to run in an
15 * ultravisor-enabled system.
16 *
17 * In an ultravisor-enabled system, some machine resources are now controlled
18 * by the ultravisor. If the kernel is not ultravisor-capable, but it ends up
19 * being run on a machine with ultravisor, the kernel will probably crash
20 * trying to access ultravisor resources. For instance, it may crash in early
21 * boot trying to set the partition table entry 0.
22 *
23 * In an ultravisor-enabled system, a bootloader could warn the user or prevent
24 * the kernel from being run if the PowerPC ultravisor capability doesn't exist
25 * or the Ultravisor-capable bit is not set.
26 */
27#ifdef CONFIG_PPC_POWERNV
28#define PPCCAP_ULTRAVISOR_BIT (1 << 0)
29#else
30#define PPCCAP_ULTRAVISOR_BIT 0
31#endif
32
33/*
34 * Add the PowerPC Capabilities in the binary ELF note. It is a bitmap that
35 * can be used to advertise kernel capabilities to userland.
36 */
37#define PPC_CAPABILITIES_BITMAP (PPCCAP_ULTRAVISOR_BIT)
38
39ELFNOTE(PowerPC, PPC_ELFNOTE_CAPABILITIES,
40 .long PPC_CAPABILITIES_BITMAP)
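The note is aimed at consumers outside the kernel, e.g. a bootloader that wants to warn about or refuse a non-ultravisor-capable kernel on an ultravisor-enabled machine. A hedged sketch of such a consumer follows; it assumes only the standard ELF note layout (Elf64_Nhdr, then the name and descriptor, each padded to 4 bytes), skips endianness and bounds hardening, and is my own illustration rather than code from the tree:

#include <elf.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define ALIGN4(x)       (((x) + 3u) & ~3u)

/* Scan a buffer of ELF notes for the "PowerPC" capabilities note and
 * test bit 0, i.e. PPCCAP_ULTRAVISOR_BIT. */
bool kernel_is_ultravisor_capable(const void *notes, size_t len)
{
        const uint8_t *p = notes, *end = p + len;

        while (p + sizeof(Elf64_Nhdr) <= end) {
                const Elf64_Nhdr *nhdr = (const Elf64_Nhdr *)p;
                const char *name = (const char *)(nhdr + 1);
                const uint8_t *desc = (const uint8_t *)name + ALIGN4(nhdr->n_namesz);

                if (nhdr->n_namesz == sizeof("PowerPC") &&
                    !memcmp(name, "PowerPC", sizeof("PowerPC")) &&
                    nhdr->n_descsz >= sizeof(uint32_t)) {
                        uint32_t caps;

                        memcpy(&caps, desc, sizeof(caps));
                        return caps & 1;
                }
                p = desc + ALIGN4(nhdr->n_descsz);
        }
        return false;
}

After a build, readelf -n vmlinux should list the PowerPC capabilities note alongside the kernel's other notes.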
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index e3ad8aa4730d..949eceb254d8 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -14,6 +14,8 @@
14#include <asm/sections.h> 14#include <asm/sections.h>
15#include <asm/pgtable.h> 15#include <asm/pgtable.h>
16#include <asm/kexec.h> 16#include <asm/kexec.h>
17#include <asm/svm.h>
18#include <asm/ultravisor.h>
17 19
18#include "setup.h" 20#include "setup.h"
19 21
@@ -52,6 +54,43 @@ static void *__init alloc_paca_data(unsigned long size, unsigned long align,
52 54
53#ifdef CONFIG_PPC_PSERIES 55#ifdef CONFIG_PPC_PSERIES
54 56
57#define LPPACA_SIZE 0x400
58
59static void *__init alloc_shared_lppaca(unsigned long size, unsigned long align,
60 unsigned long limit, int cpu)
61{
62 size_t shared_lppaca_total_size = PAGE_ALIGN(nr_cpu_ids * LPPACA_SIZE);
63 static unsigned long shared_lppaca_size;
64 static void *shared_lppaca;
65 void *ptr;
66
67 if (!shared_lppaca) {
68 memblock_set_bottom_up(true);
69
70 shared_lppaca =
71 memblock_alloc_try_nid(shared_lppaca_total_size,
72 PAGE_SIZE, MEMBLOCK_LOW_LIMIT,
73 limit, NUMA_NO_NODE);
74 if (!shared_lppaca)
75 panic("cannot allocate shared data");
76
77 memblock_set_bottom_up(false);
78 uv_share_page(PHYS_PFN(__pa(shared_lppaca)),
79 shared_lppaca_total_size >> PAGE_SHIFT);
80 }
81
82 ptr = shared_lppaca + shared_lppaca_size;
83 shared_lppaca_size += size;
84
85 /*
86 * This is very early in boot, so no harm done if the kernel crashes at
87 * this point.
88 */
89 BUG_ON(shared_lppaca_size >= shared_lppaca_total_size);
90
91 return ptr;
92}
93
55/* 94/*
56 * See asm/lppaca.h for more detail. 95 * See asm/lppaca.h for more detail.
57 * 96 *
@@ -65,7 +104,7 @@ static inline void init_lppaca(struct lppaca *lppaca)
65 104
66 *lppaca = (struct lppaca) { 105 *lppaca = (struct lppaca) {
67 .desc = cpu_to_be32(0xd397d781), /* "LpPa" */ 106 .desc = cpu_to_be32(0xd397d781), /* "LpPa" */
68 .size = cpu_to_be16(0x400), 107 .size = cpu_to_be16(LPPACA_SIZE),
69 .fpregs_in_use = 1, 108 .fpregs_in_use = 1,
70 .slb_count = cpu_to_be16(64), 109 .slb_count = cpu_to_be16(64),
71 .vmxregs_in_use = 0, 110 .vmxregs_in_use = 0,
@@ -75,19 +114,22 @@ static inline void init_lppaca(struct lppaca *lppaca)
75static struct lppaca * __init new_lppaca(int cpu, unsigned long limit) 114static struct lppaca * __init new_lppaca(int cpu, unsigned long limit)
76{ 115{
77 struct lppaca *lp; 116 struct lppaca *lp;
78 size_t size = 0x400;
79 117
80 BUILD_BUG_ON(size < sizeof(struct lppaca)); 118 BUILD_BUG_ON(sizeof(struct lppaca) > LPPACA_SIZE);
81 119
82 if (early_cpu_has_feature(CPU_FTR_HVMODE)) 120 if (early_cpu_has_feature(CPU_FTR_HVMODE))
83 return NULL; 121 return NULL;
84 122
85 lp = alloc_paca_data(size, 0x400, limit, cpu); 123 if (is_secure_guest())
124 lp = alloc_shared_lppaca(LPPACA_SIZE, 0x400, limit, cpu);
125 else
126 lp = alloc_paca_data(LPPACA_SIZE, 0x400, limit, cpu);
127
86 init_lppaca(lp); 128 init_lppaca(lp);
87 129
88 return lp; 130 return lp;
89} 131}
90#endif /* CONFIG_PPC_BOOK3S */ 132#endif /* CONFIG_PPC_PSERIES */
91 133
92#ifdef CONFIG_PPC_BOOK3S_64 134#ifdef CONFIG_PPC_BOOK3S_64
93 135
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index f627e15bb43c..1c448cf25506 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -1379,10 +1379,6 @@ void __init pcibios_resource_survey(void)
1379 pr_debug("PCI: Assigning unassigned resources...\n"); 1379 pr_debug("PCI: Assigning unassigned resources...\n");
1380 pci_assign_unassigned_resources(); 1380 pci_assign_unassigned_resources();
1381 } 1381 }
1382
1383 /* Call machine dependent fixup */
1384 if (ppc_md.pcibios_fixup)
1385 ppc_md.pcibios_fixup();
1386} 1382}
1387 1383
1388/* This is used by the PCI hotplug driver to allocate resource 1384/* This is used by the PCI hotplug driver to allocate resource
diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c
index 0b0cf8168b47..fc62c4bc47b1 100644
--- a/arch/powerpc/kernel/pci-hotplug.c
+++ b/arch/powerpc/kernel/pci-hotplug.c
@@ -55,11 +55,18 @@ EXPORT_SYMBOL_GPL(pci_find_bus_by_node);
55void pcibios_release_device(struct pci_dev *dev) 55void pcibios_release_device(struct pci_dev *dev)
56{ 56{
57 struct pci_controller *phb = pci_bus_to_host(dev->bus); 57 struct pci_controller *phb = pci_bus_to_host(dev->bus);
58 struct pci_dn *pdn = pci_get_pdn(dev);
58 59
59 eeh_remove_device(dev); 60 eeh_remove_device(dev);
60 61
61 if (phb->controller_ops.release_device) 62 if (phb->controller_ops.release_device)
62 phb->controller_ops.release_device(dev); 63 phb->controller_ops.release_device(dev);
64
65 /* free()ing the pci_dn has been deferred to us, do it now */
66 if (pdn && (pdn->flags & PCI_DN_FLAG_DEAD)) {
67 pci_dbg(dev, "freeing dead pdn\n");
68 kfree(pdn);
69 }
63} 70}
64 71
65/** 72/**
diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c
index 50942a1d1a5f..b49e1060a3bf 100644
--- a/arch/powerpc/kernel/pci_32.c
+++ b/arch/powerpc/kernel/pci_32.c
@@ -263,6 +263,10 @@ static int __init pcibios_init(void)
263 /* Call common code to handle resource allocation */ 263 /* Call common code to handle resource allocation */
264 pcibios_resource_survey(); 264 pcibios_resource_survey();
265 265
266 /* Call machine dependent fixup */
267 if (ppc_md.pcibios_fixup)
268 ppc_md.pcibios_fixup();
269
266 /* Call machine dependent post-init code */ 270 /* Call machine dependent post-init code */
267 if (ppc_md.pcibios_after_init) 271 if (ppc_md.pcibios_after_init)
268 ppc_md.pcibios_after_init(); 272 ppc_md.pcibios_after_init();
diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c
index b7030b1189d0..f83d1f69b1dd 100644
--- a/arch/powerpc/kernel/pci_64.c
+++ b/arch/powerpc/kernel/pci_64.c
@@ -54,14 +54,20 @@ static int __init pcibios_init(void)
54 pci_add_flags(PCI_ENABLE_PROC_DOMAINS | PCI_COMPAT_DOMAIN_0); 54 pci_add_flags(PCI_ENABLE_PROC_DOMAINS | PCI_COMPAT_DOMAIN_0);
55 55
56 /* Scan all of the recorded PCI controllers. */ 56 /* Scan all of the recorded PCI controllers. */
57 list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { 57 list_for_each_entry_safe(hose, tmp, &hose_list, list_node)
58 pcibios_scan_phb(hose); 58 pcibios_scan_phb(hose);
59 pci_bus_add_devices(hose->bus);
60 }
61 59
62 /* Call common code to handle resource allocation */ 60 /* Call common code to handle resource allocation */
63 pcibios_resource_survey(); 61 pcibios_resource_survey();
64 62
63 /* Add devices. */
64 list_for_each_entry_safe(hose, tmp, &hose_list, list_node)
65 pci_bus_add_devices(hose->bus);
66
67 /* Call machine dependent fixup */
68 if (ppc_md.pcibios_fixup)
69 ppc_md.pcibios_fixup();
70
65 printk(KERN_DEBUG "PCI: Probing PCI hardware done\n"); 71 printk(KERN_DEBUG "PCI: Probing PCI hardware done\n");
66 72
67 return 0; 73 return 0;
diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
index c4c8c237a106..9524009ca1ae 100644
--- a/arch/powerpc/kernel/pci_dn.c
+++ b/arch/powerpc/kernel/pci_dn.c
@@ -323,6 +323,7 @@ void pci_remove_device_node_info(struct device_node *dn)
323{ 323{
324 struct pci_dn *pdn = dn ? PCI_DN(dn) : NULL; 324 struct pci_dn *pdn = dn ? PCI_DN(dn) : NULL;
325 struct device_node *parent; 325 struct device_node *parent;
326 struct pci_dev *pdev;
326#ifdef CONFIG_EEH 327#ifdef CONFIG_EEH
327 struct eeh_dev *edev = pdn_to_eeh_dev(pdn); 328 struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
328 329
@@ -336,12 +337,28 @@ void pci_remove_device_node_info(struct device_node *dn)
336 WARN_ON(!list_empty(&pdn->child_list)); 337 WARN_ON(!list_empty(&pdn->child_list));
337 list_del(&pdn->list); 338 list_del(&pdn->list);
338 339
340 /* Drop the parent pci_dn's ref to our backing dt node */
339 parent = of_get_parent(dn); 341 parent = of_get_parent(dn);
340 if (parent) 342 if (parent)
341 of_node_put(parent); 343 of_node_put(parent);
342 344
343 dn->data = NULL; 345 /*
344 kfree(pdn); 346 * At this point we *might* still have a pci_dev that was
347 * instantiated from this pci_dn. So defer free()ing it until
348 * the pci_dev's release function is called.
349 */
350 pdev = pci_get_domain_bus_and_slot(pdn->phb->global_number,
351 pdn->busno, pdn->devfn);
352 if (pdev) {
353 /* NB: pdev has a ref to dn */
354 pci_dbg(pdev, "marked pdn (from %pOF) as dead\n", dn);
355 pdn->flags |= PCI_DN_FLAG_DEAD;
356 } else {
357 dn->data = NULL;
358 kfree(pdn);
359 }
360
361 pci_dev_put(pdev);
345} 362}
346EXPORT_SYMBOL_GPL(pci_remove_device_node_info); 363EXPORT_SYMBOL_GPL(pci_remove_device_node_info);
347 364
diff --git a/arch/powerpc/kernel/pci_of_scan.c b/arch/powerpc/kernel/pci_of_scan.c
index 409c6c1beabf..f91d7e94872e 100644
--- a/arch/powerpc/kernel/pci_of_scan.c
+++ b/arch/powerpc/kernel/pci_of_scan.c
@@ -34,31 +34,75 @@ static u32 get_int_prop(struct device_node *np, const char *name, u32 def)
34 * pci_parse_of_flags - Parse the flags cell of a device tree PCI address 34 * pci_parse_of_flags - Parse the flags cell of a device tree PCI address
35 * @addr0: value of 1st cell of a device tree PCI address. 35 * @addr0: value of 1st cell of a device tree PCI address.
36 * @bridge: Set this flag if the address is from a bridge 'ranges' property 36 * @bridge: Set this flag if the address is from a bridge 'ranges' property
37 *
38 * PCI Bus Binding to IEEE Std 1275-1994
39 *
40 * Bit# 33222222 22221111 11111100 00000000
41 * 10987654 32109876 54321098 76543210
42 * phys.hi cell: npt000ss bbbbbbbb dddddfff rrrrrrrr
43 * phys.mid cell: hhhhhhhh hhhhhhhh hhhhhhhh hhhhhhhh
44 * phys.lo cell: llllllll llllllll llllllll llllllll
45 *
46 * where:
47 * n is 0 if the address is relocatable, 1 otherwise
48 * p is 1 if the addressable region is "prefetchable", 0 otherwise
49 * t is 1 if the address is aliased (for non-relocatable I/O),
 50 * below 1 MB (for Memory), or below 64 KB (for relocatable I/O).
51 * ss is the space code, denoting the address space:
52 * 00 denotes Configuration Space
53 * 01 denotes I/O Space
54 * 10 denotes 32-bit-address Memory Space
55 * 11 denotes 64-bit-address Memory Space
56 * bbbbbbbb is the 8-bit Bus Number
57 * ddddd is the 5-bit Device Number
58 * fff is the 3-bit Function Number
59 * rrrrrrrr is the 8-bit Register Number
37 */ 60 */
61#define OF_PCI_ADDR0_SPACE(ss) (((ss)&3)<<24)
62#define OF_PCI_ADDR0_SPACE_CFG OF_PCI_ADDR0_SPACE(0)
63#define OF_PCI_ADDR0_SPACE_IO OF_PCI_ADDR0_SPACE(1)
64#define OF_PCI_ADDR0_SPACE_MMIO32 OF_PCI_ADDR0_SPACE(2)
65#define OF_PCI_ADDR0_SPACE_MMIO64 OF_PCI_ADDR0_SPACE(3)
66#define OF_PCI_ADDR0_SPACE_MASK OF_PCI_ADDR0_SPACE(3)
67#define OF_PCI_ADDR0_RELOC (1UL<<31)
68#define OF_PCI_ADDR0_PREFETCH (1UL<<30)
69#define OF_PCI_ADDR0_ALIAS (1UL<<29)
70#define OF_PCI_ADDR0_BUS 0x00FF0000UL
71#define OF_PCI_ADDR0_DEV 0x0000F800UL
72#define OF_PCI_ADDR0_FN 0x00000700UL
73#define OF_PCI_ADDR0_BARREG 0x000000FFUL
74
38unsigned int pci_parse_of_flags(u32 addr0, int bridge) 75unsigned int pci_parse_of_flags(u32 addr0, int bridge)
39{ 76{
40 unsigned int flags = 0; 77 unsigned int flags = 0, as = addr0 & OF_PCI_ADDR0_SPACE_MASK;
41 78
42 if (addr0 & 0x02000000) { 79 if (as == OF_PCI_ADDR0_SPACE_MMIO32 || as == OF_PCI_ADDR0_SPACE_MMIO64) {
43 flags = IORESOURCE_MEM | PCI_BASE_ADDRESS_SPACE_MEMORY; 80 flags = IORESOURCE_MEM | PCI_BASE_ADDRESS_SPACE_MEMORY;
44 flags |= (addr0 >> 22) & PCI_BASE_ADDRESS_MEM_TYPE_64; 81
45 if (flags & PCI_BASE_ADDRESS_MEM_TYPE_64) 82 if (as == OF_PCI_ADDR0_SPACE_MMIO64)
46 flags |= IORESOURCE_MEM_64; 83 flags |= PCI_BASE_ADDRESS_MEM_TYPE_64 | IORESOURCE_MEM_64;
47 flags |= (addr0 >> 28) & PCI_BASE_ADDRESS_MEM_TYPE_1M; 84
48 if (addr0 & 0x40000000) 85 if (addr0 & OF_PCI_ADDR0_ALIAS)
49 flags |= IORESOURCE_PREFETCH 86 flags |= PCI_BASE_ADDRESS_MEM_TYPE_1M;
50 | PCI_BASE_ADDRESS_MEM_PREFETCH; 87
88 if (addr0 & OF_PCI_ADDR0_PREFETCH)
89 flags |= IORESOURCE_PREFETCH |
90 PCI_BASE_ADDRESS_MEM_PREFETCH;
91
51 /* Note: We don't know whether the ROM has been left enabled 92 /* Note: We don't know whether the ROM has been left enabled
52 * by the firmware or not. We mark it as disabled (ie, we do 93 * by the firmware or not. We mark it as disabled (ie, we do
53 * not set the IORESOURCE_ROM_ENABLE flag) for now rather than 94 * not set the IORESOURCE_ROM_ENABLE flag) for now rather than
54 * do a config space read, it will be force-enabled if needed 95 * do a config space read, it will be force-enabled if needed
55 */ 96 */
56 if (!bridge && (addr0 & 0xff) == 0x30) 97 if (!bridge && (addr0 & OF_PCI_ADDR0_BARREG) == PCI_ROM_ADDRESS)
57 flags |= IORESOURCE_READONLY; 98 flags |= IORESOURCE_READONLY;
58 } else if (addr0 & 0x01000000) 99
100 } else if (as == OF_PCI_ADDR0_SPACE_IO)
59 flags = IORESOURCE_IO | PCI_BASE_ADDRESS_SPACE_IO; 101 flags = IORESOURCE_IO | PCI_BASE_ADDRESS_SPACE_IO;
102
60 if (flags) 103 if (flags)
61 flags |= IORESOURCE_SIZEALIGN; 104 flags |= IORESOURCE_SIZEALIGN;
105
62 return flags; 106 return flags;
63} 107}
64 108
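The block comment and the OF_PCI_ADDR0_* masks added above replace the bare magic numbers that pci_parse_of_flags() used to test. A small standalone illustration of decoding a phys.hi cell with that layout (the sample value and the program are illustrative; only the field layout comes from the binding):

#include <stdio.h>
#include <stdint.h>

#define OF_PCI_ADDR0_SPACE_MASK (3u << 24)
#define OF_PCI_ADDR0_PREFETCH   (1u << 30)
#define OF_PCI_ADDR0_BUS        0x00FF0000u
#define OF_PCI_ADDR0_DEV        0x0000F800u
#define OF_PCI_ADDR0_FN         0x00000700u
#define OF_PCI_ADDR0_BARREG     0x000000FFu

int main(void)
{
        uint32_t addr0 = 0x82000810;    /* sample phys.hi cell */

        printf("space code : %u\n", (addr0 & OF_PCI_ADDR0_SPACE_MASK) >> 24);
        printf("prefetch   : %u\n", !!(addr0 & OF_PCI_ADDR0_PREFETCH));
        printf("bus        : %u\n", (addr0 & OF_PCI_ADDR0_BUS) >> 16);
        printf("device     : %u\n", (addr0 & OF_PCI_ADDR0_DEV) >> 11);
        printf("function   : %u\n", (addr0 & OF_PCI_ADDR0_FN) >> 8);
        printf("register   : 0x%02x\n", addr0 & OF_PCI_ADDR0_BARREG);
        return 0;
}

For 0x82000810 this prints space code 2 (32-bit memory), bus 0, device 1, function 0, register 0x10, i.e. BAR0 of device 1.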
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 7a84c9f1778e..639ceae7da9d 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1587,8 +1587,9 @@ static void setup_ksp_vsid(struct task_struct *p, unsigned long sp)
1587/* 1587/*
1588 * Copy architecture-specific thread state 1588 * Copy architecture-specific thread state
1589 */ 1589 */
1590int copy_thread(unsigned long clone_flags, unsigned long usp, 1590int copy_thread_tls(unsigned long clone_flags, unsigned long usp,
1591 unsigned long kthread_arg, struct task_struct *p) 1591 unsigned long kthread_arg, struct task_struct *p,
1592 unsigned long tls)
1592{ 1593{
1593 struct pt_regs *childregs, *kregs; 1594 struct pt_regs *childregs, *kregs;
1594 extern void ret_from_fork(void); 1595 extern void ret_from_fork(void);
@@ -1629,10 +1630,10 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
1629 if (clone_flags & CLONE_SETTLS) { 1630 if (clone_flags & CLONE_SETTLS) {
1630#ifdef CONFIG_PPC64 1631#ifdef CONFIG_PPC64
1631 if (!is_32bit_task()) 1632 if (!is_32bit_task())
1632 childregs->gpr[13] = childregs->gpr[6]; 1633 childregs->gpr[13] = tls;
1633 else 1634 else
1634#endif 1635#endif
1635 childregs->gpr[2] = childregs->gpr[6]; 1636 childregs->gpr[2] = tls;
1636 } 1637 }
1637 1638
1638 f = ret_from_fork; 1639 f = ret_from_fork;
@@ -2033,10 +2034,8 @@ void show_stack(struct task_struct *tsk, unsigned long *stack)
2033 int count = 0; 2034 int count = 0;
2034 int firstframe = 1; 2035 int firstframe = 1;
2035#ifdef CONFIG_FUNCTION_GRAPH_TRACER 2036#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2036 struct ftrace_ret_stack *ret_stack; 2037 unsigned long ret_addr;
2037 extern void return_to_handler(void); 2038 int ftrace_idx = 0;
2038 unsigned long rth = (unsigned long)return_to_handler;
2039 int curr_frame = 0;
2040#endif 2039#endif
2041 2040
2042 if (tsk == NULL) 2041 if (tsk == NULL)
@@ -2065,15 +2064,10 @@ void show_stack(struct task_struct *tsk, unsigned long *stack)
2065 if (!firstframe || ip != lr) { 2064 if (!firstframe || ip != lr) {
2066 printk("["REG"] ["REG"] %pS", sp, ip, (void *)ip); 2065 printk("["REG"] ["REG"] %pS", sp, ip, (void *)ip);
2067#ifdef CONFIG_FUNCTION_GRAPH_TRACER 2066#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2068 if ((ip == rth) && curr_frame >= 0) { 2067 ret_addr = ftrace_graph_ret_addr(current,
2069 ret_stack = ftrace_graph_get_ret_stack(current, 2068 &ftrace_idx, ip, stack);
2070 curr_frame++); 2069 if (ret_addr != ip)
2071 if (ret_stack) 2070 pr_cont(" (%pS)", (void *)ret_addr);
2072 pr_cont(" (%pS)",
2073 (void *)ret_stack->ret);
2074 else
2075 curr_frame = -1;
2076 }
2077#endif 2071#endif
2078 if (firstframe) 2072 if (firstframe)
2079 pr_cont(" (unreliable)"); 2073 pr_cont(" (unreliable)");
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 7159e791a70d..6620f37abe73 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -55,6 +55,7 @@
55#include <asm/firmware.h> 55#include <asm/firmware.h>
56#include <asm/dt_cpu_ftrs.h> 56#include <asm/dt_cpu_ftrs.h>
57#include <asm/drmem.h> 57#include <asm/drmem.h>
58#include <asm/ultravisor.h>
58 59
59#include <mm/mmu_decl.h> 60#include <mm/mmu_decl.h>
60 61
@@ -702,9 +703,12 @@ void __init early_init_devtree(void *params)
702#ifdef CONFIG_PPC_POWERNV 703#ifdef CONFIG_PPC_POWERNV
703 /* Some machines might need OPAL info for debugging, grab it now. */ 704 /* Some machines might need OPAL info for debugging, grab it now. */
704 of_scan_flat_dt(early_init_dt_scan_opal, NULL); 705 of_scan_flat_dt(early_init_dt_scan_opal, NULL);
706
707 /* Scan tree for ultravisor feature */
708 of_scan_flat_dt(early_init_dt_scan_ultravisor, NULL);
705#endif 709#endif
706 710
707#ifdef CONFIG_FA_DUMP 711#if defined(CONFIG_FA_DUMP) || defined(CONFIG_PRESERVE_FA_DUMP)
708 /* scan tree to see if dump is active during last boot */ 712 /* scan tree to see if dump is active during last boot */
709 of_scan_flat_dt(early_init_dt_scan_fw_dump, NULL); 713 of_scan_flat_dt(early_init_dt_scan_fw_dump, NULL);
710#endif 714#endif
@@ -731,7 +735,7 @@ void __init early_init_devtree(void *params)
731 if (PHYSICAL_START > MEMORY_START) 735 if (PHYSICAL_START > MEMORY_START)
732 memblock_reserve(MEMORY_START, 0x8000); 736 memblock_reserve(MEMORY_START, 0x8000);
733 reserve_kdump_trampoline(); 737 reserve_kdump_trampoline();
734#ifdef CONFIG_FA_DUMP 738#if defined(CONFIG_FA_DUMP) || defined(CONFIG_PRESERVE_FA_DUMP)
735 /* 739 /*
736 * If we fail to reserve memory for firmware-assisted dump then 740 * If we fail to reserve memory for firmware-assisted dump then
737 * fallback to kexec based kdump. 741 * fallback to kexec based kdump.
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 514707ef6779..a4e7762dd286 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -40,6 +40,7 @@
40#include <asm/sections.h> 40#include <asm/sections.h>
41#include <asm/machdep.h> 41#include <asm/machdep.h>
42#include <asm/asm-prototypes.h> 42#include <asm/asm-prototypes.h>
43#include <asm/ultravisor-api.h>
43 44
44#include <linux/linux_logo.h> 45#include <linux/linux_logo.h>
45 46
@@ -94,7 +95,7 @@ static int of_workarounds __prombss;
94#define PROM_BUG() do { \ 95#define PROM_BUG() do { \
95 prom_printf("kernel BUG at %s line 0x%x!\n", \ 96 prom_printf("kernel BUG at %s line 0x%x!\n", \
96 __FILE__, __LINE__); \ 97 __FILE__, __LINE__); \
97 __asm__ __volatile__(".long " BUG_ILLEGAL_INSTR); \ 98 __builtin_trap(); \
98} while (0) 99} while (0)
99 100
100#ifdef DEBUG_PROM 101#ifdef DEBUG_PROM
@@ -171,6 +172,10 @@ static bool __prombss prom_radix_disable;
171static bool __prombss prom_xive_disable; 172static bool __prombss prom_xive_disable;
172#endif 173#endif
173 174
175#ifdef CONFIG_PPC_SVM
176static bool __prombss prom_svm_enable;
177#endif
178
174struct platform_support { 179struct platform_support {
175 bool hash_mmu; 180 bool hash_mmu;
176 bool radix_mmu; 181 bool radix_mmu;
@@ -812,6 +817,17 @@ static void __init early_cmdline_parse(void)
812 prom_debug("XIVE disabled from cmdline\n"); 817 prom_debug("XIVE disabled from cmdline\n");
813 } 818 }
814#endif /* CONFIG_PPC_PSERIES */ 819#endif /* CONFIG_PPC_PSERIES */
820
821#ifdef CONFIG_PPC_SVM
822 opt = prom_strstr(prom_cmd_line, "svm=");
823 if (opt) {
824 bool val;
825
826 opt += sizeof("svm=") - 1;
827 if (!prom_strtobool(opt, &val))
828 prom_svm_enable = val;
829 }
830#endif /* CONFIG_PPC_SVM */
815} 831}
816 832
817#ifdef CONFIG_PPC_PSERIES 833#ifdef CONFIG_PPC_PSERIES
@@ -1712,6 +1728,43 @@ static void __init prom_close_stdin(void)
1712 } 1728 }
1713} 1729}
1714 1730
1731#ifdef CONFIG_PPC_SVM
1732static int prom_rtas_hcall(uint64_t args)
1733{
1734 register uint64_t arg1 asm("r3") = H_RTAS;
1735 register uint64_t arg2 asm("r4") = args;
1736
1737 asm volatile("sc 1\n" : "=r" (arg1) :
1738 "r" (arg1),
1739 "r" (arg2) :);
1740 return arg1;
1741}
1742
1743static struct rtas_args __prombss os_term_args;
1744
1745static void __init prom_rtas_os_term(char *str)
1746{
1747 phandle rtas_node;
1748 __be32 val;
1749 u32 token;
1750
1751 prom_debug("%s: start...\n", __func__);
1752 rtas_node = call_prom("finddevice", 1, 1, ADDR("/rtas"));
1753 prom_debug("rtas_node: %x\n", rtas_node);
1754 if (!PHANDLE_VALID(rtas_node))
1755 return;
1756
1757 val = 0;
1758 prom_getprop(rtas_node, "ibm,os-term", &val, sizeof(val));
1759 token = be32_to_cpu(val);
1760 prom_debug("ibm,os-term: %x\n", token);
1761 if (token == 0)
1762 prom_panic("Could not get token for ibm,os-term\n");
1763 os_term_args.token = cpu_to_be32(token);
1764 prom_rtas_hcall((uint64_t)&os_term_args);
1765}
1766#endif /* CONFIG_PPC_SVM */
1767
1715/* 1768/*
1716 * Allocate room for and instantiate RTAS 1769 * Allocate room for and instantiate RTAS
1717 */ 1770 */
@@ -3168,6 +3221,46 @@ static void unreloc_toc(void)
3168#endif 3221#endif
3169#endif 3222#endif
3170 3223
3224#ifdef CONFIG_PPC_SVM
3225/*
3226 * Perform the Enter Secure Mode ultracall.
3227 */
3228static int enter_secure_mode(unsigned long kbase, unsigned long fdt)
3229{
3230 register unsigned long r3 asm("r3") = UV_ESM;
3231 register unsigned long r4 asm("r4") = kbase;
3232 register unsigned long r5 asm("r5") = fdt;
3233
3234 asm volatile("sc 2" : "+r"(r3) : "r"(r4), "r"(r5));
3235
3236 return r3;
3237}
3238
3239/*
3240 * Call the Ultravisor to transfer us to secure memory if we have an ESM blob.
3241 */
3242static void setup_secure_guest(unsigned long kbase, unsigned long fdt)
3243{
3244 int ret;
3245
3246 if (!prom_svm_enable)
3247 return;
3248
3249 /* Switch to secure mode. */
3250 prom_printf("Switching to secure mode.\n");
3251
3252 ret = enter_secure_mode(kbase, fdt);
3253 if (ret != U_SUCCESS) {
3254 prom_printf("Returned %d from switching to secure mode.\n", ret);
3255 prom_rtas_os_term("Switch to secure mode failed.\n");
3256 }
3257}
3258#else
3259static void setup_secure_guest(unsigned long kbase, unsigned long fdt)
3260{
3261}
3262#endif /* CONFIG_PPC_SVM */
3263
3171/* 3264/*
3172 * We enter here early on, when the Open Firmware prom is still 3265 * We enter here early on, when the Open Firmware prom is still
3173 * handling exceptions and the MMU hash table for us. 3266 * handling exceptions and the MMU hash table for us.
@@ -3366,6 +3459,9 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4,
3366 unreloc_toc(); 3459 unreloc_toc();
3367#endif 3460#endif
3368 3461
3462 /* Move to secure memory if we're supposed to be secure guests. */
3463 setup_secure_guest(kbase, hdr);
3464
3369 __start(hdr, kbase, 0, 0, 0, 0, 0); 3465 __start(hdr, kbase, 0, 0, 0, 0, 0);
3370 3466
3371 return 0; 3467 return 0;
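Taken together, the prom_init changes make the secure-mode transition opt-in: early_cmdline_parse() sets prom_svm_enable only when the command line carries an svm= value that prom_strtobool() accepts, and setup_secure_guest() then issues the UV_ESM ultracall just before __start(). A guest intended to run as a secure VM would therefore be started with a command line along these lines (illustrative; the remaining options are whatever the distro normally uses):

        root=/dev/vda2 ro svm=on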
diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index 5faf0a64c92b..c5fa251b8950 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -16,6 +16,7 @@
16#include <linux/capability.h> 16#include <linux/capability.h>
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/cpu.h> 18#include <linux/cpu.h>
19#include <linux/sched.h>
19#include <linux/smp.h> 20#include <linux/smp.h>
20#include <linux/completion.h> 21#include <linux/completion.h>
21#include <linux/cpumask.h> 22#include <linux/cpumask.h>
@@ -871,15 +872,17 @@ static int rtas_cpu_state_change_mask(enum rtas_cpu_state state,
871 return 0; 872 return 0;
872 873
873 for_each_cpu(cpu, cpus) { 874 for_each_cpu(cpu, cpus) {
875 struct device *dev = get_cpu_device(cpu);
876
874 switch (state) { 877 switch (state) {
875 case DOWN: 878 case DOWN:
876 cpuret = cpu_down(cpu); 879 cpuret = device_offline(dev);
877 break; 880 break;
878 case UP: 881 case UP:
879 cpuret = cpu_up(cpu); 882 cpuret = device_online(dev);
880 break; 883 break;
881 } 884 }
882 if (cpuret) { 885 if (cpuret < 0) {
883 pr_debug("%s: cpu_%s for cpu#%d returned %d.\n", 886 pr_debug("%s: cpu_%s for cpu#%d returned %d.\n",
884 __func__, 887 __func__,
885 ((state == UP) ? "up" : "down"), 888 ((state == UP) ? "up" : "down"),
@@ -896,6 +899,7 @@ static int rtas_cpu_state_change_mask(enum rtas_cpu_state state,
896 cpumask_clear_cpu(cpu, cpus); 899 cpumask_clear_cpu(cpu, cpus);
897 } 900 }
898 } 901 }
902 cond_resched();
899 } 903 }
900 904
901 return ret; 905 return ret;
@@ -922,13 +926,11 @@ int rtas_online_cpus_mask(cpumask_var_t cpus)
922 926
923 return ret; 927 return ret;
924} 928}
925EXPORT_SYMBOL(rtas_online_cpus_mask);
926 929
927int rtas_offline_cpus_mask(cpumask_var_t cpus) 930int rtas_offline_cpus_mask(cpumask_var_t cpus)
928{ 931{
929 return rtas_cpu_state_change_mask(DOWN, cpus); 932 return rtas_cpu_state_change_mask(DOWN, cpus);
930} 933}
931EXPORT_SYMBOL(rtas_offline_cpus_mask);
932 934
933int rtas_ibm_suspend_me(u64 handle) 935int rtas_ibm_suspend_me(u64 handle)
934{ 936{
@@ -968,6 +970,8 @@ int rtas_ibm_suspend_me(u64 handle)
968 data.token = rtas_token("ibm,suspend-me"); 970 data.token = rtas_token("ibm,suspend-me");
969 data.complete = &done; 971 data.complete = &done;
970 972
973 lock_device_hotplug();
974
971 /* All present CPUs must be online */ 975 /* All present CPUs must be online */
972 cpumask_andnot(offline_mask, cpu_present_mask, cpu_online_mask); 976 cpumask_andnot(offline_mask, cpu_present_mask, cpu_online_mask);
973 cpuret = rtas_online_cpus_mask(offline_mask); 977 cpuret = rtas_online_cpus_mask(offline_mask);
@@ -1006,6 +1010,7 @@ out_hotplug_enable:
1006 __func__); 1010 __func__);
1007 1011
1008out: 1012out:
1013 unlock_device_hotplug();
1009 free_cpumask_var(offline_mask); 1014 free_cpumask_var(offline_mask);
1010 return atomic_read(&data.error); 1015 return atomic_read(&data.error);
1011} 1016}
diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c
index e1c9cf079503..7cfcb294b11c 100644
--- a/arch/powerpc/kernel/security.c
+++ b/arch/powerpc/kernel/security.c
@@ -28,7 +28,7 @@ static enum count_cache_flush_type count_cache_flush_type = COUNT_CACHE_FLUSH_NO
28bool barrier_nospec_enabled; 28bool barrier_nospec_enabled;
29static bool no_nospec; 29static bool no_nospec;
30static bool btb_flush_enabled; 30static bool btb_flush_enabled;
31#ifdef CONFIG_PPC_FSL_BOOK3E 31#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3S_64)
32static bool no_spectrev2; 32static bool no_spectrev2;
33#endif 33#endif
34 34
@@ -114,7 +114,7 @@ static __init int security_feature_debugfs_init(void)
114device_initcall(security_feature_debugfs_init); 114device_initcall(security_feature_debugfs_init);
115#endif /* CONFIG_DEBUG_FS */ 115#endif /* CONFIG_DEBUG_FS */
116 116
117#ifdef CONFIG_PPC_FSL_BOOK3E 117#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3S_64)
118static int __init handle_nospectre_v2(char *p) 118static int __init handle_nospectre_v2(char *p)
119{ 119{
120 no_spectrev2 = true; 120 no_spectrev2 = true;
@@ -122,6 +122,9 @@ static int __init handle_nospectre_v2(char *p)
122 return 0; 122 return 0;
123} 123}
124early_param("nospectre_v2", handle_nospectre_v2); 124early_param("nospectre_v2", handle_nospectre_v2);
125#endif /* CONFIG_PPC_FSL_BOOK3E || CONFIG_PPC_BOOK3S_64 */
126
127#ifdef CONFIG_PPC_FSL_BOOK3E
125void setup_spectre_v2(void) 128void setup_spectre_v2(void)
126{ 129{
127 if (no_spectrev2 || cpu_mitigations_off()) 130 if (no_spectrev2 || cpu_mitigations_off())
@@ -399,7 +402,17 @@ static void toggle_count_cache_flush(bool enable)
399 402
400void setup_count_cache_flush(void) 403void setup_count_cache_flush(void)
401{ 404{
402 toggle_count_cache_flush(true); 405 bool enable = true;
406
407 if (no_spectrev2 || cpu_mitigations_off()) {
408 if (security_ftr_enabled(SEC_FTR_BCCTRL_SERIALISED) ||
409 security_ftr_enabled(SEC_FTR_COUNT_CACHE_DISABLED))
410 pr_warn("Spectre v2 mitigations not under software control, can't disable\n");
411
412 enable = false;
413 }
414
415 toggle_count_cache_flush(enable);
403} 416}
404 417
405#ifdef CONFIG_DEBUG_FS 418#ifdef CONFIG_DEBUG_FS
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 5e6543aba1b3..25aaa3903000 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -800,9 +800,15 @@ static __init void print_system_info(void)
800 pr_info("mmu_features = 0x%08x\n", cur_cpu_spec->mmu_features); 800 pr_info("mmu_features = 0x%08x\n", cur_cpu_spec->mmu_features);
801#ifdef CONFIG_PPC64 801#ifdef CONFIG_PPC64
802 pr_info("firmware_features = 0x%016lx\n", powerpc_firmware_features); 802 pr_info("firmware_features = 0x%016lx\n", powerpc_firmware_features);
803#ifdef CONFIG_PPC_BOOK3S
804 pr_info("vmalloc start = 0x%lx\n", KERN_VIRT_START);
805 pr_info("IO start = 0x%lx\n", KERN_IO_START);
806 pr_info("vmemmap start = 0x%lx\n", (unsigned long)vmemmap);
807#endif
803#endif 808#endif
804 809
805 print_system_hash_info(); 810 if (!early_radix_enabled())
811 print_system_hash_info();
806 812
807 if (PHYSICAL_START > 0) 813 if (PHYSICAL_START > 0)
808 pr_info("physical_start = 0x%llx\n", 814 pr_info("physical_start = 0x%llx\n",
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index 94517e4a2723..a7541edf0cdb 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -206,6 +206,6 @@ __init void initialize_cache_info(void)
206 dcache_bsize = cur_cpu_spec->dcache_bsize; 206 dcache_bsize = cur_cpu_spec->dcache_bsize;
207 icache_bsize = cur_cpu_spec->icache_bsize; 207 icache_bsize = cur_cpu_spec->icache_bsize;
208 ucache_bsize = 0; 208 ucache_bsize = 0;
209 if (cpu_has_feature(CPU_FTR_UNIFIED_ID_CACHE)) 209 if (IS_ENABLED(CONFIG_PPC_BOOK3S_601) || IS_ENABLED(CONFIG_E200))
210 ucache_bsize = icache_bsize = dcache_bsize; 210 ucache_bsize = icache_bsize = dcache_bsize;
211} 211}
diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c
index 1e2276963f6d..e2a46cfed5fd 100644
--- a/arch/powerpc/kernel/stacktrace.c
+++ b/arch/powerpc/kernel/stacktrace.c
@@ -182,7 +182,7 @@ static int __save_stack_trace_tsk_reliable(struct task_struct *tsk,
182 * FIXME: IMHO these tests do not belong in 182 * FIXME: IMHO these tests do not belong in
183 * arch-dependent code, they are generic. 183 * arch-dependent code, they are generic.
184 */ 184 */
185 ip = ftrace_graph_ret_addr(tsk, &graph_idx, ip, NULL); 185 ip = ftrace_graph_ret_addr(tsk, &graph_idx, ip, stack);
186#ifdef CONFIG_KPROBES 186#ifdef CONFIG_KPROBES
187 /* 187 /*
188 * Mark stacktraces with kretprobed functions on them 188 * Mark stacktraces with kretprobed functions on them
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index e2147d7c9e72..80a676da11cb 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -19,6 +19,7 @@
19#include <asm/smp.h> 19#include <asm/smp.h>
20#include <asm/pmc.h> 20#include <asm/pmc.h>
21#include <asm/firmware.h> 21#include <asm/firmware.h>
22#include <asm/svm.h>
22 23
23#include "cacheinfo.h" 24#include "cacheinfo.h"
24#include "setup.h" 25#include "setup.h"
@@ -715,6 +716,23 @@ static struct device_attribute pa6t_attrs[] = {
715#endif /* HAS_PPC_PMC_PA6T */ 716#endif /* HAS_PPC_PMC_PA6T */
716#endif /* HAS_PPC_PMC_CLASSIC */ 717#endif /* HAS_PPC_PMC_CLASSIC */
717 718
719#ifdef CONFIG_PPC_SVM
720static ssize_t show_svm(struct device *dev, struct device_attribute *attr, char *buf)
721{
722 return sprintf(buf, "%u\n", is_secure_guest());
723}
724static DEVICE_ATTR(svm, 0444, show_svm, NULL);
725
726static void create_svm_file(void)
727{
728 device_create_file(cpu_subsys.dev_root, &dev_attr_svm);
729}
730#else
731static void create_svm_file(void)
732{
733}
734#endif /* CONFIG_PPC_SVM */
735
718static int register_cpu_online(unsigned int cpu) 736static int register_cpu_online(unsigned int cpu)
719{ 737{
720 struct cpu *c = &per_cpu(cpu_devices, cpu); 738 struct cpu *c = &per_cpu(cpu_devices, cpu);
@@ -1058,6 +1076,8 @@ static int __init topology_init(void)
1058 sysfs_create_dscr_default(); 1076 sysfs_create_dscr_default();
1059#endif /* CONFIG_PPC64 */ 1077#endif /* CONFIG_PPC64 */
1060 1078
1079 create_svm_file();
1080
1061 return 0; 1081 return 0;
1062} 1082}
1063subsys_initcall(topology_init); 1083subsys_initcall(topology_init);
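The new attribute lets userspace ask whether it is running inside a secure VM. Since device_create_file() attaches it to cpu_subsys.dev_root, it should appear as /sys/devices/system/cpu/svm (path inferred from the code above, not quoted from documentation), reading 1 in a secure guest and 0 otherwise. A trivial reader, for illustration only:

#include <stdio.h>

int main(void)
{
        int svm = 0;
        FILE *f = fopen("/sys/devices/system/cpu/svm", "r");

        if (f) {
                if (fscanf(f, "%d", &svm) != 1)
                        svm = 0;
                fclose(f);
        }
        printf("secure guest: %s\n", svm ? "yes" : "no");
        return 0;
}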
diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c
index be1ca98fce5c..7ea0ca044b65 100644
--- a/arch/powerpc/kernel/trace/ftrace.c
+++ b/arch/powerpc/kernel/trace/ftrace.c
@@ -944,7 +944,8 @@ int ftrace_disable_ftrace_graph_caller(void)
944 * Hook the return address and push it in the stack of return addrs 944 * Hook the return address and push it in the stack of return addrs
945 * in current thread info. Return the address we want to divert to. 945 * in current thread info. Return the address we want to divert to.
946 */ 946 */
947unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip) 947unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip,
948 unsigned long sp)
948{ 949{
949 unsigned long return_hooker; 950 unsigned long return_hooker;
950 951
@@ -956,7 +957,7 @@ unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip)
956 957
957 return_hooker = ppc_function_entry(return_to_handler); 958 return_hooker = ppc_function_entry(return_to_handler);
958 959
959 if (!function_graph_enter(parent, ip, 0, NULL)) 960 if (!function_graph_enter(parent, ip, 0, (unsigned long *)sp))
960 parent = return_hooker; 961 parent = return_hooker;
961out: 962out:
962 return parent; 963 return parent;
diff --git a/arch/powerpc/kernel/trace/ftrace_32.S b/arch/powerpc/kernel/trace/ftrace_32.S
index 183f608efb81..e023ae59c429 100644
--- a/arch/powerpc/kernel/trace/ftrace_32.S
+++ b/arch/powerpc/kernel/trace/ftrace_32.S
@@ -50,6 +50,7 @@ _GLOBAL(ftrace_stub)
50 50
51#ifdef CONFIG_FUNCTION_GRAPH_TRACER 51#ifdef CONFIG_FUNCTION_GRAPH_TRACER
52_GLOBAL(ftrace_graph_caller) 52_GLOBAL(ftrace_graph_caller)
53 addi r5, r1, 48
53 /* load r4 with local address */ 54 /* load r4 with local address */
54 lwz r4, 44(r1) 55 lwz r4, 44(r1)
55 subi r4, r4, MCOUNT_INSN_SIZE 56 subi r4, r4, MCOUNT_INSN_SIZE
diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S
index 74acbf16a666..f9fd5f743eba 100644
--- a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S
+++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S
@@ -294,6 +294,7 @@ _GLOBAL(ftrace_graph_caller)
294 std r2, 24(r1) 294 std r2, 24(r1)
295 ld r2, PACATOC(r13) /* get kernel TOC in r2 */ 295 ld r2, PACATOC(r13) /* get kernel TOC in r2 */
296 296
297 addi r5, r1, 112
297 mfctr r4 /* ftrace_caller has moved local addr here */ 298 mfctr r4 /* ftrace_caller has moved local addr here */
298 std r4, 40(r1) 299 std r4, 40(r1)
299 mflr r3 /* ftrace_caller has restored LR from stack */ 300 mflr r3 /* ftrace_caller has restored LR from stack */
diff --git a/arch/powerpc/kernel/trace/ftrace_64_pg.S b/arch/powerpc/kernel/trace/ftrace_64_pg.S
index e41a7d13c99c..6708e24db0ab 100644
--- a/arch/powerpc/kernel/trace/ftrace_64_pg.S
+++ b/arch/powerpc/kernel/trace/ftrace_64_pg.S
@@ -41,6 +41,7 @@ _GLOBAL(ftrace_stub)
41 41
42#ifdef CONFIG_FUNCTION_GRAPH_TRACER 42#ifdef CONFIG_FUNCTION_GRAPH_TRACER
43_GLOBAL(ftrace_graph_caller) 43_GLOBAL(ftrace_graph_caller)
44 addi r5, r1, 112
44 /* load r4 with local address */ 45 /* load r4 with local address */
45 ld r4, 128(r1) 46 ld r4, 128(r1)
46 subi r4, r4, MCOUNT_INSN_SIZE 47 subi r4, r4, MCOUNT_INSN_SIZE
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 11caa0291254..82f43535e686 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -472,6 +472,7 @@ void system_reset_exception(struct pt_regs *regs)
472 if (debugger(regs)) 472 if (debugger(regs))
473 goto out; 473 goto out;
474 474
475 kmsg_dump(KMSG_DUMP_OOPS);
475 /* 476 /*
476 * A system reset is a request to dump, so we always send 477 * A system reset is a request to dump, so we always send
477 * it through the crashdump code (if fadump or kdump are 478 * it through the crashdump code (if fadump or kdump are
diff --git a/arch/powerpc/kernel/ucall.S b/arch/powerpc/kernel/ucall.S
new file mode 100644
index 000000000000..07296bc39166
--- /dev/null
+++ b/arch/powerpc/kernel/ucall.S
@@ -0,0 +1,14 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * Generic code to perform an ultravisor call.
4 *
5 * Copyright 2019, IBM Corporation.
6 *
7 */
8#include <asm/ppc_asm.h>
9#include <asm/export.h>
10
11_GLOBAL(ucall_norets)
12EXPORT_SYMBOL_GPL(ucall_norets)
13 sc 2 /* Invoke the ultravisor */
14 blr /* Return r3 = status */
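ucall_norets() is the ultravisor analogue of the existing hcall trampolines: arguments arrive in r3 onward, "sc 2" traps to the ultravisor, and the status comes back in r3. A hedged C-side sketch follows; the variadic prototype is an assumption for illustration (the real declaration belongs in asm/ultravisor.h), and later in this diff uv_register_pate() is invoked with exactly this shape:

/* Assumed prototype for illustration; see asm/ultravisor.h in the tree. */
long ucall_norets(unsigned long opcode, ...);

static long uv_register_pate_sketch(unsigned long opcode, unsigned long lpid,
				    unsigned long dw0, unsigned long dw1)
{
	/* opcode in r3, lpid/dw0/dw1 in r4-r6; ucall_norets() returns r3. */
	return ucall_norets(opcode, lpid, dw0, dw1);
}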
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index d60598113a9f..eae9ddaecbcf 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -94,28 +94,6 @@ static struct vdso_patch_def vdso_patches[] = {
94 CPU_FTR_COHERENT_ICACHE, CPU_FTR_COHERENT_ICACHE, 94 CPU_FTR_COHERENT_ICACHE, CPU_FTR_COHERENT_ICACHE,
95 "__kernel_sync_dicache", "__kernel_sync_dicache_p5" 95 "__kernel_sync_dicache", "__kernel_sync_dicache_p5"
96 }, 96 },
97#ifdef CONFIG_PPC32
98 {
99 CPU_FTR_USE_RTC, CPU_FTR_USE_RTC,
100 "__kernel_gettimeofday", NULL
101 },
102 {
103 CPU_FTR_USE_RTC, CPU_FTR_USE_RTC,
104 "__kernel_clock_gettime", NULL
105 },
106 {
107 CPU_FTR_USE_RTC, CPU_FTR_USE_RTC,
108 "__kernel_clock_getres", NULL
109 },
110 {
111 CPU_FTR_USE_RTC, CPU_FTR_USE_RTC,
112 "__kernel_get_tbfreq", NULL
113 },
114 {
115 CPU_FTR_USE_RTC, CPU_FTR_USE_RTC,
116 "__kernel_time", NULL
117 },
118#endif
119}; 97};
120 98
121/* 99/*
diff --git a/arch/powerpc/kernel/vdso32/datapage.S b/arch/powerpc/kernel/vdso32/datapage.S
index 6984125b9fc0..6c7401bd284e 100644
--- a/arch/powerpc/kernel/vdso32/datapage.S
+++ b/arch/powerpc/kernel/vdso32/datapage.S
@@ -70,6 +70,7 @@ V_FUNCTION_END(__kernel_get_syscall_map)
70 * 70 *
71 * returns the timebase frequency in HZ 71 * returns the timebase frequency in HZ
72 */ 72 */
73#ifndef CONFIG_PPC_BOOK3S_601
73V_FUNCTION_BEGIN(__kernel_get_tbfreq) 74V_FUNCTION_BEGIN(__kernel_get_tbfreq)
74 .cfi_startproc 75 .cfi_startproc
75 mflr r12 76 mflr r12
@@ -82,3 +83,4 @@ V_FUNCTION_BEGIN(__kernel_get_tbfreq)
82 blr 83 blr
83 .cfi_endproc 84 .cfi_endproc
84V_FUNCTION_END(__kernel_get_tbfreq) 85V_FUNCTION_END(__kernel_get_tbfreq)
86#endif
diff --git a/arch/powerpc/kernel/vdso32/vdso32.lds.S b/arch/powerpc/kernel/vdso32/vdso32.lds.S
index 099a6db14e67..00c025ba4a92 100644
--- a/arch/powerpc/kernel/vdso32/vdso32.lds.S
+++ b/arch/powerpc/kernel/vdso32/vdso32.lds.S
@@ -144,10 +144,13 @@ VERSION
144 __kernel_datapage_offset; 144 __kernel_datapage_offset;
145 145
146 __kernel_get_syscall_map; 146 __kernel_get_syscall_map;
147#ifndef CONFIG_PPC_BOOK3S_601
147 __kernel_gettimeofday; 148 __kernel_gettimeofday;
148 __kernel_clock_gettime; 149 __kernel_clock_gettime;
149 __kernel_clock_getres; 150 __kernel_clock_getres;
151 __kernel_time;
150 __kernel_get_tbfreq; 152 __kernel_get_tbfreq;
153#endif
151 __kernel_sync_dicache; 154 __kernel_sync_dicache;
152 __kernel_sync_dicache_p5; 155 __kernel_sync_dicache_p5;
153 __kernel_sigtramp32; 156 __kernel_sigtramp32;
@@ -155,7 +158,6 @@ VERSION
155#ifdef CONFIG_PPC64 158#ifdef CONFIG_PPC64
156 __kernel_getcpu; 159 __kernel_getcpu;
157#endif 160#endif
158 __kernel_time;
159 161
160 local: *; 162 local: *;
161 }; 163 };
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index c4b606fe73eb..5834db0a54c6 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -416,7 +416,7 @@ static void kvmppc_clear_tce(struct mm_struct *mm, struct iommu_table *tbl,
416 unsigned long hpa = 0; 416 unsigned long hpa = 0;
417 enum dma_data_direction dir = DMA_NONE; 417 enum dma_data_direction dir = DMA_NONE;
418 418
419 iommu_tce_xchg(mm, tbl, entry, &hpa, &dir); 419 iommu_tce_xchg_no_kill(mm, tbl, entry, &hpa, &dir);
420} 420}
421 421
422static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm, 422static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm,
@@ -447,7 +447,8 @@ static long kvmppc_tce_iommu_do_unmap(struct kvm *kvm,
447 unsigned long hpa = 0; 447 unsigned long hpa = 0;
448 long ret; 448 long ret;
449 449
450 if (WARN_ON_ONCE(iommu_tce_xchg(kvm->mm, tbl, entry, &hpa, &dir))) 450 if (WARN_ON_ONCE(iommu_tce_xchg_no_kill(kvm->mm, tbl, entry, &hpa,
451 &dir)))
451 return H_TOO_HARD; 452 return H_TOO_HARD;
452 453
453 if (dir == DMA_NONE) 454 if (dir == DMA_NONE)
@@ -455,7 +456,7 @@ static long kvmppc_tce_iommu_do_unmap(struct kvm *kvm,
455 456
456 ret = kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry); 457 ret = kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry);
457 if (ret != H_SUCCESS) 458 if (ret != H_SUCCESS)
458 iommu_tce_xchg(kvm->mm, tbl, entry, &hpa, &dir); 459 iommu_tce_xchg_no_kill(kvm->mm, tbl, entry, &hpa, &dir);
459 460
460 return ret; 461 return ret;
461} 462}
@@ -501,7 +502,7 @@ long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
501 if (mm_iommu_mapped_inc(mem)) 502 if (mm_iommu_mapped_inc(mem))
502 return H_TOO_HARD; 503 return H_TOO_HARD;
503 504
504 ret = iommu_tce_xchg(kvm->mm, tbl, entry, &hpa, &dir); 505 ret = iommu_tce_xchg_no_kill(kvm->mm, tbl, entry, &hpa, &dir);
505 if (WARN_ON_ONCE(ret)) { 506 if (WARN_ON_ONCE(ret)) {
506 mm_iommu_mapped_dec(mem); 507 mm_iommu_mapped_dec(mem);
507 return H_TOO_HARD; 508 return H_TOO_HARD;
@@ -579,6 +580,8 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
579 ret = kvmppc_tce_iommu_map(vcpu->kvm, stt, stit->tbl, 580 ret = kvmppc_tce_iommu_map(vcpu->kvm, stt, stit->tbl,
580 entry, ua, dir); 581 entry, ua, dir);
581 582
583 iommu_tce_kill(stit->tbl, entry, 1);
584
582 if (ret != H_SUCCESS) { 585 if (ret != H_SUCCESS) {
583 kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry); 586 kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry);
584 goto unlock_exit; 587 goto unlock_exit;
@@ -656,13 +659,13 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
656 */ 659 */
657 if (get_user(tce, tces + i)) { 660 if (get_user(tce, tces + i)) {
658 ret = H_TOO_HARD; 661 ret = H_TOO_HARD;
659 goto unlock_exit; 662 goto invalidate_exit;
660 } 663 }
661 tce = be64_to_cpu(tce); 664 tce = be64_to_cpu(tce);
662 665
663 if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua)) { 666 if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua)) {
664 ret = H_PARAMETER; 667 ret = H_PARAMETER;
665 goto unlock_exit; 668 goto invalidate_exit;
666 } 669 }
667 670
668 list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { 671 list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
@@ -673,13 +676,17 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
673 if (ret != H_SUCCESS) { 676 if (ret != H_SUCCESS) {
674 kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, 677 kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl,
675 entry); 678 entry);
676 goto unlock_exit; 679 goto invalidate_exit;
677 } 680 }
678 } 681 }
679 682
680 kvmppc_tce_put(stt, entry + i, tce); 683 kvmppc_tce_put(stt, entry + i, tce);
681 } 684 }
682 685
686invalidate_exit:
687 list_for_each_entry_lockless(stit, &stt->iommu_tables, next)
688 iommu_tce_kill(stit->tbl, entry, npages);
689
683unlock_exit: 690unlock_exit:
684 srcu_read_unlock(&vcpu->kvm->srcu, idx); 691 srcu_read_unlock(&vcpu->kvm->srcu, idx);
685 692
@@ -718,7 +725,7 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
718 continue; 725 continue;
719 726
720 if (ret == H_TOO_HARD) 727 if (ret == H_TOO_HARD)
721 return ret; 728 goto invalidate_exit;
722 729
723 WARN_ON_ONCE(1); 730 WARN_ON_ONCE(1);
724 kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry); 731 kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry);
@@ -728,6 +735,10 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
728 for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) 735 for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift))
729 kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value); 736 kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value);
730 737
731 return H_SUCCESS; 738invalidate_exit:
739 list_for_each_entry_lockless(stit, &stt->iommu_tables, next)
740 iommu_tce_kill(stit->tbl, ioba >> stt->page_shift, npages);
741
742 return ret;
732} 743}
733EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce); 744EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce);
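The change above splits each TCE update into iommu_tce_xchg_no_kill(), which only writes the entry, and a separate iommu_tce_kill() that invalidates the TCE cache, so a multi-entry hypercall pays for one invalidation per range instead of one per entry. A hedged sketch of the pattern, using the signatures visible in this diff and with error handling elided:

#include <linux/dma-direction.h>
#include <asm/iommu.h>

/* Kernel-context sketch only: update a run of TCEs, then kill once. */
static long map_tce_range_sketch(struct mm_struct *mm, struct iommu_table *tbl,
				 unsigned long entry, unsigned long npages,
				 const unsigned long *hpas,
				 enum dma_data_direction dir)
{
	unsigned long i;
	long ret = 0;

	for (i = 0; i < npages; ++i) {
		enum dma_data_direction d = dir;
		unsigned long hpa = hpas[i];

		ret = iommu_tce_xchg_no_kill(mm, tbl, entry + i, &hpa, &d);
		if (ret)
			break;
	}

	/* One TCE-cache invalidation covers every entry touched above. */
	iommu_tce_kill(tbl, entry, npages);

	return ret;
}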
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
index b4f20f13b860..ab6eeb8e753e 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -218,13 +218,14 @@ static long kvmppc_rm_ioba_validate(struct kvmppc_spapr_tce_table *stt,
218 return H_SUCCESS; 218 return H_SUCCESS;
219} 219}
220 220
221static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl, 221static long iommu_tce_xchg_no_kill_rm(struct mm_struct *mm,
222 struct iommu_table *tbl,
222 unsigned long entry, unsigned long *hpa, 223 unsigned long entry, unsigned long *hpa,
223 enum dma_data_direction *direction) 224 enum dma_data_direction *direction)
224{ 225{
225 long ret; 226 long ret;
226 227
227 ret = tbl->it_ops->exchange_rm(tbl, entry, hpa, direction); 228 ret = tbl->it_ops->xchg_no_kill(tbl, entry, hpa, direction, true);
228 229
229 if (!ret && ((*direction == DMA_FROM_DEVICE) || 230 if (!ret && ((*direction == DMA_FROM_DEVICE) ||
230 (*direction == DMA_BIDIRECTIONAL))) { 231 (*direction == DMA_BIDIRECTIONAL))) {
@@ -240,13 +241,20 @@ static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl,
240 return ret; 241 return ret;
241} 242}
242 243
244extern void iommu_tce_kill_rm(struct iommu_table *tbl,
245 unsigned long entry, unsigned long pages)
246{
247 if (tbl->it_ops->tce_kill)
248 tbl->it_ops->tce_kill(tbl, entry, pages, true);
249}
250
243static void kvmppc_rm_clear_tce(struct kvm *kvm, struct iommu_table *tbl, 251static void kvmppc_rm_clear_tce(struct kvm *kvm, struct iommu_table *tbl,
244 unsigned long entry) 252 unsigned long entry)
245{ 253{
246 unsigned long hpa = 0; 254 unsigned long hpa = 0;
247 enum dma_data_direction dir = DMA_NONE; 255 enum dma_data_direction dir = DMA_NONE;
248 256
249 iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir); 257 iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir);
250} 258}
251 259
252static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, 260static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm,
@@ -278,7 +286,7 @@ static long kvmppc_rm_tce_iommu_do_unmap(struct kvm *kvm,
278 unsigned long hpa = 0; 286 unsigned long hpa = 0;
279 long ret; 287 long ret;
280 288
281 if (iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir)) 289 if (iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir))
282 /* 290 /*
283 * real mode xchg can fail if struct page crosses 291 * real mode xchg can fail if struct page crosses
284 * a page boundary 292 * a page boundary
@@ -290,7 +298,7 @@ static long kvmppc_rm_tce_iommu_do_unmap(struct kvm *kvm,
290 298
291 ret = kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry); 299 ret = kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry);
292 if (ret) 300 if (ret)
293 iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir); 301 iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir);
294 302
295 return ret; 303 return ret;
296} 304}
@@ -336,7 +344,7 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
336 if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem))) 344 if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem)))
337 return H_TOO_HARD; 345 return H_TOO_HARD;
338 346
339 ret = iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir); 347 ret = iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir);
340 if (ret) { 348 if (ret) {
341 mm_iommu_mapped_dec(mem); 349 mm_iommu_mapped_dec(mem);
342 /* 350 /*
@@ -417,6 +425,8 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
417 ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, stt, 425 ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, stt,
418 stit->tbl, entry, ua, dir); 426 stit->tbl, entry, ua, dir);
419 427
428 iommu_tce_kill_rm(stit->tbl, entry, 1);
429
420 if (ret != H_SUCCESS) { 430 if (ret != H_SUCCESS) {
421 kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); 431 kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
422 return ret; 432 return ret;
@@ -558,7 +568,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
558 ua = 0; 568 ua = 0;
559 if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) { 569 if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) {
560 ret = H_PARAMETER; 570 ret = H_PARAMETER;
561 goto unlock_exit; 571 goto invalidate_exit;
562 } 572 }
563 573
564 list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { 574 list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
@@ -569,13 +579,17 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
569 if (ret != H_SUCCESS) { 579 if (ret != H_SUCCESS) {
570 kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, 580 kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl,
571 entry); 581 entry);
572 goto unlock_exit; 582 goto invalidate_exit;
573 } 583 }
574 } 584 }
575 585
576 kvmppc_rm_tce_put(stt, entry + i, tce); 586 kvmppc_rm_tce_put(stt, entry + i, tce);
577 } 587 }
578 588
589invalidate_exit:
590 list_for_each_entry_lockless(stit, &stt->iommu_tables, next)
591 iommu_tce_kill_rm(stit->tbl, entry, npages);
592
579unlock_exit: 593unlock_exit:
580 if (rmap) 594 if (rmap)
581 unlock_rmap(rmap); 595 unlock_rmap(rmap);
@@ -618,7 +632,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
618 continue; 632 continue;
619 633
620 if (ret == H_TOO_HARD) 634 if (ret == H_TOO_HARD)
621 return ret; 635 goto invalidate_exit;
622 636
623 WARN_ON_ONCE_RM(1); 637 WARN_ON_ONCE_RM(1);
624 kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); 638 kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
@@ -628,7 +642,11 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
628 for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) 642 for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift))
629 kvmppc_rm_tce_put(stt, ioba >> stt->page_shift, tce_value); 643 kvmppc_rm_tce_put(stt, ioba >> stt->page_shift, tce_value);
630 644
631 return H_SUCCESS; 645invalidate_exit:
646 list_for_each_entry_lockless(stit, &stt->iommu_tables, next)
647 iommu_tce_kill_rm(stit->tbl, ioba >> stt->page_shift, npages);
648
649 return ret;
632} 650}
633 651
634/* This can be called in either virtual mode or real mode */ 652/* This can be called in either virtual mode or real mode */
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index f8975c620f41..efd8f93bc9dc 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -5474,6 +5474,12 @@ static int kvmppc_radix_possible(void)
5474static int kvmppc_book3s_init_hv(void) 5474static int kvmppc_book3s_init_hv(void)
5475{ 5475{
5476 int r; 5476 int r;
5477
5478 if (!tlbie_capable) {
5479 pr_err("KVM-HV: Host does not support TLBIE\n");
5480 return -ENODEV;
5481 }
5482
5477 /* 5483 /*
5478 * FIXME!! Do we need to check on all cpus ? 5484 * FIXME!! Do we need to check on all cpus ?
5479 */ 5485 */
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
index 735e0ac6f5b2..fff90f2c3de2 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -398,7 +398,7 @@ static void kvmhv_flush_lpid(unsigned int lpid)
398 long rc; 398 long rc;
399 399
400 if (!kvmhv_on_pseries()) { 400 if (!kvmhv_on_pseries()) {
401 radix__flush_tlb_lpid(lpid); 401 radix__flush_all_lpid(lpid);
402 return; 402 return;
403 } 403 }
404 404
@@ -411,7 +411,7 @@ static void kvmhv_flush_lpid(unsigned int lpid)
411void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1) 411void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1)
412{ 412{
413 if (!kvmhv_on_pseries()) { 413 if (!kvmhv_on_pseries()) {
414 mmu_partition_table_set_entry(lpid, dw0, dw1); 414 mmu_partition_table_set_entry(lpid, dw0, dw1, true);
415 return; 415 return;
416 } 416 }
417 417
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 07181d0dfcb7..9a05b0d932ef 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -29,6 +29,7 @@
29#include <asm/asm-compat.h> 29#include <asm/asm-compat.h>
30#include <asm/feature-fixups.h> 30#include <asm/feature-fixups.h>
31#include <asm/cpuidle.h> 31#include <asm/cpuidle.h>
32#include <asm/ultravisor-api.h>
32 33
33/* Sign-extend HDEC if not on POWER9 */ 34/* Sign-extend HDEC if not on POWER9 */
34#define EXTEND_HDEC(reg) \ 35#define EXTEND_HDEC(reg) \
@@ -1085,16 +1086,10 @@ BEGIN_FTR_SECTION
1085END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) 1086END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
1086 1087
1087 ld r5, VCPU_LR(r4) 1088 ld r5, VCPU_LR(r4)
1088 ld r6, VCPU_CR(r4)
1089 mtlr r5 1089 mtlr r5
1090 mtcr r6
1091 1090
1092 ld r1, VCPU_GPR(R1)(r4) 1091 ld r1, VCPU_GPR(R1)(r4)
1093 ld r2, VCPU_GPR(R2)(r4)
1094 ld r3, VCPU_GPR(R3)(r4)
1095 ld r5, VCPU_GPR(R5)(r4) 1092 ld r5, VCPU_GPR(R5)(r4)
1096 ld r6, VCPU_GPR(R6)(r4)
1097 ld r7, VCPU_GPR(R7)(r4)
1098 ld r8, VCPU_GPR(R8)(r4) 1093 ld r8, VCPU_GPR(R8)(r4)
1099 ld r9, VCPU_GPR(R9)(r4) 1094 ld r9, VCPU_GPR(R9)(r4)
1100 ld r10, VCPU_GPR(R10)(r4) 1095 ld r10, VCPU_GPR(R10)(r4)
@@ -1112,10 +1107,42 @@ BEGIN_FTR_SECTION
1112 mtspr SPRN_HDSISR, r0 1107 mtspr SPRN_HDSISR, r0
1113END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) 1108END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
1114 1109
1110 ld r6, VCPU_KVM(r4)
1111 lbz r7, KVM_SECURE_GUEST(r6)
1112 cmpdi r7, 0
1113 ld r6, VCPU_GPR(R6)(r4)
1114 ld r7, VCPU_GPR(R7)(r4)
1115 bne ret_to_ultra
1116
1117 lwz r0, VCPU_CR(r4)
1118 mtcr r0
1119
1115 ld r0, VCPU_GPR(R0)(r4) 1120 ld r0, VCPU_GPR(R0)(r4)
1121 ld r2, VCPU_GPR(R2)(r4)
1122 ld r3, VCPU_GPR(R3)(r4)
1116 ld r4, VCPU_GPR(R4)(r4) 1123 ld r4, VCPU_GPR(R4)(r4)
1117 HRFI_TO_GUEST 1124 HRFI_TO_GUEST
1118 b . 1125 b .
1126/*
1127 * Use UV_RETURN ultracall to return control back to the Ultravisor after
1128 * processing an hypercall or interrupt that was forwarded (a.k.a. reflected)
 1128 * processing a hypercall or interrupt that was forwarded (a.k.a. reflected)
1129 * to the Hypervisor.
1130 *
1131 * All registers have already been loaded, except:
1132 * R0 = hcall result
1133 * R2 = SRR1, so UV can detect a synthesized interrupt (if any)
1134 * R3 = UV_RETURN
1135 */
1136ret_to_ultra:
1137 lwz r0, VCPU_CR(r4)
1138 mtcr r0
1139
1140 ld r0, VCPU_GPR(R3)(r4)
1141 mfspr r2, SPRN_SRR1
1142 li r3, 0
1143 ori r3, r3, UV_RETURN
1144 ld r4, VCPU_GPR(R4)(r4)
1145 sc 2
1119 1146
1120/* 1147/*
1121 * Enter the guest on a P9 or later system where we have exactly 1148 * Enter the guest on a P9 or later system where we have exactly
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index eebc782d89a5..b8de3be10eb4 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -16,7 +16,7 @@ CFLAGS_code-patching.o += -DDISABLE_BRANCH_PROFILING
16CFLAGS_feature-fixups.o += -DDISABLE_BRANCH_PROFILING 16CFLAGS_feature-fixups.o += -DDISABLE_BRANCH_PROFILING
17endif 17endif
18 18
19obj-y += alloc.o code-patching.o feature-fixups.o 19obj-y += alloc.o code-patching.o feature-fixups.o pmem.o
20 20
21ifndef CONFIG_KASAN 21ifndef CONFIG_KASAN
22obj-y += string.o memcmp_$(BITS).o 22obj-y += string.o memcmp_$(BITS).o
@@ -39,7 +39,7 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o copypage_power7.o \
39 memcpy_power7.o 39 memcpy_power7.o
40 40
41obj64-y += copypage_64.o copyuser_64.o mem_64.o hweight_64.o \ 41obj64-y += copypage_64.o copyuser_64.o mem_64.o hweight_64.o \
42 memcpy_64.o pmem.o 42 memcpy_64.o memcpy_mcsafe_64.o
43 43
44obj64-$(CONFIG_SMP) += locks.o 44obj64-$(CONFIG_SMP) += locks.o
45obj64-$(CONFIG_ALTIVEC) += vmx-helper.o 45obj64-$(CONFIG_ALTIVEC) += vmx-helper.o
diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c
index 6550b9e5ce5f..6440d5943c00 100644
--- a/arch/powerpc/lib/locks.c
+++ b/arch/powerpc/lib/locks.c
@@ -18,7 +18,7 @@
18#include <asm/hvcall.h> 18#include <asm/hvcall.h>
19#include <asm/smp.h> 19#include <asm/smp.h>
20 20
21void __spin_yield(arch_spinlock_t *lock) 21void splpar_spin_yield(arch_spinlock_t *lock)
22{ 22{
23 unsigned int lock_value, holder_cpu, yield_count; 23 unsigned int lock_value, holder_cpu, yield_count;
24 24
@@ -36,14 +36,14 @@ void __spin_yield(arch_spinlock_t *lock)
36 plpar_hcall_norets(H_CONFER, 36 plpar_hcall_norets(H_CONFER,
37 get_hard_smp_processor_id(holder_cpu), yield_count); 37 get_hard_smp_processor_id(holder_cpu), yield_count);
38} 38}
39EXPORT_SYMBOL_GPL(__spin_yield); 39EXPORT_SYMBOL_GPL(splpar_spin_yield);
40 40
41/* 41/*
42 * Waiting for a read lock or a write lock on a rwlock... 42 * Waiting for a read lock or a write lock on a rwlock...
43 * This turns out to be the same for read and write locks, since 43 * This turns out to be the same for read and write locks, since
44 * we only know the holder if it is write-locked. 44 * we only know the holder if it is write-locked.
45 */ 45 */
46void __rw_yield(arch_rwlock_t *rw) 46void splpar_rw_yield(arch_rwlock_t *rw)
47{ 47{
48 int lock_value; 48 int lock_value;
49 unsigned int holder_cpu, yield_count; 49 unsigned int holder_cpu, yield_count;
diff --git a/arch/powerpc/lib/memcpy_mcsafe_64.S b/arch/powerpc/lib/memcpy_mcsafe_64.S
new file mode 100644
index 000000000000..cb882d9a6d8a
--- /dev/null
+++ b/arch/powerpc/lib/memcpy_mcsafe_64.S
@@ -0,0 +1,242 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * Copyright (C) IBM Corporation, 2011
4 * Derived from copyuser_power7.s by Anton Blanchard <anton@au.ibm.com>
5 * Author - Balbir Singh <bsingharora@gmail.com>
6 */
7#include <asm/ppc_asm.h>
8#include <asm/errno.h>
9#include <asm/export.h>
10
11 .macro err1
12100:
13 EX_TABLE(100b,.Ldo_err1)
14 .endm
15
16 .macro err2
17200:
18 EX_TABLE(200b,.Ldo_err2)
19 .endm
20
21 .macro err3
22300: EX_TABLE(300b,.Ldone)
23 .endm
24
25.Ldo_err2:
26 ld r22,STK_REG(R22)(r1)
27 ld r21,STK_REG(R21)(r1)
28 ld r20,STK_REG(R20)(r1)
29 ld r19,STK_REG(R19)(r1)
30 ld r18,STK_REG(R18)(r1)
31 ld r17,STK_REG(R17)(r1)
32 ld r16,STK_REG(R16)(r1)
33 ld r15,STK_REG(R15)(r1)
34 ld r14,STK_REG(R14)(r1)
35 addi r1,r1,STACKFRAMESIZE
36.Ldo_err1:
37 /* Do a byte by byte copy to get the exact remaining size */
38 mtctr r7
3946:
40err3; lbz r0,0(r4)
41 addi r4,r4,1
42err3; stb r0,0(r3)
43 addi r3,r3,1
44 bdnz 46b
45 li r3,0
46 blr
47
48.Ldone:
49 mfctr r3
50 blr
51
52
53_GLOBAL(memcpy_mcsafe)
54 mr r7,r5
55 cmpldi r5,16
56 blt .Lshort_copy
57
58.Lcopy:
59 /* Get the source 8B aligned */
60 neg r6,r4
61 mtocrf 0x01,r6
62 clrldi r6,r6,(64-3)
63
64 bf cr7*4+3,1f
65err1; lbz r0,0(r4)
66 addi r4,r4,1
67err1; stb r0,0(r3)
68 addi r3,r3,1
69 subi r7,r7,1
70
711: bf cr7*4+2,2f
72err1; lhz r0,0(r4)
73 addi r4,r4,2
74err1; sth r0,0(r3)
75 addi r3,r3,2
76 subi r7,r7,2
77
782: bf cr7*4+1,3f
79err1; lwz r0,0(r4)
80 addi r4,r4,4
81err1; stw r0,0(r3)
82 addi r3,r3,4
83 subi r7,r7,4
84
853: sub r5,r5,r6
86 cmpldi r5,128
87
88 mflr r0
89 stdu r1,-STACKFRAMESIZE(r1)
90 std r14,STK_REG(R14)(r1)
91 std r15,STK_REG(R15)(r1)
92 std r16,STK_REG(R16)(r1)
93 std r17,STK_REG(R17)(r1)
94 std r18,STK_REG(R18)(r1)
95 std r19,STK_REG(R19)(r1)
96 std r20,STK_REG(R20)(r1)
97 std r21,STK_REG(R21)(r1)
98 std r22,STK_REG(R22)(r1)
99 std r0,STACKFRAMESIZE+16(r1)
100
101 blt 5f
102 srdi r6,r5,7
103 mtctr r6
104
105 /* Now do cacheline (128B) sized loads and stores. */
106 .align 5
1074:
108err2; ld r0,0(r4)
109err2; ld r6,8(r4)
110err2; ld r8,16(r4)
111err2; ld r9,24(r4)
112err2; ld r10,32(r4)
113err2; ld r11,40(r4)
114err2; ld r12,48(r4)
115err2; ld r14,56(r4)
116err2; ld r15,64(r4)
117err2; ld r16,72(r4)
118err2; ld r17,80(r4)
119err2; ld r18,88(r4)
120err2; ld r19,96(r4)
121err2; ld r20,104(r4)
122err2; ld r21,112(r4)
123err2; ld r22,120(r4)
124 addi r4,r4,128
125err2; std r0,0(r3)
126err2; std r6,8(r3)
127err2; std r8,16(r3)
128err2; std r9,24(r3)
129err2; std r10,32(r3)
130err2; std r11,40(r3)
131err2; std r12,48(r3)
132err2; std r14,56(r3)
133err2; std r15,64(r3)
134err2; std r16,72(r3)
135err2; std r17,80(r3)
136err2; std r18,88(r3)
137err2; std r19,96(r3)
138err2; std r20,104(r3)
139err2; std r21,112(r3)
140err2; std r22,120(r3)
141 addi r3,r3,128
142 subi r7,r7,128
143 bdnz 4b
144
145 clrldi r5,r5,(64-7)
146
147 /* Up to 127B to go */
1485: srdi r6,r5,4
149 mtocrf 0x01,r6
150
1516: bf cr7*4+1,7f
152err2; ld r0,0(r4)
153err2; ld r6,8(r4)
154err2; ld r8,16(r4)
155err2; ld r9,24(r4)
156err2; ld r10,32(r4)
157err2; ld r11,40(r4)
158err2; ld r12,48(r4)
159err2; ld r14,56(r4)
160 addi r4,r4,64
161err2; std r0,0(r3)
162err2; std r6,8(r3)
163err2; std r8,16(r3)
164err2; std r9,24(r3)
165err2; std r10,32(r3)
166err2; std r11,40(r3)
167err2; std r12,48(r3)
168err2; std r14,56(r3)
169 addi r3,r3,64
170 subi r7,r7,64
171
1727: ld r14,STK_REG(R14)(r1)
173 ld r15,STK_REG(R15)(r1)
174 ld r16,STK_REG(R16)(r1)
175 ld r17,STK_REG(R17)(r1)
176 ld r18,STK_REG(R18)(r1)
177 ld r19,STK_REG(R19)(r1)
178 ld r20,STK_REG(R20)(r1)
179 ld r21,STK_REG(R21)(r1)
180 ld r22,STK_REG(R22)(r1)
181 addi r1,r1,STACKFRAMESIZE
182
183 /* Up to 63B to go */
184 bf cr7*4+2,8f
185err1; ld r0,0(r4)
186err1; ld r6,8(r4)
187err1; ld r8,16(r4)
188err1; ld r9,24(r4)
189 addi r4,r4,32
190err1; std r0,0(r3)
191err1; std r6,8(r3)
192err1; std r8,16(r3)
193err1; std r9,24(r3)
194 addi r3,r3,32
195 subi r7,r7,32
196
197 /* Up to 31B to go */
1988: bf cr7*4+3,9f
199err1; ld r0,0(r4)
200err1; ld r6,8(r4)
201 addi r4,r4,16
202err1; std r0,0(r3)
203err1; std r6,8(r3)
204 addi r3,r3,16
205 subi r7,r7,16
206
2079: clrldi r5,r5,(64-4)
208
209 /* Up to 15B to go */
210.Lshort_copy:
211 mtocrf 0x01,r5
212 bf cr7*4+0,12f
213err1; lwz r0,0(r4) /* Less chance of a reject with word ops */
214err1; lwz r6,4(r4)
215 addi r4,r4,8
216err1; stw r0,0(r3)
217err1; stw r6,4(r3)
218 addi r3,r3,8
219 subi r7,r7,8
220
22112: bf cr7*4+1,13f
222err1; lwz r0,0(r4)
223 addi r4,r4,4
224err1; stw r0,0(r3)
225 addi r3,r3,4
226 subi r7,r7,4
227
22813: bf cr7*4+2,14f
229err1; lhz r0,0(r4)
230 addi r4,r4,2
231err1; sth r0,0(r3)
232 addi r3,r3,2
233 subi r7,r7,2
234
23514: bf cr7*4+3,15f
236err1; lbz r0,0(r4)
237err1; stb r0,0(r3)
238
23915: li r3,0
240 blr
241
242EXPORT_SYMBOL_GPL(memcpy_mcsafe);
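memcpy_mcsafe() copies like memcpy() but survives a machine check taken on the source: the success path returns 0 (li r3,0 above) and the fault path returns the count of bytes still to copy (mfctr r3 in .Ldone). A hedged caller sketch; the prototype below is assumed for illustration only, the real one belongs in asm/string.h:

#include <linux/errno.h>

/* Assumed prototype: returns 0 on success, bytes remaining after an MCE. */
extern unsigned long memcpy_mcsafe(void *dst, const void *src,
				   unsigned long size);

static int copy_from_pmem_sketch(void *dst, const void *pmem_src,
				 unsigned long size)
{
	unsigned long rem = memcpy_mcsafe(dst, pmem_src, size);

	if (rem)	/* media error: only size - rem bytes were copied */
		return -EIO;
	return 0;
}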
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 0f499db315d6..5e147986400d 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -7,7 +7,7 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC)
7 7
8obj-y := fault.o mem.o pgtable.o mmap.o \ 8obj-y := fault.o mem.o pgtable.o mmap.o \
9 init_$(BITS).o pgtable_$(BITS).o \ 9 init_$(BITS).o pgtable_$(BITS).o \
10 pgtable-frag.o \ 10 pgtable-frag.o ioremap.o ioremap_$(BITS).o \
11 init-common.o mmu_context.o drmem.o 11 init-common.o mmu_context.o drmem.o
12obj-$(CONFIG_PPC_MMU_NOHASH) += nohash/ 12obj-$(CONFIG_PPC_MMU_NOHASH) += nohash/
13obj-$(CONFIG_PPC_BOOK3S_32) += book3s32/ 13obj-$(CONFIG_PPC_BOOK3S_32) += book3s32/
diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
index e249fbf6b9c3..84d5fab94f8f 100644
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -74,7 +74,7 @@ static int find_free_bat(void)
74{ 74{
75 int b; 75 int b;
76 76
77 if (cpu_has_feature(CPU_FTR_601)) { 77 if (IS_ENABLED(CONFIG_PPC_BOOK3S_601)) {
78 for (b = 0; b < 4; b++) { 78 for (b = 0; b < 4; b++) {
79 struct ppc_bat *bat = BATS[b]; 79 struct ppc_bat *bat = BATS[b];
80 80
@@ -106,7 +106,7 @@ static int find_free_bat(void)
106 */ 106 */
107static unsigned int block_size(unsigned long base, unsigned long top) 107static unsigned int block_size(unsigned long base, unsigned long top)
108{ 108{
109 unsigned int max_size = (cpu_has_feature(CPU_FTR_601) ? 8 : 256) << 20; 109 unsigned int max_size = IS_ENABLED(CONFIG_PPC_BOOK3S_601) ? SZ_8M : SZ_256M;
110 unsigned int base_shift = (ffs(base) - 1) & 31; 110 unsigned int base_shift = (ffs(base) - 1) & 31;
111 unsigned int block_shift = (fls(top - base) - 1) & 31; 111 unsigned int block_shift = (fls(top - base) - 1) & 31;
112 112
@@ -189,7 +189,7 @@ void mmu_mark_initmem_nx(void)
189 unsigned long top = (unsigned long)_etext - PAGE_OFFSET; 189 unsigned long top = (unsigned long)_etext - PAGE_OFFSET;
190 unsigned long size; 190 unsigned long size;
191 191
192 if (cpu_has_feature(CPU_FTR_601)) 192 if (IS_ENABLED(CONFIG_PPC_BOOK3S_601))
193 return; 193 return;
194 194
195 for (i = 0; i < nb - 1 && base < top && top - base > (128 << 10);) { 195 for (i = 0; i < nb - 1 && base < top && top - base > (128 << 10);) {
@@ -227,7 +227,7 @@ void mmu_mark_rodata_ro(void)
227 int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; 227 int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4;
228 int i; 228 int i;
229 229
230 if (cpu_has_feature(CPU_FTR_601)) 230 if (IS_ENABLED(CONFIG_PPC_BOOK3S_601))
231 return; 231 return;
232 232
233 for (i = 0; i < nb; i++) { 233 for (i = 0; i < nb; i++) {
@@ -259,7 +259,7 @@ void __init setbat(int index, unsigned long virt, phys_addr_t phys,
259 flags &= ~_PAGE_COHERENT; 259 flags &= ~_PAGE_COHERENT;
260 260
261 bl = (size >> 17) - 1; 261 bl = (size >> 17) - 1;
262 if (PVR_VER(mfspr(SPRN_PVR)) != 1) { 262 if (!IS_ENABLED(CONFIG_PPC_BOOK3S_601)) {
263 /* 603, 604, etc. */ 263 /* 603, 604, etc. */
264 /* Do DBAT first */ 264 /* Do DBAT first */
265 wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE 265 wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE
@@ -297,8 +297,7 @@ void __init setbat(int index, unsigned long virt, phys_addr_t phys,
297/* 297/*
298 * Preload a translation in the hash table 298 * Preload a translation in the hash table
299 */ 299 */
300void hash_preload(struct mm_struct *mm, unsigned long ea, 300void hash_preload(struct mm_struct *mm, unsigned long ea)
301 bool is_exec, unsigned long trap)
302{ 301{
303 pmd_t *pmd; 302 pmd_t *pmd;
304 303
@@ -310,6 +309,39 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
310} 309}
311 310
312/* 311/*
312 * This is called at the end of handling a user page fault, when the
313 * fault has been handled by updating a PTE in the linux page tables.
314 * We use it to preload an HPTE into the hash table corresponding to
315 * the updated linux PTE.
316 *
317 * This must always be called with the pte lock held.
318 */
319void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
320 pte_t *ptep)
321{
322 if (!mmu_has_feature(MMU_FTR_HPTE_TABLE))
323 return;
324 /*
325 * We don't need to worry about _PAGE_PRESENT here because we are
326 * called with either mm->page_table_lock held or ptl lock held
327 */
328
329 /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
330 if (!pte_young(*ptep) || address >= TASK_SIZE)
331 return;
332
333 /* We have to test for regs NULL since init will get here first thing at boot */
334 if (!current->thread.regs)
335 return;
336
337 /* We also avoid filling the hash if not coming from a fault */
338 if (TRAP(current->thread.regs) != 0x300 && TRAP(current->thread.regs) != 0x400)
339 return;
340
341 hash_preload(vma->vm_mm, address);
342}
343
344/*
313 * Initialize the hash table and patch the instructions in hashtable.S. 345 * Initialize the hash table and patch the instructions in hashtable.S.
314 */ 346 */
315void __init MMU_init_hw(void) 347void __init MMU_init_hw(void)
@@ -358,6 +390,15 @@ void __init MMU_init_hw(void)
358 hash_mb2 = hash_mb = 32 - LG_HPTEG_SIZE - lg_n_hpteg; 390 hash_mb2 = hash_mb = 32 - LG_HPTEG_SIZE - lg_n_hpteg;
359 if (lg_n_hpteg > 16) 391 if (lg_n_hpteg > 16)
360 hash_mb2 = 16 - LG_HPTEG_SIZE; 392 hash_mb2 = 16 - LG_HPTEG_SIZE;
393
394 /*
395 * When KASAN is selected, there is already an early temporary hash
396 * table and the switch to the final hash table is done later.
397 */
398 if (IS_ENABLED(CONFIG_KASAN))
399 return;
400
401 MMU_init_hw_patch();
361} 402}
362 403
363void __init MMU_init_hw_patch(void) 404void __init MMU_init_hw_patch(void)
@@ -400,7 +441,7 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base,
400 BUG_ON(first_memblock_base != 0); 441 BUG_ON(first_memblock_base != 0);
401 442
402 /* 601 can only access 16MB at the moment */ 443 /* 601 can only access 16MB at the moment */
403 if (PVR_VER(mfspr(SPRN_PVR)) == 1) 444 if (IS_ENABLED(CONFIG_PPC_BOOK3S_601))
404 memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01000000)); 445 memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01000000));
405 else /* Anything else has 256M mapped */ 446 else /* Anything else has 256M mapped */
406 memblock_set_current_limit(min_t(u64, first_memblock_size, 0x10000000)); 447 memblock_set_current_limit(min_t(u64, first_memblock_size, 0x10000000));
@@ -418,9 +459,6 @@ void __init setup_kuep(bool disabled)
418{ 459{
419 pr_info("Activating Kernel Userspace Execution Prevention\n"); 460 pr_info("Activating Kernel Userspace Execution Prevention\n");
420 461
421 if (cpu_has_feature(CPU_FTR_601))
422 pr_warn("KUEP is not working on powerpc 601 (No NX bit in Seg Regs)\n");
423
424 if (disabled) 462 if (disabled)
425 pr_warn("KUEP cannot be disabled yet on 6xx when compiled in\n"); 463 pr_warn("KUEP cannot be disabled yet on 6xx when compiled in\n");
426} 464}
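Several hunks above replace runtime cpu_has_feature(CPU_FTR_601) tests with IS_ENABLED(CONFIG_PPC_BOOK3S_601), the new Kconfig symbol for 601-only kernels. A minimal sketch of why that is equivalent but cheaper: IS_ENABLED() folds to a compile-time 0 or 1, so the unused branch is discarded from kernels not built for the 601.

#include <linux/kconfig.h>
#include <linux/sizes.h>

/* Folded at compile time: 8MB max BAT block on a 601 build, 256MB otherwise. */
static unsigned int max_bat_block_size_sketch(void)
{
	return IS_ENABLED(CONFIG_PPC_BOOK3S_601) ? SZ_8M : SZ_256M;
}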
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index b8ad14bb1170..3410ea9f4de1 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -34,6 +34,7 @@
34#include <linux/libfdt.h> 34#include <linux/libfdt.h>
35#include <linux/pkeys.h> 35#include <linux/pkeys.h>
36#include <linux/hugetlb.h> 36#include <linux/hugetlb.h>
37#include <linux/cpu.h>
37 38
38#include <asm/debugfs.h> 39#include <asm/debugfs.h>
39#include <asm/processor.h> 40#include <asm/processor.h>
@@ -61,6 +62,7 @@
61#include <asm/ps3.h> 62#include <asm/ps3.h>
62#include <asm/pte-walk.h> 63#include <asm/pte-walk.h>
63#include <asm/asm-prototypes.h> 64#include <asm/asm-prototypes.h>
65#include <asm/ultravisor.h>
64 66
65#include <mm/mmu_decl.h> 67#include <mm/mmu_decl.h>
66 68
@@ -271,10 +273,6 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
271 if (overlaps_kernel_text(vaddr, vaddr + step)) 273 if (overlaps_kernel_text(vaddr, vaddr + step))
272 tprot &= ~HPTE_R_N; 274 tprot &= ~HPTE_R_N;
273 275
274 /* Make kvm guest trampolines executable */
275 if (overlaps_kvm_tmp(vaddr, vaddr + step))
276 tprot &= ~HPTE_R_N;
277
278 /* 276 /*
279 * If relocatable, check if it overlaps interrupt vectors that 277 * If relocatable, check if it overlaps interrupt vectors that
280 * are copied down to real 0. For relocatable kernel 278 * are copied down to real 0. For relocatable kernel
@@ -823,7 +821,7 @@ static void __init hash_init_partition_table(phys_addr_t hash_table,
823 * For now, UPRT is 0 and we have no segment table. 821 * For now, UPRT is 0 and we have no segment table.
824 */ 822 */
825 htab_size = __ilog2(htab_size) - 18; 823 htab_size = __ilog2(htab_size) - 18;
826 mmu_partition_table_set_entry(0, hash_table | htab_size, 0); 824 mmu_partition_table_set_entry(0, hash_table | htab_size, 0, false);
827 pr_info("Partition table %p\n", partition_tb); 825 pr_info("Partition table %p\n", partition_tb);
828} 826}
829 827
@@ -857,12 +855,6 @@ static void __init htab_initialize(void)
857 /* Using a hypervisor which owns the htab */ 855 /* Using a hypervisor which owns the htab */
858 htab_address = NULL; 856 htab_address = NULL;
859 _SDR1 = 0; 857 _SDR1 = 0;
860 /*
861 * On POWER9, we need to do a H_REGISTER_PROC_TBL hcall
862 * to inform the hypervisor that we wish to use the HPT.
863 */
864 if (cpu_has_feature(CPU_FTR_ARCH_300))
865 register_process_table(0, 0, 0);
866#ifdef CONFIG_FA_DUMP 858#ifdef CONFIG_FA_DUMP
867 /* 859 /*
868 * If firmware assisted dump is active firmware preserves 860 * If firmware assisted dump is active firmware preserves
@@ -1075,8 +1067,8 @@ void hash__early_init_mmu_secondary(void)
1075 if (!cpu_has_feature(CPU_FTR_ARCH_300)) 1067 if (!cpu_has_feature(CPU_FTR_ARCH_300))
1076 mtspr(SPRN_SDR1, _SDR1); 1068 mtspr(SPRN_SDR1, _SDR1);
1077 else 1069 else
1078 mtspr(SPRN_PTCR, 1070 set_ptcr_when_no_uv(__pa(partition_tb) |
1079 __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); 1071 (PATB_SIZE_SHIFT - 12));
1080 } 1072 }
1081 /* Initialize SLB */ 1073 /* Initialize SLB */
1082 slb_initialize(); 1074 slb_initialize();
@@ -1460,8 +1452,8 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap,
1460} 1452}
1461EXPORT_SYMBOL_GPL(hash_page); 1453EXPORT_SYMBOL_GPL(hash_page);
1462 1454
1463int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap, 1455int __hash_page(unsigned long trap, unsigned long ea, unsigned long dsisr,
1464 unsigned long dsisr) 1456 unsigned long msr)
1465{ 1457{
1466 unsigned long access = _PAGE_PRESENT | _PAGE_READ; 1458 unsigned long access = _PAGE_PRESENT | _PAGE_READ;
1467 unsigned long flags = 0; 1459 unsigned long flags = 0;
@@ -1518,8 +1510,8 @@ static bool should_hash_preload(struct mm_struct *mm, unsigned long ea)
1518} 1510}
1519#endif 1511#endif
1520 1512
1521void hash_preload(struct mm_struct *mm, unsigned long ea, 1513static void hash_preload(struct mm_struct *mm, unsigned long ea,
1522 bool is_exec, unsigned long trap) 1514 bool is_exec, unsigned long trap)
1523{ 1515{
1524 int hugepage_shift; 1516 int hugepage_shift;
1525 unsigned long vsid; 1517 unsigned long vsid;
@@ -1599,6 +1591,57 @@ out_exit:
1599 local_irq_restore(flags); 1591 local_irq_restore(flags);
1600} 1592}
1601 1593
1594/*
1595 * This is called at the end of handling a user page fault, when the
1596 * fault has been handled by updating a PTE in the linux page tables.
1597 * We use it to preload an HPTE into the hash table corresponding to
1598 * the updated linux PTE.
1599 *
1600 * This must always be called with the pte lock held.
1601 */
1602void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
1603 pte_t *ptep)
1604{
1605 /*
1606 * We don't need to worry about _PAGE_PRESENT here because we are
1607 * called with either mm->page_table_lock held or ptl lock held
1608 */
1609 unsigned long trap;
1610 bool is_exec;
1611
1612 if (radix_enabled()) {
1613 prefetch((void *)address);
1614 return;
1615 }
1616
1617 /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
1618 if (!pte_young(*ptep) || address >= TASK_SIZE)
1619 return;
1620
1621 /*
1622 * We try to figure out if we are coming from an instruction
1623 * access fault and pass that down to __hash_page so we avoid
1624 * double-faulting on execution of fresh text. We have to test
1625 * for regs NULL since init will get here first thing at boot.
1626 *
1627 * We also avoid filling the hash if not coming from a fault.
1628 */
1629
1630 trap = current->thread.regs ? TRAP(current->thread.regs) : 0UL;
1631 switch (trap) {
1632 case 0x300:
1633 is_exec = false;
1634 break;
1635 case 0x400:
1636 is_exec = true;
1637 break;
1638 default:
1639 return;
1640 }
1641
1642 hash_preload(vma->vm_mm, address, is_exec, trap);
1643}
1644
1602#ifdef CONFIG_PPC_MEM_KEYS 1645#ifdef CONFIG_PPC_MEM_KEYS
1603/* 1646/*
1604 * Return the protection key associated with the given address and the 1647 * Return the protection key associated with the given address and the
@@ -1931,10 +1974,16 @@ static int hpt_order_get(void *data, u64 *val)
1931 1974
1932static int hpt_order_set(void *data, u64 val) 1975static int hpt_order_set(void *data, u64 val)
1933{ 1976{
1977 int ret;
1978
1934 if (!mmu_hash_ops.resize_hpt) 1979 if (!mmu_hash_ops.resize_hpt)
1935 return -ENODEV; 1980 return -ENODEV;
1936 1981
1937 return mmu_hash_ops.resize_hpt(val); 1982 cpus_read_lock();
1983 ret = mmu_hash_ops.resize_hpt(val);
1984 cpus_read_unlock();
1985
1986 return ret;
1938} 1987}
1939 1988
1940DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n"); 1989DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n");
@@ -1957,7 +2006,4 @@ void __init print_system_hash_info(void)
1957 2006
1958 if (htab_hash_mask) 2007 if (htab_hash_mask)
1959 pr_info("htab_hash_mask = 0x%lx\n", htab_hash_mask); 2008 pr_info("htab_hash_mask = 0x%lx\n", htab_hash_mask);
1960 pr_info("kernel vmalloc start = 0x%lx\n", KERN_VIRT_START);
1961 pr_info("kernel IO start = 0x%lx\n", KERN_IO_START);
1962 pr_info("kernel vmemmap start = 0x%lx\n", (unsigned long)vmemmap);
1963} 2009}
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index 7d0e0d0d22c4..75483b40fcb1 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -8,10 +8,13 @@
8#include <linux/memblock.h> 8#include <linux/memblock.h>
9#include <misc/cxl-base.h> 9#include <misc/cxl-base.h>
10 10
11#include <asm/debugfs.h>
11#include <asm/pgalloc.h> 12#include <asm/pgalloc.h>
12#include <asm/tlb.h> 13#include <asm/tlb.h>
13#include <asm/trace.h> 14#include <asm/trace.h>
14#include <asm/powernv.h> 15#include <asm/powernv.h>
16#include <asm/firmware.h>
17#include <asm/ultravisor.h>
15 18
16#include <mm/mmu_decl.h> 19#include <mm/mmu_decl.h>
17#include <trace/events/thp.h> 20#include <trace/events/thp.h>
@@ -21,9 +24,6 @@ EXPORT_SYMBOL(__pmd_frag_nr);
21unsigned long __pmd_frag_size_shift; 24unsigned long __pmd_frag_size_shift;
22EXPORT_SYMBOL(__pmd_frag_size_shift); 25EXPORT_SYMBOL(__pmd_frag_size_shift);
23 26
24int (*register_process_table)(unsigned long base, unsigned long page_size,
25 unsigned long tbl_size);
26
27#ifdef CONFIG_TRANSPARENT_HUGEPAGE 27#ifdef CONFIG_TRANSPARENT_HUGEPAGE
28/* 28/*
29 * This is called when relaxing access to a hugepage. It's also called in the page 29 * This is called when relaxing access to a hugepage. It's also called in the page
@@ -205,37 +205,61 @@ void __init mmu_partition_table_init(void)
205 * 64 K size. 205 * 64 K size.
206 */ 206 */
207 ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12); 207 ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12);
208 mtspr(SPRN_PTCR, ptcr); 208 set_ptcr_when_no_uv(ptcr);
209 powernv_set_nmmu_ptcr(ptcr); 209 powernv_set_nmmu_ptcr(ptcr);
210} 210}
211 211
212static void flush_partition(unsigned int lpid, bool radix)
213{
214 if (radix) {
215 radix__flush_all_lpid(lpid);
216 radix__flush_all_lpid_guest(lpid);
217 } else {
218 asm volatile("ptesync" : : : "memory");
219 asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
220 "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
221 /* do we need fixup here ?*/
222 asm volatile("eieio; tlbsync; ptesync" : : : "memory");
223 trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0);
224 }
225}
226
212void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0, 227void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
213 unsigned long dw1) 228 unsigned long dw1, bool flush)
214{ 229{
215 unsigned long old = be64_to_cpu(partition_tb[lpid].patb0); 230 unsigned long old = be64_to_cpu(partition_tb[lpid].patb0);
216 231
232 /*
233 * When ultravisor is enabled, the partition table is stored in secure
234 * memory and can only be accessed doing an ultravisor call. However, we
235 * maintain a copy of the partition table in normal memory to allow Nest
236 * MMU translations to occur (for normal VMs).
237 *
238 * Therefore, here we always update partition_tb, regardless of whether
239 * we are running under an ultravisor or not.
240 */
217 partition_tb[lpid].patb0 = cpu_to_be64(dw0); 241 partition_tb[lpid].patb0 = cpu_to_be64(dw0);
218 partition_tb[lpid].patb1 = cpu_to_be64(dw1); 242 partition_tb[lpid].patb1 = cpu_to_be64(dw1);
219 243
220 /* 244 /*
221 * Global flush of TLBs and partition table caches for this lpid. 245 * If ultravisor is enabled, we do an ultravisor call to register the
222 * The type of flush (hash or radix) depends on what the previous 246 * partition table entry (PATE), which also do a global flush of TLBs
223 * use of this partition ID was, not the new use. 247 * and partition table caches for the lpid. Otherwise, just do the
248 * flush. The type of flush (hash or radix) depends on what the previous
249 * use of the partition ID was, not the new use.
224 */ 250 */
225 asm volatile("ptesync" : : : "memory"); 251 if (firmware_has_feature(FW_FEATURE_ULTRAVISOR)) {
226 if (old & PATB_HR) { 252 uv_register_pate(lpid, dw0, dw1);
227 asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : : 253 pr_info("PATE registered by ultravisor: dw0 = 0x%lx, dw1 = 0x%lx\n",
228 "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); 254 dw0, dw1);
229 asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : 255 } else if (flush) {
230 "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); 256 /*
231 trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 1); 257 * Boot does not need to flush, because MMU is off and each
232 } else { 258 * CPU does a tlbiel_all() before switching them on, which
233 asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : 259 * flushes everything.
234 "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); 260 */
235 trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0); 261 flush_partition(lpid, (old & PATB_HR));
236 } 262 }
237 /* do we need fixup here ?*/
238 asm volatile("eieio; tlbsync; ptesync" : : : "memory");
239} 263}
240EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry); 264EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry);
241 265
@@ -447,23 +471,48 @@ int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
447 return true; 471 return true;
448} 472}
449 473
450int ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, pgprot_t prot, int nid) 474/*
451{ 475 * Does the CPU support tlbie?
452 unsigned long i; 476 */
477bool tlbie_capable __read_mostly = true;
478EXPORT_SYMBOL(tlbie_capable);
453 479
454 if (radix_enabled()) 480/*
455 return radix__ioremap_range(ea, pa, size, prot, nid); 481 * Should tlbie be used for management of CPU TLBs, for kernel and process
456 482 * address spaces? tlbie may still be used for nMMU accelerators, and for KVM
457 for (i = 0; i < size; i += PAGE_SIZE) { 483 * guest address spaces.
458 int err = map_kernel_page(ea + i, pa + i, prot); 484 */
459 if (err) { 485bool tlbie_enabled __read_mostly = true;
460 if (slab_is_available()) 486
461 unmap_kernel_range(ea, size); 487static int __init setup_disable_tlbie(char *str)
462 else 488{
463 WARN_ON_ONCE(1); /* Should clean up */ 489 if (!radix_enabled()) {
464 return err; 490 pr_err("disable_tlbie: Unable to disable TLBIE with Hash MMU.\n");
465 } 491 return 1;
466 } 492 }
467 493
494 tlbie_capable = false;
495 tlbie_enabled = false;
496
497 return 1;
498}
499__setup("disable_tlbie", setup_disable_tlbie);
500
501static int __init pgtable_debugfs_setup(void)
502{
503 if (!tlbie_capable)
504 return 0;
505
506 /*
507 * There is no locking vs tlb flushing when changing this value.
508 * The tlb flushers will see one value or another, and use either
509 * tlbie or tlbiel with IPIs. In both cases the TLBs will be
510 * invalidated as expected.
511 */
512 debugfs_create_bool("tlbie_enabled", 0600,
513 powerpc_debugfs_root,
514 &tlbie_enabled);
515
468 return 0; 516 return 0;
469} 517}
518arch_initcall(pgtable_debugfs_setup);
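tlbie_capable and tlbie_enabled introduce a switch between broadcast tlbie and local tlbiel-plus-IPIs, set at boot via the disable_tlbie parameter and at runtime via the tlbie_enabled debugfs file. A hedged sketch of how a flush path can consult it, reusing the helpers added further down in this diff in radix_tlb.c (the kernel's actual selector helper may be named differently):

/*
 * Kernel-context sketch only: _tlbie_pid() and _tlbiel_pid_multicast()
 * are static helpers in radix_tlb.c, shown later in this diff.
 */
static void flush_mm_pid_sketch(struct mm_struct *mm, unsigned long pid)
{
	if (tlbie_enabled)
		_tlbie_pid(pid, RIC_FLUSH_ALL);			/* broadcast tlbie */
	else
		_tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL);	/* tlbiel via IPIs */
}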
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index b4ca9e95e678..3a1fbf9cb8f8 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -27,25 +27,13 @@
27#include <asm/sections.h> 27#include <asm/sections.h>
28#include <asm/trace.h> 28#include <asm/trace.h>
29#include <asm/uaccess.h> 29#include <asm/uaccess.h>
30#include <asm/ultravisor.h>
30 31
31#include <trace/events/thp.h> 32#include <trace/events/thp.h>
32 33
33unsigned int mmu_pid_bits; 34unsigned int mmu_pid_bits;
34unsigned int mmu_base_pid; 35unsigned int mmu_base_pid;
35 36
36static int native_register_process_table(unsigned long base, unsigned long pg_sz,
37 unsigned long table_size)
38{
39 unsigned long patb0, patb1;
40
41 patb0 = be64_to_cpu(partition_tb[0].patb0);
42 patb1 = base | table_size | PATB_GR;
43
44 mmu_partition_table_set_entry(0, patb0, patb1);
45
46 return 0;
47}
48
49static __ref void *early_alloc_pgtable(unsigned long size, int nid, 37static __ref void *early_alloc_pgtable(unsigned long size, int nid,
50 unsigned long region_start, unsigned long region_end) 38 unsigned long region_start, unsigned long region_end)
51{ 39{
@@ -380,18 +368,6 @@ static void __init radix_init_pgtable(void)
380 */ 368 */
381 rts_field = radix__get_tree_size(); 369 rts_field = radix__get_tree_size();
382 process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE); 370 process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);
383 /*
384 * Fill in the partition table. We are suppose to use effective address
385 * of process table here. But our linear mapping also enable us to use
386 * physical address here.
387 */
388 register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12);
389 pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd);
390 asm volatile("ptesync" : : : "memory");
391 asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
392 "r" (TLBIEL_INVAL_SET_LPID), "r" (0));
393 asm volatile("eieio; tlbsync; ptesync" : : : "memory");
394 trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1);
395 371
396 /* 372 /*
397 * The init_mm context is given the first available (non-zero) PID, 373 * The init_mm context is given the first available (non-zero) PID,
@@ -412,20 +388,15 @@ static void __init radix_init_pgtable(void)
412 388
413static void __init radix_init_partition_table(void) 389static void __init radix_init_partition_table(void)
414{ 390{
415 unsigned long rts_field, dw0; 391 unsigned long rts_field, dw0, dw1;
416 392
417 mmu_partition_table_init(); 393 mmu_partition_table_init();
418 rts_field = radix__get_tree_size(); 394 rts_field = radix__get_tree_size();
419 dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR; 395 dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
420 mmu_partition_table_set_entry(0, dw0, 0); 396 dw1 = __pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR;
397 mmu_partition_table_set_entry(0, dw0, dw1, false);
421 398
422 pr_info("Initializing Radix MMU\n"); 399 pr_info("Initializing Radix MMU\n");
423 pr_info("Partition table %p\n", partition_tb);
424}
425
426void __init radix_init_native(void)
427{
428 register_process_table = native_register_process_table;
429} 400}
430 401
431static int __init get_idx_from_shift(unsigned int shift) 402static int __init get_idx_from_shift(unsigned int shift)
@@ -621,8 +592,9 @@ void __init radix__early_init_mmu(void)
621 __pmd_frag_nr = RADIX_PMD_FRAG_NR; 592 __pmd_frag_nr = RADIX_PMD_FRAG_NR;
622 __pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT; 593 __pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;
623 594
595 radix_init_pgtable();
596
624 if (!firmware_has_feature(FW_FEATURE_LPAR)) { 597 if (!firmware_has_feature(FW_FEATURE_LPAR)) {
625 radix_init_native();
626 lpcr = mfspr(SPRN_LPCR); 598 lpcr = mfspr(SPRN_LPCR);
627 mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); 599 mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
628 radix_init_partition_table(); 600 radix_init_partition_table();
@@ -633,11 +605,9 @@ void __init radix__early_init_mmu(void)
633 605
634 memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); 606 memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
635 607
636 radix_init_pgtable();
637 /* Switch to the guard PID before turning on MMU */ 608 /* Switch to the guard PID before turning on MMU */
638 radix__switch_mmu_context(NULL, &init_mm); 609 radix__switch_mmu_context(NULL, &init_mm);
639 if (cpu_has_feature(CPU_FTR_HVMODE)) 610 tlbiel_all();
640 tlbiel_all();
641} 611}
642 612
643void radix__early_init_mmu_secondary(void) 613void radix__early_init_mmu_secondary(void)
@@ -650,14 +620,14 @@ void radix__early_init_mmu_secondary(void)
650 lpcr = mfspr(SPRN_LPCR); 620 lpcr = mfspr(SPRN_LPCR);
651 mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); 621 mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
652 622
653 mtspr(SPRN_PTCR, 623 set_ptcr_when_no_uv(__pa(partition_tb) |
654 __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); 624 (PATB_SIZE_SHIFT - 12));
625
655 radix_init_amor(); 626 radix_init_amor();
656 } 627 }
657 628
658 radix__switch_mmu_context(NULL, &init_mm); 629 radix__switch_mmu_context(NULL, &init_mm);
659 if (cpu_has_feature(CPU_FTR_HVMODE)) 630 tlbiel_all();
660 tlbiel_all();
661} 631}
662 632
663void radix__mmu_cleanup_all(void) 633void radix__mmu_cleanup_all(void)
@@ -667,7 +637,7 @@ void radix__mmu_cleanup_all(void)
667 if (!firmware_has_feature(FW_FEATURE_LPAR)) { 637 if (!firmware_has_feature(FW_FEATURE_LPAR)) {
668 lpcr = mfspr(SPRN_LPCR); 638 lpcr = mfspr(SPRN_LPCR);
669 mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT); 639 mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
670 mtspr(SPRN_PTCR, 0); 640 set_ptcr_when_no_uv(0);
671 powernv_set_nmmu_ptcr(0); 641 powernv_set_nmmu_ptcr(0);
672 radix__flush_tlb_all(); 642 radix__flush_tlb_all();
673 } 643 }
@@ -737,8 +707,8 @@ static int __meminit stop_machine_change_mapping(void *data)
737 707
738 spin_unlock(&init_mm.page_table_lock); 708 spin_unlock(&init_mm.page_table_lock);
739 pte_clear(&init_mm, params->aligned_start, params->pte); 709 pte_clear(&init_mm, params->aligned_start, params->pte);
740 create_physical_mapping(params->aligned_start, params->start, -1); 710 create_physical_mapping(__pa(params->aligned_start), __pa(params->start), -1);
741 create_physical_mapping(params->end, params->aligned_end, -1); 711 create_physical_mapping(__pa(params->end), __pa(params->aligned_end), -1);
742 spin_lock(&init_mm.page_table_lock); 712 spin_lock(&init_mm.page_table_lock);
743 return 0; 713 return 0;
744} 714}
@@ -902,7 +872,7 @@ int __meminit radix__create_section_mapping(unsigned long start, unsigned long e
902 return -1; 872 return -1;
903 } 873 }
904 874
905 return create_physical_mapping(start, end, nid); 875 return create_physical_mapping(__pa(start), __pa(end), nid);
906} 876}
907 877
908int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end) 878int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
@@ -1218,26 +1188,6 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
1218 return 1; 1188 return 1;
1219} 1189}
1220 1190
1221int radix__ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size,
1222 pgprot_t prot, int nid)
1223{
1224 if (likely(slab_is_available())) {
1225 int err = ioremap_page_range(ea, ea + size, pa, prot);
1226 if (err)
1227 unmap_kernel_range(ea, size);
1228 return err;
1229 } else {
1230 unsigned long i;
1231
1232 for (i = 0; i < size; i += PAGE_SIZE) {
1233 int err = map_kernel_page(ea + i, pa + i, prot);
1234 if (WARN_ON_ONCE(err)) /* Should clean up */
1235 return err;
1236 }
1237 return 0;
1238 }
1239}
1240
1241int __init arch_ioremap_p4d_supported(void) 1191int __init arch_ioremap_p4d_supported(void)
1242{ 1192{
1243 return 0; 1193 return 0;
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index 71f7fede2fa4..631be42abd33 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -51,11 +51,15 @@ static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is)
51 * and partition table entries. Then flush the remaining sets of the 51 * and partition table entries. Then flush the remaining sets of the
52 * TLB. 52 * TLB.
53 */ 53 */
54 tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0);
55 for (set = 1; set < num_sets; set++)
56 tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0);
57 54
58 /* Do the same for process scoped entries. */ 55 if (early_cpu_has_feature(CPU_FTR_HVMODE)) {
56 /* MSR[HV] should flush partition scope translations first. */
57 tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0);
58 for (set = 1; set < num_sets; set++)
59 tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0);
60 }
61
62 /* Flush process scoped entries. */
59 tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1); 63 tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1);
60 for (set = 1; set < num_sets; set++) 64 for (set = 1; set < num_sets; set++)
61 tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1); 65 tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1);
@@ -116,22 +120,6 @@ static __always_inline void __tlbie_pid(unsigned long pid, unsigned long ric)
116 trace_tlbie(0, 0, rb, rs, ric, prs, r); 120 trace_tlbie(0, 0, rb, rs, ric, prs, r);
117} 121}
118 122
119static __always_inline void __tlbiel_lpid(unsigned long lpid, int set,
120 unsigned long ric)
121{
122 unsigned long rb,rs,prs,r;
123
124 rb = PPC_BIT(52); /* IS = 2 */
125 rb |= set << PPC_BITLSHIFT(51);
126 rs = 0; /* LPID comes from LPIDR */
127 prs = 0; /* partition scoped */
128 r = 1; /* radix format */
129
130 asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
131 : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
132 trace_tlbie(lpid, 1, rb, rs, ric, prs, r);
133}
134
135static __always_inline void __tlbie_lpid(unsigned long lpid, unsigned long ric) 123static __always_inline void __tlbie_lpid(unsigned long lpid, unsigned long ric)
136{ 124{
137 unsigned long rb,rs,prs,r; 125 unsigned long rb,rs,prs,r;
@@ -146,23 +134,20 @@ static __always_inline void __tlbie_lpid(unsigned long lpid, unsigned long ric)
146 trace_tlbie(lpid, 0, rb, rs, ric, prs, r); 134 trace_tlbie(lpid, 0, rb, rs, ric, prs, r);
147} 135}
148 136
149static __always_inline void __tlbiel_lpid_guest(unsigned long lpid, int set, 137static __always_inline void __tlbie_lpid_guest(unsigned long lpid, unsigned long ric)
150 unsigned long ric)
151{ 138{
152 unsigned long rb,rs,prs,r; 139 unsigned long rb,rs,prs,r;
153 140
154 rb = PPC_BIT(52); /* IS = 2 */ 141 rb = PPC_BIT(52); /* IS = 2 */
155 rb |= set << PPC_BITLSHIFT(51); 142 rs = lpid;
156 rs = 0; /* LPID comes from LPIDR */
157 prs = 1; /* process scoped */ 143 prs = 1; /* process scoped */
158 r = 1; /* radix format */ 144 r = 1; /* radix format */
159 145
160 asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) 146 asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
161 : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); 147 : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
162 trace_tlbie(lpid, 1, rb, rs, ric, prs, r); 148 trace_tlbie(lpid, 0, rb, rs, ric, prs, r);
163} 149}
164 150
165
166static __always_inline void __tlbiel_va(unsigned long va, unsigned long pid, 151static __always_inline void __tlbiel_va(unsigned long va, unsigned long pid,
167 unsigned long ap, unsigned long ric) 152 unsigned long ap, unsigned long ric)
168{ 153{
@@ -285,32 +270,37 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
285 asm volatile("eieio; tlbsync; ptesync": : :"memory"); 270 asm volatile("eieio; tlbsync; ptesync": : :"memory");
286} 271}
287 272
288static inline void _tlbiel_lpid(unsigned long lpid, unsigned long ric) 273struct tlbiel_pid {
274 unsigned long pid;
275 unsigned long ric;
276};
277
278static void do_tlbiel_pid(void *info)
289{ 279{
290 int set; 280 struct tlbiel_pid *t = info;
291 281
292 VM_BUG_ON(mfspr(SPRN_LPID) != lpid); 282 if (t->ric == RIC_FLUSH_TLB)
283 _tlbiel_pid(t->pid, RIC_FLUSH_TLB);
284 else if (t->ric == RIC_FLUSH_PWC)
285 _tlbiel_pid(t->pid, RIC_FLUSH_PWC);
286 else
287 _tlbiel_pid(t->pid, RIC_FLUSH_ALL);
288}
293 289
294 asm volatile("ptesync": : :"memory"); 290static inline void _tlbiel_pid_multicast(struct mm_struct *mm,
291 unsigned long pid, unsigned long ric)
292{
293 struct cpumask *cpus = mm_cpumask(mm);
294 struct tlbiel_pid t = { .pid = pid, .ric = ric };
295 295
296 on_each_cpu_mask(cpus, do_tlbiel_pid, &t, 1);
296 /* 297 /*
297 * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL, 298 * Always want the CPU translations to be invalidated with tlbiel in
298 * also flush the entire Page Walk Cache. 299 * these paths, so while coprocessors must use tlbie, we can not
300 * optimise away the tlbiel component.
299 */ 301 */
300 __tlbiel_lpid(lpid, 0, ric); 302 if (atomic_read(&mm->context.copros) > 0)
301 303 _tlbie_pid(pid, RIC_FLUSH_ALL);
302 /* For PWC, only one flush is needed */
303 if (ric == RIC_FLUSH_PWC) {
304 asm volatile("ptesync": : :"memory");
305 return;
306 }
307
308 /* For the remaining sets, just flush the TLB */
309 for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
310 __tlbiel_lpid(lpid, set, RIC_FLUSH_TLB);
311
312 asm volatile("ptesync": : :"memory");
313 asm volatile(PPC_RADIX_INVALIDATE_ERAT_GUEST "; isync" : : :"memory");
314} 304}
315 305
316static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric) 306static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric)
@@ -337,35 +327,28 @@ static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric)
337 asm volatile("eieio; tlbsync; ptesync": : :"memory"); 327 asm volatile("eieio; tlbsync; ptesync": : :"memory");
338} 328}
339 329
340static __always_inline void _tlbiel_lpid_guest(unsigned long lpid, unsigned long ric) 330static __always_inline void _tlbie_lpid_guest(unsigned long lpid, unsigned long ric)
341{ 331{
342 int set;
343
344 VM_BUG_ON(mfspr(SPRN_LPID) != lpid);
345
346 asm volatile("ptesync": : :"memory");
347
348 /* 332 /*
349 * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL, 333 * Workaround the fact that the "ric" argument to __tlbie_pid
350  * also flush the entire Page Walk Cache.	334	 * must be a compile-time constraint to match the "i" constraint
335 * in the asm statement.
351 */ 336 */
352 __tlbiel_lpid_guest(lpid, 0, ric); 337 switch (ric) {
353 338 case RIC_FLUSH_TLB:
354 /* For PWC, only one flush is needed */ 339 __tlbie_lpid_guest(lpid, RIC_FLUSH_TLB);
355 if (ric == RIC_FLUSH_PWC) { 340 break;
356 asm volatile("ptesync": : :"memory"); 341 case RIC_FLUSH_PWC:
357 return; 342 __tlbie_lpid_guest(lpid, RIC_FLUSH_PWC);
343 break;
344 case RIC_FLUSH_ALL:
345 default:
346 __tlbie_lpid_guest(lpid, RIC_FLUSH_ALL);
358 } 347 }
359 348 fixup_tlbie_lpid(lpid);
360 /* For the remaining sets, just flush the TLB */ 349 asm volatile("eieio; tlbsync; ptesync": : :"memory");
361 for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
362 __tlbiel_lpid_guest(lpid, set, RIC_FLUSH_TLB);
363
364 asm volatile("ptesync": : :"memory");
365 asm volatile(PPC_RADIX_INVALIDATE_ERAT_GUEST : : :"memory");
366} 350}
367 351
368
369static inline void __tlbiel_va_range(unsigned long start, unsigned long end, 352static inline void __tlbiel_va_range(unsigned long start, unsigned long end,
370 unsigned long pid, unsigned long page_size, 353 unsigned long pid, unsigned long page_size,
371 unsigned long psize) 354 unsigned long psize)
@@ -420,6 +403,53 @@ static __always_inline void _tlbie_va(unsigned long va, unsigned long pid,
420 asm volatile("eieio; tlbsync; ptesync": : :"memory"); 403 asm volatile("eieio; tlbsync; ptesync": : :"memory");
421} 404}
422 405
406struct tlbiel_va {
407 unsigned long pid;
408 unsigned long va;
409 unsigned long psize;
410 unsigned long ric;
411};
412
413static void do_tlbiel_va(void *info)
414{
415 struct tlbiel_va *t = info;
416
417 if (t->ric == RIC_FLUSH_TLB)
418 _tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_TLB);
419 else if (t->ric == RIC_FLUSH_PWC)
420 _tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_PWC);
421 else
422 _tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_ALL);
423}
424
425static inline void _tlbiel_va_multicast(struct mm_struct *mm,
426 unsigned long va, unsigned long pid,
427 unsigned long psize, unsigned long ric)
428{
429 struct cpumask *cpus = mm_cpumask(mm);
430 struct tlbiel_va t = { .va = va, .pid = pid, .psize = psize, .ric = ric };
431 on_each_cpu_mask(cpus, do_tlbiel_va, &t, 1);
432 if (atomic_read(&mm->context.copros) > 0)
433 _tlbie_va(va, pid, psize, RIC_FLUSH_TLB);
434}
435
436struct tlbiel_va_range {
437 unsigned long pid;
438 unsigned long start;
439 unsigned long end;
440 unsigned long page_size;
441 unsigned long psize;
442 bool also_pwc;
443};
444
445static void do_tlbiel_va_range(void *info)
446{
447 struct tlbiel_va_range *t = info;
448
449 _tlbiel_va_range(t->start, t->end, t->pid, t->page_size,
450 t->psize, t->also_pwc);
451}
452
423static __always_inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid, 453static __always_inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid,
424 unsigned long psize, unsigned long ric) 454 unsigned long psize, unsigned long ric)
425{ 455{
@@ -443,6 +473,21 @@ static inline void _tlbie_va_range(unsigned long start, unsigned long end,
443 asm volatile("eieio; tlbsync; ptesync": : :"memory"); 473 asm volatile("eieio; tlbsync; ptesync": : :"memory");
444} 474}
445 475
476static inline void _tlbiel_va_range_multicast(struct mm_struct *mm,
477 unsigned long start, unsigned long end,
478 unsigned long pid, unsigned long page_size,
479 unsigned long psize, bool also_pwc)
480{
481 struct cpumask *cpus = mm_cpumask(mm);
482 struct tlbiel_va_range t = { .start = start, .end = end,
483 .pid = pid, .page_size = page_size,
484 .psize = psize, .also_pwc = also_pwc };
485
486 on_each_cpu_mask(cpus, do_tlbiel_va_range, &t, 1);
487 if (atomic_read(&mm->context.copros) > 0)
488 _tlbie_va_range(start, end, pid, page_size, psize, also_pwc);
489}
490
446/* 491/*
447 * Base TLB flushing operations: 492 * Base TLB flushing operations:
448 * 493 *
@@ -580,10 +625,14 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
580 goto local; 625 goto local;
581 } 626 }
582 627
583 if (mm_needs_flush_escalation(mm)) 628 if (cputlb_use_tlbie()) {
584 _tlbie_pid(pid, RIC_FLUSH_ALL); 629 if (mm_needs_flush_escalation(mm))
585 else 630 _tlbie_pid(pid, RIC_FLUSH_ALL);
586 _tlbie_pid(pid, RIC_FLUSH_TLB); 631 else
632 _tlbie_pid(pid, RIC_FLUSH_TLB);
633 } else {
634 _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_TLB);
635 }
587 } else { 636 } else {
588local: 637local:
589 _tlbiel_pid(pid, RIC_FLUSH_TLB); 638 _tlbiel_pid(pid, RIC_FLUSH_TLB);
@@ -609,7 +658,10 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
609 goto local; 658 goto local;
610 } 659 }
611 } 660 }
612 _tlbie_pid(pid, RIC_FLUSH_ALL); 661 if (cputlb_use_tlbie())
662 _tlbie_pid(pid, RIC_FLUSH_ALL);
663 else
664 _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL);
613 } else { 665 } else {
614local: 666local:
615 _tlbiel_pid(pid, RIC_FLUSH_ALL); 667 _tlbiel_pid(pid, RIC_FLUSH_ALL);
@@ -644,7 +696,10 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
644 exit_flush_lazy_tlbs(mm); 696 exit_flush_lazy_tlbs(mm);
645 goto local; 697 goto local;
646 } 698 }
647 _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB); 699 if (cputlb_use_tlbie())
700 _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
701 else
702 _tlbiel_va_multicast(mm, vmaddr, pid, psize, RIC_FLUSH_TLB);
648 } else { 703 } else {
649local: 704local:
650 _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); 705 _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
@@ -666,6 +721,24 @@ EXPORT_SYMBOL(radix__flush_tlb_page);
666#define radix__flush_all_mm radix__local_flush_all_mm 721#define radix__flush_all_mm radix__local_flush_all_mm
667#endif /* CONFIG_SMP */ 722#endif /* CONFIG_SMP */
668 723
724static void do_tlbiel_kernel(void *info)
725{
726 _tlbiel_pid(0, RIC_FLUSH_ALL);
727}
728
729static inline void _tlbiel_kernel_broadcast(void)
730{
731 on_each_cpu(do_tlbiel_kernel, NULL, 1);
732 if (tlbie_capable) {
733 /*
734 * Coherent accelerators don't refcount kernel memory mappings,
735 * so have to always issue a tlbie for them. This is quite a
736 * slow path anyway.
737 */
738 _tlbie_pid(0, RIC_FLUSH_ALL);
739 }
740}
741
669/* 742/*
670 * If kernel TLBIs ever become local rather than global, then 743 * If kernel TLBIs ever become local rather than global, then
671 * drivers/misc/ocxl/link.c:ocxl_link_add_pe will need some work, as it 744 * drivers/misc/ocxl/link.c:ocxl_link_add_pe will need some work, as it
@@ -673,7 +746,10 @@ EXPORT_SYMBOL(radix__flush_tlb_page);
673 */ 746 */
674void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end) 747void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
675{ 748{
676 _tlbie_pid(0, RIC_FLUSH_ALL); 749 if (cputlb_use_tlbie())
750 _tlbie_pid(0, RIC_FLUSH_ALL);
751 else
752 _tlbiel_kernel_broadcast();
677} 753}
678EXPORT_SYMBOL(radix__flush_tlb_kernel_range); 754EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
679 755
@@ -729,10 +805,14 @@ is_local:
729 if (local) { 805 if (local) {
730 _tlbiel_pid(pid, RIC_FLUSH_TLB); 806 _tlbiel_pid(pid, RIC_FLUSH_TLB);
731 } else { 807 } else {
732 if (mm_needs_flush_escalation(mm)) 808 if (cputlb_use_tlbie()) {
733 _tlbie_pid(pid, RIC_FLUSH_ALL); 809 if (mm_needs_flush_escalation(mm))
734 else 810 _tlbie_pid(pid, RIC_FLUSH_ALL);
735 _tlbie_pid(pid, RIC_FLUSH_TLB); 811 else
812 _tlbie_pid(pid, RIC_FLUSH_TLB);
813 } else {
814 _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_TLB);
815 }
736 } 816 }
737 } else { 817 } else {
738 bool hflush = flush_all_sizes; 818 bool hflush = flush_all_sizes;
@@ -757,8 +837,8 @@ is_local:
757 gflush = false; 837 gflush = false;
758 } 838 }
759 839
760 asm volatile("ptesync": : :"memory");
761 if (local) { 840 if (local) {
841 asm volatile("ptesync": : :"memory");
762 __tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize); 842 __tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize);
763 if (hflush) 843 if (hflush)
764 __tlbiel_va_range(hstart, hend, pid, 844 __tlbiel_va_range(hstart, hend, pid,
@@ -767,7 +847,8 @@ is_local:
767 __tlbiel_va_range(gstart, gend, pid, 847 __tlbiel_va_range(gstart, gend, pid,
768 PUD_SIZE, MMU_PAGE_1G); 848 PUD_SIZE, MMU_PAGE_1G);
769 asm volatile("ptesync": : :"memory"); 849 asm volatile("ptesync": : :"memory");
770 } else { 850 } else if (cputlb_use_tlbie()) {
851 asm volatile("ptesync": : :"memory");
771 __tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize); 852 __tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize);
772 if (hflush) 853 if (hflush)
773 __tlbie_va_range(hstart, hend, pid, 854 __tlbie_va_range(hstart, hend, pid,
@@ -777,6 +858,15 @@ is_local:
777 PUD_SIZE, MMU_PAGE_1G); 858 PUD_SIZE, MMU_PAGE_1G);
778 fixup_tlbie(); 859 fixup_tlbie();
779 asm volatile("eieio; tlbsync; ptesync": : :"memory"); 860 asm volatile("eieio; tlbsync; ptesync": : :"memory");
861 } else {
862 _tlbiel_va_range_multicast(mm,
863 start, end, pid, page_size, mmu_virtual_psize, false);
864 if (hflush)
865 _tlbiel_va_range_multicast(mm,
866 hstart, hend, pid, PMD_SIZE, MMU_PAGE_2M, false);
867 if (gflush)
868 _tlbiel_va_range_multicast(mm,
869 gstart, gend, pid, PUD_SIZE, MMU_PAGE_1G, false);
780 } 870 }
781 } 871 }
782 preempt_enable(); 872 preempt_enable();
@@ -835,32 +925,19 @@ EXPORT_SYMBOL_GPL(radix__flush_pwc_lpid);
835/* 925/*
836 * Flush partition scoped translations from LPID (=LPIDR) 926 * Flush partition scoped translations from LPID (=LPIDR)
837 */ 927 */
838void radix__flush_tlb_lpid(unsigned int lpid) 928void radix__flush_all_lpid(unsigned int lpid)
839{ 929{
840 _tlbie_lpid(lpid, RIC_FLUSH_ALL); 930 _tlbie_lpid(lpid, RIC_FLUSH_ALL);
841} 931}
842EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid); 932EXPORT_SYMBOL_GPL(radix__flush_all_lpid);
843 933
844/* 934/*
845 * Flush partition scoped translations from LPID (=LPIDR) 935 * Flush process scoped translations from LPID (=LPIDR)
846 */ 936 */
847void radix__local_flush_tlb_lpid(unsigned int lpid) 937void radix__flush_all_lpid_guest(unsigned int lpid)
848{ 938{
849 _tlbiel_lpid(lpid, RIC_FLUSH_ALL); 939 _tlbie_lpid_guest(lpid, RIC_FLUSH_ALL);
850} 940}
851EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid);
852
853/*
854 * Flush process scoped translations from LPID (=LPIDR).
855 * Important difference, the guest normally manages its own translations,
856 * but some cases e.g., vCPU CPU migration require KVM to flush.
857 */
858void radix__local_flush_tlb_lpid_guest(unsigned int lpid)
859{
860 _tlbiel_lpid_guest(lpid, RIC_FLUSH_ALL);
861}
862EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid_guest);
863
864 941
865static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start, 942static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
866 unsigned long end, int psize); 943 unsigned long end, int psize);
@@ -966,16 +1043,26 @@ is_local:
966 if (local) { 1043 if (local) {
967 _tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); 1044 _tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
968 } else { 1045 } else {
969 if (mm_needs_flush_escalation(mm)) 1046 if (cputlb_use_tlbie()) {
970 also_pwc = true; 1047 if (mm_needs_flush_escalation(mm))
1048 also_pwc = true;
1049
1050 _tlbie_pid(pid,
1051 also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
1052 } else {
1053 _tlbiel_pid_multicast(mm, pid,
1054 also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
1055 }
971 1056
972 _tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
973 } 1057 }
974 } else { 1058 } else {
975 if (local) 1059 if (local)
976 _tlbiel_va_range(start, end, pid, page_size, psize, also_pwc); 1060 _tlbiel_va_range(start, end, pid, page_size, psize, also_pwc);
977 else 1061 else if (cputlb_use_tlbie())
978 _tlbie_va_range(start, end, pid, page_size, psize, also_pwc); 1062 _tlbie_va_range(start, end, pid, page_size, psize, also_pwc);
1063 else
1064 _tlbiel_va_range_multicast(mm,
1065 start, end, pid, page_size, psize, also_pwc);
979 } 1066 }
980 preempt_enable(); 1067 preempt_enable();
981} 1068}
@@ -1017,7 +1104,11 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
1017 exit_flush_lazy_tlbs(mm); 1104 exit_flush_lazy_tlbs(mm);
1018 goto local; 1105 goto local;
1019 } 1106 }
1020 _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); 1107 if (cputlb_use_tlbie())
1108 _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
1109 else
1110 _tlbiel_va_range_multicast(mm,
1111 addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
1021 } else { 1112 } else {
1022local: 1113local:
1023 _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); 1114 _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
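
The radix_tlb.c changes above add a second remote-invalidation strategy: when tlbie is not usable (cputlb_use_tlbie() returns false), the flush is broadcast as an IPI so that every CPU in the mm's cpumask runs a local tlbiel, and a single global tlbie is issued only if coherent accelerators (mm->context.copros) still reference the context. A minimal sketch of that broadcast shape, with illustrative names (flush_one_cpu(), struct flush_arg and local_flush_for_pid() are stand-ins, not kernel symbols):

#include <linux/smp.h>
#include <linux/mm_types.h>

struct flush_arg {
	unsigned long pid;
};

/* Stand-in for _tlbiel_pid(): a purely CPU-local invalidation. */
static void local_flush_for_pid(unsigned long pid)
{
	/* tlbiel sequence elided in this sketch */
}

static void flush_one_cpu(void *info)
{
	struct flush_arg *arg = info;

	/* Runs on each CPU in the mask with interrupts disabled, so a
	 * local-only tlbiel is sufficient here. */
	local_flush_for_pid(arg->pid);
}

static void flush_pid_on_all_users(struct mm_struct *mm, unsigned long pid)
{
	struct flush_arg arg = { .pid = pid };

	/* wait == 1: do not return until every targeted CPU has flushed. */
	on_each_cpu_mask(mm_cpumask(mm), flush_one_cpu, &arg, 1);
}

The real helpers additionally dispatch on the RIC value inside the callback, because the tlbiel asm takes it through an "i" constraint and so needs a compile-time constant.
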
diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c
index c617282d5b2a..2a82984356f8 100644
--- a/arch/powerpc/mm/dma-noncoherent.c
+++ b/arch/powerpc/mm/dma-noncoherent.c
@@ -4,310 +4,18 @@
4 * Copyright (C) 2001 Dan Malek (dmalek@jlc.net) 4 * Copyright (C) 2001 Dan Malek (dmalek@jlc.net)
5 * 5 *
6 * Copyright (C) 2000 Russell King 6 * Copyright (C) 2000 Russell King
7 *
8 * Consistent memory allocators. Used for DMA devices that want to
9 * share uncached memory with the processor core. The function return
10 * is the virtual address and 'dma_handle' is the physical address.
11 * Mostly stolen from the ARM port, with some changes for PowerPC.
12 * -- Dan
13 *
14 * Reorganized to get rid of the arch-specific consistent_* functions
15 * and provide non-coherent implementations for the DMA API. -Matt
16 *
17 * Added in_interrupt() safe dma_alloc_coherent()/dma_free_coherent()
18 * implementation. This is pulled straight from ARM and barely
19 * modified. -Matt
20 */ 7 */
21 8
22#include <linux/sched.h>
23#include <linux/slab.h>
24#include <linux/kernel.h> 9#include <linux/kernel.h>
25#include <linux/errno.h> 10#include <linux/errno.h>
26#include <linux/string.h>
27#include <linux/types.h> 11#include <linux/types.h>
28#include <linux/highmem.h> 12#include <linux/highmem.h>
29#include <linux/dma-direct.h> 13#include <linux/dma-direct.h>
30#include <linux/dma-noncoherent.h> 14#include <linux/dma-noncoherent.h>
31#include <linux/export.h>
32 15
33#include <asm/tlbflush.h> 16#include <asm/tlbflush.h>
34#include <asm/dma.h> 17#include <asm/dma.h>
35 18
36#include <mm/mmu_decl.h>
37
38/*
39 * This address range defaults to a value that is safe for all
40 * platforms which currently set CONFIG_NOT_COHERENT_CACHE. It
41 * can be further configured for specific applications under
42 * the "Advanced Setup" menu. -Matt
43 */
44#define CONSISTENT_BASE (IOREMAP_TOP)
45#define CONSISTENT_END (CONSISTENT_BASE + CONFIG_CONSISTENT_SIZE)
46#define CONSISTENT_OFFSET(x) (((unsigned long)(x) - CONSISTENT_BASE) >> PAGE_SHIFT)
47
48/*
49 * This is the page table (2MB) covering uncached, DMA consistent allocations
50 */
51static DEFINE_SPINLOCK(consistent_lock);
52
53/*
54 * VM region handling support.
55 *
56 * This should become something generic, handling VM region allocations for
57 * vmalloc and similar (ioremap, module space, etc).
58 *
59 * I envisage vmalloc()'s supporting vm_struct becoming:
60 *
61 * struct vm_struct {
62 * struct vm_region region;
63 * unsigned long flags;
64 * struct page **pages;
65 * unsigned int nr_pages;
66 * unsigned long phys_addr;
67 * };
68 *
69 * get_vm_area() would then call vm_region_alloc with an appropriate
70 * struct vm_region head (eg):
71 *
72 * struct vm_region vmalloc_head = {
73 * .vm_list = LIST_HEAD_INIT(vmalloc_head.vm_list),
74 * .vm_start = VMALLOC_START,
75 * .vm_end = VMALLOC_END,
76 * };
77 *
78 * However, vmalloc_head.vm_start is variable (typically, it is dependent on
79 * the amount of RAM found at boot time.) I would imagine that get_vm_area()
80 * would have to initialise this each time prior to calling vm_region_alloc().
81 */
82struct ppc_vm_region {
83 struct list_head vm_list;
84 unsigned long vm_start;
85 unsigned long vm_end;
86};
87
88static struct ppc_vm_region consistent_head = {
89 .vm_list = LIST_HEAD_INIT(consistent_head.vm_list),
90 .vm_start = CONSISTENT_BASE,
91 .vm_end = CONSISTENT_END,
92};
93
94static struct ppc_vm_region *
95ppc_vm_region_alloc(struct ppc_vm_region *head, size_t size, gfp_t gfp)
96{
97 unsigned long addr = head->vm_start, end = head->vm_end - size;
98 unsigned long flags;
99 struct ppc_vm_region *c, *new;
100
101 new = kmalloc(sizeof(struct ppc_vm_region), gfp);
102 if (!new)
103 goto out;
104
105 spin_lock_irqsave(&consistent_lock, flags);
106
107 list_for_each_entry(c, &head->vm_list, vm_list) {
108 if ((addr + size) < addr)
109 goto nospc;
110 if ((addr + size) <= c->vm_start)
111 goto found;
112 addr = c->vm_end;
113 if (addr > end)
114 goto nospc;
115 }
116
117 found:
118 /*
119 * Insert this entry _before_ the one we found.
120 */
121 list_add_tail(&new->vm_list, &c->vm_list);
122 new->vm_start = addr;
123 new->vm_end = addr + size;
124
125 spin_unlock_irqrestore(&consistent_lock, flags);
126 return new;
127
128 nospc:
129 spin_unlock_irqrestore(&consistent_lock, flags);
130 kfree(new);
131 out:
132 return NULL;
133}
134
135static struct ppc_vm_region *ppc_vm_region_find(struct ppc_vm_region *head, unsigned long addr)
136{
137 struct ppc_vm_region *c;
138
139 list_for_each_entry(c, &head->vm_list, vm_list) {
140 if (c->vm_start == addr)
141 goto out;
142 }
143 c = NULL;
144 out:
145 return c;
146}
147
148/*
149 * Allocate DMA-coherent memory space and return both the kernel remapped
150 * virtual and bus address for that space.
151 */
152void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
153 gfp_t gfp, unsigned long attrs)
154{
155 struct page *page;
156 struct ppc_vm_region *c;
157 unsigned long order;
158 u64 mask = ISA_DMA_THRESHOLD, limit;
159
160 if (dev) {
161 mask = dev->coherent_dma_mask;
162
163 /*
164 * Sanity check the DMA mask - it must be non-zero, and
165 * must be able to be satisfied by a DMA allocation.
166 */
167 if (mask == 0) {
168 dev_warn(dev, "coherent DMA mask is unset\n");
169 goto no_page;
170 }
171
172 if ((~mask) & ISA_DMA_THRESHOLD) {
173 dev_warn(dev, "coherent DMA mask %#llx is smaller "
174 "than system GFP_DMA mask %#llx\n",
175 mask, (unsigned long long)ISA_DMA_THRESHOLD);
176 goto no_page;
177 }
178 }
179
180
181 size = PAGE_ALIGN(size);
182 limit = (mask + 1) & ~mask;
183 if ((limit && size >= limit) ||
184 size >= (CONSISTENT_END - CONSISTENT_BASE)) {
185 printk(KERN_WARNING "coherent allocation too big (requested %#x mask %#Lx)\n",
186 size, mask);
187 return NULL;
188 }
189
190 order = get_order(size);
191
192 /* Might be useful if we ever have a real legacy DMA zone... */
193 if (mask != 0xffffffff)
194 gfp |= GFP_DMA;
195
196 page = alloc_pages(gfp, order);
197 if (!page)
198 goto no_page;
199
200 /*
201 * Invalidate any data that might be lurking in the
202 * kernel direct-mapped region for device DMA.
203 */
204 {
205 unsigned long kaddr = (unsigned long)page_address(page);
206 memset(page_address(page), 0, size);
207 flush_dcache_range(kaddr, kaddr + size);
208 }
209
210 /*
211 * Allocate a virtual address in the consistent mapping region.
212 */
213 c = ppc_vm_region_alloc(&consistent_head, size,
214 gfp & ~(__GFP_DMA | __GFP_HIGHMEM));
215 if (c) {
216 unsigned long vaddr = c->vm_start;
217 struct page *end = page + (1 << order);
218
219 split_page(page, order);
220
221 /*
222 * Set the "dma handle"
223 */
224 *dma_handle = phys_to_dma(dev, page_to_phys(page));
225
226 do {
227 SetPageReserved(page);
228 map_kernel_page(vaddr, page_to_phys(page),
229 pgprot_noncached(PAGE_KERNEL));
230 page++;
231 vaddr += PAGE_SIZE;
232 } while (size -= PAGE_SIZE);
233
234 /*
235 * Free the otherwise unused pages.
236 */
237 while (page < end) {
238 __free_page(page);
239 page++;
240 }
241
242 return (void *)c->vm_start;
243 }
244
245 if (page)
246 __free_pages(page, order);
247 no_page:
248 return NULL;
249}
250
251/*
252 * free a page as defined by the above mapping.
253 */
254void arch_dma_free(struct device *dev, size_t size, void *vaddr,
255 dma_addr_t dma_handle, unsigned long attrs)
256{
257 struct ppc_vm_region *c;
258 unsigned long flags, addr;
259
260 size = PAGE_ALIGN(size);
261
262 spin_lock_irqsave(&consistent_lock, flags);
263
264 c = ppc_vm_region_find(&consistent_head, (unsigned long)vaddr);
265 if (!c)
266 goto no_area;
267
268 if ((c->vm_end - c->vm_start) != size) {
269 printk(KERN_ERR "%s: freeing wrong coherent size (%ld != %d)\n",
270 __func__, c->vm_end - c->vm_start, size);
271 dump_stack();
272 size = c->vm_end - c->vm_start;
273 }
274
275 addr = c->vm_start;
276 do {
277 pte_t *ptep;
278 unsigned long pfn;
279
280 ptep = pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(addr),
281 addr),
282 addr),
283 addr);
284 if (!pte_none(*ptep) && pte_present(*ptep)) {
285 pfn = pte_pfn(*ptep);
286 pte_clear(&init_mm, addr, ptep);
287 if (pfn_valid(pfn)) {
288 struct page *page = pfn_to_page(pfn);
289 __free_reserved_page(page);
290 }
291 }
292 addr += PAGE_SIZE;
293 } while (size -= PAGE_SIZE);
294
295 flush_tlb_kernel_range(c->vm_start, c->vm_end);
296
297 list_del(&c->vm_list);
298
299 spin_unlock_irqrestore(&consistent_lock, flags);
300
301 kfree(c);
302 return;
303
304 no_area:
305 spin_unlock_irqrestore(&consistent_lock, flags);
306 printk(KERN_ERR "%s: trying to free invalid coherent area: %p\n",
307 __func__, vaddr);
308 dump_stack();
309}
310
311/* 19/*
312 * make an area consistent. 20 * make an area consistent.
313 */ 21 */
@@ -408,23 +116,9 @@ void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr,
408 __dma_sync_page(paddr, size, dir); 116 __dma_sync_page(paddr, size, dir);
409} 117}
410 118
411/* 119void arch_dma_prep_coherent(struct page *page, size_t size)
412 * Return the PFN for a given cpu virtual address returned by arch_dma_alloc.
413 */
414long arch_dma_coherent_to_pfn(struct device *dev, void *vaddr,
415 dma_addr_t dma_addr)
416{ 120{
417 /* This should always be populated, so we don't test every 121 unsigned long kaddr = (unsigned long)page_address(page);
418 * level. If that fails, we'll have a nice crash which
419 * will be as good as a BUG_ON()
420 */
421 unsigned long cpu_addr = (unsigned long)vaddr;
422 pgd_t *pgd = pgd_offset_k(cpu_addr);
423 pud_t *pud = pud_offset(pgd, cpu_addr);
424 pmd_t *pmd = pmd_offset(pud, cpu_addr);
425 pte_t *ptep = pte_offset_kernel(pmd, cpu_addr);
426 122
427 if (pte_none(*ptep) || !pte_present(*ptep)) 123 flush_dcache_range(kaddr, kaddr + size);
428 return 0;
429 return pte_pfn(*ptep);
430} 124}
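
With the powerpc-specific consistent-memory allocator removed, non-coherent 32-bit platforms now rely on the generic DMA layer for allocation and non-cacheable remapping; the remaining arch hook is arch_dma_prep_coherent() above, which flushes the linear-map alias of the freshly allocated pages so stale dirty cache lines cannot later be written back over device data. From a driver's point of view nothing changes; a short usage sketch (device pointer, size and helpers are illustrative only):

#include <linux/dma-mapping.h>
#include <linux/sizes.h>

/* Sketch: allocate a coherent descriptor ring. On platforms with
 * non-coherent caches the generic allocator is expected to call
 * arch_dma_prep_coherent() internally before returning the buffer. */
static void *alloc_ring(struct device *dev, dma_addr_t *bus)
{
	return dma_alloc_coherent(dev, SZ_4K, bus, GFP_KERNEL);
}

static void free_ring(struct device *dev, void *ring, dma_addr_t bus)
{
	dma_free_coherent(dev, SZ_4K, ring, bus);
}
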
diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c
new file mode 100644
index 000000000000..fc669643ce6a
--- /dev/null
+++ b/arch/powerpc/mm/ioremap.c
@@ -0,0 +1,99 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
2
3#include <linux/io.h>
4#include <linux/slab.h>
5#include <linux/vmalloc.h>
6#include <asm/io-workarounds.h>
7
8unsigned long ioremap_bot;
9EXPORT_SYMBOL(ioremap_bot);
10
11void __iomem *ioremap(phys_addr_t addr, unsigned long size)
12{
13 pgprot_t prot = pgprot_noncached(PAGE_KERNEL);
14 void *caller = __builtin_return_address(0);
15
16 if (iowa_is_active())
17 return iowa_ioremap(addr, size, prot, caller);
18 return __ioremap_caller(addr, size, prot, caller);
19}
20EXPORT_SYMBOL(ioremap);
21
22void __iomem *ioremap_wc(phys_addr_t addr, unsigned long size)
23{
24 pgprot_t prot = pgprot_noncached_wc(PAGE_KERNEL);
25 void *caller = __builtin_return_address(0);
26
27 if (iowa_is_active())
28 return iowa_ioremap(addr, size, prot, caller);
29 return __ioremap_caller(addr, size, prot, caller);
30}
31EXPORT_SYMBOL(ioremap_wc);
32
33void __iomem *ioremap_coherent(phys_addr_t addr, unsigned long size)
34{
35 pgprot_t prot = pgprot_cached(PAGE_KERNEL);
36 void *caller = __builtin_return_address(0);
37
38 if (iowa_is_active())
39 return iowa_ioremap(addr, size, prot, caller);
40 return __ioremap_caller(addr, size, prot, caller);
41}
42
43void __iomem *ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags)
44{
45 pte_t pte = __pte(flags);
46 void *caller = __builtin_return_address(0);
47
48 /* writeable implies dirty for kernel addresses */
49 if (pte_write(pte))
50 pte = pte_mkdirty(pte);
51
52 /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
53 pte = pte_exprotect(pte);
54 pte = pte_mkprivileged(pte);
55
56 if (iowa_is_active())
57 return iowa_ioremap(addr, size, pte_pgprot(pte), caller);
58 return __ioremap_caller(addr, size, pte_pgprot(pte), caller);
59}
60EXPORT_SYMBOL(ioremap_prot);
61
62int early_ioremap_range(unsigned long ea, phys_addr_t pa,
63 unsigned long size, pgprot_t prot)
64{
65 unsigned long i;
66
67 for (i = 0; i < size; i += PAGE_SIZE) {
68 int err = map_kernel_page(ea + i, pa + i, prot);
69
70 if (WARN_ON_ONCE(err)) /* Should clean up */
71 return err;
72 }
73
74 return 0;
75}
76
77void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size,
78 pgprot_t prot, void *caller)
79{
80 struct vm_struct *area;
81 int ret;
82 unsigned long va;
83
84 area = __get_vm_area_caller(size, VM_IOREMAP, IOREMAP_START, IOREMAP_END, caller);
85 if (area == NULL)
86 return NULL;
87
88 area->phys_addr = pa;
89 va = (unsigned long)area->addr;
90
91 ret = ioremap_page_range(va, va + size, pa, prot);
92 if (!ret)
93 return (void __iomem *)area->addr + offset;
94
95 unmap_kernel_range(va, size);
96 free_vm_area(area);
97
98 return NULL;
99}
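
The new arch/powerpc/mm/ioremap.c gathers the prot-selection wrappers in one place: each variant builds a pgprot and defers to __ioremap_caller() (or the io-workarounds hook), do_ioremap() reserves a VM_IOREMAP area between IOREMAP_START and IOREMAP_END and maps it with ioremap_page_range(), and early_ioremap_range() covers the pre-slab case page by page. Callers are unaffected; a brief usage sketch (the physical address, length and register offset are made-up values):

#include <linux/io.h>

static u32 read_device_status(void)
{
	/* 0xfe000000/0x1000 are illustrative; a real driver would take
	 * them from the device tree or PCI resources. */
	void __iomem *regs = ioremap(0xfe000000, 0x1000);
	u32 status;

	if (!regs)
		return 0;

	status = readl(regs + 0x04);	/* hypothetical status register */
	iounmap(regs);
	return status;
}
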
diff --git a/arch/powerpc/mm/ioremap_32.c b/arch/powerpc/mm/ioremap_32.c
new file mode 100644
index 000000000000..f36121f25243
--- /dev/null
+++ b/arch/powerpc/mm/ioremap_32.c
@@ -0,0 +1,92 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
2
3#include <linux/io.h>
4#include <linux/slab.h>
5#include <linux/vmalloc.h>
6
7#include <mm/mmu_decl.h>
8
9void __iomem *ioremap_wt(phys_addr_t addr, unsigned long size)
10{
11 pgprot_t prot = pgprot_cached_wthru(PAGE_KERNEL);
12
13 return __ioremap_caller(addr, size, prot, __builtin_return_address(0));
14}
15EXPORT_SYMBOL(ioremap_wt);
16
17void __iomem *
18__ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *caller)
19{
20 unsigned long v;
21 phys_addr_t p, offset;
22 int err;
23
24 /*
25 * Choose an address to map it to.
26 * Once the vmalloc system is running, we use it.
27 * Before then, we use space going down from IOREMAP_TOP
28 * (ioremap_bot records where we're up to).
29 */
30 p = addr & PAGE_MASK;
31 offset = addr & ~PAGE_MASK;
32 size = PAGE_ALIGN(addr + size) - p;
33
34 /*
35 * If the address lies within the first 16 MB, assume it's in ISA
36 * memory space
37 */
38 if (p < 16 * 1024 * 1024)
39 p += _ISA_MEM_BASE;
40
41#ifndef CONFIG_CRASH_DUMP
42 /*
43 * Don't allow anybody to remap normal RAM that we're using.
44 * mem_init() sets high_memory so only do the check after that.
45 */
46 if (slab_is_available() && p <= virt_to_phys(high_memory - 1) &&
47 page_is_ram(__phys_to_pfn(p))) {
48 pr_warn("%s(): phys addr 0x%llx is RAM lr %ps\n", __func__,
49 (unsigned long long)p, __builtin_return_address(0));
50 return NULL;
51 }
52#endif
53
54 if (size == 0)
55 return NULL;
56
57 /*
58 * Is it already mapped? Perhaps overlapped by a previous
59 * mapping.
60 */
61 v = p_block_mapped(p);
62 if (v)
63 return (void __iomem *)v + offset;
64
65 if (slab_is_available())
66 return do_ioremap(p, offset, size, prot, caller);
67
68 /*
69 * Should check if it is a candidate for a BAT mapping
70 */
71
72 err = early_ioremap_range(ioremap_bot - size, p, size, prot);
73 if (err)
74 return NULL;
75 ioremap_bot -= size;
76
77 return (void __iomem *)ioremap_bot + offset;
78}
79
80void iounmap(volatile void __iomem *addr)
81{
82 /*
83 * If mapped by BATs then there is nothing to do.
84 * Calling vfree() generates a benign warning.
85 */
86 if (v_block_mapped((unsigned long)addr))
87 return;
88
89 if (addr > high_memory && (unsigned long)addr < ioremap_bot)
90 vunmap((void *)(PAGE_MASK & (unsigned long)addr));
91}
92EXPORT_SYMBOL(iounmap);
diff --git a/arch/powerpc/mm/ioremap_64.c b/arch/powerpc/mm/ioremap_64.c
new file mode 100644
index 000000000000..fd29e51700cd
--- /dev/null
+++ b/arch/powerpc/mm/ioremap_64.c
@@ -0,0 +1,113 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
2
3#include <linux/io.h>
4#include <linux/slab.h>
5#include <linux/vmalloc.h>
6
7/**
8 * Low level function to establish the page tables for an IO mapping
9 */
10void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_t prot)
11{
12 int ret;
13 unsigned long va = (unsigned long)ea;
14
15 /* We don't support the 4K PFN hack with ioremap */
16 if (pgprot_val(prot) & H_PAGE_4K_PFN)
17 return NULL;
18
19 if ((ea + size) >= (void *)IOREMAP_END) {
20 pr_warn("Outside the supported range\n");
21 return NULL;
22 }
23
24 WARN_ON(pa & ~PAGE_MASK);
25 WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
26 WARN_ON(size & ~PAGE_MASK);
27
28 if (slab_is_available()) {
29 ret = ioremap_page_range(va, va + size, pa, prot);
30 if (ret)
31 unmap_kernel_range(va, size);
32 } else {
33 ret = early_ioremap_range(va, pa, size, prot);
34 }
35
36 if (ret)
37 return NULL;
38
39 return (void __iomem *)ea;
40}
41EXPORT_SYMBOL(__ioremap_at);
42
43/**
44 * Low level function to tear down the page tables for an IO mapping. This is
45 * used for mappings that are manipulated manually, like partial unmapping of
46 * PCI IOs or ISA space.
47 */
48void __iounmap_at(void *ea, unsigned long size)
49{
50 WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
51 WARN_ON(size & ~PAGE_MASK);
52
53 unmap_kernel_range((unsigned long)ea, size);
54}
55EXPORT_SYMBOL(__iounmap_at);
56
57void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size,
58 pgprot_t prot, void *caller)
59{
60 phys_addr_t paligned, offset;
61 void __iomem *ret;
62 int err;
63
64 /* We don't support the 4K PFN hack with ioremap */
65 if (pgprot_val(prot) & H_PAGE_4K_PFN)
66 return NULL;
67
68 /*
69 * Choose an address to map it to. Once the vmalloc system is running,
70 * we use it. Before that, we map using addresses going up from
71 * ioremap_bot. vmalloc will use the addresses from IOREMAP_BASE
72 * through ioremap_bot.
73 */
74 paligned = addr & PAGE_MASK;
75 offset = addr & ~PAGE_MASK;
76 size = PAGE_ALIGN(addr + size) - paligned;
77
78 if (size == 0 || paligned == 0)
79 return NULL;
80
81 if (slab_is_available())
82 return do_ioremap(paligned, offset, size, prot, caller);
83
84 err = early_ioremap_range(ioremap_bot, paligned, size, prot);
85 if (err)
86 return NULL;
87
88 ret = (void __iomem *)ioremap_bot + offset;
89 ioremap_bot += size;
90
91 return ret;
92}
93
94/*
95 * Unmap an IO region and remove it from vmalloc'd list.
96 * Access to IO memory should be serialized by driver.
97 */
98void iounmap(volatile void __iomem *token)
99{
100 void *addr;
101
102 if (!slab_is_available())
103 return;
104
105 addr = (void *)((unsigned long __force)PCI_FIX_ADDR(token) & PAGE_MASK);
106
107 if ((unsigned long)addr < ioremap_bot) {
108 pr_warn("Attempt to iounmap early bolted mapping at 0x%p\n", addr);
109 return;
110 }
111 vunmap(addr);
112}
113EXPORT_SYMBOL(iounmap);
diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c
index 74f4555a62ba..802387b231ad 100644
--- a/arch/powerpc/mm/kasan/kasan_init_32.c
+++ b/arch/powerpc/mm/kasan/kasan_init_32.c
@@ -5,6 +5,7 @@
5#include <linux/kasan.h> 5#include <linux/kasan.h>
6#include <linux/printk.h> 6#include <linux/printk.h>
7#include <linux/memblock.h> 7#include <linux/memblock.h>
8#include <linux/moduleloader.h>
8#include <linux/sched/task.h> 9#include <linux/sched/task.h>
9#include <linux/vmalloc.h> 10#include <linux/vmalloc.h>
10#include <asm/pgalloc.h> 11#include <asm/pgalloc.h>
@@ -46,7 +47,19 @@ static int __ref kasan_init_shadow_page_tables(unsigned long k_start, unsigned l
46 kasan_populate_pte(new, PAGE_READONLY); 47 kasan_populate_pte(new, PAGE_READONLY);
47 else 48 else
48 kasan_populate_pte(new, PAGE_KERNEL_RO); 49 kasan_populate_pte(new, PAGE_KERNEL_RO);
49 pmd_populate_kernel(&init_mm, pmd, new); 50
51 smp_wmb(); /* See comment in __pte_alloc */
52
53 spin_lock(&init_mm.page_table_lock);
54 /* Has another populated it ? */
55 if (likely((void *)pmd_page_vaddr(*pmd) == kasan_early_shadow_pte)) {
56 pmd_populate_kernel(&init_mm, pmd, new);
57 new = NULL;
58 }
59 spin_unlock(&init_mm.page_table_lock);
60
61 if (new && slab_is_available())
62 pte_free_kernel(&init_mm, new);
50 } 63 }
51 return 0; 64 return 0;
52} 65}
@@ -74,7 +87,7 @@ static int __ref kasan_init_region(void *start, size_t size)
74 if (!slab_is_available()) 87 if (!slab_is_available())
75 block = memblock_alloc(k_end - k_start, PAGE_SIZE); 88 block = memblock_alloc(k_end - k_start, PAGE_SIZE);
76 89
77 for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE) { 90 for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) {
78 pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur); 91 pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur);
79 void *va = block ? block + k_cur - k_start : kasan_get_one_page(); 92 void *va = block ? block + k_cur - k_start : kasan_get_one_page();
80 pte_t pte = pfn_pte(PHYS_PFN(__pa(va)), PAGE_KERNEL); 93 pte_t pte = pfn_pte(PHYS_PFN(__pa(va)), PAGE_KERNEL);
@@ -137,7 +150,11 @@ void __init kasan_init(void)
137#ifdef CONFIG_MODULES 150#ifdef CONFIG_MODULES
138void *module_alloc(unsigned long size) 151void *module_alloc(unsigned long size)
139{ 152{
140 void *base = vmalloc_exec(size); 153 void *base;
154
155 base = __vmalloc_node_range(size, MODULE_ALIGN, VMALLOC_START, VMALLOC_END,
156 GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
157 NUMA_NO_NODE, __builtin_return_address(0));
141 158
142 if (!base) 159 if (!base)
143 return NULL; 160 return NULL;
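
The kasan_init_shadow_page_tables() hunk above adopts the usual populate-once discipline: allocate the new PTE page outside the lock, publish its contents with smp_wmb(), re-check under init_mm.page_table_lock whether another CPU already replaced the early shadow table, and free the spare page if the race was lost. A generic sketch of the pattern with illustrative names (early_table, shadow_table and table_lock are stand-ins, not kernel symbols):

#include <linux/spinlock.h>
#include <linux/slab.h>

static char early_table[64];			/* stands in for kasan_early_shadow_pte */
static void *shadow_table = early_table;	/* the entry being populated */
static DEFINE_SPINLOCK(table_lock);		/* stands in for init_mm.page_table_lock */

static int populate_once(void)
{
	void *new = kzalloc(sizeof(early_table), GFP_KERNEL);	/* allocate outside the lock */

	if (!new)
		return -ENOMEM;

	smp_wmb();	/* make the new table's contents visible before publishing it */

	spin_lock(&table_lock);
	if (shadow_table == early_table) {	/* has anyone else populated it yet? */
		shadow_table = new;
		new = NULL;
	}
	spin_unlock(&table_lock);

	kfree(new);	/* NULL-safe: only frees the spare page if we lost the race */
	return 0;
}
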
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 9191a66b3bc5..be941d382c8d 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -302,12 +302,9 @@ void __init mem_init(void)
302 pr_info(" * 0x%08lx..0x%08lx : highmem PTEs\n", 302 pr_info(" * 0x%08lx..0x%08lx : highmem PTEs\n",
303 PKMAP_BASE, PKMAP_ADDR(LAST_PKMAP)); 303 PKMAP_BASE, PKMAP_ADDR(LAST_PKMAP));
304#endif /* CONFIG_HIGHMEM */ 304#endif /* CONFIG_HIGHMEM */
305#ifdef CONFIG_NOT_COHERENT_CACHE 305 if (ioremap_bot != IOREMAP_TOP)
306 pr_info(" * 0x%08lx..0x%08lx : consistent mem\n", 306 pr_info(" * 0x%08lx..0x%08lx : early ioremap\n",
307 IOREMAP_TOP, IOREMAP_TOP + CONFIG_CONSISTENT_SIZE); 307 ioremap_bot, IOREMAP_TOP);
308#endif /* CONFIG_NOT_COHERENT_CACHE */
309 pr_info(" * 0x%08lx..0x%08lx : early ioremap\n",
310 ioremap_bot, IOREMAP_TOP);
311 pr_info(" * 0x%08lx..0x%08lx : vmalloc & ioremap\n", 308 pr_info(" * 0x%08lx..0x%08lx : vmalloc & ioremap\n",
312 VMALLOC_START, VMALLOC_END); 309 VMALLOC_START, VMALLOC_END);
313#endif /* CONFIG_PPC32 */ 310#endif /* CONFIG_PPC32 */
@@ -407,63 +404,6 @@ void flush_icache_user_range(struct vm_area_struct *vma, struct page *page,
407EXPORT_SYMBOL(flush_icache_user_range); 404EXPORT_SYMBOL(flush_icache_user_range);
408 405
409/* 406/*
410 * This is called at the end of handling a user page fault, when the
411 * fault has been handled by updating a PTE in the linux page tables.
412 * We use it to preload an HPTE into the hash table corresponding to
413 * the updated linux PTE.
414 *
415 * This must always be called with the pte lock held.
416 */
417void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
418 pte_t *ptep)
419{
420#ifdef CONFIG_PPC_BOOK3S
421 /*
422 * We don't need to worry about _PAGE_PRESENT here because we are
423 * called with either mm->page_table_lock held or ptl lock held
424 */
425 unsigned long trap;
426 bool is_exec;
427
428 if (radix_enabled()) {
429 prefetch((void *)address);
430 return;
431 }
432
433 /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
434 if (!pte_young(*ptep) || address >= TASK_SIZE)
435 return;
436
437 /* We try to figure out if we are coming from an instruction
438 * access fault and pass that down to __hash_page so we avoid
439 * double-faulting on execution of fresh text. We have to test
440 * for regs NULL since init will get here first thing at boot
441 *
442 * We also avoid filling the hash if not coming from a fault
443 */
444
445 trap = current->thread.regs ? TRAP(current->thread.regs) : 0UL;
446 switch (trap) {
447 case 0x300:
448 is_exec = false;
449 break;
450 case 0x400:
451 is_exec = true;
452 break;
453 default:
454 return;
455 }
456
457 hash_preload(vma->vm_mm, address, is_exec, trap);
458#endif /* CONFIG_PPC_BOOK3S */
459#if (defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_FSL_BOOK3E)) \
460 && defined(CONFIG_HUGETLB_PAGE)
461 if (is_vm_hugetlb_page(vma))
462 book3e_hugetlb_preload(vma, address, *ptep);
463#endif
464}
465
466/*
467 * System memory should not be in /proc/iomem but various tools expect it 407 * System memory should not be in /proc/iomem but various tools expect it
468 * (eg kdump). 408 * (eg kdump).
469 */ 409 */
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index 32c1a191c28a..c750ac9ec713 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -82,10 +82,6 @@ static inline void print_system_hash_info(void) {}
82 82
83#else /* CONFIG_PPC_MMU_NOHASH */ 83#else /* CONFIG_PPC_MMU_NOHASH */
84 84
85extern void hash_preload(struct mm_struct *mm, unsigned long ea,
86 bool is_exec, unsigned long trap);
87
88
89extern void _tlbie(unsigned long address); 85extern void _tlbie(unsigned long address);
90extern void _tlbia(void); 86extern void _tlbia(void);
91 87
@@ -95,6 +91,8 @@ void print_system_hash_info(void);
95 91
96#ifdef CONFIG_PPC32 92#ifdef CONFIG_PPC32
97 93
94void hash_preload(struct mm_struct *mm, unsigned long ea);
95
98extern void mapin_ram(void); 96extern void mapin_ram(void);
99extern void setbat(int index, unsigned long virt, phys_addr_t phys, 97extern void setbat(int index, unsigned long virt, phys_addr_t phys,
100 unsigned int size, pgprot_t prot); 98 unsigned int size, pgprot_t prot);
@@ -108,7 +106,6 @@ extern u8 early_hash[];
108 106
109#endif /* CONFIG_PPC32 */ 107#endif /* CONFIG_PPC32 */
110 108
111extern unsigned long ioremap_bot;
112extern unsigned long __max_low_memory; 109extern unsigned long __max_low_memory;
113extern phys_addr_t __initial_memory_limit_addr; 110extern phys_addr_t __initial_memory_limit_addr;
114extern phys_addr_t total_memory; 111extern phys_addr_t total_memory;
diff --git a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c
index 61915f4d3c7f..8b88be91b622 100644
--- a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c
+++ b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c
@@ -122,8 +122,8 @@ static inline int book3e_tlb_exists(unsigned long ea, unsigned long pid)
122 return found; 122 return found;
123} 123}
124 124
125void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, 125static void
126 pte_t pte) 126book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, pte_t pte)
127{ 127{
128 unsigned long mas1, mas2; 128 unsigned long mas1, mas2;
129 u64 mas7_3; 129 u64 mas7_3;
@@ -183,6 +183,18 @@ void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea,
183 local_irq_restore(flags); 183 local_irq_restore(flags);
184} 184}
185 185
186/*
187 * This is called at the end of handling a user page fault, when the
188 * fault has been handled by updating a PTE in the linux page tables.
189 *
190 * This must always be called with the pte lock held.
191 */
192void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep)
193{
194 if (is_vm_hugetlb_page(vma))
195 book3e_hugetlb_preload(vma, address, *ptep);
196}
197
186void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) 198void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
187{ 199{
188 struct hstate *hstate = hstate_file(vma->vm_file); 200 struct hstate *hstate = hstate_file(vma->vm_file);
diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c
index bf60983a58c7..696f568253a0 100644
--- a/arch/powerpc/mm/nohash/tlb.c
+++ b/arch/powerpc/mm/nohash/tlb.c
@@ -703,6 +703,8 @@ static void __init early_init_mmu_global(void)
703 * for use by the TLB miss code 703 * for use by the TLB miss code
704 */ 704 */
705 linear_map_top = memblock_end_of_DRAM(); 705 linear_map_top = memblock_end_of_DRAM();
706
707 ioremap_bot = IOREMAP_BASE;
706} 708}
707 709
708static void __init early_mmu_set_memory_limit(void) 710static void __init early_mmu_set_memory_limit(void)
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 35cb96cfc258..8ec5dfb65b2e 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -27,166 +27,13 @@
27#include <asm/pgtable.h> 27#include <asm/pgtable.h>
28#include <asm/pgalloc.h> 28#include <asm/pgalloc.h>
29#include <asm/fixmap.h> 29#include <asm/fixmap.h>
30#include <asm/io.h>
31#include <asm/setup.h> 30#include <asm/setup.h>
32#include <asm/sections.h> 31#include <asm/sections.h>
33 32
34#include <mm/mmu_decl.h> 33#include <mm/mmu_decl.h>
35 34
36unsigned long ioremap_bot;
37EXPORT_SYMBOL(ioremap_bot); /* aka VMALLOC_END */
38
39extern char etext[], _stext[], _sinittext[], _einittext[]; 35extern char etext[], _stext[], _sinittext[], _einittext[];
40 36
41void __iomem *
42ioremap(phys_addr_t addr, unsigned long size)
43{
44 pgprot_t prot = pgprot_noncached(PAGE_KERNEL);
45
46 return __ioremap_caller(addr, size, prot, __builtin_return_address(0));
47}
48EXPORT_SYMBOL(ioremap);
49
50void __iomem *
51ioremap_wc(phys_addr_t addr, unsigned long size)
52{
53 pgprot_t prot = pgprot_noncached_wc(PAGE_KERNEL);
54
55 return __ioremap_caller(addr, size, prot, __builtin_return_address(0));
56}
57EXPORT_SYMBOL(ioremap_wc);
58
59void __iomem *
60ioremap_wt(phys_addr_t addr, unsigned long size)
61{
62 pgprot_t prot = pgprot_cached_wthru(PAGE_KERNEL);
63
64 return __ioremap_caller(addr, size, prot, __builtin_return_address(0));
65}
66EXPORT_SYMBOL(ioremap_wt);
67
68void __iomem *
69ioremap_coherent(phys_addr_t addr, unsigned long size)
70{
71 pgprot_t prot = pgprot_cached(PAGE_KERNEL);
72
73 return __ioremap_caller(addr, size, prot, __builtin_return_address(0));
74}
75EXPORT_SYMBOL(ioremap_coherent);
76
77void __iomem *
78ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags)
79{
80 pte_t pte = __pte(flags);
81
82 /* writeable implies dirty for kernel addresses */
83 if (pte_write(pte))
84 pte = pte_mkdirty(pte);
85
86 /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
87 pte = pte_exprotect(pte);
88 pte = pte_mkprivileged(pte);
89
90 return __ioremap_caller(addr, size, pte_pgprot(pte), __builtin_return_address(0));
91}
92EXPORT_SYMBOL(ioremap_prot);
93
94void __iomem *
95__ioremap(phys_addr_t addr, unsigned long size, unsigned long flags)
96{
97 return __ioremap_caller(addr, size, __pgprot(flags), __builtin_return_address(0));
98}
99
100void __iomem *
101__ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *caller)
102{
103 unsigned long v, i;
104 phys_addr_t p;
105 int err;
106
107 /*
108 * Choose an address to map it to.
109 * Once the vmalloc system is running, we use it.
110 * Before then, we use space going down from IOREMAP_TOP
111 * (ioremap_bot records where we're up to).
112 */
113 p = addr & PAGE_MASK;
114 size = PAGE_ALIGN(addr + size) - p;
115
116 /*
117 * If the address lies within the first 16 MB, assume it's in ISA
118 * memory space
119 */
120 if (p < 16*1024*1024)
121 p += _ISA_MEM_BASE;
122
123#ifndef CONFIG_CRASH_DUMP
124 /*
125 * Don't allow anybody to remap normal RAM that we're using.
126 * mem_init() sets high_memory so only do the check after that.
127 */
128 if (slab_is_available() && p <= virt_to_phys(high_memory - 1) &&
129 page_is_ram(__phys_to_pfn(p))) {
130 printk("__ioremap(): phys addr 0x%llx is RAM lr %ps\n",
131 (unsigned long long)p, __builtin_return_address(0));
132 return NULL;
133 }
134#endif
135
136 if (size == 0)
137 return NULL;
138
139 /*
140 * Is it already mapped? Perhaps overlapped by a previous
141 * mapping.
142 */
143 v = p_block_mapped(p);
144 if (v)
145 goto out;
146
147 if (slab_is_available()) {
148 struct vm_struct *area;
149 area = get_vm_area_caller(size, VM_IOREMAP, caller);
150 if (area == 0)
151 return NULL;
152 area->phys_addr = p;
153 v = (unsigned long) area->addr;
154 } else {
155 v = (ioremap_bot -= size);
156 }
157
158 /*
159 * Should check if it is a candidate for a BAT mapping
160 */
161
162 err = 0;
163 for (i = 0; i < size && err == 0; i += PAGE_SIZE)
164 err = map_kernel_page(v + i, p + i, prot);
165 if (err) {
166 if (slab_is_available())
167 vunmap((void *)v);
168 return NULL;
169 }
170
171out:
172 return (void __iomem *) (v + ((unsigned long)addr & ~PAGE_MASK));
173}
174EXPORT_SYMBOL(__ioremap);
175
176void iounmap(volatile void __iomem *addr)
177{
178 /*
179 * If mapped by BATs then there is nothing to do.
180 * Calling vfree() generates a benign warning.
181 */
182 if (v_block_mapped((unsigned long)addr))
183 return;
184
185 if (addr > high_memory && (unsigned long) addr < ioremap_bot)
186 vunmap((void *) (PAGE_MASK & (unsigned long)addr));
187}
188EXPORT_SYMBOL(iounmap);
189
190static void __init *early_alloc_pgtable(unsigned long size) 37static void __init *early_alloc_pgtable(unsigned long size)
191{ 38{
192 void *ptr = memblock_alloc(size, size); 39 void *ptr = memblock_alloc(size, size);
@@ -252,7 +99,7 @@ static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top)
252 map_kernel_page(v, p, ktext ? PAGE_KERNEL_TEXT : PAGE_KERNEL); 99 map_kernel_page(v, p, ktext ? PAGE_KERNEL_TEXT : PAGE_KERNEL);
253#ifdef CONFIG_PPC_BOOK3S_32 100#ifdef CONFIG_PPC_BOOK3S_32
254 if (ktext) 101 if (ktext)
255 hash_preload(&init_mm, v, false, 0x300); 102 hash_preload(&init_mm, v);
256#endif 103#endif
257 v += PAGE_SIZE; 104 v += PAGE_SIZE;
258 p += PAGE_SIZE; 105 p += PAGE_SIZE;
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 9ad59b733984..e78832dce7bb 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -1,6 +1,6 @@
1// SPDX-License-Identifier: GPL-2.0-or-later 1// SPDX-License-Identifier: GPL-2.0-or-later
2/* 2/*
3 * This file contains ioremap and related functions for 64-bit machines. 3 * This file contains pgtable related functions for 64-bit machines.
4 * 4 *
5 * Derived from arch/ppc64/mm/init.c 5 * Derived from arch/ppc64/mm/init.c
6 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) 6 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
@@ -34,7 +34,6 @@
34#include <asm/pgalloc.h> 34#include <asm/pgalloc.h>
35#include <asm/page.h> 35#include <asm/page.h>
36#include <asm/prom.h> 36#include <asm/prom.h>
37#include <asm/io.h>
38#include <asm/mmu_context.h> 37#include <asm/mmu_context.h>
39#include <asm/pgtable.h> 38#include <asm/pgtable.h>
40#include <asm/mmu.h> 39#include <asm/mmu.h>
@@ -98,208 +97,8 @@ unsigned long __pte_frag_nr;
98EXPORT_SYMBOL(__pte_frag_nr); 97EXPORT_SYMBOL(__pte_frag_nr);
99unsigned long __pte_frag_size_shift; 98unsigned long __pte_frag_size_shift;
100EXPORT_SYMBOL(__pte_frag_size_shift); 99EXPORT_SYMBOL(__pte_frag_size_shift);
101unsigned long ioremap_bot;
102#else /* !CONFIG_PPC_BOOK3S_64 */
103unsigned long ioremap_bot = IOREMAP_BASE;
104#endif 100#endif
105 101
106int __weak ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, pgprot_t prot, int nid)
107{
108 unsigned long i;
109
110 for (i = 0; i < size; i += PAGE_SIZE) {
111 int err = map_kernel_page(ea + i, pa + i, prot);
112 if (err) {
113 if (slab_is_available())
114 unmap_kernel_range(ea, size);
115 else
116 WARN_ON_ONCE(1); /* Should clean up */
117 return err;
118 }
119 }
120
121 return 0;
122}
123
124/**
125 * __ioremap_at - Low level function to establish the page tables
126 * for an IO mapping
127 */
128void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_t prot)
129{
130 /* We don't support the 4K PFN hack with ioremap */
131 if (pgprot_val(prot) & H_PAGE_4K_PFN)
132 return NULL;
133
134 if ((ea + size) >= (void *)IOREMAP_END) {
135 pr_warn("Outside the supported range\n");
136 return NULL;
137 }
138
139 WARN_ON(pa & ~PAGE_MASK);
140 WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
141 WARN_ON(size & ~PAGE_MASK);
142
143 if (ioremap_range((unsigned long)ea, pa, size, prot, NUMA_NO_NODE))
144 return NULL;
145
146 return (void __iomem *)ea;
147}
148
149/**
150 * __iounmap_from - Low level function to tear down the page tables
151 * for an IO mapping. This is used for mappings that
152 * are manipulated manually, like partial unmapping of
153 * PCI IOs or ISA space.
154 */
155void __iounmap_at(void *ea, unsigned long size)
156{
157 WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
158 WARN_ON(size & ~PAGE_MASK);
159
160 unmap_kernel_range((unsigned long)ea, size);
161}
162
163void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size,
164 pgprot_t prot, void *caller)
165{
166 phys_addr_t paligned;
167 void __iomem *ret;
168
169 /*
170 * Choose an address to map it to.
171 * Once the imalloc system is running, we use it.
172 * Before that, we map using addresses going
173 * up from ioremap_bot. imalloc will use
174 * the addresses from ioremap_bot through
175 * IMALLOC_END
176 *
177 */
178 paligned = addr & PAGE_MASK;
179 size = PAGE_ALIGN(addr + size) - paligned;
180
181 if ((size == 0) || (paligned == 0))
182 return NULL;
183
184 if (slab_is_available()) {
185 struct vm_struct *area;
186
187 area = __get_vm_area_caller(size, VM_IOREMAP,
188 ioremap_bot, IOREMAP_END,
189 caller);
190 if (area == NULL)
191 return NULL;
192
193 area->phys_addr = paligned;
194 ret = __ioremap_at(paligned, area->addr, size, prot);
195 } else {
196 ret = __ioremap_at(paligned, (void *)ioremap_bot, size, prot);
197 if (ret)
198 ioremap_bot += size;
199 }
200
201 if (ret)
202 ret += addr & ~PAGE_MASK;
203 return ret;
204}
205
206void __iomem * __ioremap(phys_addr_t addr, unsigned long size,
207 unsigned long flags)
208{
209 return __ioremap_caller(addr, size, __pgprot(flags), __builtin_return_address(0));
210}
211
212void __iomem * ioremap(phys_addr_t addr, unsigned long size)
213{
214 pgprot_t prot = pgprot_noncached(PAGE_KERNEL);
215 void *caller = __builtin_return_address(0);
216
217 if (ppc_md.ioremap)
218 return ppc_md.ioremap(addr, size, prot, caller);
219 return __ioremap_caller(addr, size, prot, caller);
220}
221
222void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size)
223{
224 pgprot_t prot = pgprot_noncached_wc(PAGE_KERNEL);
225 void *caller = __builtin_return_address(0);
226
227 if (ppc_md.ioremap)
228 return ppc_md.ioremap(addr, size, prot, caller);
229 return __ioremap_caller(addr, size, prot, caller);
230}
231
232void __iomem *ioremap_coherent(phys_addr_t addr, unsigned long size)
233{
234 pgprot_t prot = pgprot_cached(PAGE_KERNEL);
235 void *caller = __builtin_return_address(0);
236
237 if (ppc_md.ioremap)
238 return ppc_md.ioremap(addr, size, prot, caller);
239 return __ioremap_caller(addr, size, prot, caller);
240}
241
242void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size,
243 unsigned long flags)
244{
245 pte_t pte = __pte(flags);
246 void *caller = __builtin_return_address(0);
247
248 /* writeable implies dirty for kernel addresses */
249 if (pte_write(pte))
250 pte = pte_mkdirty(pte);
251
252 /* we don't want to let _PAGE_EXEC leak out */
253 pte = pte_exprotect(pte);
254 /*
255 * Force kernel mapping.
256 */
257 pte = pte_mkprivileged(pte);
258
259 if (ppc_md.ioremap)
260 return ppc_md.ioremap(addr, size, pte_pgprot(pte), caller);
261 return __ioremap_caller(addr, size, pte_pgprot(pte), caller);
262}
263
264
265/*
266 * Unmap an IO region and remove it from imalloc'd list.
267 * Access to IO memory should be serialized by driver.
268 */
269void __iounmap(volatile void __iomem *token)
270{
271 void *addr;
272
273 if (!slab_is_available())
274 return;
275
276 addr = (void *) ((unsigned long __force)
277 PCI_FIX_ADDR(token) & PAGE_MASK);
278 if ((unsigned long)addr < ioremap_bot) {
279 printk(KERN_WARNING "Attempt to iounmap early bolted mapping"
280 " at 0x%p\n", addr);
281 return;
282 }
283 vunmap(addr);
284}
285
286void iounmap(volatile void __iomem *token)
287{
288 if (ppc_md.iounmap)
289 ppc_md.iounmap(token);
290 else
291 __iounmap(token);
292}
293
294EXPORT_SYMBOL(ioremap);
295EXPORT_SYMBOL(ioremap_wc);
296EXPORT_SYMBOL(ioremap_prot);
297EXPORT_SYMBOL(__ioremap);
298EXPORT_SYMBOL(__ioremap_at);
299EXPORT_SYMBOL(iounmap);
300EXPORT_SYMBOL(__iounmap);
301EXPORT_SYMBOL(__iounmap_at);
302
303#ifndef __PAGETABLE_PUD_FOLDED
304/* 4 level page table */
305struct page *pgd_page(pgd_t pgd)
diff --git a/arch/powerpc/mm/ptdump/bats.c b/arch/powerpc/mm/ptdump/bats.c
index a0d23e96e841..4154feac1da3 100644
--- a/arch/powerpc/mm/ptdump/bats.c
+++ b/arch/powerpc/mm/ptdump/bats.c
@@ -149,7 +149,7 @@ static int bats_show_603(struct seq_file *m, void *v)
 
 static int bats_open(struct inode *inode, struct file *file)
 {
-	if (cpu_has_feature(CPU_FTR_601))
+	if (IS_ENABLED(CONFIG_PPC_BOOK3S_601))
 		return single_open(file, bats_show_601, NULL);
 
 	return single_open(file, bats_show_603, NULL);
diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c
index 72f0e4a3d839..a07278027c6f 100644
--- a/arch/powerpc/mm/ptdump/hashpagetable.c
+++ b/arch/powerpc/mm/ptdump/hashpagetable.c
@@ -237,7 +237,6 @@ static int native_find(unsigned long ea, int psize, bool primary, u64 *v, u64
 	return -1;
 }
 
-#ifdef CONFIG_PPC_PSERIES
 static int pseries_find(unsigned long ea, int psize, bool primary, u64 *v, u64 *r)
 {
 	struct hash_pte ptes[4];
@@ -274,7 +273,6 @@ static int pseries_find(unsigned long ea, int psize, bool primary, u64 *v, u64 *
 	}
 	return -1;
 }
-#endif
 
 static void decode_r(int bps, unsigned long r, unsigned long *rpn, int *aps,
 		     unsigned long *lp_bits)
@@ -316,10 +314,9 @@ static void decode_r(int bps, unsigned long r, unsigned long *rpn, int *aps,
 static int base_hpte_find(unsigned long ea, int psize, bool primary, u64 *v,
 			  u64 *r)
 {
-#ifdef CONFIG_PPC_PSERIES
-	if (firmware_has_feature(FW_FEATURE_LPAR))
+	if (IS_ENABLED(CONFIG_PPC_PSERIES) && firmware_has_feature(FW_FEATURE_LPAR))
 		return pseries_find(ea, psize, primary, v, r);
-#endif
+
 	return native_find(ea, psize, primary, v, r);
 }
 
@@ -386,12 +383,13 @@ static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start)
 			psize = mmu_vmalloc_psize;
 		else
 			psize = mmu_io_psize;
-#ifdef CONFIG_PPC_64K_PAGES
+
 		/* check for secret 4K mappings */
-		if (((pteval & H_PAGE_COMBO) == H_PAGE_COMBO) ||
-		    ((pteval & H_PAGE_4K_PFN) == H_PAGE_4K_PFN))
+		if (IS_ENABLED(CONFIG_PPC_64K_PAGES) &&
+		    ((pteval & H_PAGE_COMBO) == H_PAGE_COMBO ||
+		     (pteval & H_PAGE_4K_PFN) == H_PAGE_4K_PFN))
 			psize = mmu_io_psize;
-#endif
+
 		/* check for hashpte */
 		status = hpte_find(st, addr, psize);
 
@@ -469,9 +467,10 @@ static void walk_linearmapping(struct pg_state *st)
 
 static void walk_vmemmap(struct pg_state *st)
 {
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
 	struct vmemmap_backing *ptr = vmemmap_list;
 
+	if (!IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP))
+		return;
 	/*
 	 * Traverse the vmemmaped memory and dump pages that are in the hash
 	 * pagetable.
@@ -481,7 +480,6 @@ static void walk_vmemmap(struct pg_state *st)
 		ptr = ptr->list;
 	}
 	seq_puts(st->seq, "---[ vmemmap end ]---\n");
-#endif
 }
 
 static void populate_markers(void)
@@ -495,11 +493,7 @@ static void populate_markers(void)
 	address_markers[6].start_address = PHB_IO_END;
 	address_markers[7].start_address = IOREMAP_BASE;
 	address_markers[8].start_address = IOREMAP_END;
-#ifdef CONFIG_PPC_BOOK3S_64
 	address_markers[9].start_address = H_VMEMMAP_START;
-#else
-	address_markers[9].start_address = VMEMMAP_BASE;
-#endif
 }
 
 static int ptdump_show(struct seq_file *m, void *v)
diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index 6a88a9f585d4..2f9ddc29c535 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -26,10 +26,6 @@
 
 #include "ptdump.h"
 
-#ifdef CONFIG_PPC32
-#define KERN_VIRT_START PAGE_OFFSET
-#endif
-
 /*
  * To visualise what is happening,
  *
@@ -88,10 +84,6 @@ static struct addr_marker address_markers[] = {
 #else
 	{ 0, "Early I/O remap start" },
 	{ 0, "Early I/O remap end" },
-#ifdef CONFIG_NOT_COHERENT_CACHE
-	{ 0, "Consistent mem start" },
-	{ 0, "Consistent mem end" },
-#endif
 #ifdef CONFIG_HIGHMEM
 	{ 0, "Highmem PTEs start" },
 	{ 0, "Highmem PTEs end" },
@@ -181,7 +173,7 @@ static void dump_addr(struct pg_state *st, unsigned long addr)
 
 static void note_prot_wx(struct pg_state *st, unsigned long addr)
 {
-	if (!st->check_wx)
+	if (!IS_ENABLED(CONFIG_PPC_DEBUG_WX) || !st->check_wx)
 		return;
 
 	if (!((st->current_flags & pgprot_val(PAGE_KERNEL_X)) == pgprot_val(PAGE_KERNEL_X)))
@@ -299,17 +291,15 @@ static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
 
 static void walk_pagetables(struct pg_state *st)
 {
-	pgd_t *pgd = pgd_offset_k(0UL);
 	unsigned int i;
-	unsigned long addr;
-
-	addr = st->start_address;
+	unsigned long addr = st->start_address & PGDIR_MASK;
+	pgd_t *pgd = pgd_offset_k(addr);
 
 	/*
 	 * Traverse the linux pagetable structure and dump pages that are in
 	 * the hash pagetable.
 	 */
-	for (i = 0; i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) {
+	for (i = pgd_index(addr); i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) {
 		if (!pgd_none(*pgd) && !pgd_is_leaf(*pgd))
 			/* pgd exists */
 			walk_pud(st, pgd, addr);
@@ -341,11 +331,6 @@ static void populate_markers(void)
 #else /* !CONFIG_PPC64 */
 	address_markers[i++].start_address = ioremap_bot;
 	address_markers[i++].start_address = IOREMAP_TOP;
-#ifdef CONFIG_NOT_COHERENT_CACHE
-	address_markers[i++].start_address = IOREMAP_TOP;
-	address_markers[i++].start_address = IOREMAP_TOP +
-					     CONFIG_CONSISTENT_SIZE;
-#endif
 #ifdef CONFIG_HIGHMEM
 	address_markers[i++].start_address = PKMAP_BASE;
 	address_markers[i++].start_address = PKMAP_ADDR(LAST_PKMAP);
@@ -364,12 +349,13 @@ static int ptdump_show(struct seq_file *m, void *v)
 	struct pg_state st = {
 		.seq = m,
 		.marker = address_markers,
+		.start_address = PAGE_OFFSET,
 	};
 
-	if (radix_enabled())
-		st.start_address = PAGE_OFFSET;
-	else
+#ifdef CONFIG_PPC64
+	if (!radix_enabled())
 		st.start_address = KERN_VIRT_START;
+#endif
 
 	/* Traverse kernel page tables */
 	walk_pagetables(&st);
@@ -407,12 +393,13 @@ void ptdump_check_wx(void)
 		.seq = NULL,
 		.marker = address_markers,
 		.check_wx = true,
+		.start_address = PAGE_OFFSET,
 	};
 
-	if (radix_enabled())
-		st.start_address = PAGE_OFFSET;
-	else
+#ifdef CONFIG_PPC64
+	if (!radix_enabled())
 		st.start_address = KERN_VIRT_START;
+#endif
 
 	walk_pagetables(&st);
 
diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index dea243185ea4..cb50a9e1fd2d 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -577,6 +577,7 @@ static int core_imc_mem_init(int cpu, int size)
 {
 	int nid, rc = 0, core_id = (cpu / threads_per_core);
 	struct imc_mem_info *mem_info;
+	struct page *page;
 
 	/*
 	 * alloc_pages_node() will allocate memory for core in the
@@ -587,11 +588,12 @@ static int core_imc_mem_init(int cpu, int size)
 	mem_info->id = core_id;
 
 	/* We need only vbase for core counters */
-	mem_info->vbase = page_address(alloc_pages_node(nid,
-			  GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE |
-			  __GFP_NOWARN, get_order(size)));
-	if (!mem_info->vbase)
+	page = alloc_pages_node(nid,
+			  GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE |
+			  __GFP_NOWARN, get_order(size));
+	if (!page)
 		return -ENOMEM;
+	mem_info->vbase = page_address(page);
 
 	/* Init the mutex */
 	core_imc_refc[core_id].id = core_id;
@@ -849,15 +851,17 @@ static int thread_imc_mem_alloc(int cpu_id, int size)
 	int nid = cpu_to_node(cpu_id);
 
 	if (!local_mem) {
+		struct page *page;
 		/*
 		 * This case could happen only once at start, since we dont
 		 * free the memory in cpu offline path.
 		 */
-		local_mem = page_address(alloc_pages_node(nid,
-				GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE |
-				__GFP_NOWARN, get_order(size)));
-		if (!local_mem)
+		page = alloc_pages_node(nid,
+				GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE |
+				__GFP_NOWARN, get_order(size));
+		if (!page)
 			return -ENOMEM;
+		local_mem = page_address(page);
 
 		per_cpu(thread_imc_mem, cpu_id) = local_mem;
 	}
@@ -1095,11 +1099,14 @@ static int trace_imc_mem_alloc(int cpu_id, int size)
 	int core_id = (cpu_id / threads_per_core);
 
 	if (!local_mem) {
-		local_mem = page_address(alloc_pages_node(phys_id,
-				GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE |
-				__GFP_NOWARN, get_order(size)));
-		if (!local_mem)
+		struct page *page;
+
+		page = alloc_pages_node(phys_id,
+				GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE |
+				__GFP_NOWARN, get_order(size));
+		if (!page)
 			return -ENOMEM;
+		local_mem = page_address(page);
 		per_cpu(trace_imc_mem, cpu_id) = local_mem;
 
 		/* Initialise the counters for trace mode */
diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms/44x/Kconfig
index b369ed4e3675..25ebe634a661 100644
--- a/arch/powerpc/platforms/44x/Kconfig
+++ b/arch/powerpc/platforms/44x/Kconfig
@@ -272,14 +272,6 @@ config PPC4xx_GPIO
 	help
 	  Enable gpiolib support for ppc440 based boards
 
-config PPC4xx_OCM
-	bool "PPC4xx On Chip Memory (OCM) support"
-	depends on 4xx
-	select PPC_LIB_RHEAP
-	help
-	  Enable OCM support for PowerPC 4xx platforms with on chip memory,
-	  OCM provides the fast place for memory access to improve performance.
-
 # 44x specific CPU modules, selected based on the board above.
 config 440EP
 	bool
diff --git a/arch/powerpc/platforms/4xx/Makefile b/arch/powerpc/platforms/4xx/Makefile
index f5ae27ca131b..d009d2e0b9e8 100644
--- a/arch/powerpc/platforms/4xx/Makefile
+++ b/arch/powerpc/platforms/4xx/Makefile
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-y += uic.o machine_check.o
-obj-$(CONFIG_PPC4xx_OCM) += ocm.o
 obj-$(CONFIG_4xx_SOC) += soc.o
 obj-$(CONFIG_PCI) += pci.o
 obj-$(CONFIG_PPC4xx_HSTA_MSI) += hsta_msi.o
diff --git a/arch/powerpc/platforms/4xx/ocm.c b/arch/powerpc/platforms/4xx/ocm.c
deleted file mode 100644
index ba3257406ced..000000000000
--- a/arch/powerpc/platforms/4xx/ocm.c
+++ /dev/null
@@ -1,390 +0,0 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * PowerPC 4xx OCM memory allocation support
4 *
5 * (C) Copyright 2009, Applied Micro Circuits Corporation
6 * Victor Gallardo (vgallardo@amcc.com)
7 *
8 * See file CREDITS for list of people who contributed to this
9 * project.
10 */
11
12#include <linux/kernel.h>
13#include <linux/dma-mapping.h>
14#include <linux/of.h>
15#include <linux/of_address.h>
16#include <asm/rheap.h>
17#include <asm/ppc4xx_ocm.h>
18#include <linux/slab.h>
19#include <linux/debugfs.h>
20
21#define OCM_DISABLED 0
22#define OCM_ENABLED 1
23
24struct ocm_block {
25 struct list_head list;
26 void __iomem *addr;
27 int size;
28 const char *owner;
29};
30
31/* non-cached or cached region */
32struct ocm_region {
33 phys_addr_t phys;
34 void __iomem *virt;
35
36 int memtotal;
37 int memfree;
38
39 rh_info_t *rh;
40 struct list_head list;
41};
42
43struct ocm_info {
44 int index;
45 int status;
46 int ready;
47
48 phys_addr_t phys;
49
50 int alignment;
51 int memtotal;
52 int cache_size;
53
54 struct ocm_region nc; /* non-cached region */
55 struct ocm_region c; /* cached region */
56};
57
58static struct ocm_info *ocm_nodes;
59static int ocm_count;
60
61static struct ocm_info *ocm_get_node(unsigned int index)
62{
63 if (index >= ocm_count) {
64 printk(KERN_ERR "PPC4XX OCM: invalid index");
65 return NULL;
66 }
67
68 return &ocm_nodes[index];
69}
70
71static int ocm_free_region(struct ocm_region *ocm_reg, const void *addr)
72{
73 struct ocm_block *blk, *tmp;
74 unsigned long offset;
75
76 if (!ocm_reg->virt)
77 return 0;
78
79 list_for_each_entry_safe(blk, tmp, &ocm_reg->list, list) {
80 if (blk->addr == addr) {
81 offset = addr - ocm_reg->virt;
82 ocm_reg->memfree += blk->size;
83 rh_free(ocm_reg->rh, offset);
84 list_del(&blk->list);
85 kfree(blk);
86 return 1;
87 }
88 }
89
90 return 0;
91}
92
93static void __init ocm_init_node(int count, struct device_node *node)
94{
95 struct ocm_info *ocm;
96
97 const unsigned int *cell_index;
98 const unsigned int *cache_size;
99 int len;
100
101 struct resource rsrc;
102
103 ocm = ocm_get_node(count);
104
105 cell_index = of_get_property(node, "cell-index", &len);
106 if (!cell_index) {
107 printk(KERN_ERR "PPC4XX OCM: missing cell-index property");
108 return;
109 }
110 ocm->index = *cell_index;
111
112 if (of_device_is_available(node))
113 ocm->status = OCM_ENABLED;
114
115 cache_size = of_get_property(node, "cached-region-size", &len);
116 if (cache_size)
117 ocm->cache_size = *cache_size;
118
119 if (of_address_to_resource(node, 0, &rsrc)) {
120 printk(KERN_ERR "PPC4XX OCM%d: could not get resource address\n",
121 ocm->index);
122 return;
123 }
124
125 ocm->phys = rsrc.start;
126 ocm->memtotal = (rsrc.end - rsrc.start + 1);
127
128 printk(KERN_INFO "PPC4XX OCM%d: %d Bytes (%s)\n",
129 ocm->index, ocm->memtotal,
130 (ocm->status == OCM_DISABLED) ? "disabled" : "enabled");
131
132 if (ocm->status == OCM_DISABLED)
133 return;
134
135 /* request region */
136
137 if (!request_mem_region(ocm->phys, ocm->memtotal, "ppc4xx_ocm")) {
138 printk(KERN_ERR "PPC4XX OCM%d: could not request region\n",
139 ocm->index);
140 return;
141 }
142
143 /* Configure non-cached and cached regions */
144
145 ocm->nc.phys = ocm->phys;
146 ocm->nc.memtotal = ocm->memtotal - ocm->cache_size;
147 ocm->nc.memfree = ocm->nc.memtotal;
148
149 ocm->c.phys = ocm->phys + ocm->nc.memtotal;
150 ocm->c.memtotal = ocm->cache_size;
151 ocm->c.memfree = ocm->c.memtotal;
152
153 if (ocm->nc.memtotal == 0)
154 ocm->nc.phys = 0;
155
156 if (ocm->c.memtotal == 0)
157 ocm->c.phys = 0;
158
159 printk(KERN_INFO "PPC4XX OCM%d: %d Bytes (non-cached)\n",
160 ocm->index, ocm->nc.memtotal);
161
162 printk(KERN_INFO "PPC4XX OCM%d: %d Bytes (cached)\n",
163 ocm->index, ocm->c.memtotal);
164
165 /* ioremap the non-cached region */
166 if (ocm->nc.memtotal) {
167 ocm->nc.virt = __ioremap(ocm->nc.phys, ocm->nc.memtotal,
168 _PAGE_EXEC | pgprot_val(PAGE_KERNEL_NCG));
169
170 if (!ocm->nc.virt) {
171 printk(KERN_ERR
172 "PPC4XX OCM%d: failed to ioremap non-cached memory\n",
173 ocm->index);
174 ocm->nc.memfree = 0;
175 return;
176 }
177 }
178
179 /* ioremap the cached region */
180
181 if (ocm->c.memtotal) {
182 ocm->c.virt = __ioremap(ocm->c.phys, ocm->c.memtotal,
183 _PAGE_EXEC | pgprot_val(PAGE_KERNEL));
184
185 if (!ocm->c.virt) {
186 printk(KERN_ERR
187 "PPC4XX OCM%d: failed to ioremap cached memory\n",
188 ocm->index);
189 ocm->c.memfree = 0;
190 return;
191 }
192 }
193
194 /* Create Remote Heaps */
195
196 ocm->alignment = 4; /* default 4 byte alignment */
197
198 if (ocm->nc.virt) {
199 ocm->nc.rh = rh_create(ocm->alignment);
200 rh_attach_region(ocm->nc.rh, 0, ocm->nc.memtotal);
201 }
202
203 if (ocm->c.virt) {
204 ocm->c.rh = rh_create(ocm->alignment);
205 rh_attach_region(ocm->c.rh, 0, ocm->c.memtotal);
206 }
207
208 INIT_LIST_HEAD(&ocm->nc.list);
209 INIT_LIST_HEAD(&ocm->c.list);
210
211 ocm->ready = 1;
212}
213
214static int ocm_debugfs_show(struct seq_file *m, void *v)
215{
216 struct ocm_block *blk, *tmp;
217 unsigned int i;
218
219 for (i = 0; i < ocm_count; i++) {
220 struct ocm_info *ocm = ocm_get_node(i);
221
222 if (!ocm || !ocm->ready)
223 continue;
224
225 seq_printf(m, "PPC4XX OCM : %d\n", ocm->index);
226 seq_printf(m, "PhysAddr : %pa\n", &(ocm->phys));
227 seq_printf(m, "MemTotal : %d Bytes\n", ocm->memtotal);
228 seq_printf(m, "MemTotal(NC) : %d Bytes\n", ocm->nc.memtotal);
229 seq_printf(m, "MemTotal(C) : %d Bytes\n\n", ocm->c.memtotal);
230
231 seq_printf(m, "NC.PhysAddr : %pa\n", &(ocm->nc.phys));
232 seq_printf(m, "NC.VirtAddr : 0x%p\n", ocm->nc.virt);
233 seq_printf(m, "NC.MemTotal : %d Bytes\n", ocm->nc.memtotal);
234 seq_printf(m, "NC.MemFree : %d Bytes\n", ocm->nc.memfree);
235
236 list_for_each_entry_safe(blk, tmp, &ocm->nc.list, list) {
237 seq_printf(m, "NC.MemUsed : %d Bytes (%s)\n",
238 blk->size, blk->owner);
239 }
240
241 seq_printf(m, "\nC.PhysAddr : %pa\n", &(ocm->c.phys));
242 seq_printf(m, "C.VirtAddr : 0x%p\n", ocm->c.virt);
243 seq_printf(m, "C.MemTotal : %d Bytes\n", ocm->c.memtotal);
244 seq_printf(m, "C.MemFree : %d Bytes\n", ocm->c.memfree);
245
246 list_for_each_entry_safe(blk, tmp, &ocm->c.list, list) {
247 seq_printf(m, "C.MemUsed : %d Bytes (%s)\n",
248 blk->size, blk->owner);
249 }
250
251 seq_putc(m, '\n');
252 }
253
254 return 0;
255}
256
257static int ocm_debugfs_open(struct inode *inode, struct file *file)
258{
259 return single_open(file, ocm_debugfs_show, NULL);
260}
261
262static const struct file_operations ocm_debugfs_fops = {
263 .open = ocm_debugfs_open,
264 .read = seq_read,
265 .llseek = seq_lseek,
266 .release = single_release,
267};
268
269static int ocm_debugfs_init(void)
270{
271 struct dentry *junk;
272
273 junk = debugfs_create_dir("ppc4xx_ocm", 0);
274 if (!junk) {
275 printk(KERN_ALERT "debugfs ppc4xx ocm: failed to create dir\n");
276 return -1;
277 }
278
279 if (debugfs_create_file("info", 0644, junk, NULL, &ocm_debugfs_fops)) {
280 printk(KERN_ALERT "debugfs ppc4xx ocm: failed to create file\n");
281 return -1;
282 }
283
284 return 0;
285}
286
287void *ppc4xx_ocm_alloc(phys_addr_t *phys, int size, int align,
288 int flags, const char *owner)
289{
290 void __iomem *addr = NULL;
291 unsigned long offset;
292 struct ocm_info *ocm;
293 struct ocm_region *ocm_reg;
294 struct ocm_block *ocm_blk;
295 int i;
296
297 for (i = 0; i < ocm_count; i++) {
298 ocm = ocm_get_node(i);
299
300 if (!ocm || !ocm->ready)
301 continue;
302
303 if (flags == PPC4XX_OCM_NON_CACHED)
304 ocm_reg = &ocm->nc;
305 else
306 ocm_reg = &ocm->c;
307
308 if (!ocm_reg->virt)
309 continue;
310
311 if (align < ocm->alignment)
312 align = ocm->alignment;
313
314 offset = rh_alloc_align(ocm_reg->rh, size, align, NULL);
315
316 if (IS_ERR_VALUE(offset))
317 continue;
318
319 ocm_blk = kzalloc(sizeof(*ocm_blk), GFP_KERNEL);
320 if (!ocm_blk) {
321 rh_free(ocm_reg->rh, offset);
322 break;
323 }
324
325 *phys = ocm_reg->phys + offset;
326 addr = ocm_reg->virt + offset;
327 size = ALIGN(size, align);
328
329 ocm_blk->addr = addr;
330 ocm_blk->size = size;
331 ocm_blk->owner = owner;
332 list_add_tail(&ocm_blk->list, &ocm_reg->list);
333
334 ocm_reg->memfree -= size;
335
336 break;
337 }
338
339 return addr;
340}
341
342void ppc4xx_ocm_free(const void *addr)
343{
344 int i;
345
346 if (!addr)
347 return;
348
349 for (i = 0; i < ocm_count; i++) {
350 struct ocm_info *ocm = ocm_get_node(i);
351
352 if (!ocm || !ocm->ready)
353 continue;
354
355 if (ocm_free_region(&ocm->nc, addr) ||
356 ocm_free_region(&ocm->c, addr))
357 return;
358 }
359}
360
361static int __init ppc4xx_ocm_init(void)
362{
363 struct device_node *np;
364 int count;
365
366 count = 0;
367 for_each_compatible_node(np, NULL, "ibm,ocm")
368 count++;
369
370 if (!count)
371 return 0;
372
373 ocm_nodes = kzalloc((count * sizeof(struct ocm_info)), GFP_KERNEL);
374 if (!ocm_nodes)
375 return -ENOMEM;
376
377 ocm_count = count;
378 count = 0;
379
380 for_each_compatible_node(np, NULL, "ibm,ocm") {
381 ocm_init_node(count, np);
382 count++;
383 }
384
385 ocm_debugfs_init();
386
387 return 0;
388}
389
390arch_initcall(ppc4xx_ocm_init);
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index f3fb79fccc72..d82e3664ffdf 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -197,7 +197,8 @@ endmenu
 
 config PPC601_SYNC_FIX
 	bool "Workarounds for PPC601 bugs"
-	depends on PPC_BOOK3S_32 && PPC_PMAC
+	depends on PPC_BOOK3S_601 && PPC_PMAC
+	default y
 	help
 	  Some versions of the PPC601 (the first PowerPC chip) have bugs which
 	  mean that extra synchronization instructions are required near
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 56a7c814160d..12543e53fa96 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -6,6 +6,9 @@ config PPC64
 	  This option selects whether a 32-bit or a 64-bit kernel
 	  will be built.
 
+config PPC_BOOK3S_32
+	bool
+
 menu "Processor support"
 choice
 	prompt "Processor Type"
@@ -21,13 +24,20 @@ choice
 
 	  If unsure, select 52xx/6xx/7xx/74xx/82xx/83xx/86xx.
 
-config PPC_BOOK3S_32
-	bool "512x/52xx/6xx/7xx/74xx/82xx/83xx/86xx"
+config PPC_BOOK3S_6xx
+	bool "512x/52xx/6xx/7xx/74xx/82xx/83xx/86xx except 601"
+	select PPC_BOOK3S_32
 	select PPC_FPU
 	select PPC_HAVE_PMU_SUPPORT
 	select PPC_HAVE_KUEP
 	select PPC_HAVE_KUAP
 
+config PPC_BOOK3S_601
+	bool "PowerPC 601"
+	select PPC_BOOK3S_32
+	select PPC_FPU
+	select PPC_HAVE_KUAP
+
 config PPC_85xx
 	bool "Freescale 85xx"
 	select E500
@@ -450,8 +460,10 @@ config NOT_COHERENT_CACHE
 	depends on 4xx || PPC_8xx || E200 || PPC_MPC512x || \
 		   GAMECUBE_COMMON || AMIGAONE
 	select ARCH_HAS_DMA_COHERENT_TO_PFN
+	select ARCH_HAS_DMA_PREP_COHERENT
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
 	select ARCH_HAS_SYNC_DMA_FOR_CPU
+	select DMA_DIRECT_REMAP
 	default n if PPC_47x
 	default y
 
diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c
index 16dfee29aa41..ca9ffc1c8685 100644
--- a/arch/powerpc/platforms/cell/iommu.c
+++ b/arch/powerpc/platforms/cell/iommu.c
@@ -486,7 +486,7 @@ cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np,
 	window->table.it_size = size >> window->table.it_page_shift;
 	window->table.it_ops = &cell_iommu_ops;
 
-	iommu_init_table(&window->table, iommu->nid);
+	iommu_init_table(&window->table, iommu->nid, 0, 0);
 
 	pr_debug("\tioid %d\n", window->ioid);
 	pr_debug("\tblocksize %ld\n", window->table.it_blocksize);
diff --git a/arch/powerpc/platforms/pasemi/iommu.c b/arch/powerpc/platforms/pasemi/iommu.c
index 77fee09104f8..b500a6e47e6b 100644
--- a/arch/powerpc/platforms/pasemi/iommu.c
+++ b/arch/powerpc/platforms/pasemi/iommu.c
@@ -146,7 +146,7 @@ static void iommu_table_iobmap_setup(void)
 	 */
 	iommu_table_iobmap.it_blocksize = 4;
 	iommu_table_iobmap.it_ops = &iommu_table_iobmap_ops;
-	iommu_init_table(&iommu_table_iobmap, 0);
+	iommu_init_table(&iommu_table_iobmap, 0, 0, 0);
 	pr_debug(" <- %s\n", __func__);
 }
 
diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
index 850eee860cf2..938803eab0ad 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -12,7 +12,6 @@ config PPC_POWERNV
 	select EPAPR_BOOT
 	select PPC_INDIRECT_PIO
 	select PPC_UDBG_16550
-	select PPC_SCOM
 	select ARCH_RANDOM
 	select CPU_FREQ
 	select PPC_DOORBELL
@@ -47,3 +46,7 @@ config PPC_VAS
 	  VAS adapters are found in POWER9 based systems.
 
 	  If unsure, say N.
+
+config SCOM_DEBUGFS
+	bool "Expose SCOM controllers via debugfs"
+	depends on DEBUG_FS
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index da2e99efbd04..a3ac9646119d 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -4,15 +4,19 @@ obj-y += idle.o opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o
 obj-y += rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o
 obj-y += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o
 obj-y += opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o
+obj-y += ultravisor.o
 
 obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o
+obj-$(CONFIG_FA_DUMP) += opal-fadump.o
+obj-$(CONFIG_PRESERVE_FA_DUMP) += opal-fadump.o
+obj-$(CONFIG_OPAL_CORE) += opal-core.o
 obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o
 obj-$(CONFIG_CXL_BASE) += pci-cxl.o
 obj-$(CONFIG_EEH) += eeh-powernv.o
-obj-$(CONFIG_PPC_SCOM) += opal-xscom.o
 obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o
 obj-$(CONFIG_OPAL_PRD) += opal-prd.o
 obj-$(CONFIG_PERF_EVENTS) += opal-imc.o
 obj-$(CONFIG_PPC_MEMTRACE) += memtrace.o
 obj-$(CONFIG_PPC_VAS) += vas.o vas-window.o vas-debug.o
 obj-$(CONFIG_OCXL_BASE) += ocxl.o
+obj-$(CONFIG_SCOM_DEBUGFS) += opal-xscom.o
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 620a986209f5..6bc24a47e9ef 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -34,6 +34,7 @@
34 34
35#include "powernv.h" 35#include "powernv.h"
36#include "pci.h" 36#include "pci.h"
37#include "../../../../drivers/pci/pci.h"
37 38
38static int eeh_event_irq = -EINVAL; 39static int eeh_event_irq = -EINVAL;
39 40
@@ -41,13 +42,10 @@ void pnv_pcibios_bus_add_device(struct pci_dev *pdev)
41{ 42{
42 struct pci_dn *pdn = pci_get_pdn(pdev); 43 struct pci_dn *pdn = pci_get_pdn(pdev);
43 44
44 if (!pdev->is_virtfn) 45 if (eeh_has_flag(EEH_FORCE_DISABLED))
45 return; 46 return;
46 47
47 /* 48 dev_dbg(&pdev->dev, "EEH: Setting up device\n");
48 * The following operations will fail if VF's sysfs files
49 * aren't created or its resources aren't finalized.
50 */
51 eeh_add_device_early(pdn); 49 eeh_add_device_early(pdn);
52 eeh_add_device_late(pdev); 50 eeh_add_device_late(pdev);
53 eeh_sysfs_add_device(pdev); 51 eeh_sysfs_add_device(pdev);
@@ -199,6 +197,25 @@ PNV_EEH_DBGFS_ENTRY(inbB, 0xE10);
199 197
200#endif /* CONFIG_DEBUG_FS */ 198#endif /* CONFIG_DEBUG_FS */
201 199
200void pnv_eeh_enable_phbs(void)
201{
202 struct pci_controller *hose;
203 struct pnv_phb *phb;
204
205 list_for_each_entry(hose, &hose_list, list_node) {
206 phb = hose->private_data;
207 /*
208 * If EEH is enabled, we're going to rely on that.
209 * Otherwise, we restore to conventional mechanism
210 * to clear frozen PE during PCI config access.
211 */
212 if (eeh_enabled())
213 phb->flags |= PNV_PHB_FLAG_EEH;
214 else
215 phb->flags &= ~PNV_PHB_FLAG_EEH;
216 }
217}
218
202/** 219/**
203 * pnv_eeh_post_init - EEH platform dependent post initialization 220 * pnv_eeh_post_init - EEH platform dependent post initialization
204 * 221 *
@@ -213,9 +230,7 @@ int pnv_eeh_post_init(void)
213 struct pnv_phb *phb; 230 struct pnv_phb *phb;
214 int ret = 0; 231 int ret = 0;
215 232
216 /* Probe devices & build address cache */ 233 eeh_show_enabled();
217 eeh_probe_devices();
218 eeh_addr_cache_build();
219 234
220 /* Register OPAL event notifier */ 235 /* Register OPAL event notifier */
221 eeh_event_irq = opal_event_request(ilog2(OPAL_EVENT_PCI_ERROR)); 236 eeh_event_irq = opal_event_request(ilog2(OPAL_EVENT_PCI_ERROR));
@@ -237,19 +252,11 @@ int pnv_eeh_post_init(void)
237 if (!eeh_enabled()) 252 if (!eeh_enabled())
238 disable_irq(eeh_event_irq); 253 disable_irq(eeh_event_irq);
239 254
255 pnv_eeh_enable_phbs();
256
240 list_for_each_entry(hose, &hose_list, list_node) { 257 list_for_each_entry(hose, &hose_list, list_node) {
241 phb = hose->private_data; 258 phb = hose->private_data;
242 259
243 /*
244 * If EEH is enabled, we're going to rely on that.
245 * Otherwise, we restore to conventional mechanism
246 * to clear frozen PE during PCI config access.
247 */
248 if (eeh_enabled())
249 phb->flags |= PNV_PHB_FLAG_EEH;
250 else
251 phb->flags &= ~PNV_PHB_FLAG_EEH;
252
253 /* Create debugfs entries */ 260 /* Create debugfs entries */
254#ifdef CONFIG_DEBUG_FS 261#ifdef CONFIG_DEBUG_FS
255 if (phb->has_dbgfs || !phb->dbgfs) 262 if (phb->has_dbgfs || !phb->dbgfs)
@@ -377,6 +384,8 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
377 if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA) 384 if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA)
378 return NULL; 385 return NULL;
379 386
387 eeh_edev_dbg(edev, "Probing device\n");
388
380 /* Initialize eeh device */ 389 /* Initialize eeh device */
381 edev->class_code = pdn->class_code; 390 edev->class_code = pdn->class_code;
382 edev->mode &= 0xFFFFFF00; 391 edev->mode &= 0xFFFFFF00;
@@ -402,9 +411,7 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
402 /* Create PE */ 411 /* Create PE */
403 ret = eeh_add_to_parent_pe(edev); 412 ret = eeh_add_to_parent_pe(edev);
404 if (ret) { 413 if (ret) {
405 pr_warn("%s: Can't add PCI dev %04x:%02x:%02x.%01x to parent PE (%x)\n", 414 eeh_edev_warn(edev, "Failed to add device to PE (code %d)\n", ret);
406 __func__, hose->global_number, pdn->busno,
407 PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn), ret);
408 return NULL; 415 return NULL;
409 } 416 }
410 417
@@ -453,11 +460,17 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
453 * Enable EEH explicitly so that we will do EEH check 460 * Enable EEH explicitly so that we will do EEH check
454 * while accessing I/O stuff 461 * while accessing I/O stuff
455 */ 462 */
456 eeh_add_flag(EEH_ENABLED); 463 if (!eeh_has_flag(EEH_ENABLED)) {
464 enable_irq(eeh_event_irq);
465 pnv_eeh_enable_phbs();
466 eeh_add_flag(EEH_ENABLED);
467 }
457 468
458 /* Save memory bars */ 469 /* Save memory bars */
459 eeh_save_bars(edev); 470 eeh_save_bars(edev);
460 471
472 eeh_edev_dbg(edev, "EEH enabled on device\n");
473
461 return NULL; 474 return NULL;
462} 475}
463 476
@@ -837,7 +850,7 @@ static int __pnv_eeh_bridge_reset(struct pci_dev *dev, int option)
837 int aer = edev ? edev->aer_cap : 0; 850 int aer = edev ? edev->aer_cap : 0;
838 u32 ctrl; 851 u32 ctrl;
839 852
840 pr_debug("%s: Reset PCI bus %04x:%02x with option %d\n", 853 pr_debug("%s: Secondary Reset PCI bus %04x:%02x with option %d\n",
841 __func__, pci_domain_nr(dev->bus), 854 __func__, pci_domain_nr(dev->bus),
842 dev->bus->number, option); 855 dev->bus->number, option);
843 856
@@ -895,6 +908,10 @@ static int pnv_eeh_bridge_reset(struct pci_dev *pdev, int option)
895 if (!dn || !of_get_property(dn, "ibm,reset-by-firmware", NULL)) 908 if (!dn || !of_get_property(dn, "ibm,reset-by-firmware", NULL))
896 return __pnv_eeh_bridge_reset(pdev, option); 909 return __pnv_eeh_bridge_reset(pdev, option);
897 910
911 pr_debug("%s: FW reset PCI bus %04x:%02x with option %d\n",
912 __func__, pci_domain_nr(pdev->bus),
913 pdev->bus->number, option);
914
898 switch (option) { 915 switch (option) {
899 case EEH_RESET_FUNDAMENTAL: 916 case EEH_RESET_FUNDAMENTAL:
900 scope = OPAL_RESET_PCI_FUNDAMENTAL; 917 scope = OPAL_RESET_PCI_FUNDAMENTAL;
@@ -1113,17 +1130,37 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option)
1113 return -EIO; 1130 return -EIO;
1114 } 1131 }
1115 1132
1133 if (pci_is_root_bus(bus))
1134 return pnv_eeh_root_reset(hose, option);
1135
1116 /* 1136 /*
1117 * If dealing with the root bus (or the bus underneath the 1137 * For hot resets try use the generic PCI error recovery reset
1118 * root port), we reset the bus underneath the root port. 1138 * functions. These correctly handles the case where the secondary
1139 * bus is behind a hotplug slot and it will use the slot provided
1140 * reset methods to prevent spurious hotplug events during the reset.
1119 * 1141 *
1120 * The cxl driver depends on this behaviour for bi-modal card 1142 * Fundemental resets need to be handled internally to EEH since the
1121 * switching. 1143 * PCI core doesn't really have a concept of a fundemental reset,
1144 * mainly because there's no standard way to generate one. Only a
1145 * few devices require an FRESET so it should be fine.
1122 */ 1146 */
1123 if (pci_is_root_bus(bus) || 1147 if (option != EEH_RESET_FUNDAMENTAL) {
1124 pci_is_root_bus(bus->parent)) 1148 /*
1125 return pnv_eeh_root_reset(hose, option); 1149 * NB: Skiboot and pnv_eeh_bridge_reset() also no-op the
1150 * de-assert step. It's like the OPAL reset API was
1151 * poorly designed or something...
1152 */
1153 if (option == EEH_RESET_DEACTIVATE)
1154 return 0;
1126 1155
1156 rc = pci_bus_error_reset(bus->self);
1157 if (!rc)
1158 return 0;
1159 }
1160
1161 /* otherwise, use the generic bridge reset. this might call into FW */
1162 if (pci_is_root_bus(bus->parent))
1163 return pnv_eeh_root_reset(hose, option);
1127 return pnv_eeh_bridge_reset(bus->self, option); 1164 return pnv_eeh_bridge_reset(bus->self, option);
1128} 1165}
1129 1166
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 09f49eed7fb8..78599bca66c2 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -675,7 +675,8 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
 		sprs.ptcr = mfspr(SPRN_PTCR);
 		sprs.rpr = mfspr(SPRN_RPR);
 		sprs.tscr = mfspr(SPRN_TSCR);
-		sprs.ldbar = mfspr(SPRN_LDBAR);
+		if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
+			sprs.ldbar = mfspr(SPRN_LDBAR);
 
 		sprs_saved = true;
 
@@ -789,7 +790,8 @@ core_woken:
 	mtspr(SPRN_MMCR0, sprs.mmcr0);
 	mtspr(SPRN_MMCR1, sprs.mmcr1);
 	mtspr(SPRN_MMCR2, sprs.mmcr2);
-	mtspr(SPRN_LDBAR, sprs.ldbar);
+	if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
+		mtspr(SPRN_LDBAR, sprs.ldbar);
 
 	mtspr(SPRN_SPRG3, local_paca->sprg_vdso);
 
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index c16249d251f1..b95b9e3c4c98 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -89,6 +89,7 @@ struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index)
 }
 EXPORT_SYMBOL(pnv_pci_get_npu_dev);
 
+#ifdef CONFIG_IOMMU_API
 /*
  * Returns the PE assoicated with the PCI device of the given
  * NPU. Returns the linked pci device if pci_dev != NULL.
@@ -192,106 +193,6 @@ static long pnv_npu_unset_window(struct iommu_table_group *table_group, int num)
192 return 0; 193 return 0;
193} 194}
194 195
195/*
196 * Enables 32 bit DMA on NPU.
197 */
198static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe)
199{
200 struct pci_dev *gpdev;
201 struct pnv_ioda_pe *gpe;
202 int64_t rc;
203
204 /*
205 * Find the assoicated PCI devices and get the dma window
206 * information from there.
207 */
208 if (!npe->pdev || !(npe->flags & PNV_IODA_PE_DEV))
209 return;
210
211 gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
212 if (!gpe)
213 return;
214
215 rc = pnv_npu_set_window(&npe->table_group, 0,
216 gpe->table_group.tables[0]);
217
218 /*
219 * NVLink devices use the same TCE table configuration as
220 * their parent device so drivers shouldn't be doing DMA
221 * operations directly on these devices.
222 */
223 set_dma_ops(&npe->pdev->dev, &dma_dummy_ops);
224}
225
226/*
227 * Enables bypass mode on the NPU. The NPU only supports one
228 * window per link, so bypass needs to be explicitly enabled or
229 * disabled. Unlike for a PHB3 bypass and non-bypass modes can't be
230 * active at the same time.
231 */
232static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe)
233{
234 struct pnv_phb *phb = npe->phb;
235 int64_t rc = 0;
236 phys_addr_t top = memblock_end_of_DRAM();
237
238 if (phb->type != PNV_PHB_NPU_NVLINK || !npe->pdev)
239 return -EINVAL;
240
241 rc = pnv_npu_unset_window(&npe->table_group, 0);
242 if (rc != OPAL_SUCCESS)
243 return rc;
244
245 /* Enable the bypass window */
246
247 top = roundup_pow_of_two(top);
248 dev_info(&npe->pdev->dev, "Enabling bypass for PE %x\n",
249 npe->pe_number);
250 rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
251 npe->pe_number, npe->pe_number,
252 0 /* bypass base */, top);
253
254 if (rc == OPAL_SUCCESS)
255 pnv_pci_ioda2_tce_invalidate_entire(phb, false);
256
257 return rc;
258}
259
260void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass)
261{
262 int i;
263 struct pnv_phb *phb;
264 struct pci_dn *pdn;
265 struct pnv_ioda_pe *npe;
266 struct pci_dev *npdev;
267
268 for (i = 0; ; ++i) {
269 npdev = pnv_pci_get_npu_dev(gpdev, i);
270
271 if (!npdev)
272 break;
273
274 pdn = pci_get_pdn(npdev);
275 if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
276 return;
277
278 phb = pci_bus_to_host(npdev->bus)->private_data;
279
280 /* We only do bypass if it's enabled on the linked device */
281 npe = &phb->ioda.pe_array[pdn->pe_number];
282
283 if (bypass) {
284 dev_info(&npdev->dev,
285 "Using 64-bit DMA iommu bypass\n");
286 pnv_npu_dma_set_bypass(npe);
287 } else {
288 dev_info(&npdev->dev, "Using 32-bit DMA via iommu\n");
289 pnv_npu_dma_set_32(npe);
290 }
291 }
292}
293
294#ifdef CONFIG_IOMMU_API
295/* Switch ownership from platform code to external user (e.g. VFIO) */ 196/* Switch ownership from platform code to external user (e.g. VFIO) */
296static void pnv_npu_take_ownership(struct iommu_table_group *table_group) 197static void pnv_npu_take_ownership(struct iommu_table_group *table_group)
297{ 198{
diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c
index 29ca523c1c79..a2aa5e433ac8 100644
--- a/arch/powerpc/platforms/powernv/opal-call.c
+++ b/arch/powerpc/platforms/powernv/opal-call.c
@@ -257,7 +257,7 @@ OPAL_CALL(opal_xive_set_queue_info, OPAL_XIVE_SET_QUEUE_INFO);
 OPAL_CALL(opal_xive_donate_page, OPAL_XIVE_DONATE_PAGE);
 OPAL_CALL(opal_xive_alloc_vp_block, OPAL_XIVE_ALLOCATE_VP_BLOCK);
 OPAL_CALL(opal_xive_free_vp_block, OPAL_XIVE_FREE_VP_BLOCK);
-OPAL_CALL(opal_xive_allocate_irq, OPAL_XIVE_ALLOCATE_IRQ);
+OPAL_CALL(opal_xive_allocate_irq_raw, OPAL_XIVE_ALLOCATE_IRQ);
 OPAL_CALL(opal_xive_free_irq, OPAL_XIVE_FREE_IRQ);
 OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO);
 OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO);
@@ -287,3 +287,6 @@ OPAL_CALL(opal_pci_set_pbcq_tunnel_bar, OPAL_PCI_SET_PBCQ_TUNNEL_BAR);
 OPAL_CALL(opal_sensor_read_u64, OPAL_SENSOR_READ_U64);
 OPAL_CALL(opal_sensor_group_enable, OPAL_SENSOR_GROUP_ENABLE);
 OPAL_CALL(opal_nx_coproc_init, OPAL_NX_COPROC_INIT);
+OPAL_CALL(opal_mpipl_update, OPAL_MPIPL_UPDATE);
+OPAL_CALL(opal_mpipl_register_tag, OPAL_MPIPL_REGISTER_TAG);
+OPAL_CALL(opal_mpipl_query_tag, OPAL_MPIPL_QUERY_TAG);
diff --git a/arch/powerpc/platforms/powernv/opal-core.c b/arch/powerpc/platforms/powernv/opal-core.c
new file mode 100644
index 000000000000..ed895d82c048
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-core.c
@@ -0,0 +1,636 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Interface for exporting the OPAL ELF core.
4 * Heavily inspired from fs/proc/vmcore.c
5 *
6 * Copyright 2019, Hari Bathini, IBM Corporation.
7 */
8
9#define pr_fmt(fmt) "opal core: " fmt
10
11#include <linux/memblock.h>
12#include <linux/uaccess.h>
13#include <linux/proc_fs.h>
14#include <linux/elf.h>
15#include <linux/elfcore.h>
16#include <linux/kobject.h>
17#include <linux/sysfs.h>
18#include <linux/slab.h>
19#include <linux/crash_core.h>
20#include <linux/of.h>
21
22#include <asm/page.h>
23#include <asm/opal.h>
24#include <asm/fadump-internal.h>
25
26#include "opal-fadump.h"
27
28#define MAX_PT_LOAD_CNT 8
29
30/* NT_AUXV note related info */
31#define AUXV_CNT 1
32#define AUXV_DESC_SZ (((2 * AUXV_CNT) + 1) * sizeof(Elf64_Off))
33
34struct opalcore_config {
35 u32 num_cpus;
36 /* PIR value of crashing CPU */
37 u32 crashing_cpu;
38
39 /* CPU state data info from F/W */
40 u64 cpu_state_destination_vaddr;
41 u64 cpu_state_data_size;
42 u64 cpu_state_entry_size;
43
44 /* OPAL memory to be exported as PT_LOAD segments */
45 u64 ptload_addr[MAX_PT_LOAD_CNT];
46 u64 ptload_size[MAX_PT_LOAD_CNT];
47 u64 ptload_cnt;
48
49 /* Pointer to the first PT_LOAD in the ELF core file */
50 Elf64_Phdr *ptload_phdr;
51
52 /* Total size of opalcore file. */
53 size_t opalcore_size;
54
55 /* Buffer for all the ELF core headers and the PT_NOTE */
56 size_t opalcorebuf_sz;
57 char *opalcorebuf;
58
59 /* NT_AUXV buffer */
60 char auxv_buf[AUXV_DESC_SZ];
61};
62
63struct opalcore {
64 struct list_head list;
65 u64 paddr;
66 size_t size;
67 loff_t offset;
68};
69
70static LIST_HEAD(opalcore_list);
71static struct opalcore_config *oc_conf;
72static const struct opal_mpipl_fadump *opalc_metadata;
73static const struct opal_mpipl_fadump *opalc_cpu_metadata;
74
75/*
76 * Set crashing CPU's signal to SIGUSR1. if the kernel is triggered
77 * by kernel, SIGTERM otherwise.
78 */
79bool kernel_initiated;
80
81static struct opalcore * __init get_new_element(void)
82{
83 return kzalloc(sizeof(struct opalcore), GFP_KERNEL);
84}
85
86static inline int is_opalcore_usable(void)
87{
88 return (oc_conf && oc_conf->opalcorebuf != NULL) ? 1 : 0;
89}
90
91static Elf64_Word *append_elf64_note(Elf64_Word *buf, char *name,
92 u32 type, void *data,
93 size_t data_len)
94{
95 Elf64_Nhdr *note = (Elf64_Nhdr *)buf;
96 Elf64_Word namesz = strlen(name) + 1;
97
98 note->n_namesz = cpu_to_be32(namesz);
99 note->n_descsz = cpu_to_be32(data_len);
100 note->n_type = cpu_to_be32(type);
101 buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf64_Word));
102 memcpy(buf, name, namesz);
103 buf += DIV_ROUND_UP(namesz, sizeof(Elf64_Word));
104 memcpy(buf, data, data_len);
105 buf += DIV_ROUND_UP(data_len, sizeof(Elf64_Word));
106
107 return buf;
108}
109
110static void fill_prstatus(struct elf_prstatus *prstatus, int pir,
111 struct pt_regs *regs)
112{
113 memset(prstatus, 0, sizeof(struct elf_prstatus));
114 elf_core_copy_kernel_regs(&(prstatus->pr_reg), regs);
115
116 /*
117 * Overload PID with PIR value.
118 * As a PIR value could also be '0', add an offset of '100'
119 * to every PIR to avoid misinterpretations in GDB.
120 */
121 prstatus->pr_pid = cpu_to_be32(100 + pir);
122 prstatus->pr_ppid = cpu_to_be32(1);
123
124 /*
125 * Indicate SIGUSR1 for crash initiated from kernel.
126 * SIGTERM otherwise.
127 */
128 if (pir == oc_conf->crashing_cpu) {
129 short sig;
130
131 sig = kernel_initiated ? SIGUSR1 : SIGTERM;
132 prstatus->pr_cursig = cpu_to_be16(sig);
133 }
134}
135
136static Elf64_Word *auxv_to_elf64_notes(Elf64_Word *buf,
137 u64 opal_boot_entry)
138{
139 Elf64_Off *bufp = (Elf64_Off *)oc_conf->auxv_buf;
140 int idx = 0;
141
142 memset(bufp, 0, AUXV_DESC_SZ);
143
144 /* Entry point of OPAL */
145 bufp[idx++] = cpu_to_be64(AT_ENTRY);
146 bufp[idx++] = cpu_to_be64(opal_boot_entry);
147
148 /* end of vector */
149 bufp[idx++] = cpu_to_be64(AT_NULL);
150
151 buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, NT_AUXV,
152 oc_conf->auxv_buf, AUXV_DESC_SZ);
153 return buf;
154}
155
156/*
157 * Read from the ELF header and then the crash dump.
158 * Returns number of bytes read on success, -errno on failure.
159 */
160static ssize_t read_opalcore(struct file *file, struct kobject *kobj,
161 struct bin_attribute *bin_attr, char *to,
162 loff_t pos, size_t count)
163{
164 struct opalcore *m;
165 ssize_t tsz, avail;
166 loff_t tpos = pos;
167
168 if (pos >= oc_conf->opalcore_size)
169 return 0;
170
171 /* Adjust count if it goes beyond opalcore size */
172 avail = oc_conf->opalcore_size - pos;
173 if (count > avail)
174 count = avail;
175
176 if (count == 0)
177 return 0;
178
179 /* Read ELF core header and/or PT_NOTE segment */
180 if (tpos < oc_conf->opalcorebuf_sz) {
181 tsz = min_t(size_t, oc_conf->opalcorebuf_sz - tpos, count);
182 memcpy(to, oc_conf->opalcorebuf + tpos, tsz);
183 to += tsz;
184 tpos += tsz;
185 count -= tsz;
186 }
187
188 list_for_each_entry(m, &opalcore_list, list) {
189 /* nothing more to read here */
190 if (count == 0)
191 break;
192
193 if (tpos < m->offset + m->size) {
194 void *addr;
195
196 tsz = min_t(size_t, m->offset + m->size - tpos, count);
197 addr = (void *)(m->paddr + tpos - m->offset);
198 memcpy(to, __va(addr), tsz);
199 to += tsz;
200 tpos += tsz;
201 count -= tsz;
202 }
203 }
204
205 return (tpos - pos);
206}
207
208static struct bin_attribute opal_core_attr = {
209 .attr = {.name = "core", .mode = 0400},
210 .read = read_opalcore
211};
212
213/*
214 * Read CPU state dump data and convert it into ELF notes.
215 *
216 * Each register entry is of 16 bytes, A numerical identifier along with
217 * a GPR/SPR flag in the first 8 bytes and the register value in the next
218 * 8 bytes. For more details refer to F/W documentation.
219 */
220static Elf64_Word * __init opalcore_append_cpu_notes(Elf64_Word *buf)
221{
222 u32 thread_pir, size_per_thread, regs_offset, regs_cnt, reg_esize;
223 struct hdat_fadump_thread_hdr *thdr;
224 struct elf_prstatus prstatus;
225 Elf64_Word *first_cpu_note;
226 struct pt_regs regs;
227 char *bufp;
228 int i;
229
230 size_per_thread = oc_conf->cpu_state_entry_size;
231 bufp = __va(oc_conf->cpu_state_destination_vaddr);
232
233 /*
234 * Offset for register entries, entry size and registers count is
235 * duplicated in every thread header in keeping with HDAT format.
236 * Use these values from the first thread header.
237 */
238 thdr = (struct hdat_fadump_thread_hdr *)bufp;
239 regs_offset = (offsetof(struct hdat_fadump_thread_hdr, offset) +
240 be32_to_cpu(thdr->offset));
241 reg_esize = be32_to_cpu(thdr->esize);
242 regs_cnt = be32_to_cpu(thdr->ecnt);
243
244 pr_debug("--------CPU State Data------------\n");
245 pr_debug("NumCpus : %u\n", oc_conf->num_cpus);
246 pr_debug("\tOffset: %u, Entry size: %u, Cnt: %u\n",
247 regs_offset, reg_esize, regs_cnt);
248
249 /*
250 * Skip past the first CPU note. Fill this note with the
251 * crashing CPU's prstatus.
252 */
253 first_cpu_note = buf;
254 buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, NT_PRSTATUS,
255 &prstatus, sizeof(prstatus));
256
257 for (i = 0; i < oc_conf->num_cpus; i++, bufp += size_per_thread) {
258 thdr = (struct hdat_fadump_thread_hdr *)bufp;
259 thread_pir = be32_to_cpu(thdr->pir);
260
261 pr_debug("[%04d] PIR: 0x%x, core state: 0x%02x\n",
262 i, thread_pir, thdr->core_state);
263
264 /*
265 * Register state data of MAX cores is provided by firmware,
266 * but some of this cores may not be active. So, while
267 * processing register state data, check core state and
268 * skip threads that belong to inactive cores.
269 */
270 if (thdr->core_state == HDAT_FADUMP_CORE_INACTIVE)
271 continue;
272
273 opal_fadump_read_regs((bufp + regs_offset), regs_cnt,
274 reg_esize, false, &regs);
275
276 pr_debug("PIR 0x%x - R1 : 0x%llx, NIP : 0x%llx\n", thread_pir,
277 be64_to_cpu(regs.gpr[1]), be64_to_cpu(regs.nip));
278 fill_prstatus(&prstatus, thread_pir, &regs);
279
280 if (thread_pir != oc_conf->crashing_cpu) {
281 buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME,
282 NT_PRSTATUS, &prstatus,
283 sizeof(prstatus));
284 } else {
285 /*
286 * Add crashing CPU as the first NT_PRSTATUS note for
287 * GDB to process the core file appropriately.
288 */
289 append_elf64_note(first_cpu_note, CRASH_CORE_NOTE_NAME,
290 NT_PRSTATUS, &prstatus,
291 sizeof(prstatus));
292 }
293 }
294
295 return buf;
296}
297
298static int __init create_opalcore(void)
299{
300 u64 opal_boot_entry, opal_base_addr, paddr;
301 u32 hdr_size, cpu_notes_size, count;
302 struct device_node *dn;
303 struct opalcore *new;
304 loff_t opalcore_off;
305 struct page *page;
306 Elf64_Phdr *phdr;
307 Elf64_Ehdr *elf;
308 int i, ret;
309 char *bufp;
310
311 /* Get size of header & CPU notes for OPAL core */
312 hdr_size = (sizeof(Elf64_Ehdr) +
313 ((oc_conf->ptload_cnt + 1) * sizeof(Elf64_Phdr)));
314 cpu_notes_size = ((oc_conf->num_cpus * (CRASH_CORE_NOTE_HEAD_BYTES +
315 CRASH_CORE_NOTE_NAME_BYTES +
316 CRASH_CORE_NOTE_DESC_BYTES)) +
317 (CRASH_CORE_NOTE_HEAD_BYTES +
318 CRASH_CORE_NOTE_NAME_BYTES + AUXV_DESC_SZ));
319
320 /* Allocate buffer to setup OPAL core */
321 oc_conf->opalcorebuf_sz = PAGE_ALIGN(hdr_size + cpu_notes_size);
322 oc_conf->opalcorebuf = alloc_pages_exact(oc_conf->opalcorebuf_sz,
323 GFP_KERNEL | __GFP_ZERO);
324 if (!oc_conf->opalcorebuf) {
325 pr_err("Not enough memory to setup OPAL core (size: %lu)\n",
326 oc_conf->opalcorebuf_sz);
327 oc_conf->opalcorebuf_sz = 0;
328 return -ENOMEM;
329 }
330 count = oc_conf->opalcorebuf_sz / PAGE_SIZE;
331 page = virt_to_page(oc_conf->opalcorebuf);
332 for (i = 0; i < count; i++)
333 mark_page_reserved(page + i);
334
335 pr_debug("opalcorebuf = 0x%llx\n", (u64)oc_conf->opalcorebuf);
336
337 /* Read OPAL related device-tree entries */
338 dn = of_find_node_by_name(NULL, "ibm,opal");
339 if (dn) {
340 ret = of_property_read_u64(dn, "opal-base-address",
341 &opal_base_addr);
342 pr_debug("opal-base-address: %llx\n", opal_base_addr);
343 ret |= of_property_read_u64(dn, "opal-boot-address",
344 &opal_boot_entry);
345 pr_debug("opal-boot-address: %llx\n", opal_boot_entry);
346 }
347 if (!dn || ret)
348 pr_warn("WARNING: Failed to read OPAL base & entry values\n");
349
350 /* Use count to keep track of the program headers */
351 count = 0;
352
353 bufp = oc_conf->opalcorebuf;
354 elf = (Elf64_Ehdr *)bufp;
355 bufp += sizeof(Elf64_Ehdr);
356 memcpy(elf->e_ident, ELFMAG, SELFMAG);
357 elf->e_ident[EI_CLASS] = ELF_CLASS;
358 elf->e_ident[EI_DATA] = ELFDATA2MSB;
359 elf->e_ident[EI_VERSION] = EV_CURRENT;
360 elf->e_ident[EI_OSABI] = ELF_OSABI;
361 memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD);
362 elf->e_type = cpu_to_be16(ET_CORE);
363 elf->e_machine = cpu_to_be16(ELF_ARCH);
364 elf->e_version = cpu_to_be32(EV_CURRENT);
365 elf->e_entry = 0;
366 elf->e_phoff = cpu_to_be64(sizeof(Elf64_Ehdr));
367 elf->e_shoff = 0;
368 elf->e_flags = 0;
369
370 elf->e_ehsize = cpu_to_be16(sizeof(Elf64_Ehdr));
371 elf->e_phentsize = cpu_to_be16(sizeof(Elf64_Phdr));
372 elf->e_phnum = 0;
373 elf->e_shentsize = 0;
374 elf->e_shnum = 0;
375 elf->e_shstrndx = 0;
376
377 phdr = (Elf64_Phdr *)bufp;
378 bufp += sizeof(Elf64_Phdr);
379 phdr->p_type = cpu_to_be32(PT_NOTE);
380 phdr->p_flags = 0;
381 phdr->p_align = 0;
382 phdr->p_paddr = phdr->p_vaddr = 0;
383 phdr->p_offset = cpu_to_be64(hdr_size);
384 phdr->p_filesz = phdr->p_memsz = cpu_to_be64(cpu_notes_size);
385 count++;
386
387 opalcore_off = oc_conf->opalcorebuf_sz;
388 oc_conf->ptload_phdr = (Elf64_Phdr *)bufp;
389 paddr = 0;
390 for (i = 0; i < oc_conf->ptload_cnt; i++) {
391 phdr = (Elf64_Phdr *)bufp;
392 bufp += sizeof(Elf64_Phdr);
393 phdr->p_type = cpu_to_be32(PT_LOAD);
394 phdr->p_flags = cpu_to_be32(PF_R|PF_W|PF_X);
395 phdr->p_align = 0;
396
397 new = get_new_element();
398 if (!new)
399 return -ENOMEM;
400 new->paddr = oc_conf->ptload_addr[i];
401 new->size = oc_conf->ptload_size[i];
402 new->offset = opalcore_off;
403 list_add_tail(&new->list, &opalcore_list);
404
405 phdr->p_paddr = cpu_to_be64(paddr);
406 phdr->p_vaddr = cpu_to_be64(opal_base_addr + paddr);
407 phdr->p_filesz = phdr->p_memsz =
408 cpu_to_be64(oc_conf->ptload_size[i]);
409 phdr->p_offset = cpu_to_be64(opalcore_off);
410
411 count++;
412 opalcore_off += oc_conf->ptload_size[i];
413 paddr += oc_conf->ptload_size[i];
414 }
415
416 elf->e_phnum = cpu_to_be16(count);
417
418 bufp = (char *)opalcore_append_cpu_notes((Elf64_Word *)bufp);
419 bufp = (char *)auxv_to_elf64_notes((Elf64_Word *)bufp, opal_boot_entry);
420
421 oc_conf->opalcore_size = opalcore_off;
422 return 0;
423}
424
425static void opalcore_cleanup(void)
426{
427 if (oc_conf == NULL)
428 return;
429
430 /* Remove OPAL core sysfs file */
431 sysfs_remove_bin_file(opal_kobj, &opal_core_attr);
432 oc_conf->ptload_phdr = NULL;
433 oc_conf->ptload_cnt = 0;
434
435 /* free the buffer used for setting up OPAL core */
436 if (oc_conf->opalcorebuf) {
437 void *end = (void *)((u64)oc_conf->opalcorebuf +
438 oc_conf->opalcorebuf_sz);
439
440 free_reserved_area(oc_conf->opalcorebuf, end, -1, NULL);
441 oc_conf->opalcorebuf = NULL;
442 oc_conf->opalcorebuf_sz = 0;
443 }
444
445 kfree(oc_conf);
446 oc_conf = NULL;
447}
448__exitcall(opalcore_cleanup);
449
450static void __init opalcore_config_init(void)
451{
452 u32 idx, cpu_data_version;
453 struct device_node *np;
454 const __be32 *prop;
455 u64 addr = 0;
456 int i, ret;
457
458 np = of_find_node_by_path("/ibm,opal/dump");
459 if (np == NULL)
460 return;
461
462 if (!of_device_is_compatible(np, "ibm,opal-dump")) {
463 pr_warn("Support missing for this f/w version!\n");
464 return;
465 }
466
467 /* Check if dump has been initiated on last reboot */
468 prop = of_get_property(np, "mpipl-boot", NULL);
469 if (!prop) {
470 of_node_put(np);
471 return;
472 }
473
474 /* Get OPAL metadata */
475 ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_OPAL, &addr);
476 if ((ret != OPAL_SUCCESS) || !addr) {
477 pr_err("Failed to get OPAL metadata (%d)\n", ret);
478 goto error_out;
479 }
480
481 addr = be64_to_cpu(addr);
482 pr_debug("OPAL metadata addr: %llx\n", addr);
483 opalc_metadata = __va(addr);
484
485 /* Get OPAL CPU metadata */
486 ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_CPU, &addr);
487 if ((ret != OPAL_SUCCESS) || !addr) {
488 pr_err("Failed to get OPAL CPU metadata (%d)\n", ret);
489 goto error_out;
490 }
491
492 addr = be64_to_cpu(addr);
493 pr_debug("CPU metadata addr: %llx\n", addr);
494 opalc_cpu_metadata = __va(addr);
495
496 /* Allocate memory for config buffer */
497 oc_conf = kzalloc(sizeof(struct opalcore_config), GFP_KERNEL);
498 if (oc_conf == NULL)
499 goto error_out;
500
501 /* Parse OPAL metadata */
502 if (opalc_metadata->version != OPAL_MPIPL_VERSION) {
503 pr_warn("Supported OPAL metadata version: %u, found: %u!\n",
504 OPAL_MPIPL_VERSION, opalc_metadata->version);
505 pr_warn("WARNING: F/W using newer OPAL metadata format!!\n");
506 }
507
508 oc_conf->ptload_cnt = 0;
509 idx = be32_to_cpu(opalc_metadata->region_cnt);
510 if (idx > MAX_PT_LOAD_CNT) {
511 pr_warn("WARNING: OPAL regions count (%d) adjusted to limit (%d)",
512			idx, MAX_PT_LOAD_CNT);
513 idx = MAX_PT_LOAD_CNT;
514 }
515 for (i = 0; i < idx; i++) {
516 oc_conf->ptload_addr[oc_conf->ptload_cnt] =
517 be64_to_cpu(opalc_metadata->region[i].dest);
518 oc_conf->ptload_size[oc_conf->ptload_cnt++] =
519 be64_to_cpu(opalc_metadata->region[i].size);
520 }
521 oc_conf->ptload_cnt = i;
522 oc_conf->crashing_cpu = be32_to_cpu(opalc_metadata->crashing_pir);
523
524 if (!oc_conf->ptload_cnt) {
525 pr_err("OPAL memory regions not found\n");
526 goto error_out;
527 }
528
529 /* Parse OPAL CPU metadata */
530 cpu_data_version = be32_to_cpu(opalc_cpu_metadata->cpu_data_version);
531 if (cpu_data_version != HDAT_FADUMP_CPU_DATA_VER) {
532 pr_warn("Supported CPU data version: %u, found: %u!\n",
533 HDAT_FADUMP_CPU_DATA_VER, cpu_data_version);
534 pr_warn("WARNING: F/W using newer CPU state data format!!\n");
535 }
536
537 addr = be64_to_cpu(opalc_cpu_metadata->region[0].dest);
538 if (!addr) {
539 pr_err("CPU state data not found!\n");
540 goto error_out;
541 }
542 oc_conf->cpu_state_destination_vaddr = (u64)__va(addr);
543
544 oc_conf->cpu_state_data_size =
545 be64_to_cpu(opalc_cpu_metadata->region[0].size);
546 oc_conf->cpu_state_entry_size =
547 be32_to_cpu(opalc_cpu_metadata->cpu_data_size);
548
549 if ((oc_conf->cpu_state_entry_size == 0) ||
550 (oc_conf->cpu_state_entry_size > oc_conf->cpu_state_data_size)) {
551 pr_err("CPU state data is invalid.\n");
552 goto error_out;
553 }
554 oc_conf->num_cpus = (oc_conf->cpu_state_data_size /
555 oc_conf->cpu_state_entry_size);
556
557 of_node_put(np);
558 return;
559
560error_out:
561 pr_err("Could not export /sys/firmware/opal/core\n");
562 opalcore_cleanup();
563 of_node_put(np);
564}
565
566static ssize_t fadump_release_opalcore_store(struct kobject *kobj,
567 struct kobj_attribute *attr,
568 const char *buf, size_t count)
569{
570 int input = -1;
571
572 if (kstrtoint(buf, 0, &input))
573 return -EINVAL;
574
575 if (input == 1) {
576 if (oc_conf == NULL) {
577 pr_err("'/sys/firmware/opal/core' file not accessible!\n");
578 return -EPERM;
579 }
580
581 /*
582 * Take away '/sys/firmware/opal/core' and release all memory
583 * used for exporting this file.
584 */
585 opalcore_cleanup();
586 } else
587 return -EINVAL;
588
589 return count;
590}
591
592static struct kobj_attribute opalcore_rel_attr = __ATTR(fadump_release_opalcore,
593 0200, NULL,
594 fadump_release_opalcore_store);
595
596static int __init opalcore_init(void)
597{
598 int rc = -1;
599
600 opalcore_config_init();
601
602 if (oc_conf == NULL)
603 return rc;
604
605 create_opalcore();
606
607 /*
608	 * If oc_conf->opalcorebuf is set in the 2nd kernel,
609 * then capture the dump.
610 */
611 if (!(is_opalcore_usable())) {
612 pr_err("Failed to export /sys/firmware/opal/core\n");
613 opalcore_cleanup();
614 return rc;
615 }
616
617 /* Set OPAL core file size */
618 opal_core_attr.size = oc_conf->opalcore_size;
619
620 /* Export OPAL core sysfs file */
621 rc = sysfs_create_bin_file(opal_kobj, &opal_core_attr);
622 if (rc != 0) {
623 pr_err("Failed to export /sys/firmware/opal/core\n");
624 opalcore_cleanup();
625 return rc;
626 }
627
628 rc = sysfs_create_file(kernel_kobj, &opalcore_rel_attr.attr);
629 if (rc) {
630 pr_warn("unable to create sysfs file fadump_release_opalcore (%d)\n",
631 rc);
632 }
633
634 return 0;
635}
636fs_initcall(opalcore_init);
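
Illustrative sketch (not part of the patch): a minimal userspace consumer of the interface set up above. The sysfs paths /sys/firmware/opal/core (bin file created on opal_kobj) and /sys/kernel/fadump_release_opalcore (attribute created on kernel_kobj) are inferred from the code, not spelled out in it.

    /* Copy the OPAL core to a local file, then ask the kernel to release it. */
    #include <stdio.h>

    int main(void)
    {
            FILE *in = fopen("/sys/firmware/opal/core", "rb");
            FILE *out = fopen("opalcore", "wb");
            FILE *rel;
            char buf[4096];
            size_t n;

            if (!in || !out)
                    return 1;
            while ((n = fread(buf, 1, sizeof(buf), in)) > 0)
                    fwrite(buf, 1, n, out);
            fclose(in);
            fclose(out);

            /* Writing "1" runs opalcore_cleanup() and frees the reserved buffer. */
            rel = fopen("/sys/kernel/fadump_release_opalcore", "w");
            if (!rel)
                    return 1;
            fputs("1\n", rel);
            fclose(rel);
            return 0;
    }
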
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c
new file mode 100644
index 000000000000..d361d37d975f
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-fadump.c
@@ -0,0 +1,716 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * Firmware-Assisted Dump support on POWER platform (OPAL).
4 *
5 * Copyright 2019, Hari Bathini, IBM Corporation.
6 */
7
8#define pr_fmt(fmt) "opal fadump: " fmt
9
10#include <linux/string.h>
11#include <linux/seq_file.h>
12#include <linux/of.h>
13#include <linux/of_fdt.h>
14#include <linux/libfdt.h>
15#include <linux/mm.h>
16#include <linux/crash_dump.h>
17
18#include <asm/page.h>
19#include <asm/opal.h>
20#include <asm/fadump-internal.h>
21
22#include "opal-fadump.h"
23
24
25#ifdef CONFIG_PRESERVE_FA_DUMP
26/*
27 * When dump is active but PRESERVE_FA_DUMP is enabled on the kernel,
28 * ensure crash data is preserved in the hope that the subsequent memory-
29 * preserving kernel boot will process this crash data.
30 */
31void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node)
32{
33 const struct opal_fadump_mem_struct *opal_fdm_active;
34 const __be32 *prop;
35 unsigned long dn;
36 u64 addr = 0;
37 s64 ret;
38
39 dn = of_get_flat_dt_subnode_by_name(node, "dump");
40 if (dn == -FDT_ERR_NOTFOUND)
41 return;
42
43 /*
44 * Check if dump has been initiated on last reboot.
45 */
46 prop = of_get_flat_dt_prop(dn, "mpipl-boot", NULL);
47 if (!prop)
48 return;
49
50 ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_KERNEL, &addr);
51 if ((ret != OPAL_SUCCESS) || !addr) {
52 pr_debug("Could not get Kernel metadata (%lld)\n", ret);
53 return;
54 }
55
56 /*
57 * Preserve memory only if kernel memory regions are registered
58 * with f/w for MPIPL.
59 */
60 addr = be64_to_cpu(addr);
61 pr_debug("Kernel metadata addr: %llx\n", addr);
62 opal_fdm_active = (void *)addr;
63 if (opal_fdm_active->registered_regions == 0)
64 return;
65
66 ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_BOOT_MEM, &addr);
67 if ((ret != OPAL_SUCCESS) || !addr) {
68 pr_err("Failed to get boot memory tag (%lld)\n", ret);
69 return;
70 }
71
72 /*
73 * Memory below this address can be used for booting a
74 * capture kernel or petitboot kernel. Preserve everything
75 * above this address for processing crashdump.
76 */
77 fadump_conf->boot_mem_top = be64_to_cpu(addr);
78 pr_debug("Preserve everything above %llx\n", fadump_conf->boot_mem_top);
79
80 pr_info("Firmware-assisted dump is active.\n");
81 fadump_conf->dump_active = 1;
82}
83
84#else /* CONFIG_PRESERVE_FA_DUMP */
85static const struct opal_fadump_mem_struct *opal_fdm_active;
86static const struct opal_mpipl_fadump *opal_cpu_metadata;
87static struct opal_fadump_mem_struct *opal_fdm;
88
89#ifdef CONFIG_OPAL_CORE
90extern bool kernel_initiated;
91#endif
92
93static int opal_fadump_unregister(struct fw_dump *fadump_conf);
94
95static void opal_fadump_update_config(struct fw_dump *fadump_conf,
96 const struct opal_fadump_mem_struct *fdm)
97{
98 pr_debug("Boot memory regions count: %d\n", fdm->region_cnt);
99
100 /*
101 * The destination address of the first boot memory region is the
102 * destination address of boot memory regions.
103 */
104 fadump_conf->boot_mem_dest_addr = fdm->rgn[0].dest;
105 pr_debug("Destination address of boot memory regions: %#016llx\n",
106 fadump_conf->boot_mem_dest_addr);
107
108 fadump_conf->fadumphdr_addr = fdm->fadumphdr_addr;
109}
110
111/*
112 * This function is called in the capture kernel to get configuration details
113 * from metadata setup by the first kernel.
114 */
115static void opal_fadump_get_config(struct fw_dump *fadump_conf,
116 const struct opal_fadump_mem_struct *fdm)
117{
118 unsigned long base, size, last_end, hole_size;
119 int i;
120
121 if (!fadump_conf->dump_active)
122 return;
123
124 last_end = 0;
125 hole_size = 0;
126 fadump_conf->boot_memory_size = 0;
127
128 pr_debug("Boot memory regions:\n");
129 for (i = 0; i < fdm->region_cnt; i++) {
130 base = fdm->rgn[i].src;
131 size = fdm->rgn[i].size;
132 pr_debug("\t[%03d] base: 0x%lx, size: 0x%lx\n", i, base, size);
133
134 fadump_conf->boot_mem_addr[i] = base;
135 fadump_conf->boot_mem_sz[i] = size;
136 fadump_conf->boot_memory_size += size;
137 hole_size += (base - last_end);
138
139 last_end = base + size;
140 }
141
142 /*
143 * Start address of reserve dump area (permanent reservation) for
144 * re-registering FADump after dump capture.
145 */
146 fadump_conf->reserve_dump_area_start = fdm->rgn[0].dest;
147
148 /*
149	 * Rarely, the system can crash before all boot memory regions
150	 * are registered for MPIPL. In such cases, warn that the vmcore
151	 * may not be accurate and proceed anyway, as that is the best bet
152	 * considering free pages, cache pages, user pages, etc. are
153	 * usually filtered out.
154 *
155 * Hope the memory that could not be preserved only has pages
156 * that are usually filtered out while saving the vmcore.
157 */
158 if (fdm->region_cnt > fdm->registered_regions) {
159 pr_warn("Not all memory regions were saved!!!\n");
160 pr_warn(" Unsaved memory regions:\n");
161 i = fdm->registered_regions;
162 while (i < fdm->region_cnt) {
163 pr_warn("\t[%03d] base: 0x%llx, size: 0x%llx\n",
164 i, fdm->rgn[i].src, fdm->rgn[i].size);
165 i++;
166 }
167
168 pr_warn("If the unsaved regions only contain pages that are filtered out (eg. free/user pages), the vmcore should still be usable.\n");
169 pr_warn("WARNING: If the unsaved regions contain kernel pages, the vmcore will be corrupted.\n");
170 }
171
172 fadump_conf->boot_mem_top = (fadump_conf->boot_memory_size + hole_size);
173 fadump_conf->boot_mem_regs_cnt = fdm->region_cnt;
174 opal_fadump_update_config(fadump_conf, fdm);
175}
176
177/* Initialize kernel metadata */
178static void opal_fadump_init_metadata(struct opal_fadump_mem_struct *fdm)
179{
180 fdm->version = OPAL_FADUMP_VERSION;
181 fdm->region_cnt = 0;
182 fdm->registered_regions = 0;
183 fdm->fadumphdr_addr = 0;
184}
185
186static u64 opal_fadump_init_mem_struct(struct fw_dump *fadump_conf)
187{
188 u64 addr = fadump_conf->reserve_dump_area_start;
189 int i;
190
191 opal_fdm = __va(fadump_conf->kernel_metadata);
192 opal_fadump_init_metadata(opal_fdm);
193
194 /* Boot memory regions */
195 for (i = 0; i < fadump_conf->boot_mem_regs_cnt; i++) {
196 opal_fdm->rgn[i].src = fadump_conf->boot_mem_addr[i];
197 opal_fdm->rgn[i].dest = addr;
198 opal_fdm->rgn[i].size = fadump_conf->boot_mem_sz[i];
199
200 opal_fdm->region_cnt++;
201 addr += fadump_conf->boot_mem_sz[i];
202 }
203
204 /*
205	 * Kernel metadata is passed to f/w and retrieved in the capture kernel.
206 * So, use it to save fadump header address instead of calculating it.
207 */
208 opal_fdm->fadumphdr_addr = (opal_fdm->rgn[0].dest +
209 fadump_conf->boot_memory_size);
210
211 opal_fadump_update_config(fadump_conf, opal_fdm);
212
213 return addr;
214}
215
216static u64 opal_fadump_get_metadata_size(void)
217{
218 return PAGE_ALIGN(sizeof(struct opal_fadump_mem_struct));
219}
220
221static int opal_fadump_setup_metadata(struct fw_dump *fadump_conf)
222{
223 int err = 0;
224 s64 ret;
225
226 /*
227 * Use the last page(s) in FADump memory reservation for
228 * kernel metadata.
229 */
230 fadump_conf->kernel_metadata = (fadump_conf->reserve_dump_area_start +
231 fadump_conf->reserve_dump_area_size -
232 opal_fadump_get_metadata_size());
233 pr_info("Kernel metadata addr: %llx\n", fadump_conf->kernel_metadata);
234
235 /* Initialize kernel metadata before registering the address with f/w */
236 opal_fdm = __va(fadump_conf->kernel_metadata);
237 opal_fadump_init_metadata(opal_fdm);
238
239 /*
240 * Register metadata address with f/w. Can be retrieved in
241 * the capture kernel.
242 */
243 ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_KERNEL,
244 fadump_conf->kernel_metadata);
245 if (ret != OPAL_SUCCESS) {
246 pr_err("Failed to set kernel metadata tag!\n");
247 err = -EPERM;
248 }
249
250 /*
251 * Register boot memory top address with f/w. Should be retrieved
252	 * by a kernel that intends to preserve the crashed kernel's memory.
253 */
254 ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_BOOT_MEM,
255 fadump_conf->boot_mem_top);
256 if (ret != OPAL_SUCCESS) {
257 pr_err("Failed to set boot memory tag!\n");
258 err = -EPERM;
259 }
260
261 return err;
262}
263
264static u64 opal_fadump_get_bootmem_min(void)
265{
266 return OPAL_FADUMP_MIN_BOOT_MEM;
267}
268
269static int opal_fadump_register(struct fw_dump *fadump_conf)
270{
271 s64 rc = OPAL_PARAMETER;
272 int i, err = -EIO;
273
274 for (i = 0; i < opal_fdm->region_cnt; i++) {
275 rc = opal_mpipl_update(OPAL_MPIPL_ADD_RANGE,
276 opal_fdm->rgn[i].src,
277 opal_fdm->rgn[i].dest,
278 opal_fdm->rgn[i].size);
279 if (rc != OPAL_SUCCESS)
280 break;
281
282 opal_fdm->registered_regions++;
283 }
284
285 switch (rc) {
286 case OPAL_SUCCESS:
287 pr_info("Registration is successful!\n");
288 fadump_conf->dump_registered = 1;
289 err = 0;
290 break;
291 case OPAL_RESOURCE:
292 /* If MAX regions limit in f/w is hit, warn and proceed. */
293 pr_warn("%d regions could not be registered for MPIPL as MAX limit is reached!\n",
294 (opal_fdm->region_cnt - opal_fdm->registered_regions));
295 fadump_conf->dump_registered = 1;
296 err = 0;
297 break;
298 case OPAL_PARAMETER:
299 pr_err("Failed to register. Parameter Error(%lld).\n", rc);
300 break;
301 case OPAL_HARDWARE:
302 pr_err("Support not available.\n");
303 fadump_conf->fadump_supported = 0;
304 fadump_conf->fadump_enabled = 0;
305 break;
306 default:
307 pr_err("Failed to register. Unknown Error(%lld).\n", rc);
308 break;
309 }
310
311 /*
312 * If some regions were registered before OPAL_MPIPL_ADD_RANGE
313 * OPAL call failed, unregister all regions.
314 */
315 if ((err < 0) && (opal_fdm->registered_regions > 0))
316 opal_fadump_unregister(fadump_conf);
317
318 return err;
319}
320
321static int opal_fadump_unregister(struct fw_dump *fadump_conf)
322{
323 s64 rc;
324
325 rc = opal_mpipl_update(OPAL_MPIPL_REMOVE_ALL, 0, 0, 0);
326 if (rc) {
327 pr_err("Failed to un-register - unexpected Error(%lld).\n", rc);
328 return -EIO;
329 }
330
331 opal_fdm->registered_regions = 0;
332 fadump_conf->dump_registered = 0;
333 return 0;
334}
335
336static int opal_fadump_invalidate(struct fw_dump *fadump_conf)
337{
338 s64 rc;
339
340 rc = opal_mpipl_update(OPAL_MPIPL_FREE_PRESERVED_MEMORY, 0, 0, 0);
341 if (rc) {
342 pr_err("Failed to invalidate - unexpected Error(%lld).\n", rc);
343 return -EIO;
344 }
345
346 fadump_conf->dump_active = 0;
347 opal_fdm_active = NULL;
348 return 0;
349}
350
351static void opal_fadump_cleanup(struct fw_dump *fadump_conf)
352{
353 s64 ret;
354
355 ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_KERNEL, 0);
356 if (ret != OPAL_SUCCESS)
357 pr_warn("Could not reset (%llu) kernel metadata tag!\n", ret);
358}
359
360/*
361 * Verify if CPU state data is available. If available, do a bit of sanity
362 * checking before processing this data.
363 */
364static bool __init is_opal_fadump_cpu_data_valid(struct fw_dump *fadump_conf)
365{
366 if (!opal_cpu_metadata)
367 return false;
368
369 fadump_conf->cpu_state_data_version =
370 be32_to_cpu(opal_cpu_metadata->cpu_data_version);
371 fadump_conf->cpu_state_entry_size =
372 be32_to_cpu(opal_cpu_metadata->cpu_data_size);
373 fadump_conf->cpu_state_dest_vaddr =
374 (u64)__va(be64_to_cpu(opal_cpu_metadata->region[0].dest));
375 fadump_conf->cpu_state_data_size =
376 be64_to_cpu(opal_cpu_metadata->region[0].size);
377
378 if (fadump_conf->cpu_state_data_version != HDAT_FADUMP_CPU_DATA_VER) {
379 pr_warn("Supported CPU state data version: %u, found: %d!\n",
380 HDAT_FADUMP_CPU_DATA_VER,
381 fadump_conf->cpu_state_data_version);
382 pr_warn("WARNING: F/W using newer CPU state data format!!\n");
383 }
384
385 if ((fadump_conf->cpu_state_dest_vaddr == 0) ||
386 (fadump_conf->cpu_state_entry_size == 0) ||
387 (fadump_conf->cpu_state_entry_size >
388 fadump_conf->cpu_state_data_size)) {
389 pr_err("CPU state data is invalid. Ignoring!\n");
390 return false;
391 }
392
393 return true;
394}
395
396/*
397 * Convert CPU state data saved at the time of crash into ELF notes.
398 *
399 * While the crashing CPU's register data is saved by the kernel, CPU state
400 * data for all CPUs is saved by f/w. In CPU state data provided by f/w,
401 * each register entry is of 16 bytes, a numerical identifier along with
402 * a GPR/SPR flag in the first 8 bytes and the register value in the next
403 * 8 bytes. For more details refer to F/W documentation. If this data is
404 * missing or in unsupported format, append crashing CPU's register data
405 * saved by the kernel in the PT_NOTE, to have something to work with in
406 * the vmcore file.
407 */
408static int __init
409opal_fadump_build_cpu_notes(struct fw_dump *fadump_conf,
410 struct fadump_crash_info_header *fdh)
411{
412 u32 thread_pir, size_per_thread, regs_offset, regs_cnt, reg_esize;
413 struct hdat_fadump_thread_hdr *thdr;
414 bool is_cpu_data_valid = false;
415 u32 num_cpus = 1, *note_buf;
416 struct pt_regs regs;
417 char *bufp;
418 int rc, i;
419
420 if (is_opal_fadump_cpu_data_valid(fadump_conf)) {
421 size_per_thread = fadump_conf->cpu_state_entry_size;
422 num_cpus = (fadump_conf->cpu_state_data_size / size_per_thread);
423 bufp = __va(fadump_conf->cpu_state_dest_vaddr);
424 is_cpu_data_valid = true;
425 }
426
427 rc = fadump_setup_cpu_notes_buf(num_cpus);
428 if (rc != 0)
429 return rc;
430
431 note_buf = (u32 *)fadump_conf->cpu_notes_buf_vaddr;
432 if (!is_cpu_data_valid)
433 goto out;
434
435 /*
436	 * Offset for register entries, entry size and registers count are
437	 * duplicated in every thread header, in keeping with HDAT format.
438 * Use these values from the first thread header.
439 */
440 thdr = (struct hdat_fadump_thread_hdr *)bufp;
441 regs_offset = (offsetof(struct hdat_fadump_thread_hdr, offset) +
442 be32_to_cpu(thdr->offset));
443 reg_esize = be32_to_cpu(thdr->esize);
444 regs_cnt = be32_to_cpu(thdr->ecnt);
445
446 pr_debug("--------CPU State Data------------\n");
447 pr_debug("NumCpus : %u\n", num_cpus);
448 pr_debug("\tOffset: %u, Entry size: %u, Cnt: %u\n",
449 regs_offset, reg_esize, regs_cnt);
450
451 for (i = 0; i < num_cpus; i++, bufp += size_per_thread) {
452 thdr = (struct hdat_fadump_thread_hdr *)bufp;
453
454 thread_pir = be32_to_cpu(thdr->pir);
455 pr_debug("[%04d] PIR: 0x%x, core state: 0x%02x\n",
456 i, thread_pir, thdr->core_state);
457
458 /*
459 * If this is kernel initiated crash, crashing_cpu would be set
460 * appropriately and register data of the crashing CPU saved by
461 * crashing kernel. Add this saved register data of crashing CPU
462 * to elf notes and populate the pt_regs for the remaining CPUs
463 * from register state data provided by firmware.
464 */
465 if (fdh->crashing_cpu == thread_pir) {
466 note_buf = fadump_regs_to_elf_notes(note_buf,
467 &fdh->regs);
468 pr_debug("Crashing CPU PIR: 0x%x - R1 : 0x%lx, NIP : 0x%lx\n",
469 fdh->crashing_cpu, fdh->regs.gpr[1],
470 fdh->regs.nip);
471 continue;
472 }
473
474 /*
475 * Register state data of MAX cores is provided by firmware,
476	 * but some of these cores may not be active. So, while
477 * processing register state data, check core state and
478 * skip threads that belong to inactive cores.
479 */
480 if (thdr->core_state == HDAT_FADUMP_CORE_INACTIVE)
481 continue;
482
483 opal_fadump_read_regs((bufp + regs_offset), regs_cnt,
484 reg_esize, true, &regs);
485 note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
486 pr_debug("CPU PIR: 0x%x - R1 : 0x%lx, NIP : 0x%lx\n",
487 thread_pir, regs.gpr[1], regs.nip);
488 }
489
490out:
491 /*
492 * CPU state data is invalid/unsupported. Try appending crashing CPU's
493 * register data, if it is saved by the kernel.
494 */
495 if (fadump_conf->cpu_notes_buf_vaddr == (u64)note_buf) {
496 if (fdh->crashing_cpu == FADUMP_CPU_UNKNOWN) {
497 fadump_free_cpu_notes_buf();
498 return -ENODEV;
499 }
500
501 pr_warn("WARNING: appending only crashing CPU's register data\n");
502 note_buf = fadump_regs_to_elf_notes(note_buf, &(fdh->regs));
503 }
504
505 final_note(note_buf);
506
507 pr_debug("Updating elfcore header (%llx) with cpu notes\n",
508 fdh->elfcorehdr_addr);
509 fadump_update_elfcore_header(__va(fdh->elfcorehdr_addr));
510 return 0;
511}
512
513static int __init opal_fadump_process(struct fw_dump *fadump_conf)
514{
515 struct fadump_crash_info_header *fdh;
516 int rc = -EINVAL;
517
518 if (!opal_fdm_active || !fadump_conf->fadumphdr_addr)
519 return rc;
520
521 /* Validate the fadump crash info header */
522 fdh = __va(fadump_conf->fadumphdr_addr);
523 if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) {
524 pr_err("Crash info header is not valid.\n");
525 return rc;
526 }
527
528#ifdef CONFIG_OPAL_CORE
529 /*
530 * If this is a kernel initiated crash, crashing_cpu would be set
531 * appropriately and register data of the crashing CPU saved by
532 * crashing kernel. Add this saved register data of crashing CPU
533 * to elf notes and populate the pt_regs for the remaining CPUs
534 * from register state data provided by firmware.
535 */
536 if (fdh->crashing_cpu != FADUMP_CPU_UNKNOWN)
537 kernel_initiated = true;
538#endif
539
540 rc = opal_fadump_build_cpu_notes(fadump_conf, fdh);
541 if (rc)
542 return rc;
543
544 /*
545 * We are done validating dump info and elfcore header is now ready
546 * to be exported. set elfcorehdr_addr so that vmcore module will
547 * export the elfcore header through '/proc/vmcore'.
548 */
549 elfcorehdr_addr = fdh->elfcorehdr_addr;
550
551 return rc;
552}
553
554static void opal_fadump_region_show(struct fw_dump *fadump_conf,
555 struct seq_file *m)
556{
557 const struct opal_fadump_mem_struct *fdm_ptr;
558 u64 dumped_bytes = 0;
559 int i;
560
561 if (fadump_conf->dump_active)
562 fdm_ptr = opal_fdm_active;
563 else
564 fdm_ptr = opal_fdm;
565
566 for (i = 0; i < fdm_ptr->region_cnt; i++) {
567 /*
568 * Only regions that are registered for MPIPL
569 * would have dump data.
570 */
571 if ((fadump_conf->dump_active) &&
572 (i < fdm_ptr->registered_regions))
573 dumped_bytes = fdm_ptr->rgn[i].size;
574
575 seq_printf(m, "DUMP: Src: %#016llx, Dest: %#016llx, ",
576 fdm_ptr->rgn[i].src, fdm_ptr->rgn[i].dest);
577 seq_printf(m, "Size: %#llx, Dumped: %#llx bytes\n",
578 fdm_ptr->rgn[i].size, dumped_bytes);
579 }
580
581 /* Dump is active. Show reserved area start address. */
582 if (fadump_conf->dump_active) {
583 seq_printf(m, "\nMemory above %#016lx is reserved for saving crash dump\n",
584 fadump_conf->reserve_dump_area_start);
585 }
586}
587
588static void opal_fadump_trigger(struct fadump_crash_info_header *fdh,
589 const char *msg)
590{
591 int rc;
592
593 /*
594	 * Unlike on the pSeries platform, the logical CPU number is not
595	 * provided with architected register state data. So, store the
596	 * crashing CPU's PIR instead to plug the appropriate register data
597	 * for the crashing CPU in the vmcore file.
598 */
599 fdh->crashing_cpu = (u32)mfspr(SPRN_PIR);
600
601 rc = opal_cec_reboot2(OPAL_REBOOT_MPIPL, msg);
602 if (rc == OPAL_UNSUPPORTED) {
603 pr_emerg("Reboot type %d not supported.\n",
604 OPAL_REBOOT_MPIPL);
605 } else if (rc == OPAL_HARDWARE)
606 pr_emerg("No backend support for MPIPL!\n");
607}
608
609static struct fadump_ops opal_fadump_ops = {
610 .fadump_init_mem_struct = opal_fadump_init_mem_struct,
611 .fadump_get_metadata_size = opal_fadump_get_metadata_size,
612 .fadump_setup_metadata = opal_fadump_setup_metadata,
613 .fadump_get_bootmem_min = opal_fadump_get_bootmem_min,
614 .fadump_register = opal_fadump_register,
615 .fadump_unregister = opal_fadump_unregister,
616 .fadump_invalidate = opal_fadump_invalidate,
617 .fadump_cleanup = opal_fadump_cleanup,
618 .fadump_process = opal_fadump_process,
619 .fadump_region_show = opal_fadump_region_show,
620 .fadump_trigger = opal_fadump_trigger,
621};
622
623void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node)
624{
625 const __be32 *prop;
626 unsigned long dn;
627 u64 addr = 0;
628 int i, len;
629 s64 ret;
630
631 /*
632	 * Check if Firmware-Assisted Dump is supported. If yes, check
633 * if dump has been initiated on last reboot.
634 */
635 dn = of_get_flat_dt_subnode_by_name(node, "dump");
636 if (dn == -FDT_ERR_NOTFOUND) {
637 pr_debug("FADump support is missing!\n");
638 return;
639 }
640
641 if (!of_flat_dt_is_compatible(dn, "ibm,opal-dump")) {
642 pr_err("Support missing for this f/w version!\n");
643 return;
644 }
645
646 prop = of_get_flat_dt_prop(dn, "fw-load-area", &len);
647 if (prop) {
648 /*
649 * Each f/w load area is an (address,size) pair,
650 * 2 cells each, totalling 4 cells per range.
651 */
652 for (i = 0; i < len / (sizeof(*prop) * 4); i++) {
653 u64 base, end;
654
655 base = of_read_number(prop + (i * 4) + 0, 2);
656 end = base;
657 end += of_read_number(prop + (i * 4) + 2, 2);
658 if (end > OPAL_FADUMP_MIN_BOOT_MEM) {
659 pr_err("F/W load area: 0x%llx-0x%llx\n",
660 base, end);
661 pr_err("F/W version not supported!\n");
662 return;
663 }
664 }
665 }
666
667 fadump_conf->ops = &opal_fadump_ops;
668 fadump_conf->fadump_supported = 1;
669
670 /*
671 * Firmware supports 32-bit field for size. Align it to PAGE_SIZE
672 * and request firmware to copy multiple kernel boot memory regions.
673 */
674 fadump_conf->max_copy_size = _ALIGN_DOWN(U32_MAX, PAGE_SIZE);
675
676 /*
677 * Check if dump has been initiated on last reboot.
678 */
679 prop = of_get_flat_dt_prop(dn, "mpipl-boot", NULL);
680 if (!prop)
681 return;
682
683 ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_KERNEL, &addr);
684 if ((ret != OPAL_SUCCESS) || !addr) {
685 pr_err("Failed to get Kernel metadata (%lld)\n", ret);
686 return;
687 }
688
689 addr = be64_to_cpu(addr);
690 pr_debug("Kernel metadata addr: %llx\n", addr);
691
692 opal_fdm_active = __va(addr);
693 if (opal_fdm_active->version != OPAL_FADUMP_VERSION) {
694 pr_warn("Supported kernel metadata version: %u, found: %d!\n",
695 OPAL_FADUMP_VERSION, opal_fdm_active->version);
696		pr_warn("WARNING: Kernel metadata format mismatch identified! Core file may be corrupted.\n");
697 }
698
699 /* Kernel regions not registered with f/w for MPIPL */
700 if (opal_fdm_active->registered_regions == 0) {
701 opal_fdm_active = NULL;
702 return;
703 }
704
705 ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_CPU, &addr);
706 if (addr) {
707 addr = be64_to_cpu(addr);
708 pr_debug("CPU metadata addr: %llx\n", addr);
709 opal_cpu_metadata = __va(addr);
710 }
711
712 pr_info("Firmware-assisted dump is active.\n");
713 fadump_conf->dump_active = 1;
714 opal_fadump_get_config(fadump_conf, opal_fdm_active);
715}
716#endif /* !CONFIG_PRESERVE_FA_DUMP */
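
Illustrative sketch (not part of the patch): the boot_mem_top arithmetic performed by opal_fadump_get_config() above, run on two hypothetical boot memory regions. The region addresses and sizes are made up for the example.

    #include <stdio.h>

    int main(void)
    {
            /* Hypothetical regions: [0, 256MB) and [512MB, 768MB). */
            unsigned long base[] = { 0x00000000UL, 0x20000000UL };
            unsigned long size[] = { 0x10000000UL, 0x10000000UL };
            unsigned long last_end = 0, hole = 0, total = 0;
            int i;

            for (i = 0; i < 2; i++) {
                    total += size[i];            /* boot_memory_size */
                    hole += base[i] - last_end;  /* gap before this region */
                    last_end = base[i] + size[i];
            }
            /* Prints boot_mem_top = 0x30000000 (768MB). */
            printf("boot_mem_top = 0x%lx\n", total + hole);
            return 0;
    }
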
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.h b/arch/powerpc/platforms/powernv/opal-fadump.h
new file mode 100644
index 000000000000..f1e9ecf548c5
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-fadump.h
@@ -0,0 +1,146 @@
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * Firmware-Assisted Dump support on POWER platform (OPAL).
4 *
5 * Copyright 2019, Hari Bathini, IBM Corporation.
6 */
7
8#ifndef _POWERNV_OPAL_FADUMP_H
9#define _POWERNV_OPAL_FADUMP_H
10
11#include <asm/reg.h>
12
13/*
14 * With kernel & initrd loaded at 512MB (with 256MB size), enforce a minimum
15 * boot memory size of 768MB to ensure that the kernel and initrd loaded by
16 * f/w don't mess with the crashed kernel's memory during MPIPL.
17 */
18#define OPAL_FADUMP_MIN_BOOT_MEM (0x30000000UL)
19
20/*
21 * OPAL FADump metadata structure format version
22 *
23 * OPAL FADump kernel metadata structure stores kernel metadata needed to
24 * register-for/process crash dump. Format version is used to keep a tab on
25 * the changes in the structure format. The changes, if any, to the format
26 * are expected to be minimal and backward compatible.
27 */
28#define OPAL_FADUMP_VERSION 0x1
29
30/*
31 * OPAL FADump kernel metadata
32 *
33 * The address of this structure will be registered with f/w for retrieving
34 * and processing during crash dump.
35 */
36struct opal_fadump_mem_struct {
37 u8 version;
38 u8 reserved[3];
39 u16 region_cnt; /* number of regions */
40 u16 registered_regions; /* Regions registered for MPIPL */
41 u64 fadumphdr_addr;
42 struct opal_mpipl_region rgn[FADUMP_MAX_MEM_REGS];
43} __packed;
44
45/*
46 * CPU state data
47 *
48 * CPU state data information is provided by f/w. The format for this data
49 * is defined in the HDAT spec. Version is used to keep a tab on the changes
50 * in this CPU state data format. Changes to this format are unlikely, but
51 * if there are any changes, please refer to the latest HDAT specification.
52 */
53#define HDAT_FADUMP_CPU_DATA_VER 1
54
55#define HDAT_FADUMP_CORE_INACTIVE (0x0F)
56
57/* HDAT thread header for register entries */
58struct hdat_fadump_thread_hdr {
59 __be32 pir;
60 /* 0x00 - 0x0F - The corresponding stop state of the core */
61 u8 core_state;
62 u8 reserved[3];
63
64 __be32 offset; /* Offset to Register Entries array */
65 __be32 ecnt; /* Number of entries */
66 __be32 esize; /* Alloc size of each array entry in bytes */
67 __be32 eactsz; /* Actual size of each array entry in bytes */
68} __packed;
69
70/* Register types populated by f/w */
71#define HDAT_FADUMP_REG_TYPE_GPR 0x01
72#define HDAT_FADUMP_REG_TYPE_SPR 0x02
73
74/* ID numbers used by f/w while populating certain registers */
75#define HDAT_FADUMP_REG_ID_NIP 0x7D0
76#define HDAT_FADUMP_REG_ID_MSR 0x7D1
77#define HDAT_FADUMP_REG_ID_CCR 0x7D2
78
79/* HDAT register entry. */
80struct hdat_fadump_reg_entry {
81 __be32 reg_type;
82 __be32 reg_num;
83 __be64 reg_val;
84} __packed;
85
86static inline void opal_fadump_set_regval_regnum(struct pt_regs *regs,
87 u32 reg_type, u32 reg_num,
88 u64 reg_val)
89{
90 if (reg_type == HDAT_FADUMP_REG_TYPE_GPR) {
91 if (reg_num < 32)
92 regs->gpr[reg_num] = reg_val;
93 return;
94 }
95
96 switch (reg_num) {
97 case SPRN_CTR:
98 regs->ctr = reg_val;
99 break;
100 case SPRN_LR:
101 regs->link = reg_val;
102 break;
103 case SPRN_XER:
104 regs->xer = reg_val;
105 break;
106 case SPRN_DAR:
107 regs->dar = reg_val;
108 break;
109 case SPRN_DSISR:
110 regs->dsisr = reg_val;
111 break;
112 case HDAT_FADUMP_REG_ID_NIP:
113 regs->nip = reg_val;
114 break;
115 case HDAT_FADUMP_REG_ID_MSR:
116 regs->msr = reg_val;
117 break;
118 case HDAT_FADUMP_REG_ID_CCR:
119 regs->ccr = reg_val;
120 break;
121 }
122}
123
124static inline void opal_fadump_read_regs(char *bufp, unsigned int regs_cnt,
125 unsigned int reg_entry_size,
126 bool cpu_endian,
127 struct pt_regs *regs)
128{
129 struct hdat_fadump_reg_entry *reg_entry;
130 u64 val;
131 int i;
132
133 memset(regs, 0, sizeof(struct pt_regs));
134
135 for (i = 0; i < regs_cnt; i++, bufp += reg_entry_size) {
136 reg_entry = (struct hdat_fadump_reg_entry *)bufp;
137 val = (cpu_endian ? be64_to_cpu(reg_entry->reg_val) :
138 reg_entry->reg_val);
139 opal_fadump_set_regval_regnum(regs,
140 be32_to_cpu(reg_entry->reg_type),
141 be32_to_cpu(reg_entry->reg_num),
142 val);
143 }
144}
145
146#endif /* _POWERNV_OPAL_FADUMP_H */
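
Illustrative sketch (not part of the patch): decoding one 16-byte register entry laid out as struct hdat_fadump_reg_entry above, i.e. big-endian type, number and value as provided by f/w. The entry bytes are invented for the example; type 2 is HDAT_FADUMP_REG_TYPE_SPR and id 0x7D0 is HDAT_FADUMP_REG_ID_NIP.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <arpa/inet.h>   /* ntohl() for the 32-bit big-endian fields */

    static uint64_t be64(const unsigned char *p)
    {
            uint64_t v = 0;
            for (int i = 0; i < 8; i++)
                    v = (v << 8) | p[i];
            return v;
    }

    int main(void)
    {
            /* Entry: type=SPR(2), num=0x7D0 (NIP id), value=0xc000000000001234. */
            unsigned char entry[16] = {
                    0x00, 0x00, 0x00, 0x02,  0x00, 0x00, 0x07, 0xd0,
                    0xc0, 0x00, 0x00, 0x00,  0x00, 0x00, 0x12, 0x34,
            };
            uint32_t type, num;

            memcpy(&type, entry, 4);
            memcpy(&num, entry + 4, 4);
            printf("type=%u num=0x%x val=0x%llx\n",
                   ntohl(type), ntohl(num),
                   (unsigned long long)be64(entry + 8));
            return 0;
    }
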
diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c
index 186109bdd41b..e04b20625cb9 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -53,9 +53,9 @@ static void export_imc_mode_and_cmd(struct device_node *node,
53 struct imc_pmu *pmu_ptr) 53 struct imc_pmu *pmu_ptr)
54{ 54{
55 static u64 loc, *imc_mode_addr, *imc_cmd_addr; 55 static u64 loc, *imc_mode_addr, *imc_cmd_addr;
56 int chip = 0, nid;
57 char mode[16], cmd[16]; 56 char mode[16], cmd[16];
58 u32 cb_offset; 57 u32 cb_offset;
58 struct imc_mem_info *ptr = pmu_ptr->mem_info;
59 59
60 imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root); 60 imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root);
61 61
@@ -69,20 +69,20 @@ static void export_imc_mode_and_cmd(struct device_node *node,
69 if (of_property_read_u32(node, "cb_offset", &cb_offset)) 69 if (of_property_read_u32(node, "cb_offset", &cb_offset))
70 cb_offset = IMC_CNTL_BLK_OFFSET; 70 cb_offset = IMC_CNTL_BLK_OFFSET;
71 71
72 for_each_node(nid) { 72 while (ptr->vbase != NULL) {
73 loc = (u64)(pmu_ptr->mem_info[chip].vbase) + cb_offset; 73 loc = (u64)(ptr->vbase) + cb_offset;
74 imc_mode_addr = (u64 *)(loc + IMC_CNTL_BLK_MODE_OFFSET); 74 imc_mode_addr = (u64 *)(loc + IMC_CNTL_BLK_MODE_OFFSET);
75 sprintf(mode, "imc_mode_%d", nid); 75 sprintf(mode, "imc_mode_%d", (u32)(ptr->id));
76 if (!imc_debugfs_create_x64(mode, 0600, imc_debugfs_parent, 76 if (!imc_debugfs_create_x64(mode, 0600, imc_debugfs_parent,
77 imc_mode_addr)) 77 imc_mode_addr))
78 goto err; 78 goto err;
79 79
80 imc_cmd_addr = (u64 *)(loc + IMC_CNTL_BLK_CMD_OFFSET); 80 imc_cmd_addr = (u64 *)(loc + IMC_CNTL_BLK_CMD_OFFSET);
81 sprintf(cmd, "imc_cmd_%d", nid); 81 sprintf(cmd, "imc_cmd_%d", (u32)(ptr->id));
82 if (!imc_debugfs_create_x64(cmd, 0600, imc_debugfs_parent, 82 if (!imc_debugfs_create_x64(cmd, 0600, imc_debugfs_parent,
83 imc_cmd_addr)) 83 imc_cmd_addr))
84 goto err; 84 goto err;
85 chip++; 85 ptr++;
86 } 86 }
87 return; 87 return;
88 88
diff --git a/arch/powerpc/platforms/powernv/opal-msglog.c b/arch/powerpc/platforms/powernv/opal-msglog.c
index dc51d03c6370..d26da19a611f 100644
--- a/arch/powerpc/platforms/powernv/opal-msglog.c
+++ b/arch/powerpc/platforms/powernv/opal-msglog.c
@@ -29,23 +29,23 @@ struct memcons {
29 29
30static struct memcons *opal_memcons = NULL; 30static struct memcons *opal_memcons = NULL;
31 31
32ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count) 32ssize_t memcons_copy(struct memcons *mc, char *to, loff_t pos, size_t count)
33{ 33{
34 const char *conbuf; 34 const char *conbuf;
35 ssize_t ret; 35 ssize_t ret;
36 size_t first_read = 0; 36 size_t first_read = 0;
37 uint32_t out_pos, avail; 37 uint32_t out_pos, avail;
38 38
39 if (!opal_memcons) 39 if (!mc)
40 return -ENODEV; 40 return -ENODEV;
41 41
42 out_pos = be32_to_cpu(READ_ONCE(opal_memcons->out_pos)); 42 out_pos = be32_to_cpu(READ_ONCE(mc->out_pos));
43 43
44 /* Now we've read out_pos, put a barrier in before reading the new 44 /* Now we've read out_pos, put a barrier in before reading the new
45 * data it points to in conbuf. */ 45 * data it points to in conbuf. */
46 smp_rmb(); 46 smp_rmb();
47 47
48 conbuf = phys_to_virt(be64_to_cpu(opal_memcons->obuf_phys)); 48 conbuf = phys_to_virt(be64_to_cpu(mc->obuf_phys));
49 49
50 /* When the buffer has wrapped, read from the out_pos marker to the end 50 /* When the buffer has wrapped, read from the out_pos marker to the end
51 * of the buffer, and then read the remaining data as in the un-wrapped 51 * of the buffer, and then read the remaining data as in the un-wrapped
@@ -53,7 +53,7 @@ ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count)
53 if (out_pos & MEMCONS_OUT_POS_WRAP) { 53 if (out_pos & MEMCONS_OUT_POS_WRAP) {
54 54
55 out_pos &= MEMCONS_OUT_POS_MASK; 55 out_pos &= MEMCONS_OUT_POS_MASK;
56 avail = be32_to_cpu(opal_memcons->obuf_size) - out_pos; 56 avail = be32_to_cpu(mc->obuf_size) - out_pos;
57 57
58 ret = memory_read_from_buffer(to, count, &pos, 58 ret = memory_read_from_buffer(to, count, &pos,
59 conbuf + out_pos, avail); 59 conbuf + out_pos, avail);
@@ -71,7 +71,7 @@ ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count)
71 } 71 }
72 72
73 /* Sanity check. The firmware should not do this to us. */ 73 /* Sanity check. The firmware should not do this to us. */
74 if (out_pos > be32_to_cpu(opal_memcons->obuf_size)) { 74 if (out_pos > be32_to_cpu(mc->obuf_size)) {
75 pr_err("OPAL: memory console corruption. Aborting read.\n"); 75 pr_err("OPAL: memory console corruption. Aborting read.\n");
76 return -EINVAL; 76 return -EINVAL;
77 } 77 }
@@ -86,6 +86,11 @@ out:
86 return ret; 86 return ret;
87} 87}
88 88
89ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count)
90{
91 return memcons_copy(opal_memcons, to, pos, count);
92}
93
89static ssize_t opal_msglog_read(struct file *file, struct kobject *kobj, 94static ssize_t opal_msglog_read(struct file *file, struct kobject *kobj,
90 struct bin_attribute *bin_attr, char *to, 95 struct bin_attribute *bin_attr, char *to,
91 loff_t pos, size_t count) 96 loff_t pos, size_t count)
@@ -98,32 +103,48 @@ static struct bin_attribute opal_msglog_attr = {
98 .read = opal_msglog_read 103 .read = opal_msglog_read
99}; 104};
100 105
101void __init opal_msglog_init(void) 106struct memcons *memcons_init(struct device_node *node, const char *mc_prop_name)
102{ 107{
103 u64 mcaddr; 108 u64 mcaddr;
104 struct memcons *mc; 109 struct memcons *mc;
105 110
106 if (of_property_read_u64(opal_node, "ibm,opal-memcons", &mcaddr)) { 111 if (of_property_read_u64(node, mc_prop_name, &mcaddr)) {
107 pr_warn("OPAL: Property ibm,opal-memcons not found, no message log\n"); 112 pr_warn("%s property not found, no message log\n",
108 return; 113 mc_prop_name);
114 goto out_err;
109 } 115 }
110 116
111 mc = phys_to_virt(mcaddr); 117 mc = phys_to_virt(mcaddr);
112 if (!mc) { 118 if (!mc) {
113 pr_warn("OPAL: memory console address is invalid\n"); 119 pr_warn("memory console address is invalid\n");
114 return; 120 goto out_err;
115 } 121 }
116 122
117 if (be64_to_cpu(mc->magic) != MEMCONS_MAGIC) { 123 if (be64_to_cpu(mc->magic) != MEMCONS_MAGIC) {
118 pr_warn("OPAL: memory console version is invalid\n"); 124 pr_warn("memory console version is invalid\n");
119 return; 125 goto out_err;
120 } 126 }
121 127
122 /* Report maximum size */ 128 return mc;
123 opal_msglog_attr.size = be32_to_cpu(mc->ibuf_size) + 129
124 be32_to_cpu(mc->obuf_size); 130out_err:
131 return NULL;
132}
133
134u32 memcons_get_size(struct memcons *mc)
135{
136 return be32_to_cpu(mc->ibuf_size) + be32_to_cpu(mc->obuf_size);
137}
138
139void __init opal_msglog_init(void)
140{
141 opal_memcons = memcons_init(opal_node, "ibm,opal-memcons");
142 if (!opal_memcons) {
143 pr_warn("OPAL: memcons failed to load from ibm,opal-memcons\n");
144 return;
145 }
125 146
126 opal_memcons = mc; 147 opal_msglog_attr.size = memcons_get_size(opal_memcons);
127} 148}
128 149
129void __init opal_msglog_sysfs_init(void) 150void __init opal_msglog_sysfs_init(void)
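
Illustrative sketch (not part of the patch): the wrap handling performed by memcons_copy() above, reduced to a plain ring-buffer copy. The wrapped flag and buffer layout are simplified assumptions; the real code also applies the caller's pos/count window and sanity-checks out_pos.

    #include <stddef.h>
    #include <string.h>

    /* When the buffer has wrapped, copy from out_pos to the end first,
     * then from the start of the buffer up to out_pos. */
    size_t ring_copy(char *dst, const char *buf, size_t buf_size,
                     size_t out_pos, int wrapped)
    {
            size_t n = 0;

            if (wrapped) {
                    memcpy(dst, buf + out_pos, buf_size - out_pos);
                    n = buf_size - out_pos;
            }
            memcpy(dst + n, buf, out_pos);
            return n + out_pos;
    }
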
diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c
index e072bf157d62..45f4223a790f 100644
--- a/arch/powerpc/platforms/powernv/opal-prd.c
+++ b/arch/powerpc/platforms/powernv/opal-prd.c
@@ -342,7 +342,7 @@ static int opal_prd_msg_notifier(struct notifier_block *nb,
342 int msg_size, item_size; 342 int msg_size, item_size;
343 unsigned long flags; 343 unsigned long flags;
344 344
345 if (msg_type != OPAL_MSG_PRD) 345 if (msg_type != OPAL_MSG_PRD && msg_type != OPAL_MSG_PRD2)
346 return 0; 346 return 0;
347 347
348 /* Calculate total size of the message and item we need to store. The 348 /* Calculate total size of the message and item we need to store. The
@@ -393,6 +393,12 @@ static int opal_prd_probe(struct platform_device *pdev)
393 return rc; 393 return rc;
394 } 394 }
395 395
396 rc = opal_message_notifier_register(OPAL_MSG_PRD2, &opal_prd_event_nb);
397 if (rc) {
398 pr_err("Couldn't register PRD2 event notifier\n");
399 return rc;
400 }
401
396 rc = misc_register(&opal_prd_dev); 402 rc = misc_register(&opal_prd_dev);
397 if (rc) { 403 if (rc) {
398 pr_err("failed to register miscdev\n"); 404 pr_err("failed to register miscdev\n");
diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c
index 66430eebe869..fd510d961b8c 100644
--- a/arch/powerpc/platforms/powernv/opal-xscom.c
+++ b/arch/powerpc/platforms/powernv/opal-xscom.c
@@ -1,7 +1,10 @@
1// SPDX-License-Identifier: GPL-2.0-or-later 1// SPDX-License-Identifier: GPL-2.0-or-later
2/* 2/*
3 * PowerNV LPC bus handling. 3 * PowerNV SCOM bus debugfs interface
4 * 4 *
5 * Copyright 2010 Benjamin Herrenschmidt, IBM Corp
6 * <benh@kernel.crashing.org>
7 * and David Gibson, IBM Corporation.
5 * Copyright 2013 IBM Corp. 8 * Copyright 2013 IBM Corp.
6 */ 9 */
7 10
@@ -10,62 +13,13 @@
10#include <linux/bug.h> 13#include <linux/bug.h>
11#include <linux/gfp.h> 14#include <linux/gfp.h>
12#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/uaccess.h>
13 17
14#include <asm/machdep.h> 18#include <asm/machdep.h>
15#include <asm/firmware.h> 19#include <asm/firmware.h>
16#include <asm/opal.h> 20#include <asm/opal.h>
17#include <asm/scom.h> 21#include <asm/debugfs.h>
18 22#include <asm/prom.h>
19/*
20 * We could probably fit that inside the scom_map_t
21 * which is a void* after all but it's really too ugly
22 * so let's kmalloc it for now
23 */
24struct opal_scom_map {
25 uint32_t chip;
26 uint64_t addr;
27};
28
29static scom_map_t opal_scom_map(struct device_node *dev, u64 reg, u64 count)
30{
31 struct opal_scom_map *m;
32 const __be32 *gcid;
33
34 if (!of_get_property(dev, "scom-controller", NULL)) {
35 pr_err("%s: device %pOF is not a SCOM controller\n",
36 __func__, dev);
37 return SCOM_MAP_INVALID;
38 }
39 gcid = of_get_property(dev, "ibm,chip-id", NULL);
40 if (!gcid) {
41 pr_err("%s: device %pOF has no ibm,chip-id\n",
42 __func__, dev);
43 return SCOM_MAP_INVALID;
44 }
45 m = kmalloc(sizeof(*m), GFP_KERNEL);
46 if (!m)
47 return NULL;
48 m->chip = be32_to_cpup(gcid);
49 m->addr = reg;
50
51 return (scom_map_t)m;
52}
53
54static void opal_scom_unmap(scom_map_t map)
55{
56 kfree(map);
57}
58
59static int opal_xscom_err_xlate(int64_t rc)
60{
61 switch(rc) {
62 case 0:
63 return 0;
64 /* Add more translations if necessary */
65 default:
66 return -EIO;
67 }
68}
69 23
70static u64 opal_scom_unmangle(u64 addr) 24static u64 opal_scom_unmangle(u64 addr)
71{ 25{
@@ -98,39 +52,154 @@ static u64 opal_scom_unmangle(u64 addr)
98 return addr; 52 return addr;
99} 53}
100 54
101static int opal_scom_read(scom_map_t map, u64 reg, u64 *value) 55static int opal_scom_read(uint32_t chip, uint64_t addr, u64 reg, u64 *value)
102{ 56{
103 struct opal_scom_map *m = map;
104 int64_t rc; 57 int64_t rc;
105 __be64 v; 58 __be64 v;
106 59
107 reg = opal_scom_unmangle(m->addr + reg); 60 reg = opal_scom_unmangle(addr + reg);
108 rc = opal_xscom_read(m->chip, reg, (__be64 *)__pa(&v)); 61 rc = opal_xscom_read(chip, reg, (__be64 *)__pa(&v));
62 if (rc) {
63 *value = 0xfffffffffffffffful;
64 return -EIO;
65 }
109 *value = be64_to_cpu(v); 66 *value = be64_to_cpu(v);
110 return opal_xscom_err_xlate(rc); 67 return 0;
111} 68}
112 69
113static int opal_scom_write(scom_map_t map, u64 reg, u64 value) 70static int opal_scom_write(uint32_t chip, uint64_t addr, u64 reg, u64 value)
114{ 71{
115 struct opal_scom_map *m = map;
116 int64_t rc; 72 int64_t rc;
117 73
118 reg = opal_scom_unmangle(m->addr + reg); 74 reg = opal_scom_unmangle(addr + reg);
119 rc = opal_xscom_write(m->chip, reg, value); 75 rc = opal_xscom_write(chip, reg, value);
120 return opal_xscom_err_xlate(rc); 76 if (rc)
77 return -EIO;
78 return 0;
79}
80
81struct scom_debug_entry {
82 u32 chip;
83 struct debugfs_blob_wrapper path;
84 char name[16];
85};
86
87static ssize_t scom_debug_read(struct file *filp, char __user *ubuf,
88 size_t count, loff_t *ppos)
89{
90 struct scom_debug_entry *ent = filp->private_data;
91 u64 __user *ubuf64 = (u64 __user *)ubuf;
92 loff_t off = *ppos;
93 ssize_t done = 0;
94 u64 reg, reg_base, reg_cnt, val;
95 int rc;
96
97 if (off < 0 || (off & 7) || (count & 7))
98 return -EINVAL;
99 reg_base = off >> 3;
100 reg_cnt = count >> 3;
101
102 for (reg = 0; reg < reg_cnt; reg++) {
103 rc = opal_scom_read(ent->chip, reg_base, reg, &val);
104 if (!rc)
105 rc = put_user(val, ubuf64);
106 if (rc) {
107 if (!done)
108 done = rc;
109 break;
110 }
111 ubuf64++;
112 *ppos += 8;
113 done += 8;
114 }
115 return done;
116}
117
118static ssize_t scom_debug_write(struct file *filp, const char __user *ubuf,
119 size_t count, loff_t *ppos)
120{
121 struct scom_debug_entry *ent = filp->private_data;
122 u64 __user *ubuf64 = (u64 __user *)ubuf;
123 loff_t off = *ppos;
124 ssize_t done = 0;
125 u64 reg, reg_base, reg_cnt, val;
126 int rc;
127
128 if (off < 0 || (off & 7) || (count & 7))
129 return -EINVAL;
130 reg_base = off >> 3;
131 reg_cnt = count >> 3;
132
133 for (reg = 0; reg < reg_cnt; reg++) {
134 rc = get_user(val, ubuf64);
135 if (!rc)
136 rc = opal_scom_write(ent->chip, reg_base, reg, val);
137 if (rc) {
138 if (!done)
139 done = rc;
140 break;
141 }
142 ubuf64++;
143 done += 8;
144 }
145 return done;
121} 146}
122 147
123static const struct scom_controller opal_scom_controller = { 148static const struct file_operations scom_debug_fops = {
124 .map = opal_scom_map, 149 .read = scom_debug_read,
125 .unmap = opal_scom_unmap, 150 .write = scom_debug_write,
126 .read = opal_scom_read, 151 .open = simple_open,
127 .write = opal_scom_write 152 .llseek = default_llseek,
128}; 153};
129 154
130static int opal_xscom_init(void) 155static int scom_debug_init_one(struct dentry *root, struct device_node *dn,
156 int chip)
131{ 157{
132 if (firmware_has_feature(FW_FEATURE_OPAL)) 158 struct scom_debug_entry *ent;
133 scom_init(&opal_scom_controller); 159 struct dentry *dir;
160
161 ent = kzalloc(sizeof(*ent), GFP_KERNEL);
162 if (!ent)
163 return -ENOMEM;
164
165 ent->chip = chip;
166 snprintf(ent->name, 16, "%08x", chip);
167 ent->path.data = (void *)kasprintf(GFP_KERNEL, "%pOF", dn);
168 ent->path.size = strlen((char *)ent->path.data);
169
170 dir = debugfs_create_dir(ent->name, root);
171 if (!dir) {
172 kfree(ent->path.data);
173 kfree(ent);
174 return -1;
175 }
176
177 debugfs_create_blob("devspec", 0400, dir, &ent->path);
178 debugfs_create_file("access", 0600, dir, ent, &scom_debug_fops);
179
134 return 0; 180 return 0;
135} 181}
136machine_arch_initcall(powernv, opal_xscom_init); 182
183static int scom_debug_init(void)
184{
185 struct device_node *dn;
186 struct dentry *root;
187 int chip, rc;
188
189 if (!firmware_has_feature(FW_FEATURE_OPAL))
190 return 0;
191
192 root = debugfs_create_dir("scom", powerpc_debugfs_root);
193 if (!root)
194 return -1;
195
196 rc = 0;
197 for_each_node_with_property(dn, "scom-controller") {
198 chip = of_get_ibm_chip_id(dn);
199 WARN_ON(chip == -1);
200 rc |= scom_debug_init_one(root, dn, chip);
201 }
202
203 return rc;
204}
205device_initcall(scom_debug_init);
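
Illustrative sketch (not part of the patch): reading one SCOM register through the debugfs "access" file created by scom_debug_init_one() above. The path under /sys/kernel/debug/powerpc/scom/ and the chip id are assumptions; the offset rules (8-byte aligned transfers, reg = offset >> 3) follow from scom_debug_read().

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            /* Hypothetical chip id and SCOM address. */
            const char *path = "/sys/kernel/debug/powerpc/scom/00000000/access";
            uint64_t reg = 0xf000f, val;
            int fd = open(path, O_RDONLY);

            if (fd < 0)
                    return 1;
            /* reg = offset >> 3, so seek to reg << 3 and read 8 bytes. */
            if (pread(fd, &val, sizeof(val), (off_t)(reg << 3)) != sizeof(val))
                    return 1;
            printf("scom 0x%llx = 0x%016llx\n",
                   (unsigned long long)reg, (unsigned long long)val);
            close(fd);
            return 0;
    }
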
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index aba443be7daa..38e90270280b 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -58,6 +58,8 @@ static DEFINE_SPINLOCK(opal_write_lock);
58static struct atomic_notifier_head opal_msg_notifier_head[OPAL_MSG_TYPE_MAX]; 58static struct atomic_notifier_head opal_msg_notifier_head[OPAL_MSG_TYPE_MAX];
59static uint32_t opal_heartbeat; 59static uint32_t opal_heartbeat;
60static struct task_struct *kopald_tsk; 60static struct task_struct *kopald_tsk;
61static struct opal_msg *opal_msg;
62static u32 opal_msg_size __ro_after_init;
61 63
62void opal_configure_cores(void) 64void opal_configure_cores(void)
63{ 65{
@@ -271,14 +273,9 @@ static void opal_message_do_notify(uint32_t msg_type, void *msg)
271static void opal_handle_message(void) 273static void opal_handle_message(void)
272{ 274{
273 s64 ret; 275 s64 ret;
274 /*
275 * TODO: pre-allocate a message buffer depending on opal-msg-size
276 * value in /proc/device-tree.
277 */
278 static struct opal_msg msg;
279 u32 type; 276 u32 type;
280 277
281 ret = opal_get_msg(__pa(&msg), sizeof(msg)); 278 ret = opal_get_msg(__pa(opal_msg), opal_msg_size);
282 /* No opal message pending. */ 279 /* No opal message pending. */
283 if (ret == OPAL_RESOURCE) 280 if (ret == OPAL_RESOURCE)
284 return; 281 return;
@@ -290,14 +287,14 @@ static void opal_handle_message(void)
290 return; 287 return;
291 } 288 }
292 289
293 type = be32_to_cpu(msg.msg_type); 290 type = be32_to_cpu(opal_msg->msg_type);
294 291
295 /* Sanity check */ 292 /* Sanity check */
296 if (type >= OPAL_MSG_TYPE_MAX) { 293 if (type >= OPAL_MSG_TYPE_MAX) {
297 pr_warn_once("%s: Unknown message type: %u\n", __func__, type); 294 pr_warn_once("%s: Unknown message type: %u\n", __func__, type);
298 return; 295 return;
299 } 296 }
300 opal_message_do_notify(type, (void *)&msg); 297 opal_message_do_notify(type, (void *)opal_msg);
301} 298}
302 299
303static irqreturn_t opal_message_notify(int irq, void *data) 300static irqreturn_t opal_message_notify(int irq, void *data)
@@ -306,10 +303,24 @@ static irqreturn_t opal_message_notify(int irq, void *data)
306 return IRQ_HANDLED; 303 return IRQ_HANDLED;
307} 304}
308 305
309static int __init opal_message_init(void) 306static int __init opal_message_init(struct device_node *opal_node)
310{ 307{
311 int ret, i, irq; 308 int ret, i, irq;
312 309
310 ret = of_property_read_u32(opal_node, "opal-msg-size", &opal_msg_size);
311 if (ret) {
312 pr_notice("Failed to read opal-msg-size property\n");
313 opal_msg_size = sizeof(struct opal_msg);
314 }
315
316 opal_msg = kmalloc(opal_msg_size, GFP_KERNEL);
317 if (!opal_msg) {
318 opal_msg_size = sizeof(struct opal_msg);
319 /* Try to allocate fixed message size */
320 opal_msg = kmalloc(opal_msg_size, GFP_KERNEL);
321 BUG_ON(opal_msg == NULL);
322 }
323
313 for (i = 0; i < OPAL_MSG_TYPE_MAX; i++) 324 for (i = 0; i < OPAL_MSG_TYPE_MAX; i++)
314 ATOMIC_INIT_NOTIFIER_HEAD(&opal_msg_notifier_head[i]); 325 ATOMIC_INIT_NOTIFIER_HEAD(&opal_msg_notifier_head[i]);
315 326
@@ -705,7 +716,10 @@ static ssize_t symbol_map_read(struct file *fp, struct kobject *kobj,
705 bin_attr->size); 716 bin_attr->size);
706} 717}
707 718
708static BIN_ATTR_RO(symbol_map, 0); 719static struct bin_attribute symbol_map_attr = {
720 .attr = {.name = "symbol_map", .mode = 0400},
721 .read = symbol_map_read
722};
709 723
710static void opal_export_symmap(void) 724static void opal_export_symmap(void)
711{ 725{
@@ -722,10 +736,10 @@ static void opal_export_symmap(void)
722 return; 736 return;
723 737
724 /* Setup attributes */ 738 /* Setup attributes */
725 bin_attr_symbol_map.private = __va(be64_to_cpu(syms[0])); 739 symbol_map_attr.private = __va(be64_to_cpu(syms[0]));
726 bin_attr_symbol_map.size = be64_to_cpu(syms[1]); 740 symbol_map_attr.size = be64_to_cpu(syms[1]);
727 741
728 rc = sysfs_create_bin_file(opal_kobj, &bin_attr_symbol_map); 742 rc = sysfs_create_bin_file(opal_kobj, &symbol_map_attr);
729 if (rc) 743 if (rc)
730 pr_warn("Error %d creating OPAL symbols file\n", rc); 744 pr_warn("Error %d creating OPAL symbols file\n", rc);
731} 745}
@@ -910,7 +924,7 @@ static int __init opal_init(void)
910 } 924 }
911 925
912 /* Initialise OPAL messaging system */ 926 /* Initialise OPAL messaging system */
913 opal_message_init(); 927 opal_message_init(opal_node);
914 928
915 /* Initialise OPAL asynchronous completion interface */ 929 /* Initialise OPAL asynchronous completion interface */
916 opal_async_comp_init(); 930 opal_async_comp_init();
diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
index e28f03e1eb5e..a0b9c0c23ed2 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
@@ -36,7 +36,8 @@ static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift)
36 struct page *tce_mem = NULL; 36 struct page *tce_mem = NULL;
37 __be64 *addr; 37 __be64 *addr;
38 38
39 tce_mem = alloc_pages_node(nid, GFP_KERNEL, shift - PAGE_SHIFT); 39 tce_mem = alloc_pages_node(nid, GFP_ATOMIC | __GFP_NOWARN,
40 shift - PAGE_SHIFT);
40 if (!tce_mem) { 41 if (!tce_mem) {
41 pr_err("Failed to allocate a TCE memory, level shift=%d\n", 42 pr_err("Failed to allocate a TCE memory, level shift=%d\n",
42 shift); 43 shift);
@@ -48,6 +49,9 @@ static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift)
48 return addr; 49 return addr;
49} 50}
50 51
52static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr,
53 unsigned long size, unsigned int levels);
54
51static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc) 55static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc)
52{ 56{
53 __be64 *tmp = user ? tbl->it_userspace : (__be64 *) tbl->it_base; 57 __be64 *tmp = user ? tbl->it_userspace : (__be64 *) tbl->it_base;
@@ -57,9 +61,9 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc)
57 61
58 while (level) { 62 while (level) {
59 int n = (idx & mask) >> (level * shift); 63 int n = (idx & mask) >> (level * shift);
60 unsigned long tce; 64 unsigned long oldtce, tce = be64_to_cpu(READ_ONCE(tmp[n]));
61 65
62 if (tmp[n] == 0) { 66 if (!tce) {
63 __be64 *tmp2; 67 __be64 *tmp2;
64 68
65 if (!alloc) 69 if (!alloc)
@@ -70,10 +74,15 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc)
70 if (!tmp2) 74 if (!tmp2)
71 return NULL; 75 return NULL;
72 76
73 tmp[n] = cpu_to_be64(__pa(tmp2) | 77 tce = __pa(tmp2) | TCE_PCI_READ | TCE_PCI_WRITE;
74 TCE_PCI_READ | TCE_PCI_WRITE); 78 oldtce = be64_to_cpu(cmpxchg(&tmp[n], 0,
79 cpu_to_be64(tce)));
80 if (oldtce) {
81 pnv_pci_ioda2_table_do_free_pages(tmp2,
82 ilog2(tbl->it_level_size) + 3, 1);
83 tce = oldtce;
84 }
75 } 85 }
76 tce = be64_to_cpu(tmp[n]);
77 86
78 tmp = __va(tce & ~(TCE_PCI_READ | TCE_PCI_WRITE)); 87 tmp = __va(tce & ~(TCE_PCI_READ | TCE_PCI_WRITE));
79 idx &= ~mask; 88 idx &= ~mask;
@@ -161,6 +170,9 @@ void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
161 170
162 if (ptce) 171 if (ptce)
163 *ptce = cpu_to_be64(0); 172 *ptce = cpu_to_be64(0);
173 else
174 /* Skip the rest of the level */
175 i |= tbl->it_level_size - 1;
164 } 176 }
165} 177}
166 178
@@ -260,7 +272,6 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
260 unsigned int table_shift = max_t(unsigned int, entries_shift + 3, 272 unsigned int table_shift = max_t(unsigned int, entries_shift + 3,
261 PAGE_SHIFT); 273 PAGE_SHIFT);
262 const unsigned long tce_table_size = 1UL << table_shift; 274 const unsigned long tce_table_size = 1UL << table_shift;
263 unsigned int tmplevels = levels;
264 275
265 if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS)) 276 if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS))
266 return -EINVAL; 277 return -EINVAL;
@@ -268,9 +279,6 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
268 if (!is_power_of_2(window_size)) 279 if (!is_power_of_2(window_size))
269 return -EINVAL; 280 return -EINVAL;
270 281
271 if (alloc_userspace_copy && (window_size > (1ULL << 32)))
272 tmplevels = 1;
273
274 /* Adjust direct table size from window_size and levels */ 282 /* Adjust direct table size from window_size and levels */
275 entries_shift = (entries_shift + levels - 1) / levels; 283 entries_shift = (entries_shift + levels - 1) / levels;
276 level_shift = entries_shift + 3; 284 level_shift = entries_shift + 3;
@@ -281,7 +289,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
281 289
282 /* Allocate TCE table */ 290 /* Allocate TCE table */
283 addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift, 291 addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
284 tmplevels, tce_table_size, &offset, &total_allocated); 292 1, tce_table_size, &offset, &total_allocated);
285 293
286 /* addr==NULL means that the first level allocation failed */ 294 /* addr==NULL means that the first level allocation failed */
287 if (!addr) 295 if (!addr)
@@ -292,18 +300,18 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
292 * we did not allocate as much as we wanted, 300 * we did not allocate as much as we wanted,
293 * release partially allocated table. 301 * release partially allocated table.
294 */ 302 */
295 if (tmplevels == levels && offset < tce_table_size) 303 if (levels == 1 && offset < tce_table_size)
296 goto free_tces_exit; 304 goto free_tces_exit;
297 305
298 /* Allocate userspace view of the TCE table */ 306 /* Allocate userspace view of the TCE table */
299 if (alloc_userspace_copy) { 307 if (alloc_userspace_copy) {
300 offset = 0; 308 offset = 0;
301 uas = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift, 309 uas = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
302 tmplevels, tce_table_size, &offset, 310 1, tce_table_size, &offset,
303 &total_allocated_uas); 311 &total_allocated_uas);
304 if (!uas) 312 if (!uas)
305 goto free_tces_exit; 313 goto free_tces_exit;
306 if (tmplevels == levels && (offset < tce_table_size || 314 if (levels == 1 && (offset < tce_table_size ||
307 total_allocated_uas != total_allocated)) 315 total_allocated_uas != total_allocated))
308 goto free_uas_exit; 316 goto free_uas_exit;
309 } 317 }
@@ -318,7 +326,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
318 326
319 pr_debug("Created TCE table: ws=%08llx ts=%lx @%08llx base=%lx uas=%p levels=%d/%d\n", 327 pr_debug("Created TCE table: ws=%08llx ts=%lx @%08llx base=%lx uas=%p levels=%d/%d\n",
320 window_size, tce_table_size, bus_offset, tbl->it_base, 328 window_size, tce_table_size, bus_offset, tbl->it_base,
321 tbl->it_userspace, tmplevels, levels); 329 tbl->it_userspace, 1, levels);
322 330
323 return 0; 331 return 0;
324 332
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index d8080558d020..c28d0d9b7ee0 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1939,26 +1939,12 @@ static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index,
1939} 1939}
1940 1940
1941#ifdef CONFIG_IOMMU_API 1941#ifdef CONFIG_IOMMU_API
1942static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index, 1942/* Common for IODA1 and IODA2 */
1943 unsigned long *hpa, enum dma_data_direction *direction) 1943static int pnv_ioda_tce_xchg_no_kill(struct iommu_table *tbl, long index,
1944 unsigned long *hpa, enum dma_data_direction *direction,
1945 bool realmode)
1944{ 1946{
1945 long ret = pnv_tce_xchg(tbl, index, hpa, direction, true); 1947 return pnv_tce_xchg(tbl, index, hpa, direction, !realmode);
1946
1947 if (!ret)
1948 pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, false);
1949
1950 return ret;
1951}
1952
1953static int pnv_ioda1_tce_xchg_rm(struct iommu_table *tbl, long index,
1954 unsigned long *hpa, enum dma_data_direction *direction)
1955{
1956 long ret = pnv_tce_xchg(tbl, index, hpa, direction, false);
1957
1958 if (!ret)
1959 pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, true);
1960
1961 return ret;
1962} 1948}
1963#endif 1949#endif
1964 1950
@@ -1973,8 +1959,8 @@ static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
1973static struct iommu_table_ops pnv_ioda1_iommu_ops = { 1959static struct iommu_table_ops pnv_ioda1_iommu_ops = {
1974 .set = pnv_ioda1_tce_build, 1960 .set = pnv_ioda1_tce_build,
1975#ifdef CONFIG_IOMMU_API 1961#ifdef CONFIG_IOMMU_API
1976 .exchange = pnv_ioda1_tce_xchg, 1962 .xchg_no_kill = pnv_ioda_tce_xchg_no_kill,
1977 .exchange_rm = pnv_ioda1_tce_xchg_rm, 1963 .tce_kill = pnv_pci_p7ioc_tce_invalidate,
1978 .useraddrptr = pnv_tce_useraddrptr, 1964 .useraddrptr = pnv_tce_useraddrptr,
1979#endif 1965#endif
1980 .clear = pnv_ioda1_tce_free, 1966 .clear = pnv_ioda1_tce_free,
@@ -2103,30 +2089,6 @@ static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
2103 return ret; 2089 return ret;
2104} 2090}
2105 2091
2106#ifdef CONFIG_IOMMU_API
2107static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index,
2108 unsigned long *hpa, enum dma_data_direction *direction)
2109{
2110 long ret = pnv_tce_xchg(tbl, index, hpa, direction, true);
2111
2112 if (!ret)
2113 pnv_pci_ioda2_tce_invalidate(tbl, index, 1, false);
2114
2115 return ret;
2116}
2117
2118static int pnv_ioda2_tce_xchg_rm(struct iommu_table *tbl, long index,
2119 unsigned long *hpa, enum dma_data_direction *direction)
2120{
2121 long ret = pnv_tce_xchg(tbl, index, hpa, direction, false);
2122
2123 if (!ret)
2124 pnv_pci_ioda2_tce_invalidate(tbl, index, 1, true);
2125
2126 return ret;
2127}
2128#endif
2129
2130static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index, 2092static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
2131 long npages) 2093 long npages)
2132{ 2094{
@@ -2138,8 +2100,8 @@ static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
2138static struct iommu_table_ops pnv_ioda2_iommu_ops = { 2100static struct iommu_table_ops pnv_ioda2_iommu_ops = {
2139 .set = pnv_ioda2_tce_build, 2101 .set = pnv_ioda2_tce_build,
2140#ifdef CONFIG_IOMMU_API 2102#ifdef CONFIG_IOMMU_API
2141 .exchange = pnv_ioda2_tce_xchg, 2103 .xchg_no_kill = pnv_ioda_tce_xchg_no_kill,
2142 .exchange_rm = pnv_ioda2_tce_xchg_rm, 2104 .tce_kill = pnv_pci_ioda2_tce_invalidate,
2143 .useraddrptr = pnv_tce_useraddrptr, 2105 .useraddrptr = pnv_tce_useraddrptr,
2144#endif 2106#endif
2145 .clear = pnv_ioda2_tce_free, 2107 .clear = pnv_ioda2_tce_free,
@@ -2303,7 +2265,7 @@ found:
2303 tbl->it_ops = &pnv_ioda1_iommu_ops; 2265 tbl->it_ops = &pnv_ioda1_iommu_ops;
2304 pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift; 2266 pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift;
2305 pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift; 2267 pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
2306 iommu_init_table(tbl, phb->hose->node); 2268 iommu_init_table(tbl, phb->hose->node, 0, 0);
2307 2269
2308 if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) 2270 if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
2309 pnv_ioda_setup_bus_dma(pe, pe->pbus); 2271 pnv_ioda_setup_bus_dma(pe, pe->pbus);
@@ -2420,6 +2382,7 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
2420{ 2382{
2421 struct iommu_table *tbl = NULL; 2383 struct iommu_table *tbl = NULL;
2422 long rc; 2384 long rc;
2385 unsigned long res_start, res_end;
2423 2386
2424 /* 2387 /*
2425 * crashkernel= specifies the kdump kernel's maximum memory at 2388 * crashkernel= specifies the kdump kernel's maximum memory at
@@ -2433,19 +2396,46 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
2433 * DMA window can be larger than available memory, which will 2396 * DMA window can be larger than available memory, which will
2434 * cause errors later. 2397 * cause errors later.
2435 */ 2398 */
2436 const u64 window_size = min((u64)pe->table_group.tce32_size, max_memory); 2399 const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_ORDER - 1);
2400
2401 /*
2402 * We create the default window as big as we can. The constraint is
2403 * the max order of allocation possible. The TCE table is likely to
2404 * end up being multilevel and with on-demand allocation in place,
2405 * the initial use is not going to be huge as the default window aims
2406 * to support crippled devices (i.e. not fully 64bit DMAble) only.
2407 */
2408 /* iommu_table::it_map uses 1 bit per IOMMU page, hence 8 */
2409 const u64 window_size = min((maxblock * 8) << PAGE_SHIFT, max_memory);
2410 /* Each TCE level cannot exceed maxblock so go multilevel if needed */
2411 unsigned long tces_order = ilog2(window_size >> PAGE_SHIFT);
2412 unsigned long tcelevel_order = ilog2(maxblock >> 3);
2413 unsigned int levels = tces_order / tcelevel_order;
2414
2415 if (tces_order % tcelevel_order)
2416 levels += 1;
2417 /*
2418 * We try to stick to default levels (which is >1 at the moment) in
2419 * order to save memory by relying on on-demain TCE level allocation.
2420 */
2421 levels = max_t(unsigned int, levels, POWERNV_IOMMU_DEFAULT_LEVELS);
2437 2422
2438 rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, 2423 rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, PAGE_SHIFT,
2439 IOMMU_PAGE_SHIFT_4K, 2424 window_size, levels, false, &tbl);
2440 window_size,
2441 POWERNV_IOMMU_DEFAULT_LEVELS, false, &tbl);
2442 if (rc) { 2425 if (rc) {
2443 pe_err(pe, "Failed to create 32-bit TCE table, err %ld", 2426 pe_err(pe, "Failed to create 32-bit TCE table, err %ld",
2444 rc); 2427 rc);
2445 return rc; 2428 return rc;
2446 } 2429 }
2447 2430
2448 iommu_init_table(tbl, pe->phb->hose->node); 2431 /* We use top part of 32bit space for MMIO so exclude it from DMA */
2432 res_start = 0;
2433 res_end = 0;
2434 if (window_size > pe->phb->ioda.m32_pci_base) {
2435 res_start = pe->phb->ioda.m32_pci_base >> tbl->it_page_shift;
2436 res_end = min(window_size, SZ_4G) >> tbl->it_page_shift;
2437 }
2438 iommu_init_table(tbl, pe->phb->hose->node, res_start, res_end);
2449 2439
2450 rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl); 2440 rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
2451 if (rc) { 2441 if (rc) {
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 6104418c9ad5..2825d004dece 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -54,7 +54,8 @@ int pnv_pci_get_slot_id(struct device_node *np, uint64_t *id)
54 break; 54 break;
55 } 55 }
56 56
57 if (!of_device_is_compatible(parent, "ibm,ioda2-phb")) { 57 if (!of_device_is_compatible(parent, "ibm,ioda2-phb") &&
58 !of_device_is_compatible(parent, "ibm,ioda3-phb")) {
58 of_node_put(parent); 59 of_node_put(parent);
59 continue; 60 continue;
60 } 61 }
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 469c24463247..f914f0b14e4e 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -219,7 +219,7 @@ extern struct iommu_table_group *pnv_npu_compound_attach(
219 struct pnv_ioda_pe *pe); 219 struct pnv_ioda_pe *pe);
220 220
221/* pci-ioda-tce.c */ 221/* pci-ioda-tce.c */
222#define POWERNV_IOMMU_DEFAULT_LEVELS 1 222#define POWERNV_IOMMU_DEFAULT_LEVELS 2
223#define POWERNV_IOMMU_MAX_LEVELS 5 223#define POWERNV_IOMMU_MAX_LEVELS 5
224 224
225extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages, 225extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h
index fd4a1c5a6369..1aa51c4fa904 100644
--- a/arch/powerpc/platforms/powernv/powernv.h
+++ b/arch/powerpc/platforms/powernv/powernv.h
@@ -30,4 +30,9 @@ extern void opal_event_shutdown(void);
30 30
31bool cpu_core_split_required(void); 31bool cpu_core_split_required(void);
32 32
33struct memcons;
34ssize_t memcons_copy(struct memcons *mc, char *to, loff_t pos, size_t count);
35u32 memcons_get_size(struct memcons *mc);
36struct memcons *memcons_init(struct device_node *node, const char *mc_prop_name);
37
33#endif /* _POWERNV_H */ 38#endif /* _POWERNV_H */
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index a5e52f9eed3c..83498604d322 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -24,6 +24,7 @@
24#include <linux/bug.h> 24#include <linux/bug.h>
25#include <linux/pci.h> 25#include <linux/pci.h>
26#include <linux/cpufreq.h> 26#include <linux/cpufreq.h>
27#include <linux/memblock.h>
27 28
28#include <asm/machdep.h> 29#include <asm/machdep.h>
29#include <asm/firmware.h> 30#include <asm/firmware.h>
@@ -166,6 +167,14 @@ static void __init pnv_init(void)
166 else 167 else
167#endif 168#endif
168 add_preferred_console("hvc", 0, NULL); 169 add_preferred_console("hvc", 0, NULL);
170
171 if (!radix_enabled()) {
172 int i;
173
174 /* Allocate per cpu area to save old slb contents during MCE */
175 for_each_possible_cpu(i)
176 paca_ptrs[i]->mce_faulty_slbs = memblock_alloc_node(mmu_slb_size, __alignof__(*paca_ptrs[i]->mce_faulty_slbs), cpu_to_node(i));
177 }
169} 178}
170 179
171static void __init pnv_init_IRQ(void) 180static void __init pnv_init_IRQ(void)
diff --git a/arch/powerpc/platforms/powernv/ultravisor.c b/arch/powerpc/platforms/powernv/ultravisor.c
new file mode 100644
index 000000000000..e4a00ad06f9d
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/ultravisor.c
@@ -0,0 +1,69 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Ultravisor high level interfaces
4 *
5 * Copyright 2019, IBM Corporation.
6 *
7 */
8#include <linux/init.h>
9#include <linux/printk.h>
10#include <linux/of_fdt.h>
11#include <linux/of.h>
12
13#include <asm/ultravisor.h>
14#include <asm/firmware.h>
15#include <asm/machdep.h>
16
17#include "powernv.h"
18
19static struct kobject *ultravisor_kobj;
20
21int __init early_init_dt_scan_ultravisor(unsigned long node, const char *uname,
22 int depth, void *data)
23{
24 if (!of_flat_dt_is_compatible(node, "ibm,ultravisor"))
25 return 0;
26
27 powerpc_firmware_features |= FW_FEATURE_ULTRAVISOR;
28 pr_debug("Ultravisor detected!\n");
29 return 1;
30}
31
32static struct memcons *uv_memcons;
33
34static ssize_t uv_msglog_read(struct file *file, struct kobject *kobj,
35 struct bin_attribute *bin_attr, char *to,
36 loff_t pos, size_t count)
37{
38 return memcons_copy(uv_memcons, to, pos, count);
39}
40
41static struct bin_attribute uv_msglog_attr = {
42 .attr = {.name = "msglog", .mode = 0400},
43 .read = uv_msglog_read
44};
45
46static int __init uv_init(void)
47{
48 struct device_node *node;
49
50 if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
51 return 0;
52
53 node = of_find_compatible_node(NULL, NULL, "ibm,uv-firmware");
54 if (!node)
55 return -ENODEV;
56
57 uv_memcons = memcons_init(node, "memcons");
58 if (!uv_memcons)
59 return -ENOENT;
60
61 uv_msglog_attr.size = memcons_get_size(uv_memcons);
62
63 ultravisor_kobj = kobject_create_and_add("ultravisor", firmware_kobj);
64 if (!ultravisor_kobj)
65 return -ENOMEM;
66
67 return sysfs_create_bin_file(ultravisor_kobj, &uv_msglog_attr);
68}
69machine_subsys_initcall(powernv, uv_init);
diff --git a/arch/powerpc/platforms/ps3/spu.c b/arch/powerpc/platforms/ps3/spu.c
index bdaeaecdc06b..1193c294b8d0 100644
--- a/arch/powerpc/platforms/ps3/spu.c
+++ b/arch/powerpc/platforms/ps3/spu.c
@@ -184,10 +184,7 @@ static void spu_unmap(struct spu *spu)
184 * setup_areas - Map the spu regions into the address space. 184 * setup_areas - Map the spu regions into the address space.
185 * 185 *
186 * The current HV requires the spu shadow regs to be mapped with the 186 * The current HV requires the spu shadow regs to be mapped with the
187 * PTE page protection bits set as read-only (PP=3). This implementation 187 * PTE page protection bits set as read-only.
188 * uses the low level __ioremap() to bypass the page protection settings
189 * inforced by ioremap_prot() to get the needed PTE bits set for the
190 * shadow regs.
191 */ 188 */
192 189
193static int __init setup_areas(struct spu *spu) 190static int __init setup_areas(struct spu *spu)
@@ -195,9 +192,8 @@ static int __init setup_areas(struct spu *spu)
195 struct table {char* name; unsigned long addr; unsigned long size;}; 192 struct table {char* name; unsigned long addr; unsigned long size;};
196 unsigned long shadow_flags = pgprot_val(pgprot_noncached_wc(PAGE_KERNEL_RO)); 193 unsigned long shadow_flags = pgprot_val(pgprot_noncached_wc(PAGE_KERNEL_RO));
197 194
198 spu_pdata(spu)->shadow = __ioremap(spu_pdata(spu)->shadow_addr, 195 spu_pdata(spu)->shadow = ioremap_prot(spu_pdata(spu)->shadow_addr,
199 sizeof(struct spe_shadow), 196 sizeof(struct spe_shadow), shadow_flags);
200 shadow_flags);
201 if (!spu_pdata(spu)->shadow) { 197 if (!spu_pdata(spu)->shadow) {
202 pr_debug("%s:%d: ioremap shadow failed\n", __func__, __LINE__); 198 pr_debug("%s:%d: ioremap shadow failed\n", __func__, __LINE__);
203 goto fail_ioremap; 199 goto fail_ioremap;
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index f7b484f55553..9e35cddddf73 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -145,3 +145,17 @@ config PAPR_SCM
145 tristate "Support for the PAPR Storage Class Memory interface" 145 tristate "Support for the PAPR Storage Class Memory interface"
146 help 146 help
147 Enable access to hypervisor provided storage class memory. 147 Enable access to hypervisor provided storage class memory.
148
149config PPC_SVM
150 bool "Secure virtual machine (SVM) support for POWER"
151 depends on PPC_PSERIES
152 select SWIOTLB
153 select ARCH_HAS_MEM_ENCRYPT
154 select ARCH_HAS_FORCE_DMA_UNENCRYPTED
155 help
156 There are certain POWER platforms which support secure guests using
157 the Protected Execution Facility, with the help of an Ultravisor
158 executing below the hypervisor layer. This enables support for
159 those guests.
160
161 If unsure, say "N".
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile
index ab3d59aeacca..a3c74a5cf20d 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -26,6 +26,8 @@ obj-$(CONFIG_IBMVIO) += vio.o
26obj-$(CONFIG_IBMEBUS) += ibmebus.o 26obj-$(CONFIG_IBMEBUS) += ibmebus.o
27obj-$(CONFIG_PAPR_SCM) += papr_scm.o 27obj-$(CONFIG_PAPR_SCM) += papr_scm.o
28obj-$(CONFIG_PPC_SPLPAR) += vphn.o 28obj-$(CONFIG_PPC_SPLPAR) += vphn.o
29obj-$(CONFIG_PPC_SVM) += svm.o
30obj-$(CONFIG_FA_DUMP) += rtas-fadump.o
29 31
30ifdef CONFIG_PPC_PSERIES 32ifdef CONFIG_PPC_PSERIES
31obj-$(CONFIG_SUSPEND) += suspend.o 33obj-$(CONFIG_SUSPEND) += suspend.o
diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c
index 9edae1863e2f..893ba3f562c4 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -42,42 +42,44 @@ static int ibm_get_config_addr_info;
42static int ibm_get_config_addr_info2; 42static int ibm_get_config_addr_info2;
43static int ibm_configure_pe; 43static int ibm_configure_pe;
44 44
45#ifdef CONFIG_PCI_IOV
46void pseries_pcibios_bus_add_device(struct pci_dev *pdev) 45void pseries_pcibios_bus_add_device(struct pci_dev *pdev)
47{ 46{
48 struct pci_dn *pdn = pci_get_pdn(pdev); 47 struct pci_dn *pdn = pci_get_pdn(pdev);
49 struct pci_dn *physfn_pdn;
50 struct eeh_dev *edev;
51 48
52 if (!pdev->is_virtfn) 49 if (eeh_has_flag(EEH_FORCE_DISABLED))
53 return; 50 return;
54 51
55 pdn->device_id = pdev->device; 52 dev_dbg(&pdev->dev, "EEH: Setting up device\n");
56 pdn->vendor_id = pdev->vendor; 53#ifdef CONFIG_PCI_IOV
57 pdn->class_code = pdev->class; 54 if (pdev->is_virtfn) {
58 /* 55 struct pci_dn *physfn_pdn;
59 * Last allow unfreeze return code used for retrieval
60 * by user space in eeh-sysfs to show the last command
61 * completion from platform.
62 */
63 pdn->last_allow_rc = 0;
64 physfn_pdn = pci_get_pdn(pdev->physfn);
65 pdn->pe_number = physfn_pdn->pe_num_map[pdn->vf_index];
66 edev = pdn_to_eeh_dev(pdn);
67 56
68 /* 57 pdn->device_id = pdev->device;
69 * The following operations will fail if VF's sysfs files 58 pdn->vendor_id = pdev->vendor;
70 * aren't created or its resources aren't finalized. 59 pdn->class_code = pdev->class;
71 */ 60 /*
61 * Last allow unfreeze return code used for retrieval
62 * by user space in eeh-sysfs to show the last command
63 * completion from platform.
64 */
65 pdn->last_allow_rc = 0;
66 physfn_pdn = pci_get_pdn(pdev->physfn);
67 pdn->pe_number = physfn_pdn->pe_num_map[pdn->vf_index];
68 }
69#endif
72 eeh_add_device_early(pdn); 70 eeh_add_device_early(pdn);
73 eeh_add_device_late(pdev); 71 eeh_add_device_late(pdev);
74 edev->pe_config_addr = (pdn->busno << 16) | (pdn->devfn << 8); 72#ifdef CONFIG_PCI_IOV
75 eeh_rmv_from_parent_pe(edev); /* Remove as it is adding to bus pe */ 73 if (pdev->is_virtfn) {
76 eeh_add_to_parent_pe(edev); /* Add as VF PE type */ 74 struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
77 eeh_sysfs_add_device(pdev);
78 75
79} 76 edev->pe_config_addr = (pdn->busno << 16) | (pdn->devfn << 8);
77 eeh_rmv_from_parent_pe(edev); /* Remove as it is adding to bus pe */
78 eeh_add_to_parent_pe(edev); /* Add as VF PE type */
79 }
80#endif 80#endif
81 eeh_sysfs_add_device(pdev);
82}
81 83
82/* 84/*
83 * Buffer for reporting slot-error-detail rtas calls. Its here 85 * Buffer for reporting slot-error-detail rtas calls. Its here
@@ -144,10 +146,8 @@ static int pseries_eeh_init(void)
144 /* Set EEH probe mode */ 146 /* Set EEH probe mode */
145 eeh_add_flag(EEH_PROBE_MODE_DEVTREE | EEH_ENABLE_IO_FOR_LOG); 147 eeh_add_flag(EEH_PROBE_MODE_DEVTREE | EEH_ENABLE_IO_FOR_LOG);
146 148
147#ifdef CONFIG_PCI_IOV
148 /* Set EEH machine dependent code */ 149 /* Set EEH machine dependent code */
149 ppc_md.pcibios_bus_add_device = pseries_pcibios_bus_add_device; 150 ppc_md.pcibios_bus_add_device = pseries_pcibios_bus_add_device;
150#endif
151 151
152 return 0; 152 return 0;
153} 153}
@@ -251,6 +251,8 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data)
251 if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA) 251 if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA)
252 return NULL; 252 return NULL;
253 253
254 eeh_edev_dbg(edev, "Probing device\n");
255
254 /* 256 /*
255 * Update class code and mode of eeh device. We need 257 * Update class code and mode of eeh device. We need
256 * correctly reflects that current device is root port 258 * correctly reflects that current device is root port
@@ -280,8 +282,11 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data)
280 pe.config_addr = (pdn->busno << 16) | (pdn->devfn << 8); 282 pe.config_addr = (pdn->busno << 16) | (pdn->devfn << 8);
281 283
282 /* Enable EEH on the device */ 284 /* Enable EEH on the device */
285 eeh_edev_dbg(edev, "Enabling EEH on device\n");
283 ret = eeh_ops->set_option(&pe, EEH_OPT_ENABLE); 286 ret = eeh_ops->set_option(&pe, EEH_OPT_ENABLE);
284 if (!ret) { 287 if (ret) {
288 eeh_edev_dbg(edev, "EEH failed to enable on device (code %d)\n", ret);
289 } else {
285 /* Retrieve PE address */ 290 /* Retrieve PE address */
286 edev->pe_config_addr = eeh_ops->get_pe_addr(&pe); 291 edev->pe_config_addr = eeh_ops->get_pe_addr(&pe);
287 pe.addr = edev->pe_config_addr; 292 pe.addr = edev->pe_config_addr;
@@ -297,11 +302,6 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data)
297 if (enable) { 302 if (enable) {
298 eeh_add_flag(EEH_ENABLED); 303 eeh_add_flag(EEH_ENABLED);
299 eeh_add_to_parent_pe(edev); 304 eeh_add_to_parent_pe(edev);
300
301 pr_debug("%s: EEH enabled on %02x:%02x.%01x PHB#%x-PE#%x\n",
302 __func__, pdn->busno, PCI_SLOT(pdn->devfn),
303 PCI_FUNC(pdn->devfn), pe.phb->global_number,
304 pe.addr);
305 } else if (pdn->parent && pdn_to_eeh_dev(pdn->parent) && 305 } else if (pdn->parent && pdn_to_eeh_dev(pdn->parent) &&
306 (pdn_to_eeh_dev(pdn->parent))->pe) { 306 (pdn_to_eeh_dev(pdn->parent))->pe) {
307 /* This device doesn't support EEH, but it may have an 307 /* This device doesn't support EEH, but it may have an
@@ -310,6 +310,8 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data)
310 edev->pe_config_addr = pdn_to_eeh_dev(pdn->parent)->pe_config_addr; 310 edev->pe_config_addr = pdn_to_eeh_dev(pdn->parent)->pe_config_addr;
311 eeh_add_to_parent_pe(edev); 311 eeh_add_to_parent_pe(edev);
312 } 312 }
313 eeh_edev_dbg(edev, "EEH is %s on device (code %d)\n",
314 (enable ? "enabled" : "unsupported"), ret);
313 } 315 }
314 316
315 /* Save memory bars */ 317 /* Save memory bars */
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 46d0d35b9ca4..8e700390f3d6 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -880,34 +880,44 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog)
880 880
881 switch (hp_elog->action) { 881 switch (hp_elog->action) {
882 case PSERIES_HP_ELOG_ACTION_ADD: 882 case PSERIES_HP_ELOG_ACTION_ADD:
883 if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT) { 883 switch (hp_elog->id_type) {
884 case PSERIES_HP_ELOG_ID_DRC_COUNT:
884 count = hp_elog->_drc_u.drc_count; 885 count = hp_elog->_drc_u.drc_count;
885 rc = dlpar_memory_add_by_count(count); 886 rc = dlpar_memory_add_by_count(count);
886 } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) { 887 break;
888 case PSERIES_HP_ELOG_ID_DRC_INDEX:
887 drc_index = hp_elog->_drc_u.drc_index; 889 drc_index = hp_elog->_drc_u.drc_index;
888 rc = dlpar_memory_add_by_index(drc_index); 890 rc = dlpar_memory_add_by_index(drc_index);
889 } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_IC) { 891 break;
892 case PSERIES_HP_ELOG_ID_DRC_IC:
890 count = hp_elog->_drc_u.ic.count; 893 count = hp_elog->_drc_u.ic.count;
891 drc_index = hp_elog->_drc_u.ic.index; 894 drc_index = hp_elog->_drc_u.ic.index;
892 rc = dlpar_memory_add_by_ic(count, drc_index); 895 rc = dlpar_memory_add_by_ic(count, drc_index);
893 } else { 896 break;
897 default:
894 rc = -EINVAL; 898 rc = -EINVAL;
899 break;
895 } 900 }
896 901
897 break; 902 break;
898 case PSERIES_HP_ELOG_ACTION_REMOVE: 903 case PSERIES_HP_ELOG_ACTION_REMOVE:
899 if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT) { 904 switch (hp_elog->id_type) {
905 case PSERIES_HP_ELOG_ID_DRC_COUNT:
900 count = hp_elog->_drc_u.drc_count; 906 count = hp_elog->_drc_u.drc_count;
901 rc = dlpar_memory_remove_by_count(count); 907 rc = dlpar_memory_remove_by_count(count);
902 } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) { 908 break;
909 case PSERIES_HP_ELOG_ID_DRC_INDEX:
903 drc_index = hp_elog->_drc_u.drc_index; 910 drc_index = hp_elog->_drc_u.drc_index;
904 rc = dlpar_memory_remove_by_index(drc_index); 911 rc = dlpar_memory_remove_by_index(drc_index);
905 } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_IC) { 912 break;
913 case PSERIES_HP_ELOG_ID_DRC_IC:
906 count = hp_elog->_drc_u.ic.count; 914 count = hp_elog->_drc_u.ic.count;
907 drc_index = hp_elog->_drc_u.ic.index; 915 drc_index = hp_elog->_drc_u.ic.index;
908 rc = dlpar_memory_remove_by_ic(count, drc_index); 916 rc = dlpar_memory_remove_by_ic(count, drc_index);
909 } else { 917 break;
918 default:
910 rc = -EINVAL; 919 rc = -EINVAL;
920 break;
911 } 921 }
912 922
913 break; 923 break;
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 889dc2e44b89..6ba081dd61c9 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -36,6 +36,7 @@
36#include <asm/udbg.h> 36#include <asm/udbg.h>
37#include <asm/mmzone.h> 37#include <asm/mmzone.h>
38#include <asm/plpar_wrappers.h> 38#include <asm/plpar_wrappers.h>
39#include <asm/svm.h>
39 40
40#include "pseries.h" 41#include "pseries.h"
41 42
@@ -609,7 +610,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
609 610
610 iommu_table_setparms(pci->phb, dn, tbl); 611 iommu_table_setparms(pci->phb, dn, tbl);
611 tbl->it_ops = &iommu_table_pseries_ops; 612 tbl->it_ops = &iommu_table_pseries_ops;
612 iommu_init_table(tbl, pci->phb->node); 613 iommu_init_table(tbl, pci->phb->node, 0, 0);
613 614
614 /* Divide the rest (1.75GB) among the children */ 615 /* Divide the rest (1.75GB) among the children */
615 pci->phb->dma_window_size = 0x80000000ul; 616 pci->phb->dma_window_size = 0x80000000ul;
@@ -621,7 +622,8 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
621 622
622#ifdef CONFIG_IOMMU_API 623#ifdef CONFIG_IOMMU_API
623static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned 624static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned
624 long *tce, enum dma_data_direction *direction) 625 long *tce, enum dma_data_direction *direction,
626 bool realmode)
625{ 627{
626 long rc; 628 long rc;
627 unsigned long ioba = (unsigned long) index << tbl->it_page_shift; 629 unsigned long ioba = (unsigned long) index << tbl->it_page_shift;
@@ -649,7 +651,7 @@ static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned
649struct iommu_table_ops iommu_table_lpar_multi_ops = { 651struct iommu_table_ops iommu_table_lpar_multi_ops = {
650 .set = tce_buildmulti_pSeriesLP, 652 .set = tce_buildmulti_pSeriesLP,
651#ifdef CONFIG_IOMMU_API 653#ifdef CONFIG_IOMMU_API
652 .exchange = tce_exchange_pseries, 654 .xchg_no_kill = tce_exchange_pseries,
653#endif 655#endif
654 .clear = tce_freemulti_pSeriesLP, 656 .clear = tce_freemulti_pSeriesLP,
655 .get = tce_get_pSeriesLP 657 .get = tce_get_pSeriesLP
@@ -690,7 +692,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
690 iommu_table_setparms_lpar(ppci->phb, pdn, tbl, 692 iommu_table_setparms_lpar(ppci->phb, pdn, tbl,
691 ppci->table_group, dma_window); 693 ppci->table_group, dma_window);
692 tbl->it_ops = &iommu_table_lpar_multi_ops; 694 tbl->it_ops = &iommu_table_lpar_multi_ops;
693 iommu_init_table(tbl, ppci->phb->node); 695 iommu_init_table(tbl, ppci->phb->node, 0, 0);
694 iommu_register_group(ppci->table_group, 696 iommu_register_group(ppci->table_group,
695 pci_domain_nr(bus), 0); 697 pci_domain_nr(bus), 0);
696 pr_debug(" created table: %p\n", ppci->table_group); 698 pr_debug(" created table: %p\n", ppci->table_group);
@@ -719,7 +721,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
719 tbl = PCI_DN(dn)->table_group->tables[0]; 721 tbl = PCI_DN(dn)->table_group->tables[0];
720 iommu_table_setparms(phb, dn, tbl); 722 iommu_table_setparms(phb, dn, tbl);
721 tbl->it_ops = &iommu_table_pseries_ops; 723 tbl->it_ops = &iommu_table_pseries_ops;
722 iommu_init_table(tbl, phb->node); 724 iommu_init_table(tbl, phb->node, 0, 0);
723 set_iommu_table_base(&dev->dev, tbl); 725 set_iommu_table_base(&dev->dev, tbl);
724 return; 726 return;
725 } 727 }
@@ -1169,7 +1171,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
1169 iommu_table_setparms_lpar(pci->phb, pdn, tbl, 1171 iommu_table_setparms_lpar(pci->phb, pdn, tbl,
1170 pci->table_group, dma_window); 1172 pci->table_group, dma_window);
1171 tbl->it_ops = &iommu_table_lpar_multi_ops; 1173 tbl->it_ops = &iommu_table_lpar_multi_ops;
1172 iommu_init_table(tbl, pci->phb->node); 1174 iommu_init_table(tbl, pci->phb->node, 0, 0);
1173 iommu_register_group(pci->table_group, 1175 iommu_register_group(pci->table_group,
1174 pci_domain_nr(pci->phb->bus), 0); 1176 pci_domain_nr(pci->phb->bus), 0);
1175 pr_debug(" created table: %p\n", pci->table_group); 1177 pr_debug(" created table: %p\n", pci->table_group);
@@ -1318,7 +1320,15 @@ void iommu_init_early_pSeries(void)
1318 of_reconfig_notifier_register(&iommu_reconfig_nb); 1320 of_reconfig_notifier_register(&iommu_reconfig_nb);
1319 register_memory_notifier(&iommu_mem_nb); 1321 register_memory_notifier(&iommu_mem_nb);
1320 1322
1321 set_pci_dma_ops(&dma_iommu_ops); 1323 /*
1324 * Secure guest memory is inacessible to devices so regular DMA isn't
1325 * possible.
1326 *
1327 * In that case keep devices' dma_map_ops as NULL so that the generic
1328 * DMA code path will use SWIOTLB to bounce buffers for DMA.
1329 */
1330 if (!is_secure_guest())
1331 set_pci_dma_ops(&dma_iommu_ops);
1322} 1332}
1323 1333
1324static int __init disable_multitce(char *str) 1334static int __init disable_multitce(char *str)
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 09bb878c21e0..36b846f6e74e 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -1413,7 +1413,10 @@ static int pseries_lpar_resize_hpt_commit(void *data)
1413 return 0; 1413 return 0;
1414} 1414}
1415 1415
1416/* Must be called in user context */ 1416/*
1417 * Must be called in process context. The caller must hold the
1418 * cpus_lock.
1419 */
1417static int pseries_lpar_resize_hpt(unsigned long shift) 1420static int pseries_lpar_resize_hpt(unsigned long shift)
1418{ 1421{
1419 struct hpt_resize_state state = { 1422 struct hpt_resize_state state = {
@@ -1467,7 +1470,8 @@ static int pseries_lpar_resize_hpt(unsigned long shift)
1467 1470
1468 t1 = ktime_get(); 1471 t1 = ktime_get();
1469 1472
1470 rc = stop_machine(pseries_lpar_resize_hpt_commit, &state, NULL); 1473 rc = stop_machine_cpuslocked(pseries_lpar_resize_hpt_commit,
1474 &state, NULL);
1471 1475
1472 t2 = ktime_get(); 1476 t2 = ktime_get();
1473 1477
@@ -1527,16 +1531,24 @@ void __init hpte_init_pseries(void)
1527 mmu_hash_ops.flush_hash_range = pSeries_lpar_flush_hash_range; 1531 mmu_hash_ops.flush_hash_range = pSeries_lpar_flush_hash_range;
1528 mmu_hash_ops.hpte_clear_all = pseries_hpte_clear_all; 1532 mmu_hash_ops.hpte_clear_all = pseries_hpte_clear_all;
1529 mmu_hash_ops.hugepage_invalidate = pSeries_lpar_hugepage_invalidate; 1533 mmu_hash_ops.hugepage_invalidate = pSeries_lpar_hugepage_invalidate;
1530 register_process_table = pseries_lpar_register_process_table;
1531 1534
1532 if (firmware_has_feature(FW_FEATURE_HPT_RESIZE)) 1535 if (firmware_has_feature(FW_FEATURE_HPT_RESIZE))
1533 mmu_hash_ops.resize_hpt = pseries_lpar_resize_hpt; 1536 mmu_hash_ops.resize_hpt = pseries_lpar_resize_hpt;
1537
1538 /*
1539 * On POWER9, we need to do a H_REGISTER_PROC_TBL hcall
1540 * to inform the hypervisor that we wish to use the HPT.
1541 */
1542 if (cpu_has_feature(CPU_FTR_ARCH_300))
1543 pseries_lpar_register_process_table(0, 0, 0);
1534} 1544}
1535 1545
1536void radix_init_pseries(void) 1546void radix_init_pseries(void)
1537{ 1547{
1538 pr_info("Using radix MMU under hypervisor\n"); 1548 pr_info("Using radix MMU under hypervisor\n");
1539 register_process_table = pseries_lpar_register_process_table; 1549
1550 pseries_lpar_register_process_table(__pa(process_tb),
1551 0, PRTB_SIZE_SHIFT - 12);
1540} 1552}
1541 1553
1542#ifdef CONFIG_PPC_SMLPAR 1554#ifdef CONFIG_PPC_SMLPAR
diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c
index fe812bebdf5e..b571285f6c14 100644
--- a/arch/powerpc/platforms/pseries/mobility.c
+++ b/arch/powerpc/platforms/pseries/mobility.c
@@ -9,6 +9,7 @@
9#include <linux/cpu.h> 9#include <linux/cpu.h>
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/kobject.h> 11#include <linux/kobject.h>
12#include <linux/sched.h>
12#include <linux/smp.h> 13#include <linux/smp.h>
13#include <linux/stat.h> 14#include <linux/stat.h>
14#include <linux/completion.h> 15#include <linux/completion.h>
@@ -207,7 +208,11 @@ static int update_dt_node(__be32 phandle, s32 scope)
207 208
208 prop_data += vd; 209 prop_data += vd;
209 } 210 }
211
212 cond_resched();
210 } 213 }
214
215 cond_resched();
211 } while (rtas_rc == 1); 216 } while (rtas_rc == 1);
212 217
213 of_node_put(dn); 218 of_node_put(dn);
@@ -310,8 +315,12 @@ int pseries_devicetree_update(s32 scope)
310 add_dt_node(phandle, drc_index); 315 add_dt_node(phandle, drc_index);
311 break; 316 break;
312 } 317 }
318
319 cond_resched();
313 } 320 }
314 } 321 }
322
323 cond_resched();
315 } while (rc == 1); 324 } while (rc == 1);
316 325
317 kfree(rtas_buf); 326 kfree(rtas_buf);
diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c
index 1eae1d09980c..722830978639 100644
--- a/arch/powerpc/platforms/pseries/pci.c
+++ b/arch/powerpc/platforms/pseries/pci.c
@@ -229,8 +229,7 @@ void __init pSeries_final_fixup(void)
229 229
230 pSeries_request_regions(); 230 pSeries_request_regions();
231 231
232 eeh_probe_devices(); 232 eeh_show_enabled();
233 eeh_addr_cache_build();
234 233
235#ifdef CONFIG_PCI_IOV 234#ifdef CONFIG_PCI_IOV
236 ppc_md.pcibios_sriov_enable = pseries_pcibios_sriov_enable; 235 ppc_md.pcibios_sriov_enable = pseries_pcibios_sriov_enable;
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index f16fdd0f71f7..3acdcc3bb908 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -76,6 +76,7 @@ struct pseries_mc_errorlog {
76#define MC_ERROR_TYPE_UE 0x00 76#define MC_ERROR_TYPE_UE 0x00
77#define MC_ERROR_TYPE_SLB 0x01 77#define MC_ERROR_TYPE_SLB 0x01
78#define MC_ERROR_TYPE_ERAT 0x02 78#define MC_ERROR_TYPE_ERAT 0x02
79#define MC_ERROR_TYPE_UNKNOWN 0x03
79#define MC_ERROR_TYPE_TLB 0x04 80#define MC_ERROR_TYPE_TLB 0x04
80#define MC_ERROR_TYPE_D_CACHE 0x05 81#define MC_ERROR_TYPE_D_CACHE 0x05
81#define MC_ERROR_TYPE_I_CACHE 0x07 82#define MC_ERROR_TYPE_I_CACHE 0x07
@@ -87,6 +88,9 @@ struct pseries_mc_errorlog {
87#define MC_ERROR_UE_LOAD_STORE 3 88#define MC_ERROR_UE_LOAD_STORE 3
88#define MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE 4 89#define MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE 4
89 90
91#define UE_EFFECTIVE_ADDR_PROVIDED 0x40
92#define UE_LOGICAL_ADDR_PROVIDED 0x20
93
90#define MC_ERROR_SLB_PARITY 0 94#define MC_ERROR_SLB_PARITY 0
91#define MC_ERROR_SLB_MULTIHIT 1 95#define MC_ERROR_SLB_MULTIHIT 1
92#define MC_ERROR_SLB_INDETERMINATE 2 96#define MC_ERROR_SLB_INDETERMINATE 2
@@ -113,27 +117,6 @@ static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog)
113 } 117 }
114} 118}
115 119
116static
117inline u64 rtas_mc_get_effective_addr(const struct pseries_mc_errorlog *mlog)
118{
119 __be64 addr = 0;
120
121 switch (mlog->error_type) {
122 case MC_ERROR_TYPE_UE:
123 if (mlog->sub_err_type & 0x40)
124 addr = mlog->effective_address;
125 break;
126 case MC_ERROR_TYPE_SLB:
127 case MC_ERROR_TYPE_ERAT:
128 case MC_ERROR_TYPE_TLB:
129 if (mlog->sub_err_type & 0x80)
130 addr = mlog->effective_address;
131 default:
132 break;
133 }
134 return be64_to_cpu(addr);
135}
136
137/* 120/*
138 * Enable the hotplug interrupt late because processing them may touch other 121 * Enable the hotplug interrupt late because processing them may touch other
139 * devices or systems (e.g. hugepages) that have not been initialized at the 122 * devices or systems (e.g. hugepages) that have not been initialized at the
@@ -511,160 +494,165 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
511 return 0; /* need to perform reset */ 494 return 0; /* need to perform reset */
512} 495}
513 496
514#define VAL_TO_STRING(ar, val) \
515 (((val) < ARRAY_SIZE(ar)) ? ar[(val)] : "Unknown")
516 497
517static void pseries_print_mce_info(struct pt_regs *regs, 498static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
518 struct rtas_error_log *errp)
519{ 499{
520 const char *level, *sevstr; 500 struct mce_error_info mce_err = { 0 };
501 unsigned long eaddr = 0, paddr = 0;
521 struct pseries_errorlog *pseries_log; 502 struct pseries_errorlog *pseries_log;
522 struct pseries_mc_errorlog *mce_log; 503 struct pseries_mc_errorlog *mce_log;
523 u8 error_type, err_sub_type;
524 u64 addr;
525 u8 initiator = rtas_error_initiator(errp);
526 int disposition = rtas_error_disposition(errp); 504 int disposition = rtas_error_disposition(errp);
505 int initiator = rtas_error_initiator(errp);
506 int severity = rtas_error_severity(errp);
507 u8 error_type, err_sub_type;
527 508
528 static const char * const initiators[] = { 509 if (initiator == RTAS_INITIATOR_UNKNOWN)
529 [0] = "Unknown", 510 mce_err.initiator = MCE_INITIATOR_UNKNOWN;
530 [1] = "CPU", 511 else if (initiator == RTAS_INITIATOR_CPU)
531 [2] = "PCI", 512 mce_err.initiator = MCE_INITIATOR_CPU;
532 [3] = "ISA", 513 else if (initiator == RTAS_INITIATOR_PCI)
533 [4] = "Memory", 514 mce_err.initiator = MCE_INITIATOR_PCI;
534 [5] = "Power Mgmt", 515 else if (initiator == RTAS_INITIATOR_ISA)
535 }; 516 mce_err.initiator = MCE_INITIATOR_ISA;
536 static const char * const mc_err_types[] = { 517 else if (initiator == RTAS_INITIATOR_MEMORY)
537 [0] = "UE", 518 mce_err.initiator = MCE_INITIATOR_MEMORY;
538 [1] = "SLB", 519 else if (initiator == RTAS_INITIATOR_POWERMGM)
539 [2] = "ERAT", 520 mce_err.initiator = MCE_INITIATOR_POWERMGM;
540 [3] = "Unknown", 521 else
541 [4] = "TLB", 522 mce_err.initiator = MCE_INITIATOR_UNKNOWN;
542 [5] = "D-Cache", 523
543 [6] = "Unknown", 524 if (severity == RTAS_SEVERITY_NO_ERROR)
544 [7] = "I-Cache", 525 mce_err.severity = MCE_SEV_NO_ERROR;
545 }; 526 else if (severity == RTAS_SEVERITY_EVENT)
546 static const char * const mc_ue_types[] = { 527 mce_err.severity = MCE_SEV_WARNING;
547 [0] = "Indeterminate", 528 else if (severity == RTAS_SEVERITY_WARNING)
548 [1] = "Instruction fetch", 529 mce_err.severity = MCE_SEV_WARNING;
549 [2] = "Page table walk ifetch", 530 else if (severity == RTAS_SEVERITY_ERROR_SYNC)
550 [3] = "Load/Store", 531 mce_err.severity = MCE_SEV_SEVERE;
551 [4] = "Page table walk Load/Store", 532 else if (severity == RTAS_SEVERITY_ERROR)
552 }; 533 mce_err.severity = MCE_SEV_SEVERE;
553 534 else if (severity == RTAS_SEVERITY_FATAL)
554 /* SLB sub errors valid values are 0x0, 0x1, 0x2 */ 535 mce_err.severity = MCE_SEV_FATAL;
555 static const char * const mc_slb_types[] = { 536 else
556 [0] = "Parity", 537 mce_err.severity = MCE_SEV_FATAL;
557 [1] = "Multihit", 538
558 [2] = "Indeterminate", 539 if (severity <= RTAS_SEVERITY_ERROR_SYNC)
559 }; 540 mce_err.sync_error = true;
560 541 else
561 /* TLB and ERAT sub errors valid values are 0x1, 0x2, 0x3 */ 542 mce_err.sync_error = false;
562 static const char * const mc_soft_types[] = { 543
563 [0] = "Unknown", 544 mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN;
564 [1] = "Parity", 545 mce_err.error_class = MCE_ECLASS_UNKNOWN;
565 [2] = "Multihit", 546
566 [3] = "Indeterminate", 547 if (!rtas_error_extended(errp))
567 }; 548 goto out;
568
569 if (!rtas_error_extended(errp)) {
570 pr_err("Machine check interrupt: Missing extended error log\n");
571 return;
572 }
573 549
574 pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); 550 pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
575 if (pseries_log == NULL) 551 if (pseries_log == NULL)
576 return; 552 goto out;
577 553
578 mce_log = (struct pseries_mc_errorlog *)pseries_log->data; 554 mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
579
580 error_type = mce_log->error_type; 555 error_type = mce_log->error_type;
581 err_sub_type = rtas_mc_error_sub_type(mce_log); 556 err_sub_type = rtas_mc_error_sub_type(mce_log);
582 557
583 switch (rtas_error_severity(errp)) { 558 switch (mce_log->error_type) {
584 case RTAS_SEVERITY_NO_ERROR: 559 case MC_ERROR_TYPE_UE:
585 level = KERN_INFO; 560 mce_err.error_type = MCE_ERROR_TYPE_UE;
586 sevstr = "Harmless"; 561 switch (err_sub_type) {
587 break; 562 case MC_ERROR_UE_IFETCH:
588 case RTAS_SEVERITY_WARNING: 563 mce_err.u.ue_error_type = MCE_UE_ERROR_IFETCH;
589 level = KERN_WARNING; 564 break;
590 sevstr = ""; 565 case MC_ERROR_UE_PAGE_TABLE_WALK_IFETCH:
591 break; 566 mce_err.u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH;
592 case RTAS_SEVERITY_ERROR: 567 break;
593 case RTAS_SEVERITY_ERROR_SYNC: 568 case MC_ERROR_UE_LOAD_STORE:
594 level = KERN_ERR; 569 mce_err.u.ue_error_type = MCE_UE_ERROR_LOAD_STORE;
595 sevstr = "Severe"; 570 break;
596 break; 571 case MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE:
597 case RTAS_SEVERITY_FATAL: 572 mce_err.u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE;
598 default: 573 break;
599 level = KERN_ERR; 574 case MC_ERROR_UE_INDETERMINATE:
600 sevstr = "Fatal"; 575 default:
601 break; 576 mce_err.u.ue_error_type = MCE_UE_ERROR_INDETERMINATE;
602 } 577 break;
578 }
579 if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED)
580 eaddr = be64_to_cpu(mce_log->effective_address);
603 581
604#ifdef CONFIG_PPC_BOOK3S_64 582 if (mce_log->sub_err_type & UE_LOGICAL_ADDR_PROVIDED) {
605 /* Display faulty slb contents for SLB errors. */ 583 paddr = be64_to_cpu(mce_log->logical_address);
606 if (error_type == MC_ERROR_TYPE_SLB) 584 } else if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) {
607 slb_dump_contents(local_paca->mce_faulty_slbs); 585 unsigned long pfn;
608#endif
609 586
610 printk("%s%s Machine check interrupt [%s]\n", level, sevstr, 587 pfn = addr_to_pfn(regs, eaddr);
611 disposition == RTAS_DISP_FULLY_RECOVERED ? 588 if (pfn != ULONG_MAX)
612 "Recovered" : "Not recovered"); 589 paddr = pfn << PAGE_SHIFT;
613 if (user_mode(regs)) { 590 }
614 printk("%s NIP: [%016lx] PID: %d Comm: %s\n", level,
615 regs->nip, current->pid, current->comm);
616 } else {
617 printk("%s NIP [%016lx]: %pS\n", level, regs->nip,
618 (void *)regs->nip);
619 }
620 printk("%s Initiator: %s\n", level,
621 VAL_TO_STRING(initiators, initiator));
622 591
623 switch (error_type) {
624 case MC_ERROR_TYPE_UE:
625 printk("%s Error type: %s [%s]\n", level,
626 VAL_TO_STRING(mc_err_types, error_type),
627 VAL_TO_STRING(mc_ue_types, err_sub_type));
628 break; 592 break;
629 case MC_ERROR_TYPE_SLB: 593 case MC_ERROR_TYPE_SLB:
630 printk("%s Error type: %s [%s]\n", level, 594 mce_err.error_type = MCE_ERROR_TYPE_SLB;
631 VAL_TO_STRING(mc_err_types, error_type), 595 switch (err_sub_type) {
632 VAL_TO_STRING(mc_slb_types, err_sub_type)); 596 case MC_ERROR_SLB_PARITY:
597 mce_err.u.slb_error_type = MCE_SLB_ERROR_PARITY;
598 break;
599 case MC_ERROR_SLB_MULTIHIT:
600 mce_err.u.slb_error_type = MCE_SLB_ERROR_MULTIHIT;
601 break;
602 case MC_ERROR_SLB_INDETERMINATE:
603 default:
604 mce_err.u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE;
605 break;
606 }
607 if (mce_log->sub_err_type & 0x80)
608 eaddr = be64_to_cpu(mce_log->effective_address);
633 break; 609 break;
634 case MC_ERROR_TYPE_ERAT: 610 case MC_ERROR_TYPE_ERAT:
611 mce_err.error_type = MCE_ERROR_TYPE_ERAT;
612 switch (err_sub_type) {
613 case MC_ERROR_ERAT_PARITY:
614 mce_err.u.erat_error_type = MCE_ERAT_ERROR_PARITY;
615 break;
616 case MC_ERROR_ERAT_MULTIHIT:
617 mce_err.u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
618 break;
619 case MC_ERROR_ERAT_INDETERMINATE:
620 default:
621 mce_err.u.erat_error_type = MCE_ERAT_ERROR_INDETERMINATE;
622 break;
623 }
624 if (mce_log->sub_err_type & 0x80)
625 eaddr = be64_to_cpu(mce_log->effective_address);
626 break;
635 case MC_ERROR_TYPE_TLB: 627 case MC_ERROR_TYPE_TLB:
636 printk("%s Error type: %s [%s]\n", level, 628 mce_err.error_type = MCE_ERROR_TYPE_TLB;
637 VAL_TO_STRING(mc_err_types, error_type), 629 switch (err_sub_type) {
638 VAL_TO_STRING(mc_soft_types, err_sub_type)); 630 case MC_ERROR_TLB_PARITY:
631 mce_err.u.tlb_error_type = MCE_TLB_ERROR_PARITY;
632 break;
633 case MC_ERROR_TLB_MULTIHIT:
634 mce_err.u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT;
635 break;
636 case MC_ERROR_TLB_INDETERMINATE:
637 default:
638 mce_err.u.tlb_error_type = MCE_TLB_ERROR_INDETERMINATE;
639 break;
640 }
641 if (mce_log->sub_err_type & 0x80)
642 eaddr = be64_to_cpu(mce_log->effective_address);
643 break;
644 case MC_ERROR_TYPE_D_CACHE:
645 mce_err.error_type = MCE_ERROR_TYPE_DCACHE;
639 break; 646 break;
647 case MC_ERROR_TYPE_I_CACHE:
648 mce_err.error_type = MCE_ERROR_TYPE_DCACHE;
649 break;
650 case MC_ERROR_TYPE_UNKNOWN:
640 default: 651 default:
641 printk("%s Error type: %s\n", level, 652 mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN;
642 VAL_TO_STRING(mc_err_types, error_type));
643 break; 653 break;
644 } 654 }
645 655
646 addr = rtas_mc_get_effective_addr(mce_log);
647 if (addr)
648 printk("%s Effective address: %016llx\n", level, addr);
649}
650
651static int mce_handle_error(struct rtas_error_log *errp)
652{
653 struct pseries_errorlog *pseries_log;
654 struct pseries_mc_errorlog *mce_log;
655 int disposition = rtas_error_disposition(errp);
656 u8 error_type;
657
658 if (!rtas_error_extended(errp))
659 goto out;
660
661 pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
662 if (pseries_log == NULL)
663 goto out;
664
665 mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
666 error_type = mce_log->error_type;
667
668#ifdef CONFIG_PPC_BOOK3S_64 656#ifdef CONFIG_PPC_BOOK3S_64
669 if (disposition == RTAS_DISP_NOT_RECOVERED) { 657 if (disposition == RTAS_DISP_NOT_RECOVERED) {
670 switch (error_type) { 658 switch (error_type) {
@@ -682,98 +670,24 @@ static int mce_handle_error(struct rtas_error_log *errp)
682 slb_save_contents(local_paca->mce_faulty_slbs); 670 slb_save_contents(local_paca->mce_faulty_slbs);
683 flush_and_reload_slb(); 671 flush_and_reload_slb();
684 disposition = RTAS_DISP_FULLY_RECOVERED; 672 disposition = RTAS_DISP_FULLY_RECOVERED;
685 rtas_set_disposition_recovered(errp);
686 break; 673 break;
687 default: 674 default:
688 break; 675 break;
689 } 676 }
677 } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) {
678 /* Platform corrected itself but could be degraded */
679 printk(KERN_ERR "MCE: limited recovery, system may "
680 "be degraded\n");
681 disposition = RTAS_DISP_FULLY_RECOVERED;
690 } 682 }
691#endif 683#endif
692 684
693out: 685out:
694 return disposition; 686 save_mce_event(regs, disposition == RTAS_DISP_FULLY_RECOVERED,
695} 687 &mce_err, regs->nip, eaddr, paddr);
696
697#ifdef CONFIG_MEMORY_FAILURE
698
699static DEFINE_PER_CPU(int, rtas_ue_count);
700static DEFINE_PER_CPU(unsigned long, rtas_ue_paddr[MAX_MC_EVT]);
701 688
702#define UE_EFFECTIVE_ADDR_PROVIDED 0x40 689 return disposition;
703#define UE_LOGICAL_ADDR_PROVIDED 0x20
704
705
706static void pseries_hwpoison_work_fn(struct work_struct *work)
707{
708 unsigned long paddr;
709 int index;
710
711 while (__this_cpu_read(rtas_ue_count) > 0) {
712 index = __this_cpu_read(rtas_ue_count) - 1;
713 paddr = __this_cpu_read(rtas_ue_paddr[index]);
714 memory_failure(paddr >> PAGE_SHIFT, 0);
715 __this_cpu_dec(rtas_ue_count);
716 }
717}
718
719static DECLARE_WORK(hwpoison_work, pseries_hwpoison_work_fn);
720
721static void queue_ue_paddr(unsigned long paddr)
722{
723 int index;
724
725 index = __this_cpu_inc_return(rtas_ue_count) - 1;
726 if (index >= MAX_MC_EVT) {
727 __this_cpu_dec(rtas_ue_count);
728 return;
729 }
730 this_cpu_write(rtas_ue_paddr[index], paddr);
731 schedule_work(&hwpoison_work);
732}
733
734static void pseries_do_memory_failure(struct pt_regs *regs,
735 struct pseries_mc_errorlog *mce_log)
736{
737 unsigned long paddr;
738
739 if (mce_log->sub_err_type & UE_LOGICAL_ADDR_PROVIDED) {
740 paddr = be64_to_cpu(mce_log->logical_address);
741 } else if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) {
742 unsigned long pfn;
743
744 pfn = addr_to_pfn(regs,
745 be64_to_cpu(mce_log->effective_address));
746 if (pfn == ULONG_MAX)
747 return;
748 paddr = pfn << PAGE_SHIFT;
749 } else {
750 return;
751 }
752 queue_ue_paddr(paddr);
753}
754
755static void pseries_process_ue(struct pt_regs *regs,
756 struct rtas_error_log *errp)
757{
758 struct pseries_errorlog *pseries_log;
759 struct pseries_mc_errorlog *mce_log;
760
761 if (!rtas_error_extended(errp))
762 return;
763
764 pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
765 if (!pseries_log)
766 return;
767
768 mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
769
770 if (mce_log->error_type == MC_ERROR_TYPE_UE)
771 pseries_do_memory_failure(regs, mce_log);
772} 690}
773#else
774static inline void pseries_process_ue(struct pt_regs *regs,
775 struct rtas_error_log *errp) { }
776#endif /*CONFIG_MEMORY_FAILURE */
777 691
778/* 692/*
779 * Process MCE rtas errlog event. 693 * Process MCE rtas errlog event.
@@ -795,49 +709,51 @@ static void mce_process_errlog_event(struct irq_work *work)
795 * Return 1 if corrected (or delivered a signal). 709 * Return 1 if corrected (or delivered a signal).
796 * Return 0 if there is nothing we can do. 710 * Return 0 if there is nothing we can do.
797 */ 711 */
798static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err) 712static int recover_mce(struct pt_regs *regs, struct machine_check_event *evt)
799{ 713{
800 int recovered = 0; 714 int recovered = 0;
801 int disposition = rtas_error_disposition(err);
802
803 pseries_print_mce_info(regs, err);
804 715
805 if (!(regs->msr & MSR_RI)) { 716 if (!(regs->msr & MSR_RI)) {
806 /* If MSR_RI isn't set, we cannot recover */ 717 /* If MSR_RI isn't set, we cannot recover */
807 pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n"); 718 pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n");
808 recovered = 0; 719 recovered = 0;
809 720 } else if (evt->disposition == MCE_DISPOSITION_RECOVERED) {
810 } else if (disposition == RTAS_DISP_FULLY_RECOVERED) {
811 /* Platform corrected itself */ 721 /* Platform corrected itself */
812 recovered = 1; 722 recovered = 1;
723 } else if (evt->severity == MCE_SEV_FATAL) {
724 /* Fatal machine check */
725 pr_err("Machine check interrupt is fatal\n");
726 recovered = 0;
727 }
813 728
814 } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { 729 if (!recovered && evt->sync_error) {
815 /* Platform corrected itself but could be degraded */
816 printk(KERN_ERR "MCE: limited recovery, system may "
817 "be degraded\n");
818 recovered = 1;
819
820 } else if (user_mode(regs) && !is_global_init(current) &&
821 rtas_error_severity(err) == RTAS_SEVERITY_ERROR_SYNC) {
822
823 /* 730 /*
824 * If we received a synchronous error when in userspace 731 * Try to kill processes if we get a synchronous machine check
825 * kill the task. Firmware may report details of the fail 732 * (e.g., one caused by execution of this instruction). This
826 * asynchronously, so we can't rely on the target and type 733 * will devolve into a panic if we try to kill init or are in
827 * fields being valid here. 734 * an interrupt etc.
735 *
736 * TODO: Queue up this address for hwpoisioning later.
737 * TODO: This is not quite right for d-side machine
738 * checks ->nip is not necessarily the important
739 * address.
828 */ 740 */
829 printk(KERN_ERR "MCE: uncorrectable error, killing task " 741 if ((user_mode(regs))) {
830 "%s:%d\n", current->comm, current->pid); 742 _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
831 743 recovered = 1;
832 _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); 744 } else if (die_will_crash()) {
833 recovered = 1; 745 /*
746 * die() would kill the kernel, so better to go via
747 * the platform reboot code that will log the
748 * machine check.
749 */
750 recovered = 0;
751 } else {
752 die("Machine check", regs, SIGBUS);
753 recovered = 1;
754 }
834 } 755 }
835 756
836 pseries_process_ue(regs, err);
837
838 /* Queue irq work to log this rtas event later. */
839 irq_work_queue(&mce_errlog_process_work);
840
841 return recovered; 757 return recovered;
842} 758}
843 759
@@ -853,14 +769,21 @@ static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err)
853 */ 769 */
854int pSeries_machine_check_exception(struct pt_regs *regs) 770int pSeries_machine_check_exception(struct pt_regs *regs)
855{ 771{
856 struct rtas_error_log *errp; 772 struct machine_check_event evt;
857 773
858 if (fwnmi_active) { 774 if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
859 fwnmi_release_errinfo(); 775 return 0;
860 errp = fwnmi_get_errlog(); 776
861 if (errp && recover_mce(regs, errp)) 777 /* Print things out */
862 return 1; 778 if (evt.version != MCE_V1) {
779 pr_err("Machine Check Exception, Unknown event version %d !\n",
780 evt.version);
781 return 0;
863 } 782 }
783 machine_check_print_event_info(&evt, user_mode(regs), false);
784
785 if (recover_mce(regs, &evt))
786 return 1;
864 787
865 return 0; 788 return 0;
866} 789}
@@ -877,7 +800,12 @@ long pseries_machine_check_realmode(struct pt_regs *regs)
877 * to panic. Hence we will call it as soon as we go into 800 * to panic. Hence we will call it as soon as we go into
878 * virtual mode. 801 * virtual mode.
879 */ 802 */
880 disposition = mce_handle_error(errp); 803 disposition = mce_handle_error(regs, errp);
804 fwnmi_release_errinfo();
805
806 /* Queue irq work to log this rtas event later. */
807 irq_work_queue(&mce_errlog_process_work);
808
881 if (disposition == RTAS_DISP_FULLY_RECOVERED) 809 if (disposition == RTAS_DISP_FULLY_RECOVERED)
882 return 1; 810 return 1;
883 } 811 }
diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.c b/arch/powerpc/platforms/pseries/rtas-fadump.c
new file mode 100644
index 000000000000..70c3013fdd07
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/rtas-fadump.c
@@ -0,0 +1,550 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * Firmware-Assisted Dump support on POWERVM platform.
4 *
5 * Copyright 2011, Mahesh Salgaonkar, IBM Corporation.
6 * Copyright 2019, Hari Bathini, IBM Corporation.
7 */
8
9#define pr_fmt(fmt) "rtas fadump: " fmt
10
11#include <linux/string.h>
12#include <linux/memblock.h>
13#include <linux/delay.h>
14#include <linux/seq_file.h>
15#include <linux/crash_dump.h>
16
17#include <asm/page.h>
18#include <asm/prom.h>
19#include <asm/rtas.h>
20#include <asm/fadump.h>
21#include <asm/fadump-internal.h>
22
23#include "rtas-fadump.h"
24
25static struct rtas_fadump_mem_struct fdm;
26static const struct rtas_fadump_mem_struct *fdm_active;
27
28static void rtas_fadump_update_config(struct fw_dump *fadump_conf,
29 const struct rtas_fadump_mem_struct *fdm)
30{
31 fadump_conf->boot_mem_dest_addr =
32 be64_to_cpu(fdm->rmr_region.destination_address);
33
34 fadump_conf->fadumphdr_addr = (fadump_conf->boot_mem_dest_addr +
35 fadump_conf->boot_memory_size);
36}
37
38/*
39 * This function is called in the capture kernel to get configuration details
40 * setup in the first kernel and passed to the f/w.
41 */
42static void rtas_fadump_get_config(struct fw_dump *fadump_conf,
43 const struct rtas_fadump_mem_struct *fdm)
44{
45 fadump_conf->boot_mem_addr[0] =
46 be64_to_cpu(fdm->rmr_region.source_address);
47 fadump_conf->boot_mem_sz[0] = be64_to_cpu(fdm->rmr_region.source_len);
48 fadump_conf->boot_memory_size = fadump_conf->boot_mem_sz[0];
49
50 fadump_conf->boot_mem_top = fadump_conf->boot_memory_size;
51 fadump_conf->boot_mem_regs_cnt = 1;
52
53 /*
54 * Start address of reserve dump area (permanent reservation) for
55 * re-registering FADump after dump capture.
56 */
57 fadump_conf->reserve_dump_area_start =
58 be64_to_cpu(fdm->cpu_state_data.destination_address);
59
60 rtas_fadump_update_config(fadump_conf, fdm);
61}
62
63static u64 rtas_fadump_init_mem_struct(struct fw_dump *fadump_conf)
64{
65 u64 addr = fadump_conf->reserve_dump_area_start;
66
67 memset(&fdm, 0, sizeof(struct rtas_fadump_mem_struct));
68 addr = addr & PAGE_MASK;
69
70 fdm.header.dump_format_version = cpu_to_be32(0x00000001);
71 fdm.header.dump_num_sections = cpu_to_be16(3);
72 fdm.header.dump_status_flag = 0;
73 fdm.header.offset_first_dump_section =
74 cpu_to_be32((u32)offsetof(struct rtas_fadump_mem_struct,
75 cpu_state_data));
76
77 /*
78 * Fields for disk dump option.
79 * We are not using disk dump option, hence set these fields to 0.
80 */
81 fdm.header.dd_block_size = 0;
82 fdm.header.dd_block_offset = 0;
83 fdm.header.dd_num_blocks = 0;
84 fdm.header.dd_offset_disk_path = 0;
85
86 /* set 0 to disable an automatic dump-reboot. */
87 fdm.header.max_time_auto = 0;
88
89 /* Kernel dump sections */
90 /* cpu state data section. */
91 fdm.cpu_state_data.request_flag =
92 cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG);
93 fdm.cpu_state_data.source_data_type =
94 cpu_to_be16(RTAS_FADUMP_CPU_STATE_DATA);
95 fdm.cpu_state_data.source_address = 0;
96 fdm.cpu_state_data.source_len =
97 cpu_to_be64(fadump_conf->cpu_state_data_size);
98 fdm.cpu_state_data.destination_address = cpu_to_be64(addr);
99 addr += fadump_conf->cpu_state_data_size;
100
101 /* hpte region section */
102 fdm.hpte_region.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG);
103 fdm.hpte_region.source_data_type =
104 cpu_to_be16(RTAS_FADUMP_HPTE_REGION);
105 fdm.hpte_region.source_address = 0;
106 fdm.hpte_region.source_len =
107 cpu_to_be64(fadump_conf->hpte_region_size);
108 fdm.hpte_region.destination_address = cpu_to_be64(addr);
109 addr += fadump_conf->hpte_region_size;
110
111 /* RMA region section */
112 fdm.rmr_region.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG);
113 fdm.rmr_region.source_data_type =
114 cpu_to_be16(RTAS_FADUMP_REAL_MODE_REGION);
115 fdm.rmr_region.source_address = cpu_to_be64(0);
116 fdm.rmr_region.source_len = cpu_to_be64(fadump_conf->boot_memory_size);
117 fdm.rmr_region.destination_address = cpu_to_be64(addr);
118 addr += fadump_conf->boot_memory_size;
119
120 rtas_fadump_update_config(fadump_conf, &fdm);
121
122 return addr;
123}
124
125static u64 rtas_fadump_get_bootmem_min(void)
126{
127 return RTAS_FADUMP_MIN_BOOT_MEM;
128}
129
130static int rtas_fadump_register(struct fw_dump *fadump_conf)
131{
132 unsigned int wait_time;
133 int rc, err = -EIO;
134
135 /* TODO: Add upper time limit for the delay */
136 do {
137 rc = rtas_call(fadump_conf->ibm_configure_kernel_dump, 3, 1,
138 NULL, FADUMP_REGISTER, &fdm,
139 sizeof(struct rtas_fadump_mem_struct));
140
141 wait_time = rtas_busy_delay_time(rc);
142 if (wait_time)
143 mdelay(wait_time);
144
145 } while (wait_time);
146
147 switch (rc) {
148 case 0:
149 pr_info("Registration is successful!\n");
150 fadump_conf->dump_registered = 1;
151 err = 0;
152 break;
153 case -1:
154 pr_err("Failed to register. Hardware Error(%d).\n", rc);
155 break;
156 case -3:
157 if (!is_fadump_boot_mem_contiguous())
158 pr_err("Can't have holes in boot memory area.\n");
159 else if (!is_fadump_reserved_mem_contiguous())
160 pr_err("Can't have holes in reserved memory area.\n");
161
162 pr_err("Failed to register. Parameter Error(%d).\n", rc);
163 err = -EINVAL;
164 break;
165 case -9:
166 pr_err("Already registered!\n");
167 fadump_conf->dump_registered = 1;
168 err = -EEXIST;
169 break;
170 default:
171 pr_err("Failed to register. Unknown Error(%d).\n", rc);
172 break;
173 }
174
175 return err;
176}
177
178static int rtas_fadump_unregister(struct fw_dump *fadump_conf)
179{
180 unsigned int wait_time;
181 int rc;
182
183 /* TODO: Add upper time limit for the delay */
184 do {
185 rc = rtas_call(fadump_conf->ibm_configure_kernel_dump, 3, 1,
186 NULL, FADUMP_UNREGISTER, &fdm,
187 sizeof(struct rtas_fadump_mem_struct));
188
189 wait_time = rtas_busy_delay_time(rc);
190 if (wait_time)
191 mdelay(wait_time);
192 } while (wait_time);
193
194 if (rc) {
195 pr_err("Failed to un-register - unexpected error(%d).\n", rc);
196 return -EIO;
197 }
198
199 fadump_conf->dump_registered = 0;
200 return 0;
201}
202
203static int rtas_fadump_invalidate(struct fw_dump *fadump_conf)
204{
205 unsigned int wait_time;
206 int rc;
207
208 /* TODO: Add upper time limit for the delay */
209 do {
210 rc = rtas_call(fadump_conf->ibm_configure_kernel_dump, 3, 1,
211 NULL, FADUMP_INVALIDATE, fdm_active,
212 sizeof(struct rtas_fadump_mem_struct));
213
214 wait_time = rtas_busy_delay_time(rc);
215 if (wait_time)
216 mdelay(wait_time);
217 } while (wait_time);
218
219 if (rc) {
220 pr_err("Failed to invalidate - unexpected error (%d).\n", rc);
221 return -EIO;
222 }
223
224 fadump_conf->dump_active = 0;
225 fdm_active = NULL;
226 return 0;
227}
228
229#define RTAS_FADUMP_GPR_MASK 0xffffff0000000000
230static inline int rtas_fadump_gpr_index(u64 id)
231{
232 char str[3];
233 int i = -1;
234
235 if ((id & RTAS_FADUMP_GPR_MASK) == fadump_str_to_u64("GPR")) {
236 /* get the digits at the end */
237 id &= ~RTAS_FADUMP_GPR_MASK;
238 id >>= 24;
239 str[2] = '\0';
240 str[1] = id & 0xff;
241 str[0] = (id >> 8) & 0xff;
242 if (kstrtoint(str, 10, &i))
243 i = -EINVAL;
244 if (i > 31)
245 i = -1;
246 }
247 return i;
248}
249
250void rtas_fadump_set_regval(struct pt_regs *regs, u64 reg_id, u64 reg_val)
251{
252 int i;
253
254 i = rtas_fadump_gpr_index(reg_id);
255 if (i >= 0)
256 regs->gpr[i] = (unsigned long)reg_val;
257 else if (reg_id == fadump_str_to_u64("NIA"))
258 regs->nip = (unsigned long)reg_val;
259 else if (reg_id == fadump_str_to_u64("MSR"))
260 regs->msr = (unsigned long)reg_val;
261 else if (reg_id == fadump_str_to_u64("CTR"))
262 regs->ctr = (unsigned long)reg_val;
263 else if (reg_id == fadump_str_to_u64("LR"))
264 regs->link = (unsigned long)reg_val;
265 else if (reg_id == fadump_str_to_u64("XER"))
266 regs->xer = (unsigned long)reg_val;
267 else if (reg_id == fadump_str_to_u64("CR"))
268 regs->ccr = (unsigned long)reg_val;
269 else if (reg_id == fadump_str_to_u64("DAR"))
270 regs->dar = (unsigned long)reg_val;
271 else if (reg_id == fadump_str_to_u64("DSISR"))
272 regs->dsisr = (unsigned long)reg_val;
273}
274
275static struct rtas_fadump_reg_entry*
276rtas_fadump_read_regs(struct rtas_fadump_reg_entry *reg_entry,
277 struct pt_regs *regs)
278{
279 memset(regs, 0, sizeof(struct pt_regs));
280
281 while (be64_to_cpu(reg_entry->reg_id) != fadump_str_to_u64("CPUEND")) {
282 rtas_fadump_set_regval(regs, be64_to_cpu(reg_entry->reg_id),
283 be64_to_cpu(reg_entry->reg_value));
284 reg_entry++;
285 }
286 reg_entry++;
287 return reg_entry;
288}
289
290/*
291 * Read CPU state dump data and convert it into ELF notes.
292 * The CPU dump starts with magic number "REGSAVE". NumCpusOffset should be
293 * used to access the data to allow for additional fields to be added without
294 * affecting compatibility. Each list of registers for a CPU starts with
295 * "CPUSTRT" and ends with "CPUEND". Each register entry is of 16 bytes,
296 * 8 Byte ASCII identifier and 8 Byte register value. The register entry
297 * with identifier "CPUSTRT" and "CPUEND" contains 4 byte cpu id as part
298 * of register value. For more details refer to PAPR document.
299 *
300 * Only for the crashing cpu we ignore the CPU dump data and get exact
301 * state from fadump crash info structure populated by first kernel at the
302 * time of crash.
303 */
304static int __init rtas_fadump_build_cpu_notes(struct fw_dump *fadump_conf)
305{
306 struct rtas_fadump_reg_save_area_header *reg_header;
307 struct fadump_crash_info_header *fdh = NULL;
308 struct rtas_fadump_reg_entry *reg_entry;
309 u32 num_cpus, *note_buf;
310 int i, rc = 0, cpu = 0;
311 struct pt_regs regs;
312 unsigned long addr;
313 void *vaddr;
314
315 addr = be64_to_cpu(fdm_active->cpu_state_data.destination_address);
316 vaddr = __va(addr);
317
318 reg_header = vaddr;
319 if (be64_to_cpu(reg_header->magic_number) !=
320 fadump_str_to_u64("REGSAVE")) {
321 pr_err("Unable to read register save area.\n");
322 return -ENOENT;
323 }
324
325 pr_debug("--------CPU State Data------------\n");
326 pr_debug("Magic Number: %llx\n", be64_to_cpu(reg_header->magic_number));
327 pr_debug("NumCpuOffset: %x\n", be32_to_cpu(reg_header->num_cpu_offset));
328
329 vaddr += be32_to_cpu(reg_header->num_cpu_offset);
330 num_cpus = be32_to_cpu(*((__be32 *)(vaddr)));
331 pr_debug("NumCpus : %u\n", num_cpus);
332 vaddr += sizeof(u32);
333 reg_entry = (struct rtas_fadump_reg_entry *)vaddr;
334
335 rc = fadump_setup_cpu_notes_buf(num_cpus);
336 if (rc != 0)
337 return rc;
338
339 note_buf = (u32 *)fadump_conf->cpu_notes_buf_vaddr;
340
341 if (fadump_conf->fadumphdr_addr)
342 fdh = __va(fadump_conf->fadumphdr_addr);
343
344 for (i = 0; i < num_cpus; i++) {
345 if (be64_to_cpu(reg_entry->reg_id) !=
346 fadump_str_to_u64("CPUSTRT")) {
347 pr_err("Unable to read CPU state data\n");
348 rc = -ENOENT;
349 goto error_out;
350 }
351 /* Lower 4 bytes of reg_value contains logical cpu id */
352 cpu = (be64_to_cpu(reg_entry->reg_value) &
353 RTAS_FADUMP_CPU_ID_MASK);
354 if (fdh && !cpumask_test_cpu(cpu, &fdh->online_mask)) {
355 RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry);
356 continue;
357 }
358 pr_debug("Reading register data for cpu %d...\n", cpu);
359 if (fdh && fdh->crashing_cpu == cpu) {
360 regs = fdh->regs;
361 note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
362 RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry);
363 } else {
364 reg_entry++;
365 reg_entry = rtas_fadump_read_regs(reg_entry, &regs);
366 note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
367 }
368 }
369 final_note(note_buf);
370
371 if (fdh) {
372 pr_debug("Updating elfcore header (%llx) with cpu notes\n",
373 fdh->elfcorehdr_addr);
374 fadump_update_elfcore_header(__va(fdh->elfcorehdr_addr));
375 }
376 return 0;
377
378error_out:
379 fadump_free_cpu_notes_buf();
380 return rc;
381
382}
383
384/*
385 * Validate and process the dump data stored by firmware before exporting
386 * it through '/proc/vmcore'.
387 */
388static int __init rtas_fadump_process(struct fw_dump *fadump_conf)
389{
390 struct fadump_crash_info_header *fdh;
391 int rc = 0;
392
393 if (!fdm_active || !fadump_conf->fadumphdr_addr)
394 return -EINVAL;
395
396 /* Check if the dump data is valid. */
397 if ((be16_to_cpu(fdm_active->header.dump_status_flag) ==
398 RTAS_FADUMP_ERROR_FLAG) ||
399 (fdm_active->cpu_state_data.error_flags != 0) ||
400 (fdm_active->rmr_region.error_flags != 0)) {
401 pr_err("Dump taken by platform is not valid\n");
402 return -EINVAL;
403 }
404 if ((fdm_active->rmr_region.bytes_dumped !=
405 fdm_active->rmr_region.source_len) ||
406 !fdm_active->cpu_state_data.bytes_dumped) {
407 pr_err("Dump taken by platform is incomplete\n");
408 return -EINVAL;
409 }
410
411 /* Validate the fadump crash info header */
412 fdh = __va(fadump_conf->fadumphdr_addr);
413 if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) {
414 pr_err("Crash info header is not valid.\n");
415 return -EINVAL;
416 }
417
418 rc = rtas_fadump_build_cpu_notes(fadump_conf);
419 if (rc)
420 return rc;
421
422 /*
423 * We are done validating dump info and elfcore header is now ready
424 * to be exported. set elfcorehdr_addr so that vmcore module will
425 * export the elfcore header through '/proc/vmcore'.
426 */
427 elfcorehdr_addr = fdh->elfcorehdr_addr;
428
429 return 0;
430}
431
432static void rtas_fadump_region_show(struct fw_dump *fadump_conf,
433 struct seq_file *m)
434{
435 const struct rtas_fadump_section *cpu_data_section;
436 const struct rtas_fadump_mem_struct *fdm_ptr;
437
438 if (fdm_active)
439 fdm_ptr = fdm_active;
440 else
441 fdm_ptr = &fdm;
442
443 cpu_data_section = &(fdm_ptr->cpu_state_data);
444 seq_printf(m, "CPU :[%#016llx-%#016llx] %#llx bytes, Dumped: %#llx\n",
445 be64_to_cpu(cpu_data_section->destination_address),
446 be64_to_cpu(cpu_data_section->destination_address) +
447 be64_to_cpu(cpu_data_section->source_len) - 1,
448 be64_to_cpu(cpu_data_section->source_len),
449 be64_to_cpu(cpu_data_section->bytes_dumped));
450
451 seq_printf(m, "HPTE:[%#016llx-%#016llx] %#llx bytes, Dumped: %#llx\n",
452 be64_to_cpu(fdm_ptr->hpte_region.destination_address),
453 be64_to_cpu(fdm_ptr->hpte_region.destination_address) +
454 be64_to_cpu(fdm_ptr->hpte_region.source_len) - 1,
455 be64_to_cpu(fdm_ptr->hpte_region.source_len),
456 be64_to_cpu(fdm_ptr->hpte_region.bytes_dumped));
457
458 seq_printf(m, "DUMP: Src: %#016llx, Dest: %#016llx, ",
459 be64_to_cpu(fdm_ptr->rmr_region.source_address),
460 be64_to_cpu(fdm_ptr->rmr_region.destination_address));
461 seq_printf(m, "Size: %#llx, Dumped: %#llx bytes\n",
462 be64_to_cpu(fdm_ptr->rmr_region.source_len),
463 be64_to_cpu(fdm_ptr->rmr_region.bytes_dumped));
464
465 /* Dump is active. Show reserved area start address. */
466 if (fdm_active) {
467 seq_printf(m, "\nMemory above %#016lx is reserved for saving crash dump\n",
468 fadump_conf->reserve_dump_area_start);
469 }
470}
471
472static void rtas_fadump_trigger(struct fadump_crash_info_header *fdh,
473 const char *msg)
474{
475 /* Call ibm,os-term rtas call to trigger firmware assisted dump */
476 rtas_os_term((char *)msg);
477}
478
479static struct fadump_ops rtas_fadump_ops = {
480 .fadump_init_mem_struct = rtas_fadump_init_mem_struct,
481 .fadump_get_bootmem_min = rtas_fadump_get_bootmem_min,
482 .fadump_register = rtas_fadump_register,
483 .fadump_unregister = rtas_fadump_unregister,
484 .fadump_invalidate = rtas_fadump_invalidate,
485 .fadump_process = rtas_fadump_process,
486 .fadump_region_show = rtas_fadump_region_show,
487 .fadump_trigger = rtas_fadump_trigger,
488};
489
490void __init rtas_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node)
491{
492 int i, size, num_sections;
493 const __be32 *sections;
494 const __be32 *token;
495
496 /*
497 * Check if Firmware Assisted dump is supported. if yes, check
498 * if dump has been initiated on last reboot.
499 */
500 token = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL);
501 if (!token)
502 return;
503
504 fadump_conf->ibm_configure_kernel_dump = be32_to_cpu(*token);
505 fadump_conf->ops = &rtas_fadump_ops;
506 fadump_conf->fadump_supported = 1;
507
508 /* Firmware supports 64-bit value for size, align it to pagesize. */
509 fadump_conf->max_copy_size = _ALIGN_DOWN(U64_MAX, PAGE_SIZE);
510
511 /*
512 * The 'ibm,kernel-dump' rtas node is present only if there is
513 * dump data waiting for us.
514 */
515 fdm_active = of_get_flat_dt_prop(node, "ibm,kernel-dump", NULL);
516 if (fdm_active) {
517 pr_info("Firmware-assisted dump is active.\n");
518 fadump_conf->dump_active = 1;
519 rtas_fadump_get_config(fadump_conf, (void *)__pa(fdm_active));
520 }
521
522 /* Get the sizes required to store dump data for the firmware provided
523 * dump sections.
524 * For each dump section type supported, a 32bit cell which defines
525 * the ID of a supported section followed by two 32 bit cells which
526 * gives the size of the section in bytes.
527 */
528 sections = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump-sizes",
529 &size);
530
531 if (!sections)
532 return;
533
534 num_sections = size / (3 * sizeof(u32));
535
536 for (i = 0; i < num_sections; i++, sections += 3) {
537 u32 type = (u32)of_read_number(sections, 1);
538
539 switch (type) {
540 case RTAS_FADUMP_CPU_STATE_DATA:
541 fadump_conf->cpu_state_data_size =
542 of_read_ulong(&sections[1], 2);
543 break;
544 case RTAS_FADUMP_HPTE_REGION:
545 fadump_conf->hpte_region_size =
546 of_read_ulong(&sections[1], 2);
547 break;
548 }
549 }
550}
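
The comment above rtas_fadump_build_cpu_notes() describes the register save area layout: a "REGSAVE" header, a CPU count found via NumCpusOffset, then one list of 16-byte entries per CPU (an 8-byte ASCII identifier followed by an 8-byte big-endian value) bracketed by "CPUSTRT" and "CPUEND". Below is a minimal standalone sketch of walking one such list -- userspace C rather than kernel code, with fabricated sample entries; str_to_u64() mirrors the packing done by the kernel's fadump_str_to_u64():

#include <stdint.h>
#include <stdio.h>
#include <endian.h>

/* Pack an up-to-8-character ASCII id into the high bytes of a u64. */
static uint64_t str_to_u64(const char *str)
{
	uint64_t val = 0;
	int i;

	for (i = 0; i < 8 && str[i]; i++)
		val |= (uint64_t)str[i] << (56 - i * 8);
	return val;
}

struct reg_entry {		/* mirrors struct rtas_fadump_reg_entry */
	uint64_t reg_id;	/* big-endian in the dump */
	uint64_t reg_value;	/* big-endian in the dump */
};

/* Print one CPU's register list; return a pointer just past its CPUEND. */
static const struct reg_entry *dump_one_cpu(const struct reg_entry *e)
{
	/* CPUSTRT/CPUEND carry the CPU id in the low 32 bits of the value. */
	uint32_t cpu = be64toh(e->reg_value) & 0xffffffffu;

	printf("cpu %u:\n", cpu);
	for (e++; be64toh(e->reg_id) != str_to_u64("CPUEND"); e++) {
		uint64_t id = be64toh(e->reg_id);
		char name[9] = { 0 };
		int i;

		for (i = 0; i < 8; i++)
			name[i] = (id >> (56 - i * 8)) & 0xff;
		printf("  %-8s = 0x%016llx\n", name,
		       (unsigned long long)be64toh(e->reg_value));
	}
	return e + 1;			/* skip the CPUEND entry */
}

int main(void)
{
	/* Fabricated one-CPU register list, for illustration only. */
	struct reg_entry area[] = {
		{ htobe64(str_to_u64("CPUSTRT")), htobe64(3) },
		{ htobe64(str_to_u64("GPR01")),   htobe64(0xc000000000001234ULL) },
		{ htobe64(str_to_u64("NIA")),     htobe64(0xc000000000abcdefULL) },
		{ htobe64(str_to_u64("CPUEND")),  htobe64(3) },
	};

	dump_one_cpu(area);
	return 0;
}

The kernel's rtas_fadump_read_regs() performs the same walk, but feeds each value through rtas_fadump_set_regval() to fill a struct pt_regs that is then turned into ELF notes.
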
diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.h b/arch/powerpc/platforms/pseries/rtas-fadump.h
new file mode 100644
index 000000000000..fd59bd7ca9c3
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/rtas-fadump.h
@@ -0,0 +1,114 @@
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * Firmware-Assisted Dump support on POWERVM platform.
4 *
5 * Copyright 2011, Mahesh Salgaonkar, IBM Corporation.
6 * Copyright 2019, Hari Bathini, IBM Corporation.
7 */
8
9#ifndef _PSERIES_RTAS_FADUMP_H
10#define _PSERIES_RTAS_FADUMP_H
11
12/*
13 * On some Power systems where RMO is 128MB, it still requires minimum of
14 * 256MB for kernel to boot successfully. When kdump infrastructure is
15 * configured to save vmcore over network, we run into OOM issue while
16 * loading modules related to network setup. Hence we need additional 64M
17 * of memory to avoid OOM issue.
18 */
19#define RTAS_FADUMP_MIN_BOOT_MEM ((0x1UL << 28) + (0x1UL << 26))
20
21/* Firmware provided dump sections */
22#define RTAS_FADUMP_CPU_STATE_DATA 0x0001
23#define RTAS_FADUMP_HPTE_REGION 0x0002
24#define RTAS_FADUMP_REAL_MODE_REGION 0x0011
25
26/* Dump request flag */
27#define RTAS_FADUMP_REQUEST_FLAG 0x00000001
28
29/* Dump status flag */
30#define RTAS_FADUMP_ERROR_FLAG 0x2000
31
32/* Kernel Dump section info */
33struct rtas_fadump_section {
34 __be32 request_flag;
35 __be16 source_data_type;
36 __be16 error_flags;
37 __be64 source_address;
38 __be64 source_len;
39 __be64 bytes_dumped;
40 __be64 destination_address;
41};
42
43/* ibm,configure-kernel-dump header. */
44struct rtas_fadump_section_header {
45 __be32 dump_format_version;
46 __be16 dump_num_sections;
47 __be16 dump_status_flag;
48 __be32 offset_first_dump_section;
49
50 /* Fields for disk dump option. */
51 __be32 dd_block_size;
52 __be64 dd_block_offset;
53 __be64 dd_num_blocks;
54 __be32 dd_offset_disk_path;
55
56 /* Maximum time allowed to prevent an automatic dump-reboot. */
57 __be32 max_time_auto;
58};
59
60/*
61 * Firmware Assisted dump memory structure. This structure is required for
62 * registering future kernel dump with power firmware through rtas call.
63 *
64 * No disk dump option. Hence disk dump path string section is not included.
65 */
66struct rtas_fadump_mem_struct {
67 struct rtas_fadump_section_header header;
68
69 /* Kernel dump sections */
70 struct rtas_fadump_section cpu_state_data;
71 struct rtas_fadump_section hpte_region;
72
73 /*
74 * TODO: Extend multiple boot memory regions support in the kernel
75 * for this platform.
76 */
77 struct rtas_fadump_section rmr_region;
78};
79
80/*
81 * The firmware-assisted dump format.
82 *
83 * The register save area is an area in the partition's memory used to preserve
84 * the register contents (CPU state data) for the active CPUs during a firmware
85 * assisted dump. The dump format contains register save area header followed
86 * by register entries. Each list of registers for a CPU starts with "CPUSTRT"
87 * and ends with "CPUEND".
88 */
89
90/* Register save area header. */
91struct rtas_fadump_reg_save_area_header {
92 __be64 magic_number;
93 __be32 version;
94 __be32 num_cpu_offset;
95};
96
97/* Register entry. */
98struct rtas_fadump_reg_entry {
99 __be64 reg_id;
100 __be64 reg_value;
101};
102
103/* Utility macros */
104#define RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry) \
105({ \
106 while (be64_to_cpu(reg_entry->reg_id) != \
107 fadump_str_to_u64("CPUEND")) \
108 reg_entry++; \
109 reg_entry++; \
110})
111
112#define RTAS_FADUMP_CPU_ID_MASK ((1UL << 32) - 1)
113
114#endif /* _PSERIES_RTAS_FADUMP_H */
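
As a quick standalone check of the constants above (assuming a 64-bit long, as on ppc64): RTAS_FADUMP_MIN_BOOT_MEM is the 256 MiB base plus the extra 64 MiB the comment calls out, and RTAS_FADUMP_CPU_ID_MASK selects the low 32 bits of the CPUSTRT/CPUEND register value.

#include <stdio.h>

#define RTAS_FADUMP_MIN_BOOT_MEM	((0x1UL << 28) + (0x1UL << 26))
#define RTAS_FADUMP_CPU_ID_MASK		((1UL << 32) - 1)

int main(void)
{
	/* 256 MiB + 64 MiB = 320 MiB */
	printf("min boot mem = %lu MiB\n", RTAS_FADUMP_MIN_BOOT_MEM >> 20);
	/* the cpu id lives in the low 32 bits of the CPUSTRT/CPUEND value */
	printf("cpu id mask  = 0x%lx\n", RTAS_FADUMP_CPU_ID_MASK);
	return 0;
}
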
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index f5940cc71c37..f8adcd0e4589 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -69,6 +69,7 @@
69#include <asm/security_features.h> 69#include <asm/security_features.h>
70#include <asm/asm-const.h> 70#include <asm/asm-const.h>
71#include <asm/swiotlb.h> 71#include <asm/swiotlb.h>
72#include <asm/svm.h>
72 73
73#include "pseries.h" 74#include "pseries.h"
74#include "../../../../drivers/pci/pci.h" 75#include "../../../../drivers/pci/pci.h"
@@ -141,17 +142,19 @@ static void __init fwnmi_init(void)
141 } 142 }
142 143
143#ifdef CONFIG_PPC_BOOK3S_64 144#ifdef CONFIG_PPC_BOOK3S_64
144 	/* Allocate per cpu slb area to save old slb contents during MCE */
145 	size = sizeof(struct slb_entry) * mmu_slb_size * nr_cpus;
146 	slb_ptr = memblock_alloc_try_nid_raw(size, sizeof(struct slb_entry),
147 					MEMBLOCK_LOW_LIMIT, ppc64_rma_size,
148 					NUMA_NO_NODE);
149 	if (!slb_ptr)
150 		panic("Failed to allocate %zu bytes below %pa for slb area\n",
151 		      size, &ppc64_rma_size);
152 
153 	for_each_possible_cpu(i)
154 		paca_ptrs[i]->mce_faulty_slbs = slb_ptr + (mmu_slb_size * i);
 145 	if (!radix_enabled()) {
 146 		/* Allocate per cpu area to save old slb contents during MCE */
 147 		size = sizeof(struct slb_entry) * mmu_slb_size * nr_cpus;
 148 		slb_ptr = memblock_alloc_try_nid_raw(size,
 149 			sizeof(struct slb_entry), MEMBLOCK_LOW_LIMIT,
 150 			ppc64_rma_size, NUMA_NO_NODE);
 151 		if (!slb_ptr)
 152 			panic("Failed to allocate %zu bytes below %pa for slb area\n",
 153 			      size, &ppc64_rma_size);
 154 
 155 		for_each_possible_cpu(i)
156 paca_ptrs[i]->mce_faulty_slbs = slb_ptr + (mmu_slb_size * i);
157 }
155#endif 158#endif
156} 159}
157 160
@@ -297,8 +300,10 @@ static inline int alloc_dispatch_logs(void)
297 300
298static int alloc_dispatch_log_kmem_cache(void) 301static int alloc_dispatch_log_kmem_cache(void)
299{ 302{
303 void (*ctor)(void *) = get_dtl_cache_ctor();
304
300 dtl_cache = kmem_cache_create("dtl", DISPATCH_LOG_BYTES, 305 dtl_cache = kmem_cache_create("dtl", DISPATCH_LOG_BYTES,
301 DISPATCH_LOG_BYTES, 0, NULL); 306 DISPATCH_LOG_BYTES, 0, ctor);
302 if (!dtl_cache) { 307 if (!dtl_cache) {
303 pr_warn("Failed to create dispatch trace log buffer cache\n"); 308 pr_warn("Failed to create dispatch trace log buffer cache\n");
304 pr_warn("Stolen time statistics will be unreliable\n"); 309 pr_warn("Stolen time statistics will be unreliable\n");
@@ -316,6 +321,9 @@ static void pseries_lpar_idle(void)
316 * low power mode by ceding processor to hypervisor 321 * low power mode by ceding processor to hypervisor
317 */ 322 */
318 323
324 if (!prep_irq_for_idle())
325 return;
326
319 /* Indicate to hypervisor that we are idle. */ 327 /* Indicate to hypervisor that we are idle. */
320 get_lppaca()->idle = 1; 328 get_lppaca()->idle = 1;
321 329
diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c
index 4b3ef8d9c63f..ad61e90032da 100644
--- a/arch/powerpc/platforms/pseries/smp.c
+++ b/arch/powerpc/platforms/pseries/smp.c
@@ -41,6 +41,7 @@
41#include <asm/dbell.h> 41#include <asm/dbell.h>
42#include <asm/plpar_wrappers.h> 42#include <asm/plpar_wrappers.h>
43#include <asm/code-patching.h> 43#include <asm/code-patching.h>
44#include <asm/svm.h>
44 45
45#include "pseries.h" 46#include "pseries.h"
46#include "offline_states.h" 47#include "offline_states.h"
@@ -221,7 +222,7 @@ static __init void pSeries_smp_probe_xics(void)
221{ 222{
222 xics_smp_probe(); 223 xics_smp_probe();
223 224
224 if (cpu_has_feature(CPU_FTR_DBELL)) 225 if (cpu_has_feature(CPU_FTR_DBELL) && !is_secure_guest())
225 smp_ops->cause_ipi = smp_pseries_cause_ipi; 226 smp_ops->cause_ipi = smp_pseries_cause_ipi;
226 else 227 else
227 smp_ops->cause_ipi = icp_ops->cause_ipi; 228 smp_ops->cause_ipi = icp_ops->cause_ipi;
diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c
new file mode 100644
index 000000000000..40c0637203d5
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/svm.c
@@ -0,0 +1,85 @@
1// SPDX-License-Identifier: GPL-2.0+
2/*
3 * Secure VM platform
4 *
5 * Copyright 2018 IBM Corporation
6 * Author: Anshuman Khandual <khandual@linux.vnet.ibm.com>
7 */
8
9#include <linux/mm.h>
10#include <asm/machdep.h>
11#include <asm/svm.h>
12#include <asm/swiotlb.h>
13#include <asm/ultravisor.h>
14
15static int __init init_svm(void)
16{
17 if (!is_secure_guest())
18 return 0;
19
20 /* Don't release the SWIOTLB buffer. */
21 ppc_swiotlb_enable = 1;
22
23 /*
24 * Since the guest memory is inaccessible to the host, devices always
25 * need to use the SWIOTLB buffer for DMA even if dma_capable() says
26 * otherwise.
27 */
28 swiotlb_force = SWIOTLB_FORCE;
29
30 /* Share the SWIOTLB buffer with the host. */
31 swiotlb_update_mem_attributes();
32
33 return 0;
34}
35machine_early_initcall(pseries, init_svm);
36
37int set_memory_encrypted(unsigned long addr, int numpages)
38{
39 if (!PAGE_ALIGNED(addr))
40 return -EINVAL;
41
42 uv_unshare_page(PHYS_PFN(__pa(addr)), numpages);
43
44 return 0;
45}
46
47int set_memory_decrypted(unsigned long addr, int numpages)
48{
49 if (!PAGE_ALIGNED(addr))
50 return -EINVAL;
51
52 uv_share_page(PHYS_PFN(__pa(addr)), numpages);
53
54 return 0;
55}
56
57/* There's one dispatch log per CPU. */
58#define NR_DTL_PAGE (DISPATCH_LOG_BYTES * CONFIG_NR_CPUS / PAGE_SIZE)
59
60static struct page *dtl_page_store[NR_DTL_PAGE];
61static long dtl_nr_pages;
62
63static bool is_dtl_page_shared(struct page *page)
64{
65 long i;
66
67 for (i = 0; i < dtl_nr_pages; i++)
68 if (dtl_page_store[i] == page)
69 return true;
70
71 return false;
72}
73
74void dtl_cache_ctor(void *addr)
75{
76 unsigned long pfn = PHYS_PFN(__pa(addr));
77 struct page *page = pfn_to_page(pfn);
78
79 if (!is_dtl_page_shared(page)) {
80 dtl_page_store[dtl_nr_pages] = page;
81 dtl_nr_pages++;
82 WARN_ON(dtl_nr_pages >= NR_DTL_PAGE);
83 uv_share_page(pfn, 1);
84 }
85}
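
dtl_cache_ctor() above runs once per dispatch-log object, but several objects share a backing page, so the page store keeps uv_share_page() from being issued more than once per page. A small userspace model of that deduplication (not kernel code; uv_share_page(), the page size and the object layout are mocked):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE	4096UL
#define NR_PAGES	4

static uintptr_t shared_pages[NR_PAGES];
static int nr_shared;

static void uv_share_page(uintptr_t pfn)	/* mock ultracall */
{
	printf("uv_share_page(pfn=0x%lx)\n", (unsigned long)pfn);
}

static int page_already_shared(uintptr_t pfn)
{
	for (int i = 0; i < nr_shared; i++)
		if (shared_pages[i] == pfn)
			return 1;
	return 0;
}

/* Per-object constructor, as kmem_cache_create() would invoke it. */
static void dtl_ctor(void *obj)
{
	uintptr_t pfn = (uintptr_t)obj / PAGE_SIZE;

	if (!page_already_shared(pfn)) {
		if (nr_shared < NR_PAGES)
			shared_pages[nr_shared++] = pfn;
		uv_share_page(pfn);
	}
}

int main(void)
{
	static char page[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));

	/* Four 1 KiB "DTL" objects on the same page: only one share call. */
	for (int i = 0; i < 4; i++)
		dtl_ctor(page + i * 1024);
	return 0;
}

In the kernel the same constructor is handed to kmem_cache_create() by alloc_dispatch_log_kmem_cache() (see the setup.c hunk above), so it runs for every dispatch-log object the cache ever constructs.
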
diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c
index 3473eef7628c..79e2287991db 100644
--- a/arch/powerpc/platforms/pseries/vio.c
+++ b/arch/powerpc/platforms/pseries/vio.c
@@ -1193,7 +1193,7 @@ static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev)
1193 else 1193 else
1194 tbl->it_ops = &iommu_table_pseries_ops; 1194 tbl->it_ops = &iommu_table_pseries_ops;
1195 1195
1196 return iommu_init_table(tbl, -1); 1196 return iommu_init_table(tbl, -1, 0, 0);
1197} 1197}
1198 1198
1199/** 1199/**
diff --git a/arch/powerpc/sysdev/Kconfig b/arch/powerpc/sysdev/Kconfig
index d23288c4abf6..9ebcc1337560 100644
--- a/arch/powerpc/sysdev/Kconfig
+++ b/arch/powerpc/sysdev/Kconfig
@@ -28,13 +28,6 @@ config PPC_MSI_BITMAP
28source "arch/powerpc/sysdev/xics/Kconfig" 28source "arch/powerpc/sysdev/xics/Kconfig"
29source "arch/powerpc/sysdev/xive/Kconfig" 29source "arch/powerpc/sysdev/xive/Kconfig"
30 30
31config PPC_SCOM
32 bool
33
34config SCOM_DEBUGFS
35 bool "Expose SCOM controllers via debugfs"
36 depends on PPC_SCOM && DEBUG_FS
37
38config GE_FPGA 31config GE_FPGA
39 bool 32 bool
40 33
diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile
index 9d73dfddf060..603b3c656d19 100644
--- a/arch/powerpc/sysdev/Makefile
+++ b/arch/powerpc/sysdev/Makefile
@@ -49,8 +49,6 @@ ifdef CONFIG_SUSPEND
49obj-$(CONFIG_PPC_BOOK3S_32) += 6xx-suspend.o 49obj-$(CONFIG_PPC_BOOK3S_32) += 6xx-suspend.o
50endif 50endif
51 51
52obj-$(CONFIG_PPC_SCOM) += scom.o
53
54obj-$(CONFIG_PPC_EARLY_DEBUG_MEMCONS) += udbg_memcons.o 52obj-$(CONFIG_PPC_EARLY_DEBUG_MEMCONS) += udbg_memcons.o
55 53
56obj-$(CONFIG_PPC_XICS) += xics/ 54obj-$(CONFIG_PPC_XICS) += xics/
diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c
index 21a1fae0714e..6b4a34b36d98 100644
--- a/arch/powerpc/sysdev/dart_iommu.c
+++ b/arch/powerpc/sysdev/dart_iommu.c
@@ -344,7 +344,7 @@ static void iommu_table_dart_setup(void)
344 iommu_table_dart.it_index = 0; 344 iommu_table_dart.it_index = 0;
345 iommu_table_dart.it_blocksize = 1; 345 iommu_table_dart.it_blocksize = 1;
346 iommu_table_dart.it_ops = &iommu_dart_ops; 346 iommu_table_dart.it_ops = &iommu_dart_ops;
347 iommu_init_table(&iommu_table_dart, -1); 347 iommu_init_table(&iommu_table_dart, -1, 0, 0);
348 348
349 /* Reserve the last page of the DART to avoid possible prefetch 349 /* Reserve the last page of the DART to avoid possible prefetch
350 * past the DART mapped area 350 * past the DART mapped area
diff --git a/arch/powerpc/sysdev/scom.c b/arch/powerpc/sysdev/scom.c
deleted file mode 100644
index 94e885bf3aee..000000000000
--- a/arch/powerpc/sysdev/scom.c
+++ /dev/null
@@ -1,223 +0,0 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * Copyright 2010 Benjamin Herrenschmidt, IBM Corp
4 * <benh@kernel.crashing.org>
5 * and David Gibson, IBM Corporation.
6 */
7
8#include <linux/kernel.h>
9#include <linux/slab.h>
10#include <linux/export.h>
11#include <asm/debugfs.h>
12#include <asm/prom.h>
13#include <asm/scom.h>
14#include <linux/uaccess.h>
15
16const struct scom_controller *scom_controller;
17EXPORT_SYMBOL_GPL(scom_controller);
18
19struct device_node *scom_find_parent(struct device_node *node)
20{
21 struct device_node *par, *tmp;
22 const u32 *p;
23
24 for (par = of_node_get(node); par;) {
25 if (of_get_property(par, "scom-controller", NULL))
26 break;
27 p = of_get_property(par, "scom-parent", NULL);
28 tmp = par;
29 if (p == NULL)
30 par = of_get_parent(par);
31 else
32 par = of_find_node_by_phandle(*p);
33 of_node_put(tmp);
34 }
35 return par;
36}
37EXPORT_SYMBOL_GPL(scom_find_parent);
38
39scom_map_t scom_map_device(struct device_node *dev, int index)
40{
41 struct device_node *parent;
42 unsigned int cells, size;
43 const __be32 *prop, *sprop;
44 u64 reg, cnt;
45 scom_map_t ret;
46
47 parent = scom_find_parent(dev);
48
49 if (parent == NULL)
50 return NULL;
51
52 /*
53 * We support "scom-reg" properties for adding scom registers
54 * to a random device-tree node with an explicit scom-parent
55 *
56 * We also support the simple "reg" property if the device is
57 * a direct child of a scom controller.
58 *
59 * In case both exist, "scom-reg" takes precedence.
60 */
61 prop = of_get_property(dev, "scom-reg", &size);
62 sprop = of_get_property(parent, "#scom-cells", NULL);
63 if (!prop && parent == dev->parent) {
64 prop = of_get_property(dev, "reg", &size);
65 sprop = of_get_property(parent, "#address-cells", NULL);
66 }
67 if (!prop)
68 return NULL;
69 cells = sprop ? be32_to_cpup(sprop) : 1;
70 size >>= 2;
71
72 if (index >= (size / (2*cells)))
73 return NULL;
74
75 reg = of_read_number(&prop[index * cells * 2], cells);
76 cnt = of_read_number(&prop[index * cells * 2 + cells], cells);
77
78 ret = scom_map(parent, reg, cnt);
79 of_node_put(parent);
80
81 return ret;
82}
83EXPORT_SYMBOL_GPL(scom_map_device);
84
85#ifdef CONFIG_SCOM_DEBUGFS
86struct scom_debug_entry {
87 struct device_node *dn;
88 struct debugfs_blob_wrapper path;
89 char name[16];
90};
91
92static ssize_t scom_debug_read(struct file *filp, char __user *ubuf,
93 size_t count, loff_t *ppos)
94{
95 struct scom_debug_entry *ent = filp->private_data;
96 u64 __user *ubuf64 = (u64 __user *)ubuf;
97 loff_t off = *ppos;
98 ssize_t done = 0;
99 u64 reg, reg_cnt, val;
100 scom_map_t map;
101 int rc;
102
103 if (off < 0 || (off & 7) || (count & 7))
104 return -EINVAL;
105 reg = off >> 3;
106 reg_cnt = count >> 3;
107
108 map = scom_map(ent->dn, reg, reg_cnt);
109 if (!scom_map_ok(map))
110 return -ENXIO;
111
112 for (reg = 0; reg < reg_cnt; reg++) {
113 rc = scom_read(map, reg, &val);
114 if (!rc)
115 rc = put_user(val, ubuf64);
116 if (rc) {
117 if (!done)
118 done = rc;
119 break;
120 }
121 ubuf64++;
122 *ppos += 8;
123 done += 8;
124 }
125 scom_unmap(map);
126 return done;
127}
128
129static ssize_t scom_debug_write(struct file* filp, const char __user *ubuf,
130 size_t count, loff_t *ppos)
131{
132 struct scom_debug_entry *ent = filp->private_data;
133 u64 __user *ubuf64 = (u64 __user *)ubuf;
134 loff_t off = *ppos;
135 ssize_t done = 0;
136 u64 reg, reg_cnt, val;
137 scom_map_t map;
138 int rc;
139
140 if (off < 0 || (off & 7) || (count & 7))
141 return -EINVAL;
142 reg = off >> 3;
143 reg_cnt = count >> 3;
144
145 map = scom_map(ent->dn, reg, reg_cnt);
146 if (!scom_map_ok(map))
147 return -ENXIO;
148
149 for (reg = 0; reg < reg_cnt; reg++) {
150 rc = get_user(val, ubuf64);
151 if (!rc)
152 rc = scom_write(map, reg, val);
153 if (rc) {
154 if (!done)
155 done = rc;
156 break;
157 }
158 ubuf64++;
159 done += 8;
160 }
161 scom_unmap(map);
162 return done;
163}
164
165static const struct file_operations scom_debug_fops = {
166 .read = scom_debug_read,
167 .write = scom_debug_write,
168 .open = simple_open,
169 .llseek = default_llseek,
170};
171
172static int scom_debug_init_one(struct dentry *root, struct device_node *dn,
173 int i)
174{
175 struct scom_debug_entry *ent;
176 struct dentry *dir;
177
178 ent = kzalloc(sizeof(*ent), GFP_KERNEL);
179 if (!ent)
180 return -ENOMEM;
181
182 ent->dn = of_node_get(dn);
183 snprintf(ent->name, 16, "%08x", i);
184 ent->path.data = (void*)kasprintf(GFP_KERNEL, "%pOF", dn);
185 ent->path.size = strlen((char *)ent->path.data);
186
187 dir = debugfs_create_dir(ent->name, root);
188 if (!dir) {
189 of_node_put(dn);
190 kfree(ent->path.data);
191 kfree(ent);
192 return -1;
193 }
194
195 debugfs_create_blob("devspec", 0400, dir, &ent->path);
196 debugfs_create_file("access", 0600, dir, ent, &scom_debug_fops);
197
198 return 0;
199}
200
201static int scom_debug_init(void)
202{
203 struct device_node *dn;
204 struct dentry *root;
205 int i, rc;
206
207 root = debugfs_create_dir("scom", powerpc_debugfs_root);
208 if (!root)
209 return -1;
210
211 i = rc = 0;
212 for_each_node_with_property(dn, "scom-controller") {
213 int id = of_get_ibm_chip_id(dn);
214 if (id == -1)
215 id = i;
216 rc |= scom_debug_init_one(root, dn, id);
217 i++;
218 }
219
220 return rc;
221}
222device_initcall(scom_debug_init);
223#endif /* CONFIG_SCOM_DEBUGFS */
diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
index be86fce1a84e..df832b09e3e9 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -196,7 +196,7 @@ static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek)
196 196
197/* 197/*
198 * This is used to perform the magic loads from an ESB 198 * This is used to perform the magic loads from an ESB
199 * described in xive.h 199 * described in xive-regs.h
200 */ 200 */
201static notrace u8 xive_esb_read(struct xive_irq_data *xd, u32 offset) 201static notrace u8 xive_esb_read(struct xive_irq_data *xd, u32 offset)
202{ 202{
@@ -237,26 +237,61 @@ static notrace void xive_dump_eq(const char *name, struct xive_q *q)
237 i0 = be32_to_cpup(q->qpage + idx); 237 i0 = be32_to_cpup(q->qpage + idx);
238 idx = (idx + 1) & q->msk; 238 idx = (idx + 1) & q->msk;
239 i1 = be32_to_cpup(q->qpage + idx); 239 i1 = be32_to_cpup(q->qpage + idx);
240 	xmon_printf(" %s Q T=%d %08x %08x ...\n", name,
241 		    q->toggle, i0, i1);
 240 	xmon_printf("%s idx=%d T=%d %08x %08x ...", name,
 241 		    q->idx, q->toggle, i0, i1);
242} 242}
243 243
244notrace void xmon_xive_do_dump(int cpu) 244notrace void xmon_xive_do_dump(int cpu)
245{ 245{
246 struct xive_cpu *xc = per_cpu(xive_cpu, cpu); 246 struct xive_cpu *xc = per_cpu(xive_cpu, cpu);
247 247
248 	xmon_printf("XIVE state for CPU %d:\n", cpu);
249 	xmon_printf(" pp=%02x cppr=%02x\n", xc->pending_prio, xc->cppr);
250 	xive_dump_eq("IRQ", &xc->queue[xive_irq_priority]);
 248 	xmon_printf("CPU %d:", cpu);
 249 	if (xc) {
 250 		xmon_printf("pp=%02x CPPR=%02x ", xc->pending_prio, xc->cppr);
251
251#ifdef CONFIG_SMP 252#ifdef CONFIG_SMP
252 { 253 {
253 u64 val = xive_esb_read(&xc->ipi_data, XIVE_ESB_GET); 254 u64 val = xive_esb_read(&xc->ipi_data, XIVE_ESB_GET);
254 		xmon_printf(" IPI state: %x:%c%c\n", xc->hw_ipi,
255 			    val & XIVE_ESB_VAL_P ? 'P' : 'p',
256 			    val & XIVE_ESB_VAL_Q ? 'Q' : 'q');
257 	}
 255 
 256 			xmon_printf("IPI=0x%08x PQ=%c%c ", xc->hw_ipi,
 257 				    val & XIVE_ESB_VAL_P ? 'P' : '-',
 258 				    val & XIVE_ESB_VAL_Q ? 'Q' : '-');
 259 		}
258#endif 260#endif
261 xive_dump_eq("EQ", &xc->queue[xive_irq_priority]);
262 }
263 xmon_printf("\n");
264}
265
266int xmon_xive_get_irq_config(u32 hw_irq, struct irq_data *d)
267{
268 int rc;
269 u32 target;
270 u8 prio;
271 u32 lirq;
272
273 rc = xive_ops->get_irq_config(hw_irq, &target, &prio, &lirq);
274 if (rc) {
275 xmon_printf("IRQ 0x%08x : no config rc=%d\n", hw_irq, rc);
276 return rc;
277 }
278
279 xmon_printf("IRQ 0x%08x : target=0x%x prio=%02x lirq=0x%x ",
280 hw_irq, target, prio, lirq);
281
282 if (d) {
283 struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
284 u64 val = xive_esb_read(xd, XIVE_ESB_GET);
285
286 xmon_printf("PQ=%c%c",
287 val & XIVE_ESB_VAL_P ? 'P' : '-',
288 val & XIVE_ESB_VAL_Q ? 'Q' : '-');
289 }
290
291 xmon_printf("\n");
292 return 0;
259} 293}
294
260#endif /* CONFIG_XMON */ 295#endif /* CONFIG_XMON */
261 296
262static unsigned int xive_get_irq(void) 297static unsigned int xive_get_irq(void)
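
The new xmon output above prints the ESB PQ bits for IPIs and interrupts. A standalone sketch of that formatting, assuming XIVE_ESB_VAL_P is 0x2 and XIVE_ESB_VAL_Q is 0x1 as in asm/xive-regs.h:

#include <stdio.h>
#include <stdint.h>

#define XIVE_ESB_VAL_P	0x2	/* assumed values, see asm/xive-regs.h */
#define XIVE_ESB_VAL_Q	0x1

static void print_pq(uint64_t esb_val)
{
	printf("PQ=%c%c\n",
	       esb_val & XIVE_ESB_VAL_P ? 'P' : '-',
	       esb_val & XIVE_ESB_VAL_Q ? 'Q' : '-');
}

int main(void)
{
	for (uint64_t v = 0; v < 4; v++)
		print_pq(v);	/* prints --, -Q, P-, PQ */
	return 0;
}
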
diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c
index 37987c815913..0ff6b739052c 100644
--- a/arch/powerpc/sysdev/xive/native.c
+++ b/arch/powerpc/sysdev/xive/native.c
@@ -111,6 +111,20 @@ int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq)
111} 111}
112EXPORT_SYMBOL_GPL(xive_native_configure_irq); 112EXPORT_SYMBOL_GPL(xive_native_configure_irq);
113 113
114static int xive_native_get_irq_config(u32 hw_irq, u32 *target, u8 *prio,
115 u32 *sw_irq)
116{
117 s64 rc;
118 __be64 vp;
119 __be32 lirq;
120
121 rc = opal_xive_get_irq_config(hw_irq, &vp, prio, &lirq);
122
123 *target = be64_to_cpu(vp);
124 *sw_irq = be32_to_cpu(lirq);
125
126 return rc == 0 ? 0 : -ENXIO;
127}
114 128
115/* This can be called multiple time to change a queue configuration */ 129/* This can be called multiple time to change a queue configuration */
116int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio, 130int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio,
@@ -231,6 +245,17 @@ static bool xive_native_match(struct device_node *node)
231 return of_device_is_compatible(node, "ibm,opal-xive-vc"); 245 return of_device_is_compatible(node, "ibm,opal-xive-vc");
232} 246}
233 247
248static s64 opal_xive_allocate_irq(u32 chip_id)
249{
250 s64 irq = opal_xive_allocate_irq_raw(chip_id);
251
252 /*
253 * Old versions of skiboot can incorrectly return 0xffffffff to
254 * indicate no space, fix it up here.
255 */
256 return irq == 0xffffffff ? OPAL_RESOURCE : irq;
257}
258
234#ifdef CONFIG_SMP 259#ifdef CONFIG_SMP
235static int xive_native_get_ipi(unsigned int cpu, struct xive_cpu *xc) 260static int xive_native_get_ipi(unsigned int cpu, struct xive_cpu *xc)
236{ 261{
@@ -442,6 +467,7 @@ EXPORT_SYMBOL_GPL(xive_native_sync_queue);
442static const struct xive_ops xive_native_ops = { 467static const struct xive_ops xive_native_ops = {
443 .populate_irq_data = xive_native_populate_irq_data, 468 .populate_irq_data = xive_native_populate_irq_data,
444 .configure_irq = xive_native_configure_irq, 469 .configure_irq = xive_native_configure_irq,
470 .get_irq_config = xive_native_get_irq_config,
445 .setup_queue = xive_native_setup_queue, 471 .setup_queue = xive_native_setup_queue,
446 .cleanup_queue = xive_native_cleanup_queue, 472 .cleanup_queue = xive_native_cleanup_queue,
447 .match = xive_native_match, 473 .match = xive_native_match,
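
The opal_xive_allocate_irq() wrapper above exists because old skiboot returns 0xffffffff ("no space") instead of a negative OPAL error, so a caller that only tests for irq < 0 would treat it as a valid IRQ number. A standalone illustration of the fix-up (OPAL_RESOURCE assumed to be -10, as in opal-api.h; the firmware call is mocked):

#include <stdio.h>
#include <stdint.h>

#define OPAL_RESOURCE	(-10)	/* assumed value */

static int64_t buggy_firmware_alloc(void)
{
	return 0xffffffff;	/* old skiboot: "no space" */
}

static int64_t fixed_up_alloc(void)
{
	int64_t irq = buggy_firmware_alloc();

	/* Same fix-up as the wrapper above. */
	return irq == 0xffffffff ? OPAL_RESOURCE : irq;
}

int main(void)
{
	printf("raw      : %lld (looks valid? %s)\n",
	       (long long)buggy_firmware_alloc(),
	       buggy_firmware_alloc() < 0 ? "no" : "yes");
	printf("fixed up : %lld (error detected? %s)\n",
	       (long long)fixed_up_alloc(),
	       fixed_up_alloc() < 0 ? "yes" : "no");
	return 0;
}
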
diff --git a/arch/powerpc/sysdev/xive/spapr.c b/arch/powerpc/sysdev/xive/spapr.c
index 8ef9cf4ebb1c..33c10749edec 100644
--- a/arch/powerpc/sysdev/xive/spapr.c
+++ b/arch/powerpc/sysdev/xive/spapr.c
@@ -45,7 +45,7 @@ static int xive_irq_bitmap_add(int base, int count)
45{ 45{
46 struct xive_irq_bitmap *xibm; 46 struct xive_irq_bitmap *xibm;
47 47
48 xibm = kzalloc(sizeof(*xibm), GFP_ATOMIC); 48 xibm = kzalloc(sizeof(*xibm), GFP_KERNEL);
49 if (!xibm) 49 if (!xibm)
50 return -ENOMEM; 50 return -ENOMEM;
51 51
@@ -53,6 +53,10 @@ static int xive_irq_bitmap_add(int base, int count)
53 xibm->base = base; 53 xibm->base = base;
54 xibm->count = count; 54 xibm->count = count;
55 xibm->bitmap = kzalloc(xibm->count, GFP_KERNEL); 55 xibm->bitmap = kzalloc(xibm->count, GFP_KERNEL);
56 if (!xibm->bitmap) {
57 kfree(xibm);
58 return -ENOMEM;
59 }
56 list_add(&xibm->list, &xive_irq_bitmaps); 60 list_add(&xibm->list, &xive_irq_bitmaps);
57 61
58 pr_info("Using IRQ range [%x-%x]", xibm->base, 62 pr_info("Using IRQ range [%x-%x]", xibm->base,
@@ -211,6 +215,38 @@ static long plpar_int_set_source_config(unsigned long flags,
211 return 0; 215 return 0;
212} 216}
213 217
218static long plpar_int_get_source_config(unsigned long flags,
219 unsigned long lisn,
220 unsigned long *target,
221 unsigned long *prio,
222 unsigned long *sw_irq)
223{
224 unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
225 long rc;
226
227 pr_devel("H_INT_GET_SOURCE_CONFIG flags=%lx lisn=%lx\n", flags, lisn);
228
229 do {
230 rc = plpar_hcall(H_INT_GET_SOURCE_CONFIG, retbuf, flags, lisn,
231 target, prio, sw_irq);
232 } while (plpar_busy_delay(rc));
233
234 if (rc) {
235 pr_err("H_INT_GET_SOURCE_CONFIG lisn=%ld failed %ld\n",
236 lisn, rc);
237 return rc;
238 }
239
240 *target = retbuf[0];
241 *prio = retbuf[1];
242 *sw_irq = retbuf[2];
243
244 pr_devel("H_INT_GET_SOURCE_CONFIG target=%lx prio=%lx sw_irq=%lx\n",
245 retbuf[0], retbuf[1], retbuf[2]);
246
247 return 0;
248}
249
214static long plpar_int_get_queue_info(unsigned long flags, 250static long plpar_int_get_queue_info(unsigned long flags,
215 unsigned long target, 251 unsigned long target,
216 unsigned long priority, 252 unsigned long priority,
@@ -394,6 +430,24 @@ static int xive_spapr_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq)
394 return rc == 0 ? 0 : -ENXIO; 430 return rc == 0 ? 0 : -ENXIO;
395} 431}
396 432
433static int xive_spapr_get_irq_config(u32 hw_irq, u32 *target, u8 *prio,
434 u32 *sw_irq)
435{
436 long rc;
437 unsigned long h_target;
438 unsigned long h_prio;
439 unsigned long h_sw_irq;
440
441 rc = plpar_int_get_source_config(0, hw_irq, &h_target, &h_prio,
442 &h_sw_irq);
443
444 *target = h_target;
445 *prio = h_prio;
446 *sw_irq = h_sw_irq;
447
448 return rc == 0 ? 0 : -ENXIO;
449}
450
397/* This can be called multiple time to change a queue configuration */ 451/* This can be called multiple time to change a queue configuration */
398static int xive_spapr_configure_queue(u32 target, struct xive_q *q, u8 prio, 452static int xive_spapr_configure_queue(u32 target, struct xive_q *q, u8 prio,
399 __be32 *qpage, u32 order) 453 __be32 *qpage, u32 order)
@@ -586,6 +640,7 @@ static void xive_spapr_sync_source(u32 hw_irq)
586static const struct xive_ops xive_spapr_ops = { 640static const struct xive_ops xive_spapr_ops = {
587 .populate_irq_data = xive_spapr_populate_irq_data, 641 .populate_irq_data = xive_spapr_populate_irq_data,
588 .configure_irq = xive_spapr_configure_irq, 642 .configure_irq = xive_spapr_configure_irq,
643 .get_irq_config = xive_spapr_get_irq_config,
589 .setup_queue = xive_spapr_setup_queue, 644 .setup_queue = xive_spapr_setup_queue,
590 .cleanup_queue = xive_spapr_cleanup_queue, 645 .cleanup_queue = xive_spapr_cleanup_queue,
591 .match = xive_spapr_match, 646 .match = xive_spapr_match,
diff --git a/arch/powerpc/sysdev/xive/xive-internal.h b/arch/powerpc/sysdev/xive/xive-internal.h
index 211725dbf364..59cd366e7933 100644
--- a/arch/powerpc/sysdev/xive/xive-internal.h
+++ b/arch/powerpc/sysdev/xive/xive-internal.h
@@ -33,6 +33,8 @@ struct xive_cpu {
33struct xive_ops { 33struct xive_ops {
34 int (*populate_irq_data)(u32 hw_irq, struct xive_irq_data *data); 34 int (*populate_irq_data)(u32 hw_irq, struct xive_irq_data *data);
35 int (*configure_irq)(u32 hw_irq, u32 target, u8 prio, u32 sw_irq); 35 int (*configure_irq)(u32 hw_irq, u32 target, u8 prio, u32 sw_irq);
36 int (*get_irq_config)(u32 hw_irq, u32 *target, u8 *prio,
37 u32 *sw_irq);
36 int (*setup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio); 38 int (*setup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio);
37 void (*cleanup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio); 39 void (*cleanup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio);
38 void (*setup_cpu)(unsigned int cpu, struct xive_cpu *xc); 40 void (*setup_cpu)(unsigned int cpu, struct xive_cpu *xc);
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 14e56c25879f..d83364ebc5c5 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -2534,13 +2534,16 @@ static void dump_pacas(void)
2534static void dump_one_xive(int cpu) 2534static void dump_one_xive(int cpu)
2535{ 2535{
2536 unsigned int hwid = get_hard_smp_processor_id(cpu); 2536 unsigned int hwid = get_hard_smp_processor_id(cpu);
2537 
2538 	opal_xive_dump(XIVE_DUMP_TM_HYP, hwid);
2539 	opal_xive_dump(XIVE_DUMP_TM_POOL, hwid);
2540 	opal_xive_dump(XIVE_DUMP_TM_OS, hwid);
2541 	opal_xive_dump(XIVE_DUMP_TM_USER, hwid);
2542 	opal_xive_dump(XIVE_DUMP_VP, hwid);
2543 	opal_xive_dump(XIVE_DUMP_EMU_STATE, hwid);
 2537 	bool hv = cpu_has_feature(CPU_FTR_HVMODE);
 2538 
 2539 	if (hv) {
 2540 		opal_xive_dump(XIVE_DUMP_TM_HYP, hwid);
 2541 		opal_xive_dump(XIVE_DUMP_TM_POOL, hwid);
 2542 		opal_xive_dump(XIVE_DUMP_TM_OS, hwid);
 2543 		opal_xive_dump(XIVE_DUMP_TM_USER, hwid);
 2544 		opal_xive_dump(XIVE_DUMP_VP, hwid);
 2545 		opal_xive_dump(XIVE_DUMP_EMU_STATE, hwid);
 2546 	}
2544 2547
2545 if (setjmp(bus_error_jmp) != 0) { 2548 if (setjmp(bus_error_jmp) != 0) {
2546 catch_memory_errors = 0; 2549 catch_memory_errors = 0;
@@ -2569,16 +2572,28 @@ static void dump_all_xives(void)
2569 dump_one_xive(cpu); 2572 dump_one_xive(cpu);
2570} 2573}
2571 2574
2572static void dump_one_xive_irq(u32 num)
 2575static void dump_one_xive_irq(u32 num, struct irq_data *d)
 2576{
 2577 	xmon_xive_get_irq_config(num, d);
 2578}
 2579
 2580static void dump_all_xive_irq(void)
2573{ 2581{
2574 	s64 rc;
2575 	__be64 vp;
2576 	u8 prio;
2577 	__be32 lirq;
2578 
2579 	rc = opal_xive_get_irq_config(num, &vp, &prio, &lirq);
2580 	xmon_printf("IRQ 0x%x config: vp=0x%llx prio=%d lirq=0x%x (rc=%lld)\n",
2581 		    num, be64_to_cpu(vp), prio, be32_to_cpu(lirq), rc);
 2582 	unsigned int i;
 2583 	struct irq_desc *desc;
 2584 
 2585 	for_each_irq_desc(i, desc) {
 2586 		struct irq_data *d = irq_desc_get_irq_data(desc);
 2587 		unsigned int hwirq;
 2588 
 2589 		if (!d)
 2590 			continue;
 2591 
 2592 		hwirq = (unsigned int)irqd_to_hwirq(d);
 2593 		/* IPIs are special (HW number 0) */
 2594 		if (hwirq)
 2595 			dump_one_xive_irq(hwirq, d);
 2596 	}
2582} 2597}
2583 2598
2584static void dump_xives(void) 2599static void dump_xives(void)
@@ -2597,7 +2612,9 @@ static void dump_xives(void)
2597 return; 2612 return;
2598 } else if (c == 'i') { 2613 } else if (c == 'i') {
2599 if (scanhex(&num)) 2614 if (scanhex(&num))
2600 dump_one_xive_irq(num); 2615 dump_one_xive_irq(num, NULL);
2616 else
2617 dump_all_xive_irq();
2601 return; 2618 return;
2602 } 2619 }
2603 2620
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index ea5eac00b327..f933a473b128 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -1,7 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0 1# SPDX-License-Identifier: GPL-2.0
2config ARCH_HAS_MEM_ENCRYPT
3 def_bool y
4
5config MMU 2config MMU
6 def_bool y 3 def_bool y
7 4
@@ -68,6 +65,7 @@ config S390
68 select ARCH_HAS_GCOV_PROFILE_ALL 65 select ARCH_HAS_GCOV_PROFILE_ALL
69 select ARCH_HAS_GIGANTIC_PAGE 66 select ARCH_HAS_GIGANTIC_PAGE
70 select ARCH_HAS_KCOV 67 select ARCH_HAS_KCOV
68 select ARCH_HAS_MEM_ENCRYPT
71 select ARCH_HAS_PTE_SPECIAL 69 select ARCH_HAS_PTE_SPECIAL
72 select ARCH_HAS_SET_MEMORY 70 select ARCH_HAS_SET_MEMORY
73 select ARCH_HAS_STRICT_KERNEL_RWX 71 select ARCH_HAS_STRICT_KERNEL_RWX
diff --git a/arch/s390/include/asm/mem_encrypt.h b/arch/s390/include/asm/mem_encrypt.h
index 3eb018508190..2542cbf7e2d1 100644
--- a/arch/s390/include/asm/mem_encrypt.h
+++ b/arch/s390/include/asm/mem_encrypt.h
@@ -4,10 +4,7 @@
4 4
5#ifndef __ASSEMBLY__ 5#ifndef __ASSEMBLY__
6 6
7#define sme_me_mask 0ULL
 7static inline bool mem_encrypt_active(void) { return false; }
8
9static inline bool sme_active(void) { return false; }
10extern bool sev_active(void);
11 8
12int set_memory_encrypted(unsigned long addr, int numpages); 9int set_memory_encrypted(unsigned long addr, int numpages);
13int set_memory_decrypted(unsigned long addr, int numpages); 10int set_memory_decrypted(unsigned long addr, int numpages);
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 20340a03ad90..a124f19f7b3c 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -156,14 +156,9 @@ int set_memory_decrypted(unsigned long addr, int numpages)
156} 156}
157 157
158/* are we a protected virtualization guest? */ 158/* are we a protected virtualization guest? */
159bool sev_active(void)
160{
161 return is_prot_virt_guest();
162}
163
164bool force_dma_unencrypted(struct device *dev) 159bool force_dma_unencrypted(struct device *dev)
165{ 160{
166 return sev_active(); 161 return is_prot_virt_guest();
167} 162}
168 163
169/* protected virtualization */ 164/* protected virtualization */
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c28ab5c01879..37ed5f5910d5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -68,6 +68,7 @@ config X86
68 select ARCH_HAS_FORTIFY_SOURCE 68 select ARCH_HAS_FORTIFY_SOURCE
69 select ARCH_HAS_GCOV_PROFILE_ALL 69 select ARCH_HAS_GCOV_PROFILE_ALL
70 select ARCH_HAS_KCOV if X86_64 70 select ARCH_HAS_KCOV if X86_64
71 select ARCH_HAS_MEM_ENCRYPT
71 select ARCH_HAS_MEMBARRIER_SYNC_CORE 72 select ARCH_HAS_MEMBARRIER_SYNC_CORE
72 select ARCH_HAS_PMEM_API if X86_64 73 select ARCH_HAS_PMEM_API if X86_64
73 select ARCH_HAS_PTE_DEVMAP if X86_64 74 select ARCH_HAS_PTE_DEVMAP if X86_64
@@ -1526,9 +1527,6 @@ config X86_CPA_STATISTICS
1526 helps to determine the effectiveness of preserving large and huge 1527 helps to determine the effectiveness of preserving large and huge
1527 page mappings when mapping protections are changed. 1528 page mappings when mapping protections are changed.
1528 1529
1529config ARCH_HAS_MEM_ENCRYPT
1530 def_bool y
1531
1532config AMD_MEM_ENCRYPT 1530config AMD_MEM_ENCRYPT
1533 bool "AMD Secure Memory Encryption (SME) support" 1531 bool "AMD Secure Memory Encryption (SME) support"
1534 depends on X86_64 && CPU_SUP_AMD 1532 depends on X86_64 && CPU_SUP_AMD
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 0c196c47d621..848ce43b9040 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -92,6 +92,16 @@ early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0;
92 92
93extern char __start_bss_decrypted[], __end_bss_decrypted[], __start_bss_decrypted_unused[]; 93extern char __start_bss_decrypted[], __end_bss_decrypted[], __start_bss_decrypted_unused[];
94 94
95static inline bool mem_encrypt_active(void)
96{
97 return sme_me_mask;
98}
99
100static inline u64 sme_get_me_mask(void)
101{
102 return sme_me_mask;
103}
104
95#endif /* __ASSEMBLY__ */ 105#endif /* __ASSEMBLY__ */
96 106
97#endif /* __X86_MEM_ENCRYPT_H__ */ 107#endif /* __X86_MEM_ENCRYPT_H__ */
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 22369dd5de3b..045e82e8945b 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -70,3 +70,8 @@ ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
70{ 70{
71 return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, true); 71 return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, true);
72} 72}
73
74ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos)
75{
76 return read_from_oldmem(buf, count, ppos, 0, sev_active());
77}
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index fece30ca8b0c..9268c12458c8 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -344,13 +344,11 @@ bool sme_active(void)
344{ 344{
345 return sme_me_mask && !sev_enabled; 345 return sme_me_mask && !sev_enabled;
346} 346}
347EXPORT_SYMBOL(sme_active);
348 347
349bool sev_active(void) 348bool sev_active(void)
350{ 349{
351 return sme_me_mask && sev_enabled; 350 return sme_me_mask && sev_enabled;
352} 351}
353EXPORT_SYMBOL(sev_active);
354 352
355/* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */ 353/* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */
356bool force_dma_unencrypted(struct device *dev) 354bool force_dma_unencrypted(struct device *dev)
diff --git a/drivers/misc/cxl/main.c b/drivers/misc/cxl/main.c
index 482a2c1b340a..43b312d06e3e 100644
--- a/drivers/misc/cxl/main.c
+++ b/drivers/misc/cxl/main.c
@@ -18,6 +18,7 @@
18#include <linux/sched/task.h> 18#include <linux/sched/task.h>
19 19
20#include <asm/cputable.h> 20#include <asm/cputable.h>
21#include <asm/mmu.h>
21#include <misc/cxl-base.h> 22#include <misc/cxl-base.h>
22 23
23#include "cxl.h" 24#include "cxl.h"
@@ -315,6 +316,9 @@ static int __init init_cxl(void)
315{ 316{
316 int rc = 0; 317 int rc = 0;
317 318
319 if (!tlbie_capable)
320 return -EINVAL;
321
318 if ((rc = cxl_file_init())) 322 if ((rc = cxl_file_init()))
319 return rc; 323 return rc;
320 324
diff --git a/drivers/misc/ocxl/main.c b/drivers/misc/ocxl/main.c
index 7210d9e059be..ef73cf35dda2 100644
--- a/drivers/misc/ocxl/main.c
+++ b/drivers/misc/ocxl/main.c
@@ -2,12 +2,16 @@
2// Copyright 2017 IBM Corp. 2// Copyright 2017 IBM Corp.
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/pci.h> 4#include <linux/pci.h>
5#include <asm/mmu.h>
5#include "ocxl_internal.h" 6#include "ocxl_internal.h"
6 7
7static int __init init_ocxl(void) 8static int __init init_ocxl(void)
8{ 9{
9 int rc = 0; 10 int rc = 0;
10 11
12 if (!tlbie_capable)
13 return -EINVAL;
14
11 rc = ocxl_file_init(); 15 rc = ocxl_file_init();
12 if (rc) 16 if (rc)
13 return rc; 17 return rc;
diff --git a/drivers/pci/hotplug/pnv_php.c b/drivers/pci/hotplug/pnv_php.c
index 6758fd7c382e..d7b2b47bc33e 100644
--- a/drivers/pci/hotplug/pnv_php.c
+++ b/drivers/pci/hotplug/pnv_php.c
@@ -419,9 +419,21 @@ static int pnv_php_get_attention_state(struct hotplug_slot *slot, u8 *state)
419static int pnv_php_set_attention_state(struct hotplug_slot *slot, u8 state) 419static int pnv_php_set_attention_state(struct hotplug_slot *slot, u8 state)
420{ 420{
421 struct pnv_php_slot *php_slot = to_pnv_php_slot(slot); 421 struct pnv_php_slot *php_slot = to_pnv_php_slot(slot);
422 struct pci_dev *bridge = php_slot->pdev;
423 u16 new, mask;
422 424
423 /* FIXME: Make it real once firmware supports it */
424 php_slot->attention_state = state; 425 php_slot->attention_state = state;
426 if (!bridge)
427 return 0;
428
429 mask = PCI_EXP_SLTCTL_AIC;
430
431 if (state)
432 new = PCI_EXP_SLTCTL_ATTN_IND_ON;
433 else
434 new = PCI_EXP_SLTCTL_ATTN_IND_OFF;
435
436 pcie_capability_clear_and_set_word(bridge, PCI_EXP_SLTCTL, mask, new);
425 437
426 return 0; 438 return 0;
427} 439}
@@ -511,6 +523,37 @@ scan:
511 return 0; 523 return 0;
512} 524}
513 525
526static int pnv_php_reset_slot(struct hotplug_slot *slot, int probe)
527{
528 struct pnv_php_slot *php_slot = to_pnv_php_slot(slot);
529 struct pci_dev *bridge = php_slot->pdev;
530 uint16_t sts;
531
532 /*
533 * The CAPI folks want pnv_php to drive OpenCAPI slots
534 * which don't have a bridge. Only claim to support
535 * reset_slot() if we have a bridge device (for now...)
536 */
537 if (probe)
538 return !bridge;
539
540 /* mask our interrupt while resetting the bridge */
541 if (php_slot->irq > 0)
542 disable_irq(php_slot->irq);
543
544 pci_bridge_secondary_bus_reset(bridge);
545
546 /* clear any state changes that happened due to the reset */
547 pcie_capability_read_word(php_slot->pdev, PCI_EXP_SLTSTA, &sts);
548 sts &= (PCI_EXP_SLTSTA_PDC | PCI_EXP_SLTSTA_DLLSC);
549 pcie_capability_write_word(php_slot->pdev, PCI_EXP_SLTSTA, sts);
550
551 if (php_slot->irq > 0)
552 enable_irq(php_slot->irq);
553
554 return 0;
555}
556
514static int pnv_php_enable_slot(struct hotplug_slot *slot) 557static int pnv_php_enable_slot(struct hotplug_slot *slot)
515{ 558{
516 struct pnv_php_slot *php_slot = to_pnv_php_slot(slot); 559 struct pnv_php_slot *php_slot = to_pnv_php_slot(slot);
@@ -548,6 +591,7 @@ static const struct hotplug_slot_ops php_slot_ops = {
548 .set_attention_status = pnv_php_set_attention_state, 591 .set_attention_status = pnv_php_set_attention_state,
549 .enable_slot = pnv_php_enable_slot, 592 .enable_slot = pnv_php_enable_slot,
550 .disable_slot = pnv_php_disable_slot, 593 .disable_slot = pnv_php_disable_slot,
594 .reset_slot = pnv_php_reset_slot,
551}; 595};
552 596
553static void pnv_php_release(struct pnv_php_slot *php_slot) 597static void pnv_php_release(struct pnv_php_slot *php_slot)
@@ -721,6 +765,12 @@ static irqreturn_t pnv_php_interrupt(int irq, void *data)
721 pcie_capability_read_word(pdev, PCI_EXP_SLTSTA, &sts); 765 pcie_capability_read_word(pdev, PCI_EXP_SLTSTA, &sts);
722 sts &= (PCI_EXP_SLTSTA_PDC | PCI_EXP_SLTSTA_DLLSC); 766 sts &= (PCI_EXP_SLTSTA_PDC | PCI_EXP_SLTSTA_DLLSC);
723 pcie_capability_write_word(pdev, PCI_EXP_SLTSTA, sts); 767 pcie_capability_write_word(pdev, PCI_EXP_SLTSTA, sts);
768
769 pci_dbg(pdev, "PCI slot [%s]: HP int! DLAct: %d, PresDet: %d\n",
770 php_slot->name,
771 !!(sts & PCI_EXP_SLTSTA_DLLSC),
772 !!(sts & PCI_EXP_SLTSTA_PDC));
773
724 if (sts & PCI_EXP_SLTSTA_DLLSC) { 774 if (sts & PCI_EXP_SLTSTA_DLLSC) {
725 pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lsts); 775 pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lsts);
726 added = !!(lsts & PCI_EXP_LNKSTA_DLLLA); 776 added = !!(lsts & PCI_EXP_LNKSTA_DLLLA);
@@ -735,6 +785,7 @@ static irqreturn_t pnv_php_interrupt(int irq, void *data)
735 785
736 added = !!(presence == OPAL_PCI_SLOT_PRESENT); 786 added = !!(presence == OPAL_PCI_SLOT_PRESENT);
737 } else { 787 } else {
788 pci_dbg(pdev, "PCI slot [%s]: Spurious IRQ?\n", php_slot->name);
738 return IRQ_NONE; 789 return IRQ_NONE;
739 } 790 }
740 791
@@ -955,6 +1006,9 @@ static int __init pnv_php_init(void)
955 for_each_compatible_node(dn, NULL, "ibm,ioda2-phb") 1006 for_each_compatible_node(dn, NULL, "ibm,ioda2-phb")
956 pnv_php_register(dn); 1007 pnv_php_register(dn);
957 1008
1009 for_each_compatible_node(dn, NULL, "ibm,ioda3-phb")
1010 pnv_php_register(dn);
1011
958 return 0; 1012 return 0;
959} 1013}
960 1014
@@ -964,6 +1018,9 @@ static void __exit pnv_php_exit(void)
964 1018
965 for_each_compatible_node(dn, NULL, "ibm,ioda2-phb") 1019 for_each_compatible_node(dn, NULL, "ibm,ioda2-phb")
966 pnv_php_unregister(dn); 1020 pnv_php_unregister(dn);
1021
1022 for_each_compatible_node(dn, NULL, "ibm,ioda3-phb")
1023 pnv_php_unregister(dn);
967} 1024}
968 1025
969module_init(pnv_php_init); 1026module_init(pnv_php_init);
diff --git a/drivers/pci/hotplug/rpaphp_core.c b/drivers/pci/hotplug/rpaphp_core.c
index bcd5d357ca23..c3899ee1db99 100644
--- a/drivers/pci/hotplug/rpaphp_core.c
+++ b/drivers/pci/hotplug/rpaphp_core.c
@@ -230,7 +230,7 @@ static int rpaphp_check_drc_props_v2(struct device_node *dn, char *drc_name,
230 struct of_drc_info drc; 230 struct of_drc_info drc;
231 const __be32 *value; 231 const __be32 *value;
232 char cell_drc_name[MAX_DRC_NAME_LEN]; 232 char cell_drc_name[MAX_DRC_NAME_LEN];
233 int j, fndit; 233 int j;
234 234
235 info = of_find_property(dn->parent, "ibm,drc-info", NULL); 235 info = of_find_property(dn->parent, "ibm,drc-info", NULL);
236 if (info == NULL) 236 if (info == NULL)
@@ -245,17 +245,13 @@ static int rpaphp_check_drc_props_v2(struct device_node *dn, char *drc_name,
245 245
246 /* Should now know end of current entry */ 246 /* Should now know end of current entry */
247 247
248 if (my_index > drc.last_drc_index) 248 /* Found it */
249 continue; 249 if (my_index <= drc.last_drc_index) {
250 250 sprintf(cell_drc_name, "%s%d", drc.drc_name_prefix,
251 fndit = 1; 251 my_index);
252 break; 252 break;
253 }
253 } 254 }
254 /* Found it */
255
256 if (fndit)
257 sprintf(cell_drc_name, "%s%d", drc.drc_name_prefix,
258 my_index);
259 255
260 if (((drc_name == NULL) || 256 if (((drc_name == NULL) ||
261 (drc_name && !strcmp(drc_name, cell_drc_name))) && 257 (drc_name && !strcmp(drc_name, cell_drc_name))) &&
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 8ce9ad21129f..9809369e0ed3 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -435,7 +435,7 @@ static int tce_iommu_clear(struct tce_container *container,
435 unsigned long oldhpa; 435 unsigned long oldhpa;
436 long ret; 436 long ret;
437 enum dma_data_direction direction; 437 enum dma_data_direction direction;
438 unsigned long lastentry = entry + pages; 438 unsigned long lastentry = entry + pages, firstentry = entry;
439 439
440 for ( ; entry < lastentry; ++entry) { 440 for ( ; entry < lastentry; ++entry) {
441 if (tbl->it_indirect_levels && tbl->it_userspace) { 441 if (tbl->it_indirect_levels && tbl->it_userspace) {
@@ -460,7 +460,7 @@ static int tce_iommu_clear(struct tce_container *container,
460 460
461 direction = DMA_NONE; 461 direction = DMA_NONE;
462 oldhpa = 0; 462 oldhpa = 0;
463 ret = iommu_tce_xchg(container->mm, tbl, entry, &oldhpa, 463 ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry, &oldhpa,
464 &direction); 464 &direction);
465 if (ret) 465 if (ret)
466 continue; 466 continue;
@@ -476,6 +476,8 @@ static int tce_iommu_clear(struct tce_container *container,
476 tce_iommu_unuse_page(container, oldhpa); 476 tce_iommu_unuse_page(container, oldhpa);
477 } 477 }
478 478
479 iommu_tce_kill(tbl, firstentry, pages);
480
479 return 0; 481 return 0;
480} 482}
481 483
@@ -518,8 +520,8 @@ static long tce_iommu_build(struct tce_container *container,
518 520
519 hpa |= offset; 521 hpa |= offset;
520 dirtmp = direction; 522 dirtmp = direction;
521 ret = iommu_tce_xchg(container->mm, tbl, entry + i, &hpa, 523 ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
522 &dirtmp); 524 &hpa, &dirtmp);
523 if (ret) { 525 if (ret) {
524 tce_iommu_unuse_page(container, hpa); 526 tce_iommu_unuse_page(container, hpa);
525 pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n", 527 pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
@@ -536,6 +538,8 @@ static long tce_iommu_build(struct tce_container *container,
536 538
537 if (ret) 539 if (ret)
538 tce_iommu_clear(container, tbl, entry, i); 540 tce_iommu_clear(container, tbl, entry, i);
541 else
542 iommu_tce_kill(tbl, entry, pages);
539 543
540 return ret; 544 return ret;
541} 545}
@@ -572,8 +576,8 @@ static long tce_iommu_build_v2(struct tce_container *container,
572 if (mm_iommu_mapped_inc(mem)) 576 if (mm_iommu_mapped_inc(mem))
573 break; 577 break;
574 578
575 ret = iommu_tce_xchg(container->mm, tbl, entry + i, &hpa, 579 ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
576 &dirtmp); 580 &hpa, &dirtmp);
577 if (ret) { 581 if (ret) {
578 /* dirtmp cannot be DMA_NONE here */ 582 /* dirtmp cannot be DMA_NONE here */
579 tce_iommu_unuse_page_v2(container, tbl, entry + i); 583 tce_iommu_unuse_page_v2(container, tbl, entry + i);
@@ -593,6 +597,8 @@ static long tce_iommu_build_v2(struct tce_container *container,
593 597
594 if (ret) 598 if (ret)
595 tce_iommu_clear(container, tbl, entry, i); 599 tce_iommu_clear(container, tbl, entry, i);
600 else
601 iommu_tce_kill(tbl, entry, pages);
596 602
597 return ret; 603 return ret;
598} 604}
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 7bcc92add72c..7b13988796e1 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -104,9 +104,9 @@ static int pfn_is_ram(unsigned long pfn)
104} 104}
105 105
106/* Reads a page from the oldmem device from given offset. */ 106/* Reads a page from the oldmem device from given offset. */
107static ssize_t read_from_oldmem(char *buf, size_t count, 107ssize_t read_from_oldmem(char *buf, size_t count,
108 u64 *ppos, int userbuf, 108 u64 *ppos, int userbuf,
109 bool encrypted) 109 bool encrypted)
110{ 110{
111 unsigned long pfn, offset; 111 unsigned long pfn, offset;
112 size_t nr_bytes; 112 size_t nr_bytes;
@@ -170,7 +170,7 @@ void __weak elfcorehdr_free(unsigned long long addr)
170 */ 170 */
171ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) 171ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
172{ 172{
173 return read_from_oldmem(buf, count, ppos, 0, sev_active()); 173 return read_from_oldmem(buf, count, ppos, 0, false);
174} 174}
175 175
176/* 176/*
diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h
index f774c5eb9e3c..4664fc1871de 100644
--- a/include/linux/crash_dump.h
+++ b/include/linux/crash_dump.h
@@ -115,4 +115,18 @@ static inline int vmcore_add_device_dump(struct vmcoredd_data *data)
115 return -EOPNOTSUPP; 115 return -EOPNOTSUPP;
116} 116}
117#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ 117#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
118
119#ifdef CONFIG_PROC_VMCORE
120ssize_t read_from_oldmem(char *buf, size_t count,
121 u64 *ppos, int userbuf,
122 bool encrypted);
123#else
124static inline ssize_t read_from_oldmem(char *buf, size_t count,
125 u64 *ppos, int userbuf,
126 bool encrypted)
127{
128 return -EOPNOTSUPP;
129}
130#endif /* CONFIG_PROC_VMCORE */
131
118#endif /* LINUX_CRASHDUMP_H */ 132#endif /* LINUX_CRASHDUMP_H */
diff --git a/include/linux/extable.h b/include/linux/extable.h
index 41c5b3a25f67..81ecfaa83ad3 100644
--- a/include/linux/extable.h
+++ b/include/linux/extable.h
@@ -19,6 +19,8 @@ void trim_init_extable(struct module *m);
19 19
20/* Given an address, look for it in the exception tables */ 20/* Given an address, look for it in the exception tables */
21const struct exception_table_entry *search_exception_tables(unsigned long add); 21const struct exception_table_entry *search_exception_tables(unsigned long add);
22const struct exception_table_entry *
23search_kernel_exception_table(unsigned long addr);
22 24
23#ifdef CONFIG_MODULES 25#ifdef CONFIG_MODULES
24/* For extable.c to search modules' exception tables. */ 26/* For extable.c to search modules' exception tables. */
diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h
index 470bd53a89df..5c4a18a91f89 100644
--- a/include/linux/mem_encrypt.h
+++ b/include/linux/mem_encrypt.h
@@ -18,23 +18,10 @@
18 18
19#else /* !CONFIG_ARCH_HAS_MEM_ENCRYPT */ 19#else /* !CONFIG_ARCH_HAS_MEM_ENCRYPT */
20 20
21#define sme_me_mask 0ULL 21static inline bool mem_encrypt_active(void) { return false; }
22
23static inline bool sme_active(void) { return false; }
24static inline bool sev_active(void) { return false; }
25 22
26#endif /* CONFIG_ARCH_HAS_MEM_ENCRYPT */ 23#endif /* CONFIG_ARCH_HAS_MEM_ENCRYPT */
27 24
28static inline bool mem_encrypt_active(void)
29{
30 return sme_me_mask;
31}
32
33static inline u64 sme_get_me_mask(void)
34{
35 return sme_me_mask;
36}
37
38#ifdef CONFIG_AMD_MEM_ENCRYPT 25#ifdef CONFIG_AMD_MEM_ENCRYPT
39/* 26/*
40 * The __sme_set() and __sme_clr() macros are useful for adding or removing 27 * The __sme_set() and __sme_clr() macros are useful for adding or removing
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 64a3d294f4b4..d9334f31a5af 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -345,12 +345,6 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
345} 345}
346EXPORT_SYMBOL(dma_free_attrs); 346EXPORT_SYMBOL(dma_free_attrs);
347 347
348static inline void dma_check_mask(struct device *dev, u64 mask)
349{
350 if (sme_active() && (mask < (((u64)sme_get_me_mask() << 1) - 1)))
351 dev_warn(dev, "SME is active, device will require DMA bounce buffers\n");
352}
353
354int dma_supported(struct device *dev, u64 mask) 348int dma_supported(struct device *dev, u64 mask)
355{ 349{
356 const struct dma_map_ops *ops = get_dma_ops(dev); 350 const struct dma_map_ops *ops = get_dma_ops(dev);
@@ -381,7 +375,6 @@ int dma_set_mask(struct device *dev, u64 mask)
381 return -EIO; 375 return -EIO;
382 376
383 arch_dma_set_mask(dev, mask); 377 arch_dma_set_mask(dev, mask);
384 dma_check_mask(dev, mask);
385 *dev->dma_mask = mask; 378 *dev->dma_mask = mask;
386 return 0; 379 return 0;
387} 380}
@@ -399,7 +392,6 @@ int dma_set_coherent_mask(struct device *dev, u64 mask)
399 if (!dma_supported(dev, mask)) 392 if (!dma_supported(dev, mask))
400 return -EIO; 393 return -EIO;
401 394
402 dma_check_mask(dev, mask);
403 dev->coherent_dma_mask = mask; 395 dev->coherent_dma_mask = mask;
404 return 0; 396 return 0;
405} 397}
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 796a44f8ef5a..673a2cdb2656 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -463,8 +463,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
463 panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); 463 panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
464 464
465 if (mem_encrypt_active()) 465 if (mem_encrypt_active())
466 pr_warn_once("%s is active and system is using DMA bounce buffers\n", 466 pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n");
467 sme_active() ? "SME" : "SEV");
468 467
469 if (mapping_size > alloc_size) { 468 if (mapping_size > alloc_size) {
470 dev_warn_once(hwdev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)", 469 dev_warn_once(hwdev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)",
diff --git a/kernel/extable.c b/kernel/extable.c
index e23cce6e6092..f6c9406eec7d 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -40,13 +40,20 @@ void __init sort_main_extable(void)
40 } 40 }
41} 41}
42 42
43/* Given an address, look for it in the kernel exception table */
44const
45struct exception_table_entry *search_kernel_exception_table(unsigned long addr)
46{
47 return search_extable(__start___ex_table,
48 __stop___ex_table - __start___ex_table, addr);
49}
50
43/* Given an address, look for it in the exception tables. */ 51/* Given an address, look for it in the exception tables. */
44const struct exception_table_entry *search_exception_tables(unsigned long addr) 52const struct exception_table_entry *search_exception_tables(unsigned long addr)
45{ 53{
46 const struct exception_table_entry *e; 54 const struct exception_table_entry *e;
47 55
48 e = search_extable(__start___ex_table, 56 e = search_kernel_exception_table(addr);
49 __stop___ex_table - __start___ex_table, addr);
50 if (!e) 57 if (!e)
51 e = search_module_extables(addr); 58 e = search_module_extables(addr);
52 return e; 59 return e;
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 8dfd5021b933..7950a0356042 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -276,7 +276,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
276 int index = task->curr_ret_stack; 276 int index = task->curr_ret_stack;
277 int i; 277 int i;
278 278
279 if (ret != (unsigned long)return_to_handler) 279 if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler))
280 return ret; 280 return ret;
281 281
282 if (index < 0) 282 if (index < 0)
@@ -294,7 +294,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
294{ 294{
295 int task_idx; 295 int task_idx;
296 296
297 if (ret != (unsigned long)return_to_handler) 297 if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler))
298 return ret; 298 return ret;
299 299
300 task_idx = task->curr_ret_stack; 300 task_idx = task->curr_ret_stack;
diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile
index b3ad909aefbc..644770c3b754 100644
--- a/tools/testing/selftests/powerpc/Makefile
+++ b/tools/testing/selftests/powerpc/Makefile
@@ -26,6 +26,7 @@ SUB_DIRS = alignment \
26 switch_endian \ 26 switch_endian \
27 syscalls \ 27 syscalls \
28 tm \ 28 tm \
29 eeh \
29 vphn \ 30 vphn \
30 math \ 31 math \
31 ptrace \ 32 ptrace \
diff --git a/tools/testing/selftests/powerpc/copyloops/.gitignore b/tools/testing/selftests/powerpc/copyloops/.gitignore
index ce12cd0e2967..12ef5b031974 100644
--- a/tools/testing/selftests/powerpc/copyloops/.gitignore
+++ b/tools/testing/selftests/powerpc/copyloops/.gitignore
@@ -1,13 +1,14 @@
1copyuser_64_t0 1copyuser_64_t0
2copyuser_64_t1 2copyuser_64_t1
3copyuser_64_t2 3copyuser_64_t2
4copyuser_power7_t0 4copyuser_p7_t0
5copyuser_power7_t1 5copyuser_p7_t1
6memcpy_64_t0 6memcpy_64_t0
7memcpy_64_t1 7memcpy_64_t1
8memcpy_64_t2 8memcpy_64_t2
9memcpy_power7_t0 9memcpy_p7_t0
10memcpy_power7_t1 10memcpy_p7_t1
11copyuser_64_exc_t0 11copyuser_64_exc_t0
12copyuser_64_exc_t1 12copyuser_64_exc_t1
13copyuser_64_exc_t2 13copyuser_64_exc_t2
14memcpy_mcsafe_64
diff --git a/tools/testing/selftests/powerpc/copyloops/Makefile b/tools/testing/selftests/powerpc/copyloops/Makefile
index 44574f3818b3..0917983a1c78 100644
--- a/tools/testing/selftests/powerpc/copyloops/Makefile
+++ b/tools/testing/selftests/powerpc/copyloops/Makefile
@@ -12,7 +12,7 @@ ASFLAGS = $(CFLAGS) -Wa,-mpower4
12TEST_GEN_PROGS := copyuser_64_t0 copyuser_64_t1 copyuser_64_t2 \ 12TEST_GEN_PROGS := copyuser_64_t0 copyuser_64_t1 copyuser_64_t2 \
13 copyuser_p7_t0 copyuser_p7_t1 \ 13 copyuser_p7_t0 copyuser_p7_t1 \
14 memcpy_64_t0 memcpy_64_t1 memcpy_64_t2 \ 14 memcpy_64_t0 memcpy_64_t1 memcpy_64_t2 \
15 memcpy_p7_t0 memcpy_p7_t1 \ 15 memcpy_p7_t0 memcpy_p7_t1 memcpy_mcsafe_64 \
16 copyuser_64_exc_t0 copyuser_64_exc_t1 copyuser_64_exc_t2 16 copyuser_64_exc_t0 copyuser_64_exc_t1 copyuser_64_exc_t2
17 17
18EXTRA_SOURCES := validate.c ../harness.c stubs.S 18EXTRA_SOURCES := validate.c ../harness.c stubs.S
@@ -45,6 +45,11 @@ $(OUTPUT)/memcpy_p7_t%: memcpy_power7.S $(EXTRA_SOURCES)
45 -D SELFTEST_CASE=$(subst memcpy_p7_t,,$(notdir $@)) \ 45 -D SELFTEST_CASE=$(subst memcpy_p7_t,,$(notdir $@)) \
46 -o $@ $^ 46 -o $@ $^
47 47
48$(OUTPUT)/memcpy_mcsafe_64: memcpy_mcsafe_64.S $(EXTRA_SOURCES)
49 $(CC) $(CPPFLAGS) $(CFLAGS) \
50 -D COPY_LOOP=test_memcpy_mcsafe \
51 -o $@ $^
52
48$(OUTPUT)/copyuser_64_exc_t%: copyuser_64.S exc_validate.c ../harness.c \ 53$(OUTPUT)/copyuser_64_exc_t%: copyuser_64.S exc_validate.c ../harness.c \
49 copy_tofrom_user_reference.S stubs.S 54 copy_tofrom_user_reference.S stubs.S
50 $(CC) $(CPPFLAGS) $(CFLAGS) \ 55 $(CC) $(CPPFLAGS) $(CFLAGS) \
diff --git a/tools/testing/selftests/powerpc/copyloops/asm/export.h b/tools/testing/selftests/powerpc/copyloops/asm/export.h
index 05c1663c89b0..e6b80d5fbd14 100644
--- a/tools/testing/selftests/powerpc/copyloops/asm/export.h
+++ b/tools/testing/selftests/powerpc/copyloops/asm/export.h
@@ -1,3 +1,4 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1/* SPDX-License-Identifier: GPL-2.0 */
2#define EXPORT_SYMBOL(x) 2#define EXPORT_SYMBOL(x)
3#define EXPORT_SYMBOL_GPL(x)
3#define EXPORT_SYMBOL_KASAN(x) 4#define EXPORT_SYMBOL_KASAN(x)
diff --git a/tools/testing/selftests/powerpc/copyloops/memcpy_mcsafe_64.S b/tools/testing/selftests/powerpc/copyloops/memcpy_mcsafe_64.S
new file mode 120000
index 000000000000..f0feef3062f6
--- /dev/null
+++ b/tools/testing/selftests/powerpc/copyloops/memcpy_mcsafe_64.S
@@ -0,0 +1 @@
../../../../../arch/powerpc/lib/memcpy_mcsafe_64.S \ No newline at end of file
diff --git a/tools/testing/selftests/powerpc/eeh/Makefile b/tools/testing/selftests/powerpc/eeh/Makefile
new file mode 100644
index 000000000000..b397babd569b
--- /dev/null
+++ b/tools/testing/selftests/powerpc/eeh/Makefile
@@ -0,0 +1,9 @@
1# SPDX-License-Identifier: GPL-2.0
2noarg:
3 $(MAKE) -C ../
4
5TEST_PROGS := eeh-basic.sh
6TEST_FILES := eeh-functions.sh
7
8top_srcdir = ../../../../..
9include ../../lib.mk
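For context on how the new eeh directory gets exercised: it is pulled in via the SUB_DIRS addition in tools/testing/selftests/powerpc/Makefile and the standard kselftest lib.mk include above, so a plausible invocation (an assumption rather than part of this series: a root shell at the top of a built kernel tree, on an EEH-capable system with debugfs mounted) looks like:

    # Run only the new EEH selftest through the kselftest harness (needs root)
    make -C tools/testing/selftests/powerpc/eeh run_tests
    # Or build and run the whole powerpc selftest target
    make -C tools/testing/selftests TARGETS=powerpc run_tests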
diff --git a/tools/testing/selftests/powerpc/eeh/eeh-basic.sh b/tools/testing/selftests/powerpc/eeh/eeh-basic.sh
new file mode 100755
index 000000000000..f988d2f42e8f
--- /dev/null
+++ b/tools/testing/selftests/powerpc/eeh/eeh-basic.sh
@@ -0,0 +1,82 @@
1#!/bin/sh
2# SPDX-License-Identifier: GPL-2.0-only
3
4. ./eeh-functions.sh
5
6if ! eeh_supported ; then
7 echo "EEH not supported on this system, skipping"
8 exit 0;
9fi
10
11if [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_check" ] && \
12 [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_break" ] ; then
13 echo "debugfs EEH testing files are missing. Is debugfs mounted?"
14 exit 1;
15fi
16
17pre_lspci=`mktemp`
18lspci > $pre_lspci
19
20# Bump the max freeze count to something absurd so we don't
21# trip over it while breaking things.
22echo 5000 > /sys/kernel/debug/powerpc/eeh_max_freezes
23
24# record the devices that we break in here. Assuming everything
25# goes to plan we should get them back once the recovery process
26# is finished.
27devices=""
28
29# Build up a list of candidate devices.
30for dev in `ls -1 /sys/bus/pci/devices/ | grep '\.0$'` ; do
31 # skip bridges since we can't recover them (yet...)
32 if [ -e "/sys/bus/pci/devices/$dev/pci_bus" ] ; then
33 echo "$dev, Skipped: bridge"
34 continue;
35 fi
36
37 # Skip VFs for now since we don't have a reliable way
38 # to break them.
39 if [ -e "/sys/bus/pci/devices/$dev/physfn" ] ; then
40 echo "$dev, Skipped: virtfn"
41 continue;
42 fi
43
44 # Don't inject errors into an already-frozen PE. This happens with
45 # PEs that contain multiple PCI devices (e.g. multi-function cards)
46 # and injecting new errors during the recovery process will probably
47 # result in the recovery failing and the device being marked as
48 # failed.
49 if ! pe_ok $dev ; then
50 echo "$dev, Skipped: Bad initial PE state"
51 continue;
52 fi
53
54 echo "$dev, Added"
55
56 # Add to the list of devices to check
57 devices="$devices $dev"
58done
59
60dev_count="$(echo $devices | wc -w)"
61echo "Found ${dev_count} breakable devices..."
62
63failed=0
64for dev in $devices ; do
65 echo "Breaking $dev..."
66
67 if ! pe_ok $dev ; then
68 echo "Skipping $dev, Initial PE state is not ok"
69 failed="$((failed + 1))"
70 continue;
71 fi
72
73 if ! eeh_one_dev $dev ; then
74 failed="$((failed + 1))"
75 fi
76done
77
78echo "$failed devices failed to recover ($dev_count tested)"
79lspci | diff -u $pre_lspci -
80rm -f $pre_lspci
81
82exit $failed
diff --git a/tools/testing/selftests/powerpc/eeh/eeh-functions.sh b/tools/testing/selftests/powerpc/eeh/eeh-functions.sh
new file mode 100755
index 000000000000..26112ab5cdf4
--- /dev/null
+++ b/tools/testing/selftests/powerpc/eeh/eeh-functions.sh
@@ -0,0 +1,76 @@
1#!/bin/sh
2# SPDX-License-Identifier: GPL-2.0-only
3
4pe_ok() {
5 local dev="$1"
6 local path="/sys/bus/pci/devices/$dev/eeh_pe_state"
7
8 if ! [ -e "$path" ] ; then
9 return 1;
10 fi
11
12 local fw_state="$(cut -d' ' -f1 < $path)"
13 local sw_state="$(cut -d' ' -f2 < $path)"
14
15 # If EEH_PE_ISOLATED or EEH_PE_RECOVERING are set then the PE is in an
16 # error state or being recovered. Either way, not ok.
17 if [ "$((sw_state & 0x3))" -ne 0 ] ; then
18 return 1
19 fi
20
21 # A functioning PE should have the EEH_STATE_MMIO_ACTIVE and
22 # EEH_STATE_DMA_ACTIVE flags set. For some goddamn stupid reason
23 # the platform backends set these when the PE is in reset. The
24 # RECOVERING check above should stop any false positives though.
25 if [ "$((fw_state & 0x18))" -ne "$((0x18))" ] ; then
26 return 1
27 fi
28
29 return 0;
30}
31
32eeh_supported() {
33 test -e /proc/powerpc/eeh && \
34 grep -q 'EEH Subsystem is enabled' /proc/powerpc/eeh
35}
36
37eeh_one_dev() {
38 local dev="$1"
39
40 # Using this function from the command line is sometimes useful for
41 # testing so check that the argument is a well-formed sysfs device
42 # name.
43 if ! test -e /sys/bus/pci/devices/$dev/ ; then
44 echo "Error: '$dev' must be a sysfs device name (DDDD:BB:DD.F)"
45 return 1;
46 fi
47
48 # Break it
49 echo $dev >/sys/kernel/debug/powerpc/eeh_dev_break
50
51 # Force an EEH device check. If the kernel has already
52 # noticed the EEH (due to a driver poll or whatever), this
53 # is a no-op.
54 echo $dev >/sys/kernel/debug/powerpc/eeh_dev_check
55
56 # Enforce a 30s timeout for recovery. Even the IPR, which is infamously
57 # slow to reset, should recover within 30s.
58 max_wait=30
59
60 for i in `seq 0 ${max_wait}` ; do
61 if pe_ok $dev ; then
62 break;
63 fi
64 echo "$dev, waited $i/${max_wait}"
65 sleep 1
66 done
67
68 if ! pe_ok $dev ; then
69 echo "$dev, Failed to recover!"
70 return 1;
71 fi
72
73 echo "$dev, Recovered after $i seconds"
74 return 0;
75}
76
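The helpers defined above (pe_ok, eeh_supported, eeh_one_dev) can also be used by hand when poking at a single adapter. A minimal sketch, assuming a root shell in this directory, debugfs mounted, and a placeholder device address (0000:01:00.0 is only an example; pick a real one from lspci -D):

    # Source the helpers, then break one device and wait for EEH recovery
    . ./eeh-functions.sh
    eeh_supported || { echo "EEH not supported here"; exit 0; }
    dev="0000:01:00.0"                   # placeholder sysfs address
    pe_ok "$dev" && eeh_one_dev "$dev"   # injects the error, then polls pe_ok for up to 30s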
diff --git a/tools/testing/selftests/powerpc/ptrace/.gitignore b/tools/testing/selftests/powerpc/ptrace/.gitignore
index 07ec449a2767..dce19f221c46 100644
--- a/tools/testing/selftests/powerpc/ptrace/.gitignore
+++ b/tools/testing/selftests/powerpc/ptrace/.gitignore
@@ -10,3 +10,6 @@ ptrace-tm-spd-vsx
10ptrace-tm-spr 10ptrace-tm-spr
11ptrace-hwbreak 11ptrace-hwbreak
12perf-hwbreak 12perf-hwbreak
13core-pkey
14ptrace-pkey
15ptrace-syscall
diff --git a/tools/testing/selftests/powerpc/security/.gitignore b/tools/testing/selftests/powerpc/security/.gitignore
new file mode 100644
index 000000000000..0b969fba3beb
--- /dev/null
+++ b/tools/testing/selftests/powerpc/security/.gitignore
@@ -0,0 +1 @@
rfi_flush
diff --git a/tools/testing/selftests/powerpc/stringloops/.gitignore b/tools/testing/selftests/powerpc/stringloops/.gitignore
index 0b43da74ee46..31a17e0ba884 100644
--- a/tools/testing/selftests/powerpc/stringloops/.gitignore
+++ b/tools/testing/selftests/powerpc/stringloops/.gitignore
@@ -1 +1,4 @@
1memcmp 1memcmp_64
2memcmp_32
3strlen
4strlen_32
diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-fpu.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-fpu.c
index d57c2d2ab6ec..254f912ad611 100644
--- a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-fpu.c
+++ b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-fpu.c
@@ -5,10 +5,11 @@
5 * Test the kernel's signal frame code. 5 * Test the kernel's signal frame code.
6 * 6 *
7 * The kernel sets up two sets of ucontexts if the signal was to be 7 * The kernel sets up two sets of ucontexts if the signal was to be
8 * delivered while the thread was in a transaction. 8 * delivered while the thread was in a transaction (referred to as
9 * first and second contexts).
9 * Expected behaviour is that the checkpointed state is in the user 10 * Expected behaviour is that the checkpointed state is in the user
10 * context passed to the signal handler. The speculated state can be 11 * context passed to the signal handler (first context). The speculated
11 * accessed with the uc_link pointer. 12 * state can be accessed with the uc_link pointer (second context).
12 * 13 *
13 * The rationale for this is that if TM unaware code (which linked 14 * The rationale for this is that if TM unaware code (which linked
14 * against TM libs) installs a signal handler it will not know of the 15 * against TM libs) installs a signal handler it will not know of the
@@ -28,17 +29,20 @@
28 29
29#define MAX_ATTEMPT 500000 30#define MAX_ATTEMPT 500000
30 31
31#define NV_FPU_REGS 18 32#define NV_FPU_REGS 18 /* Number of non-volatile FP registers */
33#define FPR14 14 /* First non-volatile FP register to check in f14-31 subset */
32 34
33long tm_signal_self_context_load(pid_t pid, long *gprs, double *fps, vector int *vms, vector int *vss); 35long tm_signal_self_context_load(pid_t pid, long *gprs, double *fps, vector int *vms, vector int *vss);
34 36
35/* Be sure there are 2x as many as there are NV FPU regs (2x18) */ 37/* Test only non-volatile registers, i.e. 18 fpr registers from f14 to f31 */
36static double fps[] = { 38static double fps[] = {
39 /* First context will be set with these values, i.e. non-speculative */
37 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 40 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
41 /* Second context will be set with these values, i.e. speculative */
38 -1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18 42 -1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18
39}; 43};
40 44
41static sig_atomic_t fail; 45static sig_atomic_t fail, broken;
42 46
43static void signal_usr1(int signum, siginfo_t *info, void *uc) 47static void signal_usr1(int signum, siginfo_t *info, void *uc)
44{ 48{
@@ -46,11 +50,24 @@ static void signal_usr1(int signum, siginfo_t *info, void *uc)
46 ucontext_t *ucp = uc; 50 ucontext_t *ucp = uc;
47 ucontext_t *tm_ucp = ucp->uc_link; 51 ucontext_t *tm_ucp = ucp->uc_link;
48 52
49 for (i = 0; i < NV_FPU_REGS && !fail; i++) { 53 for (i = 0; i < NV_FPU_REGS; i++) {
50 fail = (ucp->uc_mcontext.fp_regs[i + 14] != fps[i]); 54 /* Check first context. Print all mismatches. */
51 fail |= (tm_ucp->uc_mcontext.fp_regs[i + 14] != fps[i + NV_FPU_REGS]); 55 fail = (ucp->uc_mcontext.fp_regs[FPR14 + i] != fps[i]);
52 if (fail) 56 if (fail) {
53 printf("Failed on %d FP %g or %g\n", i, ucp->uc_mcontext.fp_regs[i + 14], tm_ucp->uc_mcontext.fp_regs[i + 14]); 57 broken = 1;
58 printf("FPR%d (1st context) == %g instead of %g (expected)\n",
59 FPR14 + i, ucp->uc_mcontext.fp_regs[FPR14 + i], fps[i]);
60 }
61 }
62
63 for (i = 0; i < NV_FPU_REGS; i++) {
64 /* Check second context. Print all mismatches. */
65 fail = (tm_ucp->uc_mcontext.fp_regs[FPR14 + i] != fps[NV_FPU_REGS + i]);
66 if (fail) {
67 broken = 1;
68 printf("FPR%d (2nd context) == %g instead of %g (expected)\n",
69 FPR14 + i, tm_ucp->uc_mcontext.fp_regs[FPR14 + i], fps[NV_FPU_REGS + i]);
70 }
54 } 71 }
55} 72}
56 73
@@ -72,13 +89,19 @@ static int tm_signal_context_chk_fpu()
72 } 89 }
73 90
74 i = 0; 91 i = 0;
75 while (i < MAX_ATTEMPT && !fail) { 92 while (i < MAX_ATTEMPT && !broken) {
93 /*
94 * tm_signal_self_context_load will set both first and second
95 * contexts according to the values passed through non-NULL
96 * array pointers to it, in that case 'fps', and invoke the
97 * signal handler installed for SIGUSR1.
98 */
76 rc = tm_signal_self_context_load(pid, NULL, fps, NULL, NULL); 99 rc = tm_signal_self_context_load(pid, NULL, fps, NULL, NULL);
77 FAIL_IF(rc != pid); 100 FAIL_IF(rc != pid);
78 i++; 101 i++;
79 } 102 }
80 103
81 return fail; 104 return (broken);
82} 105}
83 106
84int main(void) 107int main(void)
diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-gpr.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-gpr.c
index 4d05f8b0254c..0cc680f61828 100644
--- a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-gpr.c
+++ b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-gpr.c
@@ -5,10 +5,11 @@
5 * Test the kernel's signal frame code. 5 * Test the kernel's signal frame code.
6 * 6 *
7 * The kernel sets up two sets of ucontexts if the signal was to be 7 * The kernel sets up two sets of ucontexts if the signal was to be
8 * delivered while the thread was in a transaction. 8 * delivered while the thread was in a transaction (referred to as
9 * first and second contexts).
9 * Expected behaviour is that the checkpointed state is in the user 10 * Expected behaviour is that the checkpointed state is in the user
10 * context passed to the signal handler. The speculated state can be 11 * context passed to the signal handler (first context). The speculated
11 * accessed with the uc_link pointer. 12 * state can be accessed with the uc_link pointer (second context).
12 * 13 *
13 * The rationale for this is that if TM unaware code (which linked 14 * The rationale for this is that if TM unaware code (which linked
14 * against TM libs) installs a signal handler it will not know of the 15 * against TM libs) installs a signal handler it will not know of the
@@ -28,14 +29,22 @@
28 29
29#define MAX_ATTEMPT 500000 30#define MAX_ATTEMPT 500000
30 31
31#define NV_GPR_REGS 18 32#define NV_GPR_REGS 18 /* Number of non-volatile GPR registers */
33#define R14 14 /* First non-volatile register to check in r14-r31 subset */
32 34
33long tm_signal_self_context_load(pid_t pid, long *gprs, double *fps, vector int *vms, vector int *vss); 35long tm_signal_self_context_load(pid_t pid, long *gprs, double *fps, vector int *vms, vector int *vss);
34 36
35static sig_atomic_t fail; 37static sig_atomic_t fail, broken;
36 38
37static long gps[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 39/* Test only non-volatile general purpose registers, i.e. r14-r31 */
38 -1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18}; 40static long gprs[] = {
41 /* First context will be set with these values, i.e. non-speculative */
42 /* R14, R15, ... */
43 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
44 /* Second context will be set with these values, i.e. speculative */
45 /* R14, R15, ... */
46 -1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18
47};
39 48
40static void signal_usr1(int signum, siginfo_t *info, void *uc) 49static void signal_usr1(int signum, siginfo_t *info, void *uc)
41{ 50{
@@ -43,12 +52,24 @@ static void signal_usr1(int signum, siginfo_t *info, void *uc)
43 ucontext_t *ucp = uc; 52 ucontext_t *ucp = uc;
44 ucontext_t *tm_ucp = ucp->uc_link; 53 ucontext_t *tm_ucp = ucp->uc_link;
45 54
46 for (i = 0; i < NV_GPR_REGS && !fail; i++) { 55 /* Check first context. Print all mismatches. */
47 fail = (ucp->uc_mcontext.gp_regs[i + 14] != gps[i]); 56 for (i = 0; i < NV_GPR_REGS; i++) {
48 fail |= (tm_ucp->uc_mcontext.gp_regs[i + 14] != gps[i + NV_GPR_REGS]); 57 fail = (ucp->uc_mcontext.gp_regs[R14 + i] != gprs[i]);
49 if (fail) 58 if (fail) {
50 printf("Failed on %d GPR %lu or %lu\n", i, 59 broken = 1;
51 ucp->uc_mcontext.gp_regs[i + 14], tm_ucp->uc_mcontext.gp_regs[i + 14]); 60 printf("GPR%d (1st context) == %lu instead of %lu (expected)\n",
61 R14 + i, ucp->uc_mcontext.gp_regs[R14 + i], gprs[i]);
62 }
63 }
64
65 /* Check second context. Print all mismatches. */
66 for (i = 0; i < NV_GPR_REGS; i++) {
67 fail = (tm_ucp->uc_mcontext.gp_regs[R14 + i] != gprs[NV_GPR_REGS + i]);
68 if (fail) {
69 broken = 1;
70 printf("GPR%d (2nd context) == %lu instead of %lu (expected)\n",
71 R14 + i, tm_ucp->uc_mcontext.gp_regs[R14 + i], gprs[NV_GPR_REGS + i]);
72 }
52 } 73 }
53} 74}
54 75
@@ -70,13 +91,19 @@ static int tm_signal_context_chk_gpr()
70 } 91 }
71 92
72 i = 0; 93 i = 0;
73 while (i < MAX_ATTEMPT && !fail) { 94 while (i < MAX_ATTEMPT && !broken) {
74 rc = tm_signal_self_context_load(pid, gps, NULL, NULL, NULL); 95 /*
96 * tm_signal_self_context_load will set both first and second
97 * contexts according to the values passed through non-NULL
98 * array pointers to it, in that case 'gprs', and invoke the
99 * signal handler installed for SIGUSR1.
100 */
101 rc = tm_signal_self_context_load(pid, gprs, NULL, NULL, NULL);
75 FAIL_IF(rc != pid); 102 FAIL_IF(rc != pid);
76 i++; 103 i++;
77 } 104 }
78 105
79 return fail; 106 return broken;
80} 107}
81 108
82int main(void) 109int main(void)
diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vmx.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vmx.c
index 48ad01499b1a..b6d52730a0d8 100644
--- a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vmx.c
+++ b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vmx.c
@@ -5,10 +5,11 @@
5 * Test the kernel's signal frame code. 5 * Test the kernel's signal frame code.
6 * 6 *
7 * The kernel sets up two sets of ucontexts if the signal was to be 7 * The kernel sets up two sets of ucontexts if the signal was to be
8 * delivered while the thread was in a transaction. 8 * delivered while the thread was in a transaction (referred to as
9 * first and second contexts).
9 * Expected behaviour is that the checkpointed state is in the user 10 * Expected behaviour is that the checkpointed state is in the user
10 * context passed to the signal handler. The speculated state can be 11 * context passed to the signal handler (first context). The speculated
11 * accessed with the uc_link pointer. 12 * state can be accessed with the uc_link pointer (second context).
12 * 13 *
13 * The rationale for this is that if TM unaware code (which linked 14 * The rationale for this is that if TM unaware code (which linked
14 * against TM libs) installs a signal handler it will not know of the 15 * against TM libs) installs a signal handler it will not know of the
@@ -29,18 +30,24 @@
29 30
30#define MAX_ATTEMPT 500000 31#define MAX_ATTEMPT 500000
31 32
32#define NV_VMX_REGS 12 33#define NV_VMX_REGS 12 /* Number of non-volatile VMX registers */
34#define VMX20 20 /* First non-volatile register to check in vr20-31 subset */
33 35
34long tm_signal_self_context_load(pid_t pid, long *gprs, double *fps, vector int *vms, vector int *vss); 36long tm_signal_self_context_load(pid_t pid, long *gprs, double *fps, vector int *vms, vector int *vss);
35 37
36static sig_atomic_t fail; 38static sig_atomic_t fail, broken;
37 39
40/* Test only non-volatile registers, i.e. 12 vmx registers from vr20 to vr31 */
38vector int vms[] = { 41vector int vms[] = {
39 {1, 2, 3, 4 },{5, 6, 7, 8 },{9, 10,11,12}, 42 /* First context will be set with these values, i.e. non-speculative */
43 /* VMX20 , VMX21 , ... */
44 { 1, 2, 3, 4},{ 5, 6, 7, 8},{ 9,10,11,12},
40 {13,14,15,16},{17,18,19,20},{21,22,23,24}, 45 {13,14,15,16},{17,18,19,20},{21,22,23,24},
41 {25,26,27,28},{29,30,31,32},{33,34,35,36}, 46 {25,26,27,28},{29,30,31,32},{33,34,35,36},
42 {37,38,39,40},{41,42,43,44},{45,46,47,48}, 47 {37,38,39,40},{41,42,43,44},{45,46,47,48},
43 {-1, -2, -3, -4}, {-5, -6, -7, -8}, {-9, -10,-11,-12}, 48 /* Second context will be set with these values, i.e. speculative */
49 /* VMX20 , VMX21 , ... */
50 { -1, -2, -3, -4},{ -5, -6, -7, -8},{ -9,-10,-11,-12},
44 {-13,-14,-15,-16},{-17,-18,-19,-20},{-21,-22,-23,-24}, 51 {-13,-14,-15,-16},{-17,-18,-19,-20},{-21,-22,-23,-24},
45 {-25,-26,-27,-28},{-29,-30,-31,-32},{-33,-34,-35,-36}, 52 {-25,-26,-27,-28},{-29,-30,-31,-32},{-33,-34,-35,-36},
46 {-37,-38,-39,-40},{-41,-42,-43,-44},{-45,-46,-47,-48} 53 {-37,-38,-39,-40},{-41,-42,-43,-44},{-45,-46,-47,-48}
@@ -48,26 +55,43 @@ vector int vms[] = {
48 55
49static void signal_usr1(int signum, siginfo_t *info, void *uc) 56static void signal_usr1(int signum, siginfo_t *info, void *uc)
50{ 57{
51 int i; 58 int i, j;
52 ucontext_t *ucp = uc; 59 ucontext_t *ucp = uc;
53 ucontext_t *tm_ucp = ucp->uc_link; 60 ucontext_t *tm_ucp = ucp->uc_link;
54 61
55 for (i = 0; i < NV_VMX_REGS && !fail; i++) { 62 for (i = 0; i < NV_VMX_REGS; i++) {
56 fail = memcmp(ucp->uc_mcontext.v_regs->vrregs[i + 20], 63 /* Check first context. Print all mismatches. */
64 fail = memcmp(ucp->uc_mcontext.v_regs->vrregs[VMX20 + i],
57 &vms[i], sizeof(vector int)); 65 &vms[i], sizeof(vector int));
58 fail |= memcmp(tm_ucp->uc_mcontext.v_regs->vrregs[i + 20],
59 &vms[i + NV_VMX_REGS], sizeof (vector int));
60
61 if (fail) { 66 if (fail) {
62 int j; 67 broken = 1;
68 printf("VMX%d (1st context) == 0x", VMX20 + i);
69 /* Print actual value in first context. */
70 for (j = 0; j < 4; j++)
71 printf("%08x", ucp->uc_mcontext.v_regs->vrregs[VMX20 + i][j]);
72 printf(" instead of 0x");
73 /* Print expected value. */
74 for (j = 0; j < 4; j++)
75 printf("%08x", vms[i][j]);
76 printf(" (expected)\n");
77 }
78 }
63 79
64 fprintf(stderr, "Failed on %d vmx 0x", i); 80 for (i = 0; i < NV_VMX_REGS; i++) {
81 /* Check second context. Print all mismatches. */
82 fail = memcmp(tm_ucp->uc_mcontext.v_regs->vrregs[VMX20 + i],
83 &vms[NV_VMX_REGS + i], sizeof (vector int));
84 if (fail) {
85 broken = 1;
86 printf("VMX%d (2nd context) == 0x", NV_VMX_REGS + i);
87 /* Print actual value in second context. */
88 for (j = 0; j < 4; j++)
89 printf("%08x", tm_ucp->uc_mcontext.v_regs->vrregs[VMX20 + i][j]);
90 printf(" instead of 0x");
91 /* Print expected value. */
65 for (j = 0; j < 4; j++) 92 for (j = 0; j < 4; j++)
66 fprintf(stderr, "%04x", ucp->uc_mcontext.v_regs->vrregs[i + 20][j]); 93 printf("%08x", vms[NV_VMX_REGS + i][j]);
67 fprintf(stderr, " vs 0x"); 94 printf(" (expected)\n");
68 for (j = 0 ; j < 4; j++)
69 fprintf(stderr, "%04x", tm_ucp->uc_mcontext.v_regs->vrregs[i + 20][j]);
70 fprintf(stderr, "\n");
71 } 95 }
72 } 96 }
73} 97}
@@ -90,13 +114,19 @@ static int tm_signal_context_chk()
90 } 114 }
91 115
92 i = 0; 116 i = 0;
93 while (i < MAX_ATTEMPT && !fail) { 117 while (i < MAX_ATTEMPT && !broken) {
118 /*
119 * tm_signal_self_context_load will set both first and second
120 * contexts according to the values passed through non-NULL
121 * array pointers to it, in that case 'vms', and invoke the
122 * signal handler installed for SIGUSR1.
123 */
94 rc = tm_signal_self_context_load(pid, NULL, NULL, vms, NULL); 124 rc = tm_signal_self_context_load(pid, NULL, NULL, vms, NULL);
95 FAIL_IF(rc != pid); 125 FAIL_IF(rc != pid);
96 i++; 126 i++;
97 } 127 }
98 128
99 return fail; 129 return (broken);
100} 130}
101 131
102int main(void) 132int main(void)
diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vsx.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vsx.c
index 8c8677a408bb..8e25e2072ecd 100644
--- a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vsx.c
+++ b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vsx.c
@@ -5,10 +5,11 @@
5 * Test the kernel's signal frame code. 5 * Test the kernel's signal frame code.
6 * 6 *
7 * The kernel sets up two sets of ucontexts if the signal was to be 7 * The kernel sets up two sets of ucontexts if the signal was to be
8 * delivered while the thread was in a transaction. 8 * delivered while the thread was in a transaction (referred to as
9 * first and second contexts).
9 * Expected behaviour is that the checkpointed state is in the user 10 * Expected behaviour is that the checkpointed state is in the user
10 * context passed to the signal handler. The speculated state can be 11 * context passed to the signal handler (first context). The speculated
11 * accessed with the uc_link pointer. 12 * state can be accessed with the uc_link pointer (second context).
12 * 13 *
13 * The rationale for this is that if TM unaware code (which linked 14 * The rationale for this is that if TM unaware code (which linked
14 * against TM libs) installs a signal handler it will not know of the 15 * against TM libs) installs a signal handler it will not know of the
@@ -29,17 +30,24 @@
29 30
30#define MAX_ATTEMPT 500000 31#define MAX_ATTEMPT 500000
31 32
32#define NV_VSX_REGS 12 33#define NV_VSX_REGS 12 /* Number of VSX registers to check. */
34#define VSX20 20 /* First VSX register to check in vsr20-vsr31 subset */
35#define FPR20 20 /* FPR20 overlaps VSX20 most significant doubleword */
33 36
34long tm_signal_self_context_load(pid_t pid, long *gprs, double *fps, vector int *vms, vector int *vss); 37long tm_signal_self_context_load(pid_t pid, long *gprs, double *fps, vector int *vms, vector int *vss);
35 38
36static sig_atomic_t fail; 39static sig_atomic_t fail, broken;
37 40
38vector int vss[] = { 41/* Test only 12 vsx registers from vsr20 to vsr31 */
39 {1, 2, 3, 4 },{5, 6, 7, 8 },{9, 10,11,12}, 42vector int vsxs[] = {
43 /* First context will be set with these values, i.e. non-speculative */
44 /* VSX20 , VSX21 , ... */
45 { 1, 2, 3, 4},{ 5, 6, 7, 8},{ 9,10,11,12},
40 {13,14,15,16},{17,18,19,20},{21,22,23,24}, 46 {13,14,15,16},{17,18,19,20},{21,22,23,24},
41 {25,26,27,28},{29,30,31,32},{33,34,35,36}, 47 {25,26,27,28},{29,30,31,32},{33,34,35,36},
42 {37,38,39,40},{41,42,43,44},{45,46,47,48}, 48 {37,38,39,40},{41,42,43,44},{45,46,47,48},
49 /* Second context will be set with these values, i.e. speculative */
50 /* VSX20 , VSX21 , ... */
43 {-1, -2, -3, -4 },{-5, -6, -7, -8 },{-9, -10,-11,-12}, 51 {-1, -2, -3, -4 },{-5, -6, -7, -8 },{-9, -10,-11,-12},
44 {-13,-14,-15,-16},{-17,-18,-19,-20},{-21,-22,-23,-24}, 52 {-13,-14,-15,-16},{-17,-18,-19,-20},{-21,-22,-23,-24},
45 {-25,-26,-27,-28},{-29,-30,-31,-32},{-33,-34,-35,-36}, 53 {-25,-26,-27,-28},{-29,-30,-31,-32},{-33,-34,-35,-36},
@@ -48,41 +56,91 @@ vector int vss[] = {
48 56
49static void signal_usr1(int signum, siginfo_t *info, void *uc) 57static void signal_usr1(int signum, siginfo_t *info, void *uc)
50{ 58{
51 int i; 59 int i, j;
52 uint8_t vsc[sizeof(vector int)]; 60 uint8_t vsx[sizeof(vector int)];
53 uint8_t vst[sizeof(vector int)]; 61 uint8_t vsx_tm[sizeof(vector int)];
54 ucontext_t *ucp = uc; 62 ucontext_t *ucp = uc;
55 ucontext_t *tm_ucp = ucp->uc_link; 63 ucontext_t *tm_ucp = ucp->uc_link;
56 64
57 /* 65 /*
58 * The other half of the VSX regs will be after v_regs. 66 * FP registers and VMX registers overlap the VSX registers.
67 *
68 * FP registers (f0-31) overlap the most significant 64 bits of VSX
69 * registers vsr0-31, whilst VMX registers vr0-31, being 128-bit like
70 * the VSX registers, overlap fully the other half of VSX registers,
71 * i.e. vr0-31 overlaps fully vsr32-63.
72 *
73 * Due to compatibility and historical reasons (VMX/Altivec support
74 * appeared first on the architecture), VMX registers vr0-31 (so VSX
75 * half vsr32-63 too) are stored right after the v_regs pointer, in an
76 * area allocated for 'vmx_reserve' array (please see
77 * arch/powerpc/include/uapi/asm/sigcontext.h for details about the
78 * mcontext_t structure on Power).
79 *
80 * The other VSX half (vsr0-31) is hence stored below vr0-31/vsr32-63
81 * registers, but only the least significant 64 bits of vsr0-31. The
82 * most significant 64 bits of vsr0-31 (f0-31), as it overlaps the FP
83 * registers, is kept in fp_regs.
84 *
85 * v_regs is a 16 byte aligned pointer at the start of vmx_reserve
86 * (vmx_reserve may or may not be 16 aligned) where the v_regs structure
87 * exists, so v_regs points to where vr0-31 / vsr32-63 registers are
88 * fully stored. Since v_regs type is elf_vrregset_t, v_regs + 1
89 * skips all the slots used to store vr0-31 / vsr32-63 and points to
90 * part of one VSX half, i.e. v_regs + 1 points to the least significant
91 * 64 bits of vsr0-31. The other part of this half (the most significant
92 * part of vsr0-31) is stored in fp_regs.
59 * 93 *
60 * In short, vmx_reserve array holds everything. v_regs is a 16
61 * byte aligned pointer at the start of vmx_reserve (vmx_reserve
62 * may or may not be 16 aligned) where the v_regs structure exists.
63 * (half of) The VSX regsters are directly after v_regs so the
64 * easiest way to find them below.
65 */ 94 */
95 /* Get pointer to least significant doubleword of vsr0-31 */
66 long *vsx_ptr = (long *)(ucp->uc_mcontext.v_regs + 1); 96 long *vsx_ptr = (long *)(ucp->uc_mcontext.v_regs + 1);
67 long *tm_vsx_ptr = (long *)(tm_ucp->uc_mcontext.v_regs + 1); 97 long *tm_vsx_ptr = (long *)(tm_ucp->uc_mcontext.v_regs + 1);
68 for (i = 0; i < NV_VSX_REGS && !fail; i++) {
69 memcpy(vsc, &ucp->uc_mcontext.fp_regs[i + 20], 8);
70 memcpy(vsc + 8, &vsx_ptr[20 + i], 8);
71 fail = memcmp(vsc, &vss[i], sizeof(vector int));
72 memcpy(vst, &tm_ucp->uc_mcontext.fp_regs[i + 20], 8);
73 memcpy(vst + 8, &tm_vsx_ptr[20 + i], 8);
74 fail |= memcmp(vst, &vss[i + NV_VSX_REGS], sizeof(vector int));
75 98
76 if (fail) { 99 /* Check first context. Print all mismatches. */
77 int j; 100 for (i = 0; i < NV_VSX_REGS; i++) {
101 /*
102 * Copy VSX most significant doubleword from fp_regs and
103 * copy VSX least significant one from 64-bit slots below
104 * saved VMX registers.
105 */
106 memcpy(vsx, &ucp->uc_mcontext.fp_regs[FPR20 + i], 8);
107 memcpy(vsx + 8, &vsx_ptr[VSX20 + i], 8);
108
109 fail = memcmp(vsx, &vsxs[i], sizeof(vector int));
78 110
79 fprintf(stderr, "Failed on %d vsx 0x", i); 111 if (fail) {
112 broken = 1;
113 printf("VSX%d (1st context) == 0x", VSX20 + i);
80 for (j = 0; j < 16; j++) 114 for (j = 0; j < 16; j++)
81 fprintf(stderr, "%02x", vsc[j]); 115 printf("%02x", vsx[j]);
82 fprintf(stderr, " vs 0x"); 116 printf(" instead of 0x");
117 for (j = 0; j < 4; j++)
118 printf("%08x", vsxs[i][j]);
119 printf(" (expected)\n");
120 }
121 }
122
123 /* Check second context. Print all mismatches. */
124 for (i = 0; i < NV_VSX_REGS; i++) {
125 /*
126 * Copy VSX most significant doubleword from fp_regs and
127 * copy VSX least significant one from 64-bit slots below
128 * saved VMX registers.
129 */
130 memcpy(vsx_tm, &tm_ucp->uc_mcontext.fp_regs[FPR20 + i], 8);
131 memcpy(vsx_tm + 8, &tm_vsx_ptr[VSX20 + i], 8);
132
133 fail = memcmp(vsx_tm, &vsxs[NV_VSX_REGS + i], sizeof(vector int));
134
135 if (fail) {
136 broken = 1;
137 printf("VSX%d (2nd context) == 0x", VSX20 + i);
83 for (j = 0; j < 16; j++) 138 for (j = 0; j < 16; j++)
84 fprintf(stderr, "%02x", vst[j]); 139 printf("%02x", vsx_tm[j]);
85 fprintf(stderr, "\n"); 140 printf(" instead of 0x");
141 for (j = 0; j < 4; j++)
142 printf("%08x", vsxs[NV_VSX_REGS + i][j]);
143 printf("(expected)\n");
86 } 144 }
87 } 145 }
88} 146}
@@ -105,13 +163,19 @@ static int tm_signal_context_chk()
105 } 163 }
106 164
107 i = 0; 165 i = 0;
108 while (i < MAX_ATTEMPT && !fail) { 166 while (i < MAX_ATTEMPT && !broken) {
109 rc = tm_signal_self_context_load(pid, NULL, NULL, NULL, vss); 167 /*
168 * tm_signal_self_context_load will set both first and second
169 * contexts accordingly to the values passed through non-NULL
170 * array pointers to it, in that case 'vsxs', and invoke the
171 * signal handler installed for SIGUSR1.
172 */
173 rc = tm_signal_self_context_load(pid, NULL, NULL, NULL, vsxs);
110 FAIL_IF(rc != pid); 174 FAIL_IF(rc != pid);
111 i++; 175 i++;
112 } 176 }
113 177
114 return fail; 178 return (broken);
115} 179}
116 180
117int main(void) 181int main(void)
diff --git a/tools/testing/selftests/powerpc/tm/tm.h b/tools/testing/selftests/powerpc/tm/tm.h
index 97f9f491c541..c402464b038f 100644
--- a/tools/testing/selftests/powerpc/tm/tm.h
+++ b/tools/testing/selftests/powerpc/tm/tm.h
@@ -55,7 +55,8 @@ static inline bool failure_is_unavailable(void)
55static inline bool failure_is_reschedule(void) 55static inline bool failure_is_reschedule(void)
56{ 56{
57 if ((failure_code() & TM_CAUSE_RESCHED) == TM_CAUSE_RESCHED || 57 if ((failure_code() & TM_CAUSE_RESCHED) == TM_CAUSE_RESCHED ||
58 (failure_code() & TM_CAUSE_KVM_RESCHED) == TM_CAUSE_KVM_RESCHED) 58 (failure_code() & TM_CAUSE_KVM_RESCHED) == TM_CAUSE_KVM_RESCHED ||
59 (failure_code() & TM_CAUSE_KVM_FAC_UNAV) == TM_CAUSE_KVM_FAC_UNAV)
59 return true; 60 return true;
60 61
61 return false; 62 return false;