-rw-r--r--  Documentation/admin-guide/LSM/SafeSetID.rst | 107
-rw-r--r--  Documentation/admin-guide/LSM/index.rst | 14
-rw-r--r--  Documentation/admin-guide/cgroup-v2.rst | 2
-rw-r--r--  Documentation/admin-guide/kernel-parameters.txt | 12
-rw-r--r--  Documentation/cgroup-v1/pids.txt | 3
-rw-r--r--  Documentation/filesystems/xfs.txt | 3
-rw-r--r--  Documentation/kdump/vmcoreinfo.txt | 495
-rw-r--r--  Documentation/xtensa/booting.txt | 19
-rw-r--r--  MAINTAINERS | 17
-rw-r--r--  arch/powerpc/Kconfig | 79
-rw-r--r--  arch/powerpc/Kconfig.debug | 4
-rw-r--r--  arch/powerpc/Makefile | 11
-rw-r--r--  arch/powerpc/boot/dts/Makefile | 1
-rw-r--r--  arch/powerpc/boot/dts/akebono.dts | 2
-rw-r--r--  arch/powerpc/boot/dts/bluestone.dts | 2
-rw-r--r--  arch/powerpc/boot/dts/currituck.dts | 2
-rw-r--r--  arch/powerpc/boot/dts/iss4xx-mpic.dts | 2
-rw-r--r--  arch/powerpc/boot/dts/wii.dts | 22
-rw-r--r--  arch/powerpc/include/asm/asm-prototypes.h | 14
-rw-r--r--  arch/powerpc/include/asm/book3s/32/mmu-hash.h | 2
-rw-r--r--  arch/powerpc/include/asm/book3s/32/pgtable.h | 11
-rw-r--r--  arch/powerpc/include/asm/book3s/64/hash.h | 32
-rw-r--r--  arch/powerpc/include/asm/book3s/64/mmu-hash.h | 2
-rw-r--r--  arch/powerpc/include/asm/book3s/64/pgalloc.h | 8
-rw-r--r--  arch/powerpc/include/asm/book3s/64/pgtable.h | 16
-rw-r--r--  arch/powerpc/include/asm/book3s/64/tlbflush-radix.h | 30
-rw-r--r--  arch/powerpc/include/asm/checksum.h | 4
-rw-r--r--  arch/powerpc/include/asm/device.h | 10
-rw-r--r--  arch/powerpc/include/asm/dma-direct.h | 18
-rw-r--r--  arch/powerpc/include/asm/dma-mapping.h | 92
-rw-r--r--  arch/powerpc/include/asm/eeh.h | 10
-rw-r--r--  arch/powerpc/include/asm/eeh_event.h | 1
-rw-r--r--  arch/powerpc/include/asm/exception-64s.h | 4
-rw-r--r--  arch/powerpc/include/asm/hvsi.h | 2
-rw-r--r--  arch/powerpc/include/asm/iommu.h | 17
-rw-r--r--  arch/powerpc/include/asm/ipic.h | 3
-rw-r--r--  arch/powerpc/include/asm/irq.h | 18
-rw-r--r--  arch/powerpc/include/asm/kvm_ppc.h | 3
-rw-r--r--  arch/powerpc/include/asm/livepatch.h | 7
-rw-r--r--  arch/powerpc/include/asm/machdep.h | 4
-rw-r--r--  arch/powerpc/include/asm/mce.h | 2
-rw-r--r--  arch/powerpc/include/asm/mmu.h | 13
-rw-r--r--  arch/powerpc/include/asm/nmi.h | 2
-rw-r--r--  arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 3
-rw-r--r--  arch/powerpc/include/asm/page.h | 14
-rw-r--r--  arch/powerpc/include/asm/pci-bridge.h | 7
-rw-r--r--  arch/powerpc/include/asm/pci.h | 2
-rw-r--r--  arch/powerpc/include/asm/pgtable.h | 1
-rw-r--r--  arch/powerpc/include/asm/powernv.h | 3
-rw-r--r--  arch/powerpc/include/asm/ppc-opcode.h | 16
-rw-r--r--  arch/powerpc/include/asm/ppc-pci.h | 4
-rw-r--r--  arch/powerpc/include/asm/processor.h | 108
-rw-r--r--  arch/powerpc/include/asm/ptrace.h | 2
-rw-r--r--  arch/powerpc/include/asm/reg.h | 9
-rw-r--r--  arch/powerpc/include/asm/sections.h | 7
-rw-r--r--  arch/powerpc/include/asm/smp.h | 17
-rw-r--r--  arch/powerpc/include/asm/swiotlb.h | 5
-rw-r--r--  arch/powerpc/include/asm/task_size_32.h | 21
-rw-r--r--  arch/powerpc/include/asm/task_size_64.h | 79
-rw-r--r--  arch/powerpc/include/asm/thread_info.h | 19
-rw-r--r--  arch/powerpc/include/asm/topology.h | 2
-rw-r--r--  arch/powerpc/kernel/Makefile | 15
-rw-r--r--  arch/powerpc/kernel/asm-offsets.c | 15
-rw-r--r--  arch/powerpc/kernel/cpu_setup_6xx.S | 4
-rw-r--r--  arch/powerpc/kernel/dma-iommu.c | 75
-rw-r--r--  arch/powerpc/kernel/dma-mask.c | 12
-rw-r--r--  arch/powerpc/kernel/dma-swiotlb.c | 89
-rw-r--r--  arch/powerpc/kernel/dma.c | 362
-rw-r--r--  arch/powerpc/kernel/dt_cpu_ftrs.c | 17
-rw-r--r--  arch/powerpc/kernel/eeh.c | 190
-rw-r--r--  arch/powerpc/kernel/eeh_cache.c | 36
-rw-r--r--  arch/powerpc/kernel/eeh_driver.c | 86
-rw-r--r--  arch/powerpc/kernel/eeh_event.c | 16
-rw-r--r--  arch/powerpc/kernel/eeh_pe.c | 68
-rw-r--r--  arch/powerpc/kernel/eeh_sysfs.c | 3
-rw-r--r--  arch/powerpc/kernel/entry_32.S | 97
-rw-r--r--  arch/powerpc/kernel/entry_64.S | 53
-rw-r--r--  arch/powerpc/kernel/epapr_hcalls.S | 5
-rw-r--r--  arch/powerpc/kernel/exceptions-64e.S | 14
-rw-r--r--  arch/powerpc/kernel/exceptions-64s.S | 94
-rw-r--r--  arch/powerpc/kernel/head_32.S | 160
-rw-r--r--  arch/powerpc/kernel/head_40x.S | 9
-rw-r--r--  arch/powerpc/kernel/head_44x.S | 8
-rw-r--r--  arch/powerpc/kernel/head_64.S | 20
-rw-r--r--  arch/powerpc/kernel/head_8xx.S | 124
-rw-r--r--  arch/powerpc/kernel/head_booke.h | 12
-rw-r--r--  arch/powerpc/kernel/head_fsl_booke.S | 16
-rw-r--r--  arch/powerpc/kernel/idle_6xx.S | 8
-rw-r--r--  arch/powerpc/kernel/idle_book3e.S | 2
-rw-r--r--  arch/powerpc/kernel/idle_e500.S | 8
-rw-r--r--  arch/powerpc/kernel/idle_power4.S | 2
-rw-r--r--  arch/powerpc/kernel/irq.c | 119
-rw-r--r--  arch/powerpc/kernel/kgdb.c | 28
-rw-r--r--  arch/powerpc/kernel/machine_kexec_64.c | 6
-rw-r--r--  arch/powerpc/kernel/mce.c | 11
-rw-r--r--  arch/powerpc/kernel/misc_32.S | 17
-rw-r--r--  arch/powerpc/kernel/pci-common.c | 21
-rw-r--r--  arch/powerpc/kernel/process.c | 68
-rw-r--r--  arch/powerpc/kernel/ptrace.c | 18
-rw-r--r--  arch/powerpc/kernel/setup-common.c | 5
-rw-r--r--  arch/powerpc/kernel/setup_32.c | 26
-rw-r--r--  arch/powerpc/kernel/setup_64.c | 51
-rw-r--r--  arch/powerpc/kernel/smp.c | 109
-rw-r--r--  arch/powerpc/kernel/stacktrace.c | 102
-rw-r--r--  arch/powerpc/kernel/syscalls.c | 2
-rw-r--r--  arch/powerpc/kernel/syscalls/syscalltbl.sh | 4
-rw-r--r--  arch/powerpc/kernel/systbl.S | 6
-rw-r--r--  arch/powerpc/kernel/time.c | 1
-rw-r--r--  arch/powerpc/kernel/trace/Makefile | 3
-rw-r--r--  arch/powerpc/kernel/trace/ftrace_64_mprofile.S | 6
-rw-r--r--  arch/powerpc/kernel/traps.c | 133
-rw-r--r--  arch/powerpc/kernel/udbg.c | 2
-rw-r--r--  arch/powerpc/kernel/vdso32/Makefile | 1
-rw-r--r--  arch/powerpc/kernel/vdso64/Makefile | 1
-rw-r--r--  arch/powerpc/kernel/vmlinux.lds.S | 14
-rw-r--r--  arch/powerpc/kvm/Makefile | 5
-rw-r--r--  arch/powerpc/kvm/book3s.c | 7
-rw-r--r--  arch/powerpc/kvm/book3s_hv.c | 25
-rw-r--r--  arch/powerpc/kvm/book3s_hv_hmi.c | 1
-rw-r--r--  arch/powerpc/kvm/book3s_hv_ras.c | 58
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 66
-rw-r--r--  arch/powerpc/lib/Makefile | 3
-rw-r--r--  arch/powerpc/lib/sstep.c | 114
-rw-r--r--  arch/powerpc/lib/test_emulate_step.c | 535
-rw-r--r--  arch/powerpc/lib/test_emulate_step_exec_instr.S | 150
-rw-r--r--  arch/powerpc/math-emu/Makefile | 2
-rw-r--r--  arch/powerpc/mm/40x_mmu.c | 2
-rw-r--r--  arch/powerpc/mm/44x_mmu.c | 2
-rw-r--r--  arch/powerpc/mm/8xx_mmu.c | 91
-rw-r--r--  arch/powerpc/mm/Makefile | 15
-rw-r--r--  arch/powerpc/mm/dma-noncoherent.c | 40
-rw-r--r--  arch/powerpc/mm/fsl_booke_mmu.c | 2
-rw-r--r--  arch/powerpc/mm/hash_low_32.S | 76
-rw-r--r--  arch/powerpc/mm/hash_utils_64.c | 6
-rw-r--r--  arch/powerpc/mm/hugetlbpage-hash64.c | 3
-rw-r--r--  arch/powerpc/mm/hugetlbpage-radix.c | 5
-rw-r--r--  arch/powerpc/mm/init_32.c | 6
-rw-r--r--  arch/powerpc/mm/init_64.c | 2
-rw-r--r--  arch/powerpc/mm/mem.c | 61
-rw-r--r--  arch/powerpc/mm/mmu_decl.h | 10
-rw-r--r--  arch/powerpc/mm/numa.c | 9
-rw-r--r--  arch/powerpc/mm/pgtable_32.c | 42
-rw-r--r--  arch/powerpc/mm/ppc_mmu_32.c | 186
-rw-r--r--  arch/powerpc/mm/ptdump/8xx.c (renamed from arch/powerpc/mm/dump_linuxpagetables-8xx.c) | 2
-rw-r--r--  arch/powerpc/mm/ptdump/Makefile | 9
-rw-r--r--  arch/powerpc/mm/ptdump/bats.c (renamed from arch/powerpc/mm/dump_bats.c) | 0
-rw-r--r--  arch/powerpc/mm/ptdump/book3s64.c (renamed from arch/powerpc/mm/dump_linuxpagetables-book3s64.c) | 2
-rw-r--r--  arch/powerpc/mm/ptdump/hashpagetable.c (renamed from arch/powerpc/mm/dump_hashpagetable.c) | 2
-rw-r--r--  arch/powerpc/mm/ptdump/ptdump.c (renamed from arch/powerpc/mm/dump_linuxpagetables.c) | 20
-rw-r--r--  arch/powerpc/mm/ptdump/ptdump.h (renamed from arch/powerpc/mm/dump_linuxpagetables.h) | 0
-rw-r--r--  arch/powerpc/mm/ptdump/segment_regs.c (renamed from arch/powerpc/mm/dump_sr.c) | 0
-rw-r--r--  arch/powerpc/mm/ptdump/shared.c (renamed from arch/powerpc/mm/dump_linuxpagetables-generic.c) | 2
-rw-r--r--  arch/powerpc/mm/slb.c | 5
-rw-r--r--  arch/powerpc/mm/slice.c | 10
-rw-r--r--  arch/powerpc/mm/tlb_nohash.c | 2
-rw-r--r--  arch/powerpc/net/bpf_jit32.h | 5
-rw-r--r--  arch/powerpc/perf/power9-events-list.h | 24
-rw-r--r--  arch/powerpc/perf/power9-pmu.c | 4
-rw-r--r--  arch/powerpc/platforms/44x/Kconfig | 1
-rw-r--r--  arch/powerpc/platforms/44x/ppc476.c | 1
-rw-r--r--  arch/powerpc/platforms/44x/warp.c | 2
-rw-r--r--  arch/powerpc/platforms/83xx/suspend-asm.S | 34
-rw-r--r--  arch/powerpc/platforms/85xx/corenet_generic.c | 5
-rw-r--r--  arch/powerpc/platforms/85xx/ge_imp3a.c | 2
-rw-r--r--  arch/powerpc/platforms/85xx/mpc8536_ds.c | 2
-rw-r--r--  arch/powerpc/platforms/85xx/mpc85xx_ds.c | 4
-rw-r--r--  arch/powerpc/platforms/85xx/mpc85xx_mds.c | 4
-rw-r--r--  arch/powerpc/platforms/85xx/p1010rdb.c | 1
-rw-r--r--  arch/powerpc/platforms/85xx/p1022_ds.c | 2
-rw-r--r--  arch/powerpc/platforms/85xx/p1022_rdk.c | 2
-rw-r--r--  arch/powerpc/platforms/85xx/qemu_e500.c | 1
-rw-r--r--  arch/powerpc/platforms/86xx/mpc86xx_hpcn.c | 1
-rw-r--r--  arch/powerpc/platforms/Kconfig.cputype | 9
-rw-r--r--  arch/powerpc/platforms/cell/iommu.c | 172
-rw-r--r--  arch/powerpc/platforms/cell/spu_callbacks.c | 2
-rw-r--r--  arch/powerpc/platforms/cell/spu_syscalls.c | 1
-rw-r--r--  arch/powerpc/platforms/cell/spufs/file.c | 5
-rw-r--r--  arch/powerpc/platforms/embedded6xx/wii.c | 24
-rw-r--r--  arch/powerpc/platforms/pasemi/iommu.c | 2
-rw-r--r--  arch/powerpc/platforms/pasemi/setup.c | 51
-rw-r--r--  arch/powerpc/platforms/powernv/Makefile | 5
-rw-r--r--  arch/powerpc/platforms/powernv/idle.c | 27
-rw-r--r--  arch/powerpc/platforms/powernv/npu-dma.c | 16
-rw-r--r--  arch/powerpc/platforms/powernv/opal-call.c | 283
-rw-r--r--  arch/powerpc/platforms/powernv/opal-msglog.c | 2
-rw-r--r--  arch/powerpc/platforms/powernv/opal-wrappers.S | 344
-rw-r--r--  arch/powerpc/platforms/powernv/opal.c | 3
-rw-r--r--  arch/powerpc/platforms/powernv/pci-ioda-tce.c | 1
-rw-r--r--  arch/powerpc/platforms/powernv/pci-ioda.c | 146
-rw-r--r--  arch/powerpc/platforms/powernv/smp.c | 25
-rw-r--r--  arch/powerpc/platforms/ps3/device-init.c | 4
-rw-r--r--  arch/powerpc/platforms/ps3/os-area.c | 4
-rw-r--r--  arch/powerpc/platforms/ps3/system-bus.c | 4
-rw-r--r--  arch/powerpc/platforms/pseries/hotplug-cpu.c | 19
-rw-r--r--  arch/powerpc/platforms/pseries/iommu.c | 99
-rw-r--r--  arch/powerpc/platforms/pseries/lparcfg.c | 1
-rw-r--r--  arch/powerpc/platforms/pseries/vio.c | 95
-rw-r--r--  arch/powerpc/sysdev/6xx-suspend.S | 5
-rw-r--r--  arch/powerpc/sysdev/dart_iommu.c | 58
-rw-r--r--  arch/powerpc/sysdev/fsl_pci.c | 25
-rw-r--r--  arch/powerpc/sysdev/ipic.c | 35
-rw-r--r--  arch/powerpc/sysdev/tsi108_dev.c | 2
-rw-r--r--  arch/powerpc/sysdev/xive/common.c | 2
-rw-r--r--  arch/powerpc/xmon/Makefile | 1
-rw-r--r--  arch/powerpc/xmon/ppc-dis.c | 2
-rw-r--r--  arch/powerpc/xmon/xmon.c | 2
-rw-r--r--  arch/riscv/Kconfig | 6
-rw-r--r--  arch/riscv/include/asm/fixmap.h | 44
-rw-r--r--  arch/riscv/include/asm/pgtable.h | 1
-rw-r--r--  arch/riscv/include/asm/smp.h | 18
-rw-r--r--  arch/riscv/kernel/cpu.c | 30
-rw-r--r--  arch/riscv/kernel/cpufeature.c | 48
-rw-r--r--  arch/riscv/kernel/ftrace.c | 2
-rw-r--r--  arch/riscv/kernel/setup.c | 141
-rw-r--r--  arch/riscv/kernel/smp.c | 10
-rw-r--r--  arch/riscv/kernel/smpboot.c | 24
-rw-r--r--  arch/riscv/mm/init.c | 156
-rw-r--r--  arch/x86/Makefile | 6
-rw-r--r--  arch/x86/boot/Makefile | 2
-rw-r--r--  arch/x86/boot/compressed/Makefile | 2
-rw-r--r--  arch/x86/boot/compressed/acpi.c | 338
-rw-r--r--  arch/x86/boot/compressed/cmdline.c | 4
-rw-r--r--  arch/x86/boot/compressed/head_64.S | 11
-rw-r--r--  arch/x86/boot/compressed/kaslr.c | 75
-rw-r--r--  arch/x86/boot/compressed/misc.c | 3
-rw-r--r--  arch/x86/boot/compressed/misc.h | 23
-rw-r--r--  arch/x86/boot/compressed/pgtable_64.c | 19
-rw-r--r--  arch/x86/boot/compressed/vmlinux.lds.S | 2
-rw-r--r--  arch/x86/boot/setup.ld | 2
-rw-r--r--  arch/x86/boot/string.c | 141
-rw-r--r--  arch/x86/boot/string.h | 1
-rw-r--r--  arch/x86/configs/i386_defconfig | 2
-rw-r--r--  arch/x86/configs/x86_64_defconfig | 3
-rw-r--r--  arch/x86/events/intel/core.c | 2
-rw-r--r--  arch/x86/events/intel/lbr.c | 1
-rw-r--r--  arch/x86/include/asm/asm-prototypes.h | 1
-rw-r--r--  arch/x86/include/asm/fpu/internal.h | 57
-rw-r--r--  arch/x86/include/asm/fpu/types.h | 7
-rw-r--r--  arch/x86/include/asm/pgtable_64.h | 3
-rw-r--r--  arch/x86/include/asm/processor.h | 1
-rw-r--r--  arch/x86/include/asm/uaccess.h | 3
-rw-r--r--  arch/x86/include/asm/uv/bios.h | 5
-rw-r--r--  arch/x86/kernel/acpi/wakeup_32.S | 2
-rw-r--r--  arch/x86/kernel/acpi/wakeup_64.S | 12
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 2
-rw-r--r--  arch/x86/kernel/cpu/cacheinfo.c | 1
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cleanup.c | 3
-rw-r--r--  arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 7
-rw-r--r--  arch/x86/kernel/e820.c | 9
-rw-r--r--  arch/x86/kernel/fpu/xstate.c | 2
-rw-r--r--  arch/x86/kernel/hw_breakpoint.c | 5
-rw-r--r--  arch/x86/kernel/kexec-bzimage64.c | 4
-rw-r--r--  arch/x86/kernel/kgdb.c | 1
-rw-r--r--  arch/x86/kernel/machine_kexec_64.c | 3
-rw-r--r--  arch/x86/kernel/smpboot.c | 7
-rw-r--r--  arch/x86/kernel/traps.c | 5
-rw-r--r--  arch/x86/kernel/uprobes.c | 1
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 4
-rw-r--r--  arch/x86/lib/insn-eval.c | 2
-rw-r--r--  arch/x86/mm/cpu_entry_area.c | 2
-rw-r--r--  arch/x86/mm/dump_pagetables.c | 2
-rw-r--r--  arch/x86/mm/tlb.c | 3
-rw-r--r--  arch/x86/platform/uv/bios_uv.c | 16
-rw-r--r--  arch/x86/platform/uv/tlb_uv.c | 8
-rw-r--r--  arch/x86/realmode/rm/Makefile | 2
-rw-r--r--  arch/x86/realmode/rm/realmode.lds.S | 2
-rw-r--r--  arch/xtensa/Kconfig | 2
-rw-r--r--  arch/xtensa/include/asm/Kbuild | 2
-rw-r--r--  arch/xtensa/include/asm/cmpxchg.h | 36
-rw-r--r--  arch/xtensa/include/asm/spinlock.h | 185
-rw-r--r--  arch/xtensa/include/asm/spinlock_types.h | 15
-rw-r--r--  arch/xtensa/include/asm/thread_info.h | 9
-rw-r--r--  arch/xtensa/kernel/process.c | 6
-rw-r--r--  arch/xtensa/kernel/smp.c | 38
-rw-r--r--  arch/xtensa/kernel/time.c | 53
-rw-r--r--  arch/xtensa/kernel/traps.c | 5
-rw-r--r--  drivers/misc/cxl/guest.c | 2
-rw-r--r--  drivers/misc/cxl/pci.c | 39
-rw-r--r--  drivers/misc/cxl/vphb.c | 3
-rw-r--r--  drivers/net/ethernet/pasemi/pasemi_mac.c | 1
-rw-r--r--  drivers/tty/tty_audit.c | 2
-rw-r--r--  drivers/vfio/vfio_spapr_eeh.c | 6
-rw-r--r--  fs/Makefile | 3
-rw-r--r--  fs/btrfs/acl.c | 9
-rw-r--r--  fs/btrfs/async-thread.c | 10
-rw-r--r--  fs/btrfs/backref.c | 22
-rw-r--r--  fs/btrfs/compression.c | 253
-rw-r--r--  fs/btrfs/compression.h | 52
-rw-r--r--  fs/btrfs/ctree.c | 74
-rw-r--r--  fs/btrfs/ctree.h | 61
-rw-r--r--  fs/btrfs/delayed-ref.c | 15
-rw-r--r--  fs/btrfs/delayed-ref.h | 11
-rw-r--r--  fs/btrfs/dev-replace.c | 9
-rw-r--r--  fs/btrfs/disk-io.c | 39
-rw-r--r--  fs/btrfs/extent-tree.c | 292
-rw-r--r--  fs/btrfs/extent_io.c | 87
-rw-r--r--  fs/btrfs/extent_io.h | 15
-rw-r--r--  fs/btrfs/extent_map.c | 5
-rw-r--r--  fs/btrfs/extent_map.h | 1
-rw-r--r--  fs/btrfs/file.c | 3
-rw-r--r--  fs/btrfs/inode.c | 207
-rw-r--r--  fs/btrfs/ioctl.c | 60
-rw-r--r--  fs/btrfs/locking.c | 108
-rw-r--r--  fs/btrfs/locking.h | 15
-rw-r--r--  fs/btrfs/lzo.c | 31
-rw-r--r--  fs/btrfs/qgroup.c | 372
-rw-r--r--  fs/btrfs/qgroup.h | 120
-rw-r--r--  fs/btrfs/ref-verify.c | 4
-rw-r--r--  fs/btrfs/relocation.c | 119
-rw-r--r--  fs/btrfs/root-tree.c | 4
-rw-r--r--  fs/btrfs/scrub.c | 49
-rw-r--r--  fs/btrfs/super.c | 13
-rw-r--r--  fs/btrfs/transaction.c | 9
-rw-r--r--  fs/btrfs/tree-defrag.c | 2
-rw-r--r--  fs/btrfs/tree-log.c | 282
-rw-r--r--  fs/btrfs/volumes.c | 202
-rw-r--r--  fs/btrfs/volumes.h | 5
-rw-r--r--  fs/btrfs/zlib.c | 45
-rw-r--r--  fs/btrfs/zstd.c | 316
-rw-r--r--  fs/ext2/dir.c | 35
-rw-r--r--  fs/ext2/ext2.h | 17
-rw-r--r--  fs/ext2/file.c | 1
-rw-r--r--  fs/ext2/ialloc.c | 2
-rw-r--r--  fs/ext2/inode.c | 30
-rw-r--r--  fs/ext2/namei.c | 2
-rw-r--r--  fs/ext2/super.c | 44
-rw-r--r--  fs/ext2/symlink.c | 2
-rw-r--r--  fs/ext2/xattr.c | 1
-rw-r--r--  fs/fs_types.c | 105
-rw-r--r--  fs/namei.c | 2
-rw-r--r--  fs/namespace.c | 2
-rw-r--r--  fs/notify/fanotify/Kconfig | 1
-rw-r--r--  fs/notify/fanotify/fanotify.c | 267
-rw-r--r--  fs/notify/fanotify/fanotify.h | 116
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 373
-rw-r--r--  fs/notify/fsnotify.c | 15
-rw-r--r--  fs/notify/inotify/inotify.h | 1
-rw-r--r--  fs/notify/inotify/inotify_fsnotify.c | 18
-rw-r--r--  fs/notify/inotify/inotify_user.c | 5
-rw-r--r--  fs/notify/mark.c | 42
-rw-r--r--  fs/notify/notification.c | 42
-rw-r--r--  fs/proc/base.c | 70
-rw-r--r--  fs/proc/internal.h | 1
-rw-r--r--  fs/statfs.c | 14
-rw-r--r--  fs/udf/super.c | 51
-rw-r--r--  fs/xfs/libxfs/xfs_ag.c | 6
-rw-r--r--  fs/xfs/libxfs/xfs_ag_resv.c | 2
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.c | 12
-rw-r--r--  fs/xfs/libxfs/xfs_alloc_btree.c | 74
-rw-r--r--  fs/xfs/libxfs/xfs_attr.c | 17
-rw-r--r--  fs/xfs/libxfs/xfs_attr.h | 2
-rw-r--r--  fs/xfs/libxfs/xfs_attr_leaf.c | 21
-rw-r--r--  fs/xfs/libxfs/xfs_attr_remote.c | 8
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c | 302
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.h | 16
-rw-r--r--  fs/xfs/libxfs/xfs_bmap_btree.c | 13
-rw-r--r--  fs/xfs/libxfs/xfs_da_btree.c | 49
-rw-r--r--  fs/xfs/libxfs/xfs_da_format.h | 3
-rw-r--r--  fs/xfs/libxfs/xfs_dir2.c | 17
-rw-r--r--  fs/xfs/libxfs/xfs_dir2.h | 1
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_block.c | 10
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_data.c | 12
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_leaf.c | 100
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_node.c | 10
-rw-r--r--  fs/xfs/libxfs/xfs_dquot_buf.c | 4
-rw-r--r--  fs/xfs/libxfs/xfs_errortag.h | 4
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.c | 3
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc_btree.c | 29
-rw-r--r--  fs/xfs/libxfs/xfs_iext_tree.c | 13
-rw-r--r--  fs/xfs/libxfs/xfs_inode_buf.c | 11
-rw-r--r--  fs/xfs/libxfs/xfs_inode_fork.h | 2
-rw-r--r--  fs/xfs/libxfs/xfs_refcount_btree.c | 3
-rw-r--r--  fs/xfs/libxfs/xfs_rmap_btree.c | 3
-rw-r--r--  fs/xfs/libxfs/xfs_sb.c | 7
-rw-r--r--  fs/xfs/libxfs/xfs_shared.h | 4
-rw-r--r--  fs/xfs/libxfs/xfs_symlink_remote.c | 3
-rw-r--r--  fs/xfs/libxfs/xfs_types.c | 24
-rw-r--r--  fs/xfs/libxfs/xfs_types.h | 3
-rw-r--r--  fs/xfs/scrub/agheader.c | 10
-rw-r--r--  fs/xfs/scrub/agheader_repair.c | 12
-rw-r--r--  fs/xfs/scrub/attr.c | 11
-rw-r--r--  fs/xfs/scrub/bmap.c | 27
-rw-r--r--  fs/xfs/scrub/dir.c | 6
-rw-r--r--  fs/xfs/scrub/ialloc.c | 330
-rw-r--r--  fs/xfs/scrub/repair.c | 3
-rw-r--r--  fs/xfs/scrub/repair.h | 3
-rw-r--r--  fs/xfs/scrub/rtbitmap.c | 5
-rw-r--r--  fs/xfs/scrub/trace.h | 45
-rw-r--r--  fs/xfs/xfs_aops.c | 266
-rw-r--r--  fs/xfs/xfs_aops.h | 24
-rw-r--r--  fs/xfs/xfs_attr_list.c | 1
-rw-r--r--  fs/xfs/xfs_bmap_util.c | 9
-rw-r--r--  fs/xfs/xfs_buf.c | 72
-rw-r--r--  fs/xfs/xfs_buf.h | 8
-rw-r--r--  fs/xfs/xfs_error.c | 6
-rw-r--r--  fs/xfs/xfs_error.h | 1
-rw-r--r--  fs/xfs/xfs_file.c | 31
-rw-r--r--  fs/xfs/xfs_fsops.c | 1
-rw-r--r--  fs/xfs/xfs_globals.c | 2
-rw-r--r--  fs/xfs/xfs_inode.c | 769
-rw-r--r--  fs/xfs/xfs_inode.h | 3
-rw-r--r--  fs/xfs/xfs_iomap.c | 518
-rw-r--r--  fs/xfs/xfs_iomap.h | 7
-rw-r--r--  fs/xfs/xfs_iops.c | 21
-rw-r--r--  fs/xfs/xfs_log_recover.c | 14
-rw-r--r--  fs/xfs/xfs_mount.c | 5
-rw-r--r--  fs/xfs/xfs_mount.h | 10
-rw-r--r--  fs/xfs/xfs_ondisk.h | 21
-rw-r--r--  fs/xfs/xfs_pnfs.c | 2
-rw-r--r--  fs/xfs/xfs_reflink.c | 150
-rw-r--r--  fs/xfs/xfs_reflink.h | 18
-rw-r--r--  fs/xfs/xfs_super.c | 22
-rw-r--r--  fs/xfs/xfs_sysctl.h | 1
-rw-r--r--  fs/xfs/xfs_sysfs.c | 24
-rw-r--r--  fs/xfs/xfs_trace.h | 115
-rw-r--r--  fs/xfs/xfs_trans_bmap.c | 1
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 2
-rw-r--r--  fs/xfs/xfs_trans_extfree.c | 1
-rw-r--r--  fs/xfs/xfs_trans_refcount.c | 1
-rw-r--r--  fs/xfs/xfs_trans_rmap.c | 1
-rw-r--r--  fs/xfs/xfs_xattr.c | 3
-rw-r--r--  include/linux/audit.h | 66
-rw-r--r--  include/linux/capability.h | 10
-rw-r--r--  include/linux/cgroup-defs.h | 2
-rw-r--r--  include/linux/cgroup.h | 2
-rw-r--r--  include/linux/cred.h | 1
-rw-r--r--  include/linux/fanotify.h | 26
-rw-r--r--  include/linux/fs.h | 17
-rw-r--r--  include/linux/fs_types.h | 75
-rw-r--r--  include/linux/fsnotify.h | 73
-rw-r--r--  include/linux/fsnotify_backend.h | 67
-rw-r--r--  include/linux/kprobes.h | 5
-rw-r--r--  include/linux/lsm_hooks.h | 49
-rw-r--r--  include/linux/namei.h | 3
-rw-r--r--  include/linux/sched.h | 4
-rw-r--r--  include/linux/security.h | 48
-rw-r--r--  include/linux/selinux.h | 35
-rw-r--r--  include/linux/statfs.h | 3
-rw-r--r--  include/linux/swiotlb.h | 3
-rw-r--r--  include/trace/events/btrfs.h | 30
-rw-r--r--  include/uapi/linux/btrfs.h | 2
-rw-r--r--  include/uapi/linux/fanotify.h | 29
-rw-r--r--  init/init_task.c | 2
-rw-r--r--  kernel/audit.c | 267
-rw-r--r--  kernel/audit.h | 81
-rw-r--r--  kernel/audit_fsnotify.c | 2
-rw-r--r--  kernel/audit_tree.c | 19
-rw-r--r--  kernel/audit_watch.c | 2
-rw-r--r--  kernel/auditfilter.c | 6
-rw-r--r--  kernel/auditsc.c | 320
-rw-r--r--  kernel/capability.c | 45
-rw-r--r--  kernel/cgroup/cgroup.c | 15
-rw-r--r--  kernel/cgroup/cpuset.c | 13
-rw-r--r--  kernel/cgroup/pids.c | 4
-rw-r--r--  kernel/cgroup/rstat.c | 10
-rw-r--r--  kernel/cred.c | 13
-rw-r--r--  kernel/dma/Kconfig | 3
-rw-r--r--  kernel/dma/direct.c | 3
-rw-r--r--  kernel/dma/mapping.c | 11
-rw-r--r--  kernel/dma/swiotlb.c | 12
-rw-r--r--  kernel/exit.c | 1
-rw-r--r--  kernel/resource.c | 4
-rw-r--r--  kernel/seccomp.c | 4
-rw-r--r--  kernel/sys.c | 10
-rw-r--r--  kernel/trace/trace_events_filter.c | 5
-rw-r--r--  kernel/trace/trace_kprobe.c | 2
-rw-r--r--  kernel/workqueue.c | 9
-rw-r--r--  mm/gup.c | 6
-rw-r--r--  mm/percpu-km.c | 2
-rw-r--r--  mm/percpu.c | 2
-rw-r--r--  security/Kconfig | 45
-rw-r--r--  security/Makefile | 2
-rw-r--r--  security/apparmor/Kconfig | 16
-rw-r--r--  security/apparmor/audit.c | 3
-rw-r--r--  security/apparmor/capability.c | 14
-rw-r--r--  security/apparmor/domain.c | 4
-rw-r--r--  security/apparmor/include/audit.h | 3
-rw-r--r--  security/apparmor/include/capability.h | 2
-rw-r--r--  security/apparmor/include/cred.h | 16
-rw-r--r--  security/apparmor/include/file.h | 5
-rw-r--r--  security/apparmor/include/lib.h | 4
-rw-r--r--  security/apparmor/include/task.h | 18
-rw-r--r--  security/apparmor/ipc.c | 3
-rw-r--r--  security/apparmor/lsm.c | 67
-rw-r--r--  security/apparmor/resource.c | 2
-rw-r--r--  security/apparmor/task.c | 6
-rw-r--r--  security/commoncap.c | 30
-rw-r--r--  security/integrity/ima/ima.h | 3
-rw-r--r--  security/integrity/ima/ima_appraise.c | 1
-rw-r--r--  security/integrity/ima/ima_policy.c | 10
-rw-r--r--  security/integrity/ima/ima_template_lib.c | 1
-rw-r--r--  security/keys/keyctl.c | 2
-rw-r--r--  security/keys/keyring.c | 1
-rw-r--r--  security/keys/process_keys.c | 3
-rw-r--r--  security/keys/request_key.c | 4
-rw-r--r--  security/loadpin/loadpin.c | 8
-rw-r--r--  security/safesetid/Kconfig | 14
-rw-r--r--  security/safesetid/Makefile | 7
-rw-r--r--  security/safesetid/lsm.c | 277
-rw-r--r--  security/safesetid/lsm.h | 33
-rw-r--r--  security/safesetid/securityfs.c | 193
-rw-r--r--  security/security.c | 654
-rw-r--r--  security/selinux/Kconfig | 15
-rw-r--r--  security/selinux/Makefile | 2
-rw-r--r--  security/selinux/avc.c | 199
-rw-r--r--  security/selinux/exports.c | 23
-rw-r--r--  security/selinux/hooks.c | 420
-rw-r--r--  security/selinux/include/audit.h | 7
-rw-r--r--  security/selinux/include/avc.h | 6
-rw-r--r--  security/selinux/include/objsec.h | 38
-rw-r--r--  security/selinux/include/security.h | 3
-rw-r--r--  security/selinux/selinuxfs.c | 4
-rw-r--r--  security/selinux/ss/services.c | 41
-rw-r--r--  security/selinux/xfrm.c | 4
-rw-r--r--  security/smack/smack.h | 44
-rw-r--r--  security/smack/smack_access.c | 6
-rw-r--r--  security/smack/smack_lsm.c | 321
-rw-r--r--  security/smack/smackfs.c | 18
-rw-r--r--  security/tomoyo/audit.c | 31
-rw-r--r--  security/tomoyo/common.c | 199
-rw-r--r--  security/tomoyo/common.h | 51
-rw-r--r--  security/tomoyo/condition.c | 59
-rw-r--r--  security/tomoyo/domain.c | 76
-rw-r--r--  security/tomoyo/file.c | 20
-rw-r--r--  security/tomoyo/gc.c | 19
-rw-r--r--  security/tomoyo/group.c | 5
-rw-r--r--  security/tomoyo/load_policy.c | 8
-rw-r--r--  security/tomoyo/memory.c | 9
-rw-r--r--  security/tomoyo/mount.c | 2
-rw-r--r--  security/tomoyo/realpath.c | 18
-rw-r--r--  security/tomoyo/securityfs_if.c | 30
-rw-r--r--  security/tomoyo/tomoyo.c | 160
-rw-r--r--  security/tomoyo/util.c | 23
-rw-r--r--  security/yama/yama_lsm.c | 8
-rw-r--r--  tools/testing/selftests/powerpc/benchmarks/null_syscall.c | 2
-rw-r--r--  tools/testing/selftests/powerpc/include/reg.h | 8
-rw-r--r--  tools/testing/selftests/powerpc/include/utils.h | 2
-rw-r--r--  tools/testing/selftests/powerpc/pmu/ebb/fork_cleanup_test.c | 1
-rw-r--r--  tools/testing/selftests/powerpc/tm/.gitignore | 1
-rw-r--r--  tools/testing/selftests/powerpc/tm/Makefile | 4
-rw-r--r--  tools/testing/selftests/powerpc/tm/tm-signal-context-force-tm.c | 184
-rw-r--r--  tools/testing/selftests/safesetid/.gitignore | 1
-rw-r--r--  tools/testing/selftests/safesetid/Makefile | 8
-rw-r--r--  tools/testing/selftests/safesetid/config | 2
-rw-r--r--  tools/testing/selftests/safesetid/safesetid-test.c | 334
-rwxr-xr-x  tools/testing/selftests/safesetid/safesetid-test.sh | 26
-rw-r--r--  tools/testing/selftests/vm/map_hugetlb.c | 29
547 files changed, 13757 insertions(+), 7808 deletions(-)
diff --git a/Documentation/admin-guide/LSM/SafeSetID.rst b/Documentation/admin-guide/LSM/SafeSetID.rst
new file mode 100644
index 000000000000..212434ef65ad
--- /dev/null
+++ b/Documentation/admin-guide/LSM/SafeSetID.rst
@@ -0,0 +1,107 @@
+=========
+SafeSetID
+=========
+SafeSetID is an LSM module that gates the setid family of syscalls to restrict
+UID/GID transitions from a given UID/GID to only those approved by a
+system-wide whitelist. These restrictions also prohibit the given UIDs/GIDs
+from obtaining auxiliary privileges associated with CAP_SET{U/G}ID, such as
+allowing a user to set up user namespace UID mappings.
+
+
+Background
+==========
+In the absence of file capabilities, processes spawned on a Linux system that
+need to switch to a different user must be spawned with CAP_SETUID privileges.
+CAP_SETUID is granted to programs running as root or those running as a non-root
+user that have been explicitly given the CAP_SETUID runtime capability. It is
+often preferable to use Linux runtime capabilities rather than file
+capabilities, since using file capabilities to run a program with elevated
+privileges opens up possible security holes: any user with access to the
+file can exec() that program to gain the elevated privileges.
+
+While it is possible to implement a tree of processes by giving full
+CAP_SET{U/G}ID capabilities, this is often at odds with the goals of running a
+tree of processes under non-root user(s) in the first place. Specifically,
+since CAP_SETUID allows changing to any user on the system, including the root
+user, it is an overpowered capability for what is needed in this scenario,
+especially since programs often only call setuid() to drop privileges to a
+lesser-privileged user -- not to elevate privileges. Unfortunately, there is no
+generally feasible way in Linux to restrict the potential UIDs that a user can
+switch to through setuid() beyond allowing a switch to any user on the system.
+This SafeSetID LSM seeks to provide a solution for restricting setid
+capabilities in such a way.
+
+The main use case for this LSM is to allow a non-root program to transition to
+other untrusted uids without full-blown CAP_SETUID capabilities. The non-root
+program would still need CAP_SETUID to do any kind of transition, but the
+additional restrictions imposed by this LSM would mean it is a "safer" version
+of CAP_SETUID since the non-root program cannot take advantage of CAP_SETUID to
+do any unapproved actions (e.g. setuid to uid 0 or create/enter a new user
+namespace). The higher-level goal is to allow for uid-based sandboxing of system
+services without having to give out CAP_SETUID all over the place just so that
+non-root programs can drop to even-lesser-privileged uids. This is especially
+relevant when one non-root daemon on the system should be allowed to spawn other
+processes as different uids, but it's undesirable to give the daemon a
+basically-root-equivalent CAP_SETUID.
+
+
+Other Approaches Considered
+===========================
+
+Solve this problem in userspace
+-------------------------------
+For candidate applications that would like to have restricted setid capabilities
+as implemented in this LSM, an alternative option would be to simply take away
+setid capabilities from the application completely and refactor the process
+spawning semantics in the application (e.g. by using a privileged helper program
+to do process spawning and UID/GID transitions). Unfortunately, there are a
+number of semantics around process spawning that would be affected by this, such
+as fork() calls where the program doesn't immediately call exec() after the
+fork(), parent processes specifying custom environment variables or command line
+args for spawned child processes, or inheritance of file handles across a
+fork()/exec(). Because of this, a solution that uses a privileged helper in
+userspace would likely be less appealing to incorporate into existing projects
+that rely on certain process-spawning semantics in Linux.
+
+Use user namespaces
+-------------------
+Another possible approach would be to run a given process tree in its own user
+namespace and give programs in the tree setid capabilities. In this way,
+programs in the tree could change to any desired UID/GID in the context of their
+own user namespace, and only approved UIDs/GIDs could be mapped back to the
+initial system user namespace, effectively preventing privilege escalation.
+Unfortunately, it is not generally feasible to use user namespaces in isolation,
+without pairing them with other namespace types, which is not always an option.
+Linux checks for capabilities based off of the user namespace that "owns" some
+entity. For example, Linux has the notion that network namespaces are owned by
+the user namespace in which they were created. A consequence of this is that
+capability checks for access to a given network namespace are done by checking
+whether a task has the given capability in the context of the user namespace
+that owns the network namespace -- not necessarily the user namespace under
+which the given task runs. Therefore spawning a process in a new user namespace
+effectively prevents it from accessing the network namespace owned by the
+initial namespace. This is a deal-breaker for any application that expects to
+retain the CAP_NET_ADMIN capability for the purpose of adjusting network
+configurations. Using user namespaces in isolation causes problems regarding
+other system interactions, including use of pid namespaces and device creation.
+
+Use an existing LSM
+-------------------
+None of the other in-tree LSMs have the capability to gate setid transitions, or
+even employ the security_task_fix_setuid hook at all. SELinux says of that hook:
+"Since setuid only affects the current process, and since the SELinux controls
+are not based on the Linux identity attributes, SELinux does not need to control
+this operation."
+
+
+Directions for use
+==================
+This LSM hooks the setid syscalls to make sure transitions are allowed if an
+applicable restriction policy is in place. Policies are configured through
+securityfs by writing to the safesetid/add_whitelist_policy and
+safesetid/flush_whitelist_policies files at the location where securityfs is
+mounted. The format for adding a policy is '<UID>:<UID>', using literal
+numbers, such as '123:456'. To flush the policies, any write to the file is
+sufficient. Again, configuring a policy for a UID will prevent that UID from
+obtaining auxiliary setid privileges, such as allowing a user to set up user
+namespace UID mappings.
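
As an illustration of the securityfs interface documented in the file added
above (this sketch is not part of the patch), the following userspace program
installs one whitelist policy. The '<UID>:<UID>' format and the
safesetid/add_whitelist_policy file name come from the documentation itself;
the mount point /sys/kernel/security is an assumption about where securityfs
is usually mounted.

    /* Hedged sketch: allow transitions from UID 123 to UID 456. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            const char *policy = "123:456";  /* '<UID>:<UID>' per the doc */
            int fd = open("/sys/kernel/security/safesetid/add_whitelist_policy",
                          O_WRONLY);

            if (fd < 0) {
                    perror("open");  /* securityfs mount point is assumed */
                    return 1;
            }
            if (write(fd, policy, strlen(policy)) < 0)
                    perror("write");
            close(fd);
            return 0;
    }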
diff --git a/Documentation/admin-guide/LSM/index.rst b/Documentation/admin-guide/LSM/index.rst
index c980dfe9abf1..a6ba95fbaa9f 100644
--- a/Documentation/admin-guide/LSM/index.rst
+++ b/Documentation/admin-guide/LSM/index.rst
@@ -17,9 +17,8 @@ MAC extensions, other extensions can be built using the LSM to provide
 specific changes to system operation when these tweaks are not available
 in the core functionality of Linux itself.
 
-Without a specific LSM built into the kernel, the default LSM will be the
-Linux capabilities system. Most LSMs choose to extend the capabilities
-system, building their checks on top of the defined capability hooks.
+The Linux capabilities modules will always be included. This may be
+followed by any number of "minor" modules and at most one "major" module.
 For more details on capabilities, see ``capabilities(7)`` in the Linux
 man-pages project.
 
@@ -30,6 +29,14 @@ order in which checks are made. The capability module will always
 be first, followed by any "minor" modules (e.g. Yama) and then
 the one "major" module (e.g. SELinux) if there is one configured.
 
+Process attributes associated with "major" security modules should
+be accessed and maintained using the special files in ``/proc/.../attr``.
+A security module may maintain a module specific subdirectory there,
+named after the module. ``/proc/.../attr/smack`` is provided by the Smack
+security module and contains all its special files. The files directly
+in ``/proc/.../attr`` remain as legacy interfaces for modules that provide
+subdirectories.
+
 .. toctree::
    :maxdepth: 1
 
@@ -39,3 +46,4 @@ the one "major" module (e.g. SELinux) if there is one configured.
    Smack
    tomoyo
    Yama
+   SafeSetID
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 53d3288c328b..20f92c16ffbf 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1519,7 +1519,7 @@ protected workload.
 
 The limits are only applied at the peer level in the hierarchy. This means that
 in the diagram below, only groups A, B, and C will influence each other, and
-groups D and F will influence each other. Group G will influence nobody.
+groups D and F will influence each other. Group G will influence nobody::
 
 			[root]
 		/	   |		\
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index a422560fbc15..42379633801f 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2333,6 +2333,10 @@
 
 	lsm.debug	[SECURITY] Enable LSM initialization debugging output.
 
+	lsm=lsm1,...,lsmN
+			[SECURITY] Choose order of LSM initialization. This
+			overrides CONFIG_LSM, and the "security=" parameter.
+
 	machvec=	[IA-64] Force the use of a particular machine-vector
 			(machvec) in a generic kernel.
 			Example: machvec=hpzx1_swiotlb
@@ -4110,11 +4114,9 @@
 			Note: increases power consumption, thus should only be
 			enabled if running jitter sensitive (HPC/RT) workloads.
 
-	security=	[SECURITY] Choose a security module to enable at boot.
-			If this boot parameter is not specified, only the first
-			security module asking for security registration will be
-			loaded. An invalid security module name will be treated
-			as if no module has been chosen.
+	security=	[SECURITY] Choose a legacy "major" security module to
+			enable at boot. This has been deprecated by the
+			"lsm=" parameter.
 
 	selinux=	[SELINUX] Disable or enable SELinux at boot time.
 			Format: { "0" | "1" }
diff --git a/Documentation/cgroup-v1/pids.txt b/Documentation/cgroup-v1/pids.txt
index 1a078b5d281a..e105d708ccde 100644
--- a/Documentation/cgroup-v1/pids.txt
+++ b/Documentation/cgroup-v1/pids.txt
@@ -33,6 +33,9 @@ limit in the hierarchy is followed).
 pids.current tracks all child cgroup hierarchies, so parent/pids.current is a
 superset of parent/child/pids.current.
 
+The pids.events file contains event counters:
+  - max: Number of times fork failed because limit was hit.
+
 Example
 -------
 
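
The pids.events file added above can be read like any other cgroup control
file. A minimal sketch follows; the mount point /sys/fs/cgroup/pids and the
group name "parent" are assumptions for illustration, not part of the patch.

    /* Hedged sketch: print the pids.events counters for one cgroup. */
    #include <stdio.h>

    int main(void)
    {
            char line[64];
            FILE *f = fopen("/sys/fs/cgroup/pids/parent/pids.events", "r");

            if (!f) {
                    perror("fopen");  /* path above is an assumption */
                    return 1;
            }
            while (fgets(line, sizeof(line), f))
                    fputs(line, stdout);  /* e.g. "max 0" if never hit */
            fclose(f);
            return 0;
    }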
diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt
index 9ccfd1bc6201..a5cbb5e0e3db 100644
--- a/Documentation/filesystems/xfs.txt
+++ b/Documentation/filesystems/xfs.txt
@@ -272,7 +272,7 @@ The following sysctls are available for the XFS filesystem:
 	XFS_ERRLEVEL_LOW:	1
 	XFS_ERRLEVEL_HIGH:	5
 
-  fs.xfs.panic_mask		(Min: 0  Default: 0  Max: 255)
+  fs.xfs.panic_mask		(Min: 0  Default: 0  Max: 256)
 	Causes certain error conditions to call BUG(). Value is a bitmask;
 	OR together the tags which represent errors which should cause panics:
 
@@ -285,6 +285,7 @@ The following sysctls are available for the XFS filesystem:
 	XFS_PTAG_SHUTDOWN_IOERROR	0x00000020
 	XFS_PTAG_SHUTDOWN_LOGERROR	0x00000040
 	XFS_PTAG_FSBLOCK_ZERO		0x00000080
+	XFS_PTAG_VERIFIER_ERROR		0x00000100
 
 	This option is intended for debugging only.
 
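
Since fs.xfs.panic_mask is a bitmask built by ORing the XFS_PTAG_* tags, a
small sketch of the arithmetic may help. The tag values are taken from the
table in the diff above; the program only prints the number one would write
to the sysctl.

    /* Minimal sketch: a panic_mask covering I/O-error and log-error
     * shutdowns plus the new verifier-error tag added above. */
    #include <stdio.h>

    #define XFS_PTAG_SHUTDOWN_IOERROR	0x00000020
    #define XFS_PTAG_SHUTDOWN_LOGERROR	0x00000040
    #define XFS_PTAG_VERIFIER_ERROR	0x00000100

    int main(void)
    {
            unsigned int mask = XFS_PTAG_SHUTDOWN_IOERROR |
                                XFS_PTAG_SHUTDOWN_LOGERROR |
                                XFS_PTAG_VERIFIER_ERROR;

            /* 0x160 == 352; write this value to fs.xfs.panic_mask */
            printf("fs.xfs.panic_mask = %u (0x%x)\n", mask, mask);
            return 0;
    }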
diff --git a/Documentation/kdump/vmcoreinfo.txt b/Documentation/kdump/vmcoreinfo.txt
new file mode 100644
index 000000000000..bb94a4bd597a
--- /dev/null
+++ b/Documentation/kdump/vmcoreinfo.txt
@@ -0,0 +1,495 @@
+================================================================
+			VMCOREINFO
+================================================================
+
+===========
+What is it?
+===========
+
+VMCOREINFO is a special ELF note section. It contains various
+information from the kernel like structure size, page size, symbol
+values, field offsets, etc. These data are packed into an ELF note
+section and used by user-space tools like crash and makedumpfile to
+analyze a kernel's memory layout.
+
+================
+Common variables
+================
+
+init_uts_ns.name.release
+------------------------
+
+The version of the Linux kernel. Used to find the corresponding source
+code from which the kernel has been built. For example, crash uses it to
+find the corresponding vmlinux in order to process vmcore.
+
+PAGE_SIZE
+---------
+
+The size of a page. It is the smallest unit of data used by the memory
+management facilities. It is usually 4096 bytes in size and a page is
+aligned on 4096 bytes. Used for computing page addresses.
+
+init_uts_ns
+-----------
+
+The UTS namespace which is used to isolate two specific elements of the
+system that relate to the uname(2) system call. It is named after the
+data structure used to store information returned by the uname(2) system
+call.
+
+User-space tools can get the kernel name, host name, kernel release
+number, kernel version, architecture name and OS type from it.
+
+node_online_map
+---------------
+
+An array node_states[N_ONLINE] which represents the set of online nodes
+in a system, one bit position per node number. Used to keep track of
+which nodes are in the system and online.
+
+swapper_pg_dir
+--------------
+
+The global page directory pointer of the kernel. Used to translate
+virtual to physical addresses.
+
+_stext
+------
+
+Defines the beginning of the text section. In general, _stext indicates
+the kernel start address. Used to convert a virtual address from the
+direct kernel map to a physical address.
+
+vmap_area_list
+--------------
+
+Stores the virtual area list. makedumpfile gets the vmalloc start value
+from this variable and its value is necessary for vmalloc translation.
+
+mem_map
+-------
+
+Physical addresses are translated to struct pages by treating them as
+an index into the mem_map array. Right-shifting a physical address
+PAGE_SHIFT bits converts it into a page frame number which is an index
+into that mem_map array.
+
+Used to map an address to the corresponding struct page.
+
+contig_page_data
+----------------
+
+Makedumpfile gets the pglist_data structure from this symbol, which is
+used to describe the memory layout.
+
+User-space tools use this to exclude free pages when dumping memory.
+
+mem_section|(mem_section, NR_SECTION_ROOTS)|(mem_section, section_mem_map)
+--------------------------------------------------------------------------
+
+The address of the mem_section array, its length, structure size, and
+the section_mem_map offset.
+
+It exists in the sparse memory mapping model, and it is also somewhat
+similar to the mem_map variable; both of them are used to translate an
+address.
+
+page
+----
+
+The size of a page structure. struct page is an important data structure
+and it is widely used to compute contiguous memory.
+
+pglist_data
+-----------
+
+The size of a pglist_data structure. This value is used to check if the
+pglist_data structure is valid. It is also used for checking the memory
+type.
+
+zone
+----
+
+The size of a zone structure. This value is used to check if the zone
+structure has been found. It is also used for excluding free pages.
+
+free_area
+---------
+
+The size of a free_area structure. It indicates whether the free_area
+structure is valid or not. Useful when excluding free pages.
+
+list_head
+---------
+
+The size of a list_head structure. Used when iterating lists in a
+post-mortem analysis session.
+
+nodemask_t
+----------
+
+The size of a nodemask_t type. Used to compute the number of online
+nodes.
+
+(page, flags|_refcount|mapping|lru|_mapcount|private|compound_dtor|
+ compound_order|compound_head)
+-------------------------------------------------------------------
+
+User-space tools compute their values based on the offset of these
+variables. The variables are used when excluding unnecessary pages.
+
+(pglist_data, node_zones|nr_zones|node_mem_map|node_start_pfn|node_
+ spanned_pages|node_id)
+-------------------------------------------------------------------
+
+On NUMA machines, each NUMA node has a pg_data_t to describe its memory
+layout. On UMA machines there is a single pglist_data which describes the
+whole memory.
+
+These values are used to check the memory type and to compute the
+virtual address for memory map.
+
+(zone, free_area|vm_stat|spanned_pages)
+---------------------------------------
+
+Each node is divided into a number of blocks called zones which
+represent ranges within memory. A zone is described by a structure zone.
+
+User-space tools compute required values based on the offset of these
+variables.
+
+(free_area, free_list)
+----------------------
+
+Offset of the free_list's member. This value is used to compute the number
+of free pages.
+
+Each zone has a free_area structure array called free_area[MAX_ORDER].
+The free_list represents a linked list of free page blocks.
+
+(list_head, next|prev)
+----------------------
+
+Offsets of the list_head's members. list_head is used to define a
+circular linked list. User-space tools need these in order to traverse
+lists.
+
+(vmap_area, va_start|list)
+--------------------------
+
+Offsets of the vmap_area's members. They carry vmalloc-specific
+information. Makedumpfile gets the start address of the vmalloc region
+from this.
+
+(zone.free_area, MAX_ORDER)
+---------------------------
+
+Free areas descriptor. User-space tools use this value to iterate the
+free_area ranges. MAX_ORDER is used by the zone buddy allocator.
+
+log_first_idx
+-------------
+
+Index of the first record stored in the buffer log_buf. Used by
+user-space tools to read the strings in the log_buf.
+
+log_buf
+-------
+
+Console output is written to the ring buffer log_buf at index
+log_first_idx. Used to get the kernel log.
+
+log_buf_len
+-----------
+
+log_buf's length.
+
+clear_idx
+---------
+
+The index of the next printk() record to read after the last clear
+command. It indicates the first record after the last
+SYSLOG_ACTION_CLEAR, as issued by 'dmesg -c'. Used by user-space
+tools to dump the dmesg log.
+
+log_next_idx
+------------
+
+The index of the next record to store in the buffer log_buf. Used to
+compute the index of the current buffer position.
+
+printk_log
+----------
+
+The size of a structure printk_log. Used to compute the size of
+messages, and extract dmesg log. It encapsulates header information for
+log_buf, such as timestamp, syslog level, etc.
+
+(printk_log, ts_nsec|len|text_len|dict_len)
+-------------------------------------------
+
+It represents field offsets in struct printk_log. User space tools
+parse it and check whether the values of printk_log's members have been
+changed.
+
+(free_area.free_list, MIGRATE_TYPES)
+------------------------------------
+
+The number of migrate types for pages. The free_list is described by the
+array. Used by tools to compute the number of free pages.
+
+NR_FREE_PAGES
+-------------
+
+On linux-2.6.21 or later, the number of free pages is in
+vm_stat[NR_FREE_PAGES]. Used to get the number of free pages.
+
+PG_lru|PG_private|PG_swapcache|PG_swapbacked|PG_slab|PG_hwpoison
+|PG_head_mask|PAGE_BUDDY_MAPCOUNT_VALUE(~PG_buddy)
+|PAGE_OFFLINE_MAPCOUNT_VALUE(~PG_offline)
+-----------------------------------------------------------------
+
+Page attributes. These flags are used to filter out pages that are
+unnecessary for dumping.
+
+HUGETLB_PAGE_DTOR
+-----------------
+
+The HUGETLB_PAGE_DTOR flag denotes hugetlbfs pages. Makedumpfile
+excludes these pages.
+
+======
+x86_64
+======
+
+phys_base
+---------
+
+Used to convert the virtual address of an exported kernel symbol to its
+corresponding physical address.
+
+init_top_pgt
+------------
+
+Used to walk through the whole page table and convert virtual addresses
+to physical addresses. The init_top_pgt is somewhat similar to
+swapper_pg_dir, but it is only used in x86_64.
+
+pgtable_l5_enabled
+------------------
+
+User-space tools need to know whether the crash kernel was in 5-level
+paging mode.
+
+node_data
+---------
+
+This is a struct pglist_data array and stores all NUMA nodes
+information. Makedumpfile gets the pglist_data structure from it.
+
+(node_data, MAX_NUMNODES)
+-------------------------
+
+The maximum number of nodes in the system.
+
+KERNELOFFSET
+------------
+
+The kernel randomization offset. Used to compute the page offset. If
+KASLR is disabled, this value is zero.
+
+KERNEL_IMAGE_SIZE
+-----------------
+
+Currently unused by Makedumpfile. Used to compute the module virtual
+address by Crash.
+
+sme_mask
+--------
+
+AMD-specific with SME support: it indicates the secure memory encryption
+mask. Makedumpfile tools need to know whether the crash kernel was
+encrypted. If SME is enabled in the first kernel, the crash kernel's
+page table entries (pgd/pud/pmd/pte) contain the memory encryption
+mask. This is used to remove the SME mask and obtain the true physical
+address.
+
+Currently, sme_mask stores the value of the C-bit position. If needed,
+additional SME-relevant info can be placed in that variable.
+
+For example:
+[ misc ][ enc bit ][ other misc SME info ]
+0000_0000_0000_0000_1000_0000_0000_0000_0000_0000_..._0000
+63   59   55   51   47   43   39   35   31   27  ...  3
+
+======
+x86_32
+======
+
+X86_PAE
+-------
+
+Denotes whether physical address extensions are enabled. It has the cost
+of a higher page table lookup overhead, and also consumes more page
+table space per process. Used to check whether PAE was enabled in the
+crash kernel when converting virtual addresses to physical addresses.
+
+====
+ia64
+====
+
+pgdat_list|(pgdat_list, MAX_NUMNODES)
+-------------------------------------
+
+pg_data_t array storing all NUMA nodes information. MAX_NUMNODES
+indicates the number of the nodes.
+
+node_memblk|(node_memblk, NR_NODE_MEMBLKS)
+------------------------------------------
+
+List of node memory chunks. Filled when parsing the SRAT table to obtain
+information about memory nodes. NR_NODE_MEMBLKS indicates the number of
+node memory chunks.
+
+These values are used to compute the number of nodes the crashed kernel used.
+
+node_memblk_s|(node_memblk_s, start_paddr)|(node_memblk_s, size)
+----------------------------------------------------------------
+
+The size of a struct node_memblk_s and the offsets of the
+node_memblk_s's members. Used to compute the number of nodes.
+
+PGTABLE_3|PGTABLE_4
+-------------------
+
+User-space tools need to know whether the crash kernel was in 3-level or
+4-level paging mode. Used to distinguish the page table.
+
+=====
+ARM64
+=====
+
+VA_BITS
+-------
+
+The maximum number of bits for virtual addresses. Used to compute the
+virtual memory ranges.
+
+kimage_voffset
+--------------
+
+The offset between the kernel virtual and physical mappings. Used to
+translate virtual to physical addresses.
+
+PHYS_OFFSET
+-----------
+
+Indicates the physical address of the start of memory. Similar to
+kimage_voffset, which is used to translate virtual to physical
+addresses.
+
+KERNELOFFSET
+------------
+
+The kernel randomization offset. Used to compute the page offset. If
+KASLR is disabled, this value is zero.
+
+====
+arm
+====
+
+ARM_LPAE
+--------
+
+It indicates whether the crash kernel supports large physical address
+extensions. Used to translate virtual to physical addresses.
+
+====
+s390
+====
+
+lowcore_ptr
+-----------
+
+An array with a pointer to the lowcore of every CPU. Used to print the
+psw and all registers information.
+
+high_memory
+-----------
+
+Used to get the vmalloc_start address from the high_memory symbol.
+
+(lowcore_ptr, NR_CPUS)
+----------------------
+
+The maximum number of CPUs.
+
+=======
+powerpc
+=======
+
+
+node_data|(node_data, MAX_NUMNODES)
+-----------------------------------
+
+See above.
+
+contig_page_data
+----------------
+
+See above.
+
+vmemmap_list
+------------
+
+The vmemmap_list maintains the entire vmemmap physical mapping. Used
+to get vmemmap list count and populated vmemmap regions info. If the
+vmemmap address translation information is stored in the crash kernel,
+it is used to translate vmemmap kernel virtual addresses.
+
+mmu_vmemmap_psize
+-----------------
+
+The size of a page. Used to translate virtual to physical addresses.
+
+mmu_psize_defs
+--------------
+
+Page size definitions, i.e. 4k, 64k, or 16M.
+
+Used to make vtop translations.
+
+vmemmap_backing|(vmemmap_backing, list)|(vmemmap_backing, phys)|
+(vmemmap_backing, virt_addr)
+----------------------------------------------------------------
+
+The vmemmap virtual address space management does not have a traditional
+page table to track which virtual struct pages are backed by a physical
+mapping. The virtual to physical mappings are tracked in a simple linked
+list format.
+
+User-space tools need to know the offset of list, phys and virt_addr
+when computing the count of vmemmap regions.
+
+mmu_psize_def|(mmu_psize_def, shift)
+------------------------------------
+
+The size of a struct mmu_psize_def and the offset of mmu_psize_def's
+member.
+
+Used in vtop translations.
+
+==
+sh
+==
+
+node_data|(node_data, MAX_NUMNODES)
+-----------------------------------
+
+See above.
+
+X2TLB
+-----
+
+Indicates whether the crashed kernel enabled SH extended mode.
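
The mem_map description in the file above boils down to one piece of
arithmetic: shift a physical address right by PAGE_SHIFT to get a page frame
number, then use that number as an index into the flat mem_map array. A
minimal userspace sketch of that computation follows (not part of the patch);
the physical address is an arbitrary example, and PAGE_SHIFT of 12 matches
the usual 4096-byte page noted under PAGE_SIZE.

    /* Sketch of the paddr -> PFN translation used by dump tools. */
    #include <stdio.h>

    #define PAGE_SHIFT 12  /* 4096-byte pages */

    int main(void)
    {
            unsigned long paddr = 0x12345678UL;       /* made-up address */
            unsigned long pfn = paddr >> PAGE_SHIFT;  /* index into mem_map */

            printf("paddr 0x%lx -> pfn 0x%lx\n", paddr, pfn);
            return 0;
    }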
diff --git a/Documentation/xtensa/booting.txt b/Documentation/xtensa/booting.txt
new file mode 100644
index 000000000000..402b33a2619f
--- /dev/null
+++ b/Documentation/xtensa/booting.txt
@@ -0,0 +1,19 @@
+Passing boot parameters to the kernel.
+
+Boot parameters are represented as a TLV list in memory. Please see
+arch/xtensa/include/asm/bootparam.h for the definition of the bp_tag structure
+and tag value constants. The first entry in the list must have type BP_TAG_FIRST,
+the last entry must have type BP_TAG_LAST. The address of the first list entry is
+passed to the kernel in the register a2. The address type depends on the MMU type:
+- For configurations without MMU, with region protection or with MPU the
+  address must be the physical address.
+- For configurations with region translation MMU or with MMUv3 and CONFIG_MMU=n
+  the address must be a valid address in the current mapping. The kernel will
+  not change the mapping on its own.
+- For configurations with MMUv2 the address must be a virtual address in the
+  default virtual mapping (0xd0000000..0xffffffff).
+- For configurations with MMUv3 and CONFIG_MMU=y the address may be either a
+  virtual or physical address. In either case it must be within the default
+  virtual mapping. It is considered physical if it is within the range of
+  physical addresses covered by the default KSEG mapping (XCHAL_KSEG_PADDR..
+  XCHAL_KSEG_PADDR + XCHAL_KSEG_SIZE), otherwise it is considered virtual.
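
To make the TLV layout described above concrete, here is a hedged sketch of
walking such a list (not part of the patch). The field names, widths, and tag
values below are placeholders for illustration only; the authoritative
definitions live in arch/xtensa/include/asm/bootparam.h.

    /* Hypothetical bp_tag walker; layout and values are assumptions. */
    #include <stdint.h>
    #include <stdio.h>

    struct bp_tag {
            uint16_t id;      /* tag type */
            uint16_t size;    /* payload size in bytes */
            uint8_t  data[];  /* tag payload */
    };

    #define BP_TAG_FIRST 0x1000  /* placeholder value */
    #define BP_TAG_LAST  0x1001  /* placeholder value */

    static void walk_bootparams(const struct bp_tag *tag)
    {
            /* The list starts with BP_TAG_FIRST and ends with BP_TAG_LAST. */
            while (tag->id != BP_TAG_LAST) {
                    printf("tag 0x%x, %u bytes\n", tag->id, tag->size);
                    tag = (const struct bp_tag *)
                            ((const uint8_t *)(tag + 1) + tag->size);
            }
    }

    int main(void)
    {
            /* A minimal two-entry list: FIRST (empty payload), then LAST. */
            uint16_t buf[4] = { BP_TAG_FIRST, 0, BP_TAG_LAST, 0 };

            walk_bootparams((const struct bp_tag *)buf);
            return 0;
    }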
diff --git a/MAINTAINERS b/MAINTAINERS
index 3e90641e012e..fce33cc179b0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3971,9 +3971,10 @@ M:	Johannes Weiner <hannes@cmpxchg.org>
 L:	cgroups@vger.kernel.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git
 S:	Maintained
-F:	Documentation/cgroup*
+F:	Documentation/admin-guide/cgroup-v2.rst
+F:	Documentation/cgroup-v1/
 F:	include/linux/cgroup*
-F:	kernel/cgroup*
+F:	kernel/cgroup/
 
 CONTROL GROUP - CPUSET
 M:	Li Zefan <lizefan@huawei.com>
@@ -5948,6 +5949,7 @@ L:	linux-fsdevel@vger.kernel.org
 S:	Maintained
 F:	fs/*
 F:	include/linux/fs.h
+F:	include/linux/fs_types.h
 F:	include/uapi/linux/fs.h
 
 FINTEK F75375S HARDWARE MONITOR AND FAN CONTROLLER DRIVER
@@ -15556,12 +15558,11 @@ F:	mm/shmem.c
 TOMOYO SECURITY MODULE
 M:	Kentaro Takeda <takedakn@nttdata.co.jp>
 M:	Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
-L:	tomoyo-dev-en@lists.sourceforge.jp (subscribers-only, for developers in English)
-L:	tomoyo-users-en@lists.sourceforge.jp (subscribers-only, for users in English)
-L:	tomoyo-dev@lists.sourceforge.jp (subscribers-only, for developers in Japanese)
-L:	tomoyo-users@lists.sourceforge.jp (subscribers-only, for users in Japanese)
-W:	http://tomoyo.sourceforge.jp/
-T:	quilt http://svn.sourceforge.jp/svnroot/tomoyo/trunk/2.5.x/tomoyo-lsm/patches/
+L:	tomoyo-dev-en@lists.osdn.me (subscribers-only, for developers in English)
+L:	tomoyo-users-en@lists.osdn.me (subscribers-only, for users in English)
+L:	tomoyo-dev@lists.osdn.me (subscribers-only, for developers in Japanese)
+L:	tomoyo-users@lists.osdn.me (subscribers-only, for users in Japanese)
+W:	https://tomoyo.osdn.jp/
 S:	Maintained
 F:	security/tomoyo/
 
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 7deb3ea2dd3f..b5dce13a6132 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -119,9 +119,6 @@ config GENERIC_HWEIGHT
119 bool 119 bool
120 default y 120 default y
121 121
122config ARCH_HAS_DMA_SET_COHERENT_MASK
123 bool
124
125config PPC 122config PPC
126 bool 123 bool
127 default y 124 default y
@@ -131,10 +128,10 @@ config PPC
131 select ARCH_32BIT_OFF_T if PPC32 128 select ARCH_32BIT_OFF_T if PPC32
132 select ARCH_HAS_DEBUG_VIRTUAL 129 select ARCH_HAS_DEBUG_VIRTUAL
133 select ARCH_HAS_DEVMEM_IS_ALLOWED 130 select ARCH_HAS_DEVMEM_IS_ALLOWED
134 select ARCH_HAS_DMA_SET_COHERENT_MASK
135 select ARCH_HAS_ELF_RANDOMIZE 131 select ARCH_HAS_ELF_RANDOMIZE
136 select ARCH_HAS_FORTIFY_SOURCE 132 select ARCH_HAS_FORTIFY_SOURCE
137 select ARCH_HAS_GCOV_PROFILE_ALL 133 select ARCH_HAS_GCOV_PROFILE_ALL
134 select ARCH_HAS_KCOV
138 select ARCH_HAS_PHYS_TO_DMA 135 select ARCH_HAS_PHYS_TO_DMA
139 select ARCH_HAS_PMEM_API if PPC64 136 select ARCH_HAS_PMEM_API if PPC64
140 select ARCH_HAS_PTE_SPECIAL 137 select ARCH_HAS_PTE_SPECIAL
@@ -203,7 +200,7 @@ config PPC
203 select HAVE_IOREMAP_PROT 200 select HAVE_IOREMAP_PROT
204 select HAVE_IRQ_EXIT_ON_IRQ_STACK 201 select HAVE_IRQ_EXIT_ON_IRQ_STACK
205 select HAVE_KERNEL_GZIP 202 select HAVE_KERNEL_GZIP
206 select HAVE_KERNEL_XZ if PPC_BOOK3S 203 select HAVE_KERNEL_XZ if PPC_BOOK3S || 44x
207 select HAVE_KPROBES 204 select HAVE_KPROBES
208 select HAVE_KPROBES_ON_FTRACE 205 select HAVE_KPROBES_ON_FTRACE
209 select HAVE_KRETPROBES 206 select HAVE_KRETPROBES
@@ -222,7 +219,7 @@ config PPC
 	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_RCU_TABLE_FREE if SMP
 	select HAVE_REGS_AND_STACK_ACCESS_API
-	select HAVE_RELIABLE_STACKTRACE if PPC64 && CPU_LITTLE_ENDIAN
+	select HAVE_RELIABLE_STACKTRACE if PPC_BOOK3S_64 && CPU_LITTLE_ENDIAN
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_VIRT_CPU_ACCOUNTING
 	select HAVE_IRQ_TIME_ACCOUNTING
@@ -243,6 +240,7 @@ config PPC
 	select RTC_LIB
 	select SPARSE_IRQ
 	select SYSCTL_EXCEPTION_TRACE
+	select THREAD_INFO_IN_TASK
 	select VIRT_TO_BUS if !PPC64
 	#
 	# Please keep this list sorted alphabetically.
@@ -253,9 +251,6 @@ config PPC_BARRIER_NOSPEC
 	default y
 	depends on PPC_BOOK3S_64 || PPC_FSL_BOOK3E
 
-config GENERIC_CSUM
-	def_bool n
-
 config EARLY_PRINTK
 	bool
 	default y
@@ -475,9 +470,6 @@ config ARCH_CPU_PROBE_RELEASE
 config ARCH_ENABLE_MEMORY_HOTPLUG
 	def_bool y
 
-config ARCH_HAS_WALK_MEMORY
-	def_bool y
-
 config ARCH_ENABLE_MEMORY_HOTREMOVE
 	def_bool y
 
@@ -693,7 +685,7 @@ config PPC_16K_PAGES
 
 config PPC_64K_PAGES
 	bool "64k page size"
-	depends on !PPC_FSL_BOOK3E && (44x || PPC_BOOK3S_64 || PPC_BOOK3E_64)
+	depends on 44x || PPC_BOOK3S_64
 	select HAVE_ARCH_SOFT_DIRTY if PPC_BOOK3S_64
 
 config PPC_256K_PAGES
@@ -711,6 +703,13 @@ config PPC_256K_PAGES
 
 endchoice
 
+config PPC_PAGE_SHIFT
+	int
+	default 18 if PPC_256K_PAGES
+	default 16 if PPC_64K_PAGES
+	default 14 if PPC_16K_PAGES
+	default 12
+
 config THREAD_SHIFT
 	int "Thread shift" if EXPERT
 	range 13 15
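The new PPC_PAGE_SHIFT symbol replaces an #ifdef ladder in asm/page.h (see the page.h hunk further down): PAGE_SIZE simply becomes 1 << PPC_PAGE_SHIFT. A minimal userspace sketch of the resulting mapping, using the default shift values from the Kconfig block above (illustrative only, not part of the patch):

#include <stdio.h>

int main(void)
{
	/* Shift values selected by the 4K/16K/64K/256K page configs. */
	static const int page_shift[] = { 12, 14, 16, 18 };

	for (unsigned int i = 0; i < 4; i++)
		printf("PPC_PAGE_SHIFT=%d -> %4luK pages\n", page_shift[i],
		       (1UL << page_shift[i]) >> 10);
	return 0;	/* prints 4K, 16K, 64K and 256K */
}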
@@ -721,6 +720,59 @@ config THREAD_SHIFT
 	  Used to define the stack size. The default is almost always what you
 	  want. Only change this if you know what you are doing.
 
+config ETEXT_SHIFT_BOOL
+	bool "Set custom etext alignment" if STRICT_KERNEL_RWX && \
+					     (PPC_BOOK3S_32 || PPC_8xx)
+	depends on ADVANCED_OPTIONS
+	help
+	  This option allows you to set the kernel end of text alignment. When
+	  RAM is mapped by blocks, the alignment needs to fit the size and
+	  number of possible blocks. The default should be OK for most configs.
+
+	  Say N here unless you know what you are doing.
+
+config ETEXT_SHIFT
+	int "_etext shift" if ETEXT_SHIFT_BOOL
+	range 17 28 if STRICT_KERNEL_RWX && PPC_BOOK3S_32
+	range 19 23 if STRICT_KERNEL_RWX && PPC_8xx
+	default 17 if STRICT_KERNEL_RWX && PPC_BOOK3S_32
+	default 19 if STRICT_KERNEL_RWX && PPC_8xx
+	default PPC_PAGE_SHIFT
+	help
+	  On Book3S 32 (603+), IBATs are used to map kernel text.
+	  Smaller is the alignment, greater is the number of necessary IBATs.
+
+	  On 8xx, large pages (512kb or 8M) are used to map kernel linear
+	  memory. Aligning to 8M reduces TLB misses as only 8M pages are used
+	  in that case.
+
+config DATA_SHIFT_BOOL
+	bool "Set custom data alignment" if STRICT_KERNEL_RWX && \
+					     (PPC_BOOK3S_32 || PPC_8xx)
+	depends on ADVANCED_OPTIONS
+	help
+	  This option allows you to set the kernel data alignment. When
+	  RAM is mapped by blocks, the alignment needs to fit the size and
+	  number of possible blocks. The default should be OK for most configs.
+
+	  Say N here unless you know what you are doing.
+
+config DATA_SHIFT
+	int "Data shift" if DATA_SHIFT_BOOL
+	default 24 if STRICT_KERNEL_RWX && PPC64
+	range 17 28 if STRICT_KERNEL_RWX && PPC_BOOK3S_32
+	range 19 23 if STRICT_KERNEL_RWX && PPC_8xx
+	default 22 if STRICT_KERNEL_RWX && PPC_BOOK3S_32
+	default 23 if STRICT_KERNEL_RWX && PPC_8xx
+	default PPC_PAGE_SHIFT
+	help
+	  On Book3S 32 (603+), DBATs are used to map kernel text and rodata RO.
+	  Smaller is the alignment, greater is the number of necessary DBATs.
+
+	  On 8xx, large pages (512kb or 8M) are used to map kernel linear
+	  memory. Aligning to 8M reduces TLB misses as only 8M pages are used
+	  in that case.
+
 config FORCE_MAX_ZONEORDER
 	int "Maximum zone order"
 	range 8 9 if PPC64 && PPC_64K_PAGES
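As a worked example of the shifts above: on 8xx the DATA_SHIFT default of 23 gives 1 << 23 = 8M alignment, matching the 8M large pages the help text mentions, while the minimum of 19 gives 1 << 19 = 512k; on Book3S 32 the default of 22 aligns data to 4M, so fewer but larger DBATs are needed than at the 17 (128k) lower bound.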
@@ -887,6 +939,7 @@ config FSL_SOC
 
 config FSL_PCI
 	bool
+	select ARCH_HAS_DMA_SET_MASK
 	select PPC_INDIRECT_PCI
 	select PCI_QUIRKS
 
diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug
index f4961fbcb48d..4e00cb0a5464 100644
--- a/arch/powerpc/Kconfig.debug
+++ b/arch/powerpc/Kconfig.debug
@@ -361,10 +361,6 @@ config PPC_PTDUMP
 
 	  If you are unsure, say N.
 
-config PPC_HTDUMP
-	def_bool y
-	depends on PPC_PTDUMP && PPC_BOOK3S_64
-
 config PPC_FAST_ENDIAN_SWITCH
 	bool "Deprecated fast endian-switch syscall"
 	depends on DEBUG_KERNEL && PPC_BOOK3S_64
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 488c9edffa58..7de49889bd5d 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -213,9 +213,9 @@ endif
 asinstr := $(call as-instr,lis 9$(comma)foo@high,-DHAVE_AS_ATHIGH=1)
 
 KBUILD_CPPFLAGS	+= -Iarch/$(ARCH) $(asinstr)
-KBUILD_AFLAGS	+= -Iarch/$(ARCH) $(AFLAGS-y)
+KBUILD_AFLAGS	+= $(AFLAGS-y)
 KBUILD_CFLAGS	+= $(call cc-option,-msoft-float)
-KBUILD_CFLAGS	+= -pipe -Iarch/$(ARCH) $(CFLAGS-y)
+KBUILD_CFLAGS	+= -pipe $(CFLAGS-y)
 CPP		= $(CC) -E $(KBUILD_CFLAGS)
 
 CHECKFLAGS	+= -m$(BITS) -D__powerpc__ -D__powerpc$(BITS)__
@@ -427,6 +427,13 @@ else
 endif
 endif
 
+ifdef CONFIG_SMP
+prepare: task_cpu_prepare
+
+task_cpu_prepare: prepare0
+	$(eval KBUILD_CFLAGS += -D_TASK_CPU=$(shell awk '{if ($$2 == "TASK_CPU") print $$3;}' include/generated/asm-offsets.h))
+endif
+
 # Check toolchain versions:
 # - gcc-4.6 is the minimum kernel-wide version so nothing required.
 checkbin:
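The task_cpu_prepare rule above runs after prepare0 (which generates include/generated/asm-offsets.h) and scrapes the TASK_CPU offset back out of that header. If the generated file contains, say, "#define TASK_CPU 2444 /* offsetof(struct task_struct, cpu) */" (the value here is purely illustrative), the awk invocation prints 2444 and -D_TASK_CPU=2444 is appended to KBUILD_CFLAGS, presumably so code that cannot include asm-offsets.h directly can still locate task_struct::cpu.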
diff --git a/arch/powerpc/boot/dts/Makefile b/arch/powerpc/boot/dts/Makefile
index fb335d05aae8..1cbc0e4ce857 100644
--- a/arch/powerpc/boot/dts/Makefile
+++ b/arch/powerpc/boot/dts/Makefile
@@ -4,3 +4,4 @@ subdir-y += fsl
 
 dtstree		:= $(srctree)/$(src)
 dtb-$(CONFIG_OF_ALL_DTBS) := $(patsubst $(dtstree)/%.dts,%.dtb, $(wildcard $(dtstree)/*.dts))
+dtb-$(CONFIG_XILINX_VIRTEX440_GENERIC_BOARD) += virtex440-ml507.dtb virtex440-ml510.dtb
diff --git a/arch/powerpc/boot/dts/akebono.dts b/arch/powerpc/boot/dts/akebono.dts
index 8a7a10139bc9..cd9d66041a3f 100644
--- a/arch/powerpc/boot/dts/akebono.dts
+++ b/arch/powerpc/boot/dts/akebono.dts
@@ -40,7 +40,7 @@
 			d-cache-size = <32768>;
 			dcr-controller;
 			dcr-access-method = "native";
-			status = "ok";
+			status = "okay";
 		};
 		cpu@1 {
 			device_type = "cpu";
diff --git a/arch/powerpc/boot/dts/bluestone.dts b/arch/powerpc/boot/dts/bluestone.dts
index b0b26d8d68a2..64eaf7e09d22 100644
--- a/arch/powerpc/boot/dts/bluestone.dts
+++ b/arch/powerpc/boot/dts/bluestone.dts
@@ -109,7 +109,7 @@
 
 		OCM: ocm@400040000 {
 			compatible = "ibm,ocm";
-			status = "ok";
+			status = "okay";
 			cell-index = <1>;
 			/* configured in U-Boot */
 			reg = <4 0x00040000 0x8000>; /* 32K */
diff --git a/arch/powerpc/boot/dts/currituck.dts b/arch/powerpc/boot/dts/currituck.dts
index a04a4fcfde63..b6d87b9c2cef 100644
--- a/arch/powerpc/boot/dts/currituck.dts
+++ b/arch/powerpc/boot/dts/currituck.dts
@@ -39,7 +39,7 @@
 			d-cache-size = <32768>;
 			dcr-controller;
 			dcr-access-method = "native";
-			status = "ok";
+			status = "okay";
 		};
 		cpu@1 {
 			device_type = "cpu";
diff --git a/arch/powerpc/boot/dts/iss4xx-mpic.dts b/arch/powerpc/boot/dts/iss4xx-mpic.dts
index f7063198b2dc..c9f90f1a9c8e 100644
--- a/arch/powerpc/boot/dts/iss4xx-mpic.dts
+++ b/arch/powerpc/boot/dts/iss4xx-mpic.dts
@@ -43,7 +43,7 @@
 			d-cache-size = <32768>;
 			dcr-controller;
 			dcr-access-method = "native";
-			status = "ok";
+			status = "okay";
 		};
 		cpu@1 {
 			device_type = "cpu";
diff --git a/arch/powerpc/boot/dts/wii.dts b/arch/powerpc/boot/dts/wii.dts
index 104b1d6d5695..c406bdb4f36f 100644
--- a/arch/powerpc/boot/dts/wii.dts
+++ b/arch/powerpc/boot/dts/wii.dts
@@ -14,6 +14,7 @@
 
 /dts-v1/;
 #include <dt-bindings/gpio/gpio.h>
+#include <dt-bindings/input/input.h>
 
 /*
  * This is commented-out for now.
@@ -187,6 +188,11 @@
187 "DEBUG0", "DEBUG1", "DEBUG2", "DEBUG3", 188 "DEBUG0", "DEBUG1", "DEBUG2", "DEBUG3",
188 "DEBUG4", "DEBUG5", "DEBUG6", "DEBUG7"; 189 "DEBUG4", "DEBUG5", "DEBUG6", "DEBUG7";
189 190
191 interrupt-controller;
192 #interrupt-cells = <2>;
193 interrupts = <10>;
194 interrupt-parent = <&PIC1>;
195
190 /* 196 /*
191 * This is commented out while a standard binding 197 * This is commented out while a standard binding
192 * for i2c over gpio is defined. 198 * for i2c over gpio is defined.
@@ -235,5 +241,21 @@
 			panic-indicator;
 		};
 	};
+
+	gpio-keys {
+		compatible = "gpio-keys";
+
+		power {
+			label = "Power Button";
+			gpios = <&GPIO 0 GPIO_ACTIVE_HIGH>;
+			linux,code = <KEY_POWER>;
+		};
+
+		eject {
+			label = "Eject Button";
+			gpios = <&GPIO 6 GPIO_ACTIVE_HIGH>;
+			linux,code = <KEY_EJECTCD>;
+		};
+	};
 };
 
diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index 1d911f68a23b..296584e6dd55 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -23,8 +23,8 @@
 #include <uapi/asm/ucontext.h>
 
 /* SMP */
-extern struct thread_info *current_set[NR_CPUS];
-extern struct thread_info *secondary_ti;
+extern struct task_struct *current_set[NR_CPUS];
+extern struct task_struct *secondary_current;
 void start_secondary(void *unused);
 
 /* kexec */
@@ -37,13 +37,11 @@ void kexec_copy_flush(struct kimage *image);
 extern struct static_key hcall_tracepoint_key;
 void __trace_hcall_entry(unsigned long opcode, unsigned long *args);
 void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf);
-/* OPAL tracing */
-#ifdef CONFIG_JUMP_LABEL
-extern struct static_key opal_tracepoint_key;
-#endif
 
-void __trace_opal_entry(unsigned long opcode, unsigned long *args);
-void __trace_opal_exit(long opcode, unsigned long retval);
+/* OPAL */
+int64_t __opal_call(int64_t a0, int64_t a1, int64_t a2, int64_t a3,
+		    int64_t a4, int64_t a5, int64_t a6, int64_t a7,
+		    int64_t opcode, uint64_t msr);
 
 /* VMX copying */
 int enter_vmx_usercopy(void);
diff --git a/arch/powerpc/include/asm/book3s/32/mmu-hash.h b/arch/powerpc/include/asm/book3s/32/mmu-hash.h
index 0c261ba2c826..5cb588395fdc 100644
--- a/arch/powerpc/include/asm/book3s/32/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/32/mmu-hash.h
@@ -92,6 +92,8 @@ typedef struct {
 	unsigned long vdso_base;
 } mm_context_t;
 
+void update_bats(void);
+
 /* patch sites */
 extern s32 patch__hash_page_A0, patch__hash_page_A1, patch__hash_page_A2;
 extern s32 patch__hash_page_B, patch__hash_page_C;
diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h
index 49d76adb9bc5..aa8406b8f7ba 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -174,7 +174,18 @@ static inline bool pte_user(pte_t pte)
  * of RAM.  -- Cort
  */
 #define VMALLOC_OFFSET (0x1000000) /* 16M */
+
+/*
+ * With CONFIG_STRICT_KERNEL_RWX, kernel segments are set NX. But when modules
+ * are used, NX cannot be set on VMALLOC space. So vmalloc VM space and linear
+ * memory shall not share segments.
+ */
+#if defined(CONFIG_STRICT_KERNEL_RWX) && defined(CONFIG_MODULES)
+#define VMALLOC_START ((_ALIGN((long)high_memory, 256L << 20) + VMALLOC_OFFSET) & \
+		       ~(VMALLOC_OFFSET - 1))
+#else
 #define VMALLOC_START ((((long)high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1)))
+#endif
 #define VMALLOC_END	ioremap_bot
 
 #ifndef __ASSEMBLY__
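To see why the extra alignment matters, assume (for illustration) a 32-bit kernel with high_memory = 0xcc000000: the plain formula would start vmalloc at (0xcc000000 + 16M) & ~(16M - 1) = 0xcd000000, inside the same 256M segment as the top of the linear mapping, so that segment could not be made NX without breaking module text. The new formula first rounds up to the next 256M boundary, _ALIGN(0xcc000000, 256M) = 0xd0000000, then applies the 16M offset, giving VMALLOC_START = 0xd1000000 in a segment of its own.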
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index 247aff9cc6ba..54b7af6cd27f 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -40,22 +40,36 @@
 #else
 #define H_PUD_CACHE_INDEX	(H_PUD_INDEX_SIZE)
 #endif
+
 /*
- * Define the address range of the kernel non-linear virtual area
+ * Define the address range of the kernel non-linear virtual area. In contrast
+ * to the linear mapping, this is managed using the kernel page tables and then
+ * inserted into the hash page table to actually take effect, similarly to user
+ * mappings.
  */
 #define H_KERN_VIRT_START ASM_CONST(0xD000000000000000)
-#define H_KERN_VIRT_SIZE	ASM_CONST(0x0000400000000000) /* 64T */
 
 /*
- * The vmalloc space starts at the beginning of that region, and
- * occupies half of it on hash CPUs and a quarter of it on Book3E
- * (we keep a quarter for the virtual memmap)
+ * Allow virtual mapping of one context size.
+ * 512TB for 64K page size
+ * 64TB for 4K page size
+ */
+#define H_KERN_VIRT_SIZE (1UL << MAX_EA_BITS_PER_CONTEXT)
+
+/*
+ * 8TB IO mapping size
+ */
+#define H_KERN_IO_SIZE ASM_CONST(0x80000000000) /* 8T */
+
+/*
+ * The vmalloc space starts at the beginning of the kernel non-linear virtual
+ * region, and occupies 504T (64K) or 56T (4K)
  */
 #define H_VMALLOC_START	H_KERN_VIRT_START
-#define H_VMALLOC_SIZE	ASM_CONST(0x380000000000) /* 56T */
+#define H_VMALLOC_SIZE	(H_KERN_VIRT_SIZE - H_KERN_IO_SIZE)
 #define H_VMALLOC_END	(H_VMALLOC_START + H_VMALLOC_SIZE)
 
 #define H_KERN_IO_START	H_VMALLOC_END
 
 /*
  * Region IDs
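Worked through, and assuming MAX_EA_BITS_PER_CONTEXT is 49 with 64K pages and 46 with 4K pages (the values the 512TB/64TB comment implies): H_KERN_VIRT_SIZE = 1UL << 49 = 512T and H_VMALLOC_SIZE = 512T - 8T = 504T; with 4K pages, 1UL << 46 = 64T and 64T - 8T = 56T, exactly the 504T/56T split quoted in the vmalloc comment. H_KERN_IO_SIZE itself is ASM_CONST(0x80000000000) = 2^43 bytes = 8T.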
diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 12e522807f9f..a28a28079edb 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -23,7 +23,7 @@
  */
 #include <asm/book3s/64/pgtable.h>
 #include <asm/bug.h>
-#include <asm/processor.h>
+#include <asm/task_size_64.h>
 #include <asm/cpu_has_feature.h>
 
 /*
diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h
index 9c1173283b96..138bc2ecc0c4 100644
--- a/arch/powerpc/include/asm/book3s/64/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h
@@ -111,7 +111,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 
 static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
 {
-	pgd_set(pgd, __pgtable_ptr_val(pud) | PGD_VAL_BITS);
+	*pgd = __pgd(__pgtable_ptr_val(pud) | PGD_VAL_BITS);
 }
 
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
@@ -138,7 +138,7 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud)
 
 static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
 {
-	pud_set(pud, __pgtable_ptr_val(pmd) | PUD_VAL_BITS);
+	*pud = __pud(__pgtable_ptr_val(pmd) | PUD_VAL_BITS);
 }
 
 static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
@@ -176,13 +176,13 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
 static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
 				       pte_t *pte)
 {
-	pmd_set(pmd, __pgtable_ptr_val(pte) | PMD_VAL_BITS);
+	*pmd = __pmd(__pgtable_ptr_val(pte) | PMD_VAL_BITS);
 }
 
 static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
 				pgtable_t pte_page)
 {
-	pmd_set(pmd, __pgtable_ptr_val(pte_page) | PMD_VAL_BITS);
+	*pmd = __pmd(__pgtable_ptr_val(pte_page) | PMD_VAL_BITS);
 }
 
 static inline pgtable_t pmd_pgtable(pmd_t pmd)
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 868fcaf56f6b..581f91be9dd4 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -811,7 +811,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
 	return hash__set_pte_at(mm, addr, ptep, pte, percpu);
 }
 
-#define _PAGE_CACHE_CTL	(_PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT)
+#define _PAGE_CACHE_CTL	(_PAGE_SAO | _PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT)
 
 #define pgprot_noncached pgprot_noncached
 static inline pgprot_t pgprot_noncached(pgprot_t prot)
@@ -851,11 +851,6 @@ static inline bool pte_ci(pte_t pte)
 	return false;
 }
 
-static inline void pmd_set(pmd_t *pmdp, unsigned long val)
-{
-	*pmdp = __pmd(val);
-}
-
 static inline void pmd_clear(pmd_t *pmdp)
 {
 	*pmdp = __pmd(0);
@@ -887,11 +882,6 @@ static inline int pmd_bad(pmd_t pmd)
 	return hash__pmd_bad(pmd);
 }
 
-static inline void pud_set(pud_t *pudp, unsigned long val)
-{
-	*pudp = __pud(val);
-}
-
 static inline void pud_clear(pud_t *pudp)
 {
 	*pudp = __pud(0);
@@ -934,10 +924,6 @@ static inline bool pud_access_permitted(pud_t pud, bool write)
 }
 
 #define pgd_write(pgd)		pte_write(pgd_pte(pgd))
-static inline void pgd_set(pgd_t *pgdp, unsigned long val)
-{
-	*pgdp = __pgd(val);
-}
 
 static inline void pgd_clear(pgd_t *pgdp)
 {
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
index 671316f9e95d..05147cecb8df 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
@@ -13,8 +13,32 @@ static inline int mmu_get_ap(int psize)
 
 #ifdef CONFIG_PPC_RADIX_MMU
 extern void radix__tlbiel_all(unsigned int action);
+extern void radix__flush_tlb_lpid_page(unsigned int lpid,
+					unsigned long addr,
+					unsigned long page_size);
+extern void radix__flush_pwc_lpid(unsigned int lpid);
+extern void radix__flush_tlb_lpid(unsigned int lpid);
+extern void radix__local_flush_tlb_lpid_guest(unsigned int lpid);
 #else
 static inline void radix__tlbiel_all(unsigned int action) { WARN_ON(1); };
+static inline void radix__flush_tlb_lpid_page(unsigned int lpid,
+					      unsigned long addr,
+					      unsigned long page_size)
+{
+	WARN_ON(1);
+}
+static inline void radix__flush_pwc_lpid(unsigned int lpid)
+{
+	WARN_ON(1);
+}
+static inline void radix__flush_tlb_lpid(unsigned int lpid)
+{
+	WARN_ON(1);
+}
+static inline void radix__local_flush_tlb_lpid_guest(unsigned int lpid)
+{
+	WARN_ON(1);
+}
 #endif
 
 extern void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma,
@@ -49,12 +73,6 @@ extern void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr);
 extern void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr);
 extern void radix__flush_tlb_all(void);
 
-extern void radix__flush_tlb_lpid_page(unsigned int lpid,
-					unsigned long addr,
-					unsigned long page_size);
-extern void radix__flush_pwc_lpid(unsigned int lpid);
-extern void radix__flush_tlb_lpid(unsigned int lpid);
 extern void radix__local_flush_tlb_lpid(unsigned int lpid);
-extern void radix__local_flush_tlb_lpid_guest(unsigned int lpid);
 
 #endif
diff --git a/arch/powerpc/include/asm/checksum.h b/arch/powerpc/include/asm/checksum.h
index a78a57e5058d..72a65d744a28 100644
--- a/arch/powerpc/include/asm/checksum.h
+++ b/arch/powerpc/include/asm/checksum.h
@@ -9,9 +9,6 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#ifdef CONFIG_GENERIC_CSUM
-#include <asm-generic/checksum.h>
-#else
 #include <linux/bitops.h>
 #include <linux/in6.h>
 /*
@@ -217,6 +214,5 @@ __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 			const struct in6_addr *daddr,
 			__u32 len, __u8 proto, __wsum sum);
 
-#endif
 #endif /* __KERNEL__ */
 #endif
diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h
index 0245bfcaac32..a130be13ee83 100644
--- a/arch/powerpc/include/asm/device.h
+++ b/arch/powerpc/include/asm/device.h
@@ -20,6 +20,11 @@ struct iommu_table;
  */
 struct dev_archdata {
 	/*
+	 * Set to %true if the dma_iommu_ops are requested to use a direct
+	 * window instead of dynamically mapping memory.
+	 */
+	bool iommu_bypass : 1;
+	/*
 	 * These two used to be a union. However, with the hybrid ops we need
 	 * both so here we store both a DMA offset for direct mappings and
 	 * an iommu_table for remapped DMA.
@@ -33,9 +38,6 @@ struct dev_archdata {
 #ifdef CONFIG_IOMMU_API
 	void *iommu_domain;
 #endif
-#ifdef CONFIG_SWIOTLB
-	dma_addr_t max_direct_dma_addr;
-#endif
 #ifdef CONFIG_PPC64
 	struct pci_dn *pci_data;
 #endif
@@ -54,6 +56,4 @@ struct pdev_archdata {
 	u64 dma_mask;
 };
 
-#define ARCH_HAS_DMA_GET_REQUIRED_MASK
-
 #endif /* _ASM_POWERPC_DEVICE_H */
diff --git a/arch/powerpc/include/asm/dma-direct.h b/arch/powerpc/include/asm/dma-direct.h
index 7702875aabb7..a2912b47102c 100644
--- a/arch/powerpc/include/asm/dma-direct.h
+++ b/arch/powerpc/include/asm/dma-direct.h
@@ -4,26 +4,24 @@
 
 static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 {
-#ifdef CONFIG_SWIOTLB
-	struct dev_archdata *sd = &dev->archdata;
-
-	if (sd->max_direct_dma_addr && addr + size > sd->max_direct_dma_addr)
-		return false;
-#endif
-
 	if (!dev->dma_mask)
 		return false;
 
-	return addr + size - 1 <= *dev->dma_mask;
+	return addr + size - 1 <=
+		min_not_zero(*dev->dma_mask, dev->bus_dma_mask);
 }
 
 static inline dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr)
 {
-	return paddr + get_dma_offset(dev);
+	if (!dev)
+		return paddr + PCI_DRAM_OFFSET;
+	return paddr + dev->archdata.dma_offset;
 }
 
 static inline phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t daddr)
 {
-	return daddr - get_dma_offset(dev);
+	if (!dev)
+		return daddr - PCI_DRAM_OFFSET;
+	return daddr - dev->archdata.dma_offset;
 }
 #endif /* ASM_POWERPC_DMA_DIRECT_H */
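The rewritten helpers reduce powerpc's direct DMA translation to a fixed per-device offset, with PCI_DRAM_OFFSET as the NULL-device fallback. A standalone sketch of the invariant they provide (the offset and addresses here are made up for illustration):

#include <stdint.h>
#include <stdio.h>

#define PCI_DRAM_OFFSET 0x80000000ULL	/* illustrative fallback offset */

static uint64_t phys_to_dma(uint64_t dma_offset, uint64_t paddr)
{
	return paddr + dma_offset;	/* mirrors __phys_to_dma() */
}

static uint64_t dma_to_phys(uint64_t dma_offset, uint64_t daddr)
{
	return daddr - dma_offset;	/* mirrors __dma_to_phys() */
}

int main(void)
{
	uint64_t paddr = 0x1000, off = PCI_DRAM_OFFSET;

	/* The two translations are exact inverses for any fixed offset. */
	printf("round trip ok: %d\n",
	       dma_to_phys(off, phys_to_dma(off, paddr)) == paddr);
	return 0;
}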
diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h
index ebf66809f2d3..565d6f74b189 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -1,74 +1,9 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Copyright (C) 2004 IBM
- *
- * Implements the generic device dma API for powerpc.
- * the pci and vio busses
  */
 #ifndef _ASM_DMA_MAPPING_H
 #define _ASM_DMA_MAPPING_H
-#ifdef __KERNEL__
-
-#include <linux/types.h>
-#include <linux/cache.h>
-/* need struct page definitions */
-#include <linux/mm.h>
-#include <linux/scatterlist.h>
-#include <linux/dma-debug.h>
-#include <asm/io.h>
-#include <asm/swiotlb.h>
-
-/* Some dma direct funcs must be visible for use in other dma_ops */
-extern void *__dma_nommu_alloc_coherent(struct device *dev, size_t size,
-					dma_addr_t *dma_handle, gfp_t flag,
-					unsigned long attrs);
-extern void __dma_nommu_free_coherent(struct device *dev, size_t size,
-				      void *vaddr, dma_addr_t dma_handle,
-				      unsigned long attrs);
-extern int dma_nommu_mmap_coherent(struct device *dev,
-				   struct vm_area_struct *vma,
-				   void *cpu_addr, dma_addr_t handle,
-				   size_t size, unsigned long attrs);
-
-#ifdef CONFIG_NOT_COHERENT_CACHE
-/*
- * DMA-consistent mapping functions for PowerPCs that don't support
- * cache snooping. These allocate/free a region of uncached mapped
- * memory space for use with DMA devices. Alternatively, you could
- * allocate the space "normally" and use the cache management functions
- * to ensure it is consistent.
- */
-struct device;
-extern void __dma_sync(void *vaddr, size_t size, int direction);
-extern void __dma_sync_page(struct page *page, unsigned long offset,
-			    size_t size, int direction);
-extern unsigned long __dma_get_coherent_pfn(unsigned long cpu_addr);
-
-#else /* ! CONFIG_NOT_COHERENT_CACHE */
-/*
- * Cache coherent cores.
- */
-
-#define __dma_sync(addr, size, rw)		((void)0)
-#define __dma_sync_page(pg, off, sz, rw)	((void)0)
-
-#endif /* ! CONFIG_NOT_COHERENT_CACHE */
-
-static inline unsigned long device_to_mask(struct device *dev)
-{
-	if (dev->dma_mask && *dev->dma_mask)
-		return *dev->dma_mask;
-	/* Assume devices without mask can take 32 bit addresses */
-	return 0xfffffffful;
-}
-
-/*
- * Available generic sets of operations
- */
-#ifdef CONFIG_PPC64
-extern struct dma_map_ops dma_iommu_ops;
-#endif
-extern const struct dma_map_ops dma_nommu_ops;
 
 static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
@@ -80,31 +15,4 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 	return NULL;
 }
 
-/*
- * get_dma_offset()
- *
- * Get the dma offset on configurations where the dma address can be determined
- * from the physical address by looking at a simple offset. Direct dma and
- * swiotlb use this function, but it is typically not used by implementations
- * with an iommu.
- */
-static inline dma_addr_t get_dma_offset(struct device *dev)
-{
-	if (dev)
-		return dev->archdata.dma_offset;
-
-	return PCI_DRAM_OFFSET;
-}
-
-static inline void set_dma_offset(struct device *dev, dma_addr_t off)
-{
-	if (dev)
-		dev->archdata.dma_offset = off;
-}
-
-#define HAVE_ARCH_DMA_SET_MASK 1
-
-extern u64 __dma_get_required_mask(struct device *dev);
-
-#endif /* __KERNEL__ */
 #endif /* _ASM_DMA_MAPPING_H */
diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 8b596d096ebe..94cfcf33030a 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -219,7 +219,8 @@ struct eeh_ops {
 };
 
 extern int eeh_subsystem_flags;
-extern int eeh_max_freezes;
+extern u32 eeh_max_freezes;
+extern bool eeh_debugfs_no_recover;
 extern struct eeh_ops *eeh_ops;
 extern raw_spinlock_t confirm_error_lock;
 
@@ -293,14 +294,14 @@ void eeh_add_device_late(struct pci_dev *);
 void eeh_add_device_tree_late(struct pci_bus *);
 void eeh_add_sysfs_files(struct pci_bus *);
 void eeh_remove_device(struct pci_dev *);
-int eeh_unfreeze_pe(struct eeh_pe *pe, bool sw_state);
+int eeh_unfreeze_pe(struct eeh_pe *pe);
 int eeh_pe_reset_and_recover(struct eeh_pe *pe);
 int eeh_dev_open(struct pci_dev *pdev);
 void eeh_dev_release(struct pci_dev *pdev);
 struct eeh_pe *eeh_iommu_group_to_pe(struct iommu_group *group);
 int eeh_pe_set_option(struct eeh_pe *pe, int option);
 int eeh_pe_get_state(struct eeh_pe *pe);
-int eeh_pe_reset(struct eeh_pe *pe, int option);
+int eeh_pe_reset(struct eeh_pe *pe, int option, bool include_passed);
 int eeh_pe_configure(struct eeh_pe *pe);
 int eeh_pe_inject_err(struct eeh_pe *pe, int type, int func,
 		      unsigned long addr, unsigned long mask);
@@ -460,6 +461,9 @@ static inline void eeh_readsl(const volatile void __iomem *addr, void * buf,
 	eeh_check_failure(addr);
 }
 
+
+void eeh_cache_debugfs_init(void);
+
 #endif /* CONFIG_PPC64 */
 #endif /* __KERNEL__ */
 #endif /* _POWERPC_EEH_H */
diff --git a/arch/powerpc/include/asm/eeh_event.h b/arch/powerpc/include/asm/eeh_event.h
index 9884e872686f..6d0412b846ac 100644
--- a/arch/powerpc/include/asm/eeh_event.h
+++ b/arch/powerpc/include/asm/eeh_event.h
@@ -33,6 +33,7 @@ struct eeh_event {
 
 int eeh_event_init(void);
 int eeh_send_failure_event(struct eeh_pe *pe);
+int __eeh_send_failure_event(struct eeh_pe *pe);
 void eeh_remove_event(struct eeh_pe *pe, bool force);
 void eeh_handle_normal_event(struct eeh_pe *pe);
 void eeh_handle_special_event(void);
diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index 3b4767ed3ec5..937bb630093f 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -671,7 +671,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 
 #define RUNLATCH_ON				\
 BEGIN_FTR_SECTION				\
-	CURRENT_THREAD_INFO(r3, r1);		\
+	ld	r3, PACA_THREAD_INFO(r13);	\
 	ld	r4,TI_LOCAL_FLAGS(r3);		\
 	andi.	r0,r4,_TLF_RUNLATCH;		\
 	beql	ppc64_runlatch_on_trampoline;	\
@@ -721,7 +721,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CTRL)
 #ifdef CONFIG_PPC_970_NAP
 #define FINISH_NAP				\
 BEGIN_FTR_SECTION				\
-	CURRENT_THREAD_INFO(r11, r1);		\
+	ld	r11, PACA_THREAD_INFO(r13);	\
 	ld	r9,TI_LOCAL_FLAGS(r11);		\
 	andi.	r10,r9,_TLF_NAPPING;		\
 	bnel	power4_fixup_nap;		\
diff --git a/arch/powerpc/include/asm/hvsi.h b/arch/powerpc/include/asm/hvsi.h
index 3fdc54df63c9..464a7519ed64 100644
--- a/arch/powerpc/include/asm/hvsi.h
+++ b/arch/powerpc/include/asm/hvsi.h
@@ -64,7 +64,7 @@ struct hvsi_priv {
 	unsigned int	inbuf_len;	/* data in input buffer */
 	unsigned char	inbuf[HVSI_INBUF_SIZE];
 	unsigned int	inbuf_cur;	/* Cursor in input buffer */
-	unsigned int	inbuf_pktlen;	/* packet lenght from cursor */
+	unsigned int	inbuf_pktlen;	/* packet length from cursor */
 	atomic_t	seqno;		/* packet sequence number */
 	unsigned int	opened:1;	/* driver opened */
 	unsigned int	established:1;	/* protocol established */
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 17524d222a7b..0ac52392ed99 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -237,6 +237,7 @@ static inline void iommu_del_device(struct device *dev)
 }
 #endif /* !CONFIG_IOMMU_API */
 
+u64 dma_iommu_get_required_mask(struct device *dev);
 #else
 
 static inline void *get_iommu_table_base(struct device *dev)
@@ -318,5 +319,21 @@ extern void iommu_release_ownership(struct iommu_table *tbl);
 extern enum dma_data_direction iommu_tce_direction(unsigned long tce);
 extern unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir);
 
+#ifdef CONFIG_PPC_CELL_NATIVE
+extern bool iommu_fixed_is_weak;
+#else
+#define iommu_fixed_is_weak false
+#endif
+
+extern const struct dma_map_ops dma_iommu_ops;
+
+static inline unsigned long device_to_mask(struct device *dev)
+{
+	if (dev->dma_mask && *dev->dma_mask)
+		return *dev->dma_mask;
+	/* Assume devices without mask can take 32 bit addresses */
+	return 0xfffffffful;
+}
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_IOMMU_H */
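A detail worth noting in the device_to_mask() helper moved here: a device whose driver never set a DMA mask falls back to 0xfffffffful, i.e. it is assumed to address at most 2^32 bytes (4GB), the conservative historical default for 32-bit-capable PCI devices.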
diff --git a/arch/powerpc/include/asm/ipic.h b/arch/powerpc/include/asm/ipic.h
index 3dbd47f2bffe..abad50a745db 100644
--- a/arch/powerpc/include/asm/ipic.h
+++ b/arch/powerpc/include/asm/ipic.h
@@ -69,10 +69,7 @@ enum ipic_mcp_irq {
 	IPIC_MCP_MU    = 7,
 };
 
-extern void ipic_set_highest_priority(unsigned int irq);
 extern void ipic_set_default_priority(void);
-extern void ipic_enable_mcp(enum ipic_mcp_irq mcp_irq);
-extern void ipic_disable_mcp(enum ipic_mcp_irq mcp_irq);
 extern u32 ipic_get_mcp_status(void);
 extern void ipic_clear_mcp_status(u32 mask);
 
diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h
index ee39ce56b2a2..c91a60cda4fa 100644
--- a/arch/powerpc/include/asm/irq.h
+++ b/arch/powerpc/include/asm/irq.h
@@ -48,23 +48,19 @@ struct pt_regs;
  * Per-cpu stacks for handling critical, debug and machine check
  * level interrupts.
  */
-extern struct thread_info *critirq_ctx[NR_CPUS];
-extern struct thread_info *dbgirq_ctx[NR_CPUS];
-extern struct thread_info *mcheckirq_ctx[NR_CPUS];
-extern void exc_lvl_ctx_init(void);
-#else
-#define exc_lvl_ctx_init()
+extern void *critirq_ctx[NR_CPUS];
+extern void *dbgirq_ctx[NR_CPUS];
+extern void *mcheckirq_ctx[NR_CPUS];
 #endif
 
 /*
  * Per-cpu stacks for handling hard and soft interrupts.
  */
-extern struct thread_info *hardirq_ctx[NR_CPUS];
-extern struct thread_info *softirq_ctx[NR_CPUS];
+extern void *hardirq_ctx[NR_CPUS];
+extern void *softirq_ctx[NR_CPUS];
 
-extern void irq_ctx_init(void);
-extern void call_do_softirq(struct thread_info *tp);
-extern void call_do_irq(struct pt_regs *regs, struct thread_info *tp);
+void call_do_softirq(void *sp);
+void call_do_irq(struct pt_regs *regs, void *sp);
 extern void do_IRQ(struct pt_regs *regs);
 extern void __init init_IRQ(void);
 extern void __do_irq(struct pt_regs *regs);
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index eb0d79f0ca45..a6c8548ed9fa 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -141,6 +141,7 @@ extern void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu);
 
 extern int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu);
 extern int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu);
+extern void kvmppc_core_queue_machine_check(struct kvm_vcpu *vcpu, ulong flags);
 extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags);
 extern void kvmppc_core_queue_fpunavail(struct kvm_vcpu *vcpu);
 extern void kvmppc_core_queue_vec_unavail(struct kvm_vcpu *vcpu);
@@ -632,7 +633,7 @@ long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target,
 			      unsigned int yield_count);
 long kvmppc_h_random(struct kvm_vcpu *vcpu);
 void kvmhv_commence_exit(int trap);
-long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu);
+void kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu);
 void kvmppc_subcore_enter_guest(void);
 void kvmppc_subcore_exit_guest(void);
 long kvmppc_realmode_hmi_handler(void);
diff --git a/arch/powerpc/include/asm/livepatch.h b/arch/powerpc/include/asm/livepatch.h
index 47a03b9b528b..5070df19d463 100644
--- a/arch/powerpc/include/asm/livepatch.h
+++ b/arch/powerpc/include/asm/livepatch.h
@@ -21,6 +21,7 @@
 
 #include <linux/module.h>
 #include <linux/ftrace.h>
+#include <linux/sched/task_stack.h>
 
 #ifdef CONFIG_LIVEPATCH
 static inline int klp_check_compiler_support(void)
@@ -43,13 +44,13 @@ static inline unsigned long klp_get_ftrace_location(unsigned long faddr)
 	return ftrace_location_range(faddr, faddr + 16);
 }
 
-static inline void klp_init_thread_info(struct thread_info *ti)
+static inline void klp_init_thread_info(struct task_struct *p)
 {
 	/* + 1 to account for STACK_END_MAGIC */
-	ti->livepatch_sp = (unsigned long *)(ti + 1) + 1;
+	task_thread_info(p)->livepatch_sp = end_of_stack(p) + 1;
 }
 #else
-static void klp_init_thread_info(struct thread_info *ti) { }
+static inline void klp_init_thread_info(struct task_struct *p) { }
 #endif /* CONFIG_LIVEPATCH */
 
 #endif /* _ASM_POWERPC_LIVEPATCH_H */
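The new klp_init_thread_info() works because, with THREAD_INFO_IN_TASK, thread_info no longer sits at the base of the stack: end_of_stack(p) returns a pointer to the unsigned long at the lowest stack address, which holds the STACK_END_MAGIC canary, so the + 1 parks livepatch_sp on the first usable word above it, just as the old (ti + 1) + 1 skipped past thread_info plus the canary.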
diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index 8311869005fa..2f0ca6560e47 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -47,9 +47,7 @@ struct machdep_calls {
 #endif
 #endif /* CONFIG_PPC64 */
 
-	/* Platform set_dma_mask and dma_get_required_mask overrides */
-	int		(*dma_set_mask)(struct device *dev, u64 dma_mask);
-	u64		(*dma_get_required_mask)(struct device *dev);
+	void		(*dma_set_mask)(struct device *dev, u64 dma_mask);
 
 	int		(*probe)(void);
 	void		(*setup_arch)(void); /* Optional, may be NULL */
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index a8b8903e1844..17996bc9382b 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -209,7 +209,7 @@ extern int get_mce_event(struct machine_check_event *mce, bool release);
 extern void release_mce_event(void);
 extern void machine_check_queue_event(void);
 extern void machine_check_print_event_info(struct machine_check_event *evt,
-					   bool user_mode);
+					   bool user_mode, bool in_guest);
 #ifdef CONFIG_PPC_BOOK3S_64
 void flush_and_reload_slb(void);
 #endif /* CONFIG_PPC_BOOK3S_64 */
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index 25607604a7a5..d34ad1657d7b 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -289,6 +289,17 @@ static inline u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address)
 }
 #endif /* CONFIG_PPC_MEM_KEYS */
 
+#ifdef CONFIG_STRICT_KERNEL_RWX
+static inline bool strict_kernel_rwx_enabled(void)
+{
+	return rodata_enabled;
+}
+#else
+static inline bool strict_kernel_rwx_enabled(void)
+{
+	return false;
+}
+#endif
 #endif /* !__ASSEMBLY__ */
 
 /* The kernel use the constants below to index in the page sizes array.
@@ -356,6 +367,8 @@ extern void early_init_mmu_secondary(void);
 extern void setup_initial_memory_limit(phys_addr_t first_memblock_base,
 				       phys_addr_t first_memblock_size);
 static inline void mmu_early_init_devtree(void) { }
+
+extern void *abatron_pteptrs[2];
 #endif /* __ASSEMBLY__ */
 #endif
 
diff --git a/arch/powerpc/include/asm/nmi.h b/arch/powerpc/include/asm/nmi.h
index bd9ba8defd72..84b4cfe73edd 100644
--- a/arch/powerpc/include/asm/nmi.h
+++ b/arch/powerpc/include/asm/nmi.h
@@ -14,4 +14,6 @@ extern void arch_trigger_cpumask_backtrace(const cpumask_t *mask,
 #define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace
 #endif
 
+extern void hv_nmi_check_nonrecoverable(struct pt_regs *regs);
+
 #endif /* _ASM_NMI_H */
diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h
index b0f764c827c0..0a1a3fc54e54 100644
--- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h
@@ -231,9 +231,10 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
 }
 
 /* patch sites */
-extern s32 patch__itlbmiss_linmem_top;
+extern s32 patch__itlbmiss_linmem_top, patch__itlbmiss_linmem_top8;
 extern s32 patch__dtlbmiss_linmem_top, patch__dtlbmiss_immr_jmp;
 extern s32 patch__fixupdar_linmem_top;
+extern s32 patch__dtlbmiss_romem_top, patch__dtlbmiss_romem_top8;
 
 extern s32 patch__itlbmiss_exit_1, patch__itlbmiss_exit_2;
 extern s32 patch__dtlbmiss_exit_1, patch__dtlbmiss_exit_2, patch__dtlbmiss_exit_3;
diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index 5c5ea2413413..ed870468ef6f 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -20,20 +20,11 @@
 
 /*
  * On regular PPC32 page size is 4K (but we support 4K/16K/64K/256K pages
- * on PPC44x). For PPC64 we support either 4K or 64K software
+ * on PPC44x and 4K/16K on 8xx). For PPC64 we support either 4K or 64K software
  * page size. When using 64K pages however, whether we are really supporting
  * 64K pages in HW or not is irrelevant to those definitions.
  */
-#if defined(CONFIG_PPC_256K_PAGES)
-#define PAGE_SHIFT		18
-#elif defined(CONFIG_PPC_64K_PAGES)
-#define PAGE_SHIFT		16
-#elif defined(CONFIG_PPC_16K_PAGES)
-#define PAGE_SHIFT		14
-#else
-#define PAGE_SHIFT		12
-#endif
-
+#define PAGE_SHIFT		CONFIG_PPC_PAGE_SHIFT
 #define PAGE_SIZE		(ASM_CONST(1) << PAGE_SHIFT)
 
 #ifndef __ASSEMBLY__
@@ -326,7 +317,6 @@ struct page;
 extern void clear_user_page(void *page, unsigned long vaddr, struct page *pg);
 extern void copy_user_page(void *to, void *from, unsigned long vaddr,
 			   struct page *p);
-extern int page_is_ram(unsigned long pfn);
 extern int devmem_is_allowed(unsigned long pfn);
 
 #ifdef CONFIG_PPC_SMLPAR
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 77fc21278fa2..fc188e0e9179 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -20,6 +20,8 @@ struct device_node;
 struct pci_controller_ops {
 	void		(*dma_dev_setup)(struct pci_dev *pdev);
 	void		(*dma_bus_setup)(struct pci_bus *bus);
+	bool		(*iommu_bypass_supported)(struct pci_dev *pdev,
+				u64 mask);
 
 	int		(*probe_mode)(struct pci_bus *bus);
 
@@ -44,9 +46,6 @@ struct pci_controller_ops {
 	void		(*teardown_msi_irqs)(struct pci_dev *pdev);
 #endif
 
-	int		(*dma_set_mask)(struct pci_dev *pdev, u64 dma_mask);
-	u64		(*dma_get_required_mask)(struct pci_dev *pdev);
-
 	void		(*shutdown)(struct pci_controller *hose);
 };
 
@@ -275,6 +274,8 @@ extern int pcibios_map_io_space(struct pci_bus *bus);
 extern struct pci_controller *pci_find_hose_for_OF_device(
 			struct device_node* node);
 
+extern struct pci_controller *pci_find_controller_for_domain(int domain_nr);
+
 /* Fill up host controller resources from the OF node */
 extern void pci_process_bridge_OF_ranges(struct pci_controller *hose,
 			struct device_node *dev, int primary);
diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h
index 0c72f1897063..6a1861a6301e 100644
--- a/arch/powerpc/include/asm/pci.h
+++ b/arch/powerpc/include/asm/pci.h
@@ -52,10 +52,8 @@ static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
 
 #ifdef CONFIG_PCI
 extern void set_pci_dma_ops(const struct dma_map_ops *dma_ops);
-extern const struct dma_map_ops *get_pci_dma_ops(void);
 #else	/* CONFIG_PCI */
 #define set_pci_dma_ops(d)
-#define get_pci_dma_ops()	NULL
 #endif
 
 #ifdef CONFIG_PPC64
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index dad1d27e196d..505550fb2935 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -66,7 +66,6 @@ extern unsigned long empty_zero_page[];
 
 extern pgd_t swapper_pg_dir[];
 
-int dma_pfn_limit_to_zone(u64 pfn_limit);
 extern void paging_init(void);
 
 /*
diff --git a/arch/powerpc/include/asm/powernv.h b/arch/powerpc/include/asm/powernv.h
index 2f3ff7a27881..05b552418519 100644
--- a/arch/powerpc/include/asm/powernv.h
+++ b/arch/powerpc/include/asm/powernv.h
@@ -23,6 +23,8 @@ extern int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea,
 				unsigned long *flags, unsigned long *status,
 				int count);
 
+void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val);
+
 void pnv_tm_init(void);
 #else
 static inline void powernv_set_nmmu_ptcr(unsigned long ptcr) { }
@@ -40,7 +42,6 @@ static inline int pnv_npu2_handle_fault(struct npu_context *context,
 }
 
 static inline void pnv_tm_init(void) { }
-static inline void pnv_power9_force_smt4(void) { }
 #endif
 
 #endif /* _ASM_POWERNV_H */
diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index f9513ad38fa6..c5698a523bb1 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -326,6 +326,7 @@
 #define PPC_INST_ADDI			0x38000000
 #define PPC_INST_ADDIS			0x3c000000
 #define PPC_INST_ADD			0x7c000214
+#define PPC_INST_ADDC			0x7c000014
 #define PPC_INST_SUB			0x7c000050
 #define PPC_INST_BLR			0x4e800020
 #define PPC_INST_BLRL			0x4e800021
@@ -334,6 +335,9 @@
 #define PPC_INST_MULLW			0x7c0001d6
 #define PPC_INST_MULHWU			0x7c000016
 #define PPC_INST_MULLI			0x1c000000
+#define PPC_INST_MADDHD			0x10000030
+#define PPC_INST_MADDHDU		0x10000031
+#define PPC_INST_MADDLD			0x10000033
 #define PPC_INST_DIVWU			0x7c000396
 #define PPC_INST_DIVD			0x7c0003d2
 #define PPC_INST_RLWINM			0x54000000
@@ -377,6 +381,7 @@
 /* macros to insert fields into opcodes */
 #define ___PPC_RA(a)	(((a) & 0x1f) << 16)
 #define ___PPC_RB(b)	(((b) & 0x1f) << 11)
+#define ___PPC_RC(c)	(((c) & 0x1f) << 6)
 #define ___PPC_RS(s)	(((s) & 0x1f) << 21)
 #define ___PPC_RT(t)	___PPC_RS(t)
 #define ___PPC_R(r)	(((r) & 0x1) << 16)
@@ -396,7 +401,7 @@
 #define __PPC_WS(w)	(((w) & 0x1f) << 11)
 #define __PPC_SH(s)	__PPC_WS(s)
 #define __PPC_SH64(s)	(__PPC_SH(s) | (((s) & 0x20) >> 4))
-#define __PPC_MB(s)	(((s) & 0x1f) << 6)
+#define __PPC_MB(s)	___PPC_RC(s)
 #define __PPC_ME(s)	(((s) & 0x1f) << 1)
 #define __PPC_MB64(s)	(__PPC_MB(s) | ((s) & 0x20))
 #define __PPC_ME64(s)	__PPC_MB64(s)
@@ -438,6 +443,15 @@
438#define PPC_STQCX(t, a, b) stringify_in_c(.long PPC_INST_STQCX | \ 443#define PPC_STQCX(t, a, b) stringify_in_c(.long PPC_INST_STQCX | \
439 ___PPC_RT(t) | ___PPC_RA(a) | \ 444 ___PPC_RT(t) | ___PPC_RA(a) | \
440 ___PPC_RB(b)) 445 ___PPC_RB(b))
446#define PPC_MADDHD(t, a, b, c) stringify_in_c(.long PPC_INST_MADDHD | \
447 ___PPC_RT(t) | ___PPC_RA(a) | \
448 ___PPC_RB(b) | ___PPC_RC(c))
449#define PPC_MADDHDU(t, a, b, c) stringify_in_c(.long PPC_INST_MADDHDU | \
450 ___PPC_RT(t) | ___PPC_RA(a) | \
451 ___PPC_RB(b) | ___PPC_RC(c))
452#define PPC_MADDLD(t, a, b, c) stringify_in_c(.long PPC_INST_MADDLD | \
453 ___PPC_RT(t) | ___PPC_RA(a) | \
454 ___PPC_RB(b) | ___PPC_RC(c))
441#define PPC_MSGSND(b) stringify_in_c(.long PPC_INST_MSGSND | \ 455#define PPC_MSGSND(b) stringify_in_c(.long PPC_INST_MSGSND | \
442 ___PPC_RB(b)) 456 ___PPC_RB(b))
443#define PPC_MSGSYNC stringify_in_c(.long PPC_INST_MSGSYNC) 457#define PPC_MSGSYNC stringify_in_c(.long PPC_INST_MSGSYNC)
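The new PPC_MADDHD/PPC_MADDHDU/PPC_MADDLD wrappers build a complete 4-byte VA-form instruction word by OR-ing the four 5-bit register fields into the base opcode at bit positions 21, 16, 11 and 6. A minimal standalone sketch of that packing (the local macro names mirror the kernel's ___PPC_* helpers but are redefined here purely for illustration):

#include <stdint.h>
#include <stdio.h>

/* Illustrative copies of the kernel's field-packing helpers. */
#define RT(t) (((uint32_t)(t) & 0x1f) << 21)
#define RA(a) (((uint32_t)(a) & 0x1f) << 16)
#define RB(b) (((uint32_t)(b) & 0x1f) << 11)
#define RC(c) (((uint32_t)(c) & 0x1f) << 6)

int main(void)
{
	/* Encode "maddld r3, r4, r5, r6" starting from PPC_INST_MADDLD. */
	uint32_t insn = 0x10000033 | RT(3) | RA(4) | RB(5) | RC(6);

	printf("maddld r3,r4,r5,r6 = 0x%08x\n", insn); /* prints 0x106429b3 */
	return 0;
}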
diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h
index f67da277d652..f191ef0d2a0a 100644
--- a/arch/powerpc/include/asm/ppc-pci.h
+++ b/arch/powerpc/include/asm/ppc-pci.h
@@ -53,13 +53,13 @@ void eeh_addr_cache_rmv_dev(struct pci_dev *dev);
 struct eeh_dev *eeh_addr_cache_get_dev(unsigned long addr);
 void eeh_slot_error_detail(struct eeh_pe *pe, int severity);
 int eeh_pci_enable(struct eeh_pe *pe, int function);
-int eeh_pe_reset_full(struct eeh_pe *pe);
+int eeh_pe_reset_full(struct eeh_pe *pe, bool include_passed);
 void eeh_save_bars(struct eeh_dev *edev);
 int rtas_write_config(struct pci_dn *, int where, int size, u32 val);
 int rtas_read_config(struct pci_dn *, int where, int size, u32 *val);
 void eeh_pe_state_mark(struct eeh_pe *pe, int state);
 void eeh_pe_mark_isolated(struct eeh_pe *pe);
-void eeh_pe_state_clear(struct eeh_pe *pe, int state);
+void eeh_pe_state_clear(struct eeh_pe *pe, int state, bool include_passed);
 void eeh_pe_state_mark_with_cfg(struct eeh_pe *pe, int state);
 void eeh_pe_dev_mode_mark(struct eeh_pe *pe, int mode);
 
diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index ee58526cb6c2..3351bcf42f2d 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -40,7 +40,7 @@
 
 #ifndef __ASSEMBLY__
 #include <linux/types.h>
-#include <asm/thread_info.h>
+#include <linux/thread_info.h>
 #include <asm/ptrace.h>
 #include <asm/hw_breakpoint.h>
 
@@ -77,105 +77,15 @@ extern int _chrp_type;
 
 #ifdef __KERNEL__
 
-struct task_struct;
-void start_thread(struct pt_regs *regs, unsigned long fdptr, unsigned long sp);
-void release_thread(struct task_struct *);
-
-#ifdef CONFIG_PPC32
-
-#if CONFIG_TASK_SIZE > CONFIG_KERNEL_START
-#error User TASK_SIZE overlaps with KERNEL_START address
-#endif
-#define TASK_SIZE	(CONFIG_TASK_SIZE)
-
-/* This decides where the kernel will search for a free chunk of vm
- * space during mmap's.
- */
-#define TASK_UNMAPPED_BASE	(TASK_SIZE / 8 * 3)
-#endif
-
 #ifdef CONFIG_PPC64
-/*
- * 64-bit user address space can have multiple limits
- * For now supported values are:
- */
-#define TASK_SIZE_64TB  (0x0000400000000000UL)
-#define TASK_SIZE_128TB (0x0000800000000000UL)
-#define TASK_SIZE_512TB (0x0002000000000000UL)
-#define TASK_SIZE_1PB   (0x0004000000000000UL)
-#define TASK_SIZE_2PB   (0x0008000000000000UL)
-/*
- * With 52 bits in the address we can support
- * upto 4PB of range.
- */
-#define TASK_SIZE_4PB   (0x0010000000000000UL)
-
-/*
- * For now 512TB is only supported with book3s and 64K linux page size.
- */
-#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_PPC_64K_PAGES)
-/*
- * Max value currently used:
- */
-#define TASK_SIZE_USER64		TASK_SIZE_4PB
-#define DEFAULT_MAP_WINDOW_USER64	TASK_SIZE_128TB
-#define TASK_CONTEXT_SIZE		TASK_SIZE_512TB
+#include <asm/task_size_64.h>
 #else
-#define TASK_SIZE_USER64		TASK_SIZE_64TB
-#define DEFAULT_MAP_WINDOW_USER64	TASK_SIZE_64TB
-/*
- * We don't need to allocate extended context ids for 4K page size, because
- * we limit the max effective address on this config to 64TB.
- */
-#define TASK_CONTEXT_SIZE		TASK_SIZE_64TB
+#include <asm/task_size_32.h>
 #endif
 
-/*
- * 32-bit user address space is 4GB - 1 page
- * (this 1 page is needed so referencing of 0xFFFFFFFF generates EFAULT
- */
-#define TASK_SIZE_USER32 (0x0000000100000000UL - (1*PAGE_SIZE))
-
-#define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_32BIT) ? \
-		TASK_SIZE_USER32 : TASK_SIZE_USER64)
-#define TASK_SIZE	  TASK_SIZE_OF(current)
-/* This decides where the kernel will search for a free chunk of vm
- * space during mmap's.
- */
-#define TASK_UNMAPPED_BASE_USER32 (PAGE_ALIGN(TASK_SIZE_USER32 / 4))
-#define TASK_UNMAPPED_BASE_USER64 (PAGE_ALIGN(DEFAULT_MAP_WINDOW_USER64 / 4))
-
-#define TASK_UNMAPPED_BASE ((is_32bit_task()) ? \
-		TASK_UNMAPPED_BASE_USER32 : TASK_UNMAPPED_BASE_USER64 )
-#endif
-
-/*
- * Initial task size value for user applications. For book3s 64 we start
- * with 128TB and conditionally enable upto 512TB
- */
-#ifdef CONFIG_PPC_BOOK3S_64
-#define DEFAULT_MAP_WINDOW	((is_32bit_task()) ? \
-				 TASK_SIZE_USER32 : DEFAULT_MAP_WINDOW_USER64)
-#else
-#define DEFAULT_MAP_WINDOW	TASK_SIZE
-#endif
-
-#ifdef __powerpc64__
-
-#define STACK_TOP_USER64 DEFAULT_MAP_WINDOW_USER64
-#define STACK_TOP_USER32 TASK_SIZE_USER32
-
-#define STACK_TOP (is_32bit_task() ? \
-		   STACK_TOP_USER32 : STACK_TOP_USER64)
-
-#define STACK_TOP_MAX TASK_SIZE_USER64
-
-#else /* __powerpc64__ */
-
-#define STACK_TOP TASK_SIZE
-#define STACK_TOP_MAX	STACK_TOP
-
-#endif /* __powerpc64__ */
+struct task_struct;
+void start_thread(struct pt_regs *regs, unsigned long fdptr, unsigned long sp);
+void release_thread(struct task_struct *);
 
 typedef struct {
 	unsigned long seg;
@@ -250,6 +160,9 @@ struct thread_struct {
 #ifdef CONFIG_PPC32
 	void		*pgdir;		/* root of page-table tree */
 	unsigned long	ksp_limit;	/* if ksp <= ksp_limit stack overflow */
+#ifdef CONFIG_PPC_RTAS
+	unsigned long	rtas_sp;	/* stack pointer for when in RTAS */
+#endif
 #endif
 	/* Debug Registers */
 	struct debug_reg debug;
@@ -357,8 +270,7 @@ struct thread_struct {
 #define ARCH_MIN_TASKALIGN 16
 
 #define INIT_SP		(sizeof(init_stack) + (unsigned long) &init_stack)
-#define INIT_SP_LIMIT \
-	(_ALIGN_UP(sizeof(init_thread_info), 16) + (unsigned long) &init_stack)
+#define INIT_SP_LIMIT	((unsigned long)&init_stack)
 
 #ifdef CONFIG_SPE
 #define SPEFSCR_INIT \
diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h
index 0b8a735b6d85..64271e562fed 100644
--- a/arch/powerpc/include/asm/ptrace.h
+++ b/arch/powerpc/include/asm/ptrace.h
@@ -157,7 +157,7 @@ extern int ptrace_put_reg(struct task_struct *task, int regno,
 			   unsigned long data);
 
 #define current_pt_regs() \
-	((struct pt_regs *)((unsigned long)current_thread_info() + THREAD_SIZE) - 1)
+	((struct pt_regs *)((unsigned long)task_stack_page(current) + THREAD_SIZE) - 1)
 /*
  * We use the least-significant bit of the trap field to indicate
  * whether we have saved the full set of registers, or only a
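The reworked current_pt_regs() relies on a fixed layout: the saved user-register frame sits one struct pt_regs below the top of the task's kernel stack, and with THREAD_INFO_IN_TASK the stack base must now come from task_stack_page() rather than from thread_info. A hedged userspace illustration of that arithmetic (THREAD_SIZE and pt_regs here are stand-ins, not the kernel definitions):

#include <assert.h>
#include <stdlib.h>

#define THREAD_SIZE 16384			/* stand-in stack size */

struct pt_regs { unsigned long gpr[32]; };	/* stand-in register frame */

/* Same pointer arithmetic as the patched current_pt_regs(). */
static struct pt_regs *regs_of(void *stack_page)
{
	return (struct pt_regs *)((char *)stack_page + THREAD_SIZE) - 1;
}

int main(void)
{
	void *stack = malloc(THREAD_SIZE);
	struct pt_regs *regs = regs_of(stack);

	/* The frame ends exactly at the top of the stack area. */
	assert((char *)(regs + 1) == (char *)stack + THREAD_SIZE);
	free(stack);
	return 0;
}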
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 1c98ef1f2d5b..c5b2aff0ce8e 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -1062,7 +1062,7 @@
  *	- SPRG9 debug exception scratch
  *
  * All 32-bit:
- *	- SPRG3 current thread_info pointer
+ *	- SPRG3 current thread_struct physical addr pointer
  *        (virtual on BookE, physical on others)
  *
  * 32-bit classic:
@@ -1167,7 +1167,7 @@
 #ifdef CONFIG_PPC_BOOK3S_32
 #define SPRN_SPRG_SCRATCH0	SPRN_SPRG0
 #define SPRN_SPRG_SCRATCH1	SPRN_SPRG1
-#define SPRN_SPRG_RTAS		SPRN_SPRG2
+#define SPRN_SPRG_PGDIR		SPRN_SPRG2
 #define SPRN_SPRG_603_LRU	SPRN_SPRG4
 #endif
 
@@ -1425,6 +1425,11 @@ static inline void msr_check_and_clear(unsigned long bits)
 #define mfsrin(v)	({unsigned int rval; \
 			asm volatile("mfsrin %0,%1" : "=r" (rval) : "r" (v)); \
 			rval;})
+
+static inline void mtsrin(u32 val, u32 idx)
+{
+	asm volatile("mtsrin %0, %1" : : "r" (val), "r" (idx));
+}
 #endif
 
 #define proc_trap()	asm volatile("trap")
diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h
index e335a8f846af..4a1664a8658d 100644
--- a/arch/powerpc/include/asm/sections.h
+++ b/arch/powerpc/include/asm/sections.h
@@ -17,6 +17,13 @@ extern char __end_interrupts[];
 extern char __prom_init_toc_start[];
 extern char __prom_init_toc_end[];
 
+#ifdef CONFIG_PPC_POWERNV
+extern char start_real_trampolines[];
+extern char end_real_trampolines[];
+extern char start_virt_trampolines[];
+extern char end_virt_trampolines[];
+#endif
+
 static inline int in_kernel_text(unsigned long addr)
 {
 	if (addr >= (unsigned long)_stext && addr < (unsigned long)__init_end)
diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 41695745032c..0de717e16dd6 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -83,7 +83,22 @@ int is_cpu_dead(unsigned int cpu);
 /* 32-bit */
 extern int smp_hw_index[];
 
-#define raw_smp_processor_id()	(current_thread_info()->cpu)
+/*
+ * This is particularly ugly: it appears we can't actually get the definition
+ * of task_struct here, but we need access to the CPU this task is running on.
+ * Instead of using task_struct we're using _TASK_CPU which is extracted from
+ * asm-offsets.h by kbuild to get the current processor ID.
+ *
+ * This also needs to be safeguarded when building asm-offsets.s because at
+ * that time _TASK_CPU is not defined yet. It could have been guarded by
+ * _TASK_CPU itself, but we want the build to fail if _TASK_CPU is missing
+ * when building something else than asm-offsets.s
+ */
+#ifdef GENERATING_ASM_OFFSETS
+#define raw_smp_processor_id()		(0)
+#else
+#define raw_smp_processor_id()	(*(unsigned int *)((void *)current + _TASK_CPU))
+#endif
 #define hard_smp_processor_id() 	(smp_hw_index[smp_processor_id()])
 
 static inline int get_hard_smp_processor_id(int cpu)
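The comment added above describes reading task_struct::cpu through a kbuild-generated byte offset instead of through the struct definition, which cannot be included at this point. A small sketch of the same trick under stated assumptions (the struct and the _TASK_CPU value below are invented for the example; in the kernel the offset comes from asm-offsets.h):

#include <assert.h>
#include <stddef.h>

/* Stand-in for task_struct; only the cpu field matters here. */
struct task {
	long state;
	unsigned int cpu;
};

/* kbuild derives this constant from asm-offsets.c; hardcoded here. */
#define _TASK_CPU offsetof(struct task, cpu)

/* Same byte-offset dereference as the new raw_smp_processor_id(). */
static unsigned int task_cpu(const struct task *cur)
{
	return *(const unsigned int *)((const char *)cur + _TASK_CPU);
}

int main(void)
{
	struct task t = { .state = 0, .cpu = 3 };

	assert(task_cpu(&t) == 3);
	return 0;
}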
diff --git a/arch/powerpc/include/asm/swiotlb.h b/arch/powerpc/include/asm/swiotlb.h
index f65ecf57b66c..b7d082c0ec25 100644
--- a/arch/powerpc/include/asm/swiotlb.h
+++ b/arch/powerpc/include/asm/swiotlb.h
@@ -13,12 +13,7 @@
 
 #include <linux/swiotlb.h>
 
-extern const struct dma_map_ops powerpc_swiotlb_dma_ops;
-
 extern unsigned int ppc_swiotlb_enable;
-int __init swiotlb_setup_bus_notifier(void);
-
-extern void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev);
 
 #ifdef CONFIG_SWIOTLB
 void swiotlb_detect_4g(void);
diff --git a/arch/powerpc/include/asm/task_size_32.h b/arch/powerpc/include/asm/task_size_32.h
new file mode 100644
index 000000000000..de7290ee770f
--- /dev/null
+++ b/arch/powerpc/include/asm/task_size_32.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_POWERPC_TASK_SIZE_32_H
+#define _ASM_POWERPC_TASK_SIZE_32_H
+
+#if CONFIG_TASK_SIZE > CONFIG_KERNEL_START
+#error User TASK_SIZE overlaps with KERNEL_START address
+#endif
+
+#define TASK_SIZE (CONFIG_TASK_SIZE)
+
+/*
+ * This decides where the kernel will search for a free chunk of vm space during
+ * mmap's.
+ */
+#define TASK_UNMAPPED_BASE (TASK_SIZE / 8 * 3)
+
+#define DEFAULT_MAP_WINDOW TASK_SIZE
+#define STACK_TOP TASK_SIZE
+#define STACK_TOP_MAX STACK_TOP
+
+#endif /* _ASM_POWERPC_TASK_SIZE_32_H */
diff --git a/arch/powerpc/include/asm/task_size_64.h b/arch/powerpc/include/asm/task_size_64.h
new file mode 100644
index 000000000000..eab4779f6b84
--- /dev/null
+++ b/arch/powerpc/include/asm/task_size_64.h
@@ -0,0 +1,79 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_POWERPC_TASK_SIZE_64_H
+#define _ASM_POWERPC_TASK_SIZE_64_H
+
+/*
+ * 64-bit user address space can have multiple limits
+ * For now supported values are:
+ */
+#define TASK_SIZE_64TB (0x0000400000000000UL)
+#define TASK_SIZE_128TB (0x0000800000000000UL)
+#define TASK_SIZE_512TB (0x0002000000000000UL)
+#define TASK_SIZE_1PB (0x0004000000000000UL)
+#define TASK_SIZE_2PB (0x0008000000000000UL)
+
+/*
+ * With 52 bits in the address we can support up to 4PB of range.
+ */
+#define TASK_SIZE_4PB (0x0010000000000000UL)
+
+/*
+ * For now 512TB is only supported with book3s and 64K linux page size.
+ */
+#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_PPC_64K_PAGES)
+/*
+ * Max value currently used:
+ */
+#define TASK_SIZE_USER64 TASK_SIZE_4PB
+#define DEFAULT_MAP_WINDOW_USER64 TASK_SIZE_128TB
+#define TASK_CONTEXT_SIZE TASK_SIZE_512TB
+#else
+#define TASK_SIZE_USER64 TASK_SIZE_64TB
+#define DEFAULT_MAP_WINDOW_USER64 TASK_SIZE_64TB
+
+/*
+ * We don't need to allocate extended context ids for 4K page size, because we
+ * limit the max effective address on this config to 64TB.
+ */
+#define TASK_CONTEXT_SIZE TASK_SIZE_64TB
+#endif
+
+/*
+ * 32-bit user address space is 4GB - 1 page
+ * (this 1 page is needed so referencing of 0xFFFFFFFF generates EFAULT
+ */
+#define TASK_SIZE_USER32 (0x0000000100000000UL - (1 * PAGE_SIZE))
+
+#define TASK_SIZE_OF(tsk)						\
+	(test_tsk_thread_flag(tsk, TIF_32BIT) ? TASK_SIZE_USER32 :	\
+						TASK_SIZE_USER64)
+
+#define TASK_SIZE TASK_SIZE_OF(current)
+
+#define TASK_UNMAPPED_BASE_USER32 (PAGE_ALIGN(TASK_SIZE_USER32 / 4))
+#define TASK_UNMAPPED_BASE_USER64 (PAGE_ALIGN(DEFAULT_MAP_WINDOW_USER64 / 4))
+
+/*
+ * This decides where the kernel will search for a free chunk of vm space during
+ * mmap's.
+ */
+#define TASK_UNMAPPED_BASE	\
+	((is_32bit_task()) ? TASK_UNMAPPED_BASE_USER32 : TASK_UNMAPPED_BASE_USER64)
+
+/*
+ * Initial task size value for user applications. For book3s 64 we start
+ * with 128TB and conditionally enable upto 512TB
+ */
+#ifdef CONFIG_PPC_BOOK3S_64
+#define DEFAULT_MAP_WINDOW	\
+	((is_32bit_task()) ? TASK_SIZE_USER32 : DEFAULT_MAP_WINDOW_USER64)
+#else
+#define DEFAULT_MAP_WINDOW TASK_SIZE
+#endif
+
+#define STACK_TOP_USER64 DEFAULT_MAP_WINDOW_USER64
+#define STACK_TOP_USER32 TASK_SIZE_USER32
+#define STACK_TOP_MAX TASK_SIZE_USER64
+#define STACK_TOP (is_32bit_task() ? STACK_TOP_USER32 : STACK_TOP_USER64)
+
+#endif /* _ASM_POWERPC_TASK_SIZE_64_H */
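The limits in the new header are all powers of two: 64TB = 2^46, 128TB = 2^47, 512TB = 2^49, 1PB = 2^50, 2PB = 2^51, and the 52-bit maximum 4PB = 2^52. A compile-time check of that arithmetic (illustrative only, not part of the patch):

/* Each hex constant from task_size_64.h is n * 2^40 bytes. */
#define TB (1ULL << 40)

_Static_assert(64 * TB == 0x0000400000000000ULL, "64TB  == 2^46");
_Static_assert(128 * TB == 0x0000800000000000ULL, "128TB == 2^47");
_Static_assert(512 * TB == 0x0002000000000000ULL, "512TB == 2^49");
_Static_assert(4096 * TB == 0x0010000000000000ULL, "4PB   == 2^52");

int main(void) { return 0; }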
diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
index 544cac0474cb..8e1d0195ac36 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -17,12 +17,6 @@
 
 #define THREAD_SIZE		(1 << THREAD_SHIFT)
 
-#ifdef CONFIG_PPC64
-#define CURRENT_THREAD_INFO(dest, sp)	stringify_in_c(clrrdi dest, sp, THREAD_SHIFT)
-#else
-#define CURRENT_THREAD_INFO(dest, sp)	stringify_in_c(rlwinm dest, sp, 0, 0, 31-THREAD_SHIFT)
-#endif
-
 #ifndef __ASSEMBLY__
 #include <linux/cache.h>
 #include <asm/processor.h>
@@ -34,8 +28,6 @@
  * low level task data.
  */
 struct thread_info {
-	struct task_struct *task;		/* main task structure */
-	int		cpu;			/* cpu we're on */
 	int		preempt_count;		/* 0 => preemptable,
 						   <0 => BUG */
 	unsigned long	local_flags;		/* private flags for thread */
@@ -58,8 +50,6 @@ struct thread_info {
  */
 #define INIT_THREAD_INFO(tsk)			\
 {						\
-	.task =		&tsk,			\
-	.cpu =		0,			\
 	.preempt_count = INIT_PREEMPT_COUNT,	\
 	.flags =	0,			\
 }
@@ -67,15 +57,6 @@ struct thread_info {
 #define THREAD_SIZE_ORDER	(THREAD_SHIFT - PAGE_SHIFT)
 
 /* how to get the thread information struct from C */
-static inline struct thread_info *current_thread_info(void)
-{
-	unsigned long val;
-
-	asm (CURRENT_THREAD_INFO(%0,1) : "=r" (val));
-
-	return (struct thread_info *)val;
-}
-
 extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
 
 #ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index a4a718dbfec6..f85e2b01c3df 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -132,6 +132,8 @@ static inline void shared_proc_topology_init(void) {}
 #define topology_sibling_cpumask(cpu)	(per_cpu(cpu_sibling_map, cpu))
 #define topology_core_cpumask(cpu)	(per_cpu(cpu_core_map, cpu))
 #define topology_core_id(cpu)		(cpu_to_core_id(cpu))
+
+int dlpar_cpu_readd(int cpu);
 #endif
 #endif
 
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index cb7f0bb9ee71..cddadccf551d 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -36,7 +36,7 @@ obj-y := cputable.o ptrace.o syscalls.o \
 				   process.o systbl.o idle.o \
 				   signal.o sysfs.o cacheinfo.o time.o \
 				   prom.o traps.o setup-common.o \
-				   udbg.o misc.o io.o dma.o misc_$(BITS).o \
+				   udbg.o misc.o io.o misc_$(BITS).o \
 				   of_platform.o prom_parse.o
 obj-$(CONFIG_PPC64)		+= setup_64.o sys_ppc32.o \
 				   signal_64.o ptrace32.o \
@@ -105,6 +105,7 @@ obj-$(CONFIG_UPROBES)		+= uprobes.o
 obj-$(CONFIG_PPC_UDBG_16550)	+= legacy_serial.o udbg_16550.o
 obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
 obj-$(CONFIG_SWIOTLB)		+= dma-swiotlb.o
+obj-$(CONFIG_ARCH_HAS_DMA_SET_MASK) += dma-mask.o
 
 pci64-$(CONFIG_PPC64)		+= pci_dn.o pci-hotplug.o isa-bridge.o
 obj-$(CONFIG_PCI)		+= pci_$(BITS).o $(pci64-y) \
@@ -142,19 +143,29 @@ endif
 obj-$(CONFIG_EPAPR_PARAVIRT)	+= epapr_paravirt.o epapr_hcalls.o
 obj-$(CONFIG_KVM_GUEST)		+= kvm.o kvm_emul.o
 
-# Disable GCOV & sanitizers in odd or sensitive code
+# Disable GCOV, KCOV & sanitizers in odd or sensitive code
 GCOV_PROFILE_prom_init.o := n
+KCOV_INSTRUMENT_prom_init.o := n
 UBSAN_SANITIZE_prom_init.o := n
 GCOV_PROFILE_machine_kexec_64.o := n
+KCOV_INSTRUMENT_machine_kexec_64.o := n
 UBSAN_SANITIZE_machine_kexec_64.o := n
 GCOV_PROFILE_machine_kexec_32.o := n
+KCOV_INSTRUMENT_machine_kexec_32.o := n
 UBSAN_SANITIZE_machine_kexec_32.o := n
 GCOV_PROFILE_kprobes.o := n
+KCOV_INSTRUMENT_kprobes.o := n
 UBSAN_SANITIZE_kprobes.o := n
 GCOV_PROFILE_kprobes-ftrace.o := n
+KCOV_INSTRUMENT_kprobes-ftrace.o := n
 UBSAN_SANITIZE_kprobes-ftrace.o := n
 UBSAN_SANITIZE_vdso.o := n
 
+# Necessary for booting with kcov enabled on book3e machines
+KCOV_INSTRUMENT_cputable.o := n
+KCOV_INSTRUMENT_setup_64.o := n
+KCOV_INSTRUMENT_paca.o := n
+
 extra-$(CONFIG_PPC_FPU)		+= fpu.o
 extra-$(CONFIG_ALTIVEC)		+= vector.o
 extra-$(CONFIG_PPC64)		+= entry_64.o
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 9ffc72ded73a..86a61e5f8285 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -13,6 +13,8 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#define GENERATING_ASM_OFFSETS	/* asm/smp.h */
+
 #include <linux/compat.h>
 #include <linux/signal.h>
 #include <linux/sched.h>
@@ -90,10 +92,15 @@ int main(void)
 	DEFINE(SIGSEGV, SIGSEGV);
 	DEFINE(NMI_MASK, NMI_MASK);
 #else
-	OFFSET(THREAD_INFO, task_struct, stack);
-	DEFINE(THREAD_INFO_GAP, _ALIGN_UP(sizeof(struct thread_info), 16));
 	OFFSET(KSP_LIMIT, thread_struct, ksp_limit);
+#ifdef CONFIG_PPC_RTAS
+	OFFSET(RTAS_SP, thread_struct, rtas_sp);
+#endif
 #endif /* CONFIG_PPC64 */
+	OFFSET(TASK_STACK, task_struct, stack);
+#ifdef CONFIG_SMP
+	OFFSET(TASK_CPU, task_struct, cpu);
+#endif
 
 #ifdef CONFIG_LIVEPATCH
 	OFFSET(TI_livepatch_sp, thread_info, livepatch_sp);
@@ -161,8 +168,6 @@ int main(void)
 	OFFSET(TI_FLAGS, thread_info, flags);
 	OFFSET(TI_LOCAL_FLAGS, thread_info, local_flags);
 	OFFSET(TI_PREEMPT, thread_info, preempt_count);
-	OFFSET(TI_TASK, thread_info, task);
-	OFFSET(TI_CPU, thread_info, cpu);
 
 #ifdef CONFIG_PPC64
 	OFFSET(DCACHEL1BLOCKSIZE, ppc64_caches, l1d.block_size);
@@ -177,6 +182,8 @@ int main(void)
 	OFFSET(PACAPROCSTART, paca_struct, cpu_start);
 	OFFSET(PACAKSAVE, paca_struct, kstack);
 	OFFSET(PACACURRENT, paca_struct, __current);
+	DEFINE(PACA_THREAD_INFO, offsetof(struct paca_struct, __current) +
+				 offsetof(struct task_struct, thread_info));
 	OFFSET(PACASAVEDMSR, paca_struct, saved_msr);
 	OFFSET(PACAR1, paca_struct, saved_r1);
 	OFFSET(PACATOC, paca_struct, kernel_toc);
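asm-offsets.c exists to export C struct offsets, such as the new TASK_STACK and TASK_CPU above, to assembly: each OFFSET()/DEFINE() invocation emits a magic marker into the compiler-generated asm-offsets.s, which kbuild post-processes into #define lines in asm-offsets.h. A minimal sketch of the mechanism with invented names (compile with -S and inspect the output; the sed rewrite step is assumed, not shown):

#include <stddef.h>

struct task {				/* stand-in for task_struct */
	long state;
	void *stack;
	unsigned int cpu;
};

/* Emit "->SYM value expr" markers into the generated assembly; a build
 * script would later rewrite each marker as "#define SYM value". */
#define DEFINE(sym, val) \
	asm volatile("\n->" #sym " %0 " #val : : "i" (val))
#define OFFSET(sym, str, mem) DEFINE(sym, offsetof(struct str, mem))

int main(void)
{
	OFFSET(MY_TASK_STACK, task, stack);
	OFFSET(MY_TASK_CPU, task, cpu);
	return 0;
}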
diff --git a/arch/powerpc/kernel/cpu_setup_6xx.S b/arch/powerpc/kernel/cpu_setup_6xx.S
index 8c069e96c478..6f1c11e0691f 100644
--- a/arch/powerpc/kernel/cpu_setup_6xx.S
+++ b/arch/powerpc/kernel/cpu_setup_6xx.S
@@ -24,6 +24,10 @@ BEGIN_MMU_FTR_SECTION
 	li	r10,0
 	mtspr	SPRN_SPRG_603_LRU,r10	/* init SW LRU tracking */
 END_MMU_FTR_SECTION_IFSET(MMU_FTR_NEED_DTLB_SW_LRU)
+	lis	r10, (swapper_pg_dir - PAGE_OFFSET)@h
+	ori	r10, r10, (swapper_pg_dir - PAGE_OFFSET)@l
+	mtspr	SPRN_SPRG_PGDIR, r10
+
 BEGIN_FTR_SECTION
 	bl	__init_fpu_registers
 END_FTR_SECTION_IFCLR(CPU_FTR_FPU_UNAVAILABLE)
diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c
index 9c9bcaae2f75..09231ef06d01 100644
--- a/arch/powerpc/kernel/dma-iommu.c
+++ b/arch/powerpc/kernel/dma-iommu.c
@@ -6,12 +6,31 @@
  * busses using the iommu infrastructure
  */
 
+#include <linux/dma-direct.h>
+#include <linux/pci.h>
 #include <asm/iommu.h>
 
 /*
  * Generic iommu implementation
  */
 
+/*
+ * The coherent mask may be smaller than the real mask, check if we can
+ * really use a direct window.
+ */
+static inline bool dma_iommu_alloc_bypass(struct device *dev)
+{
+	return dev->archdata.iommu_bypass && !iommu_fixed_is_weak &&
+		dma_direct_supported(dev, dev->coherent_dma_mask);
+}
+
+static inline bool dma_iommu_map_bypass(struct device *dev,
+		unsigned long attrs)
+{
+	return dev->archdata.iommu_bypass &&
+		(!iommu_fixed_is_weak || (attrs & DMA_ATTR_WEAK_ORDERING));
+}
+
 /* Allocates a contiguous real buffer and creates mappings over it.
  * Returns the virtual address of the buffer and sets dma_handle
  * to the dma address (mapping) of the first page.
@@ -20,6 +39,8 @@ static void *dma_iommu_alloc_coherent(struct device *dev, size_t size,
 				      dma_addr_t *dma_handle, gfp_t flag,
 				      unsigned long attrs)
 {
+	if (dma_iommu_alloc_bypass(dev))
+		return dma_direct_alloc(dev, size, dma_handle, flag, attrs);
 	return iommu_alloc_coherent(dev, get_iommu_table_base(dev), size,
 				    dma_handle, dev->coherent_dma_mask, flag,
 				    dev_to_node(dev));
@@ -29,7 +50,11 @@ static void dma_iommu_free_coherent(struct device *dev, size_t size,
 				    void *vaddr, dma_addr_t dma_handle,
 				    unsigned long attrs)
 {
-	iommu_free_coherent(get_iommu_table_base(dev), size, vaddr, dma_handle);
+	if (dma_iommu_alloc_bypass(dev))
+		dma_direct_free(dev, size, vaddr, dma_handle, attrs);
+	else
+		iommu_free_coherent(get_iommu_table_base(dev), size, vaddr,
+				dma_handle);
 }
 
 /* Creates TCEs for a user provided buffer.  The user buffer must be
@@ -42,6 +67,9 @@ static dma_addr_t dma_iommu_map_page(struct device *dev, struct page *page,
 				     enum dma_data_direction direction,
 				     unsigned long attrs)
 {
+	if (dma_iommu_map_bypass(dev, attrs))
+		return dma_direct_map_page(dev, page, offset, size, direction,
+				attrs);
 	return iommu_map_page(dev, get_iommu_table_base(dev), page, offset,
 			      size, device_to_mask(dev), direction, attrs);
 }
@@ -51,8 +79,9 @@ static void dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle,
 				 size_t size, enum dma_data_direction direction,
 				 unsigned long attrs)
 {
-	iommu_unmap_page(get_iommu_table_base(dev), dma_handle, size, direction,
-			 attrs);
+	if (!dma_iommu_map_bypass(dev, attrs))
+		iommu_unmap_page(get_iommu_table_base(dev), dma_handle, size,
+				direction, attrs);
 }
 
 
@@ -60,6 +89,8 @@ static int dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
 			    int nelems, enum dma_data_direction direction,
 			    unsigned long attrs)
 {
+	if (dma_iommu_map_bypass(dev, attrs))
+		return dma_direct_map_sg(dev, sglist, nelems, direction, attrs);
 	return ppc_iommu_map_sg(dev, get_iommu_table_base(dev), sglist, nelems,
 				device_to_mask(dev), direction, attrs);
 }
@@ -68,10 +99,20 @@ static void dma_iommu_unmap_sg(struct device *dev, struct scatterlist *sglist,
 		int nelems, enum dma_data_direction direction,
 		unsigned long attrs)
 {
-	ppc_iommu_unmap_sg(get_iommu_table_base(dev), sglist, nelems,
+	if (!dma_iommu_map_bypass(dev, attrs))
+		ppc_iommu_unmap_sg(get_iommu_table_base(dev), sglist, nelems,
 			   direction, attrs);
 }
 
+static bool dma_iommu_bypass_supported(struct device *dev, u64 mask)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct pci_controller *phb = pci_bus_to_host(pdev->bus);
+
+	return phb->controller_ops.iommu_bypass_supported &&
+		phb->controller_ops.iommu_bypass_supported(pdev, mask);
+}
+
 /* We support DMA to/from any memory page via the iommu */
 int dma_iommu_dma_supported(struct device *dev, u64 mask)
 {
@@ -83,32 +124,48 @@ int dma_iommu_dma_supported(struct device *dev, u64 mask)
 		return 0;
 	}
 
+	if (dev_is_pci(dev) && dma_iommu_bypass_supported(dev, mask)) {
+		dev->archdata.iommu_bypass = true;
+		dev_dbg(dev, "iommu: 64-bit OK, using fixed ops\n");
+		return 1;
+	}
+
 	if (tbl->it_offset > (mask >> tbl->it_page_shift)) {
 		dev_info(dev, "Warning: IOMMU offset too big for device mask\n");
 		dev_info(dev, "mask: 0x%08llx, table offset: 0x%08lx\n",
 				mask, tbl->it_offset << tbl->it_page_shift);
 		return 0;
-	} else
-		return 1;
+	}
+
+	dev_dbg(dev, "iommu: not 64-bit, using default ops\n");
+	dev->archdata.iommu_bypass = false;
+	return 1;
 }
 
-static u64 dma_iommu_get_required_mask(struct device *dev)
+u64 dma_iommu_get_required_mask(struct device *dev)
 {
 	struct iommu_table *tbl = get_iommu_table_base(dev);
 	u64 mask;
+
 	if (!tbl)
 		return 0;
 
+	if (dev_is_pci(dev)) {
+		u64 bypass_mask = dma_direct_get_required_mask(dev);
+
+		if (dma_iommu_bypass_supported(dev, bypass_mask))
+			return bypass_mask;
+	}
+
 	mask = 1ULL < (fls_long(tbl->it_offset + tbl->it_size) - 1);
 	mask += mask - 1;
 
 	return mask;
 }
 
-struct dma_map_ops dma_iommu_ops = {
+const struct dma_map_ops dma_iommu_ops = {
 	.alloc			= dma_iommu_alloc_coherent,
 	.free			= dma_iommu_free_coherent,
-	.mmap			= dma_nommu_mmap_coherent,
 	.map_sg			= dma_iommu_map_sg,
 	.unmap_sg		= dma_iommu_unmap_sg,
 	.dma_supported		= dma_iommu_dma_supported,
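Every dma-iommu.c callback above now follows the same shape: try the 64-bit direct window first, and fall back to building a TCE mapping through the IOMMU table only when bypass is not possible. A toy model of that dispatch, with all names invented for illustration:

#include <stdbool.h>
#include <stdio.h>

struct toy_dev {
	bool iommu_bypass;	/* set once the direct window is known to fit */
};

static unsigned long map_direct(unsigned long phys)
{
	return phys;		/* bus address == physical address */
}

static unsigned long map_via_tce(unsigned long phys)
{
	/* Pretend the translated DMA window starts at 2 GiB. */
	return 0x80000000UL + (phys & 0xfffUL);
}

/* Mirrors the dma_iommu_map_bypass() check-then-dispatch pattern. */
static unsigned long toy_map(struct toy_dev *d, unsigned long phys)
{
	return d->iommu_bypass ? map_direct(phys) : map_via_tce(phys);
}

int main(void)
{
	struct toy_dev wide = { .iommu_bypass = true };
	struct toy_dev narrow = { .iommu_bypass = false };

	printf("bypass 0x%lx, translated 0x%lx\n",
	       toy_map(&wide, 0x1000), toy_map(&narrow, 0x1000));
	return 0;
}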
diff --git a/arch/powerpc/kernel/dma-mask.c b/arch/powerpc/kernel/dma-mask.c
new file mode 100644
index 000000000000..ffbbbc432612
--- /dev/null
+++ b/arch/powerpc/kernel/dma-mask.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/dma-mapping.h>
+#include <linux/export.h>
+#include <asm/machdep.h>
+
+void arch_dma_set_mask(struct device *dev, u64 dma_mask)
+{
+	if (ppc_md.dma_set_mask)
+		ppc_md.dma_set_mask(dev, dma_mask);
+}
+EXPORT_SYMBOL(arch_dma_set_mask);
diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c
index 7d5fc9751622..132d61c91629 100644
--- a/arch/powerpc/kernel/dma-swiotlb.c
+++ b/arch/powerpc/kernel/dma-swiotlb.c
@@ -10,101 +10,12 @@
  * option) any later version.
  *
  */
-
-#include <linux/dma-direct.h>
 #include <linux/memblock.h>
-#include <linux/pfn.h>
-#include <linux/of_platform.h>
-#include <linux/platform_device.h>
-#include <linux/pci.h>
-
 #include <asm/machdep.h>
 #include <asm/swiotlb.h>
-#include <asm/dma.h>
 
 unsigned int ppc_swiotlb_enable;
 
-static u64 swiotlb_powerpc_get_required(struct device *dev)
-{
-	u64 end, mask, max_direct_dma_addr = dev->archdata.max_direct_dma_addr;
-
-	end = memblock_end_of_DRAM();
-	if (max_direct_dma_addr && end > max_direct_dma_addr)
-		end = max_direct_dma_addr;
-	end += get_dma_offset(dev);
-
-	mask = 1ULL << (fls64(end) - 1);
-	mask += mask - 1;
-
-	return mask;
-}
-
-/*
- * At the moment, all platforms that use this code only require
- * swiotlb to be used if we're operating on HIGHMEM.  Since
- * we don't ever call anything other than map_sg, unmap_sg,
- * map_page, and unmap_page on highmem, use normal dma_ops
- * for everything else.
- */
-const struct dma_map_ops powerpc_swiotlb_dma_ops = {
-	.alloc = __dma_nommu_alloc_coherent,
-	.free = __dma_nommu_free_coherent,
-	.mmap = dma_nommu_mmap_coherent,
-	.map_sg = dma_direct_map_sg,
-	.unmap_sg = dma_direct_unmap_sg,
-	.dma_supported = swiotlb_dma_supported,
-	.map_page = dma_direct_map_page,
-	.unmap_page = dma_direct_unmap_page,
-	.sync_single_for_cpu = dma_direct_sync_single_for_cpu,
-	.sync_single_for_device = dma_direct_sync_single_for_device,
-	.sync_sg_for_cpu = dma_direct_sync_sg_for_cpu,
-	.sync_sg_for_device = dma_direct_sync_sg_for_device,
-	.get_required_mask = swiotlb_powerpc_get_required,
-};
-
-void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev)
-{
-	struct pci_controller *hose;
-	struct dev_archdata *sd;
-
-	hose = pci_bus_to_host(pdev->bus);
-	sd = &pdev->dev.archdata;
-	sd->max_direct_dma_addr =
-		hose->dma_window_base_cur + hose->dma_window_size;
-}
-
-static int ppc_swiotlb_bus_notify(struct notifier_block *nb,
-				  unsigned long action, void *data)
-{
-	struct device *dev = data;
-	struct dev_archdata *sd;
-
-	/* We are only intereted in device addition */
-	if (action != BUS_NOTIFY_ADD_DEVICE)
-		return 0;
-
-	sd = &dev->archdata;
-	sd->max_direct_dma_addr = 0;
-
-	/* May need to bounce if the device can't address all of DRAM */
-	if ((dma_get_mask(dev) + 1) < memblock_end_of_DRAM())
-		set_dma_ops(dev, &powerpc_swiotlb_dma_ops);
-
-	return NOTIFY_DONE;
-}
-
-static struct notifier_block ppc_swiotlb_plat_bus_notifier = {
-	.notifier_call = ppc_swiotlb_bus_notify,
-	.priority = 0,
-};
-
-int __init swiotlb_setup_bus_notifier(void)
-{
-	bus_register_notifier(&platform_bus_type,
-			      &ppc_swiotlb_plat_bus_notifier);
-	return 0;
-}
-
 void __init swiotlb_detect_4g(void)
 {
 	if ((memblock_end_of_DRAM() - 1) > 0xffffffff)
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
deleted file mode 100644
index b1903ebb2e9c..000000000000
--- a/arch/powerpc/kernel/dma.c
+++ /dev/null
@@ -1,362 +0,0 @@
-/*
- * Copyright (C) 2006 Benjamin Herrenschmidt, IBM Corporation
- *
- * Provide default implementations of the DMA mapping callbacks for
- * directly mapped busses.
- */
-
-#include <linux/device.h>
-#include <linux/dma-mapping.h>
-#include <linux/dma-debug.h>
-#include <linux/gfp.h>
-#include <linux/memblock.h>
-#include <linux/export.h>
-#include <linux/pci.h>
-#include <asm/vio.h>
-#include <asm/bug.h>
-#include <asm/machdep.h>
-#include <asm/swiotlb.h>
-#include <asm/iommu.h>
-
-/*
- * Generic direct DMA implementation
- *
- * This implementation supports a per-device offset that can be applied if
- * the address at which memory is visible to devices is not 0. Platform code
- * can set archdata.dma_data to an unsigned long holding the offset. By
- * default the offset is PCI_DRAM_OFFSET.
- */
-
-static u64 __maybe_unused get_pfn_limit(struct device *dev)
-{
-	u64 pfn = (dev->coherent_dma_mask >> PAGE_SHIFT) + 1;
-	struct dev_archdata __maybe_unused *sd = &dev->archdata;
-
-#ifdef CONFIG_SWIOTLB
-	if (sd->max_direct_dma_addr && dev->dma_ops == &powerpc_swiotlb_dma_ops)
-		pfn = min_t(u64, pfn, sd->max_direct_dma_addr >> PAGE_SHIFT);
-#endif
-
-	return pfn;
-}
-
-static int dma_nommu_dma_supported(struct device *dev, u64 mask)
-{
-#ifdef CONFIG_PPC64
-	u64 limit = get_dma_offset(dev) + (memblock_end_of_DRAM() - 1);
-
-	/* Limit fits in the mask, we are good */
-	if (mask >= limit)
-		return 1;
-
-#ifdef CONFIG_FSL_SOC
-	/*
-	 * Freescale gets another chance via ZONE_DMA, however
-	 * that will have to be refined if/when they support iommus
-	 */
-	return 1;
-#endif
-	/* Sorry ... */
-	return 0;
-#else
-	return 1;
-#endif
-}
-
-#ifndef CONFIG_NOT_COHERENT_CACHE
-void *__dma_nommu_alloc_coherent(struct device *dev, size_t size,
-				  dma_addr_t *dma_handle, gfp_t flag,
-				  unsigned long attrs)
-{
-	void *ret;
-	struct page *page;
-	int node = dev_to_node(dev);
-#ifdef CONFIG_FSL_SOC
-	u64 pfn = get_pfn_limit(dev);
-	int zone;
-
-	/*
-	 * This code should be OK on other platforms, but we have drivers that
-	 * don't set coherent_dma_mask. As a workaround we just ifdef it. This
-	 * whole routine needs some serious cleanup.
-	 */
-
-	zone = dma_pfn_limit_to_zone(pfn);
-	if (zone < 0) {
-		dev_err(dev, "%s: No suitable zone for pfn %#llx\n",
-			__func__, pfn);
-		return NULL;
-	}
-
-	switch (zone) {
-#ifdef CONFIG_ZONE_DMA
-	case ZONE_DMA:
-		flag |= GFP_DMA;
-		break;
-#endif
-	};
-#endif /* CONFIG_FSL_SOC */
-
-	page = alloc_pages_node(node, flag, get_order(size));
-	if (page == NULL)
-		return NULL;
-	ret = page_address(page);
-	memset(ret, 0, size);
-	*dma_handle = __pa(ret) + get_dma_offset(dev);
-
-	return ret;
-}
-
-void __dma_nommu_free_coherent(struct device *dev, size_t size,
-				void *vaddr, dma_addr_t dma_handle,
-				unsigned long attrs)
-{
-	free_pages((unsigned long)vaddr, get_order(size));
-}
-#endif /* !CONFIG_NOT_COHERENT_CACHE */
-
-static void *dma_nommu_alloc_coherent(struct device *dev, size_t size,
-				       dma_addr_t *dma_handle, gfp_t flag,
-				       unsigned long attrs)
-{
-	struct iommu_table *iommu;
-
-	/* The coherent mask may be smaller than the real mask, check if
-	 * we can really use the direct ops
-	 */
-	if (dma_nommu_dma_supported(dev, dev->coherent_dma_mask))
-		return __dma_nommu_alloc_coherent(dev, size, dma_handle,
-						   flag, attrs);
-
-	/* Ok we can't ... do we have an iommu ? If not, fail */
-	iommu = get_iommu_table_base(dev);
-	if (!iommu)
-		return NULL;
-
-	/* Try to use the iommu */
-	return iommu_alloc_coherent(dev, iommu, size, dma_handle,
-				    dev->coherent_dma_mask, flag,
-				    dev_to_node(dev));
-}
-
-static void dma_nommu_free_coherent(struct device *dev, size_t size,
-				     void *vaddr, dma_addr_t dma_handle,
-				     unsigned long attrs)
-{
-	struct iommu_table *iommu;
-
-	/* See comments in dma_nommu_alloc_coherent() */
-	if (dma_nommu_dma_supported(dev, dev->coherent_dma_mask))
-		return __dma_nommu_free_coherent(dev, size, vaddr, dma_handle,
-						  attrs);
-	/* Maybe we used an iommu ... */
-	iommu = get_iommu_table_base(dev);
-
-	/* If we hit that we should have never allocated in the first
-	 * place so how come we are freeing ?
-	 */
-	if (WARN_ON(!iommu))
-		return;
-	iommu_free_coherent(iommu, size, vaddr, dma_handle);
-}
-
-int dma_nommu_mmap_coherent(struct device *dev, struct vm_area_struct *vma,
-			     void *cpu_addr, dma_addr_t handle, size_t size,
-			     unsigned long attrs)
-{
-	unsigned long pfn;
-
-#ifdef CONFIG_NOT_COHERENT_CACHE
-	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-	pfn = __dma_get_coherent_pfn((unsigned long)cpu_addr);
-#else
-	pfn = page_to_pfn(virt_to_page(cpu_addr));
-#endif
-	return remap_pfn_range(vma, vma->vm_start,
-			       pfn + vma->vm_pgoff,
-			       vma->vm_end - vma->vm_start,
-			       vma->vm_page_prot);
-}
-
-static int dma_nommu_map_sg(struct device *dev, struct scatterlist *sgl,
-			     int nents, enum dma_data_direction direction,
-			     unsigned long attrs)
-{
-	struct scatterlist *sg;
-	int i;
-
-	for_each_sg(sgl, sg, nents, i) {
-		sg->dma_address = sg_phys(sg) + get_dma_offset(dev);
-		sg->dma_length = sg->length;
-
-		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
-			continue;
-
-		__dma_sync_page(sg_page(sg), sg->offset, sg->length, direction);
-	}
-
-	return nents;
-}
-
-static void dma_nommu_unmap_sg(struct device *dev, struct scatterlist *sgl,
-				int nents, enum dma_data_direction direction,
-				unsigned long attrs)
-{
-	struct scatterlist *sg;
-	int i;
-
-	for_each_sg(sgl, sg, nents, i)
-		__dma_sync_page(sg_page(sg), sg->offset, sg->length, direction);
-}
-
-static u64 dma_nommu_get_required_mask(struct device *dev)
-{
-	u64 end, mask;
-
-	end = memblock_end_of_DRAM() + get_dma_offset(dev);
-
-	mask = 1ULL << (fls64(end) - 1);
-	mask += mask - 1;
-
-	return mask;
-}
-
-static inline dma_addr_t dma_nommu_map_page(struct device *dev,
-					     struct page *page,
-					     unsigned long offset,
-					     size_t size,
-					     enum dma_data_direction dir,
-					     unsigned long attrs)
-{
-	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
-		__dma_sync_page(page, offset, size, dir);
-
-	return page_to_phys(page) + offset + get_dma_offset(dev);
-}
-
-static inline void dma_nommu_unmap_page(struct device *dev,
-					 dma_addr_t dma_address,
-					 size_t size,
-					 enum dma_data_direction direction,
-					 unsigned long attrs)
-{
-	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
-		__dma_sync(bus_to_virt(dma_address), size, direction);
-}
-
-#ifdef CONFIG_NOT_COHERENT_CACHE
-static inline void dma_nommu_sync_sg(struct device *dev,
-		struct scatterlist *sgl, int nents,
-		enum dma_data_direction direction)
-{
-	struct scatterlist *sg;
-	int i;
-
-	for_each_sg(sgl, sg, nents, i)
-		__dma_sync_page(sg_page(sg), sg->offset, sg->length, direction);
-}
-
-static inline void dma_nommu_sync_single(struct device *dev,
-					  dma_addr_t dma_handle, size_t size,
-					  enum dma_data_direction direction)
-{
-	__dma_sync(bus_to_virt(dma_handle), size, direction);
-}
-#endif
-
-const struct dma_map_ops dma_nommu_ops = {
-	.alloc				= dma_nommu_alloc_coherent,
-	.free				= dma_nommu_free_coherent,
-	.mmap				= dma_nommu_mmap_coherent,
-	.map_sg				= dma_nommu_map_sg,
-	.unmap_sg			= dma_nommu_unmap_sg,
-	.dma_supported			= dma_nommu_dma_supported,
-	.map_page			= dma_nommu_map_page,
-	.unmap_page			= dma_nommu_unmap_page,
-	.get_required_mask		= dma_nommu_get_required_mask,
-#ifdef CONFIG_NOT_COHERENT_CACHE
-	.sync_single_for_cpu		= dma_nommu_sync_single,
-	.sync_single_for_device		= dma_nommu_sync_single,
-	.sync_sg_for_cpu		= dma_nommu_sync_sg,
-	.sync_sg_for_device		= dma_nommu_sync_sg,
-#endif
-};
-EXPORT_SYMBOL(dma_nommu_ops);
-
-int dma_set_coherent_mask(struct device *dev, u64 mask)
-{
-	if (!dma_supported(dev, mask)) {
-		/*
-		 * We need to special case the direct DMA ops which can
-		 * support a fallback for coherent allocations. There
-		 * is no dma_op->set_coherent_mask() so we have to do
-		 * things the hard way:
-		 */
-		if (get_dma_ops(dev) != &dma_nommu_ops ||
-		    get_iommu_table_base(dev) == NULL ||
-		    !dma_iommu_dma_supported(dev, mask))
-			return -EIO;
-	}
-	dev->coherent_dma_mask = mask;
-	return 0;
-}
-EXPORT_SYMBOL(dma_set_coherent_mask);
-
-int dma_set_mask(struct device *dev, u64 dma_mask)
-{
-	if (ppc_md.dma_set_mask)
-		return ppc_md.dma_set_mask(dev, dma_mask);
-
-	if (dev_is_pci(dev)) {
-		struct pci_dev *pdev = to_pci_dev(dev);
-		struct pci_controller *phb = pci_bus_to_host(pdev->bus);
-		if (phb->controller_ops.dma_set_mask)
-			return phb->controller_ops.dma_set_mask(pdev, dma_mask);
-	}
-
-	if (!dev->dma_mask || !dma_supported(dev, dma_mask))
-		return -EIO;
-	*dev->dma_mask = dma_mask;
-	return 0;
-}
-EXPORT_SYMBOL(dma_set_mask);
-
-u64 __dma_get_required_mask(struct device *dev)
-{
-	const struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-	if (unlikely(dma_ops == NULL))
-		return 0;
-
-	if (dma_ops->get_required_mask)
-		return dma_ops->get_required_mask(dev);
-
-	return DMA_BIT_MASK(8 * sizeof(dma_addr_t));
-}
-
-u64 dma_get_required_mask(struct device *dev)
-{
-	if (ppc_md.dma_get_required_mask)
-		return ppc_md.dma_get_required_mask(dev);
-
-	if (dev_is_pci(dev)) {
-		struct pci_dev *pdev = to_pci_dev(dev);
-		struct pci_controller *phb = pci_bus_to_host(pdev->bus);
-		if (phb->controller_ops.dma_get_required_mask)
-			return phb->controller_ops.dma_get_required_mask(pdev);
-	}
-
-	return __dma_get_required_mask(dev);
-}
-EXPORT_SYMBOL_GPL(dma_get_required_mask);
-
-static int __init dma_init(void)
-{
-#ifdef CONFIG_IBMVIO
-	dma_debug_add_bus(&vio_bus_type);
-#endif
-
-	return 0;
-}
-fs_initcall(dma_init);
-
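The deleted dma_nommu_get_required_mask() computed the smallest all-ones mask that covers the highest DMA-visible address: take the most significant set bit of the end address, then extend it downward into a full bitmask. A worked re-derivation with a libc substitute for the kernel's fls64() (the end value is an assumed example):

#include <stdint.h>
#include <stdio.h>

static int fls64(uint64_t x)	/* find last (most significant) set bit */
{
	return x ? 64 - __builtin_clzll(x) : 0;
}

int main(void)
{
	uint64_t end = 0x180000000ULL;	/* assume 6 GiB of addressable RAM */
	uint64_t mask = 1ULL << (fls64(end) - 1);

	mask += mask - 1;
	printf("required mask = 0x%llx\n", (unsigned long long)mask);
	/* prints 0x1ffffffff: 33 address bits cover 6 GiB */
	return 0;
}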
diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c
index 8be3721d9302..e49bd5efcfe6 100644
--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -666,8 +666,10 @@ static bool __init cpufeatures_process_feature(struct dt_cpu_feature *f)
 		m = &dt_cpu_feature_match_table[i];
 		if (!strcmp(f->name, m->name)) {
 			known = true;
-			if (m->enable(f))
+			if (m->enable(f)) {
+				cur_cpu_spec->cpu_features |= m->cpu_ftr_bit_mask;
 				break;
+			}
 
 			pr_info("not enabling: %s (disabled or unsupported by kernel)\n",
 				f->name);
@@ -675,17 +677,12 @@ static bool __init cpufeatures_process_feature(struct dt_cpu_feature *f)
 		}
 	}
 
-	if (!known && enable_unknown) {
-		if (!feat_try_enable_unknown(f)) {
-			pr_info("not enabling: %s (unknown and unsupported by kernel)\n",
-				f->name);
-			return false;
-		}
+	if (!known && (!enable_unknown || !feat_try_enable_unknown(f))) {
+		pr_info("not enabling: %s (unknown and unsupported by kernel)\n",
+			f->name);
+		return false;
 	}
 
-	if (m->cpu_ftr_bit_mask)
-		cur_cpu_spec->cpu_features |= m->cpu_ftr_bit_mask;
-
 	if (known)
 		pr_debug("enabling: %s\n", f->name);
 	else
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index ae05203eb4de..289c0b37d845 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -109,7 +109,14 @@ EXPORT_SYMBOL(eeh_subsystem_flags);
109 * frozen count in last hour exceeds this limit, the PE will 109 * frozen count in last hour exceeds this limit, the PE will
110 * be forced to be offline permanently. 110 * be forced to be offline permanently.
111 */ 111 */
112int eeh_max_freezes = 5; 112u32 eeh_max_freezes = 5;
113
114/*
115 * Controls whether a recovery event should be scheduled when an
116 * isolated device is discovered. This is only really useful for
117 * debugging problems with the EEH core.
118 */
119bool eeh_debugfs_no_recover;
113 120
114/* Platform dependent EEH operations */ 121/* Platform dependent EEH operations */
115struct eeh_ops *eeh_ops = NULL; 122struct eeh_ops *eeh_ops = NULL;
@@ -823,15 +830,15 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat
823 switch (state) { 830 switch (state) {
824 case pcie_deassert_reset: 831 case pcie_deassert_reset:
825 eeh_ops->reset(pe, EEH_RESET_DEACTIVATE); 832 eeh_ops->reset(pe, EEH_RESET_DEACTIVATE);
826 eeh_unfreeze_pe(pe, false); 833 eeh_unfreeze_pe(pe);
827 if (!(pe->type & EEH_PE_VF)) 834 if (!(pe->type & EEH_PE_VF))
828 eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); 835 eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED, true);
829 eeh_pe_dev_traverse(pe, eeh_restore_dev_state, dev); 836 eeh_pe_dev_traverse(pe, eeh_restore_dev_state, dev);
830 eeh_pe_state_clear(pe, EEH_PE_ISOLATED); 837 eeh_pe_state_clear(pe, EEH_PE_ISOLATED, true);
831 break; 838 break;
832 case pcie_hot_reset: 839 case pcie_hot_reset:
833 eeh_pe_mark_isolated(pe); 840 eeh_pe_mark_isolated(pe);
834 eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); 841 eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED, true);
835 eeh_ops->set_option(pe, EEH_OPT_FREEZE_PE); 842 eeh_ops->set_option(pe, EEH_OPT_FREEZE_PE);
836 eeh_pe_dev_traverse(pe, eeh_disable_and_save_dev_state, dev); 843 eeh_pe_dev_traverse(pe, eeh_disable_and_save_dev_state, dev);
837 if (!(pe->type & EEH_PE_VF)) 844 if (!(pe->type & EEH_PE_VF))
@@ -840,7 +847,7 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat
840 break; 847 break;
841 case pcie_warm_reset: 848 case pcie_warm_reset:
842 eeh_pe_mark_isolated(pe); 849 eeh_pe_mark_isolated(pe);
843 eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); 850 eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED, true);
844 eeh_ops->set_option(pe, EEH_OPT_FREEZE_PE); 851 eeh_ops->set_option(pe, EEH_OPT_FREEZE_PE);
845 eeh_pe_dev_traverse(pe, eeh_disable_and_save_dev_state, dev); 852 eeh_pe_dev_traverse(pe, eeh_disable_and_save_dev_state, dev);
846 if (!(pe->type & EEH_PE_VF)) 853 if (!(pe->type & EEH_PE_VF))
@@ -848,7 +855,7 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat
848 eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL); 855 eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL);
849 break; 856 break;
850 default: 857 default:
851 eeh_pe_state_clear(pe, EEH_PE_ISOLATED | EEH_PE_CFG_BLOCKED); 858 eeh_pe_state_clear(pe, EEH_PE_ISOLATED | EEH_PE_CFG_BLOCKED, true);
852 return -EINVAL; 859 return -EINVAL;
853 }; 860 };
854 861
@@ -877,6 +884,24 @@ static void *eeh_set_dev_freset(struct eeh_dev *edev, void *flag)
877 return NULL; 884 return NULL;
878} 885}
879 886
887static void eeh_pe_refreeze_passed(struct eeh_pe *root)
888{
889 struct eeh_pe *pe;
890 int state;
891
892 eeh_for_each_pe(root, pe) {
893 if (eeh_pe_passed(pe)) {
894 state = eeh_ops->get_state(pe, NULL);
895 if (state &
896 (EEH_STATE_MMIO_ACTIVE | EEH_STATE_MMIO_ENABLED)) {
897 pr_info("EEH: Passed-through PE PHB#%x-PE#%x was thawed by reset, re-freezing for safety.\n",
898 pe->phb->global_number, pe->addr);
899 eeh_pe_set_option(pe, EEH_OPT_FREEZE_PE);
900 }
901 }
902 }
903}
904
880/** 905/**
881 * eeh_pe_reset_full - Complete a full reset process on the indicated PE 906 * eeh_pe_reset_full - Complete a full reset process on the indicated PE
882 * @pe: EEH PE 907 * @pe: EEH PE
@@ -889,12 +914,12 @@ static void *eeh_set_dev_freset(struct eeh_dev *edev, void *flag)
889 * 914 *
890 * This function will attempt to reset a PE three times before failing. 915 * This function will attempt to reset a PE three times before failing.
891 */ 916 */
892int eeh_pe_reset_full(struct eeh_pe *pe) 917int eeh_pe_reset_full(struct eeh_pe *pe, bool include_passed)
893{ 918{
894 int reset_state = (EEH_PE_RESET | EEH_PE_CFG_BLOCKED); 919 int reset_state = (EEH_PE_RESET | EEH_PE_CFG_BLOCKED);
895 int type = EEH_RESET_HOT; 920 int type = EEH_RESET_HOT;
896 unsigned int freset = 0; 921 unsigned int freset = 0;
897 int i, state, ret; 922 int i, state = 0, ret;
898 923
899 /* 924 /*
900 * Determine the type of reset to perform - hot or fundamental. 925 * Determine the type of reset to perform - hot or fundamental.
@@ -911,32 +936,42 @@ int eeh_pe_reset_full(struct eeh_pe *pe)
911 936
912 /* Make three attempts at resetting the bus */ 937 /* Make three attempts at resetting the bus */
913 for (i = 0; i < 3; i++) { 938 for (i = 0; i < 3; i++) {
914 ret = eeh_pe_reset(pe, type); 939 ret = eeh_pe_reset(pe, type, include_passed);
915 if (ret) 940 if (!ret)
916 break; 941 ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE,
917 942 include_passed);
918 ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE); 943 if (ret) {
919 if (ret) 944 ret = -EIO;
920 break; 945 pr_warn("EEH: Failure %d resetting PHB#%x-PE#%x (attempt %d)\n\n",
946 state, pe->phb->global_number, pe->addr, i + 1);
947 continue;
948 }
949 if (i)
950 pr_warn("EEH: PHB#%x-PE#%x: Successful reset (attempt %d)\n",
951 pe->phb->global_number, pe->addr, i + 1);
921 952
922 /* Wait until the PE is in a functioning state */ 953 /* Wait until the PE is in a functioning state */
923 state = eeh_wait_state(pe, PCI_BUS_RESET_WAIT_MSEC); 954 state = eeh_wait_state(pe, PCI_BUS_RESET_WAIT_MSEC);
924 if (state < 0) { 955 if (state < 0) {
925 pr_warn("%s: Unrecoverable slot failure on PHB#%x-PE#%x", 956 pr_warn("EEH: Unrecoverable slot failure on PHB#%x-PE#%x",
926 __func__, pe->phb->global_number, pe->addr); 957 pe->phb->global_number, pe->addr);
927 ret = -ENOTRECOVERABLE; 958 ret = -ENOTRECOVERABLE;
928 break; 959 break;
929 } 960 }
930 if (eeh_state_active(state)) 961 if (eeh_state_active(state))
931 break; 962 break;
932 963 else
933 /* Set error in case this is our last attempt */ 964 pr_warn("EEH: PHB#%x-PE#%x: Slot inactive after reset: 0x%x (attempt %d)\n",
934 ret = -EIO; 965 pe->phb->global_number, pe->addr, state, i + 1);
935 pr_warn("%s: Failure %d resetting PHB#%x-PE#%x\n (%d)\n",
936 __func__, state, pe->phb->global_number, pe->addr, (i + 1));
937 } 966 }
938 967
939 eeh_pe_state_clear(pe, reset_state); 968 /* Resetting the PE may have unfrozen child PEs. If those PEs have been
969 * (potentially) passed through to a guest, re-freeze them:
970 */
971 if (!include_passed)
972 eeh_pe_refreeze_passed(pe);
973
974 eeh_pe_state_clear(pe, reset_state, true);
940 return ret; 975 return ret;
941} 976}
942 977
@@ -1309,7 +1344,7 @@ void eeh_remove_device(struct pci_dev *dev)
1309 edev->mode &= ~EEH_DEV_SYSFS; 1344 edev->mode &= ~EEH_DEV_SYSFS;
1310} 1345}
1311 1346
1312int eeh_unfreeze_pe(struct eeh_pe *pe, bool sw_state) 1347int eeh_unfreeze_pe(struct eeh_pe *pe)
1313{ 1348{
1314 int ret; 1349 int ret;
1315 1350
@@ -1327,10 +1362,6 @@ int eeh_unfreeze_pe(struct eeh_pe *pe, bool sw_state)
1327 return ret; 1362 return ret;
1328 } 1363 }
1329 1364
1330 /* Clear software isolated state */
1331 if (sw_state && (pe->state & EEH_PE_ISOLATED))
1332 eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
1333
1334 return ret; 1365 return ret;
1335} 1366}
1336 1367
@@ -1382,7 +1413,10 @@ static int eeh_pe_change_owner(struct eeh_pe *pe)
1382 } 1413 }
1383 } 1414 }
1384 1415
1385 return eeh_unfreeze_pe(pe, true); 1416 ret = eeh_unfreeze_pe(pe);
1417 if (!ret)
1418 eeh_pe_state_clear(pe, EEH_PE_ISOLATED, true);
1419 return ret;
1386} 1420}
1387 1421
1388/** 1422/**
@@ -1612,13 +1646,12 @@ int eeh_pe_get_state(struct eeh_pe *pe)
1612} 1646}
1613EXPORT_SYMBOL_GPL(eeh_pe_get_state); 1647EXPORT_SYMBOL_GPL(eeh_pe_get_state);
1614 1648
1615static int eeh_pe_reenable_devices(struct eeh_pe *pe) 1649static int eeh_pe_reenable_devices(struct eeh_pe *pe, bool include_passed)
1616{ 1650{
1617 struct eeh_dev *edev, *tmp; 1651 struct eeh_dev *edev, *tmp;
1618 struct pci_dev *pdev; 1652 struct pci_dev *pdev;
1619 int ret = 0; 1653 int ret = 0;
1620 1654
1621 /* Restore config space */
1622 eeh_pe_restore_bars(pe); 1655 eeh_pe_restore_bars(pe);
1623 1656
1624 /* 1657 /*
@@ -1639,7 +1672,14 @@ static int eeh_pe_reenable_devices(struct eeh_pe *pe)
1639 } 1672 }
1640 1673
1641 /* The PE is still in frozen state */ 1674 /* The PE is still in frozen state */
1642 return eeh_unfreeze_pe(pe, true); 1675 if (include_passed || !eeh_pe_passed(pe)) {
1676 ret = eeh_unfreeze_pe(pe);
1677 } else
1678 pr_info("EEH: Note: Leaving passthrough PHB#%x-PE#%x frozen.\n",
1679 pe->phb->global_number, pe->addr);
1680 if (!ret)
1681 eeh_pe_state_clear(pe, EEH_PE_ISOLATED, include_passed);
1682 return ret;
1643} 1683}
1644 1684
1645 1685
@@ -1652,7 +1692,7 @@ static int eeh_pe_reenable_devices(struct eeh_pe *pe)
1652 * indicated type, either fundamental reset or hot reset. 1692 * indicated type, either fundamental reset or hot reset.
1653 * PE reset is the most important part for error recovery. 1693 * PE reset is the most important part for error recovery.
1654 */ 1694 */
1655int eeh_pe_reset(struct eeh_pe *pe, int option) 1695int eeh_pe_reset(struct eeh_pe *pe, int option, bool include_passed)
1656{ 1696{
1657 int ret = 0; 1697 int ret = 0;
1658 1698
@@ -1666,11 +1706,11 @@ int eeh_pe_reset(struct eeh_pe *pe, int option)
1666 switch (option) { 1706 switch (option) {
1667 case EEH_RESET_DEACTIVATE: 1707 case EEH_RESET_DEACTIVATE:
1668 ret = eeh_ops->reset(pe, option); 1708 ret = eeh_ops->reset(pe, option);
1669 eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); 1709 eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED, include_passed);
1670 if (ret) 1710 if (ret)
1671 break; 1711 break;
1672 1712
1673 ret = eeh_pe_reenable_devices(pe); 1713 ret = eeh_pe_reenable_devices(pe, include_passed);
1674 break; 1714 break;
1675 case EEH_RESET_HOT: 1715 case EEH_RESET_HOT:
1676 case EEH_RESET_FUNDAMENTAL: 1716 case EEH_RESET_FUNDAMENTAL:
@@ -1796,22 +1836,64 @@ static int eeh_enable_dbgfs_get(void *data, u64 *val)
1796 return 0; 1836 return 0;
1797} 1837}
1798 1838
1799static int eeh_freeze_dbgfs_set(void *data, u64 val) 1839DEFINE_DEBUGFS_ATTRIBUTE(eeh_enable_dbgfs_ops, eeh_enable_dbgfs_get,
1800{ 1840 eeh_enable_dbgfs_set, "0x%llx\n");
1801 eeh_max_freezes = val;
1802 return 0;
1803}
1804 1841
1805static int eeh_freeze_dbgfs_get(void *data, u64 *val) 1842static ssize_t eeh_force_recover_write(struct file *filp,
1843 const char __user *user_buf,
1844 size_t count, loff_t *ppos)
1806{ 1845{
1807 *val = eeh_max_freezes; 1846 struct pci_controller *hose;
1808 return 0; 1847 uint32_t phbid, pe_no;
1848 struct eeh_pe *pe;
1849 char buf[20];
1850 int ret;
1851
1852 ret = simple_write_to_buffer(buf, sizeof(buf), ppos, user_buf, count);
1853 if (!ret)
1854 return -EFAULT;
1855
1856 /*
1857 * When PE is NULL the event is a "special" event. Rather than
1858 * recovering a specific PE it forces the EEH core to scan for failed
1859 * PHBs and recovers each. This needs to be done before any device
1860 * recoveries can occur.
1861 */
1862 if (!strncmp(buf, "hwcheck", 7)) {
1863 __eeh_send_failure_event(NULL);
1864 return count;
1865 }
1866
1867 ret = sscanf(buf, "%x:%x", &phbid, &pe_no);
1868 if (ret != 2)
1869 return -EINVAL;
1870
1871 hose = pci_find_controller_for_domain(phbid);
1872 if (!hose)
1873 return -ENODEV;
1874
1875 /* Retrieve PE */
1876 pe = eeh_pe_get(hose, pe_no, 0);
1877 if (!pe)
1878 return -ENODEV;
1879
1880 /*
1881 * We don't do any state checking here since the detection
1882 * process is async to the recovery process. The recovery
1883 * thread *should* not break even if we schedule a recovery
1884 * from an odd state (e.g. PE removed, or recovery of a
1885 * non-isolated PE)
1886 */
1887 __eeh_send_failure_event(pe);
1888
1889 return ret < 0 ? ret : count;
1809} 1890}
1810 1891
1811DEFINE_DEBUGFS_ATTRIBUTE(eeh_enable_dbgfs_ops, eeh_enable_dbgfs_get, 1892static const struct file_operations eeh_force_recover_fops = {
1812 eeh_enable_dbgfs_set, "0x%llx\n"); 1893 .open = simple_open,
1813DEFINE_DEBUGFS_ATTRIBUTE(eeh_freeze_dbgfs_ops, eeh_freeze_dbgfs_get, 1894 .llseek = no_llseek,
1814 eeh_freeze_dbgfs_set, "0x%llx\n"); 1895 .write = eeh_force_recover_write,
1896};
1815#endif 1897#endif
1816 1898
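eeh_force_recover_write() above follows the standard debugfs write-handler pattern: copy the user buffer in with simple_write_to_buffer(), then parse it with sscanf(). A self-contained sketch of that pattern; handle_request() is a hypothetical stand-in, and note that the sketch zero-fills the buffer and reserves a byte so the parse always sees a NUL-terminated string:

#include <linux/fs.h>
#include <linux/uaccess.h>

static ssize_t demo_write(struct file *filp, const char __user *ubuf,
			  size_t count, loff_t *ppos)
{
	unsigned int phb, pe;
	char buf[20] = { 0 };
	ssize_t ret;

	/* Copy at most sizeof(buf) - 1 bytes so the string stays
	 * NUL-terminated for the parse below. */
	ret = simple_write_to_buffer(buf, sizeof(buf) - 1, ppos, ubuf, count);
	if (ret <= 0)
		return -EFAULT;

	if (sscanf(buf, "%x:%x", &phb, &pe) != 2)
		return -EINVAL;

	handle_request(phb, pe);	/* hypothetical */
	return count;
}

Assuming powerpc_debugfs_root is the usual /sys/kernel/debug/powerpc, the new file would be driven from userspace with, e.g., "echo 0:0 > /sys/kernel/debug/powerpc/eeh_force_recover" for a specific PHB:PE pair, or "echo hwcheck" into the same file for the PHB-scan case.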
1817static int __init eeh_init_proc(void) 1899static int __init eeh_init_proc(void)
@@ -1822,9 +1904,15 @@ static int __init eeh_init_proc(void)
1822 debugfs_create_file_unsafe("eeh_enable", 0600, 1904 debugfs_create_file_unsafe("eeh_enable", 0600,
1823 powerpc_debugfs_root, NULL, 1905 powerpc_debugfs_root, NULL,
1824 &eeh_enable_dbgfs_ops); 1906 &eeh_enable_dbgfs_ops);
1825 debugfs_create_file_unsafe("eeh_max_freezes", 0600, 1907 debugfs_create_u32("eeh_max_freezes", 0600,
1826 powerpc_debugfs_root, NULL, 1908 powerpc_debugfs_root, &eeh_max_freezes);
1827 &eeh_freeze_dbgfs_ops); 1909 debugfs_create_bool("eeh_disable_recovery", 0600,
1910 powerpc_debugfs_root,
1911 &eeh_debugfs_no_recover);
1912 debugfs_create_file_unsafe("eeh_force_recover", 0600,
1913 powerpc_debugfs_root, NULL,
1914 &eeh_force_recover_fops);
1915 eeh_cache_debugfs_init();
1828#endif 1916#endif
1829 } 1917 }
1830 1918
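The registration hunk above also shows why the eeh_freeze_dbgfs_* get/set pair could be deleted: for a plain integer or boolean tunable, debugfs provides direct helpers that need no file_operations boilerplate at all. A sketch, assuming module-local variables:

#include <linux/debugfs.h>

static u32 demo_max_freezes = 5;
static bool demo_no_recover;

static void demo_debugfs_init(struct dentry *root)
{
	/* Each helper binds the file directly to the variable; reads
	 * and writes go through generic debugfs get/set code. */
	debugfs_create_u32("max_freezes", 0600, root, &demo_max_freezes);
	debugfs_create_bool("no_recover", 0600, root, &demo_no_recover);
}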
diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c
index 201943d54a6e..9c68f0837385 100644
--- a/arch/powerpc/kernel/eeh_cache.c
+++ b/arch/powerpc/kernel/eeh_cache.c
@@ -26,6 +26,7 @@
26#include <linux/spinlock.h> 26#include <linux/spinlock.h>
27#include <linux/atomic.h> 27#include <linux/atomic.h>
28#include <asm/pci-bridge.h> 28#include <asm/pci-bridge.h>
29#include <asm/debugfs.h>
29#include <asm/ppc-pci.h> 30#include <asm/ppc-pci.h>
30 31
31 32
@@ -113,7 +114,7 @@ static void eeh_addr_cache_print(struct pci_io_addr_cache *cache)
113 while (n) { 114 while (n) {
114 struct pci_io_addr_range *piar; 115 struct pci_io_addr_range *piar;
115 piar = rb_entry(n, struct pci_io_addr_range, rb_node); 116 piar = rb_entry(n, struct pci_io_addr_range, rb_node);
116 pr_debug("PCI: %s addr range %d [%pap-%pap]: %s\n", 117 pr_info("PCI: %s addr range %d [%pap-%pap]: %s\n",
117 (piar->flags & IORESOURCE_IO) ? "i/o" : "mem", cnt, 118 (piar->flags & IORESOURCE_IO) ? "i/o" : "mem", cnt,
118 &piar->addr_lo, &piar->addr_hi, pci_name(piar->pcidev)); 119 &piar->addr_lo, &piar->addr_hi, pci_name(piar->pcidev));
119 cnt++; 120 cnt++;
@@ -157,10 +158,8 @@ eeh_addr_cache_insert(struct pci_dev *dev, resource_size_t alo,
157 piar->pcidev = dev; 158 piar->pcidev = dev;
158 piar->flags = flags; 159 piar->flags = flags;
159 160
160#ifdef DEBUG
161 pr_debug("PIAR: insert range=[%pap:%pap] dev=%s\n", 161 pr_debug("PIAR: insert range=[%pap:%pap] dev=%s\n",
162 &alo, &ahi, pci_name(dev)); 162 &alo, &ahi, pci_name(dev));
163#endif
164 163
165 rb_link_node(&piar->rb_node, parent, p); 164 rb_link_node(&piar->rb_node, parent, p);
166 rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root); 165 rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root);
@@ -240,6 +239,8 @@ restart:
240 piar = rb_entry(n, struct pci_io_addr_range, rb_node); 239 piar = rb_entry(n, struct pci_io_addr_range, rb_node);
241 240
242 if (piar->pcidev == dev) { 241 if (piar->pcidev == dev) {
242 pr_debug("PIAR: remove range=[%pap:%pap] dev=%s\n",
243 &piar->addr_lo, &piar->addr_hi, pci_name(dev));
243 rb_erase(n, &pci_io_addr_cache_root.rb_root); 244 rb_erase(n, &pci_io_addr_cache_root.rb_root);
244 kfree(piar); 245 kfree(piar);
245 goto restart; 246 goto restart;
@@ -298,9 +299,30 @@ void eeh_addr_cache_build(void)
298 eeh_addr_cache_insert_dev(dev); 299 eeh_addr_cache_insert_dev(dev);
299 eeh_sysfs_add_device(dev); 300 eeh_sysfs_add_device(dev);
300 } 301 }
302}
301 303
302#ifdef DEBUG 304static int eeh_addr_cache_show(struct seq_file *s, void *v)
303 /* Verify tree built up above, echo back the list of addrs. */ 305{
304 eeh_addr_cache_print(&pci_io_addr_cache_root); 306 struct pci_io_addr_range *piar;
305#endif 307 struct rb_node *n;
308
309 spin_lock(&pci_io_addr_cache_root.piar_lock);
310 for (n = rb_first(&pci_io_addr_cache_root.rb_root); n; n = rb_next(n)) {
311 piar = rb_entry(n, struct pci_io_addr_range, rb_node);
312
313 seq_printf(s, "%s addr range [%pap-%pap]: %s\n",
314 (piar->flags & IORESOURCE_IO) ? "i/o" : "mem",
315 &piar->addr_lo, &piar->addr_hi, pci_name(piar->pcidev));
316 }
317 spin_unlock(&pci_io_addr_cache_root.piar_lock);
318
319 return 0;
320}
321DEFINE_SHOW_ATTRIBUTE(eeh_addr_cache);
322
323void eeh_cache_debugfs_init(void)
324{
325 debugfs_create_file_unsafe("eeh_address_cache", 0400,
326 powerpc_debugfs_root, NULL,
327 &eeh_addr_cache_fops);
306} 328}
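eeh_addr_cache_show() above relies on DEFINE_SHOW_ATTRIBUTE(), which generates the open routine and file_operations for a read-only seq_file from a single foo_show() function. A minimal self-contained sketch of the same pattern:

#include <linux/debugfs.h>
#include <linux/seq_file.h>

static int demo_show(struct seq_file *s, void *unused)
{
	seq_puts(s, "one line of state\n");
	return 0;
}
/* Generates demo_open() and demo_fops via single_open(). */
DEFINE_SHOW_ATTRIBUTE(demo);

static void demo_init(struct dentry *root)
{
	debugfs_create_file("demo", 0400, root, NULL, &demo_fops);
}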
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 99eab7bc7edc..89623962c727 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -510,22 +510,11 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata)
510 * support EEH. So we just care about PCI devices for 510 * support EEH. So we just care about PCI devices for
511 * simplicity here. 511 * simplicity here.
512 */ 512 */
513 if (!dev || (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) 513 if (!eeh_edev_actionable(edev) ||
514 return NULL; 514 (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE))
515
516 /*
517 * We rely on count-based pcibios_release_device() to
518 * detach permanently offlined PEs. Unfortunately, that's
519 * not reliable enough. We might have the permanently
520 * offlined PEs attached, but we needn't take care of
521 * them and their child devices.
522 */
523 if (eeh_dev_removed(edev))
524 return NULL; 515 return NULL;
525 516
526 if (rmv_data) { 517 if (rmv_data) {
527 if (eeh_pe_passed(edev->pe))
528 return NULL;
529 driver = eeh_pcid_get(dev); 518 driver = eeh_pcid_get(dev);
530 if (driver) { 519 if (driver) {
531 if (driver->err_handler && 520 if (driver->err_handler &&
@@ -539,8 +528,8 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata)
539 } 528 }
540 529
541 /* Remove it from PCI subsystem */ 530 /* Remove it from PCI subsystem */
542 pr_debug("EEH: Removing %s without EEH sensitive driver\n", 531 pr_info("EEH: Removing %s without EEH sensitive driver\n",
543 pci_name(dev)); 532 pci_name(dev));
544 edev->mode |= EEH_DEV_DISCONNECTED; 533 edev->mode |= EEH_DEV_DISCONNECTED;
545 if (rmv_data) 534 if (rmv_data)
546 rmv_data->removed_dev_count++; 535 rmv_data->removed_dev_count++;
@@ -591,34 +580,22 @@ static void *eeh_pe_detach_dev(struct eeh_pe *pe, void *userdata)
591 * PE reset (for 3 times), we try to clear the frozen state 580 * PE reset (for 3 times), we try to clear the frozen state
592 * for 3 times as well. 581 * for 3 times as well.
593 */ 582 */
594static void *__eeh_clear_pe_frozen_state(struct eeh_pe *pe, void *flag) 583static int eeh_clear_pe_frozen_state(struct eeh_pe *root, bool include_passed)
595{ 584{
596 bool clear_sw_state = *(bool *)flag; 585 struct eeh_pe *pe;
597 int i, rc = 1; 586 int i;
598
599 for (i = 0; rc && i < 3; i++)
600 rc = eeh_unfreeze_pe(pe, clear_sw_state);
601 587
602 /* Stop immediately on any errors */ 588 eeh_for_each_pe(root, pe) {
603 if (rc) { 589 if (include_passed || !eeh_pe_passed(pe)) {
604 pr_warn("%s: Failure %d unfreezing PHB#%x-PE#%x\n", 590 for (i = 0; i < 3; i++)
605 __func__, rc, pe->phb->global_number, pe->addr); 591 if (!eeh_unfreeze_pe(pe))
606 return (void *)pe; 592 break;
593 if (i >= 3)
594 return -EIO;
595 }
607 } 596 }
608 597 eeh_pe_state_clear(root, EEH_PE_ISOLATED, include_passed);
609 return NULL; 598 return 0;
610}
611
612static int eeh_clear_pe_frozen_state(struct eeh_pe *pe,
613 bool clear_sw_state)
614{
615 void *rc;
616
617 rc = eeh_pe_traverse(pe, __eeh_clear_pe_frozen_state, &clear_sw_state);
618 if (!rc)
619 eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
620
621 return rc ? -EIO : 0;
622} 599}
623 600
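The rewrite above replaces a callback walk (eeh_pe_traverse() with a bool smuggled through a void pointer and a non-NULL return used as an abort cookie) with direct iteration. Reduced to its skeleton, with hypothetical check() and act() helpers standing in for the EEH-specific tests, the new control flow is:

/* Sketch only: check() and act() are hypothetical stand-ins. */
static int walk_and_act(struct eeh_pe *root)
{
	struct eeh_pe *pe;
	int i;

	/* eeh_for_each_pe() visits root and every descendant PE. */
	eeh_for_each_pe(root, pe) {
		if (!check(pe))
			continue;
		for (i = 0; i < 3; i++)
			if (!act(pe))
				break;
		if (i >= 3)
			return -EIO;
	}
	return 0;
}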
624int eeh_pe_reset_and_recover(struct eeh_pe *pe) 601int eeh_pe_reset_and_recover(struct eeh_pe *pe)
@@ -636,16 +613,16 @@ int eeh_pe_reset_and_recover(struct eeh_pe *pe)
636 eeh_pe_dev_traverse(pe, eeh_dev_save_state, NULL); 613 eeh_pe_dev_traverse(pe, eeh_dev_save_state, NULL);
637 614
638 /* Issue reset */ 615 /* Issue reset */
639 ret = eeh_pe_reset_full(pe); 616 ret = eeh_pe_reset_full(pe, true);
640 if (ret) { 617 if (ret) {
641 eeh_pe_state_clear(pe, EEH_PE_RECOVERING); 618 eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
642 return ret; 619 return ret;
643 } 620 }
644 621
645 /* Unfreeze the PE */ 622 /* Unfreeze the PE */
646 ret = eeh_clear_pe_frozen_state(pe, true); 623 ret = eeh_clear_pe_frozen_state(pe, true);
647 if (ret) { 624 if (ret) {
648 eeh_pe_state_clear(pe, EEH_PE_RECOVERING); 625 eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
649 return ret; 626 return ret;
650 } 627 }
651 628
@@ -653,7 +630,7 @@ int eeh_pe_reset_and_recover(struct eeh_pe *pe)
653 eeh_pe_dev_traverse(pe, eeh_dev_restore_state, NULL); 630 eeh_pe_dev_traverse(pe, eeh_dev_restore_state, NULL);
654 631
655 /* Clear recovery mode */ 632 /* Clear recovery mode */
656 eeh_pe_state_clear(pe, EEH_PE_RECOVERING); 633 eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
657 634
658 return 0; 635 return 0;
659} 636}
@@ -676,6 +653,11 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
676 time64_t tstamp; 653 time64_t tstamp;
677 int cnt, rc; 654 int cnt, rc;
678 struct eeh_dev *edev; 655 struct eeh_dev *edev;
656 struct eeh_pe *tmp_pe;
657 bool any_passed = false;
658
659 eeh_for_each_pe(pe, tmp_pe)
660 any_passed |= eeh_pe_passed(tmp_pe);
679 661
680 /* pcibios will clear the counter; save the value */ 662 /* pcibios will clear the counter; save the value */
681 cnt = pe->freeze_count; 663 cnt = pe->freeze_count;
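The any_passed accumulation added above is a common idiom with eeh_for_each_pe(): OR a predicate across the whole PE subtree before choosing a recovery strategy. As a standalone helper it would look like:

static bool any_pe_passed(struct eeh_pe *root)
{
	struct eeh_pe *pe;
	bool passed = false;

	/* Visit root and every descendant PE. */
	eeh_for_each_pe(root, pe)
		passed |= eeh_pe_passed(pe);
	return passed;
}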
@@ -688,7 +670,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
688 * into pci_hp_add_devices(). 670 * into pci_hp_add_devices().
689 */ 671 */
690 eeh_pe_state_mark(pe, EEH_PE_KEEP); 672 eeh_pe_state_mark(pe, EEH_PE_KEEP);
691 if (driver_eeh_aware || (pe->type & EEH_PE_VF)) { 673 if (any_passed || driver_eeh_aware || (pe->type & EEH_PE_VF)) {
692 eeh_pe_dev_traverse(pe, eeh_rmv_device, rmv_data); 674 eeh_pe_dev_traverse(pe, eeh_rmv_device, rmv_data);
693 } else { 675 } else {
694 pci_lock_rescan_remove(); 676 pci_lock_rescan_remove();
@@ -705,7 +687,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
705 * config accesses. So we prefer to block them. However, controlled 687 * config accesses. So we prefer to block them. However, controlled
706 * PCI config accesses initiated from EEH itself are allowed. 688 * PCI config accesses initiated from EEH itself are allowed.
707 */ 689 */
708 rc = eeh_pe_reset_full(pe); 690 rc = eeh_pe_reset_full(pe, false);
709 if (rc) 691 if (rc)
710 return rc; 692 return rc;
711 693
@@ -744,11 +726,11 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
744 eeh_add_virt_device(edev); 726 eeh_add_virt_device(edev);
745 } else { 727 } else {
746 if (!driver_eeh_aware) 728 if (!driver_eeh_aware)
747 eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); 729 eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
748 pci_hp_add_devices(bus); 730 pci_hp_add_devices(bus);
749 } 731 }
750 } 732 }
751 eeh_pe_state_clear(pe, EEH_PE_KEEP); 733 eeh_pe_state_clear(pe, EEH_PE_KEEP, true);
752 734
753 pe->tstamp = tstamp; 735 pe->tstamp = tstamp;
754 pe->freeze_count = cnt; 736 pe->freeze_count = cnt;
@@ -900,7 +882,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
900 * is still in frozen state. Clear it before 882 * is still in frozen state. Clear it before
901 * resuming the PE. 883 * resuming the PE.
902 */ 884 */
903 eeh_pe_state_clear(pe, EEH_PE_ISOLATED); 885 eeh_pe_state_clear(pe, EEH_PE_ISOLATED, true);
904 result = PCI_ERS_RESULT_RECOVERED; 886 result = PCI_ERS_RESULT_RECOVERED;
905 } 887 }
906 } 888 }
@@ -977,7 +959,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
977 eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL); 959 eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL);
978 eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); 960 eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
979 } else { 961 } else {
980 eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); 962 eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
981 eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); 963 eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
982 964
983 pci_lock_rescan_remove(); 965 pci_lock_rescan_remove();
@@ -987,7 +969,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
987 return; 969 return;
988 } 970 }
989 } 971 }
990 eeh_pe_state_clear(pe, EEH_PE_RECOVERING); 972 eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
991} 973}
992 974
993/** 975/**
@@ -1069,7 +1051,7 @@ void eeh_handle_special_event(void)
1069 continue; 1051 continue;
1070 1052
1071 /* Notify all devices to be down */ 1053 /* Notify all devices to be down */
1072 eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); 1054 eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
1073 eeh_set_channel_state(pe, pci_channel_io_perm_failure); 1055 eeh_set_channel_state(pe, pci_channel_io_perm_failure);
1074 eeh_pe_report( 1056 eeh_pe_report(
1075 "error_detected(permanent failure)", pe, 1057 "error_detected(permanent failure)", pe,
diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c
index 227e57f980df..539aca055d70 100644
--- a/arch/powerpc/kernel/eeh_event.c
+++ b/arch/powerpc/kernel/eeh_event.c
@@ -121,7 +121,7 @@ int eeh_event_init(void)
121 * the actual event will be delivered in a normal context 121 * the actual event will be delivered in a normal context
122 * (from a workqueue). 122 * (from a workqueue).
123 */ 123 */
124int eeh_send_failure_event(struct eeh_pe *pe) 124int __eeh_send_failure_event(struct eeh_pe *pe)
125{ 125{
126 unsigned long flags; 126 unsigned long flags;
127 struct eeh_event *event; 127 struct eeh_event *event;
@@ -144,6 +144,20 @@ int eeh_send_failure_event(struct eeh_pe *pe)
144 return 0; 144 return 0;
145} 145}
146 146
147int eeh_send_failure_event(struct eeh_pe *pe)
148{
149 /*
 150 * If we've manually suppressed recovery events via debugfs
151 * then just drop it on the floor.
152 */
153 if (eeh_debugfs_no_recover) {
154 pr_err("EEH: Event dropped due to no_recover setting\n");
155 return 0;
156 }
157
158 return __eeh_send_failure_event(pe);
159}
160
147/** 161/**
148 * eeh_remove_event - Remove EEH event from the queue 162 * eeh_remove_event - Remove EEH event from the queue
149 * @pe: Event binding to the PE 163 * @pe: Event binding to the PE
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index 6fa2032e0594..8b578891f27c 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -657,62 +657,52 @@ void eeh_pe_dev_mode_mark(struct eeh_pe *pe, int mode)
657} 657}
658 658
659/** 659/**
660 * __eeh_pe_state_clear - Clear state for the PE 660 * eeh_pe_state_clear - Clear state for the PE
 661 * @data: EEH PE 661 * @root: EEH PE
662 * @flag: state 662 * @state: state
663 * @include_passed: include passed-through devices?
663 * 664 *
664 * The function is used to clear the indicated state from the 665 * The function is used to clear the indicated state from the
665 * given PE. Besides, we also clear the check count of the PE 666 * given PE. Besides, we also clear the check count of the PE
666 * as well. 667 * as well.
667 */ 668 */
668static void *__eeh_pe_state_clear(struct eeh_pe *pe, void *flag) 669void eeh_pe_state_clear(struct eeh_pe *root, int state, bool include_passed)
669{ 670{
670 int state = *((int *)flag); 671 struct eeh_pe *pe;
671 struct eeh_dev *edev, *tmp; 672 struct eeh_dev *edev, *tmp;
672 struct pci_dev *pdev; 673 struct pci_dev *pdev;
673 674
674 /* Keep the state of permanently removed PE intact */ 675 eeh_for_each_pe(root, pe) {
675 if (pe->state & EEH_PE_REMOVED) 676 /* Keep the state of permanently removed PE intact */
676 return NULL; 677 if (pe->state & EEH_PE_REMOVED)
678 continue;
677 679
678 pe->state &= ~state; 680 if (!include_passed && eeh_pe_passed(pe))
681 continue;
679 682
680 /* 683 pe->state &= ~state;
681 * Special treatment on clearing isolated state. Clear
682 * check count since last isolation and put all affected
683 * devices to normal state.
684 */
685 if (!(state & EEH_PE_ISOLATED))
686 return NULL;
687 684
688 pe->check_count = 0; 685 /*
689 eeh_pe_for_each_dev(pe, edev, tmp) { 686 * Special treatment on clearing isolated state. Clear
690 pdev = eeh_dev_to_pci_dev(edev); 687 * check count since last isolation and put all affected
691 if (!pdev) 688 * devices to normal state.
689 */
690 if (!(state & EEH_PE_ISOLATED))
692 continue; 691 continue;
693 692
694 pdev->error_state = pci_channel_io_normal; 693 pe->check_count = 0;
695 } 694 eeh_pe_for_each_dev(pe, edev, tmp) {
696 695 pdev = eeh_dev_to_pci_dev(edev);
697 /* Unblock PCI config access if required */ 696 if (!pdev)
698 if (pe->state & EEH_PE_CFG_RESTRICTED) 697 continue;
699 pe->state &= ~EEH_PE_CFG_BLOCKED;
700 698
701 return NULL; 699 pdev->error_state = pci_channel_io_normal;
702} 700 }
703 701
704/** 702 /* Unblock PCI config access if required */
705 * eeh_pe_state_clear - Clear state for the PE and its children 703 if (pe->state & EEH_PE_CFG_RESTRICTED)
706 * @pe: PE 704 pe->state &= ~EEH_PE_CFG_BLOCKED;
707 * @state: state to be cleared 705 }
708 *
709 * When the PE and its children has been recovered from error,
710 * we need clear the error state for that. The function is used
711 * for the purpose.
712 */
713void eeh_pe_state_clear(struct eeh_pe *pe, int state)
714{
715 eeh_pe_traverse(pe, __eeh_pe_state_clear, &state);
716} 706}
717 707
718/* 708/*
diff --git a/arch/powerpc/kernel/eeh_sysfs.c b/arch/powerpc/kernel/eeh_sysfs.c
index deed906dd8f1..3fa04dda1737 100644
--- a/arch/powerpc/kernel/eeh_sysfs.c
+++ b/arch/powerpc/kernel/eeh_sysfs.c
@@ -82,8 +82,9 @@ static ssize_t eeh_pe_state_store(struct device *dev,
82 if (!(edev->pe->state & EEH_PE_ISOLATED)) 82 if (!(edev->pe->state & EEH_PE_ISOLATED))
83 return count; 83 return count;
84 84
85 if (eeh_unfreeze_pe(edev->pe, true)) 85 if (eeh_unfreeze_pe(edev->pe))
86 return -EIO; 86 return -EIO;
87 eeh_pe_state_clear(edev->pe, EEH_PE_ISOLATED, true);
87 88
88 return count; 89 return count;
89} 90}
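This sysfs change completes the pattern visible throughout the series: eeh_unfreeze_pe() no longer clears software state, so every caller now pairs it with an explicit eeh_pe_state_clear(). The resulting caller-side contract, sketched with the real functions as they exist after this patch:

/* Thaw the hardware first, then clear the software ISOLATED flag
 * only on success. */
static int demo_thaw(struct eeh_pe *pe)
{
	int ret = eeh_unfreeze_pe(pe);

	if (!ret)
		eeh_pe_state_clear(pe, EEH_PE_ISOLATED, true);
	return ret;
}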
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 0768dfd8a64e..b61cfd29c76f 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -97,14 +97,11 @@ crit_transfer_to_handler:
97 mfspr r0,SPRN_SRR1 97 mfspr r0,SPRN_SRR1
98 stw r0,_SRR1(r11) 98 stw r0,_SRR1(r11)
99 99
100 /* set the stack limit to the current stack 100 /* set the stack limit to the current stack */
101 * and set the limit to protect the thread_info
102 * struct
103 */
104 mfspr r8,SPRN_SPRG_THREAD 101 mfspr r8,SPRN_SPRG_THREAD
105 lwz r0,KSP_LIMIT(r8) 102 lwz r0,KSP_LIMIT(r8)
106 stw r0,SAVED_KSP_LIMIT(r11) 103 stw r0,SAVED_KSP_LIMIT(r11)
107 rlwimi r0,r1,0,0,(31-THREAD_SHIFT) 104 rlwinm r0,r1,0,0,(31 - THREAD_SHIFT)
108 stw r0,KSP_LIMIT(r8) 105 stw r0,KSP_LIMIT(r8)
109 /* fall through */ 106 /* fall through */
110#endif 107#endif
@@ -121,14 +118,11 @@ crit_transfer_to_handler:
121 mfspr r0,SPRN_SRR1 118 mfspr r0,SPRN_SRR1
122 stw r0,crit_srr1@l(0) 119 stw r0,crit_srr1@l(0)
123 120
124 /* set the stack limit to the current stack 121 /* set the stack limit to the current stack */
125 * and set the limit to protect the thread_info
126 * struct
127 */
128 mfspr r8,SPRN_SPRG_THREAD 122 mfspr r8,SPRN_SPRG_THREAD
129 lwz r0,KSP_LIMIT(r8) 123 lwz r0,KSP_LIMIT(r8)
130 stw r0,saved_ksp_limit@l(0) 124 stw r0,saved_ksp_limit@l(0)
131 rlwimi r0,r1,0,0,(31-THREAD_SHIFT) 125 rlwinm r0,r1,0,0,(31 - THREAD_SHIFT)
132 stw r0,KSP_LIMIT(r8) 126 stw r0,KSP_LIMIT(r8)
133 /* fall through */ 127 /* fall through */
134#endif 128#endif
@@ -157,7 +151,6 @@ transfer_to_handler:
157 stw r2,_XER(r11) 151 stw r2,_XER(r11)
158 mfspr r12,SPRN_SPRG_THREAD 152 mfspr r12,SPRN_SPRG_THREAD
159 addi r2,r12,-THREAD 153 addi r2,r12,-THREAD
160 tovirt(r2,r2) /* set r2 to current */
161 beq 2f /* if from user, fix up THREAD.regs */ 154 beq 2f /* if from user, fix up THREAD.regs */
162 addi r11,r1,STACK_FRAME_OVERHEAD 155 addi r11,r1,STACK_FRAME_OVERHEAD
163 stw r11,PT_REGS(r12) 156 stw r11,PT_REGS(r12)
@@ -166,6 +159,9 @@ transfer_to_handler:
166 internal debug mode bit to do this. */ 159 internal debug mode bit to do this. */
167 lwz r12,THREAD_DBCR0(r12) 160 lwz r12,THREAD_DBCR0(r12)
168 andis. r12,r12,DBCR0_IDM@h 161 andis. r12,r12,DBCR0_IDM@h
162#endif
163 ACCOUNT_CPU_USER_ENTRY(r2, r11, r12)
164#if defined(CONFIG_40x) || defined(CONFIG_BOOKE)
169 beq+ 3f 165 beq+ 3f
170 /* From user and task is ptraced - load up global dbcr0 */ 166 /* From user and task is ptraced - load up global dbcr0 */
171 li r12,-1 /* clear all pending debug events */ 167 li r12,-1 /* clear all pending debug events */
@@ -174,8 +170,7 @@ transfer_to_handler:
174 tophys(r11,r11) 170 tophys(r11,r11)
175 addi r11,r11,global_dbcr0@l 171 addi r11,r11,global_dbcr0@l
176#ifdef CONFIG_SMP 172#ifdef CONFIG_SMP
177 CURRENT_THREAD_INFO(r9, r1) 173 lwz r9,TASK_CPU(r2)
178 lwz r9,TI_CPU(r9)
179 slwi r9,r9,3 174 slwi r9,r9,3
180 add r11,r11,r9 175 add r11,r11,r9
181#endif 176#endif
@@ -185,11 +180,6 @@ transfer_to_handler:
185 addi r12,r12,-1 180 addi r12,r12,-1
186 stw r12,4(r11) 181 stw r12,4(r11)
187#endif 182#endif
188#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
189 CURRENT_THREAD_INFO(r9, r1)
190 tophys(r9, r9)
191 ACCOUNT_CPU_USER_ENTRY(r9, r11, r12)
192#endif
193 183
194 b 3f 184 b 3f
195 185
@@ -201,9 +191,7 @@ transfer_to_handler:
201 ble- stack_ovf /* then the kernel stack overflowed */ 191 ble- stack_ovf /* then the kernel stack overflowed */
2025: 1925:
203#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_E500) 193#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_E500)
204 CURRENT_THREAD_INFO(r9, r1) 194 lwz r12,TI_LOCAL_FLAGS(r2)
205 tophys(r9,r9) /* check local flags */
206 lwz r12,TI_LOCAL_FLAGS(r9)
207 mtcrf 0x01,r12 195 mtcrf 0x01,r12
208 bt- 31-TLF_NAPPING,4f 196 bt- 31-TLF_NAPPING,4f
209 bt- 31-TLF_SLEEPING,7f 197 bt- 31-TLF_SLEEPING,7f
@@ -212,6 +200,7 @@ transfer_to_handler:
212transfer_to_handler_cont: 200transfer_to_handler_cont:
2133: 2013:
214 mflr r9 202 mflr r9
203 tovirt(r2, r2) /* set r2 to current */
215 lwz r11,0(r9) /* virtual address of handler */ 204 lwz r11,0(r9) /* virtual address of handler */
216 lwz r9,4(r9) /* where to go when done */ 205 lwz r9,4(r9) /* where to go when done */
217#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS) 206#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS)
@@ -275,11 +264,11 @@ reenable_mmu: /* re-enable mmu so we can */
275 264
276#if defined (CONFIG_PPC_BOOK3S_32) || defined(CONFIG_E500) 265#if defined (CONFIG_PPC_BOOK3S_32) || defined(CONFIG_E500)
2774: rlwinm r12,r12,0,~_TLF_NAPPING 2664: rlwinm r12,r12,0,~_TLF_NAPPING
278 stw r12,TI_LOCAL_FLAGS(r9) 267 stw r12,TI_LOCAL_FLAGS(r2)
279 b power_save_ppc32_restore 268 b power_save_ppc32_restore
280 269
2817: rlwinm r12,r12,0,~_TLF_SLEEPING 2707: rlwinm r12,r12,0,~_TLF_SLEEPING
282 stw r12,TI_LOCAL_FLAGS(r9) 271 stw r12,TI_LOCAL_FLAGS(r2)
283 lwz r9,_MSR(r11) /* if sleeping, clear MSR.EE */ 272 lwz r9,_MSR(r11) /* if sleeping, clear MSR.EE */
284 rlwinm r9,r9,0,~MSR_EE 273 rlwinm r9,r9,0,~MSR_EE
285 lwz r12,_LINK(r11) /* and return to address in LR */ 274 lwz r12,_LINK(r11) /* and return to address in LR */
@@ -351,8 +340,7 @@ _GLOBAL(DoSyscall)
351 mtmsr r11 340 mtmsr r11
3521: 3411:
353#endif /* CONFIG_TRACE_IRQFLAGS */ 342#endif /* CONFIG_TRACE_IRQFLAGS */
354 CURRENT_THREAD_INFO(r10, r1) 343 lwz r11,TI_FLAGS(r2)
355 lwz r11,TI_FLAGS(r10)
356 andi. r11,r11,_TIF_SYSCALL_DOTRACE 344 andi. r11,r11,_TIF_SYSCALL_DOTRACE
357 bne- syscall_dotrace 345 bne- syscall_dotrace
358syscall_dotrace_cont: 346syscall_dotrace_cont:
@@ -385,13 +373,12 @@ ret_from_syscall:
385 lwz r3,GPR3(r1) 373 lwz r3,GPR3(r1)
386#endif 374#endif
387 mr r6,r3 375 mr r6,r3
388 CURRENT_THREAD_INFO(r12, r1)
389 /* disable interrupts so current_thread_info()->flags can't change */ 376 /* disable interrupts so current_thread_info()->flags can't change */
390 LOAD_MSR_KERNEL(r10,MSR_KERNEL) /* doesn't include MSR_EE */ 377 LOAD_MSR_KERNEL(r10,MSR_KERNEL) /* doesn't include MSR_EE */
391 /* Note: We don't bother telling lockdep about it */ 378 /* Note: We don't bother telling lockdep about it */
392 SYNC 379 SYNC
393 MTMSRD(r10) 380 MTMSRD(r10)
394 lwz r9,TI_FLAGS(r12) 381 lwz r9,TI_FLAGS(r2)
395 li r8,-MAX_ERRNO 382 li r8,-MAX_ERRNO
396 andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK) 383 andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK)
397 bne- syscall_exit_work 384 bne- syscall_exit_work
@@ -438,8 +425,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX)
438#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 425#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
439 andi. r4,r8,MSR_PR 426 andi. r4,r8,MSR_PR
440 beq 3f 427 beq 3f
441 CURRENT_THREAD_INFO(r4, r1) 428 ACCOUNT_CPU_USER_EXIT(r2, r5, r7)
442 ACCOUNT_CPU_USER_EXIT(r4, r5, r7)
4433: 4293:
444#endif 430#endif
445 lwz r4,_LINK(r1) 431 lwz r4,_LINK(r1)
@@ -532,7 +518,7 @@ syscall_exit_work:
532 /* Clear per-syscall TIF flags if any are set. */ 518 /* Clear per-syscall TIF flags if any are set. */
533 519
534 li r11,_TIF_PERSYSCALL_MASK 520 li r11,_TIF_PERSYSCALL_MASK
535 addi r12,r12,TI_FLAGS 521 addi r12,r2,TI_FLAGS
5363: lwarx r8,0,r12 5223: lwarx r8,0,r12
537 andc r8,r8,r11 523 andc r8,r8,r11
538#ifdef CONFIG_IBM405_ERR77 524#ifdef CONFIG_IBM405_ERR77
@@ -540,7 +526,6 @@ syscall_exit_work:
540#endif 526#endif
541 stwcx. r8,0,r12 527 stwcx. r8,0,r12
542 bne- 3b 528 bne- 3b
543 subi r12,r12,TI_FLAGS
544 529
5454: /* Anything which requires enabling interrupts? */ 5304: /* Anything which requires enabling interrupts? */
546 andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP) 531 andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP)
@@ -745,6 +730,9 @@ fast_exception_return:
745 mtcr r10 730 mtcr r10
746 lwz r10,_LINK(r11) 731 lwz r10,_LINK(r11)
747 mtlr r10 732 mtlr r10
733 /* Clear the exception_marker on the stack to avoid confusing stacktrace */
734 li r10, 0
735 stw r10, 8(r11)
748 REST_GPR(10, r11) 736 REST_GPR(10, r11)
749#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS) 737#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS)
750 mtspr SPRN_NRI, r0 738 mtspr SPRN_NRI, r0
@@ -819,8 +807,7 @@ ret_from_except:
819 807
820user_exc_return: /* r10 contains MSR_KERNEL here */ 808user_exc_return: /* r10 contains MSR_KERNEL here */
821 /* Check current_thread_info()->flags */ 809 /* Check current_thread_info()->flags */
822 CURRENT_THREAD_INFO(r9, r1) 810 lwz r9,TI_FLAGS(r2)
823 lwz r9,TI_FLAGS(r9)
824 andi. r0,r9,_TIF_USER_WORK_MASK 811 andi. r0,r9,_TIF_USER_WORK_MASK
825 bne do_work 812 bne do_work
826 813
@@ -832,18 +819,14 @@ restore_user:
832 andis. r10,r0,DBCR0_IDM@h 819 andis. r10,r0,DBCR0_IDM@h
833 bnel- load_dbcr0 820 bnel- load_dbcr0
834#endif 821#endif
835#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 822 ACCOUNT_CPU_USER_EXIT(r2, r10, r11)
836 CURRENT_THREAD_INFO(r9, r1)
837 ACCOUNT_CPU_USER_EXIT(r9, r10, r11)
838#endif
839 823
840 b restore 824 b restore
841 825
842/* N.B. the only way to get here is from the beq following ret_from_except. */ 826/* N.B. the only way to get here is from the beq following ret_from_except. */
843resume_kernel: 827resume_kernel:
844 /* check current_thread_info, _TIF_EMULATE_STACK_STORE */ 828 /* check current_thread_info, _TIF_EMULATE_STACK_STORE */
845 CURRENT_THREAD_INFO(r9, r1) 829 lwz r8,TI_FLAGS(r2)
846 lwz r8,TI_FLAGS(r9)
847 andis. r0,r8,_TIF_EMULATE_STACK_STORE@h 830 andis. r0,r8,_TIF_EMULATE_STACK_STORE@h
848 beq+ 1f 831 beq+ 1f
849 832
@@ -869,7 +852,7 @@ resume_kernel:
869 852
870 /* Clear _TIF_EMULATE_STACK_STORE flag */ 853 /* Clear _TIF_EMULATE_STACK_STORE flag */
871 lis r11,_TIF_EMULATE_STACK_STORE@h 854 lis r11,_TIF_EMULATE_STACK_STORE@h
872 addi r5,r9,TI_FLAGS 855 addi r5,r2,TI_FLAGS
8730: lwarx r8,0,r5 8560: lwarx r8,0,r5
874 andc r8,r8,r11 857 andc r8,r8,r11
875#ifdef CONFIG_IBM405_ERR77 858#ifdef CONFIG_IBM405_ERR77
@@ -881,7 +864,7 @@ resume_kernel:
881 864
882#ifdef CONFIG_PREEMPT 865#ifdef CONFIG_PREEMPT
883 /* check current_thread_info->preempt_count */ 866 /* check current_thread_info->preempt_count */
884 lwz r0,TI_PREEMPT(r9) 867 lwz r0,TI_PREEMPT(r2)
885 cmpwi 0,r0,0 /* if non-zero, just restore regs and return */ 868 cmpwi 0,r0,0 /* if non-zero, just restore regs and return */
886 bne restore 869 bne restore
887 andi. r8,r8,_TIF_NEED_RESCHED 870 andi. r8,r8,_TIF_NEED_RESCHED
@@ -897,8 +880,7 @@ resume_kernel:
897 bl trace_hardirqs_off 880 bl trace_hardirqs_off
898#endif 881#endif
8991: bl preempt_schedule_irq 8821: bl preempt_schedule_irq
900 CURRENT_THREAD_INFO(r9, r1) 883 lwz r3,TI_FLAGS(r2)
901 lwz r3,TI_FLAGS(r9)
902 andi. r0,r3,_TIF_NEED_RESCHED 884 andi. r0,r3,_TIF_NEED_RESCHED
903 bne- 1b 885 bne- 1b
904#ifdef CONFIG_TRACE_IRQFLAGS 886#ifdef CONFIG_TRACE_IRQFLAGS
@@ -982,6 +964,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX)
982 mtcrf 0xFF,r10 964 mtcrf 0xFF,r10
983 mtlr r11 965 mtlr r11
984 966
967 /* Clear the exception_marker on the stack to avoid confusing stacktrace */
968 li r10, 0
969 stw r10, 8(r1)
985 /* 970 /*
986 * Once we put values in SRR0 and SRR1, we are in a state 971 * Once we put values in SRR0 and SRR1, we are in a state
987 * where exceptions are not recoverable, since taking an 972 * where exceptions are not recoverable, since taking an
@@ -997,9 +982,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX)
997 .globl exc_exit_restart 982 .globl exc_exit_restart
998exc_exit_restart: 983exc_exit_restart:
999 lwz r12,_NIP(r1) 984 lwz r12,_NIP(r1)
1000#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS)
1001 mtspr SPRN_NRI, r0
1002#endif
1003 mtspr SPRN_SRR0,r12 985 mtspr SPRN_SRR0,r12
1004 mtspr SPRN_SRR1,r9 986 mtspr SPRN_SRR1,r9
1005 REST_4GPRS(9, r1) 987 REST_4GPRS(9, r1)
@@ -1021,6 +1003,9 @@ exc_exit_restart_end:
1021 mtlr r11 1003 mtlr r11
1022 lwz r10,_CCR(r1) 1004 lwz r10,_CCR(r1)
1023 mtcrf 0xff,r10 1005 mtcrf 0xff,r10
1006 /* Clear the exception_marker on the stack to avoid confusing stacktrace */
1007 li r10, 0
1008 stw r10, 8(r1)
1024 REST_2GPRS(9, r1) 1009 REST_2GPRS(9, r1)
1025 .globl exc_exit_restart 1010 .globl exc_exit_restart
1026exc_exit_restart: 1011exc_exit_restart:
@@ -1166,10 +1151,6 @@ ret_from_debug_exc:
1166 mfspr r9,SPRN_SPRG_THREAD 1151 mfspr r9,SPRN_SPRG_THREAD
1167 lwz r10,SAVED_KSP_LIMIT(r1) 1152 lwz r10,SAVED_KSP_LIMIT(r1)
1168 stw r10,KSP_LIMIT(r9) 1153 stw r10,KSP_LIMIT(r9)
1169 lwz r9,THREAD_INFO-THREAD(r9)
1170 CURRENT_THREAD_INFO(r10, r1)
1171 lwz r10,TI_PREEMPT(r10)
1172 stw r10,TI_PREEMPT(r9)
1173 RESTORE_xSRR(SRR0,SRR1); 1154 RESTORE_xSRR(SRR0,SRR1);
1174 RESTORE_xSRR(CSRR0,CSRR1); 1155 RESTORE_xSRR(CSRR0,CSRR1);
1175 RESTORE_MMU_REGS; 1156 RESTORE_MMU_REGS;
@@ -1201,8 +1182,7 @@ load_dbcr0:
1201 lis r11,global_dbcr0@ha 1182 lis r11,global_dbcr0@ha
1202 addi r11,r11,global_dbcr0@l 1183 addi r11,r11,global_dbcr0@l
1203#ifdef CONFIG_SMP 1184#ifdef CONFIG_SMP
1204 CURRENT_THREAD_INFO(r9, r1) 1185 lwz r9,TASK_CPU(r2)
1205 lwz r9,TI_CPU(r9)
1206 slwi r9,r9,3 1186 slwi r9,r9,3
1207 add r11,r11,r9 1187 add r11,r11,r9
1208#endif 1188#endif
@@ -1242,8 +1222,7 @@ recheck:
1242 LOAD_MSR_KERNEL(r10,MSR_KERNEL) 1222 LOAD_MSR_KERNEL(r10,MSR_KERNEL)
1243 SYNC 1223 SYNC
1244 MTMSRD(r10) /* disable interrupts */ 1224 MTMSRD(r10) /* disable interrupts */
1245 CURRENT_THREAD_INFO(r9, r1) 1225 lwz r9,TI_FLAGS(r2)
1246 lwz r9,TI_FLAGS(r9)
1247 andi. r0,r9,_TIF_NEED_RESCHED 1226 andi. r0,r9,_TIF_NEED_RESCHED
1248 bne- do_resched 1227 bne- do_resched
1249 andi. r0,r9,_TIF_USER_WORK_MASK 1228 andi. r0,r9,_TIF_USER_WORK_MASK
@@ -1292,10 +1271,13 @@ BEGIN_FTR_SECTION
1292END_FTR_SECTION_IFSET(CPU_FTR_601) 1271END_FTR_SECTION_IFSET(CPU_FTR_601)
1293 lwz r3,_TRAP(r1) 1272 lwz r3,_TRAP(r1)
1294 andi. r0,r3,1 1273 andi. r0,r3,1
1295 beq 4f 1274 beq 5f
1296 SAVE_NVGPRS(r1) 1275 SAVE_NVGPRS(r1)
1297 rlwinm r3,r3,0,0,30 1276 rlwinm r3,r3,0,0,30
1298 stw r3,_TRAP(r1) 1277 stw r3,_TRAP(r1)
12785: mfspr r2,SPRN_SPRG_THREAD
1279 addi r2,r2,-THREAD
1280 tovirt(r2,r2) /* set back r2 to current */
12994: addi r3,r1,STACK_FRAME_OVERHEAD 12814: addi r3,r1,STACK_FRAME_OVERHEAD
1300 bl unrecoverable_exception 1282 bl unrecoverable_exception
1301 /* shouldn't return */ 1283 /* shouldn't return */
@@ -1335,7 +1317,7 @@ _GLOBAL(enter_rtas)
1335 MTMSRD(r0) /* don't get trashed */ 1317 MTMSRD(r0) /* don't get trashed */
1336 li r9,MSR_KERNEL & ~(MSR_IR|MSR_DR) 1318 li r9,MSR_KERNEL & ~(MSR_IR|MSR_DR)
1337 mtlr r6 1319 mtlr r6
1338 mtspr SPRN_SPRG_RTAS,r7 1320 stw r7, THREAD + RTAS_SP(r2)
1339 mtspr SPRN_SRR0,r8 1321 mtspr SPRN_SRR0,r8
1340 mtspr SPRN_SRR1,r9 1322 mtspr SPRN_SRR1,r9
1341 RFI 1323 RFI
@@ -1344,7 +1326,8 @@ _GLOBAL(enter_rtas)
1344 lwz r9,8(r9) /* original msr value */ 1326 lwz r9,8(r9) /* original msr value */
1345 addi r1,r1,INT_FRAME_SIZE 1327 addi r1,r1,INT_FRAME_SIZE
1346 li r0,0 1328 li r0,0
1347 mtspr SPRN_SPRG_RTAS,r0 1329 tophys(r7, r2)
1330 stw r0, THREAD + RTAS_SP(r7)
1348 mtspr SPRN_SRR0,r8 1331 mtspr SPRN_SRR0,r8
1349 mtspr SPRN_SRR1,r9 1332 mtspr SPRN_SRR1,r9
1350 RFI /* return to caller */ 1333 RFI /* return to caller */
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 435927f549c4..15c67d2c0534 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -166,7 +166,7 @@ system_call: /* label this so stack traces look sane */
166 li r10,IRQS_ENABLED 166 li r10,IRQS_ENABLED
167 std r10,SOFTE(r1) 167 std r10,SOFTE(r1)
168 168
169 CURRENT_THREAD_INFO(r11, r1) 169 ld r11, PACA_THREAD_INFO(r13)
170 ld r10,TI_FLAGS(r11) 170 ld r10,TI_FLAGS(r11)
171 andi. r11,r10,_TIF_SYSCALL_DOTRACE 171 andi. r11,r10,_TIF_SYSCALL_DOTRACE
172 bne .Lsyscall_dotrace /* does not return */ 172 bne .Lsyscall_dotrace /* does not return */
@@ -213,7 +213,7 @@ system_call: /* label this so stack traces look sane */
213 ld r3,RESULT(r1) 213 ld r3,RESULT(r1)
214#endif 214#endif
215 215
216 CURRENT_THREAD_INFO(r12, r1) 216 ld r12, PACA_THREAD_INFO(r13)
217 217
218 ld r8,_MSR(r1) 218 ld r8,_MSR(r1)
219#ifdef CONFIG_PPC_BOOK3S 219#ifdef CONFIG_PPC_BOOK3S
@@ -236,18 +236,14 @@ system_call_exit:
236 /* 236 /*
237 * Disable interrupts so current_thread_info()->flags can't change, 237 * Disable interrupts so current_thread_info()->flags can't change,
238 * and so that we don't get interrupted after loading SRR0/1. 238 * and so that we don't get interrupted after loading SRR0/1.
239 *
240 * Leave MSR_RI enabled for now, because with THREAD_INFO_IN_TASK we
241 * could fault on the load of the TI_FLAGS below.
239 */ 242 */
240#ifdef CONFIG_PPC_BOOK3E 243#ifdef CONFIG_PPC_BOOK3E
241 wrteei 0 244 wrteei 0
242#else 245#else
243 /* 246 li r11,MSR_RI
244 * For performance reasons we clear RI the same time that we
245 * clear EE. We only need to clear RI just before we restore r13
246 * below, but batching it with EE saves us one expensive mtmsrd call.
247 * We have to be careful to restore RI if we branch anywhere from
248 * here (eg syscall_exit_work).
249 */
250 li r11,0
251 mtmsrd r11,1 247 mtmsrd r11,1
252#endif /* CONFIG_PPC_BOOK3E */ 248#endif /* CONFIG_PPC_BOOK3E */
253 249
@@ -263,15 +259,7 @@ system_call_exit:
263 bne 3f 259 bne 3f
264#endif 260#endif
2652: addi r3,r1,STACK_FRAME_OVERHEAD 2612: addi r3,r1,STACK_FRAME_OVERHEAD
266#ifdef CONFIG_PPC_BOOK3S
267 li r10,MSR_RI
268 mtmsrd r10,1 /* Restore RI */
269#endif
270 bl restore_math 262 bl restore_math
271#ifdef CONFIG_PPC_BOOK3S
272 li r11,0
273 mtmsrd r11,1
274#endif
275 ld r8,_MSR(r1) 263 ld r8,_MSR(r1)
276 ld r3,RESULT(r1) 264 ld r3,RESULT(r1)
277 li r11,-MAX_ERRNO 265 li r11,-MAX_ERRNO
@@ -287,6 +275,16 @@ END_FTR_SECTION_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS)
287 andi. r6,r8,MSR_PR 275 andi. r6,r8,MSR_PR
288 ld r4,_LINK(r1) 276 ld r4,_LINK(r1)
289 277
278#ifdef CONFIG_PPC_BOOK3S
279 /*
280 * Clear MSR_RI, MSR_EE is already and remains disabled. We could do
281 * this later, but testing shows that doing it here causes less slow
282 * down than doing it closer to the rfid.
283 */
284 li r11,0
285 mtmsrd r11,1
286#endif
287
290 beq- 1f 288 beq- 1f
291 ACCOUNT_CPU_USER_EXIT(r13, r11, r12) 289 ACCOUNT_CPU_USER_EXIT(r13, r11, r12)
292 290
@@ -348,7 +346,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
348 346
349 /* Repopulate r9 and r10 for the syscall path */ 347 /* Repopulate r9 and r10 for the syscall path */
350 addi r9,r1,STACK_FRAME_OVERHEAD 348 addi r9,r1,STACK_FRAME_OVERHEAD
351 CURRENT_THREAD_INFO(r10, r1) 349 ld r10, PACA_THREAD_INFO(r13)
352 ld r10,TI_FLAGS(r10) 350 ld r10,TI_FLAGS(r10)
353 351
354 cmpldi r0,NR_syscalls 352 cmpldi r0,NR_syscalls
@@ -363,10 +361,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
363 b .Lsyscall_exit 361 b .Lsyscall_exit
364 362
365.Lsyscall_exit_work: 363.Lsyscall_exit_work:
366#ifdef CONFIG_PPC_BOOK3S
367 li r10,MSR_RI
368 mtmsrd r10,1 /* Restore RI */
369#endif
370 /* If TIF_RESTOREALL is set, don't scribble on either r3 or ccr. 364 /* If TIF_RESTOREALL is set, don't scribble on either r3 or ccr.
371 If TIF_NOERROR is set, just save r3 as it is. */ 365 If TIF_NOERROR is set, just save r3 as it is. */
372 366
@@ -695,7 +689,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
6952: 6892:
696#endif /* CONFIG_PPC_BOOK3S_64 */ 690#endif /* CONFIG_PPC_BOOK3S_64 */
697 691
698 CURRENT_THREAD_INFO(r7, r8) /* base of new stack */ 692 clrrdi r7, r8, THREAD_SHIFT /* base of new stack */
699 /* Note: this uses SWITCH_FRAME_SIZE rather than INT_FRAME_SIZE 693 /* Note: this uses SWITCH_FRAME_SIZE rather than INT_FRAME_SIZE
700 because we don't need to leave the 288-byte ABI gap at the 694 because we don't need to leave the 288-byte ABI gap at the
701 top of the kernel stack. */ 695 top of the kernel stack. */
@@ -746,7 +740,7 @@ _GLOBAL(ret_from_except_lite)
746 mtmsrd r10,1 /* Update machine state */ 740 mtmsrd r10,1 /* Update machine state */
747#endif /* CONFIG_PPC_BOOK3E */ 741#endif /* CONFIG_PPC_BOOK3E */
748 742
749 CURRENT_THREAD_INFO(r9, r1) 743 ld r9, PACA_THREAD_INFO(r13)
750 ld r3,_MSR(r1) 744 ld r3,_MSR(r1)
751#ifdef CONFIG_PPC_BOOK3E 745#ifdef CONFIG_PPC_BOOK3E
752 ld r10,PACACURRENT(r13) 746 ld r10,PACACURRENT(r13)
@@ -860,7 +854,7 @@ resume_kernel:
8601: bl preempt_schedule_irq 8541: bl preempt_schedule_irq
861 855
862 /* Re-test flags and eventually loop */ 856 /* Re-test flags and eventually loop */
863 CURRENT_THREAD_INFO(r9, r1) 857 ld r9, PACA_THREAD_INFO(r13)
864 ld r4,TI_FLAGS(r9) 858 ld r4,TI_FLAGS(r9)
865 andi. r0,r4,_TIF_NEED_RESCHED 859 andi. r0,r4,_TIF_NEED_RESCHED
866 bne 1b 860 bne 1b
@@ -1002,6 +996,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
1002 ld r2,_NIP(r1) 996 ld r2,_NIP(r1)
1003 mtspr SPRN_SRR0,r2 997 mtspr SPRN_SRR0,r2
1004 998
999 /*
1000 * Leaving a stale exception_marker on the stack can confuse
1001 * the reliable stack unwinder later on. Clear it.
1002 */
1003 li r2,0
1004 std r2,STACK_FRAME_OVERHEAD-16(r1)
1005
1005 ld r0,GPR0(r1) 1006 ld r0,GPR0(r1)
1006 ld r2,GPR2(r1) 1007 ld r2,GPR2(r1)
1007 ld r3,GPR3(r1) 1008 ld r3,GPR3(r1)
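Both entry paths now zero the exception marker on exit because the reliable stack unwinder uses that word (the ASCII string "regshere" on 64-bit) to decide whether a frame carries a pt_regs block. A sketch of the consumer side, with names as in arch/powerpc and the exact slot offset assumed:

static bool frame_has_regs(const unsigned long *sp)
{
	/* A stale marker left behind after exception exit would make an
	 * ordinary frame look like an exception frame, so the exit path
	 * now zeroes the slot before returning. */
	return sp[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER;
}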
diff --git a/arch/powerpc/kernel/epapr_hcalls.S b/arch/powerpc/kernel/epapr_hcalls.S
index 52ca2471ee1a..d252f4663a23 100644
--- a/arch/powerpc/kernel/epapr_hcalls.S
+++ b/arch/powerpc/kernel/epapr_hcalls.S
@@ -21,10 +21,9 @@
21#ifndef CONFIG_PPC64 21#ifndef CONFIG_PPC64
22/* epapr_ev_idle() was derived from e500_idle() */ 22/* epapr_ev_idle() was derived from e500_idle() */
23_GLOBAL(epapr_ev_idle) 23_GLOBAL(epapr_ev_idle)
24 CURRENT_THREAD_INFO(r3, r1) 24 PPC_LL r4, TI_LOCAL_FLAGS(r2) /* set napping bit */
25 PPC_LL r4, TI_LOCAL_FLAGS(r3) /* set napping bit */
26 ori r4, r4,_TLF_NAPPING /* so when we take an exception */ 25 ori r4, r4,_TLF_NAPPING /* so when we take an exception */
27 PPC_STL r4, TI_LOCAL_FLAGS(r3) /* it will return to our caller */ 26 PPC_STL r4, TI_LOCAL_FLAGS(r2) /* it will return to our caller */
28 27
29 wrteei 1 28 wrteei 1
30 29
diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S
index afb638778f44..49381f32b374 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -77,17 +77,6 @@ special_reg_save:
77 andi. r3,r3,MSR_PR 77 andi. r3,r3,MSR_PR
78 bnelr 78 bnelr
79 79
80 /* Copy info into temporary exception thread info */
81 ld r11,PACAKSAVE(r13)
82 CURRENT_THREAD_INFO(r11, r11)
83 CURRENT_THREAD_INFO(r12, r1)
84 ld r10,TI_FLAGS(r11)
85 std r10,TI_FLAGS(r12)
86 ld r10,TI_PREEMPT(r11)
87 std r10,TI_PREEMPT(r12)
88 ld r10,TI_TASK(r11)
89 std r10,TI_TASK(r12)
90
91 /* 80 /*
92 * Advance to the next TLB exception frame for handler 81 * Advance to the next TLB exception frame for handler
93 * types that don't do it automatically. 82 * types that don't do it automatically.
@@ -349,6 +338,7 @@ ret_from_mc_except:
349#define GEN_BTB_FLUSH 338#define GEN_BTB_FLUSH
350#define CRIT_BTB_FLUSH 339#define CRIT_BTB_FLUSH
351#define DBG_BTB_FLUSH 340#define DBG_BTB_FLUSH
341#define MC_BTB_FLUSH
352#define GDBELL_BTB_FLUSH 342#define GDBELL_BTB_FLUSH
353#endif 343#endif
354 344
@@ -504,7 +494,7 @@ exc_##n##_bad_stack: \
504 * interrupts happen before the wait instruction. 494 * interrupts happen before the wait instruction.
505 */ 495 */
506#define CHECK_NAPPING() \ 496#define CHECK_NAPPING() \
507 CURRENT_THREAD_INFO(r11, r1); \ 497 ld r11, PACA_THREAD_INFO(r13); \
508 ld r10,TI_LOCAL_FLAGS(r11); \ 498 ld r10,TI_LOCAL_FLAGS(r11); \
509 andi. r9,r10,_TLF_NAPPING; \ 499 andi. r9,r10,_TLF_NAPPING; \
510 beq+ 1f; \ 500 beq+ 1f; \
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 9e253ce27e08..a5b8fbae56a0 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -68,6 +68,14 @@ OPEN_FIXED_SECTION(real_vectors, 0x0100, 0x1900)
68OPEN_FIXED_SECTION(real_trampolines, 0x1900, 0x4000) 68OPEN_FIXED_SECTION(real_trampolines, 0x1900, 0x4000)
69OPEN_FIXED_SECTION(virt_vectors, 0x4000, 0x5900) 69OPEN_FIXED_SECTION(virt_vectors, 0x4000, 0x5900)
70OPEN_FIXED_SECTION(virt_trampolines, 0x5900, 0x7000) 70OPEN_FIXED_SECTION(virt_trampolines, 0x5900, 0x7000)
71
72#ifdef CONFIG_PPC_POWERNV
73 .globl start_real_trampolines
74 .globl end_real_trampolines
75 .globl start_virt_trampolines
76 .globl end_virt_trampolines
77#endif
78
71#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) 79#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
72/* 80/*
73 * Data area reserved for FWNMI option. 81 * Data area reserved for FWNMI option.
@@ -566,8 +574,36 @@ EXC_COMMON_BEGIN(mce_return)
566 RFI_TO_KERNEL 574 RFI_TO_KERNEL
567 b . 575 b .
568 576
569EXC_REAL(data_access, 0x300, 0x80) 577EXC_REAL_BEGIN(data_access, 0x300, 0x80)
570EXC_VIRT(data_access, 0x4300, 0x80, 0x300) 578SET_SCRATCH0(r13) /* save r13 */
579EXCEPTION_PROLOG_0(PACA_EXGEN)
580 b tramp_real_data_access
581EXC_REAL_END(data_access, 0x300, 0x80)
582
583TRAMP_REAL_BEGIN(tramp_real_data_access)
584EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_PR, 0x300)
585 /*
586 * DAR/DSISR must be read before setting MSR[RI], because
587 * a d-side MCE will clobber those registers so is not
588 * recoverable if they are live.
589 */
590 mfspr r10,SPRN_DAR
591 mfspr r11,SPRN_DSISR
592 std r10,PACA_EXGEN+EX_DAR(r13)
593 stw r11,PACA_EXGEN+EX_DSISR(r13)
594EXCEPTION_PROLOG_2(data_access_common, EXC_STD)
595
596EXC_VIRT_BEGIN(data_access, 0x4300, 0x80)
597SET_SCRATCH0(r13) /* save r13 */
598EXCEPTION_PROLOG_0(PACA_EXGEN)
599EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0x300)
600 mfspr r10,SPRN_DAR
601 mfspr r11,SPRN_DSISR
602 std r10,PACA_EXGEN+EX_DAR(r13)
603 stw r11,PACA_EXGEN+EX_DSISR(r13)
604EXCEPTION_PROLOG_2_RELON(data_access_common, EXC_STD)
605EXC_VIRT_END(data_access, 0x4300, 0x80)
606
571TRAMP_KVM_SKIP(PACA_EXGEN, 0x300) 607TRAMP_KVM_SKIP(PACA_EXGEN, 0x300)
572 608
573EXC_COMMON_BEGIN(data_access_common) 609EXC_COMMON_BEGIN(data_access_common)
@@ -575,11 +611,8 @@ EXC_COMMON_BEGIN(data_access_common)
575 * Here r13 points to the paca, r9 contains the saved CR, 611 * Here r13 points to the paca, r9 contains the saved CR,
576 * SRR0 and SRR1 are saved in r11 and r12, 612 * SRR0 and SRR1 are saved in r11 and r12,
577 * r9 - r13 are saved in paca->exgen. 613 * r9 - r13 are saved in paca->exgen.
614 * EX_DAR and EX_DSISR have saved DAR/DSISR
578 */ 615 */
579 mfspr r10,SPRN_DAR
580 std r10,PACA_EXGEN+EX_DAR(r13)
581 mfspr r10,SPRN_DSISR
582 stw r10,PACA_EXGEN+EX_DSISR(r13)
583 EXCEPTION_PROLOG_COMMON(0x300, PACA_EXGEN) 616 EXCEPTION_PROLOG_COMMON(0x300, PACA_EXGEN)
584 RECONCILE_IRQ_STATE(r10, r11) 617 RECONCILE_IRQ_STATE(r10, r11)
585 ld r12,_MSR(r1) 618 ld r12,_MSR(r1)
@@ -596,18 +629,29 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
596 629
597 630
598EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80) 631EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80)
599EXCEPTION_PROLOG(PACA_EXSLB, data_access_slb_common, EXC_STD, KVMTEST_PR, 0x380); 632SET_SCRATCH0(r13) /* save r13 */
633EXCEPTION_PROLOG_0(PACA_EXSLB)
634 b tramp_real_data_access_slb
600EXC_REAL_END(data_access_slb, 0x380, 0x80) 635EXC_REAL_END(data_access_slb, 0x380, 0x80)
601 636
637TRAMP_REAL_BEGIN(tramp_real_data_access_slb)
638EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380)
639 mfspr r10,SPRN_DAR
640 std r10,PACA_EXSLB+EX_DAR(r13)
641EXCEPTION_PROLOG_2(data_access_slb_common, EXC_STD)
642
602EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) 643EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80)
603EXCEPTION_RELON_PROLOG(PACA_EXSLB, data_access_slb_common, EXC_STD, NOTEST, 0x380); 644SET_SCRATCH0(r13) /* save r13 */
645EXCEPTION_PROLOG_0(PACA_EXSLB)
646EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x380)
647 mfspr r10,SPRN_DAR
648 std r10,PACA_EXSLB+EX_DAR(r13)
649EXCEPTION_PROLOG_2_RELON(data_access_slb_common, EXC_STD)
604EXC_VIRT_END(data_access_slb, 0x4380, 0x80) 650EXC_VIRT_END(data_access_slb, 0x4380, 0x80)
605 651
606TRAMP_KVM_SKIP(PACA_EXSLB, 0x380) 652TRAMP_KVM_SKIP(PACA_EXSLB, 0x380)
607 653
608EXC_COMMON_BEGIN(data_access_slb_common) 654EXC_COMMON_BEGIN(data_access_slb_common)
609 mfspr r10,SPRN_DAR
610 std r10,PACA_EXSLB+EX_DAR(r13)
611 EXCEPTION_PROLOG_COMMON(0x380, PACA_EXSLB) 655 EXCEPTION_PROLOG_COMMON(0x380, PACA_EXSLB)
612 ld r4,PACA_EXSLB+EX_DAR(r13) 656 ld r4,PACA_EXSLB+EX_DAR(r13)
613 std r4,_DAR(r1) 657 std r4,_DAR(r1)
@@ -703,14 +747,30 @@ TRAMP_KVM_HV(PACA_EXGEN, 0x500)
703EXC_COMMON_ASYNC(hardware_interrupt_common, 0x500, do_IRQ) 747EXC_COMMON_ASYNC(hardware_interrupt_common, 0x500, do_IRQ)
704 748
705 749
706EXC_REAL(alignment, 0x600, 0x100) 750EXC_REAL_BEGIN(alignment, 0x600, 0x100)
707EXC_VIRT(alignment, 0x4600, 0x100, 0x600) 751SET_SCRATCH0(r13) /* save r13 */
708TRAMP_KVM(PACA_EXGEN, 0x600) 752EXCEPTION_PROLOG_0(PACA_EXGEN)
709EXC_COMMON_BEGIN(alignment_common) 753EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_PR, 0x600)
710 mfspr r10,SPRN_DAR 754 mfspr r10,SPRN_DAR
755 mfspr r11,SPRN_DSISR
711 std r10,PACA_EXGEN+EX_DAR(r13) 756 std r10,PACA_EXGEN+EX_DAR(r13)
712 mfspr r10,SPRN_DSISR 757 stw r11,PACA_EXGEN+EX_DSISR(r13)
713 stw r10,PACA_EXGEN+EX_DSISR(r13) 758EXCEPTION_PROLOG_2(alignment_common, EXC_STD)
759EXC_REAL_END(alignment, 0x600, 0x100)
760
761EXC_VIRT_BEGIN(alignment, 0x4600, 0x100)
762SET_SCRATCH0(r13) /* save r13 */
763EXCEPTION_PROLOG_0(PACA_EXGEN)
764EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0x600)
765 mfspr r10,SPRN_DAR
766 mfspr r11,SPRN_DSISR
767 std r10,PACA_EXGEN+EX_DAR(r13)
768 stw r11,PACA_EXGEN+EX_DSISR(r13)
769EXCEPTION_PROLOG_2_RELON(alignment_common, EXC_STD)
770EXC_VIRT_END(alignment, 0x4600, 0x100)
771
772TRAMP_KVM(PACA_EXGEN, 0x600)
773EXC_COMMON_BEGIN(alignment_common)
714 EXCEPTION_PROLOG_COMMON(0x600, PACA_EXGEN) 774 EXCEPTION_PROLOG_COMMON(0x600, PACA_EXGEN)
715 ld r3,PACA_EXGEN+EX_DAR(r13) 775 ld r3,PACA_EXGEN+EX_DAR(r13)
716 lwz r4,PACA_EXGEN+EX_DSISR(r13) 776 lwz r4,PACA_EXGEN+EX_DSISR(r13)
@@ -1629,7 +1689,7 @@ do_hash_page:
1629 ori r0,r0,DSISR_BAD_FAULT_64S@l 1689 ori r0,r0,DSISR_BAD_FAULT_64S@l
1630 and. r0,r4,r0 /* weird error? */ 1690 and. r0,r4,r0 /* weird error? */
1631 bne- handle_page_fault /* if not, try to insert a HPTE */ 1691 bne- handle_page_fault /* if not, try to insert a HPTE */
1632 CURRENT_THREAD_INFO(r11, r1) 1692 ld r11, PACA_THREAD_INFO(r13)
1633 lwz r0,TI_PREEMPT(r11) /* If we're in an "NMI" */ 1693 lwz r0,TI_PREEMPT(r11) /* If we're in an "NMI" */
1634 andis. r0,r0,NMI_MASK@h /* (i.e. an irq when soft-disabled) */ 1694 andis. r0,r0,NMI_MASK@h /* (i.e. an irq when soft-disabled) */
1635 bne 77f /* then don't call hash_page now */ 1695 bne 77f /* then don't call hash_page now */
diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index 05b08db3901d..ce6a972f2584 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -261,7 +261,7 @@ __secondary_hold_acknowledge:
261 tophys(r11,r1); /* use tophys(r1) if kernel */ \ 261 tophys(r11,r1); /* use tophys(r1) if kernel */ \
262 beq 1f; \ 262 beq 1f; \
263 mfspr r11,SPRN_SPRG_THREAD; \ 263 mfspr r11,SPRN_SPRG_THREAD; \
264 lwz r11,THREAD_INFO-THREAD(r11); \ 264 lwz r11,TASK_STACK-THREAD(r11); \
265 addi r11,r11,THREAD_SIZE; \ 265 addi r11,r11,THREAD_SIZE; \
266 tophys(r11,r11); \ 266 tophys(r11,r11); \
2671: subi r11,r11,INT_FRAME_SIZE /* alloc exc. frame */ 2671: subi r11,r11,INT_FRAME_SIZE /* alloc exc. frame */
@@ -352,9 +352,8 @@ i##n: \
352 * registers that might have bad values includes all the GPRs 352 * registers that might have bad values includes all the GPRs
353 * and all the BATs. We indicate that we are in RTAS by putting 353 * and all the BATs. We indicate that we are in RTAS by putting
354 * a non-zero value, the address of the exception frame to use, 354 * a non-zero value, the address of the exception frame to use,
355 * in SPRG2. The machine check handler checks SPRG2 and uses its 355 * in thread.rtas_sp. The machine check handler checks thread.rtas_sp
356 * value if it is non-zero. If we ever needed to free up SPRG2, 356 * and uses its value if it is non-zero.
357 * we could use a field in the thread_info or thread_struct instead.
358 * (Other exception handlers assume that r1 is a valid kernel stack 357 * (Other exception handlers assume that r1 is a valid kernel stack
359 * pointer when we take an exception from supervisor mode.) 358 * pointer when we take an exception from supervisor mode.)
360 * -- paulus. 359 * -- paulus.
@@ -365,16 +364,15 @@ i##n: \
365 mtspr SPRN_SPRG_SCRATCH1,r11 364 mtspr SPRN_SPRG_SCRATCH1,r11
366 mfcr r10 365 mfcr r10
367#ifdef CONFIG_PPC_CHRP 366#ifdef CONFIG_PPC_CHRP
368 mfspr r11,SPRN_SPRG_RTAS 367 mfspr r11, SPRN_SPRG_THREAD
369 cmpwi 0,r11,0 368 lwz r11, RTAS_SP(r11)
370 bne 7f 369 cmpwi cr1, r11, 0
370 bne cr1, 7f
371#endif /* CONFIG_PPC_CHRP */ 371#endif /* CONFIG_PPC_CHRP */
372 EXCEPTION_PROLOG_1 372 EXCEPTION_PROLOG_1
3737: EXCEPTION_PROLOG_2 3737: EXCEPTION_PROLOG_2
374 addi r3,r1,STACK_FRAME_OVERHEAD 374 addi r3,r1,STACK_FRAME_OVERHEAD
375#ifdef CONFIG_PPC_CHRP 375#ifdef CONFIG_PPC_CHRP
376 mfspr r4,SPRN_SPRG_RTAS
377 cmpwi cr1,r4,0
378 bne cr1,1f 376 bne cr1,1f
379#endif 377#endif
380 EXC_XFER_STD(0x200, machine_check_exception) 378 EXC_XFER_STD(0x200, machine_check_exception)
@@ -500,18 +498,22 @@ InstructionTLBMiss:
  */
 	/* Get PTE (linux-style) and check access */
 	mfspr	r3,SPRN_IMISS
+#if defined(CONFIG_MODULES) || defined(CONFIG_DEBUG_PAGEALLOC)
 	lis	r1,PAGE_OFFSET@h		/* check if kernel address */
 	cmplw	0,r1,r3
-	mfspr	r2,SPRN_SPRG_THREAD
-	li	r1,_PAGE_USER|_PAGE_PRESENT|_PAGE_EXEC /* low addresses tested as user */
-	lwz	r2,PGDIR(r2)
+#endif
+	mfspr	r2, SPRN_SPRG_PGDIR
+#ifdef CONFIG_SWAP
+	li	r1,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC
+#else
+	li	r1,_PAGE_PRESENT | _PAGE_EXEC
+#endif
+#if defined(CONFIG_MODULES) || defined(CONFIG_DEBUG_PAGEALLOC)
 	bge-	112f
-	mfspr	r2,SPRN_SRR1		/* and MSR_PR bit from SRR1 */
-	rlwimi	r1,r2,32-12,29,29	/* shift MSR_PR to _PAGE_USER posn */
-	lis	r2,swapper_pg_dir@ha	/* if kernel address, use */
-	addi	r2,r2,swapper_pg_dir@l	/* kernel page table */
-112:	tophys(r2,r2)
-	rlwimi	r2,r3,12,20,29		/* insert top 10 bits of address */
+	lis	r2, (swapper_pg_dir - PAGE_OFFSET)@ha	/* if kernel address, use */
+	addi	r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l	/* kernel page table */
+#endif
+112:	rlwimi	r2,r3,12,20,29		/* insert top 10 bits of address */
 	lwz	r2,0(r2)		/* get pmd entry */
 	rlwinm.	r2,r2,0,0,19		/* extract address of pte page */
 	beq-	InstructionAddressInvalid	/* return if no mapping */
@@ -519,20 +521,10 @@ InstructionTLBMiss:
 	lwz	r0,0(r2)		/* get linux-style pte */
 	andc.	r1,r1,r0		/* check access & ~permission */
 	bne-	InstructionAddressInvalid /* return if access not permitted */
-	ori	r0,r0,_PAGE_ACCESSED	/* set _PAGE_ACCESSED in pte */
-	/*
-	 * NOTE! We are assuming this is not an SMP system, otherwise
-	 * we would need to update the pte atomically with lwarx/stwcx.
-	 */
-	stw	r0,0(r2)		/* update PTE (accessed bit) */
 	/* Convert linux-style PTE to low word of PPC-style PTE */
-	rlwinm	r1,r0,32-10,31,31	/* _PAGE_RW -> PP lsb */
-	rlwinm	r2,r0,32-7,31,31	/* _PAGE_DIRTY -> PP lsb */
-	and	r1,r1,r2		/* writable if _RW and _DIRTY */
 	rlwimi	r0,r0,32-1,30,30	/* _PAGE_USER -> PP msb */
-	rlwimi	r0,r0,32-1,31,31	/* _PAGE_USER -> PP lsb */
-	ori	r1,r1,0xe04		/* clear out reserved bits */
-	andc	r1,r0,r1		/* PP = user? (rw&dirty? 2: 3): 0 */
+	ori	r1, r1, 0xe05		/* clear out reserved bits */
+	andc	r1, r0, r1		/* PP = user? 2 : 0 */
 BEGIN_FTR_SECTION
 	rlwinm	r1,r1,0,~_PAGE_COHERENT	/* clear M (coherence not required) */
 END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT)
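With _PAGE_ACCESSED folded into the access mask, the PTE-to-PP conversion above collapses to a two-way choice. A compilable C illustration of that mapping -- the bit value is hypothetical, and the result mirrors the "PP = user? 2 : 0" comment in the new code:

	#include <stdio.h>
	#include <stdint.h>

	#define PAGE_USER 0x004	/* hypothetical bit value, illustration only */

	/* New ITLB rule from the hunk above: PP = user ? 2 : 0. */
	static uint32_t pp_from_pte(uint32_t pte)
	{
		return (pte & PAGE_USER) ? 2 : 0;
	}

	int main(void)
	{
		printf("user PTE   -> PP %u\n", pp_from_pte(PAGE_USER));
		printf("kernel PTE -> PP %u\n", pp_from_pte(0));
		return 0;
	}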
@@ -576,16 +568,16 @@ DataLoadTLBMiss:
 	mfspr	r3,SPRN_DMISS
 	lis	r1,PAGE_OFFSET@h		/* check if kernel address */
 	cmplw	0,r1,r3
-	mfspr	r2,SPRN_SPRG_THREAD
-	li	r1,_PAGE_USER|_PAGE_PRESENT /* low addresses tested as user */
-	lwz	r2,PGDIR(r2)
+	mfspr	r2, SPRN_SPRG_PGDIR
+#ifdef CONFIG_SWAP
+	li	r1, _PAGE_PRESENT | _PAGE_ACCESSED
+#else
+	li	r1, _PAGE_PRESENT
+#endif
 	bge-	112f
-	mfspr	r2,SPRN_SRR1		/* and MSR_PR bit from SRR1 */
-	rlwimi	r1,r2,32-12,29,29	/* shift MSR_PR to _PAGE_USER posn */
-	lis	r2,swapper_pg_dir@ha	/* if kernel address, use */
-	addi	r2,r2,swapper_pg_dir@l	/* kernel page table */
-112:	tophys(r2,r2)
-	rlwimi	r2,r3,12,20,29		/* insert top 10 bits of address */
+	lis	r2, (swapper_pg_dir - PAGE_OFFSET)@ha	/* if kernel address, use */
+	addi	r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l	/* kernel page table */
+112:	rlwimi	r2,r3,12,20,29		/* insert top 10 bits of address */
 	lwz	r2,0(r2)		/* get pmd entry */
 	rlwinm.	r2,r2,0,0,19		/* extract address of pte page */
 	beq-	DataAddressInvalid	/* return if no mapping */
@@ -593,20 +585,16 @@ DataLoadTLBMiss:
 	lwz	r0,0(r2)		/* get linux-style pte */
 	andc.	r1,r1,r0		/* check access & ~permission */
 	bne-	DataAddressInvalid	/* return if access not permitted */
-	ori	r0,r0,_PAGE_ACCESSED	/* set _PAGE_ACCESSED in pte */
 	/*
 	 * NOTE! We are assuming this is not an SMP system, otherwise
 	 * we would need to update the pte atomically with lwarx/stwcx.
 	 */
-	stw	r0,0(r2)		/* update PTE (accessed bit) */
 	/* Convert linux-style PTE to low word of PPC-style PTE */
 	rlwinm	r1,r0,32-10,31,31	/* _PAGE_RW -> PP lsb */
-	rlwinm	r2,r0,32-7,31,31	/* _PAGE_DIRTY -> PP lsb */
-	and	r1,r1,r2		/* writable if _RW and _DIRTY */
 	rlwimi	r0,r0,32-1,30,30	/* _PAGE_USER -> PP msb */
 	rlwimi	r0,r0,32-1,31,31	/* _PAGE_USER -> PP lsb */
 	ori	r1,r1,0xe04		/* clear out reserved bits */
-	andc	r1,r0,r1		/* PP = user? (rw&dirty? 2: 3): 0 */
+	andc	r1,r0,r1		/* PP = user? rw? 2: 3: 0 */
 BEGIN_FTR_SECTION
 	rlwinm	r1,r1,0,~_PAGE_COHERENT	/* clear M (coherence not required) */
 END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT)
@@ -660,16 +648,16 @@ DataStoreTLBMiss:
 	mfspr	r3,SPRN_DMISS
 	lis	r1,PAGE_OFFSET@h		/* check if kernel address */
 	cmplw	0,r1,r3
-	mfspr	r2,SPRN_SPRG_THREAD
-	li	r1,_PAGE_RW|_PAGE_USER|_PAGE_PRESENT /* access flags */
-	lwz	r2,PGDIR(r2)
+	mfspr	r2, SPRN_SPRG_PGDIR
+#ifdef CONFIG_SWAP
+	li	r1, _PAGE_RW | _PAGE_PRESENT | _PAGE_ACCESSED
+#else
+	li	r1, _PAGE_RW | _PAGE_PRESENT
+#endif
 	bge-	112f
-	mfspr	r2,SPRN_SRR1		/* and MSR_PR bit from SRR1 */
-	rlwimi	r1,r2,32-12,29,29	/* shift MSR_PR to _PAGE_USER posn */
-	lis	r2,swapper_pg_dir@ha	/* if kernel address, use */
-	addi	r2,r2,swapper_pg_dir@l	/* kernel page table */
-112:	tophys(r2,r2)
-	rlwimi	r2,r3,12,20,29		/* insert top 10 bits of address */
+	lis	r2, (swapper_pg_dir - PAGE_OFFSET)@ha	/* if kernel address, use */
+	addi	r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l	/* kernel page table */
+112:	rlwimi	r2,r3,12,20,29		/* insert top 10 bits of address */
 	lwz	r2,0(r2)		/* get pmd entry */
 	rlwinm.	r2,r2,0,0,19		/* extract address of pte page */
 	beq-	DataAddressInvalid	/* return if no mapping */
@@ -677,12 +665,10 @@ DataStoreTLBMiss:
 	lwz	r0,0(r2)		/* get linux-style pte */
 	andc.	r1,r1,r0		/* check access & ~permission */
 	bne-	DataAddressInvalid	/* return if access not permitted */
-	ori	r0,r0,_PAGE_ACCESSED|_PAGE_DIRTY
 	/*
 	 * NOTE! We are assuming this is not an SMP system, otherwise
 	 * we would need to update the pte atomically with lwarx/stwcx.
 	 */
-	stw	r0,0(r2)		/* update PTE (accessed/dirty bits) */
 	/* Convert linux-style PTE to low word of PPC-style PTE */
 	rlwimi	r0,r0,32-1,30,30	/* _PAGE_USER -> PP msb */
 	li	r1,0xe05		/* clear out reserved bits & PP lsb */
@@ -845,12 +831,12 @@ __secondary_start:
 	bl	init_idle_6xx
 #endif /* CONFIG_PPC_BOOK3S_32 */
 
-	/* get current_thread_info and current */
-	lis	r1,secondary_ti@ha
-	tophys(r1,r1)
-	lwz	r1,secondary_ti@l(r1)
-	tophys(r2,r1)
-	lwz	r2,TI_TASK(r2)
+	/* get current's stack and current */
+	lis	r2,secondary_current@ha
+	tophys(r2,r2)
+	lwz	r2,secondary_current@l(r2)
+	tophys(r1,r2)
+	lwz	r1,TASK_STACK(r1)
 
 	/* stack */
 	addi	r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD
@@ -865,8 +851,10 @@ __secondary_start:
 	tophys(r4,r2)
 	addi	r4,r4,THREAD	/* phys address of our thread_struct */
 	mtspr	SPRN_SPRG_THREAD,r4
+#ifdef CONFIG_PPC_RTAS
 	li	r3,0
-	mtspr	SPRN_SPRG_RTAS,r3	/* 0 => not in RTAS */
+	stw	r3, RTAS_SP(r4)		/* 0 => not in RTAS */
+#endif
 
 	/* enable MMU and jump to start_secondary */
 	li	r4,MSR_KERNEL
@@ -950,8 +938,10 @@ start_here:
 	tophys(r4,r2)
 	addi	r4,r4,THREAD	/* init task's THREAD */
 	mtspr	SPRN_SPRG_THREAD,r4
+#ifdef CONFIG_PPC_RTAS
 	li	r3,0
-	mtspr	SPRN_SPRG_RTAS,r3	/* 0 => not in RTAS */
+	stw	r3, RTAS_SP(r4)		/* 0 => not in RTAS */
+#endif
 
 	/* stack */
 	lis	r1,init_thread_union@ha
@@ -1022,15 +1012,16 @@ _ENTRY(switch_mmu_context)
 	li	r0,NUM_USER_SEGMENTS
 	mtctr	r0
 
+	lwz	r4, MM_PGD(r4)
 #ifdef CONFIG_BDI_SWITCH
 	/* Context switch the PTE pointer for the Abatron BDI2000.
 	 * The PGDIR is passed as second argument.
 	 */
-	lwz	r4,MM_PGD(r4)
-	lis	r5, KERNELBASE@h
-	lwz	r5, 0xf0(r5)
-	stw	r4, 0x4(r5)
+	lis	r5, abatron_pteptrs@ha
+	stw	r4, abatron_pteptrs@l + 0x4(r5)
 #endif
+	tophys(r4, r4)
+	mtspr	SPRN_SPRG_PGDIR, r4
 	li	r4,0
 	isync
 3:
@@ -1105,6 +1096,41 @@ BEGIN_MMU_FTR_SECTION
 END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS)
 	blr
 
+_ENTRY(update_bats)
+	lis	r4, 1f@h
+	ori	r4, r4, 1f@l
+	tophys(r4, r4)
+	mfmsr	r6
+	mflr	r7
+	li	r3, MSR_KERNEL & ~(MSR_IR | MSR_DR)
+	rlwinm	r0, r6, 0, ~MSR_RI
+	rlwinm	r0, r0, 0, ~MSR_EE
+	mtmsr	r0
+	mtspr	SPRN_SRR0, r4
+	mtspr	SPRN_SRR1, r3
+	SYNC
+	RFI
+1:	bl	clear_bats
+	lis	r3, BATS@ha
+	addi	r3, r3, BATS@l
+	tophys(r3, r3)
+	LOAD_BAT(0, r3, r4, r5)
+	LOAD_BAT(1, r3, r4, r5)
+	LOAD_BAT(2, r3, r4, r5)
+	LOAD_BAT(3, r3, r4, r5)
+BEGIN_MMU_FTR_SECTION
+	LOAD_BAT(4, r3, r4, r5)
+	LOAD_BAT(5, r3, r4, r5)
+	LOAD_BAT(6, r3, r4, r5)
+	LOAD_BAT(7, r3, r4, r5)
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS)
+	li	r3, MSR_KERNEL & ~(MSR_IR | MSR_DR | MSR_RI)
+	mtmsr	r3
+	mtspr	SPRN_SRR0, r7
+	mtspr	SPRN_SRR1, r6
+	SYNC
+	RFI
+
 flush_tlbs:
 	lis	r10, 0x40
 1:	addic.	r10, r10, -0x1000
diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S
index b19d78410511..a9c934f2319b 100644
--- a/arch/powerpc/kernel/head_40x.S
+++ b/arch/powerpc/kernel/head_40x.S
@@ -115,7 +115,7 @@ _ENTRY(saved_ksp_limit)
 	andi.	r11,r11,MSR_PR;						     \
 	beq	1f;							     \
 	mfspr	r1,SPRN_SPRG_THREAD;	/* if from user, start at top of   */\
-	lwz	r1,THREAD_INFO-THREAD(r1); /* this thread's kernel stack   */\
+	lwz	r1,TASK_STACK-THREAD(r1); /* this thread's kernel stack   */\
 	addi	r1,r1,THREAD_SIZE;					     \
 1:	subi	r1,r1,INT_FRAME_SIZE;	/* Allocate an exception frame     */\
 	tophys(r11,r1);							     \
@@ -158,7 +158,7 @@ _ENTRY(saved_ksp_limit)
 	beq	1f;							     \
 	/* COMING FROM USER MODE */					     \
 	mfspr	r11,SPRN_SPRG_THREAD;	/* if from user, start at top of   */\
-	lwz	r11,THREAD_INFO-THREAD(r11); /* this thread's kernel stack */\
+	lwz	r11,TASK_STACK-THREAD(r11); /* this thread's kernel stack */\
 1:	addi	r11,r11,THREAD_SIZE-INT_FRAME_SIZE; /* Alloc an excpt frm  */\
 	tophys(r11,r11);						     \
 	stw	r10,_CCR(r11);		/* save various registers	   */\
@@ -953,9 +953,8 @@ _GLOBAL(set_context)
 	/* Context switch the PTE pointer for the Abatron BDI2000.
 	 * The PGDIR is the second parameter.
 	 */
-	lis	r5, KERNELBASE@h
-	lwz	r5, 0xf0(r5)
-	stw	r4, 0x4(r5)
+	lis	r5, abatron_pteptrs@ha
+	stw	r4, abatron_pteptrs@l + 0x4(r5)
 #endif
 	sync
 	mtspr	SPRN_PID,r3
diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S
index bf23c19c92d6..37117ab11584 100644
--- a/arch/powerpc/kernel/head_44x.S
+++ b/arch/powerpc/kernel/head_44x.S
@@ -1019,10 +1019,10 @@ _GLOBAL(start_secondary_47x)
 
 	/* Now we can get our task struct and real stack pointer */
 
-	/* Get current_thread_info and current */
-	lis	r1,secondary_ti@ha
-	lwz	r1,secondary_ti@l(r1)
-	lwz	r2,TI_TASK(r1)
+	/* Get current's stack and current */
+	lis	r2,secondary_current@ha
+	lwz	r2,secondary_current@l(r2)
+	lwz	r1,TASK_STACK(r2)
 
 	/* Current stack pointer */
 	addi	r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index 4898e9491a1c..3fad8d499767 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -801,21 +801,19 @@ __secondary_start:
 	/* Set thread priority to MEDIUM */
 	HMT_MEDIUM
 
-	/* Initialize the kernel stack */
-	LOAD_REG_ADDR(r3, current_set)
-	sldi	r28,r24,3		/* get current_set[cpu#] */
-	ldx	r14,r3,r28
-	addi	r14,r14,THREAD_SIZE-STACK_FRAME_OVERHEAD
-	std	r14,PACAKSAVE(r13)
-
-	/* Do early setup for that CPU (SLB and hash table pointer) */
+	/*
+	 * Do early setup for this CPU, in particular initialising the MMU so we
+	 * can turn it on below. This is a call to C, which is OK, we're still
+	 * running on the emergency stack.
+	 */
 	bl	early_setup_secondary
 
 	/*
-	 * setup the new stack pointer, but *don't* use this until
-	 * translation is on.
+	 * The primary has initialized our kernel stack for us in the paca, grab
+	 * it and put it in r1. We must *not* use it until we turn on the MMU
+	 * below, because it may not be inside the RMO.
 	 */
-	mr	r1, r14
+	ld	r1, PACAKSAVE(r13)
 
 	/* Clear backchain so we get nice backtraces */
 	li	r7,0
diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 20cc816b3508..03c73b4c6435 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -142,7 +142,7 @@ instruction_counter:
 	tophys(r11,r1);			/* use tophys(r1) if kernel */ \
 	beq	1f;		\
 	mfspr	r11,SPRN_SPRG_THREAD;	\
-	lwz	r11,THREAD_INFO-THREAD(r11);	\
+	lwz	r11,TASK_STACK-THREAD(r11);	\
 	addi	r11,r11,THREAD_SIZE;	\
 	tophys(r11,r11);	\
 1:	subi	r11,r11,INT_FRAME_SIZE	/* alloc exc. frame */
@@ -292,6 +292,17 @@ SystemCall:
  */
 	EXCEPTION(0x1000, SoftEmu, program_check_exception, EXC_XFER_STD)
 
+/* Called from DataStoreTLBMiss when perf TLB misses events are activated */
+#ifdef CONFIG_PERF_EVENTS
+	patch_site	0f, patch__dtlbmiss_perf
+0:	lwz	r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0)
+	addi	r10, r10, 1
+	stw	r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0)
+	mfspr	r10, SPRN_SPRG_SCRATCH0
+	mfspr	r11, SPRN_SPRG_SCRATCH1
+	rfi
+#endif
+
 	. = 0x1100
 /*
  * For the MPC8xx, this is a software tablewalk to load the instruction
@@ -337,8 +348,8 @@ InstructionTLBMiss:
 	rlwinm	r10, r10, 16, 0xfff8
 	cmpli	cr0, r10, PAGE_OFFSET@h
 #ifndef CONFIG_PIN_TLB_TEXT
-	/* It is assumed that kernel code fits into the first 8M page */
-0:	cmpli	cr7, r10, (PAGE_OFFSET + 0x0800000)@h
+	/* It is assumed that kernel code fits into the first 32M */
+0:	cmpli	cr7, r10, (PAGE_OFFSET + 0x2000000)@h
 	patch_site	0b, patch__itlbmiss_linmem_top
 #endif
 #endif
@@ -405,10 +416,20 @@ InstructionTLBMiss:
 #ifndef CONFIG_PIN_TLB_TEXT
 ITLBMissLinear:
 	mtcr	r11
+#if defined(CONFIG_STRICT_KERNEL_RWX) && CONFIG_ETEXT_SHIFT < 23
+	patch_site	0f, patch__itlbmiss_linmem_top8
+
+	mfspr	r10, SPRN_SRR0
+0:	subis	r11, r10, (PAGE_OFFSET - 0x80000000)@ha
+	rlwinm	r11, r11, 4, MI_PS8MEG ^ MI_PS512K
+	ori	r11, r11, MI_PS512K | MI_SVALID
+	rlwinm	r10, r10, 0, 0x0ff80000	/* 8xx supports max 256Mb RAM */
+#else
 	/* Set 8M byte page and mark it valid */
 	li	r11, MI_PS8MEG | MI_SVALID
-	mtspr	SPRN_MI_TWC, r11
 	rlwinm	r10, r10, 20, 0x0f800000	/* 8xx supports max 256Mb RAM */
+#endif
+	mtspr	SPRN_MI_TWC, r11
 	ori	r10, r10, 0xf0 | MI_SPS16K | _PAGE_SH | _PAGE_DIRTY | \
 			  _PAGE_PRESENT
 	mtspr	SPRN_MI_RPN, r10	/* Update TLB entry */
@@ -434,7 +455,7 @@ DataStoreTLBMiss:
 #ifndef CONFIG_PIN_TLB_IMMR
 	cmpli	cr6, r10, VIRT_IMMR_BASE@h
 #endif
-0:	cmpli	cr7, r10, (PAGE_OFFSET + 0x1800000)@h
+0:	cmpli	cr7, r10, (PAGE_OFFSET + 0x2000000)@h
 	patch_site	0b, patch__dtlbmiss_linmem_top
 
 	mfspr	r10, SPRN_M_TWB	/* Get level 1 table */
@@ -494,16 +515,6 @@ DataStoreTLBMiss:
 	rfi
 	patch_site	0b, patch__dtlbmiss_exit_1
 
-#ifdef CONFIG_PERF_EVENTS
-	patch_site	0f, patch__dtlbmiss_perf
-0:	lwz	r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0)
-	addi	r10, r10, 1
-	stw	r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0)
-	mfspr	r10, SPRN_SPRG_SCRATCH0
-	mfspr	r11, SPRN_SPRG_SCRATCH1
-	rfi
-#endif
-
 DTLBMissIMMR:
 	mtcr	r11
 	/* Set 512k byte guarded page and mark it valid */
@@ -525,10 +536,29 @@ DTLBMissIMMR:
 
 DTLBMissLinear:
 	mtcr	r11
+	rlwinm	r10, r10, 20, 0x0f800000	/* 8xx supports max 256Mb RAM */
+#if defined(CONFIG_STRICT_KERNEL_RWX) && CONFIG_DATA_SHIFT < 23
+	patch_site	0f, patch__dtlbmiss_romem_top8
+
+0:	subis	r11, r10, (PAGE_OFFSET - 0x80000000)@ha
+	rlwinm	r11, r11, 0, 0xff800000
+	neg	r10, r11
+	or	r11, r11, r10
+	rlwinm	r11, r11, 4, MI_PS8MEG ^ MI_PS512K
+	ori	r11, r11, MI_PS512K | MI_SVALID
+	mfspr	r10, SPRN_MD_EPN
+	rlwinm	r10, r10, 0, 0x0ff80000	/* 8xx supports max 256Mb RAM */
+#else
 	/* Set 8M byte page and mark it valid */
 	li	r11, MD_PS8MEG | MD_SVALID
+#endif
 	mtspr	SPRN_MD_TWC, r11
-	rlwinm	r10, r10, 20, 0x0f800000	/* 8xx supports max 256Mb RAM */
+#ifdef CONFIG_STRICT_KERNEL_RWX
+	patch_site	0f, patch__dtlbmiss_romem_top
+
+0:	subis	r11, r10, 0
+	rlwimi	r10, r11, 11, _PAGE_RO
+#endif
 	ori	r10, r10, 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY | \
 			  _PAGE_PRESENT
 	mtspr	SPRN_MD_RPN, r10	/* Update TLB entry */
@@ -551,11 +581,11 @@ InstructionTLBError:
 	mr	r4,r12
 	andis.	r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */
 	andis.	r10,r9,SRR1_ISI_NOPT@h
-	beq+	1f
+	beq+	.Litlbie
 	tlbie	r4
-itlbie:
 	/* 0x400 is InstructionAccess exception, needed by bad_page_fault() */
-1:	EXC_XFER_LITE(0x400, handle_page_fault)
+.Litlbie:
+	EXC_XFER_LITE(0x400, handle_page_fault)
 
 /* This is the data TLB error on the MPC8xx.  This could be due to
  * many reasons, including a dirty update to a pte.  We bail out to
@@ -577,10 +607,10 @@ DARFixed:/* Return from dcbx instruction bug workaround */
 	stw	r5,_DSISR(r11)
 	mfspr	r4,SPRN_DAR
 	andis.	r10,r5,DSISR_NOHPTE@h
-	beq+	1f
+	beq+	.Ldtlbie
 	tlbie	r4
-dtlbie:
-1:	li	r10,RPN_PATTERN
+.Ldtlbie:
+	li	r10,RPN_PATTERN
 	mtspr	SPRN_DAR,r10	/* Tag DAR, to be used in DTLB Error */
 	/* 0x300 is DataAccess exception, needed by bad_page_fault() */
 	EXC_XFER_LITE(0x300, handle_page_fault)
@@ -603,8 +633,8 @@ DataBreakpoint:
 	mtspr	SPRN_SPRG_SCRATCH1, r11
 	mfcr	r10
 	mfspr	r11, SPRN_SRR0
-	cmplwi	cr0, r11, (dtlbie - PAGE_OFFSET)@l
-	cmplwi	cr7, r11, (itlbie - PAGE_OFFSET)@l
+	cmplwi	cr0, r11, (.Ldtlbie - PAGE_OFFSET)@l
+	cmplwi	cr7, r11, (.Litlbie - PAGE_OFFSET)@l
 	beq-	cr0, 11f
 	beq-	cr7, 11f
 	EXCEPTION_PROLOG_1
@@ -886,28 +916,11 @@ initial_mmu:
 	mtspr	SPRN_MD_CTR, r10	/* remove PINNED DTLB entries */
 
 	tlbia			/* Invalidate all TLB entries */
-#ifdef CONFIG_PIN_TLB_TEXT
-	lis	r8, MI_RSV4I@h
-	ori	r8, r8, 0x1c00
-
-	mtspr	SPRN_MI_CTR, r8	/* Set instruction MMU control */
-#endif
-
 #ifdef CONFIG_PIN_TLB_DATA
 	oris	r10, r10, MD_RSV4I@h
 	mtspr	SPRN_MD_CTR, r10	/* Set data TLB control */
 #endif
 
-	/* Now map the lower 8 Meg into the ITLB. */
-	lis	r8, KERNELBASE@h	/* Create vaddr for TLB */
-	ori	r8, r8, MI_EVALID	/* Mark it valid */
-	mtspr	SPRN_MI_EPN, r8
-	li	r8, MI_PS8MEG		/* Set 8M byte page */
-	ori	r8, r8, MI_SVALID	/* Make it valid */
-	mtspr	SPRN_MI_TWC, r8
-	li	r8, MI_BOOTINIT		/* Create RPN for address 0 */
-	mtspr	SPRN_MI_RPN, r8		/* Store TLB entry */
-
 	lis	r8, MI_APG_INIT@h	/* Set protection modes */
 	ori	r8, r8, MI_APG_INIT@l
 	mtspr	SPRN_MI_AP, r8
@@ -937,6 +950,34 @@ initial_mmu:
 	mtspr	SPRN_MD_RPN, r8
 #endif
 
+	/* Now map the lower RAM (up to 32 Mbytes) into the ITLB. */
+#ifdef CONFIG_PIN_TLB_TEXT
+	lis	r8, MI_RSV4I@h
+	ori	r8, r8, 0x1c00
+#endif
+	li	r9, 4				/* up to 4 pages of 8M */
+	mtctr	r9
+	lis	r9, KERNELBASE@h		/* Create vaddr for TLB */
+	li	r10, MI_PS8MEG | MI_SVALID	/* Set 8M byte page */
+	li	r11, MI_BOOTINIT		/* Create RPN for address 0 */
+	lis	r12, _einittext@h
+	ori	r12, r12, _einittext@l
+1:
+#ifdef CONFIG_PIN_TLB_TEXT
+	mtspr	SPRN_MI_CTR, r8	/* Set instruction MMU control */
+	addi	r8, r8, 0x100
+#endif
+
+	ori	r0, r9, MI_EVALID	/* Mark it valid */
+	mtspr	SPRN_MI_EPN, r0
+	mtspr	SPRN_MI_TWC, r10
+	mtspr	SPRN_MI_RPN, r11	/* Store TLB entry */
+	addis	r9, r9, 0x80
+	addis	r11, r11, 0x80
+
+	cmpl	cr0, r9, r12
+	bdnzf	gt, 1b
+
 	/* Since the cache is enabled according to the information we
 	 * just loaded into the TLB, invalidate and enable the caches here.
 	 * We should probably check/set other modes....later.
@@ -989,5 +1030,6 @@ swapper_pg_dir:
 /* Room for two PTE table poiners, usually the kernel and current user
  * pointer to their respective root page table (pgdir).
  */
+	.globl	abatron_pteptrs
 abatron_pteptrs:
 	.space	8
diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h
index 306e26c073a0..1b22a8dea399 100644
--- a/arch/powerpc/kernel/head_booke.h
+++ b/arch/powerpc/kernel/head_booke.h
@@ -55,7 +55,7 @@ END_BTB_FLUSH_SECTION
 	beq	1f;							     \
 	BOOKE_CLEAR_BTB(r11)						     \
 	/* if from user, start at top of this thread's kernel stack */      \
-	lwz	r11, THREAD_INFO-THREAD(r10);				     \
+	lwz	r11, TASK_STACK - THREAD(r10);				     \
 	ALLOC_STACK_FRAME(r11, THREAD_SIZE);				     \
 1 :	subi	r11, r11, INT_FRAME_SIZE; /* Allocate exception frame */    \
 	stw	r13, _CCR(r11);		/* save various registers */	     \
@@ -142,7 +142,7 @@ END_BTB_FLUSH_SECTION
 	BOOKE_CLEAR_BTB(r10)						     \
 	andi.	r11,r11,MSR_PR;						     \
 	mfspr	r11,SPRN_SPRG_THREAD;	/* if from user, start at top of   */\
-	lwz	r11,THREAD_INFO-THREAD(r11); /* this thread's kernel stack */\
+	lwz	r11, TASK_STACK - THREAD(r11); /* this thread's kernel stack */\
 	addi	r11,r11,EXC_LVL_FRAME_OVERHEAD;	/* allocate stack frame    */\
 	beq	1f;							     \
 	/* COMING FROM USER MODE */					     \
@@ -155,13 +155,7 @@ END_BTB_FLUSH_SECTION
 	stw	r10,GPR11(r11);						     \
 	b	2f;							     \
 	/* COMING FROM PRIV MODE */					     \
-1:	lwz	r9,TI_FLAGS-EXC_LVL_FRAME_OVERHEAD(r11);		     \
-	lwz	r10,TI_PREEMPT-EXC_LVL_FRAME_OVERHEAD(r11);		     \
-	stw	r9,TI_FLAGS-EXC_LVL_FRAME_OVERHEAD(r8);			     \
-	stw	r10,TI_PREEMPT-EXC_LVL_FRAME_OVERHEAD(r8);		     \
-	lwz	r9,TI_TASK-EXC_LVL_FRAME_OVERHEAD(r11);			     \
-	stw	r9,TI_TASK-EXC_LVL_FRAME_OVERHEAD(r8);			     \
-	mr	r11,r8;							     \
+1:	mr	r11, r8;						     \
 2:	mfspr	r8,SPRN_SPRG_RSCRATCH_##exc_level;			     \
 	stw	r12,GPR12(r11);		/* save various registers */	     \
 	mflr	r10;							     \
diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S
index 2386ce2a9c6e..1881127682e9 100644
--- a/arch/powerpc/kernel/head_fsl_booke.S
+++ b/arch/powerpc/kernel/head_fsl_booke.S
@@ -243,8 +243,9 @@ set_ivor:
 	li	r0,0
 	stwu	r0,THREAD_SIZE-STACK_FRAME_OVERHEAD(r1)
 
-	CURRENT_THREAD_INFO(r22, r1)
-	stw	r24, TI_CPU(r22)
+#ifdef CONFIG_SMP
+	stw	r24, TASK_CPU(r2)
+#endif
 
 	bl	early_init
 
@@ -717,8 +718,7 @@ finish_tlb_load:
 
 	/* Get the next_tlbcam_idx percpu var */
 #ifdef CONFIG_SMP
-	lwz	r12, THREAD_INFO-THREAD(r12)
-	lwz	r15, TI_CPU(r12)
+	lwz	r15, TASK_CPU-THREAD(r12)
 	lis	r14, __per_cpu_offset@h
 	ori	r14, r14, __per_cpu_offset@l
 	rlwinm	r15, r15, 2, 0, 29
@@ -1089,10 +1089,10 @@ __secondary_start:
 	mr	r4,r24		/* Why? */
 	bl	call_setup_cpu
 
-	/* get current_thread_info and current */
-	lis	r1,secondary_ti@ha
-	lwz	r1,secondary_ti@l(r1)
-	lwz	r2,TI_TASK(r1)
+	/* get current's stack and current */
+	lis	r2,secondary_current@ha
+	lwz	r2,secondary_current@l(r2)
+	lwz	r1,TASK_STACK(r2)
 
 	/* stack */
 	addi	r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD
diff --git a/arch/powerpc/kernel/idle_6xx.S b/arch/powerpc/kernel/idle_6xx.S
index ff026c9d3cab..c5e7f5bb2e66 100644
--- a/arch/powerpc/kernel/idle_6xx.S
+++ b/arch/powerpc/kernel/idle_6xx.S
@@ -136,10 +136,9 @@ BEGIN_FTR_SECTION
 	DSSALL
 	sync
 END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
-	CURRENT_THREAD_INFO(r9, r1)
-	lwz	r8,TI_LOCAL_FLAGS(r9)	/* set napping bit */
+	lwz	r8,TI_LOCAL_FLAGS(r2)	/* set napping bit */
 	ori	r8,r8,_TLF_NAPPING	/* so when we take an exception */
-	stw	r8,TI_LOCAL_FLAGS(r9)	/* it will return to our caller */
+	stw	r8,TI_LOCAL_FLAGS(r2)	/* it will return to our caller */
 	mfmsr	r7
 	ori	r7,r7,MSR_EE
 	oris	r7,r7,MSR_POW@h
@@ -159,8 +158,7 @@ _GLOBAL(power_save_ppc32_restore)
 	stw	r9,_NIP(r11)		/* make it do a blr */
 
 #ifdef CONFIG_SMP
-	CURRENT_THREAD_INFO(r12, r11)
-	lwz	r11,TI_CPU(r12)		/* get cpu number * 4 */
+	lwz	r11,TASK_CPU(r2)	/* get cpu number * 4 */
 	slwi	r11,r11,2
 #else
 	li	r11,0
diff --git a/arch/powerpc/kernel/idle_book3e.S b/arch/powerpc/kernel/idle_book3e.S
index 4e0d94d02030..31e732c378ad 100644
--- a/arch/powerpc/kernel/idle_book3e.S
+++ b/arch/powerpc/kernel/idle_book3e.S
@@ -63,7 +63,7 @@ _GLOBAL(\name)
 1:	/* Let's set the _TLF_NAPPING flag so interrupts make us return
 	 * to the right spot
 	 */
-	CURRENT_THREAD_INFO(r11, r1)
+	ld	r11, PACACURRENT(r13)
 	ld	r10,TI_LOCAL_FLAGS(r11)
 	ori	r10,r10,_TLF_NAPPING
 	std	r10,TI_LOCAL_FLAGS(r11)
diff --git a/arch/powerpc/kernel/idle_e500.S b/arch/powerpc/kernel/idle_e500.S
index 583e55ac7d26..69dfcd2ca011 100644
--- a/arch/powerpc/kernel/idle_e500.S
+++ b/arch/powerpc/kernel/idle_e500.S
@@ -22,10 +22,9 @@
 	.text
 
 _GLOBAL(e500_idle)
-	CURRENT_THREAD_INFO(r3, r1)
-	lwz	r4,TI_LOCAL_FLAGS(r3)	/* set napping bit */
+	lwz	r4,TI_LOCAL_FLAGS(r2)	/* set napping bit */
 	ori	r4,r4,_TLF_NAPPING	/* so when we take an exception */
-	stw	r4,TI_LOCAL_FLAGS(r3)	/* it will return to our caller */
+	stw	r4,TI_LOCAL_FLAGS(r2)	/* it will return to our caller */
 
 #ifdef CONFIG_PPC_E500MC
 	wrteei	1
@@ -88,8 +87,7 @@ _GLOBAL(power_save_ppc32_restore)
 	stw	r9,_NIP(r11)		/* make it do a blr */
 
 #ifdef CONFIG_SMP
-	CURRENT_THREAD_INFO(r12, r1)
-	lwz	r11,TI_CPU(r12)		/* get cpu number * 4 */
+	lwz	r11,TASK_CPU(r2)	/* get cpu number * 4 */
 	slwi	r11,r11,2
 #else
 	li	r11,0
diff --git a/arch/powerpc/kernel/idle_power4.S b/arch/powerpc/kernel/idle_power4.S
index a09b3c7ca176..a2fdb0a34b75 100644
--- a/arch/powerpc/kernel/idle_power4.S
+++ b/arch/powerpc/kernel/idle_power4.S
@@ -68,7 +68,7 @@ BEGIN_FTR_SECTION
 	DSSALL
 	sync
 END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
-	CURRENT_THREAD_INFO(r9, r1)
+	ld	r9, PACA_THREAD_INFO(r13)
 	ld	r8,TI_LOCAL_FLAGS(r9)	/* set napping bit */
 	ori	r8,r8,_TLF_NAPPING	/* so when we take an exception */
 	std	r8,TI_LOCAL_FLAGS(r9)	/* it will return to our caller */
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 916ddc4aac44..8a936723c791 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -618,9 +618,8 @@ static inline void check_stack_overflow(void)
 	sp = current_stack_pointer() & (THREAD_SIZE-1);
 
 	/* check for stack overflow: is there less than 2KB free? */
-	if (unlikely(sp < (sizeof(struct thread_info) + 2048))) {
-		pr_err("do_IRQ: stack overflow: %ld\n",
-			sp - sizeof(struct thread_info));
+	if (unlikely(sp < 2048)) {
+		pr_err("do_IRQ: stack overflow: %ld\n", sp);
 		dump_stack();
 	}
 #endif
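With thread_info no longer sitting at the bottom of the stack, the low bits of the stack pointer are exactly the free space remaining. A minimal sketch of the check, assuming THREAD_SIZE is a power of two (the assumed value below is configuration dependent):

	#include <stdint.h>

	#define THREAD_SIZE 16384UL	/* assumed; configuration dependent */

	/* sp modulo THREAD_SIZE == bytes still free below the stack pointer */
	static int stack_overflowing(uintptr_t sp)
	{
		return (sp & (THREAD_SIZE - 1)) < 2048;	/* less than 2KB left */
	}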
@@ -660,36 +659,21 @@ void __do_irq(struct pt_regs *regs)
 void do_IRQ(struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
-	struct thread_info *curtp, *irqtp, *sirqtp;
+	void *cursp, *irqsp, *sirqsp;
 
 	/* Switch to the irq stack to handle this */
-	curtp = current_thread_info();
-	irqtp = hardirq_ctx[raw_smp_processor_id()];
-	sirqtp = softirq_ctx[raw_smp_processor_id()];
+	cursp = (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1));
+	irqsp = hardirq_ctx[raw_smp_processor_id()];
+	sirqsp = softirq_ctx[raw_smp_processor_id()];
 
 	/* Already there ? */
-	if (unlikely(curtp == irqtp || curtp == sirqtp)) {
+	if (unlikely(cursp == irqsp || cursp == sirqsp)) {
 		__do_irq(regs);
 		set_irq_regs(old_regs);
 		return;
 	}
-
-	/* Prepare the thread_info in the irq stack */
-	irqtp->task = curtp->task;
-	irqtp->flags = 0;
-
-	/* Copy the preempt_count so that the [soft]irq checks work. */
-	irqtp->preempt_count = curtp->preempt_count;
-
 	/* Switch stack and call */
-	call_do_irq(regs, irqtp);
-
-	/* Restore stack limit */
-	irqtp->task = NULL;
-
-	/* Copy back updates to the thread_info */
-	if (irqtp->flags)
-		set_bits(irqtp->flags, &curtp->flags);
+	call_do_irq(regs, irqsp);
 
 	set_irq_regs(old_regs);
 }
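The same masking trick identifies which stack we are on: rounding the stack pointer down to a THREAD_SIZE boundary yields the stack base, so "already on the irq stack?" becomes a plain pointer comparison with no thread_info bookkeeping. A small C sketch under the same power-of-two assumption:

	#include <stdint.h>

	#define THREAD_SIZE 16384UL	/* assumed; must be a power of two */

	/* Base of whichever THREAD_SIZE-aligned stack sp points into. */
	static void *stack_base(uintptr_t sp)
	{
		return (void *)(sp & ~(THREAD_SIZE - 1));
	}

	/* "Already on one of the irq stacks?" is now a pointer compare. */
	static int on_irq_stack(uintptr_t sp, void *irqsp, void *sirqsp)
	{
		void *base = stack_base(sp);

		return base == irqsp || base == sirqsp;
	}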
@@ -698,90 +682,20 @@ void __init init_IRQ(void)
 {
 	if (ppc_md.init_IRQ)
 		ppc_md.init_IRQ();
-
-	exc_lvl_ctx_init();
-
-	irq_ctx_init();
 }
 
 #if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
-struct thread_info *critirq_ctx[NR_CPUS] __read_mostly;
-struct thread_info *dbgirq_ctx[NR_CPUS] __read_mostly;
-struct thread_info *mcheckirq_ctx[NR_CPUS] __read_mostly;
-
-void exc_lvl_ctx_init(void)
-{
-	struct thread_info *tp;
-	int i, cpu_nr;
-
-	for_each_possible_cpu(i) {
-#ifdef CONFIG_PPC64
-		cpu_nr = i;
-#else
-#ifdef CONFIG_SMP
-		cpu_nr = get_hard_smp_processor_id(i);
-#else
-		cpu_nr = 0;
-#endif
-#endif
-
-		memset((void *)critirq_ctx[cpu_nr], 0, THREAD_SIZE);
-		tp = critirq_ctx[cpu_nr];
-		tp->cpu = cpu_nr;
-		tp->preempt_count = 0;
-
-#ifdef CONFIG_BOOKE
-		memset((void *)dbgirq_ctx[cpu_nr], 0, THREAD_SIZE);
-		tp = dbgirq_ctx[cpu_nr];
-		tp->cpu = cpu_nr;
-		tp->preempt_count = 0;
-
-		memset((void *)mcheckirq_ctx[cpu_nr], 0, THREAD_SIZE);
-		tp = mcheckirq_ctx[cpu_nr];
-		tp->cpu = cpu_nr;
-		tp->preempt_count = HARDIRQ_OFFSET;
-#endif
-	}
-}
+void *critirq_ctx[NR_CPUS] __read_mostly;
+void *dbgirq_ctx[NR_CPUS] __read_mostly;
+void *mcheckirq_ctx[NR_CPUS] __read_mostly;
 #endif
 
-struct thread_info *softirq_ctx[NR_CPUS] __read_mostly;
-struct thread_info *hardirq_ctx[NR_CPUS] __read_mostly;
-
-void irq_ctx_init(void)
-{
-	struct thread_info *tp;
-	int i;
-
-	for_each_possible_cpu(i) {
-		memset((void *)softirq_ctx[i], 0, THREAD_SIZE);
-		tp = softirq_ctx[i];
-		tp->cpu = i;
-		klp_init_thread_info(tp);
-
-		memset((void *)hardirq_ctx[i], 0, THREAD_SIZE);
-		tp = hardirq_ctx[i];
-		tp->cpu = i;
-		klp_init_thread_info(tp);
-	}
-}
+void *softirq_ctx[NR_CPUS] __read_mostly;
+void *hardirq_ctx[NR_CPUS] __read_mostly;
 
 void do_softirq_own_stack(void)
 {
-	struct thread_info *curtp, *irqtp;
-
-	curtp = current_thread_info();
-	irqtp = softirq_ctx[smp_processor_id()];
-	irqtp->task = curtp->task;
-	irqtp->flags = 0;
-	call_do_softirq(irqtp);
-	irqtp->task = NULL;
-
-	/* Set any flag that may have been set on the
-	 * alternate stack
-	 */
-	if (irqtp->flags)
-		set_bits(irqtp->flags, &curtp->flags);
+	call_do_softirq(softirq_ctx[smp_processor_id()]);
 }
 
 irq_hw_number_t virq_to_hw(unsigned int virq)
@@ -827,11 +741,6 @@ int irq_choose_cpu(const struct cpumask *mask)
 }
 #endif
 
-int arch_early_irq_init(void)
-{
-	return 0;
-}
-
 #ifdef CONFIG_PPC64
 static int __init setup_noirqdistrib(char *str)
 {
diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c
index e1865565f0ae..7dd55eb1259d 100644
--- a/arch/powerpc/kernel/kgdb.c
+++ b/arch/powerpc/kernel/kgdb.c
@@ -151,41 +151,13 @@ static int kgdb_handle_breakpoint(struct pt_regs *regs)
 	return 1;
 }
 
-static DEFINE_PER_CPU(struct thread_info, kgdb_thread_info);
 static int kgdb_singlestep(struct pt_regs *regs)
 {
-	struct thread_info *thread_info, *exception_thread_info;
-	struct thread_info *backup_current_thread_info =
-		this_cpu_ptr(&kgdb_thread_info);
-
 	if (user_mode(regs))
 		return 0;
 
-	/*
-	 * On Book E and perhaps other processors, singlestep is handled on
-	 * the critical exception stack.  This causes current_thread_info()
-	 * to fail, since it it locates the thread_info by masking off
-	 * the low bits of the current stack pointer.  We work around
-	 * this issue by copying the thread_info from the kernel stack
-	 * before calling kgdb_handle_exception, and copying it back
-	 * afterwards.  On most processors the copy is avoided since
-	 * exception_thread_info == thread_info.
-	 */
-	thread_info = (struct thread_info *)(regs->gpr[1] & ~(THREAD_SIZE-1));
-	exception_thread_info = current_thread_info();
-
-	if (thread_info != exception_thread_info) {
-		/* Save the original current_thread_info. */
-		memcpy(backup_current_thread_info, exception_thread_info, sizeof *thread_info);
-		memcpy(exception_thread_info, thread_info, sizeof *thread_info);
-	}
-
 	kgdb_handle_exception(0, SIGTRAP, 0, regs);
 
-	if (thread_info != exception_thread_info)
-		/* Restore current_thread_info lastly. */
-		memcpy(exception_thread_info, backup_current_thread_info, sizeof *thread_info);
-
 	return 1;
 }
 
diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c
index a0f6f45005bd..75692c327ba0 100644
--- a/arch/powerpc/kernel/machine_kexec_64.c
+++ b/arch/powerpc/kernel/machine_kexec_64.c
@@ -317,10 +317,8 @@ void default_machine_kexec(struct kimage *image)
 	 * We setup preempt_count to avoid using VMX in memcpy.
 	 * XXX: the task struct will likely be invalid once we do the copy!
 	 */
-	kexec_stack.thread_info.task   = current_thread_info()->task;
-	kexec_stack.thread_info.flags  = 0;
-	kexec_stack.thread_info.preempt_count = HARDIRQ_OFFSET;
-	kexec_stack.thread_info.cpu    = current_thread_info()->cpu;
+	current_thread_info()->flags = 0;
+	current_thread_info()->preempt_count = HARDIRQ_OFFSET;
 
 	/* We need a static PACA, too; copy this CPU's PACA over and switch to
 	 * it. Also poison per_cpu_offset and NULL lppaca to catch anyone using
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index bd933a75f0bc..b5fec1f9751a 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -31,6 +31,7 @@
 
 #include <asm/machdep.h>
 #include <asm/mce.h>
+#include <asm/nmi.h>
 
 static DEFINE_PER_CPU(int, mce_nest_count);
 static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event);
@@ -301,13 +302,13 @@ static void machine_check_process_queued_event(struct irq_work *work)
 	while (__this_cpu_read(mce_queue_count) > 0) {
 		index = __this_cpu_read(mce_queue_count) - 1;
 		evt = this_cpu_ptr(&mce_event_queue[index]);
-		machine_check_print_event_info(evt, false);
+		machine_check_print_event_info(evt, false, false);
 		__this_cpu_dec(mce_queue_count);
 	}
 }
 
 void machine_check_print_event_info(struct machine_check_event *evt,
-				    bool user_mode)
+				    bool user_mode, bool in_guest)
 {
 	const char *level, *sevstr, *subtype;
 	static const char *mc_ue_types[] = {
@@ -387,7 +388,9 @@ void machine_check_print_event_info(struct machine_check_event *evt,
 		evt->disposition == MCE_DISPOSITION_RECOVERED ?
 		"Recovered" : "Not recovered");
 
-	if (user_mode) {
+	if (in_guest) {
+		printk("%s  Guest NIP: %016llx\n", level, evt->srr0);
+	} else if (user_mode) {
 		printk("%s  NIP: [%016llx] PID: %d Comm: %s\n", level,
 			evt->srr0, current->pid, current->comm);
 	} else {
@@ -488,6 +491,8 @@ long machine_check_early(struct pt_regs *regs)
 {
 	long handled = 0;
 
+	hv_nmi_check_nonrecoverable(regs);
+
 	/*
 	 * See if platform is capable of handling machine check.
 	 */
diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
index 57d2ffb2d45c..0dda4f8e3d7a 100644
--- a/arch/powerpc/kernel/misc_32.S
+++ b/arch/powerpc/kernel/misc_32.S
@@ -46,11 +46,10 @@ _GLOBAL(call_do_softirq)
 	mflr	r0
 	stw	r0,4(r1)
 	lwz	r10,THREAD+KSP_LIMIT(r2)
-	addi	r11,r3,THREAD_INFO_GAP
+	stw	r3, THREAD+KSP_LIMIT(r2)
 	stwu	r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r3)
 	mr	r1,r3
 	stw	r10,8(r1)
-	stw	r11,THREAD+KSP_LIMIT(r2)
 	bl	__do_softirq
 	lwz	r10,8(r1)
 	lwz	r1,0(r1)
@@ -60,17 +59,16 @@ _GLOBAL(call_do_softirq)
 	blr
 
 /*
- * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp);
+ * void call_do_irq(struct pt_regs *regs, void *sp);
  */
 _GLOBAL(call_do_irq)
 	mflr	r0
 	stw	r0,4(r1)
 	lwz	r10,THREAD+KSP_LIMIT(r2)
-	addi	r11,r4,THREAD_INFO_GAP
+	stw	r4, THREAD+KSP_LIMIT(r2)
 	stwu	r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r4)
 	mr	r1,r4
 	stw	r10,8(r1)
-	stw	r11,THREAD+KSP_LIMIT(r2)
 	bl	__do_irq
 	lwz	r10,8(r1)
 	lwz	r1,0(r1)
@@ -183,10 +181,13 @@ _GLOBAL(low_choose_750fx_pll)
 	or	r4,r4,r5
 	mtspr	SPRN_HID1,r4
 
+#ifdef CONFIG_SMP
 	/* Store new HID1 image */
-	CURRENT_THREAD_INFO(r6, r1)
-	lwz	r6,TI_CPU(r6)
+	lwz	r6,TASK_CPU(r2)
 	slwi	r6,r6,2
+#else
+	li	r6, 0
+#endif
 	addis	r6,r6,nap_save_hid1@ha
 	stw	r4,nap_save_hid1@l(r6)
 
@@ -599,7 +600,7 @@ EXPORT_SYMBOL(__bswapdi2)
 #ifdef CONFIG_SMP
 _GLOBAL(start_secondary_resume)
 	/* Reset stack */
-	CURRENT_THREAD_INFO(r1, r1)
+	rlwinm	r1, r1, 0, 0, 31 - THREAD_SHIFT
 	addi	r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD
 	li	r3,0
 	stw	r3,0(r1)	/* Zero the stack frame pointer	*/
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 4538e8ddde80..ff4b7539cbdf 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -63,19 +63,13 @@ resource_size_t isa_mem_base;
 EXPORT_SYMBOL(isa_mem_base);
 
 
-static const struct dma_map_ops *pci_dma_ops = &dma_nommu_ops;
+static const struct dma_map_ops *pci_dma_ops;
 
 void set_pci_dma_ops(const struct dma_map_ops *dma_ops)
 {
 	pci_dma_ops = dma_ops;
 }
 
-const struct dma_map_ops *get_pci_dma_ops(void)
-{
-	return pci_dma_ops;
-}
-EXPORT_SYMBOL(get_pci_dma_ops);
-
 /*
  * This function should run under locking protection, specifically
  * hose_spinlock.
@@ -358,6 +352,17 @@ struct pci_controller* pci_find_hose_for_OF_device(struct device_node* node)
 	return NULL;
 }
 
+struct pci_controller *pci_find_controller_for_domain(int domain_nr)
+{
+	struct pci_controller *hose;
+
+	list_for_each_entry(hose, &hose_list, list_node)
+		if (hose->global_number == domain_nr)
+			return hose;
+
+	return NULL;
+}
+
 /*
  * Reads the interrupt pin to determine if interrupt is use by card.
  * If the interrupt is used, then gets the interrupt line from the
@@ -973,7 +978,7 @@ static void pcibios_setup_device(struct pci_dev *dev)
 
 	/* Hook up default DMA ops */
 	set_dma_ops(&dev->dev, pci_dma_ops);
-	set_dma_offset(&dev->dev, PCI_DRAM_OFFSET);
+	dev->dev.archdata.dma_offset = PCI_DRAM_OFFSET;
 
 	/* Additional platform DMA/iommu setup */
 	phb = pci_bus_to_host(dev->bus);
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index ce393df243aa..dd9e0d5386ee 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -176,7 +176,7 @@ static void __giveup_fpu(struct task_struct *tsk)
 
 	save_fpu(tsk);
 	msr = tsk->thread.regs->msr;
-	msr &= ~MSR_FP;
+	msr &= ~(MSR_FP|MSR_FE0|MSR_FE1);
 #ifdef CONFIG_VSX
 	if (cpu_has_feature(CPU_FTR_VSX))
 		msr &= ~MSR_VSX;
@@ -1231,8 +1231,8 @@ struct task_struct *__switch_to(struct task_struct *prev,
 		batch->active = 1;
 	}
 
-	if (current_thread_info()->task->thread.regs) {
-		restore_math(current_thread_info()->task->thread.regs);
+	if (current->thread.regs) {
+		restore_math(current->thread.regs);
 
 		/*
 		 * The copy-paste buffer can only store into foreign real
@@ -1242,7 +1242,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
 		 * mappings, we must issue a cp_abort to clear any state and
 		 * prevent snooping, corruption or a covert channel.
 		 */
-		if (current_thread_info()->task->thread.used_vas)
+		if (current->thread.used_vas)
 			asm volatile(PPC_CP_ABORT);
 	}
 #endif /* CONFIG_PPC_BOOK3S_64 */
@@ -1634,7 +1634,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
 	unsigned long sp = (unsigned long)task_stack_page(p) + THREAD_SIZE;
 	struct thread_info *ti = task_thread_info(p);
 
-	klp_init_thread_info(ti);
+	klp_init_thread_info(p);
 
 	/* Copy registers */
 	sp -= sizeof(struct pt_regs);
@@ -1691,8 +1691,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
 	sp -= STACK_FRAME_OVERHEAD;
 	p->thread.ksp = sp;
 #ifdef CONFIG_PPC32
-	p->thread.ksp_limit = (unsigned long)task_stack_page(p) +
-				_ALIGN_UP(sizeof(struct thread_info), 16);
+	p->thread.ksp_limit = (unsigned long)end_of_stack(p);
 #endif
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 	p->thread.ptrace_bps[0] = NULL;
@@ -1995,21 +1994,14 @@ static inline int valid_irq_stack(unsigned long sp, struct task_struct *p,
 	unsigned long stack_page;
 	unsigned long cpu = task_cpu(p);
 
-	/*
-	 * Avoid crashing if the stack has overflowed and corrupted
-	 * task_cpu(p), which is in the thread_info struct.
-	 */
-	if (cpu < NR_CPUS && cpu_possible(cpu)) {
-		stack_page = (unsigned long) hardirq_ctx[cpu];
-		if (sp >= stack_page + sizeof(struct thread_struct)
-		    && sp <= stack_page + THREAD_SIZE - nbytes)
-			return 1;
-
-		stack_page = (unsigned long) softirq_ctx[cpu];
-		if (sp >= stack_page + sizeof(struct thread_struct)
-		    && sp <= stack_page + THREAD_SIZE - nbytes)
-			return 1;
-	}
+	stack_page = (unsigned long)hardirq_ctx[cpu];
+	if (sp >= stack_page && sp <= stack_page + THREAD_SIZE - nbytes)
+		return 1;
+
+	stack_page = (unsigned long)softirq_ctx[cpu];
+	if (sp >= stack_page && sp <= stack_page + THREAD_SIZE - nbytes)
+		return 1;
+
 	return 0;
 }
 
@@ -2018,8 +2010,10 @@ int validate_sp(unsigned long sp, struct task_struct *p,
 {
 	unsigned long stack_page = (unsigned long)task_stack_page(p);
 
-	if (sp >= stack_page + sizeof(struct thread_struct)
-	    && sp <= stack_page + THREAD_SIZE - nbytes)
+	if (sp < THREAD_SIZE)
+		return 0;
+
+	if (sp >= stack_page && sp <= stack_page + THREAD_SIZE - nbytes)
 		return 1;
 
 	return valid_irq_stack(sp, p, nbytes);
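A C restatement of the simplified bounds check: reject anything below THREAD_SIZE up front, so a zero or near-zero sp can never "validate" against a NULL stack_page, then require the whole nbytes frame to fit inside the stack. A minimal sketch, assuming a fixed THREAD_SIZE:

	#include <stdint.h>

	#define THREAD_SIZE 16384UL	/* assumed */

	static int sp_valid(uintptr_t sp, uintptr_t stack_page,
			    unsigned long nbytes)
	{
		/* Obviously bad pointers fail before the range test. */
		if (sp < THREAD_SIZE)
			return 0;

		/* The whole nbytes frame must fit inside the stack. */
		return sp >= stack_page &&
		       sp <= stack_page + THREAD_SIZE - nbytes;
	}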
@@ -2027,7 +2021,7 @@ int validate_sp(unsigned long sp, struct task_struct *p,
 
 EXPORT_SYMBOL(validate_sp);
 
-unsigned long get_wchan(struct task_struct *p)
+static unsigned long __get_wchan(struct task_struct *p)
 {
 	unsigned long ip, sp;
 	int count = 0;
@@ -2053,6 +2047,20 @@ unsigned long get_wchan(struct task_struct *p)
 	return 0;
 }
 
+unsigned long get_wchan(struct task_struct *p)
+{
+	unsigned long ret;
+
+	if (!try_get_task_stack(p))
+		return 0;
+
+	ret = __get_wchan(p);
+
+	put_task_stack(p);
+
+	return ret;
+}
+
 static int kstack_depth_to_print = CONFIG_PRINT_STACK_DEPTH;
 
 void show_stack(struct task_struct *tsk, unsigned long *stack)
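The wrapper above is the standard pin/walk/unpin pattern for stacks that can now be freed independently of the task_struct. A toy model with stand-in names -- try_pin_stack/unpin_stack are hypothetical; the kernel calls are try_get_task_stack() and put_task_stack():

	/* Stand-in types and names, illustration only. */
	struct task_sketch {
		int stack_refs;			/* 0 once the stack is freed */
		unsigned long cached_wchan;	/* stands in for the stack walk */
	};

	static int try_pin_stack(struct task_sketch *t)
	{
		if (t->stack_refs == 0)
			return 0;	/* stack already gone: caller bails out */
		t->stack_refs++;
		return 1;
	}

	static void unpin_stack(struct task_sketch *t)
	{
		t->stack_refs--;
	}

	static unsigned long wchan_sketch(struct task_sketch *t)
	{
		unsigned long ret;

		if (!try_pin_stack(t))
			return 0;
		ret = t->cached_wchan;	/* safe: stack cannot vanish mid-walk */
		unpin_stack(t);
		return ret;
	}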
@@ -2067,9 +2075,13 @@ void show_stack(struct task_struct *tsk, unsigned long *stack)
 	int curr_frame = 0;
 #endif
 
-	sp = (unsigned long) stack;
 	if (tsk == NULL)
 		tsk = current;
+
+	if (!try_get_task_stack(tsk))
+		return;
+
+	sp = (unsigned long) stack;
 	if (sp == 0) {
 		if (tsk == current)
 			sp = current_stack_pointer();
@@ -2081,7 +2093,7 @@ void show_stack(struct task_struct *tsk, unsigned long *stack)
 	printk("Call Trace:\n");
 	do {
 		if (!validate_sp(sp, tsk, STACK_FRAME_OVERHEAD))
-			return;
+			break;
 
 		stack = (unsigned long *) sp;
 		newsp = stack[0];
@@ -2121,6 +2133,8 @@ void show_stack(struct task_struct *tsk, unsigned long *stack)
 
 		sp = newsp;
 	} while (count++ < kstack_depth_to_print);
+
+	put_task_stack(tsk);
 }
 
 #ifdef CONFIG_PPC64
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index cdd5d1d3ae41..d9ac7d94656e 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -33,6 +33,7 @@
 #include <linux/hw_breakpoint.h>
 #include <linux/perf_event.h>
 #include <linux/context_tracking.h>
+#include <linux/nospec.h>
 
 #include <linux/uaccess.h>
 #include <linux/pkeys.h>
@@ -274,6 +275,8 @@ static int set_user_trap(struct task_struct *task, unsigned long trap)
274 */ 275 */
275int ptrace_get_reg(struct task_struct *task, int regno, unsigned long *data) 276int ptrace_get_reg(struct task_struct *task, int regno, unsigned long *data)
276{ 277{
278 unsigned int regs_max;
279
277 if ((task->thread.regs == NULL) || !data) 280 if ((task->thread.regs == NULL) || !data)
278 return -EIO; 281 return -EIO;
279 282
@@ -297,7 +300,9 @@ int ptrace_get_reg(struct task_struct *task, int regno, unsigned long *data)
297 } 300 }
298#endif 301#endif
299 302
300 if (regno < (sizeof(struct user_pt_regs) / sizeof(unsigned long))) { 303 regs_max = sizeof(struct user_pt_regs) / sizeof(unsigned long);
304 if (regno < regs_max) {
305 regno = array_index_nospec(regno, regs_max);
301 *data = ((unsigned long *)task->thread.regs)[regno]; 306 *data = ((unsigned long *)task->thread.regs)[regno];
302 return 0; 307 return 0;
303 } 308 }
@@ -321,6 +326,7 @@ int ptrace_put_reg(struct task_struct *task, int regno, unsigned long data)
321 return set_user_dscr(task, data); 326 return set_user_dscr(task, data);
322 327
323 if (regno <= PT_MAX_PUT_REG) { 328 if (regno <= PT_MAX_PUT_REG) {
329 regno = array_index_nospec(regno, PT_MAX_PUT_REG + 1);
324 ((unsigned long *)task->thread.regs)[regno] = data; 330 ((unsigned long *)task->thread.regs)[regno] = data;
325 return 0; 331 return 0;
326 } 332 }
@@ -561,6 +567,7 @@ static int vr_get(struct task_struct *target, const struct user_regset *regset,
561 /* 567 /*
562 * Copy out only the low-order word of vrsave. 568 * Copy out only the low-order word of vrsave.
563 */ 569 */
570 int start, end;
564 union { 571 union {
565 elf_vrreg_t reg; 572 elf_vrreg_t reg;
566 u32 word; 573 u32 word;
@@ -569,8 +576,10 @@ static int vr_get(struct task_struct *target, const struct user_regset *regset,
569 576
570 vrsave.word = target->thread.vrsave; 577 vrsave.word = target->thread.vrsave;
571 578
579 start = 33 * sizeof(vector128);
580 end = start + sizeof(vrsave);
572 ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &vrsave, 581 ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &vrsave,
573 33 * sizeof(vector128), -1); 582 start, end);
574 } 583 }
575 584
576 return ret; 585 return ret;
@@ -608,6 +617,7 @@ static int vr_set(struct task_struct *target, const struct user_regset *regset,
608 /* 617 /*
609 * We use only the first word of vrsave. 618 * We use only the first word of vrsave.
610 */ 619 */
620 int start, end;
611 union { 621 union {
612 elf_vrreg_t reg; 622 elf_vrreg_t reg;
613 u32 word; 623 u32 word;
@@ -616,8 +626,10 @@ static int vr_set(struct task_struct *target, const struct user_regset *regset,
616 626
617 vrsave.word = target->thread.vrsave; 627 vrsave.word = target->thread.vrsave;
618 628
629 start = 33 * sizeof(vector128);
630 end = start + sizeof(vrsave);
619 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &vrsave, 631 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &vrsave,
620 33 * sizeof(vector128), -1); 632 start, end);
621 if (!ret) 633 if (!ret)
622 target->thread.vrsave = vrsave.word; 634 target->thread.vrsave = vrsave.word;
623 } 635 }
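The array_index_nospec() calls added above follow the standard Spectre-v1 pattern: bounds-check architecturally, then clamp the index so a mispredicted branch cannot speculatively read past the register array. A sketch under that assumption (read_one_reg() and its arguments are illustrative, not kernel API):

    #include <linux/errno.h>
    #include <linux/nospec.h>

    static int read_one_reg(unsigned long *regs, unsigned int regno,
                            unsigned int regs_max, unsigned long *data)
    {
            if (regno >= regs_max)
                    return -EIO;

            /* Under speculation, an out-of-range regno is forced to 0 here. */
            regno = array_index_nospec(regno, regs_max);
            *data = regs[regno];

            return 0;
    }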
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 82be48c123cf..f17868e19e2c 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -634,7 +634,7 @@ void probe_machine(void)
634 } 634 }
635 /* What can we do if we didn't find ? */ 635 /* What can we do if we didn't find ? */
636 if (machine_id >= &__machine_desc_end) { 636 if (machine_id >= &__machine_desc_end) {
637 DBG("No suitable machine found !\n"); 637 pr_err("No suitable machine description found !\n");
638 for (;;); 638 for (;;);
639 } 639 }
640 640
@@ -791,7 +791,6 @@ void arch_setup_pdev_archdata(struct platform_device *pdev)
791{ 791{
792 pdev->archdata.dma_mask = DMA_BIT_MASK(32); 792 pdev->archdata.dma_mask = DMA_BIT_MASK(32);
793 pdev->dev.dma_mask = &pdev->archdata.dma_mask; 793 pdev->dev.dma_mask = &pdev->archdata.dma_mask;
794 set_dma_ops(&pdev->dev, &dma_nommu_ops);
795} 794}
796 795
797static __init void print_system_info(void) 796static __init void print_system_info(void)
@@ -938,7 +937,7 @@ void __init setup_arch(char **cmdline_p)
938 /* Reserve large chunks of memory for use by CMA for KVM. */ 937 /* Reserve large chunks of memory for use by CMA for KVM. */
939 kvm_cma_reserve(); 938 kvm_cma_reserve();
940 939
941 klp_init_thread_info(&init_thread_info); 940 klp_init_thread_info(&init_task);
942 941
943 init_mm.start_code = (unsigned long)_stext; 942 init_mm.start_code = (unsigned long)_stext;
944 init_mm.end_code = (unsigned long) _etext; 943 init_mm.end_code = (unsigned long) _etext;
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index c31082233a25..4a65e08a6042 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -162,6 +162,17 @@ static int __init ppc_init(void)
162} 162}
163arch_initcall(ppc_init); 163arch_initcall(ppc_init);
164 164
165static void *__init alloc_stack(void)
166{
167 void *ptr = memblock_alloc(THREAD_SIZE, THREAD_SIZE);
168
169 if (!ptr)
170 panic("cannot allocate %d bytes for stack at %pS\n",
171 THREAD_SIZE, (void *)_RET_IP_);
172
173 return ptr;
174}
175
165void __init irqstack_early_init(void) 176void __init irqstack_early_init(void)
166{ 177{
167 unsigned int i; 178 unsigned int i;
@@ -169,10 +180,8 @@ void __init irqstack_early_init(void)
169 /* interrupt stacks must be in lowmem, we get that for free on ppc32 180 /* interrupt stacks must be in lowmem, we get that for free on ppc32
170 * as the memblock is limited to lowmem by default */ 181 * as the memblock is limited to lowmem by default */
171 for_each_possible_cpu(i) { 182 for_each_possible_cpu(i) {
172 softirq_ctx[i] = (struct thread_info *) 183 softirq_ctx[i] = alloc_stack();
173 __va(memblock_phys_alloc(THREAD_SIZE, THREAD_SIZE)); 184 hardirq_ctx[i] = alloc_stack();
174 hardirq_ctx[i] = (struct thread_info *)
175 __va(memblock_phys_alloc(THREAD_SIZE, THREAD_SIZE));
176 } 185 }
177} 186}
178 187
@@ -190,13 +199,10 @@ void __init exc_lvl_early_init(void)
190 hw_cpu = 0; 199 hw_cpu = 0;
191#endif 200#endif
192 201
193 critirq_ctx[hw_cpu] = (struct thread_info *) 202 critirq_ctx[hw_cpu] = alloc_stack();
194 __va(memblock_phys_alloc(THREAD_SIZE, THREAD_SIZE));
195#ifdef CONFIG_BOOKE 203#ifdef CONFIG_BOOKE
196 dbgirq_ctx[hw_cpu] = (struct thread_info *) 204 dbgirq_ctx[hw_cpu] = alloc_stack();
197 __va(memblock_phys_alloc(THREAD_SIZE, THREAD_SIZE)); 205 mcheckirq_ctx[hw_cpu] = alloc_stack();
198 mcheckirq_ctx[hw_cpu] = (struct thread_info *)
199 __va(memblock_phys_alloc(THREAD_SIZE, THREAD_SIZE));
200#endif 206#endif
201 } 207 }
202} 208}
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 5de413ae3cd6..ff0aac42bb33 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -634,19 +634,17 @@ __init u64 ppc64_bolted_size(void)
634 634
635static void *__init alloc_stack(unsigned long limit, int cpu) 635static void *__init alloc_stack(unsigned long limit, int cpu)
636{ 636{
637 unsigned long pa; 637 void *ptr;
638 638
639 BUILD_BUG_ON(STACK_INT_FRAME_SIZE % 16); 639 BUILD_BUG_ON(STACK_INT_FRAME_SIZE % 16);
640 640
641 pa = memblock_alloc_base_nid(THREAD_SIZE, THREAD_SIZE, limit, 641 ptr = memblock_alloc_try_nid(THREAD_SIZE, THREAD_SIZE,
642 early_cpu_to_node(cpu), MEMBLOCK_NONE); 642 MEMBLOCK_LOW_LIMIT, limit,
643 if (!pa) { 643 early_cpu_to_node(cpu));
644 pa = memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit); 644 if (!ptr)
645 if (!pa) 645 panic("cannot allocate stacks");
646 panic("cannot allocate stacks");
647 }
648 646
649 return __va(pa); 647 return ptr;
650} 648}
651 649
652void __init irqstack_early_init(void) 650void __init irqstack_early_init(void)
@@ -692,24 +690,6 @@ void __init exc_lvl_early_init(void)
692#endif 690#endif
693 691
694/* 692/*
695 * Emergency stacks are used for a range of things, from asynchronous
696 * NMIs (system reset, machine check) to synchronous, process context.
697 * We set preempt_count to zero, even though that isn't necessarily correct. To
698 * get the right value we'd need to copy it from the previous thread_info, but
699 * doing that might fault causing more problems.
700 * TODO: what to do with accounting?
701 */
702static void emerg_stack_init_thread_info(struct thread_info *ti, int cpu)
703{
704 ti->task = NULL;
705 ti->cpu = cpu;
706 ti->preempt_count = 0;
707 ti->local_flags = 0;
708 ti->flags = 0;
709 klp_init_thread_info(ti);
710}
711
712/*
713 * Stack space used when we detect a bad kernel stack pointer, and 693 * Stack space used when we detect a bad kernel stack pointer, and
714 * early in SMP boots before relocation is enabled. Exclusive emergency 694 * early in SMP boots before relocation is enabled. Exclusive emergency
715 * stack for machine checks. 695 * stack for machine checks.
@@ -736,25 +716,14 @@ void __init emergency_stack_init(void)
736 limit = min(ppc64_bolted_size(), ppc64_rma_size); 716 limit = min(ppc64_bolted_size(), ppc64_rma_size);
737 717
738 for_each_possible_cpu(i) { 718 for_each_possible_cpu(i) {
739 struct thread_info *ti; 719 paca_ptrs[i]->emergency_sp = alloc_stack(limit, i) + THREAD_SIZE;
740
741 ti = alloc_stack(limit, i);
742 memset(ti, 0, THREAD_SIZE);
743 emerg_stack_init_thread_info(ti, i);
744 paca_ptrs[i]->emergency_sp = (void *)ti + THREAD_SIZE;
745 720
746#ifdef CONFIG_PPC_BOOK3S_64 721#ifdef CONFIG_PPC_BOOK3S_64
747 /* emergency stack for NMI exception handling. */ 722 /* emergency stack for NMI exception handling. */
748 ti = alloc_stack(limit, i); 723 paca_ptrs[i]->nmi_emergency_sp = alloc_stack(limit, i) + THREAD_SIZE;
749 memset(ti, 0, THREAD_SIZE);
750 emerg_stack_init_thread_info(ti, i);
751 paca_ptrs[i]->nmi_emergency_sp = (void *)ti + THREAD_SIZE;
752 724
753 /* emergency stack for machine check exception handling. */ 725 /* emergency stack for machine check exception handling. */
754 ti = alloc_stack(limit, i); 726 paca_ptrs[i]->mc_emergency_sp = alloc_stack(limit, i) + THREAD_SIZE;
755 memset(ti, 0, THREAD_SIZE);
756 emerg_stack_init_thread_info(ti, i);
757 paca_ptrs[i]->mc_emergency_sp = (void *)ti + THREAD_SIZE;
758#endif 727#endif
759 } 728 }
760} 729}
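The shrunken 64-bit alloc_stack() leans on memblock_alloc_try_nid(), which, assuming the memblock semantics current in this series, already falls back from the preferred node to any node, returns a virtual address, and zeroes the block; that is what makes the removed manual fallback, __va() conversion and memset() redundant. A sketch:

    #include <linux/memblock.h>

    static void *__init alloc_limited_stack(unsigned long limit, int nid)
    {
            /* Node-preferred, bounded allocation with built-in fallback. */
            void *ptr = memblock_alloc_try_nid(THREAD_SIZE, THREAD_SIZE,
                                               MEMBLOCK_LOW_LIMIT, limit, nid);

            if (!ptr)
                    panic("cannot allocate stacks");

            return ptr;     /* already mapped and zeroed */
    }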
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 3f15edf25a0d..e784342bdaa1 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -20,6 +20,7 @@
20#include <linux/kernel.h> 20#include <linux/kernel.h>
21#include <linux/export.h> 21#include <linux/export.h>
22#include <linux/sched/mm.h> 22#include <linux/sched/mm.h>
23#include <linux/sched/task_stack.h>
23#include <linux/sched/topology.h> 24#include <linux/sched/topology.h>
24#include <linux/smp.h> 25#include <linux/smp.h>
25#include <linux/interrupt.h> 26#include <linux/interrupt.h>
@@ -75,7 +76,7 @@
75static DEFINE_PER_CPU(int, cpu_state) = { 0 }; 76static DEFINE_PER_CPU(int, cpu_state) = { 0 };
76#endif 77#endif
77 78
78struct thread_info *secondary_ti; 79struct task_struct *secondary_current;
79bool has_big_cores; 80bool has_big_cores;
80 81
81DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map); 82DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
@@ -358,13 +359,12 @@ void arch_send_call_function_ipi_mask(const struct cpumask *mask)
358 * NMI IPIs may not be recoverable, so should not be used as ongoing part of 359 * NMI IPIs may not be recoverable, so should not be used as ongoing part of
359 * a running system. They can be used for crash, debug, halt/reboot, etc. 360 * a running system. They can be used for crash, debug, halt/reboot, etc.
360 * 361 *
361 * NMI IPIs are globally single threaded. No more than one in progress at
362 * any time.
363 *
364 * The IPI call waits with interrupts disabled until all targets enter the 362 * The IPI call waits with interrupts disabled until all targets enter the
365 * NMI handler, then the call returns. 363 * NMI handler, then returns. Subsequent IPIs can be issued before targets
364 * have returned from their handlers, so there is no guarantee about
365 * concurrency or re-entrancy.
366 * 366 *
367 * No new NMI can be initiated until targets exit the handler. 367 * A new NMI can be issued before all targets exit the handler.
368 * 368 *
369 * The IPI call may time out without all targets entering the NMI handler. 369 * The IPI call may time out without all targets entering the NMI handler.
370 * In that case, there is some logic to recover (and ignore subsequent 370 * In that case, there is some logic to recover (and ignore subsequent
@@ -375,7 +375,7 @@ void arch_send_call_function_ipi_mask(const struct cpumask *mask)
375 375
376static atomic_t __nmi_ipi_lock = ATOMIC_INIT(0); 376static atomic_t __nmi_ipi_lock = ATOMIC_INIT(0);
377static struct cpumask nmi_ipi_pending_mask; 377static struct cpumask nmi_ipi_pending_mask;
378static int nmi_ipi_busy_count = 0; 378static bool nmi_ipi_busy = false;
379static void (*nmi_ipi_function)(struct pt_regs *) = NULL; 379static void (*nmi_ipi_function)(struct pt_regs *) = NULL;
380 380
381static void nmi_ipi_lock_start(unsigned long *flags) 381static void nmi_ipi_lock_start(unsigned long *flags)
@@ -414,7 +414,7 @@ static void nmi_ipi_unlock_end(unsigned long *flags)
414 */ 414 */
415int smp_handle_nmi_ipi(struct pt_regs *regs) 415int smp_handle_nmi_ipi(struct pt_regs *regs)
416{ 416{
417 void (*fn)(struct pt_regs *); 417 void (*fn)(struct pt_regs *) = NULL;
418 unsigned long flags; 418 unsigned long flags;
419 int me = raw_smp_processor_id(); 419 int me = raw_smp_processor_id();
420 int ret = 0; 420 int ret = 0;
@@ -425,29 +425,17 @@ int smp_handle_nmi_ipi(struct pt_regs *regs)
425 * because the caller may have timed out. 425 * because the caller may have timed out.
426 */ 426 */
427 nmi_ipi_lock_start(&flags); 427 nmi_ipi_lock_start(&flags);
428 if (!nmi_ipi_busy_count) 428 if (cpumask_test_cpu(me, &nmi_ipi_pending_mask)) {
429 goto out; 429 cpumask_clear_cpu(me, &nmi_ipi_pending_mask);
430 if (!cpumask_test_cpu(me, &nmi_ipi_pending_mask)) 430 fn = READ_ONCE(nmi_ipi_function);
431 goto out; 431 WARN_ON_ONCE(!fn);
432 432 ret = 1;
433 fn = nmi_ipi_function; 433 }
434 if (!fn)
435 goto out;
436
437 cpumask_clear_cpu(me, &nmi_ipi_pending_mask);
438 nmi_ipi_busy_count++;
439 nmi_ipi_unlock();
440
441 ret = 1;
442
443 fn(regs);
444
445 nmi_ipi_lock();
446 if (nmi_ipi_busy_count > 1) /* Can race with caller time-out */
447 nmi_ipi_busy_count--;
448out:
449 nmi_ipi_unlock_end(&flags); 434 nmi_ipi_unlock_end(&flags);
450 435
436 if (fn)
437 fn(regs);
438
451 return ret; 439 return ret;
452} 440}
453 441
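For context, a hedged sketch of a caller of this interface, assuming the powerpc smp_send_nmi_ipi() wrapper; the callback and the 1000us timeout are illustrative. The callback runs in NMI context on each target, so it must stay minimal and lock-free:

    #include <linux/printk.h>
    #include <linux/sched/debug.h>
    #include <asm/smp.h>

    /* Runs on the target CPU, in NMI context. */
    static void remote_backtrace_nmi(struct pt_regs *regs)
    {
            show_regs(regs);
    }

    static void request_remote_backtrace(int cpu)
    {
            /* Wait up to 1000us for the target to enter the handler. */
            if (!smp_send_nmi_ipi(cpu, remote_backtrace_nmi, 1000))
                    pr_warn("CPU %d did not enter the NMI handler\n", cpu);
    }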
@@ -473,9 +461,10 @@ static void do_smp_send_nmi_ipi(int cpu, bool safe)
473 * - cpu is the target CPU (must not be this CPU), or NMI_IPI_ALL_OTHERS. 461 * - cpu is the target CPU (must not be this CPU), or NMI_IPI_ALL_OTHERS.
474 * - fn is the target callback function. 462 * - fn is the target callback function.
475 * - delay_us > 0 is the delay before giving up waiting for targets to 463 * - delay_us > 0 is the delay before giving up waiting for targets to
476 * complete executing the handler, == 0 specifies indefinite delay. 464 * begin executing the handler, == 0 specifies indefinite delay.
477 */ 465 */
478int __smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us, bool safe) 466static int __smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *),
467 u64 delay_us, bool safe)
479{ 468{
480 unsigned long flags; 469 unsigned long flags;
481 int me = raw_smp_processor_id(); 470 int me = raw_smp_processor_id();
@@ -487,31 +476,33 @@ int __smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us, bool
487 if (unlikely(!smp_ops)) 476 if (unlikely(!smp_ops))
488 return 0; 477 return 0;
489 478
490 /* Take the nmi_ipi_busy count/lock with interrupts hard disabled */
491 nmi_ipi_lock_start(&flags); 479 nmi_ipi_lock_start(&flags);
492 while (nmi_ipi_busy_count) { 480 while (nmi_ipi_busy) {
493 nmi_ipi_unlock_end(&flags); 481 nmi_ipi_unlock_end(&flags);
494 spin_until_cond(nmi_ipi_busy_count == 0); 482 spin_until_cond(!nmi_ipi_busy);
495 nmi_ipi_lock_start(&flags); 483 nmi_ipi_lock_start(&flags);
496 } 484 }
497 485 nmi_ipi_busy = true;
498 nmi_ipi_function = fn; 486 nmi_ipi_function = fn;
499 487
488 WARN_ON_ONCE(!cpumask_empty(&nmi_ipi_pending_mask));
489
500 if (cpu < 0) { 490 if (cpu < 0) {
501 /* ALL_OTHERS */ 491 /* ALL_OTHERS */
502 cpumask_copy(&nmi_ipi_pending_mask, cpu_online_mask); 492 cpumask_copy(&nmi_ipi_pending_mask, cpu_online_mask);
503 cpumask_clear_cpu(me, &nmi_ipi_pending_mask); 493 cpumask_clear_cpu(me, &nmi_ipi_pending_mask);
504 } else { 494 } else {
505 /* cpumask starts clear */
506 cpumask_set_cpu(cpu, &nmi_ipi_pending_mask); 495 cpumask_set_cpu(cpu, &nmi_ipi_pending_mask);
507 } 496 }
508 nmi_ipi_busy_count++; 497
509 nmi_ipi_unlock(); 498 nmi_ipi_unlock();
510 499
500 /* Interrupts remain hard disabled */
501
511 do_smp_send_nmi_ipi(cpu, safe); 502 do_smp_send_nmi_ipi(cpu, safe);
512 503
513 nmi_ipi_lock(); 504 nmi_ipi_lock();
514 /* nmi_ipi_busy_count is held here, so unlock/lock is okay */ 505 /* nmi_ipi_busy is set here, so unlock/lock is okay */
515 while (!cpumask_empty(&nmi_ipi_pending_mask)) { 506 while (!cpumask_empty(&nmi_ipi_pending_mask)) {
516 nmi_ipi_unlock(); 507 nmi_ipi_unlock();
517 udelay(1); 508 udelay(1);
@@ -523,29 +514,15 @@ int __smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us, bool
523 } 514 }
524 } 515 }
525 516
526 while (nmi_ipi_busy_count > 1) {
527 nmi_ipi_unlock();
528 udelay(1);
529 nmi_ipi_lock();
530 if (delay_us) {
531 delay_us--;
532 if (!delay_us)
533 break;
534 }
535 }
536
537 if (!cpumask_empty(&nmi_ipi_pending_mask)) { 517 if (!cpumask_empty(&nmi_ipi_pending_mask)) {
538 /* Timeout waiting for CPUs to call smp_handle_nmi_ipi */ 518 /* Timeout waiting for CPUs to call smp_handle_nmi_ipi */
539 ret = 0; 519 ret = 0;
540 cpumask_clear(&nmi_ipi_pending_mask); 520 cpumask_clear(&nmi_ipi_pending_mask);
541 } 521 }
542 if (nmi_ipi_busy_count > 1) {
543 /* Timeout waiting for CPUs to execute fn */
544 ret = 0;
545 nmi_ipi_busy_count = 1;
546 }
547 522
548 nmi_ipi_busy_count--; 523 nmi_ipi_function = NULL;
524 nmi_ipi_busy = false;
525
549 nmi_ipi_unlock_end(&flags); 526 nmi_ipi_unlock_end(&flags);
550 527
551 return ret; 528 return ret;
@@ -613,17 +590,8 @@ void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *))
613static void nmi_stop_this_cpu(struct pt_regs *regs) 590static void nmi_stop_this_cpu(struct pt_regs *regs)
614{ 591{
615 /* 592 /*
616 * This is a special case because it never returns, so the NMI IPI
617 * handling would never mark it as done, which makes any later
618 * smp_send_nmi_ipi() call spin forever. Mark it done now.
619 *
620 * IRQs are already hard disabled by the smp_handle_nmi_ipi. 593 * IRQs are already hard disabled by the smp_handle_nmi_ipi.
621 */ 594 */
622 nmi_ipi_lock();
623 if (nmi_ipi_busy_count > 1)
624 nmi_ipi_busy_count--;
625 nmi_ipi_unlock();
626
627 spin_begin(); 595 spin_begin();
628 while (1) 596 while (1)
629 spin_cpu_relax(); 597 spin_cpu_relax();
@@ -663,7 +631,7 @@ void smp_send_stop(void)
663} 631}
664#endif /* CONFIG_NMI_IPI */ 632#endif /* CONFIG_NMI_IPI */
665 633
666struct thread_info *current_set[NR_CPUS]; 634struct task_struct *current_set[NR_CPUS];
667 635
668static void smp_store_cpu_info(int id) 636static void smp_store_cpu_info(int id)
669{ 637{
@@ -928,7 +896,7 @@ void smp_prepare_boot_cpu(void)
928 paca_ptrs[boot_cpuid]->__current = current; 896 paca_ptrs[boot_cpuid]->__current = current;
929#endif 897#endif
930 set_numa_node(numa_cpu_lookup_table[boot_cpuid]); 898 set_numa_node(numa_cpu_lookup_table[boot_cpuid]);
931 current_set[boot_cpuid] = task_thread_info(current); 899 current_set[boot_cpuid] = current;
932} 900}
933 901
934#ifdef CONFIG_HOTPLUG_CPU 902#ifdef CONFIG_HOTPLUG_CPU
@@ -1013,14 +981,13 @@ static bool secondaries_inhibited(void)
1013 981
1014static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle) 982static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle)
1015{ 983{
1016 struct thread_info *ti = task_thread_info(idle);
1017
1018#ifdef CONFIG_PPC64 984#ifdef CONFIG_PPC64
1019 paca_ptrs[cpu]->__current = idle; 985 paca_ptrs[cpu]->__current = idle;
1020 paca_ptrs[cpu]->kstack = (unsigned long)ti + THREAD_SIZE - STACK_FRAME_OVERHEAD; 986 paca_ptrs[cpu]->kstack = (unsigned long)task_stack_page(idle) +
987 THREAD_SIZE - STACK_FRAME_OVERHEAD;
1021#endif 988#endif
1022 ti->cpu = cpu; 989 idle->cpu = cpu;
1023 secondary_ti = current_set[cpu] = ti; 990 secondary_current = current_set[cpu] = idle;
1024} 991}
1025 992
1026int __cpu_up(unsigned int cpu, struct task_struct *tidle) 993int __cpu_up(unsigned int cpu, struct task_struct *tidle)
diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c
index e2c50b55138f..1e2276963f6d 100644
--- a/arch/powerpc/kernel/stacktrace.c
+++ b/arch/powerpc/kernel/stacktrace.c
@@ -67,12 +67,17 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
67{ 67{
68 unsigned long sp; 68 unsigned long sp;
69 69
70 if (!try_get_task_stack(tsk))
71 return;
72
70 if (tsk == current) 73 if (tsk == current)
71 sp = current_stack_pointer(); 74 sp = current_stack_pointer();
72 else 75 else
73 sp = tsk->thread.ksp; 76 sp = tsk->thread.ksp;
74 77
75 save_context_stack(trace, sp, tsk, 0); 78 save_context_stack(trace, sp, tsk, 0);
79
80 put_task_stack(tsk);
76} 81}
77EXPORT_SYMBOL_GPL(save_stack_trace_tsk); 82EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
78 83
@@ -84,25 +89,21 @@ save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
84EXPORT_SYMBOL_GPL(save_stack_trace_regs); 89EXPORT_SYMBOL_GPL(save_stack_trace_regs);
85 90
86#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE 91#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE
87int 92/*
88save_stack_trace_tsk_reliable(struct task_struct *tsk, 93 * This function returns an error if it detects any unreliable features of the
89 struct stack_trace *trace) 94 * stack. Otherwise it guarantees that the stack trace is reliable.
95 *
96 * If the task is not 'current', the caller *must* ensure the task is inactive.
97 */
98static int __save_stack_trace_tsk_reliable(struct task_struct *tsk,
99 struct stack_trace *trace)
90{ 100{
91 unsigned long sp; 101 unsigned long sp;
102 unsigned long newsp;
92 unsigned long stack_page = (unsigned long)task_stack_page(tsk); 103 unsigned long stack_page = (unsigned long)task_stack_page(tsk);
93 unsigned long stack_end; 104 unsigned long stack_end;
94 int graph_idx = 0; 105 int graph_idx = 0;
95 106 bool firstframe;
96 /*
97 * The last frame (unwinding first) may not yet have saved
98 * its LR onto the stack.
99 */
100 int firstframe = 1;
101
102 if (tsk == current)
103 sp = current_stack_pointer();
104 else
105 sp = tsk->thread.ksp;
106 107
107 stack_end = stack_page + THREAD_SIZE; 108 stack_end = stack_page + THREAD_SIZE;
108 if (!is_idle_task(tsk)) { 109 if (!is_idle_task(tsk)) {
@@ -129,40 +130,53 @@ save_stack_trace_tsk_reliable(struct task_struct *tsk,
129 stack_end -= STACK_FRAME_OVERHEAD; 130 stack_end -= STACK_FRAME_OVERHEAD;
130 } 131 }
131 132
133 if (tsk == current)
134 sp = current_stack_pointer();
135 else
136 sp = tsk->thread.ksp;
137
132 if (sp < stack_page + sizeof(struct thread_struct) || 138 if (sp < stack_page + sizeof(struct thread_struct) ||
133 sp > stack_end - STACK_FRAME_MIN_SIZE) { 139 sp > stack_end - STACK_FRAME_MIN_SIZE) {
134 return 1; 140 return -EINVAL;
135 } 141 }
136 142
137 for (;;) { 143 for (firstframe = true; sp != stack_end;
144 firstframe = false, sp = newsp) {
138 unsigned long *stack = (unsigned long *) sp; 145 unsigned long *stack = (unsigned long *) sp;
139 unsigned long newsp, ip; 146 unsigned long ip;
140 147
141 /* sanity check: ABI requires SP to be aligned 16 bytes. */ 148 /* sanity check: ABI requires SP to be aligned 16 bytes. */
142 if (sp & 0xF) 149 if (sp & 0xF)
143 return 1; 150 return -EINVAL;
144
145 /* Mark stacktraces with exception frames as unreliable. */
146 if (sp <= stack_end - STACK_INT_FRAME_SIZE &&
147 stack[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) {
148 return 1;
149 }
150 151
151 newsp = stack[0]; 152 newsp = stack[0];
152 /* Stack grows downwards; unwinder may only go up. */ 153 /* Stack grows downwards; unwinder may only go up. */
153 if (newsp <= sp) 154 if (newsp <= sp)
154 return 1; 155 return -EINVAL;
155 156
156 if (newsp != stack_end && 157 if (newsp != stack_end &&
157 newsp > stack_end - STACK_FRAME_MIN_SIZE) { 158 newsp > stack_end - STACK_FRAME_MIN_SIZE) {
158 return 1; /* invalid backlink, too far up. */ 159 return -EINVAL; /* invalid backlink, too far up. */
160 }
161
162 /*
163 * We can only trust the bottom frame's backlink; the
164 * rest of the frame may be uninitialized, so continue to
165 * the next frame.
166 */
167 if (firstframe)
168 continue;
169
170 /* Mark stacktraces with exception frames as unreliable. */
171 if (sp <= stack_end - STACK_INT_FRAME_SIZE &&
172 stack[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) {
173 return -EINVAL;
159 } 174 }
160 175
161 /* Examine the saved LR: it must point into kernel code. */ 176 /* Examine the saved LR: it must point into kernel code. */
162 ip = stack[STACK_FRAME_LR_SAVE]; 177 ip = stack[STACK_FRAME_LR_SAVE];
163 if (!firstframe && !__kernel_text_address(ip)) 178 if (!__kernel_text_address(ip))
164 return 1; 179 return -EINVAL;
165 firstframe = 0;
166 180
167 /* 181 /*
168 * FIXME: IMHO these tests do not belong in 182 * FIXME: IMHO these tests do not belong in
@@ -175,25 +189,37 @@ save_stack_trace_tsk_reliable(struct task_struct *tsk,
175 * as unreliable. 189 * as unreliable.
176 */ 190 */
177 if (ip == (unsigned long)kretprobe_trampoline) 191 if (ip == (unsigned long)kretprobe_trampoline)
178 return 1; 192 return -EINVAL;
179#endif 193#endif
180 194
195 if (trace->nr_entries >= trace->max_entries)
196 return -E2BIG;
181 if (!trace->skip) 197 if (!trace->skip)
182 trace->entries[trace->nr_entries++] = ip; 198 trace->entries[trace->nr_entries++] = ip;
183 else 199 else
184 trace->skip--; 200 trace->skip--;
201 }
202 return 0;
203}
185 204
186 if (newsp == stack_end) 205int save_stack_trace_tsk_reliable(struct task_struct *tsk,
187 break; 206 struct stack_trace *trace)
207{
208 int ret;
188 209
189 if (trace->nr_entries >= trace->max_entries) 210 /*
190 return -E2BIG; 211 * If the task doesn't have a stack (e.g., a zombie), the stack is
212 * "reliably" empty.
213 */
214 if (!try_get_task_stack(tsk))
215 return 0;
191 216
192 sp = newsp; 217 ret = __save_stack_trace_tsk_reliable(tsk, trace);
193 } 218
194 return 0; 219 put_task_stack(tsk);
220
221 return ret;
195} 222}
196EXPORT_SYMBOL_GPL(save_stack_trace_tsk_reliable);
197#endif /* CONFIG_HAVE_RELIABLE_STACKTRACE */ 223#endif /* CONFIG_HAVE_RELIABLE_STACKTRACE */
198 224
199#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_NMI_IPI) 225#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_NMI_IPI)
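A sketch of how a consistency-model user such as livepatch might consume this unwinder, under the pre-5.2 struct stack_trace interface; check_task_stack() is hypothetical, and any non-zero return means the trace must not be trusted:

    #include <linux/kernel.h>
    #include <linux/sched.h>
    #include <linux/stacktrace.h>

    static int check_task_stack(struct task_struct *tsk)
    {
            unsigned long entries[64];
            struct stack_trace trace = {
                    .entries     = entries,
                    .max_entries = ARRAY_SIZE(entries),
            };

            /* 0: every entry is reliable; -EINVAL/-E2BIG: retry later. */
            return save_stack_trace_tsk_reliable(tsk, &trace);
    }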
diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c
index e6982ab21816..e52a8878c2fb 100644
--- a/arch/powerpc/kernel/syscalls.c
+++ b/arch/powerpc/kernel/syscalls.c
@@ -123,7 +123,7 @@ long ppc_fadvise64_64(int fd, int advice, u32 offset_high, u32 offset_low,
123 (u64)len_high << 32 | len_low, advice); 123 (u64)len_high << 32 | len_low, advice);
124} 124}
125 125
126long sys_switch_endian(void) 126SYSCALL_DEFINE0(switch_endian)
127{ 127{
128 struct thread_info *ti; 128 struct thread_info *ti;
129 129
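The switch to SYSCALL_DEFINE0 above hooks switch_endian into the common syscall-definition machinery (entry-point naming, tracing metadata and related glue). As a rough illustration with a made-up name, a zero-argument syscall is declared as:

    #include <linux/syscalls.h>

    /* Generates the long sys_demo_call(void) entry point, among other glue.
     * 'demo_call' is illustrative only. */
    SYSCALL_DEFINE0(demo_call)
    {
            return 0;
    }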
diff --git a/arch/powerpc/kernel/syscalls/syscalltbl.sh b/arch/powerpc/kernel/syscalls/syscalltbl.sh
index fd620490a542..f7393a7b18aa 100644
--- a/arch/powerpc/kernel/syscalls/syscalltbl.sh
+++ b/arch/powerpc/kernel/syscalls/syscalltbl.sh
@@ -13,10 +13,10 @@ emit() {
13 t_entry="$3" 13 t_entry="$3"
14 14
15 while [ $t_nxt -lt $t_nr ]; do 15 while [ $t_nxt -lt $t_nr ]; do
16 printf "__SYSCALL(%s,sys_ni_syscall, )\n" "${t_nxt}" 16 printf "__SYSCALL(%s,sys_ni_syscall)\n" "${t_nxt}"
17 t_nxt=$((t_nxt+1)) 17 t_nxt=$((t_nxt+1))
18 done 18 done
19 printf "__SYSCALL(%s,%s, )\n" "${t_nxt}" "${t_entry}" 19 printf "__SYSCALL(%s,%s)\n" "${t_nxt}" "${t_entry}"
20} 20}
21 21
22grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( 22grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
diff --git a/arch/powerpc/kernel/systbl.S b/arch/powerpc/kernel/systbl.S
index 23265a28740b..02f28faba125 100644
--- a/arch/powerpc/kernel/systbl.S
+++ b/arch/powerpc/kernel/systbl.S
@@ -25,11 +25,11 @@
25.globl sys_call_table 25.globl sys_call_table
26sys_call_table: 26sys_call_table:
27#ifdef CONFIG_PPC64 27#ifdef CONFIG_PPC64
28#define __SYSCALL(nr, entry, nargs) .8byte DOTSYM(entry) 28#define __SYSCALL(nr, entry) .8byte DOTSYM(entry)
29#include <asm/syscall_table_64.h> 29#include <asm/syscall_table_64.h>
30#undef __SYSCALL 30#undef __SYSCALL
31#else 31#else
32#define __SYSCALL(nr, entry, nargs) .long entry 32#define __SYSCALL(nr, entry) .long entry
33#include <asm/syscall_table_32.h> 33#include <asm/syscall_table_32.h>
34#undef __SYSCALL 34#undef __SYSCALL
35#endif 35#endif
@@ -38,7 +38,7 @@ sys_call_table:
38.globl compat_sys_call_table 38.globl compat_sys_call_table
39compat_sys_call_table: 39compat_sys_call_table:
40#define compat_sys_sigsuspend sys_sigsuspend 40#define compat_sys_sigsuspend sys_sigsuspend
41#define __SYSCALL(nr, entry, nargs) .8byte DOTSYM(entry) 41#define __SYSCALL(nr, entry) .8byte DOTSYM(entry)
42#include <asm/syscall_table_c32.h> 42#include <asm/syscall_table_c32.h>
43#undef __SYSCALL 43#undef __SYSCALL
44#endif 44#endif
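Taken together, the script and table hunks drop the never-used nargs column. Illustratively (entry names taken from the start of the powerpc table, shown only as an example), the generator emits lines such as

    __SYSCALL(0, sys_restart_syscall)
    __SYSCALL(1, sys_exit)

which the 64-bit definition above expands to

    .8byte DOTSYM(sys_restart_syscall)
    .8byte DOTSYM(sys_exit)

so the generated tables themselves are unchanged.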
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 3646affae963..bc0503ef9c9c 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -57,7 +57,6 @@
57#include <linux/irq_work.h> 57#include <linux/irq_work.h>
58#include <linux/clk-provider.h> 58#include <linux/clk-provider.h>
59#include <linux/suspend.h> 59#include <linux/suspend.h>
60#include <linux/rtc.h>
61#include <linux/sched/cputime.h> 60#include <linux/sched/cputime.h>
62#include <linux/processor.h> 61#include <linux/processor.h>
63#include <asm/trace.h> 62#include <asm/trace.h>
diff --git a/arch/powerpc/kernel/trace/Makefile b/arch/powerpc/kernel/trace/Makefile
index b1725ad3e13d..858503775c58 100644
--- a/arch/powerpc/kernel/trace/Makefile
+++ b/arch/powerpc/kernel/trace/Makefile
@@ -23,6 +23,7 @@ obj-$(CONFIG_TRACING) += trace_clock.o
23obj-$(CONFIG_PPC64) += $(obj64-y) 23obj-$(CONFIG_PPC64) += $(obj64-y)
24obj-$(CONFIG_PPC32) += $(obj32-y) 24obj-$(CONFIG_PPC32) += $(obj32-y)
25 25
26# Disable GCOV & sanitizers in odd or sensitive code 26# Disable GCOV, KCOV & sanitizers in odd or sensitive code
27GCOV_PROFILE_ftrace.o := n 27GCOV_PROFILE_ftrace.o := n
28KCOV_INSTRUMENT_ftrace.o := n
28UBSAN_SANITIZE_ftrace.o := n 29UBSAN_SANITIZE_ftrace.o := n
diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S
index 32476a6e4e9c..01b1224add49 100644
--- a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S
+++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S
@@ -229,7 +229,7 @@ ftrace_call:
229 * - r0, r11 & r12 are free 229 * - r0, r11 & r12 are free
230 */ 230 */
231livepatch_handler: 231livepatch_handler:
232 CURRENT_THREAD_INFO(r12, r1) 232 ld r12, PACA_THREAD_INFO(r13)
233 233
234 /* Allocate 3 x 8 bytes */ 234 /* Allocate 3 x 8 bytes */
235 ld r11, TI_livepatch_sp(r12) 235 ld r11, TI_livepatch_sp(r12)
@@ -256,7 +256,7 @@ livepatch_handler:
256 * restore it. 256 * restore it.
257 */ 257 */
258 258
259 CURRENT_THREAD_INFO(r12, r1) 259 ld r12, PACA_THREAD_INFO(r13)
260 260
261 ld r11, TI_livepatch_sp(r12) 261 ld r11, TI_livepatch_sp(r12)
262 262
@@ -273,7 +273,7 @@ livepatch_handler:
273 ld r2, -24(r11) 273 ld r2, -24(r11)
274 274
275 /* Pop livepatch stack frame */ 275 /* Pop livepatch stack frame */
276 CURRENT_THREAD_INFO(r12, r1) 276 ld r12, PACA_THREAD_INFO(r13)
277 subi r11, r11, 24 277 subi r11, r11, 24
278 std r11, TI_livepatch_sp(r12) 278 std r11, TI_livepatch_sp(r12)
279 279
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 64936b60d521..a21200c6aaea 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -257,24 +257,17 @@ static int __die(const char *str, struct pt_regs *regs, long err)
257{ 257{
258 printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter); 258 printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter);
259 259
260 if (IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN)) 260 printk("%s PAGE_SIZE=%luK%s%s%s%s%s%s%s %s\n",
261 printk("LE "); 261 IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) ? "LE" : "BE",
262 else 262 PAGE_SIZE / 1024,
263 printk("BE "); 263 early_radix_enabled() ? " MMU=Radix" : "",
264 264 early_mmu_has_feature(MMU_FTR_HPTE_TABLE) ? " MMU=Hash" : "",
265 if (IS_ENABLED(CONFIG_PREEMPT)) 265 IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "",
266 pr_cont("PREEMPT "); 266 IS_ENABLED(CONFIG_SMP) ? " SMP" : "",
267 267 IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "",
268 if (IS_ENABLED(CONFIG_SMP)) 268 debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "",
269 pr_cont("SMP NR_CPUS=%d ", NR_CPUS); 269 IS_ENABLED(CONFIG_NUMA) ? " NUMA" : "",
270 270 ppc_md.name ? ppc_md.name : "");
271 if (debug_pagealloc_enabled())
272 pr_cont("DEBUG_PAGEALLOC ");
273
274 if (IS_ENABLED(CONFIG_NUMA))
275 pr_cont("NUMA ");
276
277 pr_cont("%s\n", ppc_md.name ? ppc_md.name : "");
278 271
279 if (notify_die(DIE_OOPS, str, regs, err, 255, SIGSEGV) == NOTIFY_STOP) 272 if (notify_die(DIE_OOPS, str, regs, err, 255, SIGSEGV) == NOTIFY_STOP)
280 return 1; 273 return 1;
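For reference, the consolidated printk() emits the whole banner atomically; on a hypothetical little-endian Radix PowerNV configuration the first two oops lines would read

    Oops: Kernel access of bad area, sig: 11 [#1]
    LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA PowerNV

whereas the old pr_cont() chain could be interleaved with concurrent console output.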
@@ -376,16 +369,101 @@ void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr)
376 force_sig_fault(signr, code, (void __user *)addr, current); 369 force_sig_fault(signr, code, (void __user *)addr, current);
377} 370}
378 371
372/*
373 * The interrupt architecture has a quirk in that the HV interrupts excluding
374 * the NMIs (0x100 and 0x200) do not clear MSR[RI] at entry. The first thing
375 * that an interrupt handler must do is save off a GPR into a scratch register,
376 * and all interrupts on POWERNV (HV=1) use the HSPRG1 register as scratch.
377 * Therefore an NMI can clobber an HV interrupt's live HSPRG1 without noticing
378 * that it is non-reentrant, which leads to random data corruption.
379 *
380 * The solution is for NMI interrupts in HV mode to check if they originated
381 * from these critical HV interrupt regions. If so, then mark them not
382 * recoverable.
383 *
384 * An alternative would be for HV NMIs to use SPRG for scratch to avoid the
385 * HSPRG1 clobber, however this would cause guest SPRG to be clobbered. Linux
386 * guests should always have MSR[RI]=0 when its scratch SPRG is in use, so
387 * that would work. However any other guest OS that may have the SPRG live
388 * and MSR[RI]=1 could encounter silent corruption.
389 *
390 * Builds that do not support KVM could take this second option to increase
391 * the recoverability of NMIs.
392 */
393void hv_nmi_check_nonrecoverable(struct pt_regs *regs)
394{
395#ifdef CONFIG_PPC_POWERNV
396 unsigned long kbase = (unsigned long)_stext;
397 unsigned long nip = regs->nip;
398
399 if (!(regs->msr & MSR_RI))
400 return;
401 if (!(regs->msr & MSR_HV))
402 return;
403 if (regs->msr & MSR_PR)
404 return;
405
406 /*
407 * Now test if the interrupt has hit a range that may be using
408 * HSPRG1 without having RI=0 (i.e., an HSRR interrupt). The
409 * problem ranges all run un-relocated. Test real and virt modes
410 * at the same time by dropping the high bit of the nip (virt mode
411 * entry points still have the +0x4000 offset).
412 */
413 nip &= ~0xc000000000000000ULL;
414 if ((nip >= 0x500 && nip < 0x600) || (nip >= 0x4500 && nip < 0x4600))
415 goto nonrecoverable;
416 if ((nip >= 0x980 && nip < 0xa00) || (nip >= 0x4980 && nip < 0x4a00))
417 goto nonrecoverable;
418 if ((nip >= 0xe00 && nip < 0xec0) || (nip >= 0x4e00 && nip < 0x4ec0))
419 goto nonrecoverable;
420 if ((nip >= 0xf80 && nip < 0xfa0) || (nip >= 0x4f80 && nip < 0x4fa0))
421 goto nonrecoverable;
422
423 /* Trampoline code runs un-relocated so subtract kbase. */
424 if (nip >= (unsigned long)(start_real_trampolines - kbase) &&
425 nip < (unsigned long)(end_real_trampolines - kbase))
426 goto nonrecoverable;
427 if (nip >= (unsigned long)(start_virt_trampolines - kbase) &&
428 nip < (unsigned long)(end_virt_trampolines - kbase))
429 goto nonrecoverable;
430 return;
431
432nonrecoverable:
433 regs->msr &= ~MSR_RI;
434#endif
435}
436
379void system_reset_exception(struct pt_regs *regs) 437void system_reset_exception(struct pt_regs *regs)
380{ 438{
439 unsigned long hsrr0, hsrr1;
440 bool nested = in_nmi();
441 bool saved_hsrrs = false;
442
381 /* 443 /*
382 * Avoid crashes in case of nested NMI exceptions. Recoverability 444 * Avoid crashes in case of nested NMI exceptions. Recoverability
383 * is determined by RI and in_nmi 445 * is determined by RI and in_nmi
384 */ 446 */
385 bool nested = in_nmi();
386 if (!nested) 447 if (!nested)
387 nmi_enter(); 448 nmi_enter();
388 449
450 /*
451 * System reset can interrupt code where HSRRs are live and MSR[RI]=1.
452 * The system reset interrupt itself may clobber HSRRs (e.g., to call
453 * OPAL), so save them here and restore them before returning.
454 *
455 * Machine checks don't need to save HSRRs, as the real mode handler
456 * is careful to avoid them, and the regular handler is not delivered
457 * as an NMI.
458 */
459 if (cpu_has_feature(CPU_FTR_HVMODE)) {
460 hsrr0 = mfspr(SPRN_HSRR0);
461 hsrr1 = mfspr(SPRN_HSRR1);
462 saved_hsrrs = true;
463 }
464
465 hv_nmi_check_nonrecoverable(regs);
466
389 __this_cpu_inc(irq_stat.sreset_irqs); 467 __this_cpu_inc(irq_stat.sreset_irqs);
390 468
391 /* See if any machine dependent calls */ 469 /* See if any machine dependent calls */
@@ -433,6 +511,11 @@ out:
433 if (!(regs->msr & MSR_RI)) 511 if (!(regs->msr & MSR_RI))
434 nmi_panic(regs, "Unrecoverable System Reset"); 512 nmi_panic(regs, "Unrecoverable System Reset");
435 513
514 if (saved_hsrrs) {
515 mtspr(SPRN_HSRR0, hsrr0);
516 mtspr(SPRN_HSRR1, hsrr1);
517 }
518
436 if (!nested) 519 if (!nested)
437 nmi_exit(); 520 nmi_exit();
438 521
@@ -763,15 +846,15 @@ void machine_check_exception(struct pt_regs *regs)
763 if (check_io_access(regs)) 846 if (check_io_access(regs))
764 goto bail; 847 goto bail;
765 848
766 /* Must die if the interrupt is not recoverable */
767 if (!(regs->msr & MSR_RI))
768 nmi_panic(regs, "Unrecoverable Machine check");
769
770 if (!nested) 849 if (!nested)
771 nmi_exit(); 850 nmi_exit();
772 851
773 die("Machine check", regs, SIGBUS); 852 die("Machine check", regs, SIGBUS);
774 853
854 /* Must die if the interrupt is not recoverable */
855 if (!(regs->msr & MSR_RI))
856 nmi_panic(regs, "Unrecoverable Machine check");
857
775 return; 858 return;
776 859
777bail: 860bail:
@@ -1542,8 +1625,8 @@ bail:
1542 1625
1543void StackOverflow(struct pt_regs *regs) 1626void StackOverflow(struct pt_regs *regs)
1544{ 1627{
1545 printk(KERN_CRIT "Kernel stack overflow in process %p, r1=%lx\n", 1628 pr_crit("Kernel stack overflow in process %s[%d], r1=%lx\n",
1546 current, regs->gpr[1]); 1629 current->comm, task_pid_nr(current), regs->gpr[1]);
1547 debugger(regs); 1630 debugger(regs);
1548 show_regs(regs); 1631 show_regs(regs);
1549 panic("kernel stack overflow"); 1632 panic("kernel stack overflow");
diff --git a/arch/powerpc/kernel/udbg.c b/arch/powerpc/kernel/udbg.c
index 7cc38b5b58bc..8db4891acdaf 100644
--- a/arch/powerpc/kernel/udbg.c
+++ b/arch/powerpc/kernel/udbg.c
@@ -74,7 +74,7 @@ void __init udbg_early_init(void)
74#endif 74#endif
75 75
76#ifdef CONFIG_PPC_EARLY_DEBUG 76#ifdef CONFIG_PPC_EARLY_DEBUG
77 console_loglevel = 10; 77 console_loglevel = CONSOLE_LOGLEVEL_DEBUG;
78 78
79 register_early_udbg_console(); 79 register_early_udbg_console();
80#endif 80#endif
diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile
index 50112d4473bb..ce199f6e4256 100644
--- a/arch/powerpc/kernel/vdso32/Makefile
+++ b/arch/powerpc/kernel/vdso32/Makefile
@@ -23,6 +23,7 @@ targets := $(obj-vdso32) vdso32.so vdso32.so.dbg
23obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32)) 23obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32))
24 24
25GCOV_PROFILE := n 25GCOV_PROFILE := n
26KCOV_INSTRUMENT := n
26UBSAN_SANITIZE := n 27UBSAN_SANITIZE := n
27 28
28ccflags-y := -shared -fno-common -fno-builtin 29ccflags-y := -shared -fno-common -fno-builtin
diff --git a/arch/powerpc/kernel/vdso64/Makefile b/arch/powerpc/kernel/vdso64/Makefile
index 69cecb346269..28e7d112aa2f 100644
--- a/arch/powerpc/kernel/vdso64/Makefile
+++ b/arch/powerpc/kernel/vdso64/Makefile
@@ -9,6 +9,7 @@ targets := $(obj-vdso64) vdso64.so vdso64.so.dbg
9obj-vdso64 := $(addprefix $(obj)/, $(obj-vdso64)) 9obj-vdso64 := $(addprefix $(obj)/, $(obj-vdso64))
10 10
11GCOV_PROFILE := n 11GCOV_PROFILE := n
12KCOV_INSTRUMENT := n
12UBSAN_SANITIZE := n 13UBSAN_SANITIZE := n
13 14
14ccflags-y := -shared -fno-common -fno-builtin 15ccflags-y := -shared -fno-common -fno-builtin
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index ad1c77f71f54..060a1acd7c6d 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -12,11 +12,8 @@
12#include <asm/cache.h> 12#include <asm/cache.h>
13#include <asm/thread_info.h> 13#include <asm/thread_info.h>
14 14
15#if defined(CONFIG_STRICT_KERNEL_RWX) && !defined(CONFIG_PPC32) 15#define STRICT_ALIGN_SIZE (1 << CONFIG_DATA_SHIFT)
16#define STRICT_ALIGN_SIZE (1 << 24) 16#define ETEXT_ALIGN_SIZE (1 << CONFIG_ETEXT_SHIFT)
17#else
18#define STRICT_ALIGN_SIZE PAGE_SIZE
19#endif
20 17
21ENTRY(_stext) 18ENTRY(_stext)
22 19
@@ -86,11 +83,11 @@ SECTIONS
86 83
87#ifdef CONFIG_PPC64 84#ifdef CONFIG_PPC64
88 /* 85 /*
89 * BLOCK(0) overrides the default output section alignment because 86 * ALIGN(0) overrides the default output section alignment because
90 * this needs to start right after .head.text in order for fixed 87 * this needs to start right after .head.text in order for fixed
91 * section placement to work. 88 * section placement to work.
92 */ 89 */
93 .text BLOCK(0) : AT(ADDR(.text) - LOAD_OFFSET) { 90 .text ALIGN(0) : AT(ADDR(.text) - LOAD_OFFSET) {
94#ifdef CONFIG_LD_HEAD_STUB_CATCH 91#ifdef CONFIG_LD_HEAD_STUB_CATCH
95 KEEP(*(.linker_stub_catch)); 92 KEEP(*(.linker_stub_catch));
96 . = . ; 93 . = . ;
@@ -131,7 +128,7 @@ SECTIONS
131 128
132 } :kernel 129 } :kernel
133 130
134 . = ALIGN(PAGE_SIZE); 131 . = ALIGN(ETEXT_ALIGN_SIZE);
135 _etext = .; 132 _etext = .;
136 PROVIDE32 (etext = .); 133 PROVIDE32 (etext = .);
137 134
@@ -319,6 +316,7 @@ SECTIONS
319 *(.sdata2) 316 *(.sdata2)
320 *(.got.plt) *(.got) 317 *(.got.plt) *(.got)
321 *(.plt) 318 *(.plt)
319 *(.branch_lt)
322 } 320 }
323#else 321#else
324 .data : AT(ADDR(.data) - LOAD_OFFSET) { 322 .data : AT(ADDR(.data) - LOAD_OFFSET) {
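The alignment change above trades hard-coded constants for Kconfig-driven shifts. As a worked example under the assumed defaults of this series, a 64-bit STRICT_KERNEL_RWX build with CONFIG_DATA_SHIFT=24 keeps

    #define STRICT_ALIGN_SIZE (1 << 24)   /* = 16 MB, as before */

while configurations without strict RWX are expected to select a shift of PAGE_SHIFT, collapsing the alignment back to a single page.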
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 64f1135e7732..3223aec88b2c 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -10,11 +10,6 @@ common-objs-y = $(KVM)/kvm_main.o $(KVM)/eventfd.o
10common-objs-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o 10common-objs-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o
11common-objs-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o 11common-objs-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o
12 12
13CFLAGS_e500_mmu.o := -I.
14CFLAGS_e500_mmu_host.o := -I.
15CFLAGS_emulate.o := -I.
16CFLAGS_emulate_loadstore.o := -I.
17
18common-objs-y += powerpc.o emulate_loadstore.o 13common-objs-y += powerpc.o emulate_loadstore.o
19obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o 14obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o
20obj-$(CONFIG_KVM_BOOK3S_HANDLER) += book3s_exports.o 15obj-$(CONFIG_KVM_BOOK3S_HANDLER) += book3s_exports.o
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index bd1a677dd9e4..9a7dadbe1f17 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -192,6 +192,13 @@ void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec)
192} 192}
193EXPORT_SYMBOL_GPL(kvmppc_book3s_queue_irqprio); 193EXPORT_SYMBOL_GPL(kvmppc_book3s_queue_irqprio);
194 194
195void kvmppc_core_queue_machine_check(struct kvm_vcpu *vcpu, ulong flags)
196{
197 /* might as well deliver this straight away */
198 kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_MACHINE_CHECK, flags);
199}
200EXPORT_SYMBOL_GPL(kvmppc_core_queue_machine_check);
201
195void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags) 202void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags)
196{ 203{
197 /* might as well deliver this straight away */ 204 /* might as well deliver this straight away */
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 5a066fc299e1..a3d5318f5d1e 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1215,6 +1215,22 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
1215 r = RESUME_GUEST; 1215 r = RESUME_GUEST;
1216 break; 1216 break;
1217 case BOOK3S_INTERRUPT_MACHINE_CHECK: 1217 case BOOK3S_INTERRUPT_MACHINE_CHECK:
1218 /* Print the MCE event to host console. */
1219 machine_check_print_event_info(&vcpu->arch.mce_evt, false, true);
1220
1221 /*
1222 * If the guest can do FWNMI, exit to userspace so it can
1223 * deliver a FWNMI to the guest.
1224 * Otherwise we synthesize a machine check for the guest
1225 * so that it knows that the machine check occurred.
1226 */
1227 if (!vcpu->kvm->arch.fwnmi_enabled) {
1228 ulong flags = vcpu->arch.shregs.msr & 0x083c0000;
1229 kvmppc_core_queue_machine_check(vcpu, flags);
1230 r = RESUME_GUEST;
1231 break;
1232 }
1233
1218 /* Exit to host with KVM_EXIT_NMI as exit reason */ 1234 /* Exit to host with KVM_EXIT_NMI as exit reason */
1219 run->exit_reason = KVM_EXIT_NMI; 1235 run->exit_reason = KVM_EXIT_NMI;
1220 run->hw.hardware_exit_reason = vcpu->arch.trap; 1236 run->hw.hardware_exit_reason = vcpu->arch.trap;
@@ -1227,8 +1243,6 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
1227 run->flags |= KVM_RUN_PPC_NMI_DISP_NOT_RECOV; 1243 run->flags |= KVM_RUN_PPC_NMI_DISP_NOT_RECOV;
1228 1244
1229 r = RESUME_HOST; 1245 r = RESUME_HOST;
1230 /* Print the MCE event to host console. */
1231 machine_check_print_event_info(&vcpu->arch.mce_evt, false);
1232 break; 1246 break;
1233 case BOOK3S_INTERRUPT_PROGRAM: 1247 case BOOK3S_INTERRUPT_PROGRAM:
1234 { 1248 {
@@ -1392,7 +1406,7 @@ static int kvmppc_handle_nested_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
1392 /* Pass the machine check to the L1 guest */ 1406 /* Pass the machine check to the L1 guest */
1393 r = RESUME_HOST; 1407 r = RESUME_HOST;
1394 /* Print the MCE event to host console. */ 1408 /* Print the MCE event to host console. */
1395 machine_check_print_event_info(&vcpu->arch.mce_evt, false); 1409 machine_check_print_event_info(&vcpu->arch.mce_evt, false, true);
1396 break; 1410 break;
1397 /* 1411 /*
1398 * We get these next two if the guest accesses a page which it thinks 1412 * We get these next two if the guest accesses a page which it thinks
@@ -3455,6 +3469,7 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
3455 unsigned long host_dscr = mfspr(SPRN_DSCR); 3469 unsigned long host_dscr = mfspr(SPRN_DSCR);
3456 unsigned long host_tidr = mfspr(SPRN_TIDR); 3470 unsigned long host_tidr = mfspr(SPRN_TIDR);
3457 unsigned long host_iamr = mfspr(SPRN_IAMR); 3471 unsigned long host_iamr = mfspr(SPRN_IAMR);
3472 unsigned long host_amr = mfspr(SPRN_AMR);
3458 s64 dec; 3473 s64 dec;
3459 u64 tb; 3474 u64 tb;
3460 int trap, save_pmu; 3475 int trap, save_pmu;
@@ -3571,13 +3586,15 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
3571 3586
3572 mtspr(SPRN_PSPB, 0); 3587 mtspr(SPRN_PSPB, 0);
3573 mtspr(SPRN_WORT, 0); 3588 mtspr(SPRN_WORT, 0);
3574 mtspr(SPRN_AMR, 0);
3575 mtspr(SPRN_UAMOR, 0); 3589 mtspr(SPRN_UAMOR, 0);
3576 mtspr(SPRN_DSCR, host_dscr); 3590 mtspr(SPRN_DSCR, host_dscr);
3577 mtspr(SPRN_TIDR, host_tidr); 3591 mtspr(SPRN_TIDR, host_tidr);
3578 mtspr(SPRN_IAMR, host_iamr); 3592 mtspr(SPRN_IAMR, host_iamr);
3579 mtspr(SPRN_PSPB, 0); 3593 mtspr(SPRN_PSPB, 0);
3580 3594
3595 if (host_amr != vcpu->arch.amr)
3596 mtspr(SPRN_AMR, host_amr);
3597
3581 msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX); 3598 msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
3582 store_fp_state(&vcpu->arch.fp); 3599 store_fp_state(&vcpu->arch.fp);
3583#ifdef CONFIG_ALTIVEC 3600#ifdef CONFIG_ALTIVEC
diff --git a/arch/powerpc/kvm/book3s_hv_hmi.c b/arch/powerpc/kvm/book3s_hv_hmi.c
index e3f738eb1cac..64b5011475c7 100644
--- a/arch/powerpc/kvm/book3s_hv_hmi.c
+++ b/arch/powerpc/kvm/book3s_hv_hmi.c
@@ -24,6 +24,7 @@
24#include <linux/compiler.h> 24#include <linux/compiler.h>
25#include <asm/paca.h> 25#include <asm/paca.h>
26#include <asm/hmi.h> 26#include <asm/hmi.h>
27#include <asm/processor.h>
27 28
28void wait_for_subcore_guest_exit(void) 29void wait_for_subcore_guest_exit(void)
29{ 30{
diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c
index 0787f12c1a1b..8c24c3bea0bf 100644
--- a/arch/powerpc/kvm/book3s_hv_ras.c
+++ b/arch/powerpc/kvm/book3s_hv_ras.c
@@ -66,10 +66,8 @@ static void reload_slb(struct kvm_vcpu *vcpu)
66/* 66/*
67 * On POWER7, see if we can handle a machine check that occurred inside 67 * On POWER7, see if we can handle a machine check that occurred inside
68 * the guest in real mode, without switching to the host partition. 68 * the guest in real mode, without switching to the host partition.
69 *
70 * Returns: 0 => exit guest, 1 => deliver machine check to guest
71 */ 69 */
72static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu) 70static void kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu)
73{ 71{
74 unsigned long srr1 = vcpu->arch.shregs.msr; 72 unsigned long srr1 = vcpu->arch.shregs.msr;
75 struct machine_check_event mce_evt; 73 struct machine_check_event mce_evt;
@@ -111,52 +109,24 @@ static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu)
111 } 109 }
112 110
113 /* 111 /*
114 * See if we have already handled the condition in the linux host. 112 * Now get the event and stash it in the vcpu struct so it can
115 * We assume that if the condition is recovered then linux host 113 * be handled by the primary thread in virtual mode. We can't
116 * will have generated an error log event that we will pick 114 * call machine_check_queue_event() here if we are running on
117 * up and log later. 115 * an offline secondary thread.
118 * Don't release mce event now. We will queue up the event so that
119 * we can log the MCE event info on host console.
120 */ 116 */
121 if (!get_mce_event(&mce_evt, MCE_EVENT_DONTRELEASE)) 117 if (get_mce_event(&mce_evt, MCE_EVENT_RELEASE)) {
122 goto out; 118 if (handled && mce_evt.version == MCE_V1)
123 119 mce_evt.disposition = MCE_DISPOSITION_RECOVERED;
124 if (mce_evt.version == MCE_V1 && 120 } else {
125 (mce_evt.severity == MCE_SEV_NO_ERROR || 121 memset(&mce_evt, 0, sizeof(mce_evt));
126 mce_evt.disposition == MCE_DISPOSITION_RECOVERED)) 122 }
127 handled = 1;
128
129out:
130 /*
131 * For guest that supports FWNMI capability, hook the MCE event into
132 * vcpu structure. We are going to exit the guest with KVM_EXIT_NMI
133 * exit reason. On our way to exit we will pull this event from vcpu
134 * structure and print it from thread 0 of the core/subcore.
135 *
136 * For guest that does not support FWNMI capability (old QEMU):
137 * We are now going enter guest either through machine check
138 * interrupt (for unhandled errors) or will continue from
139 * current HSRR0 (for handled errors) in guest. Hence
140 * queue up the event so that we can log it from host console later.
141 */
142 if (vcpu->kvm->arch.fwnmi_enabled) {
143 /*
144 * Hook up the mce event on to vcpu structure.
145 * First clear the old event.
146 */
147 memset(&vcpu->arch.mce_evt, 0, sizeof(vcpu->arch.mce_evt));
148 if (get_mce_event(&mce_evt, MCE_EVENT_RELEASE)) {
149 vcpu->arch.mce_evt = mce_evt;
150 }
151 } else
152 machine_check_queue_event();
153 123
154 return handled; 124 vcpu->arch.mce_evt = mce_evt;
155} 125}
156 126
157long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu) 127void kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu)
158{ 128{
159 return kvmppc_realmode_mc_power7(vcpu); 129 kvmppc_realmode_mc_power7(vcpu);
160} 130}
161 131
162/* Check if dynamic split is in force and return subcore size accordingly. */ 132/* Check if dynamic split is in force and return subcore size accordingly. */
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 9b8d50a7cbaf..25043b50cb30 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -58,6 +58,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
58#define STACK_SLOT_DAWR (SFS-56) 58#define STACK_SLOT_DAWR (SFS-56)
59#define STACK_SLOT_DAWRX (SFS-64) 59#define STACK_SLOT_DAWRX (SFS-64)
60#define STACK_SLOT_HFSCR (SFS-72) 60#define STACK_SLOT_HFSCR (SFS-72)
61#define STACK_SLOT_AMR (SFS-80)
62#define STACK_SLOT_UAMOR (SFS-88)
61/* the following is used by the P9 short path */ 63/* the following is used by the P9 short path */
62#define STACK_SLOT_NVGPRS (SFS-152) /* 18 gprs */ 64#define STACK_SLOT_NVGPRS (SFS-152) /* 18 gprs */
63 65
@@ -726,11 +728,9 @@ BEGIN_FTR_SECTION
726 mfspr r5, SPRN_TIDR 728 mfspr r5, SPRN_TIDR
727 mfspr r6, SPRN_PSSCR 729 mfspr r6, SPRN_PSSCR
728 mfspr r7, SPRN_PID 730 mfspr r7, SPRN_PID
729 mfspr r8, SPRN_IAMR
730 std r5, STACK_SLOT_TID(r1) 731 std r5, STACK_SLOT_TID(r1)
731 std r6, STACK_SLOT_PSSCR(r1) 732 std r6, STACK_SLOT_PSSCR(r1)
732 std r7, STACK_SLOT_PID(r1) 733 std r7, STACK_SLOT_PID(r1)
733 std r8, STACK_SLOT_IAMR(r1)
734 mfspr r5, SPRN_HFSCR 734 mfspr r5, SPRN_HFSCR
735 std r5, STACK_SLOT_HFSCR(r1) 735 std r5, STACK_SLOT_HFSCR(r1)
736END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) 736END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
@@ -738,11 +738,18 @@ BEGIN_FTR_SECTION
738 mfspr r5, SPRN_CIABR 738 mfspr r5, SPRN_CIABR
739 mfspr r6, SPRN_DAWR 739 mfspr r6, SPRN_DAWR
740 mfspr r7, SPRN_DAWRX 740 mfspr r7, SPRN_DAWRX
741 mfspr r8, SPRN_IAMR
741 std r5, STACK_SLOT_CIABR(r1) 742 std r5, STACK_SLOT_CIABR(r1)
742 std r6, STACK_SLOT_DAWR(r1) 743 std r6, STACK_SLOT_DAWR(r1)
743 std r7, STACK_SLOT_DAWRX(r1) 744 std r7, STACK_SLOT_DAWRX(r1)
745 std r8, STACK_SLOT_IAMR(r1)
744END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) 746END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
745 747
748 mfspr r5, SPRN_AMR
749 std r5, STACK_SLOT_AMR(r1)
750 mfspr r6, SPRN_UAMOR
751 std r6, STACK_SLOT_UAMOR(r1)
752
746BEGIN_FTR_SECTION 753BEGIN_FTR_SECTION
747 /* Set partition DABR */ 754 /* Set partition DABR */
748 /* Do this before re-enabling PMU to avoid P7 DABR corruption bug */ 755 /* Do this before re-enabling PMU to avoid P7 DABR corruption bug */
@@ -1631,22 +1638,25 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
1631 mtspr SPRN_PSPB, r0 1638 mtspr SPRN_PSPB, r0
1632 mtspr SPRN_WORT, r0 1639 mtspr SPRN_WORT, r0
1633BEGIN_FTR_SECTION 1640BEGIN_FTR_SECTION
1634 mtspr SPRN_IAMR, r0
1635 mtspr SPRN_TCSCR, r0 1641 mtspr SPRN_TCSCR, r0
1636 /* Set MMCRS to 1<<31 to freeze and disable the SPMC counters */ 1642 /* Set MMCRS to 1<<31 to freeze and disable the SPMC counters */
1637 li r0, 1 1643 li r0, 1
1638 sldi r0, r0, 31 1644 sldi r0, r0, 31
1639 mtspr SPRN_MMCRS, r0 1645 mtspr SPRN_MMCRS, r0
1640END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) 1646END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
16418:
1642 1647
1643 /* Save and reset AMR and UAMOR before turning on the MMU */ 1648 /* Save and restore AMR, IAMR and UAMOR before turning on the MMU */
1649 ld r8, STACK_SLOT_IAMR(r1)
1650 mtspr SPRN_IAMR, r8
1651
16528: /* Power7 jumps back in here */
1644 mfspr r5,SPRN_AMR 1653 mfspr r5,SPRN_AMR
1645 mfspr r6,SPRN_UAMOR 1654 mfspr r6,SPRN_UAMOR
1646 std r5,VCPU_AMR(r9) 1655 std r5,VCPU_AMR(r9)
1647 std r6,VCPU_UAMOR(r9) 1656 std r6,VCPU_UAMOR(r9)
1648 li r6,0 1657 ld r5,STACK_SLOT_AMR(r1)
1649 mtspr SPRN_AMR,r6 1658 ld r6,STACK_SLOT_UAMOR(r1)
1659 mtspr SPRN_AMR, r5
1650 mtspr SPRN_UAMOR, r6 1660 mtspr SPRN_UAMOR, r6
1651 1661
1652 /* Switch DSCR back to host value */ 1662 /* Switch DSCR back to host value */
@@ -1746,11 +1756,9 @@ BEGIN_FTR_SECTION
1746 ld r5, STACK_SLOT_TID(r1) 1756 ld r5, STACK_SLOT_TID(r1)
1747 ld r6, STACK_SLOT_PSSCR(r1) 1757 ld r6, STACK_SLOT_PSSCR(r1)
1748 ld r7, STACK_SLOT_PID(r1) 1758 ld r7, STACK_SLOT_PID(r1)
1749 ld r8, STACK_SLOT_IAMR(r1)
1750 mtspr SPRN_TIDR, r5 1759 mtspr SPRN_TIDR, r5
1751 mtspr SPRN_PSSCR, r6 1760 mtspr SPRN_PSSCR, r6
1752 mtspr SPRN_PID, r7 1761 mtspr SPRN_PID, r7
1753 mtspr SPRN_IAMR, r8
1754END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) 1762END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
1755 1763
1756#ifdef CONFIG_PPC_RADIX_MMU 1764#ifdef CONFIG_PPC_RADIX_MMU
@@ -2826,49 +2834,15 @@ kvm_cede_exit:
2826#endif /* CONFIG_KVM_XICS */ 2834#endif /* CONFIG_KVM_XICS */
28273: b guest_exit_cont 28353: b guest_exit_cont
2828 2836
2829 /* Try to handle a machine check in real mode */ 2837 /* Try to do machine check recovery in real mode */
2830machine_check_realmode: 2838machine_check_realmode:
2831 mr r3, r9 /* get vcpu pointer */ 2839 mr r3, r9 /* get vcpu pointer */
2832 bl kvmppc_realmode_machine_check 2840 bl kvmppc_realmode_machine_check
2833 nop 2841 nop
2842 /* all machine checks go to virtual mode for further handling */
2834 ld r9, HSTATE_KVM_VCPU(r13) 2843 ld r9, HSTATE_KVM_VCPU(r13)
2835 li r12, BOOK3S_INTERRUPT_MACHINE_CHECK 2844 li r12, BOOK3S_INTERRUPT_MACHINE_CHECK
2836 /* 2845 b guest_exit_cont
2837 * For the guest that is FWNMI capable, deliver all the MCE errors
2838 * (handled/unhandled) by exiting the guest with KVM_EXIT_NMI exit
2839 * reason. This approach injects machine check errors into the
2840 * guest address space with additional information in the form of
2841 * an RTAS event, enabling the guest kernel to handle such errors
2842 * suitably.
2843 *
2844 * For a guest that is not FWNMI capable (old QEMU), fall back
2845 * to the old behaviour for backward compatibility:
2846 * Deliver unhandled/fatal (e.g. UE) MCE errors to the guest
2847 * through a machine check interrupt (set HSRR0 to 0x200).
2848 * For handled (non-fatal) errors, just go back to guest execution
2849 * with the current HSRR0.
2850 * If we receive a machine check with MSR(RI)=0, then deliver it to
2851 * the guest as a machine check, causing the guest to crash.
2852 */
2853 ld r11, VCPU_MSR(r9)
2854 rldicl. r0, r11, 64-MSR_HV_LG, 63 /* check if it happened in HV mode */
2855 bne guest_exit_cont /* if so, exit to host */
2856 /* Check if guest is capable of handling NMI exit */
2857 ld r10, VCPU_KVM(r9)
2858 lbz r10, KVM_FWNMI(r10)
2859 cmpdi r10, 1 /* FWNMI capable? */
2860 beq guest_exit_cont /* if so, exit with KVM_EXIT_NMI. */
2861
2862 /* if not, fall through for backward compatibility. */
2863 andi. r10, r11, MSR_RI /* check for unrecoverable exception */
2864 beq 1f /* Deliver a machine check to guest */
2865 ld r10, VCPU_PC(r9)
2866 cmpdi r3, 0 /* Did we handle MCE ? */
2867 bne 2f /* Continue guest execution. */
2868 /* If not, deliver a machine check. SRR0/1 are already set */
28691: li r10, BOOK3S_INTERRUPT_MACHINE_CHECK
2870 bl kvmppc_msr_interrupt
28712: b fast_interrupt_c_return
2872 2846
2873/* 2847/*
2874 * Call C code to handle a HMI in real mode. 2848 * Call C code to handle a HMI in real mode.
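With the hunk above, the real-mode FWNMI capability check and the legacy delivery path are gone: kvmppc_realmode_machine_check() now does early triage only, and every machine check takes the normal guest-exit path so the FWNMI-vs-legacy decision happens in virtual mode. A C-style pseudocode sketch of the new real-mode tail (variable names are illustrative, not the actual asm symbols):

    /* pseudocode for the asm above -- triage, then always exit the guest */
    kvmppc_realmode_machine_check(vcpu);
    trap = BOOK3S_INTERRUPT_MACHINE_CHECK;  /* r12 in the asm */
    goto guest_exit_cont;                   /* delivery decided in virtual mode */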
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 3bf9fc6fd36c..79396e184bca 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -30,7 +30,8 @@ obj64-y += copypage_64.o copyuser_64.o mem_64.o hweight_64.o \
30 30
31obj64-$(CONFIG_SMP) += locks.o 31obj64-$(CONFIG_SMP) += locks.o
32obj64-$(CONFIG_ALTIVEC) += vmx-helper.o 32obj64-$(CONFIG_ALTIVEC) += vmx-helper.o
33obj64-$(CONFIG_KPROBES_SANITY_TEST) += test_emulate_step.o 33obj64-$(CONFIG_KPROBES_SANITY_TEST) += test_emulate_step.o \
34 test_emulate_step_exec_instr.o
34 35
35obj-y += checksum_$(BITS).o checksum_wrappers.o \ 36obj-y += checksum_$(BITS).o checksum_wrappers.o \
36 string_$(BITS).o memcmp_$(BITS).o 37 string_$(BITS).o memcmp_$(BITS).o
diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index d81568f783e5..3d33fb509ef4 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -1169,7 +1169,7 @@ static nokprobe_inline int trap_compare(long v1, long v2)
1169int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, 1169int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
1170 unsigned int instr) 1170 unsigned int instr)
1171{ 1171{
1172 unsigned int opcode, ra, rb, rd, spr, u; 1172 unsigned int opcode, ra, rb, rc, rd, spr, u;
1173 unsigned long int imm; 1173 unsigned long int imm;
1174 unsigned long int val, val2; 1174 unsigned long int val, val2;
1175 unsigned int mb, me, sh; 1175 unsigned int mb, me, sh;
@@ -1292,6 +1292,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
1292 rd = (instr >> 21) & 0x1f; 1292 rd = (instr >> 21) & 0x1f;
1293 ra = (instr >> 16) & 0x1f; 1293 ra = (instr >> 16) & 0x1f;
1294 rb = (instr >> 11) & 0x1f; 1294 rb = (instr >> 11) & 0x1f;
1295 rc = (instr >> 6) & 0x1f;
1295 1296
1296 switch (opcode) { 1297 switch (opcode) {
1297#ifdef __powerpc64__ 1298#ifdef __powerpc64__
@@ -1305,6 +1306,38 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
1305 goto trap; 1306 goto trap;
1306 return 1; 1307 return 1;
1307 1308
1309#ifdef __powerpc64__
1310 case 4:
1311 if (!cpu_has_feature(CPU_FTR_ARCH_300))
1312 return -1;
1313
1314 switch (instr & 0x3f) {
1315 case 48: /* maddhd */
1316 asm volatile(PPC_MADDHD(%0, %1, %2, %3) :
1317 "=r" (op->val) : "r" (regs->gpr[ra]),
1318 "r" (regs->gpr[rb]), "r" (regs->gpr[rc]));
1319 goto compute_done;
1320
1321 case 49: /* maddhdu */
1322 asm volatile(PPC_MADDHDU(%0, %1, %2, %3) :
1323 "=r" (op->val) : "r" (regs->gpr[ra]),
1324 "r" (regs->gpr[rb]), "r" (regs->gpr[rc]));
1325 goto compute_done;
1326
1327 case 51: /* maddld */
1328 asm volatile(PPC_MADDLD(%0, %1, %2, %3) :
1329 "=r" (op->val) : "r" (regs->gpr[ra]),
1330 "r" (regs->gpr[rb]), "r" (regs->gpr[rc]));
1331 goto compute_done;
1332 }
1333
1334 /*
1335 * There are other instructions from ISA 3.0 with the same
1336 * primary opcode which do not have emulation support yet.
1337 */
1338 return -1;
1339#endif
1340
1308 case 7: /* mulli */ 1341 case 7: /* mulli */
1309 op->val = regs->gpr[ra] * (short) instr; 1342 op->val = regs->gpr[ra] * (short) instr;
1310 goto compute_done; 1343 goto compute_done;
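The maddhd/maddhdu/maddld cases above execute the real instruction via inline asm instead of open-coding the 128-bit arithmetic. For reference, a portable sketch of the same semantics, assuming GCC/Clang __int128 on a 64-bit target (not the kernel's implementation):

    #include <stdint.h>

    /* maddhd: high 64 bits of the signed 128-bit (RA * RB) + RC */
    static uint64_t maddhd(int64_t ra, int64_t rb, int64_t rc)
    {
            return (uint64_t)(((__int128)ra * rb + rc) >> 64);
    }

    /* maddhdu: the same, fully unsigned */
    static uint64_t maddhdu(uint64_t ra, uint64_t rb, uint64_t rc)
    {
            return (uint64_t)(((unsigned __int128)ra * rb + rc) >> 64);
    }

    /* maddld: low 64 bits of the sum; signedness cannot affect the low half */
    static uint64_t maddld(int64_t ra, int64_t rb, int64_t rc)
    {
            return (uint64_t)((__int128)ra * rb + rc);
    }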
@@ -1671,10 +1704,23 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
1671 (int) regs->gpr[rb]; 1704 (int) regs->gpr[rb];
1672 1705
1673 goto arith_done; 1706 goto arith_done;
1674 1707#ifdef __powerpc64__
1708 case 265: /* modud */
1709 if (!cpu_has_feature(CPU_FTR_ARCH_300))
1710 return -1;
1711 op->val = regs->gpr[ra] % regs->gpr[rb];
1712 goto compute_done;
1713#endif
1675 case 266: /* add */ 1714 case 266: /* add */
1676 op->val = regs->gpr[ra] + regs->gpr[rb]; 1715 op->val = regs->gpr[ra] + regs->gpr[rb];
1677 goto arith_done; 1716 goto arith_done;
1717
1718 case 267: /* moduw */
1719 if (!cpu_has_feature(CPU_FTR_ARCH_300))
1720 return -1;
1721 op->val = (unsigned int) regs->gpr[ra] %
1722 (unsigned int) regs->gpr[rb];
1723 goto compute_done;
1678#ifdef __powerpc64__ 1724#ifdef __powerpc64__
1679 case 457: /* divdu */ 1725 case 457: /* divdu */
1680 op->val = regs->gpr[ra] / regs->gpr[rb]; 1726 op->val = regs->gpr[ra] / regs->gpr[rb];
@@ -1695,6 +1741,42 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
1695 (int) regs->gpr[rb]; 1741 (int) regs->gpr[rb];
1696 goto arith_done; 1742 goto arith_done;
1697 1743
1744 case 755: /* darn */
1745 if (!cpu_has_feature(CPU_FTR_ARCH_300))
1746 return -1;
1747 switch (ra & 0x3) {
1748 case 0:
1749 /* 32-bit conditioned */
1750 asm volatile(PPC_DARN(%0, 0) : "=r" (op->val));
1751 goto compute_done;
1752
1753 case 1:
1754 /* 64-bit conditioned */
1755 asm volatile(PPC_DARN(%0, 1) : "=r" (op->val));
1756 goto compute_done;
1757
1758 case 2:
1759 /* 64-bit raw */
1760 asm volatile(PPC_DARN(%0, 2) : "=r" (op->val));
1761 goto compute_done;
1762 }
1763
1764 return -1;
1765#ifdef __powerpc64__
1766 case 777: /* modsd */
1767 if (!cpu_has_feature(CPU_FTR_ARCH_300))
1768 return -1;
1769 op->val = (long int) regs->gpr[ra] %
1770 (long int) regs->gpr[rb];
1771 goto compute_done;
1772#endif
1773 case 779: /* modsw */
1774 if (!cpu_has_feature(CPU_FTR_ARCH_300))
1775 return -1;
1776 op->val = (int) regs->gpr[ra] %
1777 (int) regs->gpr[rb];
1778 goto compute_done;
1779
1698 1780
1699/* 1781/*
1700 * Logical instructions 1782 * Logical instructions
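The new mod* cases map directly onto C's % operator, whose truncated-division semantics match the ISA 3.0 instructions: for the signed forms the remainder takes the sign of the dividend. A standalone illustration of why the signed and unsigned variants differ (not kernel code):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            int64_t ra = -7, rb = 3;

            /* modsd: remainder sign follows the dividend, as with C's % */
            printf("modsd: %lld\n", (long long)(ra % rb));  /* prints -1 */

            /* modud: the same bit pattern reinterpreted as unsigned gives a
             * different remainder, since (2^64 - 7) % 3 == 0 */
            printf("modud: %llu\n",
                   (unsigned long long)((uint64_t)ra % (uint64_t)rb)); /* 0 */
            return 0;
    }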
@@ -1765,6 +1847,20 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
1765 do_popcnt(regs, op, regs->gpr[rd], 64); 1847 do_popcnt(regs, op, regs->gpr[rd], 64);
1766 goto logical_done_nocc; 1848 goto logical_done_nocc;
1767#endif 1849#endif
1850 case 538: /* cnttzw */
1851 if (!cpu_has_feature(CPU_FTR_ARCH_300))
1852 return -1;
1853 val = (unsigned int) regs->gpr[rd];
1854 op->val = (val ? __builtin_ctz(val) : 32);
1855 goto logical_done;
1856#ifdef __powerpc64__
1857 case 570: /* cnttzd */
1858 if (!cpu_has_feature(CPU_FTR_ARCH_300))
1859 return -1;
1860 val = regs->gpr[rd];
1861 op->val = (val ? __builtin_ctzl(val) : 64);
1862 goto logical_done;
1863#endif
1768 case 922: /* extsh */ 1864 case 922: /* extsh */
1769 op->val = (signed short) regs->gpr[rd]; 1865 op->val = (signed short) regs->gpr[rd];
1770 goto logical_done; 1866 goto logical_done;
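__builtin_ctz() and __builtin_ctzl() are undefined for a zero argument, which is why the cnttzw/cnttzd cases above special-case val == 0 and return the full operand width (32 or 64), exactly as the instructions specify. The cnttzw case in miniature:

    #include <stdint.h>

    /* cnttzw: trailing zero count of the low 32-bit word; 32 when it is 0 */
    static unsigned int cnttzw(uint64_t rs)
    {
            uint32_t val = (uint32_t)rs;
            return val ? (unsigned int)__builtin_ctz(val) : 32;
    }
    /* cnttzw(0x100) == 8, cnttzw(0) == 32 */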
@@ -1866,6 +1962,20 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
1866 op->xerval &= ~XER_CA; 1962 op->xerval &= ~XER_CA;
1867 set_ca32(op, op->xerval & XER_CA); 1963 set_ca32(op, op->xerval & XER_CA);
1868 goto logical_done; 1964 goto logical_done;
1965
1966 case 890: /* extswsli with sh_5 = 0 */
1967 case 891: /* extswsli with sh_5 = 1 */
1968 if (!cpu_has_feature(CPU_FTR_ARCH_300))
1969 return -1;
1970 op->type = COMPUTE + SETREG;
1971 sh = rb | ((instr & 2) << 4);
1972 val = (signed int) regs->gpr[rd];
1973 if (sh)
1974 op->val = ROTATE(val, sh) & MASK64(0, 63 - sh);
1975 else
1976 op->val = val;
1977 goto logical_done;
1978
1869#endif /* __powerpc64__ */ 1979#endif /* __powerpc64__ */
1870 1980
1871/* 1981/*
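extswsli ("extend sign word and shift left immediate") sign-extends the low 32 bits of RS and shifts left by a 6-bit immediate whose top bit sits in a separate instruction field, hence sh = rb | ((instr & 2) << 4) above. The ROTATE-and-MASK expression is a branch-free left shift; a reference sketch of the result:

    #include <stdint.h>

    static uint64_t extswsli(uint64_t rs, unsigned int sh)  /* sh < 64 */
    {
            int64_t val = (int32_t)rs;     /* sign-extend the low word */
            return (uint64_t)val << sh;    /* == ROTATE(val, sh) & MASK64(0, 63 - sh) */
    }
    /* extswsli(0x80000000, 4) == 0xfffffff800000000 */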
diff --git a/arch/powerpc/lib/test_emulate_step.c b/arch/powerpc/lib/test_emulate_step.c
index 6c47daa61614..9992c1ea7a1d 100644
--- a/arch/powerpc/lib/test_emulate_step.c
+++ b/arch/powerpc/lib/test_emulate_step.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Simple sanity test for emulate_step load/store instructions. 2 * Simple sanity tests for instruction emulation infrastructure.
3 * 3 *
4 * Copyright IBM Corp. 2016 4 * Copyright IBM Corp. 2016
5 * 5 *
@@ -14,6 +14,7 @@
14#include <linux/ptrace.h> 14#include <linux/ptrace.h>
15#include <asm/sstep.h> 15#include <asm/sstep.h>
16#include <asm/ppc-opcode.h> 16#include <asm/ppc-opcode.h>
17#include <asm/code-patching.h>
17 18
18#define IMM_L(i) ((uintptr_t)(i) & 0xffff) 19#define IMM_L(i) ((uintptr_t)(i) & 0xffff)
19 20
@@ -48,7 +49,20 @@
48 ___PPC_RA(a) | ___PPC_RB(b)) 49 ___PPC_RA(a) | ___PPC_RB(b))
49#define TEST_LXVD2X(s, a, b) (PPC_INST_LXVD2X | VSX_XX1((s), R##a, R##b)) 50#define TEST_LXVD2X(s, a, b) (PPC_INST_LXVD2X | VSX_XX1((s), R##a, R##b))
50#define TEST_STXVD2X(s, a, b) (PPC_INST_STXVD2X | VSX_XX1((s), R##a, R##b)) 51#define TEST_STXVD2X(s, a, b) (PPC_INST_STXVD2X | VSX_XX1((s), R##a, R##b))
52#define TEST_ADD(t, a, b) (PPC_INST_ADD | ___PPC_RT(t) | \
53 ___PPC_RA(a) | ___PPC_RB(b))
54#define TEST_ADD_DOT(t, a, b) (PPC_INST_ADD | ___PPC_RT(t) | \
55 ___PPC_RA(a) | ___PPC_RB(b) | 0x1)
56#define TEST_ADDC(t, a, b) (PPC_INST_ADDC | ___PPC_RT(t) | \
57 ___PPC_RA(a) | ___PPC_RB(b))
58#define TEST_ADDC_DOT(t, a, b) (PPC_INST_ADDC | ___PPC_RT(t) | \
59 ___PPC_RA(a) | ___PPC_RB(b) | 0x1)
60
61#define MAX_SUBTESTS 16
51 62
63#define IGNORE_GPR(n) (0x1UL << (n))
64#define IGNORE_XER (0x1UL << 32)
65#define IGNORE_CCR (0x1UL << 33)
52 66
53static void __init init_pt_regs(struct pt_regs *regs) 67static void __init init_pt_regs(struct pt_regs *regs)
54{ 68{
@@ -72,9 +86,15 @@ static void __init init_pt_regs(struct pt_regs *regs)
72 msr_cached = true; 86 msr_cached = true;
73} 87}
74 88
75static void __init show_result(char *ins, char *result) 89static void __init show_result(char *mnemonic, char *result)
76{ 90{
77 pr_info("%-14s : %s\n", ins, result); 91 pr_info("%-14s : %s\n", mnemonic, result);
92}
93
94static void __init show_result_with_descr(char *mnemonic, char *descr,
95 char *result)
96{
97 pr_info("%-14s : %-50s %s\n", mnemonic, descr, result);
78} 98}
79 99
80static void __init test_ld(void) 100static void __init test_ld(void)
@@ -426,7 +446,7 @@ static void __init test_lxvd2x_stxvd2x(void)
426} 446}
427#endif /* CONFIG_VSX */ 447#endif /* CONFIG_VSX */
428 448
429static int __init test_emulate_step(void) 449static void __init run_tests_load_store(void)
430{ 450{
431 test_ld(); 451 test_ld();
432 test_lwz(); 452 test_lwz();
@@ -437,6 +457,513 @@ static int __init test_emulate_step(void)
437 test_lfdx_stfdx(); 457 test_lfdx_stfdx();
438 test_lvx_stvx(); 458 test_lvx_stvx();
439 test_lxvd2x_stxvd2x(); 459 test_lxvd2x_stxvd2x();
460}
461
462struct compute_test {
463 char *mnemonic;
464 struct {
465 char *descr;
466 unsigned long flags;
467 unsigned int instr;
468 struct pt_regs regs;
469 } subtests[MAX_SUBTESTS + 1];
470};
471
472static struct compute_test compute_tests[] = {
473 {
474 .mnemonic = "nop",
475 .subtests = {
476 {
477 .descr = "R0 = LONG_MAX",
478 .instr = PPC_INST_NOP,
479 .regs = {
480 .gpr[0] = LONG_MAX,
481 }
482 }
483 }
484 },
485 {
486 .mnemonic = "add",
487 .subtests = {
488 {
489 .descr = "RA = LONG_MIN, RB = LONG_MIN",
490 .instr = TEST_ADD(20, 21, 22),
491 .regs = {
492 .gpr[21] = LONG_MIN,
493 .gpr[22] = LONG_MIN,
494 }
495 },
496 {
497 .descr = "RA = LONG_MIN, RB = LONG_MAX",
498 .instr = TEST_ADD(20, 21, 22),
499 .regs = {
500 .gpr[21] = LONG_MIN,
501 .gpr[22] = LONG_MAX,
502 }
503 },
504 {
505 .descr = "RA = LONG_MAX, RB = LONG_MAX",
506 .instr = TEST_ADD(20, 21, 22),
507 .regs = {
508 .gpr[21] = LONG_MAX,
509 .gpr[22] = LONG_MAX,
510 }
511 },
512 {
513 .descr = "RA = ULONG_MAX, RB = ULONG_MAX",
514 .instr = TEST_ADD(20, 21, 22),
515 .regs = {
516 .gpr[21] = ULONG_MAX,
517 .gpr[22] = ULONG_MAX,
518 }
519 },
520 {
521 .descr = "RA = ULONG_MAX, RB = 0x1",
522 .instr = TEST_ADD(20, 21, 22),
523 .regs = {
524 .gpr[21] = ULONG_MAX,
525 .gpr[22] = 0x1,
526 }
527 },
528 {
529 .descr = "RA = INT_MIN, RB = INT_MIN",
530 .instr = TEST_ADD(20, 21, 22),
531 .regs = {
532 .gpr[21] = INT_MIN,
533 .gpr[22] = INT_MIN,
534 }
535 },
536 {
537 .descr = "RA = INT_MIN, RB = INT_MAX",
538 .instr = TEST_ADD(20, 21, 22),
539 .regs = {
540 .gpr[21] = INT_MIN,
541 .gpr[22] = INT_MAX,
542 }
543 },
544 {
545 .descr = "RA = INT_MAX, RB = INT_MAX",
546 .instr = TEST_ADD(20, 21, 22),
547 .regs = {
548 .gpr[21] = INT_MAX,
549 .gpr[22] = INT_MAX,
550 }
551 },
552 {
553 .descr = "RA = UINT_MAX, RB = UINT_MAX",
554 .instr = TEST_ADD(20, 21, 22),
555 .regs = {
556 .gpr[21] = UINT_MAX,
557 .gpr[22] = UINT_MAX,
558 }
559 },
560 {
561 .descr = "RA = UINT_MAX, RB = 0x1",
562 .instr = TEST_ADD(20, 21, 22),
563 .regs = {
564 .gpr[21] = UINT_MAX,
565 .gpr[22] = 0x1,
566 }
567 }
568 }
569 },
570 {
571 .mnemonic = "add.",
572 .subtests = {
573 {
574 .descr = "RA = LONG_MIN, RB = LONG_MIN",
575 .flags = IGNORE_CCR,
576 .instr = TEST_ADD_DOT(20, 21, 22),
577 .regs = {
578 .gpr[21] = LONG_MIN,
579 .gpr[22] = LONG_MIN,
580 }
581 },
582 {
583 .descr = "RA = LONG_MIN, RB = LONG_MAX",
584 .instr = TEST_ADD_DOT(20, 21, 22),
585 .regs = {
586 .gpr[21] = LONG_MIN,
587 .gpr[22] = LONG_MAX,
588 }
589 },
590 {
591 .descr = "RA = LONG_MAX, RB = LONG_MAX",
592 .flags = IGNORE_CCR,
593 .instr = TEST_ADD_DOT(20, 21, 22),
594 .regs = {
595 .gpr[21] = LONG_MAX,
596 .gpr[22] = LONG_MAX,
597 }
598 },
599 {
600 .descr = "RA = ULONG_MAX, RB = ULONG_MAX",
601 .instr = TEST_ADD_DOT(20, 21, 22),
602 .regs = {
603 .gpr[21] = ULONG_MAX,
604 .gpr[22] = ULONG_MAX,
605 }
606 },
607 {
608 .descr = "RA = ULONG_MAX, RB = 0x1",
609 .instr = TEST_ADD_DOT(20, 21, 22),
610 .regs = {
611 .gpr[21] = ULONG_MAX,
612 .gpr[22] = 0x1,
613 }
614 },
615 {
616 .descr = "RA = INT_MIN, RB = INT_MIN",
617 .instr = TEST_ADD_DOT(20, 21, 22),
618 .regs = {
619 .gpr[21] = INT_MIN,
620 .gpr[22] = INT_MIN,
621 }
622 },
623 {
624 .descr = "RA = INT_MIN, RB = INT_MAX",
625 .instr = TEST_ADD_DOT(20, 21, 22),
626 .regs = {
627 .gpr[21] = INT_MIN,
628 .gpr[22] = INT_MAX,
629 }
630 },
631 {
632 .descr = "RA = INT_MAX, RB = INT_MAX",
633 .instr = TEST_ADD_DOT(20, 21, 22),
634 .regs = {
635 .gpr[21] = INT_MAX,
636 .gpr[22] = INT_MAX,
637 }
638 },
639 {
640 .descr = "RA = UINT_MAX, RB = UINT_MAX",
641 .instr = TEST_ADD_DOT(20, 21, 22),
642 .regs = {
643 .gpr[21] = UINT_MAX,
644 .gpr[22] = UINT_MAX,
645 }
646 },
647 {
648 .descr = "RA = UINT_MAX, RB = 0x1",
649 .instr = TEST_ADD_DOT(20, 21, 22),
650 .regs = {
651 .gpr[21] = UINT_MAX,
652 .gpr[22] = 0x1,
653 }
654 }
655 }
656 },
657 {
658 .mnemonic = "addc",
659 .subtests = {
660 {
661 .descr = "RA = LONG_MIN, RB = LONG_MIN",
662 .instr = TEST_ADDC(20, 21, 22),
663 .regs = {
664 .gpr[21] = LONG_MIN,
665 .gpr[22] = LONG_MIN,
666 }
667 },
668 {
669 .descr = "RA = LONG_MIN, RB = LONG_MAX",
670 .instr = TEST_ADDC(20, 21, 22),
671 .regs = {
672 .gpr[21] = LONG_MIN,
673 .gpr[22] = LONG_MAX,
674 }
675 },
676 {
677 .descr = "RA = LONG_MAX, RB = LONG_MAX",
678 .instr = TEST_ADDC(20, 21, 22),
679 .regs = {
680 .gpr[21] = LONG_MAX,
681 .gpr[22] = LONG_MAX,
682 }
683 },
684 {
685 .descr = "RA = ULONG_MAX, RB = ULONG_MAX",
686 .instr = TEST_ADDC(20, 21, 22),
687 .regs = {
688 .gpr[21] = ULONG_MAX,
689 .gpr[22] = ULONG_MAX,
690 }
691 },
692 {
693 .descr = "RA = ULONG_MAX, RB = 0x1",
694 .instr = TEST_ADDC(20, 21, 22),
695 .regs = {
696 .gpr[21] = ULONG_MAX,
697 .gpr[22] = 0x1,
698 }
699 },
700 {
701 .descr = "RA = INT_MIN, RB = INT_MIN",
702 .instr = TEST_ADDC(20, 21, 22),
703 .regs = {
704 .gpr[21] = INT_MIN,
705 .gpr[22] = INT_MIN,
706 }
707 },
708 {
709 .descr = "RA = INT_MIN, RB = INT_MAX",
710 .instr = TEST_ADDC(20, 21, 22),
711 .regs = {
712 .gpr[21] = INT_MIN,
713 .gpr[22] = INT_MAX,
714 }
715 },
716 {
717 .descr = "RA = INT_MAX, RB = INT_MAX",
718 .instr = TEST_ADDC(20, 21, 22),
719 .regs = {
720 .gpr[21] = INT_MAX,
721 .gpr[22] = INT_MAX,
722 }
723 },
724 {
725 .descr = "RA = UINT_MAX, RB = UINT_MAX",
726 .instr = TEST_ADDC(20, 21, 22),
727 .regs = {
728 .gpr[21] = UINT_MAX,
729 .gpr[22] = UINT_MAX,
730 }
731 },
732 {
733 .descr = "RA = UINT_MAX, RB = 0x1",
734 .instr = TEST_ADDC(20, 21, 22),
735 .regs = {
736 .gpr[21] = UINT_MAX,
737 .gpr[22] = 0x1,
738 }
739 },
740 {
741 .descr = "RA = LONG_MIN | INT_MIN, RB = LONG_MIN | INT_MIN",
742 .instr = TEST_ADDC(20, 21, 22),
743 .regs = {
744 .gpr[21] = LONG_MIN | (uint)INT_MIN,
745 .gpr[22] = LONG_MIN | (uint)INT_MIN,
746 }
747 }
748 }
749 },
750 {
751 .mnemonic = "addc.",
752 .subtests = {
753 {
754 .descr = "RA = LONG_MIN, RB = LONG_MIN",
755 .flags = IGNORE_CCR,
756 .instr = TEST_ADDC_DOT(20, 21, 22),
757 .regs = {
758 .gpr[21] = LONG_MIN,
759 .gpr[22] = LONG_MIN,
760 }
761 },
762 {
763 .descr = "RA = LONG_MIN, RB = LONG_MAX",
764 .instr = TEST_ADDC_DOT(20, 21, 22),
765 .regs = {
766 .gpr[21] = LONG_MIN,
767 .gpr[22] = LONG_MAX,
768 }
769 },
770 {
771 .descr = "RA = LONG_MAX, RB = LONG_MAX",
772 .flags = IGNORE_CCR,
773 .instr = TEST_ADDC_DOT(20, 21, 22),
774 .regs = {
775 .gpr[21] = LONG_MAX,
776 .gpr[22] = LONG_MAX,
777 }
778 },
779 {
780 .descr = "RA = ULONG_MAX, RB = ULONG_MAX",
781 .instr = TEST_ADDC_DOT(20, 21, 22),
782 .regs = {
783 .gpr[21] = ULONG_MAX,
784 .gpr[22] = ULONG_MAX,
785 }
786 },
787 {
788 .descr = "RA = ULONG_MAX, RB = 0x1",
789 .instr = TEST_ADDC_DOT(20, 21, 22),
790 .regs = {
791 .gpr[21] = ULONG_MAX,
792 .gpr[22] = 0x1,
793 }
794 },
795 {
796 .descr = "RA = INT_MIN, RB = INT_MIN",
797 .instr = TEST_ADDC_DOT(20, 21, 22),
798 .regs = {
799 .gpr[21] = INT_MIN,
800 .gpr[22] = INT_MIN,
801 }
802 },
803 {
804 .descr = "RA = INT_MIN, RB = INT_MAX",
805 .instr = TEST_ADDC_DOT(20, 21, 22),
806 .regs = {
807 .gpr[21] = INT_MIN,
808 .gpr[22] = INT_MAX,
809 }
810 },
811 {
812 .descr = "RA = INT_MAX, RB = INT_MAX",
813 .instr = TEST_ADDC_DOT(20, 21, 22),
814 .regs = {
815 .gpr[21] = INT_MAX,
816 .gpr[22] = INT_MAX,
817 }
818 },
819 {
820 .descr = "RA = UINT_MAX, RB = UINT_MAX",
821 .instr = TEST_ADDC_DOT(20, 21, 22),
822 .regs = {
823 .gpr[21] = UINT_MAX,
824 .gpr[22] = UINT_MAX,
825 }
826 },
827 {
828 .descr = "RA = UINT_MAX, RB = 0x1",
829 .instr = TEST_ADDC_DOT(20, 21, 22),
830 .regs = {
831 .gpr[21] = UINT_MAX,
832 .gpr[22] = 0x1,
833 }
834 },
835 {
836 .descr = "RA = LONG_MIN | INT_MIN, RB = LONG_MIN | INT_MIN",
837 .instr = TEST_ADDC_DOT(20, 21, 22),
838 .regs = {
839 .gpr[21] = LONG_MIN | (uint)INT_MIN,
840 .gpr[22] = LONG_MIN | (uint)INT_MIN,
841 }
842 }
843 }
844 }
845};
846
847static int __init emulate_compute_instr(struct pt_regs *regs,
848 unsigned int instr)
849{
850 struct instruction_op op;
851
852 if (!regs || !instr)
853 return -EINVAL;
854
855 if (analyse_instr(&op, regs, instr) != 1 ||
856 GETTYPE(op.type) != COMPUTE) {
857 pr_info("emulation failed, instruction = 0x%08x\n", instr);
858 return -EFAULT;
859 }
860
861 emulate_update_regs(regs, &op);
862 return 0;
863}
864
865static int __init execute_compute_instr(struct pt_regs *regs,
866 unsigned int instr)
867{
868 extern int exec_instr(struct pt_regs *regs);
869 extern s32 patch__exec_instr;
870
871 if (!regs || !instr)
872 return -EINVAL;
873
874 /* Patch the NOP with the actual instruction */
875 patch_instruction_site(&patch__exec_instr, instr);
876 if (exec_instr(regs)) {
877 pr_info("execution failed, instruction = 0x%08x\n", instr);
878 return -EFAULT;
879 }
880
881 return 0;
882}
883
884#define gpr_mismatch(gprn, exp, got) \
885 pr_info("GPR%u mismatch, exp = 0x%016lx, got = 0x%016lx\n", \
886 gprn, exp, got)
887
888#define reg_mismatch(name, exp, got) \
889 pr_info("%s mismatch, exp = 0x%016lx, got = 0x%016lx\n", \
890 name, exp, got)
891
892static void __init run_tests_compute(void)
893{
894 unsigned long flags;
895 struct compute_test *test;
896 struct pt_regs *regs, exp, got;
897 unsigned int i, j, k, instr;
898 bool ignore_gpr, ignore_xer, ignore_ccr, passed;
899
900 for (i = 0; i < ARRAY_SIZE(compute_tests); i++) {
901 test = &compute_tests[i];
902
903 for (j = 0; j < MAX_SUBTESTS && test->subtests[j].descr; j++) {
904 instr = test->subtests[j].instr;
905 flags = test->subtests[j].flags;
906 regs = &test->subtests[j].regs;
907 ignore_xer = flags & IGNORE_XER;
908 ignore_ccr = flags & IGNORE_CCR;
909 passed = true;
910
911 memcpy(&exp, regs, sizeof(struct pt_regs));
912 memcpy(&got, regs, sizeof(struct pt_regs));
913
914 /*
915 * Set a compatible MSR value explicitly to ensure
916 * that XER and CR bits are updated appropriately
917 */
918 exp.msr = MSR_KERNEL;
919 got.msr = MSR_KERNEL;
920
921 if (emulate_compute_instr(&got, instr) ||
922 execute_compute_instr(&exp, instr)) {
923 passed = false;
924 goto print;
925 }
926
927 /* Verify GPR values */
928 for (k = 0; k < 32; k++) {
929 ignore_gpr = flags & IGNORE_GPR(k);
930 if (!ignore_gpr && exp.gpr[k] != got.gpr[k]) {
931 passed = false;
932 gpr_mismatch(k, exp.gpr[k], got.gpr[k]);
933 }
934 }
935
936 /* Verify LR value */
937 if (exp.link != got.link) {
938 passed = false;
939 reg_mismatch("LR", exp.link, got.link);
940 }
941
942 /* Verify XER value */
943 if (!ignore_xer && exp.xer != got.xer) {
944 passed = false;
945 reg_mismatch("XER", exp.xer, got.xer);
946 }
947
948 /* Verify CR value */
949 if (!ignore_ccr && exp.ccr != got.ccr) {
950 passed = false;
951 reg_mismatch("CR", exp.ccr, got.ccr);
952 }
953
954print:
955 show_result_with_descr(test->mnemonic,
956 test->subtests[j].descr,
957 passed ? "PASS" : "FAIL");
958 }
959 }
960}
961
962static int __init test_emulate_step(void)
963{
964 printk(KERN_INFO "Running instruction emulation self-tests ...\n");
965 run_tests_load_store();
966 run_tests_compute();
440 967
441 return 0; 968 return 0;
442} 969}
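Given the show_result_with_descr() format string ("%-14s : %-50s %s"), each compute subtest logs one line at boot; illustrative output (not captured from a real run) would look like:

    add            : RA = LONG_MIN, RB = LONG_MIN                       PASS
    addc.          : RA = ULONG_MAX, RB = 0x1                           PASS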
diff --git a/arch/powerpc/lib/test_emulate_step_exec_instr.S b/arch/powerpc/lib/test_emulate_step_exec_instr.S
new file mode 100644
index 000000000000..1580f34f4f4f
--- /dev/null
+++ b/arch/powerpc/lib/test_emulate_step_exec_instr.S
@@ -0,0 +1,150 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * Non-emulated single-stepping support (currently limited to basic integer
4 * computations) used to validate the instruction emulation infrastructure.
5 *
6 * Copyright (C) 2019 IBM Corporation
7 */
8
9#include <asm/asm-offsets.h>
10#include <asm/ppc_asm.h>
11#include <asm/code-patching-asm.h>
12#include <linux/errno.h>
13
14/* int exec_instr(struct pt_regs *regs) */
15_GLOBAL(exec_instr)
16
17 /*
18 * Stack frame layout (INT_FRAME_SIZE bytes)
19 * In-memory pt_regs (SP + STACK_FRAME_OVERHEAD)
20 * Scratch space (SP + 8)
21 * Back chain (SP + 0)
22 */
23
24 /*
25 * Allocate a new stack frame with enough space to hold the register
26 * states in an in-memory pt_regs and also create the back chain to
27 * the caller's stack frame.
28 */
29 stdu r1, -INT_FRAME_SIZE(r1)
30
31 /*
32 * Save non-volatile GPRs on stack. This includes TOC pointer (GPR2)
33 * and local variables (GPR14 to GPR31). The register for the pt_regs
34 * parameter (GPR3) is saved additionally to ensure that the resulting
35 * register state can still be saved even if GPR3 gets overwritten
36 * when loading the initial register state for the test instruction.
37 * The stack pointer (GPR1) and the thread pointer (GPR13) are not
38 * saved as these should not be modified anyway.
39 */
40 SAVE_2GPRS(2, r1)
41 SAVE_NVGPRS(r1)
42
43 /*
44 * Save LR on stack to ensure that the return address is available
45 * even if it gets overwritten by the test instruction.
46 */
47 mflr r0
48 std r0, _LINK(r1)
49
50 /*
51 * Save CR on stack. For simplicity, the entire register is saved
52 * even though only fields 2 to 4 are non-volatile.
53 */
54 mfcr r0
55 std r0, _CCR(r1)
56
57 /*
58 * Load register state for the test instruction without touching the
59 * critical non-volatile registers. The register state is passed as a
60 * pointer to a pt_regs instance.
61 */
62 subi r31, r3, GPR0
63
64 /* Load LR from pt_regs */
65 ld r0, _LINK(r31)
66 mtlr r0
67
68 /* Load CR from pt_regs */
69 ld r0, _CCR(r31)
70 mtcr r0
71
72 /* Load XER from pt_regs */
73 ld r0, _XER(r31)
74 mtxer r0
75
76 /* Load GPRs from pt_regs */
77 REST_GPR(0, r31)
78 REST_10GPRS(2, r31)
79 REST_GPR(12, r31)
80 REST_NVGPRS(r31)
81
82 /* Placeholder for the test instruction */
831: nop
84 patch_site 1b patch__exec_instr
85
86 /*
87 * Since GPR3 is overwritten, temporarily restore it to its
88 * original state, i.e. the pointer to pt_regs, so that the
89 * resulting register state can be saved. Its post-execution value
90 * is first copied to the scratch space, from where it is later
91 * written back to pt_regs.
92 */
93 std r3, 8(r1)
94 REST_GPR(3, r1)
95
96 /* Save resulting GPR state to pt_regs */
97 subi r3, r3, GPR0
98 SAVE_GPR(0, r3)
99 SAVE_GPR(2, r3)
100 SAVE_8GPRS(4, r3)
101 SAVE_GPR(12, r3)
102 SAVE_NVGPRS(r3)
103
104 /* Save resulting LR to pt_regs */
105 mflr r0
106 std r0, _LINK(r3)
107
108 /* Save resulting CR to pt_regs */
109 mfcr r0
110 std r0, _CCR(r3)
111
112 /* Save resulting XER to pt_regs */
113 mfxer r0
114 std r0, _XER(r3)
115
116 /* Restore resulting GPR3 from scratch space and save it to pt_regs */
117 ld r0, 8(r1)
118 std r0, GPR3(r3)
119
120 /* Set return value to denote execution success */
121 li r3, 0
122
123 /* Continue */
124 b 3f
125
126 /* Set return value to denote execution failure */
1272: li r3, -EFAULT
128
129 /* Restore the non-volatile GPRs from stack */
1303: REST_GPR(2, r1)
131 REST_NVGPRS(r1)
132
133 /* Restore LR from stack to be able to return */
134 ld r0, _LINK(r1)
135 mtlr r0
136
137 /* Restore CR from stack */
138 ld r0, _CCR(r1)
139 mtcr r0
140
141 /* Tear down stack frame */
142 addi r1, r1, INT_FRAME_SIZE
143
144 /* Return */
145 blr
146
147 /* Set up the exception table */
148 EX_TABLE(1b, 2b)
149
150_ASM_NOKPROBE_SYMBOL(exec_instr)
diff --git a/arch/powerpc/math-emu/Makefile b/arch/powerpc/math-emu/Makefile
index 494df26c5988..a8794032f15f 100644
--- a/arch/powerpc/math-emu/Makefile
+++ b/arch/powerpc/math-emu/Makefile
@@ -17,4 +17,4 @@ obj-$(CONFIG_SPE) += math_efp.o
17CFLAGS_fabs.o = -fno-builtin-fabs 17CFLAGS_fabs.o = -fno-builtin-fabs
18CFLAGS_math.o = -fno-builtin-fabs 18CFLAGS_math.o = -fno-builtin-fabs
19 19
20ccflags-y = -I. -Iinclude/math-emu -w 20ccflags-y = -w
diff --git a/arch/powerpc/mm/40x_mmu.c b/arch/powerpc/mm/40x_mmu.c
index 61ac468c87c6..b9cf6f8764b0 100644
--- a/arch/powerpc/mm/40x_mmu.c
+++ b/arch/powerpc/mm/40x_mmu.c
@@ -93,7 +93,7 @@ void __init MMU_init_hw(void)
93#define LARGE_PAGE_SIZE_16M (1<<24) 93#define LARGE_PAGE_SIZE_16M (1<<24)
94#define LARGE_PAGE_SIZE_4M (1<<22) 94#define LARGE_PAGE_SIZE_4M (1<<22)
95 95
96unsigned long __init mmu_mapin_ram(unsigned long top) 96unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
97{ 97{
98 unsigned long v, s, mapped; 98 unsigned long v, s, mapped;
99 phys_addr_t p; 99 phys_addr_t p;
diff --git a/arch/powerpc/mm/44x_mmu.c b/arch/powerpc/mm/44x_mmu.c
index ea2b9af08a48..aad127acdbaa 100644
--- a/arch/powerpc/mm/44x_mmu.c
+++ b/arch/powerpc/mm/44x_mmu.c
@@ -170,7 +170,7 @@ void __init MMU_init_hw(void)
170 flush_instruction_cache(); 170 flush_instruction_cache();
171} 171}
172 172
173unsigned long __init mmu_mapin_ram(unsigned long top) 173unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
174{ 174{
175 unsigned long addr; 175 unsigned long addr;
176 unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1); 176 unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1);
diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c
index bfa503cff351..fe1f6443d57f 100644
--- a/arch/powerpc/mm/8xx_mmu.c
+++ b/arch/powerpc/mm/8xx_mmu.c
@@ -66,26 +66,22 @@ unsigned long p_block_mapped(phys_addr_t pa)
66void __init MMU_init_hw(void) 66void __init MMU_init_hw(void)
67{ 67{
68 /* Pin up to the first three 8MB pages after IMMR in the DTLB table */ 68 /* Pin up to the first three 8MB pages after IMMR in the DTLB table */
69#ifdef CONFIG_PIN_TLB_DATA 69 if (IS_ENABLED(CONFIG_PIN_TLB_DATA)) {
70 unsigned long ctr = mfspr(SPRN_MD_CTR) & 0xfe000000; 70 unsigned long ctr = mfspr(SPRN_MD_CTR) & 0xfe000000;
71 unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY; 71 unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY;
72#ifdef CONFIG_PIN_TLB_IMMR 72 int i = IS_ENABLED(CONFIG_PIN_TLB_IMMR) ? 29 : 28;
73 int i = 29; 73 unsigned long addr = 0;
74#else 74 unsigned long mem = total_lowmem;
75 int i = 28; 75
76#endif 76 for (; i < 32 && mem >= LARGE_PAGE_SIZE_8M; i++) {
77 unsigned long addr = 0; 77 mtspr(SPRN_MD_CTR, ctr | (i << 8));
78 unsigned long mem = total_lowmem; 78 mtspr(SPRN_MD_EPN, (unsigned long)__va(addr) | MD_EVALID);
79 79 mtspr(SPRN_MD_TWC, MD_PS8MEG | MD_SVALID);
80 for (; i < 32 && mem >= LARGE_PAGE_SIZE_8M; i++) { 80 mtspr(SPRN_MD_RPN, addr | flags | _PAGE_PRESENT);
81 mtspr(SPRN_MD_CTR, ctr | (i << 8)); 81 addr += LARGE_PAGE_SIZE_8M;
82 mtspr(SPRN_MD_EPN, (unsigned long)__va(addr) | MD_EVALID); 82 mem -= LARGE_PAGE_SIZE_8M;
83 mtspr(SPRN_MD_TWC, MD_PS8MEG | MD_SVALID); 83 }
84 mtspr(SPRN_MD_RPN, addr | flags | _PAGE_PRESENT);
85 addr += LARGE_PAGE_SIZE_8M;
86 mem -= LARGE_PAGE_SIZE_8M;
87 } 84 }
88#endif
89} 85}
90 86
91static void __init mmu_mapin_immr(void) 87static void __init mmu_mapin_immr(void)
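The conversion above swaps #ifdef for IS_ENABLED(), the usual kernel idiom: the condition is a compile-time constant, so the disabled branch is dead-code-eliminated, but unlike preprocessor exclusion the body is still parsed and type-checked in every configuration. The shape of the idiom, with CONFIG_FOO and the two helpers standing in for real names:

    if (IS_ENABLED(CONFIG_FOO))
            setup_foo();        /* still compiled, discarded when FOO is off */
    else
            setup_default();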
@@ -98,26 +94,36 @@ static void __init mmu_mapin_immr(void)
98 map_kernel_page(v + offset, p + offset, PAGE_KERNEL_NCG); 94 map_kernel_page(v + offset, p + offset, PAGE_KERNEL_NCG);
99} 95}
100 96
101static void __init mmu_patch_cmp_limit(s32 *site, unsigned long mapped) 97static void mmu_patch_cmp_limit(s32 *site, unsigned long mapped)
102{ 98{
103 modify_instruction_site(site, 0xffff, (unsigned long)__va(mapped) >> 16); 99 modify_instruction_site(site, 0xffff, (unsigned long)__va(mapped) >> 16);
104} 100}
105 101
106unsigned long __init mmu_mapin_ram(unsigned long top) 102static void mmu_patch_addis(s32 *site, long simm)
103{
104 unsigned int instr = *(unsigned int *)patch_site_addr(site);
105
106 instr &= 0xffff0000;
107 instr |= ((unsigned long)simm) >> 16;
108 patch_instruction_site(site, instr);
109}
110
111unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
107{ 112{
108 unsigned long mapped; 113 unsigned long mapped;
109 114
110 if (__map_without_ltlbs) { 115 if (__map_without_ltlbs) {
111 mapped = 0; 116 mapped = 0;
112 mmu_mapin_immr(); 117 mmu_mapin_immr();
113#ifndef CONFIG_PIN_TLB_IMMR 118 if (!IS_ENABLED(CONFIG_PIN_TLB_IMMR))
114 patch_instruction_site(&patch__dtlbmiss_immr_jmp, PPC_INST_NOP); 119 patch_instruction_site(&patch__dtlbmiss_immr_jmp, PPC_INST_NOP);
115#endif 120 if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT))
116#ifndef CONFIG_PIN_TLB_TEXT 121 mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, 0);
117 mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, 0);
118#endif
119 } else { 122 } else {
120 mapped = top & ~(LARGE_PAGE_SIZE_8M - 1); 123 mapped = top & ~(LARGE_PAGE_SIZE_8M - 1);
124 if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT))
125 mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top,
126 _ALIGN(__pa(_einittext), 8 << 20));
121 } 127 }
122 128
123 mmu_patch_cmp_limit(&patch__dtlbmiss_linmem_top, mapped); 129 mmu_patch_cmp_limit(&patch__dtlbmiss_linmem_top, mapped);
@@ -138,6 +144,26 @@ unsigned long __init mmu_mapin_ram(unsigned long top)
138 return mapped; 144 return mapped;
139} 145}
140 146
147void mmu_mark_initmem_nx(void)
148{
149 if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) && CONFIG_ETEXT_SHIFT < 23)
150 mmu_patch_addis(&patch__itlbmiss_linmem_top8,
151 -((long)_etext & ~(LARGE_PAGE_SIZE_8M - 1)));
152 if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT))
153 mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, __pa(_etext));
154}
155
156#ifdef CONFIG_STRICT_KERNEL_RWX
157void mmu_mark_rodata_ro(void)
158{
159 if (CONFIG_DATA_SHIFT < 23)
160 mmu_patch_addis(&patch__dtlbmiss_romem_top8,
161 -__pa(((unsigned long)_sinittext) &
162 ~(LARGE_PAGE_SIZE_8M - 1)));
163 mmu_patch_addis(&patch__dtlbmiss_romem_top, -__pa(_sinittext));
164}
165#endif
166
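mmu_patch_addis() rewrites only the low 16 bits (the simm field) of an addis at a patch site, keeping the opcode and register fields. A worked example of the arithmetic, using a hypothetical site containing "lis r9, 0" and assuming _etext rounds down to 0x01800000 (32-bit arithmetic, as on the 8xx where unsigned long is 32 bits):

    #include <stdint.h>

    static uint32_t patch_addis_imm(uint32_t instr, int32_t simm)
    {
            /* keep opcode + registers, swap in the high half of simm */
            return (instr & 0xffff0000) | ((uint32_t)simm >> 16);
    }
    /* e.g. patch_addis_imm(0x3d200000, -0x01800000) == 0x3d20fe80:
     * "lis r9, 0" becomes "lis r9, 0xfe80", which adds -0x01800000 */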
141void __init setup_initial_memory_limit(phys_addr_t first_memblock_base, 167void __init setup_initial_memory_limit(phys_addr_t first_memblock_base,
142 phys_addr_t first_memblock_size) 168 phys_addr_t first_memblock_size)
143{ 169{
@@ -146,8 +172,8 @@ void __init setup_initial_memory_limit(phys_addr_t first_memblock_base,
146 */ 172 */
147 BUG_ON(first_memblock_base != 0); 173 BUG_ON(first_memblock_base != 0);
148 174
149 /* 8xx can only access 24MB at the moment */ 175 /* 8xx can only access 32MB at the moment */
150 memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01800000)); 176 memblock_set_current_limit(min_t(u64, first_memblock_size, 0x02000000));
151} 177}
152 178
153/* 179/*
@@ -162,14 +188,11 @@ void set_context(unsigned long id, pgd_t *pgd)
162{ 188{
163 s16 offset = (s16)(__pa(swapper_pg_dir)); 189 s16 offset = (s16)(__pa(swapper_pg_dir));
164 190
165#ifdef CONFIG_BDI_SWITCH
166 pgd_t **ptr = *(pgd_t ***)(KERNELBASE + 0xf0);
167
168 /* Context switch the PTE pointer for the Abatron BDI2000. 191 /* Context switch the PTE pointer for the Abatron BDI2000.
169 * The PGDIR is passed as second argument. 192 * The PGDIR is passed as second argument.
170 */ 193 */
171 *(ptr + 1) = pgd; 194 if (IS_ENABLED(CONFIG_BDI_SWITCH))
172#endif 195 abatron_pteptrs[1] = pgd;
173 196
174 /* Register M_TWB will contain base address of level 1 table minus the 197 /* Register M_TWB will contain base address of level 1 table minus the
175 * lower part of the kernel PGDIR base address, so that all accesses to 198 * lower part of the kernel PGDIR base address, so that all accesses to
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index f965fc33a8b7..d52ec118e09d 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -45,13 +45,10 @@ obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
45obj-$(CONFIG_HIGHMEM) += highmem.o 45obj-$(CONFIG_HIGHMEM) += highmem.o
46obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o 46obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o
47obj-$(CONFIG_SPAPR_TCE_IOMMU) += mmu_context_iommu.o 47obj-$(CONFIG_SPAPR_TCE_IOMMU) += mmu_context_iommu.o
48obj-$(CONFIG_PPC_PTDUMP) += dump_linuxpagetables.o 48obj-$(CONFIG_PPC_PTDUMP) += ptdump/
49ifdef CONFIG_PPC_PTDUMP
50obj-$(CONFIG_4xx) += dump_linuxpagetables-generic.o
51obj-$(CONFIG_PPC_8xx) += dump_linuxpagetables-8xx.o
52obj-$(CONFIG_PPC_BOOK3E_MMU) += dump_linuxpagetables-generic.o
53obj-$(CONFIG_PPC_BOOK3S_32) += dump_linuxpagetables-generic.o dump_bats.o dump_sr.o
54obj-$(CONFIG_PPC_BOOK3S_64) += dump_linuxpagetables-book3s64.o
55endif
56obj-$(CONFIG_PPC_HTDUMP) += dump_hashpagetable.o
57obj-$(CONFIG_PPC_MEM_KEYS) += pkeys.o 49obj-$(CONFIG_PPC_MEM_KEYS) += pkeys.o
50
51# Disable kcov instrumentation on sensitive code
52# This is necessary for booting with kcov enabled on book3e machines
53KCOV_INSTRUMENT_tlb_nohash.o := n
54KCOV_INSTRUMENT_fsl_booke_mmu.o := n
diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c
index e955539686a4..b5d2658c26af 100644
--- a/arch/powerpc/mm/dma-noncoherent.c
+++ b/arch/powerpc/mm/dma-noncoherent.c
@@ -30,6 +30,7 @@
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/highmem.h> 31#include <linux/highmem.h>
32#include <linux/dma-direct.h> 32#include <linux/dma-direct.h>
33#include <linux/dma-noncoherent.h>
33#include <linux/export.h> 34#include <linux/export.h>
34 35
35#include <asm/tlbflush.h> 36#include <asm/tlbflush.h>
@@ -151,8 +152,8 @@ static struct ppc_vm_region *ppc_vm_region_find(struct ppc_vm_region *head, unsi
151 * Allocate DMA-coherent memory space and return both the kernel remapped 152 * Allocate DMA-coherent memory space and return both the kernel remapped
152 * virtual and bus address for that space. 153 * virtual and bus address for that space.
153 */ 154 */
154void *__dma_nommu_alloc_coherent(struct device *dev, size_t size, 155void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
155 dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) 156 gfp_t gfp, unsigned long attrs)
156{ 157{
157 struct page *page; 158 struct page *page;
158 struct ppc_vm_region *c; 159 struct ppc_vm_region *c;
@@ -253,7 +254,7 @@ void *__dma_nommu_alloc_coherent(struct device *dev, size_t size,
253/* 254/*
254 * free a page as defined by the above mapping. 255 * free a page as defined by the above mapping.
255 */ 256 */
256void __dma_nommu_free_coherent(struct device *dev, size_t size, void *vaddr, 257void arch_dma_free(struct device *dev, size_t size, void *vaddr,
257 dma_addr_t dma_handle, unsigned long attrs) 258 dma_addr_t dma_handle, unsigned long attrs)
258{ 259{
259 struct ppc_vm_region *c; 260 struct ppc_vm_region *c;
@@ -313,7 +314,7 @@ void __dma_nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
313/* 314/*
314 * make an area consistent. 315 * make an area consistent.
315 */ 316 */
316void __dma_sync(void *vaddr, size_t size, int direction) 317static void __dma_sync(void *vaddr, size_t size, int direction)
317{ 318{
318 unsigned long start = (unsigned long)vaddr; 319 unsigned long start = (unsigned long)vaddr;
319 unsigned long end = start + size; 320 unsigned long end = start + size;
@@ -339,7 +340,6 @@ void __dma_sync(void *vaddr, size_t size, int direction)
339 break; 340 break;
340 } 341 }
341} 342}
342EXPORT_SYMBOL(__dma_sync);
343 343
344#ifdef CONFIG_HIGHMEM 344#ifdef CONFIG_HIGHMEM
345/* 345/*
@@ -386,28 +386,42 @@ static inline void __dma_sync_page_highmem(struct page *page,
386 * __dma_sync_page makes memory consistent. identical to __dma_sync, but 386 * __dma_sync_page makes memory consistent. identical to __dma_sync, but
387 * takes a struct page instead of a virtual address 387 * takes a struct page instead of a virtual address
388 */ 388 */
389void __dma_sync_page(struct page *page, unsigned long offset, 389static void __dma_sync_page(phys_addr_t paddr, size_t size, int dir)
390 size_t size, int direction)
391{ 390{
391 struct page *page = pfn_to_page(paddr >> PAGE_SHIFT);
392 unsigned offset = paddr & ~PAGE_MASK;
393
392#ifdef CONFIG_HIGHMEM 394#ifdef CONFIG_HIGHMEM
393 __dma_sync_page_highmem(page, offset, size, direction); 395 __dma_sync_page_highmem(page, offset, size, dir);
394#else 396#else
395 unsigned long start = (unsigned long)page_address(page) + offset; 397 unsigned long start = (unsigned long)page_address(page) + offset;
396 __dma_sync((void *)start, size, direction); 398 __dma_sync((void *)start, size, dir);
397#endif 399#endif
398} 400}
399EXPORT_SYMBOL(__dma_sync_page); 401
402void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr,
403 size_t size, enum dma_data_direction dir)
404{
405 __dma_sync_page(paddr, size, dir);
406}
407
408void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr,
409 size_t size, enum dma_data_direction dir)
410{
411 __dma_sync_page(paddr, size, dir);
412}
400 413
401/* 414/*
402 * Return the PFN for a given cpu virtual address returned by 415 * Return the PFN for a given cpu virtual address returned by arch_dma_alloc.
403 * __dma_nommu_alloc_coherent. This is used by dma_mmap_coherent()
404 */ 416 */
405unsigned long __dma_get_coherent_pfn(unsigned long cpu_addr) 417long arch_dma_coherent_to_pfn(struct device *dev, void *vaddr,
418 dma_addr_t dma_addr)
406{ 419{
407 /* This should always be populated, so we don't test every 420 /* This should always be populated, so we don't test every
408 * level. If that fails, we'll have a nice crash which 421 * level. If that fails, we'll have a nice crash which
409 * will be as good as a BUG_ON() 422 * will be as good as a BUG_ON()
410 */ 423 */
424 unsigned long cpu_addr = (unsigned long)vaddr;
411 pgd_t *pgd = pgd_offset_k(cpu_addr); 425 pgd_t *pgd = pgd_offset_k(cpu_addr);
412 pud_t *pud = pud_offset(pgd, cpu_addr); 426 pud_t *pud = pud_offset(pgd, cpu_addr);
413 pmd_t *pmd = pmd_offset(pud, cpu_addr); 427 pmd_t *pmd = pmd_offset(pud, cpu_addr);
diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c
index 080d49b26c3a..210cbc1faf63 100644
--- a/arch/powerpc/mm/fsl_booke_mmu.c
+++ b/arch/powerpc/mm/fsl_booke_mmu.c
@@ -221,7 +221,7 @@ unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx, bool dryrun)
221#error "LOWMEM_CAM_NUM must be less than NUM_TLBCAMS" 221#error "LOWMEM_CAM_NUM must be less than NUM_TLBCAMS"
222#endif 222#endif
223 223
224unsigned long __init mmu_mapin_ram(unsigned long top) 224unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
225{ 225{
226 return tlbcam_addrs[tlbcam_index - 1].limit - PAGE_OFFSET + 1; 226 return tlbcam_addrs[tlbcam_index - 1].limit - PAGE_OFFSET + 1;
227} 227}
diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S
index 1e2df3e9f9ea..1f13494efb2b 100644
--- a/arch/powerpc/mm/hash_low_32.S
+++ b/arch/powerpc/mm/hash_low_32.S
@@ -47,14 +47,13 @@ mmu_hash_lock:
47 * Returns to the caller if the access is illegal or there is no 47 * Returns to the caller if the access is illegal or there is no
48 * mapping for the address. Otherwise it places an appropriate PTE 48 * mapping for the address. Otherwise it places an appropriate PTE
49 * in the hash table and returns from the exception. 49 * in the hash table and returns from the exception.
50 * Uses r0, r3 - r8, r10, ctr, lr. 50 * Uses r0, r3 - r6, r8, r10, ctr, lr.
51 */ 51 */
52 .text 52 .text
53_GLOBAL(hash_page) 53_GLOBAL(hash_page)
54 tophys(r7,0) /* gets -KERNELBASE into r7 */
55#ifdef CONFIG_SMP 54#ifdef CONFIG_SMP
56 addis r8,r7,mmu_hash_lock@h 55 lis r8, (mmu_hash_lock - PAGE_OFFSET)@h
57 ori r8,r8,mmu_hash_lock@l 56 ori r8, r8, (mmu_hash_lock - PAGE_OFFSET)@l
58 lis r0,0x0fff 57 lis r0,0x0fff
59 b 10f 58 b 10f
6011: lwz r6,0(r8) 5911: lwz r6,0(r8)
@@ -70,14 +69,13 @@ _GLOBAL(hash_page)
70 /* Get PTE (linux-style) and check access */ 69 /* Get PTE (linux-style) and check access */
71 lis r0,KERNELBASE@h /* check if kernel address */ 70 lis r0,KERNELBASE@h /* check if kernel address */
72 cmplw 0,r4,r0 71 cmplw 0,r4,r0
73 mfspr r8,SPRN_SPRG_THREAD /* current task's THREAD (phys) */
74 ori r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */ 72 ori r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */
75 lwz r5,PGDIR(r8) /* virt page-table root */ 73 mfspr r5, SPRN_SPRG_PGDIR /* virt page-table root */
76 blt+ 112f /* assume user more likely */ 74 blt+ 112f /* assume user more likely */
77 lis r5,swapper_pg_dir@ha /* if kernel address, use */ 75 lis r5,swapper_pg_dir@ha /* if kernel address, use */
78 addi r5,r5,swapper_pg_dir@l /* kernel page table */ 76 addi r5,r5,swapper_pg_dir@l /* kernel page table */
79 rlwimi r3,r9,32-12,29,29 /* MSR_PR -> _PAGE_USER */ 77 rlwimi r3,r9,32-12,29,29 /* MSR_PR -> _PAGE_USER */
80112: add r5,r5,r7 /* convert to phys addr */ 78112: tophys(r5, r5)
81#ifndef CONFIG_PTE_64BIT 79#ifndef CONFIG_PTE_64BIT
82 rlwimi r5,r4,12,20,29 /* insert top 10 bits of address */ 80 rlwimi r5,r4,12,20,29 /* insert top 10 bits of address */
83 lwz r8,0(r5) /* get pmd entry */ 81 lwz r8,0(r5) /* get pmd entry */
@@ -144,25 +142,24 @@ retry:
144 142
145#ifdef CONFIG_SMP 143#ifdef CONFIG_SMP
146 eieio 144 eieio
147 addis r8,r7,mmu_hash_lock@ha 145 lis r8, (mmu_hash_lock - PAGE_OFFSET)@ha
148 li r0,0 146 li r0,0
149 stw r0,mmu_hash_lock@l(r8) 147 stw r0, (mmu_hash_lock - PAGE_OFFSET)@l(r8)
150#endif 148#endif
151 149
152 /* Return from the exception */ 150 /* Return from the exception */
153 lwz r5,_CTR(r11) 151 lwz r5,_CTR(r11)
154 mtctr r5 152 mtctr r5
155 lwz r0,GPR0(r11) 153 lwz r0,GPR0(r11)
156 lwz r7,GPR7(r11)
157 lwz r8,GPR8(r11) 154 lwz r8,GPR8(r11)
158 b fast_exception_return 155 b fast_exception_return
159 156
160#ifdef CONFIG_SMP 157#ifdef CONFIG_SMP
161hash_page_out: 158hash_page_out:
162 eieio 159 eieio
163 addis r8,r7,mmu_hash_lock@ha 160 lis r8, (mmu_hash_lock - PAGE_OFFSET)@ha
164 li r0,0 161 li r0,0
165 stw r0,mmu_hash_lock@l(r8) 162 stw r0, (mmu_hash_lock - PAGE_OFFSET)@l(r8)
166 blr 163 blr
167#endif /* CONFIG_SMP */ 164#endif /* CONFIG_SMP */
168 165
@@ -186,8 +183,7 @@ _GLOBAL(add_hash_page)
186 add r3,r3,r0 /* note create_hpte trims to 24 bits */ 183 add r3,r3,r0 /* note create_hpte trims to 24 bits */
187 184
188#ifdef CONFIG_SMP 185#ifdef CONFIG_SMP
189 CURRENT_THREAD_INFO(r8, r1) /* use cpu number to make tag */ 186 lwz r8,TASK_CPU(r2) /* to go in mmu_hash_lock */
190 lwz r8,TI_CPU(r8) /* to go in mmu_hash_lock */
191 oris r8,r8,12 187 oris r8,r8,12
192#endif /* CONFIG_SMP */ 188#endif /* CONFIG_SMP */
193 189
@@ -208,11 +204,9 @@ _GLOBAL(add_hash_page)
208 SYNC_601 204 SYNC_601
209 isync 205 isync
210 206
211 tophys(r7,0)
212
213#ifdef CONFIG_SMP 207#ifdef CONFIG_SMP
214 addis r6,r7,mmu_hash_lock@ha 208 lis r6, (mmu_hash_lock - PAGE_OFFSET)@ha
215 addi r6,r6,mmu_hash_lock@l 209 addi r6, r6, (mmu_hash_lock - PAGE_OFFSET)@l
21610: lwarx r0,0,r6 /* take the mmu_hash_lock */ 21010: lwarx r0,0,r6 /* take the mmu_hash_lock */
217 cmpi 0,r0,0 211 cmpi 0,r0,0
218 bne- 11f 212 bne- 11f
@@ -257,8 +251,8 @@ _GLOBAL(add_hash_page)
257 251
2589: 2529:
259#ifdef CONFIG_SMP 253#ifdef CONFIG_SMP
260 addis r6,r7,mmu_hash_lock@ha 254 lis r6, (mmu_hash_lock - PAGE_OFFSET)@ha
261 addi r6,r6,mmu_hash_lock@l 255 addi r6, r6, (mmu_hash_lock - PAGE_OFFSET)@l
262 eieio 256 eieio
263 li r0,0 257 li r0,0
264 stw r0,0(r6) /* clear mmu_hash_lock */ 258 stw r0,0(r6) /* clear mmu_hash_lock */
@@ -278,10 +272,8 @@ _GLOBAL(add_hash_page)
278 * It is designed to be called with the MMU either on or off. 272 * It is designed to be called with the MMU either on or off.
279 * r3 contains the VSID, r4 contains the virtual address, 273 * r3 contains the VSID, r4 contains the virtual address,
280 * r5 contains the linux PTE, r6 contains the old value of the 274 * r5 contains the linux PTE, r6 contains the old value of the
281 * linux PTE (before setting _PAGE_HASHPTE) and r7 contains the 275 * linux PTE (before setting _PAGE_HASHPTE). r10 contains the
282 * offset to be added to addresses (0 if the MMU is on, 276 * upper half of the PTE if CONFIG_PTE_64BIT.
283 * -KERNELBASE if it is off). r10 contains the upper half of
284 * the PTE if CONFIG_PTE_64BIT.
285 * On SMP, the caller should have the mmu_hash_lock held. 277 * On SMP, the caller should have the mmu_hash_lock held.
286 * We assume that the caller has (or will) set the _PAGE_HASHPTE 278 * We assume that the caller has (or will) set the _PAGE_HASHPTE
287 * bit in the linux PTE in memory. The value passed in r6 should 279 * bit in the linux PTE in memory. The value passed in r6 should
@@ -342,7 +334,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT)
342 patch_site 1f, patch__hash_page_A1 334 patch_site 1f, patch__hash_page_A1
343 patch_site 2f, patch__hash_page_A2 335 patch_site 2f, patch__hash_page_A2
344 /* Get the address of the primary PTE group in the hash table (r3) */ 336 /* Get the address of the primary PTE group in the hash table (r3) */
3450: addis r0,r7,Hash_base@h /* base address of hash table */ 3370: lis r0, (Hash_base - PAGE_OFFSET)@h /* base address of hash table */
3461: rlwimi r0,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */ 3381: rlwimi r0,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */
3472: rlwinm r3,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */ 3392: rlwinm r3,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */
348 xor r3,r3,r0 /* make primary hash */ 340 xor r3,r3,r0 /* make primary hash */
@@ -356,10 +348,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT)
356 beq+ 10f /* no PTE: go look for an empty slot */ 348 beq+ 10f /* no PTE: go look for an empty slot */
357 tlbie r4 349 tlbie r4
358 350
359 addis r4,r7,htab_hash_searches@ha 351 lis r4, (htab_hash_searches - PAGE_OFFSET)@ha
360 lwz r6,htab_hash_searches@l(r4) 352 lwz r6, (htab_hash_searches - PAGE_OFFSET)@l(r4)
361 addi r6,r6,1 /* count how many searches we do */ 353 addi r6,r6,1 /* count how many searches we do */
362 stw r6,htab_hash_searches@l(r4) 354 stw r6, (htab_hash_searches - PAGE_OFFSET)@l(r4)
363 355
364 /* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */ 356 /* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */
365 mtctr r0 357 mtctr r0
@@ -391,10 +383,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT)
391 beq+ found_empty 383 beq+ found_empty
392 384
393 /* update counter of times that the primary PTEG is full */ 385 /* update counter of times that the primary PTEG is full */
394 addis r4,r7,primary_pteg_full@ha 386 lis r4, (primary_pteg_full - PAGE_OFFSET)@ha
395 lwz r6,primary_pteg_full@l(r4) 387 lwz r6, (primary_pteg_full - PAGE_OFFSET)@l(r4)
396 addi r6,r6,1 388 addi r6,r6,1
397 stw r6,primary_pteg_full@l(r4) 389 stw r6, (primary_pteg_full - PAGE_OFFSET)@l(r4)
398 390
399 patch_site 0f, patch__hash_page_C 391 patch_site 0f, patch__hash_page_C
400 /* Search the secondary PTEG for an empty slot */ 392 /* Search the secondary PTEG for an empty slot */
@@ -428,8 +420,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT)
428 * lockup here but that shouldn't happen 420 * lockup here but that shouldn't happen
429 */ 421 */
430 422
4311: addis r4,r7,next_slot@ha /* get next evict slot */ 4231: lis r4, (next_slot - PAGE_OFFSET)@ha /* get next evict slot */
432 lwz r6,next_slot@l(r4) 424 lwz r6, (next_slot - PAGE_OFFSET)@l(r4)
433 addi r6,r6,HPTE_SIZE /* search for candidate */ 425 addi r6,r6,HPTE_SIZE /* search for candidate */
434 andi. r6,r6,7*HPTE_SIZE 426 andi. r6,r6,7*HPTE_SIZE
435 stw r6,next_slot@l(r4) 427 stw r6,next_slot@l(r4)
@@ -501,8 +493,6 @@ htab_hash_searches:
501 * We assume that there is a hash table in use (Hash != 0). 493 * We assume that there is a hash table in use (Hash != 0).
502 */ 494 */
503_GLOBAL(flush_hash_pages) 495_GLOBAL(flush_hash_pages)
504 tophys(r7,0)
505
506 /* 496 /*
507 * We disable interrupts here, even on UP, because we want 497 * We disable interrupts here, even on UP, because we want
508 * the _PAGE_HASHPTE bit to be a reliable indication of 498 * the _PAGE_HASHPTE bit to be a reliable indication of
@@ -547,11 +537,9 @@ _GLOBAL(flush_hash_pages)
547 SET_V(r11) /* set V (valid) bit */ 537 SET_V(r11) /* set V (valid) bit */
548 538
549#ifdef CONFIG_SMP 539#ifdef CONFIG_SMP
550 addis r9,r7,mmu_hash_lock@ha 540 lis r9, (mmu_hash_lock - PAGE_OFFSET)@ha
551 addi r9,r9,mmu_hash_lock@l 541 addi r9, r9, (mmu_hash_lock - PAGE_OFFSET)@l
552 CURRENT_THREAD_INFO(r8, r1) 542 lwz r8,TASK_CPU(r2)
553 add r8,r8,r7
554 lwz r8,TI_CPU(r8)
555 oris r8,r8,9 543 oris r8,r8,9
55610: lwarx r0,0,r9 54410: lwarx r0,0,r9
557 cmpi 0,r0,0 545 cmpi 0,r0,0
@@ -584,7 +572,7 @@ _GLOBAL(flush_hash_pages)
584 patch_site 1f, patch__flush_hash_A1 572 patch_site 1f, patch__flush_hash_A1
585 patch_site 2f, patch__flush_hash_A2 573 patch_site 2f, patch__flush_hash_A2
586 /* Get the address of the primary PTE group in the hash table (r3) */ 574 /* Get the address of the primary PTE group in the hash table (r3) */
5870: addis r8,r7,Hash_base@h /* base address of hash table */ 5750: lis r8, (Hash_base - PAGE_OFFSET)@h /* base address of hash table */
5881: rlwimi r8,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */ 5761: rlwimi r8,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */
5892: rlwinm r0,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */ 5772: rlwinm r0,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */
590 xor r8,r0,r8 /* make primary hash */ 578 xor r8,r0,r8 /* make primary hash */
@@ -646,8 +634,7 @@ EXPORT_SYMBOL(flush_hash_pages)
646 */ 634 */
647_GLOBAL(_tlbie) 635_GLOBAL(_tlbie)
648#ifdef CONFIG_SMP 636#ifdef CONFIG_SMP
649 CURRENT_THREAD_INFO(r8, r1) 637 lwz r8,TASK_CPU(r2)
650 lwz r8,TI_CPU(r8)
651 oris r8,r8,11 638 oris r8,r8,11
652 mfmsr r10 639 mfmsr r10
653 SYNC 640 SYNC
@@ -684,8 +671,7 @@ _GLOBAL(_tlbie)
684 */ 671 */
685_GLOBAL(_tlbia) 672_GLOBAL(_tlbia)
686#if defined(CONFIG_SMP) 673#if defined(CONFIG_SMP)
687 CURRENT_THREAD_INFO(r8, r1) 674 lwz r8,TASK_CPU(r2)
688 lwz r8,TI_CPU(r8)
689 oris r8,r8,10 675 oris r8,r8,10
690 mfmsr r10 676 mfmsr r10
691 SYNC 677 SYNC
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index bc6be44913d4..3d4b2399192f 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1889,12 +1889,12 @@ static int hpt_order_set(void *data, u64 val)
1889 return mmu_hash_ops.resize_hpt(val); 1889 return mmu_hash_ops.resize_hpt(val);
1890} 1890}
1891 1891
1892DEFINE_SIMPLE_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n"); 1892DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n");
1893 1893
1894static int __init hash64_debugfs(void) 1894static int __init hash64_debugfs(void)
1895{ 1895{
1896 if (!debugfs_create_file("hpt_order", 0600, powerpc_debugfs_root, 1896 if (!debugfs_create_file_unsafe("hpt_order", 0600, powerpc_debugfs_root,
1897 NULL, &fops_hpt_order)) { 1897 NULL, &fops_hpt_order)) {
1898 pr_err("lpar: unable to create hpt_order debugfs file\n"); 1898 pr_err("lpar: unable to create hpt_order debugfs file\n");
1899 } 1899 }
1900 1900
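The move from DEFINE_SIMPLE_ATTRIBUTE to DEFINE_DEBUGFS_ATTRIBUTE is what permits the _unsafe registration: the debugfs-aware attribute protects its own accessors with debugfs_file_get()/debugfs_file_put(), so debugfs_create_file_unsafe() can skip the proxy fops that plain debugfs_create_file() would wrap around them. The pairing in isolation, using the names from the hunk above:

    DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set,
                             "%llu\n");

    debugfs_create_file_unsafe("hpt_order", 0600, powerpc_debugfs_root,
                               NULL, &fops_hpt_order);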
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
index 367ce3a4a503..b0d9209d9a86 100644
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -26,7 +26,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
26 real_pte_t rpte; 26 real_pte_t rpte;
27 unsigned long vpn; 27 unsigned long vpn;
28 unsigned long old_pte, new_pte; 28 unsigned long old_pte, new_pte;
29 unsigned long rflags, pa, sz; 29 unsigned long rflags, pa;
30 long slot, offset; 30 long slot, offset;
31 31
32 BUG_ON(shift != mmu_psize_defs[mmu_psize].shift); 32 BUG_ON(shift != mmu_psize_defs[mmu_psize].shift);
@@ -73,7 +73,6 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
73 offset = PTRS_PER_PMD; 73 offset = PTRS_PER_PMD;
74 rpte = __real_pte(__pte(old_pte), ptep, offset); 74 rpte = __real_pte(__pte(old_pte), ptep, offset);
75 75
76 sz = ((1UL) << shift);
77 if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) 76 if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
78 /* No CPU has hugepages but lacks no execute, so we 77 /* No CPU has hugepages but lacks no execute, so we
79 * don't need to worry about that case */ 78 * don't need to worry about that case */
diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c
index 11d9ea28a816..cab06331c0c0 100644
--- a/arch/powerpc/mm/hugetlbpage-radix.c
+++ b/arch/powerpc/mm/hugetlbpage-radix.c
@@ -1,6 +1,7 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2#include <linux/mm.h> 2#include <linux/mm.h>
3#include <linux/hugetlb.h> 3#include <linux/hugetlb.h>
4#include <linux/security.h>
4#include <asm/pgtable.h> 5#include <asm/pgtable.h>
5#include <asm/pgalloc.h> 6#include <asm/pgalloc.h>
6#include <asm/cacheflush.h> 7#include <asm/cacheflush.h>
@@ -73,7 +74,7 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
73 if (addr) { 74 if (addr) {
74 addr = ALIGN(addr, huge_page_size(h)); 75 addr = ALIGN(addr, huge_page_size(h));
75 vma = find_vma(mm, addr); 76 vma = find_vma(mm, addr);
76 if (high_limit - len >= addr && 77 if (high_limit - len >= addr && addr >= mmap_min_addr &&
77 (!vma || addr + len <= vm_start_gap(vma))) 78 (!vma || addr + len <= vm_start_gap(vma)))
78 return addr; 79 return addr;
79 } 80 }
@@ -83,7 +84,7 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
83 */ 84 */
84 info.flags = VM_UNMAPPED_AREA_TOPDOWN; 85 info.flags = VM_UNMAPPED_AREA_TOPDOWN;
85 info.length = len; 86 info.length = len;
86 info.low_limit = PAGE_SIZE; 87 info.low_limit = max(PAGE_SIZE, mmap_min_addr);
87 info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW); 88 info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW);
88 info.align_mask = PAGE_MASK & ~huge_page_mask(h); 89 info.align_mask = PAGE_MASK & ~huge_page_mask(h);
89 info.align_offset = 0; 90 info.align_offset = 0;
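
Both hunks enforce the vm.mmap_min_addr sysctl, which keeps unprivileged mappings out of the lowest pages of the address space as NULL-dereference hardening; mmap_min_addr is declared in <linux/security.h>, hence the new include. A sketch of the guard, with an illustrative floor value:

	/* Sketch only: with vm.mmap_min_addr = 65536, a hint below the
	 * floor must not be honoured and the downward search stops there. */
	#include <linux/kernel.h>
	#include <linux/security.h>

	static unsigned long clamp_hint(unsigned long hint)
	{
		unsigned long floor = max(PAGE_SIZE, mmap_min_addr);

		return hint < floor ? floor : hint;	/* floor = 0x10000 here */
	}
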
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index 3e59e5d64b01..41a3513cadc9 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -108,12 +108,8 @@ static void __init MMU_setup(void)
108 __map_without_bats = 1; 108 __map_without_bats = 1;
109 __map_without_ltlbs = 1; 109 __map_without_ltlbs = 1;
110 } 110 }
111#ifdef CONFIG_STRICT_KERNEL_RWX 111 if (strict_kernel_rwx_enabled() && !IS_ENABLED(CONFIG_PPC_8xx))
112 if (rodata_enabled) {
113 __map_without_bats = 1;
114 __map_without_ltlbs = 1; 112 __map_without_ltlbs = 1;
115 }
116#endif
117} 113}
118 114
119/* 115/*
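
strict_kernel_rwx_enabled() replaces the open-coded #ifdef/rodata_enabled pair; the helper, added to asm/mmu.h earlier in this series, reads along the lines of (paraphrased, not quoted from the patch):

	#if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_STRICT_MODULE_RWX)
	static inline bool strict_kernel_rwx_enabled(void)
	{
		return rodata_enabled;
	}
	#else
	static inline bool strict_kernel_rwx_enabled(void)
	{
		return false;	/* compiles the RWX paths away entirely */
	}
	#endif
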
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index a5091c034747..a4c155af1597 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -274,7 +274,6 @@ void __ref vmemmap_free(unsigned long start, unsigned long end,
274 274
275 for (; start < end; start += page_size) { 275 for (; start < end; start += page_size) {
276 unsigned long nr_pages, addr; 276 unsigned long nr_pages, addr;
277 struct page *section_base;
278 struct page *page; 277 struct page *page;
279 278
280 /* 279 /*
@@ -290,7 +289,6 @@ void __ref vmemmap_free(unsigned long start, unsigned long end,
290 continue; 289 continue;
291 290
292 page = pfn_to_page(addr >> PAGE_SHIFT); 291 page = pfn_to_page(addr >> PAGE_SHIFT);
293 section_base = pfn_to_page(vmemmap_section_start(start));
294 nr_pages = 1 << page_order; 292 nr_pages = 1 << page_order;
295 base_pfn = PHYS_PFN(addr); 293 base_pfn = PHYS_PFN(addr);
296 294
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 33cc6f676fa6..f6787f90e158 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -69,22 +69,14 @@ pte_t *kmap_pte;
69EXPORT_SYMBOL(kmap_pte); 69EXPORT_SYMBOL(kmap_pte);
70pgprot_t kmap_prot; 70pgprot_t kmap_prot;
71EXPORT_SYMBOL(kmap_prot); 71EXPORT_SYMBOL(kmap_prot);
72#define TOP_ZONE ZONE_HIGHMEM
73 72
74static inline pte_t *virt_to_kpte(unsigned long vaddr) 73static inline pte_t *virt_to_kpte(unsigned long vaddr)
75{ 74{
76 return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), 75 return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
77 vaddr), vaddr), vaddr); 76 vaddr), vaddr), vaddr);
78} 77}
79#else
80#define TOP_ZONE ZONE_NORMAL
81#endif 78#endif
82 79
83int page_is_ram(unsigned long pfn)
84{
85 return memblock_is_memory(__pfn_to_phys(pfn));
86}
87
88pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, 80pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
89 unsigned long size, pgprot_t vma_prot) 81 unsigned long size, pgprot_t vma_prot)
90{ 82{
@@ -176,34 +168,6 @@ int __meminit arch_remove_memory(int nid, u64 start, u64 size,
176#endif 168#endif
177#endif /* CONFIG_MEMORY_HOTPLUG */ 169#endif /* CONFIG_MEMORY_HOTPLUG */
178 170
179/*
180 * walk_memory_resource() needs to make sure there is no holes in a given
181 * memory range. PPC64 does not maintain the memory layout in /proc/iomem.
182 * Instead it maintains it in memblock.memory structures. Walk through the
183 * memory regions, find holes and callback for contiguous regions.
184 */
185int
186walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
187 void *arg, int (*func)(unsigned long, unsigned long, void *))
188{
189 struct memblock_region *reg;
190 unsigned long end_pfn = start_pfn + nr_pages;
191 unsigned long tstart, tend;
192 int ret = -1;
193
194 for_each_memblock(memory, reg) {
195 tstart = max(start_pfn, memblock_region_memory_base_pfn(reg));
196 tend = min(end_pfn, memblock_region_memory_end_pfn(reg));
197 if (tstart >= tend)
198 continue;
199 ret = (*func)(tstart, tend - tstart, arg);
200 if (ret)
201 break;
202 }
203 return ret;
204}
205EXPORT_SYMBOL_GPL(walk_system_ram_range);
206
207#ifndef CONFIG_NEED_MULTIPLE_NODES 171#ifndef CONFIG_NEED_MULTIPLE_NODES
208void __init mem_topology_setup(void) 172void __init mem_topology_setup(void)
209{ 173{
@@ -262,25 +226,6 @@ static int __init mark_nonram_nosave(void)
262static unsigned long max_zone_pfns[MAX_NR_ZONES]; 226static unsigned long max_zone_pfns[MAX_NR_ZONES];
263 227
264/* 228/*
265 * Find the least restrictive zone that is entirely below the
266 * specified pfn limit. Returns < 0 if no suitable zone is found.
267 *
268 * pfn_limit must be u64 because it can exceed 32 bits even on 32-bit
269 * systems -- the DMA limit can be higher than any possible real pfn.
270 */
271int dma_pfn_limit_to_zone(u64 pfn_limit)
272{
273 int i;
274
275 for (i = TOP_ZONE; i >= 0; i--) {
276 if (max_zone_pfns[i] <= pfn_limit)
277 return i;
278 }
279
280 return -EPERM;
281}
282
283/*
284 * paging_init() sets up the page tables - in fact we've already done this. 229 * paging_init() sets up the page tables - in fact we've already done this.
285 */ 230 */
286void __init paging_init(void) 231void __init paging_init(void)
@@ -585,3 +530,9 @@ int devmem_is_allowed(unsigned long pfn)
585 return 0; 530 return 0;
586} 531}
587#endif /* CONFIG_STRICT_DEVMEM */ 532#endif /* CONFIG_STRICT_DEVMEM */
533
534/*
535 * This is defined in kernel/resource.c but only powerpc needs to export it, for
536 * the EHEA driver. Drop this when drivers/net/ethernet/ibm/ehea is removed.
537 */
538EXPORT_SYMBOL_GPL(walk_system_ram_range);
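
With the powerpc copy deleted, callers now reach the generic walk_system_ram_range() in kernel/resource.c; only the export remains here, for EHEA. A usage sketch of the interface (callback and function names are illustrative, not from the patch):

	#include <linux/ioport.h>

	/* The walk invokes func(start_pfn, nr_pages, arg) once per contiguous
	 * System RAM chunk; a non-zero return stops the walk early. */
	static int count_pages_cb(unsigned long start_pfn, unsigned long nr_pages,
				  void *arg)
	{
		*(unsigned long *)arg += nr_pages;
		return 0;
	}

	static unsigned long count_ram_pages(unsigned long limit_pfn)
	{
		unsigned long pages = 0;

		walk_system_ram_range(0, limit_pfn, &pages, count_pages_cb);
		return pages;
	}
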
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index c4a717da65eb..74ff61dabcb1 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -130,7 +130,7 @@ extern void wii_memory_fixups(void);
130 */ 130 */
131#ifdef CONFIG_PPC32 131#ifdef CONFIG_PPC32
132extern void MMU_init_hw(void); 132extern void MMU_init_hw(void);
133extern unsigned long mmu_mapin_ram(unsigned long top); 133unsigned long mmu_mapin_ram(unsigned long base, unsigned long top);
134#endif 134#endif
135 135
136#ifdef CONFIG_PPC_FSL_BOOK3E 136#ifdef CONFIG_PPC_FSL_BOOK3E
@@ -165,3 +165,11 @@ unsigned long p_block_mapped(phys_addr_t pa);
165static inline phys_addr_t v_block_mapped(unsigned long va) { return 0; } 165static inline phys_addr_t v_block_mapped(unsigned long va) { return 0; }
166static inline unsigned long p_block_mapped(phys_addr_t pa) { return 0; } 166static inline unsigned long p_block_mapped(phys_addr_t pa) { return 0; }
167#endif 167#endif
168
169#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_8xx)
170void mmu_mark_initmem_nx(void);
171void mmu_mark_rodata_ro(void);
172#else
173static inline void mmu_mark_initmem_nx(void) { }
174static inline void mmu_mark_rodata_ro(void) { }
175#endif
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index df1e11ebbabb..ac49e4158e50 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1460,13 +1460,6 @@ static void reset_topology_timer(void)
1460 1460
1461#ifdef CONFIG_SMP 1461#ifdef CONFIG_SMP
1462 1462
1463static void stage_topology_update(int core_id)
1464{
1465 cpumask_or(&cpu_associativity_changes_mask,
1466 &cpu_associativity_changes_mask, cpu_sibling_mask(core_id));
1467 reset_topology_timer();
1468}
1469
1470static int dt_update_callback(struct notifier_block *nb, 1463static int dt_update_callback(struct notifier_block *nb,
1471 unsigned long action, void *data) 1464 unsigned long action, void *data)
1472{ 1465{
@@ -1479,7 +1472,7 @@ static int dt_update_callback(struct notifier_block *nb,
1479 !of_prop_cmp(update->prop->name, "ibm,associativity")) { 1472 !of_prop_cmp(update->prop->name, "ibm,associativity")) {
1480 u32 core_id; 1473 u32 core_id;
1481 of_property_read_u32(update->dn, "reg", &core_id); 1474 of_property_read_u32(update->dn, "reg", &core_id);
1482 stage_topology_update(core_id); 1475 rc = dlpar_cpu_readd(core_id);
1483 rc = NOTIFY_OK; 1476 rc = NOTIFY_OK;
1484 } 1477 }
1485 break; 1478 break;
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index ded71126ce4c..6e56a6240bfa 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -254,26 +254,20 @@ static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top)
254 254
255void __init mapin_ram(void) 255void __init mapin_ram(void)
256{ 256{
257 unsigned long s, top; 257 struct memblock_region *reg;
258 258
259#ifndef CONFIG_WII 259 for_each_memblock(memory, reg) {
260 top = total_lowmem; 260 phys_addr_t base = reg->base;
261 s = mmu_mapin_ram(top); 261 phys_addr_t top = min(base + reg->size, total_lowmem);
262 __mapin_ram_chunk(s, top); 262
263#else 263 if (base >= top)
264 if (!wii_hole_size) { 264 continue;
265 s = mmu_mapin_ram(total_lowmem); 265 base = mmu_mapin_ram(base, top);
266 __mapin_ram_chunk(s, total_lowmem); 266 if (IS_ENABLED(CONFIG_BDI_SWITCH))
267 } else { 267 __mapin_ram_chunk(reg->base, top);
268 top = wii_hole_start; 268 else
269 s = mmu_mapin_ram(top); 269 __mapin_ram_chunk(base, top);
270 __mapin_ram_chunk(s, top);
271
272 top = memblock_end_of_DRAM();
273 s = wii_mmu_mapin_mem2(top);
274 __mapin_ram_chunk(s, top);
275 } 270 }
276#endif
277} 271}
278 272
279/* Scan the real Linux page tables and return a PTE pointer for 273/* Scan the real Linux page tables and return a PTE pointer for
@@ -359,7 +353,10 @@ void mark_initmem_nx(void)
359 unsigned long numpages = PFN_UP((unsigned long)_einittext) - 353 unsigned long numpages = PFN_UP((unsigned long)_einittext) -
360 PFN_DOWN((unsigned long)_sinittext); 354 PFN_DOWN((unsigned long)_sinittext);
361 355
362 change_page_attr(page, numpages, PAGE_KERNEL); 356 if (v_block_mapped((unsigned long)_stext + 1))
357 mmu_mark_initmem_nx();
358 else
359 change_page_attr(page, numpages, PAGE_KERNEL);
363} 360}
364 361
365#ifdef CONFIG_STRICT_KERNEL_RWX 362#ifdef CONFIG_STRICT_KERNEL_RWX
@@ -368,6 +365,11 @@ void mark_rodata_ro(void)
368 struct page *page; 365 struct page *page;
369 unsigned long numpages; 366 unsigned long numpages;
370 367
368 if (v_block_mapped((unsigned long)_sinittext)) {
369 mmu_mark_rodata_ro();
370 return;
371 }
372
371 page = virt_to_page(_stext); 373 page = virt_to_page(_stext);
372 numpages = PFN_UP((unsigned long)_etext) - 374 numpages = PFN_UP((unsigned long)_etext) -
373 PFN_DOWN((unsigned long)_stext); 375 PFN_DOWN((unsigned long)_stext);
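
	/* Dispatch logic of the two hunks above, restated: when the kernel
	 * text is block-mapped (BATs on book3s/32, large TLBs on 8xx),
	 * page-granular change_page_attr() cannot take effect, so the
	 * MMU-specific mmu_mark_initmem_nx()/mmu_mark_rodata_ro() helpers
	 * declared in mmu_decl.h are used instead.
	 */
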
diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c
index 36a664f06c65..6c8a60b1e31d 100644
--- a/arch/powerpc/mm/ppc_mmu_32.c
+++ b/arch/powerpc/mm/ppc_mmu_32.c
@@ -32,6 +32,7 @@
32#include <asm/mmu.h> 32#include <asm/mmu.h>
33#include <asm/machdep.h> 33#include <asm/machdep.h>
34#include <asm/code-patching.h> 34#include <asm/code-patching.h>
35#include <asm/sections.h>
35 36
36#include "mmu_decl.h" 37#include "mmu_decl.h"
37 38
@@ -73,45 +74,171 @@ unsigned long p_block_mapped(phys_addr_t pa)
73 return 0; 74 return 0;
74} 75}
75 76
76unsigned long __init mmu_mapin_ram(unsigned long top) 77static int find_free_bat(void)
77{ 78{
78 unsigned long tot, bl, done; 79 int b;
79 unsigned long max_size = (256<<20); 80
81 if (cpu_has_feature(CPU_FTR_601)) {
82 for (b = 0; b < 4; b++) {
83 struct ppc_bat *bat = BATS[b];
84
85 if (!(bat[0].batl & 0x40))
86 return b;
87 }
88 } else {
89 int n = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4;
90
91 for (b = 0; b < n; b++) {
92 struct ppc_bat *bat = BATS[b];
93
94 if (!(bat[1].batu & 3))
95 return b;
96 }
97 }
98 return -1;
99}
100
101static unsigned int block_size(unsigned long base, unsigned long top)
102{
103 unsigned int max_size = (cpu_has_feature(CPU_FTR_601) ? 8 : 256) << 20;
104 unsigned int base_shift = (fls(base) - 1) & 31;
105 unsigned int block_shift = (fls(top - base) - 1) & 31;
106
107 return min3(max_size, 1U << base_shift, 1U << block_shift);
108}
109
110/*
111 * Set up one of the IBAT (block address translation) register pairs.
112 * The parameters are not checked; in particular size must be a power
113 * of 2 between 128k and 256M.
114 * Only for 603+ ...
115 */
116static void setibat(int index, unsigned long virt, phys_addr_t phys,
117 unsigned int size, pgprot_t prot)
118{
119 unsigned int bl = (size >> 17) - 1;
120 int wimgxpp;
121 struct ppc_bat *bat = BATS[index];
122 unsigned long flags = pgprot_val(prot);
123
124 if (!cpu_has_feature(CPU_FTR_NEED_COHERENT))
125 flags &= ~_PAGE_COHERENT;
126
127 wimgxpp = (flags & _PAGE_COHERENT) | (_PAGE_EXEC ? BPP_RX : BPP_XX);
128 bat[0].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */
129 bat[0].batl = BAT_PHYS_ADDR(phys) | wimgxpp;
130 if (flags & _PAGE_USER)
131 bat[0].batu |= 1; /* Vp = 1 */
132}
133
134static void clearibat(int index)
135{
136 struct ppc_bat *bat = BATS[index];
137
138 bat[0].batu = 0;
139 bat[0].batl = 0;
140}
141
142static unsigned long __init __mmu_mapin_ram(unsigned long base, unsigned long top)
143{
144 int idx;
145
146 while ((idx = find_free_bat()) != -1 && base != top) {
147 unsigned int size = block_size(base, top);
148
149 if (size < 128 << 10)
150 break;
151 setbat(idx, PAGE_OFFSET + base, base, size, PAGE_KERNEL_X);
152 base += size;
153 }
154
155 return base;
156}
157
158unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
159{
160 int done;
161 unsigned long border = (unsigned long)__init_begin - PAGE_OFFSET;
80 162
81 if (__map_without_bats) { 163 if (__map_without_bats) {
82 printk(KERN_DEBUG "RAM mapped without BATs\n"); 164 pr_debug("RAM mapped without BATs\n");
83 return 0; 165 return base;
166 }
167
168 if (!strict_kernel_rwx_enabled() || base >= border || top <= border)
169 return __mmu_mapin_ram(base, top);
170
171 done = __mmu_mapin_ram(base, border);
172 if (done != border - base)
173 return done;
174
175 return done + __mmu_mapin_ram(border, top);
176}
177
178void mmu_mark_initmem_nx(void)
179{
180 int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4;
181 int i;
182 unsigned long base = (unsigned long)_stext - PAGE_OFFSET;
183 unsigned long top = (unsigned long)_etext - PAGE_OFFSET;
184 unsigned long size;
185
186 if (cpu_has_feature(CPU_FTR_601))
187 return;
188
189 for (i = 0; i < nb - 1 && base < top && top - base > (128 << 10);) {
190 size = block_size(base, top);
191 setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT);
192 base += size;
84 } 193 }
194 if (base < top) {
195 size = block_size(base, top);
196 size = max(size, 128UL << 10);
197 if ((top - base) > size) {
198 if (strict_kernel_rwx_enabled())
199 pr_warn("Kernel _etext not properly aligned\n");
200 size <<= 1;
201 }
202 setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT);
203 base += size;
204 }
205 for (; i < nb; i++)
206 clearibat(i);
85 207
86 /* Set up BAT2 and if necessary BAT3 to cover RAM. */ 208 update_bats();
87 209
88 /* Make sure we don't map a block larger than the 210 for (i = TASK_SIZE >> 28; i < 16; i++) {
89 smallest alignment of the physical address. */ 211 /* Do not set NX on VM space for modules */
90 tot = top; 212 if (IS_ENABLED(CONFIG_MODULES) &&
91 for (bl = 128<<10; bl < max_size; bl <<= 1) { 213 (VMALLOC_START & 0xf0000000) == i << 28)
92 if (bl * 2 > tot)
93 break; 214 break;
215 mtsrin(mfsrin(i << 28) | 0x10000000, i << 28);
94 } 216 }
217}
218
219void mmu_mark_rodata_ro(void)
220{
221 int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4;
222 int i;
223
224 if (cpu_has_feature(CPU_FTR_601))
225 return;
226
227 for (i = 0; i < nb; i++) {
228 struct ppc_bat *bat = BATS[i];
95 229
96 setbat(2, PAGE_OFFSET, 0, bl, PAGE_KERNEL_X); 230 if (bat_addrs[i].start < (unsigned long)__init_begin)
97 done = (unsigned long)bat_addrs[2].limit - PAGE_OFFSET + 1; 231 bat[1].batl = (bat[1].batl & ~BPP_RW) | BPP_RX;
98 if ((done < tot) && !bat_addrs[3].limit) {
99 /* use BAT3 to cover a bit more */
100 tot -= done;
101 for (bl = 128<<10; bl < max_size; bl <<= 1)
102 if (bl * 2 > tot)
103 break;
104 setbat(3, PAGE_OFFSET+done, done, bl, PAGE_KERNEL_X);
105 done = (unsigned long)bat_addrs[3].limit - PAGE_OFFSET + 1;
106 } 232 }
107 233
108 return done; 234 update_bats();
109} 235}
110 236
111/* 237/*
112 * Set up one of the I/D BAT (block address translation) register pairs. 238 * Set up one of the I/D BAT (block address translation) register pairs.
113 * The parameters are not checked; in particular size must be a power 239 * The parameters are not checked; in particular size must be a power
114 * of 2 between 128k and 256M. 240 * of 2 between 128k and 256M.
241 * On 603+, only set IBAT when _PAGE_EXEC is set
115 */ 242 */
116void __init setbat(int index, unsigned long virt, phys_addr_t phys, 243void __init setbat(int index, unsigned long virt, phys_addr_t phys,
117 unsigned int size, pgprot_t prot) 244 unsigned int size, pgprot_t prot)
@@ -138,11 +265,12 @@ void __init setbat(int index, unsigned long virt, phys_addr_t phys,
138 bat[1].batu |= 1; /* Vp = 1 */ 265 bat[1].batu |= 1; /* Vp = 1 */
139 if (flags & _PAGE_GUARDED) { 266 if (flags & _PAGE_GUARDED) {
140 /* G bit must be zero in IBATs */ 267 /* G bit must be zero in IBATs */
141 bat[0].batu = bat[0].batl = 0; 268 flags &= ~_PAGE_EXEC;
142 } else {
143 /* make IBAT same as DBAT */
144 bat[0] = bat[1];
145 } 269 }
270 if (flags & _PAGE_EXEC)
271 bat[0] = bat[1];
272 else
273 bat[0].batu = bat[0].batl = 0;
146 } else { 274 } else {
147 /* 601 cpu */ 275 /* 601 cpu */
148 if (bl > BL_8M) 276 if (bl > BL_8M)
@@ -230,7 +358,8 @@ void __init MMU_init_hw(void)
230 if (lg_n_hpteg > 16) 358 if (lg_n_hpteg > 16)
231 mb2 = 16 - LG_HPTEG_SIZE; 359 mb2 = 16 - LG_HPTEG_SIZE;
232 360
233 modify_instruction_site(&patch__hash_page_A0, 0xffff, (unsigned int)Hash >> 16); 361 modify_instruction_site(&patch__hash_page_A0, 0xffff,
362 ((unsigned int)Hash - PAGE_OFFSET) >> 16);
234 modify_instruction_site(&patch__hash_page_A1, 0x7c0, mb << 6); 363 modify_instruction_site(&patch__hash_page_A1, 0x7c0, mb << 6);
235 modify_instruction_site(&patch__hash_page_A2, 0x7c0, mb2 << 6); 364 modify_instruction_site(&patch__hash_page_A2, 0x7c0, mb2 << 6);
236 modify_instruction_site(&patch__hash_page_B, 0xffff, hmask); 365 modify_instruction_site(&patch__hash_page_B, 0xffff, hmask);
@@ -239,7 +368,8 @@ void __init MMU_init_hw(void)
239 /* 368 /*
240 * Patch up the instructions in hashtable.S:flush_hash_page 369 * Patch up the instructions in hashtable.S:flush_hash_page
241 */ 370 */
242 modify_instruction_site(&patch__flush_hash_A0, 0xffff, (unsigned int)Hash >> 16); 371 modify_instruction_site(&patch__flush_hash_A0, 0xffff,
372 ((unsigned int)Hash - PAGE_OFFSET) >> 16);
243 modify_instruction_site(&patch__flush_hash_A1, 0x7c0, mb << 6); 373 modify_instruction_site(&patch__flush_hash_A1, 0x7c0, mb << 6);
244 modify_instruction_site(&patch__flush_hash_A2, 0x7c0, mb2 << 6); 374 modify_instruction_site(&patch__flush_hash_A2, 0x7c0, mb2 << 6);
245 modify_instruction_site(&patch__flush_hash_B, 0xffff, hmask); 375 modify_instruction_site(&patch__flush_hash_B, 0xffff, hmask);
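
block_size() is effectively min3(max_size, 1 << base_shift, 1 << block_shift): the largest power-of-two block that stays within the hardware cap (256M, or 8M on the 601), is naturally aligned to base, and does not overrun the remaining range, so the mapping loop covers RAM in the fewest BATs. A userspace sketch of the same arithmetic, with fls32() standing in for the kernel's fls():

	#include <stdio.h>

	static unsigned int fls32(unsigned int x)
	{
		return x ? 32 - __builtin_clz(x) : 0;
	}

	static unsigned int block_size(unsigned long base, unsigned long top)
	{
		unsigned int max_size = 256 << 20;	/* 8 << 20 on a 601 */
		unsigned int base_shift = (fls32(base) - 1) & 31;
		unsigned int block_shift = (fls32(top - base) - 1) & 31;
		unsigned int size = max_size;

		if ((1U << base_shift) < size)		/* alignment of base */
			size = 1U << base_shift;
		if ((1U << block_shift) < size)		/* remaining length */
			size = 1U << block_shift;
		return size;
	}

	int main(void)
	{
		unsigned long base = 0, top = 768UL << 20;

		while (base != top) {		/* three 256M BATs here */
			unsigned int size = block_size(base, top);

			printf("BAT: %uM at %luM\n", size >> 20, base >> 20);
			base += size;
		}
		return 0;
	}
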
diff --git a/arch/powerpc/mm/dump_linuxpagetables-8xx.c b/arch/powerpc/mm/ptdump/8xx.c
index ab9e3f24db2f..9e2d8e847d6e 100644
--- a/arch/powerpc/mm/dump_linuxpagetables-8xx.c
+++ b/arch/powerpc/mm/ptdump/8xx.c
@@ -7,7 +7,7 @@
7#include <linux/kernel.h> 7#include <linux/kernel.h>
8#include <asm/pgtable.h> 8#include <asm/pgtable.h>
9 9
10#include "dump_linuxpagetables.h" 10#include "ptdump.h"
11 11
12static const struct flag_info flag_array[] = { 12static const struct flag_info flag_array[] = {
13 { 13 {
diff --git a/arch/powerpc/mm/ptdump/Makefile b/arch/powerpc/mm/ptdump/Makefile
new file mode 100644
index 000000000000..712762be3cb1
--- /dev/null
+++ b/arch/powerpc/mm/ptdump/Makefile
@@ -0,0 +1,9 @@
1# SPDX-License-Identifier: GPL-2.0
2
3obj-y += ptdump.o
4
5obj-$(CONFIG_4xx) += shared.o
6obj-$(CONFIG_PPC_8xx) += 8xx.o
7obj-$(CONFIG_PPC_BOOK3E_MMU) += shared.o
8obj-$(CONFIG_PPC_BOOK3S_32) += shared.o bats.o segment_regs.o
9obj-$(CONFIG_PPC_BOOK3S_64) += book3s64.o hashpagetable.o
diff --git a/arch/powerpc/mm/dump_bats.c b/arch/powerpc/mm/ptdump/bats.c
index a0d23e96e841..a0d23e96e841 100644
--- a/arch/powerpc/mm/dump_bats.c
+++ b/arch/powerpc/mm/ptdump/bats.c
diff --git a/arch/powerpc/mm/dump_linuxpagetables-book3s64.c b/arch/powerpc/mm/ptdump/book3s64.c
index ed6fcf78256e..0dfca72cb9bd 100644
--- a/arch/powerpc/mm/dump_linuxpagetables-book3s64.c
+++ b/arch/powerpc/mm/ptdump/book3s64.c
@@ -7,7 +7,7 @@
7#include <linux/kernel.h> 7#include <linux/kernel.h>
8#include <asm/pgtable.h> 8#include <asm/pgtable.h>
9 9
10#include "dump_linuxpagetables.h" 10#include "ptdump.h"
11 11
12static const struct flag_info flag_array[] = { 12static const struct flag_info flag_array[] = {
13 { 13 {
diff --git a/arch/powerpc/mm/dump_hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c
index 869294695048..b430e4e08af6 100644
--- a/arch/powerpc/mm/dump_hashpagetable.c
+++ b/arch/powerpc/mm/ptdump/hashpagetable.c
@@ -342,7 +342,7 @@ static unsigned long hpte_find(struct pg_state *st, unsigned long ea, int psize)
342 342
343 /* Look in secondary table */ 343 /* Look in secondary table */
344 if (slot == -1) 344 if (slot == -1)
345 slot = base_hpte_find(ea, psize, true, &v, &r); 345 slot = base_hpte_find(ea, psize, false, &v, &r);
346 346
347 /* No entry found */ 347 /* No entry found */
348 if (slot == -1) 348 if (slot == -1)
diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/ptdump/ptdump.c
index 6aa41669ac1a..37138428ab55 100644
--- a/arch/powerpc/mm/dump_linuxpagetables.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -28,7 +28,7 @@
28#include <asm/page.h> 28#include <asm/page.h>
29#include <asm/pgalloc.h> 29#include <asm/pgalloc.h>
30 30
31#include "dump_linuxpagetables.h" 31#include "ptdump.h"
32 32
33#ifdef CONFIG_PPC32 33#ifdef CONFIG_PPC32
34#define KERN_VIRT_START 0 34#define KERN_VIRT_START 0
@@ -143,14 +143,19 @@ static void dump_addr(struct pg_state *st, unsigned long addr)
143 unsigned long delta; 143 unsigned long delta;
144 144
145#ifdef CONFIG_PPC64 145#ifdef CONFIG_PPC64
146 seq_printf(st->seq, "0x%016lx-0x%016lx ", st->start_address, addr-1); 146#define REG "0x%016lx"
147 seq_printf(st->seq, "0x%016lx ", st->start_pa);
148#else 147#else
149 seq_printf(st->seq, "0x%08lx-0x%08lx ", st->start_address, addr - 1); 148#define REG "0x%08lx"
150 seq_printf(st->seq, "0x%08lx ", st->start_pa);
151#endif 149#endif
152 150
153 delta = (addr - st->start_address) >> 10; 151 seq_printf(st->seq, REG "-" REG " ", st->start_address, addr - 1);
152 if (st->start_pa == st->last_pa && st->start_address + PAGE_SIZE != addr) {
153 seq_printf(st->seq, "[" REG "]", st->start_pa);
154 delta = PAGE_SIZE >> 10;
155 } else {
156 seq_printf(st->seq, " " REG " ", st->start_pa);
157 delta = (addr - st->start_address) >> 10;
158 }
154 /* Work out what appropriate unit to use */ 159 /* Work out what appropriate unit to use */
155 while (!(delta & 1023) && unit[1]) { 160 while (!(delta & 1023) && unit[1]) {
156 delta >>= 10; 161 delta >>= 10;
@@ -184,7 +189,8 @@ static void note_page(struct pg_state *st, unsigned long addr,
184 */ 189 */
185 } else if (flag != st->current_flags || level != st->level || 190 } else if (flag != st->current_flags || level != st->level ||
186 addr >= st->marker[1].start_address || 191 addr >= st->marker[1].start_address ||
187 pa != st->last_pa + PAGE_SIZE) { 192 (pa != st->last_pa + PAGE_SIZE &&
193 (pa != st->start_pa || st->start_pa != st->last_pa))) {
188 194
189 /* Check the PTE flags */ 195 /* Check the PTE flags */
190 if (st->current_flags) { 196 if (st->current_flags) {
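
The reworked dump_addr(), together with the relaxed note_page() condition, folds a virtual range whose PTEs all point at the same physical page into a single line, marking the physical address with brackets and reporting a single-page size. Hypothetical output in the format of the seq_printf() calls above (values invented):

	0xc0000000-0xc07fffff  0x00000000         8M  rw
	0xd1000000-0xd10fffff [0x0807f000]        4K  rw
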
diff --git a/arch/powerpc/mm/dump_linuxpagetables.h b/arch/powerpc/mm/ptdump/ptdump.h
index 5d513636de73..5d513636de73 100644
--- a/arch/powerpc/mm/dump_linuxpagetables.h
+++ b/arch/powerpc/mm/ptdump/ptdump.h
diff --git a/arch/powerpc/mm/dump_sr.c b/arch/powerpc/mm/ptdump/segment_regs.c
index 501843664bb9..501843664bb9 100644
--- a/arch/powerpc/mm/dump_sr.c
+++ b/arch/powerpc/mm/ptdump/segment_regs.c
diff --git a/arch/powerpc/mm/dump_linuxpagetables-generic.c b/arch/powerpc/mm/ptdump/shared.c
index 3fe98a0974c6..f7ed2f187cb0 100644
--- a/arch/powerpc/mm/dump_linuxpagetables-generic.c
+++ b/arch/powerpc/mm/ptdump/shared.c
@@ -7,7 +7,7 @@
7#include <linux/kernel.h> 7#include <linux/kernel.h>
8#include <asm/pgtable.h> 8#include <asm/pgtable.h>
9 9
10#include "dump_linuxpagetables.h" 10#include "ptdump.h"
11 11
12static const struct flag_info flag_array[] = { 12static const struct flag_info flag_array[] = {
13 { 13 {
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index bc3914d54e26..5986df48359b 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -69,6 +69,11 @@ static void assert_slb_presence(bool present, unsigned long ea)
69 if (!cpu_has_feature(CPU_FTR_ARCH_206)) 69 if (!cpu_has_feature(CPU_FTR_ARCH_206))
70 return; 70 return;
71 71
72 /*
73 * slbfee. requires bit 24 (PPC bit 39) be clear in RB. Hardware
74 * ignores all other bits from 0-27, so just clear them all.
75 */
76 ea &= ~((1UL << 28) - 1);
72 asm volatile(__PPC_SLBFEE_DOT(%0, %1) : "=r"(tmp) : "r"(ea) : "cr0"); 77 asm volatile(__PPC_SLBFEE_DOT(%0, %1) : "=r"(tmp) : "r"(ea) : "cr0");
73 78
74 WARN_ON(present == (tmp == 0)); 79 WARN_ON(present == (tmp == 0));
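
The mask strips the offset within the 256MB segment so that only the ESID reaches slbfee., satisfying the requirement quoted in the comment. Worked through with an illustrative effective address:

	unsigned long ea = 0xc000000012345678UL;	/* illustrative */

	ea &= ~((1UL << 28) - 1);	/* ea == 0xc000000010000000 now */
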
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 06898c13901d..aec91dbcdc0b 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -32,6 +32,7 @@
32#include <linux/export.h> 32#include <linux/export.h>
33#include <linux/hugetlb.h> 33#include <linux/hugetlb.h>
34#include <linux/sched/mm.h> 34#include <linux/sched/mm.h>
35#include <linux/security.h>
35#include <asm/mman.h> 36#include <asm/mman.h>
36#include <asm/mmu.h> 37#include <asm/mmu.h>
37#include <asm/copro.h> 38#include <asm/copro.h>
@@ -377,6 +378,7 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm,
377 int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); 378 int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
378 unsigned long addr, found, prev; 379 unsigned long addr, found, prev;
379 struct vm_unmapped_area_info info; 380 struct vm_unmapped_area_info info;
381 unsigned long min_addr = max(PAGE_SIZE, mmap_min_addr);
380 382
381 info.flags = VM_UNMAPPED_AREA_TOPDOWN; 383 info.flags = VM_UNMAPPED_AREA_TOPDOWN;
382 info.length = len; 384 info.length = len;
@@ -393,7 +395,7 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm,
393 if (high_limit > DEFAULT_MAP_WINDOW) 395 if (high_limit > DEFAULT_MAP_WINDOW)
394 addr += mm->context.slb_addr_limit - DEFAULT_MAP_WINDOW; 396 addr += mm->context.slb_addr_limit - DEFAULT_MAP_WINDOW;
395 397
396 while (addr > PAGE_SIZE) { 398 while (addr > min_addr) {
397 info.high_limit = addr; 399 info.high_limit = addr;
398 if (!slice_scan_available(addr - 1, available, 0, &addr)) 400 if (!slice_scan_available(addr - 1, available, 0, &addr))
399 continue; 401 continue;
@@ -405,8 +407,8 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm,
405 * Check if we need to reduce the range, or if we can 407 * Check if we need to reduce the range, or if we can
406 * extend it to cover the previous available slice. 408 * extend it to cover the previous available slice.
407 */ 409 */
408 if (addr < PAGE_SIZE) 410 if (addr < min_addr)
409 addr = PAGE_SIZE; 411 addr = min_addr;
410 else if (slice_scan_available(addr - 1, available, 0, &prev)) { 412 else if (slice_scan_available(addr - 1, available, 0, &prev)) {
411 addr = prev; 413 addr = prev;
412 goto prev_slice; 414 goto prev_slice;
@@ -528,7 +530,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
528 addr = _ALIGN_UP(addr, page_size); 530 addr = _ALIGN_UP(addr, page_size);
529 slice_dbg(" aligned addr=%lx\n", addr); 531 slice_dbg(" aligned addr=%lx\n", addr);
530 /* Ignore hint if it's too large or overlaps a VMA */ 532 /* Ignore hint if it's too large or overlaps a VMA */
531 if (addr > high_limit - len || 533 if (addr > high_limit - len || addr < mmap_min_addr ||
532 !slice_area_is_free(mm, addr, len)) 534 !slice_area_is_free(mm, addr, len))
533 addr = 0; 535 addr = 0;
534 } 536 }
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index ae5d568e267f..ac23dc1c6535 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -302,7 +302,7 @@ void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
302 * This function as well as __local_flush_tlb_page() must only be called 302 * This function as well as __local_flush_tlb_page() must only be called
303 * for user contexts. 303 * for user contexts.
304 */ 304 */
305 if (unlikely(WARN_ON(!mm))) 305 if (WARN_ON(!mm))
306 return; 306 return;
307 307
308 preempt_disable(); 308 preempt_disable();
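
The outer unlikely() was redundant because the generic WARN_ON() already evaluates its condition under unlikely(); paraphrasing include/asm-generic/bug.h:

	#define WARN_ON(condition) ({				\
		int __ret_warn_on = !!(condition);		\
		if (unlikely(__ret_warn_on))			\
			__WARN();				\
		unlikely(__ret_warn_on);			\
	})
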
diff --git a/arch/powerpc/net/bpf_jit32.h b/arch/powerpc/net/bpf_jit32.h
index 6f4daacad296..dc50a8d4b3b9 100644
--- a/arch/powerpc/net/bpf_jit32.h
+++ b/arch/powerpc/net/bpf_jit32.h
@@ -106,9 +106,8 @@ DECLARE_LOAD_FUNC(sk_load_byte_msh);
106 } while (0) 106 } while (0)
107#else 107#else
108#define PPC_BPF_LOAD_CPU(r) \ 108#define PPC_BPF_LOAD_CPU(r) \
109 do { BUILD_BUG_ON(FIELD_SIZEOF(struct thread_info, cpu) != 4); \ 109 do { BUILD_BUG_ON(FIELD_SIZEOF(struct task_struct, cpu) != 4); \
110 PPC_LHZ_OFFS(r, (1 & ~(THREAD_SIZE - 1)), \ 110 PPC_LHZ_OFFS(r, 2, offsetof(struct task_struct, cpu)); \
111 offsetof(struct thread_info, cpu)); \
112 } while(0) 111 } while(0)
113#endif 112#endif
114#else 113#else
diff --git a/arch/powerpc/perf/power9-events-list.h b/arch/powerpc/perf/power9-events-list.h
index 7de344b7d9cc..063c9d9f2516 100644
--- a/arch/powerpc/perf/power9-events-list.h
+++ b/arch/powerpc/perf/power9-events-list.h
@@ -97,3 +97,27 @@ EVENT(PM_MRK_DTLB_MISS_64K, 0x3d156)
97EVENT(PM_DTLB_MISS_16M, 0x4c056) 97EVENT(PM_DTLB_MISS_16M, 0x4c056)
98EVENT(PM_DTLB_MISS_1G, 0x4c05a) 98EVENT(PM_DTLB_MISS_1G, 0x4c05a)
99EVENT(PM_MRK_DTLB_MISS_16M, 0x4c15e) 99EVENT(PM_MRK_DTLB_MISS_16M, 0x4c15e)
100
101/*
102 * Memory Access Events
103 *
104 * Primary PMU event used here is PM_MRK_INST_CMPL (0x401e0)
105 * To enable capturing of memory profiling, these MMCRA bits
 106 * need to be programmed via the corresponding raw event format
107 * encoding.
108 *
109 * MMCRA bits encoding needed are
110 * SM (Sampling Mode)
111 * EM (Eligibility for Random Sampling)
112 * TECE (Threshold Event Counter Event)
113 * TS (Threshold Start Event)
114 * TE (Threshold End Event)
115 *
116 * Corresponding Raw Encoding bits:
117 * sample [EM,SM]
118 * thresh_sel (TECE)
119 * thresh start (TS)
120 * thresh end (TE)
121 */
122EVENT(MEM_LOADS, 0x34340401e0)
123EVENT(MEM_STORES, 0x343c0401e0)
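
In both raw encodings the low 20 bits are the PM_MRK_INST_CMPL base event (0x401e0); the upper bits carry the sampling and threshold fields listed in the comment, with MEM_LOADS and MEM_STORES differing only in the threshold-selection bits. Once exposed through the PMU (next hunk) they can be used like any other symbolic event, e.g. (illustrative invocation):

	perf stat -e mem-loads,mem-stores -- ./workload
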
diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c
index 0ff9c43733e9..030544e35959 100644
--- a/arch/powerpc/perf/power9-pmu.c
+++ b/arch/powerpc/perf/power9-pmu.c
@@ -160,6 +160,8 @@ GENERIC_EVENT_ATTR(branch-instructions, PM_BR_CMPL);
160GENERIC_EVENT_ATTR(branch-misses, PM_BR_MPRED_CMPL); 160GENERIC_EVENT_ATTR(branch-misses, PM_BR_MPRED_CMPL);
161GENERIC_EVENT_ATTR(cache-references, PM_LD_REF_L1); 161GENERIC_EVENT_ATTR(cache-references, PM_LD_REF_L1);
162GENERIC_EVENT_ATTR(cache-misses, PM_LD_MISS_L1_FIN); 162GENERIC_EVENT_ATTR(cache-misses, PM_LD_MISS_L1_FIN);
163GENERIC_EVENT_ATTR(mem-loads, MEM_LOADS);
164GENERIC_EVENT_ATTR(mem-stores, MEM_STORES);
163 165
164CACHE_EVENT_ATTR(L1-dcache-load-misses, PM_LD_MISS_L1_FIN); 166CACHE_EVENT_ATTR(L1-dcache-load-misses, PM_LD_MISS_L1_FIN);
165CACHE_EVENT_ATTR(L1-dcache-loads, PM_LD_REF_L1); 167CACHE_EVENT_ATTR(L1-dcache-loads, PM_LD_REF_L1);
@@ -185,6 +187,8 @@ static struct attribute *power9_events_attr[] = {
185 GENERIC_EVENT_PTR(PM_BR_MPRED_CMPL), 187 GENERIC_EVENT_PTR(PM_BR_MPRED_CMPL),
186 GENERIC_EVENT_PTR(PM_LD_REF_L1), 188 GENERIC_EVENT_PTR(PM_LD_REF_L1),
187 GENERIC_EVENT_PTR(PM_LD_MISS_L1_FIN), 189 GENERIC_EVENT_PTR(PM_LD_MISS_L1_FIN),
190 GENERIC_EVENT_PTR(MEM_LOADS),
191 GENERIC_EVENT_PTR(MEM_STORES),
188 CACHE_EVENT_PTR(PM_LD_MISS_L1_FIN), 192 CACHE_EVENT_PTR(PM_LD_MISS_L1_FIN),
189 CACHE_EVENT_PTR(PM_LD_REF_L1), 193 CACHE_EVENT_PTR(PM_LD_REF_L1),
190 CACHE_EVENT_PTR(PM_L1_PREF), 194 CACHE_EVENT_PTR(PM_L1_PREF),
diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms/44x/Kconfig
index 4a9a72d01c3c..35be81fd2dc2 100644
--- a/arch/powerpc/platforms/44x/Kconfig
+++ b/arch/powerpc/platforms/44x/Kconfig
@@ -180,6 +180,7 @@ config CURRITUCK
180 depends on PPC_47x 180 depends on PPC_47x
181 select SWIOTLB 181 select SWIOTLB
182 select 476FPE 182 select 476FPE
183 select FORCE_PCI
183 select PPC4xx_PCI_EXPRESS 184 select PPC4xx_PCI_EXPRESS
184 help 185 help
185 This option enables support for the IBM Currituck (476fpe) evaluation board 186 This option enables support for the IBM Currituck (476fpe) evaluation board
diff --git a/arch/powerpc/platforms/44x/ppc476.c b/arch/powerpc/platforms/44x/ppc476.c
index e55933f9cd55..a5e61e5c16e2 100644
--- a/arch/powerpc/platforms/44x/ppc476.c
+++ b/arch/powerpc/platforms/44x/ppc476.c
@@ -34,6 +34,7 @@
34#include <asm/ppc4xx.h> 34#include <asm/ppc4xx.h>
35#include <asm/mpic.h> 35#include <asm/mpic.h>
36#include <asm/mmu.h> 36#include <asm/mmu.h>
37#include <asm/swiotlb.h>
37 38
38#include <linux/pci.h> 39#include <linux/pci.h>
39#include <linux/i2c.h> 40#include <linux/i2c.h>
diff --git a/arch/powerpc/platforms/44x/warp.c b/arch/powerpc/platforms/44x/warp.c
index f467247fd1c4..18422dbd061a 100644
--- a/arch/powerpc/platforms/44x/warp.c
+++ b/arch/powerpc/platforms/44x/warp.c
@@ -47,7 +47,7 @@ static int __init warp_probe(void)
47 if (!of_machine_is_compatible("pika,warp")) 47 if (!of_machine_is_compatible("pika,warp"))
48 return 0; 48 return 0;
49 49
50 /* For __dma_nommu_alloc_coherent */ 50 /* For arch_dma_alloc */
51 ISA_DMA_THRESHOLD = ~0L; 51 ISA_DMA_THRESHOLD = ~0L;
52 52
53 return 1; 53 return 1;
diff --git a/arch/powerpc/platforms/83xx/suspend-asm.S b/arch/powerpc/platforms/83xx/suspend-asm.S
index 3d1ecd211776..8137f77abad5 100644
--- a/arch/powerpc/platforms/83xx/suspend-asm.S
+++ b/arch/powerpc/platforms/83xx/suspend-asm.S
@@ -26,13 +26,13 @@
26#define SS_MSR 0x74 26#define SS_MSR 0x74
27#define SS_SDR1 0x78 27#define SS_SDR1 0x78
28#define SS_LR 0x7c 28#define SS_LR 0x7c
29#define SS_SPRG 0x80 /* 4 SPRGs */ 29#define SS_SPRG 0x80 /* 8 SPRGs */
30#define SS_DBAT 0x90 /* 8 DBATs */ 30#define SS_DBAT 0xa0 /* 8 DBATs */
31#define SS_IBAT 0xd0 /* 8 IBATs */ 31#define SS_IBAT 0xe0 /* 8 IBATs */
32#define SS_TB 0x110 32#define SS_TB 0x120
33#define SS_CR 0x118 33#define SS_CR 0x128
34#define SS_GPREG 0x11c /* r12-r31 */ 34#define SS_GPREG 0x12c /* r12-r31 */
35#define STATE_SAVE_SIZE 0x16c 35#define STATE_SAVE_SIZE 0x17c
36 36
37 .section .data 37 .section .data
38 .align 5 38 .align 5
@@ -103,6 +103,16 @@ _GLOBAL(mpc83xx_enter_deep_sleep)
103 stw r7, SS_SPRG+12(r3) 103 stw r7, SS_SPRG+12(r3)
104 stw r8, SS_SDR1(r3) 104 stw r8, SS_SDR1(r3)
105 105
106 mfspr r4, SPRN_SPRG4
107 mfspr r5, SPRN_SPRG5
108 mfspr r6, SPRN_SPRG6
109 mfspr r7, SPRN_SPRG7
110
111 stw r4, SS_SPRG+16(r3)
112 stw r5, SS_SPRG+20(r3)
113 stw r6, SS_SPRG+24(r3)
114 stw r7, SS_SPRG+28(r3)
115
106 mfspr r4, SPRN_DBAT0U 116 mfspr r4, SPRN_DBAT0U
107 mfspr r5, SPRN_DBAT0L 117 mfspr r5, SPRN_DBAT0L
108 mfspr r6, SPRN_DBAT1U 118 mfspr r6, SPRN_DBAT1U
@@ -493,6 +503,16 @@ mpc83xx_deep_resume:
493 mtspr SPRN_IBAT7U, r6 503 mtspr SPRN_IBAT7U, r6
494 mtspr SPRN_IBAT7L, r7 504 mtspr SPRN_IBAT7L, r7
495 505
506 lwz r4, SS_SPRG+16(r3)
507 lwz r5, SS_SPRG+20(r3)
508 lwz r6, SS_SPRG+24(r3)
509 lwz r7, SS_SPRG+28(r3)
510
511 mtspr SPRN_SPRG4, r4
512 mtspr SPRN_SPRG5, r5
513 mtspr SPRN_SPRG6, r6
514 mtspr SPRN_SPRG7, r7
515
496 lwz r4, SS_SPRG+0(r3) 516 lwz r4, SS_SPRG+0(r3)
497 lwz r5, SS_SPRG+4(r3) 517 lwz r5, SS_SPRG+4(r3)
498 lwz r6, SS_SPRG+8(r3) 518 lwz r6, SS_SPRG+8(r3)
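
The save-area shuffle is pure layout arithmetic: saving all eight 4-byte SPRGs doubles the SS_SPRG slot from 0x10 to 0x20 bytes, shifting every later field up by 0x10. The new offsets check out as:

	SS_SPRG  0x80  + 8 * 4  (8 SPRGs)          = 0xa0  -> SS_DBAT
	SS_DBAT  0xa0  + 8 * 8  (8 upper/lower)    = 0xe0  -> SS_IBAT
	SS_IBAT  0xe0  + 8 * 8                     = 0x120 -> SS_TB
	SS_TB    0x120 + 8      (64-bit timebase)  = 0x128 -> SS_CR
	SS_CR    0x128 + 4                         = 0x12c -> SS_GPREG
	SS_GPREG 0x12c + 20 * 4 (r12-r31)          = 0x17c = STATE_SAVE_SIZE
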
diff --git a/arch/powerpc/platforms/85xx/corenet_generic.c b/arch/powerpc/platforms/85xx/corenet_generic.c
index b0dac307bebf..785e9641220d 100644
--- a/arch/powerpc/platforms/85xx/corenet_generic.c
+++ b/arch/powerpc/platforms/85xx/corenet_generic.c
@@ -27,6 +27,7 @@
27#include <asm/udbg.h> 27#include <asm/udbg.h>
28#include <asm/mpic.h> 28#include <asm/mpic.h>
29#include <asm/ehv_pic.h> 29#include <asm/ehv_pic.h>
30#include <asm/swiotlb.h>
30#include <soc/fsl/qe/qe_ic.h> 31#include <soc/fsl/qe/qe_ic.h>
31 32
32#include <linux/of_platform.h> 33#include <linux/of_platform.h>
@@ -223,7 +224,3 @@ define_machine(corenet_generic) {
223}; 224};
224 225
225machine_arch_initcall(corenet_generic, corenet_gen_publish_devices); 226machine_arch_initcall(corenet_generic, corenet_gen_publish_devices);
226
227#ifdef CONFIG_SWIOTLB
228machine_arch_initcall(corenet_generic, swiotlb_setup_bus_notifier);
229#endif
diff --git a/arch/powerpc/platforms/85xx/ge_imp3a.c b/arch/powerpc/platforms/85xx/ge_imp3a.c
index f29c6f0909f3..c64fa2483ea9 100644
--- a/arch/powerpc/platforms/85xx/ge_imp3a.c
+++ b/arch/powerpc/platforms/85xx/ge_imp3a.c
@@ -202,8 +202,6 @@ static int __init ge_imp3a_probe(void)
202 202
203machine_arch_initcall(ge_imp3a, mpc85xx_common_publish_devices); 203machine_arch_initcall(ge_imp3a, mpc85xx_common_publish_devices);
204 204
205machine_arch_initcall(ge_imp3a, swiotlb_setup_bus_notifier);
206
207define_machine(ge_imp3a) { 205define_machine(ge_imp3a) {
208 .name = "GE_IMP3A", 206 .name = "GE_IMP3A",
209 .probe = ge_imp3a_probe, 207 .probe = ge_imp3a_probe,
diff --git a/arch/powerpc/platforms/85xx/mpc8536_ds.c b/arch/powerpc/platforms/85xx/mpc8536_ds.c
index 94a7f92c858f..94194bad4954 100644
--- a/arch/powerpc/platforms/85xx/mpc8536_ds.c
+++ b/arch/powerpc/platforms/85xx/mpc8536_ds.c
@@ -57,8 +57,6 @@ static void __init mpc8536_ds_setup_arch(void)
57 57
58machine_arch_initcall(mpc8536_ds, mpc85xx_common_publish_devices); 58machine_arch_initcall(mpc8536_ds, mpc85xx_common_publish_devices);
59 59
60machine_arch_initcall(mpc8536_ds, swiotlb_setup_bus_notifier);
61
62/* 60/*
63 * Called very early, device-tree isn't unflattened 61 * Called very early, device-tree isn't unflattened
64 */ 62 */
diff --git a/arch/powerpc/platforms/85xx/mpc85xx_ds.c b/arch/powerpc/platforms/85xx/mpc85xx_ds.c
index dc9e035cc637..b7e29ce1f266 100644
--- a/arch/powerpc/platforms/85xx/mpc85xx_ds.c
+++ b/arch/powerpc/platforms/85xx/mpc85xx_ds.c
@@ -174,10 +174,6 @@ machine_arch_initcall(mpc8544_ds, mpc85xx_common_publish_devices);
174machine_arch_initcall(mpc8572_ds, mpc85xx_common_publish_devices); 174machine_arch_initcall(mpc8572_ds, mpc85xx_common_publish_devices);
175machine_arch_initcall(p2020_ds, mpc85xx_common_publish_devices); 175machine_arch_initcall(p2020_ds, mpc85xx_common_publish_devices);
176 176
177machine_arch_initcall(mpc8544_ds, swiotlb_setup_bus_notifier);
178machine_arch_initcall(mpc8572_ds, swiotlb_setup_bus_notifier);
179machine_arch_initcall(p2020_ds, swiotlb_setup_bus_notifier);
180
181/* 177/*
182 * Called very early, device-tree isn't unflattened 178 * Called very early, device-tree isn't unflattened
183 */ 179 */
diff --git a/arch/powerpc/platforms/85xx/mpc85xx_mds.c b/arch/powerpc/platforms/85xx/mpc85xx_mds.c
index d7e440e6dba3..80939a425de5 100644
--- a/arch/powerpc/platforms/85xx/mpc85xx_mds.c
+++ b/arch/powerpc/platforms/85xx/mpc85xx_mds.c
@@ -367,10 +367,6 @@ machine_arch_initcall(mpc8568_mds, mpc85xx_publish_devices);
367machine_arch_initcall(mpc8569_mds, mpc85xx_publish_devices); 367machine_arch_initcall(mpc8569_mds, mpc85xx_publish_devices);
368machine_arch_initcall(p1021_mds, mpc85xx_common_publish_devices); 368machine_arch_initcall(p1021_mds, mpc85xx_common_publish_devices);
369 369
370machine_arch_initcall(mpc8568_mds, swiotlb_setup_bus_notifier);
371machine_arch_initcall(mpc8569_mds, swiotlb_setup_bus_notifier);
372machine_arch_initcall(p1021_mds, swiotlb_setup_bus_notifier);
373
374static void __init mpc85xx_mds_pic_init(void) 370static void __init mpc85xx_mds_pic_init(void)
375{ 371{
376 struct mpic *mpic = mpic_alloc(NULL, 0, MPIC_BIG_ENDIAN | 372 struct mpic *mpic = mpic_alloc(NULL, 0, MPIC_BIG_ENDIAN |
diff --git a/arch/powerpc/platforms/85xx/p1010rdb.c b/arch/powerpc/platforms/85xx/p1010rdb.c
index 78d13b364cd6..33ca373322e1 100644
--- a/arch/powerpc/platforms/85xx/p1010rdb.c
+++ b/arch/powerpc/platforms/85xx/p1010rdb.c
@@ -55,7 +55,6 @@ static void __init p1010_rdb_setup_arch(void)
55} 55}
56 56
57machine_arch_initcall(p1010_rdb, mpc85xx_common_publish_devices); 57machine_arch_initcall(p1010_rdb, mpc85xx_common_publish_devices);
58machine_arch_initcall(p1010_rdb, swiotlb_setup_bus_notifier);
59 58
60/* 59/*
61 * Called very early, device-tree isn't unflattened 60 * Called very early, device-tree isn't unflattened
diff --git a/arch/powerpc/platforms/85xx/p1022_ds.c b/arch/powerpc/platforms/85xx/p1022_ds.c
index 9fb57f78cdbe..1f1af0557470 100644
--- a/arch/powerpc/platforms/85xx/p1022_ds.c
+++ b/arch/powerpc/platforms/85xx/p1022_ds.c
@@ -548,8 +548,6 @@ static void __init p1022_ds_setup_arch(void)
548 548
549machine_arch_initcall(p1022_ds, mpc85xx_common_publish_devices); 549machine_arch_initcall(p1022_ds, mpc85xx_common_publish_devices);
550 550
551machine_arch_initcall(p1022_ds, swiotlb_setup_bus_notifier);
552
553/* 551/*
554 * Called very early, device-tree isn't unflattened 552 * Called very early, device-tree isn't unflattened
555 */ 553 */
diff --git a/arch/powerpc/platforms/85xx/p1022_rdk.c b/arch/powerpc/platforms/85xx/p1022_rdk.c
index 276e00ab3dde..fd9e3e7ef234 100644
--- a/arch/powerpc/platforms/85xx/p1022_rdk.c
+++ b/arch/powerpc/platforms/85xx/p1022_rdk.c
@@ -128,8 +128,6 @@ static void __init p1022_rdk_setup_arch(void)
128 128
129machine_arch_initcall(p1022_rdk, mpc85xx_common_publish_devices); 129machine_arch_initcall(p1022_rdk, mpc85xx_common_publish_devices);
130 130
131machine_arch_initcall(p1022_rdk, swiotlb_setup_bus_notifier);
132
133/* 131/*
134 * Called very early, device-tree isn't unflattened 132 * Called very early, device-tree isn't unflattened
135 */ 133 */
diff --git a/arch/powerpc/platforms/85xx/qemu_e500.c b/arch/powerpc/platforms/85xx/qemu_e500.c
index 27631c607f3d..c52c8f9e8385 100644
--- a/arch/powerpc/platforms/85xx/qemu_e500.c
+++ b/arch/powerpc/platforms/85xx/qemu_e500.c
@@ -22,6 +22,7 @@
22#include <asm/time.h> 22#include <asm/time.h>
23#include <asm/udbg.h> 23#include <asm/udbg.h>
24#include <asm/mpic.h> 24#include <asm/mpic.h>
25#include <asm/swiotlb.h>
25#include <sysdev/fsl_soc.h> 26#include <sysdev/fsl_soc.h>
26#include <sysdev/fsl_pci.h> 27#include <sysdev/fsl_pci.h>
27#include "smp.h" 28#include "smp.h"
diff --git a/arch/powerpc/platforms/86xx/mpc86xx_hpcn.c b/arch/powerpc/platforms/86xx/mpc86xx_hpcn.c
index 17c6cd3d02e6..775a92353c83 100644
--- a/arch/powerpc/platforms/86xx/mpc86xx_hpcn.c
+++ b/arch/powerpc/platforms/86xx/mpc86xx_hpcn.c
@@ -121,7 +121,6 @@ static int __init declare_of_platform_devices(void)
121 return 0; 121 return 0;
122} 122}
123machine_arch_initcall(mpc86xx_hpcn, declare_of_platform_devices); 123machine_arch_initcall(mpc86xx_hpcn, declare_of_platform_devices);
124machine_arch_initcall(mpc86xx_hpcn, swiotlb_setup_bus_notifier);
125 124
126define_machine(mpc86xx_hpcn) { 125define_machine(mpc86xx_hpcn) {
127 .name = "MPC86xx HPCN", 126 .name = "MPC86xx HPCN",
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 8c7464c3f27f..842b2c7e156a 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -153,6 +153,11 @@ config E300C3_CPU
153 bool "e300c3 (831x)" 153 bool "e300c3 (831x)"
154 depends on PPC_BOOK3S_32 154 depends on PPC_BOOK3S_32
155 155
156config G4_CPU
157 bool "G4 (74xx)"
158 depends on PPC_BOOK3S_32
159 select ALTIVEC
160
156endchoice 161endchoice
157 162
158config TARGET_CPU_BOOL 163config TARGET_CPU_BOOL
@@ -171,6 +176,7 @@ config TARGET_CPU
171 default "860" if 860_CPU 176 default "860" if 860_CPU
172 default "e300c2" if E300C2_CPU 177 default "e300c2" if E300C2_CPU
173 default "e300c3" if E300C3_CPU 178 default "e300c3" if E300C3_CPU
179 default "G4" if G4_CPU
174 180
175config PPC_BOOK3S 181config PPC_BOOK3S
176 def_bool y 182 def_bool y
@@ -402,6 +408,9 @@ config NOT_COHERENT_CACHE
402 bool 408 bool
403 depends on 4xx || PPC_8xx || E200 || PPC_MPC512x || \ 409 depends on 4xx || PPC_8xx || E200 || PPC_MPC512x || \
404 GAMECUBE_COMMON || AMIGAONE 410 GAMECUBE_COMMON || AMIGAONE
411 select ARCH_HAS_DMA_COHERENT_TO_PFN
412 select ARCH_HAS_SYNC_DMA_FOR_DEVICE
413 select ARCH_HAS_SYNC_DMA_FOR_CPU
405 default n if PPC_47x 414 default n if PPC_47x
406 default y 415 default y
407 416
diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c
index af2a3c15e0ec..54e012e1f720 100644
--- a/arch/powerpc/platforms/cell/iommu.c
+++ b/arch/powerpc/platforms/cell/iommu.c
@@ -544,9 +544,10 @@ static struct cbe_iommu *cell_iommu_for_node(int nid)
544static unsigned long cell_dma_nommu_offset; 544static unsigned long cell_dma_nommu_offset;
545 545
546static unsigned long dma_iommu_fixed_base; 546static unsigned long dma_iommu_fixed_base;
547static bool cell_iommu_enabled;
547 548
548/* iommu_fixed_is_weak is set if booted with iommu_fixed=weak */ 549/* iommu_fixed_is_weak is set if booted with iommu_fixed=weak */
549static int iommu_fixed_is_weak; 550bool iommu_fixed_is_weak;
550 551
551static struct iommu_table *cell_get_iommu_table(struct device *dev) 552static struct iommu_table *cell_get_iommu_table(struct device *dev)
552{ 553{
@@ -568,102 +569,19 @@ static struct iommu_table *cell_get_iommu_table(struct device *dev)
568 return &window->table; 569 return &window->table;
569} 570}
570 571
571/* A coherent allocation implies strong ordering */ 572static u64 cell_iommu_get_fixed_address(struct device *dev);
572
573static void *dma_fixed_alloc_coherent(struct device *dev, size_t size,
574 dma_addr_t *dma_handle, gfp_t flag,
575 unsigned long attrs)
576{
577 if (iommu_fixed_is_weak)
578 return iommu_alloc_coherent(dev, cell_get_iommu_table(dev),
579 size, dma_handle,
580 device_to_mask(dev), flag,
581 dev_to_node(dev));
582 else
583 return dma_nommu_ops.alloc(dev, size, dma_handle, flag,
584 attrs);
585}
586
587static void dma_fixed_free_coherent(struct device *dev, size_t size,
588 void *vaddr, dma_addr_t dma_handle,
589 unsigned long attrs)
590{
591 if (iommu_fixed_is_weak)
592 iommu_free_coherent(cell_get_iommu_table(dev), size, vaddr,
593 dma_handle);
594 else
595 dma_nommu_ops.free(dev, size, vaddr, dma_handle, attrs);
596}
597
598static dma_addr_t dma_fixed_map_page(struct device *dev, struct page *page,
599 unsigned long offset, size_t size,
600 enum dma_data_direction direction,
601 unsigned long attrs)
602{
603 if (iommu_fixed_is_weak == (attrs & DMA_ATTR_WEAK_ORDERING))
604 return dma_nommu_ops.map_page(dev, page, offset, size,
605 direction, attrs);
606 else
607 return iommu_map_page(dev, cell_get_iommu_table(dev), page,
608 offset, size, device_to_mask(dev),
609 direction, attrs);
610}
611
612static void dma_fixed_unmap_page(struct device *dev, dma_addr_t dma_addr,
613 size_t size, enum dma_data_direction direction,
614 unsigned long attrs)
615{
616 if (iommu_fixed_is_weak == (attrs & DMA_ATTR_WEAK_ORDERING))
617 dma_nommu_ops.unmap_page(dev, dma_addr, size, direction,
618 attrs);
619 else
620 iommu_unmap_page(cell_get_iommu_table(dev), dma_addr, size,
621 direction, attrs);
622}
623
624static int dma_fixed_map_sg(struct device *dev, struct scatterlist *sg,
625 int nents, enum dma_data_direction direction,
626 unsigned long attrs)
627{
628 if (iommu_fixed_is_weak == (attrs & DMA_ATTR_WEAK_ORDERING))
629 return dma_nommu_ops.map_sg(dev, sg, nents, direction, attrs);
630 else
631 return ppc_iommu_map_sg(dev, cell_get_iommu_table(dev), sg,
632 nents, device_to_mask(dev),
633 direction, attrs);
634}
635
636static void dma_fixed_unmap_sg(struct device *dev, struct scatterlist *sg,
637 int nents, enum dma_data_direction direction,
638 unsigned long attrs)
639{
640 if (iommu_fixed_is_weak == (attrs & DMA_ATTR_WEAK_ORDERING))
641 dma_nommu_ops.unmap_sg(dev, sg, nents, direction, attrs);
642 else
643 ppc_iommu_unmap_sg(cell_get_iommu_table(dev), sg, nents,
644 direction, attrs);
645}
646
647static int dma_suported_and_switch(struct device *dev, u64 dma_mask);
648
649static const struct dma_map_ops dma_iommu_fixed_ops = {
650 .alloc = dma_fixed_alloc_coherent,
651 .free = dma_fixed_free_coherent,
652 .map_sg = dma_fixed_map_sg,
653 .unmap_sg = dma_fixed_unmap_sg,
654 .dma_supported = dma_suported_and_switch,
655 .map_page = dma_fixed_map_page,
656 .unmap_page = dma_fixed_unmap_page,
657};
658 573
659static void cell_dma_dev_setup(struct device *dev) 574static void cell_dma_dev_setup(struct device *dev)
660{ 575{
661 if (get_pci_dma_ops() == &dma_iommu_ops) 576 if (cell_iommu_enabled) {
577 u64 addr = cell_iommu_get_fixed_address(dev);
578
579 if (addr != OF_BAD_ADDR)
580 dev->archdata.dma_offset = addr + dma_iommu_fixed_base;
662 set_iommu_table_base(dev, cell_get_iommu_table(dev)); 581 set_iommu_table_base(dev, cell_get_iommu_table(dev));
663 else if (get_pci_dma_ops() == &dma_nommu_ops) 582 } else {
664 set_dma_offset(dev, cell_dma_nommu_offset); 583 dev->archdata.dma_offset = cell_dma_nommu_offset;
665 else 584 }
666 BUG();
667} 585}
668 586
669static void cell_pci_dma_dev_setup(struct pci_dev *dev) 587static void cell_pci_dma_dev_setup(struct pci_dev *dev)
@@ -680,11 +598,9 @@ static int cell_of_bus_notify(struct notifier_block *nb, unsigned long action,
680 if (action != BUS_NOTIFY_ADD_DEVICE) 598 if (action != BUS_NOTIFY_ADD_DEVICE)
681 return 0; 599 return 0;
682 600
683 /* We use the PCI DMA ops */ 601 if (cell_iommu_enabled)
684 dev->dma_ops = get_pci_dma_ops(); 602 dev->dma_ops = &dma_iommu_ops;
685
686 cell_dma_dev_setup(dev); 603 cell_dma_dev_setup(dev);
687
688 return 0; 604 return 0;
689} 605}
690 606
@@ -809,7 +725,6 @@ static int __init cell_iommu_init_disabled(void)
809 unsigned long base = 0, size; 725 unsigned long base = 0, size;
810 726
811 /* When no iommu is present, we use direct DMA ops */ 727 /* When no iommu is present, we use direct DMA ops */
812 set_pci_dma_ops(&dma_nommu_ops);
813 728
814 /* First make sure all IOC translation is turned off */ 729 /* First make sure all IOC translation is turned off */
815 cell_disable_iommus(); 730 cell_disable_iommus();
@@ -894,7 +809,11 @@ static u64 cell_iommu_get_fixed_address(struct device *dev)
894 const u32 *ranges = NULL; 809 const u32 *ranges = NULL;
895 int i, len, best, naddr, nsize, pna, range_size; 810 int i, len, best, naddr, nsize, pna, range_size;
896 811
812 /* We can be called for platform devices that have no of_node */
897 np = of_node_get(dev->of_node); 813 np = of_node_get(dev->of_node);
814 if (!np)
815 goto out;
816
898 while (1) { 817 while (1) {
899 naddr = of_n_addr_cells(np); 818 naddr = of_n_addr_cells(np);
900 nsize = of_n_size_cells(np); 819 nsize = of_n_size_cells(np);
@@ -945,27 +864,10 @@ out:
945 return dev_addr; 864 return dev_addr;
946} 865}
947 866
948static int dma_suported_and_switch(struct device *dev, u64 dma_mask) 867static bool cell_pci_iommu_bypass_supported(struct pci_dev *pdev, u64 mask)
949{ 868{
950 if (dma_mask == DMA_BIT_MASK(64) && 869 return mask == DMA_BIT_MASK(64) &&
951 cell_iommu_get_fixed_address(dev) != OF_BAD_ADDR) { 870 cell_iommu_get_fixed_address(&pdev->dev) != OF_BAD_ADDR;
952 u64 addr = cell_iommu_get_fixed_address(dev) +
953 dma_iommu_fixed_base;
954 dev_dbg(dev, "iommu: 64-bit OK, using fixed ops\n");
955 dev_dbg(dev, "iommu: fixed addr = %llx\n", addr);
956 set_dma_ops(dev, &dma_iommu_fixed_ops);
957 set_dma_offset(dev, addr);
958 return 1;
959 }
960
961 if (dma_iommu_dma_supported(dev, dma_mask)) {
962 dev_dbg(dev, "iommu: not 64-bit, using default ops\n");
963 set_dma_ops(dev, get_pci_dma_ops());
964 cell_dma_dev_setup(dev);
965 return 1;
966 }
967
968 return 0;
969} 871}
970 872
971static void insert_16M_pte(unsigned long addr, unsigned long *ptab, 873static void insert_16M_pte(unsigned long addr, unsigned long *ptab,
@@ -1119,9 +1021,8 @@ static int __init cell_iommu_fixed_mapping_init(void)
1119 cell_iommu_setup_window(iommu, np, dbase, dsize, 0); 1021 cell_iommu_setup_window(iommu, np, dbase, dsize, 0);
1120 } 1022 }
1121 1023
1122 dma_iommu_ops.dma_supported = dma_suported_and_switch; 1024 cell_pci_controller_ops.iommu_bypass_supported =
1123 set_pci_dma_ops(&dma_iommu_ops); 1025 cell_pci_iommu_bypass_supported;
1124
1125 return 0; 1026 return 0;
1126} 1027}
1127 1028
@@ -1142,7 +1043,7 @@ static int __init setup_iommu_fixed(char *str)
1142 pciep = of_find_node_by_type(NULL, "pcie-endpoint"); 1043 pciep = of_find_node_by_type(NULL, "pcie-endpoint");
1143 1044
1144 if (strcmp(str, "weak") == 0 || (pciep && strcmp(str, "strong") != 0)) 1045 if (strcmp(str, "weak") == 0 || (pciep && strcmp(str, "strong") != 0))
1145 iommu_fixed_is_weak = DMA_ATTR_WEAK_ORDERING; 1046 iommu_fixed_is_weak = true;
1146 1047
1147 of_node_put(pciep); 1048 of_node_put(pciep);
1148 1049
@@ -1150,26 +1051,6 @@ static int __init setup_iommu_fixed(char *str)
1150} 1051}
1151__setup("iommu_fixed=", setup_iommu_fixed); 1052__setup("iommu_fixed=", setup_iommu_fixed);
1152 1053
1153static u64 cell_dma_get_required_mask(struct device *dev)
1154{
1155 const struct dma_map_ops *dma_ops;
1156
1157 if (!dev->dma_mask)
1158 return 0;
1159
1160 if (!iommu_fixed_disabled &&
1161 cell_iommu_get_fixed_address(dev) != OF_BAD_ADDR)
1162 return DMA_BIT_MASK(64);
1163
1164 dma_ops = get_dma_ops(dev);
1165 if (dma_ops->get_required_mask)
1166 return dma_ops->get_required_mask(dev);
1167
1168 WARN_ONCE(1, "no get_required_mask in %p ops", dma_ops);
1169
1170 return DMA_BIT_MASK(64);
1171}
1172
1173static int __init cell_iommu_init(void) 1054static int __init cell_iommu_init(void)
1174{ 1055{
1175 struct device_node *np; 1056 struct device_node *np;
@@ -1186,10 +1067,9 @@ static int __init cell_iommu_init(void)
1186 1067
1187 /* Setup various callbacks */ 1068 /* Setup various callbacks */
1188 cell_pci_controller_ops.dma_dev_setup = cell_pci_dma_dev_setup; 1069 cell_pci_controller_ops.dma_dev_setup = cell_pci_dma_dev_setup;
1189 ppc_md.dma_get_required_mask = cell_dma_get_required_mask;
1190 1070
1191 if (!iommu_fixed_disabled && cell_iommu_fixed_mapping_init() == 0) 1071 if (!iommu_fixed_disabled && cell_iommu_fixed_mapping_init() == 0)
1192 goto bail; 1072 goto done;
1193 1073
1194 /* Create an iommu for each /axon node. */ 1074 /* Create an iommu for each /axon node. */
1195 for_each_node_by_name(np, "axon") { 1075 for_each_node_by_name(np, "axon") {
@@ -1206,10 +1086,10 @@ static int __init cell_iommu_init(void)
1206 continue; 1086 continue;
1207 cell_iommu_init_one(np, SPIDER_DMA_OFFSET); 1087 cell_iommu_init_one(np, SPIDER_DMA_OFFSET);
1208 } 1088 }
1209 1089 done:
1210 /* Setup default PCI iommu ops */ 1090 /* Setup default PCI iommu ops */
1211 set_pci_dma_ops(&dma_iommu_ops); 1091 set_pci_dma_ops(&dma_iommu_ops);
1212 1092 cell_iommu_enabled = true;
1213 bail: 1093 bail:
1214 /* Register callbacks on OF platform device addition/removal 1094 /* Register callbacks on OF platform device addition/removal
1215 * to handle linking them to the right DMA operations 1095 * to handle linking them to the right DMA operations
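
Rather than swapping each device between dma_iommu_fixed_ops and the nommu ops at runtime, the platform now answers one question through the controller ops: may this device bypass translation for 64-bit DMA. A sketch of how the generic dma_iommu code is expected to consult the hook (helper name invented; the real call site is more involved):

	static bool iommu_bypass_ok(struct pci_dev *pdev, u64 dma_mask)
	{
		struct pci_controller *phb = pci_bus_to_host(pdev->bus);

		return phb->controller_ops.iommu_bypass_supported &&
		       phb->controller_ops.iommu_bypass_supported(pdev, dma_mask);
	}
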
diff --git a/arch/powerpc/platforms/cell/spu_callbacks.c b/arch/powerpc/platforms/cell/spu_callbacks.c
index 125f2a5f02de..b5f35cbe9e21 100644
--- a/arch/powerpc/platforms/cell/spu_callbacks.c
+++ b/arch/powerpc/platforms/cell/spu_callbacks.c
@@ -34,7 +34,7 @@
34 */ 34 */
35 35
36static void *spu_syscall_table[] = { 36static void *spu_syscall_table[] = {
37#define __SYSCALL(nr, entry, nargs) entry, 37#define __SYSCALL(nr, entry) entry,
38#include <asm/syscall_table_spu.h> 38#include <asm/syscall_table_spu.h>
39#undef __SYSCALL 39#undef __SYSCALL
40}; 40};
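
syscall_table_spu.h is an X-macro list, so redefining __SYSCALL() (now two arguments, since the generated tables dropped the nargs column) turns the list into an array initializer. Illustrative expansion with made-up entries:

	/* If the generated header contains:
	 *	__SYSCALL(0, sys_restart_syscall)
	 *	__SYSCALL(1, sys_exit)
	 * the table above expands to:
	 *	static void *spu_syscall_table[] = {
	 *		sys_restart_syscall,
	 *		sys_exit,
	 *	};
	 */
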
diff --git a/arch/powerpc/platforms/cell/spu_syscalls.c b/arch/powerpc/platforms/cell/spu_syscalls.c
index 263413a34823..b95d6afc39b5 100644
--- a/arch/powerpc/platforms/cell/spu_syscalls.c
+++ b/arch/powerpc/platforms/cell/spu_syscalls.c
@@ -26,7 +26,6 @@
26#include <linux/syscalls.h> 26#include <linux/syscalls.h>
27#include <linux/rcupdate.h> 27#include <linux/rcupdate.h>
28#include <linux/binfmts.h> 28#include <linux/binfmts.h>
29#include <linux/syscalls.h>
30 29
31#include <asm/spu.h> 30#include <asm/spu.h>
32 31
diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c
index ae8123edddc6..48c2477e7e2a 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -2338,9 +2338,8 @@ static int spufs_switch_log_open(struct inode *inode, struct file *file)
2338 goto out; 2338 goto out;
2339 } 2339 }
2340 2340
2341 ctx->switch_log = kmalloc(sizeof(struct switch_log) + 2341 ctx->switch_log = kmalloc(struct_size(ctx->switch_log, log,
2342 SWITCH_LOG_BUFSIZE * sizeof(struct switch_log_entry), 2342 SWITCH_LOG_BUFSIZE), GFP_KERNEL);
2343 GFP_KERNEL);
2344 2343
2345 if (!ctx->switch_log) { 2344 if (!ctx->switch_log) {
2346 rc = -ENOMEM; 2345 rc = -ENOMEM;
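struct_size(p, member, n) from <linux/overflow.h> evaluates to sizeof(*p) + n * sizeof(p->member[0]) with overflow checking that saturates to SIZE_MAX, so an oversized count makes the allocation fail rather than silently wrap. A sketch of the pattern on a hypothetical flexible-array struct:

	#include <linux/overflow.h>
	#include <linux/slab.h>

	struct log_buf {
		unsigned long head, tail;
		u64 entry[];			/* flexible array member */
	};

	static struct log_buf *alloc_log_buf(size_t n)
	{
		struct log_buf *p;

		p = kmalloc(struct_size(p, entry, n), GFP_KERNEL);
		return p;	/* NULL on allocation failure or size overflow */
	}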
diff --git a/arch/powerpc/platforms/embedded6xx/wii.c b/arch/powerpc/platforms/embedded6xx/wii.c
index ecf703ee3a76..235fe81aa2b1 100644
--- a/arch/powerpc/platforms/embedded6xx/wii.c
+++ b/arch/powerpc/platforms/embedded6xx/wii.c
@@ -54,10 +54,6 @@
54static void __iomem *hw_ctrl; 54static void __iomem *hw_ctrl;
55static void __iomem *hw_gpio; 55static void __iomem *hw_gpio;
56 56
57unsigned long wii_hole_start;
58unsigned long wii_hole_size;
59
60
61static int __init page_aligned(unsigned long x) 57static int __init page_aligned(unsigned long x)
62{ 58{
63 return !(x & (PAGE_SIZE-1)); 59 return !(x & (PAGE_SIZE-1));
@@ -69,26 +65,6 @@ void __init wii_memory_fixups(void)
69 65
70 BUG_ON(memblock.memory.cnt != 2); 66 BUG_ON(memblock.memory.cnt != 2);
71 BUG_ON(!page_aligned(p[0].base) || !page_aligned(p[1].base)); 67 BUG_ON(!page_aligned(p[0].base) || !page_aligned(p[1].base));
72
73 /* determine hole */
74 wii_hole_start = ALIGN(p[0].base + p[0].size, PAGE_SIZE);
75 wii_hole_size = p[1].base - wii_hole_start;
76}
77
78unsigned long __init wii_mmu_mapin_mem2(unsigned long top)
79{
80 unsigned long delta, size, bl;
81 unsigned long max_size = (256<<20);
82
83 /* MEM2 64MB@0x10000000 */
84 delta = wii_hole_start + wii_hole_size;
85 size = top - delta;
86 for (bl = 128<<10; bl < max_size; bl <<= 1) {
87 if (bl * 2 > size)
88 break;
89 }
90 setbat(4, PAGE_OFFSET+delta, delta, bl, PAGE_KERNEL_X);
91 return delta + bl;
92} 68}
93 69
94static void __noreturn wii_spin(void) 70static void __noreturn wii_spin(void)
diff --git a/arch/powerpc/platforms/pasemi/iommu.c b/arch/powerpc/platforms/pasemi/iommu.c
index f62930f839ca..86368e238f6e 100644
--- a/arch/powerpc/platforms/pasemi/iommu.c
+++ b/arch/powerpc/platforms/pasemi/iommu.c
@@ -186,7 +186,7 @@ static void pci_dma_dev_setup_pasemi(struct pci_dev *dev)
186 */ 186 */
187 if (dev->vendor == 0x1959 && dev->device == 0xa007 && 187 if (dev->vendor == 0x1959 && dev->device == 0xa007 &&
188 !firmware_has_feature(FW_FEATURE_LPAR)) { 188 !firmware_has_feature(FW_FEATURE_LPAR)) {
189 dev->dev.dma_ops = &dma_nommu_ops; 189 dev->dev.dma_ops = NULL;
190 /* 190 /*
191 * Set the coherent DMA mask to prevent the iommu 191 * Set the coherent DMA mask to prevent the iommu
192 * being used unnecessarily 192 * being used unnecessarily
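With the common-code DMA conversion, a NULL dev->dma_ops selects the generic direct mapping, so the explicit dma_nommu_ops assignment becomes a cleared pointer. A simplified sketch of the dispatch this relies on (the exact inline helpers vary by kernel version):

	#include <linux/dma-mapping.h>

	static dma_addr_t map_page_sketch(struct device *dev, struct page *page,
					  unsigned long offset, size_t size,
					  enum dma_data_direction dir)
	{
		const struct dma_map_ops *ops = get_dma_ops(dev);

		if (!ops)	/* no per-device ops: generic dma-direct path */
			return dma_direct_map_page(dev, page, offset, size, dir, 0);
		return ops->map_page(dev, page, offset, size, dir, 0);
	}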
diff --git a/arch/powerpc/platforms/pasemi/setup.c b/arch/powerpc/platforms/pasemi/setup.c
index c0532999f854..46dd463faaa7 100644
--- a/arch/powerpc/platforms/pasemi/setup.c
+++ b/arch/powerpc/platforms/pasemi/setup.c
@@ -411,55 +411,6 @@ out:
411 return !!(srr1 & 0x2); 411 return !!(srr1 & 0x2);
412} 412}
413 413
414#ifdef CONFIG_PCMCIA
415static int pcmcia_notify(struct notifier_block *nb, unsigned long action,
416 void *data)
417{
418 struct device *dev = data;
419 struct device *parent;
420 struct pcmcia_device *pdev = to_pcmcia_dev(dev);
421
 422	/* We are only interested in device addition */
423 if (action != BUS_NOTIFY_ADD_DEVICE)
424 return 0;
425
426 parent = pdev->socket->dev.parent;
427
428 /* We know electra_cf devices will always have of_node set, since
429 * electra_cf is an of_platform driver.
430 */
431 if (!parent->of_node)
432 return 0;
433
434 if (!of_device_is_compatible(parent->of_node, "electra-cf"))
435 return 0;
436
437 /* We use the direct ops for localbus */
438 dev->dma_ops = &dma_nommu_ops;
439
440 return 0;
441}
442
443static struct notifier_block pcmcia_notifier = {
444 .notifier_call = pcmcia_notify,
445};
446
447static inline void pasemi_pcmcia_init(void)
448{
449 extern struct bus_type pcmcia_bus_type;
450
451 bus_register_notifier(&pcmcia_bus_type, &pcmcia_notifier);
452}
453
454#else
455
456static inline void pasemi_pcmcia_init(void)
457{
458}
459
460#endif
461
462
463static const struct of_device_id pasemi_bus_ids[] = { 414static const struct of_device_id pasemi_bus_ids[] = {
464 /* Unfortunately needed for legacy firmwares */ 415 /* Unfortunately needed for legacy firmwares */
465 { .type = "localbus", }, 416 { .type = "localbus", },
@@ -472,8 +423,6 @@ static const struct of_device_id pasemi_bus_ids[] = {
472 423
473static int __init pasemi_publish_devices(void) 424static int __init pasemi_publish_devices(void)
474{ 425{
475 pasemi_pcmcia_init();
476
477 /* Publish OF platform devices for SDC and other non-PCI devices */ 426 /* Publish OF platform devices for SDC and other non-PCI devices */
478 of_platform_bus_probe(NULL, pasemi_bus_ids, NULL); 427 of_platform_bus_probe(NULL, pasemi_bus_ids, NULL);
479 428
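The deleted notifier was a textbook use of driver-core bus notifiers: watch a bus type for device additions and adjust per-device state before a driver binds. A minimal sketch of that pattern with hypothetical names:

	#include <linux/device.h>
	#include <linux/notifier.h>

	static int example_notify(struct notifier_block *nb, unsigned long action,
				  void *data)
	{
		struct device *dev = data;

		if (action != BUS_NOTIFY_ADD_DEVICE)
			return 0;		/* only additions matter here */

		dev_info(dev, "new device on the bus\n");
		return 0;
	}

	static struct notifier_block example_nb = {
		.notifier_call = example_notify,
	};

	/* registered once at init, e.g.:
	 *	bus_register_notifier(&pci_bus_type, &example_nb);
	 */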
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index b540ce8eec55..da2e99efbd04 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -1,6 +1,6 @@
1# SPDX-License-Identifier: GPL-2.0 1# SPDX-License-Identifier: GPL-2.0
2obj-y += setup.o opal-wrappers.o opal.o opal-async.o idle.o 2obj-y += setup.o opal-call.o opal-wrappers.o opal.o opal-async.o
3obj-y += opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o 3obj-y += idle.o opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o
4obj-y += rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o 4obj-y += rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o
5obj-y += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o 5obj-y += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o
6obj-y += opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o 6obj-y += opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o
@@ -11,7 +11,6 @@ obj-$(CONFIG_CXL_BASE) += pci-cxl.o
11obj-$(CONFIG_EEH) += eeh-powernv.o 11obj-$(CONFIG_EEH) += eeh-powernv.o
12obj-$(CONFIG_PPC_SCOM) += opal-xscom.o 12obj-$(CONFIG_PPC_SCOM) += opal-xscom.o
13obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o 13obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o
14obj-$(CONFIG_TRACEPOINTS) += opal-tracepoints.o
15obj-$(CONFIG_OPAL_PRD) += opal-prd.o 14obj-$(CONFIG_OPAL_PRD) += opal-prd.o
16obj-$(CONFIG_PERF_EVENTS) += opal-imc.o 15obj-$(CONFIG_PERF_EVENTS) += opal-imc.o
17obj-$(CONFIG_PPC_MEMTRACE) += memtrace.o 16obj-$(CONFIG_PPC_MEMTRACE) += memtrace.o
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 35f699ebb662..e52f9b06dd9c 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -458,7 +458,8 @@ EXPORT_SYMBOL_GPL(pnv_power9_force_smt4_release);
458#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ 458#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
459 459
460#ifdef CONFIG_HOTPLUG_CPU 460#ifdef CONFIG_HOTPLUG_CPU
461static void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val) 461
462void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val)
462{ 463{
463 u64 pir = get_hard_smp_processor_id(cpu); 464 u64 pir = get_hard_smp_processor_id(cpu);
464 465
@@ -481,20 +482,6 @@ unsigned long pnv_cpu_offline(unsigned int cpu)
481{ 482{
482 unsigned long srr1; 483 unsigned long srr1;
483 u32 idle_states = pnv_get_supported_cpuidle_states(); 484 u32 idle_states = pnv_get_supported_cpuidle_states();
484 u64 lpcr_val;
485
486 /*
487 * We don't want to take decrementer interrupts while we are
488 * offline, so clear LPCR:PECE1. We keep PECE2 (and
 489 * LPCR_PECE_HVEE on P9) enabled so as to let IPIs in.
490 *
491 * If the CPU gets woken up by a special wakeup, ensure that
492 * the SLW engine sets LPCR with decrementer bit cleared, else
493 * the CPU will come back to the kernel due to a spurious
494 * wakeup.
495 */
496 lpcr_val = mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1;
497 pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val);
498 485
499 __ppc64_runlatch_off(); 486 __ppc64_runlatch_off();
500 487
@@ -526,16 +513,6 @@ unsigned long pnv_cpu_offline(unsigned int cpu)
526 513
527 __ppc64_runlatch_on(); 514 __ppc64_runlatch_on();
528 515
529 /*
530 * Re-enable decrementer interrupts in LPCR.
531 *
532 * Further, we want stop states to be woken up by decrementer
533 * for non-hotplug cases. So program the LPCR via stop api as
534 * well.
535 */
536 lpcr_val = mfspr(SPRN_LPCR) | (u64)LPCR_PECE1;
537 pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val);
538
539 return srr1; 516 return srr1;
540} 517}
541#endif 518#endif
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index 3f58c7dbd581..dc23d9d2a7d9 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -28,10 +28,6 @@
28 */ 28 */
29static DEFINE_SPINLOCK(npu_context_lock); 29static DEFINE_SPINLOCK(npu_context_lock);
30 30
31/*
32 * Other types of TCE cache invalidation are not functional in the
33 * hardware.
34 */
35static struct pci_dev *get_pci_dev(struct device_node *dn) 31static struct pci_dev *get_pci_dev(struct device_node *dn)
36{ 32{
37 struct pci_dn *pdn = PCI_DN(dn); 33 struct pci_dn *pdn = PCI_DN(dn);
@@ -220,7 +216,7 @@ static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe)
220 * their parent device so drivers shouldn't be doing DMA 216 * their parent device so drivers shouldn't be doing DMA
221 * operations directly on these devices. 217 * operations directly on these devices.
222 */ 218 */
223 set_dma_ops(&npe->pdev->dev, NULL); 219 set_dma_ops(&npe->pdev->dev, &dma_dummy_ops);
224} 220}
225 221
226/* 222/*
@@ -917,15 +913,6 @@ static void pnv_npu2_mn_release(struct mmu_notifier *mn,
917 mmio_invalidate(npu_context, 0, ~0UL); 913 mmio_invalidate(npu_context, 0, ~0UL);
918} 914}
919 915
920static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn,
921 struct mm_struct *mm,
922 unsigned long address,
923 pte_t pte)
924{
925 struct npu_context *npu_context = mn_to_npu_context(mn);
926 mmio_invalidate(npu_context, address, PAGE_SIZE);
927}
928
929static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn, 916static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
930 struct mm_struct *mm, 917 struct mm_struct *mm,
931 unsigned long start, unsigned long end) 918 unsigned long start, unsigned long end)
@@ -936,7 +923,6 @@ static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
936 923
937static const struct mmu_notifier_ops nv_nmmu_notifier_ops = { 924static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
938 .release = pnv_npu2_mn_release, 925 .release = pnv_npu2_mn_release,
939 .change_pte = pnv_npu2_mn_change_pte,
940 .invalidate_range = pnv_npu2_mn_invalidate_range, 926 .invalidate_range = pnv_npu2_mn_invalidate_range,
941}; 927};
942 928
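The NPU stub devices must never do DMA themselves, and a NULL ops pointer would now mean dma-direct, so they get dma_dummy_ops instead: an ops table whose callbacks always fail. A sketch of what such a table looks like, patterned on kernel/dma/dummy.c:

	#include <linux/dma-mapping.h>

	static dma_addr_t dummy_map_page(struct device *dev, struct page *page,
					 unsigned long offset, size_t size,
					 enum dma_data_direction dir,
					 unsigned long attrs)
	{
		return DMA_MAPPING_ERROR;	/* every mapping attempt fails */
	}

	static int dummy_supported(struct device *hwdev, u64 mask)
	{
		return 0;			/* no DMA mask is supported */
	}

	static const struct dma_map_ops example_dummy_ops = {
		.map_page	= dummy_map_page,
		.dma_supported	= dummy_supported,
	};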
diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c
new file mode 100644
index 000000000000..578757d403ab
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-call.c
@@ -0,0 +1,283 @@
1// SPDX-License-Identifier: GPL-2.0
2#include <linux/percpu.h>
3#include <linux/jump_label.h>
4#include <asm/opal-api.h>
5#include <asm/trace.h>
6#include <asm/asm-prototypes.h>
7
8#ifdef CONFIG_TRACEPOINTS
9/*
10 * Since the tracing code might execute OPAL calls we need to guard against
11 * recursion.
12 */
13static DEFINE_PER_CPU(unsigned int, opal_trace_depth);
14
15static void __trace_opal_entry(s64 a0, s64 a1, s64 a2, s64 a3,
16 s64 a4, s64 a5, s64 a6, s64 a7,
17 unsigned long opcode)
18{
19 unsigned int *depth;
20 unsigned long args[8];
21
22 depth = this_cpu_ptr(&opal_trace_depth);
23
24 if (*depth)
25 return;
26
27 args[0] = a0;
28 args[1] = a1;
29 args[2] = a2;
30 args[3] = a3;
31 args[4] = a4;
32 args[5] = a5;
33 args[6] = a6;
34 args[7] = a7;
35
36 (*depth)++;
37 trace_opal_entry(opcode, &args[0]);
38 (*depth)--;
39}
40
41static void __trace_opal_exit(unsigned long opcode, unsigned long retval)
42{
43 unsigned int *depth;
44
45 depth = this_cpu_ptr(&opal_trace_depth);
46
47 if (*depth)
48 return;
49
50 (*depth)++;
51 trace_opal_exit(opcode, retval);
52 (*depth)--;
53}
54
55static DEFINE_STATIC_KEY_FALSE(opal_tracepoint_key);
56
57int opal_tracepoint_regfunc(void)
58{
59 static_branch_inc(&opal_tracepoint_key);
60 return 0;
61}
62
63void opal_tracepoint_unregfunc(void)
64{
65 static_branch_dec(&opal_tracepoint_key);
66}
67
68static s64 __opal_call_trace(s64 a0, s64 a1, s64 a2, s64 a3,
69 s64 a4, s64 a5, s64 a6, s64 a7,
70 unsigned long opcode, unsigned long msr)
71{
72 s64 ret;
73
74 __trace_opal_entry(a0, a1, a2, a3, a4, a5, a6, a7, opcode);
75 ret = __opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr);
76 __trace_opal_exit(opcode, ret);
77
78 return ret;
79}
80
81#define DO_TRACE (static_branch_unlikely(&opal_tracepoint_key))
82
83#else /* CONFIG_TRACEPOINTS */
84
85static s64 __opal_call_trace(s64 a0, s64 a1, s64 a2, s64 a3,
86 s64 a4, s64 a5, s64 a6, s64 a7,
87 unsigned long opcode, unsigned long msr)
88{
 89	return 0;	/* never called: DO_TRACE is false without tracepoints */
 90}
91#define DO_TRACE false
92#endif /* CONFIG_TRACEPOINTS */
93
94static int64_t opal_call(int64_t a0, int64_t a1, int64_t a2, int64_t a3,
95 int64_t a4, int64_t a5, int64_t a6, int64_t a7, int64_t opcode)
96{
97 unsigned long flags;
98 unsigned long msr = mfmsr();
99 bool mmu = (msr & (MSR_IR|MSR_DR));
100 int64_t ret;
101
102 msr &= ~MSR_EE;
103
104 if (unlikely(!mmu))
105 return __opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr);
106
107 local_save_flags(flags);
108 hard_irq_disable();
109
110 if (DO_TRACE) {
111 ret = __opal_call_trace(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr);
112 } else {
113 ret = __opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr);
114 }
115
116 local_irq_restore(flags);
117
118 return ret;
119}
120
121#define OPAL_CALL(name, opcode) \
122int64_t name(int64_t a0, int64_t a1, int64_t a2, int64_t a3, \
123 int64_t a4, int64_t a5, int64_t a6, int64_t a7) \
124{ \
125 return opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode); \
126}
127
128OPAL_CALL(opal_invalid_call, OPAL_INVALID_CALL);
129OPAL_CALL(opal_console_write, OPAL_CONSOLE_WRITE);
130OPAL_CALL(opal_console_read, OPAL_CONSOLE_READ);
131OPAL_CALL(opal_console_write_buffer_space, OPAL_CONSOLE_WRITE_BUFFER_SPACE);
132OPAL_CALL(opal_rtc_read, OPAL_RTC_READ);
133OPAL_CALL(opal_rtc_write, OPAL_RTC_WRITE);
134OPAL_CALL(opal_cec_power_down, OPAL_CEC_POWER_DOWN);
135OPAL_CALL(opal_cec_reboot, OPAL_CEC_REBOOT);
136OPAL_CALL(opal_cec_reboot2, OPAL_CEC_REBOOT2);
137OPAL_CALL(opal_read_nvram, OPAL_READ_NVRAM);
138OPAL_CALL(opal_write_nvram, OPAL_WRITE_NVRAM);
139OPAL_CALL(opal_handle_interrupt, OPAL_HANDLE_INTERRUPT);
140OPAL_CALL(opal_poll_events, OPAL_POLL_EVENTS);
141OPAL_CALL(opal_pci_set_hub_tce_memory, OPAL_PCI_SET_HUB_TCE_MEMORY);
142OPAL_CALL(opal_pci_set_phb_tce_memory, OPAL_PCI_SET_PHB_TCE_MEMORY);
143OPAL_CALL(opal_pci_config_read_byte, OPAL_PCI_CONFIG_READ_BYTE);
144OPAL_CALL(opal_pci_config_read_half_word, OPAL_PCI_CONFIG_READ_HALF_WORD);
145OPAL_CALL(opal_pci_config_read_word, OPAL_PCI_CONFIG_READ_WORD);
146OPAL_CALL(opal_pci_config_write_byte, OPAL_PCI_CONFIG_WRITE_BYTE);
147OPAL_CALL(opal_pci_config_write_half_word, OPAL_PCI_CONFIG_WRITE_HALF_WORD);
148OPAL_CALL(opal_pci_config_write_word, OPAL_PCI_CONFIG_WRITE_WORD);
149OPAL_CALL(opal_set_xive, OPAL_SET_XIVE);
150OPAL_CALL(opal_get_xive, OPAL_GET_XIVE);
151OPAL_CALL(opal_register_exception_handler, OPAL_REGISTER_OPAL_EXCEPTION_HANDLER);
152OPAL_CALL(opal_pci_eeh_freeze_status, OPAL_PCI_EEH_FREEZE_STATUS);
153OPAL_CALL(opal_pci_eeh_freeze_clear, OPAL_PCI_EEH_FREEZE_CLEAR);
154OPAL_CALL(opal_pci_eeh_freeze_set, OPAL_PCI_EEH_FREEZE_SET);
155OPAL_CALL(opal_pci_err_inject, OPAL_PCI_ERR_INJECT);
156OPAL_CALL(opal_pci_shpc, OPAL_PCI_SHPC);
157OPAL_CALL(opal_pci_phb_mmio_enable, OPAL_PCI_PHB_MMIO_ENABLE);
158OPAL_CALL(opal_pci_set_phb_mem_window, OPAL_PCI_SET_PHB_MEM_WINDOW);
159OPAL_CALL(opal_pci_map_pe_mmio_window, OPAL_PCI_MAP_PE_MMIO_WINDOW);
160OPAL_CALL(opal_pci_set_phb_table_memory, OPAL_PCI_SET_PHB_TABLE_MEMORY);
161OPAL_CALL(opal_pci_set_pe, OPAL_PCI_SET_PE);
162OPAL_CALL(opal_pci_set_peltv, OPAL_PCI_SET_PELTV);
163OPAL_CALL(opal_pci_set_mve, OPAL_PCI_SET_MVE);
164OPAL_CALL(opal_pci_set_mve_enable, OPAL_PCI_SET_MVE_ENABLE);
165OPAL_CALL(opal_pci_get_xive_reissue, OPAL_PCI_GET_XIVE_REISSUE);
166OPAL_CALL(opal_pci_set_xive_reissue, OPAL_PCI_SET_XIVE_REISSUE);
167OPAL_CALL(opal_pci_set_xive_pe, OPAL_PCI_SET_XIVE_PE);
168OPAL_CALL(opal_get_xive_source, OPAL_GET_XIVE_SOURCE);
169OPAL_CALL(opal_get_msi_32, OPAL_GET_MSI_32);
170OPAL_CALL(opal_get_msi_64, OPAL_GET_MSI_64);
171OPAL_CALL(opal_start_cpu, OPAL_START_CPU);
172OPAL_CALL(opal_query_cpu_status, OPAL_QUERY_CPU_STATUS);
173OPAL_CALL(opal_write_oppanel, OPAL_WRITE_OPPANEL);
174OPAL_CALL(opal_pci_map_pe_dma_window, OPAL_PCI_MAP_PE_DMA_WINDOW);
175OPAL_CALL(opal_pci_map_pe_dma_window_real, OPAL_PCI_MAP_PE_DMA_WINDOW_REAL);
176OPAL_CALL(opal_pci_reset, OPAL_PCI_RESET);
177OPAL_CALL(opal_pci_get_hub_diag_data, OPAL_PCI_GET_HUB_DIAG_DATA);
178OPAL_CALL(opal_pci_get_phb_diag_data, OPAL_PCI_GET_PHB_DIAG_DATA);
179OPAL_CALL(opal_pci_fence_phb, OPAL_PCI_FENCE_PHB);
180OPAL_CALL(opal_pci_reinit, OPAL_PCI_REINIT);
181OPAL_CALL(opal_pci_mask_pe_error, OPAL_PCI_MASK_PE_ERROR);
182OPAL_CALL(opal_set_slot_led_status, OPAL_SET_SLOT_LED_STATUS);
183OPAL_CALL(opal_get_epow_status, OPAL_GET_EPOW_STATUS);
184OPAL_CALL(opal_get_dpo_status, OPAL_GET_DPO_STATUS);
185OPAL_CALL(opal_set_system_attention_led, OPAL_SET_SYSTEM_ATTENTION_LED);
186OPAL_CALL(opal_pci_next_error, OPAL_PCI_NEXT_ERROR);
187OPAL_CALL(opal_pci_poll, OPAL_PCI_POLL);
188OPAL_CALL(opal_pci_msi_eoi, OPAL_PCI_MSI_EOI);
189OPAL_CALL(opal_pci_get_phb_diag_data2, OPAL_PCI_GET_PHB_DIAG_DATA2);
190OPAL_CALL(opal_xscom_read, OPAL_XSCOM_READ);
191OPAL_CALL(opal_xscom_write, OPAL_XSCOM_WRITE);
192OPAL_CALL(opal_lpc_read, OPAL_LPC_READ);
193OPAL_CALL(opal_lpc_write, OPAL_LPC_WRITE);
194OPAL_CALL(opal_return_cpu, OPAL_RETURN_CPU);
195OPAL_CALL(opal_reinit_cpus, OPAL_REINIT_CPUS);
196OPAL_CALL(opal_read_elog, OPAL_ELOG_READ);
197OPAL_CALL(opal_send_ack_elog, OPAL_ELOG_ACK);
198OPAL_CALL(opal_get_elog_size, OPAL_ELOG_SIZE);
199OPAL_CALL(opal_resend_pending_logs, OPAL_ELOG_RESEND);
200OPAL_CALL(opal_write_elog, OPAL_ELOG_WRITE);
201OPAL_CALL(opal_validate_flash, OPAL_FLASH_VALIDATE);
202OPAL_CALL(opal_manage_flash, OPAL_FLASH_MANAGE);
203OPAL_CALL(opal_update_flash, OPAL_FLASH_UPDATE);
204OPAL_CALL(opal_resync_timebase, OPAL_RESYNC_TIMEBASE);
205OPAL_CALL(opal_check_token, OPAL_CHECK_TOKEN);
206OPAL_CALL(opal_dump_init, OPAL_DUMP_INIT);
207OPAL_CALL(opal_dump_info, OPAL_DUMP_INFO);
208OPAL_CALL(opal_dump_info2, OPAL_DUMP_INFO2);
209OPAL_CALL(opal_dump_read, OPAL_DUMP_READ);
210OPAL_CALL(opal_dump_ack, OPAL_DUMP_ACK);
211OPAL_CALL(opal_get_msg, OPAL_GET_MSG);
212OPAL_CALL(opal_write_oppanel_async, OPAL_WRITE_OPPANEL_ASYNC);
213OPAL_CALL(opal_check_completion, OPAL_CHECK_ASYNC_COMPLETION);
214OPAL_CALL(opal_dump_resend_notification, OPAL_DUMP_RESEND);
215OPAL_CALL(opal_sync_host_reboot, OPAL_SYNC_HOST_REBOOT);
216OPAL_CALL(opal_sensor_read, OPAL_SENSOR_READ);
217OPAL_CALL(opal_get_param, OPAL_GET_PARAM);
218OPAL_CALL(opal_set_param, OPAL_SET_PARAM);
219OPAL_CALL(opal_handle_hmi, OPAL_HANDLE_HMI);
220OPAL_CALL(opal_config_cpu_idle_state, OPAL_CONFIG_CPU_IDLE_STATE);
221OPAL_CALL(opal_slw_set_reg, OPAL_SLW_SET_REG);
222OPAL_CALL(opal_register_dump_region, OPAL_REGISTER_DUMP_REGION);
223OPAL_CALL(opal_unregister_dump_region, OPAL_UNREGISTER_DUMP_REGION);
224OPAL_CALL(opal_pci_set_phb_cxl_mode, OPAL_PCI_SET_PHB_CAPI_MODE);
225OPAL_CALL(opal_tpo_write, OPAL_WRITE_TPO);
226OPAL_CALL(opal_tpo_read, OPAL_READ_TPO);
227OPAL_CALL(opal_ipmi_send, OPAL_IPMI_SEND);
228OPAL_CALL(opal_ipmi_recv, OPAL_IPMI_RECV);
229OPAL_CALL(opal_i2c_request, OPAL_I2C_REQUEST);
230OPAL_CALL(opal_flash_read, OPAL_FLASH_READ);
231OPAL_CALL(opal_flash_write, OPAL_FLASH_WRITE);
232OPAL_CALL(opal_flash_erase, OPAL_FLASH_ERASE);
233OPAL_CALL(opal_prd_msg, OPAL_PRD_MSG);
234OPAL_CALL(opal_leds_get_ind, OPAL_LEDS_GET_INDICATOR);
235OPAL_CALL(opal_leds_set_ind, OPAL_LEDS_SET_INDICATOR);
236OPAL_CALL(opal_console_flush, OPAL_CONSOLE_FLUSH);
237OPAL_CALL(opal_get_device_tree, OPAL_GET_DEVICE_TREE);
238OPAL_CALL(opal_pci_get_presence_state, OPAL_PCI_GET_PRESENCE_STATE);
239OPAL_CALL(opal_pci_get_power_state, OPAL_PCI_GET_POWER_STATE);
240OPAL_CALL(opal_pci_set_power_state, OPAL_PCI_SET_POWER_STATE);
241OPAL_CALL(opal_int_get_xirr, OPAL_INT_GET_XIRR);
242OPAL_CALL(opal_int_set_cppr, OPAL_INT_SET_CPPR);
243OPAL_CALL(opal_int_eoi, OPAL_INT_EOI);
244OPAL_CALL(opal_int_set_mfrr, OPAL_INT_SET_MFRR);
245OPAL_CALL(opal_pci_tce_kill, OPAL_PCI_TCE_KILL);
246OPAL_CALL(opal_nmmu_set_ptcr, OPAL_NMMU_SET_PTCR);
247OPAL_CALL(opal_xive_reset, OPAL_XIVE_RESET);
248OPAL_CALL(opal_xive_get_irq_info, OPAL_XIVE_GET_IRQ_INFO);
249OPAL_CALL(opal_xive_get_irq_config, OPAL_XIVE_GET_IRQ_CONFIG);
250OPAL_CALL(opal_xive_set_irq_config, OPAL_XIVE_SET_IRQ_CONFIG);
251OPAL_CALL(opal_xive_get_queue_info, OPAL_XIVE_GET_QUEUE_INFO);
252OPAL_CALL(opal_xive_set_queue_info, OPAL_XIVE_SET_QUEUE_INFO);
253OPAL_CALL(opal_xive_donate_page, OPAL_XIVE_DONATE_PAGE);
254OPAL_CALL(opal_xive_alloc_vp_block, OPAL_XIVE_ALLOCATE_VP_BLOCK);
255OPAL_CALL(opal_xive_free_vp_block, OPAL_XIVE_FREE_VP_BLOCK);
256OPAL_CALL(opal_xive_allocate_irq, OPAL_XIVE_ALLOCATE_IRQ);
257OPAL_CALL(opal_xive_free_irq, OPAL_XIVE_FREE_IRQ);
258OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO);
259OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO);
260OPAL_CALL(opal_xive_sync, OPAL_XIVE_SYNC);
261OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP);
262OPAL_CALL(opal_signal_system_reset, OPAL_SIGNAL_SYSTEM_RESET);
263OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT);
264OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT);
265OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR);
266OPAL_CALL(opal_imc_counters_init, OPAL_IMC_COUNTERS_INIT);
267OPAL_CALL(opal_imc_counters_start, OPAL_IMC_COUNTERS_START);
268OPAL_CALL(opal_imc_counters_stop, OPAL_IMC_COUNTERS_STOP);
269OPAL_CALL(opal_pci_set_p2p, OPAL_PCI_SET_P2P);
270OPAL_CALL(opal_get_powercap, OPAL_GET_POWERCAP);
271OPAL_CALL(opal_set_powercap, OPAL_SET_POWERCAP);
272OPAL_CALL(opal_get_power_shift_ratio, OPAL_GET_POWER_SHIFT_RATIO);
273OPAL_CALL(opal_set_power_shift_ratio, OPAL_SET_POWER_SHIFT_RATIO);
274OPAL_CALL(opal_sensor_group_clear, OPAL_SENSOR_GROUP_CLEAR);
275OPAL_CALL(opal_quiesce, OPAL_QUIESCE);
276OPAL_CALL(opal_npu_spa_setup, OPAL_NPU_SPA_SETUP);
277OPAL_CALL(opal_npu_spa_clear_cache, OPAL_NPU_SPA_CLEAR_CACHE);
278OPAL_CALL(opal_npu_tl_set, OPAL_NPU_TL_SET);
279OPAL_CALL(opal_pci_get_pbcq_tunnel_bar, OPAL_PCI_GET_PBCQ_TUNNEL_BAR);
280OPAL_CALL(opal_pci_set_pbcq_tunnel_bar, OPAL_PCI_SET_PBCQ_TUNNEL_BAR);
281OPAL_CALL(opal_sensor_read_u64, OPAL_SENSOR_READ_U64);
282OPAL_CALL(opal_sensor_group_enable, OPAL_SENSOR_GROUP_ENABLE);
283OPAL_CALL(opal_nx_coproc_init, OPAL_NX_COPROC_INIT);
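Two patterns carry this file. The tracing hooks may themselves trigger OPAL calls (console output, for one), so a per-CPU depth counter suppresses nested events; and a static key keeps the untraced fast path down to a single patched branch, bumped only while the tracepoints have consumers. A condensed sketch of the combination, with illustrative names:

	#include <linux/jump_label.h>
	#include <linux/percpu.h>

	static DEFINE_STATIC_KEY_FALSE(example_key);
	static DEFINE_PER_CPU(unsigned int, example_depth);

	int example_regfunc(void)		/* tracepoint gains a consumer */
	{
		static_branch_inc(&example_key);
		return 0;
	}

	void example_unregfunc(void)		/* last consumer went away */
	{
		static_branch_dec(&example_key);
	}

	static void example_trace(long arg)
	{
		unsigned int *depth = this_cpu_ptr(&example_depth);

		if (*depth)			/* already tracing: bail out */
			return;
		(*depth)++;
		/* emit the trace event here */
		(*depth)--;
	}

	static long example_call(long arg)
	{
		if (static_branch_unlikely(&example_key))
			example_trace(arg);
		return arg;	/* stands in for the real firmware call */
	}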
diff --git a/arch/powerpc/platforms/powernv/opal-msglog.c b/arch/powerpc/platforms/powernv/opal-msglog.c
index acd3206dfae3..06628c71cef6 100644
--- a/arch/powerpc/platforms/powernv/opal-msglog.c
+++ b/arch/powerpc/platforms/powernv/opal-msglog.c
@@ -98,7 +98,7 @@ static ssize_t opal_msglog_read(struct file *file, struct kobject *kobj,
98} 98}
99 99
100static struct bin_attribute opal_msglog_attr = { 100static struct bin_attribute opal_msglog_attr = {
101 .attr = {.name = "msglog", .mode = 0444}, 101 .attr = {.name = "msglog", .mode = 0400},
102 .read = opal_msglog_read 102 .read = opal_msglog_read
103}; 103};
104 104
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index f4875fe3f8ff..7d2052d8af9d 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -17,317 +17,51 @@
17#include <asm/asm-compat.h> 17#include <asm/asm-compat.h>
18#include <asm/feature-fixups.h> 18#include <asm/feature-fixups.h>
19 19
20 .section ".text" 20 .section ".text"
21
22#ifdef CONFIG_TRACEPOINTS
23#ifdef CONFIG_JUMP_LABEL
24#define OPAL_BRANCH(LABEL) \
25 ARCH_STATIC_BRANCH(LABEL, opal_tracepoint_key)
26#else
27
28 .section ".toc","aw"
29
30 .globl opal_tracepoint_refcount
31opal_tracepoint_refcount:
32 .8byte 0
33
34 .section ".text"
35
36/*
37 * We branch around this in early init by using an unconditional cpu
38 * feature.
39 */
40#define OPAL_BRANCH(LABEL) \
41BEGIN_FTR_SECTION; \
42 b 1f; \
43END_FTR_SECTION(0, 1); \
44 ld r11,opal_tracepoint_refcount@toc(r2); \
45 cmpdi r11,0; \
46 bne- LABEL; \
471:
48
49#endif
50
51#else
52#define OPAL_BRANCH(LABEL)
53#endif
54 21
55/* 22/*
56 * DO_OPAL_CALL assumes: 23 * r3-r10 - OPAL call arguments
57 * r0 = opal call token 24 * STK_PARAM(R11) - OPAL opcode
58 * r12 = msr 25 * STK_PARAM(R12) - MSR to restore
59 * LR has been saved
60 */ 26 */
61#define DO_OPAL_CALL() \ 27_GLOBAL_TOC(__opal_call)
62 mfcr r11; \ 28 mflr r0
63 stw r11,8(r1); \ 29 std r0,PPC_LR_STKOFF(r1)
64 li r11,0; \ 30 ld r12,STK_PARAM(R12)(r1)
65 ori r11,r11,MSR_EE; \ 31 li r0,MSR_IR|MSR_DR|MSR_LE
66 std r12,PACASAVEDMSR(r13); \ 32 andc r12,r12,r0
67 andc r12,r12,r11; \ 33 LOAD_REG_ADDR(r11, opal_return)
68 mtmsrd r12,1; \ 34 mtlr r11
69 LOAD_REG_ADDR(r11,opal_return); \ 35 LOAD_REG_ADDR(r11, opal)
70 mtlr r11; \ 36 ld r2,0(r11)
71 li r11,MSR_DR|MSR_IR|MSR_LE;\ 37 ld r11,8(r11)
72 andc r12,r12,r11; \ 38 mtspr SPRN_HSRR0,r11
73 mtspr SPRN_HSRR1,r12; \ 39 mtspr SPRN_HSRR1,r12
74 LOAD_REG_ADDR(r11,opal); \ 40 /* set token to r0 */
75 ld r12,8(r11); \ 41 ld r0,STK_PARAM(R11)(r1)
76 ld r2,0(r11); \
77 mtspr SPRN_HSRR0,r12; \
78 hrfid 42 hrfid
79
80#define OPAL_CALL(name, token) \
81 _GLOBAL_TOC(name); \
82 mfmsr r12; \
83 mflr r0; \
84 andi. r11,r12,MSR_IR|MSR_DR; \
85 std r0,PPC_LR_STKOFF(r1); \
86 li r0,token; \
87 beq opal_real_call; \
88 OPAL_BRANCH(opal_tracepoint_entry) \
89 DO_OPAL_CALL()
90
91
92opal_return: 43opal_return:
93 /* 44 /*
94 * Fixup endian on OPAL return... we should be able to simplify 45 * Restore MSR on OPAL return. The MSR is set to big-endian.
95 * this by instead converting the below trampoline to a set of
96 * bytes (always BE) since MSR:LE will end up fixed up as a side
97 * effect of the rfid.
98 */ 46 */
99 FIXUP_ENDIAN_HV 47#ifdef __BIG_ENDIAN__
100 ld r2,PACATOC(r13); 48 ld r11,STK_PARAM(R12)(r1)
101 lwz r4,8(r1); 49 mtmsrd r11
102 ld r5,PPC_LR_STKOFF(r1); 50#else
103 ld r6,PACASAVEDMSR(r13); 51 /* Endian can only be switched with rfi, must byte reverse MSR load */
104 mtcr r4; 52 .short 0x4039 /* li r10,STK_PARAM(R12) */
105 mtspr SPRN_HSRR0,r5; 53 .byte (STK_PARAM(R12) >> 8) & 0xff
106 mtspr SPRN_HSRR1,r6; 54 .byte STK_PARAM(R12) & 0xff
107 hrfid 55
108 56 .long 0x280c6a7d /* ldbrx r11,r10,r1 */
109opal_real_call: 57 .long 0x05009f42 /* bcl 20,31,$+4 */
110 mfcr r11 58 .long 0xa602487d /* mflr r10 */
111 stw r11,8(r1) 59 .long 0x14004a39 /* addi r10,r10,20 */
112 /* Set opal return address */ 60 .long 0xa64b5a7d /* mthsrr0 r10 */
113 LOAD_REG_ADDR(r11, opal_return_realmode) 61 .long 0xa64b7b7d /* mthsrr1 r11 */
114 mtlr r11 62 .long 0x2402004c /* hrfid */
115 li r11,MSR_LE 63#endif
116 andc r12,r12,r11 64 ld r2,PACATOC(r13)
117 mtspr SPRN_HSRR1,r12 65 ld r0,PPC_LR_STKOFF(r1)
118 LOAD_REG_ADDR(r11,opal)
119 ld r12,8(r11)
120 ld r2,0(r11)
121 mtspr SPRN_HSRR0,r12
122 hrfid
123
124opal_return_realmode:
125 FIXUP_ENDIAN_HV
126 ld r2,PACATOC(r13);
127 lwz r11,8(r1);
128 ld r12,PPC_LR_STKOFF(r1)
129 mtcr r11;
130 mtlr r12
131 blr
132
133#ifdef CONFIG_TRACEPOINTS
134opal_tracepoint_entry:
135 stdu r1,-STACKFRAMESIZE(r1)
136 std r0,STK_REG(R23)(r1)
137 std r3,STK_REG(R24)(r1)
138 std r4,STK_REG(R25)(r1)
139 std r5,STK_REG(R26)(r1)
140 std r6,STK_REG(R27)(r1)
141 std r7,STK_REG(R28)(r1)
142 std r8,STK_REG(R29)(r1)
143 std r9,STK_REG(R30)(r1)
144 std r10,STK_REG(R31)(r1)
145 mr r3,r0
146 addi r4,r1,STK_REG(R24)
147 bl __trace_opal_entry
148 ld r0,STK_REG(R23)(r1)
149 ld r3,STK_REG(R24)(r1)
150 ld r4,STK_REG(R25)(r1)
151 ld r5,STK_REG(R26)(r1)
152 ld r6,STK_REG(R27)(r1)
153 ld r7,STK_REG(R28)(r1)
154 ld r8,STK_REG(R29)(r1)
155 ld r9,STK_REG(R30)(r1)
156 ld r10,STK_REG(R31)(r1)
157
158 /* setup LR so we return via tracepoint_return */
159 LOAD_REG_ADDR(r11,opal_tracepoint_return)
160 std r11,16(r1)
161
162 mfmsr r12
163 DO_OPAL_CALL()
164
165opal_tracepoint_return:
166 std r3,STK_REG(R31)(r1)
167 mr r4,r3
168 ld r3,STK_REG(R23)(r1)
169 bl __trace_opal_exit
170 ld r3,STK_REG(R31)(r1)
171 addi r1,r1,STACKFRAMESIZE
172 ld r0,16(r1)
173 mtlr r0 66 mtlr r0
174 blr 67 blr
175#endif
176
177
178OPAL_CALL(opal_invalid_call, OPAL_INVALID_CALL);
179OPAL_CALL(opal_console_write, OPAL_CONSOLE_WRITE);
180OPAL_CALL(opal_console_read, OPAL_CONSOLE_READ);
181OPAL_CALL(opal_console_write_buffer_space, OPAL_CONSOLE_WRITE_BUFFER_SPACE);
182OPAL_CALL(opal_rtc_read, OPAL_RTC_READ);
183OPAL_CALL(opal_rtc_write, OPAL_RTC_WRITE);
184OPAL_CALL(opal_cec_power_down, OPAL_CEC_POWER_DOWN);
185OPAL_CALL(opal_cec_reboot, OPAL_CEC_REBOOT);
186OPAL_CALL(opal_cec_reboot2, OPAL_CEC_REBOOT2);
187OPAL_CALL(opal_read_nvram, OPAL_READ_NVRAM);
188OPAL_CALL(opal_write_nvram, OPAL_WRITE_NVRAM);
189OPAL_CALL(opal_handle_interrupt, OPAL_HANDLE_INTERRUPT);
190OPAL_CALL(opal_poll_events, OPAL_POLL_EVENTS);
191OPAL_CALL(opal_pci_set_hub_tce_memory, OPAL_PCI_SET_HUB_TCE_MEMORY);
192OPAL_CALL(opal_pci_set_phb_tce_memory, OPAL_PCI_SET_PHB_TCE_MEMORY);
193OPAL_CALL(opal_pci_config_read_byte, OPAL_PCI_CONFIG_READ_BYTE);
194OPAL_CALL(opal_pci_config_read_half_word, OPAL_PCI_CONFIG_READ_HALF_WORD);
195OPAL_CALL(opal_pci_config_read_word, OPAL_PCI_CONFIG_READ_WORD);
196OPAL_CALL(opal_pci_config_write_byte, OPAL_PCI_CONFIG_WRITE_BYTE);
197OPAL_CALL(opal_pci_config_write_half_word, OPAL_PCI_CONFIG_WRITE_HALF_WORD);
198OPAL_CALL(opal_pci_config_write_word, OPAL_PCI_CONFIG_WRITE_WORD);
199OPAL_CALL(opal_set_xive, OPAL_SET_XIVE);
200OPAL_CALL(opal_get_xive, OPAL_GET_XIVE);
201OPAL_CALL(opal_register_exception_handler, OPAL_REGISTER_OPAL_EXCEPTION_HANDLER);
202OPAL_CALL(opal_pci_eeh_freeze_status, OPAL_PCI_EEH_FREEZE_STATUS);
203OPAL_CALL(opal_pci_eeh_freeze_clear, OPAL_PCI_EEH_FREEZE_CLEAR);
204OPAL_CALL(opal_pci_eeh_freeze_set, OPAL_PCI_EEH_FREEZE_SET);
205OPAL_CALL(opal_pci_err_inject, OPAL_PCI_ERR_INJECT);
206OPAL_CALL(opal_pci_shpc, OPAL_PCI_SHPC);
207OPAL_CALL(opal_pci_phb_mmio_enable, OPAL_PCI_PHB_MMIO_ENABLE);
208OPAL_CALL(opal_pci_set_phb_mem_window, OPAL_PCI_SET_PHB_MEM_WINDOW);
209OPAL_CALL(opal_pci_map_pe_mmio_window, OPAL_PCI_MAP_PE_MMIO_WINDOW);
210OPAL_CALL(opal_pci_set_phb_table_memory, OPAL_PCI_SET_PHB_TABLE_MEMORY);
211OPAL_CALL(opal_pci_set_pe, OPAL_PCI_SET_PE);
212OPAL_CALL(opal_pci_set_peltv, OPAL_PCI_SET_PELTV);
213OPAL_CALL(opal_pci_set_mve, OPAL_PCI_SET_MVE);
214OPAL_CALL(opal_pci_set_mve_enable, OPAL_PCI_SET_MVE_ENABLE);
215OPAL_CALL(opal_pci_get_xive_reissue, OPAL_PCI_GET_XIVE_REISSUE);
216OPAL_CALL(opal_pci_set_xive_reissue, OPAL_PCI_SET_XIVE_REISSUE);
217OPAL_CALL(opal_pci_set_xive_pe, OPAL_PCI_SET_XIVE_PE);
218OPAL_CALL(opal_get_xive_source, OPAL_GET_XIVE_SOURCE);
219OPAL_CALL(opal_get_msi_32, OPAL_GET_MSI_32);
220OPAL_CALL(opal_get_msi_64, OPAL_GET_MSI_64);
221OPAL_CALL(opal_start_cpu, OPAL_START_CPU);
222OPAL_CALL(opal_query_cpu_status, OPAL_QUERY_CPU_STATUS);
223OPAL_CALL(opal_write_oppanel, OPAL_WRITE_OPPANEL);
224OPAL_CALL(opal_pci_map_pe_dma_window, OPAL_PCI_MAP_PE_DMA_WINDOW);
225OPAL_CALL(opal_pci_map_pe_dma_window_real, OPAL_PCI_MAP_PE_DMA_WINDOW_REAL);
226OPAL_CALL(opal_pci_reset, OPAL_PCI_RESET);
227OPAL_CALL(opal_pci_get_hub_diag_data, OPAL_PCI_GET_HUB_DIAG_DATA);
228OPAL_CALL(opal_pci_get_phb_diag_data, OPAL_PCI_GET_PHB_DIAG_DATA);
229OPAL_CALL(opal_pci_fence_phb, OPAL_PCI_FENCE_PHB);
230OPAL_CALL(opal_pci_reinit, OPAL_PCI_REINIT);
231OPAL_CALL(opal_pci_mask_pe_error, OPAL_PCI_MASK_PE_ERROR);
232OPAL_CALL(opal_set_slot_led_status, OPAL_SET_SLOT_LED_STATUS);
233OPAL_CALL(opal_get_epow_status, OPAL_GET_EPOW_STATUS);
234OPAL_CALL(opal_get_dpo_status, OPAL_GET_DPO_STATUS);
235OPAL_CALL(opal_set_system_attention_led, OPAL_SET_SYSTEM_ATTENTION_LED);
236OPAL_CALL(opal_pci_next_error, OPAL_PCI_NEXT_ERROR);
237OPAL_CALL(opal_pci_poll, OPAL_PCI_POLL);
238OPAL_CALL(opal_pci_msi_eoi, OPAL_PCI_MSI_EOI);
239OPAL_CALL(opal_pci_get_phb_diag_data2, OPAL_PCI_GET_PHB_DIAG_DATA2);
240OPAL_CALL(opal_xscom_read, OPAL_XSCOM_READ);
241OPAL_CALL(opal_xscom_write, OPAL_XSCOM_WRITE);
242OPAL_CALL(opal_lpc_read, OPAL_LPC_READ);
243OPAL_CALL(opal_lpc_write, OPAL_LPC_WRITE);
244OPAL_CALL(opal_return_cpu, OPAL_RETURN_CPU);
245OPAL_CALL(opal_reinit_cpus, OPAL_REINIT_CPUS);
246OPAL_CALL(opal_read_elog, OPAL_ELOG_READ);
247OPAL_CALL(opal_send_ack_elog, OPAL_ELOG_ACK);
248OPAL_CALL(opal_get_elog_size, OPAL_ELOG_SIZE);
249OPAL_CALL(opal_resend_pending_logs, OPAL_ELOG_RESEND);
250OPAL_CALL(opal_write_elog, OPAL_ELOG_WRITE);
251OPAL_CALL(opal_validate_flash, OPAL_FLASH_VALIDATE);
252OPAL_CALL(opal_manage_flash, OPAL_FLASH_MANAGE);
253OPAL_CALL(opal_update_flash, OPAL_FLASH_UPDATE);
254OPAL_CALL(opal_resync_timebase, OPAL_RESYNC_TIMEBASE);
255OPAL_CALL(opal_check_token, OPAL_CHECK_TOKEN);
256OPAL_CALL(opal_dump_init, OPAL_DUMP_INIT);
257OPAL_CALL(opal_dump_info, OPAL_DUMP_INFO);
258OPAL_CALL(opal_dump_info2, OPAL_DUMP_INFO2);
259OPAL_CALL(opal_dump_read, OPAL_DUMP_READ);
260OPAL_CALL(opal_dump_ack, OPAL_DUMP_ACK);
261OPAL_CALL(opal_get_msg, OPAL_GET_MSG);
262OPAL_CALL(opal_write_oppanel_async, OPAL_WRITE_OPPANEL_ASYNC);
263OPAL_CALL(opal_check_completion, OPAL_CHECK_ASYNC_COMPLETION);
264OPAL_CALL(opal_dump_resend_notification, OPAL_DUMP_RESEND);
265OPAL_CALL(opal_sync_host_reboot, OPAL_SYNC_HOST_REBOOT);
266OPAL_CALL(opal_sensor_read, OPAL_SENSOR_READ);
267OPAL_CALL(opal_get_param, OPAL_GET_PARAM);
268OPAL_CALL(opal_set_param, OPAL_SET_PARAM);
269OPAL_CALL(opal_handle_hmi, OPAL_HANDLE_HMI);
270OPAL_CALL(opal_config_cpu_idle_state, OPAL_CONFIG_CPU_IDLE_STATE);
271OPAL_CALL(opal_slw_set_reg, OPAL_SLW_SET_REG);
272OPAL_CALL(opal_register_dump_region, OPAL_REGISTER_DUMP_REGION);
273OPAL_CALL(opal_unregister_dump_region, OPAL_UNREGISTER_DUMP_REGION);
274OPAL_CALL(opal_pci_set_phb_cxl_mode, OPAL_PCI_SET_PHB_CAPI_MODE);
275OPAL_CALL(opal_tpo_write, OPAL_WRITE_TPO);
276OPAL_CALL(opal_tpo_read, OPAL_READ_TPO);
277OPAL_CALL(opal_ipmi_send, OPAL_IPMI_SEND);
278OPAL_CALL(opal_ipmi_recv, OPAL_IPMI_RECV);
279OPAL_CALL(opal_i2c_request, OPAL_I2C_REQUEST);
280OPAL_CALL(opal_flash_read, OPAL_FLASH_READ);
281OPAL_CALL(opal_flash_write, OPAL_FLASH_WRITE);
282OPAL_CALL(opal_flash_erase, OPAL_FLASH_ERASE);
283OPAL_CALL(opal_prd_msg, OPAL_PRD_MSG);
284OPAL_CALL(opal_leds_get_ind, OPAL_LEDS_GET_INDICATOR);
285OPAL_CALL(opal_leds_set_ind, OPAL_LEDS_SET_INDICATOR);
286OPAL_CALL(opal_console_flush, OPAL_CONSOLE_FLUSH);
287OPAL_CALL(opal_get_device_tree, OPAL_GET_DEVICE_TREE);
288OPAL_CALL(opal_pci_get_presence_state, OPAL_PCI_GET_PRESENCE_STATE);
289OPAL_CALL(opal_pci_get_power_state, OPAL_PCI_GET_POWER_STATE);
290OPAL_CALL(opal_pci_set_power_state, OPAL_PCI_SET_POWER_STATE);
291OPAL_CALL(opal_int_get_xirr, OPAL_INT_GET_XIRR);
292OPAL_CALL(opal_int_set_cppr, OPAL_INT_SET_CPPR);
293OPAL_CALL(opal_int_eoi, OPAL_INT_EOI);
294OPAL_CALL(opal_int_set_mfrr, OPAL_INT_SET_MFRR);
295OPAL_CALL(opal_pci_tce_kill, OPAL_PCI_TCE_KILL);
296OPAL_CALL(opal_nmmu_set_ptcr, OPAL_NMMU_SET_PTCR);
297OPAL_CALL(opal_xive_reset, OPAL_XIVE_RESET);
298OPAL_CALL(opal_xive_get_irq_info, OPAL_XIVE_GET_IRQ_INFO);
299OPAL_CALL(opal_xive_get_irq_config, OPAL_XIVE_GET_IRQ_CONFIG);
300OPAL_CALL(opal_xive_set_irq_config, OPAL_XIVE_SET_IRQ_CONFIG);
301OPAL_CALL(opal_xive_get_queue_info, OPAL_XIVE_GET_QUEUE_INFO);
302OPAL_CALL(opal_xive_set_queue_info, OPAL_XIVE_SET_QUEUE_INFO);
303OPAL_CALL(opal_xive_donate_page, OPAL_XIVE_DONATE_PAGE);
304OPAL_CALL(opal_xive_alloc_vp_block, OPAL_XIVE_ALLOCATE_VP_BLOCK);
305OPAL_CALL(opal_xive_free_vp_block, OPAL_XIVE_FREE_VP_BLOCK);
306OPAL_CALL(opal_xive_allocate_irq, OPAL_XIVE_ALLOCATE_IRQ);
307OPAL_CALL(opal_xive_free_irq, OPAL_XIVE_FREE_IRQ);
308OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO);
309OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO);
310OPAL_CALL(opal_xive_sync, OPAL_XIVE_SYNC);
311OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP);
312OPAL_CALL(opal_signal_system_reset, OPAL_SIGNAL_SYSTEM_RESET);
313OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT);
314OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT);
315OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR);
316OPAL_CALL(opal_imc_counters_init, OPAL_IMC_COUNTERS_INIT);
317OPAL_CALL(opal_imc_counters_start, OPAL_IMC_COUNTERS_START);
318OPAL_CALL(opal_imc_counters_stop, OPAL_IMC_COUNTERS_STOP);
319OPAL_CALL(opal_pci_set_p2p, OPAL_PCI_SET_P2P);
320OPAL_CALL(opal_get_powercap, OPAL_GET_POWERCAP);
321OPAL_CALL(opal_set_powercap, OPAL_SET_POWERCAP);
322OPAL_CALL(opal_get_power_shift_ratio, OPAL_GET_POWER_SHIFT_RATIO);
323OPAL_CALL(opal_set_power_shift_ratio, OPAL_SET_POWER_SHIFT_RATIO);
324OPAL_CALL(opal_sensor_group_clear, OPAL_SENSOR_GROUP_CLEAR);
325OPAL_CALL(opal_quiesce, OPAL_QUIESCE);
326OPAL_CALL(opal_npu_spa_setup, OPAL_NPU_SPA_SETUP);
327OPAL_CALL(opal_npu_spa_clear_cache, OPAL_NPU_SPA_CLEAR_CACHE);
328OPAL_CALL(opal_npu_tl_set, OPAL_NPU_TL_SET);
329OPAL_CALL(opal_pci_get_pbcq_tunnel_bar, OPAL_PCI_GET_PBCQ_TUNNEL_BAR);
330OPAL_CALL(opal_pci_set_pbcq_tunnel_bar, OPAL_PCI_SET_PBCQ_TUNNEL_BAR);
331OPAL_CALL(opal_sensor_read_u64, OPAL_SENSOR_READ_U64);
332OPAL_CALL(opal_sensor_group_enable, OPAL_SENSOR_GROUP_ENABLE);
333OPAL_CALL(opal_nx_coproc_init, OPAL_NX_COPROC_INIT);
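What remains in assembly is a single __opal_call trampoline; the per-call token plumbing and the tracepoint glue moved into opal-call.c above. The one remaining subtlety is in opal_return: on a little-endian kernel the endian can only be switched by an rfi-class instruction, so the stub emits big-endian instruction words as raw data. A small C illustration of the encoding, assuming mflr r10 is 0x7d4802a6 in big-endian:

	#include <linux/types.h>
	#include <linux/swab.h>

	/* swab32() turns a BE-encoded opcode into the value that must be
	 * emitted as a .long inside a little-endian image. */
	static u32 be_insn_as_le_long(u32 be_insn)
	{
		return swab32(be_insn);
	}

	/* be_insn_as_le_long(0x7d4802a6) == 0xa602487d, matching the
	 * ".long 0xa602487d" (mflr r10) line in opal_return above. */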
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 8e157f9f1ff2..727a7de08635 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -26,7 +26,6 @@
26#include <linux/memblock.h> 26#include <linux/memblock.h>
27#include <linux/kthread.h> 27#include <linux/kthread.h>
28#include <linux/freezer.h> 28#include <linux/freezer.h>
29#include <linux/printk.h>
30#include <linux/kmsg_dump.h> 29#include <linux/kmsg_dump.h>
31#include <linux/console.h> 30#include <linux/console.h>
32#include <linux/sched/debug.h> 31#include <linux/sched/debug.h>
@@ -586,7 +585,7 @@ int opal_machine_check(struct pt_regs *regs)
586 evt.version); 585 evt.version);
587 return 0; 586 return 0;
588 } 587 }
589 machine_check_print_event_info(&evt, user_mode(regs)); 588 machine_check_print_event_info(&evt, user_mode(regs), false);
590 589
591 if (opal_recover_mce(regs, &evt)) 590 if (opal_recover_mce(regs, &evt))
592 return 1; 591 return 1;
diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
index 697449afb3f7..e28f03e1eb5e 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
@@ -313,7 +313,6 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
313 page_shift); 313 page_shift);
314 tbl->it_level_size = 1ULL << (level_shift - 3); 314 tbl->it_level_size = 1ULL << (level_shift - 3);
315 tbl->it_indirect_levels = levels - 1; 315 tbl->it_indirect_levels = levels - 1;
316 tbl->it_allocated_size = total_allocated;
317 tbl->it_userspace = uas; 316 tbl->it_userspace = uas;
318 tbl->it_nid = nid; 317 tbl->it_nid = nid;
319 318
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 145373f0e5dc..fa6af52b5219 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1748,7 +1748,7 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev
1748 1748
1749 pe = &phb->ioda.pe_array[pdn->pe_number]; 1749 pe = &phb->ioda.pe_array[pdn->pe_number];
1750 WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops); 1750 WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
1751 set_dma_offset(&pdev->dev, pe->tce_bypass_base); 1751 pdev->dev.archdata.dma_offset = pe->tce_bypass_base;
1752 set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]); 1752 set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]);
1753 /* 1753 /*
1754 * Note: iommu_add_device() will fail here as 1754 * Note: iommu_add_device() will fail here as
@@ -1758,31 +1758,6 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev
1758 */ 1758 */
1759} 1759}
1760 1760
1761static bool pnv_pci_ioda_pe_single_vendor(struct pnv_ioda_pe *pe)
1762{
1763 unsigned short vendor = 0;
1764 struct pci_dev *pdev;
1765
1766 if (pe->device_count == 1)
1767 return true;
1768
1769 /* pe->pdev should be set if it's a single device, pe->pbus if not */
1770 if (!pe->pbus)
1771 return true;
1772
1773 list_for_each_entry(pdev, &pe->pbus->devices, bus_list) {
1774 if (!vendor) {
1775 vendor = pdev->vendor;
1776 continue;
1777 }
1778
1779 if (pdev->vendor != vendor)
1780 return false;
1781 }
1782
1783 return true;
1784}
1785
1786/* 1761/*
1787 * Reconfigure TVE#0 to be usable as 64-bit DMA space. 1762 * Reconfigure TVE#0 to be usable as 64-bit DMA space.
1788 * 1763 *
@@ -1852,88 +1827,45 @@ err:
1852 return -EIO; 1827 return -EIO;
1853} 1828}
1854 1829
1855static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) 1830static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev,
1831 u64 dma_mask)
1856{ 1832{
1857 struct pci_controller *hose = pci_bus_to_host(pdev->bus); 1833 struct pci_controller *hose = pci_bus_to_host(pdev->bus);
1858 struct pnv_phb *phb = hose->private_data; 1834 struct pnv_phb *phb = hose->private_data;
1859 struct pci_dn *pdn = pci_get_pdn(pdev); 1835 struct pci_dn *pdn = pci_get_pdn(pdev);
1860 struct pnv_ioda_pe *pe; 1836 struct pnv_ioda_pe *pe;
1861 uint64_t top;
1862 bool bypass = false;
1863 s64 rc;
1864 1837
1865 if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) 1838 if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
1866 return -ENODEV; 1839 return -ENODEV;
1867 1840
1868 pe = &phb->ioda.pe_array[pdn->pe_number]; 1841 pe = &phb->ioda.pe_array[pdn->pe_number];
1869 if (pe->tce_bypass_enabled) { 1842 if (pe->tce_bypass_enabled) {
1870 top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1; 1843 u64 top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1;
1871 bypass = (dma_mask >= top); 1844 if (dma_mask >= top)
1845 return true;
1872 } 1846 }
1873 1847
1874 if (bypass) { 1848 /*
1875 dev_info(&pdev->dev, "Using 64-bit DMA iommu bypass\n"); 1849 * If the device can't set the TCE bypass bit but still wants
1876 set_dma_ops(&pdev->dev, &dma_nommu_ops); 1850 * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to
1877 } else { 1851 * bypass the 32-bit region and be usable for 64-bit DMAs.
1878 /* 1852 * The device needs to be able to address all of this space.
1879 * If the device can't set the TCE bypass bit but still wants 1853 */
1880 * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to 1854 if (dma_mask >> 32 &&
1881 * bypass the 32-bit region and be usable for 64-bit DMAs. 1855 dma_mask > (memory_hotplug_max() + (1ULL << 32)) &&
1882 * The device needs to be able to address all of this space. 1856 /* pe->pdev should be set if it's a single device, pe->pbus if not */
1883 */ 1857 (pe->device_count == 1 || !pe->pbus) &&
1884 if (dma_mask >> 32 && 1858 phb->model == PNV_PHB_MODEL_PHB3) {
1885 dma_mask > (memory_hotplug_max() + (1ULL << 32)) && 1859 /* Configure the bypass mode */
1886 pnv_pci_ioda_pe_single_vendor(pe) && 1860 s64 rc = pnv_pci_ioda_dma_64bit_bypass(pe);
1887 phb->model == PNV_PHB_MODEL_PHB3) { 1861 if (rc)
1888 /* Configure the bypass mode */ 1862 return rc;
1889 rc = pnv_pci_ioda_dma_64bit_bypass(pe); 1863 /* 4GB offset bypasses 32-bit space */
1890 if (rc) 1864 pdev->dev.archdata.dma_offset = (1ULL << 32);
1891 return rc; 1865 return true;
1892 /* 4GB offset bypasses 32-bit space */
1893 set_dma_offset(&pdev->dev, (1ULL << 32));
1894 set_dma_ops(&pdev->dev, &dma_nommu_ops);
1895 } else if (dma_mask >> 32 && dma_mask != DMA_BIT_MASK(64)) {
1896 /*
1897 * Fail the request if a DMA mask between 32 and 64 bits
1898 * was requested but couldn't be fulfilled. Ideally we
1899 * would do this for 64-bits but historically we have
1900 * always fallen back to 32-bits.
1901 */
1902 return -ENOMEM;
1903 } else {
1904 dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n");
1905 set_dma_ops(&pdev->dev, &dma_iommu_ops);
1906 }
1907 } 1866 }
1908 *pdev->dev.dma_mask = dma_mask;
1909 1867
1910 /* Update peer npu devices */ 1868 return false;
1911 pnv_npu_try_dma_set_bypass(pdev, bypass);
1912
1913 return 0;
1914}
1915
1916static u64 pnv_pci_ioda_dma_get_required_mask(struct pci_dev *pdev)
1917{
1918 struct pci_controller *hose = pci_bus_to_host(pdev->bus);
1919 struct pnv_phb *phb = hose->private_data;
1920 struct pci_dn *pdn = pci_get_pdn(pdev);
1921 struct pnv_ioda_pe *pe;
1922 u64 end, mask;
1923
1924 if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
1925 return 0;
1926
1927 pe = &phb->ioda.pe_array[pdn->pe_number];
1928 if (!pe->tce_bypass_enabled)
1929 return __dma_get_required_mask(&pdev->dev);
1930
1931
1932 end = pe->tce_bypass_base + memblock_end_of_DRAM();
1933 mask = 1ULL << (fls64(end) - 1);
1934 mask += mask - 1;
1935
1936 return mask;
1937} 1869}
1938 1870
1939static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus) 1871static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
@@ -1942,7 +1874,7 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
1942 1874
1943 list_for_each_entry(dev, &bus->devices, bus_list) { 1875 list_for_each_entry(dev, &bus->devices, bus_list) {
1944 set_iommu_table_base(&dev->dev, pe->table_group.tables[0]); 1876 set_iommu_table_base(&dev->dev, pe->table_group.tables[0]);
1945 set_dma_offset(&dev->dev, pe->tce_bypass_base); 1877 dev->dev.archdata.dma_offset = pe->tce_bypass_base;
1946 1878
1947 if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) 1879 if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
1948 pnv_ioda_setup_bus_dma(pe, dev->subordinate); 1880 pnv_ioda_setup_bus_dma(pe, dev->subordinate);
@@ -2594,8 +2526,13 @@ static long pnv_pci_ioda2_create_table_userspace(
2594 int num, __u32 page_shift, __u64 window_size, __u32 levels, 2526 int num, __u32 page_shift, __u64 window_size, __u32 levels,
2595 struct iommu_table **ptbl) 2527 struct iommu_table **ptbl)
2596{ 2528{
2597 return pnv_pci_ioda2_create_table(table_group, 2529 long ret = pnv_pci_ioda2_create_table(table_group,
2598 num, page_shift, window_size, levels, true, ptbl); 2530 num, page_shift, window_size, levels, true, ptbl);
2531
2532 if (!ret)
2533 (*ptbl)->it_allocated_size = pnv_pci_ioda2_get_table_size(
2534 page_shift, window_size, levels);
2535 return ret;
2599} 2536}
2600 2537
2601static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group) 2538static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
@@ -3661,6 +3598,7 @@ static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
3661static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { 3598static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
3662 .dma_dev_setup = pnv_pci_dma_dev_setup, 3599 .dma_dev_setup = pnv_pci_dma_dev_setup,
3663 .dma_bus_setup = pnv_pci_dma_bus_setup, 3600 .dma_bus_setup = pnv_pci_dma_bus_setup,
3601 .iommu_bypass_supported = pnv_pci_ioda_iommu_bypass_supported,
3664 .setup_msi_irqs = pnv_setup_msi_irqs, 3602 .setup_msi_irqs = pnv_setup_msi_irqs,
3665 .teardown_msi_irqs = pnv_teardown_msi_irqs, 3603 .teardown_msi_irqs = pnv_teardown_msi_irqs,
3666 .enable_device_hook = pnv_pci_enable_device_hook, 3604 .enable_device_hook = pnv_pci_enable_device_hook,
@@ -3668,19 +3606,9 @@ static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
3668 .window_alignment = pnv_pci_window_alignment, 3606 .window_alignment = pnv_pci_window_alignment,
3669 .setup_bridge = pnv_pci_setup_bridge, 3607 .setup_bridge = pnv_pci_setup_bridge,
3670 .reset_secondary_bus = pnv_pci_reset_secondary_bus, 3608 .reset_secondary_bus = pnv_pci_reset_secondary_bus,
3671 .dma_set_mask = pnv_pci_ioda_dma_set_mask,
3672 .dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask,
3673 .shutdown = pnv_pci_ioda_shutdown, 3609 .shutdown = pnv_pci_ioda_shutdown,
3674}; 3610};
3675 3611
3676static int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask)
3677{
3678 dev_err_once(&npdev->dev,
3679 "%s operation unsupported for NVLink devices\n",
3680 __func__);
3681 return -EPERM;
3682}
3683
3684static const struct pci_controller_ops pnv_npu_ioda_controller_ops = { 3612static const struct pci_controller_ops pnv_npu_ioda_controller_ops = {
3685 .dma_dev_setup = pnv_pci_dma_dev_setup, 3613 .dma_dev_setup = pnv_pci_dma_dev_setup,
3686 .setup_msi_irqs = pnv_setup_msi_irqs, 3614 .setup_msi_irqs = pnv_setup_msi_irqs,
@@ -3688,7 +3616,6 @@ static const struct pci_controller_ops pnv_npu_ioda_controller_ops = {
3688 .enable_device_hook = pnv_pci_enable_device_hook, 3616 .enable_device_hook = pnv_pci_enable_device_hook,
3689 .window_alignment = pnv_pci_window_alignment, 3617 .window_alignment = pnv_pci_window_alignment,
3690 .reset_secondary_bus = pnv_pci_reset_secondary_bus, 3618 .reset_secondary_bus = pnv_pci_reset_secondary_bus,
3691 .dma_set_mask = pnv_npu_dma_set_mask,
3692 .shutdown = pnv_pci_ioda_shutdown, 3619 .shutdown = pnv_pci_ioda_shutdown,
3693 .disable_device = pnv_npu_disable_device, 3620 .disable_device = pnv_npu_disable_device,
3694}; 3621};
@@ -3946,9 +3873,12 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
3946 * shutdown PCI devices correctly. We already got IODA table 3873 * shutdown PCI devices correctly. We already got IODA table
3947 * cleaned out. So we have to issue PHB reset to stop all PCI 3874 * cleaned out. So we have to issue PHB reset to stop all PCI
3948 * transactions from previous kernel. The ppc_pci_reset_phbs 3875 * transactions from previous kernel. The ppc_pci_reset_phbs
3949 * kernel parameter will force this reset too. 3876 * kernel parameter will force this reset too. Additionally,
3877 * if the IODA reset above failed then use a bigger hammer.
3878 * This can happen if we get a PHB fatal error in very early
3879 * boot.
3950 */ 3880 */
3951 if (is_kdump_kernel() || pci_reset_phbs) { 3881 if (is_kdump_kernel() || pci_reset_phbs || rc) {
3952 pr_info(" Issue PHB reset ...\n"); 3882 pr_info(" Issue PHB reset ...\n");
3953 pnv_eeh_phb_reset(hose, EEH_RESET_FUNDAMENTAL); 3883 pnv_eeh_phb_reset(hose, EEH_RESET_FUNDAMENTAL);
3954 pnv_eeh_phb_reset(hose, EEH_RESET_DEACTIVATE); 3884 pnv_eeh_phb_reset(hose, EEH_RESET_DEACTIVATE);
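The dma_set_mask()/dma_get_required_mask() controller hooks collapse into one question: may this device, at this mask, bypass the IOMMU? Generic powerpc DMA code consults the new hook and flips the device between dma-direct and dma_iommu_ops accordingly. A hedged sketch of the consulting side (the hook name follows the pci_controller_ops above; the surrounding generic code is approximated):

	static bool can_bypass_iommu(struct pci_dev *pdev, u64 dma_mask)
	{
		struct pci_controller *hose = pci_bus_to_host(pdev->bus);

		return hose->controller_ops.iommu_bypass_supported &&
		       hose->controller_ops.iommu_bypass_supported(pdev, dma_mask);
	}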
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 0d354e19ef92..db09c7022635 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -39,6 +39,7 @@
39#include <asm/cpuidle.h> 39#include <asm/cpuidle.h>
40#include <asm/kexec.h> 40#include <asm/kexec.h>
41#include <asm/reg.h> 41#include <asm/reg.h>
42#include <asm/powernv.h>
42 43
43#include "powernv.h" 44#include "powernv.h"
44 45
@@ -153,6 +154,7 @@ static void pnv_smp_cpu_kill_self(void)
153{ 154{
154 unsigned int cpu; 155 unsigned int cpu;
155 unsigned long srr1, wmask; 156 unsigned long srr1, wmask;
157 u64 lpcr_val;
156 158
157 /* Standard hot unplug procedure */ 159 /* Standard hot unplug procedure */
158 /* 160 /*
@@ -174,6 +176,19 @@ static void pnv_smp_cpu_kill_self(void)
174 if (cpu_has_feature(CPU_FTR_ARCH_207S)) 176 if (cpu_has_feature(CPU_FTR_ARCH_207S))
175 wmask = SRR1_WAKEMASK_P8; 177 wmask = SRR1_WAKEMASK_P8;
176 178
179 /*
180 * We don't want to take decrementer interrupts while we are
181 * offline, so clear LPCR:PECE1. We keep PECE2 (and
182 * LPCR_PECE_HVEE on P9) enabled so as to let IPIs in.
183 *
184 * If the CPU gets woken up by a special wakeup, ensure that
185 * the SLW engine sets LPCR with decrementer bit cleared, else
186 * the CPU will come back to the kernel due to a spurious
187 * wakeup.
188 */
189 lpcr_val = mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1;
190 pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val);
191
177 while (!generic_check_cpu_restart(cpu)) { 192 while (!generic_check_cpu_restart(cpu)) {
178 /* 193 /*
179 * Clear IPI flag, since we don't handle IPIs while 194 * Clear IPI flag, since we don't handle IPIs while
@@ -246,6 +261,16 @@ static void pnv_smp_cpu_kill_self(void)
246 261
247 } 262 }
248 263
264 /*
265 * Re-enable decrementer interrupts in LPCR.
266 *
267 * Further, we want stop states to be woken up by decrementer
268 * for non-hotplug cases. So program the LPCR via stop api as
269 * well.
270 */
271 lpcr_val = mfspr(SPRN_LPCR) | (u64)LPCR_PECE1;
272 pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val);
273
249 DBG("CPU%d coming online...\n", cpu); 274 DBG("CPU%d coming online...\n", cpu);
250} 275}
251 276
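Both LPCR updates funnel through pnv_program_cpu_hotplug_lpcr(), newly non-static in idle.c (see that hunk above). Based on what the idle.c context shows, a hedged sketch of what the helper is expected to do: write the SPR directly and mirror the value through the stop API so the SLW engine restores it across deep stop states:

	void program_hotplug_lpcr_sketch(unsigned int cpu, u64 lpcr_val)
	{
		u64 pir = get_hard_smp_processor_id(cpu);

		mtspr(SPRN_LPCR, lpcr_val);
		opal_slw_set_reg(pir, SPRN_LPCR, lpcr_val); /* survives deep stop */
	}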
diff --git a/arch/powerpc/platforms/ps3/device-init.c b/arch/powerpc/platforms/ps3/device-init.c
index e7075aaff1bb..59587b75493d 100644
--- a/arch/powerpc/platforms/ps3/device-init.c
+++ b/arch/powerpc/platforms/ps3/device-init.c
@@ -354,9 +354,7 @@ static int ps3_setup_storage_dev(const struct ps3_repository_device *repo,
354 repo->dev_index, repo->dev_type, port, blk_size, num_blocks, 354 repo->dev_index, repo->dev_type, port, blk_size, num_blocks,
355 num_regions); 355 num_regions);
356 356
357 p = kzalloc(sizeof(struct ps3_storage_device) + 357 p = kzalloc(struct_size(p, regions, num_regions), GFP_KERNEL);
358 num_regions * sizeof(struct ps3_storage_region),
359 GFP_KERNEL);
360 if (!p) { 358 if (!p) {
361 result = -ENOMEM; 359 result = -ENOMEM;
362 goto fail_malloc; 360 goto fail_malloc;
diff --git a/arch/powerpc/platforms/ps3/os-area.c b/arch/powerpc/platforms/ps3/os-area.c
index f5387ad82279..4d65c5380020 100644
--- a/arch/powerpc/platforms/ps3/os-area.c
+++ b/arch/powerpc/platforms/ps3/os-area.c
@@ -205,11 +205,11 @@ static const struct os_area_db_id os_area_db_id_rtc_diff = {
205 * 3) The number of seconds from 1970 to 2000. 205 * 3) The number of seconds from 1970 to 2000.
206 */ 206 */
207 207
208struct saved_params { 208static struct saved_params {
209 unsigned int valid; 209 unsigned int valid;
210 s64 rtc_diff; 210 s64 rtc_diff;
211 unsigned int av_multi_out; 211 unsigned int av_multi_out;
212} static saved_params; 212} saved_params;
213 213
214static struct property property_rtc_diff = { 214static struct property property_rtc_diff = {
215 .name = "linux,rtc_diff", 215 .name = "linux,rtc_diff",
diff --git a/arch/powerpc/platforms/ps3/system-bus.c b/arch/powerpc/platforms/ps3/system-bus.c
index 5cc35d6b94b6..7c227e784247 100644
--- a/arch/powerpc/platforms/ps3/system-bus.c
+++ b/arch/powerpc/platforms/ps3/system-bus.c
@@ -37,12 +37,12 @@ static struct device ps3_system_bus = {
37}; 37};
38 38
39/* FIXME: need device usage counters! */ 39/* FIXME: need device usage counters! */
40struct { 40static struct {
41 struct mutex mutex; 41 struct mutex mutex;
42 int sb_11; /* usb 0 */ 42 int sb_11; /* usb 0 */
43 int sb_12; /* usb 0 */ 43 int sb_12; /* usb 0 */
44 int gpu; 44 int gpu;
45} static usage_hack; 45} usage_hack;
46 46
47static int ps3_is_device(struct ps3_system_bus_device *dev, u64 bus_id, 47static int ps3_is_device(struct ps3_system_bus_device *dev, u64 bus_id,
48 u64 dev_id) 48 u64 dev_id)
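Both ps3 hunks fix the same declaration-style nit: the storage-class specifier belongs at the head of the declaration. "struct { ... } static x;" is legal C, since declaration specifiers may appear in any order, but GCC flags it with -Wold-style-declaration and it hides the internal linkage at a glance:

	static struct {
		int counter;
	} preferred_form;

	struct {
		int counter;
	} static legacy_form;	/* legal, but warned about by GCC */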
diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index 2f8e62163602..97feb6e79f1a 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -802,6 +802,25 @@ static int dlpar_cpu_add_by_count(u32 cpus_to_add)
802 return rc; 802 return rc;
803} 803}
804 804
805int dlpar_cpu_readd(int cpu)
806{
807 struct device_node *dn;
808 struct device *dev;
809 u32 drc_index;
810 int rc;
811
812 dev = get_cpu_device(cpu);
813 dn = dev->of_node;
814
815 rc = of_property_read_u32(dn, "ibm,my-drc-index", &drc_index);
816
817 rc = dlpar_cpu_remove_by_index(drc_index);
818 if (!rc)
819 rc = dlpar_cpu_add(drc_index);
820
821 return rc;
822}
823
805int dlpar_cpu(struct pseries_hp_errorlog *hp_elog) 824int dlpar_cpu(struct pseries_hp_errorlog *hp_elog)
806{ 825{
807 u32 count, drc_index; 826 u32 count, drc_index;
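The new dlpar_cpu_readd() trusts its inputs: the result of of_property_read_u32() is overwritten before it is checked, and get_cpu_device() is not tested for NULL. A more defensive shape, sketched assuming the same helpers are in scope:

	int dlpar_cpu_readd_checked(int cpu)
	{
		struct device *dev = get_cpu_device(cpu);
		u32 drc_index;
		int rc;

		if (!dev || !dev->of_node)
			return -ENODEV;

		rc = of_property_read_u32(dev->of_node, "ibm,my-drc-index",
					  &drc_index);
		if (rc)
			return rc;

		rc = dlpar_cpu_remove_by_index(drc_index);
		if (!rc)
			rc = dlpar_cpu_add(drc_index);

		return rc;
	}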
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 8fc8fe0b9848..36eb1ddbac69 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -978,7 +978,7 @@ static phys_addr_t ddw_memory_hotplug_max(void)
978 * pdn: the parent pe node with the ibm,dma_window property 978 * pdn: the parent pe node with the ibm,dma_window property
979 * Future: also check if we can remap the base window for our base page size 979 * Future: also check if we can remap the base window for our base page size
980 * 980 *
981 * returns the dma offset for use by dma_set_mask 981 * returns the dma offset for use by the direct mapped DMA code.
982 */ 982 */
983static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) 983static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
984{ 984{
@@ -1198,87 +1198,37 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
1198 iommu_add_device(pci->table_group, &dev->dev); 1198 iommu_add_device(pci->table_group, &dev->dev);
1199} 1199}
1200 1200
1201static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask) 1201static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask)
1202{ 1202{
1203 bool ddw_enabled = false; 1203 struct device_node *dn = pci_device_to_OF_node(pdev), *pdn;
1204 struct device_node *pdn, *dn;
1205 struct pci_dev *pdev;
1206 const __be32 *dma_window = NULL; 1204 const __be32 *dma_window = NULL;
1207 u64 dma_offset;
1208
1209 if (!dev->dma_mask)
1210 return -EIO;
1211
1212 if (!dev_is_pci(dev))
1213 goto check_mask;
1214
1215 pdev = to_pci_dev(dev);
1216 1205
1217 /* only attempt to use a new window if 64-bit DMA is requested */ 1206 /* only attempt to use a new window if 64-bit DMA is requested */
1218 if (!disable_ddw && dma_mask == DMA_BIT_MASK(64)) { 1207 if (dma_mask < DMA_BIT_MASK(64))
1219 dn = pci_device_to_OF_node(pdev); 1208 return false;
1220 dev_dbg(dev, "node is %pOF\n", dn);
1221 1209
1222 /* 1210 dev_dbg(&pdev->dev, "node is %pOF\n", dn);
1223 * the device tree might contain the dma-window properties
1224 * per-device and not necessarily for the bus. So we need to
1225 * search upwards in the tree until we either hit a dma-window
1226 * property, OR find a parent with a table already allocated.
1227 */
1228 for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group;
1229 pdn = pdn->parent) {
1230 dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
1231 if (dma_window)
1232 break;
1233 }
1234 if (pdn && PCI_DN(pdn)) {
1235 dma_offset = enable_ddw(pdev, pdn);
1236 if (dma_offset != 0) {
1237 dev_info(dev, "Using 64-bit direct DMA at offset %llx\n", dma_offset);
1238 set_dma_offset(dev, dma_offset);
1239 set_dma_ops(dev, &dma_nommu_ops);
1240 ddw_enabled = true;
1241 }
1242 }
1243 }
1244 1211
1245 /* fall back on iommu ops */ 1212 /*
1246 if (!ddw_enabled && get_dma_ops(dev) != &dma_iommu_ops) { 1213 * the device tree might contain the dma-window properties
1247 dev_info(dev, "Restoring 32-bit DMA via iommu\n"); 1214 * per-device and not necessarily for the bus. So we need to
1248 set_dma_ops(dev, &dma_iommu_ops); 1215 * search upwards in the tree until we either hit a dma-window
1216 * property, OR find a parent with a table already allocated.
1217 */
1218 for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group;
1219 pdn = pdn->parent) {
1220 dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
1221 if (dma_window)
1222 break;
1249 } 1223 }
1250 1224
1251check_mask: 1225 if (pdn && PCI_DN(pdn)) {
1252 if (!dma_supported(dev, dma_mask)) 1226 pdev->dev.archdata.dma_offset = enable_ddw(pdev, pdn);
1253 return -EIO; 1227 if (pdev->dev.archdata.dma_offset)
1254 1228 return true;
1255 *dev->dma_mask = dma_mask;
1256 return 0;
1257}
1258
1259static u64 dma_get_required_mask_pSeriesLP(struct device *dev)
1260{
1261 if (!dev->dma_mask)
1262 return 0;
1263
1264 if (!disable_ddw && dev_is_pci(dev)) {
1265 struct pci_dev *pdev = to_pci_dev(dev);
1266 struct device_node *dn;
1267
1268 dn = pci_device_to_OF_node(pdev);
1269
1270 /* search upwards for ibm,dma-window */
1271 for (; dn && PCI_DN(dn) && !PCI_DN(dn)->table_group;
1272 dn = dn->parent)
1273 if (of_get_property(dn, "ibm,dma-window", NULL))
1274 break;
1275 /* if there is a ibm,ddw-applicable property require 64 bits */
1276 if (dn && PCI_DN(dn) &&
1277 of_get_property(dn, "ibm,ddw-applicable", NULL))
1278 return DMA_BIT_MASK(64);
1279 } 1229 }
1280 1230
1281 return dma_iommu_ops.get_required_mask(dev); 1231 return false;
1282} 1232}
1283 1233
1284static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action, 1234static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action,
@@ -1373,8 +1323,9 @@ void iommu_init_early_pSeries(void)
1373 if (firmware_has_feature(FW_FEATURE_LPAR)) { 1323 if (firmware_has_feature(FW_FEATURE_LPAR)) {
1374 pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeriesLP; 1324 pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeriesLP;
1375 pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeriesLP; 1325 pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeriesLP;
1376 ppc_md.dma_set_mask = dma_set_mask_pSeriesLP; 1326 if (!disable_ddw)
1377 ppc_md.dma_get_required_mask = dma_get_required_mask_pSeriesLP; 1327 pseries_pci_controller_ops.iommu_bypass_supported =
1328 iommu_bypass_supported_pSeriesLP;
1378 } else { 1329 } else {
1379 pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeries; 1330 pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeries;
1380 pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeries; 1331 pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeries;
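
The iommu.c hunk above is the heart of the refactor: instead of every
platform installing its own dma_set_mask() hook, the platform now only
answers a yes/no question, "can this device bypass the IOMMU for this
mask?", and the generic DMA code does the rest. A simplified model of
that control flow (hypothetical names, not the kernel's actual generic
code):

	struct dma_platform_ops {
		/* true if the device may use direct, untranslated DMA */
		bool (*iommu_bypass_supported)(struct pci_dev *pdev, u64 mask);
	};

	static bool can_bypass(const struct dma_platform_ops *ops,
			       struct pci_dev *pdev, u64 mask)
	{
		/* platforms that never support bypass leave the hook NULL */
		return ops->iommu_bypass_supported &&
		       ops->iommu_bypass_supported(pdev, mask);
	}
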
diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c
index 794487313cc8..e73c7e30efe6 100644
--- a/arch/powerpc/platforms/pseries/lparcfg.c
+++ b/arch/powerpc/platforms/pseries/lparcfg.c
@@ -475,6 +475,7 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v)
475 splpar_dispatch_data(m); 475 splpar_dispatch_data(m);
476 476
477 seq_printf(m, "purr=%ld\n", get_purr()); 477 seq_printf(m, "purr=%ld\n", get_purr());
478 seq_printf(m, "tbr=%ld\n", mftb());
478 } else { /* non SPLPAR case */ 479 } else { /* non SPLPAR case */
479 480
480 seq_printf(m, "system_active_processors=%d\n", 481 seq_printf(m, "system_active_processors=%d\n",
diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c
index 1fad4649735b..141795275ccb 100644
--- a/arch/powerpc/platforms/pseries/vio.c
+++ b/arch/powerpc/platforms/pseries/vio.c
@@ -492,7 +492,9 @@ static void *vio_dma_iommu_alloc_coherent(struct device *dev, size_t size,
492 return NULL; 492 return NULL;
493 } 493 }
494 494
495 ret = dma_iommu_ops.alloc(dev, size, dma_handle, flag, attrs); 495 ret = iommu_alloc_coherent(dev, get_iommu_table_base(dev), size,
496 dma_handle, dev->coherent_dma_mask, flag,
497 dev_to_node(dev));
496 if (unlikely(ret == NULL)) { 498 if (unlikely(ret == NULL)) {
497 vio_cmo_dealloc(viodev, roundup(size, PAGE_SIZE)); 499 vio_cmo_dealloc(viodev, roundup(size, PAGE_SIZE));
498 atomic_inc(&viodev->cmo.allocs_failed); 500 atomic_inc(&viodev->cmo.allocs_failed);
@@ -507,8 +509,7 @@ static void vio_dma_iommu_free_coherent(struct device *dev, size_t size,
507{ 509{
508 struct vio_dev *viodev = to_vio_dev(dev); 510 struct vio_dev *viodev = to_vio_dev(dev);
509 511
510 dma_iommu_ops.free(dev, size, vaddr, dma_handle, attrs); 512 iommu_free_coherent(get_iommu_table_base(dev), size, vaddr, dma_handle);
511
512 vio_cmo_dealloc(viodev, roundup(size, PAGE_SIZE)); 513 vio_cmo_dealloc(viodev, roundup(size, PAGE_SIZE));
513} 514}
514 515
@@ -518,22 +519,22 @@ static dma_addr_t vio_dma_iommu_map_page(struct device *dev, struct page *page,
518 unsigned long attrs) 519 unsigned long attrs)
519{ 520{
520 struct vio_dev *viodev = to_vio_dev(dev); 521 struct vio_dev *viodev = to_vio_dev(dev);
521 struct iommu_table *tbl; 522 struct iommu_table *tbl = get_iommu_table_base(dev);
522 dma_addr_t ret = DMA_MAPPING_ERROR; 523 dma_addr_t ret = DMA_MAPPING_ERROR;
523 524
524 tbl = get_iommu_table_base(dev); 525 if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl))))
525 if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)))) { 526 goto out_fail;
526 atomic_inc(&viodev->cmo.allocs_failed); 527 ret = iommu_map_page(dev, tbl, page, offset, size, device_to_mask(dev),
527 return ret; 528 direction, attrs);
528 } 529 if (unlikely(ret == DMA_MAPPING_ERROR))
529 530 goto out_deallocate;
530 ret = dma_iommu_ops.map_page(dev, page, offset, size, direction, attrs);
531 if (unlikely(dma_mapping_error(dev, ret))) {
532 vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)));
533 atomic_inc(&viodev->cmo.allocs_failed);
534 }
535
536 return ret; 531 return ret;
532
533out_deallocate:
534 vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)));
535out_fail:
536 atomic_inc(&viodev->cmo.allocs_failed);
537 return DMA_MAPPING_ERROR;
537} 538}
538 539
539static void vio_dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle, 540static void vio_dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle,
@@ -542,11 +543,9 @@ static void vio_dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle,
542 unsigned long attrs) 543 unsigned long attrs)
543{ 544{
544 struct vio_dev *viodev = to_vio_dev(dev); 545 struct vio_dev *viodev = to_vio_dev(dev);
545 struct iommu_table *tbl; 546 struct iommu_table *tbl = get_iommu_table_base(dev);
546
547 tbl = get_iommu_table_base(dev);
548 dma_iommu_ops.unmap_page(dev, dma_handle, size, direction, attrs);
549 547
548 iommu_unmap_page(tbl, dma_handle, size, direction, attrs);
550 vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl))); 549 vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)));
551} 550}
552 551
@@ -555,34 +554,32 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
555 unsigned long attrs) 554 unsigned long attrs)
556{ 555{
557 struct vio_dev *viodev = to_vio_dev(dev); 556 struct vio_dev *viodev = to_vio_dev(dev);
558 struct iommu_table *tbl; 557 struct iommu_table *tbl = get_iommu_table_base(dev);
559 struct scatterlist *sgl; 558 struct scatterlist *sgl;
560 int ret, count; 559 int ret, count;
561 size_t alloc_size = 0; 560 size_t alloc_size = 0;
562 561
563 tbl = get_iommu_table_base(dev);
564 for_each_sg(sglist, sgl, nelems, count) 562 for_each_sg(sglist, sgl, nelems, count)
565 alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE(tbl)); 563 alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE(tbl));
566 564
567 if (vio_cmo_alloc(viodev, alloc_size)) { 565 if (vio_cmo_alloc(viodev, alloc_size))
568 atomic_inc(&viodev->cmo.allocs_failed); 566 goto out_fail;
569 return 0; 567 ret = ppc_iommu_map_sg(dev, tbl, sglist, nelems, device_to_mask(dev),
570 } 568 direction, attrs);
571 569 if (unlikely(!ret))
572 ret = dma_iommu_ops.map_sg(dev, sglist, nelems, direction, attrs); 570 goto out_deallocate;
573
574 if (unlikely(!ret)) {
575 vio_cmo_dealloc(viodev, alloc_size);
576 atomic_inc(&viodev->cmo.allocs_failed);
577 return ret;
578 }
579 571
580 for_each_sg(sglist, sgl, ret, count) 572 for_each_sg(sglist, sgl, ret, count)
581 alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl)); 573 alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl));
582 if (alloc_size) 574 if (alloc_size)
583 vio_cmo_dealloc(viodev, alloc_size); 575 vio_cmo_dealloc(viodev, alloc_size);
584
585 return ret; 576 return ret;
577
578out_deallocate:
579 vio_cmo_dealloc(viodev, alloc_size);
580out_fail:
581 atomic_inc(&viodev->cmo.allocs_failed);
582 return 0;
586} 583}
587 584
588static void vio_dma_iommu_unmap_sg(struct device *dev, 585static void vio_dma_iommu_unmap_sg(struct device *dev,
@@ -591,40 +588,27 @@ static void vio_dma_iommu_unmap_sg(struct device *dev,
591 unsigned long attrs) 588 unsigned long attrs)
592{ 589{
593 struct vio_dev *viodev = to_vio_dev(dev); 590 struct vio_dev *viodev = to_vio_dev(dev);
594 struct iommu_table *tbl; 591 struct iommu_table *tbl = get_iommu_table_base(dev);
595 struct scatterlist *sgl; 592 struct scatterlist *sgl;
596 size_t alloc_size = 0; 593 size_t alloc_size = 0;
597 int count; 594 int count;
598 595
599 tbl = get_iommu_table_base(dev);
600 for_each_sg(sglist, sgl, nelems, count) 596 for_each_sg(sglist, sgl, nelems, count)
601 alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl)); 597 alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl));
602 598
603 dma_iommu_ops.unmap_sg(dev, sglist, nelems, direction, attrs); 599 ppc_iommu_unmap_sg(tbl, sglist, nelems, direction, attrs);
604
605 vio_cmo_dealloc(viodev, alloc_size); 600 vio_cmo_dealloc(viodev, alloc_size);
606} 601}
607 602
608static int vio_dma_iommu_dma_supported(struct device *dev, u64 mask)
609{
610 return dma_iommu_ops.dma_supported(dev, mask);
611}
612
613static u64 vio_dma_get_required_mask(struct device *dev)
614{
615 return dma_iommu_ops.get_required_mask(dev);
616}
617
618static const struct dma_map_ops vio_dma_mapping_ops = { 603static const struct dma_map_ops vio_dma_mapping_ops = {
619 .alloc = vio_dma_iommu_alloc_coherent, 604 .alloc = vio_dma_iommu_alloc_coherent,
620 .free = vio_dma_iommu_free_coherent, 605 .free = vio_dma_iommu_free_coherent,
621 .mmap = dma_nommu_mmap_coherent,
622 .map_sg = vio_dma_iommu_map_sg, 606 .map_sg = vio_dma_iommu_map_sg,
623 .unmap_sg = vio_dma_iommu_unmap_sg, 607 .unmap_sg = vio_dma_iommu_unmap_sg,
624 .map_page = vio_dma_iommu_map_page, 608 .map_page = vio_dma_iommu_map_page,
625 .unmap_page = vio_dma_iommu_unmap_page, 609 .unmap_page = vio_dma_iommu_unmap_page,
626 .dma_supported = vio_dma_iommu_dma_supported, 610 .dma_supported = dma_iommu_dma_supported,
627 .get_required_mask = vio_dma_get_required_mask, 611 .get_required_mask = dma_iommu_get_required_mask,
628}; 612};
629 613
630/** 614/**
@@ -1715,3 +1699,10 @@ int vio_disable_interrupts(struct vio_dev *dev)
1715} 1699}
1716EXPORT_SYMBOL(vio_disable_interrupts); 1700EXPORT_SYMBOL(vio_disable_interrupts);
1717#endif /* CONFIG_PPC_PSERIES */ 1701#endif /* CONFIG_PPC_PSERIES */
1702
1703static int __init vio_init(void)
1704{
1705 dma_debug_add_bus(&vio_bus_type);
1706 return 0;
1707}
1708fs_initcall(vio_init);
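
The vio.c rework above converts the nested "undo and return" branches
into the kernel's usual goto-unwind layout: each failure jumps to the
label that releases exactly what has been acquired so far, and the
common failure accounting lives once at the bottom. The skeleton of the
pattern, with hypothetical helpers:

	static int do_two_step(void)
	{
		int ret;

		ret = take_a();		/* step 1 (hypothetical) */
		if (ret)
			goto out_fail;

		ret = take_b();		/* step 2 (hypothetical) */
		if (ret)
			goto out_undo_a;

		return 0;

	out_undo_a:
		undo_a();		/* release step 1 only */
	out_fail:
		count_failure();	/* shared failure accounting */
		return ret;
	}
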
diff --git a/arch/powerpc/sysdev/6xx-suspend.S b/arch/powerpc/sysdev/6xx-suspend.S
index cf48e9cb2575..6c4aec25c4ba 100644
--- a/arch/powerpc/sysdev/6xx-suspend.S
+++ b/arch/powerpc/sysdev/6xx-suspend.S
@@ -29,10 +29,9 @@ _GLOBAL(mpc6xx_enter_standby)
29 ori r5, r5, ret_from_standby@l 29 ori r5, r5, ret_from_standby@l
30 mtlr r5 30 mtlr r5
31 31
32 CURRENT_THREAD_INFO(r5, r1) 32 lwz r6, TI_LOCAL_FLAGS(r2)
33 lwz r6, TI_LOCAL_FLAGS(r5)
34 ori r6, r6, _TLF_SLEEPING 33 ori r6, r6, _TLF_SLEEPING
35 stw r6, TI_LOCAL_FLAGS(r5) 34 stw r6, TI_LOCAL_FLAGS(r2)
36 35
37 mfmsr r5 36 mfmsr r5
38 ori r5, r5, MSR_EE 37 ori r5, r5, MSR_EE
diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c
index 25bc25fe0d93..fc5c5c23303e 100644
--- a/arch/powerpc/sysdev/dart_iommu.c
+++ b/arch/powerpc/sysdev/dart_iommu.c
@@ -363,13 +363,6 @@ static void iommu_table_dart_setup(void)
363 set_bit(iommu_table_dart.it_size - 1, iommu_table_dart.it_map); 363 set_bit(iommu_table_dart.it_size - 1, iommu_table_dart.it_map);
364} 364}
365 365
366static void pci_dma_dev_setup_dart(struct pci_dev *dev)
367{
368 if (dart_is_u4)
369 set_dma_offset(&dev->dev, DART_U4_BYPASS_BASE);
370 set_iommu_table_base(&dev->dev, &iommu_table_dart);
371}
372
373static void pci_dma_bus_setup_dart(struct pci_bus *bus) 366static void pci_dma_bus_setup_dart(struct pci_bus *bus)
374{ 367{
375 if (!iommu_table_dart_inited) { 368 if (!iommu_table_dart_inited) {
@@ -393,27 +386,18 @@ static bool dart_device_on_pcie(struct device *dev)
393 return false; 386 return false;
394} 387}
395 388
396static int dart_dma_set_mask(struct device *dev, u64 dma_mask) 389static void pci_dma_dev_setup_dart(struct pci_dev *dev)
397{ 390{
398 if (!dev->dma_mask || !dma_supported(dev, dma_mask)) 391 if (dart_is_u4 && dart_device_on_pcie(&dev->dev))
399 return -EIO; 392 dev->dev.archdata.dma_offset = DART_U4_BYPASS_BASE;
400 393 set_iommu_table_base(&dev->dev, &iommu_table_dart);
401 /* U4 supports a DART bypass, we use it for 64-bit capable 394}
402 * devices to improve performances. However, that only works
403 * for devices connected to U4 own PCIe interface, not bridged
404 * through hypertransport. We need the device to support at
405 * least 40 bits of addresses.
406 */
407 if (dart_device_on_pcie(dev) && dma_mask >= DMA_BIT_MASK(40)) {
408 dev_info(dev, "Using 64-bit DMA iommu bypass\n");
409 set_dma_ops(dev, &dma_nommu_ops);
410 } else {
411 dev_info(dev, "Using 32-bit DMA via iommu\n");
412 set_dma_ops(dev, &dma_iommu_ops);
413 }
414 395
415 *dev->dma_mask = dma_mask; 396static bool iommu_bypass_supported_dart(struct pci_dev *dev, u64 mask)
416 return 0; 397{
398 return dart_is_u4 &&
399 dart_device_on_pcie(&dev->dev) &&
400 mask >= DMA_BIT_MASK(40);
417} 401}
418 402
419void __init iommu_init_early_dart(struct pci_controller_ops *controller_ops) 403void __init iommu_init_early_dart(struct pci_controller_ops *controller_ops)
@@ -431,26 +415,20 @@ void __init iommu_init_early_dart(struct pci_controller_ops *controller_ops)
431 415
432 /* Initialize the DART HW */ 416 /* Initialize the DART HW */
433 if (dart_init(dn) != 0) 417 if (dart_init(dn) != 0)
434 goto bail; 418 return;
435
436 /* Setup bypass if supported */
437 if (dart_is_u4)
438 ppc_md.dma_set_mask = dart_dma_set_mask;
439 419
420 /*
421 * U4 supports a DART bypass, we use it for 64-bit capable devices to
422 * improve performance. However, that only works for devices connected
423 * to the U4 own PCIe interface, not bridged through hypertransport.
424 * We need the device to support at least 40 bits of addresses.
425 */
440 controller_ops->dma_dev_setup = pci_dma_dev_setup_dart; 426 controller_ops->dma_dev_setup = pci_dma_dev_setup_dart;
441 controller_ops->dma_bus_setup = pci_dma_bus_setup_dart; 427 controller_ops->dma_bus_setup = pci_dma_bus_setup_dart;
428 controller_ops->iommu_bypass_supported = iommu_bypass_supported_dart;
442 429
443 /* Setup pci_dma ops */ 430 /* Setup pci_dma ops */
444 set_pci_dma_ops(&dma_iommu_ops); 431 set_pci_dma_ops(&dma_iommu_ops);
445 return;
446
447 bail:
448 /* If init failed, use direct iommu and null setup functions */
449 controller_ops->dma_dev_setup = NULL;
450 controller_ops->dma_bus_setup = NULL;
451
452 /* Setup pci_dma ops */
453 set_pci_dma_ops(&dma_nommu_ops);
454} 432}
455 433
456#ifdef CONFIG_PM 434#ifdef CONFIG_PM
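
iommu_bypass_supported_dart() gates the bypass on mask >= DMA_BIT_MASK(40),
i.e. on the device being able to address at least 40 bits. For reference,
DMA_BIT_MASK(n) (from include/linux/dma-mapping.h) is the all-ones mask of
an n-bit address space:

	#define DMA_BIT_MASK(n)	(((n) == 64) ? ~0ULL : ((1ULL << (n)) - 1))

	u64 mask40 = DMA_BIT_MASK(40);	/* 0x000000ffffffffff */

so any device mask at or above that value covers the whole 40-bit range
the U4 bypass window requires.
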
diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c
index 918be816b097..f49aec251a5a 100644
--- a/arch/powerpc/sysdev/fsl_pci.c
+++ b/arch/powerpc/sysdev/fsl_pci.c
@@ -40,6 +40,7 @@
40#include <asm/mpc85xx.h> 40#include <asm/mpc85xx.h>
41#include <asm/disassemble.h> 41#include <asm/disassemble.h>
42#include <asm/ppc-opcode.h> 42#include <asm/ppc-opcode.h>
43#include <asm/swiotlb.h>
43#include <sysdev/fsl_soc.h> 44#include <sysdev/fsl_soc.h>
44#include <sysdev/fsl_pci.h> 45#include <sysdev/fsl_pci.h>
45 46
@@ -114,33 +115,33 @@ static struct pci_ops fsl_indirect_pcie_ops =
114static u64 pci64_dma_offset; 115static u64 pci64_dma_offset;
115 116
116#ifdef CONFIG_SWIOTLB 117#ifdef CONFIG_SWIOTLB
118static void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev)
119{
120 struct pci_controller *hose = pci_bus_to_host(pdev->bus);
121
122 pdev->dev.bus_dma_mask =
123 hose->dma_window_base_cur + hose->dma_window_size;
124}
125
117static void setup_swiotlb_ops(struct pci_controller *hose) 126static void setup_swiotlb_ops(struct pci_controller *hose)
118{ 127{
119 if (ppc_swiotlb_enable) { 128 if (ppc_swiotlb_enable)
120 hose->controller_ops.dma_dev_setup = pci_dma_dev_setup_swiotlb; 129 hose->controller_ops.dma_dev_setup = pci_dma_dev_setup_swiotlb;
121 set_pci_dma_ops(&powerpc_swiotlb_dma_ops);
122 }
123} 130}
124#else 131#else
125static inline void setup_swiotlb_ops(struct pci_controller *hose) {} 132static inline void setup_swiotlb_ops(struct pci_controller *hose) {}
126#endif 133#endif
127 134
128static int fsl_pci_dma_set_mask(struct device *dev, u64 dma_mask) 135static void fsl_pci_dma_set_mask(struct device *dev, u64 dma_mask)
129{ 136{
130 if (!dev->dma_mask || !dma_supported(dev, dma_mask))
131 return -EIO;
132
133 /* 137 /*
134 * Fix up PCI devices that are able to DMA to the large inbound 138 * Fix up PCI devices that are able to DMA to the large inbound
135 * mapping that allows addressing any RAM address from across PCI. 139 * mapping that allows addressing any RAM address from across PCI.
136 */ 140 */
137 if (dev_is_pci(dev) && dma_mask >= pci64_dma_offset * 2 - 1) { 141 if (dev_is_pci(dev) && dma_mask >= pci64_dma_offset * 2 - 1) {
138 set_dma_ops(dev, &dma_nommu_ops); 142 dev->bus_dma_mask = 0;
139 set_dma_offset(dev, pci64_dma_offset); 143 dev->archdata.dma_offset = pci64_dma_offset;
140 } 144 }
141
142 *dev->dma_mask = dma_mask;
143 return 0;
144} 145}
145 146
146static int setup_one_atmu(struct ccsr_pci __iomem *pci, 147static int setup_one_atmu(struct ccsr_pci __iomem *pci,
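
The swiotlb hook above no longer swaps in special DMA ops; it simply
records the controller's inbound window limit in dev->bus_dma_mask and
lets the generic direct-mapping code bounce anything above that limit
through swiotlb. A sketch of the addressability test this enables
(simplified, not the kernel's exact helper):

	/* sketch: can the device reach this buffer without bouncing? */
	static bool addr_reachable(u64 dma_addr, u64 size,
				   u64 dev_mask, u64 bus_mask)
	{
		/* a zero bus mask means "no extra bus limit" */
		u64 limit = bus_mask ? min(dev_mask, bus_mask) : dev_mask;

		return dma_addr + size - 1 <= limit;
	}
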
diff --git a/arch/powerpc/sysdev/ipic.c b/arch/powerpc/sysdev/ipic.c
index 8030a0f55e96..fd129c8ecceb 100644
--- a/arch/powerpc/sysdev/ipic.c
+++ b/arch/powerpc/sysdev/ipic.c
@@ -771,21 +771,6 @@ struct ipic * __init ipic_init(struct device_node *node, unsigned int flags)
771 return ipic; 771 return ipic;
772} 772}
773 773
774void ipic_set_highest_priority(unsigned int virq)
775{
776 struct ipic *ipic = ipic_from_irq(virq);
777 unsigned int src = virq_to_hw(virq);
778 u32 temp;
779
780 temp = ipic_read(ipic->regs, IPIC_SICFR);
781
782 /* clear and set HPI */
783 temp &= 0x7f000000;
784 temp |= (src & 0x7f) << 24;
785
786 ipic_write(ipic->regs, IPIC_SICFR, temp);
787}
788
789void ipic_set_default_priority(void) 774void ipic_set_default_priority(void)
790{ 775{
791 ipic_write(primary_ipic->regs, IPIC_SIPRR_A, IPIC_PRIORITY_DEFAULT); 776 ipic_write(primary_ipic->regs, IPIC_SIPRR_A, IPIC_PRIORITY_DEFAULT);
@@ -796,26 +781,6 @@ void ipic_set_default_priority(void)
796 ipic_write(primary_ipic->regs, IPIC_SMPRR_B, IPIC_PRIORITY_DEFAULT); 781 ipic_write(primary_ipic->regs, IPIC_SMPRR_B, IPIC_PRIORITY_DEFAULT);
797} 782}
798 783
799void ipic_enable_mcp(enum ipic_mcp_irq mcp_irq)
800{
801 struct ipic *ipic = primary_ipic;
802 u32 temp;
803
804 temp = ipic_read(ipic->regs, IPIC_SERMR);
805 temp |= (1 << (31 - mcp_irq));
806 ipic_write(ipic->regs, IPIC_SERMR, temp);
807}
808
809void ipic_disable_mcp(enum ipic_mcp_irq mcp_irq)
810{
811 struct ipic *ipic = primary_ipic;
812 u32 temp;
813
814 temp = ipic_read(ipic->regs, IPIC_SERMR);
815 temp &= (1 << (31 - mcp_irq));
816 ipic_write(ipic->regs, IPIC_SERMR, temp);
817}
818
819u32 ipic_get_mcp_status(void) 784u32 ipic_get_mcp_status(void)
820{ 785{
821 return primary_ipic ? ipic_read(primary_ipic->regs, IPIC_SERSR) : 0; 786 return primary_ipic ? ipic_read(primary_ipic->regs, IPIC_SERSR) : 0;
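
Besides being unused, the removed ipic_disable_mcp() was also wrong: it
masked with "temp &= (1 << (31 - mcp_irq))", which keeps only the target
bit and clears every other bit in SERMR. The matching set/clear idiom
needs the complement on the clear side:

	reg |=  (1u << n);	/* set bit n   */
	reg &= ~(1u << n);	/* clear bit n */

Deleting the dead function sidesteps the bug entirely.
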
diff --git a/arch/powerpc/sysdev/tsi108_dev.c b/arch/powerpc/sysdev/tsi108_dev.c
index 1fd0717ade02..1f1af12f23e2 100644
--- a/arch/powerpc/sysdev/tsi108_dev.c
+++ b/arch/powerpc/sysdev/tsi108_dev.c
@@ -51,7 +51,7 @@ phys_addr_t get_csrbase(void)
51 const void *prop = of_get_property(tsi, "reg", &size); 51 const void *prop = of_get_property(tsi, "reg", &size);
52 tsi108_csr_base = of_translate_address(tsi, prop); 52 tsi108_csr_base = of_translate_address(tsi, prop);
53 of_node_put(tsi); 53 of_node_put(tsi);
54 }; 54 }
55 return tsi108_csr_base; 55 return tsi108_csr_base;
56} 56}
57 57
diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
index 94a69a62f5db..70a8f9e31a2d 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -442,7 +442,7 @@ static void xive_dec_target_count(int cpu)
442 struct xive_cpu *xc = per_cpu(xive_cpu, cpu); 442 struct xive_cpu *xc = per_cpu(xive_cpu, cpu);
443 struct xive_q *q = &xc->queue[xive_irq_priority]; 443 struct xive_q *q = &xc->queue[xive_irq_priority];
444 444
445 if (unlikely(WARN_ON(cpu < 0 || !xc))) { 445 if (WARN_ON(cpu < 0 || !xc)) {
446 pr_err("%s: cpu=%d xc=%p\n", __func__, cpu, xc); 446 pr_err("%s: cpu=%d xc=%p\n", __func__, cpu, xc);
447 return; 447 return;
448 } 448 }
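
The xive change drops a redundant hint: WARN_ON() already wraps its
condition in unlikely(), so unlikely(WARN_ON(...)) annotated the branch
twice. Roughly the shape of the generic macro (simplified from
include/asm-generic/bug.h):

	#define WARN_ON(condition) ({				\
		int __ret_warn_on = !!(condition);		\
		if (unlikely(__ret_warn_on))			\
			__WARN();				\
		unlikely(__ret_warn_on);			\
	})
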
diff --git a/arch/powerpc/xmon/Makefile b/arch/powerpc/xmon/Makefile
index 878f9c1d3615..3050f9323254 100644
--- a/arch/powerpc/xmon/Makefile
+++ b/arch/powerpc/xmon/Makefile
@@ -5,6 +5,7 @@
5subdir-ccflags-y := $(call cc-disable-warning, builtin-requires-header) 5subdir-ccflags-y := $(call cc-disable-warning, builtin-requires-header)
6 6
7GCOV_PROFILE := n 7GCOV_PROFILE := n
8KCOV_INSTRUMENT := n
8UBSAN_SANITIZE := n 9UBSAN_SANITIZE := n
9 10
10# Disable ftrace for the entire directory 11# Disable ftrace for the entire directory
diff --git a/arch/powerpc/xmon/ppc-dis.c b/arch/powerpc/xmon/ppc-dis.c
index 9deea5ee13f6..27f1e6415036 100644
--- a/arch/powerpc/xmon/ppc-dis.c
+++ b/arch/powerpc/xmon/ppc-dis.c
@@ -158,7 +158,7 @@ int print_insn_powerpc (unsigned long insn, unsigned long memaddr)
158 dialect |= (PPC_OPCODE_POWER5 | PPC_OPCODE_POWER6 | PPC_OPCODE_POWER7 158 dialect |= (PPC_OPCODE_POWER5 | PPC_OPCODE_POWER6 | PPC_OPCODE_POWER7
159 | PPC_OPCODE_POWER8 | PPC_OPCODE_POWER9 | PPC_OPCODE_HTM 159 | PPC_OPCODE_POWER8 | PPC_OPCODE_POWER9 | PPC_OPCODE_HTM
160 | PPC_OPCODE_ALTIVEC | PPC_OPCODE_ALTIVEC2 160 | PPC_OPCODE_ALTIVEC | PPC_OPCODE_ALTIVEC2
161 | PPC_OPCODE_VSX | PPC_OPCODE_VSX3), 161 | PPC_OPCODE_VSX | PPC_OPCODE_VSX3);
162 162
163 /* Get the major opcode of the insn. */ 163 /* Get the major opcode of the insn. */
164 opcode = NULL; 164 opcode = NULL;
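
The ppc-dis.c fix swaps a stray comma for a semicolon. Because assignment
binds tighter than the comma operator, "dialect |= (...)," quietly made
the next assignment the second operand of one long comma expression; the
code still ran, which is why the typo survived. A tiny illustration:

	int a = 0, b = 0;

	a = 1,		/* comma operator: the statement continues... */
	b = 2;		/* ...so this is still part of the same statement */

	/* behaves like "a = 1; b = 2;" here, but is easy to misread */
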
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 757b8499aba2..a0f44f992360 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -2997,7 +2997,7 @@ static void show_task(struct task_struct *tsk)
2997 printf("%px %016lx %6d %6d %c %2d %s\n", tsk, 2997 printf("%px %016lx %6d %6d %c %2d %s\n", tsk,
2998 tsk->thread.ksp, 2998 tsk->thread.ksp,
2999 tsk->pid, rcu_dereference(tsk->parent)->pid, 2999 tsk->pid, rcu_dereference(tsk->parent)->pid,
3000 state, task_thread_info(tsk)->cpu, 3000 state, task_cpu(tsk),
3001 tsk->comm); 3001 tsk->comm);
3002} 3002}
3003 3003
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index bd149905a5b5..b41311f6a94f 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -90,14 +90,14 @@ config GENERIC_CSUM
90config GENERIC_HWEIGHT 90config GENERIC_HWEIGHT
91 def_bool y 91 def_bool y
92 92
93config FIX_EARLYCON_MEM
94 def_bool y
95
93config PGTABLE_LEVELS 96config PGTABLE_LEVELS
94 int 97 int
95 default 3 if 64BIT 98 default 3 if 64BIT
96 default 2 99 default 2
97 100
98config HAVE_KPROBES
99 def_bool n
100
101menu "Platform type" 101menu "Platform type"
102 102
103choice 103choice
diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h
new file mode 100644
index 000000000000..57afe604b495
--- /dev/null
+++ b/arch/riscv/include/asm/fixmap.h
@@ -0,0 +1,44 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * Copyright (C) 2019 Western Digital Corporation or its affiliates.
4 */
5
6#ifndef _ASM_RISCV_FIXMAP_H
7#define _ASM_RISCV_FIXMAP_H
8
9#include <linux/kernel.h>
10#include <linux/sizes.h>
11#include <asm/page.h>
12#include <asm/pgtable.h>
13
14/*
15 * Here we define all the compile-time 'special' virtual addresses.
16 * The point is to have a constant address at compile time, but to
17 * set the physical address only in the boot process.
18 *
19 * These 'compile-time allocated' memory buffers are page-sized. Use
20 * set_fixmap(idx,phys) to associate physical memory with fixmap indices.
21 */
22enum fixed_addresses {
23 FIX_HOLE,
24 FIX_EARLYCON_MEM_BASE,
25 __end_of_fixed_addresses
26};
27
28#define FIXADDR_SIZE (__end_of_fixed_addresses * PAGE_SIZE)
29#define FIXADDR_TOP (PAGE_OFFSET)
30#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
31
32#define FIXMAP_PAGE_IO PAGE_KERNEL
33
34#define __early_set_fixmap __set_fixmap
35
36#define __late_set_fixmap __set_fixmap
37#define __late_clear_fixmap(idx) __set_fixmap((idx), 0, FIXMAP_PAGE_CLEAR)
38
39extern void __set_fixmap(enum fixed_addresses idx,
40 phys_addr_t phys, pgprot_t prot);
41
42#include <asm-generic/fixmap.h>
43
44#endif /* _ASM_RISCV_FIXMAP_H */
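
The new fixmap.h gives RISC-V a set of compile-time-constant virtual
slots whose physical backing is chosen at boot, which is exactly what an
early console needs before the real ioremap machinery is up. Typical use
via the generic helpers from asm-generic/fixmap.h (uart_phys is a
hypothetical physical address discovered from the device tree):

	/* bind the slot to the physical page... */
	set_fixmap_io(FIX_EARLYCON_MEM_BASE, uart_phys & PAGE_MASK);

	/* ...then use its constant virtual address */
	void __iomem *base =
		(void __iomem *)fix_to_virt(FIX_EARLYCON_MEM_BASE);
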
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index a8179a8c1491..1141364d990e 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -404,6 +404,7 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
404#define kern_addr_valid(addr) (1) /* FIXME */ 404#define kern_addr_valid(addr) (1) /* FIXME */
405#endif 405#endif
406 406
407extern void setup_bootmem(void);
407extern void paging_init(void); 408extern void paging_init(void);
408 409
409static inline void pgtable_cache_init(void) 410static inline void pgtable_cache_init(void)
diff --git a/arch/riscv/include/asm/smp.h b/arch/riscv/include/asm/smp.h
index 41aa73b476f4..636a934f013a 100644
--- a/arch/riscv/include/asm/smp.h
+++ b/arch/riscv/include/asm/smp.h
@@ -19,16 +19,17 @@
19#include <linux/thread_info.h> 19#include <linux/thread_info.h>
20 20
21#define INVALID_HARTID ULONG_MAX 21#define INVALID_HARTID ULONG_MAX
22
23struct seq_file;
24extern unsigned long boot_cpu_hartid;
25
26#ifdef CONFIG_SMP
22/* 27/*
23 * Mapping between linux logical cpu index and hartid. 28 * Mapping between linux logical cpu index and hartid.
24 */ 29 */
25extern unsigned long __cpuid_to_hartid_map[NR_CPUS]; 30extern unsigned long __cpuid_to_hartid_map[NR_CPUS];
26#define cpuid_to_hartid_map(cpu) __cpuid_to_hartid_map[cpu] 31#define cpuid_to_hartid_map(cpu) __cpuid_to_hartid_map[cpu]
27 32
28struct seq_file;
29
30#ifdef CONFIG_SMP
31
32/* print IPI stats */ 33/* print IPI stats */
33void show_ipi_stats(struct seq_file *p, int prec); 34void show_ipi_stats(struct seq_file *p, int prec);
34 35
@@ -58,7 +59,14 @@ static inline void show_ipi_stats(struct seq_file *p, int prec)
58 59
59static inline int riscv_hartid_to_cpuid(int hartid) 60static inline int riscv_hartid_to_cpuid(int hartid)
60{ 61{
61 return 0; 62 if (hartid == boot_cpu_hartid)
63 return 0;
64
65 return -1;
66}
67static inline unsigned long cpuid_to_hartid_map(int cpu)
68{
69 return boot_cpu_hartid;
62} 70}
63 71
64static inline void riscv_cpuid_to_hartid_mask(const struct cpumask *in, 72static inline void riscv_cpuid_to_hartid_mask(const struct cpumask *in,
diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c
index f8fa2c63aa89..cf2fca12414a 100644
--- a/arch/riscv/kernel/cpu.c
+++ b/arch/riscv/kernel/cpu.c
@@ -17,44 +17,36 @@
17#include <asm/smp.h> 17#include <asm/smp.h>
18 18
19/* 19/*
20 * Returns the hart ID of the given device tree node, or -1 if the device tree 20 * Returns the hart ID of the given device tree node, or -ENODEV if the node
21 * node isn't a RISC-V hart. 21 * isn't an enabled and valid RISC-V hart node.
22 */ 22 */
23int riscv_of_processor_hartid(struct device_node *node) 23int riscv_of_processor_hartid(struct device_node *node)
24{ 24{
25 const char *isa, *status; 25 const char *isa;
26 u32 hart; 26 u32 hart;
27 27
28 if (!of_device_is_compatible(node, "riscv")) { 28 if (!of_device_is_compatible(node, "riscv")) {
29 pr_warn("Found incompatible CPU\n"); 29 pr_warn("Found incompatible CPU\n");
30 return -(ENODEV); 30 return -ENODEV;
31 } 31 }
32 32
33 if (of_property_read_u32(node, "reg", &hart)) { 33 if (of_property_read_u32(node, "reg", &hart)) {
34 pr_warn("Found CPU without hart ID\n"); 34 pr_warn("Found CPU without hart ID\n");
35 return -(ENODEV); 35 return -ENODEV;
36 }
37 if (hart >= NR_CPUS) {
38 pr_info("Found hart ID %d, which is above NR_CPUs. Disabling this hart\n", hart);
39 return -(ENODEV);
40 } 36 }
41 37
42 if (of_property_read_string(node, "status", &status)) { 38 if (!of_device_is_available(node)) {
43 pr_warn("CPU with hartid=%d has no \"status\" property\n", hart); 39 pr_info("CPU with hartid=%d is not available\n", hart);
44 return -(ENODEV); 40 return -ENODEV;
45 }
46 if (strcmp(status, "okay")) {
47 pr_info("CPU with hartid=%d has a non-okay status of \"%s\"\n", hart, status);
48 return -(ENODEV);
49 } 41 }
50 42
51 if (of_property_read_string(node, "riscv,isa", &isa)) { 43 if (of_property_read_string(node, "riscv,isa", &isa)) {
52 pr_warn("CPU with hartid=%d has no \"riscv,isa\" property\n", hart); 44 pr_warn("CPU with hartid=%d has no \"riscv,isa\" property\n", hart);
53 return -(ENODEV); 45 return -ENODEV;
54 } 46 }
55 if (isa[0] != 'r' || isa[1] != 'v') { 47 if (isa[0] != 'r' || isa[1] != 'v') {
56 pr_warn("CPU with hartid=%d has an invalid ISA of \"%s\"\n", hart, isa); 48 pr_warn("CPU with hartid=%d has an invalid ISA of \"%s\"\n", hart, isa);
57 return -(ENODEV); 49 return -ENODEV;
58 } 50 }
59 51
60 return hart; 52 return hart;
@@ -106,7 +98,7 @@ static void print_isa(struct seq_file *f, const char *orig_isa)
106 * a bit of info describing what went wrong. 98 * a bit of info describing what went wrong.
107 */ 99 */
108 if (isa[0] != '\0') 100 if (isa[0] != '\0')
109 pr_info("unsupported ISA \"%s\" in device tree", orig_isa); 101 pr_info("unsupported ISA \"%s\" in device tree\n", orig_isa);
110} 102}
111 103
112static void print_mmu(struct seq_file *f, const char *mmu_type) 104static void print_mmu(struct seq_file *f, const char *mmu_type)
diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
index a6e369edbbd7..bc29b010b722 100644
--- a/arch/riscv/kernel/cpufeature.c
+++ b/arch/riscv/kernel/cpufeature.c
@@ -20,6 +20,7 @@
20#include <linux/of.h> 20#include <linux/of.h>
21#include <asm/processor.h> 21#include <asm/processor.h>
22#include <asm/hwcap.h> 22#include <asm/hwcap.h>
23#include <asm/smp.h>
23 24
24unsigned long elf_hwcap __read_mostly; 25unsigned long elf_hwcap __read_mostly;
25#ifdef CONFIG_FPU 26#ifdef CONFIG_FPU
@@ -28,7 +29,7 @@ bool has_fpu __read_mostly;
28 29
29void riscv_fill_hwcap(void) 30void riscv_fill_hwcap(void)
30{ 31{
31 struct device_node *node = NULL; 32 struct device_node *node;
32 const char *isa; 33 const char *isa;
33 size_t i; 34 size_t i;
34 static unsigned long isa2hwcap[256] = {0}; 35 static unsigned long isa2hwcap[256] = {0};
@@ -42,36 +43,39 @@ void riscv_fill_hwcap(void)
42 43
43 elf_hwcap = 0; 44 elf_hwcap = 0;
44 45
45 /* 46 for_each_of_cpu_node(node) {
46 * We don't support running Linux on hertergenous ISA systems. For 47 unsigned long this_hwcap = 0;
47 * now, we just check the ISA of the first "okay" processor.
48 */
49 while ((node = of_find_node_by_type(node, "cpu")))
50 if (riscv_of_processor_hartid(node) >= 0)
51 break;
52 if (!node) {
53 pr_warning("Unable to find \"cpu\" devicetree entry");
54 return;
55 }
56 48
57 if (of_property_read_string(node, "riscv,isa", &isa)) { 49 if (riscv_of_processor_hartid(node) < 0)
58 pr_warning("Unable to find \"riscv,isa\" devicetree entry"); 50 continue;
59 of_node_put(node); 51
60 return; 52 if (of_property_read_string(node, "riscv,isa", &isa)) {
61 } 53 pr_warn("Unable to find \"riscv,isa\" devicetree entry\n");
62 of_node_put(node); 54 continue;
55 }
63 56
64 for (i = 0; i < strlen(isa); ++i) 57 for (i = 0; i < strlen(isa); ++i)
65 elf_hwcap |= isa2hwcap[(unsigned char)(isa[i])]; 58 this_hwcap |= isa2hwcap[(unsigned char)(isa[i])];
59
60 /*
 61 * All "okay" harts should have the same ISA. Set HWCAP to the
 62 * capabilities common to every "okay" hart, in case they
 63 * differ.
64 */
65 if (elf_hwcap)
66 elf_hwcap &= this_hwcap;
67 else
68 elf_hwcap = this_hwcap;
69 }
66 70
67 /* We don't support systems with F but without D, so mask those out 71 /* We don't support systems with F but without D, so mask those out
68 * here. */ 72 * here. */
69 if ((elf_hwcap & COMPAT_HWCAP_ISA_F) && !(elf_hwcap & COMPAT_HWCAP_ISA_D)) { 73 if ((elf_hwcap & COMPAT_HWCAP_ISA_F) && !(elf_hwcap & COMPAT_HWCAP_ISA_D)) {
70 pr_info("This kernel does not support systems with F but not D"); 74 pr_info("This kernel does not support systems with F but not D\n");
71 elf_hwcap &= ~COMPAT_HWCAP_ISA_F; 75 elf_hwcap &= ~COMPAT_HWCAP_ISA_F;
72 } 76 }
73 77
74 pr_info("elf_hwcap is 0x%lx", elf_hwcap); 78 pr_info("elf_hwcap is 0x%lx\n", elf_hwcap);
75 79
76#ifdef CONFIG_FPU 80#ifdef CONFIG_FPU
77 if (elf_hwcap & (COMPAT_HWCAP_ISA_F | COMPAT_HWCAP_ISA_D)) 81 if (elf_hwcap & (COMPAT_HWCAP_ISA_F | COMPAT_HWCAP_ISA_D))
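
The reworked riscv_fill_hwcap() above stops trusting the first hart and
instead intersects capabilities across all of them: the first hart seeds
elf_hwcap and every later hart ANDs its own bits in, so only features
common to all harts survive. The accumulation pattern in isolation
(standalone sketch; nharts and hwcap_of() are hypothetical):

	unsigned long common = 0;
	bool seeded = false;

	for (size_t i = 0; i < nharts; i++) {
		unsigned long this_hwcap = hwcap_of(i);

		if (seeded) {
			common &= this_hwcap;	/* keep shared bits only */
		} else {
			common = this_hwcap;	/* first hart seeds the set */
			seeded = true;
		}
	}

Note that the patch seeds via "if (elf_hwcap)", which would mis-seed if
the first hart happened to report no capabilities at all; the explicit
boolean above avoids that corner case.
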
diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c
index a840b7d074f7..b94d8db5ddcc 100644
--- a/arch/riscv/kernel/ftrace.c
+++ b/arch/riscv/kernel/ftrace.c
@@ -32,7 +32,7 @@ static int ftrace_check_current_call(unsigned long hook_pos,
32 * return must be -EINVAL on failed comparison 32 * return must be -EINVAL on failed comparison
33 */ 33 */
34 if (memcmp(expected, replaced, sizeof(replaced))) { 34 if (memcmp(expected, replaced, sizeof(replaced))) {
35 pr_err("%p: expected (%08x %08x) but get (%08x %08x)", 35 pr_err("%p: expected (%08x %08x) but got (%08x %08x)\n",
36 (void *)hook_pos, expected[0], expected[1], replaced[0], 36 (void *)hook_pos, expected[0], expected[1], replaced[0],
37 replaced[1]); 37 replaced[1]);
38 return -EINVAL; 38 return -EINVAL;
diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c
index 77564310235f..ecb654f6a79e 100644
--- a/arch/riscv/kernel/setup.c
+++ b/arch/riscv/kernel/setup.c
@@ -23,7 +23,6 @@
23#include <linux/mm.h> 23#include <linux/mm.h>
24#include <linux/memblock.h> 24#include <linux/memblock.h>
25#include <linux/sched.h> 25#include <linux/sched.h>
26#include <linux/initrd.h>
27#include <linux/console.h> 26#include <linux/console.h>
28#include <linux/screen_info.h> 27#include <linux/screen_info.h>
29#include <linux/of_fdt.h> 28#include <linux/of_fdt.h>
@@ -61,95 +60,9 @@ EXPORT_SYMBOL(empty_zero_page);
61atomic_t hart_lottery; 60atomic_t hart_lottery;
62unsigned long boot_cpu_hartid; 61unsigned long boot_cpu_hartid;
63 62
64unsigned long __cpuid_to_hartid_map[NR_CPUS] = {
65 [0 ... NR_CPUS-1] = INVALID_HARTID
66};
67
68void __init smp_setup_processor_id(void)
69{
70 cpuid_to_hartid_map(0) = boot_cpu_hartid;
71}
72
73#ifdef CONFIG_BLK_DEV_INITRD
74static void __init setup_initrd(void)
75{
76 unsigned long size;
77
78 if (initrd_start >= initrd_end) {
79 printk(KERN_INFO "initrd not found or empty");
80 goto disable;
81 }
82 if (__pa(initrd_end) > PFN_PHYS(max_low_pfn)) {
83 printk(KERN_ERR "initrd extends beyond end of memory");
84 goto disable;
85 }
86
87 size = initrd_end - initrd_start;
88 memblock_reserve(__pa(initrd_start), size);
89 initrd_below_start_ok = 1;
90
91 printk(KERN_INFO "Initial ramdisk at: 0x%p (%lu bytes)\n",
92 (void *)(initrd_start), size);
93 return;
94disable:
95 pr_cont(" - disabling initrd\n");
96 initrd_start = 0;
97 initrd_end = 0;
98}
99#endif /* CONFIG_BLK_DEV_INITRD */
100
101pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss;
102pgd_t trampoline_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE);
103
104#ifndef __PAGETABLE_PMD_FOLDED
105#define NUM_SWAPPER_PMDS ((uintptr_t)-PAGE_OFFSET >> PGDIR_SHIFT)
106pmd_t swapper_pmd[PTRS_PER_PMD*((-PAGE_OFFSET)/PGDIR_SIZE)] __page_aligned_bss;
107pmd_t trampoline_pmd[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE);
108#endif
109
110asmlinkage void __init setup_vm(void)
111{
112 extern char _start;
113 uintptr_t i;
114 uintptr_t pa = (uintptr_t) &_start;
115 pgprot_t prot = __pgprot(pgprot_val(PAGE_KERNEL) | _PAGE_EXEC);
116
117 va_pa_offset = PAGE_OFFSET - pa;
118 pfn_base = PFN_DOWN(pa);
119
120 /* Sanity check alignment and size */
121 BUG_ON((PAGE_OFFSET % PGDIR_SIZE) != 0);
122 BUG_ON((pa % (PAGE_SIZE * PTRS_PER_PTE)) != 0);
123
124#ifndef __PAGETABLE_PMD_FOLDED
125 trampoline_pg_dir[(PAGE_OFFSET >> PGDIR_SHIFT) % PTRS_PER_PGD] =
126 pfn_pgd(PFN_DOWN((uintptr_t)trampoline_pmd),
127 __pgprot(_PAGE_TABLE));
128 trampoline_pmd[0] = pfn_pmd(PFN_DOWN(pa), prot);
129
130 for (i = 0; i < (-PAGE_OFFSET)/PGDIR_SIZE; ++i) {
131 size_t o = (PAGE_OFFSET >> PGDIR_SHIFT) % PTRS_PER_PGD + i;
132 swapper_pg_dir[o] =
133 pfn_pgd(PFN_DOWN((uintptr_t)swapper_pmd) + i,
134 __pgprot(_PAGE_TABLE));
135 }
136 for (i = 0; i < ARRAY_SIZE(swapper_pmd); i++)
137 swapper_pmd[i] = pfn_pmd(PFN_DOWN(pa + i * PMD_SIZE), prot);
138#else
139 trampoline_pg_dir[(PAGE_OFFSET >> PGDIR_SHIFT) % PTRS_PER_PGD] =
140 pfn_pgd(PFN_DOWN(pa), prot);
141
142 for (i = 0; i < (-PAGE_OFFSET)/PGDIR_SIZE; ++i) {
143 size_t o = (PAGE_OFFSET >> PGDIR_SHIFT) % PTRS_PER_PGD + i;
144 swapper_pg_dir[o] =
145 pfn_pgd(PFN_DOWN(pa + i * PGDIR_SIZE), prot);
146 }
147#endif
148}
149
150void __init parse_dtb(unsigned int hartid, void *dtb) 63void __init parse_dtb(unsigned int hartid, void *dtb)
151{ 64{
152 if (!early_init_dt_scan(__va(dtb))) 65 if (early_init_dt_scan(__va(dtb)))
153 return; 66 return;
154 67
155 pr_err("No DTB passed to the kernel\n"); 68 pr_err("No DTB passed to the kernel\n");
@@ -159,60 +72,17 @@ void __init parse_dtb(unsigned int hartid, void *dtb)
159#endif 72#endif
160} 73}
161 74
162static void __init setup_bootmem(void)
163{
164 struct memblock_region *reg;
165 phys_addr_t mem_size = 0;
166
167 /* Find the memory region containing the kernel */
168 for_each_memblock(memory, reg) {
169 phys_addr_t vmlinux_end = __pa(_end);
170 phys_addr_t end = reg->base + reg->size;
171
172 if (reg->base <= vmlinux_end && vmlinux_end <= end) {
173 /*
174 * Reserve from the start of the region to the end of
175 * the kernel
176 */
177 memblock_reserve(reg->base, vmlinux_end - reg->base);
178 mem_size = min(reg->size, (phys_addr_t)-PAGE_OFFSET);
179 }
180 }
181 BUG_ON(mem_size == 0);
182
183 set_max_mapnr(PFN_DOWN(mem_size));
184 max_low_pfn = PFN_DOWN(memblock_end_of_DRAM());
185
186#ifdef CONFIG_BLK_DEV_INITRD
187 setup_initrd();
188#endif /* CONFIG_BLK_DEV_INITRD */
189
190 early_init_fdt_reserve_self();
191 early_init_fdt_scan_reserved_mem();
192 memblock_allow_resize();
193 memblock_dump_all();
194
195 for_each_memblock(memory, reg) {
196 unsigned long start_pfn = memblock_region_memory_base_pfn(reg);
197 unsigned long end_pfn = memblock_region_memory_end_pfn(reg);
198
199 memblock_set_node(PFN_PHYS(start_pfn),
200 PFN_PHYS(end_pfn - start_pfn),
201 &memblock.memory, 0);
202 }
203}
204
205void __init setup_arch(char **cmdline_p) 75void __init setup_arch(char **cmdline_p)
206{ 76{
207 *cmdline_p = boot_command_line;
208
209 parse_early_param();
210
211 init_mm.start_code = (unsigned long) _stext; 77 init_mm.start_code = (unsigned long) _stext;
212 init_mm.end_code = (unsigned long) _etext; 78 init_mm.end_code = (unsigned long) _etext;
213 init_mm.end_data = (unsigned long) _edata; 79 init_mm.end_data = (unsigned long) _edata;
214 init_mm.brk = (unsigned long) _end; 80 init_mm.brk = (unsigned long) _end;
215 81
82 *cmdline_p = boot_command_line;
83
84 parse_early_param();
85
216 setup_bootmem(); 86 setup_bootmem();
217 paging_init(); 87 paging_init();
218 unflatten_device_tree(); 88 unflatten_device_tree();
@@ -231,4 +101,3 @@ void __init setup_arch(char **cmdline_p)
231 101
232 riscv_fill_hwcap(); 102 riscv_fill_hwcap();
233} 103}
234
diff --git a/arch/riscv/kernel/smp.c b/arch/riscv/kernel/smp.c
index 246635eac7bb..0c41d07ec281 100644
--- a/arch/riscv/kernel/smp.c
+++ b/arch/riscv/kernel/smp.c
@@ -36,6 +36,15 @@ enum ipi_message_type {
36 IPI_MAX 36 IPI_MAX
37}; 37};
38 38
39unsigned long __cpuid_to_hartid_map[NR_CPUS] = {
40 [0 ... NR_CPUS-1] = INVALID_HARTID
41};
42
43void __init smp_setup_processor_id(void)
44{
45 cpuid_to_hartid_map(0) = boot_cpu_hartid;
46}
47
39/* A collection of single bit ipi messages. */ 48/* A collection of single bit ipi messages. */
40static struct { 49static struct {
41 unsigned long stats[IPI_MAX] ____cacheline_aligned; 50 unsigned long stats[IPI_MAX] ____cacheline_aligned;
@@ -51,7 +60,6 @@ int riscv_hartid_to_cpuid(int hartid)
51 return i; 60 return i;
52 61
53 pr_err("Couldn't find cpu id for hartid [%d]\n", hartid); 62 pr_err("Couldn't find cpu id for hartid [%d]\n", hartid);
54 BUG();
55 return i; 63 return i;
56} 64}
57 65
diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c
index 18cda0e8cf94..eb533b5c2c8c 100644
--- a/arch/riscv/kernel/smpboot.c
+++ b/arch/riscv/kernel/smpboot.c
@@ -39,6 +39,7 @@
39 39
40void *__cpu_up_stack_pointer[NR_CPUS]; 40void *__cpu_up_stack_pointer[NR_CPUS];
41void *__cpu_up_task_pointer[NR_CPUS]; 41void *__cpu_up_task_pointer[NR_CPUS];
42static DECLARE_COMPLETION(cpu_running);
42 43
43void __init smp_prepare_boot_cpu(void) 44void __init smp_prepare_boot_cpu(void)
44{ 45{
@@ -50,12 +51,12 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
50 51
51void __init setup_smp(void) 52void __init setup_smp(void)
52{ 53{
53 struct device_node *dn = NULL; 54 struct device_node *dn;
54 int hart; 55 int hart;
55 bool found_boot_cpu = false; 56 bool found_boot_cpu = false;
56 int cpuid = 1; 57 int cpuid = 1;
57 58
58 while ((dn = of_find_node_by_type(dn, "cpu"))) { 59 for_each_of_cpu_node(dn) {
59 hart = riscv_of_processor_hartid(dn); 60 hart = riscv_of_processor_hartid(dn);
60 if (hart < 0) 61 if (hart < 0)
61 continue; 62 continue;
@@ -65,6 +66,11 @@ void __init setup_smp(void)
65 found_boot_cpu = 1; 66 found_boot_cpu = 1;
66 continue; 67 continue;
67 } 68 }
69 if (cpuid >= NR_CPUS) {
70 pr_warn("Invalid cpuid [%d] for hartid [%d]\n",
71 cpuid, hart);
72 break;
73 }
68 74
69 cpuid_to_hartid_map(cpuid) = hart; 75 cpuid_to_hartid_map(cpuid) = hart;
70 set_cpu_possible(cpuid, true); 76 set_cpu_possible(cpuid, true);
@@ -77,6 +83,7 @@ void __init setup_smp(void)
77 83
78int __cpu_up(unsigned int cpu, struct task_struct *tidle) 84int __cpu_up(unsigned int cpu, struct task_struct *tidle)
79{ 85{
86 int ret = 0;
80 int hartid = cpuid_to_hartid_map(cpu); 87 int hartid = cpuid_to_hartid_map(cpu);
81 tidle->thread_info.cpu = cpu; 88 tidle->thread_info.cpu = cpu;
82 89
@@ -92,10 +99,16 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle)
92 task_stack_page(tidle) + THREAD_SIZE); 99 task_stack_page(tidle) + THREAD_SIZE);
93 WRITE_ONCE(__cpu_up_task_pointer[hartid], tidle); 100 WRITE_ONCE(__cpu_up_task_pointer[hartid], tidle);
94 101
95 while (!cpu_online(cpu)) 102 lockdep_assert_held(&cpu_running);
96 cpu_relax(); 103 wait_for_completion_timeout(&cpu_running,
104 msecs_to_jiffies(1000));
105
106 if (!cpu_online(cpu)) {
107 pr_crit("CPU%u: failed to come online\n", cpu);
108 ret = -EIO;
109 }
97 110
98 return 0; 111 return ret;
99} 112}
100 113
101void __init smp_cpus_done(unsigned int max_cpus) 114void __init smp_cpus_done(unsigned int max_cpus)
@@ -121,6 +134,7 @@ asmlinkage void __init smp_callin(void)
121 * a local TLB flush right now just in case. 134 * a local TLB flush right now just in case.
122 */ 135 */
123 local_flush_tlb_all(); 136 local_flush_tlb_all();
137 complete(&cpu_running);
124 /* 138 /*
125 * Disable preemption before enabling interrupts, so we don't try to 139 * Disable preemption before enabling interrupts, so we don't try to
126 * schedule a CPU that hasn't actually started yet. 140 * schedule a CPU that hasn't actually started yet.
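
The smpboot.c change above replaces an unbounded cpu_online() spin with
a completion handshake: __cpu_up() sleeps on cpu_running with a one
second timeout and the secondary signals it from smp_callin() once it is
alive. The pattern reduced to its two halves (kernel completion API; the
hunk itself additionally re-checks cpu_online() and returns -EIO):

	static DECLARE_COMPLETION(cpu_running);

	/* boot CPU: wait, but never forever */
	if (!wait_for_completion_timeout(&cpu_running,
					 msecs_to_jiffies(1000)))
		pr_crit("secondary CPU did not come up in time\n");

	/* secondary CPU, once it is running */
	complete(&cpu_running);
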
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 658ebf645f42..b379a75ac6a6 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -17,7 +17,9 @@
17#include <linux/initrd.h> 17#include <linux/initrd.h>
18#include <linux/swap.h> 18#include <linux/swap.h>
19#include <linux/sizes.h> 19#include <linux/sizes.h>
20#include <linux/of_fdt.h>
20 21
22#include <asm/fixmap.h>
21#include <asm/tlbflush.h> 23#include <asm/tlbflush.h>
22#include <asm/sections.h> 24#include <asm/sections.h>
23#include <asm/pgtable.h> 25#include <asm/pgtable.h>
@@ -66,7 +68,159 @@ void free_initmem(void)
66} 68}
67 69
68#ifdef CONFIG_BLK_DEV_INITRD 70#ifdef CONFIG_BLK_DEV_INITRD
69void free_initrd_mem(unsigned long start, unsigned long end) 71static void __init setup_initrd(void)
70{ 72{
73 unsigned long size;
74
75 if (initrd_start >= initrd_end) {
76 pr_info("initrd not found or empty");
77 goto disable;
78 }
79 if (__pa(initrd_end) > PFN_PHYS(max_low_pfn)) {
80 pr_err("initrd extends beyond end of memory");
81 goto disable;
82 }
83
84 size = initrd_end - initrd_start;
85 memblock_reserve(__pa(initrd_start), size);
86 initrd_below_start_ok = 1;
87
88 pr_info("Initial ramdisk at: 0x%p (%lu bytes)\n",
89 (void *)(initrd_start), size);
90 return;
91disable:
92 pr_cont(" - disabling initrd\n");
93 initrd_start = 0;
94 initrd_end = 0;
95}
96
97void __init free_initrd_mem(unsigned long start, unsigned long end)
98{
99 free_reserved_area((void *)start, (void *)end, -1, "initrd");
71} 100}
72#endif /* CONFIG_BLK_DEV_INITRD */ 101#endif /* CONFIG_BLK_DEV_INITRD */
102
103void __init setup_bootmem(void)
104{
105 struct memblock_region *reg;
106 phys_addr_t mem_size = 0;
107
108 /* Find the memory region containing the kernel */
109 for_each_memblock(memory, reg) {
110 phys_addr_t vmlinux_end = __pa(_end);
111 phys_addr_t end = reg->base + reg->size;
112
113 if (reg->base <= vmlinux_end && vmlinux_end <= end) {
114 /*
115 * Reserve from the start of the region to the end of
116 * the kernel
117 */
118 memblock_reserve(reg->base, vmlinux_end - reg->base);
119 mem_size = min(reg->size, (phys_addr_t)-PAGE_OFFSET);
120 }
121 }
122 BUG_ON(mem_size == 0);
123
124 set_max_mapnr(PFN_DOWN(mem_size));
125 max_low_pfn = PFN_DOWN(memblock_end_of_DRAM());
126
127#ifdef CONFIG_BLK_DEV_INITRD
128 setup_initrd();
129#endif /* CONFIG_BLK_DEV_INITRD */
130
131 early_init_fdt_reserve_self();
132 early_init_fdt_scan_reserved_mem();
133 memblock_allow_resize();
134 memblock_dump_all();
135
136 for_each_memblock(memory, reg) {
137 unsigned long start_pfn = memblock_region_memory_base_pfn(reg);
138 unsigned long end_pfn = memblock_region_memory_end_pfn(reg);
139
140 memblock_set_node(PFN_PHYS(start_pfn),
141 PFN_PHYS(end_pfn - start_pfn),
142 &memblock.memory, 0);
143 }
144}
145
146pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss;
147pgd_t trampoline_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE);
148
149#ifndef __PAGETABLE_PMD_FOLDED
150#define NUM_SWAPPER_PMDS ((uintptr_t)-PAGE_OFFSET >> PGDIR_SHIFT)
151pmd_t swapper_pmd[PTRS_PER_PMD*((-PAGE_OFFSET)/PGDIR_SIZE)] __page_aligned_bss;
152pmd_t trampoline_pmd[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE);
153pmd_t fixmap_pmd[PTRS_PER_PMD] __page_aligned_bss;
154#endif
155
156pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss;
157
158void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot)
159{
160 unsigned long addr = __fix_to_virt(idx);
161 pte_t *ptep;
162
163 BUG_ON(idx <= FIX_HOLE || idx >= __end_of_fixed_addresses);
164
165 ptep = &fixmap_pte[pte_index(addr)];
166
167 if (pgprot_val(prot)) {
168 set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, prot));
169 } else {
170 pte_clear(&init_mm, addr, ptep);
171 local_flush_tlb_page(addr);
172 }
173}
174
175asmlinkage void __init setup_vm(void)
176{
177 extern char _start;
178 uintptr_t i;
179 uintptr_t pa = (uintptr_t) &_start;
180 pgprot_t prot = __pgprot(pgprot_val(PAGE_KERNEL) | _PAGE_EXEC);
181
182 va_pa_offset = PAGE_OFFSET - pa;
183 pfn_base = PFN_DOWN(pa);
184
185 /* Sanity check alignment and size */
186 BUG_ON((PAGE_OFFSET % PGDIR_SIZE) != 0);
187 BUG_ON((pa % (PAGE_SIZE * PTRS_PER_PTE)) != 0);
188
189#ifndef __PAGETABLE_PMD_FOLDED
190 trampoline_pg_dir[(PAGE_OFFSET >> PGDIR_SHIFT) % PTRS_PER_PGD] =
191 pfn_pgd(PFN_DOWN((uintptr_t)trampoline_pmd),
192 __pgprot(_PAGE_TABLE));
193 trampoline_pmd[0] = pfn_pmd(PFN_DOWN(pa), prot);
194
195 for (i = 0; i < (-PAGE_OFFSET)/PGDIR_SIZE; ++i) {
196 size_t o = (PAGE_OFFSET >> PGDIR_SHIFT) % PTRS_PER_PGD + i;
197
198 swapper_pg_dir[o] =
199 pfn_pgd(PFN_DOWN((uintptr_t)swapper_pmd) + i,
200 __pgprot(_PAGE_TABLE));
201 }
202 for (i = 0; i < ARRAY_SIZE(swapper_pmd); i++)
203 swapper_pmd[i] = pfn_pmd(PFN_DOWN(pa + i * PMD_SIZE), prot);
204
205 swapper_pg_dir[(FIXADDR_START >> PGDIR_SHIFT) % PTRS_PER_PGD] =
206 pfn_pgd(PFN_DOWN((uintptr_t)fixmap_pmd),
207 __pgprot(_PAGE_TABLE));
208 fixmap_pmd[(FIXADDR_START >> PMD_SHIFT) % PTRS_PER_PMD] =
209 pfn_pmd(PFN_DOWN((uintptr_t)fixmap_pte),
210 __pgprot(_PAGE_TABLE));
211#else
212 trampoline_pg_dir[(PAGE_OFFSET >> PGDIR_SHIFT) % PTRS_PER_PGD] =
213 pfn_pgd(PFN_DOWN(pa), prot);
214
215 for (i = 0; i < (-PAGE_OFFSET)/PGDIR_SIZE; ++i) {
216 size_t o = (PAGE_OFFSET >> PGDIR_SHIFT) % PTRS_PER_PGD + i;
217
218 swapper_pg_dir[o] =
219 pfn_pgd(PFN_DOWN(pa + i * PGDIR_SIZE), prot);
220 }
221
222 swapper_pg_dir[(FIXADDR_START >> PGDIR_SHIFT) % PTRS_PER_PGD] =
223 pfn_pgd(PFN_DOWN((uintptr_t)fixmap_pte),
224 __pgprot(_PAGE_TABLE));
225#endif
226}
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 9c5a67d1b9c1..2d8b9d8ca4f8 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -187,7 +187,6 @@ cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,
187cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1) 187cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1)
188 188
189# does binutils support specific instructions? 189# does binutils support specific instructions?
190asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
191asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1) 190asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1)
192avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1) 191avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
193avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1) 192avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
@@ -217,6 +216,11 @@ KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
217# Avoid indirect branches in kernel to deal with Spectre 216# Avoid indirect branches in kernel to deal with Spectre
218ifdef CONFIG_RETPOLINE 217ifdef CONFIG_RETPOLINE
219 KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) 218 KBUILD_CFLAGS += $(RETPOLINE_CFLAGS)
 219 # Additionally, avoid generating expensive indirect jumps which
 220 # are subject to retpolines for a small number of switch cases.
 221 # clang turns off jump table generation by default for retpoline
 222 # builds; gcc, however, does not on x86.
223 KBUILD_CFLAGS += $(call cc-option,--param=case-values-threshold=20)
220endif 224endif
221 225
222archscripts: scripts_basic 226archscripts: scripts_basic
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 9b5adae9cc40..e2839b5c246c 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -100,7 +100,7 @@ $(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE
100AFLAGS_header.o += -I$(objtree)/$(obj) 100AFLAGS_header.o += -I$(objtree)/$(obj)
101$(obj)/header.o: $(obj)/zoffset.h 101$(obj)/header.o: $(obj)/zoffset.h
102 102
103LDFLAGS_setup.elf := -T 103LDFLAGS_setup.elf := -m elf_i386 -T
104$(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE 104$(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE
105 $(call if_changed,ld) 105 $(call if_changed,ld)
106 106
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index f0515ac895a4..6b84afdd7538 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -84,6 +84,8 @@ ifdef CONFIG_X86_64
84 vmlinux-objs-y += $(obj)/pgtable_64.o 84 vmlinux-objs-y += $(obj)/pgtable_64.o
85endif 85endif
86 86
87vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o
88
87$(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone 89$(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone
88 90
89vmlinux-objs-$(CONFIG_EFI_STUB) += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o \ 91vmlinux-objs-$(CONFIG_EFI_STUB) += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o \
diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c
new file mode 100644
index 000000000000..0ef4ad55b29b
--- /dev/null
+++ b/arch/x86/boot/compressed/acpi.c
@@ -0,0 +1,338 @@
1// SPDX-License-Identifier: GPL-2.0
2#define BOOT_CTYPE_H
3#include "misc.h"
4#include "error.h"
5#include "../string.h"
6
7#include <linux/numa.h>
8#include <linux/efi.h>
9#include <asm/efi.h>
10
11/*
12 * Longest parameter of 'acpi=' is 'copy_dsdt', plus an extra '\0'
13 * for termination.
14 */
15#define MAX_ACPI_ARG_LENGTH 10
16
17/*
18 * Immovable memory regions representation. Max amount of memory regions is
19 * MAX_NUMNODES*2.
20 */
21struct mem_vector immovable_mem[MAX_NUMNODES*2];
22
23/*
24 * Max length of 64-bit hex address string is 19, prefix "0x" + 16 hex
25 * digits, and '\0' for termination.
26 */
27#define MAX_ADDR_LEN 19
28
29static acpi_physical_address get_acpi_rsdp(void)
30{
31 acpi_physical_address addr = 0;
32
33#ifdef CONFIG_KEXEC
34 char val[MAX_ADDR_LEN] = { };
35 int ret;
36
37 ret = cmdline_find_option("acpi_rsdp", val, MAX_ADDR_LEN);
38 if (ret < 0)
39 return 0;
40
41 if (kstrtoull(val, 16, &addr))
42 return 0;
43#endif
44 return addr;
45}
46
47/* Search EFI system tables for RSDP. */
48static acpi_physical_address efi_get_rsdp_addr(void)
49{
50 acpi_physical_address rsdp_addr = 0;
51
52#ifdef CONFIG_EFI
53 unsigned long systab, systab_tables, config_tables;
54 unsigned int nr_tables;
55 struct efi_info *ei;
56 bool efi_64;
57 int size, i;
58 char *sig;
59
60 ei = &boot_params->efi_info;
61 sig = (char *)&ei->efi_loader_signature;
62
63 if (!strncmp(sig, EFI64_LOADER_SIGNATURE, 4)) {
64 efi_64 = true;
65 } else if (!strncmp(sig, EFI32_LOADER_SIGNATURE, 4)) {
66 efi_64 = false;
67 } else {
68 debug_putstr("Wrong EFI loader signature.\n");
69 return 0;
70 }
71
72 /* Get systab from boot params. */
73#ifdef CONFIG_X86_64
74 systab = ei->efi_systab | ((__u64)ei->efi_systab_hi << 32);
75#else
76 if (ei->efi_systab_hi || ei->efi_memmap_hi) {
77 debug_putstr("Error getting RSDP address: EFI system table located above 4GB.\n");
78 return 0;
79 }
80 systab = ei->efi_systab;
81#endif
82 if (!systab)
83 error("EFI system table not found.");
84
85 /* Handle EFI bitness properly */
86 if (efi_64) {
87 efi_system_table_64_t *stbl = (efi_system_table_64_t *)systab;
88
89 config_tables = stbl->tables;
90 nr_tables = stbl->nr_tables;
91 size = sizeof(efi_config_table_64_t);
92 } else {
93 efi_system_table_32_t *stbl = (efi_system_table_32_t *)systab;
94
95 config_tables = stbl->tables;
96 nr_tables = stbl->nr_tables;
97 size = sizeof(efi_config_table_32_t);
98 }
99
100 if (!config_tables)
101 error("EFI config tables not found.");
102
103 /* Get EFI tables from systab. */
104 for (i = 0; i < nr_tables; i++) {
105 acpi_physical_address table;
106 efi_guid_t guid;
107
108 config_tables += size;
109
110 if (efi_64) {
111 efi_config_table_64_t *tbl = (efi_config_table_64_t *)config_tables;
112
113 guid = tbl->guid;
114 table = tbl->table;
115
116 if (!IS_ENABLED(CONFIG_X86_64) && table >> 32) {
117 debug_putstr("Error getting RSDP address: EFI config table located above 4GB.\n");
118 return 0;
119 }
120 } else {
121 efi_config_table_32_t *tbl = (efi_config_table_32_t *)config_tables;
122
123 guid = tbl->guid;
124 table = tbl->table;
125 }
126
127 if (!(efi_guidcmp(guid, ACPI_TABLE_GUID)))
128 rsdp_addr = table;
129 else if (!(efi_guidcmp(guid, ACPI_20_TABLE_GUID)))
130 return table;
131 }
132#endif
133 return rsdp_addr;
134}
135
136static u8 compute_checksum(u8 *buffer, u32 length)
137{
138 u8 *end = buffer + length;
139 u8 sum = 0;
140
141 while (buffer < end)
142 sum += *(buffer++);
143
144 return sum;
145}
146
147/* Search a block of memory for the RSDP signature. */
148static u8 *scan_mem_for_rsdp(u8 *start, u32 length)
149{
150 struct acpi_table_rsdp *rsdp;
151 u8 *address, *end;
152
153 end = start + length;
154
155 /* Search from given start address for the requested length */
156 for (address = start; address < end; address += ACPI_RSDP_SCAN_STEP) {
157 /*
158 * Both RSDP signature and checksum must be correct.
159 * Note: Sometimes there exists more than one RSDP in memory;
160 * the valid RSDP has a valid checksum, all others have an
161 * invalid checksum.
162 */
163 rsdp = (struct acpi_table_rsdp *)address;
164
165 /* BAD Signature */
166 if (!ACPI_VALIDATE_RSDP_SIG(rsdp->signature))
167 continue;
168
169 /* Check the standard checksum */
170 if (compute_checksum((u8 *)rsdp, ACPI_RSDP_CHECKSUM_LENGTH))
171 continue;
172
173 /* Check extended checksum if table version >= 2 */
174 if ((rsdp->revision >= 2) &&
175 (compute_checksum((u8 *)rsdp, ACPI_RSDP_XCHECKSUM_LENGTH)))
176 continue;
177
178 /* Signature and checksum valid, we have found a real RSDP */
179 return address;
180 }
181 return NULL;
182}
183
184/* Search RSDP address in EBDA. */
185static acpi_physical_address bios_get_rsdp_addr(void)
186{
187 unsigned long address;
188 u8 *rsdp;
189
190 /* Get the location of the Extended BIOS Data Area (EBDA) */
191 address = *(u16 *)ACPI_EBDA_PTR_LOCATION;
192 address <<= 4;
193
194 /*
195 * Search EBDA paragraphs (EBDA is required to be a minimum of
196 * 1K length)
197 */
198 if (address > 0x400) {
199 rsdp = scan_mem_for_rsdp((u8 *)address, ACPI_EBDA_WINDOW_SIZE);
200 if (rsdp)
201 return (acpi_physical_address)(unsigned long)rsdp;
202 }
203
204 /* Search upper memory: 16-byte boundaries in E0000h-FFFFFh */
205 rsdp = scan_mem_for_rsdp((u8 *) ACPI_HI_RSDP_WINDOW_BASE,
206 ACPI_HI_RSDP_WINDOW_SIZE);
207 if (rsdp)
208 return (acpi_physical_address)(unsigned long)rsdp;
209
210 return 0;
211}
212
213/* Return RSDP address on success, otherwise 0. */
214acpi_physical_address get_rsdp_addr(void)
215{
216 acpi_physical_address pa;
217
218 pa = get_acpi_rsdp();
219
220 if (!pa)
221 pa = boot_params->acpi_rsdp_addr;
222
223 if (!pa)
224 pa = efi_get_rsdp_addr();
225
226 if (!pa)
227 pa = bios_get_rsdp_addr();
228
229 return pa;
230}
231
232#if defined(CONFIG_RANDOMIZE_BASE) && defined(CONFIG_MEMORY_HOTREMOVE)
233/* Compute SRAT address from RSDP. */
234static unsigned long get_acpi_srat_table(void)
235{
236 unsigned long root_table, acpi_table;
237 struct acpi_table_header *header;
238 struct acpi_table_rsdp *rsdp;
239 u32 num_entries, size, len;
240 char arg[10];
241 u8 *entry;
242
243 rsdp = (struct acpi_table_rsdp *)(long)boot_params->acpi_rsdp_addr;
244 if (!rsdp)
245 return 0;
246
247 /* Get ACPI root table from RSDP.*/
248 if (!(cmdline_find_option("acpi", arg, sizeof(arg)) == 4 &&
249 !strncmp(arg, "rsdt", 4)) &&
250 rsdp->xsdt_physical_address &&
251 rsdp->revision > 1) {
252 root_table = rsdp->xsdt_physical_address;
253 size = ACPI_XSDT_ENTRY_SIZE;
254 } else {
255 root_table = rsdp->rsdt_physical_address;
256 size = ACPI_RSDT_ENTRY_SIZE;
257 }
258
259 if (!root_table)
260 return 0;
261
262 header = (struct acpi_table_header *)root_table;
263 len = header->length;
264 if (len < sizeof(struct acpi_table_header) + size)
265 return 0;
266
267 num_entries = (len - sizeof(struct acpi_table_header)) / size;
268 entry = (u8 *)(root_table + sizeof(struct acpi_table_header));
269
270 while (num_entries--) {
271 if (size == ACPI_RSDT_ENTRY_SIZE)
272 acpi_table = *(u32 *)entry;
273 else
274 acpi_table = *(u64 *)entry;
275
276 if (acpi_table) {
277 header = (struct acpi_table_header *)acpi_table;
278
279 if (ACPI_COMPARE_NAME(header->signature, ACPI_SIG_SRAT))
280 return acpi_table;
281 }
282 entry += size;
283 }
284 return 0;
285}
286
287/**
288 * count_immovable_mem_regions - Parse SRAT and cache the immovable
289 * memory regions into the immovable_mem array.
290 *
291 * Return the number of immovable memory regions on success, 0 on failure:
292 *
293 * - Too many immovable memory regions
294 * - ACPI off or no SRAT found
295 * - No immovable memory region found.
296 */
297int count_immovable_mem_regions(void)
298{
299 unsigned long table_addr, table_end, table;
300 struct acpi_subtable_header *sub_table;
301 struct acpi_table_header *table_header;
302 char arg[MAX_ACPI_ARG_LENGTH];
303 int num = 0;
304
305 if (cmdline_find_option("acpi", arg, sizeof(arg)) == 3 &&
306 !strncmp(arg, "off", 3))
307 return 0;
308
309 table_addr = get_acpi_srat_table();
310 if (!table_addr)
311 return 0;
312
313 table_header = (struct acpi_table_header *)table_addr;
314 table_end = table_addr + table_header->length;
315 table = table_addr + sizeof(struct acpi_table_srat);
316
317 while (table + sizeof(struct acpi_subtable_header) < table_end) {
318 sub_table = (struct acpi_subtable_header *)table;
319 if (sub_table->type == ACPI_SRAT_TYPE_MEMORY_AFFINITY) {
320 struct acpi_srat_mem_affinity *ma;
321
322 ma = (struct acpi_srat_mem_affinity *)sub_table;
323 if (!(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && ma->length) {
324 immovable_mem[num].start = ma->base_address;
325 immovable_mem[num].size = ma->length;
326 num++;
327 }
328
329 if (num >= MAX_NUMNODES*2) {
330 debug_putstr("Too many immovable memory regions, aborting.\n");
331 return 0;
332 }
333 }
334 table += sub_table->length;
335 }
336 return num;
337}
338#endif /* CONFIG_RANDOMIZE_BASE && CONFIG_MEMORY_HOTREMOVE */
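Aside: the validation rule scan_mem_for_rsdp() applies above is just a signature match plus an 8-bit byte sum that must come out to zero -- over the first 20 bytes for an ACPI 1.0 table, and over all 36 bytes when revision >= 2. Below is a minimal userspace sketch of the same check; the struct and function names are local stand-ins, not the kernel's ACPICA types (a real RSDP mapping would also be declared packed, though no padding lands before offset 36 in this layout anyway).

#include <stdint.h>
#include <string.h>
#include <stdio.h>

struct rsdp_v2 {                 /* ACPI 2.0+ RSDP layout */
	char     signature[8];   /* "RSD PTR " */
	uint8_t  checksum;       /* covers the first 20 bytes */
	char     oem_id[6];
	uint8_t  revision;       /* >= 2 means the XSDT fields are valid */
	uint32_t rsdt_address;
	uint32_t length;
	uint64_t xsdt_address;
	uint8_t  ext_checksum;   /* covers all 36 bytes */
	uint8_t  reserved[3];
};

static uint8_t sum_bytes(const uint8_t *p, size_t len)
{
	uint8_t sum = 0;

	while (len--)
		sum += *p++;
	return sum;              /* must be 0 for a valid table */
}

static int rsdp_valid(const struct rsdp_v2 *r)
{
	if (memcmp(r->signature, "RSD PTR ", 8))
		return 0;
	if (sum_bytes((const uint8_t *)r, 20))
		return 0;        /* ACPI 1.0 checksum */
	if (r->revision >= 2 && sum_bytes((const uint8_t *)r, 36))
		return 0;        /* extended checksum */
	return 1;
}

int main(void)
{
	struct rsdp_v2 r = { .signature = "RSD PTR ", .revision = 2 };

	/* Fix up the checksum so the 20-byte sum comes out to zero;
	 * the remaining fields are zero, so the 36-byte sum follows. */
	r.checksum = (uint8_t)-sum_bytes((uint8_t *)&r, 20);
	printf("valid: %d\n", rsdp_valid(&r));
	return 0;
}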
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c
index af6cda0b7900..f1add5d85da9 100644
--- a/arch/x86/boot/compressed/cmdline.c
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -1,8 +1,6 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2#include "misc.h" 2#include "misc.h"
3 3
4#if CONFIG_EARLY_PRINTK || CONFIG_RANDOMIZE_BASE || CONFIG_X86_5LEVEL
5
6static unsigned long fs; 4static unsigned long fs;
7static inline void set_fs(unsigned long seg) 5static inline void set_fs(unsigned long seg)
8{ 6{
@@ -30,5 +28,3 @@ int cmdline_find_option_bool(const char *option)
30{ 28{
31 return __cmdline_find_option_bool(get_cmd_line_ptr(), option); 29 return __cmdline_find_option_bool(get_cmd_line_ptr(), option);
32} 30}
33
34#endif
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index f62e347862cc..fafb75c6c592 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -358,8 +358,11 @@ ENTRY(startup_64)
358 * paging_prepare() sets up the trampoline and checks if we need to 358 * paging_prepare() sets up the trampoline and checks if we need to
359 * enable 5-level paging. 359 * enable 5-level paging.
360 * 360 *
361 * Address of the trampoline is returned in RAX. 361 * paging_prepare() returns a two-quadword structure which lands
362 * Non zero RDX on return means we need to enable 5-level paging. 362 * into RDX:RAX:
363 * - Address of the trampoline is returned in RAX.
364 * - Non zero RDX means trampoline needs to enable 5-level
365 * paging.
363 * 366 *
364 * RSI holds real mode data and needs to be preserved across 367 * RSI holds real mode data and needs to be preserved across
365 * this function call. 368 * this function call.
@@ -565,7 +568,7 @@ adjust_got:
565 * 568 *
566 * RDI contains the return address (might be above 4G). 569 * RDI contains the return address (might be above 4G).
567 * ECX contains the base address of the trampoline memory. 570 * ECX contains the base address of the trampoline memory.
568 * Non zero RDX on return means we need to enable 5-level paging. 571 * Non zero RDX means trampoline needs to enable 5-level paging.
569 */ 572 */
570ENTRY(trampoline_32bit_src) 573ENTRY(trampoline_32bit_src)
571 /* Set up data and stack segments */ 574 /* Set up data and stack segments */
@@ -655,8 +658,6 @@ no_longmode:
655 .data 658 .data
656gdt64: 659gdt64:
657 .word gdt_end - gdt 660 .word gdt_end - gdt
658 .long 0
659 .word 0
660 .quad 0 661 .quad 0
661gdt: 662gdt:
662 .word gdt_end - gdt 663 .word gdt_end - gdt
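Aside: the gdt64 hunk above shrinks the descriptor-table pointer to what LGDT actually consumes in long mode -- a 2-byte limit immediately followed by an 8-byte base. The stray .long/.word that was removed left the base at the wrong offset. A C sketch of the required layout; gdt_ptr64 is an illustrative name, not a kernel type:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* The 64-bit LGDT operand: 2-byte limit followed by an 8-byte base. */
struct gdt_ptr64 {
	uint16_t limit;
	uint64_t base;
} __attribute__((packed));

int main(void)
{
	/* Without packed, the compiler would pad base out to offset 8
	 * and LGDT would read garbage; packed keeps it at offset 2. */
	printf("sizeof = %zu, base at offset %zu\n",
	       sizeof(struct gdt_ptr64),
	       offsetof(struct gdt_ptr64, base));
	return 0;
}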
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index 9ed9709d9947..fa0332dda9f2 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -87,10 +87,6 @@ static unsigned long get_boot_seed(void)
87#define KASLR_COMPRESSED_BOOT 87#define KASLR_COMPRESSED_BOOT
88#include "../../lib/kaslr.c" 88#include "../../lib/kaslr.c"
89 89
90struct mem_vector {
91 unsigned long long start;
92 unsigned long long size;
93};
94 90
95/* Only supporting at most 4 unusable memmap regions with kaslr */ 91/* Only supporting at most 4 unusable memmap regions with kaslr */
96#define MAX_MEMMAP_REGIONS 4 92#define MAX_MEMMAP_REGIONS 4
@@ -101,6 +97,8 @@ static bool memmap_too_large;
101/* Store memory limit specified by "mem=nn[KMG]" or "memmap=nn[KMG]" */ 97/* Store memory limit specified by "mem=nn[KMG]" or "memmap=nn[KMG]" */
102static unsigned long long mem_limit = ULLONG_MAX; 98static unsigned long long mem_limit = ULLONG_MAX;
103 99
100/* Number of immovable memory regions */
101static int num_immovable_mem;
104 102
105enum mem_avoid_index { 103enum mem_avoid_index {
106 MEM_AVOID_ZO_RANGE = 0, 104 MEM_AVOID_ZO_RANGE = 0,
@@ -417,6 +415,9 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size,
417 /* Mark the memmap regions we need to avoid */ 415 /* Mark the memmap regions we need to avoid */
418 handle_mem_options(); 416 handle_mem_options();
419 417
418 /* Enumerate the immovable memory regions */
419 num_immovable_mem = count_immovable_mem_regions();
420
420#ifdef CONFIG_X86_VERBOSE_BOOTUP 421#ifdef CONFIG_X86_VERBOSE_BOOTUP
421 /* Make sure video RAM can be used. */ 422 /* Make sure video RAM can be used. */
422 add_identity_map(0, PMD_SIZE); 423 add_identity_map(0, PMD_SIZE);
@@ -572,9 +573,9 @@ static unsigned long slots_fetch_random(void)
572 return 0; 573 return 0;
573} 574}
574 575
575static void process_mem_region(struct mem_vector *entry, 576static void __process_mem_region(struct mem_vector *entry,
576 unsigned long minimum, 577 unsigned long minimum,
577 unsigned long image_size) 578 unsigned long image_size)
578{ 579{
579 struct mem_vector region, overlap; 580 struct mem_vector region, overlap;
580 unsigned long start_orig, end; 581 unsigned long start_orig, end;
@@ -650,6 +651,56 @@ static void process_mem_region(struct mem_vector *entry,
650 } 651 }
651} 652}
652 653
654static bool process_mem_region(struct mem_vector *region,
655 unsigned long long minimum,
656 unsigned long long image_size)
657{
658 int i;
659 /*
660 * If no immovable memory found, or MEMORY_HOTREMOVE disabled,
661 * use @region directly.
662 */
663 if (!num_immovable_mem) {
664 __process_mem_region(region, minimum, image_size);
665
666 if (slot_area_index == MAX_SLOT_AREA) {
667 debug_putstr("Aborted e820/efi memmap scan (slot_areas full)!\n");
668 return 1;
669 }
670 return 0;
671 }
672
673#if defined(CONFIG_MEMORY_HOTREMOVE) && defined(CONFIG_ACPI)
674 /*
675 * If immovable memory found, filter the intersection between
676 * immovable memory and @region.
677 */
678 for (i = 0; i < num_immovable_mem; i++) {
679 unsigned long long start, end, entry_end, region_end;
680 struct mem_vector entry;
681
682 if (!mem_overlaps(region, &immovable_mem[i]))
683 continue;
684
685 start = immovable_mem[i].start;
686 end = start + immovable_mem[i].size;
687 region_end = region->start + region->size;
688
689 entry.start = clamp(region->start, start, end);
690 entry_end = clamp(region_end, start, end);
691 entry.size = entry_end - entry.start;
692
693 __process_mem_region(&entry, minimum, image_size);
694
695 if (slot_area_index == MAX_SLOT_AREA) {
696 debug_putstr("Aborted e820/efi memmap scan when walking immovable regions(slot_areas full)!\n");
697 return 1;
698 }
699 }
700 return 0;
701#endif
702}
703
653#ifdef CONFIG_EFI 704#ifdef CONFIG_EFI
654/* 705/*
655 * Returns true if mirror region found (and must have been processed 706 * Returns true if mirror region found (and must have been processed
@@ -715,11 +766,8 @@ process_efi_entries(unsigned long minimum, unsigned long image_size)
715 766
716 region.start = md->phys_addr; 767 region.start = md->phys_addr;
717 region.size = md->num_pages << EFI_PAGE_SHIFT; 768 region.size = md->num_pages << EFI_PAGE_SHIFT;
718 process_mem_region(&region, minimum, image_size); 769 if (process_mem_region(&region, minimum, image_size))
719 if (slot_area_index == MAX_SLOT_AREA) {
720 debug_putstr("Aborted EFI scan (slot_areas full)!\n");
721 break; 770 break;
722 }
723 } 771 }
724 return true; 772 return true;
725} 773}
@@ -746,11 +794,8 @@ static void process_e820_entries(unsigned long minimum,
746 continue; 794 continue;
747 region.start = entry->addr; 795 region.start = entry->addr;
748 region.size = entry->size; 796 region.size = entry->size;
749 process_mem_region(&region, minimum, image_size); 797 if (process_mem_region(&region, minimum, image_size))
750 if (slot_area_index == MAX_SLOT_AREA) {
751 debug_putstr("Aborted e820 scan (slot_areas full)!\n");
752 break; 798 break;
753 }
754 } 799 }
755} 800}
756 801
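Aside: the immovable-memory filter added to process_mem_region() above is a plain interval intersection -- clamp both ends of the candidate region into the immovable range and keep whatever is left. A self-contained sketch under that reading; range, clamp_ull and intersect are local stand-ins for the kernel's mem_vector and clamp():

#include <stdio.h>

struct range { unsigned long long start, size; };

static unsigned long long clamp_ull(unsigned long long v,
				    unsigned long long lo,
				    unsigned long long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

/* Intersect *a with [start, end); returns 0 if they don't overlap. */
static int intersect(const struct range *a,
		     unsigned long long start, unsigned long long end,
		     struct range *out)
{
	unsigned long long a_end = a->start + a->size;
	unsigned long long lo = clamp_ull(a->start, start, end);
	unsigned long long hi = clamp_ull(a_end, start, end);

	if (hi <= lo)
		return 0;
	out->start = lo;
	out->size = hi - lo;
	return 1;
}

int main(void)
{
	struct range region = { 0x1000, 0x9000 }, sub;

	if (intersect(&region, 0x4000, 0x20000, &sub))
		printf("overlap at %#llx, size %#llx\n", sub.start, sub.size);
	return 0;
}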
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 8dd1d5ccae58..c0d6c560df69 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -351,6 +351,9 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
351 /* Clear flags intended for solely in-kernel use. */ 351 /* Clear flags intended for solely in-kernel use. */
352 boot_params->hdr.loadflags &= ~KASLR_FLAG; 352 boot_params->hdr.loadflags &= ~KASLR_FLAG;
353 353
354 /* Save RSDP address for later use. */
355 boot_params->acpi_rsdp_addr = get_rsdp_addr();
356
354 sanitize_boot_params(boot_params); 357 sanitize_boot_params(boot_params);
355 358
356 if (boot_params->screen_info.orig_video_mode == 7) { 359 if (boot_params->screen_info.orig_video_mode == 7) {
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index a1d5918765f3..fd13655e0f9b 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -25,6 +25,9 @@
25#include <asm/bootparam.h> 25#include <asm/bootparam.h>
26#include <asm/bootparam_utils.h> 26#include <asm/bootparam_utils.h>
27 27
28#define BOOT_CTYPE_H
29#include <linux/acpi.h>
30
28#define BOOT_BOOT_H 31#define BOOT_BOOT_H
29#include "../ctype.h" 32#include "../ctype.h"
30 33
@@ -63,12 +66,14 @@ static inline void debug_puthex(const char *s)
63 66
64#endif 67#endif
65 68
66#if CONFIG_EARLY_PRINTK || CONFIG_RANDOMIZE_BASE
67/* cmdline.c */ 69/* cmdline.c */
68int cmdline_find_option(const char *option, char *buffer, int bufsize); 70int cmdline_find_option(const char *option, char *buffer, int bufsize);
69int cmdline_find_option_bool(const char *option); 71int cmdline_find_option_bool(const char *option);
70#endif
71 72
73struct mem_vector {
74 unsigned long long start;
75 unsigned long long size;
76};
72 77
73#if CONFIG_RANDOMIZE_BASE 78#if CONFIG_RANDOMIZE_BASE
74/* kaslr.c */ 79/* kaslr.c */
@@ -116,3 +121,17 @@ static inline void console_init(void)
116void set_sev_encryption_mask(void); 121void set_sev_encryption_mask(void);
117 122
118#endif 123#endif
124
125/* acpi.c */
126#ifdef CONFIG_ACPI
127acpi_physical_address get_rsdp_addr(void);
128#else
129static inline acpi_physical_address get_rsdp_addr(void) { return 0; }
130#endif
131
132#if defined(CONFIG_RANDOMIZE_BASE) && defined(CONFIG_MEMORY_HOTREMOVE) && defined(CONFIG_ACPI)
133extern struct mem_vector immovable_mem[MAX_NUMNODES*2];
134int count_immovable_mem_regions(void);
135#else
136static inline int count_immovable_mem_regions(void) { return 0; }
137#endif
diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c
index 9e2157371491..f8debf7aeb4c 100644
--- a/arch/x86/boot/compressed/pgtable_64.c
+++ b/arch/x86/boot/compressed/pgtable_64.c
@@ -1,5 +1,7 @@
1#include <linux/efi.h>
1#include <asm/e820/types.h> 2#include <asm/e820/types.h>
2#include <asm/processor.h> 3#include <asm/processor.h>
4#include <asm/efi.h>
3#include "pgtable.h" 5#include "pgtable.h"
4#include "../string.h" 6#include "../string.h"
5 7
@@ -37,9 +39,10 @@ int cmdline_find_option_bool(const char *option);
37 39
38static unsigned long find_trampoline_placement(void) 40static unsigned long find_trampoline_placement(void)
39{ 41{
40 unsigned long bios_start, ebda_start; 42 unsigned long bios_start = 0, ebda_start = 0;
41 unsigned long trampoline_start; 43 unsigned long trampoline_start;
42 struct boot_e820_entry *entry; 44 struct boot_e820_entry *entry;
45 char *signature;
43 int i; 46 int i;
44 47
45 /* 48 /*
@@ -47,8 +50,18 @@ static unsigned long find_trampoline_placement(void)
47 * This code is based on reserve_bios_regions(). 50 * This code is based on reserve_bios_regions().
48 */ 51 */
49 52
50 ebda_start = *(unsigned short *)0x40e << 4; 53 /*
51 bios_start = *(unsigned short *)0x413 << 10; 54 * EFI systems may not provide legacy ROM. The memory may not be mapped
55 * at all.
56 *
57 * Only look for values in the legacy ROM for non-EFI system.
58 */
59 signature = (char *)&boot_params->efi_info.efi_loader_signature;
60 if (strncmp(signature, EFI32_LOADER_SIGNATURE, 4) &&
61 strncmp(signature, EFI64_LOADER_SIGNATURE, 4)) {
62 ebda_start = *(unsigned short *)0x40e << 4;
63 bios_start = *(unsigned short *)0x413 << 10;
64 }
52 65
53 if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX) 66 if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX)
54 bios_start = BIOS_START_MAX; 67 bios_start = BIOS_START_MAX;
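Aside: both this hunk and bios_get_rsdp_addr() earlier derive the EBDA address by reading a segment value from the BIOS Data Area (the word at 0x40e) and shifting it left by 4 -- ordinary real-mode segment:offset arithmetic. A tiny sketch; the 0x9fc0 value is illustrative, not read from real hardware:

#include <stdint.h>
#include <stdio.h>

/* Real-mode address math: linear = segment * 16 + offset. */
static uint32_t seg_off_to_linear(uint16_t seg, uint16_t off)
{
	return ((uint32_t)seg << 4) + off;
}

int main(void)
{
	/* A BDA word of 0x9fc0 puts the EBDA at 0x9fc00, just
	 * under the 640 KiB line. */
	printf("EBDA at %#x\n", seg_off_to_linear(0x9fc0, 0));
	return 0;
}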
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index f491bbde8493..508cfa6828c5 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -1,7 +1,7 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1/* SPDX-License-Identifier: GPL-2.0 */
2#include <asm-generic/vmlinux.lds.h> 2#include <asm-generic/vmlinux.lds.h>
3 3
4OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) 4OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT)
5 5
6#undef i386 6#undef i386
7 7
diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld
index 96a6c7563538..0149e41d42c2 100644
--- a/arch/x86/boot/setup.ld
+++ b/arch/x86/boot/setup.ld
@@ -3,7 +3,7 @@
3 * 3 *
4 * Linker script for the i386 setup code 4 * Linker script for the i386 setup code
5 */ 5 */
6OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") 6OUTPUT_FORMAT("elf32-i386")
7OUTPUT_ARCH(i386) 7OUTPUT_ARCH(i386)
8ENTRY(_start) 8ENTRY(_start)
9 9
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index c4428a176973..315a67b8896b 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -13,10 +13,14 @@
13 */ 13 */
14 14
15#include <linux/types.h> 15#include <linux/types.h>
16#include <linux/kernel.h>
17#include <linux/errno.h>
16#include <asm/asm.h> 18#include <asm/asm.h>
17#include "ctype.h" 19#include "ctype.h"
18#include "string.h" 20#include "string.h"
19 21
22#define KSTRTOX_OVERFLOW (1U << 31)
23
20/* 24/*
21 * Undef these macros so that the functions that we provide 25 * Undef these macros so that the functions that we provide
22 * here will have the correct names regardless of how string.h 26 * here will have the correct names regardless of how string.h
@@ -187,3 +191,140 @@ char *strchr(const char *s, int c)
187 return NULL; 191 return NULL;
188 return (char *)s; 192 return (char *)s;
189} 193}
194
195static inline u64 __div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
196{
197 union {
198 u64 v64;
199 u32 v32[2];
200 } d = { dividend };
201 u32 upper;
202
203 upper = d.v32[1];
204 d.v32[1] = 0;
205 if (upper >= divisor) {
206 d.v32[1] = upper / divisor;
207 upper %= divisor;
208 }
209 asm ("divl %2" : "=a" (d.v32[0]), "=d" (*remainder) :
210 "rm" (divisor), "0" (d.v32[0]), "1" (upper));
211 return d.v64;
212}
213
214static inline u64 __div_u64(u64 dividend, u32 divisor)
215{
216 u32 remainder;
217
218 return __div_u64_rem(dividend, divisor, &remainder);
219}
220
221static inline char _tolower(const char c)
222{
223 return c | 0x20;
224}
225
226static const char *_parse_integer_fixup_radix(const char *s, unsigned int *base)
227{
228 if (*base == 0) {
229 if (s[0] == '0') {
230 if (_tolower(s[1]) == 'x' && isxdigit(s[2]))
231 *base = 16;
232 else
233 *base = 8;
234 } else
235 *base = 10;
236 }
237 if (*base == 16 && s[0] == '0' && _tolower(s[1]) == 'x')
238 s += 2;
239 return s;
240}
241
242/*
243 * Convert non-negative integer string representation in explicitly given radix
244 * to an integer.
245 * Return number of characters consumed maybe or-ed with overflow bit.
246 * If overflow occurs, result integer (incorrect) is still returned.
247 *
248 * Don't you dare use this function.
249 */
250static unsigned int _parse_integer(const char *s,
251 unsigned int base,
252 unsigned long long *p)
253{
254 unsigned long long res;
255 unsigned int rv;
256
257 res = 0;
258 rv = 0;
259 while (1) {
260 unsigned int c = *s;
261 unsigned int lc = c | 0x20; /* don't tolower() this line */
262 unsigned int val;
263
264 if ('0' <= c && c <= '9')
265 val = c - '0';
266 else if ('a' <= lc && lc <= 'f')
267 val = lc - 'a' + 10;
268 else
269 break;
270
271 if (val >= base)
272 break;
273 /*
274 * Check for overflow only if we are within range of
275 * it in the max base we support (16)
276 */
277 if (unlikely(res & (~0ull << 60))) {
278 if (res > __div_u64(ULLONG_MAX - val, base))
279 rv |= KSTRTOX_OVERFLOW;
280 }
281 res = res * base + val;
282 rv++;
283 s++;
284 }
285 *p = res;
286 return rv;
287}
288
289static int _kstrtoull(const char *s, unsigned int base, unsigned long long *res)
290{
291 unsigned long long _res;
292 unsigned int rv;
293
294 s = _parse_integer_fixup_radix(s, &base);
295 rv = _parse_integer(s, base, &_res);
296 if (rv & KSTRTOX_OVERFLOW)
297 return -ERANGE;
298 if (rv == 0)
299 return -EINVAL;
300 s += rv;
301 if (*s == '\n')
302 s++;
303 if (*s)
304 return -EINVAL;
305 *res = _res;
306 return 0;
307}
308
309/**
310 * kstrtoull - convert a string to an unsigned long long
311 * @s: The start of the string. The string must be null-terminated, and may also
312 * include a single newline before its terminating null. The first character
313 * may also be a plus sign, but not a minus sign.
314 * @base: The number base to use. The maximum supported base is 16. If base is
315 * given as 0, then the base of the string is automatically detected with the
316 * conventional semantics - If it begins with 0x the number will be parsed as a
317 * hexadecimal (case insensitive), if it otherwise begins with 0, it will be
318 * parsed as an octal number. Otherwise it will be parsed as a decimal.
319 * @res: Where to write the result of the conversion on success.
320 *
321 * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
322 * Used as a replacement for the obsolete simple_strtoull. Return code must
323 * be checked.
324 */
325int kstrtoull(const char *s, unsigned int base, unsigned long long *res)
326{
327 if (s[0] == '+')
328 s++;
329 return _kstrtoull(s, base, res);
330}
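Aside: the overflow test in _parse_integer() above avoids a multiply-then-check by asking, before accumulating, whether res * base + val can still fit in 64 bits. A userspace sketch of the same guard; would_overflow is a local name, and ULLONG_MAX comes from limits.h here rather than the kernel's headers:

#include <stdio.h>
#include <limits.h>

/* Would res = res * base + val overflow unsigned long long?
 * Same guard as _parse_integer() above; only worth testing once
 * res has already crept into the top bits. */
static int would_overflow(unsigned long long res,
			  unsigned int base, unsigned int val)
{
	return res > (ULLONG_MAX - val) / base;
}

int main(void)
{
	unsigned long long res = ULLONG_MAX / 16;

	printf("%d\n", would_overflow(res, 16, 15));     /* 0: exactly fits */
	printf("%d\n", would_overflow(res + 1, 16, 0));  /* 1: res * 16 wraps */
	return 0;
}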
diff --git a/arch/x86/boot/string.h b/arch/x86/boot/string.h
index 3d78e27077f4..38d8f2f5e47e 100644
--- a/arch/x86/boot/string.h
+++ b/arch/x86/boot/string.h
@@ -29,4 +29,5 @@ extern unsigned int atou(const char *s);
29extern unsigned long long simple_strtoull(const char *cp, char **endp, 29extern unsigned long long simple_strtoull(const char *cp, char **endp,
30 unsigned int base); 30 unsigned int base);
31 31
32int kstrtoull(const char *s, unsigned int base, unsigned long long *res);
32#endif /* BOOT_STRING_H */ 33#endif /* BOOT_STRING_H */
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 7ca67c482f4c..9f908112bbb9 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -309,3 +309,5 @@ CONFIG_SECURITY_SELINUX_BOOTPARAM=y
309CONFIG_SECURITY_SELINUX_DISABLE=y 309CONFIG_SECURITY_SELINUX_DISABLE=y
310CONFIG_CRYPTO_AES_586=y 310CONFIG_CRYPTO_AES_586=y
311# CONFIG_CRYPTO_ANSI_CPRNG is not set 311# CONFIG_CRYPTO_ANSI_CPRNG is not set
312CONFIG_EFI_STUB=y
313CONFIG_ACPI_BGRT=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 5d42a20e0986..1d3badfda09e 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -307,3 +307,6 @@ CONFIG_SECURITY_SELINUX=y
307CONFIG_SECURITY_SELINUX_BOOTPARAM=y 307CONFIG_SECURITY_SELINUX_BOOTPARAM=y
308CONFIG_SECURITY_SELINUX_DISABLE=y 308CONFIG_SECURITY_SELINUX_DISABLE=y
309# CONFIG_CRYPTO_ANSI_CPRNG is not set 309# CONFIG_CRYPTO_ANSI_CPRNG is not set
310CONFIG_EFI_STUB=y
311CONFIG_EFI_MIXED=y
312CONFIG_ACPI_BGRT=y
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 17096d3cd616..7ec265bacb6a 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4220,6 +4220,8 @@ __init int intel_pmu_init(void)
4220 4220
4221 case INTEL_FAM6_CORE2_MEROM: 4221 case INTEL_FAM6_CORE2_MEROM:
4222 x86_add_quirk(intel_clovertown_quirk); 4222 x86_add_quirk(intel_clovertown_quirk);
4223 /* fall through */
4224
4223 case INTEL_FAM6_CORE2_MEROM_L: 4225 case INTEL_FAM6_CORE2_MEROM_L:
4224 case INTEL_FAM6_CORE2_PENRYN: 4226 case INTEL_FAM6_CORE2_PENRYN:
4225 case INTEL_FAM6_CORE2_DUNNINGTON: 4227 case INTEL_FAM6_CORE2_DUNNINGTON:
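Aside: this is the first of several comment-only hunks in this section that mark deliberate switch fall-throughs. With -Wimplicit-fallthrough enabled, the compiler warns on any case that drops into the next one unless a fall-through comment (or attribute) marks it as intended. A minimal illustration:

#include <stdio.h>

static int classify(int c)
{
	int ret = 0;

	switch (c) {
	case 1:
		ret += 1;
		/* fall through */
	case 2:
		ret += 2;
		break;
	default:
		ret = -1;
		break;
	}
	return ret;
}

int main(void)
{
	/* case 1 falls through into case 2: 1 + 2 = 3. */
	printf("%d %d %d\n", classify(1), classify(2), classify(9));
	return 0;
}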
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index c88ed39582a1..580c1b91c454 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -931,6 +931,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
931 ret = X86_BR_ZERO_CALL; 931 ret = X86_BR_ZERO_CALL;
932 break; 932 break;
933 } 933 }
934 /* fall through */
934 case 0x9a: /* call far absolute */ 935 case 0x9a: /* call far absolute */
935 ret = X86_BR_CALL; 936 ret = X86_BR_CALL;
936 break; 937 break;
diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h
index 1908214b9125..ce92c4acc913 100644
--- a/arch/x86/include/asm/asm-prototypes.h
+++ b/arch/x86/include/asm/asm-prototypes.h
@@ -7,7 +7,6 @@
7 7
8#include <asm-generic/asm-prototypes.h> 8#include <asm-generic/asm-prototypes.h>
9 9
10#include <asm/page.h>
11#include <asm/pgtable.h> 10#include <asm/pgtable.h>
12#include <asm/special_insns.h> 11#include <asm/special_insns.h>
13#include <asm/preempt.h> 12#include <asm/preempt.h>
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index fa2c93cb42a2..fb04a3ded7dd 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -137,37 +137,25 @@ static inline int copy_fxregs_to_user(struct fxregs_state __user *fx)
137{ 137{
138 if (IS_ENABLED(CONFIG_X86_32)) 138 if (IS_ENABLED(CONFIG_X86_32))
139 return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); 139 return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx));
140 else if (IS_ENABLED(CONFIG_AS_FXSAVEQ)) 140 else
141 return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); 141 return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx));
142 142
143 /* See comment in copy_fxregs_to_kernel() below. */
144 return user_insn(rex64/fxsave (%[fx]), "=m" (*fx), [fx] "R" (fx));
145} 143}
146 144
147static inline void copy_kernel_to_fxregs(struct fxregs_state *fx) 145static inline void copy_kernel_to_fxregs(struct fxregs_state *fx)
148{ 146{
149 if (IS_ENABLED(CONFIG_X86_32)) { 147 if (IS_ENABLED(CONFIG_X86_32))
150 kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); 148 kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
151 } else { 149 else
152 if (IS_ENABLED(CONFIG_AS_FXSAVEQ)) { 150 kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
153 kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
154 } else {
155 /* See comment in copy_fxregs_to_kernel() below. */
156 kernel_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), "m" (*fx));
157 }
158 }
159} 151}
160 152
161static inline int copy_user_to_fxregs(struct fxregs_state __user *fx) 153static inline int copy_user_to_fxregs(struct fxregs_state __user *fx)
162{ 154{
163 if (IS_ENABLED(CONFIG_X86_32)) 155 if (IS_ENABLED(CONFIG_X86_32))
164 return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); 156 return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
165 else if (IS_ENABLED(CONFIG_AS_FXSAVEQ)) 157 else
166 return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); 158 return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
167
168 /* See comment in copy_fxregs_to_kernel() below. */
169 return user_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx),
170 "m" (*fx));
171} 159}
172 160
173static inline void copy_kernel_to_fregs(struct fregs_state *fx) 161static inline void copy_kernel_to_fregs(struct fregs_state *fx)
@@ -184,34 +172,8 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu)
184{ 172{
185 if (IS_ENABLED(CONFIG_X86_32)) 173 if (IS_ENABLED(CONFIG_X86_32))
186 asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state.fxsave)); 174 asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state.fxsave));
187 else if (IS_ENABLED(CONFIG_AS_FXSAVEQ)) 175 else
188 asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave)); 176 asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave));
189 else {
190 /* Using "rex64; fxsave %0" is broken because, if the memory
191 * operand uses any extended registers for addressing, a second
192 * REX prefix will be generated (to the assembler, rex64
193 * followed by semicolon is a separate instruction), and hence
194 * the 64-bitness is lost.
195 *
196 * Using "fxsaveq %0" would be the ideal choice, but is only
197 * supported starting with gas 2.16.
198 *
199 * Using, as a workaround, the properly prefixed form below
200 * isn't accepted by any binutils version so far released,
201 * complaining that the same type of prefix is used twice if
202 * an extended register is needed for addressing (fix submitted
203 * to mainline 2005-11-21).
204 *
205 * asm volatile("rex64/fxsave %0" : "=m" (fpu->state.fxsave));
206 *
207 * This, however, we can work around by forcing the compiler to
208 * select an addressing mode that doesn't require extended
209 * registers.
210 */
211 asm volatile( "rex64/fxsave (%[fx])"
212 : "=m" (fpu->state.fxsave)
213 : [fx] "R" (&fpu->state.fxsave));
214 }
215} 177}
216 178
217/* These macros all use (%edi)/(%rdi) as the single memory argument. */ 179/* These macros all use (%edi)/(%rdi) as the single memory argument. */
@@ -414,6 +376,13 @@ static inline int copy_fpregs_to_fpstate(struct fpu *fpu)
414{ 376{
415 if (likely(use_xsave())) { 377 if (likely(use_xsave())) {
416 copy_xregs_to_kernel(&fpu->state.xsave); 378 copy_xregs_to_kernel(&fpu->state.xsave);
379
380 /*
381 * AVX512 state is tracked here because its use is
382 * known to slow the max clock speed of the core.
383 */
384 if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512)
385 fpu->avx512_timestamp = jiffies;
417 return 1; 386 return 1;
418 } 387 }
419 388
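Aside: the rex64/fxsave workaround deleted above existed only for assemblers predating the fxsaveq mnemonic (per the removed comment, gas grew it in 2.16), so the 64-bit path can now use it unconditionally. A userspace x86-64 sketch of the instruction's contract -- a 512-byte, 16-byte-aligned save area, with the x87 control word in the first two bytes (compile and run on x86-64 only):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* FXSAVE writes a 512-byte area that must be 16-byte aligned. */
	static struct { uint8_t b[512]; } area __attribute__((aligned(16)));
	uint16_t fcw;

	asm volatile("fxsaveq %0" : "=m" (area));
	fcw = (uint16_t)area.b[0] | ((uint16_t)area.b[1] << 8);
	printf("FCW = %#x\n", fcw);   /* x87 control word, 0x37f at reset */
	return 0;
}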
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 202c53918ecf..2e32e178e064 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -303,6 +303,13 @@ struct fpu {
303 unsigned char initialized; 303 unsigned char initialized;
304 304
305 /* 305 /*
306 * @avx512_timestamp:
307 *
308 * Records the timestamp of AVX512 use during last context switch.
309 */
310 unsigned long avx512_timestamp;
311
312 /*
306 * @state: 313 * @state:
307 * 314 *
308 * In-memory copy of all FPU registers that we save/restore 315 * In-memory copy of all FPU registers that we save/restore
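Aside: the jiffies stamp recorded at context-switch time gives consumers a cheap "how long since this task last used AVX-512" signal. A hedged sketch of turning such a tick stamp into an age; stamp_age_ms and the HZ value here are illustrative, not a kernel API, and the unsigned subtraction keeps the delta correct across counter wraparound:

#include <stdio.h>

static unsigned long long stamp_age_ms(unsigned long now,
				       unsigned long stamp,
				       unsigned int hz)
{
	/* Unsigned subtraction first, then scale ticks to ms. */
	return ((unsigned long long)(now - stamp) * 1000ull) / hz;
}

int main(void)
{
	/* 250 ticks at HZ=250 is one second. */
	printf("%llu ms\n", stamp_age_ms(10250, 10000, 250));
	return 0;
}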
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 9c85b54bf03c..0bb566315621 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -259,8 +259,7 @@ extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
259extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); 259extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
260 260
261#define gup_fast_permitted gup_fast_permitted 261#define gup_fast_permitted gup_fast_permitted
262static inline bool gup_fast_permitted(unsigned long start, int nr_pages, 262static inline bool gup_fast_permitted(unsigned long start, int nr_pages)
263 int write)
264{ 263{
265 unsigned long len, end; 264 unsigned long len, end;
266 265
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 33051436c864..2bb3a648fc12 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -742,7 +742,6 @@ enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT,
742extern void enable_sep_cpu(void); 742extern void enable_sep_cpu(void);
743extern int sysenter_setup(void); 743extern int sysenter_setup(void);
744 744
745void early_trap_pf_init(void);
746 745
747/* Defined in head.S */ 746/* Defined in head.S */
748extern struct desc_ptr early_gdt_descr; 747extern struct desc_ptr early_gdt_descr;
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 62004d22524a..1954dd5552a2 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -34,10 +34,7 @@ static inline void set_fs(mm_segment_t fs)
34} 34}
35 35
36#define segment_eq(a, b) ((a).seg == (b).seg) 36#define segment_eq(a, b) ((a).seg == (b).seg)
37
38#define user_addr_max() (current->thread.addr_limit.seg) 37#define user_addr_max() (current->thread.addr_limit.seg)
39#define __addr_ok(addr) \
40 ((unsigned long __force)(addr) < user_addr_max())
41 38
42/* 39/*
43 * Test whether a block of memory is a valid user space address. 40 * Test whether a block of memory is a valid user space address.
diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h
index 3f697a9e3f59..8cfccc3cbbf4 100644
--- a/arch/x86/include/asm/uv/bios.h
+++ b/arch/x86/include/asm/uv/bios.h
@@ -141,7 +141,6 @@ enum uv_memprotect {
141 */ 141 */
142extern s64 uv_bios_call(enum uv_bios_cmd, u64, u64, u64, u64, u64); 142extern s64 uv_bios_call(enum uv_bios_cmd, u64, u64, u64, u64, u64);
143extern s64 uv_bios_call_irqsave(enum uv_bios_cmd, u64, u64, u64, u64, u64); 143extern s64 uv_bios_call_irqsave(enum uv_bios_cmd, u64, u64, u64, u64, u64);
144extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64);
145 144
146extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *, long *); 145extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *, long *);
147extern s64 uv_bios_freq_base(u64, u64 *); 146extern s64 uv_bios_freq_base(u64, u64 *);
@@ -152,11 +151,7 @@ extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect);
152extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *); 151extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *);
153extern int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus); 152extern int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus);
154 153
155#ifdef CONFIG_EFI
156extern void uv_bios_init(void); 154extern void uv_bios_init(void);
157#else
158void uv_bios_init(void) { }
159#endif
160 155
161extern unsigned long sn_rtc_cycles_per_second; 156extern unsigned long sn_rtc_cycles_per_second;
162extern int uv_type; 157extern int uv_type;
diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
index 0c26b1b44e51..4203d4f0c68d 100644
--- a/arch/x86/kernel/acpi/wakeup_32.S
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -90,7 +90,7 @@ ret_point:
90.data 90.data
91ALIGN 91ALIGN
92ENTRY(saved_magic) .long 0 92ENTRY(saved_magic) .long 0
93ENTRY(saved_eip) .long 0 93saved_eip: .long 0
94 94
95# saved registers 95# saved registers
96saved_idt: .long 0,0 96saved_idt: .long 0,0
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index 50b8ed0317a3..510fa12aab73 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -125,12 +125,12 @@ ENTRY(do_suspend_lowlevel)
125ENDPROC(do_suspend_lowlevel) 125ENDPROC(do_suspend_lowlevel)
126 126
127.data 127.data
128ENTRY(saved_rbp) .quad 0 128saved_rbp: .quad 0
129ENTRY(saved_rsi) .quad 0 129saved_rsi: .quad 0
130ENTRY(saved_rdi) .quad 0 130saved_rdi: .quad 0
131ENTRY(saved_rbx) .quad 0 131saved_rbx: .quad 0
132 132
133ENTRY(saved_rip) .quad 0 133saved_rip: .quad 0
134ENTRY(saved_rsp) .quad 0 134saved_rsp: .quad 0
135 135
136ENTRY(saved_magic) .quad 0 136ENTRY(saved_magic) .quad 0
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 2953bbf05c08..264e3221d923 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -812,6 +812,7 @@ static int irq_polarity(int idx)
812 return IOAPIC_POL_HIGH; 812 return IOAPIC_POL_HIGH;
813 case MP_IRQPOL_RESERVED: 813 case MP_IRQPOL_RESERVED:
814 pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n"); 814 pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n");
815 /* fall through */
815 case MP_IRQPOL_ACTIVE_LOW: 816 case MP_IRQPOL_ACTIVE_LOW:
816 default: /* Pointless default required due to do gcc stupidity */ 817 default: /* Pointless default required due to do gcc stupidity */
817 return IOAPIC_POL_LOW; 818 return IOAPIC_POL_LOW;
@@ -859,6 +860,7 @@ static int irq_trigger(int idx)
859 return IOAPIC_EDGE; 860 return IOAPIC_EDGE;
860 case MP_IRQTRIG_RESERVED: 861 case MP_IRQTRIG_RESERVED:
861 pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n"); 862 pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n");
863 /* fall through */
862 case MP_IRQTRIG_LEVEL: 864 case MP_IRQTRIG_LEVEL:
863 default: /* Pointless default required due to do gcc stupidity */ 865 default: /* Pointless default required due to do gcc stupidity */
864 return IOAPIC_LEVEL; 866 return IOAPIC_LEVEL;
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index c4d1023fb0ab..395d46f78582 100644
--- a/arch/x86/kernel/cpu/cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -248,6 +248,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
248 switch (leaf) { 248 switch (leaf) {
249 case 1: 249 case 1:
250 l1 = &l1i; 250 l1 = &l1i;
251 /* fall through */
251 case 0: 252 case 0:
252 if (!l1->val) 253 if (!l1->val)
253 return; 254 return;
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 3668c5df90c6..5bd011737272 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -296,7 +296,7 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
296 unsigned long sizek) 296 unsigned long sizek)
297{ 297{
298 unsigned long hole_basek, hole_sizek; 298 unsigned long hole_basek, hole_sizek;
299 unsigned long second_basek, second_sizek; 299 unsigned long second_sizek;
300 unsigned long range0_basek, range0_sizek; 300 unsigned long range0_basek, range0_sizek;
301 unsigned long range_basek, range_sizek; 301 unsigned long range_basek, range_sizek;
302 unsigned long chunk_sizek; 302 unsigned long chunk_sizek;
@@ -304,7 +304,6 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
304 304
305 hole_basek = 0; 305 hole_basek = 0;
306 hole_sizek = 0; 306 hole_sizek = 0;
307 second_basek = 0;
308 second_sizek = 0; 307 second_sizek = 0;
309 chunk_sizek = state->chunk_sizek; 308 chunk_sizek = state->chunk_sizek;
310 gran_sizek = state->gran_sizek; 309 gran_sizek = state->gran_sizek;
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index 14bed6af8377..604c0e3bcc83 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -34,13 +34,6 @@
34#include "pseudo_lock_event.h" 34#include "pseudo_lock_event.h"
35 35
36/* 36/*
37 * MSR_MISC_FEATURE_CONTROL register enables the modification of hardware
38 * prefetcher state. Details about this register can be found in the MSR
39 * tables for specific platforms found in Intel's SDM.
40 */
41#define MSR_MISC_FEATURE_CONTROL 0x000001a4
42
43/*
44 * The bits needed to disable hardware prefetching varies based on the 37 * The bits needed to disable hardware prefetching varies based on the
45 * platform. During initialization we will discover which bits to use. 38 * platform. During initialization we will discover which bits to use.
46 */ 39 */
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 50895c2f937d..a687d10da417 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -671,21 +671,18 @@ __init void e820__reallocate_tables(void)
671 int size; 671 int size;
672 672
673 size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table->nr_entries; 673 size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table->nr_entries;
674 n = kmalloc(size, GFP_KERNEL); 674 n = kmemdup(e820_table, size, GFP_KERNEL);
675 BUG_ON(!n); 675 BUG_ON(!n);
676 memcpy(n, e820_table, size);
677 e820_table = n; 676 e820_table = n;
678 677
679 size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_kexec->nr_entries; 678 size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_kexec->nr_entries;
680 n = kmalloc(size, GFP_KERNEL); 679 n = kmemdup(e820_table_kexec, size, GFP_KERNEL);
681 BUG_ON(!n); 680 BUG_ON(!n);
682 memcpy(n, e820_table_kexec, size);
683 e820_table_kexec = n; 681 e820_table_kexec = n;
684 682
685 size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_firmware->nr_entries; 683 size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_firmware->nr_entries;
686 n = kmalloc(size, GFP_KERNEL); 684 n = kmemdup(e820_table_firmware, size, GFP_KERNEL);
687 BUG_ON(!n); 685 BUG_ON(!n);
688 memcpy(n, e820_table_firmware, size);
689 e820_table_firmware = n; 686 e820_table_firmware = n;
690} 687}
691 688
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 9cc108456d0b..d7432c2b1051 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -669,7 +669,7 @@ static bool is_supported_xstate_size(unsigned int test_xstate_size)
669 return false; 669 return false;
670} 670}
671 671
672static int init_xstate_size(void) 672static int __init init_xstate_size(void)
673{ 673{
674 /* Recompute the context size for enabled features: */ 674 /* Recompute the context size for enabled features: */
675 unsigned int possible_xstate_size; 675 unsigned int possible_xstate_size;
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 34a5c1715148..ff9bfd40429e 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -261,12 +261,8 @@ static int arch_build_bp_info(struct perf_event *bp,
261 * allow kernel breakpoints at all. 261 * allow kernel breakpoints at all.
262 */ 262 */
263 if (attr->bp_addr >= TASK_SIZE_MAX) { 263 if (attr->bp_addr >= TASK_SIZE_MAX) {
264#ifdef CONFIG_KPROBES
265 if (within_kprobe_blacklist(attr->bp_addr)) 264 if (within_kprobe_blacklist(attr->bp_addr))
266 return -EINVAL; 265 return -EINVAL;
267#else
268 return -EINVAL;
269#endif
270 } 266 }
271 267
272 hw->type = X86_BREAKPOINT_EXECUTE; 268 hw->type = X86_BREAKPOINT_EXECUTE;
@@ -279,6 +275,7 @@ static int arch_build_bp_info(struct perf_event *bp,
279 hw->len = X86_BREAKPOINT_LEN_X; 275 hw->len = X86_BREAKPOINT_LEN_X;
280 return 0; 276 return 0;
281 } 277 }
278 /* fall through */
282 default: 279 default:
283 return -EINVAL; 280 return -EINVAL;
284 } 281 }
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 53917a3ebf94..1f3b77367948 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -218,6 +218,9 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params,
218 params->screen_info.ext_mem_k = 0; 218 params->screen_info.ext_mem_k = 0;
219 params->alt_mem_k = 0; 219 params->alt_mem_k = 0;
220 220
221 /* Always fill in RSDP: it is either 0 or a valid value */
222 params->acpi_rsdp_addr = boot_params.acpi_rsdp_addr;
223
221 /* Default APM info */ 224 /* Default APM info */
222 memset(&params->apm_bios_info, 0, sizeof(params->apm_bios_info)); 225 memset(&params->apm_bios_info, 0, sizeof(params->apm_bios_info));
223 226
@@ -256,7 +259,6 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params,
256 setup_efi_state(params, params_load_addr, efi_map_offset, efi_map_sz, 259 setup_efi_state(params, params_load_addr, efi_map_offset, efi_map_sz,
257 efi_setup_data_offset); 260 efi_setup_data_offset);
258#endif 261#endif
259
260 /* Setup EDD info */ 262 /* Setup EDD info */
261 memcpy(params->eddbuf, boot_params.eddbuf, 263 memcpy(params->eddbuf, boot_params.eddbuf,
262 EDDMAXNR * sizeof(struct edd_info)); 264 EDDMAXNR * sizeof(struct edd_info));
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 5db08425063e..4ff6b4cdb941 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -467,6 +467,7 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
467 ptr = &remcomInBuffer[1]; 467 ptr = &remcomInBuffer[1];
468 if (kgdb_hex2long(&ptr, &addr)) 468 if (kgdb_hex2long(&ptr, &addr))
469 linux_regs->ip = addr; 469 linux_regs->ip = addr;
470 /* fall through */
470 case 'D': 471 case 'D':
471 case 'k': 472 case 'k':
472 /* clear the trace bit */ 473 /* clear the trace bit */
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 4c8acdfdc5a7..ceba408ea982 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -352,6 +352,8 @@ void machine_kexec(struct kimage *image)
352 352
353void arch_crash_save_vmcoreinfo(void) 353void arch_crash_save_vmcoreinfo(void)
354{ 354{
355 u64 sme_mask = sme_me_mask;
356
355 VMCOREINFO_NUMBER(phys_base); 357 VMCOREINFO_NUMBER(phys_base);
356 VMCOREINFO_SYMBOL(init_top_pgt); 358 VMCOREINFO_SYMBOL(init_top_pgt);
357 vmcoreinfo_append_str("NUMBER(pgtable_l5_enabled)=%d\n", 359 vmcoreinfo_append_str("NUMBER(pgtable_l5_enabled)=%d\n",
@@ -364,6 +366,7 @@ void arch_crash_save_vmcoreinfo(void)
364 vmcoreinfo_append_str("KERNELOFFSET=%lx\n", 366 vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
365 kaslr_offset()); 367 kaslr_offset());
366 VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE); 368 VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
369 VMCOREINFO_NUMBER(sme_mask);
367} 370}
368 371
369/* arch-dependent functionality related to kexec file-based syscall */ 372/* arch-dependent functionality related to kexec file-based syscall */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c91ff9f9fe8a..ce1a67b70168 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -150,7 +150,7 @@ static inline void smpboot_restore_warm_reset_vector(void)
150 */ 150 */
151static void smp_callin(void) 151static void smp_callin(void)
152{ 152{
153 int cpuid, phys_id; 153 int cpuid;
154 154
155 /* 155 /*
156 * If waken up by an INIT in an 82489DX configuration 156 * If waken up by an INIT in an 82489DX configuration
@@ -161,11 +161,6 @@ static void smp_callin(void)
161 cpuid = smp_processor_id(); 161 cpuid = smp_processor_id();
162 162
163 /* 163 /*
164 * (This works even if the APIC is not enabled.)
165 */
166 phys_id = read_apic_id();
167
168 /*
169 * the boot CPU has finished the init stage and is spinning 164 * the boot CPU has finished the init stage and is spinning
170 * on callin_map until we finish. We are free to set up this 165 * on callin_map until we finish. We are free to set up this
171 * CPU, first the APIC. (this is probably redundant on most 166 * CPU, first the APIC. (this is probably redundant on most
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index e289ce1332ab..d26f9e9c3d83 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -881,12 +881,12 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
881dotraplinkage void 881dotraplinkage void
882do_device_not_available(struct pt_regs *regs, long error_code) 882do_device_not_available(struct pt_regs *regs, long error_code)
883{ 883{
884 unsigned long cr0; 884 unsigned long cr0 = read_cr0();
885 885
886 RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); 886 RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
887 887
888#ifdef CONFIG_MATH_EMULATION 888#ifdef CONFIG_MATH_EMULATION
889 if (!boot_cpu_has(X86_FEATURE_FPU) && (read_cr0() & X86_CR0_EM)) { 889 if (!boot_cpu_has(X86_FEATURE_FPU) && (cr0 & X86_CR0_EM)) {
890 struct math_emu_info info = { }; 890 struct math_emu_info info = { };
891 891
892 cond_local_irq_enable(regs); 892 cond_local_irq_enable(regs);
@@ -898,7 +898,6 @@ do_device_not_available(struct pt_regs *regs, long error_code)
898#endif 898#endif
899 899
900 /* This should not happen. */ 900 /* This should not happen. */
901 cr0 = read_cr0();
902 if (WARN(cr0 & X86_CR0_TS, "CR0.TS was set")) { 901 if (WARN(cr0 & X86_CR0_TS, "CR0.TS was set")) {
903 /* Try to fix it up and carry on. */ 902 /* Try to fix it up and carry on. */
904 write_cr0(cr0 & ~X86_CR0_TS); 903 write_cr0(cr0 & ~X86_CR0_TS);
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 843feb94a950..ccf03416e434 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -745,6 +745,7 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
745 * OPCODE1() of the "short" jmp which checks the same condition. 745 * OPCODE1() of the "short" jmp which checks the same condition.
746 */ 746 */
747 opc1 = OPCODE2(insn) - 0x10; 747 opc1 = OPCODE2(insn) - 0x10;
748 /* fall through */
748 default: 749 default:
749 if (!is_cond_jmp_opcode(opc1)) 750 if (!is_cond_jmp_opcode(opc1))
750 return -ENOSYS; 751 return -ENOSYS;
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 0d618ee634ac..bad8c51fee6e 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -31,7 +31,7 @@
31 31
32#undef i386 /* in case the preprocessor is a 32bit one */ 32#undef i386 /* in case the preprocessor is a 32bit one */
33 33
34OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) 34OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT)
35 35
36#ifdef CONFIG_X86_32 36#ifdef CONFIG_X86_32
37OUTPUT_ARCH(i386) 37OUTPUT_ARCH(i386)
@@ -401,7 +401,7 @@ SECTIONS
401 * Per-cpu symbols which need to be offset from __per_cpu_load 401 * Per-cpu symbols which need to be offset from __per_cpu_load
402 * for the boot processor. 402 * for the boot processor.
403 */ 403 */
404#define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load 404#define INIT_PER_CPU(x) init_per_cpu__##x = ABSOLUTE(x) + __per_cpu_load
405INIT_PER_CPU(gdt_page); 405INIT_PER_CPU(gdt_page);
406INIT_PER_CPU(irq_stack_union); 406INIT_PER_CPU(irq_stack_union);
407 407
diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c
index 9119d8e41f1f..cf00ab6c6621 100644
--- a/arch/x86/lib/insn-eval.c
+++ b/arch/x86/lib/insn-eval.c
@@ -179,6 +179,8 @@ static int resolve_default_seg(struct insn *insn, struct pt_regs *regs, int off)
179 if (insn->addr_bytes == 2) 179 if (insn->addr_bytes == 2)
180 return -EINVAL; 180 return -EINVAL;
181 181
182 /* fall through */
183
182 case -EDOM: 184 case -EDOM:
183 case offsetof(struct pt_regs, bx): 185 case offsetof(struct pt_regs, bx):
184 case offsetof(struct pt_regs, si): 186 case offsetof(struct pt_regs, si):
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
index 12d7e7fb4efd..19c6abf9ea31 100644
--- a/arch/x86/mm/cpu_entry_area.c
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -52,7 +52,7 @@ cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
52 cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot); 52 cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
53} 53}
54 54
55static void percpu_setup_debug_store(int cpu) 55static void __init percpu_setup_debug_store(int cpu)
56{ 56{
57#ifdef CONFIG_CPU_SUP_INTEL 57#ifdef CONFIG_CPU_SUP_INTEL
58 int npages; 58 int npages;
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index e3cdc85ce5b6..ee8f8ab46941 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -444,7 +444,6 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr,
444 int i; 444 int i;
445 pud_t *start, *pud_start; 445 pud_t *start, *pud_start;
446 pgprotval_t prot, eff; 446 pgprotval_t prot, eff;
447 pud_t *prev_pud = NULL;
448 447
449 pud_start = start = (pud_t *)p4d_page_vaddr(addr); 448 pud_start = start = (pud_t *)p4d_page_vaddr(addr);
450 449
@@ -462,7 +461,6 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr,
462 } else 461 } else
463 note_page(m, st, __pgprot(0), 0, 3); 462 note_page(m, st, __pgprot(0), 0, 3);
464 463
465 prev_pud = start;
466 start++; 464 start++;
467 } 465 }
468} 466}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 999d6d8f0bef..bc4bc7b2f075 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -685,9 +685,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
685 * that UV should be updated so that smp_call_function_many(), 685 * that UV should be updated so that smp_call_function_many(),
686 * etc, are optimal on UV. 686 * etc, are optimal on UV.
687 */ 687 */
688 unsigned int cpu;
689
690 cpu = smp_processor_id();
691 cpumask = uv_flush_tlb_others(cpumask, info); 688 cpumask = uv_flush_tlb_others(cpumask, info);
692 if (cpumask) 689 if (cpumask)
693 smp_call_function_many(cpumask, flush_tlb_func_remote, 690 smp_call_function_many(cpumask, flush_tlb_func_remote,
diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c
index eb33432f2f24..ef60d789c76e 100644
--- a/arch/x86/platform/uv/bios_uv.c
+++ b/arch/x86/platform/uv/bios_uv.c
@@ -45,7 +45,7 @@ static s64 __uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
45 * If EFI_OLD_MEMMAP is set, we need to fall back to using our old EFI 45 * If EFI_OLD_MEMMAP is set, we need to fall back to using our old EFI
46 * callback method, which uses efi_call() directly, with the kernel page tables: 46 * callback method, which uses efi_call() directly, with the kernel page tables:
47 */ 47 */
48 if (unlikely(test_bit(EFI_OLD_MEMMAP, &efi.flags))) 48 if (unlikely(efi_enabled(EFI_OLD_MEMMAP)))
49 ret = efi_call((void *)__va(tab->function), (u64)which, a1, a2, a3, a4, a5); 49 ret = efi_call((void *)__va(tab->function), (u64)which, a1, a2, a3, a4, a5);
50 else 50 else
51 ret = efi_call_virt_pointer(tab, function, (u64)which, a1, a2, a3, a4, a5); 51 ret = efi_call_virt_pointer(tab, function, (u64)which, a1, a2, a3, a4, a5);
@@ -85,18 +85,6 @@ s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
85 return ret; 85 return ret;
86} 86}
87 87
88s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
89 u64 a4, u64 a5)
90{
91 s64 ret;
92
93 preempt_disable();
94 ret = uv_bios_call(which, a1, a2, a3, a4, a5);
95 preempt_enable();
96
97 return ret;
98}
99
100 88
101long sn_partition_id; 89long sn_partition_id;
102EXPORT_SYMBOL_GPL(sn_partition_id); 90EXPORT_SYMBOL_GPL(sn_partition_id);
@@ -207,7 +195,6 @@ int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus)
207} 195}
208EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target); 196EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target);
209 197
210#ifdef CONFIG_EFI
211void uv_bios_init(void) 198void uv_bios_init(void)
212{ 199{
213 uv_systab = NULL; 200 uv_systab = NULL;
@@ -237,4 +224,3 @@ void uv_bios_init(void)
237 } 224 }
238 pr_info("UV: UVsystab: Revision:%x\n", uv_systab->revision); 225 pr_info("UV: UVsystab: Revision:%x\n", uv_systab->revision);
239} 226}
240#endif
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index a4130b84d1ff..2c53b0f19329 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -2010,8 +2010,7 @@ static void make_per_cpu_thp(struct bau_control *smaster)
2010 int cpu; 2010 int cpu;
2011 size_t hpsz = sizeof(struct hub_and_pnode) * num_possible_cpus(); 2011 size_t hpsz = sizeof(struct hub_and_pnode) * num_possible_cpus();
2012 2012
2013 smaster->thp = kmalloc_node(hpsz, GFP_KERNEL, smaster->osnode); 2013 smaster->thp = kzalloc_node(hpsz, GFP_KERNEL, smaster->osnode);
2014 memset(smaster->thp, 0, hpsz);
2015 for_each_present_cpu(cpu) { 2014 for_each_present_cpu(cpu) {
2016 smaster->thp[cpu].pnode = uv_cpu_hub_info(cpu)->pnode; 2015 smaster->thp[cpu].pnode = uv_cpu_hub_info(cpu)->pnode;
2017 smaster->thp[cpu].uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; 2016 smaster->thp[cpu].uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
@@ -2135,15 +2134,12 @@ static int __init summarize_uvhub_sockets(int nuvhubs,
2135static int __init init_per_cpu(int nuvhubs, int base_part_pnode) 2134static int __init init_per_cpu(int nuvhubs, int base_part_pnode)
2136{ 2135{
2137 unsigned char *uvhub_mask; 2136 unsigned char *uvhub_mask;
2138 void *vp;
2139 struct uvhub_desc *uvhub_descs; 2137 struct uvhub_desc *uvhub_descs;
2140 2138
2141 if (is_uv3_hub() || is_uv2_hub() || is_uv1_hub()) 2139 if (is_uv3_hub() || is_uv2_hub() || is_uv1_hub())
2142 timeout_us = calculate_destination_timeout(); 2140 timeout_us = calculate_destination_timeout();
2143 2141
2144 vp = kmalloc_array(nuvhubs, sizeof(struct uvhub_desc), GFP_KERNEL); 2142 uvhub_descs = kcalloc(nuvhubs, sizeof(struct uvhub_desc), GFP_KERNEL);
2145 uvhub_descs = (struct uvhub_desc *)vp;
2146 memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
2147 uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL); 2143 uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);
2148 2144
2149 if (get_cpu_topology(base_part_pnode, uvhub_descs, uvhub_mask)) 2145 if (get_cpu_topology(base_part_pnode, uvhub_descs, uvhub_mask))
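Both tlb_uv.c hunks above collapse an allocate-then-zero pair into a single zeroing allocator: kmalloc_node() + memset() becomes kzalloc_node(), and kmalloc_array() + memset() becomes kcalloc(), which also checks the element-count multiplication for overflow. A minimal userspace analogue of the same cleanup, with a stand-in struct rather than the real UV types:

    #include <stdlib.h>

    struct hub_and_pnode { int pnode, uvhub; };     /* stand-in type */

    static struct hub_and_pnode *alloc_thp(size_t ncpus)
    {
            /* Old shape: unchecked multiply, allocate, then zero:
             *
             *     p = malloc(ncpus * sizeof(*p));
             *     memset(p, 0, ncpus * sizeof(*p));
             *
             * New shape: one call that zeroes and rejects overflow.
             */
            return calloc(ncpus, sizeof(struct hub_and_pnode));
    }

calloc(), like kcalloc(), returns NULL if ncpus * sizeof() would overflow, which the open-coded multiply silently wrapped.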
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index 4463fa72db94..96cb20de08af 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -47,7 +47,7 @@ $(obj)/pasyms.h: $(REALMODE_OBJS) FORCE
47targets += realmode.lds 47targets += realmode.lds
48$(obj)/realmode.lds: $(obj)/pasyms.h 48$(obj)/realmode.lds: $(obj)/pasyms.h
49 49
50LDFLAGS_realmode.elf := --emit-relocs -T 50LDFLAGS_realmode.elf := -m elf_i386 --emit-relocs -T
51CPPFLAGS_realmode.lds += -P -C -I$(objtree)/$(obj) 51CPPFLAGS_realmode.lds += -P -C -I$(objtree)/$(obj)
52 52
53targets += realmode.elf 53targets += realmode.elf
diff --git a/arch/x86/realmode/rm/realmode.lds.S b/arch/x86/realmode/rm/realmode.lds.S
index df8e11e26bc3..3bb980800c58 100644
--- a/arch/x86/realmode/rm/realmode.lds.S
+++ b/arch/x86/realmode/rm/realmode.lds.S
@@ -9,7 +9,7 @@
9 9
10#undef i386 10#undef i386
11 11
12OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") 12OUTPUT_FORMAT("elf32-i386")
13OUTPUT_ARCH(i386) 13OUTPUT_ARCH(i386)
14 14
15SECTIONS 15SECTIONS
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index 963986a48c62..bacf87ee7975 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -5,6 +5,8 @@ config XTENSA
5 select ARCH_HAS_SYNC_DMA_FOR_CPU 5 select ARCH_HAS_SYNC_DMA_FOR_CPU
6 select ARCH_HAS_SYNC_DMA_FOR_DEVICE 6 select ARCH_HAS_SYNC_DMA_FOR_DEVICE
7 select ARCH_NO_COHERENT_DMA_MMAP if !MMU 7 select ARCH_NO_COHERENT_DMA_MMAP if !MMU
8 select ARCH_USE_QUEUED_RWLOCKS
9 select ARCH_USE_QUEUED_SPINLOCKS
8 select ARCH_WANT_FRAME_POINTERS 10 select ARCH_WANT_FRAME_POINTERS
9 select ARCH_WANT_IPC_PARSE_VERSION 11 select ARCH_WANT_IPC_PARSE_VERSION
10 select BUILDTIME_EXTABLE_SORT 12 select BUILDTIME_EXTABLE_SORT
diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild
index 809f39ce08c0..d939e13e8d84 100644
--- a/arch/xtensa/include/asm/Kbuild
+++ b/arch/xtensa/include/asm/Kbuild
@@ -23,6 +23,8 @@ generic-y += mm-arch-hooks.h
23generic-y += param.h 23generic-y += param.h
24generic-y += percpu.h 24generic-y += percpu.h
25generic-y += preempt.h 25generic-y += preempt.h
26generic-y += qrwlock.h
27generic-y += qspinlock.h
26generic-y += rwsem.h 28generic-y += rwsem.h
27generic-y += sections.h 29generic-y += sections.h
28generic-y += socket.h 30generic-y += socket.h
diff --git a/arch/xtensa/include/asm/cmpxchg.h b/arch/xtensa/include/asm/cmpxchg.h
index 201e9009efd8..22a10c715c1f 100644
--- a/arch/xtensa/include/asm/cmpxchg.h
+++ b/arch/xtensa/include/asm/cmpxchg.h
@@ -13,6 +13,7 @@
13 13
14#ifndef __ASSEMBLY__ 14#ifndef __ASSEMBLY__
15 15
16#include <linux/bits.h>
16#include <linux/stringify.h> 17#include <linux/stringify.h>
17 18
18/* 19/*
@@ -138,6 +139,28 @@ static inline unsigned long xchg_u32(volatile int * m, unsigned long val)
138#define xchg(ptr,x) \ 139#define xchg(ptr,x) \
139 ((__typeof__(*(ptr)))__xchg((unsigned long)(x),(ptr),sizeof(*(ptr)))) 140 ((__typeof__(*(ptr)))__xchg((unsigned long)(x),(ptr),sizeof(*(ptr))))
140 141
142static inline u32 xchg_small(volatile void *ptr, u32 x, int size)
143{
144 int off = (unsigned long)ptr % sizeof(u32);
145 volatile u32 *p = ptr - off;
146#ifdef __BIG_ENDIAN
147 int bitoff = (sizeof(u32) - size - off) * BITS_PER_BYTE;
148#else
149 int bitoff = off * BITS_PER_BYTE;
150#endif
151 u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff;
152 u32 oldv, newv;
153 u32 ret;
154
155 do {
156 oldv = READ_ONCE(*p);
157 ret = (oldv & bitmask) >> bitoff;
158 newv = (oldv & ~bitmask) | (x << bitoff);
159 } while (__cmpxchg_u32(p, oldv, newv) != oldv);
160
161 return ret;
162}
163
141/* 164/*
142 * This only works if the compiler isn't horribly bad at optimizing. 165 * This only works if the compiler isn't horribly bad at optimizing.
143 * gcc-2.5.8 reportedly can't handle this, but I define that one to 166 * gcc-2.5.8 reportedly can't handle this, but I define that one to
@@ -150,11 +173,16 @@ static __inline__ unsigned long
150__xchg(unsigned long x, volatile void * ptr, int size) 173__xchg(unsigned long x, volatile void * ptr, int size)
151{ 174{
152 switch (size) { 175 switch (size) {
153 case 4: 176 case 1:
154 return xchg_u32(ptr, x); 177 return xchg_small(ptr, x, 1);
178 case 2:
179 return xchg_small(ptr, x, 2);
180 case 4:
181 return xchg_u32(ptr, x);
182 default:
183 __xchg_called_with_bad_pointer();
184 return x;
155 } 185 }
156 __xchg_called_with_bad_pointer();
157 return x;
158} 186}
159 187
160#endif /* __ASSEMBLY__ */ 188#endif /* __ASSEMBLY__ */
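The new xchg_small() is the interesting part of this hunk: xtensa only has a 32-bit compare-and-swap (s32c1i), so 1- and 2-byte exchanges are emulated by CAS-looping on the containing aligned word and splicing the new value into the addressed subfield. A runnable C11 sketch of the same technique, little-endian only (the kernel version picks the shift for either endianness) and treating plain memory as atomic for demo purposes:

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint32_t xchg_small(void *ptr, uint32_t x, int size)
    {
            uintptr_t addr = (uintptr_t)ptr;
            uintptr_t off = addr % sizeof(uint32_t);
            _Atomic uint32_t *word = (_Atomic uint32_t *)(addr - off);
            int bitoff = off * 8;                    /* little-endian */
            uint32_t mask = ((1u << (size * 8)) - 1) << bitoff;
            uint32_t oldv = atomic_load(word), newv;

            do {
                    /* Replace only the addressed subfield of the word. */
                    newv = (oldv & ~mask) | ((x << bitoff) & mask);
            } while (!atomic_compare_exchange_weak(word, &oldv, newv));

            return (oldv & mask) >> bitoff;          /* previous value */
    }

    int main(void)
    {
            _Alignas(uint32_t) uint16_t v[2] = { 0x1111, 0x2222 };
            uint32_t old = xchg_small(&v[1], 0xbeef, sizeof(v[1]));

            printf("old=%#x new=%#x\n", old, (unsigned)v[1]);
            /* old=0x2222 new=0xbeef */
            return 0;
    }

The weak CAS updates oldv on failure, so each retry recomputes newv from the freshly observed word; concurrent writers to the other bytes of the word merely force a retry, never a lost update.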
diff --git a/arch/xtensa/include/asm/spinlock.h b/arch/xtensa/include/asm/spinlock.h
index c6e1290dcbb7..584b0de6f2ca 100644
--- a/arch/xtensa/include/asm/spinlock.h
+++ b/arch/xtensa/include/asm/spinlock.h
@@ -12,188 +12,9 @@
12#define _XTENSA_SPINLOCK_H 12#define _XTENSA_SPINLOCK_H
13 13
14#include <asm/barrier.h> 14#include <asm/barrier.h>
15#include <asm/processor.h> 15#include <asm/qrwlock.h>
16#include <asm/qspinlock.h>
16 17
17/* 18#define smp_mb__after_spinlock() smp_mb()
18 * spinlock
19 *
20 * There is at most one owner of a spinlock. There are not different
21 * types of spinlock owners like there are for rwlocks (see below).
22 *
23 * When trying to obtain a spinlock, the function "spins" forever, or busy-
24 * waits, until the lock is obtained. When spinning, presumably some other
25 * owner will soon give up the spinlock making it available to others. Use
26 * the trylock functions to avoid spinning forever.
27 *
28 * possible values:
29 *
30 * 0 nobody owns the spinlock
31 * 1 somebody owns the spinlock
32 */
33
34#define arch_spin_is_locked(x) ((x)->slock != 0)
35
36static inline void arch_spin_lock(arch_spinlock_t *lock)
37{
38 unsigned long tmp;
39
40 __asm__ __volatile__(
41 " movi %0, 0\n"
42 " wsr %0, scompare1\n"
43 "1: movi %0, 1\n"
44 " s32c1i %0, %1, 0\n"
45 " bnez %0, 1b\n"
46 : "=&a" (tmp)
47 : "a" (&lock->slock)
48 : "memory");
49}
50
51/* Returns 1 if the lock is obtained, 0 otherwise. */
52
53static inline int arch_spin_trylock(arch_spinlock_t *lock)
54{
55 unsigned long tmp;
56
57 __asm__ __volatile__(
58 " movi %0, 0\n"
59 " wsr %0, scompare1\n"
60 " movi %0, 1\n"
61 " s32c1i %0, %1, 0\n"
62 : "=&a" (tmp)
63 : "a" (&lock->slock)
64 : "memory");
65
66 return tmp == 0 ? 1 : 0;
67}
68
69static inline void arch_spin_unlock(arch_spinlock_t *lock)
70{
71 unsigned long tmp;
72
73 __asm__ __volatile__(
74 " movi %0, 0\n"
75 " s32ri %0, %1, 0\n"
76 : "=&a" (tmp)
77 : "a" (&lock->slock)
78 : "memory");
79}
80
81/*
82 * rwlock
83 *
84 * Read-write locks are really a more flexible spinlock. They allow
85 * multiple readers but only one writer. Write ownership is exclusive
86 * (i.e., all other readers and writers are blocked from ownership while
87 * there is a write owner). These rwlocks are unfair to writers. Writers
88 * can be starved for an indefinite time by readers.
89 *
90 * possible values:
91 *
92 * 0 nobody owns the rwlock
93 * >0 one or more readers own the rwlock
94 * (the positive value is the actual number of readers)
95 * 0x80000000 one writer owns the rwlock, no other writers, no readers
96 */
97
98static inline void arch_write_lock(arch_rwlock_t *rw)
99{
100 unsigned long tmp;
101
102 __asm__ __volatile__(
103 " movi %0, 0\n"
104 " wsr %0, scompare1\n"
105 "1: movi %0, 1\n"
106 " slli %0, %0, 31\n"
107 " s32c1i %0, %1, 0\n"
108 " bnez %0, 1b\n"
109 : "=&a" (tmp)
110 : "a" (&rw->lock)
111 : "memory");
112}
113
114/* Returns 1 if the lock is obtained, 0 otherwise. */
115
116static inline int arch_write_trylock(arch_rwlock_t *rw)
117{
118 unsigned long tmp;
119
120 __asm__ __volatile__(
121 " movi %0, 0\n"
122 " wsr %0, scompare1\n"
123 " movi %0, 1\n"
124 " slli %0, %0, 31\n"
125 " s32c1i %0, %1, 0\n"
126 : "=&a" (tmp)
127 : "a" (&rw->lock)
128 : "memory");
129
130 return tmp == 0 ? 1 : 0;
131}
132
133static inline void arch_write_unlock(arch_rwlock_t *rw)
134{
135 unsigned long tmp;
136
137 __asm__ __volatile__(
138 " movi %0, 0\n"
139 " s32ri %0, %1, 0\n"
140 : "=&a" (tmp)
141 : "a" (&rw->lock)
142 : "memory");
143}
144
145static inline void arch_read_lock(arch_rwlock_t *rw)
146{
147 unsigned long tmp;
148 unsigned long result;
149
150 __asm__ __volatile__(
151 "1: l32i %1, %2, 0\n"
152 " bltz %1, 1b\n"
153 " wsr %1, scompare1\n"
154 " addi %0, %1, 1\n"
155 " s32c1i %0, %2, 0\n"
156 " bne %0, %1, 1b\n"
157 : "=&a" (result), "=&a" (tmp)
158 : "a" (&rw->lock)
159 : "memory");
160}
161
162/* Returns 1 if the lock is obtained, 0 otherwise. */
163
164static inline int arch_read_trylock(arch_rwlock_t *rw)
165{
166 unsigned long result;
167 unsigned long tmp;
168
169 __asm__ __volatile__(
170 " l32i %1, %2, 0\n"
171 " addi %0, %1, 1\n"
172 " bltz %0, 1f\n"
173 " wsr %1, scompare1\n"
174 " s32c1i %0, %2, 0\n"
175 " sub %0, %0, %1\n"
176 "1:\n"
177 : "=&a" (result), "=&a" (tmp)
178 : "a" (&rw->lock)
179 : "memory");
180
181 return result == 0;
182}
183
184static inline void arch_read_unlock(arch_rwlock_t *rw)
185{
186 unsigned long tmp1, tmp2;
187
188 __asm__ __volatile__(
189 "1: l32i %1, %2, 0\n"
190 " addi %0, %1, -1\n"
191 " wsr %1, scompare1\n"
192 " s32c1i %0, %2, 0\n"
193 " bne %0, %1, 1b\n"
194 : "=&a" (tmp1), "=&a" (tmp2)
195 : "a" (&rw->lock)
196 : "memory");
197}
198 19
199#endif /* _XTENSA_SPINLOCK_H */ 20#endif /* _XTENSA_SPINLOCK_H */
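This drops roughly 180 lines of hand-written s32c1i lock loops in favor of the generic queued spinlocks and rwlocks enabled by the ARCH_USE_QUEUED_* selects earlier in the series. The deleted arch_spin_lock() was functionally a test-and-set lock, where every waiter spins on the same cache line; qspinlock queues waiters instead. A C11 analogue of what the removed assembly did, for comparison (illustrative, not the kernel API):

    #include <stdatomic.h>

    typedef struct { atomic_uint slock; } arch_spinlock_t;

    static void arch_spin_lock(arch_spinlock_t *l)
    {
            unsigned int expected;

            do {                         /* spin until 0 -> 1 succeeds */
                    expected = 0;
            } while (!atomic_compare_exchange_weak_explicit(
                            &l->slock, &expected, 1,
                            memory_order_acquire, memory_order_relaxed));
    }

    static int arch_spin_trylock(arch_spinlock_t *l)
    {
            unsigned int expected = 0;   /* one attempt, no spinning */

            return atomic_compare_exchange_strong_explicit(
                            &l->slock, &expected, 1,
                            memory_order_acquire, memory_order_relaxed);
    }

    static void arch_spin_unlock(arch_spinlock_t *l)
    {
            atomic_store_explicit(&l->slock, 0, memory_order_release);
    }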
diff --git a/arch/xtensa/include/asm/spinlock_types.h b/arch/xtensa/include/asm/spinlock_types.h
index bb1fe6c1816e..64c9389254f1 100644
--- a/arch/xtensa/include/asm/spinlock_types.h
+++ b/arch/xtensa/include/asm/spinlock_types.h
@@ -2,20 +2,11 @@
2#ifndef __ASM_SPINLOCK_TYPES_H 2#ifndef __ASM_SPINLOCK_TYPES_H
3#define __ASM_SPINLOCK_TYPES_H 3#define __ASM_SPINLOCK_TYPES_H
4 4
5#ifndef __LINUX_SPINLOCK_TYPES_H 5#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__ASM_SPINLOCK_H)
6# error "please don't include this file directly" 6# error "please don't include this file directly"
7#endif 7#endif
8 8
9typedef struct { 9#include <asm-generic/qspinlock_types.h>
10 volatile unsigned int slock; 10#include <asm-generic/qrwlock_types.h>
11} arch_spinlock_t;
12
13#define __ARCH_SPIN_LOCK_UNLOCKED { 0 }
14
15typedef struct {
16 volatile unsigned int lock;
17} arch_rwlock_t;
18
19#define __ARCH_RW_LOCK_UNLOCKED { 0 }
20 11
21#endif 12#endif
diff --git a/arch/xtensa/include/asm/thread_info.h b/arch/xtensa/include/asm/thread_info.h
index f333f10a7650..f092cc3f4e66 100644
--- a/arch/xtensa/include/asm/thread_info.h
+++ b/arch/xtensa/include/asm/thread_info.h
@@ -121,15 +121,6 @@ static inline struct thread_info *current_thread_info(void)
121#define _TIF_WORK_MASK (_TIF_SYSCALL_TRACE | _TIF_SINGLESTEP | \ 121#define _TIF_WORK_MASK (_TIF_SYSCALL_TRACE | _TIF_SINGLESTEP | \
122 _TIF_SYSCALL_TRACEPOINT) 122 _TIF_SYSCALL_TRACEPOINT)
123 123
124/*
125 * Thread-synchronous status.
126 *
127 * This is different from the flags in that nobody else
128 * ever touches our thread-synchronous status, so we don't
129 * have to worry about atomic accesses.
130 */
131#define TS_USEDFPU 0x0001 /* FPU was used by this task this quantum (SMP) */
132
133#define THREAD_SIZE KERNEL_STACK_SIZE 124#define THREAD_SIZE KERNEL_STACK_SIZE
134#define THREAD_SIZE_ORDER (KERNEL_STACK_SHIFT - PAGE_SHIFT) 125#define THREAD_SIZE_ORDER (KERNEL_STACK_SHIFT - PAGE_SHIFT)
135 126
diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c
index 74969a437a37..db278a9e80c7 100644
--- a/arch/xtensa/kernel/process.c
+++ b/arch/xtensa/kernel/process.c
@@ -52,8 +52,6 @@
52extern void ret_from_fork(void); 52extern void ret_from_fork(void);
53extern void ret_from_kernel_thread(void); 53extern void ret_from_kernel_thread(void);
54 54
55struct task_struct *current_set[NR_CPUS] = {&init_task, };
56
57void (*pm_power_off)(void) = NULL; 55void (*pm_power_off)(void) = NULL;
58EXPORT_SYMBOL(pm_power_off); 56EXPORT_SYMBOL(pm_power_off);
59 57
@@ -321,8 +319,8 @@ unsigned long get_wchan(struct task_struct *p)
321 319
322 /* Stack layout: sp-4: ra, sp-3: sp' */ 320 /* Stack layout: sp-4: ra, sp-3: sp' */
323 321
324 pc = MAKE_PC_FROM_RA(*(unsigned long*)sp - 4, sp); 322 pc = MAKE_PC_FROM_RA(SPILL_SLOT(sp, 0), sp);
325 sp = *(unsigned long *)sp - 3; 323 sp = SPILL_SLOT(sp, 1);
326 } while (count++ < 16); 324 } while (count++ < 16);
327 return 0; 325 return 0;
328} 326}
diff --git a/arch/xtensa/kernel/smp.c b/arch/xtensa/kernel/smp.c
index be1f280c322c..3699d6d3e479 100644
--- a/arch/xtensa/kernel/smp.c
+++ b/arch/xtensa/kernel/smp.c
@@ -372,8 +372,7 @@ static void send_ipi_message(const struct cpumask *callmask,
372 unsigned long mask = 0; 372 unsigned long mask = 0;
373 373
374 for_each_cpu(index, callmask) 374 for_each_cpu(index, callmask)
375 if (index != smp_processor_id()) 375 mask |= 1 << index;
376 mask |= 1 << index;
377 376
378 set_er(mask, MIPISET(msg_id)); 377 set_er(mask, MIPISET(msg_id));
379} 378}
@@ -412,22 +411,31 @@ irqreturn_t ipi_interrupt(int irq, void *dev_id)
412{ 411{
413 unsigned int cpu = smp_processor_id(); 412 unsigned int cpu = smp_processor_id();
414 struct ipi_data *ipi = &per_cpu(ipi_data, cpu); 413 struct ipi_data *ipi = &per_cpu(ipi_data, cpu);
415 unsigned int msg;
416 unsigned i;
417 414
418 msg = get_er(MIPICAUSE(cpu)); 415 for (;;) {
419 for (i = 0; i < IPI_MAX; i++) 416 unsigned int msg;
420 if (msg & (1 << i)) { 417
421 set_er(1 << i, MIPICAUSE(cpu)); 418 msg = get_er(MIPICAUSE(cpu));
422 ++ipi->ipi_count[i]; 419 set_er(msg, MIPICAUSE(cpu));
420
421 if (!msg)
422 break;
423
424 if (msg & (1 << IPI_CALL_FUNC)) {
425 ++ipi->ipi_count[IPI_CALL_FUNC];
426 generic_smp_call_function_interrupt();
423 } 427 }
424 428
425 if (msg & (1 << IPI_RESCHEDULE)) 429 if (msg & (1 << IPI_RESCHEDULE)) {
426 scheduler_ipi(); 430 ++ipi->ipi_count[IPI_RESCHEDULE];
427 if (msg & (1 << IPI_CALL_FUNC)) 431 scheduler_ipi();
428 generic_smp_call_function_interrupt(); 432 }
429 if (msg & (1 << IPI_CPU_STOP)) 433
430 ipi_cpu_stop(cpu); 434 if (msg & (1 << IPI_CPU_STOP)) {
435 ++ipi->ipi_count[IPI_CPU_STOP];
436 ipi_cpu_stop(cpu);
437 }
438 }
431 439
432 return IRQ_HANDLED; 440 return IRQ_HANDLED;
433} 441}
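The rework above changes ipi_interrupt() from a single read of the cause register into a read-acknowledge-dispatch loop, so a message raised while a handler runs is picked up on the next iteration instead of being lost; it also starts counting RESCHEDULE and CPU_STOP IPIs. A userspace model of the loop, with an atomic word standing in for the MIPICAUSE register (names are illustrative):

    #include <stdatomic.h>
    #include <stdio.h>

    enum { IPI_RESCHEDULE, IPI_CALL_FUNC, IPI_CPU_STOP };

    static _Atomic unsigned int mipicause;   /* models the cause register */

    static void ipi_interrupt(void)
    {
            for (;;) {
                    /* Fetch and clear the pending mask in one step. */
                    unsigned int msg = atomic_exchange(&mipicause, 0);

                    if (!msg)
                            break;           /* nothing new arrived */

                    if (msg & (1u << IPI_CALL_FUNC))
                            puts("call-function IPI");
                    if (msg & (1u << IPI_RESCHEDULE))
                            puts("reschedule IPI");
                    if (msg & (1u << IPI_CPU_STOP))
                            puts("cpu-stop IPI");
            }
    }

    int main(void)
    {
            atomic_fetch_or(&mipicause, 1u << IPI_CALL_FUNC);
            ipi_interrupt();
            return 0;
    }

atomic_exchange() models the kernel's get_er()/set_er() read-then-acknowledge pair; the loop exits only once a fetch returns an empty mask.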
diff --git a/arch/xtensa/kernel/time.c b/arch/xtensa/kernel/time.c
index 378186b5eb40..69db8c93c1f9 100644
--- a/arch/xtensa/kernel/time.c
+++ b/arch/xtensa/kernel/time.c
@@ -52,14 +52,11 @@ static struct clocksource ccount_clocksource = {
52 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 52 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
53}; 53};
54 54
55static int ccount_timer_set_next_event(unsigned long delta,
56 struct clock_event_device *dev);
57struct ccount_timer { 55struct ccount_timer {
58 struct clock_event_device evt; 56 struct clock_event_device evt;
59 int irq_enabled; 57 int irq_enabled;
60 char name[24]; 58 char name[24];
61}; 59};
62static DEFINE_PER_CPU(struct ccount_timer, ccount_timer);
63 60
64static int ccount_timer_set_next_event(unsigned long delta, 61static int ccount_timer_set_next_event(unsigned long delta,
65 struct clock_event_device *dev) 62 struct clock_event_device *dev)
@@ -107,7 +104,30 @@ static int ccount_timer_set_oneshot(struct clock_event_device *evt)
107 return 0; 104 return 0;
108} 105}
109 106
110static irqreturn_t timer_interrupt(int irq, void *dev_id); 107static DEFINE_PER_CPU(struct ccount_timer, ccount_timer) = {
108 .evt = {
109 .features = CLOCK_EVT_FEAT_ONESHOT,
110 .rating = 300,
111 .set_next_event = ccount_timer_set_next_event,
112 .set_state_shutdown = ccount_timer_shutdown,
113 .set_state_oneshot = ccount_timer_set_oneshot,
114 .tick_resume = ccount_timer_set_oneshot,
115 },
116};
117
118static irqreturn_t timer_interrupt(int irq, void *dev_id)
119{
120 struct clock_event_device *evt = &this_cpu_ptr(&ccount_timer)->evt;
121
122 set_linux_timer(get_linux_timer());
123 evt->event_handler(evt);
124
125 /* Allow platform to do something useful (Wdog). */
126 platform_heartbeat();
127
128 return IRQ_HANDLED;
129}
130
111static struct irqaction timer_irqaction = { 131static struct irqaction timer_irqaction = {
112 .handler = timer_interrupt, 132 .handler = timer_interrupt,
113 .flags = IRQF_TIMER, 133 .flags = IRQF_TIMER,
@@ -120,14 +140,8 @@ void local_timer_setup(unsigned cpu)
120 struct clock_event_device *clockevent = &timer->evt; 140 struct clock_event_device *clockevent = &timer->evt;
121 141
122 timer->irq_enabled = 1; 142 timer->irq_enabled = 1;
123 clockevent->name = timer->name;
124 snprintf(timer->name, sizeof(timer->name), "ccount_clockevent_%u", cpu); 143 snprintf(timer->name, sizeof(timer->name), "ccount_clockevent_%u", cpu);
125 clockevent->features = CLOCK_EVT_FEAT_ONESHOT; 144 clockevent->name = timer->name;
126 clockevent->rating = 300;
127 clockevent->set_next_event = ccount_timer_set_next_event;
128 clockevent->set_state_shutdown = ccount_timer_shutdown;
129 clockevent->set_state_oneshot = ccount_timer_set_oneshot;
130 clockevent->tick_resume = ccount_timer_set_oneshot;
131 clockevent->cpumask = cpumask_of(cpu); 145 clockevent->cpumask = cpumask_of(cpu);
132 clockevent->irq = irq_create_mapping(NULL, LINUX_TIMER_INT); 146 clockevent->irq = irq_create_mapping(NULL, LINUX_TIMER_INT);
133 if (WARN(!clockevent->irq, "error: can't map timer irq")) 147 if (WARN(!clockevent->irq, "error: can't map timer irq"))
@@ -190,23 +204,6 @@ void __init time_init(void)
190 timer_probe(); 204 timer_probe();
191} 205}
192 206
193/*
194 * The timer interrupt is called HZ times per second.
195 */
196
197irqreturn_t timer_interrupt(int irq, void *dev_id)
198{
199 struct clock_event_device *evt = &this_cpu_ptr(&ccount_timer)->evt;
200
201 set_linux_timer(get_linux_timer());
202 evt->event_handler(evt);
203
204 /* Allow platform to do something useful (Wdog). */
205 platform_heartbeat();
206
207 return IRQ_HANDLED;
208}
209
210#ifndef CONFIG_GENERIC_CALIBRATE_DELAY 207#ifndef CONFIG_GENERIC_CALIBRATE_DELAY
211void calibrate_delay(void) 208void calibrate_delay(void)
212{ 209{
diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c
index e6fa55aa1ccb..454d53096bc9 100644
--- a/arch/xtensa/kernel/traps.c
+++ b/arch/xtensa/kernel/traps.c
@@ -420,16 +420,15 @@ void __init trap_init(void)
420 /* Setup specific handlers. */ 420 /* Setup specific handlers. */
421 421
422 for(i = 0; dispatch_init_table[i].cause >= 0; i++) { 422 for(i = 0; dispatch_init_table[i].cause >= 0; i++) {
423
424 int fast = dispatch_init_table[i].fast; 423 int fast = dispatch_init_table[i].fast;
425 int cause = dispatch_init_table[i].cause; 424 int cause = dispatch_init_table[i].cause;
426 void *handler = dispatch_init_table[i].handler; 425 void *handler = dispatch_init_table[i].handler;
427 426
428 if (fast == 0) 427 if (fast == 0)
429 set_handler(default_handler, cause, handler); 428 set_handler(default_handler, cause, handler);
430 if (fast && fast & USER) 429 if ((fast & USER) != 0)
431 set_handler(fast_user_handler, cause, handler); 430 set_handler(fast_user_handler, cause, handler);
432 if (fast && fast & KRNL) 431 if ((fast & KRNL) != 0)
433 set_handler(fast_kernel_handler, cause, handler); 432 set_handler(fast_kernel_handler, cause, handler);
434 } 433 }
435 434
diff --git a/drivers/misc/cxl/guest.c b/drivers/misc/cxl/guest.c
index 5d28d9e454f5..08f4a512afad 100644
--- a/drivers/misc/cxl/guest.c
+++ b/drivers/misc/cxl/guest.c
@@ -267,6 +267,7 @@ static int guest_reset(struct cxl *adapter)
267 int i, rc; 267 int i, rc;
268 268
269 pr_devel("Adapter reset request\n"); 269 pr_devel("Adapter reset request\n");
270 spin_lock(&adapter->afu_list_lock);
270 for (i = 0; i < adapter->slices; i++) { 271 for (i = 0; i < adapter->slices; i++) {
271 if ((afu = adapter->afu[i])) { 272 if ((afu = adapter->afu[i])) {
272 pci_error_handlers(afu, CXL_ERROR_DETECTED_EVENT, 273 pci_error_handlers(afu, CXL_ERROR_DETECTED_EVENT,
@@ -283,6 +284,7 @@ static int guest_reset(struct cxl *adapter)
283 pci_error_handlers(afu, CXL_RESUME_EVENT, 0); 284 pci_error_handlers(afu, CXL_RESUME_EVENT, 0);
284 } 285 }
285 } 286 }
287 spin_unlock(&adapter->afu_list_lock);
286 return rc; 288 return rc;
287} 289}
288 290
diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index c79ba1c699ad..300531d6136f 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -1805,7 +1805,7 @@ static pci_ers_result_t cxl_vphb_error_detected(struct cxl_afu *afu,
1805 /* There should only be one entry, but go through the list 1805 /* There should only be one entry, but go through the list
1806 * anyway 1806 * anyway
1807 */ 1807 */
1808 if (afu->phb == NULL) 1808 if (afu == NULL || afu->phb == NULL)
1809 return result; 1809 return result;
1810 1810
1811 list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) { 1811 list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) {
@@ -1832,7 +1832,8 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
1832{ 1832{
1833 struct cxl *adapter = pci_get_drvdata(pdev); 1833 struct cxl *adapter = pci_get_drvdata(pdev);
1834 struct cxl_afu *afu; 1834 struct cxl_afu *afu;
1835 pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET, afu_result; 1835 pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET;
1836 pci_ers_result_t afu_result = PCI_ERS_RESULT_NEED_RESET;
1836 int i; 1837 int i;
1837 1838
1838 /* At this point, we could still have an interrupt pending. 1839 /* At this point, we could still have an interrupt pending.
@@ -1843,6 +1844,7 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
1843 1844
1844 /* If we're permanently dead, give up. */ 1845 /* If we're permanently dead, give up. */
1845 if (state == pci_channel_io_perm_failure) { 1846 if (state == pci_channel_io_perm_failure) {
1847 spin_lock(&adapter->afu_list_lock);
1846 for (i = 0; i < adapter->slices; i++) { 1848 for (i = 0; i < adapter->slices; i++) {
1847 afu = adapter->afu[i]; 1849 afu = adapter->afu[i];
1848 /* 1850 /*
@@ -1851,6 +1853,7 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
1851 */ 1853 */
1852 cxl_vphb_error_detected(afu, state); 1854 cxl_vphb_error_detected(afu, state);
1853 } 1855 }
1856 spin_unlock(&adapter->afu_list_lock);
1854 return PCI_ERS_RESULT_DISCONNECT; 1857 return PCI_ERS_RESULT_DISCONNECT;
1855 } 1858 }
1856 1859
@@ -1932,11 +1935,17 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
1932 * * In slot_reset, free the old resources and allocate new ones. 1935 * * In slot_reset, free the old resources and allocate new ones.
1933 * * In resume, clear the flag to allow things to start. 1936 * * In resume, clear the flag to allow things to start.
1934 */ 1937 */
1938
1939 /* Make sure no one else changes the afu list */
1940 spin_lock(&adapter->afu_list_lock);
1941
1935 for (i = 0; i < adapter->slices; i++) { 1942 for (i = 0; i < adapter->slices; i++) {
1936 afu = adapter->afu[i]; 1943 afu = adapter->afu[i];
1937 1944
1938 afu_result = cxl_vphb_error_detected(afu, state); 1945 if (afu == NULL)
1946 continue;
1939 1947
1948 afu_result = cxl_vphb_error_detected(afu, state);
1940 cxl_context_detach_all(afu); 1949 cxl_context_detach_all(afu);
1941 cxl_ops->afu_deactivate_mode(afu, afu->current_mode); 1950 cxl_ops->afu_deactivate_mode(afu, afu->current_mode);
1942 pci_deconfigure_afu(afu); 1951 pci_deconfigure_afu(afu);
@@ -1948,6 +1957,7 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
1948 (result == PCI_ERS_RESULT_NEED_RESET)) 1957 (result == PCI_ERS_RESULT_NEED_RESET))
1949 result = PCI_ERS_RESULT_NONE; 1958 result = PCI_ERS_RESULT_NONE;
1950 } 1959 }
1960 spin_unlock(&adapter->afu_list_lock);
1951 1961
1952 /* should take the context lock here */ 1962 /* should take the context lock here */
1953 if (cxl_adapter_context_lock(adapter) != 0) 1963 if (cxl_adapter_context_lock(adapter) != 0)
@@ -1980,14 +1990,18 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev)
1980 */ 1990 */
1981 cxl_adapter_context_unlock(adapter); 1991 cxl_adapter_context_unlock(adapter);
1982 1992
1993 spin_lock(&adapter->afu_list_lock);
1983 for (i = 0; i < adapter->slices; i++) { 1994 for (i = 0; i < adapter->slices; i++) {
1984 afu = adapter->afu[i]; 1995 afu = adapter->afu[i];
1985 1996
1997 if (afu == NULL)
1998 continue;
1999
1986 if (pci_configure_afu(afu, adapter, pdev)) 2000 if (pci_configure_afu(afu, adapter, pdev))
1987 goto err; 2001 goto err_unlock;
1988 2002
1989 if (cxl_afu_select_best_mode(afu)) 2003 if (cxl_afu_select_best_mode(afu))
1990 goto err; 2004 goto err_unlock;
1991 2005
1992 if (afu->phb == NULL) 2006 if (afu->phb == NULL)
1993 continue; 2007 continue;
@@ -1999,16 +2013,16 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev)
1999 ctx = cxl_get_context(afu_dev); 2013 ctx = cxl_get_context(afu_dev);
2000 2014
2001 if (ctx && cxl_release_context(ctx)) 2015 if (ctx && cxl_release_context(ctx))
2002 goto err; 2016 goto err_unlock;
2003 2017
2004 ctx = cxl_dev_context_init(afu_dev); 2018 ctx = cxl_dev_context_init(afu_dev);
2005 if (IS_ERR(ctx)) 2019 if (IS_ERR(ctx))
2006 goto err; 2020 goto err_unlock;
2007 2021
2008 afu_dev->dev.archdata.cxl_ctx = ctx; 2022 afu_dev->dev.archdata.cxl_ctx = ctx;
2009 2023
2010 if (cxl_ops->afu_check_and_enable(afu)) 2024 if (cxl_ops->afu_check_and_enable(afu))
2011 goto err; 2025 goto err_unlock;
2012 2026
2013 afu_dev->error_state = pci_channel_io_normal; 2027 afu_dev->error_state = pci_channel_io_normal;
2014 2028
@@ -2029,8 +2043,13 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev)
2029 result = PCI_ERS_RESULT_DISCONNECT; 2043 result = PCI_ERS_RESULT_DISCONNECT;
2030 } 2044 }
2031 } 2045 }
2046
2047 spin_unlock(&adapter->afu_list_lock);
2032 return result; 2048 return result;
2033 2049
2050err_unlock:
2051 spin_unlock(&adapter->afu_list_lock);
2052
2034err: 2053err:
2035 /* All the bits that happen in both error_detected and cxl_remove 2054 /* All the bits that happen in both error_detected and cxl_remove
2036 * should be idempotent, so we don't need to worry about leaving a mix 2055 * should be idempotent, so we don't need to worry about leaving a mix
@@ -2051,10 +2070,11 @@ static void cxl_pci_resume(struct pci_dev *pdev)
2051 * This is not the place to be checking if everything came back up 2070 * This is not the place to be checking if everything came back up
2052 * properly, because there's no return value: do that in slot_reset. 2071 * properly, because there's no return value: do that in slot_reset.
2053 */ 2072 */
2073 spin_lock(&adapter->afu_list_lock);
2054 for (i = 0; i < adapter->slices; i++) { 2074 for (i = 0; i < adapter->slices; i++) {
2055 afu = adapter->afu[i]; 2075 afu = adapter->afu[i];
2056 2076
2057 if (afu->phb == NULL) 2077 if (afu == NULL || afu->phb == NULL)
2058 continue; 2078 continue;
2059 2079
2060 list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) { 2080 list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) {
@@ -2063,6 +2083,7 @@ static void cxl_pci_resume(struct pci_dev *pdev)
2063 afu_dev->driver->err_handler->resume(afu_dev); 2083 afu_dev->driver->err_handler->resume(afu_dev);
2064 } 2084 }
2065 } 2085 }
2086 spin_unlock(&adapter->afu_list_lock);
2066} 2087}
2067 2088
2068static const struct pci_error_handlers cxl_err_handler = { 2089static const struct pci_error_handlers cxl_err_handler = {
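The common thread in these cxl hunks: adapter->afu[] slots may be NULL or be torn down concurrently with the PCI error handlers, so every traversal now takes afu_list_lock and skips empty slots, and the early exits in slot_reset go through a new err_unlock label so the lock is never leaked. A small pthread sketch of the traversal pattern (names are illustrative, not the cxl API):

    #include <pthread.h>
    #include <stddef.h>

    struct afu;                              /* opaque per-slice object */

    struct adapter {
            pthread_mutex_t afu_list_lock;   /* guards the afu[] slots  */
            int slices;
            struct afu *afu[8];
    };

    static void for_each_afu(struct adapter *a, void (*fn)(struct afu *))
    {
            pthread_mutex_lock(&a->afu_list_lock);
            for (int i = 0; i < a->slices; i++) {
                    if (a->afu[i] == NULL)   /* slice not configured */
                            continue;
                    fn(a->afu[i]);
            }
            pthread_mutex_unlock(&a->afu_list_lock);
    }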
diff --git a/drivers/misc/cxl/vphb.c b/drivers/misc/cxl/vphb.c
index 49da2f744bbf..631c5df246d4 100644
--- a/drivers/misc/cxl/vphb.c
+++ b/drivers/misc/cxl/vphb.c
@@ -43,8 +43,7 @@ static bool cxl_pci_enable_device_hook(struct pci_dev *dev)
43 return false; 43 return false;
44 } 44 }
45 45
46 set_dma_ops(&dev->dev, &dma_nommu_ops); 46 dev->dev.archdata.dma_offset = PAGE_OFFSET;
47 set_dma_offset(&dev->dev, PAGE_OFFSET);
48 47
49 /* 48 /*
50 * Allocate a context to do cxl things too. If we eventually do real 49 * Allocate a context to do cxl things too. If we eventually do real
diff --git a/drivers/net/ethernet/pasemi/pasemi_mac.c b/drivers/net/ethernet/pasemi/pasemi_mac.c
index d21041554507..a5bf46310f60 100644
--- a/drivers/net/ethernet/pasemi/pasemi_mac.c
+++ b/drivers/net/ethernet/pasemi/pasemi_mac.c
@@ -1716,6 +1716,7 @@ pasemi_mac_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
1716 err = -ENODEV; 1716 err = -ENODEV;
1717 goto out; 1717 goto out;
1718 } 1718 }
1719 dma_set_mask(&mac->dma_pdev->dev, DMA_BIT_MASK(64));
1719 1720
1720 mac->iob_pdev = pci_get_device(PCI_VENDOR_ID_PASEMI, 0xa001, NULL); 1721 mac->iob_pdev = pci_get_device(PCI_VENDOR_ID_PASEMI, 0xa001, NULL);
1721 if (!mac->iob_pdev) { 1722 if (!mac->iob_pdev) {
diff --git a/drivers/tty/tty_audit.c b/drivers/tty/tty_audit.c
index 28f87fd6a28e..9f906a5b8e81 100644
--- a/drivers/tty/tty_audit.c
+++ b/drivers/tty/tty_audit.c
@@ -66,7 +66,7 @@ static void tty_audit_log(const char *description, dev_t dev,
66 uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current)); 66 uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current));
67 unsigned int sessionid = audit_get_sessionid(current); 67 unsigned int sessionid = audit_get_sessionid(current);
68 68
69 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_TTY); 69 ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_TTY);
70 if (ab) { 70 if (ab) {
71 char name[sizeof(current->comm)]; 71 char name[sizeof(current->comm)];
72 72
diff --git a/drivers/vfio/vfio_spapr_eeh.c b/drivers/vfio/vfio_spapr_eeh.c
index 38edeb4729a9..1a742fe8f6db 100644
--- a/drivers/vfio/vfio_spapr_eeh.c
+++ b/drivers/vfio/vfio_spapr_eeh.c
@@ -74,13 +74,13 @@ long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
74 ret = eeh_pe_get_state(pe); 74 ret = eeh_pe_get_state(pe);
75 break; 75 break;
76 case VFIO_EEH_PE_RESET_DEACTIVATE: 76 case VFIO_EEH_PE_RESET_DEACTIVATE:
77 ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE); 77 ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE, true);
78 break; 78 break;
79 case VFIO_EEH_PE_RESET_HOT: 79 case VFIO_EEH_PE_RESET_HOT:
80 ret = eeh_pe_reset(pe, EEH_RESET_HOT); 80 ret = eeh_pe_reset(pe, EEH_RESET_HOT, true);
81 break; 81 break;
82 case VFIO_EEH_PE_RESET_FUNDAMENTAL: 82 case VFIO_EEH_PE_RESET_FUNDAMENTAL:
83 ret = eeh_pe_reset(pe, EEH_RESET_FUNDAMENTAL); 83 ret = eeh_pe_reset(pe, EEH_RESET_FUNDAMENTAL, true);
84 break; 84 break;
85 case VFIO_EEH_PE_CONFIGURE: 85 case VFIO_EEH_PE_CONFIGURE:
86 ret = eeh_pe_configure(pe); 86 ret = eeh_pe_configure(pe);
diff --git a/fs/Makefile b/fs/Makefile
index 293733f61594..23fcd8c164a3 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -12,7 +12,8 @@ obj-y := open.o read_write.o file_table.o super.o \
12 attr.o bad_inode.o file.o filesystems.o namespace.o \ 12 attr.o bad_inode.o file.o filesystems.o namespace.o \
13 seq_file.o xattr.o libfs.o fs-writeback.o \ 13 seq_file.o xattr.o libfs.o fs-writeback.o \
14 pnode.o splice.o sync.o utimes.o d_path.o \ 14 pnode.o splice.o sync.o utimes.o d_path.o \
15 stack.o fs_struct.o statfs.o fs_pin.o nsfs.o 15 stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
16 fs_types.o
16 17
17ifeq ($(CONFIG_BLOCK),y) 18ifeq ($(CONFIG_BLOCK),y)
18obj-y += buffer.o block_dev.o direct-io.o mpage.o 19obj-y += buffer.o block_dev.o direct-io.o mpage.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 3b66c957ea6f..5810463dc6d2 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -9,6 +9,7 @@
9#include <linux/posix_acl_xattr.h> 9#include <linux/posix_acl_xattr.h>
10#include <linux/posix_acl.h> 10#include <linux/posix_acl.h>
11#include <linux/sched.h> 11#include <linux/sched.h>
12#include <linux/sched/mm.h>
12#include <linux/slab.h> 13#include <linux/slab.h>
13 14
14#include "ctree.h" 15#include "ctree.h"
@@ -72,8 +73,16 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
72 } 73 }
73 74
74 if (acl) { 75 if (acl) {
76 unsigned int nofs_flag;
77
75 size = posix_acl_xattr_size(acl->a_count); 78 size = posix_acl_xattr_size(acl->a_count);
79 /*
80 * We're holding a transaction handle, so use a NOFS memory
81 * allocation context to avoid deadlock if reclaim happens.
82 */
83 nofs_flag = memalloc_nofs_save();
76 value = kmalloc(size, GFP_KERNEL); 84 value = kmalloc(size, GFP_KERNEL);
85 memalloc_nofs_restore(nofs_flag);
77 if (!value) { 86 if (!value) {
78 ret = -ENOMEM; 87 ret = -ENOMEM;
79 goto out; 88 goto out;
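Context for the memalloc_nofs_save()/memalloc_nofs_restore() pair added above: with a btrfs transaction handle held, a GFP_KERNEL allocation that dips into filesystem reclaim could re-enter btrfs and deadlock, so the task is temporarily marked NOFS and GFP_KERNEL allocations inside the window behave like GFP_NOFS. The flag that save returns exists so scopes nest correctly; a kernel-style sketch of the nesting, not standalone code:

    unsigned int outer, inner;

    outer = memalloc_nofs_save();     /* enter NOFS scope                */
    /* ... GFP_KERNEL allocations here are implicitly NOFS ... */
    inner = memalloc_nofs_save();     /* nested scope is harmless        */
    /* ... */
    memalloc_nofs_restore(inner);     /* still NOFS: outer scope is open */
    memalloc_nofs_restore(outer);     /* back to ordinary GFP_KERNEL     */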
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index d522494698fa..122cb97c7909 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -139,13 +139,11 @@ __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
139 } 139 }
140 140
141 if (flags & WQ_HIGHPRI) 141 if (flags & WQ_HIGHPRI)
142 ret->normal_wq = alloc_workqueue("%s-%s-high", flags, 142 ret->normal_wq = alloc_workqueue("btrfs-%s-high", flags,
143 ret->current_active, "btrfs", 143 ret->current_active, name);
144 name);
145 else 144 else
146 ret->normal_wq = alloc_workqueue("%s-%s", flags, 145 ret->normal_wq = alloc_workqueue("btrfs-%s", flags,
147 ret->current_active, "btrfs", 146 ret->current_active, name);
148 name);
149 if (!ret->normal_wq) { 147 if (!ret->normal_wq) {
150 kfree(ret); 148 kfree(ret);
151 return NULL; 149 return NULL;
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 78556447e1d5..11459fe84a29 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -712,7 +712,7 @@ out:
712 * read tree blocks and add keys where required. 712 * read tree blocks and add keys where required.
713 */ 713 */
714static int add_missing_keys(struct btrfs_fs_info *fs_info, 714static int add_missing_keys(struct btrfs_fs_info *fs_info,
715 struct preftrees *preftrees) 715 struct preftrees *preftrees, bool lock)
716{ 716{
717 struct prelim_ref *ref; 717 struct prelim_ref *ref;
718 struct extent_buffer *eb; 718 struct extent_buffer *eb;
@@ -737,12 +737,14 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
737 free_extent_buffer(eb); 737 free_extent_buffer(eb);
738 return -EIO; 738 return -EIO;
739 } 739 }
740 btrfs_tree_read_lock(eb); 740 if (lock)
741 btrfs_tree_read_lock(eb);
741 if (btrfs_header_level(eb) == 0) 742 if (btrfs_header_level(eb) == 0)
742 btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0); 743 btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0);
743 else 744 else
744 btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0); 745 btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
745 btrfs_tree_read_unlock(eb); 746 if (lock)
747 btrfs_tree_read_unlock(eb);
746 free_extent_buffer(eb); 748 free_extent_buffer(eb);
747 prelim_ref_insert(fs_info, &preftrees->indirect, ref, NULL); 749 prelim_ref_insert(fs_info, &preftrees->indirect, ref, NULL);
748 cond_resched(); 750 cond_resched();
@@ -1227,7 +1229,7 @@ again:
1227 1229
1228 btrfs_release_path(path); 1230 btrfs_release_path(path);
1229 1231
1230 ret = add_missing_keys(fs_info, &preftrees); 1232 ret = add_missing_keys(fs_info, &preftrees, path->skip_locking == 0);
1231 if (ret) 1233 if (ret)
1232 goto out; 1234 goto out;
1233 1235
@@ -1288,11 +1290,15 @@ again:
1288 ret = -EIO; 1290 ret = -EIO;
1289 goto out; 1291 goto out;
1290 } 1292 }
1291 btrfs_tree_read_lock(eb); 1293
1292 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); 1294 if (!path->skip_locking) {
1295 btrfs_tree_read_lock(eb);
1296 btrfs_set_lock_blocking_read(eb);
1297 }
1293 ret = find_extent_in_eb(eb, bytenr, 1298 ret = find_extent_in_eb(eb, bytenr,
1294 *extent_item_pos, &eie, ignore_offset); 1299 *extent_item_pos, &eie, ignore_offset);
1295 btrfs_tree_read_unlock_blocking(eb); 1300 if (!path->skip_locking)
1301 btrfs_tree_read_unlock_blocking(eb);
1296 free_extent_buffer(eb); 1302 free_extent_buffer(eb);
1297 if (ret < 0) 1303 if (ret < 0)
1298 goto out; 1304 goto out;
@@ -1650,7 +1656,7 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
1650 /* make sure we can use eb after releasing the path */ 1656 /* make sure we can use eb after releasing the path */
1651 if (eb != eb_in) { 1657 if (eb != eb_in) {
1652 if (!path->skip_locking) 1658 if (!path->skip_locking)
1653 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); 1659 btrfs_set_lock_blocking_read(eb);
1654 path->nodes[0] = NULL; 1660 path->nodes[0] = NULL;
1655 path->locks[0] = 0; 1661 path->locks[0] = 0;
1656 } 1662 }
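The backref.c changes make tree-block locking conditional on path->skip_locking: callers that already hold the locks, or must not take them in their context, set the flag, and helpers such as add_missing_keys() now receive it explicitly instead of locking unconditionally. A compilable sketch of the shape, using a pthread rwlock in place of the btrfs extent-buffer lock:

    #include <pthread.h>

    struct extent_buffer {
            pthread_rwlock_t lock;
            int first_key;                   /* stand-in payload */
    };

    /* Mirror of the add_missing_keys(..., lock) change: when the
     * caller set skip_locking it already owns the lock, so locking
     * again here would deadlock.
     */
    static int read_first_key(struct extent_buffer *eb, int lock)
    {
            int key;

            if (lock)
                    pthread_rwlock_rdlock(&eb->lock);
            key = eb->first_key;
            if (lock)
                    pthread_rwlock_unlock(&eb->lock);
            return key;
    }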
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 548057630b69..eb8e20b740d6 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -730,6 +730,28 @@ struct heuristic_ws {
730 struct list_head list; 730 struct list_head list;
731}; 731};
732 732
733static struct workspace_manager heuristic_wsm;
734
735static void heuristic_init_workspace_manager(void)
736{
737 btrfs_init_workspace_manager(&heuristic_wsm, &btrfs_heuristic_compress);
738}
739
740static void heuristic_cleanup_workspace_manager(void)
741{
742 btrfs_cleanup_workspace_manager(&heuristic_wsm);
743}
744
745static struct list_head *heuristic_get_workspace(unsigned int level)
746{
747 return btrfs_get_workspace(&heuristic_wsm, level);
748}
749
750static void heuristic_put_workspace(struct list_head *ws)
751{
752 btrfs_put_workspace(&heuristic_wsm, ws);
753}
754
733static void free_heuristic_ws(struct list_head *ws) 755static void free_heuristic_ws(struct list_head *ws)
734{ 756{
735 struct heuristic_ws *workspace; 757 struct heuristic_ws *workspace;
@@ -742,7 +764,7 @@ static void free_heuristic_ws(struct list_head *ws)
742 kfree(workspace); 764 kfree(workspace);
743} 765}
744 766
745static struct list_head *alloc_heuristic_ws(void) 767static struct list_head *alloc_heuristic_ws(unsigned int level)
746{ 768{
747 struct heuristic_ws *ws; 769 struct heuristic_ws *ws;
748 770
@@ -769,65 +791,59 @@ fail:
769 return ERR_PTR(-ENOMEM); 791 return ERR_PTR(-ENOMEM);
770} 792}
771 793
772struct workspaces_list { 794const struct btrfs_compress_op btrfs_heuristic_compress = {
773 struct list_head idle_ws; 795 .init_workspace_manager = heuristic_init_workspace_manager,
774 spinlock_t ws_lock; 796 .cleanup_workspace_manager = heuristic_cleanup_workspace_manager,
775 /* Number of free workspaces */ 797 .get_workspace = heuristic_get_workspace,
776 int free_ws; 798 .put_workspace = heuristic_put_workspace,
777 /* Total number of allocated workspaces */ 799 .alloc_workspace = alloc_heuristic_ws,
778 atomic_t total_ws; 800 .free_workspace = free_heuristic_ws,
779 /* Waiters for a free workspace */
780 wait_queue_head_t ws_wait;
781}; 801};
782 802
783static struct workspaces_list btrfs_comp_ws[BTRFS_COMPRESS_TYPES];
784
785static struct workspaces_list btrfs_heuristic_ws;
786
787static const struct btrfs_compress_op * const btrfs_compress_op[] = { 803static const struct btrfs_compress_op * const btrfs_compress_op[] = {
804 /* The heuristic is represented as compression type 0 */
805 &btrfs_heuristic_compress,
788 &btrfs_zlib_compress, 806 &btrfs_zlib_compress,
789 &btrfs_lzo_compress, 807 &btrfs_lzo_compress,
790 &btrfs_zstd_compress, 808 &btrfs_zstd_compress,
791}; 809};
792 810
793void __init btrfs_init_compress(void) 811void btrfs_init_workspace_manager(struct workspace_manager *wsm,
812 const struct btrfs_compress_op *ops)
794{ 813{
795 struct list_head *workspace; 814 struct list_head *workspace;
796 int i;
797 815
798 INIT_LIST_HEAD(&btrfs_heuristic_ws.idle_ws); 816 wsm->ops = ops;
799 spin_lock_init(&btrfs_heuristic_ws.ws_lock);
800 atomic_set(&btrfs_heuristic_ws.total_ws, 0);
801 init_waitqueue_head(&btrfs_heuristic_ws.ws_wait);
802 817
803 workspace = alloc_heuristic_ws(); 818 INIT_LIST_HEAD(&wsm->idle_ws);
819 spin_lock_init(&wsm->ws_lock);
820 atomic_set(&wsm->total_ws, 0);
821 init_waitqueue_head(&wsm->ws_wait);
822
823 /*
824 * Preallocate one workspace for each compression type so we can
825 * guarantee forward progress in the worst case
826 */
827 workspace = wsm->ops->alloc_workspace(0);
804 if (IS_ERR(workspace)) { 828 if (IS_ERR(workspace)) {
805 pr_warn( 829 pr_warn(
806 "BTRFS: cannot preallocate heuristic workspace, will try later\n"); 830 "BTRFS: cannot preallocate compression workspace, will try later\n");
807 } else { 831 } else {
808 atomic_set(&btrfs_heuristic_ws.total_ws, 1); 832 atomic_set(&wsm->total_ws, 1);
809 btrfs_heuristic_ws.free_ws = 1; 833 wsm->free_ws = 1;
810 list_add(workspace, &btrfs_heuristic_ws.idle_ws); 834 list_add(workspace, &wsm->idle_ws);
811 } 835 }
836}
812 837
813 for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { 838void btrfs_cleanup_workspace_manager(struct workspace_manager *wsman)
814 INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws); 839{
815 spin_lock_init(&btrfs_comp_ws[i].ws_lock); 840 struct list_head *ws;
816 atomic_set(&btrfs_comp_ws[i].total_ws, 0);
817 init_waitqueue_head(&btrfs_comp_ws[i].ws_wait);
818 841
819 /* 842 while (!list_empty(&wsman->idle_ws)) {
820 * Preallocate one workspace for each compression type so 843 ws = wsman->idle_ws.next;
821 * we can guarantee forward progress in the worst case 844 list_del(ws);
822 */ 845 wsman->ops->free_workspace(ws);
823 workspace = btrfs_compress_op[i]->alloc_workspace(); 846 atomic_dec(&wsman->total_ws);
824 if (IS_ERR(workspace)) {
825 pr_warn("BTRFS: cannot preallocate compression workspace, will try later\n");
826 } else {
827 atomic_set(&btrfs_comp_ws[i].total_ws, 1);
828 btrfs_comp_ws[i].free_ws = 1;
829 list_add(workspace, &btrfs_comp_ws[i].idle_ws);
830 }
831 } 847 }
832} 848}
833 849
@@ -837,11 +853,11 @@ void __init btrfs_init_compress(void)
837 * Preallocation makes a forward progress guarantees and we do not return 853 * Preallocation makes a forward progress guarantees and we do not return
838 * errors. 854 * errors.
839 */ 855 */
840static struct list_head *__find_workspace(int type, bool heuristic) 856struct list_head *btrfs_get_workspace(struct workspace_manager *wsm,
857 unsigned int level)
841{ 858{
842 struct list_head *workspace; 859 struct list_head *workspace;
843 int cpus = num_online_cpus(); 860 int cpus = num_online_cpus();
844 int idx = type - 1;
845 unsigned nofs_flag; 861 unsigned nofs_flag;
846 struct list_head *idle_ws; 862 struct list_head *idle_ws;
847 spinlock_t *ws_lock; 863 spinlock_t *ws_lock;
@@ -849,19 +865,11 @@ static struct list_head *__find_workspace(int type, bool heuristic)
849 wait_queue_head_t *ws_wait; 865 wait_queue_head_t *ws_wait;
850 int *free_ws; 866 int *free_ws;
851 867
852 if (heuristic) { 868 idle_ws = &wsm->idle_ws;
853 idle_ws = &btrfs_heuristic_ws.idle_ws; 869 ws_lock = &wsm->ws_lock;
854 ws_lock = &btrfs_heuristic_ws.ws_lock; 870 total_ws = &wsm->total_ws;
855 total_ws = &btrfs_heuristic_ws.total_ws; 871 ws_wait = &wsm->ws_wait;
856 ws_wait = &btrfs_heuristic_ws.ws_wait; 872 free_ws = &wsm->free_ws;
857 free_ws = &btrfs_heuristic_ws.free_ws;
858 } else {
859 idle_ws = &btrfs_comp_ws[idx].idle_ws;
860 ws_lock = &btrfs_comp_ws[idx].ws_lock;
861 total_ws = &btrfs_comp_ws[idx].total_ws;
862 ws_wait = &btrfs_comp_ws[idx].ws_wait;
863 free_ws = &btrfs_comp_ws[idx].free_ws;
864 }
865 873
866again: 874again:
867 spin_lock(ws_lock); 875 spin_lock(ws_lock);
@@ -892,10 +900,7 @@ again:
892 * context of btrfs_compress_bio/btrfs_compress_pages 900 * context of btrfs_compress_bio/btrfs_compress_pages
893 */ 901 */
894 nofs_flag = memalloc_nofs_save(); 902 nofs_flag = memalloc_nofs_save();
895 if (heuristic) 903 workspace = wsm->ops->alloc_workspace(level);
896 workspace = alloc_heuristic_ws();
897 else
898 workspace = btrfs_compress_op[idx]->alloc_workspace();
899 memalloc_nofs_restore(nofs_flag); 904 memalloc_nofs_restore(nofs_flag);
900 905
901 if (IS_ERR(workspace)) { 906 if (IS_ERR(workspace)) {
@@ -926,85 +931,47 @@ again:
926 return workspace; 931 return workspace;
927} 932}
928 933
929static struct list_head *find_workspace(int type) 934static struct list_head *get_workspace(int type, int level)
930{ 935{
931 return __find_workspace(type, false); 936 return btrfs_compress_op[type]->get_workspace(level);
932} 937}
933 938
934/* 939/*
935 * put a workspace struct back on the list or free it if we have enough 940 * put a workspace struct back on the list or free it if we have enough
936 * idle ones sitting around 941 * idle ones sitting around
937 */ 942 */
938static void __free_workspace(int type, struct list_head *workspace, 943void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws)
939 bool heuristic)
940{ 944{
941 int idx = type - 1;
942 struct list_head *idle_ws; 945 struct list_head *idle_ws;
943 spinlock_t *ws_lock; 946 spinlock_t *ws_lock;
944 atomic_t *total_ws; 947 atomic_t *total_ws;
945 wait_queue_head_t *ws_wait; 948 wait_queue_head_t *ws_wait;
946 int *free_ws; 949 int *free_ws;
947 950
948 if (heuristic) { 951 idle_ws = &wsm->idle_ws;
949 idle_ws = &btrfs_heuristic_ws.idle_ws; 952 ws_lock = &wsm->ws_lock;
950 ws_lock = &btrfs_heuristic_ws.ws_lock; 953 total_ws = &wsm->total_ws;
951 total_ws = &btrfs_heuristic_ws.total_ws; 954 ws_wait = &wsm->ws_wait;
952 ws_wait = &btrfs_heuristic_ws.ws_wait; 955 free_ws = &wsm->free_ws;
953 free_ws = &btrfs_heuristic_ws.free_ws;
954 } else {
955 idle_ws = &btrfs_comp_ws[idx].idle_ws;
956 ws_lock = &btrfs_comp_ws[idx].ws_lock;
957 total_ws = &btrfs_comp_ws[idx].total_ws;
958 ws_wait = &btrfs_comp_ws[idx].ws_wait;
959 free_ws = &btrfs_comp_ws[idx].free_ws;
960 }
961 956
962 spin_lock(ws_lock); 957 spin_lock(ws_lock);
963 if (*free_ws <= num_online_cpus()) { 958 if (*free_ws <= num_online_cpus()) {
964 list_add(workspace, idle_ws); 959 list_add(ws, idle_ws);
965 (*free_ws)++; 960 (*free_ws)++;
966 spin_unlock(ws_lock); 961 spin_unlock(ws_lock);
967 goto wake; 962 goto wake;
968 } 963 }
969 spin_unlock(ws_lock); 964 spin_unlock(ws_lock);
970 965
971 if (heuristic) 966 wsm->ops->free_workspace(ws);
972 free_heuristic_ws(workspace);
973 else
974 btrfs_compress_op[idx]->free_workspace(workspace);
975 atomic_dec(total_ws); 967 atomic_dec(total_ws);
976wake: 968wake:
977 cond_wake_up(ws_wait); 969 cond_wake_up(ws_wait);
978} 970}
979 971
980static void free_workspace(int type, struct list_head *ws) 972static void put_workspace(int type, struct list_head *ws)
981{ 973{
982 return __free_workspace(type, ws, false); 974 return btrfs_compress_op[type]->put_workspace(ws);
983}
984
985/*
986 * cleanup function for module exit
987 */
988static void free_workspaces(void)
989{
990 struct list_head *workspace;
991 int i;
992
993 while (!list_empty(&btrfs_heuristic_ws.idle_ws)) {
994 workspace = btrfs_heuristic_ws.idle_ws.next;
995 list_del(workspace);
996 free_heuristic_ws(workspace);
997 atomic_dec(&btrfs_heuristic_ws.total_ws);
998 }
999
1000 for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
1001 while (!list_empty(&btrfs_comp_ws[i].idle_ws)) {
1002 workspace = btrfs_comp_ws[i].idle_ws.next;
1003 list_del(workspace);
1004 btrfs_compress_op[i]->free_workspace(workspace);
1005 atomic_dec(&btrfs_comp_ws[i].total_ws);
1006 }
1007 }
1008} 975}
1009 976
1010/* 977/*
@@ -1036,18 +1003,17 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
1036 unsigned long *total_in, 1003 unsigned long *total_in,
1037 unsigned long *total_out) 1004 unsigned long *total_out)
1038{ 1005{
1006 int type = btrfs_compress_type(type_level);
1007 int level = btrfs_compress_level(type_level);
1039 struct list_head *workspace; 1008 struct list_head *workspace;
1040 int ret; 1009 int ret;
1041 int type = type_level & 0xF;
1042
1043 workspace = find_workspace(type);
1044 1010
1045 btrfs_compress_op[type - 1]->set_level(workspace, type_level); 1011 workspace = get_workspace(type, level);
1046 ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping, 1012 ret = btrfs_compress_op[type]->compress_pages(workspace, mapping,
1047 start, pages, 1013 start, pages,
1048 out_pages, 1014 out_pages,
1049 total_in, total_out); 1015 total_in, total_out);
1050 free_workspace(type, workspace); 1016 put_workspace(type, workspace);
1051 return ret; 1017 return ret;
1052} 1018}
1053 1019
@@ -1071,9 +1037,9 @@ static int btrfs_decompress_bio(struct compressed_bio *cb)
1071 int ret; 1037 int ret;
1072 int type = cb->compress_type; 1038 int type = cb->compress_type;
1073 1039
1074 workspace = find_workspace(type); 1040 workspace = get_workspace(type, 0);
1075 ret = btrfs_compress_op[type - 1]->decompress_bio(workspace, cb); 1041 ret = btrfs_compress_op[type]->decompress_bio(workspace, cb);
1076 free_workspace(type, workspace); 1042 put_workspace(type, workspace);
1077 1043
1078 return ret; 1044 return ret;
1079} 1045}
@@ -1089,19 +1055,29 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
1089 struct list_head *workspace; 1055 struct list_head *workspace;
1090 int ret; 1056 int ret;
1091 1057
1092 workspace = find_workspace(type); 1058 workspace = get_workspace(type, 0);
1093 1059 ret = btrfs_compress_op[type]->decompress(workspace, data_in,
1094 ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
1095 dest_page, start_byte, 1060 dest_page, start_byte,
1096 srclen, destlen); 1061 srclen, destlen);
1062 put_workspace(type, workspace);
1097 1063
1098 free_workspace(type, workspace);
1099 return ret; 1064 return ret;
1100} 1065}
1101 1066
1067void __init btrfs_init_compress(void)
1068{
1069 int i;
1070
1071 for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++)
1072 btrfs_compress_op[i]->init_workspace_manager();
1073}
1074
1102void __cold btrfs_exit_compress(void) 1075void __cold btrfs_exit_compress(void)
1103{ 1076{
1104 free_workspaces(); 1077 int i;
1078
1079 for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++)
1080 btrfs_compress_op[i]->cleanup_workspace_manager();
1105} 1081}
1106 1082
1107/* 1083/*
@@ -1512,7 +1488,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
1512 */ 1488 */
1513int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end) 1489int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
1514{ 1490{
1515 struct list_head *ws_list = __find_workspace(0, true); 1491 struct list_head *ws_list = get_workspace(0, 0);
1516 struct heuristic_ws *ws; 1492 struct heuristic_ws *ws;
1517 u32 i; 1493 u32 i;
1518 u8 byte; 1494 u8 byte;
@@ -1581,18 +1557,29 @@ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
1581 } 1557 }
1582 1558
1583out: 1559out:
1584 __free_workspace(0, ws_list, true); 1560 put_workspace(0, ws_list);
1585 return ret; 1561 return ret;
1586} 1562}
1587 1563
1588unsigned int btrfs_compress_str2level(const char *str) 1564/*
1565 * Convert the compression suffix (eg. after "zlib" starting with ":") to
1566 * level, unrecognized string will set the default level
1567 */
1568unsigned int btrfs_compress_str2level(unsigned int type, const char *str)
1589{ 1569{
1590 if (strncmp(str, "zlib", 4) != 0) 1570 unsigned int level = 0;
1571 int ret;
1572
1573 if (!type)
1591 return 0; 1574 return 0;
1592 1575
1593 /* Accepted form: zlib:1 up to zlib:9 and nothing left after the number */ 1576 if (str[0] == ':') {
1594 if (str[4] == ':' && '1' <= str[5] && str[5] <= '9' && str[6] == 0) 1577 ret = kstrtouint(str + 1, 10, &level);
1595 return str[5] - '0'; 1578 if (ret)
1579 level = 0;
1580 }
1581
1582 level = btrfs_compress_op[type]->set_level(level);
1596 1583
1597 return BTRFS_ZLIB_DEFAULT_LEVEL; 1584 return level;
1598} 1585}
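The rewritten btrfs_compress_str2level() generalizes the old zlib-only ":1".."":9" parser: any type may carry a ":<level>" suffix, a failed parse falls back to 0 ("use the default"), and the per-type set_level() callback clamps the result. A userspace analogue with strtoul() and a stand-in clamp (the real bounds are per-algorithm):

    #include <stdio.h>
    #include <stdlib.h>

    static unsigned int clamp_level(unsigned int level)
    {
            if (level == 0)
                    return 3;                /* assumed default     */
            return level > 9 ? 9 : level;    /* assumed upper bound */
    }

    static unsigned int str2level(const char *str)
    {
            unsigned int level = 0;

            if (str[0] == ':') {
                    char *end;
                    unsigned long v = strtoul(str + 1, &end, 10);

                    /* Like kstrtouint(), reject trailing junk. */
                    level = (*end == '\0') ? (unsigned int)v : 0;
            }
            return clamp_level(level);
    }

    int main(void)
    {
            printf("%u %u %u\n", str2level(":9"), str2level(":x"), str2level(""));
            /* prints: 9 3 3 */
            return 0;
    }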
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index ddda9b80bf20..9976fe0f7526 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -64,6 +64,16 @@ struct compressed_bio {
64 u32 sums; 64 u32 sums;
65}; 65};
66 66
67static inline unsigned int btrfs_compress_type(unsigned int type_level)
68{
69 return (type_level & 0xF);
70}
71
72static inline unsigned int btrfs_compress_level(unsigned int type_level)
73{
74 return ((type_level & 0xF0) >> 4);
75}
76
67void __init btrfs_init_compress(void); 77void __init btrfs_init_compress(void);
68void __cold btrfs_exit_compress(void); 78void __cold btrfs_exit_compress(void);
69 79
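The new helpers decode the packed type_level word: compression type in the low nibble, level in the next nibble. A tiny self-check of the encoding (make_type_level() here is a hypothetical inverse, shown only for illustration):

    #include <assert.h>

    static unsigned int make_type_level(unsigned int type, unsigned int level)
    {
            return (type & 0xF) | ((level & 0xF) << 4);
    }

    static unsigned int get_type(unsigned int tl)  { return tl & 0xF; }
    static unsigned int get_level(unsigned int tl) { return (tl & 0xF0) >> 4; }

    int main(void)
    {
            unsigned int tl = make_type_level(1 /* zlib */, 9);

            assert(get_type(tl) == 1 && get_level(tl) == 9);
            return 0;
    }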
@@ -87,7 +97,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
87blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, 97blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
88 int mirror_num, unsigned long bio_flags); 98 int mirror_num, unsigned long bio_flags);
89 99
90unsigned btrfs_compress_str2level(const char *str); 100unsigned int btrfs_compress_str2level(unsigned int type, const char *str);
91 101
92enum btrfs_compression_type { 102enum btrfs_compression_type {
93 BTRFS_COMPRESS_NONE = 0, 103 BTRFS_COMPRESS_NONE = 0,
@@ -97,8 +107,35 @@ enum btrfs_compression_type {
97 BTRFS_COMPRESS_TYPES = 3, 107 BTRFS_COMPRESS_TYPES = 3,
98}; 108};
99 109
110struct workspace_manager {
111 const struct btrfs_compress_op *ops;
112 struct list_head idle_ws;
113 spinlock_t ws_lock;
114 /* Number of free workspaces */
115 int free_ws;
116 /* Total number of allocated workspaces */
117 atomic_t total_ws;
118 /* Waiters for a free workspace */
119 wait_queue_head_t ws_wait;
120};
121
122void btrfs_init_workspace_manager(struct workspace_manager *wsm,
123 const struct btrfs_compress_op *ops);
124struct list_head *btrfs_get_workspace(struct workspace_manager *wsm,
125 unsigned int level);
126void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws);
127void btrfs_cleanup_workspace_manager(struct workspace_manager *wsm);
128
100struct btrfs_compress_op { 129struct btrfs_compress_op {
101 struct list_head *(*alloc_workspace)(void); 130 void (*init_workspace_manager)(void);
131
132 void (*cleanup_workspace_manager)(void);
133
134 struct list_head *(*get_workspace)(unsigned int level);
135
136 void (*put_workspace)(struct list_head *ws);
137
138 struct list_head *(*alloc_workspace)(unsigned int level);
102 139
103 void (*free_workspace)(struct list_head *workspace); 140 void (*free_workspace)(struct list_head *workspace);
104 141
@@ -119,9 +156,18 @@ struct btrfs_compress_op {
119 unsigned long start_byte, 156 unsigned long start_byte,
120 size_t srclen, size_t destlen); 157 size_t srclen, size_t destlen);
121 158
122 void (*set_level)(struct list_head *ws, unsigned int type); 159 /*
160 * This bounds the level set by the user to be within range of a
161 * particular compression type. It returns the level that will be used
162 * if the level is out of bounds or the default if 0 is passed in.
163 */
164 unsigned int (*set_level)(unsigned int level);
123}; 165};
124 166
167/* The heuristic workspaces are managed via the 0th workspace manager */
168#define BTRFS_NR_WORKSPACE_MANAGERS (BTRFS_COMPRESS_TYPES + 1)
169
170extern const struct btrfs_compress_op btrfs_heuristic_compress;
125extern const struct btrfs_compress_op btrfs_zlib_compress; 171extern const struct btrfs_compress_op btrfs_zlib_compress;
126extern const struct btrfs_compress_op btrfs_lzo_compress; 172extern const struct btrfs_compress_op btrfs_lzo_compress;
127extern const struct btrfs_compress_op btrfs_zstd_compress; 173extern const struct btrfs_compress_op btrfs_zstd_compress;
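Note on the two helpers added above: btrfs packs the compression type and the level into a single unsigned int, with the type in the low nibble and the level in bits 4-7, which is exactly what the 0xF/0xF0 masks decode. A minimal userspace sketch of the round trip (pack_type_level() is an illustrative helper, not part of the kernel API):

#include <assert.h>

static unsigned int pack_type_level(unsigned int type, unsigned int level)
{
	/* type in bits 0-3, level in bits 4-7, mirroring the masks above */
	return (type & 0xF) | ((level & 0xF) << 4);
}

static unsigned int compress_type(unsigned int type_level)
{
	return type_level & 0xF;
}

static unsigned int compress_level(unsigned int type_level)
{
	return (type_level & 0xF0) >> 4;
}

int main(void)
{
	unsigned int tl = pack_type_level(1 /* zlib */, 3);

	assert(compress_type(tl) == 1);
	assert(compress_level(tl) == 3);
	return 0;
}

This also explains why set_level() now takes and returns a plain unsigned int: the workspace managers deal in unpacked levels, and the packing stays confined to these two helpers.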
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 5a6c39b44c84..324df36d28bf 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -13,6 +13,7 @@
13#include "print-tree.h" 13#include "print-tree.h"
14#include "locking.h" 14#include "locking.h"
15#include "volumes.h" 15#include "volumes.h"
16#include "qgroup.h"
16 17
17static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root 18static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
18 *root, struct btrfs_path *path, int level); 19 *root, struct btrfs_path *path, int level);
@@ -45,11 +46,18 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
45 for (i = 0; i < BTRFS_MAX_LEVEL; i++) { 46 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
46 if (!p->nodes[i] || !p->locks[i]) 47 if (!p->nodes[i] || !p->locks[i])
47 continue; 48 continue;
48 btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]); 49 /*
49 if (p->locks[i] == BTRFS_READ_LOCK) 50 * If we currently have a spinning reader or writer lock this
51 * will bump the count of blocking holders and drop the
52 * spinlock.
53 */
54 if (p->locks[i] == BTRFS_READ_LOCK) {
55 btrfs_set_lock_blocking_read(p->nodes[i]);
50 p->locks[i] = BTRFS_READ_LOCK_BLOCKING; 56 p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
51 else if (p->locks[i] == BTRFS_WRITE_LOCK) 57 } else if (p->locks[i] == BTRFS_WRITE_LOCK) {
58 btrfs_set_lock_blocking_write(p->nodes[i]);
52 p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING; 59 p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
60 }
53 } 61 }
54} 62}
55 63
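For context on the lock API change running through this file: extent-buffer locks start out spinning and are converted to blocking when the holder may sleep. The old btrfs_set_lock_blocking_rw(eb, type) took the lock type as an argument; the split into _read/_write variants makes each call site state which lock it holds, as the new comment says. A rough userspace model of the conversion, assuming the kernel's pattern of counting blocking holders before dropping the spinning lock (the names are illustrative, not the kernel API):

#include <pthread.h>
#include <stdatomic.h>

struct toy_eb {
	pthread_rwlock_t spin;       /* stand-in for the spinning rwlock */
	atomic_int blocking_readers; /* holders that may now sleep */
	atomic_int blocking_writers;
};

/* Convert a held spinning read lock into a blocking one: record the
 * blocking holder first, then drop the spinning lock. */
static void toy_set_lock_blocking_read(struct toy_eb *eb)
{
	atomic_fetch_add(&eb->blocking_readers, 1);
	pthread_rwlock_unlock(&eb->spin);
}

static void toy_set_lock_blocking_write(struct toy_eb *eb)
{
	atomic_fetch_add(&eb->blocking_writers, 1);
	pthread_rwlock_unlock(&eb->spin);
}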
@@ -1288,7 +1296,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1288 return eb; 1296 return eb;
1289 1297
1290 btrfs_set_path_blocking(path); 1298 btrfs_set_path_blocking(path);
1291 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); 1299 btrfs_set_lock_blocking_read(eb);
1292 1300
1293 if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { 1301 if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1294 BUG_ON(tm->slot != 0); 1302 BUG_ON(tm->slot != 0);
@@ -1378,7 +1386,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
1378 free_extent_buffer(eb_root); 1386 free_extent_buffer(eb_root);
1379 eb = alloc_dummy_extent_buffer(fs_info, logical); 1387 eb = alloc_dummy_extent_buffer(fs_info, logical);
1380 } else { 1388 } else {
1381 btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK); 1389 btrfs_set_lock_blocking_read(eb_root);
1382 eb = btrfs_clone_extent_buffer(eb_root); 1390 eb = btrfs_clone_extent_buffer(eb_root);
1383 btrfs_tree_read_unlock_blocking(eb_root); 1391 btrfs_tree_read_unlock_blocking(eb_root);
1384 free_extent_buffer(eb_root); 1392 free_extent_buffer(eb_root);
@@ -1486,9 +1494,16 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1486 search_start = buf->start & ~((u64)SZ_1G - 1); 1494 search_start = buf->start & ~((u64)SZ_1G - 1);
1487 1495
1488 if (parent) 1496 if (parent)
1489 btrfs_set_lock_blocking(parent); 1497 btrfs_set_lock_blocking_write(parent);
1490 btrfs_set_lock_blocking(buf); 1498 btrfs_set_lock_blocking_write(buf);
1491 1499
1500 /*
1501 * Before CoWing this block for later modification, check if it's
1502 * the subtree root and do the delayed subtree trace if needed.
1503 *
1504 * Also, we don't care about the error, as it's handled internally.
1505 */
1506 btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
1492 ret = __btrfs_cow_block(trans, root, buf, parent, 1507 ret = __btrfs_cow_block(trans, root, buf, parent,
1493 parent_slot, cow_ret, search_start, 0); 1508 parent_slot, cow_ret, search_start, 0);
1494 1509
@@ -1582,7 +1597,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1582 if (parent_nritems <= 1) 1597 if (parent_nritems <= 1)
1583 return 0; 1598 return 0;
1584 1599
1585 btrfs_set_lock_blocking(parent); 1600 btrfs_set_lock_blocking_write(parent);
1586 1601
1587 for (i = start_slot; i <= end_slot; i++) { 1602 for (i = start_slot; i <= end_slot; i++) {
1588 struct btrfs_key first_key; 1603 struct btrfs_key first_key;
@@ -1641,7 +1656,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1641 search_start = last_block; 1656 search_start = last_block;
1642 1657
1643 btrfs_tree_lock(cur); 1658 btrfs_tree_lock(cur);
1644 btrfs_set_lock_blocking(cur); 1659 btrfs_set_lock_blocking_write(cur);
1645 err = __btrfs_cow_block(trans, root, cur, parent, i, 1660 err = __btrfs_cow_block(trans, root, cur, parent, i,
1646 &cur, search_start, 1661 &cur, search_start,
1647 min(16 * blocksize, 1662 min(16 * blocksize,
@@ -1856,7 +1871,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1856 } 1871 }
1857 1872
1858 btrfs_tree_lock(child); 1873 btrfs_tree_lock(child);
1859 btrfs_set_lock_blocking(child); 1874 btrfs_set_lock_blocking_write(child);
1860 ret = btrfs_cow_block(trans, root, child, mid, 0, &child); 1875 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1861 if (ret) { 1876 if (ret) {
1862 btrfs_tree_unlock(child); 1877 btrfs_tree_unlock(child);
@@ -1894,7 +1909,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1894 1909
1895 if (left) { 1910 if (left) {
1896 btrfs_tree_lock(left); 1911 btrfs_tree_lock(left);
1897 btrfs_set_lock_blocking(left); 1912 btrfs_set_lock_blocking_write(left);
1898 wret = btrfs_cow_block(trans, root, left, 1913 wret = btrfs_cow_block(trans, root, left,
1899 parent, pslot - 1, &left); 1914 parent, pslot - 1, &left);
1900 if (wret) { 1915 if (wret) {
@@ -1909,7 +1924,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1909 1924
1910 if (right) { 1925 if (right) {
1911 btrfs_tree_lock(right); 1926 btrfs_tree_lock(right);
1912 btrfs_set_lock_blocking(right); 1927 btrfs_set_lock_blocking_write(right);
1913 wret = btrfs_cow_block(trans, root, right, 1928 wret = btrfs_cow_block(trans, root, right,
1914 parent, pslot + 1, &right); 1929 parent, pslot + 1, &right);
1915 if (wret) { 1930 if (wret) {
@@ -2072,7 +2087,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
2072 u32 left_nr; 2087 u32 left_nr;
2073 2088
2074 btrfs_tree_lock(left); 2089 btrfs_tree_lock(left);
2075 btrfs_set_lock_blocking(left); 2090 btrfs_set_lock_blocking_write(left);
2076 2091
2077 left_nr = btrfs_header_nritems(left); 2092 left_nr = btrfs_header_nritems(left);
2078 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { 2093 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
@@ -2127,7 +2142,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
2127 u32 right_nr; 2142 u32 right_nr;
2128 2143
2129 btrfs_tree_lock(right); 2144 btrfs_tree_lock(right);
2130 btrfs_set_lock_blocking(right); 2145 btrfs_set_lock_blocking_write(right);
2131 2146
2132 right_nr = btrfs_header_nritems(right); 2147 right_nr = btrfs_header_nritems(right);
2133 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { 2148 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
@@ -2529,26 +2544,6 @@ done:
2529 return ret; 2544 return ret;
2530} 2545}
2531 2546
2532static void key_search_validate(struct extent_buffer *b,
2533 const struct btrfs_key *key,
2534 int level)
2535{
2536#ifdef CONFIG_BTRFS_ASSERT
2537 struct btrfs_disk_key disk_key;
2538
2539 btrfs_cpu_key_to_disk(&disk_key, key);
2540
2541 if (level == 0)
2542 ASSERT(!memcmp_extent_buffer(b, &disk_key,
2543 offsetof(struct btrfs_leaf, items[0].key),
2544 sizeof(disk_key)));
2545 else
2546 ASSERT(!memcmp_extent_buffer(b, &disk_key,
2547 offsetof(struct btrfs_node, ptrs[0].key),
2548 sizeof(disk_key)));
2549#endif
2550}
2551
2552static int key_search(struct extent_buffer *b, const struct btrfs_key *key, 2547static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
2553 int level, int *prev_cmp, int *slot) 2548 int level, int *prev_cmp, int *slot)
2554{ 2549{
@@ -2557,7 +2552,6 @@ static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
2557 return *prev_cmp; 2552 return *prev_cmp;
2558 } 2553 }
2559 2554
2560 key_search_validate(b, key, level);
2561 *slot = 0; 2555 *slot = 0;
2562 2556
2563 return 0; 2557 return 0;
@@ -3005,6 +2999,8 @@ again:
3005 */ 2999 */
3006 prev_cmp = -1; 3000 prev_cmp = -1;
3007 ret = key_search(b, key, level, &prev_cmp, &slot); 3001 ret = key_search(b, key, level, &prev_cmp, &slot);
3002 if (ret < 0)
3003 goto done;
3008 3004
3009 if (level != 0) { 3005 if (level != 0) {
3010 int dec = 0; 3006 int dec = 0;
@@ -3771,7 +3767,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3771 return 1; 3767 return 1;
3772 3768
3773 btrfs_tree_lock(right); 3769 btrfs_tree_lock(right);
3774 btrfs_set_lock_blocking(right); 3770 btrfs_set_lock_blocking_write(right);
3775 3771
3776 free_space = btrfs_leaf_free_space(fs_info, right); 3772 free_space = btrfs_leaf_free_space(fs_info, right);
3777 if (free_space < data_size) 3773 if (free_space < data_size)
@@ -4005,7 +4001,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
4005 return 1; 4001 return 1;
4006 4002
4007 btrfs_tree_lock(left); 4003 btrfs_tree_lock(left);
4008 btrfs_set_lock_blocking(left); 4004 btrfs_set_lock_blocking_write(left);
4009 4005
4010 free_space = btrfs_leaf_free_space(fs_info, left); 4006 free_space = btrfs_leaf_free_space(fs_info, left);
4011 if (free_space < data_size) { 4007 if (free_space < data_size) {
@@ -5156,6 +5152,10 @@ again:
5156 nritems = btrfs_header_nritems(cur); 5152 nritems = btrfs_header_nritems(cur);
5157 level = btrfs_header_level(cur); 5153 level = btrfs_header_level(cur);
5158 sret = btrfs_bin_search(cur, min_key, level, &slot); 5154 sret = btrfs_bin_search(cur, min_key, level, &slot);
5155 if (sret < 0) {
5156 ret = sret;
5157 goto out;
5158 }
5159 5159
5160 /* at the lowest level, we're done, setup the path and exit */ 5160 /* at the lowest level, we're done, setup the path and exit */
5161 if (level == path->lowest_level) { 5161 if (level == path->lowest_level) {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 94618a028730..129d26226e70 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -934,7 +934,8 @@ struct btrfs_fs_info {
934 934
935 spinlock_t delayed_iput_lock; 935 spinlock_t delayed_iput_lock;
936 struct list_head delayed_iputs; 936 struct list_head delayed_iputs;
937 struct mutex cleaner_delayed_iput_mutex; 937 atomic_t nr_delayed_iputs;
938 wait_queue_head_t delayed_iputs_wait;
938 939
939 /* this protects tree_mod_seq_list */ 940 /* this protects tree_mod_seq_list */
940 spinlock_t tree_mod_seq_lock; 941 spinlock_t tree_mod_seq_lock;
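The mutex that used to serialize delayed iputs against the cleaner is replaced here by a counter plus waitqueue, so callers block only until in-flight iputs drain instead of excluding the cleaner wholesale. A sketch of how the pair fits together, assuming the usual wait_event pattern (btrfs_wait_on_delayed_iputs() is declared later in this header; the bodies below are an illustration, not a copy of the implementation):

/* Producer side: account an iput, wake waiters when the last one runs. */
static void account_delayed_iput(struct btrfs_fs_info *fs_info)
{
	atomic_inc(&fs_info->nr_delayed_iputs);
}

static void finish_delayed_iput(struct btrfs_fs_info *fs_info)
{
	if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
		wake_up(&fs_info->delayed_iputs_wait);
}

/* Consumer side: sleep (killably) until all pending iputs have run. */
static int wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
{
	if (wait_event_killable(fs_info->delayed_iputs_wait,
				atomic_read(&fs_info->nr_delayed_iputs) == 0))
		return -EINTR;
	return 0;
}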
@@ -1074,10 +1075,13 @@ struct btrfs_fs_info {
1074 atomic_t scrubs_paused; 1075 atomic_t scrubs_paused;
1075 atomic_t scrub_cancel_req; 1076 atomic_t scrub_cancel_req;
1076 wait_queue_head_t scrub_pause_wait; 1077 wait_queue_head_t scrub_pause_wait;
1077 int scrub_workers_refcnt; 1078 /*
1079 * The worker pointers are NULL iff the refcount is 0, i.e. scrub is not
1080 * running.
1081 */
1082 refcount_t scrub_workers_refcnt;
1078 struct btrfs_workqueue *scrub_workers; 1083 struct btrfs_workqueue *scrub_workers;
1079 struct btrfs_workqueue *scrub_wr_completion_workers; 1084 struct btrfs_workqueue *scrub_wr_completion_workers;
1080 struct btrfs_workqueue *scrub_nocow_workers;
1081 struct btrfs_workqueue *scrub_parity_workers; 1085 struct btrfs_workqueue *scrub_parity_workers;
1082 1086
1083#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 1087#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
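scrub_workers_refcnt moving from int to refcount_t is more than a rename: refcount_t saturates instead of wrapping and warns on increment-from-zero and on underflow, so refcounting bugs become loud instead of turning into use-after-free. A userspace stand-in for those semantics (simplified; the real type lives in include/linux/refcount.h):

#include <stdio.h>

typedef struct { int v; } toy_refcount_t;

static void toy_refcount_set(toy_refcount_t *r, int n) { r->v = n; }

static void toy_refcount_inc(toy_refcount_t *r)
{
	if (r->v == 0)
		fprintf(stderr, "WARN: inc on zero refcount (object may be dead)\n");
	r->v++;
}

static int toy_refcount_dec_and_test(toy_refcount_t *r)
{
	if (r->v == 0) {
		fprintf(stderr, "WARN: refcount underflow\n");
		return 0;
	}
	return --r->v == 0;
}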
@@ -1199,6 +1203,24 @@ enum {
1199 BTRFS_ROOT_MULTI_LOG_TASKS, 1203 BTRFS_ROOT_MULTI_LOG_TASKS,
1200 BTRFS_ROOT_DIRTY, 1204 BTRFS_ROOT_DIRTY,
1201 BTRFS_ROOT_DELETING, 1205 BTRFS_ROOT_DELETING,
1206
1207 /*
1208 * Reloc tree is orphan, only kept here for qgroup delayed subtree scan
1209 *
1210 * Set for the subvolume tree owning the reloc tree.
1211 */
1212 BTRFS_ROOT_DEAD_RELOC_TREE,
1213};
1214
1215/*
1216 * Record swapped tree blocks of a subvolume tree for delayed subtree trace
1217 * code. For detail check comment in fs/btrfs/qgroup.c.
1218 */
1219struct btrfs_qgroup_swapped_blocks {
1220 spinlock_t lock;
1221 /* RB_EMPTY_ROOT() of above blocks[] */
1222 bool swapped;
1223 struct rb_root blocks[BTRFS_MAX_LEVEL];
1202}; 1224};
1203 1225
1204/* 1226/*
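btrfs_qgroup_swapped_blocks keeps one rbtree of swapped block pairs per tree level, plus a 'swapped' flag so hot paths can skip taking the spinlock when nothing was recorded. Its initialization presumably looks like the sketch below, mirroring the btrfs_qgroup_init_swapped_blocks() call that shows up later in this series (the body is an assumption; only the call appears in this diff):

static void init_swapped_blocks(struct btrfs_qgroup_swapped_blocks *sb)
{
	int i;

	spin_lock_init(&sb->lock);
	/* one rbtree of swapped block pairs per possible tree level */
	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
		sb->blocks[i] = RB_ROOT;
	sb->swapped = false;
}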
@@ -1312,6 +1334,14 @@ struct btrfs_root {
1312 u64 nr_ordered_extents; 1334 u64 nr_ordered_extents;
1313 1335
1314 /* 1336 /*
1337 * Not empty if this subvolume root has gone through tree block swap
1338 * (relocation)
1339 *
1340 * Will be used by reloc_control::dirty_subvol_roots.
1341 */
1342 struct list_head reloc_dirty_list;
1343
1344 /*
1315 * Number of currently running SEND ioctls to prevent 1345 * Number of currently running SEND ioctls to prevent
1316 * manipulation with the read-only status via SUBVOL_SETFLAGS 1346 * manipulation with the read-only status via SUBVOL_SETFLAGS
1317 */ 1347 */
@@ -1328,6 +1358,9 @@ struct btrfs_root {
1328 /* Number of active swapfiles */ 1358 /* Number of active swapfiles */
1329 atomic_t nr_swapfiles; 1359 atomic_t nr_swapfiles;
1330 1360
1361 /* Record pairs of swapped blocks for qgroup */
1362 struct btrfs_qgroup_swapped_blocks swapped_blocks;
1363
1331#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 1364#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
1332 u64 alloc_bytenr; 1365 u64 alloc_bytenr;
1333#endif 1366#endif
@@ -2775,7 +2808,8 @@ enum btrfs_flush_state {
2775 FLUSH_DELALLOC = 5, 2808 FLUSH_DELALLOC = 5,
2776 FLUSH_DELALLOC_WAIT = 6, 2809 FLUSH_DELALLOC_WAIT = 6,
2777 ALLOC_CHUNK = 7, 2810 ALLOC_CHUNK = 7,
2778 COMMIT_TRANS = 8, 2811 ALLOC_CHUNK_FORCE = 8,
2812 COMMIT_TRANS = 9,
2779}; 2813};
2780 2814
2781int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); 2815int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
@@ -3181,8 +3215,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
3181 3215
3182/* inode.c */ 3216/* inode.c */
3183struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, 3217struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
3184 struct page *page, size_t pg_offset, u64 start, 3218 u64 start, u64 len);
3185 u64 len, int create);
3186noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, 3219noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
3187 u64 *orig_start, u64 *orig_block_len, 3220 u64 *orig_start, u64 *orig_block_len,
3188 u64 *ram_bytes); 3221 u64 *ram_bytes);
@@ -3254,6 +3287,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root);
3254int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); 3287int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
3255void btrfs_add_delayed_iput(struct inode *inode); 3288void btrfs_add_delayed_iput(struct inode *inode);
3256void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info); 3289void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info);
3290int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info);
3257int btrfs_prealloc_file_range(struct inode *inode, int mode, 3291int btrfs_prealloc_file_range(struct inode *inode, int mode,
3258 u64 start, u64 num_bytes, u64 min_size, 3292 u64 start, u64 num_bytes, u64 min_size,
3259 loff_t actual_len, u64 *alloc_hint); 3293 loff_t actual_len, u64 *alloc_hint);
@@ -3261,7 +3295,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
3261 struct btrfs_trans_handle *trans, int mode, 3295 struct btrfs_trans_handle *trans, int mode,
3262 u64 start, u64 num_bytes, u64 min_size, 3296 u64 start, u64 num_bytes, u64 min_size,
3263 loff_t actual_len, u64 *alloc_hint); 3297 loff_t actual_len, u64 *alloc_hint);
3264int btrfs_run_delalloc_range(void *private_data, struct page *locked_page, 3298int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
3265 u64 start, u64 end, int *page_started, unsigned long *nr_written, 3299 u64 start, u64 end, int *page_started, unsigned long *nr_written,
3266 struct writeback_control *wbc); 3300 struct writeback_control *wbc);
3267int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end); 3301int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
@@ -3476,21 +3510,18 @@ do { \
3476 rcu_read_unlock(); \ 3510 rcu_read_unlock(); \
3477} while (0) 3511} while (0)
3478 3512
3479#ifdef CONFIG_BTRFS_ASSERT
3480
3481__cold 3513__cold
3482static inline void assfail(const char *expr, const char *file, int line) 3514static inline void assfail(const char *expr, const char *file, int line)
3483{ 3515{
3484 pr_err("assertion failed: %s, file: %s, line: %d\n", 3516 if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) {
3485 expr, file, line); 3517 pr_err("assertion failed: %s, file: %s, line: %d\n",
3486 BUG(); 3518 expr, file, line);
3519 BUG();
3520 }
3487} 3521}
3488 3522
3489#define ASSERT(expr) \ 3523#define ASSERT(expr) \
3490 (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) 3524 (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
3491#else
3492#define ASSERT(expr) ((void)0)
3493#endif
3494 3525
3495/* 3526/*
3496 * Use that for functions that are conditionally exported for sanity tests but 3527 * Use that for functions that are conditionally exported for sanity tests but
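The ASSERT() rework above trades the #ifdef pair for IS_ENABLED(CONFIG_BTRFS_ASSERT): the asserted expression is now parsed and type-checked in every configuration, and the compiler discards the dead branch when the option is off, so disabled builds cost nothing while assertions can no longer bit-rot. The same pattern in miniature (MY_DEBUG stands in for the config option; this is a sketch, not the kernel's IS_ENABLED machinery):

#include <stdio.h>
#include <stdlib.h>

#define MY_DEBUG 1                 /* stand-in for CONFIG_BTRFS_ASSERT */
#define IS_ENABLED(x) (x)

static void assfail(const char *expr, const char *file, int line)
{
	if (IS_ENABLED(MY_DEBUG)) {
		fprintf(stderr, "assertion failed: %s, %s:%d\n", expr, file, line);
		abort();
	}
}

/* The expression always compiles; the failure path collapses when off. */
#define ASSERT(expr) \
	((expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))

int main(void)
{
	ASSERT(1 + 1 == 2);
	return 0;
}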
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index cad36c99a483..7d2a413df90d 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -602,17 +602,14 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
602 RB_CLEAR_NODE(&head_ref->href_node); 602 RB_CLEAR_NODE(&head_ref->href_node);
603 head_ref->processing = 0; 603 head_ref->processing = 0;
604 head_ref->total_ref_mod = count_mod; 604 head_ref->total_ref_mod = count_mod;
605 head_ref->qgroup_reserved = 0;
606 head_ref->qgroup_ref_root = 0;
607 spin_lock_init(&head_ref->lock); 605 spin_lock_init(&head_ref->lock);
608 mutex_init(&head_ref->mutex); 606 mutex_init(&head_ref->mutex);
609 607
610 if (qrecord) { 608 if (qrecord) {
611 if (ref_root && reserved) { 609 if (ref_root && reserved) {
612 head_ref->qgroup_ref_root = ref_root; 610 qrecord->data_rsv = reserved;
613 head_ref->qgroup_reserved = reserved; 611 qrecord->data_rsv_refroot = ref_root;
614 } 612 }
615
616 qrecord->bytenr = bytenr; 613 qrecord->bytenr = bytenr;
617 qrecord->num_bytes = num_bytes; 614 qrecord->num_bytes = num_bytes;
618 qrecord->old_roots = NULL; 615 qrecord->old_roots = NULL;
@@ -651,10 +648,6 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
651 existing = htree_insert(&delayed_refs->href_root, 648 existing = htree_insert(&delayed_refs->href_root,
652 &head_ref->href_node); 649 &head_ref->href_node);
653 if (existing) { 650 if (existing) {
654 WARN_ON(qrecord && head_ref->qgroup_ref_root
655 && head_ref->qgroup_reserved
656 && existing->qgroup_ref_root
657 && existing->qgroup_reserved);
658 update_existing_head_ref(trans, existing, head_ref, 651 update_existing_head_ref(trans, existing, head_ref,
659 old_ref_mod); 652 old_ref_mod);
660 /* 653 /*
@@ -770,7 +763,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
770 763
771 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && 764 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
772 is_fstree(ref_root)) { 765 is_fstree(ref_root)) {
773 record = kmalloc(sizeof(*record), GFP_NOFS); 766 record = kzalloc(sizeof(*record), GFP_NOFS);
774 if (!record) { 767 if (!record) {
775 kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); 768 kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
776 kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); 769 kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
@@ -867,7 +860,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
867 860
868 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && 861 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
869 is_fstree(ref_root)) { 862 is_fstree(ref_root)) {
870 record = kmalloc(sizeof(*record), GFP_NOFS); 863 record = kzalloc(sizeof(*record), GFP_NOFS);
871 if (!record) { 864 if (!record) {
872 kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); 865 kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
873 kmem_cache_free(btrfs_delayed_ref_head_cachep, 866 kmem_cache_free(btrfs_delayed_ref_head_cachep,
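The kmalloc→kzalloc switch in both hunks is load-bearing: the record's new data_rsv and data_rsv_refroot fields are only assigned when both ref_root and reserved are non-zero, so zero-initialization is what keeps the other path from feeding stale heap contents into qgroup accounting. Schematically (condensed from the init_delayed_ref_head() change above):

record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record)
	return -ENOMEM;

/* Only the reserved data-ref path fills these in; every other path
 * relies on them being zero, which kzalloc now guarantees. */
if (ref_root && reserved) {
	record->data_rsv = reserved;
	record->data_rsv_refroot = ref_root;
}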
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index d2af974f68a1..70606da440aa 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -103,17 +103,6 @@ struct btrfs_delayed_ref_head {
103 int ref_mod; 103 int ref_mod;
104 104
105 /* 105 /*
106 * For qgroup reserved space freeing.
107 *
108 * ref_root and reserved will be recorded after
109 * BTRFS_ADD_DELAYED_EXTENT is called.
110 * And will be used to free reserved qgroup space at
111 * run_delayed_refs() time.
112 */
113 u64 qgroup_ref_root;
114 u64 qgroup_reserved;
115
116 /*
117 * when a new extent is allocated, it is just reserved in memory 106 * when a new extent is allocated, it is just reserved in memory
118 * The actual extent isn't inserted into the extent allocation tree 107 * The actual extent isn't inserted into the extent allocation tree
119 * until the delayed ref is processed. must_insert_reserved is 108 * until the delayed ref is processed. must_insert_reserved is
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 8750c835f535..ee193c5222b2 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -111,11 +111,11 @@ no_valid_dev_replace_entry_found:
111 break; 111 break;
112 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 112 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
113 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 113 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
114 dev_replace->srcdev = btrfs_find_device(fs_info, src_devid, 114 dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices,
115 NULL, NULL); 115 src_devid, NULL, NULL, true);
116 dev_replace->tgtdev = btrfs_find_device(fs_info, 116 dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices,
117 BTRFS_DEV_REPLACE_DEVID, 117 BTRFS_DEV_REPLACE_DEVID,
118 NULL, NULL); 118 NULL, NULL, true);
119 /* 119 /*
120 * allow 'btrfs dev replace_cancel' if src/tgt device is 120 * allow 'btrfs dev replace_cancel' if src/tgt device is
121 * missing 121 * missing
@@ -862,6 +862,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
862 btrfs_destroy_dev_replace_tgtdev(tgt_device); 862 btrfs_destroy_dev_replace_tgtdev(tgt_device);
863 break; 863 break;
864 default: 864 default:
865 up_write(&dev_replace->rwsem);
865 result = -EINVAL; 866 result = -EINVAL;
866 } 867 }
867 868
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6a2a2a951705..5216e7b3f9ad 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -17,6 +17,7 @@
17#include <linux/semaphore.h> 17#include <linux/semaphore.h>
18#include <linux/error-injection.h> 18#include <linux/error-injection.h>
19#include <linux/crc32c.h> 19#include <linux/crc32c.h>
20#include <linux/sched/mm.h>
20#include <asm/unaligned.h> 21#include <asm/unaligned.h>
21#include "ctree.h" 22#include "ctree.h"
22#include "disk-io.h" 23#include "disk-io.h"
@@ -341,7 +342,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
341 342
342 if (need_lock) { 343 if (need_lock) {
343 btrfs_tree_read_lock(eb); 344 btrfs_tree_read_lock(eb);
344 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); 345 btrfs_set_lock_blocking_read(eb);
345 } 346 }
346 347
347 lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, 348 lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
@@ -1120,7 +1121,7 @@ void clean_tree_block(struct btrfs_fs_info *fs_info,
1120 -buf->len, 1121 -buf->len,
1121 fs_info->dirty_metadata_batch); 1122 fs_info->dirty_metadata_batch);
1122 /* ugh, clear_extent_buffer_dirty needs to lock the page */ 1123 /* ugh, clear_extent_buffer_dirty needs to lock the page */
1123 btrfs_set_lock_blocking(buf); 1124 btrfs_set_lock_blocking_write(buf);
1124 clear_extent_buffer_dirty(buf); 1125 clear_extent_buffer_dirty(buf);
1125 } 1126 }
1126 } 1127 }
@@ -1175,6 +1176,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
1175 INIT_LIST_HEAD(&root->delalloc_root); 1176 INIT_LIST_HEAD(&root->delalloc_root);
1176 INIT_LIST_HEAD(&root->ordered_extents); 1177 INIT_LIST_HEAD(&root->ordered_extents);
1177 INIT_LIST_HEAD(&root->ordered_root); 1178 INIT_LIST_HEAD(&root->ordered_root);
1179 INIT_LIST_HEAD(&root->reloc_dirty_list);
1178 INIT_LIST_HEAD(&root->logged_list[0]); 1180 INIT_LIST_HEAD(&root->logged_list[0]);
1179 INIT_LIST_HEAD(&root->logged_list[1]); 1181 INIT_LIST_HEAD(&root->logged_list[1]);
1180 spin_lock_init(&root->inode_lock); 1182 spin_lock_init(&root->inode_lock);
@@ -1218,6 +1220,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
1218 root->anon_dev = 0; 1220 root->anon_dev = 0;
1219 1221
1220 spin_lock_init(&root->root_item_lock); 1222 spin_lock_init(&root->root_item_lock);
1223 btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
1221} 1224}
1222 1225
1223static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, 1226static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
@@ -1258,10 +1261,17 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
1258 struct btrfs_root *tree_root = fs_info->tree_root; 1261 struct btrfs_root *tree_root = fs_info->tree_root;
1259 struct btrfs_root *root; 1262 struct btrfs_root *root;
1260 struct btrfs_key key; 1263 struct btrfs_key key;
1264 unsigned int nofs_flag;
1261 int ret = 0; 1265 int ret = 0;
1262 uuid_le uuid = NULL_UUID_LE; 1266 uuid_le uuid = NULL_UUID_LE;
1263 1267
1268 /*
1269 * We're holding a transaction handle, so use a NOFS memory allocation
1270 * context to avoid deadlock if reclaim happens.
1271 */
1272 nofs_flag = memalloc_nofs_save();
1264 root = btrfs_alloc_root(fs_info, GFP_KERNEL); 1273 root = btrfs_alloc_root(fs_info, GFP_KERNEL);
1274 memalloc_nofs_restore(nofs_flag);
1265 if (!root) 1275 if (!root)
1266 return ERR_PTR(-ENOMEM); 1276 return ERR_PTR(-ENOMEM);
1267 1277
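memalloc_nofs_save()/memalloc_nofs_restore() is the scoped way to forbid filesystem reclaim: any allocation inside the window is treated as GFP_NOFS even if it asks for GFP_KERNEL, so the GFP_KERNEL allocation under a transaction handle above cannot recurse into btrfs and deadlock. The pattern in isolation (a sketch of the idiom, not new kernel code):

#include <linux/sched/mm.h>

unsigned int nofs_flag;

/* We hold a transaction handle: reclaim must not re-enter the fs. */
nofs_flag = memalloc_nofs_save();
ptr = kmalloc(size, GFP_KERNEL);   /* implicitly degraded to GFP_NOFS */
memalloc_nofs_restore(nofs_flag);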
@@ -1707,9 +1717,7 @@ static int cleaner_kthread(void *arg)
1707 goto sleep; 1717 goto sleep;
1708 } 1718 }
1709 1719
1710 mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
1711 btrfs_run_delayed_iputs(fs_info); 1720 btrfs_run_delayed_iputs(fs_info);
1712 mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
1713 1721
1714 again = btrfs_clean_one_deleted_snapshot(root); 1722 again = btrfs_clean_one_deleted_snapshot(root);
1715 mutex_unlock(&fs_info->cleaner_mutex); 1723 mutex_unlock(&fs_info->cleaner_mutex);
@@ -2101,7 +2109,7 @@ static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
2101 atomic_set(&fs_info->scrubs_paused, 0); 2109 atomic_set(&fs_info->scrubs_paused, 0);
2102 atomic_set(&fs_info->scrub_cancel_req, 0); 2110 atomic_set(&fs_info->scrub_cancel_req, 0);
2103 init_waitqueue_head(&fs_info->scrub_pause_wait); 2111 init_waitqueue_head(&fs_info->scrub_pause_wait);
2104 fs_info->scrub_workers_refcnt = 0; 2112 refcount_set(&fs_info->scrub_workers_refcnt, 0);
2105} 2113}
2106 2114
2107static void btrfs_init_balance(struct btrfs_fs_info *fs_info) 2115static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
@@ -2666,7 +2674,6 @@ int open_ctree(struct super_block *sb,
2666 mutex_init(&fs_info->delete_unused_bgs_mutex); 2674 mutex_init(&fs_info->delete_unused_bgs_mutex);
2667 mutex_init(&fs_info->reloc_mutex); 2675 mutex_init(&fs_info->reloc_mutex);
2668 mutex_init(&fs_info->delalloc_root_mutex); 2676 mutex_init(&fs_info->delalloc_root_mutex);
2669 mutex_init(&fs_info->cleaner_delayed_iput_mutex);
2670 seqlock_init(&fs_info->profiles_lock); 2677 seqlock_init(&fs_info->profiles_lock);
2671 2678
2672 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); 2679 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
@@ -2688,6 +2695,7 @@ int open_ctree(struct super_block *sb,
2688 atomic_set(&fs_info->defrag_running, 0); 2695 atomic_set(&fs_info->defrag_running, 0);
2689 atomic_set(&fs_info->qgroup_op_seq, 0); 2696 atomic_set(&fs_info->qgroup_op_seq, 0);
2690 atomic_set(&fs_info->reada_works_cnt, 0); 2697 atomic_set(&fs_info->reada_works_cnt, 0);
2698 atomic_set(&fs_info->nr_delayed_iputs, 0);
2691 atomic64_set(&fs_info->tree_mod_seq, 0); 2699 atomic64_set(&fs_info->tree_mod_seq, 0);
2692 fs_info->sb = sb; 2700 fs_info->sb = sb;
2693 fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; 2701 fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
@@ -2765,6 +2773,7 @@ int open_ctree(struct super_block *sb,
2765 init_waitqueue_head(&fs_info->transaction_wait); 2773 init_waitqueue_head(&fs_info->transaction_wait);
2766 init_waitqueue_head(&fs_info->transaction_blocked_wait); 2774 init_waitqueue_head(&fs_info->transaction_blocked_wait);
2767 init_waitqueue_head(&fs_info->async_submit_wait); 2775 init_waitqueue_head(&fs_info->async_submit_wait);
2776 init_waitqueue_head(&fs_info->delayed_iputs_wait);
2768 2777
2769 INIT_LIST_HEAD(&fs_info->pinned_chunks); 2778 INIT_LIST_HEAD(&fs_info->pinned_chunks);
2770 2779
@@ -4238,16 +4247,9 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
4238 4247
4239 head = rb_entry(node, struct btrfs_delayed_ref_head, 4248 head = rb_entry(node, struct btrfs_delayed_ref_head,
4240 href_node); 4249 href_node);
4241 if (!mutex_trylock(&head->mutex)) { 4250 if (btrfs_delayed_ref_lock(delayed_refs, head))
4242 refcount_inc(&head->refs);
4243 spin_unlock(&delayed_refs->lock);
4244
4245 mutex_lock(&head->mutex);
4246 mutex_unlock(&head->mutex);
4247 btrfs_put_delayed_ref_head(head);
4248 spin_lock(&delayed_refs->lock);
4249 continue; 4251 continue;
4250 } 4252
4251 spin_lock(&head->lock); 4253 spin_lock(&head->lock);
4252 while ((n = rb_first_cached(&head->ref_tree)) != NULL) { 4254 while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
4253 ref = rb_entry(n, struct btrfs_delayed_ref_node, 4255 ref = rb_entry(n, struct btrfs_delayed_ref_node,
@@ -4263,12 +4265,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
4263 if (head->must_insert_reserved) 4265 if (head->must_insert_reserved)
4264 pin_bytes = true; 4266 pin_bytes = true;
4265 btrfs_free_delayed_extent_op(head->extent_op); 4267 btrfs_free_delayed_extent_op(head->extent_op);
4266 delayed_refs->num_heads--; 4268 btrfs_delete_ref_head(delayed_refs, head);
4267 if (head->processing == 0)
4268 delayed_refs->num_heads_ready--;
4269 atomic_dec(&delayed_refs->num_entries);
4270 rb_erase_cached(&head->href_node, &delayed_refs->href_root);
4271 RB_CLEAR_NODE(&head->href_node);
4272 spin_unlock(&head->lock); 4269 spin_unlock(&head->lock);
4273 spin_unlock(&delayed_refs->lock); 4270 spin_unlock(&delayed_refs->lock);
4274 mutex_unlock(&head->mutex); 4271 mutex_unlock(&head->mutex);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d81035b7ea7d..994f0cc41799 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2492,9 +2492,6 @@ void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
2492 } 2492 }
2493 } 2493 }
2494 2494
2495 /* Also free its reserved qgroup space */
2496 btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
2497 head->qgroup_reserved);
2498 btrfs_delayed_refs_rsv_release(fs_info, nr_items); 2495 btrfs_delayed_refs_rsv_release(fs_info, nr_items);
2499} 2496}
2500 2497
@@ -3013,8 +3010,7 @@ again:
3013 } 3010 }
3014 3011
3015 if (run_all) { 3012 if (run_all) {
3016 if (!list_empty(&trans->new_bgs)) 3013 btrfs_create_pending_block_groups(trans);
3017 btrfs_create_pending_block_groups(trans);
3018 3014
3019 spin_lock(&delayed_refs->lock); 3015 spin_lock(&delayed_refs->lock);
3020 node = rb_first_cached(&delayed_refs->href_root); 3016 node = rb_first_cached(&delayed_refs->href_root);
@@ -4280,10 +4276,14 @@ commit_trans:
4280 /* 4276 /*
4281 * The cleaner kthread might still be doing iput 4277 * The cleaner kthread might still be doing iput
4282 * operations. Wait for it to finish so that 4278 * operations. Wait for it to finish so that
4283 * more space is released. 4279 * more space is released. We don't need to
4280 * explicitly run the delayed iputs here because
4281 * the commit_transaction would have woken up
4282 * the cleaner.
4284 */ 4283 */
4285 mutex_lock(&fs_info->cleaner_delayed_iput_mutex); 4284 ret = btrfs_wait_on_delayed_iputs(fs_info);
4286 mutex_unlock(&fs_info->cleaner_delayed_iput_mutex); 4285 if (ret)
4286 return ret;
4287 goto again; 4287 goto again;
4288 } else { 4288 } else {
4289 btrfs_end_transaction(trans); 4289 btrfs_end_transaction(trans);
@@ -4396,7 +4396,6 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
4396static int should_alloc_chunk(struct btrfs_fs_info *fs_info, 4396static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
4397 struct btrfs_space_info *sinfo, int force) 4397 struct btrfs_space_info *sinfo, int force)
4398{ 4398{
4399 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4400 u64 bytes_used = btrfs_space_info_used(sinfo, false); 4399 u64 bytes_used = btrfs_space_info_used(sinfo, false);
4401 u64 thresh; 4400 u64 thresh;
4402 4401
@@ -4404,14 +4403,6 @@ static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
4404 return 1; 4403 return 1;
4405 4404
4406 /* 4405 /*
4407 * We need to take into account the global rsv because for all intents
4408 * and purposes it's used space. Don't worry about locking the
4409 * global_rsv, it doesn't change except when the transaction commits.
4410 */
4411 if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
4412 bytes_used += calc_global_rsv_need_space(global_rsv);
4413
4414 /*
4415 * in limited mode, we want to have some free space up to 4406 * in limited mode, we want to have some free space up to
4416 * about 1% of the FS size. 4407 * about 1% of the FS size.
4417 */ 4408 */
@@ -4741,7 +4732,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
4741 struct btrfs_space_info *space_info; 4732 struct btrfs_space_info *space_info;
4742 struct btrfs_trans_handle *trans; 4733 struct btrfs_trans_handle *trans;
4743 u64 delalloc_bytes; 4734 u64 delalloc_bytes;
4744 u64 max_reclaim; 4735 u64 async_pages;
4745 u64 items; 4736 u64 items;
4746 long time_left; 4737 long time_left;
4747 unsigned long nr_pages; 4738 unsigned long nr_pages;
@@ -4766,25 +4757,36 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
4766 4757
4767 loops = 0; 4758 loops = 0;
4768 while (delalloc_bytes && loops < 3) { 4759 while (delalloc_bytes && loops < 3) {
4769 max_reclaim = min(delalloc_bytes, to_reclaim); 4760 nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
4770 nr_pages = max_reclaim >> PAGE_SHIFT; 4761
4762 /*
4763 * Triggers inode writeback for up to nr_pages. This will invoke
4764 * ->writepages callback and trigger delalloc filling
4765 * (btrfs_run_delalloc_range()).
4766 */
4771 btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items); 4767 btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
4768
4772 /* 4769 /*
4773 * We need to wait for the async pages to actually start before 4770 * We need to wait for the compressed pages to start before
4774 * we do anything. 4771 * we continue.
4775 */ 4772 */
4776 max_reclaim = atomic_read(&fs_info->async_delalloc_pages); 4773 async_pages = atomic_read(&fs_info->async_delalloc_pages);
4777 if (!max_reclaim) 4774 if (!async_pages)
4778 goto skip_async; 4775 goto skip_async;
4779 4776
4780 if (max_reclaim <= nr_pages) 4777 /*
4781 max_reclaim = 0; 4778 * Calculate how many compressed pages we want to be written
4779 * before we continue. I.e if there are more async pages than we
4780 * require wait_event will wait until nr_pages are written.
4781 */
4782 if (async_pages <= nr_pages)
4783 async_pages = 0;
4782 else 4784 else
4783 max_reclaim -= nr_pages; 4785 async_pages -= nr_pages;
4784 4786
4785 wait_event(fs_info->async_submit_wait, 4787 wait_event(fs_info->async_submit_wait,
4786 atomic_read(&fs_info->async_delalloc_pages) <= 4788 atomic_read(&fs_info->async_delalloc_pages) <=
4787 (int)max_reclaim); 4789 (int)async_pages);
4788skip_async: 4790skip_async:
4789 spin_lock(&space_info->lock); 4791 spin_lock(&space_info->lock);
4790 if (list_empty(&space_info->tickets) && 4792 if (list_empty(&space_info->tickets) &&
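The max_reclaim→async_pages rename makes the throttle readable: after kicking writeback for nr_pages, the loop waits until the global count of in-flight async (compressed) delalloc pages has dropped by at least nr_pages. Reduced to its core (same variables as the hunk above):

u64 async_pages = atomic_read(&fs_info->async_delalloc_pages);

if (async_pages) {
	/* Target: the current backlog minus what we just submitted,
	 * i.e. wait until at least nr_pages of it has been written. */
	u64 target = (async_pages <= nr_pages) ? 0 : async_pages - nr_pages;

	wait_event(fs_info->async_submit_wait,
		   atomic_read(&fs_info->async_delalloc_pages) <= (int)target);
}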
@@ -4808,6 +4810,7 @@ skip_async:
4808} 4810}
4809 4811
4810struct reserve_ticket { 4812struct reserve_ticket {
4813 u64 orig_bytes;
4811 u64 bytes; 4814 u64 bytes;
4812 int error; 4815 int error;
4813 struct list_head list; 4816 struct list_head list;
@@ -4851,10 +4854,19 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
4851 if (!bytes_needed) 4854 if (!bytes_needed)
4852 return 0; 4855 return 0;
4853 4856
4854 /* See if there is enough pinned space to make this reservation */ 4857 trans = btrfs_join_transaction(fs_info->extent_root);
4855 if (__percpu_counter_compare(&space_info->total_bytes_pinned, 4858 if (IS_ERR(trans))
4856 bytes_needed, 4859 return PTR_ERR(trans);
4857 BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0) 4860
4861 /*
4862 * See if there is enough pinned space to make this reservation, or if
4863 * we have block groups that are going to be freed, allowing us to
4864 * possibly do a chunk allocation the next loop through.
4865 */
4866 if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
4867 __percpu_counter_compare(&space_info->total_bytes_pinned,
4868 bytes_needed,
4869 BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
4858 goto commit; 4870 goto commit;
4859 4871
4860 /* 4872 /*
@@ -4862,7 +4874,7 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
4862 * this reservation. 4874 * this reservation.
4863 */ 4875 */
4864 if (space_info != delayed_rsv->space_info) 4876 if (space_info != delayed_rsv->space_info)
4865 return -ENOSPC; 4877 goto enospc;
4866 4878
4867 spin_lock(&delayed_rsv->lock); 4879 spin_lock(&delayed_rsv->lock);
4868 reclaim_bytes += delayed_rsv->reserved; 4880 reclaim_bytes += delayed_rsv->reserved;
@@ -4877,16 +4889,14 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
4877 4889
4878 if (__percpu_counter_compare(&space_info->total_bytes_pinned, 4890 if (__percpu_counter_compare(&space_info->total_bytes_pinned,
4879 bytes_needed, 4891 bytes_needed,
4880 BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) { 4892 BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
4881 return -ENOSPC; 4893 goto enospc;
4882 }
4883 4894
4884commit: 4895commit:
4885 trans = btrfs_join_transaction(fs_info->extent_root);
4886 if (IS_ERR(trans))
4887 return -ENOSPC;
4888
4889 return btrfs_commit_transaction(trans); 4896 return btrfs_commit_transaction(trans);
4897enospc:
4898 btrfs_end_transaction(trans);
4899 return -ENOSPC;
4890} 4900}
4891 4901
4892/* 4902/*
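Joining the transaction before the pinned-space checks is what lets may_commit_transaction() test trans->transaction->flags for BTRFS_TRANS_HAVE_FREE_BGS, but it also means every failure path now owns a handle, hence the new enospc label that ends the transaction instead of returning -ENOSPC directly. The control flow, boiled down (worth_committing is an illustrative placeholder for the pinned/reserved checks):

trans = btrfs_join_transaction(fs_info->extent_root);
if (IS_ERR(trans))
	return PTR_ERR(trans);

if (!worth_committing)
	goto enospc;
return btrfs_commit_transaction(trans);

enospc:
	btrfs_end_transaction(trans);  /* don't leak the handle */
	return -ENOSPC;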
@@ -4939,6 +4949,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
4939 btrfs_end_transaction(trans); 4949 btrfs_end_transaction(trans);
4940 break; 4950 break;
4941 case ALLOC_CHUNK: 4951 case ALLOC_CHUNK:
4952 case ALLOC_CHUNK_FORCE:
4942 trans = btrfs_join_transaction(root); 4953 trans = btrfs_join_transaction(root);
4943 if (IS_ERR(trans)) { 4954 if (IS_ERR(trans)) {
4944 ret = PTR_ERR(trans); 4955 ret = PTR_ERR(trans);
@@ -4946,7 +4957,8 @@ static void flush_space(struct btrfs_fs_info *fs_info,
4946 } 4957 }
4947 ret = do_chunk_alloc(trans, 4958 ret = do_chunk_alloc(trans,
4948 btrfs_metadata_alloc_profile(fs_info), 4959 btrfs_metadata_alloc_profile(fs_info),
4949 CHUNK_ALLOC_NO_FORCE); 4960 (state == ALLOC_CHUNK) ?
4961 CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE);
4950 btrfs_end_transaction(trans); 4962 btrfs_end_transaction(trans);
4951 if (ret > 0 || ret == -ENOSPC) 4963 if (ret > 0 || ret == -ENOSPC)
4952 ret = 0; 4964 ret = 0;
@@ -4957,9 +4969,8 @@ static void flush_space(struct btrfs_fs_info *fs_info,
4957 * bunch of pinned space, so make sure we run the iputs before 4969 * bunch of pinned space, so make sure we run the iputs before
4958 * we do our pinned bytes check below. 4970 * we do our pinned bytes check below.
4959 */ 4971 */
4960 mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
4961 btrfs_run_delayed_iputs(fs_info); 4972 btrfs_run_delayed_iputs(fs_info);
4962 mutex_unlock(&fs_info->cleaner_delayed_iput_mutex); 4973 btrfs_wait_on_delayed_iputs(fs_info);
4963 4974
4964 ret = may_commit_transaction(fs_info, space_info); 4975 ret = may_commit_transaction(fs_info, space_info);
4965 break; 4976 break;
@@ -5030,7 +5041,7 @@ static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
5030 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); 5041 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
5031} 5042}
5032 5043
5033static void wake_all_tickets(struct list_head *head) 5044static bool wake_all_tickets(struct list_head *head)
5034{ 5045{
5035 struct reserve_ticket *ticket; 5046 struct reserve_ticket *ticket;
5036 5047
@@ -5039,7 +5050,10 @@ static void wake_all_tickets(struct list_head *head)
5039 list_del_init(&ticket->list); 5050 list_del_init(&ticket->list);
5040 ticket->error = -ENOSPC; 5051 ticket->error = -ENOSPC;
5041 wake_up(&ticket->wait); 5052 wake_up(&ticket->wait);
5053 if (ticket->bytes != ticket->orig_bytes)
5054 return true;
5042 } 5055 }
5056 return false;
5043} 5057}
5044 5058
5045/* 5059/*
@@ -5091,11 +5105,28 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
5091 commit_cycles--; 5105 commit_cycles--;
5092 } 5106 }
5093 5107
5108 /*
5109 * We don't want to force a chunk allocation until we've tried
5110 * pretty hard to reclaim space. Think of the case where we
5111 * freed up a bunch of space and so have a lot of pinned space
5112 * to reclaim. We would rather use that than possibly create a
5113 * underutilized metadata chunk. So if this is our first run
5114 * through the flushing state machine skip ALLOC_CHUNK_FORCE and
5115 * commit the transaction. If nothing has changed the next go
5116 * around then we can force a chunk allocation.
5117 */
5118 if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
5119 flush_state++;
5120
5094 if (flush_state > COMMIT_TRANS) { 5121 if (flush_state > COMMIT_TRANS) {
5095 commit_cycles++; 5122 commit_cycles++;
5096 if (commit_cycles > 2) { 5123 if (commit_cycles > 2) {
5097 wake_all_tickets(&space_info->tickets); 5124 if (wake_all_tickets(&space_info->tickets)) {
5098 space_info->flush = 0; 5125 flush_state = FLUSH_DELAYED_ITEMS_NR;
5126 commit_cycles--;
5127 } else {
5128 space_info->flush = 0;
5129 }
5099 } else { 5130 } else {
5100 flush_state = FLUSH_DELAYED_ITEMS_NR; 5131 flush_state = FLUSH_DELAYED_ITEMS_NR;
5101 } 5132 }
@@ -5109,12 +5140,18 @@ void btrfs_init_async_reclaim_work(struct work_struct *work)
5109 INIT_WORK(work, btrfs_async_reclaim_metadata_space); 5140 INIT_WORK(work, btrfs_async_reclaim_metadata_space);
5110} 5141}
5111 5142
5143static const enum btrfs_flush_state priority_flush_states[] = {
5144 FLUSH_DELAYED_ITEMS_NR,
5145 FLUSH_DELAYED_ITEMS,
5146 ALLOC_CHUNK,
5147};
5148
5112static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, 5149static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
5113 struct btrfs_space_info *space_info, 5150 struct btrfs_space_info *space_info,
5114 struct reserve_ticket *ticket) 5151 struct reserve_ticket *ticket)
5115{ 5152{
5116 u64 to_reclaim; 5153 u64 to_reclaim;
5117 int flush_state = FLUSH_DELAYED_ITEMS_NR; 5154 int flush_state;
5118 5155
5119 spin_lock(&space_info->lock); 5156 spin_lock(&space_info->lock);
5120 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, 5157 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
@@ -5125,8 +5162,10 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
5125 } 5162 }
5126 spin_unlock(&space_info->lock); 5163 spin_unlock(&space_info->lock);
5127 5164
5165 flush_state = 0;
5128 do { 5166 do {
5129 flush_space(fs_info, space_info, to_reclaim, flush_state); 5167 flush_space(fs_info, space_info, to_reclaim,
5168 priority_flush_states[flush_state]);
5130 flush_state++; 5169 flush_state++;
5131 spin_lock(&space_info->lock); 5170 spin_lock(&space_info->lock);
5132 if (ticket->bytes == 0) { 5171 if (ticket->bytes == 0) {
@@ -5134,23 +5173,16 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
5134 return; 5173 return;
5135 } 5174 }
5136 spin_unlock(&space_info->lock); 5175 spin_unlock(&space_info->lock);
5137 5176 } while (flush_state < ARRAY_SIZE(priority_flush_states));
5138 /*
5139 * Priority flushers can't wait on delalloc without
5140 * deadlocking.
5141 */
5142 if (flush_state == FLUSH_DELALLOC ||
5143 flush_state == FLUSH_DELALLOC_WAIT)
5144 flush_state = ALLOC_CHUNK;
5145 } while (flush_state < COMMIT_TRANS);
5146} 5177}
5147 5178
5148static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, 5179static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
5149 struct btrfs_space_info *space_info, 5180 struct btrfs_space_info *space_info,
5150 struct reserve_ticket *ticket, u64 orig_bytes) 5181 struct reserve_ticket *ticket)
5151 5182
5152{ 5183{
5153 DEFINE_WAIT(wait); 5184 DEFINE_WAIT(wait);
5185 u64 reclaim_bytes = 0;
5154 int ret = 0; 5186 int ret = 0;
5155 5187
5156 spin_lock(&space_info->lock); 5188 spin_lock(&space_info->lock);
@@ -5171,14 +5203,12 @@ static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
5171 ret = ticket->error; 5203 ret = ticket->error;
5172 if (!list_empty(&ticket->list)) 5204 if (!list_empty(&ticket->list))
5173 list_del_init(&ticket->list); 5205 list_del_init(&ticket->list);
5174 if (ticket->bytes && ticket->bytes < orig_bytes) { 5206 if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
5175 u64 num_bytes = orig_bytes - ticket->bytes; 5207 reclaim_bytes = ticket->orig_bytes - ticket->bytes;
5176 update_bytes_may_use(space_info, -num_bytes);
5177 trace_btrfs_space_reservation(fs_info, "space_info",
5178 space_info->flags, num_bytes, 0);
5179 }
5180 spin_unlock(&space_info->lock); 5208 spin_unlock(&space_info->lock);
5181 5209
5210 if (reclaim_bytes)
5211 space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);
5182 return ret; 5212 return ret;
5183} 5213}
5184 5214
@@ -5204,6 +5234,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
5204{ 5234{
5205 struct reserve_ticket ticket; 5235 struct reserve_ticket ticket;
5206 u64 used; 5236 u64 used;
5237 u64 reclaim_bytes = 0;
5207 int ret = 0; 5238 int ret = 0;
5208 5239
5209 ASSERT(orig_bytes); 5240 ASSERT(orig_bytes);
@@ -5239,6 +5270,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
5239 * the list and we will do our own flushing further down. 5270 * the list and we will do our own flushing further down.
5240 */ 5271 */
5241 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { 5272 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
5273 ticket.orig_bytes = orig_bytes;
5242 ticket.bytes = orig_bytes; 5274 ticket.bytes = orig_bytes;
5243 ticket.error = 0; 5275 ticket.error = 0;
5244 init_waitqueue_head(&ticket.wait); 5276 init_waitqueue_head(&ticket.wait);
@@ -5279,25 +5311,21 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
5279 return ret; 5311 return ret;
5280 5312
5281 if (flush == BTRFS_RESERVE_FLUSH_ALL) 5313 if (flush == BTRFS_RESERVE_FLUSH_ALL)
5282 return wait_reserve_ticket(fs_info, space_info, &ticket, 5314 return wait_reserve_ticket(fs_info, space_info, &ticket);
5283 orig_bytes);
5284 5315
5285 ret = 0; 5316 ret = 0;
5286 priority_reclaim_metadata_space(fs_info, space_info, &ticket); 5317 priority_reclaim_metadata_space(fs_info, space_info, &ticket);
5287 spin_lock(&space_info->lock); 5318 spin_lock(&space_info->lock);
5288 if (ticket.bytes) { 5319 if (ticket.bytes) {
5289 if (ticket.bytes < orig_bytes) { 5320 if (ticket.bytes < orig_bytes)
5290 u64 num_bytes = orig_bytes - ticket.bytes; 5321 reclaim_bytes = orig_bytes - ticket.bytes;
5291 update_bytes_may_use(space_info, -num_bytes);
5292 trace_btrfs_space_reservation(fs_info, "space_info",
5293 space_info->flags,
5294 num_bytes, 0);
5295
5296 }
5297 list_del_init(&ticket.list); 5322 list_del_init(&ticket.list);
5298 ret = -ENOSPC; 5323 ret = -ENOSPC;
5299 } 5324 }
5300 spin_unlock(&space_info->lock); 5325 spin_unlock(&space_info->lock);
5326
5327 if (reclaim_bytes)
5328 space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);
5301 ASSERT(list_empty(&ticket.list)); 5329 ASSERT(list_empty(&ticket.list));
5302 return ret; 5330 return ret;
5303} 5331}
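reserve_ticket growing orig_bytes lets both the waiting and the priority paths compute how much of a failed ticket had already been granted (orig_bytes - bytes) and hand it back through space_info_add_old_bytes(), where it can satisfy other tickets, rather than silently dropping it from bytes_may_use as the deleted trace/update pair used to. The common tail both callers now share:

u64 reclaim_bytes = 0;

spin_lock(&space_info->lock);
if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
	/* the flushers granted part of the request before we gave up */
	reclaim_bytes = ticket->orig_bytes - ticket->bytes;
spin_unlock(&space_info->lock);

if (reclaim_bytes)
	space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);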
@@ -5775,6 +5803,21 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
5775 return ret; 5803 return ret;
5776} 5804}
5777 5805
5806static void calc_refill_bytes(struct btrfs_block_rsv *block_rsv,
5807 u64 *metadata_bytes, u64 *qgroup_bytes)
5808{
5809 *metadata_bytes = 0;
5810 *qgroup_bytes = 0;
5811
5812 spin_lock(&block_rsv->lock);
5813 if (block_rsv->reserved < block_rsv->size)
5814 *metadata_bytes = block_rsv->size - block_rsv->reserved;
5815 if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
5816 *qgroup_bytes = block_rsv->qgroup_rsv_size -
5817 block_rsv->qgroup_rsv_reserved;
5818 spin_unlock(&block_rsv->lock);
5819}
5820
5778/** 5821/**
5779 * btrfs_inode_rsv_refill - refill the inode block rsv. 5822 * btrfs_inode_rsv_refill - refill the inode block rsv.
5780 * @inode - the inode we are refilling. 5823 * @inode - the inode we are refilling.
@@ -5790,25 +5833,42 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
5790{ 5833{
5791 struct btrfs_root *root = inode->root; 5834 struct btrfs_root *root = inode->root;
5792 struct btrfs_block_rsv *block_rsv = &inode->block_rsv; 5835 struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5793 u64 num_bytes = 0; 5836 u64 num_bytes, last = 0;
5794 u64 qgroup_num_bytes = 0; 5837 u64 qgroup_num_bytes;
5795 int ret = -ENOSPC; 5838 int ret = -ENOSPC;
5796 5839
5797 spin_lock(&block_rsv->lock); 5840 calc_refill_bytes(block_rsv, &num_bytes, &qgroup_num_bytes);
5798 if (block_rsv->reserved < block_rsv->size)
5799 num_bytes = block_rsv->size - block_rsv->reserved;
5800 if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
5801 qgroup_num_bytes = block_rsv->qgroup_rsv_size -
5802 block_rsv->qgroup_rsv_reserved;
5803 spin_unlock(&block_rsv->lock);
5804
5805 if (num_bytes == 0) 5841 if (num_bytes == 0)
5806 return 0; 5842 return 0;
5807 5843
5808 ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true); 5844 do {
5809 if (ret) 5845 ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes,
5810 return ret; 5846 true);
5811 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 5847 if (ret)
5848 return ret;
5849 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5850 if (ret) {
5851 btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
5852 last = num_bytes;
5853 /*
5854 * If we are fragmented we can end up with a lot of
5855 * outstanding extents which will make our size be much
5856 * larger than our reserved amount.
5857 *
5858 * If the reservation happens here, it might be very
5859 * big though not needed in the end, if the delalloc
5860 * flushing happens.
5861 *
5862 * If this is the case try and do the reserve again.
5863 */
5864 if (flush == BTRFS_RESERVE_FLUSH_ALL)
5865 calc_refill_bytes(block_rsv, &num_bytes,
5866 &qgroup_num_bytes);
5867 if (num_bytes == 0)
5868 return 0;
5869 }
5870 } while (ret && last != num_bytes);
5871
5812 if (!ret) { 5872 if (!ret) {
5813 block_rsv_add_bytes(block_rsv, num_bytes, false); 5873 block_rsv_add_bytes(block_rsv, num_bytes, false);
5814 trace_btrfs_space_reservation(root->fs_info, "delalloc", 5874 trace_btrfs_space_reservation(root->fs_info, "delalloc",
@@ -5818,8 +5878,7 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
5818 spin_lock(&block_rsv->lock); 5878 spin_lock(&block_rsv->lock);
5819 block_rsv->qgroup_rsv_reserved += qgroup_num_bytes; 5879 block_rsv->qgroup_rsv_reserved += qgroup_num_bytes;
5820 spin_unlock(&block_rsv->lock); 5880 spin_unlock(&block_rsv->lock);
5821 } else 5881 }
5822 btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
5823 return ret; 5882 return ret;
5824} 5883}
5825 5884
@@ -8066,6 +8125,15 @@ loop:
8066 return ret; 8125 return ret;
8067} 8126}
8068 8127
8128#define DUMP_BLOCK_RSV(fs_info, rsv_name) \
8129do { \
8130 struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \
8131 spin_lock(&__rsv->lock); \
8132 btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \
8133 __rsv->size, __rsv->reserved); \
8134 spin_unlock(&__rsv->lock); \
8135} while (0)
8136
8069static void dump_space_info(struct btrfs_fs_info *fs_info, 8137static void dump_space_info(struct btrfs_fs_info *fs_info,
8070 struct btrfs_space_info *info, u64 bytes, 8138 struct btrfs_space_info *info, u64 bytes,
8071 int dump_block_groups) 8139 int dump_block_groups)
@@ -8085,6 +8153,12 @@ static void dump_space_info(struct btrfs_fs_info *fs_info,
8085 info->bytes_readonly); 8153 info->bytes_readonly);
8086 spin_unlock(&info->lock); 8154 spin_unlock(&info->lock);
8087 8155
8156 DUMP_BLOCK_RSV(fs_info, global_block_rsv);
8157 DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
8158 DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
8159 DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
8160 DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
8161
8088 if (!dump_block_groups) 8162 if (!dump_block_groups)
8089 return; 8163 return;
8090 8164
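DUMP_BLOCK_RSV() leans on two standard macro idioms: #rsv_name stringizes the member so each log line labels itself, and the do { } while (0) wrapper makes the multi-statement body behave as one statement inside if/else. A standalone illustration of both (toy types, obviously not the btrfs structures):

#include <stdio.h>

struct rsv { unsigned long long size, reserved; };
struct info { struct rsv global_rsv, trans_rsv; };

#define DUMP_RSV(i, name) \
do { \
	struct rsv *r = &(i)->name; \
	/* #name turns the member into its own label */ \
	printf(#name ": size %llu reserved %llu\n", r->size, r->reserved); \
} while (0)

int main(void)
{
	struct info fi = { { 100, 40 }, { 8, 8 } };

	DUMP_RSV(&fi, global_rsv);
	DUMP_RSV(&fi, trans_rsv);
	return 0;
}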
@@ -8492,7 +8566,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
8492 clean_tree_block(fs_info, buf); 8566 clean_tree_block(fs_info, buf);
8493 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); 8567 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
8494 8568
8495 btrfs_set_lock_blocking(buf); 8569 btrfs_set_lock_blocking_write(buf);
8496 set_extent_buffer_uptodate(buf); 8570 set_extent_buffer_uptodate(buf);
8497 8571
8498 memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header)); 8572 memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
@@ -8917,7 +8991,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
8917 reada = 1; 8991 reada = 1;
8918 } 8992 }
8919 btrfs_tree_lock(next); 8993 btrfs_tree_lock(next);
8920 btrfs_set_lock_blocking(next); 8994 btrfs_set_lock_blocking_write(next);
8921 8995
8922 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1, 8996 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
8923 &wc->refs[level - 1], 8997 &wc->refs[level - 1],
@@ -8977,7 +9051,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
8977 return -EIO; 9051 return -EIO;
8978 } 9052 }
8979 btrfs_tree_lock(next); 9053 btrfs_tree_lock(next);
8980 btrfs_set_lock_blocking(next); 9054 btrfs_set_lock_blocking_write(next);
8981 } 9055 }
8982 9056
8983 level--; 9057 level--;
@@ -9089,7 +9163,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
9089 if (!path->locks[level]) { 9163 if (!path->locks[level]) {
9090 BUG_ON(level == 0); 9164 BUG_ON(level == 0);
9091 btrfs_tree_lock(eb); 9165 btrfs_tree_lock(eb);
9092 btrfs_set_lock_blocking(eb); 9166 btrfs_set_lock_blocking_write(eb);
9093 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9167 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9094 9168
9095 ret = btrfs_lookup_extent_info(trans, fs_info, 9169 ret = btrfs_lookup_extent_info(trans, fs_info,
@@ -9131,7 +9205,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
9131 if (!path->locks[level] && 9205 if (!path->locks[level] &&
9132 btrfs_header_generation(eb) == trans->transid) { 9206 btrfs_header_generation(eb) == trans->transid) {
9133 btrfs_tree_lock(eb); 9207 btrfs_tree_lock(eb);
9134 btrfs_set_lock_blocking(eb); 9208 btrfs_set_lock_blocking_write(eb);
9135 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9209 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9136 } 9210 }
9137 clean_tree_block(fs_info, eb); 9211 clean_tree_block(fs_info, eb);
@@ -9298,7 +9372,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
9298 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 9372 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
9299 level = btrfs_header_level(root->node); 9373 level = btrfs_header_level(root->node);
9300 path->nodes[level] = btrfs_lock_root_node(root); 9374 path->nodes[level] = btrfs_lock_root_node(root);
9301 btrfs_set_lock_blocking(path->nodes[level]); 9375 btrfs_set_lock_blocking_write(path->nodes[level]);
9302 path->slots[level] = 0; 9376 path->slots[level] = 0;
9303 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9377 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9304 memset(&wc->update_progress, 0, 9378 memset(&wc->update_progress, 0,
@@ -9328,7 +9402,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
9328 level = btrfs_header_level(root->node); 9402 level = btrfs_header_level(root->node);
9329 while (1) { 9403 while (1) {
9330 btrfs_tree_lock(path->nodes[level]); 9404 btrfs_tree_lock(path->nodes[level]);
9331 btrfs_set_lock_blocking(path->nodes[level]); 9405 btrfs_set_lock_blocking_write(path->nodes[level]);
9332 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9406 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9333 9407
9334 ret = btrfs_lookup_extent_info(trans, fs_info, 9408 ret = btrfs_lookup_extent_info(trans, fs_info,
@@ -9595,6 +9669,7 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
9595{ 9669{
9596 struct btrfs_space_info *sinfo = cache->space_info; 9670 struct btrfs_space_info *sinfo = cache->space_info;
9597 u64 num_bytes; 9671 u64 num_bytes;
9672 u64 sinfo_used;
9598 u64 min_allocable_bytes; 9673 u64 min_allocable_bytes;
9599 int ret = -ENOSPC; 9674 int ret = -ENOSPC;
9600 9675
@@ -9621,9 +9696,10 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
9621 9696
9622 num_bytes = cache->key.offset - cache->reserved - cache->pinned - 9697 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
9623 cache->bytes_super - btrfs_block_group_used(&cache->item); 9698 cache->bytes_super - btrfs_block_group_used(&cache->item);
9699 sinfo_used = btrfs_space_info_used(sinfo, true);
9624 9700
9625 if (btrfs_space_info_used(sinfo, true) + num_bytes + 9701 if (sinfo_used + num_bytes + min_allocable_bytes <=
9626 min_allocable_bytes <= sinfo->total_bytes) { 9702 sinfo->total_bytes) {
9627 sinfo->bytes_readonly += num_bytes; 9703 sinfo->bytes_readonly += num_bytes;
9628 cache->ro++; 9704 cache->ro++;
9629 list_add_tail(&cache->ro_list, &sinfo->ro_bgs); 9705 list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
@@ -9632,6 +9708,15 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
9632out: 9708out:
9633 spin_unlock(&cache->lock); 9709 spin_unlock(&cache->lock);
9634 spin_unlock(&sinfo->lock); 9710 spin_unlock(&sinfo->lock);
9711 if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
9712 btrfs_info(cache->fs_info,
9713 "unable to make block group %llu ro",
9714 cache->key.objectid);
9715 btrfs_info(cache->fs_info,
9716 "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
9717 sinfo_used, num_bytes, min_allocable_bytes);
9718 dump_space_info(cache->fs_info, cache->space_info, 0, 0);
9719 }
9635 return ret; 9720 return ret;
9636} 9721}
9637 9722
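For reference, the read-only eligibility test that the new debug output reports reduces to a single inequality; caching btrfs_space_info_used() in sinfo_used guarantees the printed value is the one the check actually used. A minimal sketch of the predicate, assuming all quantities are byte counts (names are illustrative):

    /* Can this block group be flipped read-only without overcommitting
     * the owning space_info? num_bytes is the free space inside the
     * block group that read-only status takes away from the allocator. */
    static bool can_set_readonly(u64 sinfo_used, u64 num_bytes,
                                 u64 min_allocable, u64 total_bytes)
    {
            return sinfo_used + num_bytes + min_allocable <= total_bytes;
    }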
@@ -10781,13 +10866,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
10781 } 10866 }
10782 10867
10783 spin_lock(&trans->transaction->dirty_bgs_lock); 10868 spin_lock(&trans->transaction->dirty_bgs_lock);
10784 if (!list_empty(&block_group->dirty_list)) { 10869 WARN_ON(!list_empty(&block_group->dirty_list));
10785 WARN_ON(1); 10870 WARN_ON(!list_empty(&block_group->io_list));
10786 }
10787 if (!list_empty(&block_group->io_list)) {
10788 WARN_ON(1);
10789 }
10790 spin_unlock(&trans->transaction->dirty_bgs_lock); 10871 spin_unlock(&trans->transaction->dirty_bgs_lock);
10872
10791 btrfs_remove_free_space_cache(block_group); 10873 btrfs_remove_free_space_cache(block_group);
10792 10874
10793 spin_lock(&block_group->space_info->lock); 10875 spin_lock(&block_group->space_info->lock);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 52abe4082680..ca259c75bbcd 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -147,7 +147,38 @@ static int add_extent_changeset(struct extent_state *state, unsigned bits,
147 return ret; 147 return ret;
148} 148}
149 149
150static void flush_write_bio(struct extent_page_data *epd); 150static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
151 unsigned long bio_flags)
152{
153 blk_status_t ret = 0;
154 struct bio_vec *bvec = bio_last_bvec_all(bio);
155 struct page *page = bvec->bv_page;
156 struct extent_io_tree *tree = bio->bi_private;
157 u64 start;
158
159 start = page_offset(page) + bvec->bv_offset;
160
161 bio->bi_private = NULL;
162
163 if (tree->ops)
164 ret = tree->ops->submit_bio_hook(tree->private_data, bio,
165 mirror_num, bio_flags, start);
166 else
167 btrfsic_submit_bio(bio);
168
169 return blk_status_to_errno(ret);
170}
171
172static void flush_write_bio(struct extent_page_data *epd)
173{
174 if (epd->bio) {
175 int ret;
176
177 ret = submit_one_bio(epd->bio, 0, 0);
178 BUG_ON(ret < 0); /* -ENOMEM */
179 epd->bio = NULL;
180 }
181}
151 182
152int __init extent_io_init(void) 183int __init extent_io_init(void)
153{ 184{
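Hoisting submit_one_bio() and flush_write_bio() above their first users is what lets the static forward declaration of flush_write_bio() go away. The __must_check annotation on submit_one_bio() makes ignoring a submission error a compile-time warning; a minimal illustration with hypothetical names:

    static int __must_check do_submit(void)
    {
            return 0;       /* pretend the bio was queued successfully */
    }

    static void caller(void)
    {
            int ret = do_submit();  /* dropping the result would warn */

            if (ret < 0)
                    pr_err("bio submission failed: %d\n", ret);
    }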
@@ -281,8 +312,8 @@ do_insert:
281} 312}
282 313
283static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, 314static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
284 struct rb_node **prev_ret,
285 struct rb_node **next_ret, 315 struct rb_node **next_ret,
316 struct rb_node **prev_ret,
286 struct rb_node ***p_ret, 317 struct rb_node ***p_ret,
287 struct rb_node **parent_ret) 318 struct rb_node **parent_ret)
288{ 319{
@@ -311,23 +342,23 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
311 if (parent_ret) 342 if (parent_ret)
312 *parent_ret = prev; 343 *parent_ret = prev;
313 344
314 if (prev_ret) { 345 if (next_ret) {
315 orig_prev = prev; 346 orig_prev = prev;
316 while (prev && offset > prev_entry->end) { 347 while (prev && offset > prev_entry->end) {
317 prev = rb_next(prev); 348 prev = rb_next(prev);
318 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 349 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
319 } 350 }
320 *prev_ret = prev; 351 *next_ret = prev;
321 prev = orig_prev; 352 prev = orig_prev;
322 } 353 }
323 354
324 if (next_ret) { 355 if (prev_ret) {
325 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 356 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
326 while (prev && offset < prev_entry->start) { 357 while (prev && offset < prev_entry->start) {
327 prev = rb_prev(prev); 358 prev = rb_prev(prev);
328 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 359 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
329 } 360 }
330 *next_ret = prev; 361 *prev_ret = prev;
331 } 362 }
332 return NULL; 363 return NULL;
333} 364}
@@ -338,12 +369,12 @@ tree_search_for_insert(struct extent_io_tree *tree,
338 struct rb_node ***p_ret, 369 struct rb_node ***p_ret,
339 struct rb_node **parent_ret) 370 struct rb_node **parent_ret)
340{ 371{
341 struct rb_node *prev = NULL; 372 struct rb_node *next = NULL;
342 struct rb_node *ret; 373 struct rb_node *ret;
343 374
344 ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret); 375 ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret);
345 if (!ret) 376 if (!ret)
346 return prev; 377 return next;
347 return ret; 378 return ret;
348} 379}
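After the rename, __etree_search() fills next_ret with the first entry that ends at or after the offset when there is no exact hit, and tree_search_for_insert() falls back to that successor. A generic sketch of the find-or-successor search on a range-keyed rbtree (struct and helper names are illustrative, not the btrfs ones):

    struct tree_entry {
            u64 start;
            u64 end;
            struct rb_node rb_node;
    };

    static struct tree_entry *find_or_next(struct rb_root *root, u64 offset)
    {
            struct rb_node *n = root->rb_node;
            struct tree_entry *entry = NULL;

            while (n) {
                    entry = rb_entry(n, struct tree_entry, rb_node);
                    if (offset < entry->start)
                            n = n->rb_left;
                    else if (offset > entry->end)
                            n = n->rb_right;
                    else
                            return entry;   /* exact hit */
            }
            /* walk forward to the first entry ending at/after offset */
            while (entry && offset > entry->end)
                    entry = rb_entry_safe(rb_next(&entry->rb_node),
                                          struct tree_entry, rb_node);
            return entry;                   /* successor, or NULL */
    }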
349 380
@@ -585,7 +616,6 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
585 616
586 if (delete) 617 if (delete)
587 bits |= ~EXTENT_CTLBITS; 618 bits |= ~EXTENT_CTLBITS;
588 bits |= EXTENT_FIRST_DELALLOC;
589 619
590 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 620 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
591 clear = 1; 621 clear = 1;
@@ -850,7 +880,6 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
850 880
851 btrfs_debug_check_extent_io_range(tree, start, end); 881 btrfs_debug_check_extent_io_range(tree, start, end);
852 882
853 bits |= EXTENT_FIRST_DELALLOC;
854again: 883again:
855 if (!prealloc && gfpflags_allow_blocking(mask)) { 884 if (!prealloc && gfpflags_allow_blocking(mask)) {
856 /* 885 /*
@@ -2692,28 +2721,6 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
2692 return bio; 2721 return bio;
2693} 2722}
2694 2723
2695static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
2696 unsigned long bio_flags)
2697{
2698 blk_status_t ret = 0;
2699 struct bio_vec *bvec = bio_last_bvec_all(bio);
2700 struct page *page = bvec->bv_page;
2701 struct extent_io_tree *tree = bio->bi_private;
2702 u64 start;
2703
2704 start = page_offset(page) + bvec->bv_offset;
2705
2706 bio->bi_private = NULL;
2707
2708 if (tree->ops)
2709 ret = tree->ops->submit_bio_hook(tree->private_data, bio,
2710 mirror_num, bio_flags, start);
2711 else
2712 btrfsic_submit_bio(bio);
2713
2714 return blk_status_to_errno(ret);
2715}
2716
2717/* 2724/*
2718 * @opf: bio REQ_OP_* and REQ_* flags as one value 2725 * @opf: bio REQ_OP_* and REQ_* flags as one value
2719 * @tree: tree so we can call our merge_bio hook 2726 * @tree: tree so we can call our merge_bio hook
@@ -4007,17 +4014,6 @@ retry:
4007 return ret; 4014 return ret;
4008} 4015}
4009 4016
4010static void flush_write_bio(struct extent_page_data *epd)
4011{
4012 if (epd->bio) {
4013 int ret;
4014
4015 ret = submit_one_bio(epd->bio, 0, 0);
4016 BUG_ON(ret < 0); /* -ENOMEM */
4017 epd->bio = NULL;
4018 }
4019}
4020
4021int extent_write_full_page(struct page *page, struct writeback_control *wbc) 4017int extent_write_full_page(struct page *page, struct writeback_control *wbc)
4022{ 4018{
4023 int ret; 4019 int ret;
@@ -4259,8 +4255,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
4259 if (len == 0) 4255 if (len == 0)
4260 break; 4256 break;
4261 len = ALIGN(len, sectorsize); 4257 len = ALIGN(len, sectorsize);
4262 em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0, offset, 4258 em = btrfs_get_extent_fiemap(BTRFS_I(inode), offset, len);
4263 len, 0);
4264 if (IS_ERR_OR_NULL(em)) 4259 if (IS_ERR_OR_NULL(em))
4265 return em; 4260 return em;
4266 4261
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 9673be3f3d1f..08749e0b9c32 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -18,17 +18,16 @@
18#define EXTENT_BOUNDARY (1U << 9) 18#define EXTENT_BOUNDARY (1U << 9)
19#define EXTENT_NODATASUM (1U << 10) 19#define EXTENT_NODATASUM (1U << 10)
20#define EXTENT_CLEAR_META_RESV (1U << 11) 20#define EXTENT_CLEAR_META_RESV (1U << 11)
21#define EXTENT_FIRST_DELALLOC (1U << 12) 21#define EXTENT_NEED_WAIT (1U << 12)
22#define EXTENT_NEED_WAIT (1U << 13) 22#define EXTENT_DAMAGED (1U << 13)
23#define EXTENT_DAMAGED (1U << 14) 23#define EXTENT_NORESERVE (1U << 14)
24#define EXTENT_NORESERVE (1U << 15) 24#define EXTENT_QGROUP_RESERVED (1U << 15)
25#define EXTENT_QGROUP_RESERVED (1U << 16) 25#define EXTENT_CLEAR_DATA_RESV (1U << 16)
26#define EXTENT_CLEAR_DATA_RESV (1U << 17) 26#define EXTENT_DELALLOC_NEW (1U << 17)
27#define EXTENT_DELALLOC_NEW (1U << 18)
28#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 27#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
29#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \ 28#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \
30 EXTENT_CLEAR_DATA_RESV) 29 EXTENT_CLEAR_DATA_RESV)
31#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) 30#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING)
32 31
33/* 32/*
34 * flags for bio submission. The high bits indicate the compression 33 * flags for bio submission. The high bits indicate the compression
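Dropping EXTENT_FIRST_DELALLOC from the middle of the series means every later flag shifts down one bit; that is safe because these bits live only in memory, never on disk. One way to make such deletions renumber automatically is to derive the masks from an enum, sketched below (not the style btrfs uses, just an illustration):

    enum {
            ENR_LOCKED_BIT,
            ENR_WRITEBACK_BIT,
            ENR_BOUNDARY_BIT,
            /* removing an enumerator renumbers everything after it */
    };

    #define ENR_LOCKED      (1U << ENR_LOCKED_BIT)
    #define ENR_WRITEBACK   (1U << ENR_WRITEBACK_BIT)
    #define ENR_BOUNDARY    (1U << ENR_BOUNDARY_BIT)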
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index a042a193c120..928f729c55ba 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -210,6 +210,9 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
210 if (!list_empty(&prev->list) || !list_empty(&next->list)) 210 if (!list_empty(&prev->list) || !list_empty(&next->list))
211 return 0; 211 return 0;
212 212
213 ASSERT(next->block_start != EXTENT_MAP_DELALLOC &&
214 prev->block_start != EXTENT_MAP_DELALLOC);
215
213 if (extent_map_end(prev) == next->start && 216 if (extent_map_end(prev) == next->start &&
214 prev->flags == next->flags && 217 prev->flags == next->flags &&
215 prev->bdev == next->bdev && 218 prev->bdev == next->bdev &&
@@ -217,8 +220,6 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
217 prev->block_start == EXTENT_MAP_HOLE) || 220 prev->block_start == EXTENT_MAP_HOLE) ||
218 (next->block_start == EXTENT_MAP_INLINE && 221 (next->block_start == EXTENT_MAP_INLINE &&
219 prev->block_start == EXTENT_MAP_INLINE) || 222 prev->block_start == EXTENT_MAP_INLINE) ||
220 (next->block_start == EXTENT_MAP_DELALLOC &&
221 prev->block_start == EXTENT_MAP_DELALLOC) ||
222 (next->block_start < EXTENT_MAP_LAST_BYTE - 1 && 223 (next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
223 next->block_start == extent_map_block_end(prev)))) { 224 next->block_start == extent_map_block_end(prev)))) {
224 return 1; 225 return 1;
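With the delalloc case gone from the merge test, the new ASSERT records the invariant that EXTENT_MAP_DELALLOC maps are created only transiently by fiemap lookups and never enter the extent map tree, so merging cannot see one. Expressed as a predicate (illustrative helper name; ASSERT() compiles out unless CONFIG_BTRFS_ASSERT is set):

    static inline bool is_fiemap_only(const struct extent_map *em)
    {
            return em->block_start == EXTENT_MAP_DELALLOC;
    }

    /* mergable_maps() may assume:
     *      !is_fiemap_only(prev) && !is_fiemap_only(next)          */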
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index ef05a0121652..473f039fcd7c 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -9,6 +9,7 @@
9#define EXTENT_MAP_LAST_BYTE ((u64)-4) 9#define EXTENT_MAP_LAST_BYTE ((u64)-4)
10#define EXTENT_MAP_HOLE ((u64)-3) 10#define EXTENT_MAP_HOLE ((u64)-3)
11#define EXTENT_MAP_INLINE ((u64)-2) 11#define EXTENT_MAP_INLINE ((u64)-2)
12/* used only during fiemap calls */
12#define EXTENT_MAP_DELALLOC ((u64)-1) 13#define EXTENT_MAP_DELALLOC ((u64)-1)
13 14
14/* bits for the extent_map::flags field */ 15/* bits for the extent_map::flags field */
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index d38dc8c31533..34fe8a58b0e9 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3218,8 +3218,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
3218 &cached_state); 3218 &cached_state);
3219 3219
3220 while (start < inode->i_size) { 3220 while (start < inode->i_size) {
3221 em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0, 3221 em = btrfs_get_extent_fiemap(BTRFS_I(inode), start, len);
3222 start, len, 0);
3223 if (IS_ERR(em)) { 3222 if (IS_ERR(em)) {
3224 ret = PTR_ERR(em); 3223 ret = PTR_ERR(em);
3225 em = NULL; 3224 em = NULL;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5c349667c761..3f180b857e20 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -453,7 +453,6 @@ static noinline void compress_file_range(struct inode *inode,
453 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 453 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
454 u64 blocksize = fs_info->sectorsize; 454 u64 blocksize = fs_info->sectorsize;
455 u64 actual_end; 455 u64 actual_end;
456 u64 isize = i_size_read(inode);
457 int ret = 0; 456 int ret = 0;
458 struct page **pages = NULL; 457 struct page **pages = NULL;
459 unsigned long nr_pages; 458 unsigned long nr_pages;
@@ -467,7 +466,7 @@ static noinline void compress_file_range(struct inode *inode,
467 inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1, 466 inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
468 SZ_16K); 467 SZ_16K);
469 468
470 actual_end = min_t(u64, isize, end + 1); 469 actual_end = min_t(u64, i_size_read(inode), end + 1);
471again: 470again:
472 will_compress = 0; 471 will_compress = 0;
473 nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; 472 nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
@@ -714,9 +713,9 @@ static void free_async_extent_pages(struct async_extent *async_extent)
714 * queued. We walk all the async extents created by compress_file_range 713 * queued. We walk all the async extents created by compress_file_range
715 * and send them down to the disk. 714 * and send them down to the disk.
716 */ 715 */
717static noinline void submit_compressed_extents(struct inode *inode, 716static noinline void submit_compressed_extents(struct async_cow *async_cow)
718 struct async_cow *async_cow)
719{ 717{
718 struct inode *inode = async_cow->inode;
720 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 719 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
721 struct async_extent *async_extent; 720 struct async_extent *async_extent;
722 u64 alloc_hint = 0; 721 u64 alloc_hint = 0;
@@ -1166,8 +1165,14 @@ static noinline void async_cow_submit(struct btrfs_work *work)
1166 5 * SZ_1M) 1165 5 * SZ_1M)
1167 cond_wake_up_nomb(&fs_info->async_submit_wait); 1166 cond_wake_up_nomb(&fs_info->async_submit_wait);
1168 1167
1168 /*
1169 * ->inode could be NULL if async_cow_start has failed to compress,
1170 * in which case we don't have anything to submit, yet we need to
1171 * always adjust ->async_delalloc_pages as it's paired with the init
1172 * happening in cow_file_range_async
1173 */
1169 if (async_cow->inode) 1174 if (async_cow->inode)
1170 submit_compressed_extents(async_cow->inode, async_cow); 1175 submit_compressed_extents(async_cow);
1171} 1176}
1172 1177
1173static noinline void async_cow_free(struct btrfs_work *work) 1178static noinline void async_cow_free(struct btrfs_work *work)
@@ -1194,7 +1199,12 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1194 while (start < end) { 1199 while (start < end) {
1195 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); 1200 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
1196 BUG_ON(!async_cow); /* -ENOMEM */ 1201 BUG_ON(!async_cow); /* -ENOMEM */
1197 async_cow->inode = igrab(inode); 1202 /*
1203 * igrab is called higher up in the call chain, take only the
1204 * lightweight reference for the callback lifetime
1205 */
1206 ihold(inode);
1207 async_cow->inode = inode;
1198 async_cow->fs_info = fs_info; 1208 async_cow->fs_info = fs_info;
1199 async_cow->locked_page = locked_page; 1209 async_cow->locked_page = locked_page;
1200 async_cow->start = start; 1210 async_cow->start = start;
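The switch from igrab() to ihold() changes the reference-taking contract: igrab() copes with an inode that is being evicted and returns NULL in that case, while ihold() is an unconditional i_count bump that is only legal when the caller already owns a reference, which cow_file_range_async's callers guarantee here. A sketch of the pattern, with an illustrative wrapper name:

    /* Caller must already hold a reference on 'inode'; the one taken
     * here is dropped by the async worker with iput(). */
    static struct inode *take_ref_for_async(struct inode *inode)
    {
            ihold(inode);
            return inode;
    }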
@@ -1586,11 +1596,10 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
1586 * Function to process delayed allocation (create CoW) for ranges which are 1596 * Function to process delayed allocation (create CoW) for ranges which are
1587 * being touched for the first time. 1597 * being touched for the first time.
1588 */ 1598 */
1589int btrfs_run_delalloc_range(void *private_data, struct page *locked_page, 1599int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
1590 u64 start, u64 end, int *page_started, unsigned long *nr_written, 1600 u64 start, u64 end, int *page_started, unsigned long *nr_written,
1591 struct writeback_control *wbc) 1601 struct writeback_control *wbc)
1592{ 1602{
1593 struct inode *inode = private_data;
1594 int ret; 1603 int ret;
1595 int force_cow = need_force_cow(inode, start, end); 1604 int force_cow = need_force_cow(inode, start, end);
1596 unsigned int write_flags = wbc_to_write_flags(wbc); 1605 unsigned int write_flags = wbc_to_write_flags(wbc);
@@ -3247,6 +3256,7 @@ void btrfs_add_delayed_iput(struct inode *inode)
3247 if (atomic_add_unless(&inode->i_count, -1, 1)) 3256 if (atomic_add_unless(&inode->i_count, -1, 1))
3248 return; 3257 return;
3249 3258
3259 atomic_inc(&fs_info->nr_delayed_iputs);
3250 spin_lock(&fs_info->delayed_iput_lock); 3260 spin_lock(&fs_info->delayed_iput_lock);
3251 ASSERT(list_empty(&binode->delayed_iput)); 3261 ASSERT(list_empty(&binode->delayed_iput));
3252 list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs); 3262 list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
@@ -3267,11 +3277,32 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
3267 list_del_init(&inode->delayed_iput); 3277 list_del_init(&inode->delayed_iput);
3268 spin_unlock(&fs_info->delayed_iput_lock); 3278 spin_unlock(&fs_info->delayed_iput_lock);
3269 iput(&inode->vfs_inode); 3279 iput(&inode->vfs_inode);
3280 if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
3281 wake_up(&fs_info->delayed_iputs_wait);
3270 spin_lock(&fs_info->delayed_iput_lock); 3282 spin_lock(&fs_info->delayed_iput_lock);
3271 } 3283 }
3272 spin_unlock(&fs_info->delayed_iput_lock); 3284 spin_unlock(&fs_info->delayed_iput_lock);
3273} 3285}
3274 3286
3287/**
3288 * btrfs_wait_on_delayed_iputs - wait on the delayed iputs to be done running
3289 * @fs_info - the fs_info for this fs
3290 * @return - EINTR if we were killed, 0 if nothing's pending
3291 *
3292 * This will wait on any delayed iputs that are currently running with KILLABLE
3293 * set. Once they are all done running we will return, unless we are killed in
3294 * which case we return EINTR. This helps in user operations like fallocate etc
3295 * that might get blocked on the iputs.
3296 */
3297int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
3298{
3299 int ret = wait_event_killable(fs_info->delayed_iputs_wait,
3300 atomic_read(&fs_info->nr_delayed_iputs) == 0);
3301 if (ret)
3302 return -EINTR;
3303 return 0;
3304}
3305
3275/* 3306/*
3276 * This creates an orphan entry for the given inode in case something goes wrong 3307 * This creates an orphan entry for the given inode in case something goes wrong
3277 * in the middle of an unlink. 3308 * in the middle of an unlink.
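The wait machinery added above is the classic counted-waiter pattern: a counter incremented when a delayed iput is queued, decremented after each iput runs, with waiters woken when it hits zero. wait_event_killable() returns nonzero only if a fatal signal arrived, which the caller maps to -EINTR. A self-contained sketch with illustrative names:

    static atomic_t pending = ATOMIC_INIT(0);
    static DECLARE_WAIT_QUEUE_HEAD(pending_wait);

    static void work_queued(void)
    {
            atomic_inc(&pending);
    }

    static void work_done(void)
    {
            /* atomic_dec_and_test implies a barrier before the wakeup */
            if (atomic_dec_and_test(&pending))
                    wake_up(&pending_wait);
    }

    static int wait_for_all(void)
    {
            if (wait_event_killable(pending_wait,
                                    atomic_read(&pending) == 0))
                    return -EINTR;  /* fatal signal while waiting */
            return 0;
    }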
@@ -5262,13 +5293,15 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
5262{ 5293{
5263 struct btrfs_fs_info *fs_info = root->fs_info; 5294 struct btrfs_fs_info *fs_info = root->fs_info;
5264 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5295 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5296 u64 delayed_refs_extra = btrfs_calc_trans_metadata_size(fs_info, 1);
5265 int failures = 0; 5297 int failures = 0;
5266 5298
5267 for (;;) { 5299 for (;;) {
5268 struct btrfs_trans_handle *trans; 5300 struct btrfs_trans_handle *trans;
5269 int ret; 5301 int ret;
5270 5302
5271 ret = btrfs_block_rsv_refill(root, rsv, rsv->size, 5303 ret = btrfs_block_rsv_refill(root, rsv,
5304 rsv->size + delayed_refs_extra,
5272 BTRFS_RESERVE_FLUSH_LIMIT); 5305 BTRFS_RESERVE_FLUSH_LIMIT);
5273 5306
5274 if (ret && ++failures > 2) { 5307 if (ret && ++failures > 2) {
@@ -5277,9 +5310,28 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
5277 return ERR_PTR(-ENOSPC); 5310 return ERR_PTR(-ENOSPC);
5278 } 5311 }
5279 5312
5313 /*
5314 * Evict can generate a large amount of delayed refs without
5315 * having a way to add space back since we exhaust our temporary
5316 * block rsv. We aren't allowed to do FLUSH_ALL in this case
5317 * because we could deadlock with so many things in the flushing
5318 * code, so we have to try and hold some extra space to
5319 * compensate for our delayed ref generation. If we can't get
5320 * that space then we need to see if we can steal our minimum from
5321 * the global reserve. We will be ratelimited by the amount of
5322 * space we have for the delayed refs rsv, so we'll end up
5323 * committing and trying again.
5324 */
5280 trans = btrfs_join_transaction(root); 5325 trans = btrfs_join_transaction(root);
5281 if (IS_ERR(trans) || !ret) 5326 if (IS_ERR(trans) || !ret) {
5327 if (!IS_ERR(trans)) {
5328 trans->block_rsv = &fs_info->trans_block_rsv;
5329 trans->bytes_reserved = delayed_refs_extra;
5330 btrfs_block_rsv_migrate(rsv, trans->block_rsv,
5331 delayed_refs_extra, 1);
5332 }
5282 return trans; 5333 return trans;
5334 }
5283 5335
5284 /* 5336 /*
5285 * Try to steal from the global reserve if there is space for 5337 * Try to steal from the global reserve if there is space for
@@ -6731,7 +6783,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
6731 u64 extent_start = 0; 6783 u64 extent_start = 0;
6732 u64 extent_end = 0; 6784 u64 extent_end = 0;
6733 u64 objectid = btrfs_ino(inode); 6785 u64 objectid = btrfs_ino(inode);
6734 u32 found_type; 6786 u8 extent_type;
6735 struct btrfs_path *path = NULL; 6787 struct btrfs_path *path = NULL;
6736 struct btrfs_root *root = inode->root; 6788 struct btrfs_root *root = inode->root;
6737 struct btrfs_file_extent_item *item; 6789 struct btrfs_file_extent_item *item;
@@ -6786,9 +6838,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
6786 if (ret < 0) { 6838 if (ret < 0) {
6787 err = ret; 6839 err = ret;
6788 goto out; 6840 goto out;
6789 } 6841 } else if (ret > 0) {
6790
6791 if (ret != 0) {
6792 if (path->slots[0] == 0) 6842 if (path->slots[0] == 0)
6793 goto not_found; 6843 goto not_found;
6794 path->slots[0]--; 6844 path->slots[0]--;
@@ -6797,11 +6847,9 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
6797 leaf = path->nodes[0]; 6847 leaf = path->nodes[0];
6798 item = btrfs_item_ptr(leaf, path->slots[0], 6848 item = btrfs_item_ptr(leaf, path->slots[0],
6799 struct btrfs_file_extent_item); 6849 struct btrfs_file_extent_item);
6800 /* are we inside the extent that was found? */
6801 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 6850 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6802 found_type = found_key.type;
6803 if (found_key.objectid != objectid || 6851 if (found_key.objectid != objectid ||
6804 found_type != BTRFS_EXTENT_DATA_KEY) { 6852 found_key.type != BTRFS_EXTENT_DATA_KEY) {
6805 /* 6853 /*
6806 * If we backup past the first extent we want to move forward 6854 * If we backup past the first extent we want to move forward
6807 * and see if there is an extent in front of us, otherwise we'll 6855 * and see if there is an extent in front of us, otherwise we'll
@@ -6812,16 +6860,16 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
6812 goto next; 6860 goto next;
6813 } 6861 }
6814 6862
6815 found_type = btrfs_file_extent_type(leaf, item); 6863 extent_type = btrfs_file_extent_type(leaf, item);
6816 extent_start = found_key.offset; 6864 extent_start = found_key.offset;
6817 if (found_type == BTRFS_FILE_EXTENT_REG || 6865 if (extent_type == BTRFS_FILE_EXTENT_REG ||
6818 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 6866 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
6819 extent_end = extent_start + 6867 extent_end = extent_start +
6820 btrfs_file_extent_num_bytes(leaf, item); 6868 btrfs_file_extent_num_bytes(leaf, item);
6821 6869
6822 trace_btrfs_get_extent_show_fi_regular(inode, leaf, item, 6870 trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
6823 extent_start); 6871 extent_start);
6824 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 6872 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
6825 size_t size; 6873 size_t size;
6826 6874
6827 size = btrfs_file_extent_ram_bytes(leaf, item); 6875 size = btrfs_file_extent_ram_bytes(leaf, item);
@@ -6840,9 +6888,9 @@ next:
6840 if (ret < 0) { 6888 if (ret < 0) {
6841 err = ret; 6889 err = ret;
6842 goto out; 6890 goto out;
6843 } 6891 } else if (ret > 0) {
6844 if (ret > 0)
6845 goto not_found; 6892 goto not_found;
6893 }
6846 leaf = path->nodes[0]; 6894 leaf = path->nodes[0];
6847 } 6895 }
6848 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 6896 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
@@ -6853,19 +6901,22 @@ next:
6853 goto not_found; 6901 goto not_found;
6854 if (start > found_key.offset) 6902 if (start > found_key.offset)
6855 goto next; 6903 goto next;
6904
6905 /* New extent overlaps with existing one */
6856 em->start = start; 6906 em->start = start;
6857 em->orig_start = start; 6907 em->orig_start = start;
6858 em->len = found_key.offset - start; 6908 em->len = found_key.offset - start;
6859 goto not_found_em; 6909 em->block_start = EXTENT_MAP_HOLE;
6910 goto insert;
6860 } 6911 }
6861 6912
6862 btrfs_extent_item_to_extent_map(inode, path, item, 6913 btrfs_extent_item_to_extent_map(inode, path, item,
6863 new_inline, em); 6914 new_inline, em);
6864 6915
6865 if (found_type == BTRFS_FILE_EXTENT_REG || 6916 if (extent_type == BTRFS_FILE_EXTENT_REG ||
6866 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 6917 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
6867 goto insert; 6918 goto insert;
6868 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 6919 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
6869 unsigned long ptr; 6920 unsigned long ptr;
6870 char *map; 6921 char *map;
6871 size_t size; 6922 size_t size;
@@ -6916,7 +6967,6 @@ not_found:
6916 em->start = start; 6967 em->start = start;
6917 em->orig_start = start; 6968 em->orig_start = start;
6918 em->len = len; 6969 em->len = len;
6919not_found_em:
6920 em->block_start = EXTENT_MAP_HOLE; 6970 em->block_start = EXTENT_MAP_HOLE;
6921insert: 6971insert:
6922 btrfs_release_path(path); 6972 btrfs_release_path(path);
@@ -6946,19 +6996,17 @@ out:
6946} 6996}
6947 6997
6948struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, 6998struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
6949 struct page *page, 6999 u64 start, u64 len)
6950 size_t pg_offset, u64 start, u64 len,
6951 int create)
6952{ 7000{
6953 struct extent_map *em; 7001 struct extent_map *em;
6954 struct extent_map *hole_em = NULL; 7002 struct extent_map *hole_em = NULL;
6955 u64 range_start = start; 7003 u64 delalloc_start = start;
6956 u64 end; 7004 u64 end;
6957 u64 found; 7005 u64 delalloc_len;
6958 u64 found_end; 7006 u64 delalloc_end;
6959 int err = 0; 7007 int err = 0;
6960 7008
6961 em = btrfs_get_extent(inode, page, pg_offset, start, len, create); 7009 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
6962 if (IS_ERR(em)) 7010 if (IS_ERR(em))
6963 return em; 7011 return em;
6964 /* 7012 /*
@@ -6983,80 +7031,84 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
6983 em = NULL; 7031 em = NULL;
6984 7032
6985 /* ok, we didn't find anything, lets look for delalloc */ 7033 /* ok, we didn't find anything, lets look for delalloc */
6986 found = count_range_bits(&inode->io_tree, &range_start, 7034 delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start,
6987 end, len, EXTENT_DELALLOC, 1); 7035 end, len, EXTENT_DELALLOC, 1);
6988 found_end = range_start + found; 7036 delalloc_end = delalloc_start + delalloc_len;
6989 if (found_end < range_start) 7037 if (delalloc_end < delalloc_start)
6990 found_end = (u64)-1; 7038 delalloc_end = (u64)-1;
6991 7039
6992 /* 7040 /*
6993 * we didn't find anything useful, return 7041 * We didn't find anything useful, return the original results from
6994 * the original results from get_extent() 7042 * get_extent()
6995 */ 7043 */
6996 if (range_start > end || found_end <= start) { 7044 if (delalloc_start > end || delalloc_end <= start) {
6997 em = hole_em; 7045 em = hole_em;
6998 hole_em = NULL; 7046 hole_em = NULL;
6999 goto out; 7047 goto out;
7000 } 7048 }
7001 7049
7002 /* adjust the range_start to make sure it doesn't 7050 /*
7003 * go backwards from the start they passed in 7051 * Adjust the delalloc_start to make sure it doesn't go backwards from
7052 * the start they passed in
7004 */ 7053 */
7005 range_start = max(start, range_start); 7054 delalloc_start = max(start, delalloc_start);
7006 found = found_end - range_start; 7055 delalloc_len = delalloc_end - delalloc_start;
7007 7056
7008 if (found > 0) { 7057 if (delalloc_len > 0) {
7009 u64 hole_start = start; 7058 u64 hole_start;
7010 u64 hole_len = len; 7059 u64 hole_len;
7060 const u64 hole_end = extent_map_end(hole_em);
7011 7061
7012 em = alloc_extent_map(); 7062 em = alloc_extent_map();
7013 if (!em) { 7063 if (!em) {
7014 err = -ENOMEM; 7064 err = -ENOMEM;
7015 goto out; 7065 goto out;
7016 } 7066 }
7067 em->bdev = NULL;
7068
7069 ASSERT(hole_em);
7017 /* 7070 /*
7018 * when btrfs_get_extent can't find anything it 7071 * When btrfs_get_extent can't find anything it returns one
7019 * returns one huge hole 7072 * huge hole
7020 * 7073 *
7021 * make sure what it found really fits our range, and 7074 * Make sure what it found really fits our range, and adjust to
7022 * adjust to make sure it is based on the start from 7075 * make sure it is based on the start from the caller
7023 * the caller
7024 */ 7076 */
7025 if (hole_em) { 7077 if (hole_end <= start || hole_em->start > end) {
7026 u64 calc_end = extent_map_end(hole_em); 7078 free_extent_map(hole_em);
7027 7079 hole_em = NULL;
7028 if (calc_end <= start || (hole_em->start > end)) { 7080 } else {
7029 free_extent_map(hole_em); 7081 hole_start = max(hole_em->start, start);
7030 hole_em = NULL; 7082 hole_len = hole_end - hole_start;
7031 } else {
7032 hole_start = max(hole_em->start, start);
7033 hole_len = calc_end - hole_start;
7034 }
7035 } 7083 }
7036 em->bdev = NULL; 7084
7037 if (hole_em && range_start > hole_start) { 7085 if (hole_em && delalloc_start > hole_start) {
7038 /* our hole starts before our delalloc, so we 7086 /*
7039 * have to return just the parts of the hole 7087 * Our hole starts before our delalloc, so we have to
7040 * that go until the delalloc starts 7088 * return just the parts of the hole that go until the
7089 * delalloc starts
7041 */ 7090 */
7042 em->len = min(hole_len, 7091 em->len = min(hole_len, delalloc_start - hole_start);
7043 range_start - hole_start);
7044 em->start = hole_start; 7092 em->start = hole_start;
7045 em->orig_start = hole_start; 7093 em->orig_start = hole_start;
7046 /* 7094 /*
7047 * don't adjust block start at all, 7095 * Don't adjust block start at all, it is fixed at
7048 * it is fixed at EXTENT_MAP_HOLE 7096 * EXTENT_MAP_HOLE
7049 */ 7097 */
7050 em->block_start = hole_em->block_start; 7098 em->block_start = hole_em->block_start;
7051 em->block_len = hole_len; 7099 em->block_len = hole_len;
7052 if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags)) 7100 if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
7053 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 7101 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
7054 } else { 7102 } else {
7055 em->start = range_start; 7103 /*
7056 em->len = found; 7104 * Hole is out of passed range or it starts after
7057 em->orig_start = range_start; 7105 * delalloc range
7106 */
7107 em->start = delalloc_start;
7108 em->len = delalloc_len;
7109 em->orig_start = delalloc_start;
7058 em->block_start = EXTENT_MAP_DELALLOC; 7110 em->block_start = EXTENT_MAP_DELALLOC;
7059 em->block_len = found; 7111 em->block_len = delalloc_len;
7060 } 7112 }
7061 } else { 7113 } else {
7062 return hole_em; 7114 return hole_em;
@@ -9910,7 +9962,6 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
9910 init_completion(&work->completion); 9962 init_completion(&work->completion);
9911 INIT_LIST_HEAD(&work->list); 9963 INIT_LIST_HEAD(&work->list);
9912 work->inode = inode; 9964 work->inode = inode;
9913 WARN_ON_ONCE(!inode);
9914 btrfs_init_work(&work->work, btrfs_flush_delalloc_helper, 9965 btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
9915 btrfs_run_delalloc_work, NULL, NULL); 9966 btrfs_run_delalloc_work, NULL, NULL);
9916 9967
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9c8e1734429c..494f0f10d70e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1642,7 +1642,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1642 btrfs_info(fs_info, "resizing devid %llu", devid); 1642 btrfs_info(fs_info, "resizing devid %llu", devid);
1643 } 1643 }
1644 1644
1645 device = btrfs_find_device(fs_info, devid, NULL, NULL); 1645 device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
1646 if (!device) { 1646 if (!device) {
1647 btrfs_info(fs_info, "resizer unable to find device %llu", 1647 btrfs_info(fs_info, "resizer unable to find device %llu",
1648 devid); 1648 devid);
@@ -3178,7 +3178,8 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
3178 s_uuid = di_args->uuid; 3178 s_uuid = di_args->uuid;
3179 3179
3180 rcu_read_lock(); 3180 rcu_read_lock();
3181 dev = btrfs_find_device(fs_info, di_args->devid, s_uuid, NULL); 3181 dev = btrfs_find_device(fs_info->fs_devices, di_args->devid, s_uuid,
3182 NULL, true);
3182 3183
3183 if (!dev) { 3184 if (!dev) {
3184 ret = -ENODEV; 3185 ret = -ENODEV;
@@ -3241,32 +3242,17 @@ static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
3241 lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); 3242 lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
3242} 3243}
3243 3244
3244static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen, 3245static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
3245 struct inode *dst, u64 dst_loff) 3246 struct inode *dst, u64 dst_loff)
3246{ 3247{
3247 u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
3248 int ret; 3248 int ret;
3249 u64 len = olen;
3250
3251 if (loff + len == src->i_size)
3252 len = ALIGN(src->i_size, bs) - loff;
3253 /*
3254 * For same inode case we don't want our length pushed out past i_size
3255 * as comparing that data range makes no sense.
3256 *
3257 * This effectively means we require aligned extents for the single
3258 * inode case, whereas the other cases allow an unaligned length so long
3259 * as it ends at i_size.
3260 */
3261 if (dst == src && len != olen)
3262 return -EINVAL;
3263 3249
3264 /* 3250 /*
3265 * Lock destination range to serialize with concurrent readpages() and 3251 * Lock destination range to serialize with concurrent readpages() and
3266 * source range to serialize with relocation. 3252 * source range to serialize with relocation.
3267 */ 3253 */
3268 btrfs_double_extent_lock(src, loff, dst, dst_loff, len); 3254 btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
3269 ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1); 3255 ret = btrfs_clone(src, dst, loff, len, len, dst_loff, 1);
3270 btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); 3256 btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
3271 3257
3272 return ret; 3258 return ret;
@@ -3278,21 +3264,10 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
3278 struct inode *dst, u64 dst_loff) 3264 struct inode *dst, u64 dst_loff)
3279{ 3265{
3280 int ret; 3266 int ret;
3281 int num_pages = PAGE_ALIGN(BTRFS_MAX_DEDUPE_LEN) >> PAGE_SHIFT;
3282 u64 i, tail_len, chunk_count; 3267 u64 i, tail_len, chunk_count;
3283 3268
3284 /* don't make the dst file partly checksummed */
3285 if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
3286 (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM))
3287 return -EINVAL;
3288
3289 if (IS_SWAPFILE(src) || IS_SWAPFILE(dst))
3290 return -ETXTBSY;
3291
3292 tail_len = olen % BTRFS_MAX_DEDUPE_LEN; 3269 tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
3293 chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN); 3270 chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
3294 if (chunk_count == 0)
3295 num_pages = PAGE_ALIGN(tail_len) >> PAGE_SHIFT;
3296 3271
3297 for (i = 0; i < chunk_count; i++) { 3272 for (i = 0; i < chunk_count; i++) {
3298 ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, 3273 ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
@@ -3908,14 +3883,6 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
3908 * be either compressed or non-compressed. 3883 * be either compressed or non-compressed.
3909 */ 3884 */
3910 3885
3911 /* don't make the dst file partly checksummed */
3912 if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
3913 (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
3914 return -EINVAL;
3915
3916 if (IS_SWAPFILE(src) || IS_SWAPFILE(inode))
3917 return -ETXTBSY;
3918
3919 /* 3886 /*
3920 * VFS's generic_remap_file_range_prep() protects us from cloning the 3887 * VFS's generic_remap_file_range_prep() protects us from cloning the
3921 * eof block into the middle of a file, which would result in corruption 3888 * eof block into the middle of a file, which would result in corruption
@@ -3991,6 +3958,13 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
3991 else 3958 else
3992 btrfs_double_inode_lock(inode_in, inode_out); 3959 btrfs_double_inode_lock(inode_in, inode_out);
3993 3960
3961 /* don't make the dst file partly checksummed */
3962 if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
3963 (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
3964 ret = -EINVAL;
3965 goto out_unlock;
3966 }
3967
3994 /* 3968 /*
3995 * Now that the inodes are locked, we need to start writeback ourselves 3969 * Now that the inodes are locked, we need to start writeback ourselves
3996 * and can not rely on the writeback from the VFS's generic helper 3970 * and can not rely on the writeback from the VFS's generic helper
@@ -4381,7 +4355,7 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
4381 &sa->progress, sa->flags & BTRFS_SCRUB_READONLY, 4355 &sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
4382 0); 4356 0);
4383 4357
4384 if (copy_to_user(arg, sa, sizeof(*sa))) 4358 if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
4385 ret = -EFAULT; 4359 ret = -EFAULT;
4386 4360
4387 if (!(sa->flags & BTRFS_SCRUB_READONLY)) 4361 if (!(sa->flags & BTRFS_SCRUB_READONLY))
@@ -4414,7 +4388,7 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_fs_info *fs_info,
4414 4388
4415 ret = btrfs_scrub_progress(fs_info, sa->devid, &sa->progress); 4389 ret = btrfs_scrub_progress(fs_info, sa->devid, &sa->progress);
4416 4390
4417 if (copy_to_user(arg, sa, sizeof(*sa))) 4391 if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
4418 ret = -EFAULT; 4392 ret = -EFAULT;
4419 4393
4420 kfree(sa); 4394 kfree(sa);
@@ -4438,7 +4412,7 @@ static long btrfs_ioctl_get_dev_stats(struct btrfs_fs_info *fs_info,
4438 4412
4439 ret = btrfs_get_dev_stats(fs_info, sa); 4413 ret = btrfs_get_dev_stats(fs_info, sa);
4440 4414
4441 if (copy_to_user(arg, sa, sizeof(*sa))) 4415 if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
4442 ret = -EFAULT; 4416 ret = -EFAULT;
4443 4417
4444 kfree(sa); 4418 kfree(sa);
@@ -4484,7 +4458,7 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
4484 break; 4458 break;
4485 } 4459 }
4486 4460
4487 if (copy_to_user(arg, p, sizeof(*p))) 4461 if ((ret == 0 || ret == -ECANCELED) && copy_to_user(arg, p, sizeof(*p)))
4488 ret = -EFAULT; 4462 ret = -EFAULT;
4489out: 4463out:
4490 kfree(p); 4464 kfree(p);
@@ -4790,7 +4764,7 @@ do_balance:
4790 ret = btrfs_balance(fs_info, bctl, bargs); 4764 ret = btrfs_balance(fs_info, bctl, bargs);
4791 bctl = NULL; 4765 bctl = NULL;
4792 4766
4793 if (arg) { 4767 if ((ret == 0 || ret == -ECANCELED) && arg) {
4794 if (copy_to_user(arg, bargs, sizeof(*bargs))) 4768 if (copy_to_user(arg, bargs, sizeof(*bargs)))
4795 ret = -EFAULT; 4769 ret = -EFAULT;
4796 } 4770 }
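All five ioctl hunks apply the same convention: copy the argument struct back to user space only when the operation produced a result worth publishing (success, or -ECANCELED for dev-replace and balance, whose state is still meaningful after cancellation). That avoids clobbering user memory on failure and avoids masking the real error with -EFAULT. A sketch of the pattern with hypothetical names (struct my_args, do_operation):

    struct my_args { u64 devid; u64 progress; };

    static long do_operation(struct my_args *sa)
    {
            return 0;       /* stand-in for the real work */
    }

    static long my_ioctl(void __user *arg)
    {
            struct my_args *sa;
            long ret;

            sa = memdup_user(arg, sizeof(*sa));
            if (IS_ERR(sa))
                    return PTR_ERR(sa);

            ret = do_operation(sa);

            /* publish results only if the operation produced them */
            if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
                    ret = -EFAULT;

            kfree(sa);
            return ret;
    }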
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 1da768e5ef75..82b84e4daad1 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -14,43 +14,58 @@
14 14
15static void btrfs_assert_tree_read_locked(struct extent_buffer *eb); 15static void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
16 16
17/* 17void btrfs_set_lock_blocking_read(struct extent_buffer *eb)
18 * if we currently have a spinning reader or writer lock
19 * (indicated by the rw flag) this will bump the count
20 * of blocking holders and drop the spinlock.
21 */
22void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
23{ 18{
24 /* 19 /*
25 * no lock is required. The lock owner may change if 20 * No lock is required. The lock owner may change if we have a read
26 * we have a read lock, but it won't change to or away 21 * lock, but it won't change to or away from us. If we have the write
27 * from us. If we have the write lock, we are the owner 22 * lock, we are the owner and it'll never change.
28 * and it'll never change.
29 */ 23 */
30 if (eb->lock_nested && current->pid == eb->lock_owner) 24 if (eb->lock_nested && current->pid == eb->lock_owner)
31 return; 25 return;
32 if (rw == BTRFS_WRITE_LOCK) { 26 btrfs_assert_tree_read_locked(eb);
33 if (atomic_read(&eb->blocking_writers) == 0) { 27 atomic_inc(&eb->blocking_readers);
34 WARN_ON(atomic_read(&eb->spinning_writers) != 1); 28 WARN_ON(atomic_read(&eb->spinning_readers) == 0);
35 atomic_dec(&eb->spinning_writers); 29 atomic_dec(&eb->spinning_readers);
36 btrfs_assert_tree_locked(eb); 30 read_unlock(&eb->lock);
37 atomic_inc(&eb->blocking_writers); 31}
38 write_unlock(&eb->lock); 32
39 } 33void btrfs_set_lock_blocking_write(struct extent_buffer *eb)
40 } else if (rw == BTRFS_READ_LOCK) { 34{
41 btrfs_assert_tree_read_locked(eb); 35 /*
42 atomic_inc(&eb->blocking_readers); 36 * No lock is required. The lock owner may change if we have a read
43 WARN_ON(atomic_read(&eb->spinning_readers) == 0); 37 * lock, but it won't change to or away from us. If we have the write
44 atomic_dec(&eb->spinning_readers); 38 * lock, we are the owner and it'll never change.
45 read_unlock(&eb->lock); 39 */
40 if (eb->lock_nested && current->pid == eb->lock_owner)
41 return;
42 if (atomic_read(&eb->blocking_writers) == 0) {
43 WARN_ON(atomic_read(&eb->spinning_writers) != 1);
44 atomic_dec(&eb->spinning_writers);
45 btrfs_assert_tree_locked(eb);
46 atomic_inc(&eb->blocking_writers);
47 write_unlock(&eb->lock);
46 } 48 }
47} 49}
48 50
49/* 51void btrfs_clear_lock_blocking_read(struct extent_buffer *eb)
50 * if we currently have a blocking lock, take the spinlock 52{
51 * and drop our blocking count 53 /*
52 */ 54 * No lock is required. The lock owner may change if we have a read
53void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) 55 * lock, but it won't change to or away from us. If we have the write
56 * lock, we are the owner and it'll never change.
57 */
58 if (eb->lock_nested && current->pid == eb->lock_owner)
59 return;
60 BUG_ON(atomic_read(&eb->blocking_readers) == 0);
61 read_lock(&eb->lock);
62 atomic_inc(&eb->spinning_readers);
63 /* atomic_dec_and_test implies a barrier */
64 if (atomic_dec_and_test(&eb->blocking_readers))
65 cond_wake_up_nomb(&eb->read_lock_wq);
66}
67
68void btrfs_clear_lock_blocking_write(struct extent_buffer *eb)
54{ 69{
55 /* 70 /*
56 * no lock is required. The lock owner may change if 71 * no lock is required. The lock owner may change if
@@ -60,23 +75,13 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
60 */ 75 */
61 if (eb->lock_nested && current->pid == eb->lock_owner) 76 if (eb->lock_nested && current->pid == eb->lock_owner)
62 return; 77 return;
63 78 BUG_ON(atomic_read(&eb->blocking_writers) != 1);
64 if (rw == BTRFS_WRITE_LOCK_BLOCKING) { 79 write_lock(&eb->lock);
65 BUG_ON(atomic_read(&eb->blocking_writers) != 1); 80 WARN_ON(atomic_read(&eb->spinning_writers));
66 write_lock(&eb->lock); 81 atomic_inc(&eb->spinning_writers);
67 WARN_ON(atomic_read(&eb->spinning_writers)); 82 /* atomic_dec_and_test implies a barrier */
68 atomic_inc(&eb->spinning_writers); 83 if (atomic_dec_and_test(&eb->blocking_writers))
69 /* atomic_dec_and_test implies a barrier */ 84 cond_wake_up_nomb(&eb->write_lock_wq);
70 if (atomic_dec_and_test(&eb->blocking_writers))
71 cond_wake_up_nomb(&eb->write_lock_wq);
72 } else if (rw == BTRFS_READ_LOCK_BLOCKING) {
73 BUG_ON(atomic_read(&eb->blocking_readers) == 0);
74 read_lock(&eb->lock);
75 atomic_inc(&eb->spinning_readers);
76 /* atomic_dec_and_test implies a barrier */
77 if (atomic_dec_and_test(&eb->blocking_readers))
78 cond_wake_up_nomb(&eb->read_lock_wq);
79 }
80} 85}
81 86
82/* 87/*
@@ -232,16 +237,9 @@ again:
232 wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0); 237 wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
233 wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); 238 wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
234 write_lock(&eb->lock); 239 write_lock(&eb->lock);
235 if (atomic_read(&eb->blocking_readers)) { 240 if (atomic_read(&eb->blocking_readers) ||
241 atomic_read(&eb->blocking_writers)) {
236 write_unlock(&eb->lock); 242 write_unlock(&eb->lock);
237 wait_event(eb->read_lock_wq,
238 atomic_read(&eb->blocking_readers) == 0);
239 goto again;
240 }
241 if (atomic_read(&eb->blocking_writers)) {
242 write_unlock(&eb->lock);
243 wait_event(eb->write_lock_wq,
244 atomic_read(&eb->blocking_writers) == 0);
245 goto again; 243 goto again;
246 } 244 }
247 WARN_ON(atomic_read(&eb->spinning_writers)); 245 WARN_ON(atomic_read(&eb->spinning_writers));
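Splitting btrfs_set_lock_blocking_rw()/btrfs_clear_lock_blocking_rw() into read and write variants lets every call site state its intent and drops the runtime rw branch. The typical sequence at a write-side call site, as a sketch:

    static void modify_blocking(struct extent_buffer *eb)
    {
            btrfs_tree_lock(eb);               /* spinning write lock */
            btrfs_set_lock_blocking_write(eb); /* now safe to sleep   */

            /* ... work on eb that may block ... */

            btrfs_tree_unlock(eb);             /* handles blocking state */
    }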
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 29135def468e..595014f64830 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -17,8 +17,10 @@ void btrfs_tree_unlock(struct extent_buffer *eb);
17void btrfs_tree_read_lock(struct extent_buffer *eb); 17void btrfs_tree_read_lock(struct extent_buffer *eb);
18void btrfs_tree_read_unlock(struct extent_buffer *eb); 18void btrfs_tree_read_unlock(struct extent_buffer *eb);
19void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb); 19void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb);
20void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw); 20void btrfs_set_lock_blocking_read(struct extent_buffer *eb);
21void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw); 21void btrfs_set_lock_blocking_write(struct extent_buffer *eb);
22void btrfs_clear_lock_blocking_read(struct extent_buffer *eb);
23void btrfs_clear_lock_blocking_write(struct extent_buffer *eb);
22void btrfs_assert_tree_locked(struct extent_buffer *eb); 24void btrfs_assert_tree_locked(struct extent_buffer *eb);
23int btrfs_try_tree_read_lock(struct extent_buffer *eb); 25int btrfs_try_tree_read_lock(struct extent_buffer *eb);
24int btrfs_try_tree_write_lock(struct extent_buffer *eb); 26int btrfs_try_tree_write_lock(struct extent_buffer *eb);
@@ -37,13 +39,4 @@ static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
37 BUG(); 39 BUG();
38} 40}
39 41
40static inline void btrfs_set_lock_blocking(struct extent_buffer *eb)
41{
42 btrfs_set_lock_blocking_rw(eb, BTRFS_WRITE_LOCK);
43}
44
45static inline void btrfs_clear_lock_blocking(struct extent_buffer *eb)
46{
47 btrfs_clear_lock_blocking_rw(eb, BTRFS_WRITE_LOCK_BLOCKING);
48}
49#endif 42#endif
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 90639140439f..579d53ae256f 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -61,6 +61,28 @@ struct workspace {
61 struct list_head list; 61 struct list_head list;
62}; 62};
63 63
64static struct workspace_manager wsm;
65
66static void lzo_init_workspace_manager(void)
67{
68 btrfs_init_workspace_manager(&wsm, &btrfs_lzo_compress);
69}
70
71static void lzo_cleanup_workspace_manager(void)
72{
73 btrfs_cleanup_workspace_manager(&wsm);
74}
75
76static struct list_head *lzo_get_workspace(unsigned int level)
77{
78 return btrfs_get_workspace(&wsm, level);
79}
80
81static void lzo_put_workspace(struct list_head *ws)
82{
83 btrfs_put_workspace(&wsm, ws);
84}
85
64static void lzo_free_workspace(struct list_head *ws) 86static void lzo_free_workspace(struct list_head *ws)
65{ 87{
66 struct workspace *workspace = list_entry(ws, struct workspace, list); 88 struct workspace *workspace = list_entry(ws, struct workspace, list);
@@ -71,7 +93,7 @@ static void lzo_free_workspace(struct list_head *ws)
71 kfree(workspace); 93 kfree(workspace);
72} 94}
73 95
74static struct list_head *lzo_alloc_workspace(void) 96static struct list_head *lzo_alloc_workspace(unsigned int level)
75{ 97{
76 struct workspace *workspace; 98 struct workspace *workspace;
77 99
@@ -485,11 +507,16 @@ out:
485 return ret; 507 return ret;
486} 508}
487 509
488static void lzo_set_level(struct list_head *ws, unsigned int type) 510static unsigned int lzo_set_level(unsigned int level)
489{ 511{
512 return 0;
490} 513}
491 514
492const struct btrfs_compress_op btrfs_lzo_compress = { 515const struct btrfs_compress_op btrfs_lzo_compress = {
516 .init_workspace_manager = lzo_init_workspace_manager,
517 .cleanup_workspace_manager = lzo_cleanup_workspace_manager,
518 .get_workspace = lzo_get_workspace,
519 .put_workspace = lzo_put_workspace,
493 .alloc_workspace = lzo_alloc_workspace, 520 .alloc_workspace = lzo_alloc_workspace,
494 .free_workspace = lzo_free_workspace, 521 .free_workspace = lzo_free_workspace,
495 .compress_pages = lzo_compress_pages, 522 .compress_pages = lzo_compress_pages,
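Each compression backend now owns a static workspace_manager and exposes thin wrappers that close over it through btrfs_compress_op, so generic code can drive per-backend pools through one vtable. The shape of the pattern, reduced to illustrative names:

    struct manager { /* shared pool state */ };

    void generic_init(struct manager *m);
    struct list_head *generic_get(struct manager *m, unsigned int level);

    static struct manager mgr;

    static void my_init(void)
    {
            generic_init(&mgr);
    }

    static struct list_head *my_get(unsigned int level)
    {
            return generic_get(&mgr, level);
    }

    struct ops {
            void (*init)(void);
            struct list_head *(*get)(unsigned int level);
    };

    static const struct ops my_ops = {
            .init = my_init,
            .get  = my_get,
    };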
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 4e473a998219..c1cd5558a646 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1546,12 +1546,18 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
1546 parent_node = *p; 1546 parent_node = *p;
1547 entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record, 1547 entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
1548 node); 1548 node);
1549 if (bytenr < entry->bytenr) 1549 if (bytenr < entry->bytenr) {
1550 p = &(*p)->rb_left; 1550 p = &(*p)->rb_left;
1551 else if (bytenr > entry->bytenr) 1551 } else if (bytenr > entry->bytenr) {
1552 p = &(*p)->rb_right; 1552 p = &(*p)->rb_right;
1553 else 1553 } else {
1554 if (record->data_rsv && !entry->data_rsv) {
1555 entry->data_rsv = record->data_rsv;
1556 entry->data_rsv_refroot =
1557 record->data_rsv_refroot;
1558 }
1554 return 1; 1559 return 1;
1560 }
1555 } 1561 }
1556 1562
1557 rb_link_node(&record->node, parent_node, p); 1563 rb_link_node(&record->node, parent_node, p);
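On a bytenr collision the insert now folds the new record's reserved-data bookkeeping into the node already in the tree instead of silently dropping it (and the record is kzalloc'ed a few hunks below, so the new fields start zeroed). The insert-or-merge shape, as a generic sketch with illustrative types:

    struct record {
            struct rb_node node;
            u64 bytenr;
            u64 data_rsv;
            u64 data_rsv_refroot;
    };

    static void merge(struct record *dst, const struct record *src)
    {
            if (src->data_rsv && !dst->data_rsv) {
                    dst->data_rsv = src->data_rsv;
                    dst->data_rsv_refroot = src->data_rsv_refroot;
            }
    }

    /* returns 1 if an existing node absorbed 'rec' (caller frees it) */
    static int insert_or_merge(struct rb_root *root, struct record *rec)
    {
            struct rb_node **p = &root->rb_node, *parent = NULL;

            while (*p) {
                    struct record *cur;

                    parent = *p;
                    cur = rb_entry(parent, struct record, node);
                    if (rec->bytenr < cur->bytenr) {
                            p = &(*p)->rb_left;
                    } else if (rec->bytenr > cur->bytenr) {
                            p = &(*p)->rb_right;
                    } else {
                            merge(cur, rec);
                            return 1;
                    }
            }
            rb_link_node(&rec->node, parent, p);
            rb_insert_color(&rec->node, root);
            return 0;
    }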
@@ -1597,7 +1603,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
1597 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) 1603 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
1598 || bytenr == 0 || num_bytes == 0) 1604 || bytenr == 0 || num_bytes == 0)
1599 return 0; 1605 return 0;
1600 record = kmalloc(sizeof(*record), gfp_flag); 1606 record = kzalloc(sizeof(*record), gfp_flag);
1601 if (!record) 1607 if (!record)
1602 return -ENOMEM; 1608 return -ENOMEM;
1603 1609
@@ -1832,7 +1838,7 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
1832 src_path->nodes[cur_level] = eb; 1838 src_path->nodes[cur_level] = eb;
1833 1839
1834 btrfs_tree_read_lock(eb); 1840 btrfs_tree_read_lock(eb);
1835 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); 1841 btrfs_set_lock_blocking_read(eb);
1836 src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING; 1842 src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
1837 } 1843 }
1838 1844
@@ -1973,7 +1979,7 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
1973 dst_path->slots[cur_level] = 0; 1979 dst_path->slots[cur_level] = 0;
1974 1980
1975 btrfs_tree_read_lock(eb); 1981 btrfs_tree_read_lock(eb);
1976 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); 1982 btrfs_set_lock_blocking_read(eb);
1977 dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING; 1983 dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
1978 need_cleanup = true; 1984 need_cleanup = true;
1979 } 1985 }
@@ -2017,86 +2023,30 @@ out:
2017 return ret; 2023 return ret;
2018} 2024}
2019 2025
2020/* 2026static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
2021 * Inform qgroup to trace subtree swap used in balance. 2027 struct extent_buffer *src_eb,
2022 * 2028 struct extent_buffer *dst_eb,
2023 * Unlike btrfs_qgroup_trace_subtree(), this function will only trace 2029 u64 last_snapshot, bool trace_leaf)
2024 * new tree blocks whose generation is equal to (or larger than) @last_snapshot.
2025 *
2026 * Will go down the tree block pointed by @dst_eb (pointed by @dst_parent and
2027 * @dst_slot), and find any tree blocks whose generation is at @last_snapshot,
2028 * and then go down @src_eb (pointed by @src_parent and @src_slot) to find
2029 * the counterpart of the tree block, then mark both tree blocks as qgroup dirty,
2030 * and skip all tree blocks whose generation is smaller than last_snapshot.
2031 *
2032 * This would skip tons of tree blocks of original btrfs_qgroup_trace_subtree(),
2033 * which could be the cause of very slow balance if the file tree is large.
2034 *
2035 * @src_parent, @src_slot: pointer to src (file tree) eb.
2036 * @dst_parent, @dst_slot: pointer to dst (reloc tree) eb.
2037 */
2038int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
2039 struct btrfs_block_group_cache *bg_cache,
2040 struct extent_buffer *src_parent, int src_slot,
2041 struct extent_buffer *dst_parent, int dst_slot,
2042 u64 last_snapshot)
2043{ 2030{
2044 struct btrfs_fs_info *fs_info = trans->fs_info; 2031 struct btrfs_fs_info *fs_info = trans->fs_info;
2045 struct btrfs_path *dst_path = NULL; 2032 struct btrfs_path *dst_path = NULL;
2046 struct btrfs_key first_key;
2047 struct extent_buffer *src_eb = NULL;
2048 struct extent_buffer *dst_eb = NULL;
2049 bool trace_leaf = false;
2050 u64 child_gen;
2051 u64 child_bytenr;
2052 int level; 2033 int level;
2053 int ret; 2034 int ret;
2054 2035
2055 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 2036 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2056 return 0; 2037 return 0;
2057 2038
2058 /* Check parameter order */ 2039 /* Wrong parameter order */
2059 if (btrfs_node_ptr_generation(src_parent, src_slot) > 2040 if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) {
2060 btrfs_node_ptr_generation(dst_parent, dst_slot)) {
2061 btrfs_err_rl(fs_info, 2041 btrfs_err_rl(fs_info,
2062 "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__, 2042 "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
2063 btrfs_node_ptr_generation(src_parent, src_slot), 2043 btrfs_header_generation(src_eb),
2064 btrfs_node_ptr_generation(dst_parent, dst_slot)); 2044 btrfs_header_generation(dst_eb));
2065 return -EUCLEAN; 2045 return -EUCLEAN;
2066 } 2046 }
2067 2047
2068 /*
2069 * Only trace leaf if we're relocating data block groups, this could
2070 * reduce tons of data extents tracing for meta/sys bg relocation.
2071 */
2072 if (bg_cache->flags & BTRFS_BLOCK_GROUP_DATA)
2073 trace_leaf = true;
2074 /* Read out real @src_eb, pointed by @src_parent and @src_slot */
2075 child_bytenr = btrfs_node_blockptr(src_parent, src_slot);
2076 child_gen = btrfs_node_ptr_generation(src_parent, src_slot);
2077 btrfs_node_key_to_cpu(src_parent, &first_key, src_slot);
2078
2079 src_eb = read_tree_block(fs_info, child_bytenr, child_gen,
2080 btrfs_header_level(src_parent) - 1, &first_key);
2081 if (IS_ERR(src_eb)) {
2082 ret = PTR_ERR(src_eb);
2083 goto out;
2084 }
2085
2086 /* Read out real @dst_eb, pointed by @src_parent and @src_slot */
2087 child_bytenr = btrfs_node_blockptr(dst_parent, dst_slot);
2088 child_gen = btrfs_node_ptr_generation(dst_parent, dst_slot);
2089 btrfs_node_key_to_cpu(dst_parent, &first_key, dst_slot);
2090
2091 dst_eb = read_tree_block(fs_info, child_bytenr, child_gen,
2092 btrfs_header_level(dst_parent) - 1, &first_key);
2093 if (IS_ERR(dst_eb)) {
2094 ret = PTR_ERR(dst_eb);
2095 goto out;
2096 }
2097
2098 if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) { 2048 if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
2099 ret = -EINVAL; 2049 ret = -EIO;
2100 goto out; 2050 goto out;
2101 } 2051 }
2102 2052
@@ -2106,14 +2056,13 @@ int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
2106 ret = -ENOMEM; 2056 ret = -ENOMEM;
2107 goto out; 2057 goto out;
2108 } 2058 }
2109
2110 /* For dst_path */ 2059 /* For dst_path */
2111 extent_buffer_get(dst_eb); 2060 extent_buffer_get(dst_eb);
2112 dst_path->nodes[level] = dst_eb; 2061 dst_path->nodes[level] = dst_eb;
2113 dst_path->slots[level] = 0; 2062 dst_path->slots[level] = 0;
2114 dst_path->locks[level] = 0; 2063 dst_path->locks[level] = 0;
2115 2064
2116 /* Do the generation-aware breadth-first search */ 2065 /* Do the generation aware breadth-first search */
2117 ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level, 2066 ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
2118 level, last_snapshot, trace_leaf); 2067 level, last_snapshot, trace_leaf);
2119 if (ret < 0) 2068 if (ret < 0)
@@ -2121,8 +2070,6 @@ int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
2121 ret = 0; 2070 ret = 0;
2122 2071
2123out: 2072out:
2124 free_extent_buffer(src_eb);
2125 free_extent_buffer(dst_eb);
2126 btrfs_free_path(dst_path); 2073 btrfs_free_path(dst_path);
2127 if (ret < 0) 2074 if (ret < 0)
2128 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 2075 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
@@ -2207,7 +2154,7 @@ walk_down:
2207 path->slots[level] = 0; 2154 path->slots[level] = 0;
2208 2155
2209 btrfs_tree_read_lock(eb); 2156 btrfs_tree_read_lock(eb);
2210 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); 2157 btrfs_set_lock_blocking_read(eb);
2211 path->locks[level] = BTRFS_READ_LOCK_BLOCKING; 2158 path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
2212 2159
2213 ret = btrfs_qgroup_trace_extent(trans, child_bytenr, 2160 ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
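This and later hunks also rename the blocking-lock helpers: btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK) becomes btrfs_set_lock_blocking_read(eb), and btrfs_set_lock_blocking(eb) becomes btrfs_set_lock_blocking_write(eb). A minimal sketch of a caller under the new names (illustrative only; the helper bodies live in fs/btrfs/locking.c and are not part of this excerpt):

	/* Shared (read) side, as in the qgroup walk above */
	btrfs_tree_read_lock(eb);
	btrfs_set_lock_blocking_read(eb);
	path->locks[level] = BTRFS_READ_LOCK_BLOCKING;

	/* Exclusive (write) side, as in the relocation hunks below */
	btrfs_tree_lock(eb);
	btrfs_set_lock_blocking_write(eb);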
@@ -2576,6 +2523,11 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
2576 goto cleanup; 2523 goto cleanup;
2577 } 2524 }
2578 2525
2526 /* Free the reserved data space */
2527 btrfs_qgroup_free_refroot(fs_info,
2528 record->data_rsv_refroot,
2529 record->data_rsv,
2530 BTRFS_QGROUP_RSV_DATA);
2579 /* 2531 /*
2580 * Use SEQ_LAST as time_seq to do special search, which 2532 * Use SEQ_LAST as time_seq to do special search, which
2581 * doesn't lock tree or delayed_refs and search current 2533 * doesn't lock tree or delayed_refs and search current
@@ -2842,16 +2794,15 @@ out:
2842/* 2794/*
2843 * Two limits to commit transaction in advance. 2795 * Two limits to commit transaction in advance.
2844 * 2796 *
2845 * For RATIO, it will be 1/RATIO of the remaining limit 2797 * For RATIO, it will be 1/RATIO of the remaining limit as threshold.
2846 * (excluding data and prealloc meta) as threshold.
2847 * For SIZE, it will be in byte unit as threshold. 2798 * For SIZE, it will be in byte unit as threshold.
2848 */ 2799 */
2849#define QGROUP_PERTRANS_RATIO 32 2800#define QGROUP_FREE_RATIO 32
2850#define QGROUP_PERTRANS_SIZE SZ_32M 2801#define QGROUP_FREE_SIZE SZ_32M
2851static bool qgroup_check_limits(struct btrfs_fs_info *fs_info, 2802static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
2852 const struct btrfs_qgroup *qg, u64 num_bytes) 2803 const struct btrfs_qgroup *qg, u64 num_bytes)
2853{ 2804{
2854 u64 limit; 2805 u64 free;
2855 u64 threshold; 2806 u64 threshold;
2856 2807
2857 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && 2808 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
@@ -2870,20 +2821,21 @@ static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
2870 */ 2821 */
2871 if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER | 2822 if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER |
2872 BTRFS_QGROUP_LIMIT_MAX_EXCL))) { 2823 BTRFS_QGROUP_LIMIT_MAX_EXCL))) {
2873 if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) 2824 if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
2874 limit = qg->max_excl; 2825 free = qg->max_excl - qgroup_rsv_total(qg) - qg->excl;
2875 else 2826 threshold = min_t(u64, qg->max_excl / QGROUP_FREE_RATIO,
2876 limit = qg->max_rfer; 2827 QGROUP_FREE_SIZE);
2877 threshold = (limit - qg->rsv.values[BTRFS_QGROUP_RSV_DATA] - 2828 } else {
2878 qg->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC]) / 2829 free = qg->max_rfer - qgroup_rsv_total(qg) - qg->rfer;
2879 QGROUP_PERTRANS_RATIO; 2830 threshold = min_t(u64, qg->max_rfer / QGROUP_FREE_RATIO,
2880 threshold = min_t(u64, threshold, QGROUP_PERTRANS_SIZE); 2831 QGROUP_FREE_SIZE);
2832 }
2881 2833
2882 /* 2834 /*
2883 * Use transaction_kthread to commit transaction, so we no 2835 * Use transaction_kthread to commit transaction, so we no
2884 * longer need to bother nested transaction nor lock context. 2836 * longer need to bother nested transaction nor lock context.
2885 */ 2837 */
2886 if (qg->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > threshold) 2838 if (free < threshold)
2887 btrfs_commit_transaction_locksafe(fs_info); 2839 btrfs_commit_transaction_locksafe(fs_info);
2888 } 2840 }
2889 2841
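The rewritten check above commits early based on the remaining free quota rather than the per-trans metadata reservation: free space is the limit minus all reservations minus the already-accounted usage, and the trigger threshold is min(limit / 32, 32M). A standalone sketch of the arithmetic, with plain integers standing in for struct btrfs_qgroup (the function and parameter names below are stand-ins, not the kernel's):

	#include <stdbool.h>
	#include <stdint.h>

	#define QGROUP_FREE_RATIO 32
	#define QGROUP_FREE_SIZE  (32ULL << 20)	/* SZ_32M */

	/* Mirror of the MAX_EXCL branch: commit in advance once the
	 * free quota drops below min(max_excl / 32, 32M). */
	static bool should_commit_early(uint64_t max_excl, uint64_t rsv_total,
					uint64_t excl)
	{
		uint64_t free = max_excl - rsv_total - excl;
		uint64_t threshold = max_excl / QGROUP_FREE_RATIO;

		if (threshold > QGROUP_FREE_SIZE)
			threshold = QGROUP_FREE_SIZE;	/* the min_t(u64, ...) above */
		return free < threshold;
	}

With a 1GiB exclusive limit, for example, the threshold is 32MiB, so a transaction commit is kicked off once less than 32MiB of quota remains.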
@@ -2959,7 +2911,6 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
2959 2911
2960 qg = unode_aux_to_qgroup(unode); 2912 qg = unode_aux_to_qgroup(unode);
2961 2913
2962 trace_qgroup_update_reserve(fs_info, qg, num_bytes, type);
2963 qgroup_rsv_add(fs_info, qg, num_bytes, type); 2914 qgroup_rsv_add(fs_info, qg, num_bytes, type);
2964 } 2915 }
2965 2916
@@ -3026,7 +2977,6 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
3026 2977
3027 qg = unode_aux_to_qgroup(unode); 2978 qg = unode_aux_to_qgroup(unode);
3028 2979
3029 trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes, type);
3030 qgroup_rsv_release(fs_info, qg, num_bytes, type); 2980 qgroup_rsv_release(fs_info, qg, num_bytes, type);
3031 2981
3032 list_for_each_entry(glist, &qg->groups, next_group) { 2982 list_for_each_entry(glist, &qg->groups, next_group) {
@@ -3783,3 +3733,241 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
3783 } 3733 }
3784 extent_changeset_release(&changeset); 3734 extent_changeset_release(&changeset);
3785} 3735}
3736
3737void btrfs_qgroup_init_swapped_blocks(
3738 struct btrfs_qgroup_swapped_blocks *swapped_blocks)
3739{
3740 int i;
3741
3742 spin_lock_init(&swapped_blocks->lock);
3743 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
3744 swapped_blocks->blocks[i] = RB_ROOT;
3745 swapped_blocks->swapped = false;
3746}
3747
3748/*
3749 * Delete all swapped blocks record of @root.
3750 * Every record here means we skipped a full subtree scan for qgroup.
3751 *
3752 * Gets called when committing one transaction.
3753 */
3754void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
3755{
3756 struct btrfs_qgroup_swapped_blocks *swapped_blocks;
3757 int i;
3758
3759 swapped_blocks = &root->swapped_blocks;
3760
3761 spin_lock(&swapped_blocks->lock);
3762 if (!swapped_blocks->swapped)
3763 goto out;
3764 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
3765 struct rb_root *cur_root = &swapped_blocks->blocks[i];
3766 struct btrfs_qgroup_swapped_block *entry;
3767 struct btrfs_qgroup_swapped_block *next;
3768
3769 rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
3770 node)
3771 kfree(entry);
3772 swapped_blocks->blocks[i] = RB_ROOT;
3773 }
3774 swapped_blocks->swapped = false;
3775out:
3776 spin_unlock(&swapped_blocks->lock);
3777}
3778
3779/*
 3780 * Add a record of the swapped subtree roots into @subvol_root.
 3781 *
 3782 * @subvol_root: tree root of the subvolume tree that gets swapped
3783 * @bg: block group under balance
3784 * @subvol_parent/slot: pointer to the subtree root in subvolume tree
3785 * @reloc_parent/slot: pointer to the subtree root in reloc tree
3786 * BOTH POINTERS ARE BEFORE TREE SWAP
3787 * @last_snapshot: last snapshot generation of the subvolume tree
3788 */
3789int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
3790 struct btrfs_root *subvol_root,
3791 struct btrfs_block_group_cache *bg,
3792 struct extent_buffer *subvol_parent, int subvol_slot,
3793 struct extent_buffer *reloc_parent, int reloc_slot,
3794 u64 last_snapshot)
3795{
3796 struct btrfs_fs_info *fs_info = subvol_root->fs_info;
3797 struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
3798 struct btrfs_qgroup_swapped_block *block;
3799 struct rb_node **cur;
3800 struct rb_node *parent = NULL;
3801 int level = btrfs_header_level(subvol_parent) - 1;
3802 int ret = 0;
3803
3804 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
3805 return 0;
3806
3807 if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
3808 btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
3809 btrfs_err_rl(fs_info,
3810 "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
3811 __func__,
3812 btrfs_node_ptr_generation(subvol_parent, subvol_slot),
3813 btrfs_node_ptr_generation(reloc_parent, reloc_slot));
3814 return -EUCLEAN;
3815 }
3816
3817 block = kmalloc(sizeof(*block), GFP_NOFS);
3818 if (!block) {
3819 ret = -ENOMEM;
3820 goto out;
3821 }
3822
3823 /*
 3824 * @reloc_parent/slot still points at the pre-swap blocks, while @block
 3825 * is going to record the post-swap bytenr, so we swap the values here.
3826 */
3827 block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
3828 block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
3829 reloc_slot);
3830 block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
3831 block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
3832 subvol_slot);
3833 block->last_snapshot = last_snapshot;
3834 block->level = level;
3835 if (bg->flags & BTRFS_BLOCK_GROUP_DATA)
3836 block->trace_leaf = true;
3837 else
3838 block->trace_leaf = false;
3839 btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);
3840
3841 /* Insert @block into @blocks */
3842 spin_lock(&blocks->lock);
3843 cur = &blocks->blocks[level].rb_node;
3844 while (*cur) {
3845 struct btrfs_qgroup_swapped_block *entry;
3846
3847 parent = *cur;
3848 entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
3849 node);
3850
3851 if (entry->subvol_bytenr < block->subvol_bytenr) {
3852 cur = &(*cur)->rb_left;
3853 } else if (entry->subvol_bytenr > block->subvol_bytenr) {
3854 cur = &(*cur)->rb_right;
3855 } else {
3856 if (entry->subvol_generation !=
3857 block->subvol_generation ||
3858 entry->reloc_bytenr != block->reloc_bytenr ||
3859 entry->reloc_generation !=
3860 block->reloc_generation) {
3861 /*
3862 * Duplicated but mismatch entry found.
3863 * Shouldn't happen.
3864 *
3865 * Marking qgroup inconsistent should be enough
3866 * for end users.
3867 */
3868 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
3869 ret = -EEXIST;
3870 }
3871 kfree(block);
3872 goto out_unlock;
3873 }
3874 }
3875 rb_link_node(&block->node, parent, cur);
3876 rb_insert_color(&block->node, &blocks->blocks[level]);
3877 blocks->swapped = true;
3878out_unlock:
3879 spin_unlock(&blocks->lock);
3880out:
3881 if (ret < 0)
3882 fs_info->qgroup_flags |=
3883 BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
3884 return ret;
3885}
3886
3887/*
 3888 * Check if the tree block is a subtree root, and if so, do the needed
 3889 * delayed subtree tracing for qgroup.
3890 *
3891 * This is called during btrfs_cow_block().
3892 */
3893int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
3894 struct btrfs_root *root,
3895 struct extent_buffer *subvol_eb)
3896{
3897 struct btrfs_fs_info *fs_info = root->fs_info;
3898 struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
3899 struct btrfs_qgroup_swapped_block *block;
3900 struct extent_buffer *reloc_eb = NULL;
3901 struct rb_node *node;
3902 bool found = false;
3903 bool swapped = false;
3904 int level = btrfs_header_level(subvol_eb);
3905 int ret = 0;
3906 int i;
3907
3908 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
3909 return 0;
3910 if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
3911 return 0;
3912
3913 spin_lock(&blocks->lock);
3914 if (!blocks->swapped) {
3915 spin_unlock(&blocks->lock);
3916 return 0;
3917 }
3918 node = blocks->blocks[level].rb_node;
3919
3920 while (node) {
3921 block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
3922 if (block->subvol_bytenr < subvol_eb->start) {
3923 node = node->rb_left;
3924 } else if (block->subvol_bytenr > subvol_eb->start) {
3925 node = node->rb_right;
3926 } else {
3927 found = true;
3928 break;
3929 }
3930 }
3931 if (!found) {
3932 spin_unlock(&blocks->lock);
3933 goto out;
3934 }
3935 /* Found one, remove it from @blocks first and update blocks->swapped */
3936 rb_erase(&block->node, &blocks->blocks[level]);
3937 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
3938 if (RB_EMPTY_ROOT(&blocks->blocks[i])) {
3939 swapped = true;
3940 break;
3941 }
3942 }
3943 blocks->swapped = swapped;
3944 spin_unlock(&blocks->lock);
3945
3946 /* Read out reloc subtree root */
3947 reloc_eb = read_tree_block(fs_info, block->reloc_bytenr,
3948 block->reloc_generation, block->level,
3949 &block->first_key);
3950 if (IS_ERR(reloc_eb)) {
3951 ret = PTR_ERR(reloc_eb);
3952 reloc_eb = NULL;
3953 goto free_out;
3954 }
3955 if (!extent_buffer_uptodate(reloc_eb)) {
3956 ret = -EIO;
3957 goto free_out;
3958 }
3959
3960 ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
3961 block->last_snapshot, block->trace_leaf);
3962free_out:
3963 kfree(block);
3964 free_extent_buffer(reloc_eb);
3965out:
3966 if (ret < 0) {
3967 btrfs_err_rl(fs_info,
3968 "failed to account subtree at bytenr %llu: %d",
3969 subvol_eb->start, ret);
3970 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
3971 }
3972 return ret;
3973}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 20c6bd5fa701..46ba7bd2961c 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -6,6 +6,8 @@
6#ifndef BTRFS_QGROUP_H 6#ifndef BTRFS_QGROUP_H
7#define BTRFS_QGROUP_H 7#define BTRFS_QGROUP_H
8 8
9#include <linux/spinlock.h>
10#include <linux/rbtree.h>
9#include "ulist.h" 11#include "ulist.h"
10#include "delayed-ref.h" 12#include "delayed-ref.h"
11 13
@@ -38,6 +40,66 @@
38 */ 40 */
39 41
40/* 42/*
43 * Special performance optimization for balance.
44 *
 45 * For balance, we need to swap the subtrees of the subvolume and reloc trees.
 46 * In theory, we would need to trace all subtree blocks of both the subvolume
 47 * and reloc trees, since their owner has changed during such a swap.
48 *
 49 * However, since balance has ensured that both subtrees contain the same
 50 * contents and have the same tree structures, such a swap won't cause any
 51 * qgroup number change.
52 *
 53 * But there is a race window between subtree swap and transaction commit.
 54 * If, during that window, we increase/decrease the tree level or merge/split
 55 * tree blocks, we still need to trace the original subtrees.
56 *
 57 * So for balance, we use delayed subtree tracing, whose workflow is:
 58 *
 59 * 1) Record the subtree root blocks that get swapped.
60 *
61 * During subtree swap:
62 * O = Old tree blocks
63 * N = New tree blocks
64 * reloc tree subvolume tree X
65 * Root Root
66 * / \ / \
67 * NA OB OA OB
68 * / | | \ / | | \
69 * NC ND OE OF OC OD OE OF
70 *
71 * In this case, NA and OA are going to be swapped, record (NA, OA) into
72 * subvolume tree X.
73 *
74 * 2) After subtree swap.
75 * reloc tree subvolume tree X
76 * Root Root
77 * / \ / \
78 * OA OB NA OB
79 * / | | \ / | | \
80 * OC OD OE OF NC ND OE OF
81 *
82 * 3a) COW happens for OB
83 * If we are going to COW tree block OB, we check OB's bytenr against
84 * tree X's swapped_blocks structure.
 85 * If it doesn't match any record, nothing will happen.
86 *
87 * 3b) COW happens for NA
 88 * Check NA's bytenr against tree X's swapped_blocks, and we get a hit.
 89 * Then we do a subtree scan on both subtrees OA and NA, resulting in
 90 * 6 tree blocks to be scanned (OA, OC, OD, NA, NC, ND).
91 *
 92 * After that, no matter what we do to subvolume tree X, qgroup numbers
 93 * will still be correct.
 94 * NA's record is then removed from X's swapped_blocks.
95 *
96 * 4) Transaction commit
 97 * Any record left in X's swapped_blocks gets removed: since there is no
 98 * modification to the swapped subtrees, there is no need to trigger a
 99 * heavy qgroup subtree rescan for them.
100 */
101
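Mapping the numbered steps above onto the API this patch adds, the life cycle of a swapped-block record looks roughly like this (condensed from the relocation.c, qgroup.c and transaction.c hunks in this series; not a literal quote of any one call site):

	/* 1) At subtree swap time, on the balance path in relocation.c: */
	btrfs_qgroup_add_swapped_blocks(trans, dest, rc->block_group,
					parent, slot,		/* subvol side */
					path->nodes[level],	/* reloc side */
					path->slots[level], last_snapshot);

	/* 3b) On COW of a recorded subtree root, hooked from btrfs_cow_block(): */
	btrfs_qgroup_trace_subtree_after_cow(trans, root, subvol_eb);
	/*    -> on a bytenr hit this reads the reloc counterpart and calls
	 *       qgroup_trace_subtree_swap() to perform the delayed rescan */

	/* 4) At transaction commit (switch_commit_roots() in transaction.c): */
	btrfs_qgroup_clean_swapped_blocks(root);   /* drop any remaining records */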
102/*
 41 * Record a dirty extent, and inform qgroup to update quota on it 103 * Record a dirty extent, and inform qgroup to update quota on it
42 * TODO: Use kmem cache to alloc it. 104 * TODO: Use kmem cache to alloc it.
43 */ 105 */
@@ -45,9 +107,38 @@ struct btrfs_qgroup_extent_record {
45 struct rb_node node; 107 struct rb_node node;
46 u64 bytenr; 108 u64 bytenr;
47 u64 num_bytes; 109 u64 num_bytes;
110
111 /*
112 * For qgroup reserved data space freeing.
113 *
114 * @data_rsv_refroot and @data_rsv will be recorded after
115 * BTRFS_ADD_DELAYED_EXTENT is called.
 116 * They will then be used to free reserved qgroup space at
 117 * transaction commit time.
118 */
119 u32 data_rsv; /* reserved data space needs to be freed */
120 u64 data_rsv_refroot; /* which root the reserved data belongs to */
48 struct ulist *old_roots; 121 struct ulist *old_roots;
49}; 122};
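The release side of the two new fields is visible in the btrfs_qgroup_account_extents() hunk earlier in this patch; the recording side (filling them in when the delayed extent is queued) is outside this excerpt. A sketch of the commit-time release, assuming a populated record:

	/* During extent accounting at commit, drop the data reservation
	 * carried by this extent record. */
	btrfs_qgroup_free_refroot(fs_info,
				  record->data_rsv_refroot,	/* owning root */
				  record->data_rsv,		/* bytes to free */
				  BTRFS_QGROUP_RSV_DATA);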
50 123
124struct btrfs_qgroup_swapped_block {
125 struct rb_node node;
126
127 int level;
128 bool trace_leaf;
129
130 /* bytenr/generation of the tree block in subvolume tree after swap */
131 u64 subvol_bytenr;
132 u64 subvol_generation;
133
134 /* bytenr/generation of the tree block in reloc tree after swap */
135 u64 reloc_bytenr;
136 u64 reloc_generation;
137
138 u64 last_snapshot;
139 struct btrfs_key first_key;
140};
141
51/* 142/*
52 * Qgroup reservation types: 143 * Qgroup reservation types:
53 * 144 *
@@ -236,12 +327,6 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
236int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, 327int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
237 struct extent_buffer *root_eb, 328 struct extent_buffer *root_eb,
238 u64 root_gen, int root_level); 329 u64 root_gen, int root_level);
239
240int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
241 struct btrfs_block_group_cache *bg_cache,
242 struct extent_buffer *src_parent, int src_slot,
243 struct extent_buffer *dst_parent, int dst_slot,
244 u64 last_snapshot);
245int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, 330int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
246 u64 num_bytes, struct ulist *old_roots, 331 u64 num_bytes, struct ulist *old_roots,
247 struct ulist *new_roots); 332 struct ulist *new_roots);
@@ -252,15 +337,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
252void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, 337void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
253 u64 ref_root, u64 num_bytes, 338 u64 ref_root, u64 num_bytes,
254 enum btrfs_qgroup_rsv_type type); 339 enum btrfs_qgroup_rsv_type type);
255static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
256 u64 ref_root, u64 num_bytes)
257{
258 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
259 return;
260 trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes);
261 btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes,
262 BTRFS_QGROUP_RSV_DATA);
263}
264 340
265#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 341#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
266int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, 342int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
@@ -325,4 +401,18 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes);
325 401
326void btrfs_qgroup_check_reserved_leak(struct inode *inode); 402void btrfs_qgroup_check_reserved_leak(struct inode *inode);
327 403
404/* btrfs_qgroup_swapped_blocks related functions */
405void btrfs_qgroup_init_swapped_blocks(
406 struct btrfs_qgroup_swapped_blocks *swapped_blocks);
407
408void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root);
409int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
410 struct btrfs_root *subvol_root,
411 struct btrfs_block_group_cache *bg,
412 struct extent_buffer *subvol_parent, int subvol_slot,
413 struct extent_buffer *reloc_parent, int reloc_slot,
414 u64 last_snapshot);
415int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
416 struct btrfs_root *root, struct extent_buffer *eb);
417
328#endif 418#endif
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index c3557c12656b..d09b6cdb785a 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -583,7 +583,7 @@ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path,
583 return -EIO; 583 return -EIO;
584 } 584 }
585 btrfs_tree_read_lock(eb); 585 btrfs_tree_read_lock(eb);
586 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); 586 btrfs_set_lock_blocking_read(eb);
587 path->nodes[level-1] = eb; 587 path->nodes[level-1] = eb;
588 path->slots[level-1] = 0; 588 path->slots[level-1] = 0;
589 path->locks[level-1] = BTRFS_READ_LOCK_BLOCKING; 589 path->locks[level-1] = BTRFS_READ_LOCK_BLOCKING;
@@ -987,7 +987,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
987 return -ENOMEM; 987 return -ENOMEM;
988 988
989 eb = btrfs_read_lock_root_node(fs_info->extent_root); 989 eb = btrfs_read_lock_root_node(fs_info->extent_root);
990 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); 990 btrfs_set_lock_blocking_read(eb);
991 level = btrfs_header_level(eb); 991 level = btrfs_header_level(eb);
992 path->nodes[level] = eb; 992 path->nodes[level] = eb;
993 path->slots[level] = 0; 993 path->slots[level] = 0;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 272b287f8cf0..ddf028509931 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -162,6 +162,8 @@ struct reloc_control {
162 struct mapping_tree reloc_root_tree; 162 struct mapping_tree reloc_root_tree;
163 /* list of reloc trees */ 163 /* list of reloc trees */
164 struct list_head reloc_roots; 164 struct list_head reloc_roots;
165 /* list of subvolume trees that get relocated */
166 struct list_head dirty_subvol_roots;
165 /* size of metadata reservation for merging reloc trees */ 167 /* size of metadata reservation for merging reloc trees */
166 u64 merging_rsv_size; 168 u64 merging_rsv_size;
167 /* size of relocated tree nodes */ 169 /* size of relocated tree nodes */
@@ -1467,15 +1469,17 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
1467 struct btrfs_root_item *root_item; 1469 struct btrfs_root_item *root_item;
1468 int ret; 1470 int ret;
1469 1471
1470 if (!root->reloc_root) 1472 if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state) ||
1473 !root->reloc_root)
1471 goto out; 1474 goto out;
1472 1475
1473 reloc_root = root->reloc_root; 1476 reloc_root = root->reloc_root;
1474 root_item = &reloc_root->root_item; 1477 root_item = &reloc_root->root_item;
1475 1478
 1479 /* root->reloc_root will stay until the current relocation finishes */
1476 if (fs_info->reloc_ctl->merge_reloc_tree && 1480 if (fs_info->reloc_ctl->merge_reloc_tree &&
1477 btrfs_root_refs(root_item) == 0) { 1481 btrfs_root_refs(root_item) == 0) {
1478 root->reloc_root = NULL; 1482 set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
1479 __del_reloc_root(reloc_root); 1483 __del_reloc_root(reloc_root);
1480 } 1484 }
1481 1485
@@ -1773,7 +1777,7 @@ again:
1773 btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); 1777 btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1774 1778
1775 eb = btrfs_lock_root_node(dest); 1779 eb = btrfs_lock_root_node(dest);
1776 btrfs_set_lock_blocking(eb); 1780 btrfs_set_lock_blocking_write(eb);
1777 level = btrfs_header_level(eb); 1781 level = btrfs_header_level(eb);
1778 1782
1779 if (level < lowest_level) { 1783 if (level < lowest_level) {
@@ -1786,7 +1790,7 @@ again:
1786 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); 1790 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1787 BUG_ON(ret); 1791 BUG_ON(ret);
1788 } 1792 }
1789 btrfs_set_lock_blocking(eb); 1793 btrfs_set_lock_blocking_write(eb);
1790 1794
1791 if (next_key) { 1795 if (next_key) {
1792 next_key->objectid = (u64)-1; 1796 next_key->objectid = (u64)-1;
@@ -1802,6 +1806,8 @@ again:
1802 BUG_ON(level < lowest_level); 1806 BUG_ON(level < lowest_level);
1803 1807
1804 ret = btrfs_bin_search(parent, &key, level, &slot); 1808 ret = btrfs_bin_search(parent, &key, level, &slot);
1809 if (ret < 0)
1810 break;
1805 if (ret && slot > 0) 1811 if (ret && slot > 0)
1806 slot--; 1812 slot--;
1807 1813
@@ -1852,7 +1858,7 @@ again:
1852 slot, &eb); 1858 slot, &eb);
1853 BUG_ON(ret); 1859 BUG_ON(ret);
1854 } 1860 }
1855 btrfs_set_lock_blocking(eb); 1861 btrfs_set_lock_blocking_write(eb);
1856 1862
1857 btrfs_tree_unlock(parent); 1863 btrfs_tree_unlock(parent);
1858 free_extent_buffer(parent); 1864 free_extent_buffer(parent);
@@ -1885,15 +1891,18 @@ again:
1885 * If not traced, we will leak data numbers 1891 * If not traced, we will leak data numbers
1886 * 2) Fs subtree 1892 * 2) Fs subtree
1887 * If not traced, we will double count old data 1893 * If not traced, we will double count old data
1888 * and tree block numbers, if current trans doesn't free 1894 *
1889 * data reloc tree inode. 1895 * We don't scan the subtree right now, but only record
1896 * the swapped tree blocks.
1897 * The real subtree rescan is delayed until we have new
1898 * CoW on the subtree root node before transaction commit.
1890 */ 1899 */
1891 ret = btrfs_qgroup_trace_subtree_swap(trans, rc->block_group, 1900 ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
1892 parent, slot, path->nodes[level], 1901 rc->block_group, parent, slot,
1893 path->slots[level], last_snapshot); 1902 path->nodes[level], path->slots[level],
1903 last_snapshot);
1894 if (ret < 0) 1904 if (ret < 0)
1895 break; 1905 break;
1896
1897 /* 1906 /*
1898 * swap blocks in fs tree and reloc tree. 1907 * swap blocks in fs tree and reloc tree.
1899 */ 1908 */
@@ -2121,6 +2130,58 @@ static int find_next_key(struct btrfs_path *path, int level,
2121} 2130}
2122 2131
2123/* 2132/*
2133 * Insert current subvolume into reloc_control::dirty_subvol_roots
2134 */
2135static void insert_dirty_subvol(struct btrfs_trans_handle *trans,
2136 struct reloc_control *rc,
2137 struct btrfs_root *root)
2138{
2139 struct btrfs_root *reloc_root = root->reloc_root;
2140 struct btrfs_root_item *reloc_root_item;
2141
2142 /* @root must be a subvolume tree root with a valid reloc tree */
2143 ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
2144 ASSERT(reloc_root);
2145
2146 reloc_root_item = &reloc_root->root_item;
2147 memset(&reloc_root_item->drop_progress, 0,
2148 sizeof(reloc_root_item->drop_progress));
2149 reloc_root_item->drop_level = 0;
2150 btrfs_set_root_refs(reloc_root_item, 0);
2151 btrfs_update_reloc_root(trans, root);
2152
2153 if (list_empty(&root->reloc_dirty_list)) {
2154 btrfs_grab_fs_root(root);
2155 list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
2156 }
2157}
2158
2159static int clean_dirty_subvols(struct reloc_control *rc)
2160{
2161 struct btrfs_root *root;
2162 struct btrfs_root *next;
2163 int ret = 0;
2164
2165 list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
2166 reloc_dirty_list) {
2167 struct btrfs_root *reloc_root = root->reloc_root;
2168
2169 clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
2170 list_del_init(&root->reloc_dirty_list);
2171 root->reloc_root = NULL;
2172 if (reloc_root) {
2173 int ret2;
2174
2175 ret2 = btrfs_drop_snapshot(reloc_root, NULL, 0, 1);
2176 if (ret2 < 0 && !ret)
2177 ret = ret2;
2178 }
2179 btrfs_put_fs_root(root);
2180 }
2181 return ret;
2182}
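Together these two helpers move the reloc-tree drop out of the merge path: merge_reloc_root() now only queues the subvolume (with the reloc root's refs set to 0 and BTRFS_ROOT_DEAD_RELOC_TREE set), and the actual btrfs_drop_snapshot() runs after the final transaction commit. A condensed sketch of the resulting ordering (control flow only):

	/* In merge_reloc_root(), on success: defer instead of dropping now. */
	insert_dirty_subvol(trans, rc, root);

	/* ... after all reloc roots are merged ... */
	btrfs_commit_transaction(trans);  /* swapped-block records now consumed */

	/* Only now are the reloc trees freed: */
	ret = clean_dirty_subvols(rc);    /* btrfs_drop_snapshot() per queued root */

This keeps root->reloc_root valid for btrfs_qgroup_trace_subtree_after_cow() until relocation has fully finished, which the delayed subtree tracing above depends on.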
2183
2184/*
2124 * merge the relocated tree blocks in reloc tree with corresponding 2185 * merge the relocated tree blocks in reloc tree with corresponding
2125 * fs tree. 2186 * fs tree.
2126 */ 2187 */
@@ -2128,7 +2189,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2128 struct btrfs_root *root) 2189 struct btrfs_root *root)
2129{ 2190{
2130 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2191 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2131 LIST_HEAD(inode_list);
2132 struct btrfs_key key; 2192 struct btrfs_key key;
2133 struct btrfs_key next_key; 2193 struct btrfs_key next_key;
2134 struct btrfs_trans_handle *trans = NULL; 2194 struct btrfs_trans_handle *trans = NULL;
@@ -2259,13 +2319,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2259out: 2319out:
2260 btrfs_free_path(path); 2320 btrfs_free_path(path);
2261 2321
2262 if (err == 0) { 2322 if (err == 0)
2263 memset(&root_item->drop_progress, 0, 2323 insert_dirty_subvol(trans, rc, root);
2264 sizeof(root_item->drop_progress));
2265 root_item->drop_level = 0;
2266 btrfs_set_root_refs(root_item, 0);
2267 btrfs_update_reloc_root(trans, root);
2268 }
2269 2324
2270 if (trans) 2325 if (trans)
2271 btrfs_end_transaction_throttle(trans); 2326 btrfs_end_transaction_throttle(trans);
@@ -2410,14 +2465,6 @@ again:
2410 } else { 2465 } else {
2411 list_del_init(&reloc_root->root_list); 2466 list_del_init(&reloc_root->root_list);
2412 } 2467 }
2413
2414 ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
2415 if (ret < 0) {
2416 if (list_empty(&reloc_root->root_list))
2417 list_add_tail(&reloc_root->root_list,
2418 &reloc_roots);
2419 goto out;
2420 }
2421 } 2468 }
2422 2469
2423 if (found) { 2470 if (found) {
@@ -2685,6 +2732,10 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2685 if (!lowest) { 2732 if (!lowest) {
2686 ret = btrfs_bin_search(upper->eb, key, 2733 ret = btrfs_bin_search(upper->eb, key,
2687 upper->level, &slot); 2734 upper->level, &slot);
2735 if (ret < 0) {
2736 err = ret;
2737 goto next;
2738 }
2688 BUG_ON(ret); 2739 BUG_ON(ret);
2689 bytenr = btrfs_node_blockptr(upper->eb, slot); 2740 bytenr = btrfs_node_blockptr(upper->eb, slot);
2690 if (node->eb->start == bytenr) 2741 if (node->eb->start == bytenr)
@@ -2720,6 +2771,10 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2720 } else { 2771 } else {
2721 ret = btrfs_bin_search(upper->eb, key, upper->level, 2772 ret = btrfs_bin_search(upper->eb, key, upper->level,
2722 &slot); 2773 &slot);
2774 if (ret < 0) {
2775 err = ret;
2776 goto next;
2777 }
2723 BUG_ON(ret); 2778 BUG_ON(ret);
2724 } 2779 }
2725 2780
@@ -2752,7 +2807,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2752 goto next; 2807 goto next;
2753 } 2808 }
2754 btrfs_tree_lock(eb); 2809 btrfs_tree_lock(eb);
2755 btrfs_set_lock_blocking(eb); 2810 btrfs_set_lock_blocking_write(eb);
2756 2811
2757 if (!node->eb) { 2812 if (!node->eb) {
2758 ret = btrfs_cow_block(trans, root, eb, upper->eb, 2813 ret = btrfs_cow_block(trans, root, eb, upper->eb,
@@ -4079,6 +4134,9 @@ restart:
4079 goto out_free; 4134 goto out_free;
4080 } 4135 }
4081 btrfs_commit_transaction(trans); 4136 btrfs_commit_transaction(trans);
4137 ret = clean_dirty_subvols(rc);
4138 if (ret < 0 && !err)
4139 err = ret;
4082out_free: 4140out_free:
4083 btrfs_free_block_rsv(fs_info, rc->block_rsv); 4141 btrfs_free_block_rsv(fs_info, rc->block_rsv);
4084 btrfs_free_path(path); 4142 btrfs_free_path(path);
@@ -4173,6 +4231,7 @@ static struct reloc_control *alloc_reloc_control(void)
4173 return NULL; 4231 return NULL;
4174 4232
4175 INIT_LIST_HEAD(&rc->reloc_roots); 4233 INIT_LIST_HEAD(&rc->reloc_roots);
4234 INIT_LIST_HEAD(&rc->dirty_subvol_roots);
4176 backref_cache_init(&rc->backref_cache); 4235 backref_cache_init(&rc->backref_cache);
4177 mapping_tree_init(&rc->reloc_root_tree); 4236 mapping_tree_init(&rc->reloc_root_tree);
4178 extent_io_tree_init(&rc->processed_blocks, NULL); 4237 extent_io_tree_init(&rc->processed_blocks, NULL);
@@ -4468,6 +4527,10 @@ int btrfs_recover_relocation(struct btrfs_root *root)
4468 goto out_free; 4527 goto out_free;
4469 } 4528 }
4470 err = btrfs_commit_transaction(trans); 4529 err = btrfs_commit_transaction(trans);
4530
4531 ret = clean_dirty_subvols(rc);
4532 if (ret < 0 && !err)
4533 err = ret;
4471out_free: 4534out_free:
4472 kfree(rc); 4535 kfree(rc);
4473out: 4536out:
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 65bda0682928..0d2b957ca3a3 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -21,12 +21,12 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
21 struct btrfs_root_item *item) 21 struct btrfs_root_item *item)
22{ 22{
23 uuid_le uuid; 23 uuid_le uuid;
24 int len; 24 u32 len;
25 int need_reset = 0; 25 int need_reset = 0;
26 26
27 len = btrfs_item_size_nr(eb, slot); 27 len = btrfs_item_size_nr(eb, slot);
28 read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot), 28 read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot),
29 min_t(int, len, (int)sizeof(*item))); 29 min_t(u32, len, sizeof(*item)));
30 if (len < sizeof(*item)) 30 if (len < sizeof(*item))
31 need_reset = 1; 31 need_reset = 1;
32 if (!need_reset && btrfs_root_generation(item) 32 if (!need_reset && btrfs_root_generation(item)
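The int-to-u32 change above matches the return type of btrfs_item_size_nr(), which is unsigned; with min_t(int, ...), a length above INT_MAX would compare as negative (not reachable with real btrfs item sizes, but the unsigned form is type-correct and drops the casts). A tiny standalone demonstration of the hazard, with made-up values:

	#include <stdint.h>
	#include <stdio.h>

	#define min_t(type, a, b) ((type)(a) < (type)(b) ? (type)(a) : (type)(b))

	int main(void)
	{
		uint32_t len = 0x80000010u;	/* hypothetical, > INT_MAX */
		uint32_t item = 439;		/* stand-in for sizeof(*item) */

		/* As int, len wraps negative and wrongly wins the comparison: */
		printf("int: %d\n", min_t(int, len, item));	/* negative */
		/* As u32, the comparison behaves as intended: */
		printf("u32: %u\n", min_t(uint32_t, len, item));	/* 439 */
		return 0;
	}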
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 6dcd36d7b849..a99588536c79 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -584,6 +584,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
584 sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; 584 sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
585 sctx->curr = -1; 585 sctx->curr = -1;
586 sctx->fs_info = fs_info; 586 sctx->fs_info = fs_info;
587 INIT_LIST_HEAD(&sctx->csum_list);
587 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { 588 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
588 struct scrub_bio *sbio; 589 struct scrub_bio *sbio;
589 590
@@ -608,7 +609,6 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
608 atomic_set(&sctx->workers_pending, 0); 609 atomic_set(&sctx->workers_pending, 0);
609 atomic_set(&sctx->cancel_req, 0); 610 atomic_set(&sctx->cancel_req, 0);
610 sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy); 611 sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
611 INIT_LIST_HEAD(&sctx->csum_list);
612 612
613 spin_lock_init(&sctx->list_lock); 613 spin_lock_init(&sctx->list_lock);
614 spin_lock_init(&sctx->stat_lock); 614 spin_lock_init(&sctx->stat_lock);
@@ -3741,25 +3741,33 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
3741 unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; 3741 unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
3742 int max_active = fs_info->thread_pool_size; 3742 int max_active = fs_info->thread_pool_size;
3743 3743
3744 if (fs_info->scrub_workers_refcnt == 0) { 3744 lockdep_assert_held(&fs_info->scrub_lock);
3745
3746 if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
3747 ASSERT(fs_info->scrub_workers == NULL);
3745 fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", 3748 fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub",
3746 flags, is_dev_replace ? 1 : max_active, 4); 3749 flags, is_dev_replace ? 1 : max_active, 4);
3747 if (!fs_info->scrub_workers) 3750 if (!fs_info->scrub_workers)
3748 goto fail_scrub_workers; 3751 goto fail_scrub_workers;
3749 3752
3753 ASSERT(fs_info->scrub_wr_completion_workers == NULL);
3750 fs_info->scrub_wr_completion_workers = 3754 fs_info->scrub_wr_completion_workers =
3751 btrfs_alloc_workqueue(fs_info, "scrubwrc", flags, 3755 btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
3752 max_active, 2); 3756 max_active, 2);
3753 if (!fs_info->scrub_wr_completion_workers) 3757 if (!fs_info->scrub_wr_completion_workers)
3754 goto fail_scrub_wr_completion_workers; 3758 goto fail_scrub_wr_completion_workers;
3755 3759
3760 ASSERT(fs_info->scrub_parity_workers == NULL);
3756 fs_info->scrub_parity_workers = 3761 fs_info->scrub_parity_workers =
3757 btrfs_alloc_workqueue(fs_info, "scrubparity", flags, 3762 btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
3758 max_active, 2); 3763 max_active, 2);
3759 if (!fs_info->scrub_parity_workers) 3764 if (!fs_info->scrub_parity_workers)
3760 goto fail_scrub_parity_workers; 3765 goto fail_scrub_parity_workers;
3766
3767 refcount_set(&fs_info->scrub_workers_refcnt, 1);
3768 } else {
3769 refcount_inc(&fs_info->scrub_workers_refcnt);
3761 } 3770 }
3762 ++fs_info->scrub_workers_refcnt;
3763 return 0; 3771 return 0;
3764 3772
3765fail_scrub_parity_workers: 3773fail_scrub_parity_workers:
@@ -3770,16 +3778,6 @@ fail_scrub_workers:
3770 return -ENOMEM; 3778 return -ENOMEM;
3771} 3779}
3772 3780
3773static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
3774{
3775 if (--fs_info->scrub_workers_refcnt == 0) {
3776 btrfs_destroy_workqueue(fs_info->scrub_workers);
3777 btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
3778 btrfs_destroy_workqueue(fs_info->scrub_parity_workers);
3779 }
3780 WARN_ON(fs_info->scrub_workers_refcnt < 0);
3781}
3782
3783int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, 3781int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
3784 u64 end, struct btrfs_scrub_progress *progress, 3782 u64 end, struct btrfs_scrub_progress *progress,
3785 int readonly, int is_dev_replace) 3783 int readonly, int is_dev_replace)
@@ -3788,6 +3786,9 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
3788 int ret; 3786 int ret;
3789 struct btrfs_device *dev; 3787 struct btrfs_device *dev;
3790 unsigned int nofs_flag; 3788 unsigned int nofs_flag;
3789 struct btrfs_workqueue *scrub_workers = NULL;
3790 struct btrfs_workqueue *scrub_wr_comp = NULL;
3791 struct btrfs_workqueue *scrub_parity = NULL;
3791 3792
3792 if (btrfs_fs_closing(fs_info)) 3793 if (btrfs_fs_closing(fs_info))
3793 return -EINVAL; 3794 return -EINVAL;
@@ -3835,7 +3836,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
3835 return PTR_ERR(sctx); 3836 return PTR_ERR(sctx);
3836 3837
3837 mutex_lock(&fs_info->fs_devices->device_list_mutex); 3838 mutex_lock(&fs_info->fs_devices->device_list_mutex);
3838 dev = btrfs_find_device(fs_info, devid, NULL, NULL); 3839 dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
3839 if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) && 3840 if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
3840 !is_dev_replace)) { 3841 !is_dev_replace)) {
3841 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3842 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -3903,6 +3904,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
3903 */ 3904 */
3904 nofs_flag = memalloc_nofs_save(); 3905 nofs_flag = memalloc_nofs_save();
3905 if (!is_dev_replace) { 3906 if (!is_dev_replace) {
3907 btrfs_info(fs_info, "scrub: started on devid %llu", devid);
3906 /* 3908 /*
3907 * by holding device list mutex, we can 3909 * by holding device list mutex, we can
3908 * kick off writing super in log tree sync. 3910 * kick off writing super in log tree sync.
@@ -3925,11 +3927,26 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
3925 if (progress) 3927 if (progress)
3926 memcpy(progress, &sctx->stat, sizeof(*progress)); 3928 memcpy(progress, &sctx->stat, sizeof(*progress));
3927 3929
3930 if (!is_dev_replace)
3931 btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
3932 ret ? "not finished" : "finished", devid, ret);
3933
3928 mutex_lock(&fs_info->scrub_lock); 3934 mutex_lock(&fs_info->scrub_lock);
3929 dev->scrub_ctx = NULL; 3935 dev->scrub_ctx = NULL;
3930 scrub_workers_put(fs_info); 3936 if (refcount_dec_and_test(&fs_info->scrub_workers_refcnt)) {
3937 scrub_workers = fs_info->scrub_workers;
3938 scrub_wr_comp = fs_info->scrub_wr_completion_workers;
3939 scrub_parity = fs_info->scrub_parity_workers;
3940
3941 fs_info->scrub_workers = NULL;
3942 fs_info->scrub_wr_completion_workers = NULL;
3943 fs_info->scrub_parity_workers = NULL;
3944 }
3931 mutex_unlock(&fs_info->scrub_lock); 3945 mutex_unlock(&fs_info->scrub_lock);
3932 3946
3947 btrfs_destroy_workqueue(scrub_workers);
3948 btrfs_destroy_workqueue(scrub_wr_comp);
3949 btrfs_destroy_workqueue(scrub_parity);
3933 scrub_put_ctx(sctx); 3950 scrub_put_ctx(sctx);
3934 3951
3935 return ret; 3952 return ret;
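The teardown above follows a two-phase pattern: the counter becomes a refcount_t, and on the final put the workqueue pointers are detached while holding scrub_lock but destroyed only after it is released, so flushing the queues never happens under scrub_lock. Schematically (condensed from the hunk above):

	mutex_lock(&fs_info->scrub_lock);
	dev->scrub_ctx = NULL;
	if (refcount_dec_and_test(&fs_info->scrub_workers_refcnt)) {
		/* last user: detach under the lock ... */
		scrub_workers = fs_info->scrub_workers;
		fs_info->scrub_workers = NULL;
		/* (likewise for the wr_completion and parity queues) */
	}
	mutex_unlock(&fs_info->scrub_lock);

	/* ... destroy outside it; the locals stay NULL when we were not
	 * the last user, and btrfs_destroy_workqueue() tolerates NULL
	 * (the hunk above relies on this). */
	btrfs_destroy_workqueue(scrub_workers);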
@@ -4012,7 +4029,7 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
4012 struct scrub_ctx *sctx = NULL; 4029 struct scrub_ctx *sctx = NULL;
4013 4030
4014 mutex_lock(&fs_info->fs_devices->device_list_mutex); 4031 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4015 dev = btrfs_find_device(fs_info, devid, NULL, NULL); 4032 dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
4016 if (dev) 4033 if (dev)
4017 sctx = dev->scrub_ctx; 4034 sctx = dev->scrub_ctx;
4018 if (sctx) 4035 if (sctx)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0a3f122dd61f..120e4340792a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -529,7 +529,9 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
529 if (token != Opt_compress && 529 if (token != Opt_compress &&
530 token != Opt_compress_force) 530 token != Opt_compress_force)
531 info->compress_level = 531 info->compress_level =
532 btrfs_compress_str2level(args[0].from); 532 btrfs_compress_str2level(
533 BTRFS_COMPRESS_ZLIB,
534 args[0].from + 4);
533 btrfs_set_opt(info->mount_opt, COMPRESS); 535 btrfs_set_opt(info->mount_opt, COMPRESS);
534 btrfs_clear_opt(info->mount_opt, NODATACOW); 536 btrfs_clear_opt(info->mount_opt, NODATACOW);
535 btrfs_clear_opt(info->mount_opt, NODATASUM); 537 btrfs_clear_opt(info->mount_opt, NODATASUM);
@@ -542,9 +544,13 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
542 btrfs_clear_opt(info->mount_opt, NODATASUM); 544 btrfs_clear_opt(info->mount_opt, NODATASUM);
543 btrfs_set_fs_incompat(info, COMPRESS_LZO); 545 btrfs_set_fs_incompat(info, COMPRESS_LZO);
544 no_compress = 0; 546 no_compress = 0;
545 } else if (strcmp(args[0].from, "zstd") == 0) { 547 } else if (strncmp(args[0].from, "zstd", 4) == 0) {
546 compress_type = "zstd"; 548 compress_type = "zstd";
547 info->compress_type = BTRFS_COMPRESS_ZSTD; 549 info->compress_type = BTRFS_COMPRESS_ZSTD;
550 info->compress_level =
551 btrfs_compress_str2level(
552 BTRFS_COMPRESS_ZSTD,
553 args[0].from + 4);
548 btrfs_set_opt(info->mount_opt, COMPRESS); 554 btrfs_set_opt(info->mount_opt, COMPRESS);
549 btrfs_clear_opt(info->mount_opt, NODATACOW); 555 btrfs_clear_opt(info->mount_opt, NODATACOW);
550 btrfs_clear_opt(info->mount_opt, NODATASUM); 556 btrfs_clear_opt(info->mount_opt, NODATASUM);
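With these hunks, both the zlib and zstd branches accept an optional level suffix, e.g. mount -o compress=zstd:3. Since args[0].from points at the option value, from + 4 skips the four-character algorithm name and hands the remainder (":3", or an empty string for the default level) to btrfs_compress_str2level() together with the algorithm. A sketch of the zstd branch in isolation, assuming opt = args[0].from:

	if (strncmp(opt, "zstd", 4) == 0) {
		compress_type = "zstd";
		info->compress_type = BTRFS_COMPRESS_ZSTD;
		/* ":3" -> level 3; "" -> the algorithm's default level */
		info->compress_level =
			btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, opt + 4);
	}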
@@ -2190,6 +2196,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
2190 ret = PTR_ERR_OR_ZERO(device); 2196 ret = PTR_ERR_OR_ZERO(device);
2191 mutex_unlock(&uuid_mutex); 2197 mutex_unlock(&uuid_mutex);
2192 break; 2198 break;
2199 case BTRFS_IOC_FORGET_DEV:
2200 ret = btrfs_forget_devices(vol->name);
2201 break;
2193 case BTRFS_IOC_DEVICES_READY: 2202 case BTRFS_IOC_DEVICES_READY:
2194 mutex_lock(&uuid_mutex); 2203 mutex_lock(&uuid_mutex);
2195 device = btrfs_scan_one_device(vol->name, FMODE_READ, 2204 device = btrfs_scan_one_device(vol->name, FMODE_READ,
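BTRFS_IOC_FORGET_DEV removes stale entries from the kernel's scanned-device cache via the /dev/btrfs-control node (this is what btrfs device scan --forget is built on). A minimal user-space sketch; treating an empty name as "forget all unmounted devices" is an assumption based on btrfs-progs behaviour:

	#include <fcntl.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <linux/btrfs.h>

	/* Ask the control device to forget one scanned, unmounted device
	 * (or all of them when path is NULL). */
	static int forget_device(const char *path)
	{
		struct btrfs_ioctl_vol_args args;
		int fd, ret;

		fd = open("/dev/btrfs-control", O_RDWR);
		if (fd < 0)
			return -1;
		memset(&args, 0, sizeof(args));
		if (path)
			strncpy(args.name, path, sizeof(args.name) - 1);
		ret = ioctl(fd, BTRFS_IOC_FORGET_DEV, &args);
		close(fd);
		return ret;
	}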
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 4ec2b660d014..acdad6d658f5 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -122,6 +122,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans)
122 if (is_fstree(root->root_key.objectid)) 122 if (is_fstree(root->root_key.objectid))
123 btrfs_unpin_free_ino(root); 123 btrfs_unpin_free_ino(root);
124 clear_btree_io_tree(&root->dirty_log_pages); 124 clear_btree_io_tree(&root->dirty_log_pages);
125 btrfs_qgroup_clean_swapped_blocks(root);
125 } 126 }
126 127
127 /* We can free old roots now. */ 128 /* We can free old roots now. */
@@ -845,8 +846,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
845 btrfs_trans_release_metadata(trans); 846 btrfs_trans_release_metadata(trans);
846 trans->block_rsv = NULL; 847 trans->block_rsv = NULL;
847 848
848 if (!list_empty(&trans->new_bgs)) 849 btrfs_create_pending_block_groups(trans);
849 btrfs_create_pending_block_groups(trans);
850 850
851 btrfs_trans_release_chunk_metadata(trans); 851 btrfs_trans_release_chunk_metadata(trans);
852 852
@@ -1532,7 +1532,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1532 goto fail; 1532 goto fail;
1533 } 1533 }
1534 1534
1535 btrfs_set_lock_blocking(old); 1535 btrfs_set_lock_blocking_write(old);
1536 1536
1537 ret = btrfs_copy_root(trans, root, old, &tmp, objectid); 1537 ret = btrfs_copy_root(trans, root, old, &tmp, objectid);
1538 /* clean up in any case */ 1538 /* clean up in any case */
@@ -1943,8 +1943,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
1943 cur_trans->delayed_refs.flushing = 1; 1943 cur_trans->delayed_refs.flushing = 1;
1944 smp_wmb(); 1944 smp_wmb();
1945 1945
1946 if (!list_empty(&trans->new_bgs)) 1946 btrfs_create_pending_block_groups(trans);
1947 btrfs_create_pending_block_groups(trans);
1948 1947
1949 ret = btrfs_run_delayed_refs(trans, 0); 1948 ret = btrfs_run_delayed_refs(trans, 0);
1950 if (ret) { 1949 if (ret) {
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 3c0987ab587d..5f9e2dd413af 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -52,7 +52,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
52 u32 nritems; 52 u32 nritems;
53 53
54 root_node = btrfs_lock_root_node(root); 54 root_node = btrfs_lock_root_node(root);
55 btrfs_set_lock_blocking(root_node); 55 btrfs_set_lock_blocking_write(root_node);
56 nritems = btrfs_header_nritems(root_node); 56 nritems = btrfs_header_nritems(root_node);
57 root->defrag_max.objectid = 0; 57 root->defrag_max.objectid = 0;
58 /* from above we know this is not a leaf */ 58 /* from above we know this is not a leaf */
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index ac232b3d6d7e..f06454a55e00 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -27,6 +27,7 @@
27#define LOG_INODE_ALL 0 27#define LOG_INODE_ALL 0
28#define LOG_INODE_EXISTS 1 28#define LOG_INODE_EXISTS 1
29#define LOG_OTHER_INODE 2 29#define LOG_OTHER_INODE 2
30#define LOG_OTHER_INODE_ALL 3
30 31
31/* 32/*
32 * directory trouble cases 33 * directory trouble cases
@@ -1330,6 +1331,67 @@ out:
1330 return ret; 1331 return ret;
1331} 1332}
1332 1333
1334static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1335 struct inode *dir, struct inode *inode, const char *name,
1336 int namelen, u64 ref_index)
1337{
1338 struct btrfs_dir_item *dir_item;
1339 struct btrfs_key key;
1340 struct btrfs_path *path;
1341 struct inode *other_inode = NULL;
1342 int ret;
1343
1344 path = btrfs_alloc_path();
1345 if (!path)
1346 return -ENOMEM;
1347
1348 dir_item = btrfs_lookup_dir_item(NULL, root, path,
1349 btrfs_ino(BTRFS_I(dir)),
1350 name, namelen, 0);
1351 if (!dir_item) {
1352 btrfs_release_path(path);
1353 goto add_link;
1354 } else if (IS_ERR(dir_item)) {
1355 ret = PTR_ERR(dir_item);
1356 goto out;
1357 }
1358
1359 /*
1360 * Our inode's dentry collides with the dentry of another inode which is
1361 * in the log but not yet processed since it has a higher inode number.
1362 * So delete that other dentry.
1363 */
1364 btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
1365 btrfs_release_path(path);
1366 other_inode = read_one_inode(root, key.objectid);
1367 if (!other_inode) {
1368 ret = -ENOENT;
1369 goto out;
1370 }
1371 ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
1372 name, namelen);
1373 if (ret)
1374 goto out;
1375 /*
1376 * If we dropped the link count to 0, bump it so that later the iput()
 1377 * on the inode will not free it. We will fix up the link count later.
1378 */
1379 if (other_inode->i_nlink == 0)
1380 inc_nlink(other_inode);
1381
1382 ret = btrfs_run_delayed_items(trans);
1383 if (ret)
1384 goto out;
1385add_link:
1386 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
1387 name, namelen, 0, ref_index);
1388out:
1389 iput(other_inode);
1390 btrfs_free_path(path);
1391
1392 return ret;
1393}
1394
1333/* 1395/*
1334 * replay one inode back reference item found in the log tree. 1396 * replay one inode back reference item found in the log tree.
1335 * eb, slot and key refer to the buffer and key found in the log tree. 1397 * eb, slot and key refer to the buffer and key found in the log tree.
@@ -1466,9 +1528,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1466 goto out; 1528 goto out;
1467 1529
1468 /* insert our name */ 1530 /* insert our name */
1469 ret = btrfs_add_link(trans, BTRFS_I(dir), 1531 ret = add_link(trans, root, dir, inode, name, namelen,
1470 BTRFS_I(inode), 1532 ref_index);
1471 name, namelen, 0, ref_index);
1472 if (ret) 1533 if (ret)
1473 goto out; 1534 goto out;
1474 1535
@@ -2663,7 +2724,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2663 2724
2664 if (trans) { 2725 if (trans) {
2665 btrfs_tree_lock(next); 2726 btrfs_tree_lock(next);
2666 btrfs_set_lock_blocking(next); 2727 btrfs_set_lock_blocking_write(next);
2667 clean_tree_block(fs_info, next); 2728 clean_tree_block(fs_info, next);
2668 btrfs_wait_tree_block_writeback(next); 2729 btrfs_wait_tree_block_writeback(next);
2669 btrfs_tree_unlock(next); 2730 btrfs_tree_unlock(next);
@@ -2747,7 +2808,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2747 2808
2748 if (trans) { 2809 if (trans) {
2749 btrfs_tree_lock(next); 2810 btrfs_tree_lock(next);
2750 btrfs_set_lock_blocking(next); 2811 btrfs_set_lock_blocking_write(next);
2751 clean_tree_block(fs_info, next); 2812 clean_tree_block(fs_info, next);
2752 btrfs_wait_tree_block_writeback(next); 2813 btrfs_wait_tree_block_writeback(next);
2753 btrfs_tree_unlock(next); 2814 btrfs_tree_unlock(next);
@@ -2829,7 +2890,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
2829 2890
2830 if (trans) { 2891 if (trans) {
2831 btrfs_tree_lock(next); 2892 btrfs_tree_lock(next);
2832 btrfs_set_lock_blocking(next); 2893 btrfs_set_lock_blocking_write(next);
2833 clean_tree_block(fs_info, next); 2894 clean_tree_block(fs_info, next);
2834 btrfs_wait_tree_block_writeback(next); 2895 btrfs_wait_tree_block_writeback(next);
2835 btrfs_tree_unlock(next); 2896 btrfs_tree_unlock(next);
@@ -3706,6 +3767,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
3706 found_key.type = 0; 3767 found_key.type = 0;
3707 ret = btrfs_bin_search(path->nodes[0], &found_key, 0, 3768 ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
3708 &start_slot); 3769 &start_slot);
3770 if (ret < 0)
3771 break;
3709 3772
3710 ret = btrfs_del_items(trans, log, path, start_slot, 3773 ret = btrfs_del_items(trans, log, path, start_slot,
3711 path->slots[0] - start_slot + 1); 3774 path->slots[0] - start_slot + 1);
@@ -4717,7 +4780,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
4717 const int slot, 4780 const int slot,
4718 const struct btrfs_key *key, 4781 const struct btrfs_key *key,
4719 struct btrfs_inode *inode, 4782 struct btrfs_inode *inode,
4720 u64 *other_ino) 4783 u64 *other_ino, u64 *other_parent)
4721{ 4784{
4722 int ret; 4785 int ret;
4723 struct btrfs_path *search_path; 4786 struct btrfs_path *search_path;
@@ -4780,8 +4843,13 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
4780 btrfs_dir_item_key_to_cpu(search_path->nodes[0], 4843 btrfs_dir_item_key_to_cpu(search_path->nodes[0],
4781 di, &di_key); 4844 di, &di_key);
4782 if (di_key.type == BTRFS_INODE_ITEM_KEY) { 4845 if (di_key.type == BTRFS_INODE_ITEM_KEY) {
4783 ret = 1; 4846 if (di_key.objectid != key->objectid) {
4784 *other_ino = di_key.objectid; 4847 ret = 1;
4848 *other_ino = di_key.objectid;
4849 *other_parent = parent;
4850 } else {
4851 ret = 0;
4852 }
4785 } else { 4853 } else {
4786 ret = -EAGAIN; 4854 ret = -EAGAIN;
4787 } 4855 }
@@ -4801,6 +4869,144 @@ out:
4801 return ret; 4869 return ret;
4802} 4870}
4803 4871
4872struct btrfs_ino_list {
4873 u64 ino;
4874 u64 parent;
4875 struct list_head list;
4876};
4877
4878static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
4879 struct btrfs_root *root,
4880 struct btrfs_path *path,
4881 struct btrfs_log_ctx *ctx,
4882 u64 ino, u64 parent)
4883{
4884 struct btrfs_ino_list *ino_elem;
4885 LIST_HEAD(inode_list);
4886 int ret = 0;
4887
4888 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
4889 if (!ino_elem)
4890 return -ENOMEM;
4891 ino_elem->ino = ino;
4892 ino_elem->parent = parent;
4893 list_add_tail(&ino_elem->list, &inode_list);
4894
4895 while (!list_empty(&inode_list)) {
4896 struct btrfs_fs_info *fs_info = root->fs_info;
4897 struct btrfs_key key;
4898 struct inode *inode;
4899
4900 ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
4901 list);
4902 ino = ino_elem->ino;
4903 parent = ino_elem->parent;
4904 list_del(&ino_elem->list);
4905 kfree(ino_elem);
4906 if (ret)
4907 continue;
4908
4909 btrfs_release_path(path);
4910
4911 key.objectid = ino;
4912 key.type = BTRFS_INODE_ITEM_KEY;
4913 key.offset = 0;
4914 inode = btrfs_iget(fs_info->sb, &key, root, NULL);
4915 /*
4916 * If the other inode that had a conflicting dir entry was
4917 * deleted in the current transaction, we need to log its parent
4918 * directory.
4919 */
4920 if (IS_ERR(inode)) {
4921 ret = PTR_ERR(inode);
4922 if (ret == -ENOENT) {
4923 key.objectid = parent;
4924 inode = btrfs_iget(fs_info->sb, &key, root,
4925 NULL);
4926 if (IS_ERR(inode)) {
4927 ret = PTR_ERR(inode);
4928 } else {
4929 ret = btrfs_log_inode(trans, root,
4930 BTRFS_I(inode),
4931 LOG_OTHER_INODE_ALL,
4932 0, LLONG_MAX, ctx);
4933 iput(inode);
4934 }
4935 }
4936 continue;
4937 }
4938 /*
4939 * We are safe logging the other inode without acquiring its
4940 * lock as long as we log with the LOG_INODE_EXISTS mode. We
4941 * are safe against concurrent renames of the other inode as
4942 * well because during a rename we pin the log and update the
4943 * log with the new name before we unpin it.
4944 */
4945 ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
4946 LOG_OTHER_INODE, 0, LLONG_MAX, ctx);
4947 if (ret) {
4948 iput(inode);
4949 continue;
4950 }
4951
4952 key.objectid = ino;
4953 key.type = BTRFS_INODE_REF_KEY;
4954 key.offset = 0;
4955 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4956 if (ret < 0) {
4957 iput(inode);
4958 continue;
4959 }
4960
4961 while (true) {
4962 struct extent_buffer *leaf = path->nodes[0];
4963 int slot = path->slots[0];
4964 u64 other_ino = 0;
4965 u64 other_parent = 0;
4966
4967 if (slot >= btrfs_header_nritems(leaf)) {
4968 ret = btrfs_next_leaf(root, path);
4969 if (ret < 0) {
4970 break;
4971 } else if (ret > 0) {
4972 ret = 0;
4973 break;
4974 }
4975 continue;
4976 }
4977
4978 btrfs_item_key_to_cpu(leaf, &key, slot);
4979 if (key.objectid != ino ||
4980 (key.type != BTRFS_INODE_REF_KEY &&
4981 key.type != BTRFS_INODE_EXTREF_KEY)) {
4982 ret = 0;
4983 break;
4984 }
4985
4986 ret = btrfs_check_ref_name_override(leaf, slot, &key,
4987 BTRFS_I(inode), &other_ino,
4988 &other_parent);
4989 if (ret < 0)
4990 break;
4991 if (ret > 0) {
4992 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
4993 if (!ino_elem) {
4994 ret = -ENOMEM;
4995 break;
4996 }
4997 ino_elem->ino = other_ino;
4998 ino_elem->parent = other_parent;
4999 list_add_tail(&ino_elem->list, &inode_list);
5000 ret = 0;
5001 }
5002 path->slots[0]++;
5003 }
5004 iput(inode);
5005 }
5006
5007 return ret;
5008}
5009
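log_conflicting_inodes() replaces the old single-level recursion with a worklist: every conflict discovered while logging one inode is appended to inode_list, and the "if (ret) continue;" at the top keeps draining (and freeing) queued entries after an error so nothing leaks. The same shape, reduced to its bones with hypothetical types and helpers:

	/* Worklist drain with leak-free error handling (types are stand-ins). */
	while (!list_empty(&work)) {
		elem = list_first_entry(&work, struct work_item, list);
		key = elem->key;		/* copy out what we need */
		list_del(&elem->list);
		kfree(elem);
		if (ret)
			continue;		/* past an error: free, don't process */
		ret = process(key, &work);	/* may queue follow-up items */
	}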
4804/* log a single inode in the tree log. 5010/* log a single inode in the tree log.
4805 * At least one parent directory for this inode must exist in the tree 5011 * At least one parent directory for this inode must exist in the tree
4806 * or be logged already. 5012 * or be logged already.
@@ -4840,6 +5046,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
4840 u64 logged_isize = 0; 5046 u64 logged_isize = 0;
4841 bool need_log_inode_item = true; 5047 bool need_log_inode_item = true;
4842 bool xattrs_logged = false; 5048 bool xattrs_logged = false;
5049 bool recursive_logging = false;
4843 5050
4844 path = btrfs_alloc_path(); 5051 path = btrfs_alloc_path();
4845 if (!path) 5052 if (!path)
@@ -4885,8 +5092,12 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
4885 return ret; 5092 return ret;
4886 } 5093 }
4887 5094
4888 if (inode_only == LOG_OTHER_INODE) { 5095 if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
4889 inode_only = LOG_INODE_EXISTS; 5096 recursive_logging = true;
5097 if (inode_only == LOG_OTHER_INODE)
5098 inode_only = LOG_INODE_EXISTS;
5099 else
5100 inode_only = LOG_INODE_ALL;
4890 mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING); 5101 mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
4891 } else { 5102 } else {
4892 mutex_lock(&inode->log_mutex); 5103 mutex_lock(&inode->log_mutex);
@@ -4981,20 +5192,19 @@ again:
4981 5192
4982 if ((min_key.type == BTRFS_INODE_REF_KEY || 5193 if ((min_key.type == BTRFS_INODE_REF_KEY ||
4983 min_key.type == BTRFS_INODE_EXTREF_KEY) && 5194 min_key.type == BTRFS_INODE_EXTREF_KEY) &&
4984 inode->generation == trans->transid) { 5195 inode->generation == trans->transid &&
5196 !recursive_logging) {
4985 u64 other_ino = 0; 5197 u64 other_ino = 0;
5198 u64 other_parent = 0;
4986 5199
4987 ret = btrfs_check_ref_name_override(path->nodes[0], 5200 ret = btrfs_check_ref_name_override(path->nodes[0],
4988 path->slots[0], &min_key, inode, 5201 path->slots[0], &min_key, inode,
4989 &other_ino); 5202 &other_ino, &other_parent);
4990 if (ret < 0) { 5203 if (ret < 0) {
4991 err = ret; 5204 err = ret;
4992 goto out_unlock; 5205 goto out_unlock;
4993 } else if (ret > 0 && ctx && 5206 } else if (ret > 0 && ctx &&
4994 other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { 5207 other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
4995 struct btrfs_key inode_key;
4996 struct inode *other_inode;
4997
4998 if (ins_nr > 0) { 5208 if (ins_nr > 0) {
4999 ins_nr++; 5209 ins_nr++;
5000 } else { 5210 } else {
@@ -5010,43 +5220,13 @@ again:
5010 goto out_unlock; 5220 goto out_unlock;
5011 } 5221 }
5012 ins_nr = 0; 5222 ins_nr = 0;
5013 btrfs_release_path(path); 5223
5014 inode_key.objectid = other_ino; 5224 err = log_conflicting_inodes(trans, root, path,
5015 inode_key.type = BTRFS_INODE_ITEM_KEY; 5225 ctx, other_ino, other_parent);
5016 inode_key.offset = 0;
5017 other_inode = btrfs_iget(fs_info->sb,
5018 &inode_key, root,
5019 NULL);
5020 /*
5021 * If the other inode that had a conflicting dir
5022 * entry was deleted in the current transaction,
5023 * we don't need to do more work nor fallback to
5024 * a transaction commit.
5025 */
5026 if (other_inode == ERR_PTR(-ENOENT)) {
5027 goto next_key;
5028 } else if (IS_ERR(other_inode)) {
5029 err = PTR_ERR(other_inode);
5030 goto out_unlock;
5031 }
5032 /*
5033 * We are safe logging the other inode without
5034 * acquiring its i_mutex as long as we log with
5035 * the LOG_INODE_EXISTS mode. We're safe against
5036 * concurrent renames of the other inode as well
5037 * because during a rename we pin the log and
5038 * update the log with the new name before we
5039 * unpin it.
5040 */
5041 err = btrfs_log_inode(trans, root,
5042 BTRFS_I(other_inode),
5043 LOG_OTHER_INODE, 0, LLONG_MAX,
5044 ctx);
5045 iput(other_inode);
5046 if (err) 5226 if (err)
5047 goto out_unlock; 5227 goto out_unlock;
5048 else 5228 btrfs_release_path(path);
5049 goto next_key; 5229 goto next_key;
5050 } 5230 }
5051 } 5231 }
5052 5232
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 15561926ab32..9024eee889b9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -415,27 +415,6 @@ static struct btrfs_device *__alloc_device(void)
415 return dev; 415 return dev;
416} 416}
417 417
418/*
419 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
420 * return NULL.
421 *
422 * If devid and uuid are both specified, the match must be exact, otherwise
423 * only devid is used.
424 */
425static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
426 u64 devid, const u8 *uuid)
427{
428 struct btrfs_device *dev;
429
430 list_for_each_entry(dev, &fs_devices->devices, dev_list) {
431 if (dev->devid == devid &&
432 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
433 return dev;
434 }
435 }
436 return NULL;
437}
438
439static noinline struct btrfs_fs_devices *find_fsid( 418static noinline struct btrfs_fs_devices *find_fsid(
440 const u8 *fsid, const u8 *metadata_fsid) 419 const u8 *fsid, const u8 *metadata_fsid)
441{ 420{
@@ -734,6 +713,17 @@ static void pending_bios_fn(struct btrfs_work *work)
734 run_scheduled_bios(device); 713 run_scheduled_bios(device);
735} 714}
736 715
716static bool device_path_matched(const char *path, struct btrfs_device *device)
717{
718 int found;
719
720 rcu_read_lock();
721 found = strcmp(rcu_str_deref(device->name), path);
722 rcu_read_unlock();
723
724 return found == 0;
725}
726
737/* 727/*
738 * Search and remove all stale (devices which are not mounted) devices. 728 * Search and remove all stale (devices which are not mounted) devices.
739 * When both inputs are NULL, it will search and release all stale devices. 729 * When both inputs are NULL, it will search and release all stale devices.
@@ -741,52 +731,57 @@ static void pending_bios_fn(struct btrfs_work *work)
741 * matching this path only. 731 * matching this path only.
742 * skip_dev: Optional. Will skip this device when searching for the stale 732 * skip_dev: Optional. Will skip this device when searching for the stale
743 * devices. 733 * devices.
734 * Return: 0 for success or if @path is NULL.
735 * -EBUSY if @path is a mounted device.
736 * -ENOENT if @path does not match any device in the list.
744 */ 737 */
745static void btrfs_free_stale_devices(const char *path, 738static int btrfs_free_stale_devices(const char *path,
746 struct btrfs_device *skip_device) 739 struct btrfs_device *skip_device)
747{ 740{
748 struct btrfs_fs_devices *fs_devices, *tmp_fs_devices; 741 struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
749 struct btrfs_device *device, *tmp_device; 742 struct btrfs_device *device, *tmp_device;
743 int ret = 0;
744
745 if (path)
746 ret = -ENOENT;
750 747
751 list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) { 748 list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
752 mutex_lock(&fs_devices->device_list_mutex);
753 if (fs_devices->opened) {
754 mutex_unlock(&fs_devices->device_list_mutex);
755 continue;
756 }
757 749
750 mutex_lock(&fs_devices->device_list_mutex);
758 list_for_each_entry_safe(device, tmp_device, 751 list_for_each_entry_safe(device, tmp_device,
759 &fs_devices->devices, dev_list) { 752 &fs_devices->devices, dev_list) {
760 int not_found = 0;
761
762 if (skip_device && skip_device == device) 753 if (skip_device && skip_device == device)
763 continue; 754 continue;
764 if (path && !device->name) 755 if (path && !device->name)
765 continue; 756 continue;
766 757 if (path && !device_path_matched(path, device))
767 rcu_read_lock();
768 if (path)
769 not_found = strcmp(rcu_str_deref(device->name),
770 path);
771 rcu_read_unlock();
772 if (not_found)
773 continue; 758 continue;
759 if (fs_devices->opened) {
760 /* for an already deleted device return 0 */
761 if (path && ret != 0)
762 ret = -EBUSY;
763 break;
764 }
774 765
775 /* delete the stale device */ 766 /* delete the stale device */
776 fs_devices->num_devices--; 767 fs_devices->num_devices--;
777 list_del(&device->dev_list); 768 list_del(&device->dev_list);
778 btrfs_free_device(device); 769 btrfs_free_device(device);
779 770
771 ret = 0;
780 if (fs_devices->num_devices == 0) 772 if (fs_devices->num_devices == 0)
781 break; 773 break;
782 } 774 }
783 mutex_unlock(&fs_devices->device_list_mutex); 775 mutex_unlock(&fs_devices->device_list_mutex);
776
784 if (fs_devices->num_devices == 0) { 777 if (fs_devices->num_devices == 0) {
785 btrfs_sysfs_remove_fsid(fs_devices); 778 btrfs_sysfs_remove_fsid(fs_devices);
786 list_del(&fs_devices->fs_list); 779 list_del(&fs_devices->fs_list);
787 free_fs_devices(fs_devices); 780 free_fs_devices(fs_devices);
788 } 781 }
789 } 782 }
783
784 return ret;
790} 785}
791 786
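With the void return replaced by an int, callers can now distinguish "nothing matched" from "matched but mounted". A hedged sketch of caller-side handling of the documented codes (the messages are illustrative, not kernel output):

    #include <errno.h>
    #include <stdio.h>

    static void report_forget(int ret)
    {
        switch (ret) {
        case 0:
            printf("stale device(s) forgotten\n");
            break;
        case -EBUSY:
            printf("device is mounted, not forgotten\n");
            break;
        case -ENOENT:
            printf("no such scanned device\n");
            break;
        default:
            printf("unexpected error %d\n", ret);
        }
    }

    int main(void)
    {
        report_forget(-EBUSY);
        return 0;
    }
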
792static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, 787static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
@@ -968,8 +963,8 @@ static noinline struct btrfs_device *device_list_add(const char *path,
968 device = NULL; 963 device = NULL;
969 } else { 964 } else {
970 mutex_lock(&fs_devices->device_list_mutex); 965 mutex_lock(&fs_devices->device_list_mutex);
971 device = find_device(fs_devices, devid, 966 device = btrfs_find_device(fs_devices, devid,
972 disk_super->dev_item.uuid); 967 disk_super->dev_item.uuid, NULL, false);
973 968
974 /* 969 /*
975 * If this disk has been pulled into an fs devices created by 970 * If this disk has been pulled into an fs devices created by
@@ -1134,7 +1129,6 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
1134 mutex_lock(&orig->device_list_mutex); 1129 mutex_lock(&orig->device_list_mutex);
1135 fs_devices->total_devices = orig->total_devices; 1130 fs_devices->total_devices = orig->total_devices;
1136 1131
1137 /* We have held the volume lock, it is safe to get the devices. */
1138 list_for_each_entry(orig_dev, &orig->devices, dev_list) { 1132 list_for_each_entry(orig_dev, &orig->devices, dev_list) {
1139 struct rcu_string *name; 1133 struct rcu_string *name;
1140 1134
@@ -1451,6 +1445,17 @@ static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
1451 return 0; 1445 return 0;
1452} 1446}
1453 1447
1448int btrfs_forget_devices(const char *path)
1449{
1450 int ret;
1451
1452 mutex_lock(&uuid_mutex);
1453 ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
1454 mutex_unlock(&uuid_mutex);
1455
1456 return ret;
1457}
1458
1454/* 1459/*
1455 * Look for a btrfs signature on a device. This may be called out of the mount path 1460 * Look for a btrfs signature on a device. This may be called out of the mount path
1456 * and we are not allowed to call set_blocksize during the scan. The superblock 1461 * and we are not allowed to call set_blocksize during the scan. The superblock
@@ -2385,11 +2390,11 @@ static struct btrfs_device *btrfs_find_device_by_path(
2385 devid = btrfs_stack_device_id(&disk_super->dev_item); 2390 devid = btrfs_stack_device_id(&disk_super->dev_item);
2386 dev_uuid = disk_super->dev_item.uuid; 2391 dev_uuid = disk_super->dev_item.uuid;
2387 if (btrfs_fs_incompat(fs_info, METADATA_UUID)) 2392 if (btrfs_fs_incompat(fs_info, METADATA_UUID))
2388 device = btrfs_find_device(fs_info, devid, dev_uuid, 2393 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2389 disk_super->metadata_uuid); 2394 disk_super->metadata_uuid, true);
2390 else 2395 else
2391 device = btrfs_find_device(fs_info, devid, 2396 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2392 dev_uuid, disk_super->fsid); 2397 disk_super->fsid, true);
2393 2398
2394 brelse(bh); 2399 brelse(bh);
2395 if (!device) 2400 if (!device)
@@ -2398,50 +2403,38 @@ static struct btrfs_device *btrfs_find_device_by_path(
2398 return device; 2403 return device;
2399} 2404}
2400 2405
2401static struct btrfs_device *btrfs_find_device_missing_or_by_path(
2402 struct btrfs_fs_info *fs_info, const char *device_path)
2403{
2404 struct btrfs_device *device = NULL;
2405 if (strcmp(device_path, "missing") == 0) {
2406 struct list_head *devices;
2407 struct btrfs_device *tmp;
2408
2409 devices = &fs_info->fs_devices->devices;
2410 list_for_each_entry(tmp, devices, dev_list) {
2411 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
2412 &tmp->dev_state) && !tmp->bdev) {
2413 device = tmp;
2414 break;
2415 }
2416 }
2417
2418 if (!device)
2419 return ERR_PTR(-ENOENT);
2420 } else {
2421 device = btrfs_find_device_by_path(fs_info, device_path);
2422 }
2423
2424 return device;
2425}
2426
2427/* 2406/*
2428 * Lookup a device given by device id, or the path if the id is 0. 2407 * Lookup a device given by device id, or the path if the id is 0.
2429 */ 2408 */
2430struct btrfs_device *btrfs_find_device_by_devspec( 2409struct btrfs_device *btrfs_find_device_by_devspec(
2431 struct btrfs_fs_info *fs_info, u64 devid, const char *devpath) 2410 struct btrfs_fs_info *fs_info, u64 devid,
2411 const char *device_path)
2432{ 2412{
2433 struct btrfs_device *device; 2413 struct btrfs_device *device;
2434 2414
2435 if (devid) { 2415 if (devid) {
2436 device = btrfs_find_device(fs_info, devid, NULL, NULL); 2416 device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
2417 NULL, true);
2437 if (!device) 2418 if (!device)
2438 return ERR_PTR(-ENOENT); 2419 return ERR_PTR(-ENOENT);
2439 } else { 2420 return device;
2440 if (!devpath || !devpath[0])
2441 return ERR_PTR(-EINVAL);
2442 device = btrfs_find_device_missing_or_by_path(fs_info, devpath);
2443 } 2421 }
2444 return device; 2422
2423 if (!device_path || !device_path[0])
2424 return ERR_PTR(-EINVAL);
2425
2426 if (strcmp(device_path, "missing") == 0) {
2427 /* Find first missing device */
2428 list_for_each_entry(device, &fs_info->fs_devices->devices,
2429 dev_list) {
2430 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
2431 &device->dev_state) && !device->bdev)
2432 return device;
2433 }
2434 return ERR_PTR(-ENOENT);
2435 }
2436
2437 return btrfs_find_device_by_path(fs_info, device_path);
2445} 2438}
2446 2439
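The rewritten btrfs_find_device_by_devspec() makes the lookup order explicit: a nonzero devid wins, an empty path is invalid, the literal string "missing" selects the first in-metadata device without a bdev, and anything else is treated as a path. A hedged userspace model of just the dispatch:

    #include <stdio.h>
    #include <string.h>

    enum lookup_kind { BY_ID, INVALID, BY_MISSING, BY_PATH };

    static enum lookup_kind classify(unsigned long long devid, const char *path)
    {
        if (devid)
            return BY_ID;
        if (!path || !path[0])
            return INVALID;
        if (strcmp(path, "missing") == 0)
            return BY_MISSING;
        return BY_PATH;
    }

    int main(void)
    {
        printf("%d %d %d\n", classify(3, "missing"),    /* BY_ID */
               classify(0, "missing"),                  /* BY_MISSING */
               classify(0, "/dev/sdb"));                /* BY_PATH */
        return 0;
    }
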
2447/* 2440/*
@@ -2563,7 +2556,8 @@ next_slot:
2563 BTRFS_UUID_SIZE); 2556 BTRFS_UUID_SIZE);
2564 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 2557 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2565 BTRFS_FSID_SIZE); 2558 BTRFS_FSID_SIZE);
2566 device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid); 2559 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2560 fs_uuid, true);
2567 BUG_ON(!device); /* Logic error */ 2561 BUG_ON(!device); /* Logic error */
2568 2562
2569 if (device->fs_devices->seeding) { 2563 if (device->fs_devices->seeding) {
@@ -6616,21 +6610,36 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
6616 return BLK_STS_OK; 6610 return BLK_STS_OK;
6617} 6611}
6618 6612
6619struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, 6613/*
6620 u8 *uuid, u8 *fsid) 6614 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
6615 * return NULL.
6616 *
6617 * If devid and uuid are both specified, the match must be exact, otherwise
6618 * only devid is used.
6619 *
6620 * If @seed is true, traverse through the seed devices.
6621 */
6622struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
6623 u64 devid, u8 *uuid, u8 *fsid,
6624 bool seed)
6621{ 6625{
6622 struct btrfs_device *device; 6626 struct btrfs_device *device;
6623 struct btrfs_fs_devices *cur_devices;
6624 6627
6625 cur_devices = fs_info->fs_devices; 6628 while (fs_devices) {
6626 while (cur_devices) {
6627 if (!fsid || 6629 if (!fsid ||
6628 !memcmp(cur_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { 6630 !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6629 device = find_device(cur_devices, devid, uuid); 6631 list_for_each_entry(device, &fs_devices->devices,
6630 if (device) 6632 dev_list) {
6631 return device; 6633 if (device->devid == devid &&
6634 (!uuid || memcmp(device->uuid, uuid,
6635 BTRFS_UUID_SIZE) == 0))
6636 return device;
6637 }
6632 } 6638 }
6633 cur_devices = cur_devices->seed; 6639 if (seed)
6640 fs_devices = fs_devices->seed;
6641 else
6642 return NULL;
6634 } 6643 }
6635 return NULL; 6644 return NULL;
6636} 6645}
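btrfs_find_device() now takes the fs_devices list directly plus a @seed flag controlling whether the singly linked seed chain is walked; the old find_device() helper removed above was folded in. A hedged structural model (the types and the per-set search are stubs, not kernel API):

    #include <stdbool.h>
    #include <stddef.h>

    struct devset {
        struct devset *seed;            /* next set in the seed chain */
    };

    /* stand-in for scanning one fs_devices' device list */
    static void *search_one_set(struct devset *ds, unsigned long long devid)
    {
        (void)ds;
        (void)devid;
        return NULL;                    /* stub: no match */
    }

    static void *find_dev(struct devset *ds, unsigned long long devid,
                          bool follow_seed)
    {
        while (ds) {
            void *dev = search_one_set(ds, devid);

            if (dev)
                return dev;
            if (!follow_seed)
                return NULL;            /* confine search to first set */
            ds = ds->seed;
        }
        return NULL;
    }

    int main(void)
    {
        struct devset fs = { .seed = NULL };

        return find_dev(&fs, 1, true) != NULL;  /* 0: not found */
    }
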
@@ -6782,10 +6791,10 @@ static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info,
6782 } 6791 }
6783 6792
6784 if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || 6793 if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
6785 (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) || 6794 (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) ||
6786 (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || 6795 (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
6787 (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || 6796 (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
6788 (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) || 6797 (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) ||
6789 ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && 6798 ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
6790 num_stripes != 1)) { 6799 num_stripes != 1)) {
6791 btrfs_err(fs_info, 6800 btrfs_err(fs_info,
@@ -6875,8 +6884,8 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
6875 read_extent_buffer(leaf, uuid, (unsigned long) 6884 read_extent_buffer(leaf, uuid, (unsigned long)
6876 btrfs_stripe_dev_uuid_nr(chunk, i), 6885 btrfs_stripe_dev_uuid_nr(chunk, i),
6877 BTRFS_UUID_SIZE); 6886 BTRFS_UUID_SIZE);
6878 map->stripes[i].dev = btrfs_find_device(fs_info, devid, 6887 map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
6879 uuid, NULL); 6888 devid, uuid, NULL, true);
6880 if (!map->stripes[i].dev && 6889 if (!map->stripes[i].dev &&
6881 !btrfs_test_opt(fs_info, DEGRADED)) { 6890 !btrfs_test_opt(fs_info, DEGRADED)) {
6882 free_extent_map(em); 6891 free_extent_map(em);
@@ -7015,7 +7024,8 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
7015 return PTR_ERR(fs_devices); 7024 return PTR_ERR(fs_devices);
7016 } 7025 }
7017 7026
7018 device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid); 7027 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
7028 fs_uuid, true);
7019 if (!device) { 7029 if (!device) {
7020 if (!btrfs_test_opt(fs_info, DEGRADED)) { 7030 if (!btrfs_test_opt(fs_info, DEGRADED)) {
7021 btrfs_report_missing_device(fs_info, devid, 7031 btrfs_report_missing_device(fs_info, devid,
@@ -7605,7 +7615,8 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
7605 int i; 7615 int i;
7606 7616
7607 mutex_lock(&fs_devices->device_list_mutex); 7617 mutex_lock(&fs_devices->device_list_mutex);
7608 dev = btrfs_find_device(fs_info, stats->devid, NULL, NULL); 7618 dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL,
7619 true);
7609 mutex_unlock(&fs_devices->device_list_mutex); 7620 mutex_unlock(&fs_devices->device_list_mutex);
7610 7621
7611 if (!dev) { 7622 if (!dev) {
@@ -7819,7 +7830,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
7819 } 7830 }
7820 7831
 7821 /* Make sure no dev extent is beyond device boundary */ 7832

7822 dev = btrfs_find_device(fs_info, devid, NULL, NULL); 7833 dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
7823 if (!dev) { 7834 if (!dev) {
7824 btrfs_err(fs_info, "failed to find devid %llu", devid); 7835 btrfs_err(fs_info, "failed to find devid %llu", devid);
7825 ret = -EUCLEAN; 7836 ret = -EUCLEAN;
@@ -7828,7 +7839,8 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
7828 7839
7829 /* It's possible this device is a dummy for seed device */ 7840 /* It's possible this device is a dummy for seed device */
7830 if (dev->disk_total_bytes == 0) { 7841 if (dev->disk_total_bytes == 0) {
7831 dev = find_device(fs_info->fs_devices->seed, devid, NULL); 7842 dev = btrfs_find_device(fs_info->fs_devices->seed, devid, NULL,
7843 NULL, false);
7832 if (!dev) { 7844 if (!dev) {
7833 btrfs_err(fs_info, "failed to find seed devid %llu", 7845 btrfs_err(fs_info, "failed to find seed devid %llu",
7834 devid); 7846 devid);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index ed806649a473..3ad9d58d1b66 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -416,6 +416,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
416 fmode_t flags, void *holder); 416 fmode_t flags, void *holder);
417struct btrfs_device *btrfs_scan_one_device(const char *path, 417struct btrfs_device *btrfs_scan_one_device(const char *path,
418 fmode_t flags, void *holder); 418 fmode_t flags, void *holder);
419int btrfs_forget_devices(const char *path);
419int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); 420int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
420void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step); 421void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step);
421void btrfs_assign_next_active_device(struct btrfs_device *device, 422void btrfs_assign_next_active_device(struct btrfs_device *device,
@@ -433,8 +434,8 @@ void __exit btrfs_cleanup_fs_uuids(void);
433int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); 434int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
434int btrfs_grow_device(struct btrfs_trans_handle *trans, 435int btrfs_grow_device(struct btrfs_trans_handle *trans,
435 struct btrfs_device *device, u64 new_size); 436 struct btrfs_device *device, u64 new_size);
436struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, 437struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
437 u8 *uuid, u8 *fsid); 438 u64 devid, u8 *uuid, u8 *fsid, bool seed);
438int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); 439int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
439int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path); 440int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path);
440int btrfs_balance(struct btrfs_fs_info *fs_info, 441int btrfs_balance(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 970ff3e35bb3..b86b7ad6b900 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -27,6 +27,33 @@ struct workspace {
27 int level; 27 int level;
28}; 28};
29 29
30static struct workspace_manager wsm;
31
32static void zlib_init_workspace_manager(void)
33{
34 btrfs_init_workspace_manager(&wsm, &btrfs_zlib_compress);
35}
36
37static void zlib_cleanup_workspace_manager(void)
38{
39 btrfs_cleanup_workspace_manager(&wsm);
40}
41
42static struct list_head *zlib_get_workspace(unsigned int level)
43{
44 struct list_head *ws = btrfs_get_workspace(&wsm, level);
45 struct workspace *workspace = list_entry(ws, struct workspace, list);
46
47 workspace->level = level;
48
49 return ws;
50}
51
52static void zlib_put_workspace(struct list_head *ws)
53{
54 btrfs_put_workspace(&wsm, ws);
55}
56
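These four wrappers are intentionally thin: the series moves workspace bookkeeping into a shared manager (the btrfs_init_workspace_manager()/btrfs_get_workspace() family referenced above, added elsewhere in the patchset), and zlib only layers its per-workspace level on top. A hedged sketch of that delegation shape, with stubbed types:

    #include <stdlib.h>

    struct manager { int unused; };     /* shared bookkeeping (stub) */
    struct ws { unsigned int level; };

    /* stand-in for the generic btrfs_get_workspace() */
    static struct ws *generic_get(struct manager *m, unsigned int level)
    {
        (void)m;
        (void)level;
        return calloc(1, sizeof(struct ws));
    }

    static struct manager zlib_mgr;

    static struct ws *zlib_get(unsigned int level)
    {
        struct ws *w = generic_get(&zlib_mgr, level);

        if (w)
            w->level = level;           /* the only algorithm-specific bit */
        return w;
    }

    int main(void)
    {
        free(zlib_get(3));
        return 0;
    }
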
30static void zlib_free_workspace(struct list_head *ws) 57static void zlib_free_workspace(struct list_head *ws)
31{ 58{
32 struct workspace *workspace = list_entry(ws, struct workspace, list); 59 struct workspace *workspace = list_entry(ws, struct workspace, list);
@@ -36,7 +63,7 @@ static void zlib_free_workspace(struct list_head *ws)
36 kfree(workspace); 63 kfree(workspace);
37} 64}
38 65
39static struct list_head *zlib_alloc_workspace(void) 66static struct list_head *zlib_alloc_workspace(unsigned int level)
40{ 67{
41 struct workspace *workspace; 68 struct workspace *workspace;
42 int workspacesize; 69 int workspacesize;
@@ -48,6 +75,7 @@ static struct list_head *zlib_alloc_workspace(void)
48 workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), 75 workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
49 zlib_inflate_workspacesize()); 76 zlib_inflate_workspacesize());
50 workspace->strm.workspace = kvmalloc(workspacesize, GFP_KERNEL); 77 workspace->strm.workspace = kvmalloc(workspacesize, GFP_KERNEL);
78 workspace->level = level;
51 workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 79 workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
52 if (!workspace->strm.workspace || !workspace->buf) 80 if (!workspace->strm.workspace || !workspace->buf)
53 goto fail; 81 goto fail;
@@ -390,18 +418,19 @@ next:
390 return ret; 418 return ret;
391} 419}
392 420
393static void zlib_set_level(struct list_head *ws, unsigned int type) 421static unsigned int zlib_set_level(unsigned int level)
394{ 422{
395 struct workspace *workspace = list_entry(ws, struct workspace, list); 423 if (!level)
396 unsigned level = (type & 0xF0) >> 4; 424 return BTRFS_ZLIB_DEFAULT_LEVEL;
397
398 if (level > 9)
399 level = 9;
400 425
401 workspace->level = level > 0 ? level : 3; 426 return min_t(unsigned int, level, 9);
402} 427}
403 428
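zlib_set_level() now normalizes the requested level instead of mutating a workspace: 0 selects the default and anything larger is clamped to zlib's maximum of 9 (BTRFS_ZLIB_DEFAULT_LEVEL is 3 in this series, and the request ultimately arrives from the compress=zlib:N mount option). A hedged restatement:

    /* mirrors zlib_set_level(); 3 assumed for BTRFS_ZLIB_DEFAULT_LEVEL */
    static unsigned int clamp_zlib_level(unsigned int level)
    {
        if (!level)
            return 3;
        return level < 9 ? level : 9;   /* min_t(unsigned int, level, 9) */
    }

    int main(void)
    {
        return clamp_zlib_level(0) == 3 && clamp_zlib_level(12) == 9 ? 0 : 1;
    }
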
404const struct btrfs_compress_op btrfs_zlib_compress = { 429const struct btrfs_compress_op btrfs_zlib_compress = {
430 .init_workspace_manager = zlib_init_workspace_manager,
431 .cleanup_workspace_manager = zlib_cleanup_workspace_manager,
432 .get_workspace = zlib_get_workspace,
433 .put_workspace = zlib_put_workspace,
405 .alloc_workspace = zlib_alloc_workspace, 434 .alloc_workspace = zlib_alloc_workspace,
406 .free_workspace = zlib_free_workspace, 435 .free_workspace = zlib_free_workspace,
407 .compress_pages = zlib_compress_pages, 436 .compress_pages = zlib_compress_pages,
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index af6ec59972f5..3e418a3aeb11 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -6,25 +6,31 @@
6 */ 6 */
7 7
8#include <linux/bio.h> 8#include <linux/bio.h>
9#include <linux/bitmap.h>
9#include <linux/err.h> 10#include <linux/err.h>
10#include <linux/init.h> 11#include <linux/init.h>
11#include <linux/kernel.h> 12#include <linux/kernel.h>
12#include <linux/mm.h> 13#include <linux/mm.h>
14#include <linux/sched/mm.h>
13#include <linux/pagemap.h> 15#include <linux/pagemap.h>
14#include <linux/refcount.h> 16#include <linux/refcount.h>
15#include <linux/sched.h> 17#include <linux/sched.h>
16#include <linux/slab.h> 18#include <linux/slab.h>
17#include <linux/zstd.h> 19#include <linux/zstd.h>
18#include "compression.h" 20#include "compression.h"
21#include "ctree.h"
19 22
20#define ZSTD_BTRFS_MAX_WINDOWLOG 17 23#define ZSTD_BTRFS_MAX_WINDOWLOG 17
21#define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG) 24#define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG)
22#define ZSTD_BTRFS_DEFAULT_LEVEL 3 25#define ZSTD_BTRFS_DEFAULT_LEVEL 3
26#define ZSTD_BTRFS_MAX_LEVEL 15
27/* 307s to avoid pathologically clashing with transaction commit */
28#define ZSTD_BTRFS_RECLAIM_JIFFIES (307 * HZ)
23 29
24static ZSTD_parameters zstd_get_btrfs_parameters(size_t src_len) 30static ZSTD_parameters zstd_get_btrfs_parameters(unsigned int level,
31 size_t src_len)
25{ 32{
26 ZSTD_parameters params = ZSTD_getParams(ZSTD_BTRFS_DEFAULT_LEVEL, 33 ZSTD_parameters params = ZSTD_getParams(level, src_len, 0);
27 src_len, 0);
28 34
29 if (params.cParams.windowLog > ZSTD_BTRFS_MAX_WINDOWLOG) 35 if (params.cParams.windowLog > ZSTD_BTRFS_MAX_WINDOWLOG)
30 params.cParams.windowLog = ZSTD_BTRFS_MAX_WINDOWLOG; 36 params.cParams.windowLog = ZSTD_BTRFS_MAX_WINDOWLOG;
@@ -36,11 +42,290 @@ struct workspace {
36 void *mem; 42 void *mem;
37 size_t size; 43 size_t size;
38 char *buf; 44 char *buf;
45 unsigned int level;
46 unsigned int req_level;
47 unsigned long last_used; /* jiffies */
39 struct list_head list; 48 struct list_head list;
49 struct list_head lru_list;
40 ZSTD_inBuffer in_buf; 50 ZSTD_inBuffer in_buf;
41 ZSTD_outBuffer out_buf; 51 ZSTD_outBuffer out_buf;
42}; 52};
43 53
54/*
55 * Zstd Workspace Management
56 *
57 * Zstd workspaces have different memory requirements depending on the level.
58 * The zstd workspaces are managed by having individual lists for each level
59 * and a global lru. Forward progress is maintained by protecting a max level
60 * workspace.
61 *
62 * Getting a workspace is done by using the bitmap to identify the levels that
 63 * have available workspaces, scanning upward. This lets us recycle higher level
64 * workspaces because of the monotonic memory guarantee. A workspace's
65 * last_used is only updated if it is being used by the corresponding memory
66 * level. Putting a workspace involves adding it back to the appropriate places
67 * and adding it back to the lru if necessary.
68 *
69 * A timer is used to reclaim workspaces if they have not been used for
70 * ZSTD_BTRFS_RECLAIM_JIFFIES. This helps keep only active workspaces around.
71 * The upper bound is provided by the workqueue limit which is 2 (percpu limit).
72 */
73
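The comment block above compresses a lot of mechanism; the key trick is the bitmap scan-up in zstd_find_workspace() below: starting at the requested level, take the first level whose idle list is non-empty, which is safe because zstd_calc_ws_mem_sizes() makes workspace size monotonic in level. A hedged standalone model:

    #include <stdio.h>

    #define MAX_LEVEL 15

    /* lowest level >= request (1-based) with an idle workspace, else 0 */
    static unsigned int find_level(unsigned long active_map, unsigned int level)
    {
        unsigned int i;

        for (i = level - 1; i < MAX_LEVEL; i++)
            if (active_map & (1UL << i))
                return i + 1;
        return 0;                       /* nothing idle: caller allocates */
    }

    int main(void)
    {
        /* idle workspaces exist for levels 3 and 15 */
        unsigned long map = (1UL << 2) | (1UL << 14);

        printf("%u %u %u\n", find_level(map, 1), find_level(map, 4),
               find_level(map, 15));    /* prints: 3 15 15 */
        return 0;
    }
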
74struct zstd_workspace_manager {
75 const struct btrfs_compress_op *ops;
76 spinlock_t lock;
77 struct list_head lru_list;
78 struct list_head idle_ws[ZSTD_BTRFS_MAX_LEVEL];
79 unsigned long active_map;
80 wait_queue_head_t wait;
81 struct timer_list timer;
82};
83
84static struct zstd_workspace_manager wsm;
85
86static size_t zstd_ws_mem_sizes[ZSTD_BTRFS_MAX_LEVEL];
87
88static inline struct workspace *list_to_workspace(struct list_head *list)
89{
90 return container_of(list, struct workspace, list);
91}
92
93/*
94 * zstd_reclaim_timer_fn - reclaim timer
95 * @t: timer
96 *
97 * This scans the lru_list and attempts to reclaim any workspace that hasn't
98 * been used for ZSTD_BTRFS_RECLAIM_JIFFIES.
99 */
100static void zstd_reclaim_timer_fn(struct timer_list *timer)
101{
102 unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES;
103 struct list_head *pos, *next;
104
105 spin_lock(&wsm.lock);
106
107 if (list_empty(&wsm.lru_list)) {
108 spin_unlock(&wsm.lock);
109 return;
110 }
111
112 list_for_each_prev_safe(pos, next, &wsm.lru_list) {
113 struct workspace *victim = container_of(pos, struct workspace,
114 lru_list);
115 unsigned int level;
116
117 if (time_after(victim->last_used, reclaim_threshold))
118 break;
119
120 /* workspace is in use */
121 if (victim->req_level)
122 continue;
123
124 level = victim->level;
125 list_del(&victim->lru_list);
126 list_del(&victim->list);
127 wsm.ops->free_workspace(&victim->list);
128
129 if (list_empty(&wsm.idle_ws[level - 1]))
130 clear_bit(level - 1, &wsm.active_map);
131
132 }
133
134 if (!list_empty(&wsm.lru_list))
135 mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
136
137 spin_unlock(&wsm.lock);
138}
139
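Two details of the reclaim walk are easy to miss: lru_list is kept most-recently-used first, so list_for_each_prev_safe() ages entries from the tail and can stop at the first young one; and time_after() is wraparound-safe because it compares a signed difference of the unsigned jiffies counter. A hedged restatement of that idiom:

    /* roughly what time_after(now, last_used + timeout) boils down to:
     * the signed difference stays correct across counter wraparound */
    static int is_expired(unsigned long now, unsigned long last_used,
                          unsigned long timeout)
    {
        return (long)(now - last_used) >= (long)timeout;
    }

    int main(void)
    {
        /* "now" wrapped past 0, last_used set just before the wrap */
        return is_expired(5UL, (unsigned long)-10, 10) ? 0 : 1;
    }
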
140/*
141 * zstd_calc_ws_mem_sizes - calculate monotonic memory bounds
142 *
143 * It is possible based on the level configurations that a higher level
144 * workspace uses less memory than a lower level workspace. In order to reuse
145 * workspaces, this must be made a monotonic relationship. This precomputes
146 * the required memory for each level and enforces the monotonicity between
147 * level and memory required.
148 */
149static void zstd_calc_ws_mem_sizes(void)
150{
151 size_t max_size = 0;
152 unsigned int level;
153
154 for (level = 1; level <= ZSTD_BTRFS_MAX_LEVEL; level++) {
155 ZSTD_parameters params =
156 zstd_get_btrfs_parameters(level, ZSTD_BTRFS_MAX_INPUT);
157 size_t level_size =
158 max_t(size_t,
159 ZSTD_CStreamWorkspaceBound(params.cParams),
160 ZSTD_DStreamWorkspaceBound(ZSTD_BTRFS_MAX_INPUT));
161
162 max_size = max_t(size_t, max_size, level_size);
163 zstd_ws_mem_sizes[level - 1] = max_size;
164 }
165}
166
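The monotonic bound is enforced with a simple prefix maximum: after this loop, the stored size for level L is the maximum over levels 1..L, so any idle higher-level workspace can serve a lower request. A hedged restatement:

    #include <stddef.h>

    static void make_monotonic(size_t *sizes, int n)
    {
        size_t max = 0;
        int i;

        for (i = 0; i < n; i++) {
            if (sizes[i] > max)
                max = sizes[i];
            sizes[i] = max;             /* running (prefix) maximum */
        }
    }

    int main(void)
    {
        size_t s[4] = { 4, 3, 8, 6 };

        make_monotonic(s, 4);           /* becomes 4, 4, 8, 8 */
        return s[1] == 4 && s[3] == 8 ? 0 : 1;
    }
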
167static void zstd_init_workspace_manager(void)
168{
169 struct list_head *ws;
170 int i;
171
172 zstd_calc_ws_mem_sizes();
173
174 wsm.ops = &btrfs_zstd_compress;
175 spin_lock_init(&wsm.lock);
176 init_waitqueue_head(&wsm.wait);
177 timer_setup(&wsm.timer, zstd_reclaim_timer_fn, 0);
178
179 INIT_LIST_HEAD(&wsm.lru_list);
180 for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++)
181 INIT_LIST_HEAD(&wsm.idle_ws[i]);
182
183 ws = wsm.ops->alloc_workspace(ZSTD_BTRFS_MAX_LEVEL);
184 if (IS_ERR(ws)) {
185 pr_warn(
186 "BTRFS: cannot preallocate zstd compression workspace\n");
187 } else {
188 set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &wsm.active_map);
189 list_add(ws, &wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]);
190 }
191}
192
193static void zstd_cleanup_workspace_manager(void)
194{
195 struct workspace *workspace;
196 int i;
197
198 del_timer(&wsm.timer);
199
200 for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) {
201 while (!list_empty(&wsm.idle_ws[i])) {
202 workspace = container_of(wsm.idle_ws[i].next,
203 struct workspace, list);
204 list_del(&workspace->list);
205 list_del(&workspace->lru_list);
206 wsm.ops->free_workspace(&workspace->list);
207 }
208 }
209}
210
211/*
212 * zstd_find_workspace - find workspace
213 * @level: compression level
214 *
215 * This iterates over the set bits in the active_map beginning at the requested
216 * compression level. This lets us utilize already allocated workspaces before
217 * allocating a new one. If the workspace is of a larger size, it is used, but
218 * the place in the lru_list and last_used times are not updated. This is to
219 * offer the opportunity to reclaim the workspace in favor of allocating an
220 * appropriately sized one in the future.
221 */
222static struct list_head *zstd_find_workspace(unsigned int level)
223{
224 struct list_head *ws;
225 struct workspace *workspace;
226 int i = level - 1;
227
228 spin_lock(&wsm.lock);
229 for_each_set_bit_from(i, &wsm.active_map, ZSTD_BTRFS_MAX_LEVEL) {
230 if (!list_empty(&wsm.idle_ws[i])) {
231 ws = wsm.idle_ws[i].next;
232 workspace = list_to_workspace(ws);
233 list_del_init(ws);
 234 /* keep its lru place if a lower level is borrowing it */
235 workspace->req_level = level;
236 if (level == workspace->level)
237 list_del(&workspace->lru_list);
238 if (list_empty(&wsm.idle_ws[i]))
239 clear_bit(i, &wsm.active_map);
240 spin_unlock(&wsm.lock);
241 return ws;
242 }
243 }
244 spin_unlock(&wsm.lock);
245
246 return NULL;
247}
248
249/*
250 * zstd_get_workspace - zstd's get_workspace
251 * @level: compression level
252 *
253 * If @level is 0, then any compression level can be used. Therefore, we begin
 254 * scanning from 1. We first scan through possible workspaces and only then
 255 * attempt to allocate a new workspace. If we fail to allocate one due to
256 * memory pressure, go to sleep waiting for the max level workspace to free up.
257 */
258static struct list_head *zstd_get_workspace(unsigned int level)
259{
260 struct list_head *ws;
261 unsigned int nofs_flag;
262
263 /* level == 0 means we can use any workspace */
264 if (!level)
265 level = 1;
266
267again:
268 ws = zstd_find_workspace(level);
269 if (ws)
270 return ws;
271
272 nofs_flag = memalloc_nofs_save();
273 ws = wsm.ops->alloc_workspace(level);
274 memalloc_nofs_restore(nofs_flag);
275
276 if (IS_ERR(ws)) {
277 DEFINE_WAIT(wait);
278
279 prepare_to_wait(&wsm.wait, &wait, TASK_UNINTERRUPTIBLE);
280 schedule();
281 finish_wait(&wsm.wait, &wait);
282
283 goto again;
284 }
285
286 return ws;
287}
288
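Allocation above runs under memalloc_nofs_save() so reclaim cannot re-enter the filesystem, and on failure the task sleeps until some workspace is put back; the max-level workspace preallocated in zstd_init_workspace_manager() guarantees such a wakeup eventually arrives. A hedged userspace analogue of the wait-and-retry, with pthreads standing in for the kernel waitqueue:

    #include <pthread.h>
    #include <stdio.h>
    #include <unistd.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t freed = PTHREAD_COND_INITIALIZER;
    static void *idle_ws;               /* one recycled workspace */

    static void *get_workspace(void)
    {
        void *ws;

        pthread_mutex_lock(&lock);
        while (!(ws = idle_ws))         /* kernel: allocation failed */
            pthread_cond_wait(&freed, &lock);   /* kernel: wsm.wait */
        idle_ws = NULL;
        pthread_mutex_unlock(&lock);
        return ws;
    }

    static void put_workspace(void *ws)
    {
        pthread_mutex_lock(&lock);
        idle_ws = ws;
        pthread_mutex_unlock(&lock);
        pthread_cond_signal(&freed);    /* kernel: cond_wake_up() */
    }

    static void *worker(void *arg)
    {
        sleep(1);
        put_workspace(arg);             /* another task returns its ws */
        return NULL;
    }

    int main(void)
    {
        static char ws[64];
        pthread_t t;

        pthread_create(&t, NULL, worker, ws);
        printf("got %p\n", get_workspace());    /* blocks, then proceeds */
        pthread_join(t, NULL);
        return 0;
    }
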
289/*
290 * zstd_put_workspace - zstd put_workspace
291 * @ws: list_head for the workspace
292 *
293 * When putting back a workspace, we only need to update the LRU if we are of
294 * the requested compression level. Here is where we continue to protect the
295 * max level workspace or update last_used accordingly. If the reclaim timer
 296 * isn't set, it is also set here. Only putting back a max level workspace
 297 * attempts to wake up waiters.
298 */
299static void zstd_put_workspace(struct list_head *ws)
300{
301 struct workspace *workspace = list_to_workspace(ws);
302
303 spin_lock(&wsm.lock);
304
305 /* A node is only taken off the lru if we are the corresponding level */
306 if (workspace->req_level == workspace->level) {
307 /* Hide a max level workspace from reclaim */
308 if (list_empty(&wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) {
309 INIT_LIST_HEAD(&workspace->lru_list);
310 } else {
311 workspace->last_used = jiffies;
312 list_add(&workspace->lru_list, &wsm.lru_list);
313 if (!timer_pending(&wsm.timer))
314 mod_timer(&wsm.timer,
315 jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
316 }
317 }
318
319 set_bit(workspace->level - 1, &wsm.active_map);
320 list_add(&workspace->list, &wsm.idle_ws[workspace->level - 1]);
321 workspace->req_level = 0;
322
323 spin_unlock(&wsm.lock);
324
325 if (workspace->level == ZSTD_BTRFS_MAX_LEVEL)
326 cond_wake_up(&wsm.wait);
327}
328
44static void zstd_free_workspace(struct list_head *ws) 329static void zstd_free_workspace(struct list_head *ws)
45{ 330{
46 struct workspace *workspace = list_entry(ws, struct workspace, list); 331 struct workspace *workspace = list_entry(ws, struct workspace, list);
@@ -50,25 +335,25 @@ static void zstd_free_workspace(struct list_head *ws)
50 kfree(workspace); 335 kfree(workspace);
51} 336}
52 337
53static struct list_head *zstd_alloc_workspace(void) 338static struct list_head *zstd_alloc_workspace(unsigned int level)
54{ 339{
55 ZSTD_parameters params =
56 zstd_get_btrfs_parameters(ZSTD_BTRFS_MAX_INPUT);
57 struct workspace *workspace; 340 struct workspace *workspace;
58 341
59 workspace = kzalloc(sizeof(*workspace), GFP_KERNEL); 342 workspace = kzalloc(sizeof(*workspace), GFP_KERNEL);
60 if (!workspace) 343 if (!workspace)
61 return ERR_PTR(-ENOMEM); 344 return ERR_PTR(-ENOMEM);
62 345
63 workspace->size = max_t(size_t, 346 workspace->size = zstd_ws_mem_sizes[level - 1];
64 ZSTD_CStreamWorkspaceBound(params.cParams), 347 workspace->level = level;
65 ZSTD_DStreamWorkspaceBound(ZSTD_BTRFS_MAX_INPUT)); 348 workspace->req_level = level;
349 workspace->last_used = jiffies;
66 workspace->mem = kvmalloc(workspace->size, GFP_KERNEL); 350 workspace->mem = kvmalloc(workspace->size, GFP_KERNEL);
67 workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 351 workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
68 if (!workspace->mem || !workspace->buf) 352 if (!workspace->mem || !workspace->buf)
69 goto fail; 353 goto fail;
70 354
71 INIT_LIST_HEAD(&workspace->list); 355 INIT_LIST_HEAD(&workspace->list);
356 INIT_LIST_HEAD(&workspace->lru_list);
72 357
73 return &workspace->list; 358 return &workspace->list;
74fail: 359fail:
@@ -95,7 +380,8 @@ static int zstd_compress_pages(struct list_head *ws,
95 unsigned long len = *total_out; 380 unsigned long len = *total_out;
96 const unsigned long nr_dest_pages = *out_pages; 381 const unsigned long nr_dest_pages = *out_pages;
97 unsigned long max_out = nr_dest_pages * PAGE_SIZE; 382 unsigned long max_out = nr_dest_pages * PAGE_SIZE;
98 ZSTD_parameters params = zstd_get_btrfs_parameters(len); 383 ZSTD_parameters params = zstd_get_btrfs_parameters(workspace->req_level,
384 len);
99 385
100 *out_pages = 0; 386 *out_pages = 0;
101 *total_out = 0; 387 *total_out = 0;
@@ -419,11 +705,19 @@ finish:
419 return ret; 705 return ret;
420} 706}
421 707
422static void zstd_set_level(struct list_head *ws, unsigned int type) 708static unsigned int zstd_set_level(unsigned int level)
423{ 709{
710 if (!level)
711 return ZSTD_BTRFS_DEFAULT_LEVEL;
712
713 return min_t(unsigned int, level, ZSTD_BTRFS_MAX_LEVEL);
424} 714}
425 715
426const struct btrfs_compress_op btrfs_zstd_compress = { 716const struct btrfs_compress_op btrfs_zstd_compress = {
717 .init_workspace_manager = zstd_init_workspace_manager,
718 .cleanup_workspace_manager = zstd_cleanup_workspace_manager,
719 .get_workspace = zstd_get_workspace,
720 .put_workspace = zstd_put_workspace,
427 .alloc_workspace = zstd_alloc_workspace, 721 .alloc_workspace = zstd_alloc_workspace,
428 .free_workspace = zstd_free_workspace, 722 .free_workspace = zstd_free_workspace,
429 .compress_pages = zstd_compress_pages, 723 .compress_pages = zstd_compress_pages,
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 3b8114def693..13318e255ebf 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -252,33 +252,10 @@ ext2_validate_entry(char *base, unsigned offset, unsigned mask)
252 return (char *)p - base; 252 return (char *)p - base;
253} 253}
254 254
255static unsigned char ext2_filetype_table[EXT2_FT_MAX] = {
256 [EXT2_FT_UNKNOWN] = DT_UNKNOWN,
257 [EXT2_FT_REG_FILE] = DT_REG,
258 [EXT2_FT_DIR] = DT_DIR,
259 [EXT2_FT_CHRDEV] = DT_CHR,
260 [EXT2_FT_BLKDEV] = DT_BLK,
261 [EXT2_FT_FIFO] = DT_FIFO,
262 [EXT2_FT_SOCK] = DT_SOCK,
263 [EXT2_FT_SYMLINK] = DT_LNK,
264};
265
266#define S_SHIFT 12
267static unsigned char ext2_type_by_mode[S_IFMT >> S_SHIFT] = {
268 [S_IFREG >> S_SHIFT] = EXT2_FT_REG_FILE,
269 [S_IFDIR >> S_SHIFT] = EXT2_FT_DIR,
270 [S_IFCHR >> S_SHIFT] = EXT2_FT_CHRDEV,
271 [S_IFBLK >> S_SHIFT] = EXT2_FT_BLKDEV,
272 [S_IFIFO >> S_SHIFT] = EXT2_FT_FIFO,
273 [S_IFSOCK >> S_SHIFT] = EXT2_FT_SOCK,
274 [S_IFLNK >> S_SHIFT] = EXT2_FT_SYMLINK,
275};
276
277static inline void ext2_set_de_type(ext2_dirent *de, struct inode *inode) 255static inline void ext2_set_de_type(ext2_dirent *de, struct inode *inode)
278{ 256{
279 umode_t mode = inode->i_mode;
280 if (EXT2_HAS_INCOMPAT_FEATURE(inode->i_sb, EXT2_FEATURE_INCOMPAT_FILETYPE)) 257 if (EXT2_HAS_INCOMPAT_FEATURE(inode->i_sb, EXT2_FEATURE_INCOMPAT_FILETYPE))
281 de->file_type = ext2_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; 258 de->file_type = fs_umode_to_ftype(inode->i_mode);
282 else 259 else
283 de->file_type = 0; 260 de->file_type = 0;
284} 261}
@@ -293,14 +270,14 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
293 unsigned long n = pos >> PAGE_SHIFT; 270 unsigned long n = pos >> PAGE_SHIFT;
294 unsigned long npages = dir_pages(inode); 271 unsigned long npages = dir_pages(inode);
295 unsigned chunk_mask = ~(ext2_chunk_size(inode)-1); 272 unsigned chunk_mask = ~(ext2_chunk_size(inode)-1);
296 unsigned char *types = NULL;
297 bool need_revalidate = !inode_eq_iversion(inode, file->f_version); 273 bool need_revalidate = !inode_eq_iversion(inode, file->f_version);
274 bool has_filetype;
298 275
299 if (pos > inode->i_size - EXT2_DIR_REC_LEN(1)) 276 if (pos > inode->i_size - EXT2_DIR_REC_LEN(1))
300 return 0; 277 return 0;
301 278
302 if (EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_FILETYPE)) 279 has_filetype =
303 types = ext2_filetype_table; 280 EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_FILETYPE);
304 281
305 for ( ; n < npages; n++, offset = 0) { 282 for ( ; n < npages; n++, offset = 0) {
306 char *kaddr, *limit; 283 char *kaddr, *limit;
@@ -335,8 +312,8 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
335 if (de->inode) { 312 if (de->inode) {
336 unsigned char d_type = DT_UNKNOWN; 313 unsigned char d_type = DT_UNKNOWN;
337 314
338 if (types && de->file_type < EXT2_FT_MAX) 315 if (has_filetype)
339 d_type = types[de->file_type]; 316 d_type = fs_ftype_to_dtype(de->file_type);
340 317
341 if (!dir_emit(ctx, de->name, de->name_len, 318 if (!dir_emit(ctx, de->name, de->name_len,
342 le32_to_cpu(de->inode), 319 le32_to_cpu(de->inode),
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index e770cd100a6a..10ab238de9a6 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -604,22 +604,6 @@ struct ext2_dir_entry_2 {
604}; 604};
605 605
606/* 606/*
607 * Ext2 directory file types. Only the low 3 bits are used. The
608 * other bits are reserved for now.
609 */
610enum {
611 EXT2_FT_UNKNOWN = 0,
612 EXT2_FT_REG_FILE = 1,
613 EXT2_FT_DIR = 2,
614 EXT2_FT_CHRDEV = 3,
615 EXT2_FT_BLKDEV = 4,
616 EXT2_FT_FIFO = 5,
617 EXT2_FT_SOCK = 6,
618 EXT2_FT_SYMLINK = 7,
619 EXT2_FT_MAX
620};
621
622/*
623 * EXT2_DIR_PAD defines the directory entries boundaries 607 * EXT2_DIR_PAD defines the directory entries boundaries
624 * 608 *
625 * NOTE: It must be a multiple of 4 609 * NOTE: It must be a multiple of 4
@@ -774,6 +758,7 @@ extern int ext2_write_inode (struct inode *, struct writeback_control *);
774extern void ext2_evict_inode(struct inode *); 758extern void ext2_evict_inode(struct inode *);
775extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); 759extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
776extern int ext2_setattr (struct dentry *, struct iattr *); 760extern int ext2_setattr (struct dentry *, struct iattr *);
761extern int ext2_getattr (const struct path *, struct kstat *, u32, unsigned int);
777extern void ext2_set_inode_flags(struct inode *inode); 762extern void ext2_set_inode_flags(struct inode *inode);
778extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 763extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
779 u64 start, u64 len); 764 u64 start, u64 len);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 28b2609f25c1..39c4772e96c9 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -199,6 +199,7 @@ const struct inode_operations ext2_file_inode_operations = {
199#ifdef CONFIG_EXT2_FS_XATTR 199#ifdef CONFIG_EXT2_FS_XATTR
200 .listxattr = ext2_listxattr, 200 .listxattr = ext2_listxattr,
201#endif 201#endif
202 .getattr = ext2_getattr,
202 .setattr = ext2_setattr, 203 .setattr = ext2_setattr,
203 .get_acl = ext2_get_acl, 204 .get_acl = ext2_get_acl,
204 .set_acl = ext2_set_acl, 205 .set_acl = ext2_set_acl,
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 5c3d7b7e4975..a0c5ea91fcd4 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -222,8 +222,6 @@ static int find_group_dir(struct super_block *sb, struct inode *parent)
222 best_desc = desc; 222 best_desc = desc;
223 } 223 }
224 } 224 }
225 if (!best_desc)
226 return -1;
227 225
228 return best_group; 226 return best_group;
229} 227}
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index e4bb9386c045..c27c27300d95 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -717,7 +717,7 @@ static int ext2_get_blocks(struct inode *inode,
717 /* the number of blocks need to allocate for [d,t]indirect blocks */ 717 /* the number of blocks need to allocate for [d,t]indirect blocks */
718 indirect_blks = (chain + depth) - partial - 1; 718 indirect_blks = (chain + depth) - partial - 1;
719 /* 719 /*
720 * Next look up the indirect map to count the totoal number of 720 * Next look up the indirect map to count the total number of
721 * direct blocks to allocate for this branch. 721 * direct blocks to allocate for this branch.
722 */ 722 */
723 count = ext2_blks_to_allocate(partial, indirect_blks, 723 count = ext2_blks_to_allocate(partial, indirect_blks,
@@ -1239,6 +1239,7 @@ do_indirects:
1239 mark_inode_dirty(inode); 1239 mark_inode_dirty(inode);
1240 ext2_free_branches(inode, &nr, &nr+1, 1); 1240 ext2_free_branches(inode, &nr, &nr+1, 1);
1241 } 1241 }
1242 /* fall through */
1242 case EXT2_IND_BLOCK: 1243 case EXT2_IND_BLOCK:
1243 nr = i_data[EXT2_DIND_BLOCK]; 1244 nr = i_data[EXT2_DIND_BLOCK];
1244 if (nr) { 1245 if (nr) {
@@ -1246,6 +1247,7 @@ do_indirects:
1246 mark_inode_dirty(inode); 1247 mark_inode_dirty(inode);
1247 ext2_free_branches(inode, &nr, &nr+1, 2); 1248 ext2_free_branches(inode, &nr, &nr+1, 2);
1248 } 1249 }
1250 /* fall through */
1249 case EXT2_DIND_BLOCK: 1251 case EXT2_DIND_BLOCK:
1250 nr = i_data[EXT2_TIND_BLOCK]; 1252 nr = i_data[EXT2_TIND_BLOCK];
1251 if (nr) { 1253 if (nr) {
@@ -1635,6 +1637,32 @@ int ext2_write_inode(struct inode *inode, struct writeback_control *wbc)
1635 return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); 1637 return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
1636} 1638}
1637 1639
1640int ext2_getattr(const struct path *path, struct kstat *stat,
 1641 u32 request_mask, unsigned int query_flags)
1642{
1643 struct inode *inode = d_inode(path->dentry);
1644 struct ext2_inode_info *ei = EXT2_I(inode);
1645 unsigned int flags;
1646
1647 flags = ei->i_flags & EXT2_FL_USER_VISIBLE;
1648 if (flags & EXT2_APPEND_FL)
1649 stat->attributes |= STATX_ATTR_APPEND;
1650 if (flags & EXT2_COMPR_FL)
1651 stat->attributes |= STATX_ATTR_COMPRESSED;
1652 if (flags & EXT2_IMMUTABLE_FL)
1653 stat->attributes |= STATX_ATTR_IMMUTABLE;
1654 if (flags & EXT2_NODUMP_FL)
1655 stat->attributes |= STATX_ATTR_NODUMP;
1656 stat->attributes_mask |= (STATX_ATTR_APPEND |
1657 STATX_ATTR_COMPRESSED |
1658 STATX_ATTR_ENCRYPTED |
1659 STATX_ATTR_IMMUTABLE |
1660 STATX_ATTR_NODUMP);
1661
1662 generic_fillattr(inode, stat);
1663 return 0;
1664}
1665
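The attribute bits ext2_getattr() fills in surface to userspace through statx(2); nothing below is ext2-specific. A hedged example checking the immutable flag (needs _GNU_SOURCE and glibc >= 2.28 for the statx wrapper):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/stat.h>

    int main(int argc, char **argv)
    {
        struct statx stx;

        if (argc < 2 ||
            statx(AT_FDCWD, argv[1], 0, STATX_BASIC_STATS, &stx))
            return 1;
        if (stx.stx_attributes_mask & STATX_ATTR_IMMUTABLE)
            printf("immutable: %s\n",
                   (stx.stx_attributes & STATX_ATTR_IMMUTABLE) ? "yes"
                                                               : "no");
        return 0;
    }
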
1638int ext2_setattr(struct dentry *dentry, struct iattr *iattr) 1666int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
1639{ 1667{
1640 struct inode *inode = d_inode(dentry); 1668 struct inode *inode = d_inode(dentry);
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 0c26dcc5d850..ccfbbf59e2fc 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -416,6 +416,7 @@ const struct inode_operations ext2_dir_inode_operations = {
416#ifdef CONFIG_EXT2_FS_XATTR 416#ifdef CONFIG_EXT2_FS_XATTR
417 .listxattr = ext2_listxattr, 417 .listxattr = ext2_listxattr,
418#endif 418#endif
419 .getattr = ext2_getattr,
419 .setattr = ext2_setattr, 420 .setattr = ext2_setattr,
420 .get_acl = ext2_get_acl, 421 .get_acl = ext2_get_acl,
421 .set_acl = ext2_set_acl, 422 .set_acl = ext2_set_acl,
@@ -426,6 +427,7 @@ const struct inode_operations ext2_special_inode_operations = {
426#ifdef CONFIG_EXT2_FS_XATTR 427#ifdef CONFIG_EXT2_FS_XATTR
427 .listxattr = ext2_listxattr, 428 .listxattr = ext2_listxattr,
428#endif 429#endif
430 .getattr = ext2_getattr,
429 .setattr = ext2_setattr, 431 .setattr = ext2_setattr,
430 .get_acl = ext2_get_acl, 432 .get_acl = ext2_get_acl,
431 .set_acl = ext2_set_acl, 433 .set_acl = ext2_set_acl,
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 73b2d528237f..0128010a0874 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -757,7 +757,8 @@ static loff_t ext2_max_size(int bits)
757{ 757{
758 loff_t res = EXT2_NDIR_BLOCKS; 758 loff_t res = EXT2_NDIR_BLOCKS;
759 int meta_blocks; 759 int meta_blocks;
760 loff_t upper_limit; 760 unsigned int upper_limit;
761 unsigned int ppb = 1 << (bits-2);
761 762
762 /* This is calculated to be the largest file size for a 763 /* This is calculated to be the largest file size for a
 763 * dense file such that the total number of 764
@@ -771,24 +772,34 @@ static loff_t ext2_max_size(int bits)
771 /* total blocks in file system block size */ 772 /* total blocks in file system block size */
772 upper_limit >>= (bits - 9); 773 upper_limit >>= (bits - 9);
773 774
775 /* Compute how many blocks we can address by block tree */
776 res += 1LL << (bits-2);
777 res += 1LL << (2*(bits-2));
778 res += 1LL << (3*(bits-2));
779 /* Does block tree limit file size? */
780 if (res < upper_limit)
781 goto check_lfs;
774 782
783 res = upper_limit;
784 /* How many metadata blocks are needed for addressing upper_limit? */
785 upper_limit -= EXT2_NDIR_BLOCKS;
775 /* indirect blocks */ 786 /* indirect blocks */
776 meta_blocks = 1; 787 meta_blocks = 1;
788 upper_limit -= ppb;
777 /* double indirect blocks */ 789 /* double indirect blocks */
778 meta_blocks += 1 + (1LL << (bits-2)); 790 if (upper_limit < ppb * ppb) {
779 /* tripple indirect blocks */ 791 meta_blocks += 1 + DIV_ROUND_UP(upper_limit, ppb);
780 meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2))); 792 res -= meta_blocks;
781 793 goto check_lfs;
782 upper_limit -= meta_blocks; 794 }
783 upper_limit <<= bits; 795 meta_blocks += 1 + ppb;
784 796 upper_limit -= ppb * ppb;
 785 res += 1LL << (bits-2); 797 /* triple indirect blocks for the rest */
786 res += 1LL << (2*(bits-2)); 798 meta_blocks += 1 + DIV_ROUND_UP(upper_limit, ppb) +
787 res += 1LL << (3*(bits-2)); 799 DIV_ROUND_UP(upper_limit, ppb*ppb);
800 res -= meta_blocks;
801check_lfs:
788 res <<= bits; 802 res <<= bits;
789 if (res > upper_limit)
790 res = upper_limit;
791
792 if (res > MAX_LFS_FILESIZE) 803 if (res > MAX_LFS_FILESIZE)
793 res = MAX_LFS_FILESIZE; 804 res = MAX_LFS_FILESIZE;
794 805
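A worked instance makes the rewritten flow concrete. For 4 KiB blocks (bits = 12) the block tree could address about 2^30 blocks, but the 32-bit i_blocks field (counted in 512-byte sectors) caps the file first, and the metadata subtraction then trims roughly 2 GiB off that cap. A hedged standalone calculation following the path the new code takes for this block size (it skips the small-upper_limit branch and the MAX_LFS_FILESIZE clamp, neither of which triggers here):

    #include <stdio.h>

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
        int bits = 12;                                /* 4 KiB blocks */
        unsigned long long ppb = 1ULL << (bits - 2);  /* ptrs per block */
        unsigned long long upper_limit = (1ULL << 32) - 1;  /* i_blocks */
        unsigned long long res, meta_blocks;

        upper_limit >>= (bits - 9);         /* 512B sectors -> fs blocks */

        /* reach of the block tree: 12 direct + ind + dind + tind */
        res = 12 + ppb + ppb * ppb + ppb * ppb * ppb;
        if (res >= upper_limit) {           /* i_blocks is the limit */
            res = upper_limit;
            upper_limit -= 12;
            meta_blocks = 1;                /* single indirect */
            upper_limit -= ppb;
            meta_blocks += 1 + ppb;         /* double indirect */
            upper_limit -= ppb * ppb;
            meta_blocks += 1 + DIV_ROUND_UP(upper_limit, ppb) +
                           DIV_ROUND_UP(upper_limit, ppb * ppb);
            res -= meta_blocks;
        }
        printf("max file size: %llu bytes (~%.3f TiB)\n", res << bits,
               (double)(res << bits) / (1ULL << 40));
        return 0;
    }

On this configuration it prints roughly 1.998 TiB, just under the raw 2 TiB i_blocks ceiling.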
@@ -1024,8 +1035,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
1024 sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group); 1035 sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
1025 sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); 1036 sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
1026 1037
1027 if (EXT2_INODE_SIZE(sb) == 0)
1028 goto cantfind_ext2;
1029 sbi->s_inodes_per_block = sb->s_blocksize / EXT2_INODE_SIZE(sb); 1038 sbi->s_inodes_per_block = sb->s_blocksize / EXT2_INODE_SIZE(sb);
1030 if (sbi->s_inodes_per_block == 0 || sbi->s_inodes_per_group == 0) 1039 if (sbi->s_inodes_per_block == 0 || sbi->s_inodes_per_group == 0)
1031 goto cantfind_ext2; 1040 goto cantfind_ext2;
@@ -1087,12 +1096,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
1087 sizeof(struct buffer_head *), 1096 sizeof(struct buffer_head *),
1088 GFP_KERNEL); 1097 GFP_KERNEL);
1089 if (sbi->s_group_desc == NULL) { 1098 if (sbi->s_group_desc == NULL) {
1099 ret = -ENOMEM;
1090 ext2_msg(sb, KERN_ERR, "error: not enough memory"); 1100 ext2_msg(sb, KERN_ERR, "error: not enough memory");
1091 goto failed_mount; 1101 goto failed_mount;
1092 } 1102 }
1093 bgl_lock_init(sbi->s_blockgroup_lock); 1103 bgl_lock_init(sbi->s_blockgroup_lock);
1094 sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL); 1104 sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL);
1095 if (!sbi->s_debts) { 1105 if (!sbi->s_debts) {
1106 ret = -ENOMEM;
1096 ext2_msg(sb, KERN_ERR, "error: not enough memory"); 1107 ext2_msg(sb, KERN_ERR, "error: not enough memory");
1097 goto failed_mount_group_desc; 1108 goto failed_mount_group_desc;
1098 } 1109 }
@@ -1148,6 +1159,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
1148#ifdef CONFIG_EXT2_FS_XATTR 1159#ifdef CONFIG_EXT2_FS_XATTR
1149 sbi->s_ea_block_cache = ext2_xattr_create_cache(); 1160 sbi->s_ea_block_cache = ext2_xattr_create_cache();
1150 if (!sbi->s_ea_block_cache) { 1161 if (!sbi->s_ea_block_cache) {
1162 ret = -ENOMEM;
1151 ext2_msg(sb, KERN_ERR, "Failed to create ea_block_cache"); 1163 ext2_msg(sb, KERN_ERR, "Failed to create ea_block_cache");
1152 goto failed_mount3; 1164 goto failed_mount3;
1153 } 1165 }
diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c
index d5589ddcc281..00cdb8679486 100644
--- a/fs/ext2/symlink.c
+++ b/fs/ext2/symlink.c
@@ -23,6 +23,7 @@
23 23
24const struct inode_operations ext2_symlink_inode_operations = { 24const struct inode_operations ext2_symlink_inode_operations = {
25 .get_link = page_get_link, 25 .get_link = page_get_link,
26 .getattr = ext2_getattr,
26 .setattr = ext2_setattr, 27 .setattr = ext2_setattr,
27#ifdef CONFIG_EXT2_FS_XATTR 28#ifdef CONFIG_EXT2_FS_XATTR
28 .listxattr = ext2_listxattr, 29 .listxattr = ext2_listxattr,
@@ -31,6 +32,7 @@ const struct inode_operations ext2_symlink_inode_operations = {
31 32
32const struct inode_operations ext2_fast_symlink_inode_operations = { 33const struct inode_operations ext2_fast_symlink_inode_operations = {
33 .get_link = simple_get_link, 34 .get_link = simple_get_link,
35 .getattr = ext2_getattr,
34 .setattr = ext2_setattr, 36 .setattr = ext2_setattr,
35#ifdef CONFIG_EXT2_FS_XATTR 37#ifdef CONFIG_EXT2_FS_XATTR
36 .listxattr = ext2_listxattr, 38 .listxattr = ext2_listxattr,
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 4f30876ee325..1e33e0ac8cf1 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -342,6 +342,7 @@ static void ext2_xattr_update_super_block(struct super_block *sb)
342 return; 342 return;
343 343
344 spin_lock(&EXT2_SB(sb)->s_lock); 344 spin_lock(&EXT2_SB(sb)->s_lock);
345 ext2_update_dynamic_rev(sb);
345 EXT2_SET_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR); 346 EXT2_SET_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR);
346 spin_unlock(&EXT2_SB(sb)->s_lock); 347 spin_unlock(&EXT2_SB(sb)->s_lock);
347 mark_buffer_dirty(EXT2_SB(sb)->s_sbh); 348 mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
diff --git a/fs/fs_types.c b/fs/fs_types.c
new file mode 100644
index 000000000000..78365e5dc08c
--- /dev/null
+++ b/fs/fs_types.c
@@ -0,0 +1,105 @@
1// SPDX-License-Identifier: GPL-2.0
2#include <linux/fs.h>
3#include <linux/export.h>
4
5/*
6 * fs on-disk file type to dirent file type conversion
7 */
8static const unsigned char fs_dtype_by_ftype[FT_MAX] = {
9 [FT_UNKNOWN] = DT_UNKNOWN,
10 [FT_REG_FILE] = DT_REG,
11 [FT_DIR] = DT_DIR,
12 [FT_CHRDEV] = DT_CHR,
13 [FT_BLKDEV] = DT_BLK,
14 [FT_FIFO] = DT_FIFO,
15 [FT_SOCK] = DT_SOCK,
16 [FT_SYMLINK] = DT_LNK
17};
18
19/**
20 * fs_ftype_to_dtype() - fs on-disk file type to dirent type.
21 * @filetype: The on-disk file type to convert.
22 *
23 * This function converts the on-disk file type value (FT_*) to the directory
24 * entry type (DT_*).
25 *
26 * Context: Any context.
27 * Return:
28 * * DT_UNKNOWN - Unknown type
29 * * DT_FIFO - FIFO
30 * * DT_CHR - Character device
31 * * DT_DIR - Directory
32 * * DT_BLK - Block device
33 * * DT_REG - Regular file
34 * * DT_LNK - Symbolic link
35 * * DT_SOCK - Local-domain socket
36 */
37unsigned char fs_ftype_to_dtype(unsigned int filetype)
38{
39 if (filetype >= FT_MAX)
40 return DT_UNKNOWN;
41
42 return fs_dtype_by_ftype[filetype];
43}
44EXPORT_SYMBOL_GPL(fs_ftype_to_dtype);
45
46/*
47 * dirent file type to fs on-disk file type conversion
48 * Values not initialized explicitly are FT_UNKNOWN (0).
49 */
50static const unsigned char fs_ftype_by_dtype[DT_MAX] = {
51 [DT_REG] = FT_REG_FILE,
52 [DT_DIR] = FT_DIR,
53 [DT_LNK] = FT_SYMLINK,
54 [DT_CHR] = FT_CHRDEV,
55 [DT_BLK] = FT_BLKDEV,
56 [DT_FIFO] = FT_FIFO,
57 [DT_SOCK] = FT_SOCK,
58};
59
60/**
61 * fs_umode_to_ftype() - file mode to on-disk file type.
62 * @mode: The file mode to convert.
63 *
64 * This function converts the file mode value to the on-disk file type (FT_*).
65 *
66 * Context: Any context.
67 * Return:
68 * * FT_UNKNOWN - Unknown type
69 * * FT_REG_FILE - Regular file
70 * * FT_DIR - Directory
71 * * FT_CHRDEV - Character device
72 * * FT_BLKDEV - Block device
73 * * FT_FIFO - FIFO
74 * * FT_SOCK - Local-domain socket
75 * * FT_SYMLINK - Symbolic link
76 */
77unsigned char fs_umode_to_ftype(umode_t mode)
78{
79 return fs_ftype_by_dtype[S_DT(mode)];
80}
81EXPORT_SYMBOL_GPL(fs_umode_to_ftype);
82
83/**
84 * fs_umode_to_dtype() - file mode to dirent file type.
85 * @mode: The file mode to convert.
86 *
87 * This function converts the file mode value to the directory
88 * entry type (DT_*).
89 *
90 * Context: Any context.
91 * Return:
92 * * DT_UNKNOWN - Unknown type
93 * * DT_FIFO - FIFO
94 * * DT_CHR - Character device
95 * * DT_DIR - Directory
96 * * DT_BLK - Block device
97 * * DT_REG - Regular file
98 * * DT_LNK - Symbolic link
99 * * DT_SOCK - Local-domain socket
100 */
101unsigned char fs_umode_to_dtype(umode_t mode)
102{
103 return fs_ftype_to_dtype(fs_umode_to_ftype(mode));
104}
105EXPORT_SYMBOL_GPL(fs_umode_to_dtype);
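
For context, a minimal sketch of how a filesystem could put the new helpers to work (the myfs_* names are hypothetical illustrations; only fs_umode_to_ftype(), fs_ftype_to_dtype() and dir_emit() are real interfaces):

#include <linux/fs.h>
#include <linux/string.h>

/* Hypothetical on-disk directory entry of a toy filesystem */
struct myfs_dirent {
	u64	ino;
	u8	file_type;	/* on-disk FT_* value */
	char	name[256];
};

/* Record the on-disk type when an entry is created... */
static void myfs_set_dirent_type(struct myfs_dirent *de, struct inode *inode)
{
	de->file_type = fs_umode_to_ftype(inode->i_mode);
}

/* ...and translate it back to a DT_* value when iterating the directory */
static bool myfs_emit(struct dir_context *ctx, struct myfs_dirent *de)
{
	return dir_emit(ctx, de->name, strlen(de->name), de->ino,
			fs_ftype_to_dtype(de->file_type));
}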
diff --git a/fs/namei.c b/fs/namei.c
index d604f6b3bcc3..0a8c5c27f90e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2718,7 +2718,7 @@ filename_mountpoint(int dfd, struct filename *name, struct path *path,
2718 if (unlikely(error == -ESTALE)) 2718 if (unlikely(error == -ESTALE))
2719 error = path_mountpoint(&nd, flags | LOOKUP_REVAL, path); 2719 error = path_mountpoint(&nd, flags | LOOKUP_REVAL, path);
2720 if (likely(!error)) 2720 if (likely(!error))
2721 audit_inode(name, path->dentry, 0); 2721 audit_inode(name, path->dentry, flags & LOOKUP_NO_EVAL);
2722 restore_nameidata(); 2722 restore_nameidata();
2723 putname(name); 2723 putname(name);
2724 return error; 2724 return error;
diff --git a/fs/namespace.c b/fs/namespace.c
index c4e83d94840c..98a8c182af4f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1640,6 +1640,8 @@ int ksys_umount(char __user *name, int flags)
1640 if (!(flags & UMOUNT_NOFOLLOW)) 1640 if (!(flags & UMOUNT_NOFOLLOW))
1641 lookup_flags |= LOOKUP_FOLLOW; 1641 lookup_flags |= LOOKUP_FOLLOW;
1642 1642
1643 lookup_flags |= LOOKUP_NO_EVAL;
1644
1643 retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path); 1645 retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path);
1644 if (retval) 1646 if (retval)
1645 goto out; 1647 goto out;
diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig
index 41355ce74ac0..735bfb2e9190 100644
--- a/fs/notify/fanotify/Kconfig
+++ b/fs/notify/fanotify/Kconfig
@@ -2,6 +2,7 @@ config FANOTIFY
2 bool "Filesystem wide access notification" 2 bool "Filesystem wide access notification"
3 select FSNOTIFY 3 select FSNOTIFY
4 select ANON_INODES 4 select ANON_INODES
5 select EXPORTFS
5 default n 6 default n
6 ---help--- 7 ---help---
7 Say Y here to enable fanotify support. fanotify is a file access 8 Say Y here to enable fanotify support. fanotify is a file access
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 3723f3d18d20..6b9c27548997 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -13,22 +13,40 @@
13#include <linux/wait.h> 13#include <linux/wait.h>
14#include <linux/audit.h> 14#include <linux/audit.h>
15#include <linux/sched/mm.h> 15#include <linux/sched/mm.h>
16#include <linux/statfs.h>
16 17
17#include "fanotify.h" 18#include "fanotify.h"
18 19
19static bool should_merge(struct fsnotify_event *old_fsn, 20static bool should_merge(struct fsnotify_event *old_fsn,
20 struct fsnotify_event *new_fsn) 21 struct fsnotify_event *new_fsn)
21{ 22{
22 struct fanotify_event_info *old, *new; 23 struct fanotify_event *old, *new;
23 24
24 pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn); 25 pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn);
25 old = FANOTIFY_E(old_fsn); 26 old = FANOTIFY_E(old_fsn);
26 new = FANOTIFY_E(new_fsn); 27 new = FANOTIFY_E(new_fsn);
27 28
28 if (old_fsn->inode == new_fsn->inode && old->pid == new->pid && 29 if (old_fsn->inode != new_fsn->inode || old->pid != new->pid ||
29 old->path.mnt == new->path.mnt && 30 old->fh_type != new->fh_type || old->fh_len != new->fh_len)
30 old->path.dentry == new->path.dentry) 31 return false;
31 return true; 32
33 if (fanotify_event_has_path(old)) {
34 return old->path.mnt == new->path.mnt &&
35 old->path.dentry == new->path.dentry;
36 } else if (fanotify_event_has_fid(old)) {
37 /*
38 * We want to merge many dirent events in the same dir (i.e.
39 * creates/unlinks/renames), but we do not want to merge dirent
40 * events referring to subdirs with dirent events referring to
 41 * non subdirs; otherwise, the user won't be able to tell from a
 42 * mask FAN_CREATE|FAN_DELETE|FAN_ONDIR whether it describes a
 43 * mkdir+unlink pair or a rmdir+create pair of events.
44 */
45 return (old->mask & FS_ISDIR) == (new->mask & FS_ISDIR) &&
46 fanotify_fid_equal(&old->fid, &new->fid, old->fh_len);
47 }
48
49 /* Do not merge events if we failed to encode fid */
32 return false; 50 return false;
33} 51}
34 52
@@ -36,20 +54,22 @@ static bool should_merge(struct fsnotify_event *old_fsn,
36static int fanotify_merge(struct list_head *list, struct fsnotify_event *event) 54static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
37{ 55{
38 struct fsnotify_event *test_event; 56 struct fsnotify_event *test_event;
57 struct fanotify_event *new;
39 58
40 pr_debug("%s: list=%p event=%p\n", __func__, list, event); 59 pr_debug("%s: list=%p event=%p\n", __func__, list, event);
60 new = FANOTIFY_E(event);
41 61
42 /* 62 /*
43 * Don't merge a permission event with any other event so that we know 63 * Don't merge a permission event with any other event so that we know
44 * the event structure we have created in fanotify_handle_event() is the 64 * the event structure we have created in fanotify_handle_event() is the
45 * one we should check for permission response. 65 * one we should check for permission response.
46 */ 66 */
47 if (fanotify_is_perm_event(event->mask)) 67 if (fanotify_is_perm_event(new->mask))
48 return 0; 68 return 0;
49 69
50 list_for_each_entry_reverse(test_event, list, list) { 70 list_for_each_entry_reverse(test_event, list, list) {
51 if (should_merge(test_event, event)) { 71 if (should_merge(test_event, event)) {
52 test_event->mask |= event->mask; 72 FANOTIFY_E(test_event)->mask |= new->mask;
53 return 1; 73 return 1;
54 } 74 }
55 } 75 }
@@ -57,15 +77,44 @@ static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
57 return 0; 77 return 0;
58} 78}
59 79
80/*
81 * Wait for response to permission event. The function also takes care of
82 * freeing the permission event (or offloads that in case the wait is canceled
83 * by a signal). The function returns 0 in case access got allowed by userspace,
84 * -EPERM in case userspace disallowed the access, and -ERESTARTSYS in case
85 * the wait got interrupted by a signal.
86 */
60static int fanotify_get_response(struct fsnotify_group *group, 87static int fanotify_get_response(struct fsnotify_group *group,
61 struct fanotify_perm_event_info *event, 88 struct fanotify_perm_event *event,
62 struct fsnotify_iter_info *iter_info) 89 struct fsnotify_iter_info *iter_info)
63{ 90{
64 int ret; 91 int ret;
65 92
66 pr_debug("%s: group=%p event=%p\n", __func__, group, event); 93 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
67 94
68 wait_event(group->fanotify_data.access_waitq, event->response); 95 ret = wait_event_killable(group->fanotify_data.access_waitq,
96 event->state == FAN_EVENT_ANSWERED);
97 /* Signal pending? */
98 if (ret < 0) {
99 spin_lock(&group->notification_lock);
100 /* Event reported to userspace and no answer yet? */
101 if (event->state == FAN_EVENT_REPORTED) {
102 /* Event will get freed once userspace answers to it */
103 event->state = FAN_EVENT_CANCELED;
104 spin_unlock(&group->notification_lock);
105 return ret;
106 }
107 /* Event not yet reported? Just remove it. */
108 if (event->state == FAN_EVENT_INIT)
109 fsnotify_remove_queued_event(group, &event->fae.fse);
110 /*
111 * Event may be also answered in case signal delivery raced
112 * with wakeup. In that case we have nothing to do besides
113 * freeing the event and reporting error.
114 */
115 spin_unlock(&group->notification_lock);
116 goto out;
117 }
69 118
70 /* userspace responded, convert to something usable */ 119 /* userspace responded, convert to something usable */
71 switch (event->response & ~FAN_AUDIT) { 120 switch (event->response & ~FAN_AUDIT) {
@@ -81,11 +130,11 @@ static int fanotify_get_response(struct fsnotify_group *group,
81 if (event->response & FAN_AUDIT) 130 if (event->response & FAN_AUDIT)
82 audit_fanotify(event->response & ~FAN_AUDIT); 131 audit_fanotify(event->response & ~FAN_AUDIT);
83 132
84 event->response = 0;
85
86 pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__, 133 pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__,
87 group, event, ret); 134 group, event, ret);
88 135out:
136 fsnotify_destroy_event(group, &event->fae.fse);
137
89 return ret; 138 return ret;
90} 139}
91 140
@@ -95,11 +144,13 @@ static int fanotify_get_response(struct fsnotify_group *group,
95 * been included within the event mask, but have not been explicitly 144 * been included within the event mask, but have not been explicitly
96 * requested by the user, will not be present in the returned mask. 145 * requested by the user, will not be present in the returned mask.
97 */ 146 */
98static u32 fanotify_group_event_mask(struct fsnotify_iter_info *iter_info, 147static u32 fanotify_group_event_mask(struct fsnotify_group *group,
99 u32 event_mask, const void *data, 148 struct fsnotify_iter_info *iter_info,
100 int data_type) 149 u32 event_mask, const void *data,
150 int data_type)
101{ 151{
102 __u32 marks_mask = 0, marks_ignored_mask = 0; 152 __u32 marks_mask = 0, marks_ignored_mask = 0;
153 __u32 test_mask, user_mask = FANOTIFY_OUTGOING_EVENTS;
103 const struct path *path = data; 154 const struct path *path = data;
104 struct fsnotify_mark *mark; 155 struct fsnotify_mark *mark;
105 int type; 156 int type;
@@ -107,14 +158,14 @@ static u32 fanotify_group_event_mask(struct fsnotify_iter_info *iter_info,
107 pr_debug("%s: report_mask=%x mask=%x data=%p data_type=%d\n", 158 pr_debug("%s: report_mask=%x mask=%x data=%p data_type=%d\n",
108 __func__, iter_info->report_mask, event_mask, data, data_type); 159 __func__, iter_info->report_mask, event_mask, data, data_type);
109 160
110 /* If we don't have enough info to send an event to userspace say no */ 161 if (!FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
111 if (data_type != FSNOTIFY_EVENT_PATH) 162 /* Do we have path to open a file descriptor? */
112 return 0; 163 if (data_type != FSNOTIFY_EVENT_PATH)
113 164 return 0;
114 /* Sorry, fanotify only gives a damn about files and dirs */ 165 /* Path type events are only relevant for files and dirs */
115 if (!d_is_reg(path->dentry) && 166 if (!d_is_reg(path->dentry) && !d_can_lookup(path->dentry))
116 !d_can_lookup(path->dentry)) 167 return 0;
117 return 0; 168 }
118 169
119 fsnotify_foreach_obj_type(type) { 170 fsnotify_foreach_obj_type(type) {
120 if (!fsnotify_iter_should_report_type(iter_info, type)) 171 if (!fsnotify_iter_should_report_type(iter_info, type))
@@ -133,20 +184,106 @@ static u32 fanotify_group_event_mask(struct fsnotify_iter_info *iter_info,
133 marks_ignored_mask |= mark->ignored_mask; 184 marks_ignored_mask |= mark->ignored_mask;
134 } 185 }
135 186
136 if (d_is_dir(path->dentry) && 187 test_mask = event_mask & marks_mask & ~marks_ignored_mask;
188
189 /*
190 * dirent modification events (create/delete/move) do not carry the
191 * child entry name/inode information. Instead, we report FAN_ONDIR
 192 * for mkdir/rmdir so the user can differentiate them from creat/unlink.
193 *
194 * For backward compatibility and consistency, do not report FAN_ONDIR
195 * to user in legacy fanotify mode (reporting fd) and report FAN_ONDIR
196 * to user in FAN_REPORT_FID mode for all event types.
197 */
198 if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
199 /* Do not report FAN_ONDIR without any event */
200 if (!(test_mask & ~FAN_ONDIR))
201 return 0;
202 } else {
203 user_mask &= ~FAN_ONDIR;
204 }
205
206 if (event_mask & FS_ISDIR &&
137 !(marks_mask & FS_ISDIR & ~marks_ignored_mask)) 207 !(marks_mask & FS_ISDIR & ~marks_ignored_mask))
138 return 0; 208 return 0;
139 209
140 return event_mask & FANOTIFY_OUTGOING_EVENTS & marks_mask & 210 return test_mask & user_mask;
141 ~marks_ignored_mask; 211}
212
213static int fanotify_encode_fid(struct fanotify_event *event,
214 struct inode *inode, gfp_t gfp,
215 __kernel_fsid_t *fsid)
216{
217 struct fanotify_fid *fid = &event->fid;
218 int dwords, bytes = 0;
219 int err, type;
220
221 fid->ext_fh = NULL;
222 dwords = 0;
223 err = -ENOENT;
224 type = exportfs_encode_inode_fh(inode, NULL, &dwords, NULL);
225 if (!dwords)
226 goto out_err;
227
228 bytes = dwords << 2;
229 if (bytes > FANOTIFY_INLINE_FH_LEN) {
230 /* Treat failure to allocate fh as failure to allocate event */
231 err = -ENOMEM;
232 fid->ext_fh = kmalloc(bytes, gfp);
233 if (!fid->ext_fh)
234 goto out_err;
235 }
236
237 type = exportfs_encode_inode_fh(inode, fanotify_fid_fh(fid, bytes),
238 &dwords, NULL);
239 err = -EINVAL;
240 if (!type || type == FILEID_INVALID || bytes != dwords << 2)
241 goto out_err;
242
243 fid->fsid = *fsid;
244 event->fh_len = bytes;
245
246 return type;
247
248out_err:
249 pr_warn_ratelimited("fanotify: failed to encode fid (fsid=%x.%x, "
250 "type=%d, bytes=%d, err=%i)\n",
251 fsid->val[0], fsid->val[1], type, bytes, err);
252 kfree(fid->ext_fh);
253 fid->ext_fh = NULL;
254 event->fh_len = 0;
255
256 return FILEID_INVALID;
142} 257}
143 258
144struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group, 259/*
145 struct inode *inode, u32 mask, 260 * The inode to use as identifier when reporting fid depends on the event.
146 const struct path *path) 261 * Report the modified directory inode on dirent modification events.
262 * Report the "victim" inode otherwise.
263 * For example:
264 * FS_ATTRIB reports the child inode even if reported on a watched parent.
265 * FS_CREATE reports the modified dir inode and not the created inode.
266 */
267static struct inode *fanotify_fid_inode(struct inode *to_tell, u32 event_mask,
268 const void *data, int data_type)
147{ 269{
148 struct fanotify_event_info *event = NULL; 270 if (event_mask & ALL_FSNOTIFY_DIRENT_EVENTS)
271 return to_tell;
272 else if (data_type == FSNOTIFY_EVENT_INODE)
273 return (struct inode *)data;
274 else if (data_type == FSNOTIFY_EVENT_PATH)
275 return d_inode(((struct path *)data)->dentry);
276 return NULL;
277}
278
279struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
280 struct inode *inode, u32 mask,
281 const void *data, int data_type,
282 __kernel_fsid_t *fsid)
283{
284 struct fanotify_event *event = NULL;
149 gfp_t gfp = GFP_KERNEL_ACCOUNT; 285 gfp_t gfp = GFP_KERNEL_ACCOUNT;
286 struct inode *id = fanotify_fid_inode(inode, mask, data, data_type);
150 287
151 /* 288 /*
152 * For queues with unlimited length lost events are not expected and 289 * For queues with unlimited length lost events are not expected and
@@ -160,28 +297,36 @@ struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group,
160 memalloc_use_memcg(group->memcg); 297 memalloc_use_memcg(group->memcg);
161 298
162 if (fanotify_is_perm_event(mask)) { 299 if (fanotify_is_perm_event(mask)) {
163 struct fanotify_perm_event_info *pevent; 300 struct fanotify_perm_event *pevent;
164 301
165 pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp); 302 pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp);
166 if (!pevent) 303 if (!pevent)
167 goto out; 304 goto out;
168 event = &pevent->fae; 305 event = &pevent->fae;
169 pevent->response = 0; 306 pevent->response = 0;
307 pevent->state = FAN_EVENT_INIT;
170 goto init; 308 goto init;
171 } 309 }
172 event = kmem_cache_alloc(fanotify_event_cachep, gfp); 310 event = kmem_cache_alloc(fanotify_event_cachep, gfp);
173 if (!event) 311 if (!event)
174 goto out; 312 goto out;
175init: __maybe_unused 313init: __maybe_unused
176 fsnotify_init_event(&event->fse, inode, mask); 314 fsnotify_init_event(&event->fse, inode);
315 event->mask = mask;
177 if (FAN_GROUP_FLAG(group, FAN_REPORT_TID)) 316 if (FAN_GROUP_FLAG(group, FAN_REPORT_TID))
178 event->pid = get_pid(task_pid(current)); 317 event->pid = get_pid(task_pid(current));
179 else 318 else
180 event->pid = get_pid(task_tgid(current)); 319 event->pid = get_pid(task_tgid(current));
181 if (path) { 320 event->fh_len = 0;
182 event->path = *path; 321 if (id && FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
322 /* Report the event without a file identifier on encode error */
323 event->fh_type = fanotify_encode_fid(event, id, gfp, fsid);
324 } else if (data_type == FSNOTIFY_EVENT_PATH) {
325 event->fh_type = FILEID_ROOT;
326 event->path = *((struct path *)data);
183 path_get(&event->path); 327 path_get(&event->path);
184 } else { 328 } else {
329 event->fh_type = FILEID_INVALID;
185 event->path.mnt = NULL; 330 event->path.mnt = NULL;
186 event->path.dentry = NULL; 331 event->path.dentry = NULL;
187 } 332 }
@@ -190,6 +335,29 @@ out:
190 return event; 335 return event;
191} 336}
192 337
338/*
339 * Get cached fsid of the filesystem containing the object from any connector.
340 * All connectors are supposed to have the same fsid, but we do not verify that
341 * here.
342 */
343static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info)
344{
345 int type;
346 __kernel_fsid_t fsid = {};
347
348 fsnotify_foreach_obj_type(type) {
349 if (!fsnotify_iter_should_report_type(iter_info, type))
350 continue;
351
352 fsid = iter_info->marks[type]->connector->fsid;
353 if (WARN_ON_ONCE(!fsid.val[0] && !fsid.val[1]))
354 continue;
355 return fsid;
356 }
357
358 return fsid;
359}
360
193static int fanotify_handle_event(struct fsnotify_group *group, 361static int fanotify_handle_event(struct fsnotify_group *group,
194 struct inode *inode, 362 struct inode *inode,
195 u32 mask, const void *data, int data_type, 363 u32 mask, const void *data, int data_type,
@@ -197,14 +365,22 @@ static int fanotify_handle_event(struct fsnotify_group *group,
197 struct fsnotify_iter_info *iter_info) 365 struct fsnotify_iter_info *iter_info)
198{ 366{
199 int ret = 0; 367 int ret = 0;
200 struct fanotify_event_info *event; 368 struct fanotify_event *event;
201 struct fsnotify_event *fsn_event; 369 struct fsnotify_event *fsn_event;
370 __kernel_fsid_t fsid = {};
202 371
203 BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS); 372 BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
204 BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY); 373 BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
374 BUILD_BUG_ON(FAN_ATTRIB != FS_ATTRIB);
205 BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); 375 BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
206 BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE); 376 BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE);
207 BUILD_BUG_ON(FAN_OPEN != FS_OPEN); 377 BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
378 BUILD_BUG_ON(FAN_MOVED_TO != FS_MOVED_TO);
379 BUILD_BUG_ON(FAN_MOVED_FROM != FS_MOVED_FROM);
380 BUILD_BUG_ON(FAN_CREATE != FS_CREATE);
381 BUILD_BUG_ON(FAN_DELETE != FS_DELETE);
382 BUILD_BUG_ON(FAN_DELETE_SELF != FS_DELETE_SELF);
383 BUILD_BUG_ON(FAN_MOVE_SELF != FS_MOVE_SELF);
208 BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD); 384 BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD);
209 BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW); 385 BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
210 BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM); 386 BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
@@ -213,9 +389,10 @@ static int fanotify_handle_event(struct fsnotify_group *group,
213 BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC); 389 BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC);
214 BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM); 390 BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM);
215 391
216 BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 12); 392 BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 19);
217 393
218 mask = fanotify_group_event_mask(iter_info, mask, data, data_type); 394 mask = fanotify_group_event_mask(group, iter_info, mask, data,
395 data_type);
219 if (!mask) 396 if (!mask)
220 return 0; 397 return 0;
221 398
@@ -231,7 +408,11 @@ static int fanotify_handle_event(struct fsnotify_group *group,
231 return 0; 408 return 0;
232 } 409 }
233 410
234 event = fanotify_alloc_event(group, inode, mask, data); 411 if (FAN_GROUP_FLAG(group, FAN_REPORT_FID))
412 fsid = fanotify_get_fsid(iter_info);
413
414 event = fanotify_alloc_event(group, inode, mask, data, data_type,
415 &fsid);
235 ret = -ENOMEM; 416 ret = -ENOMEM;
236 if (unlikely(!event)) { 417 if (unlikely(!event)) {
237 /* 418 /*
@@ -255,7 +436,6 @@ static int fanotify_handle_event(struct fsnotify_group *group,
255 } else if (fanotify_is_perm_event(mask)) { 436 } else if (fanotify_is_perm_event(mask)) {
256 ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event), 437 ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event),
257 iter_info); 438 iter_info);
258 fsnotify_destroy_event(group, fsn_event);
259 } 439 }
260finish: 440finish:
261 if (fanotify_is_perm_event(mask)) 441 if (fanotify_is_perm_event(mask))
@@ -275,12 +455,15 @@ static void fanotify_free_group_priv(struct fsnotify_group *group)
275 455
276static void fanotify_free_event(struct fsnotify_event *fsn_event) 456static void fanotify_free_event(struct fsnotify_event *fsn_event)
277{ 457{
278 struct fanotify_event_info *event; 458 struct fanotify_event *event;
279 459
280 event = FANOTIFY_E(fsn_event); 460 event = FANOTIFY_E(fsn_event);
281 path_put(&event->path); 461 if (fanotify_event_has_path(event))
462 path_put(&event->path);
463 else if (fanotify_event_has_ext_fh(event))
464 kfree(event->fid.ext_fh);
282 put_pid(event->pid); 465 put_pid(event->pid);
283 if (fanotify_is_perm_event(fsn_event->mask)) { 466 if (fanotify_is_perm_event(event->mask)) {
284 kmem_cache_free(fanotify_perm_event_cachep, 467 kmem_cache_free(fanotify_perm_event_cachep,
285 FANOTIFY_PE(fsn_event)); 468 FANOTIFY_PE(fsn_event));
286 return; 469 return;
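
The NULL-buffer probe followed by a second, real encode call in fanotify_encode_fid() mirrors the userspace idiom for name_to_handle_at(), which fanotify_test_fid() (further down, in fanotify_user.c) expects listeners to use when comparing event fids against watched objects. A minimal userspace sketch, relying only on the documented EOVERFLOW contract (demo_get_handle is a made-up name):

#define _GNU_SOURCE
#include <fcntl.h>
#include <errno.h>
#include <stdlib.h>

/* Query the handle size first, then encode into a right-sized buffer,
 * the same two-pass pattern fanotify_encode_fid() uses with
 * exportfs_encode_inode_fh() above. */
static struct file_handle *demo_get_handle(const char *path)
{
	struct file_handle probe = { .handle_bytes = 0 };
	struct file_handle *fh;
	int mount_id;

	/* First call fails with EOVERFLOW but fills in handle_bytes */
	if (name_to_handle_at(AT_FDCWD, path, &probe, &mount_id, 0) != -1 ||
	    errno != EOVERFLOW)
		return NULL;	/* fs does not support file handles */

	fh = malloc(sizeof(*fh) + probe.handle_bytes);
	if (!fh)
		return NULL;
	fh->handle_bytes = probe.handle_bytes;
	if (name_to_handle_at(AT_FDCWD, path, fh, &mount_id, 0) == -1) {
		free(fh);
		return NULL;
	}
	return fh;	/* fh->f_handle can be memcmp()ed against event fids */
}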
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index ea05b8a401e7..68b30504284c 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -2,26 +2,112 @@
2#include <linux/fsnotify_backend.h> 2#include <linux/fsnotify_backend.h>
3#include <linux/path.h> 3#include <linux/path.h>
4#include <linux/slab.h> 4#include <linux/slab.h>
5#include <linux/exportfs.h>
5 6
6extern struct kmem_cache *fanotify_mark_cache; 7extern struct kmem_cache *fanotify_mark_cache;
7extern struct kmem_cache *fanotify_event_cachep; 8extern struct kmem_cache *fanotify_event_cachep;
8extern struct kmem_cache *fanotify_perm_event_cachep; 9extern struct kmem_cache *fanotify_perm_event_cachep;
9 10
11/* Possible states of the permission event */
12enum {
13 FAN_EVENT_INIT,
14 FAN_EVENT_REPORTED,
15 FAN_EVENT_ANSWERED,
16 FAN_EVENT_CANCELED,
17};
18
19/*
20 * 3 dwords are sufficient for most local fs (64bit ino, 32bit generation).
21 * For 32bit arch, fid increases the size of fanotify_event by 12 bytes and
22 * fh_* fields increase the size of fanotify_event by another 4 bytes.
 23 * For 64bit arch, fid increases the size of fanotify_event by 8 bytes and
24 * fh_* fields are packed in a hole after mask.
25 */
26#if BITS_PER_LONG == 32
27#define FANOTIFY_INLINE_FH_LEN (3 << 2)
28#else
29#define FANOTIFY_INLINE_FH_LEN (4 << 2)
30#endif
31
32struct fanotify_fid {
33 __kernel_fsid_t fsid;
34 union {
35 unsigned char fh[FANOTIFY_INLINE_FH_LEN];
36 unsigned char *ext_fh;
37 };
38};
39
40static inline void *fanotify_fid_fh(struct fanotify_fid *fid,
41 unsigned int fh_len)
42{
43 return fh_len <= FANOTIFY_INLINE_FH_LEN ? fid->fh : fid->ext_fh;
44}
45
46static inline bool fanotify_fid_equal(struct fanotify_fid *fid1,
47 struct fanotify_fid *fid2,
48 unsigned int fh_len)
49{
50 return fid1->fsid.val[0] == fid2->fsid.val[0] &&
51 fid1->fsid.val[1] == fid2->fsid.val[1] &&
52 !memcmp(fanotify_fid_fh(fid1, fh_len),
53 fanotify_fid_fh(fid2, fh_len), fh_len);
54}
55
10/* 56/*
11 * Structure for normal fanotify events. It gets allocated in 57 * Structure for normal fanotify events. It gets allocated in
12 * fanotify_handle_event() and freed when the information is retrieved by 58 * fanotify_handle_event() and freed when the information is retrieved by
13 * userspace 59 * userspace
14 */ 60 */
15struct fanotify_event_info { 61struct fanotify_event {
16 struct fsnotify_event fse; 62 struct fsnotify_event fse;
63 u32 mask;
17 /* 64 /*
18 * We hold ref to this path so it may be dereferenced at any point 65 * Those fields are outside fanotify_fid to pack fanotify_event nicely
19 * during this object's lifetime 66 * on 64bit arch and to use fh_type as an indication of whether path
67 * or fid are used in the union:
68 * FILEID_ROOT (0) for path, > 0 for fid, FILEID_INVALID for neither.
20 */ 69 */
21 struct path path; 70 u8 fh_type;
71 u8 fh_len;
72 u16 pad;
73 union {
74 /*
75 * We hold ref to this path so it may be dereferenced at any
76 * point during this object's lifetime
77 */
78 struct path path;
79 /*
80 * With FAN_REPORT_FID, we do not hold any reference on the
81 * victim object. Instead we store its NFS file handle and its
82 * filesystem's fsid as a unique identifier.
83 */
84 struct fanotify_fid fid;
85 };
22 struct pid *pid; 86 struct pid *pid;
23}; 87};
24 88
89static inline bool fanotify_event_has_path(struct fanotify_event *event)
90{
91 return event->fh_type == FILEID_ROOT;
92}
93
94static inline bool fanotify_event_has_fid(struct fanotify_event *event)
95{
96 return event->fh_type != FILEID_ROOT &&
97 event->fh_type != FILEID_INVALID;
98}
99
100static inline bool fanotify_event_has_ext_fh(struct fanotify_event *event)
101{
102 return fanotify_event_has_fid(event) &&
103 event->fh_len > FANOTIFY_INLINE_FH_LEN;
104}
105
106static inline void *fanotify_event_fh(struct fanotify_event *event)
107{
108 return fanotify_fid_fh(&event->fid, event->fh_len);
109}
110
25/* 111/*
26 * Structure for permission fanotify events. It gets allocated and freed in 112 * Structure for permission fanotify events. It gets allocated and freed in
27 * fanotify_handle_event() since we wait there for user response. When the 113 * fanotify_handle_event() since we wait there for user response. When the
@@ -29,16 +115,17 @@ struct fanotify_event_info {
29 * group->notification_list to group->fanotify_data.access_list to wait for 115 * group->notification_list to group->fanotify_data.access_list to wait for
30 * user response. 116 * user response.
31 */ 117 */
32struct fanotify_perm_event_info { 118struct fanotify_perm_event {
33 struct fanotify_event_info fae; 119 struct fanotify_event fae;
34 int response; /* userspace answer to question */ 120 unsigned short response; /* userspace answer to the event */
121 unsigned short state; /* state of the event */
35 int fd; /* fd we passed to userspace for this event */ 122 int fd; /* fd we passed to userspace for this event */
36}; 123};
37 124
38static inline struct fanotify_perm_event_info * 125static inline struct fanotify_perm_event *
39FANOTIFY_PE(struct fsnotify_event *fse) 126FANOTIFY_PE(struct fsnotify_event *fse)
40{ 127{
41 return container_of(fse, struct fanotify_perm_event_info, fae.fse); 128 return container_of(fse, struct fanotify_perm_event, fae.fse);
42} 129}
43 130
44static inline bool fanotify_is_perm_event(u32 mask) 131static inline bool fanotify_is_perm_event(u32 mask)
@@ -47,11 +134,12 @@ static inline bool fanotify_is_perm_event(u32 mask)
47 mask & FANOTIFY_PERM_EVENTS; 134 mask & FANOTIFY_PERM_EVENTS;
48} 135}
49 136
50static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse) 137static inline struct fanotify_event *FANOTIFY_E(struct fsnotify_event *fse)
51{ 138{
52 return container_of(fse, struct fanotify_event_info, fse); 139 return container_of(fse, struct fanotify_event, fse);
53} 140}
54 141
55struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group, 142struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
56 struct inode *inode, u32 mask, 143 struct inode *inode, u32 mask,
57 const struct path *path); 144 const void *data, int data_type,
145 __kernel_fsid_t *fsid);
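
Reading the state enum above together with the fanotify.c hunks, the permission-event lifecycle works out to roughly this state table (an informal summary, not part of the patch itself):

FAN_EVENT_INIT      -- listener read()s the event ------>  FAN_EVENT_REPORTED
FAN_EVENT_REPORTED  -- listener write()s a response ---->  FAN_EVENT_ANSWERED  (waiter wakes and frees)
FAN_EVENT_REPORTED  -- waiter killed by a signal ------->  FAN_EVENT_CANCELED  (freed when the listener answers)
FAN_EVENT_INIT      -- waiter killed by a signal ------->  dequeued and freed by the waiter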
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 9c870b0d2b56..56992b32c6bb 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -17,6 +17,8 @@
17#include <linux/compat.h> 17#include <linux/compat.h>
18#include <linux/sched/signal.h> 18#include <linux/sched/signal.h>
19#include <linux/memcontrol.h> 19#include <linux/memcontrol.h>
20#include <linux/statfs.h>
21#include <linux/exportfs.h>
20 22
21#include <asm/ioctls.h> 23#include <asm/ioctls.h>
22 24
@@ -47,33 +49,55 @@ struct kmem_cache *fanotify_mark_cache __read_mostly;
47struct kmem_cache *fanotify_event_cachep __read_mostly; 49struct kmem_cache *fanotify_event_cachep __read_mostly;
48struct kmem_cache *fanotify_perm_event_cachep __read_mostly; 50struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
49 51
52#define FANOTIFY_EVENT_ALIGN 4
53
54static int fanotify_event_info_len(struct fanotify_event *event)
55{
56 if (!fanotify_event_has_fid(event))
57 return 0;
58
59 return roundup(sizeof(struct fanotify_event_info_fid) +
60 sizeof(struct file_handle) + event->fh_len,
61 FANOTIFY_EVENT_ALIGN);
62}
63
50/* 64/*
51 * Get an fsnotify notification event if one exists and is small 65 * Get an fsnotify notification event if one exists and is small
52 * enough to fit in "count". Return an error pointer if the count 66 * enough to fit in "count". Return an error pointer if the count
53 * is not large enough. 67 * is not large enough. When permission event is dequeued, its state is
54 * 68 * updated accordingly.
55 * Called with the group->notification_lock held.
56 */ 69 */
57static struct fsnotify_event *get_one_event(struct fsnotify_group *group, 70static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
58 size_t count) 71 size_t count)
59{ 72{
60 assert_spin_locked(&group->notification_lock); 73 size_t event_size = FAN_EVENT_METADATA_LEN;
74 struct fsnotify_event *fsn_event = NULL;
61 75
62 pr_debug("%s: group=%p count=%zd\n", __func__, group, count); 76 pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
63 77
78 spin_lock(&group->notification_lock);
64 if (fsnotify_notify_queue_is_empty(group)) 79 if (fsnotify_notify_queue_is_empty(group))
65 return NULL; 80 goto out;
66 81
67 if (FAN_EVENT_METADATA_LEN > count) 82 if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
68 return ERR_PTR(-EINVAL); 83 event_size += fanotify_event_info_len(
84 FANOTIFY_E(fsnotify_peek_first_event(group)));
85 }
69 86
70 /* held the notification_lock the whole time, so this is the 87 if (event_size > count) {
71 * same event we peeked above */ 88 fsn_event = ERR_PTR(-EINVAL);
72 return fsnotify_remove_first_event(group); 89 goto out;
90 }
91 fsn_event = fsnotify_remove_first_event(group);
92 if (fanotify_is_perm_event(FANOTIFY_E(fsn_event)->mask))
93 FANOTIFY_PE(fsn_event)->state = FAN_EVENT_REPORTED;
94out:
95 spin_unlock(&group->notification_lock);
96 return fsn_event;
73} 97}
74 98
75static int create_fd(struct fsnotify_group *group, 99static int create_fd(struct fsnotify_group *group,
76 struct fanotify_event_info *event, 100 struct fanotify_event *event,
77 struct file **file) 101 struct file **file)
78{ 102{
79 int client_fd; 103 int client_fd;
@@ -114,62 +138,32 @@ static int create_fd(struct fsnotify_group *group,
114 return client_fd; 138 return client_fd;
115} 139}
116 140
117static int fill_event_metadata(struct fsnotify_group *group, 141/*
118 struct fanotify_event_metadata *metadata, 142 * Finish processing of permission event by setting it to ANSWERED state and
119 struct fsnotify_event *fsn_event, 143 * drop group->notification_lock.
120 struct file **file) 144 */
121{ 145static void finish_permission_event(struct fsnotify_group *group,
122 int ret = 0; 146 struct fanotify_perm_event *event,
123 struct fanotify_event_info *event; 147 unsigned int response)
124 148 __releases(&group->notification_lock)
125 pr_debug("%s: group=%p metadata=%p event=%p\n", __func__,
126 group, metadata, fsn_event);
127
128 *file = NULL;
129 event = container_of(fsn_event, struct fanotify_event_info, fse);
130 metadata->event_len = FAN_EVENT_METADATA_LEN;
131 metadata->metadata_len = FAN_EVENT_METADATA_LEN;
132 metadata->vers = FANOTIFY_METADATA_VERSION;
133 metadata->reserved = 0;
134 metadata->mask = fsn_event->mask & FANOTIFY_OUTGOING_EVENTS;
135 metadata->pid = pid_vnr(event->pid);
136 if (unlikely(fsn_event->mask & FAN_Q_OVERFLOW))
137 metadata->fd = FAN_NOFD;
138 else {
139 metadata->fd = create_fd(group, event, file);
140 if (metadata->fd < 0)
141 ret = metadata->fd;
142 }
143
144 return ret;
145}
146
147static struct fanotify_perm_event_info *dequeue_event(
148 struct fsnotify_group *group, int fd)
149{ 149{
150 struct fanotify_perm_event_info *event, *return_e = NULL; 150 bool destroy = false;
151
152 spin_lock(&group->notification_lock);
153 list_for_each_entry(event, &group->fanotify_data.access_list,
154 fae.fse.list) {
155 if (event->fd != fd)
156 continue;
157 151
158 list_del_init(&event->fae.fse.list); 152 assert_spin_locked(&group->notification_lock);
159 return_e = event; 153 event->response = response;
160 break; 154 if (event->state == FAN_EVENT_CANCELED)
161 } 155 destroy = true;
156 else
157 event->state = FAN_EVENT_ANSWERED;
162 spin_unlock(&group->notification_lock); 158 spin_unlock(&group->notification_lock);
163 159 if (destroy)
164 pr_debug("%s: found return_re=%p\n", __func__, return_e); 160 fsnotify_destroy_event(group, &event->fae.fse);
165
166 return return_e;
167} 161}
168 162
169static int process_access_response(struct fsnotify_group *group, 163static int process_access_response(struct fsnotify_group *group,
170 struct fanotify_response *response_struct) 164 struct fanotify_response *response_struct)
171{ 165{
172 struct fanotify_perm_event_info *event; 166 struct fanotify_perm_event *event;
173 int fd = response_struct->fd; 167 int fd = response_struct->fd;
174 int response = response_struct->response; 168 int response = response_struct->response;
175 169
@@ -194,48 +188,115 @@ static int process_access_response(struct fsnotify_group *group,
194 if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT)) 188 if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT))
195 return -EINVAL; 189 return -EINVAL;
196 190
197 event = dequeue_event(group, fd); 191 spin_lock(&group->notification_lock);
198 if (!event) 192 list_for_each_entry(event, &group->fanotify_data.access_list,
199 return -ENOENT; 193 fae.fse.list) {
194 if (event->fd != fd)
195 continue;
200 196
201 event->response = response; 197 list_del_init(&event->fae.fse.list);
202 wake_up(&group->fanotify_data.access_waitq); 198 finish_permission_event(group, event, response);
199 wake_up(&group->fanotify_data.access_waitq);
200 return 0;
201 }
202 spin_unlock(&group->notification_lock);
203
204 return -ENOENT;
205}
206
207static int copy_fid_to_user(struct fanotify_event *event, char __user *buf)
208{
209 struct fanotify_event_info_fid info = { };
210 struct file_handle handle = { };
211 size_t fh_len = event->fh_len;
212 size_t len = fanotify_event_info_len(event);
213
214 if (!len)
215 return 0;
216
217 if (WARN_ON_ONCE(len < sizeof(info) + sizeof(handle) + fh_len))
218 return -EFAULT;
219
 220 /* Copy event info fid header followed by variable sized file handle */
221 info.hdr.info_type = FAN_EVENT_INFO_TYPE_FID;
222 info.hdr.len = len;
223 info.fsid = event->fid.fsid;
224 if (copy_to_user(buf, &info, sizeof(info)))
225 return -EFAULT;
226
227 buf += sizeof(info);
228 len -= sizeof(info);
229 handle.handle_type = event->fh_type;
230 handle.handle_bytes = fh_len;
231 if (copy_to_user(buf, &handle, sizeof(handle)))
232 return -EFAULT;
233
234 buf += sizeof(handle);
235 len -= sizeof(handle);
236 if (copy_to_user(buf, fanotify_event_fh(event), fh_len))
237 return -EFAULT;
238
239 /* Pad with 0's */
240 buf += fh_len;
241 len -= fh_len;
242 WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN);
243 if (len > 0 && clear_user(buf, len))
244 return -EFAULT;
203 245
204 return 0; 246 return 0;
205} 247}
206 248
207static ssize_t copy_event_to_user(struct fsnotify_group *group, 249static ssize_t copy_event_to_user(struct fsnotify_group *group,
208 struct fsnotify_event *event, 250 struct fsnotify_event *fsn_event,
209 char __user *buf, size_t count) 251 char __user *buf, size_t count)
210{ 252{
211 struct fanotify_event_metadata fanotify_event_metadata; 253 struct fanotify_event_metadata metadata;
212 struct file *f; 254 struct fanotify_event *event;
213 int fd, ret; 255 struct file *f = NULL;
214 256 int ret, fd = FAN_NOFD;
215 pr_debug("%s: group=%p event=%p\n", __func__, group, event); 257
216 258 pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event);
217 ret = fill_event_metadata(group, &fanotify_event_metadata, event, &f); 259
218 if (ret < 0) 260 event = container_of(fsn_event, struct fanotify_event, fse);
219 return ret; 261 metadata.event_len = FAN_EVENT_METADATA_LEN;
262 metadata.metadata_len = FAN_EVENT_METADATA_LEN;
263 metadata.vers = FANOTIFY_METADATA_VERSION;
264 metadata.reserved = 0;
265 metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS;
266 metadata.pid = pid_vnr(event->pid);
267
268 if (fanotify_event_has_path(event)) {
269 fd = create_fd(group, event, &f);
270 if (fd < 0)
271 return fd;
272 } else if (fanotify_event_has_fid(event)) {
273 metadata.event_len += fanotify_event_info_len(event);
274 }
275 metadata.fd = fd;
220 276
221 fd = fanotify_event_metadata.fd;
222 ret = -EFAULT; 277 ret = -EFAULT;
223 /* 278 /*
224 * Sanity check copy size in case get_one_event() and 279 * Sanity check copy size in case get_one_event() and
225 * fill_event_metadata() event_len sizes ever get out of sync. 280 * fill_event_metadata() event_len sizes ever get out of sync.
226 */ 281 */
227 if (WARN_ON_ONCE(fanotify_event_metadata.event_len > count)) 282 if (WARN_ON_ONCE(metadata.event_len > count))
228 goto out_close_fd; 283 goto out_close_fd;
229 if (copy_to_user(buf, &fanotify_event_metadata, 284
230 fanotify_event_metadata.event_len)) 285 if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN))
231 goto out_close_fd; 286 goto out_close_fd;
232 287
233 if (fanotify_is_perm_event(event->mask)) 288 if (fanotify_is_perm_event(event->mask))
234 FANOTIFY_PE(event)->fd = fd; 289 FANOTIFY_PE(fsn_event)->fd = fd;
235 290
236 if (fd != FAN_NOFD) 291 if (fanotify_event_has_path(event)) {
237 fd_install(fd, f); 292 fd_install(fd, f);
238 return fanotify_event_metadata.event_len; 293 } else if (fanotify_event_has_fid(event)) {
294 ret = copy_fid_to_user(event, buf + FAN_EVENT_METADATA_LEN);
295 if (ret < 0)
296 return ret;
297 }
298
299 return metadata.event_len;
239 300
240out_close_fd: 301out_close_fd:
241 if (fd != FAN_NOFD) { 302 if (fd != FAN_NOFD) {
@@ -276,10 +337,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
276 337
277 add_wait_queue(&group->notification_waitq, &wait); 338 add_wait_queue(&group->notification_waitq, &wait);
278 while (1) { 339 while (1) {
279 spin_lock(&group->notification_lock);
280 kevent = get_one_event(group, count); 340 kevent = get_one_event(group, count);
281 spin_unlock(&group->notification_lock);
282
283 if (IS_ERR(kevent)) { 341 if (IS_ERR(kevent)) {
284 ret = PTR_ERR(kevent); 342 ret = PTR_ERR(kevent);
285 break; 343 break;
@@ -316,11 +374,13 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
316 * Permission events get queued to wait for response. Other 374 * Permission events get queued to wait for response. Other
317 * events can be destroyed now. 375 * events can be destroyed now.
318 */ 376 */
319 if (!fanotify_is_perm_event(kevent->mask)) { 377 if (!fanotify_is_perm_event(FANOTIFY_E(kevent)->mask)) {
320 fsnotify_destroy_event(group, kevent); 378 fsnotify_destroy_event(group, kevent);
321 } else { 379 } else {
322 if (ret <= 0) { 380 if (ret <= 0) {
323 FANOTIFY_PE(kevent)->response = FAN_DENY; 381 spin_lock(&group->notification_lock);
382 finish_permission_event(group,
383 FANOTIFY_PE(kevent), FAN_DENY);
324 wake_up(&group->fanotify_data.access_waitq); 384 wake_up(&group->fanotify_data.access_waitq);
325 } else { 385 } else {
326 spin_lock(&group->notification_lock); 386 spin_lock(&group->notification_lock);
@@ -370,7 +430,7 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t
370static int fanotify_release(struct inode *ignored, struct file *file) 430static int fanotify_release(struct inode *ignored, struct file *file)
371{ 431{
372 struct fsnotify_group *group = file->private_data; 432 struct fsnotify_group *group = file->private_data;
373 struct fanotify_perm_event_info *event, *next; 433 struct fanotify_perm_event *event;
374 struct fsnotify_event *fsn_event; 434 struct fsnotify_event *fsn_event;
375 435
376 /* 436 /*
@@ -385,13 +445,12 @@ static int fanotify_release(struct inode *ignored, struct file *file)
385 * and simulate reply from userspace. 445 * and simulate reply from userspace.
386 */ 446 */
387 spin_lock(&group->notification_lock); 447 spin_lock(&group->notification_lock);
388 list_for_each_entry_safe(event, next, &group->fanotify_data.access_list, 448 while (!list_empty(&group->fanotify_data.access_list)) {
389 fae.fse.list) { 449 event = list_first_entry(&group->fanotify_data.access_list,
390 pr_debug("%s: found group=%p event=%p\n", __func__, group, 450 struct fanotify_perm_event, fae.fse.list);
391 event);
392
393 list_del_init(&event->fae.fse.list); 451 list_del_init(&event->fae.fse.list);
394 event->response = FAN_ALLOW; 452 finish_permission_event(group, event, FAN_ALLOW);
453 spin_lock(&group->notification_lock);
395 } 454 }
396 455
397 /* 456 /*
@@ -401,13 +460,14 @@ static int fanotify_release(struct inode *ignored, struct file *file)
401 */ 460 */
402 while (!fsnotify_notify_queue_is_empty(group)) { 461 while (!fsnotify_notify_queue_is_empty(group)) {
403 fsn_event = fsnotify_remove_first_event(group); 462 fsn_event = fsnotify_remove_first_event(group);
404 if (!(fsn_event->mask & FANOTIFY_PERM_EVENTS)) { 463 if (!(FANOTIFY_E(fsn_event)->mask & FANOTIFY_PERM_EVENTS)) {
405 spin_unlock(&group->notification_lock); 464 spin_unlock(&group->notification_lock);
406 fsnotify_destroy_event(group, fsn_event); 465 fsnotify_destroy_event(group, fsn_event);
407 spin_lock(&group->notification_lock);
408 } else { 466 } else {
409 FANOTIFY_PE(fsn_event)->response = FAN_ALLOW; 467 finish_permission_event(group, FANOTIFY_PE(fsn_event),
468 FAN_ALLOW);
410 } 469 }
470 spin_lock(&group->notification_lock);
411 } 471 }
412 spin_unlock(&group->notification_lock); 472 spin_unlock(&group->notification_lock);
413 473
@@ -598,7 +658,8 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
598 658
599static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, 659static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
600 fsnotify_connp_t *connp, 660 fsnotify_connp_t *connp,
601 unsigned int type) 661 unsigned int type,
662 __kernel_fsid_t *fsid)
602{ 663{
603 struct fsnotify_mark *mark; 664 struct fsnotify_mark *mark;
604 int ret; 665 int ret;
@@ -611,7 +672,7 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
611 return ERR_PTR(-ENOMEM); 672 return ERR_PTR(-ENOMEM);
612 673
613 fsnotify_init_mark(mark, group); 674 fsnotify_init_mark(mark, group);
614 ret = fsnotify_add_mark_locked(mark, connp, type, 0); 675 ret = fsnotify_add_mark_locked(mark, connp, type, 0, fsid);
615 if (ret) { 676 if (ret) {
616 fsnotify_put_mark(mark); 677 fsnotify_put_mark(mark);
617 return ERR_PTR(ret); 678 return ERR_PTR(ret);
@@ -623,7 +684,8 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
623 684
624static int fanotify_add_mark(struct fsnotify_group *group, 685static int fanotify_add_mark(struct fsnotify_group *group,
625 fsnotify_connp_t *connp, unsigned int type, 686 fsnotify_connp_t *connp, unsigned int type,
626 __u32 mask, unsigned int flags) 687 __u32 mask, unsigned int flags,
688 __kernel_fsid_t *fsid)
627{ 689{
628 struct fsnotify_mark *fsn_mark; 690 struct fsnotify_mark *fsn_mark;
629 __u32 added; 691 __u32 added;
@@ -631,7 +693,7 @@ static int fanotify_add_mark(struct fsnotify_group *group,
631 mutex_lock(&group->mark_mutex); 693 mutex_lock(&group->mark_mutex);
632 fsn_mark = fsnotify_find_mark(connp, group); 694 fsn_mark = fsnotify_find_mark(connp, group);
633 if (!fsn_mark) { 695 if (!fsn_mark) {
634 fsn_mark = fanotify_add_new_mark(group, connp, type); 696 fsn_mark = fanotify_add_new_mark(group, connp, type, fsid);
635 if (IS_ERR(fsn_mark)) { 697 if (IS_ERR(fsn_mark)) {
636 mutex_unlock(&group->mark_mutex); 698 mutex_unlock(&group->mark_mutex);
637 return PTR_ERR(fsn_mark); 699 return PTR_ERR(fsn_mark);
@@ -648,23 +710,23 @@ static int fanotify_add_mark(struct fsnotify_group *group,
648 710
649static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, 711static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
650 struct vfsmount *mnt, __u32 mask, 712 struct vfsmount *mnt, __u32 mask,
651 unsigned int flags) 713 unsigned int flags, __kernel_fsid_t *fsid)
652{ 714{
653 return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks, 715 return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
654 FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags); 716 FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid);
655} 717}
656 718
657static int fanotify_add_sb_mark(struct fsnotify_group *group, 719static int fanotify_add_sb_mark(struct fsnotify_group *group,
658 struct super_block *sb, __u32 mask, 720 struct super_block *sb, __u32 mask,
659 unsigned int flags) 721 unsigned int flags, __kernel_fsid_t *fsid)
660{ 722{
661 return fanotify_add_mark(group, &sb->s_fsnotify_marks, 723 return fanotify_add_mark(group, &sb->s_fsnotify_marks,
662 FSNOTIFY_OBJ_TYPE_SB, mask, flags); 724 FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid);
663} 725}
664 726
665static int fanotify_add_inode_mark(struct fsnotify_group *group, 727static int fanotify_add_inode_mark(struct fsnotify_group *group,
666 struct inode *inode, __u32 mask, 728 struct inode *inode, __u32 mask,
667 unsigned int flags) 729 unsigned int flags, __kernel_fsid_t *fsid)
668{ 730{
669 pr_debug("%s: group=%p inode=%p\n", __func__, group, inode); 731 pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
670 732
@@ -679,7 +741,7 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
679 return 0; 741 return 0;
680 742
681 return fanotify_add_mark(group, &inode->i_fsnotify_marks, 743 return fanotify_add_mark(group, &inode->i_fsnotify_marks,
682 FSNOTIFY_OBJ_TYPE_INODE, mask, flags); 744 FSNOTIFY_OBJ_TYPE_INODE, mask, flags, fsid);
683} 745}
684 746
685/* fanotify syscalls */ 747/* fanotify syscalls */
@@ -688,7 +750,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
688 struct fsnotify_group *group; 750 struct fsnotify_group *group;
689 int f_flags, fd; 751 int f_flags, fd;
690 struct user_struct *user; 752 struct user_struct *user;
691 struct fanotify_event_info *oevent; 753 struct fanotify_event *oevent;
692 754
693 pr_debug("%s: flags=%x event_f_flags=%x\n", 755 pr_debug("%s: flags=%x event_f_flags=%x\n",
694 __func__, flags, event_f_flags); 756 __func__, flags, event_f_flags);
@@ -715,6 +777,10 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
715 return -EINVAL; 777 return -EINVAL;
716 } 778 }
717 779
780 if ((flags & FAN_REPORT_FID) &&
781 (flags & FANOTIFY_CLASS_BITS) != FAN_CLASS_NOTIF)
782 return -EINVAL;
783
718 user = get_current_user(); 784 user = get_current_user();
719 if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) { 785 if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) {
720 free_uid(user); 786 free_uid(user);
@@ -739,7 +805,8 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
739 atomic_inc(&user->fanotify_listeners); 805 atomic_inc(&user->fanotify_listeners);
740 group->memcg = get_mem_cgroup_from_mm(current->mm); 806 group->memcg = get_mem_cgroup_from_mm(current->mm);
741 807
742 oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL); 808 oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL,
809 FSNOTIFY_EVENT_NONE, NULL);
743 if (unlikely(!oevent)) { 810 if (unlikely(!oevent)) {
744 fd = -ENOMEM; 811 fd = -ENOMEM;
745 goto out_destroy_group; 812 goto out_destroy_group;
@@ -801,6 +868,48 @@ out_destroy_group:
801 return fd; 868 return fd;
802} 869}
803 870
871/* Check if filesystem can encode a unique fid */
872static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
873{
874 __kernel_fsid_t root_fsid;
875 int err;
876
877 /*
 878 * Make sure path is not in a filesystem with zero fsid (e.g. tmpfs).
879 */
880 err = vfs_get_fsid(path->dentry, fsid);
881 if (err)
882 return err;
883
884 if (!fsid->val[0] && !fsid->val[1])
885 return -ENODEV;
886
887 /*
888 * Make sure path is not inside a filesystem subvolume (e.g. btrfs)
889 * which uses a different fsid than sb root.
890 */
891 err = vfs_get_fsid(path->dentry->d_sb->s_root, &root_fsid);
892 if (err)
893 return err;
894
895 if (root_fsid.val[0] != fsid->val[0] ||
896 root_fsid.val[1] != fsid->val[1])
897 return -EXDEV;
898
899 /*
900 * We need to make sure that the file system supports at least
901 * encoding a file handle so user can use name_to_handle_at() to
902 * compare fid returned with event to the file handle of watched
903 * objects. However, name_to_handle_at() requires that the
904 * filesystem also supports decoding file handles.
905 */
906 if (!path->dentry->d_sb->s_export_op ||
907 !path->dentry->d_sb->s_export_op->fh_to_dentry)
908 return -EOPNOTSUPP;
909
910 return 0;
911}
912
804static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, 913static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
805 int dfd, const char __user *pathname) 914 int dfd, const char __user *pathname)
806{ 915{
@@ -809,6 +918,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
809 struct fsnotify_group *group; 918 struct fsnotify_group *group;
810 struct fd f; 919 struct fd f;
811 struct path path; 920 struct path path;
921 __kernel_fsid_t __fsid, *fsid = NULL;
812 u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS; 922 u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
813 unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; 923 unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
814 int ret; 924 int ret;
@@ -871,6 +981,18 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
871 group->priority == FS_PRIO_0) 981 group->priority == FS_PRIO_0)
872 goto fput_and_out; 982 goto fput_and_out;
873 983
984 /*
985 * Events with data type inode do not carry enough information to report
986 * event->fd, so we do not allow setting a mask for inode events unless
987 * group supports reporting fid.
988 * inode events are not supported on a mount mark, because they do not
989 * carry enough information (i.e. path) to be filtered by mount point.
990 */
991 if (mask & FANOTIFY_INODE_EVENTS &&
992 (!FAN_GROUP_FLAG(group, FAN_REPORT_FID) ||
993 mark_type == FAN_MARK_MOUNT))
994 goto fput_and_out;
995
874 if (flags & FAN_MARK_FLUSH) { 996 if (flags & FAN_MARK_FLUSH) {
875 ret = 0; 997 ret = 0;
876 if (mark_type == FAN_MARK_MOUNT) 998 if (mark_type == FAN_MARK_MOUNT)
@@ -886,6 +1008,14 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
886 if (ret) 1008 if (ret)
887 goto fput_and_out; 1009 goto fput_and_out;
888 1010
1011 if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
1012 ret = fanotify_test_fid(&path, &__fsid);
1013 if (ret)
1014 goto path_put_and_out;
1015
1016 fsid = &__fsid;
1017 }
1018
889 /* inode held in place by reference to path; group by fget on fd */ 1019 /* inode held in place by reference to path; group by fget on fd */
890 if (mark_type == FAN_MARK_INODE) 1020 if (mark_type == FAN_MARK_INODE)
891 inode = path.dentry->d_inode; 1021 inode = path.dentry->d_inode;
@@ -896,24 +1026,31 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
896 switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) { 1026 switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) {
897 case FAN_MARK_ADD: 1027 case FAN_MARK_ADD:
898 if (mark_type == FAN_MARK_MOUNT) 1028 if (mark_type == FAN_MARK_MOUNT)
899 ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags); 1029 ret = fanotify_add_vfsmount_mark(group, mnt, mask,
1030 flags, fsid);
900 else if (mark_type == FAN_MARK_FILESYSTEM) 1031 else if (mark_type == FAN_MARK_FILESYSTEM)
901 ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask, flags); 1032 ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask,
1033 flags, fsid);
902 else 1034 else
903 ret = fanotify_add_inode_mark(group, inode, mask, flags); 1035 ret = fanotify_add_inode_mark(group, inode, mask,
1036 flags, fsid);
904 break; 1037 break;
905 case FAN_MARK_REMOVE: 1038 case FAN_MARK_REMOVE:
906 if (mark_type == FAN_MARK_MOUNT) 1039 if (mark_type == FAN_MARK_MOUNT)
907 ret = fanotify_remove_vfsmount_mark(group, mnt, mask, flags); 1040 ret = fanotify_remove_vfsmount_mark(group, mnt, mask,
1041 flags);
908 else if (mark_type == FAN_MARK_FILESYSTEM) 1042 else if (mark_type == FAN_MARK_FILESYSTEM)
909 ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask, flags); 1043 ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask,
1044 flags);
910 else 1045 else
911 ret = fanotify_remove_inode_mark(group, inode, mask, flags); 1046 ret = fanotify_remove_inode_mark(group, inode, mask,
1047 flags);
912 break; 1048 break;
913 default: 1049 default:
914 ret = -EINVAL; 1050 ret = -EINVAL;
915 } 1051 }
916 1052
1053path_put_and_out:
917 path_put(&path); 1054 path_put(&path);
918fput_and_out: 1055fput_and_out:
919 fdput(f); 1056 fdput(f);
@@ -950,15 +1087,15 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark,
950 */ 1087 */
951static int __init fanotify_user_setup(void) 1088static int __init fanotify_user_setup(void)
952{ 1089{
953 BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 7); 1090 BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 8);
954 BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9); 1091 BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
955 1092
956 fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, 1093 fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
957 SLAB_PANIC|SLAB_ACCOUNT); 1094 SLAB_PANIC|SLAB_ACCOUNT);
958 fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC); 1095 fanotify_event_cachep = KMEM_CACHE(fanotify_event, SLAB_PANIC);
959 if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) { 1096 if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) {
960 fanotify_perm_event_cachep = 1097 fanotify_perm_event_cachep =
961 KMEM_CACHE(fanotify_perm_event_info, SLAB_PANIC); 1098 KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
962 } 1099 }
963 1100
964 return 0; 1101 return 0;
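
Tying the user-interface changes together, a hedged end-to-end sketch of a FAN_REPORT_FID listener; it assumes uapi headers that already define FAN_REPORT_FID, FAN_EVENT_INFO_TYPE_FID and struct fanotify_event_info_fid, and follows the record layout produced by copy_fid_to_user() above (metadata, then fid info header, fsid, and an opaque struct file_handle):

#define _GNU_SOURCE
#include <sys/fanotify.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
	/* Keep event records naturally aligned for the struct casts below */
	char buf[4096] __attribute__((aligned(8)));
	struct fanotify_event_metadata *md;
	ssize_t len;
	int fd;

	if (argc < 2)
		return 1;
	/* FAN_REPORT_FID is only accepted with FAN_CLASS_NOTIF (see the
	 * fanotify_init() check above) */
	fd = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_FID, 0);
	if (fd < 0)
		return 1;
	/* Dirent events need an fid-reporting group and a non-mount mark */
	if (fanotify_mark(fd, FAN_MARK_ADD,
			  FAN_CREATE | FAN_DELETE | FAN_ONDIR,
			  AT_FDCWD, argv[1]) < 0)
		return 1;

	while ((len = read(fd, buf, sizeof(buf))) > 0) {
		for (md = (void *)buf; FAN_EVENT_OK(md, len);
		     md = FAN_EVENT_NEXT(md, len)) {
			struct fanotify_event_info_fid *fid;
			struct file_handle *fh;

			if (md->event_len <= md->metadata_len)
				continue;	/* no fid info record */
			fid = (void *)((char *)md + md->metadata_len);
			if (fid->hdr.info_type != FAN_EVENT_INFO_TYPE_FID)
				continue;
			fh = (struct file_handle *)fid->handle;
			printf("mask=%llx fsid=%x.%x handle_bytes=%u\n",
			       (unsigned long long)md->mask,
			       fid->fsid.val[0], fid->fsid.val[1],
			       fh->handle_bytes);
		}
	}
	return 0;
}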
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index ecf09b6243d9..df06f3da166c 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -328,16 +328,15 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
328 const unsigned char *file_name, u32 cookie) 328 const unsigned char *file_name, u32 cookie)
329{ 329{
330 struct fsnotify_iter_info iter_info = {}; 330 struct fsnotify_iter_info iter_info = {};
331 struct super_block *sb = NULL; 331 struct super_block *sb = to_tell->i_sb;
332 struct mount *mnt = NULL; 332 struct mount *mnt = NULL;
333 __u32 mnt_or_sb_mask = 0; 333 __u32 mnt_or_sb_mask = sb->s_fsnotify_mask;
334 int ret = 0; 334 int ret = 0;
335 __u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS); 335 __u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS);
336 336
337 if (data_is == FSNOTIFY_EVENT_PATH) { 337 if (data_is == FSNOTIFY_EVENT_PATH) {
338 mnt = real_mount(((const struct path *)data)->mnt); 338 mnt = real_mount(((const struct path *)data)->mnt);
339 sb = mnt->mnt.mnt_sb; 339 mnt_or_sb_mask |= mnt->mnt_fsnotify_mask;
340 mnt_or_sb_mask = mnt->mnt_fsnotify_mask | sb->s_fsnotify_mask;
341 } 340 }
342 /* An event "on child" is not intended for a mount/sb mark */ 341 /* An event "on child" is not intended for a mount/sb mark */
343 if (mask & FS_EVENT_ON_CHILD) 342 if (mask & FS_EVENT_ON_CHILD)
@@ -350,8 +349,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
350 * SRCU because we have no references to any objects and do not 349 * SRCU because we have no references to any objects and do not
351 * need SRCU to keep them "alive". 350 * need SRCU to keep them "alive".
352 */ 351 */
353 if (!to_tell->i_fsnotify_marks && 352 if (!to_tell->i_fsnotify_marks && !sb->s_fsnotify_marks &&
354 (!mnt || (!mnt->mnt_fsnotify_marks && !sb->s_fsnotify_marks))) 353 (!mnt || !mnt->mnt_fsnotify_marks))
355 return 0; 354 return 0;
356 /* 355 /*
357 * if this is a modify event we may need to clear the ignored masks 356 * if this is a modify event we may need to clear the ignored masks
@@ -366,11 +365,11 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
366 365
367 iter_info.marks[FSNOTIFY_OBJ_TYPE_INODE] = 366 iter_info.marks[FSNOTIFY_OBJ_TYPE_INODE] =
368 fsnotify_first_mark(&to_tell->i_fsnotify_marks); 367 fsnotify_first_mark(&to_tell->i_fsnotify_marks);
368 iter_info.marks[FSNOTIFY_OBJ_TYPE_SB] =
369 fsnotify_first_mark(&sb->s_fsnotify_marks);
369 if (mnt) { 370 if (mnt) {
370 iter_info.marks[FSNOTIFY_OBJ_TYPE_VFSMOUNT] = 371 iter_info.marks[FSNOTIFY_OBJ_TYPE_VFSMOUNT] =
371 fsnotify_first_mark(&mnt->mnt_fsnotify_marks); 372 fsnotify_first_mark(&mnt->mnt_fsnotify_marks);
372 iter_info.marks[FSNOTIFY_OBJ_TYPE_SB] =
373 fsnotify_first_mark(&sb->s_fsnotify_marks);
374 } 373 }
375 374
376 /* 375 /*
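
With this hunk, fsnotify() derives the superblock from to_tell->i_sb unconditionally and always seeds mnt_or_sb_mask and the mark iterator with the sb marks, so superblock watches fire even for events that carry no path (and therefore no mount). The userspace-visible feature backed by sb marks is fanotify's filesystem-wide mark; a hedged sketch of requesting one (real API, illustrative target path; needs CAP_SYS_ADMIN and a kernel with FAN_MARK_FILESYSTEM):

#include <fcntl.h>
#include <stdio.h>
#include <sys/fanotify.h>

int main(void)
{
        int fd = fanotify_init(FAN_CLASS_NOTIF, O_RDONLY);

        if (fd < 0) {
                perror("fanotify_init");
                return 1;
        }
        /* One superblock mark: watch every object on the filesystem
         * containing /tmp, exercising the s_fsnotify_marks path above. */
        if (fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_FILESYSTEM,
                          FAN_MODIFY, AT_FDCWD, "/tmp") < 0) {
                perror("fanotify_mark");
                return 1;
        }
        return 0;
}
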
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index 7e4578d35b61..74ae60305189 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -5,6 +5,7 @@
5 5
6struct inotify_event_info { 6struct inotify_event_info {
7 struct fsnotify_event fse; 7 struct fsnotify_event fse;
8 u32 mask;
8 int wd; 9 int wd;
9 u32 sync_cookie; 10 u32 sync_cookie;
10 int name_len; 11 int name_len;
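
The new mask field makes the event mask backend-private; inotify recovers its wrapper from the embedded fsnotify_event with container_of() (the INOTIFY_E() helper) and reads the mask there. A self-contained illustration of that embedding pattern, with stand-in struct layouts rather than the kernel's:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct fsnotify_event { int placeholder; };     /* generic part, no mask */

struct inotify_event_info {
        struct fsnotify_event fse;              /* embedded generic event */
        unsigned int mask;                      /* backend-private mask */
        int wd;
};

/* What INOTIFY_E() boils down to: recover the wrapper from the
 * embedded member, then read the private fields. */
static unsigned int event_mask(struct fsnotify_event *fsn)
{
        return container_of(fsn, struct inotify_event_info, fse)->mask;
}

int main(void)
{
        struct inotify_event_info ev = { .mask = 0x2, .wd = 1 };

        printf("mask=%#x\n", event_mask(&ev.fse));
        return 0;
}
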
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index f4184b4f3815..ff30abd6a49b 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -43,11 +43,11 @@ static bool event_compare(struct fsnotify_event *old_fsn,
43{ 43{
44 struct inotify_event_info *old, *new; 44 struct inotify_event_info *old, *new;
45 45
46 if (old_fsn->mask & FS_IN_IGNORED)
47 return false;
48 old = INOTIFY_E(old_fsn); 46 old = INOTIFY_E(old_fsn);
49 new = INOTIFY_E(new_fsn); 47 new = INOTIFY_E(new_fsn);
50 if ((old_fsn->mask == new_fsn->mask) && 48 if (old->mask & FS_IN_IGNORED)
49 return false;
50 if ((old->mask == new->mask) &&
51 (old_fsn->inode == new_fsn->inode) && 51 (old_fsn->inode == new_fsn->inode) &&
52 (old->name_len == new->name_len) && 52 (old->name_len == new->name_len) &&
53 (!old->name_len || !strcmp(old->name, new->name))) 53 (!old->name_len || !strcmp(old->name, new->name)))
@@ -113,8 +113,18 @@ int inotify_handle_event(struct fsnotify_group *group,
113 return -ENOMEM; 113 return -ENOMEM;
114 } 114 }
115 115
116 /*
117 * We now report FS_ISDIR flag with MOVE_SELF and DELETE_SELF events
118 * for fanotify. inotify never reported IN_ISDIR with those events.
119 * It looks like an oversight, but to avoid the risk of breaking
120 * existing inotify programs, mask the flag out from those events.
121 */
122 if (mask & (IN_MOVE_SELF | IN_DELETE_SELF))
123 mask &= ~IN_ISDIR;
124
116 fsn_event = &event->fse; 125 fsn_event = &event->fse;
117 fsnotify_init_event(fsn_event, inode, mask); 126 fsnotify_init_event(fsn_event, inode);
127 event->mask = mask;
118 event->wd = i_mark->wd; 128 event->wd = i_mark->wd;
119 event->sync_cookie = cookie; 129 event->sync_cookie = cookie;
120 event->name_len = len; 130 event->name_len = len;
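
The IN_ISDIR masking above is a pure transformation of the event mask and can be restated as a small function. A sketch using the userspace <sys/inotify.h> names (the kernel code operates on the matching FS_* constants):

#include <stdio.h>
#include <sys/inotify.h>

/* Strip the directory flag from self events, mirroring the hunk above. */
static unsigned int compat_mask(unsigned int mask)
{
        if (mask & (IN_MOVE_SELF | IN_DELETE_SELF))
                mask &= ~IN_ISDIR;
        return mask;
}

int main(void)
{
        /* Prints the same value with and without IN_ISDIR. */
        printf("%#x %#x\n", compat_mask(IN_DELETE_SELF | IN_ISDIR),
               compat_mask(IN_DELETE_SELF));
        return 0;
}
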
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 798f1253141a..e2901fbb9f76 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -189,7 +189,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
189 */ 189 */
190 pad_name_len = round_event_name_len(fsn_event); 190 pad_name_len = round_event_name_len(fsn_event);
191 inotify_event.len = pad_name_len; 191 inotify_event.len = pad_name_len;
192 inotify_event.mask = inotify_mask_to_arg(fsn_event->mask); 192 inotify_event.mask = inotify_mask_to_arg(event->mask);
193 inotify_event.wd = event->wd; 193 inotify_event.wd = event->wd;
194 inotify_event.cookie = event->sync_cookie; 194 inotify_event.cookie = event->sync_cookie;
195 195
@@ -634,7 +634,8 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
634 return ERR_PTR(-ENOMEM); 634 return ERR_PTR(-ENOMEM);
635 } 635 }
636 group->overflow_event = &oevent->fse; 636 group->overflow_event = &oevent->fse;
637 fsnotify_init_event(group->overflow_event, NULL, FS_Q_OVERFLOW); 637 fsnotify_init_event(group->overflow_event, NULL);
638 oevent->mask = FS_Q_OVERFLOW;
638 oevent->wd = -1; 639 oevent->wd = -1;
639 oevent->sync_cookie = 0; 640 oevent->sync_cookie = 0;
640 oevent->name_len = 0; 641 oevent->name_len = 0;
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index d2dd16cb5989..d593d4269561 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -82,6 +82,7 @@
82#include <linux/slab.h> 82#include <linux/slab.h>
83#include <linux/spinlock.h> 83#include <linux/spinlock.h>
84#include <linux/srcu.h> 84#include <linux/srcu.h>
85#include <linux/ratelimit.h>
85 86
86#include <linux/atomic.h> 87#include <linux/atomic.h>
87 88
@@ -481,7 +482,8 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
481} 482}
482 483
483static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, 484static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
484 unsigned int type) 485 unsigned int type,
486 __kernel_fsid_t *fsid)
485{ 487{
486 struct inode *inode = NULL; 488 struct inode *inode = NULL;
487 struct fsnotify_mark_connector *conn; 489 struct fsnotify_mark_connector *conn;
@@ -493,6 +495,11 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
493 INIT_HLIST_HEAD(&conn->list); 495 INIT_HLIST_HEAD(&conn->list);
494 conn->type = type; 496 conn->type = type;
495 conn->obj = connp; 497 conn->obj = connp;
498 /* Cache fsid of filesystem containing the object */
499 if (fsid)
500 conn->fsid = *fsid;
501 else
502 conn->fsid.val[0] = conn->fsid.val[1] = 0;
496 if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) 503 if (conn->type == FSNOTIFY_OBJ_TYPE_INODE)
497 inode = igrab(fsnotify_conn_inode(conn)); 504 inode = igrab(fsnotify_conn_inode(conn));
498 /* 505 /*
@@ -544,7 +551,7 @@ out:
544 */ 551 */
545static int fsnotify_add_mark_list(struct fsnotify_mark *mark, 552static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
546 fsnotify_connp_t *connp, unsigned int type, 553 fsnotify_connp_t *connp, unsigned int type,
547 int allow_dups) 554 int allow_dups, __kernel_fsid_t *fsid)
548{ 555{
549 struct fsnotify_mark *lmark, *last = NULL; 556 struct fsnotify_mark *lmark, *last = NULL;
550 struct fsnotify_mark_connector *conn; 557 struct fsnotify_mark_connector *conn;
@@ -553,15 +560,36 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
553 560
554 if (WARN_ON(!fsnotify_valid_obj_type(type))) 561 if (WARN_ON(!fsnotify_valid_obj_type(type)))
555 return -EINVAL; 562 return -EINVAL;
563
564 /* Backend is expected to check for zero fsid (e.g. tmpfs) */
565 if (fsid && WARN_ON_ONCE(!fsid->val[0] && !fsid->val[1]))
566 return -ENODEV;
567
556restart: 568restart:
557 spin_lock(&mark->lock); 569 spin_lock(&mark->lock);
558 conn = fsnotify_grab_connector(connp); 570 conn = fsnotify_grab_connector(connp);
559 if (!conn) { 571 if (!conn) {
560 spin_unlock(&mark->lock); 572 spin_unlock(&mark->lock);
561 err = fsnotify_attach_connector_to_object(connp, type); 573 err = fsnotify_attach_connector_to_object(connp, type, fsid);
562 if (err) 574 if (err)
563 return err; 575 return err;
564 goto restart; 576 goto restart;
577 } else if (fsid && (conn->fsid.val[0] || conn->fsid.val[1]) &&
578 (fsid->val[0] != conn->fsid.val[0] ||
579 fsid->val[1] != conn->fsid.val[1])) {
580 /*
581 * Backend is expected to check for non-uniform fsid
582 * (e.g. btrfs), but maybe we missed something?
583 * Only allow setting conn->fsid once to non zero fsid.
584 * inotify and non-fid fanotify groups do not set nor test
585 * conn->fsid.
586 */
587 pr_warn_ratelimited("%s: fsid mismatch on object of type %u: "
588 "%x.%x != %x.%x\n", __func__, conn->type,
589 fsid->val[0], fsid->val[1],
590 conn->fsid.val[0], conn->fsid.val[1]);
591 err = -EXDEV;
592 goto out_err;
565 } 593 }
566 594
567 /* is mark the first mark? */ 595 /* is mark the first mark? */
@@ -606,7 +634,7 @@ out_err:
606 */ 634 */
607int fsnotify_add_mark_locked(struct fsnotify_mark *mark, 635int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
608 fsnotify_connp_t *connp, unsigned int type, 636 fsnotify_connp_t *connp, unsigned int type,
609 int allow_dups) 637 int allow_dups, __kernel_fsid_t *fsid)
610{ 638{
611 struct fsnotify_group *group = mark->group; 639 struct fsnotify_group *group = mark->group;
612 int ret = 0; 640 int ret = 0;
@@ -627,7 +655,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
627 fsnotify_get_mark(mark); /* for g_list */ 655 fsnotify_get_mark(mark); /* for g_list */
628 spin_unlock(&mark->lock); 656 spin_unlock(&mark->lock);
629 657
630 ret = fsnotify_add_mark_list(mark, connp, type, allow_dups); 658 ret = fsnotify_add_mark_list(mark, connp, type, allow_dups, fsid);
631 if (ret) 659 if (ret)
632 goto err; 660 goto err;
633 661
@@ -648,13 +676,13 @@ err:
648} 676}
649 677
650int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp, 678int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp,
651 unsigned int type, int allow_dups) 679 unsigned int type, int allow_dups, __kernel_fsid_t *fsid)
652{ 680{
653 int ret; 681 int ret;
654 struct fsnotify_group *group = mark->group; 682 struct fsnotify_group *group = mark->group;
655 683
656 mutex_lock(&group->mark_mutex); 684 mutex_lock(&group->mark_mutex);
657 ret = fsnotify_add_mark_locked(mark, connp, type, allow_dups); 685 ret = fsnotify_add_mark_locked(mark, connp, type, allow_dups, fsid);
658 mutex_unlock(&group->mark_mutex); 686 mutex_unlock(&group->mark_mutex);
659 return ret; 687 return ret;
660} 688}
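
The fsid argument threaded through these helpers is cached in the mark connector on first use, and a later attempt to attach a mark with a different fsid is rejected, which is how an object on a filesystem with non-uniform fsids (btrfs subvolumes, for example) surfaces as -EXDEV. A standalone restatement of that first-writer-wins rule, with simplified stand-in types:

#include <errno.h>
#include <stdbool.h>

typedef struct { int val[2]; } fsid_t_;  /* stand-in for __kernel_fsid_t */

static bool fsid_zero(const fsid_t_ *f)
{
        return !f->val[0] && !f->val[1];
}

/* First non-zero fsid pins the connector; later mismatches are -EXDEV. */
static int conn_set_fsid(fsid_t_ *conn_fsid, const fsid_t_ *fsid)
{
        if (fsid_zero(conn_fsid)) {
                *conn_fsid = *fsid;
                return 0;
        }
        if (conn_fsid->val[0] != fsid->val[0] ||
            conn_fsid->val[1] != fsid->val[1])
                return -EXDEV;
        return 0;
}

int main(void)
{
        fsid_t_ conn = { { 0, 0 } };
        fsid_t_ a = { { 0x1234, 1 } }, b = { { 0x5678, 2 } };

        conn_set_fsid(&conn, &a);
        return conn_set_fsid(&conn, &b) == -EXDEV ? 0 : 1;
}
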
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 3c3e36745f59..5f3a54d444b5 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -71,7 +71,7 @@ void fsnotify_destroy_event(struct fsnotify_group *group,
71 struct fsnotify_event *event) 71 struct fsnotify_event *event)
72{ 72{
73 /* Overflow events are per-group and we don't want to free them */ 73 /* Overflow events are per-group and we don't want to free them */
74 if (!event || event->mask == FS_Q_OVERFLOW) 74 if (!event || event == group->overflow_event)
75 return; 75 return;
76 /* 76 /*
77 * If the event is still queued, we have a problem... Do an unreliable 77 * If the event is still queued, we have a problem... Do an unreliable
@@ -141,6 +141,18 @@ queue:
141 return ret; 141 return ret;
142} 142}
143 143
144void fsnotify_remove_queued_event(struct fsnotify_group *group,
145 struct fsnotify_event *event)
146{
147 assert_spin_locked(&group->notification_lock);
148 /*
149 * We need to init the list head for the overflow event case so that
150 * the check in fsnotify_add_event() works
151 */
152 list_del_init(&event->list);
153 group->q_len--;
154}
155
144/* 156/*
145 * Remove and return the first event from the notification list. It is the 157 * Remove and return the first event from the notification list. It is the
146 * responsibility of the caller to destroy the obtained event 158 * responsibility of the caller to destroy the obtained event
@@ -155,13 +167,7 @@ struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group)
155 167
156 event = list_first_entry(&group->notification_list, 168 event = list_first_entry(&group->notification_list,
157 struct fsnotify_event, list); 169 struct fsnotify_event, list);
158 /* 170 fsnotify_remove_queued_event(group, event);
159 * We need to init the list head for the overflow event case so that
160 * the check in fsnotify_add_event() works
161 */
162 list_del_init(&event->list);
163 group->q_len--;
164
165 return event; 171 return event;
166} 172}
167 173
@@ -194,23 +200,3 @@ void fsnotify_flush_notify(struct fsnotify_group *group)
194 } 200 }
195 spin_unlock(&group->notification_lock); 201 spin_unlock(&group->notification_lock);
196} 202}
197
198/*
199 * fsnotify_create_event - Allocate a new event which will be sent to each
200 * group's handle_event function if the group was interested in this
201 * particular event.
202 *
203 * @inode the inode which is supposed to receive the event (sometimes a
204 * parent of the inode to which the event happened).
205 * @mask what actually happened.
206 * @data pointer to the object which was actually affected
207 * @data_type flag indicating whether the data is a file, path, inode, nothing...
208 * @name the filename, if available
209 */
210void fsnotify_init_event(struct fsnotify_event *event, struct inode *inode,
211 u32 mask)
212{
213 INIT_LIST_HEAD(&event->list);
214 event->inode = inode;
215 event->mask = mask;
216}
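
The new fsnotify_remove_queued_event() keeps the list_del_init() from the code it replaces, and the choice matters: re-initialising the entry leaves list_empty() true afterwards, which is the check that lets the shared overflow event be queued again later. A minimal list implementation mirroring those semantics:

#include <stdbool.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static bool list_empty(const struct list_head *h) { return h->next == h; }

static void list_add_tail(struct list_head *n, struct list_head *h)
{
        n->prev = h->prev;
        n->next = h;
        h->prev->next = n;
        h->prev = n;
}

/* Unlink and re-init, so list_empty(entry) is true afterwards. */
static void list_del_init(struct list_head *e)
{
        e->prev->next = e->next;
        e->next->prev = e->prev;
        INIT_LIST_HEAD(e);
}

int main(void)
{
        struct list_head q, ev;

        INIT_LIST_HEAD(&q);
        INIT_LIST_HEAD(&ev);
        list_add_tail(&ev, &q);
        list_del_init(&ev);
        printf("requeueable: %d\n", list_empty(&ev));    /* prints 1 */
        return 0;
}
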
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 511b279ec69c..5ab1849971b4 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -140,9 +140,13 @@ struct pid_entry {
140#define REG(NAME, MODE, fops) \ 140#define REG(NAME, MODE, fops) \
141 NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {}) 141 NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
142#define ONE(NAME, MODE, show) \ 142#define ONE(NAME, MODE, show) \
143 NOD(NAME, (S_IFREG|(MODE)), \ 143 NOD(NAME, (S_IFREG|(MODE)), \
144 NULL, &proc_single_file_operations, \ 144 NULL, &proc_single_file_operations, \
145 { .proc_show = show } ) 145 { .proc_show = show } )
146#define ATTR(LSM, NAME, MODE) \
147 NOD(NAME, (S_IFREG|(MODE)), \
148 NULL, &proc_pid_attr_operations, \
149 { .lsm = LSM })
146 150
147/* 151/*
148 * Count the number of hardlinks for the pid_entry table, excluding the . 152 * Count the number of hardlinks for the pid_entry table, excluding the .
@@ -1206,7 +1210,7 @@ static const struct file_operations proc_oom_score_adj_operations = {
1206 .llseek = default_llseek, 1210 .llseek = default_llseek,
1207}; 1211};
1208 1212
1209#ifdef CONFIG_AUDITSYSCALL 1213#ifdef CONFIG_AUDIT
1210#define TMPBUFLEN 11 1214#define TMPBUFLEN 11
1211static ssize_t proc_loginuid_read(struct file * file, char __user * buf, 1215static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
1212 size_t count, loff_t *ppos) 1216 size_t count, loff_t *ppos)
@@ -2521,7 +2525,7 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
2521 if (!task) 2525 if (!task)
2522 return -ESRCH; 2526 return -ESRCH;
2523 2527
2524 length = security_getprocattr(task, 2528 length = security_getprocattr(task, PROC_I(inode)->op.lsm,
2525 (char*)file->f_path.dentry->d_name.name, 2529 (char*)file->f_path.dentry->d_name.name,
2526 &p); 2530 &p);
2527 put_task_struct(task); 2531 put_task_struct(task);
@@ -2570,7 +2574,9 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
2570 if (rv < 0) 2574 if (rv < 0)
2571 goto out_free; 2575 goto out_free;
2572 2576
2573 rv = security_setprocattr(file->f_path.dentry->d_name.name, page, count); 2577 rv = security_setprocattr(PROC_I(inode)->op.lsm,
2578 file->f_path.dentry->d_name.name, page,
2579 count);
2574 mutex_unlock(&current->signal->cred_guard_mutex); 2580 mutex_unlock(&current->signal->cred_guard_mutex);
2575out_free: 2581out_free:
2576 kfree(page); 2582 kfree(page);
@@ -2584,13 +2590,53 @@ static const struct file_operations proc_pid_attr_operations = {
2584 .llseek = generic_file_llseek, 2590 .llseek = generic_file_llseek,
2585}; 2591};
2586 2592
2593#define LSM_DIR_OPS(LSM) \
2594static int proc_##LSM##_attr_dir_iterate(struct file *filp, \
2595 struct dir_context *ctx) \
2596{ \
2597 return proc_pident_readdir(filp, ctx, \
2598 LSM##_attr_dir_stuff, \
2599 ARRAY_SIZE(LSM##_attr_dir_stuff)); \
2600} \
2601\
2602static const struct file_operations proc_##LSM##_attr_dir_ops = { \
2603 .read = generic_read_dir, \
2604 .iterate = proc_##LSM##_attr_dir_iterate, \
2605 .llseek = default_llseek, \
2606}; \
2607\
2608static struct dentry *proc_##LSM##_attr_dir_lookup(struct inode *dir, \
2609 struct dentry *dentry, unsigned int flags) \
2610{ \
2611 return proc_pident_lookup(dir, dentry, \
2612 LSM##_attr_dir_stuff, \
2613 ARRAY_SIZE(LSM##_attr_dir_stuff)); \
2614} \
2615\
2616static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \
2617 .lookup = proc_##LSM##_attr_dir_lookup, \
2618 .getattr = pid_getattr, \
2619 .setattr = proc_setattr, \
2620}
2621
2622#ifdef CONFIG_SECURITY_SMACK
2623static const struct pid_entry smack_attr_dir_stuff[] = {
2624 ATTR("smack", "current", 0666),
2625};
2626LSM_DIR_OPS(smack);
2627#endif
2628
2587static const struct pid_entry attr_dir_stuff[] = { 2629static const struct pid_entry attr_dir_stuff[] = {
2588 REG("current", S_IRUGO|S_IWUGO, proc_pid_attr_operations), 2630 ATTR(NULL, "current", 0666),
2589 REG("prev", S_IRUGO, proc_pid_attr_operations), 2631 ATTR(NULL, "prev", 0444),
2590 REG("exec", S_IRUGO|S_IWUGO, proc_pid_attr_operations), 2632 ATTR(NULL, "exec", 0666),
2591 REG("fscreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), 2633 ATTR(NULL, "fscreate", 0666),
2592 REG("keycreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), 2634 ATTR(NULL, "keycreate", 0666),
2593 REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), 2635 ATTR(NULL, "sockcreate", 0666),
2636#ifdef CONFIG_SECURITY_SMACK
2637 DIR("smack", 0555,
2638 proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops),
2639#endif
2594}; 2640};
2595 2641
2596static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx) 2642static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
@@ -2998,7 +3044,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2998 ONE("oom_score", S_IRUGO, proc_oom_score), 3044 ONE("oom_score", S_IRUGO, proc_oom_score),
2999 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), 3045 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
3000 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), 3046 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
3001#ifdef CONFIG_AUDITSYSCALL 3047#ifdef CONFIG_AUDIT
3002 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), 3048 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
3003 REG("sessionid", S_IRUGO, proc_sessionid_operations), 3049 REG("sessionid", S_IRUGO, proc_sessionid_operations),
3004#endif 3050#endif
@@ -3386,7 +3432,7 @@ static const struct pid_entry tid_base_stuff[] = {
3386 ONE("oom_score", S_IRUGO, proc_oom_score), 3432 ONE("oom_score", S_IRUGO, proc_oom_score),
3387 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), 3433 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
3388 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), 3434 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
3389#ifdef CONFIG_AUDITSYSCALL 3435#ifdef CONFIG_AUDIT
3390 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), 3436 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
3391 REG("sessionid", S_IRUGO, proc_sessionid_operations), 3437 REG("sessionid", S_IRUGO, proc_sessionid_operations),
3392#endif 3438#endif
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 4fc5a9b68f76..ea575375f210 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -82,6 +82,7 @@ union proc_op {
82 int (*proc_show)(struct seq_file *m, 82 int (*proc_show)(struct seq_file *m,
83 struct pid_namespace *ns, struct pid *pid, 83 struct pid_namespace *ns, struct pid *pid,
84 struct task_struct *task); 84 struct task_struct *task);
85 const char *lsm;
85}; 86};
86 87
87struct proc_inode { 88struct proc_inode {
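
LSM_DIR_OPS() stamps out an identically-shaped iterate/lookup handler pair and the matching ops tables for each security module by token-pasting the LSM name, so adding smack's attr directory is a one-line invocation plus a pid_entry table. A compilable toy version of the pattern, with stand-in structures in place of the procfs ones:

#include <stdio.h>

struct file_ops { int (*iterate)(void); };

#define LSM_DIR_OPS(LSM)                                        \
static int LSM##_iterate(void)                                  \
{                                                               \
        printf("iterating " #LSM " attr dir\n");                \
        return 0;                                               \
}                                                               \
static const struct file_ops LSM##_dir_ops = {                  \
        .iterate = LSM##_iterate,                               \
}

LSM_DIR_OPS(smack);

int main(void)
{
        return smack_dir_ops.iterate();
}
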
diff --git a/fs/statfs.c b/fs/statfs.c
index f0216629621d..eea7af6f2f22 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -67,6 +67,20 @@ static int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
67 return retval; 67 return retval;
68} 68}
69 69
70int vfs_get_fsid(struct dentry *dentry, __kernel_fsid_t *fsid)
71{
72 struct kstatfs st;
73 int error;
74
75 error = statfs_by_dentry(dentry, &st);
76 if (error)
77 return error;
78
79 *fsid = st.f_fsid;
80 return 0;
81}
82EXPORT_SYMBOL(vfs_get_fsid);
83
70int vfs_statfs(const struct path *path, struct kstatfs *buf) 84int vfs_statfs(const struct path *path, struct kstatfs *buf)
71{ 85{
72 int error; 86 int error;
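
vfs_get_fsid() simply surfaces the f_fsid that the filesystem's statfs method reports; userspace sees the same identifier through statfs(2), which is what fid-aware fanotify consumers can correlate events against. A small sketch (the __val member name is glibc's):

#include <stdio.h>
#include <sys/statfs.h>

int main(int argc, char **argv)
{
        struct statfs st;

        if (argc < 2 || statfs(argv[1], &st) < 0) {
                perror("statfs");
                return 1;
        }
        printf("fsid: %x.%x\n", (unsigned)st.f_fsid.__val[0],
               (unsigned)st.f_fsid.__val[1]);
        return 0;
}
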
diff --git a/fs/udf/super.c b/fs/udf/super.c
index e3d684ea3203..ffd8038ff728 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1474,6 +1474,17 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
1474 if (lvd->integritySeqExt.extLength) 1474 if (lvd->integritySeqExt.extLength)
1475 udf_load_logicalvolint(sb, leea_to_cpu(lvd->integritySeqExt)); 1475 udf_load_logicalvolint(sb, leea_to_cpu(lvd->integritySeqExt));
1476 ret = 0; 1476 ret = 0;
1477
1478 if (!sbi->s_lvid_bh) {
1479 /* We can't generate unique IDs without a valid LVID */
1480 if (sb_rdonly(sb)) {
1481 UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT);
1482 } else {
1483 udf_warn(sb, "Damaged or missing LVID, forcing "
1484 "readonly mount\n");
1485 ret = -EACCES;
1486 }
1487 }
1477out_bh: 1488out_bh:
1478 brelse(bh); 1489 brelse(bh);
1479 return ret; 1490 return ret;
@@ -1943,13 +1954,24 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
1943 return 0; 1954 return 0;
1944} 1955}
1945 1956
1957static void udf_finalize_lvid(struct logicalVolIntegrityDesc *lvid)
1958{
1959 struct timespec64 ts;
1960
1961 ktime_get_real_ts64(&ts);
1962 udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts);
1963 lvid->descTag.descCRC = cpu_to_le16(
1964 crc_itu_t(0, (char *)lvid + sizeof(struct tag),
1965 le16_to_cpu(lvid->descTag.descCRCLength)));
1966 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
1967}
1968
1946static void udf_open_lvid(struct super_block *sb) 1969static void udf_open_lvid(struct super_block *sb)
1947{ 1970{
1948 struct udf_sb_info *sbi = UDF_SB(sb); 1971 struct udf_sb_info *sbi = UDF_SB(sb);
1949 struct buffer_head *bh = sbi->s_lvid_bh; 1972 struct buffer_head *bh = sbi->s_lvid_bh;
1950 struct logicalVolIntegrityDesc *lvid; 1973 struct logicalVolIntegrityDesc *lvid;
1951 struct logicalVolIntegrityDescImpUse *lvidiu; 1974 struct logicalVolIntegrityDescImpUse *lvidiu;
1952 struct timespec64 ts;
1953 1975
1954 if (!bh) 1976 if (!bh)
1955 return; 1977 return;
@@ -1961,18 +1983,12 @@ static void udf_open_lvid(struct super_block *sb)
1961 mutex_lock(&sbi->s_alloc_mutex); 1983 mutex_lock(&sbi->s_alloc_mutex);
1962 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; 1984 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
1963 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; 1985 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
1964 ktime_get_real_ts64(&ts);
1965 udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts);
1966 if (le32_to_cpu(lvid->integrityType) == LVID_INTEGRITY_TYPE_CLOSE) 1986 if (le32_to_cpu(lvid->integrityType) == LVID_INTEGRITY_TYPE_CLOSE)
1967 lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN); 1987 lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN);
1968 else 1988 else
1969 UDF_SET_FLAG(sb, UDF_FLAG_INCONSISTENT); 1989 UDF_SET_FLAG(sb, UDF_FLAG_INCONSISTENT);
1970 1990
1971 lvid->descTag.descCRC = cpu_to_le16( 1991 udf_finalize_lvid(lvid);
1972 crc_itu_t(0, (char *)lvid + sizeof(struct tag),
1973 le16_to_cpu(lvid->descTag.descCRCLength)));
1974
1975 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
1976 mark_buffer_dirty(bh); 1992 mark_buffer_dirty(bh);
1977 sbi->s_lvid_dirty = 0; 1993 sbi->s_lvid_dirty = 0;
1978 mutex_unlock(&sbi->s_alloc_mutex); 1994 mutex_unlock(&sbi->s_alloc_mutex);
@@ -1986,7 +2002,6 @@ static void udf_close_lvid(struct super_block *sb)
1986 struct buffer_head *bh = sbi->s_lvid_bh; 2002 struct buffer_head *bh = sbi->s_lvid_bh;
1987 struct logicalVolIntegrityDesc *lvid; 2003 struct logicalVolIntegrityDesc *lvid;
1988 struct logicalVolIntegrityDescImpUse *lvidiu; 2004 struct logicalVolIntegrityDescImpUse *lvidiu;
1989 struct timespec64 ts;
1990 2005
1991 if (!bh) 2006 if (!bh)
1992 return; 2007 return;
@@ -1998,8 +2013,6 @@ static void udf_close_lvid(struct super_block *sb)
1998 mutex_lock(&sbi->s_alloc_mutex); 2013 mutex_lock(&sbi->s_alloc_mutex);
1999 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; 2014 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
2000 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; 2015 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
2001 ktime_get_real_ts64(&ts);
2002 udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts);
2003 if (UDF_MAX_WRITE_VERSION > le16_to_cpu(lvidiu->maxUDFWriteRev)) 2016 if (UDF_MAX_WRITE_VERSION > le16_to_cpu(lvidiu->maxUDFWriteRev))
2004 lvidiu->maxUDFWriteRev = cpu_to_le16(UDF_MAX_WRITE_VERSION); 2017 lvidiu->maxUDFWriteRev = cpu_to_le16(UDF_MAX_WRITE_VERSION);
2005 if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFReadRev)) 2018 if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFReadRev))
@@ -2009,17 +2022,13 @@ static void udf_close_lvid(struct super_block *sb)
2009 if (!UDF_QUERY_FLAG(sb, UDF_FLAG_INCONSISTENT)) 2022 if (!UDF_QUERY_FLAG(sb, UDF_FLAG_INCONSISTENT))
2010 lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE); 2023 lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE);
2011 2024
2012 lvid->descTag.descCRC = cpu_to_le16(
2013 crc_itu_t(0, (char *)lvid + sizeof(struct tag),
2014 le16_to_cpu(lvid->descTag.descCRCLength)));
2015
2016 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
2017 /* 2025 /*
2018 * We set buffer uptodate unconditionally here to avoid spurious 2026 * We set buffer uptodate unconditionally here to avoid spurious
2019 * warnings from mark_buffer_dirty() when previous EIO has marked 2027 * warnings from mark_buffer_dirty() when previous EIO has marked
2020 * the buffer as !uptodate 2028 * the buffer as !uptodate
2021 */ 2029 */
2022 set_buffer_uptodate(bh); 2030 set_buffer_uptodate(bh);
2031 udf_finalize_lvid(lvid);
2023 mark_buffer_dirty(bh); 2032 mark_buffer_dirty(bh);
2024 sbi->s_lvid_dirty = 0; 2033 sbi->s_lvid_dirty = 0;
2025 mutex_unlock(&sbi->s_alloc_mutex); 2034 mutex_unlock(&sbi->s_alloc_mutex);
@@ -2048,8 +2057,8 @@ u64 lvid_get_unique_id(struct super_block *sb)
2048 if (!(++uniqueID & 0xFFFFFFFF)) 2057 if (!(++uniqueID & 0xFFFFFFFF))
2049 uniqueID += 16; 2058 uniqueID += 16;
2050 lvhd->uniqueID = cpu_to_le64(uniqueID); 2059 lvhd->uniqueID = cpu_to_le64(uniqueID);
2060 udf_updated_lvid(sb);
2051 mutex_unlock(&sbi->s_alloc_mutex); 2061 mutex_unlock(&sbi->s_alloc_mutex);
2052 mark_buffer_dirty(bh);
2053 2062
2054 return ret; 2063 return ret;
2055} 2064}
@@ -2320,11 +2329,17 @@ static int udf_sync_fs(struct super_block *sb, int wait)
2320 2329
2321 mutex_lock(&sbi->s_alloc_mutex); 2330 mutex_lock(&sbi->s_alloc_mutex);
2322 if (sbi->s_lvid_dirty) { 2331 if (sbi->s_lvid_dirty) {
2332 struct buffer_head *bh = sbi->s_lvid_bh;
2333 struct logicalVolIntegrityDesc *lvid;
2334
2335 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
2336 udf_finalize_lvid(lvid);
2337
2323 /* 2338 /*
2324 * Blockdevice will be synced later so we don't have to submit 2339 * Blockdevice will be synced later so we don't have to submit
2325 * the buffer for IO 2340 * the buffer for IO
2326 */ 2341 */
2327 mark_buffer_dirty(sbi->s_lvid_bh); 2342 mark_buffer_dirty(bh);
2328 sbi->s_lvid_dirty = 0; 2343 sbi->s_lvid_dirty = 0;
2329 } 2344 }
2330 mutex_unlock(&sbi->s_alloc_mutex); 2345 mutex_unlock(&sbi->s_alloc_mutex);
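
udf_finalize_lvid() consolidates the timestamp, descriptor CRC and tag checksum updates that open, close and now sync previously duplicated, so a dirty LVID is always finalized the same way right before writeback. The CRC in question is CRC-ITU-T (polynomial 0x1021, initial value 0 as passed by the callers); a bit-at-a-time sketch that reproduces the standard check value of the kernel's table-driven crc_itu_t():

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint16_t crc_itu_t(uint16_t crc, const uint8_t *buf, size_t len)
{
        while (len--) {
                crc ^= (uint16_t)(*buf++) << 8;
                for (int i = 0; i < 8; i++)
                        crc = (crc & 0x8000) ? (crc << 1) ^ 0x1021
                                             : (crc << 1);
        }
        return crc;
}

int main(void)
{
        const char *s = "123456789";

        /* CRC-ITU-T ("XModem") of "123456789" is 0x31c3. */
        printf("%#06x\n", crc_itu_t(0, (const uint8_t *)s, strlen(s)));
        return 0;
}
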
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 999ad8d00d43..1ef8acf35e7d 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -339,14 +339,14 @@ xfs_ag_init_headers(
339 { /* BNO root block */ 339 { /* BNO root block */
340 .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_BNO_BLOCK(mp)), 340 .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_BNO_BLOCK(mp)),
341 .numblks = BTOBB(mp->m_sb.sb_blocksize), 341 .numblks = BTOBB(mp->m_sb.sb_blocksize),
342 .ops = &xfs_allocbt_buf_ops, 342 .ops = &xfs_bnobt_buf_ops,
343 .work = &xfs_bnoroot_init, 343 .work = &xfs_bnoroot_init,
344 .need_init = true 344 .need_init = true
345 }, 345 },
346 { /* CNT root block */ 346 { /* CNT root block */
347 .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)), 347 .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)),
348 .numblks = BTOBB(mp->m_sb.sb_blocksize), 348 .numblks = BTOBB(mp->m_sb.sb_blocksize),
349 .ops = &xfs_allocbt_buf_ops, 349 .ops = &xfs_cntbt_buf_ops,
350 .work = &xfs_cntroot_init, 350 .work = &xfs_cntroot_init,
351 .need_init = true 351 .need_init = true
352 }, 352 },
@@ -361,7 +361,7 @@ xfs_ag_init_headers(
361 { /* FINO root block */ 361 { /* FINO root block */
362 .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_FIBT_BLOCK(mp)), 362 .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_FIBT_BLOCK(mp)),
363 .numblks = BTOBB(mp->m_sb.sb_blocksize), 363 .numblks = BTOBB(mp->m_sb.sb_blocksize),
364 .ops = &xfs_inobt_buf_ops, 364 .ops = &xfs_finobt_buf_ops,
365 .work = &xfs_btroot_init, 365 .work = &xfs_btroot_init,
366 .type = XFS_BTNUM_FINO, 366 .type = XFS_BTNUM_FINO,
367 .need_init = xfs_sb_version_hasfinobt(&mp->m_sb) 367 .need_init = xfs_sb_version_hasfinobt(&mp->m_sb)
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index e701ebc36c06..e2ba2a3b63b2 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -281,7 +281,7 @@ xfs_ag_resv_init(
281 */ 281 */
282 ask = used = 0; 282 ask = used = 0;
283 283
284 mp->m_inotbt_nores = true; 284 mp->m_finobt_nores = true;
285 285
286 error = xfs_refcountbt_calc_reserves(mp, tp, agno, &ask, 286 error = xfs_refcountbt_calc_reserves(mp, tp, agno, &ask,
287 &used); 287 &used);
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index b715668886a4..bc3367b8b7bb 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -568,9 +568,9 @@ xfs_agfl_verify(
568 if (!xfs_sb_version_hascrc(&mp->m_sb)) 568 if (!xfs_sb_version_hascrc(&mp->m_sb))
569 return NULL; 569 return NULL;
570 570
571 if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid)) 571 if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
572 return __this_address; 572 return __this_address;
573 if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC) 573 if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
574 return __this_address; 574 return __this_address;
575 /* 575 /*
576 * during growfs operations, the perag is not fully initialised, 576 * during growfs operations, the perag is not fully initialised,
@@ -643,6 +643,7 @@ xfs_agfl_write_verify(
643 643
644const struct xfs_buf_ops xfs_agfl_buf_ops = { 644const struct xfs_buf_ops xfs_agfl_buf_ops = {
645 .name = "xfs_agfl", 645 .name = "xfs_agfl",
646 .magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
646 .verify_read = xfs_agfl_read_verify, 647 .verify_read = xfs_agfl_read_verify,
647 .verify_write = xfs_agfl_write_verify, 648 .verify_write = xfs_agfl_write_verify,
648 .verify_struct = xfs_agfl_verify, 649 .verify_struct = xfs_agfl_verify,
@@ -2587,8 +2588,10 @@ xfs_agf_verify(
2587 return __this_address; 2588 return __this_address;
2588 } 2589 }
2589 2590
2590 if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && 2591 if (!xfs_verify_magic(bp, agf->agf_magicnum))
2591 XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && 2592 return __this_address;
2593
2594 if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
2592 be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && 2595 be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
2593 be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) && 2596 be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) &&
2594 be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) && 2597 be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) &&
@@ -2670,6 +2673,7 @@ xfs_agf_write_verify(
2670 2673
2671const struct xfs_buf_ops xfs_agf_buf_ops = { 2674const struct xfs_buf_ops xfs_agf_buf_ops = {
2672 .name = "xfs_agf", 2675 .name = "xfs_agf",
2676 .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) },
2673 .verify_read = xfs_agf_read_verify, 2677 .verify_read = xfs_agf_read_verify,
2674 .verify_write = xfs_agf_write_verify, 2678 .verify_write = xfs_agf_write_verify,
2675 .verify_struct = xfs_agf_verify, 2679 .verify_struct = xfs_agf_verify,
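
The new .magic pair on each xfs_buf_ops moves the magic-number comparison into a common helper: slot 0 holds the pre-CRC (v4) magic, slot 1 the CRC-enabled (v5) magic, and xfs_verify_magic() picks a slot based on whether the filesystem has CRCs. That helper's body is not part of this hunk, so the following is a hedged sketch of the selection logic with simplified types:

#include <stdbool.h>
#include <stdint.h>

struct buf_ops {
        const char *name;
        uint32_t magic[2];      /* [0] = pre-CRC magic, [1] = CRC magic */
};

static bool verify_magic(const struct buf_ops *ops, bool has_crc,
                         uint32_t disk_magic)
{
        uint32_t want = ops->magic[has_crc];

        /* A zero slot means that variant does not exist on disk,
         * e.g. remote attr blocks have no pre-CRC format. */
        if (!want)
                return false;
        return disk_magic == want;
}

int main(void)
{
        const struct buf_ops agf_ops = {
                .name = "xfs_agf",
                .magic = { 0x58414746u, 0x58414746u },  /* "XAGF" */
        };

        return verify_magic(&agf_ops, true, 0x58414746u) ? 0 : 1;
}
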
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 4e59cc8a2802..9fe949f6055e 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -297,48 +297,34 @@ xfs_allocbt_verify(
297 struct xfs_perag *pag = bp->b_pag; 297 struct xfs_perag *pag = bp->b_pag;
298 xfs_failaddr_t fa; 298 xfs_failaddr_t fa;
299 unsigned int level; 299 unsigned int level;
300 xfs_btnum_t btnum = XFS_BTNUM_BNOi;
301
302 if (!xfs_verify_magic(bp, block->bb_magic))
303 return __this_address;
304
305 if (xfs_sb_version_hascrc(&mp->m_sb)) {
306 fa = xfs_btree_sblock_v5hdr_verify(bp);
307 if (fa)
308 return fa;
309 }
300 310
301 /* 311 /*
302 * magic number and level verification 312 * The perag may not be attached during grow operations or fully
303 * 313 * initialized from the AGF during log recovery. Therefore we can only
304 * During growfs operations, we can't verify the exact level or owner as 314 * check against maximum tree depth from those contexts.
305 * the perag is not fully initialised and hence not attached to the
306 * buffer. In this case, check against the maximum tree depth.
307 * 315 *
308 * Similarly, during log recovery we will have a perag structure 316 * Otherwise check against the per-tree limit. Peek at one of the
309 * attached, but the agf information will not yet have been initialised 317 * verifier magic values to determine the type of tree we're verifying
310 * from the on disk AGF. Again, we can only check against maximum limits 318 * against.
311 * in this case.
312 */ 319 */
313 level = be16_to_cpu(block->bb_level); 320 level = be16_to_cpu(block->bb_level);
314 switch (block->bb_magic) { 321 if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC))
315 case cpu_to_be32(XFS_ABTB_CRC_MAGIC): 322 btnum = XFS_BTNUM_CNTi;
316 fa = xfs_btree_sblock_v5hdr_verify(bp); 323 if (pag && pag->pagf_init) {
317 if (fa) 324 if (level >= pag->pagf_levels[btnum])
318 return fa;
319 /* fall through */
320 case cpu_to_be32(XFS_ABTB_MAGIC):
321 if (pag && pag->pagf_init) {
322 if (level >= pag->pagf_levels[XFS_BTNUM_BNOi])
323 return __this_address;
324 } else if (level >= mp->m_ag_maxlevels)
325 return __this_address; 325 return __this_address;
326 break; 326 } else if (level >= mp->m_ag_maxlevels)
327 case cpu_to_be32(XFS_ABTC_CRC_MAGIC):
328 fa = xfs_btree_sblock_v5hdr_verify(bp);
329 if (fa)
330 return fa;
331 /* fall through */
332 case cpu_to_be32(XFS_ABTC_MAGIC):
333 if (pag && pag->pagf_init) {
334 if (level >= pag->pagf_levels[XFS_BTNUM_CNTi])
335 return __this_address;
336 } else if (level >= mp->m_ag_maxlevels)
337 return __this_address;
338 break;
339 default:
340 return __this_address; 327 return __this_address;
341 }
342 328
343 return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]); 329 return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]);
344} 330}
@@ -377,13 +363,23 @@ xfs_allocbt_write_verify(
377 363
378} 364}
379 365
380const struct xfs_buf_ops xfs_allocbt_buf_ops = { 366const struct xfs_buf_ops xfs_bnobt_buf_ops = {
381 .name = "xfs_allocbt", 367 .name = "xfs_bnobt",
368 .magic = { cpu_to_be32(XFS_ABTB_MAGIC),
369 cpu_to_be32(XFS_ABTB_CRC_MAGIC) },
382 .verify_read = xfs_allocbt_read_verify, 370 .verify_read = xfs_allocbt_read_verify,
383 .verify_write = xfs_allocbt_write_verify, 371 .verify_write = xfs_allocbt_write_verify,
384 .verify_struct = xfs_allocbt_verify, 372 .verify_struct = xfs_allocbt_verify,
385}; 373};
386 374
375const struct xfs_buf_ops xfs_cntbt_buf_ops = {
376 .name = "xfs_cntbt",
377 .magic = { cpu_to_be32(XFS_ABTC_MAGIC),
378 cpu_to_be32(XFS_ABTC_CRC_MAGIC) },
379 .verify_read = xfs_allocbt_read_verify,
380 .verify_write = xfs_allocbt_write_verify,
381 .verify_struct = xfs_allocbt_verify,
382};
387 383
388STATIC int 384STATIC int
389xfs_bnobt_keys_inorder( 385xfs_bnobt_keys_inorder(
@@ -448,7 +444,7 @@ static const struct xfs_btree_ops xfs_bnobt_ops = {
448 .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, 444 .init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
449 .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, 445 .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
450 .key_diff = xfs_bnobt_key_diff, 446 .key_diff = xfs_bnobt_key_diff,
451 .buf_ops = &xfs_allocbt_buf_ops, 447 .buf_ops = &xfs_bnobt_buf_ops,
452 .diff_two_keys = xfs_bnobt_diff_two_keys, 448 .diff_two_keys = xfs_bnobt_diff_two_keys,
453 .keys_inorder = xfs_bnobt_keys_inorder, 449 .keys_inorder = xfs_bnobt_keys_inorder,
454 .recs_inorder = xfs_bnobt_recs_inorder, 450 .recs_inorder = xfs_bnobt_recs_inorder,
@@ -470,7 +466,7 @@ static const struct xfs_btree_ops xfs_cntbt_ops = {
470 .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, 466 .init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
471 .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, 467 .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
472 .key_diff = xfs_cntbt_key_diff, 468 .key_diff = xfs_cntbt_key_diff,
473 .buf_ops = &xfs_allocbt_buf_ops, 469 .buf_ops = &xfs_cntbt_buf_ops,
474 .diff_two_keys = xfs_cntbt_diff_two_keys, 470 .diff_two_keys = xfs_cntbt_diff_two_keys,
475 .keys_inorder = xfs_cntbt_keys_inorder, 471 .keys_inorder = xfs_cntbt_keys_inorder,
476 .recs_inorder = xfs_cntbt_recs_inorder, 472 .recs_inorder = xfs_cntbt_recs_inorder,
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 844ed87b1900..2dd9ee2a2e08 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -1336,3 +1336,20 @@ xfs_attr_node_get(xfs_da_args_t *args)
1336 xfs_da_state_free(state); 1336 xfs_da_state_free(state);
1337 return retval; 1337 return retval;
1338} 1338}
1339
1340/* Returns true if the attribute entry name is valid. */
1341bool
1342xfs_attr_namecheck(
1343 const void *name,
1344 size_t length)
1345{
1346 /*
1347 * MAXNAMELEN includes the trailing null, but (name/length) leave it
1348 * out, so use >= for the length check.
1349 */
1350 if (length >= MAXNAMELEN)
1351 return false;
1352
1353 /* There shouldn't be any nulls here */
1354 return !memchr(name, 0, length);
1355}
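
xfs_attr_namecheck() gives callers one place to validate attribute names read from disk. A standalone restatement with a usage example; MAXNAMELEN here is XFS's 256, which counts the trailing NUL that the (name, length) pair omits:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define MAXNAMELEN 256

static bool attr_namecheck(const void *name, size_t length)
{
        /* MAXNAMELEN counts the trailing NUL the caller omits. */
        if (length >= MAXNAMELEN)
                return false;
        /* No embedded NULs allowed. */
        return !memchr(name, 0, length);
}

int main(void)
{
        printf("%d %d\n", attr_namecheck("user.comment", 12),
               attr_namecheck("bad\0name", 8));         /* prints 1 0 */
        return 0;
}
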
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index bdf52a333f3f..2297d8467666 100644
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -145,6 +145,6 @@ int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
145int xfs_attr_remove_args(struct xfs_da_args *args); 145int xfs_attr_remove_args(struct xfs_da_args *args);
146int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, 146int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
147 int flags, struct attrlist_cursor_kern *cursor); 147 int flags, struct attrlist_cursor_kern *cursor);
148 148bool xfs_attr_namecheck(const void *name, size_t length);
149 149
150#endif /* __XFS_ATTR_H__ */ 150#endif /* __XFS_ATTR_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 2652d00842d6..1f6e3965ff74 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -245,25 +245,14 @@ xfs_attr3_leaf_verify(
245 struct xfs_attr_leaf_entry *entries; 245 struct xfs_attr_leaf_entry *entries;
246 uint32_t end; /* must be 32bit - see below */ 246 uint32_t end; /* must be 32bit - see below */
247 int i; 247 int i;
248 xfs_failaddr_t fa;
248 249
249 xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); 250 xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
250 251
251 if (xfs_sb_version_hascrc(&mp->m_sb)) { 252 fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
252 struct xfs_da3_node_hdr *hdr3 = bp->b_addr; 253 if (fa)
253 254 return fa;
254 if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC)
255 return __this_address;
256 255
257 if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid))
258 return __this_address;
259 if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
260 return __this_address;
261 if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
262 return __this_address;
263 } else {
264 if (ichdr.magic != XFS_ATTR_LEAF_MAGIC)
265 return __this_address;
266 }
267 /* 256 /*
268 * In recovery there is a transient state where count == 0 is valid 257 * In recovery there is a transient state where count == 0 is valid
269 * because we may have transitioned an empty shortform attr to a leaf 258 * because we may have transitioned an empty shortform attr to a leaf
@@ -369,6 +358,8 @@ xfs_attr3_leaf_read_verify(
369 358
370const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = { 359const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
371 .name = "xfs_attr3_leaf", 360 .name = "xfs_attr3_leaf",
361 .magic16 = { cpu_to_be16(XFS_ATTR_LEAF_MAGIC),
362 cpu_to_be16(XFS_ATTR3_LEAF_MAGIC) },
372 .verify_read = xfs_attr3_leaf_read_verify, 363 .verify_read = xfs_attr3_leaf_read_verify,
373 .verify_write = xfs_attr3_leaf_write_verify, 364 .verify_write = xfs_attr3_leaf_write_verify,
374 .verify_struct = xfs_attr3_leaf_verify, 365 .verify_struct = xfs_attr3_leaf_verify,
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index d89363c6b523..65ff600a8067 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -79,6 +79,7 @@ xfs_attr3_rmt_hdr_ok(
79static xfs_failaddr_t 79static xfs_failaddr_t
80xfs_attr3_rmt_verify( 80xfs_attr3_rmt_verify(
81 struct xfs_mount *mp, 81 struct xfs_mount *mp,
82 struct xfs_buf *bp,
82 void *ptr, 83 void *ptr,
83 int fsbsize, 84 int fsbsize,
84 xfs_daddr_t bno) 85 xfs_daddr_t bno)
@@ -87,7 +88,7 @@ xfs_attr3_rmt_verify(
87 88
88 if (!xfs_sb_version_hascrc(&mp->m_sb)) 89 if (!xfs_sb_version_hascrc(&mp->m_sb))
89 return __this_address; 90 return __this_address;
90 if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC)) 91 if (!xfs_verify_magic(bp, rmt->rm_magic))
91 return __this_address; 92 return __this_address;
92 if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid)) 93 if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid))
93 return __this_address; 94 return __this_address;
@@ -131,7 +132,7 @@ __xfs_attr3_rmt_read_verify(
131 *failaddr = __this_address; 132 *failaddr = __this_address;
132 return -EFSBADCRC; 133 return -EFSBADCRC;
133 } 134 }
134 *failaddr = xfs_attr3_rmt_verify(mp, ptr, blksize, bno); 135 *failaddr = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno);
135 if (*failaddr) 136 if (*failaddr)
136 return -EFSCORRUPTED; 137 return -EFSCORRUPTED;
137 len -= blksize; 138 len -= blksize;
@@ -193,7 +194,7 @@ xfs_attr3_rmt_write_verify(
193 while (len > 0) { 194 while (len > 0) {
194 struct xfs_attr3_rmt_hdr *rmt = (struct xfs_attr3_rmt_hdr *)ptr; 195 struct xfs_attr3_rmt_hdr *rmt = (struct xfs_attr3_rmt_hdr *)ptr;
195 196
196 fa = xfs_attr3_rmt_verify(mp, ptr, blksize, bno); 197 fa = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno);
197 if (fa) { 198 if (fa) {
198 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 199 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
199 return; 200 return;
@@ -220,6 +221,7 @@ xfs_attr3_rmt_write_verify(
220 221
221const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = { 222const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
222 .name = "xfs_attr3_rmt", 223 .name = "xfs_attr3_rmt",
224 .magic = { 0, cpu_to_be32(XFS_ATTR3_RMT_MAGIC) },
223 .verify_read = xfs_attr3_rmt_read_verify, 225 .verify_read = xfs_attr3_rmt_read_verify,
224 .verify_write = xfs_attr3_rmt_write_verify, 226 .verify_write = xfs_attr3_rmt_write_verify,
225 .verify_struct = xfs_attr3_rmt_verify_struct, 227 .verify_struct = xfs_attr3_rmt_verify_struct,
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 332eefa2700b..48502cb9990f 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -577,42 +577,44 @@ __xfs_bmap_add_free(
577 */ 577 */
578 578
579/* 579/*
580 * Transform a btree format file with only one leaf node, where the 580 * Convert the inode format to extent format if it currently is in btree format,
581 * extents list will fit in the inode, into an extents format file. 581 * but the extent list is small enough that it fits into the extent format.
582 * Since the file extents are already in-core, all we have to do is 582 *
583 * give up the space for the btree root and pitch the leaf block. 583 * Since the extents are already in-core, all we have to do is give up the space
584 * for the btree root and pitch the leaf block.
584 */ 585 */
585STATIC int /* error */ 586STATIC int /* error */
586xfs_bmap_btree_to_extents( 587xfs_bmap_btree_to_extents(
587 xfs_trans_t *tp, /* transaction pointer */ 588 struct xfs_trans *tp, /* transaction pointer */
588 xfs_inode_t *ip, /* incore inode pointer */ 589 struct xfs_inode *ip, /* incore inode pointer */
589 xfs_btree_cur_t *cur, /* btree cursor */ 590 struct xfs_btree_cur *cur, /* btree cursor */
590 int *logflagsp, /* inode logging flags */ 591 int *logflagsp, /* inode logging flags */
591 int whichfork) /* data or attr fork */ 592 int whichfork) /* data or attr fork */
592{ 593{
593 /* REFERENCED */ 594 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
595 struct xfs_mount *mp = ip->i_mount;
596 struct xfs_btree_block *rblock = ifp->if_broot;
594 struct xfs_btree_block *cblock;/* child btree block */ 597 struct xfs_btree_block *cblock;/* child btree block */
595 xfs_fsblock_t cbno; /* child block number */ 598 xfs_fsblock_t cbno; /* child block number */
596 xfs_buf_t *cbp; /* child block's buffer */ 599 xfs_buf_t *cbp; /* child block's buffer */
597 int error; /* error return value */ 600 int error; /* error return value */
598 struct xfs_ifork *ifp; /* inode fork data */
599 xfs_mount_t *mp; /* mount point structure */
600 __be64 *pp; /* ptr to block address */ 601 __be64 *pp; /* ptr to block address */
601 struct xfs_btree_block *rblock;/* root btree block */
602 struct xfs_owner_info oinfo; 602 struct xfs_owner_info oinfo;
603 603
604 mp = ip->i_mount; 604 /* check if we actually need the extent format first: */
605 ifp = XFS_IFORK_PTR(ip, whichfork); 605 if (!xfs_bmap_wants_extents(ip, whichfork))
606 return 0;
607
608 ASSERT(cur);
606 ASSERT(whichfork != XFS_COW_FORK); 609 ASSERT(whichfork != XFS_COW_FORK);
607 ASSERT(ifp->if_flags & XFS_IFEXTENTS); 610 ASSERT(ifp->if_flags & XFS_IFEXTENTS);
608 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); 611 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
609 rblock = ifp->if_broot;
610 ASSERT(be16_to_cpu(rblock->bb_level) == 1); 612 ASSERT(be16_to_cpu(rblock->bb_level) == 1);
611 ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1); 613 ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
612 ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1); 614 ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
615
613 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes); 616 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
614 cbno = be64_to_cpu(*pp); 617 cbno = be64_to_cpu(*pp);
615 *logflagsp = 0;
616#ifdef DEBUG 618#ifdef DEBUG
617 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, 619 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
618 xfs_btree_check_lptr(cur, cbno, 1)); 620 xfs_btree_check_lptr(cur, cbno, 1));
@@ -635,7 +637,7 @@ xfs_bmap_btree_to_extents(
635 ASSERT(ifp->if_broot == NULL); 637 ASSERT(ifp->if_broot == NULL);
636 ASSERT((ifp->if_flags & XFS_IFBROOT) == 0); 638 ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
637 XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); 639 XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
638 *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); 640 *logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
639 return 0; 641 return 0;
640} 642}
641 643
@@ -2029,7 +2031,7 @@ done:
2029/* 2031/*
2030 * Convert an unwritten allocation to a real allocation or vice versa. 2032 * Convert an unwritten allocation to a real allocation or vice versa.
2031 */ 2033 */
2032STATIC int /* error */ 2034int /* error */
2033xfs_bmap_add_extent_unwritten_real( 2035xfs_bmap_add_extent_unwritten_real(
2034 struct xfs_trans *tp, 2036 struct xfs_trans *tp,
2035 xfs_inode_t *ip, /* incore inode pointer */ 2037 xfs_inode_t *ip, /* incore inode pointer */
@@ -3685,17 +3687,6 @@ xfs_trim_extent(
3685 } 3687 }
3686} 3688}
3687 3689
3688/* trim extent to within eof */
3689void
3690xfs_trim_extent_eof(
3691 struct xfs_bmbt_irec *irec,
3692 struct xfs_inode *ip)
3693
3694{
3695 xfs_trim_extent(irec, 0, XFS_B_TO_FSB(ip->i_mount,
3696 i_size_read(VFS_I(ip))));
3697}
3698
3699/* 3690/*
3700 * Trim the returned map to the required bounds 3691 * Trim the returned map to the required bounds
3701 */ 3692 */
@@ -4203,6 +4194,44 @@ xfs_bmapi_convert_unwritten(
4203 return 0; 4194 return 0;
4204} 4195}
4205 4196
4197static inline xfs_extlen_t
4198xfs_bmapi_minleft(
4199 struct xfs_trans *tp,
4200 struct xfs_inode *ip,
4201 int fork)
4202{
4203 if (tp && tp->t_firstblock != NULLFSBLOCK)
4204 return 0;
4205 if (XFS_IFORK_FORMAT(ip, fork) != XFS_DINODE_FMT_BTREE)
4206 return 1;
4207 return be16_to_cpu(XFS_IFORK_PTR(ip, fork)->if_broot->bb_level) + 1;
4208}
4209
4210/*
4211 * Log whatever the flags say, even if error. Otherwise we might miss detecting
4212 * a case where the data is changed, there's an error, and it's not logged so we
4213 * don't shut down when we should. Don't bother logging extents/btree changes if
4214 * we converted to the other format.
4215 */
4216static void
4217xfs_bmapi_finish(
4218 struct xfs_bmalloca *bma,
4219 int whichfork,
4220 int error)
4221{
4222 if ((bma->logflags & xfs_ilog_fext(whichfork)) &&
4223 XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
4224 bma->logflags &= ~xfs_ilog_fext(whichfork);
4225 else if ((bma->logflags & xfs_ilog_fbroot(whichfork)) &&
4226 XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_BTREE)
4227 bma->logflags &= ~xfs_ilog_fbroot(whichfork);
4228
4229 if (bma->logflags)
4230 xfs_trans_log_inode(bma->tp, bma->ip, bma->logflags);
4231 if (bma->cur)
4232 xfs_btree_del_cursor(bma->cur, error);
4233}
4234
4206/* 4235/*
4207 * Map file blocks to filesystem blocks, and allocate blocks or convert the 4236 * Map file blocks to filesystem blocks, and allocate blocks or convert the
4208 * extent state if necessary. Details behaviour is controlled by the flags 4237 * extent state if necessary. Details behaviour is controlled by the flags
@@ -4247,9 +4276,7 @@ xfs_bmapi_write(
4247 4276
4248 ASSERT(*nmap >= 1); 4277 ASSERT(*nmap >= 1);
4249 ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); 4278 ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
4250 ASSERT(tp != NULL || 4279 ASSERT(tp != NULL);
4251 (flags & (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)) ==
4252 (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK));
4253 ASSERT(len > 0); 4280 ASSERT(len > 0);
4254 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); 4281 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
4255 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 4282 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -4282,25 +4309,12 @@ xfs_bmapi_write(
4282 4309
4283 XFS_STATS_INC(mp, xs_blk_mapw); 4310 XFS_STATS_INC(mp, xs_blk_mapw);
4284 4311
4285 if (!tp || tp->t_firstblock == NULLFSBLOCK) {
4286 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
4287 bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
4288 else
4289 bma.minleft = 1;
4290 } else {
4291 bma.minleft = 0;
4292 }
4293
4294 if (!(ifp->if_flags & XFS_IFEXTENTS)) { 4312 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
4295 error = xfs_iread_extents(tp, ip, whichfork); 4313 error = xfs_iread_extents(tp, ip, whichfork);
4296 if (error) 4314 if (error)
4297 goto error0; 4315 goto error0;
4298 } 4316 }
4299 4317
4300 n = 0;
4301 end = bno + len;
4302 obno = bno;
4303
4304 if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.icur, &bma.got)) 4318 if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.icur, &bma.got))
4305 eof = true; 4319 eof = true;
4306 if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev)) 4320 if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev))
@@ -4309,7 +4323,11 @@ xfs_bmapi_write(
4309 bma.ip = ip; 4323 bma.ip = ip;
4310 bma.total = total; 4324 bma.total = total;
4311 bma.datatype = 0; 4325 bma.datatype = 0;
4326 bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
4312 4327
4328 n = 0;
4329 end = bno + len;
4330 obno = bno;
4313 while (bno < end && n < *nmap) { 4331 while (bno < end && n < *nmap) {
4314 bool need_alloc = false, wasdelay = false; 4332 bool need_alloc = false, wasdelay = false;
4315 4333
@@ -4323,26 +4341,7 @@ xfs_bmapi_write(
4323 ASSERT(!((flags & XFS_BMAPI_CONVERT) && 4341 ASSERT(!((flags & XFS_BMAPI_CONVERT) &&
4324 (flags & XFS_BMAPI_COWFORK))); 4342 (flags & XFS_BMAPI_COWFORK)));
4325 4343
4326 if (flags & XFS_BMAPI_DELALLOC) { 4344 need_alloc = true;
4327 /*
4328 * For the COW fork we can reasonably get a
4329 * request for converting an extent that races
4330 * with other threads already having converted
4331 * part of it, as there converting COW to
4332 * regular blocks is not protected using the
4333 * IOLOCK.
4334 */
4335 ASSERT(flags & XFS_BMAPI_COWFORK);
4336 if (!(flags & XFS_BMAPI_COWFORK)) {
4337 error = -EIO;
4338 goto error0;
4339 }
4340
4341 if (eof || bno >= end)
4342 break;
4343 } else {
4344 need_alloc = true;
4345 }
4346 } else if (isnullstartblock(bma.got.br_startblock)) { 4345 } else if (isnullstartblock(bma.got.br_startblock)) {
4347 wasdelay = true; 4346 wasdelay = true;
4348 } 4347 }
@@ -4351,8 +4350,7 @@ xfs_bmapi_write(
4351 * First, deal with the hole before the allocated space 4350 * First, deal with the hole before the allocated space
4352 * that we found, if any. 4351 * that we found, if any.
4353 */ 4352 */
4354 if ((need_alloc || wasdelay) && 4353 if (need_alloc || wasdelay) {
4355 !(flags & XFS_BMAPI_CONVERT_ONLY)) {
4356 bma.eof = eof; 4354 bma.eof = eof;
4357 bma.conv = !!(flags & XFS_BMAPI_CONVERT); 4355 bma.conv = !!(flags & XFS_BMAPI_CONVERT);
4358 bma.wasdel = wasdelay; 4356 bma.wasdel = wasdelay;
@@ -4420,49 +4418,130 @@ xfs_bmapi_write(
4420 } 4418 }
4421 *nmap = n; 4419 *nmap = n;
4422 4420
4423 /* 4421 error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
4424 * Transform from btree to extents, give it cur. 4422 whichfork);
4425 */ 4423 if (error)
4426 if (xfs_bmap_wants_extents(ip, whichfork)) { 4424 goto error0;
4427 int tmp_logflags = 0;
4428
4429 ASSERT(bma.cur);
4430 error = xfs_bmap_btree_to_extents(tp, ip, bma.cur,
4431 &tmp_logflags, whichfork);
4432 bma.logflags |= tmp_logflags;
4433 if (error)
4434 goto error0;
4435 }
4436 4425
4437 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || 4426 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
4438 XFS_IFORK_NEXTENTS(ip, whichfork) > 4427 XFS_IFORK_NEXTENTS(ip, whichfork) >
4439 XFS_IFORK_MAXEXT(ip, whichfork)); 4428 XFS_IFORK_MAXEXT(ip, whichfork));
4440 error = 0; 4429 xfs_bmapi_finish(&bma, whichfork, 0);
4430 xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
4431 orig_nmap, *nmap);
4432 return 0;
4441error0: 4433error0:
4434 xfs_bmapi_finish(&bma, whichfork, error);
4435 return error;
4436}
4437
4438/*
4439 * Convert an existing delalloc extent to real blocks based on file offset. This
4440 * attempts to allocate the entire delalloc extent and may require multiple
4441 * invocations to allocate the target offset if a large enough physical extent
4442 * is not available.
4443 */
4444int
4445xfs_bmapi_convert_delalloc(
4446 struct xfs_inode *ip,
4447 int whichfork,
4448 xfs_fileoff_t offset_fsb,
4449 struct xfs_bmbt_irec *imap,
4450 unsigned int *seq)
4451{
4452 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
4453 struct xfs_mount *mp = ip->i_mount;
4454 struct xfs_bmalloca bma = { NULL };
4455 struct xfs_trans *tp;
4456 int error;
4457
4442 /* 4458 /*
4443 * Log everything. Do this after conversion, there's no point in 4459 * Space for the extent and indirect blocks was reserved when the
4444 * logging the extent records if we've converted to btree format. 4460 * delalloc extent was created so there's no need to do so here.
4445 */ 4461 */
4446 if ((bma.logflags & xfs_ilog_fext(whichfork)) && 4462 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0,
4447 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) 4463 XFS_TRANS_RESERVE, &tp);
4448 bma.logflags &= ~xfs_ilog_fext(whichfork); 4464 if (error)
4449 else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) && 4465 return error;
4450 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) 4466
4451 bma.logflags &= ~xfs_ilog_fbroot(whichfork); 4467 xfs_ilock(ip, XFS_ILOCK_EXCL);
4468 xfs_trans_ijoin(tp, ip, 0);
4469
4470 if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) ||
4471 bma.got.br_startoff > offset_fsb) {
4472 /*
4473 * No extent found in the range we are trying to convert. This
4474 * should only happen for the COW fork, where another thread
4475 * might have moved the extent to the data fork in the meantime.
4476 */
4477 WARN_ON_ONCE(whichfork != XFS_COW_FORK);
4478 error = -EAGAIN;
4479 goto out_trans_cancel;
4480 }
4481
4452 /* 4482 /*
4453 * Log whatever the flags say, even if error. Otherwise we might miss 4483 * If we find a real extent here we raced with another thread converting
4454 * detecting a case where the data is changed, there's an error, 4484 * the extent. Just return the real extent at this offset.
4455 * and it's not logged so we don't shutdown when we should.
4456 */ 4485 */
4457 if (bma.logflags) 4486 if (!isnullstartblock(bma.got.br_startblock)) {
4458 xfs_trans_log_inode(tp, ip, bma.logflags); 4487 *imap = bma.got;
4488 *seq = READ_ONCE(ifp->if_seq);
4489 goto out_trans_cancel;
4490 }
4491
4492 bma.tp = tp;
4493 bma.ip = ip;
4494 bma.wasdel = true;
4495 bma.offset = bma.got.br_startoff;
4496 bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN);
4497 bma.total = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
4498 bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
4499 if (whichfork == XFS_COW_FORK)
4500 bma.flags = XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC;
4459 4501
4460 if (bma.cur) { 4502 if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev))
4461 xfs_btree_del_cursor(bma.cur, error); 4503 bma.prev.br_startoff = NULLFILEOFF;
4504
4505 error = xfs_bmapi_allocate(&bma);
4506 if (error)
4507 goto out_finish;
4508
4509 error = -ENOSPC;
4510 if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK))
4511 goto out_finish;
4512 error = -EFSCORRUPTED;
4513 if (WARN_ON_ONCE(!bma.got.br_startblock && !XFS_IS_REALTIME_INODE(ip)))
4514 goto out_finish;
4515
4516 XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length));
4517 XFS_STATS_INC(mp, xs_xstrat_quick);
4518
4519 ASSERT(!isnullstartblock(bma.got.br_startblock));
4520 *imap = bma.got;
4521 *seq = READ_ONCE(ifp->if_seq);
4522
4523 if (whichfork == XFS_COW_FORK) {
4524 error = xfs_refcount_alloc_cow_extent(tp, bma.blkno,
4525 bma.length);
4526 if (error)
4527 goto out_finish;
4462 } 4528 }
4463 if (!error) 4529
4464 xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval, 4530 error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
4465 orig_nmap, *nmap); 4531 whichfork);
4532 if (error)
4533 goto out_finish;
4534
4535 xfs_bmapi_finish(&bma, whichfork, 0);
4536 error = xfs_trans_commit(tp);
4537 xfs_iunlock(ip, XFS_ILOCK_EXCL);
4538 return error;
4539
4540out_finish:
4541 xfs_bmapi_finish(&bma, whichfork, error);
4542out_trans_cancel:
4543 xfs_trans_cancel(tp);
4544 xfs_iunlock(ip, XFS_ILOCK_EXCL);
4466 return error; 4545 return error;
4467} 4546}
4468 4547
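A caller that needs offset_fsb mapped is expected to loop, since each call may convert only part of the delalloc extent. A sketch of such a caller (illustrative only, error handling trimmed):

	struct xfs_bmbt_irec	imap;
	unsigned int		seq;
	int			error;

	do {
		error = xfs_bmapi_convert_delalloc(ip, XFS_DATA_FORK,
				offset_fsb, &imap, &seq);
		if (error)
			return error;
	} while (imap.br_startoff + imap.br_blockcount <= offset_fsb);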
@@ -4536,13 +4615,7 @@ xfs_bmapi_remap(
4536 if (error) 4615 if (error)
4537 goto error0; 4616 goto error0;
4538 4617
4539 if (xfs_bmap_wants_extents(ip, whichfork)) { 4618 error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags, whichfork);
4540 int tmp_logflags = 0;
4541
4542 error = xfs_bmap_btree_to_extents(tp, ip, cur,
4543 &tmp_logflags, whichfork);
4544 logflags |= tmp_logflags;
4545 }
4546 4619
4547error0: 4620error0:
4548 if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) 4621 if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS)
@@ -5406,24 +5479,11 @@ nodelete:
5406 error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, 5479 error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0,
5407 &tmp_logflags, whichfork); 5480 &tmp_logflags, whichfork);
5408 logflags |= tmp_logflags; 5481 logflags |= tmp_logflags;
5409 if (error) 5482 } else {
5410 goto error0; 5483 error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags,
5411 }
5412 /*
5413 * transform from btree to extents, give it cur
5414 */
5415 else if (xfs_bmap_wants_extents(ip, whichfork)) {
5416 ASSERT(cur != NULL);
5417 error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
5418 whichfork); 5484 whichfork);
5419 logflags |= tmp_logflags;
5420 if (error)
5421 goto error0;
5422 } 5485 }
5423 /* 5486
5424 * transform from extents to local?
5425 */
5426 error = 0;
5427error0: 5487error0:
5428 /* 5488 /*
5429 * Log everything. Do this after conversion, there's no point in 5489 * Log everything. Do this after conversion, there's no point in
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 09d3ea97cc15..8f597f9abdbe 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -95,12 +95,6 @@ struct xfs_extent_free_item
95/* Map something in the CoW fork. */ 95/* Map something in the CoW fork. */
96#define XFS_BMAPI_COWFORK 0x200 96#define XFS_BMAPI_COWFORK 0x200
97 97
98/* Only convert delalloc space, don't allocate entirely new extents */
99#define XFS_BMAPI_DELALLOC 0x400
100
101/* Only convert unwritten extents, don't allocate new blocks */
102#define XFS_BMAPI_CONVERT_ONLY 0x800
103
104/* Skip online discard of freed extents */ 98/* Skip online discard of freed extents */
105#define XFS_BMAPI_NODISCARD 0x1000 99#define XFS_BMAPI_NODISCARD 0x1000
106 100
@@ -117,8 +111,6 @@ struct xfs_extent_free_item
117 { XFS_BMAPI_ZERO, "ZERO" }, \ 111 { XFS_BMAPI_ZERO, "ZERO" }, \
118 { XFS_BMAPI_REMAP, "REMAP" }, \ 112 { XFS_BMAPI_REMAP, "REMAP" }, \
119 { XFS_BMAPI_COWFORK, "COWFORK" }, \ 113 { XFS_BMAPI_COWFORK, "COWFORK" }, \
120 { XFS_BMAPI_DELALLOC, "DELALLOC" }, \
121 { XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" }, \
122 { XFS_BMAPI_NODISCARD, "NODISCARD" }, \ 114 { XFS_BMAPI_NODISCARD, "NODISCARD" }, \
123 { XFS_BMAPI_NORMAP, "NORMAP" } 115 { XFS_BMAPI_NORMAP, "NORMAP" }
124 116
@@ -181,7 +173,6 @@ static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
181 173
182void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, 174void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
183 xfs_filblks_t len); 175 xfs_filblks_t len);
184void xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *);
185int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); 176int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
186int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version); 177int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version);
187void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); 178void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
@@ -228,6 +219,13 @@ int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
228 xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc, 219 xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc,
229 struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur, 220 struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur,
230 int eof); 221 int eof);
222int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork,
223 xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap,
224 unsigned int *seq);
225int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
226 struct xfs_inode *ip, int whichfork,
227 struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp,
228 struct xfs_bmbt_irec *new, int *logflagsp);
231 229
232static inline void 230static inline void
233xfs_bmap_add_free( 231xfs_bmap_add_free(
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index cdb74d2e2a43..aff82ed112c9 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -416,8 +416,10 @@ xfs_bmbt_verify(
416 xfs_failaddr_t fa; 416 xfs_failaddr_t fa;
417 unsigned int level; 417 unsigned int level;
418 418
419 switch (block->bb_magic) { 419 if (!xfs_verify_magic(bp, block->bb_magic))
420 case cpu_to_be32(XFS_BMAP_CRC_MAGIC): 420 return __this_address;
421
422 if (xfs_sb_version_hascrc(&mp->m_sb)) {
421 /* 423 /*
422 * XXX: need a better way of verifying the owner here. Right now 424 * XXX: need a better way of verifying the owner here. Right now
423 * just make sure there has been one set. 425 * just make sure there has been one set.
@@ -425,11 +427,6 @@ xfs_bmbt_verify(
425 fa = xfs_btree_lblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); 427 fa = xfs_btree_lblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
426 if (fa) 428 if (fa)
427 return fa; 429 return fa;
428 /* fall through */
429 case cpu_to_be32(XFS_BMAP_MAGIC):
430 break;
431 default:
432 return __this_address;
433 } 430 }
434 431
435 /* 432 /*
@@ -481,6 +478,8 @@ xfs_bmbt_write_verify(
481 478
482const struct xfs_buf_ops xfs_bmbt_buf_ops = { 479const struct xfs_buf_ops xfs_bmbt_buf_ops = {
483 .name = "xfs_bmbt", 480 .name = "xfs_bmbt",
481 .magic = { cpu_to_be32(XFS_BMAP_MAGIC),
482 cpu_to_be32(XFS_BMAP_CRC_MAGIC) },
484 .verify_read = xfs_bmbt_read_verify, 483 .verify_read = xfs_bmbt_read_verify,
485 .verify_write = xfs_bmbt_write_verify, 484 .verify_write = xfs_bmbt_write_verify,
486 .verify_struct = xfs_bmbt_verify, 485 .verify_struct = xfs_bmbt_verify,
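This hunk shows the pattern used throughout the series: each xfs_buf_ops now carries a { non-CRC magic, CRC magic } pair and the verifier calls xfs_verify_magic() instead of open-coding a switch. The helper itself is defined elsewhere in the series; it amounts to something like this sketch:

	bool
	xfs_verify_magic(
		struct xfs_buf		*bp,
		__be32			dmagic)
	{
		struct xfs_mount	*mp = bp->b_target->bt_mount;
		int			idx;

		/* Pick the table slot by whether this is a v5 (CRC) fs. */
		idx = xfs_sb_version_hascrc(&mp->m_sb);
		if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))
			return false;
		return dmagic == bp->b_ops->magic[idx];
	}

A magic16 variant does the same for 16-bit magics (da blocks, dquots, inodes). Ops with no non-CRC on-disk format (refcountbt, rmapbt, symlink) leave slot 0 zeroed so pre-CRC filesystems fail the check, while formats whose magic never changed (sb, agi, dquot, inode) simply repeat the same value in both slots.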
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 376bee94b5dd..e2737e2ac2ae 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -116,6 +116,34 @@ xfs_da_state_free(xfs_da_state_t *state)
116 kmem_zone_free(xfs_da_state_zone, state); 116 kmem_zone_free(xfs_da_state_zone, state);
117} 117}
118 118
119/*
120 * Verify an xfs_da3_blkinfo structure. Note that the da3 fields are only
121 * accessible on v5 filesystems. This header format is common across da node,
122 * attr leaf and dir leaf blocks.
123 */
124xfs_failaddr_t
125xfs_da3_blkinfo_verify(
126 struct xfs_buf *bp,
127 struct xfs_da3_blkinfo *hdr3)
128{
129 struct xfs_mount *mp = bp->b_target->bt_mount;
130 struct xfs_da_blkinfo *hdr = &hdr3->hdr;
131
132 if (!xfs_verify_magic16(bp, hdr->magic))
133 return __this_address;
134
135 if (xfs_sb_version_hascrc(&mp->m_sb)) {
136 if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
137 return __this_address;
138 if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
139 return __this_address;
140 if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
141 return __this_address;
142 }
143
144 return NULL;
145}
146
119static xfs_failaddr_t 147static xfs_failaddr_t
120xfs_da3_node_verify( 148xfs_da3_node_verify(
121 struct xfs_buf *bp) 149 struct xfs_buf *bp)
@@ -124,27 +152,16 @@ xfs_da3_node_verify(
124 struct xfs_da_intnode *hdr = bp->b_addr; 152 struct xfs_da_intnode *hdr = bp->b_addr;
125 struct xfs_da3_icnode_hdr ichdr; 153 struct xfs_da3_icnode_hdr ichdr;
126 const struct xfs_dir_ops *ops; 154 const struct xfs_dir_ops *ops;
155 xfs_failaddr_t fa;
127 156
128 ops = xfs_dir_get_ops(mp, NULL); 157 ops = xfs_dir_get_ops(mp, NULL);
129 158
130 ops->node_hdr_from_disk(&ichdr, hdr); 159 ops->node_hdr_from_disk(&ichdr, hdr);
131 160
132 if (xfs_sb_version_hascrc(&mp->m_sb)) { 161 fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
133 struct xfs_da3_node_hdr *hdr3 = bp->b_addr; 162 if (fa)
134 163 return fa;
135 if (ichdr.magic != XFS_DA3_NODE_MAGIC)
136 return __this_address;
137 164
138 if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid))
139 return __this_address;
140 if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
141 return __this_address;
142 if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
143 return __this_address;
144 } else {
145 if (ichdr.magic != XFS_DA_NODE_MAGIC)
146 return __this_address;
147 }
148 if (ichdr.level == 0) 165 if (ichdr.level == 0)
149 return __this_address; 166 return __this_address;
150 if (ichdr.level > XFS_DA_NODE_MAXDEPTH) 167 if (ichdr.level > XFS_DA_NODE_MAXDEPTH)
@@ -257,6 +274,8 @@ xfs_da3_node_verify_struct(
257 274
258const struct xfs_buf_ops xfs_da3_node_buf_ops = { 275const struct xfs_buf_ops xfs_da3_node_buf_ops = {
259 .name = "xfs_da3_node", 276 .name = "xfs_da3_node",
277 .magic16 = { cpu_to_be16(XFS_DA_NODE_MAGIC),
278 cpu_to_be16(XFS_DA3_NODE_MAGIC) },
260 .verify_read = xfs_da3_node_read_verify, 279 .verify_read = xfs_da3_node_read_verify,
261 .verify_write = xfs_da3_node_write_verify, 280 .verify_write = xfs_da3_node_write_verify,
262 .verify_struct = xfs_da3_node_verify_struct, 281 .verify_struct = xfs_da3_node_verify_struct,
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 5d5bf3bffc78..ae654e06b2fb 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -869,4 +869,7 @@ static inline unsigned int xfs_dir2_dirblock_bytes(struct xfs_sb *sbp)
869 return 1 << (sbp->sb_blocklog + sbp->sb_dirblklog); 869 return 1 << (sbp->sb_blocklog + sbp->sb_dirblklog);
870} 870}
871 871
872xfs_failaddr_t xfs_da3_blkinfo_verify(struct xfs_buf *bp,
873 struct xfs_da3_blkinfo *hdr3);
874
872#endif /* __XFS_DA_FORMAT_H__ */ 875#endif /* __XFS_DA_FORMAT_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 229152cd1a24..156ce95c9c45 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -703,3 +703,20 @@ xfs_dir2_shrink_inode(
703 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 703 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
704 return 0; 704 return 0;
705} 705}
706
707/* Returns true if the directory entry name is valid. */
708bool
709xfs_dir2_namecheck(
710 const void *name,
711 size_t length)
712{
713 /*
714 * MAXNAMELEN includes the trailing null, but (name/length) leave it
715 * out, so use >= for the length check.
716 */
717 if (length >= MAXNAMELEN)
718 return false;
719
720 /* There shouldn't be any slashes or nulls here */
721 return !memchr(name, '/', length) && !memchr(name, 0, length);
722}
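For illustration, given the two checks above:

	xfs_dir2_namecheck("foo", 3);	/* true: ordinary name */
	xfs_dir2_namecheck("a/b", 3);	/* false: embedded '/' */
	xfs_dir2_namecheck("a\0b", 3);	/* false: embedded NUL */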
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index c3e3f6b813d8..f54244779492 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -326,5 +326,6 @@ xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp)
326unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp, uint8_t filetype); 326unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp, uint8_t filetype);
327void *xfs_dir3_data_endp(struct xfs_da_geometry *geo, 327void *xfs_dir3_data_endp(struct xfs_da_geometry *geo,
328 struct xfs_dir2_data_hdr *hdr); 328 struct xfs_dir2_data_hdr *hdr);
329bool xfs_dir2_namecheck(const void *name, size_t length);
329 330
330#endif /* __XFS_DIR2_H__ */ 331#endif /* __XFS_DIR2_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index 30ed5919da72..b7d6d78f4ce2 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -53,18 +53,16 @@ xfs_dir3_block_verify(
53 struct xfs_mount *mp = bp->b_target->bt_mount; 53 struct xfs_mount *mp = bp->b_target->bt_mount;
54 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; 54 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
55 55
56 if (!xfs_verify_magic(bp, hdr3->magic))
57 return __this_address;
58
56 if (xfs_sb_version_hascrc(&mp->m_sb)) { 59 if (xfs_sb_version_hascrc(&mp->m_sb)) {
57 if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC))
58 return __this_address;
59 if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) 60 if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
60 return __this_address; 61 return __this_address;
61 if (be64_to_cpu(hdr3->blkno) != bp->b_bn) 62 if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
62 return __this_address; 63 return __this_address;
63 if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) 64 if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
64 return __this_address; 65 return __this_address;
65 } else {
66 if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
67 return __this_address;
68 } 66 }
69 return __xfs_dir3_data_check(NULL, bp); 67 return __xfs_dir3_data_check(NULL, bp);
70} 68}
@@ -112,6 +110,8 @@ xfs_dir3_block_write_verify(
112 110
113const struct xfs_buf_ops xfs_dir3_block_buf_ops = { 111const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
114 .name = "xfs_dir3_block", 112 .name = "xfs_dir3_block",
113 .magic = { cpu_to_be32(XFS_DIR2_BLOCK_MAGIC),
114 cpu_to_be32(XFS_DIR3_BLOCK_MAGIC) },
115 .verify_read = xfs_dir3_block_read_verify, 115 .verify_read = xfs_dir3_block_read_verify,
116 .verify_write = xfs_dir3_block_write_verify, 116 .verify_write = xfs_dir3_block_write_verify,
117 .verify_struct = xfs_dir3_block_verify, 117 .verify_struct = xfs_dir3_block_verify,
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index 01162c62ec8f..b7b9ce002cb9 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -252,18 +252,16 @@ xfs_dir3_data_verify(
252 struct xfs_mount *mp = bp->b_target->bt_mount; 252 struct xfs_mount *mp = bp->b_target->bt_mount;
253 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; 253 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
254 254
255 if (!xfs_verify_magic(bp, hdr3->magic))
256 return __this_address;
257
255 if (xfs_sb_version_hascrc(&mp->m_sb)) { 258 if (xfs_sb_version_hascrc(&mp->m_sb)) {
256 if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC))
257 return __this_address;
258 if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) 259 if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
259 return __this_address; 260 return __this_address;
260 if (be64_to_cpu(hdr3->blkno) != bp->b_bn) 261 if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
261 return __this_address; 262 return __this_address;
262 if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) 263 if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
263 return __this_address; 264 return __this_address;
264 } else {
265 if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC))
266 return __this_address;
267 } 265 }
268 return __xfs_dir3_data_check(NULL, bp); 266 return __xfs_dir3_data_check(NULL, bp);
269} 267}
@@ -339,6 +337,8 @@ xfs_dir3_data_write_verify(
339 337
340const struct xfs_buf_ops xfs_dir3_data_buf_ops = { 338const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
341 .name = "xfs_dir3_data", 339 .name = "xfs_dir3_data",
340 .magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC),
341 cpu_to_be32(XFS_DIR3_DATA_MAGIC) },
342 .verify_read = xfs_dir3_data_read_verify, 342 .verify_read = xfs_dir3_data_read_verify,
343 .verify_write = xfs_dir3_data_write_verify, 343 .verify_write = xfs_dir3_data_write_verify,
344 .verify_struct = xfs_dir3_data_verify, 344 .verify_struct = xfs_dir3_data_verify,
@@ -346,6 +346,8 @@ const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
346 346
347static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = { 347static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = {
348 .name = "xfs_dir3_data_reada", 348 .name = "xfs_dir3_data_reada",
349 .magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC),
350 cpu_to_be32(XFS_DIR3_DATA_MAGIC) },
349 .verify_read = xfs_dir3_data_reada_verify, 351 .verify_read = xfs_dir3_data_reada_verify,
350 .verify_write = xfs_dir3_data_write_verify, 352 .verify_write = xfs_dir3_data_write_verify,
351}; 353};
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index 1728a3e6f5cf..9a3767818c50 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -142,41 +142,22 @@ xfs_dir3_leaf_check_int(
142 */ 142 */
143static xfs_failaddr_t 143static xfs_failaddr_t
144xfs_dir3_leaf_verify( 144xfs_dir3_leaf_verify(
145 struct xfs_buf *bp, 145 struct xfs_buf *bp)
146 uint16_t magic)
147{ 146{
148 struct xfs_mount *mp = bp->b_target->bt_mount; 147 struct xfs_mount *mp = bp->b_target->bt_mount;
149 struct xfs_dir2_leaf *leaf = bp->b_addr; 148 struct xfs_dir2_leaf *leaf = bp->b_addr;
149 xfs_failaddr_t fa;
150 150
151 ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC); 151 fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
152 152 if (fa)
153 if (xfs_sb_version_hascrc(&mp->m_sb)) { 153 return fa;
154 struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
155 uint16_t magic3;
156
157 magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? XFS_DIR3_LEAF1_MAGIC
158 : XFS_DIR3_LEAFN_MAGIC;
159
160 if (leaf3->info.hdr.magic != cpu_to_be16(magic3))
161 return __this_address;
162 if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid))
163 return __this_address;
164 if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
165 return __this_address;
166 if (!xfs_log_check_lsn(mp, be64_to_cpu(leaf3->info.lsn)))
167 return __this_address;
168 } else {
169 if (leaf->hdr.info.magic != cpu_to_be16(magic))
170 return __this_address;
171 }
172 154
173 return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf); 155 return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf);
174} 156}
175 157
176static void 158static void
177__read_verify( 159xfs_dir3_leaf_read_verify(
178 struct xfs_buf *bp, 160 struct xfs_buf *bp)
179 uint16_t magic)
180{ 161{
181 struct xfs_mount *mp = bp->b_target->bt_mount; 162 struct xfs_mount *mp = bp->b_target->bt_mount;
182 xfs_failaddr_t fa; 163 xfs_failaddr_t fa;
@@ -185,23 +166,22 @@ __read_verify(
185 !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF)) 166 !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF))
186 xfs_verifier_error(bp, -EFSBADCRC, __this_address); 167 xfs_verifier_error(bp, -EFSBADCRC, __this_address);
187 else { 168 else {
188 fa = xfs_dir3_leaf_verify(bp, magic); 169 fa = xfs_dir3_leaf_verify(bp);
189 if (fa) 170 if (fa)
190 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 171 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
191 } 172 }
192} 173}
193 174
194static void 175static void
195__write_verify( 176xfs_dir3_leaf_write_verify(
196 struct xfs_buf *bp, 177 struct xfs_buf *bp)
197 uint16_t magic)
198{ 178{
199 struct xfs_mount *mp = bp->b_target->bt_mount; 179 struct xfs_mount *mp = bp->b_target->bt_mount;
200 struct xfs_buf_log_item *bip = bp->b_log_item; 180 struct xfs_buf_log_item *bip = bp->b_log_item;
201 struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr; 181 struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
202 xfs_failaddr_t fa; 182 xfs_failaddr_t fa;
203 183
204 fa = xfs_dir3_leaf_verify(bp, magic); 184 fa = xfs_dir3_leaf_verify(bp);
205 if (fa) { 185 if (fa) {
206 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 186 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
207 return; 187 return;
@@ -216,60 +196,22 @@ __write_verify(
216 xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF); 196 xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF);
217} 197}
218 198
219static xfs_failaddr_t
220xfs_dir3_leaf1_verify(
221 struct xfs_buf *bp)
222{
223 return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAF1_MAGIC);
224}
225
226static void
227xfs_dir3_leaf1_read_verify(
228 struct xfs_buf *bp)
229{
230 __read_verify(bp, XFS_DIR2_LEAF1_MAGIC);
231}
232
233static void
234xfs_dir3_leaf1_write_verify(
235 struct xfs_buf *bp)
236{
237 __write_verify(bp, XFS_DIR2_LEAF1_MAGIC);
238}
239
240static xfs_failaddr_t
241xfs_dir3_leafn_verify(
242 struct xfs_buf *bp)
243{
244 return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAFN_MAGIC);
245}
246
247static void
248xfs_dir3_leafn_read_verify(
249 struct xfs_buf *bp)
250{
251 __read_verify(bp, XFS_DIR2_LEAFN_MAGIC);
252}
253
254static void
255xfs_dir3_leafn_write_verify(
256 struct xfs_buf *bp)
257{
258 __write_verify(bp, XFS_DIR2_LEAFN_MAGIC);
259}
260
261const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = { 199const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = {
262 .name = "xfs_dir3_leaf1", 200 .name = "xfs_dir3_leaf1",
263 .verify_read = xfs_dir3_leaf1_read_verify, 201 .magic16 = { cpu_to_be16(XFS_DIR2_LEAF1_MAGIC),
264 .verify_write = xfs_dir3_leaf1_write_verify, 202 cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) },
265 .verify_struct = xfs_dir3_leaf1_verify, 203 .verify_read = xfs_dir3_leaf_read_verify,
204 .verify_write = xfs_dir3_leaf_write_verify,
205 .verify_struct = xfs_dir3_leaf_verify,
266}; 206};
267 207
268const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = { 208const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = {
269 .name = "xfs_dir3_leafn", 209 .name = "xfs_dir3_leafn",
270 .verify_read = xfs_dir3_leafn_read_verify, 210 .magic16 = { cpu_to_be16(XFS_DIR2_LEAFN_MAGIC),
271 .verify_write = xfs_dir3_leafn_write_verify, 211 cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) },
272 .verify_struct = xfs_dir3_leafn_verify, 212 .verify_read = xfs_dir3_leaf_read_verify,
213 .verify_write = xfs_dir3_leaf_write_verify,
214 .verify_struct = xfs_dir3_leaf_verify,
273}; 215};
274 216
275int 217int
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index f1bb3434f51c..3b03703c5c3d 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -87,20 +87,18 @@ xfs_dir3_free_verify(
87 struct xfs_mount *mp = bp->b_target->bt_mount; 87 struct xfs_mount *mp = bp->b_target->bt_mount;
88 struct xfs_dir2_free_hdr *hdr = bp->b_addr; 88 struct xfs_dir2_free_hdr *hdr = bp->b_addr;
89 89
90 if (!xfs_verify_magic(bp, hdr->magic))
91 return __this_address;
92
90 if (xfs_sb_version_hascrc(&mp->m_sb)) { 93 if (xfs_sb_version_hascrc(&mp->m_sb)) {
91 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; 94 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
92 95
93 if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC))
94 return __this_address;
95 if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) 96 if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
96 return __this_address; 97 return __this_address;
97 if (be64_to_cpu(hdr3->blkno) != bp->b_bn) 98 if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
98 return __this_address; 99 return __this_address;
99 if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) 100 if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
100 return __this_address; 101 return __this_address;
101 } else {
102 if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC))
103 return __this_address;
104 } 102 }
105 103
106 /* XXX: should bounds check the xfs_dir3_icfree_hdr here */ 104 /* XXX: should bounds check the xfs_dir3_icfree_hdr here */
@@ -151,6 +149,8 @@ xfs_dir3_free_write_verify(
151 149
152const struct xfs_buf_ops xfs_dir3_free_buf_ops = { 150const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
153 .name = "xfs_dir3_free", 151 .name = "xfs_dir3_free",
152 .magic = { cpu_to_be32(XFS_DIR2_FREE_MAGIC),
153 cpu_to_be32(XFS_DIR3_FREE_MAGIC) },
154 .verify_read = xfs_dir3_free_read_verify, 154 .verify_read = xfs_dir3_free_read_verify,
155 .verify_write = xfs_dir3_free_write_verify, 155 .verify_write = xfs_dir3_free_write_verify,
156 .verify_struct = xfs_dir3_free_verify, 156 .verify_struct = xfs_dir3_free_verify,
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index d293f371dd54..fb5bd9a804f6 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -277,6 +277,8 @@ xfs_dquot_buf_write_verify(
277 277
278const struct xfs_buf_ops xfs_dquot_buf_ops = { 278const struct xfs_buf_ops xfs_dquot_buf_ops = {
279 .name = "xfs_dquot", 279 .name = "xfs_dquot",
280 .magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC),
281 cpu_to_be16(XFS_DQUOT_MAGIC) },
280 .verify_read = xfs_dquot_buf_read_verify, 282 .verify_read = xfs_dquot_buf_read_verify,
281 .verify_write = xfs_dquot_buf_write_verify, 283 .verify_write = xfs_dquot_buf_write_verify,
282 .verify_struct = xfs_dquot_buf_verify_struct, 284 .verify_struct = xfs_dquot_buf_verify_struct,
@@ -284,6 +286,8 @@ const struct xfs_buf_ops xfs_dquot_buf_ops = {
284 286
285const struct xfs_buf_ops xfs_dquot_buf_ra_ops = { 287const struct xfs_buf_ops xfs_dquot_buf_ra_ops = {
286 .name = "xfs_dquot_ra", 288 .name = "xfs_dquot_ra",
289 .magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC),
290 cpu_to_be16(XFS_DQUOT_MAGIC) },
287 .verify_read = xfs_dquot_buf_readahead_verify, 291 .verify_read = xfs_dquot_buf_readahead_verify,
288 .verify_write = xfs_dquot_buf_write_verify, 292 .verify_write = xfs_dquot_buf_write_verify,
289}; 293};
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index 66077a105cbb..79e6c4fb1d8a 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -54,7 +54,8 @@
54#define XFS_ERRTAG_BUF_LRU_REF 31 54#define XFS_ERRTAG_BUF_LRU_REF 31
55#define XFS_ERRTAG_FORCE_SCRUB_REPAIR 32 55#define XFS_ERRTAG_FORCE_SCRUB_REPAIR 32
56#define XFS_ERRTAG_FORCE_SUMMARY_RECALC 33 56#define XFS_ERRTAG_FORCE_SUMMARY_RECALC 33
57#define XFS_ERRTAG_MAX 34 57#define XFS_ERRTAG_IUNLINK_FALLBACK 34
58#define XFS_ERRTAG_MAX 35
58 59
59/* 60/*
60 * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. 61 * Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -93,5 +94,6 @@
93#define XFS_RANDOM_BUF_LRU_REF 2 94#define XFS_RANDOM_BUF_LRU_REF 2
94#define XFS_RANDOM_FORCE_SCRUB_REPAIR 1 95#define XFS_RANDOM_FORCE_SCRUB_REPAIR 1
95#define XFS_RANDOM_FORCE_SUMMARY_RECALC 1 96#define XFS_RANDOM_FORCE_SUMMARY_RECALC 1
97#define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10)
96 98
97#endif /* __XFS_ERRORTAG_H_ */ 99#endif /* __XFS_ERRORTAG_H_ */
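XFS_RANDOM_DEFAULT is 100, so XFS_RANDOM_DEFAULT/10 arms the new tag to fire on roughly one in ten checks. A hypothetical injection site (the real caller lands elsewhere in the series):

	/* Fires on ~1 in 10 evaluations once the errortag is armed. */
	if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_IUNLINK_FALLBACK))
		use_bucket_list_fallback = true;	/* hypothetical flag */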
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index d32152fc8a6c..fe9898875097 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -2508,7 +2508,7 @@ xfs_agi_verify(
2508 /* 2508 /*
2509 * Validate the magic number of the agi block. 2509 * Validate the magic number of the agi block.
2510 */ 2510 */
2511 if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC)) 2511 if (!xfs_verify_magic(bp, agi->agi_magicnum))
2512 return __this_address; 2512 return __this_address;
2513 if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum))) 2513 if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
2514 return __this_address; 2514 return __this_address;
@@ -2582,6 +2582,7 @@ xfs_agi_write_verify(
2582 2582
2583const struct xfs_buf_ops xfs_agi_buf_ops = { 2583const struct xfs_buf_ops xfs_agi_buf_ops = {
2584 .name = "xfs_agi", 2584 .name = "xfs_agi",
2585 .magic = { cpu_to_be32(XFS_AGI_MAGIC), cpu_to_be32(XFS_AGI_MAGIC) },
2585 .verify_read = xfs_agi_read_verify, 2586 .verify_read = xfs_agi_read_verify,
2586 .verify_write = xfs_agi_write_verify, 2587 .verify_write = xfs_agi_write_verify,
2587 .verify_struct = xfs_agi_verify, 2588 .verify_struct = xfs_agi_verify,
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 9b25e7a0df47..1080381ff243 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -124,7 +124,7 @@ xfs_finobt_alloc_block(
124 union xfs_btree_ptr *new, 124 union xfs_btree_ptr *new,
125 int *stat) 125 int *stat)
126{ 126{
127 if (cur->bc_mp->m_inotbt_nores) 127 if (cur->bc_mp->m_finobt_nores)
128 return xfs_inobt_alloc_block(cur, start, new, stat); 128 return xfs_inobt_alloc_block(cur, start, new, stat);
129 return __xfs_inobt_alloc_block(cur, start, new, stat, 129 return __xfs_inobt_alloc_block(cur, start, new, stat,
130 XFS_AG_RESV_METADATA); 130 XFS_AG_RESV_METADATA);
@@ -154,7 +154,7 @@ xfs_finobt_free_block(
154 struct xfs_btree_cur *cur, 154 struct xfs_btree_cur *cur,
155 struct xfs_buf *bp) 155 struct xfs_buf *bp)
156{ 156{
157 if (cur->bc_mp->m_inotbt_nores) 157 if (cur->bc_mp->m_finobt_nores)
158 return xfs_inobt_free_block(cur, bp); 158 return xfs_inobt_free_block(cur, bp);
159 return __xfs_inobt_free_block(cur, bp, XFS_AG_RESV_METADATA); 159 return __xfs_inobt_free_block(cur, bp, XFS_AG_RESV_METADATA);
160} 160}
@@ -260,6 +260,9 @@ xfs_inobt_verify(
260 xfs_failaddr_t fa; 260 xfs_failaddr_t fa;
261 unsigned int level; 261 unsigned int level;
262 262
263 if (!xfs_verify_magic(bp, block->bb_magic))
264 return __this_address;
265
263 /* 266 /*
264 * During growfs operations, we can't verify the exact owner as the 267 * During growfs operations, we can't verify the exact owner as the
265 * perag is not fully initialised and hence not attached to the buffer. 268 * perag is not fully initialised and hence not attached to the buffer.
@@ -270,18 +273,10 @@ xfs_inobt_verify(
270 * but beware of the landmine (i.e. need to check pag->pagi_init) if we 273 * but beware of the landmine (i.e. need to check pag->pagi_init) if we
271 * ever do. 274 * ever do.
272 */ 275 */
273 switch (block->bb_magic) { 276 if (xfs_sb_version_hascrc(&mp->m_sb)) {
274 case cpu_to_be32(XFS_IBT_CRC_MAGIC):
275 case cpu_to_be32(XFS_FIBT_CRC_MAGIC):
276 fa = xfs_btree_sblock_v5hdr_verify(bp); 277 fa = xfs_btree_sblock_v5hdr_verify(bp);
277 if (fa) 278 if (fa)
278 return fa; 279 return fa;
279 /* fall through */
280 case cpu_to_be32(XFS_IBT_MAGIC):
281 case cpu_to_be32(XFS_FIBT_MAGIC):
282 break;
283 default:
284 return __this_address;
285 } 280 }
286 281
287 /* level verification */ 282 /* level verification */
@@ -328,6 +323,16 @@ xfs_inobt_write_verify(
328 323
329const struct xfs_buf_ops xfs_inobt_buf_ops = { 324const struct xfs_buf_ops xfs_inobt_buf_ops = {
330 .name = "xfs_inobt", 325 .name = "xfs_inobt",
326 .magic = { cpu_to_be32(XFS_IBT_MAGIC), cpu_to_be32(XFS_IBT_CRC_MAGIC) },
327 .verify_read = xfs_inobt_read_verify,
328 .verify_write = xfs_inobt_write_verify,
329 .verify_struct = xfs_inobt_verify,
330};
331
332const struct xfs_buf_ops xfs_finobt_buf_ops = {
333 .name = "xfs_finobt",
334 .magic = { cpu_to_be32(XFS_FIBT_MAGIC),
335 cpu_to_be32(XFS_FIBT_CRC_MAGIC) },
331 .verify_read = xfs_inobt_read_verify, 336 .verify_read = xfs_inobt_read_verify,
332 .verify_write = xfs_inobt_write_verify, 337 .verify_write = xfs_inobt_write_verify,
333 .verify_struct = xfs_inobt_verify, 338 .verify_struct = xfs_inobt_verify,
@@ -389,7 +394,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = {
389 .init_rec_from_cur = xfs_inobt_init_rec_from_cur, 394 .init_rec_from_cur = xfs_inobt_init_rec_from_cur,
390 .init_ptr_from_cur = xfs_finobt_init_ptr_from_cur, 395 .init_ptr_from_cur = xfs_finobt_init_ptr_from_cur,
391 .key_diff = xfs_inobt_key_diff, 396 .key_diff = xfs_inobt_key_diff,
392 .buf_ops = &xfs_inobt_buf_ops, 397 .buf_ops = &xfs_finobt_buf_ops,
393 .diff_two_keys = xfs_inobt_diff_two_keys, 398 .diff_two_keys = xfs_inobt_diff_two_keys,
394 .keys_inorder = xfs_inobt_keys_inorder, 399 .keys_inorder = xfs_inobt_keys_inorder,
395 .recs_inorder = xfs_inobt_recs_inorder, 400 .recs_inorder = xfs_inobt_recs_inorder,
diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c
index 771dd072015d..bc690f2409fa 100644
--- a/fs/xfs/libxfs/xfs_iext_tree.c
+++ b/fs/xfs/libxfs/xfs_iext_tree.c
@@ -614,16 +614,15 @@ xfs_iext_realloc_root(
614} 614}
615 615
616/* 616/*
617 * Increment the sequence counter if we are on a COW fork. This allows 617 * Increment the sequence counter on extent tree changes. If we are on a COW
618 * the writeback code to skip looking for a COW extent if the COW fork 618 * fork, this allows the writeback code to skip looking for a COW extent if the
619 * hasn't changed. We use WRITE_ONCE here to ensure the update to the 619 * COW fork hasn't changed. We use WRITE_ONCE here to ensure the update to the
620 * sequence counter is seen before the modifications to the extent 620 * sequence counter is seen before the modifications to the extent tree itself
621 * tree itself take effect. 621 * take effect.
622 */ 622 */
623static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp, int state) 623static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp, int state)
624{ 624{
625 if (state & BMAP_COWFORK) 625 WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1);
626 WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1);
627} 626}
628 627
629void 628void
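Readers pair with this using READ_ONCE: sample if_seq when caching a mapping, then compare before trusting the cache. A sketch (names illustrative, not the exact writeback code):

	static bool
	xfs_cached_map_valid(
		struct xfs_ifork	*ifp,
		unsigned int		cached_seq)
	{
		/* Any extent tree change since the sample stales the cache. */
		return cached_seq == READ_ONCE(ifp->if_seq);
	}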
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 09d9c8cfa4a0..e021d5133ccb 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -97,10 +97,9 @@ xfs_inode_buf_verify(
97 97
98 dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog)); 98 dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog));
99 unlinked_ino = be32_to_cpu(dip->di_next_unlinked); 99 unlinked_ino = be32_to_cpu(dip->di_next_unlinked);
100 di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && 100 di_ok = xfs_verify_magic16(bp, dip->di_magic) &&
101 xfs_dinode_good_version(mp, dip->di_version) && 101 xfs_dinode_good_version(mp, dip->di_version) &&
102 (unlinked_ino == NULLAGINO || 102 xfs_verify_agino_or_null(mp, agno, unlinked_ino);
103 xfs_verify_agino(mp, agno, unlinked_ino));
104 if (unlikely(XFS_TEST_ERROR(!di_ok, mp, 103 if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
105 XFS_ERRTAG_ITOBP_INOTOBP))) { 104 XFS_ERRTAG_ITOBP_INOTOBP))) {
106 if (readahead) { 105 if (readahead) {
@@ -147,12 +146,16 @@ xfs_inode_buf_write_verify(
147 146
148const struct xfs_buf_ops xfs_inode_buf_ops = { 147const struct xfs_buf_ops xfs_inode_buf_ops = {
149 .name = "xfs_inode", 148 .name = "xfs_inode",
149 .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC),
150 cpu_to_be16(XFS_DINODE_MAGIC) },
150 .verify_read = xfs_inode_buf_read_verify, 151 .verify_read = xfs_inode_buf_read_verify,
151 .verify_write = xfs_inode_buf_write_verify, 152 .verify_write = xfs_inode_buf_write_verify,
152}; 153};
153 154
154const struct xfs_buf_ops xfs_inode_buf_ra_ops = { 155const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
155 .name = "xxfs_inode_ra", 156 .name = "xfs_inode_ra",
157 .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC),
158 cpu_to_be16(XFS_DINODE_MAGIC) },
156 .verify_read = xfs_inode_buf_readahead_verify, 159 .verify_read = xfs_inode_buf_readahead_verify,
157 .verify_write = xfs_inode_buf_write_verify, 160 .verify_write = xfs_inode_buf_write_verify,
158}; 161};
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 60361d2d74a1..00c62ce170d0 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -14,7 +14,7 @@ struct xfs_dinode;
14 */ 14 */
15struct xfs_ifork { 15struct xfs_ifork {
16 int if_bytes; /* bytes in if_u1 */ 16 int if_bytes; /* bytes in if_u1 */
17 unsigned int if_seq; /* cow fork mod counter */ 17 unsigned int if_seq; /* fork mod counter */
18 struct xfs_btree_block *if_broot; /* file's incore btree root */ 18 struct xfs_btree_block *if_broot; /* file's incore btree root */
19 short if_broot_bytes; /* bytes allocated for root */ 19 short if_broot_bytes; /* bytes allocated for root */
20 unsigned char if_flags; /* per-fork flags */ 20 unsigned char if_flags; /* per-fork flags */
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index d9eab657b63e..6f47ab876d90 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -209,7 +209,7 @@ xfs_refcountbt_verify(
209 xfs_failaddr_t fa; 209 xfs_failaddr_t fa;
210 unsigned int level; 210 unsigned int level;
211 211
212 if (block->bb_magic != cpu_to_be32(XFS_REFC_CRC_MAGIC)) 212 if (!xfs_verify_magic(bp, block->bb_magic))
213 return __this_address; 213 return __this_address;
214 214
215 if (!xfs_sb_version_hasreflink(&mp->m_sb)) 215 if (!xfs_sb_version_hasreflink(&mp->m_sb))
@@ -264,6 +264,7 @@ xfs_refcountbt_write_verify(
264 264
265const struct xfs_buf_ops xfs_refcountbt_buf_ops = { 265const struct xfs_buf_ops xfs_refcountbt_buf_ops = {
266 .name = "xfs_refcountbt", 266 .name = "xfs_refcountbt",
267 .magic = { 0, cpu_to_be32(XFS_REFC_CRC_MAGIC) },
267 .verify_read = xfs_refcountbt_read_verify, 268 .verify_read = xfs_refcountbt_read_verify,
268 .verify_write = xfs_refcountbt_write_verify, 269 .verify_write = xfs_refcountbt_write_verify,
269 .verify_struct = xfs_refcountbt_verify, 270 .verify_struct = xfs_refcountbt_verify,
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index f79cf040d745..5738e11055e6 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -310,7 +310,7 @@ xfs_rmapbt_verify(
310 * from the on disk AGF. Again, we can only check against maximum limits 310 * from the on disk AGF. Again, we can only check against maximum limits
311 * in this case. 311 * in this case.
312 */ 312 */
313 if (block->bb_magic != cpu_to_be32(XFS_RMAP_CRC_MAGIC)) 313 if (!xfs_verify_magic(bp, block->bb_magic))
314 return __this_address; 314 return __this_address;
315 315
316 if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) 316 if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
@@ -365,6 +365,7 @@ xfs_rmapbt_write_verify(
365 365
366const struct xfs_buf_ops xfs_rmapbt_buf_ops = { 366const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
367 .name = "xfs_rmapbt", 367 .name = "xfs_rmapbt",
368 .magic = { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
368 .verify_read = xfs_rmapbt_read_verify, 369 .verify_read = xfs_rmapbt_read_verify,
369 .verify_write = xfs_rmapbt_write_verify, 370 .verify_write = xfs_rmapbt_write_verify,
370 .verify_struct = xfs_rmapbt_verify, 371 .verify_struct = xfs_rmapbt_verify,
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index b5a82acd7dfe..77a3a4085de3 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -225,10 +225,11 @@ xfs_validate_sb_common(
225 struct xfs_buf *bp, 225 struct xfs_buf *bp,
226 struct xfs_sb *sbp) 226 struct xfs_sb *sbp)
227{ 227{
228 struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
228 uint32_t agcount = 0; 229 uint32_t agcount = 0;
229 uint32_t rem; 230 uint32_t rem;
230 231
231 if (sbp->sb_magicnum != XFS_SB_MAGIC) { 232 if (!xfs_verify_magic(bp, dsb->sb_magicnum)) {
232 xfs_warn(mp, "bad magic number"); 233 xfs_warn(mp, "bad magic number");
233 return -EWRONGFS; 234 return -EWRONGFS;
234 } 235 }
@@ -781,12 +782,14 @@ out_error:
781 782
782const struct xfs_buf_ops xfs_sb_buf_ops = { 783const struct xfs_buf_ops xfs_sb_buf_ops = {
783 .name = "xfs_sb", 784 .name = "xfs_sb",
785 .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) },
784 .verify_read = xfs_sb_read_verify, 786 .verify_read = xfs_sb_read_verify,
785 .verify_write = xfs_sb_write_verify, 787 .verify_write = xfs_sb_write_verify,
786}; 788};
787 789
788const struct xfs_buf_ops xfs_sb_quiet_buf_ops = { 790const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
789 .name = "xfs_sb_quiet", 791 .name = "xfs_sb_quiet",
792 .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) },
790 .verify_read = xfs_sb_quiet_read_verify, 793 .verify_read = xfs_sb_quiet_read_verify,
791 .verify_write = xfs_sb_write_verify, 794 .verify_write = xfs_sb_write_verify,
792}; 795};
@@ -874,7 +877,7 @@ xfs_initialize_perag_data(
874 uint64_t bfreelst = 0; 877 uint64_t bfreelst = 0;
875 uint64_t btree = 0; 878 uint64_t btree = 0;
876 uint64_t fdblocks; 879 uint64_t fdblocks;
877 int error; 880 int error = 0;
878 881
879 for (index = 0; index < agcount; index++) { 882 for (index = 0; index < agcount; index++) {
880 /* 883 /*
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 1c5debe748f0..4e909791aeac 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -25,7 +25,8 @@ extern const struct xfs_buf_ops xfs_agf_buf_ops;
25extern const struct xfs_buf_ops xfs_agi_buf_ops; 25extern const struct xfs_buf_ops xfs_agi_buf_ops;
26extern const struct xfs_buf_ops xfs_agf_buf_ops; 26extern const struct xfs_buf_ops xfs_agf_buf_ops;
27extern const struct xfs_buf_ops xfs_agfl_buf_ops; 27extern const struct xfs_buf_ops xfs_agfl_buf_ops;
28extern const struct xfs_buf_ops xfs_allocbt_buf_ops; 28extern const struct xfs_buf_ops xfs_bnobt_buf_ops;
29extern const struct xfs_buf_ops xfs_cntbt_buf_ops;
29extern const struct xfs_buf_ops xfs_rmapbt_buf_ops; 30extern const struct xfs_buf_ops xfs_rmapbt_buf_ops;
30extern const struct xfs_buf_ops xfs_refcountbt_buf_ops; 31extern const struct xfs_buf_ops xfs_refcountbt_buf_ops;
31extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops; 32extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
@@ -36,6 +37,7 @@ extern const struct xfs_buf_ops xfs_dquot_buf_ops;
36extern const struct xfs_buf_ops xfs_symlink_buf_ops; 37extern const struct xfs_buf_ops xfs_symlink_buf_ops;
37extern const struct xfs_buf_ops xfs_agi_buf_ops; 38extern const struct xfs_buf_ops xfs_agi_buf_ops;
38extern const struct xfs_buf_ops xfs_inobt_buf_ops; 39extern const struct xfs_buf_ops xfs_inobt_buf_ops;
40extern const struct xfs_buf_ops xfs_finobt_buf_ops;
39extern const struct xfs_buf_ops xfs_inode_buf_ops; 41extern const struct xfs_buf_ops xfs_inode_buf_ops;
40extern const struct xfs_buf_ops xfs_inode_buf_ra_ops; 42extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
41extern const struct xfs_buf_ops xfs_dquot_buf_ops; 43extern const struct xfs_buf_ops xfs_dquot_buf_ops;
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index 77d80106f989..a0ccc253c43d 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -95,7 +95,7 @@ xfs_symlink_verify(
95 95
96 if (!xfs_sb_version_hascrc(&mp->m_sb)) 96 if (!xfs_sb_version_hascrc(&mp->m_sb))
97 return __this_address; 97 return __this_address;
98 if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC)) 98 if (!xfs_verify_magic(bp, dsl->sl_magic))
99 return __this_address; 99 return __this_address;
100 if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid)) 100 if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid))
101 return __this_address; 101 return __this_address;
@@ -159,6 +159,7 @@ xfs_symlink_write_verify(
159 159
160const struct xfs_buf_ops xfs_symlink_buf_ops = { 160const struct xfs_buf_ops xfs_symlink_buf_ops = {
161 .name = "xfs_symlink", 161 .name = "xfs_symlink",
162 .magic = { 0, cpu_to_be32(XFS_SYMLINK_MAGIC) },
162 .verify_read = xfs_symlink_read_verify, 163 .verify_read = xfs_symlink_read_verify,
163 .verify_write = xfs_symlink_write_verify, 164 .verify_write = xfs_symlink_write_verify,
164 .verify_struct = xfs_symlink_verify, 165 .verify_struct = xfs_symlink_verify,
diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c
index 3306fc42cfad..de310712dd6d 100644
--- a/fs/xfs/libxfs/xfs_types.c
+++ b/fs/xfs/libxfs/xfs_types.c
@@ -116,6 +116,19 @@ xfs_verify_agino(
116} 116}
117 117
118/* 118/*
119 * Verify that an AG inode number pointer is either NULLAGINO or points
120 * neither outside the AG nor at static metadata.
121 */
122bool
123xfs_verify_agino_or_null(
124 struct xfs_mount *mp,
125 xfs_agnumber_t agno,
126 xfs_agino_t agino)
127{
128 return agino == NULLAGINO || xfs_verify_agino(mp, agno, agino);
129}
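This wraps the open-coded "agino == NULLAGINO || xfs_verify_agino(...)" pattern that the inode-buffer and AGI scrub hunks in this series replace; a typical verifier check becomes:

	if (!xfs_verify_agino_or_null(mp, agno, unlinked_ino))
		return __this_address;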
130
131/*
119 * Verify that an FS inode number pointer neither points outside the 132 * Verify that an FS inode number pointer neither points outside the
120 * filesystem nor points at static AG metadata. 133 * filesystem nor points at static AG metadata.
121 */ 134 */
@@ -204,3 +217,14 @@ xfs_verify_icount(
204 xfs_icount_range(mp, &min, &max); 217 xfs_icount_range(mp, &min, &max);
205 return icount >= min && icount <= max; 218 return icount >= min && icount <= max;
206} 219}
220
221/* Sanity-checking of dir/attr block offsets. */
222bool
223xfs_verify_dablk(
224 struct xfs_mount *mp,
225 xfs_fileoff_t dabno)
226{
227 xfs_dablk_t max_dablk = -1U;
228
229 return dabno <= max_dablk;
230}
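Since xfs_dablk_t is 32 bits wide while xfs_fileoff_t is 64, the check reduces to "does the offset fit in 32 bits". For example (hypothetical values):

	xfs_verify_dablk(mp, 0xffffffffULL);	/* true: largest addressable */
	xfs_verify_dablk(mp, 0x100000000ULL);	/* false: needs a 33rd bit */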
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 8f02855a019a..c5a25403b4db 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -183,10 +183,13 @@ void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno,
183 xfs_agino_t *first, xfs_agino_t *last); 183 xfs_agino_t *first, xfs_agino_t *last);
184bool xfs_verify_agino(struct xfs_mount *mp, xfs_agnumber_t agno, 184bool xfs_verify_agino(struct xfs_mount *mp, xfs_agnumber_t agno,
185 xfs_agino_t agino); 185 xfs_agino_t agino);
186bool xfs_verify_agino_or_null(struct xfs_mount *mp, xfs_agnumber_t agno,
187 xfs_agino_t agino);
186bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino); 188bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino);
187bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino); 189bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino);
188bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino); 190bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino);
189bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno); 191bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
190bool xfs_verify_icount(struct xfs_mount *mp, unsigned long long icount); 192bool xfs_verify_icount(struct xfs_mount *mp, unsigned long long icount);
193bool xfs_verify_dablk(struct xfs_mount *mp, xfs_fileoff_t off);
191 194
192#endif /* __XFS_TYPES_H__ */ 195#endif /* __XFS_TYPES_H__ */
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 90955ab1e895..ddf06bfaa29d 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -399,7 +399,7 @@ xchk_agf_xref_cntbt(
399 if (!xchk_should_check_xref(sc, &error, &sc->sa.cnt_cur)) 399 if (!xchk_should_check_xref(sc, &error, &sc->sa.cnt_cur))
400 return; 400 return;
401 if (!have) { 401 if (!have) {
402 if (agf->agf_freeblks != be32_to_cpu(0)) 402 if (agf->agf_freeblks != cpu_to_be32(0))
403 xchk_block_xref_set_corrupt(sc, sc->sa.agf_bp); 403 xchk_block_xref_set_corrupt(sc, sc->sa.agf_bp);
404 return; 404 return;
405 } 405 }
@@ -864,19 +864,17 @@ xchk_agi(
864 864
865 /* Check inode pointers */ 865 /* Check inode pointers */
866 agino = be32_to_cpu(agi->agi_newino); 866 agino = be32_to_cpu(agi->agi_newino);
867 if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino)) 867 if (!xfs_verify_agino_or_null(mp, agno, agino))
868 xchk_block_set_corrupt(sc, sc->sa.agi_bp); 868 xchk_block_set_corrupt(sc, sc->sa.agi_bp);
869 869
870 agino = be32_to_cpu(agi->agi_dirino); 870 agino = be32_to_cpu(agi->agi_dirino);
871 if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino)) 871 if (!xfs_verify_agino_or_null(mp, agno, agino))
872 xchk_block_set_corrupt(sc, sc->sa.agi_bp); 872 xchk_block_set_corrupt(sc, sc->sa.agi_bp);
873 873
874 /* Check unlinked inode buckets */ 874 /* Check unlinked inode buckets */
875 for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { 875 for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
876 agino = be32_to_cpu(agi->agi_unlinked[i]); 876 agino = be32_to_cpu(agi->agi_unlinked[i]);
877 if (agino == NULLAGINO) 877 if (!xfs_verify_agino_or_null(mp, agno, agino))
878 continue;
879 if (!xfs_verify_agino(mp, agno, agino))
880 xchk_block_set_corrupt(sc, sc->sa.agi_bp); 878 xchk_block_set_corrupt(sc, sc->sa.agi_bp);
881 } 879 }
882 880
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index 03d1e15cceba..64e31f87d490 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -341,23 +341,19 @@ xrep_agf(
341 struct xrep_find_ag_btree fab[XREP_AGF_MAX] = { 341 struct xrep_find_ag_btree fab[XREP_AGF_MAX] = {
342 [XREP_AGF_BNOBT] = { 342 [XREP_AGF_BNOBT] = {
343 .rmap_owner = XFS_RMAP_OWN_AG, 343 .rmap_owner = XFS_RMAP_OWN_AG,
344 .buf_ops = &xfs_allocbt_buf_ops, 344 .buf_ops = &xfs_bnobt_buf_ops,
345 .magic = XFS_ABTB_CRC_MAGIC,
346 }, 345 },
347 [XREP_AGF_CNTBT] = { 346 [XREP_AGF_CNTBT] = {
348 .rmap_owner = XFS_RMAP_OWN_AG, 347 .rmap_owner = XFS_RMAP_OWN_AG,
349 .buf_ops = &xfs_allocbt_buf_ops, 348 .buf_ops = &xfs_cntbt_buf_ops,
350 .magic = XFS_ABTC_CRC_MAGIC,
351 }, 349 },
352 [XREP_AGF_RMAPBT] = { 350 [XREP_AGF_RMAPBT] = {
353 .rmap_owner = XFS_RMAP_OWN_AG, 351 .rmap_owner = XFS_RMAP_OWN_AG,
354 .buf_ops = &xfs_rmapbt_buf_ops, 352 .buf_ops = &xfs_rmapbt_buf_ops,
355 .magic = XFS_RMAP_CRC_MAGIC,
356 }, 353 },
357 [XREP_AGF_REFCOUNTBT] = { 354 [XREP_AGF_REFCOUNTBT] = {
358 .rmap_owner = XFS_RMAP_OWN_REFC, 355 .rmap_owner = XFS_RMAP_OWN_REFC,
359 .buf_ops = &xfs_refcountbt_buf_ops, 356 .buf_ops = &xfs_refcountbt_buf_ops,
360 .magic = XFS_REFC_CRC_MAGIC,
361 }, 357 },
362 [XREP_AGF_END] = { 358 [XREP_AGF_END] = {
363 .buf_ops = NULL, 359 .buf_ops = NULL,
@@ -875,12 +871,10 @@ xrep_agi(
875 [XREP_AGI_INOBT] = { 871 [XREP_AGI_INOBT] = {
876 .rmap_owner = XFS_RMAP_OWN_INOBT, 872 .rmap_owner = XFS_RMAP_OWN_INOBT,
877 .buf_ops = &xfs_inobt_buf_ops, 873 .buf_ops = &xfs_inobt_buf_ops,
878 .magic = XFS_IBT_CRC_MAGIC,
879 }, 874 },
880 [XREP_AGI_FINOBT] = { 875 [XREP_AGI_FINOBT] = {
881 .rmap_owner = XFS_RMAP_OWN_INOBT, 876 .rmap_owner = XFS_RMAP_OWN_INOBT,
882 .buf_ops = &xfs_inobt_buf_ops, 877 .buf_ops = &xfs_finobt_buf_ops,
883 .magic = XFS_FIBT_CRC_MAGIC,
884 }, 878 },
885 [XREP_AGI_END] = { 879 [XREP_AGI_END] = {
886 .buf_ops = NULL 880 .buf_ops = NULL
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index 81d5e90547a1..dce74ec57038 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -82,12 +82,23 @@ xchk_xattr_listent(
82 82
83 sx = container_of(context, struct xchk_xattr, context); 83 sx = container_of(context, struct xchk_xattr, context);
84 84
85 if (xchk_should_terminate(sx->sc, &error)) {
86 context->seen_enough = 1;
87 return;
88 }
89
85 if (flags & XFS_ATTR_INCOMPLETE) { 90 if (flags & XFS_ATTR_INCOMPLETE) {
86 /* Incomplete attr key, just mark the inode for preening. */ 91 /* Incomplete attr key, just mark the inode for preening. */
87 xchk_ino_set_preen(sx->sc, context->dp->i_ino); 92 xchk_ino_set_preen(sx->sc, context->dp->i_ino);
88 return; 93 return;
89 } 94 }
90 95
96 /* Does this name make sense? */
97 if (!xfs_attr_namecheck(name, namelen)) {
98 xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno);
99 return;
100 }
101
91 args.flags = ATTR_KERNOTIME; 102 args.flags = ATTR_KERNOTIME;
92 if (flags & XFS_ATTR_ROOT) 103 if (flags & XFS_ATTR_ROOT)
93 args.flags |= ATTR_ROOT; 104 args.flags |= ATTR_ROOT;
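xfs_attr_namecheck() is introduced elsewhere in this series; it presumably mirrors xfs_dir2_namecheck() minus the slash test, since attr names may legally contain '/'. Roughly:

	bool
	xfs_attr_namecheck(
		const void	*name,
		size_t		length)
	{
		/* MAXNAMELEN includes the trailing null, so use >= here too. */
		if (length >= MAXNAMELEN)
			return false;

		/* There shouldn't be any nulls here. */
		return !memchr(name, 0, length);
	}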
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index e1d11f3223e3..a703cd58a90e 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -281,6 +281,31 @@ xchk_bmap_extent_xref(
281 xchk_ag_free(info->sc, &info->sc->sa); 281 xchk_ag_free(info->sc, &info->sc->sa);
282} 282}
283 283
284/*
285 * Directories and attr forks should never have blocks that can't be addressed
286 * by a xfs_dablk_t.
287 */
288STATIC void
289xchk_bmap_dirattr_extent(
290 struct xfs_inode *ip,
291 struct xchk_bmap_info *info,
292 struct xfs_bmbt_irec *irec)
293{
294 struct xfs_mount *mp = ip->i_mount;
295 xfs_fileoff_t off;
296
297 if (!S_ISDIR(VFS_I(ip)->i_mode) && info->whichfork != XFS_ATTR_FORK)
298 return;
299
300 if (!xfs_verify_dablk(mp, irec->br_startoff))
301 xchk_fblock_set_corrupt(info->sc, info->whichfork,
302 irec->br_startoff);
303
304 off = irec->br_startoff + irec->br_blockcount - 1;
305 if (!xfs_verify_dablk(mp, off))
306 xchk_fblock_set_corrupt(info->sc, info->whichfork, off);
307}
308
284/* Scrub a single extent record. */ 309/* Scrub a single extent record. */
285STATIC int 310STATIC int
286xchk_bmap_extent( 311xchk_bmap_extent(
@@ -305,6 +330,8 @@ xchk_bmap_extent(
305 xchk_fblock_set_corrupt(info->sc, info->whichfork, 330 xchk_fblock_set_corrupt(info->sc, info->whichfork,
306 irec->br_startoff); 331 irec->br_startoff);
307 332
333 xchk_bmap_dirattr_extent(ip, info, irec);
334
308 /* There should never be a "hole" extent in either extent list. */ 335 /* There should never be a "hole" extent in either extent list. */
309 if (irec->br_startblock == HOLESTARTBLOCK) 336 if (irec->br_startblock == HOLESTARTBLOCK)
310 xchk_fblock_set_corrupt(info->sc, info->whichfork, 337 xchk_fblock_set_corrupt(info->sc, info->whichfork,
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index cd3e4d768a18..a38a22785a1a 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -129,6 +129,12 @@ xchk_dir_actor(
129 goto out; 129 goto out;
130 } 130 }
131 131
132 /* Does this name make sense? */
133 if (!xfs_dir2_namecheck(name, namelen)) {
134 xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset);
135 goto out;
136 }
137
132 if (!strncmp(".", name, namelen)) { 138 if (!strncmp(".", name, namelen)) {
133 /* If this is "." then check that the inum matches the dir. */ 139 /* If this is "." then check that the inum matches the dir. */
134 if (xfs_sb_version_hasftype(&mp->m_sb) && type != DT_DIR) 140 if (xfs_sb_version_hasftype(&mp->m_sb) && type != DT_DIR)
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index 882dc56c5c21..700114f79a7d 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -47,6 +47,12 @@ xchk_setup_ag_iallocbt(
47struct xchk_iallocbt { 47struct xchk_iallocbt {
48 /* Number of inodes we see while scanning inobt. */ 48 /* Number of inodes we see while scanning inobt. */
49 unsigned long long inodes; 49 unsigned long long inodes;
50
51 /* Expected next startino, for big block filesystems. */
52 xfs_agino_t next_startino;
53
54 /* Expected end of the current inode cluster. */
55 xfs_agino_t next_cluster_ino;
50}; 56};
51 57
52/* 58/*
@@ -128,41 +134,57 @@ xchk_iallocbt_freecount(
 	return hweight64(freemask);
 }
 
-/* Check a particular inode with ir_free. */
+/*
+ * Check that an inode's allocation status matches ir_free in the inobt
+ * record.  First we try querying the in-core inode state, and if the inode
+ * isn't loaded we examine the on-disk inode directly.
+ *
+ * Since there can be 1:M and M:1 mappings between inobt records and inode
+ * clusters, we pass in the inode location information as an inobt record;
+ * the index of an inode cluster within the inobt record (as well as the
+ * cluster buffer itself); and the index of the inode within the cluster.
+ *
+ * @irec is the inobt record.
+ * @irec_ino is the inode offset from the start of the record.
+ * @dip is the on-disk inode.
+ */
 STATIC int
-xchk_iallocbt_check_cluster_freemask(
+xchk_iallocbt_check_cluster_ifree(
 	struct xchk_btree		*bs,
-	xfs_ino_t			fsino,
-	xfs_agino_t			chunkino,
-	xfs_agino_t			clusterino,
 	struct xfs_inobt_rec_incore	*irec,
-	struct xfs_buf			*bp)
+	unsigned int			irec_ino,
+	struct xfs_dinode		*dip)
 {
-	struct xfs_dinode		*dip;
 	struct xfs_mount		*mp = bs->cur->bc_mp;
-	bool				inode_is_free = false;
+	xfs_ino_t			fsino;
+	xfs_agino_t			agino;
+	bool				irec_free;
+	bool				ino_inuse;
 	bool				freemask_ok;
-	bool				inuse;
 	int				error = 0;
 
 	if (xchk_should_terminate(bs->sc, &error))
 		return error;
 
-	dip = xfs_buf_offset(bp, clusterino * mp->m_sb.sb_inodesize);
+	/*
+	 * Given an inobt record and the offset of an inode from the start of
+	 * the record, compute which fs inode we're talking about.
+	 */
+	agino = irec->ir_startino + irec_ino;
+	fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino);
+	irec_free = (irec->ir_free & XFS_INOBT_MASK(irec_ino));
+
 	if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC ||
-	    (dip->di_version >= 3 &&
-	     be64_to_cpu(dip->di_ino) != fsino + clusterino)) {
+	    (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino)) {
 		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
 		goto out;
 	}
 
-	if (irec->ir_free & XFS_INOBT_MASK(chunkino + clusterino))
-		inode_is_free = true;
-	error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp,
-			fsino + clusterino, &inuse);
+	error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, fsino,
+			&ino_inuse);
 	if (error == -ENODATA) {
 		/* Not cached, just read the disk buffer */
-		freemask_ok = inode_is_free ^ !!(dip->di_mode);
+		freemask_ok = irec_free ^ !!(dip->di_mode);
 		if (!bs->sc->try_harder && !freemask_ok)
 			return -EDEADLOCK;
 	} else if (error < 0) {
@@ -174,7 +196,7 @@ xchk_iallocbt_check_cluster_freemask(
 		goto out;
 	} else {
 		/* Inode is all there. */
-		freemask_ok = inode_is_free ^ inuse;
+		freemask_ok = irec_free ^ ino_inuse;
 	}
 	if (!freemask_ok)
 		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
@@ -182,86 +204,221 @@ out:
 	return 0;
 }
 
-/* Make sure the free mask is consistent with what the inodes think. */
+/*
+ * Check that the holemask and freemask of a hypothetical inode cluster match
+ * what's actually on disk.  If sparse inodes are enabled, the cluster does
+ * not actually have to map to inodes if the corresponding holemask bit is set.
+ *
+ * @cluster_base is the first inode in the cluster within the @irec.
+ */
 STATIC int
-xchk_iallocbt_check_freemask(
+xchk_iallocbt_check_cluster(
 	struct xchk_btree		*bs,
-	struct xfs_inobt_rec_incore	*irec)
+	struct xfs_inobt_rec_incore	*irec,
+	unsigned int			cluster_base)
 {
 	struct xfs_imap			imap;
 	struct xfs_mount		*mp = bs->cur->bc_mp;
 	struct xfs_dinode		*dip;
-	struct xfs_buf			*bp;
-	xfs_ino_t			fsino;
-	xfs_agino_t			nr_inodes;
-	xfs_agino_t			agino;
-	xfs_agino_t			chunkino;
-	xfs_agino_t			clusterino;
+	struct xfs_buf			*cluster_bp;
+	unsigned int			nr_inodes;
+	xfs_agnumber_t			agno = bs->cur->bc_private.a.agno;
 	xfs_agblock_t			agbno;
-	uint16_t			holemask;
+	unsigned int			cluster_index;
+	uint16_t			cluster_mask = 0;
 	uint16_t			ir_holemask;
 	int				error = 0;
 
-	/* Make sure the freemask matches the inode records. */
-	nr_inodes = mp->m_inodes_per_cluster;
-
-	for (agino = irec->ir_startino;
-	     agino < irec->ir_startino + XFS_INODES_PER_CHUNK;
-	     agino += mp->m_inodes_per_cluster) {
-		fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino);
-		chunkino = agino - irec->ir_startino;
-		agbno = XFS_AGINO_TO_AGBNO(mp, agino);
-
-		/* Compute the holemask mask for this cluster. */
-		for (clusterino = 0, holemask = 0; clusterino < nr_inodes;
-		     clusterino += XFS_INODES_PER_HOLEMASK_BIT)
-			holemask |= XFS_INOBT_MASK((chunkino + clusterino) /
-					XFS_INODES_PER_HOLEMASK_BIT);
-
-		/* The whole cluster must be a hole or not a hole. */
-		ir_holemask = (irec->ir_holemask & holemask);
-		if (ir_holemask != holemask && ir_holemask != 0) {
+	nr_inodes = min_t(unsigned int, XFS_INODES_PER_CHUNK,
+			mp->m_inodes_per_cluster);
+
+	/* Map this inode cluster */
+	agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino + cluster_base);
+
+	/* Compute a bitmask for this cluster that can be used for holemask. */
+	for (cluster_index = 0;
+	     cluster_index < nr_inodes;
+	     cluster_index += XFS_INODES_PER_HOLEMASK_BIT)
+		cluster_mask |= XFS_INOBT_MASK((cluster_base + cluster_index) /
+				XFS_INODES_PER_HOLEMASK_BIT);
+
+	/*
+	 * Map the first inode of this cluster to a buffer and offset.
+	 * Be careful about inobt records that don't align with the start of
+	 * the inode buffer when block sizes are large enough to hold multiple
+	 * inode chunks.  When this happens, cluster_base will be zero but
+	 * ir_startino can be large enough to make im_boffset nonzero.
+	 */
+	ir_holemask = (irec->ir_holemask & cluster_mask);
+	imap.im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
+	imap.im_len = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster);
+	imap.im_boffset = XFS_INO_TO_OFFSET(mp, irec->ir_startino);
+
+	if (imap.im_boffset != 0 && cluster_base != 0) {
+		ASSERT(imap.im_boffset == 0 || cluster_base == 0);
+		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+		return 0;
+	}
+
+	trace_xchk_iallocbt_check_cluster(mp, agno, irec->ir_startino,
+			imap.im_blkno, imap.im_len, cluster_base, nr_inodes,
+			cluster_mask, ir_holemask,
+			XFS_INO_TO_OFFSET(mp, irec->ir_startino +
+					  cluster_base));
+
+	/* The whole cluster must be a hole or not a hole. */
+	if (ir_holemask != cluster_mask && ir_holemask != 0) {
+		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+		return 0;
+	}
+
+	/* If any part of this is a hole, skip it. */
+	if (ir_holemask) {
+		xchk_xref_is_not_owned_by(bs->sc, agbno,
+				mp->m_blocks_per_cluster,
+				&XFS_RMAP_OINFO_INODES);
+		return 0;
+	}
+
+	xchk_xref_is_owned_by(bs->sc, agbno, mp->m_blocks_per_cluster,
+			&XFS_RMAP_OINFO_INODES);
+
+	/* Grab the inode cluster buffer. */
+	error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &dip, &cluster_bp,
+			0, 0);
+	if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0, &error))
+		return error;
+
+	/* Check free status of each inode within this cluster. */
+	for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) {
+		struct xfs_dinode	*dip;
+
+		if (imap.im_boffset >= BBTOB(cluster_bp->b_length)) {
 			xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
-			continue;
+			break;
 		}
 
-		/* If any part of this is a hole, skip it. */
-		if (ir_holemask) {
-			xchk_xref_is_not_owned_by(bs->sc, agbno,
-					mp->m_blocks_per_cluster,
-					&XFS_RMAP_OINFO_INODES);
-			continue;
+		dip = xfs_buf_offset(cluster_bp, imap.im_boffset);
+		error = xchk_iallocbt_check_cluster_ifree(bs, irec,
+				cluster_base + cluster_index, dip);
+		if (error)
+			break;
+		imap.im_boffset += mp->m_sb.sb_inodesize;
+	}
+
+	xfs_trans_brelse(bs->cur->bc_tp, cluster_bp);
+	return error;
+}
+
+/*
+ * For all the inode clusters that could map to this inobt record, make sure
+ * that the holemask makes sense and that the allocation status of each inode
+ * matches the freemask.
+ */
+STATIC int
+xchk_iallocbt_check_clusters(
+	struct xchk_btree		*bs,
+	struct xfs_inobt_rec_incore	*irec)
+{
+	unsigned int			cluster_base;
+	int				error = 0;
+
+	/*
+	 * For the common case where this inobt record maps to multiple inode
+	 * clusters this will call _check_cluster for each cluster.
+	 *
+	 * For the case that multiple inobt records map to a single cluster,
+	 * this will call _check_cluster once.
+	 */
+	for (cluster_base = 0;
+	     cluster_base < XFS_INODES_PER_CHUNK;
+	     cluster_base += bs->sc->mp->m_inodes_per_cluster) {
+		error = xchk_iallocbt_check_cluster(bs, irec, cluster_base);
+		if (error)
+			break;
+	}
+
+	return error;
+}
+
+/*
+ * Make sure this inode btree record is aligned properly.  Because a fs block
+ * contains multiple inodes, we check that the inobt record is aligned to the
+ * correct inode, not just the correct block on disk.  This results in a finer
+ * grained corruption check.
+ */
+STATIC void
+xchk_iallocbt_rec_alignment(
+	struct xchk_btree		*bs,
+	struct xfs_inobt_rec_incore	*irec)
+{
+	struct xfs_mount		*mp = bs->sc->mp;
+	struct xchk_iallocbt		*iabt = bs->private;
+
+	/*
+	 * finobt records have different positioning requirements than inobt
+	 * records: each finobt record must have a corresponding inobt record.
+	 * That is checked in the xref function, so for now we only catch the
+	 * obvious case where the record isn't at all aligned properly.
+	 *
+	 * Note that if a fs block contains more than a single chunk of inodes,
+	 * we will have finobt records only for those chunks containing free
+	 * inodes, and therefore expect chunk alignment of finobt records.
+	 * Otherwise, we expect that the finobt record is aligned to the
+	 * cluster alignment as told by the superblock.
+	 */
+	if (bs->cur->bc_btnum == XFS_BTNUM_FINO) {
+		unsigned int	imask;
+
+		imask = min_t(unsigned int, XFS_INODES_PER_CHUNK,
+				mp->m_cluster_align_inodes) - 1;
+		if (irec->ir_startino & imask)
+			xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+		return;
+	}
+
+	if (iabt->next_startino != NULLAGINO) {
+		/*
+		 * We're midway through a cluster of inodes that is mapped by
+		 * multiple inobt records.  Did we get the record for the next
+		 * irec in the sequence?
+		 */
+		if (irec->ir_startino != iabt->next_startino) {
+			xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+			return;
 		}
 
-		xchk_xref_is_owned_by(bs->sc, agbno, mp->m_blocks_per_cluster,
-				&XFS_RMAP_OINFO_INODES);
+		iabt->next_startino += XFS_INODES_PER_CHUNK;
 
-		/* Grab the inode cluster buffer. */
-		imap.im_blkno = XFS_AGB_TO_DADDR(mp, bs->cur->bc_private.a.agno,
-				agbno);
-		imap.im_len = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster);
-		imap.im_boffset = 0;
-
-		error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap,
-				&dip, &bp, 0, 0);
-		if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0,
-				&error))
-			continue;
-
-		/* Which inodes are free? */
-		for (clusterino = 0; clusterino < nr_inodes; clusterino++) {
-			error = xchk_iallocbt_check_cluster_freemask(bs,
-					fsino, chunkino, clusterino, irec, bp);
-			if (error) {
-				xfs_trans_brelse(bs->cur->bc_tp, bp);
-				return error;
-			}
+		/* Are we done with the cluster? */
+		if (iabt->next_startino >= iabt->next_cluster_ino) {
+			iabt->next_startino = NULLAGINO;
+			iabt->next_cluster_ino = NULLAGINO;
 		}
+		return;
+	}
+
+	/* inobt records must be aligned to cluster and inoalignmnt size. */
+	if (irec->ir_startino & (mp->m_cluster_align_inodes - 1)) {
+		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+		return;
+	}
 
-		xfs_trans_brelse(bs->cur->bc_tp, bp);
+	if (irec->ir_startino & (mp->m_inodes_per_cluster - 1)) {
+		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+		return;
 	}
 
-	return error;
+	if (mp->m_inodes_per_cluster <= XFS_INODES_PER_CHUNK)
+		return;
+
+	/*
+	 * If this is the start of an inode cluster that can be mapped by
+	 * multiple inobt records, the next inobt record must follow exactly
+	 * after this one.
+	 */
+	iabt->next_startino = irec->ir_startino + XFS_INODES_PER_CHUNK;
+	iabt->next_cluster_ino = irec->ir_startino + mp->m_inodes_per_cluster;
 }
 
 /* Scrub an inobt/finobt record. */
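
A standalone sketch of the record-chaining state machine that xchk_iallocbt_rec_alignment() implements above, under the assumptions that inodes_per_cluster is a power of two and that the state starts out as NULLAGINO (as xchk_iallocbt() initializes it); the finobt special case and the separate superblock cluster-alignment check are omitted for brevity.

#include <stdbool.h>
#include <stdint.h>

#define INODES_PER_CHUNK	64
#define NULLAGINO		((uint32_t)-1)

/* chaining state, mirroring next_startino/next_cluster_ino above */
struct chain_state {
	uint32_t next_startino;		/* init to NULLAGINO */
	uint32_t next_cluster_ino;	/* init to NULLAGINO */
};

/* feed each inobt record's startino in btree order; false means corrupt */
bool record_aligned(struct chain_state *cs, uint32_t startino,
		    uint32_t inodes_per_cluster)
{
	if (cs->next_startino != NULLAGINO) {
		/* mid-cluster: the next record must follow immediately */
		if (startino != cs->next_startino)
			return false;
		cs->next_startino += INODES_PER_CHUNK;
		if (cs->next_startino >= cs->next_cluster_ino) {
			cs->next_startino = NULLAGINO;	/* cluster done */
			cs->next_cluster_ino = NULLAGINO;
		}
		return true;
	}
	/* fresh cluster: the record must start on a cluster boundary */
	if (startino & (inodes_per_cluster - 1))
		return false;
	if (inodes_per_cluster > INODES_PER_CHUNK) {
		cs->next_startino = startino + INODES_PER_CHUNK;
		cs->next_cluster_ino = startino + inodes_per_cluster;
	}
	return true;
}
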
@@ -276,7 +433,6 @@ xchk_iallocbt_rec(
 	uint64_t			holes;
 	xfs_agnumber_t			agno = bs->cur->bc_private.a.agno;
 	xfs_agino_t			agino;
-	xfs_agblock_t			agbno;
 	xfs_extlen_t			len;
 	int				holecount;
 	int				i;
@@ -303,11 +459,9 @@ xchk_iallocbt_rec(
 		goto out;
 	}
 
-	/* Make sure this record is aligned to cluster and inoalignmnt size. */
-	agbno = XFS_AGINO_TO_AGBNO(mp, irec.ir_startino);
-	if ((agbno & (mp->m_cluster_align - 1)) ||
-	    (agbno & (mp->m_blocks_per_cluster - 1)))
-		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+	xchk_iallocbt_rec_alignment(bs, &irec);
+	if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		goto out;
 
 	iabt->inodes += irec.ir_count;
 
@@ -320,7 +474,7 @@ xchk_iallocbt_rec(
 
 		if (!xchk_iallocbt_chunk(bs, &irec, agino, len))
 			goto out;
-		goto check_freemask;
+		goto check_clusters;
 	}
 
 	/* Check each chunk of a sparse inode cluster. */
@@ -346,8 +500,8 @@ xchk_iallocbt_rec(
 	    holecount + irec.ir_count != XFS_INODES_PER_CHUNK)
 		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
 
-check_freemask:
-	error = xchk_iallocbt_check_freemask(bs, &irec);
+check_clusters:
+	error = xchk_iallocbt_check_clusters(bs, &irec);
 	if (error)
 		goto out;
 
@@ -429,6 +583,8 @@ xchk_iallocbt(
 	struct xfs_btree_cur	*cur;
 	struct xchk_iallocbt	iabt = {
 		.inodes		= 0,
+		.next_startino	= NULLAGINO,
+		.next_cluster_ino = NULLAGINO,
 	};
 	int			error;
 
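
A small self-contained demonstration of the cluster_base/cluster_mask arithmetic used by xchk_iallocbt_check_cluster(): with an assumed geometry of 32 inodes per cluster, a 64-inode chunk splits into two clusters whose holemask bits land in the low and high bytes respectively.

#include <stdint.h>
#include <stdio.h>

#define INODES_PER_CHUNK	64	/* inodes per inobt record */
#define INODES_PER_HOLEMASK_BIT	4	/* 64 inodes / 16 holemask bits */

int main(void)
{
	unsigned int inodes_per_cluster = 32;	/* assumed geometry */
	unsigned int nr_inodes = inodes_per_cluster < INODES_PER_CHUNK ?
				 inodes_per_cluster : INODES_PER_CHUNK;
	unsigned int base, i;

	for (base = 0; base < INODES_PER_CHUNK; base += inodes_per_cluster) {
		uint16_t mask = 0;

		for (i = 0; i < nr_inodes; i += INODES_PER_HOLEMASK_BIT)
			mask |= (uint16_t)(1u << ((base + i) /
						  INODES_PER_HOLEMASK_BIT));
		/* prints 0x00ff for base 0 and 0xff00 for base 32 */
		printf("cluster_base %2u holemask bits 0x%04x\n", base, mask);
	}
	return 0;
}
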
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 6acf1bfa0bfe..f28f4bad317b 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -743,7 +743,8 @@ xrep_findroot_block(
 
 	/* Ensure the block magic matches the btree type we're looking for. */
 	btblock = XFS_BUF_TO_BLOCK(bp);
-	if (be32_to_cpu(btblock->bb_magic) != fab->magic)
+	ASSERT(fab->buf_ops->magic[1] != 0);
+	if (btblock->bb_magic != fab->buf_ops->magic[1])
 		goto out;
 
 	/*
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index f2fc18bb7605..d990314eb08b 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -42,9 +42,6 @@ struct xrep_find_ag_btree {
 	/* in: buffer ops */
 	const struct xfs_buf_ops	*buf_ops;
 
-	/* in: magic number of the btree */
-	uint32_t			magic;
-
 	/* out: the highest btree block found and the tree height */
 	xfs_agblock_t			root;
 	unsigned int			height;
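
A userspace model of the slimmed-down lookup: with the per-entry magic field gone, the magic now comes from the verifier's magic[] array, and xrep_findroot_block() compares the raw disk value against the v5 slot directly. The struct and field names below are illustrative stand-ins, not the kernel definitions.

#include <stdbool.h>
#include <stdint.h>

/* illustrative stand-ins for the kernel structures */
struct buf_ops_model {
	const char *name;
	uint32_t magic[2];	/* disk-order v4/v5 magics */
};

struct find_ag_btree_model {
	uint64_t rmap_owner;			/* in: rmap owner to match */
	const struct buf_ops_model *buf_ops;	/* in: magic now lives here */
	uint32_t root;				/* out: root block found */
	unsigned int height;			/* out: tree height */
};

/* mirror of the new check: compare the raw disk magic to the v5 slot */
bool entry_matches(const struct find_ag_btree_model *fab, uint32_t disk_magic)
{
	return fab->buf_ops && fab->buf_ops->magic[1] &&
	       disk_magic == fab->buf_ops->magic[1];
}
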
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 665d4bbb17cc..dbe115b075f7 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -141,9 +141,8 @@ xchk_xref_is_used_rt_space(
 	startext = fsbno;
 	endext = fsbno + len - 1;
 	do_div(startext, sc->mp->m_sb.sb_rextsize);
-	if (do_div(endext, sc->mp->m_sb.sb_rextsize))
-		endext++;
-	extcount = endext - startext;
+	do_div(endext, sc->mp->m_sb.sb_rextsize);
+	extcount = endext - startext + 1;
 	xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
 	error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, extcount,
 			&is_free);
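
A worked example of the off-by-one this hunk fixes, assuming a realtime extent size of 4 blocks: for blocks 4-8 the range touches extents 1 and 2, but the old rounding counted only one extent whenever the last block landed exactly on an extent boundary. Counting inclusively (endext - startext + 1) gets it right.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t rextsize = 4;			/* blocks per rt extent */
	uint64_t fsbno = 4, len = 5;		/* blocks 4..8 */
	uint64_t startext = fsbno / rextsize;			/* 1 */
	uint64_t endext = (fsbno + len - 1) / rextsize;		/* 2 */
	uint64_t extcount = endext - startext + 1;		/* 2 */

	/*
	 * The old code rounded endext up only when the division left a
	 * remainder; block 8 is extent-aligned, so it computed 2 - 1 = 1
	 * and the final extent was never checked.
	 */
	printf("extents %llu..%llu, count %llu\n",
	       (unsigned long long)startext, (unsigned long long)endext,
	       (unsigned long long)extcount);
	return 0;
}
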
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 8344b14031ef..3c83e8b3b39c 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -545,6 +545,51 @@ TRACE_EVENT(xchk_xref_error,
 			__entry->ret_ip)
 );
 
+TRACE_EVENT(xchk_iallocbt_check_cluster,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 xfs_agino_t startino, xfs_daddr_t map_daddr,
+		 unsigned short map_len, unsigned int chunk_ino,
+		 unsigned int nr_inodes, uint16_t cluster_mask,
+		 uint16_t holemask, unsigned int cluster_ino),
+	TP_ARGS(mp, agno, startino, map_daddr, map_len, chunk_ino, nr_inodes,
+		cluster_mask, holemask, cluster_ino),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agino_t, startino)
+		__field(xfs_daddr_t, map_daddr)
+		__field(unsigned short, map_len)
+		__field(unsigned int, chunk_ino)
+		__field(unsigned int, nr_inodes)
+		__field(unsigned int, cluster_ino)
+		__field(uint16_t, cluster_mask)
+		__field(uint16_t, holemask)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->startino = startino;
+		__entry->map_daddr = map_daddr;
+		__entry->map_len = map_len;
+		__entry->chunk_ino = chunk_ino;
+		__entry->nr_inodes = nr_inodes;
+		__entry->cluster_mask = cluster_mask;
+		__entry->holemask = holemask;
+		__entry->cluster_ino = cluster_ino;
+	),
+	TP_printk("dev %d:%d agno %d startino %u daddr 0x%llx len %d chunkino %u nr_inodes %u cluster_mask 0x%x holemask 0x%x cluster_ino %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->startino,
+		  __entry->map_daddr,
+		  __entry->map_len,
+		  __entry->chunk_ino,
+		  __entry->nr_inodes,
+		  __entry->cluster_mask,
+		  __entry->holemask,
+		  __entry->cluster_ino)
+)
+
 /* repair tracepoints */
 #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)
 
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index d9048bcea49c..7b8bb6bde981 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -28,7 +28,8 @@
  */
 struct xfs_writepage_ctx {
 	struct xfs_bmbt_irec	imap;
-	unsigned int		io_type;
+	int			fork;
+	unsigned int		data_seq;
 	unsigned int		cow_seq;
 	struct xfs_ioend	*ioend;
 };
@@ -255,30 +256,20 @@ xfs_end_io(
 	 */
 	error = blk_status_to_errno(ioend->io_bio->bi_status);
 	if (unlikely(error)) {
-		switch (ioend->io_type) {
-		case XFS_IO_COW:
+		if (ioend->io_fork == XFS_COW_FORK)
 			xfs_reflink_cancel_cow_range(ip, offset, size, true);
-			break;
-		}
-
 		goto done;
 	}
 
 	/*
 	 * Success: commit the COW or unwritten blocks if needed.
 	 */
-	switch (ioend->io_type) {
-	case XFS_IO_COW:
+	if (ioend->io_fork == XFS_COW_FORK)
 		error = xfs_reflink_end_cow(ip, offset, size);
-		break;
-	case XFS_IO_UNWRITTEN:
-		/* writeback should never update isize */
+	else if (ioend->io_state == XFS_EXT_UNWRITTEN)
 		error = xfs_iomap_write_unwritten(ip, offset, size, false);
-		break;
-	default:
+	else
 		ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
-		break;
-	}
 
 done:
 	if (ioend->io_append_trans)
@@ -293,7 +284,8 @@ xfs_end_bio(
 	struct xfs_ioend	*ioend = bio->bi_private;
 	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
 
-	if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW)
+	if (ioend->io_fork == XFS_COW_FORK ||
+	    ioend->io_state == XFS_EXT_UNWRITTEN)
 		queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
 	else if (ioend->io_append_trans)
 		queue_work(mp->m_data_workqueue, &ioend->io_work);
@@ -301,6 +293,75 @@ xfs_end_bio(
 	xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
 }
 
+/*
+ * Fast revalidation of the cached writeback mapping. Return true if the
+ * current mapping is valid, false otherwise.
+ */
+static bool
+xfs_imap_valid(
+	struct xfs_writepage_ctx	*wpc,
+	struct xfs_inode		*ip,
+	xfs_fileoff_t			offset_fsb)
+{
+	if (offset_fsb < wpc->imap.br_startoff ||
+	    offset_fsb >= wpc->imap.br_startoff + wpc->imap.br_blockcount)
+		return false;
+	/*
+	 * If this is a COW mapping, it is sufficient to check that the mapping
+	 * covers the offset. Be careful to check this first because the caller
+	 * can revalidate a COW mapping without updating the data seqno.
+	 */
+	if (wpc->fork == XFS_COW_FORK)
+		return true;
+
+	/*
+	 * This is not a COW mapping. Check the sequence number of the data fork
+	 * because concurrent changes could have invalidated the extent. Check
+	 * the COW fork because concurrent changes since the last time we
+	 * checked (and found nothing at this offset) could have added
+	 * overlapping blocks.
+	 */
+	if (wpc->data_seq != READ_ONCE(ip->i_df.if_seq))
+		return false;
+	if (xfs_inode_has_cow_data(ip) &&
+	    wpc->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
+		return false;
+	return true;
+}
+
+/*
+ * Pass in a dellalloc extent and convert it to real extents, return the real
+ * extent that maps offset_fsb in wpc->imap.
+ *
+ * The current page is held locked so nothing could have removed the block
+ * backing offset_fsb, although it could have moved from the COW to the data
+ * fork by another thread.
+ */
+static int
+xfs_convert_blocks(
+	struct xfs_writepage_ctx *wpc,
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		offset_fsb)
+{
+	int			error;
+
+	/*
+	 * Attempt to allocate whatever delalloc extent currently backs
+	 * offset_fsb and put the result into wpc->imap.  Allocate in a loop
+	 * because it may take several attempts to allocate real blocks for a
+	 * contiguous delalloc extent if free space is sufficiently fragmented.
+	 */
+	do {
+		error = xfs_bmapi_convert_delalloc(ip, wpc->fork, offset_fsb,
+				&wpc->imap, wpc->fork == XFS_COW_FORK ?
+					&wpc->cow_seq : &wpc->data_seq);
+		if (error)
+			return error;
+	} while (wpc->imap.br_startoff + wpc->imap.br_blockcount <= offset_fsb);
+
+	return 0;
+}
+
 STATIC int
 xfs_map_blocks(
 	struct xfs_writepage_ctx *wpc,
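
A userspace model of the sequence-number revalidation that xfs_imap_valid() performs. The kernel samples the counters with READ_ONCE; plain loads are used here for brevity. The idea: a cached mapping is trusted only while it still covers the offset and the generation sampled at lookup time matches the fork's current counter.

#include <stdbool.h>
#include <stdint.h>

/* a fork's extent tree bumps this counter on every modification */
struct fork_model {
	uint32_t if_seq;
};

/* cached writeback mapping plus the generation it was sampled at */
struct wpc_model {
	uint64_t br_startoff;
	uint64_t br_blockcount;
	uint32_t data_seq;
};

bool imap_valid_model(const struct wpc_model *wpc,
		      const struct fork_model *df, uint64_t offset_fsb)
{
	if (offset_fsb < wpc->br_startoff ||
	    offset_fsb >= wpc->br_startoff + wpc->br_blockcount)
		return false;
	/* stale if the extent tree changed since the mapping was cached */
	return wpc->data_seq == df->if_seq;
}
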
@@ -310,26 +371,16 @@ xfs_map_blocks(
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
 	ssize_t			count = i_blocksize(inode);
-	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb;
+	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count);
 	xfs_fileoff_t		cow_fsb = NULLFILEOFF;
 	struct xfs_bmbt_irec	imap;
-	int			whichfork = XFS_DATA_FORK;
 	struct xfs_iext_cursor	icur;
-	bool			imap_valid;
+	int			retries = 0;
 	int			error = 0;
 
-	/*
-	 * We have to make sure the cached mapping is within EOF to protect
-	 * against eofblocks trimming on file release leaving us with a stale
-	 * mapping. Otherwise, a page for a subsequent file extending buffered
-	 * write could get picked up by this writeback cycle and written to the
-	 * wrong blocks.
-	 *
-	 * Note that what we really want here is a generic mapping invalidation
-	 * mechanism to protect us from arbitrary extent modifying contexts, not
-	 * just eofblocks.
-	 */
-	xfs_trim_extent_eof(&wpc->imap, ip);
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
 
 	/*
 	 * COW fork blocks can overlap data fork blocks even if the blocks
@@ -346,31 +397,19 @@ xfs_map_blocks(
 	 * against concurrent updates and provides a memory barrier on the way
 	 * out that ensures that we always see the current value.
 	 */
-	imap_valid = offset_fsb >= wpc->imap.br_startoff &&
-		     offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount;
-	if (imap_valid &&
-	    (!xfs_inode_has_cow_data(ip) ||
-	     wpc->io_type == XFS_IO_COW ||
-	     wpc->cow_seq == READ_ONCE(ip->i_cowfp->if_seq)))
+	if (xfs_imap_valid(wpc, ip, offset_fsb))
 		return 0;
 
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return -EIO;
-
 	/*
 	 * If we don't have a valid map, now it's time to get a new one for this
 	 * offset.  This will convert delayed allocations (including COW ones)
 	 * into real extents.  If we return without a valid map, it means we
 	 * landed in a hole and we skip the block.
 	 */
+retry:
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
 	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
 	       (ip->i_df.if_flags & XFS_IFEXTENTS));
-	ASSERT(offset <= mp->m_super->s_maxbytes);
-
-	if (offset > mp->m_super->s_maxbytes - count)
-		count = mp->m_super->s_maxbytes - offset;
-	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
 
 	/*
 	 * Check if this is offset is covered by a COW extents, and if yes use
@@ -382,30 +421,16 @@ xfs_map_blocks(
 	if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
 		wpc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
-		/*
-		 * Truncate can race with writeback since writeback doesn't
-		 * take the iolock and truncate decreases the file size before
-		 * it starts truncating the pages between new_size and old_size.
-		 * Therefore, we can end up in the situation where writeback
-		 * gets a CoW fork mapping but the truncate makes the mapping
-		 * invalid and we end up in here trying to get a new mapping.
-		 * bail out here so that we simply never get a valid mapping
-		 * and so we drop the write altogether. The page truncation
-		 * will kill the contents anyway.
-		 */
-		if (offset > i_size_read(inode)) {
-			wpc->io_type = XFS_IO_HOLE;
-			return 0;
-		}
-		whichfork = XFS_COW_FORK;
-		wpc->io_type = XFS_IO_COW;
+
+		wpc->fork = XFS_COW_FORK;
 		goto allocate_blocks;
 	}
 
 	/*
-	 * Map valid and no COW extent in the way?  We're done.
+	 * No COW extent overlap. Revalidate now that we may have updated
+	 * ->cow_seq. If the data mapping is still valid, we're done.
 	 */
-	if (imap_valid) {
+	if (xfs_imap_valid(wpc, ip, offset_fsb)) {
 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
 		return 0;
 	}
@@ -417,51 +442,65 @@ xfs_map_blocks(
 	 */
 	if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
 		imap.br_startoff = end_fsb;	/* fake a hole past EOF */
+	wpc->data_seq = READ_ONCE(ip->i_df.if_seq);
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
+	wpc->fork = XFS_DATA_FORK;
+
+	/* landed in a hole or beyond EOF? */
 	if (imap.br_startoff > offset_fsb) {
-		/* landed in a hole or beyond EOF */
 		imap.br_blockcount = imap.br_startoff - offset_fsb;
 		imap.br_startoff = offset_fsb;
 		imap.br_startblock = HOLESTARTBLOCK;
-		wpc->io_type = XFS_IO_HOLE;
-	} else {
-		/*
-		 * Truncate to the next COW extent if there is one.  This is the
-		 * only opportunity to do this because we can skip COW fork
-		 * lookups for the subsequent blocks in the mapping; however,
-		 * the requirement to treat the COW range separately remains.
-		 */
-		if (cow_fsb != NULLFILEOFF &&
-		    cow_fsb < imap.br_startoff + imap.br_blockcount)
-			imap.br_blockcount = cow_fsb - imap.br_startoff;
-
-		if (isnullstartblock(imap.br_startblock)) {
-			/* got a delalloc extent */
-			wpc->io_type = XFS_IO_DELALLOC;
-			goto allocate_blocks;
-		}
-
-		if (imap.br_state == XFS_EXT_UNWRITTEN)
-			wpc->io_type = XFS_IO_UNWRITTEN;
-		else
-			wpc->io_type = XFS_IO_OVERWRITE;
+		imap.br_state = XFS_EXT_NORM;
 	}
 
+	/*
+	 * Truncate to the next COW extent if there is one.  This is the only
+	 * opportunity to do this because we can skip COW fork lookups for the
+	 * subsequent blocks in the mapping; however, the requirement to treat
+	 * the COW range separately remains.
+	 */
+	if (cow_fsb != NULLFILEOFF &&
+	    cow_fsb < imap.br_startoff + imap.br_blockcount)
+		imap.br_blockcount = cow_fsb - imap.br_startoff;
+
+	/* got a delalloc extent? */
+	if (imap.br_startblock != HOLESTARTBLOCK &&
+	    isnullstartblock(imap.br_startblock))
+		goto allocate_blocks;
+
 	wpc->imap = imap;
-	xfs_trim_extent_eof(&wpc->imap, ip);
-	trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap);
+	trace_xfs_map_blocks_found(ip, offset, count, wpc->fork, &imap);
 	return 0;
 allocate_blocks:
-	error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap,
-			&wpc->cow_seq);
-	if (error)
+	error = xfs_convert_blocks(wpc, ip, offset_fsb);
+	if (error) {
+		/*
+		 * If we failed to find the extent in the COW fork we might have
+		 * raced with a COW to data fork conversion or truncate.
+		 * Restart the lookup to catch the extent in the data fork for
+		 * the former case, but prevent additional retries to avoid
+		 * looping forever for the latter case.
+		 */
+		if (error == -EAGAIN && wpc->fork == XFS_COW_FORK && !retries++)
+			goto retry;
+		ASSERT(error != -EAGAIN);
 		return error;
-	ASSERT(whichfork == XFS_COW_FORK || cow_fsb == NULLFILEOFF ||
-	       imap.br_startoff + imap.br_blockcount <= cow_fsb);
-	wpc->imap = imap;
-	xfs_trim_extent_eof(&wpc->imap, ip);
-	trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap);
+	}
+
+	/*
+	 * Due to merging the return real extent might be larger than the
+	 * original delalloc one.  Trim the return extent to the next COW
+	 * boundary again to force a re-lookup.
+	 */
+	if (wpc->fork != XFS_COW_FORK && cow_fsb != NULLFILEOFF &&
+	    cow_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount)
+		wpc->imap.br_blockcount = cow_fsb - wpc->imap.br_startoff;
+
+	ASSERT(wpc->imap.br_startoff <= offset_fsb);
+	ASSERT(wpc->imap.br_startoff + wpc->imap.br_blockcount > offset_fsb);
+	trace_xfs_map_blocks_alloc(ip, offset, count, wpc->fork, &imap);
 	return 0;
 }
 
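
A toy model of the bounded retry in the allocate_blocks error path above: one -EAGAIN from a COW-fork lookup triggers a single restart, and a second one escapes to the caller instead of looping. The stand-in lookup below simulates the COW-to-data race by failing exactly once.

#include <errno.h>
#include <stdio.h>

static int lookups;

/* toy lookup: fails once with -EAGAIN, as a COW-to-data race would */
static int toy_lookup(void)
{
	return lookups++ == 0 ? -EAGAIN : 0;
}

/* one retry only: a second -EAGAIN escapes instead of looping forever */
static int map_with_bounded_retry(void)
{
	int retries = 0;
	int error;

retry:
	error = toy_lookup();
	if (error == -EAGAIN && !retries++)
		goto retry;
	return error;
}

int main(void)
{
	printf("result %d after %d lookup(s)\n",
	       map_with_bounded_retry(), lookups);
	return 0;
}
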
@@ -486,7 +525,7 @@ xfs_submit_ioend(
 	int			status)
 {
 	/* Convert CoW extents to regular */
-	if (!status && ioend->io_type == XFS_IO_COW) {
+	if (!status && ioend->io_fork == XFS_COW_FORK) {
 		/*
 		 * Yuk. This can do memory allocation, but is not a
 		 * transactional operation so everything is done in GFP_KERNEL
@@ -504,7 +543,8 @@ xfs_submit_ioend(
 
 	/* Reserve log space if we might write beyond the on-disk inode size. */
 	if (!status &&
-	    ioend->io_type != XFS_IO_UNWRITTEN &&
+	    (ioend->io_fork == XFS_COW_FORK ||
+	     ioend->io_state != XFS_EXT_UNWRITTEN) &&
 	    xfs_ioend_is_append(ioend) &&
 	    !ioend->io_append_trans)
 		status = xfs_setfilesize_trans_alloc(ioend);
@@ -533,7 +573,8 @@ xfs_submit_ioend(
 static struct xfs_ioend *
 xfs_alloc_ioend(
 	struct inode		*inode,
-	unsigned int		type,
+	int			fork,
+	xfs_exntst_t		state,
 	xfs_off_t		offset,
 	struct block_device	*bdev,
 	sector_t		sector)
@@ -547,7 +588,8 @@ xfs_alloc_ioend(
 
 	ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
 	INIT_LIST_HEAD(&ioend->io_list);
-	ioend->io_type = type;
+	ioend->io_fork = fork;
+	ioend->io_state = state;
 	ioend->io_inode = inode;
 	ioend->io_size = 0;
 	ioend->io_offset = offset;
@@ -608,13 +650,15 @@ xfs_add_to_ioend(
 	sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) +
 		((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9);
 
-	if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
+	if (!wpc->ioend ||
+	    wpc->fork != wpc->ioend->io_fork ||
+	    wpc->imap.br_state != wpc->ioend->io_state ||
 	    sector != bio_end_sector(wpc->ioend->io_bio) ||
 	    offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
 		if (wpc->ioend)
 			list_add(&wpc->ioend->io_list, iolist);
-		wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset,
-				bdev, sector);
+		wpc->ioend = xfs_alloc_ioend(inode, wpc->fork,
+				wpc->imap.br_state, offset, bdev, sector);
 	}
 
 	if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
@@ -723,7 +767,7 @@ xfs_writepage_map(
 		error = xfs_map_blocks(wpc, inode, file_offset);
 		if (error)
 			break;
-		if (wpc->io_type == XFS_IO_HOLE)
+		if (wpc->imap.br_startblock == HOLESTARTBLOCK)
 			continue;
 		xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc,
 				&submit_list);
@@ -918,9 +962,7 @@ xfs_vm_writepage(
 	struct page		*page,
 	struct writeback_control *wbc)
 {
-	struct xfs_writepage_ctx wpc = {
-		.io_type = XFS_IO_HOLE,
-	};
+	struct xfs_writepage_ctx wpc = { };
 	int			ret;
 
 	ret = xfs_do_writepage(page, wbc, &wpc);
@@ -934,9 +976,7 @@ xfs_vm_writepages(
 	struct address_space	*mapping,
 	struct writeback_control *wbc)
 {
-	struct xfs_writepage_ctx wpc = {
-		.io_type = XFS_IO_HOLE,
-	};
+	struct xfs_writepage_ctx wpc = { };
 	int			ret;
 
 	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
@@ -983,7 +1023,7 @@ xfs_vm_bmap(
 	 * Since we don't pass back blockdev info, we can't return bmap
 	 * information for rt files either.
 	 */
-	if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
+	if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
 		return 0;
 	return iomap_bmap(mapping, block, &xfs_iomap_ops);
 }
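
A compact model of the classification that the (io_fork, io_state) pair now provides in place of the old io_type enum, mirroring the tests in xfs_end_bio() and the ioend merge condition in xfs_add_to_ioend(); the enum names below are illustrative.

#include <stdbool.h>

/* illustrative stand-ins for the fork index and extent state */
enum fork_model  { DATA_FORK, COW_FORK };
enum state_model { EXT_NORM, EXT_UNWRITTEN };

/* COW remaps and unwritten conversions need completion-time work */
bool needs_conversion_work(enum fork_model f, enum state_model s)
{
	return f == COW_FORK || s == EXT_UNWRITTEN;
}

/* ioends may only merge when both classification fields agree */
bool can_merge(enum fork_model f1, enum state_model s1,
	       enum fork_model f2, enum state_model s2)
{
	return f1 == f2 && s1 == s2;
}
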
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index e5c23948a8ab..6c2615b83c5d 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -9,32 +9,12 @@
 extern struct bio_set xfs_ioend_bioset;
 
 /*
- * Types of I/O for bmap clustering and I/O completion tracking.
- *
- * This enum is used in string mapping in xfs_trace.h; please keep the
- * TRACE_DEFINE_ENUMs for it up to date.
- */
-enum {
-	XFS_IO_HOLE,		/* covers region without any block allocation */
-	XFS_IO_DELALLOC,	/* covers delalloc region */
-	XFS_IO_UNWRITTEN,	/* covers allocated but uninitialized data */
-	XFS_IO_OVERWRITE,	/* covers already allocated extent */
-	XFS_IO_COW,		/* covers copy-on-write extent */
-};
-
-#define XFS_IO_TYPES \
-	{ XFS_IO_HOLE,		"hole" },	\
-	{ XFS_IO_DELALLOC,	"delalloc" },	\
-	{ XFS_IO_UNWRITTEN,	"unwritten" },	\
-	{ XFS_IO_OVERWRITE,	"overwrite" },	\
-	{ XFS_IO_COW,		"CoW" }
-
-/*
  * Structure for buffered I/O completions.
  */
 struct xfs_ioend {
 	struct list_head	io_list;	/* next ioend in chain */
-	unsigned int		io_type;	/* delalloc / unwritten */
+	int			io_fork;	/* inode fork written back */
+	xfs_exntst_t		io_state;	/* extent state */
 	struct inode		*io_inode;	/* file being written to */
 	size_t			io_size;	/* size of the extent */
 	xfs_off_t		io_offset;	/* offset in the file */
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index a58034049995..3d213a7394c5 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -555,6 +555,7 @@ xfs_attr_put_listent(
 	attrlist_ent_t *aep;
 	int arraytop;
 
+	ASSERT(!context->seen_enough);
 	ASSERT(!(context->flags & ATTR_KERNOVAL));
 	ASSERT(context->count >= 0);
 	ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 1ee8c5539fa4..2db43ff4f8b5 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1162,16 +1162,13 @@ xfs_zero_file_space(
 	 * by virtue of the hole punch.
 	 */
 	error = xfs_free_file_space(ip, offset, len);
-	if (error)
-		goto out;
+	if (error || xfs_is_always_cow_inode(ip))
+		return error;
 
-	error = xfs_alloc_file_space(ip, round_down(offset, blksize),
+	return xfs_alloc_file_space(ip, round_down(offset, blksize),
 			round_up(offset + len, blksize) -
 			round_down(offset, blksize),
 			XFS_BMAPI_PREALLOC);
-out:
-	return error;
-
 }
 
 static int
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 4f5f2ff3f70f..548344e25128 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -776,29 +776,24 @@ _xfs_buf_read(
 }
 
 /*
- * Set buffer ops on an unchecked buffer and validate it, if possible.
+ * Reverify a buffer found in cache without an attached ->b_ops.
  *
- * If the caller passed in an ops structure and the buffer doesn't have ops
- * assigned, set the ops and use them to verify the contents. If the contents
- * cannot be verified, we'll clear XBF_DONE. We assume the buffer has no
- * recorded errors and is already in XBF_DONE state.
+ * If the caller passed an ops structure and the buffer doesn't have ops
+ * assigned, set the ops and use it to verify the contents. If verification
+ * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is
+ * already in XBF_DONE state on entry.
  *
- * Under normal operations, every in-core buffer must have buffer ops assigned
- * to them when the buffer is read in from disk so that we can validate the
- * metadata.
- *
- * However, there are two scenarios where one can encounter in-core buffers
- * that don't have buffer ops. The first is during log recovery of buffers on
- * a V4 filesystem, though these buffers are purged at the end of recovery.
- *
- * The other is online repair, which tries to match arbitrary metadata blocks
- * with btree types in order to find the root. If online repair doesn't match
- * the buffer with /any/ btree type, the buffer remains in memory in DONE state
- * with no ops, and a subsequent read_buf call from elsewhere will not set the
- * ops. This function helps us fix this situation.
+ * Under normal operations, every in-core buffer is verified on read I/O
+ * completion. There are two scenarios that can lead to in-core buffers without
+ * an assigned ->b_ops. The first is during log recovery of buffers on a V4
+ * filesystem, though these buffers are purged at the end of recovery. The
+ * other is online repair, which intentionally reads with a NULL buffer ops to
+ * run several verifiers across an in-core buffer in order to establish buffer
+ * type. If repair can't establish that, the buffer will be left in memory
+ * with NULL buffer ops.
  */
 int
-xfs_buf_ensure_ops(
+xfs_buf_reverify(
 	struct xfs_buf		*bp,
 	const struct xfs_buf_ops *ops)
 {
@@ -840,7 +835,7 @@ xfs_buf_read_map(
 		return bp;
 	}
 
-	xfs_buf_ensure_ops(bp, ops);
+	xfs_buf_reverify(bp, ops);
 
 	if (flags & XBF_ASYNC) {
 		/*
@@ -2209,3 +2204,40 @@ void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
 
 	atomic_set(&bp->b_lru_ref, lru_ref);
 }
+
+/*
+ * Verify an on-disk magic value against the magic value specified in the
+ * verifier structure. The verifier magic is in disk byte order so the caller is
+ * expected to pass the value directly from disk.
+ */
+bool
+xfs_verify_magic(
+	struct xfs_buf		*bp,
+	__be32			dmagic)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	int			idx;
+
+	idx = xfs_sb_version_hascrc(&mp->m_sb);
+	if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx])))
+		return false;
+	return dmagic == bp->b_ops->magic[idx];
+}
+/*
+ * Verify an on-disk magic value against the magic value specified in the
+ * verifier structure. The verifier magic is in disk byte order so the caller is
+ * expected to pass the value directly from disk.
+ */
+bool
+xfs_verify_magic16(
+	struct xfs_buf		*bp,
+	__be16			dmagic)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	int			idx;
+
+	idx = xfs_sb_version_hascrc(&mp->m_sb);
+	if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx])))
+		return false;
+	return dmagic == bp->b_ops->magic16[idx];
+}
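
A userspace model of the disk-order magic convention used by xfs_verify_magic(): magics are stored pre-swapped so the verifier compares raw disk bytes without a byte swap. The BE32 macro below assumes a little-endian host, and 0x58414746 ("XAGF", the same on v4 and v5) is used purely as an example value.

#include <stdbool.h>
#include <stdint.h>

/* byte-swap to disk (big-endian) order; assumes a little-endian host */
#define BE32(x)	((uint32_t)((((x) & 0x000000ffu) << 24) | \
			    (((x) & 0x0000ff00u) << 8)  | \
			    (((x) & 0x00ff0000u) >> 8)  | \
			    (((x) & 0xff000000u) >> 24)))

struct ops_model {
	uint32_t magic[2];	/* [0] = v4 magic, [1] = v5 magic */
};

/* 0x58414746 is "XAGF"; the AGF magic is the same on v4 and v5 */
static const struct ops_model agf_ops = {
	.magic = { BE32(0x58414746), BE32(0x58414746) },
};

bool verify_magic_model(const struct ops_model *ops, uint32_t disk_magic,
			bool has_crc)
{
	if (!ops->magic[has_crc])	/* verifier without a magic: bug */
		return false;
	return disk_magic == ops->magic[has_crc];
}
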
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index b9f5511ea998..d0b96e071cec 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -125,6 +125,10 @@ struct xfs_buf_map {
 
 struct xfs_buf_ops {
 	char *name;
+	union {
+		__be32 magic[2];	/* v4 and v5 on disk magic values */
+		__be16 magic16[2];	/* v4 and v5 on disk magic values */
+	};
 	void (*verify_read)(struct xfs_buf *);
 	void (*verify_write)(struct xfs_buf *);
 	xfs_failaddr_t (*verify_struct)(struct xfs_buf *bp);
@@ -385,6 +389,8 @@ extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int);
 #define xfs_getsize_buftarg(buftarg)	block_size((buftarg)->bt_bdev)
 #define xfs_readonly_buftarg(buftarg)	bdev_read_only((buftarg)->bt_bdev)
 
-int xfs_buf_ensure_ops(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
+int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
+bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic);
+bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic);
 
 #endif	/* __XFS_BUF_H__ */
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 9866f542e77b..a1e177f66404 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -51,6 +51,7 @@ static unsigned int xfs_errortag_random_default[] = {
 	XFS_RANDOM_BUF_LRU_REF,
 	XFS_RANDOM_FORCE_SCRUB_REPAIR,
 	XFS_RANDOM_FORCE_SUMMARY_RECALC,
+	XFS_RANDOM_IUNLINK_FALLBACK,
 };
 
 struct xfs_errortag_attr {
@@ -159,6 +160,7 @@ XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN);
 XFS_ERRORTAG_ATTR_RW(buf_lru_ref,	XFS_ERRTAG_BUF_LRU_REF);
 XFS_ERRORTAG_ATTR_RW(force_repair,	XFS_ERRTAG_FORCE_SCRUB_REPAIR);
 XFS_ERRORTAG_ATTR_RW(bad_summary,	XFS_ERRTAG_FORCE_SUMMARY_RECALC);
+XFS_ERRORTAG_ATTR_RW(iunlink_fallback,	XFS_ERRTAG_IUNLINK_FALLBACK);
 
 static struct attribute *xfs_errortag_attrs[] = {
 	XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -195,6 +197,7 @@ static struct attribute *xfs_errortag_attrs[] = {
 	XFS_ERRORTAG_ATTR_LIST(buf_lru_ref),
 	XFS_ERRORTAG_ATTR_LIST(force_repair),
 	XFS_ERRORTAG_ATTR_LIST(bad_summary),
+	XFS_ERRORTAG_ATTR_LIST(iunlink_fallback),
 	NULL,
 };
 
@@ -357,7 +360,8 @@ xfs_buf_verifier_error(
 	fa = failaddr ? failaddr : __return_address;
 	__xfs_buf_ioerror(bp, error, fa);
 
-	xfs_alert(mp, "Metadata %s detected at %pS, %s block 0x%llx %s",
+	xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR,
+		  "Metadata %s detected at %pS, %s block 0x%llx %s",
 		  bp->b_error == -EFSBADCRC ? "CRC error" : "corruption",
 		  fa, bp->b_ops->name, bp->b_bn, name);
 
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 246d3e989c6c..602aa7d62b66 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -98,5 +98,6 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp);
 #define XFS_PTAG_SHUTDOWN_IOERROR	0x00000020
 #define XFS_PTAG_SHUTDOWN_LOGERROR	0x00000040
 #define XFS_PTAG_FSBLOCK_ZERO		0x00000080
+#define XFS_PTAG_VERIFIER_ERROR		0x00000100
 
 #endif	/* __XFS_ERROR_H__ */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e47425071e65..770cc2edf777 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -507,7 +507,7 @@ xfs_file_dio_aio_write(
 		 * We can't properly handle unaligned direct I/O to reflink
 		 * files yet, as we can't unshare a partial block.
 		 */
-		if (xfs_is_reflink_inode(ip)) {
+		if (xfs_is_cow_inode(ip)) {
 			trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
 			return -EREMCHG;
 		}
@@ -872,14 +872,27 @@ xfs_file_fallocate(
 			goto out_unlock;
 	}
 
-	if (mode & FALLOC_FL_ZERO_RANGE)
+	if (mode & FALLOC_FL_ZERO_RANGE) {
 		error = xfs_zero_file_space(ip, offset, len);
-	else {
-		if (mode & FALLOC_FL_UNSHARE_RANGE) {
-			error = xfs_reflink_unshare(ip, offset, len);
-			if (error)
-				goto out_unlock;
+	} else if (mode & FALLOC_FL_UNSHARE_RANGE) {
+		error = xfs_reflink_unshare(ip, offset, len);
+		if (error)
+			goto out_unlock;
+
+		if (!xfs_is_always_cow_inode(ip)) {
+			error = xfs_alloc_file_space(ip, offset, len,
+					XFS_BMAPI_PREALLOC);
 		}
+	} else {
+		/*
+		 * If always_cow mode we can't use preallocations and
+		 * thus should not create them.
+		 */
+		if (xfs_is_always_cow_inode(ip)) {
+			error = -EOPNOTSUPP;
+			goto out_unlock;
+		}
+
 		error = xfs_alloc_file_space(ip, offset, len,
 				XFS_BMAPI_PREALLOC);
 	}
@@ -1068,10 +1081,10 @@ xfs_file_llseek(
 	default:
 		return generic_file_llseek(file, offset, whence);
 	case SEEK_HOLE:
-		offset = iomap_seek_hole(inode, offset, &xfs_iomap_ops);
+		offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
 		break;
 	case SEEK_DATA:
-		offset = iomap_seek_data(inode, offset, &xfs_iomap_ops);
+		offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
 		break;
 	}
 
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index f3ef70c542e1..584648582ba7 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -533,6 +533,7 @@ xfs_fs_reserve_ag_blocks(
 	int			error = 0;
 	int			err2;
 
+	mp->m_finobt_nores = false;
 	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
 		pag = xfs_perag_get(mp, agno);
 		err2 = xfs_ag_resv_init(pag, NULL);
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index 5169e84ae382..d0d377384120 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -16,7 +16,7 @@ xfs_param_t xfs_params = {
 			  /*	MIN		DFLT		MAX	*/
 	.sgid_inherit	= {	0,		0,		1	},
 	.symlink_mode	= {	0,		0,		1	},
-	.panic_mask	= {	0,		0,		255	},
+	.panic_mask	= {	0,		0,		256	},
 	.error_level	= {	0,		3,		11	},
 	.syncd_timer	= {	1*100,		30*100,		7200*100},
 	.stats_clear	= {	0,		0,		1	},
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ae667ba74a1c..f643a9295179 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1332,7 +1332,7 @@ xfs_create_tmpfile(
 	if (error)
 		goto out_trans_cancel;
 
-	error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip);
+	error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip);
 	if (error)
 		goto out_trans_cancel;
 
@@ -1754,7 +1754,7 @@ xfs_inactive_ifree(
 	 * now remains allocated and sits on the unlinked list until the fs is
 	 * repaired.
 	 */
-	if (unlikely(mp->m_inotbt_nores)) {
+	if (unlikely(mp->m_finobt_nores)) {
 		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
 				XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
 				&tp);
@@ -1907,86 +1907,510 @@ xfs_inactive(
1907} 1907}
1908 1908
1909/* 1909/*
1910 * This is called when the inode's link count goes to 0 or we are creating a 1910 * In-Core Unlinked List Lookups
1911 * tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be 1911 * =============================
1912 * set to true as the link count is dropped to zero by the VFS after we've 1912 *
1913 * created the file successfully, so we have to add it to the unlinked list 1913 * Every inode is supposed to be reachable from some other piece of metadata
1914 * while the link count is non-zero. 1914 * with the exception of the root directory. Inodes with a connection to a
1915 * file descriptor but not linked from anywhere in the on-disk directory tree
1916 * are collectively known as unlinked inodes, though the filesystem itself
1917 * maintains links to these inodes so that on-disk metadata are consistent.
1918 *
1919 * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI
1920 * header contains a number of buckets that point to an inode, and each inode
1921 * record has a pointer to the next inode in the hash chain. This
1922 * singly-linked list causes scaling problems in the iunlink remove function
1923 * because we must walk that list to find the inode that points to the inode
1924 * being removed from the unlinked hash bucket list.
1925 *
1926 * What if we modelled the unlinked list as a collection of records capturing
1927 * "X.next_unlinked = Y" relations? If we indexed those records on Y, we'd
1928 * have a fast way to look up unlinked list predecessors, which avoids the
1929 * slow list walk. That's exactly what we do here (in-core) with a per-AG
1930 * rhashtable.
1931 *
1932 * Because this is a backref cache, we ignore operational failures since the
1933 * iunlink code can fall back to the slow bucket walk. The only errors that
1934 * should bubble out are for obviously incorrect situations.
1935 *
1936 * All users of the backref cache MUST hold the AGI buffer lock to serialize
1937 * access or have otherwise provided for concurrency control.
1938 */
1939
1940/* Capture a "X.next_unlinked = Y" relationship. */
1941struct xfs_iunlink {
1942 struct rhash_head iu_rhash_head;
1943 xfs_agino_t iu_agino; /* X */
1944 xfs_agino_t iu_next_unlinked; /* Y */
1945};
1946
1947/* Unlinked list predecessor lookup hashtable construction */
1948static int
1949xfs_iunlink_obj_cmpfn(
1950 struct rhashtable_compare_arg *arg,
1951 const void *obj)
1952{
1953 const xfs_agino_t *key = arg->key;
1954 const struct xfs_iunlink *iu = obj;
1955
1956 if (iu->iu_next_unlinked != *key)
1957 return 1;
1958 return 0;
1959}
1960
1961static const struct rhashtable_params xfs_iunlink_hash_params = {
1962 .min_size = XFS_AGI_UNLINKED_BUCKETS,
1963 .key_len = sizeof(xfs_agino_t),
1964 .key_offset = offsetof(struct xfs_iunlink,
1965 iu_next_unlinked),
1966 .head_offset = offsetof(struct xfs_iunlink, iu_rhash_head),
1967 .automatic_shrinking = true,
1968 .obj_cmpfn = xfs_iunlink_obj_cmpfn,
1969};
1970
1971/*
1972 * Return X, where X.next_unlinked == @agino. Returns NULLAGINO if no such
1973 * relation is found.
1974 */
1975static xfs_agino_t
1976xfs_iunlink_lookup_backref(
1977 struct xfs_perag *pag,
1978 xfs_agino_t agino)
1979{
1980 struct xfs_iunlink *iu;
1981
1982 iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
1983 xfs_iunlink_hash_params);
1984 return iu ? iu->iu_agino : NULLAGINO;
1985}
1986
1987/*
1988 * Take ownership of an iunlink cache entry and insert it into the hash table.
1989 * If successful, the entry will be owned by the cache; if not, it is freed.
1990 * Either way, the caller does not own @iu after this call.
1991 */
1992static int
1993xfs_iunlink_insert_backref(
1994 struct xfs_perag *pag,
1995 struct xfs_iunlink *iu)
1996{
1997 int error;
1998
1999 error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
2000 &iu->iu_rhash_head, xfs_iunlink_hash_params);
2001 /*
2002 * Fail loudly if there already was an entry because that's a sign of
2003 * corruption of in-memory data. Also fail loudly if we see an error
2004 * code we didn't anticipate from the rhashtable code. Currently we
2005 * only anticipate ENOMEM.
2006 */
2007 if (error) {
2008 WARN(error != -ENOMEM, "iunlink cache insert error %d", error);
2009 kmem_free(iu);
2010 }
2011 /*
2012 * Absorb any runtime errors that aren't a result of corruption because
2013 * this is a cache and we can always fall back to bucket list scanning.
2014 */
2015 if (error != 0 && error != -EEXIST)
2016 error = 0;
2017 return error;
2018}
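The error policy above reduces to a small table; this is one editor's reading of the code, not an authoritative statement:

	/*
	 * rhashtable_insert_fast() result   outcome for the caller
	 *   0         entry now owned by the cache; return 0
	 *   -EEXIST   duplicate backref implies in-memory corruption;
	 *             entry freed, -EEXIST bubbles out
	 *   -ENOMEM   cache is best-effort; entry freed, return 0
	 *   other     unexpected, so WARN; entry freed, return 0
	 */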
2019
2020/* Remember that @prev_agino.next_unlinked = @this_agino. */
2021static int
2022xfs_iunlink_add_backref(
2023 struct xfs_perag *pag,
2024 xfs_agino_t prev_agino,
2025 xfs_agino_t this_agino)
2026{
2027 struct xfs_iunlink *iu;
2028
2029 if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
2030 return 0;
2031
2032 iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS);
2033 iu->iu_agino = prev_agino;
2034 iu->iu_next_unlinked = this_agino;
2035
2036 return xfs_iunlink_insert_backref(pag, iu);
2037}
2038
2039/*
2040 * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked.
2041 * If @next_unlinked is NULLAGINO, we drop the backref and exit. If there
2042 * wasn't any such entry then we don't bother.
2043 */
2044static int
2045xfs_iunlink_change_backref(
2046 struct xfs_perag *pag,
2047 xfs_agino_t agino,
2048 xfs_agino_t next_unlinked)
2049{
2050 struct xfs_iunlink *iu;
2051 int error;
2052
2053 /* Look up the old entry; if there wasn't one then exit. */
2054 iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
2055 xfs_iunlink_hash_params);
2056 if (!iu)
2057 return 0;
2058
2059 /*
2060 * Remove the entry. This shouldn't ever return an error, but if we
2061 * couldn't remove the old entry we don't want to add it again to the
2062 * hash table, and if the entry disappeared on us then someone's
2063 * violated the locking rules and we need to fail loudly. Either way
2064 * we cannot remove the inode because internal state is or would have
2065 * been corrupt.
2066 */
2067 error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
2068 &iu->iu_rhash_head, xfs_iunlink_hash_params);
2069 if (error)
2070 return error;
2071
2072 /* If there is no new next entry just free our item and return. */
2073 if (next_unlinked == NULLAGINO) {
2074 kmem_free(iu);
2075 return 0;
2076 }
2077
2078 /* Update the entry and re-add it to the hash table. */
2079 iu->iu_next_unlinked = next_unlinked;
2080 return xfs_iunlink_insert_backref(pag, iu);
2081}
2082
2083/* Set up the in-core predecessor structures. */
2084int
2085xfs_iunlink_init(
2086 struct xfs_perag *pag)
2087{
2088 return rhashtable_init(&pag->pagi_unlinked_hash,
2089 &xfs_iunlink_hash_params);
2090}
2091
2092/* Free the in-core predecessor structures. */
2093static void
2094xfs_iunlink_free_item(
2095 void *ptr,
2096 void *arg)
2097{
2098 struct xfs_iunlink *iu = ptr;
2099 bool *freed_anything = arg;
2100
2101 *freed_anything = true;
2102 kmem_free(iu);
2103}
2104
2105void
2106xfs_iunlink_destroy(
2107 struct xfs_perag *pag)
2108{
2109 bool freed_anything = false;
2110
2111 rhashtable_free_and_destroy(&pag->pagi_unlinked_hash,
2112 xfs_iunlink_free_item, &freed_anything);
2113
2114 ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount));
2115}
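For context, the lifecycle wiring for these two hooks appears in the fs/xfs/xfs_mount.c hunks later in this patch:

	/*
	 * xfs_initialize_perag() calls xfs_iunlink_init() for each newly
	 * created AG, and xfs_free_perag() calls xfs_iunlink_destroy() on
	 * teardown, so the hash table lives exactly as long as the perag.
	 */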
2116
2117/*
2118 * Point the AGI unlinked bucket at an inode and log the results. The caller
2119 * is responsible for validating the old value.
2120 */
2121STATIC int
2122xfs_iunlink_update_bucket(
2123 struct xfs_trans *tp,
2124 xfs_agnumber_t agno,
2125 struct xfs_buf *agibp,
2126 unsigned int bucket_index,
2127 xfs_agino_t new_agino)
2128{
2129 struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
2130 xfs_agino_t old_value;
2131 int offset;
2132
2133 ASSERT(xfs_verify_agino_or_null(tp->t_mountp, agno, new_agino));
2134
2135 old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
2136 trace_xfs_iunlink_update_bucket(tp->t_mountp, agno, bucket_index,
2137 old_value, new_agino);
2138
2139 /*
2140 * We should never find the head of the list already set to the value
2141 * passed in because either we're adding or removing ourselves from the
2142 * head of the list.
2143 */
2144 if (old_value == new_agino)
2145 return -EFSCORRUPTED;
2146
2147 agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
2148 offset = offsetof(struct xfs_agi, agi_unlinked) +
2149 (sizeof(xfs_agino_t) * bucket_index);
2150 xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1);
2151 return 0;
2152}
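The logged range covers exactly one bucket slot. As a worked example with an invented bucket number: for bucket_index == 5, offset is offsetof(struct xfs_agi, agi_unlinked) + 5 * sizeof(xfs_agino_t), and the range handed to xfs_trans_log_buf() spans only those four bytes, so the other 63 bucket pointers are not redundantly logged.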
2153
2154/* Set an on-disk inode's next_unlinked pointer. */
2155STATIC void
2156xfs_iunlink_update_dinode(
2157 struct xfs_trans *tp,
2158 xfs_agnumber_t agno,
2159 xfs_agino_t agino,
2160 struct xfs_buf *ibp,
2161 struct xfs_dinode *dip,
2162 struct xfs_imap *imap,
2163 xfs_agino_t next_agino)
2164{
2165 struct xfs_mount *mp = tp->t_mountp;
2166 int offset;
2167
2168 ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
2169
2170 trace_xfs_iunlink_update_dinode(mp, agno, agino,
2171 be32_to_cpu(dip->di_next_unlinked), next_agino);
2172
2173 dip->di_next_unlinked = cpu_to_be32(next_agino);
2174 offset = imap->im_boffset +
2175 offsetof(struct xfs_dinode, di_next_unlinked);
2176
2177 /* need to recalc the inode CRC if appropriate */
2178 xfs_dinode_calc_crc(mp, dip);
2179 xfs_trans_inode_buf(tp, ibp);
2180 xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
2181 xfs_inobp_check(mp, ibp);
2182}
2183
2184/* Set an in-core inode's unlinked pointer and return the old value. */
2185STATIC int
2186xfs_iunlink_update_inode(
2187 struct xfs_trans *tp,
2188 struct xfs_inode *ip,
2189 xfs_agnumber_t agno,
2190 xfs_agino_t next_agino,
2191 xfs_agino_t *old_next_agino)
2192{
2193 struct xfs_mount *mp = tp->t_mountp;
2194 struct xfs_dinode *dip;
2195 struct xfs_buf *ibp;
2196 xfs_agino_t old_value;
2197 int error;
2198
2199 ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
2200
2201 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0, 0);
2202 if (error)
2203 return error;
2204
2205 /* Make sure the old pointer isn't garbage. */
2206 old_value = be32_to_cpu(dip->di_next_unlinked);
2207 if (!xfs_verify_agino_or_null(mp, agno, old_value)) {
2208 error = -EFSCORRUPTED;
2209 goto out;
2210 }
2211
2212 /*
2213 * Since we're updating a linked list, we should never find that the
2214 * current pointer is the same as the new value, unless we're
2215 * terminating the list.
2216 */
2217 *old_next_agino = old_value;
2218 if (old_value == next_agino) {
2219 if (next_agino != NULLAGINO)
2220 error = -EFSCORRUPTED;
2221 goto out;
2222 }
2223
2224 /* Ok, update the new pointer. */
2225 xfs_iunlink_update_dinode(tp, agno, XFS_INO_TO_AGINO(mp, ip->i_ino),
2226 ibp, dip, &ip->i_imap, next_agino);
2227 return 0;
2228out:
2229 xfs_trans_brelse(tp, ibp);
2230 return error;
2231}
2232
2233/*
2234 * This is called when the inode's link count has gone to 0 or we are creating
2235 * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0.
1915 * 2236 *
1916 * We place the on-disk inode on a list in the AGI. It will be pulled from this 2237 * We place the on-disk inode on a list in the AGI. It will be pulled from this
1917 * list when the inode is freed. 2238 * list when the inode is freed.
1918 */ 2239 */
1919STATIC int 2240STATIC int
1920xfs_iunlink( 2241xfs_iunlink(
1921 struct xfs_trans *tp, 2242 struct xfs_trans *tp,
1922 struct xfs_inode *ip) 2243 struct xfs_inode *ip)
1923{ 2244{
1924 xfs_mount_t *mp = tp->t_mountp; 2245 struct xfs_mount *mp = tp->t_mountp;
1925 xfs_agi_t *agi; 2246 struct xfs_agi *agi;
1926 xfs_dinode_t *dip; 2247 struct xfs_buf *agibp;
1927 xfs_buf_t *agibp; 2248 xfs_agino_t next_agino;
1928 xfs_buf_t *ibp; 2249 xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
1929 xfs_agino_t agino; 2250 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1930 short bucket_index; 2251 short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
1931 int offset; 2252 int error;
1932 int error;
1933 2253
2254 ASSERT(VFS_I(ip)->i_nlink == 0);
1934 ASSERT(VFS_I(ip)->i_mode != 0); 2255 ASSERT(VFS_I(ip)->i_mode != 0);
2256 trace_xfs_iunlink(ip);
1935 2257
1936 /* 2258 /* Get the agi buffer first. It ensures lock ordering on the list. */
1937 * Get the agi buffer first. It ensures lock ordering 2259 error = xfs_read_agi(mp, tp, agno, &agibp);
1938 * on the list.
1939 */
1940 error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
1941 if (error) 2260 if (error)
1942 return error; 2261 return error;
1943 agi = XFS_BUF_TO_AGI(agibp); 2262 agi = XFS_BUF_TO_AGI(agibp);
1944 2263
1945 /* 2264 /*
1946 * Get the index into the agi hash table for the 2265 * Get the index into the agi hash table for the list this inode will
1947 * list this inode will go on. 2266 * go on. Make sure the pointer isn't garbage and that this inode
2267 * isn't already on the list.
1948 */ 2268 */
1949 agino = XFS_INO_TO_AGINO(mp, ip->i_ino); 2269 next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
1950 ASSERT(agino != 0); 2270 if (next_agino == agino ||
1951 bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; 2271 !xfs_verify_agino_or_null(mp, agno, next_agino))
1952 ASSERT(agi->agi_unlinked[bucket_index]); 2272 return -EFSCORRUPTED;
1953 ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino); 2273
2274 if (next_agino != NULLAGINO) {
2275 struct xfs_perag *pag;
2276 xfs_agino_t old_agino;
2277
2278 /*
2279 * There is already another inode in the bucket, so point this
2280 * inode to the current head of the list.
2281 */
2282 error = xfs_iunlink_update_inode(tp, ip, agno, next_agino,
2283 &old_agino);
2284 if (error)
2285 return error;
2286 ASSERT(old_agino == NULLAGINO);
1954 2287
1955 if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) {
1956 /* 2288 /*
1957 * There is already another inode in the bucket we need 2289 * agino has been unlinked, add a backref from the next inode
1958 * to add ourselves to. Add us at the front of the list. 2290 * back to agino.
1959 * Here we put the head pointer into our next pointer,
1960 * and then we fall through to point the head at us.
1961 */ 2291 */
1962 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 2292 pag = xfs_perag_get(mp, agno);
1963 0, 0); 2293 error = xfs_iunlink_add_backref(pag, agino, next_agino);
2294 xfs_perag_put(pag);
1964 if (error) 2295 if (error)
1965 return error; 2296 return error;
2297 }
2298
 2299 /* Point the head of the list at this inode. */
2300 return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, agino);
2301}
1966 2302
1967 ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO)); 2303/* Return the imap, dinode pointer, and buffer for an inode. */
1968 dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; 2304STATIC int
1969 offset = ip->i_imap.im_boffset + 2305xfs_iunlink_map_ino(
1970 offsetof(xfs_dinode_t, di_next_unlinked); 2306 struct xfs_trans *tp,
2307 xfs_agnumber_t agno,
2308 xfs_agino_t agino,
2309 struct xfs_imap *imap,
2310 struct xfs_dinode **dipp,
2311 struct xfs_buf **bpp)
2312{
2313 struct xfs_mount *mp = tp->t_mountp;
2314 int error;
1971 2315
1972 /* need to recalc the inode CRC if appropriate */ 2316 imap->im_blkno = 0;
1973 xfs_dinode_calc_crc(mp, dip); 2317 error = xfs_imap(mp, tp, XFS_AGINO_TO_INO(mp, agno, agino), imap, 0);
2318 if (error) {
2319 xfs_warn(mp, "%s: xfs_imap returned error %d.",
2320 __func__, error);
2321 return error;
2322 }
1974 2323
1975 xfs_trans_inode_buf(tp, ibp); 2324 error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0, 0);
1976 xfs_trans_log_buf(tp, ibp, offset, 2325 if (error) {
1977 (offset + sizeof(xfs_agino_t) - 1)); 2326 xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
1978 xfs_inobp_check(mp, ibp); 2327 __func__, error);
2328 return error;
2329 }
2330
2331 return 0;
2332}
2333
2334/*
2335 * Walk the unlinked chain from @head_agino until we find the inode that
2336 * points to @target_agino. Return the inode number, map, dinode pointer,
2337 * and inode cluster buffer of that inode as @agino, @imap, @dipp, and @bpp.
2338 *
2339 * @tp, @pag, @head_agino, and @target_agino are input parameters.
2340 * @agino, @imap, @dipp, and @bpp are all output parameters.
2341 *
2342 * Do not call this function if @target_agino is the head of the list.
2343 */
2344STATIC int
2345xfs_iunlink_map_prev(
2346 struct xfs_trans *tp,
2347 xfs_agnumber_t agno,
2348 xfs_agino_t head_agino,
2349 xfs_agino_t target_agino,
2350 xfs_agino_t *agino,
2351 struct xfs_imap *imap,
2352 struct xfs_dinode **dipp,
2353 struct xfs_buf **bpp,
2354 struct xfs_perag *pag)
2355{
2356 struct xfs_mount *mp = tp->t_mountp;
2357 xfs_agino_t next_agino;
2358 int error;
2359
2360 ASSERT(head_agino != target_agino);
2361 *bpp = NULL;
2362
2363 /* See if our backref cache can find it faster. */
2364 *agino = xfs_iunlink_lookup_backref(pag, target_agino);
2365 if (*agino != NULLAGINO) {
2366 error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp);
2367 if (error)
2368 return error;
2369
2370 if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino)
2371 return 0;
2372
2373 /*
2374 * If we get here the cache contents were corrupt, so drop the
2375 * buffer and fall back to walking the bucket list.
2376 */
2377 xfs_trans_brelse(tp, *bpp);
2378 *bpp = NULL;
2379 WARN_ON_ONCE(1);
2380 }
2381
2382 trace_xfs_iunlink_map_prev_fallback(mp, agno);
2383
2384 /* Otherwise, walk the entire bucket until we find it. */
2385 next_agino = head_agino;
2386 while (next_agino != target_agino) {
2387 xfs_agino_t unlinked_agino;
2388
2389 if (*bpp)
2390 xfs_trans_brelse(tp, *bpp);
2391
2392 *agino = next_agino;
2393 error = xfs_iunlink_map_ino(tp, agno, next_agino, imap, dipp,
2394 bpp);
2395 if (error)
2396 return error;
2397
2398 unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked);
2399 /*
 2400 * Make sure this pointer is valid and doesn't point back at
 2401 * itself (an obvious infinite loop).
2402 */
2403 if (!xfs_verify_agino(mp, agno, unlinked_agino) ||
2404 next_agino == unlinked_agino) {
2405 XFS_CORRUPTION_ERROR(__func__,
2406 XFS_ERRLEVEL_LOW, mp,
2407 *dipp, sizeof(**dipp));
2408 error = -EFSCORRUPTED;
2409 return error;
2410 }
2411 next_agino = unlinked_agino;
1979 } 2412 }
1980 2413
1981 /*
1982 * Point the bucket head pointer at the inode being inserted.
1983 */
1984 ASSERT(agino != 0);
1985 agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
1986 offset = offsetof(xfs_agi_t, agi_unlinked) +
1987 (sizeof(xfs_agino_t) * bucket_index);
1988 xfs_trans_log_buf(tp, agibp, offset,
1989 (offset + sizeof(xfs_agino_t) - 1));
1990 return 0; 2414 return 0;
1991} 2415}
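Summarizing the lookup strategy above, as this editor reads it:

	/*
	 * 1. Backref cache hit, and the dinode read back confirms the edge:
	 *    done in O(1).
	 * 2. Cache entry stale (the dinode disagrees): WARN_ON_ONCE, release
	 *    the buffer, and fall through to the slow path.
	 * 3. Cache miss, or XFS_ERRTAG_IUNLINK_FALLBACK injected at insert
	 *    time: walk the bucket from head_agino, validating every
	 *    next_unlinked pointer along the way.
	 */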
1992 2416
@@ -1995,181 +2419,106 @@ xfs_iunlink(
1995 */ 2419 */
1996STATIC int 2420STATIC int
1997xfs_iunlink_remove( 2421xfs_iunlink_remove(
1998 xfs_trans_t *tp, 2422 struct xfs_trans *tp,
1999 xfs_inode_t *ip) 2423 struct xfs_inode *ip)
2000{ 2424{
2001 xfs_ino_t next_ino; 2425 struct xfs_mount *mp = tp->t_mountp;
2002 xfs_mount_t *mp; 2426 struct xfs_agi *agi;
2003 xfs_agi_t *agi; 2427 struct xfs_buf *agibp;
2004 xfs_dinode_t *dip; 2428 struct xfs_buf *last_ibp;
2005 xfs_buf_t *agibp; 2429 struct xfs_dinode *last_dip = NULL;
2006 xfs_buf_t *ibp; 2430 struct xfs_perag *pag = NULL;
2007 xfs_agnumber_t agno; 2431 xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
2008 xfs_agino_t agino; 2432 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
2009 xfs_agino_t next_agino; 2433 xfs_agino_t next_agino;
2010 xfs_buf_t *last_ibp; 2434 xfs_agino_t head_agino;
2011 xfs_dinode_t *last_dip = NULL; 2435 short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
2012 short bucket_index; 2436 int error;
2013 int offset, last_offset = 0;
2014 int error;
2015 2437
2016 mp = tp->t_mountp; 2438 trace_xfs_iunlink_remove(ip);
2017 agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
2018 2439
2019 /* 2440 /* Get the agi buffer first. It ensures lock ordering on the list. */
2020 * Get the agi buffer first. It ensures lock ordering
2021 * on the list.
2022 */
2023 error = xfs_read_agi(mp, tp, agno, &agibp); 2441 error = xfs_read_agi(mp, tp, agno, &agibp);
2024 if (error) 2442 if (error)
2025 return error; 2443 return error;
2026
2027 agi = XFS_BUF_TO_AGI(agibp); 2444 agi = XFS_BUF_TO_AGI(agibp);
2028 2445
2029 /* 2446 /*
2030 * Get the index into the agi hash table for the 2447 * Get the index into the agi hash table for the list this inode will
2031 * list this inode will go on. 2448 * go on. Make sure the head pointer isn't garbage.
2032 */ 2449 */
2033 agino = XFS_INO_TO_AGINO(mp, ip->i_ino); 2450 head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
2034 if (!xfs_verify_agino(mp, agno, agino)) 2451 if (!xfs_verify_agino(mp, agno, head_agino)) {
2035 return -EFSCORRUPTED;
2036 bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
2037 if (!xfs_verify_agino(mp, agno,
2038 be32_to_cpu(agi->agi_unlinked[bucket_index]))) {
2039 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, 2452 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
2040 agi, sizeof(*agi)); 2453 agi, sizeof(*agi));
2041 return -EFSCORRUPTED; 2454 return -EFSCORRUPTED;
2042 } 2455 }
2043 2456
2044 if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) { 2457 /*
2045 /* 2458 * Set our inode's next_unlinked pointer to NULL and then return
2046 * We're at the head of the list. Get the inode's on-disk 2459 * the old pointer value so that we can update whatever was previous
2047 * buffer to see if there is anyone after us on the list. 2460 * to us in the list to point to whatever was next in the list.
2048 * Only modify our next pointer if it is not already NULLAGINO. 2461 */
2049 * This saves us the overhead of dealing with the buffer when 2462 error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO, &next_agino);
2050 * there is no need to change it. 2463 if (error)
2051 */ 2464 return error;
2052 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
2053 0, 0);
2054 if (error) {
2055 xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
2056 __func__, error);
2057 return error;
2058 }
2059 next_agino = be32_to_cpu(dip->di_next_unlinked);
2060 ASSERT(next_agino != 0);
2061 if (next_agino != NULLAGINO) {
2062 dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
2063 offset = ip->i_imap.im_boffset +
2064 offsetof(xfs_dinode_t, di_next_unlinked);
2065
2066 /* need to recalc the inode CRC if appropriate */
2067 xfs_dinode_calc_crc(mp, dip);
2068
2069 xfs_trans_inode_buf(tp, ibp);
2070 xfs_trans_log_buf(tp, ibp, offset,
2071 (offset + sizeof(xfs_agino_t) - 1));
2072 xfs_inobp_check(mp, ibp);
2073 } else {
2074 xfs_trans_brelse(tp, ibp);
2075 }
2076 /*
2077 * Point the bucket head pointer at the next inode.
2078 */
2079 ASSERT(next_agino != 0);
2080 ASSERT(next_agino != agino);
2081 agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
2082 offset = offsetof(xfs_agi_t, agi_unlinked) +
2083 (sizeof(xfs_agino_t) * bucket_index);
2084 xfs_trans_log_buf(tp, agibp, offset,
2085 (offset + sizeof(xfs_agino_t) - 1));
2086 } else {
2087 /*
2088 * We need to search the list for the inode being freed.
2089 */
2090 next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
2091 last_ibp = NULL;
2092 while (next_agino != agino) {
2093 struct xfs_imap imap;
2094 2465
2095 if (last_ibp) 2466 /*
2096 xfs_trans_brelse(tp, last_ibp); 2467 * If there was a backref pointing from the next inode back to this
2468 * one, remove it because we've removed this inode from the list.
2469 *
2470 * Later, if this inode was in the middle of the list we'll update
2471 * this inode's backref to point from the next inode.
2472 */
2473 if (next_agino != NULLAGINO) {
2474 pag = xfs_perag_get(mp, agno);
2475 error = xfs_iunlink_change_backref(pag, next_agino,
2476 NULLAGINO);
2477 if (error)
2478 goto out;
2479 }
2097 2480
2098 imap.im_blkno = 0; 2481 if (head_agino == agino) {
2099 next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino); 2482 /* Point the head of the list to the next unlinked inode. */
2483 error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
2484 next_agino);
2485 if (error)
2486 goto out;
2487 } else {
2488 struct xfs_imap imap;
2489 xfs_agino_t prev_agino;
2100 2490
2101 error = xfs_imap(mp, tp, next_ino, &imap, 0); 2491 if (!pag)
2102 if (error) { 2492 pag = xfs_perag_get(mp, agno);
2103 xfs_warn(mp,
2104 "%s: xfs_imap returned error %d.",
2105 __func__, error);
2106 return error;
2107 }
2108 2493
2109 error = xfs_imap_to_bp(mp, tp, &imap, &last_dip, 2494 /* We need to search the list for the inode being freed. */
2110 &last_ibp, 0, 0); 2495 error = xfs_iunlink_map_prev(tp, agno, head_agino, agino,
2111 if (error) { 2496 &prev_agino, &imap, &last_dip, &last_ibp,
2112 xfs_warn(mp, 2497 pag);
2113 "%s: xfs_imap_to_bp returned error %d.", 2498 if (error)
2114 __func__, error); 2499 goto out;
2115 return error;
2116 }
2117 2500
2118 last_offset = imap.im_boffset; 2501 /* Point the previous inode on the list to the next inode. */
2119 next_agino = be32_to_cpu(last_dip->di_next_unlinked); 2502 xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp,
2120 if (!xfs_verify_agino(mp, agno, next_agino)) { 2503 last_dip, &imap, next_agino);
2121 XFS_CORRUPTION_ERROR(__func__,
2122 XFS_ERRLEVEL_LOW, mp,
2123 last_dip, sizeof(*last_dip));
2124 return -EFSCORRUPTED;
2125 }
2126 }
2127 2504
2128 /* 2505 /*
2129 * Now last_ibp points to the buffer previous to us on the 2506 * Now we deal with the backref for this inode. If this inode
2130 * unlinked list. Pull us from the list. 2507 * pointed at a real inode, change the backref that pointed to
2508 * us to point to our old next. If this inode was the end of
2509 * the list, delete the backref that pointed to us. Note that
2510 * change_backref takes care of deleting the backref if
2511 * next_agino is NULLAGINO.
2131 */ 2512 */
2132 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 2513 error = xfs_iunlink_change_backref(pag, agino, next_agino);
2133 0, 0); 2514 if (error)
2134 if (error) { 2515 goto out;
2135 xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
2136 __func__, error);
2137 return error;
2138 }
2139 next_agino = be32_to_cpu(dip->di_next_unlinked);
2140 ASSERT(next_agino != 0);
2141 ASSERT(next_agino != agino);
2142 if (next_agino != NULLAGINO) {
2143 dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
2144 offset = ip->i_imap.im_boffset +
2145 offsetof(xfs_dinode_t, di_next_unlinked);
2146
2147 /* need to recalc the inode CRC if appropriate */
2148 xfs_dinode_calc_crc(mp, dip);
2149
2150 xfs_trans_inode_buf(tp, ibp);
2151 xfs_trans_log_buf(tp, ibp, offset,
2152 (offset + sizeof(xfs_agino_t) - 1));
2153 xfs_inobp_check(mp, ibp);
2154 } else {
2155 xfs_trans_brelse(tp, ibp);
2156 }
2157 /*
2158 * Point the previous inode on the list to the next inode.
2159 */
2160 last_dip->di_next_unlinked = cpu_to_be32(next_agino);
2161 ASSERT(next_agino != 0);
2162 offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
2163
2164 /* need to recalc the inode CRC if appropriate */
2165 xfs_dinode_calc_crc(mp, last_dip);
2166
2167 xfs_trans_inode_buf(tp, last_ibp);
2168 xfs_trans_log_buf(tp, last_ibp, offset,
2169 (offset + sizeof(xfs_agino_t) - 1));
2170 xfs_inobp_check(mp, last_ibp);
2171 } 2516 }
2172 return 0; 2517
2518out:
2519 if (pag)
2520 xfs_perag_put(pag);
2521 return error;
2173} 2522}
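To see how the pieces above combine, consider a hedged worked example; the chain and names are invented:

	/*
	 * Chain:  bucket -> A -> B -> C -> NULLAGINO
	 *
	 *   remove A (head):   the bucket is repointed at B and the cache
	 *                      entry recording "A.next = B" is dropped.
	 *   remove B (middle): the backref cache finds predecessor A without
	 *                      a list walk, A.next is rewritten to C on disk,
	 *                      and the cache is fixed up so lookup(C) now
	 *                      yields A.
	 *   remove C (tail):   B.next is rewritten to NULLAGINO and the
	 *                      entry recording "B.next = C" is dropped.
	 */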
2174 2523
2175/* 2524/*
@@ -2833,11 +3182,9 @@ xfs_rename_alloc_whiteout(
2833 3182
2834 /* 3183 /*
2835 * Prepare the tmpfile inode as if it were created through the VFS. 3184 * Prepare the tmpfile inode as if it were created through the VFS.
2836 * Otherwise, the link increment paths will complain about nlink 0->1. 3185 * Complete the inode setup and flag it as linkable. nlink is already
2837 * Drop the link count as done by d_tmpfile(), complete the inode setup 3186 * zero, so we can skip the drop_nlink.
2838 * and flag it as linkable.
2839 */ 3187 */
2840 drop_nlink(VFS_I(tmpfile));
2841 xfs_setup_iops(tmpfile); 3188 xfs_setup_iops(tmpfile);
2842 xfs_finish_inode_setup(tmpfile); 3189 xfs_finish_inode_setup(tmpfile);
2843 VFS_I(tmpfile)->i_state |= I_LINKABLE; 3190 VFS_I(tmpfile)->i_state |= I_LINKABLE;
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index be2014520155..e62074a5257c 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -500,4 +500,7 @@ extern struct kmem_zone *xfs_inode_zone;
500 500
501bool xfs_inode_verify_forks(struct xfs_inode *ip); 501bool xfs_inode_verify_forks(struct xfs_inode *ip);
502 502
503int xfs_iunlink_init(struct xfs_perag *pag);
504void xfs_iunlink_destroy(struct xfs_perag *pag);
505
503#endif /* __XFS_INODE_H__ */ 506#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 27c93b5f029d..63d323916bba 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -35,18 +35,40 @@
35#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ 35#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
36 << mp->m_writeio_log) 36 << mp->m_writeio_log)
37 37
38void 38static int
39xfs_alert_fsblock_zero(
40 xfs_inode_t *ip,
41 xfs_bmbt_irec_t *imap)
42{
43 xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
44 "Access to block zero in inode %llu "
45 "start_block: %llx start_off: %llx "
46 "blkcnt: %llx extent-state: %x",
47 (unsigned long long)ip->i_ino,
48 (unsigned long long)imap->br_startblock,
49 (unsigned long long)imap->br_startoff,
50 (unsigned long long)imap->br_blockcount,
51 imap->br_state);
52 return -EFSCORRUPTED;
53}
54
55int
39xfs_bmbt_to_iomap( 56xfs_bmbt_to_iomap(
40 struct xfs_inode *ip, 57 struct xfs_inode *ip,
41 struct iomap *iomap, 58 struct iomap *iomap,
42 struct xfs_bmbt_irec *imap) 59 struct xfs_bmbt_irec *imap,
60 bool shared)
43{ 61{
44 struct xfs_mount *mp = ip->i_mount; 62 struct xfs_mount *mp = ip->i_mount;
45 63
64 if (unlikely(!imap->br_startblock && !XFS_IS_REALTIME_INODE(ip)))
65 return xfs_alert_fsblock_zero(ip, imap);
66
46 if (imap->br_startblock == HOLESTARTBLOCK) { 67 if (imap->br_startblock == HOLESTARTBLOCK) {
47 iomap->addr = IOMAP_NULL_ADDR; 68 iomap->addr = IOMAP_NULL_ADDR;
48 iomap->type = IOMAP_HOLE; 69 iomap->type = IOMAP_HOLE;
49 } else if (imap->br_startblock == DELAYSTARTBLOCK) { 70 } else if (imap->br_startblock == DELAYSTARTBLOCK ||
71 isnullstartblock(imap->br_startblock)) {
50 iomap->addr = IOMAP_NULL_ADDR; 72 iomap->addr = IOMAP_NULL_ADDR;
51 iomap->type = IOMAP_DELALLOC; 73 iomap->type = IOMAP_DELALLOC;
52 } else { 74 } else {
@@ -60,6 +82,13 @@ xfs_bmbt_to_iomap(
60 iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); 82 iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
61 iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); 83 iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
62 iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip)); 84 iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip));
85
86 if (xfs_ipincount(ip) &&
87 (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
88 iomap->flags |= IOMAP_F_DIRTY;
89 if (shared)
90 iomap->flags |= IOMAP_F_SHARED;
91 return 0;
63} 92}
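Because the helper now returns an error and takes @shared directly, call sites collapse to the pattern below; the real conversions are visible in the later hunks of this patch:

	error = xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
	if (error)
		return error;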
64 93
65static void 94static void
@@ -138,23 +167,6 @@ xfs_iomap_eof_align_last_fsb(
138 return 0; 167 return 0;
139} 168}
140 169
141STATIC int
142xfs_alert_fsblock_zero(
143 xfs_inode_t *ip,
144 xfs_bmbt_irec_t *imap)
145{
146 xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
147 "Access to block zero in inode %llu "
148 "start_block: %llx start_off: %llx "
149 "blkcnt: %llx extent-state: %x",
150 (unsigned long long)ip->i_ino,
151 (unsigned long long)imap->br_startblock,
152 (unsigned long long)imap->br_startoff,
153 (unsigned long long)imap->br_blockcount,
154 imap->br_state);
155 return -EFSCORRUPTED;
156}
157
158int 170int
159xfs_iomap_write_direct( 171xfs_iomap_write_direct(
160 xfs_inode_t *ip, 172 xfs_inode_t *ip,
@@ -383,12 +395,13 @@ xfs_quota_calc_throttle(
383STATIC xfs_fsblock_t 395STATIC xfs_fsblock_t
384xfs_iomap_prealloc_size( 396xfs_iomap_prealloc_size(
385 struct xfs_inode *ip, 397 struct xfs_inode *ip,
398 int whichfork,
386 loff_t offset, 399 loff_t offset,
387 loff_t count, 400 loff_t count,
388 struct xfs_iext_cursor *icur) 401 struct xfs_iext_cursor *icur)
389{ 402{
390 struct xfs_mount *mp = ip->i_mount; 403 struct xfs_mount *mp = ip->i_mount;
391 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); 404 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
392 xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); 405 xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
393 struct xfs_bmbt_irec prev; 406 struct xfs_bmbt_irec prev;
394 int shift = 0; 407 int shift = 0;
@@ -522,15 +535,16 @@ xfs_file_iomap_begin_delay(
522{ 535{
523 struct xfs_inode *ip = XFS_I(inode); 536 struct xfs_inode *ip = XFS_I(inode);
524 struct xfs_mount *mp = ip->i_mount; 537 struct xfs_mount *mp = ip->i_mount;
525 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
526 xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); 538 xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
527 xfs_fileoff_t maxbytes_fsb = 539 xfs_fileoff_t maxbytes_fsb =
528 XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); 540 XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
529 xfs_fileoff_t end_fsb; 541 xfs_fileoff_t end_fsb;
530 int error = 0, eof = 0; 542 struct xfs_bmbt_irec imap, cmap;
531 struct xfs_bmbt_irec got; 543 struct xfs_iext_cursor icur, ccur;
532 struct xfs_iext_cursor icur;
533 xfs_fsblock_t prealloc_blocks = 0; 544 xfs_fsblock_t prealloc_blocks = 0;
545 bool eof = false, cow_eof = false, shared = false;
546 int whichfork = XFS_DATA_FORK;
547 int error = 0;
534 548
535 ASSERT(!XFS_IS_REALTIME_INODE(ip)); 549 ASSERT(!XFS_IS_REALTIME_INODE(ip));
536 ASSERT(!xfs_get_extsz_hint(ip)); 550 ASSERT(!xfs_get_extsz_hint(ip));
@@ -548,7 +562,7 @@ xfs_file_iomap_begin_delay(
548 562
549 XFS_STATS_INC(mp, xs_blk_mapw); 563 XFS_STATS_INC(mp, xs_blk_mapw);
550 564
551 if (!(ifp->if_flags & XFS_IFEXTENTS)) { 565 if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
552 error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); 566 error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
553 if (error) 567 if (error)
554 goto out_unlock; 568 goto out_unlock;
@@ -556,53 +570,101 @@ xfs_file_iomap_begin_delay(
556 570
557 end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); 571 end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
558 572
559 eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got); 573 /*
 574 * Search the data fork first to look up our source mapping. We
575 * always need the data fork map, as we have to return it to the
576 * iomap code so that the higher level write code can read data in to
577 * perform read-modify-write cycles for unaligned writes.
578 */
579 eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
560 if (eof) 580 if (eof)
561 got.br_startoff = end_fsb; /* fake hole until the end */ 581 imap.br_startoff = end_fsb; /* fake hole until the end */
582
583 /* We never need to allocate blocks for zeroing a hole. */
584 if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
585 xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
586 goto out_unlock;
587 }
562 588
563 if (got.br_startoff <= offset_fsb) { 589 /*
590 * Search the COW fork extent list even if we did not find a data fork
 591 * extent. This serves two purposes: first, it implements the
 592 * speculative preallocation using cowextsize, so that we also unshare
 593 * blocks adjacent to shared blocks instead of just the shared blocks
 594 * themselves. Second, the lookup in the extent list is generally faster
595 * than going out to the shared extent tree.
596 */
597 if (xfs_is_cow_inode(ip)) {
598 if (!ip->i_cowfp) {
599 ASSERT(!xfs_is_reflink_inode(ip));
600 xfs_ifork_init_cow(ip);
601 }
602 cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
603 &ccur, &cmap);
604 if (!cow_eof && cmap.br_startoff <= offset_fsb) {
605 trace_xfs_reflink_cow_found(ip, &cmap);
606 whichfork = XFS_COW_FORK;
607 goto done;
608 }
609 }
610
611 if (imap.br_startoff <= offset_fsb) {
564 /* 612 /*
565 * For reflink files we may need a delalloc reservation when 613 * For reflink files we may need a delalloc reservation when
566 * overwriting shared extents. This includes zeroing of 614 * overwriting shared extents. This includes zeroing of
567 * existing extents that contain data. 615 * existing extents that contain data.
568 */ 616 */
569 if (xfs_is_reflink_inode(ip) && 617 if (!xfs_is_cow_inode(ip) ||
570 ((flags & IOMAP_WRITE) || 618 ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
571 got.br_state != XFS_EXT_UNWRITTEN)) { 619 trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
572 xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb); 620 &imap);
573 error = xfs_reflink_reserve_cow(ip, &got); 621 goto done;
574 if (error)
575 goto out_unlock;
576 } 622 }
577 623
578 trace_xfs_iomap_found(ip, offset, count, 0, &got); 624 xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
579 goto done;
580 }
581 625
582 if (flags & IOMAP_ZERO) { 626 /* Trim the mapping to the nearest shared extent boundary. */
583 xfs_hole_to_iomap(ip, iomap, offset_fsb, got.br_startoff); 627 error = xfs_inode_need_cow(ip, &imap, &shared);
584 goto out_unlock; 628 if (error)
629 goto out_unlock;
630
631 /* Not shared? Just report the (potentially capped) extent. */
632 if (!shared) {
633 trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
634 &imap);
635 goto done;
636 }
637
638 /*
639 * Fork all the shared blocks from our write offset until the
640 * end of the extent.
641 */
642 whichfork = XFS_COW_FORK;
643 end_fsb = imap.br_startoff + imap.br_blockcount;
644 } else {
645 /*
646 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
 647 * pages to keep the chunks of work done here somewhat
 648 * symmetric with the work writeback does. This is a completely
 649 * arbitrary number pulled out of thin air.
 650 *
 651 * Note that the values need to be less than 32 bits wide until
652 * the lower level functions are updated.
653 */
654 count = min_t(loff_t, count, 1024 * PAGE_SIZE);
655 end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
656
657 if (xfs_is_always_cow_inode(ip))
658 whichfork = XFS_COW_FORK;
585 } 659 }
586 660
587 error = xfs_qm_dqattach_locked(ip, false); 661 error = xfs_qm_dqattach_locked(ip, false);
588 if (error) 662 if (error)
589 goto out_unlock; 663 goto out_unlock;
590 664
591 /*
592 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages
 593 * to keep the chunks of work done here somewhat symmetric with the
594 * work writeback does. This is a completely arbitrary number pulled
595 * out of thin air as a best guess for initial testing.
596 *
 597 * Note that the values need to be less than 32 bits wide until
598 * the lower level functions are updated.
599 */
600 count = min_t(loff_t, count, 1024 * PAGE_SIZE);
601 end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
602
603 if (eof) { 665 if (eof) {
604 prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count, 666 prealloc_blocks = xfs_iomap_prealloc_size(ip, whichfork, offset,
605 &icur); 667 count, &icur);
606 if (prealloc_blocks) { 668 if (prealloc_blocks) {
607 xfs_extlen_t align; 669 xfs_extlen_t align;
608 xfs_off_t end_offset; 670 xfs_off_t end_offset;
@@ -623,9 +685,11 @@ xfs_file_iomap_begin_delay(
623 } 685 }
624 686
625retry: 687retry:
626 error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb, 688 error = xfs_bmapi_reserve_delalloc(ip, whichfork, offset_fsb,
627 end_fsb - offset_fsb, prealloc_blocks, &got, &icur, 689 end_fsb - offset_fsb, prealloc_blocks,
628 eof); 690 whichfork == XFS_DATA_FORK ? &imap : &cmap,
691 whichfork == XFS_DATA_FORK ? &icur : &ccur,
692 whichfork == XFS_DATA_FORK ? eof : cow_eof);
629 switch (error) { 693 switch (error) {
630 case 0: 694 case 0:
631 break; 695 break;
@@ -647,186 +711,22 @@ retry:
647 * them out if the write happens to fail. 711 * them out if the write happens to fail.
648 */ 712 */
649 iomap->flags |= IOMAP_F_NEW; 713 iomap->flags |= IOMAP_F_NEW;
650 trace_xfs_iomap_alloc(ip, offset, count, 0, &got); 714 trace_xfs_iomap_alloc(ip, offset, count, whichfork,
715 whichfork == XFS_DATA_FORK ? &imap : &cmap);
651done: 716done:
652 if (isnullstartblock(got.br_startblock)) 717 if (whichfork == XFS_COW_FORK) {
653 got.br_startblock = DELAYSTARTBLOCK; 718 if (imap.br_startoff > offset_fsb) {
654 719 xfs_trim_extent(&cmap, offset_fsb,
655 if (!got.br_startblock) { 720 imap.br_startoff - offset_fsb);
656 error = xfs_alert_fsblock_zero(ip, &got); 721 error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
657 if (error)
658 goto out_unlock; 722 goto out_unlock;
659 }
660
661 xfs_bmbt_to_iomap(ip, iomap, &got);
662
663out_unlock:
664 xfs_iunlock(ip, XFS_ILOCK_EXCL);
665 return error;
666}
667
668/*
669 * Pass in a delayed allocate extent, convert it to real extents;
670 * return to the caller the extent we create which maps on top of
671 * the originating callers request.
672 *
673 * Called without a lock on the inode.
674 *
675 * We no longer bother to look at the incoming map - all we have to
676 * guarantee is that whatever we allocate fills the required range.
677 */
678int
679xfs_iomap_write_allocate(
680 xfs_inode_t *ip,
681 int whichfork,
682 xfs_off_t offset,
683 xfs_bmbt_irec_t *imap,
684 unsigned int *cow_seq)
685{
686 xfs_mount_t *mp = ip->i_mount;
687 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
688 xfs_fileoff_t offset_fsb, last_block;
689 xfs_fileoff_t end_fsb, map_start_fsb;
690 xfs_filblks_t count_fsb;
691 xfs_trans_t *tp;
692 int nimaps;
693 int error = 0;
694 int flags = XFS_BMAPI_DELALLOC;
695 int nres;
696
697 if (whichfork == XFS_COW_FORK)
698 flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC;
699
700 /*
701 * Make sure that the dquots are there.
702 */
703 error = xfs_qm_dqattach(ip);
704 if (error)
705 return error;
706
707 offset_fsb = XFS_B_TO_FSBT(mp, offset);
708 count_fsb = imap->br_blockcount;
709 map_start_fsb = imap->br_startoff;
710
711 XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
712
713 while (count_fsb != 0) {
714 /*
715 * Set up a transaction with which to allocate the
716 * backing store for the file. Do allocations in a
717 * loop until we get some space in the range we are
718 * interested in. The other space that might be allocated
719 * is in the delayed allocation extent on which we sit
720 * but before our buffer starts.
721 */
722 nimaps = 0;
723 while (nimaps == 0) {
724 nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
725 /*
726 * We have already reserved space for the extent and any
727 * indirect blocks when creating the delalloc extent,
728 * there is no need to reserve space in this transaction
729 * again.
730 */
731 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0,
732 0, XFS_TRANS_RESERVE, &tp);
733 if (error)
734 return error;
735
736 xfs_ilock(ip, XFS_ILOCK_EXCL);
737 xfs_trans_ijoin(tp, ip, 0);
738
739 /*
740 * it is possible that the extents have changed since
741 * we did the read call as we dropped the ilock for a
742 * while. We have to be careful about truncates or hole
 743 * punches here - we are not allowed to allocate
744 * non-delalloc blocks here.
745 *
746 * The only protection against truncation is the pages
747 * for the range we are being asked to convert are
748 * locked and hence a truncate will block on them
749 * first.
750 *
751 * As a result, if we go beyond the range we really
 752 * need and hit a delalloc extent boundary followed by
753 * a hole while we have excess blocks in the map, we
754 * will fill the hole incorrectly and overrun the
755 * transaction reservation.
756 *
757 * Using a single map prevents this as we are forced to
758 * check each map we look for overlap with the desired
759 * range and abort as soon as we find it. Also, given
760 * that we only return a single map, having one beyond
761 * what we can return is probably a bit silly.
762 *
763 * We also need to check that we don't go beyond EOF;
764 * this is a truncate optimisation as a truncate sets
 765 * the new file size before blocking on the pages we
766 * currently have locked under writeback. Because they
767 * are about to be tossed, we don't need to write them
768 * back....
769 */
770 nimaps = 1;
771 end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
772 error = xfs_bmap_last_offset(ip, &last_block,
773 XFS_DATA_FORK);
774 if (error)
775 goto trans_cancel;
776
777 last_block = XFS_FILEOFF_MAX(last_block, end_fsb);
778 if ((map_start_fsb + count_fsb) > last_block) {
779 count_fsb = last_block - map_start_fsb;
780 if (count_fsb == 0) {
781 error = -EAGAIN;
782 goto trans_cancel;
783 }
784 }
785
786 /*
787 * From this point onwards we overwrite the imap
788 * pointer that the caller gave to us.
789 */
790 error = xfs_bmapi_write(tp, ip, map_start_fsb,
791 count_fsb, flags, nres, imap,
792 &nimaps);
793 if (error)
794 goto trans_cancel;
795
796 error = xfs_trans_commit(tp);
797 if (error)
798 goto error0;
799
800 if (whichfork == XFS_COW_FORK)
801 *cow_seq = READ_ONCE(ifp->if_seq);
802 xfs_iunlock(ip, XFS_ILOCK_EXCL);
803 }
804
805 /*
806 * See if we were able to allocate an extent that
807 * covers at least part of the callers request
808 */
809 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
810 return xfs_alert_fsblock_zero(ip, imap);
811
812 if ((offset_fsb >= imap->br_startoff) &&
813 (offset_fsb < (imap->br_startoff +
814 imap->br_blockcount))) {
815 XFS_STATS_INC(mp, xs_xstrat_quick);
816 return 0;
817 } 723 }
818 724 /* ensure we only report blocks we have a reservation for */
819 /* 725 xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount);
820 * So far we have not mapped the requested part of the 726 shared = true;
821 * file, just surrounding data, try again.
822 */
823 count_fsb -= imap->br_blockcount;
824 map_start_fsb = imap->br_startoff + imap->br_blockcount;
825 } 727 }
826 728 error = xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
827trans_cancel: 729out_unlock:
828 xfs_trans_cancel(tp);
829error0:
830 xfs_iunlock(ip, XFS_ILOCK_EXCL); 730 xfs_iunlock(ip, XFS_ILOCK_EXCL);
831 return error; 731 return error;
832} 732}
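A rough decision table for the fork selection in the rewritten xfs_file_iomap_begin_delay() above (an editor's summary, not text from the patch):

	/*
	 * zeroing over a hole                   -> report the hole, no alloc
	 * COW fork extent already covers offset -> report it (XFS_COW_FORK)
	 * data extent found, not shared         -> report the data mapping
	 * data extent found, shared             -> reserve delalloc in the
	 *                                          COW fork and report that
	 * hole, always_cow filesystem           -> delalloc in the COW fork
	 * hole otherwise                        -> delalloc in the data fork
	 */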
@@ -975,7 +875,7 @@ xfs_ilock_for_iomap(
975 * COW writes may allocate delalloc space or convert unwritten COW 875 * COW writes may allocate delalloc space or convert unwritten COW
976 * extents, so we need to make sure to take the lock exclusively here. 876 * extents, so we need to make sure to take the lock exclusively here.
977 */ 877 */
978 if (xfs_is_reflink_inode(ip) && is_write) { 878 if (xfs_is_cow_inode(ip) && is_write) {
979 /* 879 /*
980 * FIXME: It could still overwrite on unshared extents and not 880 * FIXME: It could still overwrite on unshared extents and not
981 * need allocation. 881 * need allocation.
@@ -1009,7 +909,7 @@ relock:
 1009 * check, so if we got ILOCK_SHARED for a write but we're now a 909 * check, so if we got ILOCK_SHARED for a write but we're now a
1010 * reflink inode we have to switch to ILOCK_EXCL and relock. 910 * reflink inode we have to switch to ILOCK_EXCL and relock.
1011 */ 911 */
1012 if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_reflink_inode(ip)) { 912 if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_cow_inode(ip)) {
1013 xfs_iunlock(ip, mode); 913 xfs_iunlock(ip, mode);
1014 mode = XFS_ILOCK_EXCL; 914 mode = XFS_ILOCK_EXCL;
1015 goto relock; 915 goto relock;
@@ -1081,23 +981,33 @@ xfs_file_iomap_begin(
1081 * Break shared extents if necessary. Checks for non-blocking IO have 981 * Break shared extents if necessary. Checks for non-blocking IO have
1082 * been done up front, so we don't need to do them here. 982 * been done up front, so we don't need to do them here.
1083 */ 983 */
1084 if (xfs_is_reflink_inode(ip)) { 984 if (xfs_is_cow_inode(ip)) {
985 struct xfs_bmbt_irec cmap;
986 bool directio = (flags & IOMAP_DIRECT);
987
1085 /* if zeroing doesn't need COW allocation, then we are done. */ 988 /* if zeroing doesn't need COW allocation, then we are done. */
1086 if ((flags & IOMAP_ZERO) && 989 if ((flags & IOMAP_ZERO) &&
1087 !needs_cow_for_zeroing(&imap, nimaps)) 990 !needs_cow_for_zeroing(&imap, nimaps))
1088 goto out_found; 991 goto out_found;
1089 992
1090 if (flags & IOMAP_DIRECT) { 993 /* may drop and re-acquire the ilock */
1091 /* may drop and re-acquire the ilock */ 994 cmap = imap;
1092 error = xfs_reflink_allocate_cow(ip, &imap, &shared, 995 error = xfs_reflink_allocate_cow(ip, &cmap, &shared, &lockmode,
1093 &lockmode); 996 directio);
1094 if (error) 997 if (error)
1095 goto out_unlock; 998 goto out_unlock;
1096 } else { 999
1097 error = xfs_reflink_reserve_cow(ip, &imap); 1000 /*
1098 if (error) 1001 * For buffered writes we need to report the address of the
1099 goto out_unlock; 1002 * previous block (if there was any) so that the higher level
1100 } 1003 * write code can perform read-modify-write operations; we
1004 * won't need the CoW fork mapping until writeback. For direct
1005 * I/O, which must be block aligned, we need to report the
1006 * newly allocated address. If the data fork has a hole, copy
1007 * the COW fork mapping to avoid allocating to the data fork.
1008 */
1009 if (directio || imap.br_startblock == HOLESTARTBLOCK)
1010 imap = cmap;
1101 1011
1102 end_fsb = imap.br_startoff + imap.br_blockcount; 1012 end_fsb = imap.br_startoff + imap.br_blockcount;
1103 length = XFS_FSB_TO_B(mp, end_fsb) - offset; 1013 length = XFS_FSB_TO_B(mp, end_fsb) - offset;
@@ -1139,23 +1049,15 @@ xfs_file_iomap_begin(
1139 return error; 1049 return error;
1140 1050
1141 iomap->flags |= IOMAP_F_NEW; 1051 iomap->flags |= IOMAP_F_NEW;
1142 trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); 1052 trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
1143 1053
1144out_finish: 1054out_finish:
1145 if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields 1055 return xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
1146 & ~XFS_ILOG_TIMESTAMP))
1147 iomap->flags |= IOMAP_F_DIRTY;
1148
1149 xfs_bmbt_to_iomap(ip, iomap, &imap);
1150
1151 if (shared)
1152 iomap->flags |= IOMAP_F_SHARED;
1153 return 0;
1154 1056
1155out_found: 1057out_found:
1156 ASSERT(nimaps); 1058 ASSERT(nimaps);
1157 xfs_iunlock(ip, lockmode); 1059 xfs_iunlock(ip, lockmode);
1158 trace_xfs_iomap_found(ip, offset, length, 0, &imap); 1060 trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
1159 goto out_finish; 1061 goto out_finish;
1160 1062
1161out_unlock: 1063out_unlock:
@@ -1241,6 +1143,92 @@ const struct iomap_ops xfs_iomap_ops = {
1241}; 1143};
1242 1144
1243static int 1145static int
1146xfs_seek_iomap_begin(
1147 struct inode *inode,
1148 loff_t offset,
1149 loff_t length,
1150 unsigned flags,
1151 struct iomap *iomap)
1152{
1153 struct xfs_inode *ip = XFS_I(inode);
1154 struct xfs_mount *mp = ip->i_mount;
1155 xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
1156 xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length);
1157 xfs_fileoff_t cow_fsb = NULLFILEOFF, data_fsb = NULLFILEOFF;
1158 struct xfs_iext_cursor icur;
1159 struct xfs_bmbt_irec imap, cmap;
1160 int error = 0;
1161 unsigned lockmode;
1162
1163 if (XFS_FORCED_SHUTDOWN(mp))
1164 return -EIO;
1165
1166 lockmode = xfs_ilock_data_map_shared(ip);
1167 if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
1168 error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
1169 if (error)
1170 goto out_unlock;
1171 }
1172
1173 if (xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) {
1174 /*
1175 * If we found a data extent we are done.
1176 */
1177 if (imap.br_startoff <= offset_fsb)
1178 goto done;
1179 data_fsb = imap.br_startoff;
1180 } else {
1181 /*
1182 * Fake a hole until the end of the file.
1183 */
1184 data_fsb = min(XFS_B_TO_FSB(mp, offset + length),
1185 XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
1186 }
1187
1188 /*
1189 * If a COW fork extent covers the hole, report it - capped to the next
1190 * data fork extent:
1191 */
1192 if (xfs_inode_has_cow_data(ip) &&
1193 xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
1194 cow_fsb = cmap.br_startoff;
1195 if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
1196 if (data_fsb < cow_fsb + cmap.br_blockcount)
1197 end_fsb = min(end_fsb, data_fsb);
1198 xfs_trim_extent(&cmap, offset_fsb, end_fsb);
1199 error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
1200 /*
1201 * This is a COW extent, so we must probe the page cache
1202 * because there could be dirty page cache being backed
1203 * by this extent.
1204 */
1205 iomap->type = IOMAP_UNWRITTEN;
1206 goto out_unlock;
1207 }
1208
1209 /*
1210 * Else report a hole, capped to the next found data or COW extent.
1211 */
1212 if (cow_fsb != NULLFILEOFF && cow_fsb < data_fsb)
1213 imap.br_blockcount = cow_fsb - offset_fsb;
1214 else
1215 imap.br_blockcount = data_fsb - offset_fsb;
1216 imap.br_startoff = offset_fsb;
1217 imap.br_startblock = HOLESTARTBLOCK;
1218 imap.br_state = XFS_EXT_NORM;
1219done:
1220 xfs_trim_extent(&imap, offset_fsb, end_fsb);
1221 error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
1222out_unlock:
1223 xfs_iunlock(ip, lockmode);
1224 return error;
1225}
1226
1227const struct iomap_ops xfs_seek_iomap_ops = {
1228 .iomap_begin = xfs_seek_iomap_begin,
1229};
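A likely consumer of the new ops, sketched under the assumption that the generic iomap seek helpers are used; the llseek wiring itself is not part of this hunk:

	/* hypothetical SEEK_HOLE/SEEK_DATA dispatch in the file llseek path */
	case SEEK_HOLE:
		offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
		break;
	case SEEK_DATA:
		offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
		break;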
1230
1231static int
1244xfs_xattr_iomap_begin( 1232xfs_xattr_iomap_begin(
1245 struct inode *inode, 1233 struct inode *inode,
1246 loff_t offset, 1234 loff_t offset,
@@ -1273,12 +1261,10 @@ xfs_xattr_iomap_begin(
1273out_unlock: 1261out_unlock:
1274 xfs_iunlock(ip, lockmode); 1262 xfs_iunlock(ip, lockmode);
1275 1263
1276 if (!error) { 1264 if (error)
1277 ASSERT(nimaps); 1265 return error;
1278 xfs_bmbt_to_iomap(ip, iomap, &imap); 1266 ASSERT(nimaps);
1279 } 1267 return xfs_bmbt_to_iomap(ip, iomap, &imap, false);
1280
1281 return error;
1282} 1268}
1283 1269
1284const struct iomap_ops xfs_xattr_iomap_ops = { 1270const struct iomap_ops xfs_xattr_iomap_ops = {
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index c6170548831b..5c2f6aa6d78f 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -13,12 +13,10 @@ struct xfs_bmbt_irec;
13 13
14int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, 14int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
15 struct xfs_bmbt_irec *, int); 15 struct xfs_bmbt_irec *, int);
16int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t,
17 struct xfs_bmbt_irec *, unsigned int *);
18int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); 16int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
19 17
20void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, 18int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
21 struct xfs_bmbt_irec *); 19 struct xfs_bmbt_irec *, bool shared);
22xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize); 20xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize);
23 21
24static inline xfs_filblks_t 22static inline xfs_filblks_t
@@ -42,6 +40,7 @@ xfs_aligned_fsb_count(
42} 40}
43 41
44extern const struct iomap_ops xfs_iomap_ops; 42extern const struct iomap_ops xfs_iomap_ops;
43extern const struct iomap_ops xfs_seek_iomap_ops;
45extern const struct iomap_ops xfs_xattr_iomap_ops; 44extern const struct iomap_ops xfs_xattr_iomap_ops;
46 45
47#endif /* __XFS_IOMAP_H__*/ 46#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index f48ffd7a8d3e..74047bd0c1ae 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -191,9 +191,18 @@ xfs_generic_create(
191 191
192 xfs_setup_iops(ip); 192 xfs_setup_iops(ip);
193 193
194 if (tmpfile) 194 if (tmpfile) {
195 /*
196 * The VFS requires that any inode fed to d_tmpfile must have
197 * nlink == 1 so that it can decrement the nlink in d_tmpfile.
198 * However, we created the temp file with nlink == 0 because
199 * we're not allowed to put an inode with nlink > 0 on the
200 * unlinked list. Therefore we have to set nlink to 1 so that
201 * d_tmpfile can immediately set it back to zero.
202 */
203 set_nlink(inode, 1);
195 d_tmpfile(dentry, inode); 204 d_tmpfile(dentry, inode);
196 else 205 } else
197 d_instantiate(dentry, inode); 206 d_instantiate(dentry, inode);
198 207
199 xfs_finish_inode_setup(ip); 208 xfs_finish_inode_setup(ip);
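The resulting link-count timeline for an O_TMPFILE create, per the comment above:

	/*
	 * xfs_create_tmpfile()  nlink == 0, inode already on the unlinked list
	 * set_nlink(inode, 1)   satisfy the d_tmpfile() precondition
	 * d_tmpfile()           decrements nlink straight back to 0
	 */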
@@ -522,6 +531,10 @@ xfs_vn_getattr(
522 } 531 }
523 } 532 }
524 533
534 /*
535 * Note: If you add another clause to set an attribute flag, please
536 * update attributes_mask below.
537 */
525 if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE) 538 if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
526 stat->attributes |= STATX_ATTR_IMMUTABLE; 539 stat->attributes |= STATX_ATTR_IMMUTABLE;
527 if (ip->i_d.di_flags & XFS_DIFLAG_APPEND) 540 if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
@@ -529,6 +542,10 @@ xfs_vn_getattr(
529 if (ip->i_d.di_flags & XFS_DIFLAG_NODUMP) 542 if (ip->i_d.di_flags & XFS_DIFLAG_NODUMP)
530 stat->attributes |= STATX_ATTR_NODUMP; 543 stat->attributes |= STATX_ATTR_NODUMP;
531 544
545 stat->attributes_mask |= (STATX_ATTR_IMMUTABLE |
546 STATX_ATTR_APPEND |
547 STATX_ATTR_NODUMP);
548
532 switch (inode->i_mode & S_IFMT) { 549 switch (inode->i_mode & S_IFMT) {
533 case S_IFBLK: 550 case S_IFBLK:
534 case S_IFCHR: 551 case S_IFCHR:
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 9fe88d125f0a..3371d1ff27c4 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2439,17 +2439,21 @@ xlog_recover_validate_buf_type(
2439 case XFS_BLFT_BTREE_BUF: 2439 case XFS_BLFT_BTREE_BUF:
2440 switch (magic32) { 2440 switch (magic32) {
2441 case XFS_ABTB_CRC_MAGIC: 2441 case XFS_ABTB_CRC_MAGIC:
2442 case XFS_ABTC_CRC_MAGIC:
2443 case XFS_ABTB_MAGIC: 2442 case XFS_ABTB_MAGIC:
2443 bp->b_ops = &xfs_bnobt_buf_ops;
2444 break;
2445 case XFS_ABTC_CRC_MAGIC:
2444 case XFS_ABTC_MAGIC: 2446 case XFS_ABTC_MAGIC:
2445 bp->b_ops = &xfs_allocbt_buf_ops; 2447 bp->b_ops = &xfs_cntbt_buf_ops;
2446 break; 2448 break;
2447 case XFS_IBT_CRC_MAGIC: 2449 case XFS_IBT_CRC_MAGIC:
2448 case XFS_FIBT_CRC_MAGIC:
2449 case XFS_IBT_MAGIC: 2450 case XFS_IBT_MAGIC:
2450 case XFS_FIBT_MAGIC:
2451 bp->b_ops = &xfs_inobt_buf_ops; 2451 bp->b_ops = &xfs_inobt_buf_ops;
2452 break; 2452 break;
2453 case XFS_FIBT_CRC_MAGIC:
2454 case XFS_FIBT_MAGIC:
2455 bp->b_ops = &xfs_finobt_buf_ops;
2456 break;
2453 case XFS_BMAP_CRC_MAGIC: 2457 case XFS_BMAP_CRC_MAGIC:
2454 case XFS_BMAP_MAGIC: 2458 case XFS_BMAP_MAGIC:
2455 bp->b_ops = &xfs_bmbt_buf_ops; 2459 bp->b_ops = &xfs_bmbt_buf_ops;
@@ -3045,7 +3049,7 @@ xlog_recover_inode_pass2(
3045 * Make sure the place we're flushing out to really looks 3049 * Make sure the place we're flushing out to really looks
3046 * like an inode! 3050 * like an inode!
3047 */ 3051 */
3048 if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) { 3052 if (unlikely(!xfs_verify_magic16(bp, dip->di_magic))) {
3049 xfs_alert(mp, 3053 xfs_alert(mp,
3050 "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld", 3054 "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld",
3051 __func__, dip, bp, in_f->ilf_ino); 3055 __func__, dip, bp, in_f->ilf_ino);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index b4d8c318be3c..fd63b0b1307c 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -149,6 +149,7 @@ xfs_free_perag(
149 spin_unlock(&mp->m_perag_lock); 149 spin_unlock(&mp->m_perag_lock);
150 ASSERT(pag); 150 ASSERT(pag);
151 ASSERT(atomic_read(&pag->pag_ref) == 0); 151 ASSERT(atomic_read(&pag->pag_ref) == 0);
152 xfs_iunlink_destroy(pag);
152 xfs_buf_hash_destroy(pag); 153 xfs_buf_hash_destroy(pag);
153 mutex_destroy(&pag->pag_ici_reclaim_lock); 154 mutex_destroy(&pag->pag_ici_reclaim_lock);
154 call_rcu(&pag->rcu_head, __xfs_free_perag); 155 call_rcu(&pag->rcu_head, __xfs_free_perag);
@@ -227,6 +228,9 @@ xfs_initialize_perag(
227 /* first new pag is fully initialized */ 228 /* first new pag is fully initialized */
228 if (first_initialised == NULLAGNUMBER) 229 if (first_initialised == NULLAGNUMBER)
229 first_initialised = index; 230 first_initialised = index;
231 error = xfs_iunlink_init(pag);
232 if (error)
233 goto out_hash_destroy;
230 } 234 }
231 235
232 index = xfs_set_inode_alloc(mp, agcount); 236 index = xfs_set_inode_alloc(mp, agcount);
@@ -249,6 +253,7 @@ out_unwind_new_pags:
249 if (!pag) 253 if (!pag)
250 break; 254 break;
251 xfs_buf_hash_destroy(pag); 255 xfs_buf_hash_destroy(pag);
256 xfs_iunlink_destroy(pag);
252 mutex_destroy(&pag->pag_ici_reclaim_lock); 257 mutex_destroy(&pag->pag_ici_reclaim_lock);
253 kmem_free(pag); 258 kmem_free(pag);
254 } 259 }
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7daafe064af8..110f927cf943 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -138,7 +138,7 @@ typedef struct xfs_mount {
138 struct mutex m_growlock; /* growfs mutex */ 138 struct mutex m_growlock; /* growfs mutex */
139 int m_fixedfsid[2]; /* unchanged for life of FS */ 139 int m_fixedfsid[2]; /* unchanged for life of FS */
140 uint64_t m_flags; /* global mount flags */ 140 uint64_t m_flags; /* global mount flags */
141 bool m_inotbt_nores; /* no per-AG finobt resv. */ 141 bool m_finobt_nores; /* no per-AG finobt resv. */
142 int m_ialloc_inos; /* inodes in inode allocation */ 142 int m_ialloc_inos; /* inodes in inode allocation */
143 int m_ialloc_blks; /* blocks in inode allocation */ 143 int m_ialloc_blks; /* blocks in inode allocation */
144 int m_ialloc_min_blks;/* min blocks in sparse inode 144 int m_ialloc_min_blks;/* min blocks in sparse inode
@@ -194,6 +194,7 @@ typedef struct xfs_mount {
194 */ 194 */
195 uint32_t m_generation; 195 uint32_t m_generation;
196 196
197 bool m_always_cow;
197 bool m_fail_unmount; 198 bool m_fail_unmount;
198#ifdef DEBUG 199#ifdef DEBUG
199 /* 200 /*
@@ -396,6 +397,13 @@ typedef struct xfs_perag {
396 397
397 /* reference count */ 398 /* reference count */
398 uint8_t pagf_refcount_level; 399 uint8_t pagf_refcount_level;
400
401 /*
402 * Unlinked inode information. This incore information reflects
403 * data stored in the AGI, so callers must hold the AGI buffer lock
404 * or have some other means to control concurrency.
405 */
406 struct rhashtable pagi_unlinked_hash;
399} xfs_perag_t; 407} xfs_perag_t;
400 408
401static inline struct xfs_ag_resv * 409static inline struct xfs_ag_resv *
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index d3e04d20d8d4..c8ba98fae30a 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -125,6 +125,27 @@ xfs_check_ondisk_structs(void)
125 XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56); 125 XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56);
126 XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20); 126 XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20);
127 XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16); 127 XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16);
128
129 /*
130 * The v5 superblock format extended several v4 header structures with
131 * additional data. While new fields are only accessible on v5
132 * superblocks, it's important that the v5 structures place original v4
133 * fields/headers in the correct location on-disk. For example, we must
134 * be able to find magic values at the same location in certain blocks
135 * regardless of superblock version.
136 *
137 * The following checks ensure that various v5 data structures place the
138 * subset of v4 metadata associated with the same type of block at the
139 * start of the on-disk block. If there is no data structure definition
140 * for certain types of v4 blocks, traverse down to the first field of
141 * common metadata (e.g., magic value) and make sure it is at offset
142 * zero.
143 */
144 XFS_CHECK_OFFSET(struct xfs_dir3_leaf, hdr.info.hdr, 0);
145 XFS_CHECK_OFFSET(struct xfs_da3_intnode, hdr.info.hdr, 0);
146 XFS_CHECK_OFFSET(struct xfs_dir3_data_hdr, hdr.magic, 0);
147 XFS_CHECK_OFFSET(struct xfs_dir3_free, hdr.hdr.magic, 0);
148 XFS_CHECK_OFFSET(struct xfs_attr3_leafblock, hdr.info.hdr, 0);
128} 149}
129 150
130#endif /* __XFS_ONDISK_H */ 151#endif /* __XFS_ONDISK_H */
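XFS_CHECK_OFFSET, like XFS_CHECK_STRUCT_SIZE above it, is a compile-time assertion; it is presumably built on offsetof() and BUILD_BUG_ON_MSG(), along these lines:

	#define XFS_CHECK_OFFSET(structname, member, off) \
		BUILD_BUG_ON_MSG(offsetof(structname, member) != (off), \
			"XFS: offsetof(" #structname ", " #member ") is wrong, " \
			"expected " #off)

Any mismatch between the structure layout and the expected on-disk offset then fails the build instead of corrupting metadata at runtime.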
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index f44c3599527d..bde2c9f56a46 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -185,7 +185,7 @@ xfs_fs_map_blocks(
185 } 185 }
186 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 186 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
187 187
188 xfs_bmbt_to_iomap(ip, iomap, &imap); 188 error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
189 *device_generation = mp->m_generation; 189 *device_generation = mp->m_generation;
190 return error; 190 return error;
191out_unlock: 191out_unlock:
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index c5b4fa004ca4..680ae7662a78 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -192,7 +192,7 @@ xfs_reflink_trim_around_shared(
192 int error = 0; 192 int error = 0;
193 193
194 /* Holes, unwritten, and delalloc extents cannot be shared */ 194 /* Holes, unwritten, and delalloc extents cannot be shared */
195 if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_real_extent(irec)) { 195 if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
196 *shared = false; 196 *shared = false;
197 return 0; 197 return 0;
198 } 198 }
@@ -234,93 +234,59 @@ xfs_reflink_trim_around_shared(
234 } 234 }
235} 235}
236 236
237/* 237bool
238 * Trim the passed in imap to the next shared/unshared extent boundary, and 238xfs_inode_need_cow(
239 * if imap->br_startoff points to a shared extent reserve space for it in the
240 * COW fork.
241 *
242 * Note that imap will always contain the block numbers for the existing blocks
243 * in the data fork, as the upper layers need them for read-modify-write
244 * operations.
245 */
246int
247xfs_reflink_reserve_cow(
248 struct xfs_inode *ip, 239 struct xfs_inode *ip,
249 struct xfs_bmbt_irec *imap) 240 struct xfs_bmbt_irec *imap,
241 bool *shared)
250{ 242{
251 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); 243 /* We can't update any real extents in always COW mode. */
252 struct xfs_bmbt_irec got; 244 if (xfs_is_always_cow_inode(ip) &&
253 int error = 0; 245 !isnullstartblock(imap->br_startblock)) {
254 bool eof = false; 246 *shared = true;
255 struct xfs_iext_cursor icur;
256 bool shared;
257
258 /*
259 * Search the COW fork extent list first. This serves two purposes:
260 * first this implements the speculative preallocation using cowextsize,
261 * so that we also unshare blocks adjacent to shared blocks instead
262 * of just the shared blocks themselves. Second, the lookup in the
263 * extent list is generally faster than going out to the shared extent
264 * tree.
265 */
266
267 if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got))
268 eof = true;
269 if (!eof && got.br_startoff <= imap->br_startoff) {
270 trace_xfs_reflink_cow_found(ip, imap);
271 xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
272 return 0; 247 return 0;
273 } 248 }
274 249
275 /* Trim the mapping to the nearest shared extent boundary. */ 250 /* Trim the mapping to the nearest shared extent boundary. */
276 error = xfs_reflink_trim_around_shared(ip, imap, &shared); 251 return xfs_reflink_trim_around_shared(ip, imap, shared);
277 if (error)
278 return error;
279
280 /* Not shared? Just report the (potentially capped) extent. */
281 if (!shared)
282 return 0;
283
284 /*
285 * Fork all the shared blocks from our write offset until the end of
286 * the extent.
287 */
288 error = xfs_qm_dqattach_locked(ip, false);
289 if (error)
290 return error;
291
292 error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
293 imap->br_blockcount, 0, &got, &icur, eof);
294 if (error == -ENOSPC || error == -EDQUOT)
295 trace_xfs_reflink_cow_enospc(ip, imap);
296 if (error)
297 return error;
298
299 xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
300 trace_xfs_reflink_cow_alloc(ip, &got);
301 return 0;
302} 252}
303 253
304/* Convert part of an unwritten CoW extent to a real one. */ 254static int
305STATIC int 255xfs_reflink_convert_cow_locked(
306xfs_reflink_convert_cow_extent( 256 struct xfs_inode *ip,
307 struct xfs_inode *ip, 257 xfs_fileoff_t offset_fsb,
308 struct xfs_bmbt_irec *imap, 258 xfs_filblks_t count_fsb)
309 xfs_fileoff_t offset_fsb,
310 xfs_filblks_t count_fsb)
311{ 259{
312 int nimaps = 1; 260 struct xfs_iext_cursor icur;
261 struct xfs_bmbt_irec got;
262 struct xfs_btree_cur *dummy_cur = NULL;
263 int dummy_logflags;
264 int error = 0;
313 265
314 if (imap->br_state == XFS_EXT_NORM) 266 if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
315 return 0; 267 return 0;
316 268
317 xfs_trim_extent(imap, offset_fsb, count_fsb); 269 do {
318 trace_xfs_reflink_convert_cow(ip, imap); 270 if (got.br_startoff >= offset_fsb + count_fsb)
319 if (imap->br_blockcount == 0) 271 break;
320 return 0; 272 if (got.br_state == XFS_EXT_NORM)
321 return xfs_bmapi_write(NULL, ip, imap->br_startoff, imap->br_blockcount, 273 continue;
322 XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, 0, imap, 274 if (WARN_ON_ONCE(isnullstartblock(got.br_startblock)))
323 &nimaps); 275 return -EIO;
276
277 xfs_trim_extent(&got, offset_fsb, count_fsb);
278 if (!got.br_blockcount)
279 continue;
280
281 got.br_state = XFS_EXT_NORM;
282 error = xfs_bmap_add_extent_unwritten_real(NULL, ip,
283 XFS_COW_FORK, &icur, &dummy_cur, &got,
284 &dummy_logflags);
285 if (error)
286 return error;
287 } while (xfs_iext_next_extent(ip->i_cowfp, &icur, &got));
288
289 return error;
324} 290}
325 291
326/* Convert all of the unwritten CoW extents in a file's range to real ones. */ 292/* Convert all of the unwritten CoW extents in a file's range to real ones. */
@@ -334,15 +300,12 @@ xfs_reflink_convert_cow(
334 xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); 300 xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
335 xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); 301 xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
336 xfs_filblks_t count_fsb = end_fsb - offset_fsb; 302 xfs_filblks_t count_fsb = end_fsb - offset_fsb;
337 struct xfs_bmbt_irec imap; 303 int error;
338 int nimaps = 1, error = 0;
339 304
340 ASSERT(count != 0); 305 ASSERT(count != 0);
341 306
342 xfs_ilock(ip, XFS_ILOCK_EXCL); 307 xfs_ilock(ip, XFS_ILOCK_EXCL);
343 error = xfs_bmapi_write(NULL, ip, offset_fsb, count_fsb, 308 error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
344 XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT |
345 XFS_BMAPI_CONVERT_ONLY, 0, &imap, &nimaps);
346 xfs_iunlock(ip, XFS_ILOCK_EXCL); 309 xfs_iunlock(ip, XFS_ILOCK_EXCL);
347 return error; 310 return error;
348} 311}
@@ -375,7 +338,7 @@ xfs_find_trim_cow_extent(
375 if (got.br_startoff > offset_fsb) { 338 if (got.br_startoff > offset_fsb) {
376 xfs_trim_extent(imap, imap->br_startoff, 339 xfs_trim_extent(imap, imap->br_startoff,
377 got.br_startoff - imap->br_startoff); 340 got.br_startoff - imap->br_startoff);
378 return xfs_reflink_trim_around_shared(ip, imap, shared); 341 return xfs_inode_need_cow(ip, imap, shared);
379 } 342 }
380 343
381 *shared = true; 344 *shared = true;
@@ -397,7 +360,8 @@ xfs_reflink_allocate_cow(
397 struct xfs_inode *ip, 360 struct xfs_inode *ip,
398 struct xfs_bmbt_irec *imap, 361 struct xfs_bmbt_irec *imap,
399 bool *shared, 362 bool *shared,
400 uint *lockmode) 363 uint *lockmode,
364 bool convert_now)
401{ 365{
402 struct xfs_mount *mp = ip->i_mount; 366 struct xfs_mount *mp = ip->i_mount;
403 xfs_fileoff_t offset_fsb = imap->br_startoff; 367 xfs_fileoff_t offset_fsb = imap->br_startoff;
@@ -409,7 +373,10 @@ xfs_reflink_allocate_cow(
409 xfs_extlen_t resblks = 0; 373 xfs_extlen_t resblks = 0;
410 374
411 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 375 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
412 ASSERT(xfs_is_reflink_inode(ip)); 376 if (!ip->i_cowfp) {
377 ASSERT(!xfs_is_reflink_inode(ip));
378 xfs_ifork_init_cow(ip);
379 }
413 380
414 error = xfs_find_trim_cow_extent(ip, imap, shared, &found); 381 error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
415 if (error || !*shared) 382 if (error || !*shared)
@@ -471,7 +438,16 @@ xfs_reflink_allocate_cow(
471 if (nimaps == 0) 438 if (nimaps == 0)
472 return -ENOSPC; 439 return -ENOSPC;
473convert: 440convert:
474 return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb); 441 xfs_trim_extent(imap, offset_fsb, count_fsb);
442 /*
443 * COW fork extents are supposed to remain unwritten until we're ready
444 * to initiate a disk write. For direct I/O we are going to write the
445 * data and need the conversion, but for buffered writes we're done.
446 */
447 if (!convert_now || imap->br_state == XFS_EXT_NORM)
448 return 0;
449 trace_xfs_reflink_convert_cow(ip, imap);
450 return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
475 451
476out_unreserve: 452out_unreserve:
477 xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0, 453 xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
@@ -586,7 +562,7 @@ xfs_reflink_cancel_cow_range(
586 int error; 562 int error;
587 563
588 trace_xfs_reflink_cancel_cow_range(ip, offset, count); 564 trace_xfs_reflink_cancel_cow_range(ip, offset, count);
589 ASSERT(xfs_is_reflink_inode(ip)); 565 ASSERT(ip->i_cowfp);
590 566
591 offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); 567 offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
592 if (count == NULLFILEOFF) 568 if (count == NULLFILEOFF)
@@ -1192,7 +1168,7 @@ xfs_reflink_remap_blocks(
1192 break; 1168 break;
1193 ASSERT(nimaps == 1); 1169 ASSERT(nimaps == 1);
1194 1170
1195 trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE, 1171 trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_DATA_FORK,
1196 &imap); 1172 &imap);
1197 1173
1198 /* Translate imap into the destination file. */ 1174 /* Translate imap into the destination file. */
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 6d73daef1f13..28a43b7f581d 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -6,16 +6,28 @@
6#ifndef __XFS_REFLINK_H 6#ifndef __XFS_REFLINK_H
7#define __XFS_REFLINK_H 1 7#define __XFS_REFLINK_H 1
8 8
9static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip)
10{
11 return ip->i_mount->m_always_cow &&
12 xfs_sb_version_hasreflink(&ip->i_mount->m_sb);
13}
14
15static inline bool xfs_is_cow_inode(struct xfs_inode *ip)
16{
17 return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip);
18}
19
9extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp, 20extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp,
10 xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen, 21 xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen,
11 xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal); 22 xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal);
12extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, 23extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
13 struct xfs_bmbt_irec *irec, bool *shared); 24 struct xfs_bmbt_irec *irec, bool *shared);
25bool xfs_inode_need_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
26 bool *shared);
14 27
15extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
16 struct xfs_bmbt_irec *imap);
17extern int xfs_reflink_allocate_cow(struct xfs_inode *ip, 28extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
18 struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode); 29 struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
30 bool convert_now);
19extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset, 31extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
20 xfs_off_t count); 32 xfs_off_t count);
21 33
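Taken together, a write-path caller would use the reworked interface roughly as follows; this is a sketch built from the signatures above, with ip, imap, lockmode and is_direct standing in for the caller's own state:

	bool	shared = false;

	error = xfs_inode_need_cow(ip, &imap, &shared);
	if (error)
		return error;
	if (shared) {
		/* convert_now: true for direct I/O, false for buffered */
		error = xfs_reflink_allocate_cow(ip, &imap, &shared,
				&lockmode, is_direct);
	}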
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index c9097cb0b955..f093ea244849 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1594,6 +1594,13 @@ xfs_mount_alloc(
1594 INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); 1594 INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
1595 INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker); 1595 INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker);
1596 mp->m_kobj.kobject.kset = xfs_kset; 1596 mp->m_kobj.kobject.kset = xfs_kset;
1597 /*
1598 * We don't create the finobt per-ag space reservation until after log
1599 * recovery, so we must set this to true so that an ifree transaction
1600 * started during log recovery will not depend on space reservations
1601 * for finobt expansion.
1602 */
1603 mp->m_finobt_nores = true;
1597 return mp; 1604 return mp;
1598} 1605}
1599 1606
@@ -1729,11 +1736,18 @@ xfs_fs_fill_super(
1729 } 1736 }
1730 } 1737 }
1731 1738
1732 if (xfs_sb_version_hasreflink(&mp->m_sb) && mp->m_sb.sb_rblocks) { 1739 if (xfs_sb_version_hasreflink(&mp->m_sb)) {
1733 xfs_alert(mp, 1740 if (mp->m_sb.sb_rblocks) {
1741 xfs_alert(mp,
1734 "reflink not compatible with realtime device!"); 1742 "reflink not compatible with realtime device!");
1735 error = -EINVAL; 1743 error = -EINVAL;
1736 goto out_filestream_unmount; 1744 goto out_filestream_unmount;
1745 }
1746
1747 if (xfs_globals.always_cow) {
1748 xfs_info(mp, "using DEBUG-only always_cow mode.");
1749 mp->m_always_cow = true;
1750 }
1737 } 1751 }
1738 1752
1739 if (xfs_sb_version_hasrmapbt(&mp->m_sb) && mp->m_sb.sb_rblocks) { 1753 if (xfs_sb_version_hasrmapbt(&mp->m_sb) && mp->m_sb.sb_rblocks) {
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index 168488130a19..ad7f9be13087 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -85,6 +85,7 @@ struct xfs_globals {
85 int log_recovery_delay; /* log recovery delay (secs) */ 85 int log_recovery_delay; /* log recovery delay (secs) */
86 int mount_delay; /* mount setup delay (secs) */ 86 int mount_delay; /* mount setup delay (secs) */
87 bool bug_on_assert; /* BUG() the kernel on assert failure */ 87 bool bug_on_assert; /* BUG() the kernel on assert failure */
88 bool always_cow; /* use COW fork for all overwrites */
88}; 89};
89extern struct xfs_globals xfs_globals; 90extern struct xfs_globals xfs_globals;
90 91
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index cd6a994a7250..cabda13f3c64 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -183,10 +183,34 @@ mount_delay_show(
183} 183}
184XFS_SYSFS_ATTR_RW(mount_delay); 184XFS_SYSFS_ATTR_RW(mount_delay);
185 185
186static ssize_t
187always_cow_store(
188 struct kobject *kobject,
189 const char *buf,
190 size_t count)
191{
192 ssize_t ret;
193
194 ret = kstrtobool(buf, &xfs_globals.always_cow);
195 if (ret < 0)
196 return ret;
197 return count;
198}
199
200static ssize_t
201always_cow_show(
202 struct kobject *kobject,
203 char *buf)
204{
205 return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.always_cow);
206}
207XFS_SYSFS_ATTR_RW(always_cow);
208
186static struct attribute *xfs_dbg_attrs[] = { 209static struct attribute *xfs_dbg_attrs[] = {
187 ATTR_LIST(bug_on_assert), 210 ATTR_LIST(bug_on_assert),
188 ATTR_LIST(log_recovery_delay), 211 ATTR_LIST(log_recovery_delay),
189 ATTR_LIST(mount_delay), 212 ATTR_LIST(mount_delay),
213 ATTR_LIST(always_cow),
190 NULL, 214 NULL,
191}; 215};
192 216
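Once registered, the knob sits alongside bug_on_assert and the delay knobs in the XFS debug kset (typically /sys/fs/xfs/debug/always_cow) and, courtesy of kstrtobool(), accepts the usual boolean spellings (0/1, y/n, on/off). As the xfs_fs_fill_super() hunk above shows, the setting is sampled at mount time and only honored on reflink-capable filesystems.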
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 6fcc893dfc91..47fb07d86efd 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1218,23 +1218,17 @@ DEFINE_EVENT(xfs_readpage_class, name, \
1218DEFINE_READPAGE_EVENT(xfs_vm_readpage); 1218DEFINE_READPAGE_EVENT(xfs_vm_readpage);
1219DEFINE_READPAGE_EVENT(xfs_vm_readpages); 1219DEFINE_READPAGE_EVENT(xfs_vm_readpages);
1220 1220
1221TRACE_DEFINE_ENUM(XFS_IO_HOLE);
1222TRACE_DEFINE_ENUM(XFS_IO_DELALLOC);
1223TRACE_DEFINE_ENUM(XFS_IO_UNWRITTEN);
1224TRACE_DEFINE_ENUM(XFS_IO_OVERWRITE);
1225TRACE_DEFINE_ENUM(XFS_IO_COW);
1226
1227DECLARE_EVENT_CLASS(xfs_imap_class, 1221DECLARE_EVENT_CLASS(xfs_imap_class,
1228 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, 1222 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
1229 int type, struct xfs_bmbt_irec *irec), 1223 int whichfork, struct xfs_bmbt_irec *irec),
1230 TP_ARGS(ip, offset, count, type, irec), 1224 TP_ARGS(ip, offset, count, whichfork, irec),
1231 TP_STRUCT__entry( 1225 TP_STRUCT__entry(
1232 __field(dev_t, dev) 1226 __field(dev_t, dev)
1233 __field(xfs_ino_t, ino) 1227 __field(xfs_ino_t, ino)
1234 __field(loff_t, size) 1228 __field(loff_t, size)
1235 __field(loff_t, offset) 1229 __field(loff_t, offset)
1236 __field(size_t, count) 1230 __field(size_t, count)
1237 __field(int, type) 1231 __field(int, whichfork)
1238 __field(xfs_fileoff_t, startoff) 1232 __field(xfs_fileoff_t, startoff)
1239 __field(xfs_fsblock_t, startblock) 1233 __field(xfs_fsblock_t, startblock)
1240 __field(xfs_filblks_t, blockcount) 1234 __field(xfs_filblks_t, blockcount)
@@ -1245,33 +1239,33 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
1245 __entry->size = ip->i_d.di_size; 1239 __entry->size = ip->i_d.di_size;
1246 __entry->offset = offset; 1240 __entry->offset = offset;
1247 __entry->count = count; 1241 __entry->count = count;
1248 __entry->type = type; 1242 __entry->whichfork = whichfork;
1249 __entry->startoff = irec ? irec->br_startoff : 0; 1243 __entry->startoff = irec ? irec->br_startoff : 0;
1250 __entry->startblock = irec ? irec->br_startblock : 0; 1244 __entry->startblock = irec ? irec->br_startblock : 0;
1251 __entry->blockcount = irec ? irec->br_blockcount : 0; 1245 __entry->blockcount = irec ? irec->br_blockcount : 0;
1252 ), 1246 ),
1253 TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd " 1247 TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd "
1254 "type %s startoff 0x%llx startblock %lld blockcount 0x%llx", 1248 "fork %s startoff 0x%llx startblock %lld blockcount 0x%llx",
1255 MAJOR(__entry->dev), MINOR(__entry->dev), 1249 MAJOR(__entry->dev), MINOR(__entry->dev),
1256 __entry->ino, 1250 __entry->ino,
1257 __entry->size, 1251 __entry->size,
1258 __entry->offset, 1252 __entry->offset,
1259 __entry->count, 1253 __entry->count,
1260 __print_symbolic(__entry->type, XFS_IO_TYPES), 1254 __entry->whichfork == XFS_COW_FORK ? "cow" : "data",
1261 __entry->startoff, 1255 __entry->startoff,
1262 (int64_t)__entry->startblock, 1256 (int64_t)__entry->startblock,
1263 __entry->blockcount) 1257 __entry->blockcount)
1264) 1258)
1265 1259
1266#define DEFINE_IOMAP_EVENT(name) \ 1260#define DEFINE_IMAP_EVENT(name) \
1267DEFINE_EVENT(xfs_imap_class, name, \ 1261DEFINE_EVENT(xfs_imap_class, name, \
1268 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ 1262 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
1269 int type, struct xfs_bmbt_irec *irec), \ 1263 int whichfork, struct xfs_bmbt_irec *irec), \
1270 TP_ARGS(ip, offset, count, type, irec)) 1264 TP_ARGS(ip, offset, count, whichfork, irec))
1271DEFINE_IOMAP_EVENT(xfs_map_blocks_found); 1265DEFINE_IMAP_EVENT(xfs_map_blocks_found);
1272DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc); 1266DEFINE_IMAP_EVENT(xfs_map_blocks_alloc);
1273DEFINE_IOMAP_EVENT(xfs_iomap_alloc); 1267DEFINE_IMAP_EVENT(xfs_iomap_alloc);
1274DEFINE_IOMAP_EVENT(xfs_iomap_found); 1268DEFINE_IMAP_EVENT(xfs_iomap_found);
1275 1269
1276DECLARE_EVENT_CLASS(xfs_simple_io_class, 1270DECLARE_EVENT_CLASS(xfs_simple_io_class,
1277 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), 1271 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -3078,7 +3072,7 @@ DEFINE_EVENT(xfs_inode_irec_class, name, \
3078DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag); 3072DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag);
3079DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag); 3073DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag);
3080DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size); 3074DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size);
3081DEFINE_IOMAP_EVENT(xfs_reflink_remap_imap); 3075DEFINE_IMAP_EVENT(xfs_reflink_remap_imap);
3082TRACE_EVENT(xfs_reflink_remap_blocks_loop, 3076TRACE_EVENT(xfs_reflink_remap_blocks_loop,
3083 TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset, 3077 TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset,
3084 xfs_filblks_t len, struct xfs_inode *dest, 3078 xfs_filblks_t len, struct xfs_inode *dest,
@@ -3202,13 +3196,10 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
3202 3196
3203/* copy on write */ 3197/* copy on write */
3204DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared); 3198DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
3205DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc);
3206DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found); 3199DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
3207DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc); 3200DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
3208DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow); 3201DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
3209 3202
3210DEFINE_RW_EVENT(xfs_reflink_reserve_cow);
3211
3212DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write); 3203DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
3213 3204
3214DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range); 3205DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
@@ -3371,6 +3362,84 @@ DEFINE_TRANS_EVENT(xfs_trans_roll);
3371DEFINE_TRANS_EVENT(xfs_trans_add_item); 3362DEFINE_TRANS_EVENT(xfs_trans_add_item);
3372DEFINE_TRANS_EVENT(xfs_trans_free_items); 3363DEFINE_TRANS_EVENT(xfs_trans_free_items);
3373 3364
3365TRACE_EVENT(xfs_iunlink_update_bucket,
3366 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int bucket,
3367 xfs_agino_t old_ptr, xfs_agino_t new_ptr),
3368 TP_ARGS(mp, agno, bucket, old_ptr, new_ptr),
3369 TP_STRUCT__entry(
3370 __field(dev_t, dev)
3371 __field(xfs_agnumber_t, agno)
3372 __field(unsigned int, bucket)
3373 __field(xfs_agino_t, old_ptr)
3374 __field(xfs_agino_t, new_ptr)
3375 ),
3376 TP_fast_assign(
3377 __entry->dev = mp->m_super->s_dev;
3378 __entry->agno = agno;
3379 __entry->bucket = bucket;
3380 __entry->old_ptr = old_ptr;
3381 __entry->new_ptr = new_ptr;
3382 ),
3383 TP_printk("dev %d:%d agno %u bucket %u old 0x%x new 0x%x",
3384 MAJOR(__entry->dev), MINOR(__entry->dev),
3385 __entry->agno,
3386 __entry->bucket,
3387 __entry->old_ptr,
3388 __entry->new_ptr)
3389);
3390
3391TRACE_EVENT(xfs_iunlink_update_dinode,
3392 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
3393 xfs_agino_t old_ptr, xfs_agino_t new_ptr),
3394 TP_ARGS(mp, agno, agino, old_ptr, new_ptr),
3395 TP_STRUCT__entry(
3396 __field(dev_t, dev)
3397 __field(xfs_agnumber_t, agno)
3398 __field(xfs_agino_t, agino)
3399 __field(xfs_agino_t, old_ptr)
3400 __field(xfs_agino_t, new_ptr)
3401 ),
3402 TP_fast_assign(
3403 __entry->dev = mp->m_super->s_dev;
3404 __entry->agno = agno;
3405 __entry->agino = agino;
3406 __entry->old_ptr = old_ptr;
3407 __entry->new_ptr = new_ptr;
3408 ),
3409 TP_printk("dev %d:%d agno %u agino 0x%x old 0x%x new 0x%x",
3410 MAJOR(__entry->dev), MINOR(__entry->dev),
3411 __entry->agno,
3412 __entry->agino,
3413 __entry->old_ptr,
3414 __entry->new_ptr)
3415);
3416
3417DECLARE_EVENT_CLASS(xfs_ag_inode_class,
3418 TP_PROTO(struct xfs_inode *ip),
3419 TP_ARGS(ip),
3420 TP_STRUCT__entry(
3421 __field(dev_t, dev)
3422 __field(xfs_agnumber_t, agno)
3423 __field(xfs_agino_t, agino)
3424 ),
3425 TP_fast_assign(
3426 __entry->dev = VFS_I(ip)->i_sb->s_dev;
3427 __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
3428 __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
3429 ),
3430 TP_printk("dev %d:%d agno %u agino %u",
3431 MAJOR(__entry->dev), MINOR(__entry->dev),
3432 __entry->agno, __entry->agino)
3433)
3434
3435#define DEFINE_AGINODE_EVENT(name) \
3436DEFINE_EVENT(xfs_ag_inode_class, name, \
3437 TP_PROTO(struct xfs_inode *ip), \
3438 TP_ARGS(ip))
3439DEFINE_AGINODE_EVENT(xfs_iunlink);
3440DEFINE_AGINODE_EVENT(xfs_iunlink_remove);
3441DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback);
3442
3374#endif /* _TRACE_XFS_H */ 3443#endif /* _TRACE_XFS_H */
3375 3444
3376#undef TRACE_INCLUDE_PATH 3445#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c
index 11cff449d055..e1c7d55b32c3 100644
--- a/fs/xfs/xfs_trans_bmap.c
+++ b/fs/xfs/xfs_trans_bmap.c
@@ -17,7 +17,6 @@
17#include "xfs_alloc.h" 17#include "xfs_alloc.h"
18#include "xfs_bmap.h" 18#include "xfs_bmap.h"
19#include "xfs_inode.h" 19#include "xfs_inode.h"
20#include "xfs_defer.h"
21 20
22/* 21/*
23 * This routine is called to allocate a "bmap update done" 22 * This routine is called to allocate a "bmap update done"
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 629f1479c9d2..7d65ebf1e847 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -277,7 +277,7 @@ xfs_trans_read_buf_map(
277 * release this buffer when it kills the transaction. 277 * release this buffer when it kills the transaction.
278 */ 278 */
279 ASSERT(bp->b_ops != NULL); 279 ASSERT(bp->b_ops != NULL);
280 error = xfs_buf_ensure_ops(bp, ops); 280 error = xfs_buf_reverify(bp, ops);
281 if (error) { 281 if (error) {
282 xfs_buf_ioerror_alert(bp, __func__); 282 xfs_buf_ioerror_alert(bp, __func__);
283 283
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index 0710434eb240..8ee7a3f8bb20 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -18,7 +18,6 @@
18#include "xfs_alloc.h" 18#include "xfs_alloc.h"
19#include "xfs_bmap.h" 19#include "xfs_bmap.h"
20#include "xfs_trace.h" 20#include "xfs_trace.h"
21#include "xfs_defer.h"
22 21
23/* 22/*
24 * This routine is called to allocate an "extent free done" 23 * This routine is called to allocate an "extent free done"
diff --git a/fs/xfs/xfs_trans_refcount.c b/fs/xfs/xfs_trans_refcount.c
index 6c947ff4faf6..8d734728dd1b 100644
--- a/fs/xfs/xfs_trans_refcount.c
+++ b/fs/xfs/xfs_trans_refcount.c
@@ -16,7 +16,6 @@
16#include "xfs_refcount_item.h" 16#include "xfs_refcount_item.h"
17#include "xfs_alloc.h" 17#include "xfs_alloc.h"
18#include "xfs_refcount.h" 18#include "xfs_refcount.h"
19#include "xfs_defer.h"
20 19
21/* 20/*
22 * This routine is called to allocate a "refcount update done" 21 * This routine is called to allocate a "refcount update done"
diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c
index a42890931ecd..5c7936b1be13 100644
--- a/fs/xfs/xfs_trans_rmap.c
+++ b/fs/xfs/xfs_trans_rmap.c
@@ -16,7 +16,6 @@
16#include "xfs_rmap_item.h" 16#include "xfs_rmap_item.h"
17#include "xfs_alloc.h" 17#include "xfs_alloc.h"
18#include "xfs_rmap.h" 18#include "xfs_rmap.h"
19#include "xfs_defer.h"
20 19
21/* Set the map extent flags for this reverse mapping. */ 20/* Set the map extent flags for this reverse mapping. */
22static void 21static void
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 63ee1d5bf1d7..9a63016009a1 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -129,6 +129,9 @@ __xfs_xattr_put_listent(
129 char *offset; 129 char *offset;
130 int arraytop; 130 int arraytop;
131 131
132 if (context->count < 0 || context->seen_enough)
133 return;
134
132 if (!context->alist) 135 if (!context->alist)
133 goto compute_size; 136 goto compute_size;
134 137
diff --git a/include/linux/audit.h b/include/linux/audit.h
index a625c29a2ea2..1e69d9fe16da 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -25,6 +25,7 @@
25 25
26#include <linux/sched.h> 26#include <linux/sched.h>
27#include <linux/ptrace.h> 27#include <linux/ptrace.h>
28#include <linux/namei.h> /* LOOKUP_* */
28#include <uapi/linux/audit.h> 29#include <uapi/linux/audit.h>
29 30
30#define AUDIT_INO_UNSET ((unsigned long)-1) 31#define AUDIT_INO_UNSET ((unsigned long)-1)
@@ -159,6 +160,18 @@ extern int audit_update_lsm_rules(void);
159extern int audit_rule_change(int type, int seq, void *data, size_t datasz); 160extern int audit_rule_change(int type, int seq, void *data, size_t datasz);
160extern int audit_list_rules_send(struct sk_buff *request_skb, int seq); 161extern int audit_list_rules_send(struct sk_buff *request_skb, int seq);
161 162
163extern int audit_set_loginuid(kuid_t loginuid);
164
165static inline kuid_t audit_get_loginuid(struct task_struct *tsk)
166{
167 return tsk->loginuid;
168}
169
170static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
171{
172 return tsk->sessionid;
173}
174
162extern u32 audit_enabled; 175extern u32 audit_enabled;
163#else /* CONFIG_AUDIT */ 176#else /* CONFIG_AUDIT */
164static inline __printf(4, 5) 177static inline __printf(4, 5)
@@ -201,6 +214,17 @@ static inline int audit_log_task_context(struct audit_buffer *ab)
201} 214}
202static inline void audit_log_task_info(struct audit_buffer *ab) 215static inline void audit_log_task_info(struct audit_buffer *ab)
203{ } 216{ }
217
218static inline kuid_t audit_get_loginuid(struct task_struct *tsk)
219{
220 return INVALID_UID;
221}
222
223static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
224{
225 return AUDIT_SID_UNSET;
226}
227
204#define audit_enabled AUDIT_OFF 228#define audit_enabled AUDIT_OFF
205#endif /* CONFIG_AUDIT */ 229#endif /* CONFIG_AUDIT */
206 230
@@ -225,6 +249,7 @@ extern void __audit_getname(struct filename *name);
225 249
226#define AUDIT_INODE_PARENT 1 /* dentry represents the parent */ 250#define AUDIT_INODE_PARENT 1 /* dentry represents the parent */
227#define AUDIT_INODE_HIDDEN 2 /* audit record should be hidden */ 251#define AUDIT_INODE_HIDDEN 2 /* audit record should be hidden */
252#define AUDIT_INODE_NOEVAL 4 /* audit record incomplete */
228extern void __audit_inode(struct filename *name, const struct dentry *dentry, 253extern void __audit_inode(struct filename *name, const struct dentry *dentry,
229 unsigned int flags); 254 unsigned int flags);
230extern void __audit_file(const struct file *); 255extern void __audit_file(const struct file *);
@@ -285,12 +310,15 @@ static inline void audit_getname(struct filename *name)
285} 310}
286static inline void audit_inode(struct filename *name, 311static inline void audit_inode(struct filename *name,
287 const struct dentry *dentry, 312 const struct dentry *dentry,
288 unsigned int parent) { 313 unsigned int flags) {
289 if (unlikely(!audit_dummy_context())) { 314 if (unlikely(!audit_dummy_context())) {
290 unsigned int flags = 0; 315 unsigned int aflags = 0;
291 if (parent) 316
292 flags |= AUDIT_INODE_PARENT; 317 if (flags & LOOKUP_PARENT)
293 __audit_inode(name, dentry, flags); 318 aflags |= AUDIT_INODE_PARENT;
319 if (flags & LOOKUP_NO_EVAL)
320 aflags |= AUDIT_INODE_NOEVAL;
321 __audit_inode(name, dentry, aflags);
294 } 322 }
295} 323}
296static inline void audit_file(struct file *file) 324static inline void audit_file(struct file *file)
@@ -320,21 +348,6 @@ static inline void audit_ptrace(struct task_struct *t)
320} 348}
321 349
322 /* Private API (for audit.c only) */ 350 /* Private API (for audit.c only) */
323extern unsigned int audit_serial(void);
324extern int auditsc_get_stamp(struct audit_context *ctx,
325 struct timespec64 *t, unsigned int *serial);
326extern int audit_set_loginuid(kuid_t loginuid);
327
328static inline kuid_t audit_get_loginuid(struct task_struct *tsk)
329{
330 return tsk->loginuid;
331}
332
333static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
334{
335 return tsk->sessionid;
336}
337
338extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp); 351extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp);
339extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode); 352extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode);
340extern void __audit_bprm(struct linux_binprm *bprm); 353extern void __audit_bprm(struct linux_binprm *bprm);
@@ -514,19 +527,6 @@ static inline void audit_seccomp(unsigned long syscall, long signr, int code)
514static inline void audit_seccomp_actions_logged(const char *names, 527static inline void audit_seccomp_actions_logged(const char *names,
515 const char *old_names, int res) 528 const char *old_names, int res)
516{ } 529{ }
517static inline int auditsc_get_stamp(struct audit_context *ctx,
518 struct timespec64 *t, unsigned int *serial)
519{
520 return 0;
521}
522static inline kuid_t audit_get_loginuid(struct task_struct *tsk)
523{
524 return INVALID_UID;
525}
526static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
527{
528 return AUDIT_SID_UNSET;
529}
530static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp) 530static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
531{ } 531{ }
532static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, 532static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid,
diff --git a/include/linux/capability.h b/include/linux/capability.h
index f640dcbc880c..ecce0f43c73a 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -14,7 +14,7 @@
14#define _LINUX_CAPABILITY_H 14#define _LINUX_CAPABILITY_H
15 15
16#include <uapi/linux/capability.h> 16#include <uapi/linux/capability.h>
17 17#include <linux/uidgid.h>
18 18
19#define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3 19#define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3
20#define _KERNEL_CAPABILITY_U32S _LINUX_CAPABILITY_U32S_3 20#define _KERNEL_CAPABILITY_U32S _LINUX_CAPABILITY_U32S_3
@@ -25,11 +25,12 @@ typedef struct kernel_cap_struct {
25 __u32 cap[_KERNEL_CAPABILITY_U32S]; 25 __u32 cap[_KERNEL_CAPABILITY_U32S];
26} kernel_cap_t; 26} kernel_cap_t;
27 27
28/* exact same as vfs_cap_data but in cpu endian and always filled completely */ 28/* same as vfs_ns_cap_data but in cpu endian and always filled completely */
29struct cpu_vfs_cap_data { 29struct cpu_vfs_cap_data {
30 __u32 magic_etc; 30 __u32 magic_etc;
31 kernel_cap_t permitted; 31 kernel_cap_t permitted;
32 kernel_cap_t inheritable; 32 kernel_cap_t inheritable;
33 kuid_t rootid;
33}; 34};
34 35
35#define _USER_CAP_HEADER_SIZE (sizeof(struct __user_cap_header_struct)) 36#define _USER_CAP_HEADER_SIZE (sizeof(struct __user_cap_header_struct))
@@ -209,6 +210,7 @@ extern bool has_ns_capability_noaudit(struct task_struct *t,
209extern bool capable(int cap); 210extern bool capable(int cap);
210extern bool ns_capable(struct user_namespace *ns, int cap); 211extern bool ns_capable(struct user_namespace *ns, int cap);
211extern bool ns_capable_noaudit(struct user_namespace *ns, int cap); 212extern bool ns_capable_noaudit(struct user_namespace *ns, int cap);
213extern bool ns_capable_setid(struct user_namespace *ns, int cap);
212#else 214#else
213static inline bool has_capability(struct task_struct *t, int cap) 215static inline bool has_capability(struct task_struct *t, int cap)
214{ 216{
@@ -240,6 +242,10 @@ static inline bool ns_capable_noaudit(struct user_namespace *ns, int cap)
240{ 242{
241 return true; 243 return true;
242} 244}
245static inline bool ns_capable_setid(struct user_namespace *ns, int cap)
246{
247 return true;
248}
243#endif /* CONFIG_MULTIUSER */ 249#endif /* CONFIG_MULTIUSER */
244extern bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode); 250extern bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode);
245extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap); 251extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap);
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index aad3babef007..1c70803e9f77 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -606,7 +606,7 @@ struct cgroup_subsys {
606 void (*cancel_fork)(struct task_struct *task); 606 void (*cancel_fork)(struct task_struct *task);
607 void (*fork)(struct task_struct *task); 607 void (*fork)(struct task_struct *task);
608 void (*exit)(struct task_struct *task); 608 void (*exit)(struct task_struct *task);
609 void (*free)(struct task_struct *task); 609 void (*release)(struct task_struct *task);
610 void (*bind)(struct cgroup_subsys_state *root_css); 610 void (*bind)(struct cgroup_subsys_state *root_css);
611 611
612 bool early_init:1; 612 bool early_init:1;
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 9968332cceed..81f58b4a5418 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -121,6 +121,7 @@ extern int cgroup_can_fork(struct task_struct *p);
121extern void cgroup_cancel_fork(struct task_struct *p); 121extern void cgroup_cancel_fork(struct task_struct *p);
122extern void cgroup_post_fork(struct task_struct *p); 122extern void cgroup_post_fork(struct task_struct *p);
123void cgroup_exit(struct task_struct *p); 123void cgroup_exit(struct task_struct *p);
124void cgroup_release(struct task_struct *p);
124void cgroup_free(struct task_struct *p); 125void cgroup_free(struct task_struct *p);
125 126
126int cgroup_init_early(void); 127int cgroup_init_early(void);
@@ -697,6 +698,7 @@ static inline int cgroup_can_fork(struct task_struct *p) { return 0; }
697static inline void cgroup_cancel_fork(struct task_struct *p) {} 698static inline void cgroup_cancel_fork(struct task_struct *p) {}
698static inline void cgroup_post_fork(struct task_struct *p) {} 699static inline void cgroup_post_fork(struct task_struct *p) {}
699static inline void cgroup_exit(struct task_struct *p) {} 700static inline void cgroup_exit(struct task_struct *p) {}
701static inline void cgroup_release(struct task_struct *p) {}
700static inline void cgroup_free(struct task_struct *p) {} 702static inline void cgroup_free(struct task_struct *p) {}
701 703
702static inline int cgroup_init_early(void) { return 0; } 704static inline int cgroup_init_early(void) { return 0; }
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 4907c9df86b3..ddd45bb74887 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -15,7 +15,6 @@
15#include <linux/capability.h> 15#include <linux/capability.h>
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/key.h> 17#include <linux/key.h>
18#include <linux/selinux.h>
19#include <linux/atomic.h> 18#include <linux/atomic.h>
20#include <linux/uidgid.h> 19#include <linux/uidgid.h>
21#include <linux/sched.h> 20#include <linux/sched.h>
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index 9e2142795335..b79fa9bb7359 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -19,7 +19,7 @@
19 FAN_CLASS_PRE_CONTENT) 19 FAN_CLASS_PRE_CONTENT)
20 20
21#define FANOTIFY_INIT_FLAGS (FANOTIFY_CLASS_BITS | \ 21#define FANOTIFY_INIT_FLAGS (FANOTIFY_CLASS_BITS | \
22 FAN_REPORT_TID | \ 22 FAN_REPORT_TID | FAN_REPORT_FID | \
23 FAN_CLOEXEC | FAN_NONBLOCK | \ 23 FAN_CLOEXEC | FAN_NONBLOCK | \
24 FAN_UNLIMITED_QUEUE | FAN_UNLIMITED_MARKS) 24 FAN_UNLIMITED_QUEUE | FAN_UNLIMITED_MARKS)
25 25
@@ -35,10 +35,28 @@
35 FAN_MARK_IGNORED_SURV_MODIFY | \ 35 FAN_MARK_IGNORED_SURV_MODIFY | \
36 FAN_MARK_FLUSH) 36 FAN_MARK_FLUSH)
37 37
38/* Events that user can request to be notified on */ 38/*
39#define FANOTIFY_EVENTS (FAN_ACCESS | FAN_MODIFY | \ 39 * Events that can be reported with data type FSNOTIFY_EVENT_PATH.
40 * Note that FAN_MODIFY can also be reported with data type
41 * FSNOTIFY_EVENT_INODE.
42 */
43#define FANOTIFY_PATH_EVENTS (FAN_ACCESS | FAN_MODIFY | \
40 FAN_CLOSE | FAN_OPEN | FAN_OPEN_EXEC) 44 FAN_CLOSE | FAN_OPEN | FAN_OPEN_EXEC)
41 45
46/*
47 * Directory entry modification events - reported only to directory
48 * where entry is modified and not to a watching parent.
49 */
50#define FANOTIFY_DIRENT_EVENTS (FAN_MOVE | FAN_CREATE | FAN_DELETE)
51
52/* Events that can only be reported with data type FSNOTIFY_EVENT_INODE */
53#define FANOTIFY_INODE_EVENTS (FANOTIFY_DIRENT_EVENTS | \
54 FAN_ATTRIB | FAN_MOVE_SELF | FAN_DELETE_SELF)
55
56/* Events that user can request to be notified on */
57#define FANOTIFY_EVENTS (FANOTIFY_PATH_EVENTS | \
58 FANOTIFY_INODE_EVENTS)
59
42/* Events that require a permission response from user */ 60/* Events that require a permission response from user */
43#define FANOTIFY_PERM_EVENTS (FAN_OPEN_PERM | FAN_ACCESS_PERM | \ 61#define FANOTIFY_PERM_EVENTS (FAN_OPEN_PERM | FAN_ACCESS_PERM | \
44 FAN_OPEN_EXEC_PERM) 62 FAN_OPEN_EXEC_PERM)
@@ -49,7 +67,7 @@
49/* Events that may be reported to user */ 67/* Events that may be reported to user */
50#define FANOTIFY_OUTGOING_EVENTS (FANOTIFY_EVENTS | \ 68#define FANOTIFY_OUTGOING_EVENTS (FANOTIFY_EVENTS | \
51 FANOTIFY_PERM_EVENTS | \ 69 FANOTIFY_PERM_EVENTS | \
52 FAN_Q_OVERFLOW) 70 FAN_Q_OVERFLOW | FAN_ONDIR)
53 71
54#define ALL_FANOTIFY_EVENT_BITS (FANOTIFY_OUTGOING_EVENTS | \ 72#define ALL_FANOTIFY_EVENT_BITS (FANOTIFY_OUTGOING_EVENTS | \
55 FANOTIFY_EVENT_FLAGS) 73 FANOTIFY_EVENT_FLAGS)
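A userspace sketch of how the new dirent events combine with FAN_REPORT_FID (the fanotify_init() flag this series adds for identifying objects by file handle); the path handling and error returns are illustrative only:

	#include <fcntl.h>
	#include <sys/fanotify.h>

	int watch_dir(const char *path)
	{
		int fd = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_FID, 0);

		if (fd < 0)
			return -1;
		/* dirent events are reported against the marked directory itself */
		if (fanotify_mark(fd, FAN_MARK_ADD,
				  FAN_CREATE | FAN_DELETE | FAN_MOVE | FAN_ONDIR,
				  AT_FDCWD, path) < 0)
			return -1;
		return fd;
	}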
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1a775aa3e349..2cc540805a02 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -37,6 +37,7 @@
37#include <linux/uuid.h> 37#include <linux/uuid.h>
38#include <linux/errseq.h> 38#include <linux/errseq.h>
39#include <linux/ioprio.h> 39#include <linux/ioprio.h>
40#include <linux/fs_types.h>
40#include <linux/build_bug.h> 41#include <linux/build_bug.h>
41#include <linux/stddef.h> 42#include <linux/stddef.h>
42 43
@@ -1709,22 +1710,6 @@ int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical,
1709int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags); 1710int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags);
1710 1711
1711/* 1712/*
1712 * File types
1713 *
1714 * NOTE! These match bits 12..15 of stat.st_mode
1715 * (ie "(i_mode >> 12) & 15").
1716 */
1717#define DT_UNKNOWN 0
1718#define DT_FIFO 1
1719#define DT_CHR 2
1720#define DT_DIR 4
1721#define DT_BLK 6
1722#define DT_REG 8
1723#define DT_LNK 10
1724#define DT_SOCK 12
1725#define DT_WHT 14
1726
1727/*
1728 * This is the "filldir" function type, used by readdir() to let 1713 * This is the "filldir" function type, used by readdir() to let
1729 * the kernel specify what kind of dirent layout it wants to have. 1714 * the kernel specify what kind of dirent layout it wants to have.
1730 * This allows the kernel to read directories into kernel space or 1715 * This allows the kernel to read directories into kernel space or
diff --git a/include/linux/fs_types.h b/include/linux/fs_types.h
new file mode 100644
index 000000000000..54816791196f
--- /dev/null
+++ b/include/linux/fs_types.h
@@ -0,0 +1,75 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef _LINUX_FS_TYPES_H
3#define _LINUX_FS_TYPES_H
4
5/*
6 * This is a header for the common implementation of dirent
7 * to fs on-disk file type conversion. Although the fs on-disk
8 * bits are specific to every file system, in practice, many
9 * file systems use the exact same on-disk format to describe
10 * the lower 3 file type bits that represent the 7 POSIX file
11 * types.
12 *
13 * It is important to note that the definitions in this
14 * header MUST NOT change. This would break both the
15 * userspace ABI and the on-disk format of filesystems
16 * using this code.
17 *
18 * All those file systems can use this generic code for the
19 * conversions.
20 */
21
22/*
23 * struct dirent file types
24 * exposed to user via getdents(2), readdir(3)
25 *
26 * These match bits 12..15 of stat.st_mode
27 * (ie "(i_mode >> 12) & 15").
28 */
29#define S_DT_SHIFT 12
30#define S_DT(mode) (((mode) & S_IFMT) >> S_DT_SHIFT)
31#define S_DT_MASK (S_IFMT >> S_DT_SHIFT)
32
33/* these are defined by POSIX and also present in glibc's dirent.h */
34#define DT_UNKNOWN 0
35#define DT_FIFO 1
36#define DT_CHR 2
37#define DT_DIR 4
38#define DT_BLK 6
39#define DT_REG 8
40#define DT_LNK 10
41#define DT_SOCK 12
42#define DT_WHT 14
43
44#define DT_MAX (S_DT_MASK + 1) /* 16 */
45
46/*
47 * fs on-disk file types.
48 * Only the low 3 bits are used for the POSIX file types.
49 * Other bits are reserved for fs private use.
50 * These definitions are shared and used by multiple filesystems,
51 * and MUST NOT change under any circumstances.
52 *
53 * Note that no fs currently stores the whiteout type on-disk,
54 * so whiteout dirents are exposed to user as DT_CHR.
55 */
56#define FT_UNKNOWN 0
57#define FT_REG_FILE 1
58#define FT_DIR 2
59#define FT_CHRDEV 3
60#define FT_BLKDEV 4
61#define FT_FIFO 5
62#define FT_SOCK 6
63#define FT_SYMLINK 7
64
65#define FT_MAX 8
66
67/*
68 * declarations for helper functions, accompanying implementation
69 * is in fs/fs_types.c
70 */
71extern unsigned char fs_ftype_to_dtype(unsigned int filetype);
72extern unsigned char fs_umode_to_ftype(umode_t mode);
73extern unsigned char fs_umode_to_dtype(umode_t mode);
74
75#endif
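The helpers declared here live in fs/fs_types.c; given the fixed numbering above, they are presumably little more than table lookups. A sketch:

	static const unsigned char fs_dtype_by_ftype[FT_MAX] = {
		[FT_UNKNOWN]	= DT_UNKNOWN,
		[FT_REG_FILE]	= DT_REG,
		[FT_DIR]	= DT_DIR,
		[FT_CHRDEV]	= DT_CHR,
		[FT_BLKDEV]	= DT_BLK,
		[FT_FIFO]	= DT_FIFO,
		[FT_SOCK]	= DT_SOCK,
		[FT_SYMLINK]	= DT_LNK,
	};

	unsigned char fs_ftype_to_dtype(unsigned int filetype)
	{
		if (filetype >= FT_MAX)
			return DT_UNKNOWN;
		return fs_dtype_by_ftype[filetype];
	}

	/* mode bits already encode the dirent type, see S_DT() above */
	unsigned char fs_umode_to_dtype(umode_t mode)
	{
		return S_DT(mode);
	}

fs_umode_to_ftype() would invert the first table via S_DT(mode); the exact implementation may differ, but the ABI constraint is the tables, not the code.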
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 2ccb08cb5d6a..09587e2860b5 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -17,8 +17,22 @@
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/bug.h> 18#include <linux/bug.h>
19 19
20/*
21 * Notify this @dir inode about a change in the directory entry @dentry.
22 *
23 * Unlike fsnotify_parent(), the event will be reported regardless of the
24 * FS_EVENT_ON_CHILD mask on the parent inode.
25 */
26static inline int fsnotify_dirent(struct inode *dir, struct dentry *dentry,
27 __u32 mask)
28{
29 return fsnotify(dir, mask, d_inode(dentry), FSNOTIFY_EVENT_INODE,
30 dentry->d_name.name, 0);
31}
32
20/* Notify this dentry's parent about a child's events. */ 33/* Notify this dentry's parent about a child's events. */
21static inline int fsnotify_parent(const struct path *path, struct dentry *dentry, __u32 mask) 34static inline int fsnotify_parent(const struct path *path,
35 struct dentry *dentry, __u32 mask)
22{ 36{
23 if (!dentry) 37 if (!dentry)
24 dentry = path->dentry; 38 dentry = path->dentry;
@@ -65,6 +79,9 @@ static inline int fsnotify_perm(struct file *file, int mask)
65 fsnotify_mask = FS_ACCESS_PERM; 79 fsnotify_mask = FS_ACCESS_PERM;
66 } 80 }
67 81
82 if (S_ISDIR(inode->i_mode))
83 fsnotify_mask |= FS_ISDIR;
84
68 return fsnotify_path(inode, path, fsnotify_mask); 85 return fsnotify_path(inode, path, fsnotify_mask);
69} 86}
70 87
@@ -73,7 +90,12 @@ static inline int fsnotify_perm(struct file *file, int mask)
73 */ 90 */
74static inline void fsnotify_link_count(struct inode *inode) 91static inline void fsnotify_link_count(struct inode *inode)
75{ 92{
76 fsnotify(inode, FS_ATTRIB, inode, FSNOTIFY_EVENT_INODE, NULL, 0); 93 __u32 mask = FS_ATTRIB;
94
95 if (S_ISDIR(inode->i_mode))
96 mask |= FS_ISDIR;
97
98 fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
77} 99}
78 100
79/* 101/*
@@ -81,12 +103,14 @@ static inline void fsnotify_link_count(struct inode *inode)
81 */ 103 */
82static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, 104static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
83 const unsigned char *old_name, 105 const unsigned char *old_name,
84 int isdir, struct inode *target, struct dentry *moved) 106 int isdir, struct inode *target,
107 struct dentry *moved)
85{ 108{
86 struct inode *source = moved->d_inode; 109 struct inode *source = moved->d_inode;
87 u32 fs_cookie = fsnotify_get_cookie(); 110 u32 fs_cookie = fsnotify_get_cookie();
88 __u32 old_dir_mask = (FS_EVENT_ON_CHILD | FS_MOVED_FROM); 111 __u32 old_dir_mask = FS_MOVED_FROM;
89 __u32 new_dir_mask = (FS_EVENT_ON_CHILD | FS_MOVED_TO); 112 __u32 new_dir_mask = FS_MOVED_TO;
113 __u32 mask = FS_MOVE_SELF;
90 const unsigned char *new_name = moved->d_name.name; 114 const unsigned char *new_name = moved->d_name.name;
91 115
92 if (old_dir == new_dir) 116 if (old_dir == new_dir)
@@ -95,6 +119,7 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
95 if (isdir) { 119 if (isdir) {
96 old_dir_mask |= FS_ISDIR; 120 old_dir_mask |= FS_ISDIR;
97 new_dir_mask |= FS_ISDIR; 121 new_dir_mask |= FS_ISDIR;
122 mask |= FS_ISDIR;
98 } 123 }
99 124
100 fsnotify(old_dir, old_dir_mask, source, FSNOTIFY_EVENT_INODE, old_name, 125 fsnotify(old_dir, old_dir_mask, source, FSNOTIFY_EVENT_INODE, old_name,
@@ -106,7 +131,7 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
106 fsnotify_link_count(target); 131 fsnotify_link_count(target);
107 132
108 if (source) 133 if (source)
109 fsnotify(source, FS_MOVE_SELF, moved->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0); 134 fsnotify(source, mask, source, FSNOTIFY_EVENT_INODE, NULL, 0);
110 audit_inode_child(new_dir, moved, AUDIT_TYPE_CHILD_CREATE); 135 audit_inode_child(new_dir, moved, AUDIT_TYPE_CHILD_CREATE);
111} 136}
112 137
@@ -128,15 +153,35 @@ static inline void fsnotify_vfsmount_delete(struct vfsmount *mnt)
128 153
129/* 154/*
130 * fsnotify_nameremove - a filename was removed from a directory 155 * fsnotify_nameremove - a filename was removed from a directory
156 *
157 * This is mostly called under parent vfs inode lock so name and
158 * dentry->d_parent should be stable. However there are some corner cases where
159 * inode lock is not held. So to be on the safe side and be resilient to future
160 * callers and out of tree users of d_delete(), we do not assume that d_parent
161 * and d_name are stable and we use dget_parent() and
162 * take_dentry_name_snapshot() to grab stable references.
131 */ 163 */
132static inline void fsnotify_nameremove(struct dentry *dentry, int isdir) 164static inline void fsnotify_nameremove(struct dentry *dentry, int isdir)
133{ 165{
166 struct dentry *parent;
167 struct name_snapshot name;
134 __u32 mask = FS_DELETE; 168 __u32 mask = FS_DELETE;
135 169
170 /* d_delete() of pseudo inode? (e.g. __ns_get_path() playing tricks) */
171 if (IS_ROOT(dentry))
172 return;
173
136 if (isdir) 174 if (isdir)
137 mask |= FS_ISDIR; 175 mask |= FS_ISDIR;
138 176
139 fsnotify_parent(NULL, dentry, mask); 177 parent = dget_parent(dentry);
178 take_dentry_name_snapshot(&name, dentry);
179
180 fsnotify(d_inode(parent), mask, d_inode(dentry), FSNOTIFY_EVENT_INODE,
181 name.name, 0);
182
183 release_dentry_name_snapshot(&name);
184 dput(parent);
140} 185}
141 186
142/* 187/*
@@ -144,7 +189,12 @@ static inline void fsnotify_nameremove(struct dentry *dentry, int isdir)
144 */ 189 */
145static inline void fsnotify_inoderemove(struct inode *inode) 190static inline void fsnotify_inoderemove(struct inode *inode)
146{ 191{
147 fsnotify(inode, FS_DELETE_SELF, inode, FSNOTIFY_EVENT_INODE, NULL, 0); 192 __u32 mask = FS_DELETE_SELF;
193
194 if (S_ISDIR(inode->i_mode))
195 mask |= FS_ISDIR;
196
197 fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
148 __fsnotify_inode_delete(inode); 198 __fsnotify_inode_delete(inode);
149} 199}
150 200
@@ -155,7 +205,7 @@ static inline void fsnotify_create(struct inode *inode, struct dentry *dentry)
155{ 205{
156 audit_inode_child(inode, dentry, AUDIT_TYPE_CHILD_CREATE); 206 audit_inode_child(inode, dentry, AUDIT_TYPE_CHILD_CREATE);
157 207
158 fsnotify(inode, FS_CREATE, dentry->d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name, 0); 208 fsnotify_dirent(inode, dentry, FS_CREATE);
159} 209}
160 210
161/* 211/*
@@ -176,12 +226,9 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct
176 */ 226 */
177static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry) 227static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
178{ 228{
179 __u32 mask = (FS_CREATE | FS_ISDIR);
180 struct inode *d_inode = dentry->d_inode;
181
182 audit_inode_child(inode, dentry, AUDIT_TYPE_CHILD_CREATE); 229 audit_inode_child(inode, dentry, AUDIT_TYPE_CHILD_CREATE);
183 230
184 fsnotify(inode, mask, d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name, 0); 231 fsnotify_dirent(inode, dentry, FS_CREATE | FS_ISDIR);
185} 232}
186 233
187/* 234/*
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 7639774e7475..dfc28fcb4de8 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -59,27 +59,33 @@
59 * dnotify and inotify. */ 59 * dnotify and inotify. */
60#define FS_EVENT_ON_CHILD 0x08000000 60#define FS_EVENT_ON_CHILD 0x08000000
61 61
62/* This is a list of all events that may get sent to a parent based on fs event
63 * happening to inodes inside that directory */
64#define FS_EVENTS_POSS_ON_CHILD (FS_ACCESS | FS_MODIFY | FS_ATTRIB |\
65 FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | FS_OPEN |\
66 FS_MOVED_FROM | FS_MOVED_TO | FS_CREATE |\
67 FS_DELETE | FS_OPEN_PERM | FS_ACCESS_PERM | \
68 FS_OPEN_EXEC | FS_OPEN_EXEC_PERM)
69
70#define FS_MOVE (FS_MOVED_FROM | FS_MOVED_TO) 62#define FS_MOVE (FS_MOVED_FROM | FS_MOVED_TO)
71 63
64/*
65 * Directory entry modification events - reported only to directory
66 * where entry is modified and not to a watching parent.
67 * The watching parent may get an FS_ATTRIB|FS_EVENT_ON_CHILD event
68 * when a directory entry inside a child subdir changes.
69 */
70#define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE)
71
72#define ALL_FSNOTIFY_PERM_EVENTS (FS_OPEN_PERM | FS_ACCESS_PERM | \ 72#define ALL_FSNOTIFY_PERM_EVENTS (FS_OPEN_PERM | FS_ACCESS_PERM | \
73 FS_OPEN_EXEC_PERM) 73 FS_OPEN_EXEC_PERM)
74 74
75/*
76 * This is a list of all events that may get sent to a parent based on fs event
77 * happening to inodes inside that directory.
78 */
79#define FS_EVENTS_POSS_ON_CHILD (ALL_FSNOTIFY_PERM_EVENTS | \
80 FS_ACCESS | FS_MODIFY | FS_ATTRIB | \
81 FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | \
82 FS_OPEN | FS_OPEN_EXEC)
83
75/* Events that can be reported to backends */ 84/* Events that can be reported to backends */
76#define ALL_FSNOTIFY_EVENTS (FS_ACCESS | FS_MODIFY | FS_ATTRIB | \ 85#define ALL_FSNOTIFY_EVENTS (ALL_FSNOTIFY_DIRENT_EVENTS | \
77 FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | FS_OPEN | \ 86 FS_EVENTS_POSS_ON_CHILD | \
78 FS_MOVED_FROM | FS_MOVED_TO | FS_CREATE | \ 87 FS_DELETE_SELF | FS_MOVE_SELF | FS_DN_RENAME | \
79 FS_DELETE | FS_DELETE_SELF | FS_MOVE_SELF | \ 88 FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED)
80 FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED | \
81 FS_OPEN_PERM | FS_ACCESS_PERM | FS_DN_RENAME | \
82 FS_OPEN_EXEC | FS_OPEN_EXEC_PERM)
83 89
84/* Extra flags that may be reported with event or control handling of events */ 90/* Extra flags that may be reported with event or control handling of events */
85#define ALL_FSNOTIFY_FLAGS (FS_EXCL_UNLINK | FS_ISDIR | FS_IN_ONESHOT | \ 91#define ALL_FSNOTIFY_FLAGS (FS_EXCL_UNLINK | FS_ISDIR | FS_IN_ONESHOT | \
@@ -129,7 +135,6 @@ struct fsnotify_event {
129 struct list_head list; 135 struct list_head list;
130 /* inode may ONLY be dereferenced during handle_event(). */ 136 /* inode may ONLY be dereferenced during handle_event(). */
131 struct inode *inode; /* either the inode the event happened to or its parent */ 137 struct inode *inode; /* either the inode the event happened to or its parent */
132 u32 mask; /* the type of access, bitwise OR for FS_* event types */
133}; 138};
134 139
135/* 140/*
@@ -288,6 +293,7 @@ typedef struct fsnotify_mark_connector __rcu *fsnotify_connp_t;
288struct fsnotify_mark_connector { 293struct fsnotify_mark_connector {
289 spinlock_t lock; 294 spinlock_t lock;
290 unsigned int type; /* Type of object [lock] */ 295 unsigned int type; /* Type of object [lock] */
296 __kernel_fsid_t fsid; /* fsid of filesystem containing object */
291 union { 297 union {
292 /* Object pointer [lock] */ 298 /* Object pointer [lock] */
293 fsnotify_connp_t *obj; 299 fsnotify_connp_t *obj;
@@ -416,6 +422,9 @@ extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group);
416extern struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group); 422extern struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group);
417/* return AND dequeue the first event on the notification queue */ 423/* return AND dequeue the first event on the notification queue */
418extern struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group); 424extern struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group);
425/* Remove event queued in the notification list */
426extern void fsnotify_remove_queued_event(struct fsnotify_group *group,
427 struct fsnotify_event *event);
419 428
420/* functions used to manipulate the marks attached to inodes */ 429/* functions used to manipulate the marks attached to inodes */
421 430
@@ -428,28 +437,35 @@ extern void fsnotify_init_mark(struct fsnotify_mark *mark,
428/* Find mark belonging to given group in the list of marks */ 437/* Find mark belonging to given group in the list of marks */
429extern struct fsnotify_mark *fsnotify_find_mark(fsnotify_connp_t *connp, 438extern struct fsnotify_mark *fsnotify_find_mark(fsnotify_connp_t *connp,
430 struct fsnotify_group *group); 439 struct fsnotify_group *group);
440/* Get cached fsid of filesystem containing object */
441extern int fsnotify_get_conn_fsid(const struct fsnotify_mark_connector *conn,
442 __kernel_fsid_t *fsid);
431/* attach the mark to the object */ 443/* attach the mark to the object */
432extern int fsnotify_add_mark(struct fsnotify_mark *mark, 444extern int fsnotify_add_mark(struct fsnotify_mark *mark,
433 fsnotify_connp_t *connp, unsigned int type, 445 fsnotify_connp_t *connp, unsigned int type,
434 int allow_dups); 446 int allow_dups, __kernel_fsid_t *fsid);
435extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark, 447extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
436 fsnotify_connp_t *connp, unsigned int type, 448 fsnotify_connp_t *connp,
437 int allow_dups); 449 unsigned int type, int allow_dups,
450 __kernel_fsid_t *fsid);
451
438/* attach the mark to the inode */ 452/* attach the mark to the inode */
439static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark, 453static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
440 struct inode *inode, 454 struct inode *inode,
441 int allow_dups) 455 int allow_dups)
442{ 456{
443 return fsnotify_add_mark(mark, &inode->i_fsnotify_marks, 457 return fsnotify_add_mark(mark, &inode->i_fsnotify_marks,
444 FSNOTIFY_OBJ_TYPE_INODE, allow_dups); 458 FSNOTIFY_OBJ_TYPE_INODE, allow_dups, NULL);
445} 459}
446static inline int fsnotify_add_inode_mark_locked(struct fsnotify_mark *mark, 460static inline int fsnotify_add_inode_mark_locked(struct fsnotify_mark *mark,
447 struct inode *inode, 461 struct inode *inode,
448 int allow_dups) 462 int allow_dups)
449{ 463{
450 return fsnotify_add_mark_locked(mark, &inode->i_fsnotify_marks, 464 return fsnotify_add_mark_locked(mark, &inode->i_fsnotify_marks,
451 FSNOTIFY_OBJ_TYPE_INODE, allow_dups); 465 FSNOTIFY_OBJ_TYPE_INODE, allow_dups,
466 NULL);
452} 467}
468
453/* given a group and a mark, flag mark to be freed when all references are dropped */ 469/* given a group and a mark, flag mark to be freed when all references are dropped */
454extern void fsnotify_destroy_mark(struct fsnotify_mark *mark, 470extern void fsnotify_destroy_mark(struct fsnotify_mark *mark,
455 struct fsnotify_group *group); 471 struct fsnotify_group *group);
@@ -479,9 +495,12 @@ extern void fsnotify_put_mark(struct fsnotify_mark *mark);
479extern void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info); 495extern void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info);
480extern bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info); 496extern bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info);
481 497
482/* put here because inotify does some weird stuff when destroying watches */ 498static inline void fsnotify_init_event(struct fsnotify_event *event,
483extern void fsnotify_init_event(struct fsnotify_event *event, 499 struct inode *inode)
484 struct inode *to_tell, u32 mask); 500{
501 INIT_LIST_HEAD(&event->list);
502 event->inode = inode;
503}
485 504
486#else 505#else
487 506
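
The header reshuffle above is intended to be bit-for-bit neutral: the new ALL_FSNOTIFY_DIRENT_EVENTS and FS_EVENTS_POSS_ON_CHILD groups, OR'ed back together with the self/queue events, must cover exactly what the old flat ALL_FSNOTIFY_EVENTS listed. A quick standalone check, with the FS_* values copied from fsnotify_backend.h (verify them against your tree before relying on this):

#include <assert.h>
#include <stdio.h>

/* Bit values copied from fsnotify_backend.h (inotify-compatible). */
#define FS_ACCESS         0x00000001
#define FS_MODIFY         0x00000002
#define FS_ATTRIB         0x00000004
#define FS_CLOSE_WRITE    0x00000008
#define FS_CLOSE_NOWRITE  0x00000010
#define FS_OPEN           0x00000020
#define FS_MOVED_FROM     0x00000040
#define FS_MOVED_TO       0x00000080
#define FS_CREATE         0x00000100
#define FS_DELETE         0x00000200
#define FS_DELETE_SELF    0x00000400
#define FS_MOVE_SELF      0x00000800
#define FS_OPEN_EXEC      0x00001000
#define FS_UNMOUNT        0x00002000
#define FS_Q_OVERFLOW     0x00004000
#define FS_IN_IGNORED     0x00008000
#define FS_OPEN_PERM      0x00010000
#define FS_ACCESS_PERM    0x00020000
#define FS_OPEN_EXEC_PERM 0x00040000
#define FS_DN_RENAME      0x10000000

#define FS_MOVE (FS_MOVED_FROM | FS_MOVED_TO)

#define OLD_ALL (FS_ACCESS | FS_MODIFY | FS_ATTRIB | FS_CLOSE_WRITE | \
                 FS_CLOSE_NOWRITE | FS_OPEN | FS_MOVED_FROM | FS_MOVED_TO | \
                 FS_CREATE | FS_DELETE | FS_DELETE_SELF | FS_MOVE_SELF | \
                 FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED | FS_OPEN_PERM | \
                 FS_ACCESS_PERM | FS_DN_RENAME | FS_OPEN_EXEC | \
                 FS_OPEN_EXEC_PERM)

#define DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE)
#define PERM_EVENTS   (FS_OPEN_PERM | FS_ACCESS_PERM | FS_OPEN_EXEC_PERM)
#define POSS_ON_CHILD (PERM_EVENTS | FS_ACCESS | FS_MODIFY | FS_ATTRIB | \
                       FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | FS_OPEN | \
                       FS_OPEN_EXEC)
#define NEW_ALL (DIRENT_EVENTS | POSS_ON_CHILD | FS_DELETE_SELF | \
                 FS_MOVE_SELF | FS_DN_RENAME | FS_UNMOUNT | FS_Q_OVERFLOW | \
                 FS_IN_IGNORED)

int main(void)
{
    printf("old=0x%08x new=0x%08x\n", OLD_ALL, NEW_ALL);
    assert(OLD_ALL == NEW_ALL); /* the regrouping adds/drops no bits */
    return 0;
}
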
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index e07e91daaacc..201f0f2683f2 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -442,6 +442,11 @@ static inline int enable_kprobe(struct kprobe *kp)
442{ 442{
443 return -ENOSYS; 443 return -ENOSYS;
444} 444}
445
446static inline bool within_kprobe_blacklist(unsigned long addr)
447{
448 return true;
449}
445#endif /* CONFIG_KPROBES */ 450#endif /* CONFIG_KPROBES */
446static inline int disable_kretprobe(struct kretprobe *rp) 451static inline int disable_kretprobe(struct kretprobe *rp)
447{ 452{
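
The new !CONFIG_KPROBES stub fails safe: with kprobes compiled out, within_kprobe_blacklist() claims every address is blacklisted, so any "may I probe here?" caller degrades to "never". The shape of the pattern in a standalone sketch (CONFIG_KPROBES is a plain demo macro here, not real Kconfig plumbing):

#include <stdbool.h>
#include <stdio.h>

/* #define CONFIG_KPROBES */

#ifdef CONFIG_KPROBES
static bool within_kprobe_blacklist(unsigned long addr)
{
    (void)addr;
    return false; /* a real lookup against the blacklist would go here */
}
#else
/* kprobes compiled out: treat every address as off-limits */
static inline bool within_kprobe_blacklist(unsigned long addr)
{
    (void)addr;
    return true;
}
#endif

int main(void)
{
    printf("probe allowed at 0x1234: %d\n",
           !within_kprobe_blacklist(0x1234));
    return 0;
}
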
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 9a0bdf91e646..85a301632cf1 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1270,7 +1270,7 @@
1270 * @cred contains the credentials to use. 1270 * @cred contains the credentials to use.
1271 * @ns contains the user namespace we want the capability in 1271 * @ns contains the user namespace we want the capability in
1272 * @cap contains the capability <include/linux/capability.h>. 1272 * @cap contains the capability <include/linux/capability.h>.
1273 * @audit contains whether to write an audit message or not 1273 * @opts contains options for the capable check <include/linux/security.h>
1274 * Return 0 if the capability is granted for @tsk. 1274 * Return 0 if the capability is granted for @tsk.
1275 * @syslog: 1275 * @syslog:
1276 * Check permission before accessing the kernel message ring or changing 1276 * Check permission before accessing the kernel message ring or changing
@@ -1344,7 +1344,6 @@
1344 * @field contains the field which relates to current LSM. 1344 * @field contains the field which relates to current LSM.
1345 * @op contains the operator that will be used for matching. 1345 * @op contains the operator that will be used for matching.
1346 * @rule points to the audit rule that will be checked against. 1346 * @rule points to the audit rule that will be checked against.
1347 * @actx points to the audit context associated with the check.
1348 * Return 1 if secid matches the rule, 0 if it does not, -ERRNO on failure. 1347 * Return 1 if secid matches the rule, 0 if it does not, -ERRNO on failure.
1349 * 1348 *
1350 * @audit_rule_free: 1349 * @audit_rule_free:
@@ -1446,8 +1445,10 @@ union security_list_options {
1446 const kernel_cap_t *effective, 1445 const kernel_cap_t *effective,
1447 const kernel_cap_t *inheritable, 1446 const kernel_cap_t *inheritable,
1448 const kernel_cap_t *permitted); 1447 const kernel_cap_t *permitted);
1449 int (*capable)(const struct cred *cred, struct user_namespace *ns, 1448 int (*capable)(const struct cred *cred,
1450 int cap, int audit); 1449 struct user_namespace *ns,
1450 int cap,
1451 unsigned int opts);
1451 int (*quotactl)(int cmds, int type, int id, struct super_block *sb); 1452 int (*quotactl)(int cmds, int type, int id, struct super_block *sb);
1452 int (*quota_on)(struct dentry *dentry); 1453 int (*quota_on)(struct dentry *dentry);
1453 int (*syslog)(int type); 1454 int (*syslog)(int type);
@@ -1764,8 +1765,7 @@ union security_list_options {
1764 int (*audit_rule_init)(u32 field, u32 op, char *rulestr, 1765 int (*audit_rule_init)(u32 field, u32 op, char *rulestr,
1765 void **lsmrule); 1766 void **lsmrule);
1766 int (*audit_rule_known)(struct audit_krule *krule); 1767 int (*audit_rule_known)(struct audit_krule *krule);
1767 int (*audit_rule_match)(u32 secid, u32 field, u32 op, void *lsmrule, 1768 int (*audit_rule_match)(u32 secid, u32 field, u32 op, void *lsmrule);
1768 struct audit_context *actx);
1769 void (*audit_rule_free)(void *lsmrule); 1769 void (*audit_rule_free)(void *lsmrule);
1770#endif /* CONFIG_AUDIT */ 1770#endif /* CONFIG_AUDIT */
1771 1771
@@ -2028,6 +2028,18 @@ struct security_hook_list {
2028} __randomize_layout; 2028} __randomize_layout;
2029 2029
2030/* 2030/*
2031 * Security blob size or offset data.
2032 */
2033struct lsm_blob_sizes {
2034 int lbs_cred;
2035 int lbs_file;
2036 int lbs_inode;
2037 int lbs_ipc;
2038 int lbs_msg_msg;
2039 int lbs_task;
2040};
2041
2042/*
2031 * Initializing a security_hook_list structure takes 2043 * Initializing a security_hook_list structure takes
2032 * up a lot of space in a source file. This macro takes 2044 * up a lot of space in a source file. This macro takes
2033 * care of the common case and reduces the amount of 2045 * care of the common case and reduces the amount of
@@ -2042,9 +2054,21 @@ extern char *lsm_names;
2042extern void security_add_hooks(struct security_hook_list *hooks, int count, 2054extern void security_add_hooks(struct security_hook_list *hooks, int count,
2043 char *lsm); 2055 char *lsm);
2044 2056
2057#define LSM_FLAG_LEGACY_MAJOR BIT(0)
2058#define LSM_FLAG_EXCLUSIVE BIT(1)
2059
2060enum lsm_order {
2061 LSM_ORDER_FIRST = -1, /* This is only for capabilities. */
2062 LSM_ORDER_MUTABLE = 0,
2063};
2064
2045struct lsm_info { 2065struct lsm_info {
2046 const char *name; /* Required. */ 2066 const char *name; /* Required. */
2067 enum lsm_order order; /* Optional: default is LSM_ORDER_MUTABLE */
2068 unsigned long flags; /* Optional: flags describing LSM */
2069 int *enabled; /* Optional: controlled by CONFIG_LSM */
2047 int (*init)(void); /* Required. */ 2070 int (*init)(void); /* Required. */
2071 struct lsm_blob_sizes *blobs; /* Optional: for blob sharing. */
2048}; 2072};
2049 2073
2050extern struct lsm_info __start_lsm_info[], __end_lsm_info[]; 2074extern struct lsm_info __start_lsm_info[], __end_lsm_info[];
@@ -2084,17 +2108,6 @@ static inline void security_delete_hooks(struct security_hook_list *hooks,
2084#define __lsm_ro_after_init __ro_after_init 2108#define __lsm_ro_after_init __ro_after_init
2085#endif /* CONFIG_SECURITY_WRITABLE_HOOKS */ 2109#endif /* CONFIG_SECURITY_WRITABLE_HOOKS */
2086 2110
2087extern int __init security_module_enable(const char *module); 2111extern int lsm_inode_alloc(struct inode *inode);
2088extern void __init capability_add_hooks(void);
2089#ifdef CONFIG_SECURITY_YAMA
2090extern void __init yama_add_hooks(void);
2091#else
2092static inline void __init yama_add_hooks(void) { }
2093#endif
2094#ifdef CONFIG_SECURITY_LOADPIN
2095void __init loadpin_add_hooks(void);
2096#else
2097static inline void loadpin_add_hooks(void) { };
2098#endif
2099 2112
2100#endif /* ! __LINUX_LSM_HOOKS_H */ 2113#endif /* ! __LINUX_LSM_HOOKS_H */
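
struct lsm_blob_sizes above is the hook for blob sharing: at boot the LSM infrastructure sums every registered module's lbs_* requests and hands each module back an offset into one shared allocation, rather than each LSM allocating its own security blob. A userspace sketch of that accumulation; the helper name mirrors the one this series adds in security/security.c, but the surrounding code is illustrative only:

#include <stdio.h>

struct lsm_blob_sizes {
    int lbs_cred;
    int lbs_file;
    int lbs_inode;
    int lbs_ipc;
    int lbs_msg_msg;
    int lbs_task;
};

/* Rewrite an LSM's requested size into its offset in the shared blob. */
static void lsm_set_blob_size(int *need, int *total)
{
    int offset = *total;

    *total += *need;   /* grow the shared blob */
    *need = offset;    /* the LSM now holds its offset, not a size */
}

int main(void)
{
    struct lsm_blob_sizes total = { 0 };
    struct lsm_blob_sizes selinux = { .lbs_inode = 32 }; /* sizes invented */
    struct lsm_blob_sizes smack = { .lbs_inode = 8 };

    lsm_set_blob_size(&selinux.lbs_inode, &total.lbs_inode);
    lsm_set_blob_size(&smack.lbs_inode, &total.lbs_inode);

    printf("inode blob: %d bytes total; selinux at +%d, smack at +%d\n",
           total.lbs_inode, selinux.lbs_inode, smack.lbs_inode);
    return 0;
}
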
diff --git a/include/linux/namei.h b/include/linux/namei.h
index a78606e8e3df..9138b4471dbf 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -24,6 +24,8 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
24 * - internal "there are more path components" flag 24 * - internal "there are more path components" flag
25 * - dentry cache is untrusted; force a real lookup 25 * - dentry cache is untrusted; force a real lookup
26 * - suppress terminal automount 26 * - suppress terminal automount
27 * - skip revalidation
28 * - don't fetch xattrs on audit_inode
27 */ 29 */
28#define LOOKUP_FOLLOW 0x0001 30#define LOOKUP_FOLLOW 0x0001
29#define LOOKUP_DIRECTORY 0x0002 31#define LOOKUP_DIRECTORY 0x0002
@@ -33,6 +35,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
33#define LOOKUP_REVAL 0x0020 35#define LOOKUP_REVAL 0x0020
34#define LOOKUP_RCU 0x0040 36#define LOOKUP_RCU 0x0040
35#define LOOKUP_NO_REVAL 0x0080 37#define LOOKUP_NO_REVAL 0x0080
38#define LOOKUP_NO_EVAL 0x0100
36 39
37/* 40/*
38 * Intent data 41 * Intent data
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f073bd59df32..1549584a1538 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -872,8 +872,10 @@ struct task_struct {
872 872
873 struct callback_head *task_works; 873 struct callback_head *task_works;
874 874
875 struct audit_context *audit_context; 875#ifdef CONFIG_AUDIT
876#ifdef CONFIG_AUDITSYSCALL 876#ifdef CONFIG_AUDITSYSCALL
877 struct audit_context *audit_context;
878#endif
877 kuid_t loginuid; 879 kuid_t loginuid;
878 unsigned int sessionid; 880 unsigned int sessionid;
879#endif 881#endif
diff --git a/include/linux/security.h b/include/linux/security.h
index dbfb5a66babb..2b35a43d11d6 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -54,9 +54,12 @@ struct xattr;
54struct xfrm_sec_ctx; 54struct xfrm_sec_ctx;
55struct mm_struct; 55struct mm_struct;
56 56
57/* Default (no) options for the capable function */
58#define CAP_OPT_NONE 0x0
57/* If capable should audit the security request */ 59/* If capable should audit the security request */
58#define SECURITY_CAP_NOAUDIT 0 60#define CAP_OPT_NOAUDIT BIT(1)
59#define SECURITY_CAP_AUDIT 1 61/* If capable is being called by a setid function */
62#define CAP_OPT_INSETID BIT(2)
60 63
61/* LSM Agnostic defines for sb_set_mnt_opts */ 64/* LSM Agnostic defines for sb_set_mnt_opts */
62#define SECURITY_LSM_NATIVE_LABELS 1 65#define SECURITY_LSM_NATIVE_LABELS 1
@@ -72,7 +75,7 @@ enum lsm_event {
72 75
73/* These functions are in security/commoncap.c */ 76/* These functions are in security/commoncap.c */
74extern int cap_capable(const struct cred *cred, struct user_namespace *ns, 77extern int cap_capable(const struct cred *cred, struct user_namespace *ns,
75 int cap, int audit); 78 int cap, unsigned int opts);
76extern int cap_settime(const struct timespec64 *ts, const struct timezone *tz); 79extern int cap_settime(const struct timespec64 *ts, const struct timezone *tz);
77extern int cap_ptrace_access_check(struct task_struct *child, unsigned int mode); 80extern int cap_ptrace_access_check(struct task_struct *child, unsigned int mode);
78extern int cap_ptrace_traceme(struct task_struct *parent); 81extern int cap_ptrace_traceme(struct task_struct *parent);
@@ -207,10 +210,10 @@ int security_capset(struct cred *new, const struct cred *old,
207 const kernel_cap_t *effective, 210 const kernel_cap_t *effective,
208 const kernel_cap_t *inheritable, 211 const kernel_cap_t *inheritable,
209 const kernel_cap_t *permitted); 212 const kernel_cap_t *permitted);
210int security_capable(const struct cred *cred, struct user_namespace *ns, 213int security_capable(const struct cred *cred,
211 int cap); 214 struct user_namespace *ns,
212int security_capable_noaudit(const struct cred *cred, struct user_namespace *ns, 215 int cap,
213 int cap); 216 unsigned int opts);
214int security_quotactl(int cmds, int type, int id, struct super_block *sb); 217int security_quotactl(int cmds, int type, int id, struct super_block *sb);
215int security_quota_on(struct dentry *dentry); 218int security_quota_on(struct dentry *dentry);
216int security_syslog(int type); 219int security_syslog(int type);
@@ -366,8 +369,10 @@ int security_sem_semctl(struct kern_ipc_perm *sma, int cmd);
366int security_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops, 369int security_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops,
367 unsigned nsops, int alter); 370 unsigned nsops, int alter);
368void security_d_instantiate(struct dentry *dentry, struct inode *inode); 371void security_d_instantiate(struct dentry *dentry, struct inode *inode);
369int security_getprocattr(struct task_struct *p, char *name, char **value); 372int security_getprocattr(struct task_struct *p, const char *lsm, char *name,
370int security_setprocattr(const char *name, void *value, size_t size); 373 char **value);
374int security_setprocattr(const char *lsm, const char *name, void *value,
375 size_t size);
371int security_netlink_send(struct sock *sk, struct sk_buff *skb); 376int security_netlink_send(struct sock *sk, struct sk_buff *skb);
372int security_ismaclabel(const char *name); 377int security_ismaclabel(const char *name);
373int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen); 378int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen);
@@ -462,14 +467,11 @@ static inline int security_capset(struct cred *new,
462} 467}
463 468
464static inline int security_capable(const struct cred *cred, 469static inline int security_capable(const struct cred *cred,
465 struct user_namespace *ns, int cap) 470 struct user_namespace *ns,
471 int cap,
472 unsigned int opts)
466{ 473{
467 return cap_capable(cred, ns, cap, SECURITY_CAP_AUDIT); 474 return cap_capable(cred, ns, cap, opts);
468}
469
470static inline int security_capable_noaudit(const struct cred *cred,
471 struct user_namespace *ns, int cap) {
472 return cap_capable(cred, ns, cap, SECURITY_CAP_NOAUDIT);
473} 475}
474 476
475static inline int security_quotactl(int cmds, int type, int id, 477static inline int security_quotactl(int cmds, int type, int id,
@@ -1112,15 +1114,18 @@ static inline int security_sem_semop(struct kern_ipc_perm *sma,
1112 return 0; 1114 return 0;
1113} 1115}
1114 1116
1115static inline void security_d_instantiate(struct dentry *dentry, struct inode *inode) 1117static inline void security_d_instantiate(struct dentry *dentry,
1118 struct inode *inode)
1116{ } 1119{ }
1117 1120
1118static inline int security_getprocattr(struct task_struct *p, char *name, char **value) 1121static inline int security_getprocattr(struct task_struct *p, const char *lsm,
1122 char *name, char **value)
1119{ 1123{
1120 return -EINVAL; 1124 return -EINVAL;
1121} 1125}
1122 1126
1123static inline int security_setprocattr(char *name, void *value, size_t size) 1127static inline int security_setprocattr(const char *lsm, char *name,
1128 void *value, size_t size)
1124{ 1129{
1125 return -EINVAL; 1130 return -EINVAL;
1126} 1131}
@@ -1674,8 +1679,7 @@ static inline int security_key_getsecurity(struct key *key, char **_buffer)
1674#ifdef CONFIG_SECURITY 1679#ifdef CONFIG_SECURITY
1675int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule); 1680int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule);
1676int security_audit_rule_known(struct audit_krule *krule); 1681int security_audit_rule_known(struct audit_krule *krule);
1677int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule, 1682int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule);
1678 struct audit_context *actx);
1679void security_audit_rule_free(void *lsmrule); 1683void security_audit_rule_free(void *lsmrule);
1680 1684
1681#else 1685#else
@@ -1692,7 +1696,7 @@ static inline int security_audit_rule_known(struct audit_krule *krule)
1692} 1696}
1693 1697
1694static inline int security_audit_rule_match(u32 secid, u32 field, u32 op, 1698static inline int security_audit_rule_match(u32 secid, u32 field, u32 op,
1695 void *lsmrule, struct audit_context *actx) 1699 void *lsmrule)
1696{ 1700{
1697 return 0; 1701 return 0;
1698} 1702}
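
Replacing the SECURITY_CAP_AUDIT/SECURITY_CAP_NOAUDIT int pair with a flags word is what lets CAP_OPT_INSETID ride along without yet another parameter, and it folds security_capable_noaudit() into security_capable(). Call sites translate mechanically; a sketch of the before/after, where cap_capable_demo() is a stand-in rather than the kernel function:

#include <stdio.h>

#define BIT(n)          (1UL << (n))
#define CAP_OPT_NONE    0x0
#define CAP_OPT_NOAUDIT BIT(1)
#define CAP_OPT_INSETID BIT(2)

/* Stand-in for cap_capable(); only the opts handling is sketched. */
static int cap_capable_demo(int cap, unsigned int opts)
{
    if (!(opts & CAP_OPT_NOAUDIT))
        printf("audit: checking cap %d%s\n", cap,
               (opts & CAP_OPT_INSETID) ? " (setid path)" : "");
    return 0; /* grant */
}

int main(void)
{
    /* old: security_capable(cred, ns, cap)         */
    cap_capable_demo(21, CAP_OPT_NONE);
    /* old: security_capable_noaudit(cred, ns, cap) */
    cap_capable_demo(21, CAP_OPT_NOAUDIT);
    /* new caller from the setid paths              */
    cap_capable_demo(7, CAP_OPT_INSETID);
    return 0;
}
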
diff --git a/include/linux/selinux.h b/include/linux/selinux.h
deleted file mode 100644
index 44f459612690..000000000000
--- a/include/linux/selinux.h
+++ /dev/null
@@ -1,35 +0,0 @@
1/*
2 * SELinux services exported to the rest of the kernel.
3 *
4 * Author: James Morris <jmorris@redhat.com>
5 *
6 * Copyright (C) 2005 Red Hat, Inc., James Morris <jmorris@redhat.com>
7 * Copyright (C) 2006 Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com>
8 * Copyright (C) 2006 IBM Corporation, Timothy R. Chavez <tinytim@us.ibm.com>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2,
12 * as published by the Free Software Foundation.
13 */
14#ifndef _LINUX_SELINUX_H
15#define _LINUX_SELINUX_H
16
17struct selinux_audit_rule;
18struct audit_context;
19struct kern_ipc_perm;
20
21#ifdef CONFIG_SECURITY_SELINUX
22
23/**
24 * selinux_is_enabled - is SELinux enabled?
25 */
26bool selinux_is_enabled(void);
27#else
28
29static inline bool selinux_is_enabled(void)
30{
31 return false;
32}
33#endif /* CONFIG_SECURITY_SELINUX */
34
35#endif /* _LINUX_SELINUX_H */
diff --git a/include/linux/statfs.h b/include/linux/statfs.h
index 3142e98546ac..9bc69edb8f18 100644
--- a/include/linux/statfs.h
+++ b/include/linux/statfs.h
@@ -41,4 +41,7 @@ struct kstatfs {
41#define ST_NODIRATIME 0x0800 /* do not update directory access times */ 41#define ST_NODIRATIME 0x0800 /* do not update directory access times */
42#define ST_RELATIME 0x1000 /* update atime relative to mtime/ctime */ 42#define ST_RELATIME 0x1000 /* update atime relative to mtime/ctime */
43 43
44struct dentry;
45extern int vfs_get_fsid(struct dentry *dentry, __kernel_fsid_t *fsid);
46
44#endif 47#endif
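
vfs_get_fsid() stays kernel-internal (fanotify uses it to stamp marks and FAN_REPORT_FID records with a filesystem id), but the same identifier is what statfs(2) reports in f_fsid, so it can be inspected from userspace:

#include <stdio.h>
#include <sys/vfs.h>

int main(int argc, char **argv)
{
    struct statfs buf;
    const char *path = argc > 1 ? argv[1] : "/";

    if (statfs(path, &buf) != 0) {
        perror("statfs");
        return 1;
    }
    /* glibc exposes __kernel_fsid_t's two ints as __val[] */
    printf("%s fsid=%x.%x\n", path,
           buf.f_fsid.__val[0], buf.f_fsid.__val[1]);
    return 0;
}
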
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 7c007ed7505f..54254388899e 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -60,9 +60,6 @@ extern void swiotlb_tbl_sync_single(struct device *hwdev,
60 size_t size, enum dma_data_direction dir, 60 size_t size, enum dma_data_direction dir,
61 enum dma_sync_target target); 61 enum dma_sync_target target);
62 62
63extern int
64swiotlb_dma_supported(struct device *hwdev, u64 mask);
65
66#ifdef CONFIG_SWIOTLB 63#ifdef CONFIG_SWIOTLB
67extern enum swiotlb_force swiotlb_force; 64extern enum swiotlb_force swiotlb_force;
68extern phys_addr_t io_tlb_start, io_tlb_end; 65extern phys_addr_t io_tlb_start, io_tlb_end;
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 2887503e4d12..ab1cc33adbac 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -1051,6 +1051,7 @@ TRACE_EVENT(btrfs_trigger_flush,
1051 { FLUSH_DELAYED_REFS_NR, "FLUSH_DELAYED_REFS_NR"}, \ 1051 { FLUSH_DELAYED_REFS_NR, "FLUSH_DELAYED_REFS_NR"}, \
1052 { FLUSH_DELAYED_REFS, "FLUSH_ELAYED_REFS"}, \ 1052 { FLUSH_DELAYED_REFS, "FLUSH_ELAYED_REFS"}, \
1053 { ALLOC_CHUNK, "ALLOC_CHUNK"}, \ 1053 { ALLOC_CHUNK, "ALLOC_CHUNK"}, \
1054 { ALLOC_CHUNK_FORCE, "ALLOC_CHUNK_FORCE"}, \
1054 { COMMIT_TRANS, "COMMIT_TRANS"}) 1055 { COMMIT_TRANS, "COMMIT_TRANS"})
1055 1056
1056TRACE_EVENT(btrfs_flush_space, 1057TRACE_EVENT(btrfs_flush_space,
@@ -1512,35 +1513,6 @@ DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_release_data,
1512 TP_ARGS(inode, start, len, reserved, op) 1513 TP_ARGS(inode, start, len, reserved, op)
1513); 1514);
1514 1515
1515DECLARE_EVENT_CLASS(btrfs__qgroup_delayed_ref,
1516
1517 TP_PROTO(const struct btrfs_fs_info *fs_info,
1518 u64 ref_root, u64 reserved),
1519
1520 TP_ARGS(fs_info, ref_root, reserved),
1521
1522 TP_STRUCT__entry_btrfs(
1523 __field( u64, ref_root )
1524 __field( u64, reserved )
1525 ),
1526
1527 TP_fast_assign_btrfs(fs_info,
1528 __entry->ref_root = ref_root;
1529 __entry->reserved = reserved;
1530 ),
1531
1532 TP_printk_btrfs("root=%llu reserved=%llu op=free",
1533 __entry->ref_root, __entry->reserved)
1534);
1535
1536DEFINE_EVENT(btrfs__qgroup_delayed_ref, btrfs_qgroup_free_delayed_ref,
1537
1538 TP_PROTO(const struct btrfs_fs_info *fs_info,
1539 u64 ref_root, u64 reserved),
1540
1541 TP_ARGS(fs_info, ref_root, reserved)
1542);
1543
1544DECLARE_EVENT_CLASS(btrfs_qgroup_extent, 1516DECLARE_EVENT_CLASS(btrfs_qgroup_extent,
1545 TP_PROTO(const struct btrfs_fs_info *fs_info, 1517 TP_PROTO(const struct btrfs_fs_info *fs_info,
1546 const struct btrfs_qgroup_extent_record *rec), 1518 const struct btrfs_qgroup_extent_record *rec),
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index e0763bc4158e..c195896d478f 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -837,6 +837,8 @@ enum btrfs_err_code {
837 struct btrfs_ioctl_vol_args) 837 struct btrfs_ioctl_vol_args)
838#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \ 838#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
839 struct btrfs_ioctl_vol_args) 839 struct btrfs_ioctl_vol_args)
840#define BTRFS_IOC_FORGET_DEV _IOW(BTRFS_IOCTL_MAGIC, 5, \
841 struct btrfs_ioctl_vol_args)
840/* trans start and trans end are dangerous, and only for 842/* trans start and trans end are dangerous, and only for
841 * use by applications that know how to avoid the 843 * use by applications that know how to avoid the
842 * resulting deadlocks 844 * resulting deadlocks
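
BTRFS_IOC_FORGET_DEV is the inverse of SCAN_DEV: issued against /dev/btrfs-control, it drops unmounted devices from the kernel's scanned-device cache. A hedged sketch — it assumes headers that carry this patch, root privileges, and that an empty name means "forget all unmounted devices":

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/btrfs.h>   /* needs uapi headers with this patch */

int main(int argc, char **argv)
{
    struct btrfs_ioctl_vol_args args;
    int fd = open("/dev/btrfs-control", O_RDWR);

    if (fd < 0) {
        perror("open /dev/btrfs-control");
        return 1;
    }
    memset(&args, 0, sizeof(args));
    if (argc > 1) /* name a device, or leave empty to forget all unmounted */
        strncpy(args.name, argv[1], sizeof(args.name) - 1);
    if (ioctl(fd, BTRFS_IOC_FORGET_DEV, &args) != 0)
        perror("BTRFS_IOC_FORGET_DEV");
    close(fd);
    return 0;
}
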
diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
index 909c98fcace2..b9effa6f8503 100644
--- a/include/uapi/linux/fanotify.h
+++ b/include/uapi/linux/fanotify.h
@@ -7,9 +7,16 @@
7/* the following events that user-space can register for */ 7/* the following events that user-space can register for */
8#define FAN_ACCESS 0x00000001 /* File was accessed */ 8#define FAN_ACCESS 0x00000001 /* File was accessed */
9#define FAN_MODIFY 0x00000002 /* File was modified */ 9#define FAN_MODIFY 0x00000002 /* File was modified */
10#define FAN_ATTRIB 0x00000004 /* Metadata changed */
10#define FAN_CLOSE_WRITE 0x00000008 /* Writtable file closed */ 11#define FAN_CLOSE_WRITE 0x00000008 /* Writtable file closed */
11#define FAN_CLOSE_NOWRITE 0x00000010 /* Unwrittable file closed */ 12#define FAN_CLOSE_NOWRITE 0x00000010 /* Unwrittable file closed */
12#define FAN_OPEN 0x00000020 /* File was opened */ 13#define FAN_OPEN 0x00000020 /* File was opened */
14#define FAN_MOVED_FROM 0x00000040 /* File was moved from X */
15#define FAN_MOVED_TO 0x00000080 /* File was moved to Y */
16#define FAN_CREATE 0x00000100 /* Subfile was created */
17#define FAN_DELETE 0x00000200 /* Subfile was deleted */
18#define FAN_DELETE_SELF 0x00000400 /* Self was deleted */
19#define FAN_MOVE_SELF 0x00000800 /* Self was moved */
13#define FAN_OPEN_EXEC 0x00001000 /* File was opened for exec */ 20#define FAN_OPEN_EXEC 0x00001000 /* File was opened for exec */
14 21
15#define FAN_Q_OVERFLOW 0x00004000 /* Event queued overflowed */ 22#define FAN_Q_OVERFLOW 0x00004000 /* Event queued overflowed */
@@ -24,6 +31,7 @@
24 31
25/* helper events */ 32/* helper events */
26#define FAN_CLOSE (FAN_CLOSE_WRITE | FAN_CLOSE_NOWRITE) /* close */ 33#define FAN_CLOSE (FAN_CLOSE_WRITE | FAN_CLOSE_NOWRITE) /* close */
34#define FAN_MOVE (FAN_MOVED_FROM | FAN_MOVED_TO) /* moves */
27 35
28/* flags used for fanotify_init() */ 36/* flags used for fanotify_init() */
29#define FAN_CLOEXEC 0x00000001 37#define FAN_CLOEXEC 0x00000001
@@ -44,6 +52,7 @@
44 52
45/* Flags to determine fanotify event format */ 53/* Flags to determine fanotify event format */
46#define FAN_REPORT_TID 0x00000100 /* event->pid is thread id */ 54#define FAN_REPORT_TID 0x00000100 /* event->pid is thread id */
55#define FAN_REPORT_FID 0x00000200 /* Report unique file id */
47 56
48/* Deprecated - do not use this in programs and do not add new flags here! */ 57/* Deprecated - do not use this in programs and do not add new flags here! */
49#define FAN_ALL_INIT_FLAGS (FAN_CLOEXEC | FAN_NONBLOCK | \ 58#define FAN_ALL_INIT_FLAGS (FAN_CLOEXEC | FAN_NONBLOCK | \
@@ -106,6 +115,26 @@ struct fanotify_event_metadata {
106 __s32 pid; 115 __s32 pid;
107}; 116};
108 117
118#define FAN_EVENT_INFO_TYPE_FID 1
119
120/* Variable length info record following event metadata */
121struct fanotify_event_info_header {
122 __u8 info_type;
123 __u8 pad;
124 __u16 len;
125};
126
127/* Unique file identifier info record */
128struct fanotify_event_info_fid {
129 struct fanotify_event_info_header hdr;
130 __kernel_fsid_t fsid;
131 /*
132 * Following is an opaque struct file_handle that can be passed as
133 * an argument to open_by_handle_at(2).
134 */
135 unsigned char handle[0];
136};
137
109struct fanotify_response { 138struct fanotify_response {
110 __s32 fd; 139 __s32 fd;
111 __u32 response; 140 __u32 response;
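
FAN_REPORT_FID changes the read(2) payload: event->fd is reported as FAN_NOFD and each metadata record is followed by the variable-length fid record declared above, carrying the fsid plus an opaque handle usable with open_by_handle_at(2). A reading-side sketch under those assumptions — error handling trimmed, CAP_SYS_ADMIN and patched headers required:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/fanotify.h>   /* needs headers that carry this patch */

int main(void)
{
    char buf[4096];
    int fd = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_FID, 0);

    fanotify_mark(fd, FAN_MARK_ADD, FAN_CREATE | FAN_DELETE | FAN_ONDIR,
                  AT_FDCWD, "/tmp");

    ssize_t len = read(fd, buf, sizeof(buf));
    struct fanotify_event_metadata *meta = (void *)buf;

    for (; FAN_EVENT_OK(meta, len); meta = FAN_EVENT_NEXT(meta, len)) {
        /* with FAN_REPORT_FID the fid record follows the metadata */
        struct fanotify_event_info_fid *fid = (void *)(meta + 1);

        if (fid->hdr.info_type == FAN_EVENT_INFO_TYPE_FID)
            printf("mask=0x%llx fsid=%x.%x\n",
                   (unsigned long long)meta->mask,
                   fid->fsid.val[0], fid->fsid.val[1]);
    }
    close(fd);
    return 0;
}
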
diff --git a/init/init_task.c b/init/init_task.c
index df0257c5928c..c70ef656d0f4 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -122,7 +122,7 @@ struct task_struct init_task
122 .thread_pid = &init_struct_pid, 122 .thread_pid = &init_struct_pid,
123 .thread_group = LIST_HEAD_INIT(init_task.thread_group), 123 .thread_group = LIST_HEAD_INIT(init_task.thread_group),
124 .thread_node = LIST_HEAD_INIT(init_signals.thread_head), 124 .thread_node = LIST_HEAD_INIT(init_signals.thread_head),
125#ifdef CONFIG_AUDITSYSCALL 125#ifdef CONFIG_AUDIT
126 .loginuid = INVALID_UID, 126 .loginuid = INVALID_UID,
127 .sessionid = AUDIT_SID_UNSET, 127 .sessionid = AUDIT_SID_UNSET,
128#endif 128#endif
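
Together with the sched.h hunk earlier, this fixes the config matrix: loginuid/sessionid now exist for any CONFIG_AUDIT=y build, while audit_context stays CONFIG_AUDITSYSCALL-only. The nesting, reduced to a standalone shape with demo macros:

#include <stdio.h>

#define CONFIG_AUDIT 1
/* #define CONFIG_AUDITSYSCALL 1 */

struct task_demo {
#ifdef CONFIG_AUDIT
# ifdef CONFIG_AUDITSYSCALL
    void *audit_context;   /* per-syscall state only */
# endif
    unsigned int loginuid;  /* stand-ins for kuid_t / sessionid */
    unsigned int sessionid;
#endif
};

int main(void)
{
    printf("sizeof(struct task_demo)=%zu\n", sizeof(struct task_demo));
    return 0;
}
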
diff --git a/kernel/audit.c b/kernel/audit.c
index 632d36059556..c89ea48c70a6 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -396,10 +396,10 @@ static int audit_log_config_change(char *function_name, u32 new, u32 old,
396 struct audit_buffer *ab; 396 struct audit_buffer *ab;
397 int rc = 0; 397 int rc = 0;
398 398
399 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 399 ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE);
400 if (unlikely(!ab)) 400 if (unlikely(!ab))
401 return rc; 401 return rc;
402 audit_log_format(ab, "%s=%u old=%u ", function_name, new, old); 402 audit_log_format(ab, "op=set %s=%u old=%u ", function_name, new, old);
403 audit_log_session_info(ab); 403 audit_log_session_info(ab);
404 rc = audit_log_task_context(ab); 404 rc = audit_log_task_context(ab);
405 if (rc) 405 if (rc)
@@ -1053,7 +1053,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
1053 return err; 1053 return err;
1054} 1054}
1055 1055
1056static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) 1056static void audit_log_common_recv_msg(struct audit_context *context,
1057 struct audit_buffer **ab, u16 msg_type)
1057{ 1058{
1058 uid_t uid = from_kuid(&init_user_ns, current_uid()); 1059 uid_t uid = from_kuid(&init_user_ns, current_uid());
1059 pid_t pid = task_tgid_nr(current); 1060 pid_t pid = task_tgid_nr(current);
@@ -1063,7 +1064,7 @@ static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
1063 return; 1064 return;
1064 } 1065 }
1065 1066
1066 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); 1067 *ab = audit_log_start(context, GFP_KERNEL, msg_type);
1067 if (unlikely(!*ab)) 1068 if (unlikely(!*ab))
1068 return; 1069 return;
1069 audit_log_format(*ab, "pid=%d uid=%u ", pid, uid); 1070 audit_log_format(*ab, "pid=%d uid=%u ", pid, uid);
@@ -1071,6 +1072,12 @@ static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
1071 audit_log_task_context(*ab); 1072 audit_log_task_context(*ab);
1072} 1073}
1073 1074
1075static inline void audit_log_user_recv_msg(struct audit_buffer **ab,
1076 u16 msg_type)
1077{
1078 audit_log_common_recv_msg(NULL, ab, msg_type);
1079}
1080
1074int is_audit_feature_set(int i) 1081int is_audit_feature_set(int i)
1075{ 1082{
1076 return af.features & AUDIT_FEATURE_TO_MASK(i); 1083 return af.features & AUDIT_FEATURE_TO_MASK(i);
@@ -1338,7 +1345,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
1338 if (err) 1345 if (err)
1339 break; 1346 break;
1340 } 1347 }
1341 audit_log_common_recv_msg(&ab, msg_type); 1348 audit_log_user_recv_msg(&ab, msg_type);
1342 if (msg_type != AUDIT_USER_TTY) 1349 if (msg_type != AUDIT_USER_TTY)
1343 audit_log_format(ab, " msg='%.*s'", 1350 audit_log_format(ab, " msg='%.*s'",
1344 AUDIT_MESSAGE_TEXT_MAX, 1351 AUDIT_MESSAGE_TEXT_MAX,
@@ -1361,8 +1368,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
1361 if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) 1368 if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
1362 return -EINVAL; 1369 return -EINVAL;
1363 if (audit_enabled == AUDIT_LOCKED) { 1370 if (audit_enabled == AUDIT_LOCKED) {
1364 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); 1371 audit_log_common_recv_msg(audit_context(), &ab,
1365 audit_log_format(ab, " audit_enabled=%d res=0", audit_enabled); 1372 AUDIT_CONFIG_CHANGE);
1373 audit_log_format(ab, " op=%s audit_enabled=%d res=0",
1374 msg_type == AUDIT_ADD_RULE ?
1375 "add_rule" : "remove_rule",
1376 audit_enabled);
1366 audit_log_end(ab); 1377 audit_log_end(ab);
1367 return -EPERM; 1378 return -EPERM;
1368 } 1379 }
@@ -1373,7 +1384,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
1373 break; 1384 break;
1374 case AUDIT_TRIM: 1385 case AUDIT_TRIM:
1375 audit_trim_trees(); 1386 audit_trim_trees();
1376 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); 1387 audit_log_common_recv_msg(audit_context(), &ab,
1388 AUDIT_CONFIG_CHANGE);
1377 audit_log_format(ab, " op=trim res=1"); 1389 audit_log_format(ab, " op=trim res=1");
1378 audit_log_end(ab); 1390 audit_log_end(ab);
1379 break; 1391 break;
@@ -1403,8 +1415,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
1403 /* OK, here comes... */ 1415 /* OK, here comes... */
1404 err = audit_tag_tree(old, new); 1416 err = audit_tag_tree(old, new);
1405 1417
1406 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); 1418 audit_log_common_recv_msg(audit_context(), &ab,
1407 1419 AUDIT_CONFIG_CHANGE);
1408 audit_log_format(ab, " op=make_equiv old="); 1420 audit_log_format(ab, " op=make_equiv old=");
1409 audit_log_untrustedstring(ab, old); 1421 audit_log_untrustedstring(ab, old);
1410 audit_log_format(ab, " new="); 1422 audit_log_format(ab, " new=");
@@ -1471,7 +1483,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
1471 old.enabled = t & AUDIT_TTY_ENABLE; 1483 old.enabled = t & AUDIT_TTY_ENABLE;
1472 old.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD); 1484 old.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD);
1473 1485
1474 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); 1486 audit_log_common_recv_msg(audit_context(), &ab,
1487 AUDIT_CONFIG_CHANGE);
1475 audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d" 1488 audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d"
1476 " old-log_passwd=%d new-log_passwd=%d res=%d", 1489 " old-log_passwd=%d new-log_passwd=%d res=%d",
1477 old.enabled, s.enabled, old.log_passwd, 1490 old.enabled, s.enabled, old.log_passwd,
@@ -2054,153 +2067,6 @@ void audit_log_key(struct audit_buffer *ab, char *key)
2054 audit_log_format(ab, "(null)"); 2067 audit_log_format(ab, "(null)");
2055} 2068}
2056 2069
2057void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
2058{
2059 int i;
2060
2061 if (cap_isclear(*cap)) {
2062 audit_log_format(ab, " %s=0", prefix);
2063 return;
2064 }
2065 audit_log_format(ab, " %s=", prefix);
2066 CAP_FOR_EACH_U32(i)
2067 audit_log_format(ab, "%08x", cap->cap[CAP_LAST_U32 - i]);
2068}
2069
2070static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
2071{
2072 audit_log_cap(ab, "cap_fp", &name->fcap.permitted);
2073 audit_log_cap(ab, "cap_fi", &name->fcap.inheritable);
2074 audit_log_format(ab, " cap_fe=%d cap_fver=%x",
2075 name->fcap.fE, name->fcap_ver);
2076}
2077
2078static inline int audit_copy_fcaps(struct audit_names *name,
2079 const struct dentry *dentry)
2080{
2081 struct cpu_vfs_cap_data caps;
2082 int rc;
2083
2084 if (!dentry)
2085 return 0;
2086
2087 rc = get_vfs_caps_from_disk(dentry, &caps);
2088 if (rc)
2089 return rc;
2090
2091 name->fcap.permitted = caps.permitted;
2092 name->fcap.inheritable = caps.inheritable;
2093 name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
2094 name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >>
2095 VFS_CAP_REVISION_SHIFT;
2096
2097 return 0;
2098}
2099
2100/* Copy inode data into an audit_names. */
2101void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
2102 struct inode *inode)
2103{
2104 name->ino = inode->i_ino;
2105 name->dev = inode->i_sb->s_dev;
2106 name->mode = inode->i_mode;
2107 name->uid = inode->i_uid;
2108 name->gid = inode->i_gid;
2109 name->rdev = inode->i_rdev;
2110 security_inode_getsecid(inode, &name->osid);
2111 audit_copy_fcaps(name, dentry);
2112}
2113
2114/**
2115 * audit_log_name - produce AUDIT_PATH record from struct audit_names
2116 * @context: audit_context for the task
2117 * @n: audit_names structure with reportable details
2118 * @path: optional path to report instead of audit_names->name
2119 * @record_num: record number to report when handling a list of names
2120 * @call_panic: optional pointer to int that will be updated if secid fails
2121 */
2122void audit_log_name(struct audit_context *context, struct audit_names *n,
2123 const struct path *path, int record_num, int *call_panic)
2124{
2125 struct audit_buffer *ab;
2126 ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
2127 if (!ab)
2128 return;
2129
2130 audit_log_format(ab, "item=%d", record_num);
2131
2132 if (path)
2133 audit_log_d_path(ab, " name=", path);
2134 else if (n->name) {
2135 switch (n->name_len) {
2136 case AUDIT_NAME_FULL:
2137 /* log the full path */
2138 audit_log_format(ab, " name=");
2139 audit_log_untrustedstring(ab, n->name->name);
2140 break;
2141 case 0:
2142 /* name was specified as a relative path and the
2143 * directory component is the cwd */
2144 audit_log_d_path(ab, " name=", &context->pwd);
2145 break;
2146 default:
2147 /* log the name's directory component */
2148 audit_log_format(ab, " name=");
2149 audit_log_n_untrustedstring(ab, n->name->name,
2150 n->name_len);
2151 }
2152 } else
2153 audit_log_format(ab, " name=(null)");
2154
2155 if (n->ino != AUDIT_INO_UNSET)
2156 audit_log_format(ab, " inode=%lu"
2157 " dev=%02x:%02x mode=%#ho"
2158 " ouid=%u ogid=%u rdev=%02x:%02x",
2159 n->ino,
2160 MAJOR(n->dev),
2161 MINOR(n->dev),
2162 n->mode,
2163 from_kuid(&init_user_ns, n->uid),
2164 from_kgid(&init_user_ns, n->gid),
2165 MAJOR(n->rdev),
2166 MINOR(n->rdev));
2167 if (n->osid != 0) {
2168 char *ctx = NULL;
2169 u32 len;
2170 if (security_secid_to_secctx(
2171 n->osid, &ctx, &len)) {
2172 audit_log_format(ab, " osid=%u", n->osid);
2173 if (call_panic)
2174 *call_panic = 2;
2175 } else {
2176 audit_log_format(ab, " obj=%s", ctx);
2177 security_release_secctx(ctx, len);
2178 }
2179 }
2180
2181 /* log the audit_names record type */
2182 switch(n->type) {
2183 case AUDIT_TYPE_NORMAL:
2184 audit_log_format(ab, " nametype=NORMAL");
2185 break;
2186 case AUDIT_TYPE_PARENT:
2187 audit_log_format(ab, " nametype=PARENT");
2188 break;
2189 case AUDIT_TYPE_CHILD_DELETE:
2190 audit_log_format(ab, " nametype=DELETE");
2191 break;
2192 case AUDIT_TYPE_CHILD_CREATE:
2193 audit_log_format(ab, " nametype=CREATE");
2194 break;
2195 default:
2196 audit_log_format(ab, " nametype=UNKNOWN");
2197 break;
2198 }
2199
2200 audit_log_fcaps(ab, n);
2201 audit_log_end(ab);
2202}
2203
2204int audit_log_task_context(struct audit_buffer *ab) 2070int audit_log_task_context(struct audit_buffer *ab)
2205{ 2071{
2206 char *ctx = NULL; 2072 char *ctx = NULL;
@@ -2322,6 +2188,91 @@ void audit_log_link_denied(const char *operation)
2322 audit_log_end(ab); 2188 audit_log_end(ab);
2323} 2189}
2324 2190
2191/* global counter which is incremented every time something logs in */
2192static atomic_t session_id = ATOMIC_INIT(0);
2193
2194static int audit_set_loginuid_perm(kuid_t loginuid)
2195{
2196 /* if we are unset, we don't need privs */
2197 if (!audit_loginuid_set(current))
2198 return 0;
2199 /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/
2200 if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE))
2201 return -EPERM;
2202 /* it is set, you need permission */
2203 if (!capable(CAP_AUDIT_CONTROL))
2204 return -EPERM;
2205 /* reject if this is not an unset and we don't allow that */
2206 if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID)
2207 && uid_valid(loginuid))
2208 return -EPERM;
2209 return 0;
2210}
2211
2212static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
2213 unsigned int oldsessionid,
2214 unsigned int sessionid, int rc)
2215{
2216 struct audit_buffer *ab;
2217 uid_t uid, oldloginuid, loginuid;
2218 struct tty_struct *tty;
2219
2220 if (!audit_enabled)
2221 return;
2222
2223 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
2224 if (!ab)
2225 return;
2226
2227 uid = from_kuid(&init_user_ns, task_uid(current));
2228 oldloginuid = from_kuid(&init_user_ns, koldloginuid);
2229 loginuid = from_kuid(&init_user_ns, kloginuid),
2230 tty = audit_get_tty();
2231
2232 audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid);
2233 audit_log_task_context(ab);
2234 audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d",
2235 oldloginuid, loginuid, tty ? tty_name(tty) : "(none)",
2236 oldsessionid, sessionid, !rc);
2237 audit_put_tty(tty);
2238 audit_log_end(ab);
2239}
2240
2241/**
2242 * audit_set_loginuid - set current task's loginuid
2243 * @loginuid: loginuid value
2244 *
2245 * Returns 0.
2246 *
2247 * Called (set) from fs/proc/base.c::proc_loginuid_write().
2248 */
2249int audit_set_loginuid(kuid_t loginuid)
2250{
2251 unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET;
2252 kuid_t oldloginuid;
2253 int rc;
2254
2255 oldloginuid = audit_get_loginuid(current);
2256 oldsessionid = audit_get_sessionid(current);
2257
2258 rc = audit_set_loginuid_perm(loginuid);
2259 if (rc)
2260 goto out;
2261
2262 /* are we setting or clearing? */
2263 if (uid_valid(loginuid)) {
2264 sessionid = (unsigned int)atomic_inc_return(&session_id);
2265 if (unlikely(sessionid == AUDIT_SID_UNSET))
2266 sessionid = (unsigned int)atomic_inc_return(&session_id);
2267 }
2268
2269 current->sessionid = sessionid;
2270 current->loginuid = loginuid;
2271out:
2272 audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc);
2273 return rc;
2274}
2275
2325/** 2276/**
2326 * audit_log_end - end one audit record 2277 * audit_log_end - end one audit record
2327 * @ab: the audit_buffer 2278 * @ab: the audit_buffer
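
audit_set_loginuid() moves here from auditsc.c so that login-uid handling works with CONFIG_AUDIT=y even when syscall auditing is compiled out. Note the doubled increment: if the session counter ever wraps onto AUDIT_SID_UNSET, it is bumped once more so the sentinel is never handed out as a live id. The guard in isolation, using C11 atomics in place of the kernel's atomic_t:

#include <stdatomic.h>
#include <stdio.h>

#define AUDIT_SID_UNSET ((unsigned int)-1)

static atomic_uint session_id;

static unsigned int next_session_id(void)
{
    unsigned int sid = atomic_fetch_add(&session_id, 1) + 1;

    if (sid == AUDIT_SID_UNSET) /* wrapped onto the sentinel: skip it */
        sid = atomic_fetch_add(&session_id, 1) + 1;
    return sid;
}

int main(void)
{
    unsigned int a, b;

    atomic_store(&session_id, AUDIT_SID_UNSET - 1); /* force the wrap */
    a = next_session_id();
    b = next_session_id();
    printf("sid=%u then %u\n", a, b); /* the sentinel is skipped */
    return 0;
}
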
diff --git a/kernel/audit.h b/kernel/audit.h
index 91421679a168..958d5b8fc1b3 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -69,6 +69,7 @@ struct audit_cap_data {
69 kernel_cap_t effective; /* effective set of process */ 69 kernel_cap_t effective; /* effective set of process */
70 }; 70 };
71 kernel_cap_t ambient; 71 kernel_cap_t ambient;
72 kuid_t rootid;
72}; 73};
73 74
74/* When fs/namei.c:getname() is called, we store the pointer in name and bump 75/* When fs/namei.c:getname() is called, we store the pointer in name and bump
@@ -212,15 +213,6 @@ extern bool audit_ever_enabled;
212 213
213extern void audit_log_session_info(struct audit_buffer *ab); 214extern void audit_log_session_info(struct audit_buffer *ab);
214 215
215extern void audit_copy_inode(struct audit_names *name,
216 const struct dentry *dentry,
217 struct inode *inode);
218extern void audit_log_cap(struct audit_buffer *ab, char *prefix,
219 kernel_cap_t *cap);
220extern void audit_log_name(struct audit_context *context,
221 struct audit_names *n, const struct path *path,
222 int record_num, int *call_panic);
223
224extern int auditd_test_task(struct task_struct *task); 216extern int auditd_test_task(struct task_struct *task);
225 217
226#define AUDIT_INODE_BUCKETS 32 218#define AUDIT_INODE_BUCKETS 32
@@ -267,25 +259,52 @@ extern void audit_log_d_path_exe(struct audit_buffer *ab,
267extern struct tty_struct *audit_get_tty(void); 259extern struct tty_struct *audit_get_tty(void);
268extern void audit_put_tty(struct tty_struct *tty); 260extern void audit_put_tty(struct tty_struct *tty);
269 261
270/* audit watch functions */ 262/* audit watch/mark/tree functions */
271#ifdef CONFIG_AUDITSYSCALL 263#ifdef CONFIG_AUDITSYSCALL
264extern unsigned int audit_serial(void);
265extern int auditsc_get_stamp(struct audit_context *ctx,
266 struct timespec64 *t, unsigned int *serial);
267
272extern void audit_put_watch(struct audit_watch *watch); 268extern void audit_put_watch(struct audit_watch *watch);
273extern void audit_get_watch(struct audit_watch *watch); 269extern void audit_get_watch(struct audit_watch *watch);
274extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op); 270extern int audit_to_watch(struct audit_krule *krule, char *path, int len,
271 u32 op);
275extern int audit_add_watch(struct audit_krule *krule, struct list_head **list); 272extern int audit_add_watch(struct audit_krule *krule, struct list_head **list);
276extern void audit_remove_watch_rule(struct audit_krule *krule); 273extern void audit_remove_watch_rule(struct audit_krule *krule);
277extern char *audit_watch_path(struct audit_watch *watch); 274extern char *audit_watch_path(struct audit_watch *watch);
278extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev); 275extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino,
276 dev_t dev);
279 277
280extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len); 278extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule,
279 char *pathname, int len);
281extern char *audit_mark_path(struct audit_fsnotify_mark *mark); 280extern char *audit_mark_path(struct audit_fsnotify_mark *mark);
282extern void audit_remove_mark(struct audit_fsnotify_mark *audit_mark); 281extern void audit_remove_mark(struct audit_fsnotify_mark *audit_mark);
283extern void audit_remove_mark_rule(struct audit_krule *krule); 282extern void audit_remove_mark_rule(struct audit_krule *krule);
284extern int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev); 283extern int audit_mark_compare(struct audit_fsnotify_mark *mark,
284 unsigned long ino, dev_t dev);
285extern int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old); 285extern int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old);
286extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark); 286extern int audit_exe_compare(struct task_struct *tsk,
287 struct audit_fsnotify_mark *mark);
288
289extern struct audit_chunk *audit_tree_lookup(const struct inode *inode);
290extern void audit_put_chunk(struct audit_chunk *chunk);
291extern bool audit_tree_match(struct audit_chunk *chunk,
292 struct audit_tree *tree);
293extern int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op);
294extern int audit_add_tree_rule(struct audit_krule *rule);
295extern int audit_remove_tree_rule(struct audit_krule *rule);
296extern void audit_trim_trees(void);
297extern int audit_tag_tree(char *old, char *new);
298extern const char *audit_tree_path(struct audit_tree *tree);
299extern void audit_put_tree(struct audit_tree *tree);
300extern void audit_kill_trees(struct audit_context *context);
287 301
288#else 302extern int audit_signal_info(int sig, struct task_struct *t);
303extern void audit_filter_inodes(struct task_struct *tsk,
304 struct audit_context *ctx);
305extern struct list_head *audit_killed_trees(void);
306#else /* CONFIG_AUDITSYSCALL */
307#define auditsc_get_stamp(c, t, s) 0
289#define audit_put_watch(w) {} 308#define audit_put_watch(w) {}
290#define audit_get_watch(w) {} 309#define audit_get_watch(w) {}
291#define audit_to_watch(k, p, l, o) (-EINVAL) 310#define audit_to_watch(k, p, l, o) (-EINVAL)
@@ -301,21 +320,7 @@ extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark
301#define audit_mark_compare(m, i, d) 0 320#define audit_mark_compare(m, i, d) 0
302#define audit_exe_compare(t, m) (-EINVAL) 321#define audit_exe_compare(t, m) (-EINVAL)
303#define audit_dupe_exe(n, o) (-EINVAL) 322#define audit_dupe_exe(n, o) (-EINVAL)
304#endif /* CONFIG_AUDITSYSCALL */
305 323
306#ifdef CONFIG_AUDITSYSCALL
307extern struct audit_chunk *audit_tree_lookup(const struct inode *inode);
308extern void audit_put_chunk(struct audit_chunk *chunk);
309extern bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree);
310extern int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op);
311extern int audit_add_tree_rule(struct audit_krule *rule);
312extern int audit_remove_tree_rule(struct audit_krule *rule);
313extern void audit_trim_trees(void);
314extern int audit_tag_tree(char *old, char *new);
315extern const char *audit_tree_path(struct audit_tree *tree);
316extern void audit_put_tree(struct audit_tree *tree);
317extern void audit_kill_trees(struct list_head *list);
318#else
319#define audit_remove_tree_rule(rule) BUG() 324#define audit_remove_tree_rule(rule) BUG()
320#define audit_add_tree_rule(rule) -EINVAL 325#define audit_add_tree_rule(rule) -EINVAL
321#define audit_make_tree(rule, str, op) -EINVAL 326#define audit_make_tree(rule, str, op) -EINVAL
@@ -323,8 +328,11 @@ extern void audit_kill_trees(struct list_head *list);
323#define audit_put_tree(tree) (void)0 328#define audit_put_tree(tree) (void)0
324#define audit_tag_tree(old, new) -EINVAL 329#define audit_tag_tree(old, new) -EINVAL
325#define audit_tree_path(rule) "" /* never called */ 330#define audit_tree_path(rule) "" /* never called */
326#define audit_kill_trees(list) BUG() 331#define audit_kill_trees(context) BUG()
327#endif 332
333#define audit_signal_info(s, t) AUDIT_DISABLED
334#define audit_filter_inodes(t, c) AUDIT_DISABLED
335#endif /* CONFIG_AUDITSYSCALL */
328 336
329extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len); 337extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len);
330 338
@@ -334,14 +342,5 @@ extern u32 audit_sig_sid;
334 342
335extern int audit_filter(int msgtype, unsigned int listtype); 343extern int audit_filter(int msgtype, unsigned int listtype);
336 344
337#ifdef CONFIG_AUDITSYSCALL
338extern int audit_signal_info(int sig, struct task_struct *t);
339extern void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx);
340extern struct list_head *audit_killed_trees(void);
341#else
342#define audit_signal_info(s,t) AUDIT_DISABLED
343#define audit_filter_inodes(t,c) AUDIT_DISABLED
344#endif
345
346extern void audit_ctl_lock(void); 345extern void audit_ctl_lock(void);
347extern void audit_ctl_unlock(void); 346extern void audit_ctl_unlock(void);
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index cf4512a33675..37ae95cfb7f4 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -127,7 +127,7 @@ static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, c
127 127
128 if (!audit_enabled) 128 if (!audit_enabled)
129 return; 129 return;
130 ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); 130 ab = audit_log_start(audit_context(), GFP_NOFS, AUDIT_CONFIG_CHANGE);
131 if (unlikely(!ab)) 131 if (unlikely(!ab))
132 return; 132 return;
133 audit_log_session_info(ab); 133 audit_log_session_info(ab);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index d4af4d97f847..abfb112f26aa 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -524,13 +524,14 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
524 return 0; 524 return 0;
525} 525}
526 526
527static void audit_tree_log_remove_rule(struct audit_krule *rule) 527static void audit_tree_log_remove_rule(struct audit_context *context,
528 struct audit_krule *rule)
528{ 529{
529 struct audit_buffer *ab; 530 struct audit_buffer *ab;
530 531
531 if (!audit_enabled) 532 if (!audit_enabled)
532 return; 533 return;
533 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 534 ab = audit_log_start(context, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
534 if (unlikely(!ab)) 535 if (unlikely(!ab))
535 return; 536 return;
536 audit_log_format(ab, "op=remove_rule dir="); 537 audit_log_format(ab, "op=remove_rule dir=");
@@ -540,7 +541,7 @@ static void audit_tree_log_remove_rule(struct audit_krule *rule)
540 audit_log_end(ab); 541 audit_log_end(ab);
541} 542}
542 543
543static void kill_rules(struct audit_tree *tree) 544static void kill_rules(struct audit_context *context, struct audit_tree *tree)
544{ 545{
545 struct audit_krule *rule, *next; 546 struct audit_krule *rule, *next;
546 struct audit_entry *entry; 547 struct audit_entry *entry;
@@ -551,7 +552,7 @@ static void kill_rules(struct audit_tree *tree)
551 list_del_init(&rule->rlist); 552 list_del_init(&rule->rlist);
552 if (rule->tree) { 553 if (rule->tree) {
553 /* not a half-baked one */ 554 /* not a half-baked one */
554 audit_tree_log_remove_rule(rule); 555 audit_tree_log_remove_rule(context, rule);
555 if (entry->rule.exe) 556 if (entry->rule.exe)
556 audit_remove_mark(entry->rule.exe); 557 audit_remove_mark(entry->rule.exe);
557 rule->tree = NULL; 558 rule->tree = NULL;
@@ -633,7 +634,7 @@ static void trim_marked(struct audit_tree *tree)
633 tree->goner = 1; 634 tree->goner = 1;
634 spin_unlock(&hash_lock); 635 spin_unlock(&hash_lock);
635 mutex_lock(&audit_filter_mutex); 636 mutex_lock(&audit_filter_mutex);
636 kill_rules(tree); 637 kill_rules(audit_context(), tree);
637 list_del_init(&tree->list); 638 list_del_init(&tree->list);
638 mutex_unlock(&audit_filter_mutex); 639 mutex_unlock(&audit_filter_mutex);
639 prune_one(tree); 640 prune_one(tree);
@@ -973,8 +974,10 @@ static void audit_schedule_prune(void)
973 * ... and that one is done if evict_chunk() decides to delay until the end 974 * ... and that one is done if evict_chunk() decides to delay until the end
974 * of syscall. Runs synchronously. 975 * of syscall. Runs synchronously.
975 */ 976 */
976void audit_kill_trees(struct list_head *list) 977void audit_kill_trees(struct audit_context *context)
977{ 978{
979 struct list_head *list = &context->killed_trees;
980
978 audit_ctl_lock(); 981 audit_ctl_lock();
979 mutex_lock(&audit_filter_mutex); 982 mutex_lock(&audit_filter_mutex);
980 983
@@ -982,7 +985,7 @@ void audit_kill_trees(struct list_head *list)
982 struct audit_tree *victim; 985 struct audit_tree *victim;
983 986
984 victim = list_entry(list->next, struct audit_tree, list); 987 victim = list_entry(list->next, struct audit_tree, list);
985 kill_rules(victim); 988 kill_rules(context, victim);
986 list_del_init(&victim->list); 989 list_del_init(&victim->list);
987 990
988 mutex_unlock(&audit_filter_mutex); 991 mutex_unlock(&audit_filter_mutex);
@@ -1017,7 +1020,7 @@ static void evict_chunk(struct audit_chunk *chunk)
1017 list_del_init(&owner->same_root); 1020 list_del_init(&owner->same_root);
1018 spin_unlock(&hash_lock); 1021 spin_unlock(&hash_lock);
1019 if (!postponed) { 1022 if (!postponed) {
1020 kill_rules(owner); 1023 kill_rules(audit_context(), owner);
1021 list_move(&owner->list, &prune_list); 1024 list_move(&owner->list, &prune_list);
1022 need_prune = 1; 1025 need_prune = 1;
1023 } else { 1026 } else {
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 20ef9ba134b0..e8d1adeb2223 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -242,7 +242,7 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc
242 242
243 if (!audit_enabled) 243 if (!audit_enabled)
244 return; 244 return;
245 ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); 245 ab = audit_log_start(audit_context(), GFP_NOFS, AUDIT_CONFIG_CHANGE);
246 if (!ab) 246 if (!ab)
247 return; 247 return;
248 audit_log_session_info(ab); 248 audit_log_session_info(ab);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index bf309f2592c4..63f8b3f26fab 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -670,7 +670,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
670 data->values[i] = AUDIT_UID_UNSET; 670 data->values[i] = AUDIT_UID_UNSET;
671 break; 671 break;
672 } 672 }
673 /* fallthrough if set */ 673 /* fall through - if set */
674 default: 674 default:
675 data->values[i] = f->val; 675 data->values[i] = f->val;
676 } 676 }
@@ -1091,7 +1091,7 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re
1091 if (!audit_enabled) 1091 if (!audit_enabled)
1092 return; 1092 return;
1093 1093
1094 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 1094 ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE);
1095 if (!ab) 1095 if (!ab)
1096 return; 1096 return;
1097 audit_log_session_info(ab); 1097 audit_log_session_info(ab);
@@ -1355,7 +1355,7 @@ int audit_filter(int msgtype, unsigned int listtype)
1355 if (f->lsm_rule) { 1355 if (f->lsm_rule) {
1356 security_task_getsecid(current, &sid); 1356 security_task_getsecid(current, &sid);
1357 result = security_audit_rule_match(sid, 1357 result = security_audit_rule_match(sid,
1358 f->type, f->op, f->lsm_rule, NULL); 1358 f->type, f->op, f->lsm_rule);
1359 } 1359 }
1360 break; 1360 break;
1361 case AUDIT_EXE: 1361 case AUDIT_EXE:
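Both auditfilter.c hunks that touch security_audit_rule_match() reflect an LSM interface change: the unused struct audit_context argument is dropped, leaving a four-argument hook. The new call shape, as a sketch built from the hunk above:

	u32 sid;
	int result;

	security_task_getsecid(current, &sid);
	/* old form ended with an extra ctx argument; it is gone now */
	result = security_audit_rule_match(sid, f->type, f->op, f->lsm_rule);

The matching AppArmor-side signature change appears in the security/apparmor hunks further down.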
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 6593a5207fb0..d1eab1d4a930 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -631,9 +631,8 @@ static int audit_filter_rules(struct task_struct *tsk,
631 need_sid = 0; 631 need_sid = 0;
632 } 632 }
633 result = security_audit_rule_match(sid, f->type, 633 result = security_audit_rule_match(sid, f->type,
634 f->op, 634 f->op,
635 f->lsm_rule, 635 f->lsm_rule);
636 ctx);
637 } 636 }
638 break; 637 break;
639 case AUDIT_OBJ_USER: 638 case AUDIT_OBJ_USER:
@@ -647,13 +646,17 @@ static int audit_filter_rules(struct task_struct *tsk,
647 /* Find files that match */ 646 /* Find files that match */
648 if (name) { 647 if (name) {
649 result = security_audit_rule_match( 648 result = security_audit_rule_match(
650 name->osid, f->type, f->op, 649 name->osid,
651 f->lsm_rule, ctx); 650 f->type,
651 f->op,
652 f->lsm_rule);
652 } else if (ctx) { 653 } else if (ctx) {
653 list_for_each_entry(n, &ctx->names_list, list) { 654 list_for_each_entry(n, &ctx->names_list, list) {
654 if (security_audit_rule_match(n->osid, f->type, 655 if (security_audit_rule_match(
655 f->op, f->lsm_rule, 656 n->osid,
656 ctx)) { 657 f->type,
658 f->op,
659 f->lsm_rule)) {
657 ++result; 660 ++result;
658 break; 661 break;
659 } 662 }
@@ -664,7 +667,7 @@ static int audit_filter_rules(struct task_struct *tsk,
664 break; 667 break;
665 if (security_audit_rule_match(ctx->ipc.osid, 668 if (security_audit_rule_match(ctx->ipc.osid,
666 f->type, f->op, 669 f->type, f->op,
667 f->lsm_rule, ctx)) 670 f->lsm_rule))
668 ++result; 671 ++result;
669 } 672 }
670 break; 673 break;
@@ -1136,6 +1139,32 @@ out:
1136 kfree(buf_head); 1139 kfree(buf_head);
1137} 1140}
1138 1141
1142void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
1143{
1144 int i;
1145
1146 if (cap_isclear(*cap)) {
1147 audit_log_format(ab, " %s=0", prefix);
1148 return;
1149 }
1150 audit_log_format(ab, " %s=", prefix);
1151 CAP_FOR_EACH_U32(i)
1152 audit_log_format(ab, "%08x", cap->cap[CAP_LAST_U32 - i]);
1153}
1154
1155static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
1156{
1157 if (name->fcap_ver == -1) {
1158 audit_log_format(ab, " cap_fe=? cap_fver=? cap_fp=? cap_fi=?");
1159 return;
1160 }
1161 audit_log_cap(ab, "cap_fp", &name->fcap.permitted);
1162 audit_log_cap(ab, "cap_fi", &name->fcap.inheritable);
1163 audit_log_format(ab, " cap_fe=%d cap_fver=%x cap_frootid=%d",
1164 name->fcap.fE, name->fcap_ver,
1165 from_kuid(&init_user_ns, name->fcap.rootid));
1166}
1167
1139static void show_special(struct audit_context *context, int *call_panic) 1168static void show_special(struct audit_context *context, int *call_panic)
1140{ 1169{
1141 struct audit_buffer *ab; 1170 struct audit_buffer *ab;
@@ -1258,6 +1287,97 @@ static inline int audit_proctitle_rtrim(char *proctitle, int len)
1258 return len; 1287 return len;
1259} 1288}
1260 1289
1290/*
1291 * audit_log_name - produce AUDIT_PATH record from struct audit_names
1292 * @context: audit_context for the task
1293 * @n: audit_names structure with reportable details
1294 * @path: optional path to report instead of audit_names->name
1295 * @record_num: record number to report when handling a list of names
1296 * @call_panic: optional pointer to int that will be updated if secid fails
1297 */
1298static void audit_log_name(struct audit_context *context, struct audit_names *n,
1299 const struct path *path, int record_num, int *call_panic)
1300{
1301 struct audit_buffer *ab;
1302
1303 ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
1304 if (!ab)
1305 return;
1306
1307 audit_log_format(ab, "item=%d", record_num);
1308
1309 if (path)
1310 audit_log_d_path(ab, " name=", path);
1311 else if (n->name) {
1312 switch (n->name_len) {
1313 case AUDIT_NAME_FULL:
1314 /* log the full path */
1315 audit_log_format(ab, " name=");
1316 audit_log_untrustedstring(ab, n->name->name);
1317 break;
1318 case 0:
1319 /* name was specified as a relative path and the
1320 * directory component is the cwd
1321 */
1322 audit_log_d_path(ab, " name=", &context->pwd);
1323 break;
1324 default:
1325 /* log the name's directory component */
1326 audit_log_format(ab, " name=");
1327 audit_log_n_untrustedstring(ab, n->name->name,
1328 n->name_len);
1329 }
1330 } else
1331 audit_log_format(ab, " name=(null)");
1332
1333 if (n->ino != AUDIT_INO_UNSET)
1334 audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#ho ouid=%u ogid=%u rdev=%02x:%02x",
1335 n->ino,
1336 MAJOR(n->dev),
1337 MINOR(n->dev),
1338 n->mode,
1339 from_kuid(&init_user_ns, n->uid),
1340 from_kgid(&init_user_ns, n->gid),
1341 MAJOR(n->rdev),
1342 MINOR(n->rdev));
1343 if (n->osid != 0) {
1344 char *ctx = NULL;
1345 u32 len;
1346
1347 if (security_secid_to_secctx(
1348 n->osid, &ctx, &len)) {
1349 audit_log_format(ab, " osid=%u", n->osid);
1350 if (call_panic)
1351 *call_panic = 2;
1352 } else {
1353 audit_log_format(ab, " obj=%s", ctx);
1354 security_release_secctx(ctx, len);
1355 }
1356 }
1357
1358 /* log the audit_names record type */
1359 switch (n->type) {
1360 case AUDIT_TYPE_NORMAL:
1361 audit_log_format(ab, " nametype=NORMAL");
1362 break;
1363 case AUDIT_TYPE_PARENT:
1364 audit_log_format(ab, " nametype=PARENT");
1365 break;
1366 case AUDIT_TYPE_CHILD_DELETE:
1367 audit_log_format(ab, " nametype=DELETE");
1368 break;
1369 case AUDIT_TYPE_CHILD_CREATE:
1370 audit_log_format(ab, " nametype=CREATE");
1371 break;
1372 default:
1373 audit_log_format(ab, " nametype=UNKNOWN");
1374 break;
1375 }
1376
1377 audit_log_fcaps(ab, n);
1378 audit_log_end(ab);
1379}
1380
1261static void audit_log_proctitle(void) 1381static void audit_log_proctitle(void)
1262{ 1382{
1263 int res; 1383 int res;
@@ -1358,6 +1478,9 @@ static void audit_log_exit(void)
1358 audit_log_cap(ab, "pi", &axs->new_pcap.inheritable); 1478 audit_log_cap(ab, "pi", &axs->new_pcap.inheritable);
1359 audit_log_cap(ab, "pe", &axs->new_pcap.effective); 1479 audit_log_cap(ab, "pe", &axs->new_pcap.effective);
1360 audit_log_cap(ab, "pa", &axs->new_pcap.ambient); 1480 audit_log_cap(ab, "pa", &axs->new_pcap.ambient);
1481 audit_log_format(ab, " frootid=%d",
1482 from_kuid(&init_user_ns,
1483 axs->fcap.rootid));
1361 break; } 1484 break; }
1362 1485
1363 } 1486 }
@@ -1444,6 +1567,9 @@ void __audit_free(struct task_struct *tsk)
1444 if (!context) 1567 if (!context)
1445 return; 1568 return;
1446 1569
1570 if (!list_empty(&context->killed_trees))
1571 audit_kill_trees(context);
1572
1447 /* We are called either by do_exit() or the fork() error handling code; 1573 /* We are called either by do_exit() or the fork() error handling code;
1448 * in the former case tsk == current and in the latter tsk is a 1574 * in the former case tsk == current and in the latter tsk is a
1449 * random task_struct that doesn't have any meaningful data we 1575 * random task_struct that doesn't have any meaningful data we
@@ -1460,9 +1586,6 @@ void __audit_free(struct task_struct *tsk)
1460 audit_log_exit(); 1586 audit_log_exit();
1461 } 1587 }
1462 1588
1463 if (!list_empty(&context->killed_trees))
1464 audit_kill_trees(&context->killed_trees);
1465
1466 audit_set_context(tsk, NULL); 1589 audit_set_context(tsk, NULL);
1467 audit_free_context(context); 1590 audit_free_context(context);
1468} 1591}
@@ -1537,6 +1660,9 @@ void __audit_syscall_exit(int success, long return_code)
1537 if (!context) 1660 if (!context)
1538 return; 1661 return;
1539 1662
1663 if (!list_empty(&context->killed_trees))
1664 audit_kill_trees(context);
1665
1540 if (!context->dummy && context->in_syscall) { 1666 if (!context->dummy && context->in_syscall) {
1541 if (success) 1667 if (success)
1542 context->return_valid = AUDITSC_SUCCESS; 1668 context->return_valid = AUDITSC_SUCCESS;
@@ -1571,9 +1697,6 @@ void __audit_syscall_exit(int success, long return_code)
1571 context->in_syscall = 0; 1697 context->in_syscall = 0;
1572 context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; 1698 context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
1573 1699
1574 if (!list_empty(&context->killed_trees))
1575 audit_kill_trees(&context->killed_trees);
1576
1577 audit_free_names(context); 1700 audit_free_names(context);
1578 unroll_tree_refs(context, NULL, 0); 1701 unroll_tree_refs(context, NULL, 0);
1579 audit_free_aux(context); 1702 audit_free_aux(context);
@@ -1750,6 +1873,47 @@ void __audit_getname(struct filename *name)
1750 get_fs_pwd(current->fs, &context->pwd); 1873 get_fs_pwd(current->fs, &context->pwd);
1751} 1874}
1752 1875
1876static inline int audit_copy_fcaps(struct audit_names *name,
1877 const struct dentry *dentry)
1878{
1879 struct cpu_vfs_cap_data caps;
1880 int rc;
1881
1882 if (!dentry)
1883 return 0;
1884
1885 rc = get_vfs_caps_from_disk(dentry, &caps);
1886 if (rc)
1887 return rc;
1888
1889 name->fcap.permitted = caps.permitted;
1890 name->fcap.inheritable = caps.inheritable;
1891 name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
1892 name->fcap.rootid = caps.rootid;
1893 name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >>
1894 VFS_CAP_REVISION_SHIFT;
1895
1896 return 0;
1897}
1898
1899/* Copy inode data into an audit_names. */
1900void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
1901 struct inode *inode, unsigned int flags)
1902{
1903 name->ino = inode->i_ino;
1904 name->dev = inode->i_sb->s_dev;
1905 name->mode = inode->i_mode;
1906 name->uid = inode->i_uid;
1907 name->gid = inode->i_gid;
1908 name->rdev = inode->i_rdev;
1909 security_inode_getsecid(inode, &name->osid);
1910 if (flags & AUDIT_INODE_NOEVAL) {
1911 name->fcap_ver = -1;
1912 return;
1913 }
1914 audit_copy_fcaps(name, dentry);
1915}
1916
1753/** 1917/**
1754 * __audit_inode - store the inode and device from a lookup 1918 * __audit_inode - store the inode and device from a lookup
1755 * @name: name being audited 1919 * @name: name being audited
@@ -1763,10 +1927,31 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
1763 struct inode *inode = d_backing_inode(dentry); 1927 struct inode *inode = d_backing_inode(dentry);
1764 struct audit_names *n; 1928 struct audit_names *n;
1765 bool parent = flags & AUDIT_INODE_PARENT; 1929 bool parent = flags & AUDIT_INODE_PARENT;
1930 struct audit_entry *e;
1931 struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS];
1932 int i;
1766 1933
1767 if (!context->in_syscall) 1934 if (!context->in_syscall)
1768 return; 1935 return;
1769 1936
1937 rcu_read_lock();
1938 if (!list_empty(list)) {
1939 list_for_each_entry_rcu(e, list, list) {
1940 for (i = 0; i < e->rule.field_count; i++) {
1941 struct audit_field *f = &e->rule.fields[i];
1942
1943 if (f->type == AUDIT_FSTYPE
1944 && audit_comparator(inode->i_sb->s_magic,
1945 f->op, f->val)
1946 && e->rule.action == AUDIT_NEVER) {
1947 rcu_read_unlock();
1948 return;
1949 }
1950 }
1951 }
1952 }
1953 rcu_read_unlock();
1954
1770 if (!name) 1955 if (!name)
1771 goto out_alloc; 1956 goto out_alloc;
1772 1957
@@ -1832,7 +2017,7 @@ out:
1832 n->type = AUDIT_TYPE_NORMAL; 2017 n->type = AUDIT_TYPE_NORMAL;
1833 } 2018 }
1834 handle_path(dentry); 2019 handle_path(dentry);
1835 audit_copy_inode(n, dentry, inode); 2020 audit_copy_inode(n, dentry, inode, flags & AUDIT_INODE_NOEVAL);
1836} 2021}
1837 2022
1838void __audit_file(const struct file *file) 2023void __audit_file(const struct file *file)
@@ -1875,14 +2060,12 @@ void __audit_inode_child(struct inode *parent,
1875 for (i = 0; i < e->rule.field_count; i++) { 2060 for (i = 0; i < e->rule.field_count; i++) {
1876 struct audit_field *f = &e->rule.fields[i]; 2061 struct audit_field *f = &e->rule.fields[i];
1877 2062
1878 if (f->type == AUDIT_FSTYPE) { 2063 if (f->type == AUDIT_FSTYPE
1879 if (audit_comparator(parent->i_sb->s_magic, 2064 && audit_comparator(parent->i_sb->s_magic,
1880 f->op, f->val)) { 2065 f->op, f->val)
1881 if (e->rule.action == AUDIT_NEVER) { 2066 && e->rule.action == AUDIT_NEVER) {
1882 rcu_read_unlock(); 2067 rcu_read_unlock();
1883 return; 2068 return;
1884 }
1885 }
1886 } 2069 }
1887 } 2070 }
1888 } 2071 }
@@ -1933,7 +2116,7 @@ void __audit_inode_child(struct inode *parent,
1933 n = audit_alloc_name(context, AUDIT_TYPE_PARENT); 2116 n = audit_alloc_name(context, AUDIT_TYPE_PARENT);
1934 if (!n) 2117 if (!n)
1935 return; 2118 return;
1936 audit_copy_inode(n, NULL, parent); 2119 audit_copy_inode(n, NULL, parent, 0);
1937 } 2120 }
1938 2121
1939 if (!found_child) { 2122 if (!found_child) {
@@ -1952,7 +2135,7 @@ void __audit_inode_child(struct inode *parent,
1952 } 2135 }
1953 2136
1954 if (inode) 2137 if (inode)
1955 audit_copy_inode(found_child, dentry, inode); 2138 audit_copy_inode(found_child, dentry, inode, 0);
1956 else 2139 else
1957 found_child->ino = AUDIT_INO_UNSET; 2140 found_child->ino = AUDIT_INO_UNSET;
1958} 2141}
@@ -1983,90 +2166,6 @@ int auditsc_get_stamp(struct audit_context *ctx,
1983 return 1; 2166 return 1;
1984} 2167}
1985 2168
1986/* global counter which is incremented every time something logs in */
1987static atomic_t session_id = ATOMIC_INIT(0);
1988
1989static int audit_set_loginuid_perm(kuid_t loginuid)
1990{
1991 /* if we are unset, we don't need privs */
1992 if (!audit_loginuid_set(current))
1993 return 0;
1994 /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/
1995 if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE))
1996 return -EPERM;
1997 /* it is set, you need permission */
1998 if (!capable(CAP_AUDIT_CONTROL))
1999 return -EPERM;
2000 /* reject if this is not an unset and we don't allow that */
2001 if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) && uid_valid(loginuid))
2002 return -EPERM;
2003 return 0;
2004}
2005
2006static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
2007 unsigned int oldsessionid, unsigned int sessionid,
2008 int rc)
2009{
2010 struct audit_buffer *ab;
2011 uid_t uid, oldloginuid, loginuid;
2012 struct tty_struct *tty;
2013
2014 if (!audit_enabled)
2015 return;
2016
2017 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
2018 if (!ab)
2019 return;
2020
2021 uid = from_kuid(&init_user_ns, task_uid(current));
2022 oldloginuid = from_kuid(&init_user_ns, koldloginuid);
2023 loginuid = from_kuid(&init_user_ns, kloginuid),
2024 tty = audit_get_tty();
2025
2026 audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid);
2027 audit_log_task_context(ab);
2028 audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d",
2029 oldloginuid, loginuid, tty ? tty_name(tty) : "(none)",
2030 oldsessionid, sessionid, !rc);
2031 audit_put_tty(tty);
2032 audit_log_end(ab);
2033}
2034
2035/**
2036 * audit_set_loginuid - set current task's audit_context loginuid
2037 * @loginuid: loginuid value
2038 *
2039 * Returns 0.
2040 *
2041 * Called (set) from fs/proc/base.c::proc_loginuid_write().
2042 */
2043int audit_set_loginuid(kuid_t loginuid)
2044{
2045 unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET;
2046 kuid_t oldloginuid;
2047 int rc;
2048
2049 oldloginuid = audit_get_loginuid(current);
2050 oldsessionid = audit_get_sessionid(current);
2051
2052 rc = audit_set_loginuid_perm(loginuid);
2053 if (rc)
2054 goto out;
2055
2056 /* are we setting or clearing? */
2057 if (uid_valid(loginuid)) {
2058 sessionid = (unsigned int)atomic_inc_return(&session_id);
2059 if (unlikely(sessionid == AUDIT_SID_UNSET))
2060 sessionid = (unsigned int)atomic_inc_return(&session_id);
2061 }
2062
2063 current->sessionid = sessionid;
2064 current->loginuid = loginuid;
2065out:
2066 audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc);
2067 return rc;
2068}
2069
2070/** 2169/**
2071 * __audit_mq_open - record audit data for a POSIX MQ open 2170 * __audit_mq_open - record audit data for a POSIX MQ open
2072 * @oflag: open flag 2171 * @oflag: open flag
@@ -2355,6 +2454,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
2355 ax->fcap.permitted = vcaps.permitted; 2454 ax->fcap.permitted = vcaps.permitted;
2356 ax->fcap.inheritable = vcaps.inheritable; 2455 ax->fcap.inheritable = vcaps.inheritable;
2357 ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); 2456 ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
2457 ax->fcap.rootid = vcaps.rootid;
2358 ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT; 2458 ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT;
2359 2459
2360 ax->old_pcap.permitted = old->cap_permitted; 2460 ax->old_pcap.permitted = old->cap_permitted;
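The auditsc.c changes center on file-capability reporting: audit_copy_inode() gains a flags argument so lookups done with AUDIT_INODE_NOEVAL skip reading fcaps from disk, and the v3 namespaced-capability root id is now logged (cap_frootid / frootid). A sketch of the two call styles, per the hunks above:

	/* normal path: snapshot inode data and fcaps from the dentry */
	audit_copy_inode(n, dentry, inode, 0);

	/* no-eval lookup: inode data only; fcap_ver is set to -1 so
	 * audit_log_fcaps() emits "cap_fe=? cap_fver=? cap_fp=? cap_fi=?" */
	audit_copy_inode(n, dentry, inode, AUDIT_INODE_NOEVAL);

Note also that audit_kill_trees() now runs at the top of __audit_free() and __audit_syscall_exit(), while the context is still fully live, rather than after the exit record has been assembled.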
diff --git a/kernel/capability.c b/kernel/capability.c
index 1e1c0236f55b..1444f3954d75 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -93,9 +93,7 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
93 break; 93 break;
94 case _LINUX_CAPABILITY_VERSION_2: 94 case _LINUX_CAPABILITY_VERSION_2:
95 warn_deprecated_v2(); 95 warn_deprecated_v2();
96 /* 96 /* fall through - v3 is otherwise equivalent to v2. */
97 * fall through - v3 is otherwise equivalent to v2.
98 */
99 case _LINUX_CAPABILITY_VERSION_3: 97 case _LINUX_CAPABILITY_VERSION_3:
100 *tocopy = _LINUX_CAPABILITY_U32S_3; 98 *tocopy = _LINUX_CAPABILITY_U32S_3;
101 break; 99 break;
@@ -299,7 +297,7 @@ bool has_ns_capability(struct task_struct *t,
299 int ret; 297 int ret;
300 298
301 rcu_read_lock(); 299 rcu_read_lock();
302 ret = security_capable(__task_cred(t), ns, cap); 300 ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NONE);
303 rcu_read_unlock(); 301 rcu_read_unlock();
304 302
305 return (ret == 0); 303 return (ret == 0);
@@ -340,7 +338,7 @@ bool has_ns_capability_noaudit(struct task_struct *t,
340 int ret; 338 int ret;
341 339
342 rcu_read_lock(); 340 rcu_read_lock();
343 ret = security_capable_noaudit(__task_cred(t), ns, cap); 341 ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NOAUDIT);
344 rcu_read_unlock(); 342 rcu_read_unlock();
345 343
346 return (ret == 0); 344 return (ret == 0);
@@ -363,7 +361,9 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
363 return has_ns_capability_noaudit(t, &init_user_ns, cap); 361 return has_ns_capability_noaudit(t, &init_user_ns, cap);
364} 362}
365 363
366static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit) 364static bool ns_capable_common(struct user_namespace *ns,
365 int cap,
366 unsigned int opts)
367{ 367{
368 int capable; 368 int capable;
369 369
@@ -372,8 +372,7 @@ static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit)
372 BUG(); 372 BUG();
373 } 373 }
374 374
375 capable = audit ? security_capable(current_cred(), ns, cap) : 375 capable = security_capable(current_cred(), ns, cap, opts);
376 security_capable_noaudit(current_cred(), ns, cap);
377 if (capable == 0) { 376 if (capable == 0) {
378 current->flags |= PF_SUPERPRIV; 377 current->flags |= PF_SUPERPRIV;
379 return true; 378 return true;
@@ -394,7 +393,7 @@ static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit)
394 */ 393 */
395bool ns_capable(struct user_namespace *ns, int cap) 394bool ns_capable(struct user_namespace *ns, int cap)
396{ 395{
397 return ns_capable_common(ns, cap, true); 396 return ns_capable_common(ns, cap, CAP_OPT_NONE);
398} 397}
399EXPORT_SYMBOL(ns_capable); 398EXPORT_SYMBOL(ns_capable);
400 399
@@ -412,11 +411,30 @@ EXPORT_SYMBOL(ns_capable);
412 */ 411 */
413bool ns_capable_noaudit(struct user_namespace *ns, int cap) 412bool ns_capable_noaudit(struct user_namespace *ns, int cap)
414{ 413{
415 return ns_capable_common(ns, cap, false); 414 return ns_capable_common(ns, cap, CAP_OPT_NOAUDIT);
416} 415}
417EXPORT_SYMBOL(ns_capable_noaudit); 416EXPORT_SYMBOL(ns_capable_noaudit);
418 417
419/** 418/**
419 * ns_capable_setid - Determine if the current task has a superior capability
420 * in effect, while signalling that this check is being done from within a
421 * setid syscall.
422 * @ns: The usernamespace we want the capability in
423 * @cap: The capability to be tested for
424 *
425 * Return true if the current task has the given superior capability currently
426 * available for use, false if not.
427 *
428 * This sets PF_SUPERPRIV on the task if the capability is available on the
429 * assumption that it's about to be used.
430 */
431bool ns_capable_setid(struct user_namespace *ns, int cap)
432{
433 return ns_capable_common(ns, cap, CAP_OPT_INSETID);
434}
435EXPORT_SYMBOL(ns_capable_setid);
436
437/**
420 * capable - Determine if the current task has a superior capability in effect 438 * capable - Determine if the current task has a superior capability in effect
421 * @cap: The capability to be tested for 439 * @cap: The capability to be tested for
422 * 440 *
@@ -448,10 +466,11 @@ EXPORT_SYMBOL(capable);
448bool file_ns_capable(const struct file *file, struct user_namespace *ns, 466bool file_ns_capable(const struct file *file, struct user_namespace *ns,
449 int cap) 467 int cap)
450{ 468{
469
451 if (WARN_ON_ONCE(!cap_valid(cap))) 470 if (WARN_ON_ONCE(!cap_valid(cap)))
452 return false; 471 return false;
453 472
454 if (security_capable(file->f_cred, ns, cap) == 0) 473 if (security_capable(file->f_cred, ns, cap, CAP_OPT_NONE) == 0)
455 return true; 474 return true;
456 475
457 return false; 476 return false;
@@ -500,10 +519,12 @@ bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns)
500{ 519{
501 int ret = 0; /* An absent tracer adds no restrictions */ 520 int ret = 0; /* An absent tracer adds no restrictions */
502 const struct cred *cred; 521 const struct cred *cred;
522
503 rcu_read_lock(); 523 rcu_read_lock();
504 cred = rcu_dereference(tsk->ptracer_cred); 524 cred = rcu_dereference(tsk->ptracer_cred);
505 if (cred) 525 if (cred)
506 ret = security_capable_noaudit(cred, ns, CAP_SYS_PTRACE); 526 ret = security_capable(cred, ns, CAP_SYS_PTRACE,
527 CAP_OPT_NOAUDIT);
507 rcu_read_unlock(); 528 rcu_read_unlock();
508 return (ret == 0); 529 return (ret == 0);
509} 530}
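kernel/capability.c adapts to an LSM interface consolidation: security_capable() takes a CAP_OPT_* bitmask and the separate security_capable_noaudit() entry point goes away, which also makes room for the new CAP_OPT_INSETID flag used by ns_capable_setid(). A minimal sketch of the three option values in use (names taken from the hunks above; "granted" is an illustrative local):

	/* audited check, the common case */
	ret = security_capable(current_cred(), ns, cap, CAP_OPT_NONE);

	/* quiet probe: failure produces no denial record */
	ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NOAUDIT);

	/* setid syscall: lets an LSM distinguish this CAP_SETUID use */
	bool granted = ns_capable_setid(ns, CAP_SETUID);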
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 17828333f7c3..eef24a25bda7 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -197,7 +197,7 @@ static u64 css_serial_nr_next = 1;
197 */ 197 */
198static u16 have_fork_callback __read_mostly; 198static u16 have_fork_callback __read_mostly;
199static u16 have_exit_callback __read_mostly; 199static u16 have_exit_callback __read_mostly;
200static u16 have_free_callback __read_mostly; 200static u16 have_release_callback __read_mostly;
201static u16 have_canfork_callback __read_mostly; 201static u16 have_canfork_callback __read_mostly;
202 202
203/* cgroup namespace for init task */ 203/* cgroup namespace for init task */
@@ -5326,7 +5326,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
5326 5326
5327 have_fork_callback |= (bool)ss->fork << ss->id; 5327 have_fork_callback |= (bool)ss->fork << ss->id;
5328 have_exit_callback |= (bool)ss->exit << ss->id; 5328 have_exit_callback |= (bool)ss->exit << ss->id;
5329 have_free_callback |= (bool)ss->free << ss->id; 5329 have_release_callback |= (bool)ss->release << ss->id;
5330 have_canfork_callback |= (bool)ss->can_fork << ss->id; 5330 have_canfork_callback |= (bool)ss->can_fork << ss->id;
5331 5331
5332 /* At system boot, before all subsystems have been 5332 /* At system boot, before all subsystems have been
@@ -5762,16 +5762,19 @@ void cgroup_exit(struct task_struct *tsk)
5762 } while_each_subsys_mask(); 5762 } while_each_subsys_mask();
5763} 5763}
5764 5764
5765void cgroup_free(struct task_struct *task) 5765void cgroup_release(struct task_struct *task)
5766{ 5766{
5767 struct css_set *cset = task_css_set(task);
5768 struct cgroup_subsys *ss; 5767 struct cgroup_subsys *ss;
5769 int ssid; 5768 int ssid;
5770 5769
5771 do_each_subsys_mask(ss, ssid, have_free_callback) { 5770 do_each_subsys_mask(ss, ssid, have_release_callback) {
5772 ss->free(task); 5771 ss->release(task);
5773 } while_each_subsys_mask(); 5772 } while_each_subsys_mask();
5773}
5774 5774
5775void cgroup_free(struct task_struct *task)
5776{
5777 struct css_set *cset = task_css_set(task);
5775 put_css_set(cset); 5778 put_css_set(cset);
5776} 5779}
5777 5780
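In cgroup.c the per-subsystem "free" callback is renamed to "release" and rehomed: cgroup_release() now invokes it when the task is reaped (see the kernel/exit.c hunk below), while cgroup_free() only drops the css_set reference. A hypothetical subsystem converts like this ("demo" is illustrative, mirroring the pids conversion in a later hunk):

	static void demo_release(struct task_struct *task)
	{
		/* runs from release_task() via cgroup_release(), so
		 * per-task charges can be returned as soon as the task
		 * is reaped instead of at final task_struct free */
	}

	struct cgroup_subsys demo_cgrp_subsys = {
		.release	= demo_release,
	};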
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 479743db6c37..72afd55f70c6 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -203,19 +203,6 @@ static inline struct cpuset *parent_cs(struct cpuset *cs)
203 return css_cs(cs->css.parent); 203 return css_cs(cs->css.parent);
204} 204}
205 205
206#ifdef CONFIG_NUMA
207static inline bool task_has_mempolicy(struct task_struct *task)
208{
209 return task->mempolicy;
210}
211#else
212static inline bool task_has_mempolicy(struct task_struct *task)
213{
214 return false;
215}
216#endif
217
218
219/* bits in struct cpuset flags field */ 206/* bits in struct cpuset flags field */
220typedef enum { 207typedef enum {
221 CS_ONLINE, 208 CS_ONLINE,
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 9829c67ebc0a..c9960baaa14f 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -247,7 +247,7 @@ static void pids_cancel_fork(struct task_struct *task)
247 pids_uncharge(pids, 1); 247 pids_uncharge(pids, 1);
248} 248}
249 249
250static void pids_free(struct task_struct *task) 250static void pids_release(struct task_struct *task)
251{ 251{
252 struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id)); 252 struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id));
253 253
@@ -342,7 +342,7 @@ struct cgroup_subsys pids_cgrp_subsys = {
342 .cancel_attach = pids_cancel_attach, 342 .cancel_attach = pids_cancel_attach,
343 .can_fork = pids_can_fork, 343 .can_fork = pids_can_fork,
344 .cancel_fork = pids_cancel_fork, 344 .cancel_fork = pids_cancel_fork,
345 .free = pids_free, 345 .release = pids_release,
346 .legacy_cftypes = pids_files, 346 .legacy_cftypes = pids_files,
347 .dfl_cftypes = pids_files, 347 .dfl_cftypes = pids_files,
348 .threaded = true, 348 .threaded = true,
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index d503d1a9007c..bb95a35e8c2d 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -87,7 +87,6 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
87 struct cgroup *root, int cpu) 87 struct cgroup *root, int cpu)
88{ 88{
89 struct cgroup_rstat_cpu *rstatc; 89 struct cgroup_rstat_cpu *rstatc;
90 struct cgroup *parent;
91 90
92 if (pos == root) 91 if (pos == root)
93 return NULL; 92 return NULL;
@@ -115,8 +114,8 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
115 * However, due to the way we traverse, @pos will be the first 114 * However, due to the way we traverse, @pos will be the first
116 * child in most cases. The only exception is @root. 115 * child in most cases. The only exception is @root.
117 */ 116 */
118 parent = cgroup_parent(pos); 117 if (rstatc->updated_next) {
119 if (parent && rstatc->updated_next) { 118 struct cgroup *parent = cgroup_parent(pos);
120 struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu); 119 struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
121 struct cgroup_rstat_cpu *nrstatc; 120 struct cgroup_rstat_cpu *nrstatc;
122 struct cgroup **nextp; 121 struct cgroup **nextp;
@@ -140,9 +139,12 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
140 * updated stat. 139 * updated stat.
141 */ 140 */
142 smp_mb(); 141 smp_mb();
142
143 return pos;
143 } 144 }
144 145
145 return pos; 146 /* only happens for @root */
147 return NULL;
146} 148}
147 149
148/* see cgroup_rstat_flush() */ 150/* see cgroup_rstat_flush() */
diff --git a/kernel/cred.c b/kernel/cred.c
index 21f4a97085b4..45d77284aed0 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -760,19 +760,6 @@ bool creds_are_invalid(const struct cred *cred)
760{ 760{
761 if (cred->magic != CRED_MAGIC) 761 if (cred->magic != CRED_MAGIC)
762 return true; 762 return true;
763#ifdef CONFIG_SECURITY_SELINUX
764 /*
765 * cred->security == NULL if security_cred_alloc_blank() or
766 * security_prepare_creds() returned an error.
767 */
768 if (selinux_is_enabled() && cred->security) {
769 if ((unsigned long) cred->security < PAGE_SIZE)
770 return true;
771 if ((*(u32 *)cred->security & 0xffffff00) ==
772 (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8))
773 return true;
774 }
775#endif
776 return false; 763 return false;
777} 764}
778EXPORT_SYMBOL(creds_are_invalid); 765EXPORT_SYMBOL(creds_are_invalid);
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index ca88b867e7fe..0711d18645de 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -16,6 +16,9 @@ config ARCH_DMA_ADDR_T_64BIT
16config ARCH_HAS_DMA_COHERENCE_H 16config ARCH_HAS_DMA_COHERENCE_H
17 bool 17 bool
18 18
19config ARCH_HAS_DMA_SET_MASK
20 bool
21
19config HAVE_GENERIC_DMA_COHERENT 22config HAVE_GENERIC_DMA_COHERENT
20 bool 23 bool
21 24
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 355d16acee6d..d5bb51cf27c6 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -132,8 +132,7 @@ again:
132 goto again; 132 goto again;
133 } 133 }
134 134
135 if (IS_ENABLED(CONFIG_ZONE_DMA) && 135 if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA)) {
136 phys_mask < DMA_BIT_MASK(32) && !(gfp & GFP_DMA)) {
137 gfp = (gfp & ~GFP_DMA32) | GFP_DMA; 136 gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
138 goto again; 137 goto again;
139 } 138 }
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index a11006b6d8e8..ef2aba503467 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -207,7 +207,6 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
207} 207}
208EXPORT_SYMBOL(dma_mmap_attrs); 208EXPORT_SYMBOL(dma_mmap_attrs);
209 209
210#ifndef ARCH_HAS_DMA_GET_REQUIRED_MASK
211static u64 dma_default_get_required_mask(struct device *dev) 210static u64 dma_default_get_required_mask(struct device *dev)
212{ 211{
213 u32 low_totalram = ((max_pfn - 1) << PAGE_SHIFT); 212 u32 low_totalram = ((max_pfn - 1) << PAGE_SHIFT);
@@ -238,7 +237,6 @@ u64 dma_get_required_mask(struct device *dev)
238 return dma_default_get_required_mask(dev); 237 return dma_default_get_required_mask(dev);
239} 238}
240EXPORT_SYMBOL_GPL(dma_get_required_mask); 239EXPORT_SYMBOL_GPL(dma_get_required_mask);
241#endif
242 240
243#ifndef arch_dma_alloc_attrs 241#ifndef arch_dma_alloc_attrs
244#define arch_dma_alloc_attrs(dev) (true) 242#define arch_dma_alloc_attrs(dev) (true)
@@ -318,18 +316,23 @@ int dma_supported(struct device *dev, u64 mask)
318} 316}
319EXPORT_SYMBOL(dma_supported); 317EXPORT_SYMBOL(dma_supported);
320 318
321#ifndef HAVE_ARCH_DMA_SET_MASK 319#ifdef CONFIG_ARCH_HAS_DMA_SET_MASK
320void arch_dma_set_mask(struct device *dev, u64 mask);
321#else
322#define arch_dma_set_mask(dev, mask) do { } while (0)
323#endif
324
322int dma_set_mask(struct device *dev, u64 mask) 325int dma_set_mask(struct device *dev, u64 mask)
323{ 326{
324 if (!dev->dma_mask || !dma_supported(dev, mask)) 327 if (!dev->dma_mask || !dma_supported(dev, mask))
325 return -EIO; 328 return -EIO;
326 329
330 arch_dma_set_mask(dev, mask);
327 dma_check_mask(dev, mask); 331 dma_check_mask(dev, mask);
328 *dev->dma_mask = mask; 332 *dev->dma_mask = mask;
329 return 0; 333 return 0;
330} 334}
331EXPORT_SYMBOL(dma_set_mask); 335EXPORT_SYMBOL(dma_set_mask);
332#endif
333 336
334#ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK 337#ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK
335int dma_set_coherent_mask(struct device *dev, u64 mask) 338int dma_set_coherent_mask(struct device *dev, u64 mask)
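The mapping.c hunk replaces the all-or-nothing HAVE_ARCH_DMA_SET_MASK override of dma_set_mask() with a narrower hook: an architecture selects CONFIG_ARCH_HAS_DMA_SET_MASK (added to kernel/dma/Kconfig above) and supplies only arch_dma_set_mask(), while the generic code keeps the dma_supported() validation and the mask store. A hypothetical arch-side implementation, matching the declaration in the hunk:

	/* enabled by "select ARCH_HAS_DMA_SET_MASK" in the arch Kconfig */
	void arch_dma_set_mask(struct device *dev, u64 mask)
	{
		/* e.g. reprogram a bus bridge window for this mask;
		 * the generic dma_set_mask() has already validated it
		 * and will store it into *dev->dma_mask afterwards */
	}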
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 1fb6fd68b9c7..6d0236bd3929 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -650,15 +650,3 @@ bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr,
650 650
651 return true; 651 return true;
652} 652}
653
654/*
655 * Return whether the given device DMA address mask can be supported
656 * properly. For example, if your device can only drive the low 24-bits
657 * during bus mastering, then you would pass 0x00ffffff as the mask to
658 * this function.
659 */
660int
661swiotlb_dma_supported(struct device *hwdev, u64 mask)
662{
663 return __phys_to_dma(hwdev, io_tlb_end - 1) <= mask;
664}
diff --git a/kernel/exit.c b/kernel/exit.c
index 2639a30a8aa5..2166c2d92ddc 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -219,6 +219,7 @@ repeat:
219 } 219 }
220 220
221 write_unlock_irq(&tasklist_lock); 221 write_unlock_irq(&tasklist_lock);
222 cgroup_release(p);
222 release_thread(p); 223 release_thread(p);
223 call_rcu(&p->rcu, delayed_put_task_struct); 224 call_rcu(&p->rcu, delayed_put_task_struct);
224 225
diff --git a/kernel/resource.c b/kernel/resource.c
index 915c02e8e5dd..e81b17b53fa5 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -448,8 +448,6 @@ int walk_mem_res(u64 start, u64 end, void *arg,
448 arg, func); 448 arg, func);
449} 449}
450 450
451#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
452
453/* 451/*
454 * This function calls the @func callback against all memory ranges of type 452 * This function calls the @func callback against all memory ranges of type
455 * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOURCE_BUSY. 453 * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOURCE_BUSY.
@@ -481,8 +479,6 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
481 return ret; 479 return ret;
482} 480}
483 481
484#endif
485
486static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg) 482static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
487{ 483{
488 return 1; 484 return 1;
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index a43c601ac252..54a0347ca812 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -445,8 +445,8 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
445 * behavior of privileged children. 445 * behavior of privileged children.
446 */ 446 */
447 if (!task_no_new_privs(current) && 447 if (!task_no_new_privs(current) &&
448 security_capable_noaudit(current_cred(), current_user_ns(), 448 security_capable(current_cred(), current_user_ns(),
449 CAP_SYS_ADMIN) != 0) 449 CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) != 0)
450 return ERR_PTR(-EACCES); 450 return ERR_PTR(-EACCES);
451 451
452 /* Allocate a new seccomp_filter */ 452 /* Allocate a new seccomp_filter */
diff --git a/kernel/sys.c b/kernel/sys.c
index dc5d9e636d48..12df0e5434b8 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -516,7 +516,7 @@ long __sys_setreuid(uid_t ruid, uid_t euid)
516 new->uid = kruid; 516 new->uid = kruid;
517 if (!uid_eq(old->uid, kruid) && 517 if (!uid_eq(old->uid, kruid) &&
518 !uid_eq(old->euid, kruid) && 518 !uid_eq(old->euid, kruid) &&
519 !ns_capable(old->user_ns, CAP_SETUID)) 519 !ns_capable_setid(old->user_ns, CAP_SETUID))
520 goto error; 520 goto error;
521 } 521 }
522 522
@@ -525,7 +525,7 @@ long __sys_setreuid(uid_t ruid, uid_t euid)
525 if (!uid_eq(old->uid, keuid) && 525 if (!uid_eq(old->uid, keuid) &&
526 !uid_eq(old->euid, keuid) && 526 !uid_eq(old->euid, keuid) &&
527 !uid_eq(old->suid, keuid) && 527 !uid_eq(old->suid, keuid) &&
528 !ns_capable(old->user_ns, CAP_SETUID)) 528 !ns_capable_setid(old->user_ns, CAP_SETUID))
529 goto error; 529 goto error;
530 } 530 }
531 531
@@ -584,7 +584,7 @@ long __sys_setuid(uid_t uid)
584 old = current_cred(); 584 old = current_cred();
585 585
586 retval = -EPERM; 586 retval = -EPERM;
587 if (ns_capable(old->user_ns, CAP_SETUID)) { 587 if (ns_capable_setid(old->user_ns, CAP_SETUID)) {
588 new->suid = new->uid = kuid; 588 new->suid = new->uid = kuid;
589 if (!uid_eq(kuid, old->uid)) { 589 if (!uid_eq(kuid, old->uid)) {
590 retval = set_user(new); 590 retval = set_user(new);
@@ -646,7 +646,7 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
646 old = current_cred(); 646 old = current_cred();
647 647
648 retval = -EPERM; 648 retval = -EPERM;
649 if (!ns_capable(old->user_ns, CAP_SETUID)) { 649 if (!ns_capable_setid(old->user_ns, CAP_SETUID)) {
650 if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && 650 if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) &&
651 !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid)) 651 !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid))
652 goto error; 652 goto error;
@@ -814,7 +814,7 @@ long __sys_setfsuid(uid_t uid)
814 814
815 if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) || 815 if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) ||
816 uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) || 816 uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) ||
817 ns_capable(old->user_ns, CAP_SETUID)) { 817 ns_capable_setid(old->user_ns, CAP_SETUID)) {
818 if (!uid_eq(kuid, old->fsuid)) { 818 if (!uid_eq(kuid, old->fsuid)) {
819 new->fsuid = kuid; 819 new->fsuid = kuid;
820 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) 820 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
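Every CAP_SETUID check in the setuid() family above switches from ns_capable() to ns_capable_setid(), whose only difference is passing CAP_OPT_INSETID down to security_capable(). The point is policy precision: an LSM such as SafeSetID can restrict uid transitions without interfering with unrelated CAP_SETUID uses. A sketch of the pattern (the "demo" helper is illustrative):

	static int demo_setuid_perm(const struct cred *old, kuid_t kuid)
	{
		/* the LSM sees CAP_OPT_INSETID and may veto just this */
		if (!uid_eq(kuid, old->uid) &&
		    !ns_capable_setid(old->user_ns, CAP_SETUID))
			return -EPERM;
		return 0;
	}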
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 27821480105e..217ef481fbbb 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1301,7 +1301,7 @@ static int parse_pred(const char *str, void *data,
1301 /* go past the last quote */ 1301 /* go past the last quote */
1302 i++; 1302 i++;
1303 1303
1304 } else if (isdigit(str[i])) { 1304 } else if (isdigit(str[i]) || str[i] == '-') {
1305 1305
1306 /* Make sure the field is not a string */ 1306 /* Make sure the field is not a string */
1307 if (is_string_field(field)) { 1307 if (is_string_field(field)) {
@@ -1314,6 +1314,9 @@ static int parse_pred(const char *str, void *data,
1314 goto err_free; 1314 goto err_free;
1315 } 1315 }
1316 1316
1317 if (str[i] == '-')
1318 i++;
1319
1317 /* We allow 0xDEADBEEF */ 1320 /* We allow 0xDEADBEEF */
1318 while (isalnum(str[i])) 1321 while (isalnum(str[i]))
1319 i++; 1322 i++;
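The trace_events_filter.c change teaches parse_pred() to accept a signed numeric literal: '-' may now begin a number, and the sign is skipped before the existing digit/hex scan. A minimal user-space sketch of the updated scan rule (illustrative only):

	#include <ctype.h>

	/* returns the index just past a literal such as "-1" or "0xDEADBEEF" */
	static int scan_number(const char *str, int i)
	{
		if (str[i] == '-')	/* new: allow a leading sign */
			i++;
		while (isalnum((unsigned char)str[i]))
			i++;
		return i;
	}

In practice a filter string such as "prev_prio == -1" (field name illustrative) now parses instead of being rejected at the '-'.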
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 9eaf07f99212..99592c27465e 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -865,7 +865,7 @@ fetch_store_strlen(unsigned long addr)
865 u8 c; 865 u8 c;
866 866
867 do { 867 do {
868 ret = probe_mem_read(&c, (u8 *)addr + len, 1); 868 ret = probe_kernel_read(&c, (u8 *)addr + len, 1);
869 len++; 869 len++;
870 } while (c && ret == 0 && len < MAX_STRING_SIZE); 870 } while (c && ret == 0 && len < MAX_STRING_SIZE);
871 871
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index d51c37dd9422..7abbeed13421 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -648,7 +648,7 @@ static void set_work_pool_and_clear_pending(struct work_struct *work,
648 * The following mb guarantees that previous clear of a PENDING bit 648 * The following mb guarantees that previous clear of a PENDING bit
649 * will not be reordered with any speculative LOADS or STORES from 649 * will not be reordered with any speculative LOADS or STORES from
650 * work->current_func, which is executed afterwards. This possible 650 * work->current_func, which is executed afterwards. This possible
651 * reordering can lead to a missed execution on attempt to qeueue 651 * reordering can lead to a missed execution on attempt to queue
652 * the same @work. E.g. consider this case: 652 * the same @work. E.g. consider this case:
653 * 653 *
654 * CPU#0 CPU#1 654 * CPU#0 CPU#1
@@ -1353,7 +1353,7 @@ static bool is_chained_work(struct workqueue_struct *wq)
1353 1353
1354 worker = current_wq_worker(); 1354 worker = current_wq_worker();
1355 /* 1355 /*
1356 * Return %true iff I'm a worker execuing a work item on @wq. If 1356 * Return %true iff I'm a worker executing a work item on @wq. If
1357 * I'm @worker, it's safe to dereference it without locking. 1357 * I'm @worker, it's safe to dereference it without locking.
1358 */ 1358 */
1359 return worker && worker->current_pwq->wq == wq; 1359 return worker && worker->current_pwq->wq == wq;
@@ -1735,7 +1735,7 @@ static void rcu_work_rcufn(struct rcu_head *rcu)
1735 * 1735 *
1736 * Return: %false if @rwork was already pending, %true otherwise. Note 1736 * Return: %false if @rwork was already pending, %true otherwise. Note
1737 * that a full RCU grace period is guaranteed only after a %true return. 1737 * that a full RCU grace period is guaranteed only after a %true return.
1738 * While @rwork is guarnateed to be executed after a %false return, the 1738 * While @rwork is guaranteed to be executed after a %false return, the
1739 * execution may happen before a full RCU grace period has passed. 1739 * execution may happen before a full RCU grace period has passed.
1740 */ 1740 */
1741bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork) 1741bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork)
@@ -3027,6 +3027,9 @@ static bool __flush_work(struct work_struct *work, bool from_cancel)
3027 if (WARN_ON(!wq_online)) 3027 if (WARN_ON(!wq_online))
3028 return false; 3028 return false;
3029 3029
3030 if (WARN_ON(!work->func))
3031 return false;
3032
3030 if (!from_cancel) { 3033 if (!from_cancel) {
3031 lock_map_acquire(&work->lockdep_map); 3034 lock_map_acquire(&work->lockdep_map);
3032 lock_map_release(&work->lockdep_map); 3035 lock_map_release(&work->lockdep_map);
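Besides the comment typo fixes, workqueue.c gains a guard: __flush_work() now WARNs and returns false when the work item's function pointer was never set, catching flushes of uninitialized (e.g. zeroed) work structs. A sketch of the contract ("demo" names are illustrative):

	static void demo_fn(struct work_struct *work)
	{
		/* the actual work */
	}

	static void demo(void)
	{
		struct work_struct good;
		struct work_struct bad = { };	/* never INIT_WORK'ed */

		INIT_WORK(&good, demo_fn);
		schedule_work(&good);
		flush_work(&good);	/* fine: work->func is set */

		flush_work(&bad);	/* now WARNs and returns false */
	}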
diff --git a/mm/gup.c b/mm/gup.c
index 22291db50013..f84e22685aaa 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1939,7 +1939,7 @@ static void gup_pgd_range(unsigned long addr, unsigned long end,
1939 * Check if it's allowed to use __get_user_pages_fast() for the range, or 1939 * Check if it's allowed to use __get_user_pages_fast() for the range, or
1940 * we need to fall back to the slow version: 1940 * we need to fall back to the slow version:
1941 */ 1941 */
1942bool gup_fast_permitted(unsigned long start, int nr_pages, int write) 1942bool gup_fast_permitted(unsigned long start, int nr_pages)
1943{ 1943{
1944 unsigned long len, end; 1944 unsigned long len, end;
1945 1945
@@ -1981,7 +1981,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
1981 * block IPIs that come from THPs splitting. 1981 * block IPIs that come from THPs splitting.
1982 */ 1982 */
1983 1983
1984 if (gup_fast_permitted(start, nr_pages, write)) { 1984 if (gup_fast_permitted(start, nr_pages)) {
1985 local_irq_save(flags); 1985 local_irq_save(flags);
1986 gup_pgd_range(start, end, write, pages, &nr); 1986 gup_pgd_range(start, end, write, pages, &nr);
1987 local_irq_restore(flags); 1987 local_irq_restore(flags);
@@ -2023,7 +2023,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
2023 if (unlikely(!access_ok((void __user *)start, len))) 2023 if (unlikely(!access_ok((void __user *)start, len)))
2024 return -EFAULT; 2024 return -EFAULT;
2025 2025
2026 if (gup_fast_permitted(start, nr_pages, write)) { 2026 if (gup_fast_permitted(start, nr_pages)) {
2027 local_irq_disable(); 2027 local_irq_disable();
2028 gup_pgd_range(addr, end, write, pages, &nr); 2028 gup_pgd_range(addr, end, write, pages, &nr);
2029 local_irq_enable(); 2029 local_irq_enable();
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index 0f643dc2dc65..b68d5df14731 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -67,7 +67,7 @@ static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
67 pcpu_set_page_chunk(nth_page(pages, i), chunk); 67 pcpu_set_page_chunk(nth_page(pages, i), chunk);
68 68
69 chunk->data = pages; 69 chunk->data = pages;
70 chunk->base_addr = page_address(pages) - pcpu_group_offsets[0]; 70 chunk->base_addr = page_address(pages);
71 71
72 spin_lock_irqsave(&pcpu_lock, flags); 72 spin_lock_irqsave(&pcpu_lock, flags);
73 pcpu_chunk_populated(chunk, 0, nr_pages, false); 73 pcpu_chunk_populated(chunk, 0, nr_pages, false);
diff --git a/mm/percpu.c b/mm/percpu.c
index db86282fd024..c5c750781628 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -2384,7 +2384,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
2384 ai->atom_size = atom_size; 2384 ai->atom_size = atom_size;
2385 ai->alloc_size = alloc_size; 2385 ai->alloc_size = alloc_size;
2386 2386
2387 for (group = 0, unit = 0; group_cnt[group]; group++) { 2387 for (group = 0, unit = 0; group < nr_groups; group++) {
2388 struct pcpu_group_info *gi = &ai->groups[group]; 2388 struct pcpu_group_info *gi = &ai->groups[group];
2389 2389
2390 /* 2390 /*
diff --git a/security/Kconfig b/security/Kconfig
index e4fe2f3c2c65..1d6463fb1450 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -40,8 +40,7 @@ config SECURITYFS
40 bool "Enable the securityfs filesystem" 40 bool "Enable the securityfs filesystem"
41 help 41 help
42 This will build the securityfs filesystem. It is currently used by 42 This will build the securityfs filesystem. It is currently used by
43 the TPM bios character driver and IMA, an integrity provider. It is 43 various security modules (AppArmor, IMA, SafeSetID, TOMOYO, TPM).
44 not used by SELinux or SMACK.
45 44
46 If you are unsure how to answer this question, answer N. 45 If you are unsure how to answer this question, answer N.
47 46
@@ -236,45 +235,19 @@ source "security/tomoyo/Kconfig"
236source "security/apparmor/Kconfig" 235source "security/apparmor/Kconfig"
237source "security/loadpin/Kconfig" 236source "security/loadpin/Kconfig"
238source "security/yama/Kconfig" 237source "security/yama/Kconfig"
238source "security/safesetid/Kconfig"
239 239
240source "security/integrity/Kconfig" 240source "security/integrity/Kconfig"
241 241
242choice 242config LSM
243 prompt "Default security module" 243 string "Ordered list of enabled LSMs"
244 default DEFAULT_SECURITY_SELINUX if SECURITY_SELINUX 244 default "yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor"
245 default DEFAULT_SECURITY_SMACK if SECURITY_SMACK
246 default DEFAULT_SECURITY_TOMOYO if SECURITY_TOMOYO
247 default DEFAULT_SECURITY_APPARMOR if SECURITY_APPARMOR
248 default DEFAULT_SECURITY_DAC
249
250 help 245 help
251 Select the security module that will be used by default if the 246 A comma-separated list of LSMs, in initialization order.
252 kernel parameter security= is not specified. 247 Any LSMs left off this list will be ignored. This can be
253 248 controlled at boot with the "lsm=" parameter.
254 config DEFAULT_SECURITY_SELINUX
255 bool "SELinux" if SECURITY_SELINUX=y
256
257 config DEFAULT_SECURITY_SMACK
258 bool "Simplified Mandatory Access Control" if SECURITY_SMACK=y
259
260 config DEFAULT_SECURITY_TOMOYO
261 bool "TOMOYO" if SECURITY_TOMOYO=y
262
263 config DEFAULT_SECURITY_APPARMOR
264 bool "AppArmor" if SECURITY_APPARMOR=y
265
266 config DEFAULT_SECURITY_DAC
267 bool "Unix Discretionary Access Controls"
268
269endchoice
270 249
271config DEFAULT_SECURITY 250 If unsure, leave this as the default.
272 string
273 default "selinux" if DEFAULT_SECURITY_SELINUX
274 default "smack" if DEFAULT_SECURITY_SMACK
275 default "tomoyo" if DEFAULT_SECURITY_TOMOYO
276 default "apparmor" if DEFAULT_SECURITY_APPARMOR
277 default "" if DEFAULT_SECURITY_DAC
278 251
279endmenu 252endmenu
280 253
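The security/Kconfig hunk retires the single "default security module" choice in favor of CONFIG_LSM, an ordered comma-separated list that determines both which LSMs initialize and in what order; the same list can be overridden at boot with the "lsm=" parameter named in the help text. An illustrative boot parameter (the selection here is an example, not a recommendation):

	lsm=loadpin,safesetid,integrity,apparmor

Any LSM absent from the effective list is simply not initialized.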
diff --git a/security/Makefile b/security/Makefile
index 4d2d3782ddef..c598b904938f 100644
--- a/security/Makefile
+++ b/security/Makefile
@@ -10,6 +10,7 @@ subdir-$(CONFIG_SECURITY_TOMOYO) += tomoyo
10subdir-$(CONFIG_SECURITY_APPARMOR) += apparmor 10subdir-$(CONFIG_SECURITY_APPARMOR) += apparmor
11subdir-$(CONFIG_SECURITY_YAMA) += yama 11subdir-$(CONFIG_SECURITY_YAMA) += yama
12subdir-$(CONFIG_SECURITY_LOADPIN) += loadpin 12subdir-$(CONFIG_SECURITY_LOADPIN) += loadpin
13subdir-$(CONFIG_SECURITY_SAFESETID) += safesetid
13 14
14# always enable default capabilities 15# always enable default capabilities
15obj-y += commoncap.o 16obj-y += commoncap.o
@@ -25,6 +26,7 @@ obj-$(CONFIG_SECURITY_TOMOYO) += tomoyo/
25obj-$(CONFIG_SECURITY_APPARMOR) += apparmor/ 26obj-$(CONFIG_SECURITY_APPARMOR) += apparmor/
26obj-$(CONFIG_SECURITY_YAMA) += yama/ 27obj-$(CONFIG_SECURITY_YAMA) += yama/
27obj-$(CONFIG_SECURITY_LOADPIN) += loadpin/ 28obj-$(CONFIG_SECURITY_LOADPIN) += loadpin/
29obj-$(CONFIG_SECURITY_SAFESETID) += safesetid/
28obj-$(CONFIG_CGROUP_DEVICE) += device_cgroup.o 30obj-$(CONFIG_CGROUP_DEVICE) += device_cgroup.o
29 31
30# Object integrity file lists 32# Object integrity file lists
diff --git a/security/apparmor/Kconfig b/security/apparmor/Kconfig
index b6b68a7750ce..3de21f46c82a 100644
--- a/security/apparmor/Kconfig
+++ b/security/apparmor/Kconfig
@@ -14,22 +14,6 @@ config SECURITY_APPARMOR
14 14
15 If you are unsure how to answer this question, answer N. 15 If you are unsure how to answer this question, answer N.
16 16
17config SECURITY_APPARMOR_BOOTPARAM_VALUE
18 int "AppArmor boot parameter default value"
19 depends on SECURITY_APPARMOR
20 range 0 1
21 default 1
22 help
23 This option sets the default value for the kernel parameter
24 'apparmor', which allows AppArmor to be enabled or disabled
25 at boot. If this option is set to 0 (zero), the AppArmor
26 kernel parameter will default to 0, disabling AppArmor at
27 boot. If this option is set to 1 (one), the AppArmor
28 kernel parameter will default to 1, enabling AppArmor at
29 boot.
30
31 If you are unsure how to answer this question, answer 1.
32
33config SECURITY_APPARMOR_HASH 17config SECURITY_APPARMOR_HASH
34 bool "Enable introspection of sha1 hashes for loaded profiles" 18 bool "Enable introspection of sha1 hashes for loaded profiles"
35 depends on SECURITY_APPARMOR 19 depends on SECURITY_APPARMOR
diff --git a/security/apparmor/audit.c b/security/apparmor/audit.c
index eeaddfe0c0fb..5a8b9cded4f2 100644
--- a/security/apparmor/audit.c
+++ b/security/apparmor/audit.c
@@ -225,8 +225,7 @@ int aa_audit_rule_known(struct audit_krule *rule)
225 return 0; 225 return 0;
226} 226}
227 227
228int aa_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule, 228int aa_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule)
229 struct audit_context *actx)
230{ 229{
231 struct aa_audit_rule *rule = vrule; 230 struct aa_audit_rule *rule = vrule;
232 struct aa_label *label; 231 struct aa_label *label;
diff --git a/security/apparmor/capability.c b/security/apparmor/capability.c
index 253ef6e9d445..752f73980e30 100644
--- a/security/apparmor/capability.c
+++ b/security/apparmor/capability.c
@@ -110,13 +110,13 @@ static int audit_caps(struct common_audit_data *sa, struct aa_profile *profile,
110 * profile_capable - test if profile allows use of capability @cap 110 * profile_capable - test if profile allows use of capability @cap
111 * @profile: profile being enforced (NOT NULL, NOT unconfined) 111 * @profile: profile being enforced (NOT NULL, NOT unconfined)
112 * @cap: capability to test if allowed 112 * @cap: capability to test if allowed
113 * @audit: whether an audit record should be generated 113 * @opts: CAP_OPT_NOAUDIT bit determines whether audit record is generated
114 * @sa: audit data (MAY BE NULL indicating no auditing) 114 * @sa: audit data (MAY BE NULL indicating no auditing)
115 * 115 *
116 * Returns: 0 if allowed else -EPERM 116 * Returns: 0 if allowed else -EPERM
117 */ 117 */
118static int profile_capable(struct aa_profile *profile, int cap, int audit, 118static int profile_capable(struct aa_profile *profile, int cap,
119 struct common_audit_data *sa) 119 unsigned int opts, struct common_audit_data *sa)
120{ 120{
121 int error; 121 int error;
122 122
@@ -126,7 +126,7 @@ static int profile_capable(struct aa_profile *profile, int cap, int audit,
126 else 126 else
127 error = -EPERM; 127 error = -EPERM;
128 128
129 if (audit == SECURITY_CAP_NOAUDIT) { 129 if (opts & CAP_OPT_NOAUDIT) {
130 if (!COMPLAIN_MODE(profile)) 130 if (!COMPLAIN_MODE(profile))
131 return error; 131 return error;
132 /* audit the cap request in complain mode but note that it 132 /* audit the cap request in complain mode but note that it
@@ -142,13 +142,13 @@ static int profile_capable(struct aa_profile *profile, int cap, int audit,
142 * aa_capable - test permission to use capability 142 * aa_capable - test permission to use capability
143 * @label: label being tested for capability (NOT NULL) 143 * @label: label being tested for capability (NOT NULL)
144 * @cap: capability to be tested 144 * @cap: capability to be tested
145 * @audit: whether an audit record should be generated 145 * @opts: CAP_OPT_NOAUDIT bit determines whether audit record is generated
146 * 146 *
147 * Look up capability in profile capability set. 147 * Look up capability in profile capability set.
148 * 148 *
149 * Returns: 0 on success, or else an error code. 149 * Returns: 0 on success, or else an error code.
150 */ 150 */
151int aa_capable(struct aa_label *label, int cap, int audit) 151int aa_capable(struct aa_label *label, int cap, unsigned int opts)
152{ 152{
153 struct aa_profile *profile; 153 struct aa_profile *profile;
154 int error = 0; 154 int error = 0;
@@ -156,7 +156,7 @@ int aa_capable(struct aa_label *label, int cap, int audit)
156 156
157 sa.u.cap = cap; 157 sa.u.cap = cap;
158 error = fn_for_each_confined(label, profile, 158 error = fn_for_each_confined(label, profile,
159 profile_capable(profile, cap, audit, &sa)); 159 profile_capable(profile, cap, opts, &sa));
160 160
161 return error; 161 return error;
162} 162}
diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c
index 11975ec8d566..ca2dccf5b445 100644
--- a/security/apparmor/domain.c
+++ b/security/apparmor/domain.c
@@ -572,7 +572,7 @@ static struct aa_label *x_to_label(struct aa_profile *profile,
572 stack = NULL; 572 stack = NULL;
573 break; 573 break;
574 } 574 }
575 /* fall through to X_NAME */ 575 /* fall through - to X_NAME */
576 case AA_X_NAME: 576 case AA_X_NAME:
577 if (xindex & AA_X_CHILD) 577 if (xindex & AA_X_CHILD)
578 /* released by caller */ 578 /* released by caller */
@@ -975,7 +975,7 @@ int apparmor_bprm_set_creds(struct linux_binprm *bprm)
975 } 975 }
976 aa_put_label(cred_label(bprm->cred)); 976 aa_put_label(cred_label(bprm->cred));
977 /* transfer reference, released when cred is freed */ 977 /* transfer reference, released when cred is freed */
978 cred_label(bprm->cred) = new; 978 set_cred_label(bprm->cred, new);
979 979
980done: 980done:
981 aa_put_label(label); 981 aa_put_label(label);
diff --git a/security/apparmor/include/audit.h b/security/apparmor/include/audit.h
index b8c8b1066b0a..ee559bc2acb8 100644
--- a/security/apparmor/include/audit.h
+++ b/security/apparmor/include/audit.h
@@ -192,7 +192,6 @@ static inline int complain_error(int error)
192void aa_audit_rule_free(void *vrule); 192void aa_audit_rule_free(void *vrule);
193int aa_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule); 193int aa_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule);
194int aa_audit_rule_known(struct audit_krule *rule); 194int aa_audit_rule_known(struct audit_krule *rule);
195int aa_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule, 195int aa_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule);
196 struct audit_context *actx);
197 196
198#endif /* __AA_AUDIT_H */ 197#endif /* __AA_AUDIT_H */
diff --git a/security/apparmor/include/capability.h b/security/apparmor/include/capability.h
index e0304e2aeb7f..1b3663b6ab12 100644
--- a/security/apparmor/include/capability.h
+++ b/security/apparmor/include/capability.h
@@ -40,7 +40,7 @@ struct aa_caps {
40 40
41extern struct aa_sfs_entry aa_sfs_entry_caps[]; 41extern struct aa_sfs_entry aa_sfs_entry_caps[];
42 42
43int aa_capable(struct aa_label *label, int cap, int audit); 43int aa_capable(struct aa_label *label, int cap, unsigned int opts);
44 44
45static inline void aa_free_cap_rules(struct aa_caps *caps) 45static inline void aa_free_cap_rules(struct aa_caps *caps)
46{ 46{
diff --git a/security/apparmor/include/cred.h b/security/apparmor/include/cred.h
index 265ae6641a06..b9504a05fddc 100644
--- a/security/apparmor/include/cred.h
+++ b/security/apparmor/include/cred.h
@@ -23,8 +23,22 @@
23#include "policy_ns.h" 23#include "policy_ns.h"
24#include "task.h" 24#include "task.h"
25 25
26#define cred_label(X) ((X)->security) 26static inline struct aa_label *cred_label(const struct cred *cred)
27{
28 struct aa_label **blob = cred->security + apparmor_blob_sizes.lbs_cred;
29
30 AA_BUG(!blob);
31 return *blob;
32}
27 33
34static inline void set_cred_label(const struct cred *cred,
35 struct aa_label *label)
36{
37 struct aa_label **blob = cred->security + apparmor_blob_sizes.lbs_cred;
38
39 AA_BUG(!blob);
40 *blob = label;
41}
28 42
29/** 43/**
30 * aa_cred_raw_label - obtain cred's label 44 * aa_cred_raw_label - obtain cred's label
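
The cred_label() macro becomes an inline accessor that indexes into a security blob shared by every stacked LSM, using the module's byte offset recorded in apparmor_blob_sizes; the file.h and task.h hunks below apply the same pattern to file and task blobs. A self-contained sketch of the offset scheme, with toy types and an assumed offset of 8 bytes:

    #include <assert.h>
    #include <stdlib.h>

    /* Toy analogue of struct lsm_blob_sizes: once the framework has
     * resolved sizes, each field holds this module's byte offset. */
    struct blob_sizes { int lbs_cred; };
    static struct blob_sizes module_blob = { .lbs_cred = 8 };  /* assumed */

    struct cred { void *security; };  /* one shared blob for all LSMs */
    struct label { int id; };

    static struct label *cred_label(const struct cred *cred)
    {
        return *(struct label **)((char *)cred->security + module_blob.lbs_cred);
    }

    static void set_cred_label(const struct cred *cred, struct label *label)
    {
        *(struct label **)((char *)cred->security + module_blob.lbs_cred) = label;
    }

    int main(void)
    {
        struct cred c = { .security = calloc(1, 32) };  /* 32-byte blob */
        struct label l = { .id = 1 };

        set_cred_label(&c, &l);
        assert(cred_label(&c)->id == 1);
        free(c.security);
        return 0;
    }
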
diff --git a/security/apparmor/include/file.h b/security/apparmor/include/file.h
index 4c2c8ac8842f..8be09208cf7c 100644
--- a/security/apparmor/include/file.h
+++ b/security/apparmor/include/file.h
@@ -32,7 +32,10 @@ struct path;
32 AA_MAY_CHMOD | AA_MAY_CHOWN | AA_MAY_LOCK | \ 32 AA_MAY_CHMOD | AA_MAY_CHOWN | AA_MAY_LOCK | \
33 AA_EXEC_MMAP | AA_MAY_LINK) 33 AA_EXEC_MMAP | AA_MAY_LINK)
34 34
35#define file_ctx(X) ((struct aa_file_ctx *)(X)->f_security) 35static inline struct aa_file_ctx *file_ctx(struct file *file)
36{
37 return file->f_security + apparmor_blob_sizes.lbs_file;
38}
36 39
37/* struct aa_file_ctx - the AppArmor context the file was opened in 40/* struct aa_file_ctx - the AppArmor context the file was opened in
38 * @lock: lock to update the ctx 41 * @lock: lock to update the ctx
diff --git a/security/apparmor/include/lib.h b/security/apparmor/include/lib.h
index 6505e1ad9e23..bbe9b384d71d 100644
--- a/security/apparmor/include/lib.h
+++ b/security/apparmor/include/lib.h
@@ -16,6 +16,7 @@
16 16
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/lsm_hooks.h>
19 20
20#include "match.h" 21#include "match.h"
21 22
@@ -55,6 +56,9 @@ const char *aa_splitn_fqname(const char *fqname, size_t n, const char **ns_name,
55 size_t *ns_len); 56 size_t *ns_len);
56void aa_info_message(const char *str); 57void aa_info_message(const char *str);
57 58
59/* Security blob offsets */
60extern struct lsm_blob_sizes apparmor_blob_sizes;
61
58/** 62/**
59 * aa_strneq - compare null terminated @str to a non null terminated substring 63 * aa_strneq - compare null terminated @str to a non null terminated substring
60 * @str: a null terminated string 64 * @str: a null terminated string
diff --git a/security/apparmor/include/task.h b/security/apparmor/include/task.h
index 55edaa1d83f8..311e652324e3 100644
--- a/security/apparmor/include/task.h
+++ b/security/apparmor/include/task.h
@@ -14,7 +14,10 @@
14#ifndef __AA_TASK_H 14#ifndef __AA_TASK_H
15#define __AA_TASK_H 15#define __AA_TASK_H
16 16
17#define task_ctx(X) ((X)->security) 17static inline struct aa_task_ctx *task_ctx(struct task_struct *task)
18{
19 return task->security + apparmor_blob_sizes.lbs_task;
20}
18 21
19/* 22/*
20 * struct aa_task_ctx - information for current task label change 23 * struct aa_task_ctx - information for current task label change
@@ -37,17 +40,6 @@ int aa_restore_previous_label(u64 cookie);
37struct aa_label *aa_get_task_label(struct task_struct *task); 40struct aa_label *aa_get_task_label(struct task_struct *task);
38 41
39/** 42/**
40 * aa_alloc_task_ctx - allocate a new task_ctx
41 * @flags: gfp flags for allocation
42 *
43 * Returns: allocated buffer or NULL on failure
44 */
45static inline struct aa_task_ctx *aa_alloc_task_ctx(gfp_t flags)
46{
47 return kzalloc(sizeof(struct aa_task_ctx), flags);
48}
49
50/**
51 * aa_free_task_ctx - free a task_ctx 43 * aa_free_task_ctx - free a task_ctx
52 * @ctx: task_ctx to free (MAYBE NULL) 44 * @ctx: task_ctx to free (MAYBE NULL)
53 */ 45 */
@@ -57,8 +49,6 @@ static inline void aa_free_task_ctx(struct aa_task_ctx *ctx)
57 aa_put_label(ctx->nnp); 49 aa_put_label(ctx->nnp);
58 aa_put_label(ctx->previous); 50 aa_put_label(ctx->previous);
59 aa_put_label(ctx->onexec); 51 aa_put_label(ctx->onexec);
60
61 kzfree(ctx);
62 } 52 }
63} 53}
64 54
diff --git a/security/apparmor/ipc.c b/security/apparmor/ipc.c
index 527ea1557120..aacd1e95cb59 100644
--- a/security/apparmor/ipc.c
+++ b/security/apparmor/ipc.c
@@ -107,7 +107,8 @@ static int profile_tracer_perm(struct aa_profile *tracer,
107 aad(sa)->label = &tracer->label; 107 aad(sa)->label = &tracer->label;
108 aad(sa)->peer = tracee; 108 aad(sa)->peer = tracee;
109 aad(sa)->request = 0; 109 aad(sa)->request = 0;
110 aad(sa)->error = aa_capable(&tracer->label, CAP_SYS_PTRACE, 1); 110 aad(sa)->error = aa_capable(&tracer->label, CAP_SYS_PTRACE,
111 CAP_OPT_NONE);
111 112
112 return aa_audit(AUDIT_APPARMOR_AUTO, tracer, sa, audit_ptrace_cb); 113 return aa_audit(AUDIT_APPARMOR_AUTO, tracer, sa, audit_ptrace_cb);
113} 114}
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index 8db1731d046a..49d664ddff44 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -60,7 +60,7 @@ DEFINE_PER_CPU(struct aa_buffers, aa_buffers);
60static void apparmor_cred_free(struct cred *cred) 60static void apparmor_cred_free(struct cred *cred)
61{ 61{
62 aa_put_label(cred_label(cred)); 62 aa_put_label(cred_label(cred));
63 cred_label(cred) = NULL; 63 set_cred_label(cred, NULL);
64} 64}
65 65
66/* 66/*
@@ -68,7 +68,7 @@ static void apparmor_cred_free(struct cred *cred)
68 */ 68 */
69static int apparmor_cred_alloc_blank(struct cred *cred, gfp_t gfp) 69static int apparmor_cred_alloc_blank(struct cred *cred, gfp_t gfp)
70{ 70{
71 cred_label(cred) = NULL; 71 set_cred_label(cred, NULL);
72 return 0; 72 return 0;
73} 73}
74 74
@@ -78,7 +78,7 @@ static int apparmor_cred_alloc_blank(struct cred *cred, gfp_t gfp)
78static int apparmor_cred_prepare(struct cred *new, const struct cred *old, 78static int apparmor_cred_prepare(struct cred *new, const struct cred *old,
79 gfp_t gfp) 79 gfp_t gfp)
80{ 80{
81 cred_label(new) = aa_get_newest_label(cred_label(old)); 81 set_cred_label(new, aa_get_newest_label(cred_label(old)));
82 return 0; 82 return 0;
83} 83}
84 84
@@ -87,26 +87,21 @@ static int apparmor_cred_prepare(struct cred *new, const struct cred *old,
87 */ 87 */
88static void apparmor_cred_transfer(struct cred *new, const struct cred *old) 88static void apparmor_cred_transfer(struct cred *new, const struct cred *old)
89{ 89{
90 cred_label(new) = aa_get_newest_label(cred_label(old)); 90 set_cred_label(new, aa_get_newest_label(cred_label(old)));
91} 91}
92 92
93static void apparmor_task_free(struct task_struct *task) 93static void apparmor_task_free(struct task_struct *task)
94{ 94{
95 95
96 aa_free_task_ctx(task_ctx(task)); 96 aa_free_task_ctx(task_ctx(task));
97 task_ctx(task) = NULL;
98} 97}
99 98
100static int apparmor_task_alloc(struct task_struct *task, 99static int apparmor_task_alloc(struct task_struct *task,
101 unsigned long clone_flags) 100 unsigned long clone_flags)
102{ 101{
103 struct aa_task_ctx *new = aa_alloc_task_ctx(GFP_KERNEL); 102 struct aa_task_ctx *new = task_ctx(task);
104
105 if (!new)
106 return -ENOMEM;
107 103
108 aa_dup_task_ctx(new, task_ctx(current)); 104 aa_dup_task_ctx(new, task_ctx(current));
109 task_ctx(task) = new;
110 105
111 return 0; 106 return 0;
112} 107}
@@ -177,14 +172,14 @@ static int apparmor_capget(struct task_struct *target, kernel_cap_t *effective,
177} 172}
178 173
179static int apparmor_capable(const struct cred *cred, struct user_namespace *ns, 174static int apparmor_capable(const struct cred *cred, struct user_namespace *ns,
180 int cap, int audit) 175 int cap, unsigned int opts)
181{ 176{
182 struct aa_label *label; 177 struct aa_label *label;
183 int error = 0; 178 int error = 0;
184 179
185 label = aa_get_newest_cred_label(cred); 180 label = aa_get_newest_cred_label(cred);
186 if (!unconfined(label)) 181 if (!unconfined(label))
187 error = aa_capable(label, cap, audit); 182 error = aa_capable(label, cap, opts);
188 aa_put_label(label); 183 aa_put_label(label);
189 184
190 return error; 185 return error;
@@ -434,21 +429,21 @@ static int apparmor_file_open(struct file *file)
434 429
435static int apparmor_file_alloc_security(struct file *file) 430static int apparmor_file_alloc_security(struct file *file)
436{ 431{
437 int error = 0; 432 struct aa_file_ctx *ctx = file_ctx(file);
438
439 /* freed by apparmor_file_free_security */
440 struct aa_label *label = begin_current_label_crit_section(); 433 struct aa_label *label = begin_current_label_crit_section();
441 file->f_security = aa_alloc_file_ctx(label, GFP_KERNEL);
442 if (!file_ctx(file))
443 error = -ENOMEM;
444 end_current_label_crit_section(label);
445 434
446 return error; 435 spin_lock_init(&ctx->lock);
436 rcu_assign_pointer(ctx->label, aa_get_label(label));
437 end_current_label_crit_section(label);
438 return 0;
447} 439}
448 440
449static void apparmor_file_free_security(struct file *file) 441static void apparmor_file_free_security(struct file *file)
450{ 442{
451 aa_free_file_ctx(file_ctx(file)); 443 struct aa_file_ctx *ctx = file_ctx(file);
444
445 if (ctx)
446 aa_put_label(rcu_access_pointer(ctx->label));
452} 447}
453 448
454static int common_file_perm(const char *op, struct file *file, u32 mask) 449static int common_file_perm(const char *op, struct file *file, u32 mask)
@@ -1151,6 +1146,15 @@ static int apparmor_inet_conn_request(struct sock *sk, struct sk_buff *skb,
1151} 1146}
1152#endif 1147#endif
1153 1148
1149/*
1150 * The cred blob is a pointer to, not an instance of, an aa_task_ctx.
1151 */
1152struct lsm_blob_sizes apparmor_blob_sizes __lsm_ro_after_init = {
1153 .lbs_cred = sizeof(struct aa_task_ctx *),
1154 .lbs_file = sizeof(struct aa_file_ctx),
1155 .lbs_task = sizeof(struct aa_task_ctx),
1156};
1157
1154static struct security_hook_list apparmor_hooks[] __lsm_ro_after_init = { 1158static struct security_hook_list apparmor_hooks[] __lsm_ro_after_init = {
1155 LSM_HOOK_INIT(ptrace_access_check, apparmor_ptrace_access_check), 1159 LSM_HOOK_INIT(ptrace_access_check, apparmor_ptrace_access_check),
1156 LSM_HOOK_INIT(ptrace_traceme, apparmor_ptrace_traceme), 1160 LSM_HOOK_INIT(ptrace_traceme, apparmor_ptrace_traceme),
@@ -1333,8 +1337,8 @@ bool aa_g_paranoid_load = true;
1333module_param_named(paranoid_load, aa_g_paranoid_load, aabool, S_IRUGO); 1337module_param_named(paranoid_load, aa_g_paranoid_load, aabool, S_IRUGO);
1334 1338
1335/* Boot time disable flag */ 1339/* Boot time disable flag */
1336static bool apparmor_enabled = CONFIG_SECURITY_APPARMOR_BOOTPARAM_VALUE; 1340static int apparmor_enabled __lsm_ro_after_init = 1;
1337module_param_named(enabled, apparmor_enabled, bool, S_IRUGO); 1341module_param_named(enabled, apparmor_enabled, int, 0444);
1338 1342
1339static int __init apparmor_enabled_setup(char *str) 1343static int __init apparmor_enabled_setup(char *str)
1340{ 1344{
@@ -1479,14 +1483,8 @@ static int param_set_mode(const char *val, const struct kernel_param *kp)
1479static int __init set_init_ctx(void) 1483static int __init set_init_ctx(void)
1480{ 1484{
1481 struct cred *cred = (struct cred *)current->real_cred; 1485 struct cred *cred = (struct cred *)current->real_cred;
1482 struct aa_task_ctx *ctx;
1483
1484 ctx = aa_alloc_task_ctx(GFP_KERNEL);
1485 if (!ctx)
1486 return -ENOMEM;
1487 1486
1488 cred_label(cred) = aa_get_label(ns_unconfined(root_ns)); 1487 set_cred_label(cred, aa_get_label(ns_unconfined(root_ns)));
1489 task_ctx(current) = ctx;
1490 1488
1491 return 0; 1489 return 0;
1492} 1490}
@@ -1665,12 +1663,6 @@ static int __init apparmor_init(void)
1665{ 1663{
1666 int error; 1664 int error;
1667 1665
1668 if (!apparmor_enabled || !security_module_enable("apparmor")) {
1669 aa_info_message("AppArmor disabled by boot time parameter");
1670 apparmor_enabled = false;
1671 return 0;
1672 }
1673
1674 aa_secids_init(); 1666 aa_secids_init();
1675 1667
1676 error = aa_setup_dfa_engine(); 1668 error = aa_setup_dfa_engine();
@@ -1731,5 +1723,8 @@ alloc_out:
1731 1723
1732DEFINE_LSM(apparmor) = { 1724DEFINE_LSM(apparmor) = {
1733 .name = "apparmor", 1725 .name = "apparmor",
1726 .flags = LSM_FLAG_LEGACY_MAJOR | LSM_FLAG_EXCLUSIVE,
1727 .enabled = &apparmor_enabled,
1728 .blobs = &apparmor_blob_sizes,
1734 .init = apparmor_init, 1729 .init = apparmor_init,
1735}; 1730};
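
With the boot-parameter check removed from apparmor_init(), enablement, exclusivity, and blob sizing are all declared up front through DEFINE_LSM() and acted on by the framework (see the security/security.c rework at the end of this diff, which also adds the lsm= ordering parameter). A compilable userspace sketch of that registration shape; the struct mirrors only the fields exercised in this diff and is not the kernel's lsm_info definition, and the blob sizes are assumed values:

    #include <stdio.h>

    #define LSM_FLAG_LEGACY_MAJOR 0x1
    #define LSM_FLAG_EXCLUSIVE    0x2

    struct lsm_blob_sizes { int lbs_cred, lbs_file, lbs_task; };

    struct lsm_info {
        const char *name;
        unsigned long flags;
        int *enabled;                  /* framework flips this at boot */
        struct lsm_blob_sizes *blobs;  /* sizes resolved to offsets */
        int (*init)(void);
    };

    static int apparmor_enabled = 1;
    static struct lsm_blob_sizes apparmor_blobs = {
        .lbs_cred = sizeof(void *), .lbs_file = 24, .lbs_task = 24,
    };

    static int apparmor_init(void) { puts("AppArmor: initialized"); return 0; }

    static struct lsm_info apparmor_lsm = {
        .name    = "apparmor",
        .flags   = LSM_FLAG_LEGACY_MAJOR | LSM_FLAG_EXCLUSIVE,
        .enabled = &apparmor_enabled,
        .blobs   = &apparmor_blobs,
        .init    = apparmor_init,
    };

    int main(void)
    {
        if (*apparmor_lsm.enabled)     /* framework-style gate */
            apparmor_lsm.init();
        return 0;
    }
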
diff --git a/security/apparmor/resource.c b/security/apparmor/resource.c
index 95fd26d09757..552ed09cb47e 100644
--- a/security/apparmor/resource.c
+++ b/security/apparmor/resource.c
@@ -124,7 +124,7 @@ int aa_task_setrlimit(struct aa_label *label, struct task_struct *task,
124 */ 124 */
125 125
126 if (label != peer && 126 if (label != peer &&
127 aa_capable(label, CAP_SYS_RESOURCE, SECURITY_CAP_NOAUDIT) != 0) 127 aa_capable(label, CAP_SYS_RESOURCE, CAP_OPT_NOAUDIT) != 0)
128 error = fn_for_each(label, profile, 128 error = fn_for_each(label, profile,
129 audit_resource(profile, resource, 129 audit_resource(profile, resource,
130 new_rlim->rlim_max, peer, 130 new_rlim->rlim_max, peer,
diff --git a/security/apparmor/task.c b/security/apparmor/task.c
index c6b78a14da91..4551110f0496 100644
--- a/security/apparmor/task.c
+++ b/security/apparmor/task.c
@@ -81,7 +81,7 @@ int aa_replace_current_label(struct aa_label *label)
81 */ 81 */
82 aa_get_label(label); 82 aa_get_label(label);
83 aa_put_label(cred_label(new)); 83 aa_put_label(cred_label(new));
84 cred_label(new) = label; 84 set_cred_label(new, label);
85 85
86 commit_creds(new); 86 commit_creds(new);
87 return 0; 87 return 0;
@@ -138,7 +138,7 @@ int aa_set_current_hat(struct aa_label *label, u64 token)
138 return -EACCES; 138 return -EACCES;
139 } 139 }
140 140
141 cred_label(new) = aa_get_newest_label(label); 141 set_cred_label(new, aa_get_newest_label(label));
142 /* clear exec on switching context */ 142 /* clear exec on switching context */
143 aa_put_label(ctx->onexec); 143 aa_put_label(ctx->onexec);
144 ctx->onexec = NULL; 144 ctx->onexec = NULL;
@@ -172,7 +172,7 @@ int aa_restore_previous_label(u64 token)
172 return -ENOMEM; 172 return -ENOMEM;
173 173
174 aa_put_label(cred_label(new)); 174 aa_put_label(cred_label(new));
175 cred_label(new) = aa_get_newest_label(ctx->previous); 175 set_cred_label(new, aa_get_newest_label(ctx->previous));
176 AA_BUG(!cred_label(new)); 176 AA_BUG(!cred_label(new));
177 /* clear exec && prev information when restoring to previous context */ 177 /* clear exec && prev information when restoring to previous context */
178 aa_clear_task_ctx_trans(ctx); 178 aa_clear_task_ctx_trans(ctx);
diff --git a/security/commoncap.c b/security/commoncap.c
index 232db019f051..c477fb673701 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -57,7 +57,7 @@ static void warn_setuid_and_fcaps_mixed(const char *fname)
57 * @cred: The credentials to use 57 * @cred: The credentials to use
58 * @ns: The user namespace in which we need the capability 58 * @ns: The user namespace in which we need the capability
59 * @cap: The capability to check for 59 * @cap: The capability to check for
60 * @audit: Whether to write an audit message or not 60 * @opts: Bitmask of options defined in include/linux/security.h
61 * 61 *
62 * Determine whether the nominated task has the specified capability amongst 62 * Determine whether the nominated task has the specified capability amongst
63 * its effective set, returning 0 if it does, -ve if it does not. 63 * its effective set, returning 0 if it does, -ve if it does not.
@@ -68,7 +68,7 @@ static void warn_setuid_and_fcaps_mixed(const char *fname)
68 * kernel's capable() and has_capability() returns 1 for this case. 68 * kernel's capable() and has_capability() returns 1 for this case.
69 */ 69 */
70int cap_capable(const struct cred *cred, struct user_namespace *targ_ns, 70int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
71 int cap, int audit) 71 int cap, unsigned int opts)
72{ 72{
73 struct user_namespace *ns = targ_ns; 73 struct user_namespace *ns = targ_ns;
74 74
@@ -222,12 +222,11 @@ int cap_capget(struct task_struct *target, kernel_cap_t *effective,
222 */ 222 */
223static inline int cap_inh_is_capped(void) 223static inline int cap_inh_is_capped(void)
224{ 224{
225
226 /* they are so limited unless the current task has the CAP_SETPCAP 225 /* they are so limited unless the current task has the CAP_SETPCAP
227 * capability 226 * capability
228 */ 227 */
229 if (cap_capable(current_cred(), current_cred()->user_ns, 228 if (cap_capable(current_cred(), current_cred()->user_ns,
230 CAP_SETPCAP, SECURITY_CAP_AUDIT) == 0) 229 CAP_SETPCAP, CAP_OPT_NONE) == 0)
231 return 0; 230 return 0;
232 return 1; 231 return 1;
233} 232}
@@ -643,6 +642,8 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data
643 cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; 642 cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
644 cpu_caps->inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; 643 cpu_caps->inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
645 644
645 cpu_caps->rootid = rootkuid;
646
646 return 0; 647 return 0;
647} 648}
648 649
@@ -1208,8 +1209,9 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
1208 || ((old->securebits & SECURE_ALL_LOCKS & ~arg2)) /*[2]*/ 1209 || ((old->securebits & SECURE_ALL_LOCKS & ~arg2)) /*[2]*/
1209 || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS)) /*[3]*/ 1210 || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS)) /*[3]*/
1210 || (cap_capable(current_cred(), 1211 || (cap_capable(current_cred(),
1211 current_cred()->user_ns, CAP_SETPCAP, 1212 current_cred()->user_ns,
1212 SECURITY_CAP_AUDIT) != 0) /*[4]*/ 1213 CAP_SETPCAP,
1214 CAP_OPT_NONE) != 0) /*[4]*/
1213 /* 1215 /*
1214 * [1] no changing of bits that are locked 1216 * [1] no changing of bits that are locked
1215 * [2] no unlocking of locks 1217 * [2] no unlocking of locks
@@ -1304,9 +1306,10 @@ int cap_vm_enough_memory(struct mm_struct *mm, long pages)
1304{ 1306{
1305 int cap_sys_admin = 0; 1307 int cap_sys_admin = 0;
1306 1308
1307 if (cap_capable(current_cred(), &init_user_ns, CAP_SYS_ADMIN, 1309 if (cap_capable(current_cred(), &init_user_ns,
1308 SECURITY_CAP_NOAUDIT) == 0) 1310 CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) == 0)
1309 cap_sys_admin = 1; 1311 cap_sys_admin = 1;
1312
1310 return cap_sys_admin; 1313 return cap_sys_admin;
1311} 1314}
1312 1315
@@ -1325,7 +1328,7 @@ int cap_mmap_addr(unsigned long addr)
1325 1328
1326 if (addr < dac_mmap_min_addr) { 1329 if (addr < dac_mmap_min_addr) {
1327 ret = cap_capable(current_cred(), &init_user_ns, CAP_SYS_RAWIO, 1330 ret = cap_capable(current_cred(), &init_user_ns, CAP_SYS_RAWIO,
1328 SECURITY_CAP_AUDIT); 1331 CAP_OPT_NONE);
1329 /* set PF_SUPERPRIV if it turns out we allow the low mmap */ 1332 /* set PF_SUPERPRIV if it turns out we allow the low mmap */
1330 if (ret == 0) 1333 if (ret == 0)
1331 current->flags |= PF_SUPERPRIV; 1334 current->flags |= PF_SUPERPRIV;
@@ -1362,10 +1365,17 @@ struct security_hook_list capability_hooks[] __lsm_ro_after_init = {
1362 LSM_HOOK_INIT(vm_enough_memory, cap_vm_enough_memory), 1365 LSM_HOOK_INIT(vm_enough_memory, cap_vm_enough_memory),
1363}; 1366};
1364 1367
1365void __init capability_add_hooks(void) 1368static int __init capability_init(void)
1366{ 1369{
1367 security_add_hooks(capability_hooks, ARRAY_SIZE(capability_hooks), 1370 security_add_hooks(capability_hooks, ARRAY_SIZE(capability_hooks),
1368 "capability"); 1371 "capability");
1372 return 0;
1369} 1373}
1370 1374
1375DEFINE_LSM(capability) = {
1376 .name = "capability",
1377 .order = LSM_ORDER_FIRST,
1378 .init = capability_init,
1379};
1380
1371#endif /* CONFIG_SECURITY */ 1381#endif /* CONFIG_SECURITY */
diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
index cc12f3449a72..026163f37ba1 100644
--- a/security/integrity/ima/ima.h
+++ b/security/integrity/ima/ima.h
@@ -307,8 +307,7 @@ static inline int security_filter_rule_init(u32 field, u32 op, char *rulestr,
307} 307}
308 308
309static inline int security_filter_rule_match(u32 secid, u32 field, u32 op, 309static inline int security_filter_rule_match(u32 secid, u32 field, u32 op,
310 void *lsmrule, 310 void *lsmrule)
311 struct audit_context *actx)
312{ 311{
313 return -EINVAL; 312 return -EINVAL;
314} 313}
diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c
index a2baa85ea2f5..5fb7127bbe68 100644
--- a/security/integrity/ima/ima_appraise.c
+++ b/security/integrity/ima/ima_appraise.c
@@ -114,6 +114,7 @@ static void ima_set_cache_status(struct integrity_iint_cache *iint,
114 break; 114 break;
115 case CREDS_CHECK: 115 case CREDS_CHECK:
116 iint->ima_creds_status = status; 116 iint->ima_creds_status = status;
117 break;
117 case FILE_CHECK: 118 case FILE_CHECK:
118 case POST_SETATTR: 119 case POST_SETATTR:
119 iint->ima_file_status = status; 120 iint->ima_file_status = status;
diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
index 8bc8a1c8cb3f..e0cc323f948f 100644
--- a/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c
@@ -340,8 +340,7 @@ retry:
340 rc = security_filter_rule_match(osid, 340 rc = security_filter_rule_match(osid,
341 rule->lsm[i].type, 341 rule->lsm[i].type,
342 Audit_equal, 342 Audit_equal,
343 rule->lsm[i].rule, 343 rule->lsm[i].rule);
344 NULL);
345 break; 344 break;
346 case LSM_SUBJ_USER: 345 case LSM_SUBJ_USER:
347 case LSM_SUBJ_ROLE: 346 case LSM_SUBJ_ROLE:
@@ -349,8 +348,7 @@ retry:
349 rc = security_filter_rule_match(secid, 348 rc = security_filter_rule_match(secid,
350 rule->lsm[i].type, 349 rule->lsm[i].type,
351 Audit_equal, 350 Audit_equal,
352 rule->lsm[i].rule, 351 rule->lsm[i].rule);
353 NULL);
354 default: 352 default:
355 break; 353 break;
356 } 354 }
@@ -938,10 +936,12 @@ static int ima_parse_rule(char *rule, struct ima_rule_entry *entry)
938 case Opt_uid_gt: 936 case Opt_uid_gt:
939 case Opt_euid_gt: 937 case Opt_euid_gt:
940 entry->uid_op = &uid_gt; 938 entry->uid_op = &uid_gt;
939 /* fall through */
941 case Opt_uid_lt: 940 case Opt_uid_lt:
942 case Opt_euid_lt: 941 case Opt_euid_lt:
943 if ((token == Opt_uid_lt) || (token == Opt_euid_lt)) 942 if ((token == Opt_uid_lt) || (token == Opt_euid_lt))
944 entry->uid_op = &uid_lt; 943 entry->uid_op = &uid_lt;
944 /* fall through */
945 case Opt_uid_eq: 945 case Opt_uid_eq:
946 case Opt_euid_eq: 946 case Opt_euid_eq:
947 uid_token = (token == Opt_uid_eq) || 947 uid_token = (token == Opt_uid_eq) ||
@@ -970,9 +970,11 @@ static int ima_parse_rule(char *rule, struct ima_rule_entry *entry)
970 break; 970 break;
971 case Opt_fowner_gt: 971 case Opt_fowner_gt:
972 entry->fowner_op = &uid_gt; 972 entry->fowner_op = &uid_gt;
973 /* fall through */
973 case Opt_fowner_lt: 974 case Opt_fowner_lt:
974 if (token == Opt_fowner_lt) 975 if (token == Opt_fowner_lt)
975 entry->fowner_op = &uid_lt; 976 entry->fowner_op = &uid_lt;
977 /* fall through */
976 case Opt_fowner_eq: 978 case Opt_fowner_eq:
977 ima_log_string_op(ab, "fowner", args[0].from, 979 ima_log_string_op(ab, "fowner", args[0].from,
978 entry->fowner_op); 980 entry->fowner_op);
diff --git a/security/integrity/ima/ima_template_lib.c b/security/integrity/ima/ima_template_lib.c
index 43752002c222..513b457ae900 100644
--- a/security/integrity/ima/ima_template_lib.c
+++ b/security/integrity/ima/ima_template_lib.c
@@ -83,6 +83,7 @@ static void ima_show_template_data_ascii(struct seq_file *m,
83 /* skip ':' and '\0' */ 83 /* skip ':' and '\0' */
84 buf_ptr += 2; 84 buf_ptr += 2;
85 buflen -= buf_ptr - field_data->data; 85 buflen -= buf_ptr - field_data->data;
86 /* fall through */
86 case DATA_FMT_DIGEST: 87 case DATA_FMT_DIGEST:
87 case DATA_FMT_HEX: 88 case DATA_FMT_HEX:
88 if (!buflen) 89 if (!buflen)
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 7bbe03593e58..3e4053a217c3 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -1752,7 +1752,7 @@ SYSCALL_DEFINE5(keyctl, int, option, unsigned long, arg2, unsigned long, arg3,
1752 return -EINVAL; 1752 return -EINVAL;
1753 return keyctl_pkey_query((key_serial_t)arg2, 1753 return keyctl_pkey_query((key_serial_t)arg2,
1754 (const char __user *)arg4, 1754 (const char __user *)arg4,
1755 (struct keyctl_pkey_query *)arg5); 1755 (struct keyctl_pkey_query __user *)arg5);
1756 1756
1757 case KEYCTL_PKEY_ENCRYPT: 1757 case KEYCTL_PKEY_ENCRYPT:
1758 case KEYCTL_PKEY_DECRYPT: 1758 case KEYCTL_PKEY_DECRYPT:
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index f81372f53dd7..e14f09e3a4b0 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -246,6 +246,7 @@ static unsigned long keyring_get_key_chunk(const void *data, int level)
246 (ASSOC_ARRAY_KEY_CHUNK_SIZE - 8)); 246 (ASSOC_ARRAY_KEY_CHUNK_SIZE - 8));
247 n--; 247 n--;
248 offset = 1; 248 offset = 1;
249 /* fall through */
249 default: 250 default:
250 offset += sizeof(chunk) - 1; 251 offset += sizeof(chunk) - 1;
251 offset += (level - 3) * sizeof(chunk); 252 offset += (level - 3) * sizeof(chunk);
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index 0e0b9ccad2f8..9320424c4a46 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -380,6 +380,7 @@ key_ref_t search_my_process_keyrings(struct keyring_search_context *ctx)
380 case -EAGAIN: /* no key */ 380 case -EAGAIN: /* no key */
381 if (ret) 381 if (ret)
382 break; 382 break;
383 /* fall through */
383 case -ENOKEY: /* negative key */ 384 case -ENOKEY: /* negative key */
384 ret = key_ref; 385 ret = key_ref;
385 break; 386 break;
@@ -404,6 +405,7 @@ key_ref_t search_my_process_keyrings(struct keyring_search_context *ctx)
404 case -EAGAIN: /* no key */ 405 case -EAGAIN: /* no key */
405 if (ret) 406 if (ret)
406 break; 407 break;
408 /* fall through */
407 case -ENOKEY: /* negative key */ 409 case -ENOKEY: /* negative key */
408 ret = key_ref; 410 ret = key_ref;
409 break; 411 break;
@@ -424,6 +426,7 @@ key_ref_t search_my_process_keyrings(struct keyring_search_context *ctx)
424 case -EAGAIN: /* no key */ 426 case -EAGAIN: /* no key */
425 if (ret) 427 if (ret)
426 break; 428 break;
429 /* fall through */
427 case -ENOKEY: /* negative key */ 430 case -ENOKEY: /* negative key */
428 ret = key_ref; 431 ret = key_ref;
429 break; 432 break;
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index 7a0c6b666ff0..2f17d84d46f1 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -273,16 +273,19 @@ static int construct_get_dest_keyring(struct key **_dest_keyring)
273 } 273 }
274 } 274 }
275 275
276 /* fall through */
276 case KEY_REQKEY_DEFL_THREAD_KEYRING: 277 case KEY_REQKEY_DEFL_THREAD_KEYRING:
277 dest_keyring = key_get(cred->thread_keyring); 278 dest_keyring = key_get(cred->thread_keyring);
278 if (dest_keyring) 279 if (dest_keyring)
279 break; 280 break;
280 281
282 /* fall through */
281 case KEY_REQKEY_DEFL_PROCESS_KEYRING: 283 case KEY_REQKEY_DEFL_PROCESS_KEYRING:
282 dest_keyring = key_get(cred->process_keyring); 284 dest_keyring = key_get(cred->process_keyring);
283 if (dest_keyring) 285 if (dest_keyring)
284 break; 286 break;
285 287
288 /* fall through */
286 case KEY_REQKEY_DEFL_SESSION_KEYRING: 289 case KEY_REQKEY_DEFL_SESSION_KEYRING:
287 rcu_read_lock(); 290 rcu_read_lock();
288 dest_keyring = key_get( 291 dest_keyring = key_get(
@@ -292,6 +295,7 @@ static int construct_get_dest_keyring(struct key **_dest_keyring)
292 if (dest_keyring) 295 if (dest_keyring)
293 break; 296 break;
294 297
298 /* fall through */
295 case KEY_REQKEY_DEFL_USER_SESSION_KEYRING: 299 case KEY_REQKEY_DEFL_USER_SESSION_KEYRING:
296 dest_keyring = 300 dest_keyring =
297 key_get(cred->user->session_keyring); 301 key_get(cred->user->session_keyring);
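
The keyring, IMA, and AppArmor hunks above add /* fall through */ comments so GCC's -Wimplicit-fallthrough can distinguish intentional switch fall-through from a forgotten break; the comment must sit immediately before the next case label for the compiler's comment matching to see it. A small self-contained sketch of the pattern (the error numbers are the usual Linux values, shown here purely for illustration):

    #include <stdio.h>

    static const char *keyring_result(int err)
    {
        switch (err) {
        case -11:                  /* -EAGAIN: no key in this keyring */
            printf("keep searching\n");
            /* fall through */
        case -126:                 /* -ENOKEY: negative key */
            return "treat as a miss";
        default:
            return "hard error";
        }
    }

    int main(void)
    {
        printf("%s\n", keyring_result(-11));   /* prints both lines */
        return 0;
    }
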
diff --git a/security/loadpin/loadpin.c b/security/loadpin/loadpin.c
index 48f39631b370..055fb0a64169 100644
--- a/security/loadpin/loadpin.c
+++ b/security/loadpin/loadpin.c
@@ -187,13 +187,19 @@ static struct security_hook_list loadpin_hooks[] __lsm_ro_after_init = {
187 LSM_HOOK_INIT(kernel_load_data, loadpin_load_data), 187 LSM_HOOK_INIT(kernel_load_data, loadpin_load_data),
188}; 188};
189 189
190void __init loadpin_add_hooks(void) 190static int __init loadpin_init(void)
191{ 191{
192 pr_info("ready to pin (currently %senforcing)\n", 192 pr_info("ready to pin (currently %senforcing)\n",
193 enforce ? "" : "not "); 193 enforce ? "" : "not ");
194 security_add_hooks(loadpin_hooks, ARRAY_SIZE(loadpin_hooks), "loadpin"); 194 security_add_hooks(loadpin_hooks, ARRAY_SIZE(loadpin_hooks), "loadpin");
195 return 0;
195} 196}
196 197
198DEFINE_LSM(loadpin) = {
199 .name = "loadpin",
200 .init = loadpin_init,
201};
202
197/* Should not be mutable after boot, so not listed in sysfs (perm == 0). */ 203/* Should not be mutable after boot, so not listed in sysfs (perm == 0). */
198module_param(enforce, int, 0); 204module_param(enforce, int, 0);
199MODULE_PARM_DESC(enforce, "Enforce module/firmware pinning"); 205MODULE_PARM_DESC(enforce, "Enforce module/firmware pinning");
diff --git a/security/safesetid/Kconfig b/security/safesetid/Kconfig
new file mode 100644
index 000000000000..4f415c4e3f93
--- /dev/null
+++ b/security/safesetid/Kconfig
@@ -0,0 +1,14 @@
1config SECURITY_SAFESETID
2 bool "Gate setid transitions to limit CAP_SET{U/G}ID capabilities"
3 depends on SECURITY
4 select SECURITYFS
5 default n
6 help
7 SafeSetID is an LSM module that gates the setid family of syscalls to
8 restrict UID/GID transitions from a given UID/GID to only those
9 approved by a system-wide whitelist. These restrictions also prohibit
10 the given UIDs/GIDs from obtaining auxiliary privileges associated
11 with CAP_SET{U/G}ID, such as allowing a user to set up user namespace
12 UID mappings.
13
14 If you are unsure how to answer this question, answer N.
diff --git a/security/safesetid/Makefile b/security/safesetid/Makefile
new file mode 100644
index 000000000000..6b0660321164
--- /dev/null
+++ b/security/safesetid/Makefile
@@ -0,0 +1,7 @@
1# SPDX-License-Identifier: GPL-2.0
2#
3# Makefile for the safesetid LSM.
4#
5
6obj-$(CONFIG_SECURITY_SAFESETID) := safesetid.o
7safesetid-y := lsm.o securityfs.o
diff --git a/security/safesetid/lsm.c b/security/safesetid/lsm.c
new file mode 100644
index 000000000000..cecd38e2ac80
--- /dev/null
+++ b/security/safesetid/lsm.c
@@ -0,0 +1,277 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * SafeSetID Linux Security Module
4 *
5 * Author: Micah Morton <mortonm@chromium.org>
6 *
7 * Copyright (C) 2018 The Chromium OS Authors.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2, as
11 * published by the Free Software Foundation.
12 *
13 */
14
15#define pr_fmt(fmt) "SafeSetID: " fmt
16
17#include <linux/hashtable.h>
18#include <linux/lsm_hooks.h>
19#include <linux/module.h>
20#include <linux/ptrace.h>
21#include <linux/sched/task_stack.h>
22#include <linux/security.h>
23
24/* Flag indicating whether initialization completed */
25int safesetid_initialized;
26
27#define NUM_BITS 8 /* 256 buckets in hash table */
28
29static DEFINE_HASHTABLE(safesetid_whitelist_hashtable, NUM_BITS);
30
31/*
32 * Hash table entry to store safesetid policy signifying that 'parent' user
33 * can setid to 'child' user.
34 */
35struct entry {
36 struct hlist_node next;
37 struct hlist_node dlist; /* for deletion cleanup */
38 uint64_t parent_kuid;
39 uint64_t child_kuid;
40};
41
42static DEFINE_SPINLOCK(safesetid_whitelist_hashtable_spinlock);
43
44static bool check_setuid_policy_hashtable_key(kuid_t parent)
45{
46 struct entry *entry;
47
48 rcu_read_lock();
49 hash_for_each_possible_rcu(safesetid_whitelist_hashtable,
50 entry, next, __kuid_val(parent)) {
51 if (entry->parent_kuid == __kuid_val(parent)) {
52 rcu_read_unlock();
53 return true;
54 }
55 }
56 rcu_read_unlock();
57
58 return false;
59}
60
61static bool check_setuid_policy_hashtable_key_value(kuid_t parent,
62 kuid_t child)
63{
64 struct entry *entry;
65
66 rcu_read_lock();
67 hash_for_each_possible_rcu(safesetid_whitelist_hashtable,
68 entry, next, __kuid_val(parent)) {
69 if (entry->parent_kuid == __kuid_val(parent) &&
70 entry->child_kuid == __kuid_val(child)) {
71 rcu_read_unlock();
72 return true;
73 }
74 }
75 rcu_read_unlock();
76
77 return false;
78}
79
80static int safesetid_security_capable(const struct cred *cred,
81 struct user_namespace *ns,
82 int cap,
83 unsigned int opts)
84{
85 if (cap == CAP_SETUID &&
86 check_setuid_policy_hashtable_key(cred->uid)) {
87 if (!(opts & CAP_OPT_INSETID)) {
88 /*
89 * Deny if we're not in a set*uid() syscall to avoid
90 * giving powers gated by CAP_SETUID that are related
91 * to functionality other than calling set*uid() (e.g.
92 * allowing user to set up userns uid mappings).
93 */
94 pr_warn("Operation requires CAP_SETUID, which is not available to UID %u for operations besides approved set*uid transitions",
95 __kuid_val(cred->uid));
96 return -1;
97 }
98 }
99 return 0;
100}
101
102static int check_uid_transition(kuid_t parent, kuid_t child)
103{
104 if (check_setuid_policy_hashtable_key_value(parent, child))
105 return 0;
106 pr_warn("UID transition (%d -> %d) blocked",
107 __kuid_val(parent),
108 __kuid_val(child));
109 /*
110 * Kill this process to avoid potential security vulnerabilities
111 * that could arise from a missing whitelist entry preventing a
112 * privileged process from dropping to a lesser-privileged one.
113 */
114 force_sig(SIGKILL, current);
115 return -EACCES;
116}
117
118/*
119 * Check whether there is either an exception for user under old cred struct to
120 * set*uid to user under new cred struct, or the UID transition is allowed (by
121 * Linux set*uid rules) even without CAP_SETUID.
122 */
123static int safesetid_task_fix_setuid(struct cred *new,
124 const struct cred *old,
125 int flags)
126{
127
128 /* Do nothing if there are no setuid restrictions for this UID. */
129 if (!check_setuid_policy_hashtable_key(old->uid))
130 return 0;
131
132 switch (flags) {
133 case LSM_SETID_RE:
134 /*
135 * Users for which setuid restrictions exist can only set the
136 * real UID to the real UID or the effective UID, unless an
137 * explicit whitelist policy allows the transition.
138 */
139 if (!uid_eq(old->uid, new->uid) &&
140 !uid_eq(old->euid, new->uid)) {
141 return check_uid_transition(old->uid, new->uid);
142 }
143 /*
144 * Users for which setuid restrictions exist can only set the
145 * effective UID to the real UID, the effective UID, or the
146 * saved set-UID, unless an explicit whitelist policy allows
147 * the transition.
148 */
149 if (!uid_eq(old->uid, new->euid) &&
150 !uid_eq(old->euid, new->euid) &&
151 !uid_eq(old->suid, new->euid)) {
152 return check_uid_transition(old->euid, new->euid);
153 }
154 break;
155 case LSM_SETID_ID:
156 /*
157 * Users for which setuid restrictions exist cannot change the
158 * real UID or saved set-UID unless an explicit whitelist
159 * policy allows the transition.
160 */
161 if (!uid_eq(old->uid, new->uid))
162 return check_uid_transition(old->uid, new->uid);
163 if (!uid_eq(old->suid, new->suid))
164 return check_uid_transition(old->suid, new->suid);
165 break;
166 case LSM_SETID_RES:
167 /*
168 * Users for which setuid restrictions exist cannot change the
169 * real UID, effective UID, or saved set-UID to anything but
170 * one of: the current real UID, the current effective UID or
171 * the current saved set-user-ID unless an explicit whitelist
172 * policy allows the transition.
173 */
174 if (!uid_eq(new->uid, old->uid) &&
175 !uid_eq(new->uid, old->euid) &&
176 !uid_eq(new->uid, old->suid)) {
177 return check_uid_transition(old->uid, new->uid);
178 }
179 if (!uid_eq(new->euid, old->uid) &&
180 !uid_eq(new->euid, old->euid) &&
181 !uid_eq(new->euid, old->suid)) {
182 return check_uid_transition(old->euid, new->euid);
183 }
184 if (!uid_eq(new->suid, old->uid) &&
185 !uid_eq(new->suid, old->euid) &&
186 !uid_eq(new->suid, old->suid)) {
187 return check_uid_transition(old->suid, new->suid);
188 }
189 break;
190 case LSM_SETID_FS:
191 /*
192 * Users for which setuid restrictions exist cannot change the
193 * filesystem UID to anything but one of: the current real UID,
194 * the current effective UID or the current saved set-UID
195 * unless an explicit whitelist policy allows the transition.
196 */
197 if (!uid_eq(new->fsuid, old->uid) &&
198 !uid_eq(new->fsuid, old->euid) &&
199 !uid_eq(new->fsuid, old->suid) &&
200 !uid_eq(new->fsuid, old->fsuid)) {
201 return check_uid_transition(old->fsuid, new->fsuid);
202 }
203 break;
204 default:
205 pr_warn("Unknown setid state %d\n", flags);
206 force_sig(SIGKILL, current);
207 return -EINVAL;
208 }
209 return 0;
210}
211
212int add_safesetid_whitelist_entry(kuid_t parent, kuid_t child)
213{
214 struct entry *new;
215
216 /* Return if entry already exists */
217 if (check_setuid_policy_hashtable_key_value(parent, child))
218 return 0;
219
220 new = kzalloc(sizeof(struct entry), GFP_KERNEL);
221 if (!new)
222 return -ENOMEM;
223 new->parent_kuid = __kuid_val(parent);
224 new->child_kuid = __kuid_val(child);
225 spin_lock(&safesetid_whitelist_hashtable_spinlock);
226 hash_add_rcu(safesetid_whitelist_hashtable,
227 &new->next,
228 __kuid_val(parent));
229 spin_unlock(&safesetid_whitelist_hashtable_spinlock);
230 return 0;
231}
232
233void flush_safesetid_whitelist_entries(void)
234{
235 struct entry *entry;
236 struct hlist_node *hlist_node;
237 unsigned int bkt_loop_cursor;
238 HLIST_HEAD(free_list);
239
240 /*
241 * Could probably use hash_for_each_rcu here instead, but this should
242 * be fine as well.
243 */
244 spin_lock(&safesetid_whitelist_hashtable_spinlock);
245 hash_for_each_safe(safesetid_whitelist_hashtable, bkt_loop_cursor,
246 hlist_node, entry, next) {
247 hash_del_rcu(&entry->next);
248 hlist_add_head(&entry->dlist, &free_list);
249 }
250 spin_unlock(&safesetid_whitelist_hashtable_spinlock);
251 synchronize_rcu();
252 hlist_for_each_entry_safe(entry, hlist_node, &free_list, dlist) {
253 hlist_del(&entry->dlist);
254 kfree(entry);
255 }
256}
257
258static struct security_hook_list safesetid_security_hooks[] = {
259 LSM_HOOK_INIT(task_fix_setuid, safesetid_task_fix_setuid),
260 LSM_HOOK_INIT(capable, safesetid_security_capable)
261};
262
263static int __init safesetid_security_init(void)
264{
265 security_add_hooks(safesetid_security_hooks,
266 ARRAY_SIZE(safesetid_security_hooks), "safesetid");
267
268 /* Report that SafeSetID successfully initialized */
269 safesetid_initialized = 1;
270
271 return 0;
272}
273
274DEFINE_LSM(safesetid_security_init) = {
275 .init = safesetid_security_init,
276 .name = "safesetid",
277};
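
Each branch of safesetid_task_fix_setuid() applies the same rule: a transition is fine when the new UID is one the task already holds (real, effective, or saved), and otherwise must match an explicit parent-to-child whitelist entry, or the process is killed. A toy model of the LSM_SETID_ID real-UID check, with a plain array and an assumed policy entry standing in for the RCU hashtable:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    struct policy { unsigned int parent, child; };
    static const struct policy whitelist[] = { { 123, 456 } };  /* assumed */

    static bool transition_allowed(unsigned int parent, unsigned int child)
    {
        for (size_t i = 0; i < sizeof(whitelist) / sizeof(*whitelist); i++)
            if (whitelist[i].parent == parent && whitelist[i].child == child)
                return true;
        return false;
    }

    /* Mirrors the LSM_SETID_ID real-UID branch: an unchanged UID passes,
     * anything else needs a whitelist edge. */
    static int fix_setuid_id(unsigned int old_uid, unsigned int new_uid)
    {
        if (old_uid == new_uid || transition_allowed(old_uid, new_uid))
            return 0;
        fprintf(stderr, "UID transition (%u -> %u) blocked\n", old_uid, new_uid);
        return -13;  /* -EACCES */
    }

    int main(void)
    {
        printf("%d\n", fix_setuid_id(123, 456));  /* 0: whitelisted */
        printf("%d\n", fix_setuid_id(123, 789));  /* -13: blocked */
        return 0;
    }
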
diff --git a/security/safesetid/lsm.h b/security/safesetid/lsm.h
new file mode 100644
index 000000000000..c1ea3c265fcf
--- /dev/null
+++ b/security/safesetid/lsm.h
@@ -0,0 +1,33 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * SafeSetID Linux Security Module
4 *
5 * Author: Micah Morton <mortonm@chromium.org>
6 *
7 * Copyright (C) 2018 The Chromium OS Authors.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2, as
11 * published by the Free Software Foundation.
12 *
13 */
14#ifndef _SAFESETID_H
15#define _SAFESETID_H
16
17#include <linux/types.h>
18
19/* Flag indicating whether initialization completed */
20extern int safesetid_initialized;
21
22/* Function type. */
23enum safesetid_whitelist_file_write_type {
24 SAFESETID_WHITELIST_ADD, /* Add whitelist policy. */
25 SAFESETID_WHITELIST_FLUSH, /* Flush whitelist policies. */
26};
27
28/* Add entry to safesetid whitelist to allow 'parent' to setid to 'child'. */
29int add_safesetid_whitelist_entry(kuid_t parent, kuid_t child);
30
31void flush_safesetid_whitelist_entries(void);
32
33#endif /* _SAFESETID_H */
diff --git a/security/safesetid/securityfs.c b/security/safesetid/securityfs.c
new file mode 100644
index 000000000000..2c6c829be044
--- /dev/null
+++ b/security/safesetid/securityfs.c
@@ -0,0 +1,193 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * SafeSetID Linux Security Module
4 *
5 * Author: Micah Morton <mortonm@chromium.org>
6 *
7 * Copyright (C) 2018 The Chromium OS Authors.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2, as
11 * published by the Free Software Foundation.
12 *
13 */
14#include <linux/security.h>
15#include <linux/cred.h>
16
17#include "lsm.h"
18
19static struct dentry *safesetid_policy_dir;
20
21struct safesetid_file_entry {
22 const char *name;
23 enum safesetid_whitelist_file_write_type type;
24 struct dentry *dentry;
25};
26
27static struct safesetid_file_entry safesetid_files[] = {
28 {.name = "add_whitelist_policy",
29 .type = SAFESETID_WHITELIST_ADD},
30 {.name = "flush_whitelist_policies",
31 .type = SAFESETID_WHITELIST_FLUSH},
32};
33
34/*
35 * In the case the input buffer contains one or more invalid UIDs, the kuid_t
36 * variables pointed to by 'parent' and 'child' will get updated but this
37 * function will return an error.
38 */
39static int parse_safesetid_whitelist_policy(const char __user *buf,
40 size_t len,
41 kuid_t *parent,
42 kuid_t *child)
43{
44 char *kern_buf;
45 char *parent_buf;
46 char *child_buf;
47 const char separator[] = ":";
48 int ret;
49 size_t first_substring_length;
50 long parsed_parent;
51 long parsed_child;
52
53 /* Duplicate string from user memory and NULL-terminate */
54 kern_buf = memdup_user_nul(buf, len);
55 if (IS_ERR(kern_buf))
56 return PTR_ERR(kern_buf);
57
58 /*
59 * Format of |buf| string should be <UID>:<UID>.
60 * Find location of ":" in kern_buf (copied from |buf|).
61 */
62 first_substring_length = strcspn(kern_buf, separator);
63 if (first_substring_length == 0 || first_substring_length == len) {
64 ret = -EINVAL;
65 goto free_kern;
66 }
67
68 parent_buf = kmemdup_nul(kern_buf, first_substring_length, GFP_KERNEL);
69 if (!parent_buf) {
70 ret = -ENOMEM;
71 goto free_kern;
72 }
73
74 ret = kstrtol(parent_buf, 0, &parsed_parent);
75 if (ret)
76 goto free_both;
77
78 child_buf = kern_buf + first_substring_length + 1;
79 ret = kstrtol(child_buf, 0, &parsed_child);
80 if (ret)
81 goto free_both;
82
83 *parent = make_kuid(current_user_ns(), parsed_parent);
84 if (!uid_valid(*parent)) {
85 ret = -EINVAL;
86 goto free_both;
87 }
88
89 *child = make_kuid(current_user_ns(), parsed_child);
90 if (!uid_valid(*child)) {
91 ret = -EINVAL;
92 goto free_both;
93 }
94
95free_both:
96 kfree(parent_buf);
97free_kern:
98 kfree(kern_buf);
99 return ret;
100}
101
102static ssize_t safesetid_file_write(struct file *file,
103 const char __user *buf,
104 size_t len,
105 loff_t *ppos)
106{
107 struct safesetid_file_entry *file_entry =
108 file->f_inode->i_private;
109 kuid_t parent;
110 kuid_t child;
111 int ret;
112
113 if (!ns_capable(current_user_ns(), CAP_MAC_ADMIN))
114 return -EPERM;
115
116 if (*ppos != 0)
117 return -EINVAL;
118
119 switch (file_entry->type) {
120 case SAFESETID_WHITELIST_FLUSH:
121 flush_safesetid_whitelist_entries();
122 break;
123 case SAFESETID_WHITELIST_ADD:
124 ret = parse_safesetid_whitelist_policy(buf, len, &parent,
125 &child);
126 if (ret)
127 return ret;
128
129 ret = add_safesetid_whitelist_entry(parent, child);
130 if (ret)
131 return ret;
132 break;
133 default:
134 pr_warn("Unknown securityfs file %d\n", file_entry->type);
135 break;
136 }
137
138 /* Return len on success so caller won't keep trying to write */
139 return len;
140}
141
142static const struct file_operations safesetid_file_fops = {
143 .write = safesetid_file_write,
144};
145
146static void safesetid_shutdown_securityfs(void)
147{
148 int i;
149
150 for (i = 0; i < ARRAY_SIZE(safesetid_files); ++i) {
151 struct safesetid_file_entry *entry =
152 &safesetid_files[i];
153 securityfs_remove(entry->dentry);
154 entry->dentry = NULL;
155 }
156
157 securityfs_remove(safesetid_policy_dir);
158 safesetid_policy_dir = NULL;
159}
160
161static int __init safesetid_init_securityfs(void)
162{
163 int i;
164 int ret;
165
166 if (!safesetid_initialized)
167 return 0;
168
169 safesetid_policy_dir = securityfs_create_dir("safesetid", NULL);
170 if (IS_ERR(safesetid_policy_dir)) {
171 ret = PTR_ERR(safesetid_policy_dir);
172 goto error;
173 }
174
175 for (i = 0; i < ARRAY_SIZE(safesetid_files); ++i) {
176 struct safesetid_file_entry *entry =
177 &safesetid_files[i];
178 entry->dentry = securityfs_create_file(
179 entry->name, 0200, safesetid_policy_dir,
180 entry, &safesetid_file_fops);
181 if (IS_ERR(entry->dentry)) {
182 ret = PTR_ERR(entry->dentry);
183 goto error;
184 }
185 }
186
187 return 0;
188
189error:
190 safesetid_shutdown_securityfs();
191 return ret;
192}
193fs_initcall(safesetid_init_securityfs);
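
Policy reaches the module at runtime through these securityfs files. Assuming securityfs is mounted at the conventional /sys/kernel/security, a caller holding CAP_MAC_ADMIN writes a <UID>:<UID> pair to add_whitelist_policy, as in this hedged sketch:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        /* Path assumes the usual securityfs mount point. */
        const char *path = "/sys/kernel/security/safesetid/add_whitelist_policy";
        const char *policy = "123:456";  /* allow UID 123 to set*uid to 456 */
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
            perror("open");
            return 1;
        }
        if (write(fd, policy, strlen(policy)) < 0)
            perror("write");  /* fails with EPERM without CAP_MAC_ADMIN */
        close(fd);
        return 0;
    }

A write to flush_whitelist_policies drops every entry in one shot, matching flush_safesetid_whitelist_entries() above.
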
diff --git a/security/security.c b/security/security.c
index 55bc49027ba9..301b141b9a32 100644
--- a/security/security.c
+++ b/security/security.c
@@ -30,20 +30,32 @@
30#include <linux/personality.h> 30#include <linux/personality.h>
31#include <linux/backing-dev.h> 31#include <linux/backing-dev.h>
32#include <linux/string.h> 32#include <linux/string.h>
33#include <linux/msg.h>
33#include <net/flow.h> 34#include <net/flow.h>
34 35
35#define MAX_LSM_EVM_XATTR 2 36#define MAX_LSM_EVM_XATTR 2
36 37
37/* Maximum number of letters for an LSM name string */ 38/* How many LSMs were built into the kernel? */
38#define SECURITY_NAME_MAX 10 39#define LSM_COUNT (__end_lsm_info - __start_lsm_info)
39 40
40struct security_hook_heads security_hook_heads __lsm_ro_after_init; 41struct security_hook_heads security_hook_heads __lsm_ro_after_init;
41static ATOMIC_NOTIFIER_HEAD(lsm_notifier_chain); 42static ATOMIC_NOTIFIER_HEAD(lsm_notifier_chain);
42 43
44static struct kmem_cache *lsm_file_cache;
45static struct kmem_cache *lsm_inode_cache;
46
43char *lsm_names; 47char *lsm_names;
48static struct lsm_blob_sizes blob_sizes __lsm_ro_after_init;
49
44/* Boot-time LSM user choice */ 50/* Boot-time LSM user choice */
45static __initdata char chosen_lsm[SECURITY_NAME_MAX + 1] = 51static __initdata const char *chosen_lsm_order;
46 CONFIG_DEFAULT_SECURITY; 52static __initdata const char *chosen_major_lsm;
53
54static __initconst const char * const builtin_lsm_order = CONFIG_LSM;
55
56/* Ordered list of LSMs to initialize. */
57static __initdata struct lsm_info **ordered_lsms;
58static __initdata struct lsm_info *exclusive;
47 59
48static __initdata bool debug; 60static __initdata bool debug;
49#define init_debug(...) \ 61#define init_debug(...) \
@@ -52,18 +64,269 @@ static __initdata bool debug;
52 pr_info(__VA_ARGS__); \ 64 pr_info(__VA_ARGS__); \
53 } while (0) 65 } while (0)
54 66
55static void __init major_lsm_init(void) 67static bool __init is_enabled(struct lsm_info *lsm)
56{ 68{
57 struct lsm_info *lsm; 69 if (!lsm->enabled)
58 int ret; 70 return false;
71
72 return *lsm->enabled;
73}
74
75/* Mark an LSM's enabled flag. */
76static int lsm_enabled_true __initdata = 1;
77static int lsm_enabled_false __initdata = 0;
78static void __init set_enabled(struct lsm_info *lsm, bool enabled)
79{
80 /*
81 * When an LSM hasn't configured an enable variable, we can use
82 * a hard-coded location for storing the default enabled state.
83 */
84 if (!lsm->enabled) {
85 if (enabled)
86 lsm->enabled = &lsm_enabled_true;
87 else
88 lsm->enabled = &lsm_enabled_false;
89 } else if (lsm->enabled == &lsm_enabled_true) {
90 if (!enabled)
91 lsm->enabled = &lsm_enabled_false;
92 } else if (lsm->enabled == &lsm_enabled_false) {
93 if (enabled)
94 lsm->enabled = &lsm_enabled_true;
95 } else {
96 *lsm->enabled = enabled;
97 }
98}
99
100/* Is an LSM already listed in the ordered LSMs list? */
101static bool __init exists_ordered_lsm(struct lsm_info *lsm)
102{
103 struct lsm_info **check;
104
105 for (check = ordered_lsms; *check; check++)
106 if (*check == lsm)
107 return true;
108
109 return false;
110}
111
112/* Append an LSM to the list of ordered LSMs to initialize. */
113static int last_lsm __initdata;
114static void __init append_ordered_lsm(struct lsm_info *lsm, const char *from)
115{
116 /* Ignore duplicate selections. */
117 if (exists_ordered_lsm(lsm))
118 return;
119
120 if (WARN(last_lsm == LSM_COUNT, "%s: out of LSM slots!?\n", from))
121 return;
122
123 /* Enable this LSM, if it is not already set. */
124 if (!lsm->enabled)
125 lsm->enabled = &lsm_enabled_true;
126 ordered_lsms[last_lsm++] = lsm;
127
128 init_debug("%s ordering: %s (%sabled)\n", from, lsm->name,
129 is_enabled(lsm) ? "en" : "dis");
130}
131
132/* Is an LSM allowed to be initialized? */
133static bool __init lsm_allowed(struct lsm_info *lsm)
134{
135 /* Skip if the LSM is disabled. */
136 if (!is_enabled(lsm))
137 return false;
138
139 /* Not allowed if another exclusive LSM already initialized. */
140 if ((lsm->flags & LSM_FLAG_EXCLUSIVE) && exclusive) {
141 init_debug("exclusive disabled: %s\n", lsm->name);
142 return false;
143 }
144
145 return true;
146}
147
148static void __init lsm_set_blob_size(int *need, int *lbs)
149{
150 int offset;
151
152 if (*need > 0) {
153 offset = *lbs;
154 *lbs += *need;
155 *need = offset;
156 }
157}
158
159static void __init lsm_set_blob_sizes(struct lsm_blob_sizes *needed)
160{
161 if (!needed)
162 return;
163
164 lsm_set_blob_size(&needed->lbs_cred, &blob_sizes.lbs_cred);
165 lsm_set_blob_size(&needed->lbs_file, &blob_sizes.lbs_file);
166 /*
167 * The inode blob gets an rcu_head in addition to
168 * what the modules might need.
169 */
170 if (needed->lbs_inode && blob_sizes.lbs_inode == 0)
171 blob_sizes.lbs_inode = sizeof(struct rcu_head);
172 lsm_set_blob_size(&needed->lbs_inode, &blob_sizes.lbs_inode);
173 lsm_set_blob_size(&needed->lbs_ipc, &blob_sizes.lbs_ipc);
174 lsm_set_blob_size(&needed->lbs_msg_msg, &blob_sizes.lbs_msg_msg);
175 lsm_set_blob_size(&needed->lbs_task, &blob_sizes.lbs_task);
176}
177
178/* Prepare LSM for initialization. */
179static void __init prepare_lsm(struct lsm_info *lsm)
180{
181 int enabled = lsm_allowed(lsm);
182
183 /* Record enablement (to handle any following exclusive LSMs). */
184 set_enabled(lsm, enabled);
185
186 /* If enabled, do pre-initialization work. */
187 if (enabled) {
188 if ((lsm->flags & LSM_FLAG_EXCLUSIVE) && !exclusive) {
189 exclusive = lsm;
190 init_debug("exclusive chosen: %s\n", lsm->name);
191 }
192
193 lsm_set_blob_sizes(lsm->blobs);
194 }
195}
196
197/* Initialize a given LSM, if it is enabled. */
198static void __init initialize_lsm(struct lsm_info *lsm)
199{
200 if (is_enabled(lsm)) {
201 int ret;
59 202
60 for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
61 init_debug("initializing %s\n", lsm->name); 203 init_debug("initializing %s\n", lsm->name);
62 ret = lsm->init(); 204 ret = lsm->init();
63 WARN(ret, "%s failed to initialize: %d\n", lsm->name, ret); 205 WARN(ret, "%s failed to initialize: %d\n", lsm->name, ret);
64 } 206 }
65} 207}
66 208
209/* Populate ordered LSMs list from comma-separated LSM name list. */
210static void __init ordered_lsm_parse(const char *order, const char *origin)
211{
212 struct lsm_info *lsm;
213 char *sep, *name, *next;
214
215 /* LSM_ORDER_FIRST is always first. */
216 for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
217 if (lsm->order == LSM_ORDER_FIRST)
218 append_ordered_lsm(lsm, "first");
219 }
220
221 /* Process "security=", if given. */
222 if (chosen_major_lsm) {
223 struct lsm_info *major;
224
225 /*
226 * To match the original "security=" behavior, this
227 * explicitly does NOT fallback to another Legacy Major
228 * if the selected one was separately disabled: disable
229 * all non-matching Legacy Major LSMs.
230 */
231 for (major = __start_lsm_info; major < __end_lsm_info;
232 major++) {
233 if ((major->flags & LSM_FLAG_LEGACY_MAJOR) &&
234 strcmp(major->name, chosen_major_lsm) != 0) {
235 set_enabled(major, false);
236 init_debug("security=%s disabled: %s\n",
237 chosen_major_lsm, major->name);
238 }
239 }
240 }
241
242 sep = kstrdup(order, GFP_KERNEL);
243 next = sep;
244 /* Walk the list, looking for matching LSMs. */
245 while ((name = strsep(&next, ",")) != NULL) {
246 bool found = false;
247
248 for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
249 if (lsm->order == LSM_ORDER_MUTABLE &&
250 strcmp(lsm->name, name) == 0) {
251 append_ordered_lsm(lsm, origin);
252 found = true;
253 }
254 }
255
256 if (!found)
257 init_debug("%s ignored: %s\n", origin, name);
258 }
259
260 /* Process "security=", if given. */
261 if (chosen_major_lsm) {
262 for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
263 if (exists_ordered_lsm(lsm))
264 continue;
265 if (strcmp(lsm->name, chosen_major_lsm) == 0)
266 append_ordered_lsm(lsm, "security=");
267 }
268 }
269
270 /* Disable all LSMs not in the ordered list. */
271 for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
272 if (exists_ordered_lsm(lsm))
273 continue;
274 set_enabled(lsm, false);
275 init_debug("%s disabled: %s\n", origin, lsm->name);
276 }
277
278 kfree(sep);
279}
280
281static void __init lsm_early_cred(struct cred *cred);
282static void __init lsm_early_task(struct task_struct *task);
283
284static void __init ordered_lsm_init(void)
285{
286 struct lsm_info **lsm;
287
288 ordered_lsms = kcalloc(LSM_COUNT + 1, sizeof(*ordered_lsms),
289 GFP_KERNEL);
290
291 if (chosen_lsm_order) {
292 if (chosen_major_lsm) {
293 pr_info("security= is ignored because it is superseded by lsm=\n");
294 chosen_major_lsm = NULL;
295 }
296 ordered_lsm_parse(chosen_lsm_order, "cmdline");
297 } else
298 ordered_lsm_parse(builtin_lsm_order, "builtin");
299
300 for (lsm = ordered_lsms; *lsm; lsm++)
301 prepare_lsm(*lsm);
302
303 init_debug("cred blob size = %d\n", blob_sizes.lbs_cred);
304 init_debug("file blob size = %d\n", blob_sizes.lbs_file);
305 init_debug("inode blob size = %d\n", blob_sizes.lbs_inode);
306 init_debug("ipc blob size = %d\n", blob_sizes.lbs_ipc);
307 init_debug("msg_msg blob size = %d\n", blob_sizes.lbs_msg_msg);
308 init_debug("task blob size = %d\n", blob_sizes.lbs_task);
309
310 /*
311 * Create any kmem_caches needed for blobs
312 */
313 if (blob_sizes.lbs_file)
314 lsm_file_cache = kmem_cache_create("lsm_file_cache",
315 blob_sizes.lbs_file, 0,
316 SLAB_PANIC, NULL);
317 if (blob_sizes.lbs_inode)
318 lsm_inode_cache = kmem_cache_create("lsm_inode_cache",
319 blob_sizes.lbs_inode, 0,
320 SLAB_PANIC, NULL);
321
322 lsm_early_cred((struct cred *) current->cred);
323 lsm_early_task(current);
324 for (lsm = ordered_lsms; *lsm; lsm++)
325 initialize_lsm(*lsm);
326
327 kfree(ordered_lsms);
328}
329
67/** 330/**
68 * security_init - initializes the security framework 331 * security_init - initializes the security framework
69 * 332 *
@@ -80,28 +343,27 @@ int __init security_init(void)
80 i++) 343 i++)
81 INIT_HLIST_HEAD(&list[i]); 344 INIT_HLIST_HEAD(&list[i]);
82 345
83 /* 346 /* Load LSMs in specified order. */
84 * Load minor LSMs, with the capability module always first. 347 ordered_lsm_init();
85 */
86 capability_add_hooks();
87 yama_add_hooks();
88 loadpin_add_hooks();
89
90 /*
91 * Load all the remaining security modules.
92 */
93 major_lsm_init();
94 348
95 return 0; 349 return 0;
96} 350}
97 351
98/* Save user chosen LSM */ 352/* Save user chosen LSM */
99static int __init choose_lsm(char *str) 353static int __init choose_major_lsm(char *str)
354{
355 chosen_major_lsm = str;
356 return 1;
357}
358__setup("security=", choose_major_lsm);
359
360/* Explicitly choose LSM initialization order. */
361static int __init choose_lsm_order(char *str)
100{ 362{
101 strncpy(chosen_lsm, str, SECURITY_NAME_MAX); 363 chosen_lsm_order = str;
102 return 1; 364 return 1;
103} 365}
104__setup("security=", choose_lsm); 366__setup("lsm=", choose_lsm_order);
105 367
106/* Enable LSM order debugging. */ 368/* Enable LSM order debugging. */
107static int __init enable_debug(char *str) 369static int __init enable_debug(char *str)
@@ -148,29 +410,6 @@ static int lsm_append(char *new, char **result)
148} 410}
149 411
150/** 412/**
151 * security_module_enable - Load given security module on boot ?
152 * @module: the name of the module
153 *
154 * Each LSM must pass this method before registering its own operations
155 * to avoid security registration races. This method may also be used
156 * to check if your LSM is currently loaded during kernel initialization.
157 *
158 * Returns:
159 *
160 * true if:
161 *
162 * - The passed LSM is the one chosen by user at boot time,
163 * - or the passed LSM is configured as the default and the user did not
164 * choose an alternate LSM at boot time.
165 *
166 * Otherwise, return false.
167 */
168int __init security_module_enable(const char *module)
169{
170 return !strcmp(module, chosen_lsm);
171}
172
173/**
174 * security_add_hooks - Add a modules hooks to the hook lists. 413 * security_add_hooks - Add a modules hooks to the hook lists.
175 * @hooks: the hooks to add 414 * @hooks: the hooks to add
176 * @count: the number of hooks to add 415 * @count: the number of hooks to add
@@ -209,6 +448,161 @@ int unregister_lsm_notifier(struct notifier_block *nb)
209} 448}
210EXPORT_SYMBOL(unregister_lsm_notifier); 449EXPORT_SYMBOL(unregister_lsm_notifier);
211 450
451/**
452 * lsm_cred_alloc - allocate a composite cred blob
453 * @cred: the cred that needs a blob
454 * @gfp: allocation type
455 *
456 * Allocate the cred blob for all the modules
457 *
458 * Returns 0, or -ENOMEM if memory can't be allocated.
459 */
460static int lsm_cred_alloc(struct cred *cred, gfp_t gfp)
461{
462 if (blob_sizes.lbs_cred == 0) {
463 cred->security = NULL;
464 return 0;
465 }
466
467 cred->security = kzalloc(blob_sizes.lbs_cred, gfp);
468 if (cred->security == NULL)
469 return -ENOMEM;
470 return 0;
471}
472
473/**
474 * lsm_early_cred - during initialization allocate a composite cred blob
475 * @cred: the cred that needs a blob
476 *
477 * Allocate the cred blob for all the modules
478 */
479static void __init lsm_early_cred(struct cred *cred)
480{
481 int rc = lsm_cred_alloc(cred, GFP_KERNEL);
482
483 if (rc)
484 panic("%s: Early cred alloc failed.\n", __func__);
485}
486
487/**
488 * lsm_file_alloc - allocate a composite file blob
489 * @file: the file that needs a blob
490 *
491 * Allocate the file blob for all the modules
492 *
493 * Returns 0, or -ENOMEM if memory can't be allocated.
494 */
495static int lsm_file_alloc(struct file *file)
496{
497 if (!lsm_file_cache) {
498 file->f_security = NULL;
499 return 0;
500 }
501
502 file->f_security = kmem_cache_zalloc(lsm_file_cache, GFP_KERNEL);
503 if (file->f_security == NULL)
504 return -ENOMEM;
505 return 0;
506}
507
508/**
509 * lsm_inode_alloc - allocate a composite inode blob
510 * @inode: the inode that needs a blob
511 *
512 * Allocate the inode blob for all the modules
513 *
514 * Returns 0, or -ENOMEM if memory can't be allocated.
515 */
516int lsm_inode_alloc(struct inode *inode)
517{
518 if (!lsm_inode_cache) {
519 inode->i_security = NULL;
520 return 0;
521 }
522
523 inode->i_security = kmem_cache_zalloc(lsm_inode_cache, GFP_NOFS);
524 if (inode->i_security == NULL)
525 return -ENOMEM;
526 return 0;
527}
528
529/**
530 * lsm_task_alloc - allocate a composite task blob
531 * @task: the task that needs a blob
532 *
533 * Allocate the task blob for all the modules
534 *
535 * Returns 0, or -ENOMEM if memory can't be allocated.
536 */
537static int lsm_task_alloc(struct task_struct *task)
538{
539 if (blob_sizes.lbs_task == 0) {
540 task->security = NULL;
541 return 0;
542 }
543
544 task->security = kzalloc(blob_sizes.lbs_task, GFP_KERNEL);
545 if (task->security == NULL)
546 return -ENOMEM;
547 return 0;
548}
549
550/**
551 * lsm_ipc_alloc - allocate a composite ipc blob
552 * @kip: the ipc that needs a blob
553 *
554 * Allocate the ipc blob for all the modules
555 *
556 * Returns 0, or -ENOMEM if memory can't be allocated.
557 */
558static int lsm_ipc_alloc(struct kern_ipc_perm *kip)
559{
560 if (blob_sizes.lbs_ipc == 0) {
561 kip->security = NULL;
562 return 0;
563 }
564
565 kip->security = kzalloc(blob_sizes.lbs_ipc, GFP_KERNEL);
566 if (kip->security == NULL)
567 return -ENOMEM;
568 return 0;
569}
570
571/**
572 * lsm_msg_msg_alloc - allocate a composite msg_msg blob
573 * @mp: the msg_msg that needs a blob
574 *
575 * Allocate the msg_msg blob for all the modules
576 *
577 * Returns 0, or -ENOMEM if memory can't be allocated.
578 */
579static int lsm_msg_msg_alloc(struct msg_msg *mp)
580{
581 if (blob_sizes.lbs_msg_msg == 0) {
582 mp->security = NULL;
583 return 0;
584 }
585
586 mp->security = kzalloc(blob_sizes.lbs_msg_msg, GFP_KERNEL);
587 if (mp->security == NULL)
588 return -ENOMEM;
589 return 0;
590}
591
592/**
593 * lsm_early_task - during initialization allocate a composite task blob
594 * @task: the task that needs a blob
595 *
596 * Allocate the task blob for all the modules
597 */
598static void __init lsm_early_task(struct task_struct *task)
599{
600 int rc = lsm_task_alloc(task);
601
602 if (rc)
603 panic("%s: Early task alloc failed.\n", __func__);
604}
605
212/* 606/*
213 * Hook list operation macros. 607 * Hook list operation macros.
214 * 608 *
@@ -294,16 +688,12 @@ int security_capset(struct cred *new, const struct cred *old,
294 effective, inheritable, permitted); 688 effective, inheritable, permitted);
295} 689}
296 690
297int security_capable(const struct cred *cred, struct user_namespace *ns,
298 int cap)
691int security_capable(const struct cred *cred,
692 struct user_namespace *ns,
693 int cap,
694 unsigned int opts)
299{ 695{
300 return call_int_hook(capable, 0, cred, ns, cap, SECURITY_CAP_AUDIT);
696 return call_int_hook(capable, 0, cred, ns, cap, opts);
301}
302
303int security_capable_noaudit(const struct cred *cred, struct user_namespace *ns,
304 int cap)
305{
306 return call_int_hook(capable, 0, cred, ns, cap, SECURITY_CAP_NOAUDIT);
307} 697}
308 698
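With the flag passed by the caller, a no-audit capability check becomes an option on the one entry point instead of a separate function. An illustrative call, using the CAP_OPT_NOAUDIT value the hooks.c changes below adopt:

	/* Probe CAP_SYS_ADMIN without emitting an audit record on denial. */
	if (!security_capable(current_cred(), &init_user_ns, CAP_SYS_ADMIN,
			      CAP_OPT_NOAUDIT))
		cap_sys_admin = 1;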
309int security_quotactl(int cmds, int type, int id, struct super_block *sb) 699int security_quotactl(int cmds, int type, int id, struct super_block *sb)
@@ -468,14 +858,40 @@ EXPORT_SYMBOL(security_add_mnt_opt);
468 858
469int security_inode_alloc(struct inode *inode) 859int security_inode_alloc(struct inode *inode)
470{ 860{
471 inode->i_security = NULL;
472 return call_int_hook(inode_alloc_security, 0, inode);
861 int rc = lsm_inode_alloc(inode);
862
863 if (unlikely(rc))
864 return rc;
865 rc = call_int_hook(inode_alloc_security, 0, inode);
866 if (unlikely(rc))
867 security_inode_free(inode);
868 return rc;
869}
870
871static void inode_free_by_rcu(struct rcu_head *head)
872{
873 /*
874 * The rcu head is at the start of the inode blob
875 */
876 kmem_cache_free(lsm_inode_cache, head);
473} 877}
474 878
475void security_inode_free(struct inode *inode) 879void security_inode_free(struct inode *inode)
476{ 880{
477 integrity_inode_free(inode); 881 integrity_inode_free(inode);
478 call_void_hook(inode_free_security, inode); 882 call_void_hook(inode_free_security, inode);
883 /*
884 * The inode may still be referenced in a path walk and
885 * a call to security_inode_permission() can be made
886 * after inode_free_security() is called. Ideally, the VFS
887 * wouldn't do this, but fixing that is a much harder
888 * job. For now, simply free the i_security via RCU, and
889 * leave the current inode->i_security pointer intact.
890 * The inode will be freed after the RCU grace period too.
891 */
892 if (inode->i_security)
893 call_rcu((struct rcu_head *)inode->i_security,
894 inode_free_by_rcu);
479} 895}
480 896
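Per the comment above, the cast is only safe because the framework reserves room for a struct rcu_head at the very start of the inode blob. A conceptual layout sketch (illustrative only; no such struct is declared anywhere in the patch):

	/*
	 *   +------------------+ <- inode->i_security
	 *   | struct rcu_head  |   cast target for call_rcu() above
	 *   +------------------+
	 *   | module A data    |   at module A's lbs_inode offset
	 *   | module B data    |   at module B's lbs_inode offset
	 *   +------------------+
	 */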
481int security_dentry_init_security(struct dentry *dentry, int mode, 897int security_dentry_init_security(struct dentry *dentry, int mode,
@@ -905,12 +1321,27 @@ int security_file_permission(struct file *file, int mask)
905 1321
906int security_file_alloc(struct file *file) 1322int security_file_alloc(struct file *file)
907{ 1323{
908 return call_int_hook(file_alloc_security, 0, file);
1324 int rc = lsm_file_alloc(file);
1325
1326 if (rc)
1327 return rc;
1328 rc = call_int_hook(file_alloc_security, 0, file);
1329 if (unlikely(rc))
1330 security_file_free(file);
1331 return rc;
909} 1332}
910 1333
911void security_file_free(struct file *file) 1334void security_file_free(struct file *file)
912{ 1335{
1336 void *blob;
1337
913 call_void_hook(file_free_security, file); 1338 call_void_hook(file_free_security, file);
1339
1340 blob = file->f_security;
1341 if (blob) {
1342 file->f_security = NULL;
1343 kmem_cache_free(lsm_file_cache, blob);
1344 }
914} 1345}
915 1346
916int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 1347int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
@@ -1012,17 +1443,35 @@ int security_file_open(struct file *file)
1012 1443
1013int security_task_alloc(struct task_struct *task, unsigned long clone_flags) 1444int security_task_alloc(struct task_struct *task, unsigned long clone_flags)
1014{ 1445{
1015 return call_int_hook(task_alloc, 0, task, clone_flags);
1446 int rc = lsm_task_alloc(task);
1447
1448 if (rc)
1449 return rc;
1450 rc = call_int_hook(task_alloc, 0, task, clone_flags);
1451 if (unlikely(rc))
1452 security_task_free(task);
1453 return rc;
1016} 1454}
1017 1455
1018void security_task_free(struct task_struct *task) 1456void security_task_free(struct task_struct *task)
1019{ 1457{
1020 call_void_hook(task_free, task); 1458 call_void_hook(task_free, task);
1459
1460 kfree(task->security);
1461 task->security = NULL;
1021} 1462}
1022 1463
1023int security_cred_alloc_blank(struct cred *cred, gfp_t gfp) 1464int security_cred_alloc_blank(struct cred *cred, gfp_t gfp)
1024{ 1465{
1025 return call_int_hook(cred_alloc_blank, 0, cred, gfp);
1466 int rc = lsm_cred_alloc(cred, gfp);
1467
1468 if (rc)
1469 return rc;
1470
1471 rc = call_int_hook(cred_alloc_blank, 0, cred, gfp);
1472 if (unlikely(rc))
1473 security_cred_free(cred);
1474 return rc;
1026} 1475}
1027 1476
1028void security_cred_free(struct cred *cred) 1477void security_cred_free(struct cred *cred)
@@ -1035,11 +1484,22 @@ void security_cred_free(struct cred *cred)
1035 return; 1484 return;
1036 1485
1037 call_void_hook(cred_free, cred); 1486 call_void_hook(cred_free, cred);
1487
1488 kfree(cred->security);
1489 cred->security = NULL;
1038} 1490}
1039 1491
1040int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp) 1492int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp)
1041{ 1493{
1042 return call_int_hook(cred_prepare, 0, new, old, gfp);
1494 int rc = lsm_cred_alloc(new, gfp);
1495
1496 if (rc)
1497 return rc;
1498
1499 rc = call_int_hook(cred_prepare, 0, new, old, gfp);
1500 if (unlikely(rc))
1501 security_cred_free(new);
1502 return rc;
1043} 1503}
1044 1504
1045void security_transfer_creds(struct cred *new, const struct cred *old) 1505void security_transfer_creds(struct cred *new, const struct cred *old)
@@ -1220,22 +1680,40 @@ void security_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid)
1220 1680
1221int security_msg_msg_alloc(struct msg_msg *msg) 1681int security_msg_msg_alloc(struct msg_msg *msg)
1222{ 1682{
1223 return call_int_hook(msg_msg_alloc_security, 0, msg);
1683 int rc = lsm_msg_msg_alloc(msg);
1684
1685 if (unlikely(rc))
1686 return rc;
1687 rc = call_int_hook(msg_msg_alloc_security, 0, msg);
1688 if (unlikely(rc))
1689 security_msg_msg_free(msg);
1690 return rc;
1224} 1691}
1225 1692
1226void security_msg_msg_free(struct msg_msg *msg) 1693void security_msg_msg_free(struct msg_msg *msg)
1227{ 1694{
1228 call_void_hook(msg_msg_free_security, msg); 1695 call_void_hook(msg_msg_free_security, msg);
1696 kfree(msg->security);
1697 msg->security = NULL;
1229} 1698}
1230 1699
1231int security_msg_queue_alloc(struct kern_ipc_perm *msq) 1700int security_msg_queue_alloc(struct kern_ipc_perm *msq)
1232{ 1701{
1233 return call_int_hook(msg_queue_alloc_security, 0, msq);
1702 int rc = lsm_ipc_alloc(msq);
1703
1704 if (unlikely(rc))
1705 return rc;
1706 rc = call_int_hook(msg_queue_alloc_security, 0, msq);
1707 if (unlikely(rc))
1708 security_msg_queue_free(msq);
1709 return rc;
1234} 1710}
1235 1711
1236void security_msg_queue_free(struct kern_ipc_perm *msq) 1712void security_msg_queue_free(struct kern_ipc_perm *msq)
1237{ 1713{
1238 call_void_hook(msg_queue_free_security, msq); 1714 call_void_hook(msg_queue_free_security, msq);
1715 kfree(msq->security);
1716 msq->security = NULL;
1239} 1717}
1240 1718
1241int security_msg_queue_associate(struct kern_ipc_perm *msq, int msqflg) 1719int security_msg_queue_associate(struct kern_ipc_perm *msq, int msqflg)
@@ -1262,12 +1740,21 @@ int security_msg_queue_msgrcv(struct kern_ipc_perm *msq, struct msg_msg *msg,
1262 1740
1263int security_shm_alloc(struct kern_ipc_perm *shp) 1741int security_shm_alloc(struct kern_ipc_perm *shp)
1264{ 1742{
1265 return call_int_hook(shm_alloc_security, 0, shp);
1743 int rc = lsm_ipc_alloc(shp);
1744
1745 if (unlikely(rc))
1746 return rc;
1747 rc = call_int_hook(shm_alloc_security, 0, shp);
1748 if (unlikely(rc))
1749 security_shm_free(shp);
1750 return rc;
1266} 1751}
1267 1752
1268void security_shm_free(struct kern_ipc_perm *shp) 1753void security_shm_free(struct kern_ipc_perm *shp)
1269{ 1754{
1270 call_void_hook(shm_free_security, shp); 1755 call_void_hook(shm_free_security, shp);
1756 kfree(shp->security);
1757 shp->security = NULL;
1271} 1758}
1272 1759
1273int security_shm_associate(struct kern_ipc_perm *shp, int shmflg) 1760int security_shm_associate(struct kern_ipc_perm *shp, int shmflg)
@@ -1287,12 +1774,21 @@ int security_shm_shmat(struct kern_ipc_perm *shp, char __user *shmaddr, int shmf
1287 1774
1288int security_sem_alloc(struct kern_ipc_perm *sma) 1775int security_sem_alloc(struct kern_ipc_perm *sma)
1289{ 1776{
1290 return call_int_hook(sem_alloc_security, 0, sma);
1777 int rc = lsm_ipc_alloc(sma);
1778
1779 if (unlikely(rc))
1780 return rc;
1781 rc = call_int_hook(sem_alloc_security, 0, sma);
1782 if (unlikely(rc))
1783 security_sem_free(sma);
1784 return rc;
1291} 1785}
1292 1786
1293void security_sem_free(struct kern_ipc_perm *sma) 1787void security_sem_free(struct kern_ipc_perm *sma)
1294{ 1788{
1295 call_void_hook(sem_free_security, sma); 1789 call_void_hook(sem_free_security, sma);
1790 kfree(sma->security);
1791 sma->security = NULL;
1296} 1792}
1297 1793
1298int security_sem_associate(struct kern_ipc_perm *sma, int semflg) 1794int security_sem_associate(struct kern_ipc_perm *sma, int semflg)
@@ -1319,14 +1815,30 @@ void security_d_instantiate(struct dentry *dentry, struct inode *inode)
1319} 1815}
1320EXPORT_SYMBOL(security_d_instantiate); 1816EXPORT_SYMBOL(security_d_instantiate);
1321 1817
1322int security_getprocattr(struct task_struct *p, char *name, char **value)
1818int security_getprocattr(struct task_struct *p, const char *lsm, char *name,
1819 char **value)
1323{ 1820{
1324 return call_int_hook(getprocattr, -EINVAL, p, name, value);
1821 struct security_hook_list *hp;
1822
1823 hlist_for_each_entry(hp, &security_hook_heads.getprocattr, list) {
1824 if (lsm != NULL && strcmp(lsm, hp->lsm))
1825 continue;
1826 return hp->hook.getprocattr(p, name, value);
1827 }
1828 return -EINVAL;
1325} 1829}
1326 1830
1327int security_setprocattr(const char *name, void *value, size_t size)
1831int security_setprocattr(const char *lsm, const char *name, void *value,
1832 size_t size)
1328{ 1833{
1329 return call_int_hook(setprocattr, -EINVAL, name, value, size);
1834 struct security_hook_list *hp;
1835
1836 hlist_for_each_entry(hp, &security_hook_heads.setprocattr, list) {
1837 if (lsm != NULL && strcmp(lsm, hp->lsm))
1838 continue;
1839 return hp->hook.setprocattr(name, value, size);
1840 }
1841 return -EINVAL;
1330} 1842}
1331 1843
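Both hooks now walk the registered hook list and can address one module by name. Illustrative callers (hypothetical task pointer; "selinux" must match the lsm name the module registered via security_add_hooks()): /proc/pid/attr/current would pass a NULL lsm, while a per-LSM file would pass an explicit name:

	char *ctx;
	int len;

	/* NULL: the first module that implements the hook answers. */
	len = security_getprocattr(task, NULL, "current", &ctx);
	/* Explicit name: only the named module may answer. */
	len = security_getprocattr(task, "selinux", "current", &ctx);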
1332int security_netlink_send(struct sock *sk, struct sk_buff *skb) 1844int security_netlink_send(struct sock *sk, struct sk_buff *skb)
@@ -1790,11 +2302,9 @@ void security_audit_rule_free(void *lsmrule)
1790 call_void_hook(audit_rule_free, lsmrule); 2302 call_void_hook(audit_rule_free, lsmrule);
1791} 2303}
1792 2304
1793int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule,
1794 struct audit_context *actx)
2305int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule)
1795{ 2306{
1796 return call_int_hook(audit_rule_match, 0, secid, field, op, lsmrule,
1797 actx);
2307 return call_int_hook(audit_rule_match, 0, secid, field, op, lsmrule);
1798} 2308}
1799#endif /* CONFIG_AUDIT */ 2309#endif /* CONFIG_AUDIT */
1800 2310
diff --git a/security/selinux/Kconfig b/security/selinux/Kconfig
index 8af7a690eb40..55f032f1fc2d 100644
--- a/security/selinux/Kconfig
+++ b/security/selinux/Kconfig
@@ -22,21 +22,6 @@ config SECURITY_SELINUX_BOOTPARAM
22 22
23 If you are unsure how to answer this question, answer N. 23 If you are unsure how to answer this question, answer N.
24 24
25config SECURITY_SELINUX_BOOTPARAM_VALUE
26 int "NSA SELinux boot parameter default value"
27 depends on SECURITY_SELINUX_BOOTPARAM
28 range 0 1
29 default 1
30 help
31 This option sets the default value for the kernel parameter
32 'selinux', which allows SELinux to be disabled at boot. If this
33 option is set to 0 (zero), the SELinux kernel parameter will
34 default to 0, disabling SELinux at bootup. If this option is
35 set to 1 (one), the SELinux kernel parameter will default to 1,
36 enabling SELinux at bootup.
37
38 If you are unsure how to answer this question, answer 1.
39
40config SECURITY_SELINUX_DISABLE 25config SECURITY_SELINUX_DISABLE
41 bool "NSA SELinux runtime disable" 26 bool "NSA SELinux runtime disable"
42 depends on SECURITY_SELINUX 27 depends on SECURITY_SELINUX
diff --git a/security/selinux/Makefile b/security/selinux/Makefile
index c7161f8792b2..ccf950409384 100644
--- a/security/selinux/Makefile
+++ b/security/selinux/Makefile
@@ -6,7 +6,7 @@
6obj-$(CONFIG_SECURITY_SELINUX) := selinux.o 6obj-$(CONFIG_SECURITY_SELINUX) := selinux.o
7 7
8selinux-y := avc.o hooks.o selinuxfs.o netlink.o nlmsgtab.o netif.o \ 8selinux-y := avc.o hooks.o selinuxfs.o netlink.o nlmsgtab.o netif.o \
9 netnode.o netport.o ibpkey.o exports.o \
9 netnode.o netport.o ibpkey.o \
10 ss/ebitmap.o ss/hashtab.o ss/symtab.o ss/sidtab.o ss/avtab.o \ 10 ss/ebitmap.o ss/hashtab.o ss/symtab.o ss/sidtab.o ss/avtab.o \
11 ss/policydb.o ss/services.o ss/conditional.o ss/mls.o ss/status.o 11 ss/policydb.o ss/services.o ss/conditional.o ss/mls.o ss/status.o
12 12
diff --git a/security/selinux/avc.c b/security/selinux/avc.c
index 635e5c1e3e48..8346a4f7c5d7 100644
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c
@@ -130,75 +130,6 @@ static inline int avc_hash(u32 ssid, u32 tsid, u16 tclass)
130} 130}
131 131
132/** 132/**
133 * avc_dump_av - Display an access vector in human-readable form.
134 * @tclass: target security class
135 * @av: access vector
136 */
137static void avc_dump_av(struct audit_buffer *ab, u16 tclass, u32 av)
138{
139 const char **perms;
140 int i, perm;
141
142 if (av == 0) {
143 audit_log_format(ab, " null");
144 return;
145 }
146
147 BUG_ON(!tclass || tclass >= ARRAY_SIZE(secclass_map));
148 perms = secclass_map[tclass-1].perms;
149
150 audit_log_format(ab, " {");
151 i = 0;
152 perm = 1;
153 while (i < (sizeof(av) * 8)) {
154 if ((perm & av) && perms[i]) {
155 audit_log_format(ab, " %s", perms[i]);
156 av &= ~perm;
157 }
158 i++;
159 perm <<= 1;
160 }
161
162 if (av)
163 audit_log_format(ab, " 0x%x", av);
164
165 audit_log_format(ab, " }");
166}
167
168/**
169 * avc_dump_query - Display a SID pair and a class in human-readable form.
170 * @ssid: source security identifier
171 * @tsid: target security identifier
172 * @tclass: target security class
173 */
174static void avc_dump_query(struct audit_buffer *ab, struct selinux_state *state,
175 u32 ssid, u32 tsid, u16 tclass)
176{
177 int rc;
178 char *scontext;
179 u32 scontext_len;
180
181 rc = security_sid_to_context(state, ssid, &scontext, &scontext_len);
182 if (rc)
183 audit_log_format(ab, "ssid=%d", ssid);
184 else {
185 audit_log_format(ab, "scontext=%s", scontext);
186 kfree(scontext);
187 }
188
189 rc = security_sid_to_context(state, tsid, &scontext, &scontext_len);
190 if (rc)
191 audit_log_format(ab, " tsid=%d", tsid);
192 else {
193 audit_log_format(ab, " tcontext=%s", scontext);
194 kfree(scontext);
195 }
196
197 BUG_ON(!tclass || tclass >= ARRAY_SIZE(secclass_map));
198 audit_log_format(ab, " tclass=%s", secclass_map[tclass-1].name);
199}
200
201/**
202 * avc_init - Initialize the AVC. 133 * avc_init - Initialize the AVC.
203 * 134 *
204 * Initialize the access vector cache. 135 * Initialize the access vector cache.
@@ -735,11 +666,36 @@ out:
735static void avc_audit_pre_callback(struct audit_buffer *ab, void *a) 666static void avc_audit_pre_callback(struct audit_buffer *ab, void *a)
736{ 667{
737 struct common_audit_data *ad = a; 668 struct common_audit_data *ad = a;
738 audit_log_format(ab, "avc: %s ",
739 ad->selinux_audit_data->denied ? "denied" : "granted");
740 avc_dump_av(ab, ad->selinux_audit_data->tclass,
741 ad->selinux_audit_data->audited);
742 audit_log_format(ab, " for ");
669 struct selinux_audit_data *sad = ad->selinux_audit_data;
670 u32 av = sad->audited;
671 const char **perms;
672 int i, perm;
673
674 audit_log_format(ab, "avc: %s ", sad->denied ? "denied" : "granted");
675
676 if (av == 0) {
677 audit_log_format(ab, " null");
678 return;
679 }
680
681 perms = secclass_map[sad->tclass-1].perms;
682
683 audit_log_format(ab, " {");
684 i = 0;
685 perm = 1;
686 while (i < (sizeof(av) * 8)) {
687 if ((perm & av) && perms[i]) {
688 audit_log_format(ab, " %s", perms[i]);
689 av &= ~perm;
690 }
691 i++;
692 perm <<= 1;
693 }
694
695 if (av)
696 audit_log_format(ab, " 0x%x", av);
697
698 audit_log_format(ab, " } for ");
743} 699}
744 700
745/** 701/**
@@ -751,14 +707,47 @@ static void avc_audit_pre_callback(struct audit_buffer *ab, void *a)
751static void avc_audit_post_callback(struct audit_buffer *ab, void *a) 707static void avc_audit_post_callback(struct audit_buffer *ab, void *a)
752{ 708{
753 struct common_audit_data *ad = a; 709 struct common_audit_data *ad = a;
754 audit_log_format(ab, " ");
755 avc_dump_query(ab, ad->selinux_audit_data->state,
756 ad->selinux_audit_data->ssid,
757 ad->selinux_audit_data->tsid,
758 ad->selinux_audit_data->tclass);
759 if (ad->selinux_audit_data->denied) {
760 audit_log_format(ab, " permissive=%u",
761 ad->selinux_audit_data->result ? 0 : 1);
710 struct selinux_audit_data *sad = ad->selinux_audit_data;
711 char *scontext;
712 u32 scontext_len;
713 int rc;
714
715 rc = security_sid_to_context(sad->state, sad->ssid, &scontext,
716 &scontext_len);
717 if (rc)
718 audit_log_format(ab, " ssid=%d", sad->ssid);
719 else {
720 audit_log_format(ab, " scontext=%s", scontext);
721 kfree(scontext);
722 }
723
724 rc = security_sid_to_context(sad->state, sad->tsid, &scontext,
725 &scontext_len);
726 if (rc)
727 audit_log_format(ab, " tsid=%d", sad->tsid);
728 else {
729 audit_log_format(ab, " tcontext=%s", scontext);
730 kfree(scontext);
731 }
732
733 audit_log_format(ab, " tclass=%s", secclass_map[sad->tclass-1].name);
734
735 if (sad->denied)
736 audit_log_format(ab, " permissive=%u", sad->result ? 0 : 1);
737
738 /* in case of invalid context report also the actual context string */
739 rc = security_sid_to_context_inval(sad->state, sad->ssid, &scontext,
740 &scontext_len);
741 if (!rc && scontext) {
742 audit_log_format(ab, " srawcon=%s", scontext);
743 kfree(scontext);
744 }
745
746 rc = security_sid_to_context_inval(sad->state, sad->tsid, &scontext,
747 &scontext_len);
748 if (!rc && scontext) {
749 audit_log_format(ab, " trawcon=%s", scontext);
750 kfree(scontext);
762 } 751 }
763} 752}
764 753
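Taken together, the two callbacks build one record from the format strings above; an illustrative denial (all field values invented; the fields between "for" and "scontext=" come from the common audit data, dumped between the two callbacks) would read:

	avc: denied { read } for pid=1234 comm="cat" scontext=user_u:user_r:user_t:s0 tcontext=system_u:object_r:etc_t:s0 tclass=file permissive=0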
@@ -772,6 +761,9 @@ noinline int slow_avc_audit(struct selinux_state *state,
772 struct common_audit_data stack_data; 761 struct common_audit_data stack_data;
773 struct selinux_audit_data sad; 762 struct selinux_audit_data sad;
774 763
764 if (WARN_ON(!tclass || tclass >= ARRAY_SIZE(secclass_map)))
765 return -EINVAL;
766
775 if (!a) { 767 if (!a) {
776 a = &stack_data; 768 a = &stack_data;
777 a->type = LSM_AUDIT_DATA_NONE; 769 a->type = LSM_AUDIT_DATA_NONE;
@@ -838,6 +830,7 @@ out:
838 * @ssid,@tsid,@tclass : identifier of an AVC entry 830 * @ssid,@tsid,@tclass : identifier of an AVC entry
839 * @seqno : sequence number when decision was made 831 * @seqno : sequence number when decision was made
840 * @xpd: extended_perms_decision to be added to the node 832 * @xpd: extended_perms_decision to be added to the node
833 * @flags: the AVC_* flags, e.g. AVC_NONBLOCKING, AVC_EXTENDED_PERMS, or 0.
841 * 834 *
842 * if a valid AVC entry doesn't exist,this function returns -ENOENT. 835 * if a valid AVC entry doesn't exist,this function returns -ENOENT.
843 * if kmalloc() called internal returns NULL, this function returns -ENOMEM. 836 * if kmalloc() called internal returns NULL, this function returns -ENOMEM.
@@ -856,6 +849,22 @@ static int avc_update_node(struct selinux_avc *avc,
856 struct hlist_head *head; 849 struct hlist_head *head;
857 spinlock_t *lock; 850 spinlock_t *lock;
858 851
852 /*
853 * If we are in a non-blocking code path, e.g. VFS RCU walk,
854 * then we must not add permissions to a cache entry
855 * because we cannot safely audit the denial. Otherwise,
856 * during the subsequent blocking retry (e.g. VFS ref walk), we
857 * will find the permissions already granted in the cache entry
858 * and won't audit anything at all, leading to silent denials in
859 * permissive mode that only appear when in enforcing mode.
860 *
861 * See the corresponding handling in slow_avc_audit(), and the
862 * logic in selinux_inode_permission for the MAY_NOT_BLOCK flag,
863 * which is transliterated into AVC_NONBLOCKING.
864 */
865 if (flags & AVC_NONBLOCKING)
866 return 0;
867
859 node = avc_alloc_node(avc); 868 node = avc_alloc_node(avc);
860 if (!node) { 869 if (!node) {
861 rc = -ENOMEM; 870 rc = -ENOMEM;
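The non-blocking early return above pairs with the caller-side translation in the selinux_inode_permission() hunk further down, where the VFS's MAY_NOT_BLOCK flag is mapped onto AVC_NONBLOCKING before the lookup:

	rc = avc_has_perm_noaudit(&selinux_state,
				  sid, isec->sid, isec->sclass, perms,
				  (flags & MAY_NOT_BLOCK) ? AVC_NONBLOCKING : 0,
				  &avd);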
@@ -1050,7 +1059,8 @@ int avc_has_extended_perms(struct selinux_state *state,
1050 int rc = 0, rc2; 1059 int rc = 0, rc2;
1051 1060
1052 xp_node = &local_xp_node; 1061 xp_node = &local_xp_node;
1053 BUG_ON(!requested);
1062 if (WARN_ON(!requested))
1063 return -EACCES;
1054 1064
1055 rcu_read_lock(); 1065 rcu_read_lock();
1056 1066
@@ -1115,7 +1125,7 @@ decision:
1115 * @tsid: target security identifier 1125 * @tsid: target security identifier
1116 * @tclass: target security class 1126 * @tclass: target security class
1117 * @requested: requested permissions, interpreted based on @tclass 1127 * @requested: requested permissions, interpreted based on @tclass
1118 * @flags: AVC_STRICT or 0
1128 * @flags: AVC_STRICT, AVC_NONBLOCKING, or 0
1119 * @avd: access vector decisions 1129 * @avd: access vector decisions
1120 * 1130 *
1121 * Check the AVC to determine whether the @requested permissions are granted 1131 * Check the AVC to determine whether the @requested permissions are granted
@@ -1140,7 +1150,8 @@ inline int avc_has_perm_noaudit(struct selinux_state *state,
1140 int rc = 0; 1150 int rc = 0;
1141 u32 denied; 1151 u32 denied;
1142 1152
1143 BUG_ON(!requested);
1153 if (WARN_ON(!requested))
1154 return -EACCES;
1144 1155
1145 rcu_read_lock(); 1156 rcu_read_lock();
1146 1157
@@ -1191,24 +1202,6 @@ int avc_has_perm(struct selinux_state *state, u32 ssid, u32 tsid, u16 tclass,
1191 return rc; 1202 return rc;
1192} 1203}
1193 1204
1194int avc_has_perm_flags(struct selinux_state *state,
1195 u32 ssid, u32 tsid, u16 tclass, u32 requested,
1196 struct common_audit_data *auditdata,
1197 int flags)
1198{
1199 struct av_decision avd;
1200 int rc, rc2;
1201
1202 rc = avc_has_perm_noaudit(state, ssid, tsid, tclass, requested, 0,
1203 &avd);
1204
1205 rc2 = avc_audit(state, ssid, tsid, tclass, requested, &avd, rc,
1206 auditdata, flags);
1207 if (rc2)
1208 return rc2;
1209 return rc;
1210}
1211
1212u32 avc_policy_seqno(struct selinux_state *state) 1205u32 avc_policy_seqno(struct selinux_state *state)
1213{ 1206{
1214 return state->avc->avc_cache.latest_notif; 1207 return state->avc->avc_cache.latest_notif;
diff --git a/security/selinux/exports.c b/security/selinux/exports.c
deleted file mode 100644
index e75dd94e2d2b..000000000000
--- a/security/selinux/exports.c
+++ /dev/null
@@ -1,23 +0,0 @@
1/*
2 * SELinux services exported to the rest of the kernel.
3 *
4 * Author: James Morris <jmorris@redhat.com>
5 *
6 * Copyright (C) 2005 Red Hat, Inc., James Morris <jmorris@redhat.com>
7 * Copyright (C) 2006 Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com>
8 * Copyright (C) 2006 IBM Corporation, Timothy R. Chavez <tinytim@us.ibm.com>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2,
12 * as published by the Free Software Foundation.
13 */
14#include <linux/module.h>
15#include <linux/selinux.h>
16
17#include "security.h"
18
19bool selinux_is_enabled(void)
20{
21 return selinux_enabled;
22}
23EXPORT_SYMBOL_GPL(selinux_is_enabled);
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index f0e36c3492ba..2f82a54f8703 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -79,7 +79,6 @@
79#include <linux/personality.h> 79#include <linux/personality.h>
80#include <linux/audit.h> 80#include <linux/audit.h>
81#include <linux/string.h> 81#include <linux/string.h>
82#include <linux/selinux.h>
83#include <linux/mutex.h> 82#include <linux/mutex.h>
84#include <linux/posix-timers.h> 83#include <linux/posix-timers.h>
85#include <linux/syslog.h> 84#include <linux/syslog.h>
@@ -121,9 +120,8 @@ __setup("enforcing=", enforcing_setup);
121#define selinux_enforcing_boot 1 120#define selinux_enforcing_boot 1
122#endif 121#endif
123 122
123int selinux_enabled __lsm_ro_after_init = 1;
124#ifdef CONFIG_SECURITY_SELINUX_BOOTPARAM 124#ifdef CONFIG_SECURITY_SELINUX_BOOTPARAM
125int selinux_enabled = CONFIG_SECURITY_SELINUX_BOOTPARAM_VALUE;
126
127static int __init selinux_enabled_setup(char *str) 125static int __init selinux_enabled_setup(char *str)
128{ 126{
129 unsigned long enabled; 127 unsigned long enabled;
@@ -132,8 +130,6 @@ static int __init selinux_enabled_setup(char *str)
132 return 1; 130 return 1;
133} 131}
134__setup("selinux=", selinux_enabled_setup); 132__setup("selinux=", selinux_enabled_setup);
135#else
136int selinux_enabled = 1;
137#endif 133#endif
138 134
139static unsigned int selinux_checkreqprot_boot = 135static unsigned int selinux_checkreqprot_boot =
@@ -149,9 +145,6 @@ static int __init checkreqprot_setup(char *str)
149} 145}
150__setup("checkreqprot=", checkreqprot_setup); 146__setup("checkreqprot=", checkreqprot_setup);
151 147
152static struct kmem_cache *sel_inode_cache;
153static struct kmem_cache *file_security_cache;
154
155/** 148/**
156 * selinux_secmark_enabled - Check to see if SECMARK is currently enabled 149 * selinux_secmark_enabled - Check to see if SECMARK is currently enabled
157 * 150 *
@@ -214,12 +207,8 @@ static void cred_init_security(void)
214 struct cred *cred = (struct cred *) current->real_cred; 207 struct cred *cred = (struct cred *) current->real_cred;
215 struct task_security_struct *tsec; 208 struct task_security_struct *tsec;
216 209
217 tsec = kzalloc(sizeof(struct task_security_struct), GFP_KERNEL);
218 if (!tsec)
219 panic("SELinux: Failed to initialize initial task.\n");
220
210 tsec = selinux_cred(cred);
221 tsec->osid = tsec->sid = SECINITSID_KERNEL; 211 tsec->osid = tsec->sid = SECINITSID_KERNEL;
222 cred->security = tsec;
223} 212}
224 213
225/* 214/*
@@ -229,7 +218,7 @@ static inline u32 cred_sid(const struct cred *cred)
229{ 218{
230 const struct task_security_struct *tsec; 219 const struct task_security_struct *tsec;
231 220
232 tsec = cred->security;
221 tsec = selinux_cred(cred);
233 return tsec->sid; 222 return tsec->sid;
234} 223}
235 224
@@ -250,13 +239,9 @@ static inline u32 task_sid(const struct task_struct *task)
250 239
251static int inode_alloc_security(struct inode *inode) 240static int inode_alloc_security(struct inode *inode)
252{ 241{
253 struct inode_security_struct *isec;
242 struct inode_security_struct *isec = selinux_inode(inode);
254 u32 sid = current_sid(); 243 u32 sid = current_sid();
255 244
256 isec = kmem_cache_zalloc(sel_inode_cache, GFP_NOFS);
257 if (!isec)
258 return -ENOMEM;
259
260 spin_lock_init(&isec->lock); 245 spin_lock_init(&isec->lock);
261 INIT_LIST_HEAD(&isec->list); 246 INIT_LIST_HEAD(&isec->list);
262 isec->inode = inode; 247 isec->inode = inode;
@@ -264,7 +249,6 @@ static int inode_alloc_security(struct inode *inode)
264 isec->sclass = SECCLASS_FILE; 249 isec->sclass = SECCLASS_FILE;
265 isec->task_sid = sid; 250 isec->task_sid = sid;
266 isec->initialized = LABEL_INVALID; 251 isec->initialized = LABEL_INVALID;
267 inode->i_security = isec;
268 252
269 return 0; 253 return 0;
270} 254}
@@ -281,7 +265,7 @@ static int __inode_security_revalidate(struct inode *inode,
281 struct dentry *dentry, 265 struct dentry *dentry,
282 bool may_sleep) 266 bool may_sleep)
283{ 267{
284 struct inode_security_struct *isec = inode->i_security;
268 struct inode_security_struct *isec = selinux_inode(inode);
285 269
286 might_sleep_if(may_sleep); 270 might_sleep_if(may_sleep);
287 271
@@ -302,7 +286,7 @@ static int __inode_security_revalidate(struct inode *inode,
302 286
303static struct inode_security_struct *inode_security_novalidate(struct inode *inode) 287static struct inode_security_struct *inode_security_novalidate(struct inode *inode)
304{ 288{
305 return inode->i_security;
289 return selinux_inode(inode);
306} 290}
307 291
308static struct inode_security_struct *inode_security_rcu(struct inode *inode, bool rcu) 292static struct inode_security_struct *inode_security_rcu(struct inode *inode, bool rcu)
@@ -312,7 +296,7 @@ static struct inode_security_struct *inode_security_rcu(struct inode *inode, boo
312 error = __inode_security_revalidate(inode, NULL, !rcu); 296 error = __inode_security_revalidate(inode, NULL, !rcu);
313 if (error) 297 if (error)
314 return ERR_PTR(error); 298 return ERR_PTR(error);
315 return inode->i_security;
299 return selinux_inode(inode);
316} 300}
317 301
318/* 302/*
@@ -321,14 +305,14 @@ static struct inode_security_struct *inode_security_rcu(struct inode *inode, boo
321static struct inode_security_struct *inode_security(struct inode *inode) 305static struct inode_security_struct *inode_security(struct inode *inode)
322{ 306{
323 __inode_security_revalidate(inode, NULL, true); 307 __inode_security_revalidate(inode, NULL, true);
324 return inode->i_security;
308 return selinux_inode(inode);
325} 309}
326 310
327static struct inode_security_struct *backing_inode_security_novalidate(struct dentry *dentry) 311static struct inode_security_struct *backing_inode_security_novalidate(struct dentry *dentry)
328{ 312{
329 struct inode *inode = d_backing_inode(dentry); 313 struct inode *inode = d_backing_inode(dentry);
330 314
331 return inode->i_security;
315 return selinux_inode(inode);
332} 316}
333 317
334/* 318/*
@@ -339,22 +323,17 @@ static struct inode_security_struct *backing_inode_security(struct dentry *dentr
339 struct inode *inode = d_backing_inode(dentry); 323 struct inode *inode = d_backing_inode(dentry);
340 324
341 __inode_security_revalidate(inode, dentry, true); 325 __inode_security_revalidate(inode, dentry, true);
342 return inode->i_security;
326 return selinux_inode(inode);
343}
344
345static void inode_free_rcu(struct rcu_head *head)
346{
347 struct inode_security_struct *isec;
348
349 isec = container_of(head, struct inode_security_struct, rcu);
350 kmem_cache_free(sel_inode_cache, isec);
351} 327}
352 328
353static void inode_free_security(struct inode *inode) 329static void inode_free_security(struct inode *inode)
354{ 330{
355 struct inode_security_struct *isec = inode->i_security;
356 struct superblock_security_struct *sbsec = inode->i_sb->s_security;
331 struct inode_security_struct *isec = selinux_inode(inode);
332 struct superblock_security_struct *sbsec;
357 333
334 if (!isec)
335 return;
336 sbsec = inode->i_sb->s_security;
358 /* 337 /*
359 * As not all inode security structures are in a list, we check for 338 * As not all inode security structures are in a list, we check for
360 * empty list outside of the lock to make sure that we won't waste 339 * empty list outside of the lock to make sure that we won't waste
@@ -370,42 +349,19 @@ static void inode_free_security(struct inode *inode)
370 list_del_init(&isec->list); 349 list_del_init(&isec->list);
371 spin_unlock(&sbsec->isec_lock); 350 spin_unlock(&sbsec->isec_lock);
372 } 351 }
373
374 /*
375 * The inode may still be referenced in a path walk and
376 * a call to selinux_inode_permission() can be made
377 * after inode_free_security() is called. Ideally, the VFS
378 * wouldn't do this, but fixing that is a much harder
379 * job. For now, simply free the i_security via RCU, and
380 * leave the current inode->i_security pointer intact.
381 * The inode will be freed after the RCU grace period too.
382 */
383 call_rcu(&isec->rcu, inode_free_rcu);
384} 352}
385 353
386static int file_alloc_security(struct file *file) 354static int file_alloc_security(struct file *file)
387{ 355{
388 struct file_security_struct *fsec;
356 struct file_security_struct *fsec = selinux_file(file);
389 u32 sid = current_sid(); 357 u32 sid = current_sid();
390 358
391 fsec = kmem_cache_zalloc(file_security_cache, GFP_KERNEL);
392 if (!fsec)
393 return -ENOMEM;
394
395 fsec->sid = sid; 359 fsec->sid = sid;
396 fsec->fown_sid = sid; 360 fsec->fown_sid = sid;
397 file->f_security = fsec;
398 361
399 return 0; 362 return 0;
400} 363}
401 364
402static void file_free_security(struct file *file)
403{
404 struct file_security_struct *fsec = file->f_security;
405 file->f_security = NULL;
406 kmem_cache_free(file_security_cache, fsec);
407}
408
409static int superblock_alloc_security(struct super_block *sb) 365static int superblock_alloc_security(struct super_block *sb)
410{ 366{
411 struct superblock_security_struct *sbsec; 367 struct superblock_security_struct *sbsec;
@@ -501,7 +457,7 @@ static int may_context_mount_sb_relabel(u32 sid,
501 struct superblock_security_struct *sbsec, 457 struct superblock_security_struct *sbsec,
502 const struct cred *cred) 458 const struct cred *cred)
503{ 459{
504 const struct task_security_struct *tsec = cred->security;
460 const struct task_security_struct *tsec = selinux_cred(cred);
505 int rc; 461 int rc;
506 462
507 rc = avc_has_perm(&selinux_state, 463 rc = avc_has_perm(&selinux_state,
@@ -520,7 +476,7 @@ static int may_context_mount_inode_relabel(u32 sid,
520 struct superblock_security_struct *sbsec, 476 struct superblock_security_struct *sbsec,
521 const struct cred *cred) 477 const struct cred *cred)
522{ 478{
523 const struct task_security_struct *tsec = cred->security;
479 const struct task_security_struct *tsec = selinux_cred(cred);
524 int rc; 480 int rc;
525 rc = avc_has_perm(&selinux_state, 481 rc = avc_has_perm(&selinux_state,
526 tsec->sid, sbsec->sid, SECCLASS_FILESYSTEM, 482 tsec->sid, sbsec->sid, SECCLASS_FILESYSTEM,
@@ -534,16 +490,10 @@ static int may_context_mount_inode_relabel(u32 sid,
534 return rc; 490 return rc;
535} 491}
536 492
537static int selinux_is_sblabel_mnt(struct super_block *sb)
493static int selinux_is_genfs_special_handling(struct super_block *sb)
538{ 494{
539 struct superblock_security_struct *sbsec = sb->s_security;
540
495 /* Special handling. Genfs but also in-core setxattr handler */
496 return !strcmp(sb->s_type->name, "sysfs") ||
541 return sbsec->behavior == SECURITY_FS_USE_XATTR ||
542 sbsec->behavior == SECURITY_FS_USE_TRANS ||
543 sbsec->behavior == SECURITY_FS_USE_TASK ||
544 sbsec->behavior == SECURITY_FS_USE_NATIVE ||
545 /* Special handling. Genfs but also in-core setxattr handler */
546 !strcmp(sb->s_type->name, "sysfs") ||
547 !strcmp(sb->s_type->name, "pstore") || 497 !strcmp(sb->s_type->name, "pstore") ||
548 !strcmp(sb->s_type->name, "debugfs") || 498 !strcmp(sb->s_type->name, "debugfs") ||
549 !strcmp(sb->s_type->name, "tracefs") || 499 !strcmp(sb->s_type->name, "tracefs") ||
@@ -553,6 +503,34 @@ static int selinux_is_sblabel_mnt(struct super_block *sb)
553 !strcmp(sb->s_type->name, "cgroup2"))); 503 !strcmp(sb->s_type->name, "cgroup2")));
554} 504}
555 505
506static int selinux_is_sblabel_mnt(struct super_block *sb)
507{
508 struct superblock_security_struct *sbsec = sb->s_security;
509
510 /*
511 * IMPORTANT: Double-check logic in this function when adding a new
512 * SECURITY_FS_USE_* definition!
513 */
514 BUILD_BUG_ON(SECURITY_FS_USE_MAX != 7);
515
516 switch (sbsec->behavior) {
517 case SECURITY_FS_USE_XATTR:
518 case SECURITY_FS_USE_TRANS:
519 case SECURITY_FS_USE_TASK:
520 case SECURITY_FS_USE_NATIVE:
521 return 1;
522
523 case SECURITY_FS_USE_GENFS:
524 return selinux_is_genfs_special_handling(sb);
525
526 /* Never allow relabeling on context mounts */
527 case SECURITY_FS_USE_MNTPOINT:
528 case SECURITY_FS_USE_NONE:
529 default:
530 return 0;
531 }
532}
533
556static int sb_finish_set_opts(struct super_block *sb) 534static int sb_finish_set_opts(struct super_block *sb)
557{ 535{
558 struct superblock_security_struct *sbsec = sb->s_security; 536 struct superblock_security_struct *sbsec = sb->s_security;
@@ -1374,7 +1352,7 @@ static int selinux_genfs_get_sid(struct dentry *dentry,
1374static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dentry) 1352static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dentry)
1375{ 1353{
1376 struct superblock_security_struct *sbsec = NULL; 1354 struct superblock_security_struct *sbsec = NULL;
1377 struct inode_security_struct *isec = inode->i_security;
1355 struct inode_security_struct *isec = selinux_inode(inode);
1378 u32 task_sid, sid = 0; 1356 u32 task_sid, sid = 0;
1379 u16 sclass; 1357 u16 sclass;
1380 struct dentry *dentry; 1358 struct dentry *dentry;
@@ -1621,7 +1599,7 @@ static inline u32 signal_to_av(int sig)
1621 1599
1622/* Check whether a task is allowed to use a capability. */ 1600/* Check whether a task is allowed to use a capability. */
1623static int cred_has_capability(const struct cred *cred, 1601static int cred_has_capability(const struct cred *cred,
1624 int cap, int audit, bool initns)
1602 int cap, unsigned int opts, bool initns)
1625{ 1603{
1626 struct common_audit_data ad; 1604 struct common_audit_data ad;
1627 struct av_decision avd; 1605 struct av_decision avd;
@@ -1648,7 +1626,7 @@ static int cred_has_capability(const struct cred *cred,
1648 1626
1649 rc = avc_has_perm_noaudit(&selinux_state, 1627 rc = avc_has_perm_noaudit(&selinux_state,
1650 sid, sid, sclass, av, 0, &avd); 1628 sid, sid, sclass, av, 0, &avd);
1651 if (audit == SECURITY_CAP_AUDIT) {
1629 if (!(opts & CAP_OPT_NOAUDIT)) {
1652 int rc2 = avc_audit(&selinux_state, 1630 int rc2 = avc_audit(&selinux_state,
1653 sid, sid, sclass, av, &avd, rc, &ad, 0); 1631 sid, sid, sclass, av, &avd, rc, &ad, 0);
1654 if (rc2) 1632 if (rc2)
@@ -1674,7 +1652,7 @@ static int inode_has_perm(const struct cred *cred,
1674 return 0; 1652 return 0;
1675 1653
1676 sid = cred_sid(cred); 1654 sid = cred_sid(cred);
1677 isec = inode->i_security; 1655 isec = selinux_inode(inode);
1678 1656
1679 return avc_has_perm(&selinux_state, 1657 return avc_has_perm(&selinux_state,
1680 sid, isec->sid, isec->sclass, perms, adp); 1658 sid, isec->sid, isec->sclass, perms, adp);
@@ -1740,7 +1718,7 @@ static int file_has_perm(const struct cred *cred,
1740 struct file *file, 1718 struct file *file,
1741 u32 av) 1719 u32 av)
1742{ 1720{
1743 struct file_security_struct *fsec = file->f_security;
1721 struct file_security_struct *fsec = selinux_file(file);
1744 struct inode *inode = file_inode(file); 1722 struct inode *inode = file_inode(file);
1745 struct common_audit_data ad; 1723 struct common_audit_data ad;
1746 u32 sid = cred_sid(cred); 1724 u32 sid = cred_sid(cred);
@@ -1806,7 +1784,7 @@ static int may_create(struct inode *dir,
1806 struct dentry *dentry, 1784 struct dentry *dentry,
1807 u16 tclass) 1785 u16 tclass)
1808{ 1786{
1809 const struct task_security_struct *tsec = current_security();
1787 const struct task_security_struct *tsec = selinux_cred(current_cred());
1810 struct inode_security_struct *dsec; 1788 struct inode_security_struct *dsec;
1811 struct superblock_security_struct *sbsec; 1789 struct superblock_security_struct *sbsec;
1812 u32 sid, newsid; 1790 u32 sid, newsid;
@@ -1828,7 +1806,7 @@ static int may_create(struct inode *dir,
1828 if (rc) 1806 if (rc)
1829 return rc; 1807 return rc;
1830 1808
1831 rc = selinux_determine_inode_label(current_security(), dir,
1809 rc = selinux_determine_inode_label(selinux_cred(current_cred()), dir,
1832 &dentry->d_name, tclass, &newsid); 1810 &dentry->d_name, tclass, &newsid);
1833 if (rc) 1811 if (rc)
1834 return rc; 1812 return rc;
@@ -2084,7 +2062,7 @@ static int selinux_binder_transfer_file(struct task_struct *from,
2084 struct file *file) 2062 struct file *file)
2085{ 2063{
2086 u32 sid = task_sid(to); 2064 u32 sid = task_sid(to);
2087 struct file_security_struct *fsec = file->f_security;
2065 struct file_security_struct *fsec = selinux_file(file);
2088 struct dentry *dentry = file->f_path.dentry; 2066 struct dentry *dentry = file->f_path.dentry;
2089 struct inode_security_struct *isec; 2067 struct inode_security_struct *isec;
2090 struct common_audit_data ad; 2068 struct common_audit_data ad;
@@ -2168,9 +2146,9 @@ static int selinux_capset(struct cred *new, const struct cred *old,
2168 */ 2146 */
2169 2147
2170static int selinux_capable(const struct cred *cred, struct user_namespace *ns, 2148static int selinux_capable(const struct cred *cred, struct user_namespace *ns,
2171 int cap, int audit)
2149 int cap, unsigned int opts)
2172{ 2150{
2173 return cred_has_capability(cred, cap, audit, ns == &init_user_ns);
2151 return cred_has_capability(cred, cap, opts, ns == &init_user_ns);
2174} 2152}
2175 2153
2176static int selinux_quotactl(int cmds, int type, int id, struct super_block *sb) 2154static int selinux_quotactl(int cmds, int type, int id, struct super_block *sb)
@@ -2244,7 +2222,7 @@ static int selinux_vm_enough_memory(struct mm_struct *mm, long pages)
2244 int rc, cap_sys_admin = 0; 2222 int rc, cap_sys_admin = 0;
2245 2223
2246 rc = cred_has_capability(current_cred(), CAP_SYS_ADMIN, 2224 rc = cred_has_capability(current_cred(), CAP_SYS_ADMIN,
2247 SECURITY_CAP_NOAUDIT, true);
2225 CAP_OPT_NOAUDIT, true);
2248 if (rc == 0) 2226 if (rc == 0)
2249 cap_sys_admin = 1; 2227 cap_sys_admin = 1;
2250 2228
@@ -2335,8 +2313,8 @@ static int selinux_bprm_set_creds(struct linux_binprm *bprm)
2335 if (bprm->called_set_creds) 2313 if (bprm->called_set_creds)
2336 return 0; 2314 return 0;
2337 2315
2338 old_tsec = current_security();
2339 new_tsec = bprm->cred->security;
2316 old_tsec = selinux_cred(current_cred());
2317 new_tsec = selinux_cred(bprm->cred);
2340 isec = inode_security(inode); 2318 isec = inode_security(inode);
2341 2319
2342 /* Default to the current task SID. */ 2320 /* Default to the current task SID. */
@@ -2500,7 +2478,7 @@ static void selinux_bprm_committing_creds(struct linux_binprm *bprm)
2500 struct rlimit *rlim, *initrlim; 2478 struct rlimit *rlim, *initrlim;
2501 int rc, i; 2479 int rc, i;
2502 2480
2503 new_tsec = bprm->cred->security;
2481 new_tsec = selinux_cred(bprm->cred);
2504 if (new_tsec->sid == new_tsec->osid) 2482 if (new_tsec->sid == new_tsec->osid)
2505 return; 2483 return;
2506 2484
@@ -2543,7 +2521,7 @@ static void selinux_bprm_committing_creds(struct linux_binprm *bprm)
2543 */ 2521 */
2544static void selinux_bprm_committed_creds(struct linux_binprm *bprm) 2522static void selinux_bprm_committed_creds(struct linux_binprm *bprm)
2545{ 2523{
2546 const struct task_security_struct *tsec = current_security();
2524 const struct task_security_struct *tsec = selinux_cred(current_cred());
2547 struct itimerval itimer; 2525 struct itimerval itimer;
2548 u32 osid, sid; 2526 u32 osid, sid;
2549 int rc, i; 2527 int rc, i;
@@ -2780,7 +2758,7 @@ static int selinux_dentry_init_security(struct dentry *dentry, int mode,
2780 u32 newsid; 2758 u32 newsid;
2781 int rc; 2759 int rc;
2782 2760
2783 rc = selinux_determine_inode_label(current_security(),
2761 rc = selinux_determine_inode_label(selinux_cred(current_cred()),
2784 d_inode(dentry->d_parent), name, 2762 d_inode(dentry->d_parent), name,
2785 inode_mode_to_security_class(mode), 2763 inode_mode_to_security_class(mode),
2786 &newsid); 2764 &newsid);
@@ -2800,14 +2778,14 @@ static int selinux_dentry_create_files_as(struct dentry *dentry, int mode,
2800 int rc; 2778 int rc;
2801 struct task_security_struct *tsec; 2779 struct task_security_struct *tsec;
2802 2780
2803 rc = selinux_determine_inode_label(old->security,
2781 rc = selinux_determine_inode_label(selinux_cred(old),
2804 d_inode(dentry->d_parent), name, 2782 d_inode(dentry->d_parent), name,
2805 inode_mode_to_security_class(mode), 2783 inode_mode_to_security_class(mode),
2806 &newsid); 2784 &newsid);
2807 if (rc) 2785 if (rc)
2808 return rc; 2786 return rc;
2809 2787
2810 tsec = new->security;
2788 tsec = selinux_cred(new);
2811 tsec->create_sid = newsid; 2789 tsec->create_sid = newsid;
2812 return 0; 2790 return 0;
2813} 2791}
@@ -2817,7 +2795,7 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir,
2817 const char **name, 2795 const char **name,
2818 void **value, size_t *len) 2796 void **value, size_t *len)
2819{ 2797{
2820 const struct task_security_struct *tsec = current_security();
2798 const struct task_security_struct *tsec = selinux_cred(current_cred());
2821 struct superblock_security_struct *sbsec; 2799 struct superblock_security_struct *sbsec;
2822 u32 newsid, clen; 2800 u32 newsid, clen;
2823 int rc; 2801 int rc;
@@ -2827,7 +2805,7 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir,
2827 2805
2828 newsid = tsec->create_sid; 2806 newsid = tsec->create_sid;
2829 2807
2830 rc = selinux_determine_inode_label(current_security(),
2808 rc = selinux_determine_inode_label(selinux_cred(current_cred()),
2831 dir, qstr, 2809 dir, qstr,
2832 inode_mode_to_security_class(inode->i_mode), 2810 inode_mode_to_security_class(inode->i_mode),
2833 &newsid); 2811 &newsid);
@@ -2836,7 +2814,7 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir,
2836 2814
2837 /* Possibly defer initialization to selinux_complete_init. */ 2815 /* Possibly defer initialization to selinux_complete_init. */
2838 if (sbsec->flags & SE_SBINITIALIZED) { 2816 if (sbsec->flags & SE_SBINITIALIZED) {
2839 struct inode_security_struct *isec = inode->i_security;
2817 struct inode_security_struct *isec = selinux_inode(inode);
2840 isec->sclass = inode_mode_to_security_class(inode->i_mode); 2818 isec->sclass = inode_mode_to_security_class(inode->i_mode);
2841 isec->sid = newsid; 2819 isec->sid = newsid;
2842 isec->initialized = LABEL_INITIALIZED; 2820 isec->initialized = LABEL_INITIALIZED;
@@ -2925,9 +2903,8 @@ static int selinux_inode_follow_link(struct dentry *dentry, struct inode *inode,
2925 if (IS_ERR(isec)) 2903 if (IS_ERR(isec))
2926 return PTR_ERR(isec); 2904 return PTR_ERR(isec);
2927 2905
2928 return avc_has_perm_flags(&selinux_state,
2929 sid, isec->sid, isec->sclass, FILE__READ, &ad,
2930 rcu ? MAY_NOT_BLOCK : 0);
2906 return avc_has_perm(&selinux_state,
2907 sid, isec->sid, isec->sclass, FILE__READ, &ad);
2931} 2908}
2932 2909
2933static noinline int audit_inode_permission(struct inode *inode, 2910static noinline int audit_inode_permission(struct inode *inode,
@@ -2936,7 +2913,7 @@ static noinline int audit_inode_permission(struct inode *inode,
2936 unsigned flags) 2913 unsigned flags)
2937{ 2914{
2938 struct common_audit_data ad; 2915 struct common_audit_data ad;
2939 struct inode_security_struct *isec = inode->i_security;
2916 struct inode_security_struct *isec = selinux_inode(inode);
2940 int rc; 2917 int rc;
2941 2918
2942 ad.type = LSM_AUDIT_DATA_INODE; 2919 ad.type = LSM_AUDIT_DATA_INODE;
@@ -2982,7 +2959,9 @@ static int selinux_inode_permission(struct inode *inode, int mask)
2982 return PTR_ERR(isec); 2959 return PTR_ERR(isec);
2983 2960
2984 rc = avc_has_perm_noaudit(&selinux_state, 2961 rc = avc_has_perm_noaudit(&selinux_state,
2985 sid, isec->sid, isec->sclass, perms, 0, &avd);
2962 sid, isec->sid, isec->sclass, perms,
2963 (flags & MAY_NOT_BLOCK) ? AVC_NONBLOCKING : 0,
2964 &avd);
2986 audited = avc_audit_required(perms, &avd, rc, 2965 audited = avc_audit_required(perms, &avd, rc,
2987 from_access ? FILE__AUDIT_ACCESS : 0, 2966 from_access ? FILE__AUDIT_ACCESS : 0,
2988 &denied); 2967 &denied);
@@ -3031,11 +3010,11 @@ static int selinux_inode_getattr(const struct path *path)
3031static bool has_cap_mac_admin(bool audit) 3010static bool has_cap_mac_admin(bool audit)
3032{ 3011{
3033 const struct cred *cred = current_cred(); 3012 const struct cred *cred = current_cred();
3034 int cap_audit = audit ? SECURITY_CAP_AUDIT : SECURITY_CAP_NOAUDIT;
3013 unsigned int opts = audit ? CAP_OPT_NONE : CAP_OPT_NOAUDIT;
3035 3014
3036 if (cap_capable(cred, &init_user_ns, CAP_MAC_ADMIN, cap_audit))
3015 if (cap_capable(cred, &init_user_ns, CAP_MAC_ADMIN, opts))
3037 return false; 3016 return false;
3038 if (cred_has_capability(cred, CAP_MAC_ADMIN, cap_audit, true))
3017 if (cred_has_capability(cred, CAP_MAC_ADMIN, opts, true))
3039 return false; 3018 return false;
3040 return true; 3019 return true;
3041} 3020}
@@ -3241,12 +3220,16 @@ static int selinux_inode_setsecurity(struct inode *inode, const char *name,
3241 const void *value, size_t size, int flags) 3220 const void *value, size_t size, int flags)
3242{ 3221{
3243 struct inode_security_struct *isec = inode_security_novalidate(inode); 3222 struct inode_security_struct *isec = inode_security_novalidate(inode);
3223 struct superblock_security_struct *sbsec = inode->i_sb->s_security;
3244 u32 newsid; 3224 u32 newsid;
3245 int rc; 3225 int rc;
3246 3226
3247 if (strcmp(name, XATTR_SELINUX_SUFFIX)) 3227 if (strcmp(name, XATTR_SELINUX_SUFFIX))
3248 return -EOPNOTSUPP; 3228 return -EOPNOTSUPP;
3249 3229
3230 if (!(sbsec->flags & SBLABEL_MNT))
3231 return -EOPNOTSUPP;
3232
3250 if (!value || !size) 3233 if (!value || !size)
3251 return -EACCES; 3234 return -EACCES;
3252 3235
@@ -3289,7 +3272,7 @@ static int selinux_inode_copy_up(struct dentry *src, struct cred **new)
3289 return -ENOMEM; 3272 return -ENOMEM;
3290 } 3273 }
3291 3274
3292 tsec = new_creds->security;
3275 tsec = selinux_cred(new_creds);
3293 /* Get label from overlay inode and set it in create_sid */ 3276 /* Get label from overlay inode and set it in create_sid */
3294 selinux_inode_getsecid(d_inode(src), &sid); 3277 selinux_inode_getsecid(d_inode(src), &sid);
3295 tsec->create_sid = sid; 3278 tsec->create_sid = sid;
@@ -3330,7 +3313,7 @@ static int selinux_revalidate_file_permission(struct file *file, int mask)
3330static int selinux_file_permission(struct file *file, int mask) 3313static int selinux_file_permission(struct file *file, int mask)
3331{ 3314{
3332 struct inode *inode = file_inode(file); 3315 struct inode *inode = file_inode(file);
3333 struct file_security_struct *fsec = file->f_security;
3316 struct file_security_struct *fsec = selinux_file(file);
3334 struct inode_security_struct *isec; 3317 struct inode_security_struct *isec;
3335 u32 sid = current_sid(); 3318 u32 sid = current_sid();
3336 3319
@@ -3352,11 +3335,6 @@ static int selinux_file_alloc_security(struct file *file)
3352 return file_alloc_security(file); 3335 return file_alloc_security(file);
3353} 3336}
3354 3337
3355static void selinux_file_free_security(struct file *file)
3356{
3357 file_free_security(file);
3358}
3359
3360/* 3338/*
3361 * Check whether a task has the ioctl permission and cmd 3339 * Check whether a task has the ioctl permission and cmd
3362 * operation to an inode. 3340 * operation to an inode.
@@ -3365,7 +3343,7 @@ static int ioctl_has_perm(const struct cred *cred, struct file *file,
3365 u32 requested, u16 cmd) 3343 u32 requested, u16 cmd)
3366{ 3344{
3367 struct common_audit_data ad; 3345 struct common_audit_data ad;
3368 struct file_security_struct *fsec = file->f_security;
3346 struct file_security_struct *fsec = selinux_file(file);
3369 struct inode *inode = file_inode(file); 3347 struct inode *inode = file_inode(file);
3370 struct inode_security_struct *isec; 3348 struct inode_security_struct *isec;
3371 struct lsm_ioctlop_audit ioctl; 3349 struct lsm_ioctlop_audit ioctl;
@@ -3435,7 +3413,7 @@ static int selinux_file_ioctl(struct file *file, unsigned int cmd,
3435 case KDSKBENT: 3413 case KDSKBENT:
3436 case KDSKBSENT: 3414 case KDSKBSENT:
3437 error = cred_has_capability(cred, CAP_SYS_TTY_CONFIG, 3415 error = cred_has_capability(cred, CAP_SYS_TTY_CONFIG,
3438 SECURITY_CAP_AUDIT, true);
3416 CAP_OPT_NONE, true);
3439 break; 3417 break;
3440 3418
3441 /* default case assumes that the command will go 3419 /* default case assumes that the command will go
@@ -3617,7 +3595,7 @@ static void selinux_file_set_fowner(struct file *file)
3617{ 3595{
3618 struct file_security_struct *fsec; 3596 struct file_security_struct *fsec;
3619 3597
3620 fsec = file->f_security;
3598 fsec = selinux_file(file);
3621 fsec->fown_sid = current_sid(); 3599 fsec->fown_sid = current_sid();
3622} 3600}
3623 3601
@@ -3632,7 +3610,7 @@ static int selinux_file_send_sigiotask(struct task_struct *tsk,
3632 /* struct fown_struct is never outside the context of a struct file */ 3610 /* struct fown_struct is never outside the context of a struct file */
3633 file = container_of(fown, struct file, f_owner); 3611 file = container_of(fown, struct file, f_owner);
3634 3612
3635 fsec = file->f_security;
3613 fsec = selinux_file(file);
3636 3614
3637 if (!signum) 3615 if (!signum)
3638 perm = signal_to_av(SIGIO); /* as per send_sigio_to_task */ 3616 perm = signal_to_av(SIGIO); /* as per send_sigio_to_task */
@@ -3656,7 +3634,7 @@ static int selinux_file_open(struct file *file)
3656 struct file_security_struct *fsec; 3634 struct file_security_struct *fsec;
3657 struct inode_security_struct *isec; 3635 struct inode_security_struct *isec;
3658 3636
3659 fsec = file->f_security; 3637 fsec = selinux_file(file);
3660 isec = inode_security(file_inode(file)); 3638 isec = inode_security(file_inode(file));
3661 /* 3639 /*
3662 * Save inode label and policy sequence number 3640 * Save inode label and policy sequence number
@@ -3690,52 +3668,15 @@ static int selinux_task_alloc(struct task_struct *task,
3690} 3668}
3691 3669
3692/* 3670/*
3693 * allocate the SELinux part of blank credentials
3694 */
3695static int selinux_cred_alloc_blank(struct cred *cred, gfp_t gfp)
3696{
3697 struct task_security_struct *tsec;
3698
3699 tsec = kzalloc(sizeof(struct task_security_struct), gfp);
3700 if (!tsec)
3701 return -ENOMEM;
3702
3703 cred->security = tsec;
3704 return 0;
3705}
3706
3707/*
3708 * detach and free the LSM part of a set of credentials
3709 */
3710static void selinux_cred_free(struct cred *cred)
3711{
3712 struct task_security_struct *tsec = cred->security;
3713
3714 /*
3715 * cred->security == NULL if security_cred_alloc_blank() or
3716 * security_prepare_creds() returned an error.
3717 */
3718 BUG_ON(cred->security && (unsigned long) cred->security < PAGE_SIZE);
3719 cred->security = (void *) 0x7UL;
3720 kfree(tsec);
3721}
3722
3723/*
3724 * prepare a new set of credentials for modification 3671 * prepare a new set of credentials for modification
3725 */ 3672 */
3726static int selinux_cred_prepare(struct cred *new, const struct cred *old, 3673static int selinux_cred_prepare(struct cred *new, const struct cred *old,
3727 gfp_t gfp) 3674 gfp_t gfp)
3728{ 3675{
3729 const struct task_security_struct *old_tsec; 3676 const struct task_security_struct *old_tsec = selinux_cred(old);
3730 struct task_security_struct *tsec; 3677 struct task_security_struct *tsec = selinux_cred(new);
3731
3732 old_tsec = old->security;
3733
3734 tsec = kmemdup(old_tsec, sizeof(struct task_security_struct), gfp);
3735 if (!tsec)
3736 return -ENOMEM;
3737 3678
3738 new->security = tsec; 3679 *tsec = *old_tsec;
3739 return 0; 3680 return 0;
3740} 3681}
3741 3682
@@ -3744,8 +3685,8 @@ static int selinux_cred_prepare(struct cred *new, const struct cred *old,
3744 */ 3685 */
3745static void selinux_cred_transfer(struct cred *new, const struct cred *old) 3686static void selinux_cred_transfer(struct cred *new, const struct cred *old)
3746{ 3687{
3747 const struct task_security_struct *old_tsec = old->security; 3688 const struct task_security_struct *old_tsec = selinux_cred(old);
3748 struct task_security_struct *tsec = new->security; 3689 struct task_security_struct *tsec = selinux_cred(new);
3749 3690
3750 *tsec = *old_tsec; 3691 *tsec = *old_tsec;
3751} 3692}
@@ -3761,7 +3702,7 @@ static void selinux_cred_getsecid(const struct cred *c, u32 *secid)
3761 */ 3702 */
3762static int selinux_kernel_act_as(struct cred *new, u32 secid) 3703static int selinux_kernel_act_as(struct cred *new, u32 secid)
3763{ 3704{
3764 struct task_security_struct *tsec = new->security; 3705 struct task_security_struct *tsec = selinux_cred(new);
3765 u32 sid = current_sid(); 3706 u32 sid = current_sid();
3766 int ret; 3707 int ret;
3767 3708
@@ -3786,7 +3727,7 @@ static int selinux_kernel_act_as(struct cred *new, u32 secid)
3786static int selinux_kernel_create_files_as(struct cred *new, struct inode *inode) 3727static int selinux_kernel_create_files_as(struct cred *new, struct inode *inode)
3787{ 3728{
3788 struct inode_security_struct *isec = inode_security(inode); 3729 struct inode_security_struct *isec = inode_security(inode);
3789 struct task_security_struct *tsec = new->security; 3730 struct task_security_struct *tsec = selinux_cred(new);
3790 u32 sid = current_sid(); 3731 u32 sid = current_sid();
3791 int ret; 3732 int ret;
3792 3733
@@ -3832,7 +3773,7 @@ static int selinux_kernel_module_from_file(struct file *file)
3832 ad.type = LSM_AUDIT_DATA_FILE; 3773 ad.type = LSM_AUDIT_DATA_FILE;
3833 ad.u.file = file; 3774 ad.u.file = file;
3834 3775
3835 fsec = file->f_security; 3776 fsec = selinux_file(file);
3836 if (sid != fsec->sid) { 3777 if (sid != fsec->sid) {
3837 rc = avc_has_perm(&selinux_state, 3778 rc = avc_has_perm(&selinux_state,
3838 sid, fsec->sid, SECCLASS_FD, FD__USE, &ad); 3779 sid, fsec->sid, SECCLASS_FD, FD__USE, &ad);
@@ -3998,7 +3939,7 @@ static int selinux_task_kill(struct task_struct *p, struct kernel_siginfo *info,
3998static void selinux_task_to_inode(struct task_struct *p, 3939static void selinux_task_to_inode(struct task_struct *p,
3999 struct inode *inode) 3940 struct inode *inode)
4000{ 3941{
4001 struct inode_security_struct *isec = inode->i_security; 3942 struct inode_security_struct *isec = selinux_inode(inode);
4002 u32 sid = task_sid(p); 3943 u32 sid = task_sid(p);
4003 3944
4004 spin_lock(&isec->lock); 3945 spin_lock(&isec->lock);
@@ -4335,7 +4276,7 @@ static int sock_has_perm(struct sock *sk, u32 perms)
4335static int selinux_socket_create(int family, int type, 4276static int selinux_socket_create(int family, int type,
4336 int protocol, int kern) 4277 int protocol, int kern)
4337{ 4278{
4338 const struct task_security_struct *tsec = current_security(); 4279 const struct task_security_struct *tsec = selinux_cred(current_cred());
4339 u32 newsid; 4280 u32 newsid;
4340 u16 secclass; 4281 u16 secclass;
4341 int rc; 4282 int rc;
@@ -4355,7 +4296,7 @@ static int selinux_socket_create(int family, int type,
4355static int selinux_socket_post_create(struct socket *sock, int family, 4296static int selinux_socket_post_create(struct socket *sock, int family,
4356 int type, int protocol, int kern) 4297 int type, int protocol, int kern)
4357{ 4298{
4358 const struct task_security_struct *tsec = current_security(); 4299 const struct task_security_struct *tsec = selinux_cred(current_cred());
4359 struct inode_security_struct *isec = inode_security_novalidate(SOCK_INODE(sock)); 4300 struct inode_security_struct *isec = inode_security_novalidate(SOCK_INODE(sock));
4360 struct sk_security_struct *sksec; 4301 struct sk_security_struct *sksec;
4361 u16 sclass = socket_type_to_security_class(family, type, protocol); 4302 u16 sclass = socket_type_to_security_class(family, type, protocol);
@@ -5236,7 +5177,7 @@ static int selinux_secmark_relabel_packet(u32 sid)
5236 const struct task_security_struct *__tsec; 5177 const struct task_security_struct *__tsec;
5237 u32 tsid; 5178 u32 tsid;
5238 5179
5239 __tsec = current_security(); 5180 __tsec = selinux_cred(current_cred());
5240 tsid = __tsec->sid; 5181 tsid = __tsec->sid;
5241 5182
5242 return avc_has_perm(&selinux_state, 5183 return avc_has_perm(&selinux_state,
@@ -5711,51 +5652,22 @@ static int selinux_netlink_send(struct sock *sk, struct sk_buff *skb)
5711 return selinux_nlmsg_perm(sk, skb); 5652 return selinux_nlmsg_perm(sk, skb);
5712} 5653}
5713 5654
5714static int ipc_alloc_security(struct kern_ipc_perm *perm, 5655static void ipc_init_security(struct ipc_security_struct *isec, u16 sclass)
5715 u16 sclass)
5716{ 5656{
5717 struct ipc_security_struct *isec;
5718
5719 isec = kzalloc(sizeof(struct ipc_security_struct), GFP_KERNEL);
5720 if (!isec)
5721 return -ENOMEM;
5722
5723 isec->sclass = sclass; 5657 isec->sclass = sclass;
5724 isec->sid = current_sid(); 5658 isec->sid = current_sid();
5725 perm->security = isec;
5726
5727 return 0;
5728}
5729
5730static void ipc_free_security(struct kern_ipc_perm *perm)
5731{
5732 struct ipc_security_struct *isec = perm->security;
5733 perm->security = NULL;
5734 kfree(isec);
5735} 5659}
5736 5660
5737static int msg_msg_alloc_security(struct msg_msg *msg) 5661static int msg_msg_alloc_security(struct msg_msg *msg)
5738{ 5662{
5739 struct msg_security_struct *msec; 5663 struct msg_security_struct *msec;
5740 5664
5741 msec = kzalloc(sizeof(struct msg_security_struct), GFP_KERNEL); 5665 msec = selinux_msg_msg(msg);
5742 if (!msec)
5743 return -ENOMEM;
5744
5745 msec->sid = SECINITSID_UNLABELED; 5666 msec->sid = SECINITSID_UNLABELED;
5746 msg->security = msec;
5747 5667
5748 return 0; 5668 return 0;
5749} 5669}
5750 5670
5751static void msg_msg_free_security(struct msg_msg *msg)
5752{
5753 struct msg_security_struct *msec = msg->security;
5754
5755 msg->security = NULL;
5756 kfree(msec);
5757}
5758
5759static int ipc_has_perm(struct kern_ipc_perm *ipc_perms, 5671static int ipc_has_perm(struct kern_ipc_perm *ipc_perms,
5760 u32 perms) 5672 u32 perms)
5761{ 5673{
@@ -5763,7 +5675,7 @@ static int ipc_has_perm(struct kern_ipc_perm *ipc_perms,
5763 struct common_audit_data ad; 5675 struct common_audit_data ad;
5764 u32 sid = current_sid(); 5676 u32 sid = current_sid();
5765 5677
5766 isec = ipc_perms->security; 5678 isec = selinux_ipc(ipc_perms);
5767 5679
5768 ad.type = LSM_AUDIT_DATA_IPC; 5680 ad.type = LSM_AUDIT_DATA_IPC;
5769 ad.u.ipc_id = ipc_perms->key; 5681 ad.u.ipc_id = ipc_perms->key;
@@ -5777,11 +5689,6 @@ static int selinux_msg_msg_alloc_security(struct msg_msg *msg)
5777 return msg_msg_alloc_security(msg); 5689 return msg_msg_alloc_security(msg);
5778} 5690}
5779 5691
5780static void selinux_msg_msg_free_security(struct msg_msg *msg)
5781{
5782 msg_msg_free_security(msg);
5783}
5784
5785/* message queue security operations */ 5692/* message queue security operations */
5786static int selinux_msg_queue_alloc_security(struct kern_ipc_perm *msq) 5693static int selinux_msg_queue_alloc_security(struct kern_ipc_perm *msq)
5787{ 5694{
@@ -5790,11 +5697,8 @@ static int selinux_msg_queue_alloc_security(struct kern_ipc_perm *msq)
5790 u32 sid = current_sid(); 5697 u32 sid = current_sid();
5791 int rc; 5698 int rc;
5792 5699
5793 rc = ipc_alloc_security(msq, SECCLASS_MSGQ); 5700 isec = selinux_ipc(msq);
5794 if (rc) 5701 ipc_init_security(isec, SECCLASS_MSGQ);
5795 return rc;
5796
5797 isec = msq->security;
5798 5702
5799 ad.type = LSM_AUDIT_DATA_IPC; 5703 ad.type = LSM_AUDIT_DATA_IPC;
5800 ad.u.ipc_id = msq->key; 5704 ad.u.ipc_id = msq->key;
@@ -5802,16 +5706,7 @@ static int selinux_msg_queue_alloc_security(struct kern_ipc_perm *msq)
5802 rc = avc_has_perm(&selinux_state, 5706 rc = avc_has_perm(&selinux_state,
5803 sid, isec->sid, SECCLASS_MSGQ, 5707 sid, isec->sid, SECCLASS_MSGQ,
5804 MSGQ__CREATE, &ad); 5708 MSGQ__CREATE, &ad);
5805 if (rc) { 5709 return rc;
5806 ipc_free_security(msq);
5807 return rc;
5808 }
5809 return 0;
5810}
5811
5812static void selinux_msg_queue_free_security(struct kern_ipc_perm *msq)
5813{
5814 ipc_free_security(msq);
5815} 5710}
5816 5711
5817static int selinux_msg_queue_associate(struct kern_ipc_perm *msq, int msqflg) 5712static int selinux_msg_queue_associate(struct kern_ipc_perm *msq, int msqflg)
@@ -5820,7 +5715,7 @@ static int selinux_msg_queue_associate(struct kern_ipc_perm *msq, int msqflg)
5820 struct common_audit_data ad; 5715 struct common_audit_data ad;
5821 u32 sid = current_sid(); 5716 u32 sid = current_sid();
5822 5717
5823 isec = msq->security; 5718 isec = selinux_ipc(msq);
5824 5719
5825 ad.type = LSM_AUDIT_DATA_IPC; 5720 ad.type = LSM_AUDIT_DATA_IPC;
5826 ad.u.ipc_id = msq->key; 5721 ad.u.ipc_id = msq->key;
@@ -5869,8 +5764,8 @@ static int selinux_msg_queue_msgsnd(struct kern_ipc_perm *msq, struct msg_msg *m
5869 u32 sid = current_sid(); 5764 u32 sid = current_sid();
5870 int rc; 5765 int rc;
5871 5766
5872 isec = msq->security; 5767 isec = selinux_ipc(msq);
5873 msec = msg->security; 5768 msec = selinux_msg_msg(msg);
5874 5769
5875 /* 5770 /*
5876 * First time through, need to assign label to the message 5771 * First time through, need to assign label to the message
@@ -5917,8 +5812,8 @@ static int selinux_msg_queue_msgrcv(struct kern_ipc_perm *msq, struct msg_msg *m
5917 u32 sid = task_sid(target); 5812 u32 sid = task_sid(target);
5918 int rc; 5813 int rc;
5919 5814
5920 isec = msq->security; 5815 isec = selinux_ipc(msq);
5921 msec = msg->security; 5816 msec = selinux_msg_msg(msg);
5922 5817
5923 ad.type = LSM_AUDIT_DATA_IPC; 5818 ad.type = LSM_AUDIT_DATA_IPC;
5924 ad.u.ipc_id = msq->key; 5819 ad.u.ipc_id = msq->key;
@@ -5941,11 +5836,8 @@ static int selinux_shm_alloc_security(struct kern_ipc_perm *shp)
5941 u32 sid = current_sid(); 5836 u32 sid = current_sid();
5942 int rc; 5837 int rc;
5943 5838
5944 rc = ipc_alloc_security(shp, SECCLASS_SHM); 5839 isec = selinux_ipc(shp);
5945 if (rc) 5840 ipc_init_security(isec, SECCLASS_SHM);
5946 return rc;
5947
5948 isec = shp->security;
5949 5841
5950 ad.type = LSM_AUDIT_DATA_IPC; 5842 ad.type = LSM_AUDIT_DATA_IPC;
5951 ad.u.ipc_id = shp->key; 5843 ad.u.ipc_id = shp->key;
@@ -5953,16 +5845,7 @@ static int selinux_shm_alloc_security(struct kern_ipc_perm *shp)
5953 rc = avc_has_perm(&selinux_state, 5845 rc = avc_has_perm(&selinux_state,
5954 sid, isec->sid, SECCLASS_SHM, 5846 sid, isec->sid, SECCLASS_SHM,
5955 SHM__CREATE, &ad); 5847 SHM__CREATE, &ad);
5956 if (rc) { 5848 return rc;
5957 ipc_free_security(shp);
5958 return rc;
5959 }
5960 return 0;
5961}
5962
5963static void selinux_shm_free_security(struct kern_ipc_perm *shp)
5964{
5965 ipc_free_security(shp);
5966} 5849}
5967 5850
5968static int selinux_shm_associate(struct kern_ipc_perm *shp, int shmflg) 5851static int selinux_shm_associate(struct kern_ipc_perm *shp, int shmflg)
@@ -5971,7 +5854,7 @@ static int selinux_shm_associate(struct kern_ipc_perm *shp, int shmflg)
5971 struct common_audit_data ad; 5854 struct common_audit_data ad;
5972 u32 sid = current_sid(); 5855 u32 sid = current_sid();
5973 5856
5974 isec = shp->security; 5857 isec = selinux_ipc(shp);
5975 5858
5976 ad.type = LSM_AUDIT_DATA_IPC; 5859 ad.type = LSM_AUDIT_DATA_IPC;
5977 ad.u.ipc_id = shp->key; 5860 ad.u.ipc_id = shp->key;
@@ -6038,11 +5921,8 @@ static int selinux_sem_alloc_security(struct kern_ipc_perm *sma)
6038 u32 sid = current_sid(); 5921 u32 sid = current_sid();
6039 int rc; 5922 int rc;
6040 5923
6041 rc = ipc_alloc_security(sma, SECCLASS_SEM); 5924 isec = selinux_ipc(sma);
6042 if (rc) 5925 ipc_init_security(isec, SECCLASS_SEM);
6043 return rc;
6044
6045 isec = sma->security;
6046 5926
6047 ad.type = LSM_AUDIT_DATA_IPC; 5927 ad.type = LSM_AUDIT_DATA_IPC;
6048 ad.u.ipc_id = sma->key; 5928 ad.u.ipc_id = sma->key;
@@ -6050,16 +5930,7 @@ static int selinux_sem_alloc_security(struct kern_ipc_perm *sma)
6050 rc = avc_has_perm(&selinux_state, 5930 rc = avc_has_perm(&selinux_state,
6051 sid, isec->sid, SECCLASS_SEM, 5931 sid, isec->sid, SECCLASS_SEM,
6052 SEM__CREATE, &ad); 5932 SEM__CREATE, &ad);
6053 if (rc) { 5933 return rc;
6054 ipc_free_security(sma);
6055 return rc;
6056 }
6057 return 0;
6058}
6059
6060static void selinux_sem_free_security(struct kern_ipc_perm *sma)
6061{
6062 ipc_free_security(sma);
6063} 5934}
6064 5935
6065static int selinux_sem_associate(struct kern_ipc_perm *sma, int semflg) 5936static int selinux_sem_associate(struct kern_ipc_perm *sma, int semflg)
@@ -6068,7 +5939,7 @@ static int selinux_sem_associate(struct kern_ipc_perm *sma, int semflg)
6068 struct common_audit_data ad; 5939 struct common_audit_data ad;
6069 u32 sid = current_sid(); 5940 u32 sid = current_sid();
6070 5941
6071 isec = sma->security; 5942 isec = selinux_ipc(sma);
6072 5943
6073 ad.type = LSM_AUDIT_DATA_IPC; 5944 ad.type = LSM_AUDIT_DATA_IPC;
6074 ad.u.ipc_id = sma->key; 5945 ad.u.ipc_id = sma->key;
@@ -6154,7 +6025,7 @@ static int selinux_ipc_permission(struct kern_ipc_perm *ipcp, short flag)
6154 6025
6155static void selinux_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid) 6026static void selinux_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid)
6156{ 6027{
6157 struct ipc_security_struct *isec = ipcp->security; 6028 struct ipc_security_struct *isec = selinux_ipc(ipcp);
6158 *secid = isec->sid; 6029 *secid = isec->sid;
6159} 6030}
6160 6031
@@ -6173,7 +6044,7 @@ static int selinux_getprocattr(struct task_struct *p,
6173 unsigned len; 6044 unsigned len;
6174 6045
6175 rcu_read_lock(); 6046 rcu_read_lock();
6176 __tsec = __task_cred(p)->security; 6047 __tsec = selinux_cred(__task_cred(p));
6177 6048
6178 if (current != p) { 6049 if (current != p) {
6179 error = avc_has_perm(&selinux_state, 6050 error = avc_has_perm(&selinux_state,
@@ -6296,7 +6167,7 @@ static int selinux_setprocattr(const char *name, void *value, size_t size)
6296 operation. See selinux_bprm_set_creds for the execve 6167 operation. See selinux_bprm_set_creds for the execve
6297 checks and may_create for the file creation checks. The 6168 checks and may_create for the file creation checks. The
6298 operation will then fail if the context is not permitted. */ 6169 operation will then fail if the context is not permitted. */
6299 tsec = new->security; 6170 tsec = selinux_cred(new);
6300 if (!strcmp(name, "exec")) { 6171 if (!strcmp(name, "exec")) {
6301 tsec->exec_sid = sid; 6172 tsec->exec_sid = sid;
6302 } else if (!strcmp(name, "fscreate")) { 6173 } else if (!strcmp(name, "fscreate")) {
@@ -6380,7 +6251,7 @@ static void selinux_release_secctx(char *secdata, u32 seclen)
6380 6251
6381static void selinux_inode_invalidate_secctx(struct inode *inode) 6252static void selinux_inode_invalidate_secctx(struct inode *inode)
6382{ 6253{
6383 struct inode_security_struct *isec = inode->i_security; 6254 struct inode_security_struct *isec = selinux_inode(inode);
6384 6255
6385 spin_lock(&isec->lock); 6256 spin_lock(&isec->lock);
6386 isec->initialized = LABEL_INVALID; 6257 isec->initialized = LABEL_INVALID;
@@ -6392,7 +6263,10 @@ static void selinux_inode_invalidate_secctx(struct inode *inode)
6392 */ 6263 */
6393static int selinux_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen) 6264static int selinux_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen)
6394{ 6265{
6395 return selinux_inode_setsecurity(inode, XATTR_SELINUX_SUFFIX, ctx, ctxlen, 0); 6266 int rc = selinux_inode_setsecurity(inode, XATTR_SELINUX_SUFFIX,
6267 ctx, ctxlen, 0);
6268 /* Do not return error when suppressing label (SBLABEL_MNT not set). */
6269 return rc == -EOPNOTSUPP ? 0 : rc;
6396} 6270}
6397 6271
6398/* 6272/*
@@ -6425,7 +6299,7 @@ static int selinux_key_alloc(struct key *k, const struct cred *cred,
6425 if (!ksec) 6299 if (!ksec)
6426 return -ENOMEM; 6300 return -ENOMEM;
6427 6301
6428 tsec = cred->security; 6302 tsec = selinux_cred(cred);
6429 if (tsec->keycreate_sid) 6303 if (tsec->keycreate_sid)
6430 ksec->sid = tsec->keycreate_sid; 6304 ksec->sid = tsec->keycreate_sid;
6431 else 6305 else
@@ -6688,6 +6562,14 @@ static void selinux_bpf_prog_free(struct bpf_prog_aux *aux)
6688} 6562}
6689#endif 6563#endif
6690 6564
6565struct lsm_blob_sizes selinux_blob_sizes __lsm_ro_after_init = {
6566 .lbs_cred = sizeof(struct task_security_struct),
6567 .lbs_file = sizeof(struct file_security_struct),
6568 .lbs_inode = sizeof(struct inode_security_struct),
6569 .lbs_ipc = sizeof(struct ipc_security_struct),
6570 .lbs_msg_msg = sizeof(struct msg_security_struct),
6571};
6572
6691static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = { 6573static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
6692 LSM_HOOK_INIT(binder_set_context_mgr, selinux_binder_set_context_mgr), 6574 LSM_HOOK_INIT(binder_set_context_mgr, selinux_binder_set_context_mgr),
6693 LSM_HOOK_INIT(binder_transaction, selinux_binder_transaction), 6575 LSM_HOOK_INIT(binder_transaction, selinux_binder_transaction),
@@ -6757,7 +6639,6 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
6757 6639
6758 LSM_HOOK_INIT(file_permission, selinux_file_permission), 6640 LSM_HOOK_INIT(file_permission, selinux_file_permission),
6759 LSM_HOOK_INIT(file_alloc_security, selinux_file_alloc_security), 6641 LSM_HOOK_INIT(file_alloc_security, selinux_file_alloc_security),
6760 LSM_HOOK_INIT(file_free_security, selinux_file_free_security),
6761 LSM_HOOK_INIT(file_ioctl, selinux_file_ioctl), 6642 LSM_HOOK_INIT(file_ioctl, selinux_file_ioctl),
6762 LSM_HOOK_INIT(mmap_file, selinux_mmap_file), 6643 LSM_HOOK_INIT(mmap_file, selinux_mmap_file),
6763 LSM_HOOK_INIT(mmap_addr, selinux_mmap_addr), 6644 LSM_HOOK_INIT(mmap_addr, selinux_mmap_addr),
@@ -6771,8 +6652,6 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
6771 LSM_HOOK_INIT(file_open, selinux_file_open), 6652 LSM_HOOK_INIT(file_open, selinux_file_open),
6772 6653
6773 LSM_HOOK_INIT(task_alloc, selinux_task_alloc), 6654 LSM_HOOK_INIT(task_alloc, selinux_task_alloc),
6774 LSM_HOOK_INIT(cred_alloc_blank, selinux_cred_alloc_blank),
6775 LSM_HOOK_INIT(cred_free, selinux_cred_free),
6776 LSM_HOOK_INIT(cred_prepare, selinux_cred_prepare), 6655 LSM_HOOK_INIT(cred_prepare, selinux_cred_prepare),
6777 LSM_HOOK_INIT(cred_transfer, selinux_cred_transfer), 6656 LSM_HOOK_INIT(cred_transfer, selinux_cred_transfer),
6778 LSM_HOOK_INIT(cred_getsecid, selinux_cred_getsecid), 6657 LSM_HOOK_INIT(cred_getsecid, selinux_cred_getsecid),
@@ -6800,24 +6679,20 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
6800 LSM_HOOK_INIT(ipc_getsecid, selinux_ipc_getsecid), 6679 LSM_HOOK_INIT(ipc_getsecid, selinux_ipc_getsecid),
6801 6680
6802 LSM_HOOK_INIT(msg_msg_alloc_security, selinux_msg_msg_alloc_security), 6681 LSM_HOOK_INIT(msg_msg_alloc_security, selinux_msg_msg_alloc_security),
6803 LSM_HOOK_INIT(msg_msg_free_security, selinux_msg_msg_free_security),
6804 6682
6805 LSM_HOOK_INIT(msg_queue_alloc_security, 6683 LSM_HOOK_INIT(msg_queue_alloc_security,
6806 selinux_msg_queue_alloc_security), 6684 selinux_msg_queue_alloc_security),
6807 LSM_HOOK_INIT(msg_queue_free_security, selinux_msg_queue_free_security),
6808 LSM_HOOK_INIT(msg_queue_associate, selinux_msg_queue_associate), 6685 LSM_HOOK_INIT(msg_queue_associate, selinux_msg_queue_associate),
6809 LSM_HOOK_INIT(msg_queue_msgctl, selinux_msg_queue_msgctl), 6686 LSM_HOOK_INIT(msg_queue_msgctl, selinux_msg_queue_msgctl),
6810 LSM_HOOK_INIT(msg_queue_msgsnd, selinux_msg_queue_msgsnd), 6687 LSM_HOOK_INIT(msg_queue_msgsnd, selinux_msg_queue_msgsnd),
6811 LSM_HOOK_INIT(msg_queue_msgrcv, selinux_msg_queue_msgrcv), 6688 LSM_HOOK_INIT(msg_queue_msgrcv, selinux_msg_queue_msgrcv),
6812 6689
6813 LSM_HOOK_INIT(shm_alloc_security, selinux_shm_alloc_security), 6690 LSM_HOOK_INIT(shm_alloc_security, selinux_shm_alloc_security),
6814 LSM_HOOK_INIT(shm_free_security, selinux_shm_free_security),
6815 LSM_HOOK_INIT(shm_associate, selinux_shm_associate), 6691 LSM_HOOK_INIT(shm_associate, selinux_shm_associate),
6816 LSM_HOOK_INIT(shm_shmctl, selinux_shm_shmctl), 6692 LSM_HOOK_INIT(shm_shmctl, selinux_shm_shmctl),
6817 LSM_HOOK_INIT(shm_shmat, selinux_shm_shmat), 6693 LSM_HOOK_INIT(shm_shmat, selinux_shm_shmat),
6818 6694
6819 LSM_HOOK_INIT(sem_alloc_security, selinux_sem_alloc_security), 6695 LSM_HOOK_INIT(sem_alloc_security, selinux_sem_alloc_security),
6820 LSM_HOOK_INIT(sem_free_security, selinux_sem_free_security),
6821 LSM_HOOK_INIT(sem_associate, selinux_sem_associate), 6696 LSM_HOOK_INIT(sem_associate, selinux_sem_associate),
6822 LSM_HOOK_INIT(sem_semctl, selinux_sem_semctl), 6697 LSM_HOOK_INIT(sem_semctl, selinux_sem_semctl),
6823 LSM_HOOK_INIT(sem_semop, selinux_sem_semop), 6698 LSM_HOOK_INIT(sem_semop, selinux_sem_semop),
@@ -6928,16 +6803,6 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
6928 6803
6929static __init int selinux_init(void) 6804static __init int selinux_init(void)
6930{ 6805{
6931 if (!security_module_enable("selinux")) {
6932 selinux_enabled = 0;
6933 return 0;
6934 }
6935
6936 if (!selinux_enabled) {
6937 pr_info("SELinux: Disabled at boot.\n");
6938 return 0;
6939 }
6940
6941 pr_info("SELinux: Initializing.\n"); 6806 pr_info("SELinux: Initializing.\n");
6942 6807
6943 memset(&selinux_state, 0, sizeof(selinux_state)); 6808 memset(&selinux_state, 0, sizeof(selinux_state));
@@ -6951,12 +6816,6 @@ static __init int selinux_init(void)
6951 6816
6952 default_noexec = !(VM_DATA_DEFAULT_FLAGS & VM_EXEC); 6817 default_noexec = !(VM_DATA_DEFAULT_FLAGS & VM_EXEC);
6953 6818
6954 sel_inode_cache = kmem_cache_create("selinux_inode_security",
6955 sizeof(struct inode_security_struct),
6956 0, SLAB_PANIC, NULL);
6957 file_security_cache = kmem_cache_create("selinux_file_security",
6958 sizeof(struct file_security_struct),
6959 0, SLAB_PANIC, NULL);
6960 avc_init(); 6819 avc_init();
6961 6820
6962 avtab_cache_init(); 6821 avtab_cache_init();
@@ -6999,6 +6858,9 @@ void selinux_complete_init(void)
6999 all processes and objects when they are created. */ 6858 all processes and objects when they are created. */
7000DEFINE_LSM(selinux) = { 6859DEFINE_LSM(selinux) = {
7001 .name = "selinux", 6860 .name = "selinux",
6861 .flags = LSM_FLAG_LEGACY_MAJOR | LSM_FLAG_EXCLUSIVE,
6862 .enabled = &selinux_enabled,
6863 .blobs = &selinux_blob_sizes,
7002 .init = selinux_init, 6864 .init = selinux_init,
7003}; 6865};
7004 6866
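[Reviewer note] The hooks.c hunks above are the heart of the conversion: SELinux no longer kzalloc()s and kfree()s its own cred, file, inode, ipc and msg_msg structures. It publishes the sizes it needs in selinux_blob_sizes, DEFINE_LSM() now carries a .blobs pointer, and the LSM framework hands every registered module a slice of one shared allocation per object. Below is a minimal user-space sketch of that scheme; the helper names (lsm_set_blob_size, module_cred) and sizes are illustrative stand-ins, and the real accounting lives in the LSM core, which is not part of this diff:

	#include <stdio.h>
	#include <stdlib.h>

	/* One size/offset slot per blob type; only creds are modeled here. */
	struct lsm_blob_sizes {
		int lbs_cred;
	};

	struct cred {
		void *security;		/* one allocation shared by every module */
	};

	static int blob_total;		/* total cred blob size after registration */

	/* At registration the framework converts each module's size request
	 * into that module's offset within the shared allocation. */
	static void lsm_set_blob_size(int *need)
	{
		int offset = blob_total;

		blob_total += *need;
		*need = offset;		/* the module keeps the offset, not the size */
	}

	/* Per-module accessor, shaped like selinux_cred()/smack_cred(). */
	static void *module_cred(const struct cred *cred, int offset)
	{
		return (char *)cred->security + offset;
	}

	int main(void)
	{
		struct lsm_blob_sizes selinux = { .lbs_cred = 64 };	/* made-up sizes */
		struct lsm_blob_sizes smack = { .lbs_cred = 32 };
		struct cred cred;

		lsm_set_blob_size(&selinux.lbs_cred);	/* offset 0 */
		lsm_set_blob_size(&smack.lbs_cred);	/* offset 64 */

		cred.security = calloc(1, blob_total);	/* one allocation, not one per module */
		printf("selinux slice at +%ld, smack slice at +%ld, total %d bytes\n",
		       (long)((char *)module_cred(&cred, selinux.lbs_cred) - (char *)cred.security),
		       (long)((char *)module_cred(&cred, smack.lbs_cred) - (char *)cred.security),
		       blob_total);
		free(cred.security);
		return 0;
	}

Because the blob always exists once the object exists, per-module allocation failure paths disappear, which is what lets selinux_cred_prepare() above collapse to a plain *tsec = *old_tsec structure copy.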
diff --git a/security/selinux/include/audit.h b/security/selinux/include/audit.h
index 1bdf973433cc..682e2b5de2a4 100644
--- a/security/selinux/include/audit.h
+++ b/security/selinux/include/audit.h
@@ -1,9 +1,6 @@
 /*
  * SELinux support for the Audit LSM hooks
  *
- * Most of below header was moved from include/linux/selinux.h which
- * is released under below copyrights:
- *
  * Author: James Morris <jmorris@redhat.com>
 *
  * Copyright (C) 2005 Red Hat, Inc., James Morris <jmorris@redhat.com>
@@ -46,13 +43,11 @@ void selinux_audit_rule_free(void *rule);
  *	@field: the field this rule refers to
  *	@op: the operater the rule uses
  *	@rule: pointer to the audit rule to check against
- *	@actx: the audit context (can be NULL) associated with the check
 *
  *	Returns 1 if the context id matches the rule, 0 if it does not, and
  *	-errno on failure.
  */
-int selinux_audit_rule_match(u32 sid, u32 field, u32 op, void *rule,
-			     struct audit_context *actx);
+int selinux_audit_rule_match(u32 sid, u32 field, u32 op, void *rule);
 
 /**
  * selinux_audit_rule_known - check to see if rule contains selinux fields.
diff --git a/security/selinux/include/avc.h b/security/selinux/include/avc.h
index ef899bcfd2cb..7be0e1e90e8b 100644
--- a/security/selinux/include/avc.h
+++ b/security/selinux/include/avc.h
@@ -142,6 +142,7 @@ static inline int avc_audit(struct selinux_state *state,
 
 #define AVC_STRICT		1 /* Ignore permissive mode. */
 #define AVC_EXTENDED_PERMS	2 /* update extended permissions */
+#define AVC_NONBLOCKING		4 /* non blocking */
 int avc_has_perm_noaudit(struct selinux_state *state,
 			 u32 ssid, u32 tsid,
 			 u16 tclass, u32 requested,
@@ -152,11 +153,6 @@ int avc_has_perm(struct selinux_state *state,
 		 u32 ssid, u32 tsid,
 		 u16 tclass, u32 requested,
 		 struct common_audit_data *auditdata);
-int avc_has_perm_flags(struct selinux_state *state,
-		       u32 ssid, u32 tsid,
-		       u16 tclass, u32 requested,
-		       struct common_audit_data *auditdata,
-		       int flags);
 
 int avc_has_extended_perms(struct selinux_state *state,
 			   u32 ssid, u32 tsid, u16 tclass, u32 requested,
diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h
index cc5e26b0161b..231262d8eac9 100644
--- a/security/selinux/include/objsec.h
+++ b/security/selinux/include/objsec.h
@@ -25,6 +25,8 @@
 #include <linux/binfmts.h>
 #include <linux/in.h>
 #include <linux/spinlock.h>
+#include <linux/lsm_hooks.h>
+#include <linux/msg.h>
 #include <net/net_namespace.h>
 #include "flask.h"
 #include "avc.h"
@@ -56,10 +58,7 @@ enum label_initialized {
 
 struct inode_security_struct {
 	struct inode *inode;	/* back pointer to inode object */
-	union {
-		struct list_head list;	/* list of inode_security_struct */
-		struct rcu_head rcu;	/* for freeing the inode_security_struct */
-	};
+	struct list_head list;	/* list of inode_security_struct */
 	u32 task_sid;		/* SID of creating task */
 	u32 sid;		/* SID of this object */
 	u16 sclass;		/* security class of this object */
@@ -158,4 +157,35 @@ struct bpf_security_struct {
 	u32 sid;  /*SID of bpf obj creater*/
 };
 
+extern struct lsm_blob_sizes selinux_blob_sizes;
+static inline struct task_security_struct *selinux_cred(const struct cred *cred)
+{
+	return cred->security + selinux_blob_sizes.lbs_cred;
+}
+
+static inline struct file_security_struct *selinux_file(const struct file *file)
+{
+	return file->f_security + selinux_blob_sizes.lbs_file;
+}
+
+static inline struct inode_security_struct *selinux_inode(
+						const struct inode *inode)
+{
+	if (unlikely(!inode->i_security))
+		return NULL;
+	return inode->i_security + selinux_blob_sizes.lbs_inode;
+}
+
+static inline struct msg_security_struct *selinux_msg_msg(
+						const struct msg_msg *msg_msg)
+{
+	return msg_msg->security + selinux_blob_sizes.lbs_msg_msg;
+}
+
+static inline struct ipc_security_struct *selinux_ipc(
+						const struct kern_ipc_perm *ipc)
+{
+	return ipc->security + selinux_blob_sizes.lbs_ipc;
+}
+
 #endif /* _SELINUX_OBJSEC_H_ */
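[Reviewer note] Two details of the accessors just added are worth calling out. First, cred->security + selinux_blob_sizes.lbs_cred does arithmetic on a void *, which works because the kernel builds with GNU C extensions (void * arithmetic is byte-granular); strictly standard C would need an explicit cast. Second, only selinux_inode() guards against a NULL blob pointer, presumably because an inode can be seen before its blob is allocated (the Smack hunks later in this patch call lsm_inode_alloc() for exactly that case). A standard-C sketch of the same offset computation, illustrative only:

	#include <stddef.h>

	struct task_security_struct;	/* opaque for the sketch */

	/* Standard-C equivalent of selinux_cred(): step a byte pointer
	 * forward by this module's registered offset into the shared blob. */
	static inline struct task_security_struct *cred_slice(void *blob,
							      size_t offset)
	{
		return (struct task_security_struct *)((char *)blob + offset);
	}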
diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h
index ba8eedf42b90..f68fb25b5702 100644
--- a/security/selinux/include/security.h
+++ b/security/selinux/include/security.h
@@ -255,6 +255,9 @@ int security_sid_to_context(struct selinux_state *state, u32 sid,
 int security_sid_to_context_force(struct selinux_state *state,
 				  u32 sid, char **scontext, u32 *scontext_len);
 
+int security_sid_to_context_inval(struct selinux_state *state,
+				  u32 sid, char **scontext, u32 *scontext_len);
+
 int security_context_to_sid(struct selinux_state *state,
 			    const char *scontext, u32 scontext_len,
 			    u32 *out_sid, gfp_t gfp);
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index f3a5a138a096..145ee62f205a 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -1378,7 +1378,7 @@ static int sel_make_bools(struct selinux_fs_info *fsi)
 			goto out;
 		}
 
-		isec = (struct inode_security_struct *)inode->i_security;
+		isec = selinux_inode(inode);
 		ret = security_genfs_sid(fsi->state, "selinuxfs", page,
 					 SECCLASS_FILE, &sid);
 		if (ret) {
@@ -1953,7 +1953,7 @@ static int sel_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	inode->i_ino = ++fsi->last_ino;
-	isec = (struct inode_security_struct *)inode->i_security;
+	isec = selinux_inode(inode);
 	isec->sid = SECINITSID_DEVNULL;
 	isec->sclass = SECCLASS_CHR_FILE;
 	isec->initialized = LABEL_INITIALIZED;
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index dd44126c8d14..1269e2be3c2d 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -49,7 +49,6 @@
 #include <linux/sched.h>
 #include <linux/audit.h>
 #include <linux/mutex.h>
-#include <linux/selinux.h>
 #include <linux/flex_array.h>
 #include <linux/vmalloc.h>
 #include <net/netlabel.h>
@@ -1281,7 +1280,8 @@ const char *security_get_initial_sid_context(u32 sid)
 
 static int security_sid_to_context_core(struct selinux_state *state,
 					u32 sid, char **scontext,
-					u32 *scontext_len, int force)
+					u32 *scontext_len, int force,
+					int only_invalid)
 {
 	struct policydb *policydb;
 	struct sidtab *sidtab;
@@ -1326,8 +1326,14 @@ static int security_sid_to_context_core(struct selinux_state *state,
 		rc = -EINVAL;
 		goto out_unlock;
 	}
-	rc = context_struct_to_string(policydb, context, scontext,
-				      scontext_len);
+	if (only_invalid && !context->len) {
+		scontext = NULL;
+		scontext_len = 0;
+		rc = 0;
+	} else {
+		rc = context_struct_to_string(policydb, context, scontext,
+					      scontext_len);
+	}
 out_unlock:
 	read_unlock(&state->ss->policy_rwlock);
 out:
@@ -1349,14 +1355,34 @@ int security_sid_to_context(struct selinux_state *state,
 			    u32 sid, char **scontext, u32 *scontext_len)
 {
 	return security_sid_to_context_core(state, sid, scontext,
-					    scontext_len, 0);
+					    scontext_len, 0, 0);
 }
 
 int security_sid_to_context_force(struct selinux_state *state, u32 sid,
 				  char **scontext, u32 *scontext_len)
 {
 	return security_sid_to_context_core(state, sid, scontext,
-					    scontext_len, 1);
+					    scontext_len, 1, 0);
+}
+
+/**
+ * security_sid_to_context_inval - Obtain a context for a given SID if it
+ *                                 is invalid.
+ * @sid: security identifier, SID
+ * @scontext: security context
+ * @scontext_len: length in bytes
+ *
+ * Write the string representation of the context associated with @sid
+ * into a dynamically allocated string of the correct size, but only if the
+ * context is invalid in the current policy. Set @scontext to point to
+ * this string (or NULL if the context is valid) and set @scontext_len to
+ * the length of the string (or 0 if the context is valid).
+ */
+int security_sid_to_context_inval(struct selinux_state *state, u32 sid,
+				  char **scontext, u32 *scontext_len)
+{
+	return security_sid_to_context_core(state, sid, scontext,
+					    scontext_len, 1, 1);
 }
 
 /*
@@ -3376,8 +3402,7 @@ int selinux_audit_rule_known(struct audit_krule *rule)
 	return 0;
 }
 
-int selinux_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule,
-			     struct audit_context *actx)
+int selinux_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule)
 {
 	struct selinux_state *state = &selinux_state;
 	struct context *ctxt;
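[Reviewer note] services.c keeps one core routine and grows it a flag at a time: security_sid_to_context() passes (force=0, only_invalid=0), _force() passes (1, 0), and the new _inval() passes (1, 1). The sketch below models just the only_invalid contract from the kernel-doc above (report a context string only when it failed validation); the SID-to-validity mapping is fabricated for the demo and nothing here is real policy code:

	#include <stdio.h>

	/* Toy model: odd SIDs stand in for contexts that failed validation
	 * at policy load, even SIDs for valid ones. */
	static int sid_to_context_core(unsigned int sid, const char **scontext,
				       int force, int only_invalid)
	{
		int invalid = sid & 1;

		(void)force;			/* not modelled here */
		if (only_invalid && !invalid) {
			*scontext = NULL;	/* valid: suppress the string */
			return 0;
		}
		*scontext = invalid ? "staff_u:bogus_r:bogus_t:s0"
				    : "system_u:object_r:etc_t:s0";
		return 0;
	}

	int main(void)
	{
		const char *ctx;
		unsigned int sid;

		for (sid = 7; sid <= 8; sid++) {
			sid_to_context_core(sid, &ctx, 1, 1);	/* the _inval wrapper */
			printf("sid %u -> %s\n", sid,
			       ctx ? ctx : "(valid, suppressed)");
		}
		return 0;
	}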
diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c
index bd7d18bdb147..7c57cb7e4146 100644
--- a/security/selinux/xfrm.c
+++ b/security/selinux/xfrm.c
@@ -79,7 +79,7 @@ static int selinux_xfrm_alloc_user(struct xfrm_sec_ctx **ctxp,
 				   gfp_t gfp)
 {
 	int rc;
-	const struct task_security_struct *tsec = current_security();
+	const struct task_security_struct *tsec = selinux_cred(current_cred());
 	struct xfrm_sec_ctx *ctx = NULL;
 	u32 str_len;
 
@@ -138,7 +138,7 @@ static void selinux_xfrm_free(struct xfrm_sec_ctx *ctx)
 */
 static int selinux_xfrm_delete(struct xfrm_sec_ctx *ctx)
 {
-	const struct task_security_struct *tsec = current_security();
+	const struct task_security_struct *tsec = selinux_cred(current_cred());
 
 	if (!ctx)
 		return 0;
diff --git a/security/smack/smack.h b/security/smack/smack.h
index f7db791fb566..9c7c95a5c497 100644
--- a/security/smack/smack.h
+++ b/security/smack/smack.h
@@ -24,6 +24,7 @@
 #include <linux/list.h>
 #include <linux/rculist.h>
 #include <linux/lsm_audit.h>
+#include <linux/msg.h>
 
 /*
  * Use IPv6 port labeling if IPv6 is enabled and secmarks
@@ -336,6 +337,7 @@ extern struct smack_known *smack_syslog_label;
 extern struct smack_known *smack_unconfined;
 #endif
 extern int smack_ptrace_rule;
+extern struct lsm_blob_sizes smack_blob_sizes;
 
 extern struct smack_known smack_known_floor;
 extern struct smack_known smack_known_hat;
@@ -356,12 +358,38 @@ extern struct list_head smack_onlycap_list;
 #define SMACK_HASH_SLOTS 16
 extern struct hlist_head smack_known_hash[SMACK_HASH_SLOTS];
 
+static inline struct task_smack *smack_cred(const struct cred *cred)
+{
+	return cred->security + smack_blob_sizes.lbs_cred;
+}
+
+static inline struct smack_known **smack_file(const struct file *file)
+{
+	return (struct smack_known **)(file->f_security +
+				       smack_blob_sizes.lbs_file);
+}
+
+static inline struct inode_smack *smack_inode(const struct inode *inode)
+{
+	return inode->i_security + smack_blob_sizes.lbs_inode;
+}
+
+static inline struct smack_known **smack_msg_msg(const struct msg_msg *msg)
+{
+	return msg->security + smack_blob_sizes.lbs_msg_msg;
+}
+
+static inline struct smack_known **smack_ipc(const struct kern_ipc_perm *ipc)
+{
+	return ipc->security + smack_blob_sizes.lbs_ipc;
+}
+
 /*
  * Is the directory transmuting?
  */
 static inline int smk_inode_transmutable(const struct inode *isp)
 {
-	struct inode_smack *sip = isp->i_security;
+	struct inode_smack *sip = smack_inode(isp);
 	return (sip->smk_flags & SMK_INODE_TRANSMUTE) != 0;
 }
 
@@ -370,7 +398,7 @@ static inline int smk_inode_transmutable(const struct inode *isp)
 */
 static inline struct smack_known *smk_of_inode(const struct inode *isp)
 {
-	struct inode_smack *sip = isp->i_security;
+	struct inode_smack *sip = smack_inode(isp);
 	return sip->smk_inode;
 }
 
@@ -382,13 +410,19 @@ static inline struct smack_known *smk_of_task(const struct task_smack *tsp)
 	return tsp->smk_task;
 }
 
-static inline struct smack_known *smk_of_task_struct(const struct task_struct *t)
+static inline struct smack_known *smk_of_task_struct(
+						const struct task_struct *t)
 {
 	struct smack_known *skp;
+	const struct cred *cred;
 
 	rcu_read_lock();
-	skp = smk_of_task(__task_cred(t)->security);
+
+	cred = __task_cred(t);
+	skp = smk_of_task(smack_cred(cred));
+
 	rcu_read_unlock();
+
 	return skp;
 }
 
@@ -405,7 +439,7 @@ static inline struct smack_known *smk_of_forked(const struct task_smack *tsp)
 */
 static inline struct smack_known *smk_of_current(void)
 {
-	return smk_of_task(current_security());
+	return smk_of_task(smack_cred(current_cred()));
 }
 
 /*
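[Reviewer note] Smack's accessors mirror the SELinux ones, with one twist: for files, msg_msgs and IPC objects the per-object state is a single struct smack_known *, so the blob slice stores the pointer itself and the accessor returns a pointer-to-pointer that hooks assign through (as smack_file_set_fowner() does further down in this patch). A simplified model, with made-up names and a fixed offset:

	#include <stddef.h>

	struct smack_known { const char *label; };
	struct file { void *f_security; };

	static const size_t lbs_file = 0;	/* offset assigned at registration */

	/* Mirror of smack_file(): the slice *is* the label pointer. */
	static struct smack_known **file_slice(const struct file *file)
	{
		return (struct smack_known **)((char *)file->f_security + lbs_file);
	}

	/* Mirror of smack_file_set_fowner(): write through the slice. */
	static void set_fowner(struct file *file, struct smack_known *skp)
	{
		*file_slice(file) = skp;
	}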
diff --git a/security/smack/smack_access.c b/security/smack/smack_access.c
index 9a4c0ad46518..fe2ce3a65822 100644
--- a/security/smack/smack_access.c
+++ b/security/smack/smack_access.c
@@ -275,7 +275,7 @@ out_audit:
 int smk_curacc(struct smack_known *obj_known,
 	       u32 mode, struct smk_audit_info *a)
 {
-	struct task_smack *tsp = current_security();
+	struct task_smack *tsp = smack_cred(current_cred());
 
 	return smk_tskacc(tsp, obj_known, mode, a);
 }
@@ -635,12 +635,12 @@ DEFINE_MUTEX(smack_onlycap_lock);
 */
 bool smack_privileged_cred(int cap, const struct cred *cred)
 {
-	struct task_smack *tsp = cred->security;
+	struct task_smack *tsp = smack_cred(cred);
 	struct smack_known *skp = tsp->smk_task;
 	struct smack_known_list_elem *sklep;
 	int rc;
 
-	rc = cap_capable(cred, &init_user_ns, cap, SECURITY_CAP_AUDIT);
+	rc = cap_capable(cred, &init_user_ns, cap, CAP_OPT_NONE);
 	if (rc)
 		return false;
 
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 430d4f35e55c..424bce4ef21d 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -139,7 +139,7 @@ static int smk_bu_note(char *note, struct smack_known *sskp,
139static int smk_bu_current(char *note, struct smack_known *oskp, 139static int smk_bu_current(char *note, struct smack_known *oskp,
140 int mode, int rc) 140 int mode, int rc)
141{ 141{
142 struct task_smack *tsp = current_security(); 142 struct task_smack *tsp = smack_cred(current_cred());
143 char acc[SMK_NUM_ACCESS_TYPE + 1]; 143 char acc[SMK_NUM_ACCESS_TYPE + 1];
144 144
145 if (rc <= 0) 145 if (rc <= 0)
@@ -160,7 +160,7 @@ static int smk_bu_current(char *note, struct smack_known *oskp,
160#ifdef CONFIG_SECURITY_SMACK_BRINGUP 160#ifdef CONFIG_SECURITY_SMACK_BRINGUP
161static int smk_bu_task(struct task_struct *otp, int mode, int rc) 161static int smk_bu_task(struct task_struct *otp, int mode, int rc)
162{ 162{
163 struct task_smack *tsp = current_security(); 163 struct task_smack *tsp = smack_cred(current_cred());
164 struct smack_known *smk_task = smk_of_task_struct(otp); 164 struct smack_known *smk_task = smk_of_task_struct(otp);
165 char acc[SMK_NUM_ACCESS_TYPE + 1]; 165 char acc[SMK_NUM_ACCESS_TYPE + 1];
166 166
@@ -182,8 +182,8 @@ static int smk_bu_task(struct task_struct *otp, int mode, int rc)
182#ifdef CONFIG_SECURITY_SMACK_BRINGUP 182#ifdef CONFIG_SECURITY_SMACK_BRINGUP
183static int smk_bu_inode(struct inode *inode, int mode, int rc) 183static int smk_bu_inode(struct inode *inode, int mode, int rc)
184{ 184{
185 struct task_smack *tsp = current_security(); 185 struct task_smack *tsp = smack_cred(current_cred());
186 struct inode_smack *isp = inode->i_security; 186 struct inode_smack *isp = smack_inode(inode);
187 char acc[SMK_NUM_ACCESS_TYPE + 1]; 187 char acc[SMK_NUM_ACCESS_TYPE + 1];
188 188
189 if (isp->smk_flags & SMK_INODE_IMPURE) 189 if (isp->smk_flags & SMK_INODE_IMPURE)
@@ -212,10 +212,10 @@ static int smk_bu_inode(struct inode *inode, int mode, int rc)
212#ifdef CONFIG_SECURITY_SMACK_BRINGUP 212#ifdef CONFIG_SECURITY_SMACK_BRINGUP
213static int smk_bu_file(struct file *file, int mode, int rc) 213static int smk_bu_file(struct file *file, int mode, int rc)
214{ 214{
215 struct task_smack *tsp = current_security(); 215 struct task_smack *tsp = smack_cred(current_cred());
216 struct smack_known *sskp = tsp->smk_task; 216 struct smack_known *sskp = tsp->smk_task;
217 struct inode *inode = file_inode(file); 217 struct inode *inode = file_inode(file);
218 struct inode_smack *isp = inode->i_security; 218 struct inode_smack *isp = smack_inode(inode);
219 char acc[SMK_NUM_ACCESS_TYPE + 1]; 219 char acc[SMK_NUM_ACCESS_TYPE + 1];
220 220
221 if (isp->smk_flags & SMK_INODE_IMPURE) 221 if (isp->smk_flags & SMK_INODE_IMPURE)
@@ -242,10 +242,10 @@ static int smk_bu_file(struct file *file, int mode, int rc)
242static int smk_bu_credfile(const struct cred *cred, struct file *file, 242static int smk_bu_credfile(const struct cred *cred, struct file *file,
243 int mode, int rc) 243 int mode, int rc)
244{ 244{
245 struct task_smack *tsp = cred->security; 245 struct task_smack *tsp = smack_cred(cred);
246 struct smack_known *sskp = tsp->smk_task; 246 struct smack_known *sskp = tsp->smk_task;
247 struct inode *inode = file_inode(file); 247 struct inode *inode = file_inode(file);
248 struct inode_smack *isp = inode->i_security; 248 struct inode_smack *isp = smack_inode(inode);
249 char acc[SMK_NUM_ACCESS_TYPE + 1]; 249 char acc[SMK_NUM_ACCESS_TYPE + 1];
250 250
251 if (isp->smk_flags & SMK_INODE_IMPURE) 251 if (isp->smk_flags & SMK_INODE_IMPURE)
@@ -305,50 +305,35 @@ static struct smack_known *smk_fetch(const char *name, struct inode *ip,
305} 305}
306 306
307/** 307/**
308 * new_inode_smack - allocate an inode security blob 308 * init_inode_smack - initialize an inode security blob
309 * @isp: the blob to initialize
309 * @skp: a pointer to the Smack label entry to use in the blob 310 * @skp: a pointer to the Smack label entry to use in the blob
310 * 311 *
311 * Returns the new blob or NULL if there's no memory available
312 */ 312 */
313static struct inode_smack *new_inode_smack(struct smack_known *skp) 313static void init_inode_smack(struct inode *inode, struct smack_known *skp)
314{ 314{
315 struct inode_smack *isp; 315 struct inode_smack *isp = smack_inode(inode);
316
317 isp = kmem_cache_zalloc(smack_inode_cache, GFP_NOFS);
318 if (isp == NULL)
319 return NULL;
320 316
321 isp->smk_inode = skp; 317 isp->smk_inode = skp;
322 isp->smk_flags = 0; 318 isp->smk_flags = 0;
323 mutex_init(&isp->smk_lock); 319 mutex_init(&isp->smk_lock);
324
325 return isp;
326} 320}
327 321
328/** 322/**
329 * new_task_smack - allocate a task security blob 323 * init_task_smack - initialize a task security blob
324 * @tsp: blob to initialize
330 * @task: a pointer to the Smack label for the running task 325 * @task: a pointer to the Smack label for the running task
331 * @forked: a pointer to the Smack label for the forked task 326 * @forked: a pointer to the Smack label for the forked task
332 * @gfp: type of the memory for the allocation
333 * 327 *
334 * Returns the new blob or NULL if there's no memory available
335 */ 328 */
336static struct task_smack *new_task_smack(struct smack_known *task, 329static void init_task_smack(struct task_smack *tsp, struct smack_known *task,
337 struct smack_known *forked, gfp_t gfp) 330 struct smack_known *forked)
338{ 331{
339 struct task_smack *tsp;
340
341 tsp = kzalloc(sizeof(struct task_smack), gfp);
342 if (tsp == NULL)
343 return NULL;
344
345 tsp->smk_task = task; 332 tsp->smk_task = task;
346 tsp->smk_forked = forked; 333 tsp->smk_forked = forked;
347 INIT_LIST_HEAD(&tsp->smk_rules); 334 INIT_LIST_HEAD(&tsp->smk_rules);
348 INIT_LIST_HEAD(&tsp->smk_relabel); 335 INIT_LIST_HEAD(&tsp->smk_relabel);
349 mutex_init(&tsp->smk_rules_lock); 336 mutex_init(&tsp->smk_rules_lock);
350
351 return tsp;
352} 337}
353 338
354/** 339/**
@@ -448,7 +433,7 @@ static int smk_ptrace_rule_check(struct task_struct *tracer,
448 433
449 rcu_read_lock(); 434 rcu_read_lock();
450 tracercred = __task_cred(tracer); 435 tracercred = __task_cred(tracer);
451 tsp = tracercred->security; 436 tsp = smack_cred(tracercred);
452 tracer_known = smk_of_task(tsp); 437 tracer_known = smk_of_task(tsp);
453 438
454 if ((mode & PTRACE_MODE_ATTACH) && 439 if ((mode & PTRACE_MODE_ATTACH) &&
@@ -515,7 +500,7 @@ static int smack_ptrace_traceme(struct task_struct *ptp)
515 int rc; 500 int rc;
516 struct smack_known *skp; 501 struct smack_known *skp;
517 502
518 skp = smk_of_task(current_security()); 503 skp = smk_of_task(smack_cred(current_cred()));
519 504
520 rc = smk_ptrace_rule_check(ptp, skp, PTRACE_MODE_ATTACH, __func__); 505 rc = smk_ptrace_rule_check(ptp, skp, PTRACE_MODE_ATTACH, __func__);
521 return rc; 506 return rc;
@@ -718,6 +703,13 @@ static int smack_set_mnt_opts(struct super_block *sb,
718 if (sp->smk_flags & SMK_SB_INITIALIZED) 703 if (sp->smk_flags & SMK_SB_INITIALIZED)
719 return 0; 704 return 0;
720 705
706 if (inode->i_security == NULL) {
707 int rc = lsm_inode_alloc(inode);
708
709 if (rc)
710 return rc;
711 }
712
721 if (!smack_privileged(CAP_MAC_ADMIN)) { 713 if (!smack_privileged(CAP_MAC_ADMIN)) {
722 /* 714 /*
723 * Unprivileged mounts don't get to specify Smack values. 715 * Unprivileged mounts don't get to specify Smack values.
@@ -782,17 +774,12 @@ static int smack_set_mnt_opts(struct super_block *sb,
782 /* 774 /*
783 * Initialize the root inode. 775 * Initialize the root inode.
784 */ 776 */
785 isp = inode->i_security; 777 init_inode_smack(inode, sp->smk_root);
786 if (isp == NULL) {
787 isp = new_inode_smack(sp->smk_root);
788 if (isp == NULL)
789 return -ENOMEM;
790 inode->i_security = isp;
791 } else
792 isp->smk_inode = sp->smk_root;
793 778
794 if (transmute) 779 if (transmute) {
780 isp = smack_inode(inode);
795 isp->smk_flags |= SMK_INODE_TRANSMUTE; 781 isp->smk_flags |= SMK_INODE_TRANSMUTE;
782 }
796 783
797 return 0; 784 return 0;
798} 785}
@@ -831,7 +818,7 @@ static int smack_sb_statfs(struct dentry *dentry)
831static int smack_bprm_set_creds(struct linux_binprm *bprm) 818static int smack_bprm_set_creds(struct linux_binprm *bprm)
832{ 819{
833 struct inode *inode = file_inode(bprm->file); 820 struct inode *inode = file_inode(bprm->file);
834 struct task_smack *bsp = bprm->cred->security; 821 struct task_smack *bsp = smack_cred(bprm->cred);
835 struct inode_smack *isp; 822 struct inode_smack *isp;
836 struct superblock_smack *sbsp; 823 struct superblock_smack *sbsp;
837 int rc; 824 int rc;
@@ -839,7 +826,7 @@ static int smack_bprm_set_creds(struct linux_binprm *bprm)
839 if (bprm->called_set_creds) 826 if (bprm->called_set_creds)
840 return 0; 827 return 0;
841 828
842 isp = inode->i_security; 829 isp = smack_inode(inode);
843 if (isp->smk_task == NULL || isp->smk_task == bsp->smk_task) 830 if (isp->smk_task == NULL || isp->smk_task == bsp->smk_task)
844 return 0; 831 return 0;
845 832
@@ -890,49 +877,11 @@ static int smack_inode_alloc_security(struct inode *inode)
890{ 877{
891 struct smack_known *skp = smk_of_current(); 878 struct smack_known *skp = smk_of_current();
892 879
893 inode->i_security = new_inode_smack(skp); 880 init_inode_smack(inode, skp);
894 if (inode->i_security == NULL)
895 return -ENOMEM;
896 return 0; 881 return 0;
897} 882}
898 883
899/** 884/**
900 * smack_inode_free_rcu - Free inode_smack blob from cache
901 * @head: the rcu_head for getting inode_smack pointer
902 *
903 * Call back function called from call_rcu() to free
904 * the i_security blob pointer in inode
905 */
906static void smack_inode_free_rcu(struct rcu_head *head)
907{
908 struct inode_smack *issp;
909
910 issp = container_of(head, struct inode_smack, smk_rcu);
911 kmem_cache_free(smack_inode_cache, issp);
912}
913
914/**
915 * smack_inode_free_security - free an inode blob using call_rcu()
916 * @inode: the inode with a blob
917 *
918 * Clears the blob pointer in inode using RCU
919 */
920static void smack_inode_free_security(struct inode *inode)
921{
922 struct inode_smack *issp = inode->i_security;
923
924 /*
925 * The inode may still be referenced in a path walk and
926 * a call to smack_inode_permission() can be made
927 * after smack_inode_free_security() is called.
928 * To avoid race condition free the i_security via RCU
929 * and leave the current inode->i_security pointer intact.
930 * The inode will be freed after the RCU grace period too.
931 */
932 call_rcu(&issp->smk_rcu, smack_inode_free_rcu);
933}
934
935/**
936 * smack_inode_init_security - copy out the smack from an inode 885 * smack_inode_init_security - copy out the smack from an inode
937 * @inode: the newly created inode 886 * @inode: the newly created inode
938 * @dir: containing directory object 887 * @dir: containing directory object
@@ -947,7 +896,7 @@ static int smack_inode_init_security(struct inode *inode, struct inode *dir,
947 const struct qstr *qstr, const char **name, 896 const struct qstr *qstr, const char **name,
948 void **value, size_t *len) 897 void **value, size_t *len)
949{ 898{
950 struct inode_smack *issp = inode->i_security; 899 struct inode_smack *issp = smack_inode(inode);
951 struct smack_known *skp = smk_of_current(); 900 struct smack_known *skp = smk_of_current();
952 struct smack_known *isp = smk_of_inode(inode); 901 struct smack_known *isp = smk_of_inode(inode);
953 struct smack_known *dsp = smk_of_inode(dir); 902 struct smack_known *dsp = smk_of_inode(dir);
@@ -1285,7 +1234,7 @@ static void smack_inode_post_setxattr(struct dentry *dentry, const char *name,
1285 const void *value, size_t size, int flags) 1234 const void *value, size_t size, int flags)
1286{ 1235{
1287 struct smack_known *skp; 1236 struct smack_known *skp;
1288 struct inode_smack *isp = d_backing_inode(dentry)->i_security; 1237 struct inode_smack *isp = smack_inode(d_backing_inode(dentry));
1289 1238
1290 if (strcmp(name, XATTR_NAME_SMACKTRANSMUTE) == 0) { 1239 if (strcmp(name, XATTR_NAME_SMACKTRANSMUTE) == 0) {
1291 isp->smk_flags |= SMK_INODE_TRANSMUTE; 1240 isp->smk_flags |= SMK_INODE_TRANSMUTE;
@@ -1366,7 +1315,7 @@ static int smack_inode_removexattr(struct dentry *dentry, const char *name)
1366 if (rc != 0) 1315 if (rc != 0)
1367 return rc; 1316 return rc;
1368 1317
1369 isp = d_backing_inode(dentry)->i_security; 1318 isp = smack_inode(d_backing_inode(dentry));
1370 /* 1319 /*
1371 * Don't do anything special for these. 1320 * Don't do anything special for these.
1372 * XATTR_NAME_SMACKIPIN 1321 * XATTR_NAME_SMACKIPIN
@@ -1498,25 +1447,13 @@ static void smack_inode_getsecid(struct inode *inode, u32 *secid)
1498 */ 1447 */
1499static int smack_file_alloc_security(struct file *file) 1448static int smack_file_alloc_security(struct file *file)
1500{ 1449{
1501 struct smack_known *skp = smk_of_current(); 1450 struct smack_known **blob = smack_file(file);
1502 1451
1503 file->f_security = skp; 1452 *blob = smk_of_current();
1504 return 0; 1453 return 0;
1505} 1454}
1506 1455
1507/** 1456/**
1508 * smack_file_free_security - clear a file security blob
1509 * @file: the object
1510 *
1511 * The security blob for a file is a pointer to the master
1512 * label list, so no memory is freed.
1513 */
1514static void smack_file_free_security(struct file *file)
1515{
1516 file->f_security = NULL;
1517}
1518
1519/**
1520 * smack_file_ioctl - Smack check on ioctls 1457 * smack_file_ioctl - Smack check on ioctls
1521 * @file: the object 1458 * @file: the object
1522 * @cmd: what to do 1459 * @cmd: what to do
@@ -1653,7 +1590,7 @@ static int smack_mmap_file(struct file *file,
1653 if (unlikely(IS_PRIVATE(file_inode(file)))) 1590 if (unlikely(IS_PRIVATE(file_inode(file))))
1654 return 0; 1591 return 0;
1655 1592
1656 isp = file_inode(file)->i_security; 1593 isp = smack_inode(file_inode(file));
1657 if (isp->smk_mmap == NULL) 1594 if (isp->smk_mmap == NULL)
1658 return 0; 1595 return 0;
1659 sbsp = file_inode(file)->i_sb->s_security; 1596 sbsp = file_inode(file)->i_sb->s_security;
@@ -1662,7 +1599,7 @@ static int smack_mmap_file(struct file *file,
1662 return -EACCES; 1599 return -EACCES;
1663 mkp = isp->smk_mmap; 1600 mkp = isp->smk_mmap;
1664 1601
1665 tsp = current_security(); 1602 tsp = smack_cred(current_cred());
1666 skp = smk_of_current(); 1603 skp = smk_of_current();
1667 rc = 0; 1604 rc = 0;
1668 1605
@@ -1740,7 +1677,9 @@ static int smack_mmap_file(struct file *file,
1740 */ 1677 */
1741static void smack_file_set_fowner(struct file *file) 1678static void smack_file_set_fowner(struct file *file)
1742{ 1679{
1743 file->f_security = smk_of_current(); 1680 struct smack_known **blob = smack_file(file);
1681
1682 *blob = smk_of_current();
1744} 1683}
1745 1684
1746/** 1685/**
@@ -1757,8 +1696,9 @@ static void smack_file_set_fowner(struct file *file)
1757static int smack_file_send_sigiotask(struct task_struct *tsk, 1696static int smack_file_send_sigiotask(struct task_struct *tsk,
1758 struct fown_struct *fown, int signum) 1697 struct fown_struct *fown, int signum)
1759{ 1698{
1699 struct smack_known **blob;
1760 struct smack_known *skp; 1700 struct smack_known *skp;
1761 struct smack_known *tkp = smk_of_task(tsk->cred->security); 1701 struct smack_known *tkp = smk_of_task(smack_cred(tsk->cred));
1762 const struct cred *tcred; 1702 const struct cred *tcred;
1763 struct file *file; 1703 struct file *file;
1764 int rc; 1704 int rc;
@@ -1770,7 +1710,8 @@ static int smack_file_send_sigiotask(struct task_struct *tsk,
1770 file = container_of(fown, struct file, f_owner); 1710 file = container_of(fown, struct file, f_owner);
1771 1711
1772 /* we don't log here as rc can be overridden */ 1713 blob = smack_file(file);
1773 skp = file->f_security; 1713 blob = smack_file(file);
1714 skp = *blob;
1774 rc = smk_access(skp, tkp, MAY_DELIVER, NULL); 1715 rc = smk_access(skp, tkp, MAY_DELIVER, NULL);
1775 rc = smk_bu_note("sigiotask", skp, tkp, MAY_DELIVER, rc); 1716 rc = smk_bu_note("sigiotask", skp, tkp, MAY_DELIVER, rc);
1776 1717
@@ -1811,7 +1752,7 @@ static int smack_file_receive(struct file *file)
1811 if (inode->i_sb->s_magic == SOCKFS_MAGIC) { 1752 if (inode->i_sb->s_magic == SOCKFS_MAGIC) {
1812 sock = SOCKET_I(inode); 1753 sock = SOCKET_I(inode);
1813 ssp = sock->sk->sk_security; 1754 ssp = sock->sk->sk_security;
1814 tsp = current_security(); 1755 tsp = smack_cred(current_cred());
1815 /* 1756 /*
1816 * If the receiving process can't write to the 1757 * If the receiving process can't write to the
1817 * passed socket or if the passed socket can't 1758 * passed socket or if the passed socket can't
@@ -1853,7 +1794,7 @@ static int smack_file_receive(struct file *file)
1853 */ 1794 */
1854static int smack_file_open(struct file *file) 1795static int smack_file_open(struct file *file)
1855{ 1796{
1856 struct task_smack *tsp = file->f_cred->security; 1797 struct task_smack *tsp = smack_cred(file->f_cred);
1857 struct inode *inode = file_inode(file); 1798 struct inode *inode = file_inode(file);
1858 struct smk_audit_info ad; 1799 struct smk_audit_info ad;
1859 int rc; 1800 int rc;
@@ -1881,14 +1822,7 @@ static int smack_file_open(struct file *file)
1881 */ 1822 */
1882static int smack_cred_alloc_blank(struct cred *cred, gfp_t gfp) 1823static int smack_cred_alloc_blank(struct cred *cred, gfp_t gfp)
1883{ 1824{
1884 struct task_smack *tsp; 1825 init_task_smack(smack_cred(cred), NULL, NULL);
1885
1886 tsp = new_task_smack(NULL, NULL, gfp);
1887 if (tsp == NULL)
1888 return -ENOMEM;
1889
1890 cred->security = tsp;
1891
1892 return 0; 1826 return 0;
1893} 1827}
1894 1828
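
With the blob preallocated by the LSM infrastructure, the cred hooks above shrink to plain initialization. A minimal sketch of what init_task_smack() must look like, inferred from the call sites in these hunks (the kernel's version lives in security/smack/smack.h; treat any detail not visible above as an assumption):

	static void init_task_smack(struct task_smack *tsp, struct smack_known *task,
				    struct smack_known *forked)
	{
		/* Labels: NULL for a blank cred, inherited labels otherwise. */
		tsp->smk_task = task;
		tsp->smk_forked = forked;
		/* Empty rule/relabel lists; smack_cred_prepare() copies into them. */
		INIT_LIST_HEAD(&tsp->smk_rules);
		INIT_LIST_HEAD(&tsp->smk_relabel);
		mutex_init(&tsp->smk_rules_lock);
	}

Note the dropped gfp argument relative to new_task_smack(): nothing is allocated here, which is also why smack_cred_alloc_blank() can no longer fail.
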
@@ -1900,15 +1834,11 @@ static int smack_cred_alloc_blank(struct cred *cred, gfp_t gfp)
1900 */ 1834 */
1901static void smack_cred_free(struct cred *cred) 1835static void smack_cred_free(struct cred *cred)
1902{ 1836{
1903 struct task_smack *tsp = cred->security; 1837 struct task_smack *tsp = smack_cred(cred);
1904 struct smack_rule *rp; 1838 struct smack_rule *rp;
1905 struct list_head *l; 1839 struct list_head *l;
1906 struct list_head *n; 1840 struct list_head *n;
1907 1841
1908 if (tsp == NULL)
1909 return;
1910 cred->security = NULL;
1911
1912 smk_destroy_label_list(&tsp->smk_relabel); 1842 smk_destroy_label_list(&tsp->smk_relabel);
1913 1843
1914 list_for_each_safe(l, n, &tsp->smk_rules) { 1844 list_for_each_safe(l, n, &tsp->smk_rules) {
@@ -1916,7 +1846,6 @@ static void smack_cred_free(struct cred *cred)
1916 list_del(&rp->list); 1846 list_del(&rp->list);
1917 kfree(rp); 1847 kfree(rp);
1918 } 1848 }
1919 kfree(tsp);
1920} 1849}
1921 1850
1922/** 1851/**
@@ -1930,15 +1859,11 @@ static void smack_cred_free(struct cred *cred)
1930static int smack_cred_prepare(struct cred *new, const struct cred *old, 1859static int smack_cred_prepare(struct cred *new, const struct cred *old,
1931 gfp_t gfp) 1860 gfp_t gfp)
1932{ 1861{
1933 struct task_smack *old_tsp = old->security; 1862 struct task_smack *old_tsp = smack_cred(old);
1934 struct task_smack *new_tsp; 1863 struct task_smack *new_tsp = smack_cred(new);
1935 int rc; 1864 int rc;
1936 1865
1937 new_tsp = new_task_smack(old_tsp->smk_task, old_tsp->smk_task, gfp); 1866 init_task_smack(new_tsp, old_tsp->smk_task, old_tsp->smk_task);
1938 if (new_tsp == NULL)
1939 return -ENOMEM;
1940
1941 new->security = new_tsp;
1942 1867
1943 rc = smk_copy_rules(&new_tsp->smk_rules, &old_tsp->smk_rules, gfp); 1868 rc = smk_copy_rules(&new_tsp->smk_rules, &old_tsp->smk_rules, gfp);
1944 if (rc != 0) 1869 if (rc != 0)
@@ -1946,10 +1871,7 @@ static int smack_cred_prepare(struct cred *new, const struct cred *old,
1946 1871
1947 rc = smk_copy_relabel(&new_tsp->smk_relabel, &old_tsp->smk_relabel, 1872 rc = smk_copy_relabel(&new_tsp->smk_relabel, &old_tsp->smk_relabel,
1948 gfp); 1873 gfp);
1949 if (rc != 0) 1874 return rc;
1950 return rc;
1951
1952 return 0;
1953} 1875}
1954 1876
1955/** 1877/**
@@ -1961,15 +1883,14 @@ static int smack_cred_prepare(struct cred *new, const struct cred *old,
1961 */ 1883 */
1962static void smack_cred_transfer(struct cred *new, const struct cred *old) 1884static void smack_cred_transfer(struct cred *new, const struct cred *old)
1963{ 1885{
1964 struct task_smack *old_tsp = old->security; 1886 struct task_smack *old_tsp = smack_cred(old);
1965 struct task_smack *new_tsp = new->security; 1887 struct task_smack *new_tsp = smack_cred(new);
1966 1888
1967 new_tsp->smk_task = old_tsp->smk_task; 1889 new_tsp->smk_task = old_tsp->smk_task;
1968 new_tsp->smk_forked = old_tsp->smk_task; 1890 new_tsp->smk_forked = old_tsp->smk_task;
1969 mutex_init(&new_tsp->smk_rules_lock); 1891 mutex_init(&new_tsp->smk_rules_lock);
1970 INIT_LIST_HEAD(&new_tsp->smk_rules); 1892 INIT_LIST_HEAD(&new_tsp->smk_rules);
1971 1893
1972
1973 /* cbs copy rule list */ 1894 /* cbs copy rule list */
1974} 1895}
1975 1896
@@ -1980,12 +1901,12 @@ static void smack_cred_transfer(struct cred *new, const struct cred *old)
1980 * 1901 *
1981 * Sets the secid to contain a u32 version of the smack label. 1902 * Sets the secid to contain a u32 version of the smack label.
1982 */ 1903 */
1983static void smack_cred_getsecid(const struct cred *c, u32 *secid) 1904static void smack_cred_getsecid(const struct cred *cred, u32 *secid)
1984{ 1905{
1985 struct smack_known *skp; 1906 struct smack_known *skp;
1986 1907
1987 rcu_read_lock(); 1908 rcu_read_lock();
1988 skp = smk_of_task(c->security); 1909 skp = smk_of_task(smack_cred(cred));
1989 *secid = skp->smk_secid; 1910 *secid = skp->smk_secid;
1990 rcu_read_unlock(); 1911 rcu_read_unlock();
1991} 1912}
@@ -1999,7 +1920,7 @@ static void smack_cred_getsecid(const struct cred *c, u32 *secid)
1999 */ 1920 */
2000static int smack_kernel_act_as(struct cred *new, u32 secid) 1921static int smack_kernel_act_as(struct cred *new, u32 secid)
2001{ 1922{
2002 struct task_smack *new_tsp = new->security; 1923 struct task_smack *new_tsp = smack_cred(new);
2003 1924
2004 new_tsp->smk_task = smack_from_secid(secid); 1925 new_tsp->smk_task = smack_from_secid(secid);
2005 return 0; 1926 return 0;
@@ -2016,8 +1937,8 @@ static int smack_kernel_act_as(struct cred *new, u32 secid)
2016static int smack_kernel_create_files_as(struct cred *new, 1937static int smack_kernel_create_files_as(struct cred *new,
2017 struct inode *inode) 1938 struct inode *inode)
2018{ 1939{
2019 struct inode_smack *isp = inode->i_security; 1940 struct inode_smack *isp = smack_inode(inode);
2020 struct task_smack *tsp = new->security; 1941 struct task_smack *tsp = smack_cred(new);
2021 1942
2022 tsp->smk_forked = isp->smk_inode; 1943 tsp->smk_forked = isp->smk_inode;
2023 tsp->smk_task = tsp->smk_forked; 1944 tsp->smk_task = tsp->smk_forked;
@@ -2201,7 +2122,7 @@ static int smack_task_kill(struct task_struct *p, struct kernel_siginfo *info,
2201 * specific behavior. This is not clean. For one thing 2122 * specific behavior. This is not clean. For one thing
2202 * we can't take privilege into account. 2123 * we can't take privilege into account.
2203 */ 2124 */
2204 skp = smk_of_task(cred->security); 2125 skp = smk_of_task(smack_cred(cred));
2205 rc = smk_access(skp, tkp, MAY_DELIVER, &ad); 2126 rc = smk_access(skp, tkp, MAY_DELIVER, &ad);
2206 rc = smk_bu_note("USB signal", skp, tkp, MAY_DELIVER, rc); 2127 rc = smk_bu_note("USB signal", skp, tkp, MAY_DELIVER, rc);
2207 return rc; 2128 return rc;
@@ -2216,7 +2137,7 @@ static int smack_task_kill(struct task_struct *p, struct kernel_siginfo *info,
2216 */ 2137 */
2217static void smack_task_to_inode(struct task_struct *p, struct inode *inode) 2138static void smack_task_to_inode(struct task_struct *p, struct inode *inode)
2218{ 2139{
2219 struct inode_smack *isp = inode->i_security; 2140 struct inode_smack *isp = smack_inode(inode);
2220 struct smack_known *skp = smk_of_task_struct(p); 2141 struct smack_known *skp = smk_of_task_struct(p);
2221 2142
2222 isp->smk_inode = skp; 2143 isp->smk_inode = skp;
@@ -2679,7 +2600,7 @@ static int smack_inode_setsecurity(struct inode *inode, const char *name,
2679 const void *value, size_t size, int flags) 2600 const void *value, size_t size, int flags)
2680{ 2601{
2681 struct smack_known *skp; 2602 struct smack_known *skp;
2682 struct inode_smack *nsp = inode->i_security; 2603 struct inode_smack *nsp = smack_inode(inode);
2683 struct socket_smack *ssp; 2604 struct socket_smack *ssp;
2684 struct socket *sock; 2605 struct socket *sock;
2685 int rc = 0; 2606 int rc = 0;
@@ -2888,24 +2809,13 @@ static int smack_flags_to_may(int flags)
2888 */ 2809 */
2889static int smack_msg_msg_alloc_security(struct msg_msg *msg) 2810static int smack_msg_msg_alloc_security(struct msg_msg *msg)
2890{ 2811{
2891 struct smack_known *skp = smk_of_current(); 2812 struct smack_known **blob = smack_msg_msg(msg);
2892 2813
2893 msg->security = skp; 2814 *blob = smk_of_current();
2894 return 0; 2815 return 0;
2895} 2816}
2896 2817
2897/** 2818/**
2898 * smack_msg_msg_free_security - Clear the security blob for msg_msg
2899 * @msg: the object
2900 *
2901 * Clears the blob pointer
2902 */
2903static void smack_msg_msg_free_security(struct msg_msg *msg)
2904{
2905 msg->security = NULL;
2906}
2907
2908/**
2909 * smack_of_ipc - the smack pointer for the ipc 2819 * smack_of_ipc - the smack pointer for the ipc
2910 * @isp: the object 2820 * @isp: the object
2911 * 2821 *
@@ -2913,7 +2823,9 @@ static void smack_msg_msg_free_security(struct msg_msg *msg)
2913 */ 2823 */
2914static struct smack_known *smack_of_ipc(struct kern_ipc_perm *isp) 2824static struct smack_known *smack_of_ipc(struct kern_ipc_perm *isp)
2915{ 2825{
2916 return (struct smack_known *)isp->security; 2826 struct smack_known **blob = smack_ipc(isp);
2827
2828 return *blob;
2917} 2829}
2918 2830
2919/** 2831/**
@@ -2924,24 +2836,13 @@ static struct smack_known *smack_of_ipc(struct kern_ipc_perm *isp)
2924 */ 2836 */
2925static int smack_ipc_alloc_security(struct kern_ipc_perm *isp) 2837static int smack_ipc_alloc_security(struct kern_ipc_perm *isp)
2926{ 2838{
2927 struct smack_known *skp = smk_of_current(); 2839 struct smack_known **blob = smack_ipc(isp);
2928 2840
2929 isp->security = skp; 2841 *blob = smk_of_current();
2930 return 0; 2842 return 0;
2931} 2843}
2932 2844
2933/** 2845/**
2934 * smack_ipc_free_security - Clear the security blob for ipc
2935 * @isp: the object
2936 *
2937 * Clears the blob pointer
2938 */
2939static void smack_ipc_free_security(struct kern_ipc_perm *isp)
2940{
2941 isp->security = NULL;
2942}
2943
2944/**
2945 * smk_curacc_shm : check if current has access on shm 2846 * smk_curacc_shm : check if current has access on shm
2946 * @isp : the object 2847 * @isp : the object
2947 * @access : access requested 2848 * @access : access requested
@@ -3238,7 +3139,8 @@ static int smack_msg_queue_msgrcv(struct kern_ipc_perm *isp, struct msg_msg *msg
3238 */ 3139 */
3239static int smack_ipc_permission(struct kern_ipc_perm *ipp, short flag) 3140static int smack_ipc_permission(struct kern_ipc_perm *ipp, short flag)
3240{ 3141{
3241 struct smack_known *iskp = ipp->security; 3142 struct smack_known **blob = smack_ipc(ipp);
3143 struct smack_known *iskp = *blob;
3242 int may = smack_flags_to_may(flag); 3144 int may = smack_flags_to_may(flag);
3243 struct smk_audit_info ad; 3145 struct smk_audit_info ad;
3244 int rc; 3146 int rc;
@@ -3259,7 +3161,8 @@ static int smack_ipc_permission(struct kern_ipc_perm *ipp, short flag)
3259 */ 3161 */
3260static void smack_ipc_getsecid(struct kern_ipc_perm *ipp, u32 *secid) 3162static void smack_ipc_getsecid(struct kern_ipc_perm *ipp, u32 *secid)
3261{ 3163{
3262 struct smack_known *iskp = ipp->security; 3164 struct smack_known **blob = smack_ipc(ipp);
3165 struct smack_known *iskp = *blob;
3263 3166
3264 *secid = iskp->smk_secid; 3167 *secid = iskp->smk_secid;
3265} 3168}
@@ -3287,7 +3190,7 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode)
3287 if (inode == NULL) 3190 if (inode == NULL)
3288 return; 3191 return;
3289 3192
3290 isp = inode->i_security; 3193 isp = smack_inode(inode);
3291 3194
3292 mutex_lock(&isp->smk_lock); 3195 mutex_lock(&isp->smk_lock);
3293 /* 3196 /*
@@ -3390,13 +3293,12 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode)
3390 */ 3293 */
3391 final = &smack_known_star; 3294 final = &smack_known_star;
3392 /* 3295 /*
3393 * Fall through.
3394 *
3395 * If a smack value has been set we want to use it, 3296 * If a smack value has been set we want to use it,
3396 * but since tmpfs isn't giving us the opportunity 3297 * but since tmpfs isn't giving us the opportunity
3397 * to set mount options simulate setting the 3298 * to set mount options simulate setting the
3398 * superblock default. 3299 * superblock default.
3399 */ 3300 */
3301 /* Fall through */
3400 default: 3302 default:
3401 /* 3303 /*
3402 * This isn't an understood special case. 3304 * This isn't an understood special case.
@@ -3528,7 +3430,7 @@ static int smack_getprocattr(struct task_struct *p, char *name, char **value)
3528 */ 3430 */
3529static int smack_setprocattr(const char *name, void *value, size_t size) 3431static int smack_setprocattr(const char *name, void *value, size_t size)
3530{ 3432{
3531 struct task_smack *tsp = current_security(); 3433 struct task_smack *tsp = smack_cred(current_cred());
3532 struct cred *new; 3434 struct cred *new;
3533 struct smack_known *skp; 3435 struct smack_known *skp;
3534 struct smack_known_list_elem *sklep; 3436 struct smack_known_list_elem *sklep;
@@ -3569,7 +3471,7 @@ static int smack_setprocattr(const char *name, void *value, size_t size)
3569 if (new == NULL) 3471 if (new == NULL)
3570 return -ENOMEM; 3472 return -ENOMEM;
3571 3473
3572 tsp = new->security; 3474 tsp = smack_cred(new);
3573 tsp->smk_task = skp; 3475 tsp->smk_task = skp;
3574 /* 3476 /*
3575 * process can change its label only once 3477 * process can change its label only once
@@ -4214,7 +4116,7 @@ static void smack_inet_csk_clone(struct sock *sk,
4214static int smack_key_alloc(struct key *key, const struct cred *cred, 4116static int smack_key_alloc(struct key *key, const struct cred *cred,
4215 unsigned long flags) 4117 unsigned long flags)
4216{ 4118{
4217 struct smack_known *skp = smk_of_task(cred->security); 4119 struct smack_known *skp = smk_of_task(smack_cred(cred));
4218 4120
4219 key->security = skp; 4121 key->security = skp;
4220 return 0; 4122 return 0;
@@ -4245,7 +4147,7 @@ static int smack_key_permission(key_ref_t key_ref,
4245{ 4147{
4246 struct key *keyp; 4148 struct key *keyp;
4247 struct smk_audit_info ad; 4149 struct smk_audit_info ad;
4248 struct smack_known *tkp = smk_of_task(cred->security); 4150 struct smack_known *tkp = smk_of_task(smack_cred(cred));
4249 int request = 0; 4151 int request = 0;
4250 int rc; 4152 int rc;
4251 4153
@@ -4393,13 +4295,11 @@ static int smack_audit_rule_known(struct audit_krule *krule)
4393 * @field: audit rule flags given from user-space 4295 * @field: audit rule flags given from user-space
4394 * @op: required testing operator 4296 * @op: required testing operator
4395 * @vrule: smack internal rule presentation 4297 * @vrule: smack internal rule presentation
4396 * @actx: audit context associated with the check
4397 * 4298 *
4398 * The core Audit hook. It's used to decide whether or 4299 * The core Audit hook. It's used to decide whether or
4399 * not to audit a given object. 4300 * not to audit a given object.
4400 */ 4301 */
4401static int smack_audit_rule_match(u32 secid, u32 field, u32 op, void *vrule, 4302static int smack_audit_rule_match(u32 secid, u32 field, u32 op, void *vrule)
4402 struct audit_context *actx)
4403{ 4303{
4404 struct smack_known *skp; 4304 struct smack_known *skp;
4405 char *rule = vrule; 4305 char *rule = vrule;
@@ -4520,12 +4420,12 @@ static int smack_inode_copy_up(struct dentry *dentry, struct cred **new)
4520 return -ENOMEM; 4420 return -ENOMEM;
4521 } 4421 }
4522 4422
4523 tsp = new_creds->security; 4423 tsp = smack_cred(new_creds);
4524 4424
4525 /* 4425 /*
4526 * Get label from overlay inode and set it in create_sid 4426 * Get label from overlay inode and set it in create_sid
4527 */ 4427 */
4528 isp = d_inode(dentry->d_parent)->i_security; 4428 isp = smack_inode(d_inode(dentry->d_parent));
4529 skp = isp->smk_inode; 4429 skp = isp->smk_inode;
4530 tsp->smk_task = skp; 4430 tsp->smk_task = skp;
4531 *new = new_creds; 4431 *new = new_creds;
@@ -4548,8 +4448,8 @@ static int smack_dentry_create_files_as(struct dentry *dentry, int mode,
4548 const struct cred *old, 4448 const struct cred *old,
4549 struct cred *new) 4449 struct cred *new)
4550{ 4450{
4551 struct task_smack *otsp = old->security; 4451 struct task_smack *otsp = smack_cred(old);
4552 struct task_smack *ntsp = new->security; 4452 struct task_smack *ntsp = smack_cred(new);
4553 struct inode_smack *isp; 4453 struct inode_smack *isp;
4554 int may; 4454 int may;
4555 4455
@@ -4562,7 +4462,7 @@ static int smack_dentry_create_files_as(struct dentry *dentry, int mode,
4562 /* 4462 /*
4563 * the attribute of the containing directory 4463 * the attribute of the containing directory
4564 */ 4464 */
4565 isp = d_inode(dentry->d_parent)->i_security; 4465 isp = smack_inode(d_inode(dentry->d_parent));
4566 4466
4567 if (isp->smk_flags & SMK_INODE_TRANSMUTE) { 4467 if (isp->smk_flags & SMK_INODE_TRANSMUTE) {
4568 rcu_read_lock(); 4468 rcu_read_lock();
@@ -4582,6 +4482,14 @@ static int smack_dentry_create_files_as(struct dentry *dentry, int mode,
4582 return 0; 4482 return 0;
4583} 4483}
4584 4484
4485struct lsm_blob_sizes smack_blob_sizes __lsm_ro_after_init = {
4486 .lbs_cred = sizeof(struct task_smack),
4487 .lbs_file = sizeof(struct smack_known *),
4488 .lbs_inode = sizeof(struct inode_smack),
4489 .lbs_ipc = sizeof(struct smack_known *),
4490 .lbs_msg_msg = sizeof(struct smack_known *),
4491};
4492
4585static struct security_hook_list smack_hooks[] __lsm_ro_after_init = { 4493static struct security_hook_list smack_hooks[] __lsm_ro_after_init = {
4586 LSM_HOOK_INIT(ptrace_access_check, smack_ptrace_access_check), 4494 LSM_HOOK_INIT(ptrace_access_check, smack_ptrace_access_check),
4587 LSM_HOOK_INIT(ptrace_traceme, smack_ptrace_traceme), 4495 LSM_HOOK_INIT(ptrace_traceme, smack_ptrace_traceme),
@@ -4597,7 +4505,6 @@ static struct security_hook_list smack_hooks[] __lsm_ro_after_init = {
4597 LSM_HOOK_INIT(bprm_set_creds, smack_bprm_set_creds), 4505 LSM_HOOK_INIT(bprm_set_creds, smack_bprm_set_creds),
4598 4506
4599 LSM_HOOK_INIT(inode_alloc_security, smack_inode_alloc_security), 4507 LSM_HOOK_INIT(inode_alloc_security, smack_inode_alloc_security),
4600 LSM_HOOK_INIT(inode_free_security, smack_inode_free_security),
4601 LSM_HOOK_INIT(inode_init_security, smack_inode_init_security), 4508 LSM_HOOK_INIT(inode_init_security, smack_inode_init_security),
4602 LSM_HOOK_INIT(inode_link, smack_inode_link), 4509 LSM_HOOK_INIT(inode_link, smack_inode_link),
4603 LSM_HOOK_INIT(inode_unlink, smack_inode_unlink), 4510 LSM_HOOK_INIT(inode_unlink, smack_inode_unlink),
@@ -4616,7 +4523,6 @@ static struct security_hook_list smack_hooks[] __lsm_ro_after_init = {
4616 LSM_HOOK_INIT(inode_getsecid, smack_inode_getsecid), 4523 LSM_HOOK_INIT(inode_getsecid, smack_inode_getsecid),
4617 4524
4618 LSM_HOOK_INIT(file_alloc_security, smack_file_alloc_security), 4525 LSM_HOOK_INIT(file_alloc_security, smack_file_alloc_security),
4619 LSM_HOOK_INIT(file_free_security, smack_file_free_security),
4620 LSM_HOOK_INIT(file_ioctl, smack_file_ioctl), 4526 LSM_HOOK_INIT(file_ioctl, smack_file_ioctl),
4621 LSM_HOOK_INIT(file_lock, smack_file_lock), 4527 LSM_HOOK_INIT(file_lock, smack_file_lock),
4622 LSM_HOOK_INIT(file_fcntl, smack_file_fcntl), 4528 LSM_HOOK_INIT(file_fcntl, smack_file_fcntl),
@@ -4652,23 +4558,19 @@ static struct security_hook_list smack_hooks[] __lsm_ro_after_init = {
4652 LSM_HOOK_INIT(ipc_getsecid, smack_ipc_getsecid), 4558 LSM_HOOK_INIT(ipc_getsecid, smack_ipc_getsecid),
4653 4559
4654 LSM_HOOK_INIT(msg_msg_alloc_security, smack_msg_msg_alloc_security), 4560 LSM_HOOK_INIT(msg_msg_alloc_security, smack_msg_msg_alloc_security),
4655 LSM_HOOK_INIT(msg_msg_free_security, smack_msg_msg_free_security),
4656 4561
4657 LSM_HOOK_INIT(msg_queue_alloc_security, smack_ipc_alloc_security), 4562 LSM_HOOK_INIT(msg_queue_alloc_security, smack_ipc_alloc_security),
4658 LSM_HOOK_INIT(msg_queue_free_security, smack_ipc_free_security),
4659 LSM_HOOK_INIT(msg_queue_associate, smack_msg_queue_associate), 4563 LSM_HOOK_INIT(msg_queue_associate, smack_msg_queue_associate),
4660 LSM_HOOK_INIT(msg_queue_msgctl, smack_msg_queue_msgctl), 4564 LSM_HOOK_INIT(msg_queue_msgctl, smack_msg_queue_msgctl),
4661 LSM_HOOK_INIT(msg_queue_msgsnd, smack_msg_queue_msgsnd), 4565 LSM_HOOK_INIT(msg_queue_msgsnd, smack_msg_queue_msgsnd),
4662 LSM_HOOK_INIT(msg_queue_msgrcv, smack_msg_queue_msgrcv), 4566 LSM_HOOK_INIT(msg_queue_msgrcv, smack_msg_queue_msgrcv),
4663 4567
4664 LSM_HOOK_INIT(shm_alloc_security, smack_ipc_alloc_security), 4568 LSM_HOOK_INIT(shm_alloc_security, smack_ipc_alloc_security),
4665 LSM_HOOK_INIT(shm_free_security, smack_ipc_free_security),
4666 LSM_HOOK_INIT(shm_associate, smack_shm_associate), 4569 LSM_HOOK_INIT(shm_associate, smack_shm_associate),
4667 LSM_HOOK_INIT(shm_shmctl, smack_shm_shmctl), 4570 LSM_HOOK_INIT(shm_shmctl, smack_shm_shmctl),
4668 LSM_HOOK_INIT(shm_shmat, smack_shm_shmat), 4571 LSM_HOOK_INIT(shm_shmat, smack_shm_shmat),
4669 4572
4670 LSM_HOOK_INIT(sem_alloc_security, smack_ipc_alloc_security), 4573 LSM_HOOK_INIT(sem_alloc_security, smack_ipc_alloc_security),
4671 LSM_HOOK_INIT(sem_free_security, smack_ipc_free_security),
4672 LSM_HOOK_INIT(sem_associate, smack_sem_associate), 4574 LSM_HOOK_INIT(sem_associate, smack_sem_associate),
4673 LSM_HOOK_INIT(sem_semctl, smack_sem_semctl), 4575 LSM_HOOK_INIT(sem_semctl, smack_sem_semctl),
4674 LSM_HOOK_INIT(sem_semop, smack_sem_semop), 4576 LSM_HOOK_INIT(sem_semop, smack_sem_semop),
@@ -4759,23 +4661,23 @@ static __init void init_smack_known_list(void)
4759 */ 4661 */
4760static __init int smack_init(void) 4662static __init int smack_init(void)
4761{ 4663{
4762 struct cred *cred; 4664 struct cred *cred = (struct cred *) current->cred;
4763 struct task_smack *tsp; 4665 struct task_smack *tsp;
4764 4666
4765 if (!security_module_enable("smack"))
4766 return 0;
4767
4768 smack_inode_cache = KMEM_CACHE(inode_smack, 0); 4667 smack_inode_cache = KMEM_CACHE(inode_smack, 0);
4769 if (!smack_inode_cache) 4668 if (!smack_inode_cache)
4770 return -ENOMEM; 4669 return -ENOMEM;
4771 4670
4772 tsp = new_task_smack(&smack_known_floor, &smack_known_floor, 4671 /*
4773 GFP_KERNEL); 4672 * Set the security state for the initial task.
4774 if (tsp == NULL) { 4673 */
4775 kmem_cache_destroy(smack_inode_cache); 4674 tsp = smack_cred(cred);
4776 return -ENOMEM; 4675 init_task_smack(tsp, &smack_known_floor, &smack_known_floor);
4777 }
4778 4676
4677 /*
4678 * Register with LSM
4679 */
4680 security_add_hooks(smack_hooks, ARRAY_SIZE(smack_hooks), "smack");
4779 smack_enabled = 1; 4681 smack_enabled = 1;
4780 4682
4781 pr_info("Smack: Initializing.\n"); 4683 pr_info("Smack: Initializing.\n");
@@ -4789,20 +4691,9 @@ static __init int smack_init(void)
4789 pr_info("Smack: IPv6 Netfilter enabled.\n"); 4691 pr_info("Smack: IPv6 Netfilter enabled.\n");
4790#endif 4692#endif
4791 4693
4792 /*
4793 * Set the security state for the initial task.
4794 */
4795 cred = (struct cred *) current->cred;
4796 cred->security = tsp;
4797
4798 /* initialize the smack_known_list */ 4694 /* initialize the smack_known_list */
4799 init_smack_known_list(); 4695 init_smack_known_list();
4800 4696
4801 /*
4802 * Register with LSM
4803 */
4804 security_add_hooks(smack_hooks, ARRAY_SIZE(smack_hooks), "smack");
4805
4806 return 0; 4697 return 0;
4807} 4698}
4808 4699
@@ -4812,5 +4703,7 @@ static __init int smack_init(void)
4812 */ 4703 */
4813DEFINE_LSM(smack) = { 4704DEFINE_LSM(smack) = {
4814 .name = "smack", 4705 .name = "smack",
4706 .flags = LSM_FLAG_LEGACY_MAJOR | LSM_FLAG_EXCLUSIVE,
4707 .blobs = &smack_blob_sizes,
4815 .init = smack_init, 4708 .init = smack_init,
4816}; 4709};
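
The pattern underlying most of the smack_lsm.c hunks: instead of each module kmalloc'ing and owning object->security, every module declares its space needs in an lsm_blob_sizes table, the core allocates one shared blob per object, and a per-module accessor adds that module's offset. A sketch of the idea, modeled on the helpers in security/smack/smack.h and the offset bookkeeping in security/security.c (illustrative, not a verbatim quote):

	/* At registration the core rewrites each module's lbs_* field from
	 * "bytes I need" into "my byte offset into the shared blob". */
	static void lsm_set_blob_size(int *need, int *lbs)
	{
		int offset;

		if (*need > 0) {
			offset = *lbs;	/* running total becomes this module's offset */
			*lbs += *need;
			*need = offset;
		}
	}

	/* After that, each accessor is plain pointer arithmetic: */
	static inline struct task_smack *smack_cred(const struct cred *cred)
	{
		return cred->security + smack_blob_sizes.lbs_cred;
	}

	static inline struct inode_smack *smack_inode(const struct inode *inode)
	{
		return inode->i_security + smack_blob_sizes.lbs_inode;
	}

	static inline struct smack_known **smack_file(const struct file *file)
	{
		return file->f_security + smack_blob_sizes.lbs_file;
	}

Since the core now owns allocation and freeing, hooks such as smack_file_free_security() and smack_msg_msg_free_security(), which only cleared a pointer, are deleted outright along with their LSM_HOOK_INIT entries, and DEFINE_LSM gains the .blobs pointer so the core knows how much space to reserve.
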
diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c
index 06b517075ec0..faf2ea3968b3 100644
--- a/security/smack/smackfs.c
+++ b/security/smack/smackfs.c
@@ -2208,14 +2208,14 @@ static const struct file_operations smk_logging_ops = {
2208 2208
2209static void *load_self_seq_start(struct seq_file *s, loff_t *pos) 2209static void *load_self_seq_start(struct seq_file *s, loff_t *pos)
2210{ 2210{
2211 struct task_smack *tsp = current_security(); 2211 struct task_smack *tsp = smack_cred(current_cred());
2212 2212
2213 return smk_seq_start(s, pos, &tsp->smk_rules); 2213 return smk_seq_start(s, pos, &tsp->smk_rules);
2214} 2214}
2215 2215
2216static void *load_self_seq_next(struct seq_file *s, void *v, loff_t *pos) 2216static void *load_self_seq_next(struct seq_file *s, void *v, loff_t *pos)
2217{ 2217{
2218 struct task_smack *tsp = current_security(); 2218 struct task_smack *tsp = smack_cred(current_cred());
2219 2219
2220 return smk_seq_next(s, v, pos, &tsp->smk_rules); 2220 return smk_seq_next(s, v, pos, &tsp->smk_rules);
2221} 2221}
@@ -2262,7 +2262,7 @@ static int smk_open_load_self(struct inode *inode, struct file *file)
2262static ssize_t smk_write_load_self(struct file *file, const char __user *buf, 2262static ssize_t smk_write_load_self(struct file *file, const char __user *buf,
2263 size_t count, loff_t *ppos) 2263 size_t count, loff_t *ppos)
2264{ 2264{
2265 struct task_smack *tsp = current_security(); 2265 struct task_smack *tsp = smack_cred(current_cred());
2266 2266
2267 return smk_write_rules_list(file, buf, count, ppos, &tsp->smk_rules, 2267 return smk_write_rules_list(file, buf, count, ppos, &tsp->smk_rules,
2268 &tsp->smk_rules_lock, SMK_FIXED24_FMT); 2268 &tsp->smk_rules_lock, SMK_FIXED24_FMT);
@@ -2414,14 +2414,14 @@ static const struct file_operations smk_load2_ops = {
2414 2414
2415static void *load_self2_seq_start(struct seq_file *s, loff_t *pos) 2415static void *load_self2_seq_start(struct seq_file *s, loff_t *pos)
2416{ 2416{
2417 struct task_smack *tsp = current_security(); 2417 struct task_smack *tsp = smack_cred(current_cred());
2418 2418
2419 return smk_seq_start(s, pos, &tsp->smk_rules); 2419 return smk_seq_start(s, pos, &tsp->smk_rules);
2420} 2420}
2421 2421
2422static void *load_self2_seq_next(struct seq_file *s, void *v, loff_t *pos) 2422static void *load_self2_seq_next(struct seq_file *s, void *v, loff_t *pos)
2423{ 2423{
2424 struct task_smack *tsp = current_security(); 2424 struct task_smack *tsp = smack_cred(current_cred());
2425 2425
2426 return smk_seq_next(s, v, pos, &tsp->smk_rules); 2426 return smk_seq_next(s, v, pos, &tsp->smk_rules);
2427} 2427}
@@ -2467,7 +2467,7 @@ static int smk_open_load_self2(struct inode *inode, struct file *file)
2467static ssize_t smk_write_load_self2(struct file *file, const char __user *buf, 2467static ssize_t smk_write_load_self2(struct file *file, const char __user *buf,
2468 size_t count, loff_t *ppos) 2468 size_t count, loff_t *ppos)
2469{ 2469{
2470 struct task_smack *tsp = current_security(); 2470 struct task_smack *tsp = smack_cred(current_cred());
2471 2471
2472 return smk_write_rules_list(file, buf, count, ppos, &tsp->smk_rules, 2472 return smk_write_rules_list(file, buf, count, ppos, &tsp->smk_rules,
2473 &tsp->smk_rules_lock, SMK_LONG_FMT); 2473 &tsp->smk_rules_lock, SMK_LONG_FMT);
@@ -2681,14 +2681,14 @@ static const struct file_operations smk_syslog_ops = {
2681 2681
2682static void *relabel_self_seq_start(struct seq_file *s, loff_t *pos) 2682static void *relabel_self_seq_start(struct seq_file *s, loff_t *pos)
2683{ 2683{
2684 struct task_smack *tsp = current_security(); 2684 struct task_smack *tsp = smack_cred(current_cred());
2685 2685
2686 return smk_seq_start(s, pos, &tsp->smk_relabel); 2686 return smk_seq_start(s, pos, &tsp->smk_relabel);
2687} 2687}
2688 2688
2689static void *relabel_self_seq_next(struct seq_file *s, void *v, loff_t *pos) 2689static void *relabel_self_seq_next(struct seq_file *s, void *v, loff_t *pos)
2690{ 2690{
2691 struct task_smack *tsp = current_security(); 2691 struct task_smack *tsp = smack_cred(current_cred());
2692 2692
2693 return smk_seq_next(s, v, pos, &tsp->smk_relabel); 2693 return smk_seq_next(s, v, pos, &tsp->smk_relabel);
2694} 2694}
@@ -2736,7 +2736,7 @@ static int smk_open_relabel_self(struct inode *inode, struct file *file)
2736static ssize_t smk_write_relabel_self(struct file *file, const char __user *buf, 2736static ssize_t smk_write_relabel_self(struct file *file, const char __user *buf,
2737 size_t count, loff_t *ppos) 2737 size_t count, loff_t *ppos)
2738{ 2738{
2739 struct task_smack *tsp = current_security(); 2739 struct task_smack *tsp = smack_cred(current_cred());
2740 char *data; 2740 char *data;
2741 int rc; 2741 int rc;
2742 LIST_HEAD(list_tmp); 2742 LIST_HEAD(list_tmp);
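
The smackfs.c hunks (load_self, load_self2, relabel_self) are all the same mechanical conversion: current_security() returned cred->security directly, which is only correct when a single module owns the whole blob, whereas under stacking each module must take its own slice. Roughly (illustrative only):

	/* Before: assumes Smack owns cred->security outright. */
	struct task_smack *tsp = current_security();

	/* After: stacking-safe; smack_cred() offsets into the shared blob. */
	struct task_smack *tsp = smack_cred(current_cred());
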
diff --git a/security/tomoyo/audit.c b/security/tomoyo/audit.c
index 479b03a7a17e..3c96e8402e94 100644
--- a/security/tomoyo/audit.c
+++ b/security/tomoyo/audit.c
@@ -32,6 +32,7 @@ static char *tomoyo_print_bprm(struct linux_binprm *bprm,
32 int argv_count = bprm->argc; 32 int argv_count = bprm->argc;
33 int envp_count = bprm->envc; 33 int envp_count = bprm->envc;
34 bool truncated = false; 34 bool truncated = false;
35
35 if (!buffer) 36 if (!buffer)
36 return NULL; 37 return NULL;
37 len = snprintf(buffer, tomoyo_buffer_len - 1, "argv[]={ "); 38 len = snprintf(buffer, tomoyo_buffer_len - 1, "argv[]={ ");
@@ -49,6 +50,7 @@ static char *tomoyo_print_bprm(struct linux_binprm *bprm,
49 while (offset < PAGE_SIZE) { 50 while (offset < PAGE_SIZE) {
50 const char *kaddr = dump->data; 51 const char *kaddr = dump->data;
51 const unsigned char c = kaddr[offset++]; 52 const unsigned char c = kaddr[offset++];
53
52 if (cp == last_start) 54 if (cp == last_start)
53 *cp++ = '"'; 55 *cp++ = '"';
54 if (cp >= buffer + tomoyo_buffer_len - 32) { 56 if (cp >= buffer + tomoyo_buffer_len - 32) {
@@ -154,19 +156,18 @@ static char *tomoyo_print_header(struct tomoyo_request_info *r)
154 char *buffer = kmalloc(tomoyo_buffer_len, GFP_NOFS); 156 char *buffer = kmalloc(tomoyo_buffer_len, GFP_NOFS);
155 int pos; 157 int pos;
156 u8 i; 158 u8 i;
159
157 if (!buffer) 160 if (!buffer)
158 return NULL; 161 return NULL;
159 162
160 tomoyo_convert_time(ktime_get_real_seconds(), &stamp); 163 tomoyo_convert_time(ktime_get_real_seconds(), &stamp);
161 164
162 pos = snprintf(buffer, tomoyo_buffer_len - 1, 165 pos = snprintf(buffer, tomoyo_buffer_len - 1,
163 "#%04u/%02u/%02u %02u:%02u:%02u# profile=%u mode=%s " 166 "#%04u/%02u/%02u %02u:%02u:%02u# profile=%u mode=%s granted=%s (global-pid=%u) task={ pid=%u ppid=%u uid=%u gid=%u euid=%u egid=%u suid=%u sgid=%u fsuid=%u fsgid=%u }",
164 "granted=%s (global-pid=%u) task={ pid=%u ppid=%u " 167 stamp.year, stamp.month, stamp.day, stamp.hour,
165 "uid=%u gid=%u euid=%u egid=%u suid=%u sgid=%u " 168 stamp.min, stamp.sec, r->profile, tomoyo_mode[r->mode],
166 "fsuid=%u fsgid=%u }", stamp.year, stamp.month, 169 tomoyo_yesno(r->granted), gpid, tomoyo_sys_getpid(),
167 stamp.day, stamp.hour, stamp.min, stamp.sec, r->profile, 170 tomoyo_sys_getppid(),
168 tomoyo_mode[r->mode], tomoyo_yesno(r->granted), gpid,
169 tomoyo_sys_getpid(), tomoyo_sys_getppid(),
170 from_kuid(&init_user_ns, current_uid()), 171 from_kuid(&init_user_ns, current_uid()),
171 from_kgid(&init_user_ns, current_gid()), 172 from_kgid(&init_user_ns, current_gid()),
172 from_kuid(&init_user_ns, current_euid()), 173 from_kuid(&init_user_ns, current_euid()),
@@ -185,6 +186,7 @@ static char *tomoyo_print_header(struct tomoyo_request_info *r)
185 struct tomoyo_mini_stat *stat; 186 struct tomoyo_mini_stat *stat;
186 unsigned int dev; 187 unsigned int dev;
187 umode_t mode; 188 umode_t mode;
189
188 if (!obj->stat_valid[i]) 190 if (!obj->stat_valid[i])
189 continue; 191 continue;
190 stat = &obj->stat[i]; 192 stat = &obj->stat[i];
@@ -193,8 +195,8 @@ static char *tomoyo_print_header(struct tomoyo_request_info *r)
193 if (i & 1) { 195 if (i & 1) {
194 pos += snprintf(buffer + pos, 196 pos += snprintf(buffer + pos,
195 tomoyo_buffer_len - 1 - pos, 197 tomoyo_buffer_len - 1 - pos,
196 " path%u.parent={ uid=%u gid=%u " 198 " path%u.parent={ uid=%u gid=%u ino=%lu perm=0%o }",
197 "ino=%lu perm=0%o }", (i >> 1) + 1, 199 (i >> 1) + 1,
198 from_kuid(&init_user_ns, stat->uid), 200 from_kuid(&init_user_ns, stat->uid),
199 from_kgid(&init_user_ns, stat->gid), 201 from_kgid(&init_user_ns, stat->gid),
200 (unsigned long)stat->ino, 202 (unsigned long)stat->ino,
@@ -202,8 +204,8 @@ static char *tomoyo_print_header(struct tomoyo_request_info *r)
202 continue; 204 continue;
203 } 205 }
204 pos += snprintf(buffer + pos, tomoyo_buffer_len - 1 - pos, 206 pos += snprintf(buffer + pos, tomoyo_buffer_len - 1 - pos,
205 " path%u={ uid=%u gid=%u ino=%lu major=%u" 207 " path%u={ uid=%u gid=%u ino=%lu major=%u minor=%u perm=0%o type=%s",
206 " minor=%u perm=0%o type=%s", (i >> 1) + 1, 208 (i >> 1) + 1,
207 from_kuid(&init_user_ns, stat->uid), 209 from_kuid(&init_user_ns, stat->uid),
208 from_kgid(&init_user_ns, stat->gid), 210 from_kgid(&init_user_ns, stat->gid),
209 (unsigned long)stat->ino, 211 (unsigned long)stat->ino,
@@ -249,6 +251,7 @@ char *tomoyo_init_log(struct tomoyo_request_info *r, int len, const char *fmt,
249 const char *symlink = NULL; 251 const char *symlink = NULL;
250 int pos; 252 int pos;
251 const char *domainname = r->domain->domainname->name; 253 const char *domainname = r->domain->domainname->name;
254
252 header = tomoyo_print_header(r); 255 header = tomoyo_print_header(r);
253 if (!header) 256 if (!header)
254 return NULL; 257 return NULL;
@@ -256,6 +259,7 @@ char *tomoyo_init_log(struct tomoyo_request_info *r, int len, const char *fmt,
256 len += strlen(domainname) + strlen(header) + 10; 259 len += strlen(domainname) + strlen(header) + 10;
257 if (r->ee) { 260 if (r->ee) {
258 struct file *file = r->ee->bprm->file; 261 struct file *file = r->ee->bprm->file;
262
259 realpath = tomoyo_realpath_from_path(&file->f_path); 263 realpath = tomoyo_realpath_from_path(&file->f_path);
260 bprm_info = tomoyo_print_bprm(r->ee->bprm, &r->ee->dump); 264 bprm_info = tomoyo_print_bprm(r->ee->bprm, &r->ee->dump);
261 if (!realpath || !bprm_info) 265 if (!realpath || !bprm_info)
@@ -275,6 +279,7 @@ char *tomoyo_init_log(struct tomoyo_request_info *r, int len, const char *fmt,
275 pos = snprintf(buf, len, "%s", header); 279 pos = snprintf(buf, len, "%s", header);
276 if (realpath) { 280 if (realpath) {
277 struct linux_binprm *bprm = r->ee->bprm; 281 struct linux_binprm *bprm = r->ee->bprm;
282
278 pos += snprintf(buf + pos, len - pos, 283 pos += snprintf(buf + pos, len - pos,
279 " exec={ realpath=\"%s\" argc=%d envc=%d %s }", 284 " exec={ realpath=\"%s\" argc=%d envc=%d %s }",
280 realpath, bprm->argc, bprm->envc, bprm_info); 285 realpath, bprm->argc, bprm->envc, bprm_info);
@@ -328,6 +333,7 @@ static bool tomoyo_get_audit(const struct tomoyo_policy_namespace *ns,
328 const u8 category = tomoyo_index2category[index] + 333 const u8 category = tomoyo_index2category[index] +
329 TOMOYO_MAX_MAC_INDEX; 334 TOMOYO_MAX_MAC_INDEX;
330 struct tomoyo_profile *p; 335 struct tomoyo_profile *p;
336
331 if (!tomoyo_policy_loaded) 337 if (!tomoyo_policy_loaded)
332 return false; 338 return false;
333 p = tomoyo_profile(ns, profile); 339 p = tomoyo_profile(ns, profile);
@@ -362,6 +368,7 @@ void tomoyo_write_log2(struct tomoyo_request_info *r, int len, const char *fmt,
362 char *buf; 368 char *buf;
363 struct tomoyo_log *entry; 369 struct tomoyo_log *entry;
364 bool quota_exceeded = false; 370 bool quota_exceeded = false;
371
365 if (!tomoyo_get_audit(r->domain->ns, r->profile, r->type, 372 if (!tomoyo_get_audit(r->domain->ns, r->profile, r->type,
366 r->matched_acl, r->granted)) 373 r->matched_acl, r->granted))
367 goto out; 374 goto out;
@@ -413,6 +420,7 @@ void tomoyo_write_log(struct tomoyo_request_info *r, const char *fmt, ...)
413{ 420{
414 va_list args; 421 va_list args;
415 int len; 422 int len;
423
416 va_start(args, fmt); 424 va_start(args, fmt);
417 len = vsnprintf((char *) &len, 1, fmt, args) + 1; 425 len = vsnprintf((char *) &len, 1, fmt, args) + 1;
418 va_end(args); 426 va_end(args);
@@ -431,6 +439,7 @@ void tomoyo_write_log(struct tomoyo_request_info *r, const char *fmt, ...)
431void tomoyo_read_log(struct tomoyo_io_buffer *head) 439void tomoyo_read_log(struct tomoyo_io_buffer *head)
432{ 440{
433 struct tomoyo_log *ptr = NULL; 441 struct tomoyo_log *ptr = NULL;
442
434 if (head->r.w_pos) 443 if (head->r.w_pos)
435 return; 444 return;
436 kfree(head->read_buf); 445 kfree(head->read_buf);
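
The tomoyo/audit.c hunks above are pure style cleanup: a blank line after local declarations, and user-visible format strings unwrapped onto single lines so they stay grep-able. The functional tomoyo change appears in common.c below, where tomoyo_real_domain(p) becomes tomoyo_task(p)->domain_info: tomoyo now keeps its per-task state in the shared task blob via the same offset pattern as Smack. A sketch of the accessor that change assumes (the real one is in security/tomoyo/common.h; the struct layout here is inferred from the call site and should be read as illustrative):

	struct tomoyo_task {
		struct tomoyo_domain_info *domain_info;
		struct tomoyo_domain_info *old_domain_info;
	};

	static inline struct tomoyo_task *tomoyo_task(struct task_struct *task)
	{
		return task->security + tomoyo_blob_sizes.lbs_task;
	}
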
diff --git a/security/tomoyo/common.c b/security/tomoyo/common.c
index c598aa00d5e3..57988d95d33d 100644
--- a/security/tomoyo/common.c
+++ b/security/tomoyo/common.c
@@ -197,6 +197,7 @@ static void tomoyo_addprintf(char *buffer, int len, const char *fmt, ...)
197{ 197{
198 va_list args; 198 va_list args;
199 const int pos = strlen(buffer); 199 const int pos = strlen(buffer);
200
200 va_start(args, fmt); 201 va_start(args, fmt);
201 vsnprintf(buffer + pos, len - pos - 1, fmt, args); 202 vsnprintf(buffer + pos, len - pos - 1, fmt, args);
202 va_end(args); 203 va_end(args);
@@ -214,6 +215,7 @@ static bool tomoyo_flush(struct tomoyo_io_buffer *head)
214 while (head->r.w_pos) { 215 while (head->r.w_pos) {
215 const char *w = head->r.w[0]; 216 const char *w = head->r.w[0];
216 size_t len = strlen(w); 217 size_t len = strlen(w);
218
217 if (len) { 219 if (len) {
218 if (len > head->read_user_buf_avail) 220 if (len > head->read_user_buf_avail)
219 len = head->read_user_buf_avail; 221 len = head->read_user_buf_avail;
@@ -279,6 +281,7 @@ static void tomoyo_io_printf(struct tomoyo_io_buffer *head, const char *fmt,
279 size_t len; 281 size_t len;
280 size_t pos = head->r.avail; 282 size_t pos = head->r.avail;
281 int size = head->readbuf_size - pos; 283 int size = head->readbuf_size - pos;
284
282 if (size <= 0) 285 if (size <= 0)
283 return; 286 return;
284 va_start(args, fmt); 287 va_start(args, fmt);
@@ -344,13 +347,14 @@ static bool tomoyo_namespace_enabled;
344void tomoyo_init_policy_namespace(struct tomoyo_policy_namespace *ns) 347void tomoyo_init_policy_namespace(struct tomoyo_policy_namespace *ns)
345{ 348{
346 unsigned int idx; 349 unsigned int idx;
350
347 for (idx = 0; idx < TOMOYO_MAX_ACL_GROUPS; idx++) 351 for (idx = 0; idx < TOMOYO_MAX_ACL_GROUPS; idx++)
348 INIT_LIST_HEAD(&ns->acl_group[idx]); 352 INIT_LIST_HEAD(&ns->acl_group[idx]);
349 for (idx = 0; idx < TOMOYO_MAX_GROUP; idx++) 353 for (idx = 0; idx < TOMOYO_MAX_GROUP; idx++)
350 INIT_LIST_HEAD(&ns->group_list[idx]); 354 INIT_LIST_HEAD(&ns->group_list[idx]);
351 for (idx = 0; idx < TOMOYO_MAX_POLICY; idx++) 355 for (idx = 0; idx < TOMOYO_MAX_POLICY; idx++)
352 INIT_LIST_HEAD(&ns->policy_list[idx]); 356 INIT_LIST_HEAD(&ns->policy_list[idx]);
353 ns->profile_version = 20110903; 357 ns->profile_version = 20150505;
354 tomoyo_namespace_enabled = !list_empty(&tomoyo_namespace_list); 358 tomoyo_namespace_enabled = !list_empty(&tomoyo_namespace_list);
355 list_add_tail_rcu(&ns->namespace_list, &tomoyo_namespace_list); 359 list_add_tail_rcu(&ns->namespace_list, &tomoyo_namespace_list);
356} 360}
@@ -433,6 +437,7 @@ static void tomoyo_print_number_union_nospace
433 u8 min_type = ptr->value_type[0]; 437 u8 min_type = ptr->value_type[0];
434 const u8 max_type = ptr->value_type[1]; 438 const u8 max_type = ptr->value_type[1];
435 char buffer[128]; 439 char buffer[128];
440
436 buffer[0] = '\0'; 441 buffer[0] = '\0';
437 for (i = 0; i < 2; i++) { 442 for (i = 0; i < 2; i++) {
438 switch (min_type) { 443 switch (min_type) {
@@ -487,6 +492,7 @@ static struct tomoyo_profile *tomoyo_assign_profile
487{ 492{
488 struct tomoyo_profile *ptr; 493 struct tomoyo_profile *ptr;
489 struct tomoyo_profile *entry; 494 struct tomoyo_profile *entry;
495
490 if (profile >= TOMOYO_MAX_PROFILES) 496 if (profile >= TOMOYO_MAX_PROFILES)
491 return NULL; 497 return NULL;
492 ptr = ns->profile_ptr[profile]; 498 ptr = ns->profile_ptr[profile];
@@ -530,6 +536,7 @@ struct tomoyo_profile *tomoyo_profile(const struct tomoyo_policy_namespace *ns,
530{ 536{
531 static struct tomoyo_profile tomoyo_null_profile; 537 static struct tomoyo_profile tomoyo_null_profile;
532 struct tomoyo_profile *ptr = ns->profile_ptr[profile]; 538 struct tomoyo_profile *ptr = ns->profile_ptr[profile];
539
533 if (!ptr) 540 if (!ptr)
534 ptr = &tomoyo_null_profile; 541 ptr = &tomoyo_null_profile;
535 return ptr; 542 return ptr;
@@ -546,6 +553,7 @@ struct tomoyo_profile *tomoyo_profile(const struct tomoyo_policy_namespace *ns,
546static s8 tomoyo_find_yesno(const char *string, const char *find) 553static s8 tomoyo_find_yesno(const char *string, const char *find)
547{ 554{
548 const char *cp = strstr(string, find); 555 const char *cp = strstr(string, find);
556
549 if (cp) { 557 if (cp) {
550 cp += strlen(find); 558 cp += strlen(find);
551 if (!strncmp(cp, "=yes", 4)) 559 if (!strncmp(cp, "=yes", 4))
@@ -569,6 +577,7 @@ static void tomoyo_set_uint(unsigned int *i, const char *string,
569 const char *find) 577 const char *find)
570{ 578{
571 const char *cp = strstr(string, find); 579 const char *cp = strstr(string, find);
580
572 if (cp) 581 if (cp)
573 sscanf(cp + strlen(find), "=%u", i); 582 sscanf(cp + strlen(find), "=%u", i);
574} 583}
@@ -587,6 +596,7 @@ static int tomoyo_set_mode(char *name, const char *value,
587{ 596{
588 u8 i; 597 u8 i;
589 u8 config; 598 u8 config;
599
590 if (!strcmp(name, "CONFIG")) { 600 if (!strcmp(name, "CONFIG")) {
591 i = TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX; 601 i = TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX;
592 config = profile->default_config; 602 config = profile->default_config;
@@ -595,10 +605,12 @@ static int tomoyo_set_mode(char *name, const char *value,
595 for (i = 0; i < TOMOYO_MAX_MAC_INDEX 605 for (i = 0; i < TOMOYO_MAX_MAC_INDEX
596 + TOMOYO_MAX_MAC_CATEGORY_INDEX; i++) { 606 + TOMOYO_MAX_MAC_CATEGORY_INDEX; i++) {
597 int len = 0; 607 int len = 0;
608
598 if (i < TOMOYO_MAX_MAC_INDEX) { 609 if (i < TOMOYO_MAX_MAC_INDEX) {
599 const u8 c = tomoyo_index2category[i]; 610 const u8 c = tomoyo_index2category[i];
600 const char *category = 611 const char *category =
601 tomoyo_category_keywords[c]; 612 tomoyo_category_keywords[c];
613
602 len = strlen(category); 614 len = strlen(category);
603 if (strncmp(name, category, len) || 615 if (strncmp(name, category, len) ||
604 name[len++] != ':' || name[len++] != ':') 616 name[len++] != ':' || name[len++] != ':')
@@ -618,6 +630,7 @@ static int tomoyo_set_mode(char *name, const char *value,
618 config = TOMOYO_CONFIG_USE_DEFAULT; 630 config = TOMOYO_CONFIG_USE_DEFAULT;
619 } else { 631 } else {
620 u8 mode; 632 u8 mode;
633
621 for (mode = 0; mode < 4; mode++) 634 for (mode = 0; mode < 4; mode++)
622 if (strstr(value, tomoyo_mode[mode])) 635 if (strstr(value, tomoyo_mode[mode]))
623 /* 636 /*
@@ -664,6 +677,7 @@ static int tomoyo_write_profile(struct tomoyo_io_buffer *head)
664 unsigned int i; 677 unsigned int i;
665 char *cp; 678 char *cp;
666 struct tomoyo_profile *profile; 679 struct tomoyo_profile *profile;
680
667 if (sscanf(data, "PROFILE_VERSION=%u", &head->w.ns->profile_version) 681 if (sscanf(data, "PROFILE_VERSION=%u", &head->w.ns->profile_version)
668 == 1) 682 == 1)
669 return 0; 683 return 0;
@@ -683,6 +697,7 @@ static int tomoyo_write_profile(struct tomoyo_io_buffer *head)
683 const struct tomoyo_path_info *new_comment 697 const struct tomoyo_path_info *new_comment
684 = tomoyo_get_name(cp); 698 = tomoyo_get_name(cp);
685 const struct tomoyo_path_info *old_comment; 699 const struct tomoyo_path_info *old_comment;
700
686 if (!new_comment) 701 if (!new_comment)
687 return -ENOMEM; 702 return -ENOMEM;
688 spin_lock(&lock); 703 spin_lock(&lock);
@@ -732,6 +747,7 @@ static void tomoyo_read_profile(struct tomoyo_io_buffer *head)
732 struct tomoyo_policy_namespace *ns = 747 struct tomoyo_policy_namespace *ns =
733 container_of(head->r.ns, typeof(*ns), namespace_list); 748 container_of(head->r.ns, typeof(*ns), namespace_list);
734 const struct tomoyo_profile *profile; 749 const struct tomoyo_profile *profile;
750
735 if (head->r.eof) 751 if (head->r.eof)
736 return; 752 return;
737 next: 753 next:
@@ -760,6 +776,7 @@ static void tomoyo_read_profile(struct tomoyo_io_buffer *head)
760 u8 i; 776 u8 i;
761 const struct tomoyo_path_info *comment = 777 const struct tomoyo_path_info *comment =
762 profile->comment; 778 profile->comment;
779
763 tomoyo_print_namespace(head); 780 tomoyo_print_namespace(head);
764 tomoyo_io_printf(head, "%u-COMMENT=", index); 781 tomoyo_io_printf(head, "%u-COMMENT=", index);
765 tomoyo_set_string(head, comment ? comment->name : ""); 782 tomoyo_set_string(head, comment ? comment->name : "");
@@ -788,6 +805,7 @@ static void tomoyo_read_profile(struct tomoyo_io_buffer *head)
788 + TOMOYO_MAX_MAC_CATEGORY_INDEX; head->r.bit++) { 805 + TOMOYO_MAX_MAC_CATEGORY_INDEX; head->r.bit++) {
789 const u8 i = head->r.bit; 806 const u8 i = head->r.bit;
790 const u8 config = profile->config[i]; 807 const u8 config = profile->config[i];
808
791 if (config == TOMOYO_CONFIG_USE_DEFAULT) 809 if (config == TOMOYO_CONFIG_USE_DEFAULT)
792 continue; 810 continue;
793 tomoyo_print_namespace(head); 811 tomoyo_print_namespace(head);
@@ -847,10 +865,10 @@ static int tomoyo_update_manager_entry(const char *manager,
847 struct tomoyo_acl_param param = { 865 struct tomoyo_acl_param param = {
848 /* .ns = &tomoyo_kernel_namespace, */ 866 /* .ns = &tomoyo_kernel_namespace, */
849 .is_delete = is_delete, 867 .is_delete = is_delete,
850 .list = &tomoyo_kernel_namespace. 868 .list = &tomoyo_kernel_namespace.policy_list[TOMOYO_ID_MANAGER],
851 policy_list[TOMOYO_ID_MANAGER],
852 }; 869 };
853 int error = is_delete ? -ENOENT : -ENOMEM; 870 int error = is_delete ? -ENOENT : -ENOMEM;
871
854 if (!tomoyo_correct_domain(manager) && 872 if (!tomoyo_correct_domain(manager) &&
855 !tomoyo_correct_word(manager)) 873 !tomoyo_correct_word(manager))
856 return -EINVAL; 874 return -EINVAL;
@@ -894,10 +912,10 @@ static void tomoyo_read_manager(struct tomoyo_io_buffer *head)
894{ 912{
895 if (head->r.eof) 913 if (head->r.eof)
896 return; 914 return;
897 list_for_each_cookie(head->r.acl, &tomoyo_kernel_namespace. 915 list_for_each_cookie(head->r.acl, &tomoyo_kernel_namespace.policy_list[TOMOYO_ID_MANAGER]) {
898 policy_list[TOMOYO_ID_MANAGER]) {
899 struct tomoyo_manager *ptr = 916 struct tomoyo_manager *ptr =
900 list_entry(head->r.acl, typeof(*ptr), head.list); 917 list_entry(head->r.acl, typeof(*ptr), head.list);
918
901 if (ptr->head.is_deleted) 919 if (ptr->head.is_deleted)
902 continue; 920 continue;
903 if (!tomoyo_flush(head)) 921 if (!tomoyo_flush(head))
@@ -933,8 +951,7 @@ static bool tomoyo_manager(void)
933 exe = tomoyo_get_exe(); 951 exe = tomoyo_get_exe();
934 if (!exe) 952 if (!exe)
935 return false; 953 return false;
936 list_for_each_entry_rcu(ptr, &tomoyo_kernel_namespace. 954 list_for_each_entry_rcu(ptr, &tomoyo_kernel_namespace.policy_list[TOMOYO_ID_MANAGER], head.list) {
937 policy_list[TOMOYO_ID_MANAGER], head.list) {
938 if (!ptr->head.is_deleted && 955 if (!ptr->head.is_deleted &&
939 (!tomoyo_pathcmp(domainname, ptr->manager) || 956 (!tomoyo_pathcmp(domainname, ptr->manager) ||
940 !strcmp(exe, ptr->manager->name))) { 957 !strcmp(exe, ptr->manager->name))) {
@@ -945,9 +962,10 @@ static bool tomoyo_manager(void)
945 if (!found) { /* Reduce error messages. */ 962 if (!found) { /* Reduce error messages. */
946 static pid_t last_pid; 963 static pid_t last_pid;
947 const pid_t pid = current->pid; 964 const pid_t pid = current->pid;
965
948 if (last_pid != pid) { 966 if (last_pid != pid) {
949 printk(KERN_WARNING "%s ( %s ) is not permitted to " 967 pr_warn("%s ( %s ) is not permitted to update policies.\n",
950 "update policies.\n", domainname->name, exe); 968 domainname->name, exe);
951 last_pid = pid; 969 last_pid = pid;
952 } 970 }
953 } 971 }
@@ -974,19 +992,21 @@ static bool tomoyo_select_domain(struct tomoyo_io_buffer *head,
974 unsigned int pid; 992 unsigned int pid;
975 struct tomoyo_domain_info *domain = NULL; 993 struct tomoyo_domain_info *domain = NULL;
976 bool global_pid = false; 994 bool global_pid = false;
995
977 if (strncmp(data, "select ", 7)) 996 if (strncmp(data, "select ", 7))
978 return false; 997 return false;
979 data += 7; 998 data += 7;
980 if (sscanf(data, "pid=%u", &pid) == 1 || 999 if (sscanf(data, "pid=%u", &pid) == 1 ||
981 (global_pid = true, sscanf(data, "global-pid=%u", &pid) == 1)) { 1000 (global_pid = true, sscanf(data, "global-pid=%u", &pid) == 1)) {
982 struct task_struct *p; 1001 struct task_struct *p;
1002
983 rcu_read_lock(); 1003 rcu_read_lock();
984 if (global_pid) 1004 if (global_pid)
985 p = find_task_by_pid_ns(pid, &init_pid_ns); 1005 p = find_task_by_pid_ns(pid, &init_pid_ns);
986 else 1006 else
987 p = find_task_by_vpid(pid); 1007 p = find_task_by_vpid(pid);
988 if (p) 1008 if (p)
989 domain = tomoyo_real_domain(p); 1009 domain = tomoyo_task(p)->domain_info;
990 rcu_read_unlock(); 1010 rcu_read_unlock();
991 } else if (!strncmp(data, "domain=", 7)) { 1011 } else if (!strncmp(data, "domain=", 7)) {
992 if (tomoyo_domain_def(data + 7)) 1012 if (tomoyo_domain_def(data + 7))
@@ -1020,10 +1040,11 @@ static bool tomoyo_select_domain(struct tomoyo_io_buffer *head,
1020 * Returns true if @a == @b, false otherwise. 1040 * Returns true if @a == @b, false otherwise.
1021 */ 1041 */
1022static bool tomoyo_same_task_acl(const struct tomoyo_acl_info *a, 1042static bool tomoyo_same_task_acl(const struct tomoyo_acl_info *a,
1023 const struct tomoyo_acl_info *b) 1043 const struct tomoyo_acl_info *b)
1024{ 1044{
1025 const struct tomoyo_task_acl *p1 = container_of(a, typeof(*p1), head); 1045 const struct tomoyo_task_acl *p1 = container_of(a, typeof(*p1), head);
1026 const struct tomoyo_task_acl *p2 = container_of(b, typeof(*p2), head); 1046 const struct tomoyo_task_acl *p2 = container_of(b, typeof(*p2), head);
1047
1027 return p1->domainname == p2->domainname; 1048 return p1->domainname == p2->domainname;
1028} 1049}
1029 1050
@@ -1039,11 +1060,13 @@ static bool tomoyo_same_task_acl(const struct tomoyo_acl_info *a,
1039static int tomoyo_write_task(struct tomoyo_acl_param *param) 1060static int tomoyo_write_task(struct tomoyo_acl_param *param)
1040{ 1061{
1041 int error = -EINVAL; 1062 int error = -EINVAL;
1063
1042 if (tomoyo_str_starts(&param->data, "manual_domain_transition ")) { 1064 if (tomoyo_str_starts(&param->data, "manual_domain_transition ")) {
1043 struct tomoyo_task_acl e = { 1065 struct tomoyo_task_acl e = {
1044 .head.type = TOMOYO_TYPE_MANUAL_TASK_ACL, 1066 .head.type = TOMOYO_TYPE_MANUAL_TASK_ACL,
1045 .domainname = tomoyo_get_domainname(param), 1067 .domainname = tomoyo_get_domainname(param),
1046 }; 1068 };
1069
1047 if (e.domainname) 1070 if (e.domainname)
1048 error = tomoyo_update_domain(&e.head, sizeof(e), param, 1071 error = tomoyo_update_domain(&e.head, sizeof(e), param,
1049 tomoyo_same_task_acl, 1072 tomoyo_same_task_acl,
@@ -1110,7 +1133,7 @@ static int tomoyo_write_domain2(struct tomoyo_policy_namespace *ns,
1110 }; 1133 };
1111 static const struct { 1134 static const struct {
1112 const char *keyword; 1135 const char *keyword;
1113 int (*write) (struct tomoyo_acl_param *); 1136 int (*write)(struct tomoyo_acl_param *param);
1114 } tomoyo_callback[5] = { 1137 } tomoyo_callback[5] = {
1115 { "file ", tomoyo_write_file }, 1138 { "file ", tomoyo_write_file },
1116 { "network inet ", tomoyo_write_inet_network }, 1139 { "network inet ", tomoyo_write_inet_network },
@@ -1151,9 +1174,11 @@ static int tomoyo_write_domain(struct tomoyo_io_buffer *head)
1151 struct tomoyo_domain_info *domain = head->w.domain; 1174 struct tomoyo_domain_info *domain = head->w.domain;
1152 const bool is_delete = head->w.is_delete; 1175 const bool is_delete = head->w.is_delete;
1153 bool is_select = !is_delete && tomoyo_str_starts(&data, "select "); 1176 bool is_select = !is_delete && tomoyo_str_starts(&data, "select ");
1154 unsigned int profile; 1177 unsigned int idx;
1178
1155 if (*data == '<') { 1179 if (*data == '<') {
1156 int ret = 0; 1180 int ret = 0;
1181
1157 domain = NULL; 1182 domain = NULL;
1158 if (is_delete) 1183 if (is_delete)
1159 ret = tomoyo_delete_domain(data); 1184 ret = tomoyo_delete_domain(data);
@@ -1167,23 +1192,27 @@ static int tomoyo_write_domain(struct tomoyo_io_buffer *head)
1167 if (!domain) 1192 if (!domain)
1168 return -EINVAL; 1193 return -EINVAL;
1169 ns = domain->ns; 1194 ns = domain->ns;
1170 if (sscanf(data, "use_profile %u", &profile) == 1 1195 if (sscanf(data, "use_profile %u", &idx) == 1
1171 && profile < TOMOYO_MAX_PROFILES) { 1196 && idx < TOMOYO_MAX_PROFILES) {
1172 if (!tomoyo_policy_loaded || ns->profile_ptr[profile]) 1197 if (!tomoyo_policy_loaded || ns->profile_ptr[idx])
1173 domain->profile = (u8) profile; 1198 if (!is_delete)
1199 domain->profile = (u8) idx;
1174 return 0; 1200 return 0;
1175 } 1201 }
1176 if (sscanf(data, "use_group %u\n", &profile) == 1 1202 if (sscanf(data, "use_group %u\n", &idx) == 1
1177 && profile < TOMOYO_MAX_ACL_GROUPS) { 1203 && idx < TOMOYO_MAX_ACL_GROUPS) {
1178 if (!is_delete) 1204 if (!is_delete)
1179 domain->group = (u8) profile; 1205 set_bit(idx, domain->group);
1206 else
1207 clear_bit(idx, domain->group);
1180 return 0; 1208 return 0;
1181 } 1209 }
1182 for (profile = 0; profile < TOMOYO_MAX_DOMAIN_INFO_FLAGS; profile++) { 1210 for (idx = 0; idx < TOMOYO_MAX_DOMAIN_INFO_FLAGS; idx++) {
1183 const char *cp = tomoyo_dif[profile]; 1211 const char *cp = tomoyo_dif[idx];
1212
1184 if (strncmp(data, cp, strlen(cp) - 1)) 1213 if (strncmp(data, cp, strlen(cp) - 1))
1185 continue; 1214 continue;
1186 domain->flags[profile] = !is_delete; 1215 domain->flags[idx] = !is_delete;
1187 return 0; 1216 return 0;
1188 } 1217 }
1189 return tomoyo_write_domain2(ns, &domain->acl_info_list, data, 1218 return tomoyo_write_domain2(ns, &domain->acl_info_list, data,
@@ -1225,9 +1254,11 @@ static bool tomoyo_print_condition(struct tomoyo_io_buffer *head,
1225 const struct tomoyo_envp *envp = 1254 const struct tomoyo_envp *envp =
1226 (typeof(envp)) (argv + cond->argc); 1255 (typeof(envp)) (argv + cond->argc);
1227 u16 skip; 1256 u16 skip;
1257
1228 for (skip = 0; skip < head->r.cond_index; skip++) { 1258 for (skip = 0; skip < head->r.cond_index; skip++) {
1229 const u8 left = condp->left; 1259 const u8 left = condp->left;
1230 const u8 right = condp->right; 1260 const u8 right = condp->right;
1261
1231 condp++; 1262 condp++;
1232 switch (left) { 1263 switch (left) {
1233 case TOMOYO_ARGV_ENTRY: 1264 case TOMOYO_ARGV_ENTRY:
@@ -1253,6 +1284,7 @@ static bool tomoyo_print_condition(struct tomoyo_io_buffer *head,
1253 const u8 match = condp->equals; 1284 const u8 match = condp->equals;
1254 const u8 left = condp->left; 1285 const u8 left = condp->left;
1255 const u8 right = condp->right; 1286 const u8 right = condp->right;
1287
1256 if (!tomoyo_flush(head)) 1288 if (!tomoyo_flush(head))
1257 return false; 1289 return false;
1258 condp++; 1290 condp++;
@@ -1262,8 +1294,7 @@ static bool tomoyo_print_condition(struct tomoyo_io_buffer *head,
1262 case TOMOYO_ARGV_ENTRY: 1294 case TOMOYO_ARGV_ENTRY:
1263 tomoyo_io_printf(head, 1295 tomoyo_io_printf(head,
1264 "exec.argv[%lu]%s=\"", 1296 "exec.argv[%lu]%s=\"",
1265 argv->index, argv-> 1297 argv->index, argv->is_not ? "!" : "");
1266 is_not ? "!" : "");
1267 tomoyo_set_string(head, 1298 tomoyo_set_string(head,
1268 argv->value->name); 1299 argv->value->name);
1269 tomoyo_set_string(head, "\""); 1300 tomoyo_set_string(head, "\"");
@@ -1274,12 +1305,10 @@ static bool tomoyo_print_condition(struct tomoyo_io_buffer *head,
1274 "exec.envp[\""); 1305 "exec.envp[\"");
1275 tomoyo_set_string(head, 1306 tomoyo_set_string(head,
1276 envp->name->name); 1307 envp->name->name);
1277 tomoyo_io_printf(head, "\"]%s=", envp-> 1308 tomoyo_io_printf(head, "\"]%s=", envp->is_not ? "!" : "");
1278 is_not ? "!" : "");
1279 if (envp->value) { 1309 if (envp->value) {
1280 tomoyo_set_string(head, "\""); 1310 tomoyo_set_string(head, "\"");
1281 tomoyo_set_string(head, envp-> 1311 tomoyo_set_string(head, envp->value->name);
1282 value->name);
1283 tomoyo_set_string(head, "\""); 1312 tomoyo_set_string(head, "\"");
1284 } else { 1313 } else {
1285 tomoyo_set_string(head, 1314 tomoyo_set_string(head,
@@ -1375,6 +1404,7 @@ static bool tomoyo_print_entry(struct tomoyo_io_buffer *head,
1375 struct tomoyo_path_acl *ptr = 1404 struct tomoyo_path_acl *ptr =
1376 container_of(acl, typeof(*ptr), head); 1405 container_of(acl, typeof(*ptr), head);
1377 const u16 perm = ptr->perm; 1406 const u16 perm = ptr->perm;
1407
1378 for (bit = 0; bit < TOMOYO_MAX_PATH_OPERATION; bit++) { 1408 for (bit = 0; bit < TOMOYO_MAX_PATH_OPERATION; bit++) {
1379 if (!(perm & (1 << bit))) 1409 if (!(perm & (1 << bit)))
1380 continue; 1410 continue;
@@ -1395,6 +1425,7 @@ static bool tomoyo_print_entry(struct tomoyo_io_buffer *head,
1395 } else if (acl_type == TOMOYO_TYPE_MANUAL_TASK_ACL) { 1425 } else if (acl_type == TOMOYO_TYPE_MANUAL_TASK_ACL) {
1396 struct tomoyo_task_acl *ptr = 1426 struct tomoyo_task_acl *ptr =
1397 container_of(acl, typeof(*ptr), head); 1427 container_of(acl, typeof(*ptr), head);
1428
1398 tomoyo_set_group(head, "task "); 1429 tomoyo_set_group(head, "task ");
1399 tomoyo_set_string(head, "manual_domain_transition "); 1430 tomoyo_set_string(head, "manual_domain_transition ");
1400 tomoyo_set_string(head, ptr->domainname->name); 1431 tomoyo_set_string(head, ptr->domainname->name);
@@ -1404,6 +1435,7 @@ static bool tomoyo_print_entry(struct tomoyo_io_buffer *head,
1404 struct tomoyo_path2_acl *ptr = 1435 struct tomoyo_path2_acl *ptr =
1405 container_of(acl, typeof(*ptr), head); 1436 container_of(acl, typeof(*ptr), head);
1406 const u8 perm = ptr->perm; 1437 const u8 perm = ptr->perm;
1438
1407 for (bit = 0; bit < TOMOYO_MAX_PATH2_OPERATION; bit++) { 1439 for (bit = 0; bit < TOMOYO_MAX_PATH2_OPERATION; bit++) {
1408 if (!(perm & (1 << bit))) 1440 if (!(perm & (1 << bit)))
1409 continue; 1441 continue;
@@ -1424,6 +1456,7 @@ static bool tomoyo_print_entry(struct tomoyo_io_buffer *head,
1424 struct tomoyo_path_number_acl *ptr = 1456 struct tomoyo_path_number_acl *ptr =
1425 container_of(acl, typeof(*ptr), head); 1457 container_of(acl, typeof(*ptr), head);
1426 const u8 perm = ptr->perm; 1458 const u8 perm = ptr->perm;
1459
1427 for (bit = 0; bit < TOMOYO_MAX_PATH_NUMBER_OPERATION; bit++) { 1460 for (bit = 0; bit < TOMOYO_MAX_PATH_NUMBER_OPERATION; bit++) {
1428 if (!(perm & (1 << bit))) 1461 if (!(perm & (1 << bit)))
1429 continue; 1462 continue;
@@ -1444,6 +1477,7 @@ static bool tomoyo_print_entry(struct tomoyo_io_buffer *head,
1444 struct tomoyo_mkdev_acl *ptr = 1477 struct tomoyo_mkdev_acl *ptr =
1445 container_of(acl, typeof(*ptr), head); 1478 container_of(acl, typeof(*ptr), head);
1446 const u8 perm = ptr->perm; 1479 const u8 perm = ptr->perm;
1480
1447 for (bit = 0; bit < TOMOYO_MAX_MKDEV_OPERATION; bit++) { 1481 for (bit = 0; bit < TOMOYO_MAX_MKDEV_OPERATION; bit++) {
1448 if (!(perm & (1 << bit))) 1482 if (!(perm & (1 << bit)))
1449 continue; 1483 continue;
@@ -1490,6 +1524,7 @@ static bool tomoyo_print_entry(struct tomoyo_io_buffer *head,
1490 ->name); 1524 ->name);
1491 } else { 1525 } else {
1492 char buf[128]; 1526 char buf[128];
1527
1493 tomoyo_print_ip(buf, sizeof(buf), &ptr->address); 1528 tomoyo_print_ip(buf, sizeof(buf), &ptr->address);
1494 tomoyo_io_printf(head, "%s", buf); 1529 tomoyo_io_printf(head, "%s", buf);
1495 } 1530 }
@@ -1519,6 +1554,7 @@ static bool tomoyo_print_entry(struct tomoyo_io_buffer *head,
1519 } else if (acl_type == TOMOYO_TYPE_MOUNT_ACL) { 1554 } else if (acl_type == TOMOYO_TYPE_MOUNT_ACL) {
1520 struct tomoyo_mount_acl *ptr = 1555 struct tomoyo_mount_acl *ptr =
1521 container_of(acl, typeof(*ptr), head); 1556 container_of(acl, typeof(*ptr), head);
1557
1522 tomoyo_set_group(head, "file mount"); 1558 tomoyo_set_group(head, "file mount");
1523 tomoyo_print_name_union(head, &ptr->dev_name); 1559 tomoyo_print_name_union(head, &ptr->dev_name);
1524 tomoyo_print_name_union(head, &ptr->dir_name); 1560 tomoyo_print_name_union(head, &ptr->dir_name);
@@ -1562,6 +1598,7 @@ static bool tomoyo_read_domain2(struct tomoyo_io_buffer *head,
1562 list_for_each_cookie(head->r.acl, list) { 1598 list_for_each_cookie(head->r.acl, list) {
1563 struct tomoyo_acl_info *ptr = 1599 struct tomoyo_acl_info *ptr =
1564 list_entry(head->r.acl, typeof(*ptr), list); 1600 list_entry(head->r.acl, typeof(*ptr), list);
1601
1565 if (!tomoyo_print_entry(head, ptr)) 1602 if (!tomoyo_print_entry(head, ptr))
1566 return false; 1603 return false;
1567 } 1604 }
@@ -1583,8 +1620,9 @@ static void tomoyo_read_domain(struct tomoyo_io_buffer *head)
1583 list_for_each_cookie(head->r.domain, &tomoyo_domain_list) { 1620 list_for_each_cookie(head->r.domain, &tomoyo_domain_list) {
1584 struct tomoyo_domain_info *domain = 1621 struct tomoyo_domain_info *domain =
1585 list_entry(head->r.domain, typeof(*domain), list); 1622 list_entry(head->r.domain, typeof(*domain), list);
1623 u8 i;
1624
1586 switch (head->r.step) { 1625 switch (head->r.step) {
1587 u8 i;
1588 case 0: 1626 case 0:
1589 if (domain->is_deleted && 1627 if (domain->is_deleted &&
1590 !head->r.print_this_domain_only) 1628 !head->r.print_this_domain_only)
@@ -1594,22 +1632,33 @@ static void tomoyo_read_domain(struct tomoyo_io_buffer *head)
1594 tomoyo_set_lf(head); 1632 tomoyo_set_lf(head);
1595 tomoyo_io_printf(head, "use_profile %u\n", 1633 tomoyo_io_printf(head, "use_profile %u\n",
1596 domain->profile); 1634 domain->profile);
1597 tomoyo_io_printf(head, "use_group %u\n",
1598 domain->group);
1599 for (i = 0; i < TOMOYO_MAX_DOMAIN_INFO_FLAGS; i++) 1635 for (i = 0; i < TOMOYO_MAX_DOMAIN_INFO_FLAGS; i++)
1600 if (domain->flags[i]) 1636 if (domain->flags[i])
1601 tomoyo_set_string(head, tomoyo_dif[i]); 1637 tomoyo_set_string(head, tomoyo_dif[i]);
1638 head->r.index = 0;
1602 head->r.step++; 1639 head->r.step++;
1603 tomoyo_set_lf(head);
1604 /* fall through */ 1640 /* fall through */
1605 case 1: 1641 case 1:
1642 while (head->r.index < TOMOYO_MAX_ACL_GROUPS) {
1643 i = head->r.index++;
1644 if (!test_bit(i, domain->group))
1645 continue;
1646 tomoyo_io_printf(head, "use_group %u\n", i);
1647 if (!tomoyo_flush(head))
1648 return;
1649 }
1650 head->r.index = 0;
1651 head->r.step++;
1652 tomoyo_set_lf(head);
1653 /* fall through */
1654 case 2:
1606 if (!tomoyo_read_domain2(head, &domain->acl_info_list)) 1655 if (!tomoyo_read_domain2(head, &domain->acl_info_list))
1607 return; 1656 return;
1608 head->r.step++; 1657 head->r.step++;
1609 if (!tomoyo_set_lf(head)) 1658 if (!tomoyo_set_lf(head))
1610 return; 1659 return;
1611 /* fall through */ 1660 /* fall through */
1612 case 2: 1661 case 3:
1613 head->r.step = 0; 1662 head->r.step = 0;
1614 if (head->r.print_this_domain_only) 1663 if (head->r.print_this_domain_only)
1615 goto done; 1664 goto done;
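tomoyo_read_domain() above gains a dedicated step that prints one "use_group N" line per set bit, keeping the bit cursor in head->r.index so output interrupted by a full read buffer resumes where it stopped. A self-contained model of that resumable-emission pattern (flush() is stubbed here; the real tomoyo_flush() returns false when the caller's buffer is exhausted):

#include <stdbool.h>
#include <stdio.h>

#define MAX_ACL_GROUPS 8

struct reader {
	unsigned int index;	/* survives across calls, like head->r.index */
};

static bool flush(void)
{
	return true;	/* stub: pretend the output buffer never fills up */
}

static bool emit_groups(struct reader *r, const unsigned char *bits)
{
	while (r->index < MAX_ACL_GROUPS) {
		unsigned int i = r->index++;

		if (!(bits[i / 8] & (1 << (i % 8))))
			continue;
		printf("use_group %u\n", i);
		if (!flush())
			return false;	/* next call resumes at r->index */
	}
	r->index = 0;	/* finished: reset the cursor for the next step */
	return true;
}

int main(void)
{
	struct reader r = { 0 };
	unsigned char bits[1] = { (1 << 1) | (1 << 6) };

	return emit_groups(&r, bits) ? 0 : 1;	/* prints use_group 1, 6 */
}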
@@ -1668,7 +1717,7 @@ static void tomoyo_read_pid(struct tomoyo_io_buffer *head)
1668 else 1717 else
1669 p = find_task_by_vpid(pid); 1718 p = find_task_by_vpid(pid);
1670 if (p) 1719 if (p)
1671 domain = tomoyo_real_domain(p); 1720 domain = tomoyo_task(p)->domain_info;
1672 rcu_read_unlock(); 1721 rcu_read_unlock();
1673 if (!domain) 1722 if (!domain)
1674 return; 1723 return;
@@ -1711,6 +1760,7 @@ static int tomoyo_write_exception(struct tomoyo_io_buffer *head)
1711 .data = head->write_buf, 1760 .data = head->write_buf,
1712 }; 1761 };
1713 u8 i; 1762 u8 i;
1763
1714 if (tomoyo_str_starts(&param.data, "aggregator ")) 1764 if (tomoyo_str_starts(&param.data, "aggregator "))
1715 return tomoyo_write_aggregator(&param); 1765 return tomoyo_write_aggregator(&param);
1716 for (i = 0; i < TOMOYO_MAX_TRANSITION_TYPE; i++) 1766 for (i = 0; i < TOMOYO_MAX_TRANSITION_TYPE; i++)
@@ -1722,6 +1772,7 @@ static int tomoyo_write_exception(struct tomoyo_io_buffer *head)
1722 if (tomoyo_str_starts(&param.data, "acl_group ")) { 1772 if (tomoyo_str_starts(&param.data, "acl_group ")) {
1723 unsigned int group; 1773 unsigned int group;
1724 char *data; 1774 char *data;
1775
1725 group = simple_strtoul(param.data, &data, 10); 1776 group = simple_strtoul(param.data, &data, 10);
1726 if (group < TOMOYO_MAX_ACL_GROUPS && *data++ == ' ') 1777 if (group < TOMOYO_MAX_ACL_GROUPS && *data++ == ' ')
1727 return tomoyo_write_domain2 1778 return tomoyo_write_domain2
@@ -1746,12 +1797,15 @@ static bool tomoyo_read_group(struct tomoyo_io_buffer *head, const int idx)
1746 struct tomoyo_policy_namespace *ns = 1797 struct tomoyo_policy_namespace *ns =
1747 container_of(head->r.ns, typeof(*ns), namespace_list); 1798 container_of(head->r.ns, typeof(*ns), namespace_list);
1748 struct list_head *list = &ns->group_list[idx]; 1799 struct list_head *list = &ns->group_list[idx];
1800
1749 list_for_each_cookie(head->r.group, list) { 1801 list_for_each_cookie(head->r.group, list) {
1750 struct tomoyo_group *group = 1802 struct tomoyo_group *group =
1751 list_entry(head->r.group, typeof(*group), head.list); 1803 list_entry(head->r.group, typeof(*group), head.list);
1804
1752 list_for_each_cookie(head->r.acl, &group->member_list) { 1805 list_for_each_cookie(head->r.acl, &group->member_list) {
1753 struct tomoyo_acl_head *ptr = 1806 struct tomoyo_acl_head *ptr =
1754 list_entry(head->r.acl, typeof(*ptr), list); 1807 list_entry(head->r.acl, typeof(*ptr), list);
1808
1755 if (ptr->is_deleted) 1809 if (ptr->is_deleted)
1756 continue; 1810 continue;
1757 if (!tomoyo_flush(head)) 1811 if (!tomoyo_flush(head))
@@ -1771,10 +1825,10 @@ static bool tomoyo_read_group(struct tomoyo_io_buffer *head, const int idx)
1771 head)->number); 1825 head)->number);
1772 } else if (idx == TOMOYO_ADDRESS_GROUP) { 1826 } else if (idx == TOMOYO_ADDRESS_GROUP) {
1773 char buffer[128]; 1827 char buffer[128];
1774
1775 struct tomoyo_address_group *member = 1828 struct tomoyo_address_group *member =
1776 container_of(ptr, typeof(*member), 1829 container_of(ptr, typeof(*member),
1777 head); 1830 head);
1831
1778 tomoyo_print_ip(buffer, sizeof(buffer), 1832 tomoyo_print_ip(buffer, sizeof(buffer),
1779 &member->address); 1833 &member->address);
1780 tomoyo_io_printf(head, " %s", buffer); 1834 tomoyo_io_printf(head, " %s", buffer);
@@ -1802,6 +1856,7 @@ static bool tomoyo_read_policy(struct tomoyo_io_buffer *head, const int idx)
1802 struct tomoyo_policy_namespace *ns = 1856 struct tomoyo_policy_namespace *ns =
1803 container_of(head->r.ns, typeof(*ns), namespace_list); 1857 container_of(head->r.ns, typeof(*ns), namespace_list);
1804 struct list_head *list = &ns->policy_list[idx]; 1858 struct list_head *list = &ns->policy_list[idx];
1859
1805 list_for_each_cookie(head->r.acl, list) { 1860 list_for_each_cookie(head->r.acl, list) {
1806 struct tomoyo_acl_head *acl = 1861 struct tomoyo_acl_head *acl =
1807 container_of(head->r.acl, typeof(*acl), list); 1862 container_of(head->r.acl, typeof(*acl), list);
@@ -1814,6 +1869,7 @@ static bool tomoyo_read_policy(struct tomoyo_io_buffer *head, const int idx)
1814 { 1869 {
1815 struct tomoyo_transition_control *ptr = 1870 struct tomoyo_transition_control *ptr =
1816 container_of(acl, typeof(*ptr), head); 1871 container_of(acl, typeof(*ptr), head);
1872
1817 tomoyo_print_namespace(head); 1873 tomoyo_print_namespace(head);
1818 tomoyo_set_string(head, tomoyo_transition_type 1874 tomoyo_set_string(head, tomoyo_transition_type
1819 [ptr->type]); 1875 [ptr->type]);
@@ -1829,6 +1885,7 @@ static bool tomoyo_read_policy(struct tomoyo_io_buffer *head, const int idx)
1829 { 1885 {
1830 struct tomoyo_aggregator *ptr = 1886 struct tomoyo_aggregator *ptr =
1831 container_of(acl, typeof(*ptr), head); 1887 container_of(acl, typeof(*ptr), head);
1888
1832 tomoyo_print_namespace(head); 1889 tomoyo_print_namespace(head);
1833 tomoyo_set_string(head, "aggregator "); 1890 tomoyo_set_string(head, "aggregator ");
1834 tomoyo_set_string(head, 1891 tomoyo_set_string(head,
@@ -1858,6 +1915,7 @@ static void tomoyo_read_exception(struct tomoyo_io_buffer *head)
1858 { 1915 {
1859 struct tomoyo_policy_namespace *ns = 1916 struct tomoyo_policy_namespace *ns =
1860 container_of(head->r.ns, typeof(*ns), namespace_list); 1917 container_of(head->r.ns, typeof(*ns), namespace_list);
1918
1861 if (head->r.eof) 1919 if (head->r.eof)
1862 return; 1920 return;
1863 while (head->r.step < TOMOYO_MAX_POLICY && 1921 while (head->r.step < TOMOYO_MAX_POLICY &&
@@ -1921,6 +1979,7 @@ static atomic_t tomoyo_query_observers = ATOMIC_INIT(0);
1921 static int tomoyo_truncate(char *str) 1979 static int tomoyo_truncate(char *str)
1922 { 1980 {
1923 char *start = str; 1981 char *start = str;
1982
1924 while (*(unsigned char *) str > (unsigned char) ' ') 1983 while (*(unsigned char *) str > (unsigned char) ' ')
1925 str++; 1984 str++;
1926 *str = '\0'; 1985 *str = '\0';
@@ -1943,6 +2002,7 @@ static void tomoyo_add_entry(struct tomoyo_domain_info *domain, char *header)
1943 char *symlink = NULL; 2002 char *symlink = NULL;
1944 char *cp = strchr(header, '\n'); 2003 char *cp = strchr(header, '\n');
1945 int len; 2004 int len;
2005
1946 if (!cp) 2006 if (!cp)
1947 return; 2007 return;
1948 cp = strchr(cp + 1, '\n'); 2008 cp = strchr(cp + 1, '\n');
@@ -2002,6 +2062,7 @@ int tomoyo_supervisor(struct tomoyo_request_info *r, const char *fmt, ...)
2002 static unsigned int tomoyo_serial; 2062 static unsigned int tomoyo_serial;
2003 struct tomoyo_query entry = { }; 2063 struct tomoyo_query entry = { };
2004 bool quota_exceeded = false; 2064 bool quota_exceeded = false;
2065
2005 va_start(args, fmt); 2066 va_start(args, fmt);
2006 len = vsnprintf((char *) &len, 1, fmt, args) + 1; 2067 len = vsnprintf((char *) &len, 1, fmt, args) + 1;
2007 va_end(args); 2068 va_end(args);
@@ -2063,8 +2124,7 @@ int tomoyo_supervisor(struct tomoyo_request_info *r, const char *fmt, ...)
2063 (tomoyo_answer_wait, entry.answer || 2124 (tomoyo_answer_wait, entry.answer ||
2064 !atomic_read(&tomoyo_query_observers), HZ)) 2125 !atomic_read(&tomoyo_query_observers), HZ))
2065 break; 2126 break;
2066 else 2127 entry.timer++;
2067 entry.timer++;
2068 } 2128 }
2069 spin_lock(&tomoyo_query_list_lock); 2129 spin_lock(&tomoyo_query_list_lock);
2070 list_del(&entry.list); 2130 list_del(&entry.list);
@@ -2100,6 +2160,7 @@ static struct tomoyo_domain_info *tomoyo_find_domain_by_qid
2100 { 2160 {
2101 struct tomoyo_query *ptr; 2161 struct tomoyo_query *ptr;
2102 struct tomoyo_domain_info *domain = NULL; 2162 struct tomoyo_domain_info *domain = NULL;
2163
2103 spin_lock(&tomoyo_query_list_lock); 2164 spin_lock(&tomoyo_query_list_lock);
2104 list_for_each_entry(ptr, &tomoyo_query_list, list) { 2165 list_for_each_entry(ptr, &tomoyo_query_list, list) {
2105 if (ptr->serial != serial) 2166 if (ptr->serial != serial)
@@ -2142,15 +2203,15 @@ static void tomoyo_read_query(struct tomoyo_io_buffer *head)
2142 unsigned int pos = 0; 2203 unsigned int pos = 0;
2143 size_t len = 0; 2204 size_t len = 0;
2144 char *buf; 2205 char *buf;
2206
2145 if (head->r.w_pos) 2207 if (head->r.w_pos)
2146 return; 2208 return;
2147 if (head->read_buf) { 2209 kfree(head->read_buf);
2148 kfree(head->read_buf); 2210 head->read_buf = NULL;
2149 head->read_buf = NULL;
2150 }
2151 spin_lock(&tomoyo_query_list_lock); 2211 spin_lock(&tomoyo_query_list_lock);
2152 list_for_each(tmp, &tomoyo_query_list) { 2212 list_for_each(tmp, &tomoyo_query_list) {
2153 struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list); 2213 struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list);
2214
2154 if (pos++ != head->r.query_index) 2215 if (pos++ != head->r.query_index)
2155 continue; 2216 continue;
2156 len = ptr->query_len; 2217 len = ptr->query_len;
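The tomoyo_read_query() cleanup above works because kfree(NULL) is defined as a no-op, so the surrounding NULL test was redundant; userspace free() gives the same guarantee, as this trivial sketch shows:

#include <stdlib.h>

int main(void)
{
	char *read_buf = NULL;

	free(read_buf);		/* free(NULL), like kfree(NULL), does nothing */
	read_buf = NULL;	/* keep the pointer from dangling, as the patch does */
	return 0;
}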
@@ -2168,6 +2229,7 @@ static void tomoyo_read_query(struct tomoyo_io_buffer *head)
2168 spin_lock(&tomoyo_query_list_lock); 2229 spin_lock(&tomoyo_query_list_lock);
2169 list_for_each(tmp, &tomoyo_query_list) { 2230 list_for_each(tmp, &tomoyo_query_list) {
2170 struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list); 2231 struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list);
2232
2171 if (pos++ != head->r.query_index) 2233 if (pos++ != head->r.query_index)
2172 continue; 2234 continue;
2173 /* 2235 /*
@@ -2202,9 +2264,11 @@ static int tomoyo_write_answer(struct tomoyo_io_buffer *head)
2202 struct list_head *tmp; 2264 struct list_head *tmp;
2203 unsigned int serial; 2265 unsigned int serial;
2204 unsigned int answer; 2266 unsigned int answer;
2267
2205 spin_lock(&tomoyo_query_list_lock); 2268 spin_lock(&tomoyo_query_list_lock);
2206 list_for_each(tmp, &tomoyo_query_list) { 2269 list_for_each(tmp, &tomoyo_query_list) {
2207 struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list); 2270 struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list);
2271
2208 ptr->timer = 0; 2272 ptr->timer = 0;
2209 } 2273 }
2210 spin_unlock(&tomoyo_query_list_lock); 2274 spin_unlock(&tomoyo_query_list_lock);
@@ -2213,6 +2277,7 @@ static int tomoyo_write_answer(struct tomoyo_io_buffer *head)
2213 spin_lock(&tomoyo_query_list_lock); 2277 spin_lock(&tomoyo_query_list_lock);
2214 list_for_each(tmp, &tomoyo_query_list) { 2278 list_for_each(tmp, &tomoyo_query_list) {
2215 struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list); 2279 struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list);
2280
2216 if (ptr->serial != serial) 2281 if (ptr->serial != serial)
2217 continue; 2282 continue;
2218 ptr->answer = answer; 2283 ptr->answer = answer;
@@ -2235,7 +2300,7 @@ static int tomoyo_write_answer(struct tomoyo_io_buffer *head)
2235 static void tomoyo_read_version(struct tomoyo_io_buffer *head) 2300 static void tomoyo_read_version(struct tomoyo_io_buffer *head)
2236 { 2301 {
2237 if (!head->r.eof) { 2302 if (!head->r.eof) {
2238 tomoyo_io_printf(head, "2.5.0"); 2303 tomoyo_io_printf(head, "2.6.0");
2239 head->r.eof = true; 2304 head->r.eof = true;
2240 } 2305 }
2241 } 2306 }
@@ -2287,6 +2352,7 @@ static void tomoyo_read_stat(struct tomoyo_io_buffer *head)
2287 { 2352 {
2288 u8 i; 2353 u8 i;
2289 unsigned int total = 0; 2354 unsigned int total = 0;
2355
2290 if (head->r.eof) 2356 if (head->r.eof)
2291 return; 2357 return;
2292 for (i = 0; i < TOMOYO_MAX_POLICY_STAT; i++) { 2358 for (i = 0; i < TOMOYO_MAX_POLICY_STAT; i++) {
@@ -2295,9 +2361,9 @@ static void tomoyo_read_stat(struct tomoyo_io_buffer *head)
2295 tomoyo_stat_updated[i]); 2361 tomoyo_stat_updated[i]);
2296 if (tomoyo_stat_modified[i]) { 2362 if (tomoyo_stat_modified[i]) {
2297 struct tomoyo_time stamp; 2363 struct tomoyo_time stamp;
2364
2298 tomoyo_convert_time(tomoyo_stat_modified[i], &stamp); 2365 tomoyo_convert_time(tomoyo_stat_modified[i], &stamp);
2299 tomoyo_io_printf(head, " (Last: %04u/%02u/%02u " 2366 tomoyo_io_printf(head, " (Last: %04u/%02u/%02u %02u:%02u:%02u)",
2300 "%02u:%02u:%02u)",
2301 stamp.year, stamp.month, stamp.day, 2367 stamp.year, stamp.month, stamp.day,
2302 stamp.hour, stamp.min, stamp.sec); 2368 stamp.hour, stamp.min, stamp.sec);
2303 } 2369 }
@@ -2305,6 +2371,7 @@ static void tomoyo_read_stat(struct tomoyo_io_buffer *head)
2305 } 2371 }
2306 for (i = 0; i < TOMOYO_MAX_MEMORY_STAT; i++) { 2372 for (i = 0; i < TOMOYO_MAX_MEMORY_STAT; i++) {
2307 unsigned int used = tomoyo_memory_used[i]; 2373 unsigned int used = tomoyo_memory_used[i];
2374
2308 total += used; 2375 total += used;
2309 tomoyo_io_printf(head, "Memory used by %-22s %10u", 2376 tomoyo_io_printf(head, "Memory used by %-22s %10u",
2310 tomoyo_memory_headers[i], used); 2377 tomoyo_memory_headers[i], used);
@@ -2329,6 +2396,7 @@ static int tomoyo_write_stat(struct tomoyo_io_buffer *head)
2329 { 2396 {
2330 char *data = head->write_buf; 2397 char *data = head->write_buf;
2331 u8 i; 2398 u8 i;
2399
2332 if (tomoyo_str_starts(&data, "Memory used by ")) 2400 if (tomoyo_str_starts(&data, "Memory used by "))
2333 for (i = 0; i < TOMOYO_MAX_MEMORY_STAT; i++) 2401 for (i = 0; i < TOMOYO_MAX_MEMORY_STAT; i++)
2334 if (tomoyo_str_starts(&data, tomoyo_memory_headers[i])) 2402 if (tomoyo_str_starts(&data, tomoyo_memory_headers[i]))
@@ -2457,6 +2525,7 @@ int tomoyo_open_control(const u8 type, struct file *file)
2457 __poll_t tomoyo_poll_control(struct file *file, poll_table *wait) 2525 __poll_t tomoyo_poll_control(struct file *file, poll_table *wait)
2458 { 2526 {
2459 struct tomoyo_io_buffer *head = file->private_data; 2527 struct tomoyo_io_buffer *head = file->private_data;
2528
2460 if (head->poll) 2529 if (head->poll)
2461 return head->poll(file, wait) | EPOLLOUT | EPOLLWRNORM; 2530 return head->poll(file, wait) | EPOLLOUT | EPOLLWRNORM;
2462 return EPOLLIN | EPOLLRDNORM | EPOLLOUT | EPOLLWRNORM; 2531 return EPOLLIN | EPOLLRDNORM | EPOLLOUT | EPOLLWRNORM;
@@ -2472,6 +2541,7 @@ __poll_t tomoyo_poll_control(struct file *file, poll_table *wait)
2472 static inline void tomoyo_set_namespace_cursor(struct tomoyo_io_buffer *head) 2541 static inline void tomoyo_set_namespace_cursor(struct tomoyo_io_buffer *head)
2473 { 2542 {
2474 struct list_head *ns; 2543 struct list_head *ns;
2544
2475 if (head->type != TOMOYO_EXCEPTIONPOLICY && 2545 if (head->type != TOMOYO_EXCEPTIONPOLICY &&
2476 head->type != TOMOYO_PROFILE) 2546 head->type != TOMOYO_PROFILE)
2477 return; 2547 return;
@@ -2517,7 +2587,7 @@ ssize_t tomoyo_read_control(struct tomoyo_io_buffer *head, char __user *buffer,
2517 int idx; 2587 int idx;
2518 2588
2519 if (!head->read) 2589 if (!head->read)
2520 return -ENOSYS; 2590 return -EINVAL;
2521 if (mutex_lock_interruptible(&head->io_sem)) 2591 if (mutex_lock_interruptible(&head->io_sem))
2522 return -EINTR; 2592 return -EINTR;
2523 head->read_user_buf = buffer; 2593 head->read_user_buf = buffer;
@@ -2557,6 +2627,7 @@ static int tomoyo_parse_policy(struct tomoyo_io_buffer *head, char *line)
2557 head->type == TOMOYO_PROFILE) { 2627 head->type == TOMOYO_PROFILE) {
2558 if (*line == '<') { 2628 if (*line == '<') {
2559 char *cp = strchr(line, ' '); 2629 char *cp = strchr(line, ' ');
2630
2560 if (cp) { 2631 if (cp) {
2561 *cp++ = '\0'; 2632 *cp++ = '\0';
2562 head->w.ns = tomoyo_assign_namespace(line); 2633 head->w.ns = tomoyo_assign_namespace(line);
@@ -2589,8 +2660,9 @@ ssize_t tomoyo_write_control(struct tomoyo_io_buffer *head,
2589 size_t avail_len = buffer_len; 2660 size_t avail_len = buffer_len;
2590 char *cp0 = head->write_buf; 2661 char *cp0 = head->write_buf;
2591 int idx; 2662 int idx;
2663
2592 if (!head->write) 2664 if (!head->write)
2593 return -ENOSYS; 2665 return -EINVAL;
2594 if (!access_ok(buffer, buffer_len)) 2666 if (!access_ok(buffer, buffer_len))
2595 return -EFAULT; 2667 return -EFAULT;
2596 if (mutex_lock_interruptible(&head->io_sem)) 2668 if (mutex_lock_interruptible(&head->io_sem))
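Both tomoyo_read_control() and tomoyo_write_control() above change the "no handler" error from ENOSYS to EINVAL, matching the kernel convention that ENOSYS means a missing system call rather than an unsupported operation on a valid file. A simplified sketch of the guard (struct trimmed down from the patch):

#include <errno.h>
#include <stddef.h>

struct io_buffer {
	void (*read)(struct io_buffer *head);	/* NULL when not readable */
};

static long do_read(struct io_buffer *head)
{
	if (!head->read)
		return -EINVAL;		/* was -ENOSYS before this change */
	head->read(head);
	return 0;
}

int main(void)
{
	struct io_buffer head = { .read = NULL };

	return do_read(&head) == -EINVAL ? 0 : 1;
}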
@@ -2600,9 +2672,11 @@ ssize_t tomoyo_write_control(struct tomoyo_io_buffer *head,
2600 /* Read a line and dispatch it to the policy handler. */ 2672 /* Read a line and dispatch it to the policy handler. */
2601 while (avail_len > 0) { 2673 while (avail_len > 0) {
2602 char c; 2674 char c;
2675
2603 if (head->w.avail >= head->writebuf_size - 1) { 2676 if (head->w.avail >= head->writebuf_size - 1) {
2604 const int len = head->writebuf_size * 2; 2677 const int len = head->writebuf_size * 2;
2605 char *cp = kzalloc(len, GFP_NOFS); 2678 char *cp = kzalloc(len, GFP_NOFS);
2679
2606 if (!cp) { 2680 if (!cp) {
2607 error = -ENOMEM; 2681 error = -ENOMEM;
2608 break; 2682 break;
@@ -2701,30 +2775,32 @@ void tomoyo_check_profile(void)
2701 { 2775 {
2702 struct tomoyo_domain_info *domain; 2776 struct tomoyo_domain_info *domain;
2703 const int idx = tomoyo_read_lock(); 2777 const int idx = tomoyo_read_lock();
2778
2704 tomoyo_policy_loaded = true; 2779 tomoyo_policy_loaded = true;
2705 printk(KERN_INFO "TOMOYO: 2.5.0\n"); 2780 pr_info("TOMOYO: 2.6.0\n");
2706 list_for_each_entry_rcu(domain, &tomoyo_domain_list, list) { 2781 list_for_each_entry_rcu(domain, &tomoyo_domain_list, list) {
2707 const u8 profile = domain->profile; 2782 const u8 profile = domain->profile;
2708 const struct tomoyo_policy_namespace *ns = domain->ns; 2783 struct tomoyo_policy_namespace *ns = domain->ns;
2709 if (ns->profile_version != 20110903) 2784
2710 printk(KERN_ERR 2785 if (ns->profile_version == 20110903) {
2711 "Profile version %u is not supported.\n", 2786 pr_info_once("Converting profile version from %u to %u.\n",
2787 20110903, 20150505);
2788 ns->profile_version = 20150505;
2789 }
2790 if (ns->profile_version != 20150505)
2791 pr_err("Profile version %u is not supported.\n",
2712 ns->profile_version); 2792 ns->profile_version);
2713 else if (!ns->profile_ptr[profile]) 2793 else if (!ns->profile_ptr[profile])
2714 printk(KERN_ERR 2794 pr_err("Profile %u (used by '%s') is not defined.\n",
2715 "Profile %u (used by '%s') is not defined.\n",
2716 profile, domain->domainname->name); 2795 profile, domain->domainname->name);
2717 else 2796 else
2718 continue; 2797 continue;
2719 printk(KERN_ERR 2798 pr_err("Userland tools for TOMOYO 2.6 must be installed and policy must be initialized.\n");
2720 "Userland tools for TOMOYO 2.5 must be installed and " 2799 pr_err("Please see https://tomoyo.osdn.jp/2.6/ for more information.\n");
2721 "policy must be initialized.\n");
2722 printk(KERN_ERR "Please see http://tomoyo.sourceforge.jp/2.5/ "
2723 "for more information.\n");
2724 panic("STOP!"); 2800 panic("STOP!");
2725 } 2801 }
2726 tomoyo_read_unlock(idx); 2802 tomoyo_read_unlock(idx);
2727 printk(KERN_INFO "Mandatory Access Control activated.\n"); 2803 pr_info("Mandatory Access Control activated.\n");
2728 } 2804 }
2729 2805
2730 /** 2806 /**
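tomoyo_check_profile() above now upgrades old 20110903 profiles to 20150505 in place, printing the conversion notice only once, while any other unknown version remains fatal. A userspace model of that flow (exit() standing in for the kernel's panic("STOP!")):

#include <stdio.h>
#include <stdlib.h>

static void check_profile_version(unsigned int *version)
{
	static int warned;

	if (*version == 20110903) {
		if (!warned++)	/* pr_info_once() analogue */
			printf("Converting profile version from %u to %u.\n",
			       20110903, 20150505);
		*version = 20150505;
	}
	if (*version != 20150505) {
		fprintf(stderr, "Profile version %u is not supported.\n",
			*version);
		exit(1);
	}
}

int main(void)
{
	unsigned int ns_version = 20110903;

	check_profile_version(&ns_version);	/* converted, no error */
	return ns_version == 20150505 ? 0 : 1;
}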
@@ -2743,9 +2819,11 @@ void __init tomoyo_load_builtin_policy(void)
2743 #include "builtin-policy.h" 2819 #include "builtin-policy.h"
2744 u8 i; 2820 u8 i;
2745 const int idx = tomoyo_read_lock(); 2821 const int idx = tomoyo_read_lock();
2822
2746 for (i = 0; i < 5; i++) { 2823 for (i = 0; i < 5; i++) {
2747 struct tomoyo_io_buffer head = { }; 2824 struct tomoyo_io_buffer head = { };
2748 char *start = ""; 2825 char *start = "";
2826
2749 switch (i) { 2827 switch (i) {
2750 case 0: 2828 case 0:
2751 start = tomoyo_builtin_profile; 2829 start = tomoyo_builtin_profile;
@@ -2775,6 +2853,7 @@ void __init tomoyo_load_builtin_policy(void)
2775 } 2853 }
2776 while (1) { 2854 while (1) {
2777 char *end = strchr(start, '\n'); 2855 char *end = strchr(start, '\n');
2856
2778 if (!end) 2857 if (!end)
2779 break; 2858 break;
2780 *end = '\0'; 2859 *end = '\0';
diff --git a/security/tomoyo/common.h b/security/tomoyo/common.h
index 539bcdd30bb8..050473df5809 100644
--- a/security/tomoyo/common.h
+++ b/security/tomoyo/common.h
@@ -10,6 +10,8 @@
10 #ifndef _SECURITY_TOMOYO_COMMON_H 10 #ifndef _SECURITY_TOMOYO_COMMON_H
11 #define _SECURITY_TOMOYO_COMMON_H 11 #define _SECURITY_TOMOYO_COMMON_H
12 12
13 #define pr_fmt(fmt) fmt
14
13 #include <linux/ctype.h> 15 #include <linux/ctype.h>
14 #include <linux/string.h> 16 #include <linux/string.h>
15 #include <linux/mm.h> 17 #include <linux/mm.h>
@@ -29,6 +31,7 @@
29 #include <linux/in.h> 31 #include <linux/in.h>
30 #include <linux/in6.h> 32 #include <linux/in6.h>
31 #include <linux/un.h> 33 #include <linux/un.h>
34 #include <linux/lsm_hooks.h>
32 #include <net/sock.h> 35 #include <net/sock.h>
33 #include <net/af_unix.h> 36 #include <net/af_unix.h>
34 #include <net/ip.h> 37 #include <net/ip.h>
@@ -681,11 +684,12 @@ struct tomoyo_domain_info {
681 const struct tomoyo_path_info *domainname; 684 const struct tomoyo_path_info *domainname;
682 /* Namespace for this domain. Never NULL. */ 685 /* Namespace for this domain. Never NULL. */
683 struct tomoyo_policy_namespace *ns; 686 struct tomoyo_policy_namespace *ns;
687 /* Group numbers to use. */
688 unsigned long group[TOMOYO_MAX_ACL_GROUPS / BITS_PER_LONG];
684 u8 profile; /* Profile number to use. */ 689 u8 profile; /* Profile number to use. */
685 u8 group; /* Group number to use. */
686 bool is_deleted; /* Delete flag. */ 690 bool is_deleted; /* Delete flag. */
687 bool flags[TOMOYO_MAX_DOMAIN_INFO_FLAGS]; 691 bool flags[TOMOYO_MAX_DOMAIN_INFO_FLAGS];
688 atomic_t users; /* Number of referring credentials. */ 692 atomic_t users; /* Number of referring tasks. */
689 }; 693 };
690 694
691 /* 695 /*
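The new group[] member above is sized by exact integer division, which only works because TOMOYO_MAX_ACL_GROUPS (256 in this tree) is a multiple of BITS_PER_LONG on both 32- and 64-bit builds; the kernel's generic BITS_TO_LONGS() idiom rounds up instead. A compile-time check of that assumption, with the macros redefined locally for illustration:

#include <limits.h>

#define BITS_PER_LONG		(sizeof(unsigned long) * CHAR_BIT)
#define TOMOYO_MAX_ACL_GROUPS	256
#define BITS_TO_LONGS(n)	(((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

/* exact division: no group index can fall past the end of the array */
_Static_assert(TOMOYO_MAX_ACL_GROUPS / BITS_PER_LONG ==
	       BITS_TO_LONGS(TOMOYO_MAX_ACL_GROUPS),
	       "group[] must cover every ACL group bit");

int main(void)
{
	return 0;
}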
@@ -787,9 +791,9 @@ struct tomoyo_acl_param {
787 * interfaces. 791 * interfaces.
788 */ 792 */
789 struct tomoyo_io_buffer { 793 struct tomoyo_io_buffer {
790 void (*read) (struct tomoyo_io_buffer *); 794 void (*read)(struct tomoyo_io_buffer *head);
791 int (*write) (struct tomoyo_io_buffer *); 795 int (*write)(struct tomoyo_io_buffer *head);
792 __poll_t (*poll) (struct file *file, poll_table *wait); 796 __poll_t (*poll)(struct file *file, poll_table *wait);
793 /* Exclusive lock for this structure. */ 797 /* Exclusive lock for this structure. */
794 struct mutex io_sem; 798 struct mutex io_sem;
795 char __user *read_user_buf; 799 char __user *read_user_buf;
@@ -906,12 +910,18 @@ struct tomoyo_policy_namespace {
906 struct list_head acl_group[TOMOYO_MAX_ACL_GROUPS]; 910 struct list_head acl_group[TOMOYO_MAX_ACL_GROUPS];
907 /* List for connecting to tomoyo_namespace_list list. */ 911 /* List for connecting to tomoyo_namespace_list list. */
908 struct list_head namespace_list; 912 struct list_head namespace_list;
909 /* Profile version. Currently only 20110903 is defined. */ 913 /* Profile version. Currently only 20150505 is defined. */
910 unsigned int profile_version; 914 unsigned int profile_version;
911 /* Name of this namespace (e.g. "<kernel>", "</usr/sbin/httpd>" ). */ 915 /* Name of this namespace (e.g. "<kernel>", "</usr/sbin/httpd>" ). */
912 const char *name; 916 const char *name;
913 }; 917 };
914 918
919 /* Structure for "struct task_struct"->security. */
920 struct tomoyo_task {
921 struct tomoyo_domain_info *domain_info;
922 struct tomoyo_domain_info *old_domain_info;
923 };
924
915 /********** Function prototypes. **********/ 925 /********** Function prototypes. **********/
916 926
917 bool tomoyo_address_matches_group(const bool is_ipv6, const __be32 *address, 927 bool tomoyo_address_matches_group(const bool is_ipv6, const __be32 *address,
@@ -1020,6 +1030,7 @@ ssize_t tomoyo_write_control(struct tomoyo_io_buffer *head,
1020 struct tomoyo_condition *tomoyo_get_condition(struct tomoyo_acl_param *param); 1030 struct tomoyo_condition *tomoyo_get_condition(struct tomoyo_acl_param *param);
1021 struct tomoyo_domain_info *tomoyo_assign_domain(const char *domainname, 1031 struct tomoyo_domain_info *tomoyo_assign_domain(const char *domainname,
1022 const bool transit); 1032 const bool transit);
1033 struct tomoyo_domain_info *tomoyo_domain(void);
1023 struct tomoyo_domain_info *tomoyo_find_domain(const char *domainname); 1034 struct tomoyo_domain_info *tomoyo_find_domain(const char *domainname);
1024 struct tomoyo_group *tomoyo_get_group(struct tomoyo_acl_param *param, 1035 struct tomoyo_group *tomoyo_get_group(struct tomoyo_acl_param *param,
1025 const u8 idx); 1036 const u8 idx);
@@ -1034,8 +1045,8 @@ void *tomoyo_commit_ok(void *data, const unsigned int size);
1034 void __init tomoyo_load_builtin_policy(void); 1045 void __init tomoyo_load_builtin_policy(void);
1035 void __init tomoyo_mm_init(void); 1046 void __init tomoyo_mm_init(void);
1036 void tomoyo_check_acl(struct tomoyo_request_info *r, 1047 void tomoyo_check_acl(struct tomoyo_request_info *r,
1037 bool (*check_entry) (struct tomoyo_request_info *, 1048 bool (*check_entry)(struct tomoyo_request_info *,
1038 const struct tomoyo_acl_info *)); 1049 const struct tomoyo_acl_info *));
1039 void tomoyo_check_profile(void); 1050 void tomoyo_check_profile(void);
1040 void tomoyo_convert_time(time64_t time, struct tomoyo_time *stamp); 1051 void tomoyo_convert_time(time64_t time, struct tomoyo_time *stamp);
1041 void tomoyo_del_condition(struct list_head *element); 1052 void tomoyo_del_condition(struct list_head *element);
@@ -1062,6 +1073,7 @@ void tomoyo_write_log2(struct tomoyo_request_info *r, int len, const char *fmt,
1062 /********** External variable definitions. **********/ 1073 /********** External variable definitions. **********/
1063 1074
1064 extern bool tomoyo_policy_loaded; 1075 extern bool tomoyo_policy_loaded;
1076 extern int tomoyo_enabled;
1065 extern const char * const tomoyo_condition_keyword 1077 extern const char * const tomoyo_condition_keyword
1066 [TOMOYO_MAX_CONDITION_KEYWORD]; 1078 [TOMOYO_MAX_CONDITION_KEYWORD];
1067 extern const char * const tomoyo_dif[TOMOYO_MAX_DOMAIN_INFO_FLAGS]; 1079 extern const char * const tomoyo_dif[TOMOYO_MAX_DOMAIN_INFO_FLAGS];
@@ -1085,6 +1097,7 @@ extern struct tomoyo_domain_info tomoyo_kernel_domain;
1085 extern struct tomoyo_policy_namespace tomoyo_kernel_namespace; 1097 extern struct tomoyo_policy_namespace tomoyo_kernel_namespace;
1086 extern unsigned int tomoyo_memory_quota[TOMOYO_MAX_MEMORY_STAT]; 1098 extern unsigned int tomoyo_memory_quota[TOMOYO_MAX_MEMORY_STAT];
1087 extern unsigned int tomoyo_memory_used[TOMOYO_MAX_MEMORY_STAT]; 1099 extern unsigned int tomoyo_memory_used[TOMOYO_MAX_MEMORY_STAT];
1100 extern struct lsm_blob_sizes tomoyo_blob_sizes;
1088 1101
1089 /********** Inlined functions. **********/ 1102 /********** Inlined functions. **********/
1090 1103
@@ -1121,6 +1134,7 @@ static inline void tomoyo_read_unlock(int idx)
1121 static inline pid_t tomoyo_sys_getppid(void) 1134 static inline pid_t tomoyo_sys_getppid(void)
1122 { 1135 {
1123 pid_t pid; 1136 pid_t pid;
1137
1124 rcu_read_lock(); 1138 rcu_read_lock();
1125 pid = task_tgid_vnr(rcu_dereference(current->real_parent)); 1139 pid = task_tgid_vnr(rcu_dereference(current->real_parent));
1126 rcu_read_unlock(); 1140 rcu_read_unlock();
@@ -1197,26 +1211,15 @@ static inline void tomoyo_put_group(struct tomoyo_group *group)
1197 } 1211 }
1198 1212
1199 /** 1213 /**
1200 * tomoyo_domain - Get "struct tomoyo_domain_info" for current thread. 1214 * tomoyo_task - Get "struct tomoyo_task" for specified thread.
1201 *
1202 * Returns pointer to "struct tomoyo_domain_info" for current thread.
1203 */
1204 static inline struct tomoyo_domain_info *tomoyo_domain(void)
1205 {
1206 return current_cred()->security;
1207 }
1208
1209 /**
1210 * tomoyo_real_domain - Get "struct tomoyo_domain_info" for specified thread.
1211 * 1215 *
1212 * @task: Pointer to "struct task_struct". 1216 * @task - Pointer to "struct task_struct".
1213 * 1217 *
1214 * Returns pointer to "struct tomoyo_security" for specified thread. 1218 * Returns pointer to "struct tomoyo_task" for specified thread.
1215 */ 1219 */
1216 static inline struct tomoyo_domain_info *tomoyo_real_domain(struct task_struct 1220 static inline struct tomoyo_task *tomoyo_task(struct task_struct *task)
1217 *task)
1218 { 1221 {
1219 return task_cred_xxx(task, security); 1222 return task->security + tomoyo_blob_sizes.lbs_task;
1220 } 1223 }
1221 1224
1222 /** 1225 /**
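The tomoyo_task() rewrite above is the LSM blob-sharing pattern: the security core allocates one blob per task for all stacked modules and tells each module the byte offset of its slice via its struct lsm_blob_sizes. A sketch of the offset arithmetic with an invented offset (the real lbs_task is assigned by the LSM core at boot, not hard-coded):

#include <stdio.h>

struct tomoyo_task {
	void *domain_info;
	void *old_domain_info;
};

struct lsm_blob_sizes {
	int lbs_task;
};

/* illustrative value only; in the kernel this is filled in at init time */
static struct lsm_blob_sizes tomoyo_blob_sizes = { .lbs_task = 8 };

static struct tomoyo_task *tomoyo_task_of(void *task_security)
{
	/* mirrors: task->security + tomoyo_blob_sizes.lbs_task */
	return (struct tomoyo_task *)((char *)task_security +
				      tomoyo_blob_sizes.lbs_task);
}

int main(void)
{
	char blob[64] = { 0 };	/* stand-in for the per-task security blob */
	struct tomoyo_task *s = tomoyo_task_of(blob);

	printf("tomoyo slice starts at offset %td\n", (char *)s - blob);
	return 0;
}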
diff --git a/security/tomoyo/condition.c b/security/tomoyo/condition.c
index 8d0e1b9c9c57..8f6d57c15df6 100644
--- a/security/tomoyo/condition.c
+++ b/security/tomoyo/condition.c
@@ -28,9 +28,11 @@ static bool tomoyo_argv(const unsigned int index, const char *arg_ptr,
28 { 28 {
29 int i; 29 int i;
30 struct tomoyo_path_info arg; 30 struct tomoyo_path_info arg;
31
31 arg.name = arg_ptr; 32 arg.name = arg_ptr;
32 for (i = 0; i < argc; argv++, checked++, i++) { 33 for (i = 0; i < argc; argv++, checked++, i++) {
33 bool result; 34 bool result;
35
34 if (index != argv->index) 36 if (index != argv->index)
35 continue; 37 continue;
36 *checked = 1; 38 *checked = 1;
@@ -62,12 +64,14 @@ static bool tomoyo_envp(const char *env_name, const char *env_value,
62 int i; 64 int i;
63 struct tomoyo_path_info name; 65 struct tomoyo_path_info name;
64 struct tomoyo_path_info value; 66 struct tomoyo_path_info value;
67
65 name.name = env_name; 68 name.name = env_name;
66 tomoyo_fill_path_info(&name); 69 tomoyo_fill_path_info(&name);
67 value.name = env_value; 70 value.name = env_value;
68 tomoyo_fill_path_info(&value); 71 tomoyo_fill_path_info(&value);
69 for (i = 0; i < envc; envp++, checked++, i++) { 72 for (i = 0; i < envc; envp++, checked++, i++) {
70 bool result; 73 bool result;
74
71 if (!tomoyo_path_matches_pattern(&name, envp->name)) 75 if (!tomoyo_path_matches_pattern(&name, envp->name))
72 continue; 76 continue;
73 *checked = 1; 77 *checked = 1;
@@ -113,6 +117,7 @@ static bool tomoyo_scan_bprm(struct tomoyo_execve *ee,
113 bool result = true; 117 bool result = true;
114 u8 local_checked[32]; 118 u8 local_checked[32];
115 u8 *checked; 119 u8 *checked;
120
116 if (argc + envc <= sizeof(local_checked)) { 121 if (argc + envc <= sizeof(local_checked)) {
117 checked = local_checked; 122 checked = local_checked;
118 memset(local_checked, 0, sizeof(local_checked)); 123 memset(local_checked, 0, sizeof(local_checked));
@@ -131,6 +136,7 @@ static bool tomoyo_scan_bprm(struct tomoyo_execve *ee,
131 /* Read. */ 136 /* Read. */
132 const char *kaddr = dump->data; 137 const char *kaddr = dump->data;
133 const unsigned char c = kaddr[offset++]; 138 const unsigned char c = kaddr[offset++];
139
134 if (c && arg_len < TOMOYO_EXEC_TMPSIZE - 10) { 140 if (c && arg_len < TOMOYO_EXEC_TMPSIZE - 10) {
135 if (c == '\\') { 141 if (c == '\\') {
136 arg_ptr[arg_len++] = '\\'; 142 arg_ptr[arg_len++] = '\\';
@@ -160,6 +166,7 @@ static bool tomoyo_scan_bprm(struct tomoyo_execve *ee,
160 argv_count--; 166 argv_count--;
161 } else if (envp_count) { 167 } else if (envp_count) {
162 char *cp = strchr(arg_ptr, '='); 168 char *cp = strchr(arg_ptr, '=');
169
163 if (cp) { 170 if (cp) {
164 *cp = '\0'; 171 *cp = '\0';
165 if (!tomoyo_envp(arg_ptr, cp + 1, 172 if (!tomoyo_envp(arg_ptr, cp + 1,
@@ -182,6 +189,7 @@ static bool tomoyo_scan_bprm(struct tomoyo_execve *ee,
182 out: 189 out:
183 if (result) { 190 if (result) {
184 int i; 191 int i;
192
185 /* Check not-yet-checked entries. */ 193 /* Check not-yet-checked entries. */
186 for (i = 0; i < argc; i++) { 194 for (i = 0; i < argc; i++) {
187 if (checked[i]) 195 if (checked[i])
@@ -229,6 +237,7 @@ static bool tomoyo_scan_exec_realpath(struct file *file,
229{ 237{
230 bool result; 238 bool result;
231 struct tomoyo_path_info exe; 239 struct tomoyo_path_info exe;
240
232 if (!file) 241 if (!file)
233 return false; 242 return false;
234 exe.name = tomoyo_realpath_from_path(&file->f_path); 243 exe.name = tomoyo_realpath_from_path(&file->f_path);
@@ -250,6 +259,7 @@ static bool tomoyo_scan_exec_realpath(struct file *file,
250 static const struct tomoyo_path_info *tomoyo_get_dqword(char *start) 259 static const struct tomoyo_path_info *tomoyo_get_dqword(char *start)
251 { 260 {
252 char *cp = start + strlen(start) - 1; 261 char *cp = start + strlen(start) - 1;
262
253 if (cp == start || *start++ != '"' || *cp != '"') 263 if (cp == start || *start++ != '"' || *cp != '"')
254 return NULL; 264 return NULL;
255 *cp = '\0'; 265 *cp = '\0';
@@ -270,6 +280,7 @@ static bool tomoyo_parse_name_union_quoted(struct tomoyo_acl_param *param,
270 struct tomoyo_name_union *ptr) 280 struct tomoyo_name_union *ptr)
271{ 281{
272 char *filename = param->data; 282 char *filename = param->data;
283
273 if (*filename == '@') 284 if (*filename == '@')
274 return tomoyo_parse_name_union(param, ptr); 285 return tomoyo_parse_name_union(param, ptr);
275 ptr->filename = tomoyo_get_dqword(filename); 286 ptr->filename = tomoyo_get_dqword(filename);
@@ -310,6 +321,7 @@ static bool tomoyo_parse_envp(char *left, char *right,
310 const struct tomoyo_path_info *name; 321 const struct tomoyo_path_info *name;
311 const struct tomoyo_path_info *value; 322 const struct tomoyo_path_info *value;
312 char *cp = left + strlen(left) - 1; 323 char *cp = left + strlen(left) - 1;
324
313 if (*cp-- != ']' || *cp != '"') 325 if (*cp-- != ']' || *cp != '"')
314 goto out; 326 goto out;
315 *cp = '\0'; 327 *cp = '\0';
@@ -364,6 +376,7 @@ static inline bool tomoyo_same_condition(const struct tomoyo_condition *a,
364 static u8 tomoyo_condition_type(const char *word) 376 static u8 tomoyo_condition_type(const char *word)
365 { 377 {
366 u8 i; 378 u8 i;
379
367 for (i = 0; i < TOMOYO_MAX_CONDITION_KEYWORD; i++) { 380 for (i = 0; i < TOMOYO_MAX_CONDITION_KEYWORD; i++) {
368 if (!strcmp(word, tomoyo_condition_keyword[i])) 381 if (!strcmp(word, tomoyo_condition_keyword[i]))
369 break; 382 break;
@@ -395,6 +408,7 @@ static struct tomoyo_condition *tomoyo_commit_condition
395 { 408 {
396 struct tomoyo_condition *ptr; 409 struct tomoyo_condition *ptr;
397 bool found = false; 410 bool found = false;
411
398 if (mutex_lock_interruptible(&tomoyo_policy_lock)) { 412 if (mutex_lock_interruptible(&tomoyo_policy_lock)) {
399 dprintk(KERN_WARNING "%u: %s failed\n", __LINE__, __func__); 413 dprintk(KERN_WARNING "%u: %s failed\n", __LINE__, __func__);
400 ptr = NULL; 414 ptr = NULL;
@@ -442,12 +456,14 @@ static char *tomoyo_get_transit_preference(struct tomoyo_acl_param *param,
442 { 456 {
443 char * const pos = param->data; 457 char * const pos = param->data;
444 bool flag; 458 bool flag;
459
445 if (*pos == '<') { 460 if (*pos == '<') {
446 e->transit = tomoyo_get_domainname(param); 461 e->transit = tomoyo_get_domainname(param);
447 goto done; 462 goto done;
448 } 463 }
449 { 464 {
450 char *cp = strchr(pos, ' '); 465 char *cp = strchr(pos, ' ');
466
451 if (cp) 467 if (cp)
452 *cp = '\0'; 468 *cp = '\0';
453 flag = tomoyo_correct_path(pos) || !strcmp(pos, "keep") || 469 flag = tomoyo_correct_path(pos) || !strcmp(pos, "keep") ||
@@ -489,6 +505,7 @@ struct tomoyo_condition *tomoyo_get_condition(struct tomoyo_acl_param *param)
489 tomoyo_get_transit_preference(param, &e); 505 tomoyo_get_transit_preference(param, &e);
490 char * const end_of_string = start_of_string + strlen(start_of_string); 506 char * const end_of_string = start_of_string + strlen(start_of_string);
491 char *pos; 507 char *pos;
508
492 rerun: 509 rerun:
493 pos = start_of_string; 510 pos = start_of_string;
494 while (1) { 511 while (1) {
@@ -498,6 +515,7 @@ rerun:
498 char *cp; 515 char *cp;
499 char *right_word; 516 char *right_word;
500 bool is_not; 517 bool is_not;
518
501 if (!*left_word) 519 if (!*left_word)
502 break; 520 break;
503 /* 521 /*
@@ -622,8 +640,8 @@ rerun:
622 } 640 }
623 store_value: 641 store_value:
624 if (!condp) { 642 if (!condp) {
625 dprintk(KERN_WARNING "%u: dry_run left=%u right=%u " 643 dprintk(KERN_WARNING "%u: dry_run left=%u right=%u match=%u\n",
626 "match=%u\n", __LINE__, left, right, !is_not); 644 __LINE__, left, right, !is_not);
627 continue; 645 continue;
628 } 646 }
629 condp->left = left; 647 condp->left = left;
@@ -660,6 +678,7 @@ store_value:
660 envp = (struct tomoyo_envp *) (argv + e.argc); 678 envp = (struct tomoyo_envp *) (argv + e.argc);
661 { 679 {
662 bool flag = false; 680 bool flag = false;
681
663 for (pos = start_of_string; pos < end_of_string; pos++) { 682 for (pos = start_of_string; pos < end_of_string; pos++) {
664 if (*pos) 683 if (*pos)
665 continue; 684 continue;
@@ -698,6 +717,7 @@ void tomoyo_get_attributes(struct tomoyo_obj_info *obj)
698 717
699 for (i = 0; i < TOMOYO_MAX_PATH_STAT; i++) { 718 for (i = 0; i < TOMOYO_MAX_PATH_STAT; i++) {
700 struct inode *inode; 719 struct inode *inode;
720
701 switch (i) { 721 switch (i) {
702 case TOMOYO_PATH1: 722 case TOMOYO_PATH1:
703 dentry = obj->path1.dentry; 723 dentry = obj->path1.dentry;
@@ -718,6 +738,7 @@ void tomoyo_get_attributes(struct tomoyo_obj_info *obj)
718 inode = d_backing_inode(dentry); 738 inode = d_backing_inode(dentry);
719 if (inode) { 739 if (inode) {
720 struct tomoyo_mini_stat *stat = &obj->stat[i]; 740 struct tomoyo_mini_stat *stat = &obj->stat[i];
741
721 stat->uid = inode->i_uid; 742 stat->uid = inode->i_uid;
722 stat->gid = inode->i_gid; 743 stat->gid = inode->i_gid;
723 stat->ino = inode->i_ino; 744 stat->ino = inode->i_ino;
@@ -726,8 +747,7 @@ void tomoyo_get_attributes(struct tomoyo_obj_info *obj)
726 stat->rdev = inode->i_rdev; 747 stat->rdev = inode->i_rdev;
727 obj->stat_valid[i] = true; 748 obj->stat_valid[i] = true;
728 } 749 }
729 if (i & 1) /* i == TOMOYO_PATH1_PARENT || 750 if (i & 1) /* TOMOYO_PATH1_PARENT or TOMOYO_PATH2_PARENT */
730 i == TOMOYO_PATH2_PARENT */
731 dput(dentry); 751 dput(dentry);
732 } 752 }
733 } 753 }
@@ -758,6 +778,7 @@ bool tomoyo_condition(struct tomoyo_request_info *r,
758 u16 argc; 778 u16 argc;
759 u16 envc; 779 u16 envc;
760 struct linux_binprm *bprm = NULL; 780 struct linux_binprm *bprm = NULL;
781
761 if (!cond) 782 if (!cond)
762 return true; 783 return true;
763 condc = cond->condc; 784 condc = cond->condc;
@@ -780,6 +801,7 @@ bool tomoyo_condition(struct tomoyo_request_info *r,
780 const u8 right = condp->right; 801 const u8 right = condp->right;
781 bool is_bitop[2] = { false, false }; 802 bool is_bitop[2] = { false, false };
782 u8 j; 803 u8 j;
804
783 condp++; 805 condp++;
784 /* Check argv[] and envp[] later. */ 806 /* Check argv[] and envp[] later. */
785 if (left == TOMOYO_ARGV_ENTRY || left == TOMOYO_ENVP_ENTRY) 807 if (left == TOMOYO_ARGV_ENTRY || left == TOMOYO_ENVP_ENTRY)
@@ -787,10 +809,11 @@ bool tomoyo_condition(struct tomoyo_request_info *r,
787 /* Check string expressions. */ 809 /* Check string expressions. */
788 if (right == TOMOYO_NAME_UNION) { 810 if (right == TOMOYO_NAME_UNION) {
789 const struct tomoyo_name_union *ptr = names_p++; 811 const struct tomoyo_name_union *ptr = names_p++;
812 struct tomoyo_path_info *symlink;
813 struct tomoyo_execve *ee;
814 struct file *file;
815
790 switch (left) { 816 switch (left) {
791 struct tomoyo_path_info *symlink;
792 struct tomoyo_execve *ee;
793 struct file *file;
794 case TOMOYO_SYMLINK_TARGET: 817 case TOMOYO_SYMLINK_TARGET:
795 symlink = obj ? obj->symlink_target : NULL; 818 symlink = obj ? obj->symlink_target : NULL;
796 if (!symlink || 819 if (!symlink ||
@@ -812,6 +835,7 @@ bool tomoyo_condition(struct tomoyo_request_info *r,
812 for (j = 0; j < 2; j++) { 835 for (j = 0; j < 2; j++) {
813 const u8 index = j ? right : left; 836 const u8 index = j ? right : left;
814 unsigned long value = 0; 837 unsigned long value = 0;
838
815 switch (index) { 839 switch (index) {
816 case TOMOYO_TASK_UID: 840 case TOMOYO_TASK_UID:
817 value = from_kuid(&init_user_ns, current_uid()); 841 value = from_kuid(&init_user_ns, current_uid());
@@ -874,31 +898,31 @@ bool tomoyo_condition(struct tomoyo_request_info *r,
874 value = S_ISVTX; 898 value = S_ISVTX;
875 break; 899 break;
876 case TOMOYO_MODE_OWNER_READ: 900 case TOMOYO_MODE_OWNER_READ:
877 value = S_IRUSR; 901 value = 0400;
878 break; 902 break;
879 case TOMOYO_MODE_OWNER_WRITE: 903 case TOMOYO_MODE_OWNER_WRITE:
880 value = S_IWUSR; 904 value = 0200;
881 break; 905 break;
882 case TOMOYO_MODE_OWNER_EXECUTE: 906 case TOMOYO_MODE_OWNER_EXECUTE:
883 value = S_IXUSR; 907 value = 0100;
884 break; 908 break;
885 case TOMOYO_MODE_GROUP_READ: 909 case TOMOYO_MODE_GROUP_READ:
886 value = S_IRGRP; 910 value = 0040;
887 break; 911 break;
888 case TOMOYO_MODE_GROUP_WRITE: 912 case TOMOYO_MODE_GROUP_WRITE:
889 value = S_IWGRP; 913 value = 0020;
890 break; 914 break;
891 case TOMOYO_MODE_GROUP_EXECUTE: 915 case TOMOYO_MODE_GROUP_EXECUTE:
892 value = S_IXGRP; 916 value = 0010;
893 break; 917 break;
894 case TOMOYO_MODE_OTHERS_READ: 918 case TOMOYO_MODE_OTHERS_READ:
895 value = S_IROTH; 919 value = 0004;
896 break; 920 break;
897 case TOMOYO_MODE_OTHERS_WRITE: 921 case TOMOYO_MODE_OTHERS_WRITE:
898 value = S_IWOTH; 922 value = 0002;
899 break; 923 break;
900 case TOMOYO_MODE_OTHERS_EXECUTE: 924 case TOMOYO_MODE_OTHERS_EXECUTE:
901 value = S_IXOTH; 925 value = 0001;
902 break; 926 break;
903 case TOMOYO_EXEC_ARGC: 927 case TOMOYO_EXEC_ARGC:
904 if (!bprm) 928 if (!bprm)
@@ -923,6 +947,7 @@ bool tomoyo_condition(struct tomoyo_request_info *r,
923 { 947 {
924 u8 stat_index; 948 u8 stat_index;
925 struct tomoyo_mini_stat *stat; 949 struct tomoyo_mini_stat *stat;
950
926 switch (index) { 951 switch (index) {
927 case TOMOYO_PATH1_UID: 952 case TOMOYO_PATH1_UID:
928 case TOMOYO_PATH1_GID: 953 case TOMOYO_PATH1_GID:
@@ -1036,12 +1061,14 @@ bool tomoyo_condition(struct tomoyo_request_info *r,
1036 if (left == TOMOYO_NUMBER_UNION) { 1061 if (left == TOMOYO_NUMBER_UNION) {
1037 /* Fetch values now. */ 1062 /* Fetch values now. */
1038 const struct tomoyo_number_union *ptr = numbers_p++; 1063 const struct tomoyo_number_union *ptr = numbers_p++;
1064
1039 min_v[0] = ptr->values[0]; 1065 min_v[0] = ptr->values[0];
1040 max_v[0] = ptr->values[1]; 1066 max_v[0] = ptr->values[1];
1041 } 1067 }
1042 if (right == TOMOYO_NUMBER_UNION) { 1068 if (right == TOMOYO_NUMBER_UNION) {
1043 /* Fetch values now. */ 1069 /* Fetch values now. */
1044 const struct tomoyo_number_union *ptr = numbers_p++; 1070 const struct tomoyo_number_union *ptr = numbers_p++;
1071
1045 if (ptr->group) { 1072 if (ptr->group) {
1046 if (tomoyo_number_matches_group(min_v[0], 1073 if (tomoyo_number_matches_group(min_v[0],
1047 max_v[0], 1074 max_v[0],
diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c
index f6758dad981f..8526a0a74023 100644
--- a/security/tomoyo/domain.c
+++ b/security/tomoyo/domain.c
@@ -30,10 +30,10 @@ struct tomoyo_domain_info tomoyo_kernel_domain;
30 */ 30 */
31int tomoyo_update_policy(struct tomoyo_acl_head *new_entry, const int size, 31int tomoyo_update_policy(struct tomoyo_acl_head *new_entry, const int size,
32 struct tomoyo_acl_param *param, 32 struct tomoyo_acl_param *param,
33 bool (*check_duplicate) (const struct tomoyo_acl_head 33 bool (*check_duplicate)(const struct tomoyo_acl_head
34 *, 34 *,
35 const struct tomoyo_acl_head 35 const struct tomoyo_acl_head
36 *)) 36 *))
37 { 37 {
38 int error = param->is_delete ? -ENOENT : -ENOMEM; 38 int error = param->is_delete ? -ENOENT : -ENOMEM;
39 struct tomoyo_acl_head *entry; 39 struct tomoyo_acl_head *entry;
@@ -90,13 +90,13 @@ static inline bool tomoyo_same_acl_head(const struct tomoyo_acl_info *a,
90 */ 90 */
91 int tomoyo_update_domain(struct tomoyo_acl_info *new_entry, const int size, 91 int tomoyo_update_domain(struct tomoyo_acl_info *new_entry, const int size,
92 struct tomoyo_acl_param *param, 92 struct tomoyo_acl_param *param,
93 bool (*check_duplicate) (const struct tomoyo_acl_info 93 bool (*check_duplicate)(const struct tomoyo_acl_info
94 *, 94 *,
95 const struct tomoyo_acl_info 95 const struct tomoyo_acl_info
96 *), 96 *),
97 bool (*merge_duplicate) (struct tomoyo_acl_info *, 97 bool (*merge_duplicate)(struct tomoyo_acl_info *,
98 struct tomoyo_acl_info *, 98 struct tomoyo_acl_info *,
99 const bool)) 99 const bool))
100 { 100 {
101 const bool is_delete = param->is_delete; 101 const bool is_delete = param->is_delete;
102 int error = is_delete ? -ENOENT : -ENOMEM; 102 int error = is_delete ? -ENOENT : -ENOMEM;
@@ -157,13 +157,13 @@ out:
157 * Caller holds tomoyo_read_lock(). 157 * Caller holds tomoyo_read_lock().
158 */ 158 */
159 void tomoyo_check_acl(struct tomoyo_request_info *r, 159 void tomoyo_check_acl(struct tomoyo_request_info *r,
160 bool (*check_entry) (struct tomoyo_request_info *, 160 bool (*check_entry)(struct tomoyo_request_info *,
161 const struct tomoyo_acl_info *)) 161 const struct tomoyo_acl_info *))
162 { 162 {
163 const struct tomoyo_domain_info *domain = r->domain; 163 const struct tomoyo_domain_info *domain = r->domain;
164 struct tomoyo_acl_info *ptr; 164 struct tomoyo_acl_info *ptr;
165 bool retried = false;
166 const struct list_head *list = &domain->acl_info_list; 165 const struct list_head *list = &domain->acl_info_list;
166 u16 i = 0;
167 167
168 retry: 168 retry:
169 list_for_each_entry_rcu(ptr, list, list) { 169 list_for_each_entry_rcu(ptr, list, list) {
@@ -177,9 +177,10 @@ retry:
177 r->granted = true; 177 r->granted = true;
178 return; 178 return;
179 } 179 }
180 if (!retried) { 180 for (; i < TOMOYO_MAX_ACL_GROUPS; i++) {
181 retried = true; 181 if (!test_bit(i, domain->group))
182 list = &domain->ns->acl_group[domain->group]; 182 continue;
183 list = &domain->ns->acl_group[i++];
183 goto retry; 184 goto retry;
184 } 185 }
185 r->granted = false; 186 r->granted = false;
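With the bitmap, tomoyo_check_acl() above no longer retries exactly once: after the domain's own ACL list misses, it walks every group whose bit is set, incrementing the index before each goto so the scan resumes at the following bit. A self-contained model of that control flow (a toy matcher stands in for the real per-entry check):

#include <stdbool.h>
#include <stdio.h>

#define MAX_GROUPS 8

static bool match_in(int list_id)
{
	return list_id == 5;	/* toy: only group 5's list matches */
}

static bool check_acl(const unsigned char *group_bits)
{
	int list = -1;		/* -1 stands for the domain's own list */
	unsigned int i = 0;

retry:
	if (match_in(list))
		return true;
	for (; i < MAX_GROUPS; i++) {
		if (!(group_bits[i / 8] & (1 << (i % 8))))
			continue;
		list = i++;	/* post-increment: resume after this bit */
		goto retry;
	}
	return false;
}

int main(void)
{
	unsigned char bits[1] = { (1 << 2) | (1 << 5) };

	printf("%d\n", check_acl(bits));	/* tries groups 2, then 5 */
	return 0;
}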
@@ -198,6 +199,7 @@ LIST_HEAD(tomoyo_domain_list);
198 static const char *tomoyo_last_word(const char *name) 199 static const char *tomoyo_last_word(const char *name)
199 { 200 {
200 const char *cp = strrchr(name, ' '); 201 const char *cp = strrchr(name, ' ');
202
201 if (cp) 203 if (cp)
202 return cp + 1; 204 return cp + 1;
203 return name; 205 return name;
@@ -220,6 +222,7 @@ static bool tomoyo_same_transition_control(const struct tomoyo_acl_head *a,
220 const struct tomoyo_transition_control *p2 = container_of(b, 222 const struct tomoyo_transition_control *p2 = container_of(b,
221 typeof(*p2), 223 typeof(*p2),
222 head); 224 head);
225
223 return p1->type == p2->type && p1->is_last_name == p2->is_last_name 226 return p1->type == p2->type && p1->is_last_name == p2->is_last_name
224 && p1->domainname == p2->domainname 227 && p1->domainname == p2->domainname
225 && p1->program == p2->program; 228 && p1->program == p2->program;
@@ -240,6 +243,7 @@ int tomoyo_write_transition_control(struct tomoyo_acl_param *param,
240 int error = param->is_delete ? -ENOENT : -ENOMEM; 243 int error = param->is_delete ? -ENOENT : -ENOMEM;
241 char *program = param->data; 244 char *program = param->data;
242 char *domainname = strstr(program, " from "); 245 char *domainname = strstr(program, " from ");
246
243 if (domainname) { 247 if (domainname) {
244 *domainname = '\0'; 248 *domainname = '\0';
245 domainname += 6; 249 domainname += 6;
@@ -293,6 +297,7 @@ static inline bool tomoyo_scan_transition
293 const enum tomoyo_transition_type type) 297 const enum tomoyo_transition_type type)
294{ 298{
295 const struct tomoyo_transition_control *ptr; 299 const struct tomoyo_transition_control *ptr;
300
296 list_for_each_entry_rcu(ptr, list, head.list) { 301 list_for_each_entry_rcu(ptr, list, head.list) {
297 if (ptr->head.is_deleted || ptr->type != type) 302 if (ptr->head.is_deleted || ptr->type != type)
298 continue; 303 continue;
@@ -338,9 +343,11 @@ static enum tomoyo_transition_type tomoyo_transition_type
338{ 343{
339 const char *last_name = tomoyo_last_word(domainname->name); 344 const char *last_name = tomoyo_last_word(domainname->name);
340 enum tomoyo_transition_type type = TOMOYO_TRANSITION_CONTROL_NO_RESET; 345 enum tomoyo_transition_type type = TOMOYO_TRANSITION_CONTROL_NO_RESET;
346
341 while (type < TOMOYO_MAX_TRANSITION_TYPE) { 347 while (type < TOMOYO_MAX_TRANSITION_TYPE) {
342 const struct list_head * const list = 348 const struct list_head * const list =
343 &ns->policy_list[TOMOYO_ID_TRANSITION_CONTROL]; 349 &ns->policy_list[TOMOYO_ID_TRANSITION_CONTROL];
350
344 if (!tomoyo_scan_transition(list, domainname, program, 351 if (!tomoyo_scan_transition(list, domainname, program,
345 last_name, type)) { 352 last_name, type)) {
346 type++; 353 type++;
@@ -375,6 +382,7 @@ static bool tomoyo_same_aggregator(const struct tomoyo_acl_head *a,
375 head); 382 head);
376 const struct tomoyo_aggregator *p2 = container_of(b, typeof(*p2), 383 const struct tomoyo_aggregator *p2 = container_of(b, typeof(*p2),
377 head); 384 head);
385
378 return p1->original_name == p2->original_name && 386 return p1->original_name == p2->original_name &&
379 p1->aggregated_name == p2->aggregated_name; 387 p1->aggregated_name == p2->aggregated_name;
380} 388}
@@ -394,6 +402,7 @@ int tomoyo_write_aggregator(struct tomoyo_acl_param *param)
394 int error = param->is_delete ? -ENOENT : -ENOMEM; 402 int error = param->is_delete ? -ENOENT : -ENOMEM;
395 const char *original_name = tomoyo_read_token(param); 403 const char *original_name = tomoyo_read_token(param);
396 const char *aggregated_name = tomoyo_read_token(param); 404 const char *aggregated_name = tomoyo_read_token(param);
405
397 if (!tomoyo_correct_word(original_name) || 406 if (!tomoyo_correct_word(original_name) ||
398 !tomoyo_correct_path(aggregated_name)) 407 !tomoyo_correct_path(aggregated_name))
399 return -EINVAL; 408 return -EINVAL;
@@ -426,6 +435,7 @@ static struct tomoyo_policy_namespace *tomoyo_find_namespace
426(const char *name, const unsigned int len) 435(const char *name, const unsigned int len)
427{ 436{
428 struct tomoyo_policy_namespace *ns; 437 struct tomoyo_policy_namespace *ns;
438
429 list_for_each_entry(ns, &tomoyo_namespace_list, namespace_list) { 439 list_for_each_entry(ns, &tomoyo_namespace_list, namespace_list) {
430 if (strncmp(name, ns->name, len) || 440 if (strncmp(name, ns->name, len) ||
431 (name[len] && name[len] != ' ')) 441 (name[len] && name[len] != ' '))
@@ -451,6 +461,7 @@ struct tomoyo_policy_namespace *tomoyo_assign_namespace(const char *domainname)
451 struct tomoyo_policy_namespace *entry; 461 struct tomoyo_policy_namespace *entry;
452 const char *cp = domainname; 462 const char *cp = domainname;
453 unsigned int len = 0; 463 unsigned int len = 0;
464
454 while (*cp && *cp++ != ' ') 465 while (*cp && *cp++ != ' ')
455 len++; 466 len++;
456 ptr = tomoyo_find_namespace(domainname, len); 467 ptr = tomoyo_find_namespace(domainname, len);
@@ -466,6 +477,7 @@ struct tomoyo_policy_namespace *tomoyo_assign_namespace(const char *domainname)
466 ptr = tomoyo_find_namespace(domainname, len); 477 ptr = tomoyo_find_namespace(domainname, len);
467 if (!ptr && tomoyo_memory_ok(entry)) { 478 if (!ptr && tomoyo_memory_ok(entry)) {
468 char *name = (char *) (entry + 1); 479 char *name = (char *) (entry + 1);
480
469 ptr = entry; 481 ptr = entry;
470 memmove(name, domainname, len); 482 memmove(name, domainname, len);
471 name[len] = '\0'; 483 name[len] = '\0';
@@ -490,6 +502,7 @@ static bool tomoyo_namespace_jump(const char *domainname)
490{ 502{
491 const char *namespace = tomoyo_current_namespace()->name; 503 const char *namespace = tomoyo_current_namespace()->name;
492 const int len = strlen(namespace); 504 const int len = strlen(namespace);
505
493 return strncmp(domainname, namespace, len) || 506 return strncmp(domainname, namespace, len) ||
494 (domainname[len] && domainname[len] != ' '); 507 (domainname[len] && domainname[len] != ' ');
495} 508}
@@ -510,6 +523,7 @@ struct tomoyo_domain_info *tomoyo_assign_domain(const char *domainname,
510 struct tomoyo_domain_info e = { }; 523 struct tomoyo_domain_info e = { };
511 struct tomoyo_domain_info *entry = tomoyo_find_domain(domainname); 524 struct tomoyo_domain_info *entry = tomoyo_find_domain(domainname);
512 bool created = false; 525 bool created = false;
526
513 if (entry) { 527 if (entry) {
514 if (transit) { 528 if (transit) {
515 /* 529 /*
@@ -546,8 +560,9 @@ struct tomoyo_domain_info *tomoyo_assign_domain(const char *domainname,
546 */ 560 */
547 if (transit) { 561 if (transit) {
548 const struct tomoyo_domain_info *domain = tomoyo_domain(); 562 const struct tomoyo_domain_info *domain = tomoyo_domain();
563
549 e.profile = domain->profile; 564 e.profile = domain->profile;
550 e.group = domain->group; 565 memcpy(e.group, domain->group, sizeof(e.group));
551 } 566 }
552 e.domainname = tomoyo_get_name(domainname); 567 e.domainname = tomoyo_get_name(domainname);
553 if (!e.domainname) 568 if (!e.domainname)
@@ -569,12 +584,17 @@ out:
569 if (entry && transit) { 584 if (entry && transit) {
570 if (created) { 585 if (created) {
571 struct tomoyo_request_info r; 586 struct tomoyo_request_info r;
587 int i;
588
572 tomoyo_init_request_info(&r, entry, 589 tomoyo_init_request_info(&r, entry,
573 TOMOYO_MAC_FILE_EXECUTE); 590 TOMOYO_MAC_FILE_EXECUTE);
574 r.granted = false; 591 r.granted = false;
575 tomoyo_write_log(&r, "use_profile %u\n", 592 tomoyo_write_log(&r, "use_profile %u\n",
576 entry->profile); 593 entry->profile);
577 tomoyo_write_log(&r, "use_group %u\n", entry->group); 594 for (i = 0; i < TOMOYO_MAX_ACL_GROUPS; i++)
595 if (test_bit(i, entry->group))
596 tomoyo_write_log(&r, "use_group %u\n",
597 i);
578 tomoyo_update_stat(TOMOYO_STAT_POLICY_UPDATES); 598 tomoyo_update_stat(TOMOYO_STAT_POLICY_UPDATES);
579 } 599 }
580 } 600 }
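
The hunk above turns entry->group from a single group number into a bitmap of ACL groups: the new domain copies the whole bitmap with memcpy(), and domain creation now emits one "use_group N" policy line per set bit. A userspace sketch of that logging loop; TOMOYO_MAX_ACL_GROUPS' value and the test_bit() helper are stand-ins, only the loop shape comes from the diff:

#include <stdio.h>

#define TOMOYO_MAX_ACL_GROUPS 256 /* assumed value for the demo */
#define BITS_PER_LONG (8 * sizeof(unsigned long))

/* Userspace stand-in for the kernel's test_bit(). */
static int test_bit(unsigned int nr, const unsigned long *addr)
{
        return (addr[nr / BITS_PER_LONG] >> (nr % BITS_PER_LONG)) & 1;
}

int main(void)
{
        unsigned long group[TOMOYO_MAX_ACL_GROUPS / BITS_PER_LONG] = { 0 };
        unsigned int i;

        group[0] |= 1UL << 0;   /* this domain uses ACL group 0 */
        group[0] |= 1UL << 5;   /* ... and ACL group 5 */

        /* One "use_group N" line per set bit, as in the new loop. */
        for (i = 0; i < TOMOYO_MAX_ACL_GROUPS; i++)
                if (test_bit(i, group))
                        printf("use_group %u\n", i);
        return 0;
}
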
@@ -712,6 +732,7 @@ retry:
712 struct tomoyo_aggregator *ptr; 732 struct tomoyo_aggregator *ptr;
713 struct list_head *list = 733 struct list_head *list =
714 &old_domain->ns->policy_list[TOMOYO_ID_AGGREGATOR]; 734 &old_domain->ns->policy_list[TOMOYO_ID_AGGREGATOR];
735
715 /* Check 'aggregator' directive. */ 736 /* Check 'aggregator' directive. */
716 candidate = &exename; 737 candidate = &exename;
717 list_for_each_entry_rcu(ptr, list, head.list) { 738 list_for_each_entry_rcu(ptr, list, head.list) {
@@ -747,6 +768,7 @@ retry:
747 */ 768 */
748 if (ee->transition) { 769 if (ee->transition) {
749 const char *domainname = ee->transition->name; 770 const char *domainname = ee->transition->name;
771
750 reject_on_transition_failure = true; 772 reject_on_transition_failure = true;
751 if (!strcmp(domainname, "keep")) 773 if (!strcmp(domainname, "keep"))
752 goto force_keep_domain; 774 goto force_keep_domain;
@@ -758,6 +780,7 @@ retry:
758 goto force_initialize_domain; 780 goto force_initialize_domain;
759 if (!strcmp(domainname, "parent")) { 781 if (!strcmp(domainname, "parent")) {
760 char *cp; 782 char *cp;
783
761 strncpy(ee->tmp, old_domain->domainname->name, 784 strncpy(ee->tmp, old_domain->domainname->name,
762 TOMOYO_EXEC_TMPSIZE - 1); 785 TOMOYO_EXEC_TMPSIZE - 1);
763 cp = strrchr(ee->tmp, ' '); 786 cp = strrchr(ee->tmp, ' ');
@@ -822,8 +845,7 @@ force_jump_domain:
822 if (domain) 845 if (domain)
823 retval = 0; 846 retval = 0;
824 else if (reject_on_transition_failure) { 847 else if (reject_on_transition_failure) {
825 printk(KERN_WARNING "ERROR: Domain '%s' not ready.\n", 848 pr_warn("ERROR: Domain '%s' not ready.\n", ee->tmp);
826 ee->tmp);
827 retval = -ENOMEM; 849 retval = -ENOMEM;
828 } else if (ee->r.mode == TOMOYO_CONFIG_ENFORCING) 850 } else if (ee->r.mode == TOMOYO_CONFIG_ENFORCING)
829 retval = -ENOMEM; 851 retval = -ENOMEM;
@@ -834,16 +856,20 @@ force_jump_domain:
834 ee->r.granted = false; 856 ee->r.granted = false;
835 tomoyo_write_log(&ee->r, "%s", tomoyo_dif 857 tomoyo_write_log(&ee->r, "%s", tomoyo_dif
836 [TOMOYO_DIF_TRANSITION_FAILED]); 858 [TOMOYO_DIF_TRANSITION_FAILED]);
837 printk(KERN_WARNING 859 pr_warn("ERROR: Domain '%s' not defined.\n", ee->tmp);
838 "ERROR: Domain '%s' not defined.\n", ee->tmp);
839 } 860 }
840 } 861 }
841 out: 862 out:
842 if (!domain) 863 if (!domain)
843 domain = old_domain; 864 domain = old_domain;
844 /* Update reference count on "struct tomoyo_domain_info". */ 865 /* Update reference count on "struct tomoyo_domain_info". */
845 atomic_inc(&domain->users); 866 {
846 bprm->cred->security = domain; 867 struct tomoyo_task *s = tomoyo_task(current);
868
869 s->old_domain_info = s->domain_info;
870 s->domain_info = domain;
871 atomic_inc(&domain->users);
872 }
847 kfree(exename.name); 873 kfree(exename.name);
848 if (!retval) { 874 if (!retval) {
849 ee->r.domain = domain; 875 ee->r.domain = domain;
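
That last hunk is the heart of the cred-to-task-blob conversion: instead of writing the new domain into bprm->cred->security, tomoyo_find_next_domain() now updates the per-task blob and parks the previous domain in old_domain_info so an abandoned execve() can be unwound later by tomoyo_cred_prepare(). A minimal userspace model of the two halves, with plain integers standing in for the kernel's atomic_t refcounts (struct and field names mirror the diff; everything else is illustrative):

#include <stdio.h>

struct domain { const char *name; int users; };

struct tomoyo_task {                    /* per-task blob, as in the diff */
        struct domain *domain_info;     /* active domain */
        struct domain *old_domain_info; /* saved across an in-flight execve() */
};

/* Model of the handoff at the end of tomoyo_find_next_domain(). */
static void switch_domain(struct tomoyo_task *s, struct domain *next)
{
        s->old_domain_info = s->domain_info;
        s->domain_info = next;
        next->users++;
}

/* Model of tomoyo_cred_prepare(): an execve() that never committed
 * its creds is rolled back to the saved domain. */
static void revert_domain(struct tomoyo_task *s)
{
        if (!s->old_domain_info)
                return;
        s->domain_info->users--;
        s->domain_info = s->old_domain_info;
        s->old_domain_info = NULL;
}

int main(void)
{
        struct domain init = { "<kernel>", 1 };
        struct domain httpd = { "<kernel> /usr/sbin/httpd", 0 };
        struct tomoyo_task s = { &init, NULL };

        switch_domain(&s, &httpd);
        printf("in execve: %s (users=%d)\n", s.domain_info->name, httpd.users);
        revert_domain(&s); /* the exec failed before committing creds */
        printf("reverted:  %s (users=%d)\n", s.domain_info->name, httpd.users);
        return 0;
}
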
diff --git a/security/tomoyo/file.c b/security/tomoyo/file.c
index 2a374b4da8f5..86f7d1b90212 100644
--- a/security/tomoyo/file.c
+++ b/security/tomoyo/file.c
@@ -214,6 +214,7 @@ static int tomoyo_audit_path_number_log(struct tomoyo_request_info *r)
214 const u8 type = r->param.path_number.operation; 214 const u8 type = r->param.path_number.operation;
215 u8 radix; 215 u8 radix;
216 char buffer[64]; 216 char buffer[64];
217
217 switch (type) { 218 switch (type) {
218 case TOMOYO_TYPE_CREATE: 219 case TOMOYO_TYPE_CREATE:
219 case TOMOYO_TYPE_MKDIR: 220 case TOMOYO_TYPE_MKDIR:
@@ -253,6 +254,7 @@ static bool tomoyo_check_path_acl(struct tomoyo_request_info *r,
253{ 254{
254 const struct tomoyo_path_acl *acl = container_of(ptr, typeof(*acl), 255 const struct tomoyo_path_acl *acl = container_of(ptr, typeof(*acl),
255 head); 256 head);
257
256 if (acl->perm & (1 << r->param.path.operation)) { 258 if (acl->perm & (1 << r->param.path.operation)) {
257 r->param.path.matched_path = 259 r->param.path.matched_path =
258 tomoyo_compare_name_union(r->param.path.filename, 260 tomoyo_compare_name_union(r->param.path.filename,
@@ -275,6 +277,7 @@ static bool tomoyo_check_path_number_acl(struct tomoyo_request_info *r,
275{ 277{
276 const struct tomoyo_path_number_acl *acl = 278 const struct tomoyo_path_number_acl *acl =
277 container_of(ptr, typeof(*acl), head); 279 container_of(ptr, typeof(*acl), head);
280
278 return (acl->perm & (1 << r->param.path_number.operation)) && 281 return (acl->perm & (1 << r->param.path_number.operation)) &&
279 tomoyo_compare_number_union(r->param.path_number.number, 282 tomoyo_compare_number_union(r->param.path_number.number,
280 &acl->number) && 283 &acl->number) &&
@@ -295,6 +298,7 @@ static bool tomoyo_check_path2_acl(struct tomoyo_request_info *r,
295{ 298{
296 const struct tomoyo_path2_acl *acl = 299 const struct tomoyo_path2_acl *acl =
297 container_of(ptr, typeof(*acl), head); 300 container_of(ptr, typeof(*acl), head);
301
298 return (acl->perm & (1 << r->param.path2.operation)) && 302 return (acl->perm & (1 << r->param.path2.operation)) &&
299 tomoyo_compare_name_union(r->param.path2.filename1, &acl->name1) 303 tomoyo_compare_name_union(r->param.path2.filename1, &acl->name1)
300 && tomoyo_compare_name_union(r->param.path2.filename2, 304 && tomoyo_compare_name_union(r->param.path2.filename2,
@@ -314,6 +318,7 @@ static bool tomoyo_check_mkdev_acl(struct tomoyo_request_info *r,
314{ 318{
315 const struct tomoyo_mkdev_acl *acl = 319 const struct tomoyo_mkdev_acl *acl =
316 container_of(ptr, typeof(*acl), head); 320 container_of(ptr, typeof(*acl), head);
321
317 return (acl->perm & (1 << r->param.mkdev.operation)) && 322 return (acl->perm & (1 << r->param.mkdev.operation)) &&
318 tomoyo_compare_number_union(r->param.mkdev.mode, 323 tomoyo_compare_number_union(r->param.mkdev.mode,
319 &acl->mode) && 324 &acl->mode) &&
@@ -338,6 +343,7 @@ static bool tomoyo_same_path_acl(const struct tomoyo_acl_info *a,
338{ 343{
339 const struct tomoyo_path_acl *p1 = container_of(a, typeof(*p1), head); 344 const struct tomoyo_path_acl *p1 = container_of(a, typeof(*p1), head);
340 const struct tomoyo_path_acl *p2 = container_of(b, typeof(*p2), head); 345 const struct tomoyo_path_acl *p2 = container_of(b, typeof(*p2), head);
346
341 return tomoyo_same_name_union(&p1->name, &p2->name); 347 return tomoyo_same_name_union(&p1->name, &p2->name);
342} 348}
343 349
@@ -358,6 +364,7 @@ static bool tomoyo_merge_path_acl(struct tomoyo_acl_info *a,
358 ->perm; 364 ->perm;
359 u16 perm = *a_perm; 365 u16 perm = *a_perm;
360 const u16 b_perm = container_of(b, struct tomoyo_path_acl, head)->perm; 366 const u16 b_perm = container_of(b, struct tomoyo_path_acl, head)->perm;
367
361 if (is_delete) 368 if (is_delete)
362 perm &= ~b_perm; 369 perm &= ~b_perm;
363 else 370 else
@@ -384,6 +391,7 @@ static int tomoyo_update_path_acl(const u16 perm,
384 .perm = perm 391 .perm = perm
385 }; 392 };
386 int error; 393 int error;
394
387 if (!tomoyo_parse_name_union(param, &e.name)) 395 if (!tomoyo_parse_name_union(param, &e.name))
388 error = -EINVAL; 396 error = -EINVAL;
389 else 397 else
@@ -407,6 +415,7 @@ static bool tomoyo_same_mkdev_acl(const struct tomoyo_acl_info *a,
407{ 415{
408 const struct tomoyo_mkdev_acl *p1 = container_of(a, typeof(*p1), head); 416 const struct tomoyo_mkdev_acl *p1 = container_of(a, typeof(*p1), head);
409 const struct tomoyo_mkdev_acl *p2 = container_of(b, typeof(*p2), head); 417 const struct tomoyo_mkdev_acl *p2 = container_of(b, typeof(*p2), head);
418
410 return tomoyo_same_name_union(&p1->name, &p2->name) && 419 return tomoyo_same_name_union(&p1->name, &p2->name) &&
411 tomoyo_same_number_union(&p1->mode, &p2->mode) && 420 tomoyo_same_number_union(&p1->mode, &p2->mode) &&
412 tomoyo_same_number_union(&p1->major, &p2->major) && 421 tomoyo_same_number_union(&p1->major, &p2->major) &&
@@ -431,6 +440,7 @@ static bool tomoyo_merge_mkdev_acl(struct tomoyo_acl_info *a,
431 u8 perm = *a_perm; 440 u8 perm = *a_perm;
432 const u8 b_perm = container_of(b, struct tomoyo_mkdev_acl, head) 441 const u8 b_perm = container_of(b, struct tomoyo_mkdev_acl, head)
433 ->perm; 442 ->perm;
443
434 if (is_delete) 444 if (is_delete)
435 perm &= ~b_perm; 445 perm &= ~b_perm;
436 else 446 else
@@ -457,6 +467,7 @@ static int tomoyo_update_mkdev_acl(const u8 perm,
457 .perm = perm 467 .perm = perm
458 }; 468 };
459 int error; 469 int error;
470
460 if (!tomoyo_parse_name_union(param, &e.name) || 471 if (!tomoyo_parse_name_union(param, &e.name) ||
461 !tomoyo_parse_number_union(param, &e.mode) || 472 !tomoyo_parse_number_union(param, &e.mode) ||
462 !tomoyo_parse_number_union(param, &e.major) || 473 !tomoyo_parse_number_union(param, &e.major) ||
@@ -486,6 +497,7 @@ static bool tomoyo_same_path2_acl(const struct tomoyo_acl_info *a,
486{ 497{
487 const struct tomoyo_path2_acl *p1 = container_of(a, typeof(*p1), head); 498 const struct tomoyo_path2_acl *p1 = container_of(a, typeof(*p1), head);
488 const struct tomoyo_path2_acl *p2 = container_of(b, typeof(*p2), head); 499 const struct tomoyo_path2_acl *p2 = container_of(b, typeof(*p2), head);
500
489 return tomoyo_same_name_union(&p1->name1, &p2->name1) && 501 return tomoyo_same_name_union(&p1->name1, &p2->name1) &&
490 tomoyo_same_name_union(&p1->name2, &p2->name2); 502 tomoyo_same_name_union(&p1->name2, &p2->name2);
491} 503}
@@ -507,6 +519,7 @@ static bool tomoyo_merge_path2_acl(struct tomoyo_acl_info *a,
507 ->perm; 519 ->perm;
508 u8 perm = *a_perm; 520 u8 perm = *a_perm;
509 const u8 b_perm = container_of(b, struct tomoyo_path2_acl, head)->perm; 521 const u8 b_perm = container_of(b, struct tomoyo_path2_acl, head)->perm;
522
510 if (is_delete) 523 if (is_delete)
511 perm &= ~b_perm; 524 perm &= ~b_perm;
512 else 525 else
@@ -533,6 +546,7 @@ static int tomoyo_update_path2_acl(const u8 perm,
533 .perm = perm 546 .perm = perm
534 }; 547 };
535 int error; 548 int error;
549
536 if (!tomoyo_parse_name_union(param, &e.name1) || 550 if (!tomoyo_parse_name_union(param, &e.name1) ||
537 !tomoyo_parse_name_union(param, &e.name2)) 551 !tomoyo_parse_name_union(param, &e.name2))
538 error = -EINVAL; 552 error = -EINVAL;
@@ -621,6 +635,7 @@ static bool tomoyo_same_path_number_acl(const struct tomoyo_acl_info *a,
621 head); 635 head);
622 const struct tomoyo_path_number_acl *p2 = container_of(b, typeof(*p2), 636 const struct tomoyo_path_number_acl *p2 = container_of(b, typeof(*p2),
623 head); 637 head);
638
624 return tomoyo_same_name_union(&p1->name, &p2->name) && 639 return tomoyo_same_name_union(&p1->name, &p2->name) &&
625 tomoyo_same_number_union(&p1->number, &p2->number); 640 tomoyo_same_number_union(&p1->number, &p2->number);
626} 641}
@@ -643,6 +658,7 @@ static bool tomoyo_merge_path_number_acl(struct tomoyo_acl_info *a,
643 u8 perm = *a_perm; 658 u8 perm = *a_perm;
644 const u8 b_perm = container_of(b, struct tomoyo_path_number_acl, head) 659 const u8 b_perm = container_of(b, struct tomoyo_path_number_acl, head)
645 ->perm; 660 ->perm;
661
646 if (is_delete) 662 if (is_delete)
647 perm &= ~b_perm; 663 perm &= ~b_perm;
648 else 664 else
@@ -667,6 +683,7 @@ static int tomoyo_update_path_number_acl(const u8 perm,
667 .perm = perm 683 .perm = perm
668 }; 684 };
669 int error; 685 int error;
686
670 if (!tomoyo_parse_name_union(param, &e.name) || 687 if (!tomoyo_parse_name_union(param, &e.name) ||
671 !tomoyo_parse_number_union(param, &e.number)) 688 !tomoyo_parse_number_union(param, &e.number))
672 error = -EINVAL; 689 error = -EINVAL;
@@ -947,6 +964,7 @@ static bool tomoyo_same_mount_acl(const struct tomoyo_acl_info *a,
947{ 964{
948 const struct tomoyo_mount_acl *p1 = container_of(a, typeof(*p1), head); 965 const struct tomoyo_mount_acl *p1 = container_of(a, typeof(*p1), head);
949 const struct tomoyo_mount_acl *p2 = container_of(b, typeof(*p2), head); 966 const struct tomoyo_mount_acl *p2 = container_of(b, typeof(*p2), head);
967
950 return tomoyo_same_name_union(&p1->dev_name, &p2->dev_name) && 968 return tomoyo_same_name_union(&p1->dev_name, &p2->dev_name) &&
951 tomoyo_same_name_union(&p1->dir_name, &p2->dir_name) && 969 tomoyo_same_name_union(&p1->dir_name, &p2->dir_name) &&
952 tomoyo_same_name_union(&p1->fs_type, &p2->fs_type) && 970 tomoyo_same_name_union(&p1->fs_type, &p2->fs_type) &&
@@ -966,6 +984,7 @@ static int tomoyo_update_mount_acl(struct tomoyo_acl_param *param)
966{ 984{
967 struct tomoyo_mount_acl e = { .head.type = TOMOYO_TYPE_MOUNT_ACL }; 985 struct tomoyo_mount_acl e = { .head.type = TOMOYO_TYPE_MOUNT_ACL };
968 int error; 986 int error;
987
969 if (!tomoyo_parse_name_union(param, &e.dev_name) || 988 if (!tomoyo_parse_name_union(param, &e.dev_name) ||
970 !tomoyo_parse_name_union(param, &e.dir_name) || 989 !tomoyo_parse_name_union(param, &e.dir_name) ||
971 !tomoyo_parse_name_union(param, &e.fs_type) || 990 !tomoyo_parse_name_union(param, &e.fs_type) ||
@@ -995,6 +1014,7 @@ int tomoyo_write_file(struct tomoyo_acl_param *param)
995 u16 perm = 0; 1014 u16 perm = 0;
996 u8 type; 1015 u8 type;
997 const char *operation = tomoyo_read_token(param); 1016 const char *operation = tomoyo_read_token(param);
1017
998 for (type = 0; type < TOMOYO_MAX_PATH_OPERATION; type++) 1018 for (type = 0; type < TOMOYO_MAX_PATH_OPERATION; type++)
999 if (tomoyo_permstr(operation, tomoyo_path_keyword[type])) 1019 if (tomoyo_permstr(operation, tomoyo_path_keyword[type]))
1000 perm |= 1 << type; 1020 perm |= 1 << type;
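
Every tomoyo_merge_*_acl() hunk in file.c follows the same bitmask rule: a delete request clears the named permission bits, an add ORs them in, and the ACL entry itself is reclaimed once no bits remain. A self-contained sketch of that rule (the bit values here are placeholders, not the real TOMOYO_TYPE_* encoding):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Shared merge rule of the tomoyo_merge_*_acl() helpers: the return
 * value tells the caller whether the entry is now empty and should be
 * marked deleted. */
static bool merge_perm(uint16_t *a_perm, uint16_t b_perm, bool is_delete)
{
        uint16_t perm = *a_perm;

        if (is_delete)
                perm &= ~b_perm;
        else
                perm |= b_perm;
        *a_perm = perm;
        return !perm;
}

int main(void)
{
        uint16_t perm = 0;

        merge_perm(&perm, 1 << 0, false);       /* grant bit 0 */
        merge_perm(&perm, 1 << 1, false);       /* grant bit 1 */
        printf("perm = 0x%x\n", perm);          /* 0x3 */
        printf("empty after delete: %d\n",
               merge_perm(&perm, 0x3, true));   /* 1: entry can be freed */
        return 0;
}
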
diff --git a/security/tomoyo/gc.c b/security/tomoyo/gc.c
index e22bea811c57..9537832fca18 100644
--- a/security/tomoyo/gc.c
+++ b/security/tomoyo/gc.c
@@ -77,11 +77,13 @@ static bool tomoyo_name_used_by_io_buffer(const char *string)
77 spin_lock(&tomoyo_io_buffer_list_lock); 77 spin_lock(&tomoyo_io_buffer_list_lock);
78 list_for_each_entry(head, &tomoyo_io_buffer_list, list) { 78 list_for_each_entry(head, &tomoyo_io_buffer_list, list) {
79 int i; 79 int i;
80
80 head->users++; 81 head->users++;
81 spin_unlock(&tomoyo_io_buffer_list_lock); 82 spin_unlock(&tomoyo_io_buffer_list_lock);
82 mutex_lock(&head->io_sem); 83 mutex_lock(&head->io_sem);
83 for (i = 0; i < TOMOYO_MAX_IO_READ_QUEUE; i++) { 84 for (i = 0; i < TOMOYO_MAX_IO_READ_QUEUE; i++) {
84 const char *w = head->r.w[i]; 85 const char *w = head->r.w[i];
86
85 if (w < string || w > string + size) 87 if (w < string || w > string + size)
86 continue; 88 continue;
87 in_use = true; 89 in_use = true;
@@ -108,6 +110,7 @@ static inline void tomoyo_del_transition_control(struct list_head *element)
108{ 110{
109 struct tomoyo_transition_control *ptr = 111 struct tomoyo_transition_control *ptr =
110 container_of(element, typeof(*ptr), head.list); 112 container_of(element, typeof(*ptr), head.list);
113
111 tomoyo_put_name(ptr->domainname); 114 tomoyo_put_name(ptr->domainname);
112 tomoyo_put_name(ptr->program); 115 tomoyo_put_name(ptr->program);
113} 116}
@@ -123,6 +126,7 @@ static inline void tomoyo_del_aggregator(struct list_head *element)
123{ 126{
124 struct tomoyo_aggregator *ptr = 127 struct tomoyo_aggregator *ptr =
125 container_of(element, typeof(*ptr), head.list); 128 container_of(element, typeof(*ptr), head.list);
129
126 tomoyo_put_name(ptr->original_name); 130 tomoyo_put_name(ptr->original_name);
127 tomoyo_put_name(ptr->aggregated_name); 131 tomoyo_put_name(ptr->aggregated_name);
128} 132}
@@ -138,6 +142,7 @@ static inline void tomoyo_del_manager(struct list_head *element)
138{ 142{
139 struct tomoyo_manager *ptr = 143 struct tomoyo_manager *ptr =
140 container_of(element, typeof(*ptr), head.list); 144 container_of(element, typeof(*ptr), head.list);
145
141 tomoyo_put_name(ptr->manager); 146 tomoyo_put_name(ptr->manager);
142} 147}
143 148
@@ -152,6 +157,7 @@ static void tomoyo_del_acl(struct list_head *element)
152{ 157{
153 struct tomoyo_acl_info *acl = 158 struct tomoyo_acl_info *acl =
154 container_of(element, typeof(*acl), list); 159 container_of(element, typeof(*acl), list);
160
155 tomoyo_put_condition(acl->cond); 161 tomoyo_put_condition(acl->cond);
156 switch (acl->type) { 162 switch (acl->type) {
157 case TOMOYO_TYPE_PATH_ACL: 163 case TOMOYO_TYPE_PATH_ACL:
@@ -226,6 +232,7 @@ static void tomoyo_del_acl(struct list_head *element)
226 { 232 {
227 struct tomoyo_task_acl *entry = 233 struct tomoyo_task_acl *entry =
228 container_of(acl, typeof(*entry), head); 234 container_of(acl, typeof(*entry), head);
235
229 tomoyo_put_name(entry->domainname); 236 tomoyo_put_name(entry->domainname);
230 } 237 }
231 break; 238 break;
@@ -247,6 +254,7 @@ static inline void tomoyo_del_domain(struct list_head *element)
247 container_of(element, typeof(*domain), list); 254 container_of(element, typeof(*domain), list);
248 struct tomoyo_acl_info *acl; 255 struct tomoyo_acl_info *acl;
249 struct tomoyo_acl_info *tmp; 256 struct tomoyo_acl_info *tmp;
257
250 /* 258 /*
251 * Since this domain is referenced from neither 259 * Since this domain is referenced from neither
252 * "struct tomoyo_io_buffer" nor "struct cred"->security, we can delete 260 * "struct tomoyo_io_buffer" nor "struct cred"->security, we can delete
@@ -286,6 +294,7 @@ void tomoyo_del_condition(struct list_head *element)
286 = (const struct tomoyo_argv *) (names_p + names_count); 294 = (const struct tomoyo_argv *) (names_p + names_count);
287 const struct tomoyo_envp *envp 295 const struct tomoyo_envp *envp
288 = (const struct tomoyo_envp *) (argv + argc); 296 = (const struct tomoyo_envp *) (argv + argc);
297
289 for (i = 0; i < numbers_count; i++) 298 for (i = 0; i < numbers_count; i++)
290 tomoyo_put_number_union(numbers_p++); 299 tomoyo_put_number_union(numbers_p++);
291 for (i = 0; i < names_count; i++) 300 for (i = 0; i < names_count; i++)
@@ -321,6 +330,7 @@ static inline void tomoyo_del_path_group(struct list_head *element)
321{ 330{
322 struct tomoyo_path_group *member = 331 struct tomoyo_path_group *member =
323 container_of(element, typeof(*member), head.list); 332 container_of(element, typeof(*member), head.list);
333
324 tomoyo_put_name(member->member_name); 334 tomoyo_put_name(member->member_name);
325} 335}
326 336
@@ -335,6 +345,7 @@ static inline void tomoyo_del_group(struct list_head *element)
335{ 345{
336 struct tomoyo_group *group = 346 struct tomoyo_group *group =
337 container_of(element, typeof(*group), head.list); 347 container_of(element, typeof(*group), head.list);
348
338 tomoyo_put_name(group->group_name); 349 tomoyo_put_name(group->group_name);
339} 350}
340 351
@@ -476,6 +487,7 @@ static void tomoyo_collect_member(const enum tomoyo_policy_id id,
476{ 487{
477 struct tomoyo_acl_head *member; 488 struct tomoyo_acl_head *member;
478 struct tomoyo_acl_head *tmp; 489 struct tomoyo_acl_head *tmp;
490
479 list_for_each_entry_safe(member, tmp, member_list, list) { 491 list_for_each_entry_safe(member, tmp, member_list, list) {
480 if (!member->is_deleted) 492 if (!member->is_deleted)
481 continue; 493 continue;
@@ -495,6 +507,7 @@ static void tomoyo_collect_acl(struct list_head *list)
495{ 507{
496 struct tomoyo_acl_info *acl; 508 struct tomoyo_acl_info *acl;
497 struct tomoyo_acl_info *tmp; 509 struct tomoyo_acl_info *tmp;
510
498 list_for_each_entry_safe(acl, tmp, list, list) { 511 list_for_each_entry_safe(acl, tmp, list, list) {
499 if (!acl->is_deleted) 512 if (!acl->is_deleted)
500 continue; 513 continue;
@@ -513,10 +526,12 @@ static void tomoyo_collect_entry(void)
513 int i; 526 int i;
514 enum tomoyo_policy_id id; 527 enum tomoyo_policy_id id;
515 struct tomoyo_policy_namespace *ns; 528 struct tomoyo_policy_namespace *ns;
529
516 mutex_lock(&tomoyo_policy_lock); 530 mutex_lock(&tomoyo_policy_lock);
517 { 531 {
518 struct tomoyo_domain_info *domain; 532 struct tomoyo_domain_info *domain;
519 struct tomoyo_domain_info *tmp; 533 struct tomoyo_domain_info *tmp;
534
520 list_for_each_entry_safe(domain, tmp, &tomoyo_domain_list, 535 list_for_each_entry_safe(domain, tmp, &tomoyo_domain_list,
521 list) { 536 list) {
522 tomoyo_collect_acl(&domain->acl_info_list); 537 tomoyo_collect_acl(&domain->acl_info_list);
@@ -534,6 +549,7 @@ static void tomoyo_collect_entry(void)
534 { 549 {
535 struct tomoyo_shared_acl_head *ptr; 550 struct tomoyo_shared_acl_head *ptr;
536 struct tomoyo_shared_acl_head *tmp; 551 struct tomoyo_shared_acl_head *tmp;
552
537 list_for_each_entry_safe(ptr, tmp, &tomoyo_condition_list, 553 list_for_each_entry_safe(ptr, tmp, &tomoyo_condition_list,
538 list) { 554 list) {
539 if (atomic_read(&ptr->users) > 0) 555 if (atomic_read(&ptr->users) > 0)
@@ -547,6 +563,7 @@ static void tomoyo_collect_entry(void)
547 struct list_head *list = &ns->group_list[i]; 563 struct list_head *list = &ns->group_list[i];
548 struct tomoyo_group *group; 564 struct tomoyo_group *group;
549 struct tomoyo_group *tmp; 565 struct tomoyo_group *tmp;
566
550 switch (i) { 567 switch (i) {
551 case 0: 568 case 0:
552 id = TOMOYO_ID_PATH_GROUP; 569 id = TOMOYO_ID_PATH_GROUP;
@@ -574,6 +591,7 @@ static void tomoyo_collect_entry(void)
574 struct list_head *list = &tomoyo_name_list[i]; 591 struct list_head *list = &tomoyo_name_list[i];
575 struct tomoyo_shared_acl_head *ptr; 592 struct tomoyo_shared_acl_head *ptr;
576 struct tomoyo_shared_acl_head *tmp; 593 struct tomoyo_shared_acl_head *tmp;
594
577 list_for_each_entry_safe(ptr, tmp, list, list) { 595 list_for_each_entry_safe(ptr, tmp, list, list) {
578 if (atomic_read(&ptr->users) > 0) 596 if (atomic_read(&ptr->users) > 0)
579 continue; 597 continue;
@@ -595,6 +613,7 @@ static int tomoyo_gc_thread(void *unused)
595{ 613{
596 /* Garbage collector thread is exclusive. */ 614 /* Garbage collector thread is exclusive. */
597 static DEFINE_MUTEX(tomoyo_gc_mutex); 615 static DEFINE_MUTEX(tomoyo_gc_mutex);
616
598 if (!mutex_trylock(&tomoyo_gc_mutex)) 617 if (!mutex_trylock(&tomoyo_gc_mutex))
599 goto out; 618 goto out;
600 tomoyo_collect_entry(); 619 tomoyo_collect_entry();
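
The tomoyo_gc_thread() hunk keeps the collector exclusive with a static mutex and mutex_trylock(): a second collector that finds the lock held simply exits instead of queueing behind it. A pthread model of the same pattern:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t gc_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Model of tomoyo_gc_thread(): only one collector runs at a time;
 * latecomers bail out rather than blocking. */
static void *gc_thread(void *name)
{
        if (pthread_mutex_trylock(&gc_mutex) != 0) {
                printf("%s: collector already running, exiting\n",
                       (const char *)name);
                return NULL;
        }
        printf("%s: collecting deleted entries\n", (const char *)name);
        pthread_mutex_unlock(&gc_mutex);
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, gc_thread, "gc-1");
        pthread_create(&b, NULL, gc_thread, "gc-2");
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        return 0;
}
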
diff --git a/security/tomoyo/group.c b/security/tomoyo/group.c
index 21b0cc3a7e1a..a37c7dc66e44 100644
--- a/security/tomoyo/group.c
+++ b/security/tomoyo/group.c
@@ -75,11 +75,13 @@ int tomoyo_write_group(struct tomoyo_acl_param *param, const u8 type)
75{ 75{
76 struct tomoyo_group *group = tomoyo_get_group(param, type); 76 struct tomoyo_group *group = tomoyo_get_group(param, type);
77 int error = -EINVAL; 77 int error = -EINVAL;
78
78 if (!group) 79 if (!group)
79 return -ENOMEM; 80 return -ENOMEM;
80 param->list = &group->member_list; 81 param->list = &group->member_list;
81 if (type == TOMOYO_PATH_GROUP) { 82 if (type == TOMOYO_PATH_GROUP) {
82 struct tomoyo_path_group e = { }; 83 struct tomoyo_path_group e = { };
84
83 e.member_name = tomoyo_get_name(tomoyo_read_token(param)); 85 e.member_name = tomoyo_get_name(tomoyo_read_token(param));
84 if (!e.member_name) { 86 if (!e.member_name) {
85 error = -ENOMEM; 87 error = -ENOMEM;
@@ -90,6 +92,7 @@ int tomoyo_write_group(struct tomoyo_acl_param *param, const u8 type)
90 tomoyo_put_name(e.member_name); 92 tomoyo_put_name(e.member_name);
91 } else if (type == TOMOYO_NUMBER_GROUP) { 93 } else if (type == TOMOYO_NUMBER_GROUP) {
92 struct tomoyo_number_group e = { }; 94 struct tomoyo_number_group e = { };
95
93 if (param->data[0] == '@' || 96 if (param->data[0] == '@' ||
94 !tomoyo_parse_number_union(param, &e.number)) 97 !tomoyo_parse_number_union(param, &e.number))
95 goto out; 98 goto out;
@@ -129,6 +132,7 @@ tomoyo_path_matches_group(const struct tomoyo_path_info *pathname,
129 const struct tomoyo_group *group) 132 const struct tomoyo_group *group)
130{ 133{
131 struct tomoyo_path_group *member; 134 struct tomoyo_path_group *member;
135
132 list_for_each_entry_rcu(member, &group->member_list, head.list) { 136 list_for_each_entry_rcu(member, &group->member_list, head.list) {
133 if (member->head.is_deleted) 137 if (member->head.is_deleted)
134 continue; 138 continue;
@@ -156,6 +160,7 @@ bool tomoyo_number_matches_group(const unsigned long min,
156{ 160{
157 struct tomoyo_number_group *member; 161 struct tomoyo_number_group *member;
158 bool matched = false; 162 bool matched = false;
163
159 list_for_each_entry_rcu(member, &group->member_list, head.list) { 164 list_for_each_entry_rcu(member, &group->member_list, head.list) {
160 if (member->head.is_deleted) 165 if (member->head.is_deleted)
161 continue; 166 continue;
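
tomoyo_number_matches_group() above walks the member list under RCU, skips deleted entries, and reports a match as soon as the requested range touches a live member range. A plain-array model of that walk; the overlap semantics are read off the kernel's value-pair comparison and should be treated as an assumption:

#include <stdbool.h>
#include <stdio.h>

struct number_member { unsigned long lo, hi; bool is_deleted; };

/* Model of tomoyo_number_matches_group(): match when [min,max]
 * intersects a live member range. */
static bool number_matches_group(unsigned long min, unsigned long max,
                                 const struct number_member *m, int n)
{
        int i;

        for (i = 0; i < n; i++) {
                if (m[i].is_deleted)
                        continue;
                if (min > m[i].hi || max < m[i].lo)
                        continue;
                return true;
        }
        return false;
}

int main(void)
{
        const struct number_member grp[] = {
                { 0, 99, true },        /* deleted: must be skipped */
                { 1000, 2000, false },
        };

        printf("%d\n", number_matches_group(1500, 1600, grp, 2)); /* 1 */
        printf("%d\n", number_matches_group(0, 10, grp, 2));      /* 0 */
        return 0;
}
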
diff --git a/security/tomoyo/load_policy.c b/security/tomoyo/load_policy.c
index 81b951652051..3445ae6fd479 100644
--- a/security/tomoyo/load_policy.c
+++ b/security/tomoyo/load_policy.c
@@ -37,11 +37,12 @@ __setup("TOMOYO_loader=", tomoyo_loader_setup);
37static bool tomoyo_policy_loader_exists(void) 37static bool tomoyo_policy_loader_exists(void)
38{ 38{
39 struct path path; 39 struct path path;
40
40 if (!tomoyo_loader) 41 if (!tomoyo_loader)
41 tomoyo_loader = CONFIG_SECURITY_TOMOYO_POLICY_LOADER; 42 tomoyo_loader = CONFIG_SECURITY_TOMOYO_POLICY_LOADER;
42 if (kern_path(tomoyo_loader, LOOKUP_FOLLOW, &path)) { 43 if (kern_path(tomoyo_loader, LOOKUP_FOLLOW, &path)) {
43 printk(KERN_INFO "Not activating Mandatory Access Control " 44 pr_info("Not activating Mandatory Access Control as %s does not exist.\n",
44 "as %s does not exist.\n", tomoyo_loader); 45 tomoyo_loader);
45 return false; 46 return false;
46 } 47 }
47 path_put(&path); 48 path_put(&path);
@@ -96,8 +97,7 @@ void tomoyo_load_policy(const char *filename)
96 if (!tomoyo_policy_loader_exists()) 97 if (!tomoyo_policy_loader_exists())
97 return; 98 return;
98 done = true; 99 done = true;
99 printk(KERN_INFO "Calling %s to load policy. Please wait.\n", 100 pr_info("Calling %s to load policy. Please wait.\n", tomoyo_loader);
100 tomoyo_loader);
101 argv[0] = (char *) tomoyo_loader; 101 argv[0] = (char *) tomoyo_loader;
102 argv[1] = NULL; 102 argv[1] = NULL;
103 envp[0] = "HOME=/"; 103 envp[0] = "HOME=/";
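
The message folding in this file (and in the printk conversions above) works because pr_info()/pr_warn() carry their log level themselves. A userspace stand-in showing the rough shape of the macro; this is simplified, and the real kernel definition also routes the format through pr_fmt():

#include <stdio.h>

#define KERN_INFO "<6>"                 /* kernel log-level prefix */
#define printk(...) printf(__VA_ARGS__) /* stand-in for the demo */
#define pr_info(fmt, ...) printk(KERN_INFO fmt, ##__VA_ARGS__)

int main(void)
{
        const char *tomoyo_loader = "/sbin/tomoyo-init";

        pr_info("Calling %s to load policy. Please wait.\n", tomoyo_loader);
        return 0;
}
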
diff --git a/security/tomoyo/memory.c b/security/tomoyo/memory.c
index 12477e0b0a11..2e7fcfa923c9 100644
--- a/security/tomoyo/memory.c
+++ b/security/tomoyo/memory.c
@@ -19,9 +19,9 @@ void tomoyo_warn_oom(const char *function)
19 /* Reduce error messages. */ 19 /* Reduce error messages. */
20 static pid_t tomoyo_last_pid; 20 static pid_t tomoyo_last_pid;
21 const pid_t pid = current->pid; 21 const pid_t pid = current->pid;
22
22 if (tomoyo_last_pid != pid) { 23 if (tomoyo_last_pid != pid) {
23 printk(KERN_WARNING "ERROR: Out of memory at %s.\n", 24 pr_warn("ERROR: Out of memory at %s.\n", function);
24 function);
25 tomoyo_last_pid = pid; 25 tomoyo_last_pid = pid;
26 } 26 }
27 if (!tomoyo_policy_loaded) 27 if (!tomoyo_policy_loaded)
@@ -48,6 +48,7 @@ bool tomoyo_memory_ok(void *ptr)
48{ 48{
49 if (ptr) { 49 if (ptr) {
50 const size_t s = ksize(ptr); 50 const size_t s = ksize(ptr);
51
51 tomoyo_memory_used[TOMOYO_MEMORY_POLICY] += s; 52 tomoyo_memory_used[TOMOYO_MEMORY_POLICY] += s;
52 if (!tomoyo_memory_quota[TOMOYO_MEMORY_POLICY] || 53 if (!tomoyo_memory_quota[TOMOYO_MEMORY_POLICY] ||
53 tomoyo_memory_used[TOMOYO_MEMORY_POLICY] <= 54 tomoyo_memory_used[TOMOYO_MEMORY_POLICY] <=
@@ -73,6 +74,7 @@ bool tomoyo_memory_ok(void *ptr)
73void *tomoyo_commit_ok(void *data, const unsigned int size) 74void *tomoyo_commit_ok(void *data, const unsigned int size)
74{ 75{
75 void *ptr = kzalloc(size, GFP_NOFS); 76 void *ptr = kzalloc(size, GFP_NOFS);
77
76 if (tomoyo_memory_ok(ptr)) { 78 if (tomoyo_memory_ok(ptr)) {
77 memmove(ptr, data, size); 79 memmove(ptr, data, size);
78 memset(data, 0, size); 80 memset(data, 0, size);
@@ -98,6 +100,7 @@ struct tomoyo_group *tomoyo_get_group(struct tomoyo_acl_param *param,
98 struct list_head *list; 100 struct list_head *list;
99 const char *group_name = tomoyo_read_token(param); 101 const char *group_name = tomoyo_read_token(param);
100 bool found = false; 102 bool found = false;
103
101 if (!tomoyo_correct_word(group_name) || idx >= TOMOYO_MAX_GROUP) 104 if (!tomoyo_correct_word(group_name) || idx >= TOMOYO_MAX_GROUP)
102 return NULL; 105 return NULL;
103 e.group_name = tomoyo_get_name(group_name); 106 e.group_name = tomoyo_get_name(group_name);
@@ -116,6 +119,7 @@ struct tomoyo_group *tomoyo_get_group(struct tomoyo_acl_param *param,
116 } 119 }
117 if (!found) { 120 if (!found) {
118 struct tomoyo_group *entry = tomoyo_commit_ok(&e, sizeof(e)); 121 struct tomoyo_group *entry = tomoyo_commit_ok(&e, sizeof(e));
122
119 if (entry) { 123 if (entry) {
120 INIT_LIST_HEAD(&entry->member_list); 124 INIT_LIST_HEAD(&entry->member_list);
121 atomic_set(&entry->head.users, 1); 125 atomic_set(&entry->head.users, 1);
@@ -191,6 +195,7 @@ struct tomoyo_policy_namespace tomoyo_kernel_namespace;
191void __init tomoyo_mm_init(void) 195void __init tomoyo_mm_init(void)
192{ 196{
193 int idx; 197 int idx;
198
194 for (idx = 0; idx < TOMOYO_MAX_HASH; idx++) 199 for (idx = 0; idx < TOMOYO_MAX_HASH; idx++)
195 INIT_LIST_HEAD(&tomoyo_name_list[idx]); 200 INIT_LIST_HEAD(&tomoyo_name_list[idx]);
196 tomoyo_kernel_namespace.name = "<kernel>"; 201 tomoyo_kernel_namespace.name = "<kernel>";
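
tomoyo_memory_ok() charges each allocation's real size against the policy-memory quota and rolls the charge back when it would bust the quota. A userspace model of that accounting; the quota value is made up, and the explicit size parameter plays the role of the kernel's ksize():

#include <stdbool.h>
#include <stdio.h>

static size_t memory_used;          /* tomoyo_memory_used[...] stand-in */
static size_t memory_quota = 4096;  /* demo value; 0 means "no limit" */

/* Model of tomoyo_memory_ok(): charge s bytes, undo the charge if the
 * quota would be exceeded. */
static bool memory_ok(void *ptr, size_t s)
{
        if (ptr) {
                memory_used += s;
                if (!memory_quota || memory_used <= memory_quota)
                        return true;
                memory_used -= s;
        }
        return false; /* the kernel also calls tomoyo_warn_oom() here */
}

int main(void)
{
        char a[64], b[64];

        printf("first 4096: %d\n", memory_ok(a, 4096)); /* 1 */
        printf("one more:   %d\n", memory_ok(b, 1));    /* 0: over quota */
        printf("used=%zu\n", memory_used);              /* charge rolled back */
        return 0;
}
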
diff --git a/security/tomoyo/mount.c b/security/tomoyo/mount.c
index 7dc7f59b7dde..2755971f50df 100644
--- a/security/tomoyo/mount.c
+++ b/security/tomoyo/mount.c
@@ -49,6 +49,7 @@ static bool tomoyo_check_mount_acl(struct tomoyo_request_info *r,
49{ 49{
50 const struct tomoyo_mount_acl *acl = 50 const struct tomoyo_mount_acl *acl =
51 container_of(ptr, typeof(*acl), head); 51 container_of(ptr, typeof(*acl), head);
52
52 return tomoyo_compare_number_union(r->param.mount.flags, 53 return tomoyo_compare_number_union(r->param.mount.flags,
53 &acl->flags) && 54 &acl->flags) &&
54 tomoyo_compare_name_union(r->param.mount.type, 55 tomoyo_compare_name_union(r->param.mount.type,
@@ -89,6 +90,7 @@ static int tomoyo_mount_acl(struct tomoyo_request_info *r,
89 struct tomoyo_path_info rdir; 90 struct tomoyo_path_info rdir;
90 int need_dev = 0; 91 int need_dev = 0;
91 int error = -ENOMEM; 92 int error = -ENOMEM;
93
92 r->obj = &obj; 94 r->obj = &obj;
93 95
94 /* Get fstype. */ 96 /* Get fstype. */
diff --git a/security/tomoyo/realpath.c b/security/tomoyo/realpath.c
index 6ff8c21e4fff..85e6e31dd1e5 100644
--- a/security/tomoyo/realpath.c
+++ b/security/tomoyo/realpath.c
@@ -94,11 +94,13 @@ static char *tomoyo_get_absolute_path(const struct path *path, char * const buff
94 const int buflen) 94 const int buflen)
95{ 95{
96 char *pos = ERR_PTR(-ENOMEM); 96 char *pos = ERR_PTR(-ENOMEM);
97
97 if (buflen >= 256) { 98 if (buflen >= 256) {
98 /* go to whatever namespace root we are under */ 99 /* go to whatever namespace root we are under */
99 pos = d_absolute_path(path, buffer, buflen - 1); 100 pos = d_absolute_path(path, buffer, buflen - 1);
100 if (!IS_ERR(pos) && *pos == '/' && pos[1]) { 101 if (!IS_ERR(pos) && *pos == '/' && pos[1]) {
101 struct inode *inode = d_backing_inode(path->dentry); 102 struct inode *inode = d_backing_inode(path->dentry);
103
102 if (inode && S_ISDIR(inode->i_mode)) { 104 if (inode && S_ISDIR(inode->i_mode)) {
103 buffer[buflen - 2] = '/'; 105 buffer[buflen - 2] = '/';
104 buffer[buflen - 1] = '\0'; 106 buffer[buflen - 1] = '\0';
@@ -123,10 +125,12 @@ static char *tomoyo_get_dentry_path(struct dentry *dentry, char * const buffer,
123 const int buflen) 125 const int buflen)
124{ 126{
125 char *pos = ERR_PTR(-ENOMEM); 127 char *pos = ERR_PTR(-ENOMEM);
128
126 if (buflen >= 256) { 129 if (buflen >= 256) {
127 pos = dentry_path_raw(dentry, buffer, buflen - 1); 130 pos = dentry_path_raw(dentry, buffer, buflen - 1);
128 if (!IS_ERR(pos) && *pos == '/' && pos[1]) { 131 if (!IS_ERR(pos) && *pos == '/' && pos[1]) {
129 struct inode *inode = d_backing_inode(dentry); 132 struct inode *inode = d_backing_inode(dentry);
133
130 if (inode && S_ISDIR(inode->i_mode)) { 134 if (inode && S_ISDIR(inode->i_mode)) {
131 buffer[buflen - 2] = '/'; 135 buffer[buflen - 2] = '/';
132 buffer[buflen - 1] = '\0'; 136 buffer[buflen - 1] = '\0';
@@ -150,12 +154,14 @@ static char *tomoyo_get_local_path(struct dentry *dentry, char * const buffer,
150{ 154{
151 struct super_block *sb = dentry->d_sb; 155 struct super_block *sb = dentry->d_sb;
152 char *pos = tomoyo_get_dentry_path(dentry, buffer, buflen); 156 char *pos = tomoyo_get_dentry_path(dentry, buffer, buflen);
157
153 if (IS_ERR(pos)) 158 if (IS_ERR(pos))
154 return pos; 159 return pos;
155 /* Convert from $PID to self if $PID is current thread. */ 160 /* Convert from $PID to self if $PID is current thread. */
156 if (sb->s_magic == PROC_SUPER_MAGIC && *pos == '/') { 161 if (sb->s_magic == PROC_SUPER_MAGIC && *pos == '/') {
157 char *ep; 162 char *ep;
158 const pid_t pid = (pid_t) simple_strtoul(pos + 1, &ep, 10); 163 const pid_t pid = (pid_t) simple_strtoul(pos + 1, &ep, 10);
164
159 if (*ep == '/' && pid && pid == 165 if (*ep == '/' && pid && pid ==
160 task_tgid_nr_ns(current, sb->s_fs_info)) { 166 task_tgid_nr_ns(current, sb->s_fs_info)) {
161 pos = ep - 5; 167 pos = ep - 5;
@@ -170,6 +176,7 @@ static char *tomoyo_get_local_path(struct dentry *dentry, char * const buffer,
170 goto prepend_filesystem_name; 176 goto prepend_filesystem_name;
171 { 177 {
172 struct inode *inode = d_backing_inode(sb->s_root); 178 struct inode *inode = d_backing_inode(sb->s_root);
179
173 /* 180 /*
174 * Use filesystem name if filesystem does not support rename() 181 * Use filesystem name if filesystem does not support rename()
175 * operation. 182 * operation.
@@ -182,6 +189,7 @@ static char *tomoyo_get_local_path(struct dentry *dentry, char * const buffer,
182 char name[64]; 189 char name[64];
183 int name_len; 190 int name_len;
184 const dev_t dev = sb->s_dev; 191 const dev_t dev = sb->s_dev;
192
185 name[sizeof(name) - 1] = '\0'; 193 name[sizeof(name) - 1] = '\0';
186 snprintf(name, sizeof(name) - 1, "dev(%u,%u):", MAJOR(dev), 194 snprintf(name, sizeof(name) - 1, "dev(%u,%u):", MAJOR(dev),
187 MINOR(dev)); 195 MINOR(dev));
@@ -197,6 +205,7 @@ prepend_filesystem_name:
197 { 205 {
198 const char *name = sb->s_type->name; 206 const char *name = sb->s_type->name;
199 const int name_len = strlen(name); 207 const int name_len = strlen(name);
208
200 pos -= name_len + 1; 209 pos -= name_len + 1;
201 if (pos < buffer) 210 if (pos < buffer)
202 goto out; 211 goto out;
@@ -223,10 +232,10 @@ static char *tomoyo_get_socket_name(const struct path *path, char * const buffer
223 struct inode *inode = d_backing_inode(path->dentry); 232 struct inode *inode = d_backing_inode(path->dentry);
224 struct socket *sock = inode ? SOCKET_I(inode) : NULL; 233 struct socket *sock = inode ? SOCKET_I(inode) : NULL;
225 struct sock *sk = sock ? sock->sk : NULL; 234 struct sock *sk = sock ? sock->sk : NULL;
235
226 if (sk) { 236 if (sk) {
227 snprintf(buffer, buflen, "socket:[family=%u:type=%u:" 237 snprintf(buffer, buflen, "socket:[family=%u:type=%u:protocol=%u]",
228 "protocol=%u]", sk->sk_family, sk->sk_type, 238 sk->sk_family, sk->sk_type, sk->sk_protocol);
229 sk->sk_protocol);
230 } else { 239 } else {
231 snprintf(buffer, buflen, "socket:[unknown]"); 240 snprintf(buffer, buflen, "socket:[unknown]");
232 } 241 }
@@ -255,12 +264,14 @@ char *tomoyo_realpath_from_path(const struct path *path)
255 unsigned int buf_len = PAGE_SIZE / 2; 264 unsigned int buf_len = PAGE_SIZE / 2;
256 struct dentry *dentry = path->dentry; 265 struct dentry *dentry = path->dentry;
257 struct super_block *sb; 266 struct super_block *sb;
267
258 if (!dentry) 268 if (!dentry)
259 return NULL; 269 return NULL;
260 sb = dentry->d_sb; 270 sb = dentry->d_sb;
261 while (1) { 271 while (1) {
262 char *pos; 272 char *pos;
263 struct inode *inode; 273 struct inode *inode;
274
264 buf_len <<= 1; 275 buf_len <<= 1;
265 kfree(buf); 276 kfree(buf);
266 buf = kmalloc(buf_len, GFP_NOFS); 277 buf = kmalloc(buf_len, GFP_NOFS);
@@ -323,6 +334,7 @@ char *tomoyo_realpath_nofollow(const char *pathname)
323 334
324 if (pathname && kern_path(pathname, 0, &path) == 0) { 335 if (pathname && kern_path(pathname, 0, &path) == 0) {
325 char *buf = tomoyo_realpath_from_path(&path); 336 char *buf = tomoyo_realpath_from_path(&path);
337
326 path_put(&path); 338 path_put(&path);
327 return buf; 339 return buf;
328 } 340 }
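
tomoyo_realpath_from_path() above sizes its buffer with a grow-and-retry loop: start below one page, double on each failure, and only keep the buffer once the path fits. A runnable model of the loop, with snprintf() standing in for the kernel's path-building helpers and 4096 assumed for PAGE_SIZE:

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096 /* demo value */

/* Model of the grow-and-retry loop in tomoyo_realpath_from_path(). */
static char *realpath_grow(const char *name)
{
        unsigned int buf_len = PAGE_SIZE / 2;
        char *buf = NULL;

        while (1) {
                int needed;

                buf_len <<= 1;
                free(buf);
                buf = malloc(buf_len);
                if (!buf)
                        return NULL;
                needed = snprintf(buf, buf_len, "/%s", name);
                if (needed >= 0 && (unsigned int)needed < buf_len)
                        return buf; /* fits: done */
        }
}

int main(void)
{
        char *p = realpath_grow("usr/sbin/httpd");

        if (p)
                puts(p);
        free(p);
        return 0;
}
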
diff --git a/security/tomoyo/securityfs_if.c b/security/tomoyo/securityfs_if.c
index 1d3d7e7a1f05..546281c5b233 100644
--- a/security/tomoyo/securityfs_if.c
+++ b/security/tomoyo/securityfs_if.c
@@ -21,6 +21,7 @@ static bool tomoyo_check_task_acl(struct tomoyo_request_info *r,
21{ 21{
22 const struct tomoyo_task_acl *acl = container_of(ptr, typeof(*acl), 22 const struct tomoyo_task_acl *acl = container_of(ptr, typeof(*acl),
23 head); 23 head);
24
24 return !tomoyo_pathcmp(r->param.task.domainname, acl->domainname); 25 return !tomoyo_pathcmp(r->param.task.domainname, acl->domainname);
25} 26}
26 27
@@ -42,6 +43,7 @@ static ssize_t tomoyo_write_self(struct file *file, const char __user *buf,
42{ 43{
43 char *data; 44 char *data;
44 int error; 45 int error;
46
45 if (!count || count >= TOMOYO_EXEC_TMPSIZE - 10) 47 if (!count || count >= TOMOYO_EXEC_TMPSIZE - 10)
46 return -ENOMEM; 48 return -ENOMEM;
47 data = memdup_user_nul(buf, count); 49 data = memdup_user_nul(buf, count);
@@ -52,6 +54,7 @@ static ssize_t tomoyo_write_self(struct file *file, const char __user *buf,
52 const int idx = tomoyo_read_lock(); 54 const int idx = tomoyo_read_lock();
53 struct tomoyo_path_info name; 55 struct tomoyo_path_info name;
54 struct tomoyo_request_info r; 56 struct tomoyo_request_info r;
57
55 name.name = data; 58 name.name = data;
56 tomoyo_fill_path_info(&name); 59 tomoyo_fill_path_info(&name);
57 /* Check "task manual_domain_transition" permission. */ 60 /* Check "task manual_domain_transition" permission. */
@@ -67,18 +70,14 @@ static ssize_t tomoyo_write_self(struct file *file, const char __user *buf,
67 if (!new_domain) { 70 if (!new_domain) {
68 error = -ENOENT; 71 error = -ENOENT;
69 } else { 72 } else {
70 struct cred *cred = prepare_creds(); 73 struct tomoyo_task *s = tomoyo_task(current);
71 if (!cred) { 74 struct tomoyo_domain_info *old_domain =
72 error = -ENOMEM; 75 s->domain_info;
73 } else { 76
74 struct tomoyo_domain_info *old_domain = 77 s->domain_info = new_domain;
75 cred->security; 78 atomic_inc(&new_domain->users);
76 cred->security = new_domain; 79 atomic_dec(&old_domain->users);
77 atomic_inc(&new_domain->users); 80 error = 0;
78 atomic_dec(&old_domain->users);
79 commit_creds(cred);
80 error = 0;
81 }
82 } 81 }
83 } 82 }
84 tomoyo_read_unlock(idx); 83 tomoyo_read_unlock(idx);
@@ -104,6 +103,7 @@ static ssize_t tomoyo_read_self(struct file *file, char __user *buf,
104 const char *domain = tomoyo_domain()->domainname->name; 103 const char *domain = tomoyo_domain()->domainname->name;
105 loff_t len = strlen(domain); 104 loff_t len = strlen(domain);
106 loff_t pos = *ppos; 105 loff_t pos = *ppos;
106
107 if (pos >= len || !count) 107 if (pos >= len || !count)
108 return 0; 108 return 0;
109 len -= pos; 109 len -= pos;
@@ -234,10 +234,14 @@ static void __init tomoyo_create_entry(const char *name, const umode_t mode,
234 */ 234 */
235static int __init tomoyo_initerface_init(void) 235static int __init tomoyo_initerface_init(void)
236{ 236{
237 struct tomoyo_domain_info *domain;
237 struct dentry *tomoyo_dir; 238 struct dentry *tomoyo_dir;
238 239
240 if (!tomoyo_enabled)
241 return 0;
242 domain = tomoyo_domain();
239 /* Don't create securityfs entries unless registered. */ 243 /* Don't create securityfs entries unless registered. */
240 if (current_cred()->security != &tomoyo_kernel_domain) 244 if (domain != &tomoyo_kernel_domain)
241 return 0; 245 return 0;
242 246
243 tomoyo_dir = securityfs_create_dir("tomoyo", NULL); 247 tomoyo_dir = securityfs_create_dir("tomoyo", NULL);
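
With the domain pointer in the task blob, the tomoyo_write_self() hunk no longer needs prepare_creds()/commit_creds(): it swaps s->domain_info directly and adjusts the two refcounts. The read side is a plain offset read; a runnable model of tomoyo_read_self()'s pos/len handling (the domain string is a made-up example):

#include <stdio.h>
#include <string.h>

/* Model of tomoyo_read_self(): serve at most `count` bytes of the
 * caller's domain name starting at *ppos, the way read() on
 * /sys/kernel/security/tomoyo/self_domain behaves. */
static long read_self(char *buf, size_t count, long *ppos)
{
        const char *domain = "<kernel> /usr/sbin/sshd";
        long len = strlen(domain);
        long pos = *ppos;

        if (pos >= len || !count)
                return 0;
        len -= pos;
        if ((size_t)len > count)
                len = count;
        memcpy(buf, domain + pos, len);
        *ppos += len;
        return len;
}

int main(void)
{
        char buf[8];
        long pos = 0, n;

        while ((n = read_self(buf, sizeof(buf) - 1, &pos)) > 0) {
                buf[n] = '\0';
                printf("chunk: \"%s\"\n", buf);
        }
        return 0;
}
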
diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c
index 1b5b5097efd7..716c92ec941a 100644
--- a/security/tomoyo/tomoyo.c
+++ b/security/tomoyo/tomoyo.c
@@ -9,17 +9,19 @@
9#include "common.h" 9#include "common.h"
10 10
11/** 11/**
12 * tomoyo_cred_alloc_blank - Target for security_cred_alloc_blank(). 12 * tomoyo_domain - Get "struct tomoyo_domain_info" for current thread.
13 * 13 *
14 * @new: Pointer to "struct cred". 14 * Returns pointer to "struct tomoyo_domain_info" for current thread.
15 * @gfp: Memory allocation flags.
16 *
17 * Returns 0.
18 */ 15 */
19static int tomoyo_cred_alloc_blank(struct cred *new, gfp_t gfp) 16struct tomoyo_domain_info *tomoyo_domain(void)
20{ 17{
21 new->security = NULL; 18 struct tomoyo_task *s = tomoyo_task(current);
22 return 0; 19
20 if (s->old_domain_info && !current->in_execve) {
21 atomic_dec(&s->old_domain_info->users);
22 s->old_domain_info = NULL;
23 }
24 return s->domain_info;
23} 25}
24 26
25/** 27/**
@@ -34,42 +36,38 @@ static int tomoyo_cred_alloc_blank(struct cred *new, gfp_t gfp)
34static int tomoyo_cred_prepare(struct cred *new, const struct cred *old, 36static int tomoyo_cred_prepare(struct cred *new, const struct cred *old,
35 gfp_t gfp) 37 gfp_t gfp)
36{ 38{
37 struct tomoyo_domain_info *domain = old->security; 39 /* Restore old_domain_info saved by previous execve() request. */
38 new->security = domain; 40 struct tomoyo_task *s = tomoyo_task(current);
39 if (domain) 41
40 atomic_inc(&domain->users); 42 if (s->old_domain_info && !current->in_execve) {
43 atomic_dec(&s->domain_info->users);
44 s->domain_info = s->old_domain_info;
45 s->old_domain_info = NULL;
46 }
41 return 0; 47 return 0;
42} 48}
43 49
44/** 50/**
45 * tomoyo_cred_transfer - Target for security_transfer_creds(). 51 * tomoyo_bprm_committed_creds - Target for security_bprm_committed_creds().
46 * 52 *
47 * @new: Pointer to "struct cred". 53 * @bprm: Pointer to "struct linux_binprm".
48 * @old: Pointer to "struct cred".
49 */ 54 */
50static void tomoyo_cred_transfer(struct cred *new, const struct cred *old) 55static void tomoyo_bprm_committed_creds(struct linux_binprm *bprm)
51{ 56{
52 tomoyo_cred_prepare(new, old, 0); 57 /* Clear old_domain_info saved by execve() request. */
53} 58 struct tomoyo_task *s = tomoyo_task(current);
54 59
55/** 60 atomic_dec(&s->old_domain_info->users);
56 * tomoyo_cred_free - Target for security_cred_free(). 61 s->old_domain_info = NULL;
57 *
58 * @cred: Pointer to "struct cred".
59 */
60static void tomoyo_cred_free(struct cred *cred)
61{
62 struct tomoyo_domain_info *domain = cred->security;
63 if (domain)
64 atomic_dec(&domain->users);
65} 62}
66 63
64#ifndef CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER
67/** 65/**
68 * tomoyo_bprm_set_creds - Target for security_bprm_set_creds(). 66 * tomoyo_bprm_set_creds - Target for security_bprm_set_creds().
69 * 67 *
70 * @bprm: Pointer to "struct linux_binprm". 68 * @bprm: Pointer to "struct linux_binprm".
71 * 69 *
72 * Returns 0 on success, negative value otherwise. 70 * Returns 0.
73 */ 71 */
74static int tomoyo_bprm_set_creds(struct linux_binprm *bprm) 72static int tomoyo_bprm_set_creds(struct linux_binprm *bprm)
75{ 73{
@@ -79,29 +77,15 @@ static int tomoyo_bprm_set_creds(struct linux_binprm *bprm)
79 */ 77 */
80 if (bprm->called_set_creds) 78 if (bprm->called_set_creds)
81 return 0; 79 return 0;
82#ifndef CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER
83 /* 80 /*
84 * Load policy if /sbin/tomoyo-init exists and /sbin/init is requested 81 * Load policy if /sbin/tomoyo-init exists and /sbin/init is requested
85 * for the first time. 82 * for the first time.
86 */ 83 */
87 if (!tomoyo_policy_loaded) 84 if (!tomoyo_policy_loaded)
88 tomoyo_load_policy(bprm->filename); 85 tomoyo_load_policy(bprm->filename);
89#endif
90 /*
91 * Release reference to "struct tomoyo_domain_info" stored inside
92 * "bprm->cred->security". New reference to "struct tomoyo_domain_info"
93 * stored inside "bprm->cred->security" will be acquired later inside
94 * tomoyo_find_next_domain().
95 */
96 atomic_dec(&((struct tomoyo_domain_info *)
97 bprm->cred->security)->users);
98 /*
99 * Tell tomoyo_bprm_check_security() is called for the first time of an
100 * execve operation.
101 */
102 bprm->cred->security = NULL;
103 return 0; 86 return 0;
104} 87}
88#endif
105 89
106/** 90/**
107 * tomoyo_bprm_check_security - Target for security_bprm_check(). 91 * tomoyo_bprm_check_security - Target for security_bprm_check().
@@ -112,23 +96,24 @@ static int tomoyo_bprm_set_creds(struct linux_binprm *bprm)
112 */ 96 */
113static int tomoyo_bprm_check_security(struct linux_binprm *bprm) 97static int tomoyo_bprm_check_security(struct linux_binprm *bprm)
114{ 98{
115 struct tomoyo_domain_info *domain = bprm->cred->security; 99 struct tomoyo_task *s = tomoyo_task(current);
116 100
117 /* 101 /*
118 * Execute permission is checked against pathname passed to do_execve() 102 * Execute permission is checked against pathname passed to do_execve()
119 * using current domain. 103 * using current domain.
120 */ 104 */
121 if (!domain) { 105 if (!s->old_domain_info) {
122 const int idx = tomoyo_read_lock(); 106 const int idx = tomoyo_read_lock();
123 const int err = tomoyo_find_next_domain(bprm); 107 const int err = tomoyo_find_next_domain(bprm);
108
124 tomoyo_read_unlock(idx); 109 tomoyo_read_unlock(idx);
125 return err; 110 return err;
126 } 111 }
127 /* 112 /*
128 * Read permission is checked against interpreters using next domain. 113 * Read permission is checked against interpreters using next domain.
129 */ 114 */
130 return tomoyo_check_open_permission(domain, &bprm->file->f_path, 115 return tomoyo_check_open_permission(s->domain_info,
131 O_RDONLY); 116 &bprm->file->f_path, O_RDONLY);
132} 117}
133 118
134/** 119/**
@@ -167,6 +152,7 @@ static int tomoyo_path_truncate(const struct path *path)
167static int tomoyo_path_unlink(const struct path *parent, struct dentry *dentry) 152static int tomoyo_path_unlink(const struct path *parent, struct dentry *dentry)
168{ 153{
169 struct path path = { .mnt = parent->mnt, .dentry = dentry }; 154 struct path path = { .mnt = parent->mnt, .dentry = dentry };
155
170 return tomoyo_path_perm(TOMOYO_TYPE_UNLINK, &path, NULL); 156 return tomoyo_path_perm(TOMOYO_TYPE_UNLINK, &path, NULL);
171} 157}
172 158
@@ -183,6 +169,7 @@ static int tomoyo_path_mkdir(const struct path *parent, struct dentry *dentry,
183 umode_t mode) 169 umode_t mode)
184{ 170{
185 struct path path = { .mnt = parent->mnt, .dentry = dentry }; 171 struct path path = { .mnt = parent->mnt, .dentry = dentry };
172
186 return tomoyo_path_number_perm(TOMOYO_TYPE_MKDIR, &path, 173 return tomoyo_path_number_perm(TOMOYO_TYPE_MKDIR, &path,
187 mode & S_IALLUGO); 174 mode & S_IALLUGO);
188} 175}
@@ -198,6 +185,7 @@ static int tomoyo_path_mkdir(const struct path *parent, struct dentry *dentry,
198static int tomoyo_path_rmdir(const struct path *parent, struct dentry *dentry) 185static int tomoyo_path_rmdir(const struct path *parent, struct dentry *dentry)
199{ 186{
200 struct path path = { .mnt = parent->mnt, .dentry = dentry }; 187 struct path path = { .mnt = parent->mnt, .dentry = dentry };
188
201 return tomoyo_path_perm(TOMOYO_TYPE_RMDIR, &path, NULL); 189 return tomoyo_path_perm(TOMOYO_TYPE_RMDIR, &path, NULL);
202} 190}
203 191
@@ -214,6 +202,7 @@ static int tomoyo_path_symlink(const struct path *parent, struct dentry *dentry,
214 const char *old_name) 202 const char *old_name)
215{ 203{
216 struct path path = { .mnt = parent->mnt, .dentry = dentry }; 204 struct path path = { .mnt = parent->mnt, .dentry = dentry };
205
217 return tomoyo_path_perm(TOMOYO_TYPE_SYMLINK, &path, old_name); 206 return tomoyo_path_perm(TOMOYO_TYPE_SYMLINK, &path, old_name);
218} 207}
219 208
@@ -271,6 +260,7 @@ static int tomoyo_path_link(struct dentry *old_dentry, const struct path *new_di
271{ 260{
272 struct path path1 = { .mnt = new_dir->mnt, .dentry = old_dentry }; 261 struct path path1 = { .mnt = new_dir->mnt, .dentry = old_dentry };
273 struct path path2 = { .mnt = new_dir->mnt, .dentry = new_dentry }; 262 struct path path2 = { .mnt = new_dir->mnt, .dentry = new_dentry };
263
274 return tomoyo_path2_perm(TOMOYO_TYPE_LINK, &path1, &path2); 264 return tomoyo_path2_perm(TOMOYO_TYPE_LINK, &path1, &path2);
275} 265}
276 266
@@ -291,6 +281,7 @@ static int tomoyo_path_rename(const struct path *old_parent,
291{ 281{
292 struct path path1 = { .mnt = old_parent->mnt, .dentry = old_dentry }; 282 struct path path1 = { .mnt = old_parent->mnt, .dentry = old_dentry };
293 struct path path2 = { .mnt = new_parent->mnt, .dentry = new_dentry }; 283 struct path path2 = { .mnt = new_parent->mnt, .dentry = new_dentry };
284
294 return tomoyo_path2_perm(TOMOYO_TYPE_RENAME, &path1, &path2); 285 return tomoyo_path2_perm(TOMOYO_TYPE_RENAME, &path1, &path2);
295} 286}
296 287
@@ -322,11 +313,11 @@ static int tomoyo_file_fcntl(struct file *file, unsigned int cmd,
322 */ 313 */
323static int tomoyo_file_open(struct file *f) 314static int tomoyo_file_open(struct file *f)
324{ 315{
325 int flags = f->f_flags;
326 /* Don't check read permission here if called from do_execve(). */ 316 /* Don't check read permission here if called from do_execve(). */
327 if (current->in_execve) 317 if (current->in_execve)
328 return 0; 318 return 0;
329 return tomoyo_check_open_permission(tomoyo_domain(), &f->f_path, flags); 319 return tomoyo_check_open_permission(tomoyo_domain(), &f->f_path,
320 f->f_flags);
330} 321}
331 322
332/** 323/**
@@ -370,6 +361,7 @@ static int tomoyo_path_chmod(const struct path *path, umode_t mode)
370static int tomoyo_path_chown(const struct path *path, kuid_t uid, kgid_t gid) 361static int tomoyo_path_chown(const struct path *path, kuid_t uid, kgid_t gid)
371{ 362{
372 int error = 0; 363 int error = 0;
364
373 if (uid_valid(uid)) 365 if (uid_valid(uid))
374 error = tomoyo_path_number_perm(TOMOYO_TYPE_CHOWN, path, 366 error = tomoyo_path_number_perm(TOMOYO_TYPE_CHOWN, path,
375 from_kuid(&init_user_ns, uid)); 367 from_kuid(&init_user_ns, uid));
@@ -419,6 +411,7 @@ static int tomoyo_sb_mount(const char *dev_name, const struct path *path,
419static int tomoyo_sb_umount(struct vfsmount *mnt, int flags) 411static int tomoyo_sb_umount(struct vfsmount *mnt, int flags)
420{ 412{
421 struct path path = { .mnt = mnt, .dentry = mnt->mnt_root }; 413 struct path path = { .mnt = mnt, .dentry = mnt->mnt_root };
414
422 return tomoyo_path_perm(TOMOYO_TYPE_UMOUNT, &path, NULL); 415 return tomoyo_path_perm(TOMOYO_TYPE_UMOUNT, &path, NULL);
423} 416}
424 417
@@ -493,16 +486,61 @@ static int tomoyo_socket_sendmsg(struct socket *sock, struct msghdr *msg,
493 return tomoyo_socket_sendmsg_permission(sock, msg, size); 486 return tomoyo_socket_sendmsg_permission(sock, msg, size);
494} 487}
495 488
489struct lsm_blob_sizes tomoyo_blob_sizes __lsm_ro_after_init = {
490 .lbs_task = sizeof(struct tomoyo_task),
491};
492
493/**
494 * tomoyo_task_alloc - Target for security_task_alloc().
495 *
496 * @task: Pointer to "struct task_struct".
497 * @flags: clone() flags.
498 *
499 * Returns 0.
500 */
501static int tomoyo_task_alloc(struct task_struct *task,
502 unsigned long clone_flags)
503{
504 struct tomoyo_task *old = tomoyo_task(current);
505 struct tomoyo_task *new = tomoyo_task(task);
506
507 new->domain_info = old->domain_info;
508 atomic_inc(&new->domain_info->users);
509 new->old_domain_info = NULL;
510 return 0;
511}
512
513/**
514 * tomoyo_task_free - Target for security_task_free().
515 *
516 * @task: Pointer to "struct task_struct".
517 */
518static void tomoyo_task_free(struct task_struct *task)
519{
520 struct tomoyo_task *s = tomoyo_task(task);
521
522 if (s->domain_info) {
523 atomic_dec(&s->domain_info->users);
524 s->domain_info = NULL;
525 }
526 if (s->old_domain_info) {
527 atomic_dec(&s->old_domain_info->users);
528 s->old_domain_info = NULL;
529 }
530}
531
496/* 532/*
497 * tomoyo_security_ops is a "struct security_operations" which is used for 533 * tomoyo_security_ops is a "struct security_operations" which is used for
498 * registering TOMOYO. 534 * registering TOMOYO.
499 */ 535 */
500static struct security_hook_list tomoyo_hooks[] __lsm_ro_after_init = { 536static struct security_hook_list tomoyo_hooks[] __lsm_ro_after_init = {
501 LSM_HOOK_INIT(cred_alloc_blank, tomoyo_cred_alloc_blank),
502 LSM_HOOK_INIT(cred_prepare, tomoyo_cred_prepare), 537 LSM_HOOK_INIT(cred_prepare, tomoyo_cred_prepare),
503 LSM_HOOK_INIT(cred_transfer, tomoyo_cred_transfer), 538 LSM_HOOK_INIT(bprm_committed_creds, tomoyo_bprm_committed_creds),
504 LSM_HOOK_INIT(cred_free, tomoyo_cred_free), 539 LSM_HOOK_INIT(task_alloc, tomoyo_task_alloc),
540 LSM_HOOK_INIT(task_free, tomoyo_task_free),
541#ifndef CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER
505 LSM_HOOK_INIT(bprm_set_creds, tomoyo_bprm_set_creds), 542 LSM_HOOK_INIT(bprm_set_creds, tomoyo_bprm_set_creds),
543#endif
506 LSM_HOOK_INIT(bprm_check_security, tomoyo_bprm_check_security), 544 LSM_HOOK_INIT(bprm_check_security, tomoyo_bprm_check_security),
507 LSM_HOOK_INIT(file_fcntl, tomoyo_file_fcntl), 545 LSM_HOOK_INIT(file_fcntl, tomoyo_file_fcntl),
508 LSM_HOOK_INIT(file_open, tomoyo_file_open), 546 LSM_HOOK_INIT(file_open, tomoyo_file_open),
@@ -531,6 +569,8 @@ static struct security_hook_list tomoyo_hooks[] __lsm_ro_after_init = {
531/* Lock for GC. */ 569/* Lock for GC. */
532DEFINE_SRCU(tomoyo_ss); 570DEFINE_SRCU(tomoyo_ss);
533 571
572int tomoyo_enabled __lsm_ro_after_init = 1;
573
534/** 574/**
535 * tomoyo_init - Register TOMOYO Linux as a LSM module. 575 * tomoyo_init - Register TOMOYO Linux as a LSM module.
536 * 576 *
@@ -538,19 +578,23 @@ DEFINE_SRCU(tomoyo_ss);
538 */ 578 */
539static int __init tomoyo_init(void) 579static int __init tomoyo_init(void)
540{ 580{
541 struct cred *cred = (struct cred *) current_cred(); 581 struct tomoyo_task *s = tomoyo_task(current);
542 582
543 if (!security_module_enable("tomoyo"))
544 return 0;
545 /* register ourselves with the security framework */ 583 /* register ourselves with the security framework */
546 security_add_hooks(tomoyo_hooks, ARRAY_SIZE(tomoyo_hooks), "tomoyo"); 584 security_add_hooks(tomoyo_hooks, ARRAY_SIZE(tomoyo_hooks), "tomoyo");
547 printk(KERN_INFO "TOMOYO Linux initialized\n"); 585 pr_info("TOMOYO Linux initialized\n");
548 cred->security = &tomoyo_kernel_domain; 586 s->domain_info = &tomoyo_kernel_domain;
587 atomic_inc(&tomoyo_kernel_domain.users);
588 s->old_domain_info = NULL;
549 tomoyo_mm_init(); 589 tomoyo_mm_init();
590
550 return 0; 591 return 0;
551} 592}
552 593
553DEFINE_LSM(tomoyo) = { 594DEFINE_LSM(tomoyo) = {
554 .name = "tomoyo", 595 .name = "tomoyo",
596 .enabled = &tomoyo_enabled,
597 .flags = LSM_FLAG_LEGACY_MAJOR,
598 .blobs = &tomoyo_blob_sizes,
555 .init = tomoyo_init, 599 .init = tomoyo_init,
556}; 600};
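For reference, DEFINE_LSM() is expected to emit a struct lsm_info into a dedicated init section that the boot-time LSM core walks. A rough sketch of the 5.1-era definitions (field names recalled from include/linux/lsm_hooks.h, not quoted from this diff):

	struct lsm_info {
		const char *name;		/* Required */
		enum lsm_order order;		/* Optional, default LSM_ORDER_MUTABLE */
		unsigned long flags;		/* Optional, e.g. LSM_FLAG_LEGACY_MAJOR */
		int *enabled;			/* Optional runtime enable flag */
		int (*init)(void);		/* Required */
		struct lsm_blob_sizes *blobs;	/* Optional per-object blob sizes */
	};

	#define DEFINE_LSM(lsm)					\
		static struct lsm_info __lsm_##lsm		\
		__used __section(".lsm_info.init")		\
		__aligned(sizeof(unsigned long))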
diff --git a/security/tomoyo/util.c b/security/tomoyo/util.c
index badffc8271c8..0517cbdd7275 100644
--- a/security/tomoyo/util.c
+++ b/security/tomoyo/util.c
@@ -91,6 +91,7 @@ const u8 tomoyo_index2category[TOMOYO_MAX_MAC_INDEX] = {
91void tomoyo_convert_time(time64_t time64, struct tomoyo_time *stamp) 91void tomoyo_convert_time(time64_t time64, struct tomoyo_time *stamp)
92{ 92{
93 struct tm tm; 93 struct tm tm;
94
94 time64_to_tm(time64, 0, &tm); 95 time64_to_tm(time64, 0, &tm);
95 stamp->sec = tm.tm_sec; 96 stamp->sec = tm.tm_sec;
96 stamp->min = tm.tm_min; 97 stamp->min = tm.tm_min;
@@ -113,6 +114,7 @@ void tomoyo_convert_time(time64_t time64, struct tomoyo_time *stamp)
113bool tomoyo_permstr(const char *string, const char *keyword) 114bool tomoyo_permstr(const char *string, const char *keyword)
114{ 115{
115 const char *cp = strstr(string, keyword); 116 const char *cp = strstr(string, keyword);
117
116 if (cp) 118 if (cp)
117 return cp == string || *(cp - 1) == '/'; 119 return cp == string || *(cp - 1) == '/';
118 return false; 120 return false;
@@ -132,6 +134,7 @@ char *tomoyo_read_token(struct tomoyo_acl_param *param)
132{ 134{
133 char *pos = param->data; 135 char *pos = param->data;
134 char *del = strchr(pos, ' '); 136 char *del = strchr(pos, ' ');
137
135 if (del) 138 if (del)
136 *del++ = '\0'; 139 *del++ = '\0';
137 else 140 else
@@ -152,6 +155,7 @@ const struct tomoyo_path_info *tomoyo_get_domainname
152{ 155{
153 char *start = param->data; 156 char *start = param->data;
154 char *pos = start; 157 char *pos = start;
158
155 while (*pos) { 159 while (*pos) {
156 if (*pos++ != ' ' || *pos++ == '/') 160 if (*pos++ != ' ' || *pos++ == '/')
157 continue; 161 continue;
@@ -181,8 +185,10 @@ u8 tomoyo_parse_ulong(unsigned long *result, char **str)
181 const char *cp = *str; 185 const char *cp = *str;
182 char *ep; 186 char *ep;
183 int base = 10; 187 int base = 10;
188
184 if (*cp == '0') { 189 if (*cp == '0') {
185 char c = *(cp + 1); 190 char c = *(cp + 1);
191
186 if (c == 'x' || c == 'X') { 192 if (c == 'x' || c == 'X') {
187 base = 16; 193 base = 16;
188 cp += 2; 194 cp += 2;
@@ -240,6 +246,7 @@ bool tomoyo_parse_name_union(struct tomoyo_acl_param *param,
240 struct tomoyo_name_union *ptr) 246 struct tomoyo_name_union *ptr)
241{ 247{
242 char *filename; 248 char *filename;
249
243 if (param->data[0] == '@') { 250 if (param->data[0] == '@') {
244 param->data++; 251 param->data++;
245 ptr->group = tomoyo_get_group(param, TOMOYO_PATH_GROUP); 252 ptr->group = tomoyo_get_group(param, TOMOYO_PATH_GROUP);
@@ -266,6 +273,7 @@ bool tomoyo_parse_number_union(struct tomoyo_acl_param *param,
266 char *data; 273 char *data;
267 u8 type; 274 u8 type;
268 unsigned long v; 275 unsigned long v;
276
269 memset(ptr, 0, sizeof(*ptr)); 277 memset(ptr, 0, sizeof(*ptr));
270 if (param->data[0] == '@') { 278 if (param->data[0] == '@') {
271 param->data++; 279 param->data++;
@@ -429,6 +437,7 @@ static bool tomoyo_correct_word2(const char *string, size_t len)
429 unsigned char c; 437 unsigned char c;
430 unsigned char d; 438 unsigned char d;
431 unsigned char e; 439 unsigned char e;
440
432 if (!len) 441 if (!len)
433 goto out; 442 goto out;
434 while (len--) { 443 while (len--) {
@@ -533,6 +542,7 @@ bool tomoyo_correct_domain(const unsigned char *domainname)
533 return true; 542 return true;
534 while (1) { 543 while (1) {
535 const unsigned char *cp = strchr(domainname, ' '); 544 const unsigned char *cp = strchr(domainname, ' ');
545
536 if (!cp) 546 if (!cp)
537 break; 547 break;
538 if (*domainname != '/' || 548 if (*domainname != '/' ||
@@ -554,6 +564,7 @@ bool tomoyo_domain_def(const unsigned char *buffer)
554{ 564{
555 const unsigned char *cp; 565 const unsigned char *cp;
556 int len; 566 int len;
567
557 if (*buffer != '<') 568 if (*buffer != '<')
558 return false; 569 return false;
559 cp = strchr(buffer, ' '); 570 cp = strchr(buffer, ' ');
@@ -668,6 +679,9 @@ static bool tomoyo_file_matches_pattern2(const char *filename,
668{ 679{
669 while (filename < filename_end && pattern < pattern_end) { 680 while (filename < filename_end && pattern < pattern_end) {
670 char c; 681 char c;
682 int i;
683 int j;
684
671 if (*pattern != '\\') { 685 if (*pattern != '\\') {
672 if (*filename++ != *pattern++) 686 if (*filename++ != *pattern++)
673 return false; 687 return false;
@@ -676,8 +690,6 @@ static bool tomoyo_file_matches_pattern2(const char *filename,
676 c = *filename; 690 c = *filename;
677 pattern++; 691 pattern++;
678 switch (*pattern) { 692 switch (*pattern) {
679 int i;
680 int j;
681 case '?': 693 case '?':
682 if (c == '/') { 694 if (c == '/') {
683 return false; 695 return false;
@@ -985,6 +997,7 @@ int tomoyo_init_request_info(struct tomoyo_request_info *r,
985 struct tomoyo_domain_info *domain, const u8 index) 997 struct tomoyo_domain_info *domain, const u8 index)
986{ 998{
987 u8 profile; 999 u8 profile;
1000
988 memset(r, 0, sizeof(*r)); 1001 memset(r, 0, sizeof(*r));
989 if (!domain) 1002 if (!domain)
990 domain = tomoyo_domain(); 1003 domain = tomoyo_domain();
@@ -1018,6 +1031,7 @@ bool tomoyo_domain_quota_is_ok(struct tomoyo_request_info *r)
1018 list_for_each_entry_rcu(ptr, &domain->acl_info_list, list) { 1031 list_for_each_entry_rcu(ptr, &domain->acl_info_list, list) {
1019 u16 perm; 1032 u16 perm;
1020 u8 i; 1033 u8 i;
1034
1021 if (ptr->is_deleted) 1035 if (ptr->is_deleted)
1022 continue; 1036 continue;
1023 switch (ptr->type) { 1037 switch (ptr->type) {
@@ -1062,9 +1076,8 @@ bool tomoyo_domain_quota_is_ok(struct tomoyo_request_info *r)
1062 domain->flags[TOMOYO_DIF_QUOTA_WARNED] = true; 1076 domain->flags[TOMOYO_DIF_QUOTA_WARNED] = true;
1063 /* r->granted = false; */ 1077 /* r->granted = false; */
1064 tomoyo_write_log(r, "%s", tomoyo_dif[TOMOYO_DIF_QUOTA_WARNED]); 1078 tomoyo_write_log(r, "%s", tomoyo_dif[TOMOYO_DIF_QUOTA_WARNED]);
1065 printk(KERN_WARNING "WARNING: " 1079 pr_warn("WARNING: Domain '%s' has too many ACLs to hold. Stopped learning mode.\n",
1066 "Domain '%s' has too many ACLs to hold. " 1080 domain->domainname->name);
1067 "Stopped learning mode.\n", domain->domainname->name);
1068 } 1081 }
1069 return false; 1082 return false;
1070} 1083}
diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c
index 02514fe558b4..57cc60722dd3 100644
--- a/security/yama/yama_lsm.c
+++ b/security/yama/yama_lsm.c
@@ -479,9 +479,15 @@ static void __init yama_init_sysctl(void)
479static inline void yama_init_sysctl(void) { } 479static inline void yama_init_sysctl(void) { }
480#endif /* CONFIG_SYSCTL */ 480#endif /* CONFIG_SYSCTL */
481 481
482void __init yama_add_hooks(void) 482static int __init yama_init(void)
483{ 483{
484 pr_info("Yama: becoming mindful.\n"); 484 pr_info("Yama: becoming mindful.\n");
485 security_add_hooks(yama_hooks, ARRAY_SIZE(yama_hooks), "yama"); 485 security_add_hooks(yama_hooks, ARRAY_SIZE(yama_hooks), "yama");
486 yama_init_sysctl(); 486 yama_init_sysctl();
487 return 0;
487} 488}
489
490DEFINE_LSM(yama) = {
491 .name = "yama",
492 .init = yama_init,
493};
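With this conversion, Yama is initialized by the generic LSM core instead of an exported yama_add_hooks() call. Conceptually the ordered init walk looks like the sketch below (ordered_lsms and lsm_allowed() are assumed names, not quoted from security/security.c):

	static void __init ordered_lsm_init(void)
	{
		struct lsm_info **lsm;

		for (lsm = ordered_lsms; *lsm; lsm++)
			if (lsm_allowed(*lsm))	/* honours "lsm=" and ->enabled */
				(*lsm)->init();
	}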
diff --git a/tools/testing/selftests/powerpc/benchmarks/null_syscall.c b/tools/testing/selftests/powerpc/benchmarks/null_syscall.c
index ecc14d68e101..908de689a902 100644
--- a/tools/testing/selftests/powerpc/benchmarks/null_syscall.c
+++ b/tools/testing/selftests/powerpc/benchmarks/null_syscall.c
@@ -25,7 +25,7 @@ unsigned long long clock_frequency;
25unsigned long long timebase_frequency; 25unsigned long long timebase_frequency;
26double timebase_multiplier; 26double timebase_multiplier;
27 27
28static inline unsigned long long mftb(void) 28static inline unsigned long mftb(void)
29{ 29{
30 unsigned long low; 30 unsigned long low;
31 31
diff --git a/tools/testing/selftests/powerpc/include/reg.h b/tools/testing/selftests/powerpc/include/reg.h
index 52b4710469d2..96043b9b9829 100644
--- a/tools/testing/selftests/powerpc/include/reg.h
+++ b/tools/testing/selftests/powerpc/include/reg.h
@@ -77,6 +77,14 @@
77#define TEXASR_TE 0x0000000004000000 77#define TEXASR_TE 0x0000000004000000
78#define TEXASR_ROT 0x0000000002000000 78#define TEXASR_ROT 0x0000000002000000
79 79
80/* MSR register bits */
81#define MSR_TS_S_LG 33 /* Trans Mem state: Suspended */
82
83#define __MASK(X) (1UL<<(X))
84
85/* macro to check TM MSR bits */
86#define MSR_TS_S __MASK(MSR_TS_S_LG) /* Transaction Suspended */
87
80/* Vector Instructions */ 88/* Vector Instructions */
81#define VSX_XX1(xs, ra, rb) (((xs) & 0x1f) << 21 | ((ra) << 16) | \ 89#define VSX_XX1(xs, ra, rb) (((xs) & 0x1f) << 21 | ((ra) << 16) | \
82 ((rb) << 11) | (((xs) >> 5))) 90 ((rb) << 11) | (((xs) >> 5)))
diff --git a/tools/testing/selftests/powerpc/include/utils.h b/tools/testing/selftests/powerpc/include/utils.h
index ae43a614835d..7636bf45d5d5 100644
--- a/tools/testing/selftests/powerpc/include/utils.h
+++ b/tools/testing/selftests/powerpc/include/utils.h
@@ -102,8 +102,10 @@ do { \
102 102
103#if defined(__powerpc64__) 103#if defined(__powerpc64__)
104#define UCONTEXT_NIA(UC) (UC)->uc_mcontext.gp_regs[PT_NIP] 104#define UCONTEXT_NIA(UC) (UC)->uc_mcontext.gp_regs[PT_NIP]
105#define UCONTEXT_MSR(UC) (UC)->uc_mcontext.gp_regs[PT_MSR]
105#elif defined(__powerpc__) 106#elif defined(__powerpc__)
106#define UCONTEXT_NIA(UC) (UC)->uc_mcontext.uc_regs->gregs[PT_NIP] 107#define UCONTEXT_NIA(UC) (UC)->uc_mcontext.uc_regs->gregs[PT_NIP]
108#define UCONTEXT_MSR(UC) (UC)->uc_mcontext.uc_regs->gregs[PT_MSR]
107#else 109#else
108#error implement UCONTEXT_NIA 110#error implement UCONTEXT_NIA
109#endif 111#endif
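Taken together, the MSR_TS_S mask and the UCONTEXT_MSR() accessor added above let a signal handler check whether it interrupted a suspended transaction. A minimal usage sketch, assuming the selftest headers above:

	#include <signal.h>
	#include <ucontext.h>
	#include "reg.h"	/* MSR_TS_S */
	#include "utils.h"	/* UCONTEXT_MSR */

	static volatile int hit_suspended;

	static void handler(int signum, siginfo_t *info, void *uc)
	{
		ucontext_t *ucp = uc;

		/* Non-zero if the signal interrupted a suspended transaction */
		hit_suspended = !!(UCONTEXT_MSR(ucp) & MSR_TS_S);
	}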
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/fork_cleanup_test.c b/tools/testing/selftests/powerpc/pmu/ebb/fork_cleanup_test.c
index 167135bd92a8..af1b80265076 100644
--- a/tools/testing/selftests/powerpc/pmu/ebb/fork_cleanup_test.c
+++ b/tools/testing/selftests/powerpc/pmu/ebb/fork_cleanup_test.c
@@ -11,7 +11,6 @@
11#include <sys/wait.h> 11#include <sys/wait.h>
12#include <unistd.h> 12#include <unistd.h>
13#include <setjmp.h> 13#include <setjmp.h>
14#include <signal.h>
15 14
16#include "ebb.h" 15#include "ebb.h"
17 16
diff --git a/tools/testing/selftests/powerpc/tm/.gitignore b/tools/testing/selftests/powerpc/tm/.gitignore
index 208452a93e2c..951fe855f7cd 100644
--- a/tools/testing/selftests/powerpc/tm/.gitignore
+++ b/tools/testing/selftests/powerpc/tm/.gitignore
@@ -11,6 +11,7 @@ tm-signal-context-chk-fpu
11tm-signal-context-chk-gpr 11tm-signal-context-chk-gpr
12tm-signal-context-chk-vmx 12tm-signal-context-chk-vmx
13tm-signal-context-chk-vsx 13tm-signal-context-chk-vsx
14tm-signal-context-force-tm
14tm-signal-sigreturn-nt 15tm-signal-sigreturn-nt
15tm-vmx-unavail 16tm-vmx-unavail
16tm-unavailable 17tm-unavailable
diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile
index 75a685359129..c0734ed0ef56 100644
--- a/tools/testing/selftests/powerpc/tm/Makefile
+++ b/tools/testing/selftests/powerpc/tm/Makefile
@@ -4,7 +4,8 @@ SIGNAL_CONTEXT_CHK_TESTS := tm-signal-context-chk-gpr tm-signal-context-chk-fpu
4 4
5TEST_GEN_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv tm-signal-stack \ 5TEST_GEN_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv tm-signal-stack \
6 tm-vmxcopy tm-fork tm-tar tm-tmspr tm-vmx-unavail tm-unavailable tm-trap \ 6 tm-vmxcopy tm-fork tm-tar tm-tmspr tm-vmx-unavail tm-unavailable tm-trap \
7 $(SIGNAL_CONTEXT_CHK_TESTS) tm-sigreturn tm-signal-sigreturn-nt 7 $(SIGNAL_CONTEXT_CHK_TESTS) tm-sigreturn tm-signal-sigreturn-nt \
8 tm-signal-context-force-tm
8 9
9top_srcdir = ../../../../.. 10top_srcdir = ../../../../..
10include ../../lib.mk 11include ../../lib.mk
@@ -20,6 +21,7 @@ $(OUTPUT)/tm-vmx-unavail: CFLAGS += -pthread -m64
20$(OUTPUT)/tm-resched-dscr: ../pmu/lib.c 21$(OUTPUT)/tm-resched-dscr: ../pmu/lib.c
21$(OUTPUT)/tm-unavailable: CFLAGS += -O0 -pthread -m64 -Wno-error=uninitialized -mvsx 22$(OUTPUT)/tm-unavailable: CFLAGS += -O0 -pthread -m64 -Wno-error=uninitialized -mvsx
22$(OUTPUT)/tm-trap: CFLAGS += -O0 -pthread -m64 23$(OUTPUT)/tm-trap: CFLAGS += -O0 -pthread -m64
24$(OUTPUT)/tm-signal-context-force-tm: CFLAGS += -pthread -m64
23 25
24SIGNAL_CONTEXT_CHK_TESTS := $(patsubst %,$(OUTPUT)/%,$(SIGNAL_CONTEXT_CHK_TESTS)) 26SIGNAL_CONTEXT_CHK_TESTS := $(patsubst %,$(OUTPUT)/%,$(SIGNAL_CONTEXT_CHK_TESTS))
25$(SIGNAL_CONTEXT_CHK_TESTS): tm-signal.S 27$(SIGNAL_CONTEXT_CHK_TESTS): tm-signal.S
diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-force-tm.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-force-tm.c
new file mode 100644
index 000000000000..31717625f318
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-signal-context-force-tm.c
@@ -0,0 +1,184 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright 2018, Breno Leitao, Gustavo Romero, IBM Corp.
4 *
5 * This test raises a SIGUSR1 signal and toggles the MSR[TS]
6 * field in the signal handler. With MSR[TS] set, the kernel will
7 * force a recheckpoint, which may cause a segfault when returning to
8 * user space. Since the test needs to re-run, the segfault needs to be
9 * caught and handled.
10 *
11 * In order to continue the test even after a segfault, the context is
12 * saved prior to the signal being raised, and it is restored when there is
13 * a segmentation fault. This happens COUNT_MAX times.
14 *
15 * This test never fails (i.e. it never returns EXIT_FAILURE). It either
16 * succeeds or crashes the kernel (on a buggy kernel).
17 */
18
19#define _GNU_SOURCE
20#include <stdio.h>
21#include <stdlib.h>
22#include <signal.h>
23#include <string.h>
24#include <ucontext.h>
25#include <unistd.h>
26#include <sys/mman.h>
27
28#include "tm.h"
29#include "utils.h"
30#include "reg.h"
31
32#define COUNT_MAX 5000 /* Number of iterations */
33
34/*
35 * This test only runs on 64-bit systems. Unset MSR_TS_S to avoid a
36 * compilation issue on 32-bit systems. There is no side effect, since
37 * the whole test is skipped when not running on a 64-bit system.
38 */
39#ifndef __powerpc64__
40#undef MSR_TS_S
41#define MSR_TS_S 0
42#endif
43
44/* Saved contexts: the test will crash and we want to recover */
45ucontext_t init_context, main_context;
46
47static int count, first_time;
48
49void usr_signal_handler(int signo, siginfo_t *si, void *uc)
50{
51 ucontext_t *ucp = uc;
52 int ret;
53
54 /*
55 * Allocate memory in the signal handler and deliberately never
56 * free it, forcing the heap to grow; the memory leak is exactly
57 * what we want here.
58 */
59 ucp->uc_link = mmap(NULL, sizeof(ucontext_t),
60 PROT_READ | PROT_WRITE,
61 MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
62 if (ucp->uc_link == MAP_FAILED) {
63 perror("Mmap failed");
64 exit(-1);
65 }
66
67 /* Force the page to be populated via a page fault on next access */
68 ret = madvise(ucp->uc_link, sizeof(ucontext_t), MADV_DONTNEED);
69 if (ret) {
70 perror("madvise failed");
71 exit(-1);
72 }
73
74 memcpy(&ucp->uc_link->uc_mcontext, &ucp->uc_mcontext,
75 sizeof(ucp->uc_mcontext));
76
77 /* Force the TS (transactional state) bits on in the saved MSR */
78 UCONTEXT_MSR(ucp) |= MSR_TS_S;
79
80 /*
81 * A fork inside a signal handler seems to be more efficient than a
82 * fork() prior to the signal being raised.
83 */
84 if (fork() == 0) {
85 /*
86 * Both child and parent will return, but the child returns
87 * with count already at COUNT_MAX, so it exits on the next
88 * segfault, while the parent continues to loop.
89 */
90 count = COUNT_MAX;
91 }
92
93 /*
94 * If the change above does not hit the bug, it causes a
95 * segmentation fault, since the checkpointed (ck) structures are NULL.
96 */
97}
98
99void seg_signal_handler(int signo, siginfo_t *si, void *uc)
100{
101 if (count == COUNT_MAX) {
102 /* Return to tm_signal_context_force_tm() and exit */
103 setcontext(&main_context);
104 }
105
106 count++;
107
108 /* Reexecute the test */
109 setcontext(&init_context);
110}
111
112void tm_trap_test(void)
113{
114 struct sigaction usr_sa, seg_sa;
115 stack_t ss;
116
117 usr_sa.sa_flags = SA_SIGINFO | SA_ONSTACK;
118 usr_sa.sa_sigaction = usr_signal_handler;
119
120 seg_sa.sa_flags = SA_SIGINFO;
121 seg_sa.sa_sigaction = seg_signal_handler;
122
123 /*
124 * Set initial context. Will get back here from
125 * seg_signal_handler()
126 */
127 getcontext(&init_context);
128
129 /* Allocate an alternative signal stack area */
130 ss.ss_sp = mmap(NULL, SIGSTKSZ, PROT_READ | PROT_WRITE,
131 MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
132 ss.ss_size = SIGSTKSZ;
133 ss.ss_flags = 0;
134
135 if (ss.ss_sp == MAP_FAILED) {
136 perror("mmap error");
137 exit(-1);
138 }
139
140 /* Force the allocation through a page fault */
141 if (madvise(ss.ss_sp, SIGSTKSZ, MADV_DONTNEED)) {
142 perror("madvise\n");
143 exit(-1);
144 }
145
146 /* Set an alternative stack so that a page fault is generated
147 * when the signal is raised.
148 */
149 if (sigaltstack(&ss, NULL)) {
150 perror("sigaltstack\n");
151 exit(-1);
152 }
153
154 /* The signal handler will enable MSR_TS */
155 sigaction(SIGUSR1, &usr_sa, NULL);
156 /* If the kernel does not crash, the test segfaults; catch it so we can retest */
157 sigaction(SIGSEGV, &seg_sa, NULL);
158
159 raise(SIGUSR1);
160}
161
162int tm_signal_context_force_tm(void)
163{
164 SKIP_IF(!have_htm());
165 /*
166 * Skip if not running on a 64-bit system: the 32-bit mcontext
167 * MSR has no room for the TS bits, so they cannot be set
168 * there.
169 */
170 SKIP_IF(!is_ppc64le());
171
172 /* Will get back here after COUNT_MAX iterations */
173 getcontext(&main_context);
174
175 if (!first_time++)
176 tm_trap_test();
177
178 return EXIT_SUCCESS;
179}
180
181int main(int argc, char **argv)
182{
183 return test_harness(tm_signal_context_force_tm, "tm_signal_context_force_tm");
184}
diff --git a/tools/testing/selftests/safesetid/.gitignore b/tools/testing/selftests/safesetid/.gitignore
new file mode 100644
index 000000000000..9c1a629bca01
--- /dev/null
+++ b/tools/testing/selftests/safesetid/.gitignore
@@ -0,0 +1 @@
safesetid-test
diff --git a/tools/testing/selftests/safesetid/Makefile b/tools/testing/selftests/safesetid/Makefile
new file mode 100644
index 000000000000..98da7a504737
--- /dev/null
+++ b/tools/testing/selftests/safesetid/Makefile
@@ -0,0 +1,8 @@
1# SPDX-License-Identifier: GPL-2.0
2# Makefile for safesetid selftests.
3CFLAGS = -Wall -lcap -O2
4
5TEST_PROGS := run_tests.sh
6TEST_GEN_FILES := safesetid-test
7
8include ../lib.mk
diff --git a/tools/testing/selftests/safesetid/config b/tools/testing/selftests/safesetid/config
new file mode 100644
index 000000000000..9d44e5c2e096
--- /dev/null
+++ b/tools/testing/selftests/safesetid/config
@@ -0,0 +1,2 @@
1CONFIG_SECURITY=y
2CONFIG_SECURITYFS=y
diff --git a/tools/testing/selftests/safesetid/safesetid-test.c b/tools/testing/selftests/safesetid/safesetid-test.c
new file mode 100644
index 000000000000..892c8e8b1b8b
--- /dev/null
+++ b/tools/testing/selftests/safesetid/safesetid-test.c
@@ -0,0 +1,334 @@
1// SPDX-License-Identifier: GPL-2.0
2#define _GNU_SOURCE
3#include <stdio.h>
4#include <errno.h>
5#include <pwd.h>
6#include <string.h>
7#include <syscall.h>
8#include <sys/capability.h>
9#include <sys/types.h>
10#include <sys/mount.h>
11#include <sys/prctl.h>
12#include <sys/wait.h>
13#include <stdlib.h>
14#include <unistd.h>
15#include <fcntl.h>
16#include <stdbool.h>
17#include <stdarg.h>
18
19#ifndef CLONE_NEWUSER
20# define CLONE_NEWUSER 0x10000000
21#endif
22
23#define ROOT_USER 0
24#define RESTRICTED_PARENT 1
25#define ALLOWED_CHILD1 2
26#define ALLOWED_CHILD2 3
27#define NO_POLICY_USER 4
28
29char *add_whitelist_policy_file = "/sys/kernel/security/safesetid/add_whitelist_policy";
30
31static void die(char *fmt, ...)
32{
33 va_list ap;
34 va_start(ap, fmt);
35 vfprintf(stderr, fmt, ap);
36 va_end(ap);
37 exit(EXIT_FAILURE);
38}
39
40static bool vmaybe_write_file(bool enoent_ok, char *filename, char *fmt, va_list ap)
41{
42 char buf[4096];
43 int fd;
44 ssize_t written;
45 int buf_len;
46
47 buf_len = vsnprintf(buf, sizeof(buf), fmt, ap);
48 if (buf_len < 0) {
49 printf("vsnprintf failed: %s\n",
50 strerror(errno));
51 return false;
52 }
53 if (buf_len >= sizeof(buf)) {
54 printf("vsnprintf output truncated\n");
55 return false;
56 }
57
58 fd = open(filename, O_WRONLY);
59 if (fd < 0) {
60 if ((errno == ENOENT) && enoent_ok)
61 return true;
62 return false;
63 }
64 written = write(fd, buf, buf_len);
65 if (written != buf_len) {
66 if (written >= 0) {
67 printf("short write to %s\n", filename);
68 return false;
69 } else {
70 printf("write to %s failed: %s\n",
71 filename, strerror(errno));
72 return false;
73 }
74 }
75 if (close(fd) != 0) {
76 printf("close of %s failed: %s\n",
77 filename, strerror(errno));
78 return false;
79 }
80 return true;
81}
82
83static bool write_file(char *filename, char *fmt, ...)
84{
85 va_list ap;
86 bool ret;
87
88 va_start(ap, fmt);
89 ret = vmaybe_write_file(false, filename, fmt, ap);
90 va_end(ap);
91
92 return ret;
93}
94
95static void ensure_user_exists(uid_t uid)
96{
97 struct passwd p;
98
99 FILE *fd;
100 char name_str[10];
101
102 if (getpwuid(uid) == NULL) {
103 memset(&p, 0x00, sizeof(p));
104 fd = fopen("/etc/passwd", "a");
105 if (fd == NULL)
106 die("couldn't open file\n");
107 if (fseek(fd, 0, SEEK_END))
108 die("couldn't fseek\n");
109 snprintf(name_str, 10, "%d", uid);
110 p.pw_name = name_str;
111 p.pw_uid = uid;
112 p.pw_gecos = "Test account";
113 p.pw_dir = "/dev/null";
114 p.pw_shell = "/bin/false";
115 int value = putpwent(&p, fd);
116 if (value != 0)
117 die("putpwent failed\n");
118 if (fclose(fd))
119 die("fclose failed\n");
120 }
121}
122
123static void ensure_securityfs_mounted(void)
124{
125 int fd = open(add_whitelist_policy_file, O_WRONLY);
126 if (fd < 0) {
127 if (errno == ENOENT) {
128 // Need to mount securityfs
129 if (mount("securityfs", "/sys/kernel/security",
130 "securityfs", 0, NULL) < 0)
131 die("mounting securityfs failed\n");
132 } else {
133 die("couldn't find securityfs for unknown reason\n");
134 }
135 } else {
136 if (close(fd) != 0) {
137 die("close of %s failed: %s\n",
138 add_whitelist_policy_file, strerror(errno));
139 }
140 }
141}
142
143static void write_policies(void)
144{
145 ssize_t written;
146 int fd;
147
148 fd = open(add_whitelist_policy_file, O_WRONLY);
149 if (fd < 0)
150 die("cant open add_whitelist_policy file\n");
151 written = write(fd, "1:2", strlen("1:2"));
152 if (written != strlen("1:2")) {
153 if (written >= 0) {
154 die("short write to %s\n", add_whitelist_policy_file);
155 } else {
156 die("write to %s failed: %s\n",
157 add_whitelist_policy_file, strerror(errno));
158 }
159 }
160 written = write(fd, "1:3", strlen("1:3"));
161 if (written != strlen("1:3")) {
162 if (written >= 0) {
163 die("short write to %s\n", add_whitelist_policy_file);
164 } else {
165 die("write to %s failed: %s\n",
166 add_whitelist_policy_file, strerror(errno));
167 }
168 }
169 if (close(fd) != 0) {
170 die("close of %s failed: %s\n",
171 add_whitelist_policy_file, strerror(errno));
172 }
173}
174
175static bool test_userns(bool expect_success)
176{
177 uid_t uid;
178 char map_file_name[32];
179 size_t sz = sizeof(map_file_name);
180 pid_t cpid;
181 bool success;
182
183 uid = getuid();
184
185 int clone_flags = CLONE_NEWUSER;
186 cpid = syscall(SYS_clone, clone_flags, NULL);
187 if (cpid == -1) {
188 printf("clone failed");
189 return false;
190 }
191
192 if (cpid == 0) { /* Code executed by child */
193 // Give parent 1 second to write map file
194 sleep(1);
195 exit(EXIT_SUCCESS);
196 } else { /* Code executed by parent */
197 if(snprintf(map_file_name, sz, "/proc/%d/uid_map", cpid) < 0) {
198 printf("preparing file name string failed");
199 return false;
200 }
201 success = write_file(map_file_name, "0 0 1", uid);
202 return success == expect_success;
203 }
204
205 printf("should not reach here");
206 return false;
207}
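The "0 0 1" string written above follows the /proc/[pid]/uid_map format "<uid inside ns> <uid outside ns> <length>", i.e. a one-entry map of root inside the new user namespace to root outside. A standalone sketch of the same write (map_ns_root is a hypothetical helper; error reporting trimmed):

	#include <fcntl.h>
	#include <stdbool.h>
	#include <stdio.h>
	#include <unistd.h>

	static bool map_ns_root(pid_t cpid)
	{
		char path[32];
		int fd;
		bool ok;

		snprintf(path, sizeof(path), "/proc/%d/uid_map", cpid);
		fd = open(path, O_WRONLY);
		if (fd < 0)
			return false;
		/* "0 0 1": uid 0 in the child userns maps to uid 0 outside */
		ok = write(fd, "0 0 1", 5) == 5;
		close(fd);
		return ok;
	}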
208
209static void test_setuid(uid_t child_uid, bool expect_success)
210{
211 pid_t cpid, w;
212 int wstatus;
213
214 cpid = fork();
215 if (cpid == -1) {
216 die("fork\n");
217 }
218
219 if (cpid == 0) { /* Code executed by child */
220 setuid(child_uid);
221 if (getuid() == child_uid)
222 exit(EXIT_SUCCESS);
223 else
224 exit(EXIT_FAILURE);
225 } else { /* Code executed by parent */
226 do {
227 w = waitpid(cpid, &wstatus, WUNTRACED | WCONTINUED);
228 if (w == -1) {
229 die("waitpid\n");
230 }
231
232 if (WIFEXITED(wstatus)) {
233 if (WEXITSTATUS(wstatus) == EXIT_SUCCESS) {
234 if (expect_success) {
235 return;
236 } else {
237 die("unexpected success\n");
238 }
239 } else {
240 if (expect_success) {
241 die("unexpected failure\n");
242 } else {
243 return;
244 }
245 }
246 } else if (WIFSIGNALED(wstatus)) {
247 if (WTERMSIG(wstatus) == 9) { /* SIGKILL */
248 if (expect_success)
249 die("killed unexpectedly\n");
250 else
251 return;
252 } else {
253 die("unexpected signal: %d\n", wstatus);
254 }
255 } else {
256 die("unexpected status: %d\n", wstatus);
257 }
258 } while (!WIFEXITED(wstatus) && !WIFSIGNALED(wstatus));
259 }
260
261 die("should not reach here\n");
262}
263
264static void ensure_users_exist(void)
265{
266 ensure_user_exists(ROOT_USER);
267 ensure_user_exists(RESTRICTED_PARENT);
268 ensure_user_exists(ALLOWED_CHILD1);
269 ensure_user_exists(ALLOWED_CHILD2);
270 ensure_user_exists(NO_POLICY_USER);
271}
272
273static void drop_caps(bool setid_retained)
274{
275 cap_value_t cap_values[] = {CAP_SETUID, CAP_SETGID};
276 cap_t caps;
277
278 caps = cap_get_proc();
279 if (setid_retained)
280 cap_set_flag(caps, CAP_EFFECTIVE, 2, cap_values, CAP_SET);
281 else
282 cap_clear(caps);
283 cap_set_proc(caps);
284 cap_free(caps);
285}
286
287int main(int argc, char **argv)
288{
289 ensure_users_exist();
290 ensure_securityfs_mounted();
291 write_policies();
292
293 if (prctl(PR_SET_KEEPCAPS, 1L))
294 die("Error with set keepcaps\n");
295
296 // First test to make sure we can write userns mappings from a user
297 // that doesn't have any restrictions (as long as it has CAP_SETUID).
298 setuid(NO_POLICY_USER);
299 setgid(NO_POLICY_USER);
300
301 // Take away all but setid caps
302 drop_caps(true);
303
304 // Need PR_SET_DUMPABLE flag set so we can write /proc/[pid]/uid_map
305 // from non-root parent process.
306 if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0))
307 die("Error with set dumpable\n");
308
309 if (!test_userns(true)) {
310 die("test_userns failed when it should work\n");
311 }
312
313 setuid(RESTRICTED_PARENT);
314 setgid(RESTRICTED_PARENT);
315
316 test_setuid(ROOT_USER, false);
317 test_setuid(ALLOWED_CHILD1, true);
318 test_setuid(ALLOWED_CHILD2, true);
319 test_setuid(NO_POLICY_USER, false);
320
321 if (!test_userns(false)) {
322 die("test_userns worked when it should fail\n");
323 }
324
325 // Now take away all caps
326 drop_caps(false);
327 test_setuid(2, false);
328 test_setuid(3, false);
329 test_setuid(4, false);
330
331 // NOTE: this test doesn't clean up users that were created in
332 // /etc/passwd or flush policies that were added to the LSM.
333 return EXIT_SUCCESS;
334}
diff --git a/tools/testing/selftests/safesetid/safesetid-test.sh b/tools/testing/selftests/safesetid/safesetid-test.sh
new file mode 100755
index 000000000000..e4fdce675c54
--- /dev/null
+++ b/tools/testing/selftests/safesetid/safesetid-test.sh
@@ -0,0 +1,26 @@
1#!/bin/bash
2
3TCID="safesetid-test.sh"
4errcode=0
5
6# Kselftest framework requirement - SKIP code is 4.
7ksft_skip=4
8
9check_root()
10{
11 uid=$(id -u)
12 if [ $uid -ne 0 ]; then
13 echo $TCID: must be run as root >&2
14 exit $ksft_skip
15 fi
16}
17
18main_function()
19{
20 check_root
21 ./safesetid-test
22}
23
24main_function
25echo "$TCID: done"
26exit $errcode
diff --git a/tools/testing/selftests/vm/map_hugetlb.c b/tools/testing/selftests/vm/map_hugetlb.c
index 9b777fa95f09..5a2d7b8efc40 100644
--- a/tools/testing/selftests/vm/map_hugetlb.c
+++ b/tools/testing/selftests/vm/map_hugetlb.c
@@ -23,6 +23,14 @@
23#define MAP_HUGETLB 0x40000 /* arch specific */ 23#define MAP_HUGETLB 0x40000 /* arch specific */
24#endif 24#endif
25 25
26#ifndef MAP_HUGE_SHIFT
27#define MAP_HUGE_SHIFT 26
28#endif
29
30#ifndef MAP_HUGE_MASK
31#define MAP_HUGE_MASK 0x3f
32#endif
33
26/* Only ia64 requires this */ 34/* Only ia64 requires this */
27#ifdef __ia64__ 35#ifdef __ia64__
28#define ADDR (void *)(0x8000000000000000UL) 36#define ADDR (void *)(0x8000000000000000UL)
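The MAP_HUGE_SHIFT/MAP_HUGE_MASK pair defined above encodes log2 of the requested huge page size into the upper mmap() flag bits. A minimal sketch requesting 2 MiB pages explicitly (map_2m is a hypothetical helper; assumes 2 MiB hugepages are configured on the system):

	#include <stddef.h>
	#include <sys/mman.h>

	static void *map_2m(size_t len)
	{
		int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;

		/* 2 MiB = 1 << 21, so place shift 21 in flag bits 26..31 */
		flags |= (21 & MAP_HUGE_MASK) << MAP_HUGE_SHIFT;
		return mmap(NULL, len, PROT_READ | PROT_WRITE, flags, -1, 0);
	}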
@@ -58,12 +66,29 @@ static int read_bytes(char *addr)
58 return 0; 66 return 0;
59} 67}
60 68
61int main(void) 69int main(int argc, char **argv)
62{ 70{
63 void *addr; 71 void *addr;
64 int ret; 72 int ret;
73 size_t length = LENGTH;
74 int flags = FLAGS;
75 int shift = 0;
76
77 if (argc > 1)
78 length = atol(argv[1]) << 20;
79 if (argc > 2) {
80 shift = atoi(argv[2]);
81 if (shift)
82 flags |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT;
83 }
84
85 if (shift)
86 printf("%u kB hugepages\n", 1 << shift);
87 else
88 printf("Default size hugepages\n");
89 printf("Mapping %lu Mbytes\n", (unsigned long)length >> 20);
65 90
66 addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, -1, 0); 91 addr = mmap(ADDR, length, PROTECTION, flags, -1, 0);
67 if (addr == MAP_FAILED) { 92 if (addr == MAP_FAILED) {
68 perror("mmap"); 93 perror("mmap");
69 exit(1); 94 exit(1);