author    Linus Torvalds <torvalds@linux-foundation.org>  2015-09-05 17:27:38 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2015-09-05 17:27:38 -0400
commit    6c0f568e84a3cfc775682311d65205462c3f3bc1 (patch)
tree      5105a137a9ea2459d55e895d3c096bbd31274724
parent    c82199061009d1561e31e17fca5e47a87cb7ff4c (diff)
parent    559ec2f8fd50981821621f52db5e1a8ffcf8d792 (diff)
Merge branch 'akpm' (patches from Andrew)

Merge patch-bomb from Andrew Morton:

 - a few misc things

 - Andy's "ambient capabilities"

 - fs/notify updates

 - the ocfs2 queue

 - kernel/watchdog.c updates and feature work.

 - some of MM.  Includes Andrea's userfaultfd feature.

   [ Hadn't noticed that userfaultfd was 'default y' when applying the
     patches, so that got fixed in this merge instead.  We do _not_ mark
     new features that nobody uses yet 'default y'   - Linus ]

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (118 commits)
  mm/hugetlb.c: make vma_has_reserves() return bool
  mm/madvise.c: make madvise_behaviour_valid() return bool
  mm/memory.c: make tlb_next_batch() return bool
  mm/dmapool.c: change is_page_busy() return from int to bool
  mm: remove struct node_active_region
  mremap: simplify the "overlap" check in mremap_to()
  mremap: don't do uneccesary checks if new_len == old_len
  mremap: don't do mm_populate(new_addr) on failure
  mm: move ->mremap() from file_operations to vm_operations_struct
  mremap: don't leak new_vma if f_op->mremap() fails
  mm/hugetlb.c: make vma_shareable() return bool
  mm: make GUP handle pfn mapping unless FOLL_GET is requested
  mm: fix status code which move_pages() returns for zero page
  mm: memcontrol: bring back the VM_BUG_ON() in mem_cgroup_swapout()
  genalloc: add support of multiple gen_pools per device
  genalloc: add name arg to gen_pool_get() and devm_gen_pool_create()
  mm/memblock: WARN_ON when nid differs from overlap region
  Documentation/features/vm: add feature description and arch support status for batched TLB flush after unmap
  mm: defer flush of writable TLB entries
  mm: send one IPI per CPU to TLB flush all entries after unmapping pages
  ...
-rw-r--r-- Documentation/features/vm/TLB/arch-support.txt | 40
-rw-r--r-- Documentation/ioctl/ioctl-number.txt | 1
-rw-r--r-- Documentation/vm/userfaultfd.txt | 144
-rw-r--r-- arch/arm/mach-at91/pm.c | 2
-rw-r--r-- arch/arm/mach-imx/pm-imx5.c | 2
-rw-r--r-- arch/arm/mach-imx/pm-imx6.c | 2
-rw-r--r-- arch/arm/mach-socfpga/pm.c | 2
-rw-r--r-- arch/sh/mm/init.c | 4
-rw-r--r-- arch/sh/mm/numa.c | 4
-rw-r--r-- arch/x86/Kconfig | 1
-rw-r--r-- arch/x86/entry/syscalls/syscall_32.tbl | 1
-rw-r--r-- arch/x86/entry/syscalls/syscall_64.tbl | 1
-rw-r--r-- arch/x86/include/asm/tlbflush.h | 6
-rw-r--r-- arch/x86/kernel/cpu/perf_event_intel.c | 9
-rw-r--r-- arch/x86/mm/tlb.c | 1
-rw-r--r-- drivers/base/node.c | 10
-rw-r--r-- drivers/media/platform/coda/coda-common.c | 2
-rw-r--r-- drivers/misc/sram.c | 8
-rw-r--r-- drivers/video/console/Kconfig | 2
-rw-r--r-- fs/Makefile | 1
-rw-r--r-- fs/aio.c | 27
-rw-r--r-- fs/ceph/super.c | 2
-rw-r--r-- fs/cifs/cifsfs.c | 6
-rw-r--r-- fs/ext4/super.c | 4
-rw-r--r-- fs/gfs2/super.c | 6
-rw-r--r-- fs/hfs/super.c | 4
-rw-r--r-- fs/hfsplus/options.c | 4
-rw-r--r-- fs/hostfs/hostfs_kern.c | 2
-rw-r--r-- fs/notify/dnotify/dnotify.c | 14
-rw-r--r-- fs/notify/fanotify/fanotify_user.c | 8
-rw-r--r-- fs/notify/fdinfo.c | 3
-rw-r--r-- fs/notify/fsnotify.c | 11
-rw-r--r-- fs/notify/fsnotify.h | 21
-rw-r--r-- fs/notify/inode_mark.c | 20
-rw-r--r-- fs/notify/mark.c | 113
-rw-r--r-- fs/notify/vfsmount_mark.c | 19
-rw-r--r-- fs/ntfs/super.c | 21
-rw-r--r-- fs/ocfs2/acl.c | 26
-rw-r--r-- fs/ocfs2/alloc.c | 148
-rw-r--r-- fs/ocfs2/aops.c | 54
-rw-r--r-- fs/ocfs2/buffer_head_io.c | 6
-rw-r--r-- fs/ocfs2/cluster/heartbeat.c | 69
-rw-r--r-- fs/ocfs2/dir.c | 70
-rw-r--r-- fs/ocfs2/dlm/dlmdomain.c | 78
-rw-r--r-- fs/ocfs2/dlm/dlmmaster.c | 22
-rw-r--r-- fs/ocfs2/dlm/dlmrecovery.c | 6
-rw-r--r-- fs/ocfs2/dlm/dlmthread.c | 10
-rw-r--r-- fs/ocfs2/dlmglue.c | 2
-rw-r--r-- fs/ocfs2/extent_map.c | 22
-rw-r--r-- fs/ocfs2/file.c | 53
-rw-r--r-- fs/ocfs2/inode.c | 49
-rw-r--r-- fs/ocfs2/inode.h | 2
-rw-r--r-- fs/ocfs2/journal.c | 32
-rw-r--r-- fs/ocfs2/localalloc.c | 3
-rw-r--r-- fs/ocfs2/move_extents.c | 8
-rw-r--r-- fs/ocfs2/namei.c | 96
-rw-r--r-- fs/ocfs2/ocfs2.h | 2
-rw-r--r-- fs/ocfs2/quota_local.c | 3
-rw-r--r-- fs/ocfs2/refcounttree.c | 81
-rw-r--r-- fs/ocfs2/suballoc.c | 96
-rw-r--r-- fs/ocfs2/super.c | 73
-rw-r--r-- fs/ocfs2/super.h | 8
-rw-r--r-- fs/ocfs2/xattr.c | 51
-rw-r--r-- fs/overlayfs/super.c | 6
-rw-r--r-- fs/proc/array.c | 5
-rw-r--r-- fs/proc/task_mmu.c | 2
-rw-r--r-- fs/reiserfs/super.c | 8
-rw-r--r-- fs/userfaultfd.c | 1330
-rw-r--r-- fs/xfs/xfs_super.c | 4
-rw-r--r-- include/linux/cred.h | 8
-rw-r--r-- include/linux/fs.h | 1
-rw-r--r-- include/linux/fsnotify_backend.h | 55
-rw-r--r-- include/linux/genalloc.h | 6
-rw-r--r-- include/linux/kthread.h | 2
-rw-r--r-- include/linux/mm.h | 5
-rw-r--r-- include/linux/mm_types.h | 12
-rw-r--r-- include/linux/mmzone.h | 8
-rw-r--r-- include/linux/nmi.h | 15
-rw-r--r-- include/linux/rmap.h | 3
-rw-r--r-- include/linux/sched.h | 23
-rw-r--r-- include/linux/seq_file.h | 35
-rw-r--r-- include/linux/slab.h | 10
-rw-r--r-- include/linux/smpboot.h | 11
-rw-r--r-- include/linux/syscalls.h | 1
-rw-r--r-- include/linux/userfaultfd_k.h | 85
-rw-r--r-- include/linux/wait.h | 5
-rw-r--r-- include/linux/watchdog.h | 8
-rw-r--r-- include/trace/events/tlb.h | 3
-rw-r--r-- include/uapi/linux/Kbuild | 1
-rw-r--r-- include/uapi/linux/prctl.h | 7
-rw-r--r-- include/uapi/linux/securebits.h | 11
-rw-r--r-- include/uapi/linux/userfaultfd.h | 169
-rw-r--r-- init/Kconfig | 18
-rw-r--r-- kernel/cgroup.c | 7
-rw-r--r-- kernel/fork.c | 3
-rw-r--r-- kernel/kthread.c | 7
-rw-r--r-- kernel/sched/wait.c | 7
-rw-r--r-- kernel/smpboot.c | 27
-rw-r--r-- kernel/sys_ni.c | 1
-rw-r--r-- kernel/user_namespace.c | 1
-rw-r--r-- kernel/watchdog.c | 189
-rw-r--r-- lib/genalloc.c | 110
-rw-r--r-- mm/Makefile | 1
-rw-r--r-- mm/dmapool.c | 2
-rw-r--r-- mm/gup.c | 60
-rw-r--r-- mm/huge_memory.c | 79
-rw-r--r-- mm/hugetlb.c | 18
-rw-r--r-- mm/internal.h | 15
-rw-r--r-- mm/madvise.c | 9
-rw-r--r-- mm/memblock.c | 3
-rw-r--r-- mm/memcontrol.c | 8
-rw-r--r-- mm/memory.c | 26
-rw-r--r-- mm/memory_hotplug.c | 10
-rw-r--r-- mm/mempolicy.c | 4
-rw-r--r-- mm/migrate.c | 18
-rw-r--r-- mm/mlock.c | 3
-rw-r--r-- mm/mmap.c | 40
-rw-r--r-- mm/mprotect.c | 3
-rw-r--r-- mm/mremap.c | 50
-rw-r--r-- mm/rmap.c | 118
-rw-r--r-- mm/slab.c | 13
-rw-r--r-- mm/slab.h | 11
-rw-r--r-- mm/slab_common.c | 23
-rw-r--r-- mm/slob.c | 13
-rw-r--r-- mm/slub.c | 204
-rw-r--r-- mm/userfaultfd.c | 308
-rw-r--r-- mm/vmscan.c | 30
-rw-r--r-- net/ceph/ceph_common.c | 7
-rw-r--r-- net/sunrpc/sched.c | 2
-rwxr-xr-x scripts/Lindent | 3
-rwxr-xr-x scripts/decode_stacktrace.sh | 5
-rwxr-xr-x scripts/kernel-doc | 38
-rw-r--r-- scripts/spelling.txt | 29
-rw-r--r-- security/commoncap.c | 103
-rw-r--r-- security/keys/process_keys.c | 1
-rw-r--r-- security/selinux/hooks.c | 2
-rw-r--r-- tools/testing/selftests/capabilities/.gitignore | 2
-rw-r--r-- tools/testing/selftests/capabilities/Makefile | 18
-rw-r--r-- tools/testing/selftests/capabilities/test_execve.c | 427
-rw-r--r-- tools/testing/selftests/capabilities/validate_cap.c | 73
-rw-r--r-- tools/testing/selftests/vm/Makefile | 3
-rwxr-xr-x tools/testing/selftests/vm/run_vmtests | 11
-rw-r--r-- tools/testing/selftests/vm/userfaultfd.c | 636
143 files changed, 5174 insertions, 1062 deletions
diff --git a/Documentation/features/vm/TLB/arch-support.txt b/Documentation/features/vm/TLB/arch-support.txt
new file mode 100644
index 000000000000..261b92e2fb1a
--- /dev/null
+++ b/Documentation/features/vm/TLB/arch-support.txt
@@ -0,0 +1,40 @@
#
# Feature name:          batch-unmap-tlb-flush
#         Kconfig:       ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
#         description:   arch supports deferral of TLB flush until multiple pages are unmapped
#
    -----------------------
    |         arch |status|
    -----------------------
    |       alpha: | TODO |
    |         arc: | TODO |
    |         arm: | TODO |
    |       arm64: | TODO |
    |       avr32: |  ..  |
    |    blackfin: | TODO |
    |         c6x: |  ..  |
    |        cris: |  ..  |
    |         frv: |  ..  |
    |       h8300: |  ..  |
    |     hexagon: | TODO |
    |        ia64: | TODO |
    |        m32r: | TODO |
    |        m68k: |  ..  |
    |       metag: | TODO |
    |  microblaze: |  ..  |
    |        mips: | TODO |
    |     mn10300: | TODO |
    |       nios2: |  ..  |
    |    openrisc: |  ..  |
    |      parisc: | TODO |
    |     powerpc: | TODO |
    |        s390: | TODO |
    |       score: |  ..  |
    |          sh: | TODO |
    |       sparc: | TODO |
    |        tile: | TODO |
    |          um: |  ..  |
    |   unicore32: |  ..  |
    |         x86: |  ok  |
    |      xtensa: | TODO |
    -----------------------
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index 64df08db4657..39ac6546d4a4 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -303,6 +303,7 @@ Code Seq#(hex) Include File Comments
 0xA3	80-8F	Port ACL		in development:
					<mailto:tlewis@mindspring.com>
 0xA3	90-9F	linux/dtlk.h
+0xAA	00-3F	linux/uapi/linux/userfaultfd.h
 0xAB	00-1F	linux/nbd.h
 0xAC	00-1F	linux/raw.h
 0xAD	00	Netfilter device	in development:
diff --git a/Documentation/vm/userfaultfd.txt b/Documentation/vm/userfaultfd.txt
new file mode 100644
index 000000000000..70a3c94d1941
--- /dev/null
+++ b/Documentation/vm/userfaultfd.txt
@@ -0,0 +1,144 @@
= Userfaultfd =

== Objective ==

Userfaults allow the implementation of on-demand paging from userland
and more generally they allow userland to take control of various
memory page faults, something otherwise only the kernel code could do.

For example userfaults allows a proper and more optimal implementation
of the PROT_NONE+SIGSEGV trick.

== Design ==

Userfaults are delivered and resolved through the userfaultfd syscall.

The userfaultfd (aside from registering and unregistering virtual
memory ranges) provides two primary functionalities:

1) read/POLLIN protocol to notify a userland thread of the faults
   happening

2) various UFFDIO_* ioctls that can manage the virtual memory regions
   registered in the userfaultfd that allows userland to efficiently
   resolve the userfaults it receives via 1) or to manage the virtual
   memory in the background

The real advantage of userfaults if compared to regular virtual memory
management of mremap/mprotect is that the userfaults in all their
operations never involve heavyweight structures like vmas (in fact the
userfaultfd runtime load never takes the mmap_sem for writing).

Vmas are not suitable for page- (or hugepage) granular fault tracking
when dealing with virtual address spaces that could span
Terabytes. Too many vmas would be needed for that.

The userfaultfd once opened by invoking the syscall, can also be
passed using unix domain sockets to a manager process, so the same
manager process could handle the userfaults of a multitude of
different processes without them being aware about what is going on
(well of course unless they later try to use the userfaultfd
themselves on the same region the manager is already tracking, which
is a corner case that would currently return -EBUSY).

== API ==

When first opened the userfaultfd must be enabled invoking the
UFFDIO_API ioctl specifying a uffdio_api.api value set to UFFD_API (or
a later API version) which will specify the read/POLLIN protocol
userland intends to speak on the UFFD and the uffdio_api.features
userland requires. The UFFDIO_API ioctl if successful (i.e. if the
requested uffdio_api.api is spoken also by the running kernel and the
requested features are going to be enabled) will return into
uffdio_api.features and uffdio_api.ioctls two 64bit bitmasks of
respectively all the available features of the read(2) protocol and
the generic ioctl available.

Once the userfaultfd has been enabled the UFFDIO_REGISTER ioctl should
be invoked (if present in the returned uffdio_api.ioctls bitmask) to
register a memory range in the userfaultfd by setting the
uffdio_register structure accordingly. The uffdio_register.mode
bitmask will specify to the kernel which kind of faults to track for
the range (UFFDIO_REGISTER_MODE_MISSING would track missing
pages). The UFFDIO_REGISTER ioctl will return the
uffdio_register.ioctls bitmask of ioctls that are suitable to resolve
userfaults on the range registered. Not all ioctls will necessarily be
supported for all memory types depending on the underlying virtual
memory backend (anonymous memory vs tmpfs vs real filebacked
mappings).

Userland can use the uffdio_register.ioctls to manage the virtual
address space in the background (to add or potentially also remove
memory from the userfaultfd registered range). This means a userfault
could be triggering just before userland maps in the background the
user-faulted page.

The primary ioctl to resolve userfaults is UFFDIO_COPY. That
atomically copies a page into the userfault registered range and wakes
up the blocked userfaults (unless uffdio_copy.mode &
UFFDIO_COPY_MODE_DONTWAKE is set). Other ioctl works similarly to
UFFDIO_COPY. They're atomic as in guaranteeing that nothing can see an
half copied page since it'll keep userfaulting until the copy has
finished.

== QEMU/KVM ==

QEMU/KVM is using the userfaultfd syscall to implement postcopy live
migration. Postcopy live migration is one form of memory
externalization consisting of a virtual machine running with part or
all of its memory residing on a different node in the cloud. The
userfaultfd abstraction is generic enough that not a single line of
KVM kernel code had to be modified in order to add postcopy live
migration to QEMU.

Guest async page faults, FOLL_NOWAIT and all other GUP features work
just fine in combination with userfaults. Userfaults trigger async
page faults in the guest scheduler so those guest processes that
aren't waiting for userfaults (i.e. network bound) can keep running in
the guest vcpus.

It is generally beneficial to run one pass of precopy live migration
just before starting postcopy live migration, in order to avoid
generating userfaults for readonly guest regions.

The implementation of postcopy live migration currently uses one
single bidirectional socket but in the future two different sockets
will be used (to reduce the latency of the userfaults to the minimum
possible without having to decrease /proc/sys/net/ipv4/tcp_wmem).

The QEMU in the source node writes all pages that it knows are missing
in the destination node, into the socket, and the migration thread of
the QEMU running in the destination node runs UFFDIO_COPY|ZEROPAGE
ioctls on the userfaultfd in order to map the received pages into the
guest (UFFDIO_ZEROCOPY is used if the source page was a zero page).

A different postcopy thread in the destination node listens with
poll() to the userfaultfd in parallel. When a POLLIN event is
generated after a userfault triggers, the postcopy thread read() from
the userfaultfd and receives the fault address (or -EAGAIN in case the
userfault was already resolved and waken by a UFFDIO_COPY|ZEROPAGE run
by the parallel QEMU migration thread).

After the QEMU postcopy thread (running in the destination node) gets
the userfault address it writes the information about the missing page
into the socket. The QEMU source node receives the information and
roughly "seeks" to that page address and continues sending all
remaining missing pages from that new page offset. Soon after that
(just the time to flush the tcp_wmem queue through the network) the
migration thread in the QEMU running in the destination node will
receive the page that triggered the userfault and it'll map it as
usual with the UFFDIO_COPY|ZEROPAGE (without actually knowing if it
was spontaneously sent by the source or if it was an urgent page
requested through an userfault).

By the time the userfaults start, the QEMU in the destination node
doesn't need to keep any per-page state bitmap relative to the live
migration around and a single per-page bitmap has to be maintained in
the QEMU running in the source node to know which pages are still
missing in the destination node. The bitmap in the source node is
checked to find which missing pages to send in round robin and we seek
over it when receiving incoming userfaults. After sending each page of
course the bitmap is updated accordingly. It's also useful to avoid
sending the same page twice (in case the userfault is read by the
postcopy thread just before UFFDIO_COPY|ZEROPAGE runs in the migration
thread).
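
To make the UFFDIO_API -> UFFDIO_REGISTER -> read -> UFFDIO_COPY flow described
above concrete, here is a minimal userland sketch. It is not part of this merge;
it assumes a kernel with CONFIG_USERFAULTFD=y, the uapi header added by this
series, and a libc that exposes __NR_userfaultfd, and it drops all error handling.

/*
 * Hedged illustration only: one monitor thread resolves the first fault
 * taken by a "toucher" thread on a registered anonymous page.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <poll.h>
#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

static long page;

static void *toucher(void *area)
{
	/* First access faults on the missing page and blocks until the
	 * monitor thread resolves it with UFFDIO_COPY. */
	printf("first byte: %#x\n", *(volatile unsigned char *)area);
	return NULL;
}

int main(void)
{
	page = sysconf(_SC_PAGESIZE);
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

	/* Handshake: speak UFFD_API, no extra features requested. */
	struct uffdio_api api = { .api = UFFD_API };
	ioctl(uffd, UFFDIO_API, &api);

	/* Register one anonymous page for missing-page tracking. */
	char *area = mmap(NULL, page, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)area, .len = page },
		.mode  = UFFDIO_REGISTER_MODE_MISSING,
	};
	ioctl(uffd, UFFDIO_REGISTER, &reg);

	pthread_t t;
	pthread_create(&t, NULL, toucher, area);

	/* read/POLLIN protocol: wait for the fault notification and
	 * fetch the faulting address from the uffd_msg. */
	struct pollfd pfd = { .fd = uffd, .events = POLLIN };
	poll(&pfd, 1, -1);
	struct uffd_msg msg;
	read(uffd, &msg, sizeof(msg));

	/* Resolve it: atomically copy a filled page into the registered
	 * range; this also wakes the blocked toucher thread. */
	char *src = mmap(NULL, page, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	memset(src, 0xaa, page);
	struct uffdio_copy copy = {
		.dst = msg.arg.pagefault.address & ~((unsigned long long)page - 1),
		.src = (unsigned long)src,
		.len = page,
	};
	ioctl(uffd, UFFDIO_COPY, &copy);

	pthread_join(t, NULL);
	return 0;
}

Build with something like "gcc -pthread" on a kernel carrying this series; the
toucher's first read blocks until the UFFDIO_COPY lands and then prints 0xaa.
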
diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c
index 265ffeb2037e..80e277cfcc8b 100644
--- a/arch/arm/mach-at91/pm.c
+++ b/arch/arm/mach-at91/pm.c
@@ -369,7 +369,7 @@ static void __init at91_pm_sram_init(void)
369 return; 369 return;
370 } 370 }
371 371
372 sram_pool = gen_pool_get(&pdev->dev); 372 sram_pool = gen_pool_get(&pdev->dev, NULL);
373 if (!sram_pool) { 373 if (!sram_pool) {
374 pr_warn("%s: sram pool unavailable!\n", __func__); 374 pr_warn("%s: sram pool unavailable!\n", __func__);
375 return; 375 return;
diff --git a/arch/arm/mach-imx/pm-imx5.c b/arch/arm/mach-imx/pm-imx5.c
index 1885676c23c0..532d4b08276d 100644
--- a/arch/arm/mach-imx/pm-imx5.c
+++ b/arch/arm/mach-imx/pm-imx5.c
@@ -297,7 +297,7 @@ static int __init imx_suspend_alloc_ocram(
297 goto put_node; 297 goto put_node;
298 } 298 }
299 299
300 ocram_pool = gen_pool_get(&pdev->dev); 300 ocram_pool = gen_pool_get(&pdev->dev, NULL);
301 if (!ocram_pool) { 301 if (!ocram_pool) {
302 pr_warn("%s: ocram pool unavailable!\n", __func__); 302 pr_warn("%s: ocram pool unavailable!\n", __func__);
303 ret = -ENODEV; 303 ret = -ENODEV;
diff --git a/arch/arm/mach-imx/pm-imx6.c b/arch/arm/mach-imx/pm-imx6.c
index 93ecf559d06d..8ff8fc0b261c 100644
--- a/arch/arm/mach-imx/pm-imx6.c
+++ b/arch/arm/mach-imx/pm-imx6.c
@@ -451,7 +451,7 @@ static int __init imx6q_suspend_init(const struct imx6_pm_socdata *socdata)
451 goto put_node; 451 goto put_node;
452 } 452 }
453 453
454 ocram_pool = gen_pool_get(&pdev->dev); 454 ocram_pool = gen_pool_get(&pdev->dev, NULL);
455 if (!ocram_pool) { 455 if (!ocram_pool) {
456 pr_warn("%s: ocram pool unavailable!\n", __func__); 456 pr_warn("%s: ocram pool unavailable!\n", __func__);
457 ret = -ENODEV; 457 ret = -ENODEV;
diff --git a/arch/arm/mach-socfpga/pm.c b/arch/arm/mach-socfpga/pm.c
index 6a4199f2bffb..c378ab0c2431 100644
--- a/arch/arm/mach-socfpga/pm.c
+++ b/arch/arm/mach-socfpga/pm.c
@@ -56,7 +56,7 @@ static int socfpga_setup_ocram_self_refresh(void)
56 goto put_node; 56 goto put_node;
57 } 57 }
58 58
59 ocram_pool = gen_pool_get(&pdev->dev); 59 ocram_pool = gen_pool_get(&pdev->dev, NULL);
60 if (!ocram_pool) { 60 if (!ocram_pool) {
61 pr_warn("%s: ocram pool unavailable!\n", __func__); 61 pr_warn("%s: ocram pool unavailable!\n", __func__);
62 ret = -ENODEV; 62 ret = -ENODEV;
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 2790b6a64157..17f486233db0 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -488,7 +488,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
488int arch_add_memory(int nid, u64 start, u64 size) 488int arch_add_memory(int nid, u64 start, u64 size)
489{ 489{
490 pg_data_t *pgdat; 490 pg_data_t *pgdat;
491 unsigned long start_pfn = start >> PAGE_SHIFT; 491 unsigned long start_pfn = PFN_DOWN(start);
492 unsigned long nr_pages = size >> PAGE_SHIFT; 492 unsigned long nr_pages = size >> PAGE_SHIFT;
493 int ret; 493 int ret;
494 494
@@ -517,7 +517,7 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
517#ifdef CONFIG_MEMORY_HOTREMOVE 517#ifdef CONFIG_MEMORY_HOTREMOVE
518int arch_remove_memory(u64 start, u64 size) 518int arch_remove_memory(u64 start, u64 size)
519{ 519{
520 unsigned long start_pfn = start >> PAGE_SHIFT; 520 unsigned long start_pfn = PFN_DOWN(start);
521 unsigned long nr_pages = size >> PAGE_SHIFT; 521 unsigned long nr_pages = size >> PAGE_SHIFT;
522 struct zone *zone; 522 struct zone *zone;
523 int ret; 523 int ret;
diff --git a/arch/sh/mm/numa.c b/arch/sh/mm/numa.c
index bce52ba66206..05713d190247 100644
--- a/arch/sh/mm/numa.c
+++ b/arch/sh/mm/numa.c
@@ -33,8 +33,8 @@ void __init setup_bootmem_node(int nid, unsigned long start, unsigned long end)
33 /* Don't allow bogus node assignment */ 33 /* Don't allow bogus node assignment */
34 BUG_ON(nid >= MAX_NUMNODES || nid <= 0); 34 BUG_ON(nid >= MAX_NUMNODES || nid <= 0);
35 35
36 start_pfn = start >> PAGE_SHIFT; 36 start_pfn = PFN_DOWN(start);
37 end_pfn = end >> PAGE_SHIFT; 37 end_pfn = PFN_DOWN(end);
38 38
39 pmb_bolt_mapping((unsigned long)__va(start), start, end - start, 39 pmb_bolt_mapping((unsigned long)__va(start), start, end - start,
40 PAGE_KERNEL); 40 PAGE_KERNEL);
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 48f7433dac6f..117e2f373e50 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -41,6 +41,7 @@ config X86
41 select ARCH_USE_CMPXCHG_LOCKREF if X86_64 41 select ARCH_USE_CMPXCHG_LOCKREF if X86_64
42 select ARCH_USE_QUEUED_RWLOCKS 42 select ARCH_USE_QUEUED_RWLOCKS
43 select ARCH_USE_QUEUED_SPINLOCKS 43 select ARCH_USE_QUEUED_SPINLOCKS
44 select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP
44 select ARCH_WANTS_DYNAMIC_TASK_STRUCT 45 select ARCH_WANTS_DYNAMIC_TASK_STRUCT
45 select ARCH_WANT_FRAME_POINTERS 46 select ARCH_WANT_FRAME_POINTERS
46 select ARCH_WANT_IPC_PARSE_VERSION if X86_32 47 select ARCH_WANT_IPC_PARSE_VERSION if X86_32
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 25e3cf1cd8fd..477bfa6db370 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -380,3 +380,4 @@
380371 i386 recvfrom sys_recvfrom compat_sys_recvfrom 380371 i386 recvfrom sys_recvfrom compat_sys_recvfrom
381372 i386 recvmsg sys_recvmsg compat_sys_recvmsg 381372 i386 recvmsg sys_recvmsg compat_sys_recvmsg
382373 i386 shutdown sys_shutdown 382373 i386 shutdown sys_shutdown
383374 i386 userfaultfd sys_userfaultfd
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 9ef32d5f1b19..81c490634db9 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -329,6 +329,7 @@
329320 common kexec_file_load sys_kexec_file_load 329320 common kexec_file_load sys_kexec_file_load
330321 common bpf sys_bpf 330321 common bpf sys_bpf
331322 64 execveat stub_execveat 331322 64 execveat stub_execveat
332323 common userfaultfd sys_userfaultfd
332 333
333# 334#
334# x32-specific system call numbers start at 512 to avoid cache impact 335# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index cd791948b286..6df2029405a3 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -261,6 +261,12 @@ static inline void reset_lazy_tlbstate(void)
261 261
262#endif /* SMP */ 262#endif /* SMP */
263 263
264/* Not inlined due to inc_irq_stat not being defined yet */
265#define flush_tlb_local() { \
266 inc_irq_stat(irq_tlb_count); \
267 local_flush_tlb(); \
268}
269
264#ifndef CONFIG_PARAVIRT 270#ifndef CONFIG_PARAVIRT
265#define flush_tlb_others(mask, mm, start, end) \ 271#define flush_tlb_others(mask, mm, start, end) \
266 native_flush_tlb_others(mask, mm, start, end) 272 native_flush_tlb_others(mask, mm, start, end)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 3f124d553c5a..cd9b6d0b10bf 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -12,7 +12,7 @@
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/export.h> 14#include <linux/export.h>
15#include <linux/watchdog.h> 15#include <linux/nmi.h>
16 16
17#include <asm/cpufeature.h> 17#include <asm/cpufeature.h>
18#include <asm/hardirq.h> 18#include <asm/hardirq.h>
@@ -3627,7 +3627,10 @@ static __init int fixup_ht_bug(void)
3627 return 0; 3627 return 0;
3628 } 3628 }
3629 3629
3630 watchdog_nmi_disable_all(); 3630 if (lockup_detector_suspend() != 0) {
3631 pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n");
3632 return 0;
3633 }
3631 3634
3632 x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED); 3635 x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
3633 3636
@@ -3635,7 +3638,7 @@ static __init int fixup_ht_bug(void)
3635 x86_pmu.commit_scheduling = NULL; 3638 x86_pmu.commit_scheduling = NULL;
3636 x86_pmu.stop_scheduling = NULL; 3639 x86_pmu.stop_scheduling = NULL;
3637 3640
3638 watchdog_nmi_enable_all(); 3641 lockup_detector_resume();
3639 3642
3640 get_online_cpus(); 3643 get_online_cpus();
3641 3644
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 90b924acd982..8ddb5d0d66fb 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -140,6 +140,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
140 info.flush_end = end; 140 info.flush_end = end;
141 141
142 count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); 142 count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
143 trace_tlb_flush(TLB_REMOTE_SEND_IPI, end - start);
143 if (is_uv_system()) { 144 if (is_uv_system()) {
144 unsigned int cpu; 145 unsigned int cpu;
145 146
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 31df474d72f4..560751bad294 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -392,6 +392,16 @@ int register_mem_sect_under_node(struct memory_block *mem_blk, int nid)
392 for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) { 392 for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
393 int page_nid; 393 int page_nid;
394 394
395 /*
396 * memory block could have several absent sections from start.
397 * skip pfn range from absent section
398 */
399 if (!pfn_present(pfn)) {
400 pfn = round_down(pfn + PAGES_PER_SECTION,
401 PAGES_PER_SECTION) - 1;
402 continue;
403 }
404
395 page_nid = get_nid_for_pfn(pfn); 405 page_nid = get_nid_for_pfn(pfn);
396 if (page_nid < 0) 406 if (page_nid < 0)
397 continue; 407 continue;
diff --git a/drivers/media/platform/coda/coda-common.c b/drivers/media/platform/coda/coda-common.c
index 58f65486de33..284ac4c934ba 100644
--- a/drivers/media/platform/coda/coda-common.c
+++ b/drivers/media/platform/coda/coda-common.c
@@ -2157,7 +2157,7 @@ static int coda_probe(struct platform_device *pdev)
2157 /* Get IRAM pool from device tree or platform data */ 2157 /* Get IRAM pool from device tree or platform data */
2158 pool = of_gen_pool_get(np, "iram", 0); 2158 pool = of_gen_pool_get(np, "iram", 0);
2159 if (!pool && pdata) 2159 if (!pool && pdata)
2160 pool = gen_pool_get(pdata->iram_dev); 2160 pool = gen_pool_get(pdata->iram_dev, NULL);
2161 if (!pool) { 2161 if (!pool) {
2162 dev_err(&pdev->dev, "iram pool not available\n"); 2162 dev_err(&pdev->dev, "iram pool not available\n");
2163 return -ENOMEM; 2163 return -ENOMEM;
diff --git a/drivers/misc/sram.c b/drivers/misc/sram.c
index 15c33cc34a80..431e1dd528bc 100644
--- a/drivers/misc/sram.c
+++ b/drivers/misc/sram.c
@@ -186,10 +186,10 @@ static int sram_probe(struct platform_device *pdev)
186 if (IS_ERR(sram->virt_base)) 186 if (IS_ERR(sram->virt_base))
187 return PTR_ERR(sram->virt_base); 187 return PTR_ERR(sram->virt_base);
188 188
189 sram->pool = devm_gen_pool_create(sram->dev, 189 sram->pool = devm_gen_pool_create(sram->dev, ilog2(SRAM_GRANULARITY),
190 ilog2(SRAM_GRANULARITY), -1); 190 NUMA_NO_NODE, NULL);
191 if (!sram->pool) 191 if (IS_ERR(sram->pool))
192 return -ENOMEM; 192 return PTR_ERR(sram->pool);
193 193
194 ret = sram_reserve_regions(sram, res); 194 ret = sram_reserve_regions(sram, res);
195 if (ret) 195 if (ret)
diff --git a/drivers/video/console/Kconfig b/drivers/video/console/Kconfig
index ba97efc3bf70..071280643db7 100644
--- a/drivers/video/console/Kconfig
+++ b/drivers/video/console/Kconfig
@@ -9,7 +9,7 @@ config VGA_CONSOLE
9 depends on !4xx && !8xx && !SPARC && !M68K && !PARISC && !FRV && \ 9 depends on !4xx && !8xx && !SPARC && !M68K && !PARISC && !FRV && \
10 !SUPERH && !BLACKFIN && !AVR32 && !MN10300 && !CRIS && \ 10 !SUPERH && !BLACKFIN && !AVR32 && !MN10300 && !CRIS && \
11 (!ARM || ARCH_FOOTBRIDGE || ARCH_INTEGRATOR || ARCH_NETWINDER) && \ 11 (!ARM || ARCH_FOOTBRIDGE || ARCH_INTEGRATOR || ARCH_NETWINDER) && \
12 !ARM64 12 !ARM64 && !ARC
13 default y 13 default y
14 help 14 help
15 Saying Y here will allow you to use Linux in text mode through a 15 Saying Y here will allow you to use Linux in text mode through a
diff --git a/fs/Makefile b/fs/Makefile
index 09e051fefc5b..f79cf4043e60 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_ANON_INODES) += anon_inodes.o
27obj-$(CONFIG_SIGNALFD) += signalfd.o 27obj-$(CONFIG_SIGNALFD) += signalfd.o
28obj-$(CONFIG_TIMERFD) += timerfd.o 28obj-$(CONFIG_TIMERFD) += timerfd.o
29obj-$(CONFIG_EVENTFD) += eventfd.o 29obj-$(CONFIG_EVENTFD) += eventfd.o
30obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
30obj-$(CONFIG_AIO) += aio.o 31obj-$(CONFIG_AIO) += aio.o
31obj-$(CONFIG_FS_DAX) += dax.o 32obj-$(CONFIG_FS_DAX) += dax.o
32obj-$(CONFIG_FILE_LOCKING) += locks.o 33obj-$(CONFIG_FILE_LOCKING) += locks.o
diff --git a/fs/aio.c b/fs/aio.c
index 480440f4701f..155f84253f33 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -308,15 +308,9 @@ static void aio_free_ring(struct kioctx *ctx)
308 } 308 }
309} 309}
310 310
311static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) 311static int aio_ring_mremap(struct vm_area_struct *vma)
312{
313 vma->vm_flags |= VM_DONTEXPAND;
314 vma->vm_ops = &generic_file_vm_ops;
315 return 0;
316}
317
318static int aio_ring_remap(struct file *file, struct vm_area_struct *vma)
319{ 312{
313 struct file *file = vma->vm_file;
320 struct mm_struct *mm = vma->vm_mm; 314 struct mm_struct *mm = vma->vm_mm;
321 struct kioctx_table *table; 315 struct kioctx_table *table;
322 int i, res = -EINVAL; 316 int i, res = -EINVAL;
@@ -342,9 +336,24 @@ static int aio_ring_remap(struct file *file, struct vm_area_struct *vma)
342 return res; 336 return res;
343} 337}
344 338
339static const struct vm_operations_struct aio_ring_vm_ops = {
340 .mremap = aio_ring_mremap,
341#if IS_ENABLED(CONFIG_MMU)
342 .fault = filemap_fault,
343 .map_pages = filemap_map_pages,
344 .page_mkwrite = filemap_page_mkwrite,
345#endif
346};
347
348static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
349{
350 vma->vm_flags |= VM_DONTEXPAND;
351 vma->vm_ops = &aio_ring_vm_ops;
352 return 0;
353}
354
345static const struct file_operations aio_ring_fops = { 355static const struct file_operations aio_ring_fops = {
346 .mmap = aio_ring_mmap, 356 .mmap = aio_ring_mmap,
347 .mremap = aio_ring_remap,
348}; 357};
349 358
350#if IS_ENABLED(CONFIG_MIGRATION) 359#if IS_ENABLED(CONFIG_MIGRATION)
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index d1c833c321b9..7b6bfcbf801c 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -479,7 +479,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
479 if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) 479 if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
480 seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes); 480 seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
481 if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) 481 if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
482 seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name); 482 seq_show_option(m, "snapdirname", fsopt->snapdir_name);
483 483
484 return 0; 484 return 0;
485} 485}
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 0a9fb6b53126..6a1119e87fbb 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -394,17 +394,17 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
394 struct sockaddr *srcaddr; 394 struct sockaddr *srcaddr;
395 srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr; 395 srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
396 396
397 seq_printf(s, ",vers=%s", tcon->ses->server->vals->version_string); 397 seq_show_option(s, "vers", tcon->ses->server->vals->version_string);
398 cifs_show_security(s, tcon->ses); 398 cifs_show_security(s, tcon->ses);
399 cifs_show_cache_flavor(s, cifs_sb); 399 cifs_show_cache_flavor(s, cifs_sb);
400 400
401 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) 401 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
402 seq_puts(s, ",multiuser"); 402 seq_puts(s, ",multiuser");
403 else if (tcon->ses->user_name) 403 else if (tcon->ses->user_name)
404 seq_printf(s, ",username=%s", tcon->ses->user_name); 404 seq_show_option(s, "username", tcon->ses->user_name);
405 405
406 if (tcon->ses->domainName) 406 if (tcon->ses->domainName)
407 seq_printf(s, ",domain=%s", tcon->ses->domainName); 407 seq_show_option(s, "domain", tcon->ses->domainName);
408 408
409 if (srcaddr->sa_family != AF_UNSPEC) { 409 if (srcaddr->sa_family != AF_UNSPEC) {
410 struct sockaddr_in *saddr4; 410 struct sockaddr_in *saddr4;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ee3878262a49..a63c7b0a10cf 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1776,10 +1776,10 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
1776 } 1776 }
1777 1777
1778 if (sbi->s_qf_names[USRQUOTA]) 1778 if (sbi->s_qf_names[USRQUOTA])
1779 seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); 1779 seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]);
1780 1780
1781 if (sbi->s_qf_names[GRPQUOTA]) 1781 if (sbi->s_qf_names[GRPQUOTA])
1782 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); 1782 seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]);
1783#endif 1783#endif
1784} 1784}
1785 1785
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 2982445947e1..894fb01a91da 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1334,11 +1334,11 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
1334 if (is_ancestor(root, sdp->sd_master_dir)) 1334 if (is_ancestor(root, sdp->sd_master_dir))
1335 seq_puts(s, ",meta"); 1335 seq_puts(s, ",meta");
1336 if (args->ar_lockproto[0]) 1336 if (args->ar_lockproto[0])
1337 seq_printf(s, ",lockproto=%s", args->ar_lockproto); 1337 seq_show_option(s, "lockproto", args->ar_lockproto);
1338 if (args->ar_locktable[0]) 1338 if (args->ar_locktable[0])
1339 seq_printf(s, ",locktable=%s", args->ar_locktable); 1339 seq_show_option(s, "locktable", args->ar_locktable);
1340 if (args->ar_hostdata[0]) 1340 if (args->ar_hostdata[0])
1341 seq_printf(s, ",hostdata=%s", args->ar_hostdata); 1341 seq_show_option(s, "hostdata", args->ar_hostdata);
1342 if (args->ar_spectator) 1342 if (args->ar_spectator)
1343 seq_puts(s, ",spectator"); 1343 seq_puts(s, ",spectator");
1344 if (args->ar_localflocks) 1344 if (args->ar_localflocks)
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 55c03b9e9070..4574fdd3d421 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -136,9 +136,9 @@ static int hfs_show_options(struct seq_file *seq, struct dentry *root)
136 struct hfs_sb_info *sbi = HFS_SB(root->d_sb); 136 struct hfs_sb_info *sbi = HFS_SB(root->d_sb);
137 137
138 if (sbi->s_creator != cpu_to_be32(0x3f3f3f3f)) 138 if (sbi->s_creator != cpu_to_be32(0x3f3f3f3f))
139 seq_printf(seq, ",creator=%.4s", (char *)&sbi->s_creator); 139 seq_show_option_n(seq, "creator", (char *)&sbi->s_creator, 4);
140 if (sbi->s_type != cpu_to_be32(0x3f3f3f3f)) 140 if (sbi->s_type != cpu_to_be32(0x3f3f3f3f))
141 seq_printf(seq, ",type=%.4s", (char *)&sbi->s_type); 141 seq_show_option_n(seq, "type", (char *)&sbi->s_type, 4);
142 seq_printf(seq, ",uid=%u,gid=%u", 142 seq_printf(seq, ",uid=%u,gid=%u",
143 from_kuid_munged(&init_user_ns, sbi->s_uid), 143 from_kuid_munged(&init_user_ns, sbi->s_uid),
144 from_kgid_munged(&init_user_ns, sbi->s_gid)); 144 from_kgid_munged(&init_user_ns, sbi->s_gid));
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index c90b72ee676d..bb806e58c977 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -218,9 +218,9 @@ int hfsplus_show_options(struct seq_file *seq, struct dentry *root)
218 struct hfsplus_sb_info *sbi = HFSPLUS_SB(root->d_sb); 218 struct hfsplus_sb_info *sbi = HFSPLUS_SB(root->d_sb);
219 219
220 if (sbi->creator != HFSPLUS_DEF_CR_TYPE) 220 if (sbi->creator != HFSPLUS_DEF_CR_TYPE)
221 seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator); 221 seq_show_option_n(seq, "creator", (char *)&sbi->creator, 4);
222 if (sbi->type != HFSPLUS_DEF_CR_TYPE) 222 if (sbi->type != HFSPLUS_DEF_CR_TYPE)
223 seq_printf(seq, ",type=%.4s", (char *)&sbi->type); 223 seq_show_option_n(seq, "type", (char *)&sbi->type, 4);
224 seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask, 224 seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask,
225 from_kuid_munged(&init_user_ns, sbi->uid), 225 from_kuid_munged(&init_user_ns, sbi->uid),
226 from_kgid_munged(&init_user_ns, sbi->gid)); 226 from_kgid_munged(&init_user_ns, sbi->gid));
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 059597b23f67..2ac99db3750e 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -260,7 +260,7 @@ static int hostfs_show_options(struct seq_file *seq, struct dentry *root)
260 size_t offset = strlen(root_ino) + 1; 260 size_t offset = strlen(root_ino) + 1;
261 261
262 if (strlen(root_path) > offset) 262 if (strlen(root_path) > offset)
263 seq_printf(seq, ",%s", root_path + offset); 263 seq_show_option(seq, root_path + offset, NULL);
264 264
265 if (append) 265 if (append)
266 seq_puts(seq, ",append"); 266 seq_puts(seq, ",append");
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 44523f4a6084..6faaf710e563 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -154,6 +154,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
154 struct dnotify_struct *dn; 154 struct dnotify_struct *dn;
155 struct dnotify_struct **prev; 155 struct dnotify_struct **prev;
156 struct inode *inode; 156 struct inode *inode;
157 bool free = false;
157 158
158 inode = file_inode(filp); 159 inode = file_inode(filp);
159 if (!S_ISDIR(inode->i_mode)) 160 if (!S_ISDIR(inode->i_mode))
@@ -182,11 +183,15 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
182 183
183 /* nothing else could have found us thanks to the dnotify_groups 184 /* nothing else could have found us thanks to the dnotify_groups
184 mark_mutex */ 185 mark_mutex */
185 if (dn_mark->dn == NULL) 186 if (dn_mark->dn == NULL) {
186 fsnotify_destroy_mark_locked(fsn_mark, dnotify_group); 187 fsnotify_detach_mark(fsn_mark);
188 free = true;
189 }
187 190
188 mutex_unlock(&dnotify_group->mark_mutex); 191 mutex_unlock(&dnotify_group->mark_mutex);
189 192
193 if (free)
194 fsnotify_free_mark(fsn_mark);
190 fsnotify_put_mark(fsn_mark); 195 fsnotify_put_mark(fsn_mark);
191} 196}
192 197
@@ -362,9 +367,10 @@ out:
362 spin_unlock(&fsn_mark->lock); 367 spin_unlock(&fsn_mark->lock);
363 368
364 if (destroy) 369 if (destroy)
365 fsnotify_destroy_mark_locked(fsn_mark, dnotify_group); 370 fsnotify_detach_mark(fsn_mark);
366
367 mutex_unlock(&dnotify_group->mark_mutex); 371 mutex_unlock(&dnotify_group->mark_mutex);
372 if (destroy)
373 fsnotify_free_mark(fsn_mark);
368 fsnotify_put_mark(fsn_mark); 374 fsnotify_put_mark(fsn_mark);
369out_err: 375out_err:
370 if (new_fsn_mark) 376 if (new_fsn_mark)
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index cf275500a665..8e8e6bcd1d43 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -529,8 +529,10 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
529 removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, 529 removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
530 &destroy_mark); 530 &destroy_mark);
531 if (destroy_mark) 531 if (destroy_mark)
532 fsnotify_destroy_mark_locked(fsn_mark, group); 532 fsnotify_detach_mark(fsn_mark);
533 mutex_unlock(&group->mark_mutex); 533 mutex_unlock(&group->mark_mutex);
534 if (destroy_mark)
535 fsnotify_free_mark(fsn_mark);
534 536
535 fsnotify_put_mark(fsn_mark); 537 fsnotify_put_mark(fsn_mark);
536 if (removed & real_mount(mnt)->mnt_fsnotify_mask) 538 if (removed & real_mount(mnt)->mnt_fsnotify_mask)
@@ -557,8 +559,10 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
557 removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, 559 removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
558 &destroy_mark); 560 &destroy_mark);
559 if (destroy_mark) 561 if (destroy_mark)
560 fsnotify_destroy_mark_locked(fsn_mark, group); 562 fsnotify_detach_mark(fsn_mark);
561 mutex_unlock(&group->mark_mutex); 563 mutex_unlock(&group->mark_mutex);
564 if (destroy_mark)
565 fsnotify_free_mark(fsn_mark);
562 566
563 /* matches the fsnotify_find_inode_mark() */ 567 /* matches the fsnotify_find_inode_mark() */
564 fsnotify_put_mark(fsn_mark); 568 fsnotify_put_mark(fsn_mark);
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 58b7cdb63da9..6b6f0d472ae8 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -76,7 +76,8 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
76 struct inotify_inode_mark *inode_mark; 76 struct inotify_inode_mark *inode_mark;
77 struct inode *inode; 77 struct inode *inode;
78 78
79 if (!(mark->flags & (FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_INODE))) 79 if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE) ||
80 !(mark->flags & FSNOTIFY_MARK_FLAG_INODE))
80 return; 81 return;
81 82
82 inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); 83 inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index dd3fb0b17be7..db39de2dd4cb 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -26,7 +26,6 @@
26 26
27#include <linux/fsnotify_backend.h> 27#include <linux/fsnotify_backend.h>
28#include "fsnotify.h" 28#include "fsnotify.h"
29#include "../mount.h"
30 29
31/* 30/*
32 * Clear all of the marks on an inode when it is being evicted from core 31 * Clear all of the marks on an inode when it is being evicted from core
@@ -205,6 +204,16 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
205 mnt = NULL; 204 mnt = NULL;
206 205
207 /* 206 /*
207 * Optimization: srcu_read_lock() has a memory barrier which can
208 * be expensive. It protects walking the *_fsnotify_marks lists.
209 * However, if we do not walk the lists, we do not have to do
210 * SRCU because we have no references to any objects and do not
211 * need SRCU to keep them "alive".
212 */
213 if (hlist_empty(&to_tell->i_fsnotify_marks) &&
214 (!mnt || hlist_empty(&mnt->mnt_fsnotify_marks)))
215 return 0;
216 /*
208 * if this is a modify event we may need to clear the ignored masks 217 * if this is a modify event we may need to clear the ignored masks
209 * otherwise return if neither the inode nor the vfsmount care about 218 * otherwise return if neither the inode nor the vfsmount care about
210 * this type of event. 219 * this type of event.
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 13a00be516d2..b44c68a857e7 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -6,6 +6,8 @@
6#include <linux/srcu.h> 6#include <linux/srcu.h>
7#include <linux/types.h> 7#include <linux/types.h>
8 8
9#include "../mount.h"
10
9/* destroy all events sitting in this groups notification queue */ 11/* destroy all events sitting in this groups notification queue */
10extern void fsnotify_flush_notify(struct fsnotify_group *group); 12extern void fsnotify_flush_notify(struct fsnotify_group *group);
11 13
@@ -38,15 +40,22 @@ extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
38extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark); 40extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark);
39/* inode specific destruction of a mark */ 41/* inode specific destruction of a mark */
40extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark); 42extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark);
41/* Destroy all marks in the given list */
42extern void fsnotify_destroy_marks(struct list_head *to_free);
43/* Find mark belonging to given group in the list of marks */ 43/* Find mark belonging to given group in the list of marks */
44extern struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head, 44extern struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head,
45 struct fsnotify_group *group); 45 struct fsnotify_group *group);
46/* run the list of all marks associated with inode and flag them to be freed */ 46/* Destroy all marks in the given list protected by 'lock' */
47extern void fsnotify_clear_marks_by_inode(struct inode *inode); 47extern void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock);
48/* run the list of all marks associated with vfsmount and flag them to be freed */ 48/* run the list of all marks associated with inode and destroy them */
49extern void fsnotify_clear_marks_by_mount(struct vfsmount *mnt); 49static inline void fsnotify_clear_marks_by_inode(struct inode *inode)
50{
51 fsnotify_destroy_marks(&inode->i_fsnotify_marks, &inode->i_lock);
52}
53/* run the list of all marks associated with vfsmount and destroy them */
54static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
55{
56 fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks,
57 &mnt->mnt_root->d_lock);
58}
50/* 59/*
51 * update the dentry->d_flags of all of inode's children to indicate if inode cares 60 * update the dentry->d_flags of all of inode's children to indicate if inode cares
52 * about events that happen to its children. 61 * about events that happen to its children.
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 3daf513ee99e..474a3ce1b5e1 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -65,26 +65,6 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark)
65} 65}
66 66
67/* 67/*
68 * Given an inode, destroy all of the marks associated with that inode.
69 */
70void fsnotify_clear_marks_by_inode(struct inode *inode)
71{
72 struct fsnotify_mark *mark;
73 struct hlist_node *n;
74 LIST_HEAD(free_list);
75
76 spin_lock(&inode->i_lock);
77 hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify_marks, obj_list) {
78 list_add(&mark->free_list, &free_list);
79 hlist_del_init_rcu(&mark->obj_list);
80 fsnotify_get_mark(mark);
81 }
82 spin_unlock(&inode->i_lock);
83
84 fsnotify_destroy_marks(&free_list);
85}
86
87/*
88 * Given a group clear all of the inode marks associated with that group. 68 * Given a group clear all of the inode marks associated with that group.
89 */ 69 */
90void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) 70void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group)
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 39ddcaf0918f..fc0df4442f7b 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -122,26 +122,27 @@ u32 fsnotify_recalc_mask(struct hlist_head *head)
122} 122}
123 123
124/* 124/*
125 * Any time a mark is getting freed we end up here. 125 * Remove mark from inode / vfsmount list, group list, drop inode reference
126 * The caller had better be holding a reference to this mark so we don't actually 126 * if we got one.
127 * do the final put under the mark->lock 127 *
128 * Must be called with group->mark_mutex held.
128 */ 129 */
129void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, 130void fsnotify_detach_mark(struct fsnotify_mark *mark)
130 struct fsnotify_group *group)
131{ 131{
132 struct inode *inode = NULL; 132 struct inode *inode = NULL;
133 struct fsnotify_group *group = mark->group;
133 134
134 BUG_ON(!mutex_is_locked(&group->mark_mutex)); 135 BUG_ON(!mutex_is_locked(&group->mark_mutex));
135 136
136 spin_lock(&mark->lock); 137 spin_lock(&mark->lock);
137 138
138 /* something else already called this function on this mark */ 139 /* something else already called this function on this mark */
139 if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { 140 if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
140 spin_unlock(&mark->lock); 141 spin_unlock(&mark->lock);
141 return; 142 return;
142 } 143 }
143 144
144 mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; 145 mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED;
145 146
146 if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { 147 if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
147 inode = mark->inode; 148 inode = mark->inode;
@@ -150,6 +151,12 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
150 fsnotify_destroy_vfsmount_mark(mark); 151 fsnotify_destroy_vfsmount_mark(mark);
151 else 152 else
152 BUG(); 153 BUG();
154 /*
155 * Note that we didn't update flags telling whether inode cares about
156 * what's happening with children. We update these flags from
157 * __fsnotify_parent() lazily when next event happens on one of our
158 * children.
159 */
153 160
154 list_del_init(&mark->g_list); 161 list_del_init(&mark->g_list);
155 162
@@ -157,18 +164,32 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
157 164
158 if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) 165 if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED))
159 iput(inode); 166 iput(inode);
160 /* release lock temporarily */ 167
161 mutex_unlock(&group->mark_mutex); 168 atomic_dec(&group->num_marks);
169}
170
171/*
172 * Free fsnotify mark. The freeing is actually happening from a kthread which
173 * first waits for srcu period end. Caller must have a reference to the mark
174 * or be protected by fsnotify_mark_srcu.
175 */
176void fsnotify_free_mark(struct fsnotify_mark *mark)
177{
178 struct fsnotify_group *group = mark->group;
179
180 spin_lock(&mark->lock);
181 /* something else already called this function on this mark */
182 if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
183 spin_unlock(&mark->lock);
184 return;
185 }
186 mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
187 spin_unlock(&mark->lock);
162 188
163 spin_lock(&destroy_lock); 189 spin_lock(&destroy_lock);
164 list_add(&mark->g_list, &destroy_list); 190 list_add(&mark->g_list, &destroy_list);
165 spin_unlock(&destroy_lock); 191 spin_unlock(&destroy_lock);
166 wake_up(&destroy_waitq); 192 wake_up(&destroy_waitq);
167 /*
168 * We don't necessarily have a ref on mark from caller so the above destroy
169 * may have actually freed it, unless this group provides a 'freeing_mark'
170 * function which must be holding a reference.
171 */
172 193
173 /* 194 /*
174 * Some groups like to know that marks are being freed. This is a 195 * Some groups like to know that marks are being freed. This is a
@@ -177,50 +198,45 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
177 */ 198 */
178 if (group->ops->freeing_mark) 199 if (group->ops->freeing_mark)
179 group->ops->freeing_mark(mark, group); 200 group->ops->freeing_mark(mark, group);
180
181 /*
182 * __fsnotify_update_child_dentry_flags(inode);
183 *
184 * I really want to call that, but we can't, we have no idea if the inode
185 * still exists the second we drop the mark->lock.
186 *
187 * The next time an event arrive to this inode from one of it's children
188 * __fsnotify_parent will see that the inode doesn't care about it's
189 * children and will update all of these flags then. So really this
190 * is just a lazy update (and could be a perf win...)
191 */
192
193 atomic_dec(&group->num_marks);
194
195 mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
196} 201}
197 202
198void fsnotify_destroy_mark(struct fsnotify_mark *mark, 203void fsnotify_destroy_mark(struct fsnotify_mark *mark,
199 struct fsnotify_group *group) 204 struct fsnotify_group *group)
200{ 205{
201 mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); 206 mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
202 fsnotify_destroy_mark_locked(mark, group); 207 fsnotify_detach_mark(mark);
203 mutex_unlock(&group->mark_mutex); 208 mutex_unlock(&group->mark_mutex);
209 fsnotify_free_mark(mark);
204} 210}
205 211
206/* 212void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock)
207 * Destroy all marks in the given list. The marks must be already detached from
208 * the original inode / vfsmount.
209 */
210void fsnotify_destroy_marks(struct list_head *to_free)
211{ 213{
212 struct fsnotify_mark *mark, *lmark; 214 struct fsnotify_mark *mark;
213 struct fsnotify_group *group;
214
215 list_for_each_entry_safe(mark, lmark, to_free, free_list) {
216 spin_lock(&mark->lock);
217 fsnotify_get_group(mark->group);
218 group = mark->group;
219 spin_unlock(&mark->lock);
220 215
221 fsnotify_destroy_mark(mark, group); 216 while (1) {
217 /*
218 * We have to be careful since we can race with e.g.
219 * fsnotify_clear_marks_by_group() and once we drop 'lock',
220 * mark can get removed from the obj_list and destroyed. But
221 * we are holding mark reference so mark cannot be freed and
222 * calling fsnotify_destroy_mark() more than once is fine.
223 */
224 spin_lock(lock);
225 if (hlist_empty(head)) {
226 spin_unlock(lock);
227 break;
228 }
229 mark = hlist_entry(head->first, struct fsnotify_mark, obj_list);
230 /*
231 * We don't update i_fsnotify_mask / mnt_fsnotify_mask here
232 * since inode / mount is going away anyway. So just remove
233 * mark from the list.
234 */
235 hlist_del_init_rcu(&mark->obj_list);
236 fsnotify_get_mark(mark);
237 spin_unlock(lock);
238 fsnotify_destroy_mark(mark, mark->group);
222 fsnotify_put_mark(mark); 239 fsnotify_put_mark(mark);
223 fsnotify_put_group(group);
224 } 240 }
225} 241}
226 242
@@ -332,7 +348,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
332 * inode->i_lock 348 * inode->i_lock
333 */ 349 */
334 spin_lock(&mark->lock); 350 spin_lock(&mark->lock);
335 mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE; 351 mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED;
336 352
337 fsnotify_get_group(group); 353 fsnotify_get_group(group);
338 mark->group = group; 354 mark->group = group;
@@ -438,8 +454,9 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
438 } 454 }
439 mark = list_first_entry(&to_free, struct fsnotify_mark, g_list); 455 mark = list_first_entry(&to_free, struct fsnotify_mark, g_list);
440 fsnotify_get_mark(mark); 456 fsnotify_get_mark(mark);
441 fsnotify_destroy_mark_locked(mark, group); 457 fsnotify_detach_mark(mark);
442 mutex_unlock(&group->mark_mutex); 458 mutex_unlock(&group->mark_mutex);
459 fsnotify_free_mark(mark);
443 fsnotify_put_mark(mark); 460 fsnotify_put_mark(mark);
444 } 461 }
445} 462}
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index 326b148e623c..a8fcab68faef 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -28,25 +28,6 @@
28 28
29#include <linux/fsnotify_backend.h> 29#include <linux/fsnotify_backend.h>
30#include "fsnotify.h" 30#include "fsnotify.h"
31#include "../mount.h"
32
33void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
34{
35 struct fsnotify_mark *mark;
36 struct hlist_node *n;
37 struct mount *m = real_mount(mnt);
38 LIST_HEAD(free_list);
39
40 spin_lock(&mnt->mnt_root->d_lock);
41 hlist_for_each_entry_safe(mark, n, &m->mnt_fsnotify_marks, obj_list) {
42 list_add(&mark->free_list, &free_list);
43 hlist_del_init_rcu(&mark->obj_list);
44 fsnotify_get_mark(mark);
45 }
46 spin_unlock(&mnt->mnt_root->d_lock);
47
48 fsnotify_destroy_marks(&free_list);
49}
50 31
51void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) 32void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
52{ 33{
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index c1128bcbeb5e..d1a853585b53 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2204,17 +2204,12 @@ get_ctx_vol_failed:
2204 return true; 2204 return true;
2205#ifdef NTFS_RW 2205#ifdef NTFS_RW
2206iput_usnjrnl_err_out: 2206iput_usnjrnl_err_out:
2207 if (vol->usnjrnl_j_ino) 2207 iput(vol->usnjrnl_j_ino);
2208 iput(vol->usnjrnl_j_ino); 2208 iput(vol->usnjrnl_max_ino);
2209 if (vol->usnjrnl_max_ino) 2209 iput(vol->usnjrnl_ino);
2210 iput(vol->usnjrnl_max_ino);
2211 if (vol->usnjrnl_ino)
2212 iput(vol->usnjrnl_ino);
2213iput_quota_err_out: 2210iput_quota_err_out:
2214 if (vol->quota_q_ino) 2211 iput(vol->quota_q_ino);
2215 iput(vol->quota_q_ino); 2212 iput(vol->quota_ino);
2216 if (vol->quota_ino)
2217 iput(vol->quota_ino);
2218 iput(vol->extend_ino); 2213 iput(vol->extend_ino);
2219#endif /* NTFS_RW */ 2214#endif /* NTFS_RW */
2220iput_sec_err_out: 2215iput_sec_err_out:
@@ -2223,8 +2218,7 @@ iput_root_err_out:
2223 iput(vol->root_ino); 2218 iput(vol->root_ino);
2224iput_logfile_err_out: 2219iput_logfile_err_out:
2225#ifdef NTFS_RW 2220#ifdef NTFS_RW
2226 if (vol->logfile_ino) 2221 iput(vol->logfile_ino);
2227 iput(vol->logfile_ino);
2228iput_vol_err_out: 2222iput_vol_err_out:
2229#endif /* NTFS_RW */ 2223#endif /* NTFS_RW */
2230 iput(vol->vol_ino); 2224 iput(vol->vol_ino);
@@ -2254,8 +2248,7 @@ iput_mftbmp_err_out:
2254 iput(vol->mftbmp_ino); 2248 iput(vol->mftbmp_ino);
2255iput_mirr_err_out: 2249iput_mirr_err_out:
2256#ifdef NTFS_RW 2250#ifdef NTFS_RW
2257 if (vol->mftmirr_ino) 2251 iput(vol->mftmirr_ino);
2258 iput(vol->mftmirr_ino);
2259#endif /* NTFS_RW */ 2252#endif /* NTFS_RW */
2260 return false; 2253 return false;
2261} 2254}
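
The ntfs/super.c cleanup above relies on iput() being a no-op when handed a NULL inode, which is why the per-inode "if" guards can be dropped. A minimal sketch of the simplified unwind pattern (hypothetical labels and variable names, not taken from the patch):

err_out:
	/* iput(NULL) is safe, so no "if (inode)" checks are needed. */
	iput(maybe_journal_ino);	/* may still be NULL on this path */
	iput(maybe_quota_ino);
	return false;
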
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index c58a1bcfda0f..0cdf497c91ef 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -284,7 +284,19 @@ int ocfs2_set_acl(handle_t *handle,
284 284
285int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type) 285int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
286{ 286{
287 return ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL); 287 struct buffer_head *bh = NULL;
288 int status = 0;
289
290 status = ocfs2_inode_lock(inode, &bh, 1);
291 if (status < 0) {
292 if (status != -ENOENT)
293 mlog_errno(status);
294 return status;
295 }
296 status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL);
297 ocfs2_inode_unlock(inode, 1);
298 brelse(bh);
299 return status;
288} 300}
289 301
290struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) 302struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
@@ -292,19 +304,21 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
292 struct ocfs2_super *osb; 304 struct ocfs2_super *osb;
293 struct buffer_head *di_bh = NULL; 305 struct buffer_head *di_bh = NULL;
294 struct posix_acl *acl; 306 struct posix_acl *acl;
295 int ret = -EAGAIN; 307 int ret;
296 308
297 osb = OCFS2_SB(inode->i_sb); 309 osb = OCFS2_SB(inode->i_sb);
298 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) 310 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
299 return NULL; 311 return NULL;
300 312 ret = ocfs2_inode_lock(inode, &di_bh, 0);
301 ret = ocfs2_read_inode_block(inode, &di_bh); 313 if (ret < 0) {
302 if (ret < 0) 314 if (ret != -ENOENT)
315 mlog_errno(ret);
303 return ERR_PTR(ret); 316 return ERR_PTR(ret);
317 }
304 318
305 acl = ocfs2_get_acl_nolock(inode, type, di_bh); 319 acl = ocfs2_get_acl_nolock(inode, type, di_bh);
306 320
321 ocfs2_inode_unlock(inode, 0);
307 brelse(di_bh); 322 brelse(di_bh);
308
309 return acl; 323 return acl;
310} 324}
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 5997c00a1515..86181d6526dc 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -908,32 +908,30 @@ static int ocfs2_validate_extent_block(struct super_block *sb,
908 */ 908 */
909 909
910 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 910 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
911 ocfs2_error(sb, 911 rc = ocfs2_error(sb,
912 "Extent block #%llu has bad signature %.*s", 912 "Extent block #%llu has bad signature %.*s\n",
913 (unsigned long long)bh->b_blocknr, 7, 913 (unsigned long long)bh->b_blocknr, 7,
914 eb->h_signature); 914 eb->h_signature);
915 return -EINVAL; 915 goto bail;
916 } 916 }
917 917
918 if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) { 918 if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
919 ocfs2_error(sb, 919 rc = ocfs2_error(sb,
920 "Extent block #%llu has an invalid h_blkno " 920 "Extent block #%llu has an invalid h_blkno of %llu\n",
921 "of %llu", 921 (unsigned long long)bh->b_blocknr,
922 (unsigned long long)bh->b_blocknr, 922 (unsigned long long)le64_to_cpu(eb->h_blkno));
923 (unsigned long long)le64_to_cpu(eb->h_blkno)); 923 goto bail;
924 return -EINVAL;
925 } 924 }
926 925
927 if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) { 926 if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
928 ocfs2_error(sb, 927 rc = ocfs2_error(sb,
929 "Extent block #%llu has an invalid " 928 "Extent block #%llu has an invalid h_fs_generation of #%u\n",
930 "h_fs_generation of #%u", 929 (unsigned long long)bh->b_blocknr,
931 (unsigned long long)bh->b_blocknr, 930 le32_to_cpu(eb->h_fs_generation));
932 le32_to_cpu(eb->h_fs_generation)); 931 goto bail;
933 return -EINVAL;
934 } 932 }
935 933bail:
936 return 0; 934 return rc;
937} 935}
938 936
939int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno, 937int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
@@ -1446,8 +1444,7 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
1446 while(le16_to_cpu(el->l_tree_depth) > 1) { 1444 while(le16_to_cpu(el->l_tree_depth) > 1) {
1447 if (le16_to_cpu(el->l_next_free_rec) == 0) { 1445 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1448 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), 1446 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
1449 "Owner %llu has empty " 1447 "Owner %llu has empty extent list (next_free_rec == 0)\n",
1450 "extent list (next_free_rec == 0)",
1451 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); 1448 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
1452 status = -EIO; 1449 status = -EIO;
1453 goto bail; 1450 goto bail;
@@ -1456,9 +1453,7 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
1456 blkno = le64_to_cpu(el->l_recs[i].e_blkno); 1453 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1457 if (!blkno) { 1454 if (!blkno) {
1458 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), 1455 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
1459 "Owner %llu has extent " 1456 "Owner %llu has extent list where extent # %d has no physical block start\n",
1460 "list where extent # %d has no physical "
1461 "block start",
1462 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i); 1457 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i);
1463 status = -EIO; 1458 status = -EIO;
1464 goto bail; 1459 goto bail;
@@ -1788,8 +1783,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
1788 while (el->l_tree_depth) { 1783 while (el->l_tree_depth) {
1789 if (le16_to_cpu(el->l_next_free_rec) == 0) { 1784 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1790 ocfs2_error(ocfs2_metadata_cache_get_super(ci), 1785 ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1791 "Owner %llu has empty extent list at " 1786 "Owner %llu has empty extent list at depth %u\n",
1792 "depth %u\n",
1793 (unsigned long long)ocfs2_metadata_cache_owner(ci), 1787 (unsigned long long)ocfs2_metadata_cache_owner(ci),
1794 le16_to_cpu(el->l_tree_depth)); 1788 le16_to_cpu(el->l_tree_depth));
1795 ret = -EROFS; 1789 ret = -EROFS;
@@ -1814,8 +1808,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
1814 blkno = le64_to_cpu(el->l_recs[i].e_blkno); 1808 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1815 if (blkno == 0) { 1809 if (blkno == 0) {
1816 ocfs2_error(ocfs2_metadata_cache_get_super(ci), 1810 ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1817 "Owner %llu has bad blkno in extent list " 1811 "Owner %llu has bad blkno in extent list at depth %u (index %d)\n",
1818 "at depth %u (index %d)\n",
1819 (unsigned long long)ocfs2_metadata_cache_owner(ci), 1812 (unsigned long long)ocfs2_metadata_cache_owner(ci),
1820 le16_to_cpu(el->l_tree_depth), i); 1813 le16_to_cpu(el->l_tree_depth), i);
1821 ret = -EROFS; 1814 ret = -EROFS;
@@ -1836,8 +1829,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
1836 if (le16_to_cpu(el->l_next_free_rec) > 1829 if (le16_to_cpu(el->l_next_free_rec) >
1837 le16_to_cpu(el->l_count)) { 1830 le16_to_cpu(el->l_count)) {
1838 ocfs2_error(ocfs2_metadata_cache_get_super(ci), 1831 ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1839 "Owner %llu has bad count in extent list " 1832 "Owner %llu has bad count in extent list at block %llu (next free=%u, count=%u)\n",
1840 "at block %llu (next free=%u, count=%u)\n",
1841 (unsigned long long)ocfs2_metadata_cache_owner(ci), 1833 (unsigned long long)ocfs2_metadata_cache_owner(ci),
1842 (unsigned long long)bh->b_blocknr, 1834 (unsigned long long)bh->b_blocknr,
1843 le16_to_cpu(el->l_next_free_rec), 1835 le16_to_cpu(el->l_next_free_rec),
@@ -2116,8 +2108,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle,
2116 2108
2117 if (left_el->l_next_free_rec != left_el->l_count) { 2109 if (left_el->l_next_free_rec != left_el->l_count) {
2118 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), 2110 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
2119 "Inode %llu has non-full interior leaf node %llu" 2111 "Inode %llu has non-full interior leaf node %llu (next free = %u)\n",
2120 "(next free = %u)",
2121 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), 2112 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2122 (unsigned long long)left_leaf_bh->b_blocknr, 2113 (unsigned long long)left_leaf_bh->b_blocknr,
2123 le16_to_cpu(left_el->l_next_free_rec)); 2114 le16_to_cpu(left_el->l_next_free_rec));
@@ -2256,8 +2247,7 @@ int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
2256 * If we got here, we never found a valid node where 2247 * If we got here, we never found a valid node where
2257 * the tree indicated one should be. 2248 * the tree indicated one should be.
2258 */ 2249 */
2259 ocfs2_error(sb, 2250 ocfs2_error(sb, "Invalid extent tree at extent block %llu\n",
2260 "Invalid extent tree at extent block %llu\n",
2261 (unsigned long long)blkno); 2251 (unsigned long long)blkno);
2262 ret = -EROFS; 2252 ret = -EROFS;
2263 goto out; 2253 goto out;
@@ -2872,8 +2862,7 @@ int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2872 * If we got here, we never found a valid node where 2862 * If we got here, we never found a valid node where
2873 * the tree indicated one should be. 2863 * the tree indicated one should be.
2874 */ 2864 */
2875 ocfs2_error(sb, 2865 ocfs2_error(sb, "Invalid extent tree at extent block %llu\n",
2876 "Invalid extent tree at extent block %llu\n",
2877 (unsigned long long)blkno); 2866 (unsigned long long)blkno);
2878 ret = -EROFS; 2867 ret = -EROFS;
2879 goto out; 2868 goto out;
@@ -3131,6 +3120,30 @@ out:
3131 return ret; 3120 return ret;
3132} 3121}
3133 3122
3123static int ocfs2_remove_rightmost_empty_extent(struct ocfs2_super *osb,
3124 struct ocfs2_extent_tree *et,
3125 struct ocfs2_path *path,
3126 struct ocfs2_cached_dealloc_ctxt *dealloc)
3127{
3128 handle_t *handle;
3129 int ret;
3130 int credits = path->p_tree_depth * 2 + 1;
3131
3132 handle = ocfs2_start_trans(osb, credits);
3133 if (IS_ERR(handle)) {
3134 ret = PTR_ERR(handle);
3135 mlog_errno(ret);
3136 return ret;
3137 }
3138
3139 ret = ocfs2_remove_rightmost_path(handle, et, path, dealloc);
3140 if (ret)
3141 mlog_errno(ret);
3142
3143 ocfs2_commit_trans(osb, handle);
3144 return ret;
3145}
3146
3134/* 3147/*
3135 * Left rotation of btree records. 3148 * Left rotation of btree records.
3136 * 3149 *
@@ -3200,7 +3213,7 @@ rightmost_no_delete:
3200 if (le16_to_cpu(el->l_next_free_rec) == 0) { 3213 if (le16_to_cpu(el->l_next_free_rec) == 0) {
3201 ret = -EIO; 3214 ret = -EIO;
3202 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), 3215 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
3203 "Owner %llu has empty extent block at %llu", 3216 "Owner %llu has empty extent block at %llu\n",
3204 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), 3217 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
3205 (unsigned long long)le64_to_cpu(eb->h_blkno)); 3218 (unsigned long long)le64_to_cpu(eb->h_blkno));
3206 goto out; 3219 goto out;
@@ -3930,7 +3943,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle,
3930 next_free = le16_to_cpu(el->l_next_free_rec); 3943 next_free = le16_to_cpu(el->l_next_free_rec);
3931 if (next_free == 0) { 3944 if (next_free == 0) {
3932 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), 3945 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
3933 "Owner %llu has a bad extent list", 3946 "Owner %llu has a bad extent list\n",
3934 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); 3947 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
3935 ret = -EIO; 3948 ret = -EIO;
3936 return; 3949 return;
@@ -4355,10 +4368,7 @@ static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
4355 bh = path_leaf_bh(left_path); 4368 bh = path_leaf_bh(left_path);
4356 eb = (struct ocfs2_extent_block *)bh->b_data; 4369 eb = (struct ocfs2_extent_block *)bh->b_data;
4357 ocfs2_error(sb, 4370 ocfs2_error(sb,
4358 "Extent block #%llu has an " 4371 "Extent block #%llu has an invalid l_next_free_rec of %d. It should have matched the l_count of %d\n",
4359 "invalid l_next_free_rec of "
4360 "%d. It should have "
4361 "matched the l_count of %d",
4362 (unsigned long long)le64_to_cpu(eb->h_blkno), 4372 (unsigned long long)le64_to_cpu(eb->h_blkno),
4363 le16_to_cpu(new_el->l_next_free_rec), 4373 le16_to_cpu(new_el->l_next_free_rec),
4364 le16_to_cpu(new_el->l_count)); 4374 le16_to_cpu(new_el->l_count));
@@ -4413,8 +4423,7 @@ static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
4413 bh = path_leaf_bh(right_path); 4423 bh = path_leaf_bh(right_path);
4414 eb = (struct ocfs2_extent_block *)bh->b_data; 4424 eb = (struct ocfs2_extent_block *)bh->b_data;
4415 ocfs2_error(sb, 4425 ocfs2_error(sb,
4416 "Extent block #%llu has an " 4426 "Extent block #%llu has an invalid l_next_free_rec of %d\n",
4417 "invalid l_next_free_rec of %d",
4418 (unsigned long long)le64_to_cpu(eb->h_blkno), 4427 (unsigned long long)le64_to_cpu(eb->h_blkno),
4419 le16_to_cpu(new_el->l_next_free_rec)); 4428 le16_to_cpu(new_el->l_next_free_rec));
4420 status = -EINVAL; 4429 status = -EINVAL;
@@ -4970,10 +4979,9 @@ leftright:
4970 split_index = ocfs2_search_extent_list(el, cpos); 4979 split_index = ocfs2_search_extent_list(el, cpos);
4971 if (split_index == -1) { 4980 if (split_index == -1) {
4972 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), 4981 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
4973 "Owner %llu has an extent at cpos %u " 4982 "Owner %llu has an extent at cpos %u which can no longer be found\n",
4974 "which can no longer be found.\n", 4983 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
4975 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), 4984 cpos);
4976 cpos);
4977 ret = -EROFS; 4985 ret = -EROFS;
4978 goto out; 4986 goto out;
4979 } 4987 }
@@ -5158,10 +5166,9 @@ int ocfs2_change_extent_flag(handle_t *handle,
5158 index = ocfs2_search_extent_list(el, cpos); 5166 index = ocfs2_search_extent_list(el, cpos);
5159 if (index == -1) { 5167 if (index == -1) {
5160 ocfs2_error(sb, 5168 ocfs2_error(sb,
5161 "Owner %llu has an extent at cpos %u which can no " 5169 "Owner %llu has an extent at cpos %u which can no longer be found\n",
5162 "longer be found.\n", 5170 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5163 (unsigned long long) 5171 cpos);
5164 ocfs2_metadata_cache_owner(et->et_ci), cpos);
5165 ret = -EROFS; 5172 ret = -EROFS;
5166 goto out; 5173 goto out;
5167 } 5174 }
@@ -5228,9 +5235,7 @@ int ocfs2_mark_extent_written(struct inode *inode,
5228 cpos, len, phys); 5235 cpos, len, phys);
5229 5236
5230 if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) { 5237 if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
5231 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents " 5238 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents that are being written to, but the feature bit is not set in the super block\n",
5232 "that are being written to, but the feature bit "
5233 "is not set in the super block.",
5234 (unsigned long long)OCFS2_I(inode)->ip_blkno); 5239 (unsigned long long)OCFS2_I(inode)->ip_blkno);
5235 ret = -EROFS; 5240 ret = -EROFS;
5236 goto out; 5241 goto out;
@@ -5514,8 +5519,7 @@ int ocfs2_remove_extent(handle_t *handle,
5514 index = ocfs2_search_extent_list(el, cpos); 5519 index = ocfs2_search_extent_list(el, cpos);
5515 if (index == -1) { 5520 if (index == -1) {
5516 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), 5521 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5517 "Owner %llu has an extent at cpos %u which can no " 5522 "Owner %llu has an extent at cpos %u which can no longer be found\n",
5518 "longer be found.\n",
5519 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), 5523 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5520 cpos); 5524 cpos);
5521 ret = -EROFS; 5525 ret = -EROFS;
@@ -5580,7 +5584,7 @@ int ocfs2_remove_extent(handle_t *handle,
5580 index = ocfs2_search_extent_list(el, cpos); 5584 index = ocfs2_search_extent_list(el, cpos);
5581 if (index == -1) { 5585 if (index == -1) {
5582 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), 5586 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5583 "Owner %llu: split at cpos %u lost record.", 5587 "Owner %llu: split at cpos %u lost record\n",
5584 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), 5588 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5585 cpos); 5589 cpos);
5586 ret = -EROFS; 5590 ret = -EROFS;
@@ -5596,8 +5600,7 @@ int ocfs2_remove_extent(handle_t *handle,
5596 ocfs2_rec_clusters(el, rec); 5600 ocfs2_rec_clusters(el, rec);
5597 if (rec_range != trunc_range) { 5601 if (rec_range != trunc_range) {
5598 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), 5602 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5599 "Owner %llu: error after split at cpos %u" 5603 "Owner %llu: error after split at cpos %u trunc len %u, existing record is (%u,%u)\n",
5600 "trunc len %u, existing record is (%u,%u)",
5601 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), 5604 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5602 cpos, len, le32_to_cpu(rec->e_cpos), 5605 cpos, len, le32_to_cpu(rec->e_cpos),
5603 ocfs2_rec_clusters(el, rec)); 5606 ocfs2_rec_clusters(el, rec));
@@ -6175,7 +6178,7 @@ bail:
6175 iput(tl_inode); 6178 iput(tl_inode);
6176 brelse(tl_bh); 6179 brelse(tl_bh);
6177 6180
6178 if (status < 0 && (*tl_copy)) { 6181 if (status < 0) {
6179 kfree(*tl_copy); 6182 kfree(*tl_copy);
6180 *tl_copy = NULL; 6183 *tl_copy = NULL;
6181 mlog_errno(status); 6184 mlog_errno(status);
@@ -7108,15 +7111,23 @@ start:
7108 * to check it up here before changing the tree. 7111 * to check it up here before changing the tree.
7109 */ 7112 */
7110 if (root_el->l_tree_depth && rec->e_int_clusters == 0) { 7113 if (root_el->l_tree_depth && rec->e_int_clusters == 0) {
7111 ocfs2_error(inode->i_sb, "Inode %lu has an empty " 7114 mlog(ML_ERROR, "Inode %lu has an empty "
7112 "extent record, depth %u\n", inode->i_ino, 7115 "extent record, depth %u\n", inode->i_ino,
7113 le16_to_cpu(root_el->l_tree_depth)); 7116 le16_to_cpu(root_el->l_tree_depth));
7114 status = -EROFS; 7117 status = ocfs2_remove_rightmost_empty_extent(osb,
7115 goto bail; 7118 &et, path, &dealloc);
7119 if (status) {
7120 mlog_errno(status);
7121 goto bail;
7122 }
7123
7124 ocfs2_reinit_path(path, 1);
7125 goto start;
7126 } else {
7127 trunc_cpos = le32_to_cpu(rec->e_cpos);
7128 trunc_len = 0;
7129 blkno = 0;
7116 } 7130 }
7117 trunc_cpos = le32_to_cpu(rec->e_cpos);
7118 trunc_len = 0;
7119 blkno = 0;
7120 } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) { 7131 } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) {
7121 /* 7132 /*
7122 * Truncate entire record. 7133 * Truncate entire record.
@@ -7204,8 +7215,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
7204 !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) || 7215 !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
7205 !ocfs2_supports_inline_data(osb)) { 7216 !ocfs2_supports_inline_data(osb)) {
7206 ocfs2_error(inode->i_sb, 7217 ocfs2_error(inode->i_sb,
7207 "Inline data flags for inode %llu don't agree! " 7218 "Inline data flags for inode %llu don't agree! Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
7208 "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
7209 (unsigned long long)OCFS2_I(inode)->ip_blkno, 7219 (unsigned long long)OCFS2_I(inode)->ip_blkno,
7210 le16_to_cpu(di->i_dyn_features), 7220 le16_to_cpu(di->i_dyn_features),
7211 OCFS2_I(inode)->ip_dyn_features, 7221 OCFS2_I(inode)->ip_dyn_features,
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 0f5fd9db8194..64b11d90eca6 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -227,7 +227,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
227 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 227 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
228 228
229 if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) { 229 if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) {
230 ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag", 230 ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag\n",
231 (unsigned long long)OCFS2_I(inode)->ip_blkno); 231 (unsigned long long)OCFS2_I(inode)->ip_blkno);
232 return -EROFS; 232 return -EROFS;
233 } 233 }
@@ -237,7 +237,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
237 if (size > PAGE_CACHE_SIZE || 237 if (size > PAGE_CACHE_SIZE ||
238 size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) { 238 size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) {
239 ocfs2_error(inode->i_sb, 239 ocfs2_error(inode->i_sb,
240 "Inode %llu has with inline data has bad size: %Lu", 240 "Inode %llu has with inline data has bad size: %Lu\n",
241 (unsigned long long)OCFS2_I(inode)->ip_blkno, 241 (unsigned long long)OCFS2_I(inode)->ip_blkno,
242 (unsigned long long)size); 242 (unsigned long long)size);
243 return -EROFS; 243 return -EROFS;
@@ -533,10 +533,14 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
533 533
534 inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); 534 inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
535 535
536 down_read(&OCFS2_I(inode)->ip_alloc_sem);
537
536 /* This figures out the size of the next contiguous block, and 538 /* This figures out the size of the next contiguous block, and
537 * our logical offset */ 539 * our logical offset */
538 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, 540 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
539 &contig_blocks, &ext_flags); 541 &contig_blocks, &ext_flags);
542 up_read(&OCFS2_I(inode)->ip_alloc_sem);
543
540 if (ret) { 544 if (ret) {
541 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", 545 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
542 (unsigned long long)iblock); 546 (unsigned long long)iblock);
@@ -557,6 +561,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
557 561
558 alloc_locked = 1; 562 alloc_locked = 1;
559 563
564 down_write(&OCFS2_I(inode)->ip_alloc_sem);
565
560 /* fill hole, allocate blocks can't be larger than the size 566 /* fill hole, allocate blocks can't be larger than the size
561 * of the hole */ 567 * of the hole */
562 clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len); 568 clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
@@ -569,6 +575,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
569 ret = ocfs2_extend_allocation(inode, cpos, 575 ret = ocfs2_extend_allocation(inode, cpos,
570 clusters_to_alloc, 0); 576 clusters_to_alloc, 0);
571 if (ret < 0) { 577 if (ret < 0) {
578 up_write(&OCFS2_I(inode)->ip_alloc_sem);
572 mlog_errno(ret); 579 mlog_errno(ret);
573 goto bail; 580 goto bail;
574 } 581 }
@@ -576,11 +583,13 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
576 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, 583 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
577 &contig_blocks, &ext_flags); 584 &contig_blocks, &ext_flags);
578 if (ret < 0) { 585 if (ret < 0) {
586 up_write(&OCFS2_I(inode)->ip_alloc_sem);
579 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", 587 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
580 (unsigned long long)iblock); 588 (unsigned long long)iblock);
581 ret = -EIO; 589 ret = -EIO;
582 goto bail; 590 goto bail;
583 } 591 }
592 up_write(&OCFS2_I(inode)->ip_alloc_sem);
584 } 593 }
585 594
586 /* 595 /*
@@ -627,10 +636,13 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
627 mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); 636 mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
628 } 637 }
629 638
630 ocfs2_iocb_clear_rw_locked(iocb); 639 /* Let rw unlock to be done later to protect append direct io write */
640 if (offset + bytes <= i_size_read(inode)) {
641 ocfs2_iocb_clear_rw_locked(iocb);
631 642
632 level = ocfs2_iocb_rw_locked_level(iocb); 643 level = ocfs2_iocb_rw_locked_level(iocb);
633 ocfs2_rw_unlock(inode, level); 644 ocfs2_rw_unlock(inode, level);
645 }
634} 646}
635 647
636static int ocfs2_releasepage(struct page *page, gfp_t wait) 648static int ocfs2_releasepage(struct page *page, gfp_t wait)
@@ -832,12 +844,17 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
832 844
833 /* zeroing out the previously allocated cluster tail 845 /* zeroing out the previously allocated cluster tail
834 * that but not zeroed */ 846 * that but not zeroed */
835 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 847 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
848 down_read(&OCFS2_I(inode)->ip_alloc_sem);
836 ret = ocfs2_direct_IO_zero_extend(osb, inode, offset, 849 ret = ocfs2_direct_IO_zero_extend(osb, inode, offset,
837 zero_len_tail, cluster_align_tail); 850 zero_len_tail, cluster_align_tail);
838 else 851 up_read(&OCFS2_I(inode)->ip_alloc_sem);
852 } else {
853 down_write(&OCFS2_I(inode)->ip_alloc_sem);
839 ret = ocfs2_direct_IO_extend_no_holes(osb, inode, 854 ret = ocfs2_direct_IO_extend_no_holes(osb, inode,
840 offset); 855 offset);
856 up_write(&OCFS2_I(inode)->ip_alloc_sem);
857 }
841 if (ret < 0) { 858 if (ret < 0) {
842 mlog_errno(ret); 859 mlog_errno(ret);
843 ocfs2_inode_unlock(inode, 1); 860 ocfs2_inode_unlock(inode, 1);
@@ -857,7 +874,8 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
857 written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, 874 written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
858 offset, ocfs2_direct_IO_get_blocks, 875 offset, ocfs2_direct_IO_get_blocks,
859 ocfs2_dio_end_io, NULL, 0); 876 ocfs2_dio_end_io, NULL, 0);
860 if (unlikely(written < 0)) { 877 /* overwrite aio may return -EIOCBQUEUED, and it is not an error */
878 if ((written < 0) && (written != -EIOCBQUEUED)) {
861 loff_t i_size = i_size_read(inode); 879 loff_t i_size = i_size_read(inode);
862 880
863 if (offset + count > i_size) { 881 if (offset + count > i_size) {
@@ -876,12 +894,14 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
876 894
877 ocfs2_inode_unlock(inode, 1); 895 ocfs2_inode_unlock(inode, 1);
878 brelse(di_bh); 896 brelse(di_bh);
897 di_bh = NULL;
879 goto clean_orphan; 898 goto clean_orphan;
880 } 899 }
881 } 900 }
882 901
883 ocfs2_inode_unlock(inode, 1); 902 ocfs2_inode_unlock(inode, 1);
884 brelse(di_bh); 903 brelse(di_bh);
904 di_bh = NULL;
885 905
886 ret = jbd2_journal_force_commit(journal); 906 ret = jbd2_journal_force_commit(journal);
887 if (ret < 0) 907 if (ret < 0)
@@ -936,10 +956,12 @@ clean_orphan:
936 if (tmp_ret < 0) { 956 if (tmp_ret < 0) {
937 ret = tmp_ret; 957 ret = tmp_ret;
938 mlog_errno(ret); 958 mlog_errno(ret);
959 brelse(di_bh);
939 goto out; 960 goto out;
940 } 961 }
941 962
942 ocfs2_inode_unlock(inode, 1); 963 ocfs2_inode_unlock(inode, 1);
964 brelse(di_bh);
943 965
944 tmp_ret = jbd2_journal_force_commit(journal); 966 tmp_ret = jbd2_journal_force_commit(journal);
945 if (tmp_ret < 0) { 967 if (tmp_ret < 0) {
@@ -2185,10 +2207,7 @@ try_again:
2185 if (ret) 2207 if (ret)
2186 goto out_commit; 2208 goto out_commit;
2187 } 2209 }
2188 /* 2210
2189 * We don't want this to fail in ocfs2_write_end(), so do it
2190 * here.
2191 */
2192 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, 2211 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
2193 OCFS2_JOURNAL_ACCESS_WRITE); 2212 OCFS2_JOURNAL_ACCESS_WRITE);
2194 if (ret) { 2213 if (ret) {
@@ -2345,7 +2364,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
2345 loff_t pos, unsigned len, unsigned copied, 2364 loff_t pos, unsigned len, unsigned copied,
2346 struct page *page, void *fsdata) 2365 struct page *page, void *fsdata)
2347{ 2366{
2348 int i; 2367 int i, ret;
2349 unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); 2368 unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
2350 struct inode *inode = mapping->host; 2369 struct inode *inode = mapping->host;
2351 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2370 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -2354,6 +2373,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
2354 handle_t *handle = wc->w_handle; 2373 handle_t *handle = wc->w_handle;
2355 struct page *tmppage; 2374 struct page *tmppage;
2356 2375
2376 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
2377 OCFS2_JOURNAL_ACCESS_WRITE);
2378 if (ret) {
2379 copied = ret;
2380 mlog_errno(ret);
2381 goto out;
2382 }
2383
2357 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 2384 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
2358 ocfs2_write_end_inline(inode, pos, len, &copied, di, wc); 2385 ocfs2_write_end_inline(inode, pos, len, &copied, di, wc);
2359 goto out_write_size; 2386 goto out_write_size;
@@ -2409,6 +2436,7 @@ out_write_size:
2409 ocfs2_update_inode_fsync_trans(handle, inode, 1); 2436 ocfs2_update_inode_fsync_trans(handle, inode, 1);
2410 ocfs2_journal_dirty(handle, wc->w_di_bh); 2437 ocfs2_journal_dirty(handle, wc->w_di_bh);
2411 2438
2439out:
2412 /* unlock pages before dealloc since it needs acquiring j_trans_barrier 2440 /* unlock pages before dealloc since it needs acquiring j_trans_barrier
2413 * lock, or it will cause a deadlock since journal commit threads holds 2441 * lock, or it will cause a deadlock since journal commit threads holds
2414 * this lock and will ask for the page lock when flushing the data. 2442 * this lock and will ask for the page lock when flushing the data.
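
The aops.c hunks above bracket the direct-IO extent-map lookups and the hole-filling allocation with ip_alloc_sem. A condensed sketch of the locking pattern, simplified from the hunks and assuming the usual ocfs2 inode context:

	/* Shared lock for a read-only extent map lookup. */
	down_read(&OCFS2_I(inode)->ip_alloc_sem);
	ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
					  &contig_blocks, &ext_flags);
	up_read(&OCFS2_I(inode)->ip_alloc_sem);

	/* Exclusive lock while extending the allocation to fill a hole. */
	down_write(&OCFS2_I(inode)->ip_alloc_sem);
	ret = ocfs2_extend_allocation(inode, cpos, clusters_to_alloc, 0);
	up_write(&OCFS2_I(inode)->ip_alloc_sem);
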
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 1edcb141f639..fe50ded1b4ce 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -316,6 +316,12 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
316 bh = bhs[i]; 316 bh = bhs[i];
317 317
318 if (!(flags & OCFS2_BH_READAHEAD)) { 318 if (!(flags & OCFS2_BH_READAHEAD)) {
319 if (status) {
320 /* Clear the rest of the buffers on error */
321 put_bh(bh);
322 bhs[i] = NULL;
323 continue;
324 }
319 /* We know this can't have changed as we hold the 325 /* We know this can't have changed as we hold the
320 * owner sem. Avoid doing any work on the bh if the 326 * owner sem. Avoid doing any work on the bh if the
321 * journal has it. */ 327 * journal has it. */
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 140de3c93d2e..fa15debcc02b 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -36,7 +36,7 @@
36#include <linux/debugfs.h> 36#include <linux/debugfs.h>
37#include <linux/slab.h> 37#include <linux/slab.h>
38#include <linux/bitmap.h> 38#include <linux/bitmap.h>
39 39#include <linux/ktime.h>
40#include "heartbeat.h" 40#include "heartbeat.h"
41#include "tcp.h" 41#include "tcp.h"
42#include "nodemanager.h" 42#include "nodemanager.h"
@@ -1060,37 +1060,6 @@ bail:
1060 return ret; 1060 return ret;
1061} 1061}
1062 1062
1063/* Subtract b from a, storing the result in a. a *must* have a larger
1064 * value than b. */
1065static void o2hb_tv_subtract(struct timeval *a,
1066 struct timeval *b)
1067{
1068 /* just return 0 when a is after b */
1069 if (a->tv_sec < b->tv_sec ||
1070 (a->tv_sec == b->tv_sec && a->tv_usec < b->tv_usec)) {
1071 a->tv_sec = 0;
1072 a->tv_usec = 0;
1073 return;
1074 }
1075
1076 a->tv_sec -= b->tv_sec;
1077 a->tv_usec -= b->tv_usec;
1078 while ( a->tv_usec < 0 ) {
1079 a->tv_sec--;
1080 a->tv_usec += 1000000;
1081 }
1082}
1083
1084static unsigned int o2hb_elapsed_msecs(struct timeval *start,
1085 struct timeval *end)
1086{
1087 struct timeval res = *end;
1088
1089 o2hb_tv_subtract(&res, start);
1090
1091 return res.tv_sec * 1000 + res.tv_usec / 1000;
1092}
1093
1094/* 1063/*
1095 * we ride the region ref that the region dir holds. before the region 1064 * we ride the region ref that the region dir holds. before the region
1096 * dir is removed and drops it ref it will wait to tear down this 1065 * dir is removed and drops it ref it will wait to tear down this
@@ -1101,7 +1070,7 @@ static int o2hb_thread(void *data)
1101 int i, ret; 1070 int i, ret;
1102 struct o2hb_region *reg = data; 1071 struct o2hb_region *reg = data;
1103 struct o2hb_bio_wait_ctxt write_wc; 1072 struct o2hb_bio_wait_ctxt write_wc;
1104 struct timeval before_hb, after_hb; 1073 ktime_t before_hb, after_hb;
1105 unsigned int elapsed_msec; 1074 unsigned int elapsed_msec;
1106 1075
1107 mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n"); 1076 mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n");
@@ -1118,18 +1087,18 @@ static int o2hb_thread(void *data)
1118 * hr_timeout_ms between disk writes. On busy systems 1087 * hr_timeout_ms between disk writes. On busy systems
1119 * this should result in a heartbeat which is less 1088 * this should result in a heartbeat which is less
1120 * likely to time itself out. */ 1089 * likely to time itself out. */
1121 do_gettimeofday(&before_hb); 1090 before_hb = ktime_get_real();
1122 1091
1123 ret = o2hb_do_disk_heartbeat(reg); 1092 ret = o2hb_do_disk_heartbeat(reg);
1124 1093
1125 do_gettimeofday(&after_hb); 1094 after_hb = ktime_get_real();
1126 elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); 1095
1096 elapsed_msec = (unsigned int)
1097 ktime_ms_delta(after_hb, before_hb);
1127 1098
1128 mlog(ML_HEARTBEAT, 1099 mlog(ML_HEARTBEAT,
1129 "start = %lu.%lu, end = %lu.%lu, msec = %u, ret = %d\n", 1100 "start = %lld, end = %lld, msec = %u, ret = %d\n",
1130 before_hb.tv_sec, (unsigned long) before_hb.tv_usec, 1101 before_hb.tv64, after_hb.tv64, elapsed_msec, ret);
1131 after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
1132 elapsed_msec, ret);
1133 1102
1134 if (!kthread_should_stop() && 1103 if (!kthread_should_stop() &&
1135 elapsed_msec < reg->hr_timeout_ms) { 1104 elapsed_msec < reg->hr_timeout_ms) {
@@ -1619,17 +1588,13 @@ static int o2hb_map_slot_data(struct o2hb_region *reg)
1619 struct o2hb_disk_slot *slot; 1588 struct o2hb_disk_slot *slot;
1620 1589
1621 reg->hr_tmp_block = kmalloc(reg->hr_block_bytes, GFP_KERNEL); 1590 reg->hr_tmp_block = kmalloc(reg->hr_block_bytes, GFP_KERNEL);
1622 if (reg->hr_tmp_block == NULL) { 1591 if (reg->hr_tmp_block == NULL)
1623 mlog_errno(-ENOMEM);
1624 return -ENOMEM; 1592 return -ENOMEM;
1625 }
1626 1593
1627 reg->hr_slots = kcalloc(reg->hr_blocks, 1594 reg->hr_slots = kcalloc(reg->hr_blocks,
1628 sizeof(struct o2hb_disk_slot), GFP_KERNEL); 1595 sizeof(struct o2hb_disk_slot), GFP_KERNEL);
1629 if (reg->hr_slots == NULL) { 1596 if (reg->hr_slots == NULL)
1630 mlog_errno(-ENOMEM);
1631 return -ENOMEM; 1597 return -ENOMEM;
1632 }
1633 1598
1634 for(i = 0; i < reg->hr_blocks; i++) { 1599 for(i = 0; i < reg->hr_blocks; i++) {
1635 slot = &reg->hr_slots[i]; 1600 slot = &reg->hr_slots[i];
@@ -1645,17 +1610,13 @@ static int o2hb_map_slot_data(struct o2hb_region *reg)
1645 1610
1646 reg->hr_slot_data = kcalloc(reg->hr_num_pages, sizeof(struct page *), 1611 reg->hr_slot_data = kcalloc(reg->hr_num_pages, sizeof(struct page *),
1647 GFP_KERNEL); 1612 GFP_KERNEL);
1648 if (!reg->hr_slot_data) { 1613 if (!reg->hr_slot_data)
1649 mlog_errno(-ENOMEM);
1650 return -ENOMEM; 1614 return -ENOMEM;
1651 }
1652 1615
1653 for(i = 0; i < reg->hr_num_pages; i++) { 1616 for(i = 0; i < reg->hr_num_pages; i++) {
1654 page = alloc_page(GFP_KERNEL); 1617 page = alloc_page(GFP_KERNEL);
1655 if (!page) { 1618 if (!page)
1656 mlog_errno(-ENOMEM);
1657 return -ENOMEM; 1619 return -ENOMEM;
1658 }
1659 1620
1660 reg->hr_slot_data[i] = page; 1621 reg->hr_slot_data[i] = page;
1661 1622
@@ -1687,10 +1648,8 @@ static int o2hb_populate_slot_data(struct o2hb_region *reg)
1687 struct o2hb_disk_heartbeat_block *hb_block; 1648 struct o2hb_disk_heartbeat_block *hb_block;
1688 1649
1689 ret = o2hb_read_slots(reg, reg->hr_blocks); 1650 ret = o2hb_read_slots(reg, reg->hr_blocks);
1690 if (ret) { 1651 if (ret)
1691 mlog_errno(ret);
1692 goto out; 1652 goto out;
1693 }
1694 1653
1695 /* We only want to get an idea of the values initially in each 1654 /* We only want to get an idea of the values initially in each
1696 * slot, so we do no verification - o2hb_check_slot will 1655 * slot, so we do no verification - o2hb_check_slot will
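
The heartbeat.c conversion above replaces the open-coded timeval subtraction with ktime helpers. A minimal standalone sketch of the elapsed-milliseconds measurement (hypothetical function name; assumes <linux/ktime.h> is available, as the new include above provides):

	static unsigned int o2hb_elapsed_msecs_sketch(void)
	{
		ktime_t before_hb, after_hb;

		before_hb = ktime_get_real();
		/* ... one timed disk heartbeat round ... */
		after_hb = ktime_get_real();

		/* ktime_ms_delta() returns the signed difference in ms. */
		return (unsigned int)ktime_ms_delta(after_hb, before_hb);
	}
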
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 02878a83f0b4..ffecf89c8c1c 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -480,33 +480,26 @@ static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh)
480 480
481 trailer = ocfs2_trailer_from_bh(bh, dir->i_sb); 481 trailer = ocfs2_trailer_from_bh(bh, dir->i_sb);
482 if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) { 482 if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
483 rc = -EINVAL; 483 rc = ocfs2_error(dir->i_sb,
484 ocfs2_error(dir->i_sb, 484 "Invalid dirblock #%llu: signature = %.*s\n",
485 "Invalid dirblock #%llu: " 485 (unsigned long long)bh->b_blocknr, 7,
486 "signature = %.*s\n", 486 trailer->db_signature);
487 (unsigned long long)bh->b_blocknr, 7,
488 trailer->db_signature);
489 goto out; 487 goto out;
490 } 488 }
491 if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) { 489 if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) {
492 rc = -EINVAL; 490 rc = ocfs2_error(dir->i_sb,
493 ocfs2_error(dir->i_sb, 491 "Directory block #%llu has an invalid db_blkno of %llu\n",
494 "Directory block #%llu has an invalid " 492 (unsigned long long)bh->b_blocknr,
495 "db_blkno of %llu", 493 (unsigned long long)le64_to_cpu(trailer->db_blkno));
496 (unsigned long long)bh->b_blocknr,
497 (unsigned long long)le64_to_cpu(trailer->db_blkno));
498 goto out; 494 goto out;
499 } 495 }
500 if (le64_to_cpu(trailer->db_parent_dinode) != 496 if (le64_to_cpu(trailer->db_parent_dinode) !=
501 OCFS2_I(dir)->ip_blkno) { 497 OCFS2_I(dir)->ip_blkno) {
502 rc = -EINVAL; 498 rc = ocfs2_error(dir->i_sb,
503 ocfs2_error(dir->i_sb, 499 "Directory block #%llu on dinode #%llu has an invalid parent_dinode of %llu\n",
504 "Directory block #%llu on dinode " 500 (unsigned long long)bh->b_blocknr,
505 "#%llu has an invalid parent_dinode " 501 (unsigned long long)OCFS2_I(dir)->ip_blkno,
506 "of %llu", 502 (unsigned long long)le64_to_cpu(trailer->db_blkno));
507 (unsigned long long)bh->b_blocknr,
508 (unsigned long long)OCFS2_I(dir)->ip_blkno,
509 (unsigned long long)le64_to_cpu(trailer->db_blkno));
510 goto out; 503 goto out;
511 } 504 }
512out: 505out:
@@ -604,14 +597,13 @@ static int ocfs2_validate_dx_root(struct super_block *sb,
604 } 597 }
605 598
606 if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) { 599 if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) {
607 ocfs2_error(sb, 600 ret = ocfs2_error(sb,
608 "Dir Index Root # %llu has bad signature %.*s", 601 "Dir Index Root # %llu has bad signature %.*s\n",
609 (unsigned long long)le64_to_cpu(dx_root->dr_blkno), 602 (unsigned long long)le64_to_cpu(dx_root->dr_blkno),
610 7, dx_root->dr_signature); 603 7, dx_root->dr_signature);
611 return -EINVAL;
612 } 604 }
613 605
614 return 0; 606 return ret;
615} 607}
616 608
617static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di, 609static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di,
@@ -648,12 +640,11 @@ static int ocfs2_validate_dx_leaf(struct super_block *sb,
648 } 640 }
649 641
650 if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) { 642 if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) {
651 ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s", 643 ret = ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s\n",
652 7, dx_leaf->dl_signature); 644 7, dx_leaf->dl_signature);
653 return -EROFS;
654 } 645 }
655 646
656 return 0; 647 return ret;
657} 648}
658 649
659static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno, 650static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno,
@@ -812,11 +803,10 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
812 el = &eb->h_list; 803 el = &eb->h_list;
813 804
814 if (el->l_tree_depth) { 805 if (el->l_tree_depth) {
815 ocfs2_error(inode->i_sb, 806 ret = ocfs2_error(inode->i_sb,
816 "Inode %lu has non zero tree depth in " 807 "Inode %lu has non zero tree depth in btree tree block %llu\n",
817 "btree tree block %llu\n", inode->i_ino, 808 inode->i_ino,
818 (unsigned long long)eb_bh->b_blocknr); 809 (unsigned long long)eb_bh->b_blocknr);
819 ret = -EROFS;
820 goto out; 810 goto out;
821 } 811 }
822 } 812 }
@@ -832,11 +822,11 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
832 } 822 }
833 823
834 if (!found) { 824 if (!found) {
835 ocfs2_error(inode->i_sb, "Inode %lu has bad extent " 825 ret = ocfs2_error(inode->i_sb,
836 "record (%u, %u, 0) in btree", inode->i_ino, 826 "Inode %lu has bad extent record (%u, %u, 0) in btree\n",
837 le32_to_cpu(rec->e_cpos), 827 inode->i_ino,
838 ocfs2_rec_clusters(el, rec)); 828 le32_to_cpu(rec->e_cpos),
839 ret = -EROFS; 829 ocfs2_rec_clusters(el, rec));
840 goto out; 830 goto out;
841 } 831 }
842 832
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 7df88a6dd626..6918f30d02cd 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1465,39 +1465,46 @@ static int dlm_request_join(struct dlm_ctxt *dlm,
1465 if (status == -ENOPROTOOPT) { 1465 if (status == -ENOPROTOOPT) {
1466 status = 0; 1466 status = 0;
1467 *response = JOIN_OK_NO_MAP; 1467 *response = JOIN_OK_NO_MAP;
1468 } else if (packet.code == JOIN_DISALLOW ||
1469 packet.code == JOIN_OK_NO_MAP) {
1470 *response = packet.code;
1471 } else if (packet.code == JOIN_PROTOCOL_MISMATCH) {
1472 mlog(ML_NOTICE,
1473 "This node requested DLM locking protocol %u.%u and "
1474 "filesystem locking protocol %u.%u. At least one of "
1475 "the protocol versions on node %d is not compatible, "
1476 "disconnecting\n",
1477 dlm->dlm_locking_proto.pv_major,
1478 dlm->dlm_locking_proto.pv_minor,
1479 dlm->fs_locking_proto.pv_major,
1480 dlm->fs_locking_proto.pv_minor,
1481 node);
1482 status = -EPROTO;
1483 *response = packet.code;
1484 } else if (packet.code == JOIN_OK) {
1485 *response = packet.code;
1486 /* Use the same locking protocol as the remote node */
1487 dlm->dlm_locking_proto.pv_minor = packet.dlm_minor;
1488 dlm->fs_locking_proto.pv_minor = packet.fs_minor;
1489 mlog(0,
1490 "Node %d responds JOIN_OK with DLM locking protocol "
1491 "%u.%u and fs locking protocol %u.%u\n",
1492 node,
1493 dlm->dlm_locking_proto.pv_major,
1494 dlm->dlm_locking_proto.pv_minor,
1495 dlm->fs_locking_proto.pv_major,
1496 dlm->fs_locking_proto.pv_minor);
1497 } else { 1468 } else {
1498 status = -EINVAL; 1469 *response = packet.code;
1499 mlog(ML_ERROR, "invalid response %d from node %u\n", 1470 switch (packet.code) {
1500 packet.code, node); 1471 case JOIN_DISALLOW:
1472 case JOIN_OK_NO_MAP:
1473 break;
1474 case JOIN_PROTOCOL_MISMATCH:
1475 mlog(ML_NOTICE,
1476 "This node requested DLM locking protocol %u.%u and "
1477 "filesystem locking protocol %u.%u. At least one of "
1478 "the protocol versions on node %d is not compatible, "
1479 "disconnecting\n",
1480 dlm->dlm_locking_proto.pv_major,
1481 dlm->dlm_locking_proto.pv_minor,
1482 dlm->fs_locking_proto.pv_major,
1483 dlm->fs_locking_proto.pv_minor,
1484 node);
1485 status = -EPROTO;
1486 break;
1487 case JOIN_OK:
1488 /* Use the same locking protocol as the remote node */
1489 dlm->dlm_locking_proto.pv_minor = packet.dlm_minor;
1490 dlm->fs_locking_proto.pv_minor = packet.fs_minor;
1491 mlog(0,
1492 "Node %d responds JOIN_OK with DLM locking protocol "
1493 "%u.%u and fs locking protocol %u.%u\n",
1494 node,
1495 dlm->dlm_locking_proto.pv_major,
1496 dlm->dlm_locking_proto.pv_minor,
1497 dlm->fs_locking_proto.pv_major,
1498 dlm->fs_locking_proto.pv_minor);
1499 break;
1500 default:
1501 status = -EINVAL;
1502 mlog(ML_ERROR, "invalid response %d from node %u\n",
1503 packet.code, node);
1504 /* Reset response to JOIN_DISALLOW */
1505 *response = JOIN_DISALLOW;
1506 break;
1507 }
1501 } 1508 }
1502 1509
1503 mlog(0, "status %d, node %d response is %d\n", status, node, 1510 mlog(0, "status %d, node %d response is %d\n", status, node,
@@ -1725,12 +1732,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
1725 1732
1726 o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB, 1733 o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
1727 dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI); 1734 dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
1735 o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
1736 dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
1737
1728 status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down); 1738 status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down);
1729 if (status) 1739 if (status)
1730 goto bail; 1740 goto bail;
1731 1741
1732 o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
1733 dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
1734 status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up); 1742 status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up);
1735 if (status) 1743 if (status)
1736 goto bail; 1744 goto bail;
@@ -1845,8 +1853,6 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
1845 sizeof(struct dlm_exit_domain), 1853 sizeof(struct dlm_exit_domain),
1846 dlm_begin_exit_domain_handler, 1854 dlm_begin_exit_domain_handler,
1847 dlm, NULL, &dlm->dlm_domain_handlers); 1855 dlm, NULL, &dlm->dlm_domain_handlers);
1848 if (status)
1849 goto bail;
1850 1856
1851bail: 1857bail:
1852 if (status) 1858 if (status)
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index fdf4b41d0609..46b8b2bbc95a 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -498,16 +498,6 @@ static void dlm_lockres_release(struct kref *kref)
498 mlog(0, "destroying lockres %.*s\n", res->lockname.len, 498 mlog(0, "destroying lockres %.*s\n", res->lockname.len,
499 res->lockname.name); 499 res->lockname.name);
500 500
501 spin_lock(&dlm->track_lock);
502 if (!list_empty(&res->tracking))
503 list_del_init(&res->tracking);
504 else {
505 mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
506 res->lockname.len, res->lockname.name);
507 dlm_print_one_lock_resource(res);
508 }
509 spin_unlock(&dlm->track_lock);
510
511 atomic_dec(&dlm->res_cur_count); 501 atomic_dec(&dlm->res_cur_count);
512 502
513 if (!hlist_unhashed(&res->hash_node) || 503 if (!hlist_unhashed(&res->hash_node) ||
@@ -795,8 +785,18 @@ lookup:
795 dlm_lockres_grab_inflight_ref(dlm, tmpres); 785 dlm_lockres_grab_inflight_ref(dlm, tmpres);
796 786
797 spin_unlock(&tmpres->spinlock); 787 spin_unlock(&tmpres->spinlock);
798 if (res) 788 if (res) {
789 spin_lock(&dlm->track_lock);
790 if (!list_empty(&res->tracking))
791 list_del_init(&res->tracking);
792 else
793 mlog(ML_ERROR, "Resource %.*s not "
794 "on the Tracking list\n",
795 res->lockname.len,
796 res->lockname.name);
797 spin_unlock(&dlm->track_lock);
799 dlm_lockres_put(res); 798 dlm_lockres_put(res);
799 }
800 res = tmpres; 800 res = tmpres;
801 goto leave; 801 goto leave;
802 } 802 }
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index ce12e0b1a31f..d0e436dc6437 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1776,7 +1776,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1776 struct dlm_migratable_lockres *mres) 1776 struct dlm_migratable_lockres *mres)
1777{ 1777{
1778 struct dlm_migratable_lock *ml; 1778 struct dlm_migratable_lock *ml;
1779 struct list_head *queue, *iter; 1779 struct list_head *queue;
1780 struct list_head *tmpq = NULL; 1780 struct list_head *tmpq = NULL;
1781 struct dlm_lock *newlock = NULL; 1781 struct dlm_lock *newlock = NULL;
1782 struct dlm_lockstatus *lksb = NULL; 1782 struct dlm_lockstatus *lksb = NULL;
@@ -1821,9 +1821,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1821 spin_lock(&res->spinlock); 1821 spin_lock(&res->spinlock);
1822 for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) { 1822 for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) {
1823 tmpq = dlm_list_idx_to_ptr(res, j); 1823 tmpq = dlm_list_idx_to_ptr(res, j);
1824 list_for_each(iter, tmpq) { 1824 list_for_each_entry(lock, tmpq, list) {
1825 lock = list_entry(iter,
1826 struct dlm_lock, list);
1827 if (lock->ml.cookie == ml->cookie) 1825 if (lock->ml.cookie == ml->cookie)
1828 break; 1826 break;
1829 lock = NULL; 1827 lock = NULL;
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 69aac6f088ad..2e5e6d5fffe8 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -211,6 +211,16 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
211 211
212 __dlm_unhash_lockres(dlm, res); 212 __dlm_unhash_lockres(dlm, res);
213 213
214 spin_lock(&dlm->track_lock);
215 if (!list_empty(&res->tracking))
216 list_del_init(&res->tracking);
217 else {
218 mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
219 res->lockname.len, res->lockname.name);
220 __dlm_print_one_lock_resource(res);
221 }
222 spin_unlock(&dlm->track_lock);
223
214 /* lockres is not in the hash now. drop the flag and wake up 224 /* lockres is not in the hash now. drop the flag and wake up
215 * any processes waiting in dlm_get_lock_resource. */ 225 * any processes waiting in dlm_get_lock_resource. */
216 if (!master) { 226 if (!master) {
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 23157e40dd74..1c91103c1333 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3035,8 +3035,6 @@ local:
3035 ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb); 3035 ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb);
3036 3036
3037 osb->cconn = conn; 3037 osb->cconn = conn;
3038
3039 status = 0;
3040bail: 3038bail:
3041 if (status < 0) { 3039 if (status < 0) {
3042 ocfs2_dlm_shutdown_debug(osb); 3040 ocfs2_dlm_shutdown_debug(osb);
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 767370b656ca..e4719e0a3f99 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -305,8 +305,8 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
305 305
306 if (el->l_tree_depth) { 306 if (el->l_tree_depth) {
307 ocfs2_error(inode->i_sb, 307 ocfs2_error(inode->i_sb,
308 "Inode %lu has non zero tree depth in " 308 "Inode %lu has non zero tree depth in leaf block %llu\n",
309 "leaf block %llu\n", inode->i_ino, 309 inode->i_ino,
310 (unsigned long long)eb_bh->b_blocknr); 310 (unsigned long long)eb_bh->b_blocknr);
311 ret = -EROFS; 311 ret = -EROFS;
312 goto out; 312 goto out;
@@ -441,8 +441,8 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
441 441
442 if (el->l_tree_depth) { 442 if (el->l_tree_depth) {
443 ocfs2_error(inode->i_sb, 443 ocfs2_error(inode->i_sb,
444 "Inode %lu has non zero tree depth in " 444 "Inode %lu has non zero tree depth in leaf block %llu\n",
445 "leaf block %llu\n", inode->i_ino, 445 inode->i_ino,
446 (unsigned long long)eb_bh->b_blocknr); 446 (unsigned long long)eb_bh->b_blocknr);
447 ret = -EROFS; 447 ret = -EROFS;
448 goto out; 448 goto out;
@@ -475,8 +475,9 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
475 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); 475 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
476 476
477 if (!rec->e_blkno) { 477 if (!rec->e_blkno) {
478 ocfs2_error(inode->i_sb, "Inode %lu has bad extent " 478 ocfs2_error(inode->i_sb,
479 "record (%u, %u, 0)", inode->i_ino, 479 "Inode %lu has bad extent record (%u, %u, 0)\n",
480 inode->i_ino,
480 le32_to_cpu(rec->e_cpos), 481 le32_to_cpu(rec->e_cpos),
481 ocfs2_rec_clusters(el, rec)); 482 ocfs2_rec_clusters(el, rec));
482 ret = -EROFS; 483 ret = -EROFS;
@@ -564,8 +565,8 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
564 565
565 if (el->l_tree_depth) { 566 if (el->l_tree_depth) {
566 ocfs2_error(inode->i_sb, 567 ocfs2_error(inode->i_sb,
567 "Inode %lu has non zero tree depth in " 568 "Inode %lu has non zero tree depth in xattr leaf block %llu\n",
568 "xattr leaf block %llu\n", inode->i_ino, 569 inode->i_ino,
569 (unsigned long long)eb_bh->b_blocknr); 570 (unsigned long long)eb_bh->b_blocknr);
570 ret = -EROFS; 571 ret = -EROFS;
571 goto out; 572 goto out;
@@ -582,8 +583,9 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
582 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); 583 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
583 584
584 if (!rec->e_blkno) { 585 if (!rec->e_blkno) {
585 ocfs2_error(inode->i_sb, "Inode %lu has bad extent " 586 ocfs2_error(inode->i_sb,
586 "record (%u, %u, 0) in xattr", inode->i_ino, 587 "Inode %lu has bad extent record (%u, %u, 0) in xattr\n",
588 inode->i_ino,
587 le32_to_cpu(rec->e_cpos), 589 le32_to_cpu(rec->e_cpos),
588 ocfs2_rec_clusters(el, rec)); 590 ocfs2_rec_clusters(el, rec));
589 ret = -EROFS; 591 ret = -EROFS;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 7210583b472f..0e5b4515f92e 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1130,6 +1130,7 @@ out:
1130int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) 1130int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1131{ 1131{
1132 int status = 0, size_change; 1132 int status = 0, size_change;
1133 int inode_locked = 0;
1133 struct inode *inode = d_inode(dentry); 1134 struct inode *inode = d_inode(dentry);
1134 struct super_block *sb = inode->i_sb; 1135 struct super_block *sb = inode->i_sb;
1135 struct ocfs2_super *osb = OCFS2_SB(sb); 1136 struct ocfs2_super *osb = OCFS2_SB(sb);
@@ -1178,6 +1179,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1178 mlog_errno(status); 1179 mlog_errno(status);
1179 goto bail_unlock_rw; 1180 goto bail_unlock_rw;
1180 } 1181 }
1182 inode_locked = 1;
1181 1183
1182 if (size_change) { 1184 if (size_change) {
1183 status = inode_newsize_ok(inode, attr->ia_size); 1185 status = inode_newsize_ok(inode, attr->ia_size);
@@ -1258,7 +1260,10 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1258bail_commit: 1260bail_commit:
1259 ocfs2_commit_trans(osb, handle); 1261 ocfs2_commit_trans(osb, handle);
1260bail_unlock: 1262bail_unlock:
1261 ocfs2_inode_unlock(inode, 1); 1263 if (status) {
1264 ocfs2_inode_unlock(inode, 1);
1265 inode_locked = 0;
1266 }
1262bail_unlock_rw: 1267bail_unlock_rw:
1263 if (size_change) 1268 if (size_change)
1264 ocfs2_rw_unlock(inode, 1); 1269 ocfs2_rw_unlock(inode, 1);
@@ -1274,6 +1279,8 @@ bail:
1274 if (status < 0) 1279 if (status < 0)
1275 mlog_errno(status); 1280 mlog_errno(status);
1276 } 1281 }
1282 if (inode_locked)
1283 ocfs2_inode_unlock(inode, 1);
1277 1284
1278 return status; 1285 return status;
1279} 1286}
@@ -2262,8 +2269,6 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
2262 ssize_t written = 0; 2269 ssize_t written = 0;
2263 ssize_t ret; 2270 ssize_t ret;
2264 size_t count = iov_iter_count(from), orig_count; 2271 size_t count = iov_iter_count(from), orig_count;
2265 loff_t old_size;
2266 u32 old_clusters;
2267 struct file *file = iocb->ki_filp; 2272 struct file *file = iocb->ki_filp;
2268 struct inode *inode = file_inode(file); 2273 struct inode *inode = file_inode(file);
2269 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2274 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -2271,6 +2276,8 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
2271 OCFS2_MOUNT_COHERENCY_BUFFERED); 2276 OCFS2_MOUNT_COHERENCY_BUFFERED);
2272 int unaligned_dio = 0; 2277 int unaligned_dio = 0;
2273 int dropped_dio = 0; 2278 int dropped_dio = 0;
2279 int append_write = ((iocb->ki_pos + count) >=
2280 i_size_read(inode) ? 1 : 0);
2274 2281
2275 trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, 2282 trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
2276 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2283 (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -2290,8 +2297,9 @@ relock:
2290 /* 2297 /*
2291 * Concurrent O_DIRECT writes are allowed with 2298 * Concurrent O_DIRECT writes are allowed with
2292 * mount_option "coherency=buffered". 2299 * mount_option "coherency=buffered".
2300 * For append write, we must take rw EX.
2293 */ 2301 */
2294 rw_level = (!direct_io || full_coherency); 2302 rw_level = (!direct_io || full_coherency || append_write);
2295 2303
2296 ret = ocfs2_rw_lock(inode, rw_level); 2304 ret = ocfs2_rw_lock(inode, rw_level);
2297 if (ret < 0) { 2305 if (ret < 0) {
@@ -2364,13 +2372,6 @@ relock:
2364 ocfs2_iocb_set_unaligned_aio(iocb); 2372 ocfs2_iocb_set_unaligned_aio(iocb);
2365 } 2373 }
2366 2374
2367 /*
2368 * To later detect whether a journal commit for sync writes is
2369 * necessary, we sample i_size, and cluster count here.
2370 */
2371 old_size = i_size_read(inode);
2372 old_clusters = OCFS2_I(inode)->ip_clusters;
2373
2374 /* communicate with ocfs2_dio_end_io */ 2375 /* communicate with ocfs2_dio_end_io */
2375 ocfs2_iocb_set_rw_locked(iocb, rw_level); 2376 ocfs2_iocb_set_rw_locked(iocb, rw_level);
2376 2377
@@ -2378,6 +2379,20 @@ relock:
2378 /* buffered aio wouldn't have proper lock coverage today */ 2379 /* buffered aio wouldn't have proper lock coverage today */
2379 BUG_ON(written == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT)); 2380 BUG_ON(written == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT));
2380 2381
2382 /*
 2383	 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in an ocfs2_dio_end_io
2384 * function pointer which is called when o_direct io completes so that
2385 * it can unlock our rw lock.
2386 * Unfortunately there are error cases which call end_io and others
 2387	 * that don't, so we don't have to unlock the rw_lock if either an
2388 * async dio is going to do it in the future or an end_io after an
2389 * error has already done it.
2390 */
2391 if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
2392 rw_level = -1;
2393 unaligned_dio = 0;
2394 }
2395
2381 if (unlikely(written <= 0)) 2396 if (unlikely(written <= 0))
2382 goto no_sync; 2397 goto no_sync;
2383 2398
@@ -2402,21 +2417,7 @@ relock:
2402 } 2417 }
2403 2418
2404no_sync: 2419no_sync:
2405 /* 2420 if (unaligned_dio && ocfs2_iocb_is_unaligned_aio(iocb)) {
2406 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
2407 * function pointer which is called when o_direct io completes so that
2408 * it can unlock our rw lock.
2409 * Unfortunately there are error cases which call end_io and others
2410 * that don't. so we don't have to unlock the rw_lock if either an
2411 * async dio is going to do it in the future or an end_io after an
2412 * error has already done it.
2413 */
2414 if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
2415 rw_level = -1;
2416 unaligned_dio = 0;
2417 }
2418
2419 if (unaligned_dio) {
2420 ocfs2_iocb_clear_unaligned_aio(iocb); 2421 ocfs2_iocb_clear_unaligned_aio(iocb);
2421 mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); 2422 mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
2422 } 2423 }
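
Aside (illustrative only, not part of the patch above): the ocfs2_setattr() hunk replaces the unconditional unlock in the bail path with an inode_locked flag so the cluster inode lock is dropped exactly once no matter which error path runs. A minimal user-space sketch of that idiom, using purely hypothetical names (demo_lock, do_setattr) rather than any kernel API:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;

/* Returns 0 on success, -1 on a simulated mid-operation failure. */
static int do_setattr(int fail_midway)
{
        int status = 0;
        int locked = 0;

        if (pthread_mutex_lock(&demo_lock) != 0)
                return -1;
        locked = 1;                     /* remember that the lock is held */

        if (fail_midway) {
                status = -1;
                goto bail;              /* no early unlock on this path */
        }

        /* ... work that must run under the lock would go here ... */

bail:
        if (locked)                     /* single unlock point for all paths */
                pthread_mutex_unlock(&demo_lock);
        return status;
}

int main(void)
{
        printf("success path: %d, failure path: %d\n",
               do_setattr(0), do_setattr(1));
        return 0;
}
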
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index b254416dc8d9..8f87e05ee25d 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -971,6 +971,7 @@ static void ocfs2_delete_inode(struct inode *inode)
971 int wipe, status; 971 int wipe, status;
972 sigset_t oldset; 972 sigset_t oldset;
973 struct buffer_head *di_bh = NULL; 973 struct buffer_head *di_bh = NULL;
974 struct ocfs2_dinode *di = NULL;
974 975
975 trace_ocfs2_delete_inode(inode->i_ino, 976 trace_ocfs2_delete_inode(inode->i_ino,
976 (unsigned long long)OCFS2_I(inode)->ip_blkno, 977 (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -1025,6 +1026,14 @@ static void ocfs2_delete_inode(struct inode *inode)
1025 goto bail_unlock_nfs_sync; 1026 goto bail_unlock_nfs_sync;
1026 } 1027 }
1027 1028
1029 di = (struct ocfs2_dinode *)di_bh->b_data;
 1030	/* Skip inode deletion and wait for the dio orphan entry to be
 1031	 * recovered first */
1032 if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
1033 ocfs2_cleanup_delete_inode(inode, 0);
1034 goto bail_unlock_inode;
1035 }
1036
1028 /* Query the cluster. This will be the final decision made 1037 /* Query the cluster. This will be the final decision made
1029 * before we go ahead and wipe the inode. */ 1038 * before we go ahead and wipe the inode. */
1030 status = ocfs2_query_inode_wipe(inode, di_bh, &wipe); 1039 status = ocfs2_query_inode_wipe(inode, di_bh, &wipe);
@@ -1191,17 +1200,19 @@ void ocfs2_evict_inode(struct inode *inode)
1191int ocfs2_drop_inode(struct inode *inode) 1200int ocfs2_drop_inode(struct inode *inode)
1192{ 1201{
1193 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1202 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1194 int res;
1195 1203
1196 trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno, 1204 trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno,
1197 inode->i_nlink, oi->ip_flags); 1205 inode->i_nlink, oi->ip_flags);
1198 1206
1199 if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) 1207 assert_spin_locked(&inode->i_lock);
1200 res = 1; 1208 inode->i_state |= I_WILL_FREE;
1201 else 1209 spin_unlock(&inode->i_lock);
1202 res = generic_drop_inode(inode); 1210 write_inode_now(inode, 1);
1211 spin_lock(&inode->i_lock);
1212 WARN_ON(inode->i_state & I_NEW);
1213 inode->i_state &= ~I_WILL_FREE;
1203 1214
1204 return res; 1215 return 1;
1205} 1216}
1206 1217
1207/* 1218/*
@@ -1350,32 +1361,32 @@ int ocfs2_validate_inode_block(struct super_block *sb,
1350 rc = -EINVAL; 1361 rc = -EINVAL;
1351 1362
1352 if (!OCFS2_IS_VALID_DINODE(di)) { 1363 if (!OCFS2_IS_VALID_DINODE(di)) {
1353 ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n", 1364 rc = ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n",
1354 (unsigned long long)bh->b_blocknr, 7, 1365 (unsigned long long)bh->b_blocknr, 7,
1355 di->i_signature); 1366 di->i_signature);
1356 goto bail; 1367 goto bail;
1357 } 1368 }
1358 1369
1359 if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { 1370 if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
1360 ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n", 1371 rc = ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n",
1361 (unsigned long long)bh->b_blocknr, 1372 (unsigned long long)bh->b_blocknr,
1362 (unsigned long long)le64_to_cpu(di->i_blkno)); 1373 (unsigned long long)le64_to_cpu(di->i_blkno));
1363 goto bail; 1374 goto bail;
1364 } 1375 }
1365 1376
1366 if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { 1377 if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
1367 ocfs2_error(sb, 1378 rc = ocfs2_error(sb,
1368 "Invalid dinode #%llu: OCFS2_VALID_FL not set\n", 1379 "Invalid dinode #%llu: OCFS2_VALID_FL not set\n",
1369 (unsigned long long)bh->b_blocknr); 1380 (unsigned long long)bh->b_blocknr);
1370 goto bail; 1381 goto bail;
1371 } 1382 }
1372 1383
1373 if (le32_to_cpu(di->i_fs_generation) != 1384 if (le32_to_cpu(di->i_fs_generation) !=
1374 OCFS2_SB(sb)->fs_generation) { 1385 OCFS2_SB(sb)->fs_generation) {
1375 ocfs2_error(sb, 1386 rc = ocfs2_error(sb,
1376 "Invalid dinode #%llu: fs_generation is %u\n", 1387 "Invalid dinode #%llu: fs_generation is %u\n",
1377 (unsigned long long)bh->b_blocknr, 1388 (unsigned long long)bh->b_blocknr,
1378 le32_to_cpu(di->i_fs_generation)); 1389 le32_to_cpu(di->i_fs_generation));
1379 goto bail; 1390 goto bail;
1380 } 1391 }
1381 1392
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 5e86b247c821..ca3431ee7f24 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -81,8 +81,6 @@ struct ocfs2_inode_info
81 tid_t i_sync_tid; 81 tid_t i_sync_tid;
82 tid_t i_datasync_tid; 82 tid_t i_datasync_tid;
83 83
84 wait_queue_head_t append_dio_wq;
85
86 struct dquot *i_dquot[MAXQUOTAS]; 84 struct dquot *i_dquot[MAXQUOTAS];
87}; 85};
88 86
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 7c099f7032fd..ff82b28462a6 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -374,7 +374,7 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
374 mlog_errno(PTR_ERR(handle)); 374 mlog_errno(PTR_ERR(handle));
375 375
376 if (is_journal_aborted(journal)) { 376 if (is_journal_aborted(journal)) {
377 ocfs2_abort(osb->sb, "Detected aborted journal"); 377 ocfs2_abort(osb->sb, "Detected aborted journal\n");
378 handle = ERR_PTR(-EROFS); 378 handle = ERR_PTR(-EROFS);
379 } 379 }
380 } else { 380 } else {
@@ -668,7 +668,23 @@ static int __ocfs2_journal_access(handle_t *handle,
668 mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n"); 668 mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n");
669 mlog(ML_ERROR, "b_blocknr=%llu\n", 669 mlog(ML_ERROR, "b_blocknr=%llu\n",
670 (unsigned long long)bh->b_blocknr); 670 (unsigned long long)bh->b_blocknr);
671 BUG(); 671
672 lock_buffer(bh);
673 /*
674 * A previous attempt to write this buffer head failed.
675 * Nothing we can do but to retry the write and hope for
676 * the best.
677 */
678 if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) {
679 clear_buffer_write_io_error(bh);
680 set_buffer_uptodate(bh);
681 }
682
683 if (!buffer_uptodate(bh)) {
684 unlock_buffer(bh);
685 return -EIO;
686 }
687 unlock_buffer(bh);
672 } 688 }
673 689
674 /* Set the current transaction information on the ci so 690 /* Set the current transaction information on the ci so
@@ -2170,6 +2186,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
2170 iter = oi->ip_next_orphan; 2186 iter = oi->ip_next_orphan;
2171 oi->ip_next_orphan = NULL; 2187 oi->ip_next_orphan = NULL;
2172 2188
2189 mutex_lock(&inode->i_mutex);
2173 ret = ocfs2_rw_lock(inode, 1); 2190 ret = ocfs2_rw_lock(inode, 1);
2174 if (ret < 0) { 2191 if (ret < 0) {
2175 mlog_errno(ret); 2192 mlog_errno(ret);
@@ -2193,7 +2210,9 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
2193 * ocfs2_delete_inode. */ 2210 * ocfs2_delete_inode. */
2194 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; 2211 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
2195 spin_unlock(&oi->ip_lock); 2212 spin_unlock(&oi->ip_lock);
2196 } else if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) && 2213 }
2214
2215 if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) &&
2197 (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { 2216 (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
2198 ret = ocfs2_truncate_file(inode, di_bh, 2217 ret = ocfs2_truncate_file(inode, di_bh,
2199 i_size_read(inode)); 2218 i_size_read(inode));
@@ -2206,17 +2225,16 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
2206 ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0); 2225 ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0);
2207 if (ret) 2226 if (ret)
2208 mlog_errno(ret); 2227 mlog_errno(ret);
2209
2210 wake_up(&OCFS2_I(inode)->append_dio_wq);
2211 } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */ 2228 } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */
2212unlock_inode: 2229unlock_inode:
2213 ocfs2_inode_unlock(inode, 1); 2230 ocfs2_inode_unlock(inode, 1);
2231 brelse(di_bh);
2232 di_bh = NULL;
2214unlock_rw: 2233unlock_rw:
2215 ocfs2_rw_unlock(inode, 1); 2234 ocfs2_rw_unlock(inode, 1);
2216next: 2235next:
2236 mutex_unlock(&inode->i_mutex);
2217 iput(inode); 2237 iput(inode);
2218 brelse(di_bh);
2219 di_bh = NULL;
2220 inode = iter; 2238 inode = iter;
2221 } 2239 }
2222 2240
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 857bbbcd39f3..0a4457fb0711 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -665,8 +665,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
665#ifdef CONFIG_OCFS2_DEBUG_FS 665#ifdef CONFIG_OCFS2_DEBUG_FS
666 if (le32_to_cpu(alloc->id1.bitmap1.i_used) != 666 if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
667 ocfs2_local_alloc_count_bits(alloc)) { 667 ocfs2_local_alloc_count_bits(alloc)) {
668 ocfs2_error(osb->sb, "local alloc inode %llu says it has " 668 ocfs2_error(osb->sb, "local alloc inode %llu says it has %u used bits, but a count shows %u\n",
669 "%u used bits, but a count shows %u",
670 (unsigned long long)le64_to_cpu(alloc->i_blkno), 669 (unsigned long long)le64_to_cpu(alloc->i_blkno),
671 le32_to_cpu(alloc->id1.bitmap1.i_used), 670 le32_to_cpu(alloc->id1.bitmap1.i_used),
672 ocfs2_local_alloc_count_bits(alloc)); 671 ocfs2_local_alloc_count_bits(alloc));
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 56a768d06aa6..124471d26a73 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -99,11 +99,9 @@ static int __ocfs2_move_extent(handle_t *handle,
99 99
100 index = ocfs2_search_extent_list(el, cpos); 100 index = ocfs2_search_extent_list(el, cpos);
101 if (index == -1) { 101 if (index == -1) {
102 ocfs2_error(inode->i_sb, 102 ret = ocfs2_error(inode->i_sb,
103 "Inode %llu has an extent at cpos %u which can no " 103 "Inode %llu has an extent at cpos %u which can no longer be found\n",
104 "longer be found.\n", 104 (unsigned long long)ino, cpos);
105 (unsigned long long)ino, cpos);
106 ret = -EROFS;
107 goto out; 105 goto out;
108 } 106 }
109 107
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 948681e37cfd..b7dfac226b1e 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1035,11 +1035,6 @@ leave:
1035 if (handle) 1035 if (handle)
1036 ocfs2_commit_trans(osb, handle); 1036 ocfs2_commit_trans(osb, handle);
1037 1037
1038 if (child_locked)
1039 ocfs2_inode_unlock(inode, 1);
1040
1041 ocfs2_inode_unlock(dir, 1);
1042
1043 if (orphan_dir) { 1038 if (orphan_dir) {
1044 /* This was locked for us in ocfs2_prepare_orphan_dir() */ 1039 /* This was locked for us in ocfs2_prepare_orphan_dir() */
1045 ocfs2_inode_unlock(orphan_dir, 1); 1040 ocfs2_inode_unlock(orphan_dir, 1);
@@ -1047,6 +1042,11 @@ leave:
1047 iput(orphan_dir); 1042 iput(orphan_dir);
1048 } 1043 }
1049 1044
1045 if (child_locked)
1046 ocfs2_inode_unlock(inode, 1);
1047
1048 ocfs2_inode_unlock(dir, 1);
1049
1050 brelse(fe_bh); 1050 brelse(fe_bh);
1051 brelse(parent_node_bh); 1051 brelse(parent_node_bh);
1052 1052
@@ -1309,6 +1309,11 @@ static int ocfs2_rename(struct inode *old_dir,
1309 } 1309 }
1310 parents_locked = 1; 1310 parents_locked = 1;
1311 1311
1312 if (!new_dir->i_nlink) {
1313 status = -EACCES;
1314 goto bail;
1315 }
1316
1312 /* make sure both dirs have bhs 1317 /* make sure both dirs have bhs
1313 * get an extra ref on old_dir_bh if old==new */ 1318 * get an extra ref on old_dir_bh if old==new */
1314 if (!new_dir_bh) { 1319 if (!new_dir_bh) {
@@ -1569,12 +1574,25 @@ static int ocfs2_rename(struct inode *old_dir,
1569 status = ocfs2_find_entry(old_dentry->d_name.name, 1574 status = ocfs2_find_entry(old_dentry->d_name.name,
1570 old_dentry->d_name.len, old_dir, 1575 old_dentry->d_name.len, old_dir,
1571 &old_entry_lookup); 1576 &old_entry_lookup);
1572 if (status) 1577 if (status) {
1578 if (!is_journal_aborted(osb->journal->j_journal)) {
1579 ocfs2_error(osb->sb, "new entry %.*s is added, but old entry %.*s "
1580 "is not deleted.",
1581 new_dentry->d_name.len, new_dentry->d_name.name,
1582 old_dentry->d_name.len, old_dentry->d_name.name);
1583 }
1573 goto bail; 1584 goto bail;
1585 }
1574 1586
1575 status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup); 1587 status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup);
1576 if (status < 0) { 1588 if (status < 0) {
1577 mlog_errno(status); 1589 mlog_errno(status);
1590 if (!is_journal_aborted(osb->journal->j_journal)) {
1591 ocfs2_error(osb->sb, "new entry %.*s is added, but old entry %.*s "
1592 "is not deleted.",
1593 new_dentry->d_name.len, new_dentry->d_name.name,
1594 old_dentry->d_name.len, old_dentry->d_name.name);
1595 }
1578 goto bail; 1596 goto bail;
1579 } 1597 }
1580 1598
@@ -1633,21 +1651,9 @@ static int ocfs2_rename(struct inode *old_dir,
1633 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir); 1651 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
1634 status = 0; 1652 status = 0;
1635bail: 1653bail:
1636 if (rename_lock)
1637 ocfs2_rename_unlock(osb);
1638
1639 if (handle) 1654 if (handle)
1640 ocfs2_commit_trans(osb, handle); 1655 ocfs2_commit_trans(osb, handle);
1641 1656
1642 if (parents_locked)
1643 ocfs2_double_unlock(old_dir, new_dir);
1644
1645 if (old_child_locked)
1646 ocfs2_inode_unlock(old_inode, 1);
1647
1648 if (new_child_locked)
1649 ocfs2_inode_unlock(new_inode, 1);
1650
1651 if (orphan_dir) { 1657 if (orphan_dir) {
1652 /* This was locked for us in ocfs2_prepare_orphan_dir() */ 1658 /* This was locked for us in ocfs2_prepare_orphan_dir() */
1653 ocfs2_inode_unlock(orphan_dir, 1); 1659 ocfs2_inode_unlock(orphan_dir, 1);
@@ -1655,6 +1661,18 @@ bail:
1655 iput(orphan_dir); 1661 iput(orphan_dir);
1656 } 1662 }
1657 1663
1664 if (new_child_locked)
1665 ocfs2_inode_unlock(new_inode, 1);
1666
1667 if (old_child_locked)
1668 ocfs2_inode_unlock(old_inode, 1);
1669
1670 if (parents_locked)
1671 ocfs2_double_unlock(old_dir, new_dir);
1672
1673 if (rename_lock)
1674 ocfs2_rename_unlock(osb);
1675
1658 if (new_inode) 1676 if (new_inode)
1659 sync_mapping_buffers(old_inode->i_mapping); 1677 sync_mapping_buffers(old_inode->i_mapping);
1660 1678
@@ -2601,27 +2619,6 @@ leave:
2601 return status; 2619 return status;
2602} 2620}
2603 2621
2604static int ocfs2_dio_orphan_recovered(struct inode *inode)
2605{
2606 int ret;
2607 struct buffer_head *di_bh = NULL;
2608 struct ocfs2_dinode *di = NULL;
2609
2610 ret = ocfs2_inode_lock(inode, &di_bh, 1);
2611 if (ret < 0) {
2612 mlog_errno(ret);
2613 return 0;
2614 }
2615
2616 di = (struct ocfs2_dinode *) di_bh->b_data;
2617 ret = !(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL));
2618 ocfs2_inode_unlock(inode, 1);
2619 brelse(di_bh);
2620
2621 return ret;
2622}
2623
2624#define OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL 10000
2625int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, 2622int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
2626 struct inode *inode) 2623 struct inode *inode)
2627{ 2624{
@@ -2633,7 +2630,6 @@ int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
2633 handle_t *handle = NULL; 2630 handle_t *handle = NULL;
2634 struct ocfs2_dinode *di = NULL; 2631 struct ocfs2_dinode *di = NULL;
2635 2632
2636restart:
2637 status = ocfs2_inode_lock(inode, &di_bh, 1); 2633 status = ocfs2_inode_lock(inode, &di_bh, 1);
2638 if (status < 0) { 2634 if (status < 0) {
2639 mlog_errno(status); 2635 mlog_errno(status);
@@ -2643,15 +2639,21 @@ restart:
2643 di = (struct ocfs2_dinode *) di_bh->b_data; 2639 di = (struct ocfs2_dinode *) di_bh->b_data;
2644 /* 2640 /*
2645 * Another append dio crashed? 2641 * Another append dio crashed?
2646 * If so, wait for recovery first. 2642 * If so, manually recover it first.
2647 */ 2643 */
2648 if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { 2644 if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
2649 ocfs2_inode_unlock(inode, 1); 2645 status = ocfs2_truncate_file(inode, di_bh, i_size_read(inode));
2650 brelse(di_bh); 2646 if (status < 0) {
2651 wait_event_interruptible_timeout(OCFS2_I(inode)->append_dio_wq, 2647 if (status != -ENOSPC)
2652 ocfs2_dio_orphan_recovered(inode), 2648 mlog_errno(status);
2653 msecs_to_jiffies(OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL)); 2649 goto bail_unlock_inode;
2654 goto restart; 2650 }
2651
2652 status = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0);
2653 if (status < 0) {
2654 mlog_errno(status);
2655 goto bail_unlock_inode;
2656 }
2655 } 2657 }
2656 2658
2657 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir_inode, 2659 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir_inode,
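
Aside (illustrative only): the ocfs2_unlink()/ocfs2_rename() hunks above reorder the bail-path unlocks so that locks are released in the reverse of the order they were taken (child before parent, parents before the rename lock), keeping the lock hierarchy consistent on error paths. A tiny user-space sketch of the acquire-in-order / release-in-reverse pattern, with hypothetical names:

#include <pthread.h>
#include <stdio.h>

#define NLOCKS 3

static pthread_mutex_t locks[NLOCKS] = {
        PTHREAD_MUTEX_INITIALIZER,
        PTHREAD_MUTEX_INITIALIZER,
        PTHREAD_MUTEX_INITIALIZER,
};

/* Take locks in a fixed order; afterwards (or on failure) release the
 * ones already held in the reverse order of acquisition. */
static int take_all_then_work(void)
{
        int taken, i, status = 0;

        for (taken = 0; taken < NLOCKS; taken++) {
                if (pthread_mutex_lock(&locks[taken]) != 0) {
                        status = -1;
                        break;
                }
        }

        if (status == 0) {
                /* ... the rename-style work under all locks ... */
        }

        for (i = taken - 1; i >= 0; i--)
                pthread_mutex_unlock(&locks[i]);

        return status;
}

int main(void)
{
        printf("status: %d\n", take_all_then_work());
        return 0;
}
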
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 690ddc60189b..7a0126267847 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -286,6 +286,8 @@ enum ocfs2_mount_options
286 OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ 286 OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
287 287
288 OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */ 288 OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */
289 OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */
290 OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */
289}; 291};
290 292
291#define OCFS2_OSB_SOFT_RO 0x0001 293#define OCFS2_OSB_SOFT_RO 0x0001
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index bb07004df72a..8a54fd8a4fa5 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -138,8 +138,7 @@ static int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
138 138
139 if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) { 139 if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) {
140 ocfs2_error(inode->i_sb, 140 ocfs2_error(inode->i_sb,
141 "Quota file %llu is probably corrupted! Requested " 141 "Quota file %llu is probably corrupted! Requested to read block %Lu but file has size only %Lu\n",
142 "to read block %Lu but file has size only %Lu\n",
143 (unsigned long long)OCFS2_I(inode)->ip_blkno, 142 (unsigned long long)OCFS2_I(inode)->ip_blkno,
144 (unsigned long long)v_block, 143 (unsigned long long)v_block,
145 (unsigned long long)i_size_read(inode)); 144 (unsigned long long)i_size_read(inode));
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 7dc818b87cd8..e5d57cd32505 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -102,32 +102,30 @@ static int ocfs2_validate_refcount_block(struct super_block *sb,
102 102
103 103
104 if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) { 104 if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) {
105 ocfs2_error(sb, 105 rc = ocfs2_error(sb,
106 "Refcount block #%llu has bad signature %.*s", 106 "Refcount block #%llu has bad signature %.*s\n",
107 (unsigned long long)bh->b_blocknr, 7, 107 (unsigned long long)bh->b_blocknr, 7,
108 rb->rf_signature); 108 rb->rf_signature);
109 return -EINVAL; 109 goto out;
110 } 110 }
111 111
112 if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) { 112 if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) {
113 ocfs2_error(sb, 113 rc = ocfs2_error(sb,
114 "Refcount block #%llu has an invalid rf_blkno " 114 "Refcount block #%llu has an invalid rf_blkno of %llu\n",
115 "of %llu", 115 (unsigned long long)bh->b_blocknr,
116 (unsigned long long)bh->b_blocknr, 116 (unsigned long long)le64_to_cpu(rb->rf_blkno));
117 (unsigned long long)le64_to_cpu(rb->rf_blkno)); 117 goto out;
118 return -EINVAL;
119 } 118 }
120 119
121 if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) { 120 if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) {
122 ocfs2_error(sb, 121 rc = ocfs2_error(sb,
123 "Refcount block #%llu has an invalid " 122 "Refcount block #%llu has an invalid rf_fs_generation of #%u\n",
124 "rf_fs_generation of #%u", 123 (unsigned long long)bh->b_blocknr,
125 (unsigned long long)bh->b_blocknr, 124 le32_to_cpu(rb->rf_fs_generation));
126 le32_to_cpu(rb->rf_fs_generation)); 125 goto out;
127 return -EINVAL;
128 } 126 }
129 127out:
130 return 0; 128 return rc;
131} 129}
132 130
133static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci, 131static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci,
@@ -1102,12 +1100,10 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
1102 el = &eb->h_list; 1100 el = &eb->h_list;
1103 1101
1104 if (el->l_tree_depth) { 1102 if (el->l_tree_depth) {
1105 ocfs2_error(sb, 1103 ret = ocfs2_error(sb,
1106 "refcount tree %llu has non zero tree " 1104 "refcount tree %llu has non zero tree depth in leaf btree tree block %llu\n",
1107 "depth in leaf btree tree block %llu\n", 1105 (unsigned long long)ocfs2_metadata_cache_owner(ci),
1108 (unsigned long long)ocfs2_metadata_cache_owner(ci), 1106 (unsigned long long)eb_bh->b_blocknr);
1109 (unsigned long long)eb_bh->b_blocknr);
1110 ret = -EROFS;
1111 goto out; 1107 goto out;
1112 } 1108 }
1113 } 1109 }
@@ -2359,10 +2355,8 @@ static int ocfs2_mark_extent_refcounted(struct inode *inode,
2359 cpos, len, phys); 2355 cpos, len, phys);
2360 2356
2361 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { 2357 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
2362 ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " 2358 ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
2363 "tree, but the feature bit is not set in the " 2359 inode->i_ino);
2364 "super block.", inode->i_ino);
2365 ret = -EROFS;
2366 goto out; 2360 goto out;
2367 } 2361 }
2368 2362
@@ -2545,10 +2539,8 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2545 u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno); 2539 u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno);
2546 2540
2547 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { 2541 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
2548 ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " 2542 ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
2549 "tree, but the feature bit is not set in the " 2543 inode->i_ino);
2550 "super block.", inode->i_ino);
2551 ret = -EROFS;
2552 goto out; 2544 goto out;
2553 } 2545 }
2554 2546
@@ -2672,11 +2664,10 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
2672 el = &eb->h_list; 2664 el = &eb->h_list;
2673 2665
2674 if (el->l_tree_depth) { 2666 if (el->l_tree_depth) {
2675 ocfs2_error(inode->i_sb, 2667 ret = ocfs2_error(inode->i_sb,
2676 "Inode %lu has non zero tree depth in " 2668 "Inode %lu has non zero tree depth in leaf block %llu\n",
2677 "leaf block %llu\n", inode->i_ino, 2669 inode->i_ino,
2678 (unsigned long long)eb_bh->b_blocknr); 2670 (unsigned long long)eb_bh->b_blocknr);
2679 ret = -EROFS;
2680 goto out; 2671 goto out;
2681 } 2672 }
2682 } 2673 }
@@ -3106,11 +3097,9 @@ static int ocfs2_clear_ext_refcount(handle_t *handle,
3106 3097
3107 index = ocfs2_search_extent_list(el, cpos); 3098 index = ocfs2_search_extent_list(el, cpos);
3108 if (index == -1) { 3099 if (index == -1) {
3109 ocfs2_error(sb, 3100 ret = ocfs2_error(sb,
3110 "Inode %llu has an extent at cpos %u which can no " 3101 "Inode %llu has an extent at cpos %u which can no longer be found\n",
3111 "longer be found.\n", 3102 (unsigned long long)ino, cpos);
3112 (unsigned long long)ino, cpos);
3113 ret = -EROFS;
3114 goto out; 3103 goto out;
3115 } 3104 }
3116 3105
@@ -3376,10 +3365,8 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
3376 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 3365 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3377 3366
3378 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { 3367 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
3379 ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " 3368 return ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
3380 "tree, but the feature bit is not set in the " 3369 inode->i_ino);
3381 "super block.", inode->i_ino);
3382 return -EROFS;
3383 } 3370 }
3384 3371
3385 ocfs2_init_dealloc_ctxt(&context->dealloc); 3372 ocfs2_init_dealloc_ctxt(&context->dealloc);
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 4479029630bb..d83d2602cf2b 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -149,10 +149,8 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
149 brelse(ac->ac_bh); 149 brelse(ac->ac_bh);
150 ac->ac_bh = NULL; 150 ac->ac_bh = NULL;
151 ac->ac_resv = NULL; 151 ac->ac_resv = NULL;
152 if (ac->ac_find_loc_priv) { 152 kfree(ac->ac_find_loc_priv);
153 kfree(ac->ac_find_loc_priv); 153 ac->ac_find_loc_priv = NULL;
154 ac->ac_find_loc_priv = NULL;
155 }
156} 154}
157 155
158void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) 156void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@@ -167,12 +165,12 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
167} 165}
168 166
169#define do_error(fmt, ...) \ 167#define do_error(fmt, ...) \
170 do{ \ 168do { \
171 if (resize) \ 169 if (resize) \
172 mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \ 170 mlog(ML_ERROR, fmt, ##__VA_ARGS__); \
173 else \ 171 else \
174 ocfs2_error(sb, fmt, ##__VA_ARGS__); \ 172 return ocfs2_error(sb, fmt, ##__VA_ARGS__); \
175 } while (0) 173} while (0)
176 174
177static int ocfs2_validate_gd_self(struct super_block *sb, 175static int ocfs2_validate_gd_self(struct super_block *sb,
178 struct buffer_head *bh, 176 struct buffer_head *bh,
@@ -181,44 +179,35 @@ static int ocfs2_validate_gd_self(struct super_block *sb,
181 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 179 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
182 180
183 if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { 181 if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
184 do_error("Group descriptor #%llu has bad signature %.*s", 182 do_error("Group descriptor #%llu has bad signature %.*s\n",
185 (unsigned long long)bh->b_blocknr, 7, 183 (unsigned long long)bh->b_blocknr, 7,
186 gd->bg_signature); 184 gd->bg_signature);
187 return -EINVAL;
188 } 185 }
189 186
190 if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) { 187 if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
191 do_error("Group descriptor #%llu has an invalid bg_blkno " 188 do_error("Group descriptor #%llu has an invalid bg_blkno of %llu\n",
192 "of %llu",
193 (unsigned long long)bh->b_blocknr, 189 (unsigned long long)bh->b_blocknr,
194 (unsigned long long)le64_to_cpu(gd->bg_blkno)); 190 (unsigned long long)le64_to_cpu(gd->bg_blkno));
195 return -EINVAL;
196 } 191 }
197 192
198 if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) { 193 if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
199 do_error("Group descriptor #%llu has an invalid " 194 do_error("Group descriptor #%llu has an invalid fs_generation of #%u\n",
200 "fs_generation of #%u",
201 (unsigned long long)bh->b_blocknr, 195 (unsigned long long)bh->b_blocknr,
202 le32_to_cpu(gd->bg_generation)); 196 le32_to_cpu(gd->bg_generation));
203 return -EINVAL;
204 } 197 }
205 198
206 if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) { 199 if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
207 do_error("Group descriptor #%llu has bit count %u but " 200 do_error("Group descriptor #%llu has bit count %u but claims that %u are free\n",
208 "claims that %u are free",
209 (unsigned long long)bh->b_blocknr, 201 (unsigned long long)bh->b_blocknr,
210 le16_to_cpu(gd->bg_bits), 202 le16_to_cpu(gd->bg_bits),
211 le16_to_cpu(gd->bg_free_bits_count)); 203 le16_to_cpu(gd->bg_free_bits_count));
212 return -EINVAL;
213 } 204 }
214 205
215 if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) { 206 if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
216 do_error("Group descriptor #%llu has bit count %u but " 207 do_error("Group descriptor #%llu has bit count %u but max bitmap bits of %u\n",
217 "max bitmap bits of %u",
218 (unsigned long long)bh->b_blocknr, 208 (unsigned long long)bh->b_blocknr,
219 le16_to_cpu(gd->bg_bits), 209 le16_to_cpu(gd->bg_bits),
220 8 * le16_to_cpu(gd->bg_size)); 210 8 * le16_to_cpu(gd->bg_size));
221 return -EINVAL;
222 } 211 }
223 212
224 return 0; 213 return 0;
@@ -233,20 +222,17 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
233 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 222 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
234 223
235 if (di->i_blkno != gd->bg_parent_dinode) { 224 if (di->i_blkno != gd->bg_parent_dinode) {
236 do_error("Group descriptor #%llu has bad parent " 225 do_error("Group descriptor #%llu has bad parent pointer (%llu, expected %llu)\n",
237 "pointer (%llu, expected %llu)",
238 (unsigned long long)bh->b_blocknr, 226 (unsigned long long)bh->b_blocknr,
239 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), 227 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
240 (unsigned long long)le64_to_cpu(di->i_blkno)); 228 (unsigned long long)le64_to_cpu(di->i_blkno));
241 return -EINVAL;
242 } 229 }
243 230
244 max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc); 231 max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
245 if (le16_to_cpu(gd->bg_bits) > max_bits) { 232 if (le16_to_cpu(gd->bg_bits) > max_bits) {
246 do_error("Group descriptor #%llu has bit count of %u", 233 do_error("Group descriptor #%llu has bit count of %u\n",
247 (unsigned long long)bh->b_blocknr, 234 (unsigned long long)bh->b_blocknr,
248 le16_to_cpu(gd->bg_bits)); 235 le16_to_cpu(gd->bg_bits));
249 return -EINVAL;
250 } 236 }
251 237
252 /* In resize, we may meet the case bg_chain == cl_next_free_rec. */ 238 /* In resize, we may meet the case bg_chain == cl_next_free_rec. */
@@ -254,10 +240,9 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
254 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) || 240 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
255 ((le16_to_cpu(gd->bg_chain) == 241 ((le16_to_cpu(gd->bg_chain) ==
256 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) { 242 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
257 do_error("Group descriptor #%llu has bad chain %u", 243 do_error("Group descriptor #%llu has bad chain %u\n",
258 (unsigned long long)bh->b_blocknr, 244 (unsigned long long)bh->b_blocknr,
259 le16_to_cpu(gd->bg_chain)); 245 le16_to_cpu(gd->bg_chain));
260 return -EINVAL;
261 } 246 }
262 247
263 return 0; 248 return 0;
@@ -384,11 +369,10 @@ static int ocfs2_block_group_fill(handle_t *handle,
384 struct super_block * sb = alloc_inode->i_sb; 369 struct super_block * sb = alloc_inode->i_sb;
385 370
386 if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) { 371 if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
387 ocfs2_error(alloc_inode->i_sb, "group block (%llu) != " 372 status = ocfs2_error(alloc_inode->i_sb,
388 "b_blocknr (%llu)", 373 "group block (%llu) != b_blocknr (%llu)\n",
389 (unsigned long long)group_blkno, 374 (unsigned long long)group_blkno,
390 (unsigned long long) bg_bh->b_blocknr); 375 (unsigned long long) bg_bh->b_blocknr);
391 status = -EIO;
392 goto bail; 376 goto bail;
393 } 377 }
394 378
@@ -834,9 +818,9 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
834 BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); 818 BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
835 819
836 if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) { 820 if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
837 ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu", 821 status = ocfs2_error(alloc_inode->i_sb,
838 (unsigned long long)le64_to_cpu(fe->i_blkno)); 822 "Invalid chain allocator %llu\n",
839 status = -EIO; 823 (unsigned long long)le64_to_cpu(fe->i_blkno));
840 goto bail; 824 goto bail;
841 } 825 }
842 826
@@ -1370,12 +1354,11 @@ int ocfs2_block_group_set_bits(handle_t *handle,
1370 1354
1371 le16_add_cpu(&bg->bg_free_bits_count, -num_bits); 1355 le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
1372 if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { 1356 if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
1373 ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" 1357 return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
1374 " count %u but claims %u are freed. num_bits %d", 1358 (unsigned long long)le64_to_cpu(bg->bg_blkno),
1375 (unsigned long long)le64_to_cpu(bg->bg_blkno), 1359 le16_to_cpu(bg->bg_bits),
1376 le16_to_cpu(bg->bg_bits), 1360 le16_to_cpu(bg->bg_free_bits_count),
1377 le16_to_cpu(bg->bg_free_bits_count), num_bits); 1361 num_bits);
1378 return -EROFS;
1379 } 1362 }
1380 while(num_bits--) 1363 while(num_bits--)
1381 ocfs2_set_bit(bit_off++, bitmap); 1364 ocfs2_set_bit(bit_off++, bitmap);
@@ -1905,13 +1888,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1905 1888
1906 if (le32_to_cpu(fe->id1.bitmap1.i_used) >= 1889 if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1907 le32_to_cpu(fe->id1.bitmap1.i_total)) { 1890 le32_to_cpu(fe->id1.bitmap1.i_total)) {
1908 ocfs2_error(ac->ac_inode->i_sb, 1891 status = ocfs2_error(ac->ac_inode->i_sb,
1909 "Chain allocator dinode %llu has %u used " 1892 "Chain allocator dinode %llu has %u used bits but only %u total\n",
1910 "bits but only %u total.", 1893 (unsigned long long)le64_to_cpu(fe->i_blkno),
1911 (unsigned long long)le64_to_cpu(fe->i_blkno), 1894 le32_to_cpu(fe->id1.bitmap1.i_used),
1912 le32_to_cpu(fe->id1.bitmap1.i_used), 1895 le32_to_cpu(fe->id1.bitmap1.i_total));
1913 le32_to_cpu(fe->id1.bitmap1.i_total));
1914 status = -EIO;
1915 goto bail; 1896 goto bail;
1916 } 1897 }
1917 1898
@@ -2429,12 +2410,11 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
2429 } 2410 }
2430 le16_add_cpu(&bg->bg_free_bits_count, num_bits); 2411 le16_add_cpu(&bg->bg_free_bits_count, num_bits);
2431 if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { 2412 if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
2432 ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" 2413 return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
2433 " count %u but claims %u are freed. num_bits %d", 2414 (unsigned long long)le64_to_cpu(bg->bg_blkno),
2434 (unsigned long long)le64_to_cpu(bg->bg_blkno), 2415 le16_to_cpu(bg->bg_bits),
2435 le16_to_cpu(bg->bg_bits), 2416 le16_to_cpu(bg->bg_free_bits_count),
2436 le16_to_cpu(bg->bg_free_bits_count), num_bits); 2417 num_bits);
2437 return -EROFS;
2438 } 2418 }
2439 2419
2440 if (undo_fn) 2420 if (undo_fn)
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 403c5660b306..2de4c8a9340c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -192,6 +192,7 @@ enum {
192 Opt_resv_level, 192 Opt_resv_level,
193 Opt_dir_resv_level, 193 Opt_dir_resv_level,
194 Opt_journal_async_commit, 194 Opt_journal_async_commit,
195 Opt_err_cont,
195 Opt_err, 196 Opt_err,
196}; 197};
197 198
@@ -224,6 +225,7 @@ static const match_table_t tokens = {
224 {Opt_resv_level, "resv_level=%u"}, 225 {Opt_resv_level, "resv_level=%u"},
225 {Opt_dir_resv_level, "dir_resv_level=%u"}, 226 {Opt_dir_resv_level, "dir_resv_level=%u"},
226 {Opt_journal_async_commit, "journal_async_commit"}, 227 {Opt_journal_async_commit, "journal_async_commit"},
228 {Opt_err_cont, "errors=continue"},
227 {Opt_err, NULL} 229 {Opt_err, NULL}
228}; 230};
229 231
@@ -1330,10 +1332,19 @@ static int ocfs2_parse_options(struct super_block *sb,
1330 mopt->mount_opt |= OCFS2_MOUNT_NOINTR; 1332 mopt->mount_opt |= OCFS2_MOUNT_NOINTR;
1331 break; 1333 break;
1332 case Opt_err_panic: 1334 case Opt_err_panic:
1335 mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT;
1336 mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS;
1333 mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; 1337 mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
1334 break; 1338 break;
1335 case Opt_err_ro: 1339 case Opt_err_ro:
1340 mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT;
1336 mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; 1341 mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
1342 mopt->mount_opt |= OCFS2_MOUNT_ERRORS_ROFS;
1343 break;
1344 case Opt_err_cont:
1345 mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS;
1346 mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
1347 mopt->mount_opt |= OCFS2_MOUNT_ERRORS_CONT;
1337 break; 1348 break;
1338 case Opt_data_ordered: 1349 case Opt_data_ordered:
1339 mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK; 1350 mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK;
@@ -1530,6 +1541,8 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
1530 1541
1531 if (opts & OCFS2_MOUNT_ERRORS_PANIC) 1542 if (opts & OCFS2_MOUNT_ERRORS_PANIC)
1532 seq_printf(s, ",errors=panic"); 1543 seq_printf(s, ",errors=panic");
1544 else if (opts & OCFS2_MOUNT_ERRORS_CONT)
1545 seq_printf(s, ",errors=continue");
1533 else 1546 else
1534 seq_printf(s, ",errors=remount-ro"); 1547 seq_printf(s, ",errors=remount-ro");
1535 1548
@@ -1550,8 +1563,8 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
1550 seq_printf(s, ",localflocks,"); 1563 seq_printf(s, ",localflocks,");
1551 1564
1552 if (osb->osb_cluster_stack[0]) 1565 if (osb->osb_cluster_stack[0])
1553 seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, 1566 seq_show_option_n(s, "cluster_stack", osb->osb_cluster_stack,
1554 osb->osb_cluster_stack); 1567 OCFS2_STACK_LABEL_LEN);
1555 if (opts & OCFS2_MOUNT_USRQUOTA) 1568 if (opts & OCFS2_MOUNT_USRQUOTA)
1556 seq_printf(s, ",usrquota"); 1569 seq_printf(s, ",usrquota");
1557 if (opts & OCFS2_MOUNT_GRPQUOTA) 1570 if (opts & OCFS2_MOUNT_GRPQUOTA)
@@ -1746,8 +1759,6 @@ static void ocfs2_inode_init_once(void *data)
1746 ocfs2_lock_res_init_once(&oi->ip_inode_lockres); 1759 ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
1747 ocfs2_lock_res_init_once(&oi->ip_open_lockres); 1760 ocfs2_lock_res_init_once(&oi->ip_open_lockres);
1748 1761
1749 init_waitqueue_head(&oi->append_dio_wq);
1750
1751 ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode), 1762 ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode),
1752 &ocfs2_inode_caching_ops); 1763 &ocfs2_inode_caching_ops);
1753 1764
@@ -2541,31 +2552,43 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
2541 memset(osb, 0, sizeof(struct ocfs2_super)); 2552 memset(osb, 0, sizeof(struct ocfs2_super));
2542} 2553}
2543 2554
2544/* Put OCFS2 into a readonly state, or (if the user specifies it), 2555/* Depending on the mount option passed, perform one of the following:
2545 * panic(). We do not support continue-on-error operation. */ 2556 * Put OCFS2 into a readonly state (default)
2546static void ocfs2_handle_error(struct super_block *sb) 2557 * Return EIO so that only the process errs
2558 * Fix the error as if fsck.ocfs2 -y
2559 * panic
2560 */
2561static int ocfs2_handle_error(struct super_block *sb)
2547{ 2562{
2548 struct ocfs2_super *osb = OCFS2_SB(sb); 2563 struct ocfs2_super *osb = OCFS2_SB(sb);
2549 2564 int rv = 0;
2550 if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC)
2551 panic("OCFS2: (device %s): panic forced after error\n",
2552 sb->s_id);
2553 2565
2554 ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS); 2566 ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS);
2567 pr_crit("On-disk corruption discovered. "
2568 "Please run fsck.ocfs2 once the filesystem is unmounted.\n");
2555 2569
2556 if (sb->s_flags & MS_RDONLY && 2570 if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) {
2557 (ocfs2_is_soft_readonly(osb) || 2571 panic("OCFS2: (device %s): panic forced after error\n",
2558 ocfs2_is_hard_readonly(osb))) 2572 sb->s_id);
2559 return; 2573 } else if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_CONT) {
2560 2574 pr_crit("OCFS2: Returning error to the calling process.\n");
2561 printk(KERN_CRIT "File system is now read-only due to the potential " 2575 rv = -EIO;
2562 "of on-disk corruption. Please run fsck.ocfs2 once the file " 2576 } else { /* default option */
2563 "system is unmounted.\n"); 2577 rv = -EROFS;
2564 sb->s_flags |= MS_RDONLY; 2578 if (sb->s_flags & MS_RDONLY &&
2565 ocfs2_set_ro_flag(osb, 0); 2579 (ocfs2_is_soft_readonly(osb) ||
2580 ocfs2_is_hard_readonly(osb)))
2581 return rv;
2582
2583 pr_crit("OCFS2: File system is now read-only.\n");
2584 sb->s_flags |= MS_RDONLY;
2585 ocfs2_set_ro_flag(osb, 0);
2586 }
2587
2588 return rv;
2566} 2589}
2567 2590
2568void __ocfs2_error(struct super_block *sb, const char *function, 2591int __ocfs2_error(struct super_block *sb, const char *function,
2569 const char *fmt, ...) 2592 const char *fmt, ...)
2570{ 2593{
2571 struct va_format vaf; 2594 struct va_format vaf;
@@ -2577,12 +2600,12 @@ void __ocfs2_error(struct super_block *sb, const char *function,
2577 2600
2578 /* Not using mlog here because we want to show the actual 2601 /* Not using mlog here because we want to show the actual
2579 * function the error came from. */ 2602 * function the error came from. */
2580 printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV\n", 2603 printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV",
2581 sb->s_id, function, &vaf); 2604 sb->s_id, function, &vaf);
2582 2605
2583 va_end(args); 2606 va_end(args);
2584 2607
2585 ocfs2_handle_error(sb); 2608 return ocfs2_handle_error(sb);
2586} 2609}
2587 2610
2588/* Handle critical errors. This is intentionally more drastic than 2611/* Handle critical errors. This is intentionally more drastic than
@@ -2599,7 +2622,7 @@ void __ocfs2_abort(struct super_block *sb, const char *function,
2599 vaf.fmt = fmt; 2622 vaf.fmt = fmt;
2600 vaf.va = &args; 2623 vaf.va = &args;
2601 2624
2602 printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV\n", 2625 printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV",
2603 sb->s_id, function, &vaf); 2626 sb->s_id, function, &vaf);
2604 2627
2605 va_end(args); 2628 va_end(args);
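
Aside (illustrative, not the patch itself): the super.c hunks above add a third errors= behaviour and make the error handler report a status instead of returning void, which is what lets call sites throughout this series write `return ocfs2_error(...)` or `rc = ocfs2_error(...)` directly. A condensed user-space sketch of that shape, with hypothetical names (mount_errors_mode, report_error, read_block):

#include <errno.h>
#include <stdio.h>

enum mount_errors_mode {
        ERRORS_REMOUNT_RO,      /* default: flip the fs read-only */
        ERRORS_CONT,            /* only the calling process sees the error */
        ERRORS_PANIC,           /* give up entirely (simulated here) */
};

static enum mount_errors_mode errors_mode = ERRORS_CONT;

/* Log the corruption and translate the mount option into a status the
 * caller can simply return. */
static int report_error(const char *msg)
{
        fprintf(stderr, "ERROR: %s\n", msg);

        switch (errors_mode) {
        case ERRORS_PANIC:
                fprintf(stderr, "panic forced after error\n");
                return -EROFS;          /* a real panic() never returns */
        case ERRORS_CONT:
                return -EIO;            /* fail just this operation */
        default:
                fprintf(stderr, "filesystem is now read-only\n");
                return -EROFS;
        }
}

static int read_block(int corrupted)
{
        if (corrupted)
                return report_error("invalid block signature");
        return 0;
}

int main(void)
{
        printf("clean: %d, corrupted: %d\n", read_block(0), read_block(1));
        return 0;
}
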
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 74ff74cf78fe..b477d0b1c7b6 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -32,16 +32,18 @@ int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
32 int node_num); 32 int node_num);
33 33
34__printf(3, 4) 34__printf(3, 4)
35void __ocfs2_error(struct super_block *sb, const char *function, 35int __ocfs2_error(struct super_block *sb, const char *function,
36 const char *fmt, ...); 36 const char *fmt, ...);
37 37
38#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args) 38#define ocfs2_error(sb, fmt, ...) \
39 __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##__VA_ARGS__)
39 40
40__printf(3, 4) 41__printf(3, 4)
41void __ocfs2_abort(struct super_block *sb, const char *function, 42void __ocfs2_abort(struct super_block *sb, const char *function,
42 const char *fmt, ...); 43 const char *fmt, ...);
43 44
44#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) 45#define ocfs2_abort(sb, fmt, ...) \
46 __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##__VA_ARGS__)
45 47
46/* 48/*
47 * Void signal blockers, because in-kernel sigprocmask() only fails 49 * Void signal blockers, because in-kernel sigprocmask() only fails
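
Aside (illustrative): the super.h hunk above switches the ocfs2_error()/ocfs2_abort() wrappers from the GNU named-variadic form (`args...`, `##args`) to the C99 `...` form with the GNU `##__VA_ARGS__` extension, which swallows the trailing comma when the format string is the only argument. A tiny standalone example of the same construction, using a hypothetical log_err() macro:

#include <stdio.h>

/* ## before __VA_ARGS__ drops the preceding comma when no extra
 * arguments are supplied, so log_err("plain") is legal too. */
#define log_err(fmt, ...) \
        fprintf(stderr, "demo error: " fmt "\n", ##__VA_ARGS__)

int main(void)
{
        log_err("plain message");
        log_err("block %llu is bad", 42ULL);
        return 0;
}
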
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 889f3796a0d7..ebfdea78659b 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -499,30 +499,24 @@ static int ocfs2_validate_xattr_block(struct super_block *sb,
499 */ 499 */
500 500
501 if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) { 501 if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
502 ocfs2_error(sb, 502 return ocfs2_error(sb,
503 "Extended attribute block #%llu has bad " 503 "Extended attribute block #%llu has bad signature %.*s\n",
504 "signature %.*s", 504 (unsigned long long)bh->b_blocknr, 7,
505 (unsigned long long)bh->b_blocknr, 7, 505 xb->xb_signature);
506 xb->xb_signature);
507 return -EINVAL;
508 } 506 }
509 507
510 if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) { 508 if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) {
511 ocfs2_error(sb, 509 return ocfs2_error(sb,
512 "Extended attribute block #%llu has an " 510 "Extended attribute block #%llu has an invalid xb_blkno of %llu\n",
513 "invalid xb_blkno of %llu", 511 (unsigned long long)bh->b_blocknr,
514 (unsigned long long)bh->b_blocknr, 512 (unsigned long long)le64_to_cpu(xb->xb_blkno));
515 (unsigned long long)le64_to_cpu(xb->xb_blkno));
516 return -EINVAL;
517 } 513 }
518 514
519 if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) { 515 if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) {
520 ocfs2_error(sb, 516 return ocfs2_error(sb,
521 "Extended attribute block #%llu has an invalid " 517 "Extended attribute block #%llu has an invalid xb_fs_generation of #%u\n",
522 "xb_fs_generation of #%u", 518 (unsigned long long)bh->b_blocknr,
523 (unsigned long long)bh->b_blocknr, 519 le32_to_cpu(xb->xb_fs_generation));
524 le32_to_cpu(xb->xb_fs_generation));
525 return -EINVAL;
526 } 520 }
527 521
528 return 0; 522 return 0;
@@ -3694,11 +3688,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode,
3694 el = &eb->h_list; 3688 el = &eb->h_list;
3695 3689
3696 if (el->l_tree_depth) { 3690 if (el->l_tree_depth) {
3697 ocfs2_error(inode->i_sb, 3691 ret = ocfs2_error(inode->i_sb,
3698 "Inode %lu has non zero tree depth in " 3692 "Inode %lu has non zero tree depth in xattr tree block %llu\n",
3699 "xattr tree block %llu\n", inode->i_ino, 3693 inode->i_ino,
3700 (unsigned long long)eb_bh->b_blocknr); 3694 (unsigned long long)eb_bh->b_blocknr);
3701 ret = -EROFS;
3702 goto out; 3695 goto out;
3703 } 3696 }
3704 } 3697 }
@@ -3713,11 +3706,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode,
3713 } 3706 }
3714 3707
3715 if (!e_blkno) { 3708 if (!e_blkno) {
3716 ocfs2_error(inode->i_sb, "Inode %lu has bad extent " 3709 ret = ocfs2_error(inode->i_sb, "Inode %lu has bad extent record (%u, %u, 0) in xattr\n",
3717 "record (%u, %u, 0) in xattr", inode->i_ino, 3710 inode->i_ino,
3718 le32_to_cpu(rec->e_cpos), 3711 le32_to_cpu(rec->e_cpos),
3719 ocfs2_rec_clusters(el, rec)); 3712 ocfs2_rec_clusters(el, rec));
3720 ret = -EROFS;
3721 goto out; 3713 goto out;
3722 } 3714 }
3723 3715
@@ -7334,6 +7326,9 @@ static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list,
7334 const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; 7326 const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
7335 const size_t total_len = prefix_len + name_len + 1; 7327 const size_t total_len = prefix_len + name_len + 1;
7336 7328
7329 if (!capable(CAP_SYS_ADMIN))
7330 return 0;
7331
7337 if (list && total_len <= list_size) { 7332 if (list && total_len <= list_size) {
7338 memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); 7333 memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
7339 memcpy(list + prefix_len, name, name_len); 7334 memcpy(list + prefix_len, name, name_len);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 7466ff339c66..79073d68b475 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -588,10 +588,10 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
588 struct super_block *sb = dentry->d_sb; 588 struct super_block *sb = dentry->d_sb;
589 struct ovl_fs *ufs = sb->s_fs_info; 589 struct ovl_fs *ufs = sb->s_fs_info;
590 590
591 seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir); 591 seq_show_option(m, "lowerdir", ufs->config.lowerdir);
592 if (ufs->config.upperdir) { 592 if (ufs->config.upperdir) {
593 seq_printf(m, ",upperdir=%s", ufs->config.upperdir); 593 seq_show_option(m, "upperdir", ufs->config.upperdir);
594 seq_printf(m, ",workdir=%s", ufs->config.workdir); 594 seq_show_option(m, "workdir", ufs->config.workdir);
595 } 595 }
596 return 0; 596 return 0;
597} 597}
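
Aside (illustrative only): the overlayfs hunk above, and the similar reiserfs change further down, switch string-valued mount options from seq_printf() to the seq_show_option() helper so that option values containing separator characters cannot masquerade as additional options in /proc/mounts. As a rough user-space sketch of why the escaping matters (the real helper's escaping rules live in the kernel and may differ), with hypothetical names:

#include <stdio.h>

/* Print ,key=value, escaping characters that a naive ","-splitting
 * reader would treat as option separators. */
static void show_option(FILE *out, const char *key, const char *value)
{
        fprintf(out, ",%s=", key);
        for (const char *p = value; *p; p++) {
                if (*p == ',' || *p == ' ' || *p == '\\')
                        fprintf(out, "\\%03o", (unsigned int)(unsigned char)*p);
                else
                        fputc(*p, out);
        }
}

int main(void)
{
        /* A hostile directory name that embeds what looks like an option. */
        show_option(stdout, "lowerdir", "/tmp/evil,upperdir=/etc");
        fputc('\n', stdout);
        return 0;
}
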
diff --git a/fs/proc/array.c b/fs/proc/array.c
index ce065cf3104f..f60f0121e331 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -308,7 +308,8 @@ static void render_cap_t(struct seq_file *m, const char *header,
308static inline void task_cap(struct seq_file *m, struct task_struct *p) 308static inline void task_cap(struct seq_file *m, struct task_struct *p)
309{ 309{
310 const struct cred *cred; 310 const struct cred *cred;
311 kernel_cap_t cap_inheritable, cap_permitted, cap_effective, cap_bset; 311 kernel_cap_t cap_inheritable, cap_permitted, cap_effective,
312 cap_bset, cap_ambient;
312 313
313 rcu_read_lock(); 314 rcu_read_lock();
314 cred = __task_cred(p); 315 cred = __task_cred(p);
@@ -316,12 +317,14 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p)
316 cap_permitted = cred->cap_permitted; 317 cap_permitted = cred->cap_permitted;
317 cap_effective = cred->cap_effective; 318 cap_effective = cred->cap_effective;
318 cap_bset = cred->cap_bset; 319 cap_bset = cred->cap_bset;
320 cap_ambient = cred->cap_ambient;
319 rcu_read_unlock(); 321 rcu_read_unlock();
320 322
321 render_cap_t(m, "CapInh:\t", &cap_inheritable); 323 render_cap_t(m, "CapInh:\t", &cap_inheritable);
322 render_cap_t(m, "CapPrm:\t", &cap_permitted); 324 render_cap_t(m, "CapPrm:\t", &cap_permitted);
323 render_cap_t(m, "CapEff:\t", &cap_effective); 325 render_cap_t(m, "CapEff:\t", &cap_effective);
324 render_cap_t(m, "CapBnd:\t", &cap_bset); 326 render_cap_t(m, "CapBnd:\t", &cap_bset);
327 render_cap_t(m, "CapAmb:\t", &cap_ambient);
325} 328}
326 329
327static inline void task_seccomp(struct seq_file *m, struct task_struct *p) 330static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ca1e091881d4..3b4d8255e806 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -597,6 +597,8 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
597 [ilog2(VM_HUGEPAGE)] = "hg", 597 [ilog2(VM_HUGEPAGE)] = "hg",
598 [ilog2(VM_NOHUGEPAGE)] = "nh", 598 [ilog2(VM_NOHUGEPAGE)] = "nh",
599 [ilog2(VM_MERGEABLE)] = "mg", 599 [ilog2(VM_MERGEABLE)] = "mg",
600 [ilog2(VM_UFFD_MISSING)]= "um",
601 [ilog2(VM_UFFD_WP)] = "uw",
600 }; 602 };
601 size_t i; 603 size_t i;
602 604
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 0e4cf728126f..4a62fe8cc3bf 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -714,18 +714,20 @@ static int reiserfs_show_options(struct seq_file *seq, struct dentry *root)
714 seq_puts(seq, ",acl"); 714 seq_puts(seq, ",acl");
715 715
716 if (REISERFS_SB(s)->s_jdev) 716 if (REISERFS_SB(s)->s_jdev)
717 seq_printf(seq, ",jdev=%s", REISERFS_SB(s)->s_jdev); 717 seq_show_option(seq, "jdev", REISERFS_SB(s)->s_jdev);
718 718
719 if (journal->j_max_commit_age != journal->j_default_max_commit_age) 719 if (journal->j_max_commit_age != journal->j_default_max_commit_age)
720 seq_printf(seq, ",commit=%d", journal->j_max_commit_age); 720 seq_printf(seq, ",commit=%d", journal->j_max_commit_age);
721 721
722#ifdef CONFIG_QUOTA 722#ifdef CONFIG_QUOTA
723 if (REISERFS_SB(s)->s_qf_names[USRQUOTA]) 723 if (REISERFS_SB(s)->s_qf_names[USRQUOTA])
724 seq_printf(seq, ",usrjquota=%s", REISERFS_SB(s)->s_qf_names[USRQUOTA]); 724 seq_show_option(seq, "usrjquota",
725 REISERFS_SB(s)->s_qf_names[USRQUOTA]);
725 else if (opts & (1 << REISERFS_USRQUOTA)) 726 else if (opts & (1 << REISERFS_USRQUOTA))
726 seq_puts(seq, ",usrquota"); 727 seq_puts(seq, ",usrquota");
727 if (REISERFS_SB(s)->s_qf_names[GRPQUOTA]) 728 if (REISERFS_SB(s)->s_qf_names[GRPQUOTA])
728 seq_printf(seq, ",grpjquota=%s", REISERFS_SB(s)->s_qf_names[GRPQUOTA]); 729 seq_show_option(seq, "grpjquota",
730 REISERFS_SB(s)->s_qf_names[GRPQUOTA]);
729 else if (opts & (1 << REISERFS_GRPQUOTA)) 731 else if (opts & (1 << REISERFS_GRPQUOTA))
730 seq_puts(seq, ",grpquota"); 732 seq_puts(seq, ",grpquota");
731 if (REISERFS_SB(s)->s_jquota_fmt) { 733 if (REISERFS_SB(s)->s_jquota_fmt) {
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
new file mode 100644
index 000000000000..634e676072cb
--- /dev/null
+++ b/fs/userfaultfd.c
@@ -0,0 +1,1330 @@
1/*
2 * fs/userfaultfd.c
3 *
4 * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
5 * Copyright (C) 2008-2009 Red Hat, Inc.
6 * Copyright (C) 2015 Red Hat, Inc.
7 *
8 * This work is licensed under the terms of the GNU GPL, version 2. See
9 * the COPYING file in the top-level directory.
10 *
11 * Some part derived from fs/eventfd.c (anon inode setup) and
12 * mm/ksm.c (mm hashing).
13 */
14
15#include <linux/hashtable.h>
16#include <linux/sched.h>
17#include <linux/mm.h>
18#include <linux/poll.h>
19#include <linux/slab.h>
20#include <linux/seq_file.h>
21#include <linux/file.h>
22#include <linux/bug.h>
23#include <linux/anon_inodes.h>
24#include <linux/syscalls.h>
25#include <linux/userfaultfd_k.h>
26#include <linux/mempolicy.h>
27#include <linux/ioctl.h>
28#include <linux/security.h>
29
30static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
31
32enum userfaultfd_state {
33 UFFD_STATE_WAIT_API,
34 UFFD_STATE_RUNNING,
35};
36
37/*
38 * Start with fault_pending_wqh and fault_wqh so they're more likely
39 * to be in the same cacheline.
40 */
41struct userfaultfd_ctx {
42 /* waitqueue head for the pending (i.e. not read) userfaults */
43 wait_queue_head_t fault_pending_wqh;
44 /* waitqueue head for the userfaults */
45 wait_queue_head_t fault_wqh;
46 /* waitqueue head for the pseudo fd to wakeup poll/read */
47 wait_queue_head_t fd_wqh;
48 /* a refile sequence protected by fault_pending_wqh lock */
49 struct seqcount refile_seq;
50 /* pseudo fd refcounting */
51 atomic_t refcount;
52 /* userfaultfd syscall flags */
53 unsigned int flags;
54 /* state machine */
55 enum userfaultfd_state state;
56 /* released */
57 bool released;
 58	 * mm with one or more vmas attached to this userfaultfd_ctx */
59 struct mm_struct *mm;
60};
61
62struct userfaultfd_wait_queue {
63 struct uffd_msg msg;
64 wait_queue_t wq;
65 struct userfaultfd_ctx *ctx;
66};
67
68struct userfaultfd_wake_range {
69 unsigned long start;
70 unsigned long len;
71};
72
73static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode,
74 int wake_flags, void *key)
75{
76 struct userfaultfd_wake_range *range = key;
77 int ret;
78 struct userfaultfd_wait_queue *uwq;
79 unsigned long start, len;
80
81 uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
82 ret = 0;
83 /* len == 0 means wake all */
84 start = range->start;
85 len = range->len;
86 if (len && (start > uwq->msg.arg.pagefault.address ||
87 start + len <= uwq->msg.arg.pagefault.address))
88 goto out;
89 ret = wake_up_state(wq->private, mode);
90 if (ret)
91 /*
92 * Wake only once, autoremove behavior.
93 *
94 * After the effect of list_del_init is visible to the
95 * other CPUs, the waitqueue may disappear from under
96 * us, see the !list_empty_careful() in
97 * handle_userfault(). try_to_wake_up() has an
98 * implicit smp_mb__before_spinlock, and the
99 * wq->private is read before calling the extern
 100	 * function "wake_up_state" (which in turn calls
101 * try_to_wake_up). While the spin_lock;spin_unlock;
102 * wouldn't be enough, the smp_mb__before_spinlock is
103 * enough to avoid an explicit smp_mb() here.
104 */
105 list_del_init(&wq->task_list);
106out:
107 return ret;
108}
109
110/**
111 * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
112 * context.
113 * @ctx: [in] Pointer to the userfaultfd context.
114 *
115 * The context's reference count must already be non-zero; BUG()s otherwise.
116 */
117static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
118{
119 if (!atomic_inc_not_zero(&ctx->refcount))
120 BUG();
121}
122
123/**
124 * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
125 * context.
126 * @ctx: [in] Pointer to userfaultfd context.
127 *
128 * The userfaultfd context reference must have been previously acquired either
129 * with userfaultfd_ctx_get().
130 */
131static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
132{
133 if (atomic_dec_and_test(&ctx->refcount)) {
134 VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
135 VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
136 VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
137 VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
138 VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
139 VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
140 mmput(ctx->mm);
141 kmem_cache_free(userfaultfd_ctx_cachep, ctx);
142 }
143}
144
145static inline void msg_init(struct uffd_msg *msg)
146{
147 BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
148 /*
149 * Must use memset to zero out the padding or kernel data is
150 * leaked to userland.
151 */
152 memset(msg, 0, sizeof(struct uffd_msg));
153}
154
155static inline struct uffd_msg userfault_msg(unsigned long address,
156 unsigned int flags,
157 unsigned long reason)
158{
159 struct uffd_msg msg;
160 msg_init(&msg);
161 msg.event = UFFD_EVENT_PAGEFAULT;
162 msg.arg.pagefault.address = address;
163 if (flags & FAULT_FLAG_WRITE)
164 /*
165 * If UFFD_FEATURE_PAGEFAULT_FLAG_WRITE was set in the
166 * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE
167 * was not set in a UFFD_EVENT_PAGEFAULT, it means it
168 * was a read fault, otherwise if set it means it's
169 * a write fault.
170 */
171 msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
172 if (reason & VM_UFFD_WP)
173 /*
174 * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
175 * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP was
176 * not set in a UFFD_EVENT_PAGEFAULT, it means it was
177 * a missing fault, otherwise if set it means it's a
178 * write protect fault.
179 */
180 msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
181 return msg;
182}
183
184/*
185 * Verify the pagetables are still not ok after having registered into
186 * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
187 * userfault that has already been resolved, if userfaultfd_read and
188 * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
189 * threads.
190 */
191static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
192 unsigned long address,
193 unsigned long flags,
194 unsigned long reason)
195{
196 struct mm_struct *mm = ctx->mm;
197 pgd_t *pgd;
198 pud_t *pud;
199 pmd_t *pmd, _pmd;
200 pte_t *pte;
201 bool ret = true;
202
203 VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
204
205 pgd = pgd_offset(mm, address);
206 if (!pgd_present(*pgd))
207 goto out;
208 pud = pud_offset(pgd, address);
209 if (!pud_present(*pud))
210 goto out;
211 pmd = pmd_offset(pud, address);
212 /*
213 * READ_ONCE must function as a barrier with narrower scope
214 * and it must be equivalent to:
215 * _pmd = *pmd; barrier();
216 *
217 * This is to deal with the instability (as in
218 * pmd_trans_unstable) of the pmd.
219 */
220 _pmd = READ_ONCE(*pmd);
221 if (!pmd_present(_pmd))
222 goto out;
223
224 ret = false;
225 if (pmd_trans_huge(_pmd))
226 goto out;
227
228 /*
229 * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it
230 * and use the standard pte_offset_map() instead of parsing _pmd.
231 */
232 pte = pte_offset_map(pmd, address);
233 /*
234 * Lockless access: we're in a wait_event so it's ok if it
235 * changes under us.
236 */
237 if (pte_none(*pte))
238 ret = true;
239 pte_unmap(pte);
240
241out:
242 return ret;
243}
244
245/*
246 * The locking rules involved in returning VM_FAULT_RETRY depending on
247 * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
248 * FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
249 * recommendation in __lock_page_or_retry is not an understatement.
250 *
251 * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_sem must be released
252 * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
253 * not set.
254 *
255 * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
256 * set, VM_FAULT_RETRY can still be returned if and only if there are
257 * fatal_signal_pending()s, and the mmap_sem must be released before
258 * returning it.
259 */
260int handle_userfault(struct vm_area_struct *vma, unsigned long address,
261 unsigned int flags, unsigned long reason)
262{
263 struct mm_struct *mm = vma->vm_mm;
264 struct userfaultfd_ctx *ctx;
265 struct userfaultfd_wait_queue uwq;
266 int ret;
267 bool must_wait, return_to_userland;
268
269 BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
270
271 ret = VM_FAULT_SIGBUS;
272 ctx = vma->vm_userfaultfd_ctx.ctx;
273 if (!ctx)
274 goto out;
275
276 BUG_ON(ctx->mm != mm);
277
278 VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP));
279 VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));
280
281 /*
282 * If it's already released, don't get it. This avoids looping
283 * in __get_user_pages if userfaultfd_release waits on the
284 * caller of handle_userfault to release the mmap_sem.
285 */
286 if (unlikely(ACCESS_ONCE(ctx->released)))
287 goto out;
288
289 /*
290 * Check that we can return VM_FAULT_RETRY.
291 *
292 * NOTE: it should become possible to return VM_FAULT_RETRY
293 * even if FAULT_FLAG_TRIED is set without leading to gup()
294 * -EBUSY failures, if the userfaultfd is to be extended for
295 * VM_UFFD_WP tracking and we intend to arm the userfault
296 * without first stopping userland access to the memory. For
297 * VM_UFFD_MISSING userfaults this is enough for now.
298 */
299 if (unlikely(!(flags & FAULT_FLAG_ALLOW_RETRY))) {
300 /*
301 * Validate the invariant that nowait must allow retry
302 * to be sure not to return SIGBUS erroneously on
303 * nowait invocations.
304 */
305 BUG_ON(flags & FAULT_FLAG_RETRY_NOWAIT);
306#ifdef CONFIG_DEBUG_VM
307 if (printk_ratelimit()) {
308 printk(KERN_WARNING
309 "FAULT_FLAG_ALLOW_RETRY missing %x\n", flags);
310 dump_stack();
311 }
312#endif
313 goto out;
314 }
315
316 /*
317 * Handle nowait, not much to do other than tell it to retry
318 * and wait.
319 */
320 ret = VM_FAULT_RETRY;
321 if (flags & FAULT_FLAG_RETRY_NOWAIT)
322 goto out;
323
324 /* take the reference before dropping the mmap_sem */
325 userfaultfd_ctx_get(ctx);
326
327 init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
328 uwq.wq.private = current;
329 uwq.msg = userfault_msg(address, flags, reason);
330 uwq.ctx = ctx;
331
332 return_to_userland = (flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
333 (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE);
334
335 spin_lock(&ctx->fault_pending_wqh.lock);
336 /*
337 * After the __add_wait_queue the uwq is visible to userland
338 * through poll/read().
339 */
340 __add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
341 /*
342 * The smp_mb() after __set_current_state prevents the reads
343 * following the spin_unlock from happening before the list_add in
344 * __add_wait_queue.
345 */
346 set_current_state(return_to_userland ? TASK_INTERRUPTIBLE :
347 TASK_KILLABLE);
348 spin_unlock(&ctx->fault_pending_wqh.lock);
349
350 must_wait = userfaultfd_must_wait(ctx, address, flags, reason);
351 up_read(&mm->mmap_sem);
352
353 if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&
354 (return_to_userland ? !signal_pending(current) :
355 !fatal_signal_pending(current)))) {
356 wake_up_poll(&ctx->fd_wqh, POLLIN);
357 schedule();
358 ret |= VM_FAULT_MAJOR;
359 }
360
361 __set_current_state(TASK_RUNNING);
362
363 if (return_to_userland) {
364 if (signal_pending(current) &&
365 !fatal_signal_pending(current)) {
366 /*
367 * If we got a SIGSTOP or SIGCONT and this is
368 * a normal userland page fault, just let
369 * userland return so the signal will be
370 * handled and gdb debugging works. The page
371 * fault code immediately after we return from
372 * this function is going to release the
373 * mmap_sem and it's not depending on it
374 * (unlike gup would if we were not to return
375 * VM_FAULT_RETRY).
376 *
377 * If a fatal signal is pending we still take
378 * the streamlined VM_FAULT_RETRY failure path
379 * and there's no need to retake the mmap_sem
380 * in such case.
381 */
382 down_read(&mm->mmap_sem);
383 ret = 0;
384 }
385 }
386
387 /*
388 * Here we race with the list_del; list_add in
389 * userfaultfd_ctx_read(), however because we don't ever run
390 * list_del_init() to refile across the two lists, the prev
391 * and next pointers will never point to self. list_add also
392 * would never let either of the two pointers point to
393 * self. So list_empty_careful won't risk seeing both pointers
394 * pointing to self at any time during the list refile. The
395 * only case where list_del_init() is called is the full
396 * removal in the wake function and there we don't re-list_add
397 * and it's fine not to block on the spinlock. The uwq on this
398 * kernel stack can be released after the list_del_init.
399 */
400 if (!list_empty_careful(&uwq.wq.task_list)) {
401 spin_lock(&ctx->fault_pending_wqh.lock);
402 /*
403 * No need of list_del_init(), the uwq on the stack
404 * will be freed shortly anyway.
405 */
406 list_del(&uwq.wq.task_list);
407 spin_unlock(&ctx->fault_pending_wqh.lock);
408 }
409
410 /*
411 * ctx may go away after this if the userfault pseudo fd is
412 * already released.
413 */
414 userfaultfd_ctx_put(ctx);
415
416out:
417 return ret;
418}
419
420static int userfaultfd_release(struct inode *inode, struct file *file)
421{
422 struct userfaultfd_ctx *ctx = file->private_data;
423 struct mm_struct *mm = ctx->mm;
424 struct vm_area_struct *vma, *prev;
425 /* len == 0 means wake all */
426 struct userfaultfd_wake_range range = { .len = 0, };
427 unsigned long new_flags;
428
429 ACCESS_ONCE(ctx->released) = true;
430
431 /*
432 * Flush page faults out of all CPUs. NOTE: all page faults
433 * must be retried without returning VM_FAULT_SIGBUS if
434 * userfaultfd_ctx_get() succeeds but vma->vm_userfaultfd_ctx
435 * changes while handle_userfault released the mmap_sem. So
436 * it's critical that released is set to true (above), before
437 * taking the mmap_sem for writing.
438 */
439 down_write(&mm->mmap_sem);
440 prev = NULL;
441 for (vma = mm->mmap; vma; vma = vma->vm_next) {
442 cond_resched();
443 BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
444 !!(vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
445 if (vma->vm_userfaultfd_ctx.ctx != ctx) {
446 prev = vma;
447 continue;
448 }
449 new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
450 prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
451 new_flags, vma->anon_vma,
452 vma->vm_file, vma->vm_pgoff,
453 vma_policy(vma),
454 NULL_VM_UFFD_CTX);
455 if (prev)
456 vma = prev;
457 else
458 prev = vma;
459 vma->vm_flags = new_flags;
460 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
461 }
462 up_write(&mm->mmap_sem);
463
464 /*
465 * After no new page faults can wait on this fault_*wqh, flush
466 * the last page faults that may have been already waiting on
467 * the fault_*wqh.
468 */
469 spin_lock(&ctx->fault_pending_wqh.lock);
470 __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0, &range);
471 __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, &range);
472 spin_unlock(&ctx->fault_pending_wqh.lock);
473
474 wake_up_poll(&ctx->fd_wqh, POLLHUP);
475 userfaultfd_ctx_put(ctx);
476 return 0;
477}
478
479/* fault_pending_wqh.lock must be held by the caller */
480static inline struct userfaultfd_wait_queue *find_userfault(
481 struct userfaultfd_ctx *ctx)
482{
483 wait_queue_t *wq;
484 struct userfaultfd_wait_queue *uwq;
485
486 VM_BUG_ON(!spin_is_locked(&ctx->fault_pending_wqh.lock));
487
488 uwq = NULL;
489 if (!waitqueue_active(&ctx->fault_pending_wqh))
490 goto out;
491 /* walk in reverse to provide FIFO behavior to read userfaults */
492 wq = list_last_entry(&ctx->fault_pending_wqh.task_list,
493 typeof(*wq), task_list);
494 uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
495out:
496 return uwq;
497}
498
499static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
500{
501 struct userfaultfd_ctx *ctx = file->private_data;
502 unsigned int ret;
503
504 poll_wait(file, &ctx->fd_wqh, wait);
505
506 switch (ctx->state) {
507 case UFFD_STATE_WAIT_API:
508 return POLLERR;
509 case UFFD_STATE_RUNNING:
510 /*
511 * poll() never guarantees that read won't block.
512 * userfaults can be woken before they're read().
513 */
514 if (unlikely(!(file->f_flags & O_NONBLOCK)))
515 return POLLERR;
516 /*
517 * Lockless access to see if there are pending faults.
518 * __pollwait's last action is the add_wait_queue but
519 * the spin_unlock would allow the waitqueue_active to
520 * pass above the actual list_add inside
521 * add_wait_queue critical section. So use a full
522 * memory barrier to serialize the list_add write of
523 * add_wait_queue() with the waitqueue_active read
524 * below.
525 */
526 ret = 0;
527 smp_mb();
528 if (waitqueue_active(&ctx->fault_pending_wqh))
529 ret = POLLIN;
530 return ret;
531 default:
532 BUG();
533 }
534}
535
536static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
537 struct uffd_msg *msg)
538{
539 ssize_t ret;
540 DECLARE_WAITQUEUE(wait, current);
541 struct userfaultfd_wait_queue *uwq;
542
543 /* always take the fd_wqh lock before the fault_pending_wqh lock */
544 spin_lock(&ctx->fd_wqh.lock);
545 __add_wait_queue(&ctx->fd_wqh, &wait);
546 for (;;) {
547 set_current_state(TASK_INTERRUPTIBLE);
548 spin_lock(&ctx->fault_pending_wqh.lock);
549 uwq = find_userfault(ctx);
550 if (uwq) {
551 /*
552 * Use a seqcount to repeat the lockless check
553 * in wake_userfault() to avoid missing
554 * wakeups because during the refile both
555 * waitqueue could become empty if this is the
556 * only userfault.
557 */
558 write_seqcount_begin(&ctx->refile_seq);
559
560 /*
561 * The fault_pending_wqh.lock prevents the uwq
562 * from disappearing from under us.
563 *
564 * Refile this userfault from
565 * fault_pending_wqh to fault_wqh, it's not
566 * pending anymore after we read it.
567 *
568 * Use list_del() by hand (as
569 * userfaultfd_wake_function also uses
570 * list_del_init() by hand) to be sure nobody
571 * changes __remove_wait_queue() to use
572 * list_del_init() in turn breaking the
573 * !list_empty_careful() check in
574 * handle_userfault(). The uwq->wq.task_list
575 * must never be empty at any time during the
576 * refile, or the waitqueue could disappear
577 * from under us. The "wait_queue_head_t"
578 * parameter of __remove_wait_queue() is unused
579 * anyway.
580 */
581 list_del(&uwq->wq.task_list);
582 __add_wait_queue(&ctx->fault_wqh, &uwq->wq);
583
584 write_seqcount_end(&ctx->refile_seq);
585
586 /* careful to always initialize msg if ret == 0 */
587 *msg = uwq->msg;
588 spin_unlock(&ctx->fault_pending_wqh.lock);
589 ret = 0;
590 break;
591 }
592 spin_unlock(&ctx->fault_pending_wqh.lock);
593 if (signal_pending(current)) {
594 ret = -ERESTARTSYS;
595 break;
596 }
597 if (no_wait) {
598 ret = -EAGAIN;
599 break;
600 }
601 spin_unlock(&ctx->fd_wqh.lock);
602 schedule();
603 spin_lock(&ctx->fd_wqh.lock);
604 }
605 __remove_wait_queue(&ctx->fd_wqh, &wait);
606 __set_current_state(TASK_RUNNING);
607 spin_unlock(&ctx->fd_wqh.lock);
608
609 return ret;
610}
611
612static ssize_t userfaultfd_read(struct file *file, char __user *buf,
613 size_t count, loff_t *ppos)
614{
615 struct userfaultfd_ctx *ctx = file->private_data;
616 ssize_t _ret, ret = 0;
617 struct uffd_msg msg;
618 int no_wait = file->f_flags & O_NONBLOCK;
619
620 if (ctx->state == UFFD_STATE_WAIT_API)
621 return -EINVAL;
622
623 for (;;) {
624 if (count < sizeof(msg))
625 return ret ? ret : -EINVAL;
626 _ret = userfaultfd_ctx_read(ctx, no_wait, &msg);
627 if (_ret < 0)
628 return ret ? ret : _ret;
629 if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg)))
630 return ret ? ret : -EFAULT;
631 ret += sizeof(msg);
632 buf += sizeof(msg);
633 count -= sizeof(msg);
634 /*
635 * Allow reading more than one fault at a time but only
636 * block if waiting for the very first one.
637 */
638 no_wait = O_NONBLOCK;
639 }
640}
641
642static void __wake_userfault(struct userfaultfd_ctx *ctx,
643 struct userfaultfd_wake_range *range)
644{
645 unsigned long start, end;
646
647 start = range->start;
648 end = range->start + range->len;
649
650 spin_lock(&ctx->fault_pending_wqh.lock);
651 /* wake all in the range and autoremove */
652 if (waitqueue_active(&ctx->fault_pending_wqh))
653 __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0,
654 range);
655 if (waitqueue_active(&ctx->fault_wqh))
656 __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, range);
657 spin_unlock(&ctx->fault_pending_wqh.lock);
658}
659
660static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
661 struct userfaultfd_wake_range *range)
662{
663 unsigned seq;
664 bool need_wakeup;
665
666 /*
667 * To be sure waitqueue_active() is not reordered by the CPU
668 * before the pagetable update, use an explicit SMP memory
669 * barrier here. PT lock release or up_read(mmap_sem) still
670 * have release semantics that can allow the
671 * waitqueue_active() to be reordered before the pte update.
672 */
673 smp_mb();
674
675 /*
676 * Use waitqueue_active because it's very frequent to
677 * change the address space atomically even if there are no
678 * userfaults yet. So we take the spinlock only when we're
679 * sure we have userfaults to wake.
680 */
681 do {
682 seq = read_seqcount_begin(&ctx->refile_seq);
683 need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) ||
684 waitqueue_active(&ctx->fault_wqh);
685 cond_resched();
686 } while (read_seqcount_retry(&ctx->refile_seq, seq));
687 if (need_wakeup)
688 __wake_userfault(ctx, range);
689}
690
691static __always_inline int validate_range(struct mm_struct *mm,
692 __u64 start, __u64 len)
693{
694 __u64 task_size = mm->task_size;
695
696 if (start & ~PAGE_MASK)
697 return -EINVAL;
698 if (len & ~PAGE_MASK)
699 return -EINVAL;
700 if (!len)
701 return -EINVAL;
702 if (start < mmap_min_addr)
703 return -EINVAL;
704 if (start >= task_size)
705 return -EINVAL;
706 if (len > task_size - start)
707 return -EINVAL;
708 return 0;
709}
710
711static int userfaultfd_register(struct userfaultfd_ctx *ctx,
712 unsigned long arg)
713{
714 struct mm_struct *mm = ctx->mm;
715 struct vm_area_struct *vma, *prev, *cur;
716 int ret;
717 struct uffdio_register uffdio_register;
718 struct uffdio_register __user *user_uffdio_register;
719 unsigned long vm_flags, new_flags;
720 bool found;
721 unsigned long start, end, vma_end;
722
723 user_uffdio_register = (struct uffdio_register __user *) arg;
724
725 ret = -EFAULT;
726 if (copy_from_user(&uffdio_register, user_uffdio_register,
727 sizeof(uffdio_register)-sizeof(__u64)))
728 goto out;
729
730 ret = -EINVAL;
731 if (!uffdio_register.mode)
732 goto out;
733 if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING|
734 UFFDIO_REGISTER_MODE_WP))
735 goto out;
736 vm_flags = 0;
737 if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
738 vm_flags |= VM_UFFD_MISSING;
739 if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
740 vm_flags |= VM_UFFD_WP;
741 /*
742 * FIXME: remove the below error constraint by
743 * implementing the wprotect tracking mode.
744 */
745 ret = -EINVAL;
746 goto out;
747 }
748
749 ret = validate_range(mm, uffdio_register.range.start,
750 uffdio_register.range.len);
751 if (ret)
752 goto out;
753
754 start = uffdio_register.range.start;
755 end = start + uffdio_register.range.len;
756
757 down_write(&mm->mmap_sem);
758 vma = find_vma_prev(mm, start, &prev);
759
760 ret = -ENOMEM;
761 if (!vma)
762 goto out_unlock;
763
764 /* check that there's at least one vma in the range */
765 ret = -EINVAL;
766 if (vma->vm_start >= end)
767 goto out_unlock;
768
769 /*
770 * Search for incompatible vmas.
771 *
772 * FIXME: this shall be relaxed later so that it doesn't fail
773 * on tmpfs backed vmas (in addition to the current allowance
774 * on anonymous vmas).
775 */
776 found = false;
777 for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
778 cond_resched();
779
780 BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
781 !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
782
783 /* check for incompatible vmas */
784 ret = -EINVAL;
785 if (cur->vm_ops)
786 goto out_unlock;
787
788 /*
789 * Check that this vma isn't already owned by a
790 * different userfaultfd. We can't allow more than one
791 * userfaultfd to own a single vma simultaneously or we
792 * wouldn't know which one to deliver the userfaults to.
793 */
794 ret = -EBUSY;
795 if (cur->vm_userfaultfd_ctx.ctx &&
796 cur->vm_userfaultfd_ctx.ctx != ctx)
797 goto out_unlock;
798
799 found = true;
800 }
801 BUG_ON(!found);
802
803 if (vma->vm_start < start)
804 prev = vma;
805
806 ret = 0;
807 do {
808 cond_resched();
809
810 BUG_ON(vma->vm_ops);
811 BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
812 vma->vm_userfaultfd_ctx.ctx != ctx);
813
814 /*
815 * Nothing to do: this vma is already registered into this
816 * userfaultfd and with the right tracking mode too.
817 */
818 if (vma->vm_userfaultfd_ctx.ctx == ctx &&
819 (vma->vm_flags & vm_flags) == vm_flags)
820 goto skip;
821
822 if (vma->vm_start > start)
823 start = vma->vm_start;
824 vma_end = min(end, vma->vm_end);
825
826 new_flags = (vma->vm_flags & ~vm_flags) | vm_flags;
827 prev = vma_merge(mm, prev, start, vma_end, new_flags,
828 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
829 vma_policy(vma),
830 ((struct vm_userfaultfd_ctx){ ctx }));
831 if (prev) {
832 vma = prev;
833 goto next;
834 }
835 if (vma->vm_start < start) {
836 ret = split_vma(mm, vma, start, 1);
837 if (ret)
838 break;
839 }
840 if (vma->vm_end > end) {
841 ret = split_vma(mm, vma, end, 0);
842 if (ret)
843 break;
844 }
845 next:
846 /*
847 * In the vma_merge() successful mprotect-like case 8:
848 * the next vma was merged into the current one and
849 * the current one has not been updated yet.
850 */
851 vma->vm_flags = new_flags;
852 vma->vm_userfaultfd_ctx.ctx = ctx;
853
854 skip:
855 prev = vma;
856 start = vma->vm_end;
857 vma = vma->vm_next;
858 } while (vma && vma->vm_start < end);
859out_unlock:
860 up_write(&mm->mmap_sem);
861 if (!ret) {
862 /*
863 * Now that we scanned all vmas we can already tell
864 * userland which ioctl methods are guaranteed to
865 * succeed on this range.
866 */
867 if (put_user(UFFD_API_RANGE_IOCTLS,
868 &user_uffdio_register->ioctls))
869 ret = -EFAULT;
870 }
871out:
872 return ret;
873}
874
875static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
876 unsigned long arg)
877{
878 struct mm_struct *mm = ctx->mm;
879 struct vm_area_struct *vma, *prev, *cur;
880 int ret;
881 struct uffdio_range uffdio_unregister;
882 unsigned long new_flags;
883 bool found;
884 unsigned long start, end, vma_end;
885 const void __user *buf = (void __user *)arg;
886
887 ret = -EFAULT;
888 if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
889 goto out;
890
891 ret = validate_range(mm, uffdio_unregister.start,
892 uffdio_unregister.len);
893 if (ret)
894 goto out;
895
896 start = uffdio_unregister.start;
897 end = start + uffdio_unregister.len;
898
899 down_write(&mm->mmap_sem);
900 vma = find_vma_prev(mm, start, &prev);
901
902 ret = -ENOMEM;
903 if (!vma)
904 goto out_unlock;
905
906 /* check that there's at least one vma in the range */
907 ret = -EINVAL;
908 if (vma->vm_start >= end)
909 goto out_unlock;
910
911 /*
912 * Search for incompatible vmas.
913 *
914 * FIXME: this shall be relaxed later so that it doesn't fail
915 * on tmpfs backed vmas (in addition to the current allowance
916 * on anonymous vmas).
917 */
918 found = false;
919 ret = -EINVAL;
920 for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
921 cond_resched();
922
923 BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
924 !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
925
926 /*
927 * Check for incompatible vmas. Not strictly required
928 * here, as incompatible vmas cannot have a
929 * userfaultfd_ctx registered on them, but this
930 * provides stricter behavior so unregistration
931 * errors are noticed.
932 */
933 if (cur->vm_ops)
934 goto out_unlock;
935
936 found = true;
937 }
938 BUG_ON(!found);
939
940 if (vma->vm_start < start)
941 prev = vma;
942
943 ret = 0;
944 do {
945 cond_resched();
946
947 BUG_ON(vma->vm_ops);
948
949 /*
950 * Nothing to do: this vma is not registered with any
951 * userfaultfd, so there is nothing to unregister on it.
952 */
953 if (!vma->vm_userfaultfd_ctx.ctx)
954 goto skip;
955
956 if (vma->vm_start > start)
957 start = vma->vm_start;
958 vma_end = min(end, vma->vm_end);
959
960 new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
961 prev = vma_merge(mm, prev, start, vma_end, new_flags,
962 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
963 vma_policy(vma),
964 NULL_VM_UFFD_CTX);
965 if (prev) {
966 vma = prev;
967 goto next;
968 }
969 if (vma->vm_start < start) {
970 ret = split_vma(mm, vma, start, 1);
971 if (ret)
972 break;
973 }
974 if (vma->vm_end > end) {
975 ret = split_vma(mm, vma, end, 0);
976 if (ret)
977 break;
978 }
979 next:
980 /*
981 * In the vma_merge() successful mprotect-like case 8:
982 * the next vma was merged into the current one and
983 * the current one has not been updated yet.
984 */
985 vma->vm_flags = new_flags;
986 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
987
988 skip:
989 prev = vma;
990 start = vma->vm_end;
991 vma = vma->vm_next;
992 } while (vma && vma->vm_start < end);
993out_unlock:
994 up_write(&mm->mmap_sem);
995out:
996 return ret;
997}
998
999/*
1000 * userfaultfd_wake may be used in combination with the
1001 * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
1002 */
1003static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
1004 unsigned long arg)
1005{
1006 int ret;
1007 struct uffdio_range uffdio_wake;
1008 struct userfaultfd_wake_range range;
1009 const void __user *buf = (void __user *)arg;
1010
1011 ret = -EFAULT;
1012 if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
1013 goto out;
1014
1015 ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
1016 if (ret)
1017 goto out;
1018
1019 range.start = uffdio_wake.start;
1020 range.len = uffdio_wake.len;
1021
1022 /*
1023 * len == 0 means wake all and we don't want to wake all here,
1024 * so check it again to be sure.
1025 */
1026 VM_BUG_ON(!range.len);
1027
1028 wake_userfault(ctx, &range);
1029 ret = 0;
1030
1031out:
1032 return ret;
1033}
1034
1035static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
1036 unsigned long arg)
1037{
1038 __s64 ret;
1039 struct uffdio_copy uffdio_copy;
1040 struct uffdio_copy __user *user_uffdio_copy;
1041 struct userfaultfd_wake_range range;
1042
1043 user_uffdio_copy = (struct uffdio_copy __user *) arg;
1044
1045 ret = -EFAULT;
1046 if (copy_from_user(&uffdio_copy, user_uffdio_copy,
1047 /* don't copy "copy" last field */
1048 sizeof(uffdio_copy)-sizeof(__s64)))
1049 goto out;
1050
1051 ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
1052 if (ret)
1053 goto out;
1054 /*
1055 * double check for wraparound just in case. copy_from_user()
1056 * will later check uffdio_copy.src + uffdio_copy.len to fit
1057 * in the userland range.
1058 */
1059 ret = -EINVAL;
1060 if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
1061 goto out;
1062 if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE)
1063 goto out;
1064
1065 ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
1066 uffdio_copy.len);
1067 if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
1068 return -EFAULT;
1069 if (ret < 0)
1070 goto out;
1071 BUG_ON(!ret);
1072 /* len == 0 would wake all */
1073 range.len = ret;
1074 if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
1075 range.start = uffdio_copy.dst;
1076 wake_userfault(ctx, &range);
1077 }
1078 ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
1079out:
1080 return ret;
1081}
1082
1083static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
1084 unsigned long arg)
1085{
1086 __s64 ret;
1087 struct uffdio_zeropage uffdio_zeropage;
1088 struct uffdio_zeropage __user *user_uffdio_zeropage;
1089 struct userfaultfd_wake_range range;
1090
1091 user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
1092
1093 ret = -EFAULT;
1094 if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
1095 /* don't copy "zeropage" last field */
1096 sizeof(uffdio_zeropage)-sizeof(__s64)))
1097 goto out;
1098
1099 ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
1100 uffdio_zeropage.range.len);
1101 if (ret)
1102 goto out;
1103 ret = -EINVAL;
1104 if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
1105 goto out;
1106
1107 ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
1108 uffdio_zeropage.range.len);
1109 if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
1110 return -EFAULT;
1111 if (ret < 0)
1112 goto out;
1113 /* len == 0 would wake all */
1114 BUG_ON(!ret);
1115 range.len = ret;
1116 if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
1117 range.start = uffdio_zeropage.range.start;
1118 wake_userfault(ctx, &range);
1119 }
1120 ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
1121out:
1122 return ret;
1123}
1124
1125/*
1126 * userland asks for a certain API version and we return which bits
1127 * and ioctl commands are implemented in this kernel for such API
1128 * version or -EINVAL if unknown.
1129 */
1130static int userfaultfd_api(struct userfaultfd_ctx *ctx,
1131 unsigned long arg)
1132{
1133 struct uffdio_api uffdio_api;
1134 void __user *buf = (void __user *)arg;
1135 int ret;
1136
1137 ret = -EINVAL;
1138 if (ctx->state != UFFD_STATE_WAIT_API)
1139 goto out;
1140 ret = -EFAULT;
1141 if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
1142 goto out;
1143 if (uffdio_api.api != UFFD_API || uffdio_api.features) {
1144 memset(&uffdio_api, 0, sizeof(uffdio_api));
1145 if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
1146 goto out;
1147 ret = -EINVAL;
1148 goto out;
1149 }
1150 uffdio_api.features = UFFD_API_FEATURES;
1151 uffdio_api.ioctls = UFFD_API_IOCTLS;
1152 ret = -EFAULT;
1153 if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
1154 goto out;
1155 ctx->state = UFFD_STATE_RUNNING;
1156 ret = 0;
1157out:
1158 return ret;
1159}
1160
1161static long userfaultfd_ioctl(struct file *file, unsigned cmd,
1162 unsigned long arg)
1163{
1164 int ret = -EINVAL;
1165 struct userfaultfd_ctx *ctx = file->private_data;
1166
1167 if (cmd != UFFDIO_API && ctx->state == UFFD_STATE_WAIT_API)
1168 return -EINVAL;
1169
1170 switch(cmd) {
1171 case UFFDIO_API:
1172 ret = userfaultfd_api(ctx, arg);
1173 break;
1174 case UFFDIO_REGISTER:
1175 ret = userfaultfd_register(ctx, arg);
1176 break;
1177 case UFFDIO_UNREGISTER:
1178 ret = userfaultfd_unregister(ctx, arg);
1179 break;
1180 case UFFDIO_WAKE:
1181 ret = userfaultfd_wake(ctx, arg);
1182 break;
1183 case UFFDIO_COPY:
1184 ret = userfaultfd_copy(ctx, arg);
1185 break;
1186 case UFFDIO_ZEROPAGE:
1187 ret = userfaultfd_zeropage(ctx, arg);
1188 break;
1189 }
1190 return ret;
1191}
1192
1193#ifdef CONFIG_PROC_FS
1194static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
1195{
1196 struct userfaultfd_ctx *ctx = f->private_data;
1197 wait_queue_t *wq;
1198 struct userfaultfd_wait_queue *uwq;
1199 unsigned long pending = 0, total = 0;
1200
1201 spin_lock(&ctx->fault_pending_wqh.lock);
1202 list_for_each_entry(wq, &ctx->fault_pending_wqh.task_list, task_list) {
1203 uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
1204 pending++;
1205 total++;
1206 }
1207 list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) {
1208 uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
1209 total++;
1210 }
1211 spin_unlock(&ctx->fault_pending_wqh.lock);
1212
1213 /*
1214 * If more protocols are added in the future, they will all be shown
1215 * separated by a space. Like this:
1216 * protocols: aa:... bb:...
1217 */
1218 seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
1219 pending, total, UFFD_API, UFFD_API_FEATURES,
1220 UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
1221}
1222#endif
1223
1224static const struct file_operations userfaultfd_fops = {
1225#ifdef CONFIG_PROC_FS
1226 .show_fdinfo = userfaultfd_show_fdinfo,
1227#endif
1228 .release = userfaultfd_release,
1229 .poll = userfaultfd_poll,
1230 .read = userfaultfd_read,
1231 .unlocked_ioctl = userfaultfd_ioctl,
1232 .compat_ioctl = userfaultfd_ioctl,
1233 .llseek = noop_llseek,
1234};
1235
1236static void init_once_userfaultfd_ctx(void *mem)
1237{
1238 struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem;
1239
1240 init_waitqueue_head(&ctx->fault_pending_wqh);
1241 init_waitqueue_head(&ctx->fault_wqh);
1242 init_waitqueue_head(&ctx->fd_wqh);
1243 seqcount_init(&ctx->refile_seq);
1244}
1245
1246/**
1247 * userfaultfd_file_create - Creates a userfaultfd file pointer.
1248 * @flags: Flags for the userfaultfd file.
1249 *
1250 * This function creates a userfaultfd file pointer without installing
1251 * it into the fd table. This is useful when the userfaultfd file is
1252 * used during the initialization of data structures that require
1253 * extra setup after the userfaultfd creation. So the userfaultfd
1254 * creation is split into the file pointer creation phase, and the
1255 * file descriptor installation phase. In this way races with
1256 * userspace closing the newly installed file descriptor can be
1257 * avoided. Returns a userfaultfd file pointer, or a proper error
1258 * pointer.
1259 */
1260static struct file *userfaultfd_file_create(int flags)
1261{
1262 struct file *file;
1263 struct userfaultfd_ctx *ctx;
1264
1265 BUG_ON(!current->mm);
1266
1267 /* Check the UFFD_* constants for consistency. */
1268 BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
1269 BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);
1270
1271 file = ERR_PTR(-EINVAL);
1272 if (flags & ~UFFD_SHARED_FCNTL_FLAGS)
1273 goto out;
1274
1275 file = ERR_PTR(-ENOMEM);
1276 ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
1277 if (!ctx)
1278 goto out;
1279
1280 atomic_set(&ctx->refcount, 1);
1281 ctx->flags = flags;
1282 ctx->state = UFFD_STATE_WAIT_API;
1283 ctx->released = false;
1284 ctx->mm = current->mm;
1285 /* prevent the mm struct from being freed */
1286 atomic_inc(&ctx->mm->mm_users);
1287
1288 file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
1289 O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
1290 if (IS_ERR(file))
1291 kmem_cache_free(userfaultfd_ctx_cachep, ctx);
1292out:
1293 return file;
1294}
1295
1296SYSCALL_DEFINE1(userfaultfd, int, flags)
1297{
1298 int fd, error;
1299 struct file *file;
1300
1301 error = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS);
1302 if (error < 0)
1303 return error;
1304 fd = error;
1305
1306 file = userfaultfd_file_create(flags);
1307 if (IS_ERR(file)) {
1308 error = PTR_ERR(file);
1309 goto err_put_unused_fd;
1310 }
1311 fd_install(fd, file);
1312
1313 return fd;
1314
1315err_put_unused_fd:
1316 put_unused_fd(fd);
1317
1318 return error;
1319}
1320
1321static int __init userfaultfd_init(void)
1322{
1323 userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
1324 sizeof(struct userfaultfd_ctx),
1325 0,
1326 SLAB_HWCACHE_ALIGN|SLAB_PANIC,
1327 init_once_userfaultfd_ctx);
1328 return 0;
1329}
1330__initcall(userfaultfd_init);
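
As a rough illustration of the control flow this new file implements, a minimal userland consumer would perform the UFFDIO_API handshake, register an anonymous range in MISSING mode, then read one uffd_msg and resolve it with UFFDIO_COPY. The sketch below is not part of the patch; error handling is abbreviated, the mappings are arbitrary, and it assumes uapi headers that provide __NR_userfaultfd and <linux/userfaultfd.h>.

#define _GNU_SOURCE
#include <fcntl.h>
#include <poll.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	if (uffd < 0)
		return 1;

	/* Mandatory handshake: moves the fd from WAIT_API to RUNNING. */
	struct uffdio_api api = { .api = UFFD_API };
	if (ioctl(uffd, UFFDIO_API, &api))
		return 1;

	char *region = mmap(NULL, page, PROT_READ | PROT_WRITE,
			    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	char *src = mmap(NULL, page, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (region == MAP_FAILED || src == MAP_FAILED)
		return 1;

	/* Only MISSING mode is accepted by this version of the code. */
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)region, .len = page },
		.mode  = UFFDIO_REGISTER_MODE_MISSING,
	};
	if (ioctl(uffd, UFFDIO_REGISTER, &reg))
		return 1;

	/* In a real program another thread would touch "region" and fault;
	 * this only shows how such a fault would be serviced. */
	struct pollfd pfd = { .fd = uffd, .events = POLLIN };
	if (poll(&pfd, 1, 0) > 0) {
		struct uffd_msg msg;

		if (read(uffd, &msg, sizeof(msg)) == sizeof(msg) &&
		    msg.event == UFFD_EVENT_PAGEFAULT) {
			struct uffdio_copy copy = {
				.dst = msg.arg.pagefault.address & ~(page - 1),
				.src = (unsigned long)src,
				.len = page,
				.mode = 0,	/* 0 == also wake the faulter */
			};
			ioctl(uffd, UFFDIO_COPY, &copy);
		}
	}
	return 0;
}
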
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 1fb16562c159..bbd9b1f10ffb 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -511,9 +511,9 @@ xfs_showargs(
511 seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10); 511 seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10);
512 512
513 if (mp->m_logname) 513 if (mp->m_logname)
514 seq_printf(m, "," MNTOPT_LOGDEV "=%s", mp->m_logname); 514 seq_show_option(m, MNTOPT_LOGDEV, mp->m_logname);
515 if (mp->m_rtname) 515 if (mp->m_rtname)
516 seq_printf(m, "," MNTOPT_RTDEV "=%s", mp->m_rtname); 516 seq_show_option(m, MNTOPT_RTDEV, mp->m_rtname);
517 517
518 if (mp->m_dalign > 0) 518 if (mp->m_dalign > 0)
519 seq_printf(m, "," MNTOPT_SUNIT "=%d", 519 seq_printf(m, "," MNTOPT_SUNIT "=%d",
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 8b6c083e68a7..8d70e1361ecd 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -137,6 +137,7 @@ struct cred {
137 kernel_cap_t cap_permitted; /* caps we're permitted */ 137 kernel_cap_t cap_permitted; /* caps we're permitted */
138 kernel_cap_t cap_effective; /* caps we can actually use */ 138 kernel_cap_t cap_effective; /* caps we can actually use */
139 kernel_cap_t cap_bset; /* capability bounding set */ 139 kernel_cap_t cap_bset; /* capability bounding set */
140 kernel_cap_t cap_ambient; /* Ambient capability set */
140#ifdef CONFIG_KEYS 141#ifdef CONFIG_KEYS
141 unsigned char jit_keyring; /* default keyring to attach requested 142 unsigned char jit_keyring; /* default keyring to attach requested
142 * keys to */ 143 * keys to */
@@ -212,6 +213,13 @@ static inline void validate_process_creds(void)
212} 213}
213#endif 214#endif
214 215
216static inline bool cap_ambient_invariant_ok(const struct cred *cred)
217{
218 return cap_issubset(cred->cap_ambient,
219 cap_intersect(cred->cap_permitted,
220 cred->cap_inheritable));
221}
222
215/** 223/**
216 * get_new_cred - Get a reference on a new set of credentials 224 * get_new_cred - Get a reference on a new set of credentials
217 * @cred: The new credentials to reference 225 * @cred: The new credentials to reference
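
The cap_ambient_invariant_ok() helper above encodes the rule that the ambient set must stay a subset of the intersection of the permitted and inheritable sets. As a hedged illustration (not part of this hunk), userland would typically populate the ambient set through the PR_CAP_AMBIENT prctl added by the same series, and the kernel rejects the raise when the invariant would be violated:

/* Illustrative only: raise CAP_NET_BIND_SERVICE into the ambient set so it
 * survives execve() of an unprivileged helper. Requires uapi headers that
 * define PR_CAP_AMBIENT, and the capability must already be in both the
 * permitted and inheritable sets. */
#include <stdio.h>
#include <sys/prctl.h>
#include <linux/capability.h>
#include <linux/prctl.h>

int raise_ambient_net_bind(void)
{
	if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE,
		  CAP_NET_BIND_SERVICE, 0, 0)) {
		perror("PR_CAP_AMBIENT_RAISE");
		return -1;
	}
	return 0;
}
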
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fbd780c33c5f..864203c10dbc 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1612,7 +1612,6 @@ struct file_operations {
1612 long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); 1612 long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
1613 long (*compat_ioctl) (struct file *, unsigned int, unsigned long); 1613 long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
1614 int (*mmap) (struct file *, struct vm_area_struct *); 1614 int (*mmap) (struct file *, struct vm_area_struct *);
1615 int (*mremap)(struct file *, struct vm_area_struct *);
1616 int (*open) (struct inode *, struct file *); 1615 int (*open) (struct inode *, struct file *);
1617 int (*flush) (struct file *, fl_owner_t id); 1616 int (*flush) (struct file *, fl_owner_t id);
1618 int (*release) (struct inode *, struct file *); 1617 int (*release) (struct inode *, struct file *);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 65a517dd32f7..e0727d77feaf 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -195,40 +195,49 @@ struct fsnotify_group {
195#define FSNOTIFY_EVENT_INODE 2 195#define FSNOTIFY_EVENT_INODE 2
196 196
197/* 197/*
198 * a mark is simply an object attached to an in core inode which allows an 198 * A mark is simply an object attached to an in core inode which allows an
199 * fsnotify listener to indicate they are either no longer interested in events 199 * fsnotify listener to indicate they are either no longer interested in events
200 * of a type matching mask or only interested in those events. 200 * of a type matching mask or only interested in those events.
201 * 201 *
202 * these are flushed when an inode is evicted from core and may be flushed 202 * These are flushed when an inode is evicted from core and may be flushed
203 * when the inode is modified (as seen by fsnotify_access). Some fsnotify users 203 * when the inode is modified (as seen by fsnotify_access). Some fsnotify
204 * (such as dnotify) will flush these when the open fd is closed and not at 204 * users (such as dnotify) will flush these when the open fd is closed and not
205 * inode eviction or modification. 205 * at inode eviction or modification.
206 *
207 * Text in brackets is showing the lock(s) protecting modifications of a
208 * particular entry. obj_lock means either inode->i_lock or
209 * mnt->mnt_root->d_lock depending on the mark type.
206 */ 210 */
207struct fsnotify_mark { 211struct fsnotify_mark {
208 __u32 mask; /* mask this mark is for */ 212 /* Mask this mark is for [mark->lock, group->mark_mutex] */
209 /* we hold ref for each i_list and g_list. also one ref for each 'thing' 213 __u32 mask;
214 /* We hold one for presence in g_list. Also one ref for each 'thing'
210 * in kernel that found and may be using this mark. */ 215 * in kernel that found and may be using this mark. */
211 atomic_t refcnt; /* active things looking at this mark */ 216 atomic_t refcnt;
212 struct fsnotify_group *group; /* group this mark is for */ 217 /* Group this mark is for. Set on mark creation, stable until last ref
213 struct list_head g_list; /* list of marks by group->i_fsnotify_marks 218 * is dropped */
214 * Also reused for queueing mark into 219 struct fsnotify_group *group;
215 * destroy_list when it's waiting for 220 /* List of marks by group->i_fsnotify_marks. Also reused for queueing
216 * the end of SRCU period before it can 221 * mark into destroy_list when it's waiting for the end of SRCU period
217 * be freed */ 222 * before it can be freed. [group->mark_mutex] */
218 spinlock_t lock; /* protect group and inode */ 223 struct list_head g_list;
219 struct hlist_node obj_list; /* list of marks for inode / vfsmount */ 224 /* Protects inode / mnt pointers, flags, masks */
220 struct list_head free_list; /* tmp list used when freeing this mark */ 225 spinlock_t lock;
221 union { 226 /* List of marks for inode / vfsmount [obj_lock] */
227 struct hlist_node obj_list;
228 union { /* Object pointer [mark->lock, group->mark_mutex] */
222 struct inode *inode; /* inode this mark is associated with */ 229 struct inode *inode; /* inode this mark is associated with */
223 struct vfsmount *mnt; /* vfsmount this mark is associated with */ 230 struct vfsmount *mnt; /* vfsmount this mark is associated with */
224 }; 231 };
225 __u32 ignored_mask; /* events types to ignore */ 232 /* Events types to ignore [mark->lock, group->mark_mutex] */
233 __u32 ignored_mask;
226#define FSNOTIFY_MARK_FLAG_INODE 0x01 234#define FSNOTIFY_MARK_FLAG_INODE 0x01
227#define FSNOTIFY_MARK_FLAG_VFSMOUNT 0x02 235#define FSNOTIFY_MARK_FLAG_VFSMOUNT 0x02
228#define FSNOTIFY_MARK_FLAG_OBJECT_PINNED 0x04 236#define FSNOTIFY_MARK_FLAG_OBJECT_PINNED 0x04
229#define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x08 237#define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x08
230#define FSNOTIFY_MARK_FLAG_ALIVE 0x10 238#define FSNOTIFY_MARK_FLAG_ALIVE 0x10
231 unsigned int flags; /* vfsmount or inode mark? */ 239#define FSNOTIFY_MARK_FLAG_ATTACHED 0x20
240 unsigned int flags; /* flags [mark->lock] */
232 void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */ 241 void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */
233}; 242};
234 243
@@ -345,8 +354,10 @@ extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark, struct fsnotify_
345/* given a group and a mark, flag mark to be freed when all references are dropped */ 354/* given a group and a mark, flag mark to be freed when all references are dropped */
346extern void fsnotify_destroy_mark(struct fsnotify_mark *mark, 355extern void fsnotify_destroy_mark(struct fsnotify_mark *mark,
347 struct fsnotify_group *group); 356 struct fsnotify_group *group);
348extern void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, 357/* detach mark from inode / mount list, group list, drop inode reference */
349 struct fsnotify_group *group); 358extern void fsnotify_detach_mark(struct fsnotify_mark *mark);
359/* free mark */
360extern void fsnotify_free_mark(struct fsnotify_mark *mark);
350/* run all the marks in a group, and clear all of the vfsmount marks */ 361/* run all the marks in a group, and clear all of the vfsmount marks */
351extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group); 362extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group);
352/* run all the marks in a group, and clear all of the inode marks */ 363/* run all the marks in a group, and clear all of the inode marks */
diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h
index 5383bb1394a1..7ff168d06967 100644
--- a/include/linux/genalloc.h
+++ b/include/linux/genalloc.h
@@ -59,6 +59,8 @@ struct gen_pool {
59 59
60 genpool_algo_t algo; /* allocation function */ 60 genpool_algo_t algo; /* allocation function */
61 void *data; 61 void *data;
62
63 const char *name;
62}; 64};
63 65
64/* 66/*
@@ -118,8 +120,8 @@ extern unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size,
118 unsigned long start, unsigned int nr, void *data); 120 unsigned long start, unsigned int nr, void *data);
119 121
120extern struct gen_pool *devm_gen_pool_create(struct device *dev, 122extern struct gen_pool *devm_gen_pool_create(struct device *dev,
121 int min_alloc_order, int nid); 123 int min_alloc_order, int nid, const char *name);
122extern struct gen_pool *gen_pool_get(struct device *dev); 124extern struct gen_pool *gen_pool_get(struct device *dev, const char *name);
123 125
124bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start, 126bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start,
125 size_t size); 127 size_t size);
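
With the name argument added above, one device can own several gen_pools and consumers can look them up individually. A sketch under assumptions (the device, the region and the "sram-exec" name are invented for illustration):

/* Hypothetical driver probe path: create a named pool, then find it again. */
#include <linux/device.h>
#include <linux/err.h>
#include <linux/genalloc.h>
#include <linux/log2.h>
#include <linux/numa.h>

static int example_setup_pool(struct device *dev, phys_addr_t phys,
			      void __iomem *virt, size_t size)
{
	struct gen_pool *pool;

	pool = devm_gen_pool_create(dev, ilog2(64), NUMA_NO_NODE, "sram-exec");
	if (IS_ERR_OR_NULL(pool))
		return pool ? PTR_ERR(pool) : -ENOMEM;

	return gen_pool_add_virt(pool, (unsigned long)virt, phys, size,
				 NUMA_NO_NODE);
}

/* Elsewhere, a consumer bound to the same struct device: */
static struct gen_pool *example_find_pool(struct device *dev)
{
	return gen_pool_get(dev, "sram-exec");	/* NULL if not (yet) created */
}
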
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 869b21dcf503..e691b6a23f72 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -11,7 +11,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
11 const char namefmt[], ...); 11 const char namefmt[], ...);
12 12
13#define kthread_create(threadfn, data, namefmt, arg...) \ 13#define kthread_create(threadfn, data, namefmt, arg...) \
14 kthread_create_on_node(threadfn, data, -1, namefmt, ##arg) 14 kthread_create_on_node(threadfn, data, NUMA_NO_NODE, namefmt, ##arg)
15 15
16 16
17struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), 17struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bf6f117fcf4d..8b257c43855b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -124,8 +124,10 @@ extern unsigned int kobjsize(const void *objp);
124#define VM_MAYSHARE 0x00000080 124#define VM_MAYSHARE 0x00000080
125 125
126#define VM_GROWSDOWN 0x00000100 /* general info on the segment */ 126#define VM_GROWSDOWN 0x00000100 /* general info on the segment */
127#define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */
127#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ 128#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */
128#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ 129#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */
130#define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */
129 131
130#define VM_LOCKED 0x00002000 132#define VM_LOCKED 0x00002000
131#define VM_IO 0x00004000 /* Memory mapped I/O or similar */ 133#define VM_IO 0x00004000 /* Memory mapped I/O or similar */
@@ -245,6 +247,7 @@ struct vm_fault {
245struct vm_operations_struct { 247struct vm_operations_struct {
246 void (*open)(struct vm_area_struct * area); 248 void (*open)(struct vm_area_struct * area);
247 void (*close)(struct vm_area_struct * area); 249 void (*close)(struct vm_area_struct * area);
250 int (*mremap)(struct vm_area_struct * area);
248 int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); 251 int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
249 void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf); 252 void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf);
250 253
@@ -1833,7 +1836,7 @@ extern int vma_adjust(struct vm_area_struct *vma, unsigned long start,
1833extern struct vm_area_struct *vma_merge(struct mm_struct *, 1836extern struct vm_area_struct *vma_merge(struct mm_struct *,
1834 struct vm_area_struct *prev, unsigned long addr, unsigned long end, 1837 struct vm_area_struct *prev, unsigned long addr, unsigned long end,
1835 unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, 1838 unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
1836 struct mempolicy *); 1839 struct mempolicy *, struct vm_userfaultfd_ctx);
1837extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); 1840extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
1838extern int split_vma(struct mm_struct *, 1841extern int split_vma(struct mm_struct *,
1839 struct vm_area_struct *, unsigned long addr, int new_below); 1842 struct vm_area_struct *, unsigned long addr, int new_below);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 15549578d559..c8d0a73d64c4 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -256,6 +256,16 @@ struct vm_region {
256 * this region */ 256 * this region */
257}; 257};
258 258
259#ifdef CONFIG_USERFAULTFD
260#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) { NULL, })
261struct vm_userfaultfd_ctx {
262 struct userfaultfd_ctx *ctx;
263};
264#else /* CONFIG_USERFAULTFD */
265#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) {})
266struct vm_userfaultfd_ctx {};
267#endif /* CONFIG_USERFAULTFD */
268
259/* 269/*
260 * This struct defines a memory VMM memory area. There is one of these 270 * This struct defines a memory VMM memory area. There is one of these
261 * per VM-area/task. A VM area is any part of the process virtual memory 271 * per VM-area/task. A VM area is any part of the process virtual memory
@@ -322,6 +332,7 @@ struct vm_area_struct {
322#ifdef CONFIG_NUMA 332#ifdef CONFIG_NUMA
323 struct mempolicy *vm_policy; /* NUMA policy for the VMA */ 333 struct mempolicy *vm_policy; /* NUMA policy for the VMA */
324#endif 334#endif
335 struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
325}; 336};
326 337
327struct core_thread { 338struct core_thread {
@@ -543,6 +554,7 @@ enum tlb_flush_reason {
543 TLB_REMOTE_SHOOTDOWN, 554 TLB_REMOTE_SHOOTDOWN,
544 TLB_LOCAL_SHOOTDOWN, 555 TLB_LOCAL_SHOOTDOWN,
545 TLB_LOCAL_MM_SHOOTDOWN, 556 TLB_LOCAL_MM_SHOOTDOWN,
557 TLB_REMOTE_SEND_IPI,
546 NR_TLB_FLUSH_REASONS, 558 NR_TLB_FLUSH_REASONS,
547}; 559};
548 560
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 754c25966a0a..ac00e2050943 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -690,14 +690,6 @@ struct zonelist {
690#endif 690#endif
691}; 691};
692 692
693#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
694struct node_active_region {
695 unsigned long start_pfn;
696 unsigned long end_pfn;
697 int nid;
698};
699#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
700
701#ifndef CONFIG_DISCONTIGMEM 693#ifndef CONFIG_DISCONTIGMEM
702/* The array of struct pages - for discontigmem use pgdat->lmem_map */ 694/* The array of struct pages - for discontigmem use pgdat->lmem_map */
703extern struct page *mem_map; 695extern struct page *mem_map;
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index f94da0e65dea..a91adf6e02f2 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -27,9 +27,7 @@ static inline void touch_nmi_watchdog(void)
27#if defined(CONFIG_HARDLOCKUP_DETECTOR) 27#if defined(CONFIG_HARDLOCKUP_DETECTOR)
28extern void hardlockup_detector_disable(void); 28extern void hardlockup_detector_disable(void);
29#else 29#else
30static inline void hardlockup_detector_disable(void) 30static inline void hardlockup_detector_disable(void) {}
31{
32}
33#endif 31#endif
34 32
35/* 33/*
@@ -80,6 +78,17 @@ extern int proc_watchdog_thresh(struct ctl_table *, int ,
80 void __user *, size_t *, loff_t *); 78 void __user *, size_t *, loff_t *);
81extern int proc_watchdog_cpumask(struct ctl_table *, int, 79extern int proc_watchdog_cpumask(struct ctl_table *, int,
82 void __user *, size_t *, loff_t *); 80 void __user *, size_t *, loff_t *);
81extern int lockup_detector_suspend(void);
82extern void lockup_detector_resume(void);
83#else
84static inline int lockup_detector_suspend(void)
85{
86 return 0;
87}
88
89static inline void lockup_detector_resume(void)
90{
91}
83#endif 92#endif
84 93
85#ifdef CONFIG_HAVE_ACPI_APEI_NMI 94#ifdef CONFIG_HAVE_ACPI_APEI_NMI
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index c89c53a113a8..29446aeef36e 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -89,6 +89,9 @@ enum ttu_flags {
89 TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ 89 TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */
90 TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ 90 TTU_IGNORE_ACCESS = (1 << 9), /* don't age */
91 TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */ 91 TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */
92 TTU_BATCH_FLUSH = (1 << 11), /* Batch TLB flushes where possible
93 * and caller guarantees they will
94 * do a final flush if necessary */
92}; 95};
93 96
94#ifdef CONFIG_MMU 97#ifdef CONFIG_MMU
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 119823decc46..a4ab9daa387c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1344,6 +1344,25 @@ enum perf_event_task_context {
1344 perf_nr_task_contexts, 1344 perf_nr_task_contexts,
1345}; 1345};
1346 1346
1347/* Track pages that require TLB flushes */
1348struct tlbflush_unmap_batch {
1349 /*
1350 * Each bit set is a CPU that potentially has a TLB entry for one of
1351 * the PFNs being flushed. See set_tlb_ubc_flush_pending().
1352 */
1353 struct cpumask cpumask;
1354
1355 /* True if any bit in cpumask is set */
1356 bool flush_required;
1357
1358 /*
1359 * If true then the PTE was dirty when unmapped. The entry must be
1360 * flushed before IO is initiated or a stale TLB entry potentially
1361 * allows an update without redirtying the page.
1362 */
1363 bool writable;
1364};
1365
1347struct task_struct { 1366struct task_struct {
1348 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ 1367 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
1349 void *stack; 1368 void *stack;
@@ -1700,6 +1719,10 @@ struct task_struct {
1700 unsigned long numa_pages_migrated; 1719 unsigned long numa_pages_migrated;
1701#endif /* CONFIG_NUMA_BALANCING */ 1720#endif /* CONFIG_NUMA_BALANCING */
1702 1721
1722#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
1723 struct tlbflush_unmap_batch tlb_ubc;
1724#endif
1725
1703 struct rcu_head rcu; 1726 struct rcu_head rcu;
1704 1727
1705 /* 1728 /*
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 912a7c482649..d4c7271382cb 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -149,6 +149,41 @@ static inline struct user_namespace *seq_user_ns(struct seq_file *seq)
149#endif 149#endif
150} 150}
151 151
152/**
153 * seq_show_options - display mount options with appropriate escapes.
154 * @m: the seq_file handle
155 * @name: the mount option name
156 * @value: the mount option name's value, can be NULL
157 */
158static inline void seq_show_option(struct seq_file *m, const char *name,
159 const char *value)
160{
161 seq_putc(m, ',');
162 seq_escape(m, name, ",= \t\n\\");
163 if (value) {
164 seq_putc(m, '=');
165 seq_escape(m, value, ", \t\n\\");
166 }
167}
168
169/**
170 * seq_show_option_n - display mount options with appropriate escapes
171 * where @value must be a specific length.
172 * @m: the seq_file handle
173 * @name: the mount option name
174 * @value: the mount option name's value, cannot be NULL
175 * @length: the length of @value to display
176 *
177 * This is a macro since this uses "length" to define the size of the
178 * stack buffer.
179 */
180#define seq_show_option_n(m, name, value, length) { \
181 char val_buf[length + 1]; \
182 strncpy(val_buf, value, length); \
183 val_buf[length] = '\0'; \
184 seq_show_option(m, name, val_buf); \
185}
186
152#define SEQ_START_TOKEN ((void *)1) 187#define SEQ_START_TOKEN ((void *)1)
153/* 188/*
154 * Helpers for iteration over list_head-s in seq_files 189 * Helpers for iteration over list_head-s in seq_files
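
The xfs hunk earlier in this patch shows the intended call pattern; more generally, any ->show_options implementation that emits user-controlled strings can switch to the helper above. A hedged sketch with invented filesystem names:

/* Hypothetical ->show_options hook; examplefs_sb_info and its fields are
 * invented, only the seq_show_option() call reflects the new helper. */
#include <linux/fs.h>
#include <linux/seq_file.h>

struct examplefs_sb_info {
	char *snapshot_name;		/* user-supplied at mount time */
	bool readonly_fallback;
};
#define EXAMPLEFS_SB(sb) ((struct examplefs_sb_info *)(sb)->s_fs_info)

static int examplefs_show_options(struct seq_file *m, struct dentry *root)
{
	struct examplefs_sb_info *sbi = EXAMPLEFS_SB(root->d_sb);

	/* Emits ",snapshot=<value>" with ',', '=', spaces etc. escaped, so a
	 * hostile mount option cannot forge extra entries in /proc/mounts. */
	seq_show_option(m, "snapshot", sbi->snapshot_name);

	if (sbi->readonly_fallback)
		seq_puts(m, ",ro_fallback");
	return 0;
}
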
diff --git a/include/linux/slab.h b/include/linux/slab.h
index a99f0e5243e1..7e37d448ed91 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -290,6 +290,16 @@ void *__kmalloc(size_t size, gfp_t flags);
290void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags); 290void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags);
291void kmem_cache_free(struct kmem_cache *, void *); 291void kmem_cache_free(struct kmem_cache *, void *);
292 292
293/*
294 * Bulk allocation and freeing operations. These are accelerated in an
295 * allocator specific way to avoid taking locks repeatedly or building
296 * metadata structures unnecessarily.
297 *
298 * Note that interrupts must be enabled when calling these functions.
299 */
300void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
301bool kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
302
293#ifdef CONFIG_NUMA 303#ifdef CONFIG_NUMA
294void *__kmalloc_node(size_t size, gfp_t flags, int node); 304void *__kmalloc_node(size_t size, gfp_t flags, int node);
295void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); 305void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
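
A minimal usage sketch of the new bulk interface; the cache pointer and batch size are hypothetical, and, as the comment above notes, interrupts must be enabled at the call site:

#include <linux/kernel.h>
#include <linux/slab.h>

static int example_fill_batch(struct kmem_cache *cache)
{
        void *objs[16];

        /* A false return means the batch could not be fully allocated. */
        if (!kmem_cache_alloc_bulk(cache, GFP_KERNEL, ARRAY_SIZE(objs), objs))
                return -ENOMEM;

        /* ... initialise and consume the 16 objects ... */

        kmem_cache_free_bulk(cache, ARRAY_SIZE(objs), objs);
        return 0;
}
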
diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h
index da3c593f9845..e6109a6cd8f6 100644
--- a/include/linux/smpboot.h
+++ b/include/linux/smpboot.h
@@ -48,7 +48,16 @@ struct smp_hotplug_thread {
48 const char *thread_comm; 48 const char *thread_comm;
49}; 49};
50 50
51int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread); 51int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
52 const struct cpumask *cpumask);
53
54static inline int
55smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
56{
57 return smpboot_register_percpu_thread_cpumask(plug_thread,
58 cpu_possible_mask);
59}
60
52void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread); 61void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread);
53int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, 62int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
54 const struct cpumask *); 63 const struct cpumask *);
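
A brief sketch of the intended use of the cpumask variant (kernel/watchdog.c below is the first caller): threads are still created for every possible CPU, but only the CPUs in the supplied mask are unparked at registration time. The descriptor and mask here are hypothetical:

#include <linux/smpboot.h>
#include <linux/cpumask.h>

static struct smp_hotplug_thread example_threads;      /* hypothetical; fields elided */
static struct cpumask example_cpumask;                  /* CPUs to unpark on */

static int example_init(void)
{
        /* Threads exist on every possible CPU, but only those in
         * example_cpumask start out unparked and running. */
        return smpboot_register_percpu_thread_cpumask(&example_threads,
                                                      &example_cpumask);
}
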
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index b45c45b8c829..08001317aee7 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -810,6 +810,7 @@ asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr);
810asmlinkage long sys_eventfd(unsigned int count); 810asmlinkage long sys_eventfd(unsigned int count);
811asmlinkage long sys_eventfd2(unsigned int count, int flags); 811asmlinkage long sys_eventfd2(unsigned int count, int flags);
812asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags); 812asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags);
813asmlinkage long sys_userfaultfd(int flags);
813asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); 814asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
814asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int); 815asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int);
815asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *, 816asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *,
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
new file mode 100644
index 000000000000..587480ad41b7
--- /dev/null
+++ b/include/linux/userfaultfd_k.h
@@ -0,0 +1,85 @@
1/*
2 * include/linux/userfaultfd_k.h
3 *
4 * Copyright (C) 2015 Red Hat, Inc.
5 *
6 */
7
8#ifndef _LINUX_USERFAULTFD_K_H
9#define _LINUX_USERFAULTFD_K_H
10
11#ifdef CONFIG_USERFAULTFD
12
13#include <linux/userfaultfd.h> /* linux/include/uapi/linux/userfaultfd.h */
14
15#include <linux/fcntl.h>
16
17/*
18 * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining
19 * new flags, since they might collide with O_* ones. We want
20 * to re-use O_* flags that couldn't possibly have a meaning
 21 * for userfaultfd, in order to leave a free define-space for
22 * shared O_* flags.
23 */
24#define UFFD_CLOEXEC O_CLOEXEC
25#define UFFD_NONBLOCK O_NONBLOCK
26
27#define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
28#define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS)
29
30extern int handle_userfault(struct vm_area_struct *vma, unsigned long address,
31 unsigned int flags, unsigned long reason);
32
33extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
34 unsigned long src_start, unsigned long len);
35extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
36 unsigned long dst_start,
37 unsigned long len);
38
39/* mm helpers */
40static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
41 struct vm_userfaultfd_ctx vm_ctx)
42{
43 return vma->vm_userfaultfd_ctx.ctx == vm_ctx.ctx;
44}
45
46static inline bool userfaultfd_missing(struct vm_area_struct *vma)
47{
48 return vma->vm_flags & VM_UFFD_MISSING;
49}
50
51static inline bool userfaultfd_armed(struct vm_area_struct *vma)
52{
53 return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP);
54}
55
56#else /* CONFIG_USERFAULTFD */
57
58/* mm helpers */
59static inline int handle_userfault(struct vm_area_struct *vma,
60 unsigned long address,
61 unsigned int flags,
62 unsigned long reason)
63{
64 return VM_FAULT_SIGBUS;
65}
66
67static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
68 struct vm_userfaultfd_ctx vm_ctx)
69{
70 return true;
71}
72
73static inline bool userfaultfd_missing(struct vm_area_struct *vma)
74{
75 return false;
76}
77
78static inline bool userfaultfd_armed(struct vm_area_struct *vma)
79{
80 return false;
81}
82
83#endif /* CONFIG_USERFAULTFD */
84
85#endif /* _LINUX_USERFAULTFD_K_H */
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 1e1bf9f963a9..d3d077228d4c 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -147,7 +147,8 @@ __remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old)
147 147
148typedef int wait_bit_action_f(struct wait_bit_key *); 148typedef int wait_bit_action_f(struct wait_bit_key *);
149void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); 149void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
150void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); 150void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr,
151 void *key);
151void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key); 152void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
152void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr); 153void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr);
153void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr); 154void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr);
@@ -179,7 +180,7 @@ wait_queue_head_t *bit_waitqueue(void *, int);
179#define wake_up_poll(x, m) \ 180#define wake_up_poll(x, m) \
180 __wake_up(x, TASK_NORMAL, 1, (void *) (m)) 181 __wake_up(x, TASK_NORMAL, 1, (void *) (m))
181#define wake_up_locked_poll(x, m) \ 182#define wake_up_locked_poll(x, m) \
182 __wake_up_locked_key((x), TASK_NORMAL, (void *) (m)) 183 __wake_up_locked_key((x), TASK_NORMAL, 1, (void *) (m))
183#define wake_up_interruptible_poll(x, m) \ 184#define wake_up_interruptible_poll(x, m) \
184 __wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m)) 185 __wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m))
185#define wake_up_interruptible_sync_poll(x, m) \ 186#define wake_up_interruptible_sync_poll(x, m) \
diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h
index f47feada5b42..d74a0e907b9e 100644
--- a/include/linux/watchdog.h
+++ b/include/linux/watchdog.h
@@ -140,12 +140,4 @@ extern int watchdog_init_timeout(struct watchdog_device *wdd,
140extern int watchdog_register_device(struct watchdog_device *); 140extern int watchdog_register_device(struct watchdog_device *);
141extern void watchdog_unregister_device(struct watchdog_device *); 141extern void watchdog_unregister_device(struct watchdog_device *);
142 142
143#ifdef CONFIG_HARDLOCKUP_DETECTOR
144void watchdog_nmi_disable_all(void);
145void watchdog_nmi_enable_all(void);
146#else
147static inline void watchdog_nmi_disable_all(void) {}
148static inline void watchdog_nmi_enable_all(void) {}
149#endif
150
151#endif /* ifndef _LINUX_WATCHDOG_H */ 143#endif /* ifndef _LINUX_WATCHDOG_H */
diff --git a/include/trace/events/tlb.h b/include/trace/events/tlb.h
index 4250f364a6ca..bc8815f45f3b 100644
--- a/include/trace/events/tlb.h
+++ b/include/trace/events/tlb.h
@@ -11,7 +11,8 @@
11 EM( TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" ) \ 11 EM( TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" ) \
12 EM( TLB_REMOTE_SHOOTDOWN, "remote shootdown" ) \ 12 EM( TLB_REMOTE_SHOOTDOWN, "remote shootdown" ) \
13 EM( TLB_LOCAL_SHOOTDOWN, "local shootdown" ) \ 13 EM( TLB_LOCAL_SHOOTDOWN, "local shootdown" ) \
14 EMe( TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" ) 14 EM( TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" ) \
15 EMe( TLB_REMOTE_SEND_IPI, "remote ipi send" )
15 16
16/* 17/*
17 * First define the enums in TLB_FLUSH_REASON to be exported to userspace 18 * First define the enums in TLB_FLUSH_REASON to be exported to userspace
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index aafb9937b162..70ff1d9abf0d 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -456,3 +456,4 @@ header-y += xfrm.h
456header-y += xilinx-v4l2-controls.h 456header-y += xilinx-v4l2-controls.h
457header-y += zorro.h 457header-y += zorro.h
458header-y += zorro_ids.h 458header-y += zorro_ids.h
459header-y += userfaultfd.h
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 31891d9535e2..a8d0759a9e40 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -190,4 +190,11 @@ struct prctl_mm_map {
190# define PR_FP_MODE_FR (1 << 0) /* 64b FP registers */ 190# define PR_FP_MODE_FR (1 << 0) /* 64b FP registers */
191# define PR_FP_MODE_FRE (1 << 1) /* 32b compatibility */ 191# define PR_FP_MODE_FRE (1 << 1) /* 32b compatibility */
192 192
193/* Control the ambient capability set */
194#define PR_CAP_AMBIENT 47
195# define PR_CAP_AMBIENT_IS_SET 1
196# define PR_CAP_AMBIENT_RAISE 2
197# define PR_CAP_AMBIENT_LOWER 3
198# define PR_CAP_AMBIENT_CLEAR_ALL 4
199
193#endif /* _LINUX_PRCTL_H */ 200#endif /* _LINUX_PRCTL_H */
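
A minimal userspace sketch of the new prctl interface. It assumes the capability is already present in the caller's permitted and inheritable sets, which is required for the raise to succeed:

#include <sys/prctl.h>
#include <linux/capability.h>
#include <stdio.h>

#ifndef PR_CAP_AMBIENT                  /* older libc headers */
#define PR_CAP_AMBIENT          47
#define PR_CAP_AMBIENT_RAISE    2
#endif

int main(void)
{
        /* Raise CAP_NET_BIND_SERVICE into the ambient set so that it
         * survives execve() of an unprivileged (non-setcap) helper. */
        if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE,
                  CAP_NET_BIND_SERVICE, 0, 0))
                perror("PR_CAP_AMBIENT_RAISE");
        return 0;
}

The SECBIT_NO_CAP_AMBIENT_RAISE bit added in the securebits.h hunk below allows a process to lock this operation out entirely.
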
diff --git a/include/uapi/linux/securebits.h b/include/uapi/linux/securebits.h
index 985aac9e6bf8..35ac35cef217 100644
--- a/include/uapi/linux/securebits.h
+++ b/include/uapi/linux/securebits.h
@@ -43,9 +43,18 @@
43#define SECBIT_KEEP_CAPS (issecure_mask(SECURE_KEEP_CAPS)) 43#define SECBIT_KEEP_CAPS (issecure_mask(SECURE_KEEP_CAPS))
44#define SECBIT_KEEP_CAPS_LOCKED (issecure_mask(SECURE_KEEP_CAPS_LOCKED)) 44#define SECBIT_KEEP_CAPS_LOCKED (issecure_mask(SECURE_KEEP_CAPS_LOCKED))
45 45
46/* When set, a process cannot add new capabilities to its ambient set. */
47#define SECURE_NO_CAP_AMBIENT_RAISE 6
48#define SECURE_NO_CAP_AMBIENT_RAISE_LOCKED 7 /* make bit-6 immutable */
49
50#define SECBIT_NO_CAP_AMBIENT_RAISE (issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE))
51#define SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED \
52 (issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE_LOCKED))
53
46#define SECURE_ALL_BITS (issecure_mask(SECURE_NOROOT) | \ 54#define SECURE_ALL_BITS (issecure_mask(SECURE_NOROOT) | \
47 issecure_mask(SECURE_NO_SETUID_FIXUP) | \ 55 issecure_mask(SECURE_NO_SETUID_FIXUP) | \
48 issecure_mask(SECURE_KEEP_CAPS)) 56 issecure_mask(SECURE_KEEP_CAPS) | \
57 issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE))
49#define SECURE_ALL_LOCKS (SECURE_ALL_BITS << 1) 58#define SECURE_ALL_LOCKS (SECURE_ALL_BITS << 1)
50 59
51#endif /* _UAPI_LINUX_SECUREBITS_H */ 60#endif /* _UAPI_LINUX_SECUREBITS_H */
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
new file mode 100644
index 000000000000..df0e09bb7dd5
--- /dev/null
+++ b/include/uapi/linux/userfaultfd.h
@@ -0,0 +1,169 @@
1/*
2 * include/linux/userfaultfd.h
3 *
4 * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
5 * Copyright (C) 2015 Red Hat, Inc.
6 *
7 */
8
9#ifndef _LINUX_USERFAULTFD_H
10#define _LINUX_USERFAULTFD_H
11
12#include <linux/types.h>
13
14#include <linux/compiler.h>
15
16#define UFFD_API ((__u64)0xAA)
17/*
18 * After implementing the respective features it will become:
19 * #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \
20 * UFFD_FEATURE_EVENT_FORK)
21 */
22#define UFFD_API_FEATURES (0)
23#define UFFD_API_IOCTLS \
24 ((__u64)1 << _UFFDIO_REGISTER | \
25 (__u64)1 << _UFFDIO_UNREGISTER | \
26 (__u64)1 << _UFFDIO_API)
27#define UFFD_API_RANGE_IOCTLS \
28 ((__u64)1 << _UFFDIO_WAKE | \
29 (__u64)1 << _UFFDIO_COPY | \
30 (__u64)1 << _UFFDIO_ZEROPAGE)
31
32/*
33 * Valid ioctl command number range with this API is from 0x00 to
34 * 0x3F. UFFDIO_API is the fixed number, everything else can be
35 * changed by implementing a different UFFD_API. If sticking to the
36 * same UFFD_API more ioctl can be added and userland will be aware of
37 * which ioctl the running kernel implements through the ioctl command
38 * bitmask written by the UFFDIO_API.
39 */
40#define _UFFDIO_REGISTER (0x00)
41#define _UFFDIO_UNREGISTER (0x01)
42#define _UFFDIO_WAKE (0x02)
43#define _UFFDIO_COPY (0x03)
44#define _UFFDIO_ZEROPAGE (0x04)
45#define _UFFDIO_API (0x3F)
46
47/* userfaultfd ioctl ids */
48#define UFFDIO 0xAA
49#define UFFDIO_API _IOWR(UFFDIO, _UFFDIO_API, \
50 struct uffdio_api)
51#define UFFDIO_REGISTER _IOWR(UFFDIO, _UFFDIO_REGISTER, \
52 struct uffdio_register)
53#define UFFDIO_UNREGISTER _IOR(UFFDIO, _UFFDIO_UNREGISTER, \
54 struct uffdio_range)
55#define UFFDIO_WAKE _IOR(UFFDIO, _UFFDIO_WAKE, \
56 struct uffdio_range)
57#define UFFDIO_COPY _IOWR(UFFDIO, _UFFDIO_COPY, \
58 struct uffdio_copy)
59#define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \
60 struct uffdio_zeropage)
61
62/* read() structure */
63struct uffd_msg {
64 __u8 event;
65
66 __u8 reserved1;
67 __u16 reserved2;
68 __u32 reserved3;
69
70 union {
71 struct {
72 __u64 flags;
73 __u64 address;
74 } pagefault;
75
76 struct {
77 /* unused reserved fields */
78 __u64 reserved1;
79 __u64 reserved2;
80 __u64 reserved3;
81 } reserved;
82 } arg;
83} __packed;
84
85/*
86 * Start at 0x12 and not at 0 to be more strict against bugs.
87 */
88#define UFFD_EVENT_PAGEFAULT 0x12
89#if 0 /* not available yet */
90#define UFFD_EVENT_FORK 0x13
91#endif
92
93/* flags for UFFD_EVENT_PAGEFAULT */
94#define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */
95#define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */
96
97struct uffdio_api {
98 /* userland asks for an API number and the features to enable */
99 __u64 api;
100 /*
 101 * Kernel answers below with all the available features for
102 * the API, this notifies userland of which events and/or
103 * which flags for each event are enabled in the current
104 * kernel.
105 *
106 * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE
107 * are to be considered implicitly always enabled in all kernels as
108 * long as the uffdio_api.api requested matches UFFD_API.
109 */
110#if 0 /* not available yet */
111#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0)
112#define UFFD_FEATURE_EVENT_FORK (1<<1)
113#endif
114 __u64 features;
115
116 __u64 ioctls;
117};
118
119struct uffdio_range {
120 __u64 start;
121 __u64 len;
122};
123
124struct uffdio_register {
125 struct uffdio_range range;
126#define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0)
127#define UFFDIO_REGISTER_MODE_WP ((__u64)1<<1)
128 __u64 mode;
129
130 /*
131 * kernel answers which ioctl commands are available for the
132 * range, keep at the end as the last 8 bytes aren't read.
133 */
134 __u64 ioctls;
135};
136
137struct uffdio_copy {
138 __u64 dst;
139 __u64 src;
140 __u64 len;
141 /*
 142 * There will be a wrprotection flag later that allows mapping
 143 * pages write-protected on the fly. Such a flag will be
 144 * available if the wrprotection ioctls are implemented for the
 145 * range according to uffdio_register.ioctls.
146 */
147#define UFFDIO_COPY_MODE_DONTWAKE ((__u64)1<<0)
148 __u64 mode;
149
150 /*
151 * "copy" is written by the ioctl and must be at the end: the
152 * copy_from_user will not read the last 8 bytes.
153 */
154 __s64 copy;
155};
156
157struct uffdio_zeropage {
158 struct uffdio_range range;
159#define UFFDIO_ZEROPAGE_MODE_DONTWAKE ((__u64)1<<0)
160 __u64 mode;
161
162 /*
163 * "zeropage" is written by the ioctl and must be at the end:
164 * the copy_from_user will not read the last 8 bytes.
165 */
166 __s64 zeropage;
167};
168
169#endif /* _LINUX_USERFAULTFD_H */
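
A minimal userspace sketch of the registration handshake defined by this header (monitor side only; the read()/UFFDIO_COPY fault-service loop is omitted). There is no libc wrapper for the new system call, so the raw syscall is used; if the installed headers do not yet define __NR_userfaultfd, the number from the arch syscall table has to be supplied by hand:

#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/userfaultfd.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>

int main(void)
{
        long page = sysconf(_SC_PAGESIZE);
        void *area = mmap(NULL, page, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        struct uffdio_api api = { .api = UFFD_API };    /* features left 0 */
        struct uffdio_register reg;
        int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

        if (area == MAP_FAILED || uffd < 0)
                return 1;
        if (ioctl(uffd, UFFDIO_API, &api))              /* API handshake */
                return 1;

        memset(&reg, 0, sizeof(reg));
        reg.range.start = (unsigned long)area;
        reg.range.len = page;
        reg.mode = UFFDIO_REGISTER_MODE_MISSING;
        if (ioctl(uffd, UFFDIO_REGISTER, &reg))
                return 1;

        /* First touches of 'area' now show up as UFFD_EVENT_PAGEFAULT
         * messages on read(uffd, ...) and are resolved with UFFDIO_COPY
         * or UFFDIO_ZEROPAGE. */
        return 0;
}
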
diff --git a/init/Kconfig b/init/Kconfig
index bb9b4dd55889..2c0e50ef554a 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -883,6 +883,16 @@ config ARCH_SUPPORTS_NUMA_BALANCING
883 bool 883 bool
884 884
885# 885#
886# For architectures that prefer to flush all TLBs after a number of pages
887# are unmapped instead of sending one IPI per page to flush. The architecture
888# must provide guarantees on what happens if a clean TLB cache entry is
889# written after the unmap. Details are in mm/rmap.c near the check for
890# should_defer_flush. The architecture should also consider if the full flush
891# and the refill costs are offset by the savings of sending fewer IPIs.
892config ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
893 bool
894
895#
886# For architectures that know their GCC __int128 support is sound 896# For architectures that know their GCC __int128 support is sound
887# 897#
888config ARCH_SUPPORTS_INT128 898config ARCH_SUPPORTS_INT128
@@ -1576,6 +1586,14 @@ config ADVISE_SYSCALLS
1576 applications use these syscalls, you can disable this option to save 1586 applications use these syscalls, you can disable this option to save
1577 space. 1587 space.
1578 1588
1589config USERFAULTFD
1590 bool "Enable userfaultfd() system call"
1591 select ANON_INODES
1592 depends on MMU
1593 help
1594 Enable the userfaultfd() system call, which allows page faults to be
1595 intercepted and handled in userland.
1596
1579config PCI_QUIRKS 1597config PCI_QUIRKS
1580 default y 1598 default y
1581 bool "Enable PCI quirk workarounds" if EXPERT 1599 bool "Enable PCI quirk workarounds" if EXPERT
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f3f5cd5e2c0d..a8538e443784 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1342,7 +1342,7 @@ static int cgroup_show_options(struct seq_file *seq,
1342 if (root != &cgrp_dfl_root) 1342 if (root != &cgrp_dfl_root)
1343 for_each_subsys(ss, ssid) 1343 for_each_subsys(ss, ssid)
1344 if (root->subsys_mask & (1 << ssid)) 1344 if (root->subsys_mask & (1 << ssid))
1345 seq_printf(seq, ",%s", ss->legacy_name); 1345 seq_show_option(seq, ss->name, NULL);
1346 if (root->flags & CGRP_ROOT_NOPREFIX) 1346 if (root->flags & CGRP_ROOT_NOPREFIX)
1347 seq_puts(seq, ",noprefix"); 1347 seq_puts(seq, ",noprefix");
1348 if (root->flags & CGRP_ROOT_XATTR) 1348 if (root->flags & CGRP_ROOT_XATTR)
@@ -1350,13 +1350,14 @@ static int cgroup_show_options(struct seq_file *seq,
1350 1350
1351 spin_lock(&release_agent_path_lock); 1351 spin_lock(&release_agent_path_lock);
1352 if (strlen(root->release_agent_path)) 1352 if (strlen(root->release_agent_path))
1353 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1353 seq_show_option(seq, "release_agent",
1354 root->release_agent_path);
1354 spin_unlock(&release_agent_path_lock); 1355 spin_unlock(&release_agent_path_lock);
1355 1356
1356 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) 1357 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
1357 seq_puts(seq, ",clone_children"); 1358 seq_puts(seq, ",clone_children");
1358 if (strlen(root->name)) 1359 if (strlen(root->name))
1359 seq_printf(seq, ",name=%s", root->name); 1360 seq_show_option(seq, "name", root->name);
1360 return 0; 1361 return 0;
1361} 1362}
1362 1363
diff --git a/kernel/fork.c b/kernel/fork.c
index 03aa2e6de7a4..7d5f0f118a63 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -454,8 +454,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
454 tmp->vm_mm = mm; 454 tmp->vm_mm = mm;
455 if (anon_vma_fork(tmp, mpnt)) 455 if (anon_vma_fork(tmp, mpnt))
456 goto fail_nomem_anon_vma_fork; 456 goto fail_nomem_anon_vma_fork;
457 tmp->vm_flags &= ~VM_LOCKED; 457 tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP);
458 tmp->vm_next = tmp->vm_prev = NULL; 458 tmp->vm_next = tmp->vm_prev = NULL;
459 tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
459 file = tmp->vm_file; 460 file = tmp->vm_file;
460 if (file) { 461 if (file) {
461 struct inode *inode = file_inode(file); 462 struct inode *inode = file_inode(file);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 490924cc9e7c..9ff173dca1ae 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -248,15 +248,16 @@ static void create_kthread(struct kthread_create_info *create)
248 * kthread_create_on_node - create a kthread. 248 * kthread_create_on_node - create a kthread.
249 * @threadfn: the function to run until signal_pending(current). 249 * @threadfn: the function to run until signal_pending(current).
250 * @data: data ptr for @threadfn. 250 * @data: data ptr for @threadfn.
251 * @node: memory node number. 251 * @node: task and thread structures for the thread are allocated on this node
252 * @namefmt: printf-style name for the thread. 252 * @namefmt: printf-style name for the thread.
253 * 253 *
254 * Description: This helper function creates and names a kernel 254 * Description: This helper function creates and names a kernel
255 * thread. The thread will be stopped: use wake_up_process() to start 255 * thread. The thread will be stopped: use wake_up_process() to start
256 * it. See also kthread_run(). 256 * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and
257 * is affine to all CPUs.
257 * 258 *
258 * If thread is going to be bound on a particular cpu, give its node 259 * If thread is going to be bound on a particular cpu, give its node
259 * in @node, to get NUMA affinity for kthread stack, or else give -1. 260 * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE.
260 * When woken, the thread will run @threadfn() with @data as its 261 * When woken, the thread will run @threadfn() with @data as its
261 * argument. @threadfn() can either call do_exit() directly if it is a 262 * argument. @threadfn() can either call do_exit() directly if it is a
262 * standalone thread for which no one will call kthread_stop(), or 263 * standalone thread for which no one will call kthread_stop(), or
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 052e02672d12..272d9322bc5d 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -106,9 +106,10 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
106} 106}
107EXPORT_SYMBOL_GPL(__wake_up_locked); 107EXPORT_SYMBOL_GPL(__wake_up_locked);
108 108
109void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) 109void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr,
110 void *key)
110{ 111{
111 __wake_up_common(q, mode, 1, 0, key); 112 __wake_up_common(q, mode, nr, 0, key);
112} 113}
113EXPORT_SYMBOL_GPL(__wake_up_locked_key); 114EXPORT_SYMBOL_GPL(__wake_up_locked_key);
114 115
@@ -283,7 +284,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
283 if (!list_empty(&wait->task_list)) 284 if (!list_empty(&wait->task_list))
284 list_del_init(&wait->task_list); 285 list_del_init(&wait->task_list);
285 else if (waitqueue_active(q)) 286 else if (waitqueue_active(q))
286 __wake_up_locked_key(q, mode, key); 287 __wake_up_locked_key(q, mode, 1, key);
287 spin_unlock_irqrestore(&q->lock, flags); 288 spin_unlock_irqrestore(&q->lock, flags);
288} 289}
289EXPORT_SYMBOL(abort_exclusive_wait); 290EXPORT_SYMBOL(abort_exclusive_wait);
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 7c434c39f02a..a818cbc73e14 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -113,7 +113,8 @@ static int smpboot_thread_fn(void *data)
113 if (kthread_should_stop()) { 113 if (kthread_should_stop()) {
114 __set_current_state(TASK_RUNNING); 114 __set_current_state(TASK_RUNNING);
115 preempt_enable(); 115 preempt_enable();
116 if (ht->cleanup) 116 /* cleanup must mirror setup */
117 if (ht->cleanup && td->status != HP_THREAD_NONE)
117 ht->cleanup(td->cpu, cpu_online(td->cpu)); 118 ht->cleanup(td->cpu, cpu_online(td->cpu));
118 kfree(td); 119 kfree(td);
119 return 0; 120 return 0;
@@ -259,15 +260,6 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
259{ 260{
260 unsigned int cpu; 261 unsigned int cpu;
261 262
262 /* Unpark any threads that were voluntarily parked. */
263 for_each_cpu_not(cpu, ht->cpumask) {
264 if (cpu_online(cpu)) {
265 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
266 if (tsk)
267 kthread_unpark(tsk);
268 }
269 }
270
271 /* We need to destroy also the parked threads of offline cpus */ 263 /* We need to destroy also the parked threads of offline cpus */
272 for_each_possible_cpu(cpu) { 264 for_each_possible_cpu(cpu) {
273 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); 265 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
@@ -281,19 +273,22 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
281} 273}
282 274
283/** 275/**
284 * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug 276 * smpboot_register_percpu_thread_cpumask - Register a per_cpu thread related
277 * to hotplug
285 * @plug_thread: Hotplug thread descriptor 278 * @plug_thread: Hotplug thread descriptor
279 * @cpumask: The cpumask where threads run
286 * 280 *
287 * Creates and starts the threads on all online cpus. 281 * Creates and starts the threads on all online cpus.
288 */ 282 */
289int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) 283int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
284 const struct cpumask *cpumask)
290{ 285{
291 unsigned int cpu; 286 unsigned int cpu;
292 int ret = 0; 287 int ret = 0;
293 288
294 if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL)) 289 if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL))
295 return -ENOMEM; 290 return -ENOMEM;
296 cpumask_copy(plug_thread->cpumask, cpu_possible_mask); 291 cpumask_copy(plug_thread->cpumask, cpumask);
297 292
298 get_online_cpus(); 293 get_online_cpus();
299 mutex_lock(&smpboot_threads_lock); 294 mutex_lock(&smpboot_threads_lock);
@@ -301,9 +296,11 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
301 ret = __smpboot_create_thread(plug_thread, cpu); 296 ret = __smpboot_create_thread(plug_thread, cpu);
302 if (ret) { 297 if (ret) {
303 smpboot_destroy_threads(plug_thread); 298 smpboot_destroy_threads(plug_thread);
299 free_cpumask_var(plug_thread->cpumask);
304 goto out; 300 goto out;
305 } 301 }
306 smpboot_unpark_thread(plug_thread, cpu); 302 if (cpumask_test_cpu(cpu, cpumask))
303 smpboot_unpark_thread(plug_thread, cpu);
307 } 304 }
308 list_add(&plug_thread->list, &hotplug_threads); 305 list_add(&plug_thread->list, &hotplug_threads);
309out: 306out:
@@ -311,7 +308,7 @@ out:
311 put_online_cpus(); 308 put_online_cpus();
312 return ret; 309 return ret;
313} 310}
314EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); 311EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread_cpumask);
315 312
316/** 313/**
317 * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug 314 * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index ca7d84f438f1..03c3875d9958 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -219,6 +219,7 @@ cond_syscall(compat_sys_timerfd_gettime);
219cond_syscall(sys_eventfd); 219cond_syscall(sys_eventfd);
220cond_syscall(sys_eventfd2); 220cond_syscall(sys_eventfd2);
221cond_syscall(sys_memfd_create); 221cond_syscall(sys_memfd_create);
222cond_syscall(sys_userfaultfd);
222 223
223/* performance counters: */ 224/* performance counters: */
224cond_syscall(sys_perf_event_open); 225cond_syscall(sys_perf_event_open);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index f65a0a06a8c0..88fefa68c516 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -39,6 +39,7 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
39 cred->cap_inheritable = CAP_EMPTY_SET; 39 cred->cap_inheritable = CAP_EMPTY_SET;
40 cred->cap_permitted = CAP_FULL_SET; 40 cred->cap_permitted = CAP_FULL_SET;
41 cred->cap_effective = CAP_FULL_SET; 41 cred->cap_effective = CAP_FULL_SET;
42 cred->cap_ambient = CAP_EMPTY_SET;
42 cred->cap_bset = CAP_FULL_SET; 43 cred->cap_bset = CAP_FULL_SET;
43#ifdef CONFIG_KEYS 44#ifdef CONFIG_KEYS
44 key_put(cred->request_key_auth); 45 key_put(cred->request_key_auth);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index a6ffa43f2993..64ed1c37bd1f 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -24,6 +24,7 @@
24#include <asm/irq_regs.h> 24#include <asm/irq_regs.h>
25#include <linux/kvm_para.h> 25#include <linux/kvm_para.h>
26#include <linux/perf_event.h> 26#include <linux/perf_event.h>
27#include <linux/kthread.h>
27 28
28/* 29/*
29 * The run state of the lockup detectors is controlled by the content of the 30 * The run state of the lockup detectors is controlled by the content of the
@@ -66,7 +67,26 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
66#define for_each_watchdog_cpu(cpu) \ 67#define for_each_watchdog_cpu(cpu) \
67 for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) 68 for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
68 69
70/*
71 * The 'watchdog_running' variable is set to 1 when the watchdog threads
72 * are registered/started and is set to 0 when the watchdog threads are
 73 * unregistered/stopped, so it indicates whether the threads exist.
74 */
69static int __read_mostly watchdog_running; 75static int __read_mostly watchdog_running;
76/*
77 * If a subsystem has a need to deactivate the watchdog temporarily, it
78 * can use the suspend/resume interface to achieve this. The content of
79 * the 'watchdog_suspended' variable reflects this state. Existing threads
80 * are parked/unparked by the lockup_detector_{suspend|resume} functions
81 * (see comment blocks pertaining to those functions for further details).
82 *
83 * 'watchdog_suspended' also prevents threads from being registered/started
84 * or unregistered/stopped via parameters in /proc/sys/kernel, so the state
85 * of 'watchdog_running' cannot change while the watchdog is deactivated
86 * temporarily (see related code in 'proc' handlers).
87 */
88static int __read_mostly watchdog_suspended;
89
70static u64 __read_mostly sample_period; 90static u64 __read_mostly sample_period;
71 91
72static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); 92static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -613,46 +633,9 @@ static void watchdog_nmi_disable(unsigned int cpu)
613 } 633 }
614} 634}
615 635
616void watchdog_nmi_enable_all(void)
617{
618 int cpu;
619
620 mutex_lock(&watchdog_proc_mutex);
621
622 if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
623 goto unlock;
624
625 get_online_cpus();
626 for_each_watchdog_cpu(cpu)
627 watchdog_nmi_enable(cpu);
628 put_online_cpus();
629
630unlock:
631 mutex_unlock(&watchdog_proc_mutex);
632}
633
634void watchdog_nmi_disable_all(void)
635{
636 int cpu;
637
638 mutex_lock(&watchdog_proc_mutex);
639
640 if (!watchdog_running)
641 goto unlock;
642
643 get_online_cpus();
644 for_each_watchdog_cpu(cpu)
645 watchdog_nmi_disable(cpu);
646 put_online_cpus();
647
648unlock:
649 mutex_unlock(&watchdog_proc_mutex);
650}
651#else 636#else
652static int watchdog_nmi_enable(unsigned int cpu) { return 0; } 637static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
653static void watchdog_nmi_disable(unsigned int cpu) { return; } 638static void watchdog_nmi_disable(unsigned int cpu) { return; }
654void watchdog_nmi_enable_all(void) {}
655void watchdog_nmi_disable_all(void) {}
656#endif /* CONFIG_HARDLOCKUP_DETECTOR */ 639#endif /* CONFIG_HARDLOCKUP_DETECTOR */
657 640
658static struct smp_hotplug_thread watchdog_threads = { 641static struct smp_hotplug_thread watchdog_threads = {
@@ -666,46 +649,89 @@ static struct smp_hotplug_thread watchdog_threads = {
666 .unpark = watchdog_enable, 649 .unpark = watchdog_enable,
667}; 650};
668 651
669static void restart_watchdog_hrtimer(void *info) 652/*
653 * park all watchdog threads that are specified in 'watchdog_cpumask'
654 */
655static int watchdog_park_threads(void)
670{ 656{
671 struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); 657 int cpu, ret = 0;
672 int ret;
673 658
659 get_online_cpus();
660 for_each_watchdog_cpu(cpu) {
661 ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
662 if (ret)
663 break;
664 }
665 if (ret) {
666 for_each_watchdog_cpu(cpu)
667 kthread_unpark(per_cpu(softlockup_watchdog, cpu));
668 }
669 put_online_cpus();
670
671 return ret;
672}
673
674/*
675 * unpark all watchdog threads that are specified in 'watchdog_cpumask'
676 */
677static void watchdog_unpark_threads(void)
678{
679 int cpu;
680
681 get_online_cpus();
682 for_each_watchdog_cpu(cpu)
683 kthread_unpark(per_cpu(softlockup_watchdog, cpu));
684 put_online_cpus();
685}
686
687/*
688 * Suspend the hard and soft lockup detector by parking the watchdog threads.
689 */
690int lockup_detector_suspend(void)
691{
692 int ret = 0;
693
694 mutex_lock(&watchdog_proc_mutex);
674 /* 695 /*
675 * No need to cancel and restart hrtimer if it is currently executing 696 * Multiple suspend requests can be active in parallel (counted by
676 * because it will reprogram itself with the new period now. 697 * the 'watchdog_suspended' variable). If the watchdog threads are
677 * We should never see it unqueued here because we are running per-cpu 698 * running, the first caller takes care that they will be parked.
678 * with interrupts disabled. 699 * The state of 'watchdog_running' cannot change while a suspend
700 * request is active (see related code in 'proc' handlers).
679 */ 701 */
680 ret = hrtimer_try_to_cancel(hrtimer); 702 if (watchdog_running && !watchdog_suspended)
681 if (ret == 1) 703 ret = watchdog_park_threads();
682 hrtimer_start(hrtimer, ns_to_ktime(sample_period), 704
683 HRTIMER_MODE_REL_PINNED); 705 if (ret == 0)
706 watchdog_suspended++;
707
708 mutex_unlock(&watchdog_proc_mutex);
709
710 return ret;
684} 711}
685 712
686static void update_watchdog(int cpu) 713/*
714 * Resume the hard and soft lockup detector by unparking the watchdog threads.
715 */
716void lockup_detector_resume(void)
687{ 717{
718 mutex_lock(&watchdog_proc_mutex);
719
720 watchdog_suspended--;
688 /* 721 /*
689 * Make sure that perf event counter will adopt to a new 722 * The watchdog threads are unparked if they were previously running
690 * sampling period. Updating the sampling period directly would 723 * and if there is no more active suspend request.
691 * be much nicer but we do not have an API for that now so
692 * let's use a big hammer.
693 * Hrtimer will adopt the new period on the next tick but this
694 * might be late already so we have to restart the timer as well.
695 */ 724 */
696 watchdog_nmi_disable(cpu); 725 if (watchdog_running && !watchdog_suspended)
697 smp_call_function_single(cpu, restart_watchdog_hrtimer, NULL, 1); 726 watchdog_unpark_threads();
698 watchdog_nmi_enable(cpu); 727
728 mutex_unlock(&watchdog_proc_mutex);
699} 729}
700 730
701static void update_watchdog_all_cpus(void) 731static void update_watchdog_all_cpus(void)
702{ 732{
703 int cpu; 733 watchdog_park_threads();
704 734 watchdog_unpark_threads();
705 get_online_cpus();
706 for_each_watchdog_cpu(cpu)
707 update_watchdog(cpu);
708 put_online_cpus();
709} 735}
710 736
711static int watchdog_enable_all_cpus(void) 737static int watchdog_enable_all_cpus(void)
@@ -713,15 +739,12 @@ static int watchdog_enable_all_cpus(void)
713 int err = 0; 739 int err = 0;
714 740
715 if (!watchdog_running) { 741 if (!watchdog_running) {
716 err = smpboot_register_percpu_thread(&watchdog_threads); 742 err = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
743 &watchdog_cpumask);
717 if (err) 744 if (err)
718 pr_err("Failed to create watchdog threads, disabled\n"); 745 pr_err("Failed to create watchdog threads, disabled\n");
719 else { 746 else
720 if (smpboot_update_cpumask_percpu_thread(
721 &watchdog_threads, &watchdog_cpumask))
722 pr_err("Failed to set cpumask for watchdog threads\n");
723 watchdog_running = 1; 747 watchdog_running = 1;
724 }
725 } else { 748 } else {
726 /* 749 /*
727 * Enable/disable the lockup detectors or 750 * Enable/disable the lockup detectors or
@@ -787,6 +810,12 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
787 810
788 mutex_lock(&watchdog_proc_mutex); 811 mutex_lock(&watchdog_proc_mutex);
789 812
813 if (watchdog_suspended) {
814 /* no parameter changes allowed while watchdog is suspended */
815 err = -EAGAIN;
816 goto out;
817 }
818
790 /* 819 /*
791 * If the parameter is being read return the state of the corresponding 820 * If the parameter is being read return the state of the corresponding
792 * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the 821 * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the
@@ -872,6 +901,12 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
872 901
873 mutex_lock(&watchdog_proc_mutex); 902 mutex_lock(&watchdog_proc_mutex);
874 903
904 if (watchdog_suspended) {
905 /* no parameter changes allowed while watchdog is suspended */
906 err = -EAGAIN;
907 goto out;
908 }
909
875 old = ACCESS_ONCE(watchdog_thresh); 910 old = ACCESS_ONCE(watchdog_thresh);
876 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 911 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
877 912
@@ -903,6 +938,13 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
903 int err; 938 int err;
904 939
905 mutex_lock(&watchdog_proc_mutex); 940 mutex_lock(&watchdog_proc_mutex);
941
942 if (watchdog_suspended) {
943 /* no parameter changes allowed while watchdog is suspended */
944 err = -EAGAIN;
945 goto out;
946 }
947
906 err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); 948 err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
907 if (!err && write) { 949 if (!err && write) {
908 /* Remove impossible cpus to keep sysctl output cleaner. */ 950 /* Remove impossible cpus to keep sysctl output cleaner. */
@@ -920,6 +962,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
920 pr_err("cpumask update failed\n"); 962 pr_err("cpumask update failed\n");
921 } 963 }
922 } 964 }
965out:
923 mutex_unlock(&watchdog_proc_mutex); 966 mutex_unlock(&watchdog_proc_mutex);
924 return err; 967 return err;
925} 968}
@@ -932,10 +975,8 @@ void __init lockup_detector_init(void)
932 975
933#ifdef CONFIG_NO_HZ_FULL 976#ifdef CONFIG_NO_HZ_FULL
934 if (tick_nohz_full_enabled()) { 977 if (tick_nohz_full_enabled()) {
935 if (!cpumask_empty(tick_nohz_full_mask)) 978 pr_info("Disabling watchdog on nohz_full cores by default\n");
936 pr_info("Disabling watchdog on nohz_full cores by default\n"); 979 cpumask_copy(&watchdog_cpumask, housekeeping_mask);
937 cpumask_andnot(&watchdog_cpumask, cpu_possible_mask,
938 tick_nohz_full_mask);
939 } else 980 } else
940 cpumask_copy(&watchdog_cpumask, cpu_possible_mask); 981 cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
941#else 982#else
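
The suspend/resume pair above replaces the removed watchdog_nmi_{enable,disable}_all() interface. A brief sketch of the calling pattern from another subsystem; the caller is hypothetical, and the declarations are assumed to be exported in a header (e.g. linux/nmi.h) elsewhere in this series:

/* Hypothetical caller; only the two lockup_detector_* calls come from
 * this patch. */
static int example_reprogram_counters(void)
{
        int err;

        err = lockup_detector_suspend();        /* parks watchdog threads */
        if (err)
                return err;

        /* ... safely reconfigure hardware the NMI watchdog also uses ... */

        lockup_detector_resume();               /* unparks them again */
        return 0;
}
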
diff --git a/lib/genalloc.c b/lib/genalloc.c
index daf0afb6d979..116a166b096f 100644
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -160,6 +160,7 @@ struct gen_pool *gen_pool_create(int min_alloc_order, int nid)
160 pool->min_alloc_order = min_alloc_order; 160 pool->min_alloc_order = min_alloc_order;
161 pool->algo = gen_pool_first_fit; 161 pool->algo = gen_pool_first_fit;
162 pool->data = NULL; 162 pool->data = NULL;
163 pool->name = NULL;
163 } 164 }
164 return pool; 165 return pool;
165} 166}
@@ -252,8 +253,8 @@ void gen_pool_destroy(struct gen_pool *pool)
252 253
253 kfree(chunk); 254 kfree(chunk);
254 } 255 }
256 kfree_const(pool->name);
255 kfree(pool); 257 kfree(pool);
256 return;
257} 258}
258EXPORT_SYMBOL(gen_pool_destroy); 259EXPORT_SYMBOL(gen_pool_destroy);
259 260
@@ -570,53 +571,88 @@ static void devm_gen_pool_release(struct device *dev, void *res)
570 gen_pool_destroy(*(struct gen_pool **)res); 571 gen_pool_destroy(*(struct gen_pool **)res);
571} 572}
572 573
574static int devm_gen_pool_match(struct device *dev, void *res, void *data)
575{
576 struct gen_pool **p = res;
577
578 /* NULL data matches only a pool without an assigned name */
579 if (!data && !(*p)->name)
580 return 1;
581
582 if (!data || !(*p)->name)
583 return 0;
584
585 return !strcmp((*p)->name, data);
586}
587
588/**
589 * gen_pool_get - Obtain the gen_pool (if any) for a device
590 * @dev: device to retrieve the gen_pool from
591 * @name: name of a gen_pool or NULL, identifies a particular gen_pool on device
592 *
593 * Returns the gen_pool for the device if one is present, or NULL.
594 */
595struct gen_pool *gen_pool_get(struct device *dev, const char *name)
596{
597 struct gen_pool **p;
598
599 p = devres_find(dev, devm_gen_pool_release, devm_gen_pool_match,
600 (void *)name);
601 if (!p)
602 return NULL;
603 return *p;
604}
605EXPORT_SYMBOL_GPL(gen_pool_get);
606
573/** 607/**
574 * devm_gen_pool_create - managed gen_pool_create 608 * devm_gen_pool_create - managed gen_pool_create
575 * @dev: device that provides the gen_pool 609 * @dev: device that provides the gen_pool
576 * @min_alloc_order: log base 2 of number of bytes each bitmap bit represents 610 * @min_alloc_order: log base 2 of number of bytes each bitmap bit represents
577 * @nid: node id of the node the pool structure should be allocated on, or -1 611 * @nid: node selector for allocated gen_pool, %NUMA_NO_NODE for all nodes
612 * @name: name of a gen_pool or NULL, identifies a particular gen_pool on device
578 * 613 *
579 * Create a new special memory pool that can be used to manage special purpose 614 * Create a new special memory pool that can be used to manage special purpose
580 * memory not managed by the regular kmalloc/kfree interface. The pool will be 615 * memory not managed by the regular kmalloc/kfree interface. The pool will be
581 * automatically destroyed by the device management code. 616 * automatically destroyed by the device management code.
582 */ 617 */
583struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order, 618struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order,
584 int nid) 619 int nid, const char *name)
585{ 620{
586 struct gen_pool **ptr, *pool; 621 struct gen_pool **ptr, *pool;
622 const char *pool_name = NULL;
623
624 /* Check that genpool to be created is uniquely addressed on device */
625 if (gen_pool_get(dev, name))
626 return ERR_PTR(-EINVAL);
627
628 if (name) {
629 pool_name = kstrdup_const(name, GFP_KERNEL);
630 if (!pool_name)
631 return ERR_PTR(-ENOMEM);
632 }
587 633
588 ptr = devres_alloc(devm_gen_pool_release, sizeof(*ptr), GFP_KERNEL); 634 ptr = devres_alloc(devm_gen_pool_release, sizeof(*ptr), GFP_KERNEL);
589 if (!ptr) 635 if (!ptr)
590 return NULL; 636 goto free_pool_name;
591 637
592 pool = gen_pool_create(min_alloc_order, nid); 638 pool = gen_pool_create(min_alloc_order, nid);
593 if (pool) { 639 if (!pool)
594 *ptr = pool; 640 goto free_devres;
595 devres_add(dev, ptr); 641
596 } else { 642 *ptr = pool;
597 devres_free(ptr); 643 pool->name = pool_name;
598 } 644 devres_add(dev, ptr);
599 645
600 return pool; 646 return pool;
601}
602EXPORT_SYMBOL(devm_gen_pool_create);
603 647
604/** 648free_devres:
605 * gen_pool_get - Obtain the gen_pool (if any) for a device 649 devres_free(ptr);
606 * @dev: device to retrieve the gen_pool from 650free_pool_name:
607 * 651 kfree_const(pool_name);
608 * Returns the gen_pool for the device if one is present, or NULL.
609 */
610struct gen_pool *gen_pool_get(struct device *dev)
611{
612 struct gen_pool **p = devres_find(dev, devm_gen_pool_release, NULL,
613 NULL);
614 652
615 if (!p) 653 return ERR_PTR(-ENOMEM);
616 return NULL;
617 return *p;
618} 654}
619EXPORT_SYMBOL_GPL(gen_pool_get); 655EXPORT_SYMBOL(devm_gen_pool_create);
620 656
621#ifdef CONFIG_OF 657#ifdef CONFIG_OF
622/** 658/**
@@ -633,16 +669,30 @@ struct gen_pool *of_gen_pool_get(struct device_node *np,
633 const char *propname, int index) 669 const char *propname, int index)
634{ 670{
635 struct platform_device *pdev; 671 struct platform_device *pdev;
636 struct device_node *np_pool; 672 struct device_node *np_pool, *parent;
673 const char *name = NULL;
674 struct gen_pool *pool = NULL;
637 675
638 np_pool = of_parse_phandle(np, propname, index); 676 np_pool = of_parse_phandle(np, propname, index);
639 if (!np_pool) 677 if (!np_pool)
640 return NULL; 678 return NULL;
679
641 pdev = of_find_device_by_node(np_pool); 680 pdev = of_find_device_by_node(np_pool);
681 if (!pdev) {
682 /* Check if named gen_pool is created by parent node device */
683 parent = of_get_parent(np_pool);
684 pdev = of_find_device_by_node(parent);
685 of_node_put(parent);
686
687 of_property_read_string(np_pool, "label", &name);
688 if (!name)
689 name = np_pool->name;
690 }
691 if (pdev)
692 pool = gen_pool_get(&pdev->dev, name);
642 of_node_put(np_pool); 693 of_node_put(np_pool);
643 if (!pdev) 694
644 return NULL; 695 return pool;
645 return gen_pool_get(&pdev->dev);
646} 696}
647EXPORT_SYMBOL_GPL(of_gen_pool_get); 697EXPORT_SYMBOL_GPL(of_gen_pool_get);
648#endif /* CONFIG_OF */ 698#endif /* CONFIG_OF */
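
A brief sketch of the resulting driver-side usage: a named pool created on a device and later looked up by that name. The device, allocation order and pool name are hypothetical; note that devm_gen_pool_create() now returns ERR_PTR() values rather than NULL on failure:

#include <linux/genalloc.h>
#include <linux/device.h>
#include <linux/log2.h>
#include <linux/numa.h>

static int example_probe(struct device *dev)
{
        struct gen_pool *sram, *found;

        /* 64-byte allocation granularity, no NUMA preference. */
        sram = devm_gen_pool_create(dev, ilog2(64), NUMA_NO_NODE, "sram");
        if (IS_ERR(sram))
                return PTR_ERR(sram);

        /* Elsewhere in the driver the pool can be retrieved by its name. */
        found = gen_pool_get(dev, "sram");
        return found ? 0 : -ENODEV;
}
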
diff --git a/mm/Makefile b/mm/Makefile
index 98c4eaeabdcb..b424d5e5b6ff 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -78,3 +78,4 @@ obj-$(CONFIG_CMA) += cma.o
78obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o 78obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
79obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o 79obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
80obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o 80obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
81obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
diff --git a/mm/dmapool.c b/mm/dmapool.c
index fd5fe4342e93..59d10d16f0a5 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -242,7 +242,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
242 return page; 242 return page;
243} 243}
244 244
245static inline int is_page_busy(struct dma_page *page) 245static inline bool is_page_busy(struct dma_page *page)
246{ 246{
247 return page->in_use != 0; 247 return page->in_use != 0;
248} 248}
diff --git a/mm/gup.c b/mm/gup.c
index 6297f6bccfb1..a798293fc648 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -12,7 +12,9 @@
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/rwsem.h> 13#include <linux/rwsem.h>
14#include <linux/hugetlb.h> 14#include <linux/hugetlb.h>
15
15#include <asm/pgtable.h> 16#include <asm/pgtable.h>
17#include <asm/tlbflush.h>
16 18
17#include "internal.h" 19#include "internal.h"
18 20
@@ -32,6 +34,30 @@ static struct page *no_page_table(struct vm_area_struct *vma,
32 return NULL; 34 return NULL;
33} 35}
34 36
37static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
38 pte_t *pte, unsigned int flags)
39{
40 /* No page to get reference */
41 if (flags & FOLL_GET)
42 return -EFAULT;
43
44 if (flags & FOLL_TOUCH) {
45 pte_t entry = *pte;
46
47 if (flags & FOLL_WRITE)
48 entry = pte_mkdirty(entry);
49 entry = pte_mkyoung(entry);
50
51 if (!pte_same(*pte, entry)) {
52 set_pte_at(vma->vm_mm, address, pte, entry);
53 update_mmu_cache(vma, address, pte);
54 }
55 }
56
57 /* Proper page table entry exists, but no corresponding struct page */
58 return -EEXIST;
59}
60
35static struct page *follow_page_pte(struct vm_area_struct *vma, 61static struct page *follow_page_pte(struct vm_area_struct *vma,
36 unsigned long address, pmd_t *pmd, unsigned int flags) 62 unsigned long address, pmd_t *pmd, unsigned int flags)
37{ 63{
@@ -73,10 +99,21 @@ retry:
73 99
74 page = vm_normal_page(vma, address, pte); 100 page = vm_normal_page(vma, address, pte);
75 if (unlikely(!page)) { 101 if (unlikely(!page)) {
76 if ((flags & FOLL_DUMP) || 102 if (flags & FOLL_DUMP) {
77 !is_zero_pfn(pte_pfn(pte))) 103 /* Avoid special (like zero) pages in core dumps */
78 goto bad_page; 104 page = ERR_PTR(-EFAULT);
79 page = pte_page(pte); 105 goto out;
106 }
107
108 if (is_zero_pfn(pte_pfn(pte))) {
109 page = pte_page(pte);
110 } else {
111 int ret;
112
113 ret = follow_pfn_pte(vma, address, ptep, flags);
114 page = ERR_PTR(ret);
115 goto out;
116 }
80 } 117 }
81 118
82 if (flags & FOLL_GET) 119 if (flags & FOLL_GET)
@@ -114,12 +151,9 @@ retry:
114 unlock_page(page); 151 unlock_page(page);
115 } 152 }
116 } 153 }
154out:
117 pte_unmap_unlock(ptep, ptl); 155 pte_unmap_unlock(ptep, ptl);
118 return page; 156 return page;
119bad_page:
120 pte_unmap_unlock(ptep, ptl);
121 return ERR_PTR(-EFAULT);
122
123no_page: 157no_page:
124 pte_unmap_unlock(ptep, ptl); 158 pte_unmap_unlock(ptep, ptl);
125 if (!pte_none(pte)) 159 if (!pte_none(pte))
@@ -489,9 +523,15 @@ retry:
489 goto next_page; 523 goto next_page;
490 } 524 }
491 BUG(); 525 BUG();
492 } 526 } else if (PTR_ERR(page) == -EEXIST) {
493 if (IS_ERR(page)) 527 /*
528 * Proper page table entry exists, but no corresponding
529 * struct page.
530 */
531 goto next_page;
532 } else if (IS_ERR(page)) {
494 return i ? i : PTR_ERR(page); 533 return i ? i : PTR_ERR(page);
534 }
495 if (pages) { 535 if (pages) {
496 pages[i] = page; 536 pages[i] = page;
497 flush_anon_page(vma, page, start); 537 flush_anon_page(vma, page, start);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 097c7a4bfbd9..279a818a39b1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -23,6 +23,7 @@
23#include <linux/pagemap.h> 23#include <linux/pagemap.h>
24#include <linux/migrate.h> 24#include <linux/migrate.h>
25#include <linux/hashtable.h> 25#include <linux/hashtable.h>
26#include <linux/userfaultfd_k.h>
26 27
27#include <asm/tlb.h> 28#include <asm/tlb.h>
28#include <asm/pgalloc.h> 29#include <asm/pgalloc.h>
@@ -716,21 +717,27 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
716 717
717static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, 718static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
718 struct vm_area_struct *vma, 719 struct vm_area_struct *vma,
719 unsigned long haddr, pmd_t *pmd, 720 unsigned long address, pmd_t *pmd,
720 struct page *page, gfp_t gfp) 721 struct page *page, gfp_t gfp,
722 unsigned int flags)
721{ 723{
722 struct mem_cgroup *memcg; 724 struct mem_cgroup *memcg;
723 pgtable_t pgtable; 725 pgtable_t pgtable;
724 spinlock_t *ptl; 726 spinlock_t *ptl;
727 unsigned long haddr = address & HPAGE_PMD_MASK;
725 728
726 VM_BUG_ON_PAGE(!PageCompound(page), page); 729 VM_BUG_ON_PAGE(!PageCompound(page), page);
727 730
728 if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) 731 if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) {
729 return VM_FAULT_OOM; 732 put_page(page);
733 count_vm_event(THP_FAULT_FALLBACK);
734 return VM_FAULT_FALLBACK;
735 }
730 736
731 pgtable = pte_alloc_one(mm, haddr); 737 pgtable = pte_alloc_one(mm, haddr);
732 if (unlikely(!pgtable)) { 738 if (unlikely(!pgtable)) {
733 mem_cgroup_cancel_charge(page, memcg); 739 mem_cgroup_cancel_charge(page, memcg);
740 put_page(page);
734 return VM_FAULT_OOM; 741 return VM_FAULT_OOM;
735 } 742 }
736 743
@@ -750,6 +757,21 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
750 pte_free(mm, pgtable); 757 pte_free(mm, pgtable);
751 } else { 758 } else {
752 pmd_t entry; 759 pmd_t entry;
760
761 /* Deliver the page fault to userland */
762 if (userfaultfd_missing(vma)) {
763 int ret;
764
765 spin_unlock(ptl);
766 mem_cgroup_cancel_charge(page, memcg);
767 put_page(page);
768 pte_free(mm, pgtable);
769 ret = handle_userfault(vma, address, flags,
770 VM_UFFD_MISSING);
771 VM_BUG_ON(ret & VM_FAULT_FALLBACK);
772 return ret;
773 }
774
753 entry = mk_huge_pmd(page, vma->vm_page_prot); 775 entry = mk_huge_pmd(page, vma->vm_page_prot);
754 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 776 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
755 page_add_new_anon_rmap(page, vma, haddr); 777 page_add_new_anon_rmap(page, vma, haddr);
@@ -760,6 +782,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
760 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); 782 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
761 atomic_long_inc(&mm->nr_ptes); 783 atomic_long_inc(&mm->nr_ptes);
762 spin_unlock(ptl); 784 spin_unlock(ptl);
785 count_vm_event(THP_FAULT_ALLOC);
763 } 786 }
764 787
765 return 0; 788 return 0;
@@ -771,19 +794,16 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
771} 794}
772 795
773/* Caller must hold page table lock. */ 796/* Caller must hold page table lock. */
774static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, 797static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
775 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, 798 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
776 struct page *zero_page) 799 struct page *zero_page)
777{ 800{
778 pmd_t entry; 801 pmd_t entry;
779 if (!pmd_none(*pmd))
780 return false;
781 entry = mk_pmd(zero_page, vma->vm_page_prot); 802 entry = mk_pmd(zero_page, vma->vm_page_prot);
782 entry = pmd_mkhuge(entry); 803 entry = pmd_mkhuge(entry);
783 pgtable_trans_huge_deposit(mm, pmd, pgtable); 804 pgtable_trans_huge_deposit(mm, pmd, pgtable);
784 set_pmd_at(mm, haddr, pmd, entry); 805 set_pmd_at(mm, haddr, pmd, entry);
785 atomic_long_inc(&mm->nr_ptes); 806 atomic_long_inc(&mm->nr_ptes);
786 return true;
787} 807}
788 808
789int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, 809int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -806,6 +826,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
806 pgtable_t pgtable; 826 pgtable_t pgtable;
807 struct page *zero_page; 827 struct page *zero_page;
808 bool set; 828 bool set;
829 int ret;
809 pgtable = pte_alloc_one(mm, haddr); 830 pgtable = pte_alloc_one(mm, haddr);
810 if (unlikely(!pgtable)) 831 if (unlikely(!pgtable))
811 return VM_FAULT_OOM; 832 return VM_FAULT_OOM;
@@ -816,14 +837,28 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
816 return VM_FAULT_FALLBACK; 837 return VM_FAULT_FALLBACK;
817 } 838 }
818 ptl = pmd_lock(mm, pmd); 839 ptl = pmd_lock(mm, pmd);
819 set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, 840 ret = 0;
820 zero_page); 841 set = false;
821 spin_unlock(ptl); 842 if (pmd_none(*pmd)) {
843 if (userfaultfd_missing(vma)) {
844 spin_unlock(ptl);
845 ret = handle_userfault(vma, address, flags,
846 VM_UFFD_MISSING);
847 VM_BUG_ON(ret & VM_FAULT_FALLBACK);
848 } else {
849 set_huge_zero_page(pgtable, mm, vma,
850 haddr, pmd,
851 zero_page);
852 spin_unlock(ptl);
853 set = true;
854 }
855 } else
856 spin_unlock(ptl);
822 if (!set) { 857 if (!set) {
823 pte_free(mm, pgtable); 858 pte_free(mm, pgtable);
824 put_huge_zero_page(); 859 put_huge_zero_page();
825 } 860 }
826 return 0; 861 return ret;
827 } 862 }
828 gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); 863 gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
829 page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); 864 page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
@@ -831,14 +866,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
831 count_vm_event(THP_FAULT_FALLBACK); 866 count_vm_event(THP_FAULT_FALLBACK);
832 return VM_FAULT_FALLBACK; 867 return VM_FAULT_FALLBACK;
833 } 868 }
834 if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) { 869 return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp,
835 put_page(page); 870 flags);
836 count_vm_event(THP_FAULT_FALLBACK);
837 return VM_FAULT_FALLBACK;
838 }
839
840 count_vm_event(THP_FAULT_ALLOC);
841 return 0;
842} 871}
843 872
844int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, 873int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -873,16 +902,14 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
873 */ 902 */
874 if (is_huge_zero_pmd(pmd)) { 903 if (is_huge_zero_pmd(pmd)) {
875 struct page *zero_page; 904 struct page *zero_page;
876 bool set;
877 /* 905 /*
878 * get_huge_zero_page() will never allocate a new page here, 906 * get_huge_zero_page() will never allocate a new page here,
879 * since we already have a zero page to copy. It just takes a 907 * since we already have a zero page to copy. It just takes a
880 * reference. 908 * reference.
881 */ 909 */
882 zero_page = get_huge_zero_page(); 910 zero_page = get_huge_zero_page();
883 set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, 911 set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
884 zero_page); 912 zero_page);
885 BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */
886 ret = 0; 913 ret = 0;
887 goto out_unlock; 914 goto out_unlock;
888 } 915 }
@@ -2133,7 +2160,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
2133 _pte++, address += PAGE_SIZE) { 2160 _pte++, address += PAGE_SIZE) {
2134 pte_t pteval = *_pte; 2161 pte_t pteval = *_pte;
2135 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { 2162 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
2136 if (++none_or_zero <= khugepaged_max_ptes_none) 2163 if (!userfaultfd_armed(vma) &&
2164 ++none_or_zero <= khugepaged_max_ptes_none)
2137 continue; 2165 continue;
2138 else 2166 else
2139 goto out; 2167 goto out;
@@ -2586,7 +2614,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2586 _pte++, _address += PAGE_SIZE) { 2614 _pte++, _address += PAGE_SIZE) {
2587 pte_t pteval = *_pte; 2615 pte_t pteval = *_pte;
2588 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { 2616 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
2589 if (++none_or_zero <= khugepaged_max_ptes_none) 2617 if (!userfaultfd_armed(vma) &&
2618 ++none_or_zero <= khugepaged_max_ptes_none)
2590 continue; 2619 continue;
2591 else 2620 else
2592 goto out_unmap; 2621 goto out_unmap;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a8c3087089d8..51ae41d0fbc0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -616,7 +616,7 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
616} 616}
617 617
618/* Returns true if the VMA has associated reserve pages */ 618/* Returns true if the VMA has associated reserve pages */
619static int vma_has_reserves(struct vm_area_struct *vma, long chg) 619static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
620{ 620{
621 if (vma->vm_flags & VM_NORESERVE) { 621 if (vma->vm_flags & VM_NORESERVE) {
622 /* 622 /*
@@ -629,23 +629,23 @@ static int vma_has_reserves(struct vm_area_struct *vma, long chg)
629 * properly, so add work-around here. 629 * properly, so add work-around here.
630 */ 630 */
631 if (vma->vm_flags & VM_MAYSHARE && chg == 0) 631 if (vma->vm_flags & VM_MAYSHARE && chg == 0)
632 return 1; 632 return true;
633 else 633 else
634 return 0; 634 return false;
635 } 635 }
636 636
637 /* Shared mappings always use reserves */ 637 /* Shared mappings always use reserves */
638 if (vma->vm_flags & VM_MAYSHARE) 638 if (vma->vm_flags & VM_MAYSHARE)
639 return 1; 639 return true;
640 640
641 /* 641 /*
642 * Only the process that called mmap() has reserves for 642 * Only the process that called mmap() has reserves for
643 * private mappings. 643 * private mappings.
644 */ 644 */
645 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 645 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
646 return 1; 646 return true;
647 647
648 return 0; 648 return false;
649} 649}
650 650
651static void enqueue_huge_page(struct hstate *h, struct page *page) 651static void enqueue_huge_page(struct hstate *h, struct page *page)
@@ -3779,7 +3779,7 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
3779 return saddr; 3779 return saddr;
3780} 3780}
3781 3781
3782static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) 3782static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
3783{ 3783{
3784 unsigned long base = addr & PUD_MASK; 3784 unsigned long base = addr & PUD_MASK;
3785 unsigned long end = base + PUD_SIZE; 3785 unsigned long end = base + PUD_SIZE;
@@ -3789,8 +3789,8 @@ static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
3789 */ 3789 */
3790 if (vma->vm_flags & VM_MAYSHARE && 3790 if (vma->vm_flags & VM_MAYSHARE &&
3791 vma->vm_start <= base && end <= vma->vm_end) 3791 vma->vm_start <= base && end <= vma->vm_end)
3792 return 1; 3792 return true;
3793 return 0; 3793 return false;
3794} 3794}
3795 3795
3796/* 3796/*
diff --git a/mm/internal.h b/mm/internal.h
index 36b23f1e2ca6..1195dd2d6a2b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -426,4 +426,19 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
426#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ 426#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
427#define ALLOC_FAIR 0x100 /* fair zone allocation */ 427#define ALLOC_FAIR 0x100 /* fair zone allocation */
428 428
429enum ttu_flags;
430struct tlbflush_unmap_batch;
431
432#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
433void try_to_unmap_flush(void);
434void try_to_unmap_flush_dirty(void);
435#else
436static inline void try_to_unmap_flush(void)
437{
438}
439static inline void try_to_unmap_flush_dirty(void)
440{
441}
442
443#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
429#endif /* __MM_INTERNAL_H */ 444#endif /* __MM_INTERNAL_H */
diff --git a/mm/madvise.c b/mm/madvise.c
index 64bb8a22110c..ce3a4222c7e7 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -103,7 +103,8 @@ static long madvise_behavior(struct vm_area_struct *vma,
103 103
104 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 104 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
105 *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, 105 *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
106 vma->vm_file, pgoff, vma_policy(vma)); 106 vma->vm_file, pgoff, vma_policy(vma),
107 vma->vm_userfaultfd_ctx);
107 if (*prev) { 108 if (*prev) {
108 vma = *prev; 109 vma = *prev;
109 goto success; 110 goto success;
@@ -385,7 +386,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
385 } 386 }
386} 387}
387 388
388static int 389static bool
389madvise_behavior_valid(int behavior) 390madvise_behavior_valid(int behavior)
390{ 391{
391 switch (behavior) { 392 switch (behavior) {
@@ -407,10 +408,10 @@ madvise_behavior_valid(int behavior)
407#endif 408#endif
408 case MADV_DONTDUMP: 409 case MADV_DONTDUMP:
409 case MADV_DODUMP: 410 case MADV_DODUMP:
410 return 1; 411 return true;
411 412
412 default: 413 default:
413 return 0; 414 return false;
414 } 415 }
415} 416}
416 417
diff --git a/mm/memblock.c b/mm/memblock.c
index 87108e77e476..95ce68c6da8a 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -566,6 +566,9 @@ repeat:
566 * area, insert that portion. 566 * area, insert that portion.
567 */ 567 */
568 if (rbase > base) { 568 if (rbase > base) {
569#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
570 WARN_ON(nid != memblock_get_region_node(rgn));
571#endif
569 nr_new++; 572 nr_new++;
570 if (insert) 573 if (insert)
571 memblock_insert_region(type, i++, base, 574 memblock_insert_region(type, i++, base,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index acb93c554f6e..1af057575ce9 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5965,7 +5965,13 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
5965 if (!mem_cgroup_is_root(memcg)) 5965 if (!mem_cgroup_is_root(memcg))
5966 page_counter_uncharge(&memcg->memory, 1); 5966 page_counter_uncharge(&memcg->memory, 1);
5967 5967
5968 /* Caller disabled preemption with mapping->tree_lock */ 5968 /*
5969 * Interrupts should be disabled here because the caller holds the
5970 * mapping->tree_lock lock which is taken with interrupts-off. It is
5971 * important here to have the interrupts disabled because it is the
5972 * only synchronisation we have for updating the per-CPU variables.
5973 */
5974 VM_BUG_ON(!irqs_disabled());
5969 mem_cgroup_charge_statistics(memcg, page, -1); 5975 mem_cgroup_charge_statistics(memcg, page, -1);
5970 memcg_check_events(memcg, page); 5976 memcg_check_events(memcg, page);
5971} 5977}
diff --git a/mm/memory.c b/mm/memory.c
index 388dcf9aa283..bb04d8f2f86c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -61,6 +61,7 @@
61#include <linux/string.h> 61#include <linux/string.h>
62#include <linux/dma-debug.h> 62#include <linux/dma-debug.h>
63#include <linux/debugfs.h> 63#include <linux/debugfs.h>
64#include <linux/userfaultfd_k.h>
64 65
65#include <asm/io.h> 66#include <asm/io.h>
66#include <asm/pgalloc.h> 67#include <asm/pgalloc.h>
@@ -180,22 +181,22 @@ static void check_sync_rss_stat(struct task_struct *task)
180 181
181#ifdef HAVE_GENERIC_MMU_GATHER 182#ifdef HAVE_GENERIC_MMU_GATHER
182 183
183static int tlb_next_batch(struct mmu_gather *tlb) 184static bool tlb_next_batch(struct mmu_gather *tlb)
184{ 185{
185 struct mmu_gather_batch *batch; 186 struct mmu_gather_batch *batch;
186 187
187 batch = tlb->active; 188 batch = tlb->active;
188 if (batch->next) { 189 if (batch->next) {
189 tlb->active = batch->next; 190 tlb->active = batch->next;
190 return 1; 191 return true;
191 } 192 }
192 193
193 if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) 194 if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
194 return 0; 195 return false;
195 196
196 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); 197 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
197 if (!batch) 198 if (!batch)
198 return 0; 199 return false;
199 200
200 tlb->batch_count++; 201 tlb->batch_count++;
201 batch->next = NULL; 202 batch->next = NULL;
@@ -205,7 +206,7 @@ static int tlb_next_batch(struct mmu_gather *tlb)
205 tlb->active->next = batch; 206 tlb->active->next = batch;
206 tlb->active = batch; 207 tlb->active = batch;
207 208
208 return 1; 209 return true;
209} 210}
210 211
211/* tlb_gather_mmu 212/* tlb_gather_mmu
@@ -2685,6 +2686,12 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2685 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2686 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2686 if (!pte_none(*page_table)) 2687 if (!pte_none(*page_table))
2687 goto unlock; 2688 goto unlock;
2689 /* Deliver the page fault to userland, check inside PT lock */
2690 if (userfaultfd_missing(vma)) {
2691 pte_unmap_unlock(page_table, ptl);
2692 return handle_userfault(vma, address, flags,
2693 VM_UFFD_MISSING);
2694 }
2688 goto setpte; 2695 goto setpte;
2689 } 2696 }
2690 2697
@@ -2713,6 +2720,15 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2713 if (!pte_none(*page_table)) 2720 if (!pte_none(*page_table))
2714 goto release; 2721 goto release;
2715 2722
2723 /* Deliver the page fault to userland, check inside PT lock */
2724 if (userfaultfd_missing(vma)) {
2725 pte_unmap_unlock(page_table, ptl);
2726 mem_cgroup_cancel_charge(page, memcg);
2727 page_cache_release(page);
2728 return handle_userfault(vma, address, flags,
2729 VM_UFFD_MISSING);
2730 }
2731
2716 inc_mm_counter_fast(mm, MM_ANONPAGES); 2732 inc_mm_counter_fast(mm, MM_ANONPAGES);
2717 page_add_new_anon_rmap(page, vma, address); 2733 page_add_new_anon_rmap(page, vma, address);
2718 mem_cgroup_commit_charge(page, memcg, false); 2734 mem_cgroup_commit_charge(page, memcg, false);
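The two do_anonymous_page() hunks above (like the huge-pmd path in mm/huge_memory.c) now hand a still-missing page to handle_userfault() with VM_UFFD_MISSING instead of instantiating one, so a userland monitor gets to supply the contents. A minimal monitor sketch of that flow, assuming the uffdio_* structures and ioctl names from the new uapi <linux/userfaultfd.h> (error handling trimmed; resolve_fault() is a hypothetical helper, sketched after the mm/userfaultfd.c hunk further down):

#include <fcntl.h>
#include <poll.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

static void resolve_fault(int uffd, unsigned long addr);   /* hypothetical, see below */

static int monitor_range(void *area, unsigned long len)
{
        struct uffdio_api api = { .api = UFFD_API };
        struct uffdio_register reg = {
                .range = { .start = (unsigned long)area, .len = len },
                .mode  = UFFDIO_REGISTER_MODE_MISSING,
        };
        int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

        if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api) ||
            ioctl(uffd, UFFDIO_REGISTER, &reg))
                return -1;

        for (;;) {
                struct pollfd pfd = { .fd = uffd, .events = POLLIN };
                struct uffd_msg msg;

                poll(&pfd, 1, -1);
                /* one uffd_msg per fault raised by handle_userfault() */
                if (read(uffd, &msg, sizeof(msg)) == (ssize_t)sizeof(msg) &&
                    msg.event == UFFD_EVENT_PAGEFAULT)
                        resolve_fault(uffd, msg.arg.pagefault.address);
        }
}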
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6da82bcb0a8b..8fd97dac538a 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1248,6 +1248,14 @@ int __ref add_memory(int nid, u64 start, u64 size)
1248 1248
1249 mem_hotplug_begin(); 1249 mem_hotplug_begin();
1250 1250
1251 /*
1252 * Add new range to memblock so that when hotadd_new_pgdat() is called
1253 * to allocate new pgdat, get_pfn_range_for_nid() will be able to find
1254 * this new range and calculate total pages correctly. The range will
1255 * be removed at hot-remove time.
1256 */
1257 memblock_add_node(start, size, nid);
1258
1251 new_node = !node_online(nid); 1259 new_node = !node_online(nid);
1252 if (new_node) { 1260 if (new_node) {
1253 pgdat = hotadd_new_pgdat(nid, start); 1261 pgdat = hotadd_new_pgdat(nid, start);
@@ -1277,7 +1285,6 @@ int __ref add_memory(int nid, u64 start, u64 size)
1277 1285
1278 /* create new memmap entry */ 1286 /* create new memmap entry */
1279 firmware_map_add_hotplug(start, start + size, "System RAM"); 1287 firmware_map_add_hotplug(start, start + size, "System RAM");
1280 memblock_add_node(start, size, nid);
1281 1288
1282 goto out; 1289 goto out;
1283 1290
@@ -1286,6 +1293,7 @@ error:
1286 if (new_pgdat) 1293 if (new_pgdat)
1287 rollback_node_hotadd(nid, pgdat); 1294 rollback_node_hotadd(nid, pgdat);
1288 release_memory_resource(res); 1295 release_memory_resource(res);
1296 memblock_remove(start, size);
1289 1297
1290out: 1298out:
1291 mem_hotplug_done(); 1299 mem_hotplug_done();
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 99d4c1d0b858..a7f1e0d1d6b8 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -722,8 +722,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
722 pgoff = vma->vm_pgoff + 722 pgoff = vma->vm_pgoff +
723 ((vmstart - vma->vm_start) >> PAGE_SHIFT); 723 ((vmstart - vma->vm_start) >> PAGE_SHIFT);
724 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, 724 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
725 vma->anon_vma, vma->vm_file, pgoff, 725 vma->anon_vma, vma->vm_file, pgoff,
726 new_pol); 726 new_pol, vma->vm_userfaultfd_ctx);
727 if (prev) { 727 if (prev) {
728 vma = prev; 728 vma = prev;
729 next = vma->vm_next; 729 next = vma->vm_next;
diff --git a/mm/migrate.c b/mm/migrate.c
index eb4267107d1f..5c08cab5419e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1226,7 +1226,9 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
1226 if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) 1226 if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
1227 goto set_status; 1227 goto set_status;
1228 1228
1229 page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT); 1229 /* FOLL_DUMP to ignore special (like zero) pages */
1230 page = follow_page(vma, pp->addr,
1231 FOLL_GET | FOLL_SPLIT | FOLL_DUMP);
1230 1232
1231 err = PTR_ERR(page); 1233 err = PTR_ERR(page);
1232 if (IS_ERR(page)) 1234 if (IS_ERR(page))
@@ -1236,10 +1238,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
1236 if (!page) 1238 if (!page)
1237 goto set_status; 1239 goto set_status;
1238 1240
1239 /* Use PageReserved to check for zero page */
1240 if (PageReserved(page))
1241 goto put_and_set;
1242
1243 pp->page = page; 1241 pp->page = page;
1244 err = page_to_nid(page); 1242 err = page_to_nid(page);
1245 1243
@@ -1396,18 +1394,14 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
1396 if (!vma || addr < vma->vm_start) 1394 if (!vma || addr < vma->vm_start)
1397 goto set_status; 1395 goto set_status;
1398 1396
1399 page = follow_page(vma, addr, 0); 1397 /* FOLL_DUMP to ignore special (like zero) pages */
1398 page = follow_page(vma, addr, FOLL_DUMP);
1400 1399
1401 err = PTR_ERR(page); 1400 err = PTR_ERR(page);
1402 if (IS_ERR(page)) 1401 if (IS_ERR(page))
1403 goto set_status; 1402 goto set_status;
1404 1403
1405 err = -ENOENT; 1404 err = page ? page_to_nid(page) : -ENOENT;
1406 /* Use PageReserved to check for zero page */
1407 if (!page || PageReserved(page))
1408 goto set_status;
1409
1410 err = page_to_nid(page);
1411set_status: 1405set_status:
1412 *status = err; 1406 *status = err;
1413 1407
diff --git a/mm/mlock.c b/mm/mlock.c
index 6fd2cf15e868..25936680064f 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -510,7 +510,8 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
510 510
511 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 511 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
512 *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, 512 *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
513 vma->vm_file, pgoff, vma_policy(vma)); 513 vma->vm_file, pgoff, vma_policy(vma),
514 vma->vm_userfaultfd_ctx);
514 if (*prev) { 515 if (*prev) {
515 vma = *prev; 516 vma = *prev;
516 goto success; 517 goto success;
diff --git a/mm/mmap.c b/mm/mmap.c
index f126923ce683..82db4fc0a9d3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -41,6 +41,7 @@
41#include <linux/notifier.h> 41#include <linux/notifier.h>
42#include <linux/memory.h> 42#include <linux/memory.h>
43#include <linux/printk.h> 43#include <linux/printk.h>
44#include <linux/userfaultfd_k.h>
44 45
45#include <asm/uaccess.h> 46#include <asm/uaccess.h>
46#include <asm/cacheflush.h> 47#include <asm/cacheflush.h>
@@ -919,7 +920,8 @@ again: remove_next = 1 + (end > next->vm_end);
919 * per-vma resources, so we don't attempt to merge those. 920 * per-vma resources, so we don't attempt to merge those.
920 */ 921 */
921static inline int is_mergeable_vma(struct vm_area_struct *vma, 922static inline int is_mergeable_vma(struct vm_area_struct *vma,
922 struct file *file, unsigned long vm_flags) 923 struct file *file, unsigned long vm_flags,
924 struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
923{ 925{
924 /* 926 /*
925 * VM_SOFTDIRTY should not prevent from VMA merging, if we 927 * VM_SOFTDIRTY should not prevent from VMA merging, if we
@@ -935,6 +937,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
935 return 0; 937 return 0;
936 if (vma->vm_ops && vma->vm_ops->close) 938 if (vma->vm_ops && vma->vm_ops->close)
937 return 0; 939 return 0;
940 if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
941 return 0;
938 return 1; 942 return 1;
939} 943}
940 944
@@ -965,9 +969,11 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
965 */ 969 */
966static int 970static int
967can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, 971can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
968 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) 972 struct anon_vma *anon_vma, struct file *file,
973 pgoff_t vm_pgoff,
974 struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
969{ 975{
970 if (is_mergeable_vma(vma, file, vm_flags) && 976 if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
971 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { 977 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
972 if (vma->vm_pgoff == vm_pgoff) 978 if (vma->vm_pgoff == vm_pgoff)
973 return 1; 979 return 1;
@@ -984,9 +990,11 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
984 */ 990 */
985static int 991static int
986can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, 992can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
987 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) 993 struct anon_vma *anon_vma, struct file *file,
994 pgoff_t vm_pgoff,
995 struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
988{ 996{
989 if (is_mergeable_vma(vma, file, vm_flags) && 997 if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
990 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { 998 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
991 pgoff_t vm_pglen; 999 pgoff_t vm_pglen;
992 vm_pglen = vma_pages(vma); 1000 vm_pglen = vma_pages(vma);
@@ -1029,7 +1037,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
1029 struct vm_area_struct *prev, unsigned long addr, 1037 struct vm_area_struct *prev, unsigned long addr,
1030 unsigned long end, unsigned long vm_flags, 1038 unsigned long end, unsigned long vm_flags,
1031 struct anon_vma *anon_vma, struct file *file, 1039 struct anon_vma *anon_vma, struct file *file,
1032 pgoff_t pgoff, struct mempolicy *policy) 1040 pgoff_t pgoff, struct mempolicy *policy,
1041 struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
1033{ 1042{
1034 pgoff_t pglen = (end - addr) >> PAGE_SHIFT; 1043 pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
1035 struct vm_area_struct *area, *next; 1044 struct vm_area_struct *area, *next;
@@ -1056,14 +1065,17 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
1056 if (prev && prev->vm_end == addr && 1065 if (prev && prev->vm_end == addr &&
1057 mpol_equal(vma_policy(prev), policy) && 1066 mpol_equal(vma_policy(prev), policy) &&
1058 can_vma_merge_after(prev, vm_flags, 1067 can_vma_merge_after(prev, vm_flags,
1059 anon_vma, file, pgoff)) { 1068 anon_vma, file, pgoff,
1069 vm_userfaultfd_ctx)) {
1060 /* 1070 /*
1061 * OK, it can. Can we now merge in the successor as well? 1071 * OK, it can. Can we now merge in the successor as well?
1062 */ 1072 */
1063 if (next && end == next->vm_start && 1073 if (next && end == next->vm_start &&
1064 mpol_equal(policy, vma_policy(next)) && 1074 mpol_equal(policy, vma_policy(next)) &&
1065 can_vma_merge_before(next, vm_flags, 1075 can_vma_merge_before(next, vm_flags,
1066 anon_vma, file, pgoff+pglen) && 1076 anon_vma, file,
1077 pgoff+pglen,
1078 vm_userfaultfd_ctx) &&
1067 is_mergeable_anon_vma(prev->anon_vma, 1079 is_mergeable_anon_vma(prev->anon_vma,
1068 next->anon_vma, NULL)) { 1080 next->anon_vma, NULL)) {
1069 /* cases 1, 6 */ 1081 /* cases 1, 6 */
@@ -1084,7 +1096,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
1084 if (next && end == next->vm_start && 1096 if (next && end == next->vm_start &&
1085 mpol_equal(policy, vma_policy(next)) && 1097 mpol_equal(policy, vma_policy(next)) &&
1086 can_vma_merge_before(next, vm_flags, 1098 can_vma_merge_before(next, vm_flags,
1087 anon_vma, file, pgoff+pglen)) { 1099 anon_vma, file, pgoff+pglen,
1100 vm_userfaultfd_ctx)) {
1088 if (prev && addr < prev->vm_end) /* case 4 */ 1101 if (prev && addr < prev->vm_end) /* case 4 */
1089 err = vma_adjust(prev, prev->vm_start, 1102 err = vma_adjust(prev, prev->vm_start,
1090 addr, prev->vm_pgoff, NULL); 1103 addr, prev->vm_pgoff, NULL);
@@ -1570,8 +1583,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
1570 /* 1583 /*
1571 * Can we just expand an old mapping? 1584 * Can we just expand an old mapping?
1572 */ 1585 */
1573 vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, 1586 vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
1574 NULL); 1587 NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
1575 if (vma) 1588 if (vma)
1576 goto out; 1589 goto out;
1577 1590
@@ -2757,7 +2770,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
2757 2770
2758 /* Can we just expand an old private anonymous mapping? */ 2771 /* Can we just expand an old private anonymous mapping? */
2759 vma = vma_merge(mm, prev, addr, addr + len, flags, 2772 vma = vma_merge(mm, prev, addr, addr + len, flags,
2760 NULL, NULL, pgoff, NULL); 2773 NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
2761 if (vma) 2774 if (vma)
2762 goto out; 2775 goto out;
2763 2776
@@ -2913,7 +2926,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2913 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) 2926 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
2914 return NULL; /* should never get here */ 2927 return NULL; /* should never get here */
2915 new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, 2928 new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
2916 vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); 2929 vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
2930 vma->vm_userfaultfd_ctx);
2917 if (new_vma) { 2931 if (new_vma) {
2918 /* 2932 /*
2919 * Source vma may have been merged into new_vma 2933 * Source vma may have been merged into new_vma
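Every vma_merge() caller now passes a struct vm_userfaultfd_ctx (vma->vm_userfaultfd_ctx when re-merging an existing VMA, NULL_VM_UFFD_CTX when expanding a fresh anonymous mapping), and is_mergeable_vma() refuses to merge VMAs whose userfaultfd contexts differ. The is_mergeable_vm_userfaultfd_ctx() helper itself is declared in include/linux/userfaultfd_k.h and is not shown in this diff; presumably it reduces to a plain context-pointer comparison, roughly:

/* sketch only: the assumption is that merging is allowed iff both sides are
 * tracked by the same userfaultfd context (or by none at all) */
static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
                                struct vm_userfaultfd_ctx vm_ctx)
{
        return vma->vm_userfaultfd_ctx.ctx == vm_ctx.ctx;
}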
diff --git a/mm/mprotect.c b/mm/mprotect.c
index e7d6f1171ecb..ef5be8eaab00 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -292,7 +292,8 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
292 */ 292 */
293 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 293 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
294 *pprev = vma_merge(mm, *pprev, start, end, newflags, 294 *pprev = vma_merge(mm, *pprev, start, end, newflags,
295 vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); 295 vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
296 vma->vm_userfaultfd_ctx);
296 if (*pprev) { 297 if (*pprev) {
297 vma = *pprev; 298 vma = *pprev;
298 goto success; 299 goto success;
diff --git a/mm/mremap.c b/mm/mremap.c
index a7c93eceb1c8..5a71cce8c6ea 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -276,6 +276,12 @@ static unsigned long move_vma(struct vm_area_struct *vma,
276 moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len, 276 moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
277 need_rmap_locks); 277 need_rmap_locks);
278 if (moved_len < old_len) { 278 if (moved_len < old_len) {
279 err = -ENOMEM;
280 } else if (vma->vm_ops && vma->vm_ops->mremap) {
281 err = vma->vm_ops->mremap(new_vma);
282 }
283
284 if (unlikely(err)) {
279 /* 285 /*
280 * On error, move entries back from new area to old, 286 * On error, move entries back from new area to old,
281 * which will succeed since page tables still there, 287 * which will succeed since page tables still there,
@@ -286,16 +292,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
286 vma = new_vma; 292 vma = new_vma;
287 old_len = new_len; 293 old_len = new_len;
288 old_addr = new_addr; 294 old_addr = new_addr;
289 new_addr = -ENOMEM; 295 new_addr = err;
290 } else { 296 } else {
291 if (vma->vm_file && vma->vm_file->f_op->mremap) {
292 err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma);
293 if (err < 0) {
294 move_page_tables(new_vma, new_addr, vma,
295 old_addr, moved_len, true);
296 return err;
297 }
298 }
299 arch_remap(mm, old_addr, old_addr + old_len, 297 arch_remap(mm, old_addr, old_addr + old_len,
300 new_addr, new_addr + new_len); 298 new_addr, new_addr + new_len);
301 } 299 }
@@ -348,6 +346,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
348{ 346{
349 struct mm_struct *mm = current->mm; 347 struct mm_struct *mm = current->mm;
350 struct vm_area_struct *vma = find_vma(mm, addr); 348 struct vm_area_struct *vma = find_vma(mm, addr);
349 unsigned long pgoff;
351 350
352 if (!vma || vma->vm_start > addr) 351 if (!vma || vma->vm_start > addr)
353 return ERR_PTR(-EFAULT); 352 return ERR_PTR(-EFAULT);
@@ -359,17 +358,17 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
359 if (old_len > vma->vm_end - addr) 358 if (old_len > vma->vm_end - addr)
360 return ERR_PTR(-EFAULT); 359 return ERR_PTR(-EFAULT);
361 360
361 if (new_len == old_len)
362 return vma;
363
362 /* Need to be careful about a growing mapping */ 364 /* Need to be careful about a growing mapping */
363 if (new_len > old_len) { 365 pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
364 unsigned long pgoff; 366 pgoff += vma->vm_pgoff;
365 367 if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
366 if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) 368 return ERR_PTR(-EINVAL);
367 return ERR_PTR(-EFAULT); 369
368 pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; 370 if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
369 pgoff += vma->vm_pgoff; 371 return ERR_PTR(-EFAULT);
370 if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
371 return ERR_PTR(-EINVAL);
372 }
373 372
374 if (vma->vm_flags & VM_LOCKED) { 373 if (vma->vm_flags & VM_LOCKED) {
375 unsigned long locked, lock_limit; 374 unsigned long locked, lock_limit;
@@ -408,13 +407,8 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
408 if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) 407 if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
409 goto out; 408 goto out;
410 409
411 /* Check if the location we're moving into overlaps the 410 /* Ensure the old/new locations do not overlap */
412 * old location at all, and fail if it does. 411 if (addr + old_len > new_addr && new_addr + new_len > addr)
413 */
414 if ((new_addr <= addr) && (new_addr+new_len) > addr)
415 goto out;
416
417 if ((addr <= new_addr) && (addr+old_len) > new_addr)
418 goto out; 412 goto out;
419 413
420 ret = do_munmap(mm, new_addr, new_len); 414 ret = do_munmap(mm, new_addr, new_len);
@@ -580,8 +574,10 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
580 ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked); 574 ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked);
581 } 575 }
582out: 576out:
583 if (ret & ~PAGE_MASK) 577 if (ret & ~PAGE_MASK) {
584 vm_unacct_memory(charged); 578 vm_unacct_memory(charged);
579 locked = 0;
580 }
585 up_write(&current->mm->mmap_sem); 581 up_write(&current->mm->mmap_sem);
586 if (locked && new_len > old_len) 582 if (locked && new_len > old_len)
587 mm_populate(new_addr + old_len, new_len - old_len); 583 mm_populate(new_addr + old_len, new_len - old_len);
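With the hook moved from file_operations to vm_operations_struct, move_vma() notifies the mapping's owner only after the page tables have been moved into new_vma, and a non-zero return makes it move the entries back and fail the mremap(). A driver-side sketch (the mydrv_* names and the per-mapping state are hypothetical; only the int (*mremap)(struct vm_area_struct *) shape of the callback is taken from the hunk above):

static int mydrv_vm_mremap(struct vm_area_struct *new_vma)
{
        struct mydrv_buffer *buf = new_vma->vm_private_data;    /* hypothetical per-mapping state */

        /* remember where userspace moved the mapping */
        buf->uaddr = new_vma->vm_start;
        return 0;       /* non-zero would make move_vma() undo the move */
}

static const struct vm_operations_struct mydrv_vm_ops = {
        .mremap = mydrv_vm_mremap,
};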
diff --git a/mm/rmap.c b/mm/rmap.c
index 171b68768df1..0db38e7d0a72 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -62,6 +62,8 @@
62 62
63#include <asm/tlbflush.h> 63#include <asm/tlbflush.h>
64 64
65#include <trace/events/tlb.h>
66
65#include "internal.h" 67#include "internal.h"
66 68
67static struct kmem_cache *anon_vma_cachep; 69static struct kmem_cache *anon_vma_cachep;
@@ -583,6 +585,107 @@ vma_address(struct page *page, struct vm_area_struct *vma)
583 return address; 585 return address;
584} 586}
585 587
588#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
589static void percpu_flush_tlb_batch_pages(void *data)
590{
591 /*
592 * All TLB entries are flushed on the assumption that it is
593 * cheaper to flush all TLBs and let them be refilled than
594 * flushing individual PFNs. Note that we do not track mm's
595 * to flush as that might simply be multiple full TLB flushes
596 * for no gain.
597 */
598 count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
599 flush_tlb_local();
600}
601
602/*
603 * Flush TLB entries for recently unmapped pages from remote CPUs. It is
604 * important if a PTE was dirty when it was unmapped that it's flushed
605 * before any IO is initiated on the page to prevent lost writes. Similarly,
606 * it must be flushed before freeing to prevent data leakage.
607 */
608void try_to_unmap_flush(void)
609{
610 struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
611 int cpu;
612
613 if (!tlb_ubc->flush_required)
614 return;
615
616 cpu = get_cpu();
617
618 trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, -1UL);
619
620 if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask))
621 percpu_flush_tlb_batch_pages(&tlb_ubc->cpumask);
622
623 if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids) {
624 smp_call_function_many(&tlb_ubc->cpumask,
625 percpu_flush_tlb_batch_pages, (void *)tlb_ubc, true);
626 }
627 cpumask_clear(&tlb_ubc->cpumask);
628 tlb_ubc->flush_required = false;
629 tlb_ubc->writable = false;
630 put_cpu();
631}
632
633/* Flush iff there are potentially writable TLB entries that can race with IO */
634void try_to_unmap_flush_dirty(void)
635{
636 struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
637
638 if (tlb_ubc->writable)
639 try_to_unmap_flush();
640}
641
642static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
643 struct page *page, bool writable)
644{
645 struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
646
647 cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm));
648 tlb_ubc->flush_required = true;
649
650 /*
651 * If the PTE was dirty then it's best to assume it's writable. The
652 * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
653 * before the page is queued for IO.
654 */
655 if (writable)
656 tlb_ubc->writable = true;
657}
658
659/*
660 * Returns true if the TLB flush should be deferred to the end of a batch of
661 * unmap operations to reduce IPIs.
662 */
663static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
664{
665 bool should_defer = false;
666
667 if (!(flags & TTU_BATCH_FLUSH))
668 return false;
669
 670 /* If remote CPUs need to be flushed then batch and defer the flush */
671 if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
672 should_defer = true;
673 put_cpu();
674
675 return should_defer;
676}
677#else
678static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
679 struct page *page, bool writable)
680{
681}
682
683static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
684{
685 return false;
686}
687#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
688
586/* 689/*
587 * At what user virtual address is page expected in vma? 690 * At what user virtual address is page expected in vma?
588 * Caller should check the page is actually part of the vma. 691 * Caller should check the page is actually part of the vma.
@@ -1220,7 +1323,20 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1220 1323
1221 /* Nuke the page table entry. */ 1324 /* Nuke the page table entry. */
1222 flush_cache_page(vma, address, page_to_pfn(page)); 1325 flush_cache_page(vma, address, page_to_pfn(page));
1223 pteval = ptep_clear_flush(vma, address, pte); 1326 if (should_defer_flush(mm, flags)) {
1327 /*
1328 * We clear the PTE but do not flush so potentially a remote
1329 * CPU could still be writing to the page. If the entry was
1330 * previously clean then the architecture must guarantee that
1331 * a clear->dirty transition on a cached TLB entry is written
1332 * through and traps if the PTE is unmapped.
1333 */
1334 pteval = ptep_get_and_clear(mm, address, pte);
1335
1336 set_tlb_ubc_flush_pending(mm, page, pte_dirty(pteval));
1337 } else {
1338 pteval = ptep_clear_flush(vma, address, pte);
1339 }
1224 1340
1225 /* Move the dirty bit to the physical page now the pte is gone. */ 1341 /* Move the dirty bit to the physical page now the pte is gone. */
1226 if (pte_dirty(pteval)) 1342 if (pte_dirty(pteval))
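All of the deferred-flush state above lives in a per-task tlbflush_unmap_batch (current->tlb_ubc): try_to_unmap_one() accumulates the mm's CPU mask through set_tlb_ubc_flush_pending(), and try_to_unmap_flush() later drains it with one IPI broadcast. Judging purely from the accesses in this hunk, the structure is roughly the following sketch (the real declaration, presumably in <linux/sched.h> under CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH, is not part of this diff):

struct tlbflush_unmap_batch {
        struct cpumask  cpumask;        /* CPUs that may still cache the cleared PTEs */
        bool            flush_required; /* at least one PTE was cleared without flushing */
        bool            writable;       /* one of those PTEs was writable/dirty */
};

shrink_page_list() (see the mm/vmscan.c hunk further down) then pairs try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH) with try_to_unmap_flush_dirty() before pageout() starts IO and with try_to_unmap_flush() before the reclaimed pages are freed.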
diff --git a/mm/slab.c b/mm/slab.c
index bbd0b47dc6a9..60c936938b84 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3416,6 +3416,19 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3416} 3416}
3417EXPORT_SYMBOL(kmem_cache_alloc); 3417EXPORT_SYMBOL(kmem_cache_alloc);
3418 3418
3419void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
3420{
3421 __kmem_cache_free_bulk(s, size, p);
3422}
3423EXPORT_SYMBOL(kmem_cache_free_bulk);
3424
3425bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
3426 void **p)
3427{
3428 return __kmem_cache_alloc_bulk(s, flags, size, p);
3429}
3430EXPORT_SYMBOL(kmem_cache_alloc_bulk);
3431
3419#ifdef CONFIG_TRACING 3432#ifdef CONFIG_TRACING
3420void * 3433void *
3421kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) 3434kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
diff --git a/mm/slab.h b/mm/slab.h
index 8da63e4e470f..a3a967d7d7c2 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -163,6 +163,15 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s);
163ssize_t slabinfo_write(struct file *file, const char __user *buffer, 163ssize_t slabinfo_write(struct file *file, const char __user *buffer,
164 size_t count, loff_t *ppos); 164 size_t count, loff_t *ppos);
165 165
166/*
167 * Generic implementation of bulk operations
168 * These are useful for situations in which the allocator cannot
 169 * perform optimizations. In that case segments of the object list
170 * may be allocated or freed using these operations.
171 */
172void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
173bool __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
174
166#ifdef CONFIG_MEMCG_KMEM 175#ifdef CONFIG_MEMCG_KMEM
167/* 176/*
168 * Iterate over all memcg caches of the given root cache. The caller must hold 177 * Iterate over all memcg caches of the given root cache. The caller must hold
@@ -321,7 +330,7 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
321 return cachep; 330 return cachep;
322 331
323 pr_err("%s: Wrong slab cache. %s but object is from %s\n", 332 pr_err("%s: Wrong slab cache. %s but object is from %s\n",
324 __func__, cachep->name, s->name); 333 __func__, s->name, cachep->name);
325 WARN_ON_ONCE(1); 334 WARN_ON_ONCE(1);
326 return s; 335 return s;
327} 336}
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 86831105a09f..c26829fe4e37 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -104,6 +104,29 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size)
104} 104}
105#endif 105#endif
106 106
107void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p)
108{
109 size_t i;
110
111 for (i = 0; i < nr; i++)
112 kmem_cache_free(s, p[i]);
113}
114
115bool __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
116 void **p)
117{
118 size_t i;
119
120 for (i = 0; i < nr; i++) {
121 void *x = p[i] = kmem_cache_alloc(s, flags);
122 if (!x) {
123 __kmem_cache_free_bulk(s, i, p);
124 return false;
125 }
126 }
127 return true;
128}
129
107#ifdef CONFIG_MEMCG_KMEM 130#ifdef CONFIG_MEMCG_KMEM
108void slab_init_memcg_params(struct kmem_cache *s) 131void slab_init_memcg_params(struct kmem_cache *s)
109{ 132{
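These generic loops back the new kmem_cache_alloc_bulk()/kmem_cache_free_bulk() entry points for SLAB and SLOB, while SLUB (further down) adds a fastpath that keeps interrupts disabled across the whole batch. A hypothetical caller, illustrating the all-or-nothing allocation contract visible in __kmem_cache_alloc_bulk() (my_cachep is an assumed, already-created cache; per the SLUB comments below, both calls must be made with interrupts enabled):

static int fill_batch(struct kmem_cache *my_cachep)
{
        void *objs[16];

        /* either all 16 objects are returned, or none and the call reports false */
        if (!kmem_cache_alloc_bulk(my_cachep, GFP_KERNEL, ARRAY_SIZE(objs), objs))
                return -ENOMEM;

        /* ... hand the objects off, or simply give them back ... */
        kmem_cache_free_bulk(my_cachep, ARRAY_SIZE(objs), objs);
        return 0;
}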
diff --git a/mm/slob.c b/mm/slob.c
index 4765f65019c7..165bbd3cd606 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -611,6 +611,19 @@ void kmem_cache_free(struct kmem_cache *c, void *b)
611} 611}
612EXPORT_SYMBOL(kmem_cache_free); 612EXPORT_SYMBOL(kmem_cache_free);
613 613
614void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
615{
616 __kmem_cache_free_bulk(s, size, p);
617}
618EXPORT_SYMBOL(kmem_cache_free_bulk);
619
620bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
621 void **p)
622{
623 return __kmem_cache_alloc_bulk(s, flags, size, p);
624}
625EXPORT_SYMBOL(kmem_cache_alloc_bulk);
626
614int __kmem_cache_shutdown(struct kmem_cache *c) 627int __kmem_cache_shutdown(struct kmem_cache *c)
615{ 628{
616 /* No way to check for remaining objects */ 629 /* No way to check for remaining objects */
diff --git a/mm/slub.c b/mm/slub.c
index f68c0e50f3c0..084184e706c6 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1306,6 +1306,17 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
1306 kasan_slab_free(s, x); 1306 kasan_slab_free(s, x);
1307} 1307}
1308 1308
1309static void setup_object(struct kmem_cache *s, struct page *page,
1310 void *object)
1311{
1312 setup_object_debug(s, page, object);
1313 if (unlikely(s->ctor)) {
1314 kasan_unpoison_object_data(s, object);
1315 s->ctor(object);
1316 kasan_poison_object_data(s, object);
1317 }
1318}
1319
1309/* 1320/*
1310 * Slab allocation and freeing 1321 * Slab allocation and freeing
1311 */ 1322 */
@@ -1336,6 +1347,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1336 struct page *page; 1347 struct page *page;
1337 struct kmem_cache_order_objects oo = s->oo; 1348 struct kmem_cache_order_objects oo = s->oo;
1338 gfp_t alloc_gfp; 1349 gfp_t alloc_gfp;
1350 void *start, *p;
1351 int idx, order;
1339 1352
1340 flags &= gfp_allowed_mask; 1353 flags &= gfp_allowed_mask;
1341 1354
@@ -1349,6 +1362,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1349 * so we fall-back to the minimum order allocation. 1362 * so we fall-back to the minimum order allocation.
1350 */ 1363 */
1351 alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; 1364 alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
1365 if ((alloc_gfp & __GFP_WAIT) && oo_order(oo) > oo_order(s->min))
1366 alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_WAIT;
1352 1367
1353 page = alloc_slab_page(s, alloc_gfp, node, oo); 1368 page = alloc_slab_page(s, alloc_gfp, node, oo);
1354 if (unlikely(!page)) { 1369 if (unlikely(!page)) {
@@ -1359,13 +1374,13 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1359 * Try a lower order alloc if possible 1374 * Try a lower order alloc if possible
1360 */ 1375 */
1361 page = alloc_slab_page(s, alloc_gfp, node, oo); 1376 page = alloc_slab_page(s, alloc_gfp, node, oo);
1362 1377 if (unlikely(!page))
1363 if (page) 1378 goto out;
1364 stat(s, ORDER_FALLBACK); 1379 stat(s, ORDER_FALLBACK);
1365 } 1380 }
1366 1381
1367 if (kmemcheck_enabled && page 1382 if (kmemcheck_enabled &&
1368 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { 1383 !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
1369 int pages = 1 << oo_order(oo); 1384 int pages = 1 << oo_order(oo);
1370 1385
1371 kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node); 1386 kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node);
@@ -1380,51 +1395,9 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1380 kmemcheck_mark_unallocated_pages(page, pages); 1395 kmemcheck_mark_unallocated_pages(page, pages);
1381 } 1396 }
1382 1397
1383 if (flags & __GFP_WAIT)
1384 local_irq_disable();
1385 if (!page)
1386 return NULL;
1387
1388 page->objects = oo_objects(oo); 1398 page->objects = oo_objects(oo);
1389 mod_zone_page_state(page_zone(page),
1390 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1391 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1392 1 << oo_order(oo));
1393
1394 return page;
1395}
1396
1397static void setup_object(struct kmem_cache *s, struct page *page,
1398 void *object)
1399{
1400 setup_object_debug(s, page, object);
1401 if (unlikely(s->ctor)) {
1402 kasan_unpoison_object_data(s, object);
1403 s->ctor(object);
1404 kasan_poison_object_data(s, object);
1405 }
1406}
1407
1408static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1409{
1410 struct page *page;
1411 void *start;
1412 void *p;
1413 int order;
1414 int idx;
1415
1416 if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
1417 pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK);
1418 BUG();
1419 }
1420
1421 page = allocate_slab(s,
1422 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
1423 if (!page)
1424 goto out;
1425 1399
1426 order = compound_order(page); 1400 order = compound_order(page);
1427 inc_slabs_node(s, page_to_nid(page), page->objects);
1428 page->slab_cache = s; 1401 page->slab_cache = s;
1429 __SetPageSlab(page); 1402 __SetPageSlab(page);
1430 if (page_is_pfmemalloc(page)) 1403 if (page_is_pfmemalloc(page))
@@ -1448,10 +1421,34 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1448 page->freelist = start; 1421 page->freelist = start;
1449 page->inuse = page->objects; 1422 page->inuse = page->objects;
1450 page->frozen = 1; 1423 page->frozen = 1;
1424
1451out: 1425out:
1426 if (flags & __GFP_WAIT)
1427 local_irq_disable();
1428 if (!page)
1429 return NULL;
1430
1431 mod_zone_page_state(page_zone(page),
1432 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1433 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1434 1 << oo_order(oo));
1435
1436 inc_slabs_node(s, page_to_nid(page), page->objects);
1437
1452 return page; 1438 return page;
1453} 1439}
1454 1440
1441static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1442{
1443 if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
1444 pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK);
1445 BUG();
1446 }
1447
1448 return allocate_slab(s,
1449 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
1450}
1451
1455static void __free_slab(struct kmem_cache *s, struct page *page) 1452static void __free_slab(struct kmem_cache *s, struct page *page)
1456{ 1453{
1457 int order = compound_order(page); 1454 int order = compound_order(page);
@@ -2712,7 +2709,7 @@ redo:
2712 * Determine the currently cpus per cpu slab. 2709 * Determine the currently cpus per cpu slab.
2713 * The cpu may change afterward. However that does not matter since 2710 * The cpu may change afterward. However that does not matter since
2714 * data is retrieved via this pointer. If we are on the same cpu 2711 * data is retrieved via this pointer. If we are on the same cpu
2715 * during the cmpxchg then the free will succedd. 2712 * during the cmpxchg then the free will succeed.
2716 */ 2713 */
2717 do { 2714 do {
2718 tid = this_cpu_read(s->cpu_slab->tid); 2715 tid = this_cpu_read(s->cpu_slab->tid);
@@ -2750,6 +2747,113 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
2750} 2747}
2751EXPORT_SYMBOL(kmem_cache_free); 2748EXPORT_SYMBOL(kmem_cache_free);
2752 2749
2750/* Note that interrupts must be enabled when calling this function. */
2751void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
2752{
2753 struct kmem_cache_cpu *c;
2754 struct page *page;
2755 int i;
2756
2757 local_irq_disable();
2758 c = this_cpu_ptr(s->cpu_slab);
2759
2760 for (i = 0; i < size; i++) {
2761 void *object = p[i];
2762
2763 BUG_ON(!object);
2764 /* kmem cache debug support */
2765 s = cache_from_obj(s, object);
2766 if (unlikely(!s))
2767 goto exit;
2768 slab_free_hook(s, object);
2769
2770 page = virt_to_head_page(object);
2771
2772 if (c->page == page) {
2773 /* Fastpath: local CPU free */
2774 set_freepointer(s, object, c->freelist);
2775 c->freelist = object;
2776 } else {
2777 c->tid = next_tid(c->tid);
2778 local_irq_enable();
2779 /* Slowpath: overhead locked cmpxchg_double_slab */
2780 __slab_free(s, page, object, _RET_IP_);
2781 local_irq_disable();
2782 c = this_cpu_ptr(s->cpu_slab);
2783 }
2784 }
2785exit:
2786 c->tid = next_tid(c->tid);
2787 local_irq_enable();
2788}
2789EXPORT_SYMBOL(kmem_cache_free_bulk);
2790
2791/* Note that interrupts must be enabled when calling this function. */
2792bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
2793 void **p)
2794{
2795 struct kmem_cache_cpu *c;
2796 int i;
2797
2798 /*
2799 * Drain objects in the per cpu slab, while disabling local
 2800 * IRQs, which protects against PREEMPT and interrupt
 2801 * handlers invoking the normal fastpath.
2802 */
2803 local_irq_disable();
2804 c = this_cpu_ptr(s->cpu_slab);
2805
2806 for (i = 0; i < size; i++) {
2807 void *object = c->freelist;
2808
2809 if (unlikely(!object)) {
2810 local_irq_enable();
2811 /*
 2812 * Invoking the slow path likely has the side-effect
 2813 * of re-populating the per-CPU c->freelist
2814 */
2815 p[i] = __slab_alloc(s, flags, NUMA_NO_NODE,
2816 _RET_IP_, c);
2817 if (unlikely(!p[i])) {
2818 __kmem_cache_free_bulk(s, i, p);
2819 return false;
2820 }
2821 local_irq_disable();
2822 c = this_cpu_ptr(s->cpu_slab);
2823 continue; /* goto for-loop */
2824 }
2825
2826 /* kmem_cache debug support */
2827 s = slab_pre_alloc_hook(s, flags);
2828 if (unlikely(!s)) {
2829 __kmem_cache_free_bulk(s, i, p);
2830 c->tid = next_tid(c->tid);
2831 local_irq_enable();
2832 return false;
2833 }
2834
2835 c->freelist = get_freepointer(s, object);
2836 p[i] = object;
2837
2838 /* kmem_cache debug support */
2839 slab_post_alloc_hook(s, flags, object);
2840 }
2841 c->tid = next_tid(c->tid);
2842 local_irq_enable();
2843
2844 /* Clear memory outside IRQ disabled fastpath loop */
2845 if (unlikely(flags & __GFP_ZERO)) {
2846 int j;
2847
2848 for (j = 0; j < i; j++)
2849 memset(p[j], 0, s->object_size);
2850 }
2851
2852 return true;
2853}
2854EXPORT_SYMBOL(kmem_cache_alloc_bulk);
2855
2856
2753/* 2857/*
2754 * Object placement in a slab is made very easy because we always start at 2858 * Object placement in a slab is made very easy because we always start at
2755 * offset 0. If we tune the size of the object to the alignment then we can 2859 * offset 0. If we tune the size of the object to the alignment then we can
@@ -5181,7 +5285,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
5181 s->kobj.kset = cache_kset(s); 5285 s->kobj.kset = cache_kset(s);
5182 err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); 5286 err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
5183 if (err) 5287 if (err)
5184 goto out_put_kobj; 5288 goto out;
5185 5289
5186 err = sysfs_create_group(&s->kobj, &slab_attr_group); 5290 err = sysfs_create_group(&s->kobj, &slab_attr_group);
5187 if (err) 5291 if (err)
@@ -5208,8 +5312,6 @@ out:
5208 return err; 5312 return err;
5209out_del_kobj: 5313out_del_kobj:
5210 kobject_del(&s->kobj); 5314 kobject_del(&s->kobj);
5211out_put_kobj:
5212 kobject_put(&s->kobj);
5213 goto out; 5315 goto out;
5214} 5316}
5215 5317
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
new file mode 100644
index 000000000000..77fee9325a57
--- /dev/null
+++ b/mm/userfaultfd.c
@@ -0,0 +1,308 @@
1/*
2 * mm/userfaultfd.c
3 *
4 * Copyright (C) 2015 Red Hat, Inc.
5 *
6 * This work is licensed under the terms of the GNU GPL, version 2. See
7 * the COPYING file in the top-level directory.
8 */
9
10#include <linux/mm.h>
11#include <linux/pagemap.h>
12#include <linux/rmap.h>
13#include <linux/swap.h>
14#include <linux/swapops.h>
15#include <linux/userfaultfd_k.h>
16#include <linux/mmu_notifier.h>
17#include <asm/tlbflush.h>
18#include "internal.h"
19
20static int mcopy_atomic_pte(struct mm_struct *dst_mm,
21 pmd_t *dst_pmd,
22 struct vm_area_struct *dst_vma,
23 unsigned long dst_addr,
24 unsigned long src_addr,
25 struct page **pagep)
26{
27 struct mem_cgroup *memcg;
28 pte_t _dst_pte, *dst_pte;
29 spinlock_t *ptl;
30 void *page_kaddr;
31 int ret;
32 struct page *page;
33
34 if (!*pagep) {
35 ret = -ENOMEM;
36 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr);
37 if (!page)
38 goto out;
39
40 page_kaddr = kmap_atomic(page);
41 ret = copy_from_user(page_kaddr,
42 (const void __user *) src_addr,
43 PAGE_SIZE);
44 kunmap_atomic(page_kaddr);
45
46 /* fallback to copy_from_user outside mmap_sem */
47 if (unlikely(ret)) {
48 ret = -EFAULT;
49 *pagep = page;
50 /* don't free the page */
51 goto out;
52 }
53 } else {
54 page = *pagep;
55 *pagep = NULL;
56 }
57
58 /*
59 * The memory barrier inside __SetPageUptodate makes sure that
 60 * preceding stores to the page contents become visible before
61 * the set_pte_at() write.
62 */
63 __SetPageUptodate(page);
64
65 ret = -ENOMEM;
66 if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg))
67 goto out_release;
68
69 _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
70 if (dst_vma->vm_flags & VM_WRITE)
71 _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
72
73 ret = -EEXIST;
74 dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
75 if (!pte_none(*dst_pte))
76 goto out_release_uncharge_unlock;
77
78 inc_mm_counter(dst_mm, MM_ANONPAGES);
79 page_add_new_anon_rmap(page, dst_vma, dst_addr);
80 mem_cgroup_commit_charge(page, memcg, false);
81 lru_cache_add_active_or_unevictable(page, dst_vma);
82
83 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
84
85 /* No need to invalidate - it was non-present before */
86 update_mmu_cache(dst_vma, dst_addr, dst_pte);
87
88 pte_unmap_unlock(dst_pte, ptl);
89 ret = 0;
90out:
91 return ret;
92out_release_uncharge_unlock:
93 pte_unmap_unlock(dst_pte, ptl);
94 mem_cgroup_cancel_charge(page, memcg);
95out_release:
96 page_cache_release(page);
97 goto out;
98}
99
100static int mfill_zeropage_pte(struct mm_struct *dst_mm,
101 pmd_t *dst_pmd,
102 struct vm_area_struct *dst_vma,
103 unsigned long dst_addr)
104{
105 pte_t _dst_pte, *dst_pte;
106 spinlock_t *ptl;
107 int ret;
108
109 _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
110 dst_vma->vm_page_prot));
111 ret = -EEXIST;
112 dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
113 if (!pte_none(*dst_pte))
114 goto out_unlock;
115 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
116 /* No need to invalidate - it was non-present before */
117 update_mmu_cache(dst_vma, dst_addr, dst_pte);
118 ret = 0;
119out_unlock:
120 pte_unmap_unlock(dst_pte, ptl);
121 return ret;
122}
123
124static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
125{
126 pgd_t *pgd;
127 pud_t *pud;
128 pmd_t *pmd = NULL;
129
130 pgd = pgd_offset(mm, address);
131 pud = pud_alloc(mm, pgd, address);
132 if (pud)
133 /*
134 * Note that we didn't run this because the pmd was
135 * missing, the *pmd may be already established and in
136 * turn it may also be a trans_huge_pmd.
137 */
138 pmd = pmd_alloc(mm, pud, address);
139 return pmd;
140}
141
142static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
143 unsigned long dst_start,
144 unsigned long src_start,
145 unsigned long len,
146 bool zeropage)
147{
148 struct vm_area_struct *dst_vma;
149 ssize_t err;
150 pmd_t *dst_pmd;
151 unsigned long src_addr, dst_addr;
152 long copied;
153 struct page *page;
154
155 /*
156 * Sanitize the command parameters:
157 */
158 BUG_ON(dst_start & ~PAGE_MASK);
159 BUG_ON(len & ~PAGE_MASK);
160
161 /* Does the address range wrap, or is the span zero-sized? */
162 BUG_ON(src_start + len <= src_start);
163 BUG_ON(dst_start + len <= dst_start);
164
165 src_addr = src_start;
166 dst_addr = dst_start;
167 copied = 0;
168 page = NULL;
169retry:
170 down_read(&dst_mm->mmap_sem);
171
172 /*
173 * Make sure the vma is not shared, that the dst range is
174 * both valid and fully within a single existing vma.
175 */
176 err = -EINVAL;
177 dst_vma = find_vma(dst_mm, dst_start);
178 if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
179 goto out_unlock;
180 if (dst_start < dst_vma->vm_start ||
181 dst_start + len > dst_vma->vm_end)
182 goto out_unlock;
183
184 /*
185 * Be strict and only allow __mcopy_atomic on userfaultfd
186 * registered ranges to prevent userland errors going
187 * unnoticed. As far as the VM consistency is concerned, it
188 * would be perfectly safe to remove this check, but there's
 189 * no useful usage for __mcopy_atomic outside of userfaultfd
190 * registered ranges. This is after all why these are ioctls
191 * belonging to the userfaultfd and not syscalls.
192 */
193 if (!dst_vma->vm_userfaultfd_ctx.ctx)
194 goto out_unlock;
195
196 /*
197 * FIXME: only allow copying on anonymous vmas, tmpfs should
198 * be added.
199 */
200 if (dst_vma->vm_ops)
201 goto out_unlock;
202
203 /*
 204 * Ensure the dst_vma has an anon_vma or this page
205 * would get a NULL anon_vma when moved in the
206 * dst_vma.
207 */
208 err = -ENOMEM;
209 if (unlikely(anon_vma_prepare(dst_vma)))
210 goto out_unlock;
211
212 while (src_addr < src_start + len) {
213 pmd_t dst_pmdval;
214
215 BUG_ON(dst_addr >= dst_start + len);
216
217 dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
218 if (unlikely(!dst_pmd)) {
219 err = -ENOMEM;
220 break;
221 }
222
223 dst_pmdval = pmd_read_atomic(dst_pmd);
224 /*
225 * If the dst_pmd is mapped as THP don't
226 * override it and just be strict.
227 */
228 if (unlikely(pmd_trans_huge(dst_pmdval))) {
229 err = -EEXIST;
230 break;
231 }
232 if (unlikely(pmd_none(dst_pmdval)) &&
233 unlikely(__pte_alloc(dst_mm, dst_vma, dst_pmd,
234 dst_addr))) {
235 err = -ENOMEM;
236 break;
237 }
238 /* If an huge pmd materialized from under us fail */
239 if (unlikely(pmd_trans_huge(*dst_pmd))) {
240 err = -EFAULT;
241 break;
242 }
243
244 BUG_ON(pmd_none(*dst_pmd));
245 BUG_ON(pmd_trans_huge(*dst_pmd));
246
247 if (!zeropage)
248 err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
249 dst_addr, src_addr, &page);
250 else
251 err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma,
252 dst_addr);
253
254 cond_resched();
255
256 if (unlikely(err == -EFAULT)) {
257 void *page_kaddr;
258
259 up_read(&dst_mm->mmap_sem);
260 BUG_ON(!page);
261
262 page_kaddr = kmap(page);
263 err = copy_from_user(page_kaddr,
264 (const void __user *) src_addr,
265 PAGE_SIZE);
266 kunmap(page);
267 if (unlikely(err)) {
268 err = -EFAULT;
269 goto out;
270 }
271 goto retry;
272 } else
273 BUG_ON(page);
274
275 if (!err) {
276 dst_addr += PAGE_SIZE;
277 src_addr += PAGE_SIZE;
278 copied += PAGE_SIZE;
279
280 if (fatal_signal_pending(current))
281 err = -EINTR;
282 }
283 if (err)
284 break;
285 }
286
287out_unlock:
288 up_read(&dst_mm->mmap_sem);
289out:
290 if (page)
291 page_cache_release(page);
292 BUG_ON(copied < 0);
293 BUG_ON(err > 0);
294 BUG_ON(!copied && !err);
295 return copied ? copied : err;
296}
297
298ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
299 unsigned long src_start, unsigned long len)
300{
301 return __mcopy_atomic(dst_mm, dst_start, src_start, len, false);
302}
303
304ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
305 unsigned long len)
306{
307 return __mcopy_atomic(dst_mm, start, 0, len, true);
308}
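mcopy_atomic() and mfill_zeropage() are the kernel backends of the new UFFDIO_COPY and UFFDIO_ZEROPAGE ioctls: the page (or zero-page PTE) is prepared first and only installed if the destination PTE is still none, so a racing fill surfaces as -EEXIST rather than overwriting an existing mapping. Continuing the hypothetical monitor from the mm/memory.c hunk above (same includes plus <errno.h> and <stdio.h>), a missing fault would be resolved roughly as follows:

static unsigned long page_size;         /* assumed: cached sysconf(_SC_PAGESIZE) */
static void *src_page;                  /* assumed: buffer holding the page contents */

static void resolve_fault(int uffd, unsigned long addr)
{
        struct uffdio_copy copy = {
                .dst  = addr & ~(page_size - 1),
                .src  = (unsigned long)src_page,
                .len  = page_size,
                .mode = 0,              /* 0 = also wake the faulting thread(s) */
        };

        /* copy.copy reports bytes copied, or a negative error such as
         * -EEXIST when another thread resolved the fault first */
        if (ioctl(uffd, UFFDIO_COPY, &copy) && copy.copy != -EEXIST)
                perror("UFFDIO_COPY");
}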
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8286938c70de..b1139039122a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1057,7 +1057,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
1057 * processes. Try to unmap it here. 1057 * processes. Try to unmap it here.
1058 */ 1058 */
1059 if (page_mapped(page) && mapping) { 1059 if (page_mapped(page) && mapping) {
1060 switch (try_to_unmap(page, ttu_flags)) { 1060 switch (try_to_unmap(page,
1061 ttu_flags|TTU_BATCH_FLUSH)) {
1061 case SWAP_FAIL: 1062 case SWAP_FAIL:
1062 goto activate_locked; 1063 goto activate_locked;
1063 case SWAP_AGAIN: 1064 case SWAP_AGAIN:
@@ -1097,7 +1098,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
1097 if (!sc->may_writepage) 1098 if (!sc->may_writepage)
1098 goto keep_locked; 1099 goto keep_locked;
1099 1100
1100 /* Page is dirty, try to write it out here */ 1101 /*
1102 * Page is dirty. Flush the TLB if a writable entry
1103 * potentially exists to avoid CPU writes after IO
1104 * starts and then write it out here.
1105 */
1106 try_to_unmap_flush_dirty();
1101 switch (pageout(page, mapping, sc)) { 1107 switch (pageout(page, mapping, sc)) {
1102 case PAGE_KEEP: 1108 case PAGE_KEEP:
1103 goto keep_locked; 1109 goto keep_locked;
@@ -1208,6 +1214,7 @@ keep:
1208 } 1214 }
1209 1215
1210 mem_cgroup_uncharge_list(&free_pages); 1216 mem_cgroup_uncharge_list(&free_pages);
1217 try_to_unmap_flush();
1211 free_hot_cold_page_list(&free_pages, true); 1218 free_hot_cold_page_list(&free_pages, true);
1212 1219
1213 list_splice(&ret_pages, page_list); 1220 list_splice(&ret_pages, page_list);
@@ -2151,6 +2158,23 @@ out:
2151 } 2158 }
2152} 2159}
2153 2160
2161#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
2162static void init_tlb_ubc(void)
2163{
2164 /*
2165 * This deliberately does not clear the cpumask as it's expensive
2166 * and unnecessary. If there happens to be data in there then the
2167 * first SWAP_CLUSTER_MAX pages will send an unnecessary IPI and
2168 * then will be cleared.
2169 */
2170 current->tlb_ubc.flush_required = false;
2171}
2172#else
2173static inline void init_tlb_ubc(void)
2174{
2175}
2176#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
2177
2154/* 2178/*
2155 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 2179 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
2156 */ 2180 */
@@ -2185,6 +2209,8 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
2185 scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && 2209 scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
2186 sc->priority == DEF_PRIORITY); 2210 sc->priority == DEF_PRIORITY);
2187 2211
2212 init_tlb_ubc();
2213
2188 blk_start_plug(&plug); 2214 blk_start_plug(&plug);
2189 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 2215 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
2190 nr[LRU_INACTIVE_FILE]) { 2216 nr[LRU_INACTIVE_FILE]) {
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index f30329f72641..69a4d30a9ccf 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -517,8 +517,11 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
517 struct ceph_options *opt = client->options; 517 struct ceph_options *opt = client->options;
518 size_t pos = m->count; 518 size_t pos = m->count;
519 519
520 if (opt->name) 520 if (opt->name) {
521 seq_printf(m, "name=%s,", opt->name); 521 seq_puts(m, "name=");
522 seq_escape(m, opt->name, ", \t\n\\");
523 seq_putc(m, ',');
524 }
522 if (opt->key) 525 if (opt->key)
523 seq_puts(m, "secret=<hidden>,"); 526 seq_puts(m, "secret=<hidden>,");
524 527
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 337ca851a350..b140c092d226 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -297,7 +297,7 @@ static int rpc_complete_task(struct rpc_task *task)
297 clear_bit(RPC_TASK_ACTIVE, &task->tk_runstate); 297 clear_bit(RPC_TASK_ACTIVE, &task->tk_runstate);
298 ret = atomic_dec_and_test(&task->tk_count); 298 ret = atomic_dec_and_test(&task->tk_count);
299 if (waitqueue_active(wq)) 299 if (waitqueue_active(wq))
300 __wake_up_locked_key(wq, TASK_NORMAL, &k); 300 __wake_up_locked_key(wq, TASK_NORMAL, 1, &k);
301 spin_unlock_irqrestore(&wq->lock, flags); 301 spin_unlock_irqrestore(&wq->lock, flags);
302 return ret; 302 return ret;
303} 303}
diff --git a/scripts/Lindent b/scripts/Lindent
index 9c4b3e2b7098..6d889de4e70b 100755
--- a/scripts/Lindent
+++ b/scripts/Lindent
@@ -1,6 +1,9 @@
1#!/bin/sh 1#!/bin/sh
2PARAM="-npro -kr -i8 -ts8 -sob -l80 -ss -ncs -cp1" 2PARAM="-npro -kr -i8 -ts8 -sob -l80 -ss -ncs -cp1"
3RES=`indent --version` 3RES=`indent --version`
4if [ "$RES" = "" ]; then
5 exit 1
6fi
4V1=`echo $RES | cut -d' ' -f3 | cut -d'.' -f1` 7V1=`echo $RES | cut -d' ' -f3 | cut -d'.' -f1`
5V2=`echo $RES | cut -d' ' -f3 | cut -d'.' -f2` 8V2=`echo $RES | cut -d' ' -f3 | cut -d'.' -f2`
6V3=`echo $RES | cut -d' ' -f3 | cut -d'.' -f3` 9V3=`echo $RES | cut -d' ' -f3 | cut -d'.' -f3`
diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh
index 515c4c00e957..00d6d53c2681 100755
--- a/scripts/decode_stacktrace.sh
+++ b/scripts/decode_stacktrace.sh
@@ -14,11 +14,14 @@ declare -A cache
14 14
15parse_symbol() { 15parse_symbol() {
16 # The structure of symbol at this point is: 16 # The structure of symbol at this point is:
17 # [name]+[offset]/[total length] 17 # ([name]+[offset]/[total length])
18 # 18 #
19 # For example: 19 # For example:
20 # do_basic_setup+0x9c/0xbf 20 # do_basic_setup+0x9c/0xbf
21 21
22 # Remove the englobing parenthesis
23 symbol=${symbol#\(}
24 symbol=${symbol%\)}
22 25
23 # Strip the symbol name so that we could look it up 26 # Strip the symbol name so that we could look it up
24 local name=${symbol%+*} 27 local name=${symbol%+*}
diff --git a/scripts/kernel-doc b/scripts/kernel-doc
index a7bf5f68aacb..9a08fb5c1af6 100755
--- a/scripts/kernel-doc
+++ b/scripts/kernel-doc
@@ -469,7 +469,7 @@ sub dump_section {
469 } else { 469 } else {
470# print STDERR "other section '$name' = '$contents'\n"; 470# print STDERR "other section '$name' = '$contents'\n";
471 if (defined($sections{$name}) && ($sections{$name} ne "")) { 471 if (defined($sections{$name}) && ($sections{$name} ne "")) {
472 print STDERR "Error(${file}:$.): duplicate section name '$name'\n"; 472 print STDERR "${file}:$.: error: duplicate section name '$name'\n";
473 ++$errors; 473 ++$errors;
474 } 474 }
475 $sections{$name} = $contents; 475 $sections{$name} = $contents;
@@ -1820,7 +1820,7 @@ sub dump_struct($$) {
1820 }); 1820 });
1821 } 1821 }
1822 else { 1822 else {
1823 print STDERR "Error(${file}:$.): Cannot parse struct or union!\n"; 1823 print STDERR "${file}:$.: error: Cannot parse struct or union!\n";
1824 ++$errors; 1824 ++$errors;
1825 } 1825 }
1826} 1826}
@@ -1841,7 +1841,7 @@ sub dump_enum($$) {
1841 push @parameterlist, $arg; 1841 push @parameterlist, $arg;
1842 if (!$parameterdescs{$arg}) { 1842 if (!$parameterdescs{$arg}) {
1843 $parameterdescs{$arg} = $undescribed; 1843 $parameterdescs{$arg} = $undescribed;
1844 print STDERR "Warning(${file}:$.): Enum value '$arg' ". 1844 print STDERR "${file}:$.: warning: Enum value '$arg' ".
1845 "not described in enum '$declaration_name'\n"; 1845 "not described in enum '$declaration_name'\n";
1846 } 1846 }
1847 1847
@@ -1859,7 +1859,7 @@ sub dump_enum($$) {
1859 }); 1859 });
1860 } 1860 }
1861 else { 1861 else {
1862 print STDERR "Error(${file}:$.): Cannot parse enum!\n"; 1862 print STDERR "${file}:$.: error: Cannot parse enum!\n";
1863 ++$errors; 1863 ++$errors;
1864 } 1864 }
1865} 1865}
@@ -1887,7 +1887,7 @@ sub dump_typedef($$) {
1887 }); 1887 });
1888 } 1888 }
1889 else { 1889 else {
1890 print STDERR "Error(${file}:$.): Cannot parse typedef!\n"; 1890 print STDERR "${file}:$.: error: Cannot parse typedef!\n";
1891 ++$errors; 1891 ++$errors;
1892 } 1892 }
1893} 1893}
@@ -2019,11 +2019,11 @@ sub push_parameter($$$) {
2019 $parameterdescs{$param_name} = $undescribed; 2019 $parameterdescs{$param_name} = $undescribed;
2020 2020
2021 if (($type eq 'function') || ($type eq 'enum')) { 2021 if (($type eq 'function') || ($type eq 'enum')) {
2022 print STDERR "Warning(${file}:$.): Function parameter ". 2022 print STDERR "${file}:$.: warning: Function parameter ".
2023 "or member '$param' not " . 2023 "or member '$param' not " .
2024 "described in '$declaration_name'\n"; 2024 "described in '$declaration_name'\n";
2025 } 2025 }
2026 print STDERR "Warning(${file}:$.):" . 2026 print STDERR "${file}:$.: warning:" .
2027 " No description found for parameter '$param'\n"; 2027 " No description found for parameter '$param'\n";
2028 ++$warnings; 2028 ++$warnings;
2029 } 2029 }
@@ -2074,14 +2074,14 @@ sub check_sections($$$$$$) {
2074 } 2074 }
2075 if ($err) { 2075 if ($err) {
2076 if ($decl_type eq "function") { 2076 if ($decl_type eq "function") {
2077 print STDERR "Warning(${file}:$.): " . 2077 print STDERR "${file}:$.: warning: " .
2078 "Excess function parameter " . 2078 "Excess function parameter " .
2079 "'$sects[$sx]' " . 2079 "'$sects[$sx]' " .
2080 "description in '$decl_name'\n"; 2080 "description in '$decl_name'\n";
2081 ++$warnings; 2081 ++$warnings;
2082 } else { 2082 } else {
2083 if ($nested !~ m/\Q$sects[$sx]\E/) { 2083 if ($nested !~ m/\Q$sects[$sx]\E/) {
2084 print STDERR "Warning(${file}:$.): " . 2084 print STDERR "${file}:$.: warning: " .
2085 "Excess struct/union/enum/typedef member " . 2085 "Excess struct/union/enum/typedef member " .
2086 "'$sects[$sx]' " . 2086 "'$sects[$sx]' " .
2087 "description in '$decl_name'\n"; 2087 "description in '$decl_name'\n";
@@ -2107,7 +2107,7 @@ sub check_return_section {
2107 2107
2108 if (!defined($sections{$section_return}) || 2108 if (!defined($sections{$section_return}) ||
2109 $sections{$section_return} eq "") { 2109 $sections{$section_return} eq "") {
2110 print STDERR "Warning(${file}:$.): " . 2110 print STDERR "${file}:$.: warning: " .
2111 "No description found for return value of " . 2111 "No description found for return value of " .
2112 "'$declaration_name'\n"; 2112 "'$declaration_name'\n";
2113 ++$warnings; 2113 ++$warnings;
@@ -2186,7 +2186,7 @@ sub dump_function($$) {
2186 2186
2187 create_parameterlist($args, ',', $file); 2187 create_parameterlist($args, ',', $file);
2188 } else { 2188 } else {
2189 print STDERR "Warning(${file}:$.): cannot understand function prototype: '$prototype'\n"; 2189 print STDERR "${file}:$.: warning: cannot understand function prototype: '$prototype'\n";
2190 return; 2190 return;
2191 } 2191 }
2192 2192
@@ -2251,7 +2251,7 @@ sub tracepoint_munge($) {
2251 $tracepointargs = $1; 2251 $tracepointargs = $1;
2252 } 2252 }
2253 if (($tracepointname eq 0) || ($tracepointargs eq 0)) { 2253 if (($tracepointname eq 0) || ($tracepointargs eq 0)) {
2254 print STDERR "Warning(${file}:$.): Unrecognized tracepoint format: \n". 2254 print STDERR "${file}:$.: warning: Unrecognized tracepoint format: \n".
2255 "$prototype\n"; 2255 "$prototype\n";
2256 } else { 2256 } else {
2257 $prototype = "static inline void trace_$tracepointname($tracepointargs)"; 2257 $prototype = "static inline void trace_$tracepointname($tracepointargs)";
@@ -2450,7 +2450,7 @@ sub process_file($) {
2450 } 2450 }
2451 2451
2452 if (($declaration_purpose eq "") && $verbose) { 2452 if (($declaration_purpose eq "") && $verbose) {
2453 print STDERR "Warning(${file}:$.): missing initial short description on line:\n"; 2453 print STDERR "${file}:$.: warning: missing initial short description on line:\n";
2454 print STDERR $_; 2454 print STDERR $_;
2455 ++$warnings; 2455 ++$warnings;
2456 } 2456 }
@@ -2468,10 +2468,10 @@ sub process_file($) {
2468 } 2468 }
2469 2469
2470 if ($verbose) { 2470 if ($verbose) {
2471 print STDERR "Info(${file}:$.): Scanning doc for $identifier\n"; 2471 print STDERR "${file}:$.: info: Scanning doc for $identifier\n";
2472 } 2472 }
2473 } else { 2473 } else {
2474 print STDERR "Warning(${file}:$.): Cannot understand $_ on line $.", 2474 print STDERR "${file}:$.: warning: Cannot understand $_ on line $.",
2475 " - I thought it was a doc line\n"; 2475 " - I thought it was a doc line\n";
2476 ++$warnings; 2476 ++$warnings;
2477 $state = 0; 2477 $state = 0;
@@ -2483,7 +2483,7 @@ sub process_file($) {
2483 2483
2484 if (($contents ne "") && ($contents ne "\n")) { 2484 if (($contents ne "") && ($contents ne "\n")) {
2485 if (!$in_doc_sect && $verbose) { 2485 if (!$in_doc_sect && $verbose) {
2486 print STDERR "Warning(${file}:$.): contents before sections\n"; 2486 print STDERR "${file}:$.: warning: contents before sections\n";
2487 ++$warnings; 2487 ++$warnings;
2488 } 2488 }
2489 dump_section($file, $section, xml_escape($contents)); 2489 dump_section($file, $section, xml_escape($contents));
@@ -2509,7 +2509,7 @@ sub process_file($) {
2509 } 2509 }
2510 # look for doc_com + <text> + doc_end: 2510 # look for doc_com + <text> + doc_end:
2511 if ($_ =~ m'\s*\*\s*[a-zA-Z_0-9:\.]+\*/') { 2511 if ($_ =~ m'\s*\*\s*[a-zA-Z_0-9:\.]+\*/') {
2512 print STDERR "Warning(${file}:$.): suspicious ending line: $_"; 2512 print STDERR "${file}:$.: warning: suspicious ending line: $_";
2513 ++$warnings; 2513 ++$warnings;
2514 } 2514 }
2515 2515
@@ -2539,7 +2539,7 @@ sub process_file($) {
2539 } 2539 }
2540 } else { 2540 } else {
2541 # i dont know - bad line? ignore. 2541 # i dont know - bad line? ignore.
2542 print STDERR "Warning(${file}:$.): bad line: $_"; 2542 print STDERR "${file}:$.: warning: bad line: $_";
2543 ++$warnings; 2543 ++$warnings;
2544 } 2544 }
2545 } elsif ($state == 5) { # scanning for split parameters 2545 } elsif ($state == 5) { # scanning for split parameters
@@ -2631,7 +2631,7 @@ sub process_file($) {
2631 } 2631 }
2632 } 2632 }
2633 if ($initial_section_counter == $section_counter) { 2633 if ($initial_section_counter == $section_counter) {
2634 print STDERR "Warning(${file}): no structured comments found\n"; 2634 print STDERR "${file}:1: warning: no structured comments found\n";
2635 if (($function_only == 1) && ($show_not_found == 1)) { 2635 if (($function_only == 1) && ($show_not_found == 1)) {
2636 print STDERR " Was looking for '$_'.\n" for keys %function_table; 2636 print STDERR " Was looking for '$_'.\n" for keys %function_table;
2637 } 2637 }
diff --git a/scripts/spelling.txt b/scripts/spelling.txt
index bb8e4d0a1911..946caf3bd694 100644
--- a/scripts/spelling.txt
+++ b/scripts/spelling.txt
@@ -32,6 +32,7 @@ accoring||according
32accout||account 32accout||account
33accquire||acquire 33accquire||acquire
34accquired||acquired 34accquired||acquired
35accross||across
35acessable||accessible 36acessable||accessible
36acess||access 37acess||access
37achitecture||architecture 38achitecture||architecture
@@ -100,8 +101,10 @@ appropiate||appropriate
100appropriatly||appropriately 101appropriatly||appropriately
101approriate||appropriate 102approriate||appropriate
102approriately||appropriately 103approriately||appropriately
104apropriate||appropriate
103aquainted||acquainted 105aquainted||acquainted
104aquired||acquired 106aquired||acquired
107aquisition||acquisition
105arbitary||arbitrary 108arbitary||arbitrary
106architechture||architecture 109architechture||architecture
107arguement||argument 110arguement||argument
@@ -111,6 +114,8 @@ arne't||aren't
111arraival||arrival 114arraival||arrival
112artifical||artificial 115artifical||artificial
113artillary||artillery 116artillary||artillery
117asign||assign
118assertation||assertion
114assiged||assigned 119assiged||assigned
115assigment||assignment 120assigment||assignment
116assigments||assignments 121assigments||assignments
@@ -136,6 +141,7 @@ automatize||automate
136automatized||automated 141automatized||automated
137automatizes||automates 142automatizes||automates
138autonymous||autonomous 143autonymous||autonomous
144auxillary||auxiliary
139auxilliary||auxiliary 145auxilliary||auxiliary
140avaiable||available 146avaiable||available
141avaible||available 147avaible||available
@@ -187,6 +193,7 @@ capatibilities||capabilities
187carefuly||carefully 193carefuly||carefully
188cariage||carriage 194cariage||carriage
189catagory||category 195catagory||category
196cehck||check
190challange||challenge 197challange||challenge
191challanges||challenges 198challanges||challenges
192chanell||channel 199chanell||channel
@@ -199,6 +206,8 @@ charactor||character
199charater||character 206charater||character
200charaters||characters 207charaters||characters
201charcter||character 208charcter||character
209chcek||check
210chck||check
202checksuming||checksumming 211checksuming||checksumming
203childern||children 212childern||children
204childs||children 213childs||children
@@ -231,6 +240,8 @@ compatability||compatibility
231compatable||compatible 240compatable||compatible
232compatibiliy||compatibility 241compatibiliy||compatibility
233compatibilty||compatibility 242compatibilty||compatibility
243compatiblity||compatibility
244competion||completion
234compilant||compliant 245compilant||compliant
235compleatly||completely 246compleatly||completely
236completly||completely 247completly||completely
@@ -291,6 +302,7 @@ defferred||deferred
291definate||definite 302definate||definite
292definately||definitely 303definately||definitely
293defintion||definition 304defintion||definition
305defintions||definitions
294defualt||default 306defualt||default
295defult||default 307defult||default
296deivce||device 308deivce||device
@@ -306,6 +318,7 @@ depreacted||deprecated
306depreacte||deprecate 318depreacte||deprecate
307desactivate||deactivate 319desactivate||deactivate
308desciptors||descriptors 320desciptors||descriptors
321descripton||description
309descrition||description 322descrition||description
310descritptor||descriptor 323descritptor||descriptor
311desctiptor||descriptor 324desctiptor||descriptor
@@ -327,6 +340,7 @@ devided||divided
327deviece||device 340deviece||device
328diable||disable 341diable||disable
329dictionnary||dictionary 342dictionnary||dictionary
343didnt||didn't
330diferent||different 344diferent||different
331differrence||difference 345differrence||difference
332difinition||definition 346difinition||definition
@@ -344,6 +358,7 @@ docuentation||documentation
344documantation||documentation 358documantation||documentation
345documentaion||documentation 359documentaion||documentation
346documment||document 360documment||document
361doesnt||doesn't
347dorp||drop 362dorp||drop
348dosen||doesn 363dosen||doesn
349downlad||download 364downlad||download
@@ -450,11 +465,13 @@ grahical||graphical
450grahpical||graphical 465grahpical||graphical
451grapic||graphic 466grapic||graphic
452guage||gauge 467guage||gauge
468guarenteed||guaranteed
453guarentee||guarantee 469guarentee||guarantee
454halfs||halves 470halfs||halves
455hander||handler 471hander||handler
456handfull||handful 472handfull||handful
457hanled||handled 473hanled||handled
474happend||happened
458harware||hardware 475harware||hardware
459heirarchically||hierarchically 476heirarchically||hierarchically
460helpfull||helpful 477helpfull||helpful
@@ -512,6 +529,7 @@ initialzed||initialized
512initilization||initialization 529initilization||initialization
513initilize||initialize 530initilize||initialize
514inofficial||unofficial 531inofficial||unofficial
532insititute||institute
515instal||install 533instal||install
516inteface||interface 534inteface||interface
517integreated||integrated 535integreated||integrated
@@ -546,6 +564,7 @@ invididual||individual
546invokation||invocation 564invokation||invocation
547invokations||invocations 565invokations||invocations
548irrelevent||irrelevant 566irrelevent||irrelevant
567isnt||isn't
549isssue||issue 568isssue||issue
550itslef||itself 569itslef||itself
551jave||java 570jave||java
@@ -558,6 +577,7 @@ langauage||language
558langauge||language 577langauge||language
559langugage||language 578langugage||language
560lauch||launch 579lauch||launch
580layed||laid
561leightweight||lightweight 581leightweight||lightweight
562lengh||length 582lengh||length
563lenght||length 583lenght||length
@@ -714,6 +734,7 @@ preceeding||preceding
714preceed||precede 734preceed||precede
715precendence||precedence 735precendence||precedence
716precission||precision 736precission||precision
737preemptable||preemptible
717prefered||preferred 738prefered||preferred
718prefferably||preferably 739prefferably||preferably
719premption||preemption 740premption||preemption
@@ -744,6 +765,7 @@ programers||programmers
744programm||program 765programm||program
745programms||programs 766programms||programs
746progresss||progress 767progresss||progress
768promiscous||promiscuous
747promps||prompts 769promps||prompts
748pronnounced||pronounced 770pronnounced||pronounced
749prononciation||pronunciation 771prononciation||pronunciation
@@ -817,6 +839,7 @@ reseting||resetting
817resizeable||resizable 839resizeable||resizable
818resouces||resources 840resouces||resources
819resoures||resources 841resoures||resources
842responce||response
820ressizes||resizes 843ressizes||resizes
821ressource||resource 844ressource||resource
822ressources||resources 845ressources||resources
@@ -869,6 +892,7 @@ setts||sets
869settting||setting 892settting||setting
870shotdown||shutdown 893shotdown||shutdown
871shoud||should 894shoud||should
895shouldnt||shouldn't
872shoule||should 896shoule||should
873shrinked||shrunk 897shrinked||shrunk
874siginificantly||significantly 898siginificantly||significantly
@@ -913,9 +937,11 @@ straming||streaming
913struc||struct 937struc||struct
914structres||structures 938structres||structures
915stuct||struct 939stuct||struct
940stucture||structure
916sturcture||structure 941sturcture||structure
917subdirectoires||subdirectories 942subdirectoires||subdirectories
918suble||subtle 943suble||subtle
944substract||subtract
919succesfully||successfully 945succesfully||successfully
920succesful||successful 946succesful||successful
921successfull||successful 947successfull||successful
@@ -987,6 +1013,7 @@ unexpectd||unexpected
987unexpeted||unexpected 1013unexpeted||unexpected
988unfortunatelly||unfortunately 1014unfortunatelly||unfortunately
989unifiy||unify 1015unifiy||unify
1016unintialized||uninitialized
990unknonw||unknown 1017unknonw||unknown
991unknow||unknown 1018unknow||unknown
992unkown||unknown 1019unkown||unknown
@@ -1027,7 +1054,9 @@ virtiual||virtual
1027visiters||visitors 1054visiters||visitors
1028vitual||virtual 1055vitual||virtual
1029wating||waiting 1056wating||waiting
1057wether||whether
1030whataver||whatever 1058whataver||whatever
1059whcih||which
1031whenver||whenever 1060whenver||whenever
1032wheter||whether 1061wheter||whether
1033whe||when 1062whe||when
diff --git a/security/commoncap.c b/security/commoncap.c
index d103f5a4043d..1832cf701c3d 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -267,6 +267,16 @@ int cap_capset(struct cred *new,
267 new->cap_effective = *effective; 267 new->cap_effective = *effective;
268 new->cap_inheritable = *inheritable; 268 new->cap_inheritable = *inheritable;
269 new->cap_permitted = *permitted; 269 new->cap_permitted = *permitted;
270
271 /*
272 * Mask off ambient bits that are no longer both permitted and
273 * inheritable.
274 */
275 new->cap_ambient = cap_intersect(new->cap_ambient,
276 cap_intersect(*permitted,
277 *inheritable));
278 if (WARN_ON(!cap_ambient_invariant_ok(new)))
279 return -EINVAL;
270 return 0; 280 return 0;
271} 281}
272 282
@@ -347,6 +357,7 @@ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
347 357
348 /* 358 /*
349 * pP' = (X & fP) | (pI & fI) 359 * pP' = (X & fP) | (pI & fI)
360 * The addition of pA' is handled later.
350 */ 361 */
351 new->cap_permitted.cap[i] = 362 new->cap_permitted.cap[i] =
352 (new->cap_bset.cap[i] & permitted) | 363 (new->cap_bset.cap[i] & permitted) |
@@ -474,10 +485,13 @@ int cap_bprm_set_creds(struct linux_binprm *bprm)
474{ 485{
475 const struct cred *old = current_cred(); 486 const struct cred *old = current_cred();
476 struct cred *new = bprm->cred; 487 struct cred *new = bprm->cred;
477 bool effective, has_cap = false; 488 bool effective, has_cap = false, is_setid;
478 int ret; 489 int ret;
479 kuid_t root_uid; 490 kuid_t root_uid;
480 491
492 if (WARN_ON(!cap_ambient_invariant_ok(old)))
493 return -EPERM;
494
481 effective = false; 495 effective = false;
482 ret = get_file_caps(bprm, &effective, &has_cap); 496 ret = get_file_caps(bprm, &effective, &has_cap);
483 if (ret < 0) 497 if (ret < 0)
@@ -522,8 +536,9 @@ skip:
522 * 536 *
523 * In addition, if NO_NEW_PRIVS, then ensure we get no new privs. 537 * In addition, if NO_NEW_PRIVS, then ensure we get no new privs.
524 */ 538 */
525 if ((!uid_eq(new->euid, old->uid) || 539 is_setid = !uid_eq(new->euid, old->uid) || !gid_eq(new->egid, old->gid);
526 !gid_eq(new->egid, old->gid) || 540
541 if ((is_setid ||
527 !cap_issubset(new->cap_permitted, old->cap_permitted)) && 542 !cap_issubset(new->cap_permitted, old->cap_permitted)) &&
528 bprm->unsafe & ~LSM_UNSAFE_PTRACE_CAP) { 543 bprm->unsafe & ~LSM_UNSAFE_PTRACE_CAP) {
529 /* downgrade; they get no more than they had, and maybe less */ 544 /* downgrade; they get no more than they had, and maybe less */
@@ -539,10 +554,28 @@ skip:
539 new->suid = new->fsuid = new->euid; 554 new->suid = new->fsuid = new->euid;
540 new->sgid = new->fsgid = new->egid; 555 new->sgid = new->fsgid = new->egid;
541 556
557 /* File caps or setid cancels ambient. */
558 if (has_cap || is_setid)
559 cap_clear(new->cap_ambient);
560
561 /*
562 * Now that we've computed pA', update pP' to give:
563 * pP' = (X & fP) | (pI & fI) | pA'
564 */
565 new->cap_permitted = cap_combine(new->cap_permitted, new->cap_ambient);
566
567 /*
568 * Set pE' = (fE ? pP' : pA'). Because pA' is zero if fE is set,
569 * this is the same as pE' = (fE ? pP' : 0) | pA'.
570 */
542 if (effective) 571 if (effective)
543 new->cap_effective = new->cap_permitted; 572 new->cap_effective = new->cap_permitted;
544 else 573 else
545 cap_clear(new->cap_effective); 574 new->cap_effective = new->cap_ambient;
575
576 if (WARN_ON(!cap_ambient_invariant_ok(new)))
577 return -EPERM;
578
546 bprm->cap_effective = effective; 579 bprm->cap_effective = effective;
547 580
548 /* 581 /*
@@ -557,7 +590,7 @@ skip:
557 * Number 1 above might fail if you don't have a full bset, but I think 590 * Number 1 above might fail if you don't have a full bset, but I think
558 * that is interesting information to audit. 591 * that is interesting information to audit.
559 */ 592 */
560 if (!cap_isclear(new->cap_effective)) { 593 if (!cap_issubset(new->cap_effective, new->cap_ambient)) {
561 if (!cap_issubset(CAP_FULL_SET, new->cap_effective) || 594 if (!cap_issubset(CAP_FULL_SET, new->cap_effective) ||
562 !uid_eq(new->euid, root_uid) || !uid_eq(new->uid, root_uid) || 595 !uid_eq(new->euid, root_uid) || !uid_eq(new->uid, root_uid) ||
563 issecure(SECURE_NOROOT)) { 596 issecure(SECURE_NOROOT)) {
@@ -568,6 +601,10 @@ skip:
568 } 601 }
569 602
570 new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); 603 new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
604
605 if (WARN_ON(!cap_ambient_invariant_ok(new)))
606 return -EPERM;
607
571 return 0; 608 return 0;
572} 609}
573 610
@@ -589,7 +626,7 @@ int cap_bprm_secureexec(struct linux_binprm *bprm)
589 if (!uid_eq(cred->uid, root_uid)) { 626 if (!uid_eq(cred->uid, root_uid)) {
590 if (bprm->cap_effective) 627 if (bprm->cap_effective)
591 return 1; 628 return 1;
592 if (!cap_isclear(cred->cap_permitted)) 629 if (!cap_issubset(cred->cap_permitted, cred->cap_ambient))
593 return 1; 630 return 1;
594 } 631 }
595 632
@@ -691,10 +728,18 @@ static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old)
691 uid_eq(old->suid, root_uid)) && 728 uid_eq(old->suid, root_uid)) &&
692 (!uid_eq(new->uid, root_uid) && 729 (!uid_eq(new->uid, root_uid) &&
693 !uid_eq(new->euid, root_uid) && 730 !uid_eq(new->euid, root_uid) &&
694 !uid_eq(new->suid, root_uid)) && 731 !uid_eq(new->suid, root_uid))) {
695 !issecure(SECURE_KEEP_CAPS)) { 732 if (!issecure(SECURE_KEEP_CAPS)) {
696 cap_clear(new->cap_permitted); 733 cap_clear(new->cap_permitted);
697 cap_clear(new->cap_effective); 734 cap_clear(new->cap_effective);
735 }
736
737 /*
738 * Pre-ambient programs expect setresuid to nonroot followed
739 * by exec to drop capabilities. We should make sure that
740 * this remains the case.
741 */
742 cap_clear(new->cap_ambient);
698 } 743 }
699 if (uid_eq(old->euid, root_uid) && !uid_eq(new->euid, root_uid)) 744 if (uid_eq(old->euid, root_uid) && !uid_eq(new->euid, root_uid))
700 cap_clear(new->cap_effective); 745 cap_clear(new->cap_effective);
@@ -924,6 +969,44 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
924 new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); 969 new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
925 return commit_creds(new); 970 return commit_creds(new);
926 971
972 case PR_CAP_AMBIENT:
973 if (arg2 == PR_CAP_AMBIENT_CLEAR_ALL) {
974 if (arg3 | arg4 | arg5)
975 return -EINVAL;
976
977 new = prepare_creds();
978 if (!new)
979 return -ENOMEM;
980 cap_clear(new->cap_ambient);
981 return commit_creds(new);
982 }
983
984 if (((!cap_valid(arg3)) | arg4 | arg5))
985 return -EINVAL;
986
987 if (arg2 == PR_CAP_AMBIENT_IS_SET) {
988 return !!cap_raised(current_cred()->cap_ambient, arg3);
989 } else if (arg2 != PR_CAP_AMBIENT_RAISE &&
990 arg2 != PR_CAP_AMBIENT_LOWER) {
991 return -EINVAL;
992 } else {
993 if (arg2 == PR_CAP_AMBIENT_RAISE &&
994 (!cap_raised(current_cred()->cap_permitted, arg3) ||
995 !cap_raised(current_cred()->cap_inheritable,
996 arg3) ||
997 issecure(SECURE_NO_CAP_AMBIENT_RAISE)))
998 return -EPERM;
999
1000 new = prepare_creds();
1001 if (!new)
1002 return -ENOMEM;
1003 if (arg2 == PR_CAP_AMBIENT_RAISE)
1004 cap_raise(new->cap_ambient, arg3);
1005 else
1006 cap_lower(new->cap_ambient, arg3);
1007 return commit_creds(new);
1008 }
1009
927 default: 1010 default:
928 /* No functionality available - continue with default */ 1011 /* No functionality available - continue with default */
929 return -ENOSYS; 1012 return -ENOSYS;
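
The PR_CAP_AMBIENT prctl interface added above can be exercised from userspace roughly as in the sketch below. This example is illustrative and not part of the merge; it assumes CAP_NET_BIND_SERVICE has already been raised in the caller's permitted and inheritable sets (for instance via libcap-ng, as the selftests further down do), since PR_CAP_AMBIENT_RAISE is otherwise rejected with EPERM. After a successful raise, an execve() of a non-setid binary with no file capabilities keeps the bit in pA, and for a non-root caller the rules in cap_bprm_set_creds() then yield pP' = pA' and pE' = pA', which is what the non-root "+ia => eipa" selftest case checks.

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/prctl.h>
#include <linux/capability.h>

#ifndef PR_CAP_AMBIENT
#define PR_CAP_AMBIENT			47
# define PR_CAP_AMBIENT_IS_SET		1
# define PR_CAP_AMBIENT_RAISE		2
# define PR_CAP_AMBIENT_LOWER		3
# define PR_CAP_AMBIENT_CLEAR_ALL	4
#endif

int main(int argc, char **argv)
{
	if (argc < 2) {
		fprintf(stderr, "usage: %s <program> [args...]\n", argv[0]);
		return 1;
	}

	/*
	 * Raise the ambient bit.  Per cap_task_prctl() above this fails
	 * with EPERM unless the capability is already in pP and pI, and
	 * with EINVAL on kernels without ambient capability support.
	 */
	if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE,
		  CAP_NET_BIND_SERVICE, 0, 0) != 0) {
		fprintf(stderr, "PR_CAP_AMBIENT_RAISE: %s\n", strerror(errno));
		return 1;
	}

	/*
	 * The ambient bit survives execve() of a non-setid binary without
	 * file capabilities, so the new program starts with the capability
	 * in its permitted, effective and ambient sets.
	 */
	execvp(argv[1], &argv[1]);
	perror("execvp");
	return 1;
}

A launcher like this, with the capability pre-granted by a privileged parent, lets an otherwise unprivileged, capability-less binary bind low ports after exec; the program structure and error handling here are purely illustrative.
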
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index bd536cb221e2..43b4cddbf2b3 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -848,6 +848,7 @@ void key_change_session_keyring(struct callback_head *twork)
848 new->cap_inheritable = old->cap_inheritable; 848 new->cap_inheritable = old->cap_inheritable;
849 new->cap_permitted = old->cap_permitted; 849 new->cap_permitted = old->cap_permitted;
850 new->cap_effective = old->cap_effective; 850 new->cap_effective = old->cap_effective;
851 new->cap_ambient = old->cap_ambient;
851 new->cap_bset = old->cap_bset; 852 new->cap_bset = old->cap_bset;
852 853
853 new->jit_keyring = old->jit_keyring; 854 new->jit_keyring = old->jit_keyring;
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 564079c5c49d..cdf4c589a391 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1100,7 +1100,7 @@ static void selinux_write_opts(struct seq_file *m,
1100 seq_puts(m, prefix); 1100 seq_puts(m, prefix);
1101 if (has_comma) 1101 if (has_comma)
1102 seq_putc(m, '\"'); 1102 seq_putc(m, '\"');
1103 seq_puts(m, opts->mnt_opts[i]); 1103 seq_escape(m, opts->mnt_opts[i], "\"\n\\");
1104 if (has_comma) 1104 if (has_comma)
1105 seq_putc(m, '\"'); 1105 seq_putc(m, '\"');
1106 } 1106 }
diff --git a/tools/testing/selftests/capabilities/.gitignore b/tools/testing/selftests/capabilities/.gitignore
new file mode 100644
index 000000000000..b732dd0d4738
--- /dev/null
+++ b/tools/testing/selftests/capabilities/.gitignore
@@ -0,0 +1,2 @@
1test_execve
2validate_cap
diff --git a/tools/testing/selftests/capabilities/Makefile b/tools/testing/selftests/capabilities/Makefile
new file mode 100644
index 000000000000..8c8f0c1f0889
--- /dev/null
+++ b/tools/testing/selftests/capabilities/Makefile
@@ -0,0 +1,18 @@
1all:
2
3include ../lib.mk
4
5.PHONY: all clean
6
7TARGETS := validate_cap test_execve
8TEST_PROGS := test_execve
9
10CFLAGS := -O2 -g -std=gnu99 -Wall -lcap-ng
11
12all: $(TARGETS)
13
14clean:
15 $(RM) $(TARGETS)
16
17$(TARGETS): %: %.c
18 $(CC) -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl
diff --git a/tools/testing/selftests/capabilities/test_execve.c b/tools/testing/selftests/capabilities/test_execve.c
new file mode 100644
index 000000000000..10a21a958aaf
--- /dev/null
+++ b/tools/testing/selftests/capabilities/test_execve.c
@@ -0,0 +1,427 @@
1#define _GNU_SOURCE
2
3#include <cap-ng.h>
4#include <err.h>
5#include <linux/capability.h>
6#include <stdbool.h>
7#include <string.h>
8#include <stdio.h>
9#include <fcntl.h>
10#include <errno.h>
11#include <stdarg.h>
12#include <sched.h>
13#include <sys/mount.h>
14#include <limits.h>
15#include <libgen.h>
16#include <malloc.h>
17#include <sys/wait.h>
18#include <sys/prctl.h>
19#include <sys/stat.h>
20
21#ifndef PR_CAP_AMBIENT
22#define PR_CAP_AMBIENT 47
23# define PR_CAP_AMBIENT_IS_SET 1
24# define PR_CAP_AMBIENT_RAISE 2
25# define PR_CAP_AMBIENT_LOWER 3
26# define PR_CAP_AMBIENT_CLEAR_ALL 4
27#endif
28
29static int nerrs;
30
31static void vmaybe_write_file(bool enoent_ok, char *filename, char *fmt, va_list ap)
32{
33 char buf[4096];
34 int fd;
35 ssize_t written;
36 int buf_len;
37
38 buf_len = vsnprintf(buf, sizeof(buf), fmt, ap);
39 if (buf_len < 0) {
40 err(1, "vsnprintf failed");
41 }
42 if (buf_len >= sizeof(buf)) {
43 errx(1, "vsnprintf output truncated");
44 }
45
46 fd = open(filename, O_WRONLY);
47 if (fd < 0) {
48 if ((errno == ENOENT) && enoent_ok)
49 return;
50 err(1, "open of %s failed", filename);
51 }
52 written = write(fd, buf, buf_len);
53 if (written != buf_len) {
54 if (written >= 0) {
55 errx(1, "short write to %s", filename);
56 } else {
57 err(1, "write to %s failed", filename);
58 }
59 }
60 if (close(fd) != 0) {
61 err(1, "close of %s failed", filename);
62 }
63}
64
65static void maybe_write_file(char *filename, char *fmt, ...)
66{
67 va_list ap;
68
69 va_start(ap, fmt);
70 vmaybe_write_file(true, filename, fmt, ap);
71 va_end(ap);
72}
73
74static void write_file(char *filename, char *fmt, ...)
75{
76 va_list ap;
77
78 va_start(ap, fmt);
79 vmaybe_write_file(false, filename, fmt, ap);
80 va_end(ap);
81}
82
83static bool create_and_enter_ns(uid_t inner_uid)
84{
85 uid_t outer_uid;
86 gid_t outer_gid;
87 int i;
88 bool have_outer_privilege;
89
90 outer_uid = getuid();
91 outer_gid = getgid();
92
93 /*
94 * TODO: If we're already root, we could skip creating the userns.
95 */
96
97 if (unshare(CLONE_NEWNS) == 0) {
98 printf("[NOTE]\tUsing global UIDs for tests\n");
99 if (prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0) != 0)
100 err(1, "PR_SET_KEEPCAPS");
101 if (setresuid(inner_uid, inner_uid, -1) != 0)
102 err(1, "setresuid");
103
104 // Re-enable effective caps
105 capng_get_caps_process();
106 for (i = 0; i < CAP_LAST_CAP; i++)
107 if (capng_have_capability(CAPNG_PERMITTED, i))
108 capng_update(CAPNG_ADD, CAPNG_EFFECTIVE, i);
109 if (capng_apply(CAPNG_SELECT_CAPS) != 0)
110 err(1, "capng_apply");
111
112 have_outer_privilege = true;
113 } else if (unshare(CLONE_NEWUSER | CLONE_NEWNS) == 0) {
114 printf("[NOTE]\tUsing a user namespace for tests\n");
115 maybe_write_file("/proc/self/setgroups", "deny");
116 write_file("/proc/self/uid_map", "%d %d 1", inner_uid, outer_uid);
117 write_file("/proc/self/gid_map", "0 %d 1", outer_gid);
118
119 have_outer_privilege = false;
120 } else {
121 errx(1, "must be root or be able to create a userns");
122 }
123
124 if (mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL) != 0)
125 err(1, "remount everything private");
126
127 return have_outer_privilege;
128}
129
130static void chdir_to_tmpfs(void)
131{
132 char cwd[PATH_MAX];
133 if (getcwd(cwd, sizeof(cwd)) != cwd)
134 err(1, "getcwd");
135
136 if (mount("private_tmp", ".", "tmpfs", 0, "mode=0777") != 0)
137 err(1, "mount private tmpfs");
138
139 if (chdir(cwd) != 0)
140 err(1, "chdir to private tmpfs");
141
142 if (umount2(".", MNT_DETACH) != 0)
143 err(1, "detach private tmpfs");
144}
145
146static void copy_fromat_to(int fromfd, const char *fromname, const char *toname)
147{
148 int from = openat(fromfd, fromname, O_RDONLY);
149 if (from == -1)
150 err(1, "open copy source");
151
152 int to = open(toname, O_CREAT | O_WRONLY | O_EXCL, 0700);
153
154 while (true) {
155 char buf[4096];
156 ssize_t sz = read(from, buf, sizeof(buf));
157 if (sz == 0)
158 break;
159 if (sz < 0)
160 err(1, "read");
161
162 if (write(to, buf, sz) != sz)
163 err(1, "write"); /* no short writes on tmpfs */
164 }
165
166 close(from);
167 close(to);
168}
169
170static bool fork_wait(void)
171{
172 pid_t child = fork();
173 if (child == 0) {
174 nerrs = 0;
175 return true;
176 } else if (child > 0) {
177 int status;
178 if (waitpid(child, &status, 0) != child ||
179 !WIFEXITED(status)) {
180 printf("[FAIL]\tChild died\n");
181 nerrs++;
182 } else if (WEXITSTATUS(status) != 0) {
183 printf("[FAIL]\tChild failed\n");
184 nerrs++;
185 } else {
186 printf("[OK]\tChild succeeded\n");
187 }
188
189 return false;
190 } else {
191 err(1, "fork");
192 }
193}
194
195static void exec_other_validate_cap(const char *name,
196 bool eff, bool perm, bool inh, bool ambient)
197{
198 execl(name, name, (eff ? "1" : "0"),
199 (perm ? "1" : "0"), (inh ? "1" : "0"), (ambient ? "1" : "0"),
200 NULL);
201 err(1, "execl");
202}
203
204static void exec_validate_cap(bool eff, bool perm, bool inh, bool ambient)
205{
206 exec_other_validate_cap("./validate_cap", eff, perm, inh, ambient);
207}
208
209static int do_tests(int uid, const char *our_path)
210{
211 bool have_outer_privilege = create_and_enter_ns(uid);
212
213 int ourpath_fd = open(our_path, O_RDONLY | O_DIRECTORY);
214 if (ourpath_fd == -1)
215 err(1, "open '%s'", our_path);
216
217 chdir_to_tmpfs();
218
219 copy_fromat_to(ourpath_fd, "validate_cap", "validate_cap");
220
221 if (have_outer_privilege) {
222 uid_t gid = getegid();
223
224 copy_fromat_to(ourpath_fd, "validate_cap",
225 "validate_cap_suidroot");
226 if (chown("validate_cap_suidroot", 0, -1) != 0)
227 err(1, "chown");
228 if (chmod("validate_cap_suidroot", S_ISUID | 0700) != 0)
229 err(1, "chmod");
230
231 copy_fromat_to(ourpath_fd, "validate_cap",
232 "validate_cap_suidnonroot");
233 if (chown("validate_cap_suidnonroot", uid + 1, -1) != 0)
234 err(1, "chown");
235 if (chmod("validate_cap_suidnonroot", S_ISUID | 0700) != 0)
236 err(1, "chmod");
237
238 copy_fromat_to(ourpath_fd, "validate_cap",
239 "validate_cap_sgidroot");
240 if (chown("validate_cap_sgidroot", -1, 0) != 0)
241 err(1, "chown");
242 if (chmod("validate_cap_sgidroot", S_ISGID | 0710) != 0)
243 err(1, "chmod");
244
245 copy_fromat_to(ourpath_fd, "validate_cap",
246 "validate_cap_sgidnonroot");
247 if (chown("validate_cap_sgidnonroot", -1, gid + 1) != 0)
248 err(1, "chown");
249 if (chmod("validate_cap_sgidnonroot", S_ISGID | 0710) != 0)
250 err(1, "chmod");
251}
252
253 capng_get_caps_process();
254
255 /* Make sure that i starts out clear */
256 capng_update(CAPNG_DROP, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE);
257 if (capng_apply(CAPNG_SELECT_CAPS) != 0)
258 err(1, "capng_apply");
259
260 if (uid == 0) {
261 printf("[RUN]\tRoot => ep\n");
262 if (fork_wait())
263 exec_validate_cap(true, true, false, false);
264 } else {
265 printf("[RUN]\tNon-root => no caps\n");
266 if (fork_wait())
267 exec_validate_cap(false, false, false, false);
268 }
269
270 printf("[OK]\tCheck cap_ambient manipulation rules\n");
271
272 /* We should not be able to add ambient caps yet. */
273 if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != -1 || errno != EPERM) {
274 if (errno == EINVAL)
275 printf("[FAIL]\tPR_CAP_AMBIENT_RAISE isn't supported\n");
276 else
277 printf("[FAIL]\tPR_CAP_AMBIENT_RAISE should have failed with EPERM on a non-inheritable cap\n");
278 return 1;
279 }
280 printf("[OK]\tPR_CAP_AMBIENT_RAISE failed on non-inheritable cap\n");
281
282 capng_update(CAPNG_ADD, CAPNG_INHERITABLE, CAP_NET_RAW);
283 capng_update(CAPNG_DROP, CAPNG_PERMITTED, CAP_NET_RAW);
284 capng_update(CAPNG_DROP, CAPNG_EFFECTIVE, CAP_NET_RAW);
285 if (capng_apply(CAPNG_SELECT_CAPS) != 0)
286 err(1, "capng_apply");
287 if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_RAW, 0, 0, 0) != -1 || errno != EPERM) {
288 printf("[FAIL]\tPR_CAP_AMBIENT_RAISE should have failed on a non-permitted cap\n");
289 return 1;
290 }
291 printf("[OK]\tPR_CAP_AMBIENT_RAISE failed on non-permitted cap\n");
292
293 capng_update(CAPNG_ADD, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE);
294 if (capng_apply(CAPNG_SELECT_CAPS) != 0)
295 err(1, "capng_apply");
296 if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) {
297 printf("[FAIL]\tPR_CAP_AMBIENT_RAISE should have succeeded\n");
298 return 1;
299 }
300 printf("[OK]\tPR_CAP_AMBIENT_RAISE worked\n");
301
302 if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != 1) {
303 printf("[FAIL]\tPR_CAP_AMBIENT_IS_SET is broken\n");
304 return 1;
305 }
306
307 if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0, 0) != 0)
308 err(1, "PR_CAP_AMBIENT_CLEAR_ALL");
309
310 if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) {
311 printf("[FAIL]\tPR_CAP_AMBIENT_CLEAR_ALL didn't work\n");
312 return 1;
313 }
314
315 if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0)
316 err(1, "PR_CAP_AMBIENT_RAISE");
317
318 capng_update(CAPNG_DROP, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE);
319 if (capng_apply(CAPNG_SELECT_CAPS) != 0)
320 err(1, "capng_apply");
321
322 if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) {
323 printf("[FAIL]\tDropping I should have dropped A\n");
324 return 1;
325 }
326
327 printf("[OK]\tBasic manipulation appears to work\n");
328
329 capng_update(CAPNG_ADD, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE);
330 if (capng_apply(CAPNG_SELECT_CAPS) != 0)
331 err(1, "capng_apply");
332 if (uid == 0) {
333 printf("[RUN]\tRoot +i => eip\n");
334 if (fork_wait())
335 exec_validate_cap(true, true, true, false);
336 } else {
337 printf("[RUN]\tNon-root +i => i\n");
338 if (fork_wait())
339 exec_validate_cap(false, false, true, false);
340 }
341
342 if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0)
343 err(1, "PR_CAP_AMBIENT_RAISE");
344
345 printf("[RUN]\tUID %d +ia => eipa\n", uid);
346 if (fork_wait())
347 exec_validate_cap(true, true, true, true);
348
349 /* The remaining tests need real privilege */
350
351 if (!have_outer_privilege) {
352 printf("[SKIP]\tSUID/SGID tests (needs privilege)\n");
353 goto done;
354 }
355
356 if (uid == 0) {
357 printf("[RUN]\tRoot +ia, suidroot => eipa\n");
358 if (fork_wait())
359 exec_other_validate_cap("./validate_cap_suidroot",
360 true, true, true, true);
361
362 printf("[RUN]\tRoot +ia, suidnonroot => ip\n");
363 if (fork_wait())
364 exec_other_validate_cap("./validate_cap_suidnonroot",
365 false, true, true, false);
366
367 printf("[RUN]\tRoot +ia, sgidroot => eipa\n");
368 if (fork_wait())
369 exec_other_validate_cap("./validate_cap_sgidroot",
370 true, true, true, true);
371
372 if (fork_wait()) {
373 printf("[RUN]\tRoot, gid != 0, +ia, sgidroot => eip\n");
374 if (setresgid(1, 1, 1) != 0)
375 err(1, "setresgid");
376 exec_other_validate_cap("./validate_cap_sgidroot",
377 true, true, true, false);
378 }
379
380 printf("[RUN]\tRoot +ia, sgidnonroot => eip\n");
381 if (fork_wait())
382 exec_other_validate_cap("./validate_cap_sgidnonroot",
383 true, true, true, false);
384 } else {
385 printf("[RUN]\tNon-root +ia, sgidnonroot => i\n");
386 exec_other_validate_cap("./validate_cap_sgidnonroot",
387 false, false, true, false);
388
389 if (fork_wait()) {
390 printf("[RUN]\tNon-root +ia, sgidroot => i\n");
391 if (setresgid(1, 1, 1) != 0)
392 err(1, "setresgid");
393 exec_other_validate_cap("./validate_cap_sgidroot",
394 false, false, true, false);
395 }
396 }
397
398done:
399 return nerrs ? 1 : 0;
400}
401
402int main(int argc, char **argv)
403{
404 char *tmp1, *tmp2, *our_path;
405
406 /* Find our path */
407 tmp1 = strdup(argv[0]);
408 if (!tmp1)
409 err(1, "strdup");
410 tmp2 = dirname(tmp1);
411 our_path = strdup(tmp2);
412 if (!our_path)
413 err(1, "strdup");
414 free(tmp1);
415
416 if (fork_wait()) {
417 printf("[RUN]\t+++ Tests with uid == 0 +++\n");
418 return do_tests(0, our_path);
419 }
420
421 if (fork_wait()) {
422 printf("[RUN]\t+++ Tests with uid != 0 +++\n");
423 return do_tests(1, our_path);
424 }
425
426 return nerrs ? 1 : 0;
427}
diff --git a/tools/testing/selftests/capabilities/validate_cap.c b/tools/testing/selftests/capabilities/validate_cap.c
new file mode 100644
index 000000000000..dd3c45f7b23c
--- /dev/null
+++ b/tools/testing/selftests/capabilities/validate_cap.c
@@ -0,0 +1,73 @@
1#include <cap-ng.h>
2#include <err.h>
3#include <linux/capability.h>
4#include <stdbool.h>
5#include <string.h>
6#include <stdio.h>
7#include <sys/prctl.h>
8#include <sys/auxv.h>
9
10#ifndef PR_CAP_AMBIENT
11#define PR_CAP_AMBIENT 47
12# define PR_CAP_AMBIENT_IS_SET 1
13# define PR_CAP_AMBIENT_RAISE 2
14# define PR_CAP_AMBIENT_LOWER 3
15# define PR_CAP_AMBIENT_CLEAR_ALL 4
16#endif
17
18#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 19)
19# define HAVE_GETAUXVAL
20#endif
21
22static bool bool_arg(char **argv, int i)
23{
24 if (!strcmp(argv[i], "0"))
25 return false;
26 else if (!strcmp(argv[i], "1"))
27 return true;
28 else
29 errx(1, "wrong argv[%d]", i);
30}
31
32int main(int argc, char **argv)
33{
34 const char *atsec = "";
35
36 /*
37 * Be careful just in case a setgid or setcapped copy of this
38 * helper gets out.
39 */
40
41 if (argc != 5)
42 errx(1, "wrong argc");
43
44#ifdef HAVE_GETAUXVAL
45 if (getauxval(AT_SECURE))
46 atsec = " (AT_SECURE is set)";
47 else
48 atsec = " (AT_SECURE is not set)";
49#endif
50
51 capng_get_caps_process();
52
53 if (capng_have_capability(CAPNG_EFFECTIVE, CAP_NET_BIND_SERVICE) != bool_arg(argv, 1)) {
54 printf("[FAIL]\tWrong effective state%s\n", atsec);
55 return 1;
56 }
57 if (capng_have_capability(CAPNG_PERMITTED, CAP_NET_BIND_SERVICE) != bool_arg(argv, 2)) {
58 printf("[FAIL]\tWrong permitted state%s\n", atsec);
59 return 1;
60 }
61 if (capng_have_capability(CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE) != bool_arg(argv, 3)) {
62 printf("[FAIL]\tWrong inheritable state%s\n", atsec);
63 return 1;
64 }
65
66 if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != bool_arg(argv, 4)) {
67 printf("[FAIL]\tWrong ambient state%s\n", atsec);
68 return 1;
69 }
70
71 printf("[OK]\tCapabilities after execve were correct\n");
72 return 0;
73}
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index 231b9a031f6a..0d6854744b37 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -8,10 +8,13 @@ BINARIES += hugetlbfstest
8BINARIES += map_hugetlb 8BINARIES += map_hugetlb
9BINARIES += thuge-gen 9BINARIES += thuge-gen
10BINARIES += transhuge-stress 10BINARIES += transhuge-stress
11BINARIES += userfaultfd
11 12
12all: $(BINARIES) 13all: $(BINARIES)
13%: %.c 14%: %.c
14 $(CC) $(CFLAGS) -o $@ $^ -lrt 15 $(CC) $(CFLAGS) -o $@ $^ -lrt
16userfaultfd: userfaultfd.c
17 $(CC) $(CFLAGS) -O2 -o $@ $^ -lpthread
15 18
16TEST_PROGS := run_vmtests 19TEST_PROGS := run_vmtests
17TEST_FILES := $(BINARIES) 20TEST_FILES := $(BINARIES)
diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests
index 49ece11ff7fd..831adeb5fc55 100755
--- a/tools/testing/selftests/vm/run_vmtests
+++ b/tools/testing/selftests/vm/run_vmtests
@@ -86,6 +86,17 @@ else
86 echo "[PASS]" 86 echo "[PASS]"
87fi 87fi
88 88
89echo "--------------------"
90echo "running userfaultfd"
91echo "--------------------"
92./userfaultfd 128 32
93if [ $? -ne 0 ]; then
94 echo "[FAIL]"
95 exitcode=1
96else
97 echo "[PASS]"
98fi
99
89#cleanup 100#cleanup
90umount $mnt 101umount $mnt
91rm -rf $mnt 102rm -rf $mnt
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
new file mode 100644
index 000000000000..0c0b83953352
--- /dev/null
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -0,0 +1,636 @@
1/*
2 * Stress userfaultfd syscall.
3 *
4 * Copyright (C) 2015 Red Hat, Inc.
5 *
6 * This work is licensed under the terms of the GNU GPL, version 2. See
7 * the COPYING file in the top-level directory.
8 *
9 * This test allocates two virtual areas and bounces the physical
10 * memory across the two virtual areas (from area_src to area_dst)
11 * using userfaultfd.
12 *
13 * There are three threads running per CPU:
14 *
15 * 1) one per-CPU thread takes a per-page pthread_mutex in a random
16 * page of the area_dst (while the physical page may still be in
17 * area_src), and increments a per-page counter in the same page,
18 * and checks its value against a verification region.
19 *
20 * 2) another per-CPU thread handles the userfaults generated by
21 * thread 1 above. userfaultfd blocking reads or poll() modes are
22 * exercised interleaved.
23 *
24 * 3) one last per-CPU thread transfers the memory in the background
25 * at maximum bandwidth (if not already transferred by thread
26 * 2). Each cpu thread takes care of transferring a portion of the
27 * area.
28 *
29 * When all threads of type 3 completed the transfer, one bounce is
30 * complete. area_src and area_dst are then swapped. All threads are
31 * respawned and so the bounce is immediately restarted in the
32 * opposite direction.
33 *
34 * The per-CPU threads of type 1, by triggering userfaults inside
35 * pthread_mutex_lock, will also verify the atomicity of the memory
36 * transfer (UFFDIO_COPY).
37 *
38 * The program takes two parameters: the amounts of physical memory in
39 * megabytes (MiB) of the area and the number of bounces to execute.
40 *
41 * # 100MiB 99999 bounces
42 * ./userfaultfd 100 99999
43 *
44 * # 1GiB 99 bounces
45 * ./userfaultfd 1000 99
46 *
47 * # 10MiB-~6GiB 999 bounces, continue forever unless an error triggers
48 * while ./userfaultfd $[RANDOM % 6000 + 10] 999; do true; done
49 */
50
51#define _GNU_SOURCE
52#include <stdio.h>
53#include <errno.h>
54#include <unistd.h>
55#include <stdlib.h>
56#include <sys/types.h>
57#include <sys/stat.h>
58#include <fcntl.h>
59#include <time.h>
60#include <signal.h>
61#include <poll.h>
62#include <string.h>
63#include <sys/mman.h>
64#include <sys/syscall.h>
65#include <sys/ioctl.h>
66#include <pthread.h>
67#include "../../../../include/uapi/linux/userfaultfd.h"
68
69#ifdef __x86_64__
70#define __NR_userfaultfd 323
71#elif defined(__i386__)
72#define __NR_userfaultfd 359
73#elif defined(__powerpc__)
74#define __NR_userfaultfd 364
75#else
76#error "missing __NR_userfaultfd definition"
77#endif
78
79static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;
80
81#define BOUNCE_RANDOM (1<<0)
82#define BOUNCE_RACINGFAULTS (1<<1)
83#define BOUNCE_VERIFY (1<<2)
84#define BOUNCE_POLL (1<<3)
85static int bounces;
86
87static unsigned long long *count_verify;
88static int uffd, finished, *pipefd;
89static char *area_src, *area_dst;
90static char *zeropage;
91pthread_attr_t attr;
92
93/* pthread_mutex_t starts at page offset 0 */
94#define area_mutex(___area, ___nr) \
95 ((pthread_mutex_t *) ((___area) + (___nr)*page_size))
96/*
97 * count is placed in the page after pthread_mutex_t naturally aligned
98 * to avoid non alignment faults on non-x86 archs.
99 */
100#define area_count(___area, ___nr) \
101 ((volatile unsigned long long *) ((unsigned long) \
102 ((___area) + (___nr)*page_size + \
103 sizeof(pthread_mutex_t) + \
104 sizeof(unsigned long long) - 1) & \
105 ~(unsigned long)(sizeof(unsigned long long) \
106 - 1)))
107
108static int my_bcmp(char *str1, char *str2, size_t n)
109{
110 unsigned long i;
111 for (i = 0; i < n; i++)
112 if (str1[i] != str2[i])
113 return 1;
114 return 0;
115}
116
117static void *locking_thread(void *arg)
118{
119 unsigned long cpu = (unsigned long) arg;
120 struct random_data rand;
121 unsigned long page_nr = *(&(page_nr)); /* uninitialized warning */
122 int32_t rand_nr;
123 unsigned long long count;
124 char randstate[64];
125 unsigned int seed;
126 time_t start;
127
128 if (bounces & BOUNCE_RANDOM) {
129 seed = (unsigned int) time(NULL) - bounces;
130 if (!(bounces & BOUNCE_RACINGFAULTS))
131 seed += cpu;
132 bzero(&rand, sizeof(rand));
133 bzero(&randstate, sizeof(randstate));
134 if (initstate_r(seed, randstate, sizeof(randstate), &rand))
135 fprintf(stderr, "srandom_r error\n"), exit(1);
136 } else {
137 page_nr = -bounces;
138 if (!(bounces & BOUNCE_RACINGFAULTS))
139 page_nr += cpu * nr_pages_per_cpu;
140 }
141
142 while (!finished) {
143 if (bounces & BOUNCE_RANDOM) {
144 if (random_r(&rand, &rand_nr))
145 fprintf(stderr, "random_r 1 error\n"), exit(1);
146 page_nr = rand_nr;
147 if (sizeof(page_nr) > sizeof(rand_nr)) {
148 if (random_r(&rand, &rand_nr))
149 fprintf(stderr, "random_r 2 error\n"), exit(1);
150 page_nr |= ((unsigned long) rand_nr) << 32;
151 }
152 } else
153 page_nr += 1;
154 page_nr %= nr_pages;
155
156 start = time(NULL);
157 if (bounces & BOUNCE_VERIFY) {
158 count = *area_count(area_dst, page_nr);
159 if (!count)
160 fprintf(stderr,
161 "page_nr %lu wrong count %Lu %Lu\n",
162 page_nr, count,
163 count_verify[page_nr]), exit(1);
164
165
166 /*
167 * We can't use bcmp (or memcmp) because that
168 * returns 0 erroneously if the memory is
169 * changing under it (even if the end of the
170 * page is never changing and always
171 * different).
172 */
173#if 1
174 if (!my_bcmp(area_dst + page_nr * page_size, zeropage,
175 page_size))
176 fprintf(stderr,
177 "my_bcmp page_nr %lu wrong count %Lu %Lu\n",
178 page_nr, count,
179 count_verify[page_nr]), exit(1);
180#else
181 unsigned long loops;
182
183 loops = 0;
184 /* uncomment the below line to test with mutex */
185 /* pthread_mutex_lock(area_mutex(area_dst, page_nr)); */
186 while (!bcmp(area_dst + page_nr * page_size, zeropage,
187 page_size)) {
188 loops += 1;
189 if (loops > 10)
190 break;
191 }
192 /* uncomment below line to test with mutex */
193 /* pthread_mutex_unlock(area_mutex(area_dst, page_nr)); */
194 if (loops) {
195 fprintf(stderr,
196 "page_nr %lu all zero thread %lu %p %lu\n",
197 page_nr, cpu, area_dst + page_nr * page_size,
198 loops);
199 if (loops > 10)
200 exit(1);
201 }
202#endif
203 }
204
205 pthread_mutex_lock(area_mutex(area_dst, page_nr));
206 count = *area_count(area_dst, page_nr);
207 if (count != count_verify[page_nr]) {
208 fprintf(stderr,
209 "page_nr %lu memory corruption %Lu %Lu\n",
210 page_nr, count,
211 count_verify[page_nr]), exit(1);
212 }
213 count++;
214 *area_count(area_dst, page_nr) = count_verify[page_nr] = count;
215 pthread_mutex_unlock(area_mutex(area_dst, page_nr));
216
217 if (time(NULL) - start > 1)
218 fprintf(stderr,
219 "userfault too slow %ld "
220 "possible false positive with overcommit\n",
221 time(NULL) - start);
222 }
223
224 return NULL;
225}
226
227static int copy_page(unsigned long offset)
228{
229 struct uffdio_copy uffdio_copy;
230
231 if (offset >= nr_pages * page_size)
232 fprintf(stderr, "unexpected offset %lu\n",
233 offset), exit(1);
234 uffdio_copy.dst = (unsigned long) area_dst + offset;
235 uffdio_copy.src = (unsigned long) area_src + offset;
236 uffdio_copy.len = page_size;
237 uffdio_copy.mode = 0;
238 uffdio_copy.copy = 0;
239 if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy)) {
240 /* real retval in uffdio_copy.copy */
241 if (uffdio_copy.copy != -EEXIST)
242 fprintf(stderr, "UFFDIO_COPY error %Ld\n",
243 uffdio_copy.copy), exit(1);
244 } else if (uffdio_copy.copy != page_size) {
245 fprintf(stderr, "UFFDIO_COPY unexpected copy %Ld\n",
246 uffdio_copy.copy), exit(1);
247 } else
248 return 1;
249 return 0;
250}
251
252static void *uffd_poll_thread(void *arg)
253{
254 unsigned long cpu = (unsigned long) arg;
255 struct pollfd pollfd[2];
256 struct uffd_msg msg;
257 int ret;
258 unsigned long offset;
259 char tmp_chr;
260 unsigned long userfaults = 0;
261
262 pollfd[0].fd = uffd;
263 pollfd[0].events = POLLIN;
264 pollfd[1].fd = pipefd[cpu*2];
265 pollfd[1].events = POLLIN;
266
267 for (;;) {
268 ret = poll(pollfd, 2, -1);
269 if (!ret)
270 fprintf(stderr, "poll error %d\n", ret), exit(1);
271 if (ret < 0)
272 perror("poll"), exit(1);
273 if (pollfd[1].revents & POLLIN) {
274 if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
275 fprintf(stderr, "read pipefd error\n"),
276 exit(1);
277 break;
278 }
279 if (!(pollfd[0].revents & POLLIN))
280 fprintf(stderr, "pollfd[0].revents %d\n",
281 pollfd[0].revents), exit(1);
282 ret = read(uffd, &msg, sizeof(msg));
283 if (ret < 0) {
284 if (errno == EAGAIN)
285 continue;
286 perror("nonblocking read error"), exit(1);
287 }
288 if (msg.event != UFFD_EVENT_PAGEFAULT)
289 fprintf(stderr, "unexpected msg event %u\n",
290 msg.event), exit(1);
291 if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
292 fprintf(stderr, "unexpected write fault\n"), exit(1);
293 offset = (char *)msg.arg.pagefault.address - area_dst;
294 offset &= ~(page_size-1);
295 if (copy_page(offset))
296 userfaults++;
297 }
298 return (void *)userfaults;
299}
300
301pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
302
303static void *uffd_read_thread(void *arg)
304{
305 unsigned long *this_cpu_userfaults;
306 struct uffd_msg msg;
307 unsigned long offset;
308 int ret;
309
310 this_cpu_userfaults = (unsigned long *) arg;
311 *this_cpu_userfaults = 0;
312
313 pthread_mutex_unlock(&uffd_read_mutex);
314 /* from here cancellation is ok */
315
316 for (;;) {
317 ret = read(uffd, &msg, sizeof(msg));
318 if (ret != sizeof(msg)) {
319 if (ret < 0)
320 perror("blocking read error"), exit(1);
321 else
322 fprintf(stderr, "short read\n"), exit(1);
323 }
324 if (msg.event != UFFD_EVENT_PAGEFAULT)
325 fprintf(stderr, "unexpected msg event %u\n",
326 msg.event), exit(1);
327 if (bounces & BOUNCE_VERIFY &&
328 msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
329 fprintf(stderr, "unexpected write fault\n"), exit(1);
330 offset = (char *)msg.arg.pagefault.address - area_dst;
331 offset &= ~(page_size-1);
332 if (copy_page(offset))
333 (*this_cpu_userfaults)++;
334 }
335 return (void *)NULL;
336}
337
338static void *background_thread(void *arg)
339{
340 unsigned long cpu = (unsigned long) arg;
341 unsigned long page_nr;
342
343 for (page_nr = cpu * nr_pages_per_cpu;
344 page_nr < (cpu+1) * nr_pages_per_cpu;
345 page_nr++)
346 copy_page(page_nr * page_size);
347
348 return NULL;
349}
350
static int stress(unsigned long *userfaults)
{
	unsigned long cpu;
	pthread_t locking_threads[nr_cpus];
	pthread_t uffd_threads[nr_cpus];
	pthread_t background_threads[nr_cpus];
	void **_userfaults = (void **) userfaults;

	finished = 0;
	for (cpu = 0; cpu < nr_cpus; cpu++) {
		if (pthread_create(&locking_threads[cpu], &attr,
				   locking_thread, (void *)cpu))
			return 1;
		if (bounces & BOUNCE_POLL) {
			if (pthread_create(&uffd_threads[cpu], &attr,
					   uffd_poll_thread, (void *)cpu))
				return 1;
		} else {
			if (pthread_create(&uffd_threads[cpu], &attr,
					   uffd_read_thread,
					   &_userfaults[cpu]))
				return 1;
			pthread_mutex_lock(&uffd_read_mutex);
		}
		if (pthread_create(&background_threads[cpu], &attr,
				   background_thread, (void *)cpu))
			return 1;
	}
	for (cpu = 0; cpu < nr_cpus; cpu++)
		if (pthread_join(background_threads[cpu], NULL))
			return 1;

	/*
	 * Be strict and immediately zap area_src: the whole area has
	 * already been transferred by the background threads. The
	 * area_src may then be faulted in again, racily, by the still
	 * running uffd threads reading zeropages after we zapped it
	 * (but they're guaranteed to get -EEXIST from UFFDIO_COPY
	 * without writing zero pages into area_dst, because the
	 * background threads already completed).
	 */
	if (madvise(area_src, nr_pages * page_size, MADV_DONTNEED)) {
		perror("madvise");
		return 1;
	}

	for (cpu = 0; cpu < nr_cpus; cpu++) {
		char c;
		if (bounces & BOUNCE_POLL) {
			if (write(pipefd[cpu*2+1], &c, 1) != 1) {
				fprintf(stderr, "pipefd write error\n");
				return 1;
			}
			if (pthread_join(uffd_threads[cpu], &_userfaults[cpu]))
				return 1;
		} else {
			if (pthread_cancel(uffd_threads[cpu]))
				return 1;
			if (pthread_join(uffd_threads[cpu], NULL))
				return 1;
		}
	}

	finished = 1;
	for (cpu = 0; cpu < nr_cpus; cpu++)
		if (pthread_join(locking_threads[cpu], NULL))
			return 1;

	return 0;
}

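/*
 * Main test body: allocate area_src and area_dst, open the
 * userfaultfd, and for each bounce register area_dst, run one
 * stress() pass, unregister, optionally verify the result and swap
 * the two areas for the next bounce.
 */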
static int userfaultfd_stress(void)
{
	void *area;
	char *tmp_area;
	unsigned long nr;
	struct uffdio_register uffdio_register;
	struct uffdio_api uffdio_api;
	unsigned long cpu;
	int uffd_flags;
	unsigned long userfaults[nr_cpus];

	if (posix_memalign(&area, page_size, nr_pages * page_size)) {
		fprintf(stderr, "out of memory\n");
		return 1;
	}
	area_src = area;
	if (posix_memalign(&area, page_size, nr_pages * page_size)) {
		fprintf(stderr, "out of memory\n");
		return 1;
	}
	area_dst = area;

	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	if (uffd < 0) {
		fprintf(stderr,
			"userfaultfd syscall not available in this kernel\n");
		return 1;
	}
	uffd_flags = fcntl(uffd, F_GETFL, NULL);

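	/* handshake with the kernel: request UFFD_API and no extra features */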
	uffdio_api.api = UFFD_API;
	uffdio_api.features = 0;
	if (ioctl(uffd, UFFDIO_API, &uffdio_api)) {
		fprintf(stderr, "UFFDIO_API\n");
		return 1;
	}
	if (uffdio_api.api != UFFD_API) {
		fprintf(stderr, "UFFDIO_API error %Lu\n", uffdio_api.api);
		return 1;
	}

	count_verify = malloc(nr_pages * sizeof(unsigned long long));
	if (!count_verify) {
		perror("count_verify");
		return 1;
	}

	for (nr = 0; nr < nr_pages; nr++) {
		*area_mutex(area_src, nr) = (pthread_mutex_t)
			PTHREAD_MUTEX_INITIALIZER;
		count_verify[nr] = *area_count(area_src, nr) = 1;
	}

	pipefd = malloc(sizeof(int) * nr_cpus * 2);
	if (!pipefd) {
		perror("pipefd");
		return 1;
	}
	for (cpu = 0; cpu < nr_cpus; cpu++) {
		if (pipe2(&pipefd[cpu*2], O_CLOEXEC | O_NONBLOCK)) {
			perror("pipe");
			return 1;
		}
	}

	if (posix_memalign(&area, page_size, page_size)) {
		fprintf(stderr, "out of memory\n");
		return 1;
	}
	zeropage = area;
	bzero(zeropage, page_size);

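	/*
	 * uffd_read_mutex starts out locked so stress() can wait for
	 * each uffd_read_thread to finish its setup; all threads get a
	 * 16MiB stack.
	 */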
	pthread_mutex_lock(&uffd_read_mutex);

	pthread_attr_init(&attr);
	pthread_attr_setstacksize(&attr, 16*1024*1024);

	while (bounces--) {
		unsigned long expected_ioctls;

		printf("bounces: %d, mode:", bounces);
		if (bounces & BOUNCE_RANDOM)
			printf(" rnd");
		if (bounces & BOUNCE_RACINGFAULTS)
			printf(" racing");
		if (bounces & BOUNCE_VERIFY)
			printf(" ver");
		if (bounces & BOUNCE_POLL)
			printf(" poll");
		printf(", ");
		fflush(stdout);

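		/*
		 * The poll flavour shares the uffd between readers, so
		 * reads must not block; the blocking-read flavour
		 * relies on read() blocking until a userfault arrives.
		 */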
		if (bounces & BOUNCE_POLL)
			fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
		else
			fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);

		/* register */
		uffdio_register.range.start = (unsigned long) area_dst;
		uffdio_register.range.len = nr_pages * page_size;
		uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
		if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
			fprintf(stderr, "register failure\n");
			return 1;
		}
		expected_ioctls = (1 << _UFFDIO_WAKE) |
				  (1 << _UFFDIO_COPY) |
				  (1 << _UFFDIO_ZEROPAGE);
		if ((uffdio_register.ioctls & expected_ioctls) !=
		    expected_ioctls) {
			fprintf(stderr,
				"unexpected missing ioctl for anon memory\n");
			return 1;
		}

		/*
		 * The madvise done previously isn't enough: some
		 * uffd_thread could have read a userfault (one of
		 * those already resolved by the background thread)
		 * and may still be in the middle of calling
		 * UFFDIO_COPY. That UFFDIO_COPY will read the zapped
		 * area_src and map a zero page into it (which is
		 * perfectly safe by itself, as the UFFDIO_COPY simply
		 * returns -EEXIST). The problem comes at the next
		 * bounce though: the zeropages generated in area_src
		 * by that racing UFFDIO_COPY invalidate the previous
		 * MADV_DONTNEED. Without this additional
		 * MADV_DONTNEED, those leftover zeropages in area_src
		 * would lead to -EEXIST failures during the next
		 * bounce, effectively leaving a zeropage in area_dst.
		 *
		 * Try commenting out this madvise to see the memory
		 * corruption being caught pretty quickly.
		 *
		 * khugepaged is also only inhibited from collapsing
		 * THP over an MADV_DONTNEED range after the
		 * UFFDIO_REGISTER, so the MADV_DONTNEED is required
		 * here, after the registration.
		 */
		if (madvise(area_dst, nr_pages * page_size, MADV_DONTNEED)) {
			perror("madvise 2");
			return 1;
		}

		/* bounce pass */
		if (stress(userfaults))
			return 1;

		/* unregister */
		if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) {
			fprintf(stderr, "unregister failure\n");
			return 1;
		}

		/* verification */
		if (bounces & BOUNCE_VERIFY) {
			for (nr = 0; nr < nr_pages; nr++) {
				if (my_bcmp(area_dst,
					    area_dst + nr * page_size,
					    sizeof(pthread_mutex_t))) {
					fprintf(stderr,
						"error mutex 2 %lu\n",
						nr);
					bounces = 0;
				}
				if (*area_count(area_dst, nr) != count_verify[nr]) {
					fprintf(stderr,
						"error area_count %Lu %Lu %lu\n",
						*area_count(area_dst, nr),
						count_verify[nr],
						nr);
					bounces = 0;
				}
			}
		}

		/* prepare next bounce */
		tmp_area = area_src;
		area_src = area_dst;
		area_dst = tmp_area;

		printf("userfaults:");
		for (cpu = 0; cpu < nr_cpus; cpu++)
			printf(" %lu", userfaults[cpu]);
		printf("\n");
	}

	return 0;
}

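/*
 * Entry point: parse <MiB> (total size of each test area) and
 * <bounces>, derive the per-cpu page counts and run the stress test.
 */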
int main(int argc, char **argv)
{
	if (argc < 3)
		fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
	page_size = sysconf(_SC_PAGE_SIZE);
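	/* the per-page count written via area_count() must fit in one page */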
	if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) >
	    page_size)
		fprintf(stderr, "Impossible to run this test\n"), exit(2);
	nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size /
		nr_cpus;
	if (!nr_pages_per_cpu) {
		fprintf(stderr, "invalid MiB\n");
		fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
	}
	bounces = atoi(argv[2]);
	if (bounces <= 0) {
		fprintf(stderr, "invalid bounces\n");
		fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
	}
	nr_pages = nr_pages_per_cpu * nr_cpus;
	printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
	       nr_pages, nr_pages_per_cpu);
	return userfaultfd_stress();
}