author    Linus Torvalds <torvalds@linux-foundation.org>  2015-09-05 17:27:38 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2015-09-05 17:27:38 -0400
commit    6c0f568e84a3cfc775682311d65205462c3f3bc1 (patch)
tree      5105a137a9ea2459d55e895d3c096bbd31274724
parent    c82199061009d1561e31e17fca5e47a87cb7ff4c (diff)
parent    559ec2f8fd50981821621f52db5e1a8ffcf8d792 (diff)
Merge branch 'akpm' (patches from Andrew)

Merge patch-bomb from Andrew Morton:

 - a few misc things

 - Andy's "ambient capabilities"

 - fs/notify updates

 - the ocfs2 queue

 - kernel/watchdog.c updates and feature work.

 - some of MM.  Includes Andrea's userfaultfd feature.

   [ Hadn't noticed that userfaultfd was 'default y' when applying the
     patches, so that got fixed in this merge instead.  We do _not_ mark
     new features that nobody uses yet 'default y'   - Linus ]

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (118 commits)
  mm/hugetlb.c: make vma_has_reserves() return bool
  mm/madvise.c: make madvise_behaviour_valid() return bool
  mm/memory.c: make tlb_next_batch() return bool
  mm/dmapool.c: change is_page_busy() return from int to bool
  mm: remove struct node_active_region
  mremap: simplify the "overlap" check in mremap_to()
  mremap: don't do uneccesary checks if new_len == old_len
  mremap: don't do mm_populate(new_addr) on failure
  mm: move ->mremap() from file_operations to vm_operations_struct
  mremap: don't leak new_vma if f_op->mremap() fails
  mm/hugetlb.c: make vma_shareable() return bool
  mm: make GUP handle pfn mapping unless FOLL_GET is requested
  mm: fix status code which move_pages() returns for zero page
  mm: memcontrol: bring back the VM_BUG_ON() in mem_cgroup_swapout()
  genalloc: add support of multiple gen_pools per device
  genalloc: add name arg to gen_pool_get() and devm_gen_pool_create()
  mm/memblock: WARN_ON when nid differs from overlap region
  Documentation/features/vm: add feature description and arch support status for batched TLB flush after unmap
  mm: defer flush of writable TLB entries
  mm: send one IPI per CPU to TLB flush all entries after unmapping pages
  ...
-rw-r--r-- Documentation/features/vm/TLB/arch-support.txt | 40
-rw-r--r-- Documentation/ioctl/ioctl-number.txt | 1
-rw-r--r-- Documentation/vm/userfaultfd.txt | 144
-rw-r--r-- arch/arm/mach-at91/pm.c | 2
-rw-r--r-- arch/arm/mach-imx/pm-imx5.c | 2
-rw-r--r-- arch/arm/mach-imx/pm-imx6.c | 2
-rw-r--r-- arch/arm/mach-socfpga/pm.c | 2
-rw-r--r-- arch/sh/mm/init.c | 4
-rw-r--r-- arch/sh/mm/numa.c | 4
-rw-r--r-- arch/x86/Kconfig | 1
-rw-r--r-- arch/x86/entry/syscalls/syscall_32.tbl | 1
-rw-r--r-- arch/x86/entry/syscalls/syscall_64.tbl | 1
-rw-r--r-- arch/x86/include/asm/tlbflush.h | 6
-rw-r--r-- arch/x86/kernel/cpu/perf_event_intel.c | 9
-rw-r--r-- arch/x86/mm/tlb.c | 1
-rw-r--r-- drivers/base/node.c | 10
-rw-r--r-- drivers/media/platform/coda/coda-common.c | 2
-rw-r--r-- drivers/misc/sram.c | 8
-rw-r--r-- drivers/video/console/Kconfig | 2
-rw-r--r-- fs/Makefile | 1
-rw-r--r-- fs/aio.c | 27
-rw-r--r-- fs/ceph/super.c | 2
-rw-r--r-- fs/cifs/cifsfs.c | 6
-rw-r--r-- fs/ext4/super.c | 4
-rw-r--r-- fs/gfs2/super.c | 6
-rw-r--r-- fs/hfs/super.c | 4
-rw-r--r-- fs/hfsplus/options.c | 4
-rw-r--r-- fs/hostfs/hostfs_kern.c | 2
-rw-r--r-- fs/notify/dnotify/dnotify.c | 14
-rw-r--r-- fs/notify/fanotify/fanotify_user.c | 8
-rw-r--r-- fs/notify/fdinfo.c | 3
-rw-r--r-- fs/notify/fsnotify.c | 11
-rw-r--r-- fs/notify/fsnotify.h | 21
-rw-r--r-- fs/notify/inode_mark.c | 20
-rw-r--r-- fs/notify/mark.c | 113
-rw-r--r-- fs/notify/vfsmount_mark.c | 19
-rw-r--r-- fs/ntfs/super.c | 21
-rw-r--r-- fs/ocfs2/acl.c | 26
-rw-r--r-- fs/ocfs2/alloc.c | 148
-rw-r--r-- fs/ocfs2/aops.c | 54
-rw-r--r-- fs/ocfs2/buffer_head_io.c | 6
-rw-r--r-- fs/ocfs2/cluster/heartbeat.c | 69
-rw-r--r-- fs/ocfs2/dir.c | 70
-rw-r--r-- fs/ocfs2/dlm/dlmdomain.c | 78
-rw-r--r-- fs/ocfs2/dlm/dlmmaster.c | 22
-rw-r--r-- fs/ocfs2/dlm/dlmrecovery.c | 6
-rw-r--r-- fs/ocfs2/dlm/dlmthread.c | 10
-rw-r--r-- fs/ocfs2/dlmglue.c | 2
-rw-r--r-- fs/ocfs2/extent_map.c | 22
-rw-r--r-- fs/ocfs2/file.c | 53
-rw-r--r-- fs/ocfs2/inode.c | 49
-rw-r--r-- fs/ocfs2/inode.h | 2
-rw-r--r-- fs/ocfs2/journal.c | 32
-rw-r--r-- fs/ocfs2/localalloc.c | 3
-rw-r--r-- fs/ocfs2/move_extents.c | 8
-rw-r--r-- fs/ocfs2/namei.c | 96
-rw-r--r-- fs/ocfs2/ocfs2.h | 2
-rw-r--r-- fs/ocfs2/quota_local.c | 3
-rw-r--r-- fs/ocfs2/refcounttree.c | 81
-rw-r--r-- fs/ocfs2/suballoc.c | 96
-rw-r--r-- fs/ocfs2/super.c | 73
-rw-r--r-- fs/ocfs2/super.h | 8
-rw-r--r-- fs/ocfs2/xattr.c | 51
-rw-r--r-- fs/overlayfs/super.c | 6
-rw-r--r-- fs/proc/array.c | 5
-rw-r--r-- fs/proc/task_mmu.c | 2
-rw-r--r-- fs/reiserfs/super.c | 8
-rw-r--r-- fs/userfaultfd.c | 1330
-rw-r--r-- fs/xfs/xfs_super.c | 4
-rw-r--r-- include/linux/cred.h | 8
-rw-r--r-- include/linux/fs.h | 1
-rw-r--r-- include/linux/fsnotify_backend.h | 55
-rw-r--r-- include/linux/genalloc.h | 6
-rw-r--r-- include/linux/kthread.h | 2
-rw-r--r-- include/linux/mm.h | 5
-rw-r--r-- include/linux/mm_types.h | 12
-rw-r--r-- include/linux/mmzone.h | 8
-rw-r--r-- include/linux/nmi.h | 15
-rw-r--r-- include/linux/rmap.h | 3
-rw-r--r-- include/linux/sched.h | 23
-rw-r--r-- include/linux/seq_file.h | 35
-rw-r--r-- include/linux/slab.h | 10
-rw-r--r-- include/linux/smpboot.h | 11
-rw-r--r-- include/linux/syscalls.h | 1
-rw-r--r-- include/linux/userfaultfd_k.h | 85
-rw-r--r-- include/linux/wait.h | 5
-rw-r--r-- include/linux/watchdog.h | 8
-rw-r--r-- include/trace/events/tlb.h | 3
-rw-r--r-- include/uapi/linux/Kbuild | 1
-rw-r--r-- include/uapi/linux/prctl.h | 7
-rw-r--r-- include/uapi/linux/securebits.h | 11
-rw-r--r-- include/uapi/linux/userfaultfd.h | 169
-rw-r--r-- init/Kconfig | 18
-rw-r--r-- kernel/cgroup.c | 7
-rw-r--r-- kernel/fork.c | 3
-rw-r--r-- kernel/kthread.c | 7
-rw-r--r-- kernel/sched/wait.c | 7
-rw-r--r-- kernel/smpboot.c | 27
-rw-r--r-- kernel/sys_ni.c | 1
-rw-r--r-- kernel/user_namespace.c | 1
-rw-r--r-- kernel/watchdog.c | 189
-rw-r--r-- lib/genalloc.c | 110
-rw-r--r-- mm/Makefile | 1
-rw-r--r-- mm/dmapool.c | 2
-rw-r--r-- mm/gup.c | 60
-rw-r--r-- mm/huge_memory.c | 79
-rw-r--r-- mm/hugetlb.c | 18
-rw-r--r-- mm/internal.h | 15
-rw-r--r-- mm/madvise.c | 9
-rw-r--r-- mm/memblock.c | 3
-rw-r--r-- mm/memcontrol.c | 8
-rw-r--r-- mm/memory.c | 26
-rw-r--r-- mm/memory_hotplug.c | 10
-rw-r--r-- mm/mempolicy.c | 4
-rw-r--r-- mm/migrate.c | 18
-rw-r--r-- mm/mlock.c | 3
-rw-r--r-- mm/mmap.c | 40
-rw-r--r-- mm/mprotect.c | 3
-rw-r--r-- mm/mremap.c | 50
-rw-r--r-- mm/rmap.c | 118
-rw-r--r-- mm/slab.c | 13
-rw-r--r-- mm/slab.h | 11
-rw-r--r-- mm/slab_common.c | 23
-rw-r--r-- mm/slob.c | 13
-rw-r--r-- mm/slub.c | 204
-rw-r--r-- mm/userfaultfd.c | 308
-rw-r--r-- mm/vmscan.c | 30
-rw-r--r-- net/ceph/ceph_common.c | 7
-rw-r--r-- net/sunrpc/sched.c | 2
-rwxr-xr-x scripts/Lindent | 3
-rwxr-xr-x scripts/decode_stacktrace.sh | 5
-rwxr-xr-x scripts/kernel-doc | 38
-rw-r--r-- scripts/spelling.txt | 29
-rw-r--r-- security/commoncap.c | 103
-rw-r--r-- security/keys/process_keys.c | 1
-rw-r--r-- security/selinux/hooks.c | 2
-rw-r--r-- tools/testing/selftests/capabilities/.gitignore | 2
-rw-r--r-- tools/testing/selftests/capabilities/Makefile | 18
-rw-r--r-- tools/testing/selftests/capabilities/test_execve.c | 427
-rw-r--r-- tools/testing/selftests/capabilities/validate_cap.c | 73
-rw-r--r-- tools/testing/selftests/vm/Makefile | 3
-rwxr-xr-x tools/testing/selftests/vm/run_vmtests | 11
-rw-r--r-- tools/testing/selftests/vm/userfaultfd.c | 636
143 files changed, 5174 insertions, 1062 deletions
diff --git a/Documentation/features/vm/TLB/arch-support.txt b/Documentation/features/vm/TLB/arch-support.txt
new file mode 100644
index 000000000000..261b92e2fb1a
--- /dev/null
+++ b/Documentation/features/vm/TLB/arch-support.txt
@@ -0,0 +1,40 @@
#
# Feature name:          batch-unmap-tlb-flush
#         Kconfig:       ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
#         description:   arch supports deferral of TLB flush until multiple pages are unmapped
#
    -----------------------
    |         arch |status|
    -----------------------
    |       alpha: | TODO |
    |         arc: | TODO |
    |         arm: | TODO |
    |       arm64: | TODO |
    |       avr32: |  ..  |
    |    blackfin: | TODO |
    |         c6x: |  ..  |
    |        cris: |  ..  |
    |         frv: |  ..  |
    |       h8300: |  ..  |
    |     hexagon: | TODO |
    |        ia64: | TODO |
    |        m32r: | TODO |
    |        m68k: |  ..  |
    |       metag: | TODO |
    |  microblaze: |  ..  |
    |        mips: | TODO |
    |     mn10300: | TODO |
    |       nios2: |  ..  |
    |    openrisc: |  ..  |
    |      parisc: | TODO |
    |     powerpc: | TODO |
    |        s390: | TODO |
    |       score: |  ..  |
    |          sh: | TODO |
    |       sparc: | TODO |
    |        tile: | TODO |
    |          um: |  ..  |
    |   unicore32: |  ..  |
    |         x86: |  ok  |
    |      xtensa: | TODO |
    -----------------------
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index 64df08db4657..39ac6546d4a4 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -303,6 +303,7 @@ Code Seq#(hex) Include File Comments
 0xA3	80-8F	Port ACL		in development:
					<mailto:tlewis@mindspring.com>
 0xA3	90-9F	linux/dtlk.h
+0xAA	00-3F	linux/uapi/linux/userfaultfd.h
 0xAB	00-1F	linux/nbd.h
 0xAC	00-1F	linux/raw.h
 0xAD	00	Netfilter device	in development:
diff --git a/Documentation/vm/userfaultfd.txt b/Documentation/vm/userfaultfd.txt
new file mode 100644
index 000000000000..70a3c94d1941
--- /dev/null
+++ b/Documentation/vm/userfaultfd.txt
@@ -0,0 +1,144 @@
= Userfaultfd =

== Objective ==

Userfaults allow the implementation of on-demand paging from userland
and more generally they allow userland to take control of various
memory page faults, something otherwise only the kernel code could do.

For example userfaults allows a proper and more optimal implementation
of the PROT_NONE+SIGSEGV trick.

== Design ==

Userfaults are delivered and resolved through the userfaultfd syscall.

The userfaultfd (aside from registering and unregistering virtual
memory ranges) provides two primary functionalities:

1) read/POLLIN protocol to notify a userland thread of the faults
   happening

2) various UFFDIO_* ioctls that can manage the virtual memory regions
   registered in the userfaultfd that allows userland to efficiently
   resolve the userfaults it receives via 1) or to manage the virtual
   memory in the background

The real advantage of userfaults if compared to regular virtual memory
management of mremap/mprotect is that the userfaults in all their
operations never involve heavyweight structures like vmas (in fact the
userfaultfd runtime load never takes the mmap_sem for writing).

Vmas are not suitable for page- (or hugepage) granular fault tracking
when dealing with virtual address spaces that could span
Terabytes. Too many vmas would be needed for that.

The userfaultfd once opened by invoking the syscall, can also be
passed using unix domain sockets to a manager process, so the same
manager process could handle the userfaults of a multitude of
different processes without them being aware about what is going on
(well of course unless they later try to use the userfaultfd
themselves on the same region the manager is already tracking, which
is a corner case that would currently return -EBUSY).

== API ==

When first opened the userfaultfd must be enabled invoking the
UFFDIO_API ioctl specifying a uffdio_api.api value set to UFFD_API (or
a later API version) which will specify the read/POLLIN protocol
userland intends to speak on the UFFD and the uffdio_api.features
userland requires. The UFFDIO_API ioctl if successful (i.e. if the
requested uffdio_api.api is spoken also by the running kernel and the
requested features are going to be enabled) will return into
uffdio_api.features and uffdio_api.ioctls two 64bit bitmasks of
respectively all the available features of the read(2) protocol and
the generic ioctl available.

Once the userfaultfd has been enabled the UFFDIO_REGISTER ioctl should
be invoked (if present in the returned uffdio_api.ioctls bitmask) to
register a memory range in the userfaultfd by setting the
uffdio_register structure accordingly. The uffdio_register.mode
bitmask will specify to the kernel which kind of faults to track for
the range (UFFDIO_REGISTER_MODE_MISSING would track missing
pages). The UFFDIO_REGISTER ioctl will return the
uffdio_register.ioctls bitmask of ioctls that are suitable to resolve
userfaults on the range registered. Not all ioctls will necessarily be
supported for all memory types depending on the underlying virtual
memory backend (anonymous memory vs tmpfs vs real filebacked
mappings).

Userland can use the uffdio_register.ioctls to manage the virtual
address space in the background (to add or potentially also remove
memory from the userfaultfd registered range). This means a userfault
could be triggering just before userland maps in the background the
user-faulted page.

The primary ioctl to resolve userfaults is UFFDIO_COPY. That
atomically copies a page into the userfault registered range and wakes
up the blocked userfaults (unless uffdio_copy.mode &
UFFDIO_COPY_MODE_DONTWAKE is set). Other ioctl works similarly to
UFFDIO_COPY. They're atomic as in guaranteeing that nothing can see an
half copied page since it'll keep userfaulting until the copy has
finished.

== QEMU/KVM ==

QEMU/KVM is using the userfaultfd syscall to implement postcopy live
migration. Postcopy live migration is one form of memory
externalization consisting of a virtual machine running with part or
all of its memory residing on a different node in the cloud. The
userfaultfd abstraction is generic enough that not a single line of
KVM kernel code had to be modified in order to add postcopy live
migration to QEMU.

Guest async page faults, FOLL_NOWAIT and all other GUP features work
just fine in combination with userfaults. Userfaults trigger async
page faults in the guest scheduler so those guest processes that
aren't waiting for userfaults (i.e. network bound) can keep running in
the guest vcpus.

It is generally beneficial to run one pass of precopy live migration
just before starting postcopy live migration, in order to avoid
generating userfaults for readonly guest regions.

The implementation of postcopy live migration currently uses one
single bidirectional socket but in the future two different sockets
will be used (to reduce the latency of the userfaults to the minimum
possible without having to decrease /proc/sys/net/ipv4/tcp_wmem).

The QEMU in the source node writes all pages that it knows are missing
in the destination node, into the socket, and the migration thread of
the QEMU running in the destination node runs UFFDIO_COPY|ZEROPAGE
ioctls on the userfaultfd in order to map the received pages into the
guest (UFFDIO_ZEROCOPY is used if the source page was a zero page).

A different postcopy thread in the destination node listens with
poll() to the userfaultfd in parallel. When a POLLIN event is
generated after a userfault triggers, the postcopy thread read() from
the userfaultfd and receives the fault address (or -EAGAIN in case the
userfault was already resolved and waken by a UFFDIO_COPY|ZEROPAGE run
by the parallel QEMU migration thread).

After the QEMU postcopy thread (running in the destination node) gets
the userfault address it writes the information about the missing page
into the socket. The QEMU source node receives the information and
roughly "seeks" to that page address and continues sending all
remaining missing pages from that new page offset. Soon after that
(just the time to flush the tcp_wmem queue through the network) the
migration thread in the QEMU running in the destination node will
receive the page that triggered the userfault and it'll map it as
usual with the UFFDIO_COPY|ZEROPAGE (without actually knowing if it
was spontaneously sent by the source or if it was an urgent page
requested through an userfault).

By the time the userfaults start, the QEMU in the destination node
doesn't need to keep any per-page state bitmap relative to the live
migration around and a single per-page bitmap has to be maintained in
the QEMU running in the source node to know which pages are still
missing in the destination node. The bitmap in the source node is
checked to find which missing pages to send in round robin and we seek
over it when receiving incoming userfaults. After sending each page of
course the bitmap is updated accordingly. It's also useful to avoid
sending the same page twice (in case the userfault is read by the
postcopy thread just before UFFDIO_COPY|ZEROPAGE runs in the migration
thread).
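
To make the UFFDIO_API -> UFFDIO_REGISTER -> read -> UFFDIO_COPY flow described
above concrete, here is a minimal userland sketch. It is not part of this merge;
it assumes a kernel with CONFIG_USERFAULTFD=y, the uapi header added by this
series, and a libc that exposes __NR_userfaultfd, and it drops all error handling.

/*
 * Hedged illustration only: one monitor thread resolves the first fault
 * taken by a "toucher" thread on a registered anonymous page.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <poll.h>
#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

static long page;

static void *toucher(void *area)
{
	/* First access faults on the missing page and blocks until the
	 * monitor thread resolves it with UFFDIO_COPY. */
	printf("first byte: %#x\n", *(volatile unsigned char *)area);
	return NULL;
}

int main(void)
{
	page = sysconf(_SC_PAGESIZE);
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

	/* Handshake: speak UFFD_API, no extra features requested. */
	struct uffdio_api api = { .api = UFFD_API };
	ioctl(uffd, UFFDIO_API, &api);

	/* Register one anonymous page for missing-page tracking. */
	char *area = mmap(NULL, page, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)area, .len = page },
		.mode  = UFFDIO_REGISTER_MODE_MISSING,
	};
	ioctl(uffd, UFFDIO_REGISTER, &reg);

	pthread_t t;
	pthread_create(&t, NULL, toucher, area);

	/* read/POLLIN protocol: wait for the fault notification and
	 * fetch the faulting address from the uffd_msg. */
	struct pollfd pfd = { .fd = uffd, .events = POLLIN };
	poll(&pfd, 1, -1);
	struct uffd_msg msg;
	read(uffd, &msg, sizeof(msg));

	/* Resolve it: atomically copy a filled page into the registered
	 * range; this also wakes the blocked toucher thread. */
	char *src = mmap(NULL, page, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	memset(src, 0xaa, page);
	struct uffdio_copy copy = {
		.dst = msg.arg.pagefault.address & ~((unsigned long long)page - 1),
		.src = (unsigned long)src,
		.len = page,
	};
	ioctl(uffd, UFFDIO_COPY, &copy);

	pthread_join(t, NULL);
	return 0;
}

Build with something like "gcc -pthread" on a kernel carrying this series; the
toucher's first read blocks until the UFFDIO_COPY lands and then prints 0xaa.
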
diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c
index 265ffeb2037e..80e277cfcc8b 100644
--- a/arch/arm/mach-at91/pm.c
+++ b/arch/arm/mach-at91/pm.c
@@ -369,7 +369,7 @@ static void __init at91_pm_sram_init(void)
369 return; 369 return;
370 } 370 }
371 371
372 sram_pool = gen_pool_get(&pdev->dev); 372 sram_pool = gen_pool_get(&pdev->dev, NULL);
373 if (!sram_pool) { 373 if (!sram_pool) {
374 pr_warn("%s: sram pool unavailable!\n", __func__); 374 pr_warn("%s: sram pool unavailable!\n", __func__);
375 return; 375 return;
diff --git a/arch/arm/mach-imx/pm-imx5.c b/arch/arm/mach-imx/pm-imx5.c
index 1885676c23c0..532d4b08276d 100644
--- a/arch/arm/mach-imx/pm-imx5.c
+++ b/arch/arm/mach-imx/pm-imx5.c
@@ -297,7 +297,7 @@ static int __init imx_suspend_alloc_ocram(
297 goto put_node; 297 goto put_node;
298 } 298 }
299 299
300 ocram_pool = gen_pool_get(&pdev->dev); 300 ocram_pool = gen_pool_get(&pdev->dev, NULL);
301 if (!ocram_pool) { 301 if (!ocram_pool) {
302 pr_warn("%s: ocram pool unavailable!\n", __func__); 302 pr_warn("%s: ocram pool unavailable!\n", __func__);
303 ret = -ENODEV; 303 ret = -ENODEV;
diff --git a/arch/arm/mach-imx/pm-imx6.c b/arch/arm/mach-imx/pm-imx6.c
index 93ecf559d06d..8ff8fc0b261c 100644
--- a/arch/arm/mach-imx/pm-imx6.c
+++ b/arch/arm/mach-imx/pm-imx6.c
@@ -451,7 +451,7 @@ static int __init imx6q_suspend_init(const struct imx6_pm_socdata *socdata)
451 goto put_node; 451 goto put_node;
452 } 452 }
453 453
454 ocram_pool = gen_pool_get(&pdev->dev); 454 ocram_pool = gen_pool_get(&pdev->dev, NULL);
455 if (!ocram_pool) { 455 if (!ocram_pool) {
456 pr_warn("%s: ocram pool unavailable!\n", __func__); 456 pr_warn("%s: ocram pool unavailable!\n", __func__);
457 ret = -ENODEV; 457 ret = -ENODEV;
diff --git a/arch/arm/mach-socfpga/pm.c b/arch/arm/mach-socfpga/pm.c
index 6a4199f2bffb..c378ab0c2431 100644
--- a/arch/arm/mach-socfpga/pm.c
+++ b/arch/arm/mach-socfpga/pm.c
@@ -56,7 +56,7 @@ static int socfpga_setup_ocram_self_refresh(void)
56 goto put_node; 56 goto put_node;
57 } 57 }
58 58
59 ocram_pool = gen_pool_get(&pdev->dev); 59 ocram_pool = gen_pool_get(&pdev->dev, NULL);
60 if (!ocram_pool) { 60 if (!ocram_pool) {
61 pr_warn("%s: ocram pool unavailable!\n", __func__); 61 pr_warn("%s: ocram pool unavailable!\n", __func__);
62 ret = -ENODEV; 62 ret = -ENODEV;
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 2790b6a64157..17f486233db0 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -488,7 +488,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
488int arch_add_memory(int nid, u64 start, u64 size) 488int arch_add_memory(int nid, u64 start, u64 size)
489{ 489{
490 pg_data_t *pgdat; 490 pg_data_t *pgdat;
491 unsigned long start_pfn = start >> PAGE_SHIFT; 491 unsigned long start_pfn = PFN_DOWN(start);
492 unsigned long nr_pages = size >> PAGE_SHIFT; 492 unsigned long nr_pages = size >> PAGE_SHIFT;
493 int ret; 493 int ret;
494 494
@@ -517,7 +517,7 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
517#ifdef CONFIG_MEMORY_HOTREMOVE 517#ifdef CONFIG_MEMORY_HOTREMOVE
518int arch_remove_memory(u64 start, u64 size) 518int arch_remove_memory(u64 start, u64 size)
519{ 519{
520 unsigned long start_pfn = start >> PAGE_SHIFT; 520 unsigned long start_pfn = PFN_DOWN(start);
521 unsigned long nr_pages = size >> PAGE_SHIFT; 521 unsigned long nr_pages = size >> PAGE_SHIFT;
522 struct zone *zone; 522 struct zone *zone;
523 int ret; 523 int ret;
diff --git a/arch/sh/mm/numa.c b/arch/sh/mm/numa.c
index bce52ba66206..05713d190247 100644
--- a/arch/sh/mm/numa.c
+++ b/arch/sh/mm/numa.c
@@ -33,8 +33,8 @@ void __init setup_bootmem_node(int nid, unsigned long start, unsigned long end)
33 /* Don't allow bogus node assignment */ 33 /* Don't allow bogus node assignment */
34 BUG_ON(nid >= MAX_NUMNODES || nid <= 0); 34 BUG_ON(nid >= MAX_NUMNODES || nid <= 0);
35 35
36 start_pfn = start >> PAGE_SHIFT; 36 start_pfn = PFN_DOWN(start);
37 end_pfn = end >> PAGE_SHIFT; 37 end_pfn = PFN_DOWN(end);
38 38
39 pmb_bolt_mapping((unsigned long)__va(start), start, end - start, 39 pmb_bolt_mapping((unsigned long)__va(start), start, end - start,
40 PAGE_KERNEL); 40 PAGE_KERNEL);
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 48f7433dac6f..117e2f373e50 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -41,6 +41,7 @@ config X86
41 select ARCH_USE_CMPXCHG_LOCKREF if X86_64 41 select ARCH_USE_CMPXCHG_LOCKREF if X86_64
42 select ARCH_USE_QUEUED_RWLOCKS 42 select ARCH_USE_QUEUED_RWLOCKS
43 select ARCH_USE_QUEUED_SPINLOCKS 43 select ARCH_USE_QUEUED_SPINLOCKS
44 select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP
44 select ARCH_WANTS_DYNAMIC_TASK_STRUCT 45 select ARCH_WANTS_DYNAMIC_TASK_STRUCT
45 select ARCH_WANT_FRAME_POINTERS 46 select ARCH_WANT_FRAME_POINTERS
46 select ARCH_WANT_IPC_PARSE_VERSION if X86_32 47 select ARCH_WANT_IPC_PARSE_VERSION if X86_32
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 25e3cf1cd8fd..477bfa6db370 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -380,3 +380,4 @@
380371 i386 recvfrom sys_recvfrom compat_sys_recvfrom 380371 i386 recvfrom sys_recvfrom compat_sys_recvfrom
381372 i386 recvmsg sys_recvmsg compat_sys_recvmsg 381372 i386 recvmsg sys_recvmsg compat_sys_recvmsg
382373 i386 shutdown sys_shutdown 382373 i386 shutdown sys_shutdown
383374 i386 userfaultfd sys_userfaultfd
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 9ef32d5f1b19..81c490634db9 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -329,6 +329,7 @@
329320 common kexec_file_load sys_kexec_file_load 329320 common kexec_file_load sys_kexec_file_load
330321 common bpf sys_bpf 330321 common bpf sys_bpf
331322 64 execveat stub_execveat 331322 64 execveat stub_execveat
332323 common userfaultfd sys_userfaultfd
332 333
333# 334#
334# x32-specific system call numbers start at 512 to avoid cache impact 335# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index cd791948b286..6df2029405a3 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -261,6 +261,12 @@ static inline void reset_lazy_tlbstate(void)
261 261
262#endif /* SMP */ 262#endif /* SMP */
263 263
264/* Not inlined due to inc_irq_stat not being defined yet */
265#define flush_tlb_local() { \
266 inc_irq_stat(irq_tlb_count); \
267 local_flush_tlb(); \
268}
269
264#ifndef CONFIG_PARAVIRT 270#ifndef CONFIG_PARAVIRT
265#define flush_tlb_others(mask, mm, start, end) \ 271#define flush_tlb_others(mask, mm, start, end) \
266 native_flush_tlb_others(mask, mm, start, end) 272 native_flush_tlb_others(mask, mm, start, end)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 3f124d553c5a..cd9b6d0b10bf 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -12,7 +12,7 @@
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/export.h> 14#include <linux/export.h>
15#include <linux/watchdog.h> 15#include <linux/nmi.h>
16 16
17#include <asm/cpufeature.h> 17#include <asm/cpufeature.h>
18#include <asm/hardirq.h> 18#include <asm/hardirq.h>
@@ -3627,7 +3627,10 @@ static __init int fixup_ht_bug(void)
3627 return 0; 3627 return 0;
3628 } 3628 }
3629 3629
3630 watchdog_nmi_disable_all(); 3630 if (lockup_detector_suspend() != 0) {
3631 pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n");
3632 return 0;
3633 }
3631 3634
3632 x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED); 3635 x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
3633 3636
@@ -3635,7 +3638,7 @@ static __init int fixup_ht_bug(void)
3635 x86_pmu.commit_scheduling = NULL; 3638 x86_pmu.commit_scheduling = NULL;
3636 x86_pmu.stop_scheduling = NULL; 3639 x86_pmu.stop_scheduling = NULL;
3637 3640
3638 watchdog_nmi_enable_all(); 3641 lockup_detector_resume();
3639 3642
3640 get_online_cpus(); 3643 get_online_cpus();
3641 3644
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 90b924acd982..8ddb5d0d66fb 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -140,6 +140,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
140 info.flush_end = end; 140 info.flush_end = end;
141 141
142 count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); 142 count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
143 trace_tlb_flush(TLB_REMOTE_SEND_IPI, end - start);
143 if (is_uv_system()) { 144 if (is_uv_system()) {
144 unsigned int cpu; 145 unsigned int cpu;
145 146
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 31df474d72f4..560751bad294 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -392,6 +392,16 @@ int register_mem_sect_under_node(struct memory_block *mem_blk, int nid)
392 for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) { 392 for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
393 int page_nid; 393 int page_nid;
394 394
395 /*
396 * memory block could have several absent sections from start.
397 * skip pfn range from absent section
398 */
399 if (!pfn_present(pfn)) {
400 pfn = round_down(pfn + PAGES_PER_SECTION,
401 PAGES_PER_SECTION) - 1;
402 continue;
403 }
404
395 page_nid = get_nid_for_pfn(pfn); 405 page_nid = get_nid_for_pfn(pfn);
396 if (page_nid < 0) 406 if (page_nid < 0)
397 continue; 407 continue;
diff --git a/drivers/media/platform/coda/coda-common.c b/drivers/media/platform/coda/coda-common.c
index 58f65486de33..284ac4c934ba 100644
--- a/drivers/media/platform/coda/coda-common.c
+++ b/drivers/media/platform/coda/coda-common.c
@@ -2157,7 +2157,7 @@ static int coda_probe(struct platform_device *pdev)
2157 /* Get IRAM pool from device tree or platform data */ 2157 /* Get IRAM pool from device tree or platform data */
2158 pool = of_gen_pool_get(np, "iram", 0); 2158 pool = of_gen_pool_get(np, "iram", 0);
2159 if (!pool && pdata) 2159 if (!pool && pdata)
2160 pool = gen_pool_get(pdata->iram_dev); 2160 pool = gen_pool_get(pdata->iram_dev, NULL);
2161 if (!pool) { 2161 if (!pool) {
2162 dev_err(&pdev->dev, "iram pool not available\n"); 2162 dev_err(&pdev->dev, "iram pool not available\n");
2163 return -ENOMEM; 2163 return -ENOMEM;
diff --git a/drivers/misc/sram.c b/drivers/misc/sram.c
index 15c33cc34a80..431e1dd528bc 100644
--- a/drivers/misc/sram.c
+++ b/drivers/misc/sram.c
@@ -186,10 +186,10 @@ static int sram_probe(struct platform_device *pdev)
186 if (IS_ERR(sram->virt_base)) 186 if (IS_ERR(sram->virt_base))
187 return PTR_ERR(sram->virt_base); 187 return PTR_ERR(sram->virt_base);
188 188
189 sram->pool = devm_gen_pool_create(sram->dev, 189 sram->pool = devm_gen_pool_create(sram->dev, ilog2(SRAM_GRANULARITY),
190 ilog2(SRAM_GRANULARITY), -1); 190 NUMA_NO_NODE, NULL);
191 if (!sram->pool) 191 if (IS_ERR(sram->pool))
192 return -ENOMEM; 192 return PTR_ERR(sram->pool);
193 193
194 ret = sram_reserve_regions(sram, res); 194 ret = sram_reserve_regions(sram, res);
195 if (ret) 195 if (ret)
diff --git a/drivers/video/console/Kconfig b/drivers/video/console/Kconfig
index ba97efc3bf70..071280643db7 100644
--- a/drivers/video/console/Kconfig
+++ b/drivers/video/console/Kconfig
@@ -9,7 +9,7 @@ config VGA_CONSOLE
9 depends on !4xx && !8xx && !SPARC && !M68K && !PARISC && !FRV && \ 9 depends on !4xx && !8xx && !SPARC && !M68K && !PARISC && !FRV && \
10 !SUPERH && !BLACKFIN && !AVR32 && !MN10300 && !CRIS && \ 10 !SUPERH && !BLACKFIN && !AVR32 && !MN10300 && !CRIS && \
11 (!ARM || ARCH_FOOTBRIDGE || ARCH_INTEGRATOR || ARCH_NETWINDER) && \ 11 (!ARM || ARCH_FOOTBRIDGE || ARCH_INTEGRATOR || ARCH_NETWINDER) && \
12 !ARM64 12 !ARM64 && !ARC
13 default y 13 default y
14 help 14 help
15 Saying Y here will allow you to use Linux in text mode through a 15 Saying Y here will allow you to use Linux in text mode through a
diff --git a/fs/Makefile b/fs/Makefile
index 09e051fefc5b..f79cf4043e60 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_ANON_INODES) += anon_inodes.o
27obj-$(CONFIG_SIGNALFD) += signalfd.o 27obj-$(CONFIG_SIGNALFD) += signalfd.o
28obj-$(CONFIG_TIMERFD) += timerfd.o 28obj-$(CONFIG_TIMERFD) += timerfd.o
29obj-$(CONFIG_EVENTFD) += eventfd.o 29obj-$(CONFIG_EVENTFD) += eventfd.o
30obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
30obj-$(CONFIG_AIO) += aio.o 31obj-$(CONFIG_AIO) += aio.o
31obj-$(CONFIG_FS_DAX) += dax.o 32obj-$(CONFIG_FS_DAX) += dax.o
32obj-$(CONFIG_FILE_LOCKING) += locks.o 33obj-$(CONFIG_FILE_LOCKING) += locks.o
diff --git a/fs/aio.c b/fs/aio.c
index 480440f4701f..155f84253f33 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -308,15 +308,9 @@ static void aio_free_ring(struct kioctx *ctx)
308 } 308 }
309} 309}
310 310
311static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) 311static int aio_ring_mremap(struct vm_area_struct *vma)
312{
313 vma->vm_flags |= VM_DONTEXPAND;
314 vma->vm_ops = &generic_file_vm_ops;
315 return 0;
316}
317
318static int aio_ring_remap(struct file *file, struct vm_area_struct *vma)
319{ 312{
313 struct file *file = vma->vm_file;
320 struct mm_struct *mm = vma->vm_mm; 314 struct mm_struct *mm = vma->vm_mm;
321 struct kioctx_table *table; 315 struct kioctx_table *table;
322 int i, res = -EINVAL; 316 int i, res = -EINVAL;
@@ -342,9 +336,24 @@ static int aio_ring_remap(struct file *file, struct vm_area_struct *vma)
342 return res; 336 return res;
343} 337}
344 338
339static const struct vm_operations_struct aio_ring_vm_ops = {
340 .mremap = aio_ring_mremap,
341#if IS_ENABLED(CONFIG_MMU)
342 .fault = filemap_fault,
343 .map_pages = filemap_map_pages,
344 .page_mkwrite = filemap_page_mkwrite,
345#endif
346};
347
348static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
349{
350 vma->vm_flags |= VM_DONTEXPAND;
351 vma->vm_ops = &aio_ring_vm_ops;
352 return 0;
353}
354
345static const struct file_operations aio_ring_fops = { 355static const struct file_operations aio_ring_fops = {
346 .mmap = aio_ring_mmap, 356 .mmap = aio_ring_mmap,
347 .mremap = aio_ring_remap,
348}; 357};
349 358
350#if IS_ENABLED(CONFIG_MIGRATION) 359#if IS_ENABLED(CONFIG_MIGRATION)
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index d1c833c321b9..7b6bfcbf801c 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -479,7 +479,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
479 if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) 479 if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
480 seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes); 480 seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
481 if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) 481 if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
482 seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name); 482 seq_show_option(m, "snapdirname", fsopt->snapdir_name);
483 483
484 return 0; 484 return 0;
485} 485}
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 0a9fb6b53126..6a1119e87fbb 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -394,17 +394,17 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
394 struct sockaddr *srcaddr; 394 struct sockaddr *srcaddr;
395 srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr; 395 srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
396 396
397 seq_printf(s, ",vers=%s", tcon->ses->server->vals->version_string); 397 seq_show_option(s, "vers", tcon->ses->server->vals->version_string);
398 cifs_show_security(s, tcon->ses); 398 cifs_show_security(s, tcon->ses);
399 cifs_show_cache_flavor(s, cifs_sb); 399 cifs_show_cache_flavor(s, cifs_sb);
400 400
401 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) 401 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
402 seq_puts(s, ",multiuser"); 402 seq_puts(s, ",multiuser");
403 else if (tcon->ses->user_name) 403 else if (tcon->ses->user_name)
404 seq_printf(s, ",username=%s", tcon->ses->user_name); 404 seq_show_option(s, "username", tcon->ses->user_name);
405 405
406 if (tcon->ses->domainName) 406 if (tcon->ses->domainName)
407 seq_printf(s, ",domain=%s", tcon->ses->domainName); 407 seq_show_option(s, "domain", tcon->ses->domainName);
408 408
409 if (srcaddr->sa_family != AF_UNSPEC) { 409 if (srcaddr->sa_family != AF_UNSPEC) {
410 struct sockaddr_in *saddr4; 410 struct sockaddr_in *saddr4;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ee3878262a49..a63c7b0a10cf 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1776,10 +1776,10 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
1776 } 1776 }
1777 1777
1778 if (sbi->s_qf_names[USRQUOTA]) 1778 if (sbi->s_qf_names[USRQUOTA])
1779 seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); 1779 seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]);
1780 1780
1781 if (sbi->s_qf_names[GRPQUOTA]) 1781 if (sbi->s_qf_names[GRPQUOTA])
1782 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); 1782 seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]);
1783#endif 1783#endif
1784} 1784}
1785 1785
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 2982445947e1..894fb01a91da 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1334,11 +1334,11 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
1334 if (is_ancestor(root, sdp->sd_master_dir)) 1334 if (is_ancestor(root, sdp->sd_master_dir))
1335 seq_puts(s, ",meta"); 1335 seq_puts(s, ",meta");
1336 if (args->ar_lockproto[0]) 1336 if (args->ar_lockproto[0])
1337 seq_printf(s, ",lockproto=%s", args->ar_lockproto); 1337 seq_show_option(s, "lockproto", args->ar_lockproto);
1338 if (args->ar_locktable[0]) 1338 if (args->ar_locktable[0])
1339 seq_printf(s, ",locktable=%s", args->ar_locktable); 1339 seq_show_option(s, "locktable", args->ar_locktable);
1340 if (args->ar_hostdata[0]) 1340 if (args->ar_hostdata[0])
1341 seq_printf(s, ",hostdata=%s", args->ar_hostdata); 1341 seq_show_option(s, "hostdata", args->ar_hostdata);
1342 if (args->ar_spectator) 1342 if (args->ar_spectator)
1343 seq_puts(s, ",spectator"); 1343 seq_puts(s, ",spectator");
1344 if (args->ar_localflocks) 1344 if (args->ar_localflocks)
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 55c03b9e9070..4574fdd3d421 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -136,9 +136,9 @@ static int hfs_show_options(struct seq_file *seq, struct dentry *root)
136 struct hfs_sb_info *sbi = HFS_SB(root->d_sb); 136 struct hfs_sb_info *sbi = HFS_SB(root->d_sb);
137 137
138 if (sbi->s_creator != cpu_to_be32(0x3f3f3f3f)) 138 if (sbi->s_creator != cpu_to_be32(0x3f3f3f3f))
139 seq_printf(seq, ",creator=%.4s", (char *)&sbi->s_creator); 139 seq_show_option_n(seq, "creator", (char *)&sbi->s_creator, 4);
140 if (sbi->s_type != cpu_to_be32(0x3f3f3f3f)) 140 if (sbi->s_type != cpu_to_be32(0x3f3f3f3f))
141 seq_printf(seq, ",type=%.4s", (char *)&sbi->s_type); 141 seq_show_option_n(seq, "type", (char *)&sbi->s_type, 4);
142 seq_printf(seq, ",uid=%u,gid=%u", 142 seq_printf(seq, ",uid=%u,gid=%u",
143 from_kuid_munged(&init_user_ns, sbi->s_uid), 143 from_kuid_munged(&init_user_ns, sbi->s_uid),
144 from_kgid_munged(&init_user_ns, sbi->s_gid)); 144 from_kgid_munged(&init_user_ns, sbi->s_gid));
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index c90b72ee676d..bb806e58c977 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -218,9 +218,9 @@ int hfsplus_show_options(struct seq_file *seq, struct dentry *root)
218 struct hfsplus_sb_info *sbi = HFSPLUS_SB(root->d_sb); 218 struct hfsplus_sb_info *sbi = HFSPLUS_SB(root->d_sb);
219 219
220 if (sbi->creator != HFSPLUS_DEF_CR_TYPE) 220 if (sbi->creator != HFSPLUS_DEF_CR_TYPE)
221 seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator); 221 seq_show_option_n(seq, "creator", (char *)&sbi->creator, 4);
222 if (sbi->type != HFSPLUS_DEF_CR_TYPE) 222 if (sbi->type != HFSPLUS_DEF_CR_TYPE)
223 seq_printf(seq, ",type=%.4s", (char *)&sbi->type); 223 seq_show_option_n(seq, "type", (char *)&sbi->type, 4);
224 seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask, 224 seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask,
225 from_kuid_munged(&init_user_ns, sbi->uid), 225 from_kuid_munged(&init_user_ns, sbi->uid),
226 from_kgid_munged(&init_user_ns, sbi->gid)); 226 from_kgid_munged(&init_user_ns, sbi->gid));
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 059597b23f67..2ac99db3750e 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -260,7 +260,7 @@ static int hostfs_show_options(struct seq_file *seq, struct dentry *root)
260 size_t offset = strlen(root_ino) + 1; 260 size_t offset = strlen(root_ino) + 1;
261 261
262 if (strlen(root_path) > offset) 262 if (strlen(root_path) > offset)
263 seq_printf(seq, ",%s", root_path + offset); 263 seq_show_option(seq, root_path + offset, NULL);
264 264
265 if (append) 265 if (append)
266 seq_puts(seq, ",append"); 266 seq_puts(seq, ",append");
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 44523f4a6084..6faaf710e563 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -154,6 +154,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
154 struct dnotify_struct *dn; 154 struct dnotify_struct *dn;
155 struct dnotify_struct **prev; 155 struct dnotify_struct **prev;
156 struct inode *inode; 156 struct inode *inode;
157 bool free = false;
157 158
158 inode = file_inode(filp); 159 inode = file_inode(filp);
159 if (!S_ISDIR(inode->i_mode)) 160 if (!S_ISDIR(inode->i_mode))
@@ -182,11 +183,15 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
182 183
183 /* nothing else could have found us thanks to the dnotify_groups 184 /* nothing else could have found us thanks to the dnotify_groups
184 mark_mutex */ 185 mark_mutex */
185 if (dn_mark->dn == NULL) 186 if (dn_mark->dn == NULL) {
186 fsnotify_destroy_mark_locked(fsn_mark, dnotify_group); 187 fsnotify_detach_mark(fsn_mark);
188 free = true;
189 }
187 190
188 mutex_unlock(&dnotify_group->mark_mutex); 191 mutex_unlock(&dnotify_group->mark_mutex);
189 192
193 if (free)
194 fsnotify_free_mark(fsn_mark);
190 fsnotify_put_mark(fsn_mark); 195 fsnotify_put_mark(fsn_mark);
191} 196}
192 197
@@ -362,9 +367,10 @@ out:
362 spin_unlock(&fsn_mark->lock); 367 spin_unlock(&fsn_mark->lock);
363 368
364 if (destroy) 369 if (destroy)
365 fsnotify_destroy_mark_locked(fsn_mark, dnotify_group); 370 fsnotify_detach_mark(fsn_mark);
366
367 mutex_unlock(&dnotify_group->mark_mutex); 371 mutex_unlock(&dnotify_group->mark_mutex);
372 if (destroy)
373 fsnotify_free_mark(fsn_mark);
368 fsnotify_put_mark(fsn_mark); 374 fsnotify_put_mark(fsn_mark);
369out_err: 375out_err:
370 if (new_fsn_mark) 376 if (new_fsn_mark)
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index cf275500a665..8e8e6bcd1d43 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -529,8 +529,10 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
529 removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, 529 removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
530 &destroy_mark); 530 &destroy_mark);
531 if (destroy_mark) 531 if (destroy_mark)
532 fsnotify_destroy_mark_locked(fsn_mark, group); 532 fsnotify_detach_mark(fsn_mark);
533 mutex_unlock(&group->mark_mutex); 533 mutex_unlock(&group->mark_mutex);
534 if (destroy_mark)
535 fsnotify_free_mark(fsn_mark);
534 536
535 fsnotify_put_mark(fsn_mark); 537 fsnotify_put_mark(fsn_mark);
536 if (removed & real_mount(mnt)->mnt_fsnotify_mask) 538 if (removed & real_mount(mnt)->mnt_fsnotify_mask)
@@ -557,8 +559,10 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
557 removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, 559 removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
558 &destroy_mark); 560 &destroy_mark);
559 if (destroy_mark) 561 if (destroy_mark)
560 fsnotify_destroy_mark_locked(fsn_mark, group); 562 fsnotify_detach_mark(fsn_mark);
561 mutex_unlock(&group->mark_mutex); 563 mutex_unlock(&group->mark_mutex);
564 if (destroy_mark)
565 fsnotify_free_mark(fsn_mark);
562 566
563 /* matches the fsnotify_find_inode_mark() */ 567 /* matches the fsnotify_find_inode_mark() */
564 fsnotify_put_mark(fsn_mark); 568 fsnotify_put_mark(fsn_mark);
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 58b7cdb63da9..6b6f0d472ae8 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -76,7 +76,8 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
76 struct inotify_inode_mark *inode_mark; 76 struct inotify_inode_mark *inode_mark;
77 struct inode *inode; 77 struct inode *inode;
78 78
79 if (!(mark->flags & (FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_INODE))) 79 if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE) ||
80 !(mark->flags & FSNOTIFY_MARK_FLAG_INODE))
80 return; 81 return;
81 82
82 inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); 83 inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index dd3fb0b17be7..db39de2dd4cb 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -26,7 +26,6 @@
26 26
27#include <linux/fsnotify_backend.h> 27#include <linux/fsnotify_backend.h>
28#include "fsnotify.h" 28#include "fsnotify.h"
29#include "../mount.h"
30 29
31/* 30/*
32 * Clear all of the marks on an inode when it is being evicted from core 31 * Clear all of the marks on an inode when it is being evicted from core
@@ -205,6 +204,16 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
205 mnt = NULL; 204 mnt = NULL;
206 205
207 /* 206 /*
207 * Optimization: srcu_read_lock() has a memory barrier which can
208 * be expensive. It protects walking the *_fsnotify_marks lists.
209 * However, if we do not walk the lists, we do not have to do
210 * SRCU because we have no references to any objects and do not
211 * need SRCU to keep them "alive".
212 */
213 if (hlist_empty(&to_tell->i_fsnotify_marks) &&
214 (!mnt || hlist_empty(&mnt->mnt_fsnotify_marks)))
215 return 0;
216 /*
208 * if this is a modify event we may need to clear the ignored masks 217 * if this is a modify event we may need to clear the ignored masks
209 * otherwise return if neither the inode nor the vfsmount care about 218 * otherwise return if neither the inode nor the vfsmount care about
210 * this type of event. 219 * this type of event.
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 13a00be516d2..b44c68a857e7 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -6,6 +6,8 @@
6#include <linux/srcu.h> 6#include <linux/srcu.h>
7#include <linux/types.h> 7#include <linux/types.h>
8 8
9#include "../mount.h"
10
9/* destroy all events sitting in this groups notification queue */ 11/* destroy all events sitting in this groups notification queue */
10extern void fsnotify_flush_notify(struct fsnotify_group *group); 12extern void fsnotify_flush_notify(struct fsnotify_group *group);
11 13
@@ -38,15 +40,22 @@ extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
38extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark); 40extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark);
39/* inode specific destruction of a mark */ 41/* inode specific destruction of a mark */
40extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark); 42extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark);
41/* Destroy all marks in the given list */
42extern void fsnotify_destroy_marks(struct list_head *to_free);
43/* Find mark belonging to given group in the list of marks */ 43/* Find mark belonging to given group in the list of marks */
44extern struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head, 44extern struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head,
45 struct fsnotify_group *group); 45 struct fsnotify_group *group);
46/* run the list of all marks associated with inode and flag them to be freed */ 46/* Destroy all marks in the given list protected by 'lock' */
47extern void fsnotify_clear_marks_by_inode(struct inode *inode); 47extern void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock);
48/* run the list of all marks associated with vfsmount and flag them to be freed */ 48/* run the list of all marks associated with inode and destroy them */
49extern void fsnotify_clear_marks_by_mount(struct vfsmount *mnt); 49static inline void fsnotify_clear_marks_by_inode(struct inode *inode)
50{
51 fsnotify_destroy_marks(&inode->i_fsnotify_marks, &inode->i_lock);
52}
53/* run the list of all marks associated with vfsmount and destroy them */
54static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
55{
56 fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks,
57 &mnt->mnt_root->d_lock);
58}
50/* 59/*
51 * update the dentry->d_flags of all of inode's children to indicate if inode cares 60 * update the dentry->d_flags of all of inode's children to indicate if inode cares
52 * about events that happen to its children. 61 * about events that happen to its children.
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 3daf513ee99e..474a3ce1b5e1 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -65,26 +65,6 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark)
65} 65}
66 66
67/* 67/*
68 * Given an inode, destroy all of the marks associated with that inode.
69 */
70void fsnotify_clear_marks_by_inode(struct inode *inode)
71{
72 struct fsnotify_mark *mark;
73 struct hlist_node *n;
74 LIST_HEAD(free_list);
75
76 spin_lock(&inode->i_lock);
77 hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify_marks, obj_list) {
78 list_add(&mark->free_list, &free_list);
79 hlist_del_init_rcu(&mark->obj_list);
80 fsnotify_get_mark(mark);
81 }
82 spin_unlock(&inode->i_lock);
83
84 fsnotify_destroy_marks(&free_list);
85}
86
87/*
88 * Given a group clear all of the inode marks associated with that group. 68 * Given a group clear all of the inode marks associated with that group.
89 */ 69 */
90void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) 70void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group)
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 39ddcaf0918f..fc0df4442f7b 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -122,26 +122,27 @@ u32 fsnotify_recalc_mask(struct hlist_head *head)
122} 122}
123 123
124/* 124/*
125 * Any time a mark is getting freed we end up here. 125 * Remove mark from inode / vfsmount list, group list, drop inode reference
126 * The caller had better be holding a reference to this mark so we don't actually 126 * if we got one.
127 * do the final put under the mark->lock 127 *
128 * Must be called with group->mark_mutex held.
128 */ 129 */
129void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, 130void fsnotify_detach_mark(struct fsnotify_mark *mark)
130 struct fsnotify_group *group)
131{ 131{
132 struct inode *inode = NULL; 132 struct inode *inode = NULL;
133 struct fsnotify_group *group = mark->group;
133 134
134 BUG_ON(!mutex_is_locked(&group->mark_mutex)); 135 BUG_ON(!mutex_is_locked(&group->mark_mutex));
135 136
136 spin_lock(&mark->lock); 137 spin_lock(&mark->lock);
137 138
138 /* something else already called this function on this mark */ 139 /* something else already called this function on this mark */
139 if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { 140 if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
140 spin_unlock(&mark->lock); 141 spin_unlock(&mark->lock);
141 return; 142 return;
142 } 143 }
143 144
144 mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; 145 mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED;
145 146
146 if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { 147 if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
147 inode = mark->inode; 148 inode = mark->inode;
@@ -150,6 +151,12 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
150 fsnotify_destroy_vfsmount_mark(mark); 151 fsnotify_destroy_vfsmount_mark(mark);
151 else 152 else
152 BUG(); 153 BUG();
154 /*
155 * Note that we didn't update flags telling whether inode cares about
156 * what's happening with children. We update these flags from
157 * __fsnotify_parent() lazily when next event happens on one of our
158 * children.
159 */
153 160
154 list_del_init(&mark->g_list); 161 list_del_init(&mark->g_list);
155 162
@@ -157,18 +164,32 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
157 164
158 if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) 165 if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED))
159 iput(inode); 166 iput(inode);
160 /* release lock temporarily */ 167
161 mutex_unlock(&group->mark_mutex); 168 atomic_dec(&group->num_marks);
169}
170
171/*
172 * Free fsnotify mark. The freeing is actually happening from a kthread which
173 * first waits for srcu period end. Caller must have a reference to the mark
174 * or be protected by fsnotify_mark_srcu.
175 */
176void fsnotify_free_mark(struct fsnotify_mark *mark)
177{
178 struct fsnotify_group *group = mark->group;
179
180 spin_lock(&mark->lock);
181 /* something else already called this function on this mark */
182 if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
183 spin_unlock(&mark->lock);
184 return;
185 }
186 mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
187 spin_unlock(&mark->lock);
162 188
163 spin_lock(&destroy_lock); 189 spin_lock(&destroy_lock);
164 list_add(&mark->g_list, &destroy_list); 190 list_add(&mark->g_list, &destroy_list);
165 spin_unlock(&destroy_lock); 191 spin_unlock(&destroy_lock);
166 wake_up(&destroy_waitq); 192 wake_up(&destroy_waitq);
167 /*
168 * We don't necessarily have a ref on mark from caller so the above destroy
169 * may have actually freed it, unless this group provides a 'freeing_mark'
170 * function which must be holding a reference.
171 */
172 193
173 /* 194 /*
174 * Some groups like to know that marks are being freed. This is a 195 * Some groups like to know that marks are being freed. This is a
@@ -177,50 +198,45 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
177 */ 198 */
178 if (group->ops->freeing_mark) 199 if (group->ops->freeing_mark)
179 group->ops->freeing_mark(mark, group); 200 group->ops->freeing_mark(mark, group);
180
181 /*
182 * __fsnotify_update_child_dentry_flags(inode);
183 *
184 * I really want to call that, but we can't, we have no idea if the inode
185 * still exists the second we drop the mark->lock.
186 *
187 * The next time an event arrive to this inode from one of it's children
188 * __fsnotify_parent will see that the inode doesn't care about it's
189 * children and will update all of these flags then. So really this
190 * is just a lazy update (and could be a perf win...)
191 */
192
193 atomic_dec(&group->num_marks);
194
195 mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
196} 201}
197 202
198void fsnotify_destroy_mark(struct fsnotify_mark *mark, 203void fsnotify_destroy_mark(struct fsnotify_mark *mark,
199 struct fsnotify_group *group) 204 struct fsnotify_group *group)
200{ 205{
201 mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); 206 mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
202 fsnotify_destroy_mark_locked(mark, group); 207 fsnotify_detach_mark(mark);
203 mutex_unlock(&group->mark_mutex); 208 mutex_unlock(&group->mark_mutex);
209 fsnotify_free_mark(mark);
204} 210}
205 211
206/* 212void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock)
207 * Destroy all marks in the given list. The marks must be already detached from
208 * the original inode / vfsmount.
209 */
210void fsnotify_destroy_marks(struct list_head *to_free)
211{ 213{
212 struct fsnotify_mark *mark, *lmark; 214 struct fsnotify_mark *mark;
213 struct fsnotify_group *group;
214
215 list_for_each_entry_safe(mark, lmark, to_free, free_list) {
216 spin_lock(&mark->lock);
217 fsnotify_get_group(mark->group);
218 group = mark->group;
219 spin_unlock(&mark->lock);
220 215
221 fsnotify_destroy_mark(mark, group); 216 while (1) {
217 /*
218 * We have to be careful since we can race with e.g.
219 * fsnotify_clear_marks_by_group() and once we drop 'lock',
220 * mark can get removed from the obj_list and destroyed. But
221 * we are holding mark reference so mark cannot be freed and
222 * calling fsnotify_destroy_mark() more than once is fine.
223 */
224 spin_lock(lock);
225 if (hlist_empty(head)) {
226 spin_unlock(lock);
227 break;
228 }
229 mark = hlist_entry(head->first, struct fsnotify_mark, obj_list);
230 /*
231 * We don't update i_fsnotify_mask / mnt_fsnotify_mask here
232 * since inode / mount is going away anyway. So just remove
233 * mark from the list.
234 */
235 hlist_del_init_rcu(&mark->obj_list);
236 fsnotify_get_mark(mark);
237 spin_unlock(lock);
238 fsnotify_destroy_mark(mark, mark->group);
222 fsnotify_put_mark(mark); 239 fsnotify_put_mark(mark);
223 fsnotify_put_group(group);
224 } 240 }
225} 241}
226 242
@@ -332,7 +348,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
332 * inode->i_lock 348 * inode->i_lock
333 */ 349 */
334 spin_lock(&mark->lock); 350 spin_lock(&mark->lock);
335 mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE; 351 mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED;
336 352
337 fsnotify_get_group(group); 353 fsnotify_get_group(group);
338 mark->group = group; 354 mark->group = group;
@@ -438,8 +454,9 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
438 } 454 }
439 mark = list_first_entry(&to_free, struct fsnotify_mark, g_list); 455 mark = list_first_entry(&to_free, struct fsnotify_mark, g_list);
440 fsnotify_get_mark(mark); 456 fsnotify_get_mark(mark);
441 fsnotify_destroy_mark_locked(mark, group); 457 fsnotify_detach_mark(mark);
442 mutex_unlock(&group->mark_mutex); 458 mutex_unlock(&group->mark_mutex);
459 fsnotify_free_mark(mark);
443 fsnotify_put_mark(mark); 460 fsnotify_put_mark(mark);
444 } 461 }
445} 462}
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index 326b148e623c..a8fcab68faef 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -28,25 +28,6 @@
28 28
29#include <linux/fsnotify_backend.h> 29#include <linux/fsnotify_backend.h>
30#include "fsnotify.h" 30#include "fsnotify.h"
31#include "../mount.h"
32
33void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
34{
35 struct fsnotify_mark *mark;
36 struct hlist_node *n;
37 struct mount *m = real_mount(mnt);
38 LIST_HEAD(free_list);
39
40 spin_lock(&mnt->mnt_root->d_lock);
41 hlist_for_each_entry_safe(mark, n, &m->mnt_fsnotify_marks, obj_list) {
42 list_add(&mark->free_list, &free_list);
43 hlist_del_init_rcu(&mark->obj_list);
44 fsnotify_get_mark(mark);
45 }
46 spin_unlock(&mnt->mnt_root->d_lock);
47
48 fsnotify_destroy_marks(&free_list);
49}
50 31
51void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) 32void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
52{ 33{
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index c1128bcbeb5e..d1a853585b53 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2204,17 +2204,12 @@ get_ctx_vol_failed:
2204 return true; 2204 return true;
2205#ifdef NTFS_RW 2205#ifdef NTFS_RW
2206iput_usnjrnl_err_out: 2206iput_usnjrnl_err_out:
2207 if (vol->usnjrnl_j_ino) 2207 iput(vol->usnjrnl_j_ino);
2208 iput(vol->usnjrnl_j_ino); 2208 iput(vol->usnjrnl_max_ino);
2209 if (vol->usnjrnl_max_ino) 2209 iput(vol->usnjrnl_ino);
2210 iput(vol->usnjrnl_max_ino);
2211 if (vol->usnjrnl_ino)
2212 iput(vol->usnjrnl_ino);
2213iput_quota_err_out: 2210iput_quota_err_out:
2214 if (vol->quota_q_ino) 2211 iput(vol->quota_q_ino);
2215 iput(vol->quota_q_ino); 2212 iput(vol->quota_ino);
2216 if (vol->quota_ino)
2217 iput(vol->quota_ino);
2218 iput(vol->extend_ino); 2213 iput(vol->extend_ino);
2219#endif /* NTFS_RW */ 2214#endif /* NTFS_RW */
2220iput_sec_err_out: 2215iput_sec_err_out:
@@ -2223,8 +2218,7 @@ iput_root_err_out:
2223 iput(vol->root_ino); 2218 iput(vol->root_ino);
2224iput_logfile_err_out: 2219iput_logfile_err_out:
2225#ifdef NTFS_RW 2220#ifdef NTFS_RW
2226 if (vol->logfile_ino) 2221 iput(vol->logfile_ino);
2227 iput(vol->logfile_ino);
2228iput_vol_err_out: 2222iput_vol_err_out:
2229#endif /* NTFS_RW */ 2223#endif /* NTFS_RW */
2230 iput(vol->vol_ino); 2224 iput(vol->vol_ino);
@@ -2254,8 +2248,7 @@ iput_mftbmp_err_out:
2254 iput(vol->mftbmp_ino); 2248 iput(vol->mftbmp_ino);
2255iput_mirr_err_out: 2249iput_mirr_err_out:
2256#ifdef NTFS_RW 2250#ifdef NTFS_RW
2257 if (vol->mftmirr_ino) 2251 iput(vol->mftmirr_ino);
2258 iput(vol->mftmirr_ino);
2259#endif /* NTFS_RW */ 2252#endif /* NTFS_RW */
2260 return false; 2253 return false;
2261} 2254}
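
The ntfs/super.c cleanup above relies on iput() being a no-op when handed a NULL inode, which is why the per-inode "if" guards can be dropped. A minimal sketch of the simplified unwind pattern (hypothetical labels and variable names, not taken from the patch):

err_out:
	/* iput(NULL) is safe, so no "if (inode)" checks are needed. */
	iput(maybe_journal_ino);	/* may still be NULL on this path */
	iput(maybe_quota_ino);
	return false;
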
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index c58a1bcfda0f..0cdf497c91ef 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -284,7 +284,19 @@ int ocfs2_set_acl(handle_t *handle,
284 284
285int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type) 285int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
286{ 286{
287 return ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL); 287 struct buffer_head *bh = NULL;
288 int status = 0;
289
290 status = ocfs2_inode_lock(inode, &bh, 1);
291 if (status < 0) {
292 if (status != -ENOENT)
293 mlog_errno(status);
294 return status;
295 }
296 status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL);
297 ocfs2_inode_unlock(inode, 1);
298 brelse(bh);
299 return status;
288} 300}
289 301
290struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) 302struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
@@ -292,19 +304,21 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
292 struct ocfs2_super *osb; 304 struct ocfs2_super *osb;
293 struct buffer_head *di_bh = NULL; 305 struct buffer_head *di_bh = NULL;
294 struct posix_acl *acl; 306 struct posix_acl *acl;
295 int ret = -EAGAIN; 307 int ret;
296 308
297 osb = OCFS2_SB(inode->i_sb); 309 osb = OCFS2_SB(inode->i_sb);
298 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) 310 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
299 return NULL; 311 return NULL;
300 312 ret = ocfs2_inode_lock(inode, &di_bh, 0);
301 ret = ocfs2_read_inode_block(inode, &di_bh); 313 if (ret < 0) {
302 if (ret < 0) 314 if (ret != -ENOENT)
315 mlog_errno(ret);
303 return ERR_PTR(ret); 316 return ERR_PTR(ret);
317 }
304 318
305 acl = ocfs2_get_acl_nolock(inode, type, di_bh); 319 acl = ocfs2_get_acl_nolock(inode, type, di_bh);
306 320
321 ocfs2_inode_unlock(inode, 0);
307 brelse(di_bh); 322 brelse(di_bh);
308
309 return acl; 323 return acl;
310} 324}
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 5997c00a1515..86181d6526dc 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -908,32 +908,30 @@ static int ocfs2_validate_extent_block(struct super_block *sb,
908 */ 908 */
909 909
910 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 910 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
911 ocfs2_error(sb, 911 rc = ocfs2_error(sb,
912 "Extent block #%llu has bad signature %.*s", 912 "Extent block #%llu has bad signature %.*s\n",
913 (unsigned long long)bh->b_blocknr, 7, 913 (unsigned long long)bh->b_blocknr, 7,
914 eb->h_signature); 914 eb->h_signature);
915 return -EINVAL; 915 goto bail;
916 } 916 }
917 917
918 if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) { 918 if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
919 ocfs2_error(sb, 919 rc = ocfs2_error(sb,
920 "Extent block #%llu has an invalid h_blkno " 920 "Extent block #%llu has an invalid h_blkno of %llu\n",
921 "of %llu", 921 (unsigned long long)bh->b_blocknr,
922 (unsigned long long)bh->b_blocknr, 922 (unsigned long long)le64_to_cpu(eb->h_blkno));
923 (unsigned long long)le64_to_cpu(eb->h_blkno)); 923 goto bail;
924 return -EINVAL;
925 } 924 }
926 925
927 if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) { 926 if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
928 ocfs2_error(sb, 927 rc = ocfs2_error(sb,
929 "Extent block #%llu has an invalid " 928 "Extent block #%llu has an invalid h_fs_generation of #%u\n",
930 "h_fs_generation of #%u", 929 (unsigned long long)bh->b_blocknr,
931 (unsigned long long)bh->b_blocknr, 930 le32_to_cpu(eb->h_fs_generation));
932 le32_to_cpu(eb->h_fs_generation)); 931 goto bail;
933 return -EINVAL;
934 } 932 }
935 933bail:
936 return 0; 934 return rc;
937} 935}
938 936
939int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno, 937int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
@@ -1446,8 +1444,7 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
1446 while(le16_to_cpu(el->l_tree_depth) > 1) { 1444 while(le16_to_cpu(el->l_tree_depth) > 1) {
1447 if (le16_to_cpu(el->l_next_free_rec) == 0) { 1445 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1448 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), 1446 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
1449 "Owner %llu has empty " 1447 "Owner %llu has empty extent list (next_free_rec == 0)\n",
1450 "extent list (next_free_rec == 0)",
1451 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); 1448 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
1452 status = -EIO; 1449 status = -EIO;
1453 goto bail; 1450 goto bail;
@@ -1456,9 +1453,7 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
1456 blkno = le64_to_cpu(el->l_recs[i].e_blkno); 1453 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1457 if (!blkno) { 1454 if (!blkno) {
1458 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), 1455 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
1459 "Owner %llu has extent " 1456 "Owner %llu has extent list where extent # %d has no physical block start\n",
1460 "list where extent # %d has no physical "
1461 "block start",
1462 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i); 1457 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i);
1463 status = -EIO; 1458 status = -EIO;
1464 goto bail; 1459 goto bail;
@@ -1788,8 +1783,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
1788 while (el->l_tree_depth) { 1783 while (el->l_tree_depth) {
1789 if (le16_to_cpu(el->l_next_free_rec) == 0) { 1784 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1790 ocfs2_error(ocfs2_metadata_cache_get_super(ci), 1785 ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1791 "Owner %llu has empty extent list at " 1786 "Owner %llu has empty extent list at depth %u\n",
1792 "depth %u\n",
1793 (unsigned long long)ocfs2_metadata_cache_owner(ci), 1787 (unsigned long long)ocfs2_metadata_cache_owner(ci),
1794 le16_to_cpu(el->l_tree_depth)); 1788 le16_to_cpu(el->l_tree_depth));
1795 ret = -EROFS; 1789 ret = -EROFS;
@@ -1814,8 +1808,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
1814 blkno = le64_to_cpu(el->l_recs[i].e_blkno); 1808 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1815 if (blkno == 0) { 1809 if (blkno == 0) {
1816 ocfs2_error(ocfs2_metadata_cache_get_super(ci), 1810 ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1817 "Owner %llu has bad blkno in extent list " 1811 "Owner %llu has bad blkno in extent list at depth %u (index %d)\n",
1818 "at depth %u (index %d)\n",
1819 (unsigned long long)ocfs2_metadata_cache_owner(ci), 1812 (unsigned long long)ocfs2_metadata_cache_owner(ci),
1820 le16_to_cpu(el->l_tree_depth), i); 1813 le16_to_cpu(el->l_tree_depth), i);
1821 ret = -EROFS; 1814 ret = -EROFS;
@@ -1836,8 +1829,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
1836 if (le16_to_cpu(el->l_next_free_rec) > 1829 if (le16_to_cpu(el->l_next_free_rec) >
1837 le16_to_cpu(el->l_count)) { 1830 le16_to_cpu(el->l_count)) {
1838 ocfs2_error(ocfs2_metadata_cache_get_super(ci), 1831 ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1839 "Owner %llu has bad count in extent list " 1832 "Owner %llu has bad count in extent list at block %llu (next free=%u, count=%u)\n",
1840 "at block %llu (next free=%u, count=%u)\n",
1841 (unsigned long long)ocfs2_metadata_cache_owner(ci), 1833 (unsigned long long)ocfs2_metadata_cache_owner(ci),
1842 (unsigned long long)bh->b_blocknr, 1834 (unsigned long long)bh->b_blocknr,
1843 le16_to_cpu(el->l_next_free_rec), 1835 le16_to_cpu(el->l_next_free_rec),
@@ -2116,8 +2108,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle,
2116 2108
2117 if (left_el->l_next_free_rec != left_el->l_count) { 2109 if (left_el->l_next_free_rec != left_el->l_count) {
2118 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), 2110 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
2119 "Inode %llu has non-full interior leaf node %llu" 2111 "Inode %llu has non-full interior leaf node %llu (next free = %u)\n",
2120 "(next free = %u)",
2121 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), 2112 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2122 (unsigned long long)left_leaf_bh->b_blocknr, 2113 (unsigned long long)left_leaf_bh->b_blocknr,
2123 le16_to_cpu(left_el->l_next_free_rec)); 2114 le16_to_cpu(left_el->l_next_free_rec));
@@ -2256,8 +2247,7 @@ int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
2256 * If we got here, we never found a valid node where 2247 * If we got here, we never found a valid node where
2257 * the tree indicated one should be. 2248 * the tree indicated one should be.
2258 */ 2249 */
2259 ocfs2_error(sb, 2250 ocfs2_error(sb, "Invalid extent tree at extent block %llu\n",
2260 "Invalid extent tree at extent block %llu\n",
2261 (unsigned long long)blkno); 2251 (unsigned long long)blkno);
2262 ret = -EROFS; 2252 ret = -EROFS;
2263 goto out; 2253 goto out;
@@ -2872,8 +2862,7 @@ int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2872 * If we got here, we never found a valid node where 2862 * If we got here, we never found a valid node where
2873 * the tree indicated one should be. 2863 * the tree indicated one should be.
2874 */ 2864 */
2875 ocfs2_error(sb, 2865 ocfs2_error(sb, "Invalid extent tree at extent block %llu\n",
2876 "Invalid extent tree at extent block %llu\n",
2877 (unsigned long long)blkno); 2866 (unsigned long long)blkno);
2878 ret = -EROFS; 2867 ret = -EROFS;
2879 goto out; 2868 goto out;
@@ -3131,6 +3120,30 @@ out:
3131 return ret; 3120 return ret;
3132} 3121}
3133 3122
3123static int ocfs2_remove_rightmost_empty_extent(struct ocfs2_super *osb,
3124 struct ocfs2_extent_tree *et,
3125 struct ocfs2_path *path,
3126 struct ocfs2_cached_dealloc_ctxt *dealloc)
3127{
3128 handle_t *handle;
3129 int ret;
3130 int credits = path->p_tree_depth * 2 + 1;
3131
3132 handle = ocfs2_start_trans(osb, credits);
3133 if (IS_ERR(handle)) {
3134 ret = PTR_ERR(handle);
3135 mlog_errno(ret);
3136 return ret;
3137 }
3138
3139 ret = ocfs2_remove_rightmost_path(handle, et, path, dealloc);
3140 if (ret)
3141 mlog_errno(ret);
3142
3143 ocfs2_commit_trans(osb, handle);
3144 return ret;
3145}
3146
3134/* 3147/*
3135 * Left rotation of btree records. 3148 * Left rotation of btree records.
3136 * 3149 *
@@ -3200,7 +3213,7 @@ rightmost_no_delete:
3200 if (le16_to_cpu(el->l_next_free_rec) == 0) { 3213 if (le16_to_cpu(el->l_next_free_rec) == 0) {
3201 ret = -EIO; 3214 ret = -EIO;
3202 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), 3215 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
3203 "Owner %llu has empty extent block at %llu", 3216 "Owner %llu has empty extent block at %llu\n",
3204 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), 3217 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
3205 (unsigned long long)le64_to_cpu(eb->h_blkno)); 3218 (unsigned long long)le64_to_cpu(eb->h_blkno));
3206 goto out; 3219 goto out;
@@ -3930,7 +3943,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle,
3930 next_free = le16_to_cpu(el->l_next_free_rec); 3943 next_free = le16_to_cpu(el->l_next_free_rec);
3931 if (next_free == 0) { 3944 if (next_free == 0) {
3932 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), 3945 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
3933 "Owner %llu has a bad extent list", 3946 "Owner %llu has a bad extent list\n",
3934 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); 3947 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
3935 ret = -EIO; 3948 ret = -EIO;
3936 return; 3949 return;
@@ -4355,10 +4368,7 @@ static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
4355 bh = path_leaf_bh(left_path); 4368 bh = path_leaf_bh(left_path);
4356 eb = (struct ocfs2_extent_block *)bh->b_data; 4369 eb = (struct ocfs2_extent_block *)bh->b_data;
4357 ocfs2_error(sb, 4370 ocfs2_error(sb,
4358 "Extent block #%llu has an " 4371 "Extent block #%llu has an invalid l_next_free_rec of %d. It should have matched the l_count of %d\n",
4359 "invalid l_next_free_rec of "
4360 "%d. It should have "
4361 "matched the l_count of %d",
4362 (unsigned long long)le64_to_cpu(eb->h_blkno), 4372 (unsigned long long)le64_to_cpu(eb->h_blkno),
4363 le16_to_cpu(new_el->l_next_free_rec), 4373 le16_to_cpu(new_el->l_next_free_rec),
4364 le16_to_cpu(new_el->l_count)); 4374 le16_to_cpu(new_el->l_count));
@@ -4413,8 +4423,7 @@ static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
4413 bh = path_leaf_bh(right_path); 4423 bh = path_leaf_bh(right_path);
4414 eb = (struct ocfs2_extent_block *)bh->b_data; 4424 eb = (struct ocfs2_extent_block *)bh->b_data;
4415 ocfs2_error(sb, 4425 ocfs2_error(sb,
4416 "Extent block #%llu has an " 4426 "Extent block #%llu has an invalid l_next_free_rec of %d\n",
4417 "invalid l_next_free_rec of %d",
4418 (unsigned long long)le64_to_cpu(eb->h_blkno), 4427 (unsigned long long)le64_to_cpu(eb->h_blkno),
4419 le16_to_cpu(new_el->l_next_free_rec)); 4428 le16_to_cpu(new_el->l_next_free_rec));
4420 status = -EINVAL; 4429 status = -EINVAL;
@@ -4970,10 +4979,9 @@ leftright:
4970 split_index = ocfs2_search_extent_list(el, cpos); 4979 split_index = ocfs2_search_extent_list(el, cpos);
4971 if (split_index == -1) { 4980 if (split_index == -1) {
4972 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), 4981 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
4973 "Owner %llu has an extent at cpos %u " 4982 "Owner %llu has an extent at cpos %u which can no longer be found\n",
4974 "which can no longer be found.\n", 4983 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
4975 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), 4984 cpos);
4976 cpos);
4977 ret = -EROFS; 4985 ret = -EROFS;
4978 goto out; 4986 goto out;
4979 } 4987 }
@@ -5158,10 +5166,9 @@ int ocfs2_change_extent_flag(handle_t *handle,
5158 index = ocfs2_search_extent_list(el, cpos); 5166 index = ocfs2_search_extent_list(el, cpos);
5159 if (index == -1) { 5167 if (index == -1) {
5160 ocfs2_error(sb, 5168 ocfs2_error(sb,
5161 "Owner %llu has an extent at cpos %u which can no " 5169 "Owner %llu has an extent at cpos %u which can no longer be found\n",
5162 "longer be found.\n", 5170 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5163 (unsigned long long) 5171 cpos);
5164 ocfs2_metadata_cache_owner(et->et_ci), cpos);
5165 ret = -EROFS; 5172 ret = -EROFS;
5166 goto out; 5173 goto out;
5167 } 5174 }
@@ -5228,9 +5235,7 @@ int ocfs2_mark_extent_written(struct inode *inode,
5228 cpos, len, phys); 5235 cpos, len, phys);
5229 5236
5230 if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) { 5237 if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
5231 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents " 5238 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents that are being written to, but the feature bit is not set in the super block\n",
5232 "that are being written to, but the feature bit "
5233 "is not set in the super block.",
5234 (unsigned long long)OCFS2_I(inode)->ip_blkno); 5239 (unsigned long long)OCFS2_I(inode)->ip_blkno);
5235 ret = -EROFS; 5240 ret = -EROFS;
5236 goto out; 5241 goto out;
@@ -5514,8 +5519,7 @@ int ocfs2_remove_extent(handle_t *handle,
5514 index = ocfs2_search_extent_list(el, cpos); 5519 index = ocfs2_search_extent_list(el, cpos);
5515 if (index == -1) { 5520 if (index == -1) {
5516 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), 5521 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5517 "Owner %llu has an extent at cpos %u which can no " 5522 "Owner %llu has an extent at cpos %u which can no longer be found\n",
5518 "longer be found.\n",
5519 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), 5523 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5520 cpos); 5524 cpos);
5521 ret = -EROFS; 5525 ret = -EROFS;
@@ -5580,7 +5584,7 @@ int ocfs2_remove_extent(handle_t *handle,
5580 index = ocfs2_search_extent_list(el, cpos); 5584 index = ocfs2_search_extent_list(el, cpos);
5581 if (index == -1) { 5585 if (index == -1) {
5582 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), 5586 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5583 "Owner %llu: split at cpos %u lost record.", 5587 "Owner %llu: split at cpos %u lost record\n",
5584 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), 5588 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5585 cpos); 5589 cpos);
5586 ret = -EROFS; 5590 ret = -EROFS;
@@ -5596,8 +5600,7 @@ int ocfs2_remove_extent(handle_t *handle,
5596 ocfs2_rec_clusters(el, rec); 5600 ocfs2_rec_clusters(el, rec);
5597 if (rec_range != trunc_range) { 5601 if (rec_range != trunc_range) {
5598 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), 5602 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5599 "Owner %llu: error after split at cpos %u" 5603 "Owner %llu: error after split at cpos %u trunc len %u, existing record is (%u,%u)\n",
5600 "trunc len %u, existing record is (%u,%u)",
5601 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), 5604 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5602 cpos, len, le32_to_cpu(rec->e_cpos), 5605 cpos, len, le32_to_cpu(rec->e_cpos),
5603 ocfs2_rec_clusters(el, rec)); 5606 ocfs2_rec_clusters(el, rec));
@@ -6175,7 +6178,7 @@ bail:
6175 iput(tl_inode); 6178 iput(tl_inode);
6176 brelse(tl_bh); 6179 brelse(tl_bh);
6177 6180
6178 if (status < 0 && (*tl_copy)) { 6181 if (status < 0) {
6179 kfree(*tl_copy); 6182 kfree(*tl_copy);
6180 *tl_copy = NULL; 6183 *tl_copy = NULL;
6181 mlog_errno(status); 6184 mlog_errno(status);
@@ -7108,15 +7111,23 @@ start:
7108 * to check it up here before changing the tree. 7111 * to check it up here before changing the tree.
7109 */ 7112 */
7110 if (root_el->l_tree_depth && rec->e_int_clusters == 0) { 7113 if (root_el->l_tree_depth && rec->e_int_clusters == 0) {
7111 ocfs2_error(inode->i_sb, "Inode %lu has an empty " 7114 mlog(ML_ERROR, "Inode %lu has an empty "
7112 "extent record, depth %u\n", inode->i_ino, 7115 "extent record, depth %u\n", inode->i_ino,
7113 le16_to_cpu(root_el->l_tree_depth)); 7116 le16_to_cpu(root_el->l_tree_depth));
7114 status = -EROFS; 7117 status = ocfs2_remove_rightmost_empty_extent(osb,
7115 goto bail; 7118 &et, path, &dealloc);
7119 if (status) {
7120 mlog_errno(status);
7121 goto bail;
7122 }
7123
7124 ocfs2_reinit_path(path, 1);
7125 goto start;
7126 } else {
7127 trunc_cpos = le32_to_cpu(rec->e_cpos);
7128 trunc_len = 0;
7129 blkno = 0;
7116 } 7130 }
7117 trunc_cpos = le32_to_cpu(rec->e_cpos);
7118 trunc_len = 0;
7119 blkno = 0;
7120 } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) { 7131 } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) {
7121 /* 7132 /*
7122 * Truncate entire record. 7133 * Truncate entire record.
@@ -7204,8 +7215,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
7204 !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) || 7215 !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
7205 !ocfs2_supports_inline_data(osb)) { 7216 !ocfs2_supports_inline_data(osb)) {
7206 ocfs2_error(inode->i_sb, 7217 ocfs2_error(inode->i_sb,
7207 "Inline data flags for inode %llu don't agree! " 7218 "Inline data flags for inode %llu don't agree! Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
7208 "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
7209 (unsigned long long)OCFS2_I(inode)->ip_blkno, 7219 (unsigned long long)OCFS2_I(inode)->ip_blkno,
7210 le16_to_cpu(di->i_dyn_features), 7220 le16_to_cpu(di->i_dyn_features),
7211 OCFS2_I(inode)->ip_dyn_features, 7221 OCFS2_I(inode)->ip_dyn_features,
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 0f5fd9db8194..64b11d90eca6 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -227,7 +227,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
227 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 227 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
228 228
229 if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) { 229 if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) {
230 ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag", 230 ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag\n",
231 (unsigned long long)OCFS2_I(inode)->ip_blkno); 231 (unsigned long long)OCFS2_I(inode)->ip_blkno);
232 return -EROFS; 232 return -EROFS;
233 } 233 }
@@ -237,7 +237,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
237 if (size > PAGE_CACHE_SIZE || 237 if (size > PAGE_CACHE_SIZE ||
238 size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) { 238 size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) {
239 ocfs2_error(inode->i_sb, 239 ocfs2_error(inode->i_sb,
240 "Inode %llu has with inline data has bad size: %Lu", 240 "Inode %llu has with inline data has bad size: %Lu\n",
241 (unsigned long long)OCFS2_I(inode)->ip_blkno, 241 (unsigned long long)OCFS2_I(inode)->ip_blkno,
242 (unsigned long long)size); 242 (unsigned long long)size);
243 return -EROFS; 243 return -EROFS;
@@ -533,10 +533,14 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
533 533
534 inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); 534 inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
535 535
536 down_read(&OCFS2_I(inode)->ip_alloc_sem);
537
536 /* This figures out the size of the next contiguous block, and 538 /* This figures out the size of the next contiguous block, and
537 * our logical offset */ 539 * our logical offset */
538 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, 540 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
539 &contig_blocks, &ext_flags); 541 &contig_blocks, &ext_flags);
542 up_read(&OCFS2_I(inode)->ip_alloc_sem);
543
540 if (ret) { 544 if (ret) {
541 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", 545 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
542 (unsigned long long)iblock); 546 (unsigned long long)iblock);
@@ -557,6 +561,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
557 561
558 alloc_locked = 1; 562 alloc_locked = 1;
559 563
564 down_write(&OCFS2_I(inode)->ip_alloc_sem);
565
560 /* fill hole, allocate blocks can't be larger than the size 566 /* fill hole, allocate blocks can't be larger than the size
561 * of the hole */ 567 * of the hole */
562 clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len); 568 clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
@@ -569,6 +575,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
569 ret = ocfs2_extend_allocation(inode, cpos, 575 ret = ocfs2_extend_allocation(inode, cpos,
570 clusters_to_alloc, 0); 576 clusters_to_alloc, 0);
571 if (ret < 0) { 577 if (ret < 0) {
578 up_write(&OCFS2_I(inode)->ip_alloc_sem);
572 mlog_errno(ret); 579 mlog_errno(ret);
573 goto bail; 580 goto bail;
574 } 581 }
@@ -576,11 +583,13 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
576 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, 583 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
577 &contig_blocks, &ext_flags); 584 &contig_blocks, &ext_flags);
578 if (ret < 0) { 585 if (ret < 0) {
586 up_write(&OCFS2_I(inode)->ip_alloc_sem);
579 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", 587 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
580 (unsigned long long)iblock); 588 (unsigned long long)iblock);
581 ret = -EIO; 589 ret = -EIO;
582 goto bail; 590 goto bail;
583 } 591 }
592 up_write(&OCFS2_I(inode)->ip_alloc_sem);
584 } 593 }
585 594
586 /* 595 /*
@@ -627,10 +636,13 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
627 mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); 636 mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
628 } 637 }
629 638
630 ocfs2_iocb_clear_rw_locked(iocb); 639 /* Let rw unlock to be done later to protect append direct io write */
640 if (offset + bytes <= i_size_read(inode)) {
641 ocfs2_iocb_clear_rw_locked(iocb);
631 642
632 level = ocfs2_iocb_rw_locked_level(iocb); 643 level = ocfs2_iocb_rw_locked_level(iocb);
633 ocfs2_rw_unlock(inode, level); 644 ocfs2_rw_unlock(inode, level);
645 }
634} 646}
635 647
636static int ocfs2_releasepage(struct page *page, gfp_t wait) 648static int ocfs2_releasepage(struct page *page, gfp_t wait)
@@ -832,12 +844,17 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
832 844
833 /* zeroing out the previously allocated cluster tail 845 /* zeroing out the previously allocated cluster tail
834 * that but not zeroed */ 846 * that but not zeroed */
835 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 847 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
848 down_read(&OCFS2_I(inode)->ip_alloc_sem);
836 ret = ocfs2_direct_IO_zero_extend(osb, inode, offset, 849 ret = ocfs2_direct_IO_zero_extend(osb, inode, offset,
837 zero_len_tail, cluster_align_tail); 850 zero_len_tail, cluster_align_tail);
838 else 851 up_read(&OCFS2_I(inode)->ip_alloc_sem);
852 } else {
853 down_write(&OCFS2_I(inode)->ip_alloc_sem);
839 ret = ocfs2_direct_IO_extend_no_holes(osb, inode, 854 ret = ocfs2_direct_IO_extend_no_holes(osb, inode,
840 offset); 855 offset);
856 up_write(&OCFS2_I(inode)->ip_alloc_sem);
857 }
841 if (ret < 0) { 858 if (ret < 0) {
842 mlog_errno(ret); 859 mlog_errno(ret);
843 ocfs2_inode_unlock(inode, 1); 860 ocfs2_inode_unlock(inode, 1);
@@ -857,7 +874,8 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
857 written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, 874 written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
858 offset, ocfs2_direct_IO_get_blocks, 875 offset, ocfs2_direct_IO_get_blocks,
859 ocfs2_dio_end_io, NULL, 0); 876 ocfs2_dio_end_io, NULL, 0);
860 if (unlikely(written < 0)) { 877 /* overwrite aio may return -EIOCBQUEUED, and it is not an error */
878 if ((written < 0) && (written != -EIOCBQUEUED)) {
861 loff_t i_size = i_size_read(inode); 879 loff_t i_size = i_size_read(inode);
862 880
863 if (offset + count > i_size) { 881 if (offset + count > i_size) {
@@ -876,12 +894,14 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
876 894
877 ocfs2_inode_unlock(inode, 1); 895 ocfs2_inode_unlock(inode, 1);
878 brelse(di_bh); 896 brelse(di_bh);
897 di_bh = NULL;
879 goto clean_orphan; 898 goto clean_orphan;
880 } 899 }
881 } 900 }
882 901
883 ocfs2_inode_unlock(inode, 1); 902 ocfs2_inode_unlock(inode, 1);
884 brelse(di_bh); 903 brelse(di_bh);
904 di_bh = NULL;
885 905
886 ret = jbd2_journal_force_commit(journal); 906 ret = jbd2_journal_force_commit(journal);
887 if (ret < 0) 907 if (ret < 0)
@@ -936,10 +956,12 @@ clean_orphan:
936 if (tmp_ret < 0) { 956 if (tmp_ret < 0) {
937 ret = tmp_ret; 957 ret = tmp_ret;
938 mlog_errno(ret); 958 mlog_errno(ret);
959 brelse(di_bh);
939 goto out; 960 goto out;
940 } 961 }
941 962
942 ocfs2_inode_unlock(inode, 1); 963 ocfs2_inode_unlock(inode, 1);
964 brelse(di_bh);
943 965
944 tmp_ret = jbd2_journal_force_commit(journal); 966 tmp_ret = jbd2_journal_force_commit(journal);
945 if (tmp_ret < 0) { 967 if (tmp_ret < 0) {
@@ -2185,10 +2207,7 @@ try_again:
2185 if (ret) 2207 if (ret)
2186 goto out_commit; 2208 goto out_commit;
2187 } 2209 }
2188 /* 2210
2189 * We don't want this to fail in ocfs2_write_end(), so do it
2190 * here.
2191 */
2192 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, 2211 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
2193 OCFS2_JOURNAL_ACCESS_WRITE); 2212 OCFS2_JOURNAL_ACCESS_WRITE);
2194 if (ret) { 2213 if (ret) {
@@ -2345,7 +2364,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
2345 loff_t pos, unsigned len, unsigned copied, 2364 loff_t pos, unsigned len, unsigned copied,
2346 struct page *page, void *fsdata) 2365 struct page *page, void *fsdata)
2347{ 2366{
2348 int i; 2367 int i, ret;
2349 unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); 2368 unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
2350 struct inode *inode = mapping->host; 2369 struct inode *inode = mapping->host;
2351 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2370 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -2354,6 +2373,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
2354 handle_t *handle = wc->w_handle; 2373 handle_t *handle = wc->w_handle;
2355 struct page *tmppage; 2374 struct page *tmppage;
2356 2375
2376 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
2377 OCFS2_JOURNAL_ACCESS_WRITE);
2378 if (ret) {
2379 copied = ret;
2380 mlog_errno(ret);
2381 goto out;
2382 }
2383
2357 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 2384 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
2358 ocfs2_write_end_inline(inode, pos, len, &copied, di, wc); 2385 ocfs2_write_end_inline(inode, pos, len, &copied, di, wc);
2359 goto out_write_size; 2386 goto out_write_size;
@@ -2409,6 +2436,7 @@ out_write_size:
2409 ocfs2_update_inode_fsync_trans(handle, inode, 1); 2436 ocfs2_update_inode_fsync_trans(handle, inode, 1);
2410 ocfs2_journal_dirty(handle, wc->w_di_bh); 2437 ocfs2_journal_dirty(handle, wc->w_di_bh);
2411 2438
2439out:
2412 /* unlock pages before dealloc since it needs acquiring j_trans_barrier 2440 /* unlock pages before dealloc since it needs acquiring j_trans_barrier
2413 * lock, or it will cause a deadlock since journal commit threads holds 2441 * lock, or it will cause a deadlock since journal commit threads holds
2414 * this lock and will ask for the page lock when flushing the data. 2442 * this lock and will ask for the page lock when flushing the data.
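
The aops.c hunks above bracket the direct-IO extent-map lookups and the hole-filling allocation with ip_alloc_sem. A condensed sketch of the locking pattern, simplified from the hunks and assuming the usual ocfs2 inode context:

	/* Shared lock for a read-only extent map lookup. */
	down_read(&OCFS2_I(inode)->ip_alloc_sem);
	ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
					  &contig_blocks, &ext_flags);
	up_read(&OCFS2_I(inode)->ip_alloc_sem);

	/* Exclusive lock while extending the allocation to fill a hole. */
	down_write(&OCFS2_I(inode)->ip_alloc_sem);
	ret = ocfs2_extend_allocation(inode, cpos, clusters_to_alloc, 0);
	up_write(&OCFS2_I(inode)->ip_alloc_sem);
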
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 1edcb141f639..fe50ded1b4ce 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -316,6 +316,12 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
316 bh = bhs[i]; 316 bh = bhs[i];
317 317
318 if (!(flags & OCFS2_BH_READAHEAD)) { 318 if (!(flags & OCFS2_BH_READAHEAD)) {
319 if (status) {
320 /* Clear the rest of the buffers on error */
321 put_bh(bh);
322 bhs[i] = NULL;
323 continue;
324 }
319 /* We know this can't have changed as we hold the 325 /* We know this can't have changed as we hold the
320 * owner sem. Avoid doing any work on the bh if the 326 * owner sem. Avoid doing any work on the bh if the
321 * journal has it. */ 327 * journal has it. */
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 140de3c93d2e..fa15debcc02b 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -36,7 +36,7 @@
36#include <linux/debugfs.h> 36#include <linux/debugfs.h>
37#include <linux/slab.h> 37#include <linux/slab.h>
38#include <linux/bitmap.h> 38#include <linux/bitmap.h>
39 39#include <linux/ktime.h>
40#include "heartbeat.h" 40#include "heartbeat.h"
41#include "tcp.h" 41#include "tcp.h"
42#include "nodemanager.h" 42#include "nodemanager.h"
@@ -1060,37 +1060,6 @@ bail:
1060 return ret; 1060 return ret;
1061} 1061}
1062 1062
1063/* Subtract b from a, storing the result in a. a *must* have a larger
1064 * value than b. */
1065static void o2hb_tv_subtract(struct timeval *a,
1066 struct timeval *b)
1067{
1068 /* just return 0 when a is after b */
1069 if (a->tv_sec < b->tv_sec ||
1070 (a->tv_sec == b->tv_sec && a->tv_usec < b->tv_usec)) {
1071 a->tv_sec = 0;
1072 a->tv_usec = 0;
1073 return;
1074 }
1075
1076 a->tv_sec -= b->tv_sec;
1077 a->tv_usec -= b->tv_usec;
1078 while ( a->tv_usec < 0 ) {
1079 a->tv_sec--;
1080 a->tv_usec += 1000000;
1081 }
1082}
1083
1084static unsigned int o2hb_elapsed_msecs(struct timeval *start,
1085 struct timeval *end)
1086{
1087 struct timeval res = *end;
1088
1089 o2hb_tv_subtract(&res, start);
1090
1091 return res.tv_sec * 1000 + res.tv_usec / 1000;
1092}
1093
1094/* 1063/*
1095 * we ride the region ref that the region dir holds. before the region 1064 * we ride the region ref that the region dir holds. before the region
1096 * dir is removed and drops it ref it will wait to tear down this 1065 * dir is removed and drops it ref it will wait to tear down this
@@ -1101,7 +1070,7 @@ static int o2hb_thread(void *data)
1101 int i, ret; 1070 int i, ret;
1102 struct o2hb_region *reg = data; 1071 struct o2hb_region *reg = data;
1103 struct o2hb_bio_wait_ctxt write_wc; 1072 struct o2hb_bio_wait_ctxt write_wc;
1104 struct timeval before_hb, after_hb; 1073 ktime_t before_hb, after_hb;
1105 unsigned int elapsed_msec; 1074 unsigned int elapsed_msec;
1106 1075
1107 mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n"); 1076 mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n");
@@ -1118,18 +1087,18 @@ static int o2hb_thread(void *data)
1118 * hr_timeout_ms between disk writes. On busy systems 1087 * hr_timeout_ms between disk writes. On busy systems
1119 * this should result in a heartbeat which is less 1088 * this should result in a heartbeat which is less
1120 * likely to time itself out. */ 1089 * likely to time itself out. */
1121 do_gettimeofday(&before_hb); 1090 before_hb = ktime_get_real();
1122 1091
1123 ret = o2hb_do_disk_heartbeat(reg); 1092 ret = o2hb_do_disk_heartbeat(reg);
1124 1093
1125 do_gettimeofday(&after_hb); 1094 after_hb = ktime_get_real();
1126 elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); 1095
1096 elapsed_msec = (unsigned int)
1097 ktime_ms_delta(after_hb, before_hb);
1127 1098
1128 mlog(ML_HEARTBEAT, 1099 mlog(ML_HEARTBEAT,
1129 "start = %lu.%lu, end = %lu.%lu, msec = %u, ret = %d\n", 1100 "start = %lld, end = %lld, msec = %u, ret = %d\n",
1130 before_hb.tv_sec, (unsigned long) before_hb.tv_usec, 1101 before_hb.tv64, after_hb.tv64, elapsed_msec, ret);
1131 after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
1132 elapsed_msec, ret);
1133 1102
1134 if (!kthread_should_stop() && 1103 if (!kthread_should_stop() &&
1135 elapsed_msec < reg->hr_timeout_ms) { 1104 elapsed_msec < reg->hr_timeout_ms) {
@@ -1619,17 +1588,13 @@ static int o2hb_map_slot_data(struct o2hb_region *reg)
1619 struct o2hb_disk_slot *slot; 1588 struct o2hb_disk_slot *slot;
1620 1589
1621 reg->hr_tmp_block = kmalloc(reg->hr_block_bytes, GFP_KERNEL); 1590 reg->hr_tmp_block = kmalloc(reg->hr_block_bytes, GFP_KERNEL);
1622 if (reg->hr_tmp_block == NULL) { 1591 if (reg->hr_tmp_block == NULL)
1623 mlog_errno(-ENOMEM);
1624 return -ENOMEM; 1592 return -ENOMEM;
1625 }
1626 1593
1627 reg->hr_slots = kcalloc(reg->hr_blocks, 1594 reg->hr_slots = kcalloc(reg->hr_blocks,
1628 sizeof(struct o2hb_disk_slot), GFP_KERNEL); 1595 sizeof(struct o2hb_disk_slot), GFP_KERNEL);
1629 if (reg->hr_slots == NULL) { 1596 if (reg->hr_slots == NULL)
1630 mlog_errno(-ENOMEM);
1631 return -ENOMEM; 1597 return -ENOMEM;
1632 }
1633 1598
1634 for(i = 0; i < reg->hr_blocks; i++) { 1599 for(i = 0; i < reg->hr_blocks; i++) {
1635 slot = &reg->hr_slots[i]; 1600 slot = &reg->hr_slots[i];
@@ -1645,17 +1610,13 @@ static int o2hb_map_slot_data(struct o2hb_region *reg)
1645 1610
1646 reg->hr_slot_data = kcalloc(reg->hr_num_pages, sizeof(struct page *), 1611 reg->hr_slot_data = kcalloc(reg->hr_num_pages, sizeof(struct page *),
1647 GFP_KERNEL); 1612 GFP_KERNEL);
1648 if (!reg->hr_slot_data) { 1613 if (!reg->hr_slot_data)
1649 mlog_errno(-ENOMEM);
1650 return -ENOMEM; 1614 return -ENOMEM;
1651 }
1652 1615
1653 for(i = 0; i < reg->hr_num_pages; i++) { 1616 for(i = 0; i < reg->hr_num_pages; i++) {
1654 page = alloc_page(GFP_KERNEL); 1617 page = alloc_page(GFP_KERNEL);
1655 if (!page) { 1618 if (!page)
1656 mlog_errno(-ENOMEM);
1657 return -ENOMEM; 1619 return -ENOMEM;
1658 }
1659 1620
1660 reg->hr_slot_data[i] = page; 1621 reg->hr_slot_data[i] = page;
1661 1622
@@ -1687,10 +1648,8 @@ static int o2hb_populate_slot_data(struct o2hb_region *reg)
1687 struct o2hb_disk_heartbeat_block *hb_block; 1648 struct o2hb_disk_heartbeat_block *hb_block;
1688 1649
1689 ret = o2hb_read_slots(reg, reg->hr_blocks); 1650 ret = o2hb_read_slots(reg, reg->hr_blocks);
1690 if (ret) { 1651 if (ret)
1691 mlog_errno(ret);
1692 goto out; 1652 goto out;
1693 }
1694 1653
1695 /* We only want to get an idea of the values initially in each 1654 /* We only want to get an idea of the values initially in each
1696 * slot, so we do no verification - o2hb_check_slot will 1655 * slot, so we do no verification - o2hb_check_slot will
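
The heartbeat.c conversion above replaces the open-coded timeval subtraction with ktime helpers. A minimal standalone sketch of the elapsed-milliseconds measurement (hypothetical function name; assumes <linux/ktime.h> is available, as the new include above provides):

	static unsigned int o2hb_elapsed_msecs_sketch(void)
	{
		ktime_t before_hb, after_hb;

		before_hb = ktime_get_real();
		/* ... one timed disk heartbeat round ... */
		after_hb = ktime_get_real();

		/* ktime_ms_delta() returns the signed difference in ms. */
		return (unsigned int)ktime_ms_delta(after_hb, before_hb);
	}
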
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 02878a83f0b4..ffecf89c8c1c 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -480,33 +480,26 @@ static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh)
480 480
481 trailer = ocfs2_trailer_from_bh(bh, dir->i_sb); 481 trailer = ocfs2_trailer_from_bh(bh, dir->i_sb);
482 if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) { 482 if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
483 rc = -EINVAL; 483 rc = ocfs2_error(dir->i_sb,
484 ocfs2_error(dir->i_sb, 484 "Invalid dirblock #%llu: signature = %.*s\n",
485 "Invalid dirblock #%llu: " 485 (unsigned long long)bh->b_blocknr, 7,
486 "signature = %.*s\n", 486 trailer->db_signature);
487 (unsigned long long)bh->b_blocknr, 7,
488 trailer->db_signature);
489 goto out; 487 goto out;
490 } 488 }
491 if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) { 489 if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) {
492 rc = -EINVAL; 490 rc = ocfs2_error(dir->i_sb,
493 ocfs2_error(dir->i_sb, 491 "Directory block #%llu has an invalid db_blkno of %llu\n",
494 "Directory block #%llu has an invalid " 492 (unsigned long long)bh->b_blocknr,
495 "db_blkno of %llu", 493 (unsigned long long)le64_to_cpu(trailer->db_blkno));
496 (unsigned long long)bh->b_blocknr,
497 (unsigned long long)le64_to_cpu(trailer->db_blkno));
498 goto out; 494 goto out;
499 } 495 }
500 if (le64_to_cpu(trailer->db_parent_dinode) != 496 if (le64_to_cpu(trailer->db_parent_dinode) !=
501 OCFS2_I(dir)->ip_blkno) { 497 OCFS2_I(dir)->ip_blkno) {
502 rc = -EINVAL; 498 rc = ocfs2_error(dir->i_sb,
503 ocfs2_error(dir->i_sb, 499 "Directory block #%llu on dinode #%llu has an invalid parent_dinode of %llu\n",
504 "Directory block #%llu on dinode " 500 (unsigned long long)bh->b_blocknr,
505 "#%llu has an invalid parent_dinode " 501 (unsigned long long)OCFS2_I(dir)->ip_blkno,
506 "of %llu", 502 (unsigned long long)le64_to_cpu(trailer->db_blkno));
507 (unsigned long long)bh->b_blocknr,
508 (unsigned long long)OCFS2_I(dir)->ip_blkno,
509 (unsigned long long)le64_to_cpu(trailer->db_blkno));
510 goto out; 503 goto out;
511 } 504 }
512out: 505out:
@@ -604,14 +597,13 @@ static int ocfs2_validate_dx_root(struct super_block *sb,
604 } 597 }
605 598
606 if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) { 599 if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) {
607 ocfs2_error(sb, 600 ret = ocfs2_error(sb,
608 "Dir Index Root # %llu has bad signature %.*s", 601 "Dir Index Root # %llu has bad signature %.*s\n",
609 (unsigned long long)le64_to_cpu(dx_root->dr_blkno), 602 (unsigned long long)le64_to_cpu(dx_root->dr_blkno),
610 7, dx_root->dr_signature); 603 7, dx_root->dr_signature);
611 return -EINVAL;
612 } 604 }
613 605
614 return 0; 606 return ret;
615} 607}
616 608
617static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di, 609static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di,
@@ -648,12 +640,11 @@ static int ocfs2_validate_dx_leaf(struct super_block *sb,
648 } 640 }
649 641
650 if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) { 642 if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) {
651 ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s", 643 ret = ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s\n",
652 7, dx_leaf->dl_signature); 644 7, dx_leaf->dl_signature);
653 return -EROFS;
654 } 645 }
655 646
656 return 0; 647 return ret;
657} 648}
658 649
659static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno, 650static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno,
@@ -812,11 +803,10 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
812 el = &eb->h_list; 803 el = &eb->h_list;
813 804
814 if (el->l_tree_depth) { 805 if (el->l_tree_depth) {
815 ocfs2_error(inode->i_sb, 806 ret = ocfs2_error(inode->i_sb,
816 "Inode %lu has non zero tree depth in " 807 "Inode %lu has non zero tree depth in btree tree block %llu\n",
817 "btree tree block %llu\n", inode->i_ino, 808 inode->i_ino,
818 (unsigned long long)eb_bh->b_blocknr); 809 (unsigned long long)eb_bh->b_blocknr);
819 ret = -EROFS;
820 goto out; 810 goto out;
821 } 811 }
822 } 812 }
@@ -832,11 +822,11 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
832 } 822 }
833 823
834 if (!found) { 824 if (!found) {
835 ocfs2_error(inode->i_sb, "Inode %lu has bad extent " 825 ret = ocfs2_error(inode->i_sb,
836 "record (%u, %u, 0) in btree", inode->i_ino, 826 "Inode %lu has bad extent record (%u, %u, 0) in btree\n",
837 le32_to_cpu(rec->e_cpos), 827 inode->i_ino,
838 ocfs2_rec_clusters(el, rec)); 828 le32_to_cpu(rec->e_cpos),
839 ret = -EROFS; 829 ocfs2_rec_clusters(el, rec));
840 goto out; 830 goto out;
841 } 831 }
842 832
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 7df88a6dd626..6918f30d02cd 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1465,39 +1465,46 @@ static int dlm_request_join(struct dlm_ctxt *dlm,
1465 if (status == -ENOPROTOOPT) { 1465 if (status == -ENOPROTOOPT) {
1466 status = 0; 1466 status = 0;
1467 *response = JOIN_OK_NO_MAP; 1467 *response = JOIN_OK_NO_MAP;
1468 } else if (packet.code == JOIN_DISALLOW ||
1469 packet.code == JOIN_OK_NO_MAP) {
1470 *response = packet.code;
1471 } else if (packet.code == JOIN_PROTOCOL_MISMATCH) {
1472 mlog(ML_NOTICE,
1473 "This node requested DLM locking protocol %u.%u and "
1474 "filesystem locking protocol %u.%u. At least one of "
1475 "the protocol versions on node %d is not compatible, "
1476 "disconnecting\n",
1477 dlm->dlm_locking_proto.pv_major,
1478 dlm->dlm_locking_proto.pv_minor,
1479 dlm->fs_locking_proto.pv_major,
1480 dlm->fs_locking_proto.pv_minor,
1481 node);
1482 status = -EPROTO;
1483 *response = packet.code;
1484 } else if (packet.code == JOIN_OK) {
1485 *response = packet.code;
1486 /* Use the same locking protocol as the remote node */
1487 dlm->dlm_locking_proto.pv_minor = packet.dlm_minor;
1488 dlm->fs_locking_proto.pv_minor = packet.fs_minor;
1489 mlog(0,
1490 "Node %d responds JOIN_OK with DLM locking protocol "
1491 "%u.%u and fs locking protocol %u.%u\n",
1492 node,
1493 dlm->dlm_locking_proto.pv_major,
1494 dlm->dlm_locking_proto.pv_minor,
1495 dlm->fs_locking_proto.pv_major,
1496 dlm->fs_locking_proto.pv_minor);
1497 } else { 1468 } else {
1498 status = -EINVAL; 1469 *response = packet.code;
1499 mlog(ML_ERROR, "invalid response %d from node %u\n", 1470 switch (packet.code) {
1500 packet.code, node); 1471 case JOIN_DISALLOW:
1472 case JOIN_OK_NO_MAP:
1473 break;
1474 case JOIN_PROTOCOL_MISMATCH:
1475 mlog(ML_NOTICE,
1476 "This node requested DLM locking protocol %u.%u and "
1477 "filesystem locking protocol %u.%u. At least one of "
1478 "the protocol versions on node %d is not compatible, "
1479 "disconnecting\n",
1480 dlm->dlm_locking_proto.pv_major,
1481 dlm->dlm_locking_proto.pv_minor,
1482 dlm->fs_locking_proto.pv_major,
1483 dlm->fs_locking_proto.pv_minor,
1484 node);
1485 status = -EPROTO;
1486 break;
1487 case JOIN_OK:
1488 /* Use the same locking protocol as the remote node */
1489 dlm->dlm_locking_proto.pv_minor = packet.dlm_minor;
1490 dlm->fs_locking_proto.pv_minor = packet.fs_minor;
1491 mlog(0,
1492 "Node %d responds JOIN_OK with DLM locking protocol "
1493 "%u.%u and fs locking protocol %u.%u\n",
1494 node,
1495 dlm->dlm_locking_proto.pv_major,
1496 dlm->dlm_locking_proto.pv_minor,
1497 dlm->fs_locking_proto.pv_major,
1498 dlm->fs_locking_proto.pv_minor);
1499 break;
1500 default:
1501 status = -EINVAL;
1502 mlog(ML_ERROR, "invalid response %d from node %u\n",
1503 packet.code, node);
1504 /* Reset response to JOIN_DISALLOW */
1505 *response = JOIN_DISALLOW;
1506 break;
1507 }
1501 } 1508 }
1502 1509
1503 mlog(0, "status %d, node %d response is %d\n", status, node, 1510 mlog(0, "status %d, node %d response is %d\n", status, node,
@@ -1725,12 +1732,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
1725 1732
1726 o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB, 1733 o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
1727 dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI); 1734 dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
1735 o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
1736 dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
1737
1728 status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down); 1738 status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down);
1729 if (status) 1739 if (status)
1730 goto bail; 1740 goto bail;
1731 1741
1732 o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
1733 dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
1734 status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up); 1742 status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up);
1735 if (status) 1743 if (status)
1736 goto bail; 1744 goto bail;
@@ -1845,8 +1853,6 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
1845 sizeof(struct dlm_exit_domain), 1853 sizeof(struct dlm_exit_domain),
1846 dlm_begin_exit_domain_handler, 1854 dlm_begin_exit_domain_handler,
1847 dlm, NULL, &dlm->dlm_domain_handlers); 1855 dlm, NULL, &dlm->dlm_domain_handlers);
1848 if (status)
1849 goto bail;
1850 1856
1851bail: 1857bail:
1852 if (status) 1858 if (status)
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index fdf4b41d0609..46b8b2bbc95a 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -498,16 +498,6 @@ static void dlm_lockres_release(struct kref *kref)
498 mlog(0, "destroying lockres %.*s\n", res->lockname.len, 498 mlog(0, "destroying lockres %.*s\n", res->lockname.len,
499 res->lockname.name); 499 res->lockname.name);
500 500
501 spin_lock(&dlm->track_lock);
502 if (!list_empty(&res->tracking))
503 list_del_init(&res->tracking);
504 else {
505 mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
506 res->lockname.len, res->lockname.name);
507 dlm_print_one_lock_resource(res);
508 }
509 spin_unlock(&dlm->track_lock);
510
511 atomic_dec(&dlm->res_cur_count); 501 atomic_dec(&dlm->res_cur_count);
512 502
513 if (!hlist_unhashed(&res->hash_node) || 503 if (!hlist_unhashed(&res->hash_node) ||
@@ -795,8 +785,18 @@ lookup:
795 dlm_lockres_grab_inflight_ref(dlm, tmpres); 785 dlm_lockres_grab_inflight_ref(dlm, tmpres);
796 786
797 spin_unlock(&tmpres->spinlock); 787 spin_unlock(&tmpres->spinlock);
798 if (res) 788 if (res) {
789 spin_lock(&dlm->track_lock);
790 if (!list_empty(&res->tracking))
791 list_del_init(&res->tracking);
792 else
793 mlog(ML_ERROR, "Resource %.*s not "
794 "on the Tracking list\n",
795 res->lockname.len,
796 res->lockname.name);
797 spin_unlock(&dlm->track_lock);
799 dlm_lockres_put(res); 798 dlm_lockres_put(res);
799 }
800 res = tmpres; 800 res = tmpres;
801 goto leave; 801 goto leave;
802 } 802 }
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index ce12e0b1a31f..d0e436dc6437 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1776,7 +1776,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1776 struct dlm_migratable_lockres *mres) 1776 struct dlm_migratable_lockres *mres)
1777{ 1777{
1778 struct dlm_migratable_lock *ml; 1778 struct dlm_migratable_lock *ml;
1779 struct list_head *queue, *iter; 1779 struct list_head *queue;
1780 struct list_head *tmpq = NULL; 1780 struct list_head *tmpq = NULL;
1781 struct dlm_lock *newlock = NULL; 1781 struct dlm_lock *newlock = NULL;
1782 struct dlm_lockstatus *lksb = NULL; 1782 struct dlm_lockstatus *lksb = NULL;
@@ -1821,9 +1821,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1821 spin_lock(&res->spinlock); 1821 spin_lock(&res->spinlock);
1822 for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) { 1822 for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) {
1823 tmpq = dlm_list_idx_to_ptr(res, j); 1823 tmpq = dlm_list_idx_to_ptr(res, j);
1824 list_for_each(iter, tmpq) { 1824 list_for_each_entry(lock, tmpq, list) {
1825 lock = list_entry(iter,
1826 struct dlm_lock, list);
1827 if (lock->ml.cookie == ml->cookie) 1825 if (lock->ml.cookie == ml->cookie)
1828 break; 1826 break;
1829 lock = NULL; 1827 lock = NULL;
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 69aac6f088ad..2e5e6d5fffe8 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -211,6 +211,16 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
211 211
212 __dlm_unhash_lockres(dlm, res); 212 __dlm_unhash_lockres(dlm, res);
213 213
214 spin_lock(&dlm->track_lock);
215 if (!list_empty(&res->tracking))
216 list_del_init(&res->tracking);
217 else {
218 mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
219 res->lockname.len, res->lockname.name);
220 __dlm_print_one_lock_resource(res);
221 }
222 spin_unlock(&dlm->track_lock);
223
214 /* lockres is not in the hash now. drop the flag and wake up 224 /* lockres is not in the hash now. drop the flag and wake up
215 * any processes waiting in dlm_get_lock_resource. */ 225 * any processes waiting in dlm_get_lock_resource. */
216 if (!master) { 226 if (!master) {
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 23157e40dd74..1c91103c1333 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3035,8 +3035,6 @@ local:
3035 ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb); 3035 ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb);
3036 3036
3037 osb->cconn = conn; 3037 osb->cconn = conn;
3038
3039 status = 0;
3040bail: 3038bail:
3041 if (status < 0) { 3039 if (status < 0) {
3042 ocfs2_dlm_shutdown_debug(osb); 3040 ocfs2_dlm_shutdown_debug(osb);
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 767370b656ca..e4719e0a3f99 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -305,8 +305,8 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
305 305
306 if (el->l_tree_depth) { 306 if (el->l_tree_depth) {
307 ocfs2_error(inode->i_sb, 307 ocfs2_error(inode->i_sb,
308 "Inode %lu has non zero tree depth in " 308 "Inode %lu has non zero tree depth in leaf block %llu\n",
309 "leaf block %llu\n", inode->i_ino, 309 inode->i_ino,
310 (unsigned long long)eb_bh->b_blocknr); 310 (unsigned long long)eb_bh->b_blocknr);
311 ret = -EROFS; 311 ret = -EROFS;
312 goto out; 312 goto out;
@@ -441,8 +441,8 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
441 441
442 if (el->l_tree_depth) { 442 if (el->l_tree_depth) {
443 ocfs2_error(inode->i_sb, 443 ocfs2_error(inode->i_sb,
444 "Inode %lu has non zero tree depth in " 444 "Inode %lu has non zero tree depth in leaf block %llu\n",
445 "leaf block %llu\n", inode->i_ino, 445 inode->i_ino,
446 (unsigned long long)eb_bh->b_blocknr); 446 (unsigned long long)eb_bh->b_blocknr);
447 ret = -EROFS; 447 ret = -EROFS;
448 goto out; 448 goto out;
@@ -475,8 +475,9 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
475 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); 475 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
476 476
477 if (!rec->e_blkno) { 477 if (!rec->e_blkno) {
478 ocfs2_error(inode->i_sb, "Inode %lu has bad extent " 478 ocfs2_error(inode->i_sb,
479 "record (%u, %u, 0)", inode->i_ino, 479 "Inode %lu has bad extent record (%u, %u, 0)\n",
480 inode->i_ino,
480 le32_to_cpu(rec->e_cpos), 481 le32_to_cpu(rec->e_cpos),
481 ocfs2_rec_clusters(el, rec)); 482 ocfs2_rec_clusters(el, rec));
482 ret = -EROFS; 483 ret = -EROFS;
@@ -564,8 +565,8 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
564 565
565 if (el->l_tree_depth) { 566 if (el->l_tree_depth) {
566 ocfs2_error(inode->i_sb, 567 ocfs2_error(inode->i_sb,
567 "Inode %lu has non zero tree depth in " 568 "Inode %lu has non zero tree depth in xattr leaf block %llu\n",
568 "xattr leaf block %llu\n", inode->i_ino, 569 inode->i_ino,
569 (unsigned long long)eb_bh->b_blocknr); 570 (unsigned long long)eb_bh->b_blocknr);
570 ret = -EROFS; 571 ret = -EROFS;
571 goto out; 572 goto out;
@@ -582,8 +583,9 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
582 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); 583 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
583 584
584 if (!rec->e_blkno) { 585 if (!rec->e_blkno) {
585 ocfs2_error(inode->i_sb, "Inode %lu has bad extent " 586 ocfs2_error(inode->i_sb,
586 "record (%u, %u, 0) in xattr", inode->i_ino, 587 "Inode %lu has bad extent record (%u, %u, 0) in xattr\n",
588 inode->i_ino,
587 le32_to_cpu(rec->e_cpos), 589 le32_to_cpu(rec->e_cpos),
588 ocfs2_rec_clusters(el, rec)); 590 ocfs2_rec_clusters(el, rec));
589 ret = -EROFS; 591 ret = -EROFS;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 7210583b472f..0e5b4515f92e 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1130,6 +1130,7 @@ out:
1130int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) 1130int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1131{ 1131{
1132 int status = 0, size_change; 1132 int status = 0, size_change;
1133 int inode_locked = 0;
1133 struct inode *inode = d_inode(dentry); 1134 struct inode *inode = d_inode(dentry);
1134 struct super_block *sb = inode->i_sb; 1135 struct super_block *sb = inode->i_sb;
1135 struct ocfs2_super *osb = OCFS2_SB(sb); 1136 struct ocfs2_super *osb = OCFS2_SB(sb);
@@ -1178,6 +1179,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1178 mlog_errno(status); 1179 mlog_errno(status);
1179 goto bail_unlock_rw; 1180 goto bail_unlock_rw;
1180 } 1181 }
1182 inode_locked = 1;
1181 1183
1182 if (size_change) { 1184 if (size_change) {
1183 status = inode_newsize_ok(inode, attr->ia_size); 1185 status = inode_newsize_ok(inode, attr->ia_size);
@@ -1258,7 +1260,10 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1258bail_commit: 1260bail_commit:
1259 ocfs2_commit_trans(osb, handle); 1261 ocfs2_commit_trans(osb, handle);
1260bail_unlock: 1262bail_unlock:
1261 ocfs2_inode_unlock(inode, 1); 1263 if (status) {
1264 ocfs2_inode_unlock(inode, 1);
1265 inode_locked = 0;
1266 }
1262bail_unlock_rw: 1267bail_unlock_rw:
1263 if (size_change) 1268 if (size_change)
1264 ocfs2_rw_unlock(inode, 1); 1269 ocfs2_rw_unlock(inode, 1);
@@ -1274,6 +1279,8 @@ bail:
1274 if (status < 0) 1279 if (status < 0)
1275 mlog_errno(status); 1280 mlog_errno(status);
1276 } 1281 }
1282 if (inode_locked)
1283 ocfs2_inode_unlock(inode, 1);
1277 1284
1278 return status; 1285 return status;
1279} 1286}
@@ -2262,8 +2269,6 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
2262 ssize_t written = 0; 2269 ssize_t written = 0;
2263 ssize_t ret; 2270 ssize_t ret;
2264 size_t count = iov_iter_count(from), orig_count; 2271 size_t count = iov_iter_count(from), orig_count;
2265 loff_t old_size;
2266 u32 old_clusters;
2267 struct file *file = iocb->ki_filp; 2272 struct file *file = iocb->ki_filp;
2268 struct inode *inode = file_inode(file); 2273 struct inode *inode = file_inode(file);
2269 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2274 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -2271,6 +2276,8 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
2271 OCFS2_MOUNT_COHERENCY_BUFFERED); 2276 OCFS2_MOUNT_COHERENCY_BUFFERED);
2272 int unaligned_dio = 0; 2277 int unaligned_dio = 0;
2273 int dropped_dio = 0; 2278 int dropped_dio = 0;
2279 int append_write = ((iocb->ki_pos + count) >=
2280 i_size_read(inode) ? 1 : 0);
2274 2281
2275 trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, 2282 trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
2276 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2283 (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -2290,8 +2297,9 @@ relock:
2290 /* 2297 /*
2291 * Concurrent O_DIRECT writes are allowed with 2298 * Concurrent O_DIRECT writes are allowed with
2292 * mount_option "coherency=buffered". 2299 * mount_option "coherency=buffered".
2300 * For append write, we must take rw EX.
2293 */ 2301 */
2294 rw_level = (!direct_io || full_coherency); 2302 rw_level = (!direct_io || full_coherency || append_write);
2295 2303
2296 ret = ocfs2_rw_lock(inode, rw_level); 2304 ret = ocfs2_rw_lock(inode, rw_level);
2297 if (ret < 0) { 2305 if (ret < 0) {
@@ -2364,13 +2372,6 @@ relock:
2364 ocfs2_iocb_set_unaligned_aio(iocb); 2372 ocfs2_iocb_set_unaligned_aio(iocb);
2365 } 2373 }
2366 2374
2367 /*
2368 * To later detect whether a journal commit for sync writes is
2369 * necessary, we sample i_size, and cluster count here.
2370 */
2371 old_size = i_size_read(inode);
2372 old_clusters = OCFS2_I(inode)->ip_clusters;
2373
2374 /* communicate with ocfs2_dio_end_io */ 2375 /* communicate with ocfs2_dio_end_io */
2375 ocfs2_iocb_set_rw_locked(iocb, rw_level); 2376 ocfs2_iocb_set_rw_locked(iocb, rw_level);
2376 2377
@@ -2378,6 +2379,20 @@ relock:
2378 /* buffered aio wouldn't have proper lock coverage today */ 2379 /* buffered aio wouldn't have proper lock coverage today */
2379 BUG_ON(written == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT)); 2380 BUG_ON(written == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT));
2380 2381
2382 /*
 2383	 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in an ocfs2_dio_end_io
2384 * function pointer which is called when o_direct io completes so that
2385 * it can unlock our rw lock.
2386 * Unfortunately there are error cases which call end_io and others
 2387	 * that don't, so we don't have to unlock the rw_lock if either an
2388 * async dio is going to do it in the future or an end_io after an
2389 * error has already done it.
2390 */
2391 if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
2392 rw_level = -1;
2393 unaligned_dio = 0;
2394 }
2395
2381 if (unlikely(written <= 0)) 2396 if (unlikely(written <= 0))
2382 goto no_sync; 2397 goto no_sync;
2383 2398
@@ -2402,21 +2417,7 @@ relock:
2402 } 2417 }
2403 2418
2404no_sync: 2419no_sync:
2405 /* 2420 if (unaligned_dio && ocfs2_iocb_is_unaligned_aio(iocb)) {
2406 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
2407 * function pointer which is called when o_direct io completes so that
2408 * it can unlock our rw lock.
2409 * Unfortunately there are error cases which call end_io and others
2410 * that don't. so we don't have to unlock the rw_lock if either an
2411 * async dio is going to do it in the future or an end_io after an
2412 * error has already done it.
2413 */
2414 if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
2415 rw_level = -1;
2416 unaligned_dio = 0;
2417 }
2418
2419 if (unaligned_dio) {
2420 ocfs2_iocb_clear_unaligned_aio(iocb); 2421 ocfs2_iocb_clear_unaligned_aio(iocb);
2421 mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); 2422 mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
2422 } 2423 }
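
Aside (illustrative only, not part of the patch above): the ocfs2_setattr() hunk replaces the unconditional unlock in the bail path with an inode_locked flag so the cluster inode lock is dropped exactly once no matter which error path runs. A minimal user-space sketch of that idiom, using purely hypothetical names (demo_lock, do_setattr) rather than any kernel API:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;

/* Returns 0 on success, -1 on a simulated mid-operation failure. */
static int do_setattr(int fail_midway)
{
        int status = 0;
        int locked = 0;

        if (pthread_mutex_lock(&demo_lock) != 0)
                return -1;
        locked = 1;                     /* remember that the lock is held */

        if (fail_midway) {
                status = -1;
                goto bail;              /* no early unlock on this path */
        }

        /* ... work that must run under the lock would go here ... */

bail:
        if (locked)                     /* single unlock point for all paths */
                pthread_mutex_unlock(&demo_lock);
        return status;
}

int main(void)
{
        printf("success path: %d, failure path: %d\n",
               do_setattr(0), do_setattr(1));
        return 0;
}
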
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index b254416dc8d9..8f87e05ee25d 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -971,6 +971,7 @@ static void ocfs2_delete_inode(struct inode *inode)
971 int wipe, status; 971 int wipe, status;
972 sigset_t oldset; 972 sigset_t oldset;
973 struct buffer_head *di_bh = NULL; 973 struct buffer_head *di_bh = NULL;
974 struct ocfs2_dinode *di = NULL;
974 975
975 trace_ocfs2_delete_inode(inode->i_ino, 976 trace_ocfs2_delete_inode(inode->i_ino,
976 (unsigned long long)OCFS2_I(inode)->ip_blkno, 977 (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -1025,6 +1026,14 @@ static void ocfs2_delete_inode(struct inode *inode)
1025 goto bail_unlock_nfs_sync; 1026 goto bail_unlock_nfs_sync;
1026 } 1027 }
1027 1028
1029 di = (struct ocfs2_dinode *)di_bh->b_data;
 1030	/* Skip inode deletion and wait for the dio orphan entry to be
 1031	 * recovered first */
1032 if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
1033 ocfs2_cleanup_delete_inode(inode, 0);
1034 goto bail_unlock_inode;
1035 }
1036
1028 /* Query the cluster. This will be the final decision made 1037 /* Query the cluster. This will be the final decision made
1029 * before we go ahead and wipe the inode. */ 1038 * before we go ahead and wipe the inode. */
1030 status = ocfs2_query_inode_wipe(inode, di_bh, &wipe); 1039 status = ocfs2_query_inode_wipe(inode, di_bh, &wipe);
@@ -1191,17 +1200,19 @@ void ocfs2_evict_inode(struct inode *inode)
1191int ocfs2_drop_inode(struct inode *inode) 1200int ocfs2_drop_inode(struct inode *inode)
1192{ 1201{
1193 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1202 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1194 int res;
1195 1203
1196 trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno, 1204 trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno,
1197 inode->i_nlink, oi->ip_flags); 1205 inode->i_nlink, oi->ip_flags);
1198 1206
1199 if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) 1207 assert_spin_locked(&inode->i_lock);
1200 res = 1; 1208 inode->i_state |= I_WILL_FREE;
1201 else 1209 spin_unlock(&inode->i_lock);
1202 res = generic_drop_inode(inode); 1210 write_inode_now(inode, 1);
1211 spin_lock(&inode->i_lock);
1212 WARN_ON(inode->i_state & I_NEW);
1213 inode->i_state &= ~I_WILL_FREE;
1203 1214
1204 return res; 1215 return 1;
1205} 1216}
1206 1217
1207/* 1218/*
@@ -1350,32 +1361,32 @@ int ocfs2_validate_inode_block(struct super_block *sb,
1350 rc = -EINVAL; 1361 rc = -EINVAL;
1351 1362
1352 if (!OCFS2_IS_VALID_DINODE(di)) { 1363 if (!OCFS2_IS_VALID_DINODE(di)) {
1353 ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n", 1364 rc = ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n",
1354 (unsigned long long)bh->b_blocknr, 7, 1365 (unsigned long long)bh->b_blocknr, 7,
1355 di->i_signature); 1366 di->i_signature);
1356 goto bail; 1367 goto bail;
1357 } 1368 }
1358 1369
1359 if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { 1370 if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
1360 ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n", 1371 rc = ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n",
1361 (unsigned long long)bh->b_blocknr, 1372 (unsigned long long)bh->b_blocknr,
1362 (unsigned long long)le64_to_cpu(di->i_blkno)); 1373 (unsigned long long)le64_to_cpu(di->i_blkno));
1363 goto bail; 1374 goto bail;
1364 } 1375 }
1365 1376
1366 if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { 1377 if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
1367 ocfs2_error(sb, 1378 rc = ocfs2_error(sb,
1368 "Invalid dinode #%llu: OCFS2_VALID_FL not set\n", 1379 "Invalid dinode #%llu: OCFS2_VALID_FL not set\n",
1369 (unsigned long long)bh->b_blocknr); 1380 (unsigned long long)bh->b_blocknr);
1370 goto bail; 1381 goto bail;
1371 } 1382 }
1372 1383
1373 if (le32_to_cpu(di->i_fs_generation) != 1384 if (le32_to_cpu(di->i_fs_generation) !=
1374 OCFS2_SB(sb)->fs_generation) { 1385 OCFS2_SB(sb)->fs_generation) {
1375 ocfs2_error(sb, 1386 rc = ocfs2_error(sb,
1376 "Invalid dinode #%llu: fs_generation is %u\n", 1387 "Invalid dinode #%llu: fs_generation is %u\n",
1377 (unsigned long long)bh->b_blocknr, 1388 (unsigned long long)bh->b_blocknr,
1378 le32_to_cpu(di->i_fs_generation)); 1389 le32_to_cpu(di->i_fs_generation));
1379 goto bail; 1390 goto bail;
1380 } 1391 }
1381 1392
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 5e86b247c821..ca3431ee7f24 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -81,8 +81,6 @@ struct ocfs2_inode_info
81 tid_t i_sync_tid; 81 tid_t i_sync_tid;
82 tid_t i_datasync_tid; 82 tid_t i_datasync_tid;
83 83
84 wait_queue_head_t append_dio_wq;
85
86 struct dquot *i_dquot[MAXQUOTAS]; 84 struct dquot *i_dquot[MAXQUOTAS];
87}; 85};
88 86
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 7c099f7032fd..ff82b28462a6 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -374,7 +374,7 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
374 mlog_errno(PTR_ERR(handle)); 374 mlog_errno(PTR_ERR(handle));
375 375
376 if (is_journal_aborted(journal)) { 376 if (is_journal_aborted(journal)) {
377 ocfs2_abort(osb->sb, "Detected aborted journal"); 377 ocfs2_abort(osb->sb, "Detected aborted journal\n");
378 handle = ERR_PTR(-EROFS); 378 handle = ERR_PTR(-EROFS);
379 } 379 }
380 } else { 380 } else {
@@ -668,7 +668,23 @@ static int __ocfs2_journal_access(handle_t *handle,
668 mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n"); 668 mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n");
669 mlog(ML_ERROR, "b_blocknr=%llu\n", 669 mlog(ML_ERROR, "b_blocknr=%llu\n",
670 (unsigned long long)bh->b_blocknr); 670 (unsigned long long)bh->b_blocknr);
671 BUG(); 671
672 lock_buffer(bh);
673 /*
674 * A previous attempt to write this buffer head failed.
675 * Nothing we can do but to retry the write and hope for
676 * the best.
677 */
678 if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) {
679 clear_buffer_write_io_error(bh);
680 set_buffer_uptodate(bh);
681 }
682
683 if (!buffer_uptodate(bh)) {
684 unlock_buffer(bh);
685 return -EIO;
686 }
687 unlock_buffer(bh);
672 } 688 }
673 689
674 /* Set the current transaction information on the ci so 690 /* Set the current transaction information on the ci so
@@ -2170,6 +2186,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
2170 iter = oi->ip_next_orphan; 2186 iter = oi->ip_next_orphan;
2171 oi->ip_next_orphan = NULL; 2187 oi->ip_next_orphan = NULL;
2172 2188
2189 mutex_lock(&inode->i_mutex);
2173 ret = ocfs2_rw_lock(inode, 1); 2190 ret = ocfs2_rw_lock(inode, 1);
2174 if (ret < 0) { 2191 if (ret < 0) {
2175 mlog_errno(ret); 2192 mlog_errno(ret);
@@ -2193,7 +2210,9 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
2193 * ocfs2_delete_inode. */ 2210 * ocfs2_delete_inode. */
2194 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; 2211 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
2195 spin_unlock(&oi->ip_lock); 2212 spin_unlock(&oi->ip_lock);
2196 } else if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) && 2213 }
2214
2215 if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) &&
2197 (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { 2216 (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
2198 ret = ocfs2_truncate_file(inode, di_bh, 2217 ret = ocfs2_truncate_file(inode, di_bh,
2199 i_size_read(inode)); 2218 i_size_read(inode));
@@ -2206,17 +2225,16 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
2206 ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0); 2225 ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0);
2207 if (ret) 2226 if (ret)
2208 mlog_errno(ret); 2227 mlog_errno(ret);
2209
2210 wake_up(&OCFS2_I(inode)->append_dio_wq);
2211 } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */ 2228 } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */
2212unlock_inode: 2229unlock_inode:
2213 ocfs2_inode_unlock(inode, 1); 2230 ocfs2_inode_unlock(inode, 1);
2231 brelse(di_bh);
2232 di_bh = NULL;
2214unlock_rw: 2233unlock_rw:
2215 ocfs2_rw_unlock(inode, 1); 2234 ocfs2_rw_unlock(inode, 1);
2216next: 2235next:
2236 mutex_unlock(&inode->i_mutex);
2217 iput(inode); 2237 iput(inode);
2218 brelse(di_bh);
2219 di_bh = NULL;
2220 inode = iter; 2238 inode = iter;
2221 } 2239 }
2222 2240
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 857bbbcd39f3..0a4457fb0711 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -665,8 +665,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
665#ifdef CONFIG_OCFS2_DEBUG_FS 665#ifdef CONFIG_OCFS2_DEBUG_FS
666 if (le32_to_cpu(alloc->id1.bitmap1.i_used) != 666 if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
667 ocfs2_local_alloc_count_bits(alloc)) { 667 ocfs2_local_alloc_count_bits(alloc)) {
668 ocfs2_error(osb->sb, "local alloc inode %llu says it has " 668 ocfs2_error(osb->sb, "local alloc inode %llu says it has %u used bits, but a count shows %u\n",
669 "%u used bits, but a count shows %u",
670 (unsigned long long)le64_to_cpu(alloc->i_blkno), 669 (unsigned long long)le64_to_cpu(alloc->i_blkno),
671 le32_to_cpu(alloc->id1.bitmap1.i_used), 670 le32_to_cpu(alloc->id1.bitmap1.i_used),
672 ocfs2_local_alloc_count_bits(alloc)); 671 ocfs2_local_alloc_count_bits(alloc));
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 56a768d06aa6..124471d26a73 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -99,11 +99,9 @@ static int __ocfs2_move_extent(handle_t *handle,
99 99
100 index = ocfs2_search_extent_list(el, cpos); 100 index = ocfs2_search_extent_list(el, cpos);
101 if (index == -1) { 101 if (index == -1) {
102 ocfs2_error(inode->i_sb, 102 ret = ocfs2_error(inode->i_sb,
103 "Inode %llu has an extent at cpos %u which can no " 103 "Inode %llu has an extent at cpos %u which can no longer be found\n",
104 "longer be found.\n", 104 (unsigned long long)ino, cpos);
105 (unsigned long long)ino, cpos);
106 ret = -EROFS;
107 goto out; 105 goto out;
108 } 106 }
109 107
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 948681e37cfd..b7dfac226b1e 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1035,11 +1035,6 @@ leave:
1035 if (handle) 1035 if (handle)
1036 ocfs2_commit_trans(osb, handle); 1036 ocfs2_commit_trans(osb, handle);
1037 1037
1038 if (child_locked)
1039 ocfs2_inode_unlock(inode, 1);
1040
1041 ocfs2_inode_unlock(dir, 1);
1042
1043 if (orphan_dir) { 1038 if (orphan_dir) {
1044 /* This was locked for us in ocfs2_prepare_orphan_dir() */ 1039 /* This was locked for us in ocfs2_prepare_orphan_dir() */
1045 ocfs2_inode_unlock(orphan_dir, 1); 1040 ocfs2_inode_unlock(orphan_dir, 1);
@@ -1047,6 +1042,11 @@ leave:
1047 iput(orphan_dir); 1042 iput(orphan_dir);
1048 } 1043 }
1049 1044
1045 if (child_locked)
1046 ocfs2_inode_unlock(inode, 1);
1047
1048 ocfs2_inode_unlock(dir, 1);
1049
1050 brelse(fe_bh); 1050 brelse(fe_bh);
1051 brelse(parent_node_bh); 1051 brelse(parent_node_bh);
1052 1052
@@ -1309,6 +1309,11 @@ static int ocfs2_rename(struct inode *old_dir,
1309 } 1309 }
1310 parents_locked = 1; 1310 parents_locked = 1;
1311 1311
1312 if (!new_dir->i_nlink) {
1313 status = -EACCES;
1314 goto bail;
1315 }
1316
1312 /* make sure both dirs have bhs 1317 /* make sure both dirs have bhs
1313 * get an extra ref on old_dir_bh if old==new */ 1318 * get an extra ref on old_dir_bh if old==new */
1314 if (!new_dir_bh) { 1319 if (!new_dir_bh) {
@@ -1569,12 +1574,25 @@ static int ocfs2_rename(struct inode *old_dir,
1569 status = ocfs2_find_entry(old_dentry->d_name.name, 1574 status = ocfs2_find_entry(old_dentry->d_name.name,
1570 old_dentry->d_name.len, old_dir, 1575 old_dentry->d_name.len, old_dir,
1571 &old_entry_lookup); 1576 &old_entry_lookup);
1572 if (status) 1577 if (status) {
1578 if (!is_journal_aborted(osb->journal->j_journal)) {
1579 ocfs2_error(osb->sb, "new entry %.*s is added, but old entry %.*s "
1580 "is not deleted.",
1581 new_dentry->d_name.len, new_dentry->d_name.name,
1582 old_dentry->d_name.len, old_dentry->d_name.name);
1583 }
1573 goto bail; 1584 goto bail;
1585 }
1574 1586
1575 status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup); 1587 status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup);
1576 if (status < 0) { 1588 if (status < 0) {
1577 mlog_errno(status); 1589 mlog_errno(status);
1590 if (!is_journal_aborted(osb->journal->j_journal)) {
1591 ocfs2_error(osb->sb, "new entry %.*s is added, but old entry %.*s "
1592 "is not deleted.",
1593 new_dentry->d_name.len, new_dentry->d_name.name,
1594 old_dentry->d_name.len, old_dentry->d_name.name);
1595 }
1578 goto bail; 1596 goto bail;
1579 } 1597 }
1580 1598
@@ -1633,21 +1651,9 @@ static int ocfs2_rename(struct inode *old_dir,
1633 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir); 1651 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
1634 status = 0; 1652 status = 0;
1635bail: 1653bail:
1636 if (rename_lock)
1637 ocfs2_rename_unlock(osb);
1638
1639 if (handle) 1654 if (handle)
1640 ocfs2_commit_trans(osb, handle); 1655 ocfs2_commit_trans(osb, handle);
1641 1656
1642 if (parents_locked)
1643 ocfs2_double_unlock(old_dir, new_dir);
1644
1645 if (old_child_locked)
1646 ocfs2_inode_unlock(old_inode, 1);
1647
1648 if (new_child_locked)
1649 ocfs2_inode_unlock(new_inode, 1);
1650
1651 if (orphan_dir) { 1657 if (orphan_dir) {
1652 /* This was locked for us in ocfs2_prepare_orphan_dir() */ 1658 /* This was locked for us in ocfs2_prepare_orphan_dir() */
1653 ocfs2_inode_unlock(orphan_dir, 1); 1659 ocfs2_inode_unlock(orphan_dir, 1);
@@ -1655,6 +1661,18 @@ bail:
1655 iput(orphan_dir); 1661 iput(orphan_dir);
1656 } 1662 }
1657 1663
1664 if (new_child_locked)
1665 ocfs2_inode_unlock(new_inode, 1);
1666
1667 if (old_child_locked)
1668 ocfs2_inode_unlock(old_inode, 1);
1669
1670 if (parents_locked)
1671 ocfs2_double_unlock(old_dir, new_dir);
1672
1673 if (rename_lock)
1674 ocfs2_rename_unlock(osb);
1675
1658 if (new_inode) 1676 if (new_inode)
1659 sync_mapping_buffers(old_inode->i_mapping); 1677 sync_mapping_buffers(old_inode->i_mapping);
1660 1678
@@ -2601,27 +2619,6 @@ leave:
2601 return status; 2619 return status;
2602} 2620}
2603 2621
2604static int ocfs2_dio_orphan_recovered(struct inode *inode)
2605{
2606 int ret;
2607 struct buffer_head *di_bh = NULL;
2608 struct ocfs2_dinode *di = NULL;
2609
2610 ret = ocfs2_inode_lock(inode, &di_bh, 1);
2611 if (ret < 0) {
2612 mlog_errno(ret);
2613 return 0;
2614 }
2615
2616 di = (struct ocfs2_dinode *) di_bh->b_data;
2617 ret = !(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL));
2618 ocfs2_inode_unlock(inode, 1);
2619 brelse(di_bh);
2620
2621 return ret;
2622}
2623
2624#define OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL 10000
2625int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, 2622int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
2626 struct inode *inode) 2623 struct inode *inode)
2627{ 2624{
@@ -2633,7 +2630,6 @@ int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
2633 handle_t *handle = NULL; 2630 handle_t *handle = NULL;
2634 struct ocfs2_dinode *di = NULL; 2631 struct ocfs2_dinode *di = NULL;
2635 2632
2636restart:
2637 status = ocfs2_inode_lock(inode, &di_bh, 1); 2633 status = ocfs2_inode_lock(inode, &di_bh, 1);
2638 if (status < 0) { 2634 if (status < 0) {
2639 mlog_errno(status); 2635 mlog_errno(status);
@@ -2643,15 +2639,21 @@ restart:
2643 di = (struct ocfs2_dinode *) di_bh->b_data; 2639 di = (struct ocfs2_dinode *) di_bh->b_data;
2644 /* 2640 /*
2645 * Another append dio crashed? 2641 * Another append dio crashed?
2646 * If so, wait for recovery first. 2642 * If so, manually recover it first.
2647 */ 2643 */
2648 if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { 2644 if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
2649 ocfs2_inode_unlock(inode, 1); 2645 status = ocfs2_truncate_file(inode, di_bh, i_size_read(inode));
2650 brelse(di_bh); 2646 if (status < 0) {
2651 wait_event_interruptible_timeout(OCFS2_I(inode)->append_dio_wq, 2647 if (status != -ENOSPC)
2652 ocfs2_dio_orphan_recovered(inode), 2648 mlog_errno(status);
2653 msecs_to_jiffies(OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL)); 2649 goto bail_unlock_inode;
2654 goto restart; 2650 }
2651
2652 status = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0);
2653 if (status < 0) {
2654 mlog_errno(status);
2655 goto bail_unlock_inode;
2656 }
2655 } 2657 }
2656 2658
2657 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir_inode, 2659 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir_inode,
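
Aside (illustrative only): the ocfs2_unlink()/ocfs2_rename() hunks above reorder the bail-path unlocks so that locks are released in the reverse of the order they were taken (child before parent, parents before the rename lock), keeping the lock hierarchy consistent on error paths. A tiny user-space sketch of the acquire-in-order / release-in-reverse pattern, with hypothetical names:

#include <pthread.h>
#include <stdio.h>

#define NLOCKS 3

static pthread_mutex_t locks[NLOCKS] = {
        PTHREAD_MUTEX_INITIALIZER,
        PTHREAD_MUTEX_INITIALIZER,
        PTHREAD_MUTEX_INITIALIZER,
};

/* Take locks in a fixed order; afterwards (or on failure) release the
 * ones already held in the reverse order of acquisition. */
static int take_all_then_work(void)
{
        int taken, i, status = 0;

        for (taken = 0; taken < NLOCKS; taken++) {
                if (pthread_mutex_lock(&locks[taken]) != 0) {
                        status = -1;
                        break;
                }
        }

        if (status == 0) {
                /* ... the rename-style work under all locks ... */
        }

        for (i = taken - 1; i >= 0; i--)
                pthread_mutex_unlock(&locks[i]);

        return status;
}

int main(void)
{
        printf("status: %d\n", take_all_then_work());
        return 0;
}
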
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 690ddc60189b..7a0126267847 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -286,6 +286,8 @@ enum ocfs2_mount_options
286 OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ 286 OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
287 287
288 OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */ 288 OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */
289 OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */
290 OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */
289}; 291};
290 292
291#define OCFS2_OSB_SOFT_RO 0x0001 293#define OCFS2_OSB_SOFT_RO 0x0001
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index bb07004df72a..8a54fd8a4fa5 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -138,8 +138,7 @@ static int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
138 138
139 if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) { 139 if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) {
140 ocfs2_error(inode->i_sb, 140 ocfs2_error(inode->i_sb,
141 "Quota file %llu is probably corrupted! Requested " 141 "Quota file %llu is probably corrupted! Requested to read block %Lu but file has size only %Lu\n",
142 "to read block %Lu but file has size only %Lu\n",
143 (unsigned long long)OCFS2_I(inode)->ip_blkno, 142 (unsigned long long)OCFS2_I(inode)->ip_blkno,
144 (unsigned long long)v_block, 143 (unsigned long long)v_block,
145 (unsigned long long)i_size_read(inode)); 144 (unsigned long long)i_size_read(inode));
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 7dc818b87cd8..e5d57cd32505 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -102,32 +102,30 @@ static int ocfs2_validate_refcount_block(struct super_block *sb,
102 102
103 103
104 if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) { 104 if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) {
105 ocfs2_error(sb, 105 rc = ocfs2_error(sb,
106 "Refcount block #%llu has bad signature %.*s", 106 "Refcount block #%llu has bad signature %.*s\n",
107 (unsigned long long)bh->b_blocknr, 7, 107 (unsigned long long)bh->b_blocknr, 7,
108 rb->rf_signature); 108 rb->rf_signature);
109 return -EINVAL; 109 goto out;
110 } 110 }
111 111
112 if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) { 112 if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) {
113 ocfs2_error(sb, 113 rc = ocfs2_error(sb,
114 "Refcount block #%llu has an invalid rf_blkno " 114 "Refcount block #%llu has an invalid rf_blkno of %llu\n",
115 "of %llu", 115 (unsigned long long)bh->b_blocknr,
116 (unsigned long long)bh->b_blocknr, 116 (unsigned long long)le64_to_cpu(rb->rf_blkno));
117 (unsigned long long)le64_to_cpu(rb->rf_blkno)); 117 goto out;
118 return -EINVAL;
119 } 118 }
120 119
121 if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) { 120 if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) {
122 ocfs2_error(sb, 121 rc = ocfs2_error(sb,
123 "Refcount block #%llu has an invalid " 122 "Refcount block #%llu has an invalid rf_fs_generation of #%u\n",
124 "rf_fs_generation of #%u", 123 (unsigned long long)bh->b_blocknr,
125 (unsigned long long)bh->b_blocknr, 124 le32_to_cpu(rb->rf_fs_generation));
126 le32_to_cpu(rb->rf_fs_generation)); 125 goto out;
127 return -EINVAL;
128 } 126 }
129 127out:
130 return 0; 128 return rc;
131} 129}
132 130
133static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci, 131static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci,
@@ -1102,12 +1100,10 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
1102 el = &eb->h_list; 1100 el = &eb->h_list;
1103 1101
1104 if (el->l_tree_depth) { 1102 if (el->l_tree_depth) {
1105 ocfs2_error(sb, 1103 ret = ocfs2_error(sb,
1106 "refcount tree %llu has non zero tree " 1104 "refcount tree %llu has non zero tree depth in leaf btree tree block %llu\n",
1107 "depth in leaf btree tree block %llu\n", 1105 (unsigned long long)ocfs2_metadata_cache_owner(ci),
1108 (unsigned long long)ocfs2_metadata_cache_owner(ci), 1106 (unsigned long long)eb_bh->b_blocknr);
1109 (unsigned long long)eb_bh->b_blocknr);
1110 ret = -EROFS;
1111 goto out; 1107 goto out;
1112 } 1108 }
1113 } 1109 }
@@ -2359,10 +2355,8 @@ static int ocfs2_mark_extent_refcounted(struct inode *inode,
2359 cpos, len, phys); 2355 cpos, len, phys);
2360 2356
2361 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { 2357 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
2362 ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " 2358 ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
2363 "tree, but the feature bit is not set in the " 2359 inode->i_ino);
2364 "super block.", inode->i_ino);
2365 ret = -EROFS;
2366 goto out; 2360 goto out;
2367 } 2361 }
2368 2362
@@ -2545,10 +2539,8 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2545 u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno); 2539 u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno);
2546 2540
2547 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { 2541 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
2548 ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " 2542 ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
2549 "tree, but the feature bit is not set in the " 2543 inode->i_ino);
2550 "super block.", inode->i_ino);
2551 ret = -EROFS;
2552 goto out; 2544 goto out;
2553 } 2545 }
2554 2546
@@ -2672,11 +2664,10 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
2672 el = &eb->h_list; 2664 el = &eb->h_list;
2673 2665
2674 if (el->l_tree_depth) { 2666 if (el->l_tree_depth) {
2675 ocfs2_error(inode->i_sb, 2667 ret = ocfs2_error(inode->i_sb,
2676 "Inode %lu has non zero tree depth in " 2668 "Inode %lu has non zero tree depth in leaf block %llu\n",
2677 "leaf block %llu\n", inode->i_ino, 2669 inode->i_ino,
2678 (unsigned long long)eb_bh->b_blocknr); 2670 (unsigned long long)eb_bh->b_blocknr);
2679 ret = -EROFS;
2680 goto out; 2671 goto out;
2681 } 2672 }
2682 } 2673 }
@@ -3106,11 +3097,9 @@ static int ocfs2_clear_ext_refcount(handle_t *handle,
3106 3097
3107 index = ocfs2_search_extent_list(el, cpos); 3098 index = ocfs2_search_extent_list(el, cpos);
3108 if (index == -1) { 3099 if (index == -1) {
3109 ocfs2_error(sb, 3100 ret = ocfs2_error(sb,
3110 "Inode %llu has an extent at cpos %u which can no " 3101 "Inode %llu has an extent at cpos %u which can no longer be found\n",
3111 "longer be found.\n", 3102 (unsigned long long)ino, cpos);
3112 (unsigned long long)ino, cpos);
3113 ret = -EROFS;
3114 goto out; 3103 goto out;
3115 } 3104 }
3116 3105
@@ -3376,10 +3365,8 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
3376 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 3365 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3377 3366
3378 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { 3367 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
3379 ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " 3368 return ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
3380 "tree, but the feature bit is not set in the " 3369 inode->i_ino);
3381 "super block.", inode->i_ino);
3382 return -EROFS;
3383 } 3370 }
3384 3371
3385 ocfs2_init_dealloc_ctxt(&context->dealloc); 3372 ocfs2_init_dealloc_ctxt(&context->dealloc);
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 4479029630bb..d83d2602cf2b 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -149,10 +149,8 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
149 brelse(ac->ac_bh); 149 brelse(ac->ac_bh);
150 ac->ac_bh = NULL; 150 ac->ac_bh = NULL;
151 ac->ac_resv = NULL; 151 ac->ac_resv = NULL;
152 if (ac->ac_find_loc_priv) { 152 kfree(ac->ac_find_loc_priv);
153 kfree(ac->ac_find_loc_priv); 153 ac->ac_find_loc_priv = NULL;
154 ac->ac_find_loc_priv = NULL;
155 }
156} 154}
157 155
158void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) 156void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@@ -167,12 +165,12 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
167} 165}
168 166
169#define do_error(fmt, ...) \ 167#define do_error(fmt, ...) \
170 do{ \ 168do { \
171 if (resize) \ 169 if (resize) \
172 mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \ 170 mlog(ML_ERROR, fmt, ##__VA_ARGS__); \
173 else \ 171 else \
174 ocfs2_error(sb, fmt, ##__VA_ARGS__); \ 172 return ocfs2_error(sb, fmt, ##__VA_ARGS__); \
175 } while (0) 173} while (0)
176 174
177static int ocfs2_validate_gd_self(struct super_block *sb, 175static int ocfs2_validate_gd_self(struct super_block *sb,
178 struct buffer_head *bh, 176 struct buffer_head *bh,
@@ -181,44 +179,35 @@ static int ocfs2_validate_gd_self(struct super_block *sb,
181 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 179 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
182 180
183 if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { 181 if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
184 do_error("Group descriptor #%llu has bad signature %.*s", 182 do_error("Group descriptor #%llu has bad signature %.*s\n",
185 (unsigned long long)bh->b_blocknr, 7, 183 (unsigned long long)bh->b_blocknr, 7,
186 gd->bg_signature); 184 gd->bg_signature);
187 return -EINVAL;
188 } 185 }
189 186
190 if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) { 187 if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
191 do_error("Group descriptor #%llu has an invalid bg_blkno " 188 do_error("Group descriptor #%llu has an invalid bg_blkno of %llu\n",
192 "of %llu",
193 (unsigned long long)bh->b_blocknr, 189 (unsigned long long)bh->b_blocknr,
194 (unsigned long long)le64_to_cpu(gd->bg_blkno)); 190 (unsigned long long)le64_to_cpu(gd->bg_blkno));
195 return -EINVAL;
196 } 191 }
197 192
198 if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) { 193 if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
199 do_error("Group descriptor #%llu has an invalid " 194 do_error("Group descriptor #%llu has an invalid fs_generation of #%u\n",
200 "fs_generation of #%u",
201 (unsigned long long)bh->b_blocknr, 195 (unsigned long long)bh->b_blocknr,
202 le32_to_cpu(gd->bg_generation)); 196 le32_to_cpu(gd->bg_generation));
203 return -EINVAL;
204 } 197 }
205 198
206 if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) { 199 if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
207 do_error("Group descriptor #%llu has bit count %u but " 200 do_error("Group descriptor #%llu has bit count %u but claims that %u are free\n",
208 "claims that %u are free",
209 (unsigned long long)bh->b_blocknr, 201 (unsigned long long)bh->b_blocknr,
210 le16_to_cpu(gd->bg_bits), 202 le16_to_cpu(gd->bg_bits),
211 le16_to_cpu(gd->bg_free_bits_count)); 203 le16_to_cpu(gd->bg_free_bits_count));
212 return -EINVAL;
213 } 204 }
214 205
215 if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) { 206 if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
216 do_error("Group descriptor #%llu has bit count %u but " 207 do_error("Group descriptor #%llu has bit count %u but max bitmap bits of %u\n",
217 "max bitmap bits of %u",
218 (unsigned long long)bh->b_blocknr, 208 (unsigned long long)bh->b_blocknr,
219 le16_to_cpu(gd->bg_bits), 209 le16_to_cpu(gd->bg_bits),
220 8 * le16_to_cpu(gd->bg_size)); 210 8 * le16_to_cpu(gd->bg_size));
221 return -EINVAL;
222 } 211 }
223 212
224 return 0; 213 return 0;
@@ -233,20 +222,17 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
233 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 222 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
234 223
235 if (di->i_blkno != gd->bg_parent_dinode) { 224 if (di->i_blkno != gd->bg_parent_dinode) {
236 do_error("Group descriptor #%llu has bad parent " 225 do_error("Group descriptor #%llu has bad parent pointer (%llu, expected %llu)\n",
237 "pointer (%llu, expected %llu)",
238 (unsigned long long)bh->b_blocknr, 226 (unsigned long long)bh->b_blocknr,
239 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), 227 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
240 (unsigned long long)le64_to_cpu(di->i_blkno)); 228 (unsigned long long)le64_to_cpu(di->i_blkno));
241 return -EINVAL;
242 } 229 }
243 230
244 max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc); 231 max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
245 if (le16_to_cpu(gd->bg_bits) > max_bits) { 232 if (le16_to_cpu(gd->bg_bits) > max_bits) {
246 do_error("Group descriptor #%llu has bit count of %u", 233 do_error("Group descriptor #%llu has bit count of %u\n",
247 (unsigned long long)bh->b_blocknr, 234 (unsigned long long)bh->b_blocknr,
248 le16_to_cpu(gd->bg_bits)); 235 le16_to_cpu(gd->bg_bits));
249 return -EINVAL;
250 } 236 }
251 237
252 /* In resize, we may meet the case bg_chain == cl_next_free_rec. */ 238 /* In resize, we may meet the case bg_chain == cl_next_free_rec. */
@@ -254,10 +240,9 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
254 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) || 240 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
255 ((le16_to_cpu(gd->bg_chain) == 241 ((le16_to_cpu(gd->bg_chain) ==
256 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) { 242 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
257 do_error("Group descriptor #%llu has bad chain %u", 243 do_error("Group descriptor #%llu has bad chain %u\n",
258 (unsigned long long)bh->b_blocknr, 244 (unsigned long long)bh->b_blocknr,
259 le16_to_cpu(gd->bg_chain)); 245 le16_to_cpu(gd->bg_chain));
260 return -EINVAL;
261 } 246 }
262 247
263 return 0; 248 return 0;
@@ -384,11 +369,10 @@ static int ocfs2_block_group_fill(handle_t *handle,
384 struct super_block * sb = alloc_inode->i_sb; 369 struct super_block * sb = alloc_inode->i_sb;
385 370
386 if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) { 371 if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
387 ocfs2_error(alloc_inode->i_sb, "group block (%llu) != " 372 status = ocfs2_error(alloc_inode->i_sb,
388 "b_blocknr (%llu)", 373 "group block (%llu) != b_blocknr (%llu)\n",
389 (unsigned long long)group_blkno, 374 (unsigned long long)group_blkno,
390 (unsigned long long) bg_bh->b_blocknr); 375 (unsigned long long) bg_bh->b_blocknr);
391 status = -EIO;
392 goto bail; 376 goto bail;
393 } 377 }
394 378
@@ -834,9 +818,9 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
834 BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); 818 BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
835 819
836 if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) { 820 if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
837 ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu", 821 status = ocfs2_error(alloc_inode->i_sb,
838 (unsigned long long)le64_to_cpu(fe->i_blkno)); 822 "Invalid chain allocator %llu\n",
839 status = -EIO; 823 (unsigned long long)le64_to_cpu(fe->i_blkno));
840 goto bail; 824 goto bail;
841 } 825 }
842 826
@@ -1370,12 +1354,11 @@ int ocfs2_block_group_set_bits(handle_t *handle,
1370 1354
1371 le16_add_cpu(&bg->bg_free_bits_count, -num_bits); 1355 le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
1372 if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { 1356 if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
1373 ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" 1357 return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
1374 " count %u but claims %u are freed. num_bits %d", 1358 (unsigned long long)le64_to_cpu(bg->bg_blkno),
1375 (unsigned long long)le64_to_cpu(bg->bg_blkno), 1359 le16_to_cpu(bg->bg_bits),
1376 le16_to_cpu(bg->bg_bits), 1360 le16_to_cpu(bg->bg_free_bits_count),
1377 le16_to_cpu(bg->bg_free_bits_count), num_bits); 1361 num_bits);
1378 return -EROFS;
1379 } 1362 }
1380 while(num_bits--) 1363 while(num_bits--)
1381 ocfs2_set_bit(bit_off++, bitmap); 1364 ocfs2_set_bit(bit_off++, bitmap);
@@ -1905,13 +1888,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1905 1888
1906 if (le32_to_cpu(fe->id1.bitmap1.i_used) >= 1889 if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1907 le32_to_cpu(fe->id1.bitmap1.i_total)) { 1890 le32_to_cpu(fe->id1.bitmap1.i_total)) {
1908 ocfs2_error(ac->ac_inode->i_sb, 1891 status = ocfs2_error(ac->ac_inode->i_sb,
1909 "Chain allocator dinode %llu has %u used " 1892 "Chain allocator dinode %llu has %u used bits but only %u total\n",
1910 "bits but only %u total.", 1893 (unsigned long long)le64_to_cpu(fe->i_blkno),
1911 (unsigned long long)le64_to_cpu(fe->i_blkno), 1894 le32_to_cpu(fe->id1.bitmap1.i_used),
1912 le32_to_cpu(fe->id1.bitmap1.i_used), 1895 le32_to_cpu(fe->id1.bitmap1.i_total));
1913 le32_to_cpu(fe->id1.bitmap1.i_total));
1914 status = -EIO;
1915 goto bail; 1896 goto bail;
1916 } 1897 }
1917 1898
@@ -2429,12 +2410,11 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
2429 } 2410 }
2430 le16_add_cpu(&bg->bg_free_bits_count, num_bits); 2411 le16_add_cpu(&bg->bg_free_bits_count, num_bits);
2431 if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { 2412 if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
2432 ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" 2413 return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
2433 " count %u but claims %u are freed. num_bits %d", 2414 (unsigned long long)le64_to_cpu(bg->bg_blkno),
2434 (unsigned long long)le64_to_cpu(bg->bg_blkno), 2415 le16_to_cpu(bg->bg_bits),
2435 le16_to_cpu(bg->bg_bits), 2416 le16_to_cpu(bg->bg_free_bits_count),
2436 le16_to_cpu(bg->bg_free_bits_count), num_bits); 2417 num_bits);
2437 return -EROFS;
2438 } 2418 }
2439 2419
2440 if (undo_fn) 2420 if (undo_fn)
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 403c5660b306..2de4c8a9340c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -192,6 +192,7 @@ enum {
192 Opt_resv_level, 192 Opt_resv_level,
193 Opt_dir_resv_level, 193 Opt_dir_resv_level,
194 Opt_journal_async_commit, 194 Opt_journal_async_commit,
195 Opt_err_cont,
195 Opt_err, 196 Opt_err,
196}; 197};
197 198
@@ -224,6 +225,7 @@ static const match_table_t tokens = {
224 {Opt_resv_level, "resv_level=%u"}, 225 {Opt_resv_level, "resv_level=%u"},
225 {Opt_dir_resv_level, "dir_resv_level=%u"}, 226 {Opt_dir_resv_level, "dir_resv_level=%u"},
226 {Opt_journal_async_commit, "journal_async_commit"}, 227 {Opt_journal_async_commit, "journal_async_commit"},
228 {Opt_err_cont, "errors=continue"},
227 {Opt_err, NULL} 229 {Opt_err, NULL}
228}; 230};
229 231
@@ -1330,10 +1332,19 @@ static int ocfs2_parse_options(struct super_block *sb,
1330 mopt->mount_opt |= OCFS2_MOUNT_NOINTR; 1332 mopt->mount_opt |= OCFS2_MOUNT_NOINTR;
1331 break; 1333 break;
1332 case Opt_err_panic: 1334 case Opt_err_panic:
1335 mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT;
1336 mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS;
1333 mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; 1337 mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
1334 break; 1338 break;
1335 case Opt_err_ro: 1339 case Opt_err_ro:
1340 mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT;
1336 mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; 1341 mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
1342 mopt->mount_opt |= OCFS2_MOUNT_ERRORS_ROFS;
1343 break;
1344 case Opt_err_cont:
1345 mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS;
1346 mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
1347 mopt->mount_opt |= OCFS2_MOUNT_ERRORS_CONT;
1337 break; 1348 break;
1338 case Opt_data_ordered: 1349 case Opt_data_ordered:
1339 mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK; 1350 mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK;
@@ -1530,6 +1541,8 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
1530 1541
1531 if (opts & OCFS2_MOUNT_ERRORS_PANIC) 1542 if (opts & OCFS2_MOUNT_ERRORS_PANIC)
1532 seq_printf(s, ",errors=panic"); 1543 seq_printf(s, ",errors=panic");
1544 else if (opts & OCFS2_MOUNT_ERRORS_CONT)
1545 seq_printf(s, ",errors=continue");
1533 else 1546 else
1534 seq_printf(s, ",errors=remount-ro"); 1547 seq_printf(s, ",errors=remount-ro");
1535 1548
@@ -1550,8 +1563,8 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
1550 seq_printf(s, ",localflocks,"); 1563 seq_printf(s, ",localflocks,");
1551 1564
1552 if (osb->osb_cluster_stack[0]) 1565 if (osb->osb_cluster_stack[0])
1553 seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, 1566 seq_show_option_n(s, "cluster_stack", osb->osb_cluster_stack,
1554 osb->osb_cluster_stack); 1567 OCFS2_STACK_LABEL_LEN);
1555 if (opts & OCFS2_MOUNT_USRQUOTA) 1568 if (opts & OCFS2_MOUNT_USRQUOTA)
1556 seq_printf(s, ",usrquota"); 1569 seq_printf(s, ",usrquota");
1557 if (opts & OCFS2_MOUNT_GRPQUOTA) 1570 if (opts & OCFS2_MOUNT_GRPQUOTA)
@@ -1746,8 +1759,6 @@ static void ocfs2_inode_init_once(void *data)
1746 ocfs2_lock_res_init_once(&oi->ip_inode_lockres); 1759 ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
1747 ocfs2_lock_res_init_once(&oi->ip_open_lockres); 1760 ocfs2_lock_res_init_once(&oi->ip_open_lockres);
1748 1761
1749 init_waitqueue_head(&oi->append_dio_wq);
1750
1751 ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode), 1762 ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode),
1752 &ocfs2_inode_caching_ops); 1763 &ocfs2_inode_caching_ops);
1753 1764
@@ -2541,31 +2552,43 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
2541 memset(osb, 0, sizeof(struct ocfs2_super)); 2552 memset(osb, 0, sizeof(struct ocfs2_super));
2542} 2553}
2543 2554
2544/* Put OCFS2 into a readonly state, or (if the user specifies it), 2555/* Depending on the mount option passed, perform one of the following:
2545 * panic(). We do not support continue-on-error operation. */ 2556 * Put OCFS2 into a readonly state (default)
2546static void ocfs2_handle_error(struct super_block *sb) 2557 * Return EIO so that only the process errs
2558 * Fix the error as if fsck.ocfs2 -y
2559 * panic
2560 */
2561static int ocfs2_handle_error(struct super_block *sb)
2547{ 2562{
2548 struct ocfs2_super *osb = OCFS2_SB(sb); 2563 struct ocfs2_super *osb = OCFS2_SB(sb);
2549 2564 int rv = 0;
2550 if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC)
2551 panic("OCFS2: (device %s): panic forced after error\n",
2552 sb->s_id);
2553 2565
2554 ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS); 2566 ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS);
2567 pr_crit("On-disk corruption discovered. "
2568 "Please run fsck.ocfs2 once the filesystem is unmounted.\n");
2555 2569
2556 if (sb->s_flags & MS_RDONLY && 2570 if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) {
2557 (ocfs2_is_soft_readonly(osb) || 2571 panic("OCFS2: (device %s): panic forced after error\n",
2558 ocfs2_is_hard_readonly(osb))) 2572 sb->s_id);
2559 return; 2573 } else if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_CONT) {
2560 2574 pr_crit("OCFS2: Returning error to the calling process.\n");
2561 printk(KERN_CRIT "File system is now read-only due to the potential " 2575 rv = -EIO;
2562 "of on-disk corruption. Please run fsck.ocfs2 once the file " 2576 } else { /* default option */
2563 "system is unmounted.\n"); 2577 rv = -EROFS;
2564 sb->s_flags |= MS_RDONLY; 2578 if (sb->s_flags & MS_RDONLY &&
2565 ocfs2_set_ro_flag(osb, 0); 2579 (ocfs2_is_soft_readonly(osb) ||
2580 ocfs2_is_hard_readonly(osb)))
2581 return rv;
2582
2583 pr_crit("OCFS2: File system is now read-only.\n");
2584 sb->s_flags |= MS_RDONLY;
2585 ocfs2_set_ro_flag(osb, 0);
2586 }
2587
2588 return rv;
2566} 2589}
2567 2590
2568void __ocfs2_error(struct super_block *sb, const char *function, 2591int __ocfs2_error(struct super_block *sb, const char *function,
2569 const char *fmt, ...) 2592 const char *fmt, ...)
2570{ 2593{
2571 struct va_format vaf; 2594 struct va_format vaf;
@@ -2577,12 +2600,12 @@ void __ocfs2_error(struct super_block *sb, const char *function,
2577 2600
2578 /* Not using mlog here because we want to show the actual 2601 /* Not using mlog here because we want to show the actual
2579 * function the error came from. */ 2602 * function the error came from. */
2580 printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV\n", 2603 printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV",
2581 sb->s_id, function, &vaf); 2604 sb->s_id, function, &vaf);
2582 2605
2583 va_end(args); 2606 va_end(args);
2584 2607
2585 ocfs2_handle_error(sb); 2608 return ocfs2_handle_error(sb);
2586} 2609}
2587 2610
2588/* Handle critical errors. This is intentionally more drastic than 2611/* Handle critical errors. This is intentionally more drastic than
@@ -2599,7 +2622,7 @@ void __ocfs2_abort(struct super_block *sb, const char *function,
2599 vaf.fmt = fmt; 2622 vaf.fmt = fmt;
2600 vaf.va = &args; 2623 vaf.va = &args;
2601 2624
2602 printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV\n", 2625 printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV",
2603 sb->s_id, function, &vaf); 2626 sb->s_id, function, &vaf);
2604 2627
2605 va_end(args); 2628 va_end(args);
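
Aside (illustrative, not the patch itself): the super.c hunks above add a third errors= behaviour and make the error handler report a status instead of returning void, which is what lets call sites throughout this series write `return ocfs2_error(...)` or `rc = ocfs2_error(...)` directly. A condensed user-space sketch of that shape, with hypothetical names (mount_errors_mode, report_error, read_block):

#include <errno.h>
#include <stdio.h>

enum mount_errors_mode {
        ERRORS_REMOUNT_RO,      /* default: flip the fs read-only */
        ERRORS_CONT,            /* only the calling process sees the error */
        ERRORS_PANIC,           /* give up entirely (simulated here) */
};

static enum mount_errors_mode errors_mode = ERRORS_CONT;

/* Log the corruption and translate the mount option into a status the
 * caller can simply return. */
static int report_error(const char *msg)
{
        fprintf(stderr, "ERROR: %s\n", msg);

        switch (errors_mode) {
        case ERRORS_PANIC:
                fprintf(stderr, "panic forced after error\n");
                return -EROFS;          /* a real panic() never returns */
        case ERRORS_CONT:
                return -EIO;            /* fail just this operation */
        default:
                fprintf(stderr, "filesystem is now read-only\n");
                return -EROFS;
        }
}

static int read_block(int corrupted)
{
        if (corrupted)
                return report_error("invalid block signature");
        return 0;
}

int main(void)
{
        printf("clean: %d, corrupted: %d\n", read_block(0), read_block(1));
        return 0;
}
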
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 74ff74cf78fe..b477d0b1c7b6 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -32,16 +32,18 @@ int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
32 int node_num); 32 int node_num);
33 33
34__printf(3, 4) 34__printf(3, 4)
35void __ocfs2_error(struct super_block *sb, const char *function, 35int __ocfs2_error(struct super_block *sb, const char *function,
36 const char *fmt, ...); 36 const char *fmt, ...);
37 37
38#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args) 38#define ocfs2_error(sb, fmt, ...) \
39 __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##__VA_ARGS__)
39 40
40__printf(3, 4) 41__printf(3, 4)
41void __ocfs2_abort(struct super_block *sb, const char *function, 42void __ocfs2_abort(struct super_block *sb, const char *function,
42 const char *fmt, ...); 43 const char *fmt, ...);
43 44
44#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) 45#define ocfs2_abort(sb, fmt, ...) \
46 __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##__VA_ARGS__)
45 47
46/* 48/*
47 * Void signal blockers, because in-kernel sigprocmask() only fails 49 * Void signal blockers, because in-kernel sigprocmask() only fails
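
Aside (illustrative): the super.h hunk above switches the ocfs2_error()/ocfs2_abort() wrappers from the GNU named-variadic form (`args...`, `##args`) to the C99 `...` form with the GNU `##__VA_ARGS__` extension, which swallows the trailing comma when the format string is the only argument. A tiny standalone example of the same construction, using a hypothetical log_err() macro:

#include <stdio.h>

/* ## before __VA_ARGS__ drops the preceding comma when no extra
 * arguments are supplied, so log_err("plain") is legal too. */
#define log_err(fmt, ...) \
        fprintf(stderr, "demo error: " fmt "\n", ##__VA_ARGS__)

int main(void)
{
        log_err("plain message");
        log_err("block %llu is bad", 42ULL);
        return 0;
}
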
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 889f3796a0d7..ebfdea78659b 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -499,30 +499,24 @@ static int ocfs2_validate_xattr_block(struct super_block *sb,
499 */ 499 */
500 500
501 if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) { 501 if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
502 ocfs2_error(sb, 502 return ocfs2_error(sb,
503 "Extended attribute block #%llu has bad " 503 "Extended attribute block #%llu has bad signature %.*s\n",
504 "signature %.*s", 504 (unsigned long long)bh->b_blocknr, 7,
505 (unsigned long long)bh->b_blocknr, 7, 505 xb->xb_signature);
506 xb->xb_signature);
507 return -EINVAL;
508 } 506 }
509 507
510 if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) { 508 if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) {
511 ocfs2_error(sb, 509 return ocfs2_error(sb,
512 "Extended attribute block #%llu has an " 510 "Extended attribute block #%llu has an invalid xb_blkno of %llu\n",
513 "invalid xb_blkno of %llu", 511 (unsigned long long)bh->b_blocknr,
514 (unsigned long long)bh->b_blocknr, 512 (unsigned long long)le64_to_cpu(xb->xb_blkno));
515 (unsigned long long)le64_to_cpu(xb->xb_blkno));
516 return -EINVAL;
517 } 513 }
518 514
519 if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) { 515 if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) {
520 ocfs2_error(sb, 516 return ocfs2_error(sb,
521 "Extended attribute block #%llu has an invalid " 517 "Extended attribute block #%llu has an invalid xb_fs_generation of #%u\n",
522 "xb_fs_generation of #%u", 518 (unsigned long long)bh->b_blocknr,
523 (unsigned long long)bh->b_blocknr, 519 le32_to_cpu(xb->xb_fs_generation));
524 le32_to_cpu(xb->xb_fs_generation));
525 return -EINVAL;
526 } 520 }
527 521
528 return 0; 522 return 0;
@@ -3694,11 +3688,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode,
3694 el = &eb->h_list; 3688 el = &eb->h_list;
3695 3689
3696 if (el->l_tree_depth) { 3690 if (el->l_tree_depth) {
3697 ocfs2_error(inode->i_sb, 3691 ret = ocfs2_error(inode->i_sb,
3698 "Inode %lu has non zero tree depth in " 3692 "Inode %lu has non zero tree depth in xattr tree block %llu\n",
3699 "xattr tree block %llu\n", inode->i_ino, 3693 inode->i_ino,
3700 (unsigned long long)eb_bh->b_blocknr); 3694 (unsigned long long)eb_bh->b_blocknr);
3701 ret = -EROFS;
3702 goto out; 3695 goto out;
3703 } 3696 }
3704 } 3697 }
@@ -3713,11 +3706,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode,
3713 } 3706 }
3714 3707
3715 if (!e_blkno) { 3708 if (!e_blkno) {
3716 ocfs2_error(inode->i_sb, "Inode %lu has bad extent " 3709 ret = ocfs2_error(inode->i_sb, "Inode %lu has bad extent record (%u, %u, 0) in xattr\n",
3717 "record (%u, %u, 0) in xattr", inode->i_ino, 3710 inode->i_ino,
3718 le32_to_cpu(rec->e_cpos), 3711 le32_to_cpu(rec->e_cpos),
3719 ocfs2_rec_clusters(el, rec)); 3712 ocfs2_rec_clusters(el, rec));
3720 ret = -EROFS;
3721 goto out; 3713 goto out;
3722 } 3714 }
3723 3715
@@ -7334,6 +7326,9 @@ static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list,
7334 const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; 7326 const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
7335 const size_t total_len = prefix_len + name_len + 1; 7327 const size_t total_len = prefix_len + name_len + 1;
7336 7328
7329 if (!capable(CAP_SYS_ADMIN))
7330 return 0;
7331
7337 if (list && total_len <= list_size) { 7332 if (list && total_len <= list_size) {
7338 memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); 7333 memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
7339 memcpy(list + prefix_len, name, name_len); 7334 memcpy(list + prefix_len, name, name_len);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 7466ff339c66..79073d68b475 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -588,10 +588,10 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
588 struct super_block *sb = dentry->d_sb; 588 struct super_block *sb = dentry->d_sb;
589 struct ovl_fs *ufs = sb->s_fs_info; 589 struct ovl_fs *ufs = sb->s_fs_info;
590 590
591 seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir); 591 seq_show_option(m, "lowerdir", ufs->config.lowerdir);
592 if (ufs->config.upperdir) { 592 if (ufs->config.upperdir) {
593 seq_printf(m, ",upperdir=%s", ufs->config.upperdir); 593 seq_show_option(m, "upperdir", ufs->config.upperdir);
594 seq_printf(m, ",workdir=%s", ufs->config.workdir); 594 seq_show_option(m, "workdir", ufs->config.workdir);
595 } 595 }
596 return 0; 596 return 0;
597} 597}
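
Aside (illustrative only): the overlayfs hunk above, and the similar reiserfs change further down, switch string-valued mount options from seq_printf() to the seq_show_option() helper so that option values containing separator characters cannot masquerade as additional options in /proc/mounts. As a rough user-space sketch of why the escaping matters (the real helper's escaping rules live in the kernel and may differ), with hypothetical names:

#include <stdio.h>

/* Print ,key=value, escaping characters that a naive ","-splitting
 * reader would treat as option separators. */
static void show_option(FILE *out, const char *key, const char *value)
{
        fprintf(out, ",%s=", key);
        for (const char *p = value; *p; p++) {
                if (*p == ',' || *p == ' ' || *p == '\\')
                        fprintf(out, "\\%03o", (unsigned int)(unsigned char)*p);
                else
                        fputc(*p, out);
        }
}

int main(void)
{
        /* A hostile directory name that embeds what looks like an option. */
        show_option(stdout, "lowerdir", "/tmp/evil,upperdir=/etc");
        fputc('\n', stdout);
        return 0;
}
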
diff --git a/fs/proc/array.c b/fs/proc/array.c
index ce065cf3104f..f60f0121e331 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -308,7 +308,8 @@ static void render_cap_t(struct seq_file *m, const char *header,
308static inline void task_cap(struct seq_file *m, struct task_struct *p) 308static inline void task_cap(struct seq_file *m, struct task_struct *p)
309{ 309{
310 const struct cred *cred; 310 const struct cred *cred;
311 kernel_cap_t cap_inheritable, cap_permitted, cap_effective, cap_bset; 311 kernel_cap_t cap_inheritable, cap_permitted, cap_effective,
312 cap_bset, cap_ambient;
312 313
313 rcu_read_lock(); 314 rcu_read_lock();
314 cred = __task_cred(p); 315 cred = __task_cred(p);
@@ -316,12 +317,14 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p)
316 cap_permitted = cred->cap_permitted; 317 cap_permitted = cred->cap_permitted;
317 cap_effective = cred->cap_effective; 318 cap_effective = cred->cap_effective;
318 cap_bset = cred->cap_bset; 319 cap_bset = cred->cap_bset;
320 cap_ambient = cred->cap_ambient;
319 rcu_read_unlock(); 321 rcu_read_unlock();
320 322
321 render_cap_t(m, "CapInh:\t", &cap_inheritable); 323 render_cap_t(m, "CapInh:\t", &cap_inheritable);
322 render_cap_t(m, "CapPrm:\t", &cap_permitted); 324 render_cap_t(m, "CapPrm:\t", &cap_permitted);
323 render_cap_t(m, "CapEff:\t", &cap_effective); 325 render_cap_t(m, "CapEff:\t", &cap_effective);
324 render_cap_t(m, "CapBnd:\t", &cap_bset); 326 render_cap_t(m, "CapBnd:\t", &cap_bset);
327 render_cap_t(m, "CapAmb:\t", &cap_ambient);
325} 328}
326 329
327static inline void task_seccomp(struct seq_file *m, struct task_struct *p) 330static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ca1e091881d4..3b4d8255e806 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -597,6 +597,8 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
597 [ilog2(VM_HUGEPAGE)] = "hg", 597 [ilog2(VM_HUGEPAGE)] = "hg",
598 [ilog2(VM_NOHUGEPAGE)] = "nh", 598 [ilog2(VM_NOHUGEPAGE)] = "nh",
599 [ilog2(VM_MERGEABLE)] = "mg", 599 [ilog2(VM_MERGEABLE)] = "mg",
600 [ilog2(VM_UFFD_MISSING)]= "um",
601 [ilog2(VM_UFFD_WP)] = "uw",
600 }; 602 };
601 size_t i; 603 size_t i;
602 604
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 0e4cf728126f..4a62fe8cc3bf 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -714,18 +714,20 @@ static int reiserfs_show_options(struct seq_file *seq, struct dentry *root)
714 seq_puts(seq, ",acl"); 714 seq_puts(seq, ",acl");
715 715
716 if (REISERFS_SB(s)->s_jdev) 716 if (REISERFS_SB(s)->s_jdev)
717 seq_printf(seq, ",jdev=%s", REISERFS_SB(s)->s_jdev); 717 seq_show_option(seq, "jdev", REISERFS_SB(s)->s_jdev);
718 718
719 if (journal->j_max_commit_age != journal->j_default_max_commit_age) 719 if (journal->j_max_commit_age != journal->j_default_max_commit_age)
720 seq_printf(seq, ",commit=%d", journal->j_max_commit_age); 720 seq_printf(seq, ",commit=%d", journal->j_max_commit_age);
721 721
722#ifdef CONFIG_QUOTA 722#ifdef CONFIG_QUOTA
723 if (REISERFS_SB(s)->s_qf_names[USRQUOTA]) 723 if (REISERFS_SB(s)->s_qf_names[USRQUOTA])
724 seq_printf(seq, ",usrjquota=%s", REISERFS_SB(s)->s_qf_names[USRQUOTA]); 724 seq_show_option(seq, "usrjquota",
725 REISERFS_SB(s)->s_qf_names[USRQUOTA]);
725 else if (opts & (1 << REISERFS_USRQUOTA)) 726 else if (opts & (1 << REISERFS_USRQUOTA))
726 seq_puts(seq, ",usrquota"); 727 seq_puts(seq, ",usrquota");
727 if (REISERFS_SB(s)->s_qf_names[GRPQUOTA]) 728 if (REISERFS_SB(s)->s_qf_names[GRPQUOTA])
728 seq_printf(seq, ",grpjquota=%s", REISERFS_SB(s)->s_qf_names[GRPQUOTA]); 729 seq_show_option(seq, "grpjquota",
730 REISERFS_SB(s)->s_qf_names[GRPQUOTA]);
729 else if (opts & (1 << REISERFS_GRPQUOTA)) 731 else if (opts & (1 << REISERFS_GRPQUOTA))
730 seq_puts(seq, ",grpquota"); 732 seq_puts(seq, ",grpquota");
731 if (REISERFS_SB(s)->s_jquota_fmt) { 733 if (REISERFS_SB(s)->s_jquota_fmt) {
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
new file mode 100644
index 000000000000..634e676072cb
--- /dev/null
+++ b/fs/userfaultfd.c
@@ -0,0 +1,1330 @@
1/*
2 * fs/userfaultfd.c
3 *
4 * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
5 * Copyright (C) 2008-2009 Red Hat, Inc.
6 * Copyright (C) 2015 Red Hat, Inc.
7 *
8 * This work is licensed under the terms of the GNU GPL, version 2. See
9 * the COPYING file in the top-level directory.
10 *
11 * Some part derived from fs/eventfd.c (anon inode setup) and
12 * mm/ksm.c (mm hashing).
13 */
14
15#include <linux/hashtable.h>
16#include <linux/sched.h>
17#include <linux/mm.h>
18#include <linux/poll.h>
19#include <linux/slab.h>
20#include <linux/seq_file.h>
21#include <linux/file.h>
22#include <linux/bug.h>
23#include <linux/anon_inodes.h>
24#include <linux/syscalls.h>
25#include <linux/userfaultfd_k.h>
26#include <linux/mempolicy.h>
27#include <linux/ioctl.h>
28#include <linux/security.h>
29
30static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
31
32enum userfaultfd_state {
33 UFFD_STATE_WAIT_API,
34 UFFD_STATE_RUNNING,
35};
36
37/*
38 * Start with fault_pending_wqh and fault_wqh so they're more likely
39 * to be in the same cacheline.
40 */
41struct userfaultfd_ctx {
42 /* waitqueue head for the pending (i.e. not read) userfaults */
43 wait_queue_head_t fault_pending_wqh;
44 /* waitqueue head for the userfaults */
45 wait_queue_head_t fault_wqh;
46 /* waitqueue head for the pseudo fd to wakeup poll/read */
47 wait_queue_head_t fd_wqh;
48 /* a refile sequence protected by fault_pending_wqh lock */
49 struct seqcount refile_seq;
50 /* pseudo fd refcounting */
51 atomic_t refcount;
52 /* userfaultfd syscall flags */
53 unsigned int flags;
54 /* state machine */
55 enum userfaultfd_state state;
56 /* released */
57 bool released;
 58	 * mm with one or more vmas attached to this userfaultfd_ctx */
59 struct mm_struct *mm;
60};
61
62struct userfaultfd_wait_queue {
63 struct uffd_msg msg;
64 wait_queue_t wq;
65 struct userfaultfd_ctx *ctx;
66};
67
68struct userfaultfd_wake_range {
69 unsigned long start;
70 unsigned long len;
71};
72
73static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode,
74 int wake_flags, void *key)
75{
76 struct userfaultfd_wake_range *range = key;
77 int ret;
78 struct userfaultfd_wait_queue *uwq;
79 unsigned long start, len;
80
81 uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
82 ret = 0;
83 /* len == 0 means wake all */
84 start = range->start;
85 len = range->len;
86 if (len && (start > uwq->msg.arg.pagefault.address ||
87 start + len <= uwq->msg.arg.pagefault.address))
88 goto out;
89 ret = wake_up_state(wq->private, mode);
90 if (ret)
91 /*
92 * Wake only once, autoremove behavior.
93 *
94 * After the effect of list_del_init is visible to the
95 * other CPUs, the waitqueue may disappear from under
96 * us, see the !list_empty_careful() in
97 * handle_userfault(). try_to_wake_up() has an
98 * implicit smp_mb__before_spinlock, and the
99 * wq->private is read before calling the extern
 100	 * function "wake_up_state" (which in turn calls
101 * try_to_wake_up). While the spin_lock;spin_unlock;
102 * wouldn't be enough, the smp_mb__before_spinlock is
103 * enough to avoid an explicit smp_mb() here.
104 */
105 list_del_init(&wq->task_list);
106out:
107 return ret;
108}
109
110/**
111 * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
112 * context.
113 * @ctx: [in] Pointer to the userfaultfd context.
114 *
115 * The context's reference count must already be non-zero; BUG()s otherwise.
116 */
117static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
118{
119 if (!atomic_inc_not_zero(&ctx->refcount))
120 BUG();
121}
122
123/**
124 * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
125 * context.
126 * @ctx: [in] Pointer to userfaultfd context.
127 *
128 * The userfaultfd context reference must have been previously acquired either
129 * with userfaultfd_ctx_get().
130 */
131static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
132{
133 if (atomic_dec_and_test(&ctx->refcount)) {
134 VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
135 VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
136 VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
137 VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
138 VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
139 VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
140 mmput(ctx->mm);
141 kmem_cache_free(userfaultfd_ctx_cachep, ctx);
142 }
143}
144
145static inline void msg_init(struct uffd_msg *msg)
146{
147 BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
148 /*
149 * Must use memset to zero out the padding or kernel data is
150 * leaked to userland.
151 */
152 memset(msg, 0, sizeof(struct uffd_msg));
153}
154
155static inline struct uffd_msg userfault_msg(unsigned long address,
156 unsigned int flags,
157 unsigned long reason)
158{
159 struct uffd_msg msg;
160 msg_init(&msg);
161 msg.event = UFFD_EVENT_PAGEFAULT;
162 msg.arg.pagefault.address = address;
163 if (flags & FAULT_FLAG_WRITE)
164 /*
165 * If UFFD_FEATURE_PAGEFAULT_FLAG_WRITE was set in the
166 * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE
167 * was not set in a UFFD_EVENT_PAGEFAULT, it means it
168 * was a read fault, otherwise if set it means it's
169 * a write fault.
170 */
171 msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
172 if (reason & VM_UFFD_WP)
173 /*
174 * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
175 * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP was
176 * not set in a UFFD_EVENT_PAGEFAULT, it means it was
177 * a missing fault, otherwise if set it means it's a
178 * write protect fault.
179 */
180 msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
181 return msg;
182}
183
184/*
185 * Verify the pagetables are still not ok after having registered into
186 * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
187 * userfault that has already been resolved, if userfaultfd_read and
188 * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
189 * threads.
190 */
191static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
192 unsigned long address,
193 unsigned long flags,
194 unsigned long reason)
195{
196 struct mm_struct *mm = ctx->mm;
197 pgd_t *pgd;
198 pud_t *pud;
199 pmd_t *pmd, _pmd;
200 pte_t *pte;
201 bool ret = true;
202
203 VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
204
205 pgd = pgd_offset(mm, address);
206 if (!pgd_present(*pgd))
207 goto out;
208 pud = pud_offset(pgd, address);
209 if (!pud_present(*pud))
210 goto out;
211 pmd = pmd_offset(pud, address);
212 /*
213 * READ_ONCE must function as a barrier with narrower scope
214 * and it must be equivalent to:
215 * _pmd = *pmd; barrier();
216 *
217 * This is to deal with the instability (as in
218 * pmd_trans_unstable) of the pmd.
219 */
220 _pmd = READ_ONCE(*pmd);
221 if (!pmd_present(_pmd))
222 goto out;
223
224 ret = false;
225 if (pmd_trans_huge(_pmd))
226 goto out;
227
228 /*
229 * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it
230 * and use the standard pte_offset_map() instead of parsing _pmd.
231 */
232 pte = pte_offset_map(pmd, address);
233 /*
234 * Lockless access: we're in a wait_event so it's ok if it
235 * changes under us.
236 */
237 if (pte_none(*pte))
238 ret = true;
239 pte_unmap(pte);
240
241out:
242 return ret;
243}
244
245/*
246 * The locking rules involved in returning VM_FAULT_RETRY depending on
247 * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
248 * FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
249 * recommendation in __lock_page_or_retry is not an understatement.
250 *
251 * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_sem must be released
252 * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
253 * not set.
254 *
255 * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
256 * set, VM_FAULT_RETRY can still be returned if and only if there are
257 * fatal_signal_pending()s, and the mmap_sem must be released before
258 * returning it.
259 */
260int handle_userfault(struct vm_area_struct *vma, unsigned long address,
261 unsigned int flags, unsigned long reason)
262{
263 struct mm_struct *mm = vma->vm_mm;
264 struct userfaultfd_ctx *ctx;
265 struct userfaultfd_wait_queue uwq;
266 int ret;
267 bool must_wait, return_to_userland;
268
269 BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
270
271 ret = VM_FAULT_SIGBUS;
272 ctx = vma->vm_userfaultfd_ctx.ctx;
273 if (!ctx)
274 goto out;
275
276 BUG_ON(ctx->mm != mm);
277
278 VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP));
279 VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));
280
281 /*
282 * If it's already released, don't get it. This avoids looping
283 * in __get_user_pages if userfaultfd_release waits on the
284 * caller of handle_userfault to release the mmap_sem.
285 */
286 if (unlikely(ACCESS_ONCE(ctx->released)))
287 goto out;
288
289 /*
290 * Check that we can return VM_FAULT_RETRY.
291 *
292 * NOTE: it should become possible to return VM_FAULT_RETRY
293 * even if FAULT_FLAG_TRIED is set without leading to gup()
294 * -EBUSY failures, if the userfaultfd is to be extended for
295 * VM_UFFD_WP tracking and we intend to arm the userfault
296 * without first stopping userland access to the memory. For
297 * VM_UFFD_MISSING userfaults this is enough for now.
298 */
299 if (unlikely(!(flags & FAULT_FLAG_ALLOW_RETRY))) {
300 /*
301 * Validate the invariant that nowait must allow retry
302 * to be sure not to return SIGBUS erroneously on
303 * nowait invocations.
304 */
305 BUG_ON(flags & FAULT_FLAG_RETRY_NOWAIT);
306#ifdef CONFIG_DEBUG_VM
307 if (printk_ratelimit()) {
308 printk(KERN_WARNING
309 "FAULT_FLAG_ALLOW_RETRY missing %x\n", flags);
310 dump_stack();
311 }
312#endif
313 goto out;
314 }
315
316 /*
317 * Handle nowait, not much to do other than tell it to retry
318 * and wait.
319 */
320 ret = VM_FAULT_RETRY;
321 if (flags & FAULT_FLAG_RETRY_NOWAIT)
322 goto out;
323
324 /* take the reference before dropping the mmap_sem */
325 userfaultfd_ctx_get(ctx);
326
327 init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
328 uwq.wq.private = current;
329 uwq.msg = userfault_msg(address, flags, reason);
330 uwq.ctx = ctx;
331
332 return_to_userland = (flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
333 (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE);
334
335 spin_lock(&ctx->fault_pending_wqh.lock);
336 /*
337 * After the __add_wait_queue the uwq is visible to userland
338 * through poll/read().
339 */
340 __add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
341 /*
342 * The smp_mb() after __set_current_state prevents the reads
343 * following the spin_unlock from happening before the list_add in
344 * __add_wait_queue.
345 */
346 set_current_state(return_to_userland ? TASK_INTERRUPTIBLE :
347 TASK_KILLABLE);
348 spin_unlock(&ctx->fault_pending_wqh.lock);
349
350 must_wait = userfaultfd_must_wait(ctx, address, flags, reason);
351 up_read(&mm->mmap_sem);
352
353 if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&
354 (return_to_userland ? !signal_pending(current) :
355 !fatal_signal_pending(current)))) {
356 wake_up_poll(&ctx->fd_wqh, POLLIN);
357 schedule();
358 ret |= VM_FAULT_MAJOR;
359 }
360
361 __set_current_state(TASK_RUNNING);
362
363 if (return_to_userland) {
364 if (signal_pending(current) &&
365 !fatal_signal_pending(current)) {
366 /*
367 * If we got a SIGSTOP or SIGCONT and this is
368 * a normal userland page fault, just let
369 * userland return so the signal will be
370 * handled and gdb debugging works. The page
371 * fault code immediately after we return from
372 * this function is going to release the
373 * mmap_sem and it's not depending on it
374 * (unlike gup would if we were not to return
375 * VM_FAULT_RETRY).
376 *
377 * If a fatal signal is pending we still take
378 * the streamlined VM_FAULT_RETRY failure path
379 * and there's no need to retake the mmap_sem
380 * in such case.
381 */
382 down_read(&mm->mmap_sem);
383 ret = 0;
384 }
385 }
386
387 /*
388 * Here we race with the list_del; list_add in
389 * userfaultfd_ctx_read(), however because we don't ever run
390 * list_del_init() to refile across the two lists, the prev
391 * and next pointers will never point to self. list_add also
392 * would never let either of the two pointers point to
393 * self. So list_empty_careful won't risk seeing both pointers
394 * pointing to self at any time during the list refile. The
395 * only case where list_del_init() is called is the full
396 * removal in the wake function and there we don't re-list_add
397 * and it's fine not to block on the spinlock. The uwq on this
398 * kernel stack can be released after the list_del_init.
399 */
400 if (!list_empty_careful(&uwq.wq.task_list)) {
401 spin_lock(&ctx->fault_pending_wqh.lock);
402 /*
403 * No need of list_del_init(), the uwq on the stack
404 * will be freed shortly anyway.
405 */
406 list_del(&uwq.wq.task_list);
407 spin_unlock(&ctx->fault_pending_wqh.lock);
408 }
409
410 /*
411 * ctx may go away after this if the userfault pseudo fd is
412 * already released.
413 */
414 userfaultfd_ctx_put(ctx);
415
416out:
417 return ret;
418}
419
420static int userfaultfd_release(struct inode *inode, struct file *file)
421{
422 struct userfaultfd_ctx *ctx = file->private_data;
423 struct mm_struct *mm = ctx->mm;
424 struct vm_area_struct *vma, *prev;
425 /* len == 0 means wake all */
426 struct userfaultfd_wake_range range = { .len = 0, };
427 unsigned long new_flags;
428
429 ACCESS_ONCE(ctx->released) = true;
430
431 /*
432 * Flush page faults out of all CPUs. NOTE: all page faults
433 * must be retried without returning VM_FAULT_SIGBUS if
434 * userfaultfd_ctx_get() succeeds but vma->vm_userfaultfd_ctx
435 * changes while handle_userfault released the mmap_sem. So
436 * it's critical that released is set to true (above), before
437 * taking the mmap_sem for writing.
438 */
439 down_write(&mm->mmap_sem);
440 prev = NULL;
441 for (vma = mm->mmap; vma; vma = vma->vm_next) {
442 cond_resched();
443 BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
444 !!(vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
445 if (vma->vm_userfaultfd_ctx.ctx != ctx) {
446 prev = vma;
447 continue;
448 }
449 new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
450 prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
451 new_flags, vma->anon_vma,
452 vma->vm_file, vma->vm_pgoff,
453 vma_policy(vma),
454 NULL_VM_UFFD_CTX);
455 if (prev)
456 vma = prev;
457 else
458 prev = vma;
459 vma->vm_flags = new_flags;
460 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
461 }
462 up_write(&mm->mmap_sem);
463
464 /*
465 * After no new page faults can wait on this fault_*wqh, flush
466 * the last page faults that may have been already waiting on
467 * the fault_*wqh.
468 */
469 spin_lock(&ctx->fault_pending_wqh.lock);
470 __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0, &range);
471 __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, &range);
472 spin_unlock(&ctx->fault_pending_wqh.lock);
473
474 wake_up_poll(&ctx->fd_wqh, POLLHUP);
475 userfaultfd_ctx_put(ctx);
476 return 0;
477}
478
479/* fault_pending_wqh.lock must be held by the caller */
480static inline struct userfaultfd_wait_queue *find_userfault(
481 struct userfaultfd_ctx *ctx)
482{
483 wait_queue_t *wq;
484 struct userfaultfd_wait_queue *uwq;
485
486 VM_BUG_ON(!spin_is_locked(&ctx->fault_pending_wqh.lock));
487
488 uwq = NULL;
489 if (!waitqueue_active(&ctx->fault_pending_wqh))
490 goto out;
491 /* walk in reverse to provide FIFO behavior to read userfaults */
492 wq = list_last_entry(&ctx->fault_pending_wqh.task_list,
493 typeof(*wq), task_list);
494 uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
495out:
496 return uwq;
497}
498
499static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
500{
501 struct userfaultfd_ctx *ctx = file->private_data;
502 unsigned int ret;
503
504 poll_wait(file, &ctx->fd_wqh, wait);
505
506 switch (ctx->state) {
507 case UFFD_STATE_WAIT_API:
508 return POLLERR;
509 case UFFD_STATE_RUNNING:
510 /*
511 * poll() never guarantees that read won't block.
512 * userfaults can be woken before they're read().
513 */
514 if (unlikely(!(file->f_flags & O_NONBLOCK)))
515 return POLLERR;
516 /*
517 * Lockless access to see if there are pending faults.
518 * __pollwait's last action is the add_wait_queue but
519 * the spin_unlock would allow the waitqueue_active to
520 * pass above the actual list_add inside
521 * add_wait_queue critical section. So use a full
522 * memory barrier to serialize the list_add write of
523 * add_wait_queue() with the waitqueue_active read
524 * below.
525 */
526 ret = 0;
527 smp_mb();
528 if (waitqueue_active(&ctx->fault_pending_wqh))
529 ret = POLLIN;
530 return ret;
531 default:
532 BUG();
533 }
534}
535
536static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
537 struct uffd_msg *msg)
538{
539 ssize_t ret;
540 DECLARE_WAITQUEUE(wait, current);
541 struct userfaultfd_wait_queue *uwq;
542
543 /* always take the fd_wqh lock before the fault_pending_wqh lock */
544 spin_lock(&ctx->fd_wqh.lock);
545 __add_wait_queue(&ctx->fd_wqh, &wait);
546 for (;;) {
547 set_current_state(TASK_INTERRUPTIBLE);
548 spin_lock(&ctx->fault_pending_wqh.lock);
549 uwq = find_userfault(ctx);
550 if (uwq) {
551 /*
552 * Use a seqcount to repeat the lockless check
553 * in wake_userfault() to avoid missing
554 * wakeups because during the refile both
555 * waitqueue could become empty if this is the
556 * only userfault.
557 */
558 write_seqcount_begin(&ctx->refile_seq);
559
560 /*
561 * The fault_pending_wqh.lock prevents the uwq
562 * from disappearing from under us.
563 *
564 * Refile this userfault from
565 * fault_pending_wqh to fault_wqh, it's not
566 * pending anymore after we read it.
567 *
568 * Use list_del() by hand (as
569 * userfaultfd_wake_function also uses
570 * list_del_init() by hand) to be sure nobody
571 * changes __remove_wait_queue() to use
572 * list_del_init() in turn breaking the
573 * !list_empty_careful() check in
574 * handle_userfault(). The uwq->wq.task_list
575 * must never be empty at any time during the
576 * refile, or the waitqueue could disappear
577 * from under us. The "wait_queue_head_t"
578 * parameter of __remove_wait_queue() is unused
579 * anyway.
580 */
581 list_del(&uwq->wq.task_list);
582 __add_wait_queue(&ctx->fault_wqh, &uwq->wq);
583
584 write_seqcount_end(&ctx->refile_seq);
585
586 /* careful to always initialize msg if ret == 0 */
587 *msg = uwq->msg;
588 spin_unlock(&ctx->fault_pending_wqh.lock);
589 ret = 0;
590 break;
591 }
592 spin_unlock(&ctx->fault_pending_wqh.lock);
593 if (signal_pending(current)) {
594 ret = -ERESTARTSYS;
595 break;
596 }
597 if (no_wait) {
598 ret = -EAGAIN;
599 break;
600 }
601 spin_unlock(&ctx->fd_wqh.lock);
602 schedule();
603 spin_lock(&ctx->fd_wqh.lock);
604 }
605 __remove_wait_queue(&ctx->fd_wqh, &wait);
606 __set_current_state(TASK_RUNNING);
607 spin_unlock(&ctx->fd_wqh.lock);
608
609 return ret;
610}
611
612static ssize_t userfaultfd_read(struct file *file, char __user *buf,
613 size_t count, loff_t *ppos)
614{
615 struct userfaultfd_ctx *ctx = file->private_data;
616 ssize_t _ret, ret = 0;
617 struct uffd_msg msg;
618 int no_wait = file->f_flags & O_NONBLOCK;
619
620 if (ctx->state == UFFD_STATE_WAIT_API)
621 return -EINVAL;
622
623 for (;;) {
624 if (count < sizeof(msg))
625 return ret ? ret : -EINVAL;
626 _ret = userfaultfd_ctx_read(ctx, no_wait, &msg);
627 if (_ret < 0)
628 return ret ? ret : _ret;
629 if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg)))
630 return ret ? ret : -EFAULT;
631 ret += sizeof(msg);
632 buf += sizeof(msg);
633 count -= sizeof(msg);
634 /*
635 * Allow reading more than one fault at a time but only
636 * block if waiting for the very first one.
637 */
638 no_wait = O_NONBLOCK;
639 }
640}
641
642static void __wake_userfault(struct userfaultfd_ctx *ctx,
643 struct userfaultfd_wake_range *range)
644{
645 unsigned long start, end;
646
647 start = range->start;
648 end = range->start + range->len;
649
650 spin_lock(&ctx->fault_pending_wqh.lock);
651 /* wake all in the range and autoremove */
652 if (waitqueue_active(&ctx->fault_pending_wqh))
653 __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0,
654 range);
655 if (waitqueue_active(&ctx->fault_wqh))
656 __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, range);
657 spin_unlock(&ctx->fault_pending_wqh.lock);
658}
659
660static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
661 struct userfaultfd_wake_range *range)
662{
663 unsigned seq;
664 bool need_wakeup;
665
666 /*
667 * To be sure waitqueue_active() is not reordered by the CPU
668 * before the pagetable update, use an explicit SMP memory
669 * barrier here. PT lock release or up_read(mmap_sem) still
670 * have release semantics that can allow the
671 * waitqueue_active() to be reordered before the pte update.
672 */
673 smp_mb();
674
675 /*
676 * Use waitqueue_active because it's very frequent to
677 * change the address space atomically even if there are no
678 * userfaults yet. So we take the spinlock only when we're
679 * sure we have userfaults to wake.
680 */
681 do {
682 seq = read_seqcount_begin(&ctx->refile_seq);
683 need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) ||
684 waitqueue_active(&ctx->fault_wqh);
685 cond_resched();
686 } while (read_seqcount_retry(&ctx->refile_seq, seq));
687 if (need_wakeup)
688 __wake_userfault(ctx, range);
689}
690
691static __always_inline int validate_range(struct mm_struct *mm,
692 __u64 start, __u64 len)
693{
694 __u64 task_size = mm->task_size;
695
696 if (start & ~PAGE_MASK)
697 return -EINVAL;
698 if (len & ~PAGE_MASK)
699 return -EINVAL;
700 if (!len)
701 return -EINVAL;
702 if (start < mmap_min_addr)
703 return -EINVAL;
704 if (start >= task_size)
705 return -EINVAL;
706 if (len > task_size - start)
707 return -EINVAL;
708 return 0;
709}
710
711static int userfaultfd_register(struct userfaultfd_ctx *ctx,
712 unsigned long arg)
713{
714 struct mm_struct *mm = ctx->mm;
715 struct vm_area_struct *vma, *prev, *cur;
716 int ret;
717 struct uffdio_register uffdio_register;
718 struct uffdio_register __user *user_uffdio_register;
719 unsigned long vm_flags, new_flags;
720 bool found;
721 unsigned long start, end, vma_end;
722
723 user_uffdio_register = (struct uffdio_register __user *) arg;
724
725 ret = -EFAULT;
726 if (copy_from_user(&uffdio_register, user_uffdio_register,
727 sizeof(uffdio_register)-sizeof(__u64)))
728 goto out;
729
730 ret = -EINVAL;
731 if (!uffdio_register.mode)
732 goto out;
733 if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING|
734 UFFDIO_REGISTER_MODE_WP))
735 goto out;
736 vm_flags = 0;
737 if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
738 vm_flags |= VM_UFFD_MISSING;
739 if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
740 vm_flags |= VM_UFFD_WP;
741 /*
742 * FIXME: remove the below error constraint by
743 * implementing the wprotect tracking mode.
744 */
745 ret = -EINVAL;
746 goto out;
747 }
748
749 ret = validate_range(mm, uffdio_register.range.start,
750 uffdio_register.range.len);
751 if (ret)
752 goto out;
753
754 start = uffdio_register.range.start;
755 end = start + uffdio_register.range.len;
756
757 down_write(&mm->mmap_sem);
758 vma = find_vma_prev(mm, start, &prev);
759
760 ret = -ENOMEM;
761 if (!vma)
762 goto out_unlock;
763
764 /* check that there's at least one vma in the range */
765 ret = -EINVAL;
766 if (vma->vm_start >= end)
767 goto out_unlock;
768
769 /*
770 * Search for incompatible vmas.
771 *
772 * FIXME: this shall be relaxed later so that it doesn't fail
773 * on tmpfs backed vmas (in addition to the current allowance
774 * on anonymous vmas).
775 */
776 found = false;
777 for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
778 cond_resched();
779
780 BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
781 !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
782
783 /* check for incompatible vmas */
784 ret = -EINVAL;
785 if (cur->vm_ops)
786 goto out_unlock;
787
788 /*
789 * Check that this vma isn't already owned by a
790 * different userfaultfd. We can't allow more than one
791 * userfaultfd to own a single vma simultaneously or we
792 * wouldn't know which one to deliver the userfaults to.
793 */
794 ret = -EBUSY;
795 if (cur->vm_userfaultfd_ctx.ctx &&
796 cur->vm_userfaultfd_ctx.ctx != ctx)
797 goto out_unlock;
798
799 found = true;
800 }
801 BUG_ON(!found);
802
803 if (vma->vm_start < start)
804 prev = vma;
805
806 ret = 0;
807 do {
808 cond_resched();
809
810 BUG_ON(vma->vm_ops);
811 BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
812 vma->vm_userfaultfd_ctx.ctx != ctx);
813
814 /*
815 * Nothing to do: this vma is already registered into this
816 * userfaultfd and with the right tracking mode too.
817 */
818 if (vma->vm_userfaultfd_ctx.ctx == ctx &&
819 (vma->vm_flags & vm_flags) == vm_flags)
820 goto skip;
821
822 if (vma->vm_start > start)
823 start = vma->vm_start;
824 vma_end = min(end, vma->vm_end);
825
826 new_flags = (vma->vm_flags & ~vm_flags) | vm_flags;
827 prev = vma_merge(mm, prev, start, vma_end, new_flags,
828 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
829 vma_policy(vma),
830 ((struct vm_userfaultfd_ctx){ ctx }));
831 if (prev) {
832 vma = prev;
833 goto next;
834 }
835 if (vma->vm_start < start) {
836 ret = split_vma(mm, vma, start, 1);
837 if (ret)
838 break;
839 }
840 if (vma->vm_end > end) {
841 ret = split_vma(mm, vma, end, 0);
842 if (ret)
843 break;
844 }
845 next:
846 /*
847 * In the vma_merge() successful mprotect-like case 8:
848 * the next vma was merged into the current one and
849 * the current one has not been updated yet.
850 */
851 vma->vm_flags = new_flags;
852 vma->vm_userfaultfd_ctx.ctx = ctx;
853
854 skip:
855 prev = vma;
856 start = vma->vm_end;
857 vma = vma->vm_next;
858 } while (vma && vma->vm_start < end);
859out_unlock:
860 up_write(&mm->mmap_sem);
861 if (!ret) {
862 /*
863 * Now that we scanned all vmas we can already tell
864 * userland which ioctl methods are guaranteed to
865 * succeed on this range.
866 */
867 if (put_user(UFFD_API_RANGE_IOCTLS,
868 &user_uffdio_register->ioctls))
869 ret = -EFAULT;
870 }
871out:
872 return ret;
873}
874
875static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
876 unsigned long arg)
877{
878 struct mm_struct *mm = ctx->mm;
879 struct vm_area_struct *vma, *prev, *cur;
880 int ret;
881 struct uffdio_range uffdio_unregister;
882 unsigned long new_flags;
883 bool found;
884 unsigned long start, end, vma_end;
885 const void __user *buf = (void __user *)arg;
886
887 ret = -EFAULT;
888 if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
889 goto out;
890
891 ret = validate_range(mm, uffdio_unregister.start,
892 uffdio_unregister.len);
893 if (ret)
894 goto out;
895
896 start = uffdio_unregister.start;
897 end = start + uffdio_unregister.len;
898
899 down_write(&mm->mmap_sem);
900 vma = find_vma_prev(mm, start, &prev);
901
902 ret = -ENOMEM;
903 if (!vma)
904 goto out_unlock;
905
906 /* check that there's at least one vma in the range */
907 ret = -EINVAL;
908 if (vma->vm_start >= end)
909 goto out_unlock;
910
911 /*
912 * Search for incompatible vmas.
913 *
914 * FIXME: this shall be relaxed later so that it doesn't fail
915 * on tmpfs backed vmas (in addition to the current allowance
916 * on anonymous vmas).
917 */
918 found = false;
919 ret = -EINVAL;
920 for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
921 cond_resched();
922
923 BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
924 !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
925
926 /*
927 * Check for incompatible vmas. Not strictly required
928 * here, as incompatible vmas cannot have a
929 * userfaultfd_ctx registered on them, but this
930 * provides stricter behavior so unregistration
931 * errors are noticed.
932 */
933 if (cur->vm_ops)
934 goto out_unlock;
935
936 found = true;
937 }
938 BUG_ON(!found);
939
940 if (vma->vm_start < start)
941 prev = vma;
942
943 ret = 0;
944 do {
945 cond_resched();
946
947 BUG_ON(vma->vm_ops);
948
949 /*
950 * Nothing to do: this vma is not registered with any
951 * userfaultfd, so there is nothing to unregister on it.
952 */
953 if (!vma->vm_userfaultfd_ctx.ctx)
954 goto skip;
955
956 if (vma->vm_start > start)
957 start = vma->vm_start;
958 vma_end = min(end, vma->vm_end);
959
960 new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
961 prev = vma_merge(mm, prev, start, vma_end, new_flags,
962 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
963 vma_policy(vma),
964 NULL_VM_UFFD_CTX);
965 if (prev) {
966 vma = prev;
967 goto next;
968 }
969 if (vma->vm_start < start) {
970 ret = split_vma(mm, vma, start, 1);
971 if (ret)
972 break;
973 }
974 if (vma->vm_end > end) {
975 ret = split_vma(mm, vma, end, 0);
976 if (ret)
977 break;
978 }
979 next:
980 /*
981 * In the vma_merge() successful mprotect-like case 8:
982 * the next vma was merged into the current one and
983 * the current one has not been updated yet.
984 */
985 vma->vm_flags = new_flags;
986 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
987
988 skip:
989 prev = vma;
990 start = vma->vm_end;
991 vma = vma->vm_next;
992 } while (vma && vma->vm_start < end);
993out_unlock:
994 up_write(&mm->mmap_sem);
995out:
996 return ret;
997}
998
999/*
1000 * userfaultfd_wake may be used in combination with the
1001 * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
1002 */
1003static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
1004 unsigned long arg)
1005{
1006 int ret;
1007 struct uffdio_range uffdio_wake;
1008 struct userfaultfd_wake_range range;
1009 const void __user *buf = (void __user *)arg;
1010
1011 ret = -EFAULT;
1012 if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
1013 goto out;
1014
1015 ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
1016 if (ret)
1017 goto out;
1018
1019 range.start = uffdio_wake.start;
1020 range.len = uffdio_wake.len;
1021
1022 /*
1023 * len == 0 means wake all and we don't want to wake all here,
1024 * so check it again to be sure.
1025 */
1026 VM_BUG_ON(!range.len);
1027
1028 wake_userfault(ctx, &range);
1029 ret = 0;
1030
1031out:
1032 return ret;
1033}
1034
1035static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
1036 unsigned long arg)
1037{
1038 __s64 ret;
1039 struct uffdio_copy uffdio_copy;
1040 struct uffdio_copy __user *user_uffdio_copy;
1041 struct userfaultfd_wake_range range;
1042
1043 user_uffdio_copy = (struct uffdio_copy __user *) arg;
1044
1045 ret = -EFAULT;
1046 if (copy_from_user(&uffdio_copy, user_uffdio_copy,
1047 /* don't copy "copy" last field */
1048 sizeof(uffdio_copy)-sizeof(__s64)))
1049 goto out;
1050
1051 ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
1052 if (ret)
1053 goto out;
1054 /*
1055 * double check for wraparound just in case. copy_from_user()
1056 * will later check uffdio_copy.src + uffdio_copy.len to fit
1057 * in the userland range.
1058 */
1059 ret = -EINVAL;
1060 if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
1061 goto out;
1062 if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE)
1063 goto out;
1064
1065 ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
1066 uffdio_copy.len);
1067 if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
1068 return -EFAULT;
1069 if (ret < 0)
1070 goto out;
1071 BUG_ON(!ret);
1072 /* len == 0 would wake all */
1073 range.len = ret;
1074 if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
1075 range.start = uffdio_copy.dst;
1076 wake_userfault(ctx, &range);
1077 }
1078 ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
1079out:
1080 return ret;
1081}
1082
1083static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
1084 unsigned long arg)
1085{
1086 __s64 ret;
1087 struct uffdio_zeropage uffdio_zeropage;
1088 struct uffdio_zeropage __user *user_uffdio_zeropage;
1089 struct userfaultfd_wake_range range;
1090
1091 user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
1092
1093 ret = -EFAULT;
1094 if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
1095 /* don't copy "zeropage" last field */
1096 sizeof(uffdio_zeropage)-sizeof(__s64)))
1097 goto out;
1098
1099 ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
1100 uffdio_zeropage.range.len);
1101 if (ret)
1102 goto out;
1103 ret = -EINVAL;
1104 if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
1105 goto out;
1106
1107 ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
1108 uffdio_zeropage.range.len);
1109 if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
1110 return -EFAULT;
1111 if (ret < 0)
1112 goto out;
1113 /* len == 0 would wake all */
1114 BUG_ON(!ret);
1115 range.len = ret;
1116 if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
1117 range.start = uffdio_zeropage.range.start;
1118 wake_userfault(ctx, &range);
1119 }
1120 ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
1121out:
1122 return ret;
1123}
1124
1125/*
1126 * userland asks for a certain API version and we return which bits
1127 * and ioctl commands are implemented in this kernel for such API
1128 * version or -EINVAL if unknown.
1129 */
1130static int userfaultfd_api(struct userfaultfd_ctx *ctx,
1131 unsigned long arg)
1132{
1133 struct uffdio_api uffdio_api;
1134 void __user *buf = (void __user *)arg;
1135 int ret;
1136
1137 ret = -EINVAL;
1138 if (ctx->state != UFFD_STATE_WAIT_API)
1139 goto out;
1140 ret = -EFAULT;
1141 if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
1142 goto out;
1143 if (uffdio_api.api != UFFD_API || uffdio_api.features) {
1144 memset(&uffdio_api, 0, sizeof(uffdio_api));
1145 if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
1146 goto out;
1147 ret = -EINVAL;
1148 goto out;
1149 }
1150 uffdio_api.features = UFFD_API_FEATURES;
1151 uffdio_api.ioctls = UFFD_API_IOCTLS;
1152 ret = -EFAULT;
1153 if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
1154 goto out;
1155 ctx->state = UFFD_STATE_RUNNING;
1156 ret = 0;
1157out:
1158 return ret;
1159}
1160
1161static long userfaultfd_ioctl(struct file *file, unsigned cmd,
1162 unsigned long arg)
1163{
1164 int ret = -EINVAL;
1165 struct userfaultfd_ctx *ctx = file->private_data;
1166
1167 if (cmd != UFFDIO_API && ctx->state == UFFD_STATE_WAIT_API)
1168 return -EINVAL;
1169
1170 switch(cmd) {
1171 case UFFDIO_API:
1172 ret = userfaultfd_api(ctx, arg);
1173 break;
1174 case UFFDIO_REGISTER:
1175 ret = userfaultfd_register(ctx, arg);
1176 break;
1177 case UFFDIO_UNREGISTER:
1178 ret = userfaultfd_unregister(ctx, arg);
1179 break;
1180 case UFFDIO_WAKE:
1181 ret = userfaultfd_wake(ctx, arg);
1182 break;
1183 case UFFDIO_COPY:
1184 ret = userfaultfd_copy(ctx, arg);
1185 break;
1186 case UFFDIO_ZEROPAGE:
1187 ret = userfaultfd_zeropage(ctx, arg);
1188 break;
1189 }
1190 return ret;
1191}
1192
1193#ifdef CONFIG_PROC_FS
1194static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
1195{
1196 struct userfaultfd_ctx *ctx = f->private_data;
1197 wait_queue_t *wq;
1198 struct userfaultfd_wait_queue *uwq;
1199 unsigned long pending = 0, total = 0;
1200
1201 spin_lock(&ctx->fault_pending_wqh.lock);
1202 list_for_each_entry(wq, &ctx->fault_pending_wqh.task_list, task_list) {
1203 uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
1204 pending++;
1205 total++;
1206 }
1207 list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) {
1208 uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
1209 total++;
1210 }
1211 spin_unlock(&ctx->fault_pending_wqh.lock);
1212
1213 /*
1214 * If more protocols are added in the future, they will all be shown
1215 * separated by a space. Like this:
1216 * protocols: aa:... bb:...
1217 */
1218 seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
1219 pending, total, UFFD_API, UFFD_API_FEATURES,
1220 UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
1221}
1222#endif
1223
1224static const struct file_operations userfaultfd_fops = {
1225#ifdef CONFIG_PROC_FS
1226 .show_fdinfo = userfaultfd_show_fdinfo,
1227#endif
1228 .release = userfaultfd_release,
1229 .poll = userfaultfd_poll,
1230 .read = userfaultfd_read,
1231 .unlocked_ioctl = userfaultfd_ioctl,
1232 .compat_ioctl = userfaultfd_ioctl,
1233 .llseek = noop_llseek,
1234};
1235
1236static void init_once_userfaultfd_ctx(void *mem)
1237{
1238 struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem;
1239
1240 init_waitqueue_head(&ctx->fault_pending_wqh);
1241 init_waitqueue_head(&ctx->fault_wqh);
1242 init_waitqueue_head(&ctx->fd_wqh);
1243 seqcount_init(&ctx->refile_seq);
1244}
1245
1246/**
1247 * userfaultfd_file_create - Creates a userfaultfd file pointer.
1248 * @flags: Flags for the userfaultfd file.
1249 *
1250 * This function creates a userfaultfd file pointer without installing
1251 * it into the fd table. This is useful when the userfaultfd file is
1252 * used during the initialization of data structures that require
1253 * extra setup after the userfaultfd creation. So the userfaultfd
1254 * creation is split into the file pointer creation phase, and the
1255 * file descriptor installation phase. In this way races with
1256 * userspace closing the newly installed file descriptor can be
1257 * avoided. Returns a userfaultfd file pointer, or a proper error
1258 * pointer.
1259 */
1260static struct file *userfaultfd_file_create(int flags)
1261{
1262 struct file *file;
1263 struct userfaultfd_ctx *ctx;
1264
1265 BUG_ON(!current->mm);
1266
1267 /* Check the UFFD_* constants for consistency. */
1268 BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
1269 BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);
1270
1271 file = ERR_PTR(-EINVAL);
1272 if (flags & ~UFFD_SHARED_FCNTL_FLAGS)
1273 goto out;
1274
1275 file = ERR_PTR(-ENOMEM);
1276 ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
1277 if (!ctx)
1278 goto out;
1279
1280 atomic_set(&ctx->refcount, 1);
1281 ctx->flags = flags;
1282 ctx->state = UFFD_STATE_WAIT_API;
1283 ctx->released = false;
1284 ctx->mm = current->mm;
1285 /* prevent the mm struct from being freed */
1286 atomic_inc(&ctx->mm->mm_users);
1287
1288 file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
1289 O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
1290 if (IS_ERR(file))
1291 kmem_cache_free(userfaultfd_ctx_cachep, ctx);
1292out:
1293 return file;
1294}
1295
1296SYSCALL_DEFINE1(userfaultfd, int, flags)
1297{
1298 int fd, error;
1299 struct file *file;
1300
1301 error = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS);
1302 if (error < 0)
1303 return error;
1304 fd = error;
1305
1306 file = userfaultfd_file_create(flags);
1307 if (IS_ERR(file)) {
1308 error = PTR_ERR(file);
1309 goto err_put_unused_fd;
1310 }
1311 fd_install(fd, file);
1312
1313 return fd;
1314
1315err_put_unused_fd:
1316 put_unused_fd(fd);
1317
1318 return error;
1319}
1320
1321static int __init userfaultfd_init(void)
1322{
1323 userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
1324 sizeof(struct userfaultfd_ctx),
1325 0,
1326 SLAB_HWCACHE_ALIGN|SLAB_PANIC,
1327 init_once_userfaultfd_ctx);
1328 return 0;
1329}
1330__initcall(userfaultfd_init);
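
As a rough illustration of the control flow this new file implements, a minimal userland consumer would perform the UFFDIO_API handshake, register an anonymous range in MISSING mode, then read one uffd_msg and resolve it with UFFDIO_COPY. The sketch below is not part of the patch; error handling is abbreviated, the mappings are arbitrary, and it assumes uapi headers that provide __NR_userfaultfd and <linux/userfaultfd.h>.

#define _GNU_SOURCE
#include <fcntl.h>
#include <poll.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	if (uffd < 0)
		return 1;

	/* Mandatory handshake: moves the fd from WAIT_API to RUNNING. */
	struct uffdio_api api = { .api = UFFD_API };
	if (ioctl(uffd, UFFDIO_API, &api))
		return 1;

	char *region = mmap(NULL, page, PROT_READ | PROT_WRITE,
			    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	char *src = mmap(NULL, page, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (region == MAP_FAILED || src == MAP_FAILED)
		return 1;

	/* Only MISSING mode is accepted by this version of the code. */
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)region, .len = page },
		.mode  = UFFDIO_REGISTER_MODE_MISSING,
	};
	if (ioctl(uffd, UFFDIO_REGISTER, &reg))
		return 1;

	/* In a real program another thread would touch "region" and fault;
	 * this only shows how such a fault would be serviced. */
	struct pollfd pfd = { .fd = uffd, .events = POLLIN };
	if (poll(&pfd, 1, 0) > 0) {
		struct uffd_msg msg;

		if (read(uffd, &msg, sizeof(msg)) == sizeof(msg) &&
		    msg.event == UFFD_EVENT_PAGEFAULT) {
			struct uffdio_copy copy = {
				.dst = msg.arg.pagefault.address & ~(page - 1),
				.src = (unsigned long)src,
				.len = page,
				.mode = 0,	/* 0 == also wake the faulter */
			};
			ioctl(uffd, UFFDIO_COPY, &copy);
		}
	}
	return 0;
}
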
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 1fb16562c159..bbd9b1f10ffb 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -511,9 +511,9 @@ xfs_showargs(
511 seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10); 511 seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10);
512 512
513 if (mp->m_logname) 513 if (mp->m_logname)
514 seq_printf(m, "," MNTOPT_LOGDEV "=%s", mp->m_logname); 514 seq_show_option(m, MNTOPT_LOGDEV, mp->m_logname);
515 if (mp->m_rtname) 515 if (mp->m_rtname)
516 seq_printf(m, "," MNTOPT_RTDEV "=%s", mp->m_rtname); 516 seq_show_option(m, MNTOPT_RTDEV, mp->m_rtname);
517 517
518 if (mp->m_dalign > 0) 518 if (mp->m_dalign > 0)
519 seq_printf(m, "," MNTOPT_SUNIT "=%d", 519 seq_printf(m, "," MNTOPT_SUNIT "=%d",
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 8b6c083e68a7..8d70e1361ecd 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -137,6 +137,7 @@ struct cred {
137 kernel_cap_t cap_permitted; /* caps we're permitted */ 137 kernel_cap_t cap_permitted; /* caps we're permitted */
138 kernel_cap_t cap_effective; /* caps we can actually use */ 138 kernel_cap_t cap_effective; /* caps we can actually use */
139 kernel_cap_t cap_bset; /* capability bounding set */ 139 kernel_cap_t cap_bset; /* capability bounding set */
140 kernel_cap_t cap_ambient; /* Ambient capability set */
140#ifdef CONFIG_KEYS 141#ifdef CONFIG_KEYS
141 unsigned char jit_keyring; /* default keyring to attach requested 142 unsigned char jit_keyring; /* default keyring to attach requested
142 * keys to */ 143 * keys to */
@@ -212,6 +213,13 @@ static inline void validate_process_creds(void)
212} 213}
213#endif 214#endif
214 215
216static inline bool cap_ambient_invariant_ok(const struct cred *cred)
217{
218 return cap_issubset(cred->cap_ambient,
219 cap_intersect(cred->cap_permitted,
220 cred->cap_inheritable));
221}
222
215/** 223/**
216 * get_new_cred - Get a reference on a new set of credentials 224 * get_new_cred - Get a reference on a new set of credentials
217 * @cred: The new credentials to reference 225 * @cred: The new credentials to reference
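
The cap_ambient_invariant_ok() helper above encodes the rule that the ambient set must stay a subset of the intersection of the permitted and inheritable sets. As a hedged illustration (not part of this hunk), userland would typically populate the ambient set through the PR_CAP_AMBIENT prctl added by the same series, and the kernel rejects the raise when the invariant would be violated:

/* Illustrative only: raise CAP_NET_BIND_SERVICE into the ambient set so it
 * survives execve() of an unprivileged helper. Requires uapi headers that
 * define PR_CAP_AMBIENT, and the capability must already be in both the
 * permitted and inheritable sets. */
#include <stdio.h>
#include <sys/prctl.h>
#include <linux/capability.h>
#include <linux/prctl.h>

int raise_ambient_net_bind(void)
{
	if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE,
		  CAP_NET_BIND_SERVICE, 0, 0)) {
		perror("PR_CAP_AMBIENT_RAISE");
		return -1;
	}
	return 0;
}
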
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fbd780c33c5f..864203c10dbc 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1612,7 +1612,6 @@ struct file_operations {
1612 long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); 1612 long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
1613 long (*compat_ioctl) (struct file *, unsigned int, unsigned long); 1613 long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
1614 int (*mmap) (struct file *, struct vm_area_struct *); 1614 int (*mmap) (struct file *, struct vm_area_struct *);
1615 int (*mremap)(struct file *, struct vm_area_struct *);
1616 int (*open) (struct inode *, struct file *); 1615 int (*open) (struct inode *, struct file *);
1617 int (*flush) (struct file *, fl_owner_t id); 1616 int (*flush) (struct file *, fl_owner_t id);
1618 int (*release) (struct inode *, struct file *); 1617 int (*release) (struct inode *, struct file *);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 65a517dd32f7..e0727d77feaf 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -195,40 +195,49 @@ struct fsnotify_group {
195#define FSNOTIFY_EVENT_INODE 2 195#define FSNOTIFY_EVENT_INODE 2
196 196
197/* 197/*
198 * a mark is simply an object attached to an in core inode which allows an 198 * A mark is simply an object attached to an in core inode which allows an
199 * fsnotify listener to indicate they are either no longer interested in events 199 * fsnotify listener to indicate they are either no longer interested in events
200 * of a type matching mask or only interested in those events. 200 * of a type matching mask or only interested in those events.
201 * 201 *
202 * these are flushed when an inode is evicted from core and may be flushed 202 * These are flushed when an inode is evicted from core and may be flushed
203 * when the inode is modified (as seen by fsnotify_access). Some fsnotify users 203 * when the inode is modified (as seen by fsnotify_access). Some fsnotify
204 * (such as dnotify) will flush these when the open fd is closed and not at 204 * users (such as dnotify) will flush these when the open fd is closed and not
205 * inode eviction or modification. 205 * at inode eviction or modification.
206 *
207 * Text in brackets is showing the lock(s) protecting modifications of a
208 * particular entry. obj_lock means either inode->i_lock or
209 * mnt->mnt_root->d_lock depending on the mark type.
206 */ 210 */
207struct fsnotify_mark { 211struct fsnotify_mark {
208 __u32 mask; /* mask this mark is for */ 212 /* Mask this mark is for [mark->lock, group->mark_mutex] */
209 /* we hold ref for each i_list and g_list. also one ref for each 'thing' 213 __u32 mask;
214 /* We hold one for presence in g_list. Also one ref for each 'thing'
210 * in kernel that found and may be using this mark. */ 215 * in kernel that found and may be using this mark. */
211 atomic_t refcnt; /* active things looking at this mark */ 216 atomic_t refcnt;
212 struct fsnotify_group *group; /* group this mark is for */ 217 /* Group this mark is for. Set on mark creation, stable until last ref
213 struct list_head g_list; /* list of marks by group->i_fsnotify_marks 218 * is dropped */
214 * Also reused for queueing mark into 219 struct fsnotify_group *group;
215 * destroy_list when it's waiting for 220 /* List of marks by group->i_fsnotify_marks. Also reused for queueing
216 * the end of SRCU period before it can 221 * mark into destroy_list when it's waiting for the end of SRCU period
217 * be freed */ 222 * before it can be freed. [group->mark_mutex] */
218 spinlock_t lock; /* protect group and inode */ 223 struct list_head g_list;
219 struct hlist_node obj_list; /* list of marks for inode / vfsmount */ 224 /* Protects inode / mnt pointers, flags, masks */
220 struct list_head free_list; /* tmp list used when freeing this mark */ 225 spinlock_t lock;
221 union { 226 /* List of marks for inode / vfsmount [obj_lock] */
227 struct hlist_node obj_list;
228 union { /* Object pointer [mark->lock, group->mark_mutex] */
222 struct inode *inode; /* inode this mark is associated with */ 229 struct inode *inode; /* inode this mark is associated with */
223 struct vfsmount *mnt; /* vfsmount this mark is associated with */ 230 struct vfsmount *mnt; /* vfsmount this mark is associated with */
224 }; 231 };
225 __u32 ignored_mask; /* events types to ignore */ 232 /* Events types to ignore [mark->lock, group->mark_mutex] */
233 __u32 ignored_mask;
226#define FSNOTIFY_MARK_FLAG_INODE 0x01 234#define FSNOTIFY_MARK_FLAG_INODE 0x01
227#define FSNOTIFY_MARK_FLAG_VFSMOUNT 0x02 235#define FSNOTIFY_MARK_FLAG_VFSMOUNT 0x02
228#define FSNOTIFY_MARK_FLAG_OBJECT_PINNED 0x04 236#define FSNOTIFY_MARK_FLAG_OBJECT_PINNED 0x04
229#define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x08 237#define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x08
230#define FSNOTIFY_MARK_FLAG_ALIVE 0x10 238#define FSNOTIFY_MARK_FLAG_ALIVE 0x10
231 unsigned int flags; /* vfsmount or inode mark? */ 239#define FSNOTIFY_MARK_FLAG_ATTACHED 0x20
240 unsigned int flags; /* flags [mark->lock] */
232 void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */ 241 void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */
233}; 242};
234 243
@@ -345,8 +354,10 @@ extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark, struct fsnotify_
345/* given a group and a mark, flag mark to be freed when all references are dropped */ 354/* given a group and a mark, flag mark to be freed when all references are dropped */
346extern void fsnotify_destroy_mark(struct fsnotify_mark *mark, 355extern void fsnotify_destroy_mark(struct fsnotify_mark *mark,
347 struct fsnotify_group *group); 356 struct fsnotify_group *group);
348extern void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, 357/* detach mark from inode / mount list, group list, drop inode reference */
349 struct fsnotify_group *group); 358extern void fsnotify_detach_mark(struct fsnotify_mark *mark);
359/* free mark */
360extern void fsnotify_free_mark(struct fsnotify_mark *mark);
350/* run all the marks in a group, and clear all of the vfsmount marks */ 361/* run all the marks in a group, and clear all of the vfsmount marks */
351extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group); 362extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group);
352/* run all the marks in a group, and clear all of the inode marks */ 363/* run all the marks in a group, and clear all of the inode marks */
diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h
index 5383bb1394a1..7ff168d06967 100644
--- a/include/linux/genalloc.h
+++ b/include/linux/genalloc.h
@@ -59,6 +59,8 @@ struct gen_pool {
59 59
60 genpool_algo_t algo; /* allocation function */ 60 genpool_algo_t algo; /* allocation function */
61 void *data; 61 void *data;
62
63 const char *name;
62}; 64};
63 65
64/* 66/*
@@ -118,8 +120,8 @@ extern unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size,
118 unsigned long start, unsigned int nr, void *data); 120 unsigned long start, unsigned int nr, void *data);
119 121
120extern struct gen_pool *devm_gen_pool_create(struct device *dev, 122extern struct gen_pool *devm_gen_pool_create(struct device *dev,
121 int min_alloc_order, int nid); 123 int min_alloc_order, int nid, const char *name);
122extern struct gen_pool *gen_pool_get(struct device *dev); 124extern struct gen_pool *gen_pool_get(struct device *dev, const char *name);
123 125
124bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start, 126bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start,
125 size_t size); 127 size_t size);
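
With the name argument added above, one device can own several gen_pools and consumers can look them up individually. A sketch under assumptions (the device, the region and the "sram-exec" name are invented for illustration):

/* Hypothetical driver probe path: create a named pool, then find it again. */
#include <linux/device.h>
#include <linux/err.h>
#include <linux/genalloc.h>
#include <linux/log2.h>
#include <linux/numa.h>

static int example_setup_pool(struct device *dev, phys_addr_t phys,
			      void __iomem *virt, size_t size)
{
	struct gen_pool *pool;

	pool = devm_gen_pool_create(dev, ilog2(64), NUMA_NO_NODE, "sram-exec");
	if (IS_ERR_OR_NULL(pool))
		return pool ? PTR_ERR(pool) : -ENOMEM;

	return gen_pool_add_virt(pool, (unsigned long)virt, phys, size,
				 NUMA_NO_NODE);
}

/* Elsewhere, a consumer bound to the same struct device: */
static struct gen_pool *example_find_pool(struct device *dev)
{
	return gen_pool_get(dev, "sram-exec");	/* NULL if not (yet) created */
}
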
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 869b21dcf503..e691b6a23f72 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -11,7 +11,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
11 const char namefmt[], ...); 11 const char namefmt[], ...);
12 12
13#define kthread_create(threadfn, data, namefmt, arg...) \ 13#define kthread_create(threadfn, data, namefmt, arg...) \
14 kthread_create_on_node(threadfn, data, -1, namefmt, ##arg) 14 kthread_create_on_node(threadfn, data, NUMA_NO_NODE, namefmt, ##arg)
15 15
16 16
17struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), 17struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bf6f117fcf4d..8b257c43855b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -124,8 +124,10 @@ extern unsigned int kobjsize(const void *objp);
124#define VM_MAYSHARE 0x00000080 124#define VM_MAYSHARE 0x00000080
125 125
126#define VM_GROWSDOWN 0x00000100 /* general info on the segment */ 126#define VM_GROWSDOWN 0x00000100 /* general info on the segment */
127#define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */
127#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ 128#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */
128#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ 129#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */
130#define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */
129 131
130#define VM_LOCKED 0x00002000 132#define VM_LOCKED 0x00002000
131#define VM_IO 0x00004000 /* Memory mapped I/O or similar */ 133#define VM_IO 0x00004000 /* Memory mapped I/O or similar */
@@ -245,6 +247,7 @@ struct vm_fault {
245struct vm_operations_struct { 247struct vm_operations_struct {
246 void (*open)(struct vm_area_struct * area); 248 void (*open)(struct vm_area_struct * area);
247 void (*close)(struct vm_area_struct * area); 249 void (*close)(struct vm_area_struct * area);
250 int (*mremap)(struct vm_area_struct * area);
248 int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); 251 int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
249 void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf); 252 void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf);
250 253
@@ -1833,7 +1836,7 @@ extern int vma_adjust(struct vm_area_struct *vma, unsigned long start,
1833extern struct vm_area_struct *vma_merge(struct mm_struct *, 1836extern struct vm_area_struct *vma_merge(struct mm_struct *,
1834 struct vm_area_struct *prev, unsigned long addr, unsigned long end, 1837 struct vm_area_struct *prev, unsigned long addr, unsigned long end,
1835 unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, 1838 unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
1836 struct mempolicy *); 1839 struct mempolicy *, struct vm_userfaultfd_ctx);
1837extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); 1840extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
1838extern int split_vma(struct mm_struct *, 1841extern int split_vma(struct mm_struct *,
1839 struct vm_area_struct *, unsigned long addr, int new_below); 1842 struct vm_area_struct *, unsigned long addr, int new_below);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 15549578d559..c8d0a73d64c4 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -256,6 +256,16 @@ struct vm_region {
256 * this region */ 256 * this region */
257}; 257};
258 258
259#ifdef CONFIG_USERFAULTFD
260#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) { NULL, })
261struct vm_userfaultfd_ctx {
262 struct userfaultfd_ctx *ctx;
263};
264#else /* CONFIG_USERFAULTFD */
265#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) {})
266struct vm_userfaultfd_ctx {};
267#endif /* CONFIG_USERFAULTFD */
268
259/* 269/*
260 * This struct defines a memory VMM memory area. There is one of these 270 * This struct defines a memory VMM memory area. There is one of these
261 * per VM-area/task. A VM area is any part of the process virtual memory 271 * per VM-area/task. A VM area is any part of the process virtual memory
@@ -322,6 +332,7 @@ struct vm_area_struct {
322#ifdef CONFIG_NUMA 332#ifdef CONFIG_NUMA
323 struct mempolicy *vm_policy; /* NUMA policy for the VMA */ 333 struct mempolicy *vm_policy; /* NUMA policy for the VMA */
324#endif 334#endif
335 struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
325}; 336};
326 337
327struct core_thread { 338struct core_thread {
@@ -543,6 +554,7 @@ enum tlb_flush_reason {
543 TLB_REMOTE_SHOOTDOWN, 554 TLB_REMOTE_SHOOTDOWN,
544 TLB_LOCAL_SHOOTDOWN, 555 TLB_LOCAL_SHOOTDOWN,
545 TLB_LOCAL_MM_SHOOTDOWN, 556 TLB_LOCAL_MM_SHOOTDOWN,
557 TLB_REMOTE_SEND_IPI,
546 NR_TLB_FLUSH_REASONS, 558 NR_TLB_FLUSH_REASONS,
547}; 559};
548 560
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 754c25966a0a..ac00e2050943 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -690,14 +690,6 @@ struct zonelist {
690#endif 690#endif
691}; 691};
692 692
693#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
694struct node_active_region {
695 unsigned long start_pfn;
696 unsigned long end_pfn;
697 int nid;
698};
699#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
700
701#ifndef CONFIG_DISCONTIGMEM 693#ifndef CONFIG_DISCONTIGMEM
702/* The array of struct pages - for discontigmem use pgdat->lmem_map */ 694/* The array of struct pages - for discontigmem use pgdat->lmem_map */
703extern struct page *mem_map; 695extern struct page *mem_map;
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index f94da0e65dea..a91adf6e02f2 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -27,9 +27,7 @@ static inline void touch_nmi_watchdog(void)
27#if defined(CONFIG_HARDLOCKUP_DETECTOR) 27#if defined(CONFIG_HARDLOCKUP_DETECTOR)
28extern void hardlockup_detector_disable(void); 28extern void hardlockup_detector_disable(void);
29#else 29#else
30static inline void hardlockup_detector_disable(void) 30static inline void hardlockup_detector_disable(void) {}
31{
32}
33#endif 31#endif
34 32
35/* 33/*
@@ -80,6 +78,17 @@ extern int proc_watchdog_thresh(struct ctl_table *, int ,
80 void __user *, size_t *, loff_t *); 78 void __user *, size_t *, loff_t *);
81extern int proc_watchdog_cpumask(struct ctl_table *, int, 79extern int proc_watchdog_cpumask(struct ctl_table *, int,
82 void __user *, size_t *, loff_t *); 80 void __user *, size_t *, loff_t *);
81extern int lockup_detector_suspend(void);
82extern void lockup_detector_resume(void);
83#else
84static inline int lockup_detector_suspend(void)
85{
86 return 0;
87}
88
89static inline void lockup_detector_resume(void)
90{
91}
83#endif 92#endif
84 93
85#ifdef CONFIG_HAVE_ACPI_APEI_NMI 94#ifdef CONFIG_HAVE_ACPI_APEI_NMI
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index c89c53a113a8..29446aeef36e 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -89,6 +89,9 @@ enum ttu_flags {
89 TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ 89 TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */
90 TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ 90 TTU_IGNORE_ACCESS = (1 << 9), /* don't age */
91 TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */ 91 TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */
92 TTU_BATCH_FLUSH = (1 << 11), /* Batch TLB flushes where possible
93 * and caller guarantees they will
94 * do a final flush if necessary */
92}; 95};
93 96
94#ifdef CONFIG_MMU 97#ifdef CONFIG_MMU
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 119823decc46..a4ab9daa387c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1344,6 +1344,25 @@ enum perf_event_task_context {
1344 perf_nr_task_contexts, 1344 perf_nr_task_contexts,
1345}; 1345};
1346 1346
1347/* Track pages that require TLB flushes */
1348struct tlbflush_unmap_batch {
1349 /*
1350 * Each bit set is a CPU that potentially has a TLB entry for one of
1351 * the PFNs being flushed. See set_tlb_ubc_flush_pending().
1352 */
1353 struct cpumask cpumask;
1354
1355 /* True if any bit in cpumask is set */
1356 bool flush_required;
1357
1358 /*
1359 * If true then the PTE was dirty when unmapped. The entry must be
1360 * flushed before IO is initiated or a stale TLB entry potentially
1361 * allows an update without redirtying the page.
1362 */
1363 bool writable;
1364};
1365
1347struct task_struct { 1366struct task_struct {
1348 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ 1367 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
1349 void *stack; 1368 void *stack;
@@ -1700,6 +1719,10 @@ struct task_struct {
1700 unsigned long numa_pages_migrated; 1719 unsigned long numa_pages_migrated;
1701#endif /* CONFIG_NUMA_BALANCING */ 1720#endif /* CONFIG_NUMA_BALANCING */
1702 1721
1722#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
1723 struct tlbflush_unmap_batch tlb_ubc;
1724#endif
1725
1703 struct rcu_head rcu; 1726 struct rcu_head rcu;
1704 1727
1705 /* 1728 /*
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 912a7c482649..d4c7271382cb 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -149,6 +149,41 @@ static inline struct user_namespace *seq_user_ns(struct seq_file *seq)
149#endif 149#endif
150} 150}
151 151
152/**
153 * seq_show_options - display mount options with appropriate escapes.
154 * @m: the seq_file handle
155 * @name: the mount option name
156 * @value: the mount option name's value, can be NULL
157 */
158static inline void seq_show_option(struct seq_file *m, const char *name,
159 const char *value)
160{
161 seq_putc(m, ',');
162 seq_escape(m, name, ",= \t\n\\");
163 if (value) {
164 seq_putc(m, '=');
165 seq_escape(m, value, ", \t\n\\");
166 }
167}
168
169/**
170 * seq_show_option_n - display mount options with appropriate escapes
171 * where @value must be a specific length.
172 * @m: the seq_file handle
173 * @name: the mount option name
174 * @value: the mount option name's value, cannot be NULL
175 * @length: the length of @value to display
176 *
177 * This is a macro since this uses "length" to define the size of the
178 * stack buffer.
179 */
180#define seq_show_option_n(m, name, value, length) { \
181 char val_buf[length + 1]; \
182 strncpy(val_buf, value, length); \
183 val_buf[length] = '\0'; \
184 seq_show_option(m, name, val_buf); \
185}
186
152#define SEQ_START_TOKEN ((void *)1) 187#define SEQ_START_TOKEN ((void *)1)
153/* 188/*
154 * Helpers for iteration over list_head-s in seq_files 189 * Helpers for iteration over list_head-s in seq_files
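
The xfs hunk earlier in this patch shows the intended call pattern; more generally, any ->show_options implementation that emits user-controlled strings can switch to the helper above. A hedged sketch with invented filesystem names:

/* Hypothetical ->show_options hook; examplefs_sb_info and its fields are
 * invented, only the seq_show_option() call reflects the new helper. */
#include <linux/fs.h>
#include <linux/seq_file.h>

struct examplefs_sb_info {
	char *snapshot_name;		/* user-supplied at mount time */
	bool readonly_fallback;
};
#define EXAMPLEFS_SB(sb) ((struct examplefs_sb_info *)(sb)->s_fs_info)

static int examplefs_show_options(struct seq_file *m, struct dentry *root)
{
	struct examplefs_sb_info *sbi = EXAMPLEFS_SB(root->d_sb);

	/* Emits ",snapshot=<value>" with ',', '=', spaces etc. escaped, so a
	 * hostile mount option cannot forge extra entries in /proc/mounts. */
	seq_show_option(m, "snapshot", sbi->snapshot_name);

	if (sbi->readonly_fallback)
		seq_puts(m, ",ro_fallback");
	return 0;
}
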
diff --git a/include/linux/slab.h b/include/linux/slab.h
index a99f0e5243e1..7e37d448ed91 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -290,6 +290,16 @@ void *__kmalloc(size_t size, gfp_t flags);
290void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags); 290void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags);
291void kmem_cache_free(struct kmem_cache *, void *); 291void kmem_cache_free(struct kmem_cache *, void *);
292 292
293/*
294 * Bulk allocation and freeing operations. These are accelerated in an
295 * allocator specific way to avoid taking locks repeatedly or building
296 * metadata structures unnecessarily.
297 *
298 * Note that interrupts must be enabled when calling these functions.
299 */
300void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
301bool kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
302
293#ifdef CONFIG_NUMA 303#ifdef CONFIG_NUMA
294void *__kmalloc_node(size_t size, gfp_t flags, int node); 304void *__kmalloc_node(size_t size, gfp_t flags, int node);
295void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); 305void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
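
A minimal usage sketch of the new bulk interface; the cache pointer and batch size are hypothetical, and, as the comment above notes, interrupts must be enabled at the call site:

#include <linux/kernel.h>
#include <linux/slab.h>

static int example_fill_batch(struct kmem_cache *cache)
{
        void *objs[16];

        /* A false return means the batch could not be fully allocated. */
        if (!kmem_cache_alloc_bulk(cache, GFP_KERNEL, ARRAY_SIZE(objs), objs))
                return -ENOMEM;

        /* ... initialise and consume the 16 objects ... */

        kmem_cache_free_bulk(cache, ARRAY_SIZE(objs), objs);
        return 0;
}
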
diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h
index da3c593f9845..e6109a6cd8f6 100644
--- a/include/linux/smpboot.h
+++ b/include/linux/smpboot.h
@@ -48,7 +48,16 @@ struct smp_hotplug_thread {
48 const char *thread_comm; 48 const char *thread_comm;
49}; 49};
50 50
51int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread); 51int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
52 const struct cpumask *cpumask);
53
54static inline int
55smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
56{
57 return smpboot_register_percpu_thread_cpumask(plug_thread,
58 cpu_possible_mask);
59}
60
52void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread); 61void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread);
53int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, 62int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
54 const struct cpumask *); 63 const struct cpumask *);
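
A brief sketch of the intended use of the cpumask variant (kernel/watchdog.c below is the first caller): threads are still created for every possible CPU, but only the CPUs in the supplied mask are unparked at registration time. The descriptor and mask here are hypothetical:

#include <linux/smpboot.h>
#include <linux/cpumask.h>

static struct smp_hotplug_thread example_threads;      /* hypothetical; fields elided */
static struct cpumask example_cpumask;                  /* CPUs to unpark on */

static int example_init(void)
{
        /* Threads exist on every possible CPU, but only those in
         * example_cpumask start out unparked and running. */
        return smpboot_register_percpu_thread_cpumask(&example_threads,
                                                      &example_cpumask);
}
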
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index b45c45b8c829..08001317aee7 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -810,6 +810,7 @@ asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr);
810asmlinkage long sys_eventfd(unsigned int count); 810asmlinkage long sys_eventfd(unsigned int count);
811asmlinkage long sys_eventfd2(unsigned int count, int flags); 811asmlinkage long sys_eventfd2(unsigned int count, int flags);
812asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags); 812asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags);
813asmlinkage long sys_userfaultfd(int flags);
813asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); 814asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
814asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int); 815asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int);
815asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *, 816asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *,
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
new file mode 100644
index 000000000000..587480ad41b7
--- /dev/null
+++ b/include/linux/userfaultfd_k.h
@@ -0,0 +1,85 @@
1/*
2 * include/linux/userfaultfd_k.h
3 *
4 * Copyright (C) 2015 Red Hat, Inc.
5 *
6 */
7
8#ifndef _LINUX_USERFAULTFD_K_H
9#define _LINUX_USERFAULTFD_K_H
10
11#ifdef CONFIG_USERFAULTFD
12
13#include <linux/userfaultfd.h> /* linux/include/uapi/linux/userfaultfd.h */
14
15#include <linux/fcntl.h>
16
17/*
18 * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining
19 * new flags, since they might collide with O_* ones. We want
20 * to re-use O_* flags that couldn't possibly have a meaning
 21 * for userfaultfd, in order to leave a free define-space for
22 * shared O_* flags.
23 */
24#define UFFD_CLOEXEC O_CLOEXEC
25#define UFFD_NONBLOCK O_NONBLOCK
26
27#define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
28#define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS)
29
30extern int handle_userfault(struct vm_area_struct *vma, unsigned long address,
31 unsigned int flags, unsigned long reason);
32
33extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
34 unsigned long src_start, unsigned long len);
35extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
36 unsigned long dst_start,
37 unsigned long len);
38
39/* mm helpers */
40static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
41 struct vm_userfaultfd_ctx vm_ctx)
42{
43 return vma->vm_userfaultfd_ctx.ctx == vm_ctx.ctx;
44}
45
46static inline bool userfaultfd_missing(struct vm_area_struct *vma)
47{
48 return vma->vm_flags & VM_UFFD_MISSING;
49}
50
51static inline bool userfaultfd_armed(struct vm_area_struct *vma)
52{
53 return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP);
54}
55
56#else /* CONFIG_USERFAULTFD */
57
58/* mm helpers */
59static inline int handle_userfault(struct vm_area_struct *vma,
60 unsigned long address,
61 unsigned int flags,
62 unsigned long reason)
63{
64 return VM_FAULT_SIGBUS;
65}
66
67static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
68 struct vm_userfaultfd_ctx vm_ctx)
69{
70 return true;
71}
72
73static inline bool userfaultfd_missing(struct vm_area_struct *vma)
74{
75 return false;
76}
77
78static inline bool userfaultfd_armed(struct vm_area_struct *vma)
79{
80 return false;
81}
82
83#endif /* CONFIG_USERFAULTFD */
84
85#endif /* _LINUX_USERFAULTFD_K_H */
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 1e1bf9f963a9..d3d077228d4c 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -147,7 +147,8 @@ __remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old)
147 147
148typedef int wait_bit_action_f(struct wait_bit_key *); 148typedef int wait_bit_action_f(struct wait_bit_key *);
149void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); 149void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
150void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); 150void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr,
151 void *key);
151void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key); 152void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
152void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr); 153void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr);
153void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr); 154void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr);
@@ -179,7 +180,7 @@ wait_queue_head_t *bit_waitqueue(void *, int);
179#define wake_up_poll(x, m) \ 180#define wake_up_poll(x, m) \
180 __wake_up(x, TASK_NORMAL, 1, (void *) (m)) 181 __wake_up(x, TASK_NORMAL, 1, (void *) (m))
181#define wake_up_locked_poll(x, m) \ 182#define wake_up_locked_poll(x, m) \
182 __wake_up_locked_key((x), TASK_NORMAL, (void *) (m)) 183 __wake_up_locked_key((x), TASK_NORMAL, 1, (void *) (m))
183#define wake_up_interruptible_poll(x, m) \ 184#define wake_up_interruptible_poll(x, m) \
184 __wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m)) 185 __wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m))
185#define wake_up_interruptible_sync_poll(x, m) \ 186#define wake_up_interruptible_sync_poll(x, m) \
diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h
index f47feada5b42..d74a0e907b9e 100644
--- a/include/linux/watchdog.h
+++ b/include/linux/watchdog.h
@@ -140,12 +140,4 @@ extern int watchdog_init_timeout(struct watchdog_device *wdd,
140extern int watchdog_register_device(struct watchdog_device *); 140extern int watchdog_register_device(struct watchdog_device *);
141extern void watchdog_unregister_device(struct watchdog_device *); 141extern void watchdog_unregister_device(struct watchdog_device *);
142 142
143#ifdef CONFIG_HARDLOCKUP_DETECTOR
144void watchdog_nmi_disable_all(void);
145void watchdog_nmi_enable_all(void);
146#else
147static inline void watchdog_nmi_disable_all(void) {}
148static inline void watchdog_nmi_enable_all(void) {}
149#endif
150
151#endif /* ifndef _LINUX_WATCHDOG_H */ 143#endif /* ifndef _LINUX_WATCHDOG_H */
diff --git a/include/trace/events/tlb.h b/include/trace/events/tlb.h
index 4250f364a6ca..bc8815f45f3b 100644
--- a/include/trace/events/tlb.h
+++ b/include/trace/events/tlb.h
@@ -11,7 +11,8 @@
11 EM( TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" ) \ 11 EM( TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" ) \
12 EM( TLB_REMOTE_SHOOTDOWN, "remote shootdown" ) \ 12 EM( TLB_REMOTE_SHOOTDOWN, "remote shootdown" ) \
13 EM( TLB_LOCAL_SHOOTDOWN, "local shootdown" ) \ 13 EM( TLB_LOCAL_SHOOTDOWN, "local shootdown" ) \
14 EMe( TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" ) 14 EM( TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" ) \
15 EMe( TLB_REMOTE_SEND_IPI, "remote ipi send" )
15 16
16/* 17/*
17 * First define the enums in TLB_FLUSH_REASON to be exported to userspace 18 * First define the enums in TLB_FLUSH_REASON to be exported to userspace
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index aafb9937b162..70ff1d9abf0d 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -456,3 +456,4 @@ header-y += xfrm.h
456header-y += xilinx-v4l2-controls.h 456header-y += xilinx-v4l2-controls.h
457header-y += zorro.h 457header-y += zorro.h
458header-y += zorro_ids.h 458header-y += zorro_ids.h
459header-y += userfaultfd.h
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 31891d9535e2..a8d0759a9e40 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -190,4 +190,11 @@ struct prctl_mm_map {
190# define PR_FP_MODE_FR (1 << 0) /* 64b FP registers */ 190# define PR_FP_MODE_FR (1 << 0) /* 64b FP registers */
191# define PR_FP_MODE_FRE (1 << 1) /* 32b compatibility */ 191# define PR_FP_MODE_FRE (1 << 1) /* 32b compatibility */
192 192
193/* Control the ambient capability set */
194#define PR_CAP_AMBIENT 47
195# define PR_CAP_AMBIENT_IS_SET 1
196# define PR_CAP_AMBIENT_RAISE 2
197# define PR_CAP_AMBIENT_LOWER 3
198# define PR_CAP_AMBIENT_CLEAR_ALL 4
199
193#endif /* _LINUX_PRCTL_H */ 200#endif /* _LINUX_PRCTL_H */
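
A minimal userspace sketch of the new prctl interface. It assumes the capability is already present in the caller's permitted and inheritable sets, which is required for the raise to succeed:

#include <sys/prctl.h>
#include <linux/capability.h>
#include <stdio.h>

#ifndef PR_CAP_AMBIENT                  /* older libc headers */
#define PR_CAP_AMBIENT          47
#define PR_CAP_AMBIENT_RAISE    2
#endif

int main(void)
{
        /* Raise CAP_NET_BIND_SERVICE into the ambient set so that it
         * survives execve() of an unprivileged (non-setcap) helper. */
        if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE,
                  CAP_NET_BIND_SERVICE, 0, 0))
                perror("PR_CAP_AMBIENT_RAISE");
        return 0;
}

The SECBIT_NO_CAP_AMBIENT_RAISE bit added in the securebits.h hunk below allows a process to lock this operation out entirely.
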
diff --git a/include/uapi/linux/securebits.h b/include/uapi/linux/securebits.h
index 985aac9e6bf8..35ac35cef217 100644
--- a/include/uapi/linux/securebits.h
+++ b/include/uapi/linux/securebits.h
@@ -43,9 +43,18 @@
43#define SECBIT_KEEP_CAPS (issecure_mask(SECURE_KEEP_CAPS)) 43#define SECBIT_KEEP_CAPS (issecure_mask(SECURE_KEEP_CAPS))
44#define SECBIT_KEEP_CAPS_LOCKED (issecure_mask(SECURE_KEEP_CAPS_LOCKED)) 44#define SECBIT_KEEP_CAPS_LOCKED (issecure_mask(SECURE_KEEP_CAPS_LOCKED))
45 45
46/* When set, a process cannot add new capabilities to its ambient set. */
47#define SECURE_NO_CAP_AMBIENT_RAISE 6
48#define SECURE_NO_CAP_AMBIENT_RAISE_LOCKED 7 /* make bit-6 immutable */
49
50#define SECBIT_NO_CAP_AMBIENT_RAISE (issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE))
51#define SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED \
52 (issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE_LOCKED))
53
46#define SECURE_ALL_BITS (issecure_mask(SECURE_NOROOT) | \ 54#define SECURE_ALL_BITS (issecure_mask(SECURE_NOROOT) | \
47 issecure_mask(SECURE_NO_SETUID_FIXUP) | \ 55 issecure_mask(SECURE_NO_SETUID_FIXUP) | \
48 issecure_mask(SECURE_KEEP_CAPS)) 56 issecure_mask(SECURE_KEEP_CAPS) | \
57 issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE))
49#define SECURE_ALL_LOCKS (SECURE_ALL_BITS << 1) 58#define SECURE_ALL_LOCKS (SECURE_ALL_BITS << 1)
50 59
51#endif /* _UAPI_LINUX_SECUREBITS_H */ 60#endif /* _UAPI_LINUX_SECUREBITS_H */
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
new file mode 100644
index 000000000000..df0e09bb7dd5
--- /dev/null
+++ b/include/uapi/linux/userfaultfd.h
@@ -0,0 +1,169 @@
1/*
2 * include/linux/userfaultfd.h
3 *
4 * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
5 * Copyright (C) 2015 Red Hat, Inc.
6 *
7 */
8
9#ifndef _LINUX_USERFAULTFD_H
10#define _LINUX_USERFAULTFD_H
11
12#include <linux/types.h>
13
14#include <linux/compiler.h>
15
16#define UFFD_API ((__u64)0xAA)
17/*
18 * After implementing the respective features it will become:
19 * #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \
20 * UFFD_FEATURE_EVENT_FORK)
21 */
22#define UFFD_API_FEATURES (0)
23#define UFFD_API_IOCTLS \
24 ((__u64)1 << _UFFDIO_REGISTER | \
25 (__u64)1 << _UFFDIO_UNREGISTER | \
26 (__u64)1 << _UFFDIO_API)
27#define UFFD_API_RANGE_IOCTLS \
28 ((__u64)1 << _UFFDIO_WAKE | \
29 (__u64)1 << _UFFDIO_COPY | \
30 (__u64)1 << _UFFDIO_ZEROPAGE)
31
32/*
33 * Valid ioctl command number range with this API is from 0x00 to
34 * 0x3F. UFFDIO_API is the fixed number, everything else can be
35 * changed by implementing a different UFFD_API. If sticking to the
36 * same UFFD_API more ioctl can be added and userland will be aware of
37 * which ioctl the running kernel implements through the ioctl command
38 * bitmask written by the UFFDIO_API.
39 */
40#define _UFFDIO_REGISTER (0x00)
41#define _UFFDIO_UNREGISTER (0x01)
42#define _UFFDIO_WAKE (0x02)
43#define _UFFDIO_COPY (0x03)
44#define _UFFDIO_ZEROPAGE (0x04)
45#define _UFFDIO_API (0x3F)
46
47/* userfaultfd ioctl ids */
48#define UFFDIO 0xAA
49#define UFFDIO_API _IOWR(UFFDIO, _UFFDIO_API, \
50 struct uffdio_api)
51#define UFFDIO_REGISTER _IOWR(UFFDIO, _UFFDIO_REGISTER, \
52 struct uffdio_register)
53#define UFFDIO_UNREGISTER _IOR(UFFDIO, _UFFDIO_UNREGISTER, \
54 struct uffdio_range)
55#define UFFDIO_WAKE _IOR(UFFDIO, _UFFDIO_WAKE, \
56 struct uffdio_range)
57#define UFFDIO_COPY _IOWR(UFFDIO, _UFFDIO_COPY, \
58 struct uffdio_copy)
59#define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \
60 struct uffdio_zeropage)
61
62/* read() structure */
63struct uffd_msg {
64 __u8 event;
65
66 __u8 reserved1;
67 __u16 reserved2;
68 __u32 reserved3;
69
70 union {
71 struct {
72 __u64 flags;
73 __u64 address;
74 } pagefault;
75
76 struct {
77 /* unused reserved fields */
78 __u64 reserved1;
79 __u64 reserved2;
80 __u64 reserved3;
81 } reserved;
82 } arg;
83} __packed;
84
85/*
86 * Start at 0x12 and not at 0 to be more strict against bugs.
87 */
88#define UFFD_EVENT_PAGEFAULT 0x12
89#if 0 /* not available yet */
90#define UFFD_EVENT_FORK 0x13
91#endif
92
93/* flags for UFFD_EVENT_PAGEFAULT */
94#define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */
95#define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */
96
97struct uffdio_api {
98 /* userland asks for an API number and the features to enable */
99 __u64 api;
100 /*
 101 * Kernel answers below with all the available features for
102 * the API, this notifies userland of which events and/or
103 * which flags for each event are enabled in the current
104 * kernel.
105 *
106 * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE
107 * are to be considered implicitly always enabled in all kernels as
108 * long as the uffdio_api.api requested matches UFFD_API.
109 */
110#if 0 /* not available yet */
111#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0)
112#define UFFD_FEATURE_EVENT_FORK (1<<1)
113#endif
114 __u64 features;
115
116 __u64 ioctls;
117};
118
119struct uffdio_range {
120 __u64 start;
121 __u64 len;
122};
123
124struct uffdio_register {
125 struct uffdio_range range;
126#define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0)
127#define UFFDIO_REGISTER_MODE_WP ((__u64)1<<1)
128 __u64 mode;
129
130 /*
131 * kernel answers which ioctl commands are available for the
132 * range, keep at the end as the last 8 bytes aren't read.
133 */
134 __u64 ioctls;
135};
136
137struct uffdio_copy {
138 __u64 dst;
139 __u64 src;
140 __u64 len;
141 /*
 142 * There will be a wrprotection flag later that allows mapping
 143 * pages write-protected on the fly. Such a flag will be
 144 * available if the wrprotection ioctls are implemented for the
 145 * range according to uffdio_register.ioctls.
146 */
147#define UFFDIO_COPY_MODE_DONTWAKE ((__u64)1<<0)
148 __u64 mode;
149
150 /*
151 * "copy" is written by the ioctl and must be at the end: the
152 * copy_from_user will not read the last 8 bytes.
153 */
154 __s64 copy;
155};
156
157struct uffdio_zeropage {
158 struct uffdio_range range;
159#define UFFDIO_ZEROPAGE_MODE_DONTWAKE ((__u64)1<<0)
160 __u64 mode;
161
162 /*
163 * "zeropage" is written by the ioctl and must be at the end:
164 * the copy_from_user will not read the last 8 bytes.
165 */
166 __s64 zeropage;
167};
168
169#endif /* _LINUX_USERFAULTFD_H */
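
A minimal userspace sketch of the registration handshake defined by this header (monitor side only; the read()/UFFDIO_COPY fault-service loop is omitted). There is no libc wrapper for the new system call, so the raw syscall is used; if the installed headers do not yet define __NR_userfaultfd, the number from the arch syscall table has to be supplied by hand:

#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/userfaultfd.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>

int main(void)
{
        long page = sysconf(_SC_PAGESIZE);
        void *area = mmap(NULL, page, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        struct uffdio_api api = { .api = UFFD_API };    /* features left 0 */
        struct uffdio_register reg;
        int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

        if (area == MAP_FAILED || uffd < 0)
                return 1;
        if (ioctl(uffd, UFFDIO_API, &api))              /* API handshake */
                return 1;

        memset(&reg, 0, sizeof(reg));
        reg.range.start = (unsigned long)area;
        reg.range.len = page;
        reg.mode = UFFDIO_REGISTER_MODE_MISSING;
        if (ioctl(uffd, UFFDIO_REGISTER, &reg))
                return 1;

        /* First touches of 'area' now show up as UFFD_EVENT_PAGEFAULT
         * messages on read(uffd, ...) and are resolved with UFFDIO_COPY
         * or UFFDIO_ZEROPAGE. */
        return 0;
}
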
diff --git a/init/Kconfig b/init/Kconfig
index bb9b4dd55889..2c0e50ef554a 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -883,6 +883,16 @@ config ARCH_SUPPORTS_NUMA_BALANCING
883 bool 883 bool
884 884
885# 885#
886# For architectures that prefer to flush all TLBs after a number of pages
887# are unmapped instead of sending one IPI per page to flush. The architecture
888# must provide guarantees on what happens if a clean TLB cache entry is
889# written after the unmap. Details are in mm/rmap.c near the check for
890# should_defer_flush. The architecture should also consider if the full flush
891# and the refill costs are offset by the savings of sending fewer IPIs.
892config ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
893 bool
894
895#
886# For architectures that know their GCC __int128 support is sound 896# For architectures that know their GCC __int128 support is sound
887# 897#
888config ARCH_SUPPORTS_INT128 898config ARCH_SUPPORTS_INT128
@@ -1576,6 +1586,14 @@ config ADVISE_SYSCALLS
1576 applications use these syscalls, you can disable this option to save 1586 applications use these syscalls, you can disable this option to save
1577 space. 1587 space.
1578 1588
1589config USERFAULTFD
1590 bool "Enable userfaultfd() system call"
1591 select ANON_INODES
1592 depends on MMU
1593 help
1594 Enable the userfaultfd() system call, which allows page faults to be
1595 intercepted and handled in userland.
1596
1579config PCI_QUIRKS 1597config PCI_QUIRKS
1580 default y 1598 default y
1581 bool "Enable PCI quirk workarounds" if EXPERT 1599 bool "Enable PCI quirk workarounds" if EXPERT
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f3f5cd5e2c0d..a8538e443784 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1342,7 +1342,7 @@ static int cgroup_show_options(struct seq_file *seq,
1342 if (root != &cgrp_dfl_root) 1342 if (root != &cgrp_dfl_root)
1343 for_each_subsys(ss, ssid) 1343 for_each_subsys(ss, ssid)
1344 if (root->subsys_mask & (1 << ssid)) 1344 if (root->subsys_mask & (1 << ssid))
1345 seq_printf(seq, ",%s", ss->legacy_name); 1345 seq_show_option(seq, ss->name, NULL);
1346 if (root->flags & CGRP_ROOT_NOPREFIX) 1346 if (root->flags & CGRP_ROOT_NOPREFIX)
1347 seq_puts(seq, ",noprefix"); 1347 seq_puts(seq, ",noprefix");
1348 if (root->flags & CGRP_ROOT_XATTR) 1348 if (root->flags & CGRP_ROOT_XATTR)
@@ -1350,13 +1350,14 @@ static int cgroup_show_options(struct seq_file *seq,
1350 1350
1351 spin_lock(&release_agent_path_lock); 1351 spin_lock(&release_agent_path_lock);
1352 if (strlen(root->release_agent_path)) 1352 if (strlen(root->release_agent_path))
1353 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1353 seq_show_option(seq, "release_agent",
1354 root->release_agent_path);
1354 spin_unlock(&release_agent_path_lock); 1355 spin_unlock(&release_agent_path_lock);
1355 1356
1356 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) 1357 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
1357 seq_puts(seq, ",clone_children"); 1358 seq_puts(seq, ",clone_children");
1358 if (strlen(root->name)) 1359 if (strlen(root->name))
1359 seq_printf(seq, ",name=%s", root->name); 1360 seq_show_option(seq, "name", root->name);
1360 return 0; 1361 return 0;
1361} 1362}
1362 1363
diff --git a/kernel/fork.c b/kernel/fork.c
index 03aa2e6de7a4..7d5f0f118a63 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -454,8 +454,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
454 tmp->vm_mm = mm; 454 tmp->vm_mm = mm;
455 if (anon_vma_fork(tmp, mpnt)) 455 if (anon_vma_fork(tmp, mpnt))
456 goto fail_nomem_anon_vma_fork; 456 goto fail_nomem_anon_vma_fork;
457 tmp->vm_flags &= ~VM_LOCKED; 457 tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP);
458 tmp->vm_next = tmp->vm_prev = NULL; 458 tmp->vm_next = tmp->vm_prev = NULL;
459 tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
459 file = tmp->vm_file; 460 file = tmp->vm_file;
460 if (file) { 461 if (file) {
461 struct inode *inode = file_inode(file); 462 struct inode *inode = file_inode(file);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 490924cc9e7c..9ff173dca1ae 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -248,15 +248,16 @@ static void create_kthread(struct kthread_create_info *create)
248 * kthread_create_on_node - create a kthread. 248 * kthread_create_on_node - create a kthread.
249 * @threadfn: the function to run until signal_pending(current). 249 * @threadfn: the function to run until signal_pending(current).
250 * @data: data ptr for @threadfn. 250 * @data: data ptr for @threadfn.
251 * @node: memory node number. 251 * @node: task and thread structures for the thread are allocated on this node
252 * @namefmt: printf-style name for the thread. 252 * @namefmt: printf-style name for the thread.
253 * 253 *
254 * Description: This helper function creates and names a kernel 254 * Description: This helper function creates and names a kernel
255 * thread. The thread will be stopped: use wake_up_process() to start 255 * thread. The thread will be stopped: use wake_up_process() to start
256 * it. See also kthread_run(). 256 * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and
257 * is affine to all CPUs.
257 * 258 *
258 * If thread is going to be bound on a particular cpu, give its node 259 * If thread is going to be bound on a particular cpu, give its node
259 * in @node, to get NUMA affinity for kthread stack, or else give -1. 260 * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE.
260 * When woken, the thread will run @threadfn() with @data as its 261 * When woken, the thread will run @threadfn() with @data as its
261 * argument. @threadfn() can either call do_exit() directly if it is a 262 * argument. @threadfn() can either call do_exit() directly if it is a
262 * standalone thread for which no one will call kthread_stop(), or 263 * standalone thread for which no one will call kthread_stop(), or
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 052e02672d12..272d9322bc5d 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -106,9 +106,10 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
106} 106}
107EXPORT_SYMBOL_GPL(__wake_up_locked); 107EXPORT_SYMBOL_GPL(__wake_up_locked);
108 108
109void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) 109void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr,
110 void *key)
110{ 111{
111 __wake_up_common(q, mode, 1, 0, key); 112 __wake_up_common(q, mode, nr, 0, key);
112} 113}
113EXPORT_SYMBOL_GPL(__wake_up_locked_key); 114EXPORT_SYMBOL_GPL(__wake_up_locked_key);
114 115
@@ -283,7 +284,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
283 if (!list_empty(&wait->task_list)) 284 if (!list_empty(&wait->task_list))
284 list_del_init(&wait->task_list); 285 list_del_init(&wait->task_list);
285 else if (waitqueue_active(q)) 286 else if (waitqueue_active(q))
286 __wake_up_locked_key(q, mode, key); 287 __wake_up_locked_key(q, mode, 1, key);
287 spin_unlock_irqrestore(&q->lock, flags); 288 spin_unlock_irqrestore(&q->lock, flags);
288} 289}
289EXPORT_SYMBOL(abort_exclusive_wait); 290EXPORT_SYMBOL(abort_exclusive_wait);
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 7c434c39f02a..a818cbc73e14 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -113,7 +113,8 @@ static int smpboot_thread_fn(void *data)
113 if (kthread_should_stop()) { 113 if (kthread_should_stop()) {
114 __set_current_state(TASK_RUNNING); 114 __set_current_state(TASK_RUNNING);
115 preempt_enable(); 115 preempt_enable();
116 if (ht->cleanup) 116 /* cleanup must mirror setup */
117 if (ht->cleanup && td->status != HP_THREAD_NONE)
117 ht->cleanup(td->cpu, cpu_online(td->cpu)); 118 ht->cleanup(td->cpu, cpu_online(td->cpu));
118 kfree(td); 119 kfree(td);
119 return 0; 120 return 0;
@@ -259,15 +260,6 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
259{ 260{
260 unsigned int cpu; 261 unsigned int cpu;
261 262
262 /* Unpark any threads that were voluntarily parked. */
263 for_each_cpu_not(cpu, ht->cpumask) {
264 if (cpu_online(cpu)) {
265 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
266 if (tsk)
267 kthread_unpark(tsk);
268 }
269 }
270
271 /* We need to destroy also the parked threads of offline cpus */ 263 /* We need to destroy also the parked threads of offline cpus */
272 for_each_possible_cpu(cpu) { 264 for_each_possible_cpu(cpu) {
273 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); 265 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
@@ -281,19 +273,22 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
281} 273}
282 274
283/** 275/**
284 * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug 276 * smpboot_register_percpu_thread_cpumask - Register a per_cpu thread related
277 * to hotplug
285 * @plug_thread: Hotplug thread descriptor 278 * @plug_thread: Hotplug thread descriptor
279 * @cpumask: The cpumask where threads run
286 * 280 *
287 * Creates and starts the threads on all online cpus. 281 * Creates and starts the threads on all online cpus.
288 */ 282 */
289int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) 283int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
284 const struct cpumask *cpumask)
290{ 285{
291 unsigned int cpu; 286 unsigned int cpu;
292 int ret = 0; 287 int ret = 0;
293 288
294 if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL)) 289 if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL))
295 return -ENOMEM; 290 return -ENOMEM;
296 cpumask_copy(plug_thread->cpumask, cpu_possible_mask); 291 cpumask_copy(plug_thread->cpumask, cpumask);
297 292
298 get_online_cpus(); 293 get_online_cpus();
299 mutex_lock(&smpboot_threads_lock); 294 mutex_lock(&smpboot_threads_lock);
@@ -301,9 +296,11 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
301 ret = __smpboot_create_thread(plug_thread, cpu); 296 ret = __smpboot_create_thread(plug_thread, cpu);
302 if (ret) { 297 if (ret) {
303 smpboot_destroy_threads(plug_thread); 298 smpboot_destroy_threads(plug_thread);
299 free_cpumask_var(plug_thread->cpumask);
304 goto out; 300 goto out;
305 } 301 }
306 smpboot_unpark_thread(plug_thread, cpu); 302 if (cpumask_test_cpu(cpu, cpumask))
303 smpboot_unpark_thread(plug_thread, cpu);
307 } 304 }
308 list_add(&plug_thread->list, &hotplug_threads); 305 list_add(&plug_thread->list, &hotplug_threads);
309out: 306out:
@@ -311,7 +308,7 @@ out:
311 put_online_cpus(); 308 put_online_cpus();
312 return ret; 309 return ret;
313} 310}
314EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); 311EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread_cpumask);
315 312
316/** 313/**
317 * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug 314 * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index ca7d84f438f1..03c3875d9958 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -219,6 +219,7 @@ cond_syscall(compat_sys_timerfd_gettime);
219cond_syscall(sys_eventfd); 219cond_syscall(sys_eventfd);
220cond_syscall(sys_eventfd2); 220cond_syscall(sys_eventfd2);
221cond_syscall(sys_memfd_create); 221cond_syscall(sys_memfd_create);
222cond_syscall(sys_userfaultfd);
222 223
223/* performance counters: */ 224/* performance counters: */
224cond_syscall(sys_perf_event_open); 225cond_syscall(sys_perf_event_open);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index f65a0a06a8c0..88fefa68c516 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -39,6 +39,7 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
39 cred->cap_inheritable = CAP_EMPTY_SET; 39 cred->cap_inheritable = CAP_EMPTY_SET;
40 cred->cap_permitted = CAP_FULL_SET; 40 cred->cap_permitted = CAP_FULL_SET;
41 cred->cap_effective = CAP_FULL_SET; 41 cred->cap_effective = CAP_FULL_SET;
42 cred->cap_ambient = CAP_EMPTY_SET;
42 cred->cap_bset = CAP_FULL_SET; 43 cred->cap_bset = CAP_FULL_SET;
43#ifdef CONFIG_KEYS 44#ifdef CONFIG_KEYS
44 key_put(cred->request_key_auth); 45 key_put(cred->request_key_auth);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index a6ffa43f2993..64ed1c37bd1f 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -24,6 +24,7 @@
24#include <asm/irq_regs.h> 24#include <asm/irq_regs.h>
25#include <linux/kvm_para.h> 25#include <linux/kvm_para.h>
26#include <linux/perf_event.h> 26#include <linux/perf_event.h>
27#include <linux/kthread.h>
27 28
28/* 29/*
29 * The run state of the lockup detectors is controlled by the content of the 30 * The run state of the lockup detectors is controlled by the content of the
@@ -66,7 +67,26 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
66#define for_each_watchdog_cpu(cpu) \ 67#define for_each_watchdog_cpu(cpu) \
67 for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) 68 for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
68 69
70/*
71 * The 'watchdog_running' variable is set to 1 when the watchdog threads
72 * are registered/started and is set to 0 when the watchdog threads are
 73 * unregistered/stopped, so it indicates whether the threads exist.
74 */
69static int __read_mostly watchdog_running; 75static int __read_mostly watchdog_running;
76/*
77 * If a subsystem has a need to deactivate the watchdog temporarily, it
78 * can use the suspend/resume interface to achieve this. The content of
79 * the 'watchdog_suspended' variable reflects this state. Existing threads
80 * are parked/unparked by the lockup_detector_{suspend|resume} functions
81 * (see comment blocks pertaining to those functions for further details).
82 *
83 * 'watchdog_suspended' also prevents threads from being registered/started
84 * or unregistered/stopped via parameters in /proc/sys/kernel, so the state
85 * of 'watchdog_running' cannot change while the watchdog is deactivated
86 * temporarily (see related code in 'proc' handlers).
87 */
88static int __read_mostly watchdog_suspended;
89
70static u64 __read_mostly sample_period; 90static u64 __read_mostly sample_period;
71 91
72static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); 92static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -613,46 +633,9 @@ static void watchdog_nmi_disable(unsigned int cpu)
613 } 633 }
614} 634}
615 635
616void watchdog_nmi_enable_all(void)
617{
618 int cpu;
619
620 mutex_lock(&watchdog_proc_mutex);
621
622 if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
623 goto unlock;
624
625 get_online_cpus();
626 for_each_watchdog_cpu(cpu)
627 watchdog_nmi_enable(cpu);
628 put_online_cpus();
629
630unlock:
631 mutex_unlock(&watchdog_proc_mutex);
632}
633
634void watchdog_nmi_disable_all(void)
635{
636 int cpu;
637
638 mutex_lock(&watchdog_proc_mutex);
639
640 if (!watchdog_running)
641 goto unlock;
642
643 get_online_cpus();
644 for_each_watchdog_cpu(cpu)
645 watchdog_nmi_disable(cpu);
646 put_online_cpus();
647
648unlock:
649 mutex_unlock(&watchdog_proc_mutex);
650}
651#else 636#else
652static int watchdog_nmi_enable(unsigned int cpu) { return 0; } 637static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
653static void watchdog_nmi_disable(unsigned int cpu) { return; } 638static void watchdog_nmi_disable(unsigned int cpu) { return; }
654void watchdog_nmi_enable_all(void) {}
655void watchdog_nmi_disable_all(void) {}
656#endif /* CONFIG_HARDLOCKUP_DETECTOR */ 639#endif /* CONFIG_HARDLOCKUP_DETECTOR */
657 640
658static struct smp_hotplug_thread watchdog_threads = { 641static struct smp_hotplug_thread watchdog_threads = {
@@ -666,46 +649,89 @@ static struct smp_hotplug_thread watchdog_threads = {
666 .unpark = watchdog_enable, 649 .unpark = watchdog_enable,
667}; 650};
668 651
669static void restart_watchdog_hrtimer(void *info) 652/*
653 * park all watchdog threads that are specified in 'watchdog_cpumask'
654 */
655static int watchdog_park_threads(void)
670{ 656{
671 struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); 657 int cpu, ret = 0;
672 int ret;
673 658
659 get_online_cpus();
660 for_each_watchdog_cpu(cpu) {
661 ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
662 if (ret)
663 break;
664 }
665 if (ret) {
666 for_each_watchdog_cpu(cpu)
667 kthread_unpark(per_cpu(softlockup_watchdog, cpu));
668 }
669 put_online_cpus();
670
671 return ret;
672}
673
674/*
675 * unpark all watchdog threads that are specified in 'watchdog_cpumask'
676 */
677static void watchdog_unpark_threads(void)
678{
679 int cpu;
680
681 get_online_cpus();
682 for_each_watchdog_cpu(cpu)
683 kthread_unpark(per_cpu(softlockup_watchdog, cpu));
684 put_online_cpus();
685}
686
687/*
688 * Suspend the hard and soft lockup detector by parking the watchdog threads.
689 */
690int lockup_detector_suspend(void)
691{
692 int ret = 0;
693
694 mutex_lock(&watchdog_proc_mutex);
674 /* 695 /*
675 * No need to cancel and restart hrtimer if it is currently executing 696 * Multiple suspend requests can be active in parallel (counted by
676 * because it will reprogram itself with the new period now. 697 * the 'watchdog_suspended' variable). If the watchdog threads are
677 * We should never see it unqueued here because we are running per-cpu 698 * running, the first caller takes care that they will be parked.
678 * with interrupts disabled. 699 * The state of 'watchdog_running' cannot change while a suspend
700 * request is active (see related code in 'proc' handlers).
679 */ 701 */
680 ret = hrtimer_try_to_cancel(hrtimer); 702 if (watchdog_running && !watchdog_suspended)
681 if (ret == 1) 703 ret = watchdog_park_threads();
682 hrtimer_start(hrtimer, ns_to_ktime(sample_period), 704
683 HRTIMER_MODE_REL_PINNED); 705 if (ret == 0)
706 watchdog_suspended++;
707
708 mutex_unlock(&watchdog_proc_mutex);
709
710 return ret;
684} 711}
685 712
686static void update_watchdog(int cpu) 713/*
714 * Resume the hard and soft lockup detector by unparking the watchdog threads.
715 */
716void lockup_detector_resume(void)
687{ 717{
718 mutex_lock(&watchdog_proc_mutex);
719
720 watchdog_suspended--;
688 /* 721 /*
689 * Make sure that perf event counter will adopt to a new 722 * The watchdog threads are unparked if they were previously running
690 * sampling period. Updating the sampling period directly would 723 * and if there is no more active suspend request.
691 * be much nicer but we do not have an API for that now so
692 * let's use a big hammer.
693 * Hrtimer will adopt the new period on the next tick but this
694 * might be late already so we have to restart the timer as well.
695 */ 724 */
696 watchdog_nmi_disable(cpu); 725 if (watchdog_running && !watchdog_suspended)
697 smp_call_function_single(cpu, restart_watchdog_hrtimer, NULL, 1); 726 watchdog_unpark_threads();
698 watchdog_nmi_enable(cpu); 727
728 mutex_unlock(&watchdog_proc_mutex);
699} 729}
700 730
701static void update_watchdog_all_cpus(void) 731static void update_watchdog_all_cpus(void)
702{ 732{
703 int cpu; 733 watchdog_park_threads();
704 734 watchdog_unpark_threads();
705 get_online_cpus();
706 for_each_watchdog_cpu(cpu)
707 update_watchdog(cpu);
708 put_online_cpus();
709} 735}
710 736
711static int watchdog_enable_all_cpus(void) 737static int watchdog_enable_all_cpus(void)
@@ -713,15 +739,12 @@ static int watchdog_enable_all_cpus(void)
713 int err = 0; 739 int err = 0;
714 740
715 if (!watchdog_running) { 741 if (!watchdog_running) {
716 err = smpboot_register_percpu_thread(&watchdog_threads); 742 err = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
743 &watchdog_cpumask);
717 if (err) 744 if (err)
718 pr_err("Failed to create watchdog threads, disabled\n"); 745 pr_err("Failed to create watchdog threads, disabled\n");
719 else { 746 else
720 if (smpboot_update_cpumask_percpu_thread(
721 &watchdog_threads, &watchdog_cpumask))
722 pr_err("Failed to set cpumask for watchdog threads\n");
723 watchdog_running = 1; 747 watchdog_running = 1;
724 }
725 } else { 748 } else {
726 /* 749 /*
727 * Enable/disable the lockup detectors or 750 * Enable/disable the lockup detectors or
@@ -787,6 +810,12 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
787 810
788 mutex_lock(&watchdog_proc_mutex); 811 mutex_lock(&watchdog_proc_mutex);
789 812
813 if (watchdog_suspended) {
814 /* no parameter changes allowed while watchdog is suspended */
815 err = -EAGAIN;
816 goto out;
817 }
818
790 /* 819 /*
791 * If the parameter is being read return the state of the corresponding 820 * If the parameter is being read return the state of the corresponding
792 * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the 821 * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the
@@ -872,6 +901,12 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
872 901
873 mutex_lock(&watchdog_proc_mutex); 902 mutex_lock(&watchdog_proc_mutex);
874 903
904 if (watchdog_suspended) {
905 /* no parameter changes allowed while watchdog is suspended */
906 err = -EAGAIN;
907 goto out;
908 }
909
875 old = ACCESS_ONCE(watchdog_thresh); 910 old = ACCESS_ONCE(watchdog_thresh);
876 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 911 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
877 912
@@ -903,6 +938,13 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
903 int err; 938 int err;
904 939
905 mutex_lock(&watchdog_proc_mutex); 940 mutex_lock(&watchdog_proc_mutex);
941
942 if (watchdog_suspended) {
943 /* no parameter changes allowed while watchdog is suspended */
944 err = -EAGAIN;
945 goto out;
946 }
947
906 err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); 948 err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
907 if (!err && write) { 949 if (!err && write) {
908 /* Remove impossible cpus to keep sysctl output cleaner. */ 950 /* Remove impossible cpus to keep sysctl output cleaner. */
@@ -920,6 +962,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
920 pr_err("cpumask update failed\n"); 962 pr_err("cpumask update failed\n");
921 } 963 }
922 } 964 }
965out:
923 mutex_unlock(&watchdog_proc_mutex); 966 mutex_unlock(&watchdog_proc_mutex);
924 return err; 967 return err;
925} 968}
@@ -932,10 +975,8 @@ void __init lockup_detector_init(void)
932 975
933#ifdef CONFIG_NO_HZ_FULL 976#ifdef CONFIG_NO_HZ_FULL
934 if (tick_nohz_full_enabled()) { 977 if (tick_nohz_full_enabled()) {
935 if (!cpumask_empty(tick_nohz_full_mask)) 978 pr_info("Disabling watchdog on nohz_full cores by default\n");
936 pr_info("Disabling watchdog on nohz_full cores by default\n"); 979 cpumask_copy(&watchdog_cpumask, housekeeping_mask);
937 cpumask_andnot(&watchdog_cpumask, cpu_possible_mask,
938 tick_nohz_full_mask);
939 } else 980 } else
940 cpumask_copy(&watchdog_cpumask, cpu_possible_mask); 981 cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
941#else 982#else
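
The suspend/resume pair above replaces the removed watchdog_nmi_{enable,disable}_all() interface. A brief sketch of the calling pattern from another subsystem; the caller is hypothetical, and the declarations are assumed to be exported in a header (e.g. linux/nmi.h) elsewhere in this series:

/* Hypothetical caller; only the two lockup_detector_* calls come from
 * this patch. */
static int example_reprogram_counters(void)
{
        int err;

        err = lockup_detector_suspend();        /* parks watchdog threads */
        if (err)
                return err;

        /* ... safely reconfigure hardware the NMI watchdog also uses ... */

        lockup_detector_resume();               /* unparks them again */
        return 0;
}
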
diff --git a/lib/genalloc.c b/lib/genalloc.c
index daf0afb6d979..116a166b096f 100644
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -160,6 +160,7 @@ struct gen_pool *gen_pool_create(int min_alloc_order, int nid)
160 pool->min_alloc_order = min_alloc_order; 160 pool->min_alloc_order = min_alloc_order;
161 pool->algo = gen_pool_first_fit; 161 pool->algo = gen_pool_first_fit;
162 pool->data = NULL; 162 pool->data = NULL;
163 pool->name = NULL;
163 } 164 }
164 return pool; 165 return pool;
165} 166}
@@ -252,8 +253,8 @@ void gen_pool_destroy(struct gen_pool *pool)
252 253
253 kfree(chunk); 254 kfree(chunk);
254 } 255 }
256 kfree_const(pool->name);
255 kfree(pool); 257 kfree(pool);
256 return;
257} 258}
258EXPORT_SYMBOL(gen_pool_destroy); 259EXPORT_SYMBOL(gen_pool_destroy);
259 260
@@ -570,53 +571,88 @@ static void devm_gen_pool_release(struct device *dev, void *res)
570 gen_pool_destroy(*(struct gen_pool **)res); 571 gen_pool_destroy(*(struct gen_pool **)res);
571} 572}
572 573
574static int devm_gen_pool_match(struct device *dev, void *res, void *data)
575{
576 struct gen_pool **p = res;
577
578 /* NULL data matches only a pool without an assigned name */
579 if (!data && !(*p)->name)
580 return 1;
581
582 if (!data || !(*p)->name)
583 return 0;
584
585 return !strcmp((*p)->name, data);
586}
587
588/**
589 * gen_pool_get - Obtain the gen_pool (if any) for a device
590 * @dev: device to retrieve the gen_pool from
591 * @name: name of a gen_pool or NULL, identifies a particular gen_pool on device
592 *
593 * Returns the gen_pool for the device if one is present, or NULL.
594 */
595struct gen_pool *gen_pool_get(struct device *dev, const char *name)
596{
597 struct gen_pool **p;
598
599 p = devres_find(dev, devm_gen_pool_release, devm_gen_pool_match,
600 (void *)name);
601 if (!p)
602 return NULL;
603 return *p;
604}
605EXPORT_SYMBOL_GPL(gen_pool_get);
606
573/** 607/**
574 * devm_gen_pool_create - managed gen_pool_create 608 * devm_gen_pool_create - managed gen_pool_create
575 * @dev: device that provides the gen_pool 609 * @dev: device that provides the gen_pool
576 * @min_alloc_order: log base 2 of number of bytes each bitmap bit represents 610 * @min_alloc_order: log base 2 of number of bytes each bitmap bit represents
577 * @nid: node id of the node the pool structure should be allocated on, or -1 611 * @nid: node selector for allocated gen_pool, %NUMA_NO_NODE for all nodes
612 * @name: name of a gen_pool or NULL, identifies a particular gen_pool on device
578 * 613 *
579 * Create a new special memory pool that can be used to manage special purpose 614 * Create a new special memory pool that can be used to manage special purpose
580 * memory not managed by the regular kmalloc/kfree interface. The pool will be 615 * memory not managed by the regular kmalloc/kfree interface. The pool will be
581 * automatically destroyed by the device management code. 616 * automatically destroyed by the device management code.
582 */ 617 */
583struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order, 618struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order,
584 int nid) 619 int nid, const char *name)
585{ 620{
586 struct gen_pool **ptr, *pool; 621 struct gen_pool **ptr, *pool;
622 const char *pool_name = NULL;
623
624 /* Check that genpool to be created is uniquely addressed on device */
625 if (gen_pool_get(dev, name))
626 return ERR_PTR(-EINVAL);
627
628 if (name) {
629 pool_name = kstrdup_const(name, GFP_KERNEL);
630 if (!pool_name)
631 return ERR_PTR(-ENOMEM);
632 }
587 633
588 ptr = devres_alloc(devm_gen_pool_release, sizeof(*ptr), GFP_KERNEL); 634 ptr = devres_alloc(devm_gen_pool_release, sizeof(*ptr), GFP_KERNEL);
589 if (!ptr) 635 if (!ptr)
590 return NULL; 636 goto free_pool_name;
591 637
592 pool = gen_pool_create(min_alloc_order, nid); 638 pool = gen_pool_create(min_alloc_order, nid);
593 if (pool) { 639 if (!pool)
594 *ptr = pool; 640 goto free_devres;
595 devres_add(dev, ptr); 641
596 } else { 642 *ptr = pool;
597 devres_free(ptr); 643 pool->name = pool_name;
598 } 644 devres_add(dev, ptr);
599 645
600 return pool; 646 return pool;
601}
602EXPORT_SYMBOL(devm_gen_pool_create);
603 647
604/** 648free_devres:
605 * gen_pool_get - Obtain the gen_pool (if any) for a device 649 devres_free(ptr);
606 * @dev: device to retrieve the gen_pool from 650free_pool_name:
607 * 651 kfree_const(pool_name);
608 * Returns the gen_pool for the device if one is present, or NULL.
609 */
610struct gen_pool *gen_pool_get(struct device *dev)
611{
612 struct gen_pool **p = devres_find(dev, devm_gen_pool_release, NULL,
613 NULL);
614 652
615 if (!p) 653 return ERR_PTR(-ENOMEM);
616 return NULL;
617 return *p;
618} 654}
619EXPORT_SYMBOL_GPL(gen_pool_get); 655EXPORT_SYMBOL(devm_gen_pool_create);
620 656
621#ifdef CONFIG_OF 657#ifdef CONFIG_OF
622/** 658/**
@@ -633,16 +669,30 @@ struct gen_pool *of_gen_pool_get(struct device_node *np,
633 const char *propname, int index) 669 const char *propname, int index)
634{ 670{
635 struct platform_device *pdev; 671 struct platform_device *pdev;
636 struct device_node *np_pool; 672 struct device_node *np_pool, *parent;
673 const char *name = NULL;
674 struct gen_pool *pool = NULL;
637 675
638 np_pool = of_parse_phandle(np, propname, index); 676 np_pool = of_parse_phandle(np, propname, index);
639 if (!np_pool) 677 if (!np_pool)
640 return NULL; 678 return NULL;
679
641 pdev = of_find_device_by_node(np_pool); 680 pdev = of_find_device_by_node(np_pool);
681 if (!pdev) {
682 /* Check if named gen_pool is created by parent node device */
683 parent = of_get_parent(np_pool);
684 pdev = of_find_device_by_node(parent);
685 of_node_put(parent);
686
687 of_property_read_string(np_pool, "label", &name);
688 if (!name)
689 name = np_pool->name;
690 }
691 if (pdev)
692 pool = gen_pool_get(&pdev->dev, name);
642 of_node_put(np_pool); 693 of_node_put(np_pool);
643 if (!pdev) 694
644 return NULL; 695 return pool;
645 return gen_pool_get(&pdev->dev);
646} 696}
647EXPORT_SYMBOL_GPL(of_gen_pool_get); 697EXPORT_SYMBOL_GPL(of_gen_pool_get);
648#endif /* CONFIG_OF */ 698#endif /* CONFIG_OF */
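
A brief sketch of the resulting driver-side usage: a named pool created on a device and later looked up by that name. The device, allocation order and pool name are hypothetical; note that devm_gen_pool_create() now returns ERR_PTR() values rather than NULL on failure:

#include <linux/genalloc.h>
#include <linux/device.h>
#include <linux/log2.h>
#include <linux/numa.h>

static int example_probe(struct device *dev)
{
        struct gen_pool *sram, *found;

        /* 64-byte allocation granularity, no NUMA preference. */
        sram = devm_gen_pool_create(dev, ilog2(64), NUMA_NO_NODE, "sram");
        if (IS_ERR(sram))
                return PTR_ERR(sram);

        /* Elsewhere in the driver the pool can be retrieved by its name. */
        found = gen_pool_get(dev, "sram");
        return found ? 0 : -ENODEV;
}
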
diff --git a/mm/Makefile b/mm/Makefile
index 98c4eaeabdcb..b424d5e5b6ff 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -78,3 +78,4 @@ obj-$(CONFIG_CMA) += cma.o
78obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o 78obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
79obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o 79obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
80obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o 80obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
81obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
diff --git a/mm/dmapool.c b/mm/dmapool.c
index fd5fe4342e93..59d10d16f0a5 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -242,7 +242,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
242 return page; 242 return page;
243} 243}
244 244
245static inline int is_page_busy(struct dma_page *page) 245static inline bool is_page_busy(struct dma_page *page)
246{ 246{
247 return page->in_use != 0; 247 return page->in_use != 0;
248} 248}
diff --git a/mm/gup.c b/mm/gup.c
index 6297f6bccfb1..a798293fc648 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -12,7 +12,9 @@
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/rwsem.h> 13#include <linux/rwsem.h>
14#include <linux/hugetlb.h> 14#include <linux/hugetlb.h>
15
15#include <asm/pgtable.h> 16#include <asm/pgtable.h>
17#include <asm/tlbflush.h>
16 18
17#include "internal.h" 19#include "internal.h"
18 20
@@ -32,6 +34,30 @@ static struct page *no_page_table(struct vm_area_struct *vma,
32 return NULL; 34 return NULL;
33} 35}
34 36
37static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
38 pte_t *pte, unsigned int flags)
39{
40 /* No page to get reference */
41 if (flags & FOLL_GET)
42 return -EFAULT;
43
44 if (flags & FOLL_TOUCH) {
45 pte_t entry = *pte;
46
47 if (flags & FOLL_WRITE)
48 entry = pte_mkdirty(entry);
49 entry = pte_mkyoung(entry);
50
51 if (!pte_same(*pte, entry)) {
52 set_pte_at(vma->vm_mm, address, pte, entry);
53 update_mmu_cache(vma, address, pte);
54 }
55 }
56
57 /* Proper page table entry exists, but no corresponding struct page */
58 return -EEXIST;
59}
60
35static struct page *follow_page_pte(struct vm_area_struct *vma, 61static struct page *follow_page_pte(struct vm_area_struct *vma,
36 unsigned long address, pmd_t *pmd, unsigned int flags) 62 unsigned long address, pmd_t *pmd, unsigned int flags)
37{ 63{
@@ -73,10 +99,21 @@ retry:
73 99
74 page = vm_normal_page(vma, address, pte); 100 page = vm_normal_page(vma, address, pte);
75 if (unlikely(!page)) { 101 if (unlikely(!page)) {
76 if ((flags & FOLL_DUMP) || 102 if (flags & FOLL_DUMP) {
77 !is_zero_pfn(pte_pfn(pte))) 103 /* Avoid special (like zero) pages in core dumps */
78 goto bad_page; 104 page = ERR_PTR(-EFAULT);
79 page = pte_page(pte); 105 goto out;
106 }
107
108 if (is_zero_pfn(pte_pfn(pte))) {
109 page = pte_page(pte);
110 } else {
111 int ret;
112
113 ret = follow_pfn_pte(vma, address, ptep, flags);
114 page = ERR_PTR(ret);
115 goto out;
116 }
80 } 117 }
81 118
82 if (flags & FOLL_GET) 119 if (flags & FOLL_GET)
@@ -114,12 +151,9 @@ retry:
114 unlock_page(page); 151 unlock_page(page);
115 } 152 }
116 } 153 }
154out:
117 pte_unmap_unlock(ptep, ptl); 155 pte_unmap_unlock(ptep, ptl);
118 return page; 156 return page;
119bad_page:
120 pte_unmap_unlock(ptep, ptl);
121 return ERR_PTR(-EFAULT);
122
123no_page: 157no_page:
124 pte_unmap_unlock(ptep, ptl); 158 pte_unmap_unlock(ptep, ptl);
125 if (!pte_none(pte)) 159 if (!pte_none(pte))
@@ -489,9 +523,15 @@ retry:
489 goto next_page; 523 goto next_page;
490 } 524 }
491 BUG(); 525 BUG();
492 } 526 } else if (PTR_ERR(page) == -EEXIST) {
493 if (IS_ERR(page)) 527 /*
528 * Proper page table entry exists, but no corresponding
529 * struct page.
530 */
531 goto next_page;
532 } else if (IS_ERR(page)) {
494 return i ? i : PTR_ERR(page); 533 return i ? i : PTR_ERR(page);
534 }
495 if (pages) { 535 if (pages) {
496 pages[i] = page; 536 pages[i] = page;
497 flush_anon_page(vma, page, start); 537 flush_anon_page(vma, page, start);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 097c7a4bfbd9..279a818a39b1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -23,6 +23,7 @@
23#include <linux/pagemap.h> 23#include <linux/pagemap.h>
24#include <linux/migrate.h> 24#include <linux/migrate.h>
25#include <linux/hashtable.h> 25#include <linux/hashtable.h>
26#include <linux/userfaultfd_k.h>
26 27
27#include <asm/tlb.h> 28#include <asm/tlb.h>
28#include <asm/pgalloc.h> 29#include <asm/pgalloc.h>
@@ -716,21 +717,27 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
716 717
717static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, 718static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
718 struct vm_area_struct *vma, 719 struct vm_area_struct *vma,
719 unsigned long haddr, pmd_t *pmd, 720 unsigned long address, pmd_t *pmd,
720 struct page *page, gfp_t gfp) 721 struct page *page, gfp_t gfp,
722 unsigned int flags)
721{ 723{
722 struct mem_cgroup *memcg; 724 struct mem_cgroup *memcg;
723 pgtable_t pgtable; 725 pgtable_t pgtable;
724 spinlock_t *ptl; 726 spinlock_t *ptl;
727 unsigned long haddr = address & HPAGE_PMD_MASK;
725 728
726 VM_BUG_ON_PAGE(!PageCompound(page), page); 729 VM_BUG_ON_PAGE(!PageCompound(page), page);
727 730
728 if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) 731 if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) {
729 return VM_FAULT_OOM; 732 put_page(page);
733 count_vm_event(THP_FAULT_FALLBACK);
734 return VM_FAULT_FALLBACK;
735 }
730 736
731 pgtable = pte_alloc_one(mm, haddr); 737 pgtable = pte_alloc_one(mm, haddr);
732 if (unlikely(!pgtable)) { 738 if (unlikely(!pgtable)) {
733 mem_cgroup_cancel_charge(page, memcg); 739 mem_cgroup_cancel_charge(page, memcg);
740 put_page(page);
734 return VM_FAULT_OOM; 741 return VM_FAULT_OOM;
735 } 742 }
736 743
@@ -750,6 +757,21 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
750 pte_free(mm, pgtable); 757 pte_free(mm, pgtable);
751 } else { 758 } else {
752 pmd_t entry; 759 pmd_t entry;
760
761 /* Deliver the page fault to userland */
762 if (userfaultfd_missing(vma)) {
763 int ret;
764
765 spin_unlock(ptl);
766 mem_cgroup_cancel_charge(page, memcg);
767 put_page(page);
768 pte_free(mm, pgtable);
769 ret = handle_userfault(vma, address, flags,
770 VM_UFFD_MISSING);
771 VM_BUG_ON(ret & VM_FAULT_FALLBACK);
772 return ret;
773 }
774
753 entry = mk_huge_pmd(page, vma->vm_page_prot); 775 entry = mk_huge_pmd(page, vma->vm_page_prot);
754 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 776 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
755 page_add_new_anon_rmap(page, vma, haddr); 777 page_add_new_anon_rmap(page, vma, haddr);
@@ -760,6 +782,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
760 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); 782 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
761 atomic_long_inc(&mm->nr_ptes); 783 atomic_long_inc(&mm->nr_ptes);
762 spin_unlock(ptl); 784 spin_unlock(ptl);
785 count_vm_event(THP_FAULT_ALLOC);
763 } 786 }
764 787
765 return 0; 788 return 0;
@@ -771,19 +794,16 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
771} 794}
772 795
773/* Caller must hold page table lock. */ 796/* Caller must hold page table lock. */
774static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, 797static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
775 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, 798 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
776 struct page *zero_page) 799 struct page *zero_page)
777{ 800{
778 pmd_t entry; 801 pmd_t entry;
779 if (!pmd_none(*pmd))
780 return false;
781 entry = mk_pmd(zero_page, vma->vm_page_prot); 802 entry = mk_pmd(zero_page, vma->vm_page_prot);
782 entry = pmd_mkhuge(entry); 803 entry = pmd_mkhuge(entry);
783 pgtable_trans_huge_deposit(mm, pmd, pgtable); 804 pgtable_trans_huge_deposit(mm, pmd, pgtable);
784 set_pmd_at(mm, haddr, pmd, entry); 805 set_pmd_at(mm, haddr, pmd, entry);
785 atomic_long_inc(&mm->nr_ptes); 806 atomic_long_inc(&mm->nr_ptes);
786 return true;
787} 807}
788 808
789int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, 809int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -806,6 +826,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
806 pgtable_t pgtable; 826 pgtable_t pgtable;
807 struct page *zero_page; 827 struct page *zero_page;
808 bool set; 828 bool set;
829 int ret;
809 pgtable = pte_alloc_one(mm, haddr); 830 pgtable = pte_alloc_one(mm, haddr);
810 if (unlikely(!pgtable)) 831 if (unlikely(!pgtable))
811 return VM_FAULT_OOM; 832 return VM_FAULT_OOM;
@@ -816,14 +837,28 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
816 return VM_FAULT_FALLBACK; 837 return VM_FAULT_FALLBACK;
817 } 838 }
818 ptl = pmd_lock(mm, pmd); 839 ptl = pmd_lock(mm, pmd);
819 set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, 840 ret = 0;
820 zero_page); 841 set = false;
821 spin_unlock(ptl); 842 if (pmd_none(*pmd)) {
843 if (userfaultfd_missing(vma)) {
844 spin_unlock(ptl);
845 ret = handle_userfault(vma, address, flags,
846 VM_UFFD_MISSING);
847 VM_BUG_ON(ret & VM_FAULT_FALLBACK);
848 } else {
849 set_huge_zero_page(pgtable, mm, vma,
850 haddr, pmd,
851 zero_page);
852 spin_unlock(ptl);
853 set = true;
854 }
855 } else
856 spin_unlock(ptl);
822 if (!set) { 857 if (!set) {
823 pte_free(mm, pgtable); 858 pte_free(mm, pgtable);
824 put_huge_zero_page(); 859 put_huge_zero_page();
825 } 860 }
826 return 0; 861 return ret;
827 } 862 }
828 gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); 863 gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
829 page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); 864 page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
@@ -831,14 +866,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
831 count_vm_event(THP_FAULT_FALLBACK); 866 count_vm_event(THP_FAULT_FALLBACK);
832 return VM_FAULT_FALLBACK; 867 return VM_FAULT_FALLBACK;
833 } 868 }
834 if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) { 869 return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp,
835 put_page(page); 870 flags);
836 count_vm_event(THP_FAULT_FALLBACK);
837 return VM_FAULT_FALLBACK;
838 }
839
840 count_vm_event(THP_FAULT_ALLOC);
841 return 0;
842} 871}
843 872
844int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, 873int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -873,16 +902,14 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
873 */ 902 */
874 if (is_huge_zero_pmd(pmd)) { 903 if (is_huge_zero_pmd(pmd)) {
875 struct page *zero_page; 904 struct page *zero_page;
876 bool set;
877 /* 905 /*
878 * get_huge_zero_page() will never allocate a new page here, 906 * get_huge_zero_page() will never allocate a new page here,
879 * since we already have a zero page to copy. It just takes a 907 * since we already have a zero page to copy. It just takes a
880 * reference. 908 * reference.
881 */ 909 */
882 zero_page = get_huge_zero_page(); 910 zero_page = get_huge_zero_page();
883 set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, 911 set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
884 zero_page); 912 zero_page);
885 BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */
886 ret = 0; 913 ret = 0;
887 goto out_unlock; 914 goto out_unlock;
888 } 915 }
@@ -2133,7 +2160,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
2133 _pte++, address += PAGE_SIZE) { 2160 _pte++, address += PAGE_SIZE) {
2134 pte_t pteval = *_pte; 2161 pte_t pteval = *_pte;
2135 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { 2162 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
2136 if (++none_or_zero <= khugepaged_max_ptes_none) 2163 if (!userfaultfd_armed(vma) &&
2164 ++none_or_zero <= khugepaged_max_ptes_none)
2137 continue; 2165 continue;
2138 else 2166 else
2139 goto out; 2167 goto out;
@@ -2586,7 +2614,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2586 _pte++, _address += PAGE_SIZE) { 2614 _pte++, _address += PAGE_SIZE) {
2587 pte_t pteval = *_pte; 2615 pte_t pteval = *_pte;
2588 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { 2616 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
2589 if (++none_or_zero <= khugepaged_max_ptes_none) 2617 if (!userfaultfd_armed(vma) &&
2618 ++none_or_zero <= khugepaged_max_ptes_none)
2590 continue; 2619 continue;
2591 else 2620 else
2592 goto out_unmap; 2621 goto out_unmap;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a8c3087089d8..51ae41d0fbc0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -616,7 +616,7 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
616} 616}
617 617
618/* Returns true if the VMA has associated reserve pages */ 618/* Returns true if the VMA has associated reserve pages */
619static int vma_has_reserves(struct vm_area_struct *vma, long chg) 619static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
620{ 620{
621 if (vma->vm_flags & VM_NORESERVE) { 621 if (vma->vm_flags & VM_NORESERVE) {
622 /* 622 /*
@@ -629,23 +629,23 @@ static int vma_has_reserves(struct vm_area_struct *vma, long chg)
629 * properly, so add work-around here. 629 * properly, so add work-around here.
630 */ 630 */
631 if (vma->vm_flags & VM_MAYSHARE && chg == 0) 631 if (vma->vm_flags & VM_MAYSHARE && chg == 0)
632 return 1; 632 return true;
633 else 633 else
634 return 0; 634 return false;
635 } 635 }
636 636
637 /* Shared mappings always use reserves */ 637 /* Shared mappings always use reserves */
638 if (vma->vm_flags & VM_MAYSHARE) 638 if (vma->vm_flags & VM_MAYSHARE)
639 return 1; 639 return true;
640 640
641 /* 641 /*
642 * Only the process that called mmap() has reserves for 642 * Only the process that called mmap() has reserves for
643 * private mappings. 643 * private mappings.
644 */ 644 */
645 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 645 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
646 return 1; 646 return true;
647 647
648 return 0; 648 return false;
649} 649}
650 650
651static void enqueue_huge_page(struct hstate *h, struct page *page) 651static void enqueue_huge_page(struct hstate *h, struct page *page)
@@ -3779,7 +3779,7 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
3779 return saddr; 3779 return saddr;
3780} 3780}
3781 3781
3782static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) 3782static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
3783{ 3783{
3784 unsigned long base = addr & PUD_MASK; 3784 unsigned long base = addr & PUD_MASK;
3785 unsigned long end = base + PUD_SIZE; 3785 unsigned long end = base + PUD_SIZE;
@@ -3789,8 +3789,8 @@ static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
3789 */ 3789 */
3790 if (vma->vm_flags & VM_MAYSHARE && 3790 if (vma->vm_flags & VM_MAYSHARE &&
3791 vma->vm_start <= base && end <= vma->vm_end) 3791 vma->vm_start <= base && end <= vma->vm_end)
3792 return 1; 3792 return true;
3793 return 0; 3793 return false;
3794} 3794}
3795 3795
3796/* 3796/*
diff --git a/mm/internal.h b/mm/internal.h
index 36b23f1e2ca6..1195dd2d6a2b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -426,4 +426,19 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
426#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ 426#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
427#define ALLOC_FAIR 0x100 /* fair zone allocation */ 427#define ALLOC_FAIR 0x100 /* fair zone allocation */
428 428
429enum ttu_flags;
430struct tlbflush_unmap_batch;
431
432#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
433void try_to_unmap_flush(void);
434void try_to_unmap_flush_dirty(void);
435#else
436static inline void try_to_unmap_flush(void)
437{
438}
439static inline void try_to_unmap_flush_dirty(void)
440{
441}
442
443#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
429#endif /* __MM_INTERNAL_H */ 444#endif /* __MM_INTERNAL_H */
diff --git a/mm/madvise.c b/mm/madvise.c
index 64bb8a22110c..ce3a4222c7e7 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -103,7 +103,8 @@ static long madvise_behavior(struct vm_area_struct *vma,
103 103
104 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 104 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
105 *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, 105 *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
106 vma->vm_file, pgoff, vma_policy(vma)); 106 vma->vm_file, pgoff, vma_policy(vma),
107 vma->vm_userfaultfd_ctx);
107 if (*prev) { 108 if (*prev) {
108 vma = *prev; 109 vma = *prev;
109 goto success; 110 goto success;
@@ -385,7 +386,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
385 } 386 }
386} 387}
387 388
388static int 389static bool
389madvise_behavior_valid(int behavior) 390madvise_behavior_valid(int behavior)
390{ 391{
391 switch (behavior) { 392 switch (behavior) {
@@ -407,10 +408,10 @@ madvise_behavior_valid(int behavior)
407#endif 408#endif
408 case MADV_DONTDUMP: 409 case MADV_DONTDUMP:
409 case MADV_DODUMP: 410 case MADV_DODUMP:
410 return 1; 411 return true;
411 412
412 default: 413 default:
413 return 0; 414 return false;
414 } 415 }
415} 416}
416 417
diff --git a/mm/memblock.c b/mm/memblock.c
index 87108e77e476..95ce68c6da8a 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -566,6 +566,9 @@ repeat:
566 * area, insert that portion. 566 * area, insert that portion.
567 */ 567 */
568 if (rbase > base) { 568 if (rbase > base) {
569#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
570 WARN_ON(nid != memblock_get_region_node(rgn));
571#endif
569 nr_new++; 572 nr_new++;
570 if (insert) 573 if (insert)
571 memblock_insert_region(type, i++, base, 574 memblock_insert_region(type, i++, base,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index acb93c554f6e..1af057575ce9 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5965,7 +5965,13 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
5965 if (!mem_cgroup_is_root(memcg)) 5965 if (!mem_cgroup_is_root(memcg))
5966 page_counter_uncharge(&memcg->memory, 1); 5966 page_counter_uncharge(&memcg->memory, 1);
5967 5967
5968 /* Caller disabled preemption with mapping->tree_lock */ 5968 /*
5969 * Interrupts should be disabled here because the caller holds the
5970 * mapping->tree_lock lock which is taken with interrupts-off. It is
5971 * important here to have the interrupts disabled because it is the
5972 * only synchronisation we have for updating the per-CPU variables.
5973 */
5974 VM_BUG_ON(!irqs_disabled());
5969 mem_cgroup_charge_statistics(memcg, page, -1); 5975 mem_cgroup_charge_statistics(memcg, page, -1);
5970 memcg_check_events(memcg, page); 5976 memcg_check_events(memcg, page);
5971} 5977}
diff --git a/mm/memory.c b/mm/memory.c
index 388dcf9aa283..bb04d8f2f86c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -61,6 +61,7 @@
61#include <linux/string.h> 61#include <linux/string.h>
62#include <linux/dma-debug.h> 62#include <linux/dma-debug.h>
63#include <linux/debugfs.h> 63#include <linux/debugfs.h>
64#include <linux/userfaultfd_k.h>
64 65
65#include <asm/io.h> 66#include <asm/io.h>
66#include <asm/pgalloc.h> 67#include <asm/pgalloc.h>
@@ -180,22 +181,22 @@ static void check_sync_rss_stat(struct task_struct *task)
180 181
181#ifdef HAVE_GENERIC_MMU_GATHER 182#ifdef HAVE_GENERIC_MMU_GATHER
182 183
183static int tlb_next_batch(struct mmu_gather *tlb) 184static bool tlb_next_batch(struct mmu_gather *tlb)
184{ 185{
185 struct mmu_gather_batch *batch; 186 struct mmu_gather_batch *batch;
186 187
187 batch = tlb->active; 188 batch = tlb->active;
188 if (batch->next) { 189 if (batch->next) {
189 tlb->active = batch->next; 190 tlb->active = batch->next;
190 return 1; 191 return true;
191 } 192 }
192 193
193 if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) 194 if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
194 return 0; 195 return false;
195 196
196 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); 197 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
197 if (!batch) 198 if (!batch)
198 return 0; 199 return false;
199 200
200 tlb->batch_count++; 201 tlb->batch_count++;
201 batch->next = NULL; 202 batch->next = NULL;
@@ -205,7 +206,7 @@ static int tlb_next_batch(struct mmu_gather *tlb)
205 tlb->active->next = batch; 206 tlb->active->next = batch;
206 tlb->active = batch; 207 tlb->active = batch;
207 208
208 return 1; 209 return true;
209} 210}
210 211
211/* tlb_gather_mmu 212/* tlb_gather_mmu
@@ -2685,6 +2686,12 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2685 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2686 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2686 if (!pte_none(*page_table)) 2687 if (!pte_none(*page_table))
2687 goto unlock; 2688 goto unlock;
2689 /* Deliver the page fault to userland, check inside PT lock */
2690 if (userfaultfd_missing(vma)) {
2691 pte_unmap_unlock(page_table, ptl);
2692 return handle_userfault(vma, address, flags,
2693 VM_UFFD_MISSING);
2694 }
2688 goto setpte; 2695 goto setpte;
2689 } 2696 }
2690 2697
@@ -2713,6 +2720,15 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2713 if (!pte_none(*page_table)) 2720 if (!pte_none(*page_table))
2714 goto release; 2721 goto release;
2715 2722
2723 /* Deliver the page fault to userland, check inside PT lock */
2724 if (userfaultfd_missing(vma)) {
2725 pte_unmap_unlock(page_table, ptl);
2726 mem_cgroup_cancel_charge(page, memcg);
2727 page_cache_release(page);
2728 return handle_userfault(vma, address, flags,
2729 VM_UFFD_MISSING);
2730 }
2731
2716 inc_mm_counter_fast(mm, MM_ANONPAGES); 2732 inc_mm_counter_fast(mm, MM_ANONPAGES);
2717 page_add_new_anon_rmap(page, vma, address); 2733 page_add_new_anon_rmap(page, vma, address);
2718 mem_cgroup_commit_charge(page, memcg, false); 2734 mem_cgroup_commit_charge(page, memcg, false);
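The two do_anonymous_page() hunks above (like the huge-pmd path in mm/huge_memory.c) now hand a still-missing page to handle_userfault() with VM_UFFD_MISSING instead of instantiating one, so a userland monitor gets to supply the contents. A minimal monitor sketch of that flow, assuming the uffdio_* structures and ioctl names from the new uapi <linux/userfaultfd.h> (error handling trimmed; resolve_fault() is a hypothetical helper, sketched after the mm/userfaultfd.c hunk further down):

#include <fcntl.h>
#include <poll.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

static void resolve_fault(int uffd, unsigned long addr);   /* hypothetical, see below */

static int monitor_range(void *area, unsigned long len)
{
        struct uffdio_api api = { .api = UFFD_API };
        struct uffdio_register reg = {
                .range = { .start = (unsigned long)area, .len = len },
                .mode  = UFFDIO_REGISTER_MODE_MISSING,
        };
        int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

        if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api) ||
            ioctl(uffd, UFFDIO_REGISTER, &reg))
                return -1;

        for (;;) {
                struct pollfd pfd = { .fd = uffd, .events = POLLIN };
                struct uffd_msg msg;

                poll(&pfd, 1, -1);
                /* one uffd_msg per fault raised by handle_userfault() */
                if (read(uffd, &msg, sizeof(msg)) == (ssize_t)sizeof(msg) &&
                    msg.event == UFFD_EVENT_PAGEFAULT)
                        resolve_fault(uffd, msg.arg.pagefault.address);
        }
}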
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6da82bcb0a8b..8fd97dac538a 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1248,6 +1248,14 @@ int __ref add_memory(int nid, u64 start, u64 size)
1248 1248
1249 mem_hotplug_begin(); 1249 mem_hotplug_begin();
1250 1250
1251 /*
1252 * Add new range to memblock so that when hotadd_new_pgdat() is called
1253 * to allocate new pgdat, get_pfn_range_for_nid() will be able to find
1254 * this new range and calculate total pages correctly. The range will
1255 * be removed at hot-remove time.
1256 */
1257 memblock_add_node(start, size, nid);
1258
1251 new_node = !node_online(nid); 1259 new_node = !node_online(nid);
1252 if (new_node) { 1260 if (new_node) {
1253 pgdat = hotadd_new_pgdat(nid, start); 1261 pgdat = hotadd_new_pgdat(nid, start);
@@ -1277,7 +1285,6 @@ int __ref add_memory(int nid, u64 start, u64 size)
1277 1285
1278 /* create new memmap entry */ 1286 /* create new memmap entry */
1279 firmware_map_add_hotplug(start, start + size, "System RAM"); 1287 firmware_map_add_hotplug(start, start + size, "System RAM");
1280 memblock_add_node(start, size, nid);
1281 1288
1282 goto out; 1289 goto out;
1283 1290
@@ -1286,6 +1293,7 @@ error:
1286 if (new_pgdat) 1293 if (new_pgdat)
1287 rollback_node_hotadd(nid, pgdat); 1294 rollback_node_hotadd(nid, pgdat);
1288 release_memory_resource(res); 1295 release_memory_resource(res);
1296 memblock_remove(start, size);
1289 1297
1290out: 1298out:
1291 mem_hotplug_done(); 1299 mem_hotplug_done();
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 99d4c1d0b858..a7f1e0d1d6b8 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -722,8 +722,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
722 pgoff = vma->vm_pgoff + 722 pgoff = vma->vm_pgoff +
723 ((vmstart - vma->vm_start) >> PAGE_SHIFT); 723 ((vmstart - vma->vm_start) >> PAGE_SHIFT);
724 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, 724 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
725 vma->anon_vma, vma->vm_file, pgoff, 725 vma->anon_vma, vma->vm_file, pgoff,
726 new_pol); 726 new_pol, vma->vm_userfaultfd_ctx);
727 if (prev) { 727 if (prev) {
728 vma = prev; 728 vma = prev;
729 next = vma->vm_next; 729 next = vma->vm_next;
diff --git a/mm/migrate.c b/mm/migrate.c
index eb4267107d1f..5c08cab5419e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1226,7 +1226,9 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
1226 if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) 1226 if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
1227 goto set_status; 1227 goto set_status;
1228 1228
1229 page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT); 1229 /* FOLL_DUMP to ignore special (like zero) pages */
1230 page = follow_page(vma, pp->addr,
1231 FOLL_GET | FOLL_SPLIT | FOLL_DUMP);
1230 1232
1231 err = PTR_ERR(page); 1233 err = PTR_ERR(page);
1232 if (IS_ERR(page)) 1234 if (IS_ERR(page))
@@ -1236,10 +1238,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
1236 if (!page) 1238 if (!page)
1237 goto set_status; 1239 goto set_status;
1238 1240
1239 /* Use PageReserved to check for zero page */
1240 if (PageReserved(page))
1241 goto put_and_set;
1242
1243 pp->page = page; 1241 pp->page = page;
1244 err = page_to_nid(page); 1242 err = page_to_nid(page);
1245 1243
@@ -1396,18 +1394,14 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
1396 if (!vma || addr < vma->vm_start) 1394 if (!vma || addr < vma->vm_start)
1397 goto set_status; 1395 goto set_status;
1398 1396
1399 page = follow_page(vma, addr, 0); 1397 /* FOLL_DUMP to ignore special (like zero) pages */
1398 page = follow_page(vma, addr, FOLL_DUMP);
1400 1399
1401 err = PTR_ERR(page); 1400 err = PTR_ERR(page);
1402 if (IS_ERR(page)) 1401 if (IS_ERR(page))
1403 goto set_status; 1402 goto set_status;
1404 1403
1405 err = -ENOENT; 1404 err = page ? page_to_nid(page) : -ENOENT;
1406 /* Use PageReserved to check for zero page */
1407 if (!page || PageReserved(page))
1408 goto set_status;
1409
1410 err = page_to_nid(page);
1411set_status: 1405set_status:
1412 *status = err; 1406 *status = err;
1413 1407
diff --git a/mm/mlock.c b/mm/mlock.c
index 6fd2cf15e868..25936680064f 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -510,7 +510,8 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
510 510
511 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 511 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
512 *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, 512 *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
513 vma->vm_file, pgoff, vma_policy(vma)); 513 vma->vm_file, pgoff, vma_policy(vma),
514 vma->vm_userfaultfd_ctx);
514 if (*prev) { 515 if (*prev) {
515 vma = *prev; 516 vma = *prev;
516 goto success; 517 goto success;
diff --git a/mm/mmap.c b/mm/mmap.c
index f126923ce683..82db4fc0a9d3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -41,6 +41,7 @@
41#include <linux/notifier.h> 41#include <linux/notifier.h>
42#include <linux/memory.h> 42#include <linux/memory.h>
43#include <linux/printk.h> 43#include <linux/printk.h>
44#include <linux/userfaultfd_k.h>
44 45
45#include <asm/uaccess.h> 46#include <asm/uaccess.h>
46#include <asm/cacheflush.h> 47#include <asm/cacheflush.h>
@@ -919,7 +920,8 @@ again: remove_next = 1 + (end > next->vm_end);
919 * per-vma resources, so we don't attempt to merge those. 920 * per-vma resources, so we don't attempt to merge those.
920 */ 921 */
921static inline int is_mergeable_vma(struct vm_area_struct *vma, 922static inline int is_mergeable_vma(struct vm_area_struct *vma,
922 struct file *file, unsigned long vm_flags) 923 struct file *file, unsigned long vm_flags,
924 struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
923{ 925{
924 /* 926 /*
925 * VM_SOFTDIRTY should not prevent from VMA merging, if we 927 * VM_SOFTDIRTY should not prevent from VMA merging, if we
@@ -935,6 +937,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
935 return 0; 937 return 0;
936 if (vma->vm_ops && vma->vm_ops->close) 938 if (vma->vm_ops && vma->vm_ops->close)
937 return 0; 939 return 0;
940 if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
941 return 0;
938 return 1; 942 return 1;
939} 943}
940 944
@@ -965,9 +969,11 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
965 */ 969 */
966static int 970static int
967can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, 971can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
968 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) 972 struct anon_vma *anon_vma, struct file *file,
973 pgoff_t vm_pgoff,
974 struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
969{ 975{
970 if (is_mergeable_vma(vma, file, vm_flags) && 976 if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
971 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { 977 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
972 if (vma->vm_pgoff == vm_pgoff) 978 if (vma->vm_pgoff == vm_pgoff)
973 return 1; 979 return 1;
@@ -984,9 +990,11 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
984 */ 990 */
985static int 991static int
986can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, 992can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
987 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) 993 struct anon_vma *anon_vma, struct file *file,
994 pgoff_t vm_pgoff,
995 struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
988{ 996{
989 if (is_mergeable_vma(vma, file, vm_flags) && 997 if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
990 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { 998 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
991 pgoff_t vm_pglen; 999 pgoff_t vm_pglen;
992 vm_pglen = vma_pages(vma); 1000 vm_pglen = vma_pages(vma);
@@ -1029,7 +1037,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
1029 struct vm_area_struct *prev, unsigned long addr, 1037 struct vm_area_struct *prev, unsigned long addr,
1030 unsigned long end, unsigned long vm_flags, 1038 unsigned long end, unsigned long vm_flags,
1031 struct anon_vma *anon_vma, struct file *file, 1039 struct anon_vma *anon_vma, struct file *file,
1032 pgoff_t pgoff, struct mempolicy *policy) 1040 pgoff_t pgoff, struct mempolicy *policy,
1041 struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
1033{ 1042{
1034 pgoff_t pglen = (end - addr) >> PAGE_SHIFT; 1043 pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
1035 struct vm_area_struct *area, *next; 1044 struct vm_area_struct *area, *next;
@@ -1056,14 +1065,17 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
1056 if (prev && prev->vm_end == addr && 1065 if (prev && prev->vm_end == addr &&
1057 mpol_equal(vma_policy(prev), policy) && 1066 mpol_equal(vma_policy(prev), policy) &&
1058 can_vma_merge_after(prev, vm_flags, 1067 can_vma_merge_after(prev, vm_flags,
1059 anon_vma, file, pgoff)) { 1068 anon_vma, file, pgoff,
1069 vm_userfaultfd_ctx)) {
1060 /* 1070 /*
1061 * OK, it can. Can we now merge in the successor as well? 1071 * OK, it can. Can we now merge in the successor as well?
1062 */ 1072 */
1063 if (next && end == next->vm_start && 1073 if (next && end == next->vm_start &&
1064 mpol_equal(policy, vma_policy(next)) && 1074 mpol_equal(policy, vma_policy(next)) &&
1065 can_vma_merge_before(next, vm_flags, 1075 can_vma_merge_before(next, vm_flags,
1066 anon_vma, file, pgoff+pglen) && 1076 anon_vma, file,
1077 pgoff+pglen,
1078 vm_userfaultfd_ctx) &&
1067 is_mergeable_anon_vma(prev->anon_vma, 1079 is_mergeable_anon_vma(prev->anon_vma,
1068 next->anon_vma, NULL)) { 1080 next->anon_vma, NULL)) {
1069 /* cases 1, 6 */ 1081 /* cases 1, 6 */
@@ -1084,7 +1096,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
1084 if (next && end == next->vm_start && 1096 if (next && end == next->vm_start &&
1085 mpol_equal(policy, vma_policy(next)) && 1097 mpol_equal(policy, vma_policy(next)) &&
1086 can_vma_merge_before(next, vm_flags, 1098 can_vma_merge_before(next, vm_flags,
1087 anon_vma, file, pgoff+pglen)) { 1099 anon_vma, file, pgoff+pglen,
1100 vm_userfaultfd_ctx)) {
1088 if (prev && addr < prev->vm_end) /* case 4 */ 1101 if (prev && addr < prev->vm_end) /* case 4 */
1089 err = vma_adjust(prev, prev->vm_start, 1102 err = vma_adjust(prev, prev->vm_start,
1090 addr, prev->vm_pgoff, NULL); 1103 addr, prev->vm_pgoff, NULL);
@@ -1570,8 +1583,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
1570 /* 1583 /*
1571 * Can we just expand an old mapping? 1584 * Can we just expand an old mapping?
1572 */ 1585 */
1573 vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, 1586 vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
1574 NULL); 1587 NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
1575 if (vma) 1588 if (vma)
1576 goto out; 1589 goto out;
1577 1590
@@ -2757,7 +2770,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
2757 2770
2758 /* Can we just expand an old private anonymous mapping? */ 2771 /* Can we just expand an old private anonymous mapping? */
2759 vma = vma_merge(mm, prev, addr, addr + len, flags, 2772 vma = vma_merge(mm, prev, addr, addr + len, flags,
2760 NULL, NULL, pgoff, NULL); 2773 NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
2761 if (vma) 2774 if (vma)
2762 goto out; 2775 goto out;
2763 2776
@@ -2913,7 +2926,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2913 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) 2926 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
2914 return NULL; /* should never get here */ 2927 return NULL; /* should never get here */
2915 new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, 2928 new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
2916 vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); 2929 vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
2930 vma->vm_userfaultfd_ctx);
2917 if (new_vma) { 2931 if (new_vma) {
2918 /* 2932 /*
2919 * Source vma may have been merged into new_vma 2933 * Source vma may have been merged into new_vma
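Every vma_merge() caller now passes a struct vm_userfaultfd_ctx (vma->vm_userfaultfd_ctx when re-merging an existing VMA, NULL_VM_UFFD_CTX when expanding a fresh anonymous mapping), and is_mergeable_vma() refuses to merge VMAs whose userfaultfd contexts differ. The is_mergeable_vm_userfaultfd_ctx() helper itself is declared in include/linux/userfaultfd_k.h and is not shown in this diff; presumably it reduces to a plain context-pointer comparison, roughly:

/* sketch only: the assumption is that merging is allowed iff both sides are
 * tracked by the same userfaultfd context (or by none at all) */
static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
                                struct vm_userfaultfd_ctx vm_ctx)
{
        return vma->vm_userfaultfd_ctx.ctx == vm_ctx.ctx;
}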
diff --git a/mm/mprotect.c b/mm/mprotect.c
index e7d6f1171ecb..ef5be8eaab00 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -292,7 +292,8 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
292 */ 292 */
293 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 293 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
294 *pprev = vma_merge(mm, *pprev, start, end, newflags, 294 *pprev = vma_merge(mm, *pprev, start, end, newflags,
295 vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); 295 vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
296 vma->vm_userfaultfd_ctx);
296 if (*pprev) { 297 if (*pprev) {
297 vma = *pprev; 298 vma = *pprev;
298 goto success; 299 goto success;
diff --git a/mm/mremap.c b/mm/mremap.c
index a7c93eceb1c8..5a71cce8c6ea 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -276,6 +276,12 @@ static unsigned long move_vma(struct vm_area_struct *vma,
276 moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len, 276 moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
277 need_rmap_locks); 277 need_rmap_locks);
278 if (moved_len < old_len) { 278 if (moved_len < old_len) {
279 err = -ENOMEM;
280 } else if (vma->vm_ops && vma->vm_ops->mremap) {
281 err = vma->vm_ops->mremap(new_vma);
282 }
283
284 if (unlikely(err)) {
279 /* 285 /*
280 * On error, move entries back from new area to old, 286 * On error, move entries back from new area to old,
281 * which will succeed since page tables still there, 287 * which will succeed since page tables still there,
@@ -286,16 +292,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
286 vma = new_vma; 292 vma = new_vma;
287 old_len = new_len; 293 old_len = new_len;
288 old_addr = new_addr; 294 old_addr = new_addr;
289 new_addr = -ENOMEM; 295 new_addr = err;
290 } else { 296 } else {
291 if (vma->vm_file && vma->vm_file->f_op->mremap) {
292 err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma);
293 if (err < 0) {
294 move_page_tables(new_vma, new_addr, vma,
295 old_addr, moved_len, true);
296 return err;
297 }
298 }
299 arch_remap(mm, old_addr, old_addr + old_len, 297 arch_remap(mm, old_addr, old_addr + old_len,
300 new_addr, new_addr + new_len); 298 new_addr, new_addr + new_len);
301 } 299 }
@@ -348,6 +346,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
348{ 346{
349 struct mm_struct *mm = current->mm; 347 struct mm_struct *mm = current->mm;
350 struct vm_area_struct *vma = find_vma(mm, addr); 348 struct vm_area_struct *vma = find_vma(mm, addr);
349 unsigned long pgoff;
351 350
352 if (!vma || vma->vm_start > addr) 351 if (!vma || vma->vm_start > addr)
353 return ERR_PTR(-EFAULT); 352 return ERR_PTR(-EFAULT);
@@ -359,17 +358,17 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
359 if (old_len > vma->vm_end - addr) 358 if (old_len > vma->vm_end - addr)
360 return ERR_PTR(-EFAULT); 359 return ERR_PTR(-EFAULT);
361 360
361 if (new_len == old_len)
362 return vma;
363
362 /* Need to be careful about a growing mapping */ 364 /* Need to be careful about a growing mapping */
363 if (new_len > old_len) { 365 pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
364 unsigned long pgoff; 366 pgoff += vma->vm_pgoff;
365 367 if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
366 if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) 368 return ERR_PTR(-EINVAL);
367 return ERR_PTR(-EFAULT); 369
368 pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; 370 if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
369 pgoff += vma->vm_pgoff; 371 return ERR_PTR(-EFAULT);
370 if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
371 return ERR_PTR(-EINVAL);
372 }
373 372
374 if (vma->vm_flags & VM_LOCKED) { 373 if (vma->vm_flags & VM_LOCKED) {
375 unsigned long locked, lock_limit; 374 unsigned long locked, lock_limit;
@@ -408,13 +407,8 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
408 if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) 407 if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
409 goto out; 408 goto out;
410 409
411 /* Check if the location we're moving into overlaps the 410 /* Ensure the old/new locations do not overlap */
412 * old location at all, and fail if it does. 411 if (addr + old_len > new_addr && new_addr + new_len > addr)
413 */
414 if ((new_addr <= addr) && (new_addr+new_len) > addr)
415 goto out;
416
417 if ((addr <= new_addr) && (addr+old_len) > new_addr)
418 goto out; 412 goto out;
419 413
420 ret = do_munmap(mm, new_addr, new_len); 414 ret = do_munmap(mm, new_addr, new_len);
@@ -580,8 +574,10 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
580 ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked); 574 ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked);
581 } 575 }
582out: 576out:
583 if (ret & ~PAGE_MASK) 577 if (ret & ~PAGE_MASK) {
584 vm_unacct_memory(charged); 578 vm_unacct_memory(charged);
579 locked = 0;
580 }
585 up_write(&current->mm->mmap_sem); 581 up_write(&current->mm->mmap_sem);
586 if (locked && new_len > old_len) 582 if (locked && new_len > old_len)
587 mm_populate(new_addr + old_len, new_len - old_len); 583 mm_populate(new_addr + old_len, new_len - old_len);
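With the hook moved from file_operations to vm_operations_struct, move_vma() notifies the mapping's owner only after the page tables have been moved into new_vma, and a non-zero return makes it move the entries back and fail the mremap(). A driver-side sketch (the mydrv_* names and the per-mapping state are hypothetical; only the int (*mremap)(struct vm_area_struct *) shape of the callback is taken from the hunk above):

static int mydrv_vm_mremap(struct vm_area_struct *new_vma)
{
        struct mydrv_buffer *buf = new_vma->vm_private_data;    /* hypothetical per-mapping state */

        /* remember where userspace moved the mapping */
        buf->uaddr = new_vma->vm_start;
        return 0;       /* non-zero would make move_vma() undo the move */
}

static const struct vm_operations_struct mydrv_vm_ops = {
        .mremap = mydrv_vm_mremap,
};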
diff --git a/mm/rmap.c b/mm/rmap.c
index 171b68768df1..0db38e7d0a72 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -62,6 +62,8 @@
62 62
63#include <asm/tlbflush.h> 63#include <asm/tlbflush.h>
64 64
65#include <trace/events/tlb.h>
66
65#include "internal.h" 67#include "internal.h"
66 68
67static struct kmem_cache *anon_vma_cachep; 69static struct kmem_cache *anon_vma_cachep;
@@ -583,6 +585,107 @@ vma_address(struct page *page, struct vm_area_struct *vma)
583 return address; 585 return address;
584} 586}
585 587
588#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
589static void percpu_flush_tlb_batch_pages(void *data)
590{
591 /*
592 * All TLB entries are flushed on the assumption that it is
593 * cheaper to flush all TLBs and let them be refilled than
594 * flushing individual PFNs. Note that we do not track mm's
595 * to flush as that might simply be multiple full TLB flushes
596 * for no gain.
597 */
598 count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
599 flush_tlb_local();
600}
601
602/*
603 * Flush TLB entries for recently unmapped pages from remote CPUs. It is
604 * important if a PTE was dirty when it was unmapped that it's flushed
605 * before any IO is initiated on the page to prevent lost writes. Similarly,
606 * it must be flushed before freeing to prevent data leakage.
607 */
608void try_to_unmap_flush(void)
609{
610 struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
611 int cpu;
612
613 if (!tlb_ubc->flush_required)
614 return;
615
616 cpu = get_cpu();
617
618 trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, -1UL);
619
620 if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask))
621 percpu_flush_tlb_batch_pages(&tlb_ubc->cpumask);
622
623 if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids) {
624 smp_call_function_many(&tlb_ubc->cpumask,
625 percpu_flush_tlb_batch_pages, (void *)tlb_ubc, true);
626 }
627 cpumask_clear(&tlb_ubc->cpumask);
628 tlb_ubc->flush_required = false;
629 tlb_ubc->writable = false;
630 put_cpu();
631}
632
633/* Flush iff there are potentially writable TLB entries that can race with IO */
634void try_to_unmap_flush_dirty(void)
635{
636 struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
637
638 if (tlb_ubc->writable)
639 try_to_unmap_flush();
640}
641
642static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
643 struct page *page, bool writable)
644{
645 struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
646
647 cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm));
648 tlb_ubc->flush_required = true;
649
650 /*
651 * If the PTE was dirty then it's best to assume it's writable. The
652 * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
653 * before the page is queued for IO.
654 */
655 if (writable)
656 tlb_ubc->writable = true;
657}
658
659/*
660 * Returns true if the TLB flush should be deferred to the end of a batch of
661 * unmap operations to reduce IPIs.
662 */
663static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
664{
665 bool should_defer = false;
666
667 if (!(flags & TTU_BATCH_FLUSH))
668 return false;
669
 670 /* If remote CPUs need to be flushed then batch and defer the flush */
671 if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
672 should_defer = true;
673 put_cpu();
674
675 return should_defer;
676}
677#else
678static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
679 struct page *page, bool writable)
680{
681}
682
683static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
684{
685 return false;
686}
687#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
688
586/* 689/*
587 * At what user virtual address is page expected in vma? 690 * At what user virtual address is page expected in vma?
588 * Caller should check the page is actually part of the vma. 691 * Caller should check the page is actually part of the vma.
@@ -1220,7 +1323,20 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1220 1323
1221 /* Nuke the page table entry. */ 1324 /* Nuke the page table entry. */
1222 flush_cache_page(vma, address, page_to_pfn(page)); 1325 flush_cache_page(vma, address, page_to_pfn(page));
1223 pteval = ptep_clear_flush(vma, address, pte); 1326 if (should_defer_flush(mm, flags)) {
1327 /*
1328 * We clear the PTE but do not flush so potentially a remote
1329 * CPU could still be writing to the page. If the entry was
1330 * previously clean then the architecture must guarantee that
1331 * a clear->dirty transition on a cached TLB entry is written
1332 * through and traps if the PTE is unmapped.
1333 */
1334 pteval = ptep_get_and_clear(mm, address, pte);
1335
1336 set_tlb_ubc_flush_pending(mm, page, pte_dirty(pteval));
1337 } else {
1338 pteval = ptep_clear_flush(vma, address, pte);
1339 }
1224 1340
1225 /* Move the dirty bit to the physical page now the pte is gone. */ 1341 /* Move the dirty bit to the physical page now the pte is gone. */
1226 if (pte_dirty(pteval)) 1342 if (pte_dirty(pteval))
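All of the deferred-flush state above lives in a per-task tlbflush_unmap_batch (current->tlb_ubc): try_to_unmap_one() accumulates the mm's CPU mask through set_tlb_ubc_flush_pending(), and try_to_unmap_flush() later drains it with one IPI broadcast. Judging purely from the accesses in this hunk, the structure is roughly the following sketch (the real declaration, presumably in <linux/sched.h> under CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH, is not part of this diff):

struct tlbflush_unmap_batch {
        struct cpumask  cpumask;        /* CPUs that may still cache the cleared PTEs */
        bool            flush_required; /* at least one PTE was cleared without flushing */
        bool            writable;       /* one of those PTEs was writable/dirty */
};

shrink_page_list() (see the mm/vmscan.c hunk further down) then pairs try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH) with try_to_unmap_flush_dirty() before pageout() starts IO and with try_to_unmap_flush() before the reclaimed pages are freed.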
diff --git a/mm/slab.c b/mm/slab.c
index bbd0b47dc6a9..60c936938b84 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3416,6 +3416,19 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3416} 3416}
3417EXPORT_SYMBOL(kmem_cache_alloc); 3417EXPORT_SYMBOL(kmem_cache_alloc);
3418 3418
3419void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
3420{
3421 __kmem_cache_free_bulk(s, size, p);
3422}
3423EXPORT_SYMBOL(kmem_cache_free_bulk);
3424
3425bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
3426 void **p)
3427{
3428 return __kmem_cache_alloc_bulk(s, flags, size, p);
3429}
3430EXPORT_SYMBOL(kmem_cache_alloc_bulk);
3431
3419#ifdef CONFIG_TRACING 3432#ifdef CONFIG_TRACING
3420void * 3433void *
3421kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) 3434kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
diff --git a/mm/slab.h b/mm/slab.h
index 8da63e4e470f..a3a967d7d7c2 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -163,6 +163,15 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s);
163ssize_t slabinfo_write(struct file *file, const char __user *buffer, 163ssize_t slabinfo_write(struct file *file, const char __user *buffer,
164 size_t count, loff_t *ppos); 164 size_t count, loff_t *ppos);
165 165
166/*
167 * Generic implementation of bulk operations
168 * These are useful for situations in which the allocator cannot
 169 * perform optimizations. In that case segments of the object list
170 * may be allocated or freed using these operations.
171 */
172void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
173bool __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
174
166#ifdef CONFIG_MEMCG_KMEM 175#ifdef CONFIG_MEMCG_KMEM
167/* 176/*
168 * Iterate over all memcg caches of the given root cache. The caller must hold 177 * Iterate over all memcg caches of the given root cache. The caller must hold
@@ -321,7 +330,7 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
321 return cachep; 330 return cachep;
322 331
323 pr_err("%s: Wrong slab cache. %s but object is from %s\n", 332 pr_err("%s: Wrong slab cache. %s but object is from %s\n",
324 __func__, cachep->name, s->name); 333 __func__, s->name, cachep->name);
325 WARN_ON_ONCE(1); 334 WARN_ON_ONCE(1);
326 return s; 335 return s;
327} 336}
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 86831105a09f..c26829fe4e37 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -104,6 +104,29 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size)
104} 104}
105#endif 105#endif
106 106
107void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p)
108{
109 size_t i;
110
111 for (i = 0; i < nr; i++)
112 kmem_cache_free(s, p[i]);
113}
114
115bool __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
116 void **p)
117{
118 size_t i;
119
120 for (i = 0; i < nr; i++) {
121 void *x = p[i] = kmem_cache_alloc(s, flags);
122 if (!x) {
123 __kmem_cache_free_bulk(s, i, p);
124 return false;
125 }
126 }
127 return true;
128}
129
107#ifdef CONFIG_MEMCG_KMEM 130#ifdef CONFIG_MEMCG_KMEM
108void slab_init_memcg_params(struct kmem_cache *s) 131void slab_init_memcg_params(struct kmem_cache *s)
109{ 132{
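These generic loops back the new kmem_cache_alloc_bulk()/kmem_cache_free_bulk() entry points for SLAB and SLOB, while SLUB (further down) adds a fastpath that keeps interrupts disabled across the whole batch. A hypothetical caller, illustrating the all-or-nothing allocation contract visible in __kmem_cache_alloc_bulk() (my_cachep is an assumed, already-created cache; per the SLUB comments below, both calls must be made with interrupts enabled):

static int fill_batch(struct kmem_cache *my_cachep)
{
        void *objs[16];

        /* either all 16 objects are returned, or none and the call reports false */
        if (!kmem_cache_alloc_bulk(my_cachep, GFP_KERNEL, ARRAY_SIZE(objs), objs))
                return -ENOMEM;

        /* ... hand the objects off, or simply give them back ... */
        kmem_cache_free_bulk(my_cachep, ARRAY_SIZE(objs), objs);
        return 0;
}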
diff --git a/mm/slob.c b/mm/slob.c
index 4765f65019c7..165bbd3cd606 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -611,6 +611,19 @@ void kmem_cache_free(struct kmem_cache *c, void *b)
611} 611}
612EXPORT_SYMBOL(kmem_cache_free); 612EXPORT_SYMBOL(kmem_cache_free);
613 613
614void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
615{
616 __kmem_cache_free_bulk(s, size, p);
617}
618EXPORT_SYMBOL(kmem_cache_free_bulk);
619
620bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
621 void **p)
622{
623 return __kmem_cache_alloc_bulk(s, flags, size, p);
624}
625EXPORT_SYMBOL(kmem_cache_alloc_bulk);
626
614int __kmem_cache_shutdown(struct kmem_cache *c) 627int __kmem_cache_shutdown(struct kmem_cache *c)
615{ 628{
616 /* No way to check for remaining objects */ 629 /* No way to check for remaining objects */
diff --git a/mm/slub.c b/mm/slub.c
index f68c0e50f3c0..084184e706c6 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1306,6 +1306,17 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
1306 kasan_slab_free(s, x); 1306 kasan_slab_free(s, x);
1307} 1307}
1308 1308
1309static void setup_object(struct kmem_cache *s, struct page *page,
1310 void *object)
1311{
1312 setup_object_debug(s, page, object);
1313 if (unlikely(s->ctor)) {
1314 kasan_unpoison_object_data(s, object);
1315 s->ctor(object);
1316 kasan_poison_object_data(s, object);
1317 }
1318}
1319
1309/* 1320/*
1310 * Slab allocation and freeing 1321 * Slab allocation and freeing
1311 */ 1322 */
@@ -1336,6 +1347,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1336 struct page *page; 1347 struct page *page;
1337 struct kmem_cache_order_objects oo = s->oo; 1348 struct kmem_cache_order_objects oo = s->oo;
1338 gfp_t alloc_gfp; 1349 gfp_t alloc_gfp;
1350 void *start, *p;
1351 int idx, order;
1339 1352
1340 flags &= gfp_allowed_mask; 1353 flags &= gfp_allowed_mask;
1341 1354
@@ -1349,6 +1362,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1349 * so we fall-back to the minimum order allocation. 1362 * so we fall-back to the minimum order allocation.
1350 */ 1363 */
1351 alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; 1364 alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
1365 if ((alloc_gfp & __GFP_WAIT) && oo_order(oo) > oo_order(s->min))
1366 alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_WAIT;
1352 1367
1353 page = alloc_slab_page(s, alloc_gfp, node, oo); 1368 page = alloc_slab_page(s, alloc_gfp, node, oo);
1354 if (unlikely(!page)) { 1369 if (unlikely(!page)) {
@@ -1359,13 +1374,13 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1359 * Try a lower order alloc if possible 1374 * Try a lower order alloc if possible
1360 */ 1375 */
1361 page = alloc_slab_page(s, alloc_gfp, node, oo); 1376 page = alloc_slab_page(s, alloc_gfp, node, oo);
1362 1377 if (unlikely(!page))
1363 if (page) 1378 goto out;
1364 stat(s, ORDER_FALLBACK); 1379 stat(s, ORDER_FALLBACK);
1365 } 1380 }
1366 1381
1367 if (kmemcheck_enabled && page 1382 if (kmemcheck_enabled &&
1368 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { 1383 !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
1369 int pages = 1 << oo_order(oo); 1384 int pages = 1 << oo_order(oo);
1370 1385
1371 kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node); 1386 kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node);
@@ -1380,51 +1395,9 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1380 kmemcheck_mark_unallocated_pages(page, pages); 1395 kmemcheck_mark_unallocated_pages(page, pages);
1381 } 1396 }
1382 1397
1383 if (flags & __GFP_WAIT)
1384 local_irq_disable();
1385 if (!page)
1386 return NULL;
1387
1388 page->objects = oo_objects(oo); 1398 page->objects = oo_objects(oo);
1389 mod_zone_page_state(page_zone(page),
1390 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1391 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1392 1 << oo_order(oo));
1393
1394 return page;
1395}
1396
1397static void setup_object(struct kmem_cache *s, struct page *page,
1398 void *object)
1399{
1400 setup_object_debug(s, page, object);
1401 if (unlikely(s->ctor)) {
1402 kasan_unpoison_object_data(s, object);
1403 s->ctor(object);
1404 kasan_poison_object_data(s, object);
1405 }
1406}
1407
1408static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1409{
1410 struct page *page;
1411 void *start;
1412 void *p;
1413 int order;
1414 int idx;
1415
1416 if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
1417 pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK);
1418 BUG();
1419 }
1420
1421 page = allocate_slab(s,
1422 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
1423 if (!page)
1424 goto out;
1425 1399
1426 order = compound_order(page); 1400 order = compound_order(page);
1427 inc_slabs_node(s, page_to_nid(page), page->objects);
1428 page->slab_cache = s; 1401 page->slab_cache = s;
1429 __SetPageSlab(page); 1402 __SetPageSlab(page);
1430 if (page_is_pfmemalloc(page)) 1403 if (page_is_pfmemalloc(page))
@@ -1448,10 +1421,34 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1448 page->freelist = start; 1421 page->freelist = start;
1449 page->inuse = page->objects; 1422 page->inuse = page->objects;
1450 page->frozen = 1; 1423 page->frozen = 1;
1424
1451out: 1425out:
1426 if (flags & __GFP_WAIT)
1427 local_irq_disable();
1428 if (!page)
1429 return NULL;
1430
1431 mod_zone_page_state(page_zone(page),
1432 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1433 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1434 1 << oo_order(oo));
1435
1436 inc_slabs_node(s, page_to_nid(page), page->objects);
1437
1452 return page; 1438 return page;
1453} 1439}
1454 1440
1441static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1442{
1443 if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
1444 pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK);
1445 BUG();
1446 }
1447
1448 return allocate_slab(s,
1449 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
1450}
1451
1455static void __free_slab(struct kmem_cache *s, struct page *page) 1452static void __free_slab(struct kmem_cache *s, struct page *page)
1456{ 1453{
1457 int order = compound_order(page); 1454 int order = compound_order(page);
@@ -2712,7 +2709,7 @@ redo:
2712 * Determine the currently cpus per cpu slab. 2709 * Determine the currently cpus per cpu slab.
2713 * The cpu may change afterward. However that does not matter since 2710 * The cpu may change afterward. However that does not matter since
2714 * data is retrieved via this pointer. If we are on the same cpu 2711 * data is retrieved via this pointer. If we are on the same cpu
2715 * during the cmpxchg then the free will succedd. 2712 * during the cmpxchg then the free will succeed.
2716 */ 2713 */
2717 do { 2714 do {
2718 tid = this_cpu_read(s->cpu_slab->tid); 2715 tid = this_cpu_read(s->cpu_slab->tid);
@@ -2750,6 +2747,113 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
2750} 2747}
2751EXPORT_SYMBOL(kmem_cache_free); 2748EXPORT_SYMBOL(kmem_cache_free);
2752 2749
2750/* Note that interrupts must be enabled when calling this function. */
2751void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
2752{
2753 struct kmem_cache_cpu *c;
2754 struct page *page;
2755 int i;
2756
2757 local_irq_disable();
2758 c = this_cpu_ptr(s->cpu_slab);
2759
2760 for (i = 0; i < size; i++) {
2761 void *object = p[i];
2762
2763 BUG_ON(!object);
2764 /* kmem cache debug support */
2765 s = cache_from_obj(s, object);
2766 if (unlikely(!s))
2767 goto exit;
2768 slab_free_hook(s, object);
2769
2770 page = virt_to_head_page(object);
2771
2772 if (c->page == page) {
2773 /* Fastpath: local CPU free */
2774 set_freepointer(s, object, c->freelist);
2775 c->freelist = object;
2776 } else {
2777 c->tid = next_tid(c->tid);
2778 local_irq_enable();
2779 /* Slowpath: overhead locked cmpxchg_double_slab */
2780 __slab_free(s, page, object, _RET_IP_);
2781 local_irq_disable();
2782 c = this_cpu_ptr(s->cpu_slab);
2783 }
2784 }
2785exit:
2786 c->tid = next_tid(c->tid);
2787 local_irq_enable();
2788}
2789EXPORT_SYMBOL(kmem_cache_free_bulk);
2790
2791/* Note that interrupts must be enabled when calling this function. */
2792bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
2793 void **p)
2794{
2795 struct kmem_cache_cpu *c;
2796 int i;
2797
2798 /*
2799 * Drain objects in the per cpu slab, while disabling local
 2800 * IRQs, which protects against PREEMPT and interrupt
 2801 * handlers invoking the normal fastpath.
2802 */
2803 local_irq_disable();
2804 c = this_cpu_ptr(s->cpu_slab);
2805
2806 for (i = 0; i < size; i++) {
2807 void *object = c->freelist;
2808
2809 if (unlikely(!object)) {
2810 local_irq_enable();
2811 /*
 2812 * Invoking the slow path likely has the side-effect
 2813 * of re-populating the per-CPU c->freelist
2814 */
2815 p[i] = __slab_alloc(s, flags, NUMA_NO_NODE,
2816 _RET_IP_, c);
2817 if (unlikely(!p[i])) {
2818 __kmem_cache_free_bulk(s, i, p);
2819 return false;
2820 }
2821 local_irq_disable();
2822 c = this_cpu_ptr(s->cpu_slab);
2823 continue; /* goto for-loop */
2824 }
2825
2826 /* kmem_cache debug support */
2827 s = slab_pre_alloc_hook(s, flags);
2828 if (unlikely(!s)) {
2829 __kmem_cache_free_bulk(s, i, p);
2830 c->tid = next_tid(c->tid);
2831 local_irq_enable();
2832 return false;
2833 }
2834
2835 c->freelist = get_freepointer(s, object);
2836 p[i] = object;
2837
2838 /* kmem_cache debug support */
2839 slab_post_alloc_hook(s, flags, object);
2840 }
2841 c->tid = next_tid(c->tid);
2842 local_irq_enable();
2843
2844 /* Clear memory outside IRQ disabled fastpath loop */
2845 if (unlikely(flags & __GFP_ZERO)) {
2846 int j;
2847
2848 for (j = 0; j < i; j++)
2849 memset(p[j], 0, s->object_size);
2850 }
2851
2852 return true;
2853}
2854EXPORT_SYMBOL(kmem_cache_alloc_bulk);
2855
2856
2753/* 2857/*
2754 * Object placement in a slab is made very easy because we always start at 2858 * Object placement in a slab is made very easy because we always start at
2755 * offset 0. If we tune the size of the object to the alignment then we can 2859 * offset 0. If we tune the size of the object to the alignment then we can
@@ -5181,7 +5285,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
5181 s->kobj.kset = cache_kset(s); 5285 s->kobj.kset = cache_kset(s);
5182 err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); 5286 err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
5183 if (err) 5287 if (err)
5184 goto out_put_kobj; 5288 goto out;
5185 5289
5186 err = sysfs_create_group(&s->kobj, &slab_attr_group); 5290 err = sysfs_create_group(&s->kobj, &slab_attr_group);
5187 if (err) 5291 if (err)
@@ -5208,8 +5312,6 @@ out:
5208 return err; 5312 return err;
5209out_del_kobj: 5313out_del_kobj:
5210 kobject_del(&s->kobj); 5314 kobject_del(&s->kobj);
5211out_put_kobj:
5212 kobject_put(&s->kobj);
5213 goto out; 5315 goto out;
5214} 5316}
5215 5317
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
new file mode 100644
index 000000000000..77fee9325a57
--- /dev/null
+++ b/mm/userfaultfd.c
@@ -0,0 +1,308 @@
1/*
2 * mm/userfaultfd.c
3 *
4 * Copyright (C) 2015 Red Hat, Inc.
5 *
6 * This work is licensed under the terms of the GNU GPL, version 2. See
7 * the COPYING file in the top-level directory.
8 */
9
10#include <linux/mm.h>
11#include <linux/pagemap.h>
12#include <linux/rmap.h>
13#include <linux/swap.h>
14#include <linux/swapops.h>
15#include <linux/userfaultfd_k.h>
16#include <linux/mmu_notifier.h>
17#include <asm/tlbflush.h>
18#include "internal.h"
19
20static int mcopy_atomic_pte(struct mm_struct *dst_mm,
21 pmd_t *dst_pmd,
22 struct vm_area_struct *dst_vma,
23 unsigned long dst_addr,
24 unsigned long src_addr,
25 struct page **pagep)
26{
27 struct mem_cgroup *memcg;
28 pte_t _dst_pte, *dst_pte;
29 spinlock_t *ptl;
30 void *page_kaddr;
31 int ret;
32 struct page *page;
33
34 if (!*pagep) {
35 ret = -ENOMEM;
36 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr);
37 if (!page)
38 goto out;
39
40 page_kaddr = kmap_atomic(page);
41 ret = copy_from_user(page_kaddr,
42 (const void __user *) src_addr,
43 PAGE_SIZE);
44 kunmap_atomic(page_kaddr);
45
46 /* fallback to copy_from_user outside mmap_sem */
47 if (unlikely(ret)) {
48 ret = -EFAULT;
49 *pagep = page;
50 /* don't free the page */
51 goto out;
52 }
53 } else {
54 page = *pagep;
55 *pagep = NULL;
56 }
57
58 /*
59 * The memory barrier inside __SetPageUptodate makes sure that
 60 * preceding stores to the page contents become visible before
61 * the set_pte_at() write.
62 */
63 __SetPageUptodate(page);
64
65 ret = -ENOMEM;
66 if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg))
67 goto out_release;
68
69 _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
70 if (dst_vma->vm_flags & VM_WRITE)
71 _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
72
73 ret = -EEXIST;
74 dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
75 if (!pte_none(*dst_pte))
76 goto out_release_uncharge_unlock;
77
78 inc_mm_counter(dst_mm, MM_ANONPAGES);
79 page_add_new_anon_rmap(page, dst_vma, dst_addr);
80 mem_cgroup_commit_charge(page, memcg, false);
81 lru_cache_add_active_or_unevictable(page, dst_vma);
82
83 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
84
85 /* No need to invalidate - it was non-present before */
86 update_mmu_cache(dst_vma, dst_addr, dst_pte);
87
88 pte_unmap_unlock(dst_pte, ptl);
89 ret = 0;
90out:
91 return ret;
92out_release_uncharge_unlock:
93 pte_unmap_unlock(dst_pte, ptl);
94 mem_cgroup_cancel_charge(page, memcg);
95out_release:
96 page_cache_release(page);
97 goto out;
98}
99
100static int mfill_zeropage_pte(struct mm_struct *dst_mm,
101 pmd_t *dst_pmd,
102 struct vm_area_struct *dst_vma,
103 unsigned long dst_addr)
104{
105 pte_t _dst_pte, *dst_pte;
106 spinlock_t *ptl;
107 int ret;
108
109 _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
110 dst_vma->vm_page_prot));
111 ret = -EEXIST;
112 dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
113 if (!pte_none(*dst_pte))
114 goto out_unlock;
115 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
116 /* No need to invalidate - it was non-present before */
117 update_mmu_cache(dst_vma, dst_addr, dst_pte);
118 ret = 0;
119out_unlock:
120 pte_unmap_unlock(dst_pte, ptl);
121 return ret;
122}
123
124static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
125{
126 pgd_t *pgd;
127 pud_t *pud;
128 pmd_t *pmd = NULL;
129
130 pgd = pgd_offset(mm, address);
131 pud = pud_alloc(mm, pgd, address);
132 if (pud)
133 /*
134 * Note that we didn't run this because the pmd was
135 * missing, the *pmd may be already established and in
136 * turn it may also be a trans_huge_pmd.
137 */
138 pmd = pmd_alloc(mm, pud, address);
139 return pmd;
140}
141
142static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
143 unsigned long dst_start,
144 unsigned long src_start,
145 unsigned long len,
146 bool zeropage)
147{
148 struct vm_area_struct *dst_vma;
149 ssize_t err;
150 pmd_t *dst_pmd;
151 unsigned long src_addr, dst_addr;
152 long copied;
153 struct page *page;
154
155 /*
156 * Sanitize the command parameters:
157 */
158 BUG_ON(dst_start & ~PAGE_MASK);
159 BUG_ON(len & ~PAGE_MASK);
160
161 /* Does the address range wrap, or is the span zero-sized? */
162 BUG_ON(src_start + len <= src_start);
163 BUG_ON(dst_start + len <= dst_start);
164
165 src_addr = src_start;
166 dst_addr = dst_start;
167 copied = 0;
168 page = NULL;
169retry:
170 down_read(&dst_mm->mmap_sem);
171
172 /*
173 * Make sure the vma is not shared, that the dst range is
174 * both valid and fully within a single existing vma.
175 */
176 err = -EINVAL;
177 dst_vma = find_vma(dst_mm, dst_start);
178 if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
179 goto out_unlock;
180 if (dst_start < dst_vma->vm_start ||
181 dst_start + len > dst_vma->vm_end)
182 goto out_unlock;
183
184 /*
185 * Be strict and only allow __mcopy_atomic on userfaultfd
186 * registered ranges to prevent userland errors going
187 * unnoticed. As far as the VM consistency is concerned, it
188 * would be perfectly safe to remove this check, but there's
 189 * no useful usage for __mcopy_atomic outside of userfaultfd
190 * registered ranges. This is after all why these are ioctls
191 * belonging to the userfaultfd and not syscalls.
192 */
193 if (!dst_vma->vm_userfaultfd_ctx.ctx)
194 goto out_unlock;
195
196 /*
197 * FIXME: only allow copying on anonymous vmas, tmpfs should
198 * be added.
199 */
200 if (dst_vma->vm_ops)
201 goto out_unlock;
202
203 /*
 204 * Ensure the dst_vma has an anon_vma or this page
205 * would get a NULL anon_vma when moved in the
206 * dst_vma.
207 */
208 err = -ENOMEM;
209 if (unlikely(anon_vma_prepare(dst_vma)))
210 goto out_unlock;
211
212 while (src_addr < src_start + len) {
213 pmd_t dst_pmdval;
214
215 BUG_ON(dst_addr >= dst_start + len);
216
217 dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
218 if (unlikely(!dst_pmd)) {
219 err = -ENOMEM;
220 break;
221 }
222
223 dst_pmdval = pmd_read_atomic(dst_pmd);
224 /*
225 * If the dst_pmd is mapped as THP don't
226 * override it and just be strict.
227 */
228 if (unlikely(pmd_trans_huge(dst_pmdval))) {
229 err = -EEXIST;
230 break;
231 }
232 if (unlikely(pmd_none(dst_pmdval)) &&
233 unlikely(__pte_alloc(dst_mm, dst_vma, dst_pmd,
234 dst_addr))) {
235 err = -ENOMEM;
236 break;
237 }
238 /* If an huge pmd materialized from under us fail */
239 if (unlikely(pmd_trans_huge(*dst_pmd))) {
240 err = -EFAULT;
241 break;
242 }
243
244 BUG_ON(pmd_none(*dst_pmd));
245 BUG_ON(pmd_trans_huge(*dst_pmd));
246
247 if (!zeropage)
248 err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
249 dst_addr, src_addr, &page);
250 else
251 err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma,
252 dst_addr);
253
254 cond_resched();
255
256 if (unlikely(err == -EFAULT)) {
257 void *page_kaddr;
258
259 up_read(&dst_mm->mmap_sem);
260 BUG_ON(!page);
261
262 page_kaddr = kmap(page);
263 err = copy_from_user(page_kaddr,
264 (const void __user *) src_addr,
265 PAGE_SIZE);
266 kunmap(page);
267 if (unlikely(err)) {
268 err = -EFAULT;
269 goto out;
270 }
271 goto retry;
272 } else
273 BUG_ON(page);
274
275 if (!err) {
276 dst_addr += PAGE_SIZE;
277 src_addr += PAGE_SIZE;
278 copied += PAGE_SIZE;
279
280 if (fatal_signal_pending(current))
281 err = -EINTR;
282 }
283 if (err)
284 break;
285 }
286
287out_unlock:
288 up_read(&dst_mm->mmap_sem);
289out:
290 if (page)
291 page_cache_release(page);
292 BUG_ON(copied < 0);
293 BUG_ON(err > 0);
294 BUG_ON(!copied && !err);
295 return copied ? copied : err;
296}
297
298ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
299 unsigned long src_start, unsigned long len)
300{
301 return __mcopy_atomic(dst_mm, dst_start, src_start, len, false);
302}
303
304ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
305 unsigned long len)
306{
307 return __mcopy_atomic(dst_mm, start, 0, len, true);
308}
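mcopy_atomic() and mfill_zeropage() are the kernel backends of the new UFFDIO_COPY and UFFDIO_ZEROPAGE ioctls: the page (or zero-page PTE) is prepared first and only installed if the destination PTE is still none, so a racing fill surfaces as -EEXIST rather than overwriting an existing mapping. Continuing the hypothetical monitor from the mm/memory.c hunk above (same includes plus <errno.h> and <stdio.h>), a missing fault would be resolved roughly as follows:

static unsigned long page_size;         /* assumed: cached sysconf(_SC_PAGESIZE) */
static void *src_page;                  /* assumed: buffer holding the page contents */

static void resolve_fault(int uffd, unsigned long addr)
{
        struct uffdio_copy copy = {
                .dst  = addr & ~(page_size - 1),
                .src  = (unsigned long)src_page,
                .len  = page_size,
                .mode = 0,              /* 0 = also wake the faulting thread(s) */
        };

        /* copy.copy reports bytes copied, or a negative error such as
         * -EEXIST when another thread resolved the fault first */
        if (ioctl(uffd, UFFDIO_COPY, &copy) && copy.copy != -EEXIST)
                perror("UFFDIO_COPY");
}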
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8286938c70de..b1139039122a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1057,7 +1057,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
1057 * processes. Try to unmap it here. 1057 * processes. Try to unmap it here.
1058 */ 1058 */
1059 if (page_mapped(page) && mapping) { 1059 if (page_mapped(page) && mapping) {
1060 switch (try_to_unmap(page, ttu_flags)) { 1060 switch (try_to_unmap(page,
1061 ttu_flags|TTU_BATCH_FLUSH)) {
1061 case SWAP_FAIL: 1062 case SWAP_FAIL:
1062 goto activate_locked; 1063 goto activate_locked;
1063 case SWAP_AGAIN: 1064 case SWAP_AGAIN:
@@ -1097,7 +1098,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
1097 if (!sc->may_writepage) 1098 if (!sc->may_writepage)
1098 goto keep_locked; 1099 goto keep_locked;
1099 1100
1100 /* Page is dirty, try to write it out here */ 1101 /*
1102 * Page is dirty. Flush the TLB if a writable entry
1103 * potentially exists to avoid CPU writes after IO
1104 * starts and then write it out here.
1105 */
1106 try_to_unmap_flush_dirty();
1101 switch (pageout(page, mapping, sc)) { 1107 switch (pageout(page, mapping, sc)) {
1102 case PAGE_KEEP: 1108 case PAGE_KEEP:
1103 goto keep_locked; 1109 goto keep_locked;
@@ -1208,6 +1214,7 @@ keep:
1208 } 1214 }
1209 1215
1210 mem_cgroup_uncharge_list(&free_pages); 1216 mem_cgroup_uncharge_list(&free_pages);
1217 try_to_unmap_flush();
1211 free_hot_cold_page_list(&free_pages, true); 1218 free_hot_cold_page_list(&free_pages, true);
1212 1219
1213 list_splice(&ret_pages, page_list); 1220 list_splice(&ret_pages, page_list);
@@ -2151,6 +2158,23 @@ out:
2151 } 2158 }
2152} 2159}
2153 2160
2161#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
2162static void init_tlb_ubc(void)
2163{
2164 /*
2165 * This deliberately does not clear the cpumask as it's expensive
2166 * and unnecessary. If there happens to be data in there then the
2167 * first SWAP_CLUSTER_MAX pages will send an unnecessary IPI and
2168 * then will be cleared.
2169 */
2170 current->tlb_ubc.flush_required = false;
2171}
2172#else
2173static inline void init_tlb_ubc(void)
2174{
2175}
2176#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
2177
2154/* 2178/*
2155 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 2179 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
2156 */ 2180 */
@@ -2185,6 +2209,8 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
2185 scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && 2209 scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
2186 sc->priority == DEF_PRIORITY); 2210 sc->priority == DEF_PRIORITY);
2187 2211
2212 init_tlb_ubc();
2213
2188 blk_start_plug(&plug); 2214 blk_start_plug(&plug);
2189 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 2215 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
2190 nr[LRU_INACTIVE_FILE]) { 2216 nr[LRU_INACTIVE_FILE]) {
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index f30329f72641..69a4d30a9ccf 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -517,8 +517,11 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
517 struct ceph_options *opt = client->options; 517 struct ceph_options *opt = client->options;
518 size_t pos = m->count; 518 size_t pos = m->count;
519 519
520 if (opt->name) 520 if (opt->name) {
521 seq_printf(m, "name=%s,", opt->name); 521 seq_puts(m, "name=");
522 seq_escape(m, opt->name, ", \t\n\\");
523 seq_putc(m, ',');
524 }
522 if (opt->key) 525 if (opt->key)
523 seq_puts(m, "secret=<hidden>,"); 526 seq_puts(m, "secret=<hidden>,");
524 527
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 337ca851a350..b140c092d226 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -297,7 +297,7 @@ static int rpc_complete_task(struct rpc_task *task)
297 clear_bit(RPC_TASK_ACTIVE, &task->tk_runstate); 297 clear_bit(RPC_TASK_ACTIVE, &task->tk_runstate);
298 ret = atomic_dec_and_test(&task->tk_count); 298 ret = atomic_dec_and_test(&task->tk_count);
299 if (waitqueue_active(wq)) 299 if (waitqueue_active(wq))
300 __wake_up_locked_key(wq, TASK_NORMAL, &k); 300 __wake_up_locked_key(wq, TASK_NORMAL, 1, &k);
301 spin_unlock_irqrestore(&wq->lock, flags); 301 spin_unlock_irqrestore(&wq->lock, flags);
302 return ret; 302 return ret;
303} 303}
diff --git a/scripts/Lindent b/scripts/Lindent
index 9c4b3e2b7098..6d889de4e70b 100755
--- a/scripts/Lindent
+++ b/scripts/Lindent
@@ -1,6 +1,9 @@
1#!/bin/sh 1#!/bin/sh
2PARAM="-npro -kr -i8 -ts8 -sob -l80 -ss -ncs -cp1" 2PARAM="-npro -kr -i8 -ts8 -sob -l80 -ss -ncs -cp1"
3RES=`indent --version` 3RES=`indent --version`
4if [ "$RES" = "" ]; then
5 exit 1
6fi
4V1=`echo $RES | cut -d' ' -f3 | cut -d'.' -f1` 7V1=`echo $RES | cut -d' ' -f3 | cut -d'.' -f1`
5V2=`echo $RES | cut -d' ' -f3 | cut -d'.' -f2` 8V2=`echo $RES | cut -d' ' -f3 | cut -d'.' -f2`
6V3=`echo $RES | cut -d' ' -f3 | cut -d'.' -f3` 9V3=`echo $RES | cut -d' ' -f3 | cut -d'.' -f3`
diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh
index 515c4c00e957..00d6d53c2681 100755
--- a/scripts/decode_stacktrace.sh
+++ b/scripts/decode_stacktrace.sh
@@ -14,11 +14,14 @@ declare -A cache
14 14
15parse_symbol() { 15parse_symbol() {
16 # The structure of symbol at this point is: 16 # The structure of symbol at this point is:
17 # [name]+[offset]/[total length] 17 # ([name]+[offset]/[total length])
18 # 18 #
19 # For example: 19 # For example:
20 # do_basic_setup+0x9c/0xbf 20 # do_basic_setup+0x9c/0xbf
21 21
22 # Remove the englobing parenthesis
23 symbol=${symbol#\(}
24 symbol=${symbol%\)}
22 25
23 # Strip the symbol name so that we could look it up 26 # Strip the symbol name so that we could look it up
24 local name=${symbol%+*} 27 local name=${symbol%+*}
diff --git a/scripts/kernel-doc b/scripts/kernel-doc
index a7bf5f68aacb..9a08fb5c1af6 100755
--- a/scripts/kernel-doc
+++ b/scripts/kernel-doc
@@ -469,7 +469,7 @@ sub dump_section {
469 } else { 469 } else {
470# print STDERR "other section '$name' = '$contents'\n"; 470# print STDERR "other section '$name' = '$contents'\n";
471 if (defined($sections{$name}) && ($sections{$name} ne "")) { 471 if (defined($sections{$name}) && ($sections{$name} ne "")) {
472 print STDERR "Error(${file}:$.): duplicate section name '$name'\n"; 472 print STDERR "${file}:$.: error: duplicate section name '$name'\n";
473 ++$errors; 473 ++$errors;
474 } 474 }
475 $sections{$name} = $contents; 475 $sections{$name} = $contents;
@@ -1820,7 +1820,7 @@ sub dump_struct($$) {
1820 }); 1820 });
1821 } 1821 }
1822 else { 1822 else {
1823 print STDERR "Error(${file}:$.): Cannot parse struct or union!\n"; 1823 print STDERR "${file}:$.: error: Cannot parse struct or union!\n";
1824 ++$errors; 1824 ++$errors;
1825 } 1825 }
1826} 1826}
@@ -1841,7 +1841,7 @@ sub dump_enum($$) {
1841 push @parameterlist, $arg; 1841 push @parameterlist, $arg;
1842 if (!$parameterdescs{$arg}) { 1842 if (!$parameterdescs{$arg}) {
1843 $parameterdescs{$arg} = $undescribed; 1843 $parameterdescs{$arg} = $undescribed;
1844 print STDERR "Warning(${file}:$.): Enum value '$arg' ". 1844 print STDERR "${file}:$.: warning: Enum value '$arg' ".
1845 "not described in enum '$declaration_name'\n"; 1845 "not described in enum '$declaration_name'\n";
1846 } 1846 }
1847 1847
@@ -1859,7 +1859,7 @@ sub dump_enum($$) {
1859 }); 1859 });
1860 } 1860 }
1861 else { 1861 else {
1862 print STDERR "Error(${file}:$.): Cannot parse enum!\n"; 1862 print STDERR "${file}:$.: error: Cannot parse enum!\n";
1863 ++$errors; 1863 ++$errors;
1864 } 1864 }
1865} 1865}
@@ -1887,7 +1887,7 @@ sub dump_typedef($$) {
1887 }); 1887 });
1888 } 1888 }
1889 else { 1889 else {
1890 print STDERR "Error(${file}:$.): Cannot parse typedef!\n"; 1890 print STDERR "${file}:$.: error: Cannot parse typedef!\n";
1891 ++$errors; 1891 ++$errors;
1892 } 1892 }
1893} 1893}
@@ -2019,11 +2019,11 @@ sub push_parameter($$$) {
2019 $parameterdescs{$param_name} = $undescribed; 2019 $parameterdescs{$param_name} = $undescribed;
2020 2020
2021 if (($type eq 'function') || ($type eq 'enum')) { 2021 if (($type eq 'function') || ($type eq 'enum')) {
2022 print STDERR "Warning(${file}:$.): Function parameter ". 2022 print STDERR "${file}:$.: warning: Function parameter ".
2023 "or member '$param' not " . 2023 "or member '$param' not " .
2024 "described in '$declaration_name'\n"; 2024 "described in '$declaration_name'\n";
2025 } 2025 }
2026 print STDERR "Warning(${file}:$.):" . 2026 print STDERR "${file}:$.: warning:" .
2027 " No description found for parameter '$param'\n"; 2027 " No description found for parameter '$param'\n";
2028 ++$warnings; 2028 ++$warnings;
2029 } 2029 }
@@ -2074,14 +2074,14 @@ sub check_sections($$$$$$) {
2074 } 2074 }
2075 if ($err) { 2075 if ($err) {
2076 if ($decl_type eq "function") { 2076 if ($decl_type eq "function") {
2077 print STDERR "Warning(${file}:$.): " . 2077 print STDERR "${file}:$.: warning: " .
2078 "Excess function parameter " . 2078 "Excess function parameter " .
2079 "'$sects[$sx]' " . 2079 "'$sects[$sx]' " .
2080 "description in '$decl_name'\n"; 2080 "description in '$decl_name'\n";
2081 ++$warnings; 2081 ++$warnings;
2082 } else { 2082 } else {
2083 if ($nested !~ m/\Q$sects[$sx]\E/) { 2083 if ($nested !~ m/\Q$sects[$sx]\E/) {
2084 print STDERR "Warning(${file}:$.): " . 2084 print STDERR "${file}:$.: warning: " .
2085 "Excess struct/union/enum/typedef member " . 2085 "Excess struct/union/enum/typedef member " .
2086 "'$sects[$sx]' " . 2086 "'$sects[$sx]' " .
2087 "description in '$decl_name'\n"; 2087 "description in '$decl_name'\n";
@@ -2107,7 +2107,7 @@ sub check_return_section {
2107 2107
2108 if (!defined($sections{$section_return}) || 2108 if (!defined($sections{$section_return}) ||
2109 $sections{$section_return} eq "") { 2109 $sections{$section_return} eq "") {
2110 print STDERR "Warning(${file}:$.): " . 2110 print STDERR "${file}:$.: warning: " .
2111 "No description found for return value of " . 2111 "No description found for return value of " .
2112 "'$declaration_name'\n"; 2112 "'$declaration_name'\n";
2113 ++$warnings; 2113 ++$warnings;
@@ -2186,7 +2186,7 @@ sub dump_function($$) {
2186 2186
2187 create_parameterlist($args, ',', $file); 2187 create_parameterlist($args, ',', $file);
2188 } else { 2188 } else {
2189 print STDERR "Warning(${file}:$.): cannot understand function prototype: '$prototype'\n"; 2189 print STDERR "${file}:$.: warning: cannot understand function prototype: '$prototype'\n";
2190 return; 2190 return;
2191 } 2191 }
2192 2192
@@ -2251,7 +2251,7 @@ sub tracepoint_munge($) {
2251 $tracepointargs = $1; 2251 $tracepointargs = $1;
2252 } 2252 }
2253 if (($tracepointname eq 0) || ($tracepointargs eq 0)) { 2253 if (($tracepointname eq 0) || ($tracepointargs eq 0)) {
2254 print STDERR "Warning(${file}:$.): Unrecognized tracepoint format: \n". 2254 print STDERR "${file}:$.: warning: Unrecognized tracepoint format: \n".
2255 "$prototype\n"; 2255 "$prototype\n";
2256 } else { 2256 } else {
2257 $prototype = "static inline void trace_$tracepointname($tracepointargs)"; 2257 $prototype = "static inline void trace_$tracepointname($tracepointargs)";
@@ -2450,7 +2450,7 @@ sub process_file($) {
2450 } 2450 }
2451 2451
2452 if (($declaration_purpose eq "") && $verbose) { 2452 if (($declaration_purpose eq "") && $verbose) {
2453 print STDERR "Warning(${file}:$.): missing initial short description on line:\n"; 2453 print STDERR "${file}:$.: warning: missing initial short description on line:\n";
2454 print STDERR $_; 2454 print STDERR $_;
2455 ++$warnings; 2455 ++$warnings;
2456 } 2456 }
@@ -2468,10 +2468,10 @@ sub process_file($) {
2468 } 2468 }
2469 2469
2470 if ($verbose) { 2470 if ($verbose) {
2471 print STDERR "Info(${file}:$.): Scanning doc for $identifier\n"; 2471 print STDERR "${file}:$.: info: Scanning doc for $identifier\n";
2472 } 2472 }
2473 } else { 2473 } else {
2474 print STDERR "Warning(${file}:$.): Cannot understand $_ on line $.", 2474 print STDERR "${file}:$.: warning: Cannot understand $_ on line $.",
2475 " - I thought it was a doc line\n"; 2475 " - I thought it was a doc line\n";
2476 ++$warnings; 2476 ++$warnings;
2477 $state = 0; 2477 $state = 0;
@@ -2483,7 +2483,7 @@ sub process_file($) {
2483 2483
2484 if (($contents ne "") && ($contents ne "\n")) { 2484 if (($contents ne "") && ($contents ne "\n")) {
2485 if (!$in_doc_sect && $verbose) { 2485 if (!$in_doc_sect && $verbose) {
2486 print STDERR "Warning(${file}:$.): contents before sections\n"; 2486 print STDERR "${file}:$.: warning: contents before sections\n";
2487 ++$warnings; 2487 ++$warnings;
2488 } 2488 }
2489 dump_section($file, $section, xml_escape($contents)); 2489 dump_section($file, $section, xml_escape($contents));
@@ -2509,7 +2509,7 @@ sub process_file($) {
2509 } 2509 }
2510 # look for doc_com + <text> + doc_end: 2510 # look for doc_com + <text> + doc_end:
2511 if ($_ =~ m'\s*\*\s*[a-zA-Z_0-9:\.]+\*/') { 2511 if ($_ =~ m'\s*\*\s*[a-zA-Z_0-9:\.]+\*/') {
2512 print STDERR "Warning(${file}:$.): suspicious ending line: $_"; 2512 print STDERR "${file}:$.: warning: suspicious ending line: $_";
2513 ++$warnings; 2513 ++$warnings;
2514 } 2514 }
2515 2515
@@ -2539,7 +2539,7 @@ sub process_file($) {
2539 } 2539 }
2540 } else { 2540 } else {
2541 # i dont know - bad line? ignore. 2541 # i dont know - bad line? ignore.
2542 print STDERR "Warning(${file}:$.): bad line: $_"; 2542 print STDERR "${file}:$.: warning: bad line: $_";
2543 ++$warnings; 2543 ++$warnings;
2544 } 2544 }
2545 } elsif ($state == 5) { # scanning for split parameters 2545 } elsif ($state == 5) { # scanning for split parameters
@@ -2631,7 +2631,7 @@ sub process_file($) {
2631 } 2631 }
2632 } 2632 }
2633 if ($initial_section_counter == $section_counter) { 2633 if ($initial_section_counter == $section_counter) {
2634 print STDERR "Warning(${file}): no structured comments found\n"; 2634 print STDERR "${file}:1: warning: no structured comments found\n";
2635 if (($function_only == 1) && ($show_not_found == 1)) { 2635 if (($function_only == 1) && ($show_not_found == 1)) {
2636 print STDERR " Was looking for '$_'.\n" for keys %function_table; 2636 print STDERR " Was looking for '$_'.\n" for keys %function_table;
2637 } 2637 }
diff --git a/scripts/spelling.txt b/scripts/spelling.txt
index bb8e4d0a1911..946caf3bd694 100644
--- a/scripts/spelling.txt
+++ b/scripts/spelling.txt
@@ -32,6 +32,7 @@ accoring||according
32accout||account 32accout||account
33accquire||acquire 33accquire||acquire
34accquired||acquired 34accquired||acquired
35accross||across
35acessable||accessible 36acessable||accessible
36acess||access 37acess||access
37achitecture||architecture 38achitecture||architecture
@@ -100,8 +101,10 @@ appropiate||appropriate
100appropriatly||appropriately 101appropriatly||appropriately
101approriate||appropriate 102approriate||appropriate
102approriately||appropriately 103approriately||appropriately
104apropriate||appropriate
103aquainted||acquainted 105aquainted||acquainted
104aquired||acquired 106aquired||acquired
107aquisition||acquisition
105arbitary||arbitrary 108arbitary||arbitrary
106architechture||architecture 109architechture||architecture
107arguement||argument 110arguement||argument
@@ -111,6 +114,8 @@ arne't||aren't
111arraival||arrival 114arraival||arrival
112artifical||artificial 115artifical||artificial
113artillary||artillery 116artillary||artillery
117asign||assign
118assertation||assertion
114assiged||assigned 119assiged||assigned
115assigment||assignment 120assigment||assignment
116assigments||assignments 121assigments||assignments
@@ -136,6 +141,7 @@ automatize||automate
136automatized||automated 141automatized||automated
137automatizes||automates 142automatizes||automates
138autonymous||autonomous 143autonymous||autonomous
144auxillary||auxiliary
139auxilliary||auxiliary 145auxilliary||auxiliary
140avaiable||available 146avaiable||available
141avaible||available 147avaible||available
@@ -187,6 +193,7 @@ capatibilities||capabilities
187carefuly||carefully 193carefuly||carefully
188cariage||carriage 194cariage||carriage
189catagory||category 195catagory||category
196cehck||check
190challange||challenge 197challange||challenge
191challanges||challenges 198challanges||challenges
192chanell||channel 199chanell||channel
@@ -199,6 +206,8 @@ charactor||character
199charater||character 206charater||character
200charaters||characters 207charaters||characters
201charcter||character 208charcter||character
209chcek||check
210chck||check
202checksuming||checksumming 211checksuming||checksumming
203childern||children 212childern||children
204childs||children 213childs||children
@@ -231,6 +240,8 @@ compatability||compatibility
231compatable||compatible 240compatable||compatible
232compatibiliy||compatibility 241compatibiliy||compatibility
233compatibilty||compatibility 242compatibilty||compatibility
243compatiblity||compatibility
244competion||completion
234compilant||compliant 245compilant||compliant
235compleatly||completely 246compleatly||completely
236completly||completely 247completly||completely
@@ -291,6 +302,7 @@ defferred||deferred
291definate||definite 302definate||definite
292definately||definitely 303definately||definitely
293defintion||definition 304defintion||definition
305defintions||definitions
294defualt||default 306defualt||default
295defult||default 307defult||default
296deivce||device 308deivce||device
@@ -306,6 +318,7 @@ depreacted||deprecated
306depreacte||deprecate 318depreacte||deprecate
307desactivate||deactivate 319desactivate||deactivate
308desciptors||descriptors 320desciptors||descriptors
321descripton||description
309descrition||description 322descrition||description
310descritptor||descriptor 323descritptor||descriptor
311desctiptor||descriptor 324desctiptor||descriptor
@@ -327,6 +340,7 @@ devided||divided
327deviece||device 340deviece||device
328diable||disable 341diable||disable
329dictionnary||dictionary 342dictionnary||dictionary
343didnt||didn't
330diferent||different 344diferent||different
331differrence||difference 345differrence||difference
332difinition||definition 346difinition||definition
@@ -344,6 +358,7 @@ docuentation||documentation
344documantation||documentation 358documantation||documentation
345documentaion||documentation 359documentaion||documentation
346documment||document 360documment||document
361doesnt||doesn't
347dorp||drop 362dorp||drop
348dosen||doesn 363dosen||doesn
349downlad||download 364downlad||download
@@ -450,11 +465,13 @@ grahical||graphical
450grahpical||graphical 465grahpical||graphical
451grapic||graphic 466grapic||graphic
452guage||gauge 467guage||gauge
468guarenteed||guaranteed
453guarentee||guarantee 469guarentee||guarantee
454halfs||halves 470halfs||halves
455hander||handler 471hander||handler
456handfull||handful 472handfull||handful
457hanled||handled 473hanled||handled
474happend||happened
458harware||hardware 475harware||hardware
459heirarchically||hierarchically 476heirarchically||hierarchically
460helpfull||helpful 477helpfull||helpful
@@ -512,6 +529,7 @@ initialzed||initialized
512initilization||initialization 529initilization||initialization
513initilize||initialize 530initilize||initialize
514inofficial||unofficial 531inofficial||unofficial
532insititute||institute
515instal||install 533instal||install
516inteface||interface 534inteface||interface
517integreated||integrated 535integreated||integrated
@@ -546,6 +564,7 @@ invididual||individual
546invokation||invocation 564invokation||invocation
547invokations||invocations 565invokations||invocations
548irrelevent||irrelevant 566irrelevent||irrelevant
567isnt||isn't
549isssue||issue 568isssue||issue
550itslef||itself 569itslef||itself
551jave||java 570jave||java
@@ -558,6 +577,7 @@ langauage||language
558langauge||language 577langauge||language
559langugage||language 578langugage||language
560lauch||launch 579lauch||launch
580layed||laid
561leightweight||lightweight 581leightweight||lightweight
562lengh||length 582lengh||length
563lenght||length 583lenght||length
@@ -714,6 +734,7 @@ preceeding||preceding
714preceed||precede 734preceed||precede
715precendence||precedence 735precendence||precedence
716precission||precision 736precission||precision
737preemptable||preemptible
717prefered||preferred 738prefered||preferred
718prefferably||preferably 739prefferably||preferably
719premption||preemption 740premption||preemption
@@ -744,6 +765,7 @@ programers||programmers
744programm||program 765programm||program
745programms||programs 766programms||programs
746progresss||progress 767progresss||progress
768promiscous||promiscuous
747promps||prompts 769promps||prompts
748pronnounced||pronounced 770pronnounced||pronounced
749prononciation||pronunciation 771prononciation||pronunciation
@@ -817,6 +839,7 @@ reseting||resetting
817resizeable||resizable 839resizeable||resizable
818resouces||resources 840resouces||resources
819resoures||resources 841resoures||resources
842responce||response
820ressizes||resizes 843ressizes||resizes
821ressource||resource 844ressource||resource
822ressources||resources 845ressources||resources
@@ -869,6 +892,7 @@ setts||sets
869settting||setting 892settting||setting
870shotdown||shutdown 893shotdown||shutdown
871shoud||should 894shoud||should
895shouldnt||shouldn't
872shoule||should 896shoule||should
873shrinked||shrunk 897shrinked||shrunk
874siginificantly||significantly 898siginificantly||significantly
@@ -913,9 +937,11 @@ straming||streaming
913struc||struct 937struc||struct
914structres||structures 938structres||structures
915stuct||struct 939stuct||struct
940stucture||structure
916sturcture||structure 941sturcture||structure
917subdirectoires||subdirectories 942subdirectoires||subdirectories
918suble||subtle 943suble||subtle
944substract||subtract
919succesfully||successfully 945succesfully||successfully
920succesful||successful 946succesful||successful
921successfull||successful 947successfull||successful
@@ -987,6 +1013,7 @@ unexpectd||unexpected
987unexpeted||unexpected 1013unexpeted||unexpected
988unfortunatelly||unfortunately 1014unfortunatelly||unfortunately
989unifiy||unify 1015unifiy||unify
1016unintialized||uninitialized
990unknonw||unknown 1017unknonw||unknown
991unknow||unknown 1018unknow||unknown
992unkown||unknown 1019unkown||unknown
@@ -1027,7 +1054,9 @@ virtiual||virtual
1027visiters||visitors 1054visiters||visitors
1028vitual||virtual 1055vitual||virtual
1029wating||waiting 1056wating||waiting
1057wether||whether
1030whataver||whatever 1058whataver||whatever
1059whcih||which
1031whenver||whenever 1060whenver||whenever
1032wheter||whether 1061wheter||whether
1033whe||when 1062whe||when
diff --git a/security/commoncap.c b/security/commoncap.c
index d103f5a4043d..1832cf701c3d 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -267,6 +267,16 @@ int cap_capset(struct cred *new,
267 new->cap_effective = *effective; 267 new->cap_effective = *effective;
268 new->cap_inheritable = *inheritable; 268 new->cap_inheritable = *inheritable;
269 new->cap_permitted = *permitted; 269 new->cap_permitted = *permitted;
270
271 /*
272 * Mask off ambient bits that are no longer both permitted and
273 * inheritable.
274 */
275 new->cap_ambient = cap_intersect(new->cap_ambient,
276 cap_intersect(*permitted,
277 *inheritable));
278 if (WARN_ON(!cap_ambient_invariant_ok(new)))
279 return -EINVAL;
270 return 0; 280 return 0;
271} 281}
272 282
@@ -347,6 +357,7 @@ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
347 357
348 /* 358 /*
349 * pP' = (X & fP) | (pI & fI) 359 * pP' = (X & fP) | (pI & fI)
360 * The addition of pA' is handled later.
350 */ 361 */
351 new->cap_permitted.cap[i] = 362 new->cap_permitted.cap[i] =
352 (new->cap_bset.cap[i] & permitted) | 363 (new->cap_bset.cap[i] & permitted) |
@@ -474,10 +485,13 @@ int cap_bprm_set_creds(struct linux_binprm *bprm)
474{ 485{
475 const struct cred *old = current_cred(); 486 const struct cred *old = current_cred();
476 struct cred *new = bprm->cred; 487 struct cred *new = bprm->cred;
477 bool effective, has_cap = false; 488 bool effective, has_cap = false, is_setid;
478 int ret; 489 int ret;
479 kuid_t root_uid; 490 kuid_t root_uid;
480 491
492 if (WARN_ON(!cap_ambient_invariant_ok(old)))
493 return -EPERM;
494
481 effective = false; 495 effective = false;
482 ret = get_file_caps(bprm, &effective, &has_cap); 496 ret = get_file_caps(bprm, &effective, &has_cap);
483 if (ret < 0) 497 if (ret < 0)
@@ -522,8 +536,9 @@ skip:
522 * 536 *
523 * In addition, if NO_NEW_PRIVS, then ensure we get no new privs. 537 * In addition, if NO_NEW_PRIVS, then ensure we get no new privs.
524 */ 538 */
525 if ((!uid_eq(new->euid, old->uid) || 539 is_setid = !uid_eq(new->euid, old->uid) || !gid_eq(new->egid, old->gid);
526 !gid_eq(new->egid, old->gid) || 540
541 if ((is_setid ||
527 !cap_issubset(new->cap_permitted, old->cap_permitted)) && 542 !cap_issubset(new->cap_permitted, old->cap_permitted)) &&
528 bprm->unsafe & ~LSM_UNSAFE_PTRACE_CAP) { 543 bprm->unsafe & ~LSM_UNSAFE_PTRACE_CAP) {
529 /* downgrade; they get no more than they had, and maybe less */ 544 /* downgrade; they get no more than they had, and maybe less */
@@ -539,10 +554,28 @@ skip:
539 new->suid = new->fsuid = new->euid; 554 new->suid = new->fsuid = new->euid;
540 new->sgid = new->fsgid = new->egid; 555 new->sgid = new->fsgid = new->egid;
541 556
557 /* File caps or setid cancels ambient. */
558 if (has_cap || is_setid)
559 cap_clear(new->cap_ambient);
560
561 /*
562 * Now that we've computed pA', update pP' to give:
563 * pP' = (X & fP) | (pI & fI) | pA'
564 */
565 new->cap_permitted = cap_combine(new->cap_permitted, new->cap_ambient);
566
567 /*
568 * Set pE' = (fE ? pP' : pA'). Because pA' is zero if fE is set,
569 * this is the same as pE' = (fE ? pP' : 0) | pA'.
570 */
542 if (effective) 571 if (effective)
543 new->cap_effective = new->cap_permitted; 572 new->cap_effective = new->cap_permitted;
544 else 573 else
545 cap_clear(new->cap_effective); 574 new->cap_effective = new->cap_ambient;
575
576 if (WARN_ON(!cap_ambient_invariant_ok(new)))
577 return -EPERM;
578
546 bprm->cap_effective = effective; 579 bprm->cap_effective = effective;
547 580
548 /* 581 /*
@@ -557,7 +590,7 @@ skip:
557 * Number 1 above might fail if you don't have a full bset, but I think 590 * Number 1 above might fail if you don't have a full bset, but I think
558 * that is interesting information to audit. 591 * that is interesting information to audit.
559 */ 592 */
560 if (!cap_isclear(new->cap_effective)) { 593 if (!cap_issubset(new->cap_effective, new->cap_ambient)) {
561 if (!cap_issubset(CAP_FULL_SET, new->cap_effective) || 594 if (!cap_issubset(CAP_FULL_SET, new->cap_effective) ||
562 !uid_eq(new->euid, root_uid) || !uid_eq(new->uid, root_uid) || 595 !uid_eq(new->euid, root_uid) || !uid_eq(new->uid, root_uid) ||
563 issecure(SECURE_NOROOT)) { 596 issecure(SECURE_NOROOT)) {
@@ -568,6 +601,10 @@ skip:
568 } 601 }
569 602
570 new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); 603 new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
604
605 if (WARN_ON(!cap_ambient_invariant_ok(new)))
606 return -EPERM;
607
571 return 0; 608 return 0;
572} 609}
573 610
@@ -589,7 +626,7 @@ int cap_bprm_secureexec(struct linux_binprm *bprm)
589 if (!uid_eq(cred->uid, root_uid)) { 626 if (!uid_eq(cred->uid, root_uid)) {
590 if (bprm->cap_effective) 627 if (bprm->cap_effective)
591 return 1; 628 return 1;
592 if (!cap_isclear(cred->cap_permitted)) 629 if (!cap_issubset(cred->cap_permitted, cred->cap_ambient))
593 return 1; 630 return 1;
594 } 631 }
595 632
@@ -691,10 +728,18 @@ static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old)
691 uid_eq(old->suid, root_uid)) && 728 uid_eq(old->suid, root_uid)) &&
692 (!uid_eq(new->uid, root_uid) && 729 (!uid_eq(new->uid, root_uid) &&
693 !uid_eq(new->euid, root_uid) && 730 !uid_eq(new->euid, root_uid) &&
694 !uid_eq(new->suid, root_uid)) && 731 !uid_eq(new->suid, root_uid))) {
695 !issecure(SECURE_KEEP_CAPS)) { 732 if (!issecure(SECURE_KEEP_CAPS)) {
696 cap_clear(new->cap_permitted); 733 cap_clear(new->cap_permitted);
697 cap_clear(new->cap_effective); 734 cap_clear(new->cap_effective);
735 }
736
737 /*
738 * Pre-ambient programs expect setresuid to nonroot followed
739 * by exec to drop capabilities. We should make sure that
740 * this remains the case.
741 */
742 cap_clear(new->cap_ambient);
698 } 743 }
699 if (uid_eq(old->euid, root_uid) && !uid_eq(new->euid, root_uid)) 744 if (uid_eq(old->euid, root_uid) && !uid_eq(new->euid, root_uid))
700 cap_clear(new->cap_effective); 745 cap_clear(new->cap_effective);
@@ -924,6 +969,44 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
924 new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); 969 new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
925 return commit_creds(new); 970 return commit_creds(new);
926 971
972 case PR_CAP_AMBIENT:
973 if (arg2 == PR_CAP_AMBIENT_CLEAR_ALL) {
974 if (arg3 | arg4 | arg5)
975 return -EINVAL;
976
977 new = prepare_creds();
978 if (!new)
979 return -ENOMEM;
980 cap_clear(new->cap_ambient);
981 return commit_creds(new);
982 }
983
984 if (((!cap_valid(arg3)) | arg4 | arg5))
985 return -EINVAL;
986
987 if (arg2 == PR_CAP_AMBIENT_IS_SET) {
988 return !!cap_raised(current_cred()->cap_ambient, arg3);
989 } else if (arg2 != PR_CAP_AMBIENT_RAISE &&
990 arg2 != PR_CAP_AMBIENT_LOWER) {
991 return -EINVAL;
992 } else {
993 if (arg2 == PR_CAP_AMBIENT_RAISE &&
994 (!cap_raised(current_cred()->cap_permitted, arg3) ||
995 !cap_raised(current_cred()->cap_inheritable,
996 arg3) ||
997 issecure(SECURE_NO_CAP_AMBIENT_RAISE)))
998 return -EPERM;
999
1000 new = prepare_creds();
1001 if (!new)
1002 return -ENOMEM;
1003 if (arg2 == PR_CAP_AMBIENT_RAISE)
1004 cap_raise(new->cap_ambient, arg3);
1005 else
1006 cap_lower(new->cap_ambient, arg3);
1007 return commit_creds(new);
1008 }
1009
927 default: 1010 default:
928 /* No functionality available - continue with default */ 1011 /* No functionality available - continue with default */
929 return -ENOSYS; 1012 return -ENOSYS;
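
The PR_CAP_AMBIENT prctl interface added above can be exercised from userspace roughly as in the sketch below. This example is illustrative and not part of the merge; it assumes CAP_NET_BIND_SERVICE has already been raised in the caller's permitted and inheritable sets (for instance via libcap-ng, as the selftests further down do), since PR_CAP_AMBIENT_RAISE is otherwise rejected with EPERM. After a successful raise, an execve() of a non-setid binary with no file capabilities keeps the bit in pA, and for a non-root caller the rules in cap_bprm_set_creds() then yield pP' = pA' and pE' = pA', which is what the non-root "+ia => eipa" selftest case checks.

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/prctl.h>
#include <linux/capability.h>

#ifndef PR_CAP_AMBIENT
#define PR_CAP_AMBIENT			47
# define PR_CAP_AMBIENT_IS_SET		1
# define PR_CAP_AMBIENT_RAISE		2
# define PR_CAP_AMBIENT_LOWER		3
# define PR_CAP_AMBIENT_CLEAR_ALL	4
#endif

int main(int argc, char **argv)
{
	if (argc < 2) {
		fprintf(stderr, "usage: %s <program> [args...]\n", argv[0]);
		return 1;
	}

	/*
	 * Raise the ambient bit.  Per cap_task_prctl() above this fails
	 * with EPERM unless the capability is already in pP and pI, and
	 * with EINVAL on kernels without ambient capability support.
	 */
	if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE,
		  CAP_NET_BIND_SERVICE, 0, 0) != 0) {
		fprintf(stderr, "PR_CAP_AMBIENT_RAISE: %s\n", strerror(errno));
		return 1;
	}

	/*
	 * The ambient bit survives execve() of a non-setid binary without
	 * file capabilities, so the new program starts with the capability
	 * in its permitted, effective and ambient sets.
	 */
	execvp(argv[1], &argv[1]);
	perror("execvp");
	return 1;
}

A launcher like this, with the capability pre-granted by a privileged parent, lets an otherwise unprivileged, capability-less binary bind low ports after exec; the program structure and error handling here are purely illustrative.
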
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index bd536cb221e2..43b4cddbf2b3 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -848,6 +848,7 @@ void key_change_session_keyring(struct callback_head *twork)
848 new->cap_inheritable = old->cap_inheritable; 848 new->cap_inheritable = old->cap_inheritable;
849 new->cap_permitted = old->cap_permitted; 849 new->cap_permitted = old->cap_permitted;
850 new->cap_effective = old->cap_effective; 850 new->cap_effective = old->cap_effective;
851 new->cap_ambient = old->cap_ambient;
851 new->cap_bset = old->cap_bset; 852 new->cap_bset = old->cap_bset;
852 853
853 new->jit_keyring = old->jit_keyring; 854 new->jit_keyring = old->jit_keyring;
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 564079c5c49d..cdf4c589a391 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1100,7 +1100,7 @@ static void selinux_write_opts(struct seq_file *m,
1100 seq_puts(m, prefix); 1100 seq_puts(m, prefix);
1101 if (has_comma) 1101 if (has_comma)
1102 seq_putc(m, '\"'); 1102 seq_putc(m, '\"');
1103 seq_puts(m, opts->mnt_opts[i]); 1103 seq_escape(m, opts->mnt_opts[i], "\"\n\\");
1104 if (has_comma) 1104 if (has_comma)
1105 seq_putc(m, '\"'); 1105 seq_putc(m, '\"');
1106 } 1106 }
diff --git a/tools/testing/selftests/capabilities/.gitignore b/tools/testing/selftests/capabilities/.gitignore
new file mode 100644
index 000000000000..b732dd0d4738
--- /dev/null
+++ b/tools/testing/selftests/capabilities/.gitignore
@@ -0,0 +1,2 @@
1test_execve
2validate_cap
diff --git a/tools/testing/selftests/capabilities/Makefile b/tools/testing/selftests/capabilities/Makefile
new file mode 100644
index 000000000000..8c8f0c1f0889
--- /dev/null
+++ b/tools/testing/selftests/capabilities/Makefile
@@ -0,0 +1,18 @@
1all:
2
3include ../lib.mk
4
5.PHONY: all clean
6
7TARGETS := validate_cap test_execve
8TEST_PROGS := test_execve
9
10CFLAGS := -O2 -g -std=gnu99 -Wall -lcap-ng
11
12all: $(TARGETS)
13
14clean:
15 $(RM) $(TARGETS)
16
17$(TARGETS): %: %.c
18 $(CC) -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl
diff --git a/tools/testing/selftests/capabilities/test_execve.c b/tools/testing/selftests/capabilities/test_execve.c
new file mode 100644
index 000000000000..10a21a958aaf
--- /dev/null
+++ b/tools/testing/selftests/capabilities/test_execve.c
@@ -0,0 +1,427 @@
1#define _GNU_SOURCE
2
3#include <cap-ng.h>
4#include <err.h>
5#include <linux/capability.h>
6#include <stdbool.h>
7#include <string.h>
8#include <stdio.h>
9#include <fcntl.h>
10#include <errno.h>
11#include <stdarg.h>
12#include <sched.h>
13#include <sys/mount.h>
14#include <limits.h>
15#include <libgen.h>
16#include <malloc.h>
17#include <sys/wait.h>
18#include <sys/prctl.h>
19#include <sys/stat.h>
20
21#ifndef PR_CAP_AMBIENT
22#define PR_CAP_AMBIENT 47
23# define PR_CAP_AMBIENT_IS_SET 1
24# define PR_CAP_AMBIENT_RAISE 2
25# define PR_CAP_AMBIENT_LOWER 3
26# define PR_CAP_AMBIENT_CLEAR_ALL 4
27#endif
28
29static int nerrs;
30
31static void vmaybe_write_file(bool enoent_ok, char *filename, char *fmt, va_list ap)
32{
33 char buf[4096];
34 int fd;
35 ssize_t written;
36 int buf_len;
37
38 buf_len = vsnprintf(buf, sizeof(buf), fmt, ap);
39 if (buf_len < 0) {
40 err(1, "vsnprintf failed");
41 }
42 if (buf_len >= sizeof(buf)) {
43 errx(1, "vsnprintf output truncated");
44 }
45
46 fd = open(filename, O_WRONLY);
47 if (fd < 0) {
48 if ((errno == ENOENT) && enoent_ok)
49 return;
50 err(1, "open of %s failed", filename);
51 }
52 written = write(fd, buf, buf_len);
53 if (written != buf_len) {
54 if (written >= 0) {
55 errx(1, "short write to %s", filename);
56 } else {
57 err(1, "write to %s failed", filename);
58 }
59 }
60 if (close(fd) != 0) {
61 err(1, "close of %s failed", filename);
62 }
63}
64
65static void maybe_write_file(char *filename, char *fmt, ...)
66{
67 va_list ap;
68
69 va_start(ap, fmt);
70 vmaybe_write_file(true, filename, fmt, ap);
71 va_end(ap);
72}
73
74static void write_file(char *filename, char *fmt, ...)
75{
76 va_list ap;
77
78 va_start(ap, fmt);
79 vmaybe_write_file(false, filename, fmt, ap);
80 va_end(ap);
81}
82
83static bool create_and_enter_ns(uid_t inner_uid)
84{
85 uid_t outer_uid;
86 gid_t outer_gid;
87 int i;
88 bool have_outer_privilege;
89
90 outer_uid = getuid();
91 outer_gid = getgid();
92
93 /*
94 * TODO: If we're already root, we could skip creating the userns.
95 */
96
97 if (unshare(CLONE_NEWNS) == 0) {
98 printf("[NOTE]\tUsing global UIDs for tests\n");
99 if (prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0) != 0)
100 err(1, "PR_SET_KEEPCAPS");
101 if (setresuid(inner_uid, inner_uid, -1) != 0)
102 err(1, "setresuid");
103
104 // Re-enable effective caps
105 capng_get_caps_process();
106 for (i = 0; i < CAP_LAST_CAP; i++)
107 if (capng_have_capability(CAPNG_PERMITTED, i))
108 capng_update(CAPNG_ADD, CAPNG_EFFECTIVE, i);
109 if (capng_apply(CAPNG_SELECT_CAPS) != 0)
110 err(1, "capng_apply");
111
112 have_outer_privilege = true;
113 } else if (unshare(CLONE_NEWUSER | CLONE_NEWNS) == 0) {
114 printf("[NOTE]\tUsing a user namespace for tests\n");
115 maybe_write_file("/proc/self/setgroups", "deny");
116 write_file("/proc/self/uid_map", "%d %d 1", inner_uid, outer_uid);
117 write_file("/proc/self/gid_map", "0 %d 1", outer_gid);
118
119 have_outer_privilege = false;
120 } else {
121 errx(1, "must be root or be able to create a userns");
122 }
123
124 if (mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL) != 0)
125 err(1, "remount everything private");
126
127 return have_outer_privilege;
128}
129
130static void chdir_to_tmpfs(void)
131{
132 char cwd[PATH_MAX];
133 if (getcwd(cwd, sizeof(cwd)) != cwd)
134 err(1, "getcwd");
135
136 if (mount("private_tmp", ".", "tmpfs", 0, "mode=0777") != 0)
137 err(1, "mount private tmpfs");
138
139 if (chdir(cwd) != 0)
140 err(1, "chdir to private tmpfs");
141
142 if (umount2(".", MNT_DETACH) != 0)
143 err(1, "detach private tmpfs");
144}
145
146static void copy_fromat_to(int fromfd, const char *fromname, const char *toname)
147{
148 int from = openat(fromfd, fromname, O_RDONLY);
149 if (from == -1)
150 err(1, "open copy source");
151
152 int to = open(toname, O_CREAT | O_WRONLY | O_EXCL, 0700);
153
154 while (true) {
155 char buf[4096];
156 ssize_t sz = read(from, buf, sizeof(buf));
157 if (sz == 0)
158 break;
159 if (sz < 0)
160 err(1, "read");
161
162 if (write(to, buf, sz) != sz)
163 err(1, "write"); /* no short writes on tmpfs */
164 }
165
166 close(from);
167 close(to);
168}
169
170static bool fork_wait(void)
171{
172 pid_t child = fork();
173 if (child == 0) {
174 nerrs = 0;
175 return true;
176 } else if (child > 0) {
177 int status;
178 if (waitpid(child, &status, 0) != child ||
179 !WIFEXITED(status)) {
180 printf("[FAIL]\tChild died\n");
181 nerrs++;
182 } else if (WEXITSTATUS(status) != 0) {
183 printf("[FAIL]\tChild failed\n");
184 nerrs++;
185 } else {
186 printf("[OK]\tChild succeeded\n");
187 }
188
189 return false;
190 } else {
191 err(1, "fork");
192 }
193}
194
195static void exec_other_validate_cap(const char *name,
196 bool eff, bool perm, bool inh, bool ambient)
197{
198 execl(name, name, (eff ? "1" : "0"),
199 (perm ? "1" : "0"), (inh ? "1" : "0"), (ambient ? "1" : "0"),
200 NULL);
201 err(1, "execl");
202}
203
204static void exec_validate_cap(bool eff, bool perm, bool inh, bool ambient)
205{
206 exec_other_validate_cap("./validate_cap", eff, perm, inh, ambient);
207}
208
209static int do_tests(int uid, const char *our_path)
210{
211 bool have_outer_privilege = create_and_enter_ns(uid);
212
213 int ourpath_fd = open(our_path, O_RDONLY | O_DIRECTORY);
214 if (ourpath_fd == -1)
215 err(1, "open '%s'", our_path);
216
217 chdir_to_tmpfs();
218
219 copy_fromat_to(ourpath_fd, "validate_cap", "validate_cap");
220
221 if (have_outer_privilege) {
222 uid_t gid = getegid();
223
224 copy_fromat_to(ourpath_fd, "validate_cap",
225 "validate_cap_suidroot");
226 if (chown("validate_cap_suidroot", 0, -1) != 0)
227 err(1, "chown");
228 if (chmod("validate_cap_suidroot", S_ISUID | 0700) != 0)
229 err(1, "chmod");
230
231 copy_fromat_to(ourpath_fd, "validate_cap",
232 "validate_cap_suidnonroot");
233 if (chown("validate_cap_suidnonroot", uid + 1, -1) != 0)
234 err(1, "chown");
235 if (chmod("validate_cap_suidnonroot", S_ISUID | 0700) != 0)
236 err(1, "chmod");
237
238 copy_fromat_to(ourpath_fd, "validate_cap",
239 "validate_cap_sgidroot");
240 if (chown("validate_cap_sgidroot", -1, 0) != 0)
241 err(1, "chown");
242 if (chmod("validate_cap_sgidroot", S_ISGID | 0710) != 0)
243 err(1, "chmod");
244
245 copy_fromat_to(ourpath_fd, "validate_cap",
246 "validate_cap_sgidnonroot");
247 if (chown("validate_cap_sgidnonroot", -1, gid + 1) != 0)
248 err(1, "chown");
249 if (chmod("validate_cap_sgidnonroot", S_ISGID | 0710) != 0)
250 err(1, "chmod");
251}
252
253 capng_get_caps_process();
254
255 /* Make sure that i starts out clear */
256 capng_update(CAPNG_DROP, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE);
257 if (capng_apply(CAPNG_SELECT_CAPS) != 0)
258 err(1, "capng_apply");
259
260 if (uid == 0) {
261 printf("[RUN]\tRoot => ep\n");
262 if (fork_wait())
263 exec_validate_cap(true, true, false, false);
264 } else {
265 printf("[RUN]\tNon-root => no caps\n");
266 if (fork_wait())
267 exec_validate_cap(false, false, false, false);
268 }
269
270 printf("[OK]\tCheck cap_ambient manipulation rules\n");
271
272 /* We should not be able to add ambient caps yet. */
273 if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != -1 || errno != EPERM) {
274 if (errno == EINVAL)
275 printf("[FAIL]\tPR_CAP_AMBIENT_RAISE isn't supported\n");
276 else
277 printf("[FAIL]\tPR_CAP_AMBIENT_RAISE should have failed with EPERM on a non-inheritable cap\n");
278 return 1;
279 }
280 printf("[OK]\tPR_CAP_AMBIENT_RAISE failed on non-inheritable cap\n");
281
282 capng_update(CAPNG_ADD, CAPNG_INHERITABLE, CAP_NET_RAW);
283 capng_update(CAPNG_DROP, CAPNG_PERMITTED, CAP_NET_RAW);
284 capng_update(CAPNG_DROP, CAPNG_EFFECTIVE, CAP_NET_RAW);
285 if (capng_apply(CAPNG_SELECT_CAPS) != 0)
286 err(1, "capng_apply");
287 if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_RAW, 0, 0, 0) != -1 || errno != EPERM) {
288 printf("[FAIL]\tPR_CAP_AMBIENT_RAISE should have failed on a non-permitted cap\n");
289 return 1;
290 }
291 printf("[OK]\tPR_CAP_AMBIENT_RAISE failed on non-permitted cap\n");
292
293 capng_update(CAPNG_ADD, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE);
294 if (capng_apply(CAPNG_SELECT_CAPS) != 0)
295 err(1, "capng_apply");
296 if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) {
297 printf("[FAIL]\tPR_CAP_AMBIENT_RAISE should have succeeded\n");
298 return 1;
299 }
300 printf("[OK]\tPR_CAP_AMBIENT_RAISE worked\n");
301
302 if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != 1) {
303 printf("[FAIL]\tPR_CAP_AMBIENT_IS_SET is broken\n");
304 return 1;
305 }
306
307 if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0, 0) != 0)
308 err(1, "PR_CAP_AMBIENT_CLEAR_ALL");
309
310 if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) {
311 printf("[FAIL]\tPR_CAP_AMBIENT_CLEAR_ALL didn't work\n");
312 return 1;
313 }
314
315 if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0)
316 err(1, "PR_CAP_AMBIENT_RAISE");
317
318 capng_update(CAPNG_DROP, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE);
319 if (capng_apply(CAPNG_SELECT_CAPS) != 0)
320 err(1, "capng_apply");
321
322 if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) {
323 printf("[FAIL]\tDropping I should have dropped A\n");
324 return 1;
325 }
326
327 printf("[OK]\tBasic manipulation appears to work\n");
328
329 capng_update(CAPNG_ADD, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE);
330 if (capng_apply(CAPNG_SELECT_CAPS) != 0)
331 err(1, "capng_apply");
332 if (uid == 0) {
333 printf("[RUN]\tRoot +i => eip\n");
334 if (fork_wait())
335 exec_validate_cap(true, true, true, false);
336 } else {
337 printf("[RUN]\tNon-root +i => i\n");
338 if (fork_wait())
339 exec_validate_cap(false, false, true, false);
340 }
341
342 if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0)
343 err(1, "PR_CAP_AMBIENT_RAISE");
344
345 printf("[RUN]\tUID %d +ia => eipa\n", uid);
346 if (fork_wait())
347 exec_validate_cap(true, true, true, true);
348
349 /* The remaining tests need real privilege */
350
351 if (!have_outer_privilege) {
352 printf("[SKIP]\tSUID/SGID tests (needs privilege)\n");
353 goto done;
354 }
355
356 if (uid == 0) {
357 printf("[RUN]\tRoot +ia, suidroot => eipa\n");
358 if (fork_wait())
359 exec_other_validate_cap("./validate_cap_suidroot",
360 true, true, true, true);
361
362 printf("[RUN]\tRoot +ia, suidnonroot => ip\n");
363 if (fork_wait())
364 exec_other_validate_cap("./validate_cap_suidnonroot",
365 false, true, true, false);
366
367 printf("[RUN]\tRoot +ia, sgidroot => eipa\n");
368 if (fork_wait())
369 exec_other_validate_cap("./validate_cap_sgidroot",
370 true, true, true, true);
371
372 if (fork_wait()) {
373 printf("[RUN]\tRoot, gid != 0, +ia, sgidroot => eip\n");
374 if (setresgid(1, 1, 1) != 0)
375 err(1, "setresgid");
376 exec_other_validate_cap("./validate_cap_sgidroot",
377 true, true, true, false);
378 }
379
380 printf("[RUN]\tRoot +ia, sgidnonroot => eip\n");
381 if (fork_wait())
382 exec_other_validate_cap("./validate_cap_sgidnonroot",
383 true, true, true, false);
384 } else {
385 printf("[RUN]\tNon-root +ia, sgidnonroot => i\n");
386 exec_other_validate_cap("./validate_cap_sgidnonroot",
387 false, false, true, false);
388
389 if (fork_wait()) {
390 printf("[RUN]\tNon-root +ia, sgidroot => i\n");
391 if (setresgid(1, 1, 1) != 0)
392 err(1, "setresgid");
393 exec_other_validate_cap("./validate_cap_sgidroot",
394 false, false, true, false);
395 }
396 }
397
398done:
399 return nerrs ? 1 : 0;
400}
401
402int main(int argc, char **argv)
403{
404 char *tmp1, *tmp2, *our_path;
405
406 /* Find our path */
407 tmp1 = strdup(argv[0]);
408 if (!tmp1)
409 err(1, "strdup");
410 tmp2 = dirname(tmp1);
411 our_path = strdup(tmp2);
412 if (!our_path)
413 err(1, "strdup");
414 free(tmp1);
415
416 if (fork_wait()) {
417 printf("[RUN]\t+++ Tests with uid == 0 +++\n");
418 return do_tests(0, our_path);
419 }
420
421 if (fork_wait()) {
422 printf("[RUN]\t+++ Tests with uid != 0 +++\n");
423 return do_tests(1, our_path);
424 }
425
426 return nerrs ? 1 : 0;
427}
diff --git a/tools/testing/selftests/capabilities/validate_cap.c b/tools/testing/selftests/capabilities/validate_cap.c
new file mode 100644
index 000000000000..dd3c45f7b23c
--- /dev/null
+++ b/tools/testing/selftests/capabilities/validate_cap.c
@@ -0,0 +1,73 @@
1#include <cap-ng.h>
2#include <err.h>
3#include <linux/capability.h>
4#include <stdbool.h>
5#include <string.h>
6#include <stdio.h>
7#include <sys/prctl.h>
8#include <sys/auxv.h>
9
10#ifndef PR_CAP_AMBIENT
11#define PR_CAP_AMBIENT 47
12# define PR_CAP_AMBIENT_IS_SET 1
13# define PR_CAP_AMBIENT_RAISE 2
14# define PR_CAP_AMBIENT_LOWER 3
15# define PR_CAP_AMBIENT_CLEAR_ALL 4
16#endif
17
18#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 19)
19# define HAVE_GETAUXVAL
20#endif
21
22static bool bool_arg(char **argv, int i)
23{
24 if (!strcmp(argv[i], "0"))
25 return false;
26 else if (!strcmp(argv[i], "1"))
27 return true;
28 else
29 errx(1, "wrong argv[%d]", i);
30}
31
32int main(int argc, char **argv)
33{
34 const char *atsec = "";
35
36 /*
37 * Be careful just in case a setgid or setcapped copy of this
38 * helper gets out.
39 */
40
41 if (argc != 5)
42 errx(1, "wrong argc");
43
44#ifdef HAVE_GETAUXVAL
45 if (getauxval(AT_SECURE))
46 atsec = " (AT_SECURE is set)";
47 else
48 atsec = " (AT_SECURE is not set)";
49#endif
50
51 capng_get_caps_process();
52
53 if (capng_have_capability(CAPNG_EFFECTIVE, CAP_NET_BIND_SERVICE) != bool_arg(argv, 1)) {
54 printf("[FAIL]\tWrong effective state%s\n", atsec);
55 return 1;
56 }
57 if (capng_have_capability(CAPNG_PERMITTED, CAP_NET_BIND_SERVICE) != bool_arg(argv, 2)) {
58 printf("[FAIL]\tWrong permitted state%s\n", atsec);
59 return 1;
60 }
61 if (capng_have_capability(CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE) != bool_arg(argv, 3)) {
62 printf("[FAIL]\tWrong inheritable state%s\n", atsec);
63 return 1;
64 }
65
66 if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != bool_arg(argv, 4)) {
67 printf("[FAIL]\tWrong ambient state%s\n", atsec);
68 return 1;
69 }
70
71 printf("[OK]\tCapabilities after execve were correct\n");
72 return 0;
73}
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index 231b9a031f6a..0d6854744b37 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -8,10 +8,13 @@ BINARIES += hugetlbfstest
8BINARIES += map_hugetlb 8BINARIES += map_hugetlb
9BINARIES += thuge-gen 9BINARIES += thuge-gen
10BINARIES += transhuge-stress 10BINARIES += transhuge-stress
11BINARIES += userfaultfd
11 12
12all: $(BINARIES) 13all: $(BINARIES)
13%: %.c 14%: %.c
14 $(CC) $(CFLAGS) -o $@ $^ -lrt 15 $(CC) $(CFLAGS) -o $@ $^ -lrt
16userfaultfd: userfaultfd.c
17 $(CC) $(CFLAGS) -O2 -o $@ $^ -lpthread
15 18
16TEST_PROGS := run_vmtests 19TEST_PROGS := run_vmtests
17TEST_FILES := $(BINARIES) 20TEST_FILES := $(BINARIES)
diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests
index 49ece11ff7fd..831adeb5fc55 100755
--- a/tools/testing/selftests/vm/run_vmtests
+++ b/tools/testing/selftests/vm/run_vmtests
@@ -86,6 +86,17 @@ else
86 echo "[PASS]" 86 echo "[PASS]"
87fi 87fi
88 88
89echo "--------------------"
90echo "running userfaultfd"
91echo "--------------------"
92./userfaultfd 128 32
93if [ $? -ne 0 ]; then
94 echo "[FAIL]"
95 exitcode=1
96else
97 echo "[PASS]"
98fi
99
89#cleanup 100#cleanup
90umount $mnt 101umount $mnt
91rm -rf $mnt 102rm -rf $mnt
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
new file mode 100644
index 000000000000..0c0b83953352
--- /dev/null
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -0,0 +1,636 @@
1/*
2 * Stress userfaultfd syscall.
3 *
4 * Copyright (C) 2015 Red Hat, Inc.
5 *
6 * This work is licensed under the terms of the GNU GPL, version 2. See
7 * the COPYING file in the top-level directory.
8 *
9 * This test allocates two virtual areas and bounces the physical
10 * memory across the two virtual areas (from area_src to area_dst)
11 * using userfaultfd.
12 *
13 * There are three threads running per CPU:
14 *
15 * 1) one per-CPU thread takes a per-page pthread_mutex in a random
16 * page of the area_dst (while the physical page may still be in
17 * area_src), and increments a per-page counter in the same page,
18 * and checks its value against a verification region.
19 *
20 * 2) another per-CPU thread handles the userfaults generated by
21 * thread 1 above. userfaultfd blocking reads or poll() modes are
22 * exercised interleaved.
23 *
24 * 3) one last per-CPU thread transfers the memory in the background
25 * at maximum bandwidth (if not already transferred by thread
26 * 2). Each cpu thread takes care of transferring a portion of the
27 * area.
28 *
29 * When all threads of type 3 completed the transfer, one bounce is
30 * complete. area_src and area_dst are then swapped. All threads are
31 * respawned and so the bounce is immediately restarted in the
32 * opposite direction.
33 *
34 * The per-CPU threads of type 1, by triggering userfaults inside
35 * pthread_mutex_lock, will also verify the atomicity of the memory
36 * transfer (UFFDIO_COPY).
37 *
38 * The program takes two parameters: the amounts of physical memory in
39 * megabytes (MiB) of the area and the number of bounces to execute.
40 *
41 * # 100MiB 99999 bounces
42 * ./userfaultfd 100 99999
43 *
44 * # 1GiB 99 bounces
45 * ./userfaultfd 1000 99
46 *
47 * # 10MiB-~6GiB 999 bounces, continue forever unless an error triggers
48 * while ./userfaultfd $[RANDOM % 6000 + 10] 999; do true; done
49 */
50
51#define _GNU_SOURCE
52#include <stdio.h>
53#include <errno.h>
54#include <unistd.h>
55#include <stdlib.h>
56#include <sys/types.h>
57#include <sys/stat.h>
58#include <fcntl.h>
59#include <time.h>
60#include <signal.h>
61#include <poll.h>
62#include <string.h>
63#include <sys/mman.h>
64#include <sys/syscall.h>
65#include <sys/ioctl.h>
66#include <pthread.h>
67#include "../../../../include/uapi/linux/userfaultfd.h"
68
69#ifdef __x86_64__
70#define __NR_userfaultfd 323
71#elif defined(__i386__)
72#define __NR_userfaultfd 359
73#elif defined(__powerpc__)
74#define __NR_userfaultfd 364
75#else
76#error "missing __NR_userfaultfd definition"
77#endif
78
79static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;
80
81#define BOUNCE_RANDOM (1<<0)
82#define BOUNCE_RACINGFAULTS (1<<1)
83#define BOUNCE_VERIFY (1<<2)
84#define BOUNCE_POLL (1<<3)
85static int bounces;
86
87static unsigned long long *count_verify;
88static int uffd, finished, *pipefd;
89static char *area_src, *area_dst;
90static char *zeropage;
91pthread_attr_t attr;
92
93/* pthread_mutex_t starts at page offset 0 */
94#define area_mutex(___area, ___nr) \
95 ((pthread_mutex_t *) ((___area) + (___nr)*page_size))
96/*
97 * count is placed in the page after pthread_mutex_t naturally aligned
98 * to avoid non alignment faults on non-x86 archs.
99 */
100#define area_count(___area, ___nr) \
101 ((volatile unsigned long long *) ((unsigned long) \
102 ((___area) + (___nr)*page_size + \
103 sizeof(pthread_mutex_t) + \
104 sizeof(unsigned long long) - 1) & \
105 ~(unsigned long)(sizeof(unsigned long long) \
106 - 1)))
107
108static int my_bcmp(char *str1, char *str2, size_t n)
109{
110 unsigned long i;
111 for (i = 0; i < n; i++)
112 if (str1[i] != str2[i])
113 return 1;
114 return 0;
115}
116
117static void *locking_thread(void *arg)
118{
119 unsigned long cpu = (unsigned long) arg;
120 struct random_data rand;
121 unsigned long page_nr = *(&(page_nr)); /* uninitialized warning */
122 int32_t rand_nr;
123 unsigned long long count;
124 char randstate[64];
125 unsigned int seed;
126 time_t start;
127
128 if (bounces & BOUNCE_RANDOM) {
129 seed = (unsigned int) time(NULL) - bounces;
130 if (!(bounces & BOUNCE_RACINGFAULTS))
131 seed += cpu;
132 bzero(&rand, sizeof(rand));
133 bzero(&randstate, sizeof(randstate));
134 if (initstate_r(seed, randstate, sizeof(randstate), &rand))
135 fprintf(stderr, "srandom_r error\n"), exit(1);
136 } else {
137 page_nr = -bounces;
138 if (!(bounces & BOUNCE_RACINGFAULTS))
139 page_nr += cpu * nr_pages_per_cpu;
140 }
141
142 while (!finished) {
143 if (bounces & BOUNCE_RANDOM) {
144 if (random_r(&rand, &rand_nr))
145 fprintf(stderr, "random_r 1 error\n"), exit(1);
146 page_nr = rand_nr;
147 if (sizeof(page_nr) > sizeof(rand_nr)) {
148 if (random_r(&rand, &rand_nr))
149 fprintf(stderr, "random_r 2 error\n"), exit(1);
150 page_nr |= ((unsigned long) rand_nr) << 32;
151 }
152 } else
153 page_nr += 1;
154 page_nr %= nr_pages;
155
156 start = time(NULL);
157 if (bounces & BOUNCE_VERIFY) {
158 count = *area_count(area_dst, page_nr);
159 if (!count)
160 fprintf(stderr,
161 "page_nr %lu wrong count %Lu %Lu\n",
162 page_nr, count,
163 count_verify[page_nr]), exit(1);
164
165
166 /*
167 * We can't use bcmp (or memcmp) because that
168 * returns 0 erroneously if the memory is
169 * changing under it (even if the end of the
170 * page is never changing and always
171 * different).
172 */
173#if 1
174 if (!my_bcmp(area_dst + page_nr * page_size, zeropage,
175 page_size))
176 fprintf(stderr,
177 "my_bcmp page_nr %lu wrong count %Lu %Lu\n",
178 page_nr, count,
179 count_verify[page_nr]), exit(1);
180#else
181 unsigned long loops;
182
183 loops = 0;
184 /* uncomment the below line to test with mutex */
185 /* pthread_mutex_lock(area_mutex(area_dst, page_nr)); */
186 while (!bcmp(area_dst + page_nr * page_size, zeropage,
187 page_size)) {
188 loops += 1;
189 if (loops > 10)
190 break;
191 }
192 /* uncomment below line to test with mutex */
193 /* pthread_mutex_unlock(area_mutex(area_dst, page_nr)); */
194 if (loops) {
195 fprintf(stderr,
196 "page_nr %lu all zero thread %lu %p %lu\n",
197 page_nr, cpu, area_dst + page_nr * page_size,
198 loops);
199 if (loops > 10)
200 exit(1);
201 }
202#endif
203 }
204
205 pthread_mutex_lock(area_mutex(area_dst, page_nr));
206 count = *area_count(area_dst, page_nr);
207 if (count != count_verify[page_nr]) {
208 fprintf(stderr,
209 "page_nr %lu memory corruption %Lu %Lu\n",
210 page_nr, count,
211 count_verify[page_nr]), exit(1);
212 }
213 count++;
214 *area_count(area_dst, page_nr) = count_verify[page_nr] = count;
215 pthread_mutex_unlock(area_mutex(area_dst, page_nr));
216
217 if (time(NULL) - start > 1)
218 fprintf(stderr,
219 "userfault too slow %ld "
220 "possible false positive with overcommit\n",
221 time(NULL) - start);
222 }
223
224 return NULL;
225}
226
227static int copy_page(unsigned long offset)
228{
229 struct uffdio_copy uffdio_copy;
230
231 if (offset >= nr_pages * page_size)
232 fprintf(stderr, "unexpected offset %lu\n",
233 offset), exit(1);
234 uffdio_copy.dst = (unsigned long) area_dst + offset;
235 uffdio_copy.src = (unsigned long) area_src + offset;
236 uffdio_copy.len = page_size;
237 uffdio_copy.mode = 0;
238 uffdio_copy.copy = 0;
239 if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy)) {
240 /* real retval in uffdio_copy.copy */
241 if (uffdio_copy.copy != -EEXIST)
242 fprintf(stderr, "UFFDIO_COPY error %Ld\n",
243 uffdio_copy.copy), exit(1);
244 } else if (uffdio_copy.copy != page_size) {
245 fprintf(stderr, "UFFDIO_COPY unexpected copy %Ld\n",
246 uffdio_copy.copy), exit(1);
247 } else
248 return 1;
249 return 0;
250}
251
252static void *uffd_poll_thread(void *arg)
253{
254 unsigned long cpu = (unsigned long) arg;
255 struct pollfd pollfd[2];
256 struct uffd_msg msg;
257 int ret;
258 unsigned long offset;
259 char tmp_chr;
260 unsigned long userfaults = 0;
261
262 pollfd[0].fd = uffd;
263 pollfd[0].events = POLLIN;
264 pollfd[1].fd = pipefd[cpu*2];
265 pollfd[1].events = POLLIN;
266
267 for (;;) {
268 ret = poll(pollfd, 2, -1);
269 if (!ret)
270 fprintf(stderr, "poll error %d\n", ret), exit(1);
271 if (ret < 0)
272 perror("poll"), exit(1);
273 if (pollfd[1].revents & POLLIN) {
274 if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
275 fprintf(stderr, "read pipefd error\n"),
276 exit(1);
277 break;
278 }
279 if (!(pollfd[0].revents & POLLIN))
280 fprintf(stderr, "pollfd[0].revents %d\n",
281 pollfd[0].revents), exit(1);
282 ret = read(uffd, &msg, sizeof(msg));
283 if (ret < 0) {
284 if (errno == EAGAIN)
285 continue;
286 perror("nonblocking read error"), exit(1);
287 }
288 if (msg.event != UFFD_EVENT_PAGEFAULT)
289 fprintf(stderr, "unexpected msg event %u\n",
290 msg.event), exit(1);
291 if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
292 fprintf(stderr, "unexpected write fault\n"), exit(1);
293 offset = (char *)msg.arg.pagefault.address - area_dst;
294 offset &= ~(page_size-1);
295 if (copy_page(offset))
296 userfaults++;
297 }
298 return (void *)userfaults;
299}
300
301pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
302
303static void *uffd_read_thread(void *arg)
304{
305 unsigned long *this_cpu_userfaults;
306 struct uffd_msg msg;
307 unsigned long offset;
308 int ret;
309
310 this_cpu_userfaults = (unsigned long *) arg;
311 *this_cpu_userfaults = 0;
312
313 pthread_mutex_unlock(&uffd_read_mutex);
314 /* from here cancellation is ok */
315
316 for (;;) {
317 ret = read(uffd, &msg, sizeof(msg));
318 if (ret != sizeof(msg)) {
319 if (ret < 0)
320 perror("blocking read error"), exit(1);
321 else
322 fprintf(stderr, "short read\n"), exit(1);
323 }
324 if (msg.event != UFFD_EVENT_PAGEFAULT)
325 fprintf(stderr, "unexpected msg event %u\n",
326 msg.event), exit(1);
327 if (bounces & BOUNCE_VERIFY &&
328 msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
329 fprintf(stderr, "unexpected write fault\n"), exit(1);
330 offset = (char *)msg.arg.pagefault.address - area_dst;
331 offset &= ~(page_size-1);
332 if (copy_page(offset))
333 (*this_cpu_userfaults)++;
334 }
335 return (void *)NULL;
336}
337
338static void *background_thread(void *arg)
339{
340 unsigned long cpu = (unsigned long) arg;
341 unsigned long page_nr;
342
343 for (page_nr = cpu * nr_pages_per_cpu;
344 page_nr < (cpu+1) * nr_pages_per_cpu;
345 page_nr++)
346 copy_page(page_nr * page_size);
347
348 return NULL;
349}
350
static int stress(unsigned long *userfaults)
{
	unsigned long cpu;
	pthread_t locking_threads[nr_cpus];
	pthread_t uffd_threads[nr_cpus];
	pthread_t background_threads[nr_cpus];
	void **_userfaults = (void **) userfaults;

	finished = 0;
	for (cpu = 0; cpu < nr_cpus; cpu++) {
		if (pthread_create(&locking_threads[cpu], &attr,
				   locking_thread, (void *)cpu))
			return 1;
		if (bounces & BOUNCE_POLL) {
			if (pthread_create(&uffd_threads[cpu], &attr,
					   uffd_poll_thread, (void *)cpu))
				return 1;
		} else {
			if (pthread_create(&uffd_threads[cpu], &attr,
					   uffd_read_thread,
					   &_userfaults[cpu]))
				return 1;
			pthread_mutex_lock(&uffd_read_mutex);
		}
		if (pthread_create(&background_threads[cpu], &attr,
				   background_thread, (void *)cpu))
			return 1;
	}
	for (cpu = 0; cpu < nr_cpus; cpu++)
		if (pthread_join(background_threads[cpu], NULL))
			return 1;

	/*
	 * Be strict and immediately zap area_src: the whole area has
	 * already been transferred by the background threads. The
	 * area_src may then be faulted in again, racily, by the still
	 * running uffd threads reading zeropages after we zapped it
	 * (but they're guaranteed to get -EEXIST from UFFDIO_COPY
	 * without writing zero pages into area_dst, because the
	 * background threads already completed).
	 */
	if (madvise(area_src, nr_pages * page_size, MADV_DONTNEED)) {
		perror("madvise");
		return 1;
	}

	for (cpu = 0; cpu < nr_cpus; cpu++) {
		char c;
		if (bounces & BOUNCE_POLL) {
			if (write(pipefd[cpu*2+1], &c, 1) != 1) {
				fprintf(stderr, "pipefd write error\n");
				return 1;
			}
			if (pthread_join(uffd_threads[cpu], &_userfaults[cpu]))
				return 1;
		} else {
			if (pthread_cancel(uffd_threads[cpu]))
				return 1;
			if (pthread_join(uffd_threads[cpu], NULL))
				return 1;
		}
	}

	finished = 1;
	for (cpu = 0; cpu < nr_cpus; cpu++)
		if (pthread_join(locking_threads[cpu], NULL))
			return 1;

	return 0;
}

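/*
 * Main test body: allocate area_src and area_dst, open the
 * userfaultfd, and for each bounce register area_dst, run one
 * stress() pass, unregister, optionally verify the result and swap
 * the two areas for the next bounce.
 */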
static int userfaultfd_stress(void)
{
	void *area;
	char *tmp_area;
	unsigned long nr;
	struct uffdio_register uffdio_register;
	struct uffdio_api uffdio_api;
	unsigned long cpu;
	int uffd_flags;
	unsigned long userfaults[nr_cpus];

	if (posix_memalign(&area, page_size, nr_pages * page_size)) {
		fprintf(stderr, "out of memory\n");
		return 1;
	}
	area_src = area;
	if (posix_memalign(&area, page_size, nr_pages * page_size)) {
		fprintf(stderr, "out of memory\n");
		return 1;
	}
	area_dst = area;

	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	if (uffd < 0) {
		fprintf(stderr,
			"userfaultfd syscall not available in this kernel\n");
		return 1;
	}
	uffd_flags = fcntl(uffd, F_GETFL, NULL);

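	/* handshake with the kernel: request UFFD_API and no extra features */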
	uffdio_api.api = UFFD_API;
	uffdio_api.features = 0;
	if (ioctl(uffd, UFFDIO_API, &uffdio_api)) {
		fprintf(stderr, "UFFDIO_API\n");
		return 1;
	}
	if (uffdio_api.api != UFFD_API) {
		fprintf(stderr, "UFFDIO_API error %Lu\n", uffdio_api.api);
		return 1;
	}

	count_verify = malloc(nr_pages * sizeof(unsigned long long));
	if (!count_verify) {
		perror("count_verify");
		return 1;
	}

	for (nr = 0; nr < nr_pages; nr++) {
		*area_mutex(area_src, nr) = (pthread_mutex_t)
			PTHREAD_MUTEX_INITIALIZER;
		count_verify[nr] = *area_count(area_src, nr) = 1;
	}

	pipefd = malloc(sizeof(int) * nr_cpus * 2);
	if (!pipefd) {
		perror("pipefd");
		return 1;
	}
	for (cpu = 0; cpu < nr_cpus; cpu++) {
		if (pipe2(&pipefd[cpu*2], O_CLOEXEC | O_NONBLOCK)) {
			perror("pipe");
			return 1;
		}
	}

	if (posix_memalign(&area, page_size, page_size)) {
		fprintf(stderr, "out of memory\n");
		return 1;
	}
	zeropage = area;
	bzero(zeropage, page_size);

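	/*
	 * uffd_read_mutex starts out locked so stress() can wait for
	 * each uffd_read_thread to finish its setup; all threads get a
	 * 16MiB stack.
	 */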
	pthread_mutex_lock(&uffd_read_mutex);

	pthread_attr_init(&attr);
	pthread_attr_setstacksize(&attr, 16*1024*1024);

	while (bounces--) {
		unsigned long expected_ioctls;

		printf("bounces: %d, mode:", bounces);
		if (bounces & BOUNCE_RANDOM)
			printf(" rnd");
		if (bounces & BOUNCE_RACINGFAULTS)
			printf(" racing");
		if (bounces & BOUNCE_VERIFY)
			printf(" ver");
		if (bounces & BOUNCE_POLL)
			printf(" poll");
		printf(", ");
		fflush(stdout);

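		/*
		 * The poll flavour shares the uffd between readers, so
		 * reads must not block; the blocking-read flavour
		 * relies on read() blocking until a userfault arrives.
		 */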
		if (bounces & BOUNCE_POLL)
			fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
		else
			fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);

		/* register */
		uffdio_register.range.start = (unsigned long) area_dst;
		uffdio_register.range.len = nr_pages * page_size;
		uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
		if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
			fprintf(stderr, "register failure\n");
			return 1;
		}
		expected_ioctls = (1 << _UFFDIO_WAKE) |
				  (1 << _UFFDIO_COPY) |
				  (1 << _UFFDIO_ZEROPAGE);
		if ((uffdio_register.ioctls & expected_ioctls) !=
		    expected_ioctls) {
			fprintf(stderr,
				"unexpected missing ioctl for anon memory\n");
			return 1;
		}

		/*
		 * The madvise done previously isn't enough: some
		 * uffd_thread could have read a userfault (one of
		 * those already resolved by the background thread)
		 * and may still be in the middle of calling
		 * UFFDIO_COPY. That UFFDIO_COPY will read the zapped
		 * area_src and map a zero page into it (which is
		 * perfectly safe by itself, as the UFFDIO_COPY simply
		 * returns -EEXIST). The problem comes at the next
		 * bounce though: the zeropages generated in area_src
		 * by that racing UFFDIO_COPY invalidate the previous
		 * MADV_DONTNEED. Without this additional
		 * MADV_DONTNEED, those leftover zeropages in area_src
		 * would lead to -EEXIST failures during the next
		 * bounce, effectively leaving a zeropage in area_dst.
		 *
		 * Try commenting out this madvise to see the memory
		 * corruption being caught pretty quickly.
		 *
		 * khugepaged is also only inhibited from collapsing
		 * THP over an MADV_DONTNEED range after the
		 * UFFDIO_REGISTER, so the MADV_DONTNEED is required
		 * here, after the registration.
		 */
		if (madvise(area_dst, nr_pages * page_size, MADV_DONTNEED)) {
			perror("madvise 2");
			return 1;
		}

		/* bounce pass */
		if (stress(userfaults))
			return 1;

		/* unregister */
		if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) {
			fprintf(stderr, "unregister failure\n");
			return 1;
		}

		/* verification */
		if (bounces & BOUNCE_VERIFY) {
			for (nr = 0; nr < nr_pages; nr++) {
				if (my_bcmp(area_dst,
					    area_dst + nr * page_size,
					    sizeof(pthread_mutex_t))) {
					fprintf(stderr,
						"error mutex 2 %lu\n",
						nr);
					bounces = 0;
				}
				if (*area_count(area_dst, nr) != count_verify[nr]) {
					fprintf(stderr,
						"error area_count %Lu %Lu %lu\n",
						*area_count(area_dst, nr),
						count_verify[nr],
						nr);
					bounces = 0;
				}
			}
		}

		/* prepare next bounce */
		tmp_area = area_src;
		area_src = area_dst;
		area_dst = tmp_area;

		printf("userfaults:");
		for (cpu = 0; cpu < nr_cpus; cpu++)
			printf(" %lu", userfaults[cpu]);
		printf("\n");
	}

	return 0;
}

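/*
 * Entry point: parse <MiB> (total size of each test area) and
 * <bounces>, derive the per-cpu page counts and run the stress test.
 */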
int main(int argc, char **argv)
{
	if (argc < 3)
		fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
	page_size = sysconf(_SC_PAGE_SIZE);
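	/* the per-page count written via area_count() must fit in one page */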
	if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) >
	    page_size)
		fprintf(stderr, "Impossible to run this test\n"), exit(2);
	nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size /
		nr_cpus;
	if (!nr_pages_per_cpu) {
		fprintf(stderr, "invalid MiB\n");
		fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
	}
	bounces = atoi(argv[2]);
	if (bounces <= 0) {
		fprintf(stderr, "invalid bounces\n");
		fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
	}
	nr_pages = nr_pages_per_cpu * nr_cpus;
	printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
	       nr_pages, nr_pages_per_cpu);
	return userfaultfd_stress();
}