author	Linus Torvalds <torvalds@linux-foundation.org>	2016-12-14 20:25:18 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-12-14 20:25:18 -0500
commit	a57cb1c1d7974c62a5c80f7869e35b492ace12cd (patch)
tree	5a42ee9a668f171143464bc86013954c1bbe94ad
parent	cf1b3341afab9d3ad02a76b3a619ea027dcf4e28 (diff)
parent	e1e14ab8411df344a17687821f8f78f0a1e73cbb (diff)
Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton:

 - a few misc things

 - kexec updates

 - DMA-mapping updates to better support networking DMA operations

 - IPC updates

 - various MM changes to improve DAX fault handling

 - lots of radix-tree changes, mainly to the test suite. All leading up
   to reimplementing the IDA/IDR code to be a wrapper layer over the
   radix-tree. However the final trigger-pulling patch is held off for
   4.11.

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (114 commits)
  radix tree test suite: delete unused rcupdate.c
  radix tree test suite: add new tag check
  radix-tree: ensure counts are initialised
  radix tree test suite: cache recently freed objects
  radix tree test suite: add some more functionality
  idr: reduce the number of bits per level from 8 to 6
  rxrpc: abstract away knowledge of IDR internals
  tpm: use idr_find(), not idr_find_slowpath()
  idr: add ida_is_empty
  radix tree test suite: check multiorder iteration
  radix-tree: fix replacement for multiorder entries
  radix-tree: add radix_tree_split_preload()
  radix-tree: add radix_tree_split
  radix-tree: add radix_tree_join
  radix-tree: delete radix_tree_range_tag_if_tagged()
  radix-tree: delete radix_tree_locate_item()
  radix-tree: improve multiorder iterators
  btrfs: fix race in btrfs_free_dummy_fs_info()
  radix-tree: improve dump output
  radix-tree: make radix_tree_find_next_bit more useful
  ...
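[Editor's note] The DMA-mapping updates in this pile thread a DMA_ATTR_SKIP_CPU_SYNC check through every architecture's dma_map_ops, so callers that manage their own cache maintenance (notably network drivers recycling receive buffers) can skip the sync on map/unmap. The per-architecture hunks below all follow the same shape; the following is a minimal, illustrative sketch of that pattern only, with placeholder names (sketch_dma_map_page(), arch_cache_sync()) that are not taken from any file in this merge:

/*
 * Illustrative sketch, not code from this merge.  arch_cache_sync() and
 * sketch_dma_map_page() are placeholder names; the attrs test mirrors
 * the per-architecture hunks below.
 */
#include <linux/dma-mapping.h>
#include <linux/mm.h>

static void arch_cache_sync(phys_addr_t paddr, size_t size,
			    enum dma_data_direction dir)
{
	/* architecture-specific cache writeback/invalidate would go here */
}

static dma_addr_t sketch_dma_map_page(struct device *dev, struct page *page,
				      unsigned long offset, size_t size,
				      enum dma_data_direction dir,
				      unsigned long attrs)
{
	phys_addr_t paddr = page_to_phys(page) + offset;

	/*
	 * Callers that resync a buffer themselves before reuse (e.g. a
	 * network driver recycling Rx pages) pass DMA_ATTR_SKIP_CPU_SYNC
	 * so the map/unmap paths skip the cache maintenance.
	 */
	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
		arch_cache_sync(paddr, size, dir);

	return (dma_addr_t)paddr;
}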
-rw-r--r--	Documentation/filesystems/Locking	2
-rw-r--r--	arch/arc/mm/dma.c	5
-rw-r--r--	arch/arm/common/dmabounce.c	16
-rw-r--r--	arch/avr32/mm/dma-coherent.c	7
-rw-r--r--	arch/blackfin/kernel/dma-mapping.c	8
-rw-r--r--	arch/c6x/kernel/dma.c	14
-rw-r--r--	arch/frv/mb93090-mb00/pci-dma-nommu.c	14
-rw-r--r--	arch/frv/mb93090-mb00/pci-dma.c	9
-rw-r--r--	arch/hexagon/kernel/dma.c	6
-rw-r--r--	arch/m68k/kernel/dma.c	8
-rw-r--r--	arch/metag/kernel/dma.c	16
-rw-r--r--	arch/microblaze/kernel/dma.c	10
-rw-r--r--	arch/mips/loongson64/common/dma-swiotlb.c	2
-rw-r--r--	arch/mips/mm/dma-default.c	8
-rw-r--r--	arch/nios2/mm/dma-mapping.c	26
-rw-r--r--	arch/openrisc/kernel/dma.c	3
-rw-r--r--	arch/parisc/kernel/pci-dma.c	20
-rw-r--r--	arch/powerpc/kernel/dma.c	9
-rw-r--r--	arch/powerpc/platforms/cell/spufs/file.c	8
-rw-r--r--	arch/sh/kernel/dma-nommu.c	7
-rw-r--r--	arch/sparc/kernel/iommu.c	4
-rw-r--r--	arch/sparc/kernel/ioport.c	4
-rw-r--r--	arch/sparc/kernel/nmi.c	44
-rw-r--r--	arch/tile/kernel/pci-dma.c	12
-rw-r--r--	arch/x86/entry/vdso/vma.c	4
-rw-r--r--	arch/x86/kernel/machine_kexec_64.c	6
-rw-r--r--	arch/xtensa/kernel/pci-dma.c	7
-rw-r--r--	drivers/char/agp/alpha-agp.c	3
-rw-r--r--	drivers/char/mspec.c	2
-rw-r--r--	drivers/char/tpm/tpm-chip.c	4
-rw-r--r--	drivers/dax/dax.c	3
-rw-r--r--	drivers/gpu/drm/armada/armada_gem.c	5
-rw-r--r--	drivers/gpu/drm/drm_vm.c	10
-rw-r--r--	drivers/gpu/drm/etnaviv/etnaviv_gem.c	9
-rw-r--r--	drivers/gpu/drm/exynos/exynos_drm_gem.c	6
-rw-r--r--	drivers/gpu/drm/gma500/framebuffer.c	2
-rw-r--r--	drivers/gpu/drm/gma500/gem.c	5
-rw-r--r--	drivers/gpu/drm/i915/i915_gem.c	3
-rw-r--r--	drivers/gpu/drm/i915/i915_gem_userptr.c	2
-rw-r--r--	drivers/gpu/drm/msm/msm_gem.c	8
-rw-r--r--	drivers/gpu/drm/omapdrm/omap_gem.c	20
-rw-r--r--	drivers/gpu/drm/tegra/gem.c	4
-rw-r--r--	drivers/gpu/drm/ttm/ttm_bo_vm.c	2
-rw-r--r--	drivers/gpu/drm/udl/udl_gem.c	5
-rw-r--r--	drivers/gpu/drm/vgem/vgem_drv.c	2
-rw-r--r--	drivers/infiniband/core/umem_odp.c	2
-rw-r--r--	drivers/media/v4l2-core/videobuf-dma-sg.c	5
-rw-r--r--	drivers/misc/cxl/context.c	5
-rw-r--r--	drivers/misc/sgi-gru/grumain.c	2
-rw-r--r--	drivers/net/ethernet/intel/igb/igb.h	7
-rw-r--r--	drivers/net/ethernet/intel/igb/igb_main.c	77
-rw-r--r--	drivers/net/wireless/intel/iwlwifi/dvm/calib.c	3
-rw-r--r--	drivers/staging/android/ion/ion.c	2
-rw-r--r--	drivers/staging/lustre/lustre/llite/vvp_io.c	6
-rw-r--r--	drivers/usb/gadget/function/f_hid.c	6
-rw-r--r--	drivers/usb/gadget/function/f_printer.c	6
-rw-r--r--	drivers/vfio/vfio_iommu_type1.c	2
-rw-r--r--	drivers/xen/privcmd.c	2
-rw-r--r--	fs/btrfs/super.c	12
-rw-r--r--	fs/btrfs/tests/btrfs-tests.c	1
-rw-r--r--	fs/dax.c	208
-rw-r--r--	fs/exec.c	2
-rw-r--r--	fs/userfaultfd.c	22
-rw-r--r--	include/linux/dax.h	7
-rw-r--r--	include/linux/dma-mapping.h	20
-rw-r--r--	include/linux/gfp.h	2
-rw-r--r--	include/linux/huge_mm.h	10
-rw-r--r--	include/linux/idr.h	40
-rw-r--r--	include/linux/kdb.h	2
-rw-r--r--	include/linux/kexec.h	6
-rw-r--r--	include/linux/mm.h	46
-rw-r--r--	include/linux/nmi.h	24
-rw-r--r--	include/linux/radix-tree.h	174
-rw-r--r--	include/linux/signal.h	17
-rw-r--r--	include/linux/userfaultfd_k.h	4
-rw-r--r--	ipc/msg.c	5
-rw-r--r--	ipc/sem.c	512
-rw-r--r--	ipc/shm.c	13
-rw-r--r--	kernel/Makefile	1
-rw-r--r--	kernel/debug/debug_core.c	4
-rw-r--r--	kernel/debug/kdb/kdb_io.c	37
-rw-r--r--	kernel/debug/kdb/kdb_main.c	1
-rw-r--r--	kernel/debug/kdb/kdb_private.h	1
-rw-r--r--	kernel/events/uprobes.c	4
-rw-r--r--	kernel/kcov.c	5
-rw-r--r--	kernel/kexec_core.c	5
-rw-r--r--	kernel/printk/printk.c	3
-rw-r--r--	kernel/relay.c	4
-rw-r--r--	kernel/signal.c	7
-rw-r--r--	kernel/sysctl.c	8
-rw-r--r--	kernel/sysctl_binary.c	4
-rw-r--r--	kernel/time/alarmtimer.c	3
-rw-r--r--	kernel/watchdog.c	270
-rw-r--r--	kernel/watchdog_hld.c	227
-rw-r--r--	lib/Kconfig.debug	8
-rw-r--r--	lib/Kconfig.ubsan	3
-rw-r--r--	lib/radix-tree.c	890
-rw-r--r--	mm/compaction.c	17
-rw-r--r--	mm/filemap.c	14
-rw-r--r--	mm/gup.c	20
-rw-r--r--	mm/huge_memory.c	173
-rw-r--r--	mm/internal.h	2
-rw-r--r--	mm/khugepaged.c	31
-rw-r--r--	mm/memory.c	859
-rw-r--r--	mm/nommu.c	10
-rw-r--r--	mm/page-writeback.c	28
-rw-r--r--	mm/page_alloc.c	14
-rw-r--r--	mm/process_vm_access.c	12
-rw-r--r--	mm/shmem.c	32
-rw-r--r--	net/rxrpc/af_rxrpc.c	11
-rw-r--r--	net/rxrpc/conn_client.c	4
-rw-r--r--	security/tomoyo/domain.c	2
-rw-r--r--	tools/include/asm/bug.h	11
-rw-r--r--	tools/include/linux/bitmap.h	26
-rwxr-xr-x	tools/testing/ktest/ktest.pl	8
-rw-r--r--	tools/testing/radix-tree/Makefile	15
-rw-r--r--	tools/testing/radix-tree/benchmark.c	98
-rw-r--r--	tools/testing/radix-tree/find_next_bit.c	57
-rw-r--r--	tools/testing/radix-tree/iteration_check.c	123
-rw-r--r--	tools/testing/radix-tree/linux.c	67
-rw-r--r--	tools/testing/radix-tree/linux/bitops.h	40
-rw-r--r--	tools/testing/radix-tree/linux/bitops/non-atomic.h	13
-rw-r--r--	tools/testing/radix-tree/linux/bug.h	2
-rw-r--r--	tools/testing/radix-tree/linux/gfp.h	22
-rw-r--r--	tools/testing/radix-tree/linux/kernel.h	18
-rw-r--r--	tools/testing/radix-tree/linux/preempt.h	6
-rw-r--r--	tools/testing/radix-tree/linux/slab.h	11
-rw-r--r--	tools/testing/radix-tree/linux/types.h	2
-rw-r--r--	tools/testing/radix-tree/main.c	77
-rw-r--r--	tools/testing/radix-tree/multiorder.c	326
-rw-r--r--	tools/testing/radix-tree/rcupdate.c	86
-rw-r--r--	tools/testing/radix-tree/regression2.c	3
-rw-r--r--	tools/testing/radix-tree/regression3.c	8
-rw-r--r--	tools/testing/radix-tree/tag_check.c	12
-rw-r--r--	tools/testing/radix-tree/test.c	92
-rw-r--r--	tools/testing/radix-tree/test.h	21
-rw-r--r--	usr/Kconfig	127
-rw-r--r--	usr/Makefile	20
-rw-r--r--	virt/kvm/async_pf.c	10
-rw-r--r--	virt/kvm/kvm_main.c	5
140 files changed, 3428 insertions, 2218 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 1b5f15653b1b..69e2387ca278 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -556,7 +556,7 @@ till "end_pgoff". ->map_pages() is called with page table locked and must
 not block. If it's not possible to reach a page without blocking,
 filesystem should skip it. Filesystem should use do_set_pte() to setup
 page table entry. Pointer to entry associated with the page is passed in
-"pte" field in fault_env structure. Pointers to entries for other offsets
+"pte" field in vm_fault structure. Pointers to entries for other offsets
 should be calculated relative to "pte".
 
 	->page_mkwrite() is called when a previously read-only pte is
diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c
index cd8aad8226dd..08450a1a5b5f 100644
--- a/arch/arc/mm/dma.c
+++ b/arch/arc/mm/dma.c
@@ -158,7 +158,10 @@ static dma_addr_t arc_dma_map_page(struct device *dev, struct page *page,
 		unsigned long attrs)
 {
 	phys_addr_t paddr = page_to_phys(page) + offset;
-	_dma_cache_sync(paddr, size, dir);
+
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		_dma_cache_sync(paddr, size, dir);
+
 	return plat_phys_to_dma(dev, paddr);
 }
 
diff --git a/arch/arm/common/dmabounce.c b/arch/arm/common/dmabounce.c
index 301281645d08..75055df1cda3 100644
--- a/arch/arm/common/dmabounce.c
+++ b/arch/arm/common/dmabounce.c
@@ -243,7 +243,8 @@ static int needs_bounce(struct device *dev, dma_addr_t dma_addr, size_t size)
 }
 
 static inline dma_addr_t map_single(struct device *dev, void *ptr, size_t size,
-		enum dma_data_direction dir)
+		enum dma_data_direction dir,
+		unsigned long attrs)
 {
 	struct dmabounce_device_info *device_info = dev->archdata.dmabounce;
 	struct safe_buffer *buf;
@@ -262,7 +263,8 @@ static inline dma_addr_t map_single(struct device *dev, void *ptr, size_t size,
 		__func__, buf->ptr, virt_to_dma(dev, buf->ptr),
 		buf->safe, buf->safe_dma_addr);
 
-	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) {
+	if ((dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) &&
+	    !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
 		dev_dbg(dev, "%s: copy unsafe %p to safe %p, size %d\n",
 			__func__, ptr, buf->safe, size);
 		memcpy(buf->safe, ptr, size);
@@ -272,7 +274,8 @@ static inline dma_addr_t map_single(struct device *dev, void *ptr, size_t size,
 }
 
 static inline void unmap_single(struct device *dev, struct safe_buffer *buf,
-		size_t size, enum dma_data_direction dir)
+		size_t size, enum dma_data_direction dir,
+		unsigned long attrs)
 {
 	BUG_ON(buf->size != size);
 	BUG_ON(buf->direction != dir);
@@ -283,7 +286,8 @@ static inline void unmap_single(struct device *dev, struct safe_buffer *buf,
 
 	DO_STATS(dev->archdata.dmabounce->bounce_count++);
 
-	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) {
+	if ((dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) &&
+	    !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
 		void *ptr = buf->ptr;
 
 		dev_dbg(dev, "%s: copy back safe %p to unsafe %p size %d\n",
@@ -334,7 +338,7 @@ static dma_addr_t dmabounce_map_page(struct device *dev, struct page *page,
 		return DMA_ERROR_CODE;
 	}
 
-	return map_single(dev, page_address(page) + offset, size, dir);
+	return map_single(dev, page_address(page) + offset, size, dir, attrs);
 }
 
 /*
@@ -357,7 +361,7 @@ static void dmabounce_unmap_page(struct device *dev, dma_addr_t dma_addr, size_t
 		return;
 	}
 
-	unmap_single(dev, buf, size, dir);
+	unmap_single(dev, buf, size, dir, attrs);
 }
 
 static int __dmabounce_sync_for_cpu(struct device *dev, dma_addr_t addr,
diff --git a/arch/avr32/mm/dma-coherent.c b/arch/avr32/mm/dma-coherent.c
index 58610d0df7ed..54534e5d0781 100644
--- a/arch/avr32/mm/dma-coherent.c
+++ b/arch/avr32/mm/dma-coherent.c
@@ -146,7 +146,8 @@ static dma_addr_t avr32_dma_map_page(struct device *dev, struct page *page,
 {
 	void *cpu_addr = page_address(page) + offset;
 
-	dma_cache_sync(dev, cpu_addr, size, direction);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		dma_cache_sync(dev, cpu_addr, size, direction);
 	return virt_to_bus(cpu_addr);
 }
 
@@ -162,6 +163,10 @@ static int avr32_dma_map_sg(struct device *dev, struct scatterlist *sglist,
 
 		sg->dma_address = page_to_bus(sg_page(sg)) + sg->offset;
 		virt = sg_virt(sg);
+
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
 		dma_cache_sync(dev, virt, sg->length, direction);
 	}
 
diff --git a/arch/blackfin/kernel/dma-mapping.c b/arch/blackfin/kernel/dma-mapping.c
index 53fbbb61aa86..a27a74a18fb0 100644
--- a/arch/blackfin/kernel/dma-mapping.c
+++ b/arch/blackfin/kernel/dma-mapping.c
@@ -118,6 +118,10 @@ static int bfin_dma_map_sg(struct device *dev, struct scatterlist *sg_list,
 
 	for_each_sg(sg_list, sg, nents, i) {
 		sg->dma_address = (dma_addr_t) sg_virt(sg);
+
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
 		__dma_sync(sg_dma_address(sg), sg_dma_len(sg), direction);
 	}
 
@@ -143,7 +147,9 @@ static dma_addr_t bfin_dma_map_page(struct device *dev, struct page *page,
 {
 	dma_addr_t handle = (dma_addr_t)(page_address(page) + offset);
 
-	_dma_sync(handle, size, dir);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		_dma_sync(handle, size, dir);
+
 	return handle;
 }
 
diff --git a/arch/c6x/kernel/dma.c b/arch/c6x/kernel/dma.c
index db4a6a301f5e..6752df32ef06 100644
--- a/arch/c6x/kernel/dma.c
+++ b/arch/c6x/kernel/dma.c
@@ -42,14 +42,17 @@ static dma_addr_t c6x_dma_map_page(struct device *dev, struct page *page,
 {
 	dma_addr_t handle = virt_to_phys(page_address(page) + offset);
 
-	c6x_dma_sync(handle, size, dir);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		c6x_dma_sync(handle, size, dir);
+
 	return handle;
 }
 
 static void c6x_dma_unmap_page(struct device *dev, dma_addr_t handle,
 		size_t size, enum dma_data_direction dir, unsigned long attrs)
 {
-	c6x_dma_sync(handle, size, dir);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		c6x_dma_sync(handle, size, dir);
 }
 
 static int c6x_dma_map_sg(struct device *dev, struct scatterlist *sglist,
@@ -60,7 +63,8 @@ static int c6x_dma_map_sg(struct device *dev, struct scatterlist *sglist,
 
 	for_each_sg(sglist, sg, nents, i) {
 		sg->dma_address = sg_phys(sg);
-		c6x_dma_sync(sg->dma_address, sg->length, dir);
+		if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+			c6x_dma_sync(sg->dma_address, sg->length, dir);
 	}
 
 	return nents;
@@ -72,9 +76,11 @@ static void c6x_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
 	struct scatterlist *sg;
 	int i;
 
+	if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+		return;
+
 	for_each_sg(sglist, sg, nents, i)
 		c6x_dma_sync(sg_dma_address(sg), sg->length, dir);
-
 }
 
 static void c6x_dma_sync_single_for_cpu(struct device *dev, dma_addr_t handle,
diff --git a/arch/frv/mb93090-mb00/pci-dma-nommu.c b/arch/frv/mb93090-mb00/pci-dma-nommu.c
index 90f2e4cb33d6..187688128c65 100644
--- a/arch/frv/mb93090-mb00/pci-dma-nommu.c
+++ b/arch/frv/mb93090-mb00/pci-dma-nommu.c
@@ -109,16 +109,19 @@ static int frv_dma_map_sg(struct device *dev, struct scatterlist *sglist,
 		int nents, enum dma_data_direction direction,
 		unsigned long attrs)
 {
-	int i;
 	struct scatterlist *sg;
+	int i;
+
+	BUG_ON(direction == DMA_NONE);
+
+	if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+		return nents;
 
 	for_each_sg(sglist, sg, nents, i) {
 		frv_cache_wback_inv(sg_dma_address(sg),
 				    sg_dma_address(sg) + sg_dma_len(sg));
 	}
 
-	BUG_ON(direction == DMA_NONE);
-
 	return nents;
 }
 
@@ -127,7 +130,10 @@ static dma_addr_t frv_dma_map_page(struct device *dev, struct page *page,
 		enum dma_data_direction direction, unsigned long attrs)
 {
 	BUG_ON(direction == DMA_NONE);
-	flush_dcache_page(page);
+
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		flush_dcache_page(page);
+
 	return (dma_addr_t) page_to_phys(page) + offset;
 }
 
diff --git a/arch/frv/mb93090-mb00/pci-dma.c b/arch/frv/mb93090-mb00/pci-dma.c
index f585745b1abc..dba7df918144 100644
--- a/arch/frv/mb93090-mb00/pci-dma.c
+++ b/arch/frv/mb93090-mb00/pci-dma.c
@@ -40,13 +40,16 @@ static int frv_dma_map_sg(struct device *dev, struct scatterlist *sglist,
 		int nents, enum dma_data_direction direction,
 		unsigned long attrs)
 {
+	struct scatterlist *sg;
 	unsigned long dampr2;
 	void *vaddr;
 	int i;
-	struct scatterlist *sg;
 
 	BUG_ON(direction == DMA_NONE);
 
+	if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+		return nents;
+
 	dampr2 = __get_DAMPR(2);
 
 	for_each_sg(sglist, sg, nents, i) {
@@ -70,7 +73,9 @@ static dma_addr_t frv_dma_map_page(struct device *dev, struct page *page,
 		unsigned long offset, size_t size,
 		enum dma_data_direction direction, unsigned long attrs)
 {
-	flush_dcache_page(page);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		flush_dcache_page(page);
+
 	return (dma_addr_t) page_to_phys(page) + offset;
 }
 
diff --git a/arch/hexagon/kernel/dma.c b/arch/hexagon/kernel/dma.c
index b9017785fb71..dbc4f1003da4 100644
--- a/arch/hexagon/kernel/dma.c
+++ b/arch/hexagon/kernel/dma.c
@@ -119,6 +119,9 @@ static int hexagon_map_sg(struct device *hwdev, struct scatterlist *sg,
 
 		s->dma_length = s->length;
 
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
 		flush_dcache_range(dma_addr_to_virt(s->dma_address),
 				   dma_addr_to_virt(s->dma_address + s->length));
 	}
@@ -180,7 +183,8 @@ static dma_addr_t hexagon_map_page(struct device *dev, struct page *page,
 	if (!check_addr("map_single", dev, bus, size))
 		return bad_dma_address;
 
-	dma_sync(dma_addr_to_virt(bus), size, dir);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		dma_sync(dma_addr_to_virt(bus), size, dir);
 
 	return bus;
 }
diff --git a/arch/m68k/kernel/dma.c b/arch/m68k/kernel/dma.c
index 8cf97cbadc91..07070065a425 100644
--- a/arch/m68k/kernel/dma.c
+++ b/arch/m68k/kernel/dma.c
@@ -134,7 +134,9 @@ static dma_addr_t m68k_dma_map_page(struct device *dev, struct page *page,
 {
 	dma_addr_t handle = page_to_phys(page) + offset;
 
-	dma_sync_single_for_device(dev, handle, size, dir);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		dma_sync_single_for_device(dev, handle, size, dir);
+
 	return handle;
 }
 
@@ -146,6 +148,10 @@ static int m68k_dma_map_sg(struct device *dev, struct scatterlist *sglist,
 
 	for_each_sg(sglist, sg, nents, i) {
 		sg->dma_address = sg_phys(sg);
+
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
 		dma_sync_single_for_device(dev, sg->dma_address, sg->length,
 					   dir);
 	}
diff --git a/arch/metag/kernel/dma.c b/arch/metag/kernel/dma.c
index 0db31e24c541..91968d92652b 100644
--- a/arch/metag/kernel/dma.c
+++ b/arch/metag/kernel/dma.c
@@ -484,8 +484,9 @@ static dma_addr_t metag_dma_map_page(struct device *dev, struct page *page,
 		unsigned long offset, size_t size,
 		enum dma_data_direction direction, unsigned long attrs)
 {
-	dma_sync_for_device((void *)(page_to_phys(page) + offset), size,
-			    direction);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		dma_sync_for_device((void *)(page_to_phys(page) + offset),
+				    size, direction);
 	return page_to_phys(page) + offset;
 }
 
@@ -493,7 +494,8 @@ static void metag_dma_unmap_page(struct device *dev, dma_addr_t dma_address,
 		size_t size, enum dma_data_direction direction,
 		unsigned long attrs)
 {
-	dma_sync_for_cpu(phys_to_virt(dma_address), size, direction);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		dma_sync_for_cpu(phys_to_virt(dma_address), size, direction);
 }
 
 static int metag_dma_map_sg(struct device *dev, struct scatterlist *sglist,
@@ -507,6 +509,10 @@ static int metag_dma_map_sg(struct device *dev, struct scatterlist *sglist,
 		BUG_ON(!sg_page(sg));
 
 		sg->dma_address = sg_phys(sg);
+
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
 		dma_sync_for_device(sg_virt(sg), sg->length, direction);
 	}
 
@@ -525,6 +531,10 @@ static void metag_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
 		BUG_ON(!sg_page(sg));
 
 		sg->dma_address = sg_phys(sg);
+
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
 		dma_sync_for_cpu(sg_virt(sg), sg->length, direction);
 	}
 }
diff --git a/arch/microblaze/kernel/dma.c b/arch/microblaze/kernel/dma.c
index ec04dc1e2527..818daf230eb4 100644
--- a/arch/microblaze/kernel/dma.c
+++ b/arch/microblaze/kernel/dma.c
@@ -61,6 +61,10 @@ static int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl,
 	/* FIXME this part of code is untested */
 	for_each_sg(sgl, sg, nents, i) {
 		sg->dma_address = sg_phys(sg);
+
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
 		__dma_sync(page_to_phys(sg_page(sg)) + sg->offset,
 			   sg->length, direction);
 	}
@@ -80,7 +84,8 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev,
 					     enum dma_data_direction direction,
 					     unsigned long attrs)
 {
-	__dma_sync(page_to_phys(page) + offset, size, direction);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		__dma_sync(page_to_phys(page) + offset, size, direction);
 	return page_to_phys(page) + offset;
 }
 
@@ -95,7 +100,8 @@ static inline void dma_direct_unmap_page(struct device *dev,
  * phys_to_virt is here because in __dma_sync_page is __virt_to_phys and
  * dma_address is physical address
  */
-	__dma_sync(dma_address, size, direction);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		__dma_sync(dma_address, size, direction);
 }
 
 static inline void
diff --git a/arch/mips/loongson64/common/dma-swiotlb.c b/arch/mips/loongson64/common/dma-swiotlb.c
index 1a80b6f73ab2..aab4fd681e1f 100644
--- a/arch/mips/loongson64/common/dma-swiotlb.c
+++ b/arch/mips/loongson64/common/dma-swiotlb.c
@@ -61,7 +61,7 @@ static int loongson_dma_map_sg(struct device *dev, struct scatterlist *sg,
 				int nents, enum dma_data_direction dir,
 				unsigned long attrs)
 {
-	int r = swiotlb_map_sg_attrs(dev, sg, nents, dir, 0);
+	int r = swiotlb_map_sg_attrs(dev, sg, nents, dir, attrs);
 	mb();
 
 	return r;
diff --git a/arch/mips/mm/dma-default.c b/arch/mips/mm/dma-default.c
index 46d5696c4f27..a39c36af97ad 100644
--- a/arch/mips/mm/dma-default.c
+++ b/arch/mips/mm/dma-default.c
@@ -293,7 +293,7 @@ static inline void __dma_sync(struct page *page,
 static void mips_dma_unmap_page(struct device *dev, dma_addr_t dma_addr,
 	size_t size, enum dma_data_direction direction, unsigned long attrs)
 {
-	if (cpu_needs_post_dma_flush(dev))
+	if (cpu_needs_post_dma_flush(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
 		__dma_sync(dma_addr_to_page(dev, dma_addr),
 			   dma_addr & ~PAGE_MASK, size, direction);
 	plat_post_dma_flush(dev);
@@ -307,7 +307,8 @@ static int mips_dma_map_sg(struct device *dev, struct scatterlist *sglist,
 	struct scatterlist *sg;
 
 	for_each_sg(sglist, sg, nents, i) {
-		if (!plat_device_is_coherent(dev))
+		if (!plat_device_is_coherent(dev) &&
+		    !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
 			__dma_sync(sg_page(sg), sg->offset, sg->length,
 				   direction);
 #ifdef CONFIG_NEED_SG_DMA_LENGTH
@@ -324,7 +325,7 @@ static dma_addr_t mips_dma_map_page(struct device *dev, struct page *page,
 	unsigned long offset, size_t size, enum dma_data_direction direction,
 	unsigned long attrs)
 {
-	if (!plat_device_is_coherent(dev))
+	if (!plat_device_is_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
 		__dma_sync(page, offset, size, direction);
 
 	return plat_map_dma_mem_page(dev, page) + offset;
@@ -339,6 +340,7 @@ static void mips_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
 
 	for_each_sg(sglist, sg, nhwentries, i) {
 		if (!plat_device_is_coherent(dev) &&
+		    !(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
 		    direction != DMA_TO_DEVICE)
 			__dma_sync(sg_page(sg), sg->offset, sg->length,
 				   direction);
diff --git a/arch/nios2/mm/dma-mapping.c b/arch/nios2/mm/dma-mapping.c
index d800fad87896..f6a5dcf9d682 100644
--- a/arch/nios2/mm/dma-mapping.c
+++ b/arch/nios2/mm/dma-mapping.c
@@ -98,13 +98,17 @@ static int nios2_dma_map_sg(struct device *dev, struct scatterlist *sg,
 	int i;
 
 	for_each_sg(sg, sg, nents, i) {
-		void *addr;
+		void *addr = sg_virt(sg);
 
-		addr = sg_virt(sg);
-		if (addr) {
-			__dma_sync_for_device(addr, sg->length, direction);
-			sg->dma_address = sg_phys(sg);
-		}
+		if (!addr)
+			continue;
+
+		sg->dma_address = sg_phys(sg);
+
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
+		__dma_sync_for_device(addr, sg->length, direction);
 	}
 
 	return nents;
@@ -117,7 +121,9 @@ static dma_addr_t nios2_dma_map_page(struct device *dev, struct page *page,
 {
 	void *addr = page_address(page) + offset;
 
-	__dma_sync_for_device(addr, size, direction);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		__dma_sync_for_device(addr, size, direction);
+
 	return page_to_phys(page) + offset;
 }
 
@@ -125,7 +131,8 @@ static void nios2_dma_unmap_page(struct device *dev, dma_addr_t dma_address,
 		size_t size, enum dma_data_direction direction,
 		unsigned long attrs)
 {
-	__dma_sync_for_cpu(phys_to_virt(dma_address), size, direction);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		__dma_sync_for_cpu(phys_to_virt(dma_address), size, direction);
 }
 
 static void nios2_dma_unmap_sg(struct device *dev, struct scatterlist *sg,
@@ -138,6 +145,9 @@ static void nios2_dma_unmap_sg(struct device *dev, struct scatterlist *sg,
 	if (direction == DMA_TO_DEVICE)
 		return;
 
+	if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+		return;
+
 	for_each_sg(sg, sg, nhwentries, i) {
 		addr = sg_virt(sg);
 		if (addr)
diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c
index 140c99140649..906998bac957 100644
--- a/arch/openrisc/kernel/dma.c
+++ b/arch/openrisc/kernel/dma.c
@@ -141,6 +141,9 @@ or1k_map_page(struct device *dev, struct page *page,
 	unsigned long cl;
 	dma_addr_t addr = page_to_phys(page) + offset;
 
+	if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+		return addr;
+
 	switch (dir) {
 	case DMA_TO_DEVICE:
 		/* Flush the dcache for the requested range */
diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c
index 494ff6e8c88a..b6298a85e8ae 100644
--- a/arch/parisc/kernel/pci-dma.c
+++ b/arch/parisc/kernel/pci-dma.c
@@ -459,7 +459,9 @@ static dma_addr_t pa11_dma_map_page(struct device *dev, struct page *page,
 	void *addr = page_address(page) + offset;
 	BUG_ON(direction == DMA_NONE);
 
-	flush_kernel_dcache_range((unsigned long) addr, size);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		flush_kernel_dcache_range((unsigned long) addr, size);
+
 	return virt_to_phys(addr);
 }
 
@@ -469,8 +471,11 @@ static void pa11_dma_unmap_page(struct device *dev, dma_addr_t dma_handle,
 {
 	BUG_ON(direction == DMA_NONE);
 
+	if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+		return;
+
 	if (direction == DMA_TO_DEVICE)
 		return;
 
 	/*
 	 * For PCI_DMA_FROMDEVICE this flush is not necessary for the
@@ -479,7 +484,6 @@ static void pa11_dma_unmap_page(struct device *dev, dma_addr_t dma_handle,
 	 */
 
 	flush_kernel_dcache_range((unsigned long) phys_to_virt(dma_handle), size);
-	return;
 }
 
 static int pa11_dma_map_sg(struct device *dev, struct scatterlist *sglist,
@@ -496,6 +500,10 @@ static int pa11_dma_map_sg(struct device *dev, struct scatterlist *sglist,
 
 		sg_dma_address(sg) = (dma_addr_t) virt_to_phys(vaddr);
 		sg_dma_len(sg) = sg->length;
+
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
 		flush_kernel_dcache_range(vaddr, sg->length);
 	}
 	return nents;
@@ -510,14 +518,16 @@ static void pa11_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
 
 	BUG_ON(direction == DMA_NONE);
 
+	if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+		return;
+
 	if (direction == DMA_TO_DEVICE)
 		return;
 
 	/* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */
 
 	for_each_sg(sglist, sg, nents, i)
 		flush_kernel_vmap_range(sg_virt(sg), sg->length);
-	return;
 }
 
 static void pa11_dma_sync_single_for_cpu(struct device *dev,
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index e64a6016fba7..6877e3fa95bb 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -203,6 +203,10 @@ static int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl,
 	for_each_sg(sgl, sg, nents, i) {
 		sg->dma_address = sg_phys(sg) + get_dma_offset(dev);
 		sg->dma_length = sg->length;
+
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
 		__dma_sync_page(sg_page(sg), sg->offset, sg->length, direction);
 	}
 
@@ -235,7 +239,10 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev,
 					     unsigned long attrs)
 {
 	BUG_ON(dir == DMA_NONE);
-	__dma_sync_page(page, offset, size, dir);
+
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		__dma_sync_page(page, offset, size, dir);
+
 	return page_to_phys(page) + offset + get_dma_offset(dev);
 }
 
diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c
index 06254467e4dd..3a147122bc98 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -236,7 +236,6 @@ static int
 spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct spu_context *ctx = vma->vm_file->private_data;
-	unsigned long address = (unsigned long)vmf->virtual_address;
 	unsigned long pfn, offset;
 
 	offset = vmf->pgoff << PAGE_SHIFT;
@@ -244,7 +243,7 @@ spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		return VM_FAULT_SIGBUS;
 
 	pr_debug("spufs_mem_mmap_fault address=0x%lx, offset=0x%lx\n",
-			address, offset);
+			vmf->address, offset);
 
 	if (spu_acquire(ctx))
 		return VM_FAULT_NOPAGE;
@@ -256,7 +255,7 @@ spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot);
 		pfn = (ctx->spu->local_store_phys + offset) >> PAGE_SHIFT;
 	}
-	vm_insert_pfn(vma, address, pfn);
+	vm_insert_pfn(vma, vmf->address, pfn);
 
 	spu_release(ctx);
 
@@ -355,8 +354,7 @@ static int spufs_ps_fault(struct vm_area_struct *vma,
 		down_read(&current->mm->mmap_sem);
 	} else {
 		area = ctx->spu->problem_phys + ps_offs;
-		vm_insert_pfn(vma, (unsigned long)vmf->virtual_address,
-					(area + offset) >> PAGE_SHIFT);
+		vm_insert_pfn(vma, vmf->address, (area + offset) >> PAGE_SHIFT);
 		spu_context_trace(spufs_ps_fault__insert, ctx, ctx->spu);
 	}
 
diff --git a/arch/sh/kernel/dma-nommu.c b/arch/sh/kernel/dma-nommu.c
index eadb669a7329..47fee3b6e29c 100644
--- a/arch/sh/kernel/dma-nommu.c
+++ b/arch/sh/kernel/dma-nommu.c
@@ -18,7 +18,9 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
 	dma_addr_t addr = page_to_phys(page) + offset;
 
 	WARN_ON(size == 0);
-	dma_cache_sync(dev, page_address(page) + offset, size, dir);
+
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		dma_cache_sync(dev, page_address(page) + offset, size, dir);
 
 	return addr;
 }
@@ -35,7 +37,8 @@ static int nommu_map_sg(struct device *dev, struct scatterlist *sg,
 	for_each_sg(sg, s, nents, i) {
 		BUG_ON(!sg_page(s));
 
-		dma_cache_sync(dev, sg_virt(s), s->length, dir);
+		if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+			dma_cache_sync(dev, sg_virt(s), s->length, dir);
 
 		s->dma_address = sg_phys(s);
 		s->dma_length = s->length;
diff --git a/arch/sparc/kernel/iommu.c b/arch/sparc/kernel/iommu.c
index 852a3291db96..9df997995f6b 100644
--- a/arch/sparc/kernel/iommu.c
+++ b/arch/sparc/kernel/iommu.c
@@ -415,7 +415,7 @@ static void dma_4u_unmap_page(struct device *dev, dma_addr_t bus_addr,
 		ctx = (iopte_val(*base) & IOPTE_CONTEXT) >> 47UL;
 
 	/* Step 1: Kick data out of streaming buffers if necessary. */
-	if (strbuf->strbuf_enabled)
+	if (strbuf->strbuf_enabled && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
 		strbuf_flush(strbuf, iommu, bus_addr, ctx,
 			     npages, direction);
 
@@ -640,7 +640,7 @@ static void dma_4u_unmap_sg(struct device *dev, struct scatterlist *sglist,
 	base = iommu->page_table + entry;
 
 	dma_handle &= IO_PAGE_MASK;
-	if (strbuf->strbuf_enabled)
+	if (strbuf->strbuf_enabled && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
 		strbuf_flush(strbuf, iommu, dma_handle, ctx,
 			     npages, direction);
 
diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c
index 2344103414d1..6ffaec44931a 100644
--- a/arch/sparc/kernel/ioport.c
+++ b/arch/sparc/kernel/ioport.c
@@ -527,7 +527,7 @@ static dma_addr_t pci32_map_page(struct device *dev, struct page *page,
 static void pci32_unmap_page(struct device *dev, dma_addr_t ba, size_t size,
 			     enum dma_data_direction dir, unsigned long attrs)
 {
-	if (dir != PCI_DMA_TODEVICE)
+	if (dir != PCI_DMA_TODEVICE && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
 		dma_make_coherent(ba, PAGE_ALIGN(size));
 }
 
@@ -572,7 +572,7 @@ static void pci32_unmap_sg(struct device *dev, struct scatterlist *sgl,
 	struct scatterlist *sg;
 	int n;
 
-	if (dir != PCI_DMA_TODEVICE) {
+	if (dir != PCI_DMA_TODEVICE && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
 		for_each_sg(sgl, sg, nents, n) {
 			dma_make_coherent(sg_phys(sg), PAGE_ALIGN(sg->length));
 		}
diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c
index a9973bb4a1b2..95e73c63c99d 100644
--- a/arch/sparc/kernel/nmi.c
+++ b/arch/sparc/kernel/nmi.c
@@ -42,7 +42,7 @@ static int panic_on_timeout;
  */
 atomic_t nmi_active = ATOMIC_INIT(0);		/* oprofile uses this */
 EXPORT_SYMBOL(nmi_active);
-
+static int nmi_init_done;
 static unsigned int nmi_hz = HZ;
 static DEFINE_PER_CPU(short, wd_enabled);
 static int endflag __initdata;
@@ -153,6 +153,8 @@ static void report_broken_nmi(int cpu, int *prev_nmi_count)
 
 void stop_nmi_watchdog(void *unused)
 {
+	if (!__this_cpu_read(wd_enabled))
+		return;
 	pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable);
 	__this_cpu_write(wd_enabled, 0);
 	atomic_dec(&nmi_active);
@@ -207,6 +209,9 @@ error:
 
 void start_nmi_watchdog(void *unused)
 {
+	if (__this_cpu_read(wd_enabled))
+		return;
+
 	__this_cpu_write(wd_enabled, 1);
 	atomic_inc(&nmi_active);
 
@@ -259,6 +264,8 @@ int __init nmi_init(void)
 		}
 	}
 
+	nmi_init_done = 1;
+
 	return err;
 }
 
@@ -270,3 +277,38 @@ static int __init setup_nmi_watchdog(char *str)
 	return 0;
 }
 __setup("nmi_watchdog=", setup_nmi_watchdog);
+
+/*
+ * sparc specific NMI watchdog enable function.
+ * Enables watchdog if it is not enabled already.
+ */
+int watchdog_nmi_enable(unsigned int cpu)
+{
+	if (atomic_read(&nmi_active) == -1) {
+		pr_warn("NMI watchdog cannot be enabled or disabled\n");
+		return -1;
+	}
+
+	/*
+	 * watchdog thread could start even before nmi_init is called.
+	 * Just Return in that case. Let nmi_init finish the init
+	 * process first.
+	 */
+	if (!nmi_init_done)
+		return 0;
+
+	smp_call_function_single(cpu, start_nmi_watchdog, NULL, 1);
+
+	return 0;
+}
+/*
+ * sparc specific NMI watchdog disable function.
+ * Disables watchdog if it is not disabled already.
+ */
+void watchdog_nmi_disable(unsigned int cpu)
+{
+	if (atomic_read(&nmi_active) == -1)
+		pr_warn_once("NMI watchdog cannot be enabled or disabled\n");
+	else
+		smp_call_function_single(cpu, stop_nmi_watchdog, NULL, 1);
+}
diff --git a/arch/tile/kernel/pci-dma.c b/arch/tile/kernel/pci-dma.c
index 09bb774b39cd..24e0f8c21f2f 100644
--- a/arch/tile/kernel/pci-dma.c
+++ b/arch/tile/kernel/pci-dma.c
@@ -213,10 +213,12 @@ static int tile_dma_map_sg(struct device *dev, struct scatterlist *sglist,
 
 	for_each_sg(sglist, sg, nents, i) {
 		sg->dma_address = sg_phys(sg);
-		__dma_prep_pa_range(sg->dma_address, sg->length, direction);
 #ifdef CONFIG_NEED_SG_DMA_LENGTH
 		sg->dma_length = sg->length;
 #endif
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+		__dma_prep_pa_range(sg->dma_address, sg->length, direction);
 	}
 
 	return nents;
@@ -232,6 +234,8 @@ static void tile_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
 	BUG_ON(!valid_dma_direction(direction));
 	for_each_sg(sglist, sg, nents, i) {
 		sg->dma_address = sg_phys(sg);
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
 		__dma_complete_pa_range(sg->dma_address, sg->length,
 					direction);
 	}
@@ -245,7 +249,8 @@ static dma_addr_t tile_dma_map_page(struct device *dev, struct page *page,
 	BUG_ON(!valid_dma_direction(direction));
 
 	BUG_ON(offset + size > PAGE_SIZE);
-	__dma_prep_page(page, offset, size, direction);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		__dma_prep_page(page, offset, size, direction);
 
 	return page_to_pa(page) + offset;
 }
@@ -256,6 +261,9 @@ static void tile_dma_unmap_page(struct device *dev, dma_addr_t dma_address,
 {
 	BUG_ON(!valid_dma_direction(direction));
 
+	if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+		return;
+
 	__dma_complete_page(pfn_to_page(PFN_DOWN(dma_address)),
 			    dma_address & (PAGE_SIZE - 1), size, direction);
 }
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index e739002427ed..40121d14d34d 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -109,7 +109,7 @@ static int vvar_fault(const struct vm_special_mapping *sm,
 		return VM_FAULT_SIGBUS;
 
 	if (sym_offset == image->sym_vvar_page) {
-		ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address,
+		ret = vm_insert_pfn(vma, vmf->address,
 				    __pa_symbol(&__vvar_page) >> PAGE_SHIFT);
 	} else if (sym_offset == image->sym_pvclock_page) {
 		struct pvclock_vsyscall_time_info *pvti =
@@ -117,7 +117,7 @@ static int vvar_fault(const struct vm_special_mapping *sm,
 		if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) {
 			ret = vm_insert_pfn(
 				vma,
-				(unsigned long)vmf->virtual_address,
+				vmf->address,
 				__pa(pvti) >> PAGE_SHIFT);
 		}
 	}
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 8c1f218926d7..307b1f4543de 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -328,7 +328,7 @@ void machine_kexec(struct kimage *image)
 
 void arch_crash_save_vmcoreinfo(void)
 {
-	VMCOREINFO_SYMBOL(phys_base);
+	VMCOREINFO_NUMBER(phys_base);
 	VMCOREINFO_SYMBOL(init_level4_pgt);
 
 #ifdef CONFIG_NUMA
@@ -337,9 +337,7 @@ void arch_crash_save_vmcoreinfo(void)
 #endif
 	vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
 			      kaslr_offset());
-	VMCOREINFO_PAGE_OFFSET(PAGE_OFFSET);
-	VMCOREINFO_VMALLOC_START(VMALLOC_START);
-	VMCOREINFO_VMEMMAP_START(VMEMMAP_START);
+	VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
 }
 
 /* arch-dependent functionality related to kexec file-based syscall */
diff --git a/arch/xtensa/kernel/pci-dma.c b/arch/xtensa/kernel/pci-dma.c
index 1e68806d6695..6a16decf278f 100644
--- a/arch/xtensa/kernel/pci-dma.c
+++ b/arch/xtensa/kernel/pci-dma.c
@@ -189,7 +189,9 @@ static dma_addr_t xtensa_map_page(struct device *dev, struct page *page,
 {
 	dma_addr_t dma_handle = page_to_phys(page) + offset;
 
-	xtensa_sync_single_for_device(dev, dma_handle, size, dir);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		xtensa_sync_single_for_device(dev, dma_handle, size, dir);
+
 	return dma_handle;
 }
 
@@ -197,7 +199,8 @@ static void xtensa_unmap_page(struct device *dev, dma_addr_t dma_handle,
 			     size_t size, enum dma_data_direction dir,
 			     unsigned long attrs)
 {
-	xtensa_sync_single_for_cpu(dev, dma_handle, size, dir);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		xtensa_sync_single_for_cpu(dev, dma_handle, size, dir);
 }
 
 static int xtensa_map_sg(struct device *dev, struct scatterlist *sg,
diff --git a/drivers/char/agp/alpha-agp.c b/drivers/char/agp/alpha-agp.c
index 199b8e99f7d7..737187865269 100644
--- a/drivers/char/agp/alpha-agp.c
+++ b/drivers/char/agp/alpha-agp.c
@@ -19,8 +19,7 @@ static int alpha_core_agp_vm_fault(struct vm_area_struct *vma,
 	unsigned long pa;
 	struct page *page;
 
-	dma_addr = (unsigned long)vmf->virtual_address - vma->vm_start
-						+ agp->aperture.bus_base;
+	dma_addr = vmf->address - vma->vm_start + agp->aperture.bus_base;
 	pa = agp->ops->translate(agp, dma_addr);
 
 	if (pa == (unsigned long)-EINVAL)
diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c
index f3f92d5fcda0..a697ca0cab1e 100644
--- a/drivers/char/mspec.c
+++ b/drivers/char/mspec.c
@@ -227,7 +227,7 @@ mspec_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	 * be because another thread has installed the pte first, so it
 	 * is no problem.
 	 */
-	vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn);
+	vm_insert_pfn(vma, vmf->address, pfn);
 
 	return VM_FAULT_NOPAGE;
 }
diff --git a/drivers/char/tpm/tpm-chip.c b/drivers/char/tpm/tpm-chip.c
index 7a4869151d3b..a77262d31911 100644
--- a/drivers/char/tpm/tpm-chip.c
+++ b/drivers/char/tpm/tpm-chip.c
@@ -84,7 +84,7 @@ EXPORT_SYMBOL_GPL(tpm_put_ops);
  *
  * The return'd chip has been tpm_try_get_ops'd and must be released via
  * tpm_put_ops
- */
+ */
 struct tpm_chip *tpm_chip_find_get(int chip_num)
 {
 	struct tpm_chip *chip, *res = NULL;
@@ -103,7 +103,7 @@ struct tpm_chip *tpm_chip_find_get(int chip_num)
 			}
 		} while (chip_prev != chip_num);
 	} else {
-		chip = idr_find_slowpath(&dev_nums_idr, chip_num);
+		chip = idr_find(&dev_nums_idr, chip_num);
 		if (chip && !tpm_try_get_ops(chip))
 			res = chip;
 	}
diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
index 286447a83dab..26ec39ddf21f 100644
--- a/drivers/dax/dax.c
+++ b/drivers/dax/dax.c
@@ -328,7 +328,6 @@ static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff,
 static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma,
 		struct vm_fault *vmf)
 {
-	unsigned long vaddr = (unsigned long) vmf->virtual_address;
 	struct device *dev = &dax_dev->dev;
 	struct dax_region *dax_region;
 	int rc = VM_FAULT_SIGBUS;
@@ -353,7 +352,7 @@ static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma,
 
 	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
 
-	rc = vm_insert_mixed(vma, vaddr, pfn);
+	rc = vm_insert_mixed(vma, vmf->address, pfn);
 
 	if (rc == -ENOMEM)
 		return VM_FAULT_OOM;
diff --git a/drivers/gpu/drm/armada/armada_gem.c b/drivers/gpu/drm/armada/armada_gem.c
index 768087ddb046..a293c8be232c 100644
--- a/drivers/gpu/drm/armada/armada_gem.c
+++ b/drivers/gpu/drm/armada/armada_gem.c
@@ -17,12 +17,11 @@
 static int armada_gem_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct armada_gem_object *obj = drm_to_armada_gem(vma->vm_private_data);
-	unsigned long addr = (unsigned long)vmf->virtual_address;
 	unsigned long pfn = obj->phys_addr >> PAGE_SHIFT;
 	int ret;
 
-	pfn += (addr - vma->vm_start) >> PAGE_SHIFT;
-	ret = vm_insert_pfn(vma, addr, pfn);
+	pfn += (vmf->address - vma->vm_start) >> PAGE_SHIFT;
+	ret = vm_insert_pfn(vma, vmf->address, pfn);
 
 	switch (ret) {
 	case 0:
diff --git a/drivers/gpu/drm/drm_vm.c b/drivers/gpu/drm/drm_vm.c
index caa4e4ca616d..bd311c77c254 100644
--- a/drivers/gpu/drm/drm_vm.c
+++ b/drivers/gpu/drm/drm_vm.c
@@ -124,8 +124,7 @@ static int drm_do_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	 * Using vm_pgoff as a selector forces us to use this unusual
 	 * addressing scheme.
 	 */
-	resource_size_t offset = (unsigned long)vmf->virtual_address -
-		vma->vm_start;
+	resource_size_t offset = vmf->address - vma->vm_start;
 	resource_size_t baddr = map->offset + offset;
 	struct drm_agp_mem *agpmem;
 	struct page *page;
@@ -195,7 +194,7 @@ static int drm_do_vm_shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (!map)
 		return VM_FAULT_SIGBUS;	/* Nothing allocated */
 
-	offset = (unsigned long)vmf->virtual_address - vma->vm_start;
+	offset = vmf->address - vma->vm_start;
 	i = (unsigned long)map->handle + offset;
 	page = vmalloc_to_page((void *)i);
 	if (!page)
@@ -301,7 +300,8 @@ static int drm_do_vm_dma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (!dma->pagelist)
 		return VM_FAULT_SIGBUS;	/* Nothing allocated */
 
-	offset = (unsigned long)vmf->virtual_address - vma->vm_start;	/* vm_[pg]off[set] should be 0 */
+	offset = vmf->address - vma->vm_start;
+					/* vm_[pg]off[set] should be 0 */
 	page_nr = offset >> PAGE_SHIFT; /* page_nr could just be vmf->pgoff */
 	page = virt_to_page((void *)dma->pagelist[page_nr]);
 
@@ -337,7 +337,7 @@ static int drm_do_vm_sg_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
337 if (!entry->pagelist) 337 if (!entry->pagelist)
338 return VM_FAULT_SIGBUS; /* Nothing allocated */ 338 return VM_FAULT_SIGBUS; /* Nothing allocated */
339 339
340 offset = (unsigned long)vmf->virtual_address - vma->vm_start; 340 offset = vmf->address - vma->vm_start;
341 map_offset = map->offset - (unsigned long)dev->sg->virtual; 341 map_offset = map->offset - (unsigned long)dev->sg->virtual;
342 page_offset = (offset >> PAGE_SHIFT) + (map_offset >> PAGE_SHIFT); 342 page_offset = (offset >> PAGE_SHIFT) + (map_offset >> PAGE_SHIFT);
343 page = entry->pagelist[page_offset]; 343 page = entry->pagelist[page_offset];
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
index 7d066a91d778..114dddbd297b 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
@@ -202,15 +202,14 @@ int etnaviv_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
202 } 202 }
203 203
204 /* We don't use vmf->pgoff since that has the fake offset: */ 204 /* We don't use vmf->pgoff since that has the fake offset: */
205 pgoff = ((unsigned long)vmf->virtual_address - 205 pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
206 vma->vm_start) >> PAGE_SHIFT;
207 206
208 page = pages[pgoff]; 207 page = pages[pgoff];
209 208
210 VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, 209 VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address,
211 page_to_pfn(page), page_to_pfn(page) << PAGE_SHIFT); 210 page_to_pfn(page), page_to_pfn(page) << PAGE_SHIFT);
212 211
213 ret = vm_insert_page(vma, (unsigned long)vmf->virtual_address, page); 212 ret = vm_insert_page(vma, vmf->address, page);
214 213
215out: 214out:
216 switch (ret) { 215 switch (ret) {
@@ -759,7 +758,7 @@ static struct page **etnaviv_gem_userptr_do_get_pages(
759 down_read(&mm->mmap_sem); 758 down_read(&mm->mmap_sem);
760 while (pinned < npages) { 759 while (pinned < npages) {
761 ret = get_user_pages_remote(task, mm, ptr, npages - pinned, 760 ret = get_user_pages_remote(task, mm, ptr, npages - pinned,
762 flags, pvec + pinned, NULL); 761 flags, pvec + pinned, NULL, NULL);
763 if (ret < 0) 762 if (ret < 0)
764 break; 763 break;
765 764
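
The get_user_pages_remote() caller above (and the i915, InfiniBand, VFIO and exec callers later in this diff) gains one extra trailing argument, passed as NULL throughout the series; it appears to be an `int *locked` out-parameter that lets GUP drop mmap_sem internally, and passing NULL preserves the old behaviour. A hedged sketch of the updated call:

    #include <linux/mm.h>
    #include <linux/sched.h>

    /* Sketch only: pin nr_pages of a foreign mm for writing.  The final
     * NULL is the new "locked" argument; NULL keeps the old semantics. */
    static long example_pin_remote(struct task_struct *task, struct mm_struct *mm,
                                   unsigned long start, unsigned long nr_pages,
                                   struct page **pages)
    {
            long pinned;

            down_read(&mm->mmap_sem);
            pinned = get_user_pages_remote(task, mm, start, nr_pages, FOLL_WRITE,
                                           pages, NULL /* vmas */,
                                           NULL /* locked */);
            up_read(&mm->mmap_sem);
            return pinned;
    }
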
diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.c b/drivers/gpu/drm/exynos/exynos_drm_gem.c
index ea7a18230888..57b81460fec8 100644
--- a/drivers/gpu/drm/exynos/exynos_drm_gem.c
+++ b/drivers/gpu/drm/exynos/exynos_drm_gem.c
@@ -455,8 +455,7 @@ int exynos_drm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
455 pgoff_t page_offset; 455 pgoff_t page_offset;
456 int ret; 456 int ret;
457 457
458 page_offset = ((unsigned long)vmf->virtual_address - 458 page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
459 vma->vm_start) >> PAGE_SHIFT;
460 459
461 if (page_offset >= (exynos_gem->size >> PAGE_SHIFT)) { 460 if (page_offset >= (exynos_gem->size >> PAGE_SHIFT)) {
462 DRM_ERROR("invalid page offset\n"); 461 DRM_ERROR("invalid page offset\n");
@@ -465,8 +464,7 @@ int exynos_drm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
465 } 464 }
466 465
467 pfn = page_to_pfn(exynos_gem->pages[page_offset]); 466 pfn = page_to_pfn(exynos_gem->pages[page_offset]);
468 ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, 467 ret = vm_insert_mixed(vma, vmf->address, __pfn_to_pfn_t(pfn, PFN_DEV));
469 __pfn_to_pfn_t(pfn, PFN_DEV));
470 468
471out: 469out:
472 switch (ret) { 470 switch (ret) {
diff --git a/drivers/gpu/drm/gma500/framebuffer.c b/drivers/gpu/drm/gma500/framebuffer.c
index 4071b2d1e8cf..8b44fa542562 100644
--- a/drivers/gpu/drm/gma500/framebuffer.c
+++ b/drivers/gpu/drm/gma500/framebuffer.c
@@ -125,7 +125,7 @@ static int psbfb_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
125 psbfb->gtt->offset; 125 psbfb->gtt->offset;
126 126
127 page_num = vma_pages(vma); 127 page_num = vma_pages(vma);
128 address = (unsigned long)vmf->virtual_address - (vmf->pgoff << PAGE_SHIFT); 128 address = vmf->address - (vmf->pgoff << PAGE_SHIFT);
129 129
130 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); 130 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
131 131
diff --git a/drivers/gpu/drm/gma500/gem.c b/drivers/gpu/drm/gma500/gem.c
index 6d1cb6b370b1..527c62917660 100644
--- a/drivers/gpu/drm/gma500/gem.c
+++ b/drivers/gpu/drm/gma500/gem.c
@@ -197,15 +197,14 @@ int psb_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
197 197
198 /* Page relative to the VMA start - we must calculate this ourselves 198 /* Page relative to the VMA start - we must calculate this ourselves
199 because vmf->pgoff is the fake GEM offset */ 199 because vmf->pgoff is the fake GEM offset */
200 page_offset = ((unsigned long) vmf->virtual_address - vma->vm_start) 200 page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
201 >> PAGE_SHIFT;
202 201
203 /* CPU view of the page, don't go via the GART for CPU writes */ 202 /* CPU view of the page, don't go via the GART for CPU writes */
204 if (r->stolen) 203 if (r->stolen)
205 pfn = (dev_priv->stolen_base + r->offset) >> PAGE_SHIFT; 204 pfn = (dev_priv->stolen_base + r->offset) >> PAGE_SHIFT;
206 else 205 else
207 pfn = page_to_pfn(r->pages[page_offset]); 206 pfn = page_to_pfn(r->pages[page_offset]);
208 ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn); 207 ret = vm_insert_pfn(vma, vmf->address, pfn);
209 208
210fail: 209fail:
211 mutex_unlock(&dev_priv->mmap_mutex); 210 mutex_unlock(&dev_priv->mmap_mutex);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index d0dcaf35b429..412f3513f269 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1796,8 +1796,7 @@ int i915_gem_fault(struct vm_area_struct *area, struct vm_fault *vmf)
1796 int ret; 1796 int ret;
1797 1797
1798 /* We don't use vmf->pgoff since that has the fake offset */ 1798 /* We don't use vmf->pgoff since that has the fake offset */
1799 page_offset = ((unsigned long)vmf->virtual_address - area->vm_start) >> 1799 page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT;
1800 PAGE_SHIFT;
1801 1800
1802 trace_i915_gem_object_fault(obj, page_offset, true, write); 1801 trace_i915_gem_object_fault(obj, page_offset, true, write);
1803 1802
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 107ddf51065e..d068af2ec3a3 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -515,7 +515,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work)
515 obj->userptr.ptr + pinned * PAGE_SIZE, 515 obj->userptr.ptr + pinned * PAGE_SIZE,
516 npages - pinned, 516 npages - pinned,
517 flags, 517 flags,
518 pvec + pinned, NULL); 518 pvec + pinned, NULL, NULL);
519 if (ret < 0) 519 if (ret < 0)
520 break; 520 break;
521 521
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
index cd06cfd94687..d8bc59c7e261 100644
--- a/drivers/gpu/drm/msm/msm_gem.c
+++ b/drivers/gpu/drm/msm/msm_gem.c
@@ -225,16 +225,14 @@ int msm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
225 } 225 }
226 226
227 /* We don't use vmf->pgoff since that has the fake offset: */ 227 /* We don't use vmf->pgoff since that has the fake offset: */
228 pgoff = ((unsigned long)vmf->virtual_address - 228 pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
229 vma->vm_start) >> PAGE_SHIFT;
230 229
231 pfn = page_to_pfn(pages[pgoff]); 230 pfn = page_to_pfn(pages[pgoff]);
232 231
233 VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, 232 VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address,
234 pfn, pfn << PAGE_SHIFT); 233 pfn, pfn << PAGE_SHIFT);
235 234
236 ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, 235 ret = vm_insert_mixed(vma, vmf->address, __pfn_to_pfn_t(pfn, PFN_DEV));
237 __pfn_to_pfn_t(pfn, PFN_DEV));
238 236
239out_unlock: 237out_unlock:
240 mutex_unlock(&dev->struct_mutex); 238 mutex_unlock(&dev->struct_mutex);
diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c
index d4e1e11466f8..4a90c690f09e 100644
--- a/drivers/gpu/drm/omapdrm/omap_gem.c
+++ b/drivers/gpu/drm/omapdrm/omap_gem.c
@@ -398,8 +398,7 @@ static int fault_1d(struct drm_gem_object *obj,
398 pgoff_t pgoff; 398 pgoff_t pgoff;
399 399
400 /* We don't use vmf->pgoff since that has the fake offset: */ 400 /* We don't use vmf->pgoff since that has the fake offset: */
401 pgoff = ((unsigned long)vmf->virtual_address - 401 pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
402 vma->vm_start) >> PAGE_SHIFT;
403 402
404 if (omap_obj->pages) { 403 if (omap_obj->pages) {
405 omap_gem_cpu_sync(obj, pgoff); 404 omap_gem_cpu_sync(obj, pgoff);
@@ -409,11 +408,10 @@ static int fault_1d(struct drm_gem_object *obj,
409 pfn = (omap_obj->paddr >> PAGE_SHIFT) + pgoff; 408 pfn = (omap_obj->paddr >> PAGE_SHIFT) + pgoff;
410 } 409 }
411 410
412 VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, 411 VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address,
413 pfn, pfn << PAGE_SHIFT); 412 pfn, pfn << PAGE_SHIFT);
414 413
415 return vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, 414 return vm_insert_mixed(vma, vmf->address, __pfn_to_pfn_t(pfn, PFN_DEV));
416 __pfn_to_pfn_t(pfn, PFN_DEV));
417} 415}
418 416
419/* Special handling for the case of faulting in 2d tiled buffers */ 417/* Special handling for the case of faulting in 2d tiled buffers */
@@ -427,7 +425,7 @@ static int fault_2d(struct drm_gem_object *obj,
427 struct page *pages[64]; /* XXX is this too much to have on stack? */ 425 struct page *pages[64]; /* XXX is this too much to have on stack? */
428 unsigned long pfn; 426 unsigned long pfn;
429 pgoff_t pgoff, base_pgoff; 427 pgoff_t pgoff, base_pgoff;
430 void __user *vaddr; 428 unsigned long vaddr;
431 int i, ret, slots; 429 int i, ret, slots;
432 430
433 /* 431 /*
@@ -447,8 +445,7 @@ static int fault_2d(struct drm_gem_object *obj,
447 const int m = 1 + ((omap_obj->width << fmt) / PAGE_SIZE); 445 const int m = 1 + ((omap_obj->width << fmt) / PAGE_SIZE);
448 446
449 /* We don't use vmf->pgoff since that has the fake offset: */ 447 /* We don't use vmf->pgoff since that has the fake offset: */
450 pgoff = ((unsigned long)vmf->virtual_address - 448 pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
451 vma->vm_start) >> PAGE_SHIFT;
452 449
453 /* 450 /*
454 * Actual address we start mapping at is rounded down to previous slot 451 * Actual address we start mapping at is rounded down to previous slot
@@ -459,7 +456,7 @@ static int fault_2d(struct drm_gem_object *obj,
459 /* figure out buffer width in slots */ 456 /* figure out buffer width in slots */
460 slots = omap_obj->width >> priv->usergart[fmt].slot_shift; 457 slots = omap_obj->width >> priv->usergart[fmt].slot_shift;
461 458
462 vaddr = vmf->virtual_address - ((pgoff - base_pgoff) << PAGE_SHIFT); 459 vaddr = vmf->address - ((pgoff - base_pgoff) << PAGE_SHIFT);
463 460
464 entry = &priv->usergart[fmt].entry[priv->usergart[fmt].last]; 461 entry = &priv->usergart[fmt].entry[priv->usergart[fmt].last];
465 462
@@ -503,12 +500,11 @@ static int fault_2d(struct drm_gem_object *obj,
503 500
504 pfn = entry->paddr >> PAGE_SHIFT; 501 pfn = entry->paddr >> PAGE_SHIFT;
505 502
506 VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, 503 VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address,
507 pfn, pfn << PAGE_SHIFT); 504 pfn, pfn << PAGE_SHIFT);
508 505
509 for (i = n; i > 0; i--) { 506 for (i = n; i > 0; i--) {
510 vm_insert_mixed(vma, (unsigned long)vaddr, 507 vm_insert_mixed(vma, vaddr, __pfn_to_pfn_t(pfn, PFN_DEV));
511 __pfn_to_pfn_t(pfn, PFN_DEV));
512 pfn += priv->usergart[fmt].stride_pfn; 508 pfn += priv->usergart[fmt].stride_pfn;
513 vaddr += PAGE_SIZE * m; 509 vaddr += PAGE_SIZE * m;
514 } 510 }
diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c
index c08e5279eeac..7d853e6b5ff0 100644
--- a/drivers/gpu/drm/tegra/gem.c
+++ b/drivers/gpu/drm/tegra/gem.c
@@ -452,10 +452,10 @@ static int tegra_bo_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
452 if (!bo->pages) 452 if (!bo->pages)
453 return VM_FAULT_SIGBUS; 453 return VM_FAULT_SIGBUS;
454 454
455 offset = ((unsigned long)vmf->virtual_address - vma->vm_start) >> PAGE_SHIFT; 455 offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
456 page = bo->pages[offset]; 456 page = bo->pages[offset];
457 457
458 err = vm_insert_page(vma, (unsigned long)vmf->virtual_address, page); 458 err = vm_insert_page(vma, vmf->address, page);
459 switch (err) { 459 switch (err) {
460 case -EAGAIN: 460 case -EAGAIN:
461 case 0: 461 case 0:
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index 4748aedc933a..68ef993ab431 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -101,7 +101,7 @@ static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
101 struct page *page; 101 struct page *page;
102 int ret; 102 int ret;
103 int i; 103 int i;
104 unsigned long address = (unsigned long)vmf->virtual_address; 104 unsigned long address = vmf->address;
105 int retval = VM_FAULT_NOPAGE; 105 int retval = VM_FAULT_NOPAGE;
106 struct ttm_mem_type_manager *man = 106 struct ttm_mem_type_manager *man =
107 &bdev->man[bo->mem.mem_type]; 107 &bdev->man[bo->mem.mem_type];
diff --git a/drivers/gpu/drm/udl/udl_gem.c b/drivers/gpu/drm/udl/udl_gem.c
index 818e70712b18..3c0c4bd3f750 100644
--- a/drivers/gpu/drm/udl/udl_gem.c
+++ b/drivers/gpu/drm/udl/udl_gem.c
@@ -107,14 +107,13 @@ int udl_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
107 unsigned int page_offset; 107 unsigned int page_offset;
108 int ret = 0; 108 int ret = 0;
109 109
110 page_offset = ((unsigned long)vmf->virtual_address - vma->vm_start) >> 110 page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
111 PAGE_SHIFT;
112 111
113 if (!obj->pages) 112 if (!obj->pages)
114 return VM_FAULT_SIGBUS; 113 return VM_FAULT_SIGBUS;
115 114
116 page = obj->pages[page_offset]; 115 page = obj->pages[page_offset];
117 ret = vm_insert_page(vma, (unsigned long)vmf->virtual_address, page); 116 ret = vm_insert_page(vma, vmf->address, page);
118 switch (ret) { 117 switch (ret) {
119 case -EAGAIN: 118 case -EAGAIN:
120 case 0: 119 case 0:
diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c
index f36c14729b55..477e07f0ecb6 100644
--- a/drivers/gpu/drm/vgem/vgem_drv.c
+++ b/drivers/gpu/drm/vgem/vgem_drv.c
@@ -54,7 +54,7 @@ static int vgem_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
54{ 54{
55 struct drm_vgem_gem_object *obj = vma->vm_private_data; 55 struct drm_vgem_gem_object *obj = vma->vm_private_data;
56 /* We don't use vmf->pgoff since that has the fake offset */ 56 /* We don't use vmf->pgoff since that has the fake offset */
57 unsigned long vaddr = (unsigned long)vmf->virtual_address; 57 unsigned long vaddr = vmf->address;
58 struct page *page; 58 struct page *page;
59 59
60 page = shmem_read_mapping_page(file_inode(obj->base.filp)->i_mapping, 60 page = shmem_read_mapping_page(file_inode(obj->base.filp)->i_mapping,
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index 1f0fe3217f23..6b079a31dced 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -578,7 +578,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
578 */ 578 */
579 npages = get_user_pages_remote(owning_process, owning_mm, 579 npages = get_user_pages_remote(owning_process, owning_mm,
580 user_virt, gup_num_pages, 580 user_virt, gup_num_pages,
581 flags, local_page_list, NULL); 581 flags, local_page_list, NULL, NULL);
582 up_read(&owning_mm->mmap_sem); 582 up_read(&owning_mm->mmap_sem);
583 583
584 if (npages < 0) 584 if (npages < 0)
diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c
index 1db0af6c7f94..ba63ca57ed7e 100644
--- a/drivers/media/v4l2-core/videobuf-dma-sg.c
+++ b/drivers/media/v4l2-core/videobuf-dma-sg.c
@@ -439,13 +439,12 @@ static int videobuf_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
439 struct page *page; 439 struct page *page;
440 440
441 dprintk(3, "fault: fault @ %08lx [vma %08lx-%08lx]\n", 441 dprintk(3, "fault: fault @ %08lx [vma %08lx-%08lx]\n",
442 (unsigned long)vmf->virtual_address, 442 vmf->address, vma->vm_start, vma->vm_end);
443 vma->vm_start, vma->vm_end);
444 443
445 page = alloc_page(GFP_USER | __GFP_DMA32); 444 page = alloc_page(GFP_USER | __GFP_DMA32);
446 if (!page) 445 if (!page)
447 return VM_FAULT_OOM; 446 return VM_FAULT_OOM;
448 clear_user_highpage(page, (unsigned long)vmf->virtual_address); 447 clear_user_highpage(page, vmf->address);
449 vmf->page = page; 448 vmf->page = page;
450 449
451 return 0; 450 return 0;
diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c
index 5e506c19108a..5d36dcc7f47e 100644
--- a/drivers/misc/cxl/context.c
+++ b/drivers/misc/cxl/context.c
@@ -117,13 +117,12 @@ int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master,
117static int cxl_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 117static int cxl_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
118{ 118{
119 struct cxl_context *ctx = vma->vm_file->private_data; 119 struct cxl_context *ctx = vma->vm_file->private_data;
120 unsigned long address = (unsigned long)vmf->virtual_address;
121 u64 area, offset; 120 u64 area, offset;
122 121
123 offset = vmf->pgoff << PAGE_SHIFT; 122 offset = vmf->pgoff << PAGE_SHIFT;
124 123
125 pr_devel("%s: pe: %i address: 0x%lx offset: 0x%llx\n", 124 pr_devel("%s: pe: %i address: 0x%lx offset: 0x%llx\n",
126 __func__, ctx->pe, address, offset); 125 __func__, ctx->pe, vmf->address, offset);
127 126
128 if (ctx->afu->current_mode == CXL_MODE_DEDICATED) { 127 if (ctx->afu->current_mode == CXL_MODE_DEDICATED) {
129 area = ctx->afu->psn_phys; 128 area = ctx->afu->psn_phys;
@@ -155,7 +154,7 @@ static int cxl_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
155 return VM_FAULT_SIGBUS; 154 return VM_FAULT_SIGBUS;
156 } 155 }
157 156
158 vm_insert_pfn(vma, address, (area + offset) >> PAGE_SHIFT); 157 vm_insert_pfn(vma, vmf->address, (area + offset) >> PAGE_SHIFT);
159 158
160 mutex_unlock(&ctx->status_mutex); 159 mutex_unlock(&ctx->status_mutex);
161 160
diff --git a/drivers/misc/sgi-gru/grumain.c b/drivers/misc/sgi-gru/grumain.c
index 33741ad4a74a..af2e077da4b8 100644
--- a/drivers/misc/sgi-gru/grumain.c
+++ b/drivers/misc/sgi-gru/grumain.c
@@ -932,7 +932,7 @@ int gru_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
932 unsigned long paddr, vaddr; 932 unsigned long paddr, vaddr;
933 unsigned long expires; 933 unsigned long expires;
934 934
935 vaddr = (unsigned long)vmf->virtual_address; 935 vaddr = vmf->address;
936 gru_dbg(grudev, "vma %p, vaddr 0x%lx (0x%lx)\n", 936 gru_dbg(grudev, "vma %p, vaddr 0x%lx (0x%lx)\n",
937 vma, vaddr, GSEG_BASE(vaddr)); 937 vma, vaddr, GSEG_BASE(vaddr));
938 STAT(nopfn); 938 STAT(nopfn);
diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
index d11093dce1b9..acbc3abe2ddd 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -210,7 +210,12 @@ struct igb_tx_buffer {
210struct igb_rx_buffer { 210struct igb_rx_buffer {
211 dma_addr_t dma; 211 dma_addr_t dma;
212 struct page *page; 212 struct page *page;
213 unsigned int page_offset; 213#if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536)
214 __u32 page_offset;
215#else
216 __u16 page_offset;
217#endif
218 __u16 pagecnt_bias;
214}; 219};
215 220
216struct igb_tx_queue_stats { 221struct igb_tx_queue_stats {
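
The new pagecnt_bias field lets the driver batch its page reference accounting: rather than bumping the page refcount on every reuse, it remembers how many references it owns and restocks them in bulk (capped at USHRT_MAX in the igb_main.c hunks below), which is why a __u16 is wide enough whenever the page offset also fits in 16 bits. A tiny sketch of the reuse test this enables (the function name is illustrative):

    #include <linux/mm.h>
    #include <linux/page_ref.h>

    /* The page may be recycled only while the driver's bias accounts for
     * every reference on it, i.e. no skb or other user still holds it. */
    static bool example_rx_page_is_exclusive(struct page *page,
                                             unsigned int pagecnt_bias)
    {
            return page_ref_count(page) == pagecnt_bias;
    }
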
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index cae24a8ccf47..a761001308dc 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -3947,11 +3947,23 @@ static void igb_clean_rx_ring(struct igb_ring *rx_ring)
3947 if (!buffer_info->page) 3947 if (!buffer_info->page)
3948 continue; 3948 continue;
3949 3949
3950 dma_unmap_page(rx_ring->dev, 3950 /* Invalidate cache lines that may have been written to by
3951 buffer_info->dma, 3951 * device so that we avoid corrupting memory.
3952 PAGE_SIZE, 3952 */
3953 DMA_FROM_DEVICE); 3953 dma_sync_single_range_for_cpu(rx_ring->dev,
3954 __free_page(buffer_info->page); 3954 buffer_info->dma,
3955 buffer_info->page_offset,
3956 IGB_RX_BUFSZ,
3957 DMA_FROM_DEVICE);
3958
3959 /* free resources associated with mapping */
3960 dma_unmap_page_attrs(rx_ring->dev,
3961 buffer_info->dma,
3962 PAGE_SIZE,
3963 DMA_FROM_DEVICE,
3964 DMA_ATTR_SKIP_CPU_SYNC);
3965 __page_frag_drain(buffer_info->page, 0,
3966 buffer_info->pagecnt_bias);
3955 3967
3956 buffer_info->page = NULL; 3968 buffer_info->page = NULL;
3957 } 3969 }
@@ -6812,12 +6824,6 @@ static void igb_reuse_rx_page(struct igb_ring *rx_ring,
6812 6824
6813 /* transfer page from old buffer to new buffer */ 6825 /* transfer page from old buffer to new buffer */
6814 *new_buff = *old_buff; 6826 *new_buff = *old_buff;
6815
6816 /* sync the buffer for use by the device */
6817 dma_sync_single_range_for_device(rx_ring->dev, old_buff->dma,
6818 old_buff->page_offset,
6819 IGB_RX_BUFSZ,
6820 DMA_FROM_DEVICE);
6821} 6827}
6822 6828
6823static inline bool igb_page_is_reserved(struct page *page) 6829static inline bool igb_page_is_reserved(struct page *page)
@@ -6829,13 +6835,15 @@ static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer,
6829 struct page *page, 6835 struct page *page,
6830 unsigned int truesize) 6836 unsigned int truesize)
6831{ 6837{
6838 unsigned int pagecnt_bias = rx_buffer->pagecnt_bias--;
6839
6832 /* avoid re-using remote pages */ 6840 /* avoid re-using remote pages */
6833 if (unlikely(igb_page_is_reserved(page))) 6841 if (unlikely(igb_page_is_reserved(page)))
6834 return false; 6842 return false;
6835 6843
6836#if (PAGE_SIZE < 8192) 6844#if (PAGE_SIZE < 8192)
6837 /* if we are only owner of page we can reuse it */ 6845 /* if we are only owner of page we can reuse it */
6838 if (unlikely(page_count(page) != 1)) 6846 if (unlikely(page_ref_count(page) != pagecnt_bias))
6839 return false; 6847 return false;
6840 6848
6841 /* flip page offset to other buffer */ 6849 /* flip page offset to other buffer */
@@ -6848,10 +6856,14 @@ static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer,
6848 return false; 6856 return false;
6849#endif 6857#endif
6850 6858
6851 /* Even if we own the page, we are not allowed to use atomic_set() 6859 /* If we have drained the page fragment pool we need to update
6852 * This would break get_page_unless_zero() users. 6860 * the pagecnt_bias and page count so that we fully restock the
6861 * number of references the driver holds.
6853 */ 6862 */
6854 page_ref_inc(page); 6863 if (unlikely(pagecnt_bias == 1)) {
6864 page_ref_add(page, USHRT_MAX);
6865 rx_buffer->pagecnt_bias = USHRT_MAX;
6866 }
6855 6867
6856 return true; 6868 return true;
6857} 6869}
@@ -6903,7 +6915,6 @@ static bool igb_add_rx_frag(struct igb_ring *rx_ring,
6903 return true; 6915 return true;
6904 6916
6905 /* this page cannot be reused so discard it */ 6917 /* this page cannot be reused so discard it */
6906 __free_page(page);
6907 return false; 6918 return false;
6908 } 6919 }
6909 6920
@@ -6938,6 +6949,13 @@ static struct sk_buff *igb_fetch_rx_buffer(struct igb_ring *rx_ring,
6938 page = rx_buffer->page; 6949 page = rx_buffer->page;
6939 prefetchw(page); 6950 prefetchw(page);
6940 6951
6952 /* we are reusing so sync this buffer for CPU use */
6953 dma_sync_single_range_for_cpu(rx_ring->dev,
6954 rx_buffer->dma,
6955 rx_buffer->page_offset,
6956 size,
6957 DMA_FROM_DEVICE);
6958
6941 if (likely(!skb)) { 6959 if (likely(!skb)) {
6942 void *page_addr = page_address(page) + 6960 void *page_addr = page_address(page) +
6943 rx_buffer->page_offset; 6961 rx_buffer->page_offset;
@@ -6962,21 +6980,18 @@ static struct sk_buff *igb_fetch_rx_buffer(struct igb_ring *rx_ring,
6962 prefetchw(skb->data); 6980 prefetchw(skb->data);
6963 } 6981 }
6964 6982
6965 /* we are reusing so sync this buffer for CPU use */
6966 dma_sync_single_range_for_cpu(rx_ring->dev,
6967 rx_buffer->dma,
6968 rx_buffer->page_offset,
6969 size,
6970 DMA_FROM_DEVICE);
6971
6972 /* pull page into skb */ 6983 /* pull page into skb */
6973 if (igb_add_rx_frag(rx_ring, rx_buffer, size, rx_desc, skb)) { 6984 if (igb_add_rx_frag(rx_ring, rx_buffer, size, rx_desc, skb)) {
6974 /* hand second half of page back to the ring */ 6985 /* hand second half of page back to the ring */
6975 igb_reuse_rx_page(rx_ring, rx_buffer); 6986 igb_reuse_rx_page(rx_ring, rx_buffer);
6976 } else { 6987 } else {
6977 /* we are not reusing the buffer so unmap it */ 6988 /* We are not reusing the buffer so unmap it and free
6978 dma_unmap_page(rx_ring->dev, rx_buffer->dma, 6989 * any references we are holding to it
6979 PAGE_SIZE, DMA_FROM_DEVICE); 6990 */
6991 dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma,
6992 PAGE_SIZE, DMA_FROM_DEVICE,
6993 DMA_ATTR_SKIP_CPU_SYNC);
6994 __page_frag_drain(page, 0, rx_buffer->pagecnt_bias);
6980 } 6995 }
6981 6996
6982 /* clear contents of rx_buffer */ 6997 /* clear contents of rx_buffer */
@@ -7234,7 +7249,8 @@ static bool igb_alloc_mapped_page(struct igb_ring *rx_ring,
7234 } 7249 }
7235 7250
7236 /* map page for use */ 7251 /* map page for use */
7237 dma = dma_map_page(rx_ring->dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE); 7252 dma = dma_map_page_attrs(rx_ring->dev, page, 0, PAGE_SIZE,
7253 DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC);
7238 7254
7239 /* if mapping failed free memory back to system since 7255 /* if mapping failed free memory back to system since
7240 * there isn't much point in holding memory we can't use 7256 * there isn't much point in holding memory we can't use
@@ -7249,6 +7265,7 @@ static bool igb_alloc_mapped_page(struct igb_ring *rx_ring,
7249 bi->dma = dma; 7265 bi->dma = dma;
7250 bi->page = page; 7266 bi->page = page;
7251 bi->page_offset = 0; 7267 bi->page_offset = 0;
7268 bi->pagecnt_bias = 1;
7252 7269
7253 return true; 7270 return true;
7254} 7271}
@@ -7275,6 +7292,12 @@ void igb_alloc_rx_buffers(struct igb_ring *rx_ring, u16 cleaned_count)
7275 if (!igb_alloc_mapped_page(rx_ring, bi)) 7292 if (!igb_alloc_mapped_page(rx_ring, bi))
7276 break; 7293 break;
7277 7294
7295 /* sync the buffer for use by the device */
7296 dma_sync_single_range_for_device(rx_ring->dev, bi->dma,
7297 bi->page_offset,
7298 IGB_RX_BUFSZ,
7299 DMA_FROM_DEVICE);
7300
7278 /* Refresh the desc even if buffer_addrs didn't change 7301 /* Refresh the desc even if buffer_addrs didn't change
7279 * because each write-back erases this info. 7302 * because each write-back erases this info.
7280 */ 7303 */
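
Taken together, the igb hunks above switch the receive path from unmap/remap-per-frame to a map-once scheme: the page is mapped with DMA_ATTR_SKIP_CPU_SYNC so the streaming-DMA code skips its implicit cache maintenance, and the driver issues the cheap per-buffer dma_sync_single_range_for_{cpu,device}() calls itself at the ownership transitions. A condensed sketch of that lifecycle, with ring bookkeeping and buffer sizes simplified:

    #include <linux/dma-mapping.h>

    /* Map an RX page once; the explicit syncs below replace the cache work
     * that dma_map_page()/dma_unmap_page() would otherwise do implicitly. */
    static int example_map_rx_page(struct device *dev, struct page *page,
                                   dma_addr_t *dma)
    {
            *dma = dma_map_page_attrs(dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE,
                                      DMA_ATTR_SKIP_CPU_SYNC);
            return dma_mapping_error(dev, *dma) ? -ENOMEM : 0;
    }

    static void example_rx_lifecycle(struct device *dev, dma_addr_t dma,
                                     unsigned int offset, unsigned int len)
    {
            /* before handing the buffer to the device */
            dma_sync_single_range_for_device(dev, dma, offset, len,
                                             DMA_FROM_DEVICE);
            /* ... device writes a frame ... */

            /* before the CPU reads the received data */
            dma_sync_single_range_for_cpu(dev, dma, offset, len,
                                          DMA_FROM_DEVICE);

            /* when the page is finally retired */
            dma_unmap_page_attrs(dev, dma, PAGE_SIZE, DMA_FROM_DEVICE,
                                 DMA_ATTR_SKIP_CPU_SYNC);
    }
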
diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/calib.c b/drivers/net/wireless/intel/iwlwifi/dvm/calib.c
index e9cef9de9ed8..c96f9b1d948a 100644
--- a/drivers/net/wireless/intel/iwlwifi/dvm/calib.c
+++ b/drivers/net/wireless/intel/iwlwifi/dvm/calib.c
@@ -900,8 +900,7 @@ static void iwlagn_gain_computation(struct iwl_priv *priv,
900 900
901 /* bound gain by 2 bits value max, 3rd bit is sign */ 901 /* bound gain by 2 bits value max, 3rd bit is sign */
902 data->delta_gain_code[i] = 902 data->delta_gain_code[i] =
903 min(abs(delta_g), 903 min(abs(delta_g), CHAIN_NOISE_MAX_DELTA_GAIN_CODE);
904 (s32) CHAIN_NOISE_MAX_DELTA_GAIN_CODE);
905 904
906 if (delta_g < 0) 905 if (delta_g < 0)
907 /* 906 /*
diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c
index d5cc3070e83f..b653451843c8 100644
--- a/drivers/staging/android/ion/ion.c
+++ b/drivers/staging/android/ion/ion.c
@@ -882,7 +882,7 @@ static int ion_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
882 BUG_ON(!buffer->pages || !buffer->pages[vmf->pgoff]); 882 BUG_ON(!buffer->pages || !buffer->pages[vmf->pgoff]);
883 883
884 pfn = page_to_pfn(ion_buffer_page(buffer->pages[vmf->pgoff])); 884 pfn = page_to_pfn(ion_buffer_page(buffer->pages[vmf->pgoff]));
885 ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn); 885 ret = vm_insert_pfn(vma, vmf->address, pfn);
886 mutex_unlock(&buffer->lock); 886 mutex_unlock(&buffer->lock);
887 if (ret) 887 if (ret)
888 return VM_FAULT_ERROR; 888 return VM_FAULT_ERROR;
diff --git a/drivers/staging/lustre/lustre/llite/vvp_io.c b/drivers/staging/lustre/lustre/llite/vvp_io.c
index 0b6d388d8aa4..697cbfbe9374 100644
--- a/drivers/staging/lustre/lustre/llite/vvp_io.c
+++ b/drivers/staging/lustre/lustre/llite/vvp_io.c
@@ -1014,7 +1014,7 @@ static int vvp_io_kernel_fault(struct vvp_fault_io *cfio)
1014 "page %p map %p index %lu flags %lx count %u priv %0lx: got addr %p type NOPAGE\n", 1014 "page %p map %p index %lu flags %lx count %u priv %0lx: got addr %p type NOPAGE\n",
1015 vmf->page, vmf->page->mapping, vmf->page->index, 1015 vmf->page, vmf->page->mapping, vmf->page->index,
1016 (long)vmf->page->flags, page_count(vmf->page), 1016 (long)vmf->page->flags, page_count(vmf->page),
1017 page_private(vmf->page), vmf->virtual_address); 1017 page_private(vmf->page), (void *)vmf->address);
1018 if (unlikely(!(cfio->ft_flags & VM_FAULT_LOCKED))) { 1018 if (unlikely(!(cfio->ft_flags & VM_FAULT_LOCKED))) {
1019 lock_page(vmf->page); 1019 lock_page(vmf->page);
1020 cfio->ft_flags |= VM_FAULT_LOCKED; 1020 cfio->ft_flags |= VM_FAULT_LOCKED;
@@ -1025,12 +1025,12 @@ static int vvp_io_kernel_fault(struct vvp_fault_io *cfio)
1025 } 1025 }
1026 1026
1027 if (cfio->ft_flags & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) { 1027 if (cfio->ft_flags & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) {
1028 CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", vmf->virtual_address); 1028 CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", (void *)vmf->address);
1029 return -EFAULT; 1029 return -EFAULT;
1030 } 1030 }
1031 1031
1032 if (cfio->ft_flags & VM_FAULT_OOM) { 1032 if (cfio->ft_flags & VM_FAULT_OOM) {
1033 CDEBUG(D_PAGE, "got addr %p - OOM\n", vmf->virtual_address); 1033 CDEBUG(D_PAGE, "got addr %p - OOM\n", (void *)vmf->address);
1034 return -ENOMEM; 1034 return -ENOMEM;
1035 } 1035 }
1036 1036
diff --git a/drivers/usb/gadget/function/f_hid.c b/drivers/usb/gadget/function/f_hid.c
index 7abd70b2a588..3151d2a0fe59 100644
--- a/drivers/usb/gadget/function/f_hid.c
+++ b/drivers/usb/gadget/function/f_hid.c
@@ -905,7 +905,7 @@ static void hidg_free_inst(struct usb_function_instance *f)
905 mutex_lock(&hidg_ida_lock); 905 mutex_lock(&hidg_ida_lock);
906 906
907 hidg_put_minor(opts->minor); 907 hidg_put_minor(opts->minor);
908 if (idr_is_empty(&hidg_ida.idr)) 908 if (ida_is_empty(&hidg_ida))
909 ghid_cleanup(); 909 ghid_cleanup();
910 910
911 mutex_unlock(&hidg_ida_lock); 911 mutex_unlock(&hidg_ida_lock);
@@ -931,7 +931,7 @@ static struct usb_function_instance *hidg_alloc_inst(void)
931 931
932 mutex_lock(&hidg_ida_lock); 932 mutex_lock(&hidg_ida_lock);
933 933
934 if (idr_is_empty(&hidg_ida.idr)) { 934 if (ida_is_empty(&hidg_ida)) {
935 status = ghid_setup(NULL, HIDG_MINORS); 935 status = ghid_setup(NULL, HIDG_MINORS);
936 if (status) { 936 if (status) {
937 ret = ERR_PTR(status); 937 ret = ERR_PTR(status);
@@ -944,7 +944,7 @@ static struct usb_function_instance *hidg_alloc_inst(void)
944 if (opts->minor < 0) { 944 if (opts->minor < 0) {
945 ret = ERR_PTR(opts->minor); 945 ret = ERR_PTR(opts->minor);
946 kfree(opts); 946 kfree(opts);
947 if (idr_is_empty(&hidg_ida.idr)) 947 if (ida_is_empty(&hidg_ida))
948 ghid_cleanup(); 948 ghid_cleanup();
949 goto unlock; 949 goto unlock;
950 } 950 }
diff --git a/drivers/usb/gadget/function/f_printer.c b/drivers/usb/gadget/function/f_printer.c
index 0de36cda6e41..8054da9276dd 100644
--- a/drivers/usb/gadget/function/f_printer.c
+++ b/drivers/usb/gadget/function/f_printer.c
@@ -1265,7 +1265,7 @@ static void gprinter_free_inst(struct usb_function_instance *f)
1265 mutex_lock(&printer_ida_lock); 1265 mutex_lock(&printer_ida_lock);
1266 1266
1267 gprinter_put_minor(opts->minor); 1267 gprinter_put_minor(opts->minor);
1268 if (idr_is_empty(&printer_ida.idr)) 1268 if (ida_is_empty(&printer_ida))
1269 gprinter_cleanup(); 1269 gprinter_cleanup();
1270 1270
1271 mutex_unlock(&printer_ida_lock); 1271 mutex_unlock(&printer_ida_lock);
@@ -1289,7 +1289,7 @@ static struct usb_function_instance *gprinter_alloc_inst(void)
1289 1289
1290 mutex_lock(&printer_ida_lock); 1290 mutex_lock(&printer_ida_lock);
1291 1291
1292 if (idr_is_empty(&printer_ida.idr)) { 1292 if (ida_is_empty(&printer_ida)) {
1293 status = gprinter_setup(PRINTER_MINORS); 1293 status = gprinter_setup(PRINTER_MINORS);
1294 if (status) { 1294 if (status) {
1295 ret = ERR_PTR(status); 1295 ret = ERR_PTR(status);
@@ -1302,7 +1302,7 @@ static struct usb_function_instance *gprinter_alloc_inst(void)
1302 if (opts->minor < 0) { 1302 if (opts->minor < 0) {
1303 ret = ERR_PTR(opts->minor); 1303 ret = ERR_PTR(opts->minor);
1304 kfree(opts); 1304 kfree(opts);
1305 if (idr_is_empty(&printer_ida.idr)) 1305 if (ida_is_empty(&printer_ida))
1306 gprinter_cleanup(); 1306 gprinter_cleanup();
1307 goto unlock; 1307 goto unlock;
1308 } 1308 }
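
Both gadget functions above stop peeking at the IDA's embedded IDR (idr_is_empty(&x_ida.idr)) and use the new ida_is_empty() helper instead, keeping the minor-number bookkeeping independent of the IDA's internal representation. A sketch of the idiom with a made-up pool:

    #include <linux/idr.h>
    #include <linux/printk.h>

    static DEFINE_IDA(example_minor_ida);  /* illustrative minor-number pool */

    static void example_put_minor(int minor)
    {
            ida_simple_remove(&example_minor_ida, minor);

            /* tear shared state down once the last minor is released */
            if (ida_is_empty(&example_minor_ida))
                    pr_debug("example: no minors left, cleaning up\n");
    }
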
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 9815e45c23c4..f3726ba12aa6 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -362,7 +362,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
362 362
363 down_read(&mm->mmap_sem); 363 down_read(&mm->mmap_sem);
364 ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page, 364 ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page,
365 NULL); 365 NULL, NULL);
366 up_read(&mm->mmap_sem); 366 up_read(&mm->mmap_sem);
367 } 367 }
368 368
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index 702040fe2001..6e3306f4a525 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -602,7 +602,7 @@ static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
602{ 602{
603 printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n", 603 printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
604 vma, vma->vm_start, vma->vm_end, 604 vma, vma->vm_start, vma->vm_end,
605 vmf->pgoff, vmf->virtual_address); 605 vmf->pgoff, (void *)vmf->address);
606 606
607 return VM_FAULT_SIGBUS; 607 return VM_FAULT_SIGBUS;
608} 608}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 180f910339f4..3b713b6fcc26 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -202,12 +202,12 @@ static struct ratelimit_state printk_limits[] = {
202void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) 202void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
203{ 203{
204 struct super_block *sb = fs_info->sb; 204 struct super_block *sb = fs_info->sb;
205 char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1]; 205 char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0";
206 struct va_format vaf; 206 struct va_format vaf;
207 va_list args; 207 va_list args;
208 const char *type = NULL;
209 int kern_level; 208 int kern_level;
210 struct ratelimit_state *ratelimit; 209 const char *type = logtypes[4];
210 struct ratelimit_state *ratelimit = &printk_limits[4];
211 211
212 va_start(args, fmt); 212 va_start(args, fmt);
213 213
@@ -223,12 +223,6 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
223 fmt += size; 223 fmt += size;
224 } 224 }
225 225
226 if (!type) {
227 *lvl = '\0';
228 type = logtypes[4];
229 ratelimit = &printk_limits[4];
230 }
231
232 vaf.fmt = fmt; 226 vaf.fmt = fmt;
233 vaf.va = &args; 227 vaf.va = &args;
234 228
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index bf62ad919a95..00ee006a8aa2 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -162,6 +162,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
162 slot = radix_tree_iter_retry(&iter); 162 slot = radix_tree_iter_retry(&iter);
163 continue; 163 continue;
164 } 164 }
165 slot = radix_tree_iter_resume(slot, &iter);
165 spin_unlock(&fs_info->buffer_lock); 166 spin_unlock(&fs_info->buffer_lock);
166 free_extent_buffer_stale(eb); 167 free_extent_buffer_stale(eb);
167 spin_lock(&fs_info->buffer_lock); 168 spin_lock(&fs_info->buffer_lock);
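
The one-line addition above uses the new radix_tree_iter_resume() helper: once it runs, the iterator no longer depends on the current slot pointer, so the spinlock protecting the tree can be dropped around the per-item work and the walk continues from the next index when the lock is retaken. A sketch of the pattern with a hypothetical item type (retry-entry handling elided):

    #include <linux/radix-tree.h>
    #include <linux/spinlock.h>

    struct example_item;                            /* hypothetical */
    void example_release(struct example_item *it);  /* hypothetical */

    static void example_drain(struct radix_tree_root *tree, spinlock_t *lock)
    {
            struct radix_tree_iter iter;
            void **slot;

            spin_lock(lock);
            radix_tree_for_each_slot(slot, tree, &iter, 0) {
                    struct example_item *it =
                            radix_tree_deref_slot_protected(slot, lock);

                    if (!it)
                            continue;
                    /* detach the iterator from this slot before unlocking */
                    slot = radix_tree_iter_resume(slot, &iter);
                    spin_unlock(lock);
                    example_release(it);
                    spin_lock(lock);
            }
            spin_unlock(lock);
    }
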
diff --git a/fs/dax.c b/fs/dax.c
index 5ae8e11ad786..a8732fbed381 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -31,6 +31,7 @@
31#include <linux/vmstat.h> 31#include <linux/vmstat.h>
32#include <linux/pfn_t.h> 32#include <linux/pfn_t.h>
33#include <linux/sizes.h> 33#include <linux/sizes.h>
34#include <linux/mmu_notifier.h>
34#include <linux/iomap.h> 35#include <linux/iomap.h>
35#include "internal.h" 36#include "internal.h"
36 37
@@ -240,6 +241,23 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
240 } 241 }
241} 242}
242 243
244static void dax_unlock_mapping_entry(struct address_space *mapping,
245 pgoff_t index)
246{
247 void *entry, **slot;
248
249 spin_lock_irq(&mapping->tree_lock);
250 entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
251 if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
252 !slot_locked(mapping, slot))) {
253 spin_unlock_irq(&mapping->tree_lock);
254 return;
255 }
256 unlock_slot(mapping, slot);
257 spin_unlock_irq(&mapping->tree_lock);
258 dax_wake_mapping_entry_waiter(mapping, index, entry, false);
259}
260
243static void put_locked_mapping_entry(struct address_space *mapping, 261static void put_locked_mapping_entry(struct address_space *mapping,
244 pgoff_t index, void *entry) 262 pgoff_t index, void *entry)
245{ 263{
@@ -433,22 +451,6 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
433 __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); 451 __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
434} 452}
435 453
436void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
437{
438 void *entry, **slot;
439
440 spin_lock_irq(&mapping->tree_lock);
441 entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
442 if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
443 !slot_locked(mapping, slot))) {
444 spin_unlock_irq(&mapping->tree_lock);
445 return;
446 }
447 unlock_slot(mapping, slot);
448 spin_unlock_irq(&mapping->tree_lock);
449 dax_wake_mapping_entry_waiter(mapping, index, entry, false);
450}
451
452/* 454/*
453 * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree 455 * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
454 * entry to get unlocked before deleting it. 456 * entry to get unlocked before deleting it.
@@ -500,10 +502,8 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
500 /* This will replace locked radix tree entry with a hole page */ 502 /* This will replace locked radix tree entry with a hole page */
501 page = find_or_create_page(mapping, vmf->pgoff, 503 page = find_or_create_page(mapping, vmf->pgoff,
502 vmf->gfp_mask | __GFP_ZERO); 504 vmf->gfp_mask | __GFP_ZERO);
503 if (!page) { 505 if (!page)
504 put_locked_mapping_entry(mapping, vmf->pgoff, entry);
505 return VM_FAULT_OOM; 506 return VM_FAULT_OOM;
506 }
507 vmf->page = page; 507 vmf->page = page;
508 return VM_FAULT_LOCKED; 508 return VM_FAULT_LOCKED;
509} 509}
@@ -615,36 +615,107 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
615 return new_entry; 615 return new_entry;
616} 616}
617 617
618static inline unsigned long
619pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
620{
621 unsigned long address;
622
623 address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
624 VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
625 return address;
626}
627
628/* Walk all mappings of a given index of a file and writeprotect them */
629static void dax_mapping_entry_mkclean(struct address_space *mapping,
630 pgoff_t index, unsigned long pfn)
631{
632 struct vm_area_struct *vma;
633 pte_t *ptep;
634 pte_t pte;
635 spinlock_t *ptl;
636 bool changed;
637
638 i_mmap_lock_read(mapping);
639 vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
640 unsigned long address;
641
642 cond_resched();
643
644 if (!(vma->vm_flags & VM_SHARED))
645 continue;
646
647 address = pgoff_address(index, vma);
648 changed = false;
649 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
650 continue;
651 if (pfn != pte_pfn(*ptep))
652 goto unlock;
653 if (!pte_dirty(*ptep) && !pte_write(*ptep))
654 goto unlock;
655
656 flush_cache_page(vma, address, pfn);
657 pte = ptep_clear_flush(vma, address, ptep);
658 pte = pte_wrprotect(pte);
659 pte = pte_mkclean(pte);
660 set_pte_at(vma->vm_mm, address, ptep, pte);
661 changed = true;
662unlock:
663 pte_unmap_unlock(ptep, ptl);
664
665 if (changed)
666 mmu_notifier_invalidate_page(vma->vm_mm, address);
667 }
668 i_mmap_unlock_read(mapping);
669}
670
618static int dax_writeback_one(struct block_device *bdev, 671static int dax_writeback_one(struct block_device *bdev,
619 struct address_space *mapping, pgoff_t index, void *entry) 672 struct address_space *mapping, pgoff_t index, void *entry)
620{ 673{
621 struct radix_tree_root *page_tree = &mapping->page_tree; 674 struct radix_tree_root *page_tree = &mapping->page_tree;
622 struct radix_tree_node *node;
623 struct blk_dax_ctl dax; 675 struct blk_dax_ctl dax;
624 void **slot; 676 void *entry2, **slot;
625 int ret = 0; 677 int ret = 0;
626 678
627 spin_lock_irq(&mapping->tree_lock);
628 /* 679 /*
629 * Regular page slots are stabilized by the page lock even 680 * A page got tagged dirty in DAX mapping? Something is seriously
630 * without the tree itself locked. These unlocked entries 681 * wrong.
631 * need verification under the tree lock.
632 */ 682 */
633 if (!__radix_tree_lookup(page_tree, index, &node, &slot)) 683 if (WARN_ON(!radix_tree_exceptional_entry(entry)))
634 goto unlock; 684 return -EIO;
635 if (*slot != entry)
636 goto unlock;
637
638 /* another fsync thread may have already written back this entry */
639 if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
640 goto unlock;
641 685
686 spin_lock_irq(&mapping->tree_lock);
687 entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
688 /* Entry got punched out / reallocated? */
689 if (!entry2 || !radix_tree_exceptional_entry(entry2))
690 goto put_unlocked;
691 /*
692 * Entry got reallocated elsewhere? No need to writeback. We have to
693 * compare sectors as we must not bail out due to difference in lockbit
694 * or entry type.
695 */
696 if (dax_radix_sector(entry2) != dax_radix_sector(entry))
697 goto put_unlocked;
642 if (WARN_ON_ONCE(dax_is_empty_entry(entry) || 698 if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
643 dax_is_zero_entry(entry))) { 699 dax_is_zero_entry(entry))) {
644 ret = -EIO; 700 ret = -EIO;
645 goto unlock; 701 goto put_unlocked;
646 } 702 }
647 703
704 /* Another fsync thread may have already written back this entry */
705 if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
706 goto put_unlocked;
707 /* Lock the entry to serialize with page faults */
708 entry = lock_slot(mapping, slot);
709 /*
710 * We can clear the tag now but we have to be careful so that concurrent
711 * dax_writeback_one() calls for the same index cannot finish before we
712 * actually flush the caches. This is achieved as the calls will look
713 * at the entry only under tree_lock and once they do that they will
714 * see the entry locked and wait for it to unlock.
715 */
716 radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
717 spin_unlock_irq(&mapping->tree_lock);
718
648 /* 719 /*
649 * Even if dax_writeback_mapping_range() was given a wbc->range_start 720 * Even if dax_writeback_mapping_range() was given a wbc->range_start
650 * in the middle of a PMD, the 'index' we are given will be aligned to 721 * in the middle of a PMD, the 'index' we are given will be aligned to
@@ -654,31 +725,40 @@ static int dax_writeback_one(struct block_device *bdev,
654 */ 725 */
655 dax.sector = dax_radix_sector(entry); 726 dax.sector = dax_radix_sector(entry);
656 dax.size = PAGE_SIZE << dax_radix_order(entry); 727 dax.size = PAGE_SIZE << dax_radix_order(entry);
657 spin_unlock_irq(&mapping->tree_lock);
658 728
659 /* 729 /*
660 * We cannot hold tree_lock while calling dax_map_atomic() because it 730 * We cannot hold tree_lock while calling dax_map_atomic() because it
661 * eventually calls cond_resched(). 731 * eventually calls cond_resched().
662 */ 732 */
663 ret = dax_map_atomic(bdev, &dax); 733 ret = dax_map_atomic(bdev, &dax);
664 if (ret < 0) 734 if (ret < 0) {
735 put_locked_mapping_entry(mapping, index, entry);
665 return ret; 736 return ret;
737 }
666 738
667 if (WARN_ON_ONCE(ret < dax.size)) { 739 if (WARN_ON_ONCE(ret < dax.size)) {
668 ret = -EIO; 740 ret = -EIO;
669 goto unmap; 741 goto unmap;
670 } 742 }
671 743
744 dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn));
672 wb_cache_pmem(dax.addr, dax.size); 745 wb_cache_pmem(dax.addr, dax.size);
673 746 /*
747 * After we have flushed the cache, we can clear the dirty tag. There
748 * cannot be new dirty data in the pfn after the flush has completed as
749 * the pfn mappings are writeprotected and fault waits for mapping
750 * entry lock.
751 */
674 spin_lock_irq(&mapping->tree_lock); 752 spin_lock_irq(&mapping->tree_lock);
675 radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); 753 radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
676 spin_unlock_irq(&mapping->tree_lock); 754 spin_unlock_irq(&mapping->tree_lock);
677 unmap: 755 unmap:
678 dax_unmap_atomic(bdev, &dax); 756 dax_unmap_atomic(bdev, &dax);
757 put_locked_mapping_entry(mapping, index, entry);
679 return ret; 758 return ret;
680 759
681 unlock: 760 put_unlocked:
761 put_unlocked_mapping_entry(mapping, index, entry2);
682 spin_unlock_irq(&mapping->tree_lock); 762 spin_unlock_irq(&mapping->tree_lock);
683 return ret; 763 return ret;
684} 764}
@@ -738,7 +818,7 @@ static int dax_insert_mapping(struct address_space *mapping,
738 struct block_device *bdev, sector_t sector, size_t size, 818 struct block_device *bdev, sector_t sector, size_t size,
739 void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) 819 void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
740{ 820{
741 unsigned long vaddr = (unsigned long)vmf->virtual_address; 821 unsigned long vaddr = vmf->address;
742 struct blk_dax_ctl dax = { 822 struct blk_dax_ctl dax = {
743 .sector = sector, 823 .sector = sector,
744 .size = size, 824 .size = size,
@@ -767,17 +847,27 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
767{ 847{
768 struct file *file = vma->vm_file; 848 struct file *file = vma->vm_file;
769 struct address_space *mapping = file->f_mapping; 849 struct address_space *mapping = file->f_mapping;
770 void *entry; 850 void *entry, **slot;
771 pgoff_t index = vmf->pgoff; 851 pgoff_t index = vmf->pgoff;
772 852
773 spin_lock_irq(&mapping->tree_lock); 853 spin_lock_irq(&mapping->tree_lock);
774 entry = get_unlocked_mapping_entry(mapping, index, NULL); 854 entry = get_unlocked_mapping_entry(mapping, index, &slot);
775 if (!entry || !radix_tree_exceptional_entry(entry)) 855 if (!entry || !radix_tree_exceptional_entry(entry)) {
776 goto out; 856 if (entry)
857 put_unlocked_mapping_entry(mapping, index, entry);
858 spin_unlock_irq(&mapping->tree_lock);
859 return VM_FAULT_NOPAGE;
860 }
777 radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); 861 radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
778 put_unlocked_mapping_entry(mapping, index, entry); 862 entry = lock_slot(mapping, slot);
779out:
780 spin_unlock_irq(&mapping->tree_lock); 863 spin_unlock_irq(&mapping->tree_lock);
864 /*
865 * If we race with somebody updating the PTE and finish_mkwrite_fault()
866 * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry
867 * the fault in either case.
868 */
869 finish_mkwrite_fault(vmf);
870 put_locked_mapping_entry(mapping, index, entry);
781 return VM_FAULT_NOPAGE; 871 return VM_FAULT_NOPAGE;
782} 872}
783EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); 873EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
@@ -948,13 +1038,13 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
948{ 1038{
949 struct address_space *mapping = vma->vm_file->f_mapping; 1039 struct address_space *mapping = vma->vm_file->f_mapping;
950 struct inode *inode = mapping->host; 1040 struct inode *inode = mapping->host;
951 unsigned long vaddr = (unsigned long)vmf->virtual_address; 1041 unsigned long vaddr = vmf->address;
952 loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; 1042 loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
953 sector_t sector; 1043 sector_t sector;
954 struct iomap iomap = { 0 }; 1044 struct iomap iomap = { 0 };
955 unsigned flags = IOMAP_FAULT; 1045 unsigned flags = IOMAP_FAULT;
956 int error, major = 0; 1046 int error, major = 0;
957 int locked_status = 0; 1047 int vmf_ret = 0;
958 void *entry; 1048 void *entry;
959 1049
960 /* 1050 /*
@@ -1007,13 +1097,11 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
1007 1097
1008 if (error) 1098 if (error)
1009 goto finish_iomap; 1099 goto finish_iomap;
1010 if (!radix_tree_exceptional_entry(entry)) { 1100
1011 vmf->page = entry; 1101 __SetPageUptodate(vmf->cow_page);
1012 locked_status = VM_FAULT_LOCKED; 1102 vmf_ret = finish_fault(vmf);
1013 } else { 1103 if (!vmf_ret)
1014 vmf->entry = entry; 1104 vmf_ret = VM_FAULT_DONE_COW;
1015 locked_status = VM_FAULT_DAX_LOCKED;
1016 }
1017 goto finish_iomap; 1105 goto finish_iomap;
1018 } 1106 }
1019 1107
@@ -1030,7 +1118,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
1030 case IOMAP_UNWRITTEN: 1118 case IOMAP_UNWRITTEN:
1031 case IOMAP_HOLE: 1119 case IOMAP_HOLE:
1032 if (!(vmf->flags & FAULT_FLAG_WRITE)) { 1120 if (!(vmf->flags & FAULT_FLAG_WRITE)) {
1033 locked_status = dax_load_hole(mapping, entry, vmf); 1121 vmf_ret = dax_load_hole(mapping, entry, vmf);
1034 break; 1122 break;
1035 } 1123 }
1036 /*FALLTHRU*/ 1124 /*FALLTHRU*/
@@ -1042,7 +1130,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
1042 1130
1043 finish_iomap: 1131 finish_iomap:
1044 if (ops->iomap_end) { 1132 if (ops->iomap_end) {
1045 if (error) { 1133 if (error || (vmf_ret & VM_FAULT_ERROR)) {
1046 /* keep previous error */ 1134 /* keep previous error */
1047 ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags, 1135 ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags,
1048 &iomap); 1136 &iomap);
@@ -1052,7 +1140,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
1052 } 1140 }
1053 } 1141 }
1054 unlock_entry: 1142 unlock_entry:
1055 if (!locked_status || error) 1143 if (vmf_ret != VM_FAULT_LOCKED || error)
1056 put_locked_mapping_entry(mapping, vmf->pgoff, entry); 1144 put_locked_mapping_entry(mapping, vmf->pgoff, entry);
1057 out: 1145 out:
1058 if (error == -ENOMEM) 1146 if (error == -ENOMEM)
@@ -1060,9 +1148,9 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
1060 /* -EBUSY is fine, somebody else faulted on the same PTE */ 1148 /* -EBUSY is fine, somebody else faulted on the same PTE */
1061 if (error < 0 && error != -EBUSY) 1149 if (error < 0 && error != -EBUSY)
1062 return VM_FAULT_SIGBUS | major; 1150 return VM_FAULT_SIGBUS | major;
1063 if (locked_status) { 1151 if (vmf_ret) {
1064 WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */ 1152 WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */
1065 return locked_status; 1153 return vmf_ret;
1066 } 1154 }
1067 return VM_FAULT_NOPAGE | major; 1155 return VM_FAULT_NOPAGE | major;
1068} 1156}
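
Beyond the vmf->address conversion, the fs/dax.c hunks rework dax_writeback_one() so that a flushed entry really ends up clean: the radix tree entry is looked up, revalidated and locked under tree_lock, PAGECACHE_TAG_TOWRITE is cleared up front, every user mapping of the pfn is write-protected and cleaned via dax_mapping_entry_mkclean(), the cache is flushed, and only then is PAGECACHE_TAG_DIRTY dropped. A heavily condensed sketch of that ordering; entry locking, validation and error paths are elided, and the helpers are the static functions from the hunks above, so this is not standalone code:

    #include <linux/fs.h>
    #include <linux/pmem.h>
    #include <linux/radix-tree.h>

    static void example_dax_flush_order(struct address_space *mapping,
                                        pgoff_t index, unsigned long pfn,
                                        void *kaddr, size_t size)
    {
            /* 1. with the entry held locked, TOWRITE can be cleared early;
             *    concurrent writeback calls wait on the locked entry. */
            spin_lock_irq(&mapping->tree_lock);
            radix_tree_tag_clear(&mapping->page_tree, index,
                                 PAGECACHE_TAG_TOWRITE);
            spin_unlock_irq(&mapping->tree_lock);

            /* 2. write-protect every mapping of this pfn ... */
            dax_mapping_entry_mkclean(mapping, index, pfn);
            /* 3. ... so nothing can redirty it while caches are flushed */
            wb_cache_pmem(kaddr, size);

            /* 4. only now is it safe to drop the dirty tag */
            spin_lock_irq(&mapping->tree_lock);
            radix_tree_tag_clear(&mapping->page_tree, index,
                                 PAGECACHE_TAG_DIRTY);
            spin_unlock_irq(&mapping->tree_lock);
    }
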
diff --git a/fs/exec.c b/fs/exec.c
index 88b5e1efdbd6..8112eacf10f3 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -209,7 +209,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
209 * doing the exec and bprm->mm is the new process's mm. 209 * doing the exec and bprm->mm is the new process's mm.
210 */ 210 */
211 ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags, 211 ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags,
212 &page, NULL); 212 &page, NULL, NULL);
213 if (ret <= 0) 213 if (ret <= 0)
214 return NULL; 214 return NULL;
215 215
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 85959d8324df..d96e2f30084b 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -257,9 +257,9 @@ out:
257 * fatal_signal_pending()s, and the mmap_sem must be released before 257 * fatal_signal_pending()s, and the mmap_sem must be released before
258 * returning it. 258 * returning it.
259 */ 259 */
260int handle_userfault(struct fault_env *fe, unsigned long reason) 260int handle_userfault(struct vm_fault *vmf, unsigned long reason)
261{ 261{
262 struct mm_struct *mm = fe->vma->vm_mm; 262 struct mm_struct *mm = vmf->vma->vm_mm;
263 struct userfaultfd_ctx *ctx; 263 struct userfaultfd_ctx *ctx;
264 struct userfaultfd_wait_queue uwq; 264 struct userfaultfd_wait_queue uwq;
265 int ret; 265 int ret;
@@ -268,7 +268,7 @@ int handle_userfault(struct fault_env *fe, unsigned long reason)
268 BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); 268 BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
269 269
270 ret = VM_FAULT_SIGBUS; 270 ret = VM_FAULT_SIGBUS;
271 ctx = fe->vma->vm_userfaultfd_ctx.ctx; 271 ctx = vmf->vma->vm_userfaultfd_ctx.ctx;
272 if (!ctx) 272 if (!ctx)
273 goto out; 273 goto out;
274 274
@@ -301,17 +301,18 @@ int handle_userfault(struct fault_env *fe, unsigned long reason)
301 * without first stopping userland access to the memory. For 301 * without first stopping userland access to the memory. For
302 * VM_UFFD_MISSING userfaults this is enough for now. 302 * VM_UFFD_MISSING userfaults this is enough for now.
303 */ 303 */
304 if (unlikely(!(fe->flags & FAULT_FLAG_ALLOW_RETRY))) { 304 if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
305 /* 305 /*
306 * Validate the invariant that nowait must allow retry 306 * Validate the invariant that nowait must allow retry
307 * to be sure not to return SIGBUS erroneously on 307 * to be sure not to return SIGBUS erroneously on
308 * nowait invocations. 308 * nowait invocations.
309 */ 309 */
310 BUG_ON(fe->flags & FAULT_FLAG_RETRY_NOWAIT); 310 BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
311#ifdef CONFIG_DEBUG_VM 311#ifdef CONFIG_DEBUG_VM
312 if (printk_ratelimit()) { 312 if (printk_ratelimit()) {
313 printk(KERN_WARNING 313 printk(KERN_WARNING
314 "FAULT_FLAG_ALLOW_RETRY missing %x\n", fe->flags); 314 "FAULT_FLAG_ALLOW_RETRY missing %x\n",
315 vmf->flags);
315 dump_stack(); 316 dump_stack();
316 } 317 }
317#endif 318#endif
@@ -323,7 +324,7 @@ int handle_userfault(struct fault_env *fe, unsigned long reason)
323 * and wait. 324 * and wait.
324 */ 325 */
325 ret = VM_FAULT_RETRY; 326 ret = VM_FAULT_RETRY;
326 if (fe->flags & FAULT_FLAG_RETRY_NOWAIT) 327 if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
327 goto out; 328 goto out;
328 329
329 /* take the reference before dropping the mmap_sem */ 330 /* take the reference before dropping the mmap_sem */
@@ -331,11 +332,11 @@ int handle_userfault(struct fault_env *fe, unsigned long reason)
331 332
332 init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); 333 init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
333 uwq.wq.private = current; 334 uwq.wq.private = current;
334 uwq.msg = userfault_msg(fe->address, fe->flags, reason); 335 uwq.msg = userfault_msg(vmf->address, vmf->flags, reason);
335 uwq.ctx = ctx; 336 uwq.ctx = ctx;
336 337
337 return_to_userland = 338 return_to_userland =
338 (fe->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == 339 (vmf->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
339 (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE); 340 (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE);
340 341
341 spin_lock(&ctx->fault_pending_wqh.lock); 342 spin_lock(&ctx->fault_pending_wqh.lock);
@@ -353,7 +354,8 @@ int handle_userfault(struct fault_env *fe, unsigned long reason)
353 TASK_KILLABLE); 354 TASK_KILLABLE);
354 spin_unlock(&ctx->fault_pending_wqh.lock); 355 spin_unlock(&ctx->fault_pending_wqh.lock);
355 356
356 must_wait = userfaultfd_must_wait(ctx, fe->address, fe->flags, reason); 357 must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
358 reason);
357 up_read(&mm->mmap_sem); 359 up_read(&mm->mmap_sem);
358 360
359 if (likely(must_wait && !ACCESS_ONCE(ctx->released) && 361 if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 0afade8bd3d7..f97bcfe79472 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -46,7 +46,6 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
46 46
47#ifdef CONFIG_FS_DAX 47#ifdef CONFIG_FS_DAX
48struct page *read_dax_sector(struct block_device *bdev, sector_t n); 48struct page *read_dax_sector(struct block_device *bdev, sector_t n);
49void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index);
50int __dax_zero_page_range(struct block_device *bdev, sector_t sector, 49int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
51 unsigned int offset, unsigned int length); 50 unsigned int offset, unsigned int length);
52#else 51#else
@@ -55,12 +54,6 @@ static inline struct page *read_dax_sector(struct block_device *bdev,
55{ 54{
56 return ERR_PTR(-ENXIO); 55 return ERR_PTR(-ENXIO);
57} 56}
58/* Shouldn't ever be called when dax is disabled. */
59static inline void dax_unlock_mapping_entry(struct address_space *mapping,
60 pgoff_t index)
61{
62 BUG();
63}
64static inline int __dax_zero_page_range(struct block_device *bdev, 57static inline int __dax_zero_page_range(struct block_device *bdev,
65 sector_t sector, unsigned int offset, unsigned int length) 58 sector_t sector, unsigned int offset, unsigned int length)
66{ 59{
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 08528afdf58b..10c5a17b1f51 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -243,29 +243,33 @@ static inline void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg
243 ops->unmap_sg(dev, sg, nents, dir, attrs); 243 ops->unmap_sg(dev, sg, nents, dir, attrs);
244} 244}
245 245
246static inline dma_addr_t dma_map_page(struct device *dev, struct page *page, 246static inline dma_addr_t dma_map_page_attrs(struct device *dev,
247 size_t offset, size_t size, 247 struct page *page,
248 enum dma_data_direction dir) 248 size_t offset, size_t size,
249 enum dma_data_direction dir,
250 unsigned long attrs)
249{ 251{
250 struct dma_map_ops *ops = get_dma_ops(dev); 252 struct dma_map_ops *ops = get_dma_ops(dev);
251 dma_addr_t addr; 253 dma_addr_t addr;
252 254
253 kmemcheck_mark_initialized(page_address(page) + offset, size); 255 kmemcheck_mark_initialized(page_address(page) + offset, size);
254 BUG_ON(!valid_dma_direction(dir)); 256 BUG_ON(!valid_dma_direction(dir));
255 addr = ops->map_page(dev, page, offset, size, dir, 0); 257 addr = ops->map_page(dev, page, offset, size, dir, attrs);
256 debug_dma_map_page(dev, page, offset, size, dir, addr, false); 258 debug_dma_map_page(dev, page, offset, size, dir, addr, false);
257 259
258 return addr; 260 return addr;
259} 261}
260 262
261static inline void dma_unmap_page(struct device *dev, dma_addr_t addr, 263static inline void dma_unmap_page_attrs(struct device *dev,
262 size_t size, enum dma_data_direction dir) 264 dma_addr_t addr, size_t size,
265 enum dma_data_direction dir,
266 unsigned long attrs)
263{ 267{
264 struct dma_map_ops *ops = get_dma_ops(dev); 268 struct dma_map_ops *ops = get_dma_ops(dev);
265 269
266 BUG_ON(!valid_dma_direction(dir)); 270 BUG_ON(!valid_dma_direction(dir));
267 if (ops->unmap_page) 271 if (ops->unmap_page)
268 ops->unmap_page(dev, addr, size, dir, 0); 272 ops->unmap_page(dev, addr, size, dir, attrs);
269 debug_dma_unmap_page(dev, addr, size, dir, false); 273 debug_dma_unmap_page(dev, addr, size, dir, false);
270} 274}
271 275
@@ -385,6 +389,8 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
385#define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, 0) 389#define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, 0)
386#define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, 0) 390#define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, 0)
387#define dma_unmap_sg(d, s, n, r) dma_unmap_sg_attrs(d, s, n, r, 0) 391#define dma_unmap_sg(d, s, n, r) dma_unmap_sg_attrs(d, s, n, r, 0)
392#define dma_map_page(d, p, o, s, r) dma_map_page_attrs(d, p, o, s, r, 0)
393#define dma_unmap_page(d, a, s, r) dma_unmap_page_attrs(d, a, s, r, 0)
388 394
389extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, 395extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
390 void *cpu_addr, dma_addr_t dma_addr, size_t size); 396 void *cpu_addr, dma_addr_t dma_addr, size_t size);
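dma_map_page() and dma_unmap_page() are now thin macro wrappers that pass attrs=0 to the *_attrs variants added above, so callers that need DMA attributes (the networking case this series targets) can supply them directly. A minimal sketch, assuming a driver-owned dev and page and the existing DMA_ATTR_SKIP_CPU_SYNC attribute:

	/* Map for device writes but skip the CPU sync; the driver would
	 * dma_sync_*() only the region it actually consumes. */
	dma_addr_t addr = dma_map_page_attrs(dev, page, 0, PAGE_SIZE,
					     DMA_FROM_DEVICE,
					     DMA_ATTR_SKIP_CPU_SYNC);
	if (dma_mapping_error(dev, addr))
		return -ENOMEM;

	/* ... hardware fills the buffer ... */

	dma_unmap_page_attrs(dev, addr, PAGE_SIZE, DMA_FROM_DEVICE,
			     DMA_ATTR_SKIP_CPU_SYNC);

Existing dma_map_page()/dma_unmap_page() callers keep their old behaviour, since the macros forward a zero attrs value.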
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index f8041f9de31e..4175dca4ac39 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -506,6 +506,8 @@ extern void free_hot_cold_page(struct page *page, bool cold);
506extern void free_hot_cold_page_list(struct list_head *list, bool cold); 506extern void free_hot_cold_page_list(struct list_head *list, bool cold);
507 507
508struct page_frag_cache; 508struct page_frag_cache;
509extern void __page_frag_drain(struct page *page, unsigned int order,
510 unsigned int count);
509extern void *__alloc_page_frag(struct page_frag_cache *nc, 511extern void *__alloc_page_frag(struct page_frag_cache *nc,
510 unsigned int fragsz, gfp_t gfp_mask); 512 unsigned int fragsz, gfp_t gfp_mask);
511extern void __free_page_frag(void *addr); 513extern void __free_page_frag(void *addr);
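__page_frag_drain() joins the page-fragment allocator declared just below it; it looks intended for callers that hold a known number of outstanding fragment references and want to drop them in one call. For orientation, a minimal (hypothetical) use of the existing allocator pair:

	static struct page_frag_cache nc;	/* zero-initialised fragment cache */

	void *frag = __alloc_page_frag(&nc, 256, GFP_ATOMIC);
	if (frag)
		__free_page_frag(frag);		/* releases the fragment's page reference */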
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 1f782aa1d8e6..97e478d6b690 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -1,12 +1,12 @@
1#ifndef _LINUX_HUGE_MM_H 1#ifndef _LINUX_HUGE_MM_H
2#define _LINUX_HUGE_MM_H 2#define _LINUX_HUGE_MM_H
3 3
4extern int do_huge_pmd_anonymous_page(struct fault_env *fe); 4extern int do_huge_pmd_anonymous_page(struct vm_fault *vmf);
5extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, 5extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
6 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, 6 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
7 struct vm_area_struct *vma); 7 struct vm_area_struct *vma);
8extern void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd); 8extern void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd);
9extern int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd); 9extern int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd);
10extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, 10extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
11 unsigned long addr, 11 unsigned long addr,
12 pmd_t *pmd, 12 pmd_t *pmd,
@@ -142,7 +142,7 @@ static inline int hpage_nr_pages(struct page *page)
142 return 1; 142 return 1;
143} 143}
144 144
145extern int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd); 145extern int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd);
146 146
147extern struct page *huge_zero_page; 147extern struct page *huge_zero_page;
148 148
@@ -212,7 +212,7 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
212 return NULL; 212 return NULL;
213} 213}
214 214
215static inline int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd) 215static inline int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd)
216{ 216{
217 return 0; 217 return 0;
218} 218}
diff --git a/include/linux/idr.h b/include/linux/idr.h
index 083d61e92706..3c01b89aed67 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -18,12 +18,11 @@
18#include <linux/rcupdate.h> 18#include <linux/rcupdate.h>
19 19
20/* 20/*
21 * We want shallower trees and thus more bits covered at each layer. 8 21 * Using 6 bits at each layer allows us to allocate 7 layers out of each page.
22 * bits gives us large enough first layer for most use cases and maximum 22 * 8 bits only gave us 3 layers out of every pair of pages, which is less
23 * tree depth of 4. Each idr_layer is slightly larger than 2k on 64bit and 23 * efficient except for trees with a largest element between 192-255 inclusive.
24 * 1k on 32bit.
25 */ 24 */
26#define IDR_BITS 8 25#define IDR_BITS 6
27#define IDR_SIZE (1 << IDR_BITS) 26#define IDR_SIZE (1 << IDR_BITS)
28#define IDR_MASK ((1 << IDR_BITS)-1) 27#define IDR_MASK ((1 << IDR_BITS)-1)
29 28
@@ -56,6 +55,32 @@ struct idr {
56#define DEFINE_IDR(name) struct idr name = IDR_INIT(name) 55#define DEFINE_IDR(name) struct idr name = IDR_INIT(name)
57 56
58/** 57/**
58 * idr_get_cursor - Return the current position of the cyclic allocator
59 * @idr: idr handle
60 *
61 * The value returned is the value that will be next returned from
62 * idr_alloc_cyclic() if it is free (otherwise the search will start from
63 * this position).
64 */
65static inline unsigned int idr_get_cursor(struct idr *idr)
66{
67 return READ_ONCE(idr->cur);
68}
69
70/**
71 * idr_set_cursor - Set the current position of the cyclic allocator
72 * @idr: idr handle
73 * @val: new position
74 *
75 * The next call to idr_alloc_cyclic() will return @val if it is free
76 * (otherwise the search will start from this position).
77 */
78static inline void idr_set_cursor(struct idr *idr, unsigned int val)
79{
80 WRITE_ONCE(idr->cur, val);
81}
82
83/**
59 * DOC: idr sync 84 * DOC: idr sync
60 * idr synchronization (stolen from radix-tree.h) 85 * idr synchronization (stolen from radix-tree.h)
61 * 86 *
@@ -195,6 +220,11 @@ static inline int ida_get_new(struct ida *ida, int *p_id)
195 return ida_get_new_above(ida, 0, p_id); 220 return ida_get_new_above(ida, 0, p_id);
196} 221}
197 222
223static inline bool ida_is_empty(struct ida *ida)
224{
225 return idr_is_empty(&ida->idr);
226}
227
198void __init idr_init_cache(void); 228void __init idr_init_cache(void);
199 229
200#endif /* __IDR_H__ */ 230#endif /* __IDR_H__ */
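Both cursor helpers are defined right above in terms of the idr's internal cyclic position, so a (hypothetical) checkpoint/restore user can preserve allocation order across a teardown. A minimal sketch:

	static DEFINE_IDR(conn_ids);		/* hypothetical cyclic ID space */

	/* checkpoint: remember where idr_alloc_cyclic() would continue */
	unsigned int cursor = idr_get_cursor(&conn_ids);

	/* ... tear down and later rebuild the idr ... */

	/* restore: make the next cyclic allocation resume from the saved spot */
	idr_set_cursor(&conn_ids, cursor);

ida_is_empty() follows the same pattern as the idr_is_empty() it wraps.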
diff --git a/include/linux/kdb.h b/include/linux/kdb.h
index 410decacff8f..68bd88223417 100644
--- a/include/linux/kdb.h
+++ b/include/linux/kdb.h
@@ -77,7 +77,6 @@ extern int kdb_poll_idx;
77 * number whenever the kernel debugger is entered. 77 * number whenever the kernel debugger is entered.
78 */ 78 */
79extern int kdb_initial_cpu; 79extern int kdb_initial_cpu;
80extern atomic_t kdb_event;
81 80
82/* Types and messages used for dynamically added kdb shell commands */ 81/* Types and messages used for dynamically added kdb shell commands */
83 82
@@ -162,6 +161,7 @@ enum kdb_msgsrc {
162}; 161};
163 162
164extern int kdb_trap_printk; 163extern int kdb_trap_printk;
164extern int kdb_printf_cpu;
165extern __printf(2, 0) int vkdb_printf(enum kdb_msgsrc src, const char *fmt, 165extern __printf(2, 0) int vkdb_printf(enum kdb_msgsrc src, const char *fmt,
166 va_list args); 166 va_list args);
167extern __printf(1, 2) int kdb_printf(const char *, ...); 167extern __printf(1, 2) int kdb_printf(const char *, ...);
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 406c33dcae13..d7437777baaa 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -259,12 +259,6 @@ phys_addr_t paddr_vmcoreinfo_note(void);
259 vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) 259 vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name)
260#define VMCOREINFO_CONFIG(name) \ 260#define VMCOREINFO_CONFIG(name) \
261 vmcoreinfo_append_str("CONFIG_%s=y\n", #name) 261 vmcoreinfo_append_str("CONFIG_%s=y\n", #name)
262#define VMCOREINFO_PAGE_OFFSET(value) \
263 vmcoreinfo_append_str("PAGE_OFFSET=%lx\n", (unsigned long)value)
264#define VMCOREINFO_VMALLOC_START(value) \
265 vmcoreinfo_append_str("VMALLOC_START=%lx\n", (unsigned long)value)
266#define VMCOREINFO_VMEMMAP_START(value) \
267 vmcoreinfo_append_str("VMEMMAP_START=%lx\n", (unsigned long)value)
268 262
269extern struct kimage *kexec_image; 263extern struct kimage *kexec_image;
270extern struct kimage *kexec_crash_image; 264extern struct kimage *kexec_crash_image;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0b5b2e4df14e..4424784ac374 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -292,36 +292,23 @@ extern pgprot_t protection_map[16];
292 * pgoff should be used in favour of virtual_address, if possible. 292 * pgoff should be used in favour of virtual_address, if possible.
293 */ 293 */
294struct vm_fault { 294struct vm_fault {
295 struct vm_area_struct *vma; /* Target VMA */
295 unsigned int flags; /* FAULT_FLAG_xxx flags */ 296 unsigned int flags; /* FAULT_FLAG_xxx flags */
296 gfp_t gfp_mask; /* gfp mask to be used for allocations */ 297 gfp_t gfp_mask; /* gfp mask to be used for allocations */
297 pgoff_t pgoff; /* Logical page offset based on vma */ 298 pgoff_t pgoff; /* Logical page offset based on vma */
298 void __user *virtual_address; /* Faulting virtual address */ 299 unsigned long address; /* Faulting virtual address */
300 pmd_t *pmd; /* Pointer to pmd entry matching
301 * the 'address' */
302 pte_t orig_pte; /* Value of PTE at the time of fault */
299 303
300 struct page *cow_page; /* Handler may choose to COW */ 304 struct page *cow_page; /* Page handler may use for COW fault */
305 struct mem_cgroup *memcg; /* Cgroup cow_page belongs to */
301 struct page *page; /* ->fault handlers should return a 306 struct page *page; /* ->fault handlers should return a
302 * page here, unless VM_FAULT_NOPAGE 307 * page here, unless VM_FAULT_NOPAGE
303 * is set (which is also implied by 308 * is set (which is also implied by
304 * VM_FAULT_ERROR). 309 * VM_FAULT_ERROR).
305 */ 310 */
306 void *entry; /* ->fault handler can alternatively 311 /* These three entries are valid only while holding ptl lock */
307 * return locked DAX entry. In that
308 * case handler should return
309 * VM_FAULT_DAX_LOCKED and fill in
310 * entry here.
311 */
312};
313
314/*
315 * Page fault context: passes though page fault handler instead of endless list
316 * of function arguments.
317 */
318struct fault_env {
319 struct vm_area_struct *vma; /* Target VMA */
320 unsigned long address; /* Faulting virtual address */
321 unsigned int flags; /* FAULT_FLAG_xxx flags */
322 pmd_t *pmd; /* Pointer to pmd entry matching
323 * the 'address'
324 */
325 pte_t *pte; /* Pointer to pte entry matching 312 pte_t *pte; /* Pointer to pte entry matching
326 * the 'address'. NULL if the page 313 * the 'address'. NULL if the page
327 * table hasn't been allocated. 314 * table hasn't been allocated.
@@ -351,7 +338,7 @@ struct vm_operations_struct {
351 int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); 338 int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
352 int (*pmd_fault)(struct vm_area_struct *, unsigned long address, 339 int (*pmd_fault)(struct vm_area_struct *, unsigned long address,
353 pmd_t *, unsigned int flags); 340 pmd_t *, unsigned int flags);
354 void (*map_pages)(struct fault_env *fe, 341 void (*map_pages)(struct vm_fault *vmf,
355 pgoff_t start_pgoff, pgoff_t end_pgoff); 342 pgoff_t start_pgoff, pgoff_t end_pgoff);
356 343
357 /* notification that a previously read-only page is about to become 344 /* notification that a previously read-only page is about to become
@@ -625,8 +612,10 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
625 return pte; 612 return pte;
626} 613}
627 614
628int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, 615int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
629 struct page *page); 616 struct page *page);
617int finish_fault(struct vm_fault *vmf);
618int finish_mkwrite_fault(struct vm_fault *vmf);
630#endif 619#endif
631 620
632/* 621/*
@@ -1110,7 +1099,7 @@ static inline void clear_page_pfmemalloc(struct page *page)
1110#define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ 1099#define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */
1111#define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ 1100#define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */
1112#define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */ 1101#define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */
1113#define VM_FAULT_DAX_LOCKED 0x1000 /* ->fault has locked DAX entry */ 1102#define VM_FAULT_DONE_COW 0x1000 /* ->fault has fully handled COW */
1114 1103
1115#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ 1104#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
1116 1105
@@ -1221,6 +1210,8 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
1221 struct vm_area_struct *vma); 1210 struct vm_area_struct *vma);
1222void unmap_mapping_range(struct address_space *mapping, 1211void unmap_mapping_range(struct address_space *mapping,
1223 loff_t const holebegin, loff_t const holelen, int even_cows); 1212 loff_t const holebegin, loff_t const holelen, int even_cows);
1213int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp,
1214 spinlock_t **ptlp);
1224int follow_pfn(struct vm_area_struct *vma, unsigned long address, 1215int follow_pfn(struct vm_area_struct *vma, unsigned long address,
1225 unsigned long *pfn); 1216 unsigned long *pfn);
1226int follow_phys(struct vm_area_struct *vma, unsigned long address, 1217int follow_phys(struct vm_area_struct *vma, unsigned long address,
@@ -1276,15 +1267,12 @@ extern int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
1276long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, 1267long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
1277 unsigned long start, unsigned long nr_pages, 1268 unsigned long start, unsigned long nr_pages,
1278 unsigned int gup_flags, struct page **pages, 1269 unsigned int gup_flags, struct page **pages,
1279 struct vm_area_struct **vmas); 1270 struct vm_area_struct **vmas, int *locked);
1280long get_user_pages(unsigned long start, unsigned long nr_pages, 1271long get_user_pages(unsigned long start, unsigned long nr_pages,
1281 unsigned int gup_flags, struct page **pages, 1272 unsigned int gup_flags, struct page **pages,
1282 struct vm_area_struct **vmas); 1273 struct vm_area_struct **vmas);
1283long get_user_pages_locked(unsigned long start, unsigned long nr_pages, 1274long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
1284 unsigned int gup_flags, struct page **pages, int *locked); 1275 unsigned int gup_flags, struct page **pages, int *locked);
1285long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
1286 unsigned long start, unsigned long nr_pages,
1287 struct page **pages, unsigned int gup_flags);
1288long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, 1276long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
1289 struct page **pages, unsigned int gup_flags); 1277 struct page **pages, unsigned int gup_flags);
1290int get_user_pages_fast(unsigned long start, int nr_pages, int write, 1278int get_user_pages_fast(unsigned long start, int nr_pages, int write,
@@ -2099,7 +2087,7 @@ extern void truncate_inode_pages_final(struct address_space *);
2099 2087
2100/* generic vm_area_ops exported for stackable file systems */ 2088/* generic vm_area_ops exported for stackable file systems */
2101extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); 2089extern int filemap_fault(struct vm_area_struct *, struct vm_fault *);
2102extern void filemap_map_pages(struct fault_env *fe, 2090extern void filemap_map_pages(struct vm_fault *vmf,
2103 pgoff_t start_pgoff, pgoff_t end_pgoff); 2091 pgoff_t start_pgoff, pgoff_t end_pgoff);
2104extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 2092extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
2105 2093
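With the old fault_env folded into struct vm_fault, a ->fault handler reads the faulting context from vmf (plus the vma argument the prototype above still passes separately) and hands a referenced page back through vmf->page. A minimal sketch, with my_lookup_page() standing in for whatever a real driver or filesystem would do:

static int my_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *page;

	/* vmf->pgoff is the offset within the mapping, vmf->address the
	 * faulting user virtual address. */
	page = my_lookup_page(vma->vm_private_data, vmf->pgoff);	/* hypothetical helper */
	if (!page)
		return VM_FAULT_SIGBUS;

	get_page(page);		/* handler returns with a reference held */
	vmf->page = page;
	return 0;
}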
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index a78c35cff1ae..aacca824a6ae 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -7,6 +7,23 @@
7#include <linux/sched.h> 7#include <linux/sched.h>
8#include <asm/irq.h> 8#include <asm/irq.h>
9 9
10/*
11 * The run state of the lockup detectors is controlled by the content of the
12 * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit -
13 * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector.
14 *
15 * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled'
16 * are variables that are only used as an 'interface' between the parameters
17 * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The
18 * 'watchdog_thresh' variable is handled differently because its value is not
19 * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh'
20 * is equal to zero.
21 */
22#define NMI_WATCHDOG_ENABLED_BIT 0
23#define SOFT_WATCHDOG_ENABLED_BIT 1
24#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT)
25#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT)
26
10/** 27/**
11 * touch_nmi_watchdog - restart NMI watchdog timeout. 28 * touch_nmi_watchdog - restart NMI watchdog timeout.
12 * 29 *
@@ -91,9 +108,16 @@ extern int nmi_watchdog_enabled;
91extern int soft_watchdog_enabled; 108extern int soft_watchdog_enabled;
92extern int watchdog_user_enabled; 109extern int watchdog_user_enabled;
93extern int watchdog_thresh; 110extern int watchdog_thresh;
111extern unsigned long watchdog_enabled;
94extern unsigned long *watchdog_cpumask_bits; 112extern unsigned long *watchdog_cpumask_bits;
113#ifdef CONFIG_SMP
95extern int sysctl_softlockup_all_cpu_backtrace; 114extern int sysctl_softlockup_all_cpu_backtrace;
96extern int sysctl_hardlockup_all_cpu_backtrace; 115extern int sysctl_hardlockup_all_cpu_backtrace;
116#else
117#define sysctl_softlockup_all_cpu_backtrace 0
118#define sysctl_hardlockup_all_cpu_backtrace 0
119#endif
120extern bool is_hardlockup(void);
97struct ctl_table; 121struct ctl_table;
98extern int proc_watchdog(struct ctl_table *, int , 122extern int proc_watchdog(struct ctl_table *, int ,
99 void __user *, size_t *, loff_t *); 123 void __user *, size_t *, loff_t *);
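The two *_ENABLED masks are meant to be tested against the newly exported 'watchdog_enabled' word; a one-glance sketch of how a consumer distinguishes the two detectors:

	if (watchdog_enabled & NMI_WATCHDOG_ENABLED)
		pr_info("hard lockup detector is running\n");
	if (watchdog_enabled & SOFT_WATCHDOG_ENABLED)
		pr_info("soft lockup detector is running\n");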
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 744486057e9e..5dea8f6440e4 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -80,23 +80,25 @@ static inline bool radix_tree_is_internal_node(void *ptr)
80#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ 80#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
81 RADIX_TREE_MAP_SHIFT)) 81 RADIX_TREE_MAP_SHIFT))
82 82
83/*
84 * @count is the count of every non-NULL element in the ->slots array
85 * whether that is an exceptional entry, a retry entry, a user pointer,
86 * a sibling entry or a pointer to the next level of the tree.
87 * @exceptional is the count of every element in ->slots which is
88 * either radix_tree_exceptional_entry() or is a sibling entry for an
89 * exceptional entry.
90 */
83struct radix_tree_node { 91struct radix_tree_node {
84 unsigned char shift; /* Bits remaining in each slot */ 92 unsigned char shift; /* Bits remaining in each slot */
85 unsigned char offset; /* Slot offset in parent */ 93 unsigned char offset; /* Slot offset in parent */
86 unsigned char count; /* Total entry count */ 94 unsigned char count; /* Total entry count */
87 unsigned char exceptional; /* Exceptional entry count */ 95 unsigned char exceptional; /* Exceptional entry count */
96 struct radix_tree_node *parent; /* Used when ascending tree */
97 void *private_data; /* For tree user */
88 union { 98 union {
89 struct { 99 struct list_head private_list; /* For tree user */
90 /* Used when ascending tree */ 100 struct rcu_head rcu_head; /* Used when freeing node */
91 struct radix_tree_node *parent;
92 /* For tree user */
93 void *private_data;
94 };
95 /* Used when freeing node */
96 struct rcu_head rcu_head;
97 }; 101 };
98 /* For tree user */
99 struct list_head private_list;
100 void __rcu *slots[RADIX_TREE_MAP_SIZE]; 102 void __rcu *slots[RADIX_TREE_MAP_SIZE];
101 unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; 103 unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
102}; 104};
@@ -127,6 +129,41 @@ static inline bool radix_tree_empty(struct radix_tree_root *root)
127} 129}
128 130
129/** 131/**
132 * struct radix_tree_iter - radix tree iterator state
133 *
134 * @index: index of current slot
135 * @next_index: one beyond the last index for this chunk
136 * @tags: bit-mask for tag-iterating
137 * @node: node that contains current slot
138 * @shift: shift for the node that holds our slots
139 *
140 * This radix tree iterator works in terms of "chunks" of slots. A chunk is a
141 * subinterval of slots contained within one radix tree leaf node. It is
142 * described by a pointer to its first slot and a struct radix_tree_iter
143 * which holds the chunk's position in the tree and its size. For tagged
144 * iteration radix_tree_iter also holds the slots' bit-mask for one chosen
145 * radix tree tag.
146 */
147struct radix_tree_iter {
148 unsigned long index;
149 unsigned long next_index;
150 unsigned long tags;
151 struct radix_tree_node *node;
152#ifdef CONFIG_RADIX_TREE_MULTIORDER
153 unsigned int shift;
154#endif
155};
156
157static inline unsigned int iter_shift(const struct radix_tree_iter *iter)
158{
159#ifdef CONFIG_RADIX_TREE_MULTIORDER
160 return iter->shift;
161#else
162 return 0;
163#endif
164}
165
166/**
130 * Radix-tree synchronization 167 * Radix-tree synchronization
131 * 168 *
132 * The radix-tree API requires that users provide all synchronisation (with 169 * The radix-tree API requires that users provide all synchronisation (with
@@ -264,6 +301,8 @@ void __radix_tree_replace(struct radix_tree_root *root,
264 struct radix_tree_node *node, 301 struct radix_tree_node *node,
265 void **slot, void *item, 302 void **slot, void *item,
266 radix_tree_update_node_t update_node, void *private); 303 radix_tree_update_node_t update_node, void *private);
304void radix_tree_iter_replace(struct radix_tree_root *,
305 const struct radix_tree_iter *, void **slot, void *item);
267void radix_tree_replace_slot(struct radix_tree_root *root, 306void radix_tree_replace_slot(struct radix_tree_root *root,
268 void **slot, void *item); 307 void **slot, void *item);
269void __radix_tree_delete_node(struct radix_tree_root *root, 308void __radix_tree_delete_node(struct radix_tree_root *root,
@@ -289,6 +328,8 @@ void *radix_tree_tag_clear(struct radix_tree_root *root,
289 unsigned long index, unsigned int tag); 328 unsigned long index, unsigned int tag);
290int radix_tree_tag_get(struct radix_tree_root *root, 329int radix_tree_tag_get(struct radix_tree_root *root,
291 unsigned long index, unsigned int tag); 330 unsigned long index, unsigned int tag);
331void radix_tree_iter_tag_set(struct radix_tree_root *root,
332 const struct radix_tree_iter *iter, unsigned int tag);
292unsigned int 333unsigned int
293radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, 334radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
294 unsigned long first_index, unsigned int max_items, 335 unsigned long first_index, unsigned int max_items,
@@ -297,50 +338,18 @@ unsigned int
297radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results, 338radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results,
298 unsigned long first_index, unsigned int max_items, 339 unsigned long first_index, unsigned int max_items,
299 unsigned int tag); 340 unsigned int tag);
300unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
301 unsigned long *first_indexp, unsigned long last_index,
302 unsigned long nr_to_tag,
303 unsigned int fromtag, unsigned int totag);
304int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag); 341int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
305unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item);
306 342
307static inline void radix_tree_preload_end(void) 343static inline void radix_tree_preload_end(void)
308{ 344{
309 preempt_enable(); 345 preempt_enable();
310} 346}
311 347
312/** 348int radix_tree_split_preload(unsigned old_order, unsigned new_order, gfp_t);
313 * struct radix_tree_iter - radix tree iterator state 349int radix_tree_split(struct radix_tree_root *, unsigned long index,
314 * 350 unsigned new_order);
315 * @index: index of current slot 351int radix_tree_join(struct radix_tree_root *, unsigned long index,
316 * @next_index: one beyond the last index for this chunk 352 unsigned new_order, void *);
317 * @tags: bit-mask for tag-iterating
318 * @shift: shift for the node that holds our slots
319 *
320 * This radix tree iterator works in terms of "chunks" of slots. A chunk is a
321 * subinterval of slots contained within one radix tree leaf node. It is
322 * described by a pointer to its first slot and a struct radix_tree_iter
323 * which holds the chunk's position in the tree and its size. For tagged
324 * iteration radix_tree_iter also holds the slots' bit-mask for one chosen
325 * radix tree tag.
326 */
327struct radix_tree_iter {
328 unsigned long index;
329 unsigned long next_index;
330 unsigned long tags;
331#ifdef CONFIG_RADIX_TREE_MULTIORDER
332 unsigned int shift;
333#endif
334};
335
336static inline unsigned int iter_shift(struct radix_tree_iter *iter)
337{
338#ifdef CONFIG_RADIX_TREE_MULTIORDER
339 return iter->shift;
340#else
341 return 0;
342#endif
343}
344 353
345#define RADIX_TREE_ITER_TAG_MASK 0x00FF /* tag index in lower byte */ 354#define RADIX_TREE_ITER_TAG_MASK 0x00FF /* tag index in lower byte */
346#define RADIX_TREE_ITER_TAGGED 0x0100 /* lookup tagged slots */ 355#define RADIX_TREE_ITER_TAGGED 0x0100 /* lookup tagged slots */
@@ -409,20 +418,17 @@ __radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots)
409} 418}
410 419
411/** 420/**
412 * radix_tree_iter_next - resume iterating when the chunk may be invalid 421 * radix_tree_iter_resume - resume iterating when the chunk may be invalid
413 * @iter: iterator state 422 * @slot: pointer to current slot
423 * @iter: iterator state
424 * Returns: New slot pointer
414 * 425 *
415 * If the iterator needs to release then reacquire a lock, the chunk may 426 * If the iterator needs to release then reacquire a lock, the chunk may
416 * have been invalidated by an insertion or deletion. Call this function 427 * have been invalidated by an insertion or deletion. Call this function
417 * to continue the iteration from the next index. 428 * before releasing the lock to continue the iteration from the next index.
418 */ 429 */
419static inline __must_check 430void **__must_check radix_tree_iter_resume(void **slot,
420void **radix_tree_iter_next(struct radix_tree_iter *iter) 431 struct radix_tree_iter *iter);
421{
422 iter->next_index = __radix_tree_iter_add(iter, 1);
423 iter->tags = 0;
424 return NULL;
425}
426 432
427/** 433/**
428 * radix_tree_chunk_size - get current chunk size 434 * radix_tree_chunk_size - get current chunk size
@@ -436,10 +442,17 @@ radix_tree_chunk_size(struct radix_tree_iter *iter)
436 return (iter->next_index - iter->index) >> iter_shift(iter); 442 return (iter->next_index - iter->index) >> iter_shift(iter);
437} 443}
438 444
439static inline struct radix_tree_node *entry_to_node(void *ptr) 445#ifdef CONFIG_RADIX_TREE_MULTIORDER
446void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter,
447 unsigned flags);
448#else
449/* Can't happen without sibling entries, but the compiler can't tell that */
450static inline void ** __radix_tree_next_slot(void **slot,
451 struct radix_tree_iter *iter, unsigned flags)
440{ 452{
441 return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE); 453 return slot;
442} 454}
455#endif
443 456
444/** 457/**
445 * radix_tree_next_slot - find next slot in chunk 458 * radix_tree_next_slot - find next slot in chunk
@@ -453,7 +466,7 @@ static inline struct radix_tree_node *entry_to_node(void *ptr)
453 * For tagged lookup it also eats @iter->tags. 466 * For tagged lookup it also eats @iter->tags.
454 * 467 *
455 * There are several cases where 'slot' can be passed in as NULL to this 468 * There are several cases where 'slot' can be passed in as NULL to this
456 * function. These cases result from the use of radix_tree_iter_next() or 469 * function. These cases result from the use of radix_tree_iter_resume() or
457 * radix_tree_iter_retry(). In these cases we don't end up dereferencing 470 * radix_tree_iter_retry(). In these cases we don't end up dereferencing
458 * 'slot' because either: 471 * 'slot' because either:
459 * a) we are doing tagged iteration and iter->tags has been set to 0, or 472 * a) we are doing tagged iteration and iter->tags has been set to 0, or
@@ -464,51 +477,31 @@ static __always_inline void **
464radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) 477radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
465{ 478{
466 if (flags & RADIX_TREE_ITER_TAGGED) { 479 if (flags & RADIX_TREE_ITER_TAGGED) {
467 void *canon = slot;
468
469 iter->tags >>= 1; 480 iter->tags >>= 1;
470 if (unlikely(!iter->tags)) 481 if (unlikely(!iter->tags))
471 return NULL; 482 return NULL;
472 while (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) &&
473 radix_tree_is_internal_node(slot[1])) {
474 if (entry_to_node(slot[1]) == canon) {
475 iter->tags >>= 1;
476 iter->index = __radix_tree_iter_add(iter, 1);
477 slot++;
478 continue;
479 }
480 iter->next_index = __radix_tree_iter_add(iter, 1);
481 return NULL;
482 }
483 if (likely(iter->tags & 1ul)) { 483 if (likely(iter->tags & 1ul)) {
484 iter->index = __radix_tree_iter_add(iter, 1); 484 iter->index = __radix_tree_iter_add(iter, 1);
485 return slot + 1; 485 slot++;
486 goto found;
486 } 487 }
487 if (!(flags & RADIX_TREE_ITER_CONTIG)) { 488 if (!(flags & RADIX_TREE_ITER_CONTIG)) {
488 unsigned offset = __ffs(iter->tags); 489 unsigned offset = __ffs(iter->tags);
489 490
490 iter->tags >>= offset; 491 iter->tags >>= offset++;
491 iter->index = __radix_tree_iter_add(iter, offset + 1); 492 iter->index = __radix_tree_iter_add(iter, offset);
492 return slot + offset + 1; 493 slot += offset;
494 goto found;
493 } 495 }
494 } else { 496 } else {
495 long count = radix_tree_chunk_size(iter); 497 long count = radix_tree_chunk_size(iter);
496 void *canon = slot;
497 498
498 while (--count > 0) { 499 while (--count > 0) {
499 slot++; 500 slot++;
500 iter->index = __radix_tree_iter_add(iter, 1); 501 iter->index = __radix_tree_iter_add(iter, 1);
501 502
502 if (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) &&
503 radix_tree_is_internal_node(*slot)) {
504 if (entry_to_node(*slot) == canon)
505 continue;
506 iter->next_index = iter->index;
507 break;
508 }
509
510 if (likely(*slot)) 503 if (likely(*slot))
511 return slot; 504 goto found;
512 if (flags & RADIX_TREE_ITER_CONTIG) { 505 if (flags & RADIX_TREE_ITER_CONTIG) {
513 /* forbid switching to the next chunk */ 506 /* forbid switching to the next chunk */
514 iter->next_index = 0; 507 iter->next_index = 0;
@@ -517,6 +510,11 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
517 } 510 }
518 } 511 }
519 return NULL; 512 return NULL;
513
514 found:
515 if (unlikely(radix_tree_is_internal_node(*slot)))
516 return __radix_tree_next_slot(slot, iter, flags);
517 return slot;
520} 518}
521 519
522/** 520/**
@@ -567,6 +565,6 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
567 slot || (slot = radix_tree_next_chunk(root, iter, \ 565 slot || (slot = radix_tree_next_chunk(root, iter, \
568 RADIX_TREE_ITER_TAGGED | tag)) ; \ 566 RADIX_TREE_ITER_TAGGED | tag)) ; \
569 slot = radix_tree_next_slot(slot, iter, \ 567 slot = radix_tree_next_slot(slot, iter, \
570 RADIX_TREE_ITER_TAGGED)) 568 RADIX_TREE_ITER_TAGGED | tag))
571 569
572#endif /* _LINUX_RADIX_TREE_H */ 570#endif /* _LINUX_RADIX_TREE_H */
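radix_tree_iter_resume() replaces radix_tree_iter_next() and, per the updated kernel-doc, must be called before the lock protecting the tree is dropped rather than after. A sketch of the intended pattern inside a slot walk; 'root', 'tree_lock' and process() are assumptions standing in for the caller's own tree, lock and per-item work:

	void **slot;
	struct radix_tree_iter iter;

	spin_lock(&tree_lock);
	radix_tree_for_each_slot(slot, &root, &iter, 0) {
		process(radix_tree_deref_slot_protected(slot, &tree_lock));

		if (need_resched()) {
			/* invalidate the current chunk *before* unlocking */
			slot = radix_tree_iter_resume(slot, &iter);
			spin_unlock(&tree_lock);
			cond_resched();
			spin_lock(&tree_lock);
		}
	}
	spin_unlock(&tree_lock);

The iterator fix in the last hunk matters for the same pattern: radix_tree_for_each_tagged() now passes the tag through to radix_tree_next_slot() instead of silently dropping it.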
diff --git a/include/linux/signal.h b/include/linux/signal.h
index b63f63eaa39c..5308304993be 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -97,6 +97,23 @@ static inline int sigisemptyset(sigset_t *set)
97 } 97 }
98} 98}
99 99
100static inline int sigequalsets(const sigset_t *set1, const sigset_t *set2)
101{
102 switch (_NSIG_WORDS) {
103 case 4:
104 return (set1->sig[3] == set2->sig[3]) &&
105 (set1->sig[2] == set2->sig[2]) &&
106 (set1->sig[1] == set2->sig[1]) &&
107 (set1->sig[0] == set2->sig[0]);
108 case 2:
109 return (set1->sig[1] == set2->sig[1]) &&
110 (set1->sig[0] == set2->sig[0]);
111 case 1:
112 return set1->sig[0] == set2->sig[0];
113 }
114 return 0;
115}
116
100#define sigmask(sig) (1UL << ((sig) - 1)) 117#define sigmask(sig) (1UL << ((sig) - 1))
101 118
102#ifndef __HAVE_ARCH_SIG_SETOPS 119#ifndef __HAVE_ARCH_SIG_SETOPS
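sigequalsets() gives callers a word-wise equality test to pair with the existing sigisemptyset(); the obvious (sketched) use is skipping work when a requested mask is already installed, with 'newset' assumed to be a validated sigset_t pointer:

	/* Sketch: nothing to do if the requested mask matches the current one. */
	if (sigequalsets(&current->blocked, newset))
		return 0;
	__set_current_blocked(newset);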
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index dd66a952e8cd..11b92b047a1e 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -27,7 +27,7 @@
27#define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) 27#define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
28#define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) 28#define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS)
29 29
30extern int handle_userfault(struct fault_env *fe, unsigned long reason); 30extern int handle_userfault(struct vm_fault *vmf, unsigned long reason);
31 31
32extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, 32extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
33 unsigned long src_start, unsigned long len); 33 unsigned long src_start, unsigned long len);
@@ -55,7 +55,7 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
55#else /* CONFIG_USERFAULTFD */ 55#else /* CONFIG_USERFAULTFD */
56 56
57/* mm helpers */ 57/* mm helpers */
58static inline int handle_userfault(struct fault_env *fe, unsigned long reason) 58static inline int handle_userfault(struct vm_fault *vmf, unsigned long reason)
59{ 59{
60 return VM_FAULT_SIGBUS; 60 return VM_FAULT_SIGBUS;
61} 61}
diff --git a/ipc/msg.c b/ipc/msg.c
index 32e9bd837cde..e3e52ce01123 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -763,7 +763,10 @@ static inline int convert_mode(long *msgtyp, int msgflg)
763 if (*msgtyp == 0) 763 if (*msgtyp == 0)
764 return SEARCH_ANY; 764 return SEARCH_ANY;
765 if (*msgtyp < 0) { 765 if (*msgtyp < 0) {
766 *msgtyp = -*msgtyp; 766 if (*msgtyp == LONG_MIN) /* -LONG_MIN is undefined */
767 *msgtyp = LONG_MAX;
768 else
769 *msgtyp = -*msgtyp;
767 return SEARCH_LESSEQUAL; 770 return SEARCH_LESSEQUAL;
768 } 771 }
769 if (msgflg & MSG_EXCEPT) 772 if (msgflg & MSG_EXCEPT)
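The new guard exists because negating LONG_MIN is undefined behaviour (its magnitude is not representable in a long); clamping to LONG_MAX is harmless here since SEARCH_LESSEQUAL only needs an upper bound and no message type can exceed LONG_MAX anyway. A small user-space sketch of the same arithmetic:

#include <limits.h>

/* msgtyp < 0 requests "any message whose type is <= |msgtyp|". */
static long msgtyp_bound(long msgtyp)
{
	if (msgtyp == LONG_MIN)		/* -LONG_MIN would overflow */
		return LONG_MAX;
	return -msgtyp;
}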
diff --git a/ipc/sem.c b/ipc/sem.c
index 10b94bc59d4a..e08b94851922 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -11,6 +11,7 @@
11 * (c) 2001 Red Hat Inc 11 * (c) 2001 Red Hat Inc
12 * Lockless wakeup 12 * Lockless wakeup
13 * (c) 2003 Manfred Spraul <manfred@colorfullife.com> 13 * (c) 2003 Manfred Spraul <manfred@colorfullife.com>
14 * (c) 2016 Davidlohr Bueso <dave@stgolabs.net>
14 * Further wakeup optimizations, documentation 15 * Further wakeup optimizations, documentation
15 * (c) 2010 Manfred Spraul <manfred@colorfullife.com> 16 * (c) 2010 Manfred Spraul <manfred@colorfullife.com>
16 * 17 *
@@ -53,15 +54,11 @@
53 * Semaphores are actively given to waiting tasks (necessary for FIFO). 54 * Semaphores are actively given to waiting tasks (necessary for FIFO).
54 * (see update_queue()) 55 * (see update_queue())
55 * - To improve the scalability, the actual wake-up calls are performed after 56 * - To improve the scalability, the actual wake-up calls are performed after
56 * dropping all locks. (see wake_up_sem_queue_prepare(), 57 * dropping all locks. (see wake_up_sem_queue_prepare())
57 * wake_up_sem_queue_do())
58 * - All work is done by the waker, the woken up task does not have to do 58 * - All work is done by the waker, the woken up task does not have to do
59 * anything - not even acquiring a lock or dropping a refcount. 59 * anything - not even acquiring a lock or dropping a refcount.
60 * - A woken up task may not even touch the semaphore array anymore, it may 60 * - A woken up task may not even touch the semaphore array anymore, it may
61 * have been destroyed already by a semctl(RMID). 61 * have been destroyed already by a semctl(RMID).
62 * - The synchronizations between wake-ups due to a timeout/signal and a
63 * wake-up due to a completed semaphore operation is achieved by using an
64 * intermediate state (IN_WAKEUP).
65 * - UNDO values are stored in an array (one per process and per 62 * - UNDO values are stored in an array (one per process and per
66 * semaphore array, lazily allocated). For backwards compatibility, multiple 63 * semaphore array, lazily allocated). For backwards compatibility, multiple
67 * modes for the UNDO variables are supported (per process, per thread) 64 * modes for the UNDO variables are supported (per process, per thread)
@@ -118,7 +115,8 @@ struct sem_queue {
118 struct sembuf *sops; /* array of pending operations */ 115 struct sembuf *sops; /* array of pending operations */
119 struct sembuf *blocking; /* the operation that blocked */ 116 struct sembuf *blocking; /* the operation that blocked */
120 int nsops; /* number of operations */ 117 int nsops; /* number of operations */
121 int alter; /* does *sops alter the array? */ 118 bool alter; /* does *sops alter the array? */
119 bool dupsop; /* sops on more than one sem_num */
122}; 120};
123 121
124/* Each task has a list of undo requests. They are executed automatically 122/* Each task has a list of undo requests. They are executed automatically
@@ -416,29 +414,6 @@ static inline void sem_unlock(struct sem_array *sma, int locknum)
416 * 414 *
417 * The caller holds the RCU read lock. 415 * The caller holds the RCU read lock.
418 */ 416 */
419static inline struct sem_array *sem_obtain_lock(struct ipc_namespace *ns,
420 int id, struct sembuf *sops, int nsops, int *locknum)
421{
422 struct kern_ipc_perm *ipcp;
423 struct sem_array *sma;
424
425 ipcp = ipc_obtain_object_idr(&sem_ids(ns), id);
426 if (IS_ERR(ipcp))
427 return ERR_CAST(ipcp);
428
429 sma = container_of(ipcp, struct sem_array, sem_perm);
430 *locknum = sem_lock(sma, sops, nsops);
431
432 /* ipc_rmid() may have already freed the ID while sem_lock
433 * was spinning: verify that the structure is still valid
434 */
435 if (ipc_valid_object(ipcp))
436 return container_of(ipcp, struct sem_array, sem_perm);
437
438 sem_unlock(sma, *locknum);
439 return ERR_PTR(-EINVAL);
440}
441
442static inline struct sem_array *sem_obtain_object(struct ipc_namespace *ns, int id) 417static inline struct sem_array *sem_obtain_object(struct ipc_namespace *ns, int id)
443{ 418{
444 struct kern_ipc_perm *ipcp = ipc_obtain_object_idr(&sem_ids(ns), id); 419 struct kern_ipc_perm *ipcp = ipc_obtain_object_idr(&sem_ids(ns), id);
@@ -471,40 +446,6 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
471 ipc_rmid(&sem_ids(ns), &s->sem_perm); 446 ipc_rmid(&sem_ids(ns), &s->sem_perm);
472} 447}
473 448
474/*
475 * Lockless wakeup algorithm:
476 * Without the check/retry algorithm a lockless wakeup is possible:
477 * - queue.status is initialized to -EINTR before blocking.
478 * - wakeup is performed by
479 * * unlinking the queue entry from the pending list
480 * * setting queue.status to IN_WAKEUP
481 * This is the notification for the blocked thread that a
482 * result value is imminent.
483 * * call wake_up_process
484 * * set queue.status to the final value.
485 * - the previously blocked thread checks queue.status:
486 * * if it's IN_WAKEUP, then it must wait until the value changes
487 * * if it's not -EINTR, then the operation was completed by
488 * update_queue. semtimedop can return queue.status without
489 * performing any operation on the sem array.
490 * * otherwise it must acquire the spinlock and check what's up.
491 *
492 * The two-stage algorithm is necessary to protect against the following
493 * races:
494 * - if queue.status is set after wake_up_process, then the woken up idle
495 * thread could race forward and try (and fail) to acquire sma->lock
496 * before update_queue had a chance to set queue.status
497 * - if queue.status is written before wake_up_process and if the
498 * blocked process is woken up by a signal between writing
499 * queue.status and the wake_up_process, then the woken up
500 * process could return from semtimedop and die by calling
501 * sys_exit before wake_up_process is called. Then wake_up_process
502 * will oops, because the task structure is already invalid.
503 * (yes, this happened on s390 with sysv msg).
504 *
505 */
506#define IN_WAKEUP 1
507
508/** 449/**
509 * newary - Create a new semaphore set 450 * newary - Create a new semaphore set
510 * @ns: namespace 451 * @ns: namespace
@@ -624,15 +565,23 @@ SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg)
624} 565}
625 566
626/** 567/**
627 * perform_atomic_semop - Perform (if possible) a semaphore operation 568 * perform_atomic_semop[_slow] - Attempt to perform semaphore
569 * operations on a given array.
628 * @sma: semaphore array 570 * @sma: semaphore array
629 * @q: struct sem_queue that describes the operation 571 * @q: struct sem_queue that describes the operation
630 * 572 *
573 * Caller blocking behaviour is as follows, based on the value

574 * indicated by the semaphore operation (sem_op):
575 *
576 * (1) >0 never blocks.
577 * (2) 0 (wait-for-zero operation): semval is non-zero.
578 * (3) <0 attempting to decrement semval to a value smaller than zero.
579 *
631 * Returns 0 if the operation was possible. 580 * Returns 0 if the operation was possible.
632 * Returns 1 if the operation is impossible, the caller must sleep. 581 * Returns 1 if the operation is impossible, the caller must sleep.
633 * Negative values are error codes. 582 * Returns <0 for error codes.
634 */ 583 */
635static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q) 584static int perform_atomic_semop_slow(struct sem_array *sma, struct sem_queue *q)
636{ 585{
637 int result, sem_op, nsops, pid; 586 int result, sem_op, nsops, pid;
638 struct sembuf *sop; 587 struct sembuf *sop;
@@ -703,51 +652,84 @@ undo:
703 return result; 652 return result;
704} 653}
705 654
706/** wake_up_sem_queue_prepare(q, error): Prepare wake-up 655static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q)
707 * @q: queue entry that must be signaled
708 * @error: Error value for the signal
709 *
710 * Prepare the wake-up of the queue entry q.
711 */
712static void wake_up_sem_queue_prepare(struct list_head *pt,
713 struct sem_queue *q, int error)
714{ 656{
715 if (list_empty(pt)) { 657 int result, sem_op, nsops;
716 /* 658 struct sembuf *sop;
717 * Hold preempt off so that we don't get preempted and have the 659 struct sem *curr;
718 * wakee busy-wait until we're scheduled back on. 660 struct sembuf *sops;
719 */ 661 struct sem_undo *un;
720 preempt_disable(); 662
663 sops = q->sops;
664 nsops = q->nsops;
665 un = q->undo;
666
667 if (unlikely(q->dupsop))
668 return perform_atomic_semop_slow(sma, q);
669
670 /*
671 * We scan the semaphore set twice, first to ensure that the entire
672 * operation can succeed, therefore avoiding any pointless writes
673 * to shared memory and having to undo such changes in order to block
674 * until the operations can go through.
675 */
676 for (sop = sops; sop < sops + nsops; sop++) {
677 curr = sma->sem_base + sop->sem_num;
678 sem_op = sop->sem_op;
679 result = curr->semval;
680
681 if (!sem_op && result)
682 goto would_block; /* wait-for-zero */
683
684 result += sem_op;
685 if (result < 0)
686 goto would_block;
687
688 if (result > SEMVMX)
689 return -ERANGE;
690
691 if (sop->sem_flg & SEM_UNDO) {
692 int undo = un->semadj[sop->sem_num] - sem_op;
693
694 /* Exceeding the undo range is an error. */
695 if (undo < (-SEMAEM - 1) || undo > SEMAEM)
696 return -ERANGE;
697 }
698 }
699
700 for (sop = sops; sop < sops + nsops; sop++) {
701 curr = sma->sem_base + sop->sem_num;
702 sem_op = sop->sem_op;
703 result = curr->semval;
704
705 if (sop->sem_flg & SEM_UNDO) {
706 int undo = un->semadj[sop->sem_num] - sem_op;
707
708 un->semadj[sop->sem_num] = undo;
709 }
710 curr->semval += sem_op;
711 curr->sempid = q->pid;
721 } 712 }
722 q->status = IN_WAKEUP;
723 q->pid = error;
724 713
725 list_add_tail(&q->list, pt); 714 return 0;
715
716would_block:
717 q->blocking = sop;
718 return sop->sem_flg & IPC_NOWAIT ? -EAGAIN : 1;
726} 719}
727 720
728/** 721static inline void wake_up_sem_queue_prepare(struct sem_queue *q, int error,
729 * wake_up_sem_queue_do - do the actual wake-up 722 struct wake_q_head *wake_q)
730 * @pt: list of tasks to be woken up
731 *
732 * Do the actual wake-up.
733 * The function is called without any locks held, thus the semaphore array
734 * could be destroyed already and the tasks can disappear as soon as the
735 * status is set to the actual return code.
736 */
737static void wake_up_sem_queue_do(struct list_head *pt)
738{ 723{
739 struct sem_queue *q, *t; 724 wake_q_add(wake_q, q->sleeper);
740 int did_something; 725 /*
741 726 * Rely on the above implicit barrier, such that we can
742 did_something = !list_empty(pt); 727 * ensure that we hold reference to the task before setting
743 list_for_each_entry_safe(q, t, pt, list) { 728 * q->status. Otherwise we could race with do_exit if the
744 wake_up_process(q->sleeper); 729 * task is awoken by an external event before calling
745 /* q can disappear immediately after writing q->status. */ 730 * wake_up_process().
746 smp_wmb(); 731 */
747 q->status = q->pid; 732 WRITE_ONCE(q->status, error);
748 }
749 if (did_something)
750 preempt_enable();
751} 733}
752 734
753static void unlink_queue(struct sem_array *sma, struct sem_queue *q) 735static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
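The rewritten wake_up_sem_queue_prepare() only queues the sleeper on a wake_q_head and publishes the result with WRITE_ONCE(); the wake-ups themselves happen after every ipc lock is dropped, which is what lets the old IN_WAKEUP handshake be deleted. A sketch of the caller-side pattern this implies, using the locking helpers from elsewhere in ipc/sem.c and the generic wake_q initialiser (error handling omitted):

	struct wake_q_head wake_q = WAKE_Q_HEAD_INITIALIZER(wake_q);
	int locknum;

	rcu_read_lock();
	locknum = sem_lock(sma, sops, nsops);
	/* ... perform the semaphore operations; completed waiters are only
	 * queued on wake_q here, never woken while the lock is held ... */
	do_smart_update(sma, sops, nsops, 1, &wake_q);
	sem_unlock(sma, locknum);
	rcu_read_unlock();

	wake_up_q(&wake_q);	/* safe: no ipc locks held at this point */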
@@ -767,7 +749,7 @@ static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
767 * modified the array. 749 * modified the array.
768 * Note that wait-for-zero operations are handled without restart. 750 * Note that wait-for-zero operations are handled without restart.
769 */ 751 */
770static int check_restart(struct sem_array *sma, struct sem_queue *q) 752static inline int check_restart(struct sem_array *sma, struct sem_queue *q)
771{ 753{
772 /* pending complex alter operations are too difficult to analyse */ 754 /* pending complex alter operations are too difficult to analyse */
773 if (!list_empty(&sma->pending_alter)) 755 if (!list_empty(&sma->pending_alter))
@@ -795,21 +777,20 @@ static int check_restart(struct sem_array *sma, struct sem_queue *q)
795 * wake_const_ops - wake up non-alter tasks 777 * wake_const_ops - wake up non-alter tasks
796 * @sma: semaphore array. 778 * @sma: semaphore array.
797 * @semnum: semaphore that was modified. 779 * @semnum: semaphore that was modified.
798 * @pt: list head for the tasks that must be woken up. 780 * @wake_q: lockless wake-queue head.
799 * 781 *
800 * wake_const_ops must be called after a semaphore in a semaphore array 782 * wake_const_ops must be called after a semaphore in a semaphore array
801 * was set to 0. If complex const operations are pending, wake_const_ops must 783 * was set to 0. If complex const operations are pending, wake_const_ops must
802 * be called with semnum = -1, as well as with the number of each modified 784 * be called with semnum = -1, as well as with the number of each modified
803 * semaphore. 785 * semaphore.
804 * The tasks that must be woken up are added to @pt. The return code 786 * The tasks that must be woken up are added to @wake_q. The return code
805 * is stored in q->pid. 787 * is stored in q->pid.
806 * The function returns 1 if at least one operation was completed successfully. 788 * The function returns 1 if at least one operation was completed successfully.
807 */ 789 */
808static int wake_const_ops(struct sem_array *sma, int semnum, 790static int wake_const_ops(struct sem_array *sma, int semnum,
809 struct list_head *pt) 791 struct wake_q_head *wake_q)
810{ 792{
811 struct sem_queue *q; 793 struct sem_queue *q, *tmp;
812 struct list_head *walk;
813 struct list_head *pending_list; 794 struct list_head *pending_list;
814 int semop_completed = 0; 795 int semop_completed = 0;
815 796
@@ -818,25 +799,19 @@ static int wake_const_ops(struct sem_array *sma, int semnum,
818 else 799 else
819 pending_list = &sma->sem_base[semnum].pending_const; 800 pending_list = &sma->sem_base[semnum].pending_const;
820 801
821 walk = pending_list->next; 802 list_for_each_entry_safe(q, tmp, pending_list, list) {
822 while (walk != pending_list) { 803 int error = perform_atomic_semop(sma, q);
823 int error;
824
825 q = container_of(walk, struct sem_queue, list);
826 walk = walk->next;
827
828 error = perform_atomic_semop(sma, q);
829
830 if (error <= 0) {
831 /* operation completed, remove from queue & wakeup */
832 804
833 unlink_queue(sma, q); 805 if (error > 0)
806 continue;
807 /* operation completed, remove from queue & wakeup */
808 unlink_queue(sma, q);
834 809
835 wake_up_sem_queue_prepare(pt, q, error); 810 wake_up_sem_queue_prepare(q, error, wake_q);
836 if (error == 0) 811 if (error == 0)
837 semop_completed = 1; 812 semop_completed = 1;
838 }
839 } 813 }
814
840 return semop_completed; 815 return semop_completed;
841} 816}
842 817
@@ -845,14 +820,14 @@ static int wake_const_ops(struct sem_array *sma, int semnum,
845 * @sma: semaphore array 820 * @sma: semaphore array
846 * @sops: operations that were performed 821 * @sops: operations that were performed
847 * @nsops: number of operations 822 * @nsops: number of operations
848 * @pt: list head of the tasks that must be woken up. 823 * @wake_q: lockless wake-queue head
849 * 824 *
850 * Checks all required queues for wait-for-zero operations, based 825 * Checks all required queues for wait-for-zero operations, based
851 * on the actual changes that were performed on the semaphore array. 826 * on the actual changes that were performed on the semaphore array.
852 * The function returns 1 if at least one operation was completed successfully. 827 * The function returns 1 if at least one operation was completed successfully.
853 */ 828 */
854static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, 829static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
855 int nsops, struct list_head *pt) 830 int nsops, struct wake_q_head *wake_q)
856{ 831{
857 int i; 832 int i;
858 int semop_completed = 0; 833 int semop_completed = 0;
@@ -865,7 +840,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
865 840
866 if (sma->sem_base[num].semval == 0) { 841 if (sma->sem_base[num].semval == 0) {
867 got_zero = 1; 842 got_zero = 1;
868 semop_completed |= wake_const_ops(sma, num, pt); 843 semop_completed |= wake_const_ops(sma, num, wake_q);
869 } 844 }
870 } 845 }
871 } else { 846 } else {
@@ -876,7 +851,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
876 for (i = 0; i < sma->sem_nsems; i++) { 851 for (i = 0; i < sma->sem_nsems; i++) {
877 if (sma->sem_base[i].semval == 0) { 852 if (sma->sem_base[i].semval == 0) {
878 got_zero = 1; 853 got_zero = 1;
879 semop_completed |= wake_const_ops(sma, i, pt); 854 semop_completed |= wake_const_ops(sma, i, wake_q);
880 } 855 }
881 } 856 }
882 } 857 }
@@ -885,7 +860,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
885 * then check the global queue, too. 860 * then check the global queue, too.
886 */ 861 */
887 if (got_zero) 862 if (got_zero)
888 semop_completed |= wake_const_ops(sma, -1, pt); 863 semop_completed |= wake_const_ops(sma, -1, wake_q);
889 864
890 return semop_completed; 865 return semop_completed;
891} 866}
@@ -895,22 +870,21 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
895 * update_queue - look for tasks that can be completed. 870 * update_queue - look for tasks that can be completed.
896 * @sma: semaphore array. 871 * @sma: semaphore array.
897 * @semnum: semaphore that was modified. 872 * @semnum: semaphore that was modified.
898 * @pt: list head for the tasks that must be woken up. 873 * @wake_q: lockless wake-queue head.
899 * 874 *
900 * update_queue must be called after a semaphore in a semaphore array 875 * update_queue must be called after a semaphore in a semaphore array
901 * was modified. If multiple semaphores were modified, update_queue must 876 * was modified. If multiple semaphores were modified, update_queue must
902 * be called with semnum = -1, as well as with the number of each modified 877 * be called with semnum = -1, as well as with the number of each modified
903 * semaphore. 878 * semaphore.
904 * The tasks that must be woken up are added to @pt. The return code 879 * The tasks that must be woken up are added to @wake_q. The return code
905 * is stored in q->pid. 880 * is stored in q->pid.
906 * The function internally checks if const operations can now succeed. 881 * The function internally checks if const operations can now succeed.
907 * 882 *
908 * The function returns 1 if at least one semop was completed successfully. 883 * The function returns 1 if at least one semop was completed successfully.
909 */ 884 */
910static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt) 885static int update_queue(struct sem_array *sma, int semnum, struct wake_q_head *wake_q)
911{ 886{
912 struct sem_queue *q; 887 struct sem_queue *q, *tmp;
913 struct list_head *walk;
914 struct list_head *pending_list; 888 struct list_head *pending_list;
915 int semop_completed = 0; 889 int semop_completed = 0;
916 890
@@ -920,13 +894,9 @@ static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt)
920 pending_list = &sma->sem_base[semnum].pending_alter; 894 pending_list = &sma->sem_base[semnum].pending_alter;
921 895
922again: 896again:
923 walk = pending_list->next; 897 list_for_each_entry_safe(q, tmp, pending_list, list) {
924 while (walk != pending_list) {
925 int error, restart; 898 int error, restart;
926 899
927 q = container_of(walk, struct sem_queue, list);
928 walk = walk->next;
929
930 /* If we are scanning the single sop, per-semaphore list of 900 /* If we are scanning the single sop, per-semaphore list of
931 * one semaphore and that semaphore is 0, then it is not 901 * one semaphore and that semaphore is 0, then it is not
932 * necessary to scan further: simple increments 902 * necessary to scan further: simple increments
@@ -949,11 +919,11 @@ again:
949 restart = 0; 919 restart = 0;
950 } else { 920 } else {
951 semop_completed = 1; 921 semop_completed = 1;
952 do_smart_wakeup_zero(sma, q->sops, q->nsops, pt); 922 do_smart_wakeup_zero(sma, q->sops, q->nsops, wake_q);
953 restart = check_restart(sma, q); 923 restart = check_restart(sma, q);
954 } 924 }
955 925
956 wake_up_sem_queue_prepare(pt, q, error); 926 wake_up_sem_queue_prepare(q, error, wake_q);
957 if (restart) 927 if (restart)
958 goto again; 928 goto again;
959 } 929 }
@@ -984,24 +954,24 @@ static void set_semotime(struct sem_array *sma, struct sembuf *sops)
984 * @sops: operations that were performed 954 * @sops: operations that were performed
985 * @nsops: number of operations 955 * @nsops: number of operations
986 * @otime: force setting otime 956 * @otime: force setting otime
987 * @pt: list head of the tasks that must be woken up. 957 * @wake_q: lockless wake-queue head
988 * 958 *
989 * do_smart_update() does the required calls to update_queue and wakeup_zero, 959 * do_smart_update() does the required calls to update_queue and wakeup_zero,
990 * based on the actual changes that were performed on the semaphore array. 960 * based on the actual changes that were performed on the semaphore array.
991 * Note that the function does not do the actual wake-up: the caller is 961 * Note that the function does not do the actual wake-up: the caller is
992 * responsible for calling wake_up_sem_queue_do(@pt). 962 * responsible for calling wake_up_q().
993 * It is safe to perform this call after dropping all locks. 963 * It is safe to perform this call after dropping all locks.
994 */ 964 */
995static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops, 965static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops,
996 int otime, struct list_head *pt) 966 int otime, struct wake_q_head *wake_q)
997{ 967{
998 int i; 968 int i;
999 969
1000 otime |= do_smart_wakeup_zero(sma, sops, nsops, pt); 970 otime |= do_smart_wakeup_zero(sma, sops, nsops, wake_q);
1001 971
1002 if (!list_empty(&sma->pending_alter)) { 972 if (!list_empty(&sma->pending_alter)) {
1003 /* semaphore array uses the global queue - just process it. */ 973 /* semaphore array uses the global queue - just process it. */
1004 otime |= update_queue(sma, -1, pt); 974 otime |= update_queue(sma, -1, wake_q);
1005 } else { 975 } else {
1006 if (!sops) { 976 if (!sops) {
1007 /* 977 /*
@@ -1009,7 +979,7 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop
1009 * known. Check all. 979 * known. Check all.
1010 */ 980 */
1011 for (i = 0; i < sma->sem_nsems; i++) 981 for (i = 0; i < sma->sem_nsems; i++)
1012 otime |= update_queue(sma, i, pt); 982 otime |= update_queue(sma, i, wake_q);
1013 } else { 983 } else {
1014 /* 984 /*
1015 * Check the semaphores that were increased: 985 * Check the semaphores that were increased:
@@ -1023,7 +993,7 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop
1023 for (i = 0; i < nsops; i++) { 993 for (i = 0; i < nsops; i++) {
1024 if (sops[i].sem_op > 0) { 994 if (sops[i].sem_op > 0) {
1025 otime |= update_queue(sma, 995 otime |= update_queue(sma,
1026 sops[i].sem_num, pt); 996 sops[i].sem_num, wake_q);
1027 } 997 }
1028 } 998 }
1029 } 999 }
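All the converted callers above follow the same pattern: the tasks to wake are collected on an on-stack wake_q while the semaphore lock is still held, and the actual wake-ups happen only after every lock has been dropped. A minimal sketch of that generic pattern using the lockless wake-queue helpers visible in this diff; the function name and the surrounding locking, shown only as comments, are illustrative:

#include <linux/sched.h>        /* DEFINE_WAKE_Q, wake_q_add(), wake_up_q() */

/* Illustrative only: queue a task for wakeup while a lock is held,
 * then perform the actual wakeup after all locks are dropped. */
static void sketch_deferred_wakeup(struct task_struct *tsk)
{
        DEFINE_WAKE_Q(wake_q);          /* on-stack, lockless wake queue */

        /* ... spinlock held: decide which task must run ... */
        wake_q_add(&wake_q, tsk);       /* takes a task reference, never sleeps */
        /* ... drop the spinlock / leave the RCU read-side section ... */

        wake_up_q(&wake_q);             /* wakes the queued tasks, drops the refs */
}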
@@ -1111,8 +1081,8 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
1111 struct sem_undo *un, *tu; 1081 struct sem_undo *un, *tu;
1112 struct sem_queue *q, *tq; 1082 struct sem_queue *q, *tq;
1113 struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm); 1083 struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm);
1114 struct list_head tasks;
1115 int i; 1084 int i;
1085 DEFINE_WAKE_Q(wake_q);
1116 1086
1117 /* Free the existing undo structures for this semaphore set. */ 1087 /* Free the existing undo structures for this semaphore set. */
1118 ipc_assert_locked_object(&sma->sem_perm); 1088 ipc_assert_locked_object(&sma->sem_perm);
@@ -1126,25 +1096,24 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
1126 } 1096 }
1127 1097
1128 /* Wake up all pending processes and let them fail with EIDRM. */ 1098 /* Wake up all pending processes and let them fail with EIDRM. */
1129 INIT_LIST_HEAD(&tasks);
1130 list_for_each_entry_safe(q, tq, &sma->pending_const, list) { 1099 list_for_each_entry_safe(q, tq, &sma->pending_const, list) {
1131 unlink_queue(sma, q); 1100 unlink_queue(sma, q);
1132 wake_up_sem_queue_prepare(&tasks, q, -EIDRM); 1101 wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
1133 } 1102 }
1134 1103
1135 list_for_each_entry_safe(q, tq, &sma->pending_alter, list) { 1104 list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
1136 unlink_queue(sma, q); 1105 unlink_queue(sma, q);
1137 wake_up_sem_queue_prepare(&tasks, q, -EIDRM); 1106 wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
1138 } 1107 }
1139 for (i = 0; i < sma->sem_nsems; i++) { 1108 for (i = 0; i < sma->sem_nsems; i++) {
1140 struct sem *sem = sma->sem_base + i; 1109 struct sem *sem = sma->sem_base + i;
1141 list_for_each_entry_safe(q, tq, &sem->pending_const, list) { 1110 list_for_each_entry_safe(q, tq, &sem->pending_const, list) {
1142 unlink_queue(sma, q); 1111 unlink_queue(sma, q);
1143 wake_up_sem_queue_prepare(&tasks, q, -EIDRM); 1112 wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
1144 } 1113 }
1145 list_for_each_entry_safe(q, tq, &sem->pending_alter, list) { 1114 list_for_each_entry_safe(q, tq, &sem->pending_alter, list) {
1146 unlink_queue(sma, q); 1115 unlink_queue(sma, q);
1147 wake_up_sem_queue_prepare(&tasks, q, -EIDRM); 1116 wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
1148 } 1117 }
1149 } 1118 }
1150 1119
@@ -1153,7 +1122,7 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
1153 sem_unlock(sma, -1); 1122 sem_unlock(sma, -1);
1154 rcu_read_unlock(); 1123 rcu_read_unlock();
1155 1124
1156 wake_up_sem_queue_do(&tasks); 1125 wake_up_q(&wake_q);
1157 ns->used_sems -= sma->sem_nsems; 1126 ns->used_sems -= sma->sem_nsems;
1158 ipc_rcu_putref(sma, sem_rcu_free); 1127 ipc_rcu_putref(sma, sem_rcu_free);
1159} 1128}
@@ -1292,9 +1261,9 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
1292 struct sem_undo *un; 1261 struct sem_undo *un;
1293 struct sem_array *sma; 1262 struct sem_array *sma;
1294 struct sem *curr; 1263 struct sem *curr;
1295 int err; 1264 int err, val;
1296 struct list_head tasks; 1265 DEFINE_WAKE_Q(wake_q);
1297 int val; 1266
1298#if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN) 1267#if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN)
1299 /* big-endian 64bit */ 1268 /* big-endian 64bit */
1300 val = arg >> 32; 1269 val = arg >> 32;
@@ -1306,8 +1275,6 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
1306 if (val > SEMVMX || val < 0) 1275 if (val > SEMVMX || val < 0)
1307 return -ERANGE; 1276 return -ERANGE;
1308 1277
1309 INIT_LIST_HEAD(&tasks);
1310
1311 rcu_read_lock(); 1278 rcu_read_lock();
1312 sma = sem_obtain_object_check(ns, semid); 1279 sma = sem_obtain_object_check(ns, semid);
1313 if (IS_ERR(sma)) { 1280 if (IS_ERR(sma)) {
@@ -1350,10 +1317,10 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
1350 curr->sempid = task_tgid_vnr(current); 1317 curr->sempid = task_tgid_vnr(current);
1351 sma->sem_ctime = get_seconds(); 1318 sma->sem_ctime = get_seconds();
1352 /* maybe some queued-up processes were waiting for this */ 1319 /* maybe some queued-up processes were waiting for this */
1353 do_smart_update(sma, NULL, 0, 0, &tasks); 1320 do_smart_update(sma, NULL, 0, 0, &wake_q);
1354 sem_unlock(sma, -1); 1321 sem_unlock(sma, -1);
1355 rcu_read_unlock(); 1322 rcu_read_unlock();
1356 wake_up_sem_queue_do(&tasks); 1323 wake_up_q(&wake_q);
1357 return 0; 1324 return 0;
1358} 1325}
1359 1326
@@ -1365,9 +1332,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
1365 int err, nsems; 1332 int err, nsems;
1366 ushort fast_sem_io[SEMMSL_FAST]; 1333 ushort fast_sem_io[SEMMSL_FAST];
1367 ushort *sem_io = fast_sem_io; 1334 ushort *sem_io = fast_sem_io;
1368 struct list_head tasks; 1335 DEFINE_WAKE_Q(wake_q);
1369
1370 INIT_LIST_HEAD(&tasks);
1371 1336
1372 rcu_read_lock(); 1337 rcu_read_lock();
1373 sma = sem_obtain_object_check(ns, semid); 1338 sma = sem_obtain_object_check(ns, semid);
@@ -1478,7 +1443,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
1478 } 1443 }
1479 sma->sem_ctime = get_seconds(); 1444 sma->sem_ctime = get_seconds();
1480 /* maybe some queued-up processes were waiting for this */ 1445 /* maybe some queued-up processes were waiting for this */
1481 do_smart_update(sma, NULL, 0, 0, &tasks); 1446 do_smart_update(sma, NULL, 0, 0, &wake_q);
1482 err = 0; 1447 err = 0;
1483 goto out_unlock; 1448 goto out_unlock;
1484 } 1449 }
@@ -1514,7 +1479,7 @@ out_unlock:
1514 sem_unlock(sma, -1); 1479 sem_unlock(sma, -1);
1515out_rcu_wakeup: 1480out_rcu_wakeup:
1516 rcu_read_unlock(); 1481 rcu_read_unlock();
1517 wake_up_sem_queue_do(&tasks); 1482 wake_up_q(&wake_q);
1518out_free: 1483out_free:
1519 if (sem_io != fast_sem_io) 1484 if (sem_io != fast_sem_io)
1520 ipc_free(sem_io); 1485 ipc_free(sem_io);
@@ -1787,32 +1752,6 @@ out:
1787 return un; 1752 return un;
1788} 1753}
1789 1754
1790
1791/**
1792 * get_queue_result - retrieve the result code from sem_queue
1793 * @q: Pointer to queue structure
1794 *
1795 * Retrieve the return code from the pending queue. If IN_WAKEUP is found in
1796 * q->status, then we must loop until the value is replaced with the final
1797 * value: This may happen if a task is woken up by an unrelated event (e.g.
1798 * signal) and in parallel the task is woken up by another task because it got
1799 * the requested semaphores.
1800 *
1801 * The function can be called with or without holding the semaphore spinlock.
1802 */
1803static int get_queue_result(struct sem_queue *q)
1804{
1805 int error;
1806
1807 error = q->status;
1808 while (unlikely(error == IN_WAKEUP)) {
1809 cpu_relax();
1810 error = q->status;
1811 }
1812
1813 return error;
1814}
1815
1816SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, 1755SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
1817 unsigned, nsops, const struct timespec __user *, timeout) 1756 unsigned, nsops, const struct timespec __user *, timeout)
1818{ 1757{
@@ -1821,11 +1760,11 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
1821 struct sembuf fast_sops[SEMOPM_FAST]; 1760 struct sembuf fast_sops[SEMOPM_FAST];
1822 struct sembuf *sops = fast_sops, *sop; 1761 struct sembuf *sops = fast_sops, *sop;
1823 struct sem_undo *un; 1762 struct sem_undo *un;
1824 int undos = 0, alter = 0, max, locknum; 1763 int max, locknum;
1764 bool undos = false, alter = false, dupsop = false;
1825 struct sem_queue queue; 1765 struct sem_queue queue;
1826 unsigned long jiffies_left = 0; 1766 unsigned long dup = 0, jiffies_left = 0;
1827 struct ipc_namespace *ns; 1767 struct ipc_namespace *ns;
1828 struct list_head tasks;
1829 1768
1830 ns = current->nsproxy->ipc_ns; 1769 ns = current->nsproxy->ipc_ns;
1831 1770
@@ -1838,10 +1777,12 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
1838 if (sops == NULL) 1777 if (sops == NULL)
1839 return -ENOMEM; 1778 return -ENOMEM;
1840 } 1779 }
1780
1841 if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) { 1781 if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) {
1842 error = -EFAULT; 1782 error = -EFAULT;
1843 goto out_free; 1783 goto out_free;
1844 } 1784 }
1785
1845 if (timeout) { 1786 if (timeout) {
1846 struct timespec _timeout; 1787 struct timespec _timeout;
1847 if (copy_from_user(&_timeout, timeout, sizeof(*timeout))) { 1788 if (copy_from_user(&_timeout, timeout, sizeof(*timeout))) {
@@ -1855,18 +1796,30 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
1855 } 1796 }
1856 jiffies_left = timespec_to_jiffies(&_timeout); 1797 jiffies_left = timespec_to_jiffies(&_timeout);
1857 } 1798 }
1799
1858 max = 0; 1800 max = 0;
1859 for (sop = sops; sop < sops + nsops; sop++) { 1801 for (sop = sops; sop < sops + nsops; sop++) {
1802 unsigned long mask = 1ULL << ((sop->sem_num) % BITS_PER_LONG);
1803
1860 if (sop->sem_num >= max) 1804 if (sop->sem_num >= max)
1861 max = sop->sem_num; 1805 max = sop->sem_num;
1862 if (sop->sem_flg & SEM_UNDO) 1806 if (sop->sem_flg & SEM_UNDO)
1863 undos = 1; 1807 undos = true;
1864 if (sop->sem_op != 0) 1808 if (dup & mask) {
1865 alter = 1; 1809 /*
1810 * There was a previous alter access that appears
1811 * to have accessed the same semaphore, thus use
1812 * the dupsop logic. "appears", because the detection
1813 * can only check % BITS_PER_LONG.
1814 */
1815 dupsop = true;
1816 }
1817 if (sop->sem_op != 0) {
1818 alter = true;
1819 dup |= mask;
1820 }
1866 } 1821 }
1867 1822
1868 INIT_LIST_HEAD(&tasks);
1869
1870 if (undos) { 1823 if (undos) {
1871 /* On success, find_alloc_undo takes the rcu_read_lock */ 1824 /* On success, find_alloc_undo takes the rcu_read_lock */
1872 un = find_alloc_undo(ns, semid); 1825 un = find_alloc_undo(ns, semid);
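The dup/dupsop logic added above is only a heuristic: it records one bit per (sem_num % BITS_PER_LONG) for each altering operation, so two distinct semaphores may collide on the same bit. A collision merely forces the slower dupsop path; it can never let a real duplicate go undetected. A simplified, self-contained sketch of that heuristic follows; the helper is illustrative and not part of ipc/sem.c:

#include <linux/bitops.h>       /* BITS_PER_LONG */
#include <linux/types.h>

/* Conservative duplicate detection: false positives are acceptable,
 * false negatives are not. */
static bool sketch_may_have_dup(const unsigned short *sem_nums, int nsops)
{
        unsigned long seen = 0;
        int i;

        for (i = 0; i < nsops; i++) {
                unsigned long mask = 1UL << (sem_nums[i] % BITS_PER_LONG);

                if (seen & mask)
                        return true;    /* possibly the same semaphore twice */
                seen |= mask;
        }
        return false;
}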
@@ -1887,16 +1840,22 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
1887 } 1840 }
1888 1841
1889 error = -EFBIG; 1842 error = -EFBIG;
1890 if (max >= sma->sem_nsems) 1843 if (max >= sma->sem_nsems) {
1891 goto out_rcu_wakeup; 1844 rcu_read_unlock();
1845 goto out_free;
1846 }
1892 1847
1893 error = -EACCES; 1848 error = -EACCES;
1894 if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO)) 1849 if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO)) {
1895 goto out_rcu_wakeup; 1850 rcu_read_unlock();
1851 goto out_free;
1852 }
1896 1853
1897 error = security_sem_semop(sma, sops, nsops, alter); 1854 error = security_sem_semop(sma, sops, nsops, alter);
1898 if (error) 1855 if (error) {
1899 goto out_rcu_wakeup; 1856 rcu_read_unlock();
1857 goto out_free;
1858 }
1900 1859
1901 error = -EIDRM; 1860 error = -EIDRM;
1902 locknum = sem_lock(sma, sops, nsops); 1861 locknum = sem_lock(sma, sops, nsops);
@@ -1925,24 +1884,34 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
1925 queue.undo = un; 1884 queue.undo = un;
1926 queue.pid = task_tgid_vnr(current); 1885 queue.pid = task_tgid_vnr(current);
1927 queue.alter = alter; 1886 queue.alter = alter;
1887 queue.dupsop = dupsop;
1928 1888
1929 error = perform_atomic_semop(sma, &queue); 1889 error = perform_atomic_semop(sma, &queue);
1930 if (error == 0) { 1890 if (error == 0) { /* non-blocking successful path */
1931 /* If the operation was successful, then do 1891 DEFINE_WAKE_Q(wake_q);
1892
1893 /*
1894 * If the operation was successful, then do
1932 * the required updates. 1895 * the required updates.
1933 */ 1896 */
1934 if (alter) 1897 if (alter)
1935 do_smart_update(sma, sops, nsops, 1, &tasks); 1898 do_smart_update(sma, sops, nsops, 1, &wake_q);
1936 else 1899 else
1937 set_semotime(sma, sops); 1900 set_semotime(sma, sops);
1901
1902 sem_unlock(sma, locknum);
1903 rcu_read_unlock();
1904 wake_up_q(&wake_q);
1905
1906 goto out_free;
1938 } 1907 }
1939 if (error <= 0) 1908 if (error < 0) /* non-blocking error path */
1940 goto out_unlock_free; 1909 goto out_unlock_free;
1941 1910
1942 /* We need to sleep on this operation, so we put the current 1911 /*
1912 * We need to sleep on this operation, so we put the current
1943 * task into the pending queue and go to sleep. 1913 * task into the pending queue and go to sleep.
1944 */ 1914 */
1945
1946 if (nsops == 1) { 1915 if (nsops == 1) {
1947 struct sem *curr; 1916 struct sem *curr;
1948 curr = &sma->sem_base[sops->sem_num]; 1917 curr = &sma->sem_base[sops->sem_num];
@@ -1971,77 +1940,69 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
1971 sma->complex_count++; 1940 sma->complex_count++;
1972 } 1941 }
1973 1942
1974 queue.status = -EINTR; 1943 do {
1975 queue.sleeper = current; 1944 queue.status = -EINTR;
1945 queue.sleeper = current;
1976 1946
1977sleep_again: 1947 __set_current_state(TASK_INTERRUPTIBLE);
1978 __set_current_state(TASK_INTERRUPTIBLE); 1948 sem_unlock(sma, locknum);
1979 sem_unlock(sma, locknum); 1949 rcu_read_unlock();
1980 rcu_read_unlock();
1981
1982 if (timeout)
1983 jiffies_left = schedule_timeout(jiffies_left);
1984 else
1985 schedule();
1986 1950
1987 error = get_queue_result(&queue); 1951 if (timeout)
1952 jiffies_left = schedule_timeout(jiffies_left);
1953 else
1954 schedule();
1988 1955
1989 if (error != -EINTR) { 1956 /*
1990 /* fast path: update_queue already obtained all requested 1957 * fastpath: the semop has completed; whether it succeeded or
1991 * resources. 1958 * not is, from the syscall's point of view, irrelevant at this
1992 * Perform a smp_mb(): User space could assume that semop() 1959 * point; we're done.
1993 * is a memory barrier: Without the mb(), the cpu could 1960 *
1994 * speculatively read in user space stale data that was 1961 * We _do_ care, nonetheless, about being awoken by a signal or
1995 * overwritten by the previous owner of the semaphore. 1962 * spuriously. The queue.status is checked again in the
1963 * slowpath (aka after taking sem_lock), such that we can detect
1964 * scenarios where we were awakened externally, during the
1965 * window between wake_q_add() and wake_up_q().
1996 */ 1966 */
1997 smp_mb(); 1967 error = READ_ONCE(queue.status);
1998 1968 if (error != -EINTR) {
1999 goto out_free; 1969 /*
2000 } 1970 * User space could assume that semop() is a memory
2001 1971 * barrier: Without the mb(), the cpu could
2002 rcu_read_lock(); 1972 * speculatively read in userspace stale data that was
2003 sma = sem_obtain_lock(ns, semid, sops, nsops, &locknum); 1973 * overwritten by the previous owner of the semaphore.
2004 1974 */
2005 /* 1975 smp_mb();
2006 * Wait until it's guaranteed that no wakeup_sem_queue_do() is ongoing. 1976 goto out_free;
2007 */ 1977 }
2008 error = get_queue_result(&queue);
2009 1978
2010 /* 1979 rcu_read_lock();
2011 * Array removed? If yes, leave without sem_unlock(). 1980 sem_lock(sma, sops, nsops);
2012 */
2013 if (IS_ERR(sma)) {
2014 rcu_read_unlock();
2015 goto out_free;
2016 }
2017 1981
1982 if (!ipc_valid_object(&sma->sem_perm))
1983 goto out_unlock_free;
2018 1984
2019 /* 1985 error = READ_ONCE(queue.status);
2020 * If queue.status != -EINTR we are woken up by another process.
2021 * Leave without unlink_queue(), but with sem_unlock().
2022 */
2023 if (error != -EINTR)
2024 goto out_unlock_free;
2025 1986
2026 /* 1987 /*
2027 * If an interrupt occurred we have to clean up the queue 1988 * If queue.status != -EINTR we are woken up by another process.
2028 */ 1989 * Leave without unlink_queue(), but with sem_unlock().
2029 if (timeout && jiffies_left == 0) 1990 */
2030 error = -EAGAIN; 1991 if (error != -EINTR)
1992 goto out_unlock_free;
2031 1993
2032 /* 1994 /*
2033 * If the wakeup was spurious, just retry 1995 * If an interrupt occurred we have to clean up the queue.
2034 */ 1996 */
2035 if (error == -EINTR && !signal_pending(current)) 1997 if (timeout && jiffies_left == 0)
2036 goto sleep_again; 1998 error = -EAGAIN;
1999 } while (error == -EINTR && !signal_pending(current)); /* spurious */
2037 2000
2038 unlink_queue(sma, &queue); 2001 unlink_queue(sma, &queue);
2039 2002
2040out_unlock_free: 2003out_unlock_free:
2041 sem_unlock(sma, locknum); 2004 sem_unlock(sma, locknum);
2042out_rcu_wakeup:
2043 rcu_read_unlock(); 2005 rcu_read_unlock();
2044 wake_up_sem_queue_do(&tasks);
2045out_free: 2006out_free:
2046 if (sops != fast_sops) 2007 if (sops != fast_sops)
2047 kfree(sops); 2008 kfree(sops);
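The new sleep loop above relies on a small protocol between sleeper and waker: the sleeper publishes queue.status = -EINTR before dropping the lock, the waker stores the final result and then queues the task on its wake_q, and the sleeper re-reads the status with READ_ONCE() both locklessly (fast path) and again under the lock, to catch wakeups that raced with wake_up_q(). A hedged, stand-alone sketch of the sleeper side; all names are illustrative, not the ipc/sem.c ones:

#include <linux/sched.h>
#include <linux/spinlock.h>

struct sketch_waiter {
        int status;                     /* -EINTR until a waker stores a result */
};

/* Called with @lock held; returns with it released on every path. */
static int sketch_wait(struct sketch_waiter *w, spinlock_t *lock)
{
        int error;

        do {
                WRITE_ONCE(w->status, -EINTR);
                __set_current_state(TASK_INTERRUPTIBLE);
                spin_unlock(lock);

                schedule();

                /* lockless fast path: a waker may already have stored a result */
                error = READ_ONCE(w->status);
                if (error != -EINTR)
                        return error;

                spin_lock(lock);
                error = READ_ONCE(w->status);   /* re-check under the lock */
        } while (error == -EINTR && !signal_pending(current));    /* spurious? */

        spin_unlock(lock);
        return error;
}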
@@ -2102,8 +2063,8 @@ void exit_sem(struct task_struct *tsk)
2102 for (;;) { 2063 for (;;) {
2103 struct sem_array *sma; 2064 struct sem_array *sma;
2104 struct sem_undo *un; 2065 struct sem_undo *un;
2105 struct list_head tasks;
2106 int semid, i; 2066 int semid, i;
2067 DEFINE_WAKE_Q(wake_q);
2107 2068
2108 cond_resched(); 2069 cond_resched();
2109 2070
@@ -2191,11 +2152,10 @@ void exit_sem(struct task_struct *tsk)
2191 } 2152 }
2192 } 2153 }
2193 /* maybe some queued-up processes were waiting for this */ 2154 /* maybe some queued-up processes were waiting for this */
2194 INIT_LIST_HEAD(&tasks); 2155 do_smart_update(sma, NULL, 0, 1, &wake_q);
2195 do_smart_update(sma, NULL, 0, 1, &tasks);
2196 sem_unlock(sma, -1); 2156 sem_unlock(sma, -1);
2197 rcu_read_unlock(); 2157 rcu_read_unlock();
2198 wake_up_sem_queue_do(&tasks); 2158 wake_up_q(&wake_q);
2199 2159
2200 kfree_rcu(un, rcu); 2160 kfree_rcu(un, rcu);
2201 } 2161 }
diff --git a/ipc/shm.c b/ipc/shm.c
index dbac8860c721..81203e8ba013 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -89,6 +89,7 @@ void shm_init_ns(struct ipc_namespace *ns)
89static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) 89static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
90{ 90{
91 struct shmid_kernel *shp; 91 struct shmid_kernel *shp;
92
92 shp = container_of(ipcp, struct shmid_kernel, shm_perm); 93 shp = container_of(ipcp, struct shmid_kernel, shm_perm);
93 94
94 if (shp->shm_nattch) { 95 if (shp->shm_nattch) {
@@ -387,6 +388,7 @@ static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
387 struct file *file = vma->vm_file; 388 struct file *file = vma->vm_file;
388 struct shm_file_data *sfd = shm_file_data(file); 389 struct shm_file_data *sfd = shm_file_data(file);
389 int err = 0; 390 int err = 0;
391
390 if (sfd->vm_ops->set_policy) 392 if (sfd->vm_ops->set_policy)
391 err = sfd->vm_ops->set_policy(vma, new); 393 err = sfd->vm_ops->set_policy(vma, new);
392 return err; 394 return err;
@@ -417,7 +419,7 @@ static int shm_mmap(struct file *file, struct vm_area_struct *vma)
417 * In case of remap_file_pages() emulation, the file can represent 419 * In case of remap_file_pages() emulation, the file can represent
418 * removed IPC ID: propagate shm_lock() error to caller. 420 * removed IPC ID: propagate shm_lock() error to caller.
419 */ 421 */
420 ret =__shm_open(vma); 422 ret = __shm_open(vma);
421 if (ret) 423 if (ret)
422 return ret; 424 return ret;
423 425
@@ -468,6 +470,7 @@ static unsigned long shm_get_unmapped_area(struct file *file,
468 unsigned long flags) 470 unsigned long flags)
469{ 471{
470 struct shm_file_data *sfd = shm_file_data(file); 472 struct shm_file_data *sfd = shm_file_data(file);
473
471 return sfd->file->f_op->get_unmapped_area(sfd->file, addr, len, 474 return sfd->file->f_op->get_unmapped_area(sfd->file, addr, len,
472 pgoff, flags); 475 pgoff, flags);
473} 476}
@@ -766,6 +769,7 @@ static void shm_add_rss_swap(struct shmid_kernel *shp,
766 } else { 769 } else {
767#ifdef CONFIG_SHMEM 770#ifdef CONFIG_SHMEM
768 struct shmem_inode_info *info = SHMEM_I(inode); 771 struct shmem_inode_info *info = SHMEM_I(inode);
772
769 spin_lock_irq(&info->lock); 773 spin_lock_irq(&info->lock);
770 *rss_add += inode->i_mapping->nrpages; 774 *rss_add += inode->i_mapping->nrpages;
771 *swp_add += info->swapped; 775 *swp_add += info->swapped;
@@ -1028,6 +1032,7 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
1028 1032
1029 if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) { 1033 if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) {
1030 kuid_t euid = current_euid(); 1034 kuid_t euid = current_euid();
1035
1031 if (!uid_eq(euid, shp->shm_perm.uid) && 1036 if (!uid_eq(euid, shp->shm_perm.uid) &&
1032 !uid_eq(euid, shp->shm_perm.cuid)) { 1037 !uid_eq(euid, shp->shm_perm.cuid)) {
1033 err = -EPERM; 1038 err = -EPERM;
@@ -1045,6 +1050,7 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
1045 1050
1046 if (cmd == SHM_LOCK) { 1051 if (cmd == SHM_LOCK) {
1047 struct user_struct *user = current_user(); 1052 struct user_struct *user = current_user();
1053
1048 err = shmem_lock(shm_file, 1, user); 1054 err = shmem_lock(shm_file, 1, user);
1049 if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) { 1055 if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) {
1050 shp->shm_perm.mode |= SHM_LOCKED; 1056 shp->shm_perm.mode |= SHM_LOCKED;
@@ -1354,9 +1360,10 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
1354 vma = next; 1360 vma = next;
1355 } 1361 }
1356 1362
1357#else /* CONFIG_MMU */ 1363#else /* CONFIG_MMU */
1358 /* under NOMMU conditions, the exact address to be destroyed must be 1364 /* under NOMMU conditions, the exact address to be destroyed must be
1359 * given */ 1365 * given
1366 */
1360 if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) { 1367 if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) {
1361 do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); 1368 do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
1362 retval = 0; 1369 retval = 0;
diff --git a/kernel/Makefile b/kernel/Makefile
index eaee9de224bd..12c679f769c6 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -84,6 +84,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o
84obj-$(CONFIG_KGDB) += debug/ 84obj-$(CONFIG_KGDB) += debug/
85obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o 85obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
86obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o 86obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
87obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog_hld.o
87obj-$(CONFIG_SECCOMP) += seccomp.o 88obj-$(CONFIG_SECCOMP) += seccomp.o
88obj-$(CONFIG_RELAY) += relay.o 89obj-$(CONFIG_RELAY) += relay.o
89obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 90obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0874e2edd275..79517e5549f1 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -598,11 +598,11 @@ return_normal:
598 /* 598 /*
599 * Wait for the other CPUs to be notified and be waiting for us: 599 * Wait for the other CPUs to be notified and be waiting for us:
600 */ 600 */
601 time_left = loops_per_jiffy * HZ; 601 time_left = MSEC_PER_SEC;
602 while (kgdb_do_roundup && --time_left && 602 while (kgdb_do_roundup && --time_left &&
603 (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) != 603 (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) !=
604 online_cpus) 604 online_cpus)
605 cpu_relax(); 605 udelay(1000);
606 if (!time_left) 606 if (!time_left)
607 pr_crit("Timed out waiting for secondary CPUs.\n"); 607 pr_crit("Timed out waiting for secondary CPUs.\n");
608 608
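The debug_core hunk above changes the CPU round-up wait from counting loops_per_jiffy * HZ iterations of cpu_relax() to counting milliseconds, so the timeout is roughly one second of wall-clock time regardless of CPU speed. A small sketch of that bounded-wait shape; the helper name and callback are assumptions, not kernel API:

#include <linux/delay.h>        /* udelay() */
#include <linux/types.h>

/* @timeout_ms must be non-zero; returns false if the wait timed out. */
static bool sketch_wait_for(bool (*done)(void), unsigned int timeout_ms)
{
        while (!done() && --timeout_ms)
                udelay(1000);           /* roughly one millisecond per iteration */

        return timeout_ms != 0;
}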
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 98c9011eac78..e74be38245ad 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -30,6 +30,7 @@
30char kdb_prompt_str[CMD_BUFLEN]; 30char kdb_prompt_str[CMD_BUFLEN];
31 31
32int kdb_trap_printk; 32int kdb_trap_printk;
33int kdb_printf_cpu = -1;
33 34
34static int kgdb_transition_check(char *buffer) 35static int kgdb_transition_check(char *buffer)
35{ 36{
@@ -554,31 +555,26 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
554 int linecount; 555 int linecount;
555 int colcount; 556 int colcount;
556 int logging, saved_loglevel = 0; 557 int logging, saved_loglevel = 0;
557 int saved_trap_printk;
558 int got_printf_lock = 0;
559 int retlen = 0; 558 int retlen = 0;
560 int fnd, len; 559 int fnd, len;
560 int this_cpu, old_cpu;
561 char *cp, *cp2, *cphold = NULL, replaced_byte = ' '; 561 char *cp, *cp2, *cphold = NULL, replaced_byte = ' ';
562 char *moreprompt = "more> "; 562 char *moreprompt = "more> ";
563 struct console *c = console_drivers; 563 struct console *c = console_drivers;
564 static DEFINE_SPINLOCK(kdb_printf_lock);
565 unsigned long uninitialized_var(flags); 564 unsigned long uninitialized_var(flags);
566 565
567 preempt_disable();
568 saved_trap_printk = kdb_trap_printk;
569 kdb_trap_printk = 0;
570
571 /* Serialize kdb_printf if multiple cpus try to write at once. 566 /* Serialize kdb_printf if multiple cpus try to write at once.
572 * But if any cpu goes recursive in kdb, just print the output, 567 * But if any cpu goes recursive in kdb, just print the output,
573 * even if it is interleaved with any other text. 568 * even if it is interleaved with any other text.
574 */ 569 */
575 if (!KDB_STATE(PRINTF_LOCK)) { 570 local_irq_save(flags);
576 KDB_STATE_SET(PRINTF_LOCK); 571 this_cpu = smp_processor_id();
577 spin_lock_irqsave(&kdb_printf_lock, flags); 572 for (;;) {
578 got_printf_lock = 1; 573 old_cpu = cmpxchg(&kdb_printf_cpu, -1, this_cpu);
579 atomic_inc(&kdb_event); 574 if (old_cpu == -1 || old_cpu == this_cpu)
580 } else { 575 break;
581 __acquire(kdb_printf_lock); 576
577 cpu_relax();
582 } 578 }
583 579
584 diag = kdbgetintenv("LINES", &linecount); 580 diag = kdbgetintenv("LINES", &linecount);
@@ -847,16 +843,9 @@ kdb_print_out:
847 suspend_grep = 0; /* end of what may have been a recursive call */ 843 suspend_grep = 0; /* end of what may have been a recursive call */
848 if (logging) 844 if (logging)
849 console_loglevel = saved_loglevel; 845 console_loglevel = saved_loglevel;
850 if (KDB_STATE(PRINTF_LOCK) && got_printf_lock) { 846 /* kdb_printf_cpu locked the code above. */
851 got_printf_lock = 0; 847 smp_store_release(&kdb_printf_cpu, old_cpu);
852 spin_unlock_irqrestore(&kdb_printf_lock, flags); 848 local_irq_restore(flags);
853 KDB_STATE_CLEAR(PRINTF_LOCK);
854 atomic_dec(&kdb_event);
855 } else {
856 __release(kdb_printf_lock);
857 }
858 kdb_trap_printk = saved_trap_printk;
859 preempt_enable();
860 return retlen; 849 return retlen;
861} 850}
862 851
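The kdb_printf changes above replace the old spinlock-plus-state-flag scheme with a single "owning cpu" word: a cpu acquires it by cmpxchg()-ing kdb_printf_cpu from -1 to its own id (re-entry from the same cpu falls straight through), and releases it with smp_store_release(). A condensed sketch of that acquire/release pair with illustrative names; callers are assumed to have interrupts disabled, as the real code does via local_irq_save():

#include <linux/atomic.h>       /* cmpxchg() */
#include <linux/smp.h>          /* smp_processor_id() */

static int sketch_owner_cpu = -1;

/* Returns the previous owner so a recursive caller can restore it. */
static int sketch_lock_cpu(void)
{
        int this_cpu = smp_processor_id();
        int old;

        for (;;) {
                old = cmpxchg(&sketch_owner_cpu, -1, this_cpu);
                if (old == -1 || old == this_cpu)
                        return old;     /* got it, or already ours (recursion) */
                cpu_relax();
        }
}

static void sketch_unlock_cpu(int old)
{
        /* publish the release; the next cmpxchg() acquirer observes it */
        smp_store_release(&sketch_owner_cpu, old);
}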
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 2a20c0dfdafc..ca183919d302 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -60,7 +60,6 @@ int kdb_grep_trailing;
60 * Kernel debugger state flags 60 * Kernel debugger state flags
61 */ 61 */
62int kdb_flags; 62int kdb_flags;
63atomic_t kdb_event;
64 63
65/* 64/*
66 * kdb_lock protects updates to kdb_initial_cpu. Used to 65 * kdb_lock protects updates to kdb_initial_cpu. Used to
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 75014d7f4568..fc224fbcf954 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -132,7 +132,6 @@ extern int kdb_state;
132#define KDB_STATE_PAGER 0x00000400 /* pager is available */ 132#define KDB_STATE_PAGER 0x00000400 /* pager is available */
133#define KDB_STATE_GO_SWITCH 0x00000800 /* go is switching 133#define KDB_STATE_GO_SWITCH 0x00000800 /* go is switching
134 * back to initial cpu */ 134 * back to initial cpu */
135#define KDB_STATE_PRINTF_LOCK 0x00001000 /* Holds kdb_printf lock */
136#define KDB_STATE_WAIT_IPI 0x00002000 /* Waiting for kdb_ipi() NMI */ 135#define KDB_STATE_WAIT_IPI 0x00002000 /* Waiting for kdb_ipi() NMI */
137#define KDB_STATE_RECURSE 0x00004000 /* Recursive entry to kdb */ 136#define KDB_STATE_RECURSE 0x00004000 /* Recursive entry to kdb */
138#define KDB_STATE_IP_ADJUSTED 0x00008000 /* Restart IP has been 137#define KDB_STATE_IP_ADJUSTED 0x00008000 /* Restart IP has been
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index f9ec9add2164..215871bda3a2 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -301,7 +301,7 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
301retry: 301retry:
302 /* Read the page with vaddr into memory */ 302 /* Read the page with vaddr into memory */
303 ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page, 303 ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page,
304 &vma); 304 &vma, NULL);
305 if (ret <= 0) 305 if (ret <= 0)
306 return ret; 306 return ret;
307 307
@@ -1712,7 +1712,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
1712 * essentially a kernel access to the memory. 1712 * essentially a kernel access to the memory.
1713 */ 1713 */
1714 result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page, 1714 result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page,
1715 NULL); 1715 NULL, NULL);
1716 if (result < 0) 1716 if (result < 0)
1717 return result; 1717 return result;
1718 1718
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 3cbb0c879705..cc2fa35ca480 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -1,11 +1,16 @@
1#define pr_fmt(fmt) "kcov: " fmt 1#define pr_fmt(fmt) "kcov: " fmt
2 2
3#define DISABLE_BRANCH_PROFILING 3#define DISABLE_BRANCH_PROFILING
4#include <linux/atomic.h>
4#include <linux/compiler.h> 5#include <linux/compiler.h>
6#include <linux/errno.h>
7#include <linux/export.h>
5#include <linux/types.h> 8#include <linux/types.h>
6#include <linux/file.h> 9#include <linux/file.h>
7#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/init.h>
8#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/preempt.h>
9#include <linux/printk.h> 14#include <linux/printk.h>
10#include <linux/sched.h> 15#include <linux/sched.h>
11#include <linux/slab.h> 16#include <linux/slab.h>
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 561675589511..5617cc412444 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -441,6 +441,8 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
441 while (hole_end <= crashk_res.end) { 441 while (hole_end <= crashk_res.end) {
442 unsigned long i; 442 unsigned long i;
443 443
444 cond_resched();
445
444 if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) 446 if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
445 break; 447 break;
446 /* See if I overlap any of the segments */ 448 /* See if I overlap any of the segments */
@@ -1467,9 +1469,6 @@ static int __init crash_save_vmcoreinfo_init(void)
1467#endif 1469#endif
1468 VMCOREINFO_NUMBER(PG_head_mask); 1470 VMCOREINFO_NUMBER(PG_head_mask);
1469 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); 1471 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
1470#ifdef CONFIG_X86
1471 VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
1472#endif
1473#ifdef CONFIG_HUGETLB_PAGE 1472#ifdef CONFIG_HUGETLB_PAGE
1474 VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); 1473 VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
1475#endif 1474#endif
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 577f2288d19f..a3ce35e0fa1e 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1926,7 +1926,8 @@ int vprintk_default(const char *fmt, va_list args)
1926 int r; 1926 int r;
1927 1927
1928#ifdef CONFIG_KGDB_KDB 1928#ifdef CONFIG_KGDB_KDB
1929 if (unlikely(kdb_trap_printk)) { 1929 /* Allow passing printk() to kdb but avoid recursion. */
1930 if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) {
1930 r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); 1931 r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
1931 return r; 1932 return r;
1932 } 1933 }
diff --git a/kernel/relay.c b/kernel/relay.c
index da79a109dbeb..8f18d314a96a 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -809,11 +809,11 @@ void relay_subbufs_consumed(struct rchan *chan,
809{ 809{
810 struct rchan_buf *buf; 810 struct rchan_buf *buf;
811 811
812 if (!chan) 812 if (!chan || cpu >= NR_CPUS)
813 return; 813 return;
814 814
815 buf = *per_cpu_ptr(chan->buf, cpu); 815 buf = *per_cpu_ptr(chan->buf, cpu);
816 if (cpu >= NR_CPUS || !buf || subbufs_consumed > chan->n_subbufs) 816 if (!buf || subbufs_consumed > chan->n_subbufs)
817 return; 817 return;
818 818
819 if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed) 819 if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed)
diff --git a/kernel/signal.c b/kernel/signal.c
index 29a410780aa9..ae60996fedff 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2491,6 +2491,13 @@ void __set_current_blocked(const sigset_t *newset)
2491{ 2491{
2492 struct task_struct *tsk = current; 2492 struct task_struct *tsk = current;
2493 2493
2494 /*
2495 * In case the signal mask hasn't changed, there is nothing we need
2496 * to do. The current->blocked shouldn't be modified by any other task.
2497 */
2498 if (sigequalsets(&tsk->blocked, newset))
2499 return;
2500
2494 spin_lock_irq(&tsk->sighand->siglock); 2501 spin_lock_irq(&tsk->sighand->siglock);
2495 __set_task_blocked(tsk, newset); 2502 __set_task_blocked(tsk, newset);
2496 spin_unlock_irq(&tsk->sighand->siglock); 2503 spin_unlock_irq(&tsk->sighand->siglock);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 39b3368f6de6..1475d2545b7e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2389,9 +2389,11 @@ static void validate_coredump_safety(void)
2389#ifdef CONFIG_COREDUMP 2389#ifdef CONFIG_COREDUMP
2390 if (suid_dumpable == SUID_DUMP_ROOT && 2390 if (suid_dumpable == SUID_DUMP_ROOT &&
2391 core_pattern[0] != '/' && core_pattern[0] != '|') { 2391 core_pattern[0] != '/' && core_pattern[0] != '|') {
2392 printk(KERN_WARNING "Unsafe core_pattern used with "\ 2392 printk(KERN_WARNING
2393 "suid_dumpable=2. Pipe handler or fully qualified "\ 2393"Unsafe core_pattern used with fs.suid_dumpable=2.\n"
2394 "core dump path required.\n"); 2394"Pipe handler or fully qualified core dump path required.\n"
2395"Set kernel.core_pattern before fs.suid_dumpable.\n"
2396 );
2395 } 2397 }
2396#endif 2398#endif
2397} 2399}
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 6eb99c17dbd8..ece4b177052b 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1354,8 +1354,8 @@ static void deprecated_sysctl_warning(const int *name, int nlen)
1354 "warning: process `%s' used the deprecated sysctl " 1354 "warning: process `%s' used the deprecated sysctl "
1355 "system call with ", current->comm); 1355 "system call with ", current->comm);
1356 for (i = 0; i < nlen; i++) 1356 for (i = 0; i < nlen; i++)
1357 printk("%d.", name[i]); 1357 printk(KERN_CONT "%d.", name[i]);
1358 printk("\n"); 1358 printk(KERN_CONT "\n");
1359 } 1359 }
1360 return; 1360 return;
1361} 1361}
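The KERN_CONT change above matters because, after the printk continuation-line rework, a printk() call without a continuation marker starts a new message; assembling one line from several calls therefore needs KERN_CONT on the follow-up pieces. A tiny illustration, closely following the hunk above (the helper name is illustrative):

#include <linux/kernel.h>       /* printk(), KERN_WARNING, KERN_CONT */

static void sketch_print_name(const int *name, int nlen)
{
        int i;

        printk(KERN_WARNING "deprecated sysctl used with ");
        for (i = 0; i < nlen; i++)
                printk(KERN_CONT "%d.", name[i]);       /* continue the same line */
        printk(KERN_CONT "\n");
}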
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 9b08ca391aed..3921cf7fea8e 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -516,7 +516,8 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
516 516
517 spin_lock_irqsave(&ptr->it_lock, flags); 517 spin_lock_irqsave(&ptr->it_lock, flags);
518 if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) { 518 if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) {
519 if (posix_timer_event(ptr, 0) != 0) 519 if (IS_ENABLED(CONFIG_POSIX_TIMERS) &&
520 posix_timer_event(ptr, 0) != 0)
520 ptr->it_overrun++; 521 ptr->it_overrun++;
521 } 522 }
522 523
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 9acb29f280ec..d4b0fa01cae3 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -24,32 +24,14 @@
24 24
25#include <asm/irq_regs.h> 25#include <asm/irq_regs.h>
26#include <linux/kvm_para.h> 26#include <linux/kvm_para.h>
27#include <linux/perf_event.h>
28#include <linux/kthread.h> 27#include <linux/kthread.h>
29 28
30/*
31 * The run state of the lockup detectors is controlled by the content of the
32 * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit -
33 * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector.
34 *
35 * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled'
36 * are variables that are only used as an 'interface' between the parameters
37 * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The
38 * 'watchdog_thresh' variable is handled differently because its value is not
39 * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh'
40 * is equal zero.
41 */
42#define NMI_WATCHDOG_ENABLED_BIT 0
43#define SOFT_WATCHDOG_ENABLED_BIT 1
44#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT)
45#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT)
46
47static DEFINE_MUTEX(watchdog_proc_mutex); 29static DEFINE_MUTEX(watchdog_proc_mutex);
48 30
49#ifdef CONFIG_HARDLOCKUP_DETECTOR 31#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
50static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; 32unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED;
51#else 33#else
52static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; 34unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
53#endif 35#endif
54int __read_mostly nmi_watchdog_enabled; 36int __read_mostly nmi_watchdog_enabled;
55int __read_mostly soft_watchdog_enabled; 37int __read_mostly soft_watchdog_enabled;
@@ -59,9 +41,6 @@ int __read_mostly watchdog_thresh = 10;
59#ifdef CONFIG_SMP 41#ifdef CONFIG_SMP
60int __read_mostly sysctl_softlockup_all_cpu_backtrace; 42int __read_mostly sysctl_softlockup_all_cpu_backtrace;
61int __read_mostly sysctl_hardlockup_all_cpu_backtrace; 43int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
62#else
63#define sysctl_softlockup_all_cpu_backtrace 0
64#define sysctl_hardlockup_all_cpu_backtrace 0
65#endif 44#endif
66static struct cpumask watchdog_cpumask __read_mostly; 45static struct cpumask watchdog_cpumask __read_mostly;
67unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); 46unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
@@ -100,50 +79,9 @@ static DEFINE_PER_CPU(bool, soft_watchdog_warn);
100static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); 79static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
101static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); 80static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
102static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved); 81static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
103#ifdef CONFIG_HARDLOCKUP_DETECTOR
104static DEFINE_PER_CPU(bool, hard_watchdog_warn);
105static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
106static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); 82static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
107static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
108#endif
109static unsigned long soft_lockup_nmi_warn; 83static unsigned long soft_lockup_nmi_warn;
110 84
111/* boot commands */
112/*
113 * Should we panic when a soft-lockup or hard-lockup occurs:
114 */
115#ifdef CONFIG_HARDLOCKUP_DETECTOR
116unsigned int __read_mostly hardlockup_panic =
117 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
118static unsigned long hardlockup_allcpu_dumped;
119/*
120 * We may not want to enable hard lockup detection by default in all cases,
121 * for example when running the kernel as a guest on a hypervisor. In these
122 * cases this function can be called to disable hard lockup detection. This
123 * function should only be executed once by the boot processor before the
124 * kernel command line parameters are parsed, because otherwise it is not
125 * possible to override this in hardlockup_panic_setup().
126 */
127void hardlockup_detector_disable(void)
128{
129 watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
130}
131
132static int __init hardlockup_panic_setup(char *str)
133{
134 if (!strncmp(str, "panic", 5))
135 hardlockup_panic = 1;
136 else if (!strncmp(str, "nopanic", 7))
137 hardlockup_panic = 0;
138 else if (!strncmp(str, "0", 1))
139 watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
140 else if (!strncmp(str, "1", 1))
141 watchdog_enabled |= NMI_WATCHDOG_ENABLED;
142 return 1;
143}
144__setup("nmi_watchdog=", hardlockup_panic_setup);
145#endif
146
147unsigned int __read_mostly softlockup_panic = 85unsigned int __read_mostly softlockup_panic =
148 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; 86 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
149 87
@@ -264,32 +202,14 @@ void touch_all_softlockup_watchdogs(void)
264 wq_watchdog_touch(-1); 202 wq_watchdog_touch(-1);
265} 203}
266 204
267#ifdef CONFIG_HARDLOCKUP_DETECTOR
268void touch_nmi_watchdog(void)
269{
270 /*
271 * Using __raw here because some code paths have
272 * preemption enabled. If preemption is enabled
273 * then interrupts should be enabled too, in which
274 * case we shouldn't have to worry about the watchdog
275 * going off.
276 */
277 raw_cpu_write(watchdog_nmi_touch, true);
278 touch_softlockup_watchdog();
279}
280EXPORT_SYMBOL(touch_nmi_watchdog);
281
282#endif
283
284void touch_softlockup_watchdog_sync(void) 205void touch_softlockup_watchdog_sync(void)
285{ 206{
286 __this_cpu_write(softlockup_touch_sync, true); 207 __this_cpu_write(softlockup_touch_sync, true);
287 __this_cpu_write(watchdog_touch_ts, 0); 208 __this_cpu_write(watchdog_touch_ts, 0);
288} 209}
289 210
290#ifdef CONFIG_HARDLOCKUP_DETECTOR
291/* watchdog detector functions */ 211/* watchdog detector functions */
292static bool is_hardlockup(void) 212bool is_hardlockup(void)
293{ 213{
294 unsigned long hrint = __this_cpu_read(hrtimer_interrupts); 214 unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
295 215
@@ -299,7 +219,6 @@ static bool is_hardlockup(void)
299 __this_cpu_write(hrtimer_interrupts_saved, hrint); 219 __this_cpu_write(hrtimer_interrupts_saved, hrint);
300 return false; 220 return false;
301} 221}
302#endif
303 222
304static int is_softlockup(unsigned long touch_ts) 223static int is_softlockup(unsigned long touch_ts)
305{ 224{
@@ -313,78 +232,22 @@ static int is_softlockup(unsigned long touch_ts)
313 return 0; 232 return 0;
314} 233}
315 234
316#ifdef CONFIG_HARDLOCKUP_DETECTOR
317
318static struct perf_event_attr wd_hw_attr = {
319 .type = PERF_TYPE_HARDWARE,
320 .config = PERF_COUNT_HW_CPU_CYCLES,
321 .size = sizeof(struct perf_event_attr),
322 .pinned = 1,
323 .disabled = 1,
324};
325
326/* Callback function for perf event subsystem */
327static void watchdog_overflow_callback(struct perf_event *event,
328 struct perf_sample_data *data,
329 struct pt_regs *regs)
330{
331 /* Ensure the watchdog never gets throttled */
332 event->hw.interrupts = 0;
333
334 if (__this_cpu_read(watchdog_nmi_touch) == true) {
335 __this_cpu_write(watchdog_nmi_touch, false);
336 return;
337 }
338
339 /* check for a hardlockup
340 * This is done by making sure our timer interrupt
341 * is incrementing. The timer interrupt should have
342 * fired multiple times before we overflow'd. If it hasn't
343 * then this is a good indication the cpu is stuck
344 */
345 if (is_hardlockup()) {
346 int this_cpu = smp_processor_id();
347 struct pt_regs *regs = get_irq_regs();
348
349 /* only print hardlockups once */
350 if (__this_cpu_read(hard_watchdog_warn) == true)
351 return;
352
353 pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
354 print_modules();
355 print_irqtrace_events(current);
356 if (regs)
357 show_regs(regs);
358 else
359 dump_stack();
360
361 /*
362 * Perform all-CPU dump only once to avoid multiple hardlockups
363 * generating interleaving traces
364 */
365 if (sysctl_hardlockup_all_cpu_backtrace &&
366 !test_and_set_bit(0, &hardlockup_allcpu_dumped))
367 trigger_allbutself_cpu_backtrace();
368
369 if (hardlockup_panic)
370 nmi_panic(regs, "Hard LOCKUP");
371
372 __this_cpu_write(hard_watchdog_warn, true);
373 return;
374 }
375
376 __this_cpu_write(hard_watchdog_warn, false);
377 return;
378}
379#endif /* CONFIG_HARDLOCKUP_DETECTOR */
380
381static void watchdog_interrupt_count(void) 235static void watchdog_interrupt_count(void)
382{ 236{
383 __this_cpu_inc(hrtimer_interrupts); 237 __this_cpu_inc(hrtimer_interrupts);
384} 238}
385 239
386static int watchdog_nmi_enable(unsigned int cpu); 240/*
387static void watchdog_nmi_disable(unsigned int cpu); 241 * These two functions are mostly architecture specific,
 242 * so define them as weak here.
243 */
244int __weak watchdog_nmi_enable(unsigned int cpu)
245{
246 return 0;
247}
248void __weak watchdog_nmi_disable(unsigned int cpu)
249{
250}
388 251
389static int watchdog_enable_all_cpus(void); 252static int watchdog_enable_all_cpus(void);
390static void watchdog_disable_all_cpus(void); 253static void watchdog_disable_all_cpus(void);
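The hunk above turns watchdog_nmi_enable()/watchdog_nmi_disable() into weak, do-nothing defaults; the perf-based implementation that used to live here moves to the new kernel/watchdog_hld.c, whose strong definitions override the weak ones at link time when CONFIG_HARDLOCKUP_DETECTOR is set. A condensed view of the two sides of that pattern, taken from the diff itself and trimmed to comments (the two definitions live in separate files, which is what makes the override legal):

/* kernel/watchdog.c: weak defaults, used when no hardlockup detector is built */
int __weak watchdog_nmi_enable(unsigned int cpu)
{
        return 0;
}

void __weak watchdog_nmi_disable(unsigned int cpu)
{
}

/* kernel/watchdog_hld.c (CONFIG_HARDLOCKUP_DETECTOR=y): the strong definition
 * replaces the weak one at link time. */
int watchdog_nmi_enable(unsigned int cpu)
{
        /* ... create and enable the per-cpu perf event ... */
        return 0;
}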
@@ -577,109 +440,6 @@ static void watchdog(unsigned int cpu)
577 watchdog_nmi_disable(cpu); 440 watchdog_nmi_disable(cpu);
578} 441}
579 442
580#ifdef CONFIG_HARDLOCKUP_DETECTOR
581/*
582 * People like the simple clean cpu node info on boot.
583 * Reduce the watchdog noise by only printing messages
584 * that are different from what cpu0 displayed.
585 */
586static unsigned long cpu0_err;
587
588static int watchdog_nmi_enable(unsigned int cpu)
589{
590 struct perf_event_attr *wd_attr;
591 struct perf_event *event = per_cpu(watchdog_ev, cpu);
592
593 /* nothing to do if the hard lockup detector is disabled */
594 if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
595 goto out;
596
597 /* is it already setup and enabled? */
598 if (event && event->state > PERF_EVENT_STATE_OFF)
599 goto out;
600
601 /* it is setup but not enabled */
602 if (event != NULL)
603 goto out_enable;
604
605 wd_attr = &wd_hw_attr;
606 wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
607
608 /* Try to register using hardware perf events */
609 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
610
611 /* save cpu0 error for future comparision */
612 if (cpu == 0 && IS_ERR(event))
613 cpu0_err = PTR_ERR(event);
614
615 if (!IS_ERR(event)) {
616 /* only print for cpu0 or different than cpu0 */
617 if (cpu == 0 || cpu0_err)
618 pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
619 goto out_save;
620 }
621
622 /*
623 * Disable the hard lockup detector if _any_ CPU fails to set up
624 * set up the hardware perf event. The watchdog() function checks
625 * the NMI_WATCHDOG_ENABLED bit periodically.
626 *
627 * The barriers are for syncing up watchdog_enabled across all the
628 * cpus, as clear_bit() does not use barriers.
629 */
630 smp_mb__before_atomic();
631 clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled);
632 smp_mb__after_atomic();
633
634 /* skip displaying the same error again */
635 if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
636 return PTR_ERR(event);
637
638 /* vary the KERN level based on the returned errno */
639 if (PTR_ERR(event) == -EOPNOTSUPP)
640 pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
641 else if (PTR_ERR(event) == -ENOENT)
642 pr_warn("disabled (cpu%i): hardware events not enabled\n",
643 cpu);
644 else
645 pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
646 cpu, PTR_ERR(event));
647
648 pr_info("Shutting down hard lockup detector on all cpus\n");
649
650 return PTR_ERR(event);
651
652 /* success path */
653out_save:
654 per_cpu(watchdog_ev, cpu) = event;
655out_enable:
656 perf_event_enable(per_cpu(watchdog_ev, cpu));
657out:
658 return 0;
659}
660
661static void watchdog_nmi_disable(unsigned int cpu)
662{
663 struct perf_event *event = per_cpu(watchdog_ev, cpu);
664
665 if (event) {
666 perf_event_disable(event);
667 per_cpu(watchdog_ev, cpu) = NULL;
668
669 /* should be in cleanup, but blocks oprofile */
670 perf_event_release_kernel(event);
671 }
672 if (cpu == 0) {
673 /* watchdog_nmi_enable() expects this to be zero initially. */
674 cpu0_err = 0;
675 }
676}
677
678#else
679static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
680static void watchdog_nmi_disable(unsigned int cpu) { return; }
681#endif /* CONFIG_HARDLOCKUP_DETECTOR */
682
683static struct smp_hotplug_thread watchdog_threads = { 443static struct smp_hotplug_thread watchdog_threads = {
684 .store = &softlockup_watchdog, 444 .store = &softlockup_watchdog,
685 .thread_should_run = watchdog_should_run, 445 .thread_should_run = watchdog_should_run,
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
new file mode 100644
index 000000000000..84016c8aee6b
--- /dev/null
+++ b/kernel/watchdog_hld.c
@@ -0,0 +1,227 @@
1/*
2 * Detect hard lockups on a system
3 *
4 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
5 *
6 * Note: Most of this code is borrowed heavily from the original softlockup
7 * detector, so thanks to Ingo for the initial implementation.
8 * Some chunks also taken from the old x86-specific nmi watchdog code, thanks
9 * to those contributors as well.
10 */
11
12#define pr_fmt(fmt) "NMI watchdog: " fmt
13
14#include <linux/nmi.h>
15#include <linux/module.h>
16#include <asm/irq_regs.h>
17#include <linux/perf_event.h>
18
19static DEFINE_PER_CPU(bool, hard_watchdog_warn);
20static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
21static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
22
23/* boot commands */
24/*
25 * Should we panic when a soft-lockup or hard-lockup occurs:
26 */
27unsigned int __read_mostly hardlockup_panic =
28 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
29static unsigned long hardlockup_allcpu_dumped;
30/*
31 * We may not want to enable hard lockup detection by default in all cases,
32 * for example when running the kernel as a guest on a hypervisor. In these
33 * cases this function can be called to disable hard lockup detection. This
34 * function should only be executed once by the boot processor before the
35 * kernel command line parameters are parsed, because otherwise it is not
36 * possible to override this in hardlockup_panic_setup().
37 */
38void hardlockup_detector_disable(void)
39{
40 watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
41}
42
43static int __init hardlockup_panic_setup(char *str)
44{
45 if (!strncmp(str, "panic", 5))
46 hardlockup_panic = 1;
47 else if (!strncmp(str, "nopanic", 7))
48 hardlockup_panic = 0;
49 else if (!strncmp(str, "0", 1))
50 watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
51 else if (!strncmp(str, "1", 1))
52 watchdog_enabled |= NMI_WATCHDOG_ENABLED;
53 return 1;
54}
55__setup("nmi_watchdog=", hardlockup_panic_setup);
56
57void touch_nmi_watchdog(void)
58{
59 /*
60 * Using __raw here because some code paths have
61 * preemption enabled. If preemption is enabled
62 * then interrupts should be enabled too, in which
63 * case we shouldn't have to worry about the watchdog
64 * going off.
65 */
66 raw_cpu_write(watchdog_nmi_touch, true);
67 touch_softlockup_watchdog();
68}
69EXPORT_SYMBOL(touch_nmi_watchdog);
70
71static struct perf_event_attr wd_hw_attr = {
72 .type = PERF_TYPE_HARDWARE,
73 .config = PERF_COUNT_HW_CPU_CYCLES,
74 .size = sizeof(struct perf_event_attr),
75 .pinned = 1,
76 .disabled = 1,
77};
78
79/* Callback function for perf event subsystem */
80static void watchdog_overflow_callback(struct perf_event *event,
81 struct perf_sample_data *data,
82 struct pt_regs *regs)
83{
84 /* Ensure the watchdog never gets throttled */
85 event->hw.interrupts = 0;
86
87 if (__this_cpu_read(watchdog_nmi_touch) == true) {
88 __this_cpu_write(watchdog_nmi_touch, false);
89 return;
90 }
91
92 /* check for a hardlockup
93 * This is done by making sure our timer interrupt
94 * is incrementing. The timer interrupt should have
95 * fired multiple times before we overflow'd. If it hasn't
96 * then this is a good indication the cpu is stuck
97 */
98 if (is_hardlockup()) {
99 int this_cpu = smp_processor_id();
100
101 /* only print hardlockups once */
102 if (__this_cpu_read(hard_watchdog_warn) == true)
103 return;
104
105 pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
106 print_modules();
107 print_irqtrace_events(current);
108 if (regs)
109 show_regs(regs);
110 else
111 dump_stack();
112
113 /*
114 * Perform all-CPU dump only once to avoid multiple hardlockups
115 * generating interleaving traces
116 */
117 if (sysctl_hardlockup_all_cpu_backtrace &&
118 !test_and_set_bit(0, &hardlockup_allcpu_dumped))
119 trigger_allbutself_cpu_backtrace();
120
121 if (hardlockup_panic)
122 nmi_panic(regs, "Hard LOCKUP");
123
124 __this_cpu_write(hard_watchdog_warn, true);
125 return;
126 }
127
128 __this_cpu_write(hard_watchdog_warn, false);
129 return;
130}
131
132/*
133 * People like the simple clean cpu node info on boot.
134 * Reduce the watchdog noise by only printing messages
135 * that are different from what cpu0 displayed.
136 */
137static unsigned long cpu0_err;
138
139int watchdog_nmi_enable(unsigned int cpu)
140{
141 struct perf_event_attr *wd_attr;
142 struct perf_event *event = per_cpu(watchdog_ev, cpu);
143
144 /* nothing to do if the hard lockup detector is disabled */
145 if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
146 goto out;
147
148 /* is it already setup and enabled? */
149 if (event && event->state > PERF_EVENT_STATE_OFF)
150 goto out;
151
152 /* it is setup but not enabled */
153 if (event != NULL)
154 goto out_enable;
155
156 wd_attr = &wd_hw_attr;
157 wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
158
159 /* Try to register using hardware perf events */
160 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
161
 162	/* save cpu0 error for future comparison */
163 if (cpu == 0 && IS_ERR(event))
164 cpu0_err = PTR_ERR(event);
165
166 if (!IS_ERR(event)) {
167 /* only print for cpu0 or different than cpu0 */
168 if (cpu == 0 || cpu0_err)
169 pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
170 goto out_save;
171 }
172
173 /*
 174	 * Disable the hard lockup detector if _any_ CPU fails to set up
 175	 * the hardware perf event. The watchdog() function checks
176 * the NMI_WATCHDOG_ENABLED bit periodically.
177 *
178 * The barriers are for syncing up watchdog_enabled across all the
179 * cpus, as clear_bit() does not use barriers.
180 */
181 smp_mb__before_atomic();
182 clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled);
183 smp_mb__after_atomic();
184
185 /* skip displaying the same error again */
186 if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
187 return PTR_ERR(event);
188
189 /* vary the KERN level based on the returned errno */
190 if (PTR_ERR(event) == -EOPNOTSUPP)
191 pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
192 else if (PTR_ERR(event) == -ENOENT)
193 pr_warn("disabled (cpu%i): hardware events not enabled\n",
194 cpu);
195 else
196 pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
197 cpu, PTR_ERR(event));
198
199 pr_info("Shutting down hard lockup detector on all cpus\n");
200
201 return PTR_ERR(event);
202
203 /* success path */
204out_save:
205 per_cpu(watchdog_ev, cpu) = event;
206out_enable:
207 perf_event_enable(per_cpu(watchdog_ev, cpu));
208out:
209 return 0;
210}
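[Editorial note] The sample period passed to perf above comes from hw_nmi_get_sample_period(), an arch hook. On x86 it is, roughly, the number of CPU cycles in watchdog_thresh seconds; the sketch below reflects arch/x86/kernel/apic/hw_nmi.c at around this time and is included for orientation only.

/*
 * Roughly the x86 implementation: fire the NMI after watchdog_thresh
 * seconds worth of unhalted CPU cycles (cpu_khz is in kHz, hence * 1000).
 */
u64 hw_nmi_get_sample_period(int watchdog_thresh)
{
	return (u64)(cpu_khz) * 1000 * watchdog_thresh;
}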
211
212void watchdog_nmi_disable(unsigned int cpu)
213{
214 struct perf_event *event = per_cpu(watchdog_ev, cpu);
215
216 if (event) {
217 perf_event_disable(event);
218 per_cpu(watchdog_ev, cpu) = NULL;
219
220 /* should be in cleanup, but blocks oprofile */
221 perf_event_release_kernel(event);
222 }
223 if (cpu == 0) {
224 /* watchdog_nmi_enable() expects this to be zero initially. */
225 cpu0_err = 0;
226 }
227}
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index e6327d102184..7446097f72bd 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -194,8 +194,8 @@ config GDB_SCRIPTS
194 build directory. If you load vmlinux into gdb, the helper 194 build directory. If you load vmlinux into gdb, the helper
195 scripts will be automatically imported by gdb as well, and 195 scripts will be automatically imported by gdb as well, and
196 additional functions are available to analyze a Linux kernel 196 additional functions are available to analyze a Linux kernel
197 instance. See Documentation/gdb-kernel-debugging.txt for further 197 instance. See Documentation/dev-tools/gdb-kernel-debugging.rst
198 details. 198 for further details.
199 199
200config ENABLE_WARN_DEPRECATED 200config ENABLE_WARN_DEPRECATED
201 bool "Enable __deprecated logic" 201 bool "Enable __deprecated logic"
@@ -542,7 +542,7 @@ config DEBUG_KMEMLEAK
542 difference being that the orphan objects are not freed but 542 difference being that the orphan objects are not freed but
543 only shown in /sys/kernel/debug/kmemleak. Enabling this 543 only shown in /sys/kernel/debug/kmemleak. Enabling this
544 feature will introduce an overhead to memory 544 feature will introduce an overhead to memory
545 allocations. See Documentation/kmemleak.txt for more 545 allocations. See Documentation/dev-tools/kmemleak.rst for more
546 details. 546 details.
547 547
548 Enabling DEBUG_SLAB or SLUB_DEBUG may increase the chances 548 Enabling DEBUG_SLAB or SLUB_DEBUG may increase the chances
@@ -739,7 +739,7 @@ config KCOV
739 different machines and across reboots. If you need stable PC values, 739 different machines and across reboots. If you need stable PC values,
740 disable RANDOMIZE_BASE. 740 disable RANDOMIZE_BASE.
741 741
742 For more details, see Documentation/kcov.txt. 742 For more details, see Documentation/dev-tools/kcov.rst.
743 743
744config KCOV_INSTRUMENT_ALL 744config KCOV_INSTRUMENT_ALL
745 bool "Instrument all code by default" 745 bool "Instrument all code by default"
diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan
index bc6e651df68c..a669c193b878 100644
--- a/lib/Kconfig.ubsan
+++ b/lib/Kconfig.ubsan
@@ -10,7 +10,8 @@ config UBSAN
10 This option enables undefined behaviour sanity checker 10 This option enables undefined behaviour sanity checker
11 Compile-time instrumentation is used to detect various undefined 11 Compile-time instrumentation is used to detect various undefined
12 behaviours in runtime. Various types of checks may be enabled 12 behaviours in runtime. Various types of checks may be enabled
13 via boot parameter ubsan_handle (see: Documentation/ubsan.txt). 13 via boot parameter ubsan_handle
14 (see: Documentation/dev-tools/ubsan.rst).
14 15
15config UBSAN_SANITIZE_ALL 16config UBSAN_SANITIZE_ALL
16 bool "Enable instrumentation for the entire kernel" 17 bool "Enable instrumentation for the entire kernel"
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 2e8c6f7aa56e..0019aca0f328 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -22,6 +22,7 @@
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 */ 23 */
24 24
25#include <linux/cpu.h>
25#include <linux/errno.h> 26#include <linux/errno.h>
26#include <linux/init.h> 27#include <linux/init.h>
27#include <linux/kernel.h> 28#include <linux/kernel.h>
@@ -69,6 +70,11 @@ struct radix_tree_preload {
69}; 70};
70static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; 71static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
71 72
73static inline struct radix_tree_node *entry_to_node(void *ptr)
74{
75 return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE);
76}
77
72static inline void *node_to_entry(void *ptr) 78static inline void *node_to_entry(void *ptr)
73{ 79{
74 return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE); 80 return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE);
@@ -191,13 +197,12 @@ static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
191 * Returns next bit offset, or size if nothing found. 197 * Returns next bit offset, or size if nothing found.
192 */ 198 */
193static __always_inline unsigned long 199static __always_inline unsigned long
194radix_tree_find_next_bit(const unsigned long *addr, 200radix_tree_find_next_bit(struct radix_tree_node *node, unsigned int tag,
195 unsigned long size, unsigned long offset) 201 unsigned long offset)
196{ 202{
197 if (!__builtin_constant_p(size)) 203 const unsigned long *addr = node->tags[tag];
198 return find_next_bit(addr, size, offset);
199 204
200 if (offset < size) { 205 if (offset < RADIX_TREE_MAP_SIZE) {
201 unsigned long tmp; 206 unsigned long tmp;
202 207
203 addr += offset / BITS_PER_LONG; 208 addr += offset / BITS_PER_LONG;
@@ -205,14 +210,32 @@ radix_tree_find_next_bit(const unsigned long *addr,
205 if (tmp) 210 if (tmp)
206 return __ffs(tmp) + offset; 211 return __ffs(tmp) + offset;
207 offset = (offset + BITS_PER_LONG) & ~(BITS_PER_LONG - 1); 212 offset = (offset + BITS_PER_LONG) & ~(BITS_PER_LONG - 1);
208 while (offset < size) { 213 while (offset < RADIX_TREE_MAP_SIZE) {
209 tmp = *++addr; 214 tmp = *++addr;
210 if (tmp) 215 if (tmp)
211 return __ffs(tmp) + offset; 216 return __ffs(tmp) + offset;
212 offset += BITS_PER_LONG; 217 offset += BITS_PER_LONG;
213 } 218 }
214 } 219 }
215 return size; 220 return RADIX_TREE_MAP_SIZE;
221}
222
223static unsigned int iter_offset(const struct radix_tree_iter *iter)
224{
225 return (iter->index >> iter_shift(iter)) & RADIX_TREE_MAP_MASK;
226}
227
228/*
229 * The maximum index which can be stored in a radix tree
230 */
231static inline unsigned long shift_maxindex(unsigned int shift)
232{
233 return (RADIX_TREE_MAP_SIZE << shift) - 1;
234}
235
236static inline unsigned long node_maxindex(struct radix_tree_node *node)
237{
238 return shift_maxindex(node->shift);
216} 239}
217 240
218#ifndef __KERNEL__ 241#ifndef __KERNEL__
@@ -220,10 +243,11 @@ static void dump_node(struct radix_tree_node *node, unsigned long index)
220{ 243{
221 unsigned long i; 244 unsigned long i;
222 245
223 pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d exceptional %d parent %p\n", 246 pr_debug("radix node: %p offset %d indices %lu-%lu parent %p tags %lx %lx %lx shift %d count %d exceptional %d\n",
224 node, node->offset, 247 node, node->offset, index, index | node_maxindex(node),
248 node->parent,
225 node->tags[0][0], node->tags[1][0], node->tags[2][0], 249 node->tags[0][0], node->tags[1][0], node->tags[2][0],
226 node->shift, node->count, node->exceptional, node->parent); 250 node->shift, node->count, node->exceptional);
227 251
228 for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { 252 for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
229 unsigned long first = index | (i << node->shift); 253 unsigned long first = index | (i << node->shift);
@@ -231,14 +255,16 @@ static void dump_node(struct radix_tree_node *node, unsigned long index)
231 void *entry = node->slots[i]; 255 void *entry = node->slots[i];
232 if (!entry) 256 if (!entry)
233 continue; 257 continue;
234 if (is_sibling_entry(node, entry)) { 258 if (entry == RADIX_TREE_RETRY) {
235 pr_debug("radix sblng %p offset %ld val %p indices %ld-%ld\n", 259 pr_debug("radix retry offset %ld indices %lu-%lu parent %p\n",
236 entry, i, 260 i, first, last, node);
237 *(void **)entry_to_node(entry),
238 first, last);
239 } else if (!radix_tree_is_internal_node(entry)) { 261 } else if (!radix_tree_is_internal_node(entry)) {
240 pr_debug("radix entry %p offset %ld indices %ld-%ld\n", 262 pr_debug("radix entry %p offset %ld indices %lu-%lu parent %p\n",
241 entry, i, first, last); 263 entry, i, first, last, node);
264 } else if (is_sibling_entry(node, entry)) {
265 pr_debug("radix sblng %p offset %ld indices %lu-%lu parent %p val %p\n",
266 entry, i, first, last, node,
267 *(void **)entry_to_node(entry));
242 } else { 268 } else {
243 dump_node(entry_to_node(entry), first); 269 dump_node(entry_to_node(entry), first);
244 } 270 }
@@ -262,7 +288,10 @@ static void radix_tree_dump(struct radix_tree_root *root)
262 * that the caller has pinned this thread of control to the current CPU. 288 * that the caller has pinned this thread of control to the current CPU.
263 */ 289 */
264static struct radix_tree_node * 290static struct radix_tree_node *
265radix_tree_node_alloc(struct radix_tree_root *root) 291radix_tree_node_alloc(struct radix_tree_root *root,
292 struct radix_tree_node *parent,
293 unsigned int shift, unsigned int offset,
294 unsigned int count, unsigned int exceptional)
266{ 295{
267 struct radix_tree_node *ret = NULL; 296 struct radix_tree_node *ret = NULL;
268 gfp_t gfp_mask = root_gfp_mask(root); 297 gfp_t gfp_mask = root_gfp_mask(root);
@@ -307,6 +336,13 @@ radix_tree_node_alloc(struct radix_tree_root *root)
307 ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); 336 ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
308out: 337out:
309 BUG_ON(radix_tree_is_internal_node(ret)); 338 BUG_ON(radix_tree_is_internal_node(ret));
339 if (ret) {
340 ret->parent = parent;
341 ret->shift = shift;
342 ret->offset = offset;
343 ret->count = count;
344 ret->exceptional = exceptional;
345 }
310 return ret; 346 return ret;
311} 347}
312 348
@@ -314,17 +350,15 @@ static void radix_tree_node_rcu_free(struct rcu_head *head)
314{ 350{
315 struct radix_tree_node *node = 351 struct radix_tree_node *node =
316 container_of(head, struct radix_tree_node, rcu_head); 352 container_of(head, struct radix_tree_node, rcu_head);
317 int i;
318 353
319 /* 354 /*
320 * must only free zeroed nodes into the slab. radix_tree_shrink 355 * Must only free zeroed nodes into the slab. We can be left with
321 * can leave us with a non-NULL entry in the first slot, so clear 356 * non-NULL entries by radix_tree_free_nodes, so clear the entries
322 * that here to make sure. 357 * and tags here.
323 */ 358 */
324 for (i = 0; i < RADIX_TREE_MAX_TAGS; i++) 359 memset(node->slots, 0, sizeof(node->slots));
325 tag_clear(node, i, 0); 360 memset(node->tags, 0, sizeof(node->tags));
326 361 INIT_LIST_HEAD(&node->private_list);
327 node->slots[0] = NULL;
328 362
329 kmem_cache_free(radix_tree_node_cachep, node); 363 kmem_cache_free(radix_tree_node_cachep, node);
330} 364}
@@ -344,7 +378,7 @@ radix_tree_node_free(struct radix_tree_node *node)
344 * To make use of this facility, the radix tree must be initialised without 378 * To make use of this facility, the radix tree must be initialised without
345 * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). 379 * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
346 */ 380 */
347static int __radix_tree_preload(gfp_t gfp_mask, int nr) 381static int __radix_tree_preload(gfp_t gfp_mask, unsigned nr)
348{ 382{
349 struct radix_tree_preload *rtp; 383 struct radix_tree_preload *rtp;
350 struct radix_tree_node *node; 384 struct radix_tree_node *node;
@@ -410,6 +444,28 @@ int radix_tree_maybe_preload(gfp_t gfp_mask)
410} 444}
411EXPORT_SYMBOL(radix_tree_maybe_preload); 445EXPORT_SYMBOL(radix_tree_maybe_preload);
412 446
447#ifdef CONFIG_RADIX_TREE_MULTIORDER
448/*
449 * Preload with enough objects to ensure that we can split a single entry
 450 * of order @old_order into many entries of order @new_order
451 */
452int radix_tree_split_preload(unsigned int old_order, unsigned int new_order,
453 gfp_t gfp_mask)
454{
455 unsigned top = 1 << (old_order % RADIX_TREE_MAP_SHIFT);
456 unsigned layers = (old_order / RADIX_TREE_MAP_SHIFT) -
457 (new_order / RADIX_TREE_MAP_SHIFT);
458 unsigned nr = 0;
459
460 WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask));
461 BUG_ON(new_order >= old_order);
462
463 while (layers--)
464 nr = nr * RADIX_TREE_MAP_SIZE + 1;
465 return __radix_tree_preload(gfp_mask, top * nr);
466}
467#endif
468
413/* 469/*
414 * The same as function above, but preload number of nodes required to insert 470 * The same as function above, but preload number of nodes required to insert
415 * (1 << order) continuous naturally-aligned elements. 471 * (1 << order) continuous naturally-aligned elements.
@@ -455,19 +511,6 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
455 return __radix_tree_preload(gfp_mask, nr_nodes); 511 return __radix_tree_preload(gfp_mask, nr_nodes);
456} 512}
457 513
458/*
459 * The maximum index which can be stored in a radix tree
460 */
461static inline unsigned long shift_maxindex(unsigned int shift)
462{
463 return (RADIX_TREE_MAP_SIZE << shift) - 1;
464}
465
466static inline unsigned long node_maxindex(struct radix_tree_node *node)
467{
468 return shift_maxindex(node->shift);
469}
470
471static unsigned radix_tree_load_root(struct radix_tree_root *root, 514static unsigned radix_tree_load_root(struct radix_tree_root *root,
472 struct radix_tree_node **nodep, unsigned long *maxindex) 515 struct radix_tree_node **nodep, unsigned long *maxindex)
473{ 516{
@@ -505,8 +548,8 @@ static int radix_tree_extend(struct radix_tree_root *root,
505 goto out; 548 goto out;
506 549
507 do { 550 do {
508 struct radix_tree_node *node = radix_tree_node_alloc(root); 551 struct radix_tree_node *node = radix_tree_node_alloc(root,
509 552 NULL, shift, 0, 1, 0);
510 if (!node) 553 if (!node)
511 return -ENOMEM; 554 return -ENOMEM;
512 555
@@ -517,16 +560,11 @@ static int radix_tree_extend(struct radix_tree_root *root,
517 } 560 }
518 561
519 BUG_ON(shift > BITS_PER_LONG); 562 BUG_ON(shift > BITS_PER_LONG);
520 node->shift = shift;
521 node->offset = 0;
522 node->count = 1;
523 node->parent = NULL;
524 if (radix_tree_is_internal_node(slot)) { 563 if (radix_tree_is_internal_node(slot)) {
525 entry_to_node(slot)->parent = node; 564 entry_to_node(slot)->parent = node;
526 } else { 565 } else if (radix_tree_exceptional_entry(slot)) {
527 /* Moving an exceptional root->rnode to a node */ 566 /* Moving an exceptional root->rnode to a node */
528 if (radix_tree_exceptional_entry(slot)) 567 node->exceptional = 1;
529 node->exceptional = 1;
530 } 568 }
531 node->slots[0] = slot; 569 node->slots[0] = slot;
532 slot = node_to_entry(node); 570 slot = node_to_entry(node);
@@ -665,26 +703,24 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
665 shift = radix_tree_load_root(root, &child, &maxindex); 703 shift = radix_tree_load_root(root, &child, &maxindex);
666 704
667 /* Make sure the tree is high enough. */ 705 /* Make sure the tree is high enough. */
706 if (order > 0 && max == ((1UL << order) - 1))
707 max++;
668 if (max > maxindex) { 708 if (max > maxindex) {
669 int error = radix_tree_extend(root, max, shift); 709 int error = radix_tree_extend(root, max, shift);
670 if (error < 0) 710 if (error < 0)
671 return error; 711 return error;
672 shift = error; 712 shift = error;
673 child = root->rnode; 713 child = root->rnode;
674 if (order == shift)
675 shift += RADIX_TREE_MAP_SHIFT;
676 } 714 }
677 715
678 while (shift > order) { 716 while (shift > order) {
679 shift -= RADIX_TREE_MAP_SHIFT; 717 shift -= RADIX_TREE_MAP_SHIFT;
680 if (child == NULL) { 718 if (child == NULL) {
681 /* Have to add a child node. */ 719 /* Have to add a child node. */
682 child = radix_tree_node_alloc(root); 720 child = radix_tree_node_alloc(root, node, shift,
721 offset, 0, 0);
683 if (!child) 722 if (!child)
684 return -ENOMEM; 723 return -ENOMEM;
685 child->shift = shift;
686 child->offset = offset;
687 child->parent = node;
688 rcu_assign_pointer(*slot, node_to_entry(child)); 724 rcu_assign_pointer(*slot, node_to_entry(child));
689 if (node) 725 if (node)
690 node->count++; 726 node->count++;
@@ -697,31 +733,125 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
697 slot = &node->slots[offset]; 733 slot = &node->slots[offset];
698 } 734 }
699 735
736 if (nodep)
737 *nodep = node;
738 if (slotp)
739 *slotp = slot;
740 return 0;
741}
742
700#ifdef CONFIG_RADIX_TREE_MULTIORDER 743#ifdef CONFIG_RADIX_TREE_MULTIORDER
701 /* Insert pointers to the canonical entry */ 744/*
702 if (order > shift) { 745 * Free any nodes below this node. The tree is presumed to not need
703 unsigned i, n = 1 << (order - shift); 746 * shrinking, and any user data in the tree is presumed to not need a
747 * destructor called on it. If we need to add a destructor, we can
748 * add that functionality later. Note that we may not clear tags or
749 * slots from the tree as an RCU walker may still have a pointer into
750 * this subtree. We could replace the entries with RADIX_TREE_RETRY,
751 * but we'll still have to clear those in rcu_free.
752 */
753static void radix_tree_free_nodes(struct radix_tree_node *node)
754{
755 unsigned offset = 0;
756 struct radix_tree_node *child = entry_to_node(node);
757
758 for (;;) {
759 void *entry = child->slots[offset];
760 if (radix_tree_is_internal_node(entry) &&
761 !is_sibling_entry(child, entry)) {
762 child = entry_to_node(entry);
763 offset = 0;
764 continue;
765 }
766 offset++;
767 while (offset == RADIX_TREE_MAP_SIZE) {
768 struct radix_tree_node *old = child;
769 offset = child->offset + 1;
770 child = child->parent;
771 radix_tree_node_free(old);
772 if (old == entry_to_node(node))
773 return;
774 }
775 }
776}
777
778static inline int insert_entries(struct radix_tree_node *node, void **slot,
779 void *item, unsigned order, bool replace)
780{
781 struct radix_tree_node *child;
782 unsigned i, n, tag, offset, tags = 0;
783
784 if (node) {
785 if (order > node->shift)
786 n = 1 << (order - node->shift);
787 else
788 n = 1;
789 offset = get_slot_offset(node, slot);
790 } else {
791 n = 1;
792 offset = 0;
793 }
794
795 if (n > 1) {
704 offset = offset & ~(n - 1); 796 offset = offset & ~(n - 1);
705 slot = &node->slots[offset]; 797 slot = &node->slots[offset];
706 child = node_to_entry(slot); 798 }
707 for (i = 0; i < n; i++) { 799 child = node_to_entry(slot);
708 if (slot[i]) 800
801 for (i = 0; i < n; i++) {
802 if (slot[i]) {
803 if (replace) {
804 node->count--;
805 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
806 if (tag_get(node, tag, offset + i))
807 tags |= 1 << tag;
808 } else
709 return -EEXIST; 809 return -EEXIST;
710 } 810 }
811 }
711 812
712 for (i = 1; i < n; i++) { 813 for (i = 0; i < n; i++) {
814 struct radix_tree_node *old = slot[i];
815 if (i) {
713 rcu_assign_pointer(slot[i], child); 816 rcu_assign_pointer(slot[i], child);
714 node->count++; 817 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
818 if (tags & (1 << tag))
819 tag_clear(node, tag, offset + i);
820 } else {
821 rcu_assign_pointer(slot[i], item);
822 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
823 if (tags & (1 << tag))
824 tag_set(node, tag, offset);
715 } 825 }
826 if (radix_tree_is_internal_node(old) &&
827 !is_sibling_entry(node, old) &&
828 (old != RADIX_TREE_RETRY))
829 radix_tree_free_nodes(old);
830 if (radix_tree_exceptional_entry(old))
831 node->exceptional--;
716 } 832 }
717#endif 833 if (node) {
718 834 node->count += n;
719 if (nodep) 835 if (radix_tree_exceptional_entry(item))
720 *nodep = node; 836 node->exceptional += n;
721 if (slotp) 837 }
722 *slotp = slot; 838 return n;
723 return 0; 839}
840#else
841static inline int insert_entries(struct radix_tree_node *node, void **slot,
842 void *item, unsigned order, bool replace)
843{
844 if (*slot)
845 return -EEXIST;
846 rcu_assign_pointer(*slot, item);
847 if (node) {
848 node->count++;
849 if (radix_tree_exceptional_entry(item))
850 node->exceptional++;
851 }
852 return 1;
724} 853}
854#endif
725 855
726/** 856/**
727 * __radix_tree_insert - insert into a radix tree 857 * __radix_tree_insert - insert into a radix tree
@@ -744,15 +874,13 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index,
744 error = __radix_tree_create(root, index, order, &node, &slot); 874 error = __radix_tree_create(root, index, order, &node, &slot);
745 if (error) 875 if (error)
746 return error; 876 return error;
747 if (*slot != NULL) 877
748 return -EEXIST; 878 error = insert_entries(node, slot, item, order, false);
749 rcu_assign_pointer(*slot, item); 879 if (error < 0)
880 return error;
750 881
751 if (node) { 882 if (node) {
752 unsigned offset = get_slot_offset(node, slot); 883 unsigned offset = get_slot_offset(node, slot);
753 node->count++;
754 if (radix_tree_exceptional_entry(item))
755 node->exceptional++;
756 BUG_ON(tag_get(node, 0, offset)); 884 BUG_ON(tag_get(node, 0, offset));
757 BUG_ON(tag_get(node, 1, offset)); 885 BUG_ON(tag_get(node, 1, offset));
758 BUG_ON(tag_get(node, 2, offset)); 886 BUG_ON(tag_get(node, 2, offset));
@@ -850,6 +978,24 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
850} 978}
851EXPORT_SYMBOL(radix_tree_lookup); 979EXPORT_SYMBOL(radix_tree_lookup);
852 980
981static inline int slot_count(struct radix_tree_node *node,
982 void **slot)
983{
984 int n = 1;
985#ifdef CONFIG_RADIX_TREE_MULTIORDER
986 void *ptr = node_to_entry(slot);
987 unsigned offset = get_slot_offset(node, slot);
988 int i;
989
990 for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) {
991 if (node->slots[offset + i] != ptr)
992 break;
993 n++;
994 }
995#endif
996 return n;
997}
998
853static void replace_slot(struct radix_tree_root *root, 999static void replace_slot(struct radix_tree_root *root,
854 struct radix_tree_node *node, 1000 struct radix_tree_node *node,
855 void **slot, void *item, 1001 void **slot, void *item,
@@ -868,12 +1014,35 @@ static void replace_slot(struct radix_tree_root *root,
868 1014
869 if (node) { 1015 if (node) {
870 node->count += count; 1016 node->count += count;
871 node->exceptional += exceptional; 1017 if (exceptional) {
1018 exceptional *= slot_count(node, slot);
1019 node->exceptional += exceptional;
1020 }
872 } 1021 }
873 1022
874 rcu_assign_pointer(*slot, item); 1023 rcu_assign_pointer(*slot, item);
875} 1024}
876 1025
1026static inline void delete_sibling_entries(struct radix_tree_node *node,
1027 void **slot)
1028{
1029#ifdef CONFIG_RADIX_TREE_MULTIORDER
1030 bool exceptional = radix_tree_exceptional_entry(*slot);
1031 void *ptr = node_to_entry(slot);
1032 unsigned offset = get_slot_offset(node, slot);
1033 int i;
1034
1035 for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) {
1036 if (node->slots[offset + i] != ptr)
1037 break;
1038 node->slots[offset + i] = NULL;
1039 node->count--;
1040 if (exceptional)
1041 node->exceptional--;
1042 }
1043#endif
1044}
1045
877/** 1046/**
878 * __radix_tree_replace - replace item in a slot 1047 * __radix_tree_replace - replace item in a slot
879 * @root: radix tree root 1048 * @root: radix tree root
@@ -891,6 +1060,8 @@ void __radix_tree_replace(struct radix_tree_root *root,
891 void **slot, void *item, 1060 void **slot, void *item,
892 radix_tree_update_node_t update_node, void *private) 1061 radix_tree_update_node_t update_node, void *private)
893{ 1062{
1063 if (!item)
1064 delete_sibling_entries(node, slot);
894 /* 1065 /*
895 * This function supports replacing exceptional entries and 1066 * This function supports replacing exceptional entries and
896 * deleting entries, but that needs accounting against the 1067 * deleting entries, but that needs accounting against the
@@ -921,7 +1092,8 @@ void __radix_tree_replace(struct radix_tree_root *root,
921 * NOTE: This cannot be used to switch between non-entries (empty slots), 1092 * NOTE: This cannot be used to switch between non-entries (empty slots),
922 * regular entries, and exceptional entries, as that requires accounting 1093 * regular entries, and exceptional entries, as that requires accounting
923 * inside the radix tree node. When switching from one type of entry or 1094 * inside the radix tree node. When switching from one type of entry or
924 * deleting, use __radix_tree_lookup() and __radix_tree_replace(). 1095 * deleting, use __radix_tree_lookup() and __radix_tree_replace() or
1096 * radix_tree_iter_replace().
925 */ 1097 */
926void radix_tree_replace_slot(struct radix_tree_root *root, 1098void radix_tree_replace_slot(struct radix_tree_root *root,
927 void **slot, void *item) 1099 void **slot, void *item)
@@ -930,6 +1102,164 @@ void radix_tree_replace_slot(struct radix_tree_root *root,
930} 1102}
931 1103
932/** 1104/**
1105 * radix_tree_iter_replace - replace item in a slot
1106 * @root: radix tree root
1107 * @slot: pointer to slot
1108 * @item: new item to store in the slot.
1109 *
1110 * For use with radix_tree_split() and radix_tree_for_each_slot().
1111 * Caller must hold tree write locked across split and replacement.
1112 */
1113void radix_tree_iter_replace(struct radix_tree_root *root,
1114 const struct radix_tree_iter *iter, void **slot, void *item)
1115{
1116 __radix_tree_replace(root, iter->node, slot, item, NULL, NULL);
1117}
1118
1119#ifdef CONFIG_RADIX_TREE_MULTIORDER
1120/**
1121 * radix_tree_join - replace multiple entries with one multiorder entry
1122 * @root: radix tree root
1123 * @index: an index inside the new entry
1124 * @order: order of the new entry
1125 * @item: new entry
1126 *
1127 * Call this function to replace several entries with one larger entry.
1128 * The existing entries are presumed to not need freeing as a result of
1129 * this call.
1130 *
1131 * The replacement entry will have all the tags set on it that were set
1132 * on any of the entries it is replacing.
1133 */
1134int radix_tree_join(struct radix_tree_root *root, unsigned long index,
1135 unsigned order, void *item)
1136{
1137 struct radix_tree_node *node;
1138 void **slot;
1139 int error;
1140
1141 BUG_ON(radix_tree_is_internal_node(item));
1142
1143 error = __radix_tree_create(root, index, order, &node, &slot);
1144 if (!error)
1145 error = insert_entries(node, slot, item, order, true);
1146 if (error > 0)
1147 error = 0;
1148
1149 return error;
1150}
1151
1152/**
1153 * radix_tree_split - Split an entry into smaller entries
1154 * @root: radix tree root
1155 * @index: An index within the large entry
1156 * @order: Order of new entries
1157 *
1158 * Call this function as the first step in replacing a multiorder entry
1159 * with several entries of lower order. After this function returns,
1160 * loop over the relevant portion of the tree using radix_tree_for_each_slot()
1161 * and call radix_tree_iter_replace() to set up each new entry.
1162 *
1163 * The tags from this entry are replicated to all the new entries.
1164 *
1165 * The radix tree should be locked against modification during the entire
1166 * replacement operation. Lock-free lookups will see RADIX_TREE_RETRY which
1167 * should prompt RCU walkers to restart the lookup from the root.
1168 */
1169int radix_tree_split(struct radix_tree_root *root, unsigned long index,
1170 unsigned order)
1171{
1172 struct radix_tree_node *parent, *node, *child;
1173 void **slot;
1174 unsigned int offset, end;
1175 unsigned n, tag, tags = 0;
1176
1177 if (!__radix_tree_lookup(root, index, &parent, &slot))
1178 return -ENOENT;
1179 if (!parent)
1180 return -ENOENT;
1181
1182 offset = get_slot_offset(parent, slot);
1183
1184 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
1185 if (tag_get(parent, tag, offset))
1186 tags |= 1 << tag;
1187
1188 for (end = offset + 1; end < RADIX_TREE_MAP_SIZE; end++) {
1189 if (!is_sibling_entry(parent, parent->slots[end]))
1190 break;
1191 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
1192 if (tags & (1 << tag))
1193 tag_set(parent, tag, end);
1194 /* rcu_assign_pointer ensures tags are set before RETRY */
1195 rcu_assign_pointer(parent->slots[end], RADIX_TREE_RETRY);
1196 }
1197 rcu_assign_pointer(parent->slots[offset], RADIX_TREE_RETRY);
1198 parent->exceptional -= (end - offset);
1199
1200 if (order == parent->shift)
1201 return 0;
1202 if (order > parent->shift) {
1203 while (offset < end)
1204 offset += insert_entries(parent, &parent->slots[offset],
1205 RADIX_TREE_RETRY, order, true);
1206 return 0;
1207 }
1208
1209 node = parent;
1210
1211 for (;;) {
1212 if (node->shift > order) {
1213 child = radix_tree_node_alloc(root, node,
1214 node->shift - RADIX_TREE_MAP_SHIFT,
1215 offset, 0, 0);
1216 if (!child)
1217 goto nomem;
1218 if (node != parent) {
1219 node->count++;
1220 node->slots[offset] = node_to_entry(child);
1221 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
1222 if (tags & (1 << tag))
1223 tag_set(node, tag, offset);
1224 }
1225
1226 node = child;
1227 offset = 0;
1228 continue;
1229 }
1230
1231 n = insert_entries(node, &node->slots[offset],
1232 RADIX_TREE_RETRY, order, false);
1233 BUG_ON(n > RADIX_TREE_MAP_SIZE);
1234
1235 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
1236 if (tags & (1 << tag))
1237 tag_set(node, tag, offset);
1238 offset += n;
1239
1240 while (offset == RADIX_TREE_MAP_SIZE) {
1241 if (node == parent)
1242 break;
1243 offset = node->offset;
1244 child = node;
1245 node = node->parent;
1246 rcu_assign_pointer(node->slots[offset],
1247 node_to_entry(child));
1248 offset++;
1249 }
1250 if ((node == parent) && (offset == end))
1251 return 0;
1252 }
1253
1254 nomem:
1255 /* Shouldn't happen; did user forget to preload? */
1256 /* TODO: free all the allocated nodes */
1257 WARN_ON(1);
1258 return -ENOMEM;
1259}
1260#endif
1261
1262/**
933 * radix_tree_tag_set - set a tag on a radix tree node 1263 * radix_tree_tag_set - set a tag on a radix tree node
934 * @root: radix tree root 1264 * @root: radix tree root
935 * @index: index key 1265 * @index: index key
@@ -990,6 +1320,34 @@ static void node_tag_clear(struct radix_tree_root *root,
990 root_tag_clear(root, tag); 1320 root_tag_clear(root, tag);
991} 1321}
992 1322
1323static void node_tag_set(struct radix_tree_root *root,
1324 struct radix_tree_node *node,
1325 unsigned int tag, unsigned int offset)
1326{
1327 while (node) {
1328 if (tag_get(node, tag, offset))
1329 return;
1330 tag_set(node, tag, offset);
1331 offset = node->offset;
1332 node = node->parent;
1333 }
1334
1335 if (!root_tag_get(root, tag))
1336 root_tag_set(root, tag);
1337}
1338
1339/**
1340 * radix_tree_iter_tag_set - set a tag on the current iterator entry
1341 * @root: radix tree root
1342 * @iter: iterator state
1343 * @tag: tag to set
1344 */
1345void radix_tree_iter_tag_set(struct radix_tree_root *root,
1346 const struct radix_tree_iter *iter, unsigned int tag)
1347{
1348 node_tag_set(root, iter->node, tag, iter_offset(iter));
1349}
1350
993/** 1351/**
994 * radix_tree_tag_clear - clear a tag on a radix tree node 1352 * radix_tree_tag_clear - clear a tag on a radix tree node
995 * @root: radix tree root 1353 * @root: radix tree root
@@ -1085,6 +1443,121 @@ static inline void __set_iter_shift(struct radix_tree_iter *iter,
1085#endif 1443#endif
1086} 1444}
1087 1445
1446/* Construct iter->tags bit-mask from node->tags[tag] array */
1447static void set_iter_tags(struct radix_tree_iter *iter,
1448 struct radix_tree_node *node, unsigned offset,
1449 unsigned tag)
1450{
1451 unsigned tag_long = offset / BITS_PER_LONG;
1452 unsigned tag_bit = offset % BITS_PER_LONG;
1453
1454 iter->tags = node->tags[tag][tag_long] >> tag_bit;
1455
1456 /* This never happens if RADIX_TREE_TAG_LONGS == 1 */
1457 if (tag_long < RADIX_TREE_TAG_LONGS - 1) {
1458 /* Pick tags from next element */
1459 if (tag_bit)
1460 iter->tags |= node->tags[tag][tag_long + 1] <<
1461 (BITS_PER_LONG - tag_bit);
1462 /* Clip chunk size, here only BITS_PER_LONG tags */
1463 iter->next_index = __radix_tree_iter_add(iter, BITS_PER_LONG);
1464 }
1465}
1466
1467#ifdef CONFIG_RADIX_TREE_MULTIORDER
1468static void **skip_siblings(struct radix_tree_node **nodep,
1469 void **slot, struct radix_tree_iter *iter)
1470{
1471 void *sib = node_to_entry(slot - 1);
1472
1473 while (iter->index < iter->next_index) {
1474 *nodep = rcu_dereference_raw(*slot);
1475 if (*nodep && *nodep != sib)
1476 return slot;
1477 slot++;
1478 iter->index = __radix_tree_iter_add(iter, 1);
1479 iter->tags >>= 1;
1480 }
1481
1482 *nodep = NULL;
1483 return NULL;
1484}
1485
1486void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter,
1487 unsigned flags)
1488{
1489 unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK;
1490 struct radix_tree_node *node = rcu_dereference_raw(*slot);
1491
1492 slot = skip_siblings(&node, slot, iter);
1493
1494 while (radix_tree_is_internal_node(node)) {
1495 unsigned offset;
1496 unsigned long next_index;
1497
1498 if (node == RADIX_TREE_RETRY)
1499 return slot;
1500 node = entry_to_node(node);
1501 iter->node = node;
1502 iter->shift = node->shift;
1503
1504 if (flags & RADIX_TREE_ITER_TAGGED) {
1505 offset = radix_tree_find_next_bit(node, tag, 0);
1506 if (offset == RADIX_TREE_MAP_SIZE)
1507 return NULL;
1508 slot = &node->slots[offset];
1509 iter->index = __radix_tree_iter_add(iter, offset);
1510 set_iter_tags(iter, node, offset, tag);
1511 node = rcu_dereference_raw(*slot);
1512 } else {
1513 offset = 0;
1514 slot = &node->slots[0];
1515 for (;;) {
1516 node = rcu_dereference_raw(*slot);
1517 if (node)
1518 break;
1519 slot++;
1520 offset++;
1521 if (offset == RADIX_TREE_MAP_SIZE)
1522 return NULL;
1523 }
1524 iter->index = __radix_tree_iter_add(iter, offset);
1525 }
1526 if ((flags & RADIX_TREE_ITER_CONTIG) && (offset > 0))
1527 goto none;
1528 next_index = (iter->index | shift_maxindex(iter->shift)) + 1;
1529 if (next_index < iter->next_index)
1530 iter->next_index = next_index;
1531 }
1532
1533 return slot;
1534 none:
1535 iter->next_index = 0;
1536 return NULL;
1537}
1538EXPORT_SYMBOL(__radix_tree_next_slot);
1539#else
1540static void **skip_siblings(struct radix_tree_node **nodep,
1541 void **slot, struct radix_tree_iter *iter)
1542{
1543 return slot;
1544}
1545#endif
1546
1547void **radix_tree_iter_resume(void **slot, struct radix_tree_iter *iter)
1548{
1549 struct radix_tree_node *node;
1550
1551 slot++;
1552 iter->index = __radix_tree_iter_add(iter, 1);
1553 node = rcu_dereference_raw(*slot);
1554 skip_siblings(&node, slot, iter);
1555 iter->next_index = iter->index;
1556 iter->tags = 0;
1557 return NULL;
1558}
1559EXPORT_SYMBOL(radix_tree_iter_resume);
1560
1088/** 1561/**
1089 * radix_tree_next_chunk - find next chunk of slots for iteration 1562 * radix_tree_next_chunk - find next chunk of slots for iteration
1090 * 1563 *
@@ -1110,7 +1583,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
1110 * because RADIX_TREE_MAP_SHIFT < BITS_PER_LONG. 1583 * because RADIX_TREE_MAP_SHIFT < BITS_PER_LONG.
1111 * 1584 *
 1112 * This condition is also used by radix_tree_next_slot() to stop 1585 * This condition is also used by radix_tree_next_slot() to stop
1113 * contiguous iterating, and forbid swithing to the next chunk. 1586 * contiguous iterating, and forbid switching to the next chunk.
1114 */ 1587 */
1115 index = iter->next_index; 1588 index = iter->next_index;
1116 if (!index && iter->index) 1589 if (!index && iter->index)
@@ -1128,6 +1601,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
1128 iter->index = index; 1601 iter->index = index;
1129 iter->next_index = maxindex + 1; 1602 iter->next_index = maxindex + 1;
1130 iter->tags = 1; 1603 iter->tags = 1;
1604 iter->node = NULL;
1131 __set_iter_shift(iter, 0); 1605 __set_iter_shift(iter, 0);
1132 return (void **)&root->rnode; 1606 return (void **)&root->rnode;
1133 } 1607 }
@@ -1143,9 +1617,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
1143 return NULL; 1617 return NULL;
1144 1618
1145 if (flags & RADIX_TREE_ITER_TAGGED) 1619 if (flags & RADIX_TREE_ITER_TAGGED)
1146 offset = radix_tree_find_next_bit( 1620 offset = radix_tree_find_next_bit(node, tag,
1147 node->tags[tag],
1148 RADIX_TREE_MAP_SIZE,
1149 offset + 1); 1621 offset + 1);
1150 else 1622 else
1151 while (++offset < RADIX_TREE_MAP_SIZE) { 1623 while (++offset < RADIX_TREE_MAP_SIZE) {
@@ -1165,154 +1637,26 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
1165 child = rcu_dereference_raw(node->slots[offset]); 1637 child = rcu_dereference_raw(node->slots[offset]);
1166 } 1638 }
1167 1639
1168 if ((child == NULL) || (child == RADIX_TREE_RETRY)) 1640 if (!child)
1169 goto restart; 1641 goto restart;
1642 if (child == RADIX_TREE_RETRY)
1643 break;
1170 } while (radix_tree_is_internal_node(child)); 1644 } while (radix_tree_is_internal_node(child));
1171 1645
1172 /* Update the iterator state */ 1646 /* Update the iterator state */
1173 iter->index = (index &~ node_maxindex(node)) | (offset << node->shift); 1647 iter->index = (index &~ node_maxindex(node)) | (offset << node->shift);
1174 iter->next_index = (index | node_maxindex(node)) + 1; 1648 iter->next_index = (index | node_maxindex(node)) + 1;
1649 iter->node = node;
1175 __set_iter_shift(iter, node->shift); 1650 __set_iter_shift(iter, node->shift);
1176 1651
1177 /* Construct iter->tags bit-mask from node->tags[tag] array */ 1652 if (flags & RADIX_TREE_ITER_TAGGED)
1178 if (flags & RADIX_TREE_ITER_TAGGED) { 1653 set_iter_tags(iter, node, offset, tag);
1179 unsigned tag_long, tag_bit;
1180
1181 tag_long = offset / BITS_PER_LONG;
1182 tag_bit = offset % BITS_PER_LONG;
1183 iter->tags = node->tags[tag][tag_long] >> tag_bit;
1184 /* This never happens if RADIX_TREE_TAG_LONGS == 1 */
1185 if (tag_long < RADIX_TREE_TAG_LONGS - 1) {
1186 /* Pick tags from next element */
1187 if (tag_bit)
1188 iter->tags |= node->tags[tag][tag_long + 1] <<
1189 (BITS_PER_LONG - tag_bit);
1190 /* Clip chunk size, here only BITS_PER_LONG tags */
1191 iter->next_index = index + BITS_PER_LONG;
1192 }
1193 }
1194 1654
1195 return node->slots + offset; 1655 return node->slots + offset;
1196} 1656}
1197EXPORT_SYMBOL(radix_tree_next_chunk); 1657EXPORT_SYMBOL(radix_tree_next_chunk);
1198 1658
1199/** 1659/**
1200 * radix_tree_range_tag_if_tagged - for each item in given range set given
1201 * tag if item has another tag set
1202 * @root: radix tree root
1203 * @first_indexp: pointer to a starting index of a range to scan
1204 * @last_index: last index of a range to scan
1205 * @nr_to_tag: maximum number items to tag
1206 * @iftag: tag index to test
1207 * @settag: tag index to set if tested tag is set
1208 *
1209 * This function scans range of radix tree from first_index to last_index
1210 * (inclusive). For each item in the range if iftag is set, the function sets
1211 * also settag. The function stops either after tagging nr_to_tag items or
1212 * after reaching last_index.
1213 *
1214 * The tags must be set from the leaf level only and propagated back up the
1215 * path to the root. We must do this so that we resolve the full path before
1216 * setting any tags on intermediate nodes. If we set tags as we descend, then
1217 * we can get to the leaf node and find that the index that has the iftag
1218 * set is outside the range we are scanning. This reults in dangling tags and
1219 * can lead to problems with later tag operations (e.g. livelocks on lookups).
1220 *
1221 * The function returns the number of leaves where the tag was set and sets
1222 * *first_indexp to the first unscanned index.
1223 * WARNING! *first_indexp can wrap if last_index is ULONG_MAX. Caller must
1224 * be prepared to handle that.
1225 */
1226unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
1227 unsigned long *first_indexp, unsigned long last_index,
1228 unsigned long nr_to_tag,
1229 unsigned int iftag, unsigned int settag)
1230{
1231 struct radix_tree_node *parent, *node, *child;
1232 unsigned long maxindex;
1233 unsigned long tagged = 0;
1234 unsigned long index = *first_indexp;
1235
1236 radix_tree_load_root(root, &child, &maxindex);
1237 last_index = min(last_index, maxindex);
1238 if (index > last_index)
1239 return 0;
1240 if (!nr_to_tag)
1241 return 0;
1242 if (!root_tag_get(root, iftag)) {
1243 *first_indexp = last_index + 1;
1244 return 0;
1245 }
1246 if (!radix_tree_is_internal_node(child)) {
1247 *first_indexp = last_index + 1;
1248 root_tag_set(root, settag);
1249 return 1;
1250 }
1251
1252 node = entry_to_node(child);
1253
1254 for (;;) {
1255 unsigned offset = radix_tree_descend(node, &child, index);
1256 if (!child)
1257 goto next;
1258 if (!tag_get(node, iftag, offset))
1259 goto next;
1260 /* Sibling slots never have tags set on them */
1261 if (radix_tree_is_internal_node(child)) {
1262 node = entry_to_node(child);
1263 continue;
1264 }
1265
1266 /* tag the leaf */
1267 tagged++;
1268 tag_set(node, settag, offset);
1269
1270 /* walk back up the path tagging interior nodes */
1271 parent = node;
1272 for (;;) {
1273 offset = parent->offset;
1274 parent = parent->parent;
1275 if (!parent)
1276 break;
1277 /* stop if we find a node with the tag already set */
1278 if (tag_get(parent, settag, offset))
1279 break;
1280 tag_set(parent, settag, offset);
1281 }
1282 next:
1283 /* Go to next entry in node */
1284 index = ((index >> node->shift) + 1) << node->shift;
1285 /* Overflow can happen when last_index is ~0UL... */
1286 if (index > last_index || !index)
1287 break;
1288 offset = (index >> node->shift) & RADIX_TREE_MAP_MASK;
1289 while (offset == 0) {
1290 /*
1291 * We've fully scanned this node. Go up. Because
1292 * last_index is guaranteed to be in the tree, what
1293 * we do below cannot wander astray.
1294 */
1295 node = node->parent;
1296 offset = (index >> node->shift) & RADIX_TREE_MAP_MASK;
1297 }
1298 if (is_sibling_entry(node, node->slots[offset]))
1299 goto next;
1300 if (tagged >= nr_to_tag)
1301 break;
1302 }
1303 /*
1304 * We need not to tag the root tag if there is no tag which is set with
1305 * settag within the range from *first_indexp to last_index.
1306 */
1307 if (tagged > 0)
1308 root_tag_set(root, settag);
1309 *first_indexp = index;
1310
1311 return tagged;
1312}
1313EXPORT_SYMBOL(radix_tree_range_tag_if_tagged);
1314
1315/**
1316 * radix_tree_gang_lookup - perform multiple lookup on a radix tree 1660 * radix_tree_gang_lookup - perform multiple lookup on a radix tree
1317 * @root: radix tree root 1661 * @root: radix tree root
1318 * @results: where the results of the lookup are placed 1662 * @results: where the results of the lookup are placed
@@ -1477,105 +1821,6 @@ radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results,
1477} 1821}
1478EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); 1822EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot);
1479 1823
1480#if defined(CONFIG_SHMEM) && defined(CONFIG_SWAP)
1481#include <linux/sched.h> /* for cond_resched() */
1482
1483struct locate_info {
1484 unsigned long found_index;
1485 bool stop;
1486};
1487
1488/*
1489 * This linear search is at present only useful to shmem_unuse_inode().
1490 */
1491static unsigned long __locate(struct radix_tree_node *slot, void *item,
1492 unsigned long index, struct locate_info *info)
1493{
1494 unsigned long i;
1495
1496 do {
1497 unsigned int shift = slot->shift;
1498
1499 for (i = (index >> shift) & RADIX_TREE_MAP_MASK;
1500 i < RADIX_TREE_MAP_SIZE;
1501 i++, index += (1UL << shift)) {
1502 struct radix_tree_node *node =
1503 rcu_dereference_raw(slot->slots[i]);
1504 if (node == RADIX_TREE_RETRY)
1505 goto out;
1506 if (!radix_tree_is_internal_node(node)) {
1507 if (node == item) {
1508 info->found_index = index;
1509 info->stop = true;
1510 goto out;
1511 }
1512 continue;
1513 }
1514 node = entry_to_node(node);
1515 if (is_sibling_entry(slot, node))
1516 continue;
1517 slot = node;
1518 break;
1519 }
1520 } while (i < RADIX_TREE_MAP_SIZE);
1521
1522out:
1523 if ((index == 0) && (i == RADIX_TREE_MAP_SIZE))
1524 info->stop = true;
1525 return index;
1526}
1527
1528/**
1529 * radix_tree_locate_item - search through radix tree for item
1530 * @root: radix tree root
1531 * @item: item to be found
1532 *
1533 * Returns index where item was found, or -1 if not found.
1534 * Caller must hold no lock (since this time-consuming function needs
1535 * to be preemptible), and must check afterwards if item is still there.
1536 */
1537unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
1538{
1539 struct radix_tree_node *node;
1540 unsigned long max_index;
1541 unsigned long cur_index = 0;
1542 struct locate_info info = {
1543 .found_index = -1,
1544 .stop = false,
1545 };
1546
1547 do {
1548 rcu_read_lock();
1549 node = rcu_dereference_raw(root->rnode);
1550 if (!radix_tree_is_internal_node(node)) {
1551 rcu_read_unlock();
1552 if (node == item)
1553 info.found_index = 0;
1554 break;
1555 }
1556
1557 node = entry_to_node(node);
1558
1559 max_index = node_maxindex(node);
1560 if (cur_index > max_index) {
1561 rcu_read_unlock();
1562 break;
1563 }
1564
1565 cur_index = __locate(node, item, cur_index, &info);
1566 rcu_read_unlock();
1567 cond_resched();
1568 } while (!info.stop && cur_index <= max_index);
1569
1570 return info.found_index;
1571}
1572#else
1573unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
1574{
1575 return -1;
1576}
1577#endif /* CONFIG_SHMEM && CONFIG_SWAP */
1578
1579/** 1824/**
1580 * __radix_tree_delete_node - try to free node after clearing a slot 1825 * __radix_tree_delete_node - try to free node after clearing a slot
1581 * @root: radix tree root 1826 * @root: radix tree root
@@ -1591,20 +1836,6 @@ void __radix_tree_delete_node(struct radix_tree_root *root,
1591 delete_node(root, node, NULL, NULL); 1836 delete_node(root, node, NULL, NULL);
1592} 1837}
1593 1838
1594static inline void delete_sibling_entries(struct radix_tree_node *node,
1595 void *ptr, unsigned offset)
1596{
1597#ifdef CONFIG_RADIX_TREE_MULTIORDER
1598 int i;
1599 for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) {
1600 if (node->slots[offset + i] != ptr)
1601 break;
1602 node->slots[offset + i] = NULL;
1603 node->count--;
1604 }
1605#endif
1606}
1607
1608/** 1839/**
1609 * radix_tree_delete_item - delete an item from a radix tree 1840 * radix_tree_delete_item - delete an item from a radix tree
1610 * @root: radix tree root 1841 * @root: radix tree root
@@ -1644,7 +1875,6 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
1644 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) 1875 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
1645 node_tag_clear(root, node, tag, offset); 1876 node_tag_clear(root, node, tag, offset);
1646 1877
1647 delete_sibling_entries(node, node_to_entry(slot), offset);
1648 __radix_tree_replace(root, node, slot, NULL, NULL, NULL); 1878 __radix_tree_replace(root, node, slot, NULL, NULL, NULL);
1649 1879
1650 return entry; 1880 return entry;
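[Editorial note] For orientation, the new multiorder API above composes as the radix_tree_split() kernel-doc prescribes: preload, split under the tree lock, then walk the freshly created RADIX_TREE_RETRY slots with radix_tree_for_each_slot() and fill each one via radix_tree_iter_replace(). The sketch below is illustrative only; the lock name and the small_pages array are assumptions, and error handling is trimmed. By my reading of the new radix_tree_split_preload(), an order-9 to order-0 split with RADIX_TREE_MAP_SHIFT == 6 preloads top * nr = (1 << 3) * 1 = 8 nodes.

static DEFINE_SPINLOCK(example_tree_lock);	/* assumed lock protecting the tree */

/* Illustration: replace one order-9 entry at 'index' with 512 order-0 entries. */
static int example_split(struct radix_tree_root *root, unsigned long index,
			 void **small_pages)
{
	struct radix_tree_iter iter;
	void **slot;
	unsigned long i = 0;
	int err;

	err = radix_tree_split_preload(9, 0, GFP_KERNEL);
	if (err)
		return err;

	spin_lock_irq(&example_tree_lock);
	err = radix_tree_split(root, index, 0);
	if (!err) {
		/* every slot in the old entry's range now holds RADIX_TREE_RETRY */
		radix_tree_for_each_slot(slot, root, &iter, index & ~511UL) {
			if (iter.index > (index | 511))
				break;
			radix_tree_iter_replace(root, &iter, slot,
						small_pages[i++]);
		}
	}
	spin_unlock_irq(&example_tree_lock);
	radix_tree_preload_end();
	return err;
}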
diff --git a/mm/compaction.c b/mm/compaction.c
index 223464227299..949198d01260 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -818,6 +818,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
818 page_count(page) > page_mapcount(page)) 818 page_count(page) > page_mapcount(page))
819 goto isolate_fail; 819 goto isolate_fail;
820 820
821 /*
 822 * Only allow migration of anonymous pages in GFP_NOFS context
823 * because those do not depend on fs locks.
824 */
825 if (!(cc->gfp_mask & __GFP_FS) && page_mapping(page))
826 goto isolate_fail;
827
821 /* If we already hold the lock, we can skip some rechecking */ 828 /* If we already hold the lock, we can skip some rechecking */
822 if (!locked) { 829 if (!locked) {
823 locked = compact_trylock_irqsave(zone_lru_lock(zone), 830 locked = compact_trylock_irqsave(zone_lru_lock(zone),
@@ -1677,14 +1684,16 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
1677 unsigned int alloc_flags, const struct alloc_context *ac, 1684 unsigned int alloc_flags, const struct alloc_context *ac,
1678 enum compact_priority prio) 1685 enum compact_priority prio)
1679{ 1686{
1680 int may_enter_fs = gfp_mask & __GFP_FS;
1681 int may_perform_io = gfp_mask & __GFP_IO; 1687 int may_perform_io = gfp_mask & __GFP_IO;
1682 struct zoneref *z; 1688 struct zoneref *z;
1683 struct zone *zone; 1689 struct zone *zone;
1684 enum compact_result rc = COMPACT_SKIPPED; 1690 enum compact_result rc = COMPACT_SKIPPED;
1685 1691
1686 /* Check if the GFP flags allow compaction */ 1692 /*
 1687 if (!may_enter_fs || !may_perform_io) 1693 * Check if the GFP flags allow compaction - GFP_NOIO is really
 1694 * tricky context because the migration might require IO
 1695 */
 1696 if (!may_perform_io)
1696 if (!may_perform_io)
1688 return COMPACT_SKIPPED; 1697 return COMPACT_SKIPPED;
1689 1698
1690 trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio); 1699 trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio);
@@ -1751,6 +1760,7 @@ static void compact_node(int nid)
1751 .mode = MIGRATE_SYNC, 1760 .mode = MIGRATE_SYNC,
1752 .ignore_skip_hint = true, 1761 .ignore_skip_hint = true,
1753 .whole_zone = true, 1762 .whole_zone = true,
1763 .gfp_mask = GFP_KERNEL,
1754 }; 1764 };
1755 1765
1756 1766
@@ -1876,6 +1886,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
1876 .classzone_idx = pgdat->kcompactd_classzone_idx, 1886 .classzone_idx = pgdat->kcompactd_classzone_idx,
1877 .mode = MIGRATE_SYNC_LIGHT, 1887 .mode = MIGRATE_SYNC_LIGHT,
1878 .ignore_skip_hint = true, 1888 .ignore_skip_hint = true,
1889 .gfp_mask = GFP_KERNEL,
1879 1890
1880 }; 1891 };
1881 trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, 1892 trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
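[Editorial note] Taken together, the two compaction hunks above relax the entry gate from "both __GFP_FS and __GFP_IO required" to "__GFP_IO required", and push the filesystem constraint down to per-page isolation. A compressed restatement of the resulting policy follows; the helper names are invented purely for illustration.

#include <linux/gfp.h>
#include <linux/mm.h>

/* GFP_NOIO callers still skip compaction entirely: migration may need IO. */
static bool example_may_compact(gfp_t gfp_mask)
{
	return !!(gfp_mask & __GFP_IO);
}

/* Under GFP_NOFS, only anonymous pages are isolated; they take no fs locks. */
static bool example_may_isolate(struct page *page, gfp_t gfp_mask)
{
	return (gfp_mask & __GFP_FS) || !page_mapping(page);
}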
diff --git a/mm/filemap.c b/mm/filemap.c
index b06517b7f97f..32be3c8f3a11 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2164,12 +2164,12 @@ page_not_uptodate:
2164} 2164}
2165EXPORT_SYMBOL(filemap_fault); 2165EXPORT_SYMBOL(filemap_fault);
2166 2166
2167void filemap_map_pages(struct fault_env *fe, 2167void filemap_map_pages(struct vm_fault *vmf,
2168 pgoff_t start_pgoff, pgoff_t end_pgoff) 2168 pgoff_t start_pgoff, pgoff_t end_pgoff)
2169{ 2169{
2170 struct radix_tree_iter iter; 2170 struct radix_tree_iter iter;
2171 void **slot; 2171 void **slot;
2172 struct file *file = fe->vma->vm_file; 2172 struct file *file = vmf->vma->vm_file;
2173 struct address_space *mapping = file->f_mapping; 2173 struct address_space *mapping = file->f_mapping;
2174 pgoff_t last_pgoff = start_pgoff; 2174 pgoff_t last_pgoff = start_pgoff;
2175 loff_t size; 2175 loff_t size;
@@ -2225,11 +2225,11 @@ repeat:
2225 if (file->f_ra.mmap_miss > 0) 2225 if (file->f_ra.mmap_miss > 0)
2226 file->f_ra.mmap_miss--; 2226 file->f_ra.mmap_miss--;
2227 2227
2228 fe->address += (iter.index - last_pgoff) << PAGE_SHIFT; 2228 vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT;
2229 if (fe->pte) 2229 if (vmf->pte)
2230 fe->pte += iter.index - last_pgoff; 2230 vmf->pte += iter.index - last_pgoff;
2231 last_pgoff = iter.index; 2231 last_pgoff = iter.index;
2232 if (alloc_set_pte(fe, NULL, page)) 2232 if (alloc_set_pte(vmf, NULL, page))
2233 goto unlock; 2233 goto unlock;
2234 unlock_page(page); 2234 unlock_page(page);
2235 goto next; 2235 goto next;
@@ -2239,7 +2239,7 @@ skip:
2239 put_page(page); 2239 put_page(page);
2240next: 2240next:
2241 /* Huge page is mapped? No need to proceed. */ 2241 /* Huge page is mapped? No need to proceed. */
2242 if (pmd_trans_huge(*fe->pmd)) 2242 if (pmd_trans_huge(*vmf->pmd))
2243 break; 2243 break;
2244 if (iter.index == end_pgoff) 2244 if (iter.index == end_pgoff)
2245 break; 2245 break;
diff --git a/mm/gup.c b/mm/gup.c
index e50178c58b97..55315555489d 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -865,9 +865,10 @@ EXPORT_SYMBOL(get_user_pages_locked);
865 * caller if required (just like with __get_user_pages). "FOLL_GET" 865 * caller if required (just like with __get_user_pages). "FOLL_GET"
866 * is set implicitly if "pages" is non-NULL. 866 * is set implicitly if "pages" is non-NULL.
867 */ 867 */
868__always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, 868static __always_inline long __get_user_pages_unlocked(struct task_struct *tsk,
869 unsigned long start, unsigned long nr_pages, 869 struct mm_struct *mm, unsigned long start,
870 struct page **pages, unsigned int gup_flags) 870 unsigned long nr_pages, struct page **pages,
871 unsigned int gup_flags)
871{ 872{
872 long ret; 873 long ret;
873 int locked = 1; 874 int locked = 1;
@@ -879,7 +880,6 @@ __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct m
879 up_read(&mm->mmap_sem); 880 up_read(&mm->mmap_sem);
880 return ret; 881 return ret;
881} 882}
882EXPORT_SYMBOL(__get_user_pages_unlocked);
883 883
884/* 884/*
885 * get_user_pages_unlocked() is suitable to replace the form: 885 * get_user_pages_unlocked() is suitable to replace the form:
@@ -917,6 +917,9 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
917 * only intends to ensure the pages are faulted in. 917 * only intends to ensure the pages are faulted in.
918 * @vmas: array of pointers to vmas corresponding to each page. 918 * @vmas: array of pointers to vmas corresponding to each page.
919 * Or NULL if the caller does not require them. 919 * Or NULL if the caller does not require them.
920 * @locked: pointer to lock flag indicating whether lock is held and
921 * subsequently whether VM_FAULT_RETRY functionality can be
922 * utilised. Lock must initially be held.
920 * 923 *
921 * Returns number of pages pinned. This may be fewer than the number 924 * Returns number of pages pinned. This may be fewer than the number
922 * requested. If nr_pages is 0 or negative, returns 0. If no pages 925 * requested. If nr_pages is 0 or negative, returns 0. If no pages
@@ -960,10 +963,10 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
960long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, 963long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
961 unsigned long start, unsigned long nr_pages, 964 unsigned long start, unsigned long nr_pages,
962 unsigned int gup_flags, struct page **pages, 965 unsigned int gup_flags, struct page **pages,
963 struct vm_area_struct **vmas) 966 struct vm_area_struct **vmas, int *locked)
964{ 967{
965 return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, 968 return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
966 NULL, false, 969 locked, true,
967 gup_flags | FOLL_TOUCH | FOLL_REMOTE); 970 gup_flags | FOLL_TOUCH | FOLL_REMOTE);
968} 971}
969EXPORT_SYMBOL(get_user_pages_remote); 972EXPORT_SYMBOL(get_user_pages_remote);
@@ -971,8 +974,9 @@ EXPORT_SYMBOL(get_user_pages_remote);
971/* 974/*
972 * This is the same as get_user_pages_remote(), just with a 975 * This is the same as get_user_pages_remote(), just with a
973 * less-flexible calling convention where we assume that the task 976 * less-flexible calling convention where we assume that the task
974 * and mm being operated on are the current task's. We also 977 * and mm being operated on are the current task's and don't allow
975 * obviously don't pass FOLL_REMOTE in here. 978 * passing of a locked parameter. We also obviously don't pass
979 * FOLL_REMOTE in here.
976 */ 980 */
977long get_user_pages(unsigned long start, unsigned long nr_pages, 981long get_user_pages(unsigned long start, unsigned long nr_pages,
978 unsigned int gup_flags, struct page **pages, 982 unsigned int gup_flags, struct page **pages,
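The @locked parameter documented above changes the get_user_pages_remote() calling convention: callers must now check whether mmap_sem is still held on return, because gup may drop it on VM_FAULT_RETRY. A minimal, hypothetical caller against the new eight-argument signature might look like the sketch below (the helper name, FOLL_WRITE flag and single-page pin are illustrative assumptions, not taken from this patch):

/*
 * Hypothetical caller sketch: pin one page of a foreign mm with the new
 * get_user_pages_remote() signature.  If *locked was cleared, gup already
 * released mmap_sem for us.  On success the caller owns a page reference
 * and must put_page() it later.
 */
static struct page *pin_one_remote_page(struct task_struct *tsk,
					struct mm_struct *mm,
					unsigned long addr)
{
	struct page *page;
	int locked = 1;
	long ret;

	down_read(&mm->mmap_sem);
	ret = get_user_pages_remote(tsk, mm, addr, 1, FOLL_WRITE,
				    &page, NULL, &locked);
	if (locked)
		up_read(&mm->mmap_sem);
	return ret == 1 ? page : NULL;
}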
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index cee42cf05477..10eedbf14421 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -542,13 +542,13 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
542} 542}
543EXPORT_SYMBOL_GPL(thp_get_unmapped_area); 543EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
544 544
545static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, 545static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
546 gfp_t gfp) 546 gfp_t gfp)
547{ 547{
548 struct vm_area_struct *vma = fe->vma; 548 struct vm_area_struct *vma = vmf->vma;
549 struct mem_cgroup *memcg; 549 struct mem_cgroup *memcg;
550 pgtable_t pgtable; 550 pgtable_t pgtable;
551 unsigned long haddr = fe->address & HPAGE_PMD_MASK; 551 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
552 552
553 VM_BUG_ON_PAGE(!PageCompound(page), page); 553 VM_BUG_ON_PAGE(!PageCompound(page), page);
554 554
@@ -573,9 +573,9 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
573 */ 573 */
574 __SetPageUptodate(page); 574 __SetPageUptodate(page);
575 575
576 fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); 576 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
577 if (unlikely(!pmd_none(*fe->pmd))) { 577 if (unlikely(!pmd_none(*vmf->pmd))) {
578 spin_unlock(fe->ptl); 578 spin_unlock(vmf->ptl);
579 mem_cgroup_cancel_charge(page, memcg, true); 579 mem_cgroup_cancel_charge(page, memcg, true);
580 put_page(page); 580 put_page(page);
581 pte_free(vma->vm_mm, pgtable); 581 pte_free(vma->vm_mm, pgtable);
@@ -586,11 +586,11 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
586 if (userfaultfd_missing(vma)) { 586 if (userfaultfd_missing(vma)) {
587 int ret; 587 int ret;
588 588
589 spin_unlock(fe->ptl); 589 spin_unlock(vmf->ptl);
590 mem_cgroup_cancel_charge(page, memcg, true); 590 mem_cgroup_cancel_charge(page, memcg, true);
591 put_page(page); 591 put_page(page);
592 pte_free(vma->vm_mm, pgtable); 592 pte_free(vma->vm_mm, pgtable);
593 ret = handle_userfault(fe, VM_UFFD_MISSING); 593 ret = handle_userfault(vmf, VM_UFFD_MISSING);
594 VM_BUG_ON(ret & VM_FAULT_FALLBACK); 594 VM_BUG_ON(ret & VM_FAULT_FALLBACK);
595 return ret; 595 return ret;
596 } 596 }
@@ -600,11 +600,11 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
600 page_add_new_anon_rmap(page, vma, haddr, true); 600 page_add_new_anon_rmap(page, vma, haddr, true);
601 mem_cgroup_commit_charge(page, memcg, false, true); 601 mem_cgroup_commit_charge(page, memcg, false, true);
602 lru_cache_add_active_or_unevictable(page, vma); 602 lru_cache_add_active_or_unevictable(page, vma);
603 pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, pgtable); 603 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
604 set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); 604 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
605 add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); 605 add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
606 atomic_long_inc(&vma->vm_mm->nr_ptes); 606 atomic_long_inc(&vma->vm_mm->nr_ptes);
607 spin_unlock(fe->ptl); 607 spin_unlock(vmf->ptl);
608 count_vm_event(THP_FAULT_ALLOC); 608 count_vm_event(THP_FAULT_ALLOC);
609 } 609 }
610 610
@@ -651,12 +651,12 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
651 return true; 651 return true;
652} 652}
653 653
654int do_huge_pmd_anonymous_page(struct fault_env *fe) 654int do_huge_pmd_anonymous_page(struct vm_fault *vmf)
655{ 655{
656 struct vm_area_struct *vma = fe->vma; 656 struct vm_area_struct *vma = vmf->vma;
657 gfp_t gfp; 657 gfp_t gfp;
658 struct page *page; 658 struct page *page;
659 unsigned long haddr = fe->address & HPAGE_PMD_MASK; 659 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
660 660
661 if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) 661 if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
662 return VM_FAULT_FALLBACK; 662 return VM_FAULT_FALLBACK;
@@ -664,7 +664,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
664 return VM_FAULT_OOM; 664 return VM_FAULT_OOM;
665 if (unlikely(khugepaged_enter(vma, vma->vm_flags))) 665 if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
666 return VM_FAULT_OOM; 666 return VM_FAULT_OOM;
667 if (!(fe->flags & FAULT_FLAG_WRITE) && 667 if (!(vmf->flags & FAULT_FLAG_WRITE) &&
668 !mm_forbids_zeropage(vma->vm_mm) && 668 !mm_forbids_zeropage(vma->vm_mm) &&
669 transparent_hugepage_use_zero_page()) { 669 transparent_hugepage_use_zero_page()) {
670 pgtable_t pgtable; 670 pgtable_t pgtable;
@@ -680,22 +680,22 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
680 count_vm_event(THP_FAULT_FALLBACK); 680 count_vm_event(THP_FAULT_FALLBACK);
681 return VM_FAULT_FALLBACK; 681 return VM_FAULT_FALLBACK;
682 } 682 }
683 fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); 683 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
684 ret = 0; 684 ret = 0;
685 set = false; 685 set = false;
686 if (pmd_none(*fe->pmd)) { 686 if (pmd_none(*vmf->pmd)) {
687 if (userfaultfd_missing(vma)) { 687 if (userfaultfd_missing(vma)) {
688 spin_unlock(fe->ptl); 688 spin_unlock(vmf->ptl);
689 ret = handle_userfault(fe, VM_UFFD_MISSING); 689 ret = handle_userfault(vmf, VM_UFFD_MISSING);
690 VM_BUG_ON(ret & VM_FAULT_FALLBACK); 690 VM_BUG_ON(ret & VM_FAULT_FALLBACK);
691 } else { 691 } else {
692 set_huge_zero_page(pgtable, vma->vm_mm, vma, 692 set_huge_zero_page(pgtable, vma->vm_mm, vma,
693 haddr, fe->pmd, zero_page); 693 haddr, vmf->pmd, zero_page);
694 spin_unlock(fe->ptl); 694 spin_unlock(vmf->ptl);
695 set = true; 695 set = true;
696 } 696 }
697 } else 697 } else
698 spin_unlock(fe->ptl); 698 spin_unlock(vmf->ptl);
699 if (!set) 699 if (!set)
700 pte_free(vma->vm_mm, pgtable); 700 pte_free(vma->vm_mm, pgtable);
701 return ret; 701 return ret;
@@ -707,7 +707,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
707 return VM_FAULT_FALLBACK; 707 return VM_FAULT_FALLBACK;
708 } 708 }
709 prep_transhuge_page(page); 709 prep_transhuge_page(page);
710 return __do_huge_pmd_anonymous_page(fe, page, gfp); 710 return __do_huge_pmd_anonymous_page(vmf, page, gfp);
711} 711}
712 712
713static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, 713static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -879,30 +879,30 @@ out:
879 return ret; 879 return ret;
880} 880}
881 881
882void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd) 882void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
883{ 883{
884 pmd_t entry; 884 pmd_t entry;
885 unsigned long haddr; 885 unsigned long haddr;
886 886
887 fe->ptl = pmd_lock(fe->vma->vm_mm, fe->pmd); 887 vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
888 if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) 888 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
889 goto unlock; 889 goto unlock;
890 890
891 entry = pmd_mkyoung(orig_pmd); 891 entry = pmd_mkyoung(orig_pmd);
892 haddr = fe->address & HPAGE_PMD_MASK; 892 haddr = vmf->address & HPAGE_PMD_MASK;
893 if (pmdp_set_access_flags(fe->vma, haddr, fe->pmd, entry, 893 if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry,
894 fe->flags & FAULT_FLAG_WRITE)) 894 vmf->flags & FAULT_FLAG_WRITE))
895 update_mmu_cache_pmd(fe->vma, fe->address, fe->pmd); 895 update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd);
896 896
897unlock: 897unlock:
898 spin_unlock(fe->ptl); 898 spin_unlock(vmf->ptl);
899} 899}
900 900
901static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, 901static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
902 struct page *page) 902 struct page *page)
903{ 903{
904 struct vm_area_struct *vma = fe->vma; 904 struct vm_area_struct *vma = vmf->vma;
905 unsigned long haddr = fe->address & HPAGE_PMD_MASK; 905 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
906 struct mem_cgroup *memcg; 906 struct mem_cgroup *memcg;
907 pgtable_t pgtable; 907 pgtable_t pgtable;
908 pmd_t _pmd; 908 pmd_t _pmd;
@@ -921,7 +921,7 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd,
921 for (i = 0; i < HPAGE_PMD_NR; i++) { 921 for (i = 0; i < HPAGE_PMD_NR; i++) {
922 pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | 922 pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
923 __GFP_OTHER_NODE, vma, 923 __GFP_OTHER_NODE, vma,
924 fe->address, page_to_nid(page)); 924 vmf->address, page_to_nid(page));
925 if (unlikely(!pages[i] || 925 if (unlikely(!pages[i] ||
926 mem_cgroup_try_charge(pages[i], vma->vm_mm, 926 mem_cgroup_try_charge(pages[i], vma->vm_mm,
927 GFP_KERNEL, &memcg, false))) { 927 GFP_KERNEL, &memcg, false))) {
@@ -952,15 +952,15 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd,
952 mmun_end = haddr + HPAGE_PMD_SIZE; 952 mmun_end = haddr + HPAGE_PMD_SIZE;
953 mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); 953 mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
954 954
955 fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); 955 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
956 if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) 956 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
957 goto out_free_pages; 957 goto out_free_pages;
958 VM_BUG_ON_PAGE(!PageHead(page), page); 958 VM_BUG_ON_PAGE(!PageHead(page), page);
959 959
960 pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd); 960 pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
961 /* leave pmd empty until pte is filled */ 961 /* leave pmd empty until pte is filled */
962 962
963 pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, fe->pmd); 963 pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
964 pmd_populate(vma->vm_mm, &_pmd, pgtable); 964 pmd_populate(vma->vm_mm, &_pmd, pgtable);
965 965
966 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 966 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -969,20 +969,20 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd,
969 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 969 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
970 memcg = (void *)page_private(pages[i]); 970 memcg = (void *)page_private(pages[i]);
971 set_page_private(pages[i], 0); 971 set_page_private(pages[i], 0);
972 page_add_new_anon_rmap(pages[i], fe->vma, haddr, false); 972 page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
973 mem_cgroup_commit_charge(pages[i], memcg, false, false); 973 mem_cgroup_commit_charge(pages[i], memcg, false, false);
974 lru_cache_add_active_or_unevictable(pages[i], vma); 974 lru_cache_add_active_or_unevictable(pages[i], vma);
975 fe->pte = pte_offset_map(&_pmd, haddr); 975 vmf->pte = pte_offset_map(&_pmd, haddr);
976 VM_BUG_ON(!pte_none(*fe->pte)); 976 VM_BUG_ON(!pte_none(*vmf->pte));
977 set_pte_at(vma->vm_mm, haddr, fe->pte, entry); 977 set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
978 pte_unmap(fe->pte); 978 pte_unmap(vmf->pte);
979 } 979 }
980 kfree(pages); 980 kfree(pages);
981 981
982 smp_wmb(); /* make pte visible before pmd */ 982 smp_wmb(); /* make pte visible before pmd */
983 pmd_populate(vma->vm_mm, fe->pmd, pgtable); 983 pmd_populate(vma->vm_mm, vmf->pmd, pgtable);
984 page_remove_rmap(page, true); 984 page_remove_rmap(page, true);
985 spin_unlock(fe->ptl); 985 spin_unlock(vmf->ptl);
986 986
987 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); 987 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
988 988
@@ -993,7 +993,7 @@ out:
993 return ret; 993 return ret;
994 994
995out_free_pages: 995out_free_pages:
996 spin_unlock(fe->ptl); 996 spin_unlock(vmf->ptl);
997 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); 997 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
998 for (i = 0; i < HPAGE_PMD_NR; i++) { 998 for (i = 0; i < HPAGE_PMD_NR; i++) {
999 memcg = (void *)page_private(pages[i]); 999 memcg = (void *)page_private(pages[i]);
@@ -1005,23 +1005,23 @@ out_free_pages:
1005 goto out; 1005 goto out;
1006} 1006}
1007 1007
1008int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd) 1008int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
1009{ 1009{
1010 struct vm_area_struct *vma = fe->vma; 1010 struct vm_area_struct *vma = vmf->vma;
1011 struct page *page = NULL, *new_page; 1011 struct page *page = NULL, *new_page;
1012 struct mem_cgroup *memcg; 1012 struct mem_cgroup *memcg;
1013 unsigned long haddr = fe->address & HPAGE_PMD_MASK; 1013 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1014 unsigned long mmun_start; /* For mmu_notifiers */ 1014 unsigned long mmun_start; /* For mmu_notifiers */
1015 unsigned long mmun_end; /* For mmu_notifiers */ 1015 unsigned long mmun_end; /* For mmu_notifiers */
1016 gfp_t huge_gfp; /* for allocation and charge */ 1016 gfp_t huge_gfp; /* for allocation and charge */
1017 int ret = 0; 1017 int ret = 0;
1018 1018
1019 fe->ptl = pmd_lockptr(vma->vm_mm, fe->pmd); 1019 vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
1020 VM_BUG_ON_VMA(!vma->anon_vma, vma); 1020 VM_BUG_ON_VMA(!vma->anon_vma, vma);
1021 if (is_huge_zero_pmd(orig_pmd)) 1021 if (is_huge_zero_pmd(orig_pmd))
1022 goto alloc; 1022 goto alloc;
1023 spin_lock(fe->ptl); 1023 spin_lock(vmf->ptl);
1024 if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) 1024 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
1025 goto out_unlock; 1025 goto out_unlock;
1026 1026
1027 page = pmd_page(orig_pmd); 1027 page = pmd_page(orig_pmd);
@@ -1034,13 +1034,13 @@ int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd)
1034 pmd_t entry; 1034 pmd_t entry;
1035 entry = pmd_mkyoung(orig_pmd); 1035 entry = pmd_mkyoung(orig_pmd);
1036 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1036 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1037 if (pmdp_set_access_flags(vma, haddr, fe->pmd, entry, 1)) 1037 if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
1038 update_mmu_cache_pmd(vma, fe->address, fe->pmd); 1038 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1039 ret |= VM_FAULT_WRITE; 1039 ret |= VM_FAULT_WRITE;
1040 goto out_unlock; 1040 goto out_unlock;
1041 } 1041 }
1042 get_page(page); 1042 get_page(page);
1043 spin_unlock(fe->ptl); 1043 spin_unlock(vmf->ptl);
1044alloc: 1044alloc:
1045 if (transparent_hugepage_enabled(vma) && 1045 if (transparent_hugepage_enabled(vma) &&
1046 !transparent_hugepage_debug_cow()) { 1046 !transparent_hugepage_debug_cow()) {
@@ -1053,12 +1053,12 @@ alloc:
1053 prep_transhuge_page(new_page); 1053 prep_transhuge_page(new_page);
1054 } else { 1054 } else {
1055 if (!page) { 1055 if (!page) {
1056 split_huge_pmd(vma, fe->pmd, fe->address); 1056 split_huge_pmd(vma, vmf->pmd, vmf->address);
1057 ret |= VM_FAULT_FALLBACK; 1057 ret |= VM_FAULT_FALLBACK;
1058 } else { 1058 } else {
1059 ret = do_huge_pmd_wp_page_fallback(fe, orig_pmd, page); 1059 ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page);
1060 if (ret & VM_FAULT_OOM) { 1060 if (ret & VM_FAULT_OOM) {
1061 split_huge_pmd(vma, fe->pmd, fe->address); 1061 split_huge_pmd(vma, vmf->pmd, vmf->address);
1062 ret |= VM_FAULT_FALLBACK; 1062 ret |= VM_FAULT_FALLBACK;
1063 } 1063 }
1064 put_page(page); 1064 put_page(page);
@@ -1070,7 +1070,7 @@ alloc:
1070 if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm, 1070 if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
1071 huge_gfp, &memcg, true))) { 1071 huge_gfp, &memcg, true))) {
1072 put_page(new_page); 1072 put_page(new_page);
1073 split_huge_pmd(vma, fe->pmd, fe->address); 1073 split_huge_pmd(vma, vmf->pmd, vmf->address);
1074 if (page) 1074 if (page)
1075 put_page(page); 1075 put_page(page);
1076 ret |= VM_FAULT_FALLBACK; 1076 ret |= VM_FAULT_FALLBACK;
@@ -1090,11 +1090,11 @@ alloc:
1090 mmun_end = haddr + HPAGE_PMD_SIZE; 1090 mmun_end = haddr + HPAGE_PMD_SIZE;
1091 mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); 1091 mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
1092 1092
1093 spin_lock(fe->ptl); 1093 spin_lock(vmf->ptl);
1094 if (page) 1094 if (page)
1095 put_page(page); 1095 put_page(page);
1096 if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) { 1096 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
1097 spin_unlock(fe->ptl); 1097 spin_unlock(vmf->ptl);
1098 mem_cgroup_cancel_charge(new_page, memcg, true); 1098 mem_cgroup_cancel_charge(new_page, memcg, true);
1099 put_page(new_page); 1099 put_page(new_page);
1100 goto out_mn; 1100 goto out_mn;
@@ -1102,12 +1102,12 @@ alloc:
1102 pmd_t entry; 1102 pmd_t entry;
1103 entry = mk_huge_pmd(new_page, vma->vm_page_prot); 1103 entry = mk_huge_pmd(new_page, vma->vm_page_prot);
1104 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1104 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1105 pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd); 1105 pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
1106 page_add_new_anon_rmap(new_page, vma, haddr, true); 1106 page_add_new_anon_rmap(new_page, vma, haddr, true);
1107 mem_cgroup_commit_charge(new_page, memcg, false, true); 1107 mem_cgroup_commit_charge(new_page, memcg, false, true);
1108 lru_cache_add_active_or_unevictable(new_page, vma); 1108 lru_cache_add_active_or_unevictable(new_page, vma);
1109 set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); 1109 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
1110 update_mmu_cache_pmd(vma, fe->address, fe->pmd); 1110 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1111 if (!page) { 1111 if (!page) {
1112 add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); 1112 add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1113 } else { 1113 } else {
@@ -1117,13 +1117,13 @@ alloc:
1117 } 1117 }
1118 ret |= VM_FAULT_WRITE; 1118 ret |= VM_FAULT_WRITE;
1119 } 1119 }
1120 spin_unlock(fe->ptl); 1120 spin_unlock(vmf->ptl);
1121out_mn: 1121out_mn:
1122 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); 1122 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
1123out: 1123out:
1124 return ret; 1124 return ret;
1125out_unlock: 1125out_unlock:
1126 spin_unlock(fe->ptl); 1126 spin_unlock(vmf->ptl);
1127 return ret; 1127 return ret;
1128} 1128}
1129 1129
@@ -1196,12 +1196,12 @@ out:
1196} 1196}
1197 1197
1198/* NUMA hinting page fault entry point for trans huge pmds */ 1198/* NUMA hinting page fault entry point for trans huge pmds */
1199int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) 1199int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
1200{ 1200{
1201 struct vm_area_struct *vma = fe->vma; 1201 struct vm_area_struct *vma = vmf->vma;
1202 struct anon_vma *anon_vma = NULL; 1202 struct anon_vma *anon_vma = NULL;
1203 struct page *page; 1203 struct page *page;
1204 unsigned long haddr = fe->address & HPAGE_PMD_MASK; 1204 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1205 int page_nid = -1, this_nid = numa_node_id(); 1205 int page_nid = -1, this_nid = numa_node_id();
1206 int target_nid, last_cpupid = -1; 1206 int target_nid, last_cpupid = -1;
1207 bool page_locked; 1207 bool page_locked;
@@ -1209,8 +1209,8 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
1209 bool was_writable; 1209 bool was_writable;
1210 int flags = 0; 1210 int flags = 0;
1211 1211
1212 fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); 1212 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1213 if (unlikely(!pmd_same(pmd, *fe->pmd))) 1213 if (unlikely(!pmd_same(pmd, *vmf->pmd)))
1214 goto out_unlock; 1214 goto out_unlock;
1215 1215
1216 /* 1216 /*
@@ -1218,9 +1218,9 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
1218 * without disrupting NUMA hinting information. Do not relock and 1218 * without disrupting NUMA hinting information. Do not relock and
1219 * check_same as the page may no longer be mapped. 1219 * check_same as the page may no longer be mapped.
1220 */ 1220 */
1221 if (unlikely(pmd_trans_migrating(*fe->pmd))) { 1221 if (unlikely(pmd_trans_migrating(*vmf->pmd))) {
1222 page = pmd_page(*fe->pmd); 1222 page = pmd_page(*vmf->pmd);
1223 spin_unlock(fe->ptl); 1223 spin_unlock(vmf->ptl);
1224 wait_on_page_locked(page); 1224 wait_on_page_locked(page);
1225 goto out; 1225 goto out;
1226 } 1226 }
@@ -1253,7 +1253,7 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
1253 1253
1254 /* Migration could have started since the pmd_trans_migrating check */ 1254 /* Migration could have started since the pmd_trans_migrating check */
1255 if (!page_locked) { 1255 if (!page_locked) {
1256 spin_unlock(fe->ptl); 1256 spin_unlock(vmf->ptl);
1257 wait_on_page_locked(page); 1257 wait_on_page_locked(page);
1258 page_nid = -1; 1258 page_nid = -1;
1259 goto out; 1259 goto out;
@@ -1264,12 +1264,12 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
1264 * to serialises splits 1264 * to serialises splits
1265 */ 1265 */
1266 get_page(page); 1266 get_page(page);
1267 spin_unlock(fe->ptl); 1267 spin_unlock(vmf->ptl);
1268 anon_vma = page_lock_anon_vma_read(page); 1268 anon_vma = page_lock_anon_vma_read(page);
1269 1269
1270 /* Confirm the PMD did not change while page_table_lock was released */ 1270 /* Confirm the PMD did not change while page_table_lock was released */
1271 spin_lock(fe->ptl); 1271 spin_lock(vmf->ptl);
1272 if (unlikely(!pmd_same(pmd, *fe->pmd))) { 1272 if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
1273 unlock_page(page); 1273 unlock_page(page);
1274 put_page(page); 1274 put_page(page);
1275 page_nid = -1; 1275 page_nid = -1;
@@ -1287,9 +1287,9 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
1287 * Migrate the THP to the requested node, returns with page unlocked 1287 * Migrate the THP to the requested node, returns with page unlocked
1288 * and access rights restored. 1288 * and access rights restored.
1289 */ 1289 */
1290 spin_unlock(fe->ptl); 1290 spin_unlock(vmf->ptl);
1291 migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma, 1291 migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
1292 fe->pmd, pmd, fe->address, page, target_nid); 1292 vmf->pmd, pmd, vmf->address, page, target_nid);
1293 if (migrated) { 1293 if (migrated) {
1294 flags |= TNF_MIGRATED; 1294 flags |= TNF_MIGRATED;
1295 page_nid = target_nid; 1295 page_nid = target_nid;
@@ -1304,18 +1304,19 @@ clear_pmdnuma:
1304 pmd = pmd_mkyoung(pmd); 1304 pmd = pmd_mkyoung(pmd);
1305 if (was_writable) 1305 if (was_writable)
1306 pmd = pmd_mkwrite(pmd); 1306 pmd = pmd_mkwrite(pmd);
1307 set_pmd_at(vma->vm_mm, haddr, fe->pmd, pmd); 1307 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
1308 update_mmu_cache_pmd(vma, fe->address, fe->pmd); 1308 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1309 unlock_page(page); 1309 unlock_page(page);
1310out_unlock: 1310out_unlock:
1311 spin_unlock(fe->ptl); 1311 spin_unlock(vmf->ptl);
1312 1312
1313out: 1313out:
1314 if (anon_vma) 1314 if (anon_vma)
1315 page_unlock_anon_vma_read(anon_vma); 1315 page_unlock_anon_vma_read(anon_vma);
1316 1316
1317 if (page_nid != -1) 1317 if (page_nid != -1)
1318 task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, fe->flags); 1318 task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
1319 vmf->flags);
1319 1320
1320 return 0; 1321 return 0;
1321} 1322}
diff --git a/mm/internal.h b/mm/internal.h
index 537ac9951f5f..44d68895a9b9 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -36,7 +36,7 @@
36/* Do not use these with a slab allocator */ 36/* Do not use these with a slab allocator */
37#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) 37#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
38 38
39int do_swap_page(struct fault_env *fe, pte_t orig_pte); 39int do_swap_page(struct vm_fault *vmf);
40 40
41void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, 41void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
42 unsigned long floor, unsigned long ceiling); 42 unsigned long floor, unsigned long ceiling);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 09460955e818..e32389a97030 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -875,13 +875,13 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
875 unsigned long address, pmd_t *pmd, 875 unsigned long address, pmd_t *pmd,
876 int referenced) 876 int referenced)
877{ 877{
878 pte_t pteval;
879 int swapped_in = 0, ret = 0; 878 int swapped_in = 0, ret = 0;
880 struct fault_env fe = { 879 struct vm_fault vmf = {
881 .vma = vma, 880 .vma = vma,
882 .address = address, 881 .address = address,
883 .flags = FAULT_FLAG_ALLOW_RETRY, 882 .flags = FAULT_FLAG_ALLOW_RETRY,
884 .pmd = pmd, 883 .pmd = pmd,
884 .pgoff = linear_page_index(vma, address),
885 }; 885 };
886 886
887 /* we only decide to swapin, if there is enough young ptes */ 887 /* we only decide to swapin, if there is enough young ptes */
@@ -889,19 +889,19 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
889 trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); 889 trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
890 return false; 890 return false;
891 } 891 }
892 fe.pte = pte_offset_map(pmd, address); 892 vmf.pte = pte_offset_map(pmd, address);
893 for (; fe.address < address + HPAGE_PMD_NR*PAGE_SIZE; 893 for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE;
894 fe.pte++, fe.address += PAGE_SIZE) { 894 vmf.pte++, vmf.address += PAGE_SIZE) {
895 pteval = *fe.pte; 895 vmf.orig_pte = *vmf.pte;
896 if (!is_swap_pte(pteval)) 896 if (!is_swap_pte(vmf.orig_pte))
897 continue; 897 continue;
898 swapped_in++; 898 swapped_in++;
899 ret = do_swap_page(&fe, pteval); 899 ret = do_swap_page(&vmf);
900 900
901 /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */ 901 /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */
902 if (ret & VM_FAULT_RETRY) { 902 if (ret & VM_FAULT_RETRY) {
903 down_read(&mm->mmap_sem); 903 down_read(&mm->mmap_sem);
904 if (hugepage_vma_revalidate(mm, address, &fe.vma)) { 904 if (hugepage_vma_revalidate(mm, address, &vmf.vma)) {
905 /* vma is no longer available, don't continue to swapin */ 905 /* vma is no longer available, don't continue to swapin */
906 trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); 906 trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
907 return false; 907 return false;
@@ -915,10 +915,10 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
915 return false; 915 return false;
916 } 916 }
917 /* pte is unmapped now, we need to map it */ 917 /* pte is unmapped now, we need to map it */
918 fe.pte = pte_offset_map(pmd, fe.address); 918 vmf.pte = pte_offset_map(pmd, vmf.address);
919 } 919 }
920 fe.pte--; 920 vmf.pte--;
921 pte_unmap(fe.pte); 921 pte_unmap(vmf.pte);
922 trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1); 922 trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1);
923 return true; 923 return true;
924} 924}
@@ -1446,7 +1446,7 @@ static void collapse_shmem(struct mm_struct *mm,
1446 radix_tree_replace_slot(&mapping->page_tree, slot, 1446 radix_tree_replace_slot(&mapping->page_tree, slot,
1447 new_page + (index % HPAGE_PMD_NR)); 1447 new_page + (index % HPAGE_PMD_NR));
1448 1448
1449 slot = radix_tree_iter_next(&iter); 1449 slot = radix_tree_iter_resume(slot, &iter);
1450 index++; 1450 index++;
1451 continue; 1451 continue;
1452out_lru: 1452out_lru:
@@ -1546,7 +1546,6 @@ tree_unlocked:
1546 /* Put holes back where they were */ 1546 /* Put holes back where they were */
1547 radix_tree_delete(&mapping->page_tree, 1547 radix_tree_delete(&mapping->page_tree,
1548 iter.index); 1548 iter.index);
1549 slot = radix_tree_iter_next(&iter);
1550 continue; 1549 continue;
1551 } 1550 }
1552 1551
@@ -1557,11 +1556,11 @@ tree_unlocked:
1557 page_ref_unfreeze(page, 2); 1556 page_ref_unfreeze(page, 2);
1558 radix_tree_replace_slot(&mapping->page_tree, 1557 radix_tree_replace_slot(&mapping->page_tree,
1559 slot, page); 1558 slot, page);
1559 slot = radix_tree_iter_resume(slot, &iter);
1560 spin_unlock_irq(&mapping->tree_lock); 1560 spin_unlock_irq(&mapping->tree_lock);
1561 putback_lru_page(page); 1561 putback_lru_page(page);
1562 unlock_page(page); 1562 unlock_page(page);
1563 spin_lock_irq(&mapping->tree_lock); 1563 spin_lock_irq(&mapping->tree_lock);
1564 slot = radix_tree_iter_next(&iter);
1565 } 1564 }
1566 VM_BUG_ON(nr_none); 1565 VM_BUG_ON(nr_none);
1567 spin_unlock_irq(&mapping->tree_lock); 1566 spin_unlock_irq(&mapping->tree_lock);
@@ -1641,8 +1640,8 @@ static void khugepaged_scan_shmem(struct mm_struct *mm,
1641 present++; 1640 present++;
1642 1641
1643 if (need_resched()) { 1642 if (need_resched()) {
1643 slot = radix_tree_iter_resume(slot, &iter);
1644 cond_resched_rcu(); 1644 cond_resched_rcu();
1645 slot = radix_tree_iter_next(&iter);
1646 } 1645 }
1647 } 1646 }
1648 rcu_read_unlock(); 1647 rcu_read_unlock();
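Besides the vm_fault conversion, the khugepaged hunks switch the shmem scans from radix_tree_iter_next() to radix_tree_iter_resume(), which is now called on the current slot before the tree lock is dropped or the CPU rescheduled, rather than afterwards. A hedged sketch of the new loop shape (example_scan and its start offset are illustrative, not part of this patch):

/*
 * Illustrative loop shape only: resume the iterator before giving up the
 * RCU read lock, matching the radix_tree_iter_resume() calls above.
 */
static void example_scan(struct address_space *mapping, pgoff_t start)
{
	struct radix_tree_iter iter;
	void **slot;

	rcu_read_lock();
	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
		/* ... examine the entry at "slot" ... */
		if (need_resched()) {
			slot = radix_tree_iter_resume(slot, &iter);
			cond_resched_rcu();
		}
	}
	rcu_read_unlock();
}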
diff --git a/mm/memory.c b/mm/memory.c
index 08d8da39de28..455c3e628d52 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2034,20 +2034,17 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
2034 * 2034 *
2035 * We do this without the lock held, so that it can sleep if it needs to. 2035 * We do this without the lock held, so that it can sleep if it needs to.
2036 */ 2036 */
2037static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, 2037static int do_page_mkwrite(struct vm_fault *vmf)
2038 unsigned long address)
2039{ 2038{
2040 struct vm_fault vmf;
2041 int ret; 2039 int ret;
2040 struct page *page = vmf->page;
2041 unsigned int old_flags = vmf->flags;
2042 2042
2043 vmf.virtual_address = (void __user *)(address & PAGE_MASK); 2043 vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
2044 vmf.pgoff = page->index;
2045 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
2046 vmf.gfp_mask = __get_fault_gfp_mask(vma);
2047 vmf.page = page;
2048 vmf.cow_page = NULL;
2049 2044
2050 ret = vma->vm_ops->page_mkwrite(vma, &vmf); 2045 ret = vmf->vma->vm_ops->page_mkwrite(vmf->vma, vmf);
2046 /* Restore original flags so that caller is not surprised */
2047 vmf->flags = old_flags;
2051 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) 2048 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
2052 return ret; 2049 return ret;
2053 if (unlikely(!(ret & VM_FAULT_LOCKED))) { 2050 if (unlikely(!(ret & VM_FAULT_LOCKED))) {
@@ -2063,6 +2060,41 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
2063} 2060}
2064 2061
2065/* 2062/*
2063 * Handle dirtying of a page in shared file mapping on a write fault.
2064 *
2065 * The function expects the page to be locked and unlocks it.
2066 */
2067static void fault_dirty_shared_page(struct vm_area_struct *vma,
2068 struct page *page)
2069{
2070 struct address_space *mapping;
2071 bool dirtied;
2072 bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
2073
2074 dirtied = set_page_dirty(page);
2075 VM_BUG_ON_PAGE(PageAnon(page), page);
2076 /*
2077 * Take a local copy of the address_space - page.mapping may be zeroed
2078 * by truncate after unlock_page(). The address_space itself remains
2079 * pinned by vma->vm_file's reference. We rely on unlock_page()'s
2080 * release semantics to prevent the compiler from undoing this copying.
2081 */
2082 mapping = page_rmapping(page);
2083 unlock_page(page);
2084
2085 if ((dirtied || page_mkwrite) && mapping) {
2086 /*
2087 * Some device drivers do not set page.mapping
2088 * but still dirty their pages
2089 */
2090 balance_dirty_pages_ratelimited(mapping);
2091 }
2092
2093 if (!page_mkwrite)
2094 file_update_time(vma->vm_file);
2095}
2096
2097/*
2066 * Handle write page faults for pages that can be reused in the current vma 2098 * Handle write page faults for pages that can be reused in the current vma
2067 * 2099 *
2068 * This can happen either due to the mapping being with the VM_SHARED flag, 2100 * This can happen either due to the mapping being with the VM_SHARED flag,
@@ -2070,11 +2102,11 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
2070 * case, all we need to do here is to mark the page as writable and update 2102 * case, all we need to do here is to mark the page as writable and update
2071 * any related book-keeping. 2103 * any related book-keeping.
2072 */ 2104 */
2073static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, 2105static inline void wp_page_reuse(struct vm_fault *vmf)
2074 struct page *page, int page_mkwrite, int dirty_shared) 2106 __releases(vmf->ptl)
2075 __releases(fe->ptl)
2076{ 2107{
2077 struct vm_area_struct *vma = fe->vma; 2108 struct vm_area_struct *vma = vmf->vma;
2109 struct page *page = vmf->page;
2078 pte_t entry; 2110 pte_t entry;
2079 /* 2111 /*
2080 * Clear the pages cpupid information as the existing 2112 * Clear the pages cpupid information as the existing
@@ -2084,39 +2116,12 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
2084 if (page) 2116 if (page)
2085 page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); 2117 page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
2086 2118
2087 flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); 2119 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
2088 entry = pte_mkyoung(orig_pte); 2120 entry = pte_mkyoung(vmf->orig_pte);
2089 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2121 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2090 if (ptep_set_access_flags(vma, fe->address, fe->pte, entry, 1)) 2122 if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
2091 update_mmu_cache(vma, fe->address, fe->pte); 2123 update_mmu_cache(vma, vmf->address, vmf->pte);
2092 pte_unmap_unlock(fe->pte, fe->ptl); 2124 pte_unmap_unlock(vmf->pte, vmf->ptl);
2093
2094 if (dirty_shared) {
2095 struct address_space *mapping;
2096 int dirtied;
2097
2098 if (!page_mkwrite)
2099 lock_page(page);
2100
2101 dirtied = set_page_dirty(page);
2102 VM_BUG_ON_PAGE(PageAnon(page), page);
2103 mapping = page->mapping;
2104 unlock_page(page);
2105 put_page(page);
2106
2107 if ((dirtied || page_mkwrite) && mapping) {
2108 /*
2109 * Some device drivers do not set page.mapping
2110 * but still dirty their pages
2111 */
2112 balance_dirty_pages_ratelimited(mapping);
2113 }
2114
2115 if (!page_mkwrite)
2116 file_update_time(vma->vm_file);
2117 }
2118
2119 return VM_FAULT_WRITE;
2120} 2125}
2121 2126
2122/* 2127/*
@@ -2135,31 +2140,32 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
2135 * held to the old page, as well as updating the rmap. 2140 * held to the old page, as well as updating the rmap.
2136 * - In any case, unlock the PTL and drop the reference we took to the old page. 2141 * - In any case, unlock the PTL and drop the reference we took to the old page.
2137 */ 2142 */
2138static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, 2143static int wp_page_copy(struct vm_fault *vmf)
2139 struct page *old_page)
2140{ 2144{
2141 struct vm_area_struct *vma = fe->vma; 2145 struct vm_area_struct *vma = vmf->vma;
2142 struct mm_struct *mm = vma->vm_mm; 2146 struct mm_struct *mm = vma->vm_mm;
2147 struct page *old_page = vmf->page;
2143 struct page *new_page = NULL; 2148 struct page *new_page = NULL;
2144 pte_t entry; 2149 pte_t entry;
2145 int page_copied = 0; 2150 int page_copied = 0;
2146 const unsigned long mmun_start = fe->address & PAGE_MASK; 2151 const unsigned long mmun_start = vmf->address & PAGE_MASK;
2147 const unsigned long mmun_end = mmun_start + PAGE_SIZE; 2152 const unsigned long mmun_end = mmun_start + PAGE_SIZE;
2148 struct mem_cgroup *memcg; 2153 struct mem_cgroup *memcg;
2149 2154
2150 if (unlikely(anon_vma_prepare(vma))) 2155 if (unlikely(anon_vma_prepare(vma)))
2151 goto oom; 2156 goto oom;
2152 2157
2153 if (is_zero_pfn(pte_pfn(orig_pte))) { 2158 if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
2154 new_page = alloc_zeroed_user_highpage_movable(vma, fe->address); 2159 new_page = alloc_zeroed_user_highpage_movable(vma,
2160 vmf->address);
2155 if (!new_page) 2161 if (!new_page)
2156 goto oom; 2162 goto oom;
2157 } else { 2163 } else {
2158 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, 2164 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
2159 fe->address); 2165 vmf->address);
2160 if (!new_page) 2166 if (!new_page)
2161 goto oom; 2167 goto oom;
2162 cow_user_page(new_page, old_page, fe->address, vma); 2168 cow_user_page(new_page, old_page, vmf->address, vma);
2163 } 2169 }
2164 2170
2165 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) 2171 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
@@ -2172,8 +2178,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
2172 /* 2178 /*
2173 * Re-check the pte - we dropped the lock 2179 * Re-check the pte - we dropped the lock
2174 */ 2180 */
2175 fe->pte = pte_offset_map_lock(mm, fe->pmd, fe->address, &fe->ptl); 2181 vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
2176 if (likely(pte_same(*fe->pte, orig_pte))) { 2182 if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
2177 if (old_page) { 2183 if (old_page) {
2178 if (!PageAnon(old_page)) { 2184 if (!PageAnon(old_page)) {
2179 dec_mm_counter_fast(mm, 2185 dec_mm_counter_fast(mm,
@@ -2183,7 +2189,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
2183 } else { 2189 } else {
2184 inc_mm_counter_fast(mm, MM_ANONPAGES); 2190 inc_mm_counter_fast(mm, MM_ANONPAGES);
2185 } 2191 }
2186 flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); 2192 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
2187 entry = mk_pte(new_page, vma->vm_page_prot); 2193 entry = mk_pte(new_page, vma->vm_page_prot);
2188 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2194 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2189 /* 2195 /*
@@ -2192,8 +2198,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
2192 * seen in the presence of one thread doing SMC and another 2198 * seen in the presence of one thread doing SMC and another
2193 * thread doing COW. 2199 * thread doing COW.
2194 */ 2200 */
2195 ptep_clear_flush_notify(vma, fe->address, fe->pte); 2201 ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
2196 page_add_new_anon_rmap(new_page, vma, fe->address, false); 2202 page_add_new_anon_rmap(new_page, vma, vmf->address, false);
2197 mem_cgroup_commit_charge(new_page, memcg, false, false); 2203 mem_cgroup_commit_charge(new_page, memcg, false, false);
2198 lru_cache_add_active_or_unevictable(new_page, vma); 2204 lru_cache_add_active_or_unevictable(new_page, vma);
2199 /* 2205 /*
@@ -2201,8 +2207,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
2201 * mmu page tables (such as kvm shadow page tables), we want the 2207 * mmu page tables (such as kvm shadow page tables), we want the
2202 * new page to be mapped directly into the secondary page table. 2208 * new page to be mapped directly into the secondary page table.
2203 */ 2209 */
2204 set_pte_at_notify(mm, fe->address, fe->pte, entry); 2210 set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
2205 update_mmu_cache(vma, fe->address, fe->pte); 2211 update_mmu_cache(vma, vmf->address, vmf->pte);
2206 if (old_page) { 2212 if (old_page) {
2207 /* 2213 /*
2208 * Only after switching the pte to the new page may 2214 * Only after switching the pte to the new page may
@@ -2239,7 +2245,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
2239 if (new_page) 2245 if (new_page)
2240 put_page(new_page); 2246 put_page(new_page);
2241 2247
2242 pte_unmap_unlock(fe->pte, fe->ptl); 2248 pte_unmap_unlock(vmf->pte, vmf->ptl);
2243 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2249 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2244 if (old_page) { 2250 if (old_page) {
2245 /* 2251 /*
@@ -2263,79 +2269,91 @@ oom:
2263 return VM_FAULT_OOM; 2269 return VM_FAULT_OOM;
2264} 2270}
2265 2271
2272/**
2273 * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
2274 * writeable once the page is prepared
2275 *
2276 * @vmf: structure describing the fault
2277 *
2278 * This function handles all that is needed to finish a write page fault in a
2279 * shared mapping due to PTE being read-only once the mapped page is prepared.
2280 * It handles locking of PTE and modifying it. The function returns
2281 * VM_FAULT_WRITE on success, 0 when PTE got changed before we acquired PTE
2282 * lock.
2283 *
2284 * The function expects the page to be locked or other protection against
2285 * concurrent faults / writeback (such as DAX radix tree locks).
2286 */
2287int finish_mkwrite_fault(struct vm_fault *vmf)
2288{
2289 WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
2290 vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
2291 &vmf->ptl);
2292 /*
2293 * We might have raced with another page fault while we released the
2294 * pte_offset_map_lock.
2295 */
2296 if (!pte_same(*vmf->pte, vmf->orig_pte)) {
2297 pte_unmap_unlock(vmf->pte, vmf->ptl);
2298 return VM_FAULT_NOPAGE;
2299 }
2300 wp_page_reuse(vmf);
2301 return 0;
2302}
2303
2266/* 2304/*
2267 * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED 2305 * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
2268 * mapping 2306 * mapping
2269 */ 2307 */
2270static int wp_pfn_shared(struct fault_env *fe, pte_t orig_pte) 2308static int wp_pfn_shared(struct vm_fault *vmf)
2271{ 2309{
2272 struct vm_area_struct *vma = fe->vma; 2310 struct vm_area_struct *vma = vmf->vma;
2273 2311
2274 if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { 2312 if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
2275 struct vm_fault vmf = {
2276 .page = NULL,
2277 .pgoff = linear_page_index(vma, fe->address),
2278 .virtual_address =
2279 (void __user *)(fe->address & PAGE_MASK),
2280 .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE,
2281 };
2282 int ret; 2313 int ret;
2283 2314
2284 pte_unmap_unlock(fe->pte, fe->ptl); 2315 pte_unmap_unlock(vmf->pte, vmf->ptl);
2285 ret = vma->vm_ops->pfn_mkwrite(vma, &vmf); 2316 vmf->flags |= FAULT_FLAG_MKWRITE;
2286 if (ret & VM_FAULT_ERROR) 2317 ret = vma->vm_ops->pfn_mkwrite(vma, vmf);
2318 if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
2287 return ret; 2319 return ret;
2288 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, 2320 return finish_mkwrite_fault(vmf);
2289 &fe->ptl);
2290 /*
2291 * We might have raced with another page fault while we
2292 * released the pte_offset_map_lock.
2293 */
2294 if (!pte_same(*fe->pte, orig_pte)) {
2295 pte_unmap_unlock(fe->pte, fe->ptl);
2296 return 0;
2297 }
2298 } 2321 }
2299 return wp_page_reuse(fe, orig_pte, NULL, 0, 0); 2322 wp_page_reuse(vmf);
2323 return VM_FAULT_WRITE;
2300} 2324}
2301 2325
2302static int wp_page_shared(struct fault_env *fe, pte_t orig_pte, 2326static int wp_page_shared(struct vm_fault *vmf)
2303 struct page *old_page) 2327 __releases(vmf->ptl)
2304 __releases(fe->ptl)
2305{ 2328{
2306 struct vm_area_struct *vma = fe->vma; 2329 struct vm_area_struct *vma = vmf->vma;
2307 int page_mkwrite = 0;
2308 2330
2309 get_page(old_page); 2331 get_page(vmf->page);
2310 2332
2311 if (vma->vm_ops && vma->vm_ops->page_mkwrite) { 2333 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
2312 int tmp; 2334 int tmp;
2313 2335
2314 pte_unmap_unlock(fe->pte, fe->ptl); 2336 pte_unmap_unlock(vmf->pte, vmf->ptl);
2315 tmp = do_page_mkwrite(vma, old_page, fe->address); 2337 tmp = do_page_mkwrite(vmf);
2316 if (unlikely(!tmp || (tmp & 2338 if (unlikely(!tmp || (tmp &
2317 (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { 2339 (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
2318 put_page(old_page); 2340 put_page(vmf->page);
2319 return tmp; 2341 return tmp;
2320 } 2342 }
2321 /* 2343 tmp = finish_mkwrite_fault(vmf);
2322 * Since we dropped the lock we need to revalidate 2344 if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
2323 * the PTE as someone else may have changed it. If 2345 unlock_page(vmf->page);
2324 * they did, we just return, as we can count on the 2346 put_page(vmf->page);
2325 * MMU to tell us if they didn't also make it writable. 2347 return tmp;
2326 */
2327 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
2328 &fe->ptl);
2329 if (!pte_same(*fe->pte, orig_pte)) {
2330 unlock_page(old_page);
2331 pte_unmap_unlock(fe->pte, fe->ptl);
2332 put_page(old_page);
2333 return 0;
2334 } 2348 }
2335 page_mkwrite = 1; 2349 } else {
2350 wp_page_reuse(vmf);
2351 lock_page(vmf->page);
2336 } 2352 }
2353 fault_dirty_shared_page(vma, vmf->page);
2354 put_page(vmf->page);
2337 2355
2338 return wp_page_reuse(fe, orig_pte, old_page, page_mkwrite, 1); 2356 return VM_FAULT_WRITE;
2339} 2357}
2340 2358
2341/* 2359/*
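finish_mkwrite_fault(), introduced in the hunk above, concentrates the "re-take the PTE lock and re-check it" step that wp_pfn_shared() and wp_page_shared() previously open-coded. A hypothetical pfn_mkwrite-style path built on it would mirror the converted wp_pfn_shared() (example_pfn_mkwrite_fault is illustrative, not a kernel function):

/*
 * Hypothetical sketch: run the driver callback without the PTE lock, then
 * let finish_mkwrite_fault() relock the PTE and either reuse it or return
 * VM_FAULT_NOPAGE if it changed while the lock was dropped.
 */
static int example_pfn_mkwrite_fault(struct vm_fault *vmf)
{
	int ret;

	pte_unmap_unlock(vmf->pte, vmf->ptl);
	vmf->flags |= FAULT_FLAG_MKWRITE;
	ret = vmf->vma->vm_ops->pfn_mkwrite(vmf->vma, vmf);
	if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
		return ret;
	return finish_mkwrite_fault(vmf);
}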
@@ -2356,14 +2374,13 @@ static int wp_page_shared(struct fault_env *fe, pte_t orig_pte,
2356 * but allow concurrent faults), with pte both mapped and locked. 2374 * but allow concurrent faults), with pte both mapped and locked.
2357 * We return with mmap_sem still held, but pte unmapped and unlocked. 2375 * We return with mmap_sem still held, but pte unmapped and unlocked.
2358 */ 2376 */
2359static int do_wp_page(struct fault_env *fe, pte_t orig_pte) 2377static int do_wp_page(struct vm_fault *vmf)
2360 __releases(fe->ptl) 2378 __releases(vmf->ptl)
2361{ 2379{
2362 struct vm_area_struct *vma = fe->vma; 2380 struct vm_area_struct *vma = vmf->vma;
2363 struct page *old_page;
2364 2381
2365 old_page = vm_normal_page(vma, fe->address, orig_pte); 2382 vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
2366 if (!old_page) { 2383 if (!vmf->page) {
2367 /* 2384 /*
2368 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a 2385 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
2369 * VM_PFNMAP VMA. 2386 * VM_PFNMAP VMA.
@@ -2373,33 +2390,33 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
2373 */ 2390 */
2374 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2391 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2375 (VM_WRITE|VM_SHARED)) 2392 (VM_WRITE|VM_SHARED))
2376 return wp_pfn_shared(fe, orig_pte); 2393 return wp_pfn_shared(vmf);
2377 2394
2378 pte_unmap_unlock(fe->pte, fe->ptl); 2395 pte_unmap_unlock(vmf->pte, vmf->ptl);
2379 return wp_page_copy(fe, orig_pte, old_page); 2396 return wp_page_copy(vmf);
2380 } 2397 }
2381 2398
2382 /* 2399 /*
2383 * Take out anonymous pages first, anonymous shared vmas are 2400 * Take out anonymous pages first, anonymous shared vmas are
2384 * not dirty accountable. 2401 * not dirty accountable.
2385 */ 2402 */
2386 if (PageAnon(old_page) && !PageKsm(old_page)) { 2403 if (PageAnon(vmf->page) && !PageKsm(vmf->page)) {
2387 int total_mapcount; 2404 int total_mapcount;
2388 if (!trylock_page(old_page)) { 2405 if (!trylock_page(vmf->page)) {
2389 get_page(old_page); 2406 get_page(vmf->page);
2390 pte_unmap_unlock(fe->pte, fe->ptl); 2407 pte_unmap_unlock(vmf->pte, vmf->ptl);
2391 lock_page(old_page); 2408 lock_page(vmf->page);
2392 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, 2409 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
2393 fe->address, &fe->ptl); 2410 vmf->address, &vmf->ptl);
2394 if (!pte_same(*fe->pte, orig_pte)) { 2411 if (!pte_same(*vmf->pte, vmf->orig_pte)) {
2395 unlock_page(old_page); 2412 unlock_page(vmf->page);
2396 pte_unmap_unlock(fe->pte, fe->ptl); 2413 pte_unmap_unlock(vmf->pte, vmf->ptl);
2397 put_page(old_page); 2414 put_page(vmf->page);
2398 return 0; 2415 return 0;
2399 } 2416 }
2400 put_page(old_page); 2417 put_page(vmf->page);
2401 } 2418 }
2402 if (reuse_swap_page(old_page, &total_mapcount)) { 2419 if (reuse_swap_page(vmf->page, &total_mapcount)) {
2403 if (total_mapcount == 1) { 2420 if (total_mapcount == 1) {
2404 /* 2421 /*
2405 * The page is all ours. Move it to 2422 * The page is all ours. Move it to
@@ -2408,24 +2425,25 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
2408 * Protected against the rmap code by 2425 * Protected against the rmap code by
2409 * the page lock. 2426 * the page lock.
2410 */ 2427 */
2411 page_move_anon_rmap(old_page, vma); 2428 page_move_anon_rmap(vmf->page, vma);
2412 } 2429 }
2413 unlock_page(old_page); 2430 unlock_page(vmf->page);
2414 return wp_page_reuse(fe, orig_pte, old_page, 0, 0); 2431 wp_page_reuse(vmf);
2432 return VM_FAULT_WRITE;
2415 } 2433 }
2416 unlock_page(old_page); 2434 unlock_page(vmf->page);
2417 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2435 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2418 (VM_WRITE|VM_SHARED))) { 2436 (VM_WRITE|VM_SHARED))) {
2419 return wp_page_shared(fe, orig_pte, old_page); 2437 return wp_page_shared(vmf);
2420 } 2438 }
2421 2439
2422 /* 2440 /*
2423 * Ok, we need to copy. Oh, well.. 2441 * Ok, we need to copy. Oh, well..
2424 */ 2442 */
2425 get_page(old_page); 2443 get_page(vmf->page);
2426 2444
2427 pte_unmap_unlock(fe->pte, fe->ptl); 2445 pte_unmap_unlock(vmf->pte, vmf->ptl);
2428 return wp_page_copy(fe, orig_pte, old_page); 2446 return wp_page_copy(vmf);
2429} 2447}
2430 2448
2431static void unmap_mapping_range_vma(struct vm_area_struct *vma, 2449static void unmap_mapping_range_vma(struct vm_area_struct *vma,
@@ -2513,9 +2531,9 @@ EXPORT_SYMBOL(unmap_mapping_range);
2513 * We return with the mmap_sem locked or unlocked in the same cases 2531 * We return with the mmap_sem locked or unlocked in the same cases
2514 * as does filemap_fault(). 2532 * as does filemap_fault().
2515 */ 2533 */
2516int do_swap_page(struct fault_env *fe, pte_t orig_pte) 2534int do_swap_page(struct vm_fault *vmf)
2517{ 2535{
2518 struct vm_area_struct *vma = fe->vma; 2536 struct vm_area_struct *vma = vmf->vma;
2519 struct page *page, *swapcache; 2537 struct page *page, *swapcache;
2520 struct mem_cgroup *memcg; 2538 struct mem_cgroup *memcg;
2521 swp_entry_t entry; 2539 swp_entry_t entry;
@@ -2524,17 +2542,18 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
2524 int exclusive = 0; 2542 int exclusive = 0;
2525 int ret = 0; 2543 int ret = 0;
2526 2544
2527 if (!pte_unmap_same(vma->vm_mm, fe->pmd, fe->pte, orig_pte)) 2545 if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
2528 goto out; 2546 goto out;
2529 2547
2530 entry = pte_to_swp_entry(orig_pte); 2548 entry = pte_to_swp_entry(vmf->orig_pte);
2531 if (unlikely(non_swap_entry(entry))) { 2549 if (unlikely(non_swap_entry(entry))) {
2532 if (is_migration_entry(entry)) { 2550 if (is_migration_entry(entry)) {
2533 migration_entry_wait(vma->vm_mm, fe->pmd, fe->address); 2551 migration_entry_wait(vma->vm_mm, vmf->pmd,
2552 vmf->address);
2534 } else if (is_hwpoison_entry(entry)) { 2553 } else if (is_hwpoison_entry(entry)) {
2535 ret = VM_FAULT_HWPOISON; 2554 ret = VM_FAULT_HWPOISON;
2536 } else { 2555 } else {
2537 print_bad_pte(vma, fe->address, orig_pte, NULL); 2556 print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
2538 ret = VM_FAULT_SIGBUS; 2557 ret = VM_FAULT_SIGBUS;
2539 } 2558 }
2540 goto out; 2559 goto out;
@@ -2542,16 +2561,16 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
2542 delayacct_set_flag(DELAYACCT_PF_SWAPIN); 2561 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2543 page = lookup_swap_cache(entry); 2562 page = lookup_swap_cache(entry);
2544 if (!page) { 2563 if (!page) {
2545 page = swapin_readahead(entry, 2564 page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma,
2546 GFP_HIGHUSER_MOVABLE, vma, fe->address); 2565 vmf->address);
2547 if (!page) { 2566 if (!page) {
2548 /* 2567 /*
2549 * Back out if somebody else faulted in this pte 2568 * Back out if somebody else faulted in this pte
2550 * while we released the pte lock. 2569 * while we released the pte lock.
2551 */ 2570 */
2552 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, 2571 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
2553 fe->address, &fe->ptl); 2572 vmf->address, &vmf->ptl);
2554 if (likely(pte_same(*fe->pte, orig_pte))) 2573 if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
2555 ret = VM_FAULT_OOM; 2574 ret = VM_FAULT_OOM;
2556 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2575 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2557 goto unlock; 2576 goto unlock;
@@ -2573,7 +2592,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
2573 } 2592 }
2574 2593
2575 swapcache = page; 2594 swapcache = page;
2576 locked = lock_page_or_retry(page, vma->vm_mm, fe->flags); 2595 locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
2577 2596
2578 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2597 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2579 if (!locked) { 2598 if (!locked) {
@@ -2590,7 +2609,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
2590 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) 2609 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
2591 goto out_page; 2610 goto out_page;
2592 2611
2593 page = ksm_might_need_to_copy(page, vma, fe->address); 2612 page = ksm_might_need_to_copy(page, vma, vmf->address);
2594 if (unlikely(!page)) { 2613 if (unlikely(!page)) {
2595 ret = VM_FAULT_OOM; 2614 ret = VM_FAULT_OOM;
2596 page = swapcache; 2615 page = swapcache;
@@ -2606,9 +2625,9 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
2606 /* 2625 /*
2607 * Back out if somebody else already faulted in this pte. 2626 * Back out if somebody else already faulted in this pte.
2608 */ 2627 */
2609 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, 2628 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
2610 &fe->ptl); 2629 &vmf->ptl);
2611 if (unlikely(!pte_same(*fe->pte, orig_pte))) 2630 if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
2612 goto out_nomap; 2631 goto out_nomap;
2613 2632
2614 if (unlikely(!PageUptodate(page))) { 2633 if (unlikely(!PageUptodate(page))) {
@@ -2629,22 +2648,23 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
2629 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); 2648 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
2630 dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); 2649 dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
2631 pte = mk_pte(page, vma->vm_page_prot); 2650 pte = mk_pte(page, vma->vm_page_prot);
2632 if ((fe->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { 2651 if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
2633 pte = maybe_mkwrite(pte_mkdirty(pte), vma); 2652 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2634 fe->flags &= ~FAULT_FLAG_WRITE; 2653 vmf->flags &= ~FAULT_FLAG_WRITE;
2635 ret |= VM_FAULT_WRITE; 2654 ret |= VM_FAULT_WRITE;
2636 exclusive = RMAP_EXCLUSIVE; 2655 exclusive = RMAP_EXCLUSIVE;
2637 } 2656 }
2638 flush_icache_page(vma, page); 2657 flush_icache_page(vma, page);
2639 if (pte_swp_soft_dirty(orig_pte)) 2658 if (pte_swp_soft_dirty(vmf->orig_pte))
2640 pte = pte_mksoft_dirty(pte); 2659 pte = pte_mksoft_dirty(pte);
2641 set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); 2660 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
2661 vmf->orig_pte = pte;
2642 if (page == swapcache) { 2662 if (page == swapcache) {
2643 do_page_add_anon_rmap(page, vma, fe->address, exclusive); 2663 do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
2644 mem_cgroup_commit_charge(page, memcg, true, false); 2664 mem_cgroup_commit_charge(page, memcg, true, false);
2645 activate_page(page); 2665 activate_page(page);
2646 } else { /* ksm created a completely new copy */ 2666 } else { /* ksm created a completely new copy */
2647 page_add_new_anon_rmap(page, vma, fe->address, false); 2667 page_add_new_anon_rmap(page, vma, vmf->address, false);
2648 mem_cgroup_commit_charge(page, memcg, false, false); 2668 mem_cgroup_commit_charge(page, memcg, false, false);
2649 lru_cache_add_active_or_unevictable(page, vma); 2669 lru_cache_add_active_or_unevictable(page, vma);
2650 } 2670 }
@@ -2667,22 +2687,22 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
2667 put_page(swapcache); 2687 put_page(swapcache);
2668 } 2688 }
2669 2689
2670 if (fe->flags & FAULT_FLAG_WRITE) { 2690 if (vmf->flags & FAULT_FLAG_WRITE) {
2671 ret |= do_wp_page(fe, pte); 2691 ret |= do_wp_page(vmf);
2672 if (ret & VM_FAULT_ERROR) 2692 if (ret & VM_FAULT_ERROR)
2673 ret &= VM_FAULT_ERROR; 2693 ret &= VM_FAULT_ERROR;
2674 goto out; 2694 goto out;
2675 } 2695 }
2676 2696
2677 /* No need to invalidate - it was non-present before */ 2697 /* No need to invalidate - it was non-present before */
2678 update_mmu_cache(vma, fe->address, fe->pte); 2698 update_mmu_cache(vma, vmf->address, vmf->pte);
2679unlock: 2699unlock:
2680 pte_unmap_unlock(fe->pte, fe->ptl); 2700 pte_unmap_unlock(vmf->pte, vmf->ptl);
2681out: 2701out:
2682 return ret; 2702 return ret;
2683out_nomap: 2703out_nomap:
2684 mem_cgroup_cancel_charge(page, memcg, false); 2704 mem_cgroup_cancel_charge(page, memcg, false);
2685 pte_unmap_unlock(fe->pte, fe->ptl); 2705 pte_unmap_unlock(vmf->pte, vmf->ptl);
2686out_page: 2706out_page:
2687 unlock_page(page); 2707 unlock_page(page);
2688out_release: 2708out_release:
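
A subtle point in the hunk above: do_wp_page() now takes only the vm_fault, so once the swapped-in page's PTE has been written, the snapshot in vmf->orig_pte must be refreshed as well; otherwise the write-protect path's pte_same() check would still compare against the stale swap entry. Condensed sketch of the relevant lines:

	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
	vmf->orig_pte = pte;		/* keep the snapshot in sync */

	if (vmf->flags & FAULT_FLAG_WRITE)
		ret |= do_wp_page(vmf);	/* revalidates *vmf->pte against vmf->orig_pte */
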
@@ -2733,9 +2753,9 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
2733 * but allow concurrent faults), and pte mapped but not yet locked. 2753 * but allow concurrent faults), and pte mapped but not yet locked.
2734 * We return with mmap_sem still held, but pte unmapped and unlocked. 2754 * We return with mmap_sem still held, but pte unmapped and unlocked.
2735 */ 2755 */
2736static int do_anonymous_page(struct fault_env *fe) 2756static int do_anonymous_page(struct vm_fault *vmf)
2737{ 2757{
2738 struct vm_area_struct *vma = fe->vma; 2758 struct vm_area_struct *vma = vmf->vma;
2739 struct mem_cgroup *memcg; 2759 struct mem_cgroup *memcg;
2740 struct page *page; 2760 struct page *page;
2741 pte_t entry; 2761 pte_t entry;
@@ -2745,7 +2765,7 @@ static int do_anonymous_page(struct fault_env *fe)
2745 return VM_FAULT_SIGBUS; 2765 return VM_FAULT_SIGBUS;
2746 2766
2747 /* Check if we need to add a guard page to the stack */ 2767 /* Check if we need to add a guard page to the stack */
2748 if (check_stack_guard_page(vma, fe->address) < 0) 2768 if (check_stack_guard_page(vma, vmf->address) < 0)
2749 return VM_FAULT_SIGSEGV; 2769 return VM_FAULT_SIGSEGV;
2750 2770
2751 /* 2771 /*
@@ -2758,26 +2778,26 @@ static int do_anonymous_page(struct fault_env *fe)
2758 * 2778 *
2759 * Here we only have down_read(mmap_sem). 2779 * Here we only have down_read(mmap_sem).
2760 */ 2780 */
2761 if (pte_alloc(vma->vm_mm, fe->pmd, fe->address)) 2781 if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))
2762 return VM_FAULT_OOM; 2782 return VM_FAULT_OOM;
2763 2783
2764 /* See the comment in pte_alloc_one_map() */ 2784 /* See the comment in pte_alloc_one_map() */
2765 if (unlikely(pmd_trans_unstable(fe->pmd))) 2785 if (unlikely(pmd_trans_unstable(vmf->pmd)))
2766 return 0; 2786 return 0;
2767 2787
2768 /* Use the zero-page for reads */ 2788 /* Use the zero-page for reads */
2769 if (!(fe->flags & FAULT_FLAG_WRITE) && 2789 if (!(vmf->flags & FAULT_FLAG_WRITE) &&
2770 !mm_forbids_zeropage(vma->vm_mm)) { 2790 !mm_forbids_zeropage(vma->vm_mm)) {
2771 entry = pte_mkspecial(pfn_pte(my_zero_pfn(fe->address), 2791 entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
2772 vma->vm_page_prot)); 2792 vma->vm_page_prot));
2773 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, 2793 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
2774 &fe->ptl); 2794 vmf->address, &vmf->ptl);
2775 if (!pte_none(*fe->pte)) 2795 if (!pte_none(*vmf->pte))
2776 goto unlock; 2796 goto unlock;
2777 /* Deliver the page fault to userland, check inside PT lock */ 2797 /* Deliver the page fault to userland, check inside PT lock */
2778 if (userfaultfd_missing(vma)) { 2798 if (userfaultfd_missing(vma)) {
2779 pte_unmap_unlock(fe->pte, fe->ptl); 2799 pte_unmap_unlock(vmf->pte, vmf->ptl);
2780 return handle_userfault(fe, VM_UFFD_MISSING); 2800 return handle_userfault(vmf, VM_UFFD_MISSING);
2781 } 2801 }
2782 goto setpte; 2802 goto setpte;
2783 } 2803 }
@@ -2785,7 +2805,7 @@ static int do_anonymous_page(struct fault_env *fe)
2785 /* Allocate our own private page. */ 2805 /* Allocate our own private page. */
2786 if (unlikely(anon_vma_prepare(vma))) 2806 if (unlikely(anon_vma_prepare(vma)))
2787 goto oom; 2807 goto oom;
2788 page = alloc_zeroed_user_highpage_movable(vma, fe->address); 2808 page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
2789 if (!page) 2809 if (!page)
2790 goto oom; 2810 goto oom;
2791 2811
@@ -2803,30 +2823,30 @@ static int do_anonymous_page(struct fault_env *fe)
2803 if (vma->vm_flags & VM_WRITE) 2823 if (vma->vm_flags & VM_WRITE)
2804 entry = pte_mkwrite(pte_mkdirty(entry)); 2824 entry = pte_mkwrite(pte_mkdirty(entry));
2805 2825
2806 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, 2826 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
2807 &fe->ptl); 2827 &vmf->ptl);
2808 if (!pte_none(*fe->pte)) 2828 if (!pte_none(*vmf->pte))
2809 goto release; 2829 goto release;
2810 2830
2811 /* Deliver the page fault to userland, check inside PT lock */ 2831 /* Deliver the page fault to userland, check inside PT lock */
2812 if (userfaultfd_missing(vma)) { 2832 if (userfaultfd_missing(vma)) {
2813 pte_unmap_unlock(fe->pte, fe->ptl); 2833 pte_unmap_unlock(vmf->pte, vmf->ptl);
2814 mem_cgroup_cancel_charge(page, memcg, false); 2834 mem_cgroup_cancel_charge(page, memcg, false);
2815 put_page(page); 2835 put_page(page);
2816 return handle_userfault(fe, VM_UFFD_MISSING); 2836 return handle_userfault(vmf, VM_UFFD_MISSING);
2817 } 2837 }
2818 2838
2819 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); 2839 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
2820 page_add_new_anon_rmap(page, vma, fe->address, false); 2840 page_add_new_anon_rmap(page, vma, vmf->address, false);
2821 mem_cgroup_commit_charge(page, memcg, false, false); 2841 mem_cgroup_commit_charge(page, memcg, false, false);
2822 lru_cache_add_active_or_unevictable(page, vma); 2842 lru_cache_add_active_or_unevictable(page, vma);
2823setpte: 2843setpte:
2824 set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); 2844 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
2825 2845
2826 /* No need to invalidate - it was non-present before */ 2846 /* No need to invalidate - it was non-present before */
2827 update_mmu_cache(vma, fe->address, fe->pte); 2847 update_mmu_cache(vma, vmf->address, vmf->pte);
2828unlock: 2848unlock:
2829 pte_unmap_unlock(fe->pte, fe->ptl); 2849 pte_unmap_unlock(vmf->pte, vmf->ptl);
2830 return 0; 2850 return 0;
2831release: 2851release:
2832 mem_cgroup_cancel_charge(page, memcg, false); 2852 mem_cgroup_cancel_charge(page, memcg, false);
@@ -2843,62 +2863,50 @@ oom:
2843 * released depending on flags and vma->vm_ops->fault() return value. 2863 * released depending on flags and vma->vm_ops->fault() return value.
2844 * See filemap_fault() and __lock_page_retry(). 2864 * See filemap_fault() and __lock_page_retry().
2845 */ 2865 */
2846static int __do_fault(struct fault_env *fe, pgoff_t pgoff, 2866static int __do_fault(struct vm_fault *vmf)
2847 struct page *cow_page, struct page **page, void **entry)
2848{ 2867{
2849 struct vm_area_struct *vma = fe->vma; 2868 struct vm_area_struct *vma = vmf->vma;
2850 struct vm_fault vmf;
2851 int ret; 2869 int ret;
2852 2870
2853 vmf.virtual_address = (void __user *)(fe->address & PAGE_MASK); 2871 ret = vma->vm_ops->fault(vma, vmf);
2854 vmf.pgoff = pgoff; 2872 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
2855 vmf.flags = fe->flags; 2873 VM_FAULT_DONE_COW)))
2856 vmf.page = NULL;
2857 vmf.gfp_mask = __get_fault_gfp_mask(vma);
2858 vmf.cow_page = cow_page;
2859
2860 ret = vma->vm_ops->fault(vma, &vmf);
2861 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
2862 return ret;
2863 if (ret & VM_FAULT_DAX_LOCKED) {
2864 *entry = vmf.entry;
2865 return ret; 2874 return ret;
2866 }
2867 2875
2868 if (unlikely(PageHWPoison(vmf.page))) { 2876 if (unlikely(PageHWPoison(vmf->page))) {
2869 if (ret & VM_FAULT_LOCKED) 2877 if (ret & VM_FAULT_LOCKED)
2870 unlock_page(vmf.page); 2878 unlock_page(vmf->page);
2871 put_page(vmf.page); 2879 put_page(vmf->page);
2880 vmf->page = NULL;
2872 return VM_FAULT_HWPOISON; 2881 return VM_FAULT_HWPOISON;
2873 } 2882 }
2874 2883
2875 if (unlikely(!(ret & VM_FAULT_LOCKED))) 2884 if (unlikely(!(ret & VM_FAULT_LOCKED)))
2876 lock_page(vmf.page); 2885 lock_page(vmf->page);
2877 else 2886 else
2878 VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); 2887 VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
2879 2888
2880 *page = vmf.page;
2881 return ret; 2889 return ret;
2882} 2890}
2883 2891
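
With __do_fault() now passing the caller's struct vm_fault straight to ->fault(), a minimal handler can be sketched as follows (illustrative only, not part of this patch; error handling and size checks omitted):

	static int example_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
	{
		struct address_space *mapping = vma->vm_file->f_mapping;
		struct page *page;

		/* vmf->pgoff and vmf->gfp_mask are pre-filled by the fault core */
		page = find_or_create_page(mapping, vmf->pgoff, vmf->gfp_mask);
		if (!page)
			return VM_FAULT_OOM;

		/* hand back a locked, referenced page; __do_fault() checks VM_FAULT_LOCKED */
		vmf->page = page;
		return VM_FAULT_LOCKED;
	}
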
2884static int pte_alloc_one_map(struct fault_env *fe) 2892static int pte_alloc_one_map(struct vm_fault *vmf)
2885{ 2893{
2886 struct vm_area_struct *vma = fe->vma; 2894 struct vm_area_struct *vma = vmf->vma;
2887 2895
2888 if (!pmd_none(*fe->pmd)) 2896 if (!pmd_none(*vmf->pmd))
2889 goto map_pte; 2897 goto map_pte;
2890 if (fe->prealloc_pte) { 2898 if (vmf->prealloc_pte) {
2891 fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); 2899 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
2892 if (unlikely(!pmd_none(*fe->pmd))) { 2900 if (unlikely(!pmd_none(*vmf->pmd))) {
2893 spin_unlock(fe->ptl); 2901 spin_unlock(vmf->ptl);
2894 goto map_pte; 2902 goto map_pte;
2895 } 2903 }
2896 2904
2897 atomic_long_inc(&vma->vm_mm->nr_ptes); 2905 atomic_long_inc(&vma->vm_mm->nr_ptes);
2898 pmd_populate(vma->vm_mm, fe->pmd, fe->prealloc_pte); 2906 pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
2899 spin_unlock(fe->ptl); 2907 spin_unlock(vmf->ptl);
2900 fe->prealloc_pte = 0; 2908 vmf->prealloc_pte = 0;
2901 } else if (unlikely(pte_alloc(vma->vm_mm, fe->pmd, fe->address))) { 2909 } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) {
2902 return VM_FAULT_OOM; 2910 return VM_FAULT_OOM;
2903 } 2911 }
2904map_pte: 2912map_pte:
@@ -2913,11 +2921,11 @@ map_pte:
2913 * through an atomic read in C, which is what pmd_trans_unstable() 2921 * through an atomic read in C, which is what pmd_trans_unstable()
2914 * provides. 2922 * provides.
2915 */ 2923 */
2916 if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd)) 2924 if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd))
2917 return VM_FAULT_NOPAGE; 2925 return VM_FAULT_NOPAGE;
2918 2926
2919 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, 2927 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
2920 &fe->ptl); 2928 &vmf->ptl);
2921 return 0; 2929 return 0;
2922} 2930}
2923 2931
@@ -2935,24 +2943,24 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
2935 return true; 2943 return true;
2936} 2944}
2937 2945
2938static void deposit_prealloc_pte(struct fault_env *fe) 2946static void deposit_prealloc_pte(struct vm_fault *vmf)
2939{ 2947{
2940 struct vm_area_struct *vma = fe->vma; 2948 struct vm_area_struct *vma = vmf->vma;
2941 2949
2942 pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, fe->prealloc_pte); 2950 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
2943 /* 2951 /*
2944 * We are going to consume the prealloc table, 2952 * We are going to consume the prealloc table,
2945 * count that as nr_ptes. 2953 * count that as nr_ptes.
2946 */ 2954 */
2947 atomic_long_inc(&vma->vm_mm->nr_ptes); 2955 atomic_long_inc(&vma->vm_mm->nr_ptes);
2948 fe->prealloc_pte = 0; 2956 vmf->prealloc_pte = 0;
2949} 2957}
2950 2958
2951static int do_set_pmd(struct fault_env *fe, struct page *page) 2959static int do_set_pmd(struct vm_fault *vmf, struct page *page)
2952{ 2960{
2953 struct vm_area_struct *vma = fe->vma; 2961 struct vm_area_struct *vma = vmf->vma;
2954 bool write = fe->flags & FAULT_FLAG_WRITE; 2962 bool write = vmf->flags & FAULT_FLAG_WRITE;
2955 unsigned long haddr = fe->address & HPAGE_PMD_MASK; 2963 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
2956 pmd_t entry; 2964 pmd_t entry;
2957 int i, ret; 2965 int i, ret;
2958 2966
@@ -2966,15 +2974,15 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
2966 * Archs like ppc64 need additional space to store information 2974
2967 * related to pte entry. Use the preallocated table for that. 2975 * related to pte entry. Use the preallocated table for that.
2968 */ 2976 */
2969 if (arch_needs_pgtable_deposit() && !fe->prealloc_pte) { 2977 if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
2970 fe->prealloc_pte = pte_alloc_one(vma->vm_mm, fe->address); 2978 vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address);
2971 if (!fe->prealloc_pte) 2979 if (!vmf->prealloc_pte)
2972 return VM_FAULT_OOM; 2980 return VM_FAULT_OOM;
2973 smp_wmb(); /* See comment in __pte_alloc() */ 2981 smp_wmb(); /* See comment in __pte_alloc() */
2974 } 2982 }
2975 2983
2976 fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); 2984 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
2977 if (unlikely(!pmd_none(*fe->pmd))) 2985 if (unlikely(!pmd_none(*vmf->pmd)))
2978 goto out; 2986 goto out;
2979 2987
2980 for (i = 0; i < HPAGE_PMD_NR; i++) 2988 for (i = 0; i < HPAGE_PMD_NR; i++)
@@ -2990,11 +2998,11 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
2990 * deposit and withdraw with pmd lock held 2998 * deposit and withdraw with pmd lock held
2991 */ 2999 */
2992 if (arch_needs_pgtable_deposit()) 3000 if (arch_needs_pgtable_deposit())
2993 deposit_prealloc_pte(fe); 3001 deposit_prealloc_pte(vmf);
2994 3002
2995 set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); 3003 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
2996 3004
2997 update_mmu_cache_pmd(vma, haddr, fe->pmd); 3005 update_mmu_cache_pmd(vma, haddr, vmf->pmd);
2998 3006
2999 /* fault is handled */ 3007 /* fault is handled */
3000 ret = 0; 3008 ret = 0;
@@ -3005,13 +3013,13 @@ out:
3005 * withdraw with pmd lock held. 3013 * withdraw with pmd lock held.
3006 */ 3014 */
3007 if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK) 3015 if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK)
3008 fe->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm, 3016 vmf->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm,
3009 fe->pmd); 3017 vmf->pmd);
3010 spin_unlock(fe->ptl); 3018 spin_unlock(vmf->ptl);
3011 return ret; 3019 return ret;
3012} 3020}
3013#else 3021#else
3014static int do_set_pmd(struct fault_env *fe, struct page *page) 3022static int do_set_pmd(struct vm_fault *vmf, struct page *page)
3015{ 3023{
3016 BUILD_BUG(); 3024 BUILD_BUG();
3017 return 0; 3025 return 0;
@@ -3022,41 +3030,42 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
3022 * alloc_set_pte - setup new PTE entry for given page and add reverse page 3030 * alloc_set_pte - setup new PTE entry for given page and add reverse page
3023 * mapping. If needed, the function allocates page table or use pre-allocated. 3031
3024 * 3032 *
3025 * @fe: fault environment 3033 * @vmf: fault environment
3026 * @memcg: memcg to charge page (only for private mappings) 3034 * @memcg: memcg to charge page (only for private mappings)
3027 * @page: page to map 3035 * @page: page to map
3028 * 3036 *
3029 * Caller must take care of unlocking fe->ptl, if fe->pte is non-NULL on return. 3037 * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
3038 * return.
3030 * 3039 *
3031 * Target users are page handler itself and implementations of 3040 * Target users are page handler itself and implementations of
3032 * vm_ops->map_pages. 3041 * vm_ops->map_pages.
3033 */ 3042 */
3034int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, 3043int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
3035 struct page *page) 3044 struct page *page)
3036{ 3045{
3037 struct vm_area_struct *vma = fe->vma; 3046 struct vm_area_struct *vma = vmf->vma;
3038 bool write = fe->flags & FAULT_FLAG_WRITE; 3047 bool write = vmf->flags & FAULT_FLAG_WRITE;
3039 pte_t entry; 3048 pte_t entry;
3040 int ret; 3049 int ret;
3041 3050
3042 if (pmd_none(*fe->pmd) && PageTransCompound(page) && 3051 if (pmd_none(*vmf->pmd) && PageTransCompound(page) &&
3043 IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { 3052 IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
3044 /* THP on COW? */ 3053 /* THP on COW? */
3045 VM_BUG_ON_PAGE(memcg, page); 3054 VM_BUG_ON_PAGE(memcg, page);
3046 3055
3047 ret = do_set_pmd(fe, page); 3056 ret = do_set_pmd(vmf, page);
3048 if (ret != VM_FAULT_FALLBACK) 3057 if (ret != VM_FAULT_FALLBACK)
3049 goto fault_handled; 3058 goto fault_handled;
3050 } 3059 }
3051 3060
3052 if (!fe->pte) { 3061 if (!vmf->pte) {
3053 ret = pte_alloc_one_map(fe); 3062 ret = pte_alloc_one_map(vmf);
3054 if (ret) 3063 if (ret)
3055 goto fault_handled; 3064 goto fault_handled;
3056 } 3065 }
3057 3066
3058 /* Re-check under ptl */ 3067 /* Re-check under ptl */
3059 if (unlikely(!pte_none(*fe->pte))) { 3068 if (unlikely(!pte_none(*vmf->pte))) {
3060 ret = VM_FAULT_NOPAGE; 3069 ret = VM_FAULT_NOPAGE;
3061 goto fault_handled; 3070 goto fault_handled;
3062 } 3071 }
@@ -3068,28 +3077,60 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
3068 /* copy-on-write page */ 3077 /* copy-on-write page */
3069 if (write && !(vma->vm_flags & VM_SHARED)) { 3078 if (write && !(vma->vm_flags & VM_SHARED)) {
3070 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); 3079 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3071 page_add_new_anon_rmap(page, vma, fe->address, false); 3080 page_add_new_anon_rmap(page, vma, vmf->address, false);
3072 mem_cgroup_commit_charge(page, memcg, false, false); 3081 mem_cgroup_commit_charge(page, memcg, false, false);
3073 lru_cache_add_active_or_unevictable(page, vma); 3082 lru_cache_add_active_or_unevictable(page, vma);
3074 } else { 3083 } else {
3075 inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); 3084 inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
3076 page_add_file_rmap(page, false); 3085 page_add_file_rmap(page, false);
3077 } 3086 }
3078 set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); 3087 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
3079 3088
3080 /* no need to invalidate: a not-present page won't be cached */ 3089 /* no need to invalidate: a not-present page won't be cached */
3081 update_mmu_cache(vma, fe->address, fe->pte); 3090 update_mmu_cache(vma, vmf->address, vmf->pte);
3082 ret = 0; 3091 ret = 0;
3083 3092
3084fault_handled: 3093fault_handled:
3085 /* preallocated pagetable is unused: free it */ 3094 /* preallocated pagetable is unused: free it */
3086 if (fe->prealloc_pte) { 3095 if (vmf->prealloc_pte) {
3087 pte_free(fe->vma->vm_mm, fe->prealloc_pte); 3096 pte_free(vmf->vma->vm_mm, vmf->prealloc_pte);
3088 fe->prealloc_pte = 0; 3097 vmf->prealloc_pte = 0;
3089 } 3098 }
3090 return ret; 3099 return ret;
3091} 3100}
3092 3101
3102
3103/**
3104 * finish_fault - finish page fault once we have prepared the page to fault
3105 *
3106 * @vmf: structure describing the fault
3107 *
3108 * This function handles all that is needed to finish a page fault once the
3109 * page to fault in is prepared. It handles locking of PTEs, inserts PTE for
3110 * given page, adds reverse page mapping, handles memcg charges and LRU
3111 * addition. The function returns 0 on success, VM_FAULT_ code in case of
3112 * error.
3113 *
3114 * The function expects the page to be locked and on success it consumes a
3115 * reference of a page being mapped (for the PTE which maps it).
3116 */
3117int finish_fault(struct vm_fault *vmf)
3118{
3119 struct page *page;
3120 int ret;
3121
3122 /* Did we COW the page? */
3123 if ((vmf->flags & FAULT_FLAG_WRITE) &&
3124 !(vmf->vma->vm_flags & VM_SHARED))
3125 page = vmf->cow_page;
3126 else
3127 page = vmf->page;
3128 ret = alloc_set_pte(vmf, vmf->memcg, page);
3129 if (vmf->pte)
3130 pte_unmap_unlock(vmf->pte, vmf->ptl);
3131 return ret;
3132}
3133
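
The kernel-doc above describes the contract; in practice the converted handlers below all follow the same shape. Condensed from do_read_fault() (declarations and the fault-around fast path trimmed):

	ret = __do_fault(vmf);			/* ->fault() fills vmf->page */
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		return ret;

	ret |= finish_fault(vmf);		/* installs the PTE, drops the PTE lock */
	unlock_page(vmf->page);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		put_page(vmf->page);		/* PTE not installed: drop the reference */
	return ret;
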
3093static unsigned long fault_around_bytes __read_mostly = 3134static unsigned long fault_around_bytes __read_mostly =
3094 rounddown_pow_of_two(65536); 3135 rounddown_pow_of_two(65536);
3095 3136
@@ -3154,17 +3195,18 @@ late_initcall(fault_around_debugfs);
3154 * fault_around_pages() value (and therefore to page order). This way it's 3195 * fault_around_pages() value (and therefore to page order). This way it's
3155 * easier to guarantee that we don't cross page table boundaries. 3196 * easier to guarantee that we don't cross page table boundaries.
3156 */ 3197 */
3157static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff) 3198static int do_fault_around(struct vm_fault *vmf)
3158{ 3199{
3159 unsigned long address = fe->address, nr_pages, mask; 3200 unsigned long address = vmf->address, nr_pages, mask;
3201 pgoff_t start_pgoff = vmf->pgoff;
3160 pgoff_t end_pgoff; 3202 pgoff_t end_pgoff;
3161 int off, ret = 0; 3203 int off, ret = 0;
3162 3204
3163 nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; 3205 nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
3164 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; 3206 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
3165 3207
3166 fe->address = max(address & mask, fe->vma->vm_start); 3208 vmf->address = max(address & mask, vmf->vma->vm_start);
3167 off = ((address - fe->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); 3209 off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
3168 start_pgoff -= off; 3210 start_pgoff -= off;
3169 3211
3170 /* 3212 /*
@@ -3172,45 +3214,45 @@ static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
3172 * or fault_around_pages() from start_pgoff, depending what is nearest. 3214 * or fault_around_pages() from start_pgoff, depending what is nearest.
3173 */ 3215 */
3174 end_pgoff = start_pgoff - 3216 end_pgoff = start_pgoff -
3175 ((fe->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + 3217 ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
3176 PTRS_PER_PTE - 1; 3218 PTRS_PER_PTE - 1;
3177 end_pgoff = min3(end_pgoff, vma_pages(fe->vma) + fe->vma->vm_pgoff - 1, 3219 end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
3178 start_pgoff + nr_pages - 1); 3220 start_pgoff + nr_pages - 1);
3179 3221
3180 if (pmd_none(*fe->pmd)) { 3222 if (pmd_none(*vmf->pmd)) {
3181 fe->prealloc_pte = pte_alloc_one(fe->vma->vm_mm, fe->address); 3223 vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm,
3182 if (!fe->prealloc_pte) 3224 vmf->address);
3225 if (!vmf->prealloc_pte)
3183 goto out; 3226 goto out;
3184 smp_wmb(); /* See comment in __pte_alloc() */ 3227 smp_wmb(); /* See comment in __pte_alloc() */
3185 } 3228 }
3186 3229
3187 fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff); 3230 vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
3188 3231
3189 /* Huge page is mapped? Page fault is solved */ 3232 /* Huge page is mapped? Page fault is solved */
3190 if (pmd_trans_huge(*fe->pmd)) { 3233 if (pmd_trans_huge(*vmf->pmd)) {
3191 ret = VM_FAULT_NOPAGE; 3234 ret = VM_FAULT_NOPAGE;
3192 goto out; 3235 goto out;
3193 } 3236 }
3194 3237
3195 /* ->map_pages() hasn't done anything useful. Cold page cache? */ 3238
3196 if (!fe->pte) 3239 if (!vmf->pte)
3197 goto out; 3240 goto out;
3198 3241
3199 /* check if the page fault is solved */ 3242 /* check if the page fault is solved */
3200 fe->pte -= (fe->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT); 3243 vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
3201 if (!pte_none(*fe->pte)) 3244 if (!pte_none(*vmf->pte))
3202 ret = VM_FAULT_NOPAGE; 3245 ret = VM_FAULT_NOPAGE;
3203 pte_unmap_unlock(fe->pte, fe->ptl); 3246 pte_unmap_unlock(vmf->pte, vmf->ptl);
3204out: 3247out:
3205 fe->address = address; 3248 vmf->address = address;
3206 fe->pte = NULL; 3249 vmf->pte = NULL;
3207 return ret; 3250 return ret;
3208} 3251}
3209 3252
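
A worked example of the windowing arithmetic above, assuming 4 KiB pages and the default fault_around_bytes of 65536:

	/*
	 * nr_pages = 65536 >> 12        = 16
	 * mask     = ~(16 * 4096 - 1)   = ~0xffff
	 * For a fault at 0x7f1234567890 inside a large VMA:
	 *   vmf->address = 0x7f1234567890 & ~0xffff = 0x7f1234560000
	 *   off          = (0x7890 >> 12) & 511     = 7
	 * so start_pgoff is pulled back by 7 pages and at most 16 PTEs are
	 * populated, never crossing a page-table (PTRS_PER_PTE) boundary.
	 */
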
3210static int do_read_fault(struct fault_env *fe, pgoff_t pgoff) 3253static int do_read_fault(struct vm_fault *vmf)
3211{ 3254{
3212 struct vm_area_struct *vma = fe->vma; 3255 struct vm_area_struct *vma = vmf->vma;
3213 struct page *fault_page;
3214 int ret = 0; 3256 int ret = 0;
3215 3257
3216 /* 3258 /*
@@ -3219,80 +3261,67 @@ static int do_read_fault(struct fault_env *fe, pgoff_t pgoff)
3219 * something). 3261 * something).
3220 */ 3262 */
3221 if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { 3263 if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
3222 ret = do_fault_around(fe, pgoff); 3264 ret = do_fault_around(vmf);
3223 if (ret) 3265 if (ret)
3224 return ret; 3266 return ret;
3225 } 3267 }
3226 3268
3227 ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); 3269 ret = __do_fault(vmf);
3228 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 3270 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3229 return ret; 3271 return ret;
3230 3272
3231 ret |= alloc_set_pte(fe, NULL, fault_page); 3273 ret |= finish_fault(vmf);
3232 if (fe->pte) 3274 unlock_page(vmf->page);
3233 pte_unmap_unlock(fe->pte, fe->ptl);
3234 unlock_page(fault_page);
3235 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 3275 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3236 put_page(fault_page); 3276 put_page(vmf->page);
3237 return ret; 3277 return ret;
3238} 3278}
3239 3279
3240static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff) 3280static int do_cow_fault(struct vm_fault *vmf)
3241{ 3281{
3242 struct vm_area_struct *vma = fe->vma; 3282 struct vm_area_struct *vma = vmf->vma;
3243 struct page *fault_page, *new_page;
3244 void *fault_entry;
3245 struct mem_cgroup *memcg;
3246 int ret; 3283 int ret;
3247 3284
3248 if (unlikely(anon_vma_prepare(vma))) 3285 if (unlikely(anon_vma_prepare(vma)))
3249 return VM_FAULT_OOM; 3286 return VM_FAULT_OOM;
3250 3287
3251 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, fe->address); 3288 vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
3252 if (!new_page) 3289 if (!vmf->cow_page)
3253 return VM_FAULT_OOM; 3290 return VM_FAULT_OOM;
3254 3291
3255 if (mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, 3292 if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
3256 &memcg, false)) { 3293 &vmf->memcg, false)) {
3257 put_page(new_page); 3294 put_page(vmf->cow_page);
3258 return VM_FAULT_OOM; 3295 return VM_FAULT_OOM;
3259 } 3296 }
3260 3297
3261 ret = __do_fault(fe, pgoff, new_page, &fault_page, &fault_entry); 3298 ret = __do_fault(vmf);
3262 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 3299 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3263 goto uncharge_out; 3300 goto uncharge_out;
3301 if (ret & VM_FAULT_DONE_COW)
3302 return ret;
3264 3303
3265 if (!(ret & VM_FAULT_DAX_LOCKED)) 3304 copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
3266 copy_user_highpage(new_page, fault_page, fe->address, vma); 3305 __SetPageUptodate(vmf->cow_page);
3267 __SetPageUptodate(new_page);
3268 3306
3269 ret |= alloc_set_pte(fe, memcg, new_page); 3307 ret |= finish_fault(vmf);
3270 if (fe->pte) 3308 unlock_page(vmf->page);
3271 pte_unmap_unlock(fe->pte, fe->ptl); 3309 put_page(vmf->page);
3272 if (!(ret & VM_FAULT_DAX_LOCKED)) {
3273 unlock_page(fault_page);
3274 put_page(fault_page);
3275 } else {
3276 dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff);
3277 }
3278 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 3310 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3279 goto uncharge_out; 3311 goto uncharge_out;
3280 return ret; 3312 return ret;
3281uncharge_out: 3313uncharge_out:
3282 mem_cgroup_cancel_charge(new_page, memcg, false); 3314 mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false);
3283 put_page(new_page); 3315 put_page(vmf->cow_page);
3284 return ret; 3316 return ret;
3285} 3317}
3286 3318
3287static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) 3319static int do_shared_fault(struct vm_fault *vmf)
3288{ 3320{
3289 struct vm_area_struct *vma = fe->vma; 3321 struct vm_area_struct *vma = vmf->vma;
3290 struct page *fault_page;
3291 struct address_space *mapping;
3292 int dirtied = 0;
3293 int ret, tmp; 3322 int ret, tmp;
3294 3323
3295 ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); 3324 ret = __do_fault(vmf);
3296 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 3325 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3297 return ret; 3326 return ret;
3298 3327
@@ -3301,46 +3330,24 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff)
3301 * about to become writable 3330 * about to become writable
3302 */ 3331 */
3303 if (vma->vm_ops->page_mkwrite) { 3332 if (vma->vm_ops->page_mkwrite) {
3304 unlock_page(fault_page); 3333 unlock_page(vmf->page);
3305 tmp = do_page_mkwrite(vma, fault_page, fe->address); 3334 tmp = do_page_mkwrite(vmf);
3306 if (unlikely(!tmp || 3335 if (unlikely(!tmp ||
3307 (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { 3336 (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
3308 put_page(fault_page); 3337 put_page(vmf->page);
3309 return tmp; 3338 return tmp;
3310 } 3339 }
3311 } 3340 }
3312 3341
3313 ret |= alloc_set_pte(fe, NULL, fault_page); 3342 ret |= finish_fault(vmf);
3314 if (fe->pte)
3315 pte_unmap_unlock(fe->pte, fe->ptl);
3316 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | 3343 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
3317 VM_FAULT_RETRY))) { 3344 VM_FAULT_RETRY))) {
3318 unlock_page(fault_page); 3345 unlock_page(vmf->page);
3319 put_page(fault_page); 3346 put_page(vmf->page);
3320 return ret; 3347 return ret;
3321 } 3348 }
3322 3349
3323 if (set_page_dirty(fault_page)) 3350 fault_dirty_shared_page(vma, vmf->page);
3324 dirtied = 1;
3325 /*
3326 * Take a local copy of the address_space - page.mapping may be zeroed
3327 * by truncate after unlock_page(). The address_space itself remains
3328 * pinned by vma->vm_file's reference. We rely on unlock_page()'s
3329 * release semantics to prevent the compiler from undoing this copying.
3330 */
3331 mapping = page_rmapping(fault_page);
3332 unlock_page(fault_page);
3333 if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
3334 /*
3335 * Some device drivers do not set page.mapping but still
3336 * dirty their pages
3337 */
3338 balance_dirty_pages_ratelimited(mapping);
3339 }
3340
3341 if (!vma->vm_ops->page_mkwrite)
3342 file_update_time(vma->vm_file);
3343
3344 return ret; 3351 return ret;
3345} 3352}
3346 3353
@@ -3350,19 +3357,18 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff)
3350 * The mmap_sem may have been released depending on flags and our 3357 * The mmap_sem may have been released depending on flags and our
3351 * return value. See filemap_fault() and __lock_page_or_retry(). 3358 * return value. See filemap_fault() and __lock_page_or_retry().
3352 */ 3359 */
3353static int do_fault(struct fault_env *fe) 3360static int do_fault(struct vm_fault *vmf)
3354{ 3361{
3355 struct vm_area_struct *vma = fe->vma; 3362 struct vm_area_struct *vma = vmf->vma;
3356 pgoff_t pgoff = linear_page_index(vma, fe->address);
3357 3363
3358 /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ 3364 /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
3359 if (!vma->vm_ops->fault) 3365 if (!vma->vm_ops->fault)
3360 return VM_FAULT_SIGBUS; 3366 return VM_FAULT_SIGBUS;
3361 if (!(fe->flags & FAULT_FLAG_WRITE)) 3367 if (!(vmf->flags & FAULT_FLAG_WRITE))
3362 return do_read_fault(fe, pgoff); 3368 return do_read_fault(vmf);
3363 if (!(vma->vm_flags & VM_SHARED)) 3369 if (!(vma->vm_flags & VM_SHARED))
3364 return do_cow_fault(fe, pgoff); 3370 return do_cow_fault(vmf);
3365 return do_shared_fault(fe, pgoff); 3371 return do_shared_fault(vmf);
3366} 3372}
3367 3373
3368static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, 3374static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
@@ -3380,14 +3386,15 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
3380 return mpol_misplaced(page, vma, addr); 3386 return mpol_misplaced(page, vma, addr);
3381} 3387}
3382 3388
3383static int do_numa_page(struct fault_env *fe, pte_t pte) 3389static int do_numa_page(struct vm_fault *vmf)
3384{ 3390{
3385 struct vm_area_struct *vma = fe->vma; 3391 struct vm_area_struct *vma = vmf->vma;
3386 struct page *page = NULL; 3392 struct page *page = NULL;
3387 int page_nid = -1; 3393 int page_nid = -1;
3388 int last_cpupid; 3394 int last_cpupid;
3389 int target_nid; 3395 int target_nid;
3390 bool migrated = false; 3396 bool migrated = false;
3397 pte_t pte = vmf->orig_pte;
3391 bool was_writable = pte_write(pte); 3398 bool was_writable = pte_write(pte);
3392 int flags = 0; 3399 int flags = 0;
3393 3400
@@ -3400,10 +3407,10 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
3400 * page table entry is not accessible, so there would be no 3407 * page table entry is not accessible, so there would be no
3401 * concurrent hardware modifications to the PTE. 3408 * concurrent hardware modifications to the PTE.
3402 */ 3409 */
3403 fe->ptl = pte_lockptr(vma->vm_mm, fe->pmd); 3410 vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
3404 spin_lock(fe->ptl); 3411 spin_lock(vmf->ptl);
3405 if (unlikely(!pte_same(*fe->pte, pte))) { 3412 if (unlikely(!pte_same(*vmf->pte, pte))) {
3406 pte_unmap_unlock(fe->pte, fe->ptl); 3413 pte_unmap_unlock(vmf->pte, vmf->ptl);
3407 goto out; 3414 goto out;
3408 } 3415 }
3409 3416
@@ -3412,18 +3419,18 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
3412 pte = pte_mkyoung(pte); 3419 pte = pte_mkyoung(pte);
3413 if (was_writable) 3420 if (was_writable)
3414 pte = pte_mkwrite(pte); 3421 pte = pte_mkwrite(pte);
3415 set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); 3422 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
3416 update_mmu_cache(vma, fe->address, fe->pte); 3423 update_mmu_cache(vma, vmf->address, vmf->pte);
3417 3424
3418 page = vm_normal_page(vma, fe->address, pte); 3425 page = vm_normal_page(vma, vmf->address, pte);
3419 if (!page) { 3426 if (!page) {
3420 pte_unmap_unlock(fe->pte, fe->ptl); 3427 pte_unmap_unlock(vmf->pte, vmf->ptl);
3421 return 0; 3428 return 0;
3422 } 3429 }
3423 3430
3424 /* TODO: handle PTE-mapped THP */ 3431 /* TODO: handle PTE-mapped THP */
3425 if (PageCompound(page)) { 3432 if (PageCompound(page)) {
3426 pte_unmap_unlock(fe->pte, fe->ptl); 3433 pte_unmap_unlock(vmf->pte, vmf->ptl);
3427 return 0; 3434 return 0;
3428 } 3435 }
3429 3436
@@ -3447,9 +3454,9 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
3447 3454
3448 last_cpupid = page_cpupid_last(page); 3455 last_cpupid = page_cpupid_last(page);
3449 page_nid = page_to_nid(page); 3456 page_nid = page_to_nid(page);
3450 target_nid = numa_migrate_prep(page, vma, fe->address, page_nid, 3457 target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
3451 &flags); 3458 &flags);
3452 pte_unmap_unlock(fe->pte, fe->ptl); 3459 pte_unmap_unlock(vmf->pte, vmf->ptl);
3453 if (target_nid == -1) { 3460 if (target_nid == -1) {
3454 put_page(page); 3461 put_page(page);
3455 goto out; 3462 goto out;
@@ -3469,28 +3476,28 @@ out:
3469 return 0; 3476 return 0;
3470} 3477}
3471 3478
3472static int create_huge_pmd(struct fault_env *fe) 3479static int create_huge_pmd(struct vm_fault *vmf)
3473{ 3480{
3474 struct vm_area_struct *vma = fe->vma; 3481 struct vm_area_struct *vma = vmf->vma;
3475 if (vma_is_anonymous(vma)) 3482 if (vma_is_anonymous(vma))
3476 return do_huge_pmd_anonymous_page(fe); 3483 return do_huge_pmd_anonymous_page(vmf);
3477 if (vma->vm_ops->pmd_fault) 3484 if (vma->vm_ops->pmd_fault)
3478 return vma->vm_ops->pmd_fault(vma, fe->address, fe->pmd, 3485 return vma->vm_ops->pmd_fault(vma, vmf->address, vmf->pmd,
3479 fe->flags); 3486 vmf->flags);
3480 return VM_FAULT_FALLBACK; 3487 return VM_FAULT_FALLBACK;
3481} 3488}
3482 3489
3483static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd) 3490static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
3484{ 3491{
3485 if (vma_is_anonymous(fe->vma)) 3492 if (vma_is_anonymous(vmf->vma))
3486 return do_huge_pmd_wp_page(fe, orig_pmd); 3493 return do_huge_pmd_wp_page(vmf, orig_pmd);
3487 if (fe->vma->vm_ops->pmd_fault) 3494 if (vmf->vma->vm_ops->pmd_fault)
3488 return fe->vma->vm_ops->pmd_fault(fe->vma, fe->address, fe->pmd, 3495 return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf->address,
3489 fe->flags); 3496 vmf->pmd, vmf->flags);
3490 3497
3491 /* COW handled on pte level: split pmd */ 3498 /* COW handled on pte level: split pmd */
3492 VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma); 3499 VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
3493 __split_huge_pmd(fe->vma, fe->pmd, fe->address, false, NULL); 3500 __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
3494 3501
3495 return VM_FAULT_FALLBACK; 3502 return VM_FAULT_FALLBACK;
3496} 3503}
@@ -3515,21 +3522,21 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
3515 * The mmap_sem may have been released depending on flags and our return value. 3522 * The mmap_sem may have been released depending on flags and our return value.
3516 * See filemap_fault() and __lock_page_or_retry(). 3523 * See filemap_fault() and __lock_page_or_retry().
3517 */ 3524 */
3518static int handle_pte_fault(struct fault_env *fe) 3525static int handle_pte_fault(struct vm_fault *vmf)
3519{ 3526{
3520 pte_t entry; 3527 pte_t entry;
3521 3528
3522 if (unlikely(pmd_none(*fe->pmd))) { 3529 if (unlikely(pmd_none(*vmf->pmd))) {
3523 /* 3530 /*
3524 * Leave __pte_alloc() until later: because vm_ops->fault may 3531 * Leave __pte_alloc() until later: because vm_ops->fault may
3525 * want to allocate huge page, and if we expose page table 3532 * want to allocate huge page, and if we expose page table
3526 * for an instant, it will be difficult to retract from 3533 * for an instant, it will be difficult to retract from
3527 * concurrent faults and from rmap lookups. 3534 * concurrent faults and from rmap lookups.
3528 */ 3535 */
3529 fe->pte = NULL; 3536 vmf->pte = NULL;
3530 } else { 3537 } else {
3531 /* See comment in pte_alloc_one_map() */ 3538 /* See comment in pte_alloc_one_map() */
3532 if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd)) 3539 if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd))
3533 return 0; 3540 return 0;
3534 /* 3541 /*
3535 * A regular pmd is established and it can't morph into a huge 3542 * A regular pmd is established and it can't morph into a huge
@@ -3537,9 +3544,8 @@ static int handle_pte_fault(struct fault_env *fe)
3537 * mmap_sem read mode and khugepaged takes it in write mode. 3544 * mmap_sem read mode and khugepaged takes it in write mode.
3538 * So now it's safe to run pte_offset_map(). 3545 * So now it's safe to run pte_offset_map().
3539 */ 3546 */
3540 fe->pte = pte_offset_map(fe->pmd, fe->address); 3547 vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
3541 3548 vmf->orig_pte = *vmf->pte;
3542 entry = *fe->pte;
3543 3549
3544 /* 3550 /*
3545 * some architectures can have larger ptes than wordsize, 3551 * some architectures can have larger ptes than wordsize,
@@ -3550,38 +3556,39 @@ static int handle_pte_fault(struct fault_env *fe)
3550 * ptl lock held. So here a barrier will do. 3556 * ptl lock held. So here a barrier will do.
3551 */ 3557 */
3552 barrier(); 3558 barrier();
3553 if (pte_none(entry)) { 3559 if (pte_none(vmf->orig_pte)) {
3554 pte_unmap(fe->pte); 3560 pte_unmap(vmf->pte);
3555 fe->pte = NULL; 3561 vmf->pte = NULL;
3556 } 3562 }
3557 } 3563 }
3558 3564
3559 if (!fe->pte) { 3565 if (!vmf->pte) {
3560 if (vma_is_anonymous(fe->vma)) 3566 if (vma_is_anonymous(vmf->vma))
3561 return do_anonymous_page(fe); 3567 return do_anonymous_page(vmf);
3562 else 3568 else
3563 return do_fault(fe); 3569 return do_fault(vmf);
3564 } 3570 }
3565 3571
3566 if (!pte_present(entry)) 3572 if (!pte_present(vmf->orig_pte))
3567 return do_swap_page(fe, entry); 3573 return do_swap_page(vmf);
3568 3574
3569 if (pte_protnone(entry) && vma_is_accessible(fe->vma)) 3575 if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
3570 return do_numa_page(fe, entry); 3576 return do_numa_page(vmf);
3571 3577
3572 fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd); 3578 vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
3573 spin_lock(fe->ptl); 3579 spin_lock(vmf->ptl);
3574 if (unlikely(!pte_same(*fe->pte, entry))) 3580 entry = vmf->orig_pte;
3581 if (unlikely(!pte_same(*vmf->pte, entry)))
3575 goto unlock; 3582 goto unlock;
3576 if (fe->flags & FAULT_FLAG_WRITE) { 3583 if (vmf->flags & FAULT_FLAG_WRITE) {
3577 if (!pte_write(entry)) 3584 if (!pte_write(entry))
3578 return do_wp_page(fe, entry); 3585 return do_wp_page(vmf);
3579 entry = pte_mkdirty(entry); 3586 entry = pte_mkdirty(entry);
3580 } 3587 }
3581 entry = pte_mkyoung(entry); 3588 entry = pte_mkyoung(entry);
3582 if (ptep_set_access_flags(fe->vma, fe->address, fe->pte, entry, 3589 if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
3583 fe->flags & FAULT_FLAG_WRITE)) { 3590 vmf->flags & FAULT_FLAG_WRITE)) {
3584 update_mmu_cache(fe->vma, fe->address, fe->pte); 3591 update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
3585 } else { 3592 } else {
3586 /* 3593 /*
3587 * This is needed only for protection faults but the arch code 3594 * This is needed only for protection faults but the arch code
@@ -3589,11 +3596,11 @@ static int handle_pte_fault(struct fault_env *fe)
3589 * This still avoids useless tlb flushes for .text page faults 3596 * This still avoids useless tlb flushes for .text page faults
3590 * with threads. 3597 * with threads.
3591 */ 3598 */
3592 if (fe->flags & FAULT_FLAG_WRITE) 3599 if (vmf->flags & FAULT_FLAG_WRITE)
3593 flush_tlb_fix_spurious_fault(fe->vma, fe->address); 3600 flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
3594 } 3601 }
3595unlock: 3602unlock:
3596 pte_unmap_unlock(fe->pte, fe->ptl); 3603 pte_unmap_unlock(vmf->pte, vmf->ptl);
3597 return 0; 3604 return 0;
3598} 3605}
3599 3606
@@ -3606,10 +3613,12 @@ unlock:
3606static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, 3613static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
3607 unsigned int flags) 3614 unsigned int flags)
3608{ 3615{
3609 struct fault_env fe = { 3616 struct vm_fault vmf = {
3610 .vma = vma, 3617 .vma = vma,
3611 .address = address, 3618 .address = address & PAGE_MASK,
3612 .flags = flags, 3619 .flags = flags,
3620 .pgoff = linear_page_index(vma, address),
3621 .gfp_mask = __get_fault_gfp_mask(vma),
3613 }; 3622 };
3614 struct mm_struct *mm = vma->vm_mm; 3623 struct mm_struct *mm = vma->vm_mm;
3615 pgd_t *pgd; 3624 pgd_t *pgd;
@@ -3619,35 +3628,35 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
3619 pud = pud_alloc(mm, pgd, address); 3628 pud = pud_alloc(mm, pgd, address);
3620 if (!pud) 3629 if (!pud)
3621 return VM_FAULT_OOM; 3630 return VM_FAULT_OOM;
3622 fe.pmd = pmd_alloc(mm, pud, address); 3631 vmf.pmd = pmd_alloc(mm, pud, address);
3623 if (!fe.pmd) 3632 if (!vmf.pmd)
3624 return VM_FAULT_OOM; 3633 return VM_FAULT_OOM;
3625 if (pmd_none(*fe.pmd) && transparent_hugepage_enabled(vma)) { 3634 if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
3626 int ret = create_huge_pmd(&fe); 3635 int ret = create_huge_pmd(&vmf);
3627 if (!(ret & VM_FAULT_FALLBACK)) 3636 if (!(ret & VM_FAULT_FALLBACK))
3628 return ret; 3637 return ret;
3629 } else { 3638 } else {
3630 pmd_t orig_pmd = *fe.pmd; 3639 pmd_t orig_pmd = *vmf.pmd;
3631 int ret; 3640 int ret;
3632 3641
3633 barrier(); 3642 barrier();
3634 if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { 3643 if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
3635 if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) 3644 if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
3636 return do_huge_pmd_numa_page(&fe, orig_pmd); 3645 return do_huge_pmd_numa_page(&vmf, orig_pmd);
3637 3646
3638 if ((fe.flags & FAULT_FLAG_WRITE) && 3647 if ((vmf.flags & FAULT_FLAG_WRITE) &&
3639 !pmd_write(orig_pmd)) { 3648 !pmd_write(orig_pmd)) {
3640 ret = wp_huge_pmd(&fe, orig_pmd); 3649 ret = wp_huge_pmd(&vmf, orig_pmd);
3641 if (!(ret & VM_FAULT_FALLBACK)) 3650 if (!(ret & VM_FAULT_FALLBACK))
3642 return ret; 3651 return ret;
3643 } else { 3652 } else {
3644 huge_pmd_set_accessed(&fe, orig_pmd); 3653 huge_pmd_set_accessed(&vmf, orig_pmd);
3645 return 0; 3654 return 0;
3646 } 3655 }
3647 } 3656 }
3648 } 3657 }
3649 3658
3650 return handle_pte_fault(&fe); 3659 return handle_pte_fault(&vmf);
3651} 3660}
3652 3661
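
A practical consequence of the initializer above, in sketch form (illustrative, not from the patch): ->fault() and ->map_pages() implementations can rely on the fault core having pre-computed these fields instead of deriving them locally:

	pgoff_t index    = vmf->pgoff;		/* == linear_page_index(vma, address) */
	gfp_t gfp        = vmf->gfp_mask;	/* == __get_fault_gfp_mask(vma)       */
	unsigned long va = vmf->address;	/* already masked with PAGE_MASK      */
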
3653/* 3662/*
@@ -3808,8 +3817,8 @@ out:
3808 return -EINVAL; 3817 return -EINVAL;
3809} 3818}
3810 3819
3811static inline int follow_pte(struct mm_struct *mm, unsigned long address, 3820int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp,
3812 pte_t **ptepp, spinlock_t **ptlp) 3821 spinlock_t **ptlp)
3813{ 3822{
3814 int res; 3823 int res;
3815 3824
@@ -3919,7 +3928,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
3919 struct page *page = NULL; 3928 struct page *page = NULL;
3920 3929
3921 ret = get_user_pages_remote(tsk, mm, addr, 1, 3930 ret = get_user_pages_remote(tsk, mm, addr, 1,
3922 gup_flags, &page, &vma); 3931 gup_flags, &page, &vma, NULL);
3923 if (ret <= 0) { 3932 if (ret <= 0) {
3924#ifndef CONFIG_HAVE_IOREMAP_PROT 3933#ifndef CONFIG_HAVE_IOREMAP_PROT
3925 break; 3934 break;
diff --git a/mm/nommu.c b/mm/nommu.c
index 27bc543128e5..210d7ec2843c 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -176,9 +176,10 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
176} 176}
177EXPORT_SYMBOL(get_user_pages_locked); 177EXPORT_SYMBOL(get_user_pages_locked);
178 178
179long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, 179static long __get_user_pages_unlocked(struct task_struct *tsk,
180 unsigned long start, unsigned long nr_pages, 180 struct mm_struct *mm, unsigned long start,
181 struct page **pages, unsigned int gup_flags) 181 unsigned long nr_pages, struct page **pages,
182 unsigned int gup_flags)
182{ 183{
183 long ret; 184 long ret;
184 down_read(&mm->mmap_sem); 185 down_read(&mm->mmap_sem);
@@ -187,7 +188,6 @@ long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
187 up_read(&mm->mmap_sem); 188 up_read(&mm->mmap_sem);
188 return ret; 189 return ret;
189} 190}
190EXPORT_SYMBOL(__get_user_pages_unlocked);
191 191
192long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, 192long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
193 struct page **pages, unsigned int gup_flags) 193 struct page **pages, unsigned int gup_flags)
@@ -1801,7 +1801,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1801} 1801}
1802EXPORT_SYMBOL(filemap_fault); 1802EXPORT_SYMBOL(filemap_fault);
1803 1803
1804void filemap_map_pages(struct fault_env *fe, 1804void filemap_map_pages(struct vm_fault *vmf,
1805 pgoff_t start_pgoff, pgoff_t end_pgoff) 1805 pgoff_t start_pgoff, pgoff_t end_pgoff)
1806{ 1806{
1807 BUG(); 1807 BUG();
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 52e2f8e3b472..290e8b7d3181 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2106,18 +2106,26 @@ void tag_pages_for_writeback(struct address_space *mapping,
2106 pgoff_t start, pgoff_t end) 2106 pgoff_t start, pgoff_t end)
2107{ 2107{
2108#define WRITEBACK_TAG_BATCH 4096 2108#define WRITEBACK_TAG_BATCH 4096
2109 unsigned long tagged; 2109 unsigned long tagged = 0;
2110 2110 struct radix_tree_iter iter;
2111 do { 2111 void **slot;
2112 spin_lock_irq(&mapping->tree_lock); 2112
2113 tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree, 2113 spin_lock_irq(&mapping->tree_lock);
2114 &start, end, WRITEBACK_TAG_BATCH, 2114 radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start,
2115 PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE); 2115 PAGECACHE_TAG_DIRTY) {
2116 if (iter.index > end)
2117 break;
2118 radix_tree_iter_tag_set(&mapping->page_tree, &iter,
2119 PAGECACHE_TAG_TOWRITE);
2120 tagged++;
2121 if ((tagged % WRITEBACK_TAG_BATCH) != 0)
2122 continue;
2123 slot = radix_tree_iter_resume(slot, &iter);
2116 spin_unlock_irq(&mapping->tree_lock); 2124 spin_unlock_irq(&mapping->tree_lock);
2117 WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH);
2118 cond_resched(); 2125 cond_resched();
2119 /* We check 'start' to handle wrapping when end == ~0UL */ 2126 spin_lock_irq(&mapping->tree_lock);
2120 } while (tagged >= WRITEBACK_TAG_BATCH && start); 2127 }
2128 spin_unlock_irq(&mapping->tree_lock);
2121} 2129}
2122EXPORT_SYMBOL(tag_pages_for_writeback); 2130EXPORT_SYMBOL(tag_pages_for_writeback);
2123 2131
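
The rewrite above replaces the deleted radix_tree_range_tag_if_tagged() with an open-coded tagged walk that yields tree_lock every WRITEBACK_TAG_BATCH (4096) tagged entries. What makes that safe is radix_tree_iter_resume() (used again in the shmem.c hunks below): called before the lock is released, it makes the next loop iteration perform a fresh lookup from the following index instead of reusing chunk pointers that may have been invalidated. The idiom in isolation:

	/* inside a radix_tree_for_each_* loop, before releasing the lock */
	slot = radix_tree_iter_resume(slot, &iter);
	spin_unlock_irq(&mapping->tree_lock);
	cond_resched();
	spin_lock_irq(&mapping->tree_lock);
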
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f64e7bcb43b7..2c6d5f64feca 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3925,6 +3925,20 @@ static struct page *__page_frag_refill(struct page_frag_cache *nc,
3925 return page; 3925 return page;
3926} 3926}
3927 3927
3928void __page_frag_drain(struct page *page, unsigned int order,
3929 unsigned int count)
3930{
3931 VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
3932
3933 if (page_ref_sub_and_test(page, count)) {
3934 if (order == 0)
3935 free_hot_cold_page(page, false);
3936 else
3937 __free_pages_ok(page, order);
3938 }
3939}
3940EXPORT_SYMBOL(__page_frag_drain);
3941
3928void *__alloc_page_frag(struct page_frag_cache *nc, 3942void *__alloc_page_frag(struct page_frag_cache *nc,
3929 unsigned int fragsz, gfp_t gfp_mask) 3943 unsigned int fragsz, gfp_t gfp_mask)
3930{ 3944{
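
A hedged usage sketch for the new export (caller-side names are hypothetical): a driver that took several extra references on a page-frag page can return them in a single call, and the page is freed only if the count drops to zero:

	__page_frag_drain(frag_page, frag_order, outstanding_refs);
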
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index be8dc8d1edb9..84d0c7eada2b 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -88,7 +88,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
88 ssize_t rc = 0; 88 ssize_t rc = 0;
89 unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES 89 unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES
90 / sizeof(struct pages *); 90 / sizeof(struct pages *);
91 unsigned int flags = FOLL_REMOTE; 91 unsigned int flags = 0;
92 92
93 /* Work out address and page range required */ 93 /* Work out address and page range required */
94 if (len == 0) 94 if (len == 0)
@@ -100,15 +100,19 @@ static int process_vm_rw_single_vec(unsigned long addr,
100 100
101 while (!rc && nr_pages && iov_iter_count(iter)) { 101 while (!rc && nr_pages && iov_iter_count(iter)) {
102 int pages = min(nr_pages, max_pages_per_loop); 102 int pages = min(nr_pages, max_pages_per_loop);
103 int locked = 1;
103 size_t bytes; 104 size_t bytes;
104 105
105 /* 106 /*
106 * Get the pages we're interested in. We must 107 * Get the pages we're interested in. We must
107 * add FOLL_REMOTE because task/mm might not 108 * access remotely because task/mm might not
108 * current/current->mm 109 * current/current->mm
109 */ 110 */
110 pages = __get_user_pages_unlocked(task, mm, pa, pages, 111 down_read(&mm->mmap_sem);
111 process_pages, flags); 112 pages = get_user_pages_remote(task, mm, pa, pages, flags,
113 process_pages, NULL, &locked);
114 if (locked)
115 up_read(&mm->mmap_sem);
112 if (pages <= 0) 116 if (pages <= 0)
113 return -EFAULT; 117 return -EFAULT;
114 118
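
The hunk above shows the new calling convention for get_user_pages_remote(): the extra 'locked' out-parameter reports whether mmap_sem is still held on return. The pattern, compressed (placeholder variable names):

	int locked = 1;
	long pinned;

	down_read(&mm->mmap_sem);
	pinned = get_user_pages_remote(task, mm, addr, nr_pages, gup_flags,
				       pages, NULL, &locked);
	if (locked)
		up_read(&mm->mmap_sem);	/* gup clears 'locked' if it already dropped it */
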
diff --git a/mm/shmem.c b/mm/shmem.c
index abd7403aba41..54287d443806 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -661,8 +661,8 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
661 swapped++; 661 swapped++;
662 662
663 if (need_resched()) { 663 if (need_resched()) {
664 slot = radix_tree_iter_resume(slot, &iter);
664 cond_resched_rcu(); 665 cond_resched_rcu();
665 slot = radix_tree_iter_next(&iter);
666 } 666 }
667 } 667 }
668 668
@@ -1049,6 +1049,30 @@ static void shmem_evict_inode(struct inode *inode)
1049 clear_inode(inode); 1049 clear_inode(inode);
1050} 1050}
1051 1051
1052static unsigned long find_swap_entry(struct radix_tree_root *root, void *item)
1053{
1054 struct radix_tree_iter iter;
1055 void **slot;
1056 unsigned long found = -1;
1057 unsigned int checked = 0;
1058
1059 rcu_read_lock();
1060 radix_tree_for_each_slot(slot, root, &iter, 0) {
1061 if (*slot == item) {
1062 found = iter.index;
1063 break;
1064 }
1065 checked++;
1066 if ((checked % 4096) != 0)
1067 continue;
1068 slot = radix_tree_iter_resume(slot, &iter);
1069 cond_resched_rcu();
1070 }
1071
1072 rcu_read_unlock();
1073 return found;
1074}
1075
1052/* 1076/*
1053 * If swap found in inode, free it and move page from swapcache to filecache. 1077 * If swap found in inode, free it and move page from swapcache to filecache.
1054 */ 1078 */
@@ -1062,7 +1086,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
1062 int error = 0; 1086 int error = 0;
1063 1087
1064 radswap = swp_to_radix_entry(swap); 1088 radswap = swp_to_radix_entry(swap);
1065 index = radix_tree_locate_item(&mapping->page_tree, radswap); 1089 index = find_swap_entry(&mapping->page_tree, radswap);
1066 if (index == -1) 1090 if (index == -1)
1067 return -EAGAIN; /* tell shmem_unuse we found nothing */ 1091 return -EAGAIN; /* tell shmem_unuse we found nothing */
1068 1092
@@ -2447,8 +2471,8 @@ static void shmem_tag_pins(struct address_space *mapping)
2447 } 2471 }
2448 2472
2449 if (need_resched()) { 2473 if (need_resched()) {
2474 slot = radix_tree_iter_resume(slot, &iter);
2450 cond_resched_rcu(); 2475 cond_resched_rcu();
2451 slot = radix_tree_iter_next(&iter);
2452 } 2476 }
2453 } 2477 }
2454 rcu_read_unlock(); 2478 rcu_read_unlock();
@@ -2517,8 +2541,8 @@ static int shmem_wait_for_pins(struct address_space *mapping)
2517 spin_unlock_irq(&mapping->tree_lock); 2541 spin_unlock_irq(&mapping->tree_lock);
2518continue_resched: 2542continue_resched:
2519 if (need_resched()) { 2543 if (need_resched()) {
2544 slot = radix_tree_iter_resume(slot, &iter);
2520 cond_resched_rcu(); 2545 cond_resched_rcu();
2521 slot = radix_tree_iter_next(&iter);
2522 } 2546 }
2523 } 2547 }
2524 rcu_read_unlock(); 2548 rcu_read_unlock();
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 2d59c9be40e1..5f63f6dcaabb 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -762,16 +762,17 @@ static const struct net_proto_family rxrpc_family_ops = {
762static int __init af_rxrpc_init(void) 762static int __init af_rxrpc_init(void)
763{ 763{
764 int ret = -1; 764 int ret = -1;
765 unsigned int tmp;
765 766
766 BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > FIELD_SIZEOF(struct sk_buff, cb)); 767 BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > FIELD_SIZEOF(struct sk_buff, cb));
767 768
768 get_random_bytes(&rxrpc_epoch, sizeof(rxrpc_epoch)); 769 get_random_bytes(&rxrpc_epoch, sizeof(rxrpc_epoch));
769 rxrpc_epoch |= RXRPC_RANDOM_EPOCH; 770 rxrpc_epoch |= RXRPC_RANDOM_EPOCH;
770 get_random_bytes(&rxrpc_client_conn_ids.cur, 771 get_random_bytes(&tmp, sizeof(tmp));
771 sizeof(rxrpc_client_conn_ids.cur)); 772 tmp &= 0x3fffffff;
772 rxrpc_client_conn_ids.cur &= 0x3fffffff; 773 if (tmp == 0)
773 if (rxrpc_client_conn_ids.cur == 0) 774 tmp = 1;
774 rxrpc_client_conn_ids.cur = 1; 775 idr_set_cursor(&rxrpc_client_conn_ids, tmp);
775 776
776 ret = -ENOMEM; 777 ret = -ENOMEM;
777 rxrpc_call_jar = kmem_cache_create( 778 rxrpc_call_jar = kmem_cache_create(
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index 60ef9605167e..6cbcdcc29853 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -263,12 +263,12 @@ static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn)
263 * times the maximum number of client conns away from the current 263 * times the maximum number of client conns away from the current
264 * allocation point to try and keep the IDs concentrated. 264 * allocation point to try and keep the IDs concentrated.
265 */ 265 */
266 id_cursor = READ_ONCE(rxrpc_client_conn_ids.cur); 266 id_cursor = idr_get_cursor(&rxrpc_client_conn_ids);
267 id = conn->proto.cid >> RXRPC_CIDSHIFT; 267 id = conn->proto.cid >> RXRPC_CIDSHIFT;
268 distance = id - id_cursor; 268 distance = id - id_cursor;
269 if (distance < 0) 269 if (distance < 0)
270 distance = -distance; 270 distance = -distance;
271 limit = round_up(rxrpc_max_client_connections, IDR_SIZE) * 4; 271 limit = max(rxrpc_max_client_connections * 4, 1024U);
272 if (distance > limit) 272 if (distance > limit)
273 goto mark_dont_reuse; 273 goto mark_dont_reuse;
274 274
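
Between them, the two rxrpc hunks stop poking at rxrpc_client_conn_ids.cur directly and go through the new accessor pair instead. In sketch form (the idr name and values are illustrative):

	/* seed the allocation cursor once at init time ... */
	idr_set_cursor(&my_idr, initial_id);

	/* ... and read it back later without knowing the IDR's layout */
	unsigned int cur = idr_get_cursor(&my_idr);
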
diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c
index 682b73af7766..838ffa78cfda 100644
--- a/security/tomoyo/domain.c
+++ b/security/tomoyo/domain.c
@@ -881,7 +881,7 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos,
881 * the execve(). 881 * the execve().
882 */ 882 */
883 if (get_user_pages_remote(current, bprm->mm, pos, 1, 883 if (get_user_pages_remote(current, bprm->mm, pos, 1,
884 FOLL_FORCE, &page, NULL) <= 0) 884 FOLL_FORCE, &page, NULL, NULL) <= 0)
885 return false; 885 return false;
886#else 886#else
887 page = bprm->page[pos / PAGE_SIZE]; 887 page = bprm->page[pos / PAGE_SIZE];
diff --git a/tools/include/asm/bug.h b/tools/include/asm/bug.h
index 9e5f4846967f..beda1a884b50 100644
--- a/tools/include/asm/bug.h
+++ b/tools/include/asm/bug.h
@@ -12,6 +12,17 @@
12 unlikely(__ret_warn_on); \ 12 unlikely(__ret_warn_on); \
13}) 13})
14 14
15#define WARN_ON_ONCE(condition) ({ \
16 static int __warned; \
17 int __ret_warn_once = !!(condition); \
18 \
19 if (unlikely(__ret_warn_once && !__warned)) { \
20 __warned = true; \
21 WARN_ON(1); \
22 } \
23 unlikely(__ret_warn_once); \
24})
25
15#define WARN_ONCE(condition, format...) ({ \ 26#define WARN_ONCE(condition, format...) ({ \
16 static int __warned; \ 27 static int __warned; \
17 int __ret_warn_once = !!(condition); \ 28 int __ret_warn_once = !!(condition); \
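The WARN_ON_ONCE() added to the tools copy of asm/bug.h warns only the first time its condition is true, but still evaluates to the condition on every call, so callers can keep using it in error paths. A small usage sketch, assuming the usual -Itools/include build setup; push_item() is a hypothetical caller, not part of the patch:

#include <stdio.h>
#include <asm/bug.h>

static int push_item(int *stack, unsigned int *top, unsigned int max, int v)
{
	/* Warns on the first overflow only, but fails every overflow. */
	if (WARN_ON_ONCE(*top >= max))
		return -1;
	stack[(*top)++] = v;
	return 0;
}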
diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h
index 43c1c5021e4b..eef41d500e9e 100644
--- a/tools/include/linux/bitmap.h
+++ b/tools/include/linux/bitmap.h
@@ -35,6 +35,32 @@ static inline void bitmap_zero(unsigned long *dst, int nbits)
35 } 35 }
36} 36}
37 37
38static inline void bitmap_fill(unsigned long *dst, unsigned int nbits)
39{
40 unsigned int nlongs = BITS_TO_LONGS(nbits);
41 if (!small_const_nbits(nbits)) {
42 unsigned int len = (nlongs - 1) * sizeof(unsigned long);
43 memset(dst, 0xff, len);
44 }
45 dst[nlongs - 1] = BITMAP_LAST_WORD_MASK(nbits);
46}
47
48static inline int bitmap_empty(const unsigned long *src, unsigned nbits)
49{
50 if (small_const_nbits(nbits))
51 return ! (*src & BITMAP_LAST_WORD_MASK(nbits));
52
53 return find_first_bit(src, nbits) == nbits;
54}
55
56static inline int bitmap_full(const unsigned long *src, unsigned int nbits)
57{
58 if (small_const_nbits(nbits))
59 return ! (~(*src) & BITMAP_LAST_WORD_MASK(nbits));
60
61 return find_first_zero_bit(src, nbits) == nbits;
62}
63
38static inline int bitmap_weight(const unsigned long *src, int nbits) 64static inline int bitmap_weight(const unsigned long *src, int nbits)
39{ 65{
40 if (small_const_nbits(nbits)) 66 if (small_const_nbits(nbits))
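bitmap_fill(), bitmap_empty() and bitmap_full() mirror the kernel versions: fill sets exactly nbits bits, and the emptiness/fullness checks fall back to find_first_bit()/find_first_zero_bit() for bitmaps wider than one word. A quick usage sketch, assuming the find_*_bit helpers from tools/lib are linked in as the header expects; bitmap_helpers_demo() is illustrative only:

#include <assert.h>
#include <linux/bitmap.h>

static void bitmap_helpers_demo(void)
{
	/* 70 bits: two longs on a 64-bit host */
	unsigned long map[BITS_TO_LONGS(70)] = { 0 };

	assert(bitmap_empty(map, 70));
	bitmap_fill(map, 70);		/* sets bits 0..69, leaves the rest clear */
	assert(bitmap_full(map, 70));
	assert(!bitmap_empty(map, 70));
}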
diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl
index d08e214ec6e7..be93ab02b490 100755
--- a/tools/testing/ktest/ktest.pl
+++ b/tools/testing/ktest/ktest.pl
@@ -719,14 +719,14 @@ sub set_value {
719 719
720 if ($buildonly && $lvalue =~ /^TEST_TYPE(\[.*\])?$/ && $prvalue ne "build") { 720 if ($buildonly && $lvalue =~ /^TEST_TYPE(\[.*\])?$/ && $prvalue ne "build") {
721 # Note if a test is something other than build, then we 721 # Note if a test is something other than build, then we
722 # will need other manditory options. 722 # will need other mandatory options.
723 if ($prvalue ne "install") { 723 if ($prvalue ne "install") {
724 # for bisect, we need to check BISECT_TYPE 724 # for bisect, we need to check BISECT_TYPE
725 if ($prvalue ne "bisect") { 725 if ($prvalue ne "bisect") {
726 $buildonly = 0; 726 $buildonly = 0;
727 } 727 }
728 } else { 728 } else {
729 # install still limits some manditory options. 729 # install still limits some mandatory options.
730 $buildonly = 2; 730 $buildonly = 2;
731 } 731 }
732 } 732 }
@@ -735,7 +735,7 @@ sub set_value {
735 if ($prvalue ne "install") { 735 if ($prvalue ne "install") {
736 $buildonly = 0; 736 $buildonly = 0;
737 } else { 737 } else {
738 # install still limits some manditory options. 738 # install still limits some mandatory options.
739 $buildonly = 2; 739 $buildonly = 2;
740 } 740 }
741 } 741 }
@@ -3989,7 +3989,7 @@ sub make_min_config {
3989 } 3989 }
3990 } 3990 }
3991 3991
3992 # Save off all the current mandidory configs 3992 # Save off all the current mandatory configs
3993 open (OUT, ">$temp_config") 3993 open (OUT, ">$temp_config")
3994 or die "Can't write to $temp_config"; 3994 or die "Can't write to $temp_config";
3995 foreach my $config (keys %keep_configs) { 3995 foreach my $config (keys %keep_configs) {
diff --git a/tools/testing/radix-tree/Makefile b/tools/testing/radix-tree/Makefile
index f2e07f2fd4b4..3635e4d3eca7 100644
--- a/tools/testing/radix-tree/Makefile
+++ b/tools/testing/radix-tree/Makefile
@@ -1,10 +1,14 @@
1 1
2CFLAGS += -I. -g -O2 -Wall -D_LGPL_SOURCE 2CFLAGS += -I. -I../../include -g -O2 -Wall -D_LGPL_SOURCE
3LDFLAGS += -lpthread -lurcu 3LDFLAGS += -lpthread -lurcu
4TARGETS = main 4TARGETS = main
5OFILES = main.o radix-tree.o linux.o test.o tag_check.o find_next_bit.o \ 5OFILES = main.o radix-tree.o linux.o test.o tag_check.o find_next_bit.o \
6 regression1.o regression2.o regression3.o multiorder.o \ 6 regression1.o regression2.o regression3.o multiorder.o \
7 iteration_check.o 7 iteration_check.o benchmark.o
8
9ifdef BENCHMARK
10 CFLAGS += -DBENCHMARK=1
11endif
8 12
9targets: $(TARGETS) 13targets: $(TARGETS)
10 14
@@ -14,7 +18,12 @@ main: $(OFILES)
14clean: 18clean:
15 $(RM) -f $(TARGETS) *.o radix-tree.c 19 $(RM) -f $(TARGETS) *.o radix-tree.c
16 20
17$(OFILES): *.h */*.h ../../../include/linux/radix-tree.h ../../include/linux/*.h 21find_next_bit.o: ../../lib/find_bit.c
22 $(CC) $(CFLAGS) -c -o $@ $<
23
24$(OFILES): *.h */*.h \
25 ../../include/linux/*.h \
26 ../../../include/linux/radix-tree.h
18 27
19radix-tree.c: ../../../lib/radix-tree.c 28radix-tree.c: ../../../lib/radix-tree.c
20 sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@ 29 sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@
diff --git a/tools/testing/radix-tree/benchmark.c b/tools/testing/radix-tree/benchmark.c
new file mode 100644
index 000000000000..215ca86c7605
--- /dev/null
+++ b/tools/testing/radix-tree/benchmark.c
@@ -0,0 +1,98 @@
1/*
2 * benchmark.c:
3 * Author: Konstantin Khlebnikov <koct9i@gmail.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14#include <linux/radix-tree.h>
15#include <linux/slab.h>
16#include <linux/errno.h>
17#include <time.h>
18#include "test.h"
19
20#define NSEC_PER_SEC 1000000000L
21
22static long long benchmark_iter(struct radix_tree_root *root, bool tagged)
23{
24 volatile unsigned long sink = 0;
25 struct radix_tree_iter iter;
26 struct timespec start, finish;
27 long long nsec;
28 int l, loops = 1;
29 void **slot;
30
31#ifdef BENCHMARK
32again:
33#endif
34 clock_gettime(CLOCK_MONOTONIC, &start);
35 for (l = 0; l < loops; l++) {
36 if (tagged) {
37 radix_tree_for_each_tagged(slot, root, &iter, 0, 0)
38 sink ^= (unsigned long)slot;
39 } else {
40 radix_tree_for_each_slot(slot, root, &iter, 0)
41 sink ^= (unsigned long)slot;
42 }
43 }
44 clock_gettime(CLOCK_MONOTONIC, &finish);
45
46 nsec = (finish.tv_sec - start.tv_sec) * NSEC_PER_SEC +
47 (finish.tv_nsec - start.tv_nsec);
48
49#ifdef BENCHMARK
50 if (loops == 1 && nsec * 5 < NSEC_PER_SEC) {
51 loops = NSEC_PER_SEC / nsec / 4 + 1;
52 goto again;
53 }
54#endif
55
56 nsec /= loops;
57 return nsec;
58}
59
60static void benchmark_size(unsigned long size, unsigned long step, int order)
61{
62 RADIX_TREE(tree, GFP_KERNEL);
63 long long normal, tagged;
64 unsigned long index;
65
66 for (index = 0 ; index < size ; index += step) {
67 item_insert_order(&tree, index, order);
68 radix_tree_tag_set(&tree, index, 0);
69 }
70
71 tagged = benchmark_iter(&tree, true);
72 normal = benchmark_iter(&tree, false);
73
74 printf("Size %ld, step %6ld, order %d tagged %10lld ns, normal %10lld ns\n",
75 size, step, order, tagged, normal);
76
77 item_kill_tree(&tree);
78 rcu_barrier();
79}
80
81void benchmark(void)
82{
83 unsigned long size[] = {1 << 10, 1 << 20, 0};
84 unsigned long step[] = {1, 2, 7, 15, 63, 64, 65,
85 128, 256, 512, 12345, 0};
86 int c, s;
87
88 printf("starting benchmarks\n");
89 printf("RADIX_TREE_MAP_SHIFT = %d\n", RADIX_TREE_MAP_SHIFT);
90
91 for (c = 0; size[c]; c++)
92 for (s = 0; step[s]; s++)
93 benchmark_size(size[c], step[s], 0);
94
95 for (c = 0; size[c]; c++)
96 for (s = 0; step[s]; s++)
97 benchmark_size(size[c], step[s] << 9, 9);
98}
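benchmark_iter() times a full pass over the tree with CLOCK_MONOTONIC and, when built with BENCHMARK=1, calibrates the loop count so each measurement runs for roughly a quarter of a second before reporting a per-iteration figure. The same pattern, distilled into a stand-alone helper (time_workload() and its callback are hypothetical):

#include <time.h>

#define NSEC_PER_SEC 1000000000L

static long long time_workload(void (*workload)(void))
{
	struct timespec start, finish;
	long long nsec;
	int l, loops = 1;

again:
	clock_gettime(CLOCK_MONOTONIC, &start);
	for (l = 0; l < loops; l++)
		workload();
	clock_gettime(CLOCK_MONOTONIC, &finish);

	nsec = (finish.tv_sec - start.tv_sec) * NSEC_PER_SEC +
	       (finish.tv_nsec - start.tv_nsec);
	if (nsec < 1)
		nsec = 1;	/* guard the division if the clock did not advance */

	/* Too short to trust (under 200ms)? Scale up and measure once more. */
	if (loops == 1 && nsec * 5 < NSEC_PER_SEC) {
		loops = NSEC_PER_SEC / nsec / 4 + 1;
		goto again;
	}
	return nsec / loops;
}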
diff --git a/tools/testing/radix-tree/find_next_bit.c b/tools/testing/radix-tree/find_next_bit.c
deleted file mode 100644
index d1c2178bb2d4..000000000000
--- a/tools/testing/radix-tree/find_next_bit.c
+++ /dev/null
@@ -1,57 +0,0 @@
1/* find_next_bit.c: fallback find next bit implementation
2 *
3 * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/types.h>
13#include <linux/bitops.h>
14
15#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG)
16
17/*
18 * Find the next set bit in a memory region.
19 */
20unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
21 unsigned long offset)
22{
23 const unsigned long *p = addr + BITOP_WORD(offset);
24 unsigned long result = offset & ~(BITS_PER_LONG-1);
25 unsigned long tmp;
26
27 if (offset >= size)
28 return size;
29 size -= result;
30 offset %= BITS_PER_LONG;
31 if (offset) {
32 tmp = *(p++);
33 tmp &= (~0UL << offset);
34 if (size < BITS_PER_LONG)
35 goto found_first;
36 if (tmp)
37 goto found_middle;
38 size -= BITS_PER_LONG;
39 result += BITS_PER_LONG;
40 }
41 while (size & ~(BITS_PER_LONG-1)) {
42 if ((tmp = *(p++)))
43 goto found_middle;
44 result += BITS_PER_LONG;
45 size -= BITS_PER_LONG;
46 }
47 if (!size)
48 return result;
49 tmp = *p;
50
51found_first:
52 tmp &= (~0UL >> (BITS_PER_LONG - size));
53 if (tmp == 0UL) /* Are any bits set? */
54 return result + size; /* Nope. */
55found_middle:
56 return result + __ffs(tmp);
57}
diff --git a/tools/testing/radix-tree/iteration_check.c b/tools/testing/radix-tree/iteration_check.c
index 9adb8e7415a6..7572b7ed930e 100644
--- a/tools/testing/radix-tree/iteration_check.c
+++ b/tools/testing/radix-tree/iteration_check.c
@@ -16,35 +16,50 @@
16#include <pthread.h> 16#include <pthread.h>
17#include "test.h" 17#include "test.h"
18 18
19#define NUM_THREADS 4 19#define NUM_THREADS 5
20#define TAG 0 20#define MAX_IDX 100
21#define TAG 0
22#define NEW_TAG 1
23
21static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER; 24static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
22static pthread_t threads[NUM_THREADS]; 25static pthread_t threads[NUM_THREADS];
23RADIX_TREE(tree, GFP_KERNEL); 26static unsigned int seeds[3];
24bool test_complete; 27static RADIX_TREE(tree, GFP_KERNEL);
28static bool test_complete;
29static int max_order;
25 30
26/* relentlessly fill the tree with tagged entries */ 31/* relentlessly fill the tree with tagged entries */
27static void *add_entries_fn(void *arg) 32static void *add_entries_fn(void *arg)
28{ 33{
29 int pgoff; 34 rcu_register_thread();
30 35
31 while (!test_complete) { 36 while (!test_complete) {
32 for (pgoff = 0; pgoff < 100; pgoff++) { 37 unsigned long pgoff;
38 int order;
39
40 for (pgoff = 0; pgoff < MAX_IDX; pgoff++) {
33 pthread_mutex_lock(&tree_lock); 41 pthread_mutex_lock(&tree_lock);
34 if (item_insert(&tree, pgoff) == 0) 42 for (order = max_order; order >= 0; order--) {
35 item_tag_set(&tree, pgoff, TAG); 43 if (item_insert_order(&tree, pgoff, order)
44 == 0) {
45 item_tag_set(&tree, pgoff, TAG);
46 break;
47 }
48 }
36 pthread_mutex_unlock(&tree_lock); 49 pthread_mutex_unlock(&tree_lock);
37 } 50 }
38 } 51 }
39 52
53 rcu_unregister_thread();
54
40 return NULL; 55 return NULL;
41} 56}
42 57
43/* 58/*
44 * Iterate over the tagged entries, doing a radix_tree_iter_retry() as we find 59 * Iterate over the tagged entries, doing a radix_tree_iter_retry() as we find
45 * things that have been removed and randomly resetting our iteration to the 60 * things that have been removed and randomly resetting our iteration to the
46 * next chunk with radix_tree_iter_next(). Both radix_tree_iter_retry() and 61 * next chunk with radix_tree_iter_resume(). Both radix_tree_iter_retry() and
47 * radix_tree_iter_next() cause radix_tree_next_slot() to be called with a 62 * radix_tree_iter_resume() cause radix_tree_next_slot() to be called with a
48 * NULL 'slot' variable. 63 * NULL 'slot' variable.
49 */ 64 */
50static void *tagged_iteration_fn(void *arg) 65static void *tagged_iteration_fn(void *arg)
@@ -52,17 +67,12 @@ static void *tagged_iteration_fn(void *arg)
52 struct radix_tree_iter iter; 67 struct radix_tree_iter iter;
53 void **slot; 68 void **slot;
54 69
70 rcu_register_thread();
71
55 while (!test_complete) { 72 while (!test_complete) {
56 rcu_read_lock(); 73 rcu_read_lock();
57 radix_tree_for_each_tagged(slot, &tree, &iter, 0, TAG) { 74 radix_tree_for_each_tagged(slot, &tree, &iter, 0, TAG) {
58 void *entry; 75 void *entry = radix_tree_deref_slot(slot);
59 int i;
60
61 /* busy wait to let removals happen */
62 for (i = 0; i < 1000000; i++)
63 ;
64
65 entry = radix_tree_deref_slot(slot);
66 if (unlikely(!entry)) 76 if (unlikely(!entry))
67 continue; 77 continue;
68 78
@@ -71,20 +81,26 @@ static void *tagged_iteration_fn(void *arg)
71 continue; 81 continue;
72 } 82 }
73 83
74 if (rand() % 50 == 0) 84 if (rand_r(&seeds[0]) % 50 == 0) {
75 slot = radix_tree_iter_next(&iter); 85 slot = radix_tree_iter_resume(slot, &iter);
86 rcu_read_unlock();
87 rcu_barrier();
88 rcu_read_lock();
89 }
76 } 90 }
77 rcu_read_unlock(); 91 rcu_read_unlock();
78 } 92 }
79 93
94 rcu_unregister_thread();
95
80 return NULL; 96 return NULL;
81} 97}
82 98
83/* 99/*
84 * Iterate over the entries, doing a radix_tree_iter_retry() as we find things 100 * Iterate over the entries, doing a radix_tree_iter_retry() as we find things
85 * that have been removed and randomly resetting our iteration to the next 101 * that have been removed and randomly resetting our iteration to the next
86 * chunk with radix_tree_iter_next(). Both radix_tree_iter_retry() and 102 * chunk with radix_tree_iter_resume(). Both radix_tree_iter_retry() and
87 * radix_tree_iter_next() cause radix_tree_next_slot() to be called with a 103 * radix_tree_iter_resume() cause radix_tree_next_slot() to be called with a
88 * NULL 'slot' variable. 104 * NULL 'slot' variable.
89 */ 105 */
90static void *untagged_iteration_fn(void *arg) 106static void *untagged_iteration_fn(void *arg)
@@ -92,17 +108,12 @@ static void *untagged_iteration_fn(void *arg)
92 struct radix_tree_iter iter; 108 struct radix_tree_iter iter;
93 void **slot; 109 void **slot;
94 110
111 rcu_register_thread();
112
95 while (!test_complete) { 113 while (!test_complete) {
96 rcu_read_lock(); 114 rcu_read_lock();
97 radix_tree_for_each_slot(slot, &tree, &iter, 0) { 115 radix_tree_for_each_slot(slot, &tree, &iter, 0) {
98 void *entry; 116 void *entry = radix_tree_deref_slot(slot);
99 int i;
100
101 /* busy wait to let removals happen */
102 for (i = 0; i < 1000000; i++)
103 ;
104
105 entry = radix_tree_deref_slot(slot);
106 if (unlikely(!entry)) 117 if (unlikely(!entry))
107 continue; 118 continue;
108 119
@@ -111,12 +122,18 @@ static void *untagged_iteration_fn(void *arg)
111 continue; 122 continue;
112 } 123 }
113 124
114 if (rand() % 50 == 0) 125 if (rand_r(&seeds[1]) % 50 == 0) {
115 slot = radix_tree_iter_next(&iter); 126 slot = radix_tree_iter_resume(slot, &iter);
127 rcu_read_unlock();
128 rcu_barrier();
129 rcu_read_lock();
130 }
116 } 131 }
117 rcu_read_unlock(); 132 rcu_read_unlock();
118 } 133 }
119 134
135 rcu_unregister_thread();
136
120 return NULL; 137 return NULL;
121} 138}
122 139
@@ -126,47 +143,71 @@ static void *untagged_iteration_fn(void *arg)
126 */ 143 */
127static void *remove_entries_fn(void *arg) 144static void *remove_entries_fn(void *arg)
128{ 145{
146 rcu_register_thread();
147
129 while (!test_complete) { 148 while (!test_complete) {
130 int pgoff; 149 int pgoff;
131 150
132 pgoff = rand() % 100; 151 pgoff = rand_r(&seeds[2]) % MAX_IDX;
133 152
134 pthread_mutex_lock(&tree_lock); 153 pthread_mutex_lock(&tree_lock);
135 item_delete(&tree, pgoff); 154 item_delete(&tree, pgoff);
136 pthread_mutex_unlock(&tree_lock); 155 pthread_mutex_unlock(&tree_lock);
137 } 156 }
138 157
158 rcu_unregister_thread();
159
160 return NULL;
161}
162
163static void *tag_entries_fn(void *arg)
164{
165 rcu_register_thread();
166
167 while (!test_complete) {
168 tag_tagged_items(&tree, &tree_lock, 0, MAX_IDX, 10, TAG,
169 NEW_TAG);
170 }
171 rcu_unregister_thread();
139 return NULL; 172 return NULL;
140} 173}
141 174
142/* This is a unit test for a bug found by the syzkaller tester */ 175/* This is a unit test for a bug found by the syzkaller tester */
143void iteration_test(void) 176void iteration_test(unsigned order, unsigned test_duration)
144{ 177{
145 int i; 178 int i;
146 179
147 printf("Running iteration tests for 10 seconds\n"); 180 printf("Running %siteration tests for %d seconds\n",
181 order > 0 ? "multiorder " : "", test_duration);
148 182
149 srand(time(0)); 183 max_order = order;
150 test_complete = false; 184 test_complete = false;
151 185
186 for (i = 0; i < 3; i++)
187 seeds[i] = rand();
188
152 if (pthread_create(&threads[0], NULL, tagged_iteration_fn, NULL)) { 189 if (pthread_create(&threads[0], NULL, tagged_iteration_fn, NULL)) {
153 perror("pthread_create"); 190 perror("create tagged iteration thread");
154 exit(1); 191 exit(1);
155 } 192 }
156 if (pthread_create(&threads[1], NULL, untagged_iteration_fn, NULL)) { 193 if (pthread_create(&threads[1], NULL, untagged_iteration_fn, NULL)) {
157 perror("pthread_create"); 194 perror("create untagged iteration thread");
158 exit(1); 195 exit(1);
159 } 196 }
160 if (pthread_create(&threads[2], NULL, add_entries_fn, NULL)) { 197 if (pthread_create(&threads[2], NULL, add_entries_fn, NULL)) {
161 perror("pthread_create"); 198 perror("create add entry thread");
162 exit(1); 199 exit(1);
163 } 200 }
164 if (pthread_create(&threads[3], NULL, remove_entries_fn, NULL)) { 201 if (pthread_create(&threads[3], NULL, remove_entries_fn, NULL)) {
165 perror("pthread_create"); 202 perror("create remove entry thread");
203 exit(1);
204 }
205 if (pthread_create(&threads[4], NULL, tag_entries_fn, NULL)) {
206 perror("create tag entry thread");
166 exit(1); 207 exit(1);
167 } 208 }
168 209
169 sleep(10); 210 sleep(test_duration);
170 test_complete = true; 211 test_complete = true;
171 212
172 for (i = 0; i < NUM_THREADS; i++) { 213 for (i = 0; i < NUM_THREADS; i++) {
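Two conventions run through the rewritten iteration test: every worker thread registers itself with liburcu before it takes rcu_read_lock(), and random decisions use rand_r() with a per-thread seed so a failing run can be replayed from the seed main() prints. A compilable skeleton of one such worker, with the tree walk elided (worker() and its loop bound are illustrative, not from the patch):

#include <pthread.h>
#include <stdlib.h>
#include <urcu.h>

static void *worker(void *arg)
{
	unsigned int *seed = arg;	/* e.g. &seeds[thread_index] */
	int i;

	rcu_register_thread();		/* required before rcu_read_lock() on this thread */
	for (i = 0; i < 1000; i++) {
		rcu_read_lock();
		/* ... walk the shared tree ... */
		if (rand_r(seed) % 50 == 0) {
			/* ~2% of the time: drop out and let deferred frees drain */
			rcu_read_unlock();
			rcu_barrier();
			rcu_read_lock();
		}
		rcu_read_unlock();
	}
	rcu_unregister_thread();
	return NULL;
}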
diff --git a/tools/testing/radix-tree/linux.c b/tools/testing/radix-tree/linux.c
index 154823737b20..d31ea7c9abec 100644
--- a/tools/testing/radix-tree/linux.c
+++ b/tools/testing/radix-tree/linux.c
@@ -1,14 +1,26 @@
1#include <stdlib.h> 1#include <stdlib.h>
2#include <string.h> 2#include <string.h>
3#include <malloc.h> 3#include <malloc.h>
4#include <pthread.h>
4#include <unistd.h> 5#include <unistd.h>
5#include <assert.h> 6#include <assert.h>
6 7
7#include <linux/mempool.h> 8#include <linux/mempool.h>
9#include <linux/poison.h>
8#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/radix-tree.h>
9#include <urcu/uatomic.h> 12#include <urcu/uatomic.h>
10 13
11int nr_allocated; 14int nr_allocated;
15int preempt_count;
16
17struct kmem_cache {
18 pthread_mutex_t lock;
19 int size;
20 int nr_objs;
21 void *objs;
22 void (*ctor)(void *);
23};
12 24
13void *mempool_alloc(mempool_t *pool, int gfp_mask) 25void *mempool_alloc(mempool_t *pool, int gfp_mask)
14{ 26{
@@ -33,19 +45,59 @@ mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
33 45
34void *kmem_cache_alloc(struct kmem_cache *cachep, int flags) 46void *kmem_cache_alloc(struct kmem_cache *cachep, int flags)
35{ 47{
36 void *ret = malloc(cachep->size); 48 struct radix_tree_node *node;
37 if (cachep->ctor) 49
38 cachep->ctor(ret); 50 if (flags & __GFP_NOWARN)
51 return NULL;
52
53 pthread_mutex_lock(&cachep->lock);
54 if (cachep->nr_objs) {
55 cachep->nr_objs--;
56 node = cachep->objs;
57 cachep->objs = node->private_data;
58 pthread_mutex_unlock(&cachep->lock);
59 node->private_data = NULL;
60 } else {
61 pthread_mutex_unlock(&cachep->lock);
62 node = malloc(cachep->size);
63 if (cachep->ctor)
64 cachep->ctor(node);
65 }
66
39 uatomic_inc(&nr_allocated); 67 uatomic_inc(&nr_allocated);
40 return ret; 68 return node;
41} 69}
42 70
43void kmem_cache_free(struct kmem_cache *cachep, void *objp) 71void kmem_cache_free(struct kmem_cache *cachep, void *objp)
44{ 72{
45 assert(objp); 73 assert(objp);
46 uatomic_dec(&nr_allocated); 74 uatomic_dec(&nr_allocated);
47 memset(objp, 0, cachep->size); 75 pthread_mutex_lock(&cachep->lock);
48 free(objp); 76 if (cachep->nr_objs > 10) {
77 memset(objp, POISON_FREE, cachep->size);
78 free(objp);
79 } else {
80 struct radix_tree_node *node = objp;
81 cachep->nr_objs++;
82 node->private_data = cachep->objs;
83 cachep->objs = node;
84 }
85 pthread_mutex_unlock(&cachep->lock);
86}
87
88void *kmalloc(size_t size, gfp_t gfp)
89{
90 void *ret = malloc(size);
91 uatomic_inc(&nr_allocated);
92 return ret;
93}
94
95void kfree(void *p)
96{
97 if (!p)
98 return;
99 uatomic_dec(&nr_allocated);
100 free(p);
49} 101}
50 102
51struct kmem_cache * 103struct kmem_cache *
@@ -54,7 +106,10 @@ kmem_cache_create(const char *name, size_t size, size_t offset,
54{ 106{
55 struct kmem_cache *ret = malloc(sizeof(*ret)); 107 struct kmem_cache *ret = malloc(sizeof(*ret));
56 108
109 pthread_mutex_init(&ret->lock, NULL);
57 ret->size = size; 110 ret->size = size;
111 ret->nr_objs = 0;
112 ret->objs = NULL;
58 ret->ctor = ctor; 113 ret->ctor = ctor;
59 return ret; 114 return ret;
60} 115}
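kmem_cache_alloc()/kmem_cache_free() in the test harness now keep up to ten recently freed radix_tree_nodes on a per-cache freelist threaded through node->private_data, poisoning anything released beyond that, which is much closer to how the kernel's slab behaves. The generic shape of that object cache, with hypothetical names and the link stored in the object's first word instead of a named field:

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

#define CACHE_MAX	10
#define POISON_BYTE	0x6b		/* same value as POISON_FREE */

struct obj_cache {
	pthread_mutex_t lock;		/* initialise with pthread_mutex_init() */
	size_t size;
	int nr_objs;
	void *objs;			/* head of the freelist */
};

static void *cache_alloc(struct obj_cache *c)
{
	void *obj;

	pthread_mutex_lock(&c->lock);
	if (c->nr_objs) {
		c->nr_objs--;
		obj = c->objs;
		c->objs = *(void **)obj;	/* unlink from the freelist */
		pthread_mutex_unlock(&c->lock);
		*(void **)obj = NULL;		/* clear the reused link word */
		return obj;
	}
	pthread_mutex_unlock(&c->lock);
	return malloc(c->size);
}

static void cache_free(struct obj_cache *c, void *obj)
{
	pthread_mutex_lock(&c->lock);
	if (c->nr_objs >= CACHE_MAX) {
		memset(obj, POISON_BYTE, c->size);	/* catch use-after-free */
		free(obj);
	} else {
		*(void **)obj = c->objs;		/* link into the freelist */
		c->objs = obj;
		c->nr_objs++;
	}
	pthread_mutex_unlock(&c->lock);
}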
diff --git a/tools/testing/radix-tree/linux/bitops.h b/tools/testing/radix-tree/linux/bitops.h
index 71d58427ab60..a13e9bc76eec 100644
--- a/tools/testing/radix-tree/linux/bitops.h
+++ b/tools/testing/radix-tree/linux/bitops.h
@@ -2,9 +2,14 @@
2#define _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ 2#define _ASM_GENERIC_BITOPS_NON_ATOMIC_H_
3 3
4#include <linux/types.h> 4#include <linux/types.h>
5#include <linux/bitops/find.h>
6#include <linux/bitops/hweight.h>
7#include <linux/kernel.h>
5 8
6#define BITOP_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) 9#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG))
7#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) 10#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
11#define BITS_PER_BYTE 8
12#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))
8 13
9/** 14/**
10 * __set_bit - Set a bit in memory 15 * __set_bit - Set a bit in memory
@@ -17,16 +22,16 @@
17 */ 22 */
18static inline void __set_bit(int nr, volatile unsigned long *addr) 23static inline void __set_bit(int nr, volatile unsigned long *addr)
19{ 24{
20 unsigned long mask = BITOP_MASK(nr); 25 unsigned long mask = BIT_MASK(nr);
21 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); 26 unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
22 27
23 *p |= mask; 28 *p |= mask;
24} 29}
25 30
26static inline void __clear_bit(int nr, volatile unsigned long *addr) 31static inline void __clear_bit(int nr, volatile unsigned long *addr)
27{ 32{
28 unsigned long mask = BITOP_MASK(nr); 33 unsigned long mask = BIT_MASK(nr);
29 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); 34 unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
30 35
31 *p &= ~mask; 36 *p &= ~mask;
32} 37}
@@ -42,8 +47,8 @@ static inline void __clear_bit(int nr, volatile unsigned long *addr)
42 */ 47 */
43static inline void __change_bit(int nr, volatile unsigned long *addr) 48static inline void __change_bit(int nr, volatile unsigned long *addr)
44{ 49{
45 unsigned long mask = BITOP_MASK(nr); 50 unsigned long mask = BIT_MASK(nr);
46 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); 51 unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
47 52
48 *p ^= mask; 53 *p ^= mask;
49} 54}
@@ -59,8 +64,8 @@ static inline void __change_bit(int nr, volatile unsigned long *addr)
59 */ 64 */
60static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) 65static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
61{ 66{
62 unsigned long mask = BITOP_MASK(nr); 67 unsigned long mask = BIT_MASK(nr);
63 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); 68 unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
64 unsigned long old = *p; 69 unsigned long old = *p;
65 70
66 *p = old | mask; 71 *p = old | mask;
@@ -78,8 +83,8 @@ static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
78 */ 83 */
79static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) 84static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
80{ 85{
81 unsigned long mask = BITOP_MASK(nr); 86 unsigned long mask = BIT_MASK(nr);
82 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); 87 unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
83 unsigned long old = *p; 88 unsigned long old = *p;
84 89
85 *p = old & ~mask; 90 *p = old & ~mask;
@@ -90,8 +95,8 @@ static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
90static inline int __test_and_change_bit(int nr, 95static inline int __test_and_change_bit(int nr,
91 volatile unsigned long *addr) 96 volatile unsigned long *addr)
92{ 97{
93 unsigned long mask = BITOP_MASK(nr); 98 unsigned long mask = BIT_MASK(nr);
94 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); 99 unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
95 unsigned long old = *p; 100 unsigned long old = *p;
96 101
97 *p = old ^ mask; 102 *p = old ^ mask;
@@ -105,7 +110,7 @@ static inline int __test_and_change_bit(int nr,
105 */ 110 */
106static inline int test_bit(int nr, const volatile unsigned long *addr) 111static inline int test_bit(int nr, const volatile unsigned long *addr)
107{ 112{
108 return 1UL & (addr[BITOP_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); 113 return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
109} 114}
110 115
111/** 116/**
@@ -147,4 +152,9 @@ unsigned long find_next_bit(const unsigned long *addr,
147 unsigned long size, 152 unsigned long size,
148 unsigned long offset); 153 unsigned long offset);
149 154
155static inline unsigned long hweight_long(unsigned long w)
156{
157 return sizeof(w) == 4 ? hweight32(w) : hweight64(w);
158}
159
150#endif /* _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ */ 160#endif /* _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ */
diff --git a/tools/testing/radix-tree/linux/bitops/non-atomic.h b/tools/testing/radix-tree/linux/bitops/non-atomic.h
index 46a825cf2ae1..6a1bcb9d2c4a 100644
--- a/tools/testing/radix-tree/linux/bitops/non-atomic.h
+++ b/tools/testing/radix-tree/linux/bitops/non-atomic.h
@@ -3,7 +3,6 @@
3 3
4#include <asm/types.h> 4#include <asm/types.h>
5 5
6#define BITOP_MASK(nr) (1UL << ((nr) % BITS_PER_LONG))
7#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) 6#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG)
8 7
9/** 8/**
@@ -17,7 +16,7 @@
17 */ 16 */
18static inline void __set_bit(int nr, volatile unsigned long *addr) 17static inline void __set_bit(int nr, volatile unsigned long *addr)
19{ 18{
20 unsigned long mask = BITOP_MASK(nr); 19 unsigned long mask = BIT_MASK(nr);
21 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); 20 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
22 21
23 *p |= mask; 22 *p |= mask;
@@ -25,7 +24,7 @@ static inline void __set_bit(int nr, volatile unsigned long *addr)
25 24
26static inline void __clear_bit(int nr, volatile unsigned long *addr) 25static inline void __clear_bit(int nr, volatile unsigned long *addr)
27{ 26{
28 unsigned long mask = BITOP_MASK(nr); 27 unsigned long mask = BIT_MASK(nr);
29 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); 28 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
30 29
31 *p &= ~mask; 30 *p &= ~mask;
@@ -42,7 +41,7 @@ static inline void __clear_bit(int nr, volatile unsigned long *addr)
42 */ 41 */
43static inline void __change_bit(int nr, volatile unsigned long *addr) 42static inline void __change_bit(int nr, volatile unsigned long *addr)
44{ 43{
45 unsigned long mask = BITOP_MASK(nr); 44 unsigned long mask = BIT_MASK(nr);
46 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); 45 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
47 46
48 *p ^= mask; 47 *p ^= mask;
@@ -59,7 +58,7 @@ static inline void __change_bit(int nr, volatile unsigned long *addr)
59 */ 58 */
60static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) 59static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
61{ 60{
62 unsigned long mask = BITOP_MASK(nr); 61 unsigned long mask = BIT_MASK(nr);
63 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); 62 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
64 unsigned long old = *p; 63 unsigned long old = *p;
65 64
@@ -78,7 +77,7 @@ static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
78 */ 77 */
79static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) 78static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
80{ 79{
81 unsigned long mask = BITOP_MASK(nr); 80 unsigned long mask = BIT_MASK(nr);
82 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); 81 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
83 unsigned long old = *p; 82 unsigned long old = *p;
84 83
@@ -90,7 +89,7 @@ static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
90static inline int __test_and_change_bit(int nr, 89static inline int __test_and_change_bit(int nr,
91 volatile unsigned long *addr) 90 volatile unsigned long *addr)
92{ 91{
93 unsigned long mask = BITOP_MASK(nr); 92 unsigned long mask = BIT_MASK(nr);
94 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); 93 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
95 unsigned long old = *p; 94 unsigned long old = *p;
96 95
diff --git a/tools/testing/radix-tree/linux/bug.h b/tools/testing/radix-tree/linux/bug.h
index ccbe444977df..23b8ed52f8c8 100644
--- a/tools/testing/radix-tree/linux/bug.h
+++ b/tools/testing/radix-tree/linux/bug.h
@@ -1 +1 @@
#define WARN_ON_ONCE(x) assert(x) #include "asm/bug.h"
diff --git a/tools/testing/radix-tree/linux/gfp.h b/tools/testing/radix-tree/linux/gfp.h
index 5201b915f631..5b09b2ce6c33 100644
--- a/tools/testing/radix-tree/linux/gfp.h
+++ b/tools/testing/radix-tree/linux/gfp.h
@@ -3,8 +3,24 @@
3 3
4#define __GFP_BITS_SHIFT 26 4#define __GFP_BITS_SHIFT 26
5#define __GFP_BITS_MASK ((gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) 5#define __GFP_BITS_MASK ((gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
6#define __GFP_WAIT 1 6
7#define __GFP_ACCOUNT 0 7#define __GFP_HIGH 0x20u
8#define __GFP_NOWARN 0 8#define __GFP_IO 0x40u
9#define __GFP_FS 0x80u
10#define __GFP_NOWARN 0x200u
11#define __GFP_ATOMIC 0x80000u
12#define __GFP_ACCOUNT 0x100000u
13#define __GFP_DIRECT_RECLAIM 0x400000u
14#define __GFP_KSWAPD_RECLAIM 0x2000000u
15
16#define __GFP_RECLAIM (__GFP_DIRECT_RECLAIM|__GFP_KSWAPD_RECLAIM)
17
18#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
19#define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS)
20
21static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
22{
23 return !!(gfp_flags & __GFP_DIRECT_RECLAIM);
24}
9 25
10#endif 26#endif
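With real values behind the GFP flags, gfpflags_allow_blocking() finally distinguishes the two allocation contexts the test suite cares about, which lets the radix-tree code under test take its non-preloaded paths for atomic allocations. A two-line check, assuming the test suite's -I. include setup (gfp_demo() is illustrative):

#include <assert.h>
#include <linux/gfp.h>

static void gfp_demo(void)
{
	assert(gfpflags_allow_blocking(GFP_KERNEL));	/* __GFP_DIRECT_RECLAIM is set */
	assert(!gfpflags_allow_blocking(GFP_ATOMIC));	/* kswapd reclaim only */
}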
diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h
index be98a47b4e1b..9b43b4975d83 100644
--- a/tools/testing/radix-tree/linux/kernel.h
+++ b/tools/testing/radix-tree/linux/kernel.h
@@ -8,9 +8,14 @@
8#include <limits.h> 8#include <limits.h>
9 9
10#include "../../include/linux/compiler.h" 10#include "../../include/linux/compiler.h"
11#include "../../include/linux/err.h"
11#include "../../../include/linux/kconfig.h" 12#include "../../../include/linux/kconfig.h"
12 13
14#ifdef BENCHMARK
15#define RADIX_TREE_MAP_SHIFT 6
16#else
13#define RADIX_TREE_MAP_SHIFT 3 17#define RADIX_TREE_MAP_SHIFT 3
18#endif
14 19
15#ifndef NULL 20#ifndef NULL
16#define NULL 0 21#define NULL 0
@@ -43,4 +48,17 @@ static inline int in_interrupt(void)
43{ 48{
44 return 0; 49 return 0;
45} 50}
51
52/*
53 * This looks more complex than it should be. But we need to
54 * get the type for the ~ right in round_down (it needs to be
55 * as wide as the result!), and we want to evaluate the macro
56 * arguments just once each.
57 */
58#define __round_mask(x, y) ((__typeof__(x))((y)-1))
59#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
60#define round_down(x, y) ((x) & ~__round_mask(x, y))
61
62#define xchg(ptr, x) uatomic_xchg(ptr, x)
63
46#endif /* _KERNEL_H */ 64#endif /* _KERNEL_H */
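round_up()/round_down() here only work for power-of-two alignments, and the __round_mask() cast is what keeps the mask as wide as x. A few worked values, assuming the test suite's linux/kernel.h is on the include path (rounding_demo() is illustrative):

#include <assert.h>
#include <linux/kernel.h>

static void rounding_demo(void)
{
	assert(round_up(7, 4) == 8);		/* ((7 - 1) | 3) + 1 */
	assert(round_up(8, 4) == 8);		/* already aligned: unchanged */
	assert(round_down(7, 4) == 4);		/* 7 & ~3 */
	/* the mask is cast to the type of x, so wide values are not truncated */
	assert(round_down(~0UL, 4) == ~0UL - 3);
}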
diff --git a/tools/testing/radix-tree/linux/preempt.h b/tools/testing/radix-tree/linux/preempt.h
index 6210672e3baa..65c04c226965 100644
--- a/tools/testing/radix-tree/linux/preempt.h
+++ b/tools/testing/radix-tree/linux/preempt.h
@@ -1,4 +1,4 @@
1/* */ 1extern int preempt_count;
2 2
3#define preempt_disable() do { } while (0) 3#define preempt_disable() uatomic_inc(&preempt_count)
4#define preempt_enable() do { } while (0) 4#define preempt_enable() uatomic_dec(&preempt_count)
diff --git a/tools/testing/radix-tree/linux/slab.h b/tools/testing/radix-tree/linux/slab.h
index 6d5a34770fd4..e40337f41a38 100644
--- a/tools/testing/radix-tree/linux/slab.h
+++ b/tools/testing/radix-tree/linux/slab.h
@@ -7,15 +7,8 @@
7#define SLAB_PANIC 2 7#define SLAB_PANIC 2
8#define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */ 8#define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */
9 9
10static inline int gfpflags_allow_blocking(gfp_t mask) 10void *kmalloc(size_t size, gfp_t);
11{ 11void kfree(void *);
12 return 1;
13}
14
15struct kmem_cache {
16 int size;
17 void (*ctor)(void *);
18};
19 12
20void *kmem_cache_alloc(struct kmem_cache *cachep, int flags); 13void *kmem_cache_alloc(struct kmem_cache *cachep, int flags);
21void kmem_cache_free(struct kmem_cache *cachep, void *objp); 14void kmem_cache_free(struct kmem_cache *cachep, void *objp);
diff --git a/tools/testing/radix-tree/linux/types.h b/tools/testing/radix-tree/linux/types.h
index faa0b6ff9ca8..8491d89873bb 100644
--- a/tools/testing/radix-tree/linux/types.h
+++ b/tools/testing/radix-tree/linux/types.h
@@ -6,8 +6,6 @@
6#define __rcu 6#define __rcu
7#define __read_mostly 7#define __read_mostly
8 8
9#define BITS_PER_LONG (sizeof(long) * 8)
10
11static inline void INIT_LIST_HEAD(struct list_head *list) 9static inline void INIT_LIST_HEAD(struct list_head *list)
12{ 10{
13 list->next = list; 11 list->next = list;
diff --git a/tools/testing/radix-tree/main.c b/tools/testing/radix-tree/main.c
index daa9010693e8..f7e9801a6754 100644
--- a/tools/testing/radix-tree/main.c
+++ b/tools/testing/radix-tree/main.c
@@ -67,7 +67,6 @@ void big_gang_check(bool long_run)
67 67
68 for (i = 0; i < (long_run ? 1000 : 3); i++) { 68 for (i = 0; i < (long_run ? 1000 : 3); i++) {
69 __big_gang_check(); 69 __big_gang_check();
70 srand(time(0));
71 printf("%d ", i); 70 printf("%d ", i);
72 fflush(stdout); 71 fflush(stdout);
73 } 72 }
@@ -206,8 +205,7 @@ void copy_tag_check(void)
206 } 205 }
207 206
208// printf("\ncopying tags...\n"); 207// printf("\ncopying tags...\n");
209 cur = start; 208 tagged = tag_tagged_items(&tree, NULL, start, end, ITEMS, 0, 1);
210 tagged = radix_tree_range_tag_if_tagged(&tree, &cur, end, ITEMS, 0, 1);
211 209
212// printf("checking copied tags\n"); 210// printf("checking copied tags\n");
213 assert(tagged == count); 211 assert(tagged == count);
@@ -215,16 +213,13 @@ void copy_tag_check(void)
215 213
216 /* Copy tags in several rounds */ 214 /* Copy tags in several rounds */
217// printf("\ncopying tags...\n"); 215// printf("\ncopying tags...\n");
218 cur = start; 216 tmp = rand() % (count / 10 + 2);
219 do { 217 tagged = tag_tagged_items(&tree, NULL, start, end, tmp, 0, 2);
220 tmp = rand() % (count/10+2); 218 assert(tagged == count);
221 tagged = radix_tree_range_tag_if_tagged(&tree, &cur, end, tmp, 0, 2);
222 } while (tmp == tagged);
223 219
224// printf("%lu %lu %lu\n", tagged, tmp, count); 220// printf("%lu %lu %lu\n", tagged, tmp, count);
225// printf("checking copied tags\n"); 221// printf("checking copied tags\n");
226 check_copied_tags(&tree, start, end, idx, ITEMS, 0, 2); 222 check_copied_tags(&tree, start, end, idx, ITEMS, 0, 2);
227 assert(tagged < tmp);
228 verify_tag_consistency(&tree, 0); 223 verify_tag_consistency(&tree, 0);
229 verify_tag_consistency(&tree, 1); 224 verify_tag_consistency(&tree, 1);
230 verify_tag_consistency(&tree, 2); 225 verify_tag_consistency(&tree, 2);
@@ -240,7 +235,7 @@ static void __locate_check(struct radix_tree_root *tree, unsigned long index,
240 235
241 item_insert_order(tree, index, order); 236 item_insert_order(tree, index, order);
242 item = item_lookup(tree, index); 237 item = item_lookup(tree, index);
243 index2 = radix_tree_locate_item(tree, item); 238 index2 = find_item(tree, item);
244 if (index != index2) { 239 if (index != index2) {
245 printf("index %ld order %d inserted; found %ld\n", 240 printf("index %ld order %d inserted; found %ld\n",
246 index, order, index2); 241 index, order, index2);
@@ -274,17 +269,17 @@ static void locate_check(void)
274 index += (1UL << order)) { 269 index += (1UL << order)) {
275 __locate_check(&tree, index + offset, order); 270 __locate_check(&tree, index + offset, order);
276 } 271 }
277 if (radix_tree_locate_item(&tree, &tree) != -1) 272 if (find_item(&tree, &tree) != -1)
278 abort(); 273 abort();
279 274
280 item_kill_tree(&tree); 275 item_kill_tree(&tree);
281 } 276 }
282 } 277 }
283 278
284 if (radix_tree_locate_item(&tree, &tree) != -1) 279 if (find_item(&tree, &tree) != -1)
285 abort(); 280 abort();
286 __locate_check(&tree, -1, 0); 281 __locate_check(&tree, -1, 0);
287 if (radix_tree_locate_item(&tree, &tree) != -1) 282 if (find_item(&tree, &tree) != -1)
288 abort(); 283 abort();
289 item_kill_tree(&tree); 284 item_kill_tree(&tree);
290} 285}
@@ -293,50 +288,80 @@ static void single_thread_tests(bool long_run)
293{ 288{
294 int i; 289 int i;
295 290
296 printf("starting single_thread_tests: %d allocated\n", nr_allocated); 291 printf("starting single_thread_tests: %d allocated, preempt %d\n",
292 nr_allocated, preempt_count);
297 multiorder_checks(); 293 multiorder_checks();
298 printf("after multiorder_check: %d allocated\n", nr_allocated); 294 rcu_barrier();
295 printf("after multiorder_check: %d allocated, preempt %d\n",
296 nr_allocated, preempt_count);
299 locate_check(); 297 locate_check();
300 printf("after locate_check: %d allocated\n", nr_allocated); 298 rcu_barrier();
299 printf("after locate_check: %d allocated, preempt %d\n",
300 nr_allocated, preempt_count);
301 tag_check(); 301 tag_check();
302 printf("after tag_check: %d allocated\n", nr_allocated); 302 rcu_barrier();
303 printf("after tag_check: %d allocated, preempt %d\n",
304 nr_allocated, preempt_count);
303 gang_check(); 305 gang_check();
304 printf("after gang_check: %d allocated\n", nr_allocated); 306 rcu_barrier();
307 printf("after gang_check: %d allocated, preempt %d\n",
308 nr_allocated, preempt_count);
305 add_and_check(); 309 add_and_check();
306 printf("after add_and_check: %d allocated\n", nr_allocated); 310 rcu_barrier();
311 printf("after add_and_check: %d allocated, preempt %d\n",
312 nr_allocated, preempt_count);
307 dynamic_height_check(); 313 dynamic_height_check();
308 printf("after dynamic_height_check: %d allocated\n", nr_allocated); 314 rcu_barrier();
315 printf("after dynamic_height_check: %d allocated, preempt %d\n",
316 nr_allocated, preempt_count);
309 big_gang_check(long_run); 317 big_gang_check(long_run);
310 printf("after big_gang_check: %d allocated\n", nr_allocated); 318 rcu_barrier();
319 printf("after big_gang_check: %d allocated, preempt %d\n",
320 nr_allocated, preempt_count);
311 for (i = 0; i < (long_run ? 2000 : 3); i++) { 321 for (i = 0; i < (long_run ? 2000 : 3); i++) {
312 copy_tag_check(); 322 copy_tag_check();
313 printf("%d ", i); 323 printf("%d ", i);
314 fflush(stdout); 324 fflush(stdout);
315 } 325 }
316 printf("after copy_tag_check: %d allocated\n", nr_allocated); 326 rcu_barrier();
327 printf("after copy_tag_check: %d allocated, preempt %d\n",
328 nr_allocated, preempt_count);
317} 329}
318 330
319int main(int argc, char **argv) 331int main(int argc, char **argv)
320{ 332{
321 bool long_run = false; 333 bool long_run = false;
322 int opt; 334 int opt;
335 unsigned int seed = time(NULL);
323 336
324 while ((opt = getopt(argc, argv, "l")) != -1) { 337 while ((opt = getopt(argc, argv, "ls:")) != -1) {
325 if (opt == 'l') 338 if (opt == 'l')
326 long_run = true; 339 long_run = true;
340 else if (opt == 's')
341 seed = strtoul(optarg, NULL, 0);
327 } 342 }
328 343
344 printf("random seed %u\n", seed);
345 srand(seed);
346
329 rcu_register_thread(); 347 rcu_register_thread();
330 radix_tree_init(); 348 radix_tree_init();
331 349
332 regression1_test(); 350 regression1_test();
333 regression2_test(); 351 regression2_test();
334 regression3_test(); 352 regression3_test();
335 iteration_test(); 353 iteration_test(0, 10);
354 iteration_test(7, 20);
336 single_thread_tests(long_run); 355 single_thread_tests(long_run);
337 356
338 sleep(1); 357 /* Free any remaining preallocated nodes */
339 printf("after sleep(1): %d allocated\n", nr_allocated); 358 radix_tree_cpu_dead(0);
359
360 benchmark();
361
362 rcu_barrier();
363 printf("after rcu_barrier: %d allocated, preempt %d\n",
364 nr_allocated, preempt_count);
340 rcu_unregister_thread(); 365 rcu_unregister_thread();
341 366
342 exit(0); 367 exit(0);
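main() now prints its random seed and accepts -s to replay it, and an rcu_barrier() precedes each nr_allocated report so deferred node frees have completed before the leak counter is read. The seed handling, restated as a stand-alone pattern (a hypothetical test program, not the file above):

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	unsigned int seed = time(NULL);
	int opt;

	while ((opt = getopt(argc, argv, "s:")) != -1) {
		if (opt == 's')
			seed = strtoul(optarg, NULL, 0);
	}
	printf("random seed %u\n", seed);	/* pass this back via -s to reproduce */
	srand(seed);

	/* ... randomised tests built on rand()/rand_r() go here ... */
	return 0;
}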
diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c
index d1be94667a30..f79812a5e070 100644
--- a/tools/testing/radix-tree/multiorder.c
+++ b/tools/testing/radix-tree/multiorder.c
@@ -26,7 +26,6 @@ static void __multiorder_tag_test(int index, int order)
26{ 26{
27 RADIX_TREE(tree, GFP_KERNEL); 27 RADIX_TREE(tree, GFP_KERNEL);
28 int base, err, i; 28 int base, err, i;
29 unsigned long first = 0;
30 29
31 /* our canonical entry */ 30 /* our canonical entry */
32 base = index & ~((1 << order) - 1); 31 base = index & ~((1 << order) - 1);
@@ -60,7 +59,7 @@ static void __multiorder_tag_test(int index, int order)
60 assert(!radix_tree_tag_get(&tree, i, 1)); 59 assert(!radix_tree_tag_get(&tree, i, 1));
61 } 60 }
62 61
63 assert(radix_tree_range_tag_if_tagged(&tree, &first, ~0UL, 10, 0, 1) == 1); 62 assert(tag_tagged_items(&tree, NULL, 0, ~0UL, 10, 0, 1) == 1);
64 assert(radix_tree_tag_clear(&tree, index, 0)); 63 assert(radix_tree_tag_clear(&tree, index, 0));
65 64
66 for_each_index(i, base, order) { 65 for_each_index(i, base, order) {
@@ -76,8 +75,27 @@ static void __multiorder_tag_test(int index, int order)
76 item_kill_tree(&tree); 75 item_kill_tree(&tree);
77} 76}
78 77
78static void __multiorder_tag_test2(unsigned order, unsigned long index2)
79{
80 RADIX_TREE(tree, GFP_KERNEL);
81 unsigned long index = (1 << order);
82 index2 += index;
83
84 assert(item_insert_order(&tree, 0, order) == 0);
85 assert(item_insert(&tree, index2) == 0);
86
87 assert(radix_tree_tag_set(&tree, 0, 0));
88 assert(radix_tree_tag_set(&tree, index2, 0));
89
90 assert(tag_tagged_items(&tree, NULL, 0, ~0UL, 10, 0, 1) == 2);
91
92 item_kill_tree(&tree);
93}
94
79static void multiorder_tag_tests(void) 95static void multiorder_tag_tests(void)
80{ 96{
97 int i, j;
98
81 /* test multi-order entry for indices 0-7 with no sibling pointers */ 99 /* test multi-order entry for indices 0-7 with no sibling pointers */
82 __multiorder_tag_test(0, 3); 100 __multiorder_tag_test(0, 3);
83 __multiorder_tag_test(5, 3); 101 __multiorder_tag_test(5, 3);
@@ -117,6 +135,10 @@ static void multiorder_tag_tests(void)
117 __multiorder_tag_test(300, 8); 135 __multiorder_tag_test(300, 8);
118 136
119 __multiorder_tag_test(0x12345678UL, 8); 137 __multiorder_tag_test(0x12345678UL, 8);
138
139 for (i = 1; i < 10; i++)
140 for (j = 0; j < (10 << i); j++)
141 __multiorder_tag_test2(i, j);
120} 142}
121 143
122static void multiorder_check(unsigned long index, int order) 144static void multiorder_check(unsigned long index, int order)
@@ -125,7 +147,7 @@ static void multiorder_check(unsigned long index, int order)
125 unsigned long min = index & ~((1UL << order) - 1); 147 unsigned long min = index & ~((1UL << order) - 1);
126 unsigned long max = min + (1UL << order); 148 unsigned long max = min + (1UL << order);
127 void **slot; 149 void **slot;
128 struct item *item2 = item_create(min); 150 struct item *item2 = item_create(min, order);
129 RADIX_TREE(tree, GFP_KERNEL); 151 RADIX_TREE(tree, GFP_KERNEL);
130 152
131 printf("Multiorder index %ld, order %d\n", index, order); 153 printf("Multiorder index %ld, order %d\n", index, order);
@@ -231,11 +253,14 @@ void multiorder_iteration(void)
231 radix_tree_for_each_slot(slot, &tree, &iter, j) { 253 radix_tree_for_each_slot(slot, &tree, &iter, j) {
232 int height = order[i] / RADIX_TREE_MAP_SHIFT; 254 int height = order[i] / RADIX_TREE_MAP_SHIFT;
233 int shift = height * RADIX_TREE_MAP_SHIFT; 255 int shift = height * RADIX_TREE_MAP_SHIFT;
234 int mask = (1 << order[i]) - 1; 256 unsigned long mask = (1UL << order[i]) - 1;
257 struct item *item = *slot;
235 258
236 assert(iter.index >= (index[i] &~ mask)); 259 assert((iter.index | mask) == (index[i] | mask));
237 assert(iter.index <= (index[i] | mask));
238 assert(iter.shift == shift); 260 assert(iter.shift == shift);
261 assert(!radix_tree_is_internal_node(item));
262 assert((item->index | mask) == (index[i] | mask));
263 assert(item->order == order[i]);
239 i++; 264 i++;
240 } 265 }
241 } 266 }
@@ -248,7 +273,6 @@ void multiorder_tagged_iteration(void)
248 RADIX_TREE(tree, GFP_KERNEL); 273 RADIX_TREE(tree, GFP_KERNEL);
249 struct radix_tree_iter iter; 274 struct radix_tree_iter iter;
250 void **slot; 275 void **slot;
251 unsigned long first = 0;
252 int i, j; 276 int i, j;
253 277
254 printf("Multiorder tagged iteration test\n"); 278 printf("Multiorder tagged iteration test\n");
@@ -269,7 +293,7 @@ void multiorder_tagged_iteration(void)
269 assert(radix_tree_tag_set(&tree, tag_index[i], 1)); 293 assert(radix_tree_tag_set(&tree, tag_index[i], 1));
270 294
271 for (j = 0; j < 256; j++) { 295 for (j = 0; j < 256; j++) {
272 int mask, k; 296 int k;
273 297
274 for (i = 0; i < TAG_ENTRIES; i++) { 298 for (i = 0; i < TAG_ENTRIES; i++) {
275 for (k = i; index[k] < tag_index[i]; k++) 299 for (k = i; index[k] < tag_index[i]; k++)
@@ -279,18 +303,22 @@ void multiorder_tagged_iteration(void)
279 } 303 }
280 304
281 radix_tree_for_each_tagged(slot, &tree, &iter, j, 1) { 305 radix_tree_for_each_tagged(slot, &tree, &iter, j, 1) {
306 unsigned long mask;
307 struct item *item = *slot;
282 for (k = i; index[k] < tag_index[i]; k++) 308 for (k = i; index[k] < tag_index[i]; k++)
283 ; 309 ;
284 mask = (1 << order[k]) - 1; 310 mask = (1UL << order[k]) - 1;
285 311
286 assert(iter.index >= (tag_index[i] &~ mask)); 312 assert((iter.index | mask) == (tag_index[i] | mask));
287 assert(iter.index <= (tag_index[i] | mask)); 313 assert(!radix_tree_is_internal_node(item));
314 assert((item->index | mask) == (tag_index[i] | mask));
315 assert(item->order == order[k]);
288 i++; 316 i++;
289 } 317 }
290 } 318 }
291 319
292 radix_tree_range_tag_if_tagged(&tree, &first, ~0UL, 320 assert(tag_tagged_items(&tree, NULL, 0, ~0UL, TAG_ENTRIES, 1, 2) ==
293 MT_NUM_ENTRIES, 1, 2); 321 TAG_ENTRIES);
294 322
295 for (j = 0; j < 256; j++) { 323 for (j = 0; j < 256; j++) {
296 int mask, k; 324 int mask, k;
@@ -303,19 +331,21 @@ void multiorder_tagged_iteration(void)
303 } 331 }
304 332
305 radix_tree_for_each_tagged(slot, &tree, &iter, j, 2) { 333 radix_tree_for_each_tagged(slot, &tree, &iter, j, 2) {
334 struct item *item = *slot;
306 for (k = i; index[k] < tag_index[i]; k++) 335 for (k = i; index[k] < tag_index[i]; k++)
307 ; 336 ;
308 mask = (1 << order[k]) - 1; 337 mask = (1 << order[k]) - 1;
309 338
310 assert(iter.index >= (tag_index[i] &~ mask)); 339 assert((iter.index | mask) == (tag_index[i] | mask));
311 assert(iter.index <= (tag_index[i] | mask)); 340 assert(!radix_tree_is_internal_node(item));
341 assert((item->index | mask) == (tag_index[i] | mask));
342 assert(item->order == order[k]);
312 i++; 343 i++;
313 } 344 }
314 } 345 }
315 346
316 first = 1; 347 assert(tag_tagged_items(&tree, NULL, 1, ~0UL, MT_NUM_ENTRIES * 2, 1, 0)
317 radix_tree_range_tag_if_tagged(&tree, &first, ~0UL, 348 == TAG_ENTRIES);
318 MT_NUM_ENTRIES, 1, 0);
319 i = 0; 349 i = 0;
320 radix_tree_for_each_tagged(slot, &tree, &iter, 0, 0) { 350 radix_tree_for_each_tagged(slot, &tree, &iter, 0, 0) {
321 assert(iter.index == tag_index[i]); 351 assert(iter.index == tag_index[i]);
@@ -325,6 +355,261 @@ void multiorder_tagged_iteration(void)
325 item_kill_tree(&tree); 355 item_kill_tree(&tree);
326} 356}
327 357
358static void multiorder_join1(unsigned long index,
359 unsigned order1, unsigned order2)
360{
361 unsigned long loc;
362 void *item, *item2 = item_create(index + 1, order1);
363 RADIX_TREE(tree, GFP_KERNEL);
364
365 item_insert_order(&tree, index, order2);
366 item = radix_tree_lookup(&tree, index);
367 radix_tree_join(&tree, index + 1, order1, item2);
368 loc = find_item(&tree, item);
369 if (loc == -1)
370 free(item);
371 item = radix_tree_lookup(&tree, index + 1);
372 assert(item == item2);
373 item_kill_tree(&tree);
374}
375
376static void multiorder_join2(unsigned order1, unsigned order2)
377{
378 RADIX_TREE(tree, GFP_KERNEL);
379 struct radix_tree_node *node;
380 void *item1 = item_create(0, order1);
381 void *item2;
382
383 item_insert_order(&tree, 0, order2);
384 radix_tree_insert(&tree, 1 << order2, (void *)0x12UL);
385 item2 = __radix_tree_lookup(&tree, 1 << order2, &node, NULL);
386 assert(item2 == (void *)0x12UL);
387 assert(node->exceptional == 1);
388
389 radix_tree_join(&tree, 0, order1, item1);
390 item2 = __radix_tree_lookup(&tree, 1 << order2, &node, NULL);
391 assert(item2 == item1);
392 assert(node->exceptional == 0);
393 item_kill_tree(&tree);
394}
395
396/*
397 * This test revealed an accounting bug for exceptional entries at one point.
398 * Nodes were being freed back into the pool with an elevated exception count
399 * by radix_tree_join() and then radix_tree_split() was failing to zero the
400 * count of exceptional entries.
401 */
402static void multiorder_join3(unsigned int order)
403{
404 RADIX_TREE(tree, GFP_KERNEL);
405 struct radix_tree_node *node;
406 void **slot;
407 struct radix_tree_iter iter;
408 unsigned long i;
409
410 for (i = 0; i < (1 << order); i++) {
411 radix_tree_insert(&tree, i, (void *)0x12UL);
412 }
413
414 radix_tree_join(&tree, 0, order, (void *)0x16UL);
415 rcu_barrier();
416
417 radix_tree_split(&tree, 0, 0);
418
419 radix_tree_for_each_slot(slot, &tree, &iter, 0) {
420 radix_tree_iter_replace(&tree, &iter, slot, (void *)0x12UL);
421 }
422
423 __radix_tree_lookup(&tree, 0, &node, NULL);
424 assert(node->exceptional == node->count);
425
426 item_kill_tree(&tree);
427}
428
429static void multiorder_join(void)
430{
431 int i, j, idx;
432
433 for (idx = 0; idx < 1024; idx = idx * 2 + 3) {
434 for (i = 1; i < 15; i++) {
435 for (j = 0; j < i; j++) {
436 multiorder_join1(idx, i, j);
437 }
438 }
439 }
440
441 for (i = 1; i < 15; i++) {
442 for (j = 0; j < i; j++) {
443 multiorder_join2(i, j);
444 }
445 }
446
447 for (i = 3; i < 10; i++) {
448 multiorder_join3(i);
449 }
450}
451
452static void check_mem(unsigned old_order, unsigned new_order, unsigned alloc)
453{
454 struct radix_tree_preload *rtp = &radix_tree_preloads;
455 if (rtp->nr != 0)
456 printf("split(%u %u) remaining %u\n", old_order, new_order,
457 rtp->nr);
458 /*
459 * Can't check for equality here as some nodes may have been
460 * RCU-freed while we ran. But we should never finish with more
461 * nodes allocated since they should have all been preloaded.
462 */
463 if (nr_allocated > alloc)
464 printf("split(%u %u) allocated %u %u\n", old_order, new_order,
465 alloc, nr_allocated);
466}
467
468static void __multiorder_split(int old_order, int new_order)
469{
470 RADIX_TREE(tree, GFP_ATOMIC);
471 void **slot;
472 struct radix_tree_iter iter;
473 unsigned alloc;
474
475 radix_tree_preload(GFP_KERNEL);
476 assert(item_insert_order(&tree, 0, old_order) == 0);
477 radix_tree_preload_end();
478
479 /* Wipe out the preloaded cache or it'll confuse check_mem() */
480 radix_tree_cpu_dead(0);
481
482 radix_tree_tag_set(&tree, 0, 2);
483
484 radix_tree_split_preload(old_order, new_order, GFP_KERNEL);
485 alloc = nr_allocated;
486 radix_tree_split(&tree, 0, new_order);
487 check_mem(old_order, new_order, alloc);
488 radix_tree_for_each_slot(slot, &tree, &iter, 0) {
489 radix_tree_iter_replace(&tree, &iter, slot,
490 item_create(iter.index, new_order));
491 }
492 radix_tree_preload_end();
493
494 item_kill_tree(&tree);
495}
496
497static void __multiorder_split2(int old_order, int new_order)
498{
499 RADIX_TREE(tree, GFP_KERNEL);
500 void **slot;
501 struct radix_tree_iter iter;
502 struct radix_tree_node *node;
503 void *item;
504
505 __radix_tree_insert(&tree, 0, old_order, (void *)0x12);
506
507 item = __radix_tree_lookup(&tree, 0, &node, NULL);
508 assert(item == (void *)0x12);
509 assert(node->exceptional > 0);
510
511 radix_tree_split(&tree, 0, new_order);
512 radix_tree_for_each_slot(slot, &tree, &iter, 0) {
513 radix_tree_iter_replace(&tree, &iter, slot,
514 item_create(iter.index, new_order));
515 }
516
517 item = __radix_tree_lookup(&tree, 0, &node, NULL);
518 assert(item != (void *)0x12);
519 assert(node->exceptional == 0);
520
521 item_kill_tree(&tree);
522}
523
524static void __multiorder_split3(int old_order, int new_order)
525{
526 RADIX_TREE(tree, GFP_KERNEL);
527 void **slot;
528 struct radix_tree_iter iter;
529 struct radix_tree_node *node;
530 void *item;
531
532 __radix_tree_insert(&tree, 0, old_order, (void *)0x12);
533
534 item = __radix_tree_lookup(&tree, 0, &node, NULL);
535 assert(item == (void *)0x12);
536 assert(node->exceptional > 0);
537
538 radix_tree_split(&tree, 0, new_order);
539 radix_tree_for_each_slot(slot, &tree, &iter, 0) {
540 radix_tree_iter_replace(&tree, &iter, slot, (void *)0x16);
541 }
542
543 item = __radix_tree_lookup(&tree, 0, &node, NULL);
544 assert(item == (void *)0x16);
545 assert(node->exceptional > 0);
546
547 item_kill_tree(&tree);
548
549 __radix_tree_insert(&tree, 0, old_order, (void *)0x12);
550
551 item = __radix_tree_lookup(&tree, 0, &node, NULL);
552 assert(item == (void *)0x12);
553 assert(node->exceptional > 0);
554
555 radix_tree_split(&tree, 0, new_order);
556 radix_tree_for_each_slot(slot, &tree, &iter, 0) {
557 if (iter.index == (1 << new_order))
558 radix_tree_iter_replace(&tree, &iter, slot,
559 (void *)0x16);
560 else
561 radix_tree_iter_replace(&tree, &iter, slot, NULL);
562 }
563
564 item = __radix_tree_lookup(&tree, 1 << new_order, &node, NULL);
565 assert(item == (void *)0x16);
566 assert(node->count == node->exceptional);
567 do {
568 node = node->parent;
569 if (!node)
570 break;
571 assert(node->count == 1);
572 assert(node->exceptional == 0);
573 } while (1);
574
575 item_kill_tree(&tree);
576}
577
578static void multiorder_split(void)
579{
580 int i, j;
581
582 for (i = 3; i < 11; i++)
583 for (j = 0; j < i; j++) {
584 __multiorder_split(i, j);
585 __multiorder_split2(i, j);
586 __multiorder_split3(i, j);
587 }
588}
589
590static void multiorder_account(void)
591{
592 RADIX_TREE(tree, GFP_KERNEL);
593 struct radix_tree_node *node;
594 void **slot;
595
596 item_insert_order(&tree, 0, 5);
597
598 __radix_tree_insert(&tree, 1 << 5, 5, (void *)0x12);
599 __radix_tree_lookup(&tree, 0, &node, NULL);
600 assert(node->count == node->exceptional * 2);
601 radix_tree_delete(&tree, 1 << 5);
602 assert(node->exceptional == 0);
603
604 __radix_tree_insert(&tree, 1 << 5, 5, (void *)0x12);
605 __radix_tree_lookup(&tree, 1 << 5, &node, &slot);
606 assert(node->count == node->exceptional * 2);
607 __radix_tree_replace(&tree, node, slot, NULL, NULL, NULL);
608 assert(node->exceptional == 0);
609
610 item_kill_tree(&tree);
611}
612
328void multiorder_checks(void) 613void multiorder_checks(void)
329{ 614{
330 int i; 615 int i;
@@ -342,4 +627,9 @@ void multiorder_checks(void)
342 multiorder_tag_tests(); 627 multiorder_tag_tests();
343 multiorder_iteration(); 628 multiorder_iteration();
344 multiorder_tagged_iteration(); 629 multiorder_tagged_iteration();
630 multiorder_join();
631 multiorder_split();
632 multiorder_account();
633
634 radix_tree_cpu_dead(0);
345} 635}
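The multiorder tests now go through the test suite's tag_tagged_items() helper instead of the deleted radix_tree_range_tag_if_tagged(), and they assert on its return value; per the calls above it takes the tree, an optional lock, a start/end range, a batch size and the source/destination tags. A plausible core for such a helper, using only the public iterator API, is sketched below (copy_tag() is an assumption about how it might work, not the helper's actual code, and it ignores locking and batching):

#include <linux/radix-tree.h>

static unsigned long copy_tag(struct radix_tree_root *root,
			      unsigned long start, unsigned long end,
			      unsigned int iftag, unsigned int thentag)
{
	struct radix_tree_iter iter;
	void **slot;
	unsigned long tagged = 0;

	radix_tree_for_each_tagged(slot, root, &iter, start, iftag) {
		if (iter.index > end)
			break;
		radix_tree_tag_set(root, iter.index, thentag);
		tagged++;
	}
	return tagged;
}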
diff --git a/tools/testing/radix-tree/rcupdate.c b/tools/testing/radix-tree/rcupdate.c
deleted file mode 100644
index 31a2d14225d6..000000000000
--- a/tools/testing/radix-tree/rcupdate.c
+++ /dev/null
@@ -1,86 +0,0 @@
1#include <linux/rcupdate.h>
2#include <pthread.h>
3#include <stdio.h>
4#include <assert.h>
5
6static pthread_mutex_t rculock = PTHREAD_MUTEX_INITIALIZER;
7static struct rcu_head *rcuhead_global = NULL;
8static __thread int nr_rcuhead = 0;
9static __thread struct rcu_head *rcuhead = NULL;
10static __thread struct rcu_head *rcutail = NULL;
11
12static pthread_cond_t rcu_worker_cond = PTHREAD_COND_INITIALIZER;
13
14/* switch to urcu implementation when it is merged. */
15void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *head))
16{
17 head->func = func;
18 head->next = rcuhead;
19 rcuhead = head;
20 if (!rcutail)
21 rcutail = head;
22 nr_rcuhead++;
23 if (nr_rcuhead >= 1000) {
24 int signal = 0;
25
26 pthread_mutex_lock(&rculock);
27 if (!rcuhead_global)
28 signal = 1;
29 rcutail->next = rcuhead_global;
30 rcuhead_global = head;
31 pthread_mutex_unlock(&rculock);
32
33 nr_rcuhead = 0;
34 rcuhead = NULL;
35 rcutail = NULL;
36
37 if (signal) {
38 pthread_cond_signal(&rcu_worker_cond);
39 }
40 }
41}
42
43static void *rcu_worker(void *arg)
44{
45 struct rcu_head *r;
46
47 rcupdate_thread_init();
48
49 while (1) {
50 pthread_mutex_lock(&rculock);
51 while (!rcuhead_global) {
52 pthread_cond_wait(&rcu_worker_cond, &rculock);
53 }
54 r = rcuhead_global;
55 rcuhead_global = NULL;
56
57 pthread_mutex_unlock(&rculock);
58
59 synchronize_rcu();
60
61 while (r) {
62 struct rcu_head *tmp = r->next;
63 r->func(r);
64 r = tmp;
65 }
66 }
67
68 rcupdate_thread_exit();
69
70 return NULL;
71}
72
73static pthread_t worker_thread;
74void rcupdate_init(void)
75{
76 pthread_create(&worker_thread, NULL, rcu_worker, NULL);
77}
78
79void rcupdate_thread_init(void)
80{
81 rcu_register_thread();
82}
83void rcupdate_thread_exit(void)
84{
85 rcu_unregister_thread();
86}
diff --git a/tools/testing/radix-tree/regression2.c b/tools/testing/radix-tree/regression2.c
index 63bf347aaf33..a41325d7a170 100644
--- a/tools/testing/radix-tree/regression2.c
+++ b/tools/testing/radix-tree/regression2.c
@@ -50,6 +50,7 @@
50#include <stdio.h> 50#include <stdio.h>
51 51
52#include "regression.h" 52#include "regression.h"
53#include "test.h"
53 54
54#define PAGECACHE_TAG_DIRTY 0 55#define PAGECACHE_TAG_DIRTY 0
55#define PAGECACHE_TAG_WRITEBACK 1 56#define PAGECACHE_TAG_WRITEBACK 1
@@ -90,7 +91,7 @@ void regression2_test(void)
90 /* 1. */ 91 /* 1. */
91 start = 0; 92 start = 0;
92 end = max_slots - 2; 93 end = max_slots - 2;
93 radix_tree_range_tag_if_tagged(&mt_tree, &start, end, 1, 94 tag_tagged_items(&mt_tree, NULL, start, end, 1,
94 PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE); 95 PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
95 96
96 /* 2. */ 97 /* 2. */
diff --git a/tools/testing/radix-tree/regression3.c b/tools/testing/radix-tree/regression3.c
index 1f06ed73d0a8..b594841fae85 100644
--- a/tools/testing/radix-tree/regression3.c
+++ b/tools/testing/radix-tree/regression3.c
@@ -5,7 +5,7 @@
5 * In following radix_tree_next_slot current chunk size becomes zero. 5 * In following radix_tree_next_slot current chunk size becomes zero.
6 * This isn't checked and it tries to dereference null pointer in slot. 6 * This isn't checked and it tries to dereference null pointer in slot.
7 * 7 *
8 * Helper radix_tree_iter_next resets slot to NULL and next_index to index + 1, 8 * Helper radix_tree_iter_resume resets slot to NULL and next_index to index + 1,
9 * for tagged iteration it also must reset cached tags in the iterator to abort 9 * for tagged iteration it also must reset cached tags in the iterator to abort
10 * the next radix_tree_next_slot and go to the slow path in radix_tree_next_chunk. 10 * the next radix_tree_next_slot and go to the slow path in radix_tree_next_chunk.
11 * 11 *
@@ -88,7 +88,7 @@ void regression3_test(void)
88 printf("slot %ld %p\n", iter.index, *slot); 88 printf("slot %ld %p\n", iter.index, *slot);
89 if (!iter.index) { 89 if (!iter.index) {
90 printf("next at %ld\n", iter.index); 90 printf("next at %ld\n", iter.index);
91 slot = radix_tree_iter_next(&iter); 91 slot = radix_tree_iter_resume(slot, &iter);
92 } 92 }
93 } 93 }
94 94
@@ -96,7 +96,7 @@ void regression3_test(void)
96 printf("contig %ld %p\n", iter.index, *slot); 96 printf("contig %ld %p\n", iter.index, *slot);
97 if (!iter.index) { 97 if (!iter.index) {
98 printf("next at %ld\n", iter.index); 98 printf("next at %ld\n", iter.index);
99 slot = radix_tree_iter_next(&iter); 99 slot = radix_tree_iter_resume(slot, &iter);
100 } 100 }
101 } 101 }
102 102
@@ -106,7 +106,7 @@ void regression3_test(void)
106 printf("tagged %ld %p\n", iter.index, *slot); 106 printf("tagged %ld %p\n", iter.index, *slot);
107 if (!iter.index) { 107 if (!iter.index) {
108 printf("next at %ld\n", iter.index); 108 printf("next at %ld\n", iter.index);
109 slot = radix_tree_iter_next(&iter); 109 slot = radix_tree_iter_resume(slot, &iter);
110 } 110 }
111 } 111 }
112 112
diff --git a/tools/testing/radix-tree/tag_check.c b/tools/testing/radix-tree/tag_check.c
index b0ac05741750..fd98c132207a 100644
--- a/tools/testing/radix-tree/tag_check.c
+++ b/tools/testing/radix-tree/tag_check.c
@@ -23,7 +23,7 @@ __simple_checks(struct radix_tree_root *tree, unsigned long index, int tag)
23 item_tag_set(tree, index, tag); 23 item_tag_set(tree, index, tag);
24 ret = item_tag_get(tree, index, tag); 24 ret = item_tag_get(tree, index, tag);
25 assert(ret != 0); 25 assert(ret != 0);
26 ret = radix_tree_range_tag_if_tagged(tree, &first, ~0UL, 10, tag, !tag); 26 ret = tag_tagged_items(tree, NULL, first, ~0UL, 10, tag, !tag);
27 assert(ret == 1); 27 assert(ret == 1);
28 ret = item_tag_get(tree, index, !tag); 28 ret = item_tag_get(tree, index, !tag);
29 assert(ret != 0); 29 assert(ret != 0);
@@ -51,6 +51,7 @@ void simple_checks(void)
51 verify_tag_consistency(&tree, 1); 51 verify_tag_consistency(&tree, 1);
52 printf("before item_kill_tree: %d allocated\n", nr_allocated); 52 printf("before item_kill_tree: %d allocated\n", nr_allocated);
53 item_kill_tree(&tree); 53 item_kill_tree(&tree);
54 rcu_barrier();
54 printf("after item_kill_tree: %d allocated\n", nr_allocated); 55 printf("after item_kill_tree: %d allocated\n", nr_allocated);
55} 56}
56 57
@@ -319,10 +320,13 @@ static void single_check(void)
319 assert(ret == 0); 320 assert(ret == 0);
320 verify_tag_consistency(&tree, 0); 321 verify_tag_consistency(&tree, 0);
321 verify_tag_consistency(&tree, 1); 322 verify_tag_consistency(&tree, 1);
322 ret = radix_tree_range_tag_if_tagged(&tree, &first, 10, 10, 0, 1); 323 ret = tag_tagged_items(&tree, NULL, first, 10, 10, 0, 1);
323 assert(ret == 1); 324 assert(ret == 1);
324 ret = radix_tree_gang_lookup_tag(&tree, (void **)items, 0, BATCH, 1); 325 ret = radix_tree_gang_lookup_tag(&tree, (void **)items, 0, BATCH, 1);
325 assert(ret == 1); 326 assert(ret == 1);
327 item_tag_clear(&tree, 0, 0);
328 ret = radix_tree_gang_lookup_tag(&tree, (void **)items, 0, BATCH, 0);
329 assert(ret == 0);
326 item_kill_tree(&tree); 330 item_kill_tree(&tree);
327} 331}
328 332
@@ -331,12 +335,16 @@ void tag_check(void)
331 single_check(); 335 single_check();
332 extend_checks(); 336 extend_checks();
333 contract_checks(); 337 contract_checks();
338 rcu_barrier();
334 printf("after extend_checks: %d allocated\n", nr_allocated); 339 printf("after extend_checks: %d allocated\n", nr_allocated);
335 __leak_check(); 340 __leak_check();
336 leak_check(); 341 leak_check();
342 rcu_barrier();
337 printf("after leak_check: %d allocated\n", nr_allocated); 343 printf("after leak_check: %d allocated\n", nr_allocated);
338 simple_checks(); 344 simple_checks();
345 rcu_barrier();
339 printf("after simple_checks: %d allocated\n", nr_allocated); 346 printf("after simple_checks: %d allocated\n", nr_allocated);
340 thrash_tags(); 347 thrash_tags();
348 rcu_barrier();
341 printf("after thrash_tags: %d allocated\n", nr_allocated); 349 printf("after thrash_tags: %d allocated\n", nr_allocated);
342} 350}
diff --git a/tools/testing/radix-tree/test.c b/tools/testing/radix-tree/test.c
index a6e8099eaf4f..e5726e373646 100644
--- a/tools/testing/radix-tree/test.c
+++ b/tools/testing/radix-tree/test.c
@@ -24,21 +24,29 @@ int item_tag_get(struct radix_tree_root *root, unsigned long index, int tag)
24 return radix_tree_tag_get(root, index, tag); 24 return radix_tree_tag_get(root, index, tag);
25} 25}
26 26
27int __item_insert(struct radix_tree_root *root, struct item *item, 27int __item_insert(struct radix_tree_root *root, struct item *item)
28 unsigned order)
29{ 28{
30 return __radix_tree_insert(root, item->index, order, item); 29 return __radix_tree_insert(root, item->index, item->order, item);
31} 30}
32 31
33int item_insert(struct radix_tree_root *root, unsigned long index) 32int item_insert(struct radix_tree_root *root, unsigned long index)
34{ 33{
35 return __item_insert(root, item_create(index), 0); 34 return __item_insert(root, item_create(index, 0));
36} 35}
37 36
38int item_insert_order(struct radix_tree_root *root, unsigned long index, 37int item_insert_order(struct radix_tree_root *root, unsigned long index,
39 unsigned order) 38 unsigned order)
40{ 39{
41 return __item_insert(root, item_create(index), order); 40 return __item_insert(root, item_create(index, order));
41}
42
43void item_sanity(struct item *item, unsigned long index)
44{
45 unsigned long mask;
46 assert(!radix_tree_is_internal_node(item));
47 assert(item->order < BITS_PER_LONG);
48 mask = (1UL << item->order) - 1;
49 assert((item->index | mask) == (index | mask));
42} 50}
43 51
44int item_delete(struct radix_tree_root *root, unsigned long index) 52int item_delete(struct radix_tree_root *root, unsigned long index)
@@ -46,18 +54,19 @@ int item_delete(struct radix_tree_root *root, unsigned long index)
46 struct item *item = radix_tree_delete(root, index); 54 struct item *item = radix_tree_delete(root, index);
47 55
48 if (item) { 56 if (item) {
49 assert(item->index == index); 57 item_sanity(item, index);
50 free(item); 58 free(item);
51 return 1; 59 return 1;
52 } 60 }
53 return 0; 61 return 0;
54} 62}
55 63
56struct item *item_create(unsigned long index) 64struct item *item_create(unsigned long index, unsigned int order)
57{ 65{
58 struct item *ret = malloc(sizeof(*ret)); 66 struct item *ret = malloc(sizeof(*ret));
59 67
60 ret->index = index; 68 ret->index = index;
69 ret->order = order;
61 return ret; 70 return ret;
62} 71}
63 72
@@ -66,8 +75,8 @@ void item_check_present(struct radix_tree_root *root, unsigned long index)
66 struct item *item; 75 struct item *item;
67 76
68 item = radix_tree_lookup(root, index); 77 item = radix_tree_lookup(root, index);
69 assert(item != 0); 78 assert(item != NULL);
70 assert(item->index == index); 79 item_sanity(item, index);
71} 80}
72 81
73struct item *item_lookup(struct radix_tree_root *root, unsigned long index) 82struct item *item_lookup(struct radix_tree_root *root, unsigned long index)
@@ -80,7 +89,7 @@ void item_check_absent(struct radix_tree_root *root, unsigned long index)
80 struct item *item; 89 struct item *item;
81 90
82 item = radix_tree_lookup(root, index); 91 item = radix_tree_lookup(root, index);
83 assert(item == 0); 92 assert(item == NULL);
84} 93}
85 94
86/* 95/*
@@ -142,6 +151,62 @@ void item_full_scan(struct radix_tree_root *root, unsigned long start,
142 assert(nfound == 0); 151 assert(nfound == 0);
143} 152}
144 153
154/* Use the same pattern as tag_pages_for_writeback() in mm/page-writeback.c */
155int tag_tagged_items(struct radix_tree_root *root, pthread_mutex_t *lock,
156 unsigned long start, unsigned long end, unsigned batch,
157 unsigned iftag, unsigned thentag)
158{
159 unsigned long tagged = 0;
160 struct radix_tree_iter iter;
161 void **slot;
162
163 if (batch == 0)
164 batch = 1;
165
166 if (lock)
167 pthread_mutex_lock(lock);
168 radix_tree_for_each_tagged(slot, root, &iter, start, iftag) {
169 if (iter.index > end)
170 break;
171 radix_tree_iter_tag_set(root, &iter, thentag);
172 tagged++;
173 if ((tagged % batch) != 0)
174 continue;
175 slot = radix_tree_iter_resume(slot, &iter);
176 if (lock) {
177 pthread_mutex_unlock(lock);
178 rcu_barrier();
179 pthread_mutex_lock(lock);
180 }
181 }
182 if (lock)
183 pthread_mutex_unlock(lock);
184
185 return tagged;
186}
187
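For reference, a hedged usage sketch of tag_tagged_items(), modelled on the converted call sites in tag_check.c; the tree and the starting index belong to the caller, and passing a NULL lock keeps the single-threaded behaviour of the existing tests:

	/* Retag every tag-0 item in [first, ~0UL] with tag 1, resuming
	 * the iterator every 10 items.  The return value is the number
	 * of items that were retagged. */
	int retagged = tag_tagged_items(tree, NULL, first, ~0UL, 10, 0, 1);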
188/* Use the same pattern as find_swap_entry() in mm/shmem.c */
189unsigned long find_item(struct radix_tree_root *root, void *item)
190{
191 struct radix_tree_iter iter;
192 void **slot;
193 unsigned long found = -1;
194 unsigned long checked = 0;
195
196 radix_tree_for_each_slot(slot, root, &iter, 0) {
197 if (*slot == item) {
198 found = iter.index;
199 break;
200 }
201 checked++;
202 if ((checked % 4) != 0)
203 continue;
204 slot = radix_tree_iter_resume(slot, &iter);
205 }
206
207 return found;
208}
209
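Similarly, a hedged sketch of find_item(), which mirrors find_swap_entry(): it returns the index of the slot holding the given pointer, or -1 (all bits set) when the pointer is not in the tree. Index 5 and the stray pointer are arbitrary illustration values:

	RADIX_TREE(tree, GFP_KERNEL);

	item_insert(&tree, 5);
	assert(find_item(&tree, item_lookup(&tree, 5)) == 5);
	assert(find_item(&tree, (void *)0x42) == -1UL);
	item_kill_tree(&tree);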
145static int verify_node(struct radix_tree_node *slot, unsigned int tag, 210static int verify_node(struct radix_tree_node *slot, unsigned int tag,
146 int tagged) 211 int tagged)
147{ 212{
@@ -200,9 +265,16 @@ void verify_tag_consistency(struct radix_tree_root *root, unsigned int tag)
200 265
201void item_kill_tree(struct radix_tree_root *root) 266void item_kill_tree(struct radix_tree_root *root)
202{ 267{
268 struct radix_tree_iter iter;
269 void **slot;
203 struct item *items[32]; 270 struct item *items[32];
204 int nfound; 271 int nfound;
205 272
273 radix_tree_for_each_slot(slot, root, &iter, 0) {
274 if (radix_tree_exceptional_entry(*slot))
275 radix_tree_delete(root, iter.index);
276 }
277
206 while ((nfound = radix_tree_gang_lookup(root, (void **)items, 0, 32))) { 278 while ((nfound = radix_tree_gang_lookup(root, (void **)items, 0, 32))) {
207 int i; 279 int i;
208 280
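The new pre-pass exists because exceptional entries (values with bit 1 set, such as the (void *)0x12 used throughout the multiorder tests) are not heap-allocated struct items, so they must be deleted from the tree rather than handed to free() by the gang-lookup loop that follows. A small illustration under that assumption:

	RADIX_TREE(tree, GFP_KERNEL);
	void *entry = (void *)0x12;	/* bit 1 set: exceptional entry */

	assert(radix_tree_exceptional_entry(entry));
	radix_tree_insert(&tree, 0, entry);
	/* Correct: remove the entry.  Passing it to free() would be a
	 * bug, since 0x12 was never allocated; that is exactly what the
	 * pre-pass above prevents. */
	radix_tree_delete(&tree, 0);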
diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h
index 217fb2403f09..056a23b56467 100644
--- a/tools/testing/radix-tree/test.h
+++ b/tools/testing/radix-tree/test.h
@@ -5,11 +5,11 @@
5 5
6struct item { 6struct item {
7 unsigned long index; 7 unsigned long index;
8 unsigned int order;
8}; 9};
9 10
10struct item *item_create(unsigned long index); 11struct item *item_create(unsigned long index, unsigned int order);
11int __item_insert(struct radix_tree_root *root, struct item *item, 12int __item_insert(struct radix_tree_root *root, struct item *item);
12 unsigned order);
13int item_insert(struct radix_tree_root *root, unsigned long index); 13int item_insert(struct radix_tree_root *root, unsigned long index);
14int item_insert_order(struct radix_tree_root *root, unsigned long index, 14int item_insert_order(struct radix_tree_root *root, unsigned long index,
15 unsigned order); 15 unsigned order);
@@ -25,9 +25,15 @@ void item_full_scan(struct radix_tree_root *root, unsigned long start,
25 unsigned long nr, int chunk); 25 unsigned long nr, int chunk);
26void item_kill_tree(struct radix_tree_root *root); 26void item_kill_tree(struct radix_tree_root *root);
27 27
28int tag_tagged_items(struct radix_tree_root *, pthread_mutex_t *,
29 unsigned long start, unsigned long end, unsigned batch,
30 unsigned iftag, unsigned thentag);
31unsigned long find_item(struct radix_tree_root *, void *item);
32
28void tag_check(void); 33void tag_check(void);
29void multiorder_checks(void); 34void multiorder_checks(void);
30void iteration_test(void); 35void iteration_test(unsigned order, unsigned duration);
36void benchmark(void);
31 37
32struct item * 38struct item *
33item_tag_set(struct radix_tree_root *root, unsigned long index, int tag); 39item_tag_set(struct radix_tree_root *root, unsigned long index, int tag);
@@ -40,7 +46,14 @@ void verify_tag_consistency(struct radix_tree_root *root, unsigned int tag);
40extern int nr_allocated; 46extern int nr_allocated;
41 47
42/* Normally private parts of lib/radix-tree.c */ 48/* Normally private parts of lib/radix-tree.c */
49struct radix_tree_node *entry_to_node(void *ptr);
43void radix_tree_dump(struct radix_tree_root *root); 50void radix_tree_dump(struct radix_tree_root *root);
44int root_tag_get(struct radix_tree_root *root, unsigned int tag); 51int root_tag_get(struct radix_tree_root *root, unsigned int tag);
45unsigned long node_maxindex(struct radix_tree_node *); 52unsigned long node_maxindex(struct radix_tree_node *);
46unsigned long shift_maxindex(unsigned int shift); 53unsigned long shift_maxindex(unsigned int shift);
54int radix_tree_cpu_dead(unsigned int cpu);
55struct radix_tree_preload {
56 unsigned nr;
57 struct radix_tree_node *nodes;
58};
59extern struct radix_tree_preload radix_tree_preloads;
diff --git a/usr/Kconfig b/usr/Kconfig
index 572dcf7b6a44..6278f135256d 100644
--- a/usr/Kconfig
+++ b/usr/Kconfig
@@ -98,3 +98,130 @@ config RD_LZ4
98 help 98 help
99 Support loading of a LZ4 encoded initial ramdisk or cpio buffer 99 Support loading of a LZ4 encoded initial ramdisk or cpio buffer
100 If unsure, say N. 100 If unsure, say N.
101
102choice
103 prompt "Built-in initramfs compression mode"
104 depends on INITRAMFS_SOURCE!=""
105 optional
106 help
107 This option allows you to choose the algorithm with which the
108 built-in initramfs will be compressed. Several compression algorithms
109 are available, which differ in compression ratio, compression speed and
110 decompression speed. Compression speed is only relevant
111 when building a kernel. Decompression speed is relevant at
112 each boot. The memory usage during decompression may also become
113 relevant on memory-constrained systems. This is usually determined by
114 the dictionary size of the algorithm, with algorithms like XZ and LZMA
115 featuring large dictionary sizes.
116
117 High compression options are mostly useful for users who are
118 low on RAM, since it reduces the memory consumption during
119 boot.
120
121 Keep in mind that your build system needs to provide the appropriate
122 compression tool to compress the generated initramfs cpio file for
123 embedding.
124
125 If in doubt, select 'None'.
126
127config INITRAMFS_COMPRESSION_NONE
128 bool "None"
129 help
130 Do not compress the built-in initramfs at all. This may sound wasteful
131 of space, but you should be aware that the built-in initramfs will be
132 compressed at a later stage anyway along with the rest of the kernel,
133 on those architectures that support this. However, not compressing the
134 initramfs may lead to slightly higher memory consumption for a
135 short time at boot, while both the cpio image and the unpacked
136 filesystem image are present in memory simultaneously.
137
138config INITRAMFS_COMPRESSION_GZIP
139 bool "Gzip"
140 depends on RD_GZIP
141 help
142 Use the old and well-tested gzip compression algorithm. Gzip provides
143 a good balance between compression ratio and decompression speed and
144 has a reasonable compression speed. It is also more likely to be
145 supported by your build system as the gzip tool is present by default
146 on most distros.
147
148config INITRAMFS_COMPRESSION_BZIP2
149 bool "Bzip2"
150 depends on RD_BZIP2
151 help
152 Its compression ratio and speed are intermediate. Decompression speed
153 is slowest among the choices. The initramfs size is about 10% smaller
154 with bzip2, in comparison to gzip. Bzip2 uses a large amount of
155 memory. For modern kernels you will need at least 8MB of RAM for
156 booting.
157
158 If you choose this, keep in mind that you need to have the bzip2 tool
159 available to be able to compress the initramfs.
160
161config INITRAMFS_COMPRESSION_LZMA
162 bool "LZMA"
163 depends on RD_LZMA
164 help
165 This algorithm's compression ratio is the best, but it has a large
166 dictionary size, which might cause issues on memory-constrained systems.
167 Decompression speed falls between the other choices. Compression is
168 slowest. The initramfs size is about 33% smaller with LZMA in
169 comparison to gzip.
170
171 If you choose this, keep in mind that you may need to install the xz
172 or lzma tools to be able to compress the initramfs.
173
174config INITRAMFS_COMPRESSION_XZ
175 bool "XZ"
176 depends on RD_XZ
177 help
178 XZ uses the LZMA2 algorithm and has a large dictionary, which may cause
179 problems on memory-constrained systems. The initramfs size is about
180 30% smaller with XZ in comparison to gzip. Decompression speed is
181 better than that of bzip2 but worse than gzip and LZO. Compression is
182 slow.
183
184 If you choose this, keep in mind that you may need to install the xz
185 tool to be able to compress the initramfs.
186
187config INITRAMFS_COMPRESSION_LZO
188 bool "LZO"
189 depends on RD_LZO
190 help
191 Its compression ratio is the second poorest among the choices. The
192 kernel size is about 10% bigger than with gzip. Despite that, its
193 decompression speed is the second fastest and its compression speed
194 is quite fast too.
195
196 If you choose this, keep in mind that you may need to install the lzop
197 tool to be able to compress the initramfs.
198
199config INITRAMFS_COMPRESSION_LZ4
200 bool "LZ4"
201 depends on RD_LZ4
202 help
203 Its compression ratio is the poorest among the choices. The kernel
204 size is about 15% bigger than with gzip; however, its decompression speed
205 is the fastest.
206
207 If you choose this, keep in mind that most distros don't provide lz4
208 by default, which could cause a build failure.
209
210endchoice
211
212config INITRAMFS_COMPRESSION
213 string
214 default "" if INITRAMFS_COMPRESSION_NONE
215 default ".gz" if INITRAMFS_COMPRESSION_GZIP
216 default ".bz2" if INITRAMFS_COMPRESSION_BZIP2
217 default ".lzma" if INITRAMFS_COMPRESSION_LZMA
218 default ".xz" if INITRAMFS_COMPRESSION_XZ
219 default ".lzo" if INITRAMFS_COMPRESSION_LZO
220 default ".lz4" if INITRAMFS_COMPRESSION_LZ4
221 default ".gz" if RD_GZIP
222 default ".lz4" if RD_LZ4
223 default ".lzo" if RD_LZO
224 default ".xz" if RD_XZ
225 default ".lzma" if RD_LZMA
226 default ".bz2" if RD_BZIP2
227 default ""
diff --git a/usr/Makefile b/usr/Makefile
index e767f019accf..17a513268325 100644
--- a/usr/Makefile
+++ b/usr/Makefile
@@ -5,25 +5,7 @@
5klibcdirs:; 5klibcdirs:;
6PHONY += klibcdirs 6PHONY += klibcdirs
7 7
8 8suffix_y = $(CONFIG_INITRAMFS_COMPRESSION)
9# Bzip2
10suffix_$(CONFIG_RD_BZIP2) = .bz2
11
12# Lzma
13suffix_$(CONFIG_RD_LZMA) = .lzma
14
15# XZ
16suffix_$(CONFIG_RD_XZ) = .xz
17
18# Lzo
19suffix_$(CONFIG_RD_LZO) = .lzo
20
21# Lz4
22suffix_$(CONFIG_RD_LZ4) = .lz4
23
24# Gzip
25suffix_$(CONFIG_RD_GZIP) = .gz
26
27AFLAGS_initramfs_data.o += -DINITRAMFS_IMAGE="usr/initramfs_data.cpio$(suffix_y)" 9AFLAGS_initramfs_data.o += -DINITRAMFS_IMAGE="usr/initramfs_data.cpio$(suffix_y)"
28 10
29# Generate builtin.o based on initramfs_data.o 11# Generate builtin.o based on initramfs_data.o
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index efeceb0a222d..3815e940fbea 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -76,16 +76,20 @@ static void async_pf_execute(struct work_struct *work)
76 struct kvm_vcpu *vcpu = apf->vcpu; 76 struct kvm_vcpu *vcpu = apf->vcpu;
77 unsigned long addr = apf->addr; 77 unsigned long addr = apf->addr;
78 gva_t gva = apf->gva; 78 gva_t gva = apf->gva;
79 int locked = 1;
79 80
80 might_sleep(); 81 might_sleep();
81 82
82 /* 83 /*
83 * This work is run asynchromously to the task which owns 84 * This work is run asynchromously to the task which owns
84 * mm and might be done in another context, so we must 85 * mm and might be done in another context, so we must
85 * use FOLL_REMOTE. 86 * access remotely.
86 */ 87 */
87 __get_user_pages_unlocked(NULL, mm, addr, 1, NULL, 88 down_read(&mm->mmap_sem);
88 FOLL_WRITE | FOLL_REMOTE); 89 get_user_pages_remote(NULL, mm, addr, 1, FOLL_WRITE, NULL, NULL,
90 &locked);
91 if (locked)
92 up_read(&mm->mmap_sem);
89 93
90 kvm_async_page_present_sync(vcpu, apf); 94 kvm_async_page_present_sync(vcpu, apf);
91 95
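The hunk above follows the convention for the new locked argument: get_user_pages_remote() may drop mmap_sem itself while handling the fault and clears *locked when it does, so the caller must release the semaphore only if it still holds it. Restated as a hedged sketch using the names from the hunk:

	int locked = 1;

	down_read(&mm->mmap_sem);
	/* May drop mmap_sem internally; if so, *locked is cleared. */
	get_user_pages_remote(NULL, mm, addr, 1, FOLL_WRITE, NULL, NULL,
			      &locked);
	if (locked)
		up_read(&mm->mmap_sem);

An unconditional up_read() here would unbalance the semaphore whenever GUP had already released it.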
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 823544c166be..de102cae7125 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1418,13 +1418,12 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
1418 npages = get_user_page_nowait(addr, write_fault, page); 1418 npages = get_user_page_nowait(addr, write_fault, page);
1419 up_read(&current->mm->mmap_sem); 1419 up_read(&current->mm->mmap_sem);
1420 } else { 1420 } else {
1421 unsigned int flags = FOLL_TOUCH | FOLL_HWPOISON; 1421 unsigned int flags = FOLL_HWPOISON;
1422 1422
1423 if (write_fault) 1423 if (write_fault)
1424 flags |= FOLL_WRITE; 1424 flags |= FOLL_WRITE;
1425 1425
1426 npages = __get_user_pages_unlocked(current, current->mm, addr, 1, 1426 npages = get_user_pages_unlocked(addr, 1, page, flags);
1427 page, flags);
1428 } 1427 }
1429 if (npages != 1) 1428 if (npages != 1)
1430 return npages; 1429 return npages;