author    Linus Torvalds <torvalds@linux-foundation.org>  2017-09-06 23:49:49 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2017-09-06 23:49:49 -0400
commit    d34fc1adf01ff87026da85fb972dc259dc347540 (patch)
tree      27356073d423187157b7cdb69da32b53102fb9e7
parent    1c9fe4409ce3e9c78b1ed96ee8ed699d4f03bf33 (diff)
parent    d2cd9ede6e193dd7d88b6d27399e96229a551b19 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton:

 - various misc bits
 - DAX updates
 - OCFS2
 - most of MM

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (119 commits)
  mm,fork: introduce MADV_WIPEONFORK
  x86,mpx: make mpx depend on x86-64 to free up VMA flag
  mm: add /proc/pid/smaps_rollup
  mm: hugetlb: clear target sub-page last when clearing huge page
  mm: oom: let oom_reap_task and exit_mmap run concurrently
  swap: choose swap device according to numa node
  mm: replace TIF_MEMDIE checks by tsk_is_oom_victim
  mm, oom: do not rely on TIF_MEMDIE for memory reserves access
  z3fold: use per-cpu unbuddied lists
  mm, swap: don't use VMA based swap readahead if HDD is used as swap
  mm, swap: add sysfs interface for VMA based swap readahead
  mm, swap: VMA based swap readahead
  mm, swap: fix swap readahead marking
  mm, swap: add swap readahead hit statistics
  mm/vmalloc.c: don't reinvent the wheel but use existing llist API
  mm/vmstat.c: fix wrong comment
  selftests/memfd: add memfd_create hugetlbfs selftest
  mm/shmem: add hugetlbfs support to memfd_create()
  mm, devm_memremap_pages: use multi-order radix for ZONE_DEVICE lookups
  mm/vmalloc.c: halve the number of comparisons performed in pcpu_get_vm_areas()
  ...
-rw-r--r--  Documentation/ABI/testing/procfs-smaps_rollup | 31
-rw-r--r--  Documentation/ABI/testing/sysfs-block-zram | 8
-rw-r--r--  Documentation/ABI/testing/sysfs-kernel-mm-swap | 26
-rw-r--r--  Documentation/admin-guide/kernel-parameters.txt | 2
-rw-r--r--  Documentation/blockdev/zram.txt | 11
-rw-r--r--  Documentation/filesystems/caching/netfs-api.txt | 2
-rw-r--r--  Documentation/filesystems/dax.txt | 5
-rw-r--r--  Documentation/sysctl/vm.txt | 4
-rw-r--r--  Documentation/vm/numa | 7
-rw-r--r--  Documentation/vm/swap_numa.txt | 69
-rw-r--r--  arch/alpha/include/uapi/asm/mman.h | 14
-rw-r--r--  arch/metag/include/asm/topology.h | 1
-rw-r--r--  arch/mips/include/uapi/asm/mman.h | 14
-rw-r--r--  arch/parisc/include/uapi/asm/mman.h | 14
-rw-r--r--  arch/powerpc/include/uapi/asm/mman.h | 16
-rw-r--r--  arch/x86/Kconfig | 4
-rw-r--r--  arch/x86/include/uapi/asm/mman.h | 3
-rw-r--r--  arch/xtensa/include/uapi/asm/mman.h | 14
-rw-r--r--  drivers/base/memory.c | 30
-rw-r--r--  drivers/block/brd.c | 6
-rw-r--r--  drivers/block/zram/Kconfig | 12
-rw-r--r--  drivers/block/zram/zram_drv.c | 540
-rw-r--r--  drivers/block/zram/zram_drv.h | 11
-rw-r--r--  drivers/gpu/drm/i915/i915_debugfs.c | 4
-rw-r--r--  drivers/gpu/drm/i915/i915_drv.h | 1
-rw-r--r--  drivers/gpu/drm/i915/i915_gem.c | 4
-rw-r--r--  drivers/gpu/drm/i915/i915_gem_gtt.c | 2
-rw-r--r--  drivers/gpu/drm/i915/i915_gem_shrinker.c | 24
-rw-r--r--  drivers/nvdimm/btt.c | 4
-rw-r--r--  drivers/nvdimm/pmem.c | 41
-rw-r--r--  fs/9p/cache.c | 29
-rw-r--r--  fs/afs/cache.c | 43
-rw-r--r--  fs/buffer.c | 31
-rw-r--r--  fs/ceph/cache.c | 31
-rw-r--r--  fs/cifs/cache.c | 31
-rw-r--r--  fs/dax.c | 363
-rw-r--r--  fs/ext2/file.c | 25
-rw-r--r--  fs/ext4/file.c | 48
-rw-r--r--  fs/ext4/inode.c | 15
-rw-r--r--  fs/fscache/page.c | 5
-rw-r--r--  fs/hugetlbfs/inode.c | 30
-rw-r--r--  fs/nfs/fscache-index.c | 40
-rw-r--r--  fs/nilfs2/page.c | 3
-rw-r--r--  fs/ocfs2/acl.c | 2
-rw-r--r--  fs/ocfs2/acl.h | 7
-rw-r--r--  fs/ocfs2/alloc.c | 22
-rw-r--r--  fs/ocfs2/alloc.h | 3
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 42
-rw-r--r--  fs/ocfs2/dir.c | 2
-rw-r--r--  fs/ocfs2/file.c | 7
-rw-r--r--  fs/ocfs2/journal.c | 1
-rw-r--r--  fs/ocfs2/move_extents.c | 2
-rw-r--r--  fs/ocfs2/ocfs2.h | 4
-rw-r--r--  fs/ocfs2/refcounttree.c | 2
-rw-r--r--  fs/ocfs2/suballoc.c | 2
-rw-r--r--  fs/ocfs2/super.c | 1
-rw-r--r--  fs/ocfs2/xattr.c | 2
-rw-r--r--  fs/proc/base.c | 2
-rw-r--r--  fs/proc/internal.h | 3
-rw-r--r--  fs/proc/meminfo.c | 10
-rw-r--r--  fs/proc/task_mmu.c | 197
-rw-r--r--  fs/ramfs/file-nommu.c | 2
-rw-r--r--  fs/sync.c | 5
-rw-r--r--  fs/userfaultfd.c | 21
-rw-r--r--  fs/xfs/xfs_file.c | 2
-rw-r--r--  include/linux/bio.h | 8
-rw-r--r--  include/linux/dax.h | 45
-rw-r--r--  include/linux/fs.h | 2
-rw-r--r--  include/linux/fscache.h | 9
-rw-r--r--  include/linux/memcontrol.h | 52
-rw-r--r--  include/linux/memory_hotplug.h | 2
-rw-r--r--  include/linux/mm.h | 14
-rw-r--r--  include/linux/mm_types.h | 1
-rw-r--r--  include/linux/mmzone.h | 5
-rw-r--r--  include/linux/page-flags.h | 4
-rw-r--r--  include/linux/pagemap.h | 12
-rw-r--r--  include/linux/pagevec.h | 12
-rw-r--r--  include/linux/sched/mm.h | 6
-rw-r--r--  include/linux/shm.h | 17
-rw-r--r--  include/linux/shmem_fs.h | 6
-rw-r--r--  include/linux/shrinker.h | 7
-rw-r--r--  include/linux/slub_def.h | 4
-rw-r--r--  include/linux/swap.h | 78
-rw-r--r--  include/linux/vm_event_item.h | 6
-rw-r--r--  include/linux/vmstat.h | 4
-rw-r--r--  include/trace/events/fs_dax.h | 2
-rw-r--r--  include/trace/events/mmflags.h | 8
-rw-r--r--  include/uapi/asm-generic/hugetlb_encode.h | 34
-rw-r--r--  include/uapi/asm-generic/mman-common.h | 14
-rw-r--r--  include/uapi/linux/memfd.h | 24
-rw-r--r--  include/uapi/linux/mman.h | 22
-rw-r--r--  include/uapi/linux/shm.h | 31
-rw-r--r--  include/uapi/linux/userfaultfd.h | 16
-rw-r--r--  init/Kconfig | 9
-rw-r--r--  init/main.c | 2
-rw-r--r--  kernel/cgroup/cgroup.c | 3
-rw-r--r--  kernel/cgroup/cpuset.c | 9
-rw-r--r--  kernel/fork.c | 27
-rw-r--r--  kernel/memremap.c | 52
-rw-r--r--  mm/Kconfig | 1
-rw-r--r--  mm/filemap.c | 67
-rw-r--r--  mm/gup.c | 2
-rw-r--r--  mm/huge_memory.c | 32
-rw-r--r--  mm/hugetlb.c | 65
-rw-r--r--  mm/internal.h | 12
-rw-r--r--  mm/ksm.c | 2
-rw-r--r--  mm/madvise.c | 13
-rw-r--r--  mm/memcontrol.c | 40
-rw-r--r--  mm/memory.c | 135
-rw-r--r--  mm/memory_hotplug.c | 114
-rw-r--r--  mm/mmap.c | 46
-rw-r--r--  mm/mremap.c | 13
-rw-r--r--  mm/nommu.c | 4
-rw-r--r--  mm/oom_kill.c | 24
-rw-r--r--  mm/page-writeback.c | 4
-rw-r--r--  mm/page_alloc.c | 438
-rw-r--r--  mm/page_ext.c | 6
-rw-r--r--  mm/page_idle.c | 2
-rw-r--r--  mm/page_io.c | 21
-rw-r--r--  mm/page_owner.c | 68
-rw-r--r--  mm/shmem.c | 206
-rw-r--r--  mm/slub.c | 52
-rw-r--r--  mm/sparse-vmemmap.c | 11
-rw-r--r--  mm/sparse.c | 10
-rw-r--r--  mm/swap.c | 24
-rw-r--r--  mm/swap_state.c | 314
-rw-r--r--  mm/swapfile.c | 362
-rw-r--r--  mm/userfaultfd.c | 48
-rw-r--r--  mm/util.c | 2
-rw-r--r--  mm/vmalloc.c | 20
-rw-r--r--  mm/vmscan.c | 113
-rw-r--r--  mm/vmstat.c | 15
-rw-r--r--  mm/z3fold.c | 479
-rw-r--r--  mm/zsmalloc.c | 8
-rw-r--r--  scripts/mod/modpost.c | 27
-rw-r--r--  tools/testing/selftests/memfd/Makefile | 2
-rw-r--r--  tools/testing/selftests/memfd/memfd_test.c | 372
-rw-r--r--  tools/testing/selftests/memfd/run_tests.sh | 69
-rw-r--r--  tools/testing/selftests/vm/userfaultfd.c | 279
139 files changed, 3960 insertions, 2068 deletions
diff --git a/Documentation/ABI/testing/procfs-smaps_rollup b/Documentation/ABI/testing/procfs-smaps_rollup
new file mode 100644
index 000000000000..0a54ed0d63c9
--- /dev/null
+++ b/Documentation/ABI/testing/procfs-smaps_rollup
@@ -0,0 +1,31 @@
+What:           /proc/pid/smaps_rollup
+Date:           August 2017
+Contact:        Daniel Colascione <dancol@google.com>
+Description:
+                This file provides pre-summed memory information for a
+                process.  The format is identical to /proc/pid/smaps,
+                except instead of an entry for each VMA in a process,
+                smaps_rollup has a single entry (tagged "[rollup]")
+                for which each field is the sum of the corresponding
+                fields from all the maps in /proc/pid/smaps.
+                For more details, see the procfs man page.
+
+                Typical output looks like this:
+
+                00100000-ff709000 ---p 00000000 00:00 0    [rollup]
+                Rss:                 884 kB
+                Pss:                 385 kB
+                Shared_Clean:        696 kB
+                Shared_Dirty:          0 kB
+                Private_Clean:       120 kB
+                Private_Dirty:        68 kB
+                Referenced:          884 kB
+                Anonymous:            68 kB
+                LazyFree:              0 kB
+                AnonHugePages:         0 kB
+                ShmemPmdMapped:        0 kB
+                Shared_Hugetlb:        0 kB
+                Private_Hugetlb:       0 kB
+                Swap:                  0 kB
+                SwapPss:               0 kB
+                Locked:              385 kB
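The rollup format above is line-oriented ("Field:  value kB"), so it can be consumed with ordinary stdio. A minimal userspace sketch for illustration, assuming a kernel with this series applied so the file exists:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char line[256];
            FILE *f = fopen("/proc/self/smaps_rollup", "r");

            if (!f) {
                    perror("smaps_rollup");   /* older kernel: ENOENT */
                    return 1;
            }
            /* print the summed Rss and Pss for this process */
            while (fgets(line, sizeof(line), f)) {
                    if (strncmp(line, "Rss:", 4) == 0 ||
                        strncmp(line, "Pss:", 4) == 0)
                            fputs(line, stdout);
            }
            fclose(f);
            return 0;
    }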
diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram
index 451b6d882b2c..c1513c756af1 100644
--- a/Documentation/ABI/testing/sysfs-block-zram
+++ b/Documentation/ABI/testing/sysfs-block-zram
@@ -90,3 +90,11 @@ Description:
 		device's debugging info useful for kernel developers. Its
 		format is not documented intentionally and may change
 		anytime without any notice.
+
+What:		/sys/block/zram<id>/backing_dev
+Date:		June 2017
+Contact:	Minchan Kim <minchan@kernel.org>
+Description:
+		The backing_dev file is read-write and set up backing
+		device for zram to write incompressible pages.
+		For using, user should enable CONFIG_ZRAM_WRITEBACK.
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-swap b/Documentation/ABI/testing/sysfs-kernel-mm-swap
new file mode 100644
index 000000000000..587db52084c7
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-swap
@@ -0,0 +1,26 @@
+What:		/sys/kernel/mm/swap/
+Date:		August 2017
+Contact:	Linux memory management mailing list <linux-mm@kvack.org>
+Description:	Interface for swapping
+
+What:		/sys/kernel/mm/swap/vma_ra_enabled
+Date:		August 2017
+Contact:	Linux memory management mailing list <linux-mm@kvack.org>
+Description:	Enable/disable VMA based swap readahead.
+
+		If set to true, the VMA based swap readahead algorithm
+		will be used for swappable anonymous pages mapped in a
+		VMA, and the global swap readahead algorithm will be
+		still used for tmpfs etc. other users.  If set to
+		false, the global swap readahead algorithm will be
+		used for all swappable pages.
+
+What:		/sys/kernel/mm/swap/vma_ra_max_order
+Date:		August 2017
+Contact:	Linux memory management mailing list <linux-mm@kvack.org>
+Description:	The max readahead size in order for VMA based swap readahead
+
+		VMA based swap readahead algorithm will readahead at
+		most 1 << max_order pages for each readahead.  The
+		real readahead size for each readahead will be scaled
+		according to the estimation algorithm.
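Both knobs are plain sysfs text files, so tuning them programmatically is just a matter of writing the strings described above. An illustrative sketch (requires root; the values chosen here are arbitrary examples, not recommendations):

    #include <stdio.h>

    static int write_sysfs(const char *path, const char *val)
    {
            FILE *f = fopen(path, "w");

            if (!f)
                    return -1;
            fputs(val, f);
            return fclose(f);
    }

    int main(void)
    {
            /* fall back to the global readahead algorithm for all pages */
            if (write_sysfs("/sys/kernel/mm/swap/vma_ra_enabled", "false"))
                    perror("vma_ra_enabled");
            /* cap VMA readahead at 1 << 3 = 8 pages per readahead */
            if (write_sysfs("/sys/kernel/mm/swap/vma_ra_max_order", "3"))
                    perror("vma_ra_max_order");
            return 0;
    }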
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 6996b7727b85..86b0e8ec8ad7 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2783,7 +2783,7 @@
 			Allowed values are enable and disable
 
 	numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA.
-			one of ['zone', 'node', 'default'] can be specified
+			'node', 'default' can be specified
 			This can be set from sysctl after boot.
 			See Documentation/sysctl/vm.txt for details.
 
diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
index 4fced8a21307..257e65714c6a 100644
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -168,6 +168,7 @@ max_comp_streams RW the number of possible concurrent compress operations
 comp_algorithm    RW    show and change the compression algorithm
 compact           WO    trigger memory compaction
 debug_stat        RO    this file is used for zram debugging purposes
+backing_dev       RW    set up backend storage for zram to write out
 
 
 User space is advised to use the following files to read the device statistics.
@@ -231,5 +232,15 @@ line of text and contains the following stats separated by whitespace:
 	resets the disksize to zero. You must set the disksize again
 	before reusing the device.
 
+* Optional Feature
+
+= writeback
+
+With incompressible pages, there is no memory saving with zram.
+Instead, with CONFIG_ZRAM_WRITEBACK, zram can write incompressible page
+to backing storage rather than keeping it in memory.
+User should set up backing device via /sys/block/zramX/backing_dev
+before disksize setting.
+
 Nitin Gupta
 ngupta@vflare.org
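The ordering constraint above (backing_dev must be written before disksize) can be driven from a program the same way the rest of the zram sysfs interface is. A hedged sketch, with /dev/sdb1 standing in for whatever partition the admin dedicates to writeback and CONFIG_ZRAM_WRITEBACK assumed enabled:

    #include <stdio.h>

    static void echo(const char *path, const char *val)
    {
            FILE *f = fopen(path, "w");

            if (!f) {
                    perror(path);
                    return;
            }
            fprintf(f, "%s\n", val);
            fclose(f);
    }

    int main(void)
    {
            /* placeholder backing device; must be a block device */
            echo("/sys/block/zram0/backing_dev", "/dev/sdb1");
            /* disksize is parsed with memparse(), so suffixes work */
            echo("/sys/block/zram0/disksize", "1G");
            return 0;
    }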
diff --git a/Documentation/filesystems/caching/netfs-api.txt b/Documentation/filesystems/caching/netfs-api.txt
index aed6b94160b1..0eb31de3a2c1 100644
--- a/Documentation/filesystems/caching/netfs-api.txt
+++ b/Documentation/filesystems/caching/netfs-api.txt
@@ -151,8 +151,6 @@ To define an object, a structure of the following type should be filled out:
 	void (*mark_pages_cached)(void *cookie_netfs_data,
 				  struct address_space *mapping,
 				  struct pagevec *cached_pvec);
-
-	void (*now_uncached)(void *cookie_netfs_data);
 };
 
 This has the following fields:
diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt
index a7e6e14aeb08..3be3b266be41 100644
--- a/Documentation/filesystems/dax.txt
+++ b/Documentation/filesystems/dax.txt
@@ -63,9 +63,8 @@ Filesystem support consists of
 - implementing an mmap file operation for DAX files which sets the
   VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to
   include handlers for fault, pmd_fault, page_mkwrite, pfn_mkwrite. These
-  handlers should probably call dax_iomap_fault() (for fault and page_mkwrite
-  handlers), dax_iomap_pmd_fault(), dax_pfn_mkwrite() passing the appropriate
-  iomap operations.
+  handlers should probably call dax_iomap_fault() passing the appropriate
+  fault size and iomap operations.
 - calling iomap_zero_range() passing appropriate iomap operations instead of
   block_truncate_page() for DAX files
 - ensuring that there is sufficient locking between reads, writes,
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 48244c42ff52..9baf66a9ef4e 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -572,7 +572,9 @@ See Documentation/nommu-mmap.txt for more information.
 
 numa_zonelist_order
 
-This sysctl is only for NUMA.
+This sysctl is only for NUMA and it is deprecated. Anything but
+Node order will fail!
+
 'where the memory is allocated from' is controlled by zonelists.
 (This documentation ignores ZONE_HIGHMEM/ZONE_DMA32 for simple explanation.
  you may be able to read ZONE_DMA as ZONE_DMA32...)
diff --git a/Documentation/vm/numa b/Documentation/vm/numa
index a08f71647714..a31b85b9bb88 100644
--- a/Documentation/vm/numa
+++ b/Documentation/vm/numa
@@ -79,11 +79,8 @@ memory, Linux must decide whether to order the zonelists such that allocations
 fall back to the same zone type on a different node, or to a different zone
 type on the same node.  This is an important consideration because some zones,
 such as DMA or DMA32, represent relatively scarce resources.  Linux chooses
-a default zonelist order based on the sizes of the various zone types relative
-to the total memory of the node and the total memory of the system.  The
-default zonelist order may be overridden using the numa_zonelist_order kernel
-boot parameter or sysctl.  [see Documentation/admin-guide/kernel-parameters.rst and
-Documentation/sysctl/vm.txt]
+a default Node ordered zonelist. This means it tries to fallback to other zones
+from the same node before using remote nodes which are ordered by NUMA distance.
 
 By default, Linux will attempt to satisfy memory allocation requests from the
 node to which the CPU that executes the request is assigned.  Specifically,
diff --git a/Documentation/vm/swap_numa.txt b/Documentation/vm/swap_numa.txt
new file mode 100644
index 000000000000..d5960c9124f5
--- /dev/null
+++ b/Documentation/vm/swap_numa.txt
@@ -0,0 +1,69 @@
+Automatically bind swap device to numa node
+-------------------------------------------
+
+If the system has more than one swap device and swap device has the node
+information, we can make use of this information to decide which swap
+device to use in get_swap_pages() to get better performance.
+
+
+How to use this feature
+-----------------------
+
+Swap device has priority and that decides the order of it to be used. To make
+use of automatically binding, there is no need to manipulate priority settings
+for swap devices. e.g. on a 2 node machine, assume 2 swap devices swapA and
+swapB, with swapA attached to node 0 and swapB attached to node 1, are going
+to be swapped on. Simply swapping them on by doing:
+# swapon /dev/swapA
+# swapon /dev/swapB
+
+Then node 0 will use the two swap devices in the order of swapA then swapB and
+node 1 will use the two swap devices in the order of swapB then swapA. Note
+that the order of them being swapped on doesn't matter.
+
+A more complex example on a 4 node machine. Assume 6 swap devices are going to
+be swapped on: swapA and swapB are attached to node 0, swapC is attached to
+node 1, swapD and swapE are attached to node 2 and swapF is attached to node 3.
+The way to swap them on is the same as above:
+# swapon /dev/swapA
+# swapon /dev/swapB
+# swapon /dev/swapC
+# swapon /dev/swapD
+# swapon /dev/swapE
+# swapon /dev/swapF
+
+Then node 0 will use them in the order of:
+swapA/swapB -> swapC -> swapD -> swapE -> swapF
+swapA and swapB will be used in a round robin mode before any other swap device.
+
+node 1 will use them in the order of:
+swapC -> swapA -> swapB -> swapD -> swapE -> swapF
+
+node 2 will use them in the order of:
+swapD/swapE -> swapA -> swapB -> swapC -> swapF
+Similarly, swapD and swapE will be used in a round robin mode before any
+other swap devices.
+
+node 3 will use them in the order of:
+swapF -> swapA -> swapB -> swapC -> swapD -> swapE
+
+
+Implementation details
+----------------------
+
+The current code uses a priority based list, swap_avail_list, to decide
+which swap device to use and if multiple swap devices share the same
+priority, they are used round robin. This change here replaces the single
+global swap_avail_list with a per-numa-node list, i.e. for each numa node,
+it sees its own priority based list of available swap devices. Swap
+device's priority can be promoted on its matching node's swap_avail_list.
+
+The current swap device's priority is set as: user can set a >=0 value,
+or the system will pick one starting from -1 then downwards. The priority
+value in the swap_avail_list is the negated value of the swap device's
+priority due to plist being sorted from low to high. The new policy doesn't
+change the semantics for priority >=0 cases, the previous starting from -1
+then downwards now becomes starting from -2 then downwards and -1 is reserved
+as the promoted value. So if multiple swap devices are attached to the same
+node, they will all be promoted to priority -1 on that node's plist and will
+be used round robin before any other swap devices.
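The swapon(8) commands in the usage section above map directly onto the swapon(2) system call; passing no SWAP_FLAG_PREFER leaves the priority negative, so the per-node promotion described in the implementation notes applies. A minimal sketch using the placeholder device names from the text:

    #include <stdio.h>
    #include <sys/swap.h>

    int main(void)
    {
            /* no SWAP_FLAG_PREFER: let the kernel auto-assign priority */
            if (swapon("/dev/swapA", 0))
                    perror("swapon /dev/swapA");
            if (swapon("/dev/swapB", 0))
                    perror("swapon /dev/swapB");
            return 0;
    }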
diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
index 02760f6e6ca4..3b26cc62dadb 100644
--- a/arch/alpha/include/uapi/asm/mman.h
+++ b/arch/alpha/include/uapi/asm/mman.h
@@ -64,20 +64,12 @@
 					   overrides the coredump filter bits */
 #define MADV_DODUMP	17		/* Clear the MADV_NODUMP flag */
 
+#define MADV_WIPEONFORK 18		/* Zero memory on fork, child only */
+#define MADV_KEEPONFORK 19		/* Undo MADV_WIPEONFORK */
+
 /* compatibility flags */
 #define MAP_FILE	0
 
-/*
- * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
- * This gives us 6 bits, which is enough until someone invents 128 bit address
- * spaces.
- *
- * Assume these are all power of twos.
- * When 0 use the default page size.
- */
-#define MAP_HUGE_SHIFT	26
-#define MAP_HUGE_MASK	0x3f
-
 #define PKEY_DISABLE_ACCESS	0x1
 #define PKEY_DISABLE_WRITE	0x2
 #define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
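To see what the two new advice values added above buy userspace, the sketch below marks an anonymous mapping MADV_WIPEONFORK and forks; on a kernel with this series the child reads back zeroes while the parent keeps its data. The fallback define uses the generic value 18 shown here (parisc uses 71, as in its hunk further down), and the madvise() call simply fails with EINVAL on older kernels:

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <sys/wait.h>
    #include <unistd.h>

    #ifndef MADV_WIPEONFORK
    #define MADV_WIPEONFORK 18	/* generic value from this series */
    #endif

    int main(void)
    {
            char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (p == MAP_FAILED)
                    return 1;
            strcpy(p, "secret");
            if (madvise(p, 4096, MADV_WIPEONFORK))
                    perror("madvise");	/* older kernel: EINVAL */

            if (fork() == 0) {
                    /* child: prints an empty string on a patched kernel */
                    printf("child sees:  \"%s\"\n", p);
                    _exit(0);
            }
            wait(NULL);
            printf("parent sees: \"%s\"\n", p);	/* still "secret" */
            return 0;
    }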
diff --git a/arch/metag/include/asm/topology.h b/arch/metag/include/asm/topology.h
index e95f874ded1b..707c7f7b6bea 100644
--- a/arch/metag/include/asm/topology.h
+++ b/arch/metag/include/asm/topology.h
@@ -4,7 +4,6 @@
 #ifdef CONFIG_NUMA
 
 #define cpu_to_node(cpu)	((void)(cpu), 0)
-#define parent_node(node)	((void)(node), 0)
 
 #define cpumask_of_node(node)	((void)node, cpu_online_mask)
 
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h
index 655e2fb5395b..da3216007fe0 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -91,20 +91,12 @@
 					   overrides the coredump filter bits */
 #define MADV_DODUMP	17		/* Clear the MADV_NODUMP flag */
 
+#define MADV_WIPEONFORK 18		/* Zero memory on fork, child only */
+#define MADV_KEEPONFORK 19		/* Undo MADV_WIPEONFORK */
+
 /* compatibility flags */
 #define MAP_FILE	0
 
-/*
- * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
- * This gives us 6 bits, which is enough until someone invents 128 bit address
- * spaces.
- *
- * Assume these are all power of twos.
- * When 0 use the default page size.
- */
-#define MAP_HUGE_SHIFT	26
-#define MAP_HUGE_MASK	0x3f
-
 #define PKEY_DISABLE_ACCESS	0x1
 #define PKEY_DISABLE_WRITE	0x2
 #define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h
index 9a9c2fe4be50..775b5d5e41a1 100644
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -57,6 +57,9 @@
 					   overrides the coredump filter bits */
 #define MADV_DODUMP	70		/* Clear the MADV_NODUMP flag */
 
+#define MADV_WIPEONFORK 71		/* Zero memory on fork, child only */
+#define MADV_KEEPONFORK 72		/* Undo MADV_WIPEONFORK */
+
 #define MADV_HWPOISON     100		/* poison a page for testing */
 #define MADV_SOFT_OFFLINE 101		/* soft offline page for testing */
 
@@ -64,17 +67,6 @@
 #define MAP_FILE	0
 #define MAP_VARIABLE	0
 
-/*
- * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
- * This gives us 6 bits, which is enough until someone invents 128 bit address
- * spaces.
- *
- * Assume these are all power of twos.
- * When 0 use the default page size.
- */
-#define MAP_HUGE_SHIFT	26
-#define MAP_HUGE_MASK	0x3f
-
 #define PKEY_DISABLE_ACCESS	0x1
 #define PKEY_DISABLE_WRITE	0x2
 #define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
diff --git a/arch/powerpc/include/uapi/asm/mman.h b/arch/powerpc/include/uapi/asm/mman.h
index ab45cc2f3101..03c06ba7464f 100644
--- a/arch/powerpc/include/uapi/asm/mman.h
+++ b/arch/powerpc/include/uapi/asm/mman.h
@@ -29,20 +29,4 @@
 #define MAP_STACK	0x20000		/* give out an address that is best suited for process/thread stacks */
 #define MAP_HUGETLB	0x40000		/* create a huge page mapping */
 
-/*
- * When MAP_HUGETLB is set, bits [26:31] of the flags argument to mmap(2),
- * encode the log2 of the huge page size. A value of zero indicates that the
- * default huge page size should be used. To use a non-default huge page size,
- * one of these defines can be used, or the size can be encoded by hand. Note
- * that on most systems only a subset, or possibly none, of these sizes will be
- * available.
- */
-#define MAP_HUGE_512KB	(19 << MAP_HUGE_SHIFT)	/* 512KB HugeTLB Page */
-#define MAP_HUGE_1MB	(20 << MAP_HUGE_SHIFT)	/* 1MB   HugeTLB Page */
-#define MAP_HUGE_2MB	(21 << MAP_HUGE_SHIFT)	/* 2MB   HugeTLB Page */
-#define MAP_HUGE_8MB	(23 << MAP_HUGE_SHIFT)	/* 8MB   HugeTLB Page */
-#define MAP_HUGE_16MB	(24 << MAP_HUGE_SHIFT)	/* 16MB  HugeTLB Page */
-#define MAP_HUGE_1GB	(30 << MAP_HUGE_SHIFT)	/* 1GB   HugeTLB Page */
-#define MAP_HUGE_16GB	(34 << MAP_HUGE_SHIFT)	/* 16GB  HugeTLB Page */
-
 #endif /* _UAPI_ASM_POWERPC_MMAN_H */
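The comments removed from these per-arch headers (the encoding now lives in the shared include/uapi/asm-generic/hugetlb_encode.h added by this series, per the diffstat) describe how mmap() callers pick a huge page size: bits [26:31] of the flags carry log2 of the requested size. A hedged sketch requesting one 2MB huge page (it fails unless hugepages are reserved on the system):

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <sys/mman.h>

    #ifndef MAP_HUGE_SHIFT
    #define MAP_HUGE_SHIFT 26
    #endif

    int main(void)
    {
            size_t len = 2UL << 20;			/* one 2MB huge page */
            int huge_2mb = 21 << MAP_HUGE_SHIFT;	/* log2(2MB) = 21 */
            void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | huge_2mb,
                           -1, 0);

            if (p == MAP_FAILED) {
                    perror("mmap(MAP_HUGETLB)");	/* needs reserved hugepages */
                    return 1;
            }
            munmap(p, len);
            return 0;
    }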
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index acb366bf6bc1..4b278a33ccbb 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1806,7 +1806,9 @@ config X86_SMAP
 config X86_INTEL_MPX
 	prompt "Intel MPX (Memory Protection Extensions)"
 	def_bool n
-	depends on CPU_SUP_INTEL
+	# Note: only available in 64-bit mode due to VMA flags shortage
+	depends on CPU_SUP_INTEL && X86_64
+	select ARCH_USES_HIGH_VMA_FLAGS
 	---help---
 	  MPX provides hardware features that can be used in
 	  conjunction with compiler-instrumented code to check
diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h
index 39bca7fac087..3be08f07695c 100644
--- a/arch/x86/include/uapi/asm/mman.h
+++ b/arch/x86/include/uapi/asm/mman.h
@@ -3,9 +3,6 @@
 
 #define MAP_32BIT	0x40		/* only give out 32bit addresses */
 
-#define MAP_HUGE_2MB    (21 << MAP_HUGE_SHIFT)
-#define MAP_HUGE_1GB    (30 << MAP_HUGE_SHIFT)
-
 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
 /*
  * Take the 4 protection key bits out of the vma->vm_flags
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h
index 24365b30aae9..b15b278aa314 100644
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -103,20 +103,12 @@
 					   overrides the coredump filter bits */
 #define MADV_DODUMP	17		/* Clear the MADV_NODUMP flag */
 
+#define MADV_WIPEONFORK 18		/* Zero memory on fork, child only */
+#define MADV_KEEPONFORK 19		/* Undo MADV_WIPEONFORK */
+
 /* compatibility flags */
 #define MAP_FILE	0
 
-/*
- * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
- * This gives us 6 bits, which is enough until someone invents 128 bit address
- * spaces.
- *
- * Assume these are all power of twos.
- * When 0 use the default page size.
- */
-#define MAP_HUGE_SHIFT	26
-#define MAP_HUGE_MASK	0x3f
-
 #define PKEY_DISABLE_ACCESS	0x1
 #define PKEY_DISABLE_WRITE	0x2
 #define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index c7c4e0325cdb..4e3b61cda520 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -388,6 +388,19 @@ static ssize_t show_phys_device(struct device *dev,
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
+static void print_allowed_zone(char *buf, int nid, unsigned long start_pfn,
+			       unsigned long nr_pages, int online_type,
+			       struct zone *default_zone)
+{
+	struct zone *zone;
+
+	zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
+	if (zone != default_zone) {
+		strcat(buf, " ");
+		strcat(buf, zone->name);
+	}
+}
+
 static ssize_t show_valid_zones(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
@@ -395,7 +408,7 @@ static ssize_t show_valid_zones(struct device *dev,
 	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
 	unsigned long valid_start_pfn, valid_end_pfn;
-	bool append = false;
+	struct zone *default_zone;
 	int nid;
 
 	/*
@@ -418,16 +431,13 @@ static ssize_t show_valid_zones(struct device *dev,
 	}
 
 	nid = pfn_to_nid(start_pfn);
-	if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL)) {
-		strcat(buf, default_zone_for_pfn(nid, start_pfn, nr_pages)->name);
-		append = true;
-	}
+	default_zone = zone_for_pfn_range(MMOP_ONLINE_KEEP, nid, start_pfn, nr_pages);
+	strcat(buf, default_zone->name);
 
-	if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE)) {
-		if (append)
-			strcat(buf, " ");
-		strcat(buf, NODE_DATA(nid)->node_zones[ZONE_MOVABLE].name);
-	}
+	print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL,
+			   default_zone);
+	print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE,
+			   default_zone);
 out:
 	strcat(buf, "\n");
 
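The strings built by show_valid_zones()/print_allowed_zone() above surface in /sys/devices/system/memory/memory<N>/valid_zones. A small sketch that dumps them for the first few memory blocks (block numbering and presence vary by machine, so missing blocks are simply skipped):

    #include <stdio.h>

    int main(void)
    {
            char path[128], buf[128];
            int i;

            for (i = 0; i < 32; i++) {	/* probe the first 32 blocks */
                    FILE *f;

                    snprintf(path, sizeof(path),
                             "/sys/devices/system/memory/memory%d/valid_zones", i);
                    f = fopen(path, "r");
                    if (!f)
                            continue;
                    if (fgets(buf, sizeof(buf), f))
                            printf("memory%d: %s", i, buf);
                    fclose(f);
            }
            return 0;
    }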
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 104b71c0490d..5d9ed0616413 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -326,7 +326,11 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector,
 			struct page *page, bool is_write)
 {
 	struct brd_device *brd = bdev->bd_disk->private_data;
-	int err = brd_do_bvec(brd, page, PAGE_SIZE, 0, is_write, sector);
+	int err;
+
+	if (PageTransHuge(page))
+		return -ENOTSUPP;
+	err = brd_do_bvec(brd, page, PAGE_SIZE, 0, is_write, sector);
 	page_endio(page, is_write, err);
 	return err;
 }
diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig
index b8ecba6dcd3b..7cd4a8ec3c8f 100644
--- a/drivers/block/zram/Kconfig
+++ b/drivers/block/zram/Kconfig
@@ -13,3 +13,15 @@ config ZRAM
 	  disks and maybe many more.
 
 	  See zram.txt for more information.
+
+config ZRAM_WRITEBACK
+	bool "Write back incompressible page to backing device"
+	depends on ZRAM
+	default n
+	help
+	  With incompressible page, there is no memory saving to keep it
+	  in memory. Instead, write it out to backing device.
+	  For this feature, admin should set up backing device via
+	  /sys/block/zramX/backing_dev.
+
+	  See zram.txt for more information.
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 3b1b6340ba13..4a0438c4ef2a 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -270,6 +270,349 @@ static ssize_t mem_used_max_store(struct device *dev,
270 return len; 270 return len;
271} 271}
272 272
273#ifdef CONFIG_ZRAM_WRITEBACK
274static bool zram_wb_enabled(struct zram *zram)
275{
276 return zram->backing_dev;
277}
278
279static void reset_bdev(struct zram *zram)
280{
281 struct block_device *bdev;
282
283 if (!zram_wb_enabled(zram))
284 return;
285
286 bdev = zram->bdev;
287 if (zram->old_block_size)
288 set_blocksize(bdev, zram->old_block_size);
289 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
290 /* hope filp_close flush all of IO */
291 filp_close(zram->backing_dev, NULL);
292 zram->backing_dev = NULL;
293 zram->old_block_size = 0;
294 zram->bdev = NULL;
295
296 kvfree(zram->bitmap);
297 zram->bitmap = NULL;
298}
299
300static ssize_t backing_dev_show(struct device *dev,
301 struct device_attribute *attr, char *buf)
302{
303 struct zram *zram = dev_to_zram(dev);
304 struct file *file = zram->backing_dev;
305 char *p;
306 ssize_t ret;
307
308 down_read(&zram->init_lock);
309 if (!zram_wb_enabled(zram)) {
310 memcpy(buf, "none\n", 5);
311 up_read(&zram->init_lock);
312 return 5;
313 }
314
315 p = file_path(file, buf, PAGE_SIZE - 1);
316 if (IS_ERR(p)) {
317 ret = PTR_ERR(p);
318 goto out;
319 }
320
321 ret = strlen(p);
322 memmove(buf, p, ret);
323 buf[ret++] = '\n';
324out:
325 up_read(&zram->init_lock);
326 return ret;
327}
328
329static ssize_t backing_dev_store(struct device *dev,
330 struct device_attribute *attr, const char *buf, size_t len)
331{
332 char *file_name;
333 struct file *backing_dev = NULL;
334 struct inode *inode;
335 struct address_space *mapping;
336 unsigned int bitmap_sz, old_block_size = 0;
337 unsigned long nr_pages, *bitmap = NULL;
338 struct block_device *bdev = NULL;
339 int err;
340 struct zram *zram = dev_to_zram(dev);
341
342 file_name = kmalloc(PATH_MAX, GFP_KERNEL);
343 if (!file_name)
344 return -ENOMEM;
345
346 down_write(&zram->init_lock);
347 if (init_done(zram)) {
348 pr_info("Can't setup backing device for initialized device\n");
349 err = -EBUSY;
350 goto out;
351 }
352
353 strlcpy(file_name, buf, len);
354
355 backing_dev = filp_open(file_name, O_RDWR|O_LARGEFILE, 0);
356 if (IS_ERR(backing_dev)) {
357 err = PTR_ERR(backing_dev);
358 backing_dev = NULL;
359 goto out;
360 }
361
362 mapping = backing_dev->f_mapping;
363 inode = mapping->host;
364
365 /* Support only block device in this moment */
366 if (!S_ISBLK(inode->i_mode)) {
367 err = -ENOTBLK;
368 goto out;
369 }
370
371 bdev = bdgrab(I_BDEV(inode));
372 err = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, zram);
373 if (err < 0)
374 goto out;
375
376 nr_pages = i_size_read(inode) >> PAGE_SHIFT;
377 bitmap_sz = BITS_TO_LONGS(nr_pages) * sizeof(long);
378 bitmap = kvzalloc(bitmap_sz, GFP_KERNEL);
379 if (!bitmap) {
380 err = -ENOMEM;
381 goto out;
382 }
383
384 old_block_size = block_size(bdev);
385 err = set_blocksize(bdev, PAGE_SIZE);
386 if (err)
387 goto out;
388
389 reset_bdev(zram);
390 spin_lock_init(&zram->bitmap_lock);
391
392 zram->old_block_size = old_block_size;
393 zram->bdev = bdev;
394 zram->backing_dev = backing_dev;
395 zram->bitmap = bitmap;
396 zram->nr_pages = nr_pages;
397 up_write(&zram->init_lock);
398
399 pr_info("setup backing device %s\n", file_name);
400 kfree(file_name);
401
402 return len;
403out:
404 if (bitmap)
405 kvfree(bitmap);
406
407 if (bdev)
408 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
409
410 if (backing_dev)
411 filp_close(backing_dev, NULL);
412
413 up_write(&zram->init_lock);
414
415 kfree(file_name);
416
417 return err;
418}
419
420static unsigned long get_entry_bdev(struct zram *zram)
421{
422 unsigned long entry;
423
424 spin_lock(&zram->bitmap_lock);
425 /* skip 0 bit to confuse zram.handle = 0 */
426 entry = find_next_zero_bit(zram->bitmap, zram->nr_pages, 1);
427 if (entry == zram->nr_pages) {
428 spin_unlock(&zram->bitmap_lock);
429 return 0;
430 }
431
432 set_bit(entry, zram->bitmap);
433 spin_unlock(&zram->bitmap_lock);
434
435 return entry;
436}
437
438static void put_entry_bdev(struct zram *zram, unsigned long entry)
439{
440 int was_set;
441
442 spin_lock(&zram->bitmap_lock);
443 was_set = test_and_clear_bit(entry, zram->bitmap);
444 spin_unlock(&zram->bitmap_lock);
445 WARN_ON_ONCE(!was_set);
446}
447
448void zram_page_end_io(struct bio *bio)
449{
450 struct page *page = bio->bi_io_vec[0].bv_page;
451
452 page_endio(page, op_is_write(bio_op(bio)),
453 blk_status_to_errno(bio->bi_status));
454 bio_put(bio);
455}
456
457/*
458 * Returns 1 if the submission is successful.
459 */
460static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec,
461 unsigned long entry, struct bio *parent)
462{
463 struct bio *bio;
464
465 bio = bio_alloc(GFP_ATOMIC, 1);
466 if (!bio)
467 return -ENOMEM;
468
469 bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9);
470 bio->bi_bdev = zram->bdev;
471 if (!bio_add_page(bio, bvec->bv_page, bvec->bv_len, bvec->bv_offset)) {
472 bio_put(bio);
473 return -EIO;
474 }
475
476 if (!parent) {
477 bio->bi_opf = REQ_OP_READ;
478 bio->bi_end_io = zram_page_end_io;
479 } else {
480 bio->bi_opf = parent->bi_opf;
481 bio_chain(bio, parent);
482 }
483
484 submit_bio(bio);
485 return 1;
486}
487
488struct zram_work {
489 struct work_struct work;
490 struct zram *zram;
491 unsigned long entry;
492 struct bio *bio;
493};
494
495#if PAGE_SIZE != 4096
496static void zram_sync_read(struct work_struct *work)
497{
498 struct bio_vec bvec;
499 struct zram_work *zw = container_of(work, struct zram_work, work);
500 struct zram *zram = zw->zram;
501 unsigned long entry = zw->entry;
502 struct bio *bio = zw->bio;
503
504 read_from_bdev_async(zram, &bvec, entry, bio);
505}
506
507/*
508 * Block layer want one ->make_request_fn to be active at a time
509 * so if we use chained IO with parent IO in same context,
510 * it's a deadlock. To avoid, it, it uses worker thread context.
511 */
512static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec,
513 unsigned long entry, struct bio *bio)
514{
515 struct zram_work work;
516
517 work.zram = zram;
518 work.entry = entry;
519 work.bio = bio;
520
521 INIT_WORK_ONSTACK(&work.work, zram_sync_read);
522 queue_work(system_unbound_wq, &work.work);
523 flush_work(&work.work);
524 destroy_work_on_stack(&work.work);
525
526 return 1;
527}
528#else
529static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec,
530 unsigned long entry, struct bio *bio)
531{
532 WARN_ON(1);
533 return -EIO;
534}
535#endif
536
537static int read_from_bdev(struct zram *zram, struct bio_vec *bvec,
538 unsigned long entry, struct bio *parent, bool sync)
539{
540 if (sync)
541 return read_from_bdev_sync(zram, bvec, entry, parent);
542 else
543 return read_from_bdev_async(zram, bvec, entry, parent);
544}
545
546static int write_to_bdev(struct zram *zram, struct bio_vec *bvec,
547 u32 index, struct bio *parent,
548 unsigned long *pentry)
549{
550 struct bio *bio;
551 unsigned long entry;
552
553 bio = bio_alloc(GFP_ATOMIC, 1);
554 if (!bio)
555 return -ENOMEM;
556
557 entry = get_entry_bdev(zram);
558 if (!entry) {
559 bio_put(bio);
560 return -ENOSPC;
561 }
562
563 bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9);
564 bio->bi_bdev = zram->bdev;
565 if (!bio_add_page(bio, bvec->bv_page, bvec->bv_len,
566 bvec->bv_offset)) {
567 bio_put(bio);
568 put_entry_bdev(zram, entry);
569 return -EIO;
570 }
571
572 if (!parent) {
573 bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
574 bio->bi_end_io = zram_page_end_io;
575 } else {
576 bio->bi_opf = parent->bi_opf;
577 bio_chain(bio, parent);
578 }
579
580 submit_bio(bio);
581 *pentry = entry;
582
583 return 0;
584}
585
586static void zram_wb_clear(struct zram *zram, u32 index)
587{
588 unsigned long entry;
589
590 zram_clear_flag(zram, index, ZRAM_WB);
591 entry = zram_get_element(zram, index);
592 zram_set_element(zram, index, 0);
593 put_entry_bdev(zram, entry);
594}
595
596#else
597static bool zram_wb_enabled(struct zram *zram) { return false; }
598static inline void reset_bdev(struct zram *zram) {};
599static int write_to_bdev(struct zram *zram, struct bio_vec *bvec,
600 u32 index, struct bio *parent,
601 unsigned long *pentry)
602
603{
604 return -EIO;
605}
606
607static int read_from_bdev(struct zram *zram, struct bio_vec *bvec,
608 unsigned long entry, struct bio *parent, bool sync)
609{
610 return -EIO;
611}
612static void zram_wb_clear(struct zram *zram, u32 index) {}
613#endif
614
615
273/* 616/*
274 * We switched to per-cpu streams and this attr is not needed anymore. 617 * We switched to per-cpu streams and this attr is not needed anymore.
275 * However, we will keep it around for some time, because: 618 * However, we will keep it around for some time, because:
@@ -453,30 +796,6 @@ static bool zram_same_page_read(struct zram *zram, u32 index,
453 return false; 796 return false;
454} 797}
455 798
456static bool zram_same_page_write(struct zram *zram, u32 index,
457 struct page *page)
458{
459 unsigned long element;
460 void *mem = kmap_atomic(page);
461
462 if (page_same_filled(mem, &element)) {
463 kunmap_atomic(mem);
464 /* Free memory associated with this sector now. */
465 zram_slot_lock(zram, index);
466 zram_free_page(zram, index);
467 zram_set_flag(zram, index, ZRAM_SAME);
468 zram_set_element(zram, index, element);
469 zram_slot_unlock(zram, index);
470
471 atomic64_inc(&zram->stats.same_pages);
472 atomic64_inc(&zram->stats.pages_stored);
473 return true;
474 }
475 kunmap_atomic(mem);
476
477 return false;
478}
479
480static void zram_meta_free(struct zram *zram, u64 disksize) 799static void zram_meta_free(struct zram *zram, u64 disksize)
481{ 800{
482 size_t num_pages = disksize >> PAGE_SHIFT; 801 size_t num_pages = disksize >> PAGE_SHIFT;
@@ -515,7 +834,13 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize)
515 */ 834 */
516static void zram_free_page(struct zram *zram, size_t index) 835static void zram_free_page(struct zram *zram, size_t index)
517{ 836{
518 unsigned long handle = zram_get_handle(zram, index); 837 unsigned long handle;
838
839 if (zram_wb_enabled(zram) && zram_test_flag(zram, index, ZRAM_WB)) {
840 zram_wb_clear(zram, index);
841 atomic64_dec(&zram->stats.pages_stored);
842 return;
843 }
519 844
520 /* 845 /*
521 * No memory is allocated for same element filled pages. 846 * No memory is allocated for same element filled pages.
@@ -529,6 +854,7 @@ static void zram_free_page(struct zram *zram, size_t index)
529 return; 854 return;
530 } 855 }
531 856
857 handle = zram_get_handle(zram, index);
532 if (!handle) 858 if (!handle)
533 return; 859 return;
534 860
@@ -542,13 +868,31 @@ static void zram_free_page(struct zram *zram, size_t index)
542 zram_set_obj_size(zram, index, 0); 868 zram_set_obj_size(zram, index, 0);
543} 869}
544 870
545static int zram_decompress_page(struct zram *zram, struct page *page, u32 index) 871static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index,
872 struct bio *bio, bool partial_io)
546{ 873{
547 int ret; 874 int ret;
548 unsigned long handle; 875 unsigned long handle;
549 unsigned int size; 876 unsigned int size;
550 void *src, *dst; 877 void *src, *dst;
551 878
879 if (zram_wb_enabled(zram)) {
880 zram_slot_lock(zram, index);
881 if (zram_test_flag(zram, index, ZRAM_WB)) {
882 struct bio_vec bvec;
883
884 zram_slot_unlock(zram, index);
885
886 bvec.bv_page = page;
887 bvec.bv_len = PAGE_SIZE;
888 bvec.bv_offset = 0;
889 return read_from_bdev(zram, &bvec,
890 zram_get_element(zram, index),
891 bio, partial_io);
892 }
893 zram_slot_unlock(zram, index);
894 }
895
552 if (zram_same_page_read(zram, index, page, 0, PAGE_SIZE)) 896 if (zram_same_page_read(zram, index, page, 0, PAGE_SIZE))
553 return 0; 897 return 0;
554 898
@@ -581,7 +925,7 @@ static int zram_decompress_page(struct zram *zram, struct page *page, u32 index)
581} 925}
582 926
583static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, 927static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
584 u32 index, int offset) 928 u32 index, int offset, struct bio *bio)
585{ 929{
586 int ret; 930 int ret;
587 struct page *page; 931 struct page *page;
@@ -594,7 +938,7 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
594 return -ENOMEM; 938 return -ENOMEM;
595 } 939 }
596 940
597 ret = zram_decompress_page(zram, page, index); 941 ret = __zram_bvec_read(zram, page, index, bio, is_partial_io(bvec));
598 if (unlikely(ret)) 942 if (unlikely(ret))
599 goto out; 943 goto out;
600 944
@@ -613,30 +957,57 @@ out:
613 return ret; 957 return ret;
614} 958}
615 959
616static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm, 960static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
617 struct page *page, 961 u32 index, struct bio *bio)
618 unsigned long *out_handle, unsigned int *out_comp_len)
619{ 962{
620 int ret; 963 int ret = 0;
621 unsigned int comp_len;
622 void *src;
623 unsigned long alloced_pages; 964 unsigned long alloced_pages;
624 unsigned long handle = 0; 965 unsigned long handle = 0;
966 unsigned int comp_len = 0;
967 void *src, *dst, *mem;
968 struct zcomp_strm *zstrm;
969 struct page *page = bvec->bv_page;
970 unsigned long element = 0;
971 enum zram_pageflags flags = 0;
972 bool allow_wb = true;
973
974 mem = kmap_atomic(page);
975 if (page_same_filled(mem, &element)) {
976 kunmap_atomic(mem);
977 /* Free memory associated with this sector now. */
978 flags = ZRAM_SAME;
979 atomic64_inc(&zram->stats.same_pages);
980 goto out;
981 }
982 kunmap_atomic(mem);
625 983
626compress_again: 984compress_again:
985 zstrm = zcomp_stream_get(zram->comp);
627 src = kmap_atomic(page); 986 src = kmap_atomic(page);
628 ret = zcomp_compress(*zstrm, src, &comp_len); 987 ret = zcomp_compress(zstrm, src, &comp_len);
629 kunmap_atomic(src); 988 kunmap_atomic(src);
630 989
631 if (unlikely(ret)) { 990 if (unlikely(ret)) {
991 zcomp_stream_put(zram->comp);
632 pr_err("Compression failed! err=%d\n", ret); 992 pr_err("Compression failed! err=%d\n", ret);
633 if (handle) 993 zs_free(zram->mem_pool, handle);
634 zs_free(zram->mem_pool, handle);
635 return ret; 994 return ret;
636 } 995 }
637 996
638 if (unlikely(comp_len > max_zpage_size)) 997 if (unlikely(comp_len > max_zpage_size)) {
998 if (zram_wb_enabled(zram) && allow_wb) {
999 zcomp_stream_put(zram->comp);
1000 ret = write_to_bdev(zram, bvec, index, bio, &element);
1001 if (!ret) {
1002 flags = ZRAM_WB;
1003 ret = 1;
1004 goto out;
1005 }
1006 allow_wb = false;
1007 goto compress_again;
1008 }
639 comp_len = PAGE_SIZE; 1009 comp_len = PAGE_SIZE;
1010 }
640 1011
641 /* 1012 /*
642 * handle allocation has 2 paths: 1013 * handle allocation has 2 paths:
@@ -663,7 +1034,6 @@ compress_again:
663 handle = zs_malloc(zram->mem_pool, comp_len, 1034 handle = zs_malloc(zram->mem_pool, comp_len,
664 GFP_NOIO | __GFP_HIGHMEM | 1035 GFP_NOIO | __GFP_HIGHMEM |
665 __GFP_MOVABLE); 1036 __GFP_MOVABLE);
666 *zstrm = zcomp_stream_get(zram->comp);
667 if (handle) 1037 if (handle)
668 goto compress_again; 1038 goto compress_again;
669 return -ENOMEM; 1039 return -ENOMEM;
@@ -673,34 +1043,11 @@ compress_again:
673 update_used_max(zram, alloced_pages); 1043 update_used_max(zram, alloced_pages);
674 1044
675 if (zram->limit_pages && alloced_pages > zram->limit_pages) { 1045 if (zram->limit_pages && alloced_pages > zram->limit_pages) {
1046 zcomp_stream_put(zram->comp);
676 zs_free(zram->mem_pool, handle); 1047 zs_free(zram->mem_pool, handle);
677 return -ENOMEM; 1048 return -ENOMEM;
678 } 1049 }
679 1050
680 *out_handle = handle;
681 *out_comp_len = comp_len;
682 return 0;
683}
684
685static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
686{
687 int ret;
688 unsigned long handle;
689 unsigned int comp_len;
690 void *src, *dst;
691 struct zcomp_strm *zstrm;
692 struct page *page = bvec->bv_page;
693
694 if (zram_same_page_write(zram, index, page))
695 return 0;
696
697 zstrm = zcomp_stream_get(zram->comp);
698 ret = zram_compress(zram, &zstrm, page, &handle, &comp_len);
699 if (ret) {
700 zcomp_stream_put(zram->comp);
701 return ret;
702 }
703
704 dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO); 1051 dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO);
705 1052
706 src = zstrm->buffer; 1053 src = zstrm->buffer;
@@ -712,25 +1059,31 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
712 1059
713 zcomp_stream_put(zram->comp); 1060 zcomp_stream_put(zram->comp);
714 zs_unmap_object(zram->mem_pool, handle); 1061 zs_unmap_object(zram->mem_pool, handle);
715 1062 atomic64_add(comp_len, &zram->stats.compr_data_size);
1063out:
716 /* 1064 /*
717 * Free memory associated with this sector 1065 * Free memory associated with this sector
718 * before overwriting unused sectors. 1066 * before overwriting unused sectors.
719 */ 1067 */
720 zram_slot_lock(zram, index); 1068 zram_slot_lock(zram, index);
721 zram_free_page(zram, index); 1069 zram_free_page(zram, index);
722 zram_set_handle(zram, index, handle); 1070
723 zram_set_obj_size(zram, index, comp_len); 1071 if (flags) {
1072 zram_set_flag(zram, index, flags);
1073 zram_set_element(zram, index, element);
1074 } else {
1075 zram_set_handle(zram, index, handle);
1076 zram_set_obj_size(zram, index, comp_len);
1077 }
724 zram_slot_unlock(zram, index); 1078 zram_slot_unlock(zram, index);
725 1079
726 /* Update stats */ 1080 /* Update stats */
727 atomic64_add(comp_len, &zram->stats.compr_data_size);
728 atomic64_inc(&zram->stats.pages_stored); 1081 atomic64_inc(&zram->stats.pages_stored);
729 return 0; 1082 return ret;
730} 1083}
731 1084
732static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, 1085static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
733 u32 index, int offset) 1086 u32 index, int offset, struct bio *bio)
734{ 1087{
735 int ret; 1088 int ret;
736 struct page *page = NULL; 1089 struct page *page = NULL;
@@ -748,7 +1101,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
748 if (!page) 1101 if (!page)
749 return -ENOMEM; 1102 return -ENOMEM;
750 1103
751 ret = zram_decompress_page(zram, page, index); 1104 ret = __zram_bvec_read(zram, page, index, bio, true);
752 if (ret) 1105 if (ret)
753 goto out; 1106 goto out;
754 1107
@@ -763,7 +1116,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
763 vec.bv_offset = 0; 1116 vec.bv_offset = 0;
764 } 1117 }
765 1118
766 ret = __zram_bvec_write(zram, &vec, index); 1119 ret = __zram_bvec_write(zram, &vec, index, bio);
767out: 1120out:
768 if (is_partial_io(bvec)) 1121 if (is_partial_io(bvec))
769 __free_page(page); 1122 __free_page(page);
@@ -808,8 +1161,13 @@ static void zram_bio_discard(struct zram *zram, u32 index,
808 } 1161 }
809} 1162}
810 1163
1164/*
1165 * Returns errno if it has some problem. Otherwise return 0 or 1.
1166 * Returns 0 if IO request was done synchronously
1167 * Returns 1 if IO request was successfully submitted.
1168 */
811static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, 1169static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
812 int offset, bool is_write) 1170 int offset, bool is_write, struct bio *bio)
813{ 1171{
814 unsigned long start_time = jiffies; 1172 unsigned long start_time = jiffies;
815 int rw_acct = is_write ? REQ_OP_WRITE : REQ_OP_READ; 1173 int rw_acct = is_write ? REQ_OP_WRITE : REQ_OP_READ;
@@ -820,16 +1178,16 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
820 1178
821 if (!is_write) { 1179 if (!is_write) {
822 atomic64_inc(&zram->stats.num_reads); 1180 atomic64_inc(&zram->stats.num_reads);
823 ret = zram_bvec_read(zram, bvec, index, offset); 1181 ret = zram_bvec_read(zram, bvec, index, offset, bio);
824 flush_dcache_page(bvec->bv_page); 1182 flush_dcache_page(bvec->bv_page);
825 } else { 1183 } else {
826 atomic64_inc(&zram->stats.num_writes); 1184 atomic64_inc(&zram->stats.num_writes);
827 ret = zram_bvec_write(zram, bvec, index, offset); 1185 ret = zram_bvec_write(zram, bvec, index, offset, bio);
828 } 1186 }
829 1187
830 generic_end_io_acct(rw_acct, &zram->disk->part0, start_time); 1188 generic_end_io_acct(rw_acct, &zram->disk->part0, start_time);
831 1189
832 if (unlikely(ret)) { 1190 if (unlikely(ret < 0)) {
833 if (!is_write) 1191 if (!is_write)
834 atomic64_inc(&zram->stats.failed_reads); 1192 atomic64_inc(&zram->stats.failed_reads);
835 else 1193 else
@@ -868,7 +1226,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
868 bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset, 1226 bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset,
869 unwritten); 1227 unwritten);
870 if (zram_bvec_rw(zram, &bv, index, offset, 1228 if (zram_bvec_rw(zram, &bv, index, offset,
871 op_is_write(bio_op(bio))) < 0) 1229 op_is_write(bio_op(bio)), bio) < 0)
872 goto out; 1230 goto out;
873 1231
874 bv.bv_offset += bv.bv_len; 1232 bv.bv_offset += bv.bv_len;
@@ -922,16 +1280,18 @@ static void zram_slot_free_notify(struct block_device *bdev,
922static int zram_rw_page(struct block_device *bdev, sector_t sector, 1280static int zram_rw_page(struct block_device *bdev, sector_t sector,
923 struct page *page, bool is_write) 1281 struct page *page, bool is_write)
924{ 1282{
925 int offset, err = -EIO; 1283 int offset, ret;
926 u32 index; 1284 u32 index;
927 struct zram *zram; 1285 struct zram *zram;
928 struct bio_vec bv; 1286 struct bio_vec bv;
929 1287
1288 if (PageTransHuge(page))
1289 return -ENOTSUPP;
930 zram = bdev->bd_disk->private_data; 1290 zram = bdev->bd_disk->private_data;
931 1291
932 if (!valid_io_request(zram, sector, PAGE_SIZE)) { 1292 if (!valid_io_request(zram, sector, PAGE_SIZE)) {
933 atomic64_inc(&zram->stats.invalid_io); 1293 atomic64_inc(&zram->stats.invalid_io);
934 err = -EINVAL; 1294 ret = -EINVAL;
935 goto out; 1295 goto out;
936 } 1296 }
937 1297
@@ -942,7 +1302,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
942 bv.bv_len = PAGE_SIZE; 1302 bv.bv_len = PAGE_SIZE;
943 bv.bv_offset = 0; 1303 bv.bv_offset = 0;
944 1304
945 err = zram_bvec_rw(zram, &bv, index, offset, is_write); 1305 ret = zram_bvec_rw(zram, &bv, index, offset, is_write, NULL);
946out: 1306out:
947 /* 1307 /*
948 * If I/O fails, just return error(ie, non-zero) without 1308 * If I/O fails, just return error(ie, non-zero) without
@@ -952,9 +1312,20 @@ out:
952 * bio->bi_end_io does things to handle the error 1312 * bio->bi_end_io does things to handle the error
953 * (e.g., SetPageError, set_page_dirty and extra works). 1313 * (e.g., SetPageError, set_page_dirty and extra works).
954 */ 1314 */
955 if (err == 0) 1315 if (unlikely(ret < 0))
1316 return ret;
1317
1318 switch (ret) {
1319 case 0:
956 page_endio(page, is_write, 0); 1320 page_endio(page, is_write, 0);
957 return err; 1321 break;
1322 case 1:
1323 ret = 0;
1324 break;
1325 default:
1326 WARN_ON(1);
1327 }
1328 return ret;
958} 1329}
959 1330
960static void zram_reset_device(struct zram *zram) 1331static void zram_reset_device(struct zram *zram)
@@ -983,6 +1354,7 @@ static void zram_reset_device(struct zram *zram)
983 zram_meta_free(zram, disksize); 1354 zram_meta_free(zram, disksize);
984 memset(&zram->stats, 0, sizeof(zram->stats)); 1355 memset(&zram->stats, 0, sizeof(zram->stats));
985 zcomp_destroy(comp); 1356 zcomp_destroy(comp);
1357 reset_bdev(zram);
986} 1358}
987 1359
988static ssize_t disksize_store(struct device *dev, 1360static ssize_t disksize_store(struct device *dev,
@@ -1108,6 +1480,9 @@ static DEVICE_ATTR_WO(mem_limit);
1108static DEVICE_ATTR_WO(mem_used_max); 1480static DEVICE_ATTR_WO(mem_used_max);
1109static DEVICE_ATTR_RW(max_comp_streams); 1481static DEVICE_ATTR_RW(max_comp_streams);
1110static DEVICE_ATTR_RW(comp_algorithm); 1482static DEVICE_ATTR_RW(comp_algorithm);
1483#ifdef CONFIG_ZRAM_WRITEBACK
1484static DEVICE_ATTR_RW(backing_dev);
1485#endif
1111 1486
1112static struct attribute *zram_disk_attrs[] = { 1487static struct attribute *zram_disk_attrs[] = {
1113 &dev_attr_disksize.attr, 1488 &dev_attr_disksize.attr,
@@ -1118,6 +1493,9 @@ static struct attribute *zram_disk_attrs[] = {
1118 &dev_attr_mem_used_max.attr, 1493 &dev_attr_mem_used_max.attr,
1119 &dev_attr_max_comp_streams.attr, 1494 &dev_attr_max_comp_streams.attr,
1120 &dev_attr_comp_algorithm.attr, 1495 &dev_attr_comp_algorithm.attr,
1496#ifdef CONFIG_ZRAM_WRITEBACK
1497 &dev_attr_backing_dev.attr,
1498#endif
1121 &dev_attr_io_stat.attr, 1499 &dev_attr_io_stat.attr,
1122 &dev_attr_mm_stat.attr, 1500 &dev_attr_mm_stat.attr,
1123 &dev_attr_debug_stat.attr, 1501 &dev_attr_debug_stat.attr,
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index e34e44d02e3e..31762db861e3 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -60,9 +60,10 @@ static const size_t max_zpage_size = PAGE_SIZE / 4 * 3;
60 60
61/* Flags for zram pages (table[page_no].value) */ 61/* Flags for zram pages (table[page_no].value) */
62enum zram_pageflags { 62enum zram_pageflags {
 63	 /* Page consists entirely of zeros */	 63	 /* Page consists of the same element */
64 ZRAM_SAME = ZRAM_FLAG_SHIFT, 64 ZRAM_SAME = ZRAM_FLAG_SHIFT,
65 ZRAM_ACCESS, /* page is now accessed */ 65 ZRAM_ACCESS, /* page is now accessed */
66 ZRAM_WB, /* page is stored on backing_device */
66 67
67 __NR_ZRAM_PAGEFLAGS, 68 __NR_ZRAM_PAGEFLAGS,
68}; 69};
@@ -115,5 +116,13 @@ struct zram {
115 * zram is claimed so open request will be failed 116 * zram is claimed so open request will be failed
116 */ 117 */
117 bool claim; /* Protected by bdev->bd_mutex */ 118 bool claim; /* Protected by bdev->bd_mutex */
119#ifdef CONFIG_ZRAM_WRITEBACK
120 struct file *backing_dev;
121 struct block_device *bdev;
122 unsigned int old_block_size;
123 unsigned long *bitmap;
124 unsigned long nr_pages;
125 spinlock_t bitmap_lock;
126#endif
118}; 127};
119#endif 128#endif
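[Editor's note: a hypothetical sketch, not from the patch.] The new fields only make sense together: nr_pages is the size of the backing device in pages, bitmap tracks which of those slots are in use, and bitmap_lock serialises allocation. One way a free slot might be reserved (the helper name and the choice to treat 0 as "no slot" are assumptions for illustration):

    static unsigned long zram_reserve_bdev_slot(struct zram *zram)
    {
            unsigned long blk_idx;

            spin_lock(&zram->bitmap_lock);
            /* skip slot 0 so that 0 can be returned as "nothing free" */
            blk_idx = find_next_zero_bit(zram->bitmap, zram->nr_pages, 1);
            if (blk_idx == zram->nr_pages) {
                    spin_unlock(&zram->bitmap_lock);
                    return 0;
            }
            set_bit(blk_idx, zram->bitmap);
            spin_unlock(&zram->bitmap_lock);

            return blk_idx;
    }
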
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index a36216bd2a84..e4d4b6b41e26 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -4308,10 +4308,10 @@ i915_drop_caches_set(void *data, u64 val)
4308 4308
4309 fs_reclaim_acquire(GFP_KERNEL); 4309 fs_reclaim_acquire(GFP_KERNEL);
4310 if (val & DROP_BOUND) 4310 if (val & DROP_BOUND)
4311 i915_gem_shrink(dev_priv, LONG_MAX, I915_SHRINK_BOUND); 4311 i915_gem_shrink(dev_priv, LONG_MAX, NULL, I915_SHRINK_BOUND);
4312 4312
4313 if (val & DROP_UNBOUND) 4313 if (val & DROP_UNBOUND)
4314 i915_gem_shrink(dev_priv, LONG_MAX, I915_SHRINK_UNBOUND); 4314 i915_gem_shrink(dev_priv, LONG_MAX, NULL, I915_SHRINK_UNBOUND);
4315 4315
4316 if (val & DROP_SHRINK_ALL) 4316 if (val & DROP_SHRINK_ALL)
4317 i915_gem_shrink_all(dev_priv); 4317 i915_gem_shrink_all(dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 60267e375e88..bd74641ab7f6 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3742,6 +3742,7 @@ i915_gem_object_create_internal(struct drm_i915_private *dev_priv,
3742/* i915_gem_shrinker.c */ 3742/* i915_gem_shrinker.c */
3743unsigned long i915_gem_shrink(struct drm_i915_private *dev_priv, 3743unsigned long i915_gem_shrink(struct drm_i915_private *dev_priv,
3744 unsigned long target, 3744 unsigned long target,
3745 unsigned long *nr_scanned,
3745 unsigned flags); 3746 unsigned flags);
3746#define I915_SHRINK_PURGEABLE 0x1 3747#define I915_SHRINK_PURGEABLE 0x1
3747#define I915_SHRINK_UNBOUND 0x2 3748#define I915_SHRINK_UNBOUND 0x2
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index b9e8e0d6e97b..287c6ead95b3 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2354,7 +2354,7 @@ rebuild_st:
2354 goto err_sg; 2354 goto err_sg;
2355 } 2355 }
2356 2356
2357 i915_gem_shrink(dev_priv, 2 * page_count, *s++); 2357 i915_gem_shrink(dev_priv, 2 * page_count, NULL, *s++);
2358 cond_resched(); 2358 cond_resched();
2359 2359
2360 /* We've tried hard to allocate the memory by reaping 2360 /* We've tried hard to allocate the memory by reaping
@@ -5015,7 +5015,7 @@ int i915_gem_freeze_late(struct drm_i915_private *dev_priv)
5015 * the objects as well, see i915_gem_freeze() 5015 * the objects as well, see i915_gem_freeze()
5016 */ 5016 */
5017 5017
5018 i915_gem_shrink(dev_priv, -1UL, I915_SHRINK_UNBOUND); 5018 i915_gem_shrink(dev_priv, -1UL, NULL, I915_SHRINK_UNBOUND);
5019 i915_gem_drain_freed_objects(dev_priv); 5019 i915_gem_drain_freed_objects(dev_priv);
5020 5020
5021 mutex_lock(&dev_priv->drm.struct_mutex); 5021 mutex_lock(&dev_priv->drm.struct_mutex);
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index d60f38adc4c4..6c6b8e8592aa 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -2062,7 +2062,7 @@ int i915_gem_gtt_prepare_pages(struct drm_i915_gem_object *obj,
2062 */ 2062 */
2063 GEM_BUG_ON(obj->mm.pages == pages); 2063 GEM_BUG_ON(obj->mm.pages == pages);
2064 } while (i915_gem_shrink(to_i915(obj->base.dev), 2064 } while (i915_gem_shrink(to_i915(obj->base.dev),
2065 obj->base.size >> PAGE_SHIFT, 2065 obj->base.size >> PAGE_SHIFT, NULL,
2066 I915_SHRINK_BOUND | 2066 I915_SHRINK_BOUND |
2067 I915_SHRINK_UNBOUND | 2067 I915_SHRINK_UNBOUND |
2068 I915_SHRINK_ACTIVE)); 2068 I915_SHRINK_ACTIVE));
diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
index 77fb39808131..74002b2d1b6f 100644
--- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
+++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
@@ -136,6 +136,7 @@ static bool unsafe_drop_pages(struct drm_i915_gem_object *obj)
136 * i915_gem_shrink - Shrink buffer object caches 136 * i915_gem_shrink - Shrink buffer object caches
137 * @dev_priv: i915 device 137 * @dev_priv: i915 device
138 * @target: amount of memory to make available, in pages 138 * @target: amount of memory to make available, in pages
139 * @nr_scanned: optional output for number of pages scanned (incremental)
139 * @flags: control flags for selecting cache types 140 * @flags: control flags for selecting cache types
140 * 141 *
141 * This function is the main interface to the shrinker. It will try to release 142 * This function is the main interface to the shrinker. It will try to release
@@ -158,7 +159,9 @@ static bool unsafe_drop_pages(struct drm_i915_gem_object *obj)
158 */ 159 */
159unsigned long 160unsigned long
160i915_gem_shrink(struct drm_i915_private *dev_priv, 161i915_gem_shrink(struct drm_i915_private *dev_priv,
161 unsigned long target, unsigned flags) 162 unsigned long target,
163 unsigned long *nr_scanned,
164 unsigned flags)
162{ 165{
163 const struct { 166 const struct {
164 struct list_head *list; 167 struct list_head *list;
@@ -169,6 +172,7 @@ i915_gem_shrink(struct drm_i915_private *dev_priv,
169 { NULL, 0 }, 172 { NULL, 0 },
170 }, *phase; 173 }, *phase;
171 unsigned long count = 0; 174 unsigned long count = 0;
175 unsigned long scanned = 0;
172 bool unlock; 176 bool unlock;
173 177
174 if (!shrinker_lock(dev_priv, &unlock)) 178 if (!shrinker_lock(dev_priv, &unlock))
@@ -249,6 +253,7 @@ i915_gem_shrink(struct drm_i915_private *dev_priv,
249 count += obj->base.size >> PAGE_SHIFT; 253 count += obj->base.size >> PAGE_SHIFT;
250 } 254 }
251 mutex_unlock(&obj->mm.lock); 255 mutex_unlock(&obj->mm.lock);
256 scanned += obj->base.size >> PAGE_SHIFT;
252 } 257 }
253 } 258 }
254 list_splice_tail(&still_in_list, phase->list); 259 list_splice_tail(&still_in_list, phase->list);
@@ -261,6 +266,8 @@ i915_gem_shrink(struct drm_i915_private *dev_priv,
261 266
262 shrinker_unlock(dev_priv, unlock); 267 shrinker_unlock(dev_priv, unlock);
263 268
269 if (nr_scanned)
270 *nr_scanned += scanned;
264 return count; 271 return count;
265} 272}
266 273
@@ -283,7 +290,7 @@ unsigned long i915_gem_shrink_all(struct drm_i915_private *dev_priv)
283 unsigned long freed; 290 unsigned long freed;
284 291
285 intel_runtime_pm_get(dev_priv); 292 intel_runtime_pm_get(dev_priv);
286 freed = i915_gem_shrink(dev_priv, -1UL, 293 freed = i915_gem_shrink(dev_priv, -1UL, NULL,
287 I915_SHRINK_BOUND | 294 I915_SHRINK_BOUND |
288 I915_SHRINK_UNBOUND | 295 I915_SHRINK_UNBOUND |
289 I915_SHRINK_ACTIVE); 296 I915_SHRINK_ACTIVE);
@@ -329,23 +336,28 @@ i915_gem_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc)
329 unsigned long freed; 336 unsigned long freed;
330 bool unlock; 337 bool unlock;
331 338
339 sc->nr_scanned = 0;
340
332 if (!shrinker_lock(dev_priv, &unlock)) 341 if (!shrinker_lock(dev_priv, &unlock))
333 return SHRINK_STOP; 342 return SHRINK_STOP;
334 343
335 freed = i915_gem_shrink(dev_priv, 344 freed = i915_gem_shrink(dev_priv,
336 sc->nr_to_scan, 345 sc->nr_to_scan,
346 &sc->nr_scanned,
337 I915_SHRINK_BOUND | 347 I915_SHRINK_BOUND |
338 I915_SHRINK_UNBOUND | 348 I915_SHRINK_UNBOUND |
339 I915_SHRINK_PURGEABLE); 349 I915_SHRINK_PURGEABLE);
340 if (freed < sc->nr_to_scan) 350 if (freed < sc->nr_to_scan)
341 freed += i915_gem_shrink(dev_priv, 351 freed += i915_gem_shrink(dev_priv,
342 sc->nr_to_scan - freed, 352 sc->nr_to_scan - sc->nr_scanned,
353 &sc->nr_scanned,
343 I915_SHRINK_BOUND | 354 I915_SHRINK_BOUND |
344 I915_SHRINK_UNBOUND); 355 I915_SHRINK_UNBOUND);
345 if (freed < sc->nr_to_scan && current_is_kswapd()) { 356 if (freed < sc->nr_to_scan && current_is_kswapd()) {
346 intel_runtime_pm_get(dev_priv); 357 intel_runtime_pm_get(dev_priv);
347 freed += i915_gem_shrink(dev_priv, 358 freed += i915_gem_shrink(dev_priv,
348 sc->nr_to_scan - freed, 359 sc->nr_to_scan - sc->nr_scanned,
360 &sc->nr_scanned,
349 I915_SHRINK_ACTIVE | 361 I915_SHRINK_ACTIVE |
350 I915_SHRINK_BOUND | 362 I915_SHRINK_BOUND |
351 I915_SHRINK_UNBOUND); 363 I915_SHRINK_UNBOUND);
@@ -354,7 +366,7 @@ i915_gem_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc)
354 366
355 shrinker_unlock(dev_priv, unlock); 367 shrinker_unlock(dev_priv, unlock);
356 368
357 return freed; 369 return sc->nr_scanned ? freed : SHRINK_STOP;
358} 370}
359 371
360static bool 372static bool
@@ -453,7 +465,7 @@ i915_gem_shrinker_vmap(struct notifier_block *nb, unsigned long event, void *ptr
453 goto out; 465 goto out;
454 466
455 intel_runtime_pm_get(dev_priv); 467 intel_runtime_pm_get(dev_priv);
456 freed_pages += i915_gem_shrink(dev_priv, -1UL, 468 freed_pages += i915_gem_shrink(dev_priv, -1UL, NULL,
457 I915_SHRINK_BOUND | 469 I915_SHRINK_BOUND |
458 I915_SHRINK_UNBOUND | 470 I915_SHRINK_UNBOUND |
459 I915_SHRINK_ACTIVE | 471 I915_SHRINK_ACTIVE |
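[Editor's note: a generic sketch, not i915 code.] The i915 changes lean on the sc->nr_scanned field introduced elsewhere in this series: scan_objects() reports how many pages it examined, even when it could not free them, and returns SHRINK_STOP only when it made no progress at all. A generic sketch of that contract, with the my_cache_* names invented for the example:

    #include <linux/shrinker.h>

    static bool my_cache_try_free_one(void);        /* hypothetical helper */

    static unsigned long my_cache_scan(struct shrinker *shrinker,
                                       struct shrink_control *sc)
    {
            unsigned long freed = 0;

            sc->nr_scanned = 0;
            while (sc->nr_scanned < sc->nr_to_scan) {
                    sc->nr_scanned++;               /* one object examined */
                    if (my_cache_try_free_one())
                            freed++;
            }

            return sc->nr_scanned ? freed : SHRINK_STOP;
    }
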
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 14323faf8bd9..60491641a8d6 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1241,8 +1241,10 @@ static int btt_rw_page(struct block_device *bdev, sector_t sector,
1241{ 1241{
1242 struct btt *btt = bdev->bd_disk->private_data; 1242 struct btt *btt = bdev->bd_disk->private_data;
1243 int rc; 1243 int rc;
1244 unsigned int len;
1244 1245
1245 rc = btt_do_bvec(btt, NULL, page, PAGE_SIZE, 0, is_write, sector); 1246 len = hpage_nr_pages(page) * PAGE_SIZE;
1247 rc = btt_do_bvec(btt, NULL, page, len, 0, is_write, sector);
1246 if (rc == 0) 1248 if (rc == 0)
1247 page_endio(page, is_write, 0); 1249 page_endio(page, is_write, 0);
1248 1250
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index f7099adaabc0..e9aa453da50c 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -80,22 +80,40 @@ static blk_status_t pmem_clear_poison(struct pmem_device *pmem,
80static void write_pmem(void *pmem_addr, struct page *page, 80static void write_pmem(void *pmem_addr, struct page *page,
81 unsigned int off, unsigned int len) 81 unsigned int off, unsigned int len)
82{ 82{
83 void *mem = kmap_atomic(page); 83 unsigned int chunk;
84 84 void *mem;
85 memcpy_flushcache(pmem_addr, mem + off, len); 85
86 kunmap_atomic(mem); 86 while (len) {
87 mem = kmap_atomic(page);
88 chunk = min_t(unsigned int, len, PAGE_SIZE);
89 memcpy_flushcache(pmem_addr, mem + off, chunk);
90 kunmap_atomic(mem);
91 len -= chunk;
92 off = 0;
93 page++;
94 pmem_addr += PAGE_SIZE;
95 }
87} 96}
88 97
89static blk_status_t read_pmem(struct page *page, unsigned int off, 98static blk_status_t read_pmem(struct page *page, unsigned int off,
90 void *pmem_addr, unsigned int len) 99 void *pmem_addr, unsigned int len)
91{ 100{
101 unsigned int chunk;
92 int rc; 102 int rc;
93 void *mem = kmap_atomic(page); 103 void *mem;
94 104
95 rc = memcpy_mcsafe(mem + off, pmem_addr, len); 105 while (len) {
96 kunmap_atomic(mem); 106 mem = kmap_atomic(page);
97 if (rc) 107 chunk = min_t(unsigned int, len, PAGE_SIZE);
98 return BLK_STS_IOERR; 108 rc = memcpy_mcsafe(mem + off, pmem_addr, chunk);
109 kunmap_atomic(mem);
110 if (rc)
111 return BLK_STS_IOERR;
112 len -= chunk;
113 off = 0;
114 page++;
115 pmem_addr += PAGE_SIZE;
116 }
99 return BLK_STS_OK; 117 return BLK_STS_OK;
100} 118}
101 119
@@ -188,7 +206,8 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
188 struct pmem_device *pmem = bdev->bd_queue->queuedata; 206 struct pmem_device *pmem = bdev->bd_queue->queuedata;
189 blk_status_t rc; 207 blk_status_t rc;
190 208
191 rc = pmem_do_bvec(pmem, page, PAGE_SIZE, 0, is_write, sector); 209 rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE,
210 0, is_write, sector);
192 211
193 /* 212 /*
194 * The ->rw_page interface is subtle and tricky. The core 213 * The ->rw_page interface is subtle and tricky. The core
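[Editor's note: a sketch, slightly generalized from the hunk above.] Both pmem copy helpers now walk the buffer one page at a time because kmap_atomic() can only map a single page, while rw_page() may now be handed a transparent huge page whose length exceeds PAGE_SIZE. The pattern, generalized so that a nonzero starting offset only shortens the first chunk and the destination advances by the actual chunk size; the helper name is invented:

    static void copy_page_range_to_pmem(void *pmem_addr, struct page *page,
                                        unsigned int off, unsigned int len)
    {
            while (len) {
                    unsigned int chunk = min_t(unsigned int, len, PAGE_SIZE - off);
                    void *mem = kmap_atomic(page);

                    memcpy_flushcache(pmem_addr, mem + off, chunk);
                    kunmap_atomic(mem);

                    len -= chunk;
                    pmem_addr += chunk;
                    off = 0;               /* only the first chunk can be partial */
                    page++;
            }
    }
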
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 103ca5e1267b..64c58eb26159 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -151,34 +151,6 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data,
151 return FSCACHE_CHECKAUX_OKAY; 151 return FSCACHE_CHECKAUX_OKAY;
152} 152}
153 153
154static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data)
155{
156 struct v9fs_inode *v9inode = cookie_netfs_data;
157 struct pagevec pvec;
158 pgoff_t first;
159 int loop, nr_pages;
160
161 pagevec_init(&pvec, 0);
162 first = 0;
163
164 for (;;) {
165 nr_pages = pagevec_lookup(&pvec, v9inode->vfs_inode.i_mapping,
166 first,
167 PAGEVEC_SIZE - pagevec_count(&pvec));
168 if (!nr_pages)
169 break;
170
171 for (loop = 0; loop < nr_pages; loop++)
172 ClearPageFsCache(pvec.pages[loop]);
173
174 first = pvec.pages[nr_pages - 1]->index + 1;
175
176 pvec.nr = nr_pages;
177 pagevec_release(&pvec);
178 cond_resched();
179 }
180}
181
182const struct fscache_cookie_def v9fs_cache_inode_index_def = { 154const struct fscache_cookie_def v9fs_cache_inode_index_def = {
183 .name = "9p.inode", 155 .name = "9p.inode",
184 .type = FSCACHE_COOKIE_TYPE_DATAFILE, 156 .type = FSCACHE_COOKIE_TYPE_DATAFILE,
@@ -186,7 +158,6 @@ const struct fscache_cookie_def v9fs_cache_inode_index_def = {
186 .get_attr = v9fs_cache_inode_get_attr, 158 .get_attr = v9fs_cache_inode_get_attr,
187 .get_aux = v9fs_cache_inode_get_aux, 159 .get_aux = v9fs_cache_inode_get_aux,
188 .check_aux = v9fs_cache_inode_check_aux, 160 .check_aux = v9fs_cache_inode_check_aux,
189 .now_uncached = v9fs_cache_inode_now_uncached,
190}; 161};
191 162
192void v9fs_cache_inode_get_cookie(struct inode *inode) 163void v9fs_cache_inode_get_cookie(struct inode *inode)
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
index 577763c3d88b..1fe855191261 100644
--- a/fs/afs/cache.c
+++ b/fs/afs/cache.c
@@ -39,7 +39,6 @@ static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
39static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, 39static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
40 const void *buffer, 40 const void *buffer,
41 uint16_t buflen); 41 uint16_t buflen);
42static void afs_vnode_cache_now_uncached(void *cookie_netfs_data);
43 42
44struct fscache_netfs afs_cache_netfs = { 43struct fscache_netfs afs_cache_netfs = {
45 .name = "afs", 44 .name = "afs",
@@ -75,7 +74,6 @@ struct fscache_cookie_def afs_vnode_cache_index_def = {
75 .get_attr = afs_vnode_cache_get_attr, 74 .get_attr = afs_vnode_cache_get_attr,
76 .get_aux = afs_vnode_cache_get_aux, 75 .get_aux = afs_vnode_cache_get_aux,
77 .check_aux = afs_vnode_cache_check_aux, 76 .check_aux = afs_vnode_cache_check_aux,
78 .now_uncached = afs_vnode_cache_now_uncached,
79}; 77};
80 78
81/* 79/*
@@ -359,44 +357,3 @@ static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
359 _leave(" = SUCCESS"); 357 _leave(" = SUCCESS");
360 return FSCACHE_CHECKAUX_OKAY; 358 return FSCACHE_CHECKAUX_OKAY;
361} 359}
362
363/*
364 * indication the cookie is no longer uncached
365 * - this function is called when the backing store currently caching a cookie
366 * is removed
367 * - the netfs should use this to clean up any markers indicating cached pages
368 * - this is mandatory for any object that may have data
369 */
370static void afs_vnode_cache_now_uncached(void *cookie_netfs_data)
371{
372 struct afs_vnode *vnode = cookie_netfs_data;
373 struct pagevec pvec;
374 pgoff_t first;
375 int loop, nr_pages;
376
377 _enter("{%x,%x,%Lx}",
378 vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version);
379
380 pagevec_init(&pvec, 0);
381 first = 0;
382
383 for (;;) {
384 /* grab a bunch of pages to clean */
385 nr_pages = pagevec_lookup(&pvec, vnode->vfs_inode.i_mapping,
386 first,
387 PAGEVEC_SIZE - pagevec_count(&pvec));
388 if (!nr_pages)
389 break;
390
391 for (loop = 0; loop < nr_pages; loop++)
392 ClearPageFsCache(pvec.pages[loop]);
393
394 first = pvec.pages[nr_pages - 1]->index + 1;
395
396 pvec.nr = nr_pages;
397 pagevec_release(&pvec);
398 cond_resched();
399 }
400
401 _leave("");
402}
diff --git a/fs/buffer.c b/fs/buffer.c
index 5715dac7821f..50da0e102ca0 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1627,20 +1627,17 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
1627 struct pagevec pvec; 1627 struct pagevec pvec;
1628 pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits); 1628 pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
1629 pgoff_t end; 1629 pgoff_t end;
1630 int i; 1630 int i, count;
1631 struct buffer_head *bh; 1631 struct buffer_head *bh;
1632 struct buffer_head *head; 1632 struct buffer_head *head;
1633 1633
1634 end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits); 1634 end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits);
1635 pagevec_init(&pvec, 0); 1635 pagevec_init(&pvec, 0);
1636 while (index <= end && pagevec_lookup(&pvec, bd_mapping, index, 1636 while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) {
1637 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { 1637 count = pagevec_count(&pvec);
1638 for (i = 0; i < pagevec_count(&pvec); i++) { 1638 for (i = 0; i < count; i++) {
1639 struct page *page = pvec.pages[i]; 1639 struct page *page = pvec.pages[i];
1640 1640
1641 index = page->index;
1642 if (index > end)
1643 break;
1644 if (!page_has_buffers(page)) 1641 if (!page_has_buffers(page))
1645 continue; 1642 continue;
1646 /* 1643 /*
@@ -1670,7 +1667,9 @@ unlock_page:
1670 } 1667 }
1671 pagevec_release(&pvec); 1668 pagevec_release(&pvec);
1672 cond_resched(); 1669 cond_resched();
1673 index++; 1670 /* End of range already reached? */
1671 if (index > end || !index)
1672 break;
1674 } 1673 }
1675} 1674}
1676EXPORT_SYMBOL(clean_bdev_aliases); 1675EXPORT_SYMBOL(clean_bdev_aliases);
@@ -3549,10 +3548,10 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
3549 pagevec_init(&pvec, 0); 3548 pagevec_init(&pvec, 0);
3550 3549
3551 do { 3550 do {
3552 unsigned want, nr_pages, i; 3551 unsigned nr_pages, i;
3553 3552
3554 want = min_t(unsigned, end - index, PAGEVEC_SIZE); 3553 nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index,
3555 nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, want); 3554 end - 1);
3556 if (nr_pages == 0) 3555 if (nr_pages == 0)
3557 break; 3556 break;
3558 3557
@@ -3573,10 +3572,6 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
3573 lastoff < page_offset(page)) 3572 lastoff < page_offset(page))
3574 goto check_range; 3573 goto check_range;
3575 3574
3576 /* Searching done if the page index is out of range. */
3577 if (page->index >= end)
3578 goto not_found;
3579
3580 lock_page(page); 3575 lock_page(page);
3581 if (likely(page->mapping == inode->i_mapping) && 3576 if (likely(page->mapping == inode->i_mapping) &&
3582 page_has_buffers(page)) { 3577 page_has_buffers(page)) {
@@ -3589,12 +3584,6 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
3589 unlock_page(page); 3584 unlock_page(page);
3590 lastoff = page_offset(page) + PAGE_SIZE; 3585 lastoff = page_offset(page) + PAGE_SIZE;
3591 } 3586 }
3592
3593 /* Searching done if fewer pages returned than wanted. */
3594 if (nr_pages < want)
3595 break;
3596
3597 index = pvec.pages[i - 1]->index + 1;
3598 pagevec_release(&pvec); 3587 pagevec_release(&pvec);
3599 } while (index < end); 3588 } while (index < end);
3600 3589
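[Editor's note: a sketch of the conversion pattern.] The fs/buffer.c conversions (and the ext4 ones further down) all follow the same shape: pagevec_lookup_range() takes the index by reference, advances it past the last page returned, and never returns pages beyond 'end', so the manual "page->index > end" and "fewer pages than wanted" checks disappear. The canonical loop, with process_page() standing in for the caller-specific per-page work:

    #include <linux/pagevec.h>
    #include <linux/pagemap.h>

    static void process_page(struct page *page);    /* caller-specific work */

    static void walk_range(struct address_space *mapping,
                           pgoff_t index, pgoff_t end)
    {
            struct pagevec pvec;
            unsigned int i;

            pagevec_init(&pvec, 0);
            while (pagevec_lookup_range(&pvec, mapping, &index, end)) {
                    for (i = 0; i < pagevec_count(&pvec); i++)
                            process_page(pvec.pages[i]);
                    pagevec_release(&pvec);
                    cond_resched();
                    /* 'index' now points past the last page we saw */
                    if (index > end || !index)
                            break;
            }
    }
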
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 337f88673ed9..174d6e6569a8 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -194,36 +194,6 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux(
194 return FSCACHE_CHECKAUX_OKAY; 194 return FSCACHE_CHECKAUX_OKAY;
195} 195}
196 196
197static void ceph_fscache_inode_now_uncached(void* cookie_netfs_data)
198{
199 struct ceph_inode_info* ci = cookie_netfs_data;
200 struct pagevec pvec;
201 pgoff_t first;
202 int loop, nr_pages;
203
204 pagevec_init(&pvec, 0);
205 first = 0;
206
207 dout("ceph inode 0x%p now uncached", ci);
208
209 while (1) {
210 nr_pages = pagevec_lookup(&pvec, ci->vfs_inode.i_mapping, first,
211 PAGEVEC_SIZE - pagevec_count(&pvec));
212
213 if (!nr_pages)
214 break;
215
216 for (loop = 0; loop < nr_pages; loop++)
217 ClearPageFsCache(pvec.pages[loop]);
218
219 first = pvec.pages[nr_pages - 1]->index + 1;
220
221 pvec.nr = nr_pages;
222 pagevec_release(&pvec);
223 cond_resched();
224 }
225}
226
227static const struct fscache_cookie_def ceph_fscache_inode_object_def = { 197static const struct fscache_cookie_def ceph_fscache_inode_object_def = {
228 .name = "CEPH.inode", 198 .name = "CEPH.inode",
229 .type = FSCACHE_COOKIE_TYPE_DATAFILE, 199 .type = FSCACHE_COOKIE_TYPE_DATAFILE,
@@ -231,7 +201,6 @@ static const struct fscache_cookie_def ceph_fscache_inode_object_def = {
231 .get_attr = ceph_fscache_inode_get_attr, 201 .get_attr = ceph_fscache_inode_get_attr,
232 .get_aux = ceph_fscache_inode_get_aux, 202 .get_aux = ceph_fscache_inode_get_aux,
233 .check_aux = ceph_fscache_inode_check_aux, 203 .check_aux = ceph_fscache_inode_check_aux,
234 .now_uncached = ceph_fscache_inode_now_uncached,
235}; 204};
236 205
237void ceph_fscache_register_inode_cookie(struct inode *inode) 206void ceph_fscache_register_inode_cookie(struct inode *inode)
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
index 6c665bf4a27c..2c14020e5e1d 100644
--- a/fs/cifs/cache.c
+++ b/fs/cifs/cache.c
@@ -292,36 +292,6 @@ fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data,
292 return FSCACHE_CHECKAUX_OKAY; 292 return FSCACHE_CHECKAUX_OKAY;
293} 293}
294 294
295static void cifs_fscache_inode_now_uncached(void *cookie_netfs_data)
296{
297 struct cifsInodeInfo *cifsi = cookie_netfs_data;
298 struct pagevec pvec;
299 pgoff_t first;
300 int loop, nr_pages;
301
302 pagevec_init(&pvec, 0);
303 first = 0;
304
305 cifs_dbg(FYI, "%s: cifs inode 0x%p now uncached\n", __func__, cifsi);
306
307 for (;;) {
308 nr_pages = pagevec_lookup(&pvec,
309 cifsi->vfs_inode.i_mapping, first,
310 PAGEVEC_SIZE - pagevec_count(&pvec));
311 if (!nr_pages)
312 break;
313
314 for (loop = 0; loop < nr_pages; loop++)
315 ClearPageFsCache(pvec.pages[loop]);
316
317 first = pvec.pages[nr_pages - 1]->index + 1;
318
319 pvec.nr = nr_pages;
320 pagevec_release(&pvec);
321 cond_resched();
322 }
323}
324
325const struct fscache_cookie_def cifs_fscache_inode_object_def = { 295const struct fscache_cookie_def cifs_fscache_inode_object_def = {
326 .name = "CIFS.uniqueid", 296 .name = "CIFS.uniqueid",
327 .type = FSCACHE_COOKIE_TYPE_DATAFILE, 297 .type = FSCACHE_COOKIE_TYPE_DATAFILE,
@@ -329,5 +299,4 @@ const struct fscache_cookie_def cifs_fscache_inode_object_def = {
329 .get_attr = cifs_fscache_inode_get_attr, 299 .get_attr = cifs_fscache_inode_get_attr,
330 .get_aux = cifs_fscache_inode_get_aux, 300 .get_aux = cifs_fscache_inode_get_aux,
331 .check_aux = cifs_fscache_inode_check_aux, 301 .check_aux = cifs_fscache_inode_check_aux,
332 .now_uncached = cifs_fscache_inode_now_uncached,
333}; 302};
diff --git a/fs/dax.c b/fs/dax.c
index ab925dc6647a..6afcacb3a87b 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -42,6 +42,9 @@
42#define DAX_WAIT_TABLE_BITS 12 42#define DAX_WAIT_TABLE_BITS 12
43#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) 43#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
44 44
45/* The 'colour' (ie low bits) within a PMD of a page offset. */
46#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
47
45static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; 48static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
46 49
47static int __init init_dax_wait_table(void) 50static int __init init_dax_wait_table(void)
@@ -54,6 +57,40 @@ static int __init init_dax_wait_table(void)
54} 57}
55fs_initcall(init_dax_wait_table); 58fs_initcall(init_dax_wait_table);
56 59
60/*
 61	 * We use the lowest available bit in an exceptional entry for locking, one bit
 62	 * for the entry size (PMD) and two more to tell us if the entry is a zero page
 63	 * or an empty entry that is just used for locking. In total, four special bits.
64 *
65 * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
66 * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
67 * block allocation.
68 */
69#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
70#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
71#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
72#define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
73#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
74
75static unsigned long dax_radix_sector(void *entry)
76{
77 return (unsigned long)entry >> RADIX_DAX_SHIFT;
78}
79
80static void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
81{
82 return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
83 ((unsigned long)sector << RADIX_DAX_SHIFT) |
84 RADIX_DAX_ENTRY_LOCK);
85}
86
87static unsigned int dax_radix_order(void *entry)
88{
89 if ((unsigned long)entry & RADIX_DAX_PMD)
90 return PMD_SHIFT - PAGE_SHIFT;
91 return 0;
92}
93
57static int dax_is_pmd_entry(void *entry) 94static int dax_is_pmd_entry(void *entry)
58{ 95{
59 return (unsigned long)entry & RADIX_DAX_PMD; 96 return (unsigned long)entry & RADIX_DAX_PMD;
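[Editor's note: a standalone illustration, not kernel code.] The four flag bits sit directly above the exceptional-entry marker and the sector lives in the bits above RADIX_DAX_SHIFT, so encoding and decoding are plain shifts and masks. A small userspace program showing the layout; it assumes RADIX_TREE_EXCEPTIONAL_SHIFT == 2 and uses local stand-in macros rather than the kernel's:

    #include <stdio.h>

    #define EXCEPTIONAL_SHIFT   2                   /* assumed value */
    #define EXCEPTIONAL_ENTRY   2UL
    #define DAX_SHIFT           (EXCEPTIONAL_SHIFT + 4)
    #define DAX_ENTRY_LOCK      (1UL << EXCEPTIONAL_SHIFT)
    #define DAX_PMD             (1UL << (EXCEPTIONAL_SHIFT + 1))

    static unsigned long locked_entry(unsigned long sector, unsigned long flags)
    {
            return EXCEPTIONAL_ENTRY | flags | (sector << DAX_SHIFT) |
                    DAX_ENTRY_LOCK;
    }

    int main(void)
    {
            unsigned long e = locked_entry(0x1234, DAX_PMD);

            printf("entry  = %#lx\n", e);
            printf("sector = %#lx\n", e >> DAX_SHIFT);      /* prints 0x1234 */
            printf("is pmd = %d\n", !!(e & DAX_PMD));       /* prints 1 */
            return 0;
    }
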
@@ -66,7 +103,7 @@ static int dax_is_pte_entry(void *entry)
66 103
67static int dax_is_zero_entry(void *entry) 104static int dax_is_zero_entry(void *entry)
68{ 105{
69 return (unsigned long)entry & RADIX_DAX_HZP; 106 return (unsigned long)entry & RADIX_DAX_ZERO_PAGE;
70} 107}
71 108
72static int dax_is_empty_entry(void *entry) 109static int dax_is_empty_entry(void *entry)
@@ -98,7 +135,7 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
98 * the range covered by the PMD map to the same bit lock. 135 * the range covered by the PMD map to the same bit lock.
99 */ 136 */
100 if (dax_is_pmd_entry(entry)) 137 if (dax_is_pmd_entry(entry))
101 index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1); 138 index &= ~PG_PMD_COLOUR;
102 139
103 key->mapping = mapping; 140 key->mapping = mapping;
104 key->entry_start = index; 141 key->entry_start = index;
@@ -121,6 +158,31 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo
121} 158}
122 159
123/* 160/*
161 * We do not necessarily hold the mapping->tree_lock when we call this
162 * function so it is possible that 'entry' is no longer a valid item in the
163 * radix tree. This is okay because all we really need to do is to find the
164 * correct waitqueue where tasks might be waiting for that old 'entry' and
165 * wake them.
166 */
167static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
168 pgoff_t index, void *entry, bool wake_all)
169{
170 struct exceptional_entry_key key;
171 wait_queue_head_t *wq;
172
173 wq = dax_entry_waitqueue(mapping, index, entry, &key);
174
175 /*
176 * Checking for locked entry and prepare_to_wait_exclusive() happens
177 * under mapping->tree_lock, ditto for entry handling in our callers.
178 * So at this point all tasks that could have seen our entry locked
179 * must be in the waitqueue and the following check will see them.
180 */
181 if (waitqueue_active(wq))
182 __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
183}
184
185/*
124 * Check whether the given slot is locked. The function must be called with 186 * Check whether the given slot is locked. The function must be called with
125 * mapping->tree_lock held 187 * mapping->tree_lock held
126 */ 188 */
@@ -181,7 +243,8 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
181 for (;;) { 243 for (;;) {
182 entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, 244 entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
183 &slot); 245 &slot);
184 if (!entry || !radix_tree_exceptional_entry(entry) || 246 if (!entry ||
247 WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) ||
185 !slot_locked(mapping, slot)) { 248 !slot_locked(mapping, slot)) {
186 if (slotp) 249 if (slotp)
187 *slotp = slot; 250 *slotp = slot;
@@ -216,14 +279,9 @@ static void dax_unlock_mapping_entry(struct address_space *mapping,
216} 279}
217 280
218static void put_locked_mapping_entry(struct address_space *mapping, 281static void put_locked_mapping_entry(struct address_space *mapping,
219 pgoff_t index, void *entry) 282 pgoff_t index)
220{ 283{
221 if (!radix_tree_exceptional_entry(entry)) { 284 dax_unlock_mapping_entry(mapping, index);
222 unlock_page(entry);
223 put_page(entry);
224 } else {
225 dax_unlock_mapping_entry(mapping, index);
226 }
227} 285}
228 286
229/* 287/*
@@ -233,7 +291,7 @@ static void put_locked_mapping_entry(struct address_space *mapping,
233static void put_unlocked_mapping_entry(struct address_space *mapping, 291static void put_unlocked_mapping_entry(struct address_space *mapping,
234 pgoff_t index, void *entry) 292 pgoff_t index, void *entry)
235{ 293{
236 if (!radix_tree_exceptional_entry(entry)) 294 if (!entry)
237 return; 295 return;
238 296
239 /* We have to wake up next waiter for the radix tree entry lock */ 297 /* We have to wake up next waiter for the radix tree entry lock */
@@ -241,15 +299,15 @@ static void put_unlocked_mapping_entry(struct address_space *mapping,
241} 299}
242 300
243/* 301/*
244 * Find radix tree entry at given index. If it points to a page, return with 302 * Find radix tree entry at given index. If it points to an exceptional entry,
245 * the page locked. If it points to the exceptional entry, return with the 303 * return it with the radix tree entry locked. If the radix tree doesn't
246 * radix tree entry locked. If the radix tree doesn't contain given index, 304 * contain given index, create an empty exceptional entry for the index and
247 * create empty exceptional entry for the index and return with it locked. 305 * return with it locked.
248 * 306 *
249 * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will 307 * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
250 * either return that locked entry or will return an error. This error will 308 * either return that locked entry or will return an error. This error will
251 * happen if there are any 4k entries (either zero pages or DAX entries) 309 * happen if there are any 4k entries within the 2MiB range that we are
252 * within the 2MiB range that we are requesting. 310 * requesting.
253 * 311 *
254 * We always favor 4k entries over 2MiB entries. There isn't a flow where we 312 * We always favor 4k entries over 2MiB entries. There isn't a flow where we
255 * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB 313 * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB
@@ -276,18 +334,21 @@ restart:
276 spin_lock_irq(&mapping->tree_lock); 334 spin_lock_irq(&mapping->tree_lock);
277 entry = get_unlocked_mapping_entry(mapping, index, &slot); 335 entry = get_unlocked_mapping_entry(mapping, index, &slot);
278 336
337 if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) {
338 entry = ERR_PTR(-EIO);
339 goto out_unlock;
340 }
341
279 if (entry) { 342 if (entry) {
280 if (size_flag & RADIX_DAX_PMD) { 343 if (size_flag & RADIX_DAX_PMD) {
281 if (!radix_tree_exceptional_entry(entry) || 344 if (dax_is_pte_entry(entry)) {
282 dax_is_pte_entry(entry)) {
283 put_unlocked_mapping_entry(mapping, index, 345 put_unlocked_mapping_entry(mapping, index,
284 entry); 346 entry);
285 entry = ERR_PTR(-EEXIST); 347 entry = ERR_PTR(-EEXIST);
286 goto out_unlock; 348 goto out_unlock;
287 } 349 }
288 } else { /* trying to grab a PTE entry */ 350 } else { /* trying to grab a PTE entry */
289 if (radix_tree_exceptional_entry(entry) && 351 if (dax_is_pmd_entry(entry) &&
290 dax_is_pmd_entry(entry) &&
291 (dax_is_zero_entry(entry) || 352 (dax_is_zero_entry(entry) ||
292 dax_is_empty_entry(entry))) { 353 dax_is_empty_entry(entry))) {
293 pmd_downgrade = true; 354 pmd_downgrade = true;
@@ -321,7 +382,7 @@ restart:
321 mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); 382 mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
322 if (err) { 383 if (err) {
323 if (pmd_downgrade) 384 if (pmd_downgrade)
324 put_locked_mapping_entry(mapping, index, entry); 385 put_locked_mapping_entry(mapping, index);
325 return ERR_PTR(err); 386 return ERR_PTR(err);
326 } 387 }
327 spin_lock_irq(&mapping->tree_lock); 388 spin_lock_irq(&mapping->tree_lock);
@@ -371,52 +432,12 @@ restart:
371 spin_unlock_irq(&mapping->tree_lock); 432 spin_unlock_irq(&mapping->tree_lock);
372 return entry; 433 return entry;
373 } 434 }
374 /* Normal page in radix tree? */
375 if (!radix_tree_exceptional_entry(entry)) {
376 struct page *page = entry;
377
378 get_page(page);
379 spin_unlock_irq(&mapping->tree_lock);
380 lock_page(page);
381 /* Page got truncated? Retry... */
382 if (unlikely(page->mapping != mapping)) {
383 unlock_page(page);
384 put_page(page);
385 goto restart;
386 }
387 return page;
388 }
389 entry = lock_slot(mapping, slot); 435 entry = lock_slot(mapping, slot);
390 out_unlock: 436 out_unlock:
391 spin_unlock_irq(&mapping->tree_lock); 437 spin_unlock_irq(&mapping->tree_lock);
392 return entry; 438 return entry;
393} 439}
394 440
395/*
396 * We do not necessarily hold the mapping->tree_lock when we call this
397 * function so it is possible that 'entry' is no longer a valid item in the
398 * radix tree. This is okay because all we really need to do is to find the
399 * correct waitqueue where tasks might be waiting for that old 'entry' and
400 * wake them.
401 */
402void dax_wake_mapping_entry_waiter(struct address_space *mapping,
403 pgoff_t index, void *entry, bool wake_all)
404{
405 struct exceptional_entry_key key;
406 wait_queue_head_t *wq;
407
408 wq = dax_entry_waitqueue(mapping, index, entry, &key);
409
410 /*
411 * Checking for locked entry and prepare_to_wait_exclusive() happens
412 * under mapping->tree_lock, ditto for entry handling in our callers.
413 * So at this point all tasks that could have seen our entry locked
414 * must be in the waitqueue and the following check will see them.
415 */
416 if (waitqueue_active(wq))
417 __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
418}
419
420static int __dax_invalidate_mapping_entry(struct address_space *mapping, 441static int __dax_invalidate_mapping_entry(struct address_space *mapping,
421 pgoff_t index, bool trunc) 442 pgoff_t index, bool trunc)
422{ 443{
@@ -426,7 +447,7 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping,
426 447
427 spin_lock_irq(&mapping->tree_lock); 448 spin_lock_irq(&mapping->tree_lock);
428 entry = get_unlocked_mapping_entry(mapping, index, NULL); 449 entry = get_unlocked_mapping_entry(mapping, index, NULL);
429 if (!entry || !radix_tree_exceptional_entry(entry)) 450 if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)))
430 goto out; 451 goto out;
431 if (!trunc && 452 if (!trunc &&
432 (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || 453 (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
@@ -468,50 +489,6 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
468 return __dax_invalidate_mapping_entry(mapping, index, false); 489 return __dax_invalidate_mapping_entry(mapping, index, false);
469} 490}
470 491
471/*
472 * The user has performed a load from a hole in the file. Allocating
473 * a new page in the file would cause excessive storage usage for
474 * workloads with sparse files. We allocate a page cache page instead.
475 * We'll kick it out of the page cache if it's ever written to,
476 * otherwise it will simply fall out of the page cache under memory
477 * pressure without ever having been dirtied.
478 */
479static int dax_load_hole(struct address_space *mapping, void **entry,
480 struct vm_fault *vmf)
481{
482 struct inode *inode = mapping->host;
483 struct page *page;
484 int ret;
485
486 /* Hole page already exists? Return it... */
487 if (!radix_tree_exceptional_entry(*entry)) {
488 page = *entry;
489 goto finish_fault;
490 }
491
492 /* This will replace locked radix tree entry with a hole page */
493 page = find_or_create_page(mapping, vmf->pgoff,
494 vmf->gfp_mask | __GFP_ZERO);
495 if (!page) {
496 ret = VM_FAULT_OOM;
497 goto out;
498 }
499
500finish_fault:
501 vmf->page = page;
502 ret = finish_fault(vmf);
503 vmf->page = NULL;
504 *entry = page;
505 if (!ret) {
506 /* Grab reference for PTE that is now referencing the page */
507 get_page(page);
508 ret = VM_FAULT_NOPAGE;
509 }
510out:
511 trace_dax_load_hole(inode, vmf, ret);
512 return ret;
513}
514
515static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, 492static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
516 sector_t sector, size_t size, struct page *to, 493 sector_t sector, size_t size, struct page *to,
517 unsigned long vaddr) 494 unsigned long vaddr)
@@ -552,47 +529,27 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
552 unsigned long flags) 529 unsigned long flags)
553{ 530{
554 struct radix_tree_root *page_tree = &mapping->page_tree; 531 struct radix_tree_root *page_tree = &mapping->page_tree;
555 int error = 0;
556 bool hole_fill = false;
557 void *new_entry; 532 void *new_entry;
558 pgoff_t index = vmf->pgoff; 533 pgoff_t index = vmf->pgoff;
559 534
560 if (vmf->flags & FAULT_FLAG_WRITE) 535 if (vmf->flags & FAULT_FLAG_WRITE)
561 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 536 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
562 537
563 /* Replacing hole page with block mapping? */ 538 if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
564 if (!radix_tree_exceptional_entry(entry)) { 539 /* we are replacing a zero page with block mapping */
565 hole_fill = true; 540 if (dax_is_pmd_entry(entry))
566 /* 541 unmap_mapping_range(mapping,
567 * Unmap the page now before we remove it from page cache below. 542 (vmf->pgoff << PAGE_SHIFT) & PMD_MASK,
568 * The page is locked so it cannot be faulted in again. 543 PMD_SIZE, 0);
569 */ 544 else /* pte entry */
570 unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, 545 unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
571 PAGE_SIZE, 0); 546 PAGE_SIZE, 0);
572 error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
573 if (error)
574 return ERR_PTR(error);
575 } else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) {
576 /* replacing huge zero page with PMD block mapping */
577 unmap_mapping_range(mapping,
578 (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
579 } 547 }
580 548
581 spin_lock_irq(&mapping->tree_lock); 549 spin_lock_irq(&mapping->tree_lock);
582 new_entry = dax_radix_locked_entry(sector, flags); 550 new_entry = dax_radix_locked_entry(sector, flags);
583 551
584 if (hole_fill) { 552 if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
585 __delete_from_page_cache(entry, NULL);
586 /* Drop pagecache reference */
587 put_page(entry);
588 error = __radix_tree_insert(page_tree, index,
589 dax_radix_order(new_entry), new_entry);
590 if (error) {
591 new_entry = ERR_PTR(error);
592 goto unlock;
593 }
594 mapping->nrexceptional++;
595 } else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
596 /* 553 /*
597 * Only swap our new entry into the radix tree if the current 554 * Only swap our new entry into the radix tree if the current
598 * entry is a zero page or an empty entry. If a normal PTE or 555 * entry is a zero page or an empty entry. If a normal PTE or
@@ -609,23 +566,14 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
609 WARN_ON_ONCE(ret != entry); 566 WARN_ON_ONCE(ret != entry);
610 __radix_tree_replace(page_tree, node, slot, 567 __radix_tree_replace(page_tree, node, slot,
611 new_entry, NULL, NULL); 568 new_entry, NULL, NULL);
569 entry = new_entry;
612 } 570 }
571
613 if (vmf->flags & FAULT_FLAG_WRITE) 572 if (vmf->flags & FAULT_FLAG_WRITE)
614 radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); 573 radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
615 unlock: 574
616 spin_unlock_irq(&mapping->tree_lock); 575 spin_unlock_irq(&mapping->tree_lock);
617 if (hole_fill) { 576 return entry;
618 radix_tree_preload_end();
619 /*
620 * We don't need hole page anymore, it has been replaced with
621 * locked radix tree entry now.
622 */
623 if (mapping->a_ops->freepage)
624 mapping->a_ops->freepage(entry);
625 unlock_page(entry);
626 put_page(entry);
627 }
628 return new_entry;
629} 577}
630 578
631static inline unsigned long 579static inline unsigned long
@@ -727,7 +675,7 @@ static int dax_writeback_one(struct block_device *bdev,
727 spin_lock_irq(&mapping->tree_lock); 675 spin_lock_irq(&mapping->tree_lock);
728 entry2 = get_unlocked_mapping_entry(mapping, index, &slot); 676 entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
729 /* Entry got punched out / reallocated? */ 677 /* Entry got punched out / reallocated? */
730 if (!entry2 || !radix_tree_exceptional_entry(entry2)) 678 if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2)))
731 goto put_unlocked; 679 goto put_unlocked;
732 /* 680 /*
733 * Entry got reallocated elsewhere? No need to writeback. We have to 681 * Entry got reallocated elsewhere? No need to writeback. We have to
@@ -799,7 +747,7 @@ static int dax_writeback_one(struct block_device *bdev,
799 trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT); 747 trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
800 dax_unlock: 748 dax_unlock:
801 dax_read_unlock(id); 749 dax_read_unlock(id);
802 put_locked_mapping_entry(mapping, index, entry); 750 put_locked_mapping_entry(mapping, index);
803 return ret; 751 return ret;
804 752
805 put_unlocked: 753 put_unlocked:
@@ -874,11 +822,10 @@ EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
874 822
875static int dax_insert_mapping(struct address_space *mapping, 823static int dax_insert_mapping(struct address_space *mapping,
876 struct block_device *bdev, struct dax_device *dax_dev, 824 struct block_device *bdev, struct dax_device *dax_dev,
877 sector_t sector, size_t size, void **entryp, 825 sector_t sector, size_t size, void *entry,
878 struct vm_area_struct *vma, struct vm_fault *vmf) 826 struct vm_area_struct *vma, struct vm_fault *vmf)
879{ 827{
880 unsigned long vaddr = vmf->address; 828 unsigned long vaddr = vmf->address;
881 void *entry = *entryp;
882 void *ret, *kaddr; 829 void *ret, *kaddr;
883 pgoff_t pgoff; 830 pgoff_t pgoff;
884 int id, rc; 831 int id, rc;
@@ -899,47 +846,48 @@ static int dax_insert_mapping(struct address_space *mapping,
899 ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0); 846 ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
900 if (IS_ERR(ret)) 847 if (IS_ERR(ret))
901 return PTR_ERR(ret); 848 return PTR_ERR(ret);
902 *entryp = ret;
903 849
904 trace_dax_insert_mapping(mapping->host, vmf, ret); 850 trace_dax_insert_mapping(mapping->host, vmf, ret);
905 return vm_insert_mixed(vma, vaddr, pfn); 851 if (vmf->flags & FAULT_FLAG_WRITE)
852 return vm_insert_mixed_mkwrite(vma, vaddr, pfn);
853 else
854 return vm_insert_mixed(vma, vaddr, pfn);
906} 855}
907 856
908/** 857/*
909 * dax_pfn_mkwrite - handle first write to DAX page 858 * The user has performed a load from a hole in the file. Allocating a new
910 * @vmf: The description of the fault 859 * page in the file would cause excessive storage usage for workloads with
860 * sparse files. Instead we insert a read-only mapping of the 4k zero page.
861 * If this page is ever written to we will re-fault and change the mapping to
862 * point to real DAX storage instead.
911 */ 863 */
912int dax_pfn_mkwrite(struct vm_fault *vmf) 864static int dax_load_hole(struct address_space *mapping, void *entry,
865 struct vm_fault *vmf)
913{ 866{
914 struct file *file = vmf->vma->vm_file;
915 struct address_space *mapping = file->f_mapping;
916 struct inode *inode = mapping->host; 867 struct inode *inode = mapping->host;
917 void *entry, **slot; 868 unsigned long vaddr = vmf->address;
918 pgoff_t index = vmf->pgoff; 869 int ret = VM_FAULT_NOPAGE;
870 struct page *zero_page;
871 void *entry2;
919 872
920 spin_lock_irq(&mapping->tree_lock); 873 zero_page = ZERO_PAGE(0);
921 entry = get_unlocked_mapping_entry(mapping, index, &slot); 874 if (unlikely(!zero_page)) {
922 if (!entry || !radix_tree_exceptional_entry(entry)) { 875 ret = VM_FAULT_OOM;
923 if (entry) 876 goto out;
924 put_unlocked_mapping_entry(mapping, index, entry);
925 spin_unlock_irq(&mapping->tree_lock);
926 trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE);
927 return VM_FAULT_NOPAGE;
928 } 877 }
929 radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); 878
930 entry = lock_slot(mapping, slot); 879 entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0,
931 spin_unlock_irq(&mapping->tree_lock); 880 RADIX_DAX_ZERO_PAGE);
932 /* 881 if (IS_ERR(entry2)) {
933 * If we race with somebody updating the PTE and finish_mkwrite_fault() 882 ret = VM_FAULT_SIGBUS;
934 * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry 883 goto out;
935 * the fault in either case. 884 }
936 */ 885
937 finish_mkwrite_fault(vmf); 886 vm_insert_mixed(vmf->vma, vaddr, page_to_pfn_t(zero_page));
938 put_locked_mapping_entry(mapping, index, entry); 887out:
939 trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE); 888 trace_dax_load_hole(inode, vmf, ret);
940 return VM_FAULT_NOPAGE; 889 return ret;
941} 890}
942EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
943 891
944static bool dax_range_is_aligned(struct block_device *bdev, 892static bool dax_range_is_aligned(struct block_device *bdev,
945 unsigned int offset, unsigned int length) 893 unsigned int offset, unsigned int length)
@@ -1059,6 +1007,11 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
1059 if (map_len > end - pos) 1007 if (map_len > end - pos)
1060 map_len = end - pos; 1008 map_len = end - pos;
1061 1009
1010 /*
1011 * The userspace address for the memory copy has already been
1012 * validated via access_ok() in either vfs_read() or
1013 * vfs_write(), depending on which operation we are doing.
1014 */
1062 if (iov_iter_rw(iter) == WRITE) 1015 if (iov_iter_rw(iter) == WRITE)
1063 map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr, 1016 map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr,
1064 map_len, iter); 1017 map_len, iter);
@@ -1223,7 +1176,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
1223 major = VM_FAULT_MAJOR; 1176 major = VM_FAULT_MAJOR;
1224 } 1177 }
1225 error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, 1178 error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
1226 sector, PAGE_SIZE, &entry, vmf->vma, vmf); 1179 sector, PAGE_SIZE, entry, vmf->vma, vmf);
1227 /* -EBUSY is fine, somebody else faulted on the same PTE */ 1180 /* -EBUSY is fine, somebody else faulted on the same PTE */
1228 if (error == -EBUSY) 1181 if (error == -EBUSY)
1229 error = 0; 1182 error = 0;
@@ -1231,7 +1184,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
1231 case IOMAP_UNWRITTEN: 1184 case IOMAP_UNWRITTEN:
1232 case IOMAP_HOLE: 1185 case IOMAP_HOLE:
1233 if (!(vmf->flags & FAULT_FLAG_WRITE)) { 1186 if (!(vmf->flags & FAULT_FLAG_WRITE)) {
1234 vmf_ret = dax_load_hole(mapping, &entry, vmf); 1187 vmf_ret = dax_load_hole(mapping, entry, vmf);
1235 goto finish_iomap; 1188 goto finish_iomap;
1236 } 1189 }
1237 /*FALLTHRU*/ 1190 /*FALLTHRU*/
@@ -1258,21 +1211,15 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
1258 ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); 1211 ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
1259 } 1212 }
1260 unlock_entry: 1213 unlock_entry:
1261 put_locked_mapping_entry(mapping, vmf->pgoff, entry); 1214 put_locked_mapping_entry(mapping, vmf->pgoff);
1262 out: 1215 out:
1263 trace_dax_pte_fault_done(inode, vmf, vmf_ret); 1216 trace_dax_pte_fault_done(inode, vmf, vmf_ret);
1264 return vmf_ret; 1217 return vmf_ret;
1265} 1218}
1266 1219
1267#ifdef CONFIG_FS_DAX_PMD 1220#ifdef CONFIG_FS_DAX_PMD
1268/*
1269 * The 'colour' (ie low bits) within a PMD of a page offset. This comes up
1270 * more often than one might expect in the below functions.
1271 */
1272#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
1273
1274static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, 1221static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
1275 loff_t pos, void **entryp) 1222 loff_t pos, void *entry)
1276{ 1223{
1277 struct address_space *mapping = vmf->vma->vm_file->f_mapping; 1224 struct address_space *mapping = vmf->vma->vm_file->f_mapping;
1278 const sector_t sector = dax_iomap_sector(iomap, pos); 1225 const sector_t sector = dax_iomap_sector(iomap, pos);
@@ -1283,7 +1230,7 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
1283 void *ret = NULL, *kaddr; 1230 void *ret = NULL, *kaddr;
1284 long length = 0; 1231 long length = 0;
1285 pgoff_t pgoff; 1232 pgoff_t pgoff;
1286 pfn_t pfn; 1233 pfn_t pfn = {};
1287 int id; 1234 int id;
1288 1235
1289 if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0) 1236 if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)
@@ -1303,11 +1250,10 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
1303 goto unlock_fallback; 1250 goto unlock_fallback;
1304 dax_read_unlock(id); 1251 dax_read_unlock(id);
1305 1252
1306 ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector, 1253 ret = dax_insert_mapping_entry(mapping, vmf, entry, sector,
1307 RADIX_DAX_PMD); 1254 RADIX_DAX_PMD);
1308 if (IS_ERR(ret)) 1255 if (IS_ERR(ret))
1309 goto fallback; 1256 goto fallback;
1310 *entryp = ret;
1311 1257
1312 trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret); 1258 trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
1313 return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, 1259 return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
@@ -1321,7 +1267,7 @@ fallback:
1321} 1267}
1322 1268
1323static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, 1269static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
1324 void **entryp) 1270 void *entry)
1325{ 1271{
1326 struct address_space *mapping = vmf->vma->vm_file->f_mapping; 1272 struct address_space *mapping = vmf->vma->vm_file->f_mapping;
1327 unsigned long pmd_addr = vmf->address & PMD_MASK; 1273 unsigned long pmd_addr = vmf->address & PMD_MASK;
@@ -1336,11 +1282,10 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
1336 if (unlikely(!zero_page)) 1282 if (unlikely(!zero_page))
1337 goto fallback; 1283 goto fallback;
1338 1284
1339 ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0, 1285 ret = dax_insert_mapping_entry(mapping, vmf, entry, 0,
1340 RADIX_DAX_PMD | RADIX_DAX_HZP); 1286 RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE);
1341 if (IS_ERR(ret)) 1287 if (IS_ERR(ret))
1342 goto fallback; 1288 goto fallback;
1343 *entryp = ret;
1344 1289
1345 ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); 1290 ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
1346 if (!pmd_none(*(vmf->pmd))) { 1291 if (!pmd_none(*(vmf->pmd))) {
@@ -1416,10 +1361,10 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
1416 goto fallback; 1361 goto fallback;
1417 1362
1418 /* 1363 /*
1419 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX 1364 * grab_mapping_entry() will make sure we get a 2MiB empty entry, a
1420 * PMD or a HZP entry. If it can't (because a 4k page is already in 1365 * 2MiB zero page entry or a DAX PMD. If it can't (because a 4k page
1421 * the tree, for instance), it will return -EEXIST and we just fall 1366 * is already in the tree, for instance), it will return -EEXIST and
1422 * back to 4k entries. 1367 * we just fall back to 4k entries.
1423 */ 1368 */
1424 entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); 1369 entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
1425 if (IS_ERR(entry)) 1370 if (IS_ERR(entry))
@@ -1452,13 +1397,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
1452 1397
1453 switch (iomap.type) { 1398 switch (iomap.type) {
1454 case IOMAP_MAPPED: 1399 case IOMAP_MAPPED:
1455 result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry); 1400 result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry);
1456 break; 1401 break;
1457 case IOMAP_UNWRITTEN: 1402 case IOMAP_UNWRITTEN:
1458 case IOMAP_HOLE: 1403 case IOMAP_HOLE:
1459 if (WARN_ON_ONCE(write)) 1404 if (WARN_ON_ONCE(write))
1460 break; 1405 break;
1461 result = dax_pmd_load_hole(vmf, &iomap, &entry); 1406 result = dax_pmd_load_hole(vmf, &iomap, entry);
1462 break; 1407 break;
1463 default: 1408 default:
1464 WARN_ON_ONCE(1); 1409 WARN_ON_ONCE(1);
@@ -1481,7 +1426,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
1481 &iomap); 1426 &iomap);
1482 } 1427 }
1483 unlock_entry: 1428 unlock_entry:
1484 put_locked_mapping_entry(mapping, pgoff, entry); 1429 put_locked_mapping_entry(mapping, pgoff);
1485 fallback: 1430 fallback:
1486 if (result == VM_FAULT_FALLBACK) { 1431 if (result == VM_FAULT_FALLBACK) {
1487 split_huge_pmd(vma, vmf->pmd, vmf->address); 1432 split_huge_pmd(vma, vmf->pmd, vmf->address);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index d34d32bdc944..ff3a3636a5ca 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -107,29 +107,6 @@ static int ext2_dax_fault(struct vm_fault *vmf)
107 return ret; 107 return ret;
108} 108}
109 109
110static int ext2_dax_pfn_mkwrite(struct vm_fault *vmf)
111{
112 struct inode *inode = file_inode(vmf->vma->vm_file);
113 struct ext2_inode_info *ei = EXT2_I(inode);
114 loff_t size;
115 int ret;
116
117 sb_start_pagefault(inode->i_sb);
118 file_update_time(vmf->vma->vm_file);
119 down_read(&ei->dax_sem);
120
121 /* check that the faulting page hasn't raced with truncate */
122 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
123 if (vmf->pgoff >= size)
124 ret = VM_FAULT_SIGBUS;
125 else
126 ret = dax_pfn_mkwrite(vmf);
127
128 up_read(&ei->dax_sem);
129 sb_end_pagefault(inode->i_sb);
130 return ret;
131}
132
133static const struct vm_operations_struct ext2_dax_vm_ops = { 110static const struct vm_operations_struct ext2_dax_vm_ops = {
134 .fault = ext2_dax_fault, 111 .fault = ext2_dax_fault,
135 /* 112 /*
@@ -138,7 +115,7 @@ static const struct vm_operations_struct ext2_dax_vm_ops = {
138 * will always fail and fail back to regular faults. 115 * will always fail and fail back to regular faults.
139 */ 116 */
140 .page_mkwrite = ext2_dax_fault, 117 .page_mkwrite = ext2_dax_fault,
141 .pfn_mkwrite = ext2_dax_pfn_mkwrite, 118 .pfn_mkwrite = ext2_dax_fault,
142}; 119};
143 120
144static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma) 121static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
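[Editor's note: a hedged sketch of the resulting pattern.] With dax_pfn_mkwrite() removed, ext2 (above) and ext4 (below) simply point .pfn_mkwrite at their existing fault handler: the write fault re-runs the iomap fault path, which replaces the read-only zero-page entry installed by dax_load_hole() with a writable block mapping via vm_insert_mixed_mkwrite(). For a hypothetical filesystem this ends up looking like the following, assuming the dax_iomap_fault() signature used in this series; the myfs_* names are invented:

    extern const struct iomap_ops myfs_iomap_ops;   /* filesystem-specific */

    static int myfs_dax_fault(struct vm_fault *vmf)
    {
            /* serialise against truncate, update mtime, etc. as needed */
            return dax_iomap_fault(vmf, PE_SIZE_PTE, &myfs_iomap_ops);
    }

    static const struct vm_operations_struct myfs_dax_vm_ops = {
            .fault          = myfs_dax_fault,
            .page_mkwrite   = myfs_dax_fault,
            .pfn_mkwrite    = myfs_dax_fault,
    };
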
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 197653ea6041..57dcaea762c3 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -324,41 +324,11 @@ static int ext4_dax_fault(struct vm_fault *vmf)
324 return ext4_dax_huge_fault(vmf, PE_SIZE_PTE); 324 return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
325} 325}
326 326
327/*
328 * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault()
329 * handler we check for races agaist truncate. Note that since we cycle through
330 * i_mmap_sem, we are sure that also any hole punching that began before we
331 * were called is finished by now and so if it included part of the file we
332 * are working on, our pte will get unmapped and the check for pte_same() in
333 * wp_pfn_shared() fails. Thus fault gets retried and things work out as
334 * desired.
335 */
336static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf)
337{
338 struct inode *inode = file_inode(vmf->vma->vm_file);
339 struct super_block *sb = inode->i_sb;
340 loff_t size;
341 int ret;
342
343 sb_start_pagefault(sb);
344 file_update_time(vmf->vma->vm_file);
345 down_read(&EXT4_I(inode)->i_mmap_sem);
346 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
347 if (vmf->pgoff >= size)
348 ret = VM_FAULT_SIGBUS;
349 else
350 ret = dax_pfn_mkwrite(vmf);
351 up_read(&EXT4_I(inode)->i_mmap_sem);
352 sb_end_pagefault(sb);
353
354 return ret;
355}
356
357static const struct vm_operations_struct ext4_dax_vm_ops = { 327static const struct vm_operations_struct ext4_dax_vm_ops = {
358 .fault = ext4_dax_fault, 328 .fault = ext4_dax_fault,
359 .huge_fault = ext4_dax_huge_fault, 329 .huge_fault = ext4_dax_huge_fault,
360 .page_mkwrite = ext4_dax_fault, 330 .page_mkwrite = ext4_dax_fault,
361 .pfn_mkwrite = ext4_dax_pfn_mkwrite, 331 .pfn_mkwrite = ext4_dax_fault,
362}; 332};
363#else 333#else
364#define ext4_dax_vm_ops ext4_file_vm_ops 334#define ext4_dax_vm_ops ext4_file_vm_ops
@@ -507,12 +477,11 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
507 477
508 pagevec_init(&pvec, 0); 478 pagevec_init(&pvec, 0);
509 do { 479 do {
510 int i, num; 480 int i;
511 unsigned long nr_pages; 481 unsigned long nr_pages;
512 482
513 num = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1; 483 nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping,
514 nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, 484 &index, end);
515 (pgoff_t)num);
516 if (nr_pages == 0) 485 if (nr_pages == 0)
517 break; 486 break;
518 487
@@ -531,9 +500,6 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
531 goto out; 500 goto out;
532 } 501 }
533 502
534 if (page->index > end)
535 goto out;
536
537 lock_page(page); 503 lock_page(page);
538 504
539 if (unlikely(page->mapping != inode->i_mapping)) { 505 if (unlikely(page->mapping != inode->i_mapping)) {
@@ -576,14 +542,10 @@ next:
576 unlock_page(page); 542 unlock_page(page);
577 } 543 }
578 544
579 /* The no. of pages is less than our desired, we are done. */
580 if (nr_pages < num)
581 break;
582
583 index = pvec.pages[i - 1]->index + 1;
584 pagevec_release(&pvec); 545 pagevec_release(&pvec);
585 } while (index <= end); 546 } while (index <= end);
586 547
 548 /* There are no pages up to endoff - that would be a hole in there. */
587 if (whence == SEEK_HOLE && lastoff < endoff) { 549 if (whence == SEEK_HOLE && lastoff < endoff) {
588 found = 1; 550 found = 1;
589 *offset = lastoff; 551 *offset = lastoff;
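[Sketch, not part of the patch: the pagevec conversions in this series (ext4, fscache, hugetlbfs, nilfs2, and the find_get_pages() caller in ramfs below) all reduce to one shape. pagevec_lookup_range() advances the caller's index and never returns pages past 'end', so the manual 'page->index > end' checks and index bookkeeping disappear. The function and variable names below are placeholders.]

	static void example_walk_range(struct address_space *mapping,
				       pgoff_t start, pgoff_t end)
	{
		struct pagevec pvec;
		pgoff_t index = start;
		unsigned nr_pages;
		int i;

		pagevec_init(&pvec, 0);
		while (index <= end) {
			/* Updates 'index' past the last page returned and never
			 * returns pages beyond 'end'. */
			nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end);
			if (!nr_pages)
				break;
			for (i = 0; i < nr_pages; i++) {
				struct page *page = pvec.pages[i];
				/* per-page work goes here */
			}
			pagevec_release(&pvec);
			cond_resched();
		}
	}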
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 714396760616..e963508ea35f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1720,13 +1720,12 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
1720 1720
1721 pagevec_init(&pvec, 0); 1721 pagevec_init(&pvec, 0);
1722 while (index <= end) { 1722 while (index <= end) {
1723 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 1723 nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end);
1724 if (nr_pages == 0) 1724 if (nr_pages == 0)
1725 break; 1725 break;
1726 for (i = 0; i < nr_pages; i++) { 1726 for (i = 0; i < nr_pages; i++) {
1727 struct page *page = pvec.pages[i]; 1727 struct page *page = pvec.pages[i];
1728 if (page->index > end) 1728
1729 break;
1730 BUG_ON(!PageLocked(page)); 1729 BUG_ON(!PageLocked(page));
1731 BUG_ON(PageWriteback(page)); 1730 BUG_ON(PageWriteback(page));
1732 if (invalidate) { 1731 if (invalidate) {
@@ -1737,7 +1736,6 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
1737 } 1736 }
1738 unlock_page(page); 1737 unlock_page(page);
1739 } 1738 }
1740 index = pvec.pages[nr_pages - 1]->index + 1;
1741 pagevec_release(&pvec); 1739 pagevec_release(&pvec);
1742 } 1740 }
1743} 1741}
@@ -2348,17 +2346,13 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
2348 2346
2349 pagevec_init(&pvec, 0); 2347 pagevec_init(&pvec, 0);
2350 while (start <= end) { 2348 while (start <= end) {
2351 nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start, 2349 nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping,
2352 PAGEVEC_SIZE); 2350 &start, end);
2353 if (nr_pages == 0) 2351 if (nr_pages == 0)
2354 break; 2352 break;
2355 for (i = 0; i < nr_pages; i++) { 2353 for (i = 0; i < nr_pages; i++) {
2356 struct page *page = pvec.pages[i]; 2354 struct page *page = pvec.pages[i];
2357 2355
2358 if (page->index > end)
2359 break;
2360 /* Up to 'end' pages must be contiguous */
2361 BUG_ON(page->index != start);
2362 bh = head = page_buffers(page); 2356 bh = head = page_buffers(page);
2363 do { 2357 do {
2364 if (lblk < mpd->map.m_lblk) 2358 if (lblk < mpd->map.m_lblk)
@@ -2403,7 +2397,6 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
2403 pagevec_release(&pvec); 2397 pagevec_release(&pvec);
2404 return err; 2398 return err;
2405 } 2399 }
2406 start++;
2407 } 2400 }
2408 pagevec_release(&pvec); 2401 pagevec_release(&pvec);
2409 } 2402 }
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index c8c4f79c7ce1..0ad3fd3ad0b4 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -1178,11 +1178,10 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie,
1178 pagevec_init(&pvec, 0); 1178 pagevec_init(&pvec, 0);
1179 next = 0; 1179 next = 0;
1180 do { 1180 do {
1181 if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) 1181 if (!pagevec_lookup(&pvec, mapping, &next))
1182 break; 1182 break;
1183 for (i = 0; i < pagevec_count(&pvec); i++) { 1183 for (i = 0; i < pagevec_count(&pvec); i++) {
1184 struct page *page = pvec.pages[i]; 1184 struct page *page = pvec.pages[i];
1185 next = page->index;
1186 if (PageFsCache(page)) { 1185 if (PageFsCache(page)) {
1187 __fscache_wait_on_page_write(cookie, page); 1186 __fscache_wait_on_page_write(cookie, page);
1188 __fscache_uncache_page(cookie, page); 1187 __fscache_uncache_page(cookie, page);
@@ -1190,7 +1189,7 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie,
1190 } 1189 }
1191 pagevec_release(&pvec); 1190 pagevec_release(&pvec);
1192 cond_resched(); 1191 cond_resched();
1193 } while (++next); 1192 } while (next);
1194 1193
1195 _leave(""); 1194 _leave("");
1196} 1195}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 28d2753be094..7c02b3f738e1 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -401,9 +401,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
401 const pgoff_t end = lend >> huge_page_shift(h); 401 const pgoff_t end = lend >> huge_page_shift(h);
402 struct vm_area_struct pseudo_vma; 402 struct vm_area_struct pseudo_vma;
403 struct pagevec pvec; 403 struct pagevec pvec;
404 pgoff_t next; 404 pgoff_t next, index;
405 int i, freed = 0; 405 int i, freed = 0;
406 long lookup_nr = PAGEVEC_SIZE;
407 bool truncate_op = (lend == LLONG_MAX); 406 bool truncate_op = (lend == LLONG_MAX);
408 407
409 memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); 408 memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
@@ -412,33 +411,19 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
412 next = start; 411 next = start;
413 while (next < end) { 412 while (next < end) {
414 /* 413 /*
415 * Don't grab more pages than the number left in the range.
416 */
417 if (end - next < lookup_nr)
418 lookup_nr = end - next;
419
420 /*
421 * When no more pages are found, we are done. 414 * When no more pages are found, we are done.
422 */ 415 */
423 if (!pagevec_lookup(&pvec, mapping, next, lookup_nr)) 416 if (!pagevec_lookup_range(&pvec, mapping, &next, end - 1))
424 break; 417 break;
425 418
426 for (i = 0; i < pagevec_count(&pvec); ++i) { 419 for (i = 0; i < pagevec_count(&pvec); ++i) {
427 struct page *page = pvec.pages[i]; 420 struct page *page = pvec.pages[i];
428 u32 hash; 421 u32 hash;
429 422
430 /* 423 index = page->index;
431 * The page (index) could be beyond end. This is
432 * only possible in the punch hole case as end is
433 * max page offset in the truncate case.
434 */
435 next = page->index;
436 if (next >= end)
437 break;
438
439 hash = hugetlb_fault_mutex_hash(h, current->mm, 424 hash = hugetlb_fault_mutex_hash(h, current->mm,
440 &pseudo_vma, 425 &pseudo_vma,
441 mapping, next, 0); 426 mapping, index, 0);
442 mutex_lock(&hugetlb_fault_mutex_table[hash]); 427 mutex_lock(&hugetlb_fault_mutex_table[hash]);
443 428
444 /* 429 /*
@@ -455,8 +440,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
455 440
456 i_mmap_lock_write(mapping); 441 i_mmap_lock_write(mapping);
457 hugetlb_vmdelete_list(&mapping->i_mmap, 442 hugetlb_vmdelete_list(&mapping->i_mmap,
458 next * pages_per_huge_page(h), 443 index * pages_per_huge_page(h),
459 (next + 1) * pages_per_huge_page(h)); 444 (index + 1) * pages_per_huge_page(h));
460 i_mmap_unlock_write(mapping); 445 i_mmap_unlock_write(mapping);
461 } 446 }
462 447
@@ -475,14 +460,13 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
475 freed++; 460 freed++;
476 if (!truncate_op) { 461 if (!truncate_op) {
477 if (unlikely(hugetlb_unreserve_pages(inode, 462 if (unlikely(hugetlb_unreserve_pages(inode,
478 next, next + 1, 1))) 463 index, index + 1, 1)))
479 hugetlb_fix_reserve_counts(inode); 464 hugetlb_fix_reserve_counts(inode);
480 } 465 }
481 466
482 unlock_page(page); 467 unlock_page(page);
483 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 468 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
484 } 469 }
485 ++next;
486 huge_pagevec_release(&pvec); 470 huge_pagevec_release(&pvec);
487 cond_resched(); 471 cond_resched();
488 } 472 }
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
index 777b055063f6..3025fe8584a0 100644
--- a/fs/nfs/fscache-index.c
+++ b/fs/nfs/fscache-index.c
@@ -252,45 +252,6 @@ enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data,
252} 252}
253 253
254/* 254/*
255 * Indication from FS-Cache that the cookie is no longer cached
256 * - This function is called when the backing store currently caching a cookie
257 * is removed
258 * - The netfs should use this to clean up any markers indicating cached pages
259 * - This is mandatory for any object that may have data
260 */
261static void nfs_fscache_inode_now_uncached(void *cookie_netfs_data)
262{
263 struct nfs_inode *nfsi = cookie_netfs_data;
264 struct pagevec pvec;
265 pgoff_t first;
266 int loop, nr_pages;
267
268 pagevec_init(&pvec, 0);
269 first = 0;
270
271 dprintk("NFS: nfs_inode_now_uncached: nfs_inode 0x%p\n", nfsi);
272
273 for (;;) {
274 /* grab a bunch of pages to unmark */
275 nr_pages = pagevec_lookup(&pvec,
276 nfsi->vfs_inode.i_mapping,
277 first,
278 PAGEVEC_SIZE - pagevec_count(&pvec));
279 if (!nr_pages)
280 break;
281
282 for (loop = 0; loop < nr_pages; loop++)
283 ClearPageFsCache(pvec.pages[loop]);
284
285 first = pvec.pages[nr_pages - 1]->index + 1;
286
287 pvec.nr = nr_pages;
288 pagevec_release(&pvec);
289 cond_resched();
290 }
291}
292
293/*
294 * Get an extra reference on a read context. 255 * Get an extra reference on a read context.
295 * - This function can be absent if the completion function doesn't require a 256 * - This function can be absent if the completion function doesn't require a
296 * context. 257 * context.
@@ -330,7 +291,6 @@ const struct fscache_cookie_def nfs_fscache_inode_object_def = {
330 .get_attr = nfs_fscache_inode_get_attr, 291 .get_attr = nfs_fscache_inode_get_attr,
331 .get_aux = nfs_fscache_inode_get_aux, 292 .get_aux = nfs_fscache_inode_get_aux,
332 .check_aux = nfs_fscache_inode_check_aux, 293 .check_aux = nfs_fscache_inode_check_aux,
333 .now_uncached = nfs_fscache_inode_now_uncached,
334 .get_context = nfs_fh_get_context, 294 .get_context = nfs_fh_get_context,
335 .put_context = nfs_fh_put_context, 295 .put_context = nfs_fh_put_context,
336}; 296};
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index f11a3ad2df0c..8616c46d33da 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -312,10 +312,9 @@ void nilfs_copy_back_pages(struct address_space *dmap,
312 312
313 pagevec_init(&pvec, 0); 313 pagevec_init(&pvec, 0);
314repeat: 314repeat:
315 n = pagevec_lookup(&pvec, smap, index, PAGEVEC_SIZE); 315 n = pagevec_lookup(&pvec, smap, &index);
316 if (!n) 316 if (!n)
317 return; 317 return;
318 index = pvec.pages[n - 1]->index + 1;
319 318
320 for (i = 0; i < pagevec_count(&pvec); i++) { 319 for (i = 0; i < pagevec_count(&pvec); i++) {
321 struct page *page = pvec.pages[i], *dpage; 320 struct page *page = pvec.pages[i], *dpage;
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index e50a387959bf..40b5cc97f7b0 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -221,7 +221,7 @@ out:
221/* 221/*
222 * Set the access or default ACL of an inode. 222 * Set the access or default ACL of an inode.
223 */ 223 */
224int ocfs2_set_acl(handle_t *handle, 224static int ocfs2_set_acl(handle_t *handle,
225 struct inode *inode, 225 struct inode *inode,
226 struct buffer_head *di_bh, 226 struct buffer_head *di_bh,
227 int type, 227 int type,
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 2783a75b3999..7be0bb756286 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -28,13 +28,6 @@ struct ocfs2_acl_entry {
28 28
29struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type); 29struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type);
30int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type); 30int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type);
31int ocfs2_set_acl(handle_t *handle,
32 struct inode *inode,
33 struct buffer_head *di_bh,
34 int type,
35 struct posix_acl *acl,
36 struct ocfs2_alloc_context *meta_ac,
37 struct ocfs2_alloc_context *data_ac);
38extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *); 31extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *);
39extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, 32extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
40 struct buffer_head *, struct buffer_head *, 33 struct buffer_head *, struct buffer_head *,
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index fb15a96df0b6..a177eae3aa1a 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -955,8 +955,7 @@ int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
955/* 955/*
956 * How many free extents have we got before we need more meta data? 956 * How many free extents have we got before we need more meta data?
957 */ 957 */
958int ocfs2_num_free_extents(struct ocfs2_super *osb, 958int ocfs2_num_free_extents(struct ocfs2_extent_tree *et)
959 struct ocfs2_extent_tree *et)
960{ 959{
961 int retval; 960 int retval;
962 struct ocfs2_extent_list *el = NULL; 961 struct ocfs2_extent_list *el = NULL;
@@ -1933,14 +1932,12 @@ out:
1933 * the new changes. 1932 * the new changes.
1934 * 1933 *
1935 * left_rec: the record on the left. 1934 * left_rec: the record on the left.
1936 * left_child_el: is the child list pointed to by left_rec
1937 * right_rec: the record to the right of left_rec 1935 * right_rec: the record to the right of left_rec
1938 * right_child_el: is the child list pointed to by right_rec 1936 * right_child_el: is the child list pointed to by right_rec
1939 * 1937 *
1940 * By definition, this only works on interior nodes. 1938 * By definition, this only works on interior nodes.
1941 */ 1939 */
1942static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec, 1940static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1943 struct ocfs2_extent_list *left_child_el,
1944 struct ocfs2_extent_rec *right_rec, 1941 struct ocfs2_extent_rec *right_rec,
1945 struct ocfs2_extent_list *right_child_el) 1942 struct ocfs2_extent_list *right_child_el)
1946{ 1943{
@@ -2003,7 +2000,7 @@ static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
2003 */ 2000 */
2004 BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1)); 2001 BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
2005 2002
2006 ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el, 2003 ocfs2_adjust_adjacent_records(&root_el->l_recs[i],
2007 &root_el->l_recs[i + 1], right_el); 2004 &root_el->l_recs[i + 1], right_el);
2008} 2005}
2009 2006
@@ -2060,8 +2057,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
2060 el = right_path->p_node[i].el; 2057 el = right_path->p_node[i].el;
2061 right_rec = &el->l_recs[0]; 2058 right_rec = &el->l_recs[0];
2062 2059
2063 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec, 2060 ocfs2_adjust_adjacent_records(left_rec, right_rec, right_el);
2064 right_el);
2065 2061
2066 ocfs2_journal_dirty(handle, left_path->p_node[i].bh); 2062 ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
2067 ocfs2_journal_dirty(handle, right_path->p_node[i].bh); 2063 ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
@@ -2509,7 +2505,7 @@ out_ret_path:
2509 2505
2510static int ocfs2_update_edge_lengths(handle_t *handle, 2506static int ocfs2_update_edge_lengths(handle_t *handle,
2511 struct ocfs2_extent_tree *et, 2507 struct ocfs2_extent_tree *et,
2512 int subtree_index, struct ocfs2_path *path) 2508 struct ocfs2_path *path)
2513{ 2509{
2514 int i, idx, ret; 2510 int i, idx, ret;
2515 struct ocfs2_extent_rec *rec; 2511 struct ocfs2_extent_rec *rec;
@@ -2755,8 +2751,7 @@ static int ocfs2_rotate_subtree_left(handle_t *handle,
2755 if (del_right_subtree) { 2751 if (del_right_subtree) {
2756 ocfs2_unlink_subtree(handle, et, left_path, right_path, 2752 ocfs2_unlink_subtree(handle, et, left_path, right_path,
2757 subtree_index, dealloc); 2753 subtree_index, dealloc);
2758 ret = ocfs2_update_edge_lengths(handle, et, subtree_index, 2754 ret = ocfs2_update_edge_lengths(handle, et, left_path);
2759 left_path);
2760 if (ret) { 2755 if (ret) {
2761 mlog_errno(ret); 2756 mlog_errno(ret);
2762 goto out; 2757 goto out;
@@ -3060,8 +3055,7 @@ static int ocfs2_remove_rightmost_path(handle_t *handle,
3060 3055
3061 ocfs2_unlink_subtree(handle, et, left_path, path, 3056 ocfs2_unlink_subtree(handle, et, left_path, path,
3062 subtree_index, dealloc); 3057 subtree_index, dealloc);
3063 ret = ocfs2_update_edge_lengths(handle, et, subtree_index, 3058 ret = ocfs2_update_edge_lengths(handle, et, left_path);
3064 left_path);
3065 if (ret) { 3059 if (ret) {
3066 mlog_errno(ret); 3060 mlog_errno(ret);
3067 goto out; 3061 goto out;
@@ -4790,7 +4784,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
4790 if (mark_unwritten) 4784 if (mark_unwritten)
4791 flags = OCFS2_EXT_UNWRITTEN; 4785 flags = OCFS2_EXT_UNWRITTEN;
4792 4786
4793 free_extents = ocfs2_num_free_extents(osb, et); 4787 free_extents = ocfs2_num_free_extents(et);
4794 if (free_extents < 0) { 4788 if (free_extents < 0) {
4795 status = free_extents; 4789 status = free_extents;
4796 mlog_errno(status); 4790 mlog_errno(status);
@@ -5668,7 +5662,7 @@ static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode,
5668 5662
5669 *ac = NULL; 5663 *ac = NULL;
5670 5664
5671 num_free_extents = ocfs2_num_free_extents(osb, et); 5665 num_free_extents = ocfs2_num_free_extents(et);
5672 if (num_free_extents < 0) { 5666 if (num_free_extents < 0) {
5673 ret = num_free_extents; 5667 ret = num_free_extents;
5674 mlog_errno(ret); 5668 mlog_errno(ret);
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 4a5152ec88a3..27b75cf32cfa 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -144,8 +144,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
144 struct ocfs2_cached_dealloc_ctxt *dealloc, 144 struct ocfs2_cached_dealloc_ctxt *dealloc,
145 u64 refcount_loc, bool refcount_tree_locked); 145 u64 refcount_loc, bool refcount_tree_locked);
146 146
147int ocfs2_num_free_extents(struct ocfs2_super *osb, 147int ocfs2_num_free_extents(struct ocfs2_extent_tree *et);
148 struct ocfs2_extent_tree *et);
149 148
150/* 149/*
151 * how many new metadata chunks would an allocation need at maximum? 150 * how many new metadata chunks would an allocation need at maximum?
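[Sketch, not part of the patch: with the ocfs2_super argument dropped, callers only need an initialized extent tree. The converted call sites in this diff all reduce to the pattern below; 'inode' and 'di_bh' are placeholders.]

	struct ocfs2_extent_tree et;
	int free_extents;

	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
	free_extents = ocfs2_num_free_extents(&et);	/* no ocfs2_super argument any more */
	if (free_extents < 0)
		mlog_errno(free_extents);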
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index ffe003982d95..56ac07cd35f6 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -505,8 +505,7 @@ static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc,
505 } 505 }
506} 506}
507 507
508static void o2hb_wait_on_io(struct o2hb_region *reg, 508static void o2hb_wait_on_io(struct o2hb_bio_wait_ctxt *wc)
509 struct o2hb_bio_wait_ctxt *wc)
510{ 509{
511 o2hb_bio_wait_dec(wc, 1); 510 o2hb_bio_wait_dec(wc, 1);
512 wait_for_completion(&wc->wc_io_complete); 511 wait_for_completion(&wc->wc_io_complete);
@@ -608,7 +607,7 @@ static int o2hb_read_slots(struct o2hb_region *reg,
608 status = 0; 607 status = 0;
609 608
610bail_and_wait: 609bail_and_wait:
611 o2hb_wait_on_io(reg, &wc); 610 o2hb_wait_on_io(&wc);
612 if (wc.wc_error && !status) 611 if (wc.wc_error && !status)
613 status = wc.wc_error; 612 status = wc.wc_error;
614 613
@@ -1162,7 +1161,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
1162 * before we can go to steady state. This ensures that 1161 * before we can go to steady state. This ensures that
1163 * people we find in our steady state have seen us. 1162 * people we find in our steady state have seen us.
1164 */ 1163 */
1165 o2hb_wait_on_io(reg, &write_wc); 1164 o2hb_wait_on_io(&write_wc);
1166 if (write_wc.wc_error) { 1165 if (write_wc.wc_error) {
1167 /* Do not re-arm the write timeout on I/O error - we 1166 /* Do not re-arm the write timeout on I/O error - we
1168 * can't be sure that the new block ever made it to 1167 * can't be sure that the new block ever made it to
@@ -1275,7 +1274,7 @@ static int o2hb_thread(void *data)
1275 o2hb_prepare_block(reg, 0); 1274 o2hb_prepare_block(reg, 0);
1276 ret = o2hb_issue_node_write(reg, &write_wc); 1275 ret = o2hb_issue_node_write(reg, &write_wc);
1277 if (ret == 0) 1276 if (ret == 0)
1278 o2hb_wait_on_io(reg, &write_wc); 1277 o2hb_wait_on_io(&write_wc);
1279 else 1278 else
1280 mlog_errno(ret); 1279 mlog_errno(ret);
1281 } 1280 }
@@ -2576,22 +2575,6 @@ void o2hb_unregister_callback(const char *region_uuid,
2576} 2575}
2577EXPORT_SYMBOL_GPL(o2hb_unregister_callback); 2576EXPORT_SYMBOL_GPL(o2hb_unregister_callback);
2578 2577
2579int o2hb_check_node_heartbeating(u8 node_num)
2580{
2581 unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
2582
2583 o2hb_fill_node_map(testing_map, sizeof(testing_map));
2584 if (!test_bit(node_num, testing_map)) {
2585 mlog(ML_HEARTBEAT,
2586 "node (%u) does not have heartbeating enabled.\n",
2587 node_num);
2588 return 0;
2589 }
2590
2591 return 1;
2592}
2593EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating);
2594
2595int o2hb_check_node_heartbeating_no_sem(u8 node_num) 2578int o2hb_check_node_heartbeating_no_sem(u8 node_num)
2596{ 2579{
2597 unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 2580 unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
@@ -2626,23 +2609,6 @@ int o2hb_check_node_heartbeating_from_callback(u8 node_num)
2626} 2609}
2627EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback); 2610EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback);
2628 2611
2629/* Makes sure our local node is configured with a node number, and is
2630 * heartbeating. */
2631int o2hb_check_local_node_heartbeating(void)
2632{
2633 u8 node_num;
2634
2635 /* if this node was set then we have networking */
2636 node_num = o2nm_this_node();
2637 if (node_num == O2NM_MAX_NODES) {
2638 mlog(ML_HEARTBEAT, "this node has not been configured.\n");
2639 return 0;
2640 }
2641
2642 return o2hb_check_node_heartbeating(node_num);
2643}
2644EXPORT_SYMBOL_GPL(o2hb_check_local_node_heartbeating);
2645
2646/* 2612/*
2647 * this is just a hack until we get the plumbing which flips file systems 2613 * this is just a hack until we get the plumbing which flips file systems
2648 * read only and drops the hb ref instead of killing the node dead. 2614 * read only and drops the hb ref instead of killing the node dead.
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 3ecb9f337b7d..febe6312ceff 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -3249,7 +3249,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
3249 spin_unlock(&OCFS2_I(dir)->ip_lock); 3249 spin_unlock(&OCFS2_I(dir)->ip_lock);
3250 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), 3250 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir),
3251 parent_fe_bh); 3251 parent_fe_bh);
3252 num_free_extents = ocfs2_num_free_extents(osb, &et); 3252 num_free_extents = ocfs2_num_free_extents(&et);
3253 if (num_free_extents < 0) { 3253 if (num_free_extents < 0) {
3254 status = num_free_extents; 3254 status = num_free_extents;
3255 mlog_errno(status); 3255 mlog_errno(status);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 66e59d3163ea..6e41fc8fabbe 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -713,13 +713,6 @@ leave:
713 return status; 713 return status;
714} 714}
715 715
716int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
717 u32 clusters_to_add, int mark_unwritten)
718{
719 return __ocfs2_extend_allocation(inode, logical_start,
720 clusters_to_add, mark_unwritten);
721}
722
723/* 716/*
724 * While a write will already be ordering the data, a truncate will not. 717 * While a write will already be ordering the data, a truncate will not.
725 * Thus, we need to explicitly order the zeroed pages. 718 * Thus, we need to explicitly order the zeroed pages.
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index d5e5fa7f0743..36304434eacf 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1348,7 +1348,6 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
1348 ocfs2_schedule_truncate_log_flush(osb, 0); 1348 ocfs2_schedule_truncate_log_flush(osb, 0);
1349 1349
1350 osb->local_alloc_copy = NULL; 1350 osb->local_alloc_copy = NULL;
1351 osb->dirty = 0;
1352 1351
1353 /* queue to recover orphan slots for all offline slots */ 1352 /* queue to recover orphan slots for all offline slots */
1354 ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); 1353 ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index e52a2852d50d..7eb3b0a6347e 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -175,7 +175,7 @@ static int ocfs2_lock_allocators_move_extents(struct inode *inode,
175 unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move; 175 unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move;
176 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 176 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
177 177
178 num_free_extents = ocfs2_num_free_extents(osb, et); 178 num_free_extents = ocfs2_num_free_extents(et);
179 if (num_free_extents < 0) { 179 if (num_free_extents < 0) {
180 ret = num_free_extents; 180 ret = num_free_extents;
181 mlog_errno(ret); 181 mlog_errno(ret);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 0c39d71c67a1..9a50f222ac97 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -320,7 +320,6 @@ struct ocfs2_super
320 u64 system_dir_blkno; 320 u64 system_dir_blkno;
321 u64 bitmap_blkno; 321 u64 bitmap_blkno;
322 u32 bitmap_cpg; 322 u32 bitmap_cpg;
323 u8 *uuid;
324 char *uuid_str; 323 char *uuid_str;
325 u32 uuid_hash; 324 u32 uuid_hash;
326 u8 *vol_label; 325 u8 *vol_label;
@@ -388,9 +387,8 @@ struct ocfs2_super
388 unsigned int osb_resv_level; 387 unsigned int osb_resv_level;
389 unsigned int osb_dir_resv_level; 388 unsigned int osb_dir_resv_level;
390 389
391 /* Next three fields are for local node slot recovery during 390 /* Next two fields are for local node slot recovery during
392 * mount. */ 391 * mount. */
393 int dirty;
394 struct ocfs2_dinode *local_alloc_copy; 392 struct ocfs2_dinode *local_alloc_copy;
395 struct ocfs2_quota_recovery *quota_rec; 393 struct ocfs2_quota_recovery *quota_rec;
396 394
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index f8933cb53d68..ab156e35ec00 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2851,7 +2851,7 @@ static int ocfs2_lock_refcount_allocators(struct super_block *sb,
2851 int *credits) 2851 int *credits)
2852{ 2852{
2853 int ret = 0, meta_add = 0; 2853 int ret = 0, meta_add = 0;
2854 int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et); 2854 int num_free_extents = ocfs2_num_free_extents(et);
2855 2855
2856 if (num_free_extents < 0) { 2856 if (num_free_extents < 0) {
2857 ret = num_free_extents; 2857 ret = num_free_extents;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 6ad3533940ba..71f22c8fbffd 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -2700,7 +2700,7 @@ int ocfs2_lock_allocators(struct inode *inode,
2700 2700
2701 BUG_ON(clusters_to_add != 0 && data_ac == NULL); 2701 BUG_ON(clusters_to_add != 0 && data_ac == NULL);
2702 2702
2703 num_free_extents = ocfs2_num_free_extents(osb, et); 2703 num_free_extents = ocfs2_num_free_extents(et);
2704 if (num_free_extents < 0) { 2704 if (num_free_extents < 0) {
2705 ret = num_free_extents; 2705 ret = num_free_extents;
2706 mlog_errno(ret); 2706 mlog_errno(ret);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 83005f486451..3f936be379a9 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -2486,7 +2486,6 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
2486 if (dirty) { 2486 if (dirty) {
2487 /* Recovery will be completed after we've mounted the 2487 /* Recovery will be completed after we've mounted the
2488 * rest of the volume. */ 2488 * rest of the volume. */
2489 osb->dirty = 1;
2490 osb->local_alloc_copy = local_alloc; 2489 osb->local_alloc_copy = local_alloc;
2491 local_alloc = NULL; 2490 local_alloc = NULL;
2492 } 2491 }
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index f70c3778d600..5fdf269ba82e 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -6800,7 +6800,7 @@ static int ocfs2_lock_reflink_xattr_rec_allocators(
6800 *credits += 1; 6800 *credits += 1;
6801 6801
6802 /* count in the xattr tree change. */ 6802 /* count in the xattr tree change. */
6803 num_free_extents = ocfs2_num_free_extents(osb, xt_et); 6803 num_free_extents = ocfs2_num_free_extents(xt_et);
6804 if (num_free_extents < 0) { 6804 if (num_free_extents < 0) {
6805 ret = num_free_extents; 6805 ret = num_free_extents;
6806 mlog_errno(ret); 6806 mlog_errno(ret);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 98fd8f6df851..e5d89a0d0b8a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2931,6 +2931,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2931#ifdef CONFIG_PROC_PAGE_MONITOR 2931#ifdef CONFIG_PROC_PAGE_MONITOR
2932 REG("clear_refs", S_IWUSR, proc_clear_refs_operations), 2932 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
2933 REG("smaps", S_IRUGO, proc_pid_smaps_operations), 2933 REG("smaps", S_IRUGO, proc_pid_smaps_operations),
2934 REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
2934 REG("pagemap", S_IRUSR, proc_pagemap_operations), 2935 REG("pagemap", S_IRUSR, proc_pagemap_operations),
2935#endif 2936#endif
2936#ifdef CONFIG_SECURITY 2937#ifdef CONFIG_SECURITY
@@ -3324,6 +3325,7 @@ static const struct pid_entry tid_base_stuff[] = {
3324#ifdef CONFIG_PROC_PAGE_MONITOR 3325#ifdef CONFIG_PROC_PAGE_MONITOR
3325 REG("clear_refs", S_IWUSR, proc_clear_refs_operations), 3326 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
3326 REG("smaps", S_IRUGO, proc_tid_smaps_operations), 3327 REG("smaps", S_IRUGO, proc_tid_smaps_operations),
3328 REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
3327 REG("pagemap", S_IRUSR, proc_pagemap_operations), 3329 REG("pagemap", S_IRUSR, proc_pagemap_operations),
3328#endif 3330#endif
3329#ifdef CONFIG_SECURITY 3331#ifdef CONFIG_SECURITY
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index aa2b89071630..2cbfcd32e884 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -269,10 +269,12 @@ extern int proc_remount(struct super_block *, int *, char *);
269/* 269/*
270 * task_[no]mmu.c 270 * task_[no]mmu.c
271 */ 271 */
272struct mem_size_stats;
272struct proc_maps_private { 273struct proc_maps_private {
273 struct inode *inode; 274 struct inode *inode;
274 struct task_struct *task; 275 struct task_struct *task;
275 struct mm_struct *mm; 276 struct mm_struct *mm;
277 struct mem_size_stats *rollup;
276#ifdef CONFIG_MMU 278#ifdef CONFIG_MMU
277 struct vm_area_struct *tail_vma; 279 struct vm_area_struct *tail_vma;
278#endif 280#endif
@@ -288,6 +290,7 @@ extern const struct file_operations proc_tid_maps_operations;
288extern const struct file_operations proc_pid_numa_maps_operations; 290extern const struct file_operations proc_pid_numa_maps_operations;
289extern const struct file_operations proc_tid_numa_maps_operations; 291extern const struct file_operations proc_tid_numa_maps_operations;
290extern const struct file_operations proc_pid_smaps_operations; 292extern const struct file_operations proc_pid_smaps_operations;
293extern const struct file_operations proc_pid_smaps_rollup_operations;
291extern const struct file_operations proc_tid_smaps_operations; 294extern const struct file_operations proc_tid_smaps_operations;
292extern const struct file_operations proc_clear_refs_operations; 295extern const struct file_operations proc_clear_refs_operations;
293extern const struct file_operations proc_pagemap_operations; 296extern const struct file_operations proc_pagemap_operations;
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 509a61668d90..cdd979724c74 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -80,7 +80,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
80 show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]); 80 show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]);
81 show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]); 81 show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]);
82 show_val_kb(m, "Unevictable: ", pages[LRU_UNEVICTABLE]); 82 show_val_kb(m, "Unevictable: ", pages[LRU_UNEVICTABLE]);
83 show_val_kb(m, "Mlocked: ", global_page_state(NR_MLOCK)); 83 show_val_kb(m, "Mlocked: ", global_zone_page_state(NR_MLOCK));
84 84
85#ifdef CONFIG_HIGHMEM 85#ifdef CONFIG_HIGHMEM
86 show_val_kb(m, "HighTotal: ", i.totalhigh); 86 show_val_kb(m, "HighTotal: ", i.totalhigh);
@@ -114,9 +114,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
114 show_val_kb(m, "SUnreclaim: ", 114 show_val_kb(m, "SUnreclaim: ",
115 global_node_page_state(NR_SLAB_UNRECLAIMABLE)); 115 global_node_page_state(NR_SLAB_UNRECLAIMABLE));
116 seq_printf(m, "KernelStack: %8lu kB\n", 116 seq_printf(m, "KernelStack: %8lu kB\n",
117 global_page_state(NR_KERNEL_STACK_KB)); 117 global_zone_page_state(NR_KERNEL_STACK_KB));
118 show_val_kb(m, "PageTables: ", 118 show_val_kb(m, "PageTables: ",
119 global_page_state(NR_PAGETABLE)); 119 global_zone_page_state(NR_PAGETABLE));
120#ifdef CONFIG_QUICKLIST 120#ifdef CONFIG_QUICKLIST
121 show_val_kb(m, "Quicklists: ", quicklist_total_size()); 121 show_val_kb(m, "Quicklists: ", quicklist_total_size());
122#endif 122#endif
@@ -124,7 +124,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
124 show_val_kb(m, "NFS_Unstable: ", 124 show_val_kb(m, "NFS_Unstable: ",
125 global_node_page_state(NR_UNSTABLE_NFS)); 125 global_node_page_state(NR_UNSTABLE_NFS));
126 show_val_kb(m, "Bounce: ", 126 show_val_kb(m, "Bounce: ",
127 global_page_state(NR_BOUNCE)); 127 global_zone_page_state(NR_BOUNCE));
128 show_val_kb(m, "WritebackTmp: ", 128 show_val_kb(m, "WritebackTmp: ",
129 global_node_page_state(NR_WRITEBACK_TEMP)); 129 global_node_page_state(NR_WRITEBACK_TEMP));
130 show_val_kb(m, "CommitLimit: ", vm_commit_limit()); 130 show_val_kb(m, "CommitLimit: ", vm_commit_limit());
@@ -151,7 +151,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
151#ifdef CONFIG_CMA 151#ifdef CONFIG_CMA
152 show_val_kb(m, "CmaTotal: ", totalcma_pages); 152 show_val_kb(m, "CmaTotal: ", totalcma_pages);
153 show_val_kb(m, "CmaFree: ", 153 show_val_kb(m, "CmaFree: ",
154 global_page_state(NR_FREE_CMA_PAGES)); 154 global_zone_page_state(NR_FREE_CMA_PAGES));
155#endif 155#endif
156 156
157 hugetlb_report_meminfo(m); 157 hugetlb_report_meminfo(m);
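[Sketch, not part of the patch: the global_page_state() -> global_zone_page_state() rename makes the zone/node split explicit at call sites. Zone counters such as NR_MLOCK go through the zone helper, while node counters keep using global_node_page_state(); the pair below assumes the 4.14-era classification of these counters.]

	unsigned long nr_mlocked = global_zone_page_state(NR_MLOCK);		/* per-zone counter */
	unsigned long nr_wb_tmp  = global_node_page_state(NR_WRITEBACK_TEMP);	/* per-node counter */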
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index fe8f3265e877..a290966f91ec 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -253,6 +253,7 @@ static int proc_map_release(struct inode *inode, struct file *file)
253 if (priv->mm) 253 if (priv->mm)
254 mmdrop(priv->mm); 254 mmdrop(priv->mm);
255 255
256 kfree(priv->rollup);
256 return seq_release_private(inode, file); 257 return seq_release_private(inode, file);
257} 258}
258 259
@@ -279,6 +280,23 @@ static int is_stack(struct proc_maps_private *priv,
279 vma->vm_end >= vma->vm_mm->start_stack; 280 vma->vm_end >= vma->vm_mm->start_stack;
280} 281}
281 282
283static void show_vma_header_prefix(struct seq_file *m,
284 unsigned long start, unsigned long end,
285 vm_flags_t flags, unsigned long long pgoff,
286 dev_t dev, unsigned long ino)
287{
288 seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
289 seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
290 start,
291 end,
292 flags & VM_READ ? 'r' : '-',
293 flags & VM_WRITE ? 'w' : '-',
294 flags & VM_EXEC ? 'x' : '-',
295 flags & VM_MAYSHARE ? 's' : 'p',
296 pgoff,
297 MAJOR(dev), MINOR(dev), ino);
298}
299
282static void 300static void
283show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) 301show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
284{ 302{
@@ -301,17 +319,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
301 319
302 start = vma->vm_start; 320 start = vma->vm_start;
303 end = vma->vm_end; 321 end = vma->vm_end;
304 322 show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino);
305 seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
306 seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
307 start,
308 end,
309 flags & VM_READ ? 'r' : '-',
310 flags & VM_WRITE ? 'w' : '-',
311 flags & VM_EXEC ? 'x' : '-',
312 flags & VM_MAYSHARE ? 's' : 'p',
313 pgoff,
314 MAJOR(dev), MINOR(dev), ino);
315 323
316 /* 324 /*
317 * Print the dentry name for named mappings, and a 325 * Print the dentry name for named mappings, and a
@@ -430,6 +438,7 @@ const struct file_operations proc_tid_maps_operations = {
430 438
431#ifdef CONFIG_PROC_PAGE_MONITOR 439#ifdef CONFIG_PROC_PAGE_MONITOR
432struct mem_size_stats { 440struct mem_size_stats {
441 bool first;
433 unsigned long resident; 442 unsigned long resident;
434 unsigned long shared_clean; 443 unsigned long shared_clean;
435 unsigned long shared_dirty; 444 unsigned long shared_dirty;
@@ -443,7 +452,9 @@ struct mem_size_stats {
443 unsigned long swap; 452 unsigned long swap;
444 unsigned long shared_hugetlb; 453 unsigned long shared_hugetlb;
445 unsigned long private_hugetlb; 454 unsigned long private_hugetlb;
455 unsigned long first_vma_start;
446 u64 pss; 456 u64 pss;
457 u64 pss_locked;
447 u64 swap_pss; 458 u64 swap_pss;
448 bool check_shmem_swap; 459 bool check_shmem_swap;
449}; 460};
@@ -652,6 +663,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
652 [ilog2(VM_NORESERVE)] = "nr", 663 [ilog2(VM_NORESERVE)] = "nr",
653 [ilog2(VM_HUGETLB)] = "ht", 664 [ilog2(VM_HUGETLB)] = "ht",
654 [ilog2(VM_ARCH_1)] = "ar", 665 [ilog2(VM_ARCH_1)] = "ar",
666 [ilog2(VM_WIPEONFORK)] = "wf",
655 [ilog2(VM_DONTDUMP)] = "dd", 667 [ilog2(VM_DONTDUMP)] = "dd",
656#ifdef CONFIG_MEM_SOFT_DIRTY 668#ifdef CONFIG_MEM_SOFT_DIRTY
657 [ilog2(VM_SOFTDIRTY)] = "sd", 669 [ilog2(VM_SOFTDIRTY)] = "sd",
@@ -719,18 +731,36 @@ void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma)
719 731
720static int show_smap(struct seq_file *m, void *v, int is_pid) 732static int show_smap(struct seq_file *m, void *v, int is_pid)
721{ 733{
734 struct proc_maps_private *priv = m->private;
722 struct vm_area_struct *vma = v; 735 struct vm_area_struct *vma = v;
723 struct mem_size_stats mss; 736 struct mem_size_stats mss_stack;
737 struct mem_size_stats *mss;
724 struct mm_walk smaps_walk = { 738 struct mm_walk smaps_walk = {
725 .pmd_entry = smaps_pte_range, 739 .pmd_entry = smaps_pte_range,
726#ifdef CONFIG_HUGETLB_PAGE 740#ifdef CONFIG_HUGETLB_PAGE
727 .hugetlb_entry = smaps_hugetlb_range, 741 .hugetlb_entry = smaps_hugetlb_range,
728#endif 742#endif
729 .mm = vma->vm_mm, 743 .mm = vma->vm_mm,
730 .private = &mss,
731 }; 744 };
745 int ret = 0;
746 bool rollup_mode;
747 bool last_vma;
748
749 if (priv->rollup) {
750 rollup_mode = true;
751 mss = priv->rollup;
752 if (mss->first) {
753 mss->first_vma_start = vma->vm_start;
754 mss->first = false;
755 }
756 last_vma = !m_next_vma(priv, vma);
757 } else {
758 rollup_mode = false;
759 memset(&mss_stack, 0, sizeof(mss_stack));
760 mss = &mss_stack;
761 }
732 762
733 memset(&mss, 0, sizeof mss); 763 smaps_walk.private = mss;
734 764
735#ifdef CONFIG_SHMEM 765#ifdef CONFIG_SHMEM
736 if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { 766 if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
@@ -748,9 +778,9 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
748 778
749 if (!shmem_swapped || (vma->vm_flags & VM_SHARED) || 779 if (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
750 !(vma->vm_flags & VM_WRITE)) { 780 !(vma->vm_flags & VM_WRITE)) {
751 mss.swap = shmem_swapped; 781 mss->swap = shmem_swapped;
752 } else { 782 } else {
753 mss.check_shmem_swap = true; 783 mss->check_shmem_swap = true;
754 smaps_walk.pte_hole = smaps_pte_hole; 784 smaps_walk.pte_hole = smaps_pte_hole;
755 } 785 }
756 } 786 }
@@ -758,54 +788,71 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
758 788
759 /* mmap_sem is held in m_start */ 789 /* mmap_sem is held in m_start */
760 walk_page_vma(vma, &smaps_walk); 790 walk_page_vma(vma, &smaps_walk);
791 if (vma->vm_flags & VM_LOCKED)
792 mss->pss_locked += mss->pss;
793
794 if (!rollup_mode) {
795 show_map_vma(m, vma, is_pid);
796 } else if (last_vma) {
797 show_vma_header_prefix(
798 m, mss->first_vma_start, vma->vm_end, 0, 0, 0, 0);
799 seq_pad(m, ' ');
800 seq_puts(m, "[rollup]\n");
801 } else {
802 ret = SEQ_SKIP;
803 }
761 804
762 show_map_vma(m, vma, is_pid); 805 if (!rollup_mode)
763 806 seq_printf(m,
764 seq_printf(m, 807 "Size: %8lu kB\n"
765 "Size: %8lu kB\n" 808 "KernelPageSize: %8lu kB\n"
766 "Rss: %8lu kB\n" 809 "MMUPageSize: %8lu kB\n",
767 "Pss: %8lu kB\n" 810 (vma->vm_end - vma->vm_start) >> 10,
768 "Shared_Clean: %8lu kB\n" 811 vma_kernel_pagesize(vma) >> 10,
769 "Shared_Dirty: %8lu kB\n" 812 vma_mmu_pagesize(vma) >> 10);
770 "Private_Clean: %8lu kB\n" 813
771 "Private_Dirty: %8lu kB\n" 814
772 "Referenced: %8lu kB\n" 815 if (!rollup_mode || last_vma)
773 "Anonymous: %8lu kB\n" 816 seq_printf(m,
774 "LazyFree: %8lu kB\n" 817 "Rss: %8lu kB\n"
775 "AnonHugePages: %8lu kB\n" 818 "Pss: %8lu kB\n"
776 "ShmemPmdMapped: %8lu kB\n" 819 "Shared_Clean: %8lu kB\n"
777 "Shared_Hugetlb: %8lu kB\n" 820 "Shared_Dirty: %8lu kB\n"
778 "Private_Hugetlb: %7lu kB\n" 821 "Private_Clean: %8lu kB\n"
779 "Swap: %8lu kB\n" 822 "Private_Dirty: %8lu kB\n"
780 "SwapPss: %8lu kB\n" 823 "Referenced: %8lu kB\n"
781 "KernelPageSize: %8lu kB\n" 824 "Anonymous: %8lu kB\n"
782 "MMUPageSize: %8lu kB\n" 825 "LazyFree: %8lu kB\n"
783 "Locked: %8lu kB\n", 826 "AnonHugePages: %8lu kB\n"
784 (vma->vm_end - vma->vm_start) >> 10, 827 "ShmemPmdMapped: %8lu kB\n"
785 mss.resident >> 10, 828 "Shared_Hugetlb: %8lu kB\n"
786 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), 829 "Private_Hugetlb: %7lu kB\n"
787 mss.shared_clean >> 10, 830 "Swap: %8lu kB\n"
788 mss.shared_dirty >> 10, 831 "SwapPss: %8lu kB\n"
789 mss.private_clean >> 10, 832 "Locked: %8lu kB\n",
790 mss.private_dirty >> 10, 833 mss->resident >> 10,
791 mss.referenced >> 10, 834 (unsigned long)(mss->pss >> (10 + PSS_SHIFT)),
792 mss.anonymous >> 10, 835 mss->shared_clean >> 10,
793 mss.lazyfree >> 10, 836 mss->shared_dirty >> 10,
794 mss.anonymous_thp >> 10, 837 mss->private_clean >> 10,
795 mss.shmem_thp >> 10, 838 mss->private_dirty >> 10,
796 mss.shared_hugetlb >> 10, 839 mss->referenced >> 10,
797 mss.private_hugetlb >> 10, 840 mss->anonymous >> 10,
798 mss.swap >> 10, 841 mss->lazyfree >> 10,
799 (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)), 842 mss->anonymous_thp >> 10,
800 vma_kernel_pagesize(vma) >> 10, 843 mss->shmem_thp >> 10,
801 vma_mmu_pagesize(vma) >> 10, 844 mss->shared_hugetlb >> 10,
802 (vma->vm_flags & VM_LOCKED) ? 845 mss->private_hugetlb >> 10,
803 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); 846 mss->swap >> 10,
804 847 (unsigned long)(mss->swap_pss >> (10 + PSS_SHIFT)),
805 arch_show_smap(m, vma); 848 (unsigned long)(mss->pss >> (10 + PSS_SHIFT)));
806 show_smap_vma_flags(m, vma); 849
850 if (!rollup_mode) {
851 arch_show_smap(m, vma);
852 show_smap_vma_flags(m, vma);
853 }
807 m_cache_vma(m, vma); 854 m_cache_vma(m, vma);
808 return 0; 855 return ret;
809} 856}
810 857
811static int show_pid_smap(struct seq_file *m, void *v) 858static int show_pid_smap(struct seq_file *m, void *v)
@@ -837,6 +884,25 @@ static int pid_smaps_open(struct inode *inode, struct file *file)
837 return do_maps_open(inode, file, &proc_pid_smaps_op); 884 return do_maps_open(inode, file, &proc_pid_smaps_op);
838} 885}
839 886
887static int pid_smaps_rollup_open(struct inode *inode, struct file *file)
888{
889 struct seq_file *seq;
890 struct proc_maps_private *priv;
891 int ret = do_maps_open(inode, file, &proc_pid_smaps_op);
892
893 if (ret < 0)
894 return ret;
895 seq = file->private_data;
896 priv = seq->private;
897 priv->rollup = kzalloc(sizeof(*priv->rollup), GFP_KERNEL);
898 if (!priv->rollup) {
899 proc_map_release(inode, file);
900 return -ENOMEM;
901 }
902 priv->rollup->first = true;
903 return 0;
904}
905
840static int tid_smaps_open(struct inode *inode, struct file *file) 906static int tid_smaps_open(struct inode *inode, struct file *file)
841{ 907{
842 return do_maps_open(inode, file, &proc_tid_smaps_op); 908 return do_maps_open(inode, file, &proc_tid_smaps_op);
@@ -849,6 +915,13 @@ const struct file_operations proc_pid_smaps_operations = {
849 .release = proc_map_release, 915 .release = proc_map_release,
850}; 916};
851 917
918const struct file_operations proc_pid_smaps_rollup_operations = {
919 .open = pid_smaps_rollup_open,
920 .read = seq_read,
921 .llseek = seq_lseek,
922 .release = proc_map_release,
923};
924
852const struct file_operations proc_tid_smaps_operations = { 925const struct file_operations proc_tid_smaps_operations = {
853 .open = tid_smaps_open, 926 .open = tid_smaps_open,
854 .read = seq_read, 927 .read = seq_read,
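[Sketch, not part of the patch: /proc/<pid>/smaps_rollup reports the same fields as smaps but pre-summed over all VMAs, so consumers that only want totals avoid parsing one entry per mapping. A user-space example of reading the process-wide Pss; the parsing below is illustrative, not defined by the patch.]

	#include <stdio.h>
	#include <sys/types.h>

	/* Prints the process-wide Pss total from smaps_rollup, if the file exists. */
	static int print_pss_rollup(pid_t pid)
	{
		char path[64], line[256];
		unsigned long kb;
		FILE *f;

		snprintf(path, sizeof(path), "/proc/%d/smaps_rollup", (int)pid);
		f = fopen(path, "r");
		if (!f)
			return -1;	/* kernel without this patch, or no permission */
		while (fgets(line, sizeof(line), f)) {
			if (sscanf(line, "Pss: %lu kB", &kb) == 1)
				printf("Pss: %lu kB\n", kb);
		}
		fclose(f);
		return 0;
	}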
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 2ef7ce75c062..3ac1f2387083 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -228,7 +228,7 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
228 if (!pages) 228 if (!pages)
229 goto out_free; 229 goto out_free;
230 230
231 nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages); 231 nr = find_get_pages(inode->i_mapping, &pgoff, lpages, pages);
232 if (nr != lpages) 232 if (nr != lpages)
233 goto out_free_pages; /* leave if some pages were missing */ 233 goto out_free_pages; /* leave if some pages were missing */
234 234
diff --git a/fs/sync.c b/fs/sync.c
index 27d6b8bbcb6a..2e3fd7d94d2d 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -335,11 +335,6 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
335 goto out_put; 335 goto out_put;
336 336
337 mapping = f.file->f_mapping; 337 mapping = f.file->f_mapping;
338 if (!mapping) {
339 ret = -EINVAL;
340 goto out_put;
341 }
342
343 ret = 0; 338 ret = 0;
344 if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) { 339 if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
345 ret = file_fdatawait_range(f.file, offset, endbyte); 340 ret = file_fdatawait_range(f.file, offset, endbyte);
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 886085b47c75..5419e7da82ba 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -178,7 +178,8 @@ static inline void msg_init(struct uffd_msg *msg)
178 178
179static inline struct uffd_msg userfault_msg(unsigned long address, 179static inline struct uffd_msg userfault_msg(unsigned long address,
180 unsigned int flags, 180 unsigned int flags,
181 unsigned long reason) 181 unsigned long reason,
182 unsigned int features)
182{ 183{
183 struct uffd_msg msg; 184 struct uffd_msg msg;
184 msg_init(&msg); 185 msg_init(&msg);
@@ -202,6 +203,8 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
202 * write protect fault. 203 * write protect fault.
203 */ 204 */
204 msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP; 205 msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
206 if (features & UFFD_FEATURE_THREAD_ID)
207 msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
205 return msg; 208 return msg;
206} 209}
207 210
@@ -370,6 +373,9 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
370 VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP)); 373 VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP));
371 VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP)); 374 VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));
372 375
376 if (ctx->features & UFFD_FEATURE_SIGBUS)
377 goto out;
378
373 /* 379 /*
374 * If it's already released don't get it. This avoids to loop 380 * If it's already released don't get it. This avoids to loop
375 * in __get_user_pages if userfaultfd_release waits on the 381 * in __get_user_pages if userfaultfd_release waits on the
@@ -419,7 +425,8 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
419 425
420 init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); 426 init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
421 uwq.wq.private = current; 427 uwq.wq.private = current;
422 uwq.msg = userfault_msg(vmf->address, vmf->flags, reason); 428 uwq.msg = userfault_msg(vmf->address, vmf->flags, reason,
429 ctx->features);
423 uwq.ctx = ctx; 430 uwq.ctx = ctx;
424 uwq.waken = false; 431 uwq.waken = false;
425 432
@@ -1194,7 +1201,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
1194 struct uffdio_register __user *user_uffdio_register; 1201 struct uffdio_register __user *user_uffdio_register;
1195 unsigned long vm_flags, new_flags; 1202 unsigned long vm_flags, new_flags;
1196 bool found; 1203 bool found;
1197 bool non_anon_pages; 1204 bool basic_ioctls;
1198 unsigned long start, end, vma_end; 1205 unsigned long start, end, vma_end;
1199 1206
1200 user_uffdio_register = (struct uffdio_register __user *) arg; 1207 user_uffdio_register = (struct uffdio_register __user *) arg;
@@ -1260,7 +1267,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
1260 * Search for not compatible vmas. 1267 * Search for not compatible vmas.
1261 */ 1268 */
1262 found = false; 1269 found = false;
1263 non_anon_pages = false; 1270 basic_ioctls = false;
1264 for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { 1271 for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
1265 cond_resched(); 1272 cond_resched();
1266 1273
@@ -1299,8 +1306,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
1299 /* 1306 /*
1300 * Note vmas containing huge pages 1307 * Note vmas containing huge pages
1301 */ 1308 */
1302 if (is_vm_hugetlb_page(cur) || vma_is_shmem(cur)) 1309 if (is_vm_hugetlb_page(cur))
1303 non_anon_pages = true; 1310 basic_ioctls = true;
1304 1311
1305 found = true; 1312 found = true;
1306 } 1313 }
@@ -1371,7 +1378,7 @@ out_unlock:
1371 * userland which ioctls methods are guaranteed to 1378 * userland which ioctls methods are guaranteed to
1372 * succeed on this range. 1379 * succeed on this range.
1373 */ 1380 */
1374 if (put_user(non_anon_pages ? UFFD_API_RANGE_IOCTLS_BASIC : 1381 if (put_user(basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
1375 UFFD_API_RANGE_IOCTLS, 1382 UFFD_API_RANGE_IOCTLS,
1376 &user_uffdio_register->ioctls)) 1383 &user_uffdio_register->ioctls))
1377 ret = -EFAULT; 1384 ret = -EFAULT;
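[Sketch, not part of the patch: both userfaultfd changes are opt-in at UFFDIO_API time. UFFD_FEATURE_THREAD_ID adds the faulting thread's pid to fault messages (msg.arg.pagefault.feat.ptid, as set above), and UFFD_FEATURE_SIGBUS makes missing-page faults in registered ranges raise SIGBUS instead of queueing an event. The handshake below assumes these feature bits are exported by the uapi header in this series.]

	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <linux/userfaultfd.h>

	/* Returns a userfaultfd with the requested features enabled, or -1 on
	 * failure (e.g. a kernel that does not know the requested bits). */
	static int uffd_open(unsigned long long features)
	{
		struct uffdio_api api = { .api = UFFD_API, .features = features };
		int fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

		if (fd < 0)
			return -1;
		if (ioctl(fd, UFFDIO_API, &api)) {
			close(fd);
			return -1;
		}
		return fd;
	}

	/* Usage: int fd = uffd_open(UFFD_FEATURE_THREAD_ID | UFFD_FEATURE_SIGBUS); */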
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 0debbc7e3f03..ec3e44fcf771 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1101,7 +1101,7 @@ xfs_filemap_pfn_mkwrite(
1101 if (vmf->pgoff >= size) 1101 if (vmf->pgoff >= size)
1102 ret = VM_FAULT_SIGBUS; 1102 ret = VM_FAULT_SIGBUS;
1103 else if (IS_DAX(inode)) 1103 else if (IS_DAX(inode))
1104 ret = dax_pfn_mkwrite(vmf); 1104 ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
1105 xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); 1105 xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
1106 sb_end_pagefault(inode->i_sb); 1106 sb_end_pagefault(inode->i_sb);
1107 return ret; 1107 return ret;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 7b1cf4ba0902..1f0720de8990 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -38,7 +38,15 @@
38#define BIO_BUG_ON 38#define BIO_BUG_ON
39#endif 39#endif
40 40
41#ifdef CONFIG_THP_SWAP
42#if HPAGE_PMD_NR > 256
43#define BIO_MAX_PAGES HPAGE_PMD_NR
44#else
41#define BIO_MAX_PAGES 256 45#define BIO_MAX_PAGES 256
46#endif
47#else
48#define BIO_MAX_PAGES 256
49#endif
42 50
43#define bio_prio(bio) (bio)->bi_ioprio 51#define bio_prio(bio) (bio)->bi_ioprio
44#define bio_set_prio(bio, prio) ((bio)->bi_ioprio = prio) 52#define bio_set_prio(bio, prio) ((bio)->bi_ioprio = prio)
diff --git a/include/linux/dax.h b/include/linux/dax.h
index df97b7af7e2c..eb0bff6f1eab 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -89,34 +89,6 @@ void dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
89void dax_write_cache(struct dax_device *dax_dev, bool wc); 89void dax_write_cache(struct dax_device *dax_dev, bool wc);
90bool dax_write_cache_enabled(struct dax_device *dax_dev); 90bool dax_write_cache_enabled(struct dax_device *dax_dev);
91 91
92/*
93 * We use lowest available bit in exceptional entry for locking, one bit for
94 * the entry size (PMD) and two more to tell us if the entry is a huge zero
95 * page (HZP) or an empty entry that is just used for locking. In total four
96 * special bits.
97 *
98 * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the HZP and
99 * EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
100 * block allocation.
101 */
102#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
103#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
104#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
105#define RADIX_DAX_HZP (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
106#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
107
108static inline unsigned long dax_radix_sector(void *entry)
109{
110 return (unsigned long)entry >> RADIX_DAX_SHIFT;
111}
112
113static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
114{
115 return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
116 ((unsigned long)sector << RADIX_DAX_SHIFT) |
117 RADIX_DAX_ENTRY_LOCK);
118}
119
120ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, 92ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
121 const struct iomap_ops *ops); 93 const struct iomap_ops *ops);
122int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, 94int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
@@ -124,8 +96,6 @@ int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
124int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); 96int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
125int dax_invalidate_mapping_entry_sync(struct address_space *mapping, 97int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
126 pgoff_t index); 98 pgoff_t index);
127void dax_wake_mapping_entry_waiter(struct address_space *mapping,
128 pgoff_t index, void *entry, bool wake_all);
129 99
130#ifdef CONFIG_FS_DAX 100#ifdef CONFIG_FS_DAX
131int __dax_zero_page_range(struct block_device *bdev, 101int __dax_zero_page_range(struct block_device *bdev,
@@ -140,21 +110,6 @@ static inline int __dax_zero_page_range(struct block_device *bdev,
140} 110}
141#endif 111#endif
142 112
143#ifdef CONFIG_FS_DAX_PMD
144static inline unsigned int dax_radix_order(void *entry)
145{
146 if ((unsigned long)entry & RADIX_DAX_PMD)
147 return PMD_SHIFT - PAGE_SHIFT;
148 return 0;
149}
150#else
151static inline unsigned int dax_radix_order(void *entry)
152{
153 return 0;
154}
155#endif
156int dax_pfn_mkwrite(struct vm_fault *vmf);
157
158static inline bool dax_mapping(struct address_space *mapping) 113static inline bool dax_mapping(struct address_space *mapping)
159{ 114{
160 return mapping->host && IS_DAX(mapping->host); 115 return mapping->host && IS_DAX(mapping->host);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5b744a3456c5..c57002ae6520 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1269,8 +1269,6 @@ extern void f_delown(struct file *filp);
1269extern pid_t f_getown(struct file *filp); 1269extern pid_t f_getown(struct file *filp);
1270extern int send_sigurg(struct fown_struct *fown); 1270extern int send_sigurg(struct fown_struct *fown);
1271 1271
1272struct mm_struct;
1273
1274/* 1272/*
1275 * Umount options 1273 * Umount options
1276 */ 1274 */
diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index 115bb81912cc..f4ff47d4a893 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -143,15 +143,6 @@ struct fscache_cookie_def {
143 void (*mark_page_cached)(void *cookie_netfs_data, 143 void (*mark_page_cached)(void *cookie_netfs_data,
144 struct address_space *mapping, 144 struct address_space *mapping,
145 struct page *page); 145 struct page *page);
146
147 /* indicate the cookie is no longer cached
148 * - this function is called when the backing store currently caching
149 * a cookie is removed
150 * - the netfs should use this to clean up any markers indicating
151 * cached pages
152 * - this is mandatory for any object that may have data
153 */
154 void (*now_uncached)(void *cookie_netfs_data);
155}; 146};
156 147
157/* 148/*
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 9b15a4bcfa77..69966c461d1c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -488,8 +488,9 @@ struct mem_cgroup *lock_page_memcg(struct page *page);
488void __unlock_page_memcg(struct mem_cgroup *memcg); 488void __unlock_page_memcg(struct mem_cgroup *memcg);
489void unlock_page_memcg(struct page *page); 489void unlock_page_memcg(struct page *page);
490 490
491/* idx can be of type enum memcg_stat_item or node_stat_item */
491static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, 492static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
492 enum memcg_stat_item idx) 493 int idx)
493{ 494{
494 long val = 0; 495 long val = 0;
495 int cpu; 496 int cpu;
@@ -503,15 +504,17 @@ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
503 return val; 504 return val;
504} 505}
505 506
507/* idx can be of type enum memcg_stat_item or node_stat_item */
506static inline void __mod_memcg_state(struct mem_cgroup *memcg, 508static inline void __mod_memcg_state(struct mem_cgroup *memcg,
507 enum memcg_stat_item idx, int val) 509 int idx, int val)
508{ 510{
509 if (!mem_cgroup_disabled()) 511 if (!mem_cgroup_disabled())
510 __this_cpu_add(memcg->stat->count[idx], val); 512 __this_cpu_add(memcg->stat->count[idx], val);
511} 513}
512 514
515/* idx can be of type enum memcg_stat_item or node_stat_item */
513static inline void mod_memcg_state(struct mem_cgroup *memcg, 516static inline void mod_memcg_state(struct mem_cgroup *memcg,
514 enum memcg_stat_item idx, int val) 517 int idx, int val)
515{ 518{
516 if (!mem_cgroup_disabled()) 519 if (!mem_cgroup_disabled())
517 this_cpu_add(memcg->stat->count[idx], val); 520 this_cpu_add(memcg->stat->count[idx], val);
@@ -535,14 +538,14 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg,
535 * Kernel pages are an exception to this, since they'll never move. 538 * Kernel pages are an exception to this, since they'll never move.
536 */ 539 */
537static inline void __mod_memcg_page_state(struct page *page, 540static inline void __mod_memcg_page_state(struct page *page,
538 enum memcg_stat_item idx, int val) 541 int idx, int val)
539{ 542{
540 if (page->mem_cgroup) 543 if (page->mem_cgroup)
541 __mod_memcg_state(page->mem_cgroup, idx, val); 544 __mod_memcg_state(page->mem_cgroup, idx, val);
542} 545}
543 546
544static inline void mod_memcg_page_state(struct page *page, 547static inline void mod_memcg_page_state(struct page *page,
545 enum memcg_stat_item idx, int val) 548 int idx, int val)
546{ 549{
547 if (page->mem_cgroup) 550 if (page->mem_cgroup)
548 mod_memcg_state(page->mem_cgroup, idx, val); 551 mod_memcg_state(page->mem_cgroup, idx, val);
@@ -632,8 +635,9 @@ static inline void count_memcg_events(struct mem_cgroup *memcg,
632 this_cpu_add(memcg->stat->events[idx], count); 635 this_cpu_add(memcg->stat->events[idx], count);
633} 636}
634 637
638/* idx can be of type enum memcg_stat_item or node_stat_item */
635static inline void count_memcg_page_event(struct page *page, 639static inline void count_memcg_page_event(struct page *page,
636 enum memcg_stat_item idx) 640 int idx)
637{ 641{
638 if (page->mem_cgroup) 642 if (page->mem_cgroup)
639 count_memcg_events(page->mem_cgroup, idx, 1); 643 count_memcg_events(page->mem_cgroup, idx, 1);
@@ -846,31 +850,31 @@ static inline bool mem_cgroup_oom_synchronize(bool wait)
846} 850}
847 851
848static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, 852static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
849 enum memcg_stat_item idx) 853 int idx)
850{ 854{
851 return 0; 855 return 0;
852} 856}
853 857
854static inline void __mod_memcg_state(struct mem_cgroup *memcg, 858static inline void __mod_memcg_state(struct mem_cgroup *memcg,
855 enum memcg_stat_item idx, 859 int idx,
856 int nr) 860 int nr)
857{ 861{
858} 862}
859 863
860static inline void mod_memcg_state(struct mem_cgroup *memcg, 864static inline void mod_memcg_state(struct mem_cgroup *memcg,
861 enum memcg_stat_item idx, 865 int idx,
862 int nr) 866 int nr)
863{ 867{
864} 868}
865 869
866static inline void __mod_memcg_page_state(struct page *page, 870static inline void __mod_memcg_page_state(struct page *page,
867 enum memcg_stat_item idx, 871 int idx,
868 int nr) 872 int nr)
869{ 873{
870} 874}
871 875
872static inline void mod_memcg_page_state(struct page *page, 876static inline void mod_memcg_page_state(struct page *page,
873 enum memcg_stat_item idx, 877 int idx,
874 int nr) 878 int nr)
875{ 879{
876} 880}
@@ -924,7 +928,7 @@ static inline void count_memcg_events(struct mem_cgroup *memcg,
924} 928}
925 929
926static inline void count_memcg_page_event(struct page *page, 930static inline void count_memcg_page_event(struct page *page,
927 enum memcg_stat_item idx) 931 int idx)
928{ 932{
929} 933}
930 934
@@ -934,26 +938,30 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
934} 938}
935#endif /* CONFIG_MEMCG */ 939#endif /* CONFIG_MEMCG */
936 940
941/* idx can be of type enum memcg_stat_item or node_stat_item */
937static inline void __inc_memcg_state(struct mem_cgroup *memcg, 942static inline void __inc_memcg_state(struct mem_cgroup *memcg,
938 enum memcg_stat_item idx) 943 int idx)
939{ 944{
940 __mod_memcg_state(memcg, idx, 1); 945 __mod_memcg_state(memcg, idx, 1);
941} 946}
942 947
948/* idx can be of type enum memcg_stat_item or node_stat_item */
943static inline void __dec_memcg_state(struct mem_cgroup *memcg, 949static inline void __dec_memcg_state(struct mem_cgroup *memcg,
944 enum memcg_stat_item idx) 950 int idx)
945{ 951{
946 __mod_memcg_state(memcg, idx, -1); 952 __mod_memcg_state(memcg, idx, -1);
947} 953}
948 954
955/* idx can be of type enum memcg_stat_item or node_stat_item */
949static inline void __inc_memcg_page_state(struct page *page, 956static inline void __inc_memcg_page_state(struct page *page,
950 enum memcg_stat_item idx) 957 int idx)
951{ 958{
952 __mod_memcg_page_state(page, idx, 1); 959 __mod_memcg_page_state(page, idx, 1);
953} 960}
954 961
962/* idx can be of type enum memcg_stat_item or node_stat_item */
955static inline void __dec_memcg_page_state(struct page *page, 963static inline void __dec_memcg_page_state(struct page *page,
956 enum memcg_stat_item idx) 964 int idx)
957{ 965{
958 __mod_memcg_page_state(page, idx, -1); 966 __mod_memcg_page_state(page, idx, -1);
959} 967}
@@ -982,26 +990,30 @@ static inline void __dec_lruvec_page_state(struct page *page,
982 __mod_lruvec_page_state(page, idx, -1); 990 __mod_lruvec_page_state(page, idx, -1);
983} 991}
984 992
993/* idx can be of type enum memcg_stat_item or node_stat_item */
985static inline void inc_memcg_state(struct mem_cgroup *memcg, 994static inline void inc_memcg_state(struct mem_cgroup *memcg,
986 enum memcg_stat_item idx) 995 int idx)
987{ 996{
988 mod_memcg_state(memcg, idx, 1); 997 mod_memcg_state(memcg, idx, 1);
989} 998}
990 999
1000/* idx can be of type enum memcg_stat_item or node_stat_item */
991static inline void dec_memcg_state(struct mem_cgroup *memcg, 1001static inline void dec_memcg_state(struct mem_cgroup *memcg,
992 enum memcg_stat_item idx) 1002 int idx)
993{ 1003{
994 mod_memcg_state(memcg, idx, -1); 1004 mod_memcg_state(memcg, idx, -1);
995} 1005}
996 1006
1007/* idx can be of type enum memcg_stat_item or node_stat_item */
997static inline void inc_memcg_page_state(struct page *page, 1008static inline void inc_memcg_page_state(struct page *page,
998 enum memcg_stat_item idx) 1009 int idx)
999{ 1010{
1000 mod_memcg_page_state(page, idx, 1); 1011 mod_memcg_page_state(page, idx, 1);
1001} 1012}
1002 1013
1014/* idx can be of type enum memcg_stat_item or node_stat_item */
1003static inline void dec_memcg_page_state(struct page *page, 1015static inline void dec_memcg_page_state(struct page *page,
1004 enum memcg_stat_item idx) 1016 int idx)
1005{ 1017{
1006 mod_memcg_page_state(page, idx, -1); 1018 mod_memcg_page_state(page, idx, -1);
1007} 1019}
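Since idx is now a plain int, the same memcg accessors accept indices from either statistics namespace. For illustration only (both items already exist elsewhere in the kernel; the surrounding memcg/page context is assumed):

    /* enum memcg_stat_item index */
    mod_memcg_state(memcg, MEMCG_SOCK, 1);
    /* enum node_stat_item index, via the page-based helper */
    mod_memcg_page_state(page, NR_FILE_MAPPED, 1);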
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index c8a5056a5ae0..5e6e4cc36ff4 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -319,6 +319,6 @@ extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
319 unsigned long pnum); 319 unsigned long pnum);
320extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, 320extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages,
321 int online_type); 321 int online_type);
322extern struct zone *default_zone_for_pfn(int nid, unsigned long pfn, 322extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
323 unsigned long nr_pages); 323 unsigned long nr_pages);
324#endif /* __LINUX_MEMORY_HOTPLUG_H */ 324#endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c1f6c95f3496..39db8e54c5d5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -189,7 +189,7 @@ extern unsigned int kobjsize(const void *objp);
189#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ 189#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */
190#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ 190#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
191#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ 191#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */
192#define VM_ARCH_2 0x02000000 192#define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. */
193#define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ 193#define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */
194 194
195#ifdef CONFIG_MEM_SOFT_DIRTY 195#ifdef CONFIG_MEM_SOFT_DIRTY
@@ -208,10 +208,12 @@ extern unsigned int kobjsize(const void *objp);
208#define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */ 208#define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */
209#define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */ 209#define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */
210#define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */ 210#define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */
211#define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */
211#define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0) 212#define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0)
212#define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1) 213#define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1)
213#define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) 214#define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2)
214#define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) 215#define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3)
216#define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4)
215#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ 217#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
216 218
217#if defined(CONFIG_X86) 219#if defined(CONFIG_X86)
@@ -235,9 +237,11 @@ extern unsigned int kobjsize(const void *objp);
235# define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */ 237# define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */
236#endif 238#endif
237 239
238#if defined(CONFIG_X86) 240#if defined(CONFIG_X86_INTEL_MPX)
239/* MPX specific bounds table or bounds directory */ 241/* MPX specific bounds table or bounds directory */
240# define VM_MPX VM_ARCH_2 242# define VM_MPX VM_HIGH_ARCH_BIT_4
243#else
244# define VM_MPX VM_NONE
241#endif 245#endif
242 246
243#ifndef VM_GROWSUP 247#ifndef VM_GROWSUP
@@ -2294,6 +2298,8 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
2294 unsigned long pfn, pgprot_t pgprot); 2298 unsigned long pfn, pgprot_t pgprot);
2295int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, 2299int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
2296 pfn_t pfn); 2300 pfn_t pfn);
2301int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr,
2302 pfn_t pfn);
2297int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); 2303int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);
2298 2304
2299 2305
@@ -2506,7 +2512,7 @@ enum mf_action_page_type {
2506 2512
2507#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) 2513#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
2508extern void clear_huge_page(struct page *page, 2514extern void clear_huge_page(struct page *page,
2509 unsigned long addr, 2515 unsigned long addr_hint,
2510 unsigned int pages_per_huge_page); 2516 unsigned int pages_per_huge_page);
2511extern void copy_user_huge_page(struct page *dst, struct page *src, 2517extern void copy_user_huge_page(struct page *dst, struct page *src,
2512 unsigned long addr, struct vm_area_struct *vma, 2518 unsigned long addr, struct vm_area_struct *vma,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 57378c7cb5f8..f45ad815b7d7 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -335,6 +335,7 @@ struct vm_area_struct {
335 struct file * vm_file; /* File we map to (can be NULL). */ 335 struct file * vm_file; /* File we map to (can be NULL). */
336 void * vm_private_data; /* was vm_pte (shared mem) */ 336 void * vm_private_data; /* was vm_pte (shared mem) */
337 337
338 atomic_long_t swap_readahead_info;
338#ifndef CONFIG_MMU 339#ifndef CONFIG_MMU
339 struct vm_region *vm_region; /* NOMMU mapping region */ 340 struct vm_region *vm_region; /* NOMMU mapping region */
340#endif 341#endif
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fc14b8b3f6ce..e7e92c8f4883 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -770,8 +770,7 @@ static inline bool is_dev_zone(const struct zone *zone)
770 770
771#include <linux/memory_hotplug.h> 771#include <linux/memory_hotplug.h>
772 772
773extern struct mutex zonelists_mutex; 773void build_all_zonelists(pg_data_t *pgdat);
774void build_all_zonelists(pg_data_t *pgdat, struct zone *zone);
775void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); 774void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
776bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, 775bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
777 int classzone_idx, unsigned int alloc_flags, 776 int classzone_idx, unsigned int alloc_flags,
@@ -896,7 +895,7 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
896extern int numa_zonelist_order_handler(struct ctl_table *, int, 895extern int numa_zonelist_order_handler(struct ctl_table *, int,
897 void __user *, size_t *, loff_t *); 896 void __user *, size_t *, loff_t *);
898extern char numa_zonelist_order[]; 897extern char numa_zonelist_order[];
899#define NUMA_ZONELIST_ORDER_LEN 16 /* string buffer size */ 898#define NUMA_ZONELIST_ORDER_LEN 16
900 899
901#ifndef CONFIG_NEED_MULTIPLE_NODES 900#ifndef CONFIG_NEED_MULTIPLE_NODES
902 901
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index d33e3280c8ad..ba2d470d2d0a 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -303,8 +303,8 @@ PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
303 * Only test-and-set exist for PG_writeback. The unconditional operators are 303 * Only test-and-set exist for PG_writeback. The unconditional operators are
304 * risky: they bypass page accounting. 304 * risky: they bypass page accounting.
305 */ 305 */
306TESTPAGEFLAG(Writeback, writeback, PF_NO_COMPOUND) 306TESTPAGEFLAG(Writeback, writeback, PF_NO_TAIL)
307 TESTSCFLAG(Writeback, writeback, PF_NO_COMPOUND) 307 TESTSCFLAG(Writeback, writeback, PF_NO_TAIL)
308PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL) 308PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL)
309 309
310/* PG_readahead is only used for reads; PG_reclaim is only for writes */ 310/* PG_readahead is only used for reads; PG_reclaim is only for writes */
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 79b36f57c3ba..5bbd6780f205 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -353,8 +353,16 @@ struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset);
353unsigned find_get_entries(struct address_space *mapping, pgoff_t start, 353unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
354 unsigned int nr_entries, struct page **entries, 354 unsigned int nr_entries, struct page **entries,
355 pgoff_t *indices); 355 pgoff_t *indices);
356unsigned find_get_pages(struct address_space *mapping, pgoff_t start, 356unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
357 unsigned int nr_pages, struct page **pages); 357 pgoff_t end, unsigned int nr_pages,
358 struct page **pages);
359static inline unsigned find_get_pages(struct address_space *mapping,
360 pgoff_t *start, unsigned int nr_pages,
361 struct page **pages)
362{
363 return find_get_pages_range(mapping, start, (pgoff_t)-1, nr_pages,
364 pages);
365}
358unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, 366unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
359 unsigned int nr_pages, struct page **pages); 367 unsigned int nr_pages, struct page **pages);
360unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, 368unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index b45d391b4540..4dcd5506f1ed 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -27,8 +27,16 @@ unsigned pagevec_lookup_entries(struct pagevec *pvec,
27 pgoff_t start, unsigned nr_entries, 27 pgoff_t start, unsigned nr_entries,
28 pgoff_t *indices); 28 pgoff_t *indices);
29void pagevec_remove_exceptionals(struct pagevec *pvec); 29void pagevec_remove_exceptionals(struct pagevec *pvec);
30unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, 30unsigned pagevec_lookup_range(struct pagevec *pvec,
31 pgoff_t start, unsigned nr_pages); 31 struct address_space *mapping,
32 pgoff_t *start, pgoff_t end);
33static inline unsigned pagevec_lookup(struct pagevec *pvec,
34 struct address_space *mapping,
35 pgoff_t *start)
36{
37 return pagevec_lookup_range(pvec, mapping, start, (pgoff_t)-1);
38}
39
32unsigned pagevec_lookup_tag(struct pagevec *pvec, 40unsigned pagevec_lookup_tag(struct pagevec *pvec,
33 struct address_space *mapping, pgoff_t *index, int tag, 41 struct address_space *mapping, pgoff_t *index, int tag,
34 unsigned nr_pages); 42 unsigned nr_pages);
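pagevec_lookup() is now a wrapper around pagevec_lookup_range() that scans to the end of the mapping and advances *start past the last page returned, so a range walk becomes a loop that stops when the pagevec comes back empty. A hedged sketch of such a caller ('mapping', 'end' and process_page() are placeholders, not part of this patch):

    struct pagevec pvec;
    pgoff_t index = 0;
    int i;

    pagevec_init(&pvec, 0);
    while (pagevec_lookup_range(&pvec, mapping, &index, end)) {
            for (i = 0; i < pagevec_count(&pvec); i++)
                    process_page(pvec.pages[i]);
            pagevec_release(&pvec);
    }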
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 2b0a281f9d26..3a19c253bdb1 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -84,12 +84,6 @@ static inline bool mmget_not_zero(struct mm_struct *mm)
84 84
85/* mmput gets rid of the mappings and all user-space */ 85/* mmput gets rid of the mappings and all user-space */
86extern void mmput(struct mm_struct *); 86extern void mmput(struct mm_struct *);
87#ifdef CONFIG_MMU
88/* same as above but performs the slow path from the async context. Can
89 * be called from the atomic context as well
90 */
91extern void mmput_async(struct mm_struct *);
92#endif
93 87
94/* Grab a reference to a task's mm, if it is not already going away */ 88/* Grab a reference to a task's mm, if it is not already going away */
95extern struct mm_struct *get_task_mm(struct task_struct *task); 89extern struct mm_struct *get_task_mm(struct task_struct *task);
diff --git a/include/linux/shm.h b/include/linux/shm.h
index 0fb7061ec54c..21a5e6c43385 100644
--- a/include/linux/shm.h
+++ b/include/linux/shm.h
@@ -27,23 +27,6 @@ struct shmid_kernel /* private to the kernel */
27/* shm_mode upper byte flags */ 27/* shm_mode upper byte flags */
28#define SHM_DEST 01000 /* segment will be destroyed on last detach */ 28#define SHM_DEST 01000 /* segment will be destroyed on last detach */
29#define SHM_LOCKED 02000 /* segment will not be swapped */ 29#define SHM_LOCKED 02000 /* segment will not be swapped */
30#define SHM_HUGETLB 04000 /* segment will use huge TLB pages */
31#define SHM_NORESERVE 010000 /* don't check for reservations */
32
33/* Bits [26:31] are reserved */
34
35/*
36 * When SHM_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
37 * This gives us 6 bits, which is enough until someone invents 128 bit address
38 * spaces.
39 *
40 * Assume these are all power of twos.
41 * When 0 use the default page size.
42 */
43#define SHM_HUGE_SHIFT 26
44#define SHM_HUGE_MASK 0x3f
45#define SHM_HUGE_2MB (21 << SHM_HUGE_SHIFT)
46#define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT)
47 30
48#ifdef CONFIG_SYSVIPC 31#ifdef CONFIG_SYSVIPC
49struct sysv_shm { 32struct sysv_shm {
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index a7d6bd2a918f..b6c3540e07bc 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -137,9 +137,15 @@ extern int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
137 unsigned long dst_addr, 137 unsigned long dst_addr,
138 unsigned long src_addr, 138 unsigned long src_addr,
139 struct page **pagep); 139 struct page **pagep);
140extern int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
141 pmd_t *dst_pmd,
142 struct vm_area_struct *dst_vma,
143 unsigned long dst_addr);
140#else 144#else
141#define shmem_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \ 145#define shmem_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \
142 src_addr, pagep) ({ BUG(); 0; }) 146 src_addr, pagep) ({ BUG(); 0; })
147#define shmem_mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, \
148 dst_addr) ({ BUG(); 0; })
143#endif 149#endif
144 150
145#endif 151#endif
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 4fcacd915d45..51d189615bda 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -18,6 +18,13 @@ struct shrink_control {
18 */ 18 */
19 unsigned long nr_to_scan; 19 unsigned long nr_to_scan;
20 20
21 /*
22 * How many objects did scan_objects process?
23 * This defaults to nr_to_scan before every call, but the callee
24 * should track its actual progress.
25 */
26 unsigned long nr_scanned;
27
21 /* current node being shrunk (for NUMA aware shrinkers) */ 28 /* current node being shrunk (for NUMA aware shrinkers) */
22 int nid; 29 int nid;
23 30
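The new nr_scanned field lets a shrinker report how much work scan_objects actually did; it is pre-seeded with nr_to_scan, so only shrinkers whose progress differs need to set it. A minimal sketch of a scan callback that updates it (my_cache_empty() and my_cache_evict_one() are hypothetical helpers):

    static unsigned long my_cache_scan(struct shrinker *shrink,
                                       struct shrink_control *sc)
    {
            unsigned long freed = 0, scanned = 0;

            while (scanned < sc->nr_to_scan && !my_cache_empty()) {
                    scanned++;
                    freed += my_cache_evict_one();
            }
            sc->nr_scanned = scanned;       /* report actual progress */
            return freed;
    }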
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index cc0faf3a90be..0783b622311e 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -115,6 +115,10 @@ struct kmem_cache {
115#endif 115#endif
116#endif 116#endif
117 117
118#ifdef CONFIG_SLAB_FREELIST_HARDENED
119 unsigned long random;
120#endif
121
118#ifdef CONFIG_NUMA 122#ifdef CONFIG_NUMA
119 /* 123 /*
120 * Defragmentation by allocating from a remote node. 124 * Defragmentation by allocating from a remote node.
diff --git a/include/linux/swap.h b/include/linux/swap.h
index d83d28e53e62..8bf3487fb204 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -188,6 +188,7 @@ struct swap_cluster_info {
188}; 188};
189#define CLUSTER_FLAG_FREE 1 /* This cluster is free */ 189#define CLUSTER_FLAG_FREE 1 /* This cluster is free */
190#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ 190#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */
191#define CLUSTER_FLAG_HUGE 4 /* This cluster is backing a transparent huge page */
191 192
192/* 193/*
193 * We assign a cluster to each CPU, so each CPU can allocate swap entry from 194 * We assign a cluster to each CPU, so each CPU can allocate swap entry from
@@ -211,7 +212,7 @@ struct swap_info_struct {
211 unsigned long flags; /* SWP_USED etc: see above */ 212 unsigned long flags; /* SWP_USED etc: see above */
212 signed short prio; /* swap priority of this type */ 213 signed short prio; /* swap priority of this type */
213 struct plist_node list; /* entry in swap_active_head */ 214 struct plist_node list; /* entry in swap_active_head */
214 struct plist_node avail_list; /* entry in swap_avail_head */ 215 struct plist_node avail_lists[MAX_NUMNODES];/* entry in swap_avail_heads */
215 signed char type; /* strange name for an index */ 216 signed char type; /* strange name for an index */
216 unsigned int max; /* extent of the swap_map */ 217 unsigned int max; /* extent of the swap_map */
217 unsigned char *swap_map; /* vmalloc'ed array of usage counts */ 218 unsigned char *swap_map; /* vmalloc'ed array of usage counts */
@@ -250,6 +251,25 @@ struct swap_info_struct {
250 struct swap_cluster_list discard_clusters; /* discard clusters list */ 251 struct swap_cluster_list discard_clusters; /* discard clusters list */
251}; 252};
252 253
254#ifdef CONFIG_64BIT
255#define SWAP_RA_ORDER_CEILING 5
256#else
257/* Avoid stack overflow, because we need to save part of page table */
258#define SWAP_RA_ORDER_CEILING 3
259#define SWAP_RA_PTE_CACHE_SIZE (1 << SWAP_RA_ORDER_CEILING)
260#endif
261
262struct vma_swap_readahead {
263 unsigned short win;
264 unsigned short offset;
265 unsigned short nr_pte;
266#ifdef CONFIG_64BIT
267 pte_t *ptes;
268#else
269 pte_t ptes[SWAP_RA_PTE_CACHE_SIZE];
270#endif
271};
272
253/* linux/mm/workingset.c */ 273/* linux/mm/workingset.c */
254void *workingset_eviction(struct address_space *mapping, struct page *page); 274void *workingset_eviction(struct address_space *mapping, struct page *page);
255bool workingset_refault(void *shadow); 275bool workingset_refault(void *shadow);
@@ -262,8 +282,8 @@ extern unsigned long totalreserve_pages;
262extern unsigned long nr_free_buffer_pages(void); 282extern unsigned long nr_free_buffer_pages(void);
263extern unsigned long nr_free_pagecache_pages(void); 283extern unsigned long nr_free_pagecache_pages(void);
264 284
265/* Definition of global_page_state not available yet */ 285/* Definition of global_zone_page_state not available yet */
266#define nr_free_pages() global_page_state(NR_FREE_PAGES) 286#define nr_free_pages() global_zone_page_state(NR_FREE_PAGES)
267 287
268 288
269/* linux/mm/swap.c */ 289/* linux/mm/swap.c */
@@ -349,6 +369,7 @@ int generic_swapfile_activate(struct swap_info_struct *, struct file *,
349#define SWAP_ADDRESS_SPACE_SHIFT 14 369#define SWAP_ADDRESS_SPACE_SHIFT 14
350#define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT) 370#define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT)
351extern struct address_space *swapper_spaces[]; 371extern struct address_space *swapper_spaces[];
372extern bool swap_vma_readahead;
352#define swap_address_space(entry) \ 373#define swap_address_space(entry) \
353 (&swapper_spaces[swp_type(entry)][swp_offset(entry) \ 374 (&swapper_spaces[swp_type(entry)][swp_offset(entry) \
354 >> SWAP_ADDRESS_SPACE_SHIFT]) 375 >> SWAP_ADDRESS_SPACE_SHIFT])
@@ -361,7 +382,9 @@ extern void __delete_from_swap_cache(struct page *);
361extern void delete_from_swap_cache(struct page *); 382extern void delete_from_swap_cache(struct page *);
362extern void free_page_and_swap_cache(struct page *); 383extern void free_page_and_swap_cache(struct page *);
363extern void free_pages_and_swap_cache(struct page **, int); 384extern void free_pages_and_swap_cache(struct page **, int);
364extern struct page *lookup_swap_cache(swp_entry_t); 385extern struct page *lookup_swap_cache(swp_entry_t entry,
386 struct vm_area_struct *vma,
387 unsigned long addr);
365extern struct page *read_swap_cache_async(swp_entry_t, gfp_t, 388extern struct page *read_swap_cache_async(swp_entry_t, gfp_t,
366 struct vm_area_struct *vma, unsigned long addr, 389 struct vm_area_struct *vma, unsigned long addr,
367 bool do_poll); 390 bool do_poll);
@@ -371,11 +394,23 @@ extern struct page *__read_swap_cache_async(swp_entry_t, gfp_t,
371extern struct page *swapin_readahead(swp_entry_t, gfp_t, 394extern struct page *swapin_readahead(swp_entry_t, gfp_t,
372 struct vm_area_struct *vma, unsigned long addr); 395 struct vm_area_struct *vma, unsigned long addr);
373 396
397extern struct page *swap_readahead_detect(struct vm_fault *vmf,
398 struct vma_swap_readahead *swap_ra);
399extern struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask,
400 struct vm_fault *vmf,
401 struct vma_swap_readahead *swap_ra);
402
374/* linux/mm/swapfile.c */ 403/* linux/mm/swapfile.c */
375extern atomic_long_t nr_swap_pages; 404extern atomic_long_t nr_swap_pages;
376extern long total_swap_pages; 405extern long total_swap_pages;
406extern atomic_t nr_rotate_swap;
377extern bool has_usable_swap(void); 407extern bool has_usable_swap(void);
378 408
409static inline bool swap_use_vma_readahead(void)
410{
411 return READ_ONCE(swap_vma_readahead) && !atomic_read(&nr_rotate_swap);
412}
413
379/* Swap 50% full? Release swapcache more aggressively.. */ 414/* Swap 50% full? Release swapcache more aggressively.. */
380static inline bool vm_swap_full(void) 415static inline bool vm_swap_full(void)
381{ 416{
@@ -465,12 +500,32 @@ static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
465 return NULL; 500 return NULL;
466} 501}
467 502
503static inline bool swap_use_vma_readahead(void)
504{
505 return false;
506}
507
508static inline struct page *swap_readahead_detect(
509 struct vm_fault *vmf, struct vma_swap_readahead *swap_ra)
510{
511 return NULL;
512}
513
514static inline struct page *do_swap_page_readahead(
515 swp_entry_t fentry, gfp_t gfp_mask,
516 struct vm_fault *vmf, struct vma_swap_readahead *swap_ra)
517{
518 return NULL;
519}
520
468static inline int swap_writepage(struct page *p, struct writeback_control *wbc) 521static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
469{ 522{
470 return 0; 523 return 0;
471} 524}
472 525
473static inline struct page *lookup_swap_cache(swp_entry_t swp) 526static inline struct page *lookup_swap_cache(swp_entry_t swp,
527 struct vm_area_struct *vma,
528 unsigned long addr)
474{ 529{
475 return NULL; 530 return NULL;
476} 531}
@@ -509,8 +564,8 @@ static inline int swp_swapcount(swp_entry_t entry)
509 return 0; 564 return 0;
510} 565}
511 566
512#define reuse_swap_page(page, total_mapcount) \ 567#define reuse_swap_page(page, total_map_swapcount) \
513 (page_trans_huge_mapcount(page, total_mapcount) == 1) 568 (page_trans_huge_mapcount(page, total_map_swapcount) == 1)
514 569
515static inline int try_to_free_swap(struct page *page) 570static inline int try_to_free_swap(struct page *page)
516{ 571{
@@ -526,6 +581,15 @@ static inline swp_entry_t get_swap_page(struct page *page)
526 581
527#endif /* CONFIG_SWAP */ 582#endif /* CONFIG_SWAP */
528 583
584#ifdef CONFIG_THP_SWAP
585extern int split_swap_cluster(swp_entry_t entry);
586#else
587static inline int split_swap_cluster(swp_entry_t entry)
588{
589 return 0;
590}
591#endif
592
529#ifdef CONFIG_MEMCG 593#ifdef CONFIG_MEMCG
530static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) 594static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
531{ 595{
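swap_use_vma_readahead() gates the new VMA-based readahead on the global swap_vma_readahead knob and on the absence of rotational swap devices (nr_rotate_swap). Roughly, a swap-in path can choose a strategy as sketched below; this is illustrative only and not a copy of do_swap_page() ('entry' is the faulting swap entry):

    struct vma_swap_readahead swap_ra;
    struct page *page;

    if (swap_use_vma_readahead())
            page = do_swap_page_readahead(entry, GFP_HIGHUSER_MOVABLE,
                                          vmf, &swap_ra);
    else
            page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
                                    vmf->vma, vmf->address);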
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 37e8d31a4632..d77bc35278b0 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -85,6 +85,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
85#endif 85#endif
86 THP_ZERO_PAGE_ALLOC, 86 THP_ZERO_PAGE_ALLOC,
87 THP_ZERO_PAGE_ALLOC_FAILED, 87 THP_ZERO_PAGE_ALLOC_FAILED,
88 THP_SWPOUT,
89 THP_SWPOUT_FALLBACK,
88#endif 90#endif
89#ifdef CONFIG_MEMORY_BALLOON 91#ifdef CONFIG_MEMORY_BALLOON
90 BALLOON_INFLATE, 92 BALLOON_INFLATE,
@@ -104,6 +106,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
104 VMACACHE_FIND_HITS, 106 VMACACHE_FIND_HITS,
105 VMACACHE_FULL_FLUSHES, 107 VMACACHE_FULL_FLUSHES,
106#endif 108#endif
109#ifdef CONFIG_SWAP
110 SWAP_RA,
111 SWAP_RA_HIT,
112#endif
107 NR_VM_EVENT_ITEMS 113 NR_VM_EVENT_ITEMS
108}; 114};
109 115
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index b3d85f30d424..97e11ab573f0 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -123,7 +123,7 @@ static inline void node_page_state_add(long x, struct pglist_data *pgdat,
123 atomic_long_add(x, &vm_node_stat[item]); 123 atomic_long_add(x, &vm_node_stat[item]);
124} 124}
125 125
126static inline unsigned long global_page_state(enum zone_stat_item item) 126static inline unsigned long global_zone_page_state(enum zone_stat_item item)
127{ 127{
128 long x = atomic_long_read(&vm_zone_stat[item]); 128 long x = atomic_long_read(&vm_zone_stat[item]);
129#ifdef CONFIG_SMP 129#ifdef CONFIG_SMP
@@ -199,7 +199,7 @@ extern unsigned long sum_zone_node_page_state(int node,
199extern unsigned long node_page_state(struct pglist_data *pgdat, 199extern unsigned long node_page_state(struct pglist_data *pgdat,
200 enum node_stat_item item); 200 enum node_stat_item item);
201#else 201#else
202#define sum_zone_node_page_state(node, item) global_page_state(item) 202#define sum_zone_node_page_state(node, item) global_zone_page_state(item)
203#define node_page_state(node, item) global_node_page_state(item) 203#define node_page_state(node, item) global_node_page_state(item)
204#endif /* CONFIG_NUMA */ 204#endif /* CONFIG_NUMA */
205 205
diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h
index 08bb3ed18dcc..fbc4a06f7310 100644
--- a/include/trace/events/fs_dax.h
+++ b/include/trace/events/fs_dax.h
@@ -190,8 +190,6 @@ DEFINE_EVENT(dax_pte_fault_class, name, \
190 190
191DEFINE_PTE_FAULT_EVENT(dax_pte_fault); 191DEFINE_PTE_FAULT_EVENT(dax_pte_fault);
192DEFINE_PTE_FAULT_EVENT(dax_pte_fault_done); 192DEFINE_PTE_FAULT_EVENT(dax_pte_fault_done);
193DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite_no_entry);
194DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite);
195DEFINE_PTE_FAULT_EVENT(dax_load_hole); 193DEFINE_PTE_FAULT_EVENT(dax_load_hole);
196 194
197TRACE_EVENT(dax_insert_mapping, 195TRACE_EVENT(dax_insert_mapping,
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 8e50d01c645f..4c2e4737d7bc 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -125,12 +125,6 @@ IF_HAVE_PG_IDLE(PG_idle, "idle" )
125#define __VM_ARCH_SPECIFIC_1 {VM_ARCH_1, "arch_1" } 125#define __VM_ARCH_SPECIFIC_1 {VM_ARCH_1, "arch_1" }
126#endif 126#endif
127 127
128#if defined(CONFIG_X86)
129#define __VM_ARCH_SPECIFIC_2 {VM_MPX, "mpx" }
130#else
131#define __VM_ARCH_SPECIFIC_2 {VM_ARCH_2, "arch_2" }
132#endif
133
134#ifdef CONFIG_MEM_SOFT_DIRTY 128#ifdef CONFIG_MEM_SOFT_DIRTY
135#define IF_HAVE_VM_SOFTDIRTY(flag,name) {flag, name }, 129#define IF_HAVE_VM_SOFTDIRTY(flag,name) {flag, name },
136#else 130#else
@@ -162,7 +156,7 @@ IF_HAVE_PG_IDLE(PG_idle, "idle" )
162 {VM_NORESERVE, "noreserve" }, \ 156 {VM_NORESERVE, "noreserve" }, \
163 {VM_HUGETLB, "hugetlb" }, \ 157 {VM_HUGETLB, "hugetlb" }, \
164 __VM_ARCH_SPECIFIC_1 , \ 158 __VM_ARCH_SPECIFIC_1 , \
165 __VM_ARCH_SPECIFIC_2 , \ 159 {VM_WIPEONFORK, "wipeonfork" }, \
166 {VM_DONTDUMP, "dontdump" }, \ 160 {VM_DONTDUMP, "dontdump" }, \
167IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \ 161IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \
168 {VM_MIXEDMAP, "mixedmap" }, \ 162 {VM_MIXEDMAP, "mixedmap" }, \
diff --git a/include/uapi/asm-generic/hugetlb_encode.h b/include/uapi/asm-generic/hugetlb_encode.h
new file mode 100644
index 000000000000..e4732d3c2998
--- /dev/null
+++ b/include/uapi/asm-generic/hugetlb_encode.h
@@ -0,0 +1,34 @@
1#ifndef _ASM_GENERIC_HUGETLB_ENCODE_H_
2#define _ASM_GENERIC_HUGETLB_ENCODE_H_
3
4/*
5 * Several system calls take a flag to request "hugetlb" huge pages.
6 * Without further specification, these system calls will use the
7 * system's default huge page size. If a system supports multiple
8 * huge page sizes, the desired huge page size can be specified in
9 * bits [26:31] of the flag arguments. The value in these 6 bits
10 * will encode the log2 of the huge page size.
11 *
12 * The following definitions are associated with this huge page size
13 * encoding in flag arguments. System call specific header files
14 * that use this encoding should include this file. They can then
 15 * provide definitions based on these with their own specific prefix,
 16 * for example:
17 * #define MAP_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT
18 */
19
20#define HUGETLB_FLAG_ENCODE_SHIFT 26
21#define HUGETLB_FLAG_ENCODE_MASK 0x3f
22
23#define HUGETLB_FLAG_ENCODE_64KB (16 << HUGETLB_FLAG_ENCODE_SHIFT)
24#define HUGETLB_FLAG_ENCODE_512KB (19 << HUGETLB_FLAG_ENCODE_SHIFT)
25#define HUGETLB_FLAG_ENCODE_1MB (20 << HUGETLB_FLAG_ENCODE_SHIFT)
26#define HUGETLB_FLAG_ENCODE_2MB (21 << HUGETLB_FLAG_ENCODE_SHIFT)
27#define HUGETLB_FLAG_ENCODE_8MB (23 << HUGETLB_FLAG_ENCODE_SHIFT)
28#define HUGETLB_FLAG_ENCODE_16MB (24 << HUGETLB_FLAG_ENCODE_SHIFT)
29#define HUGETLB_FLAG_ENCODE_256MB (28 << HUGETLB_FLAG_ENCODE_SHIFT)
30#define HUGETLB_FLAG_ENCODE_1GB (30 << HUGETLB_FLAG_ENCODE_SHIFT)
31#define HUGETLB_FLAG_ENCODE_2GB (31 << HUGETLB_FLAG_ENCODE_SHIFT)
32#define HUGETLB_FLAG_ENCODE_16GB (34 << HUGETLB_FLAG_ENCODE_SHIFT)
33
34#endif /* _ASM_GENERIC_HUGETLB_ENCODE_H_ */
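The encodings are simply log2(size) shifted into bits [26:31]; for example 2MB = 2^21, so HUGETLB_FLAG_ENCODE_2MB is 21 << HUGETLB_FLAG_ENCODE_SHIFT. Decoding goes the other way (a minimal sketch; 'flags' is any flag word built with these definitions):

    unsigned int shift = (flags >> HUGETLB_FLAG_ENCODE_SHIFT) &
                         HUGETLB_FLAG_ENCODE_MASK;
    unsigned long size = shift ? 1UL << shift : 0;  /* 0: default huge page size */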
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 8c27db0c5c08..203268f9231e 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -58,20 +58,12 @@
58 overrides the coredump filter bits */ 58 overrides the coredump filter bits */
59#define MADV_DODUMP 17 /* Clear the MADV_DONTDUMP flag */ 59#define MADV_DODUMP 17 /* Clear the MADV_DONTDUMP flag */
60 60
61#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */
62#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */
63
61/* compatibility flags */ 64/* compatibility flags */
62#define MAP_FILE 0 65#define MAP_FILE 0
63 66
64/*
65 * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
66 * This gives us 6 bits, which is enough until someone invents 128 bit address
67 * spaces.
68 *
69 * Assume these are all power of twos.
70 * When 0 use the default page size.
71 */
72#define MAP_HUGE_SHIFT 26
73#define MAP_HUGE_MASK 0x3f
74
75#define PKEY_DISABLE_ACCESS 0x1 67#define PKEY_DISABLE_ACCESS 0x1
76#define PKEY_DISABLE_WRITE 0x2 68#define PKEY_DISABLE_WRITE 0x2
77#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ 69#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\
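MADV_WIPEONFORK marks a private anonymous mapping so that, after fork(), the child reads zeroes from it while the parent keeps its contents; MADV_KEEPONFORK clears the mark again. A minimal userspace sketch (error handling omitted; assumes headers that already define MADV_WIPEONFORK):

    #define _GNU_SOURCE
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            size_t len = 1 << 20;
            char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            p[0] = 42;
            madvise(p, len, MADV_WIPEONFORK);
            if (fork() == 0)
                    _exit(p[0]);    /* exits with 0: the child sees zeroes */
            return 0;               /* the parent still sees 42 in p[0] */
    }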
diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h
index 534e364bda92..7f3a722dbd72 100644
--- a/include/uapi/linux/memfd.h
+++ b/include/uapi/linux/memfd.h
@@ -1,8 +1,32 @@
1#ifndef _UAPI_LINUX_MEMFD_H 1#ifndef _UAPI_LINUX_MEMFD_H
2#define _UAPI_LINUX_MEMFD_H 2#define _UAPI_LINUX_MEMFD_H
3 3
4#include <asm-generic/hugetlb_encode.h>
5
4/* flags for memfd_create(2) (unsigned int) */ 6/* flags for memfd_create(2) (unsigned int) */
5#define MFD_CLOEXEC 0x0001U 7#define MFD_CLOEXEC 0x0001U
6#define MFD_ALLOW_SEALING 0x0002U 8#define MFD_ALLOW_SEALING 0x0002U
9#define MFD_HUGETLB 0x0004U
10
11/*
12 * Huge page size encoding when MFD_HUGETLB is specified, and a huge page
13 * size other than the default is desired. See hugetlb_encode.h.
14 * All known huge page size encodings are provided here. It is the
15 * responsibility of the application to know which sizes are supported on
16 * the running system. See mmap(2) man page for details.
17 */
18#define MFD_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT
19#define MFD_HUGE_MASK HUGETLB_FLAG_ENCODE_MASK
20
21#define MFD_HUGE_64KB HUGETLB_FLAG_ENCODE_64KB
22#define MFD_HUGE_512KB HUGETLB_FLAG_ENCODE_512KB
23#define MFD_HUGE_1MB HUGETLB_FLAG_ENCODE_1MB
24#define MFD_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB
25#define MFD_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB
26#define MFD_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB
27#define MFD_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB
28#define MFD_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB
29#define MFD_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB
30#define MFD_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB
7 31
8#endif /* _UAPI_LINUX_MEMFD_H */ 32#endif /* _UAPI_LINUX_MEMFD_H */
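MFD_HUGETLB makes the memfd hugetlbfs-backed, and one of the MFD_HUGE_* values can be OR-ed in to select a non-default huge page size. A hedged sketch using the raw syscall (glibc of this era does not yet wrap memfd_create(); the name string is arbitrary):

    #include <linux/memfd.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int fd = syscall(__NR_memfd_create, "huge-buffer",
                     MFD_CLOEXEC | MFD_HUGETLB | MFD_HUGE_2MB);
    /* subsequent ftruncate()/mmap() sizes must be multiples of 2MB */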
diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
index ade4acd3a90c..a937480d7cd3 100644
--- a/include/uapi/linux/mman.h
+++ b/include/uapi/linux/mman.h
@@ -2,6 +2,7 @@
2#define _UAPI_LINUX_MMAN_H 2#define _UAPI_LINUX_MMAN_H
3 3
4#include <asm/mman.h> 4#include <asm/mman.h>
5#include <asm-generic/hugetlb_encode.h>
5 6
6#define MREMAP_MAYMOVE 1 7#define MREMAP_MAYMOVE 1
7#define MREMAP_FIXED 2 8#define MREMAP_FIXED 2
@@ -10,4 +11,25 @@
10#define OVERCOMMIT_ALWAYS 1 11#define OVERCOMMIT_ALWAYS 1
11#define OVERCOMMIT_NEVER 2 12#define OVERCOMMIT_NEVER 2
12 13
14/*
15 * Huge page size encoding when MAP_HUGETLB is specified, and a huge page
16 * size other than the default is desired. See hugetlb_encode.h.
17 * All known huge page size encodings are provided here. It is the
18 * responsibility of the application to know which sizes are supported on
19 * the running system. See mmap(2) man page for details.
20 */
21#define MAP_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT
22#define MAP_HUGE_MASK HUGETLB_FLAG_ENCODE_MASK
23
24#define MAP_HUGE_64KB HUGETLB_FLAG_ENCODE_64KB
25#define MAP_HUGE_512KB HUGETLB_FLAG_ENCODE_512KB
26#define MAP_HUGE_1MB HUGETLB_FLAG_ENCODE_1MB
27#define MAP_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB
28#define MAP_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB
29#define MAP_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB
30#define MAP_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB
31#define MAP_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB
32#define MAP_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB
33#define MAP_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB
34
13#endif /* _UAPI_LINUX_MMAN_H */ 35#endif /* _UAPI_LINUX_MMAN_H */
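The same encodings apply to mmap(MAP_HUGETLB); for example, requesting 1GB pages for an anonymous mapping (a sketch; it fails unless 1GB huge pages are configured and available):

    void *p = mmap(NULL, 1UL << 30, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_1GB,
                   -1, 0);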
diff --git a/include/uapi/linux/shm.h b/include/uapi/linux/shm.h
index 1fbf24ea37fd..cf23c873719d 100644
--- a/include/uapi/linux/shm.h
+++ b/include/uapi/linux/shm.h
@@ -3,6 +3,7 @@
3 3
4#include <linux/ipc.h> 4#include <linux/ipc.h>
5#include <linux/errno.h> 5#include <linux/errno.h>
6#include <asm-generic/hugetlb_encode.h>
6#ifndef __KERNEL__ 7#ifndef __KERNEL__
7#include <unistd.h> 8#include <unistd.h>
8#endif 9#endif
@@ -40,11 +41,37 @@ struct shmid_ds {
40/* Include the definition of shmid64_ds and shminfo64 */ 41/* Include the definition of shmid64_ds and shminfo64 */
41#include <asm/shmbuf.h> 42#include <asm/shmbuf.h>
42 43
43/* permission flag for shmget */ 44/*
45 * shmget() shmflg values.
46 */
47/* The bottom nine bits are the same as open(2) mode flags */
44#define SHM_R 0400 /* or S_IRUGO from <linux/stat.h> */ 48#define SHM_R 0400 /* or S_IRUGO from <linux/stat.h> */
45#define SHM_W 0200 /* or S_IWUGO from <linux/stat.h> */ 49#define SHM_W 0200 /* or S_IWUGO from <linux/stat.h> */
50/* Bits 9 & 10 are IPC_CREAT and IPC_EXCL */
51#define SHM_HUGETLB 04000 /* segment will use huge TLB pages */
52#define SHM_NORESERVE 010000 /* don't check for reservations */
53
54/*
55 * Huge page size encoding when SHM_HUGETLB is specified, and a huge page
56 * size other than the default is desired. See hugetlb_encode.h
57 */
58#define SHM_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT
59#define SHM_HUGE_MASK HUGETLB_FLAG_ENCODE_MASK
60
61#define SHM_HUGE_64KB HUGETLB_FLAG_ENCODE_64KB
62#define SHM_HUGE_512KB HUGETLB_FLAG_ENCODE_512KB
63#define SHM_HUGE_1MB HUGETLB_FLAG_ENCODE_1MB
64#define SHM_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB
65#define SHM_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB
66#define SHM_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB
67#define SHM_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB
68#define SHM_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB
69#define SHM_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB
70#define SHM_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB
46 71
47/* mode for attach */ 72/*
73 * shmat() shmflg values
74 */
48#define SHM_RDONLY 010000 /* read-only access */ 75#define SHM_RDONLY 010000 /* read-only access */
49#define SHM_RND 020000 /* round attach address to SHMLBA boundary */ 76#define SHM_RND 020000 /* round attach address to SHMLBA boundary */
50#define SHM_REMAP 040000 /* take-over region on attach */ 77#define SHM_REMAP 040000 /* take-over region on attach */
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 3b059530dac9..d6d1f65cb3c3 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -23,7 +23,9 @@
23 UFFD_FEATURE_EVENT_REMOVE | \ 23 UFFD_FEATURE_EVENT_REMOVE | \
24 UFFD_FEATURE_EVENT_UNMAP | \ 24 UFFD_FEATURE_EVENT_UNMAP | \
25 UFFD_FEATURE_MISSING_HUGETLBFS | \ 25 UFFD_FEATURE_MISSING_HUGETLBFS | \
26 UFFD_FEATURE_MISSING_SHMEM) 26 UFFD_FEATURE_MISSING_SHMEM | \
27 UFFD_FEATURE_SIGBUS | \
28 UFFD_FEATURE_THREAD_ID)
27#define UFFD_API_IOCTLS \ 29#define UFFD_API_IOCTLS \
28 ((__u64)1 << _UFFDIO_REGISTER | \ 30 ((__u64)1 << _UFFDIO_REGISTER | \
29 (__u64)1 << _UFFDIO_UNREGISTER | \ 31 (__u64)1 << _UFFDIO_UNREGISTER | \
@@ -78,6 +80,9 @@ struct uffd_msg {
78 struct { 80 struct {
79 __u64 flags; 81 __u64 flags;
80 __u64 address; 82 __u64 address;
83 union {
84 __u32 ptid;
85 } feat;
81 } pagefault; 86 } pagefault;
82 87
83 struct { 88 struct {
@@ -153,6 +158,13 @@ struct uffdio_api {
153 * UFFD_FEATURE_MISSING_SHMEM works the same as 158 * UFFD_FEATURE_MISSING_SHMEM works the same as
154 * UFFD_FEATURE_MISSING_HUGETLBFS, but it applies to shmem 159 * UFFD_FEATURE_MISSING_HUGETLBFS, but it applies to shmem
155 * (i.e. tmpfs and other shmem based APIs). 160 * (i.e. tmpfs and other shmem based APIs).
161 *
 162 * UFFD_FEATURE_SIGBUS means that no page-fault
 163 * (UFFD_EVENT_PAGEFAULT) event will be delivered; instead,
 164 * a SIGBUS signal will be sent to the faulting process.
 165 *
 166 * UFFD_FEATURE_THREAD_ID means the pid of the faulting task_struct
 167 * will be returned; if the feature is not requested, 0 is returned.
156 */ 168 */
157#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) 169#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0)
158#define UFFD_FEATURE_EVENT_FORK (1<<1) 170#define UFFD_FEATURE_EVENT_FORK (1<<1)
@@ -161,6 +173,8 @@ struct uffdio_api {
161#define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) 173#define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4)
162#define UFFD_FEATURE_MISSING_SHMEM (1<<5) 174#define UFFD_FEATURE_MISSING_SHMEM (1<<5)
163#define UFFD_FEATURE_EVENT_UNMAP (1<<6) 175#define UFFD_FEATURE_EVENT_UNMAP (1<<6)
176#define UFFD_FEATURE_SIGBUS (1<<7)
177#define UFFD_FEATURE_THREAD_ID (1<<8)
164 __u64 features; 178 __u64 features;
165 179
166 __u64 ioctls; 180 __u64 ioctls;
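With UFFD_FEATURE_SIGBUS, a missing-page fault in a registered range raises SIGBUS in the faulting task instead of queueing a UFFD_EVENT_PAGEFAULT for the monitor. A hedged sketch of requesting the new features during the UFFDIO_API handshake (error handling omitted):

    #include <fcntl.h>
    #include <linux/userfaultfd.h>
    #include <sys/ioctl.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    struct uffdio_api api = {
            .api = UFFD_API,
            .features = UFFD_FEATURE_SIGBUS | UFFD_FEATURE_THREAD_ID,
    };
    ioctl(uffd, UFFDIO_API, &api);  /* api.features echoes what was granted */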
diff --git a/init/Kconfig b/init/Kconfig
index 5f0ef850e808..78cb2461012e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1576,6 +1576,15 @@ config SLAB_FREELIST_RANDOM
1576 security feature reduces the predictability of the kernel slab 1576 security feature reduces the predictability of the kernel slab
1577 allocator against heap overflows. 1577 allocator against heap overflows.
1578 1578
1579config SLAB_FREELIST_HARDENED
1580 bool "Harden slab freelist metadata"
1581 depends on SLUB
1582 help
1583 Many kernel heap attacks try to target slab cache metadata and
1584 other infrastructure. This option makes minor performance
1585 sacrifices to harden the kernel slab allocator against common
1586 freelist exploit methods.
1587
1579config SLUB_CPU_PARTIAL 1588config SLUB_CPU_PARTIAL
1580 default y 1589 default y
1581 depends on SLUB && SMP 1590 depends on SLUB && SMP
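The SLAB_FREELIST_HARDENED option above pairs with the new per-cache 'unsigned long random' field in struct kmem_cache: SLUB mixes that secret (and the pointer's storage location) into every stored freelist pointer, so a leaked or attacker-overwritten freelist entry no longer decodes to a usable kernel address. A sketch of the idea, not necessarily the exact kernel code:

    static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr,
                                     unsigned long ptr_addr)
    {
    #ifdef CONFIG_SLAB_FREELIST_HARDENED
            return (void *)((unsigned long)ptr ^ s->random ^ ptr_addr);
    #else
            return ptr;
    #endif
    }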
diff --git a/init/main.c b/init/main.c
index 8828fc148670..a21a1a8708a8 100644
--- a/init/main.c
+++ b/init/main.c
@@ -542,7 +542,7 @@ asmlinkage __visible void __init start_kernel(void)
542 boot_cpu_state_init(); 542 boot_cpu_state_init();
543 smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ 543 smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
544 544
545 build_all_zonelists(NULL, NULL); 545 build_all_zonelists(NULL);
546 page_alloc_init(); 546 page_alloc_init();
547 547
548 pr_notice("Kernel command line: %s\n", boot_command_line); 548 pr_notice("Kernel command line: %s\n", boot_command_line);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index df2e0f14a95d..f64fc967a9ef 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4100,9 +4100,6 @@ static void offline_css(struct cgroup_subsys_state *css)
4100 if (!(css->flags & CSS_ONLINE)) 4100 if (!(css->flags & CSS_ONLINE))
4101 return; 4101 return;
4102 4102
4103 if (ss->css_reset)
4104 ss->css_reset(css);
4105
4106 if (ss->css_offline) 4103 if (ss->css_offline)
4107 ss->css_offline(css); 4104 ss->css_offline(css);
4108 4105
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 2f4039bafebb..e7485786db9b 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -56,6 +56,7 @@
56#include <linux/time64.h> 56#include <linux/time64.h>
57#include <linux/backing-dev.h> 57#include <linux/backing-dev.h>
58#include <linux/sort.h> 58#include <linux/sort.h>
59#include <linux/oom.h>
59 60
60#include <linux/uaccess.h> 61#include <linux/uaccess.h>
61#include <linux/atomic.h> 62#include <linux/atomic.h>
@@ -2500,12 +2501,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
2500 * If we're in interrupt, yes, we can always allocate. If @node is set in 2501 * If we're in interrupt, yes, we can always allocate. If @node is set in
2501 * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this 2502 * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
2502 * node is set in the nearest hardwalled cpuset ancestor to current's cpuset, 2503 * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
2503 * yes. If current has access to memory reserves due to TIF_MEMDIE, yes. 2504 * yes. If current has access to memory reserves as an oom victim, yes.
2504 * Otherwise, no. 2505 * Otherwise, no.
2505 * 2506 *
2506 * GFP_USER allocations are marked with the __GFP_HARDWALL bit, 2507 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
2507 * and do not allow allocations outside the current tasks cpuset 2508 * and do not allow allocations outside the current tasks cpuset
2508 * unless the task has been OOM killed as is marked TIF_MEMDIE. 2509 * unless the task has been OOM killed.
2509 * GFP_KERNEL allocations are not so marked, so can escape to the 2510 * GFP_KERNEL allocations are not so marked, so can escape to the
2510 * nearest enclosing hardwalled ancestor cpuset. 2511 * nearest enclosing hardwalled ancestor cpuset.
2511 * 2512 *
@@ -2528,7 +2529,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
2528 * affect that: 2529 * affect that:
2529 * in_interrupt - any node ok (current task context irrelevant) 2530 * in_interrupt - any node ok (current task context irrelevant)
2530 * GFP_ATOMIC - any node ok 2531 * GFP_ATOMIC - any node ok
2531 * TIF_MEMDIE - any node ok 2532 * tsk_is_oom_victim - any node ok
2532 * GFP_KERNEL - any node in enclosing hardwalled cpuset ok 2533 * GFP_KERNEL - any node in enclosing hardwalled cpuset ok
2533 * GFP_USER - only nodes in current tasks mems allowed ok. 2534 * GFP_USER - only nodes in current tasks mems allowed ok.
2534 */ 2535 */
@@ -2546,7 +2547,7 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
2546 * Allow tasks that have access to memory reserves because they have 2547 * Allow tasks that have access to memory reserves because they have
2547 * been OOM killed to get memory anywhere. 2548 * been OOM killed to get memory anywhere.
2548 */ 2549 */
2549 if (unlikely(test_thread_flag(TIF_MEMDIE))) 2550 if (unlikely(tsk_is_oom_victim(current)))
2550 return true; 2551 return true;
2551 if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ 2552 if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
2552 return false; 2553 return false;
diff --git a/kernel/fork.c b/kernel/fork.c
index 4e5345c07344..24a4c0be80d5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -657,7 +657,12 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
657 retval = dup_userfaultfd(tmp, &uf); 657 retval = dup_userfaultfd(tmp, &uf);
658 if (retval) 658 if (retval)
659 goto fail_nomem_anon_vma_fork; 659 goto fail_nomem_anon_vma_fork;
660 if (anon_vma_fork(tmp, mpnt)) 660 if (tmp->vm_flags & VM_WIPEONFORK) {
661 /* VM_WIPEONFORK gets a clean slate in the child. */
662 tmp->anon_vma = NULL;
663 if (anon_vma_prepare(tmp))
664 goto fail_nomem_anon_vma_fork;
665 } else if (anon_vma_fork(tmp, mpnt))
661 goto fail_nomem_anon_vma_fork; 666 goto fail_nomem_anon_vma_fork;
662 tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); 667 tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
663 tmp->vm_next = tmp->vm_prev = NULL; 668 tmp->vm_next = tmp->vm_prev = NULL;
@@ -701,7 +706,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
701 rb_parent = &tmp->vm_rb; 706 rb_parent = &tmp->vm_rb;
702 707
703 mm->map_count++; 708 mm->map_count++;
704 retval = copy_page_range(mm, oldmm, mpnt); 709 if (!(tmp->vm_flags & VM_WIPEONFORK))
710 retval = copy_page_range(mm, oldmm, mpnt);
705 711
706 if (tmp->vm_ops && tmp->vm_ops->open) 712 if (tmp->vm_ops && tmp->vm_ops->open)
707 tmp->vm_ops->open(tmp); 713 tmp->vm_ops->open(tmp);
@@ -922,7 +928,6 @@ static inline void __mmput(struct mm_struct *mm)
922 } 928 }
923 if (mm->binfmt) 929 if (mm->binfmt)
924 module_put(mm->binfmt->module); 930 module_put(mm->binfmt->module);
925 set_bit(MMF_OOM_SKIP, &mm->flags);
926 mmdrop(mm); 931 mmdrop(mm);
927} 932}
928 933
@@ -938,22 +943,6 @@ void mmput(struct mm_struct *mm)
938} 943}
939EXPORT_SYMBOL_GPL(mmput); 944EXPORT_SYMBOL_GPL(mmput);
940 945
941#ifdef CONFIG_MMU
942static void mmput_async_fn(struct work_struct *work)
943{
944 struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
945 __mmput(mm);
946}
947
948void mmput_async(struct mm_struct *mm)
949{
950 if (atomic_dec_and_test(&mm->mm_users)) {
951 INIT_WORK(&mm->async_put_work, mmput_async_fn);
952 schedule_work(&mm->async_put_work);
953 }
954}
955#endif
956
957/** 946/**
958 * set_mm_exe_file - change a reference to the mm's executable file 947 * set_mm_exe_file - change a reference to the mm's executable file
959 * 948 *
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 9afdc434fb49..066e73c2fcc9 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -194,18 +194,41 @@ struct page_map {
194 struct vmem_altmap altmap; 194 struct vmem_altmap altmap;
195}; 195};
196 196
197static void pgmap_radix_release(struct resource *res) 197static unsigned long order_at(struct resource *res, unsigned long pgoff)
198{ 198{
199 resource_size_t key, align_start, align_size, align_end; 199 unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff;
200 unsigned long nr_pages, mask;
200 201
201 align_start = res->start & ~(SECTION_SIZE - 1); 202 nr_pages = PHYS_PFN(resource_size(res));
202 align_size = ALIGN(resource_size(res), SECTION_SIZE); 203 if (nr_pages == pgoff)
203 align_end = align_start + align_size - 1; 204 return ULONG_MAX;
205
206 /*
207 * What is the largest aligned power-of-2 range available from
208 * this resource pgoff to the end of the resource range,
209 * considering the alignment of the current pgoff?
210 */
211 mask = phys_pgoff | rounddown_pow_of_two(nr_pages - pgoff);
212 if (!mask)
213 return ULONG_MAX;
214
215 return find_first_bit(&mask, BITS_PER_LONG);
216}
217
218#define foreach_order_pgoff(res, order, pgoff) \
219 for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \
220 pgoff += 1UL << order, order = order_at((res), pgoff))
221
222static void pgmap_radix_release(struct resource *res)
223{
224 unsigned long pgoff, order;
204 225
205 mutex_lock(&pgmap_lock); 226 mutex_lock(&pgmap_lock);
206 for (key = res->start; key <= res->end; key += SECTION_SIZE) 227 foreach_order_pgoff(res, order, pgoff)
207 radix_tree_delete(&pgmap_radix, key >> PA_SECTION_SHIFT); 228 radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff);
208 mutex_unlock(&pgmap_lock); 229 mutex_unlock(&pgmap_lock);
230
231 synchronize_rcu();
209} 232}
210 233
211static unsigned long pfn_first(struct page_map *page_map) 234static unsigned long pfn_first(struct page_map *page_map)
@@ -268,7 +291,7 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
268 291
269 WARN_ON_ONCE(!rcu_read_lock_held()); 292 WARN_ON_ONCE(!rcu_read_lock_held());
270 293
271 page_map = radix_tree_lookup(&pgmap_radix, phys >> PA_SECTION_SHIFT); 294 page_map = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys));
272 return page_map ? &page_map->pgmap : NULL; 295 return page_map ? &page_map->pgmap : NULL;
273} 296}
274 297
@@ -293,12 +316,12 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
293void *devm_memremap_pages(struct device *dev, struct resource *res, 316void *devm_memremap_pages(struct device *dev, struct resource *res,
294 struct percpu_ref *ref, struct vmem_altmap *altmap) 317 struct percpu_ref *ref, struct vmem_altmap *altmap)
295{ 318{
296 resource_size_t key, align_start, align_size, align_end; 319 resource_size_t align_start, align_size, align_end;
320 unsigned long pfn, pgoff, order;
297 pgprot_t pgprot = PAGE_KERNEL; 321 pgprot_t pgprot = PAGE_KERNEL;
298 struct dev_pagemap *pgmap; 322 struct dev_pagemap *pgmap;
299 struct page_map *page_map; 323 struct page_map *page_map;
300 int error, nid, is_ram; 324 int error, nid, is_ram;
301 unsigned long pfn;
302 325
303 align_start = res->start & ~(SECTION_SIZE - 1); 326 align_start = res->start & ~(SECTION_SIZE - 1);
304 align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) 327 align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
@@ -337,11 +360,12 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
337 mutex_lock(&pgmap_lock); 360 mutex_lock(&pgmap_lock);
338 error = 0; 361 error = 0;
339 align_end = align_start + align_size - 1; 362 align_end = align_start + align_size - 1;
340 for (key = align_start; key <= align_end; key += SECTION_SIZE) { 363
364 foreach_order_pgoff(res, order, pgoff) {
341 struct dev_pagemap *dup; 365 struct dev_pagemap *dup;
342 366
343 rcu_read_lock(); 367 rcu_read_lock();
344 dup = find_dev_pagemap(key); 368 dup = find_dev_pagemap(res->start + PFN_PHYS(pgoff));
345 rcu_read_unlock(); 369 rcu_read_unlock();
346 if (dup) { 370 if (dup) {
347 dev_err(dev, "%s: %pr collides with mapping for %s\n", 371 dev_err(dev, "%s: %pr collides with mapping for %s\n",
@@ -349,8 +373,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
349 error = -EBUSY; 373 error = -EBUSY;
350 break; 374 break;
351 } 375 }
352 error = radix_tree_insert(&pgmap_radix, key >> PA_SECTION_SHIFT, 376 error = __radix_tree_insert(&pgmap_radix,
353 page_map); 377 PHYS_PFN(res->start) + pgoff, order, page_map);
354 if (error) { 378 if (error) {
355 dev_err(dev, "%s: failed: %d\n", __func__, error); 379 dev_err(dev, "%s: failed: %d\n", __func__, error);
356 break; 380 break;
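
The order_at()/foreach_order_pgoff() helpers introduced above carve a resource into the largest naturally aligned power-of-two chunks, so devm_memremap_pages() can store one multi-order radix entry per chunk instead of one entry per memory section. A stand-alone sketch of the same arithmetic, with PHYS_PFN() and find_first_bit() replaced by plain integer math and GCC builtins (the sample resource values are made up for illustration):

/* Hypothetical user-space restatement of order_at()/foreach_order_pgoff(). */
#include <limits.h>
#include <stdio.h>

static unsigned long rounddown_pow_of_two(unsigned long n)
{
	return 1UL << (sizeof(n) * 8 - 1 - __builtin_clzl(n));
}

static unsigned long order_at(unsigned long start_pfn, unsigned long nr_pages,
			      unsigned long pgoff)
{
	unsigned long phys_pgoff = start_pfn + pgoff;
	unsigned long mask;

	if (pgoff == nr_pages)
		return ULONG_MAX;	/* past the end of the resource */

	/* largest order allowed by both the pfn alignment and the remaining size */
	mask = phys_pgoff | rounddown_pow_of_two(nr_pages - pgoff);
	if (!mask)
		return ULONG_MAX;

	return __builtin_ctzl(mask);	/* index of the lowest set bit == order */
}

int main(void)
{
	unsigned long start_pfn = 0x80200, nr_pages = 0x1c00, pgoff, order;

	/* same shape as foreach_order_pgoff(): maximal, non-overlapping chunks */
	for (pgoff = 0, order = order_at(start_pfn, nr_pages, pgoff);
	     order != ULONG_MAX;
	     pgoff += 1UL << order, order = order_at(start_pfn, nr_pages, pgoff))
		printf("pgoff %#lx: order %lu\n", pgoff, order);
	return 0;
}
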
diff --git a/mm/Kconfig b/mm/Kconfig
index 48b1af447fa7..0ded10a22639 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -678,6 +678,7 @@ config ZONE_DEVICE
678 depends on MEMORY_HOTREMOVE 678 depends on MEMORY_HOTREMOVE
679 depends on SPARSEMEM_VMEMMAP 679 depends on SPARSEMEM_VMEMMAP
680 depends on ARCH_HAS_ZONE_DEVICE 680 depends on ARCH_HAS_ZONE_DEVICE
681 select RADIX_TREE_MULTIORDER
681 682
682 help 683 help
683 Device memory hotplug support allows for establishing pmem, 684 Device memory hotplug support allows for establishing pmem,
diff --git a/mm/filemap.c b/mm/filemap.c
index 1e01cb6e5173..9d21afd692b9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -130,17 +130,8 @@ static int page_cache_tree_insert(struct address_space *mapping,
130 return -EEXIST; 130 return -EEXIST;
131 131
132 mapping->nrexceptional--; 132 mapping->nrexceptional--;
133 if (!dax_mapping(mapping)) { 133 if (shadowp)
134 if (shadowp) 134 *shadowp = p;
135 *shadowp = p;
136 } else {
137 /* DAX can replace empty locked entry with a hole */
138 WARN_ON_ONCE(p !=
139 dax_radix_locked_entry(0, RADIX_DAX_EMPTY));
140 /* Wakeup waiters for exceptional entry lock */
141 dax_wake_mapping_entry_waiter(mapping, page->index, p,
142 true);
143 }
144 } 135 }
145 __radix_tree_replace(&mapping->page_tree, node, slot, page, 136 __radix_tree_replace(&mapping->page_tree, node, slot, page,
146 workingset_update_node, mapping); 137 workingset_update_node, mapping);
@@ -402,8 +393,7 @@ bool filemap_range_has_page(struct address_space *mapping,
402{ 393{
403 pgoff_t index = start_byte >> PAGE_SHIFT; 394 pgoff_t index = start_byte >> PAGE_SHIFT;
404 pgoff_t end = end_byte >> PAGE_SHIFT; 395 pgoff_t end = end_byte >> PAGE_SHIFT;
405 struct pagevec pvec; 396 struct page *page;
406 bool ret;
407 397
408 if (end_byte < start_byte) 398 if (end_byte < start_byte)
409 return false; 399 return false;
@@ -411,12 +401,10 @@ bool filemap_range_has_page(struct address_space *mapping,
411 if (mapping->nrpages == 0) 401 if (mapping->nrpages == 0)
412 return false; 402 return false;
413 403
414 pagevec_init(&pvec, 0); 404 if (!find_get_pages_range(mapping, &index, end, 1, &page))
415 if (!pagevec_lookup(&pvec, mapping, index, 1))
416 return false; 405 return false;
417 ret = (pvec.pages[0]->index <= end); 406 put_page(page);
418 pagevec_release(&pvec); 407 return true;
419 return ret;
420} 408}
421EXPORT_SYMBOL(filemap_range_has_page); 409EXPORT_SYMBOL(filemap_range_has_page);
422 410
@@ -1564,23 +1552,29 @@ export:
1564} 1552}
1565 1553
1566/** 1554/**
1567 * find_get_pages - gang pagecache lookup 1555 * find_get_pages_range - gang pagecache lookup
1568 * @mapping: The address_space to search 1556 * @mapping: The address_space to search
1569 * @start: The starting page index 1557 * @start: The starting page index
1558 * @end: The final page index (inclusive)
1570 * @nr_pages: The maximum number of pages 1559 * @nr_pages: The maximum number of pages
1571 * @pages: Where the resulting pages are placed 1560 * @pages: Where the resulting pages are placed
1572 * 1561 *
1573 * find_get_pages() will search for and return a group of up to 1562 * find_get_pages_range() will search for and return a group of up to @nr_pages
1574 * @nr_pages pages in the mapping. The pages are placed at @pages. 1563 * pages in the mapping starting at index @start and up to index @end
1575 * find_get_pages() takes a reference against the returned pages. 1564 * (inclusive). The pages are placed at @pages. find_get_pages_range() takes
1565 * a reference against the returned pages.
1576 * 1566 *
1577 * The search returns a group of mapping-contiguous pages with ascending 1567 * The search returns a group of mapping-contiguous pages with ascending
1578 * indexes. There may be holes in the indices due to not-present pages. 1568 * indexes. There may be holes in the indices due to not-present pages.
1569 * We also update @start to index the next page for the traversal.
1579 * 1570 *
1580 * find_get_pages() returns the number of pages which were found. 1571 * find_get_pages_range() returns the number of pages which were found. If this
1572 * number is smaller than @nr_pages, the end of specified range has been
1573 * reached.
1581 */ 1574 */
1582unsigned find_get_pages(struct address_space *mapping, pgoff_t start, 1575unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
1583 unsigned int nr_pages, struct page **pages) 1576 pgoff_t end, unsigned int nr_pages,
1577 struct page **pages)
1584{ 1578{
1585 struct radix_tree_iter iter; 1579 struct radix_tree_iter iter;
1586 void **slot; 1580 void **slot;
@@ -1590,8 +1584,11 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
1590 return 0; 1584 return 0;
1591 1585
1592 rcu_read_lock(); 1586 rcu_read_lock();
1593 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { 1587 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, *start) {
1594 struct page *head, *page; 1588 struct page *head, *page;
1589
1590 if (iter.index > end)
1591 break;
1595repeat: 1592repeat:
1596 page = radix_tree_deref_slot(slot); 1593 page = radix_tree_deref_slot(slot);
1597 if (unlikely(!page)) 1594 if (unlikely(!page))
@@ -1627,11 +1624,25 @@ repeat:
1627 } 1624 }
1628 1625
1629 pages[ret] = page; 1626 pages[ret] = page;
1630 if (++ret == nr_pages) 1627 if (++ret == nr_pages) {
1631 break; 1628 *start = pages[ret - 1]->index + 1;
1629 goto out;
1630 }
1632 } 1631 }
1633 1632
1633 /*
1634 * We come here when there is no page beyond @end. We take care to not
1635 * overflow the index @start as it confuses some of the callers. This
 1636 * breaks the iteration when there is a page at index -1 but that is
1637 * already broken anyway.
1638 */
1639 if (end == (pgoff_t)-1)
1640 *start = (pgoff_t)-1;
1641 else
1642 *start = end + 1;
1643out:
1634 rcu_read_unlock(); 1644 rcu_read_unlock();
1645
1635 return ret; 1646 return ret;
1636} 1647}
1637 1648
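
Because find_get_pages_range() now advances @start past the last page it returned, batched walkers no longer need to recompute the next index from the returned pages. A hypothetical in-kernel caller sketch (walk_range() and the batch size are invented here; the real callers in this series go through the pagevec helpers):

/*
 * Sketch only: walk the page cache over [index, end] in batches;
 * the helper advances 'index' itself.
 */
static void walk_range(struct address_space *mapping, pgoff_t index, pgoff_t end)
{
	struct page *pages[16];
	unsigned int i, nr;

	while ((nr = find_get_pages_range(mapping, &index, end,
					  ARRAY_SIZE(pages), pages))) {
		for (i = 0; i < nr; i++) {
			/* ... inspect pages[i] ... */
			put_page(pages[i]);
		}
		/* a short batch means the end of the range was reached */
		if (nr < ARRAY_SIZE(pages))
			break;
	}
}
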
diff --git a/mm/gup.c b/mm/gup.c
index 23f01c40c88f..33d651deeae2 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1352,7 +1352,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
1352} 1352}
1353#endif /* __HAVE_ARCH_PTE_SPECIAL */ 1353#endif /* __HAVE_ARCH_PTE_SPECIAL */
1354 1354
1355#ifdef __HAVE_ARCH_PTE_DEVMAP 1355#if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
1356static int __gup_device_huge(unsigned long pfn, unsigned long addr, 1356static int __gup_device_huge(unsigned long pfn, unsigned long addr,
1357 unsigned long end, struct page **pages, int *nr) 1357 unsigned long end, struct page **pages, int *nr)
1358{ 1358{
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3644ff918434..0b51e70e0a8b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -328,7 +328,7 @@ static struct attribute *hugepage_attr[] = {
328 NULL, 328 NULL,
329}; 329};
330 330
331static struct attribute_group hugepage_attr_group = { 331static const struct attribute_group hugepage_attr_group = {
332 .attrs = hugepage_attr, 332 .attrs = hugepage_attr,
333}; 333};
334 334
@@ -567,7 +567,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
567 goto release; 567 goto release;
568 } 568 }
569 569
570 clear_huge_page(page, haddr, HPAGE_PMD_NR); 570 clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
571 /* 571 /*
572 * The memory barrier inside __SetPageUptodate makes sure that 572 * The memory barrier inside __SetPageUptodate makes sure that
573 * clear_huge_page writes become visible before the set_pmd_at() 573 * clear_huge_page writes become visible before the set_pmd_at()
@@ -1240,15 +1240,29 @@ int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
1240 * We can only reuse the page if nobody else maps the huge page or it's 1240 * We can only reuse the page if nobody else maps the huge page or it's
1241 * part. 1241 * part.
1242 */ 1242 */
1243 if (page_trans_huge_mapcount(page, NULL) == 1) { 1243 if (!trylock_page(page)) {
1244 get_page(page);
1245 spin_unlock(vmf->ptl);
1246 lock_page(page);
1247 spin_lock(vmf->ptl);
1248 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
1249 unlock_page(page);
1250 put_page(page);
1251 goto out_unlock;
1252 }
1253 put_page(page);
1254 }
1255 if (reuse_swap_page(page, NULL)) {
1244 pmd_t entry; 1256 pmd_t entry;
1245 entry = pmd_mkyoung(orig_pmd); 1257 entry = pmd_mkyoung(orig_pmd);
1246 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1258 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1247 if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) 1259 if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
1248 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); 1260 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1249 ret |= VM_FAULT_WRITE; 1261 ret |= VM_FAULT_WRITE;
1262 unlock_page(page);
1250 goto out_unlock; 1263 goto out_unlock;
1251 } 1264 }
1265 unlock_page(page);
1252 get_page(page); 1266 get_page(page);
1253 spin_unlock(vmf->ptl); 1267 spin_unlock(vmf->ptl);
1254alloc: 1268alloc:
@@ -1291,7 +1305,7 @@ alloc:
1291 count_vm_event(THP_FAULT_ALLOC); 1305 count_vm_event(THP_FAULT_ALLOC);
1292 1306
1293 if (!page) 1307 if (!page)
1294 clear_huge_page(new_page, haddr, HPAGE_PMD_NR); 1308 clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
1295 else 1309 else
1296 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); 1310 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
1297 __SetPageUptodate(new_page); 1311 __SetPageUptodate(new_page);
@@ -2467,6 +2481,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
2467 VM_BUG_ON_PAGE(!PageLocked(page), page); 2481 VM_BUG_ON_PAGE(!PageLocked(page), page);
2468 VM_BUG_ON_PAGE(!PageCompound(page), page); 2482 VM_BUG_ON_PAGE(!PageCompound(page), page);
2469 2483
2484 if (PageWriteback(page))
2485 return -EBUSY;
2486
2470 if (PageAnon(head)) { 2487 if (PageAnon(head)) {
2471 /* 2488 /*
2472 * The caller does not necessarily hold an mmap_sem that would 2489 * The caller does not necessarily hold an mmap_sem that would
@@ -2544,7 +2561,12 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
2544 __dec_node_page_state(page, NR_SHMEM_THPS); 2561 __dec_node_page_state(page, NR_SHMEM_THPS);
2545 spin_unlock(&pgdata->split_queue_lock); 2562 spin_unlock(&pgdata->split_queue_lock);
2546 __split_huge_page(page, list, flags); 2563 __split_huge_page(page, list, flags);
2547 ret = 0; 2564 if (PageSwapCache(head)) {
2565 swp_entry_t entry = { .val = page_private(head) };
2566
2567 ret = split_swap_cluster(entry);
2568 } else
2569 ret = 0;
2548 } else { 2570 } else {
2549 if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { 2571 if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
2550 pr_alert("total_mapcount: %u, page_count(): %u\n", 2572 pr_alert("total_mapcount: %u, page_count(): %u\n",
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 31e207cb399b..34625b257128 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1066,11 +1066,11 @@ static void free_gigantic_page(struct page *page, unsigned int order)
1066} 1066}
1067 1067
1068static int __alloc_gigantic_page(unsigned long start_pfn, 1068static int __alloc_gigantic_page(unsigned long start_pfn,
1069 unsigned long nr_pages) 1069 unsigned long nr_pages, gfp_t gfp_mask)
1070{ 1070{
1071 unsigned long end_pfn = start_pfn + nr_pages; 1071 unsigned long end_pfn = start_pfn + nr_pages;
1072 return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, 1072 return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
1073 GFP_KERNEL); 1073 gfp_mask);
1074} 1074}
1075 1075
1076static bool pfn_range_valid_gigantic(struct zone *z, 1076static bool pfn_range_valid_gigantic(struct zone *z,
@@ -1108,19 +1108,24 @@ static bool zone_spans_last_pfn(const struct zone *zone,
1108 return zone_spans_pfn(zone, last_pfn); 1108 return zone_spans_pfn(zone, last_pfn);
1109} 1109}
1110 1110
1111static struct page *alloc_gigantic_page(int nid, unsigned int order) 1111static struct page *alloc_gigantic_page(int nid, struct hstate *h)
1112{ 1112{
1113 unsigned int order = huge_page_order(h);
1113 unsigned long nr_pages = 1 << order; 1114 unsigned long nr_pages = 1 << order;
1114 unsigned long ret, pfn, flags; 1115 unsigned long ret, pfn, flags;
1115 struct zone *z; 1116 struct zonelist *zonelist;
1117 struct zone *zone;
1118 struct zoneref *z;
1119 gfp_t gfp_mask;
1116 1120
1117 z = NODE_DATA(nid)->node_zones; 1121 gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
1118 for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) { 1122 zonelist = node_zonelist(nid, gfp_mask);
1119 spin_lock_irqsave(&z->lock, flags); 1123 for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), NULL) {
1124 spin_lock_irqsave(&zone->lock, flags);
1120 1125
1121 pfn = ALIGN(z->zone_start_pfn, nr_pages); 1126 pfn = ALIGN(zone->zone_start_pfn, nr_pages);
1122 while (zone_spans_last_pfn(z, pfn, nr_pages)) { 1127 while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
1123 if (pfn_range_valid_gigantic(z, pfn, nr_pages)) { 1128 if (pfn_range_valid_gigantic(zone, pfn, nr_pages)) {
1124 /* 1129 /*
1125 * We release the zone lock here because 1130 * We release the zone lock here because
1126 * alloc_contig_range() will also lock the zone 1131 * alloc_contig_range() will also lock the zone
@@ -1128,16 +1133,16 @@ static struct page *alloc_gigantic_page(int nid, unsigned int order)
1128 * spinning on this lock, it may win the race 1133 * spinning on this lock, it may win the race
1129 * and cause alloc_contig_range() to fail... 1134 * and cause alloc_contig_range() to fail...
1130 */ 1135 */
1131 spin_unlock_irqrestore(&z->lock, flags); 1136 spin_unlock_irqrestore(&zone->lock, flags);
1132 ret = __alloc_gigantic_page(pfn, nr_pages); 1137 ret = __alloc_gigantic_page(pfn, nr_pages, gfp_mask);
1133 if (!ret) 1138 if (!ret)
1134 return pfn_to_page(pfn); 1139 return pfn_to_page(pfn);
1135 spin_lock_irqsave(&z->lock, flags); 1140 spin_lock_irqsave(&zone->lock, flags);
1136 } 1141 }
1137 pfn += nr_pages; 1142 pfn += nr_pages;
1138 } 1143 }
1139 1144
1140 spin_unlock_irqrestore(&z->lock, flags); 1145 spin_unlock_irqrestore(&zone->lock, flags);
1141 } 1146 }
1142 1147
1143 return NULL; 1148 return NULL;
@@ -1150,7 +1155,7 @@ static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
1150{ 1155{
1151 struct page *page; 1156 struct page *page;
1152 1157
1153 page = alloc_gigantic_page(nid, huge_page_order(h)); 1158 page = alloc_gigantic_page(nid, h);
1154 if (page) { 1159 if (page) {
1155 prep_compound_gigantic_page(page, huge_page_order(h)); 1160 prep_compound_gigantic_page(page, huge_page_order(h));
1156 prep_new_huge_page(h, page, nid); 1161 prep_new_huge_page(h, page, nid);
@@ -2569,13 +2574,13 @@ static struct attribute *hstate_attrs[] = {
2569 NULL, 2574 NULL,
2570}; 2575};
2571 2576
2572static struct attribute_group hstate_attr_group = { 2577static const struct attribute_group hstate_attr_group = {
2573 .attrs = hstate_attrs, 2578 .attrs = hstate_attrs,
2574}; 2579};
2575 2580
2576static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, 2581static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
2577 struct kobject **hstate_kobjs, 2582 struct kobject **hstate_kobjs,
2578 struct attribute_group *hstate_attr_group) 2583 const struct attribute_group *hstate_attr_group)
2579{ 2584{
2580 int retval; 2585 int retval;
2581 int hi = hstate_index(h); 2586 int hi = hstate_index(h);
@@ -2633,7 +2638,7 @@ static struct attribute *per_node_hstate_attrs[] = {
2633 NULL, 2638 NULL,
2634}; 2639};
2635 2640
2636static struct attribute_group per_node_hstate_attr_group = { 2641static const struct attribute_group per_node_hstate_attr_group = {
2637 .attrs = per_node_hstate_attrs, 2642 .attrs = per_node_hstate_attrs,
2638}; 2643};
2639 2644
@@ -4600,6 +4605,15 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
4600 return pte; 4605 return pte;
4601} 4606}
4602 4607
4608/*
4609 * huge_pte_offset() - Walk the page table to resolve the hugepage
4610 * entry at address @addr
4611 *
4612 * Return: Pointer to page table or swap entry (PUD or PMD) for
4613 * address @addr, or NULL if a p*d_none() entry is encountered and the
4614 * size @sz doesn't match the hugepage size at this level of the page
4615 * table.
4616 */
4603pte_t *huge_pte_offset(struct mm_struct *mm, 4617pte_t *huge_pte_offset(struct mm_struct *mm,
4604 unsigned long addr, unsigned long sz) 4618 unsigned long addr, unsigned long sz)
4605{ 4619{
@@ -4614,13 +4628,22 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
4614 p4d = p4d_offset(pgd, addr); 4628 p4d = p4d_offset(pgd, addr);
4615 if (!p4d_present(*p4d)) 4629 if (!p4d_present(*p4d))
4616 return NULL; 4630 return NULL;
4631
4617 pud = pud_offset(p4d, addr); 4632 pud = pud_offset(p4d, addr);
4618 if (!pud_present(*pud)) 4633 if (sz != PUD_SIZE && pud_none(*pud))
4619 return NULL; 4634 return NULL;
4620 if (pud_huge(*pud)) 4635 /* hugepage or swap? */
4636 if (pud_huge(*pud) || !pud_present(*pud))
4621 return (pte_t *)pud; 4637 return (pte_t *)pud;
4638
4622 pmd = pmd_offset(pud, addr); 4639 pmd = pmd_offset(pud, addr);
4623 return (pte_t *) pmd; 4640 if (sz != PMD_SIZE && pmd_none(*pmd))
4641 return NULL;
4642 /* hugepage or swap? */
4643 if (pmd_huge(*pmd) || !pmd_present(*pmd))
4644 return (pte_t *)pmd;
4645
4646 return NULL;
4624} 4647}
4625 4648
4626#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ 4649#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
diff --git a/mm/internal.h b/mm/internal.h
index 4ef49fc55e58..1df011f62480 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -480,6 +480,17 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
480/* Mask to get the watermark bits */ 480/* Mask to get the watermark bits */
481#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) 481#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
482 482
483/*
484 * Only MMU archs have async oom victim reclaim - aka oom_reaper so we
485 * cannot assume a reduced access to memory reserves is sufficient for
486 * !MMU
487 */
488#ifdef CONFIG_MMU
489#define ALLOC_OOM 0x08
490#else
491#define ALLOC_OOM ALLOC_NO_WATERMARKS
492#endif
493
483#define ALLOC_HARDER 0x10 /* try to alloc harder */ 494#define ALLOC_HARDER 0x10 /* try to alloc harder */
484#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ 495#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
485#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ 496#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
@@ -525,4 +536,5 @@ static inline bool is_migrate_highatomic_page(struct page *page)
525 return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC; 536 return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC;
526} 537}
527 538
539void setup_zone_pageset(struct zone *zone);
528#endif /* __MM_INTERNAL_H */ 540#endif /* __MM_INTERNAL_H */
diff --git a/mm/ksm.c b/mm/ksm.c
index db20f8436bc3..15dd7415f7b3 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -3043,7 +3043,7 @@ static struct attribute *ksm_attrs[] = {
3043 NULL, 3043 NULL,
3044}; 3044};
3045 3045
3046static struct attribute_group ksm_attr_group = { 3046static const struct attribute_group ksm_attr_group = {
3047 .attrs = ksm_attrs, 3047 .attrs = ksm_attrs,
3048 .name = "ksm", 3048 .name = "ksm",
3049}; 3049};
diff --git a/mm/madvise.c b/mm/madvise.c
index 4d7d1e5ddba9..eea1c733286f 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -80,6 +80,17 @@ static long madvise_behavior(struct vm_area_struct *vma,
80 } 80 }
81 new_flags &= ~VM_DONTCOPY; 81 new_flags &= ~VM_DONTCOPY;
82 break; 82 break;
83 case MADV_WIPEONFORK:
84 /* MADV_WIPEONFORK is only supported on anonymous memory. */
85 if (vma->vm_file || vma->vm_flags & VM_SHARED) {
86 error = -EINVAL;
87 goto out;
88 }
89 new_flags |= VM_WIPEONFORK;
90 break;
91 case MADV_KEEPONFORK:
92 new_flags &= ~VM_WIPEONFORK;
93 break;
83 case MADV_DONTDUMP: 94 case MADV_DONTDUMP:
84 new_flags |= VM_DONTDUMP; 95 new_flags |= VM_DONTDUMP;
85 break; 96 break;
@@ -696,6 +707,8 @@ madvise_behavior_valid(int behavior)
696#endif 707#endif
697 case MADV_DONTDUMP: 708 case MADV_DONTDUMP:
698 case MADV_DODUMP: 709 case MADV_DODUMP:
710 case MADV_WIPEONFORK:
711 case MADV_KEEPONFORK:
699#ifdef CONFIG_MEMORY_FAILURE 712#ifdef CONFIG_MEMORY_FAILURE
700 case MADV_SOFT_OFFLINE: 713 case MADV_SOFT_OFFLINE:
701 case MADV_HWPOISON: 714 case MADV_HWPOISON:
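
A minimal user-space demonstration of the new advice: after fork(), the child sees zero-filled memory in a MADV_WIPEONFORK'd private anonymous mapping while the parent keeps its contents. The fallback value 18 mirrors the generic uapi header added in this series; prefer the system header when it already defines the constant.

/* Hypothetical demo; libc headers may not yet know the flag. */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef MADV_WIPEONFORK
#define MADV_WIPEONFORK 18	/* value from the generic uapi mman header */
#endif

int main(void)
{
	size_t len = 4096;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	strcpy(p, "secret");
	if (madvise(p, len, MADV_WIPEONFORK))
		perror("madvise");	/* older kernels reject the advice */

	if (fork() == 0) {
		printf("child : '%s'\n", p);	/* prints '' - wiped on fork */
		_exit(0);
	}
	wait(NULL);
	printf("parent: '%s'\n", p);		/* still prints 'secret' */
	return 0;
}
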
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e09741af816f..ad15850ee157 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -550,10 +550,12 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
550 * value, and reading all cpu value can be performance bottleneck in some 550 * value, and reading all cpu value can be performance bottleneck in some
551 * common workload, threshold and synchronization as vmstat[] should be 551 * common workload, threshold and synchronization as vmstat[] should be
552 * implemented. 552 * implemented.
553 *
554 * The parameter idx can be of type enum memcg_event_item or vm_event_item.
553 */ 555 */
554 556
555static unsigned long memcg_sum_events(struct mem_cgroup *memcg, 557static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
556 enum memcg_event_item event) 558 int event)
557{ 559{
558 unsigned long val = 0; 560 unsigned long val = 0;
559 int cpu; 561 int cpu;
@@ -1915,7 +1917,7 @@ retry:
1915 * bypass the last charges so that they can exit quickly and 1917 * bypass the last charges so that they can exit quickly and
1916 * free their memory. 1918 * free their memory.
1917 */ 1919 */
1918 if (unlikely(test_thread_flag(TIF_MEMDIE) || 1920 if (unlikely(tsk_is_oom_victim(current) ||
1919 fatal_signal_pending(current) || 1921 fatal_signal_pending(current) ||
1920 current->flags & PF_EXITING)) 1922 current->flags & PF_EXITING))
1921 goto force; 1923 goto force;
@@ -4319,6 +4321,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
4319 } 4321 }
4320 spin_unlock(&memcg->event_list_lock); 4322 spin_unlock(&memcg->event_list_lock);
4321 4323
4324 memcg->low = 0;
4325
4322 memcg_offline_kmem(memcg); 4326 memcg_offline_kmem(memcg);
4323 wb_memcg_offline(memcg); 4327 wb_memcg_offline(memcg);
4324 4328
@@ -4635,8 +4639,11 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
4635 if (!ret || !target) 4639 if (!ret || !target)
4636 put_page(page); 4640 put_page(page);
4637 } 4641 }
4638 /* There is a swap entry and a page doesn't exist or isn't charged */ 4642 /*
4639 if (ent.val && !ret && 4643 * There is a swap entry and a page doesn't exist or isn't charged.
4644 * But we cannot move a tail-page in a THP.
4645 */
4646 if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
4640 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { 4647 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
4641 ret = MC_TARGET_SWAP; 4648 ret = MC_TARGET_SWAP;
4642 if (target) 4649 if (target)
@@ -4647,8 +4654,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
4647 4654
4648#ifdef CONFIG_TRANSPARENT_HUGEPAGE 4655#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4649/* 4656/*
4650 * We don't consider swapping or file mapped pages because THP does not 4657 * We don't consider PMD mapped swapping or file mapped pages because THP does
4651 * support them for now. 4658 * not support them for now.
4652 * Caller should make sure that pmd_trans_huge(pmd) is true. 4659 * Caller should make sure that pmd_trans_huge(pmd) is true.
4653 */ 4660 */
4654static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 4661static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
@@ -5423,7 +5430,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
5423 * in turn serializes uncharging. 5430 * in turn serializes uncharging.
5424 */ 5431 */
5425 VM_BUG_ON_PAGE(!PageLocked(page), page); 5432 VM_BUG_ON_PAGE(!PageLocked(page), page);
5426 if (page->mem_cgroup) 5433 if (compound_head(page)->mem_cgroup)
5427 goto out; 5434 goto out;
5428 5435
5429 if (do_swap_account) { 5436 if (do_swap_account) {
@@ -5906,6 +5913,7 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
5906void mem_cgroup_swapout(struct page *page, swp_entry_t entry) 5913void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
5907{ 5914{
5908 struct mem_cgroup *memcg, *swap_memcg; 5915 struct mem_cgroup *memcg, *swap_memcg;
5916 unsigned int nr_entries;
5909 unsigned short oldid; 5917 unsigned short oldid;
5910 5918
5911 VM_BUG_ON_PAGE(PageLRU(page), page); 5919 VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -5926,19 +5934,24 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
5926 * ancestor for the swap instead and transfer the memory+swap charge. 5934 * ancestor for the swap instead and transfer the memory+swap charge.
5927 */ 5935 */
5928 swap_memcg = mem_cgroup_id_get_online(memcg); 5936 swap_memcg = mem_cgroup_id_get_online(memcg);
5929 oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 1); 5937 nr_entries = hpage_nr_pages(page);
5938 /* Get references for the tail pages, too */
5939 if (nr_entries > 1)
5940 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
5941 oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
5942 nr_entries);
5930 VM_BUG_ON_PAGE(oldid, page); 5943 VM_BUG_ON_PAGE(oldid, page);
5931 mem_cgroup_swap_statistics(swap_memcg, 1); 5944 mem_cgroup_swap_statistics(swap_memcg, nr_entries);
5932 5945
5933 page->mem_cgroup = NULL; 5946 page->mem_cgroup = NULL;
5934 5947
5935 if (!mem_cgroup_is_root(memcg)) 5948 if (!mem_cgroup_is_root(memcg))
5936 page_counter_uncharge(&memcg->memory, 1); 5949 page_counter_uncharge(&memcg->memory, nr_entries);
5937 5950
5938 if (memcg != swap_memcg) { 5951 if (memcg != swap_memcg) {
5939 if (!mem_cgroup_is_root(swap_memcg)) 5952 if (!mem_cgroup_is_root(swap_memcg))
5940 page_counter_charge(&swap_memcg->memsw, 1); 5953 page_counter_charge(&swap_memcg->memsw, nr_entries);
5941 page_counter_uncharge(&memcg->memsw, 1); 5954 page_counter_uncharge(&memcg->memsw, nr_entries);
5942 } 5955 }
5943 5956
5944 /* 5957 /*
@@ -5948,7 +5961,8 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
 5948 * only synchronisation we have for updating the per-CPU variables. 5961
5949 */ 5962 */
5950 VM_BUG_ON(!irqs_disabled()); 5963 VM_BUG_ON(!irqs_disabled());
5951 mem_cgroup_charge_statistics(memcg, page, false, -1); 5964 mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
5965 -nr_entries);
5952 memcg_check_events(memcg, page); 5966 memcg_check_events(memcg, page);
5953 5967
5954 if (!mem_cgroup_is_root(memcg)) 5968 if (!mem_cgroup_is_root(memcg))
diff --git a/mm/memory.c b/mm/memory.c
index 56e48e4593cb..13ee83b43878 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1513,8 +1513,20 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
1513 tlb_gather_mmu(&tlb, mm, start, end); 1513 tlb_gather_mmu(&tlb, mm, start, end);
1514 update_hiwater_rss(mm); 1514 update_hiwater_rss(mm);
1515 mmu_notifier_invalidate_range_start(mm, start, end); 1515 mmu_notifier_invalidate_range_start(mm, start, end);
1516 for ( ; vma && vma->vm_start < end; vma = vma->vm_next) 1516 for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
1517 unmap_single_vma(&tlb, vma, start, end, NULL); 1517 unmap_single_vma(&tlb, vma, start, end, NULL);
1518
1519 /*
1520 * zap_page_range does not specify whether mmap_sem should be
1521 * held for read or write. That allows parallel zap_page_range
1522 * operations to unmap a PTE and defer a flush meaning that
1523 * this call observes pte_none and fails to flush the TLB.
1524 * Rather than adding a complex API, ensure that no stale
1525 * TLB entries exist when this call returns.
1526 */
1527 flush_tlb_range(vma, start, end);
1528 }
1529
1518 mmu_notifier_invalidate_range_end(mm, start, end); 1530 mmu_notifier_invalidate_range_end(mm, start, end);
1519 tlb_finish_mmu(&tlb, start, end); 1531 tlb_finish_mmu(&tlb, start, end);
1520} 1532}
@@ -1676,7 +1688,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
1676EXPORT_SYMBOL(vm_insert_page); 1688EXPORT_SYMBOL(vm_insert_page);
1677 1689
1678static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, 1690static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1679 pfn_t pfn, pgprot_t prot) 1691 pfn_t pfn, pgprot_t prot, bool mkwrite)
1680{ 1692{
1681 struct mm_struct *mm = vma->vm_mm; 1693 struct mm_struct *mm = vma->vm_mm;
1682 int retval; 1694 int retval;
@@ -1688,14 +1700,35 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1688 if (!pte) 1700 if (!pte)
1689 goto out; 1701 goto out;
1690 retval = -EBUSY; 1702 retval = -EBUSY;
1691 if (!pte_none(*pte)) 1703 if (!pte_none(*pte)) {
1692 goto out_unlock; 1704 if (mkwrite) {
1705 /*
1706 * For read faults on private mappings the PFN passed
1707 * in may not match the PFN we have mapped if the
1708 * mapped PFN is a writeable COW page. In the mkwrite
1709 * case we are creating a writable PTE for a shared
1710 * mapping and we expect the PFNs to match.
1711 */
1712 if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn)))
1713 goto out_unlock;
1714 entry = *pte;
1715 goto out_mkwrite;
1716 } else
1717 goto out_unlock;
1718 }
1693 1719
1694 /* Ok, finally just insert the thing.. */ 1720 /* Ok, finally just insert the thing.. */
1695 if (pfn_t_devmap(pfn)) 1721 if (pfn_t_devmap(pfn))
1696 entry = pte_mkdevmap(pfn_t_pte(pfn, prot)); 1722 entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
1697 else 1723 else
1698 entry = pte_mkspecial(pfn_t_pte(pfn, prot)); 1724 entry = pte_mkspecial(pfn_t_pte(pfn, prot));
1725
1726out_mkwrite:
1727 if (mkwrite) {
1728 entry = pte_mkyoung(entry);
1729 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1730 }
1731
1699 set_pte_at(mm, addr, pte, entry); 1732 set_pte_at(mm, addr, pte, entry);
1700 update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ 1733 update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
1701 1734
@@ -1766,14 +1799,15 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
1766 1799
1767 track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); 1800 track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
1768 1801
1769 ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot); 1802 ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
1803 false);
1770 1804
1771 return ret; 1805 return ret;
1772} 1806}
1773EXPORT_SYMBOL(vm_insert_pfn_prot); 1807EXPORT_SYMBOL(vm_insert_pfn_prot);
1774 1808
1775int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, 1809static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1776 pfn_t pfn) 1810 pfn_t pfn, bool mkwrite)
1777{ 1811{
1778 pgprot_t pgprot = vma->vm_page_prot; 1812 pgprot_t pgprot = vma->vm_page_prot;
1779 1813
@@ -1802,10 +1836,24 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1802 page = pfn_to_page(pfn_t_to_pfn(pfn)); 1836 page = pfn_to_page(pfn_t_to_pfn(pfn));
1803 return insert_page(vma, addr, page, pgprot); 1837 return insert_page(vma, addr, page, pgprot);
1804 } 1838 }
1805 return insert_pfn(vma, addr, pfn, pgprot); 1839 return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
1840}
1841
1842int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1843 pfn_t pfn)
1844{
1845 return __vm_insert_mixed(vma, addr, pfn, false);
1846
1806} 1847}
1807EXPORT_SYMBOL(vm_insert_mixed); 1848EXPORT_SYMBOL(vm_insert_mixed);
1808 1849
1850int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr,
1851 pfn_t pfn)
1852{
1853 return __vm_insert_mixed(vma, addr, pfn, true);
1854}
1855EXPORT_SYMBOL(vm_insert_mixed_mkwrite);
1856
1809/* 1857/*
1810 * maps a range of physical memory into the requested pages. the old 1858 * maps a range of physical memory into the requested pages. the old
1811 * mappings are removed. any references to nonexistent pages results 1859 * mappings are removed. any references to nonexistent pages results
@@ -2571,7 +2619,7 @@ static int do_wp_page(struct vm_fault *vmf)
2571 * not dirty accountable. 2619 * not dirty accountable.
2572 */ 2620 */
2573 if (PageAnon(vmf->page) && !PageKsm(vmf->page)) { 2621 if (PageAnon(vmf->page) && !PageKsm(vmf->page)) {
2574 int total_mapcount; 2622 int total_map_swapcount;
2575 if (!trylock_page(vmf->page)) { 2623 if (!trylock_page(vmf->page)) {
2576 get_page(vmf->page); 2624 get_page(vmf->page);
2577 pte_unmap_unlock(vmf->pte, vmf->ptl); 2625 pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2586,8 +2634,8 @@ static int do_wp_page(struct vm_fault *vmf)
2586 } 2634 }
2587 put_page(vmf->page); 2635 put_page(vmf->page);
2588 } 2636 }
2589 if (reuse_swap_page(vmf->page, &total_mapcount)) { 2637 if (reuse_swap_page(vmf->page, &total_map_swapcount)) {
2590 if (total_mapcount == 1) { 2638 if (total_map_swapcount == 1) {
2591 /* 2639 /*
2592 * The page is all ours. Move it to 2640 * The page is all ours. Move it to
2593 * our anon_vma so the rmap code will 2641 * our anon_vma so the rmap code will
@@ -2704,16 +2752,23 @@ EXPORT_SYMBOL(unmap_mapping_range);
2704int do_swap_page(struct vm_fault *vmf) 2752int do_swap_page(struct vm_fault *vmf)
2705{ 2753{
2706 struct vm_area_struct *vma = vmf->vma; 2754 struct vm_area_struct *vma = vmf->vma;
2707 struct page *page, *swapcache; 2755 struct page *page = NULL, *swapcache;
2708 struct mem_cgroup *memcg; 2756 struct mem_cgroup *memcg;
2757 struct vma_swap_readahead swap_ra;
2709 swp_entry_t entry; 2758 swp_entry_t entry;
2710 pte_t pte; 2759 pte_t pte;
2711 int locked; 2760 int locked;
2712 int exclusive = 0; 2761 int exclusive = 0;
2713 int ret = 0; 2762 int ret = 0;
2763 bool vma_readahead = swap_use_vma_readahead();
2714 2764
2715 if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) 2765 if (vma_readahead)
2766 page = swap_readahead_detect(vmf, &swap_ra);
2767 if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) {
2768 if (page)
2769 put_page(page);
2716 goto out; 2770 goto out;
2771 }
2717 2772
2718 entry = pte_to_swp_entry(vmf->orig_pte); 2773 entry = pte_to_swp_entry(vmf->orig_pte);
2719 if (unlikely(non_swap_entry(entry))) { 2774 if (unlikely(non_swap_entry(entry))) {
@@ -2729,10 +2784,16 @@ int do_swap_page(struct vm_fault *vmf)
2729 goto out; 2784 goto out;
2730 } 2785 }
2731 delayacct_set_flag(DELAYACCT_PF_SWAPIN); 2786 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2732 page = lookup_swap_cache(entry); 2787 if (!page)
2788 page = lookup_swap_cache(entry, vma_readahead ? vma : NULL,
2789 vmf->address);
2733 if (!page) { 2790 if (!page) {
2734 page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma, 2791 if (vma_readahead)
2735 vmf->address); 2792 page = do_swap_page_readahead(entry,
2793 GFP_HIGHUSER_MOVABLE, vmf, &swap_ra);
2794 else
2795 page = swapin_readahead(entry,
2796 GFP_HIGHUSER_MOVABLE, vma, vmf->address);
2736 if (!page) { 2797 if (!page) {
2737 /* 2798 /*
2738 * Back out if somebody else faulted in this pte 2799 * Back out if somebody else faulted in this pte
@@ -4356,19 +4417,53 @@ static void clear_gigantic_page(struct page *page,
4356 } 4417 }
4357} 4418}
4358void clear_huge_page(struct page *page, 4419void clear_huge_page(struct page *page,
4359 unsigned long addr, unsigned int pages_per_huge_page) 4420 unsigned long addr_hint, unsigned int pages_per_huge_page)
4360{ 4421{
4361 int i; 4422 int i, n, base, l;
4423 unsigned long addr = addr_hint &
4424 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
4362 4425
4363 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { 4426 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
4364 clear_gigantic_page(page, addr, pages_per_huge_page); 4427 clear_gigantic_page(page, addr, pages_per_huge_page);
4365 return; 4428 return;
4366 } 4429 }
4367 4430
 4431 /* Clear the target sub-page last to keep its cache lines hot */
4368 might_sleep(); 4432 might_sleep();
4369 for (i = 0; i < pages_per_huge_page; i++) { 4433 n = (addr_hint - addr) / PAGE_SIZE;
4434 if (2 * n <= pages_per_huge_page) {
4435 /* If sub-page to access in first half of huge page */
4436 base = 0;
4437 l = n;
4438 /* Clear sub-pages at the end of huge page */
4439 for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
4440 cond_resched();
4441 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
4442 }
4443 } else {
4444 /* If sub-page to access in second half of huge page */
4445 base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
4446 l = pages_per_huge_page - n;
 4447 /* Clear sub-pages at the beginning of the huge page */
4448 for (i = 0; i < base; i++) {
4449 cond_resched();
4450 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
4451 }
4452 }
4453 /*
4454 * Clear remaining sub-pages in left-right-left-right pattern
4455 * towards the sub-page to access
4456 */
4457 for (i = 0; i < l; i++) {
4458 int left_idx = base + i;
4459 int right_idx = base + 2 * l - 1 - i;
4460
4461 cond_resched();
4462 clear_user_highpage(page + left_idx,
4463 addr + left_idx * PAGE_SIZE);
4370 cond_resched(); 4464 cond_resched();
4371 clear_user_highpage(page + i, addr + i * PAGE_SIZE); 4465 clear_user_highpage(page + right_idx,
4466 addr + right_idx * PAGE_SIZE);
4372 } 4467 }
4373} 4468}
4374 4469
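
The rewritten clear_huge_page() keeps the faulting sub-page's cache lines hot by clearing it last and converging on it from both sides. A small stand-alone sketch that only prints the clearing order (clear_order() is invented for illustration; the kernel clears real pages with clear_user_highpage()):

#include <stdio.h>

/* Print the order in which sub-pages 0..pages-1 are cleared for target n. */
static void clear_order(int pages_per_huge_page, int n)
{
	int i, base, l;

	if (2 * n <= pages_per_huge_page) {
		/* target in the first half: clear the tail of the huge page first */
		base = 0;
		l = n;
		for (i = pages_per_huge_page - 1; i >= 2 * n; i--)
			printf("%d ", i);
	} else {
		/* target in the second half: clear the head of the huge page first */
		base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
		l = pages_per_huge_page - n;
		for (i = 0; i < base; i++)
			printf("%d ", i);
	}
	/* then alternate left/right, ending on the target sub-page n */
	for (i = 0; i < l; i++)
		printf("%d %d ", base + i, base + 2 * l - 1 - i);
	printf("\n");
}

int main(void)
{
	clear_order(8, 2);	/* 7 6 5 4 0 3 1 2 - sub-page 2 cleared last */
	clear_order(8, 6);	/* 0 1 2 3 4 7 5 6 - sub-page 6 cleared last */
	return 0;
}
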
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 8dccc317aac2..73bf17df6899 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -773,31 +773,6 @@ static void node_states_set_node(int node, struct memory_notify *arg)
773 node_set_state(node, N_MEMORY); 773 node_set_state(node, N_MEMORY);
774} 774}
775 775
776bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type)
777{
778 struct pglist_data *pgdat = NODE_DATA(nid);
779 struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE];
780 struct zone *default_zone = default_zone_for_pfn(nid, pfn, nr_pages);
781
782 /*
783 * TODO there shouldn't be any inherent reason to have ZONE_NORMAL
784 * physically before ZONE_MOVABLE. All we need is they do not
785 * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE
786 * though so let's stick with it for simplicity for now.
787 * TODO make sure we do not overlap with ZONE_DEVICE
788 */
789 if (online_type == MMOP_ONLINE_KERNEL) {
790 if (zone_is_empty(movable_zone))
791 return true;
792 return movable_zone->zone_start_pfn >= pfn + nr_pages;
793 } else if (online_type == MMOP_ONLINE_MOVABLE) {
794 return zone_end_pfn(default_zone) <= pfn;
795 }
796
797 /* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */
798 return online_type == MMOP_ONLINE_KEEP;
799}
800
801static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, 776static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
802 unsigned long nr_pages) 777 unsigned long nr_pages)
803{ 778{
@@ -856,7 +831,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone,
856 * If no kernel zone covers this pfn range it will automatically go 831 * If no kernel zone covers this pfn range it will automatically go
857 * to the ZONE_NORMAL. 832 * to the ZONE_NORMAL.
858 */ 833 */
859struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, 834static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn,
860 unsigned long nr_pages) 835 unsigned long nr_pages)
861{ 836{
862 struct pglist_data *pgdat = NODE_DATA(nid); 837 struct pglist_data *pgdat = NODE_DATA(nid);
@@ -872,17 +847,40 @@ struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
872 return &pgdat->node_zones[ZONE_NORMAL]; 847 return &pgdat->node_zones[ZONE_NORMAL];
873} 848}
874 849
875static inline bool movable_pfn_range(int nid, struct zone *default_zone, 850static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
876 unsigned long start_pfn, unsigned long nr_pages) 851 unsigned long nr_pages)
877{ 852{
878 if (!allow_online_pfn_range(nid, start_pfn, nr_pages, 853 struct zone *kernel_zone = default_kernel_zone_for_pfn(nid, start_pfn,
879 MMOP_ONLINE_KERNEL)) 854 nr_pages);
880 return true; 855 struct zone *movable_zone = &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
856 bool in_kernel = zone_intersects(kernel_zone, start_pfn, nr_pages);
857 bool in_movable = zone_intersects(movable_zone, start_pfn, nr_pages);
881 858
882 if (!movable_node_is_enabled()) 859 /*
883 return false; 860 * We inherit the existing zone in a simple case where zones do not
861 * overlap in the given range
862 */
863 if (in_kernel ^ in_movable)
864 return (in_kernel) ? kernel_zone : movable_zone;
884 865
885 return !zone_intersects(default_zone, start_pfn, nr_pages); 866 /*
867 * If the range doesn't belong to any zone or two zones overlap in the
868 * given range then we use movable zone only if movable_node is
869 * enabled because we always online to a kernel zone by default.
870 */
871 return movable_node_enabled ? movable_zone : kernel_zone;
872}
873
874struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
875 unsigned long nr_pages)
876{
877 if (online_type == MMOP_ONLINE_KERNEL)
878 return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages);
879
880 if (online_type == MMOP_ONLINE_MOVABLE)
881 return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
882
883 return default_zone_for_pfn(nid, start_pfn, nr_pages);
886} 884}
887 885
888/* 886/*
@@ -892,28 +890,14 @@ static inline bool movable_pfn_range(int nid, struct zone *default_zone,
892static struct zone * __meminit move_pfn_range(int online_type, int nid, 890static struct zone * __meminit move_pfn_range(int online_type, int nid,
893 unsigned long start_pfn, unsigned long nr_pages) 891 unsigned long start_pfn, unsigned long nr_pages)
894{ 892{
895 struct pglist_data *pgdat = NODE_DATA(nid); 893 struct zone *zone;
896 struct zone *zone = default_zone_for_pfn(nid, start_pfn, nr_pages);
897
898 if (online_type == MMOP_ONLINE_KEEP) {
899 struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE];
900 /*
901 * MMOP_ONLINE_KEEP defaults to MMOP_ONLINE_KERNEL but use
902 * movable zone if that is not possible (e.g. we are within
903 * or past the existing movable zone). movable_node overrides
904 * this default and defaults to movable zone
905 */
906 if (movable_pfn_range(nid, zone, start_pfn, nr_pages))
907 zone = movable_zone;
908 } else if (online_type == MMOP_ONLINE_MOVABLE) {
909 zone = &pgdat->node_zones[ZONE_MOVABLE];
910 }
911 894
895 zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
912 move_pfn_range_to_zone(zone, start_pfn, nr_pages); 896 move_pfn_range_to_zone(zone, start_pfn, nr_pages);
913 return zone; 897 return zone;
914} 898}
915 899
916/* Must be protected by mem_hotplug_begin() */ 900/* Must be protected by mem_hotplug_begin() or a device_lock */
917int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) 901int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
918{ 902{
919 unsigned long flags; 903 unsigned long flags;
@@ -925,9 +909,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
925 struct memory_notify arg; 909 struct memory_notify arg;
926 910
927 nid = pfn_to_nid(pfn); 911 nid = pfn_to_nid(pfn);
928 if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type))
929 return -EINVAL;
930
931 /* associate pfn range with the zone */ 912 /* associate pfn range with the zone */
932 zone = move_pfn_range(online_type, nid, pfn, nr_pages); 913 zone = move_pfn_range(online_type, nid, pfn, nr_pages);
933 914
@@ -945,10 +926,9 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
945 * This means the page allocator ignores this zone. 926 * This means the page allocator ignores this zone.
946 * So, zonelist must be updated after online. 927 * So, zonelist must be updated after online.
947 */ 928 */
948 mutex_lock(&zonelists_mutex);
949 if (!populated_zone(zone)) { 929 if (!populated_zone(zone)) {
950 need_zonelists_rebuild = 1; 930 need_zonelists_rebuild = 1;
951 build_all_zonelists(NULL, zone); 931 setup_zone_pageset(zone);
952 } 932 }
953 933
954 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, 934 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
@@ -956,7 +936,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
956 if (ret) { 936 if (ret) {
957 if (need_zonelists_rebuild) 937 if (need_zonelists_rebuild)
958 zone_pcp_reset(zone); 938 zone_pcp_reset(zone);
959 mutex_unlock(&zonelists_mutex);
960 goto failed_addition; 939 goto failed_addition;
961 } 940 }
962 941
@@ -969,13 +948,11 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
969 if (onlined_pages) { 948 if (onlined_pages) {
970 node_states_set_node(nid, &arg); 949 node_states_set_node(nid, &arg);
971 if (need_zonelists_rebuild) 950 if (need_zonelists_rebuild)
972 build_all_zonelists(NULL, NULL); 951 build_all_zonelists(NULL);
973 else 952 else
974 zone_pcp_update(zone); 953 zone_pcp_update(zone);
975 } 954 }
976 955
977 mutex_unlock(&zonelists_mutex);
978
979 init_per_zone_wmark_min(); 956 init_per_zone_wmark_min();
980 957
981 if (onlined_pages) { 958 if (onlined_pages) {
@@ -1046,9 +1023,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
1046 * The node we allocated has no zone fallback lists. For avoiding 1023 * The node we allocated has no zone fallback lists. For avoiding
1047 * to access not-initialized zonelist, build here. 1024 * to access not-initialized zonelist, build here.
1048 */ 1025 */
1049 mutex_lock(&zonelists_mutex); 1026 build_all_zonelists(pgdat);
1050 build_all_zonelists(pgdat, NULL);
1051 mutex_unlock(&zonelists_mutex);
1052 1027
1053 /* 1028 /*
1054 * zone->managed_pages is set to an approximate value in 1029 * zone->managed_pages is set to an approximate value in
@@ -1100,13 +1075,6 @@ int try_online_node(int nid)
1100 node_set_online(nid); 1075 node_set_online(nid);
1101 ret = register_one_node(nid); 1076 ret = register_one_node(nid);
1102 BUG_ON(ret); 1077 BUG_ON(ret);
1103
1104 if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
1105 mutex_lock(&zonelists_mutex);
1106 build_all_zonelists(NULL, NULL);
1107 mutex_unlock(&zonelists_mutex);
1108 }
1109
1110out: 1078out:
1111 mem_hotplug_done(); 1079 mem_hotplug_done();
1112 return ret; 1080 return ret;
@@ -1722,9 +1690,7 @@ repeat:
1722 1690
1723 if (!populated_zone(zone)) { 1691 if (!populated_zone(zone)) {
1724 zone_pcp_reset(zone); 1692 zone_pcp_reset(zone);
1725 mutex_lock(&zonelists_mutex); 1693 build_all_zonelists(NULL);
1726 build_all_zonelists(NULL, NULL);
1727 mutex_unlock(&zonelists_mutex);
1728 } else 1694 } else
1729 zone_pcp_update(zone); 1695 zone_pcp_update(zone);
1730 1696
@@ -1750,7 +1716,7 @@ failed_removal:
1750 return ret; 1716 return ret;
1751} 1717}
1752 1718
1753/* Must be protected by mem_hotplug_begin() */ 1719/* Must be protected by mem_hotplug_begin() or a device_lock */
1754int offline_pages(unsigned long start_pfn, unsigned long nr_pages) 1720int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1755{ 1721{
1756 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); 1722 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
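
For the default MMOP_ONLINE_KEEP case, the zone_for_pfn_range() rework above reduces the removed movable_pfn_range()/allow_online_pfn_range() checks to a simple rule: inherit whichever zone already covers the range if exactly one does, otherwise online to a kernel zone unless movable_node is enabled. A stand-alone restatement with the zones collapsed to two labels (the kernel zone can really be DMA, DMA32 or NORMAL, chosen by default_kernel_zone_for_pfn()):

#include <stdbool.h>
#include <stdio.h>

/* Decide the target zone for MMOP_ONLINE_KEEP, reduced to a string label. */
static const char *keep_zone(bool in_kernel, bool in_movable, bool movable_node)
{
	/* exactly one existing zone intersects the range: inherit it */
	if (in_kernel ^ in_movable)
		return in_kernel ? "kernel zone" : "ZONE_MOVABLE";
	/* none or both intersect: kernel zone by default, movable if requested */
	return movable_node ? "ZONE_MOVABLE" : "kernel zone";
}

int main(void)
{
	printf("%s\n", keep_zone(true,  false, false));	/* kernel zone  */
	printf("%s\n", keep_zone(false, true,  false));	/* ZONE_MOVABLE */
	printf("%s\n", keep_zone(false, false, true));	/* ZONE_MOVABLE */
	return 0;
}
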
diff --git a/mm/mmap.c b/mm/mmap.c
index f19efcf75418..4c5981651407 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -44,6 +44,7 @@
44#include <linux/userfaultfd_k.h> 44#include <linux/userfaultfd_k.h>
45#include <linux/moduleparam.h> 45#include <linux/moduleparam.h>
46#include <linux/pkeys.h> 46#include <linux/pkeys.h>
47#include <linux/oom.h>
47 48
48#include <linux/uaccess.h> 49#include <linux/uaccess.h>
49#include <asm/cacheflush.h> 50#include <asm/cacheflush.h>
@@ -2639,13 +2640,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
2639 if (vma->vm_start >= end) 2640 if (vma->vm_start >= end)
2640 return 0; 2641 return 0;
2641 2642
2642 if (uf) {
2643 int error = userfaultfd_unmap_prep(vma, start, end, uf);
2644
2645 if (error)
2646 return error;
2647 }
2648
2649 /* 2643 /*
2650 * If we need to split any vma, do it now to save pain later. 2644 * If we need to split any vma, do it now to save pain later.
2651 * 2645 *
@@ -2679,6 +2673,21 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
2679 } 2673 }
2680 vma = prev ? prev->vm_next : mm->mmap; 2674 vma = prev ? prev->vm_next : mm->mmap;
2681 2675
2676 if (unlikely(uf)) {
2677 /*
2678 * If userfaultfd_unmap_prep returns an error the vmas
 2679 * will remain split, but userland will get a
2680 * highly unexpected error anyway. This is no
2681 * different than the case where the first of the two
2682 * __split_vma fails, but we don't undo the first
 2683 * split, although we could. This is an unlikely enough
 2684 * failure that it's not worth optimizing for.
2685 */
2686 int error = userfaultfd_unmap_prep(vma, start, end, uf);
2687 if (error)
2688 return error;
2689 }
2690
2682 /* 2691 /*
2683 * unlock any mlock()ed ranges before detaching vmas 2692 * unlock any mlock()ed ranges before detaching vmas
2684 */ 2693 */
@@ -2993,6 +3002,23 @@ void exit_mmap(struct mm_struct *mm)
2993 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 3002 /* Use -1 here to ensure all VMAs in the mm are unmapped */
2994 unmap_vmas(&tlb, vma, 0, -1); 3003 unmap_vmas(&tlb, vma, 0, -1);
2995 3004
3005 set_bit(MMF_OOM_SKIP, &mm->flags);
3006 if (unlikely(tsk_is_oom_victim(current))) {
3007 /*
3008 * Wait for oom_reap_task() to stop working on this
3009 * mm. Because MMF_OOM_SKIP is already set before
3010 * calling down_read(), oom_reap_task() will not run
3011 * on this "mm" post up_write().
3012 *
3013 * tsk_is_oom_victim() cannot be set from under us
3014 * either because current->mm is already set to NULL
3015 * under task_lock before calling mmput and oom_mm is
3016 * set not NULL by the OOM killer only if current->mm
3017 * is found not NULL while holding the task_lock.
3018 */
3019 down_write(&mm->mmap_sem);
3020 up_write(&mm->mmap_sem);
3021 }
2996 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); 3022 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
2997 tlb_finish_mmu(&tlb, 0, -1); 3023 tlb_finish_mmu(&tlb, 0, -1);
2998 3024
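
The MMF_OOM_SKIP store followed by the down_write();up_write() pair above is a pure barrier: it waits out any oom_reap_task() that is already inside the read-locked section, and every later reader sees the flag and backs off (the matching reader-side check is in the mm/oom_kill.c hunk below). A rough user-space analogue of the same pattern with a pthread rwlock (thread and flag names invented for illustration; build with -pthread):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
static atomic_int skip;			/* stands in for MMF_OOM_SKIP */

static void *reaper(void *arg)
{
	pthread_rwlock_rdlock(&lock);
	if (atomic_load(&skip)) {	/* flag is checked under the read lock */
		pthread_rwlock_unlock(&lock);
		return NULL;
	}
	puts("reaping");		/* safe: teardown has not started yet */
	pthread_rwlock_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, reaper, NULL);

	atomic_store(&skip, 1);		/* set the flag first ...              */
	pthread_rwlock_wrlock(&lock);	/* ... then wait out any active reader */
	pthread_rwlock_unlock(&lock);
	puts("tearing down");		/* no reaper can run past this point   */

	pthread_join(t, NULL);
	return 0;
}
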
@@ -3514,7 +3540,7 @@ static int init_user_reserve(void)
3514{ 3540{
3515 unsigned long free_kbytes; 3541 unsigned long free_kbytes;
3516 3542
3517 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); 3543 free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3518 3544
3519 sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); 3545 sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
3520 return 0; 3546 return 0;
@@ -3535,7 +3561,7 @@ static int init_admin_reserve(void)
3535{ 3561{
3536 unsigned long free_kbytes; 3562 unsigned long free_kbytes;
3537 3563
3538 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); 3564 free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3539 3565
3540 sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); 3566 sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
3541 return 0; 3567 return 0;
@@ -3579,7 +3605,7 @@ static int reserve_mem_notifier(struct notifier_block *nb,
3579 3605
3580 break; 3606 break;
3581 case MEM_OFFLINE: 3607 case MEM_OFFLINE:
3582 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); 3608 free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3583 3609
3584 if (sysctl_user_reserve_kbytes > free_kbytes) { 3610 if (sysctl_user_reserve_kbytes > free_kbytes) {
3585 init_user_reserve(); 3611 init_user_reserve();
diff --git a/mm/mremap.c b/mm/mremap.c
index 3f23715d3c69..7395564daa6c 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -384,6 +384,19 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
384 if (!vma || vma->vm_start > addr) 384 if (!vma || vma->vm_start > addr)
385 return ERR_PTR(-EFAULT); 385 return ERR_PTR(-EFAULT);
386 386
387 /*
388 * !old_len is a special case where an attempt is made to 'duplicate'
389 * a mapping. This makes no sense for private mappings as it will
390 * instead create a fresh/new mapping unrelated to the original. This
391 * is contrary to the basic idea of mremap which creates new mappings
392 * based on the original. There are no known use cases for this
393 * behavior. As a result, fail such attempts.
394 */
395 if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
396 pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n", current->comm, current->pid);
397 return ERR_PTR(-EINVAL);
398 }
399
387 if (is_vm_hugetlb_page(vma)) 400 if (is_vm_hugetlb_page(vma))
388 return ERR_PTR(-EINVAL); 401 return ERR_PTR(-EINVAL);
389 402
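
User-visible effect of the vma_to_resize() check above: mremap() with old_size == 0 still duplicates a shared mapping, but now fails with EINVAL for a private one instead of silently handing back unrelated anonymous memory. A small demonstration:

#define _GNU_SOURCE		/* for mremap() and MREMAP_MAYMOVE */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

static void try_dup(int flags, const char *name)
{
	size_t len = 4096;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       flags | MAP_ANONYMOUS, -1, 0);
	void *dup;

	if (p == MAP_FAILED)
		return;
	/* old_size == 0 asks for a second mapping of the same pages */
	dup = mremap(p, 0, len, MREMAP_MAYMOVE);
	printf("%s: %s\n", name,
	       dup == MAP_FAILED ? strerror(errno) : "duplicated");
}

int main(void)
{
	try_dup(MAP_SHARED,  "MAP_SHARED ");
	try_dup(MAP_PRIVATE, "MAP_PRIVATE");	/* expected: Invalid argument */
	return 0;
}
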
diff --git a/mm/nommu.c b/mm/nommu.c
index fc184f597d59..53d5175a5c14 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1962,7 +1962,7 @@ static int __meminit init_user_reserve(void)
1962{ 1962{
1963 unsigned long free_kbytes; 1963 unsigned long free_kbytes;
1964 1964
1965 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); 1965 free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
1966 1966
1967 sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); 1967 sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
1968 return 0; 1968 return 0;
@@ -1983,7 +1983,7 @@ static int __meminit init_admin_reserve(void)
1983{ 1983{
1984 unsigned long free_kbytes; 1984 unsigned long free_kbytes;
1985 1985
1986 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); 1986 free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
1987 1987
1988 sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); 1988 sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
1989 return 0; 1989 return 0;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 9e8b4f030c1c..99736e026712 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -495,11 +495,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
495 } 495 }
496 496
497 /* 497 /*
498 * increase mm_users only after we know we will reap something so 498 * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
499 * that the mmput_async is called only when we have reaped something 499 * work on the mm anymore. The check for MMF_OOM_SKIP must run
500 * and delayed __mmput doesn't matter that much 500 * under mmap_sem for reading because it serializes against the
501 * down_write();up_write() cycle in exit_mmap().
501 */ 502 */
502 if (!mmget_not_zero(mm)) { 503 if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
503 up_read(&mm->mmap_sem); 504 up_read(&mm->mmap_sem);
504 trace_skip_task_reaping(tsk->pid); 505 trace_skip_task_reaping(tsk->pid);
505 goto unlock_oom; 506 goto unlock_oom;
@@ -542,12 +543,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
542 K(get_mm_counter(mm, MM_SHMEMPAGES))); 543 K(get_mm_counter(mm, MM_SHMEMPAGES)));
543 up_read(&mm->mmap_sem); 544 up_read(&mm->mmap_sem);
544 545
545 /*
546 * Drop our reference but make sure the mmput slow path is called from a
547 * different context because we shouldn't risk we get stuck there and
548 * put the oom_reaper out of the way.
549 */
550 mmput_async(mm);
551 trace_finish_task_reaping(tsk->pid); 546 trace_finish_task_reaping(tsk->pid);
552unlock_oom: 547unlock_oom:
553 mutex_unlock(&oom_lock); 548 mutex_unlock(&oom_lock);
@@ -824,7 +819,8 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
824 819
825 /* 820 /*
826 * If the task is already exiting, don't alarm the sysadmin or kill 821 * If the task is already exiting, don't alarm the sysadmin or kill
827 * its children or threads, just set TIF_MEMDIE so it can die quickly 822 * its children or threads, just give it access to memory reserves
823 * so it can die quickly
828 */ 824 */
829 task_lock(p); 825 task_lock(p);
830 if (task_will_free_mem(p)) { 826 if (task_will_free_mem(p)) {
@@ -889,9 +885,9 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
889 count_memcg_event_mm(mm, OOM_KILL); 885 count_memcg_event_mm(mm, OOM_KILL);
890 886
891 /* 887 /*
892 * We should send SIGKILL before setting TIF_MEMDIE in order to prevent 888 * We should send SIGKILL before granting access to memory reserves
893 * the OOM victim from depleting the memory reserves from the user 889 * in order to prevent the OOM victim from depleting the memory
894 * space under its control. 890 * reserves from the user space under its control.
895 */ 891 */
896 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); 892 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
897 mark_oom_victim(victim); 893 mark_oom_victim(victim);
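
The reaper no longer takes an mm_users reference; it relies on exit_mmap() setting MMF_OOM_SKIP and on both sides meeting on mmap_sem, as the new comment describes. A standalone model of that handshake using POSIX primitives (illustration only, not the kernel code): the exiting side raises the skip flag and cycles the write lock, so a reaper that passed the flag check under the read lock has finished before teardown starts, and any later reaper backs off.

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_rwlock_t mmap_sem = PTHREAD_RWLOCK_INITIALIZER;
    static atomic_bool oom_skip;    /* stands in for MMF_OOM_SKIP */

    static void exit_mmap_side(void)
    {
        atomic_store(&oom_skip, true);
        pthread_rwlock_wrlock(&mmap_sem);   /* wait out any reaper holding the read lock */
        pthread_rwlock_unlock(&mmap_sem);
        /* ...safe to tear down the address space now... */
    }

    static bool reaper_side(void)
    {
        pthread_rwlock_rdlock(&mmap_sem);
        if (atomic_load(&oom_skip)) {       /* exit_mmap already owns the teardown */
            pthread_rwlock_unlock(&mmap_sem);
            return false;
        }
        /* ...unmap reclaimable ranges under the read lock... */
        pthread_rwlock_unlock(&mmap_sem);
        return true;
    }

    int main(void)
    {
        printf("reaped: %d\n", reaper_side());  /* 1 */
        exit_mmap_side();
        printf("reaped: %d\n", reaper_side());  /* 0 */
        return 0;
    }
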
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index bf050ab025b7..0b9c5cbe8eba 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -363,7 +363,7 @@ static unsigned long global_dirtyable_memory(void)
363{ 363{
364 unsigned long x; 364 unsigned long x;
365 365
366 x = global_page_state(NR_FREE_PAGES); 366 x = global_zone_page_state(NR_FREE_PAGES);
367 /* 367 /*
368 * Pages reserved for the kernel should not be considered 368 * Pages reserved for the kernel should not be considered
369 * dirtyable, to prevent a situation where reclaim has to 369 * dirtyable, to prevent a situation where reclaim has to
@@ -1405,7 +1405,7 @@ void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time)
1405 * will look to see if it needs to start dirty throttling. 1405 * will look to see if it needs to start dirty throttling.
1406 * 1406 *
1407 * If dirty_poll_interval is too low, big NUMA machines will call the expensive 1407 * If dirty_poll_interval is too low, big NUMA machines will call the expensive
1408 * global_page_state() too often. So scale it near-sqrt to the safety margin 1408 * global_zone_page_state() too often. So scale it near-sqrt to the safety margin
1409 * (the number of pages we may dirty without exceeding the dirty limits). 1409 * (the number of pages we may dirty without exceeding the dirty limits).
1410 */ 1410 */
1411static unsigned long dirty_poll_interval(unsigned long dirty, 1411static unsigned long dirty_poll_interval(unsigned long dirty,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9327a940e373..a9add06fe768 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2951,7 +2951,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
2951{ 2951{
2952 long min = mark; 2952 long min = mark;
2953 int o; 2953 int o;
2954 const bool alloc_harder = (alloc_flags & ALLOC_HARDER); 2954 const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
2955 2955
2956 /* free_pages may go negative - that's OK */ 2956 /* free_pages may go negative - that's OK */
2957 free_pages -= (1 << order) - 1; 2957 free_pages -= (1 << order) - 1;
@@ -2964,10 +2964,21 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
2964 * the high-atomic reserves. This will over-estimate the size of the 2964 * the high-atomic reserves. This will over-estimate the size of the
2965 * atomic reserve but it avoids a search. 2965 * atomic reserve but it avoids a search.
2966 */ 2966 */
2967 if (likely(!alloc_harder)) 2967 if (likely(!alloc_harder)) {
2968 free_pages -= z->nr_reserved_highatomic; 2968 free_pages -= z->nr_reserved_highatomic;
2969 else 2969 } else {
2970 min -= min / 4; 2970 /*
2971 * OOM victims can try even harder than normal ALLOC_HARDER
2972 * users on the grounds that it's definitely going to be in
2973 * the exit path shortly and free memory. Any allocation it
2974 * makes during the free path will be small and short-lived.
2975 */
2976 if (alloc_flags & ALLOC_OOM)
2977 min -= min / 2;
2978 else
2979 min -= min / 4;
2980 }
2981
2971 2982
2972#ifdef CONFIG_CMA 2983#ifdef CONFIG_CMA
2973 /* If allocation can't use CMA areas don't use free CMA pages */ 2984 /* If allocation can't use CMA areas don't use free CMA pages */
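
Worked numbers for the relaxed watermark, as a standalone model (the ALLOC_* values below are placeholders, not the kernel's): with min = 1024 pages, a normal request must leave 1024 pages free, an ALLOC_HARDER request 768 (min - min/4), and an ALLOC_OOM request only 512 (min - min/2).

    #include <stdio.h>

    #define ALLOC_HARDER 0x1   /* assumption: arbitrary flag values for the model */
    #define ALLOC_OOM    0x2

    static long effective_min(long min, unsigned alloc_flags)
    {
        if (alloc_flags & ALLOC_OOM)
            return min - min / 2;   /* OOM victims: half the watermark */
        if (alloc_flags & ALLOC_HARDER)
            return min - min / 4;   /* other hard allocations: three quarters */
        return min;
    }

    int main(void)
    {
        printf("%ld %ld %ld\n",
               effective_min(1024, 0),             /* 1024 */
               effective_min(1024, ALLOC_HARDER),  /* 768  */
               effective_min(1024, ALLOC_OOM));    /* 512  */
        return 0;
    }
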
@@ -3205,7 +3216,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
3205 * of allowed nodes. 3216 * of allowed nodes.
3206 */ 3217 */
3207 if (!(gfp_mask & __GFP_NOMEMALLOC)) 3218 if (!(gfp_mask & __GFP_NOMEMALLOC))
3208 if (test_thread_flag(TIF_MEMDIE) || 3219 if (tsk_is_oom_victim(current) ||
3209 (current->flags & (PF_MEMALLOC | PF_EXITING))) 3220 (current->flags & (PF_MEMALLOC | PF_EXITING)))
3210 filter &= ~SHOW_MEM_FILTER_NODES; 3221 filter &= ~SHOW_MEM_FILTER_NODES;
3211 if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) 3222 if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
@@ -3668,21 +3679,46 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
3668 return alloc_flags; 3679 return alloc_flags;
3669} 3680}
3670 3681
3671bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) 3682static bool oom_reserves_allowed(struct task_struct *tsk)
3672{ 3683{
3673 if (unlikely(gfp_mask & __GFP_NOMEMALLOC)) 3684 if (!tsk_is_oom_victim(tsk))
3674 return false; 3685 return false;
3675 3686
3687 /*
3688 * !MMU doesn't have oom reaper so give access to memory reserves
3689 * only to the thread with TIF_MEMDIE set
3690 */
3691 if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
3692 return false;
3693
3694 return true;
3695}
3696
3697/*
3698 * Distinguish requests which really need access to full memory
3699 * reserves from oom victims which can live with a portion of it
3700 */
3701static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
3702{
3703 if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
3704 return 0;
3676 if (gfp_mask & __GFP_MEMALLOC) 3705 if (gfp_mask & __GFP_MEMALLOC)
3677 return true; 3706 return ALLOC_NO_WATERMARKS;
3678 if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) 3707 if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
3679 return true; 3708 return ALLOC_NO_WATERMARKS;
3680 if (!in_interrupt() && 3709 if (!in_interrupt()) {
3681 ((current->flags & PF_MEMALLOC) || 3710 if (current->flags & PF_MEMALLOC)
3682 unlikely(test_thread_flag(TIF_MEMDIE)))) 3711 return ALLOC_NO_WATERMARKS;
3683 return true; 3712 else if (oom_reserves_allowed(current))
3713 return ALLOC_OOM;
3714 }
3684 3715
3685 return false; 3716 return 0;
3717}
3718
3719bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
3720{
3721 return !!__gfp_pfmemalloc_flags(gfp_mask);
3686} 3722}
3687 3723
3688/* 3724/*
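
In short, __gfp_pfmemalloc_flags() grades the access instead of answering yes/no: __GFP_NOMEMALLOC always gets 0; __GFP_MEMALLOC, PF_MEMALLOC in softirq, or PF_MEMALLOC in process context still get ALLOC_NO_WATERMARKS; and an OOM victim (on !MMU only if it also carries TIF_MEMDIE) gets the weaker ALLOC_OOM. gfp_pfmemalloc_allowed() keeps its old meaning as "any of the above is non-zero".
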
@@ -3835,6 +3871,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
3835 unsigned long alloc_start = jiffies; 3871 unsigned long alloc_start = jiffies;
3836 unsigned int stall_timeout = 10 * HZ; 3872 unsigned int stall_timeout = 10 * HZ;
3837 unsigned int cpuset_mems_cookie; 3873 unsigned int cpuset_mems_cookie;
3874 int reserve_flags;
3838 3875
3839 /* 3876 /*
3840 * In the slowpath, we sanity check order to avoid ever trying to 3877 * In the slowpath, we sanity check order to avoid ever trying to
@@ -3940,15 +3977,16 @@ retry:
3940 if (gfp_mask & __GFP_KSWAPD_RECLAIM) 3977 if (gfp_mask & __GFP_KSWAPD_RECLAIM)
3941 wake_all_kswapds(order, ac); 3978 wake_all_kswapds(order, ac);
3942 3979
3943 if (gfp_pfmemalloc_allowed(gfp_mask)) 3980 reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
3944 alloc_flags = ALLOC_NO_WATERMARKS; 3981 if (reserve_flags)
3982 alloc_flags = reserve_flags;
3945 3983
3946 /* 3984 /*
3947 * Reset the zonelist iterators if memory policies can be ignored. 3985 * Reset the zonelist iterators if memory policies can be ignored.
3948 * These allocations are high priority and system rather than user 3986 * These allocations are high priority and system rather than user
3949 * orientated. 3987 * orientated.
3950 */ 3988 */
3951 if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) { 3989 if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
3952 ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); 3990 ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
3953 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, 3991 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
3954 ac->high_zoneidx, ac->nodemask); 3992 ac->high_zoneidx, ac->nodemask);
@@ -4025,8 +4063,8 @@ retry:
4025 goto got_pg; 4063 goto got_pg;
4026 4064
4027 /* Avoid allocations with no watermarks from looping endlessly */ 4065 /* Avoid allocations with no watermarks from looping endlessly */
4028 if (test_thread_flag(TIF_MEMDIE) && 4066 if (tsk_is_oom_victim(current) &&
4029 (alloc_flags == ALLOC_NO_WATERMARKS || 4067 (alloc_flags == ALLOC_OOM ||
4030 (gfp_mask & __GFP_NOMEMALLOC))) 4068 (gfp_mask & __GFP_NOMEMALLOC)))
4031 goto nopage; 4069 goto nopage;
4032 4070
@@ -4509,7 +4547,7 @@ long si_mem_available(void)
4509 * Estimate the amount of memory available for userspace allocations, 4547 * Estimate the amount of memory available for userspace allocations,
4510 * without causing swapping. 4548 * without causing swapping.
4511 */ 4549 */
4512 available = global_page_state(NR_FREE_PAGES) - totalreserve_pages; 4550 available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
4513 4551
4514 /* 4552 /*
4515 * Not all the page cache can be freed, otherwise the system will 4553 * Not all the page cache can be freed, otherwise the system will
@@ -4538,7 +4576,7 @@ void si_meminfo(struct sysinfo *val)
4538{ 4576{
4539 val->totalram = totalram_pages; 4577 val->totalram = totalram_pages;
4540 val->sharedram = global_node_page_state(NR_SHMEM); 4578 val->sharedram = global_node_page_state(NR_SHMEM);
4541 val->freeram = global_page_state(NR_FREE_PAGES); 4579 val->freeram = global_zone_page_state(NR_FREE_PAGES);
4542 val->bufferram = nr_blockdev_pages(); 4580 val->bufferram = nr_blockdev_pages();
4543 val->totalhigh = totalhigh_pages; 4581 val->totalhigh = totalhigh_pages;
4544 val->freehigh = nr_free_highpages(); 4582 val->freehigh = nr_free_highpages();
@@ -4673,11 +4711,11 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
4673 global_node_page_state(NR_SLAB_UNRECLAIMABLE), 4711 global_node_page_state(NR_SLAB_UNRECLAIMABLE),
4674 global_node_page_state(NR_FILE_MAPPED), 4712 global_node_page_state(NR_FILE_MAPPED),
4675 global_node_page_state(NR_SHMEM), 4713 global_node_page_state(NR_SHMEM),
4676 global_page_state(NR_PAGETABLE), 4714 global_zone_page_state(NR_PAGETABLE),
4677 global_page_state(NR_BOUNCE), 4715 global_zone_page_state(NR_BOUNCE),
4678 global_page_state(NR_FREE_PAGES), 4716 global_zone_page_state(NR_FREE_PAGES),
4679 free_pcp, 4717 free_pcp,
4680 global_page_state(NR_FREE_CMA_PAGES)); 4718 global_zone_page_state(NR_FREE_CMA_PAGES));
4681 4719
4682 for_each_online_pgdat(pgdat) { 4720 for_each_online_pgdat(pgdat) {
4683 if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) 4721 if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
@@ -4839,18 +4877,17 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
4839 * 4877 *
4840 * Add all populated zones of a node to the zonelist. 4878 * Add all populated zones of a node to the zonelist.
4841 */ 4879 */
4842static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, 4880static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
4843 int nr_zones)
4844{ 4881{
4845 struct zone *zone; 4882 struct zone *zone;
4846 enum zone_type zone_type = MAX_NR_ZONES; 4883 enum zone_type zone_type = MAX_NR_ZONES;
4884 int nr_zones = 0;
4847 4885
4848 do { 4886 do {
4849 zone_type--; 4887 zone_type--;
4850 zone = pgdat->node_zones + zone_type; 4888 zone = pgdat->node_zones + zone_type;
4851 if (managed_zone(zone)) { 4889 if (managed_zone(zone)) {
4852 zoneref_set_zone(zone, 4890 zoneref_set_zone(zone, &zonerefs[nr_zones++]);
4853 &zonelist->_zonerefs[nr_zones++]);
4854 check_highest_zone(zone_type); 4891 check_highest_zone(zone_type);
4855 } 4892 }
4856 } while (zone_type); 4893 } while (zone_type);
@@ -4858,52 +4895,18 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
4858 return nr_zones; 4895 return nr_zones;
4859} 4896}
4860 4897
4861
4862/*
4863 * zonelist_order:
4864 * 0 = automatic detection of better ordering.
4865 * 1 = order by ([node] distance, -zonetype)
4866 * 2 = order by (-zonetype, [node] distance)
4867 *
4868 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
4869 * the same zonelist. So only NUMA can configure this param.
4870 */
4871#define ZONELIST_ORDER_DEFAULT 0
4872#define ZONELIST_ORDER_NODE 1
4873#define ZONELIST_ORDER_ZONE 2
4874
4875/* zonelist order in the kernel.
4876 * set_zonelist_order() will set this to NODE or ZONE.
4877 */
4878static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
4879static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
4880
4881
4882#ifdef CONFIG_NUMA 4898#ifdef CONFIG_NUMA
4883/* The value user specified ....changed by config */
4884static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
4885/* string for sysctl */
4886#define NUMA_ZONELIST_ORDER_LEN 16
4887char numa_zonelist_order[16] = "default";
4888
4889/*
4890 * interface for configure zonelist ordering.
4891 * command line option "numa_zonelist_order"
4892 * = "[dD]efault - default, automatic configuration.
4893 * = "[nN]ode - order by node locality, then by zone within node
4894 * = "[zZ]one - order by zone, then by locality within zone
4895 */
4896 4899
4897static int __parse_numa_zonelist_order(char *s) 4900static int __parse_numa_zonelist_order(char *s)
4898{ 4901{
4899 if (*s == 'd' || *s == 'D') { 4902 /*
4900 user_zonelist_order = ZONELIST_ORDER_DEFAULT; 4903 * We used to support different zonelist modes but they turned
4901 } else if (*s == 'n' || *s == 'N') { 4904 * out to be just not useful. Let's keep the warning in place
4902 user_zonelist_order = ZONELIST_ORDER_NODE; 4905 * if somebody still uses the cmd line parameter so that we do
4903 } else if (*s == 'z' || *s == 'Z') { 4906 * not fail it silently
4904 user_zonelist_order = ZONELIST_ORDER_ZONE; 4907 */
4905 } else { 4908 if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
4906 pr_warn("Ignoring invalid numa_zonelist_order value: %s\n", s); 4909 pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s);
4907 return -EINVAL; 4910 return -EINVAL;
4908 } 4911 }
4909 return 0; 4912 return 0;
@@ -4911,19 +4914,15 @@ static int __parse_numa_zonelist_order(char *s)
4911 4914
4912static __init int setup_numa_zonelist_order(char *s) 4915static __init int setup_numa_zonelist_order(char *s)
4913{ 4916{
4914 int ret;
4915
4916 if (!s) 4917 if (!s)
4917 return 0; 4918 return 0;
4918 4919
4919 ret = __parse_numa_zonelist_order(s); 4920 return __parse_numa_zonelist_order(s);
4920 if (ret == 0)
4921 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
4922
4923 return ret;
4924} 4921}
4925early_param("numa_zonelist_order", setup_numa_zonelist_order); 4922early_param("numa_zonelist_order", setup_numa_zonelist_order);
4926 4923
4924char numa_zonelist_order[] = "Node";
4925
4927/* 4926/*
4928 * sysctl handler for numa_zonelist_order 4927 * sysctl handler for numa_zonelist_order
4929 */ 4928 */
@@ -4931,42 +4930,17 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write,
4931 void __user *buffer, size_t *length, 4930 void __user *buffer, size_t *length,
4932 loff_t *ppos) 4931 loff_t *ppos)
4933{ 4932{
4934 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 4933 char *str;
4935 int ret; 4934 int ret;
4936 static DEFINE_MUTEX(zl_order_mutex);
4937 4935
4938 mutex_lock(&zl_order_mutex); 4936 if (!write)
4939 if (write) { 4937 return proc_dostring(table, write, buffer, length, ppos);
4940 if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) { 4938 str = memdup_user_nul(buffer, 16);
4941 ret = -EINVAL; 4939 if (IS_ERR(str))
4942 goto out; 4940 return PTR_ERR(str);
4943 }
4944 strcpy(saved_string, (char *)table->data);
4945 }
4946 ret = proc_dostring(table, write, buffer, length, ppos);
4947 if (ret)
4948 goto out;
4949 if (write) {
4950 int oldval = user_zonelist_order;
4951 4941
4952 ret = __parse_numa_zonelist_order((char *)table->data); 4942 ret = __parse_numa_zonelist_order(str);
4953 if (ret) { 4943 kfree(str);
4954 /*
4955 * bogus value. restore saved string
4956 */
4957 strncpy((char *)table->data, saved_string,
4958 NUMA_ZONELIST_ORDER_LEN);
4959 user_zonelist_order = oldval;
4960 } else if (oldval != user_zonelist_order) {
4961 mem_hotplug_begin();
4962 mutex_lock(&zonelists_mutex);
4963 build_all_zonelists(NULL, NULL);
4964 mutex_unlock(&zonelists_mutex);
4965 mem_hotplug_done();
4966 }
4967 }
4968out:
4969 mutex_unlock(&zl_order_mutex);
4970 return ret; 4944 return ret;
4971} 4945}
4972 4946
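
The rewritten handler no longer needs the saved_string/zl_order_mutex machinery because there is nothing left to store or rebuild: reads still go through proc_dostring(), while writes copy at most 16 bytes with memdup_user_nul(), which returns a freshly allocated NUL-terminated buffer (or an ERR_PTR on a fault), parse it only to emit the warning, and free it.
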
@@ -5040,17 +5014,24 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
5040 * This results in maximum locality--normal zone overflows into local 5014 * This results in maximum locality--normal zone overflows into local
5041 * DMA zone, if any--but risks exhausting DMA zone. 5015 * DMA zone, if any--but risks exhausting DMA zone.
5042 */ 5016 */
5043static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) 5017static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
5018 unsigned nr_nodes)
5044{ 5019{
5045 int j; 5020 struct zoneref *zonerefs;
5046 struct zonelist *zonelist; 5021 int i;
5022
5023 zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
5024
5025 for (i = 0; i < nr_nodes; i++) {
5026 int nr_zones;
5047 5027
5048 zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; 5028 pg_data_t *node = NODE_DATA(node_order[i]);
5049 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) 5029
5050 ; 5030 nr_zones = build_zonerefs_node(node, zonerefs);
5051 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 5031 zonerefs += nr_zones;
5052 zonelist->_zonerefs[j].zone = NULL; 5032 }
5053 zonelist->_zonerefs[j].zone_idx = 0; 5033 zonerefs->zone = NULL;
5034 zonerefs->zone_idx = 0;
5054} 5035}
5055 5036
5056/* 5037/*
@@ -5058,13 +5039,14 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
5058 */ 5039 */
5059static void build_thisnode_zonelists(pg_data_t *pgdat) 5040static void build_thisnode_zonelists(pg_data_t *pgdat)
5060{ 5041{
5061 int j; 5042 struct zoneref *zonerefs;
5062 struct zonelist *zonelist; 5043 int nr_zones;
5063 5044
5064 zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK]; 5045 zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
5065 j = build_zonelists_node(pgdat, zonelist, 0); 5046 nr_zones = build_zonerefs_node(pgdat, zonerefs);
5066 zonelist->_zonerefs[j].zone = NULL; 5047 zonerefs += nr_zones;
5067 zonelist->_zonerefs[j].zone_idx = 0; 5048 zonerefs->zone = NULL;
5049 zonerefs->zone_idx = 0;
5068} 5050}
5069 5051
5070/* 5052/*
@@ -5073,79 +5055,13 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
5073 * exhausted, but results in overflowing to remote node while memory 5055 * exhausted, but results in overflowing to remote node while memory
5074 * may still exist in local DMA zone. 5056 * may still exist in local DMA zone.
5075 */ 5057 */
5076static int node_order[MAX_NUMNODES];
5077
5078static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
5079{
5080 int pos, j, node;
5081 int zone_type; /* needs to be signed */
5082 struct zone *z;
5083 struct zonelist *zonelist;
5084
5085 zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
5086 pos = 0;
5087 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
5088 for (j = 0; j < nr_nodes; j++) {
5089 node = node_order[j];
5090 z = &NODE_DATA(node)->node_zones[zone_type];
5091 if (managed_zone(z)) {
5092 zoneref_set_zone(z,
5093 &zonelist->_zonerefs[pos++]);
5094 check_highest_zone(zone_type);
5095 }
5096 }
5097 }
5098 zonelist->_zonerefs[pos].zone = NULL;
5099 zonelist->_zonerefs[pos].zone_idx = 0;
5100}
5101
5102#if defined(CONFIG_64BIT)
5103/*
5104 * Devices that require DMA32/DMA are relatively rare and do not justify a
5105 * penalty to every machine in case the specialised case applies. Default
5106 * to Node-ordering on 64-bit NUMA machines
5107 */
5108static int default_zonelist_order(void)
5109{
5110 return ZONELIST_ORDER_NODE;
5111}
5112#else
5113/*
5114 * On 32-bit, the Normal zone needs to be preserved for allocations accessible
5115 * by the kernel. If processes running on node 0 deplete the low memory zone
5116 * then reclaim will occur more frequency increasing stalls and potentially
5117 * be easier to OOM if a large percentage of the zone is under writeback or
5118 * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set.
5119 * Hence, default to zone ordering on 32-bit.
5120 */
5121static int default_zonelist_order(void)
5122{
5123 return ZONELIST_ORDER_ZONE;
5124}
5125#endif /* CONFIG_64BIT */
5126
5127static void set_zonelist_order(void)
5128{
5129 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
5130 current_zonelist_order = default_zonelist_order();
5131 else
5132 current_zonelist_order = user_zonelist_order;
5133}
5134 5058
5135static void build_zonelists(pg_data_t *pgdat) 5059static void build_zonelists(pg_data_t *pgdat)
5136{ 5060{
5137 int i, node, load; 5061 static int node_order[MAX_NUMNODES];
5062 int node, load, nr_nodes = 0;
5138 nodemask_t used_mask; 5063 nodemask_t used_mask;
5139 int local_node, prev_node; 5064 int local_node, prev_node;
5140 struct zonelist *zonelist;
5141 unsigned int order = current_zonelist_order;
5142
5143 /* initialize zonelists */
5144 for (i = 0; i < MAX_ZONELISTS; i++) {
5145 zonelist = pgdat->node_zonelists + i;
5146 zonelist->_zonerefs[0].zone = NULL;
5147 zonelist->_zonerefs[0].zone_idx = 0;
5148 }
5149 5065
5150 /* NUMA-aware ordering of nodes */ 5066 /* NUMA-aware ordering of nodes */
5151 local_node = pgdat->node_id; 5067 local_node = pgdat->node_id;
@@ -5154,8 +5070,6 @@ static void build_zonelists(pg_data_t *pgdat)
5154 nodes_clear(used_mask); 5070 nodes_clear(used_mask);
5155 5071
5156 memset(node_order, 0, sizeof(node_order)); 5072 memset(node_order, 0, sizeof(node_order));
5157 i = 0;
5158
5159 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 5073 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
5160 /* 5074 /*
5161 * We don't want to pressure a particular node. 5075 * We don't want to pressure a particular node.
@@ -5166,19 +5080,12 @@ static void build_zonelists(pg_data_t *pgdat)
5166 node_distance(local_node, prev_node)) 5080 node_distance(local_node, prev_node))
5167 node_load[node] = load; 5081 node_load[node] = load;
5168 5082
5083 node_order[nr_nodes++] = node;
5169 prev_node = node; 5084 prev_node = node;
5170 load--; 5085 load--;
5171 if (order == ZONELIST_ORDER_NODE)
5172 build_zonelists_in_node_order(pgdat, node);
5173 else
5174 node_order[i++] = node; /* remember order */
5175 }
5176
5177 if (order == ZONELIST_ORDER_ZONE) {
5178 /* calculate node order -- i.e., DMA last! */
5179 build_zonelists_in_zone_order(pgdat, i);
5180 } 5086 }
5181 5087
5088 build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
5182 build_thisnode_zonelists(pgdat); 5089 build_thisnode_zonelists(pgdat);
5183} 5090}
5184 5091
@@ -5204,21 +5111,17 @@ static void setup_min_unmapped_ratio(void);
5204static void setup_min_slab_ratio(void); 5111static void setup_min_slab_ratio(void);
5205#else /* CONFIG_NUMA */ 5112#else /* CONFIG_NUMA */
5206 5113
5207static void set_zonelist_order(void)
5208{
5209 current_zonelist_order = ZONELIST_ORDER_ZONE;
5210}
5211
5212static void build_zonelists(pg_data_t *pgdat) 5114static void build_zonelists(pg_data_t *pgdat)
5213{ 5115{
5214 int node, local_node; 5116 int node, local_node;
5215 enum zone_type j; 5117 struct zoneref *zonerefs;
5216 struct zonelist *zonelist; 5118 int nr_zones;
5217 5119
5218 local_node = pgdat->node_id; 5120 local_node = pgdat->node_id;
5219 5121
5220 zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; 5122 zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
5221 j = build_zonelists_node(pgdat, zonelist, 0); 5123 nr_zones = build_zonerefs_node(pgdat, zonerefs);
5124 zonerefs += nr_zones;
5222 5125
5223 /* 5126 /*
5224 * Now we build the zonelist so that it contains the zones 5127 * Now we build the zonelist so that it contains the zones
@@ -5231,16 +5134,18 @@ static void build_zonelists(pg_data_t *pgdat)
5231 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 5134 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
5232 if (!node_online(node)) 5135 if (!node_online(node))
5233 continue; 5136 continue;
5234 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 5137 nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
5138 zonerefs += nr_zones;
5235 } 5139 }
5236 for (node = 0; node < local_node; node++) { 5140 for (node = 0; node < local_node; node++) {
5237 if (!node_online(node)) 5141 if (!node_online(node))
5238 continue; 5142 continue;
5239 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 5143 nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
5144 zonerefs += nr_zones;
5240 } 5145 }
5241 5146
5242 zonelist->_zonerefs[j].zone = NULL; 5147 zonerefs->zone = NULL;
5243 zonelist->_zonerefs[j].zone_idx = 0; 5148 zonerefs->zone_idx = 0;
5244} 5149}
5245 5150
5246#endif /* CONFIG_NUMA */ 5151#endif /* CONFIG_NUMA */
@@ -5263,50 +5168,32 @@ static void build_zonelists(pg_data_t *pgdat)
5263static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); 5168static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
5264static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); 5169static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
5265static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); 5170static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
5266static void setup_zone_pageset(struct zone *zone);
5267
5268/*
5269 * Global mutex to protect against size modification of zonelists
5270 * as well as to serialize pageset setup for the new populated zone.
5271 */
5272DEFINE_MUTEX(zonelists_mutex);
5273 5171
5274/* return values int ....just for stop_machine() */ 5172static void __build_all_zonelists(void *data)
5275static int __build_all_zonelists(void *data)
5276{ 5173{
5277 int nid; 5174 int nid;
5278 int cpu; 5175 int __maybe_unused cpu;
5279 pg_data_t *self = data; 5176 pg_data_t *self = data;
5177 static DEFINE_SPINLOCK(lock);
5178
5179 spin_lock(&lock);
5280 5180
5281#ifdef CONFIG_NUMA 5181#ifdef CONFIG_NUMA
5282 memset(node_load, 0, sizeof(node_load)); 5182 memset(node_load, 0, sizeof(node_load));
5283#endif 5183#endif
5284 5184
5185 /*
5186 * This node is hotadded and no memory is yet present. So just
5187 * building zonelists is fine - no need to touch other nodes.
5188 */
5285 if (self && !node_online(self->node_id)) { 5189 if (self && !node_online(self->node_id)) {
5286 build_zonelists(self); 5190 build_zonelists(self);
5287 } 5191 } else {
5288 5192 for_each_online_node(nid) {
5289 for_each_online_node(nid) { 5193 pg_data_t *pgdat = NODE_DATA(nid);
5290 pg_data_t *pgdat = NODE_DATA(nid);
5291
5292 build_zonelists(pgdat);
5293 }
5294 5194
5295 /* 5195 build_zonelists(pgdat);
5296 * Initialize the boot_pagesets that are going to be used 5196 }
5297 * for bootstrapping processors. The real pagesets for
5298 * each zone will be allocated later when the per cpu
5299 * allocator is available.
5300 *
5301 * boot_pagesets are used also for bootstrapping offline
5302 * cpus if the system is already booted because the pagesets
5303 * are needed to initialize allocators on a specific cpu too.
5304 * F.e. the percpu allocator needs the page allocator which
5305 * needs the percpu allocator in order to allocate its pagesets
5306 * (a chicken-egg dilemma).
5307 */
5308 for_each_possible_cpu(cpu) {
5309 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
5310 5197
5311#ifdef CONFIG_HAVE_MEMORYLESS_NODES 5198#ifdef CONFIG_HAVE_MEMORYLESS_NODES
5312 /* 5199 /*
@@ -5317,45 +5204,53 @@ static int __build_all_zonelists(void *data)
5317 * secondary cpus' numa_mem as they come on-line. During 5204 * secondary cpus' numa_mem as they come on-line. During
5318 * node/memory hotplug, we'll fixup all on-line cpus. 5205 * node/memory hotplug, we'll fixup all on-line cpus.
5319 */ 5206 */
5320 if (cpu_online(cpu)) 5207 for_each_online_cpu(cpu)
5321 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); 5208 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
5322#endif 5209#endif
5323 } 5210 }
5324 5211
5325 return 0; 5212 spin_unlock(&lock);
5326} 5213}
5327 5214
5328static noinline void __init 5215static noinline void __init
5329build_all_zonelists_init(void) 5216build_all_zonelists_init(void)
5330{ 5217{
5218 int cpu;
5219
5331 __build_all_zonelists(NULL); 5220 __build_all_zonelists(NULL);
5221
5222 /*
5223 * Initialize the boot_pagesets that are going to be used
5224 * for bootstrapping processors. The real pagesets for
5225 * each zone will be allocated later when the per cpu
5226 * allocator is available.
5227 *
5228 * boot_pagesets are used also for bootstrapping offline
5229 * cpus if the system is already booted because the pagesets
5230 * are needed to initialize allocators on a specific cpu too.
5231 * F.e. the percpu allocator needs the page allocator which
5232 * needs the percpu allocator in order to allocate its pagesets
5233 * (a chicken-egg dilemma).
5234 */
5235 for_each_possible_cpu(cpu)
5236 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
5237
5332 mminit_verify_zonelist(); 5238 mminit_verify_zonelist();
5333 cpuset_init_current_mems_allowed(); 5239 cpuset_init_current_mems_allowed();
5334} 5240}
5335 5241
5336/* 5242/*
5337 * Called with zonelists_mutex held always
5338 * unless system_state == SYSTEM_BOOTING. 5243 * unless system_state == SYSTEM_BOOTING.
5339 * 5244 *
5340 * __ref due to (1) call of __meminit annotated setup_zone_pageset 5245 * __ref due to call of __init annotated helper build_all_zonelists_init
5341 * [we're only called with non-NULL zone through __meminit paths] and
5342 * (2) call of __init annotated helper build_all_zonelists_init
5343 * [protected by SYSTEM_BOOTING]. 5246 * [protected by SYSTEM_BOOTING].
5344 */ 5247 */
5345void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) 5248void __ref build_all_zonelists(pg_data_t *pgdat)
5346{ 5249{
5347 set_zonelist_order();
5348
5349 if (system_state == SYSTEM_BOOTING) { 5250 if (system_state == SYSTEM_BOOTING) {
5350 build_all_zonelists_init(); 5251 build_all_zonelists_init();
5351 } else { 5252 } else {
5352#ifdef CONFIG_MEMORY_HOTPLUG 5253 __build_all_zonelists(pgdat);
5353 if (zone)
5354 setup_zone_pageset(zone);
5355#endif
5356 /* we have to stop all cpus to guarantee there is no user
5357 of zonelist */
5358 stop_machine_cpuslocked(__build_all_zonelists, pgdat, NULL);
5359 /* cpuset refresh routine should be here */ 5254 /* cpuset refresh routine should be here */
5360 } 5255 }
5361 vm_total_pages = nr_free_pagecache_pages(); 5256 vm_total_pages = nr_free_pagecache_pages();
@@ -5371,9 +5266,8 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
5371 else 5266 else
5372 page_group_by_mobility_disabled = 0; 5267 page_group_by_mobility_disabled = 0;
5373 5268
5374 pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n", 5269 pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n",
5375 nr_online_nodes, 5270 nr_online_nodes,
5376 zonelist_order_name[current_zonelist_order],
5377 page_group_by_mobility_disabled ? "off" : "on", 5271 page_group_by_mobility_disabled ? "off" : "on",
5378 vm_total_pages); 5272 vm_total_pages);
5379#ifdef CONFIG_NUMA 5273#ifdef CONFIG_NUMA
@@ -5627,7 +5521,7 @@ static void __meminit zone_pageset_init(struct zone *zone, int cpu)
5627 pageset_set_high_and_batch(zone, pcp); 5521 pageset_set_high_and_batch(zone, pcp);
5628} 5522}
5629 5523
5630static void __meminit setup_zone_pageset(struct zone *zone) 5524void __meminit setup_zone_pageset(struct zone *zone)
5631{ 5525{
5632 int cpu; 5526 int cpu;
5633 zone->pageset = alloc_percpu(struct per_cpu_pageset); 5527 zone->pageset = alloc_percpu(struct per_cpu_pageset);
@@ -7081,9 +6975,11 @@ static void __setup_per_zone_wmarks(void)
7081 */ 6975 */
7082void setup_per_zone_wmarks(void) 6976void setup_per_zone_wmarks(void)
7083{ 6977{
7084 mutex_lock(&zonelists_mutex); 6978 static DEFINE_SPINLOCK(lock);
6979
6980 spin_lock(&lock);
7085 __setup_per_zone_wmarks(); 6981 __setup_per_zone_wmarks();
7086 mutex_unlock(&zonelists_mutex); 6982 spin_unlock(&lock);
7087} 6983}
7088 6984
7089/* 6985/*
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 88ccc044b09a..32f18911deda 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -222,10 +222,7 @@ static void *__meminit alloc_page_ext(size_t size, int nid)
222 return addr; 222 return addr;
223 } 223 }
224 224
225 if (node_state(nid, N_HIGH_MEMORY)) 225 addr = vzalloc_node(size, nid);
226 addr = vzalloc_node(size, nid);
227 else
228 addr = vzalloc(size);
229 226
230 return addr; 227 return addr;
231} 228}
@@ -409,6 +406,7 @@ void __init page_ext_init(void)
409 continue; 406 continue;
410 if (init_section_page_ext(pfn, nid)) 407 if (init_section_page_ext(pfn, nid))
411 goto oom; 408 goto oom;
409 cond_resched();
412 } 410 }
413 } 411 }
414 hotplug_memory_notifier(page_ext_callback, 0); 412 hotplug_memory_notifier(page_ext_callback, 0);
diff --git a/mm/page_idle.c b/mm/page_idle.c
index 1b0f48c62316..4bd03a8d809e 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -204,7 +204,7 @@ static struct bin_attribute *page_idle_bin_attrs[] = {
204 NULL, 204 NULL,
205}; 205};
206 206
207static struct attribute_group page_idle_attr_group = { 207static const struct attribute_group page_idle_attr_group = {
208 .bin_attrs = page_idle_bin_attrs, 208 .bin_attrs = page_idle_bin_attrs,
209 .name = "page_idle", 209 .name = "page_idle",
210}; 210};
diff --git a/mm/page_io.c b/mm/page_io.c
index 5f61b54ee1f3..20139b90125a 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -28,16 +28,18 @@
28static struct bio *get_swap_bio(gfp_t gfp_flags, 28static struct bio *get_swap_bio(gfp_t gfp_flags,
29 struct page *page, bio_end_io_t end_io) 29 struct page *page, bio_end_io_t end_io)
30{ 30{
31 int i, nr = hpage_nr_pages(page);
31 struct bio *bio; 32 struct bio *bio;
32 33
33 bio = bio_alloc(gfp_flags, 1); 34 bio = bio_alloc(gfp_flags, nr);
34 if (bio) { 35 if (bio) {
35 bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev); 36 bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev);
36 bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9; 37 bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9;
37 bio->bi_end_io = end_io; 38 bio->bi_end_io = end_io;
38 39
39 bio_add_page(bio, page, PAGE_SIZE, 0); 40 for (i = 0; i < nr; i++)
40 BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE); 41 bio_add_page(bio, page + i, PAGE_SIZE, 0);
42 VM_BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE * nr);
41 } 43 }
42 return bio; 44 return bio;
43} 45}
@@ -262,6 +264,15 @@ static sector_t swap_page_sector(struct page *page)
262 return (sector_t)__page_file_index(page) << (PAGE_SHIFT - 9); 264 return (sector_t)__page_file_index(page) << (PAGE_SHIFT - 9);
263} 265}
264 266
267static inline void count_swpout_vm_event(struct page *page)
268{
269#ifdef CONFIG_TRANSPARENT_HUGEPAGE
270 if (unlikely(PageTransHuge(page)))
271 count_vm_event(THP_SWPOUT);
272#endif
273 count_vm_events(PSWPOUT, hpage_nr_pages(page));
274}
275
265int __swap_writepage(struct page *page, struct writeback_control *wbc, 276int __swap_writepage(struct page *page, struct writeback_control *wbc,
266 bio_end_io_t end_write_func) 277 bio_end_io_t end_write_func)
267{ 278{
@@ -313,7 +324,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
313 324
314 ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc); 325 ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc);
315 if (!ret) { 326 if (!ret) {
316 count_vm_event(PSWPOUT); 327 count_swpout_vm_event(page);
317 return 0; 328 return 0;
318 } 329 }
319 330
@@ -326,7 +337,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
326 goto out; 337 goto out;
327 } 338 }
328 bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); 339 bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
329 count_vm_event(PSWPOUT); 340 count_swpout_vm_event(page);
330 set_page_writeback(page); 341 set_page_writeback(page);
331 unlock_page(page); 342 unlock_page(page);
332 submit_bio(bio); 343 submit_bio(bio);
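
The bio is now sized by hpage_nr_pages(), so a transparent huge page can be submitted as a single write: with 4 KiB base pages a 2 MiB THP gives nr = 512, the loop adds 512 subpages for a 2 MiB bio, and count_swpout_vm_event() bumps THP_SWPOUT once while PSWPOUT grows by 512. For an ordinary page nr is 1 and the behaviour is unchanged.
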
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 0fd9dcf2c5dc..8e2d7137510c 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -30,6 +30,7 @@ DEFINE_STATIC_KEY_FALSE(page_owner_inited);
30 30
31static depot_stack_handle_t dummy_handle; 31static depot_stack_handle_t dummy_handle;
32static depot_stack_handle_t failure_handle; 32static depot_stack_handle_t failure_handle;
33static depot_stack_handle_t early_handle;
33 34
34static void init_early_allocated_pages(void); 35static void init_early_allocated_pages(void);
35 36
@@ -53,7 +54,7 @@ static bool need_page_owner(void)
53 return true; 54 return true;
54} 55}
55 56
56static noinline void register_dummy_stack(void) 57static __always_inline depot_stack_handle_t create_dummy_stack(void)
57{ 58{
58 unsigned long entries[4]; 59 unsigned long entries[4];
59 struct stack_trace dummy; 60 struct stack_trace dummy;
@@ -64,21 +65,22 @@ static noinline void register_dummy_stack(void)
64 dummy.skip = 0; 65 dummy.skip = 0;
65 66
66 save_stack_trace(&dummy); 67 save_stack_trace(&dummy);
67 dummy_handle = depot_save_stack(&dummy, GFP_KERNEL); 68 return depot_save_stack(&dummy, GFP_KERNEL);
68} 69}
69 70
70static noinline void register_failure_stack(void) 71static noinline void register_dummy_stack(void)
71{ 72{
72 unsigned long entries[4]; 73 dummy_handle = create_dummy_stack();
73 struct stack_trace failure; 74}
74 75
75 failure.nr_entries = 0; 76static noinline void register_failure_stack(void)
76 failure.max_entries = ARRAY_SIZE(entries); 77{
77 failure.entries = &entries[0]; 78 failure_handle = create_dummy_stack();
78 failure.skip = 0; 79}
79 80
80 save_stack_trace(&failure); 81static noinline void register_early_stack(void)
81 failure_handle = depot_save_stack(&failure, GFP_KERNEL); 82{
83 early_handle = create_dummy_stack();
82} 84}
83 85
84static void init_page_owner(void) 86static void init_page_owner(void)
@@ -88,6 +90,7 @@ static void init_page_owner(void)
88 90
89 register_dummy_stack(); 91 register_dummy_stack();
90 register_failure_stack(); 92 register_failure_stack();
93 register_early_stack();
91 static_branch_enable(&page_owner_inited); 94 static_branch_enable(&page_owner_inited);
92 init_early_allocated_pages(); 95 init_early_allocated_pages();
93} 96}
@@ -165,17 +168,13 @@ static noinline depot_stack_handle_t save_stack(gfp_t flags)
165 return handle; 168 return handle;
166} 169}
167 170
168noinline void __set_page_owner(struct page *page, unsigned int order, 171static inline void __set_page_owner_handle(struct page_ext *page_ext,
169 gfp_t gfp_mask) 172 depot_stack_handle_t handle, unsigned int order, gfp_t gfp_mask)
170{ 173{
171 struct page_ext *page_ext = lookup_page_ext(page);
172 struct page_owner *page_owner; 174 struct page_owner *page_owner;
173 175
174 if (unlikely(!page_ext))
175 return;
176
177 page_owner = get_page_owner(page_ext); 176 page_owner = get_page_owner(page_ext);
178 page_owner->handle = save_stack(gfp_mask); 177 page_owner->handle = handle;
179 page_owner->order = order; 178 page_owner->order = order;
180 page_owner->gfp_mask = gfp_mask; 179 page_owner->gfp_mask = gfp_mask;
181 page_owner->last_migrate_reason = -1; 180 page_owner->last_migrate_reason = -1;
@@ -183,6 +182,19 @@ noinline void __set_page_owner(struct page *page, unsigned int order,
183 __set_bit(PAGE_EXT_OWNER, &page_ext->flags); 182 __set_bit(PAGE_EXT_OWNER, &page_ext->flags);
184} 183}
185 184
185noinline void __set_page_owner(struct page *page, unsigned int order,
186 gfp_t gfp_mask)
187{
188 struct page_ext *page_ext = lookup_page_ext(page);
189 depot_stack_handle_t handle;
190
191 if (unlikely(!page_ext))
192 return;
193
194 handle = save_stack(gfp_mask);
195 __set_page_owner_handle(page_ext, handle, order, gfp_mask);
196}
197
186void __set_page_owner_migrate_reason(struct page *page, int reason) 198void __set_page_owner_migrate_reason(struct page *page, int reason)
187{ 199{
188 struct page_ext *page_ext = lookup_page_ext(page); 200 struct page_ext *page_ext = lookup_page_ext(page);
@@ -550,11 +562,17 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
550 continue; 562 continue;
551 563
552 /* 564 /*
553 * We are safe to check buddy flag and order, because 565 * To avoid having to grab zone->lock, be a little
554 * this is init stage and only single thread runs. 566 * careful when reading buddy page order. The only
567 * danger is that we skip too much and potentially miss
568 * some early allocated pages, which is better than
569 * heavy lock contention.
555 */ 570 */
556 if (PageBuddy(page)) { 571 if (PageBuddy(page)) {
557 pfn += (1UL << page_order(page)) - 1; 572 unsigned long order = page_order_unsafe(page);
573
574 if (order > 0 && order < MAX_ORDER)
575 pfn += (1UL << order) - 1;
558 continue; 576 continue;
559 } 577 }
560 578
@@ -565,14 +583,15 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
565 if (unlikely(!page_ext)) 583 if (unlikely(!page_ext))
566 continue; 584 continue;
567 585
568 /* Maybe overraping zone */ 586 /* Maybe overlapping zone */
569 if (test_bit(PAGE_EXT_OWNER, &page_ext->flags)) 587 if (test_bit(PAGE_EXT_OWNER, &page_ext->flags))
570 continue; 588 continue;
571 589
572 /* Found early allocated page */ 590 /* Found early allocated page */
573 set_page_owner(page, 0, 0); 591 __set_page_owner_handle(page_ext, early_handle, 0, 0);
574 count++; 592 count++;
575 } 593 }
594 cond_resched();
576 } 595 }
577 596
578 pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n", 597 pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n",
@@ -583,15 +602,12 @@ static void init_zones_in_node(pg_data_t *pgdat)
583{ 602{
584 struct zone *zone; 603 struct zone *zone;
585 struct zone *node_zones = pgdat->node_zones; 604 struct zone *node_zones = pgdat->node_zones;
586 unsigned long flags;
587 605
588 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { 606 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
589 if (!populated_zone(zone)) 607 if (!populated_zone(zone))
590 continue; 608 continue;
591 609
592 spin_lock_irqsave(&zone->lock, flags);
593 init_pages_in_zone(pgdat, zone); 610 init_pages_in_zone(pgdat, zone);
594 spin_unlock_irqrestore(&zone->lock, flags);
595 } 611 }
596} 612}
597 613
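
Because init_pages_in_zone() now runs without zone->lock, the buddy order it reads may be stale, so it is treated as a hint: page_order_unsafe() fetches it and the (0, MAX_ORDER) range check decides whether skipping a whole buddy block is safe. A standalone model of that validate-before-use step, assuming the common MAX_ORDER of 11:

    #include <stdio.h>

    #define MAX_ORDER 11   /* assumption: the usual default */

    /* Use a possibly-stale buddy order only if it is in the valid range. */
    static unsigned long pages_to_skip(unsigned long racy_order)
    {
        if (racy_order > 0 && racy_order < MAX_ORDER)
            return (1UL << racy_order) - 1;   /* skip the rest of the buddy block */
        return 0;                             /* garbage value: just advance one page */
    }

    int main(void)
    {
        printf("%lu %lu %lu\n",
               pages_to_skip(3),    /* 7 */
               pages_to_skip(0),    /* 0 */
               pages_to_skip(64));  /* 0 - out of range, treated as noise */
        return 0;
    }
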
diff --git a/mm/shmem.c b/mm/shmem.c
index fbcb3c96a186..ace53a582be5 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -34,6 +34,7 @@
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/uio.h> 35#include <linux/uio.h>
36#include <linux/khugepaged.h> 36#include <linux/khugepaged.h>
37#include <linux/hugetlb.h>
37 38
38#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */ 39#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
39 40
@@ -188,6 +189,38 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages)
188 vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE)); 189 vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
189} 190}
190 191
192static inline bool shmem_inode_acct_block(struct inode *inode, long pages)
193{
194 struct shmem_inode_info *info = SHMEM_I(inode);
195 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
196
197 if (shmem_acct_block(info->flags, pages))
198 return false;
199
200 if (sbinfo->max_blocks) {
201 if (percpu_counter_compare(&sbinfo->used_blocks,
202 sbinfo->max_blocks - pages) > 0)
203 goto unacct;
204 percpu_counter_add(&sbinfo->used_blocks, pages);
205 }
206
207 return true;
208
209unacct:
210 shmem_unacct_blocks(info->flags, pages);
211 return false;
212}
213
214static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
215{
216 struct shmem_inode_info *info = SHMEM_I(inode);
217 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
218
219 if (sbinfo->max_blocks)
220 percpu_counter_sub(&sbinfo->used_blocks, pages);
221 shmem_unacct_blocks(info->flags, pages);
222}
223
191static const struct super_operations shmem_ops; 224static const struct super_operations shmem_ops;
192static const struct address_space_operations shmem_aops; 225static const struct address_space_operations shmem_aops;
193static const struct file_operations shmem_file_operations; 226static const struct file_operations shmem_file_operations;
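
shmem_inode_acct_block()/shmem_inode_unacct_blocks() centralise the two-level accounting that used to be open-coded at every call site: charge the per-inode quota, then the filesystem-wide used_blocks counter, and roll back the first charge if the second fails. A standalone model of that charge-both-or-neither shape (counters and limits invented for the example):

    #include <stdbool.h>
    #include <stdio.h>

    static long quota_used, quota_max = 100;      /* per-"inode" quota */
    static long blocks_used, blocks_max = 1000;   /* per-"filesystem" block count */

    /* Charge both counters, or neither. */
    static bool acct_block(long pages)
    {
        if (quota_used + pages > quota_max)
            return false;
        quota_used += pages;
        if (blocks_used + pages > blocks_max) {
            quota_used -= pages;                  /* roll back the first charge */
            return false;
        }
        blocks_used += pages;
        return true;
    }

    static void unacct_blocks(long pages)
    {
        blocks_used -= pages;
        quota_used -= pages;
    }

    int main(void)
    {
        if (acct_block(8)) {
            /* ...later failure on this path... */
            unacct_blocks(8);
        }
        printf("%ld %ld\n", quota_used, blocks_used);  /* both back to 0 */
        return 0;
    }
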
@@ -249,23 +282,20 @@ static void shmem_recalc_inode(struct inode *inode)
249 282
250 freed = info->alloced - info->swapped - inode->i_mapping->nrpages; 283 freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
251 if (freed > 0) { 284 if (freed > 0) {
252 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
253 if (sbinfo->max_blocks)
254 percpu_counter_add(&sbinfo->used_blocks, -freed);
255 info->alloced -= freed; 285 info->alloced -= freed;
256 inode->i_blocks -= freed * BLOCKS_PER_PAGE; 286 inode->i_blocks -= freed * BLOCKS_PER_PAGE;
257 shmem_unacct_blocks(info->flags, freed); 287 shmem_inode_unacct_blocks(inode, freed);
258 } 288 }
259} 289}
260 290
261bool shmem_charge(struct inode *inode, long pages) 291bool shmem_charge(struct inode *inode, long pages)
262{ 292{
263 struct shmem_inode_info *info = SHMEM_I(inode); 293 struct shmem_inode_info *info = SHMEM_I(inode);
264 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
265 unsigned long flags; 294 unsigned long flags;
266 295
267 if (shmem_acct_block(info->flags, pages)) 296 if (!shmem_inode_acct_block(inode, pages))
268 return false; 297 return false;
298
269 spin_lock_irqsave(&info->lock, flags); 299 spin_lock_irqsave(&info->lock, flags);
270 info->alloced += pages; 300 info->alloced += pages;
271 inode->i_blocks += pages * BLOCKS_PER_PAGE; 301 inode->i_blocks += pages * BLOCKS_PER_PAGE;
@@ -273,26 +303,12 @@ bool shmem_charge(struct inode *inode, long pages)
273 spin_unlock_irqrestore(&info->lock, flags); 303 spin_unlock_irqrestore(&info->lock, flags);
274 inode->i_mapping->nrpages += pages; 304 inode->i_mapping->nrpages += pages;
275 305
276 if (!sbinfo->max_blocks)
277 return true;
278 if (percpu_counter_compare(&sbinfo->used_blocks,
279 sbinfo->max_blocks - pages) > 0) {
280 inode->i_mapping->nrpages -= pages;
281 spin_lock_irqsave(&info->lock, flags);
282 info->alloced -= pages;
283 shmem_recalc_inode(inode);
284 spin_unlock_irqrestore(&info->lock, flags);
285 shmem_unacct_blocks(info->flags, pages);
286 return false;
287 }
288 percpu_counter_add(&sbinfo->used_blocks, pages);
289 return true; 306 return true;
290} 307}
291 308
292void shmem_uncharge(struct inode *inode, long pages) 309void shmem_uncharge(struct inode *inode, long pages)
293{ 310{
294 struct shmem_inode_info *info = SHMEM_I(inode); 311 struct shmem_inode_info *info = SHMEM_I(inode);
295 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
296 unsigned long flags; 312 unsigned long flags;
297 313
298 spin_lock_irqsave(&info->lock, flags); 314 spin_lock_irqsave(&info->lock, flags);
@@ -301,9 +317,7 @@ void shmem_uncharge(struct inode *inode, long pages)
301 shmem_recalc_inode(inode); 317 shmem_recalc_inode(inode);
302 spin_unlock_irqrestore(&info->lock, flags); 318 spin_unlock_irqrestore(&info->lock, flags);
303 319
304 if (sbinfo->max_blocks) 320 shmem_inode_unacct_blocks(inode, pages);
305 percpu_counter_sub(&sbinfo->used_blocks, pages);
306 shmem_unacct_blocks(info->flags, pages);
307} 321}
308 322
309/* 323/*
@@ -1452,9 +1466,10 @@ static struct page *shmem_alloc_page(gfp_t gfp,
1452} 1466}
1453 1467
1454static struct page *shmem_alloc_and_acct_page(gfp_t gfp, 1468static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
1455 struct shmem_inode_info *info, struct shmem_sb_info *sbinfo, 1469 struct inode *inode,
1456 pgoff_t index, bool huge) 1470 pgoff_t index, bool huge)
1457{ 1471{
1472 struct shmem_inode_info *info = SHMEM_I(inode);
1458 struct page *page; 1473 struct page *page;
1459 int nr; 1474 int nr;
1460 int err = -ENOSPC; 1475 int err = -ENOSPC;
@@ -1463,14 +1478,8 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
1463 huge = false; 1478 huge = false;
1464 nr = huge ? HPAGE_PMD_NR : 1; 1479 nr = huge ? HPAGE_PMD_NR : 1;
1465 1480
1466 if (shmem_acct_block(info->flags, nr)) 1481 if (!shmem_inode_acct_block(inode, nr))
1467 goto failed; 1482 goto failed;
1468 if (sbinfo->max_blocks) {
1469 if (percpu_counter_compare(&sbinfo->used_blocks,
1470 sbinfo->max_blocks - nr) > 0)
1471 goto unacct;
1472 percpu_counter_add(&sbinfo->used_blocks, nr);
1473 }
1474 1483
1475 if (huge) 1484 if (huge)
1476 page = shmem_alloc_hugepage(gfp, info, index); 1485 page = shmem_alloc_hugepage(gfp, info, index);
@@ -1483,10 +1492,7 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
1483 } 1492 }
1484 1493
1485 err = -ENOMEM; 1494 err = -ENOMEM;
1486 if (sbinfo->max_blocks) 1495 shmem_inode_unacct_blocks(inode, nr);
1487 percpu_counter_add(&sbinfo->used_blocks, -nr);
1488unacct:
1489 shmem_unacct_blocks(info->flags, nr);
1490failed: 1496failed:
1491 return ERR_PTR(err); 1497 return ERR_PTR(err);
1492} 1498}
@@ -1644,7 +1650,7 @@ repeat:
1644 1650
1645 if (swap.val) { 1651 if (swap.val) {
1646 /* Look it up and read it in.. */ 1652 /* Look it up and read it in.. */
1647 page = lookup_swap_cache(swap); 1653 page = lookup_swap_cache(swap, NULL, 0);
1648 if (!page) { 1654 if (!page) {
1649 /* Or update major stats only when swapin succeeds?? */ 1655 /* Or update major stats only when swapin succeeds?? */
1650 if (fault_type) { 1656 if (fault_type) {
@@ -1751,10 +1757,9 @@ repeat:
1751 } 1757 }
1752 1758
1753alloc_huge: 1759alloc_huge:
1754 page = shmem_alloc_and_acct_page(gfp, info, sbinfo, 1760 page = shmem_alloc_and_acct_page(gfp, inode, index, true);
1755 index, true);
1756 if (IS_ERR(page)) { 1761 if (IS_ERR(page)) {
1757alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, info, sbinfo, 1762alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode,
1758 index, false); 1763 index, false);
1759 } 1764 }
1760 if (IS_ERR(page)) { 1765 if (IS_ERR(page)) {
@@ -1876,10 +1881,7 @@ clear:
1876 * Error recovery. 1881 * Error recovery.
1877 */ 1882 */
1878unacct: 1883unacct:
1879 if (sbinfo->max_blocks) 1884 shmem_inode_unacct_blocks(inode, 1 << compound_order(page));
1880 percpu_counter_sub(&sbinfo->used_blocks,
1881 1 << compound_order(page));
1882 shmem_unacct_blocks(info->flags, 1 << compound_order(page));
1883 1885
1884 if (PageTransHuge(page)) { 1886 if (PageTransHuge(page)) {
1885 unlock_page(page); 1887 unlock_page(page);
@@ -2206,16 +2208,16 @@ bool shmem_mapping(struct address_space *mapping)
2206 return mapping->a_ops == &shmem_aops; 2208 return mapping->a_ops == &shmem_aops;
2207} 2209}
2208 2210
2209int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, 2211static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
2210 pmd_t *dst_pmd, 2212 pmd_t *dst_pmd,
2211 struct vm_area_struct *dst_vma, 2213 struct vm_area_struct *dst_vma,
2212 unsigned long dst_addr, 2214 unsigned long dst_addr,
2213 unsigned long src_addr, 2215 unsigned long src_addr,
2214 struct page **pagep) 2216 bool zeropage,
2217 struct page **pagep)
2215{ 2218{
2216 struct inode *inode = file_inode(dst_vma->vm_file); 2219 struct inode *inode = file_inode(dst_vma->vm_file);
2217 struct shmem_inode_info *info = SHMEM_I(inode); 2220 struct shmem_inode_info *info = SHMEM_I(inode);
2218 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
2219 struct address_space *mapping = inode->i_mapping; 2221 struct address_space *mapping = inode->i_mapping;
2220 gfp_t gfp = mapping_gfp_mask(mapping); 2222 gfp_t gfp = mapping_gfp_mask(mapping);
2221 pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); 2223 pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
@@ -2227,33 +2229,30 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
2227 int ret; 2229 int ret;
2228 2230
2229 ret = -ENOMEM; 2231 ret = -ENOMEM;
2230 if (shmem_acct_block(info->flags, 1)) 2232 if (!shmem_inode_acct_block(inode, 1))
2231 goto out; 2233 goto out;
2232 if (sbinfo->max_blocks) {
2233 if (percpu_counter_compare(&sbinfo->used_blocks,
2234 sbinfo->max_blocks) >= 0)
2235 goto out_unacct_blocks;
2236 percpu_counter_inc(&sbinfo->used_blocks);
2237 }
2238 2234
2239 if (!*pagep) { 2235 if (!*pagep) {
2240 page = shmem_alloc_page(gfp, info, pgoff); 2236 page = shmem_alloc_page(gfp, info, pgoff);
2241 if (!page) 2237 if (!page)
2242 goto out_dec_used_blocks; 2238 goto out_unacct_blocks;
2243 2239
2244 page_kaddr = kmap_atomic(page); 2240 if (!zeropage) { /* mcopy_atomic */
2245 ret = copy_from_user(page_kaddr, (const void __user *)src_addr, 2241 page_kaddr = kmap_atomic(page);
2246 PAGE_SIZE); 2242 ret = copy_from_user(page_kaddr,
2247 kunmap_atomic(page_kaddr); 2243 (const void __user *)src_addr,
2248 2244 PAGE_SIZE);
2249 /* fallback to copy_from_user outside mmap_sem */ 2245 kunmap_atomic(page_kaddr);
2250 if (unlikely(ret)) { 2246
2251 *pagep = page; 2247 /* fallback to copy_from_user outside mmap_sem */
2252 if (sbinfo->max_blocks) 2248 if (unlikely(ret)) {
2253 percpu_counter_add(&sbinfo->used_blocks, -1); 2249 *pagep = page;
2254 shmem_unacct_blocks(info->flags, 1); 2250 shmem_inode_unacct_blocks(inode, 1);
2255 /* don't free the page */ 2251 /* don't free the page */
2256 return -EFAULT; 2252 return -EFAULT;
2253 }
2254 } else { /* mfill_zeropage_atomic */
2255 clear_highpage(page);
2257 } 2256 }
2258 } else { 2257 } else {
2259 page = *pagep; 2258 page = *pagep;
@@ -2314,14 +2313,33 @@ out_release_uncharge:
2314out_release: 2313out_release:
2315 unlock_page(page); 2314 unlock_page(page);
2316 put_page(page); 2315 put_page(page);
2317out_dec_used_blocks:
2318 if (sbinfo->max_blocks)
2319 percpu_counter_add(&sbinfo->used_blocks, -1);
2320out_unacct_blocks: 2316out_unacct_blocks:
2321 shmem_unacct_blocks(info->flags, 1); 2317 shmem_inode_unacct_blocks(inode, 1);
2322 goto out; 2318 goto out;
2323} 2319}
2324 2320
2321int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
2322 pmd_t *dst_pmd,
2323 struct vm_area_struct *dst_vma,
2324 unsigned long dst_addr,
2325 unsigned long src_addr,
2326 struct page **pagep)
2327{
2328 return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
2329 dst_addr, src_addr, false, pagep);
2330}
2331
2332int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
2333 pmd_t *dst_pmd,
2334 struct vm_area_struct *dst_vma,
2335 unsigned long dst_addr)
2336{
2337 struct page *page = NULL;
2338
2339 return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
2340 dst_addr, 0, true, &page);
2341}
2342
2325#ifdef CONFIG_TMPFS 2343#ifdef CONFIG_TMPFS
2326static const struct inode_operations shmem_symlink_inode_operations; 2344static const struct inode_operations shmem_symlink_inode_operations;
2327static const struct inode_operations shmem_short_symlink_operations; 2345static const struct inode_operations shmem_short_symlink_operations;
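
shmem_mfill_zeropage_pte() is the shmem/tmpfs backend that lets userfaultfd resolve a missing-page fault with zeroes instead of a copied page. A hedged userspace fragment of the resolving side, assuming the uffd descriptor is already registered for the faulting range with UFFDIO_REGISTER_MODE_MISSING:

    #include <linux/userfaultfd.h>
    #include <sys/ioctl.h>
    #include <stdio.h>

    /* Resolve a fault at page-aligned 'addr' with zeroes. */
    static int resolve_with_zeropage(int uffd, unsigned long addr, unsigned long len)
    {
        struct uffdio_zeropage zp = {
            .range = { .start = addr, .len = len },
            .mode  = 0,
        };

        if (ioctl(uffd, UFFDIO_ZEROPAGE, &zp) == -1) {
            perror("UFFDIO_ZEROPAGE");
            return -1;
        }
        return 0;
    }
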
@@ -3635,7 +3653,7 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
3635#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) 3653#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
3636#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) 3654#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
3637 3655
3638#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING) 3656#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
3639 3657
3640SYSCALL_DEFINE2(memfd_create, 3658SYSCALL_DEFINE2(memfd_create,
3641 const char __user *, uname, 3659 const char __user *, uname,
@@ -3647,8 +3665,18 @@ SYSCALL_DEFINE2(memfd_create,
3647 char *name; 3665 char *name;
3648 long len; 3666 long len;
3649 3667
3650 if (flags & ~(unsigned int)MFD_ALL_FLAGS) 3668 if (!(flags & MFD_HUGETLB)) {
3651 return -EINVAL; 3669 if (flags & ~(unsigned int)MFD_ALL_FLAGS)
3670 return -EINVAL;
3671 } else {
3672 /* Sealing not supported in hugetlbfs (MFD_HUGETLB) */
3673 if (flags & MFD_ALLOW_SEALING)
3674 return -EINVAL;
3675 /* Allow huge page size encoding in flags. */
3676 if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
3677 (MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
3678 return -EINVAL;
3679 }
3652 3680
3653 /* length includes terminating zero */ 3681 /* length includes terminating zero */
3654 len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); 3682 len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
@@ -3679,16 +3707,30 @@ SYSCALL_DEFINE2(memfd_create,
3679 goto err_name; 3707 goto err_name;
3680 } 3708 }
3681 3709
3682 file = shmem_file_setup(name, 0, VM_NORESERVE); 3710 if (flags & MFD_HUGETLB) {
3711 struct user_struct *user = NULL;
3712
3713 file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user,
3714 HUGETLB_ANONHUGE_INODE,
3715 (flags >> MFD_HUGE_SHIFT) &
3716 MFD_HUGE_MASK);
3717 } else
3718 file = shmem_file_setup(name, 0, VM_NORESERVE);
3683 if (IS_ERR(file)) { 3719 if (IS_ERR(file)) {
3684 error = PTR_ERR(file); 3720 error = PTR_ERR(file);
3685 goto err_fd; 3721 goto err_fd;
3686 } 3722 }
3687 info = SHMEM_I(file_inode(file));
3688 file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; 3723 file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
3689 file->f_flags |= O_RDWR | O_LARGEFILE; 3724 file->f_flags |= O_RDWR | O_LARGEFILE;
3690 if (flags & MFD_ALLOW_SEALING) 3725
3726 if (flags & MFD_ALLOW_SEALING) {
3727 /*
3728 * flags check at beginning of function ensures
3729 * this is not a hugetlbfs (MFD_HUGETLB) file.
3730 */
3731 info = SHMEM_I(file_inode(file));
3691 info->seals &= ~F_SEAL_SEAL; 3732 info->seals &= ~F_SEAL_SEAL;
3733 }
3692 3734
3693 fd_install(fd, file); 3735 fd_install(fd, file);
3694 kfree(name); 3736 kfree(name);
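The memfd_create() changes above let userspace ask for a hugetlbfs-backed descriptor: MFD_HUGETLB is accepted, sealing is rejected for it, and a huge page size may be encoded in the bits covered by MFD_HUGE_MASK << MFD_HUGE_SHIFT. A minimal userspace sketch of the new path follows; it assumes a kernel with this series applied, a uapi header exporting MFD_CLOEXEC and MFD_HUGETLB, and a default huge page size of 2 MB, so it is illustrative rather than authoritative.

/*
 * Sketch: create an anonymous hugetlbfs file through memfd_create().
 * MFD_ALLOW_SEALING combined with MFD_HUGETLB would fail with EINVAL,
 * as enforced by the flag check added above.
 */
#define _GNU_SOURCE
#include <linux/memfd.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	int fd = syscall(SYS_memfd_create, "hugedemo",
			 MFD_CLOEXEC | MFD_HUGETLB);

	if (fd < 0) {
		perror("memfd_create");
		return 1;
	}
	/* Size must be a multiple of the huge page size (2 MB assumed here). */
	if (ftruncate(fd, 2UL << 20) < 0)
		perror("ftruncate");
	close(fd);
	return 0;
}

Because sealing is refused for MFD_HUGETLB files, the F_SEAL_SEAL handling later in the function only ever runs for tmpfs-backed descriptors, which is why the SHMEM_I() lookup could be moved under the MFD_ALLOW_SEALING branch.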
diff --git a/mm/slub.c b/mm/slub.c
index e8b4e31162ca..ddb04576b342 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -34,6 +34,7 @@
34#include <linux/stacktrace.h> 34#include <linux/stacktrace.h>
35#include <linux/prefetch.h> 35#include <linux/prefetch.h>
36#include <linux/memcontrol.h> 36#include <linux/memcontrol.h>
37#include <linux/random.h>
37 38
38#include <trace/events/kmem.h> 39#include <trace/events/kmem.h>
39 40
@@ -238,30 +239,62 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si)
238 * Core slab cache functions 239 * Core slab cache functions
239 *******************************************************************/ 240 *******************************************************************/
240 241
242/*
243 * Returns freelist pointer (ptr). With hardening, this is obfuscated
244 * with an XOR of the address where the pointer is held and a per-cache
245 * random number.
246 */
247static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr,
248 unsigned long ptr_addr)
249{
250#ifdef CONFIG_SLAB_FREELIST_HARDENED
251 return (void *)((unsigned long)ptr ^ s->random ^ ptr_addr);
252#else
253 return ptr;
254#endif
255}
256
257/* Returns the freelist pointer recorded at location ptr_addr. */
258static inline void *freelist_dereference(const struct kmem_cache *s,
259 void *ptr_addr)
260{
261 return freelist_ptr(s, (void *)*(unsigned long *)(ptr_addr),
262 (unsigned long)ptr_addr);
263}
264
241static inline void *get_freepointer(struct kmem_cache *s, void *object) 265static inline void *get_freepointer(struct kmem_cache *s, void *object)
242{ 266{
243 return *(void **)(object + s->offset); 267 return freelist_dereference(s, object + s->offset);
244} 268}
245 269
246static void prefetch_freepointer(const struct kmem_cache *s, void *object) 270static void prefetch_freepointer(const struct kmem_cache *s, void *object)
247{ 271{
248 prefetch(object + s->offset); 272 if (object)
273 prefetch(freelist_dereference(s, object + s->offset));
249} 274}
250 275
251static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) 276static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
252{ 277{
278 unsigned long freepointer_addr;
253 void *p; 279 void *p;
254 280
255 if (!debug_pagealloc_enabled()) 281 if (!debug_pagealloc_enabled())
256 return get_freepointer(s, object); 282 return get_freepointer(s, object);
257 283
258 probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p)); 284 freepointer_addr = (unsigned long)object + s->offset;
259 return p; 285 probe_kernel_read(&p, (void **)freepointer_addr, sizeof(p));
286 return freelist_ptr(s, p, freepointer_addr);
260} 287}
261 288
262static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) 289static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
263{ 290{
264 *(void **)(object + s->offset) = fp; 291 unsigned long freeptr_addr = (unsigned long)object + s->offset;
292
293#ifdef CONFIG_SLAB_FREELIST_HARDENED
294 BUG_ON(object == fp); /* naive detection of double free or corruption */
295#endif
296
297 *(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr);
265} 298}
266 299
267/* Loop over all objects in a slab */ 300/* Loop over all objects in a slab */
@@ -3358,8 +3391,8 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
3358 struct kmem_cache_node *n; 3391 struct kmem_cache_node *n;
3359 3392
3360 for_each_kmem_cache_node(s, node, n) { 3393 for_each_kmem_cache_node(s, node, n) {
3361 kmem_cache_free(kmem_cache_node, n);
3362 s->node[node] = NULL; 3394 s->node[node] = NULL;
3395 kmem_cache_free(kmem_cache_node, n);
3363 } 3396 }
3364} 3397}
3365 3398
@@ -3389,8 +3422,8 @@ static int init_kmem_cache_nodes(struct kmem_cache *s)
3389 return 0; 3422 return 0;
3390 } 3423 }
3391 3424
3392 s->node[node] = n;
3393 init_kmem_cache_node(n); 3425 init_kmem_cache_node(n);
3426 s->node[node] = n;
3394 } 3427 }
3395 return 1; 3428 return 1;
3396} 3429}
@@ -3563,6 +3596,9 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
3563{ 3596{
3564 s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); 3597 s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
3565 s->reserved = 0; 3598 s->reserved = 0;
3599#ifdef CONFIG_SLAB_FREELIST_HARDENED
3600 s->random = get_random_long();
3601#endif
3566 3602
3567 if (need_reserve_slab_rcu && (s->flags & SLAB_TYPESAFE_BY_RCU)) 3603 if (need_reserve_slab_rcu && (s->flags & SLAB_TYPESAFE_BY_RCU))
3568 s->reserved = sizeof(struct rcu_head); 3604 s->reserved = sizeof(struct rcu_head);
@@ -5423,7 +5459,7 @@ static struct attribute *slab_attrs[] = {
5423 NULL 5459 NULL
5424}; 5460};
5425 5461
5426static struct attribute_group slab_attr_group = { 5462static const struct attribute_group slab_attr_group = {
5427 .attrs = slab_attrs, 5463 .attrs = slab_attrs,
5428}; 5464};
5429 5465
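The comment on freelist_ptr() above explains CONFIG_SLAB_FREELIST_HARDENED: the next-free pointer stored inside a free object is XORed with a per-cache random value (s->random) and with the address where it is stored, so a leaked or overwritten freelist entry is much harder to exploit. A standalone sketch of just that transform, which is its own inverse, is:

/* Minimal userspace sketch of the SLUB freelist pointer obfuscation. */
#include <assert.h>
#include <stdint.h>

static uintptr_t cache_random;	/* stands in for s->random (get_random_long()) */

/* XOR with the secret and the storage address; applying it twice decodes. */
static void *freelist_ptr(void *ptr, uintptr_t ptr_addr)
{
	return (void *)((uintptr_t)ptr ^ cache_random ^ ptr_addr);
}

int main(void)
{
	void *slot[2];
	void *next = &slot[1];

	cache_random = (uintptr_t)0x9e3779b97f4a7c15ull;	/* fixed for the demo */

	/* set_freepointer(): store the obfuscated value at &slot[0]. */
	slot[0] = freelist_ptr(next, (uintptr_t)&slot[0]);

	/* get_freepointer(): applying the same XOR recovers the real pointer. */
	assert(freelist_ptr(slot[0], (uintptr_t)&slot[0]) == next);
	return 0;
}

The BUG_ON(object == fp) added to set_freepointer() is the cheap double-free check this layout makes possible: freeing the same object twice in a row would make the object's freelist pointer refer to itself.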
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index c50b1a14d55e..d1a39b8051e0 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -54,14 +54,9 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
54 if (slab_is_available()) { 54 if (slab_is_available()) {
55 struct page *page; 55 struct page *page;
56 56
57 if (node_state(node, N_HIGH_MEMORY)) 57 page = alloc_pages_node(node,
58 page = alloc_pages_node( 58 GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL,
59 node, GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL, 59 get_order(size));
60 get_order(size));
61 else
62 page = alloc_pages(
63 GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL,
64 get_order(size));
65 if (page) 60 if (page)
66 return page_address(page); 61 return page_address(page);
67 return NULL; 62 return NULL;
diff --git a/mm/sparse.c b/mm/sparse.c
index 7b4be3fd5cac..a9783acf2bb9 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -65,14 +65,10 @@ static noinline struct mem_section __ref *sparse_index_alloc(int nid)
65 unsigned long array_size = SECTIONS_PER_ROOT * 65 unsigned long array_size = SECTIONS_PER_ROOT *
66 sizeof(struct mem_section); 66 sizeof(struct mem_section);
67 67
68 if (slab_is_available()) { 68 if (slab_is_available())
69 if (node_state(nid, N_HIGH_MEMORY)) 69 section = kzalloc_node(array_size, GFP_KERNEL, nid);
70 section = kzalloc_node(array_size, GFP_KERNEL, nid); 70 else
71 else
72 section = kzalloc(array_size, GFP_KERNEL);
73 } else {
74 section = memblock_virt_alloc_node(array_size, nid); 71 section = memblock_virt_alloc_node(array_size, nid);
75 }
76 72
77 return section; 73 return section;
78} 74}
diff --git a/mm/swap.c b/mm/swap.c
index 60b1d2a75852..62d96b8e5eb3 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -946,28 +946,34 @@ void pagevec_remove_exceptionals(struct pagevec *pvec)
946} 946}
947 947
948/** 948/**
949 * pagevec_lookup - gang pagecache lookup 949 * pagevec_lookup_range - gang pagecache lookup
950 * @pvec: Where the resulting pages are placed 950 * @pvec: Where the resulting pages are placed
951 * @mapping: The address_space to search 951 * @mapping: The address_space to search
952 * @start: The starting page index 952 * @start: The starting page index
953 * @end: The final page index
953 * @nr_pages: The maximum number of pages 954 * @nr_pages: The maximum number of pages
954 * 955 *
955 * pagevec_lookup() will search for and return a group of up to @nr_pages pages 956 * pagevec_lookup_range() will search for and return a group of up to @nr_pages
956 * in the mapping. The pages are placed in @pvec. pagevec_lookup() takes a 957 * pages in the mapping starting from index @start and upto index @end
958 * (inclusive). The pages are placed in @pvec. pagevec_lookup() takes a
957 * reference against the pages in @pvec. 959 * reference against the pages in @pvec.
958 * 960 *
959 * The search returns a group of mapping-contiguous pages with ascending 961 * The search returns a group of mapping-contiguous pages with ascending
960 * indexes. There may be holes in the indices due to not-present pages. 962 * indexes. There may be holes in the indices due to not-present pages. We
963 * also update @start to index the next page for the traversal.
961 * 964 *
962 * pagevec_lookup() returns the number of pages which were found. 965 * pagevec_lookup_range() returns the number of pages which were found. If this
966 * number is smaller than @nr_pages, the end of specified range has been
967 * reached.
963 */ 968 */
964unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, 969unsigned pagevec_lookup_range(struct pagevec *pvec,
965 pgoff_t start, unsigned nr_pages) 970 struct address_space *mapping, pgoff_t *start, pgoff_t end)
966{ 971{
967 pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages); 972 pvec->nr = find_get_pages_range(mapping, start, end, PAGEVEC_SIZE,
973 pvec->pages);
968 return pagevec_count(pvec); 974 return pagevec_count(pvec);
969} 975}
970EXPORT_SYMBOL(pagevec_lookup); 976EXPORT_SYMBOL(pagevec_lookup_range);
971 977
972unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, 978unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
973 pgoff_t *index, int tag, unsigned nr_pages) 979 pgoff_t *index, int tag, unsigned nr_pages)
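The reworked kernel-doc above states that pagevec_lookup_range() walks [@start, @end], advances @start past the pages it returned, and signals the end of the range by returning fewer pages than fit in the pagevec. A hypothetical in-tree caller (a sketch under those documented semantics, not code from this patch) would loop like this:

/* Sketch: count the pages cached in [index, end] of a mapping. */
#include <linux/pagevec.h>
#include <linux/pagemap.h>

static unsigned long count_cached_pages(struct address_space *mapping,
					pgoff_t index, pgoff_t end)
{
	struct pagevec pvec;
	unsigned long count = 0;
	unsigned nr;

	pagevec_init(&pvec, 0);
	/* pagevec_lookup_range() advances @index past the pages it returned. */
	while ((nr = pagevec_lookup_range(&pvec, mapping, &index, end))) {
		count += nr;
		pagevec_release(&pvec);	/* drop the references taken on each page */
		if (nr < PAGEVEC_SIZE)	/* short return: end of range reached */
			break;
	}
	return count;
}

The @start update is what lets such a loop move forward without recomputing the next index from the last page it saw.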
diff --git a/mm/swap_state.c b/mm/swap_state.c
index b68c93014f50..71ce2d1ccbf7 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -37,6 +37,29 @@ static const struct address_space_operations swap_aops = {
37 37
38struct address_space *swapper_spaces[MAX_SWAPFILES]; 38struct address_space *swapper_spaces[MAX_SWAPFILES];
39static unsigned int nr_swapper_spaces[MAX_SWAPFILES]; 39static unsigned int nr_swapper_spaces[MAX_SWAPFILES];
40bool swap_vma_readahead = true;
41
42#define SWAP_RA_MAX_ORDER_DEFAULT 3
43
44static int swap_ra_max_order = SWAP_RA_MAX_ORDER_DEFAULT;
45
46#define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2)
47#define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1)
48#define SWAP_RA_HITS_MAX SWAP_RA_HITS_MASK
49#define SWAP_RA_WIN_MASK (~PAGE_MASK & ~SWAP_RA_HITS_MASK)
50
51#define SWAP_RA_HITS(v) ((v) & SWAP_RA_HITS_MASK)
52#define SWAP_RA_WIN(v) (((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
53#define SWAP_RA_ADDR(v) ((v) & PAGE_MASK)
54
55#define SWAP_RA_VAL(addr, win, hits) \
56 (((addr) & PAGE_MASK) | \
57 (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) | \
58 ((hits) & SWAP_RA_HITS_MASK))
59
60/* Initial readahead hits is 4 to start up with a small window */
61#define GET_SWAP_RA_VAL(vma) \
62 (atomic_long_read(&(vma)->swap_readahead_info) ? : 4)
40 63
41#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) 64#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0)
42#define ADD_CACHE_INFO(x, nr) do { swap_cache_info.x += (nr); } while (0) 65#define ADD_CACHE_INFO(x, nr) do { swap_cache_info.x += (nr); } while (0)
@@ -297,19 +320,36 @@ void free_pages_and_swap_cache(struct page **pages, int nr)
297 * lock getting page table operations atomic even if we drop the page 320 * lock getting page table operations atomic even if we drop the page
298 * lock before returning. 321 * lock before returning.
299 */ 322 */
300struct page * lookup_swap_cache(swp_entry_t entry) 323struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
324 unsigned long addr)
301{ 325{
302 struct page *page; 326 struct page *page;
327 unsigned long ra_info;
328 int win, hits, readahead;
303 329
304 page = find_get_page(swap_address_space(entry), swp_offset(entry)); 330 page = find_get_page(swap_address_space(entry), swp_offset(entry));
305 331
306 if (page && likely(!PageTransCompound(page))) { 332 INC_CACHE_INFO(find_total);
333 if (page) {
307 INC_CACHE_INFO(find_success); 334 INC_CACHE_INFO(find_success);
308 if (TestClearPageReadahead(page)) 335 if (unlikely(PageTransCompound(page)))
309 atomic_inc(&swapin_readahead_hits); 336 return page;
337 readahead = TestClearPageReadahead(page);
338 if (vma) {
339 ra_info = GET_SWAP_RA_VAL(vma);
340 win = SWAP_RA_WIN(ra_info);
341 hits = SWAP_RA_HITS(ra_info);
342 if (readahead)
343 hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
344 atomic_long_set(&vma->swap_readahead_info,
345 SWAP_RA_VAL(addr, win, hits));
346 }
347 if (readahead) {
348 count_vm_event(SWAP_RA_HIT);
349 if (!vma)
350 atomic_inc(&swapin_readahead_hits);
351 }
310 } 352 }
311
312 INC_CACHE_INFO(find_total);
313 return page; 353 return page;
314} 354}
315 355
@@ -424,22 +464,20 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
424 return retpage; 464 return retpage;
425} 465}
426 466
427static unsigned long swapin_nr_pages(unsigned long offset) 467static unsigned int __swapin_nr_pages(unsigned long prev_offset,
468 unsigned long offset,
469 int hits,
470 int max_pages,
471 int prev_win)
428{ 472{
429 static unsigned long prev_offset; 473 unsigned int pages, last_ra;
430 unsigned int pages, max_pages, last_ra;
431 static atomic_t last_readahead_pages;
432
433 max_pages = 1 << READ_ONCE(page_cluster);
434 if (max_pages <= 1)
435 return 1;
436 474
437 /* 475 /*
438 * This heuristic has been found to work well on both sequential and 476 * This heuristic has been found to work well on both sequential and
439 * random loads, swapping to hard disk or to SSD: please don't ask 477 * random loads, swapping to hard disk or to SSD: please don't ask
440 * what the "+ 2" means, it just happens to work well, that's all. 478 * what the "+ 2" means, it just happens to work well, that's all.
441 */ 479 */
442 pages = atomic_xchg(&swapin_readahead_hits, 0) + 2; 480 pages = hits + 2;
443 if (pages == 2) { 481 if (pages == 2) {
444 /* 482 /*
445 * We can have no readahead hits to judge by: but must not get 483 * We can have no readahead hits to judge by: but must not get
@@ -448,7 +486,6 @@ static unsigned long swapin_nr_pages(unsigned long offset)
448 */ 486 */
449 if (offset != prev_offset + 1 && offset != prev_offset - 1) 487 if (offset != prev_offset + 1 && offset != prev_offset - 1)
450 pages = 1; 488 pages = 1;
451 prev_offset = offset;
452 } else { 489 } else {
453 unsigned int roundup = 4; 490 unsigned int roundup = 4;
454 while (roundup < pages) 491 while (roundup < pages)
@@ -460,9 +497,28 @@ static unsigned long swapin_nr_pages(unsigned long offset)
460 pages = max_pages; 497 pages = max_pages;
461 498
462 /* Don't shrink readahead too fast */ 499 /* Don't shrink readahead too fast */
463 last_ra = atomic_read(&last_readahead_pages) / 2; 500 last_ra = prev_win / 2;
464 if (pages < last_ra) 501 if (pages < last_ra)
465 pages = last_ra; 502 pages = last_ra;
503
504 return pages;
505}
506
507static unsigned long swapin_nr_pages(unsigned long offset)
508{
509 static unsigned long prev_offset;
510 unsigned int hits, pages, max_pages;
511 static atomic_t last_readahead_pages;
512
513 max_pages = 1 << READ_ONCE(page_cluster);
514 if (max_pages <= 1)
515 return 1;
516
517 hits = atomic_xchg(&swapin_readahead_hits, 0);
518 pages = __swapin_nr_pages(prev_offset, offset, hits, max_pages,
519 atomic_read(&last_readahead_pages));
520 if (!hits)
521 prev_offset = offset;
466 atomic_set(&last_readahead_pages, pages); 522 atomic_set(&last_readahead_pages, pages);
467 523
468 return pages; 524 return pages;
@@ -496,7 +552,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
496 unsigned long start_offset, end_offset; 552 unsigned long start_offset, end_offset;
497 unsigned long mask; 553 unsigned long mask;
498 struct blk_plug plug; 554 struct blk_plug plug;
499 bool do_poll = true; 555 bool do_poll = true, page_allocated;
500 556
501 mask = swapin_nr_pages(offset) - 1; 557 mask = swapin_nr_pages(offset) - 1;
502 if (!mask) 558 if (!mask)
@@ -512,12 +568,19 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
512 blk_start_plug(&plug); 568 blk_start_plug(&plug);
513 for (offset = start_offset; offset <= end_offset ; offset++) { 569 for (offset = start_offset; offset <= end_offset ; offset++) {
514 /* Ok, do the async read-ahead now */ 570 /* Ok, do the async read-ahead now */
515 page = read_swap_cache_async(swp_entry(swp_type(entry), offset), 571 page = __read_swap_cache_async(
516 gfp_mask, vma, addr, false); 572 swp_entry(swp_type(entry), offset),
573 gfp_mask, vma, addr, &page_allocated);
517 if (!page) 574 if (!page)
518 continue; 575 continue;
519 if (offset != entry_offset && likely(!PageTransCompound(page))) 576 if (page_allocated) {
520 SetPageReadahead(page); 577 swap_readpage(page, false);
578 if (offset != entry_offset &&
579 likely(!PageTransCompound(page))) {
580 SetPageReadahead(page);
581 count_vm_event(SWAP_RA);
582 }
583 }
521 put_page(page); 584 put_page(page);
522 } 585 }
523 blk_finish_plug(&plug); 586 blk_finish_plug(&plug);
@@ -561,3 +624,210 @@ void exit_swap_address_space(unsigned int type)
561 synchronize_rcu(); 624 synchronize_rcu();
562 kvfree(spaces); 625 kvfree(spaces);
563} 626}
627
628static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
629 unsigned long faddr,
630 unsigned long lpfn,
631 unsigned long rpfn,
632 unsigned long *start,
633 unsigned long *end)
634{
635 *start = max3(lpfn, PFN_DOWN(vma->vm_start),
636 PFN_DOWN(faddr & PMD_MASK));
637 *end = min3(rpfn, PFN_DOWN(vma->vm_end),
638 PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
639}
640
641struct page *swap_readahead_detect(struct vm_fault *vmf,
642 struct vma_swap_readahead *swap_ra)
643{
644 struct vm_area_struct *vma = vmf->vma;
645 unsigned long swap_ra_info;
646 struct page *page;
647 swp_entry_t entry;
648 unsigned long faddr, pfn, fpfn;
649 unsigned long start, end;
650 pte_t *pte;
651 unsigned int max_win, hits, prev_win, win, left;
652#ifndef CONFIG_64BIT
653 pte_t *tpte;
654#endif
655
656 faddr = vmf->address;
657 entry = pte_to_swp_entry(vmf->orig_pte);
658 if ((unlikely(non_swap_entry(entry))))
659 return NULL;
660 page = lookup_swap_cache(entry, vma, faddr);
661 if (page)
662 return page;
663
664 max_win = 1 << READ_ONCE(swap_ra_max_order);
665 if (max_win == 1) {
666 swap_ra->win = 1;
667 return NULL;
668 }
669
670 fpfn = PFN_DOWN(faddr);
671 swap_ra_info = GET_SWAP_RA_VAL(vma);
672 pfn = PFN_DOWN(SWAP_RA_ADDR(swap_ra_info));
673 prev_win = SWAP_RA_WIN(swap_ra_info);
674 hits = SWAP_RA_HITS(swap_ra_info);
675 swap_ra->win = win = __swapin_nr_pages(pfn, fpfn, hits,
676 max_win, prev_win);
677 atomic_long_set(&vma->swap_readahead_info,
678 SWAP_RA_VAL(faddr, win, 0));
679
680 if (win == 1)
681 return NULL;
682
683 /* Copy the PTEs because the page table may be unmapped */
684 if (fpfn == pfn + 1)
685 swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end);
686 else if (pfn == fpfn + 1)
687 swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1,
688 &start, &end);
689 else {
690 left = (win - 1) / 2;
691 swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left,
692 &start, &end);
693 }
694 swap_ra->nr_pte = end - start;
695 swap_ra->offset = fpfn - start;
696 pte = vmf->pte - swap_ra->offset;
697#ifdef CONFIG_64BIT
698 swap_ra->ptes = pte;
699#else
700 tpte = swap_ra->ptes;
701 for (pfn = start; pfn != end; pfn++)
702 *tpte++ = *pte++;
703#endif
704
705 return NULL;
706}
707
708struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask,
709 struct vm_fault *vmf,
710 struct vma_swap_readahead *swap_ra)
711{
712 struct blk_plug plug;
713 struct vm_area_struct *vma = vmf->vma;
714 struct page *page;
715 pte_t *pte, pentry;
716 swp_entry_t entry;
717 unsigned int i;
718 bool page_allocated;
719
720 if (swap_ra->win == 1)
721 goto skip;
722
723 blk_start_plug(&plug);
724 for (i = 0, pte = swap_ra->ptes; i < swap_ra->nr_pte;
725 i++, pte++) {
726 pentry = *pte;
727 if (pte_none(pentry))
728 continue;
729 if (pte_present(pentry))
730 continue;
731 entry = pte_to_swp_entry(pentry);
732 if (unlikely(non_swap_entry(entry)))
733 continue;
734 page = __read_swap_cache_async(entry, gfp_mask, vma,
735 vmf->address, &page_allocated);
736 if (!page)
737 continue;
738 if (page_allocated) {
739 swap_readpage(page, false);
740 if (i != swap_ra->offset &&
741 likely(!PageTransCompound(page))) {
742 SetPageReadahead(page);
743 count_vm_event(SWAP_RA);
744 }
745 }
746 put_page(page);
747 }
748 blk_finish_plug(&plug);
749 lru_add_drain();
750skip:
751 return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
752 swap_ra->win == 1);
753}
754
755#ifdef CONFIG_SYSFS
756static ssize_t vma_ra_enabled_show(struct kobject *kobj,
757 struct kobj_attribute *attr, char *buf)
758{
759 return sprintf(buf, "%s\n", swap_vma_readahead ? "true" : "false");
760}
761static ssize_t vma_ra_enabled_store(struct kobject *kobj,
762 struct kobj_attribute *attr,
763 const char *buf, size_t count)
764{
765 if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
766 swap_vma_readahead = true;
767 else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
768 swap_vma_readahead = false;
769 else
770 return -EINVAL;
771
772 return count;
773}
774static struct kobj_attribute vma_ra_enabled_attr =
775 __ATTR(vma_ra_enabled, 0644, vma_ra_enabled_show,
776 vma_ra_enabled_store);
777
778static ssize_t vma_ra_max_order_show(struct kobject *kobj,
779 struct kobj_attribute *attr, char *buf)
780{
781 return sprintf(buf, "%d\n", swap_ra_max_order);
782}
783static ssize_t vma_ra_max_order_store(struct kobject *kobj,
784 struct kobj_attribute *attr,
785 const char *buf, size_t count)
786{
787 int err, v;
788
789 err = kstrtoint(buf, 10, &v);
790 if (err || v > SWAP_RA_ORDER_CEILING || v <= 0)
791 return -EINVAL;
792
793 swap_ra_max_order = v;
794
795 return count;
796}
797static struct kobj_attribute vma_ra_max_order_attr =
798 __ATTR(vma_ra_max_order, 0644, vma_ra_max_order_show,
799 vma_ra_max_order_store);
800
801static struct attribute *swap_attrs[] = {
802 &vma_ra_enabled_attr.attr,
803 &vma_ra_max_order_attr.attr,
804 NULL,
805};
806
807static struct attribute_group swap_attr_group = {
808 .attrs = swap_attrs,
809};
810
811static int __init swap_init_sysfs(void)
812{
813 int err;
814 struct kobject *swap_kobj;
815
816 swap_kobj = kobject_create_and_add("swap", mm_kobj);
817 if (!swap_kobj) {
818 pr_err("failed to create swap kobject\n");
819 return -ENOMEM;
820 }
821 err = sysfs_create_group(swap_kobj, &swap_attr_group);
822 if (err) {
823 pr_err("failed to register swap group\n");
824 goto delete_obj;
825 }
826 return 0;
827
828delete_obj:
829 kobject_put(swap_kobj);
830 return err;
831}
832subsys_initcall(swap_init_sysfs);
833#endif
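The SWAP_RA_* macros introduced above pack the per-VMA readahead state -- the page-aligned address of the last swap fault, the current readahead window, and the recent hit count -- into one word so lookup_swap_cache() and swap_readahead_detect() can update it with a single atomic_long_set(). The hit field takes the low PAGE_SHIFT/2 bits, the window the next PAGE_SHIFT/2 bits, and the address the page-aligned rest. A standalone sketch of that layout, assuming PAGE_SHIFT is 12, is:

/* Sketch of the SWAP_RA_VAL() bit packing, assuming PAGE_SHIFT == 12. */
#include <assert.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))

#define SWAP_RA_WIN_SHIFT	(PAGE_SHIFT / 2)		  /* 6 */
#define SWAP_RA_HITS_MASK	((1UL << SWAP_RA_WIN_SHIFT) - 1)  /* bits 0..5: hits */
#define SWAP_RA_WIN_MASK	(~PAGE_MASK & ~SWAP_RA_HITS_MASK) /* bits 6..11: window */

#define SWAP_RA_HITS(v)		((v) & SWAP_RA_HITS_MASK)
#define SWAP_RA_WIN(v)		(((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
#define SWAP_RA_ADDR(v)		((v) & PAGE_MASK)

#define SWAP_RA_VAL(addr, win, hits)				\
	(((addr) & PAGE_MASK) |					\
	 (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |	\
	 ((hits) & SWAP_RA_HITS_MASK))

int main(void)
{
	unsigned long v = SWAP_RA_VAL(0x7f1234567000UL, 8, 3);

	/* The three fields occupy disjoint bit ranges of a single long. */
	assert(SWAP_RA_ADDR(v) == 0x7f1234567000UL);
	assert(SWAP_RA_WIN(v)  == 8);
	assert(SWAP_RA_HITS(v) == 3);
	return 0;
}

Writers such as lookup_swap_cache() rebuild the whole word with SWAP_RA_VAL() and store it into vma->swap_readahead_info in one atomic_long_set(), so readers never observe a mix of old and new fields.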
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6ba4aab2db0b..d483278ee35b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -60,7 +60,7 @@ atomic_long_t nr_swap_pages;
60EXPORT_SYMBOL_GPL(nr_swap_pages); 60EXPORT_SYMBOL_GPL(nr_swap_pages);
61/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ 61/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
62long total_swap_pages; 62long total_swap_pages;
63static int least_priority; 63static int least_priority = -1;
64 64
65static const char Bad_file[] = "Bad swap file entry "; 65static const char Bad_file[] = "Bad swap file entry ";
66static const char Unused_file[] = "Unused swap file entry "; 66static const char Unused_file[] = "Unused swap file entry ";
@@ -85,7 +85,7 @@ PLIST_HEAD(swap_active_head);
85 * is held and the locking order requires swap_lock to be taken 85 * is held and the locking order requires swap_lock to be taken
86 * before any swap_info_struct->lock. 86 * before any swap_info_struct->lock.
87 */ 87 */
88static PLIST_HEAD(swap_avail_head); 88struct plist_head *swap_avail_heads;
89static DEFINE_SPINLOCK(swap_avail_lock); 89static DEFINE_SPINLOCK(swap_avail_lock);
90 90
91struct swap_info_struct *swap_info[MAX_SWAPFILES]; 91struct swap_info_struct *swap_info[MAX_SWAPFILES];
@@ -96,6 +96,8 @@ static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
96/* Activity counter to indicate that a swapon or swapoff has occurred */ 96/* Activity counter to indicate that a swapon or swapoff has occurred */
97static atomic_t proc_poll_event = ATOMIC_INIT(0); 97static atomic_t proc_poll_event = ATOMIC_INIT(0);
98 98
99atomic_t nr_rotate_swap = ATOMIC_INIT(0);
100
99static inline unsigned char swap_count(unsigned char ent) 101static inline unsigned char swap_count(unsigned char ent)
100{ 102{
101 return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */ 103 return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */
@@ -265,6 +267,16 @@ static inline void cluster_set_null(struct swap_cluster_info *info)
265 info->data = 0; 267 info->data = 0;
266} 268}
267 269
270static inline bool cluster_is_huge(struct swap_cluster_info *info)
271{
272 return info->flags & CLUSTER_FLAG_HUGE;
273}
274
275static inline void cluster_clear_huge(struct swap_cluster_info *info)
276{
277 info->flags &= ~CLUSTER_FLAG_HUGE;
278}
279
268static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, 280static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
269 unsigned long offset) 281 unsigned long offset)
270{ 282{
@@ -580,6 +592,21 @@ new_cluster:
580 return found_free; 592 return found_free;
581} 593}
582 594
595static void __del_from_avail_list(struct swap_info_struct *p)
596{
597 int nid;
598
599 for_each_node(nid)
600 plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
601}
602
603static void del_from_avail_list(struct swap_info_struct *p)
604{
605 spin_lock(&swap_avail_lock);
606 __del_from_avail_list(p);
607 spin_unlock(&swap_avail_lock);
608}
609
583static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, 610static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
584 unsigned int nr_entries) 611 unsigned int nr_entries)
585{ 612{
@@ -593,10 +620,20 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
593 if (si->inuse_pages == si->pages) { 620 if (si->inuse_pages == si->pages) {
594 si->lowest_bit = si->max; 621 si->lowest_bit = si->max;
595 si->highest_bit = 0; 622 si->highest_bit = 0;
596 spin_lock(&swap_avail_lock); 623 del_from_avail_list(si);
597 plist_del(&si->avail_list, &swap_avail_head); 624 }
598 spin_unlock(&swap_avail_lock); 625}
626
627static void add_to_avail_list(struct swap_info_struct *p)
628{
629 int nid;
630
631 spin_lock(&swap_avail_lock);
632 for_each_node(nid) {
633 WARN_ON(!plist_node_empty(&p->avail_lists[nid]));
634 plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
599 } 635 }
636 spin_unlock(&swap_avail_lock);
600} 637}
601 638
602static void swap_range_free(struct swap_info_struct *si, unsigned long offset, 639static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
@@ -611,13 +648,8 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
611 bool was_full = !si->highest_bit; 648 bool was_full = !si->highest_bit;
612 649
613 si->highest_bit = end; 650 si->highest_bit = end;
614 if (was_full && (si->flags & SWP_WRITEOK)) { 651 if (was_full && (si->flags & SWP_WRITEOK))
615 spin_lock(&swap_avail_lock); 652 add_to_avail_list(si);
616 WARN_ON(!plist_node_empty(&si->avail_list));
617 if (plist_node_empty(&si->avail_list))
618 plist_add(&si->avail_list, &swap_avail_head);
619 spin_unlock(&swap_avail_lock);
620 }
621 } 653 }
622 atomic_long_add(nr_entries, &nr_swap_pages); 654 atomic_long_add(nr_entries, &nr_swap_pages);
623 si->inuse_pages -= nr_entries; 655 si->inuse_pages -= nr_entries;
@@ -846,7 +878,7 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
846 offset = idx * SWAPFILE_CLUSTER; 878 offset = idx * SWAPFILE_CLUSTER;
847 ci = lock_cluster(si, offset); 879 ci = lock_cluster(si, offset);
848 alloc_cluster(si, idx); 880 alloc_cluster(si, idx);
849 cluster_set_count_flag(ci, SWAPFILE_CLUSTER, 0); 881 cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);
850 882
851 map = si->swap_map + offset; 883 map = si->swap_map + offset;
852 for (i = 0; i < SWAPFILE_CLUSTER; i++) 884 for (i = 0; i < SWAPFILE_CLUSTER; i++)
@@ -898,6 +930,7 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
898 struct swap_info_struct *si, *next; 930 struct swap_info_struct *si, *next;
899 long avail_pgs; 931 long avail_pgs;
900 int n_ret = 0; 932 int n_ret = 0;
933 int node;
901 934
902 /* Only single cluster request supported */ 935 /* Only single cluster request supported */
903 WARN_ON_ONCE(n_goal > 1 && cluster); 936 WARN_ON_ONCE(n_goal > 1 && cluster);
@@ -917,14 +950,15 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
917 spin_lock(&swap_avail_lock); 950 spin_lock(&swap_avail_lock);
918 951
919start_over: 952start_over:
920 plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { 953 node = numa_node_id();
954 plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
921 /* requeue si to after same-priority siblings */ 955 /* requeue si to after same-priority siblings */
922 plist_requeue(&si->avail_list, &swap_avail_head); 956 plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
923 spin_unlock(&swap_avail_lock); 957 spin_unlock(&swap_avail_lock);
924 spin_lock(&si->lock); 958 spin_lock(&si->lock);
925 if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { 959 if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
926 spin_lock(&swap_avail_lock); 960 spin_lock(&swap_avail_lock);
927 if (plist_node_empty(&si->avail_list)) { 961 if (plist_node_empty(&si->avail_lists[node])) {
928 spin_unlock(&si->lock); 962 spin_unlock(&si->lock);
929 goto nextsi; 963 goto nextsi;
930 } 964 }
@@ -934,13 +968,14 @@ start_over:
934 WARN(!(si->flags & SWP_WRITEOK), 968 WARN(!(si->flags & SWP_WRITEOK),
935 "swap_info %d in list but !SWP_WRITEOK\n", 969 "swap_info %d in list but !SWP_WRITEOK\n",
936 si->type); 970 si->type);
937 plist_del(&si->avail_list, &swap_avail_head); 971 __del_from_avail_list(si);
938 spin_unlock(&si->lock); 972 spin_unlock(&si->lock);
939 goto nextsi; 973 goto nextsi;
940 } 974 }
941 if (cluster) 975 if (cluster) {
942 n_ret = swap_alloc_cluster(si, swp_entries); 976 if (!(si->flags & SWP_FILE))
943 else 977 n_ret = swap_alloc_cluster(si, swp_entries);
978 } else
944 n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, 979 n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
945 n_goal, swp_entries); 980 n_goal, swp_entries);
946 spin_unlock(&si->lock); 981 spin_unlock(&si->lock);
@@ -962,7 +997,7 @@ nextsi:
962 * swap_avail_head list then try it, otherwise start over 997 * swap_avail_head list then try it, otherwise start over
963 * if we have not gotten any slots. 998 * if we have not gotten any slots.
964 */ 999 */
965 if (plist_node_empty(&next->avail_list)) 1000 if (plist_node_empty(&next->avail_lists[node]))
966 goto start_over; 1001 goto start_over;
967 } 1002 }
968 1003
@@ -1168,22 +1203,57 @@ static void swapcache_free_cluster(swp_entry_t entry)
1168 struct swap_cluster_info *ci; 1203 struct swap_cluster_info *ci;
1169 struct swap_info_struct *si; 1204 struct swap_info_struct *si;
1170 unsigned char *map; 1205 unsigned char *map;
1171 unsigned int i; 1206 unsigned int i, free_entries = 0;
1207 unsigned char val;
1172 1208
1173 si = swap_info_get(entry); 1209 si = _swap_info_get(entry);
1174 if (!si) 1210 if (!si)
1175 return; 1211 return;
1176 1212
1177 ci = lock_cluster(si, offset); 1213 ci = lock_cluster(si, offset);
1214 VM_BUG_ON(!cluster_is_huge(ci));
1178 map = si->swap_map + offset; 1215 map = si->swap_map + offset;
1179 for (i = 0; i < SWAPFILE_CLUSTER; i++) { 1216 for (i = 0; i < SWAPFILE_CLUSTER; i++) {
1180 VM_BUG_ON(map[i] != SWAP_HAS_CACHE); 1217 val = map[i];
1181 map[i] = 0; 1218 VM_BUG_ON(!(val & SWAP_HAS_CACHE));
1219 if (val == SWAP_HAS_CACHE)
1220 free_entries++;
1221 }
1222 if (!free_entries) {
1223 for (i = 0; i < SWAPFILE_CLUSTER; i++)
1224 map[i] &= ~SWAP_HAS_CACHE;
1182 } 1225 }
1226 cluster_clear_huge(ci);
1183 unlock_cluster(ci); 1227 unlock_cluster(ci);
1184 mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); 1228 if (free_entries == SWAPFILE_CLUSTER) {
1185 swap_free_cluster(si, idx); 1229 spin_lock(&si->lock);
1186 spin_unlock(&si->lock); 1230 ci = lock_cluster(si, offset);
1231 memset(map, 0, SWAPFILE_CLUSTER);
1232 unlock_cluster(ci);
1233 mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
1234 swap_free_cluster(si, idx);
1235 spin_unlock(&si->lock);
1236 } else if (free_entries) {
1237 for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) {
1238 if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE))
1239 free_swap_slot(entry);
1240 }
1241 }
1242}
1243
1244int split_swap_cluster(swp_entry_t entry)
1245{
1246 struct swap_info_struct *si;
1247 struct swap_cluster_info *ci;
1248 unsigned long offset = swp_offset(entry);
1249
1250 si = _swap_info_get(entry);
1251 if (!si)
1252 return -EBUSY;
1253 ci = lock_cluster(si, offset);
1254 cluster_clear_huge(ci);
1255 unlock_cluster(ci);
1256 return 0;
1187} 1257}
1188#else 1258#else
1189static inline void swapcache_free_cluster(swp_entry_t entry) 1259static inline void swapcache_free_cluster(swp_entry_t entry)
@@ -1332,29 +1402,161 @@ out:
1332 return count; 1402 return count;
1333} 1403}
1334 1404
1405#ifdef CONFIG_THP_SWAP
1406static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
1407 swp_entry_t entry)
1408{
1409 struct swap_cluster_info *ci;
1410 unsigned char *map = si->swap_map;
1411 unsigned long roffset = swp_offset(entry);
1412 unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER);
1413 int i;
1414 bool ret = false;
1415
1416 ci = lock_cluster_or_swap_info(si, offset);
1417 if (!ci || !cluster_is_huge(ci)) {
1418 if (map[roffset] != SWAP_HAS_CACHE)
1419 ret = true;
1420 goto unlock_out;
1421 }
1422 for (i = 0; i < SWAPFILE_CLUSTER; i++) {
1423 if (map[offset + i] != SWAP_HAS_CACHE) {
1424 ret = true;
1425 break;
1426 }
1427 }
1428unlock_out:
1429 unlock_cluster_or_swap_info(si, ci);
1430 return ret;
1431}
1432
1433static bool page_swapped(struct page *page)
1434{
1435 swp_entry_t entry;
1436 struct swap_info_struct *si;
1437
1438 if (likely(!PageTransCompound(page)))
1439 return page_swapcount(page) != 0;
1440
1441 page = compound_head(page);
1442 entry.val = page_private(page);
1443 si = _swap_info_get(entry);
1444 if (si)
1445 return swap_page_trans_huge_swapped(si, entry);
1446 return false;
1447}
1448
1449static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
1450 int *total_swapcount)
1451{
1452 int i, map_swapcount, _total_mapcount, _total_swapcount;
1453 unsigned long offset = 0;
1454 struct swap_info_struct *si;
1455 struct swap_cluster_info *ci = NULL;
1456 unsigned char *map = NULL;
1457 int mapcount, swapcount = 0;
1458
1459 /* hugetlbfs shouldn't call it */
1460 VM_BUG_ON_PAGE(PageHuge(page), page);
1461
1462 if (likely(!PageTransCompound(page))) {
1463 mapcount = atomic_read(&page->_mapcount) + 1;
1464 if (total_mapcount)
1465 *total_mapcount = mapcount;
1466 if (PageSwapCache(page))
1467 swapcount = page_swapcount(page);
1468 if (total_swapcount)
1469 *total_swapcount = swapcount;
1470 return mapcount + swapcount;
1471 }
1472
1473 page = compound_head(page);
1474
1475 _total_mapcount = _total_swapcount = map_swapcount = 0;
1476 if (PageSwapCache(page)) {
1477 swp_entry_t entry;
1478
1479 entry.val = page_private(page);
1480 si = _swap_info_get(entry);
1481 if (si) {
1482 map = si->swap_map;
1483 offset = swp_offset(entry);
1484 }
1485 }
1486 if (map)
1487 ci = lock_cluster(si, offset);
1488 for (i = 0; i < HPAGE_PMD_NR; i++) {
1489 mapcount = atomic_read(&page[i]._mapcount) + 1;
1490 _total_mapcount += mapcount;
1491 if (map) {
1492 swapcount = swap_count(map[offset + i]);
1493 _total_swapcount += swapcount;
1494 }
1495 map_swapcount = max(map_swapcount, mapcount + swapcount);
1496 }
1497 unlock_cluster(ci);
1498 if (PageDoubleMap(page)) {
1499 map_swapcount -= 1;
1500 _total_mapcount -= HPAGE_PMD_NR;
1501 }
1502 mapcount = compound_mapcount(page);
1503 map_swapcount += mapcount;
1504 _total_mapcount += mapcount;
1505 if (total_mapcount)
1506 *total_mapcount = _total_mapcount;
1507 if (total_swapcount)
1508 *total_swapcount = _total_swapcount;
1509
1510 return map_swapcount;
1511}
1512#else
1513#define swap_page_trans_huge_swapped(si, entry) swap_swapcount(si, entry)
1514#define page_swapped(page) (page_swapcount(page) != 0)
1515
1516static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
1517 int *total_swapcount)
1518{
1519 int mapcount, swapcount = 0;
1520
1521 /* hugetlbfs shouldn't call it */
1522 VM_BUG_ON_PAGE(PageHuge(page), page);
1523
1524 mapcount = page_trans_huge_mapcount(page, total_mapcount);
1525 if (PageSwapCache(page))
1526 swapcount = page_swapcount(page);
1527 if (total_swapcount)
1528 *total_swapcount = swapcount;
1529 return mapcount + swapcount;
1530}
1531#endif
1532
1335/* 1533/*
1336 * We can write to an anon page without COW if there are no other references 1534 * We can write to an anon page without COW if there are no other references
1337 * to it. And as a side-effect, free up its swap: because the old content 1535 * to it. And as a side-effect, free up its swap: because the old content
1338 * on disk will never be read, and seeking back there to write new content 1536 * on disk will never be read, and seeking back there to write new content
1339 * later would only waste time away from clustering. 1537 * later would only waste time away from clustering.
1340 * 1538 *
1341 * NOTE: total_mapcount should not be relied upon by the caller if 1539 * NOTE: total_map_swapcount should not be relied upon by the caller if
1342 * reuse_swap_page() returns false, but it may be always overwritten 1540 * reuse_swap_page() returns false, but it may be always overwritten
1343 * (see the other implementation for CONFIG_SWAP=n). 1541 * (see the other implementation for CONFIG_SWAP=n).
1344 */ 1542 */
1345bool reuse_swap_page(struct page *page, int *total_mapcount) 1543bool reuse_swap_page(struct page *page, int *total_map_swapcount)
1346{ 1544{
1347 int count; 1545 int count, total_mapcount, total_swapcount;
1348 1546
1349 VM_BUG_ON_PAGE(!PageLocked(page), page); 1547 VM_BUG_ON_PAGE(!PageLocked(page), page);
1350 if (unlikely(PageKsm(page))) 1548 if (unlikely(PageKsm(page)))
1351 return false; 1549 return false;
1352 count = page_trans_huge_mapcount(page, total_mapcount); 1550 count = page_trans_huge_map_swapcount(page, &total_mapcount,
1353 if (count <= 1 && PageSwapCache(page)) { 1551 &total_swapcount);
1354 count += page_swapcount(page); 1552 if (total_map_swapcount)
1355 if (count != 1) 1553 *total_map_swapcount = total_mapcount + total_swapcount;
1356 goto out; 1554 if (count == 1 && PageSwapCache(page) &&
1555 (likely(!PageTransCompound(page)) ||
1556 /* The remaining swap count will be freed soon */
1557 total_swapcount == page_swapcount(page))) {
1357 if (!PageWriteback(page)) { 1558 if (!PageWriteback(page)) {
1559 page = compound_head(page);
1358 delete_from_swap_cache(page); 1560 delete_from_swap_cache(page);
1359 SetPageDirty(page); 1561 SetPageDirty(page);
1360 } else { 1562 } else {
@@ -1370,7 +1572,7 @@ bool reuse_swap_page(struct page *page, int *total_mapcount)
1370 spin_unlock(&p->lock); 1572 spin_unlock(&p->lock);
1371 } 1573 }
1372 } 1574 }
1373out: 1575
1374 return count <= 1; 1576 return count <= 1;
1375} 1577}
1376 1578
@@ -1386,7 +1588,7 @@ int try_to_free_swap(struct page *page)
1386 return 0; 1588 return 0;
1387 if (PageWriteback(page)) 1589 if (PageWriteback(page))
1388 return 0; 1590 return 0;
1389 if (page_swapcount(page)) 1591 if (page_swapped(page))
1390 return 0; 1592 return 0;
1391 1593
1392 /* 1594 /*
@@ -1407,6 +1609,7 @@ int try_to_free_swap(struct page *page)
1407 if (pm_suspended_storage()) 1609 if (pm_suspended_storage())
1408 return 0; 1610 return 0;
1409 1611
1612 page = compound_head(page);
1410 delete_from_swap_cache(page); 1613 delete_from_swap_cache(page);
1411 SetPageDirty(page); 1614 SetPageDirty(page);
1412 return 1; 1615 return 1;
@@ -1428,7 +1631,8 @@ int free_swap_and_cache(swp_entry_t entry)
1428 p = _swap_info_get(entry); 1631 p = _swap_info_get(entry);
1429 if (p) { 1632 if (p) {
1430 count = __swap_entry_free(p, entry, 1); 1633 count = __swap_entry_free(p, entry, 1);
1431 if (count == SWAP_HAS_CACHE) { 1634 if (count == SWAP_HAS_CACHE &&
1635 !swap_page_trans_huge_swapped(p, entry)) {
1432 page = find_get_page(swap_address_space(entry), 1636 page = find_get_page(swap_address_space(entry),
1433 swp_offset(entry)); 1637 swp_offset(entry));
1434 if (page && !trylock_page(page)) { 1638 if (page && !trylock_page(page)) {
@@ -1445,7 +1649,8 @@ int free_swap_and_cache(swp_entry_t entry)
1445 */ 1649 */
1446 if (PageSwapCache(page) && !PageWriteback(page) && 1650 if (PageSwapCache(page) && !PageWriteback(page) &&
1447 (!page_mapped(page) || mem_cgroup_swap_full(page)) && 1651 (!page_mapped(page) || mem_cgroup_swap_full(page)) &&
1448 !swap_swapcount(p, entry)) { 1652 !swap_page_trans_huge_swapped(p, entry)) {
1653 page = compound_head(page);
1449 delete_from_swap_cache(page); 1654 delete_from_swap_cache(page);
1450 SetPageDirty(page); 1655 SetPageDirty(page);
1451 } 1656 }
@@ -1999,7 +2204,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
1999 .sync_mode = WB_SYNC_NONE, 2204 .sync_mode = WB_SYNC_NONE,
2000 }; 2205 };
2001 2206
2002 swap_writepage(page, &wbc); 2207 swap_writepage(compound_head(page), &wbc);
2003 lock_page(page); 2208 lock_page(page);
2004 wait_on_page_writeback(page); 2209 wait_on_page_writeback(page);
2005 } 2210 }
@@ -2012,8 +2217,9 @@ int try_to_unuse(unsigned int type, bool frontswap,
2012 * delete, since it may not have been written out to swap yet. 2217 * delete, since it may not have been written out to swap yet.
2013 */ 2218 */
2014 if (PageSwapCache(page) && 2219 if (PageSwapCache(page) &&
2015 likely(page_private(page) == entry.val)) 2220 likely(page_private(page) == entry.val) &&
2016 delete_from_swap_cache(page); 2221 !page_swapped(page))
2222 delete_from_swap_cache(compound_head(page));
2017 2223
2018 /* 2224 /*
2019 * So we could skip searching mms once swap count went 2225 * So we could skip searching mms once swap count went
@@ -2226,10 +2432,24 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
2226 return generic_swapfile_activate(sis, swap_file, span); 2432 return generic_swapfile_activate(sis, swap_file, span);
2227} 2433}
2228 2434
2435static int swap_node(struct swap_info_struct *p)
2436{
2437 struct block_device *bdev;
2438
2439 if (p->bdev)
2440 bdev = p->bdev;
2441 else
2442 bdev = p->swap_file->f_inode->i_sb->s_bdev;
2443
2444 return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
2445}
2446
2229static void _enable_swap_info(struct swap_info_struct *p, int prio, 2447static void _enable_swap_info(struct swap_info_struct *p, int prio,
2230 unsigned char *swap_map, 2448 unsigned char *swap_map,
2231 struct swap_cluster_info *cluster_info) 2449 struct swap_cluster_info *cluster_info)
2232{ 2450{
2451 int i;
2452
2233 if (prio >= 0) 2453 if (prio >= 0)
2234 p->prio = prio; 2454 p->prio = prio;
2235 else 2455 else
@@ -2239,7 +2459,16 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
2239 * low-to-high, while swap ordering is high-to-low 2459 * low-to-high, while swap ordering is high-to-low
2240 */ 2460 */
2241 p->list.prio = -p->prio; 2461 p->list.prio = -p->prio;
2242 p->avail_list.prio = -p->prio; 2462 for_each_node(i) {
2463 if (p->prio >= 0)
2464 p->avail_lists[i].prio = -p->prio;
2465 else {
2466 if (swap_node(p) == i)
2467 p->avail_lists[i].prio = 1;
2468 else
2469 p->avail_lists[i].prio = -p->prio;
2470 }
2471 }
2243 p->swap_map = swap_map; 2472 p->swap_map = swap_map;
2244 p->cluster_info = cluster_info; 2473 p->cluster_info = cluster_info;
2245 p->flags |= SWP_WRITEOK; 2474 p->flags |= SWP_WRITEOK;
@@ -2258,9 +2487,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
2258 * swap_info_struct. 2487 * swap_info_struct.
2259 */ 2488 */
2260 plist_add(&p->list, &swap_active_head); 2489 plist_add(&p->list, &swap_active_head);
2261 spin_lock(&swap_avail_lock); 2490 add_to_avail_list(p);
2262 plist_add(&p->avail_list, &swap_avail_head);
2263 spin_unlock(&swap_avail_lock);
2264} 2491}
2265 2492
2266static void enable_swap_info(struct swap_info_struct *p, int prio, 2493static void enable_swap_info(struct swap_info_struct *p, int prio,
@@ -2345,17 +2572,19 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
2345 spin_unlock(&swap_lock); 2572 spin_unlock(&swap_lock);
2346 goto out_dput; 2573 goto out_dput;
2347 } 2574 }
2348 spin_lock(&swap_avail_lock); 2575 del_from_avail_list(p);
2349 plist_del(&p->avail_list, &swap_avail_head);
2350 spin_unlock(&swap_avail_lock);
2351 spin_lock(&p->lock); 2576 spin_lock(&p->lock);
2352 if (p->prio < 0) { 2577 if (p->prio < 0) {
2353 struct swap_info_struct *si = p; 2578 struct swap_info_struct *si = p;
2579 int nid;
2354 2580
2355 plist_for_each_entry_continue(si, &swap_active_head, list) { 2581 plist_for_each_entry_continue(si, &swap_active_head, list) {
2356 si->prio++; 2582 si->prio++;
2357 si->list.prio--; 2583 si->list.prio--;
2358 si->avail_list.prio--; 2584 for_each_node(nid) {
2585 if (si->avail_lists[nid].prio != 1)
2586 si->avail_lists[nid].prio--;
2587 }
2359 } 2588 }
2360 least_priority++; 2589 least_priority++;
2361 } 2590 }
@@ -2387,6 +2616,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
2387 if (p->flags & SWP_CONTINUED) 2616 if (p->flags & SWP_CONTINUED)
2388 free_swap_count_continuations(p); 2617 free_swap_count_continuations(p);
2389 2618
2619 if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev)))
2620 atomic_dec(&nr_rotate_swap);
2621
2390 mutex_lock(&swapon_mutex); 2622 mutex_lock(&swapon_mutex);
2391 spin_lock(&swap_lock); 2623 spin_lock(&swap_lock);
2392 spin_lock(&p->lock); 2624 spin_lock(&p->lock);
@@ -2596,6 +2828,7 @@ static struct swap_info_struct *alloc_swap_info(void)
2596{ 2828{
2597 struct swap_info_struct *p; 2829 struct swap_info_struct *p;
2598 unsigned int type; 2830 unsigned int type;
2831 int i;
2599 2832
2600 p = kzalloc(sizeof(*p), GFP_KERNEL); 2833 p = kzalloc(sizeof(*p), GFP_KERNEL);
2601 if (!p) 2834 if (!p)
@@ -2631,7 +2864,8 @@ static struct swap_info_struct *alloc_swap_info(void)
2631 } 2864 }
2632 INIT_LIST_HEAD(&p->first_swap_extent.list); 2865 INIT_LIST_HEAD(&p->first_swap_extent.list);
2633 plist_node_init(&p->list, 0); 2866 plist_node_init(&p->list, 0);
2634 plist_node_init(&p->avail_list, 0); 2867 for_each_node(i)
2868 plist_node_init(&p->avail_lists[i], 0);
2635 p->flags = SWP_USED; 2869 p->flags = SWP_USED;
2636 spin_unlock(&swap_lock); 2870 spin_unlock(&swap_lock);
2637 spin_lock_init(&p->lock); 2871 spin_lock_init(&p->lock);
@@ -2873,6 +3107,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2873 if (!capable(CAP_SYS_ADMIN)) 3107 if (!capable(CAP_SYS_ADMIN))
2874 return -EPERM; 3108 return -EPERM;
2875 3109
3110 if (!swap_avail_heads)
3111 return -ENOMEM;
3112
2876 p = alloc_swap_info(); 3113 p = alloc_swap_info();
2877 if (IS_ERR(p)) 3114 if (IS_ERR(p))
2878 return PTR_ERR(p); 3115 return PTR_ERR(p);
@@ -2963,7 +3200,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2963 cluster = per_cpu_ptr(p->percpu_cluster, cpu); 3200 cluster = per_cpu_ptr(p->percpu_cluster, cpu);
2964 cluster_set_null(&cluster->index); 3201 cluster_set_null(&cluster->index);
2965 } 3202 }
2966 } 3203 } else
3204 atomic_inc(&nr_rotate_swap);
2967 3205
2968 error = swap_cgroup_swapon(p->type, maxpages); 3206 error = swap_cgroup_swapon(p->type, maxpages);
2969 if (error) 3207 if (error)
@@ -3457,3 +3695,21 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
3457 } 3695 }
3458 } 3696 }
3459} 3697}
3698
3699static int __init swapfile_init(void)
3700{
3701 int nid;
3702
3703 swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
3704 GFP_KERNEL);
3705 if (!swap_avail_heads) {
3706 pr_emerg("Not enough memory for swap heads, swap is disabled\n");
3707 return -ENOMEM;
3708 }
3709
3710 for_each_node(nid)
3711 plist_head_init(&swap_avail_heads[nid]);
3712
3713 return 0;
3714}
3715subsys_initcall(swapfile_init);
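With swap_avail_head replaced by the per-node swap_avail_heads[] array, _enable_swap_info() above chooses a separate plist priority for every node: a device whose priority was set explicitly keeps -prio everywhere, while an auto-prioritized device (negative prio) gets the strongly preferred value 1 on the node reported by swap_node() and -prio on all other nodes, so allocations prefer swap local to the allocating node. A small sketch of that rule, with hypothetical helper names, is:

/*
 * Sketch of the per-node plist priority picked in _enable_swap_info().
 * swap_prio mirrors p->prio, device_node mirrors swap_node(p); a lower
 * plist priority means the device is tried earlier on that node.
 */
#include <stdio.h>

static int avail_list_prio(int swap_prio, int device_node, int node)
{
	if (swap_prio >= 0)		/* explicit priority (swapon -p) */
		return -swap_prio;
	if (node == device_node)	/* auto priority: prefer the local device */
		return 1;
	return -swap_prio;		/* remote nodes: ordinary auto priority */
}

int main(void)
{
	int node;

	/* An auto-prioritized device (prio -2) attached to node 1, 4 nodes total. */
	for (node = 0; node < 4; node++)
		printf("node %d: plist prio %d\n", node,
		       avail_list_prio(-2, 1, node));
	return 0;
}

get_swap_pages() then walks swap_avail_heads[numa_node_id()], so the node-local device (prio 1) sorts ahead of remote auto-prioritized devices (prio 2 and up) on its own node while keeping its ordinary position everywhere else.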
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 8bcb501bce60..81192701964d 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -371,6 +371,36 @@ extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
371 bool zeropage); 371 bool zeropage);
372#endif /* CONFIG_HUGETLB_PAGE */ 372#endif /* CONFIG_HUGETLB_PAGE */
373 373
374static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
375 pmd_t *dst_pmd,
376 struct vm_area_struct *dst_vma,
377 unsigned long dst_addr,
378 unsigned long src_addr,
379 struct page **page,
380 bool zeropage)
381{
382 ssize_t err;
383
384 if (vma_is_anonymous(dst_vma)) {
385 if (!zeropage)
386 err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
387 dst_addr, src_addr, page);
388 else
389 err = mfill_zeropage_pte(dst_mm, dst_pmd,
390 dst_vma, dst_addr);
391 } else {
392 if (!zeropage)
393 err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
394 dst_vma, dst_addr,
395 src_addr, page);
396 else
397 err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd,
398 dst_vma, dst_addr);
399 }
400
401 return err;
402}
403
374static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, 404static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
375 unsigned long dst_start, 405 unsigned long dst_start,
376 unsigned long src_start, 406 unsigned long src_start,
@@ -487,22 +517,8 @@ retry:
487 BUG_ON(pmd_none(*dst_pmd)); 517 BUG_ON(pmd_none(*dst_pmd));
488 BUG_ON(pmd_trans_huge(*dst_pmd)); 518 BUG_ON(pmd_trans_huge(*dst_pmd));
489 519
490 if (vma_is_anonymous(dst_vma)) { 520 err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
491 if (!zeropage) 521 src_addr, &page, zeropage);
492 err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
493 dst_addr, src_addr,
494 &page);
495 else
496 err = mfill_zeropage_pte(dst_mm, dst_pmd,
497 dst_vma, dst_addr);
498 } else {
499 err = -EINVAL; /* if zeropage is true return -EINVAL */
500 if (likely(!zeropage))
501 err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
502 dst_vma, dst_addr,
503 src_addr, &page);
504 }
505
506 cond_resched(); 522 cond_resched();
507 523
508 if (unlikely(err == -EFAULT)) { 524 if (unlikely(err == -EFAULT)) {
diff --git a/mm/util.c b/mm/util.c
index 9ecddf568fe3..34e57fae959d 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -614,7 +614,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
614 return 0; 614 return 0;
615 615
616 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { 616 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
617 free = global_page_state(NR_FREE_PAGES); 617 free = global_zone_page_state(NR_FREE_PAGES);
618 free += global_node_page_state(NR_FILE_PAGES); 618 free += global_node_page_state(NR_FILE_PAGES);
619 619
620 /* 620 /*
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a47e3894c775..8a43db6284eb 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -49,12 +49,10 @@ static void __vunmap(const void *, int);
49static void free_work(struct work_struct *w) 49static void free_work(struct work_struct *w)
50{ 50{
51 struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); 51 struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
52 struct llist_node *llnode = llist_del_all(&p->list); 52 struct llist_node *t, *llnode;
53 while (llnode) { 53
54 void *p = llnode; 54 llist_for_each_safe(llnode, t, llist_del_all(&p->list))
55 llnode = llist_next(llnode); 55 __vunmap((void *)llnode, 1);
56 __vunmap(p, 1);
57 }
58} 56}
59 57
60/*** Page table manipulation functions ***/ 58/*** Page table manipulation functions ***/
@@ -2482,7 +2480,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
2482 * matching slot. While scanning, if any of the areas overlaps with 2480 * matching slot. While scanning, if any of the areas overlaps with
2483 * existing vmap_area, the base address is pulled down to fit the 2481 * existing vmap_area, the base address is pulled down to fit the
2484 * area. Scanning is repeated till all the areas fit and then all 2482 * area. Scanning is repeated till all the areas fit and then all
2485 * necessary data structres are inserted and the result is returned. 2483 * necessary data structures are inserted and the result is returned.
2486 */ 2484 */
2487struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, 2485struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2488 const size_t *sizes, int nr_vms, 2486 const size_t *sizes, int nr_vms,
@@ -2510,15 +2508,11 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2510 if (start > offsets[last_area]) 2508 if (start > offsets[last_area])
2511 last_area = area; 2509 last_area = area;
2512 2510
2513 for (area2 = 0; area2 < nr_vms; area2++) { 2511 for (area2 = area + 1; area2 < nr_vms; area2++) {
2514 unsigned long start2 = offsets[area2]; 2512 unsigned long start2 = offsets[area2];
2515 unsigned long end2 = start2 + sizes[area2]; 2513 unsigned long end2 = start2 + sizes[area2];
2516 2514
2517 if (area2 == area) 2515 BUG_ON(start2 < end && start < end2);
2518 continue;
2519
2520 BUG_ON(start2 >= start && start2 < end);
2521 BUG_ON(end2 <= end && end2 > start);
2522 } 2516 }
2523 } 2517 }
2524 last_end = offsets[last_area] + sizes[last_area]; 2518 last_end = offsets[last_area] + sizes[last_area];
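The pcpu_get_vm_areas() change above replaces the two asymmetric BUG_ON checks with the canonical half-open interval test (start2 < end && start < end2) and starts the inner loop at area + 1, so every unordered pair of requested areas is checked exactly once instead of twice. A standalone sketch of the same check is:

/* Sketch: pairwise overlap detection over half-open [offset, offset + size) ranges. */
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

/* [s1, e1) and [s2, e2) overlap iff each start precedes the other range's end. */
static bool ranges_overlap(unsigned long s1, unsigned long e1,
			   unsigned long s2, unsigned long e2)
{
	return s2 < e1 && s1 < e2;
}

static bool any_overlap(const unsigned long *offsets, const size_t *sizes, int nr)
{
	int i, j;

	for (i = 0; i < nr; i++)
		for (j = i + 1; j < nr; j++)	/* each unordered pair once */
			if (ranges_overlap(offsets[i], offsets[i] + sizes[i],
					   offsets[j], offsets[j] + sizes[j]))
				return true;
	return false;
}

int main(void)
{
	unsigned long offs[] = { 0x0, 0x2000, 0x800 };
	size_t sizes[]       = { 0x1000, 0x1000, 0x1000 };

	assert(!any_overlap(offs, sizes, 2));	/* [0,0x1000) and [0x2000,0x3000) are disjoint */
	assert(any_overlap(offs, sizes, 3));	/* [0x800,0x1800) intersects the first range */
	return 0;
}

The kernel function keeps BUG_ON rather than returning a boolean, treating overlapping percpu chunk offsets as a caller bug rather than a recoverable condition.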
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f957afe900ec..13d711dd8776 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -393,14 +393,15 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
393 unsigned long nr_to_scan = min(batch_size, total_scan); 393 unsigned long nr_to_scan = min(batch_size, total_scan);
394 394
395 shrinkctl->nr_to_scan = nr_to_scan; 395 shrinkctl->nr_to_scan = nr_to_scan;
396 shrinkctl->nr_scanned = nr_to_scan;
396 ret = shrinker->scan_objects(shrinker, shrinkctl); 397 ret = shrinker->scan_objects(shrinker, shrinkctl);
397 if (ret == SHRINK_STOP) 398 if (ret == SHRINK_STOP)
398 break; 399 break;
399 freed += ret; 400 freed += ret;
400 401
401 count_vm_events(SLABS_SCANNED, nr_to_scan); 402 count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
402 total_scan -= nr_to_scan; 403 total_scan -= shrinkctl->nr_scanned;
403 scanned += nr_to_scan; 404 scanned += shrinkctl->nr_scanned;
404 405
405 cond_resched(); 406 cond_resched();
406 } 407 }
@@ -535,7 +536,9 @@ static inline int is_page_cache_freeable(struct page *page)
535 * that isolated the page, the page cache radix tree and 536 * that isolated the page, the page cache radix tree and
536 * optional buffer heads at page->private. 537 * optional buffer heads at page->private.
537 */ 538 */
538 return page_count(page) - page_has_private(page) == 2; 539 int radix_pins = PageTransHuge(page) && PageSwapCache(page) ?
540 HPAGE_PMD_NR : 1;
541 return page_count(page) - page_has_private(page) == 1 + radix_pins;
539} 542}
540 543
541static int may_write_to_inode(struct inode *inode, struct scan_control *sc) 544static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
@@ -665,6 +668,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
665 bool reclaimed) 668 bool reclaimed)
666{ 669{
667 unsigned long flags; 670 unsigned long flags;
671 int refcount;
668 672
669 BUG_ON(!PageLocked(page)); 673 BUG_ON(!PageLocked(page));
670 BUG_ON(mapping != page_mapping(page)); 674 BUG_ON(mapping != page_mapping(page));
@@ -695,11 +699,15 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
695 * Note that if SetPageDirty is always performed via set_page_dirty, 699 * Note that if SetPageDirty is always performed via set_page_dirty,
696 * and thus under tree_lock, then this ordering is not required. 700 * and thus under tree_lock, then this ordering is not required.
697 */ 701 */
698 if (!page_ref_freeze(page, 2)) 702 if (unlikely(PageTransHuge(page)) && PageSwapCache(page))
703 refcount = 1 + HPAGE_PMD_NR;
704 else
705 refcount = 2;
706 if (!page_ref_freeze(page, refcount))
699 goto cannot_free; 707 goto cannot_free;
700 /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */ 708 /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
701 if (unlikely(PageDirty(page))) { 709 if (unlikely(PageDirty(page))) {
702 page_ref_unfreeze(page, 2); 710 page_ref_unfreeze(page, refcount);
703 goto cannot_free; 711 goto cannot_free;
704 } 712 }
705 713
@@ -1121,58 +1129,59 @@ static unsigned long shrink_page_list(struct list_head *page_list,
1121 * Try to allocate it some swap space here. 1129 * Try to allocate it some swap space here.
1122 * Lazyfree page could be freed directly 1130 * Lazyfree page could be freed directly
1123 */ 1131 */
1124 if (PageAnon(page) && PageSwapBacked(page) && 1132 if (PageAnon(page) && PageSwapBacked(page)) {
1125 !PageSwapCache(page)) { 1133 if (!PageSwapCache(page)) {
1126 if (!(sc->gfp_mask & __GFP_IO)) 1134 if (!(sc->gfp_mask & __GFP_IO))
1127 goto keep_locked; 1135 goto keep_locked;
1128 if (PageTransHuge(page)) { 1136 if (PageTransHuge(page)) {
1129 /* cannot split THP, skip it */ 1137 /* cannot split THP, skip it */
1130 if (!can_split_huge_page(page, NULL)) 1138 if (!can_split_huge_page(page, NULL))
1131 goto activate_locked; 1139 goto activate_locked;
1132 /* 1140 /*
1133 * Split pages without a PMD map right 1141 * Split pages without a PMD map right
1134 * away. Chances are some or all of the 1142 * away. Chances are some or all of the
1135 * tail pages can be freed without IO. 1143 * tail pages can be freed without IO.
1136 */ 1144 */
1137 if (!compound_mapcount(page) && 1145 if (!compound_mapcount(page) &&
1138 split_huge_page_to_list(page, page_list)) 1146 split_huge_page_to_list(page,
1139 goto activate_locked; 1147 page_list))
1140 } 1148 goto activate_locked;
1141 if (!add_to_swap(page)) { 1149 }
1142 if (!PageTransHuge(page)) 1150 if (!add_to_swap(page)) {
1143 goto activate_locked; 1151 if (!PageTransHuge(page))
1144 /* Split THP and swap individual base pages */ 1152 goto activate_locked;
1145 if (split_huge_page_to_list(page, page_list)) 1153 /* Fallback to swap normal pages */
1146 goto activate_locked; 1154 if (split_huge_page_to_list(page,
1147 if (!add_to_swap(page)) 1155 page_list))
1148 goto activate_locked; 1156 goto activate_locked;
1149 } 1157#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1150 1158 count_vm_event(THP_SWPOUT_FALLBACK);
1151 /* XXX: We don't support THP writes */ 1159#endif
1152 if (PageTransHuge(page) && 1160 if (!add_to_swap(page))
1153 split_huge_page_to_list(page, page_list)) { 1161 goto activate_locked;
1154 delete_from_swap_cache(page); 1162 }
1155 goto activate_locked;
1156 }
1157 1163
1158 may_enter_fs = 1; 1164 may_enter_fs = 1;
1159 1165
1160 /* Adding to swap updated mapping */ 1166 /* Adding to swap updated mapping */
1161 mapping = page_mapping(page); 1167 mapping = page_mapping(page);
1168 }
1162 } else if (unlikely(PageTransHuge(page))) { 1169 } else if (unlikely(PageTransHuge(page))) {
1163 /* Split file THP */ 1170 /* Split file THP */
1164 if (split_huge_page_to_list(page, page_list)) 1171 if (split_huge_page_to_list(page, page_list))
1165 goto keep_locked; 1172 goto keep_locked;
1166 } 1173 }
1167 1174
1168 VM_BUG_ON_PAGE(PageTransHuge(page), page);
1169
1170 /* 1175 /*
1171 * The page is mapped into the page tables of one or more 1176 * The page is mapped into the page tables of one or more
1172 * processes. Try to unmap it here. 1177 * processes. Try to unmap it here.
1173 */ 1178 */
1174 if (page_mapped(page)) { 1179 if (page_mapped(page)) {
1175 if (!try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) { 1180 enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH;
1181
1182 if (unlikely(PageTransHuge(page)))
1183 flags |= TTU_SPLIT_HUGE_PMD;
1184 if (!try_to_unmap(page, flags)) {
1176 nr_unmap_fail++; 1185 nr_unmap_fail++;
1177 goto activate_locked; 1186 goto activate_locked;
1178 } 1187 }
@@ -1312,7 +1321,11 @@ free_it:
1312 * Is there need to periodically free_page_list? It would 1321 * Is there need to periodically free_page_list? It would
1313 * appear not as the counts should be low 1322 * appear not as the counts should be low
1314 */ 1323 */
1315 list_add(&page->lru, &free_pages); 1324 if (unlikely(PageTransHuge(page))) {
1325 mem_cgroup_uncharge(page);
1326 (*get_compound_page_dtor(page))(page);
1327 } else
1328 list_add(&page->lru, &free_pages);
1316 continue; 1329 continue;
1317 1330
1318activate_locked: 1331activate_locked:
@@ -1742,9 +1755,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1742 int file = is_file_lru(lru); 1755 int file = is_file_lru(lru);
1743 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 1756 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1744 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; 1757 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1758 bool stalled = false;
1745 1759
1746 while (unlikely(too_many_isolated(pgdat, file, sc))) { 1760 while (unlikely(too_many_isolated(pgdat, file, sc))) {
1747 congestion_wait(BLK_RW_ASYNC, HZ/10); 1761 if (stalled)
1762 return 0;
1763
1764 /* wait a bit for the reclaimer. */
1765 msleep(100);
1766 stalled = true;
1748 1767
1749 /* We are about to die and free our memory. Return now. */ 1768 /* We are about to die and free our memory. Return now. */
1750 if (fatal_signal_pending(current)) 1769 if (fatal_signal_pending(current))
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9a4441bbeef2..c7e4b8458023 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -870,6 +870,9 @@ static int __fragmentation_index(unsigned int order, struct contig_page_info *in
870{ 870{
871 unsigned long requested = 1UL << order; 871 unsigned long requested = 1UL << order;
872 872
873 if (WARN_ON_ONCE(order >= MAX_ORDER))
874 return 0;
875
873 if (!info->free_blocks_total) 876 if (!info->free_blocks_total)
874 return 0; 877 return 0;
875 878
@@ -1071,6 +1074,8 @@ const char * const vmstat_text[] = {
1071#endif 1074#endif
1072 "thp_zero_page_alloc", 1075 "thp_zero_page_alloc",
1073 "thp_zero_page_alloc_failed", 1076 "thp_zero_page_alloc_failed",
1077 "thp_swpout",
1078 "thp_swpout_fallback",
1074#endif 1079#endif
1075#ifdef CONFIG_MEMORY_BALLOON 1080#ifdef CONFIG_MEMORY_BALLOON
1076 "balloon_inflate", 1081 "balloon_inflate",
@@ -1093,6 +1098,10 @@ const char * const vmstat_text[] = {
1093 "vmacache_find_hits", 1098 "vmacache_find_hits",
1094 "vmacache_full_flushes", 1099 "vmacache_full_flushes",
1095#endif 1100#endif
1101#ifdef CONFIG_SWAP
1102 "swap_ra",
1103 "swap_ra_hit",
1104#endif
1096#endif /* CONFIG_VM_EVENTS_COUNTERS */ 1105#endif /* CONFIG_VM_EVENTS_COUNTERS */
1097}; 1106};
1098#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ 1107#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
@@ -1250,7 +1259,7 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
1250 seq_putc(m, '\n'); 1259 seq_putc(m, '\n');
1251} 1260}
1252 1261
1253/* Print out the free pages at each order for each migratetype */ 1262/* Print out the number of pageblocks for each migratetype */
1254static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) 1263static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
1255{ 1264{
1256 int mtype; 1265 int mtype;
@@ -1500,7 +1509,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
1500 if (!v) 1509 if (!v)
1501 return ERR_PTR(-ENOMEM); 1510 return ERR_PTR(-ENOMEM);
1502 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 1511 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
1503 v[i] = global_page_state(i); 1512 v[i] = global_zone_page_state(i);
1504 v += NR_VM_ZONE_STAT_ITEMS; 1513 v += NR_VM_ZONE_STAT_ITEMS;
1505 1514
1506 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) 1515 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
@@ -1589,7 +1598,7 @@ int vmstat_refresh(struct ctl_table *table, int write,
1589 * which can equally be echo'ed to or cat'ted from (by root), 1598 * which can equally be echo'ed to or cat'ted from (by root),
1590 * can be used to update the stats just before reading them. 1599 * can be used to update the stats just before reading them.
1591 * 1600 *
1592 * Oh, and since global_page_state() etc. are so careful to hide 1601 * Oh, and since global_zone_page_state() etc. are so careful to hide
1593 * transiently negative values, report an error here if any of 1602 * transiently negative values, report an error here if any of
1594 * the stats is negative, so we know to go looking for imbalance. 1603 * the stats is negative, so we know to go looking for imbalance.
1595 */ 1604 */
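
The vm_event names added above ("thp_swpout", "thp_swpout_fallback", "swap_ra", "swap_ra_hit") become visible in /proc/vmstat. A small self-contained sketch that reads them back; the counter names are an assumption that only holds on a kernel carrying these patches with CONFIG_TRANSPARENT_HUGEPAGE and CONFIG_SWAP enabled:

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* counters introduced by this series; absent on older kernels */
	const char *keys[] = { "thp_swpout", "thp_swpout_fallback",
			       "swap_ra", "swap_ra_hit" };
	char name[64];
	unsigned long long val;
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f) {
		perror("/proc/vmstat");
		return 1;
	}
	/* each line of /proc/vmstat is "<name> <value>" */
	while (fscanf(f, "%63s %llu", name, &val) == 2) {
		for (size_t i = 0; i < sizeof(keys) / sizeof(keys[0]); i++)
			if (!strcmp(name, keys[i]))
				printf("%-20s %llu\n", name, val);
	}
	fclose(f);
	return 0;
}
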
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 54f63c4a809a..486550df32be 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -23,10 +23,13 @@
23#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 23#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
24 24
25#include <linux/atomic.h> 25#include <linux/atomic.h>
26#include <linux/sched.h>
26#include <linux/list.h> 27#include <linux/list.h>
27#include <linux/mm.h> 28#include <linux/mm.h>
28#include <linux/module.h> 29#include <linux/module.h>
30#include <linux/percpu.h>
29#include <linux/preempt.h> 31#include <linux/preempt.h>
32#include <linux/workqueue.h>
30#include <linux/slab.h> 33#include <linux/slab.h>
31#include <linux/spinlock.h> 34#include <linux/spinlock.h>
32#include <linux/zpool.h> 35#include <linux/zpool.h>
@@ -48,11 +51,15 @@ enum buddy {
48}; 51};
49 52
50/* 53/*
51 * struct z3fold_header - z3fold page metadata occupying the first chunk of each 54 * struct z3fold_header - z3fold page metadata occupying first chunks of each
52 * z3fold page, except for HEADLESS pages 55 * z3fold page, except for HEADLESS pages
53 * @buddy: links the z3fold page into the relevant list in the pool 56 * @buddy: links the z3fold page into the relevant list in the
57 * pool
54 * @page_lock: per-page lock 58 * @page_lock: per-page lock
55 * @refcount: reference cound for the z3fold page 59 * @refcount: reference count for the z3fold page
60 * @work: work_struct for page layout optimization
61 * @pool: pointer to the pool which this page belongs to
62 * @cpu: CPU which this page "belongs" to
56 * @first_chunks: the size of the first buddy in chunks, 0 if free 63 * @first_chunks: the size of the first buddy in chunks, 0 if free
57 * @middle_chunks: the size of the middle buddy in chunks, 0 if free 64 * @middle_chunks: the size of the middle buddy in chunks, 0 if free
58 * @last_chunks: the size of the last buddy in chunks, 0 if free 65 * @last_chunks: the size of the last buddy in chunks, 0 if free
@@ -62,6 +69,9 @@ struct z3fold_header {
62 struct list_head buddy; 69 struct list_head buddy;
63 spinlock_t page_lock; 70 spinlock_t page_lock;
64 struct kref refcount; 71 struct kref refcount;
72 struct work_struct work;
73 struct z3fold_pool *pool;
74 short cpu;
65 unsigned short first_chunks; 75 unsigned short first_chunks;
66 unsigned short middle_chunks; 76 unsigned short middle_chunks;
67 unsigned short last_chunks; 77 unsigned short last_chunks;
@@ -92,28 +102,39 @@ struct z3fold_header {
92 102
93/** 103/**
94 * struct z3fold_pool - stores metadata for each z3fold pool 104 * struct z3fold_pool - stores metadata for each z3fold pool
95 * @lock: protects all pool fields and first|last_chunk fields of any 105 * @name: pool name
96 * z3fold page in the pool 106 * @lock: protects pool unbuddied/lru lists
97 * @unbuddied: array of lists tracking z3fold pages that contain 2- buddies; 107 * @stale_lock: protects pool stale page list
98 * the lists each z3fold page is added to depends on the size of 108 * @unbuddied: per-cpu array of lists tracking z3fold pages that contain 2-
99 * its free region. 109 * buddies; the list each z3fold page is added to depends on
110 * the size of its free region.
100 * @lru: list tracking the z3fold pages in LRU order by most recently 111 * @lru: list tracking the z3fold pages in LRU order by most recently
101 * added buddy. 112 * added buddy.
113 * @stale: list of pages marked for freeing
102 * @pages_nr: number of z3fold pages in the pool. 114 * @pages_nr: number of z3fold pages in the pool.
103 * @ops: pointer to a structure of user defined operations specified at 115 * @ops: pointer to a structure of user defined operations specified at
104 * pool creation time. 116 * pool creation time.
117 * @compact_wq: workqueue for page layout background optimization
118 * @release_wq: workqueue for safe page release
119 * @work: work_struct for safe page release
105 * 120 *
106 * This structure is allocated at pool creation time and maintains metadata 121 * This structure is allocated at pool creation time and maintains metadata
107 * pertaining to a particular z3fold pool. 122 * pertaining to a particular z3fold pool.
108 */ 123 */
109struct z3fold_pool { 124struct z3fold_pool {
125 const char *name;
110 spinlock_t lock; 126 spinlock_t lock;
111 struct list_head unbuddied[NCHUNKS]; 127 spinlock_t stale_lock;
128 struct list_head *unbuddied;
112 struct list_head lru; 129 struct list_head lru;
130 struct list_head stale;
113 atomic64_t pages_nr; 131 atomic64_t pages_nr;
114 const struct z3fold_ops *ops; 132 const struct z3fold_ops *ops;
115 struct zpool *zpool; 133 struct zpool *zpool;
116 const struct zpool_ops *zpool_ops; 134 const struct zpool_ops *zpool_ops;
135 struct workqueue_struct *compact_wq;
136 struct workqueue_struct *release_wq;
137 struct work_struct work;
117}; 138};
118 139
119/* 140/*
@@ -122,9 +143,10 @@ struct z3fold_pool {
122enum z3fold_page_flags { 143enum z3fold_page_flags {
123 PAGE_HEADLESS = 0, 144 PAGE_HEADLESS = 0,
124 MIDDLE_CHUNK_MAPPED, 145 MIDDLE_CHUNK_MAPPED,
146 NEEDS_COMPACTING,
147 PAGE_STALE
125}; 148};
126 149
127
128/***************** 150/*****************
129 * Helpers 151 * Helpers
130*****************/ 152*****************/
@@ -138,14 +160,19 @@ static int size_to_chunks(size_t size)
138#define for_each_unbuddied_list(_iter, _begin) \ 160#define for_each_unbuddied_list(_iter, _begin) \
139 for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++) 161 for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
140 162
163static void compact_page_work(struct work_struct *w);
164
141/* Initializes the z3fold header of a newly allocated z3fold page */ 165/* Initializes the z3fold header of a newly allocated z3fold page */
142static struct z3fold_header *init_z3fold_page(struct page *page) 166static struct z3fold_header *init_z3fold_page(struct page *page,
167 struct z3fold_pool *pool)
143{ 168{
144 struct z3fold_header *zhdr = page_address(page); 169 struct z3fold_header *zhdr = page_address(page);
145 170
146 INIT_LIST_HEAD(&page->lru); 171 INIT_LIST_HEAD(&page->lru);
147 clear_bit(PAGE_HEADLESS, &page->private); 172 clear_bit(PAGE_HEADLESS, &page->private);
148 clear_bit(MIDDLE_CHUNK_MAPPED, &page->private); 173 clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
174 clear_bit(NEEDS_COMPACTING, &page->private);
175 clear_bit(PAGE_STALE, &page->private);
149 176
150 spin_lock_init(&zhdr->page_lock); 177 spin_lock_init(&zhdr->page_lock);
151 kref_init(&zhdr->refcount); 178 kref_init(&zhdr->refcount);
@@ -154,7 +181,10 @@ static struct z3fold_header *init_z3fold_page(struct page *page)
154 zhdr->last_chunks = 0; 181 zhdr->last_chunks = 0;
155 zhdr->first_num = 0; 182 zhdr->first_num = 0;
156 zhdr->start_middle = 0; 183 zhdr->start_middle = 0;
184 zhdr->cpu = -1;
185 zhdr->pool = pool;
157 INIT_LIST_HEAD(&zhdr->buddy); 186 INIT_LIST_HEAD(&zhdr->buddy);
187 INIT_WORK(&zhdr->work, compact_page_work);
158 return zhdr; 188 return zhdr;
159} 189}
160 190
@@ -164,21 +194,6 @@ static void free_z3fold_page(struct page *page)
164 __free_page(page); 194 __free_page(page);
165} 195}
166 196
167static void release_z3fold_page(struct kref *ref)
168{
169 struct z3fold_header *zhdr;
170 struct page *page;
171
172 zhdr = container_of(ref, struct z3fold_header, refcount);
173 page = virt_to_page(zhdr);
174
175 if (!list_empty(&zhdr->buddy))
176 list_del(&zhdr->buddy);
177 if (!list_empty(&page->lru))
178 list_del(&page->lru);
179 free_z3fold_page(page);
180}
181
182/* Lock a z3fold page */ 197/* Lock a z3fold page */
183static inline void z3fold_page_lock(struct z3fold_header *zhdr) 198static inline void z3fold_page_lock(struct z3fold_header *zhdr)
184{ 199{
@@ -228,6 +243,76 @@ static enum buddy handle_to_buddy(unsigned long handle)
228 return (handle - zhdr->first_num) & BUDDY_MASK; 243 return (handle - zhdr->first_num) & BUDDY_MASK;
229} 244}
230 245
246static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
247{
248 struct page *page = virt_to_page(zhdr);
249 struct z3fold_pool *pool = zhdr->pool;
250
251 WARN_ON(!list_empty(&zhdr->buddy));
252 set_bit(PAGE_STALE, &page->private);
253 spin_lock(&pool->lock);
254 if (!list_empty(&page->lru))
255 list_del(&page->lru);
256 spin_unlock(&pool->lock);
257 if (locked)
258 z3fold_page_unlock(zhdr);
259 spin_lock(&pool->stale_lock);
260 list_add(&zhdr->buddy, &pool->stale);
261 queue_work(pool->release_wq, &pool->work);
262 spin_unlock(&pool->stale_lock);
263}
264
265static void __attribute__((__unused__))
266 release_z3fold_page(struct kref *ref)
267{
268 struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
269 refcount);
270 __release_z3fold_page(zhdr, false);
271}
272
273static void release_z3fold_page_locked(struct kref *ref)
274{
275 struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
276 refcount);
277 WARN_ON(z3fold_page_trylock(zhdr));
278 __release_z3fold_page(zhdr, true);
279}
280
281static void release_z3fold_page_locked_list(struct kref *ref)
282{
283 struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
284 refcount);
285 spin_lock(&zhdr->pool->lock);
286 list_del_init(&zhdr->buddy);
287 spin_unlock(&zhdr->pool->lock);
288
289 WARN_ON(z3fold_page_trylock(zhdr));
290 __release_z3fold_page(zhdr, true);
291}
292
293static void free_pages_work(struct work_struct *w)
294{
295 struct z3fold_pool *pool = container_of(w, struct z3fold_pool, work);
296
297 spin_lock(&pool->stale_lock);
298 while (!list_empty(&pool->stale)) {
299 struct z3fold_header *zhdr = list_first_entry(&pool->stale,
300 struct z3fold_header, buddy);
301 struct page *page = virt_to_page(zhdr);
302
303 list_del(&zhdr->buddy);
304 if (WARN_ON(!test_bit(PAGE_STALE, &page->private)))
305 continue;
306 clear_bit(NEEDS_COMPACTING, &page->private);
307 spin_unlock(&pool->stale_lock);
308 cancel_work_sync(&zhdr->work);
309 free_z3fold_page(page);
310 cond_resched();
311 spin_lock(&pool->stale_lock);
312 }
313 spin_unlock(&pool->stale_lock);
314}
315
231/* 316/*
232 * Returns the number of free chunks in a z3fold page. 317 * Returns the number of free chunks in a z3fold page.
233 * NB: can't be used with HEADLESS pages. 318 * NB: can't be used with HEADLESS pages.
@@ -252,46 +337,6 @@ static int num_free_chunks(struct z3fold_header *zhdr)
252 return nfree; 337 return nfree;
253} 338}
254 339
255/*****************
256 * API Functions
257*****************/
258/**
259 * z3fold_create_pool() - create a new z3fold pool
260 * @gfp: gfp flags when allocating the z3fold pool structure
261 * @ops: user-defined operations for the z3fold pool
262 *
263 * Return: pointer to the new z3fold pool or NULL if the metadata allocation
264 * failed.
265 */
266static struct z3fold_pool *z3fold_create_pool(gfp_t gfp,
267 const struct z3fold_ops *ops)
268{
269 struct z3fold_pool *pool;
270 int i;
271
272 pool = kzalloc(sizeof(struct z3fold_pool), gfp);
273 if (!pool)
274 return NULL;
275 spin_lock_init(&pool->lock);
276 for_each_unbuddied_list(i, 0)
277 INIT_LIST_HEAD(&pool->unbuddied[i]);
278 INIT_LIST_HEAD(&pool->lru);
279 atomic64_set(&pool->pages_nr, 0);
280 pool->ops = ops;
281 return pool;
282}
283
284/**
285 * z3fold_destroy_pool() - destroys an existing z3fold pool
286 * @pool: the z3fold pool to be destroyed
287 *
288 * The pool should be emptied before this function is called.
289 */
290static void z3fold_destroy_pool(struct z3fold_pool *pool)
291{
292 kfree(pool);
293}
294
295static inline void *mchunk_memmove(struct z3fold_header *zhdr, 340static inline void *mchunk_memmove(struct z3fold_header *zhdr,
296 unsigned short dst_chunk) 341 unsigned short dst_chunk)
297{ 342{
@@ -347,6 +392,117 @@ static int z3fold_compact_page(struct z3fold_header *zhdr)
347 return 0; 392 return 0;
348} 393}
349 394
395static void do_compact_page(struct z3fold_header *zhdr, bool locked)
396{
397 struct z3fold_pool *pool = zhdr->pool;
398 struct page *page;
399 struct list_head *unbuddied;
400 int fchunks;
401
402 page = virt_to_page(zhdr);
403 if (locked)
404 WARN_ON(z3fold_page_trylock(zhdr));
405 else
406 z3fold_page_lock(zhdr);
407 if (test_bit(PAGE_STALE, &page->private) ||
408 !test_and_clear_bit(NEEDS_COMPACTING, &page->private)) {
409 z3fold_page_unlock(zhdr);
410 return;
411 }
412 spin_lock(&pool->lock);
413 list_del_init(&zhdr->buddy);
414 spin_unlock(&pool->lock);
415
416 z3fold_compact_page(zhdr);
417 unbuddied = get_cpu_ptr(pool->unbuddied);
418 fchunks = num_free_chunks(zhdr);
419 if (fchunks < NCHUNKS &&
420 (!zhdr->first_chunks || !zhdr->middle_chunks ||
421 !zhdr->last_chunks)) {
422 /* the page's not completely free and it's unbuddied */
423 spin_lock(&pool->lock);
424 list_add(&zhdr->buddy, &unbuddied[fchunks]);
425 spin_unlock(&pool->lock);
426 zhdr->cpu = smp_processor_id();
427 }
428 put_cpu_ptr(pool->unbuddied);
429 z3fold_page_unlock(zhdr);
430}
431
432static void compact_page_work(struct work_struct *w)
433{
434 struct z3fold_header *zhdr = container_of(w, struct z3fold_header,
435 work);
436
437 do_compact_page(zhdr, false);
438}
439
440
441/*
442 * API Functions
443 */
444
445/**
446 * z3fold_create_pool() - create a new z3fold pool
447 * @name: pool name
448 * @gfp: gfp flags when allocating the z3fold pool structure
449 * @ops: user-defined operations for the z3fold pool
450 *
451 * Return: pointer to the new z3fold pool or NULL if the metadata allocation
452 * failed.
453 */
454static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
455 const struct z3fold_ops *ops)
456{
457 struct z3fold_pool *pool = NULL;
458 int i, cpu;
459
460 pool = kzalloc(sizeof(struct z3fold_pool), gfp);
461 if (!pool)
462 goto out;
463 spin_lock_init(&pool->lock);
464 spin_lock_init(&pool->stale_lock);
465 pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2);
466 for_each_possible_cpu(cpu) {
467 struct list_head *unbuddied =
468 per_cpu_ptr(pool->unbuddied, cpu);
469 for_each_unbuddied_list(i, 0)
470 INIT_LIST_HEAD(&unbuddied[i]);
471 }
472 INIT_LIST_HEAD(&pool->lru);
473 INIT_LIST_HEAD(&pool->stale);
474 atomic64_set(&pool->pages_nr, 0);
475 pool->name = name;
476 pool->compact_wq = create_singlethread_workqueue(pool->name);
477 if (!pool->compact_wq)
478 goto out;
479 pool->release_wq = create_singlethread_workqueue(pool->name);
480 if (!pool->release_wq)
481 goto out_wq;
482 INIT_WORK(&pool->work, free_pages_work);
483 pool->ops = ops;
484 return pool;
485
486out_wq:
487 destroy_workqueue(pool->compact_wq);
488out:
489 kfree(pool);
490 return NULL;
491}
492
493/**
494 * z3fold_destroy_pool() - destroys an existing z3fold pool
495 * @pool: the z3fold pool to be destroyed
496 *
497 * The pool should be emptied before this function is called.
498 */
499static void z3fold_destroy_pool(struct z3fold_pool *pool)
500{
501 destroy_workqueue(pool->release_wq);
502 destroy_workqueue(pool->compact_wq);
503 kfree(pool);
504}
505
350/** 506/**
351 * z3fold_alloc() - allocates a region of a given size 507 * z3fold_alloc() - allocates a region of a given size
352 * @pool: z3fold pool from which to allocate 508 * @pool: z3fold pool from which to allocate
@@ -371,8 +527,9 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
371{ 527{
372 int chunks = 0, i, freechunks; 528 int chunks = 0, i, freechunks;
373 struct z3fold_header *zhdr = NULL; 529 struct z3fold_header *zhdr = NULL;
530 struct page *page = NULL;
374 enum buddy bud; 531 enum buddy bud;
375 struct page *page; 532 bool can_sleep = (gfp & __GFP_RECLAIM) == __GFP_RECLAIM;
376 533
377 if (!size || (gfp & __GFP_HIGHMEM)) 534 if (!size || (gfp & __GFP_HIGHMEM))
378 return -EINVAL; 535 return -EINVAL;
@@ -383,23 +540,57 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
383 if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) 540 if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
384 bud = HEADLESS; 541 bud = HEADLESS;
385 else { 542 else {
543 struct list_head *unbuddied;
386 chunks = size_to_chunks(size); 544 chunks = size_to_chunks(size);
387 545
546lookup:
388 /* First, try to find an unbuddied z3fold page. */ 547 /* First, try to find an unbuddied z3fold page. */
389 zhdr = NULL; 548 unbuddied = get_cpu_ptr(pool->unbuddied);
390 for_each_unbuddied_list(i, chunks) { 549 for_each_unbuddied_list(i, chunks) {
391 spin_lock(&pool->lock); 550 struct list_head *l = &unbuddied[i];
392 zhdr = list_first_entry_or_null(&pool->unbuddied[i], 551
552 zhdr = list_first_entry_or_null(READ_ONCE(l),
393 struct z3fold_header, buddy); 553 struct z3fold_header, buddy);
394 if (!zhdr || !z3fold_page_trylock(zhdr)) { 554
395 spin_unlock(&pool->lock); 555 if (!zhdr)
396 continue; 556 continue;
557
558 /* Re-check under lock. */
559 spin_lock(&pool->lock);
560 l = &unbuddied[i];
561 if (unlikely(zhdr != list_first_entry(READ_ONCE(l),
562 struct z3fold_header, buddy)) ||
563 !z3fold_page_trylock(zhdr)) {
564 spin_unlock(&pool->lock);
565 put_cpu_ptr(pool->unbuddied);
566 goto lookup;
397 } 567 }
398 kref_get(&zhdr->refcount);
399 list_del_init(&zhdr->buddy); 568 list_del_init(&zhdr->buddy);
569 zhdr->cpu = -1;
400 spin_unlock(&pool->lock); 570 spin_unlock(&pool->lock);
401 571
402 page = virt_to_page(zhdr); 572 page = virt_to_page(zhdr);
573 if (test_bit(NEEDS_COMPACTING, &page->private)) {
574 z3fold_page_unlock(zhdr);
575 zhdr = NULL;
576 put_cpu_ptr(pool->unbuddied);
577 if (can_sleep)
578 cond_resched();
579 goto lookup;
580 }
581
582 /*
583 * this page could not be removed from its unbuddied
584 * list while pool lock was held, and then we've taken
585 * page lock so kref_put could not be called before
586 * we got here, so it's safe to just call kref_get()
587 */
588 kref_get(&zhdr->refcount);
589 break;
590 }
591 put_cpu_ptr(pool->unbuddied);
592
593 if (zhdr) {
403 if (zhdr->first_chunks == 0) { 594 if (zhdr->first_chunks == 0) {
404 if (zhdr->middle_chunks != 0 && 595 if (zhdr->middle_chunks != 0 &&
405 chunks >= zhdr->start_middle) 596 chunks >= zhdr->start_middle)
@@ -411,32 +602,49 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
411 else if (zhdr->middle_chunks == 0) 602 else if (zhdr->middle_chunks == 0)
412 bud = MIDDLE; 603 bud = MIDDLE;
413 else { 604 else {
414 z3fold_page_unlock(zhdr);
415 spin_lock(&pool->lock);
416 if (kref_put(&zhdr->refcount, 605 if (kref_put(&zhdr->refcount,
417 release_z3fold_page)) 606 release_z3fold_page_locked))
418 atomic64_dec(&pool->pages_nr); 607 atomic64_dec(&pool->pages_nr);
419 spin_unlock(&pool->lock); 608 else
609 z3fold_page_unlock(zhdr);
420 pr_err("No free chunks in unbuddied\n"); 610 pr_err("No free chunks in unbuddied\n");
421 WARN_ON(1); 611 WARN_ON(1);
422 continue; 612 goto lookup;
423 } 613 }
424 goto found; 614 goto found;
425 } 615 }
426 bud = FIRST; 616 bud = FIRST;
427 } 617 }
428 618
429 /* Couldn't find unbuddied z3fold page, create new one */ 619 spin_lock(&pool->stale_lock);
430 page = alloc_page(gfp); 620 zhdr = list_first_entry_or_null(&pool->stale,
621 struct z3fold_header, buddy);
622 /*
623 * Before allocating a page, let's see if we can take one from the
624 * stale pages list. cancel_work_sync() can sleep so we must make
625 * sure it won't be called in case we're in atomic context.
626 */
627 if (zhdr && (can_sleep || !work_pending(&zhdr->work) ||
628 !unlikely(work_busy(&zhdr->work)))) {
629 list_del(&zhdr->buddy);
630 clear_bit(NEEDS_COMPACTING, &page->private);
631 spin_unlock(&pool->stale_lock);
632 if (can_sleep)
633 cancel_work_sync(&zhdr->work);
634 page = virt_to_page(zhdr);
635 } else {
636 spin_unlock(&pool->stale_lock);
637 page = alloc_page(gfp);
638 }
639
431 if (!page) 640 if (!page)
432 return -ENOMEM; 641 return -ENOMEM;
433 642
434 atomic64_inc(&pool->pages_nr); 643 atomic64_inc(&pool->pages_nr);
435 zhdr = init_z3fold_page(page); 644 zhdr = init_z3fold_page(page, pool);
436 645
437 if (bud == HEADLESS) { 646 if (bud == HEADLESS) {
438 set_bit(PAGE_HEADLESS, &page->private); 647 set_bit(PAGE_HEADLESS, &page->private);
439 spin_lock(&pool->lock);
440 goto headless; 648 goto headless;
441 } 649 }
442 z3fold_page_lock(zhdr); 650 z3fold_page_lock(zhdr);
@@ -451,15 +659,21 @@ found:
451 zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS; 659 zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
452 } 660 }
453 661
454 spin_lock(&pool->lock);
455 if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 || 662 if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
456 zhdr->middle_chunks == 0) { 663 zhdr->middle_chunks == 0) {
664 struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);
665
457 /* Add to unbuddied list */ 666 /* Add to unbuddied list */
458 freechunks = num_free_chunks(zhdr); 667 freechunks = num_free_chunks(zhdr);
459 list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); 668 spin_lock(&pool->lock);
669 list_add(&zhdr->buddy, &unbuddied[freechunks]);
670 spin_unlock(&pool->lock);
671 zhdr->cpu = smp_processor_id();
672 put_cpu_ptr(pool->unbuddied);
460 } 673 }
461 674
462headless: 675headless:
676 spin_lock(&pool->lock);
463 /* Add/move z3fold page to beginning of LRU */ 677 /* Add/move z3fold page to beginning of LRU */
464 if (!list_empty(&page->lru)) 678 if (!list_empty(&page->lru))
465 list_del(&page->lru); 679 list_del(&page->lru);
@@ -487,7 +701,6 @@ headless:
487static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) 701static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
488{ 702{
489 struct z3fold_header *zhdr; 703 struct z3fold_header *zhdr;
490 int freechunks;
491 struct page *page; 704 struct page *page;
492 enum buddy bud; 705 enum buddy bud;
493 706
@@ -526,25 +739,27 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
526 spin_unlock(&pool->lock); 739 spin_unlock(&pool->lock);
527 free_z3fold_page(page); 740 free_z3fold_page(page);
528 atomic64_dec(&pool->pages_nr); 741 atomic64_dec(&pool->pages_nr);
529 } else { 742 return;
530 if (zhdr->first_chunks != 0 || zhdr->middle_chunks != 0 || 743 }
531 zhdr->last_chunks != 0) { 744
532 z3fold_compact_page(zhdr); 745 if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) {
533 /* Add to the unbuddied list */ 746 atomic64_dec(&pool->pages_nr);
534 spin_lock(&pool->lock); 747 return;
535 if (!list_empty(&zhdr->buddy)) 748 }
536 list_del(&zhdr->buddy); 749 if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
537 freechunks = num_free_chunks(zhdr);
538 list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
539 spin_unlock(&pool->lock);
540 }
541 z3fold_page_unlock(zhdr); 750 z3fold_page_unlock(zhdr);
751 return;
752 }
753 if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {
542 spin_lock(&pool->lock); 754 spin_lock(&pool->lock);
543 if (kref_put(&zhdr->refcount, release_z3fold_page)) 755 list_del_init(&zhdr->buddy);
544 atomic64_dec(&pool->pages_nr);
545 spin_unlock(&pool->lock); 756 spin_unlock(&pool->lock);
757 zhdr->cpu = -1;
758 do_compact_page(zhdr, true);
759 return;
546 } 760 }
547 761 queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
762 z3fold_page_unlock(zhdr);
548} 763}
549 764
550/** 765/**
@@ -585,9 +800,10 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
585 */ 800 */
586static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) 801static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
587{ 802{
588 int i, ret = 0, freechunks; 803 int i, ret = 0;
589 struct z3fold_header *zhdr; 804 struct z3fold_header *zhdr = NULL;
590 struct page *page; 805 struct page *page = NULL;
806 struct list_head *pos;
591 unsigned long first_handle = 0, middle_handle = 0, last_handle = 0; 807 unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;
592 808
593 spin_lock(&pool->lock); 809 spin_lock(&pool->lock);
@@ -600,16 +816,24 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
600 spin_unlock(&pool->lock); 816 spin_unlock(&pool->lock);
601 return -EINVAL; 817 return -EINVAL;
602 } 818 }
603 page = list_last_entry(&pool->lru, struct page, lru); 819 list_for_each_prev(pos, &pool->lru) {
820 page = list_entry(pos, struct page, lru);
821 if (test_bit(PAGE_HEADLESS, &page->private))
822 /* candidate found */
823 break;
824
825 zhdr = page_address(page);
826 if (!z3fold_page_trylock(zhdr))
827 continue; /* can't evict at this point */
828 kref_get(&zhdr->refcount);
829 list_del_init(&zhdr->buddy);
830 zhdr->cpu = -1;
831 }
832
604 list_del_init(&page->lru); 833 list_del_init(&page->lru);
834 spin_unlock(&pool->lock);
605 835
606 zhdr = page_address(page);
607 if (!test_bit(PAGE_HEADLESS, &page->private)) { 836 if (!test_bit(PAGE_HEADLESS, &page->private)) {
608 if (!list_empty(&zhdr->buddy))
609 list_del_init(&zhdr->buddy);
610 kref_get(&zhdr->refcount);
611 spin_unlock(&pool->lock);
612 z3fold_page_lock(zhdr);
613 /* 837 /*
614 * We need encode the handles before unlocking, since 838 * We need encode the handles before unlocking, since
615 * we can race with free that will set 839 * we can race with free that will set
@@ -624,11 +848,14 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
624 middle_handle = encode_handle(zhdr, MIDDLE); 848 middle_handle = encode_handle(zhdr, MIDDLE);
625 if (zhdr->last_chunks) 849 if (zhdr->last_chunks)
626 last_handle = encode_handle(zhdr, LAST); 850 last_handle = encode_handle(zhdr, LAST);
851 /*
852 * it's safe to unlock here because we hold a
853 * reference to this page
854 */
627 z3fold_page_unlock(zhdr); 855 z3fold_page_unlock(zhdr);
628 } else { 856 } else {
629 first_handle = encode_handle(zhdr, HEADLESS); 857 first_handle = encode_handle(zhdr, HEADLESS);
630 last_handle = middle_handle = 0; 858 last_handle = middle_handle = 0;
631 spin_unlock(&pool->lock);
632 } 859 }
633 860
634 /* Issue the eviction callback(s) */ 861 /* Issue the eviction callback(s) */
@@ -652,31 +879,12 @@ next:
652 if (ret == 0) { 879 if (ret == 0) {
653 free_z3fold_page(page); 880 free_z3fold_page(page);
654 return 0; 881 return 0;
655 } else {
656 spin_lock(&pool->lock);
657 }
658 } else {
659 z3fold_page_lock(zhdr);
660 if ((zhdr->first_chunks || zhdr->last_chunks ||
661 zhdr->middle_chunks) &&
662 !(zhdr->first_chunks && zhdr->last_chunks &&
663 zhdr->middle_chunks)) {
664 z3fold_compact_page(zhdr);
665 /* add to unbuddied list */
666 spin_lock(&pool->lock);
667 freechunks = num_free_chunks(zhdr);
668 list_add(&zhdr->buddy,
669 &pool->unbuddied[freechunks]);
670 spin_unlock(&pool->lock);
671 }
672 z3fold_page_unlock(zhdr);
673 spin_lock(&pool->lock);
674 if (kref_put(&zhdr->refcount, release_z3fold_page)) {
675 spin_unlock(&pool->lock);
676 atomic64_dec(&pool->pages_nr);
677 return 0;
678 } 882 }
883 } else if (kref_put(&zhdr->refcount, release_z3fold_page)) {
884 atomic64_dec(&pool->pages_nr);
885 return 0;
679 } 886 }
887 spin_lock(&pool->lock);
680 888
681 /* 889 /*
682 * Add to the beginning of LRU. 890 * Add to the beginning of LRU.
@@ -795,7 +1003,8 @@ static void *z3fold_zpool_create(const char *name, gfp_t gfp,
795{ 1003{
796 struct z3fold_pool *pool; 1004 struct z3fold_pool *pool;
797 1005
798 pool = z3fold_create_pool(gfp, zpool_ops ? &z3fold_zpool_ops : NULL); 1006 pool = z3fold_create_pool(name, gfp,
1007 zpool_ops ? &z3fold_zpool_ops : NULL);
799 if (pool) { 1008 if (pool) {
800 pool->zpool = zpool; 1009 pool->zpool = zpool;
801 pool->zpool_ops = zpool_ops; 1010 pool->zpool_ops = zpool_ops;
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 308acb9d814b..62457eb82330 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1983,8 +1983,11 @@ int zs_page_migrate(struct address_space *mapping, struct page *newpage,
1983 1983
1984 spin_lock(&class->lock); 1984 spin_lock(&class->lock);
1985 if (!get_zspage_inuse(zspage)) { 1985 if (!get_zspage_inuse(zspage)) {
1986 ret = -EBUSY; 1986 /*
1987                 goto unlock_class;        1987                 * Set "offset" to end of the page so that every loop
1988 * skips unnecessary object scanning.
1989 */
1990 offset = PAGE_SIZE;
1988 } 1991 }
1989 1992
1990 pos = offset; 1993 pos = offset;
@@ -2052,7 +2055,6 @@ unpin_objects:
2052 } 2055 }
2053 } 2056 }
2054 kunmap_atomic(s_addr); 2057 kunmap_atomic(s_addr);
2055unlock_class:
2056 spin_unlock(&class->lock); 2058 spin_unlock(&class->lock);
2057 migrate_write_unlock(zspage); 2059 migrate_write_unlock(zspage);
2058 2060
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index 48397feb08fb..b920d186ad4a 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -261,7 +261,17 @@ static enum export export_no(const char *s)
261 return export_unknown; 261 return export_unknown;
262} 262}
263 263
264static const char *sec_name(struct elf_info *elf, int secindex); 264static const char *sech_name(struct elf_info *elf, Elf_Shdr *sechdr)
265{
266 return (void *)elf->hdr +
267 elf->sechdrs[elf->secindex_strings].sh_offset +
268 sechdr->sh_name;
269}
270
271static const char *sec_name(struct elf_info *elf, int secindex)
272{
273 return sech_name(elf, &elf->sechdrs[secindex]);
274}
265 275
266#define strstarts(str, prefix) (strncmp(str, prefix, strlen(prefix)) == 0) 276#define strstarts(str, prefix) (strncmp(str, prefix, strlen(prefix)) == 0)
267 277
@@ -775,21 +785,6 @@ static const char *sym_name(struct elf_info *elf, Elf_Sym *sym)
775 return "(unknown)"; 785 return "(unknown)";
776} 786}
777 787
778static const char *sec_name(struct elf_info *elf, int secindex)
779{
780 Elf_Shdr *sechdrs = elf->sechdrs;
781 return (void *)elf->hdr +
782 elf->sechdrs[elf->secindex_strings].sh_offset +
783 sechdrs[secindex].sh_name;
784}
785
786static const char *sech_name(struct elf_info *elf, Elf_Shdr *sechdr)
787{
788 return (void *)elf->hdr +
789 elf->sechdrs[elf->secindex_strings].sh_offset +
790 sechdr->sh_name;
791}
792
793/* The pattern is an array of simple patterns. 788/* The pattern is an array of simple patterns.
794 * "foo" will match an exact string equal to "foo" 789 * "foo" will match an exact string equal to "foo"
795 * "*foo" will match a string that ends with "foo" 790 * "*foo" will match a string that ends with "foo"
diff --git a/tools/testing/selftests/memfd/Makefile b/tools/testing/selftests/memfd/Makefile
index ad8a0897e47f..bc9d02d615da 100644
--- a/tools/testing/selftests/memfd/Makefile
+++ b/tools/testing/selftests/memfd/Makefile
@@ -3,7 +3,7 @@ CFLAGS += -I../../../../include/uapi/
3CFLAGS += -I../../../../include/ 3CFLAGS += -I../../../../include/
4CFLAGS += -I../../../../usr/include/ 4CFLAGS += -I../../../../usr/include/
5 5
6TEST_PROGS := run_fuse_test.sh 6TEST_PROGS := run_tests.sh
7TEST_GEN_FILES := memfd_test fuse_mnt fuse_test 7TEST_GEN_FILES := memfd_test fuse_mnt fuse_test
8 8
9fuse_mnt.o: CFLAGS += $(shell pkg-config fuse --cflags) 9fuse_mnt.o: CFLAGS += $(shell pkg-config fuse --cflags)
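
The selftest changes below exercise memfd_create() with the new MFD_HUGETLB flag. As a minimal standalone illustration (not part of the patch; it assumes uapi headers from this series and at least one reserved huge page, e.g. vm.nr_hugepages > 0), creating and mapping a hugetlbfs-backed memfd looks roughly like this:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <linux/memfd.h>        /* MFD_CLOEXEC; MFD_HUGETLB with this series */

#ifndef MFD_HUGETLB
#define MFD_HUGETLB 0x0004U     /* fallback for older uapi headers */
#endif

int main(void)
{
	size_t size = 2UL << 20;    /* one 2 MiB huge page; must be a multiple */
	char *p;
	int fd = syscall(__NR_memfd_create, "hugetlb_demo",
			 MFD_CLOEXEC | MFD_HUGETLB);

	if (fd < 0) {
		perror("memfd_create");
		return 1;
	}
	if (ftruncate(fd, size) < 0) {
		perror("ftruncate");
		return 1;
	}
	/* fails with ENOMEM if no huge pages are reserved */
	p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	strcpy(p, "hello from a hugetlbfs-backed memfd");
	printf("%s\n", p);
	munmap(p, size);
	close(fd);
	return 0;
}
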
diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c
index 26546892cd54..f94c6d1fb46f 100644
--- a/tools/testing/selftests/memfd/memfd_test.c
+++ b/tools/testing/selftests/memfd/memfd_test.c
@@ -18,12 +18,48 @@
18#include <sys/wait.h> 18#include <sys/wait.h>
19#include <unistd.h> 19#include <unistd.h>
20 20
21#define MEMFD_STR "memfd:"
22#define SHARED_FT_STR "(shared file-table)"
23
21#define MFD_DEF_SIZE 8192 24#define MFD_DEF_SIZE 8192
22#define STACK_SIZE 65536 25#define STACK_SIZE 65536
23 26
27/*
28 * Default is not to test hugetlbfs
29 */
30static int hugetlbfs_test;
31static size_t mfd_def_size = MFD_DEF_SIZE;
32
33/*
34 * Copied from mlock2-tests.c
35 */
36static unsigned long default_huge_page_size(void)
37{
38 unsigned long hps = 0;
39 char *line = NULL;
40 size_t linelen = 0;
41 FILE *f = fopen("/proc/meminfo", "r");
42
43 if (!f)
44 return 0;
45 while (getline(&line, &linelen, f) > 0) {
46 if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) {
47 hps <<= 10;
48 break;
49 }
50 }
51
52 free(line);
53 fclose(f);
54 return hps;
55}
56
24static int sys_memfd_create(const char *name, 57static int sys_memfd_create(const char *name,
25 unsigned int flags) 58 unsigned int flags)
26{ 59{
60 if (hugetlbfs_test)
61 flags |= MFD_HUGETLB;
62
27 return syscall(__NR_memfd_create, name, flags); 63 return syscall(__NR_memfd_create, name, flags);
28} 64}
29 65
@@ -150,7 +186,7 @@ static void *mfd_assert_mmap_shared(int fd)
150 void *p; 186 void *p;
151 187
152 p = mmap(NULL, 188 p = mmap(NULL,
153 MFD_DEF_SIZE, 189 mfd_def_size,
154 PROT_READ | PROT_WRITE, 190 PROT_READ | PROT_WRITE,
155 MAP_SHARED, 191 MAP_SHARED,
156 fd, 192 fd,
@@ -168,7 +204,7 @@ static void *mfd_assert_mmap_private(int fd)
168 void *p; 204 void *p;
169 205
170 p = mmap(NULL, 206 p = mmap(NULL,
171 MFD_DEF_SIZE, 207 mfd_def_size,
172 PROT_READ, 208 PROT_READ,
173 MAP_PRIVATE, 209 MAP_PRIVATE,
174 fd, 210 fd,
@@ -223,7 +259,7 @@ static void mfd_assert_read(int fd)
223 259
224 /* verify PROT_READ *is* allowed */ 260 /* verify PROT_READ *is* allowed */
225 p = mmap(NULL, 261 p = mmap(NULL,
226 MFD_DEF_SIZE, 262 mfd_def_size,
227 PROT_READ, 263 PROT_READ,
228 MAP_PRIVATE, 264 MAP_PRIVATE,
229 fd, 265 fd,
@@ -232,11 +268,11 @@ static void mfd_assert_read(int fd)
232 printf("mmap() failed: %m\n"); 268 printf("mmap() failed: %m\n");
233 abort(); 269 abort();
234 } 270 }
235 munmap(p, MFD_DEF_SIZE); 271 munmap(p, mfd_def_size);
236 272
237 /* verify MAP_PRIVATE is *always* allowed (even writable) */ 273 /* verify MAP_PRIVATE is *always* allowed (even writable) */
238 p = mmap(NULL, 274 p = mmap(NULL,
239 MFD_DEF_SIZE, 275 mfd_def_size,
240 PROT_READ | PROT_WRITE, 276 PROT_READ | PROT_WRITE,
241 MAP_PRIVATE, 277 MAP_PRIVATE,
242 fd, 278 fd,
@@ -245,7 +281,7 @@ static void mfd_assert_read(int fd)
245 printf("mmap() failed: %m\n"); 281 printf("mmap() failed: %m\n");
246 abort(); 282 abort();
247 } 283 }
248 munmap(p, MFD_DEF_SIZE); 284 munmap(p, mfd_def_size);
249} 285}
250 286
251static void mfd_assert_write(int fd) 287static void mfd_assert_write(int fd)
@@ -254,16 +290,22 @@ static void mfd_assert_write(int fd)
254 void *p; 290 void *p;
255 int r; 291 int r;
256 292
257 /* verify write() succeeds */ 293 /*
 258         /* verify write() succeeds */        294          * hugetlbfs does not support write, but we want to
259 if (l != 4) { 295 * verify everything else here.
260 printf("write() failed: %m\n"); 296 */
261 abort(); 297 if (!hugetlbfs_test) {
298 /* verify write() succeeds */
299 l = write(fd, "\0\0\0\0", 4);
300 if (l != 4) {
301 printf("write() failed: %m\n");
302 abort();
303 }
262 } 304 }
263 305
264 /* verify PROT_READ | PROT_WRITE is allowed */ 306 /* verify PROT_READ | PROT_WRITE is allowed */
265 p = mmap(NULL, 307 p = mmap(NULL,
266 MFD_DEF_SIZE, 308 mfd_def_size,
267 PROT_READ | PROT_WRITE, 309 PROT_READ | PROT_WRITE,
268 MAP_SHARED, 310 MAP_SHARED,
269 fd, 311 fd,
@@ -273,11 +315,11 @@ static void mfd_assert_write(int fd)
273 abort(); 315 abort();
274 } 316 }
275 *(char *)p = 0; 317 *(char *)p = 0;
276 munmap(p, MFD_DEF_SIZE); 318 munmap(p, mfd_def_size);
277 319
278 /* verify PROT_WRITE is allowed */ 320 /* verify PROT_WRITE is allowed */
279 p = mmap(NULL, 321 p = mmap(NULL,
280 MFD_DEF_SIZE, 322 mfd_def_size,
281 PROT_WRITE, 323 PROT_WRITE,
282 MAP_SHARED, 324 MAP_SHARED,
283 fd, 325 fd,
@@ -287,12 +329,12 @@ static void mfd_assert_write(int fd)
287 abort(); 329 abort();
288 } 330 }
289 *(char *)p = 0; 331 *(char *)p = 0;
290 munmap(p, MFD_DEF_SIZE); 332 munmap(p, mfd_def_size);
291 333
292 /* verify PROT_READ with MAP_SHARED is allowed and a following 334 /* verify PROT_READ with MAP_SHARED is allowed and a following
293 * mprotect(PROT_WRITE) allows writing */ 335 * mprotect(PROT_WRITE) allows writing */
294 p = mmap(NULL, 336 p = mmap(NULL,
295 MFD_DEF_SIZE, 337 mfd_def_size,
296 PROT_READ, 338 PROT_READ,
297 MAP_SHARED, 339 MAP_SHARED,
298 fd, 340 fd,
@@ -302,20 +344,20 @@ static void mfd_assert_write(int fd)
302 abort(); 344 abort();
303 } 345 }
304 346
305 r = mprotect(p, MFD_DEF_SIZE, PROT_READ | PROT_WRITE); 347 r = mprotect(p, mfd_def_size, PROT_READ | PROT_WRITE);
306 if (r < 0) { 348 if (r < 0) {
307 printf("mprotect() failed: %m\n"); 349 printf("mprotect() failed: %m\n");
308 abort(); 350 abort();
309 } 351 }
310 352
311 *(char *)p = 0; 353 *(char *)p = 0;
312 munmap(p, MFD_DEF_SIZE); 354 munmap(p, mfd_def_size);
313 355
314 /* verify PUNCH_HOLE works */ 356 /* verify PUNCH_HOLE works */
315 r = fallocate(fd, 357 r = fallocate(fd,
316 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 358 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
317 0, 359 0,
318 MFD_DEF_SIZE); 360 mfd_def_size);
319 if (r < 0) { 361 if (r < 0) {
320 printf("fallocate(PUNCH_HOLE) failed: %m\n"); 362 printf("fallocate(PUNCH_HOLE) failed: %m\n");
321 abort(); 363 abort();
@@ -337,7 +379,7 @@ static void mfd_fail_write(int fd)
337 379
338 /* verify PROT_READ | PROT_WRITE is not allowed */ 380 /* verify PROT_READ | PROT_WRITE is not allowed */
339 p = mmap(NULL, 381 p = mmap(NULL,
340 MFD_DEF_SIZE, 382 mfd_def_size,
341 PROT_READ | PROT_WRITE, 383 PROT_READ | PROT_WRITE,
342 MAP_SHARED, 384 MAP_SHARED,
343 fd, 385 fd,
@@ -349,7 +391,7 @@ static void mfd_fail_write(int fd)
349 391
350 /* verify PROT_WRITE is not allowed */ 392 /* verify PROT_WRITE is not allowed */
351 p = mmap(NULL, 393 p = mmap(NULL,
352 MFD_DEF_SIZE, 394 mfd_def_size,
353 PROT_WRITE, 395 PROT_WRITE,
354 MAP_SHARED, 396 MAP_SHARED,
355 fd, 397 fd,
@@ -362,13 +404,13 @@ static void mfd_fail_write(int fd)
362 /* Verify PROT_READ with MAP_SHARED with a following mprotect is not 404 /* Verify PROT_READ with MAP_SHARED with a following mprotect is not
363 * allowed. Note that for r/w the kernel already prevents the mmap. */ 405 * allowed. Note that for r/w the kernel already prevents the mmap. */
364 p = mmap(NULL, 406 p = mmap(NULL,
365 MFD_DEF_SIZE, 407 mfd_def_size,
366 PROT_READ, 408 PROT_READ,
367 MAP_SHARED, 409 MAP_SHARED,
368 fd, 410 fd,
369 0); 411 0);
370 if (p != MAP_FAILED) { 412 if (p != MAP_FAILED) {
371 r = mprotect(p, MFD_DEF_SIZE, PROT_READ | PROT_WRITE); 413 r = mprotect(p, mfd_def_size, PROT_READ | PROT_WRITE);
372 if (r >= 0) { 414 if (r >= 0) {
373 printf("mmap()+mprotect() didn't fail as expected\n"); 415 printf("mmap()+mprotect() didn't fail as expected\n");
374 abort(); 416 abort();
@@ -379,7 +421,7 @@ static void mfd_fail_write(int fd)
379 r = fallocate(fd, 421 r = fallocate(fd,
380 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 422 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
381 0, 423 0,
382 MFD_DEF_SIZE); 424 mfd_def_size);
383 if (r >= 0) { 425 if (r >= 0) {
384 printf("fallocate(PUNCH_HOLE) didn't fail as expected\n"); 426 printf("fallocate(PUNCH_HOLE) didn't fail as expected\n");
385 abort(); 427 abort();
@@ -390,13 +432,13 @@ static void mfd_assert_shrink(int fd)
390{ 432{
391 int r, fd2; 433 int r, fd2;
392 434
393 r = ftruncate(fd, MFD_DEF_SIZE / 2); 435 r = ftruncate(fd, mfd_def_size / 2);
394 if (r < 0) { 436 if (r < 0) {
395 printf("ftruncate(SHRINK) failed: %m\n"); 437 printf("ftruncate(SHRINK) failed: %m\n");
396 abort(); 438 abort();
397 } 439 }
398 440
399 mfd_assert_size(fd, MFD_DEF_SIZE / 2); 441 mfd_assert_size(fd, mfd_def_size / 2);
400 442
401 fd2 = mfd_assert_open(fd, 443 fd2 = mfd_assert_open(fd,
402 O_RDWR | O_CREAT | O_TRUNC, 444 O_RDWR | O_CREAT | O_TRUNC,
@@ -410,7 +452,7 @@ static void mfd_fail_shrink(int fd)
410{ 452{
411 int r; 453 int r;
412 454
413 r = ftruncate(fd, MFD_DEF_SIZE / 2); 455 r = ftruncate(fd, mfd_def_size / 2);
414 if (r >= 0) { 456 if (r >= 0) {
415 printf("ftruncate(SHRINK) didn't fail as expected\n"); 457 printf("ftruncate(SHRINK) didn't fail as expected\n");
416 abort(); 458 abort();
@@ -425,31 +467,31 @@ static void mfd_assert_grow(int fd)
425{ 467{
426 int r; 468 int r;
427 469
428 r = ftruncate(fd, MFD_DEF_SIZE * 2); 470 r = ftruncate(fd, mfd_def_size * 2);
429 if (r < 0) { 471 if (r < 0) {
430 printf("ftruncate(GROW) failed: %m\n"); 472 printf("ftruncate(GROW) failed: %m\n");
431 abort(); 473 abort();
432 } 474 }
433 475
434 mfd_assert_size(fd, MFD_DEF_SIZE * 2); 476 mfd_assert_size(fd, mfd_def_size * 2);
435 477
436 r = fallocate(fd, 478 r = fallocate(fd,
437 0, 479 0,
438 0, 480 0,
439 MFD_DEF_SIZE * 4); 481 mfd_def_size * 4);
440 if (r < 0) { 482 if (r < 0) {
441 printf("fallocate(ALLOC) failed: %m\n"); 483 printf("fallocate(ALLOC) failed: %m\n");
442 abort(); 484 abort();
443 } 485 }
444 486
445 mfd_assert_size(fd, MFD_DEF_SIZE * 4); 487 mfd_assert_size(fd, mfd_def_size * 4);
446} 488}
447 489
448static void mfd_fail_grow(int fd) 490static void mfd_fail_grow(int fd)
449{ 491{
450 int r; 492 int r;
451 493
452 r = ftruncate(fd, MFD_DEF_SIZE * 2); 494 r = ftruncate(fd, mfd_def_size * 2);
453 if (r >= 0) { 495 if (r >= 0) {
454 printf("ftruncate(GROW) didn't fail as expected\n"); 496 printf("ftruncate(GROW) didn't fail as expected\n");
455 abort(); 497 abort();
@@ -458,7 +500,7 @@ static void mfd_fail_grow(int fd)
458 r = fallocate(fd, 500 r = fallocate(fd,
459 0, 501 0,
460 0, 502 0,
461 MFD_DEF_SIZE * 4); 503 mfd_def_size * 4);
462 if (r >= 0) { 504 if (r >= 0) {
463 printf("fallocate(ALLOC) didn't fail as expected\n"); 505 printf("fallocate(ALLOC) didn't fail as expected\n");
464 abort(); 506 abort();
@@ -467,25 +509,37 @@ static void mfd_fail_grow(int fd)
467 509
468static void mfd_assert_grow_write(int fd) 510static void mfd_assert_grow_write(int fd)
469{ 511{
470 static char buf[MFD_DEF_SIZE * 8]; 512 static char *buf;
471 ssize_t l; 513 ssize_t l;
472 514
473 l = pwrite(fd, buf, sizeof(buf), 0); 515 buf = malloc(mfd_def_size * 8);
474 if (l != sizeof(buf)) { 516 if (!buf) {
 517                 printf("malloc(%zu) failed: %m\n", mfd_def_size * 8);
518 abort();
519 }
520
521 l = pwrite(fd, buf, mfd_def_size * 8, 0);
522 if (l != (mfd_def_size * 8)) {
475 printf("pwrite() failed: %m\n"); 523 printf("pwrite() failed: %m\n");
476 abort(); 524 abort();
477 } 525 }
478 526
479 mfd_assert_size(fd, MFD_DEF_SIZE * 8); 527 mfd_assert_size(fd, mfd_def_size * 8);
480} 528}
481 529
482static void mfd_fail_grow_write(int fd) 530static void mfd_fail_grow_write(int fd)
483{ 531{
484 static char buf[MFD_DEF_SIZE * 8]; 532 static char *buf;
485 ssize_t l; 533 ssize_t l;
486 534
487 l = pwrite(fd, buf, sizeof(buf), 0); 535 buf = malloc(mfd_def_size * 8);
488 if (l == sizeof(buf)) { 536 if (!buf) {
 537                 printf("malloc(%zu) failed: %m\n", mfd_def_size * 8);
538 abort();
539 }
540
541 l = pwrite(fd, buf, mfd_def_size * 8, 0);
542 if (l == (mfd_def_size * 8)) {
489 printf("pwrite() didn't fail as expected\n"); 543 printf("pwrite() didn't fail as expected\n");
490 abort(); 544 abort();
491 } 545 }
@@ -543,6 +597,8 @@ static void test_create(void)
543 char buf[2048]; 597 char buf[2048];
544 int fd; 598 int fd;
545 599
600 printf("%s CREATE\n", MEMFD_STR);
601
546 /* test NULL name */ 602 /* test NULL name */
547 mfd_fail_new(NULL, 0); 603 mfd_fail_new(NULL, 0);
548 604
@@ -570,13 +626,18 @@ static void test_create(void)
570 fd = mfd_assert_new("", 0, MFD_CLOEXEC); 626 fd = mfd_assert_new("", 0, MFD_CLOEXEC);
571 close(fd); 627 close(fd);
572 628
573 /* verify MFD_ALLOW_SEALING is allowed */ 629 if (!hugetlbfs_test) {
574 fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING); 630 /* verify MFD_ALLOW_SEALING is allowed */
575 close(fd); 631 fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING);
576 632 close(fd);
577 /* verify MFD_ALLOW_SEALING | MFD_CLOEXEC is allowed */ 633
578 fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING | MFD_CLOEXEC); 634 /* verify MFD_ALLOW_SEALING | MFD_CLOEXEC is allowed */
579 close(fd); 635 fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING | MFD_CLOEXEC);
636 close(fd);
637 } else {
638 /* sealing is not supported on hugetlbfs */
639 mfd_fail_new("", MFD_ALLOW_SEALING);
640 }
580} 641}
581 642
582/* 643/*
@@ -587,8 +648,14 @@ static void test_basic(void)
587{ 648{
588 int fd; 649 int fd;
589 650
651 /* hugetlbfs does not contain sealing support */
652 if (hugetlbfs_test)
653 return;
654
655 printf("%s BASIC\n", MEMFD_STR);
656
590 fd = mfd_assert_new("kern_memfd_basic", 657 fd = mfd_assert_new("kern_memfd_basic",
591 MFD_DEF_SIZE, 658 mfd_def_size,
592 MFD_CLOEXEC | MFD_ALLOW_SEALING); 659 MFD_CLOEXEC | MFD_ALLOW_SEALING);
593 660
594 /* add basic seals */ 661 /* add basic seals */
@@ -619,7 +686,7 @@ static void test_basic(void)
619 686
620 /* verify sealing does not work without MFD_ALLOW_SEALING */ 687 /* verify sealing does not work without MFD_ALLOW_SEALING */
621 fd = mfd_assert_new("kern_memfd_basic", 688 fd = mfd_assert_new("kern_memfd_basic",
622 MFD_DEF_SIZE, 689 mfd_def_size,
623 MFD_CLOEXEC); 690 MFD_CLOEXEC);
624 mfd_assert_has_seals(fd, F_SEAL_SEAL); 691 mfd_assert_has_seals(fd, F_SEAL_SEAL);
625 mfd_fail_add_seals(fd, F_SEAL_SHRINK | 692 mfd_fail_add_seals(fd, F_SEAL_SHRINK |
@@ -630,6 +697,28 @@ static void test_basic(void)
630} 697}
631 698
632/* 699/*
700 * hugetlbfs doesn't support seals or write, so just verify grow and shrink
701 * on a hugetlbfs file created via memfd_create.
702 */
703static void test_hugetlbfs_grow_shrink(void)
704{
705 int fd;
706
707 printf("%s HUGETLBFS-GROW-SHRINK\n", MEMFD_STR);
708
709 fd = mfd_assert_new("kern_memfd_seal_write",
710 mfd_def_size,
711 MFD_CLOEXEC);
712
713 mfd_assert_read(fd);
714 mfd_assert_write(fd);
715 mfd_assert_shrink(fd);
716 mfd_assert_grow(fd);
717
718 close(fd);
719}
720
721/*
633 * Test SEAL_WRITE 722 * Test SEAL_WRITE
634 * Test whether SEAL_WRITE actually prevents modifications. 723 * Test whether SEAL_WRITE actually prevents modifications.
635 */ 724 */
@@ -637,8 +726,17 @@ static void test_seal_write(void)
637{ 726{
638 int fd; 727 int fd;
639 728
729 /*
730 * hugetlbfs does not contain sealing or write support. Just test
731 * basic grow and shrink via test_hugetlbfs_grow_shrink.
732 */
733 if (hugetlbfs_test)
734 return test_hugetlbfs_grow_shrink();
735
736 printf("%s SEAL-WRITE\n", MEMFD_STR);
737
640 fd = mfd_assert_new("kern_memfd_seal_write", 738 fd = mfd_assert_new("kern_memfd_seal_write",
641 MFD_DEF_SIZE, 739 mfd_def_size,
642 MFD_CLOEXEC | MFD_ALLOW_SEALING); 740 MFD_CLOEXEC | MFD_ALLOW_SEALING);
643 mfd_assert_has_seals(fd, 0); 741 mfd_assert_has_seals(fd, 0);
644 mfd_assert_add_seals(fd, F_SEAL_WRITE); 742 mfd_assert_add_seals(fd, F_SEAL_WRITE);
@@ -661,8 +759,14 @@ static void test_seal_shrink(void)
661{ 759{
662 int fd; 760 int fd;
663 761
762 /* hugetlbfs does not contain sealing support */
763 if (hugetlbfs_test)
764 return;
765
766 printf("%s SEAL-SHRINK\n", MEMFD_STR);
767
664 fd = mfd_assert_new("kern_memfd_seal_shrink", 768 fd = mfd_assert_new("kern_memfd_seal_shrink",
665 MFD_DEF_SIZE, 769 mfd_def_size,
666 MFD_CLOEXEC | MFD_ALLOW_SEALING); 770 MFD_CLOEXEC | MFD_ALLOW_SEALING);
667 mfd_assert_has_seals(fd, 0); 771 mfd_assert_has_seals(fd, 0);
668 mfd_assert_add_seals(fd, F_SEAL_SHRINK); 772 mfd_assert_add_seals(fd, F_SEAL_SHRINK);
@@ -685,8 +789,14 @@ static void test_seal_grow(void)
685{ 789{
686 int fd; 790 int fd;
687 791
792 /* hugetlbfs does not support sealing */
793 if (hugetlbfs_test)
794 return;
795
796 printf("%s SEAL-GROW\n", MEMFD_STR);
797
688 fd = mfd_assert_new("kern_memfd_seal_grow", 798 fd = mfd_assert_new("kern_memfd_seal_grow",
689 MFD_DEF_SIZE, 799 mfd_def_size,
690 MFD_CLOEXEC | MFD_ALLOW_SEALING); 800 MFD_CLOEXEC | MFD_ALLOW_SEALING);
691 mfd_assert_has_seals(fd, 0); 801 mfd_assert_has_seals(fd, 0);
692 mfd_assert_add_seals(fd, F_SEAL_GROW); 802 mfd_assert_add_seals(fd, F_SEAL_GROW);
@@ -709,8 +819,14 @@ static void test_seal_resize(void)
709{ 819{
710 int fd; 820 int fd;
711 821
822 /* hugetlbfs does not support sealing */
823 if (hugetlbfs_test)
824 return;
825
826 printf("%s SEAL-RESIZE\n", MEMFD_STR);
827
712 fd = mfd_assert_new("kern_memfd_seal_resize", 828 fd = mfd_assert_new("kern_memfd_seal_resize",
713 MFD_DEF_SIZE, 829 mfd_def_size,
714 MFD_CLOEXEC | MFD_ALLOW_SEALING); 830 MFD_CLOEXEC | MFD_ALLOW_SEALING);
715 mfd_assert_has_seals(fd, 0); 831 mfd_assert_has_seals(fd, 0);
716 mfd_assert_add_seals(fd, F_SEAL_SHRINK | F_SEAL_GROW); 832 mfd_assert_add_seals(fd, F_SEAL_SHRINK | F_SEAL_GROW);
@@ -726,15 +842,52 @@ static void test_seal_resize(void)
726} 842}
727 843
728/* 844/*
845 * hugetlbfs does not support sealing. Basic test: dup() the memfd-created
846 * fd and perform some basic operations on it.
847 */
848static void hugetlbfs_dup(char *b_suffix)
849{
850 int fd, fd2;
851
852 printf("%s HUGETLBFS-DUP %s\n", MEMFD_STR, b_suffix);
853
854 fd = mfd_assert_new("kern_memfd_share_dup",
855 mfd_def_size,
856 MFD_CLOEXEC);
857
858 fd2 = mfd_assert_dup(fd);
859
860 mfd_assert_read(fd);
861 mfd_assert_write(fd);
862
863 mfd_assert_shrink(fd2);
864 mfd_assert_grow(fd2);
865
866 close(fd2);
867 close(fd);
868}
869
870/*
729 * Test sharing via dup() 871 * Test sharing via dup()
730 * Test that seals are shared between dupped FDs and they're all equal. 872 * Test that seals are shared between dupped FDs and they're all equal.
731 */ 873 */
732static void test_share_dup(void) 874static void test_share_dup(char *banner, char *b_suffix)
733{ 875{
734 int fd, fd2; 876 int fd, fd2;
735 877
878 /*
879 * hugetlbfs does not support sealing. Perform some
880 * basic testing on the dup'ed fd instead via hugetlbfs_dup.
881 */
882 if (hugetlbfs_test) {
883 hugetlbfs_dup(b_suffix);
884 return;
885 }
886
887 printf("%s %s %s\n", MEMFD_STR, banner, b_suffix);
888
736 fd = mfd_assert_new("kern_memfd_share_dup", 889 fd = mfd_assert_new("kern_memfd_share_dup",
737 MFD_DEF_SIZE, 890 mfd_def_size,
738 MFD_CLOEXEC | MFD_ALLOW_SEALING); 891 MFD_CLOEXEC | MFD_ALLOW_SEALING);
739 mfd_assert_has_seals(fd, 0); 892 mfd_assert_has_seals(fd, 0);
740 893
@@ -768,13 +921,19 @@ static void test_share_dup(void)
768 * Test sealing with active mmap()s 921 * Test sealing with active mmap()s
769 * Modifying seals is only allowed if no other mmap() refs exist. 922 * Modifying seals is only allowed if no other mmap() refs exist.
770 */ 923 */
771static void test_share_mmap(void) 924static void test_share_mmap(char *banner, char *b_suffix)
772{ 925{
773 int fd; 926 int fd;
774 void *p; 927 void *p;
775 928
929 /* hugetlbfs does not support sealing */
930 if (hugetlbfs_test)
931 return;
932
933 printf("%s %s %s\n", MEMFD_STR, banner, b_suffix);
934
776 fd = mfd_assert_new("kern_memfd_share_mmap", 935 fd = mfd_assert_new("kern_memfd_share_mmap",
777 MFD_DEF_SIZE, 936 mfd_def_size,
778 MFD_CLOEXEC | MFD_ALLOW_SEALING); 937 MFD_CLOEXEC | MFD_ALLOW_SEALING);
779 mfd_assert_has_seals(fd, 0); 938 mfd_assert_has_seals(fd, 0);
780 939
@@ -784,14 +943,40 @@ static void test_share_mmap(void)
784 mfd_assert_has_seals(fd, 0); 943 mfd_assert_has_seals(fd, 0);
785 mfd_assert_add_seals(fd, F_SEAL_SHRINK); 944 mfd_assert_add_seals(fd, F_SEAL_SHRINK);
786 mfd_assert_has_seals(fd, F_SEAL_SHRINK); 945 mfd_assert_has_seals(fd, F_SEAL_SHRINK);
787 munmap(p, MFD_DEF_SIZE); 946 munmap(p, mfd_def_size);
788 947
789 /* readable ref allows sealing */ 948 /* readable ref allows sealing */
790 p = mfd_assert_mmap_private(fd); 949 p = mfd_assert_mmap_private(fd);
791 mfd_assert_add_seals(fd, F_SEAL_WRITE); 950 mfd_assert_add_seals(fd, F_SEAL_WRITE);
792 mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK); 951 mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK);
793 munmap(p, MFD_DEF_SIZE); 952 munmap(p, mfd_def_size);
953
954 close(fd);
955}
956
957/*
958 * Basic test to make sure we can open the hugetlbfs fd via /proc and
959 * perform some simple operations on it.
960 */
961static void hugetlbfs_proc_open(char *b_suffix)
962{
963 int fd, fd2;
964
965 printf("%s HUGETLBFS-PROC-OPEN %s\n", MEMFD_STR, b_suffix);
794 966
967 fd = mfd_assert_new("kern_memfd_share_open",
968 mfd_def_size,
969 MFD_CLOEXEC);
970
971 fd2 = mfd_assert_open(fd, O_RDWR, 0);
972
973 mfd_assert_read(fd);
974 mfd_assert_write(fd);
975
976 mfd_assert_shrink(fd2);
977 mfd_assert_grow(fd2);
978
979 close(fd2);
795 close(fd); 980 close(fd);
796} 981}
797 982
@@ -801,12 +986,23 @@ static void test_share_mmap(void)
801 * This is *not* like dup(), but like a real separate open(). Make sure the 986 * This is *not* like dup(), but like a real separate open(). Make sure the
802 * semantics are as expected and we correctly check for RDONLY / WRONLY / RDWR. 987 * semantics are as expected and we correctly check for RDONLY / WRONLY / RDWR.
803 */ 988 */
804static void test_share_open(void) 989static void test_share_open(char *banner, char *b_suffix)
805{ 990{
806 int fd, fd2; 991 int fd, fd2;
807 992
993 /*
994 * hugetlbfs does not support sealing, so test basic
995 * functionality of the fd reopened via /proc in hugetlbfs_proc_open.
996 */
997 if (hugetlbfs_test) {
998 hugetlbfs_proc_open(b_suffix);
999 return;
1000 }
1001
1002 printf("%s %s %s\n", MEMFD_STR, banner, b_suffix);
1003
808 fd = mfd_assert_new("kern_memfd_share_open", 1004 fd = mfd_assert_new("kern_memfd_share_open",
809 MFD_DEF_SIZE, 1005 mfd_def_size,
810 MFD_CLOEXEC | MFD_ALLOW_SEALING); 1006 MFD_CLOEXEC | MFD_ALLOW_SEALING);
811 mfd_assert_has_seals(fd, 0); 1007 mfd_assert_has_seals(fd, 0);
812 1008
@@ -841,13 +1037,19 @@ static void test_share_open(void)
841 * Test sharing via fork() 1037 * Test sharing via fork()
842 * Test whether seal modifications work as expected with forked children. 1038
843 */ 1039 */
844static void test_share_fork(void) 1040static void test_share_fork(char *banner, char *b_suffix)
845{ 1041{
846 int fd; 1042 int fd;
847 pid_t pid; 1043 pid_t pid;
848 1044
1045 /* hugetlbfs does not support sealing */
1046 if (hugetlbfs_test)
1047 return;
1048
1049 printf("%s %s %s\n", MEMFD_STR, banner, b_suffix);
1050
849 fd = mfd_assert_new("kern_memfd_share_fork", 1051 fd = mfd_assert_new("kern_memfd_share_fork",
850 MFD_DEF_SIZE, 1052 mfd_def_size,
851 MFD_CLOEXEC | MFD_ALLOW_SEALING); 1053 MFD_CLOEXEC | MFD_ALLOW_SEALING);
852 mfd_assert_has_seals(fd, 0); 1054 mfd_assert_has_seals(fd, 0);
853 1055
@@ -870,40 +1072,40 @@ int main(int argc, char **argv)
870{ 1072{
871 pid_t pid; 1073 pid_t pid;
872 1074
873 printf("memfd: CREATE\n"); 1075 if (argc == 2) {
1076 if (!strcmp(argv[1], "hugetlbfs")) {
1077 unsigned long hpage_size = default_huge_page_size();
1078
1079 if (!hpage_size) {
1080 printf("Unable to determine huge page size\n");
1081 abort();
1082 }
1083
1084 hugetlbfs_test = 1;
1085 mfd_def_size = hpage_size * 2;
1086 }
1087 }
1088
874 test_create(); 1089 test_create();
875 printf("memfd: BASIC\n");
876 test_basic(); 1090 test_basic();
877 1091
878 printf("memfd: SEAL-WRITE\n");
879 test_seal_write(); 1092 test_seal_write();
880 printf("memfd: SEAL-SHRINK\n");
881 test_seal_shrink(); 1093 test_seal_shrink();
882 printf("memfd: SEAL-GROW\n");
883 test_seal_grow(); 1094 test_seal_grow();
884 printf("memfd: SEAL-RESIZE\n");
885 test_seal_resize(); 1095 test_seal_resize();
886 1096
887 printf("memfd: SHARE-DUP\n"); 1097 test_share_dup("SHARE-DUP", "");
888 test_share_dup(); 1098 test_share_mmap("SHARE-MMAP", "");
889 printf("memfd: SHARE-MMAP\n"); 1099 test_share_open("SHARE-OPEN", "");
890 test_share_mmap(); 1100 test_share_fork("SHARE-FORK", "");
891 printf("memfd: SHARE-OPEN\n");
892 test_share_open();
893 printf("memfd: SHARE-FORK\n");
894 test_share_fork();
895 1101
896 /* Run test-suite in a multi-threaded environment with a shared 1102 /* Run test-suite in a multi-threaded environment with a shared
897 * file-table. */ 1103 * file-table. */
898 pid = spawn_idle_thread(CLONE_FILES | CLONE_FS | CLONE_VM); 1104 pid = spawn_idle_thread(CLONE_FILES | CLONE_FS | CLONE_VM);
899 printf("memfd: SHARE-DUP (shared file-table)\n"); 1105 test_share_dup("SHARE-DUP", SHARED_FT_STR);
900 test_share_dup(); 1106 test_share_mmap("SHARE-MMAP", SHARED_FT_STR);
901 printf("memfd: SHARE-MMAP (shared file-table)\n"); 1107 test_share_open("SHARE-OPEN", SHARED_FT_STR);
902 test_share_mmap(); 1108 test_share_fork("SHARE-FORK", SHARED_FT_STR);
903 printf("memfd: SHARE-OPEN (shared file-table)\n");
904 test_share_open();
905 printf("memfd: SHARE-FORK (shared file-table)\n");
906 test_share_fork();
907 join_idle_thread(pid); 1109 join_idle_thread(pid);
908 1110
909 printf("memfd: DONE\n"); 1111 printf("memfd: DONE\n");
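main() now sizes the hugetlbfs test files as twice the default huge page size, obtained from default_huge_page_size(). That helper is not shown in this hunk; a plausible implementation, sketched here as an assumption rather than the actual selftest code, parses the Hugepagesize: line of /proc/meminfo:

/* Sketch of an assumed helper; the selftest's real implementation may differ. */
#include <stdio.h>

static unsigned long default_huge_page_size(void)
{
	unsigned long hps = 0;
	char line[128];
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return 0;
	while (fgets(line, sizeof(line), f)) {
		/* e.g. "Hugepagesize:       2048 kB" */
		if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) {
			hps <<= 10;	/* kB -> bytes */
			break;
		}
	}
	fclose(f);
	return hps;
}

With the common 2 MB default huge page size this makes mfd_def_size 4 MB, i.e. the hugetlbfs tests operate on two-huge-page files.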
diff --git a/tools/testing/selftests/memfd/run_tests.sh b/tools/testing/selftests/memfd/run_tests.sh
new file mode 100644
index 000000000000..daabb350697c
--- /dev/null
+++ b/tools/testing/selftests/memfd/run_tests.sh
@@ -0,0 +1,69 @@
1#!/bin/bash
2# please run as root
3
4#
5# Normal tests requiring no special resources
6#
7./run_fuse_test.sh
8./memfd_test
9
10#
11# To test memfd_create with hugetlbfs, there need to be at least hpages_test
12# free huge pages. Attempt to allocate enough pages to run the test.
13#
14hpages_test=8
15
16#
17# Get count of free huge pages from /proc/meminfo
18#
19while read name size unit; do
20 if [ "$name" = "HugePages_Free:" ]; then
21 freepgs=$size
22 fi
23done < /proc/meminfo
24
25#
26# If not enough free huge pages for test, attempt to increase
27#
28if [ -n "$freepgs" ] && [ $freepgs -lt $hpages_test ]; then
29 nr_hugepgs=`cat /proc/sys/vm/nr_hugepages`
30 hpages_needed=`expr $hpages_test - $freepgs`
31
32 echo 3 > /proc/sys/vm/drop_caches
33 echo $(( $hpages_needed + $nr_hugepgs )) > /proc/sys/vm/nr_hugepages
34 if [ $? -ne 0 ]; then
35 echo "Please run this test as root"
36 exit 1
37 fi
38 while read name size unit; do
39 if [ "$name" = "HugePages_Free:" ]; then
40 freepgs=$size
41 fi
42 done < /proc/meminfo
43fi
44
45#
46# If still not enough huge pages available, exit. But, give back any huge
47# pages potentially allocated above.
48#
49if [ $freepgs -lt $hpages_test ]; then
50	# nr_hugepgs is set only if we attempted to increase
51 if [ -n "$nr_hugepgs" ]; then
52 echo $nr_hugepgs > /proc/sys/vm/nr_hugepages
53 fi
54 printf "Not enough huge pages available (%d < %d)\n" \
55		$freepgs $hpages_test
56 exit 1
57fi
58
59#
60# Run the hugetlbfs test
61#
62./memfd_test hugetlbfs
63
64#
65# Give back any huge pages allocated for the test
66#
67if [ -n "$nr_hugepgs" ]; then
68 echo $nr_hugepgs > /proc/sys/vm/nr_hugepages
69fi
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index 1eae79ae5b4e..a2c53a3d223d 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -66,6 +66,8 @@
66#include <sys/wait.h> 66#include <sys/wait.h>
67#include <pthread.h> 67#include <pthread.h>
68#include <linux/userfaultfd.h> 68#include <linux/userfaultfd.h>
69#include <setjmp.h>
70#include <stdbool.h>
69 71
70#ifdef __NR_userfaultfd 72#ifdef __NR_userfaultfd
71 73
@@ -82,11 +84,17 @@ static int bounces;
82#define TEST_SHMEM 3 84#define TEST_SHMEM 3
83static int test_type; 85static int test_type;
84 86
87/* re-enable the test_uffdio_*_eexist retry paths every ALARM_INTERVAL_SECS */
88#define ALARM_INTERVAL_SECS 10
89static volatile bool test_uffdio_copy_eexist = true;
90static volatile bool test_uffdio_zeropage_eexist = true;
91
92static bool map_shared;
85static int huge_fd; 93static int huge_fd;
86static char *huge_fd_off0; 94static char *huge_fd_off0;
87static unsigned long long *count_verify; 95static unsigned long long *count_verify;
88static int uffd, uffd_flags, finished, *pipefd; 96static int uffd, uffd_flags, finished, *pipefd;
89static char *area_src, *area_dst; 97static char *area_src, *area_src_alias, *area_dst, *area_dst_alias;
90static char *zeropage; 98static char *zeropage;
91pthread_attr_t attr; 99pthread_attr_t attr;
92 100
@@ -125,6 +133,9 @@ static void anon_allocate_area(void **alloc_area)
125 } 133 }
126} 134}
127 135
136static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
137{
138}
128 139
129/* HugeTLB memory */ 140/* HugeTLB memory */
130static int hugetlb_release_pages(char *rel_area) 141static int hugetlb_release_pages(char *rel_area)
@@ -145,17 +156,51 @@ static int hugetlb_release_pages(char *rel_area)
145 156
146static void hugetlb_allocate_area(void **alloc_area) 157static void hugetlb_allocate_area(void **alloc_area)
147{ 158{
159 void *area_alias = NULL;
160 char **alloc_area_alias;
148 *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, 161 *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
149 MAP_PRIVATE | MAP_HUGETLB, huge_fd, 162 (map_shared ? MAP_SHARED : MAP_PRIVATE) |
150 *alloc_area == area_src ? 0 : 163 MAP_HUGETLB,
151 nr_pages * page_size); 164 huge_fd, *alloc_area == area_src ? 0 :
165 nr_pages * page_size);
152 if (*alloc_area == MAP_FAILED) { 166 if (*alloc_area == MAP_FAILED) {
153 fprintf(stderr, "mmap of hugetlbfs file failed\n"); 167 fprintf(stderr, "mmap of hugetlbfs file failed\n");
154 *alloc_area = NULL; 168 *alloc_area = NULL;
155 } 169 }
156 170
157 if (*alloc_area == area_src) 171 if (map_shared) {
172 area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
173 MAP_SHARED | MAP_HUGETLB,
174 huge_fd, *alloc_area == area_src ? 0 :
175 nr_pages * page_size);
176 if (area_alias == MAP_FAILED) {
177 if (munmap(*alloc_area, nr_pages * page_size) < 0)
178 perror("hugetlb munmap"), exit(1);
179 *alloc_area = NULL;
180 return;
181 }
182 }
183 if (*alloc_area == area_src) {
158 huge_fd_off0 = *alloc_area; 184 huge_fd_off0 = *alloc_area;
185 alloc_area_alias = &area_src_alias;
186 } else {
187 alloc_area_alias = &area_dst_alias;
188 }
189 if (area_alias)
190 *alloc_area_alias = area_alias;
191}
192
193static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
194{
195 if (!map_shared)
196 return;
197 /*
198 * We can't zap just the pagetable with hugetlbfs because
199 * MADV_DONTNEED won't work. So exercise -EEXIST on an alias
200 * mapping where the pagetables are not established initially,
201 * this way we'll exercise the -EEXIST check at the fs level.
202 */
203 *start = (unsigned long) area_dst_alias + offset;
159} 204}
160 205
161/* Shared memory */ 206/* Shared memory */
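The point of the alias mapping added above is that two MAP_SHARED mappings of the same file reference the same pages while keeping separate page tables: populating one mapping leaves the alias unpopulated, so a later UFFDIO_COPY aimed at the alias finds the page already present in the file and fails with -EEXIST. A minimal sketch of such an alias pair (illustrative helper, shown with an ordinary shared fd; hugetlbfs additionally requires huge-page-aligned lengths):

/* Illustrative helper, not part of the patch. */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

static int alias_demo(int fd, size_t len)
{
	char *a = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	char *b = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	if (a == MAP_FAILED || b == MAP_FAILED)
		return -1;
	memset(a, 0x5a, len);			/* populate via the first mapping */
	printf("alias sees 0x%x\n", b[0]);	/* same pages, separate page tables */
	munmap(a, len);
	munmap(b, len);
	return 0;
}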
@@ -185,6 +230,7 @@ struct uffd_test_ops {
185 unsigned long expected_ioctls; 230 unsigned long expected_ioctls;
186 void (*allocate_area)(void **alloc_area); 231 void (*allocate_area)(void **alloc_area);
187 int (*release_pages)(char *rel_area); 232 int (*release_pages)(char *rel_area);
233 void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);
188}; 234};
189 235
190#define ANON_EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \ 236#define ANON_EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \
@@ -195,18 +241,21 @@ static struct uffd_test_ops anon_uffd_test_ops = {
195 .expected_ioctls = ANON_EXPECTED_IOCTLS, 241 .expected_ioctls = ANON_EXPECTED_IOCTLS,
196 .allocate_area = anon_allocate_area, 242 .allocate_area = anon_allocate_area,
197 .release_pages = anon_release_pages, 243 .release_pages = anon_release_pages,
244 .alias_mapping = noop_alias_mapping,
198}; 245};
199 246
200static struct uffd_test_ops shmem_uffd_test_ops = { 247static struct uffd_test_ops shmem_uffd_test_ops = {
201 .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC, 248 .expected_ioctls = ANON_EXPECTED_IOCTLS,
202 .allocate_area = shmem_allocate_area, 249 .allocate_area = shmem_allocate_area,
203 .release_pages = shmem_release_pages, 250 .release_pages = shmem_release_pages,
251 .alias_mapping = noop_alias_mapping,
204}; 252};
205 253
206static struct uffd_test_ops hugetlb_uffd_test_ops = { 254static struct uffd_test_ops hugetlb_uffd_test_ops = {
207 .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC, 255 .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC,
208 .allocate_area = hugetlb_allocate_area, 256 .allocate_area = hugetlb_allocate_area,
209 .release_pages = hugetlb_release_pages, 257 .release_pages = hugetlb_release_pages,
258 .alias_mapping = hugetlb_alias_mapping,
210}; 259};
211 260
212static struct uffd_test_ops *uffd_test_ops; 261static struct uffd_test_ops *uffd_test_ops;
@@ -331,6 +380,23 @@ static void *locking_thread(void *arg)
331 return NULL; 380 return NULL;
332} 381}
333 382
383static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
384 unsigned long offset)
385{
386 uffd_test_ops->alias_mapping(&uffdio_copy->dst,
387 uffdio_copy->len,
388 offset);
389 if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
390		/* real retval in uffdio_copy.copy */
391 if (uffdio_copy->copy != -EEXIST)
392 fprintf(stderr, "UFFDIO_COPY retry error %Ld\n",
393 uffdio_copy->copy), exit(1);
394 } else {
395 fprintf(stderr, "UFFDIO_COPY retry unexpected %Ld\n",
396 uffdio_copy->copy), exit(1);
397 }
398}
399
334static int copy_page(int ufd, unsigned long offset) 400static int copy_page(int ufd, unsigned long offset)
335{ 401{
336 struct uffdio_copy uffdio_copy; 402 struct uffdio_copy uffdio_copy;
@@ -351,8 +417,13 @@ static int copy_page(int ufd, unsigned long offset)
351 } else if (uffdio_copy.copy != page_size) { 417 } else if (uffdio_copy.copy != page_size) {
352 fprintf(stderr, "UFFDIO_COPY unexpected copy %Ld\n", 418 fprintf(stderr, "UFFDIO_COPY unexpected copy %Ld\n",
353 uffdio_copy.copy), exit(1); 419 uffdio_copy.copy), exit(1);
354 } else 420 } else {
421 if (test_uffdio_copy_eexist) {
422 test_uffdio_copy_eexist = false;
423 retry_copy_page(ufd, &uffdio_copy, offset);
424 }
355 return 1; 425 return 1;
426 }
356 return 0; 427 return 0;
357} 428}
358 429
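As the retry comment notes, UFFDIO_COPY reports its outcome in the copy field: the number of bytes copied on success, or a negative errno such as -EEXIST when the destination page is already mapped, even though the ioctl() itself may return -1. A hedged sketch of that calling convention (helper name is illustrative):

/* Illustrative helper, not part of the patch. */
#include <linux/userfaultfd.h>
#include <string.h>
#include <sys/ioctl.h>

static long long demo_uffdio_copy(int uffd, unsigned long dst,
				  unsigned long src, unsigned long len)
{
	struct uffdio_copy copy;

	memset(&copy, 0, sizeof(copy));
	copy.dst = dst;
	copy.src = src;
	copy.len = len;

	ioctl(uffd, UFFDIO_COPY, &copy);
	/* bytes copied on success, negative errno (e.g. -EEXIST) on failure */
	return copy.copy;
}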
@@ -408,6 +479,7 @@ static void *uffd_poll_thread(void *arg)
408 userfaults++; 479 userfaults++;
409 break; 480 break;
410 case UFFD_EVENT_FORK: 481 case UFFD_EVENT_FORK:
482 close(uffd);
411 uffd = msg.arg.fork.ufd; 483 uffd = msg.arg.fork.ufd;
412 pollfd[0].fd = uffd; 484 pollfd[0].fd = uffd;
413 break; 485 break;
@@ -572,6 +644,17 @@ static int userfaultfd_open(int features)
572 return 0; 644 return 0;
573} 645}
574 646
647sigjmp_buf jbuf, *sigbuf;
648
649static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
650{
651 if (sig == SIGBUS) {
652 if (sigbuf)
653 siglongjmp(*sigbuf, 1);
654 abort();
655 }
656}
657
575/* 658/*
576 * For non-cooperative userfaultfd test we fork() a process that will 659 * For non-cooperative userfaultfd test we fork() a process that will
577 * generate pagefaults, will mremap the area monitored by the 660 * generate pagefaults, will mremap the area monitored by the
@@ -585,19 +668,59 @@ static int userfaultfd_open(int features)
585 * The release of the pages currently generates event for shmem and 668 * The release of the pages currently generates event for shmem and
586 * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked 669 * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked
587 * for hugetlb. 670 * for hugetlb.
671 * For the signal test (UFFD_FEATURE_SIGBUS), signal_test = 1: register the
672 * monitored area, generate page faults and test that the signal is delivered.
673 * Use UFFDIO_COPY to allocate the missing page and retry. For signal_test = 2,
674 * test the robustness use case: release the monitored area, fork a process
675 * that will generate page faults, and verify that the signal is delivered.
676 * This also exercises UFFD_FEATURE_EVENT_FORK together with the signal
677 * feature; using the monitor thread, verify that no userfault events are generated.
588 */ 678 */
589static int faulting_process(void) 679static int faulting_process(int signal_test)
590{ 680{
591 unsigned long nr; 681 unsigned long nr;
592 unsigned long long count; 682 unsigned long long count;
593 unsigned long split_nr_pages; 683 unsigned long split_nr_pages;
684 unsigned long lastnr;
685 struct sigaction act;
686 unsigned long signalled = 0;
594 687
595 if (test_type != TEST_HUGETLB) 688 if (test_type != TEST_HUGETLB)
596 split_nr_pages = (nr_pages + 1) / 2; 689 split_nr_pages = (nr_pages + 1) / 2;
597 else 690 else
598 split_nr_pages = nr_pages; 691 split_nr_pages = nr_pages;
599 692
693 if (signal_test) {
694 sigbuf = &jbuf;
695 memset(&act, 0, sizeof(act));
696 act.sa_sigaction = sighndl;
697 act.sa_flags = SA_SIGINFO;
698 if (sigaction(SIGBUS, &act, 0)) {
699 perror("sigaction");
700 return 1;
701 }
702 lastnr = (unsigned long)-1;
703 }
704
600 for (nr = 0; nr < split_nr_pages; nr++) { 705 for (nr = 0; nr < split_nr_pages; nr++) {
706 if (signal_test) {
707 if (sigsetjmp(*sigbuf, 1) != 0) {
708 if (nr == lastnr) {
709 fprintf(stderr, "Signal repeated\n");
710 return 1;
711 }
712
713 lastnr = nr;
714 if (signal_test == 1) {
715 if (copy_page(uffd, nr * page_size))
716 signalled++;
717 } else {
718 signalled++;
719 continue;
720 }
721 }
722 }
723
601 count = *area_count(area_dst, nr); 724 count = *area_count(area_dst, nr);
602 if (count != count_verify[nr]) { 725 if (count != count_verify[nr]) {
603 fprintf(stderr, 726 fprintf(stderr,
@@ -607,6 +730,9 @@ static int faulting_process(void)
607 } 730 }
608 } 731 }
609 732
733 if (signal_test)
734 return signalled != split_nr_pages;
735
610 if (test_type == TEST_HUGETLB) 736 if (test_type == TEST_HUGETLB)
611 return 0; 737 return 0;
612 738
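userfaultfd_open() is not shown in this hunk; for the signal test it is assumed to negotiate UFFD_FEATURE_SIGBUS (and UFFD_FEATURE_EVENT_FORK) through UFFDIO_API, after which a fault on a missing page delivers SIGBUS to the faulting task instead of queueing a userfault message. A hedged sketch of that negotiation (helper name is illustrative):

/* Illustrative helper, not part of the patch. */
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static int open_uffd_sigbus(void)
{
	struct uffdio_api api = {
		.api = UFFD_API,
		.features = UFFD_FEATURE_SIGBUS | UFFD_FEATURE_EVENT_FORK,
	};
	int fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

	if (fd < 0)
		return -1;
	if (ioctl(fd, UFFDIO_API, &api) < 0) {	/* negotiate the features */
		close(fd);
		return -1;
	}
	return fd;
}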
@@ -636,6 +762,23 @@ static int faulting_process(void)
636 return 0; 762 return 0;
637} 763}
638 764
765static void retry_uffdio_zeropage(int ufd,
766 struct uffdio_zeropage *uffdio_zeropage,
767 unsigned long offset)
768{
769 uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start,
770 uffdio_zeropage->range.len,
771 offset);
772 if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) {
773 if (uffdio_zeropage->zeropage != -EEXIST)
774 fprintf(stderr, "UFFDIO_ZEROPAGE retry error %Ld\n",
775 uffdio_zeropage->zeropage), exit(1);
776 } else {
777 fprintf(stderr, "UFFDIO_ZEROPAGE retry unexpected %Ld\n",
778 uffdio_zeropage->zeropage), exit(1);
779 }
780}
781
639static int uffdio_zeropage(int ufd, unsigned long offset) 782static int uffdio_zeropage(int ufd, unsigned long offset)
640{ 783{
641 struct uffdio_zeropage uffdio_zeropage; 784 struct uffdio_zeropage uffdio_zeropage;
@@ -670,8 +813,14 @@ static int uffdio_zeropage(int ufd, unsigned long offset)
670 if (uffdio_zeropage.zeropage != page_size) { 813 if (uffdio_zeropage.zeropage != page_size) {
671 fprintf(stderr, "UFFDIO_ZEROPAGE unexpected %Ld\n", 814 fprintf(stderr, "UFFDIO_ZEROPAGE unexpected %Ld\n",
672 uffdio_zeropage.zeropage), exit(1); 815 uffdio_zeropage.zeropage), exit(1);
673 } else 816 } else {
817 if (test_uffdio_zeropage_eexist) {
818 test_uffdio_zeropage_eexist = false;
819 retry_uffdio_zeropage(ufd, &uffdio_zeropage,
820 offset);
821 }
674 return 1; 822 return 1;
823 }
675 } else { 824 } else {
676 fprintf(stderr, 825 fprintf(stderr,
677 "UFFDIO_ZEROPAGE succeeded %Ld\n", 826 "UFFDIO_ZEROPAGE succeeded %Ld\n",
@@ -761,7 +910,7 @@ static int userfaultfd_events_test(void)
761 perror("fork"), exit(1); 910 perror("fork"), exit(1);
762 911
763 if (!pid) 912 if (!pid)
764 return faulting_process(); 913 return faulting_process(0);
765 914
766 waitpid(pid, &err, 0); 915 waitpid(pid, &err, 0);
767 if (err) 916 if (err)
@@ -778,6 +927,72 @@ static int userfaultfd_events_test(void)
778 return userfaults != nr_pages; 927 return userfaults != nr_pages;
779} 928}
780 929
930static int userfaultfd_sig_test(void)
931{
932 struct uffdio_register uffdio_register;
933 unsigned long expected_ioctls;
934 unsigned long userfaults;
935 pthread_t uffd_mon;
936 int err, features;
937 pid_t pid;
938 char c;
939
940 printf("testing signal delivery: ");
941 fflush(stdout);
942
943 if (uffd_test_ops->release_pages(area_dst))
944 return 1;
945
946 features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS;
947 if (userfaultfd_open(features) < 0)
948 return 1;
949 fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
950
951 uffdio_register.range.start = (unsigned long) area_dst;
952 uffdio_register.range.len = nr_pages * page_size;
953 uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
954 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
955 fprintf(stderr, "register failure\n"), exit(1);
956
957 expected_ioctls = uffd_test_ops->expected_ioctls;
958 if ((uffdio_register.ioctls & expected_ioctls) !=
959 expected_ioctls)
960 fprintf(stderr,
961 "unexpected missing ioctl for anon memory\n"),
962 exit(1);
963
964 if (faulting_process(1))
965 fprintf(stderr, "faulting process failed\n"), exit(1);
966
967 if (uffd_test_ops->release_pages(area_dst))
968 return 1;
969
970 if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, NULL))
971 perror("uffd_poll_thread create"), exit(1);
972
973 pid = fork();
974 if (pid < 0)
975 perror("fork"), exit(1);
976
977 if (!pid)
978 exit(faulting_process(2));
979
980 waitpid(pid, &err, 0);
981 if (err)
982 fprintf(stderr, "faulting process failed\n"), exit(1);
983
984 if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
985 perror("pipe write"), exit(1);
986 if (pthread_join(uffd_mon, (void **)&userfaults))
987 return 1;
988
989 printf("done.\n");
990 if (userfaults)
991 fprintf(stderr, "Signal test failed, userfaults: %ld\n",
992 userfaults);
993 close(uffd);
994 return userfaults != 0;
995}
781static int userfaultfd_stress(void) 996static int userfaultfd_stress(void)
782{ 997{
783 void *area; 998 void *area;
@@ -879,6 +1094,15 @@ static int userfaultfd_stress(void)
879 return 1; 1094 return 1;
880 } 1095 }
881 1096
1097 if (area_dst_alias) {
1098 uffdio_register.range.start = (unsigned long)
1099 area_dst_alias;
1100 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
1101 fprintf(stderr, "register failure alias\n");
1102 return 1;
1103 }
1104 }
1105
882 /* 1106 /*
883 * The madvise done previously isn't enough: some 1107 * The madvise done previously isn't enough: some
884 * uffd_thread could have read userfaults (one of 1108 * uffd_thread could have read userfaults (one of
@@ -912,9 +1136,17 @@ static int userfaultfd_stress(void)
912 1136
913 /* unregister */ 1137 /* unregister */
914 if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) { 1138 if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) {
915 fprintf(stderr, "register failure\n"); 1139 fprintf(stderr, "unregister failure\n");
916 return 1; 1140 return 1;
917 } 1141 }
1142 if (area_dst_alias) {
1143 uffdio_register.range.start = (unsigned long) area_dst;
1144 if (ioctl(uffd, UFFDIO_UNREGISTER,
1145 &uffdio_register.range)) {
1146 fprintf(stderr, "unregister failure alias\n");
1147 return 1;
1148 }
1149 }
918 1150
919 /* verification */ 1151 /* verification */
920 if (bounces & BOUNCE_VERIFY) { 1152 if (bounces & BOUNCE_VERIFY) {
@@ -936,6 +1168,10 @@ static int userfaultfd_stress(void)
936 area_src = area_dst; 1168 area_src = area_dst;
937 area_dst = tmp_area; 1169 area_dst = tmp_area;
938 1170
1171 tmp_area = area_src_alias;
1172 area_src_alias = area_dst_alias;
1173 area_dst_alias = tmp_area;
1174
939 printf("userfaults:"); 1175 printf("userfaults:");
940 for (cpu = 0; cpu < nr_cpus; cpu++) 1176 for (cpu = 0; cpu < nr_cpus; cpu++)
941 printf(" %lu", userfaults[cpu]); 1177 printf(" %lu", userfaults[cpu]);
@@ -946,7 +1182,8 @@ static int userfaultfd_stress(void)
946 return err; 1182 return err;
947 1183
948 close(uffd); 1184 close(uffd);
949 return userfaultfd_zeropage_test() || userfaultfd_events_test(); 1185 return userfaultfd_zeropage_test() || userfaultfd_sig_test()
1186 || userfaultfd_events_test();
950} 1187}
951 1188
952/* 1189/*
@@ -981,7 +1218,12 @@ static void set_test_type(const char *type)
981 } else if (!strcmp(type, "hugetlb")) { 1218 } else if (!strcmp(type, "hugetlb")) {
982 test_type = TEST_HUGETLB; 1219 test_type = TEST_HUGETLB;
983 uffd_test_ops = &hugetlb_uffd_test_ops; 1220 uffd_test_ops = &hugetlb_uffd_test_ops;
1221 } else if (!strcmp(type, "hugetlb_shared")) {
1222 map_shared = true;
1223 test_type = TEST_HUGETLB;
1224 uffd_test_ops = &hugetlb_uffd_test_ops;
984 } else if (!strcmp(type, "shmem")) { 1225 } else if (!strcmp(type, "shmem")) {
1226 map_shared = true;
985 test_type = TEST_SHMEM; 1227 test_type = TEST_SHMEM;
986 uffd_test_ops = &shmem_uffd_test_ops; 1228 uffd_test_ops = &shmem_uffd_test_ops;
987 } else { 1229 } else {
@@ -1001,12 +1243,25 @@ static void set_test_type(const char *type)
1001 fprintf(stderr, "Impossible to run this test\n"), exit(2); 1243 fprintf(stderr, "Impossible to run this test\n"), exit(2);
1002} 1244}
1003 1245
1246static void sigalrm(int sig)
1247{
1248 if (sig != SIGALRM)
1249 abort();
1250 test_uffdio_copy_eexist = true;
1251 test_uffdio_zeropage_eexist = true;
1252 alarm(ALARM_INTERVAL_SECS);
1253}
1254
1004int main(int argc, char **argv) 1255int main(int argc, char **argv)
1005{ 1256{
1006 if (argc < 4) 1257 if (argc < 4)
1007 fprintf(stderr, "Usage: <test type> <MiB> <bounces> [hugetlbfs_file]\n"), 1258 fprintf(stderr, "Usage: <test type> <MiB> <bounces> [hugetlbfs_file]\n"),
1008 exit(1); 1259 exit(1);
1009 1260
1261 if (signal(SIGALRM, sigalrm) == SIG_ERR)
1262 fprintf(stderr, "failed to arm SIGALRM"), exit(1);
1263 alarm(ALARM_INTERVAL_SECS);
1264
1010 set_test_type(argv[1]); 1265 set_test_type(argv[1]);
1011 1266
1012 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); 1267 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);