author	Linus Torvalds <torvalds@linux-foundation.org>	2016-01-17 15:58:52 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-01-17 15:58:52 -0500
commit	0cbeafb245ca568bc0765645aa64f0451b716657 (patch)
tree	663c09ff5a62a1b2b66a17c4dfe0413603530a36
parent	58cf279acac3080ce03eeea5ca268210b3165fe1 (diff)
parent	06b031de22d28ae76b2e5bfaf22c56a265a1e106 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge second patch-bomb from Andrew Morton:

 - more MM stuff:

    - Kirill's page-flags rework

    - Kirill's now-allegedly-fixed THP rework

    - MADV_FREE implementation

    - DAX feature work (msync/fsync).  This isn't quite complete but DAX
      is new and it's good enough and the guys have a handle on what
      needs to be done - I expect this to be wrapped in the next week or
      two.

 - some vsprintf maintenance work

 - various other misc bits

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (145 commits)
  printk: change recursion_bug type to bool
  lib/vsprintf: factor out %pN[F] handler as netdev_bits()
  lib/vsprintf: refactor duplicate code to special_hex_number()
  printk-formats.txt: remove unimplemented %pT
  printk: help pr_debug and pr_devel to optimize out arguments
  lib/test_printf.c: test dentry printing
  lib/test_printf.c: add test for large bitmaps
  lib/test_printf.c: account for kvasprintf tests
  lib/test_printf.c: add a few number() tests
  lib/test_printf.c: test precision quirks
  lib/test_printf.c: check for out-of-bound writes
  lib/test_printf.c: don't BUG
  lib/kasprintf.c: add sanity check to kvasprintf
  lib/vsprintf.c: warn about too large precisions and field widths
  lib/vsprintf.c: help gcc make number() smaller
  lib/vsprintf.c: expand field_width to 24 bits
  lib/vsprintf.c: eliminate potential race in string()
  lib/vsprintf.c: move string() below widen_string()
  lib/vsprintf.c: pull out padding code from dentry_name()
  printk: do cond_resched() between lines while outputting to consoles
  ...
-rw-r--r--Documentation/features/vm/pmdp_splitting_flush/arch-support.txt40
-rw-r--r--Documentation/printk-formats.txt9
-rw-r--r--Documentation/vm/transhuge.txt151
-rw-r--r--arch/alpha/include/uapi/asm/mman.h2
-rw-r--r--arch/arc/Kconfig3
-rw-r--r--arch/arc/mm/cache.c4
-rw-r--r--arch/arm/Kconfig5
-rw-r--r--arch/arm/include/asm/kvm_mmu.h5
-rw-r--r--arch/arm/include/asm/pgtable-3level.h10
-rw-r--r--arch/arm/kvm/mmu.c10
-rw-r--r--arch/arm/lib/uaccess_with_memcpy.c5
-rw-r--r--arch/arm/mm/flush.c17
-rw-r--r--arch/arm64/include/asm/kvm_mmu.h3
-rw-r--r--arch/arm64/include/asm/pgtable.h9
-rw-r--r--arch/arm64/mm/flush.c16
-rw-r--r--arch/avr32/include/asm/page.h8
-rw-r--r--arch/frv/include/asm/page.h2
-rw-r--r--arch/ia64/include/asm/page.h1
-rw-r--r--arch/metag/Kconfig3
-rw-r--r--arch/microblaze/Kconfig3
-rw-r--r--arch/mips/include/asm/kvm_host.h6
-rw-r--r--arch/mips/include/asm/pgtable-bits.h10
-rw-r--r--arch/mips/include/asm/pgtable.h18
-rw-r--r--arch/mips/include/uapi/asm/mman.h2
-rw-r--r--arch/mips/kvm/emulate.c2
-rw-r--r--arch/mips/kvm/tlb.c14
-rw-r--r--arch/mips/mm/c-r4k.c3
-rw-r--r--arch/mips/mm/cache.c2
-rw-r--r--arch/mips/mm/gup.c17
-rw-r--r--arch/mips/mm/init.c6
-rw-r--r--arch/mips/mm/pgtable-64.c14
-rw-r--r--arch/mips/mm/tlbex.c1
-rw-r--r--arch/mn10300/include/asm/page.h1
-rw-r--r--arch/parisc/Kconfig3
-rw-r--r--arch/parisc/include/uapi/asm/mman.h2
-rw-r--r--arch/powerpc/Kconfig3
-rw-r--r--arch/powerpc/include/asm/book3s/64/hash-64k.h12
-rw-r--r--arch/powerpc/include/asm/book3s/64/hash.h10
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgtable.h6
-rw-r--r--arch/powerpc/include/asm/kvm_book3s.h4
-rw-r--r--arch/powerpc/include/asm/kvm_ppc.h2
-rw-r--r--arch/powerpc/kvm/book3s.c6
-rw-r--r--arch/powerpc/kvm/book3s_32_mmu_host.c2
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_host.c2
-rw-r--r--arch/powerpc/kvm/e500.h2
-rw-r--r--arch/powerpc/kvm/e500_mmu_host.c8
-rw-r--r--arch/powerpc/kvm/trace_pr.h2
-rw-r--r--arch/powerpc/mm/hugepage-hash64.c3
-rw-r--r--arch/powerpc/mm/hugetlbpage.c17
-rw-r--r--arch/powerpc/mm/pgtable_64.c49
-rw-r--r--arch/powerpc/mm/subpage-prot.c2
-rw-r--r--arch/powerpc/sysdev/axonram.c9
-rw-r--r--arch/s390/Kconfig3
-rw-r--r--arch/s390/include/asm/pgtable.h16
-rw-r--r--arch/s390/mm/gup.c24
-rw-r--r--arch/s390/mm/pgtable.c47
-rw-r--r--arch/sh/Kconfig3
-rw-r--r--arch/sh/mm/cache-sh4.c2
-rw-r--r--arch/sh/mm/cache.c8
-rw-r--r--arch/sparc/Kconfig4
-rw-r--r--arch/sparc/include/asm/pgtable_64.h19
-rw-r--r--arch/sparc/mm/fault_64.c3
-rw-r--r--arch/sparc/mm/gup.c16
-rw-r--r--arch/tile/include/asm/pgtable.h10
-rw-r--r--arch/um/include/asm/page.h7
-rw-r--r--arch/um/include/asm/pgtable-3level.h4
-rw-r--r--arch/um/include/asm/pgtable.h2
-rw-r--r--arch/unicore32/Kconfig3
-rw-r--r--arch/x86/Kconfig3
-rw-r--r--arch/x86/include/asm/pgtable.h40
-rw-r--r--arch/x86/include/asm/pgtable_types.h9
-rw-r--r--arch/x86/include/asm/pmem.h7
-rw-r--r--arch/x86/kernel/vm86_32.c6
-rw-r--r--arch/x86/kvm/iommu.c11
-rw-r--r--arch/x86/kvm/mmu.c37
-rw-r--r--arch/x86/kvm/mmu_audit.c2
-rw-r--r--arch/x86/kvm/paging_tmpl.h6
-rw-r--r--arch/x86/kvm/vmx.c2
-rw-r--r--arch/x86/kvm/x86.c2
-rw-r--r--arch/x86/mm/gup.c74
-rw-r--r--arch/x86/mm/init_64.c33
-rw-r--r--arch/x86/mm/pat.c5
-rw-r--r--arch/x86/mm/pgtable.c13
-rw-r--r--arch/xtensa/include/uapi/asm/mman.h2
-rw-r--r--arch/xtensa/mm/tlb.c2
-rw-r--r--drivers/base/memory.c13
-rw-r--r--drivers/block/brd.c7
-rw-r--r--drivers/block/zram/zram_drv.c7
-rw-r--r--drivers/gpu/drm/exynos/exynos_drm_gem.c4
-rw-r--r--drivers/gpu/drm/gma500/framebuffer.c4
-rw-r--r--drivers/gpu/drm/msm/msm_gem.c4
-rw-r--r--drivers/gpu/drm/omapdrm/omap_gem.c7
-rw-r--r--drivers/gpu/drm/ttm/ttm_bo_vm.c4
-rw-r--r--drivers/iio/industrialio-core.c9
-rw-r--r--drivers/net/wireless/intel/iwlwifi/dvm/calib.c2
-rw-r--r--drivers/nvdimm/pfn_devs.c3
-rw-r--r--drivers/nvdimm/pmem.c73
-rw-r--r--drivers/s390/block/dcssblk.c11
-rw-r--r--fs/Kconfig3
-rw-r--r--fs/block_dev.c15
-rw-r--r--fs/cifs/file.c8
-rw-r--r--fs/dax.c301
-rw-r--r--fs/fs-writeback.c2
-rw-r--r--fs/hugetlbfs/inode.c143
-rw-r--r--fs/proc/page.c4
-rw-r--r--fs/proc/task_mmu.c55
-rw-r--r--fs/stat.c2
-rw-r--r--include/asm-generic/pgtable.h15
-rw-r--r--include/asm-generic/sections.h65
-rw-r--r--include/linux/blkdev.h20
-rw-r--r--include/linux/console.h1
-rw-r--r--include/linux/err.h2
-rw-r--r--include/linux/huge_mm.h79
-rw-r--r--include/linux/hugetlb.h1
-rw-r--r--include/linux/io.h15
-rw-r--r--include/linux/kdev_t.h5
-rw-r--r--include/linux/kernel.h36
-rw-r--r--include/linux/kvm_host.h37
-rw-r--r--include/linux/kvm_types.h2
-rw-r--r--include/linux/list.h11
-rw-r--r--include/linux/memblock.h18
-rw-r--r--include/linux/memcontrol.h16
-rw-r--r--include/linux/memory_hotplug.h3
-rw-r--r--include/linux/memremap.h114
-rw-r--r--include/linux/mm.h199
-rw-r--r--include/linux/mm_types.h25
-rw-r--r--include/linux/mmdebug.h6
-rw-r--r--include/linux/page-flags.h286
-rw-r--r--include/linux/pagemap.h38
-rw-r--r--include/linux/pfn.h9
-rw-r--r--include/linux/pfn_t.h102
-rw-r--r--include/linux/poison.h6
-rw-r--r--include/linux/printk.h12
-rw-r--r--include/linux/rmap.h37
-rw-r--r--include/linux/swap.h4
-rw-r--r--include/linux/vm_event_item.h5
-rw-r--r--include/trace/events/huge_memory.h1
-rw-r--r--include/uapi/asm-generic/mman-common.h1
-rw-r--r--init/Kconfig2
-rw-r--r--kernel/events/uprobes.c11
-rw-r--r--kernel/futex.c65
-rw-r--r--kernel/memremap.c219
-rw-r--r--kernel/panic.c3
-rw-r--r--kernel/printk/printk.c67
-rw-r--r--kernel/stop_machine.c4
-rw-r--r--lib/Kconfig.debug9
-rw-r--r--lib/kasprintf.c10
-rw-r--r--lib/list_debug.c9
-rw-r--r--lib/test_printf.c121
-rw-r--r--lib/vsprintf.c252
-rw-r--r--mm/debug.c8
-rw-r--r--mm/filemap.c25
-rw-r--r--mm/gup.c172
-rw-r--r--mm/huge_memory.c1506
-rw-r--r--mm/hugetlb.c12
-rw-r--r--mm/internal.h70
-rw-r--r--mm/ksm.c69
-rw-r--r--mm/madvise.c201
-rw-r--r--mm/memcontrol.c106
-rw-r--r--mm/memory-failure.c125
-rw-r--r--mm/memory.c101
-rw-r--r--mm/memory_hotplug.c67
-rw-r--r--mm/mempolicy.c45
-rw-r--r--mm/migrate.c21
-rw-r--r--mm/mincore.c2
-rw-r--r--mm/mlock.c27
-rw-r--r--mm/mmap.c25
-rw-r--r--mm/mprotect.c7
-rw-r--r--mm/mremap.c15
-rw-r--r--mm/page_alloc.c47
-rw-r--r--mm/page_idle.c27
-rw-r--r--mm/page_isolation.c6
-rw-r--r--mm/pagewalk.c2
-rw-r--r--mm/pgtable-generic.c14
-rw-r--r--mm/rmap.c369
-rw-r--r--mm/shmem.c25
-rw-r--r--mm/slub.c2
-rw-r--r--mm/sparse-vmemmap.c76
-rw-r--r--mm/sparse.c8
-rw-r--r--mm/swap.c319
-rw-r--r--mm/swap_state.c9
-rw-r--r--mm/swapfile.c34
-rw-r--r--mm/userfaultfd.c8
-rw-r--r--mm/util.c24
-rw-r--r--mm/vmalloc.c4
-rw-r--r--mm/vmscan.c16
-rw-r--r--mm/vmstat.c5
-rwxr-xr-xscripts/tags.sh2
-rw-r--r--virt/kvm/kvm_main.c47
189 files changed, 4357 insertions, 2886 deletions
diff --git a/Documentation/features/vm/pmdp_splitting_flush/arch-support.txt b/Documentation/features/vm/pmdp_splitting_flush/arch-support.txt
deleted file mode 100644
index 26f74b457e0b..000000000000
--- a/Documentation/features/vm/pmdp_splitting_flush/arch-support.txt
+++ /dev/null
@@ -1,40 +0,0 @@
1#
2# Feature name: pmdp_splitting_flush
3# Kconfig: __HAVE_ARCH_PMDP_SPLITTING_FLUSH
4# description: arch supports the pmdp_splitting_flush() VM API
5#
6 -----------------------
7 | arch |status|
8 -----------------------
9 | alpha: | TODO |
10 | arc: | TODO |
11 | arm: | ok |
12 | arm64: | ok |
13 | avr32: | TODO |
14 | blackfin: | TODO |
15 | c6x: | TODO |
16 | cris: | TODO |
17 | frv: | TODO |
18 | h8300: | TODO |
19 | hexagon: | TODO |
20 | ia64: | TODO |
21 | m32r: | TODO |
22 | m68k: | TODO |
23 | metag: | TODO |
24 | microblaze: | TODO |
25 | mips: | ok |
26 | mn10300: | TODO |
27 | nios2: | TODO |
28 | openrisc: | TODO |
29 | parisc: | TODO |
30 | powerpc: | ok |
31 | s390: | ok |
32 | score: | TODO |
33 | sh: | TODO |
34 | sparc: | TODO |
35 | tile: | TODO |
36 | um: | TODO |
37 | unicore32: | TODO |
38 | x86: | ok |
39 | xtensa: | TODO |
40 -----------------------
diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt
index 6389551bbad6..5d1128bf0282 100644
--- a/Documentation/printk-formats.txt
+++ b/Documentation/printk-formats.txt
@@ -306,15 +306,6 @@ Network device features:
306 306
307 Passed by reference. 307 Passed by reference.
308 308
309Command from struct task_struct
310
311 %pT ls
312
313 For printing executable name excluding path from struct
314 task_struct.
315
316 Passed by reference.
317
318If you add other %p extensions, please extend lib/test_printf.c with 309If you add other %p extensions, please extend lib/test_printf.c with
319one or more test cases, if at all feasible. 310one or more test cases, if at all feasible.
320 311
diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
index 8a282687ee06..21cf34f3ddb2 100644
--- a/Documentation/vm/transhuge.txt
+++ b/Documentation/vm/transhuge.txt
@@ -35,10 +35,10 @@ miss is going to run faster.
35 35
36== Design == 36== Design ==
37 37
38- "graceful fallback": mm components which don't have transparent 38- "graceful fallback": mm components which don't have transparent hugepage
39 hugepage knowledge fall back to breaking a transparent hugepage and 39 knowledge fall back to breaking huge pmd mapping into table of ptes and,
40 working on the regular pages and their respective regular pmd/pte 40 if necessary, split a transparent hugepage. Therefore these components
41 mappings 41 can continue working on the regular pages or regular pte mappings.
42 42
43- if a hugepage allocation fails because of memory fragmentation, 43- if a hugepage allocation fails because of memory fragmentation,
44 regular pages should be gracefully allocated instead and mixed in 44 regular pages should be gracefully allocated instead and mixed in
@@ -221,9 +221,18 @@ thp_collapse_alloc_failed is incremented if khugepaged found a range
221 of pages that should be collapsed into one huge page but failed 221 of pages that should be collapsed into one huge page but failed
222 the allocation. 222 the allocation.
223 223
224thp_split is incremented every time a huge page is split into base 224thp_split_page is incremented every time a huge page is split into base
225 pages. This can happen for a variety of reasons but a common 225 pages. This can happen for a variety of reasons but a common
226 reason is that a huge page is old and is being reclaimed. 226 reason is that a huge page is old and is being reclaimed.
 227 This action implies splitting all PMDs the page is mapped with.
228
 229thp_split_page_failed is incremented if the kernel fails to split a huge
230 page. This can happen if the page was pinned by somebody.
231
 232thp_split_pmd is incremented every time a PMD is split into a table of PTEs.
 233 This can happen, for instance, when an application calls mprotect() or
 234 munmap() on part of a huge page. It doesn't split the huge page, only the
235 page table entry.
227 236
228thp_zero_page_alloc is incremented every time a huge zero page is 237thp_zero_page_alloc is incremented every time a huge zero page is
 229 successfully allocated. It includes allocations which were 238 successfully allocated. It includes allocations which were
@@ -274,10 +283,8 @@ is complete, so they won't ever notice the fact the page is huge. But
274if any driver is going to mangle over the page structure of the tail 283if any driver is going to mangle over the page structure of the tail
275page (like for checking page->mapping or other bits that are relevant 284page (like for checking page->mapping or other bits that are relevant
276for the head page and not the tail page), it should be updated to jump 285for the head page and not the tail page), it should be updated to jump
277to check head page instead (while serializing properly against 286to check head page instead. Taking reference on any head/tail page would
278split_huge_page() to avoid the head and tail pages to disappear from 287prevent page from being split by anyone.
279under it, see the futex code to see an example of that, hugetlbfs also
280needed special handling in futex code for similar reasons).
281 288
282NOTE: these aren't new constraints to the GUP API, and they match the 289NOTE: these aren't new constraints to the GUP API, and they match the
 283same constraints that apply to hugetlbfs too, so any driver capable 290same constraints that apply to hugetlbfs too, so any driver capable
@@ -312,9 +319,9 @@ unaffected. libhugetlbfs will also work fine as usual.
312== Graceful fallback == 319== Graceful fallback ==
313 320
 314Code walking pagetables but unaware about huge pmds can simply call 321Code walking pagetables but unaware about huge pmds can simply call
315split_huge_page_pmd(vma, addr, pmd) where the pmd is the one returned by 322split_huge_pmd(vma, pmd, addr) where the pmd is the one returned by
316pmd_offset. It's trivial to make the code transparent hugepage aware 323pmd_offset. It's trivial to make the code transparent hugepage aware
317by just grepping for "pmd_offset" and adding split_huge_page_pmd where 324by just grepping for "pmd_offset" and adding split_huge_pmd where
318missing after pmd_offset returns the pmd. Thanks to the graceful 325missing after pmd_offset returns the pmd. Thanks to the graceful
319fallback design, with a one liner change, you can avoid to write 326fallback design, with a one liner change, you can avoid to write
320hundred if not thousand of lines of complex code to make your code 327hundred if not thousand of lines of complex code to make your code
@@ -323,7 +330,8 @@ hugepage aware.
323If you're not walking pagetables but you run into a physical hugepage 330If you're not walking pagetables but you run into a physical hugepage
324but you can't handle it natively in your code, you can split it by 331but you can't handle it natively in your code, you can split it by
325calling split_huge_page(page). This is what the Linux VM does before 332calling split_huge_page(page). This is what the Linux VM does before
326it tries to swapout the hugepage for example. 333it tries to swapout the hugepage for example. split_huge_page() can fail
334if the page is pinned and you must handle this correctly.
327 335
328Example to make mremap.c transparent hugepage aware with a one liner 336Example to make mremap.c transparent hugepage aware with a one liner
329change: 337change:
@@ -335,14 +343,14 @@ diff --git a/mm/mremap.c b/mm/mremap.c
335 return NULL; 343 return NULL;
336 344
337 pmd = pmd_offset(pud, addr); 345 pmd = pmd_offset(pud, addr);
338+ split_huge_page_pmd(vma, addr, pmd); 346+ split_huge_pmd(vma, pmd, addr);
339 if (pmd_none_or_clear_bad(pmd)) 347 if (pmd_none_or_clear_bad(pmd))
340 return NULL; 348 return NULL;
341 349
342== Locking in hugepage aware code == 350== Locking in hugepage aware code ==
343 351
344We want as much code as possible hugepage aware, as calling 352We want as much code as possible hugepage aware, as calling
345split_huge_page() or split_huge_page_pmd() has a cost. 353split_huge_page() or split_huge_pmd() has a cost.
346 354
347To make pagetable walks huge pmd aware, all you need to do is to call 355To make pagetable walks huge pmd aware, all you need to do is to call
348pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the 356pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the
@@ -351,47 +359,80 @@ created from under you by khugepaged (khugepaged collapse_huge_page
351takes the mmap_sem in write mode in addition to the anon_vma lock). If 359takes the mmap_sem in write mode in addition to the anon_vma lock). If
352pmd_trans_huge returns false, you just fallback in the old code 360pmd_trans_huge returns false, you just fallback in the old code
353paths. If instead pmd_trans_huge returns true, you have to take the 361paths. If instead pmd_trans_huge returns true, you have to take the
354mm->page_table_lock and re-run pmd_trans_huge. Taking the 362page table lock (pmd_lock()) and re-run pmd_trans_huge. Taking the
355page_table_lock will prevent the huge pmd to be converted into a 363page table lock will prevent the huge pmd to be converted into a
356regular pmd from under you (split_huge_page can run in parallel to the 364regular pmd from under you (split_huge_pmd can run in parallel to the
357pagetable walk). If the second pmd_trans_huge returns false, you 365pagetable walk). If the second pmd_trans_huge returns false, you
358should just drop the page_table_lock and fallback to the old code as 366should just drop the page table lock and fallback to the old code as
359before. Otherwise you should run pmd_trans_splitting on the pmd. In 367before. Otherwise you can proceed to process the huge pmd and the
360case pmd_trans_splitting returns true, it means split_huge_page is 368hugepage natively. Once finished you can drop the page table lock.
361already in the middle of splitting the page. So if pmd_trans_splitting 369
362returns true it's enough to drop the page_table_lock and call 370== Refcounts and transparent huge pages ==
363wait_split_huge_page and then fallback the old code paths. You are 371
364guaranteed by the time wait_split_huge_page returns, the pmd isn't 372Refcounting on THP is mostly consistent with refcounting on other compound
365huge anymore. If pmd_trans_splitting returns false, you can proceed to 373pages:
366process the huge pmd and the hugepage natively. Once finished you can 374
 367 drop the page_table_lock. 375 - get_page()/put_page() and GUP operate on the head page's ->_count.
368 376
369== compound_lock, get_user_pages and put_page == 377 - ->_count in tail pages is always zero: get_page_unless_zero() never
 378 succeeds on tail pages.
379
380 - map/unmap of the pages with PTE entry increment/decrement ->_mapcount
381 on relevant sub-page of the compound page.
382
383 - map/unmap of the whole compound page accounted in compound_mapcount
384 (stored in first tail page).
385
386PageDoubleMap() indicates that ->_mapcount in all subpages is offset up by one.
387This additional reference is required to get race-free detection of unmap of
388subpages when we have them mapped with both PMDs and PTEs.
389
 390This optimization is required to lower the overhead of per-subpage mapcount
 391tracking. The alternative is to alter ->_mapcount in all subpages on each
392map/unmap of the whole compound page.
393
394We set PG_double_map when a PMD of the page got split for the first time,
 395but still have a PMD mapping. The additional references go away with the last
396compound_mapcount.
370 397
371split_huge_page internally has to distribute the refcounts in the head 398split_huge_page internally has to distribute the refcounts in the head
372page to the tail pages before clearing all PG_head/tail bits from the 399page to the tail pages before clearing all PG_head/tail bits from the page
373page structures. It can do that easily for refcounts taken by huge pmd 400structures. It can be done easily for refcounts taken by page table
374mappings. But the GUI API as created by hugetlbfs (that returns head 401entries. But we don't have enough information on how to distribute any
375and tail pages if running get_user_pages on an address backed by any 402additional pins (i.e. from get_user_pages). split_huge_page() fails any
376hugepage), requires the refcount to be accounted on the tail pages and 403requests to split pinned huge page: it expects page count to be equal to
377not only in the head pages, if we want to be able to run 404sum of mapcount of all sub-pages plus one (split_huge_page caller must
378split_huge_page while there are gup pins established on any tail 405have reference for head page).
379page. Failure to be able to run split_huge_page if there's any gup pin 406
380on any tail page, would mean having to split all hugepages upfront in 407split_huge_page uses migration entries to stabilize page->_count and
381get_user_pages which is unacceptable as too many gup users are 408page->_mapcount.
382performance critical and they must work natively on hugepages like 409
383they work natively on hugetlbfs already (hugetlbfs is simpler because 410We safe against physical memory scanners too: the only legitimate way
384hugetlbfs pages cannot be split so there wouldn't be requirement of 411scanner can get reference to a page is get_page_unless_zero().
385accounting the pins on the tail pages for hugetlbfs). If we wouldn't 412
386account the gup refcounts on the tail pages during gup, we won't know 413All tail pages has zero ->_count until atomic_add(). It prevent scanner
387anymore which tail page is pinned by gup and which is not while we run 414from geting reference to tail page up to the point. After the atomic_add()
388split_huge_page. But we still have to add the gup pin to the head page 415we don't care about ->_count value. We already known how many references
389too, to know when we can free the compound page in case it's never 416with should uncharge from head page.
390split during its lifetime. That requires changing not just 417
391get_page, but put_page as well so that when put_page runs on a tail 418For head page get_page_unless_zero() will succeed and we don't mind. It's
392page (and only on a tail page) it will find its respective head page, 419clear where reference should go after split: it will stay on head page.
393and then it will decrease the head page refcount in addition to the 420
394tail page refcount. To obtain a head page reliably and to decrease its 421Note that split_huge_pmd() doesn't have any limitation on refcounting:
395refcount without race conditions, put_page has to serialize against 422pmd can be split at any point and never fails.
396__split_huge_page_refcount using a special per-page lock called 423
397compound_lock. 424== Partial unmap and deferred_split_huge_page() ==
425
 426Unmapping part of a THP (with munmap() or another way) is not going to free
 427memory immediately. Instead, we detect that a subpage of the THP is not in use
428in page_remove_rmap() and queue the THP for splitting if memory pressure
429comes. Splitting will free up unused subpages.
430
431Splitting the page right away is not an option due to locking context in
 432the place where we can detect partial unmap. It also might be
 433counterproductive since in many cases partial unmap happens during
 434exit(2) if a THP crosses a VMA boundary.
435
436Function deferred_split_huge_page() is used to queue page for splitting.
437The splitting itself will happen when we get memory pressure via shrinker
438interface.
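For reference, the pagetable-walk pattern described in the updated "Graceful fallback" and "Locking in hugepage aware code" sections above can be sketched roughly as follows. This is illustrative only, assuming the 4.5-era three-level pgd/pud/pmd walk and a caller that already holds mmap_sem; the function name walk_one_address() is made up for the example, while pmd_trans_huge(), pmd_lock() and split_huge_pmd() are the helpers the document itself refers to.

	static void walk_one_address(struct vm_area_struct *vma, unsigned long addr)
	{
		struct mm_struct *mm = vma->vm_mm;
		spinlock_t *ptl;
		pgd_t *pgd;
		pud_t *pud;
		pmd_t *pmd;

		pgd = pgd_offset(mm, addr);
		if (pgd_none_or_clear_bad(pgd))
			return;
		pud = pud_offset(pgd, addr);
		if (pud_none_or_clear_bad(pud))
			return;
		pmd = pmd_offset(pud, addr);

		if (pmd_trans_huge(*pmd)) {
			ptl = pmd_lock(mm, pmd);
			if (pmd_trans_huge(*pmd)) {
				/* handle the huge pmd natively here ... */
				spin_unlock(ptl);
				return;
			}
			/* raced with a split: fall through to the pte path */
			spin_unlock(ptl);
		}

		/* graceful fallback: split the pmd, then work on regular ptes */
		split_huge_pmd(vma, pmd, addr);
		if (pmd_none_or_clear_bad(pmd))
			return;
		/* ... walk the pte table, e.g. under pte_offset_map_lock() ... */
	}

split_huge_page(page), by contrast, can fail when the page is pinned (see the hunk above), so code that splits whole pages rather than pmds must check its return value.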
diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
index f2f949671798..ab336c06153e 100644
--- a/arch/alpha/include/uapi/asm/mman.h
+++ b/arch/alpha/include/uapi/asm/mman.h
@@ -47,8 +47,10 @@
47#define MADV_WILLNEED 3 /* will need these pages */ 47#define MADV_WILLNEED 3 /* will need these pages */
48#define MADV_SPACEAVAIL 5 /* ensure resources are available */ 48#define MADV_SPACEAVAIL 5 /* ensure resources are available */
49#define MADV_DONTNEED 6 /* don't need these pages */ 49#define MADV_DONTNEED 6 /* don't need these pages */
50#define MADV_FREE 7 /* free pages only if memory pressure */
50 51
51/* common/generic parameters */ 52/* common/generic parameters */
53#define MADV_FREE 8 /* free pages only if memory pressure */
52#define MADV_REMOVE 9 /* remove these pages & resources */ 54#define MADV_REMOVE 9 /* remove these pages & resources */
53#define MADV_DONTFORK 10 /* don't inherit across fork */ 55#define MADV_DONTFORK 10 /* don't inherit across fork */
54#define MADV_DOFORK 11 /* do inherit across fork */ 56#define MADV_DOFORK 11 /* do inherit across fork */
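The MADV_FREE definitions being added to the per-arch mman.h headers (and to asm-generic/mman-common.h elsewhere in this merge) are consumed through the ordinary madvise(2) interface. A minimal, hypothetical userspace sketch, assuming a libc that already exposes MADV_FREE (otherwise the value comes from the uapi header) and 4 KiB pages:

	#include <string.h>
	#include <sys/mman.h>

	int main(void)
	{
		size_t len = 64 * 4096;
		char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (buf == MAP_FAILED)
			return 1;

		memset(buf, 0xaa, len);	/* touch the pages */

		/*
		 * Done with the contents: MADV_FREE lets the kernel reclaim
		 * these pages lazily under memory pressure instead of freeing
		 * them right away (kernels without MADV_FREE return EINVAL).
		 */
		if (madvise(buf, len, MADV_FREE) != 0)
			return 1;

		return 0;
	}

Unlike MADV_DONTNEED, the data stays intact until the kernel actually needs the memory, and a later write to a page takes it back out of the lazy-free state.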
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index 6312f607932f..76dde9db7934 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -73,9 +73,6 @@ config STACKTRACE_SUPPORT
73 def_bool y 73 def_bool y
74 select STACKTRACE 74 select STACKTRACE
75 75
76config HAVE_LATENCYTOP_SUPPORT
77 def_bool y
78
79config HAVE_ARCH_TRANSPARENT_HUGEPAGE 76config HAVE_ARCH_TRANSPARENT_HUGEPAGE
80 def_bool y 77 def_bool y
81 depends on ARC_MMU_V4 78 depends on ARC_MMU_V4
diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c
index ff7ff6cbb811..b65f797e9ad6 100644
--- a/arch/arc/mm/cache.c
+++ b/arch/arc/mm/cache.c
@@ -617,7 +617,7 @@ void flush_dcache_page(struct page *page)
617 */ 617 */
618 if (!mapping_mapped(mapping)) { 618 if (!mapping_mapped(mapping)) {
619 clear_bit(PG_dc_clean, &page->flags); 619 clear_bit(PG_dc_clean, &page->flags);
620 } else if (page_mapped(page)) { 620 } else if (page_mapcount(page)) {
621 621
622 /* kernel reading from page with U-mapping */ 622 /* kernel reading from page with U-mapping */
623 phys_addr_t paddr = (unsigned long)page_address(page); 623 phys_addr_t paddr = (unsigned long)page_address(page);
@@ -857,7 +857,7 @@ void copy_user_highpage(struct page *to, struct page *from,
857 * For !VIPT cache, all of this gets compiled out as 857 * For !VIPT cache, all of this gets compiled out as
858 * addr_not_cache_congruent() is 0 858 * addr_not_cache_congruent() is 0
859 */ 859 */
860 if (page_mapped(from) && addr_not_cache_congruent(kfrom, u_vaddr)) { 860 if (page_mapcount(from) && addr_not_cache_congruent(kfrom, u_vaddr)) {
861 __flush_dcache_page((unsigned long)kfrom, u_vaddr); 861 __flush_dcache_page((unsigned long)kfrom, u_vaddr);
862 clean_src_k_mappings = 1; 862 clean_src_k_mappings = 1;
863 } 863 }
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 4e489cc5c45e..6a889afa6a2c 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -168,11 +168,6 @@ config STACKTRACE_SUPPORT
168 bool 168 bool
169 default y 169 default y
170 170
171config HAVE_LATENCYTOP_SUPPORT
172 bool
173 depends on !SMP
174 default y
175
176config LOCKDEP_SUPPORT 171config LOCKDEP_SUPPORT
177 bool 172 bool
178 default y 173 default y
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 9203c21b4673..a520b7987a29 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -182,7 +182,8 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
182 return (vcpu->arch.cp15[c1_SCTLR] & 0b101) == 0b101; 182 return (vcpu->arch.cp15[c1_SCTLR] & 0b101) == 0b101;
183} 183}
184 184
185static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu, pfn_t pfn, 185static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu,
186 kvm_pfn_t pfn,
186 unsigned long size, 187 unsigned long size,
187 bool ipa_uncached) 188 bool ipa_uncached)
188{ 189{
@@ -246,7 +247,7 @@ static inline void __kvm_flush_dcache_pte(pte_t pte)
246static inline void __kvm_flush_dcache_pmd(pmd_t pmd) 247static inline void __kvm_flush_dcache_pmd(pmd_t pmd)
247{ 248{
248 unsigned long size = PMD_SIZE; 249 unsigned long size = PMD_SIZE;
249 pfn_t pfn = pmd_pfn(pmd); 250 kvm_pfn_t pfn = pmd_pfn(pmd);
250 251
251 while (size) { 252 while (size) {
252 void *va = kmap_atomic_pfn(pfn); 253 void *va = kmap_atomic_pfn(pfn);
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index a745a2a53853..dc46398bc3a5 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -88,7 +88,6 @@
88 88
89#define L_PMD_SECT_VALID (_AT(pmdval_t, 1) << 0) 89#define L_PMD_SECT_VALID (_AT(pmdval_t, 1) << 0)
90#define L_PMD_SECT_DIRTY (_AT(pmdval_t, 1) << 55) 90#define L_PMD_SECT_DIRTY (_AT(pmdval_t, 1) << 55)
91#define L_PMD_SECT_SPLITTING (_AT(pmdval_t, 1) << 56)
92#define L_PMD_SECT_NONE (_AT(pmdval_t, 1) << 57) 91#define L_PMD_SECT_NONE (_AT(pmdval_t, 1) << 57)
93#define L_PMD_SECT_RDONLY (_AT(pteval_t, 1) << 58) 92#define L_PMD_SECT_RDONLY (_AT(pteval_t, 1) << 58)
94 93
@@ -232,13 +231,6 @@ static inline pte_t pte_mkspecial(pte_t pte)
232 231
233#ifdef CONFIG_TRANSPARENT_HUGEPAGE 232#ifdef CONFIG_TRANSPARENT_HUGEPAGE
234#define pmd_trans_huge(pmd) (pmd_val(pmd) && !pmd_table(pmd)) 233#define pmd_trans_huge(pmd) (pmd_val(pmd) && !pmd_table(pmd))
235#define pmd_trans_splitting(pmd) (pmd_isset((pmd), L_PMD_SECT_SPLITTING))
236
237#ifdef CONFIG_HAVE_RCU_TABLE_FREE
238#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
239void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
240 pmd_t *pmdp);
241#endif
242#endif 234#endif
243 235
244#define PMD_BIT_FUNC(fn,op) \ 236#define PMD_BIT_FUNC(fn,op) \
@@ -246,9 +238,9 @@ static inline pmd_t pmd_##fn(pmd_t pmd) { pmd_val(pmd) op; return pmd; }
246 238
247PMD_BIT_FUNC(wrprotect, |= L_PMD_SECT_RDONLY); 239PMD_BIT_FUNC(wrprotect, |= L_PMD_SECT_RDONLY);
248PMD_BIT_FUNC(mkold, &= ~PMD_SECT_AF); 240PMD_BIT_FUNC(mkold, &= ~PMD_SECT_AF);
249PMD_BIT_FUNC(mksplitting, |= L_PMD_SECT_SPLITTING);
250PMD_BIT_FUNC(mkwrite, &= ~L_PMD_SECT_RDONLY); 241PMD_BIT_FUNC(mkwrite, &= ~L_PMD_SECT_RDONLY);
251PMD_BIT_FUNC(mkdirty, |= L_PMD_SECT_DIRTY); 242PMD_BIT_FUNC(mkdirty, |= L_PMD_SECT_DIRTY);
243PMD_BIT_FUNC(mkclean, &= ~L_PMD_SECT_DIRTY);
252PMD_BIT_FUNC(mkyoung, |= PMD_SECT_AF); 244PMD_BIT_FUNC(mkyoung, |= PMD_SECT_AF);
253 245
254#define pmd_mkhuge(pmd) (__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT)) 246#define pmd_mkhuge(pmd) (__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT))
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 22f7fa0124ec..aba61fd3697a 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -992,9 +992,9 @@ out:
992 return ret; 992 return ret;
993} 993}
994 994
995static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap) 995static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
996{ 996{
997 pfn_t pfn = *pfnp; 997 kvm_pfn_t pfn = *pfnp;
998 gfn_t gfn = *ipap >> PAGE_SHIFT; 998 gfn_t gfn = *ipap >> PAGE_SHIFT;
999 999
1000 if (PageTransCompound(pfn_to_page(pfn))) { 1000 if (PageTransCompound(pfn_to_page(pfn))) {
@@ -1201,7 +1201,7 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1201 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); 1201 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1202} 1202}
1203 1203
1204static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, pfn_t pfn, 1204static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, kvm_pfn_t pfn,
1205 unsigned long size, bool uncached) 1205 unsigned long size, bool uncached)
1206{ 1206{
1207 __coherent_cache_guest_page(vcpu, pfn, size, uncached); 1207 __coherent_cache_guest_page(vcpu, pfn, size, uncached);
@@ -1218,7 +1218,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
1218 struct kvm *kvm = vcpu->kvm; 1218 struct kvm *kvm = vcpu->kvm;
1219 struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; 1219 struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
1220 struct vm_area_struct *vma; 1220 struct vm_area_struct *vma;
1221 pfn_t pfn; 1221 kvm_pfn_t pfn;
1222 pgprot_t mem_type = PAGE_S2; 1222 pgprot_t mem_type = PAGE_S2;
1223 bool fault_ipa_uncached; 1223 bool fault_ipa_uncached;
1224 bool logging_active = memslot_is_logging(memslot); 1224 bool logging_active = memslot_is_logging(memslot);
@@ -1346,7 +1346,7 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
1346{ 1346{
1347 pmd_t *pmd; 1347 pmd_t *pmd;
1348 pte_t *pte; 1348 pte_t *pte;
1349 pfn_t pfn; 1349 kvm_pfn_t pfn;
1350 bool pfn_valid = false; 1350 bool pfn_valid = false;
1351 1351
1352 trace_kvm_access_fault(fault_ipa); 1352 trace_kvm_access_fault(fault_ipa);
diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c
index 588bbc288396..6bd1089b07e0 100644
--- a/arch/arm/lib/uaccess_with_memcpy.c
+++ b/arch/arm/lib/uaccess_with_memcpy.c
@@ -52,14 +52,13 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
52 * 52 *
53 * Lock the page table for the destination and check 53 * Lock the page table for the destination and check
54 * to see that it's still huge and whether or not we will 54 * to see that it's still huge and whether or not we will
55 * need to fault on write, or if we have a splitting THP. 55 * need to fault on write.
56 */ 56 */
57 if (unlikely(pmd_thp_or_huge(*pmd))) { 57 if (unlikely(pmd_thp_or_huge(*pmd))) {
58 ptl = &current->mm->page_table_lock; 58 ptl = &current->mm->page_table_lock;
59 spin_lock(ptl); 59 spin_lock(ptl);
60 if (unlikely(!pmd_thp_or_huge(*pmd) 60 if (unlikely(!pmd_thp_or_huge(*pmd)
61 || pmd_hugewillfault(*pmd) 61 || pmd_hugewillfault(*pmd))) {
62 || pmd_trans_splitting(*pmd))) {
63 spin_unlock(ptl); 62 spin_unlock(ptl);
64 return 0; 63 return 0;
65 } 64 }
diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c
index 1ec8e7590fc6..d0ba3551d49a 100644
--- a/arch/arm/mm/flush.c
+++ b/arch/arm/mm/flush.c
@@ -330,7 +330,7 @@ void flush_dcache_page(struct page *page)
330 mapping = page_mapping(page); 330 mapping = page_mapping(page);
331 331
332 if (!cache_ops_need_broadcast() && 332 if (!cache_ops_need_broadcast() &&
333 mapping && !page_mapped(page)) 333 mapping && !page_mapcount(page))
334 clear_bit(PG_dcache_clean, &page->flags); 334 clear_bit(PG_dcache_clean, &page->flags);
335 else { 335 else {
336 __flush_dcache_page(mapping, page); 336 __flush_dcache_page(mapping, page);
@@ -415,18 +415,3 @@ void __flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned l
415 */ 415 */
416 __cpuc_flush_dcache_area(page_address(page), PAGE_SIZE); 416 __cpuc_flush_dcache_area(page_address(page), PAGE_SIZE);
417} 417}
418
419#ifdef CONFIG_TRANSPARENT_HUGEPAGE
420#ifdef CONFIG_HAVE_RCU_TABLE_FREE
421void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
422 pmd_t *pmdp)
423{
424 pmd_t pmd = pmd_mksplitting(*pmdp);
425 VM_BUG_ON(address & ~PMD_MASK);
426 set_pmd_at(vma->vm_mm, address, pmdp, pmd);
427
428 /* dummy IPI to serialise against fast_gup */
429 kick_all_cpus_sync();
430}
431#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
432#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 0bf8b4320a91..736433912a1e 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -230,7 +230,8 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
230 return (vcpu_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101; 230 return (vcpu_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
231} 231}
232 232
233static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu, pfn_t pfn, 233static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu,
234 kvm_pfn_t pfn,
234 unsigned long size, 235 unsigned long size,
235 bool ipa_uncached) 236 bool ipa_uncached)
236{ 237{
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 69d2e2f86bce..2d545d7aa80b 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -353,21 +353,14 @@ static inline pgprot_t mk_sect_prot(pgprot_t prot)
353 353
354#ifdef CONFIG_TRANSPARENT_HUGEPAGE 354#ifdef CONFIG_TRANSPARENT_HUGEPAGE
355#define pmd_trans_huge(pmd) (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT)) 355#define pmd_trans_huge(pmd) (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT))
356#define pmd_trans_splitting(pmd) pte_special(pmd_pte(pmd))
357#ifdef CONFIG_HAVE_RCU_TABLE_FREE
358#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
359struct vm_area_struct;
360void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
361 pmd_t *pmdp);
362#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
363#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 356#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
364 357
365#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) 358#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
366#define pmd_young(pmd) pte_young(pmd_pte(pmd)) 359#define pmd_young(pmd) pte_young(pmd_pte(pmd))
367#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd))) 360#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd)))
368#define pmd_mksplitting(pmd) pte_pmd(pte_mkspecial(pmd_pte(pmd)))
369#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd))) 361#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd)))
370#define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd))) 362#define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd)))
363#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd)))
371#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd))) 364#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd)))
372#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) 365#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd)))
373#define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_TYPE_MASK)) 366#define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_TYPE_MASK))
diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c
index 46649d6e6c5a..60585bde1264 100644
--- a/arch/arm64/mm/flush.c
+++ b/arch/arm64/mm/flush.c
@@ -102,19 +102,3 @@ EXPORT_SYMBOL(flush_dcache_page);
102 * Additional functions defined in assembly. 102 * Additional functions defined in assembly.
103 */ 103 */
104EXPORT_SYMBOL(flush_icache_range); 104EXPORT_SYMBOL(flush_icache_range);
105
106#ifdef CONFIG_TRANSPARENT_HUGEPAGE
107#ifdef CONFIG_HAVE_RCU_TABLE_FREE
108void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
109 pmd_t *pmdp)
110{
111 pmd_t pmd = pmd_mksplitting(*pmdp);
112
113 VM_BUG_ON(address & ~PMD_MASK);
114 set_pmd_at(vma->vm_mm, address, pmdp, pmd);
115
116 /* dummy IPI to serialise against fast_gup */
117 kick_all_cpus_sync();
118}
119#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
120#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/avr32/include/asm/page.h b/arch/avr32/include/asm/page.h
index f805d1cb11bc..c5d2a3e2c62f 100644
--- a/arch/avr32/include/asm/page.h
+++ b/arch/avr32/include/asm/page.h
@@ -83,11 +83,9 @@ static inline int get_order(unsigned long size)
83 83
84#ifndef CONFIG_NEED_MULTIPLE_NODES 84#ifndef CONFIG_NEED_MULTIPLE_NODES
85 85
86#define PHYS_PFN_OFFSET (CONFIG_PHYS_OFFSET >> PAGE_SHIFT) 86#define ARCH_PFN_OFFSET (CONFIG_PHYS_OFFSET >> PAGE_SHIFT)
87 87
88#define pfn_to_page(pfn) (mem_map + ((pfn) - PHYS_PFN_OFFSET)) 88#define pfn_valid(pfn) ((pfn) >= ARCH_PFN_OFFSET && (pfn) < (ARCH_PFN_OFFSET + max_mapnr))
89#define page_to_pfn(page) ((unsigned long)((page) - mem_map) + PHYS_PFN_OFFSET)
90#define pfn_valid(pfn) ((pfn) >= PHYS_PFN_OFFSET && (pfn) < (PHYS_PFN_OFFSET + max_mapnr))
91#endif /* CONFIG_NEED_MULTIPLE_NODES */ 89#endif /* CONFIG_NEED_MULTIPLE_NODES */
92 90
93#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) 91#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
@@ -101,4 +99,6 @@ static inline int get_order(unsigned long size)
101 */ 99 */
102#define HIGHMEM_START 0x20000000UL 100#define HIGHMEM_START 0x20000000UL
103 101
102#include <asm-generic/memory_model.h>
103
104#endif /* __ASM_AVR32_PAGE_H */ 104#endif /* __ASM_AVR32_PAGE_H */
diff --git a/arch/frv/include/asm/page.h b/arch/frv/include/asm/page.h
index 8c97068ac8fc..688d8076a43a 100644
--- a/arch/frv/include/asm/page.h
+++ b/arch/frv/include/asm/page.h
@@ -34,7 +34,7 @@ typedef struct page *pgtable_t;
34#define pgprot_val(x) ((x).pgprot) 34#define pgprot_val(x) ((x).pgprot)
35 35
36#define __pte(x) ((pte_t) { (x) } ) 36#define __pte(x) ((pte_t) { (x) } )
37#define __pmd(x) ((pmd_t) { (x) } ) 37#define __pmd(x) ((pmd_t) { { (x) } } )
38#define __pud(x) ((pud_t) { (x) } ) 38#define __pud(x) ((pud_t) { (x) } )
39#define __pgd(x) ((pgd_t) { (x) } ) 39#define __pgd(x) ((pgd_t) { (x) } )
40#define __pgprot(x) ((pgprot_t) { (x) } ) 40#define __pgprot(x) ((pgprot_t) { (x) } )
diff --git a/arch/ia64/include/asm/page.h b/arch/ia64/include/asm/page.h
index ec48bb9f95e1..e8c486ef0d76 100644
--- a/arch/ia64/include/asm/page.h
+++ b/arch/ia64/include/asm/page.h
@@ -105,6 +105,7 @@ extern struct page *vmem_map;
105#ifdef CONFIG_DISCONTIGMEM 105#ifdef CONFIG_DISCONTIGMEM
106# define page_to_pfn(page) ((unsigned long) (page - vmem_map)) 106# define page_to_pfn(page) ((unsigned long) (page - vmem_map))
107# define pfn_to_page(pfn) (vmem_map + (pfn)) 107# define pfn_to_page(pfn) (vmem_map + (pfn))
108# define __pfn_to_phys(pfn) PFN_PHYS(pfn)
108#else 109#else
109# include <asm-generic/memory_model.h> 110# include <asm-generic/memory_model.h>
110#endif 111#endif
diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig
index 0b389a81c43a..a0fa88da3e31 100644
--- a/arch/metag/Kconfig
+++ b/arch/metag/Kconfig
@@ -36,9 +36,6 @@ config STACKTRACE_SUPPORT
36config LOCKDEP_SUPPORT 36config LOCKDEP_SUPPORT
37 def_bool y 37 def_bool y
38 38
39config HAVE_LATENCYTOP_SUPPORT
40 def_bool y
41
42config RWSEM_GENERIC_SPINLOCK 39config RWSEM_GENERIC_SPINLOCK
43 def_bool y 40 def_bool y
44 41
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index 0bce820428fc..5ecd0287a874 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -67,9 +67,6 @@ config STACKTRACE_SUPPORT
67config LOCKDEP_SUPPORT 67config LOCKDEP_SUPPORT
68 def_bool y 68 def_bool y
69 69
70config HAVE_LATENCYTOP_SUPPORT
71 def_bool y
72
73source "init/Kconfig" 70source "init/Kconfig"
74 71
75source "kernel/Kconfig.freezer" 72source "kernel/Kconfig.freezer"
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 6ded8d347af9..7c191443c7ea 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -101,9 +101,9 @@
101#define CAUSEF_DC (_ULCAST_(1) << 27) 101#define CAUSEF_DC (_ULCAST_(1) << 27)
102 102
103extern atomic_t kvm_mips_instance; 103extern atomic_t kvm_mips_instance;
104extern pfn_t(*kvm_mips_gfn_to_pfn) (struct kvm *kvm, gfn_t gfn); 104extern kvm_pfn_t (*kvm_mips_gfn_to_pfn)(struct kvm *kvm, gfn_t gfn);
105extern void (*kvm_mips_release_pfn_clean) (pfn_t pfn); 105extern void (*kvm_mips_release_pfn_clean)(kvm_pfn_t pfn);
106extern bool(*kvm_mips_is_error_pfn) (pfn_t pfn); 106extern bool (*kvm_mips_is_error_pfn)(kvm_pfn_t pfn);
107 107
108struct kvm_vm_stat { 108struct kvm_vm_stat {
109 u32 remote_tlb_flush; 109 u32 remote_tlb_flush;
diff --git a/arch/mips/include/asm/pgtable-bits.h b/arch/mips/include/asm/pgtable-bits.h
index ff7ad91c85db..97b313882678 100644
--- a/arch/mips/include/asm/pgtable-bits.h
+++ b/arch/mips/include/asm/pgtable-bits.h
@@ -131,14 +131,12 @@
131/* Huge TLB page */ 131/* Huge TLB page */
132#define _PAGE_HUGE_SHIFT (_PAGE_MODIFIED_SHIFT + 1) 132#define _PAGE_HUGE_SHIFT (_PAGE_MODIFIED_SHIFT + 1)
133#define _PAGE_HUGE (1 << _PAGE_HUGE_SHIFT) 133#define _PAGE_HUGE (1 << _PAGE_HUGE_SHIFT)
134#define _PAGE_SPLITTING_SHIFT (_PAGE_HUGE_SHIFT + 1)
135#define _PAGE_SPLITTING (1 << _PAGE_SPLITTING_SHIFT)
136#endif /* CONFIG_64BIT && CONFIG_MIPS_HUGE_TLB_SUPPORT */ 134#endif /* CONFIG_64BIT && CONFIG_MIPS_HUGE_TLB_SUPPORT */
137 135
138#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR6) 136#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR6)
139/* XI - page cannot be executed */ 137/* XI - page cannot be executed */
140#ifdef _PAGE_SPLITTING_SHIFT 138#ifdef _PAGE_HUGE_SHIFT
141#define _PAGE_NO_EXEC_SHIFT (_PAGE_SPLITTING_SHIFT + 1) 139#define _PAGE_NO_EXEC_SHIFT (_PAGE_HUGE_SHIFT + 1)
142#else 140#else
143#define _PAGE_NO_EXEC_SHIFT (_PAGE_MODIFIED_SHIFT + 1) 141#define _PAGE_NO_EXEC_SHIFT (_PAGE_MODIFIED_SHIFT + 1)
144#endif 142#endif
@@ -153,8 +151,8 @@
153 151
154#if defined(_PAGE_NO_READ_SHIFT) 152#if defined(_PAGE_NO_READ_SHIFT)
155#define _PAGE_GLOBAL_SHIFT (_PAGE_NO_READ_SHIFT + 1) 153#define _PAGE_GLOBAL_SHIFT (_PAGE_NO_READ_SHIFT + 1)
156#elif defined(_PAGE_SPLITTING_SHIFT) 154#elif defined(_PAGE_HUGE_SHIFT)
157#define _PAGE_GLOBAL_SHIFT (_PAGE_SPLITTING_SHIFT + 1) 155#define _PAGE_GLOBAL_SHIFT (_PAGE_HUGE_SHIFT + 1)
158#else 156#else
159#define _PAGE_GLOBAL_SHIFT (_PAGE_MODIFIED_SHIFT + 1) 157#define _PAGE_GLOBAL_SHIFT (_PAGE_MODIFIED_SHIFT + 1)
160#endif 158#endif
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index 8957f15e21ec..6995b4a02e23 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -482,27 +482,9 @@ static inline pmd_t pmd_mkhuge(pmd_t pmd)
482 return pmd; 482 return pmd;
483} 483}
484 484
485static inline int pmd_trans_splitting(pmd_t pmd)
486{
487 return !!(pmd_val(pmd) & _PAGE_SPLITTING);
488}
489
490static inline pmd_t pmd_mksplitting(pmd_t pmd)
491{
492 pmd_val(pmd) |= _PAGE_SPLITTING;
493
494 return pmd;
495}
496
497extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, 485extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
498 pmd_t *pmdp, pmd_t pmd); 486 pmd_t *pmdp, pmd_t pmd);
499 487
500#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
501/* Extern to avoid header file madness */
502extern void pmdp_splitting_flush(struct vm_area_struct *vma,
503 unsigned long address,
504 pmd_t *pmdp);
505
506#define __HAVE_ARCH_PMD_WRITE 488#define __HAVE_ARCH_PMD_WRITE
507static inline int pmd_write(pmd_t pmd) 489static inline int pmd_write(pmd_t pmd)
508{ 490{
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h
index 97c03f468924..b0ebe59f73fd 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -73,8 +73,10 @@
73#define MADV_SEQUENTIAL 2 /* expect sequential page references */ 73#define MADV_SEQUENTIAL 2 /* expect sequential page references */
74#define MADV_WILLNEED 3 /* will need these pages */ 74#define MADV_WILLNEED 3 /* will need these pages */
75#define MADV_DONTNEED 4 /* don't need these pages */ 75#define MADV_DONTNEED 4 /* don't need these pages */
76#define MADV_FREE 5 /* free pages only if memory pressure */
76 77
77/* common parameters: try to keep these consistent across architectures */ 78/* common parameters: try to keep these consistent across architectures */
79#define MADV_FREE 8 /* free pages only if memory pressure */
78#define MADV_REMOVE 9 /* remove these pages & resources */ 80#define MADV_REMOVE 9 /* remove these pages & resources */
79#define MADV_DONTFORK 10 /* don't inherit across fork */ 81#define MADV_DONTFORK 10 /* don't inherit across fork */
80#define MADV_DOFORK 11 /* do inherit across fork */ 82#define MADV_DOFORK 11 /* do inherit across fork */
diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index 41b1b090f56f..1b675c7ce89f 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -1525,7 +1525,7 @@ int kvm_mips_sync_icache(unsigned long va, struct kvm_vcpu *vcpu)
1525 struct kvm *kvm = vcpu->kvm; 1525 struct kvm *kvm = vcpu->kvm;
1526 unsigned long pa; 1526 unsigned long pa;
1527 gfn_t gfn; 1527 gfn_t gfn;
1528 pfn_t pfn; 1528 kvm_pfn_t pfn;
1529 1529
1530 gfn = va >> PAGE_SHIFT; 1530 gfn = va >> PAGE_SHIFT;
1531 1531
diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c
index aed0ac2a4972..570479c03bdc 100644
--- a/arch/mips/kvm/tlb.c
+++ b/arch/mips/kvm/tlb.c
@@ -38,13 +38,13 @@ atomic_t kvm_mips_instance;
38EXPORT_SYMBOL(kvm_mips_instance); 38EXPORT_SYMBOL(kvm_mips_instance);
39 39
40/* These function pointers are initialized once the KVM module is loaded */ 40/* These function pointers are initialized once the KVM module is loaded */
41pfn_t (*kvm_mips_gfn_to_pfn)(struct kvm *kvm, gfn_t gfn); 41kvm_pfn_t (*kvm_mips_gfn_to_pfn)(struct kvm *kvm, gfn_t gfn);
42EXPORT_SYMBOL(kvm_mips_gfn_to_pfn); 42EXPORT_SYMBOL(kvm_mips_gfn_to_pfn);
43 43
44void (*kvm_mips_release_pfn_clean)(pfn_t pfn); 44void (*kvm_mips_release_pfn_clean)(kvm_pfn_t pfn);
45EXPORT_SYMBOL(kvm_mips_release_pfn_clean); 45EXPORT_SYMBOL(kvm_mips_release_pfn_clean);
46 46
47bool (*kvm_mips_is_error_pfn)(pfn_t pfn); 47bool (*kvm_mips_is_error_pfn)(kvm_pfn_t pfn);
48EXPORT_SYMBOL(kvm_mips_is_error_pfn); 48EXPORT_SYMBOL(kvm_mips_is_error_pfn);
49 49
50uint32_t kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu) 50uint32_t kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
@@ -144,7 +144,7 @@ EXPORT_SYMBOL(kvm_mips_dump_guest_tlbs);
144static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn) 144static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn)
145{ 145{
146 int srcu_idx, err = 0; 146 int srcu_idx, err = 0;
147 pfn_t pfn; 147 kvm_pfn_t pfn;
148 148
149 if (kvm->arch.guest_pmap[gfn] != KVM_INVALID_PAGE) 149 if (kvm->arch.guest_pmap[gfn] != KVM_INVALID_PAGE)
150 return 0; 150 return 0;
@@ -262,7 +262,7 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr,
262 struct kvm_vcpu *vcpu) 262 struct kvm_vcpu *vcpu)
263{ 263{
264 gfn_t gfn; 264 gfn_t gfn;
265 pfn_t pfn0, pfn1; 265 kvm_pfn_t pfn0, pfn1;
266 unsigned long vaddr = 0; 266 unsigned long vaddr = 0;
267 unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0; 267 unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
268 int even; 268 int even;
@@ -313,7 +313,7 @@ EXPORT_SYMBOL(kvm_mips_handle_kseg0_tlb_fault);
313int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr, 313int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
314 struct kvm_vcpu *vcpu) 314 struct kvm_vcpu *vcpu)
315{ 315{
316 pfn_t pfn0, pfn1; 316 kvm_pfn_t pfn0, pfn1;
317 unsigned long flags, old_entryhi = 0, vaddr = 0; 317 unsigned long flags, old_entryhi = 0, vaddr = 0;
318 unsigned long entrylo0 = 0, entrylo1 = 0; 318 unsigned long entrylo0 = 0, entrylo1 = 0;
319 319
@@ -360,7 +360,7 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
360{ 360{
361 unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0; 361 unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
362 struct kvm *kvm = vcpu->kvm; 362 struct kvm *kvm = vcpu->kvm;
363 pfn_t pfn0, pfn1; 363 kvm_pfn_t pfn0, pfn1;
364 364
365 if ((tlb->tlb_hi & VPN2_MASK) == 0) { 365 if ((tlb->tlb_hi & VPN2_MASK) == 0) {
366 pfn0 = 0; 366 pfn0 = 0;
diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c
index 5d3a25e1cfae..caac3d747a90 100644
--- a/arch/mips/mm/c-r4k.c
+++ b/arch/mips/mm/c-r4k.c
@@ -587,7 +587,8 @@ static inline void local_r4k_flush_cache_page(void *args)
587 * another ASID than the current one. 587 * another ASID than the current one.
588 */ 588 */
589 map_coherent = (cpu_has_dc_aliases && 589 map_coherent = (cpu_has_dc_aliases &&
590 page_mapped(page) && !Page_dcache_dirty(page)); 590 page_mapcount(page) &&
591 !Page_dcache_dirty(page));
591 if (map_coherent) 592 if (map_coherent)
592 vaddr = kmap_coherent(page, addr); 593 vaddr = kmap_coherent(page, addr);
593 else 594 else
diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c
index aab218c36e0d..3f159caf6dbc 100644
--- a/arch/mips/mm/cache.c
+++ b/arch/mips/mm/cache.c
@@ -106,7 +106,7 @@ void __flush_anon_page(struct page *page, unsigned long vmaddr)
106 unsigned long addr = (unsigned long) page_address(page); 106 unsigned long addr = (unsigned long) page_address(page);
107 107
108 if (pages_do_alias(addr, vmaddr)) { 108 if (pages_do_alias(addr, vmaddr)) {
109 if (page_mapped(page) && !Page_dcache_dirty(page)) { 109 if (page_mapcount(page) && !Page_dcache_dirty(page)) {
110 void *kaddr; 110 void *kaddr;
111 111
112 kaddr = kmap_coherent(page, vmaddr); 112 kaddr = kmap_coherent(page, vmaddr);
diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c
index 349995d19c7f..1afd87c999b0 100644
--- a/arch/mips/mm/gup.c
+++ b/arch/mips/mm/gup.c
@@ -87,8 +87,6 @@ static int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end,
87 do { 87 do {
88 VM_BUG_ON(compound_head(page) != head); 88 VM_BUG_ON(compound_head(page) != head);
89 pages[*nr] = page; 89 pages[*nr] = page;
90 if (PageTail(page))
91 get_huge_page_tail(page);
92 (*nr)++; 90 (*nr)++;
93 page++; 91 page++;
94 refs++; 92 refs++;
@@ -109,18 +107,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
109 pmd_t pmd = *pmdp; 107 pmd_t pmd = *pmdp;
110 108
111 next = pmd_addr_end(addr, end); 109 next = pmd_addr_end(addr, end);
112 /* 110 if (pmd_none(pmd))
113 * The pmd_trans_splitting() check below explains why
114 * pmdp_splitting_flush has to flush the tlb, to stop
115 * this gup-fast code from running while we set the
116 * splitting bit in the pmd. Returning zero will take
117 * the slow path that will call wait_split_huge_page()
118 * if the pmd is still in splitting state. gup-fast
119 * can't because it has irq disabled and
120 * wait_split_huge_page() would never return as the
121 * tlb flush IPI wouldn't run.
122 */
123 if (pmd_none(pmd) || pmd_trans_splitting(pmd))
124 return 0; 111 return 0;
125 if (unlikely(pmd_huge(pmd))) { 112 if (unlikely(pmd_huge(pmd))) {
126 if (!gup_huge_pmd(pmd, addr, next, write, pages,nr)) 113 if (!gup_huge_pmd(pmd, addr, next, write, pages,nr))
@@ -153,8 +140,6 @@ static int gup_huge_pud(pud_t pud, unsigned long addr, unsigned long end,
153 do { 140 do {
154 VM_BUG_ON(compound_head(page) != head); 141 VM_BUG_ON(compound_head(page) != head);
155 pages[*nr] = page; 142 pages[*nr] = page;
156 if (PageTail(page))
157 get_huge_page_tail(page);
158 (*nr)++; 143 (*nr)++;
159 page++; 144 page++;
160 refs++; 145 refs++;
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index 8770e619185e..7e5fa0938c21 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -165,7 +165,7 @@ void copy_user_highpage(struct page *to, struct page *from,
165 165
166 vto = kmap_atomic(to); 166 vto = kmap_atomic(to);
167 if (cpu_has_dc_aliases && 167 if (cpu_has_dc_aliases &&
168 page_mapped(from) && !Page_dcache_dirty(from)) { 168 page_mapcount(from) && !Page_dcache_dirty(from)) {
169 vfrom = kmap_coherent(from, vaddr); 169 vfrom = kmap_coherent(from, vaddr);
170 copy_page(vto, vfrom); 170 copy_page(vto, vfrom);
171 kunmap_coherent(); 171 kunmap_coherent();
@@ -187,7 +187,7 @@ void copy_to_user_page(struct vm_area_struct *vma,
187 unsigned long len) 187 unsigned long len)
188{ 188{
189 if (cpu_has_dc_aliases && 189 if (cpu_has_dc_aliases &&
190 page_mapped(page) && !Page_dcache_dirty(page)) { 190 page_mapcount(page) && !Page_dcache_dirty(page)) {
191 void *vto = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK); 191 void *vto = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
192 memcpy(vto, src, len); 192 memcpy(vto, src, len);
193 kunmap_coherent(); 193 kunmap_coherent();
@@ -205,7 +205,7 @@ void copy_from_user_page(struct vm_area_struct *vma,
205 unsigned long len) 205 unsigned long len)
206{ 206{
207 if (cpu_has_dc_aliases && 207 if (cpu_has_dc_aliases &&
208 page_mapped(page) && !Page_dcache_dirty(page)) { 208 page_mapcount(page) && !Page_dcache_dirty(page)) {
209 void *vfrom = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK); 209 void *vfrom = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
210 memcpy(dst, vfrom, len); 210 memcpy(dst, vfrom, len);
211 kunmap_coherent(); 211 kunmap_coherent();
diff --git a/arch/mips/mm/pgtable-64.c b/arch/mips/mm/pgtable-64.c
index e8adc0069d66..ce4473e7c0d2 100644
--- a/arch/mips/mm/pgtable-64.c
+++ b/arch/mips/mm/pgtable-64.c
@@ -62,20 +62,6 @@ void pmd_init(unsigned long addr, unsigned long pagetable)
62} 62}
63#endif 63#endif
64 64
65#ifdef CONFIG_TRANSPARENT_HUGEPAGE
66
67void pmdp_splitting_flush(struct vm_area_struct *vma,
68 unsigned long address,
69 pmd_t *pmdp)
70{
71 if (!pmd_trans_splitting(*pmdp)) {
72 pmd_t pmd = pmd_mksplitting(*pmdp);
73 set_pmd_at(vma->vm_mm, address, pmdp, pmd);
74 }
75}
76
77#endif
78
79pmd_t mk_pmd(struct page *page, pgprot_t prot) 65pmd_t mk_pmd(struct page *page, pgprot_t prot)
80{ 66{
81 pmd_t pmd; 67 pmd_t pmd;
diff --git a/arch/mips/mm/tlbex.c b/arch/mips/mm/tlbex.c
index 32e0be27673f..482192cc8f2b 100644
--- a/arch/mips/mm/tlbex.c
+++ b/arch/mips/mm/tlbex.c
@@ -240,7 +240,6 @@ static void output_pgtable_bits_defines(void)
240 pr_define("_PAGE_MODIFIED_SHIFT %d\n", _PAGE_MODIFIED_SHIFT); 240 pr_define("_PAGE_MODIFIED_SHIFT %d\n", _PAGE_MODIFIED_SHIFT);
241#ifdef CONFIG_MIPS_HUGE_TLB_SUPPORT 241#ifdef CONFIG_MIPS_HUGE_TLB_SUPPORT
242 pr_define("_PAGE_HUGE_SHIFT %d\n", _PAGE_HUGE_SHIFT); 242 pr_define("_PAGE_HUGE_SHIFT %d\n", _PAGE_HUGE_SHIFT);
243 pr_define("_PAGE_SPLITTING_SHIFT %d\n", _PAGE_SPLITTING_SHIFT);
244#endif 243#endif
245#ifdef CONFIG_CPU_MIPSR2 244#ifdef CONFIG_CPU_MIPSR2
246 if (cpu_has_rixi) { 245 if (cpu_has_rixi) {
diff --git a/arch/mn10300/include/asm/page.h b/arch/mn10300/include/asm/page.h
index 8288e124165b..3810a6f740fd 100644
--- a/arch/mn10300/include/asm/page.h
+++ b/arch/mn10300/include/asm/page.h
@@ -107,6 +107,7 @@ static inline int get_order(unsigned long size)
107#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) 107#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
108#define pfn_to_page(pfn) (mem_map + ((pfn) - __pfn_disp)) 108#define pfn_to_page(pfn) (mem_map + ((pfn) - __pfn_disp))
109#define page_to_pfn(page) ((unsigned long)((page) - mem_map) + __pfn_disp) 109#define page_to_pfn(page) ((unsigned long)((page) - mem_map) + __pfn_disp)
110#define __pfn_to_phys(pfn) PFN_PHYS(pfn)
110 111
111#define pfn_valid(pfn) \ 112#define pfn_valid(pfn) \
112({ \ 113({ \
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 729f89163bc3..7c34cafdf301 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -79,9 +79,6 @@ config TIME_LOW_RES
79 depends on SMP 79 depends on SMP
80 default y 80 default y
81 81
82config HAVE_LATENCYTOP_SUPPORT
83 def_bool y
84
85# unless you want to implement ACPI on PA-RISC ... ;-) 82# unless you want to implement ACPI on PA-RISC ... ;-)
86config PM 83config PM
87 bool 84 bool
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h
index dd4d1876a020..cf830d465f75 100644
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -43,8 +43,10 @@
43#define MADV_SPACEAVAIL 5 /* insure that resources are reserved */ 43#define MADV_SPACEAVAIL 5 /* insure that resources are reserved */
44#define MADV_VPS_PURGE 6 /* Purge pages from VM page cache */ 44#define MADV_VPS_PURGE 6 /* Purge pages from VM page cache */
45#define MADV_VPS_INHERIT 7 /* Inherit parents page size */ 45#define MADV_VPS_INHERIT 7 /* Inherit parents page size */
46#define MADV_FREE 8 /* free pages only if memory pressure */
46 47
47/* common/generic parameters */ 48/* common/generic parameters */
49#define MADV_FREE 8 /* free pages only if memory pressure */
48#define MADV_REMOVE 9 /* remove these pages & resources */ 50#define MADV_REMOVE 9 /* remove these pages & resources */
49#define MADV_DONTFORK 10 /* don't inherit across fork */ 51#define MADV_DONTFORK 10 /* don't inherit across fork */
50#define MADV_DOFORK 11 /* do inherit across fork */ 52#define MADV_DOFORK 11 /* do inherit across fork */
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 7d5a8350f913..94f6c5089e0c 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -47,9 +47,6 @@ config STACKTRACE_SUPPORT
47 bool 47 bool
48 default y 48 default y
49 49
50config HAVE_LATENCYTOP_SUPPORT
51 def_bool y
52
53config TRACE_IRQFLAGS_SUPPORT 50config TRACE_IRQFLAGS_SUPPORT
54 bool 51 bool
55 default y 52 default y
diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index 9e55e3b1fef0..849bbec80f7b 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -256,13 +256,6 @@ static inline int pmd_trans_huge(pmd_t pmd)
256 (_PAGE_PTE | _PAGE_THP_HUGE)); 256 (_PAGE_PTE | _PAGE_THP_HUGE));
257} 257}
258 258
259static inline int pmd_trans_splitting(pmd_t pmd)
260{
261 if (pmd_trans_huge(pmd))
262 return pmd_val(pmd) & _PAGE_SPLITTING;
263 return 0;
264}
265
266static inline int pmd_large(pmd_t pmd) 259static inline int pmd_large(pmd_t pmd)
267{ 260{
268 return !!(pmd_val(pmd) & _PAGE_PTE); 261 return !!(pmd_val(pmd) & _PAGE_PTE);
@@ -273,11 +266,6 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd)
273 return __pmd(pmd_val(pmd) & ~_PAGE_PRESENT); 266 return __pmd(pmd_val(pmd) & ~_PAGE_PRESENT);
274} 267}
275 268
276static inline pmd_t pmd_mksplitting(pmd_t pmd)
277{
278 return __pmd(pmd_val(pmd) | _PAGE_SPLITTING);
279}
280
281#define __HAVE_ARCH_PMD_SAME 269#define __HAVE_ARCH_PMD_SAME
282static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) 270static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
283{ 271{
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index 2ff8b3df553d..06f17e778c27 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -41,11 +41,6 @@
41#endif 41#endif
42 42
43/* 43/*
44 * THP pages can't be special. So use the _PAGE_SPECIAL
45 */
46#define _PAGE_SPLITTING _PAGE_SPECIAL
47
48/*
49 * We need to differentiate between explicit huge page and THP huge 44 * We need to differentiate between explicit huge page and THP huge
50 * page, since THP huge page also need to track real subpage details 45 * page, since THP huge page also need to track real subpage details
51 */ 46 */
@@ -54,9 +49,8 @@
54/* 49/*
55 * set of bits not changed in pmd_modify. 50 * set of bits not changed in pmd_modify.
56 */ 51 */
57#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | \ 52#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \
58 _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPLITTING | \ 53 _PAGE_ACCESSED | _PAGE_THP_HUGE)
59 _PAGE_THP_HUGE | _PAGE_PTE | _PAGE_SOFT_DIRTY)
60 54
61#ifdef CONFIG_PPC_64K_PAGES 55#ifdef CONFIG_PPC_64K_PAGES
62#include <asm/book3s/64/hash-64k.h> 56#include <asm/book3s/64/hash-64k.h>
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index b3a5badab69f..8204b0c393aa 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -223,9 +223,11 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd)
223#define pmd_pfn(pmd) pte_pfn(pmd_pte(pmd)) 223#define pmd_pfn(pmd) pte_pfn(pmd_pte(pmd))
224#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) 224#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
225#define pmd_young(pmd) pte_young(pmd_pte(pmd)) 225#define pmd_young(pmd) pte_young(pmd_pte(pmd))
226#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
226#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd))) 227#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd)))
227#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd))) 228#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd)))
228#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd))) 229#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd)))
230#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd)))
229#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) 231#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd)))
230#define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd))) 232#define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd)))
231 233
@@ -266,10 +268,6 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
266extern pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, 268extern pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
267 unsigned long addr, pmd_t *pmdp); 269 unsigned long addr, pmd_t *pmdp);
268 270
269#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
270extern void pmdp_splitting_flush(struct vm_area_struct *vma,
271 unsigned long address, pmd_t *pmdp);
272
273extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, 271extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
274 unsigned long address, pmd_t *pmdp); 272 unsigned long address, pmd_t *pmdp);
275#define pmdp_collapse_flush pmdp_collapse_flush 273#define pmdp_collapse_flush pmdp_collapse_flush
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 9fac01cb89c1..8f39796c9da8 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -154,8 +154,8 @@ extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
154 bool upper, u32 val); 154 bool upper, u32 val);
155extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr); 155extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
156extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu); 156extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu);
157extern pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, bool writing, 157extern kvm_pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa,
158 bool *writable); 158 bool writing, bool *writable);
159extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, 159extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
160 unsigned long *rmap, long pte_index, int realmode); 160 unsigned long *rmap, long pte_index, int realmode);
161extern void kvmppc_update_rmap_change(unsigned long *rmap, unsigned long psize); 161extern void kvmppc_update_rmap_change(unsigned long *rmap, unsigned long psize);
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index c6ef05bd0765..2241d5357129 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -515,7 +515,7 @@ void kvmppc_claim_lpid(long lpid);
515void kvmppc_free_lpid(long lpid); 515void kvmppc_free_lpid(long lpid);
516void kvmppc_init_lpid(unsigned long nr_lpids); 516void kvmppc_init_lpid(unsigned long nr_lpids);
517 517
518static inline void kvmppc_mmu_flush_icache(pfn_t pfn) 518static inline void kvmppc_mmu_flush_icache(kvm_pfn_t pfn)
519{ 519{
520 struct page *page; 520 struct page *page;
521 /* 521 /*
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 099c79d8c160..638c6d9be9e0 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -366,7 +366,7 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
366} 366}
367EXPORT_SYMBOL_GPL(kvmppc_core_prepare_to_enter); 367EXPORT_SYMBOL_GPL(kvmppc_core_prepare_to_enter);
368 368
369pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, bool writing, 369kvm_pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, bool writing,
370 bool *writable) 370 bool *writable)
371{ 371{
372 ulong mp_pa = vcpu->arch.magic_page_pa & KVM_PAM; 372 ulong mp_pa = vcpu->arch.magic_page_pa & KVM_PAM;
@@ -379,9 +379,9 @@ pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, bool writing,
379 gpa &= ~0xFFFULL; 379 gpa &= ~0xFFFULL;
380 if (unlikely(mp_pa) && unlikely((gpa & KVM_PAM) == mp_pa)) { 380 if (unlikely(mp_pa) && unlikely((gpa & KVM_PAM) == mp_pa)) {
381 ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK; 381 ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK;
382 pfn_t pfn; 382 kvm_pfn_t pfn;
383 383
384 pfn = (pfn_t)virt_to_phys((void*)shared_page) >> PAGE_SHIFT; 384 pfn = (kvm_pfn_t)virt_to_phys((void*)shared_page) >> PAGE_SHIFT;
385 get_page(pfn_to_page(pfn)); 385 get_page(pfn_to_page(pfn));
386 if (writable) 386 if (writable)
387 *writable = true; 387 *writable = true;
diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c
index d5c9bfeb0c9c..55c4d51ea3e2 100644
--- a/arch/powerpc/kvm/book3s_32_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_32_mmu_host.c
@@ -142,7 +142,7 @@ extern char etext[];
142int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte, 142int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte,
143 bool iswrite) 143 bool iswrite)
144{ 144{
145 pfn_t hpaddr; 145 kvm_pfn_t hpaddr;
146 u64 vpn; 146 u64 vpn;
147 u64 vsid; 147 u64 vsid;
148 struct kvmppc_sid_map *map; 148 struct kvmppc_sid_map *map;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index 79ad35abd196..913cd2198fa6 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -83,7 +83,7 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte,
83 bool iswrite) 83 bool iswrite)
84{ 84{
85 unsigned long vpn; 85 unsigned long vpn;
86 pfn_t hpaddr; 86 kvm_pfn_t hpaddr;
87 ulong hash, hpteg; 87 ulong hash, hpteg;
88 u64 vsid; 88 u64 vsid;
89 int ret; 89 int ret;
diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h
index 72920bed3ac6..94f04fcb373e 100644
--- a/arch/powerpc/kvm/e500.h
+++ b/arch/powerpc/kvm/e500.h
@@ -41,7 +41,7 @@ enum vcpu_ftr {
41#define E500_TLB_MAS2_ATTR (0x7f) 41#define E500_TLB_MAS2_ATTR (0x7f)
42 42
43struct tlbe_ref { 43struct tlbe_ref {
44 pfn_t pfn; /* valid only for TLB0, except briefly */ 44 kvm_pfn_t pfn; /* valid only for TLB0, except briefly */
45 unsigned int flags; /* E500_TLB_* */ 45 unsigned int flags; /* E500_TLB_* */
46}; 46};
47 47
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index 34c43fff4adb..b0333cc737dd 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -163,9 +163,9 @@ void kvmppc_map_magic(struct kvm_vcpu *vcpu)
163 struct kvm_book3e_206_tlb_entry magic; 163 struct kvm_book3e_206_tlb_entry magic;
164 ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK; 164 ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK;
165 unsigned int stid; 165 unsigned int stid;
166 pfn_t pfn; 166 kvm_pfn_t pfn;
167 167
168 pfn = (pfn_t)virt_to_phys((void *)shared_page) >> PAGE_SHIFT; 168 pfn = (kvm_pfn_t)virt_to_phys((void *)shared_page) >> PAGE_SHIFT;
169 get_page(pfn_to_page(pfn)); 169 get_page(pfn_to_page(pfn));
170 170
171 preempt_disable(); 171 preempt_disable();
@@ -246,7 +246,7 @@ static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe)
246 246
247static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref, 247static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref,
248 struct kvm_book3e_206_tlb_entry *gtlbe, 248 struct kvm_book3e_206_tlb_entry *gtlbe,
249 pfn_t pfn, unsigned int wimg) 249 kvm_pfn_t pfn, unsigned int wimg)
250{ 250{
251 ref->pfn = pfn; 251 ref->pfn = pfn;
252 ref->flags = E500_TLB_VALID; 252 ref->flags = E500_TLB_VALID;
@@ -309,7 +309,7 @@ static void kvmppc_e500_setup_stlbe(
309 int tsize, struct tlbe_ref *ref, u64 gvaddr, 309 int tsize, struct tlbe_ref *ref, u64 gvaddr,
310 struct kvm_book3e_206_tlb_entry *stlbe) 310 struct kvm_book3e_206_tlb_entry *stlbe)
311{ 311{
312 pfn_t pfn = ref->pfn; 312 kvm_pfn_t pfn = ref->pfn;
313 u32 pr = vcpu->arch.shared->msr & MSR_PR; 313 u32 pr = vcpu->arch.shared->msr & MSR_PR;
314 314
315 BUG_ON(!(ref->flags & E500_TLB_VALID)); 315 BUG_ON(!(ref->flags & E500_TLB_VALID));
diff --git a/arch/powerpc/kvm/trace_pr.h b/arch/powerpc/kvm/trace_pr.h
index 810507cb688a..d44f324184fb 100644
--- a/arch/powerpc/kvm/trace_pr.h
+++ b/arch/powerpc/kvm/trace_pr.h
@@ -30,7 +30,7 @@ TRACE_EVENT(kvm_book3s_reenter,
30#ifdef CONFIG_PPC_BOOK3S_64 30#ifdef CONFIG_PPC_BOOK3S_64
31 31
32TRACE_EVENT(kvm_book3s_64_mmu_map, 32TRACE_EVENT(kvm_book3s_64_mmu_map,
33 TP_PROTO(int rflags, ulong hpteg, ulong va, pfn_t hpaddr, 33 TP_PROTO(int rflags, ulong hpteg, ulong va, kvm_pfn_t hpaddr,
34 struct kvmppc_pte *orig_pte), 34 struct kvmppc_pte *orig_pte),
35 TP_ARGS(rflags, hpteg, va, hpaddr, orig_pte), 35 TP_ARGS(rflags, hpteg, va, hpaddr, orig_pte),
36 36
diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c
index baf1301ded0c..49b152b0f926 100644
--- a/arch/powerpc/mm/hugepage-hash64.c
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -39,9 +39,6 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
39 /* If PMD busy, retry the access */ 39 /* If PMD busy, retry the access */
40 if (unlikely(old_pmd & _PAGE_BUSY)) 40 if (unlikely(old_pmd & _PAGE_BUSY))
41 return 0; 41 return 0;
42 /* If PMD is trans splitting retry the access */
43 if (unlikely(old_pmd & _PAGE_SPLITTING))
44 return 0;
45 /* If PMD permissions don't match, take page fault */ 42 /* If PMD permissions don't match, take page fault */
46 if (unlikely(access & ~old_pmd)) 43 if (unlikely(access & ~old_pmd))
47 return 1; 44 return 1;
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 61b8b7ccea4f..744e24bcb85c 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -958,10 +958,6 @@ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
958 /* 958 /*
959 * A hugepage collapse is captured by pmd_none, because 959 * A hugepage collapse is captured by pmd_none, because
960 * it mark the pmd none and do a hpte invalidate. 960 * it mark the pmd none and do a hpte invalidate.
961 *
962 * We don't worry about pmd_trans_splitting here, The
963 * caller if it needs to handle the splitting case
964 * should check for that.
965 */ 961 */
966 if (pmd_none(pmd)) 962 if (pmd_none(pmd))
967 return NULL; 963 return NULL;
@@ -999,7 +995,7 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
999{ 995{
1000 unsigned long mask; 996 unsigned long mask;
1001 unsigned long pte_end; 997 unsigned long pte_end;
1002 struct page *head, *page, *tail; 998 struct page *head, *page;
1003 pte_t pte; 999 pte_t pte;
1004 int refs; 1000 int refs;
1005 1001
@@ -1022,7 +1018,6 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
1022 head = pte_page(pte); 1018 head = pte_page(pte);
1023 1019
1024 page = head + ((addr & (sz-1)) >> PAGE_SHIFT); 1020 page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
1025 tail = page;
1026 do { 1021 do {
1027 VM_BUG_ON(compound_head(page) != head); 1022 VM_BUG_ON(compound_head(page) != head);
1028 pages[*nr] = page; 1023 pages[*nr] = page;
@@ -1044,15 +1039,5 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
1044 return 0; 1039 return 0;
1045 } 1040 }
1046 1041
1047 /*
1048 * Any tail page need their mapcount reference taken before we
1049 * return.
1050 */
1051 while (refs--) {
1052 if (PageTail(tail))
1053 get_huge_page_tail(tail);
1054 tail++;
1055 }
1056
1057 return 1; 1042 return 1;
1058} 1043}
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index ea6bc31debb0..3124a20d0fab 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -604,55 +604,6 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
604} 604}
605 605
606/* 606/*
607 * We mark the pmd splitting and invalidate all the hpte
608 * entries for this hugepage.
609 */
610void pmdp_splitting_flush(struct vm_area_struct *vma,
611 unsigned long address, pmd_t *pmdp)
612{
613 unsigned long old, tmp;
614
615 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
616
617#ifdef CONFIG_DEBUG_VM
618 WARN_ON(!pmd_trans_huge(*pmdp));
619 assert_spin_locked(&vma->vm_mm->page_table_lock);
620#endif
621
622#ifdef PTE_ATOMIC_UPDATES
623
624 __asm__ __volatile__(
625 "1: ldarx %0,0,%3\n\
626 andi. %1,%0,%6\n\
627 bne- 1b \n\
628 oris %1,%0,%4@h \n\
629 stdcx. %1,0,%3 \n\
630 bne- 1b"
631 : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
632 : "r" (pmdp), "i" (_PAGE_SPLITTING), "m" (*pmdp), "i" (_PAGE_BUSY)
633 : "cc" );
634#else
635 old = pmd_val(*pmdp);
636 *pmdp = __pmd(old | _PAGE_SPLITTING);
637#endif
638 /*
639 * If we didn't had the splitting flag set, go and flush the
640 * HPTE entries.
641 */
642 trace_hugepage_splitting(address, old);
643 if (!(old & _PAGE_SPLITTING)) {
644 /* We need to flush the hpte */
645 if (old & _PAGE_HASHPTE)
646 hpte_do_hugepage_flush(vma->vm_mm, address, pmdp, old);
647 }
648 /*
649 * This ensures that generic code that rely on IRQ disabling
650 * to prevent a parallel THP split work as expected.
651 */
652 kick_all_cpus_sync();
653}
654
655/*
656 * We want to put the pgtable in pmd and use pgtable for tracking 607 * We want to put the pgtable in pmd and use pgtable for tracking
657 * the base page size hptes 608 * the base page size hptes
658 */ 609 */
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index fa9fb5b4c66c..d5543514c1df 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -135,7 +135,7 @@ static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
135 unsigned long end, struct mm_walk *walk) 135 unsigned long end, struct mm_walk *walk)
136{ 136{
137 struct vm_area_struct *vma = walk->vma; 137 struct vm_area_struct *vma = walk->vma;
138 split_huge_page_pmd(vma, addr, pmd); 138 split_huge_pmd(vma, pmd, addr);
139 return 0; 139 return 0;
140} 140}
141 141
diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c
index c713b349d967..0d112b94d91d 100644
--- a/arch/powerpc/sysdev/axonram.c
+++ b/arch/powerpc/sysdev/axonram.c
@@ -43,6 +43,7 @@
43#include <linux/types.h> 43#include <linux/types.h>
44#include <linux/of_device.h> 44#include <linux/of_device.h>
45#include <linux/of_platform.h> 45#include <linux/of_platform.h>
46#include <linux/pfn_t.h>
46 47
47#include <asm/page.h> 48#include <asm/page.h>
48#include <asm/prom.h> 49#include <asm/prom.h>
@@ -142,15 +143,13 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio)
142 */ 143 */
143static long 144static long
144axon_ram_direct_access(struct block_device *device, sector_t sector, 145axon_ram_direct_access(struct block_device *device, sector_t sector,
145 void __pmem **kaddr, unsigned long *pfn) 146 void __pmem **kaddr, pfn_t *pfn)
146{ 147{
147 struct axon_ram_bank *bank = device->bd_disk->private_data; 148 struct axon_ram_bank *bank = device->bd_disk->private_data;
148 loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT; 149 loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT;
149 void *addr = (void *)(bank->ph_addr + offset);
150
151 *kaddr = (void __pmem *)addr;
152 *pfn = virt_to_phys(addr) >> PAGE_SHIFT;
153 150
151 *kaddr = (void __pmem __force *) bank->io_addr + offset;
152 *pfn = phys_to_pfn_t(bank->ph_addr + offset, PFN_DEV);
154 return bank->size - offset; 153 return bank->size - offset;
155} 154}
156 155
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 24490344c30f..dbeeb3a049f2 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -10,9 +10,6 @@ config LOCKDEP_SUPPORT
10config STACKTRACE_SUPPORT 10config STACKTRACE_SUPPORT
11 def_bool y 11 def_bool y
12 12
13config HAVE_LATENCYTOP_SUPPORT
14 def_bool y
15
16config RWSEM_GENERIC_SPINLOCK 13config RWSEM_GENERIC_SPINLOCK
17 bool 14 bool
18 15
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 024f85f947ae..64ead8091248 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -286,7 +286,6 @@ static inline int is_module_addr(void *addr)
286 286
287#define _SEGMENT_ENTRY_DIRTY 0x2000 /* SW segment dirty bit */ 287#define _SEGMENT_ENTRY_DIRTY 0x2000 /* SW segment dirty bit */
288#define _SEGMENT_ENTRY_YOUNG 0x1000 /* SW segment young bit */ 288#define _SEGMENT_ENTRY_YOUNG 0x1000 /* SW segment young bit */
289#define _SEGMENT_ENTRY_SPLIT 0x0800 /* THP splitting bit */
290#define _SEGMENT_ENTRY_LARGE 0x0400 /* STE-format control, large page */ 289#define _SEGMENT_ENTRY_LARGE 0x0400 /* STE-format control, large page */
291#define _SEGMENT_ENTRY_READ 0x0002 /* SW segment read bit */ 290#define _SEGMENT_ENTRY_READ 0x0002 /* SW segment read bit */
292#define _SEGMENT_ENTRY_WRITE 0x0001 /* SW segment write bit */ 291#define _SEGMENT_ENTRY_WRITE 0x0001 /* SW segment write bit */
@@ -318,8 +317,6 @@ static inline int is_module_addr(void *addr)
318 * SW-bits: y young, d dirty, r read, w write 317 * SW-bits: y young, d dirty, r read, w write
319 */ 318 */
320 319
321#define _SEGMENT_ENTRY_SPLIT_BIT 11 /* THP splitting bit number */
322
323/* Page status table bits for virtualization */ 320/* Page status table bits for virtualization */
324#define PGSTE_ACC_BITS 0xf000000000000000UL 321#define PGSTE_ACC_BITS 0xf000000000000000UL
325#define PGSTE_FP_BIT 0x0800000000000000UL 322#define PGSTE_FP_BIT 0x0800000000000000UL
@@ -523,10 +520,6 @@ static inline int pmd_bad(pmd_t pmd)
523 return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS) != 0; 520 return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS) != 0;
524} 521}
525 522
526#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
527extern void pmdp_splitting_flush(struct vm_area_struct *vma,
528 unsigned long addr, pmd_t *pmdp);
529
530#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS 523#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
531extern int pmdp_set_access_flags(struct vm_area_struct *vma, 524extern int pmdp_set_access_flags(struct vm_area_struct *vma,
532 unsigned long address, pmd_t *pmdp, 525 unsigned long address, pmd_t *pmdp,
@@ -1424,8 +1417,7 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
1424 if (pmd_large(pmd)) { 1417 if (pmd_large(pmd)) {
1425 pmd_val(pmd) &= _SEGMENT_ENTRY_ORIGIN_LARGE | 1418 pmd_val(pmd) &= _SEGMENT_ENTRY_ORIGIN_LARGE |
1426 _SEGMENT_ENTRY_DIRTY | _SEGMENT_ENTRY_YOUNG | 1419 _SEGMENT_ENTRY_DIRTY | _SEGMENT_ENTRY_YOUNG |
1427 _SEGMENT_ENTRY_LARGE | _SEGMENT_ENTRY_SPLIT | 1420 _SEGMENT_ENTRY_LARGE | _SEGMENT_ENTRY_SOFT_DIRTY;
1428 _SEGMENT_ENTRY_SOFT_DIRTY;
1429 pmd_val(pmd) |= massage_pgprot_pmd(newprot); 1421 pmd_val(pmd) |= massage_pgprot_pmd(newprot);
1430 if (!(pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY)) 1422 if (!(pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY))
1431 pmd_val(pmd) |= _SEGMENT_ENTRY_PROTECT; 1423 pmd_val(pmd) |= _SEGMENT_ENTRY_PROTECT;
@@ -1533,12 +1525,6 @@ extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
1533#define __HAVE_ARCH_PGTABLE_WITHDRAW 1525#define __HAVE_ARCH_PGTABLE_WITHDRAW
1534extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); 1526extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
1535 1527
1536static inline int pmd_trans_splitting(pmd_t pmd)
1537{
1538 return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) &&
1539 (pmd_val(pmd) & _SEGMENT_ENTRY_SPLIT);
1540}
1541
1542static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, 1528static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
1543 pmd_t *pmdp, pmd_t entry) 1529 pmd_t *pmdp, pmd_t entry)
1544{ 1530{
diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c
index 21c74a71e2ab..13dab0c1645c 100644
--- a/arch/s390/mm/gup.c
+++ b/arch/s390/mm/gup.c
@@ -55,7 +55,7 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
55 unsigned long end, int write, struct page **pages, int *nr) 55 unsigned long end, int write, struct page **pages, int *nr)
56{ 56{
57 unsigned long mask, result; 57 unsigned long mask, result;
58 struct page *head, *page, *tail; 58 struct page *head, *page;
59 int refs; 59 int refs;
60 60
61 result = write ? 0 : _SEGMENT_ENTRY_PROTECT; 61 result = write ? 0 : _SEGMENT_ENTRY_PROTECT;
@@ -67,7 +67,6 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
67 refs = 0; 67 refs = 0;
68 head = pmd_page(pmd); 68 head = pmd_page(pmd);
69 page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); 69 page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
70 tail = page;
71 do { 70 do {
72 VM_BUG_ON(compound_head(page) != head); 71 VM_BUG_ON(compound_head(page) != head);
73 pages[*nr] = page; 72 pages[*nr] = page;
@@ -88,16 +87,6 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
88 return 0; 87 return 0;
89 } 88 }
90 89
91 /*
92 * Any tail page need their mapcount reference taken before we
93 * return.
94 */
95 while (refs--) {
96 if (PageTail(tail))
97 get_huge_page_tail(tail);
98 tail++;
99 }
100
101 return 1; 90 return 1;
102} 91}
103 92
@@ -116,16 +105,7 @@ static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,
116 pmd = *pmdp; 105 pmd = *pmdp;
117 barrier(); 106 barrier();
118 next = pmd_addr_end(addr, end); 107 next = pmd_addr_end(addr, end);
119 /* 108 if (pmd_none(pmd))
120 * The pmd_trans_splitting() check below explains why
121 * pmdp_splitting_flush() has to serialize with
122 * smp_call_function() against our disabled IRQs, to stop
123 * this gup-fast code from running while we set the
124 * splitting bit in the pmd. Returning zero will take
125 * the slow path that will call wait_split_huge_page()
126 * if the pmd is still in splitting state.
127 */
128 if (pmd_none(pmd) || pmd_trans_splitting(pmd))
129 return 0; 109 return 0;
130 if (unlikely(pmd_large(pmd))) { 110 if (unlikely(pmd_large(pmd))) {
131 /* 111 /*
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index aa34af0a0b26..a809fa8e6f8b 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -578,17 +578,29 @@ int gmap_fault(struct gmap *gmap, unsigned long gaddr,
578{ 578{
579 unsigned long vmaddr; 579 unsigned long vmaddr;
580 int rc; 580 int rc;
581 bool unlocked;
581 582
582 down_read(&gmap->mm->mmap_sem); 583 down_read(&gmap->mm->mmap_sem);
584
585retry:
586 unlocked = false;
583 vmaddr = __gmap_translate(gmap, gaddr); 587 vmaddr = __gmap_translate(gmap, gaddr);
584 if (IS_ERR_VALUE(vmaddr)) { 588 if (IS_ERR_VALUE(vmaddr)) {
585 rc = vmaddr; 589 rc = vmaddr;
586 goto out_up; 590 goto out_up;
587 } 591 }
588 if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags)) { 592 if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags,
593 &unlocked)) {
589 rc = -EFAULT; 594 rc = -EFAULT;
590 goto out_up; 595 goto out_up;
591 } 596 }
597 /*
598 * In the case that fixup_user_fault unlocked the mmap_sem during
599 * faultin redo __gmap_translate to not race with a map/unmap_segment.
600 */
601 if (unlocked)
602 goto retry;
603
592 rc = __gmap_link(gmap, gaddr, vmaddr); 604 rc = __gmap_link(gmap, gaddr, vmaddr);
593out_up: 605out_up:
594 up_read(&gmap->mm->mmap_sem); 606 up_read(&gmap->mm->mmap_sem);
@@ -714,12 +726,14 @@ int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
714 spinlock_t *ptl; 726 spinlock_t *ptl;
715 pte_t *ptep, entry; 727 pte_t *ptep, entry;
716 pgste_t pgste; 728 pgste_t pgste;
729 bool unlocked;
717 int rc = 0; 730 int rc = 0;
718 731
719 if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK)) 732 if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK))
720 return -EINVAL; 733 return -EINVAL;
721 down_read(&gmap->mm->mmap_sem); 734 down_read(&gmap->mm->mmap_sem);
722 while (len) { 735 while (len) {
736 unlocked = false;
723 /* Convert gmap address and connect the page tables */ 737 /* Convert gmap address and connect the page tables */
724 addr = __gmap_translate(gmap, gaddr); 738 addr = __gmap_translate(gmap, gaddr);
725 if (IS_ERR_VALUE(addr)) { 739 if (IS_ERR_VALUE(addr)) {
@@ -727,10 +741,14 @@ int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
727 break; 741 break;
728 } 742 }
729 /* Get the page mapped */ 743 /* Get the page mapped */
730 if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) { 744 if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE,
745 &unlocked)) {
731 rc = -EFAULT; 746 rc = -EFAULT;
732 break; 747 break;
733 } 748 }
749 /* While trying to map mmap_sem got unlocked. Let us retry */
750 if (unlocked)
751 continue;
734 rc = __gmap_link(gmap, gaddr, addr); 752 rc = __gmap_link(gmap, gaddr, addr);
735 if (rc) 753 if (rc)
736 break; 754 break;
@@ -791,9 +809,11 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
791 spinlock_t *ptl; 809 spinlock_t *ptl;
792 pgste_t old, new; 810 pgste_t old, new;
793 pte_t *ptep; 811 pte_t *ptep;
812 bool unlocked;
794 813
795 down_read(&mm->mmap_sem); 814 down_read(&mm->mmap_sem);
796retry: 815retry:
816 unlocked = false;
797 ptep = get_locked_pte(mm, addr, &ptl); 817 ptep = get_locked_pte(mm, addr, &ptl);
798 if (unlikely(!ptep)) { 818 if (unlikely(!ptep)) {
799 up_read(&mm->mmap_sem); 819 up_read(&mm->mmap_sem);
@@ -802,7 +822,12 @@ retry:
802 if (!(pte_val(*ptep) & _PAGE_INVALID) && 822 if (!(pte_val(*ptep) & _PAGE_INVALID) &&
803 (pte_val(*ptep) & _PAGE_PROTECT)) { 823 (pte_val(*ptep) & _PAGE_PROTECT)) {
804 pte_unmap_unlock(ptep, ptl); 824 pte_unmap_unlock(ptep, ptl);
805 if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE)) { 825 /*
826 * We do not really care about unlocked. We will retry either
827 * way. But this allows fixup_user_fault to enable userfaultfd.
828 */
829 if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE,
830 &unlocked)) {
806 up_read(&mm->mmap_sem); 831 up_read(&mm->mmap_sem);
807 return -EFAULT; 832 return -EFAULT;
808 } 833 }
@@ -1305,22 +1330,6 @@ int pmdp_set_access_flags(struct vm_area_struct *vma,
1305 return 1; 1330 return 1;
1306} 1331}
1307 1332
1308static void pmdp_splitting_flush_sync(void *arg)
1309{
1310 /* Simply deliver the interrupt */
1311}
1312
1313void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
1314 pmd_t *pmdp)
1315{
1316 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1317 if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
1318 (unsigned long *) pmdp)) {
1319 /* need to serialize against gup-fast (IRQ disabled) */
1320 smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
1321 }
1322}
1323
1324void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, 1333void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
1325 pgtable_t pgtable) 1334 pgtable_t pgtable)
1326{ 1335{
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index d514df7e04dd..6c391a5d3e5c 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -130,9 +130,6 @@ config STACKTRACE_SUPPORT
130config LOCKDEP_SUPPORT 130config LOCKDEP_SUPPORT
131 def_bool y 131 def_bool y
132 132
133config HAVE_LATENCYTOP_SUPPORT
134 def_bool y
135
136config ARCH_HAS_ILOG2_U32 133config ARCH_HAS_ILOG2_U32
137 def_bool n 134 def_bool n
138 135
diff --git a/arch/sh/mm/cache-sh4.c b/arch/sh/mm/cache-sh4.c
index 51d8f7f31d1d..58aaa4f33b81 100644
--- a/arch/sh/mm/cache-sh4.c
+++ b/arch/sh/mm/cache-sh4.c
@@ -241,7 +241,7 @@ static void sh4_flush_cache_page(void *args)
241 */ 241 */
242 map_coherent = (current_cpu_data.dcache.n_aliases && 242 map_coherent = (current_cpu_data.dcache.n_aliases &&
243 test_bit(PG_dcache_clean, &page->flags) && 243 test_bit(PG_dcache_clean, &page->flags) &&
244 page_mapped(page)); 244 page_mapcount(page));
245 if (map_coherent) 245 if (map_coherent)
246 vaddr = kmap_coherent(page, address); 246 vaddr = kmap_coherent(page, address);
247 else 247 else
diff --git a/arch/sh/mm/cache.c b/arch/sh/mm/cache.c
index f770e3992620..e58cfbf45150 100644
--- a/arch/sh/mm/cache.c
+++ b/arch/sh/mm/cache.c
@@ -59,7 +59,7 @@ void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
59 unsigned long vaddr, void *dst, const void *src, 59 unsigned long vaddr, void *dst, const void *src,
60 unsigned long len) 60 unsigned long len)
61{ 61{
62 if (boot_cpu_data.dcache.n_aliases && page_mapped(page) && 62 if (boot_cpu_data.dcache.n_aliases && page_mapcount(page) &&
63 test_bit(PG_dcache_clean, &page->flags)) { 63 test_bit(PG_dcache_clean, &page->flags)) {
64 void *vto = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK); 64 void *vto = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
65 memcpy(vto, src, len); 65 memcpy(vto, src, len);
@@ -78,7 +78,7 @@ void copy_from_user_page(struct vm_area_struct *vma, struct page *page,
78 unsigned long vaddr, void *dst, const void *src, 78 unsigned long vaddr, void *dst, const void *src,
79 unsigned long len) 79 unsigned long len)
80{ 80{
81 if (boot_cpu_data.dcache.n_aliases && page_mapped(page) && 81 if (boot_cpu_data.dcache.n_aliases && page_mapcount(page) &&
82 test_bit(PG_dcache_clean, &page->flags)) { 82 test_bit(PG_dcache_clean, &page->flags)) {
83 void *vfrom = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK); 83 void *vfrom = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
84 memcpy(dst, vfrom, len); 84 memcpy(dst, vfrom, len);
@@ -97,7 +97,7 @@ void copy_user_highpage(struct page *to, struct page *from,
97 97
98 vto = kmap_atomic(to); 98 vto = kmap_atomic(to);
99 99
100 if (boot_cpu_data.dcache.n_aliases && page_mapped(from) && 100 if (boot_cpu_data.dcache.n_aliases && page_mapcount(from) &&
101 test_bit(PG_dcache_clean, &from->flags)) { 101 test_bit(PG_dcache_clean, &from->flags)) {
102 vfrom = kmap_coherent(from, vaddr); 102 vfrom = kmap_coherent(from, vaddr);
103 copy_page(vto, vfrom); 103 copy_page(vto, vfrom);
@@ -153,7 +153,7 @@ void __flush_anon_page(struct page *page, unsigned long vmaddr)
153 unsigned long addr = (unsigned long) page_address(page); 153 unsigned long addr = (unsigned long) page_address(page);
154 154
155 if (pages_do_alias(addr, vmaddr)) { 155 if (pages_do_alias(addr, vmaddr)) {
156 if (boot_cpu_data.dcache.n_aliases && page_mapped(page) && 156 if (boot_cpu_data.dcache.n_aliases && page_mapcount(page) &&
157 test_bit(PG_dcache_clean, &page->flags)) { 157 test_bit(PG_dcache_clean, &page->flags)) {
158 void *kaddr; 158 void *kaddr;
159 159
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 56442d2d7bbc..3203e42190dd 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -101,10 +101,6 @@ config LOCKDEP_SUPPORT
101 bool 101 bool
102 default y if SPARC64 102 default y if SPARC64
103 103
104config HAVE_LATENCYTOP_SUPPORT
105 bool
106 default y if SPARC64
107
108config ARCH_HIBERNATION_POSSIBLE 104config ARCH_HIBERNATION_POSSIBLE
109 def_bool y if SPARC64 105 def_bool y if SPARC64
110 106
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 131d36fcd07a..7a38d6a576c5 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -681,13 +681,6 @@ static inline unsigned long pmd_trans_huge(pmd_t pmd)
681 return pte_val(pte) & _PAGE_PMD_HUGE; 681 return pte_val(pte) & _PAGE_PMD_HUGE;
682} 682}
683 683
684static inline unsigned long pmd_trans_splitting(pmd_t pmd)
685{
686 pte_t pte = __pte(pmd_val(pmd));
687
688 return pmd_trans_huge(pmd) && pte_special(pte);
689}
690
691#define has_transparent_hugepage() 1 684#define has_transparent_hugepage() 1
692 685
693static inline pmd_t pmd_mkold(pmd_t pmd) 686static inline pmd_t pmd_mkold(pmd_t pmd)
@@ -717,29 +710,29 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd)
717 return __pmd(pte_val(pte)); 710 return __pmd(pte_val(pte));
718} 711}
719 712
720static inline pmd_t pmd_mkyoung(pmd_t pmd) 713static inline pmd_t pmd_mkclean(pmd_t pmd)
721{ 714{
722 pte_t pte = __pte(pmd_val(pmd)); 715 pte_t pte = __pte(pmd_val(pmd));
723 716
724 pte = pte_mkyoung(pte); 717 pte = pte_mkclean(pte);
725 718
726 return __pmd(pte_val(pte)); 719 return __pmd(pte_val(pte));
727} 720}
728 721
729static inline pmd_t pmd_mkwrite(pmd_t pmd) 722static inline pmd_t pmd_mkyoung(pmd_t pmd)
730{ 723{
731 pte_t pte = __pte(pmd_val(pmd)); 724 pte_t pte = __pte(pmd_val(pmd));
732 725
733 pte = pte_mkwrite(pte); 726 pte = pte_mkyoung(pte);
734 727
735 return __pmd(pte_val(pte)); 728 return __pmd(pte_val(pte));
736} 729}
737 730
738static inline pmd_t pmd_mksplitting(pmd_t pmd) 731static inline pmd_t pmd_mkwrite(pmd_t pmd)
739{ 732{
740 pte_t pte = __pte(pmd_val(pmd)); 733 pte_t pte = __pte(pmd_val(pmd));
741 734
742 pte = pte_mkspecial(pte); 735 pte = pte_mkwrite(pte);
743 736
744 return __pmd(pte_val(pte)); 737 return __pmd(pte_val(pte));
745} 738}
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
index dbabe5713a15..cb841a33da59 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -113,9 +113,6 @@ static unsigned int get_user_insn(unsigned long tpc)
113 113
114#ifdef CONFIG_TRANSPARENT_HUGEPAGE 114#ifdef CONFIG_TRANSPARENT_HUGEPAGE
115 if (pmd_trans_huge(*pmdp)) { 115 if (pmd_trans_huge(*pmdp)) {
116 if (pmd_trans_splitting(*pmdp))
117 goto out_irq_enable;
118
119 pa = pmd_pfn(*pmdp) << PAGE_SHIFT; 116 pa = pmd_pfn(*pmdp) << PAGE_SHIFT;
120 pa += tpc & ~HPAGE_MASK; 117 pa += tpc & ~HPAGE_MASK;
121 118
diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c
index 2e5c4fc2daa9..eb3d8e8ebc6b 100644
--- a/arch/sparc/mm/gup.c
+++ b/arch/sparc/mm/gup.c
@@ -56,8 +56,6 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
56 put_page(head); 56 put_page(head);
57 return 0; 57 return 0;
58 } 58 }
59 if (head != page)
60 get_huge_page_tail(page);
61 59
62 pages[*nr] = page; 60 pages[*nr] = page;
63 (*nr)++; 61 (*nr)++;
@@ -70,7 +68,7 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
70 unsigned long end, int write, struct page **pages, 68 unsigned long end, int write, struct page **pages,
71 int *nr) 69 int *nr)
72{ 70{
73 struct page *head, *page, *tail; 71 struct page *head, *page;
74 int refs; 72 int refs;
75 73
76 if (!(pmd_val(pmd) & _PAGE_VALID)) 74 if (!(pmd_val(pmd) & _PAGE_VALID))
@@ -82,7 +80,6 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
82 refs = 0; 80 refs = 0;
83 head = pmd_page(pmd); 81 head = pmd_page(pmd);
84 page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); 82 page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
85 tail = page;
86 do { 83 do {
87 VM_BUG_ON(compound_head(page) != head); 84 VM_BUG_ON(compound_head(page) != head);
88 pages[*nr] = page; 85 pages[*nr] = page;
@@ -103,15 +100,6 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
103 return 0; 100 return 0;
104 } 101 }
105 102
106 /* Any tail page need their mapcount reference taken before we
107 * return.
108 */
109 while (refs--) {
110 if (PageTail(tail))
111 get_huge_page_tail(tail);
112 tail++;
113 }
114
115 return 1; 103 return 1;
116} 104}
117 105
@@ -126,7 +114,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
126 pmd_t pmd = *pmdp; 114 pmd_t pmd = *pmdp;
127 115
128 next = pmd_addr_end(addr, end); 116 next = pmd_addr_end(addr, end);
129 if (pmd_none(pmd) || pmd_trans_splitting(pmd)) 117 if (pmd_none(pmd))
130 return 0; 118 return 0;
131 if (unlikely(pmd_large(pmd))) { 119 if (unlikely(pmd_large(pmd))) {
132 if (!gup_huge_pmd(pmdp, pmd, addr, next, 120 if (!gup_huge_pmd(pmdp, pmd, addr, next,
diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h
index 2b05ccbebed9..96cecf55522e 100644
--- a/arch/tile/include/asm/pgtable.h
+++ b/arch/tile/include/asm/pgtable.h
@@ -489,16 +489,6 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
489#ifdef CONFIG_TRANSPARENT_HUGEPAGE 489#ifdef CONFIG_TRANSPARENT_HUGEPAGE
490#define has_transparent_hugepage() 1 490#define has_transparent_hugepage() 1
491#define pmd_trans_huge pmd_huge_page 491#define pmd_trans_huge pmd_huge_page
492
493static inline pmd_t pmd_mksplitting(pmd_t pmd)
494{
495 return pte_pmd(hv_pte_set_client2(pmd_pte(pmd)));
496}
497
498static inline int pmd_trans_splitting(pmd_t pmd)
499{
500 return hv_pte_get_client2(pmd_pte(pmd));
501}
502#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 492#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
503 493
504/* 494/*
diff --git a/arch/um/include/asm/page.h b/arch/um/include/asm/page.h
index 71c5d132062a..e13d41c392ae 100644
--- a/arch/um/include/asm/page.h
+++ b/arch/um/include/asm/page.h
@@ -18,6 +18,7 @@
18 18
19struct page; 19struct page;
20 20
21#include <linux/pfn.h>
21#include <linux/types.h> 22#include <linux/types.h>
22#include <asm/vm-flags.h> 23#include <asm/vm-flags.h>
23 24
@@ -52,7 +53,6 @@ typedef struct { unsigned long pgd; } pgd_t;
52#define pmd_val(x) ((x).pmd) 53#define pmd_val(x) ((x).pmd)
53#define __pmd(x) ((pmd_t) { (x) } ) 54#define __pmd(x) ((pmd_t) { (x) } )
54 55
55typedef unsigned long long pfn_t;
56typedef unsigned long long phys_t; 56typedef unsigned long long phys_t;
57 57
58#else 58#else
@@ -76,7 +76,6 @@ typedef struct { unsigned long pmd; } pmd_t;
76#define pte_is_zero(p) (!((p).pte & ~_PAGE_NEWPAGE)) 76#define pte_is_zero(p) (!((p).pte & ~_PAGE_NEWPAGE))
77#define pte_set_val(p, phys, prot) (p).pte = (phys | pgprot_val(prot)) 77#define pte_set_val(p, phys, prot) (p).pte = (phys | pgprot_val(prot))
78 78
79typedef unsigned long pfn_t;
80typedef unsigned long phys_t; 79typedef unsigned long phys_t;
81 80
82#endif 81#endif
@@ -109,8 +108,8 @@ extern unsigned long uml_physmem;
109#define __pa(virt) to_phys((void *) (unsigned long) (virt)) 108#define __pa(virt) to_phys((void *) (unsigned long) (virt))
110#define __va(phys) to_virt((unsigned long) (phys)) 109#define __va(phys) to_virt((unsigned long) (phys))
111 110
112#define phys_to_pfn(p) ((pfn_t) ((p) >> PAGE_SHIFT)) 111#define phys_to_pfn(p) ((p) >> PAGE_SHIFT)
113#define pfn_to_phys(pfn) ((phys_t) ((pfn) << PAGE_SHIFT)) 112#define pfn_to_phys(pfn) PFN_PHYS(pfn)
114 113
115#define pfn_valid(pfn) ((pfn) < max_mapnr) 114#define pfn_valid(pfn) ((pfn) < max_mapnr)
116#define virt_addr_valid(v) pfn_valid(phys_to_pfn(__pa(v))) 115#define virt_addr_valid(v) pfn_valid(phys_to_pfn(__pa(v)))
diff --git a/arch/um/include/asm/pgtable-3level.h b/arch/um/include/asm/pgtable-3level.h
index 2b4274e7c095..bae8523a162f 100644
--- a/arch/um/include/asm/pgtable-3level.h
+++ b/arch/um/include/asm/pgtable-3level.h
@@ -98,7 +98,7 @@ static inline unsigned long pte_pfn(pte_t pte)
98 return phys_to_pfn(pte_val(pte)); 98 return phys_to_pfn(pte_val(pte));
99} 99}
100 100
101static inline pte_t pfn_pte(pfn_t page_nr, pgprot_t pgprot) 101static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
102{ 102{
103 pte_t pte; 103 pte_t pte;
104 phys_t phys = pfn_to_phys(page_nr); 104 phys_t phys = pfn_to_phys(page_nr);
@@ -107,7 +107,7 @@ static inline pte_t pfn_pte(pfn_t page_nr, pgprot_t pgprot)
107 return pte; 107 return pte;
108} 108}
109 109
110static inline pmd_t pfn_pmd(pfn_t page_nr, pgprot_t pgprot) 110static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
111{ 111{
112 return __pmd((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)); 112 return __pmd((page_nr << PAGE_SHIFT) | pgprot_val(pgprot));
113} 113}
diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h
index 18eb9924dda3..7485398d0737 100644
--- a/arch/um/include/asm/pgtable.h
+++ b/arch/um/include/asm/pgtable.h
@@ -271,7 +271,7 @@ static inline int pte_same(pte_t pte_a, pte_t pte_b)
271 271
272#define phys_to_page(phys) pfn_to_page(phys_to_pfn(phys)) 272#define phys_to_page(phys) pfn_to_page(phys_to_pfn(phys))
273#define __virt_to_page(virt) phys_to_page(__pa(virt)) 273#define __virt_to_page(virt) phys_to_page(__pa(virt))
274#define page_to_phys(page) pfn_to_phys((pfn_t) page_to_pfn(page)) 274#define page_to_phys(page) pfn_to_phys(page_to_pfn(page))
275#define virt_to_page(addr) __virt_to_page((const unsigned long) addr) 275#define virt_to_page(addr) __virt_to_page((const unsigned long) addr)
276 276
277#define mk_pte(page, pgprot) \ 277#define mk_pte(page, pgprot) \
diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig
index 5dc4c0a43ccd..877342640b6e 100644
--- a/arch/unicore32/Kconfig
+++ b/arch/unicore32/Kconfig
@@ -34,9 +34,6 @@ config NO_IOPORT_MAP
34config STACKTRACE_SUPPORT 34config STACKTRACE_SUPPORT
35 def_bool y 35 def_bool y
36 36
37config HAVE_LATENCYTOP_SUPPORT
38 def_bool y
39
40config LOCKDEP_SUPPORT 37config LOCKDEP_SUPPORT
41 def_bool y 38 def_bool y
42 39
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 24f362bf3ec6..4a10ba9e95da 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -180,9 +180,6 @@ config LOCKDEP_SUPPORT
180config STACKTRACE_SUPPORT 180config STACKTRACE_SUPPORT
181 def_bool y 181 def_bool y
182 182
183config HAVE_LATENCYTOP_SUPPORT
184 def_bool y
185
186config MMU 183config MMU
187 def_bool y 184 def_bool y
188 185
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index d3eee663c41f..0687c4748b8f 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -162,20 +162,22 @@ static inline int pmd_large(pmd_t pte)
162} 162}
163 163
164#ifdef CONFIG_TRANSPARENT_HUGEPAGE 164#ifdef CONFIG_TRANSPARENT_HUGEPAGE
165static inline int pmd_trans_splitting(pmd_t pmd)
166{
167 return pmd_val(pmd) & _PAGE_SPLITTING;
168}
169
170static inline int pmd_trans_huge(pmd_t pmd) 165static inline int pmd_trans_huge(pmd_t pmd)
171{ 166{
172 return pmd_val(pmd) & _PAGE_PSE; 167 return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE;
173} 168}
174 169
175static inline int has_transparent_hugepage(void) 170static inline int has_transparent_hugepage(void)
176{ 171{
177 return cpu_has_pse; 172 return cpu_has_pse;
178} 173}
174
175#ifdef __HAVE_ARCH_PTE_DEVMAP
176static inline int pmd_devmap(pmd_t pmd)
177{
178 return !!(pmd_val(pmd) & _PAGE_DEVMAP);
179}
180#endif
179#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 181#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
180 182
181static inline pte_t pte_set_flags(pte_t pte, pteval_t set) 183static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
@@ -252,6 +254,11 @@ static inline pte_t pte_mkspecial(pte_t pte)
252 return pte_set_flags(pte, _PAGE_SPECIAL); 254 return pte_set_flags(pte, _PAGE_SPECIAL);
253} 255}
254 256
257static inline pte_t pte_mkdevmap(pte_t pte)
258{
259 return pte_set_flags(pte, _PAGE_SPECIAL|_PAGE_DEVMAP);
260}
261
255static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) 262static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
256{ 263{
257 pmdval_t v = native_pmd_val(pmd); 264 pmdval_t v = native_pmd_val(pmd);
@@ -271,6 +278,11 @@ static inline pmd_t pmd_mkold(pmd_t pmd)
271 return pmd_clear_flags(pmd, _PAGE_ACCESSED); 278 return pmd_clear_flags(pmd, _PAGE_ACCESSED);
272} 279}
273 280
281static inline pmd_t pmd_mkclean(pmd_t pmd)
282{
283 return pmd_clear_flags(pmd, _PAGE_DIRTY);
284}
285
274static inline pmd_t pmd_wrprotect(pmd_t pmd) 286static inline pmd_t pmd_wrprotect(pmd_t pmd)
275{ 287{
276 return pmd_clear_flags(pmd, _PAGE_RW); 288 return pmd_clear_flags(pmd, _PAGE_RW);
@@ -281,6 +293,11 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd)
281 return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); 293 return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
282} 294}
283 295
296static inline pmd_t pmd_mkdevmap(pmd_t pmd)
297{
298 return pmd_set_flags(pmd, _PAGE_DEVMAP);
299}
300
284static inline pmd_t pmd_mkhuge(pmd_t pmd) 301static inline pmd_t pmd_mkhuge(pmd_t pmd)
285{ 302{
286 return pmd_set_flags(pmd, _PAGE_PSE); 303 return pmd_set_flags(pmd, _PAGE_PSE);
@@ -462,6 +479,13 @@ static inline int pte_present(pte_t a)
462 return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); 479 return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
463} 480}
464 481
482#ifdef __HAVE_ARCH_PTE_DEVMAP
483static inline int pte_devmap(pte_t a)
484{
485 return (pte_flags(a) & _PAGE_DEVMAP) == _PAGE_DEVMAP;
486}
487#endif
488
465#define pte_accessible pte_accessible 489#define pte_accessible pte_accessible
466static inline bool pte_accessible(struct mm_struct *mm, pte_t a) 490static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
467{ 491{
@@ -808,10 +832,6 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
808 unsigned long address, pmd_t *pmdp); 832 unsigned long address, pmd_t *pmdp);
809 833
810 834
811#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
812extern void pmdp_splitting_flush(struct vm_area_struct *vma,
813 unsigned long addr, pmd_t *pmdp);
814
815#define __HAVE_ARCH_PMD_WRITE 835#define __HAVE_ARCH_PMD_WRITE
816static inline int pmd_write(pmd_t pmd) 836static inline int pmd_write(pmd_t pmd)
817{ 837{
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index a471cadb9630..04c27a013165 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -22,10 +22,11 @@
22#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ 22#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
23#define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1 23#define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1
24#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 24#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
25#define _PAGE_BIT_SPLITTING _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */
26#define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */ 25#define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */
27#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ 26#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
28#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ 27#define _PAGE_BIT_SOFTW4 58 /* available for programmer */
28#define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4
29#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
29 30
30/* If _PAGE_BIT_PRESENT is clear, we use these: */ 31/* If _PAGE_BIT_PRESENT is clear, we use these: */
31/* - if the user mapped it with PROT_NONE; pte_present gives true */ 32/* - if the user mapped it with PROT_NONE; pte_present gives true */
@@ -46,7 +47,6 @@
46#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) 47#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
47#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) 48#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
48#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) 49#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
49#define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING)
50#define __HAVE_ARCH_PTE_SPECIAL 50#define __HAVE_ARCH_PTE_SPECIAL
51 51
52#ifdef CONFIG_KMEMCHECK 52#ifdef CONFIG_KMEMCHECK
@@ -85,8 +85,11 @@
85 85
86#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) 86#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
87#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) 87#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
88#define _PAGE_DEVMAP (_AT(u64, 1) << _PAGE_BIT_DEVMAP)
89#define __HAVE_ARCH_PTE_DEVMAP
88#else 90#else
89#define _PAGE_NX (_AT(pteval_t, 0)) 91#define _PAGE_NX (_AT(pteval_t, 0))
92#define _PAGE_DEVMAP (_AT(pteval_t, 0))
90#endif 93#endif
91 94
92#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) 95#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h
index d8ce3ec816ab..1544fabcd7f9 100644
--- a/arch/x86/include/asm/pmem.h
+++ b/arch/x86/include/asm/pmem.h
@@ -132,12 +132,7 @@ static inline void arch_clear_pmem(void __pmem *addr, size_t size)
132{ 132{
133 void *vaddr = (void __force *)addr; 133 void *vaddr = (void __force *)addr;
134 134
135 /* TODO: implement the zeroing via non-temporal writes */ 135 memset(vaddr, 0, size);
136 if (size == PAGE_SIZE && ((unsigned long)vaddr & ~PAGE_MASK) == 0)
137 clear_page(vaddr);
138 else
139 memset(vaddr, 0, size);
140
141 __arch_wb_cache_pmem(vaddr, size); 136 __arch_wb_cache_pmem(vaddr, size);
142} 137}
143 138
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 483231ebbb0b..e574b8546518 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -175,7 +175,11 @@ static void mark_screen_rdonly(struct mm_struct *mm)
175 if (pud_none_or_clear_bad(pud)) 175 if (pud_none_or_clear_bad(pud))
176 goto out; 176 goto out;
177 pmd = pmd_offset(pud, 0xA0000); 177 pmd = pmd_offset(pud, 0xA0000);
178 split_huge_page_pmd_mm(mm, 0xA0000, pmd); 178
179 if (pmd_trans_huge(*pmd)) {
180 struct vm_area_struct *vma = find_vma(mm, 0xA0000);
181 split_huge_pmd(vma, pmd, 0xA0000);
182 }
179 if (pmd_none_or_clear_bad(pmd)) 183 if (pmd_none_or_clear_bad(pmd))
180 goto out; 184 goto out;
181 pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); 185 pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c
index 5c520ebf6343..a22a488b4622 100644
--- a/arch/x86/kvm/iommu.c
+++ b/arch/x86/kvm/iommu.c
@@ -43,11 +43,11 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm);
43static void kvm_iommu_put_pages(struct kvm *kvm, 43static void kvm_iommu_put_pages(struct kvm *kvm,
44 gfn_t base_gfn, unsigned long npages); 44 gfn_t base_gfn, unsigned long npages);
45 45
46static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn, 46static kvm_pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
47 unsigned long npages) 47 unsigned long npages)
48{ 48{
49 gfn_t end_gfn; 49 gfn_t end_gfn;
50 pfn_t pfn; 50 kvm_pfn_t pfn;
51 51
52 pfn = gfn_to_pfn_memslot(slot, gfn); 52 pfn = gfn_to_pfn_memslot(slot, gfn);
53 end_gfn = gfn + npages; 53 end_gfn = gfn + npages;
@@ -62,7 +62,8 @@ static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
62 return pfn; 62 return pfn;
63} 63}
64 64
65static void kvm_unpin_pages(struct kvm *kvm, pfn_t pfn, unsigned long npages) 65static void kvm_unpin_pages(struct kvm *kvm, kvm_pfn_t pfn,
66 unsigned long npages)
66{ 67{
67 unsigned long i; 68 unsigned long i;
68 69
@@ -73,7 +74,7 @@ static void kvm_unpin_pages(struct kvm *kvm, pfn_t pfn, unsigned long npages)
73int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) 74int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
74{ 75{
75 gfn_t gfn, end_gfn; 76 gfn_t gfn, end_gfn;
76 pfn_t pfn; 77 kvm_pfn_t pfn;
77 int r = 0; 78 int r = 0;
78 struct iommu_domain *domain = kvm->arch.iommu_domain; 79 struct iommu_domain *domain = kvm->arch.iommu_domain;
79 int flags; 80 int flags;
@@ -275,7 +276,7 @@ static void kvm_iommu_put_pages(struct kvm *kvm,
275{ 276{
276 struct iommu_domain *domain; 277 struct iommu_domain *domain;
277 gfn_t end_gfn, gfn; 278 gfn_t end_gfn, gfn;
278 pfn_t pfn; 279 kvm_pfn_t pfn;
279 u64 phys; 280 u64 phys;
280 281
281 domain = kvm->arch.iommu_domain; 282 domain = kvm->arch.iommu_domain;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 420a5ca3c0ee..95a955de5964 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -259,7 +259,7 @@ static unsigned get_mmio_spte_access(u64 spte)
259} 259}
260 260
261static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, 261static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
262 pfn_t pfn, unsigned access) 262 kvm_pfn_t pfn, unsigned access)
263{ 263{
264 if (unlikely(is_noslot_pfn(pfn))) { 264 if (unlikely(is_noslot_pfn(pfn))) {
265 mark_mmio_spte(vcpu, sptep, gfn, access); 265 mark_mmio_spte(vcpu, sptep, gfn, access);
@@ -320,7 +320,7 @@ static int is_last_spte(u64 pte, int level)
320 return 0; 320 return 0;
321} 321}
322 322
323static pfn_t spte_to_pfn(u64 pte) 323static kvm_pfn_t spte_to_pfn(u64 pte)
324{ 324{
325 return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; 325 return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
326} 326}
@@ -582,7 +582,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
582 */ 582 */
583static int mmu_spte_clear_track_bits(u64 *sptep) 583static int mmu_spte_clear_track_bits(u64 *sptep)
584{ 584{
585 pfn_t pfn; 585 kvm_pfn_t pfn;
586 u64 old_spte = *sptep; 586 u64 old_spte = *sptep;
587 587
588 if (!spte_has_volatile_bits(old_spte)) 588 if (!spte_has_volatile_bits(old_spte))
@@ -1372,7 +1372,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1372 int need_flush = 0; 1372 int need_flush = 0;
1373 u64 new_spte; 1373 u64 new_spte;
1374 pte_t *ptep = (pte_t *)data; 1374 pte_t *ptep = (pte_t *)data;
1375 pfn_t new_pfn; 1375 kvm_pfn_t new_pfn;
1376 1376
1377 WARN_ON(pte_huge(*ptep)); 1377 WARN_ON(pte_huge(*ptep));
1378 new_pfn = pte_pfn(*ptep); 1378 new_pfn = pte_pfn(*ptep);
@@ -2450,7 +2450,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
2450 return 0; 2450 return 0;
2451} 2451}
2452 2452
2453static bool kvm_is_mmio_pfn(pfn_t pfn) 2453static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
2454{ 2454{
2455 if (pfn_valid(pfn)) 2455 if (pfn_valid(pfn))
2456 return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)); 2456 return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn));
@@ -2460,7 +2460,7 @@ static bool kvm_is_mmio_pfn(pfn_t pfn)
2460 2460
2461static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, 2461static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2462 unsigned pte_access, int level, 2462 unsigned pte_access, int level,
2463 gfn_t gfn, pfn_t pfn, bool speculative, 2463 gfn_t gfn, kvm_pfn_t pfn, bool speculative,
2464 bool can_unsync, bool host_writable) 2464 bool can_unsync, bool host_writable)
2465{ 2465{
2466 u64 spte; 2466 u64 spte;
@@ -2539,7 +2539,7 @@ done:
2539} 2539}
2540 2540
2541static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, 2541static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
2542 int write_fault, int level, gfn_t gfn, pfn_t pfn, 2542 int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
2543 bool speculative, bool host_writable) 2543 bool speculative, bool host_writable)
2544{ 2544{
2545 int was_rmapped = 0; 2545 int was_rmapped = 0;
@@ -2602,7 +2602,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
2602 return emulate; 2602 return emulate;
2603} 2603}
2604 2604
2605static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, 2605static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2606 bool no_dirty_log) 2606 bool no_dirty_log)
2607{ 2607{
2608 struct kvm_memory_slot *slot; 2608 struct kvm_memory_slot *slot;
@@ -2684,7 +2684,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
2684} 2684}
2685 2685
2686static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable, 2686static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable,
2687 int level, gfn_t gfn, pfn_t pfn, bool prefault) 2687 int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault)
2688{ 2688{
2689 struct kvm_shadow_walk_iterator iterator; 2689 struct kvm_shadow_walk_iterator iterator;
2690 struct kvm_mmu_page *sp; 2690 struct kvm_mmu_page *sp;
@@ -2732,7 +2732,7 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
2732 send_sig_info(SIGBUS, &info, tsk); 2732 send_sig_info(SIGBUS, &info, tsk);
2733} 2733}
2734 2734
2735static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) 2735static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
2736{ 2736{
2737 /* 2737 /*
2738 * Do not cache the mmio info caused by writing the readonly gfn 2738 * Do not cache the mmio info caused by writing the readonly gfn
@@ -2752,9 +2752,10 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
2752} 2752}
2753 2753
2754static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, 2754static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
2755 gfn_t *gfnp, pfn_t *pfnp, int *levelp) 2755 gfn_t *gfnp, kvm_pfn_t *pfnp,
2756 int *levelp)
2756{ 2757{
2757 pfn_t pfn = *pfnp; 2758 kvm_pfn_t pfn = *pfnp;
2758 gfn_t gfn = *gfnp; 2759 gfn_t gfn = *gfnp;
2759 int level = *levelp; 2760 int level = *levelp;
2760 2761
@@ -2793,7 +2794,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
2793} 2794}
2794 2795
2795static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, 2796static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
2796 pfn_t pfn, unsigned access, int *ret_val) 2797 kvm_pfn_t pfn, unsigned access, int *ret_val)
2797{ 2798{
2798 bool ret = true; 2799 bool ret = true;
2799 2800
@@ -2947,7 +2948,7 @@ exit:
2947} 2948}
2948 2949
2949static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, 2950static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2950 gva_t gva, pfn_t *pfn, bool write, bool *writable); 2951 gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable);
2951static void make_mmu_pages_available(struct kvm_vcpu *vcpu); 2952static void make_mmu_pages_available(struct kvm_vcpu *vcpu);
2952 2953
2953static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, 2954static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
@@ -2956,7 +2957,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
2956 int r; 2957 int r;
2957 int level; 2958 int level;
2958 bool force_pt_level = false; 2959 bool force_pt_level = false;
2959 pfn_t pfn; 2960 kvm_pfn_t pfn;
2960 unsigned long mmu_seq; 2961 unsigned long mmu_seq;
2961 bool map_writable, write = error_code & PFERR_WRITE_MASK; 2962 bool map_writable, write = error_code & PFERR_WRITE_MASK;
2962 2963
@@ -3410,7 +3411,7 @@ static bool can_do_async_pf(struct kvm_vcpu *vcpu)
3410} 3411}
3411 3412
3412static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, 3413static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
3413 gva_t gva, pfn_t *pfn, bool write, bool *writable) 3414 gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
3414{ 3415{
3415 struct kvm_memory_slot *slot; 3416 struct kvm_memory_slot *slot;
3416 bool async; 3417 bool async;
@@ -3448,7 +3449,7 @@ check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
3448static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, 3449static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
3449 bool prefault) 3450 bool prefault)
3450{ 3451{
3451 pfn_t pfn; 3452 kvm_pfn_t pfn;
3452 int r; 3453 int r;
3453 int level; 3454 int level;
3454 bool force_pt_level; 3455 bool force_pt_level;
@@ -4601,7 +4602,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
4601 u64 *sptep; 4602 u64 *sptep;
4602 struct rmap_iterator iter; 4603 struct rmap_iterator iter;
4603 int need_tlb_flush = 0; 4604 int need_tlb_flush = 0;
4604 pfn_t pfn; 4605 kvm_pfn_t pfn;
4605 struct kvm_mmu_page *sp; 4606 struct kvm_mmu_page *sp;
4606 4607
4607restart: 4608restart:
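
All of the pfn_t -> kvm_pfn_t churn in the KVM hunks is a pure rename: the mm side of this series claims the pfn_t name for a flag-carrying type, so KVM's raw frame-number typedef moves out of the way. A simplified sketch of how the two types end up after the series (the real definitions live in include/linux/kvm_types.h and the new pfn_t headers):

        /* KVM: still just a number, only the name changes */
        typedef u64 kvm_pfn_t;

        /* mm: pfn plus flag bits (PFN_DEV, PFN_MAP, ...) in the high bits */
        typedef struct {
                u64 val;
        } pfn_t;

Keeping the flag-carrying type opaque pushes callers through helpers such as pfn_t_to_pfn()/pfn_t_to_phys() instead of open-coded shifts.
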
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 1cee3ec20dd2..dcce533d420c 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -97,7 +97,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
97{ 97{
98 struct kvm_mmu_page *sp; 98 struct kvm_mmu_page *sp;
99 gfn_t gfn; 99 gfn_t gfn;
100 pfn_t pfn; 100 kvm_pfn_t pfn;
101 hpa_t hpa; 101 hpa_t hpa;
102 102
103 sp = page_header(__pa(sptep)); 103 sp = page_header(__pa(sptep));
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 91e939b486d1..6c9fed957cce 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -456,7 +456,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
456{ 456{
457 unsigned pte_access; 457 unsigned pte_access;
458 gfn_t gfn; 458 gfn_t gfn;
459 pfn_t pfn; 459 kvm_pfn_t pfn;
460 460
461 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) 461 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
462 return false; 462 return false;
@@ -551,7 +551,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
551static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, 551static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
552 struct guest_walker *gw, 552 struct guest_walker *gw,
553 int write_fault, int hlevel, 553 int write_fault, int hlevel,
554 pfn_t pfn, bool map_writable, bool prefault) 554 kvm_pfn_t pfn, bool map_writable, bool prefault)
555{ 555{
556 struct kvm_mmu_page *sp = NULL; 556 struct kvm_mmu_page *sp = NULL;
557 struct kvm_shadow_walk_iterator it; 557 struct kvm_shadow_walk_iterator it;
@@ -694,7 +694,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
694 int user_fault = error_code & PFERR_USER_MASK; 694 int user_fault = error_code & PFERR_USER_MASK;
695 struct guest_walker walker; 695 struct guest_walker walker;
696 int r; 696 int r;
697 pfn_t pfn; 697 kvm_pfn_t pfn;
698 int level = PT_PAGE_TABLE_LEVEL; 698 int level = PT_PAGE_TABLE_LEVEL;
699 bool force_pt_level = false; 699 bool force_pt_level = false;
700 unsigned long mmu_seq; 700 unsigned long mmu_seq;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 04d61d496b14..e2951b6edbbc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4251,7 +4251,7 @@ out:
4251static int init_rmode_identity_map(struct kvm *kvm) 4251static int init_rmode_identity_map(struct kvm *kvm)
4252{ 4252{
4253 int i, idx, r = 0; 4253 int i, idx, r = 0;
4254 pfn_t identity_map_pfn; 4254 kvm_pfn_t identity_map_pfn;
4255 u32 tmp; 4255 u32 tmp;
4256 4256
4257 if (!enable_ept) 4257 if (!enable_ept)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f53f5b13c677..4244c2baf57d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5148,7 +5148,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
5148 int emulation_type) 5148 int emulation_type)
5149{ 5149{
5150 gpa_t gpa = cr2; 5150 gpa_t gpa = cr2;
5151 pfn_t pfn; 5151 kvm_pfn_t pfn;
5152 5152
5153 if (emulation_type & EMULTYPE_NO_REEXECUTE) 5153 if (emulation_type & EMULTYPE_NO_REEXECUTE)
5154 return false; 5154 return false;
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index ae9a37bf1371..6d5eb5900372 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -9,6 +9,7 @@
9#include <linux/vmstat.h> 9#include <linux/vmstat.h>
10#include <linux/highmem.h> 10#include <linux/highmem.h>
11#include <linux/swap.h> 11#include <linux/swap.h>
12#include <linux/memremap.h>
12 13
13#include <asm/pgtable.h> 14#include <asm/pgtable.h>
14 15
@@ -63,6 +64,16 @@ retry:
63#endif 64#endif
64} 65}
65 66
67static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
68{
69 while ((*nr) - nr_start) {
70 struct page *page = pages[--(*nr)];
71
72 ClearPageReferenced(page);
73 put_page(page);
74 }
75}
76
66/* 77/*
67 * The performance critical leaf functions are made noinline otherwise gcc 78 * The performance critical leaf functions are made noinline otherwise gcc
68 * inlines everything into a single function which results in too much 79 * inlines everything into a single function which results in too much
@@ -71,7 +82,9 @@ retry:
71static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, 82static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
72 unsigned long end, int write, struct page **pages, int *nr) 83 unsigned long end, int write, struct page **pages, int *nr)
73{ 84{
85 struct dev_pagemap *pgmap = NULL;
74 unsigned long mask; 86 unsigned long mask;
87 int nr_start = *nr;
75 pte_t *ptep; 88 pte_t *ptep;
76 89
77 mask = _PAGE_PRESENT|_PAGE_USER; 90 mask = _PAGE_PRESENT|_PAGE_USER;
@@ -89,13 +102,21 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
89 return 0; 102 return 0;
90 } 103 }
91 104
92 if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) { 105 page = pte_page(pte);
106 if (pte_devmap(pte)) {
107 pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
108 if (unlikely(!pgmap)) {
109 undo_dev_pagemap(nr, nr_start, pages);
110 pte_unmap(ptep);
111 return 0;
112 }
113 } else if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
93 pte_unmap(ptep); 114 pte_unmap(ptep);
94 return 0; 115 return 0;
95 } 116 }
96 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 117 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
97 page = pte_page(pte);
98 get_page(page); 118 get_page(page);
119 put_dev_pagemap(pgmap);
99 SetPageReferenced(page); 120 SetPageReferenced(page);
100 pages[*nr] = page; 121 pages[*nr] = page;
101 (*nr)++; 122 (*nr)++;
@@ -114,6 +135,32 @@ static inline void get_head_page_multiple(struct page *page, int nr)
114 SetPageReferenced(page); 135 SetPageReferenced(page);
115} 136}
116 137
138static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
139 unsigned long end, struct page **pages, int *nr)
140{
141 int nr_start = *nr;
142 unsigned long pfn = pmd_pfn(pmd);
143 struct dev_pagemap *pgmap = NULL;
144
145 pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
146 do {
147 struct page *page = pfn_to_page(pfn);
148
149 pgmap = get_dev_pagemap(pfn, pgmap);
150 if (unlikely(!pgmap)) {
151 undo_dev_pagemap(nr, nr_start, pages);
152 return 0;
153 }
154 SetPageReferenced(page);
155 pages[*nr] = page;
156 get_page(page);
157 put_dev_pagemap(pgmap);
158 (*nr)++;
159 pfn++;
160 } while (addr += PAGE_SIZE, addr != end);
161 return 1;
162}
163
117static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, 164static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
118 unsigned long end, int write, struct page **pages, int *nr) 165 unsigned long end, int write, struct page **pages, int *nr)
119{ 166{
@@ -126,9 +173,13 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
126 mask |= _PAGE_RW; 173 mask |= _PAGE_RW;
127 if ((pmd_flags(pmd) & mask) != mask) 174 if ((pmd_flags(pmd) & mask) != mask)
128 return 0; 175 return 0;
176
177 VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
178 if (pmd_devmap(pmd))
179 return __gup_device_huge_pmd(pmd, addr, end, pages, nr);
180
129 /* hugepages are never "special" */ 181 /* hugepages are never "special" */
130 VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL); 182 VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL);
131 VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
132 183
133 refs = 0; 184 refs = 0;
134 head = pmd_page(pmd); 185 head = pmd_page(pmd);
@@ -136,8 +187,6 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
136 do { 187 do {
137 VM_BUG_ON_PAGE(compound_head(page) != head, page); 188 VM_BUG_ON_PAGE(compound_head(page) != head, page);
138 pages[*nr] = page; 189 pages[*nr] = page;
139 if (PageTail(page))
140 get_huge_page_tail(page);
141 (*nr)++; 190 (*nr)++;
142 page++; 191 page++;
143 refs++; 192 refs++;
@@ -158,18 +207,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
158 pmd_t pmd = *pmdp; 207 pmd_t pmd = *pmdp;
159 208
160 next = pmd_addr_end(addr, end); 209 next = pmd_addr_end(addr, end);
161 /* 210 if (pmd_none(pmd))
162 * The pmd_trans_splitting() check below explains why
163 * pmdp_splitting_flush has to flush the tlb, to stop
164 * this gup-fast code from running while we set the
165 * splitting bit in the pmd. Returning zero will take
166 * the slow path that will call wait_split_huge_page()
167 * if the pmd is still in splitting state. gup-fast
168 * can't because it has irq disabled and
169 * wait_split_huge_page() would never return as the
170 * tlb flush IPI wouldn't run.
171 */
172 if (pmd_none(pmd) || pmd_trans_splitting(pmd))
173 return 0; 211 return 0;
174 if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) { 212 if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) {
175 /* 213 /*
@@ -212,8 +250,6 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
212 do { 250 do {
213 VM_BUG_ON_PAGE(compound_head(page) != head, page); 251 VM_BUG_ON_PAGE(compound_head(page) != head, page);
214 pages[*nr] = page; 252 pages[*nr] = page;
215 if (PageTail(page))
216 get_huge_page_tail(page);
217 (*nr)++; 253 (*nr)++;
218 page++; 254 page++;
219 refs++; 255 refs++;
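
The pte/pmd devmap branches added above pin the backing device range, not just the page: gup-fast runs with IRQs disabled and cannot sleep, so it detects a disappearing pmem device by failing the reference grab inside get_dev_pagemap(). A hedged sketch of the two helpers these hunks lean on (the real versions live in include/linux/memremap.h and kernel/memremap.c):

        /* takes a reference on the ZONE_DEVICE range backing @pfn; passing the
         * previously returned @pgmap lets neighbouring pfns skip the lookup */
        struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
                                            struct dev_pagemap *pgmap);

        static inline void put_dev_pagemap(struct dev_pagemap *pgmap)
        {
                if (pgmap)
                        percpu_ref_put(pgmap->ref);
        }

Once get_page() has succeeded, the elevated page count is expected to hold its own reference on the pagemap, which is why both loops call put_dev_pagemap() immediately after taking the page reference.
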
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 8829482d69ec..5488d21123bd 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -30,6 +30,7 @@
30#include <linux/module.h> 30#include <linux/module.h>
31#include <linux/memory.h> 31#include <linux/memory.h>
32#include <linux/memory_hotplug.h> 32#include <linux/memory_hotplug.h>
33#include <linux/memremap.h>
33#include <linux/nmi.h> 34#include <linux/nmi.h>
34#include <linux/gfp.h> 35#include <linux/gfp.h>
35#include <linux/kcore.h> 36#include <linux/kcore.h>
@@ -714,6 +715,12 @@ static void __meminit free_pagetable(struct page *page, int order)
714{ 715{
715 unsigned long magic; 716 unsigned long magic;
716 unsigned int nr_pages = 1 << order; 717 unsigned int nr_pages = 1 << order;
718 struct vmem_altmap *altmap = to_vmem_altmap((unsigned long) page);
719
720 if (altmap) {
721 vmem_altmap_free(altmap, nr_pages);
722 return;
723 }
717 724
718 /* bootmem page has reserved flag */ 725 /* bootmem page has reserved flag */
719 if (PageReserved(page)) { 726 if (PageReserved(page)) {
@@ -1017,13 +1024,19 @@ int __ref arch_remove_memory(u64 start, u64 size)
1017{ 1024{
1018 unsigned long start_pfn = start >> PAGE_SHIFT; 1025 unsigned long start_pfn = start >> PAGE_SHIFT;
1019 unsigned long nr_pages = size >> PAGE_SHIFT; 1026 unsigned long nr_pages = size >> PAGE_SHIFT;
1027 struct page *page = pfn_to_page(start_pfn);
1028 struct vmem_altmap *altmap;
1020 struct zone *zone; 1029 struct zone *zone;
1021 int ret; 1030 int ret;
1022 1031
1023 zone = page_zone(pfn_to_page(start_pfn)); 1032 /* With altmap the first mapped page is offset from @start */
1024 kernel_physical_mapping_remove(start, start + size); 1033 altmap = to_vmem_altmap((unsigned long) page);
1034 if (altmap)
1035 page += vmem_altmap_offset(altmap);
1036 zone = page_zone(page);
1025 ret = __remove_pages(zone, start_pfn, nr_pages); 1037 ret = __remove_pages(zone, start_pfn, nr_pages);
1026 WARN_ON_ONCE(ret); 1038 WARN_ON_ONCE(ret);
1039 kernel_physical_mapping_remove(start, start + size);
1027 1040
1028 return ret; 1041 return ret;
1029} 1042}
@@ -1235,7 +1248,7 @@ static void __meminitdata *p_start, *p_end;
1235static int __meminitdata node_start; 1248static int __meminitdata node_start;
1236 1249
1237static int __meminit vmemmap_populate_hugepages(unsigned long start, 1250static int __meminit vmemmap_populate_hugepages(unsigned long start,
1238 unsigned long end, int node) 1251 unsigned long end, int node, struct vmem_altmap *altmap)
1239{ 1252{
1240 unsigned long addr; 1253 unsigned long addr;
1241 unsigned long next; 1254 unsigned long next;
@@ -1258,7 +1271,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
1258 if (pmd_none(*pmd)) { 1271 if (pmd_none(*pmd)) {
1259 void *p; 1272 void *p;
1260 1273
1261 p = vmemmap_alloc_block_buf(PMD_SIZE, node); 1274 p = __vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
1262 if (p) { 1275 if (p) {
1263 pte_t entry; 1276 pte_t entry;
1264 1277
@@ -1279,7 +1292,8 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
1279 addr_end = addr + PMD_SIZE; 1292 addr_end = addr + PMD_SIZE;
1280 p_end = p + PMD_SIZE; 1293 p_end = p + PMD_SIZE;
1281 continue; 1294 continue;
1282 } 1295 } else if (altmap)
1296 return -ENOMEM; /* no fallback */
1283 } else if (pmd_large(*pmd)) { 1297 } else if (pmd_large(*pmd)) {
1284 vmemmap_verify((pte_t *)pmd, node, addr, next); 1298 vmemmap_verify((pte_t *)pmd, node, addr, next);
1285 continue; 1299 continue;
@@ -1293,11 +1307,16 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
1293 1307
1294int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) 1308int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
1295{ 1309{
1310 struct vmem_altmap *altmap = to_vmem_altmap(start);
1296 int err; 1311 int err;
1297 1312
1298 if (cpu_has_pse) 1313 if (cpu_has_pse)
1299 err = vmemmap_populate_hugepages(start, end, node); 1314 err = vmemmap_populate_hugepages(start, end, node, altmap);
1300 else 1315 else if (altmap) {
1316 pr_err_once("%s: no cpu support for altmap allocations\n",
1317 __func__);
1318 err = -ENOMEM;
1319 } else
1301 err = vmemmap_populate_basepages(start, end, node); 1320 err = vmemmap_populate_basepages(start, end, node);
1302 if (!err) 1321 if (!err)
1303 sync_global_pgds(start, end - 1, 0); 1322 sync_global_pgds(start, end - 1, 0);
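
The altmap plumbing above lets the struct page array for a pmem range be allocated out of the pmem capacity itself instead of regular RAM. The bookkeeping object threaded through vmemmap_populate() is small; roughly (per include/linux/memremap.h in this series):

        struct vmem_altmap {
                const unsigned long base_pfn;   /* first pfn of the device range */
                const unsigned long reserve;    /* pfns kept untouched (e.g. info block) */
                unsigned long free;             /* pfns available to back struct pages */
                unsigned long align;
                unsigned long alloc;            /* pfns handed out so far */
        };

        static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
        {
                /* pfns from base_pfn that are not themselves backed by struct pages */
                return altmap->reserve + altmap->free;
        }

This is why arch_remove_memory() above advances the page pointer by vmem_altmap_offset() before asking for the zone: with an altmap, the first mapped struct page no longer sits at @start.
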
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 031782e74231..f4ae536b0914 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -12,6 +12,7 @@
12#include <linux/debugfs.h> 12#include <linux/debugfs.h>
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/pfn_t.h>
15#include <linux/slab.h> 16#include <linux/slab.h>
16#include <linux/mm.h> 17#include <linux/mm.h>
17#include <linux/fs.h> 18#include <linux/fs.h>
@@ -949,7 +950,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
949} 950}
950 951
951int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, 952int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
952 unsigned long pfn) 953 pfn_t pfn)
953{ 954{
954 enum page_cache_mode pcm; 955 enum page_cache_mode pcm;
955 956
@@ -957,7 +958,7 @@ int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
957 return 0; 958 return 0;
958 959
959 /* Set prot based on lookup */ 960 /* Set prot based on lookup */
960 pcm = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT); 961 pcm = lookup_memtype(pfn_t_to_phys(pfn));
961 *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | 962 *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
962 cachemode2protval(pcm)); 963 cachemode2protval(pcm));
963 964
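
track_pfn_insert() switching to pfn_t is typical of the whole series: callers stop passing bare unsigned longs and instead pass a pfn tagged with how it may be mapped. A simplified sketch of the helpers used throughout these hunks (close to, but not copied from, include/linux/pfn_t.h):

        #define PFN_FLAGS_MASK  (((u64) ~PAGE_MASK) << (BITS_PER_LONG_LONG - PAGE_SHIFT))
        #define PFN_DEV         (1ULL << (BITS_PER_LONG_LONG - 3))  /* device-backed frame */
        #define PFN_MAP         (1ULL << (BITS_PER_LONG_LONG - 4))  /* has a struct page  */

        static inline pfn_t __pfn_to_pfn_t(unsigned long pfn, u64 flags)
        {
                pfn_t pfn_t = { .val = pfn | (flags & PFN_FLAGS_MASK), };

                return pfn_t;
        }

        static inline unsigned long pfn_t_to_pfn(pfn_t pfn)
        {
                return pfn.val & ~PFN_FLAGS_MASK;
        }

        static inline phys_addr_t pfn_t_to_phys(pfn_t pfn)
        {
                return PFN_PHYS(pfn_t_to_pfn(pfn));
        }

So lookup_memtype(pfn_t_to_phys(pfn)) above resolves to the same physical address the old (resource_size_t)pfn << PAGE_SHIFT computed, with the flag bits masked off.
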
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index ee9c2e3a7199..4eb287e25043 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -505,19 +505,6 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
505 505
506 return young; 506 return young;
507} 507}
508
509void pmdp_splitting_flush(struct vm_area_struct *vma,
510 unsigned long address, pmd_t *pmdp)
511{
512 int set;
513 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
514 set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
515 (unsigned long *)pmdp);
516 if (set) {
517 /* need tlb flush only to serialize against gup-fast */
518 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
519 }
520}
521#endif 508#endif
522 509
523/** 510/**
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h
index 360944e1da52..d030594ed22b 100644
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -86,8 +86,10 @@
86#define MADV_SEQUENTIAL 2 /* expect sequential page references */ 86#define MADV_SEQUENTIAL 2 /* expect sequential page references */
87#define MADV_WILLNEED 3 /* will need these pages */ 87#define MADV_WILLNEED 3 /* will need these pages */
88#define MADV_DONTNEED 4 /* don't need these pages */ 88#define MADV_DONTNEED 4 /* don't need these pages */
89#define MADV_FREE 5 /* free pages only if memory pressure */
89 90
90/* common parameters: try to keep these consistent across architectures */ 91/* common parameters: try to keep these consistent across architectures */
92#define MADV_FREE 8 /* free pages only if memory pressure */
91#define MADV_REMOVE 9 /* remove these pages & resources */ 93#define MADV_REMOVE 9 /* remove these pages & resources */
92#define MADV_DONTFORK 10 /* don't inherit across fork */ 94#define MADV_DONTFORK 10 /* don't inherit across fork */
93#define MADV_DOFORK 11 /* do inherit across fork */ 95#define MADV_DOFORK 11 /* do inherit across fork */
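
MADV_FREE, added here for xtensa (the other architectures with a private mman.h get the same treatment elsewhere in the series), is the lazy-free hint this patch-bomb implements: pages are only reclaimed if memory pressure actually materialises, and a write before reclaim keeps the old contents, unlike MADV_DONTNEED which discards immediately. A minimal userspace usage sketch (illustrative only):

        #include <sys/mman.h>

        /* an allocator parking a hot-then-idle buffer on a free list:
         * tell the kernel it may reclaim the pages, but only if it must */
        if (madvise(buf, len, MADV_FREE) != 0)
                /* kernels without MADV_FREE: fall back to the eager variant */
                madvise(buf, len, MADV_DONTNEED);
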
diff --git a/arch/xtensa/mm/tlb.c b/arch/xtensa/mm/tlb.c
index 5ece856c5725..35c822286bbe 100644
--- a/arch/xtensa/mm/tlb.c
+++ b/arch/xtensa/mm/tlb.c
@@ -245,7 +245,7 @@ static int check_tlb_entry(unsigned w, unsigned e, bool dtlb)
245 page_mapcount(p)); 245 page_mapcount(p));
246 if (!page_count(p)) 246 if (!page_count(p))
247 rc |= TLB_INSANE; 247 rc |= TLB_INSANE;
248 else if (page_mapped(p)) 248 else if (page_mapcount(p))
249 rc |= TLB_SUSPICIOUS; 249 rc |= TLB_SUSPICIOUS;
250 } else { 250 } else {
251 rc |= TLB_INSANE; 251 rc |= TLB_INSANE;
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 619fe584a44c..213456c2b123 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -647,6 +647,13 @@ static int add_memory_block(int base_section_nr)
647 return 0; 647 return 0;
648} 648}
649 649
650static bool is_zone_device_section(struct mem_section *ms)
651{
652 struct page *page;
653
654 page = sparse_decode_mem_map(ms->section_mem_map, __section_nr(ms));
655 return is_zone_device_page(page);
656}
650 657
651/* 658/*
652 * need an interface for the VM to add new memory regions, 659 * need an interface for the VM to add new memory regions,
@@ -657,6 +664,9 @@ int register_new_memory(int nid, struct mem_section *section)
657 int ret = 0; 664 int ret = 0;
658 struct memory_block *mem; 665 struct memory_block *mem;
659 666
667 if (is_zone_device_section(section))
668 return 0;
669
660 mutex_lock(&mem_sysfs_mutex); 670 mutex_lock(&mem_sysfs_mutex);
661 671
662 mem = find_memory_block(section); 672 mem = find_memory_block(section);
@@ -693,6 +703,9 @@ static int remove_memory_section(unsigned long node_id,
693{ 703{
694 struct memory_block *mem; 704 struct memory_block *mem;
695 705
706 if (is_zone_device_section(section))
707 return 0;
708
696 mutex_lock(&mem_sysfs_mutex); 709 mutex_lock(&mem_sysfs_mutex);
697 mem = find_memory_block(section); 710 mem = find_memory_block(section);
698 unregister_mem_sect_under_nodes(mem, __section_nr(section)); 711 unregister_mem_sect_under_nodes(mem, __section_nr(section));
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index a5880f4ab40e..cb27190e9f39 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -19,6 +19,9 @@
19#include <linux/radix-tree.h> 19#include <linux/radix-tree.h>
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/slab.h> 21#include <linux/slab.h>
22#ifdef CONFIG_BLK_DEV_RAM_DAX
23#include <linux/pfn_t.h>
24#endif
22 25
23#include <asm/uaccess.h> 26#include <asm/uaccess.h>
24 27
@@ -378,7 +381,7 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector,
378 381
379#ifdef CONFIG_BLK_DEV_RAM_DAX 382#ifdef CONFIG_BLK_DEV_RAM_DAX
380static long brd_direct_access(struct block_device *bdev, sector_t sector, 383static long brd_direct_access(struct block_device *bdev, sector_t sector,
381 void __pmem **kaddr, unsigned long *pfn) 384 void __pmem **kaddr, pfn_t *pfn)
382{ 385{
383 struct brd_device *brd = bdev->bd_disk->private_data; 386 struct brd_device *brd = bdev->bd_disk->private_data;
384 struct page *page; 387 struct page *page;
@@ -389,7 +392,7 @@ static long brd_direct_access(struct block_device *bdev, sector_t sector,
389 if (!page) 392 if (!page)
390 return -ENOSPC; 393 return -ENOSPC;
391 *kaddr = (void __pmem *)page_address(page); 394 *kaddr = (void __pmem *)page_address(page);
392 *pfn = page_to_pfn(page); 395 *pfn = page_to_pfn_t(page);
393 396
394 return PAGE_SIZE; 397 return PAGE_SIZE;
395} 398}
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 47915d736f8d..370c2f76016d 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1325,7 +1325,6 @@ static int zram_remove(struct zram *zram)
1325 1325
1326 pr_info("Removed device: %s\n", zram->disk->disk_name); 1326 pr_info("Removed device: %s\n", zram->disk->disk_name);
1327 1327
1328 idr_remove(&zram_index_idr, zram->disk->first_minor);
1329 blk_cleanup_queue(zram->disk->queue); 1328 blk_cleanup_queue(zram->disk->queue);
1330 del_gendisk(zram->disk); 1329 del_gendisk(zram->disk);
1331 put_disk(zram->disk); 1330 put_disk(zram->disk);
@@ -1367,10 +1366,12 @@ static ssize_t hot_remove_store(struct class *class,
1367 mutex_lock(&zram_index_mutex); 1366 mutex_lock(&zram_index_mutex);
1368 1367
1369 zram = idr_find(&zram_index_idr, dev_id); 1368 zram = idr_find(&zram_index_idr, dev_id);
1370 if (zram) 1369 if (zram) {
1371 ret = zram_remove(zram); 1370 ret = zram_remove(zram);
1372 else 1371 idr_remove(&zram_index_idr, dev_id);
1372 } else {
1373 ret = -ENODEV; 1373 ret = -ENODEV;
1374 }
1374 1375
1375 mutex_unlock(&zram_index_mutex); 1376 mutex_unlock(&zram_index_mutex);
1376 return ret ? ret : count; 1377 return ret ? ret : count;
diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.c b/drivers/gpu/drm/exynos/exynos_drm_gem.c
index 252eb301470c..32358c5e3db4 100644
--- a/drivers/gpu/drm/exynos/exynos_drm_gem.c
+++ b/drivers/gpu/drm/exynos/exynos_drm_gem.c
@@ -14,6 +14,7 @@
14 14
15#include <linux/shmem_fs.h> 15#include <linux/shmem_fs.h>
16#include <linux/dma-buf.h> 16#include <linux/dma-buf.h>
17#include <linux/pfn_t.h>
17#include <drm/exynos_drm.h> 18#include <drm/exynos_drm.h>
18 19
19#include "exynos_drm_drv.h" 20#include "exynos_drm_drv.h"
@@ -490,7 +491,8 @@ int exynos_drm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
490 } 491 }
491 492
492 pfn = page_to_pfn(exynos_gem->pages[page_offset]); 493 pfn = page_to_pfn(exynos_gem->pages[page_offset]);
493 ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, pfn); 494 ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
495 __pfn_to_pfn_t(pfn, PFN_DEV));
494 496
495out: 497out:
496 switch (ret) { 498 switch (ret) {
diff --git a/drivers/gpu/drm/gma500/framebuffer.c b/drivers/gpu/drm/gma500/framebuffer.c
index ee95c03a8c54..cb95765050cc 100644
--- a/drivers/gpu/drm/gma500/framebuffer.c
+++ b/drivers/gpu/drm/gma500/framebuffer.c
@@ -21,6 +21,7 @@
21#include <linux/kernel.h> 21#include <linux/kernel.h>
22#include <linux/errno.h> 22#include <linux/errno.h>
23#include <linux/string.h> 23#include <linux/string.h>
24#include <linux/pfn_t.h>
24#include <linux/mm.h> 25#include <linux/mm.h>
25#include <linux/tty.h> 26#include <linux/tty.h>
26#include <linux/slab.h> 27#include <linux/slab.h>
@@ -132,7 +133,8 @@ static int psbfb_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
132 for (i = 0; i < page_num; i++) { 133 for (i = 0; i < page_num; i++) {
133 pfn = (phys_addr >> PAGE_SHIFT); 134 pfn = (phys_addr >> PAGE_SHIFT);
134 135
135 ret = vm_insert_mixed(vma, address, pfn); 136 ret = vm_insert_mixed(vma, address,
137 __pfn_to_pfn_t(pfn, PFN_DEV));
136 if (unlikely((ret == -EBUSY) || (ret != 0 && i > 0))) 138 if (unlikely((ret == -EBUSY) || (ret != 0 && i > 0)))
137 break; 139 break;
138 else if (unlikely(ret != 0)) { 140 else if (unlikely(ret != 0)) {
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
index c76cc853b08a..3cedb8d5c855 100644
--- a/drivers/gpu/drm/msm/msm_gem.c
+++ b/drivers/gpu/drm/msm/msm_gem.c
@@ -18,6 +18,7 @@
18#include <linux/spinlock.h> 18#include <linux/spinlock.h>
19#include <linux/shmem_fs.h> 19#include <linux/shmem_fs.h>
20#include <linux/dma-buf.h> 20#include <linux/dma-buf.h>
21#include <linux/pfn_t.h>
21 22
22#include "msm_drv.h" 23#include "msm_drv.h"
23#include "msm_gem.h" 24#include "msm_gem.h"
@@ -222,7 +223,8 @@ int msm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
222 VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, 223 VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address,
223 pfn, pfn << PAGE_SHIFT); 224 pfn, pfn << PAGE_SHIFT);
224 225
225 ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, pfn); 226 ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
227 __pfn_to_pfn_t(pfn, PFN_DEV));
226 228
227out_unlock: 229out_unlock:
228 mutex_unlock(&dev->struct_mutex); 230 mutex_unlock(&dev->struct_mutex);
diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c
index 7ed08fdc4c42..ceba5459ceb7 100644
--- a/drivers/gpu/drm/omapdrm/omap_gem.c
+++ b/drivers/gpu/drm/omapdrm/omap_gem.c
@@ -19,6 +19,7 @@
19 19
20#include <linux/shmem_fs.h> 20#include <linux/shmem_fs.h>
21#include <linux/spinlock.h> 21#include <linux/spinlock.h>
22#include <linux/pfn_t.h>
22 23
23#include <drm/drm_vma_manager.h> 24#include <drm/drm_vma_manager.h>
24 25
@@ -385,7 +386,8 @@ static int fault_1d(struct drm_gem_object *obj,
385 VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, 386 VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address,
386 pfn, pfn << PAGE_SHIFT); 387 pfn, pfn << PAGE_SHIFT);
387 388
388 return vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, pfn); 389 return vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
390 __pfn_to_pfn_t(pfn, PFN_DEV));
389} 391}
390 392
391/* Special handling for the case of faulting in 2d tiled buffers */ 393/* Special handling for the case of faulting in 2d tiled buffers */
@@ -478,7 +480,8 @@ static int fault_2d(struct drm_gem_object *obj,
478 pfn, pfn << PAGE_SHIFT); 480 pfn, pfn << PAGE_SHIFT);
479 481
480 for (i = n; i > 0; i--) { 482 for (i = n; i > 0; i--) {
481 vm_insert_mixed(vma, (unsigned long)vaddr, pfn); 483 vm_insert_mixed(vma, (unsigned long)vaddr,
484 __pfn_to_pfn_t(pfn, PFN_DEV));
482 pfn += usergart[fmt].stride_pfn; 485 pfn += usergart[fmt].stride_pfn;
483 vaddr += PAGE_SIZE * m; 486 vaddr += PAGE_SIZE * m;
484 } 487 }
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index 8fb7213277cc..06d26dc438b2 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -35,6 +35,7 @@
35#include <ttm/ttm_placement.h> 35#include <ttm/ttm_placement.h>
36#include <drm/drm_vma_manager.h> 36#include <drm/drm_vma_manager.h>
37#include <linux/mm.h> 37#include <linux/mm.h>
38#include <linux/pfn_t.h>
38#include <linux/rbtree.h> 39#include <linux/rbtree.h>
39#include <linux/module.h> 40#include <linux/module.h>
40#include <linux/uaccess.h> 41#include <linux/uaccess.h>
@@ -229,7 +230,8 @@ static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
229 } 230 }
230 231
231 if (vma->vm_flags & VM_MIXEDMAP) 232 if (vma->vm_flags & VM_MIXEDMAP)
232 ret = vm_insert_mixed(&cvma, address, pfn); 233 ret = vm_insert_mixed(&cvma, address,
234 __pfn_to_pfn_t(pfn, PFN_DEV));
233 else 235 else
234 ret = vm_insert_pfn(&cvma, address, pfn); 236 ret = vm_insert_pfn(&cvma, address, pfn);
235 237
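
The exynos, gma500, msm, omapdrm and ttm hunks are all the same mechanical conversion: vm_insert_mixed() now takes a pfn_t, so the drivers wrap their raw pfn with __pfn_to_pfn_t(pfn, PFN_DEV). Sketch of the changed prototype assumed by these call sites (the mm/memory.c side is outside this excerpt):

        int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                            pfn_t pfn);

The conversion is intended to be behaviour-neutral for these drivers; the extra flag bits only matter to callers that map ZONE_DEVICE frames.
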
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index fd01f3493fc7..af7cc1e65656 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -433,16 +433,15 @@ ssize_t iio_format_value(char *buf, unsigned int type, int size, int *vals)
433 scale_db = true; 433 scale_db = true;
434 case IIO_VAL_INT_PLUS_MICRO: 434 case IIO_VAL_INT_PLUS_MICRO:
435 if (vals[1] < 0) 435 if (vals[1] < 0)
436 return sprintf(buf, "-%ld.%06u%s\n", abs(vals[0]), 436 return sprintf(buf, "-%d.%06u%s\n", abs(vals[0]),
437 -vals[1], 437 -vals[1], scale_db ? " dB" : "");
438 scale_db ? " dB" : "");
439 else 438 else
440 return sprintf(buf, "%d.%06u%s\n", vals[0], vals[1], 439 return sprintf(buf, "%d.%06u%s\n", vals[0], vals[1],
441 scale_db ? " dB" : ""); 440 scale_db ? " dB" : "");
442 case IIO_VAL_INT_PLUS_NANO: 441 case IIO_VAL_INT_PLUS_NANO:
443 if (vals[1] < 0) 442 if (vals[1] < 0)
444 return sprintf(buf, "-%ld.%09u\n", abs(vals[0]), 443 return sprintf(buf, "-%d.%09u\n", abs(vals[0]),
445 -vals[1]); 444 -vals[1]);
446 else 445 else
447 return sprintf(buf, "%d.%09u\n", vals[0], vals[1]); 446 return sprintf(buf, "%d.%09u\n", vals[0], vals[1]);
448 case IIO_VAL_FRACTIONAL: 447 case IIO_VAL_FRACTIONAL:
diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/calib.c b/drivers/net/wireless/intel/iwlwifi/dvm/calib.c
index 07a4c644fb9b..e9cef9de9ed8 100644
--- a/drivers/net/wireless/intel/iwlwifi/dvm/calib.c
+++ b/drivers/net/wireless/intel/iwlwifi/dvm/calib.c
@@ -901,7 +901,7 @@ static void iwlagn_gain_computation(struct iwl_priv *priv,
901 /* bound gain by 2 bits value max, 3rd bit is sign */ 901 /* bound gain by 2 bits value max, 3rd bit is sign */
902 data->delta_gain_code[i] = 902 data->delta_gain_code[i] =
903 min(abs(delta_g), 903 min(abs(delta_g),
904 (long) CHAIN_NOISE_MAX_DELTA_GAIN_CODE); 904 (s32) CHAIN_NOISE_MAX_DELTA_GAIN_CODE);
905 905
906 if (delta_g < 0) 906 if (delta_g < 0)
907 /* 907 /*
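
The iio %ld -> %d and iwlwifi (long) -> (s32) fixups both fall out of the abs() rework elsewhere in this patch series: abs() used to evaluate to long regardless of its argument, and is now type-generic, returning a value of the argument's own type. An illustrative sketch (kernel context assumed):

        int ival = -5;          /* like vals[0] in the IIO hunk, an int */
        s64 big  = -42;

        pr_info("%d\n",   abs(ival));   /* abs(int) is now an int  -> %d   */
        pr_info("%lld\n", abs(big));    /* abs(s64) stays 64-bit   -> %lld */

Hence min(abs(delta_g), (s32)CHAIN_NOISE_MAX_DELTA_GAIN_CODE): both sides of the type-checked min() must now agree on s32 rather than long.
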
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index f9b674bc49db..0cc9048b86e2 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -83,8 +83,7 @@ static ssize_t mode_store(struct device *dev,
83 83
84 if (strncmp(buf, "pmem\n", n) == 0 84 if (strncmp(buf, "pmem\n", n) == 0
85 || strncmp(buf, "pmem", n) == 0) { 85 || strncmp(buf, "pmem", n) == 0) {
86 /* TODO: allocate from PMEM support */ 86 nd_pfn->mode = PFN_MODE_PMEM;
87 rc = -ENOTTY;
88 } else if (strncmp(buf, "ram\n", n) == 0 87 } else if (strncmp(buf, "ram\n", n) == 0
89 || strncmp(buf, "ram", n) == 0) 88 || strncmp(buf, "ram", n) == 0)
90 nd_pfn->mode = PFN_MODE_RAM; 89 nd_pfn->mode = PFN_MODE_RAM;
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index b493ff3fccb2..7edf31671dab 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -21,10 +21,11 @@
21#include <linux/init.h> 21#include <linux/init.h>
22#include <linux/platform_device.h> 22#include <linux/platform_device.h>
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/memory_hotplug.h>
25#include <linux/moduleparam.h> 24#include <linux/moduleparam.h>
26#include <linux/badblocks.h> 25#include <linux/badblocks.h>
26#include <linux/memremap.h>
27#include <linux/vmalloc.h> 27#include <linux/vmalloc.h>
28#include <linux/pfn_t.h>
28#include <linux/slab.h> 29#include <linux/slab.h>
29#include <linux/pmem.h> 30#include <linux/pmem.h>
30#include <linux/nd.h> 31#include <linux/nd.h>
@@ -40,6 +41,7 @@ struct pmem_device {
40 phys_addr_t phys_addr; 41 phys_addr_t phys_addr;
41 /* when non-zero this device is hosting a 'pfn' instance */ 42 /* when non-zero this device is hosting a 'pfn' instance */
42 phys_addr_t data_offset; 43 phys_addr_t data_offset;
44 unsigned long pfn_flags;
43 void __pmem *virt_addr; 45 void __pmem *virt_addr;
44 size_t size; 46 size_t size;
45 struct badblocks bb; 47 struct badblocks bb;
@@ -135,13 +137,13 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
135} 137}
136 138
137static long pmem_direct_access(struct block_device *bdev, sector_t sector, 139static long pmem_direct_access(struct block_device *bdev, sector_t sector,
138 void __pmem **kaddr, unsigned long *pfn) 140 void __pmem **kaddr, pfn_t *pfn)
139{ 141{
140 struct pmem_device *pmem = bdev->bd_disk->private_data; 142 struct pmem_device *pmem = bdev->bd_disk->private_data;
141 resource_size_t offset = sector * 512 + pmem->data_offset; 143 resource_size_t offset = sector * 512 + pmem->data_offset;
142 144
143 *kaddr = pmem->virt_addr + offset; 145 *kaddr = pmem->virt_addr + offset;
144 *pfn = (pmem->phys_addr + offset) >> PAGE_SHIFT; 146 *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
145 147
146 return pmem->size - offset; 148 return pmem->size - offset;
147} 149}
@@ -157,6 +159,7 @@ static struct pmem_device *pmem_alloc(struct device *dev,
157 struct resource *res, int id) 159 struct resource *res, int id)
158{ 160{
159 struct pmem_device *pmem; 161 struct pmem_device *pmem;
162 struct request_queue *q;
160 163
161 pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL); 164 pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL);
162 if (!pmem) 165 if (!pmem)
@@ -174,16 +177,26 @@ static struct pmem_device *pmem_alloc(struct device *dev,
174 return ERR_PTR(-EBUSY); 177 return ERR_PTR(-EBUSY);
175 } 178 }
176 179
177 if (pmem_should_map_pages(dev)) 180 q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev));
178 pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, res); 181 if (!q)
179 else 182 return ERR_PTR(-ENOMEM);
183
184 pmem->pfn_flags = PFN_DEV;
185 if (pmem_should_map_pages(dev)) {
186 pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, res,
187 &q->q_usage_counter, NULL);
188 pmem->pfn_flags |= PFN_MAP;
189 } else
180 pmem->virt_addr = (void __pmem *) devm_memremap(dev, 190 pmem->virt_addr = (void __pmem *) devm_memremap(dev,
181 pmem->phys_addr, pmem->size, 191 pmem->phys_addr, pmem->size,
182 ARCH_MEMREMAP_PMEM); 192 ARCH_MEMREMAP_PMEM);
183 193
184 if (IS_ERR(pmem->virt_addr)) 194 if (IS_ERR(pmem->virt_addr)) {
195 blk_cleanup_queue(q);
185 return (void __force *) pmem->virt_addr; 196 return (void __force *) pmem->virt_addr;
197 }
186 198
199 pmem->pmem_queue = q;
187 return pmem; 200 return pmem;
188} 201}
189 202
@@ -203,10 +216,6 @@ static int pmem_attach_disk(struct device *dev,
203 int nid = dev_to_node(dev); 216 int nid = dev_to_node(dev);
204 struct gendisk *disk; 217 struct gendisk *disk;
205 218
206 pmem->pmem_queue = blk_alloc_queue_node(GFP_KERNEL, nid);
207 if (!pmem->pmem_queue)
208 return -ENOMEM;
209
210 blk_queue_make_request(pmem->pmem_queue, pmem_make_request); 219 blk_queue_make_request(pmem->pmem_queue, pmem_make_request);
211 blk_queue_physical_block_size(pmem->pmem_queue, PAGE_SIZE); 220 blk_queue_physical_block_size(pmem->pmem_queue, PAGE_SIZE);
212 blk_queue_max_hw_sectors(pmem->pmem_queue, UINT_MAX); 221 blk_queue_max_hw_sectors(pmem->pmem_queue, UINT_MAX);
@@ -352,12 +361,17 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
352 struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); 361 struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
353 struct nd_pfn *nd_pfn = to_nd_pfn(ndns->claim); 362 struct nd_pfn *nd_pfn = to_nd_pfn(ndns->claim);
354 struct device *dev = &nd_pfn->dev; 363 struct device *dev = &nd_pfn->dev;
355 struct vmem_altmap *altmap;
356 struct nd_region *nd_region; 364 struct nd_region *nd_region;
365 struct vmem_altmap *altmap;
357 struct nd_pfn_sb *pfn_sb; 366 struct nd_pfn_sb *pfn_sb;
358 struct pmem_device *pmem; 367 struct pmem_device *pmem;
368 struct request_queue *q;
359 phys_addr_t offset; 369 phys_addr_t offset;
360 int rc; 370 int rc;
371 struct vmem_altmap __altmap = {
372 .base_pfn = __phys_to_pfn(nsio->res.start),
373 .reserve = __phys_to_pfn(SZ_8K),
374 };
361 375
362 if (!nd_pfn->uuid || !nd_pfn->ndns) 376 if (!nd_pfn->uuid || !nd_pfn->ndns)
363 return -ENODEV; 377 return -ENODEV;
@@ -375,6 +389,17 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
375 return -EINVAL; 389 return -EINVAL;
376 nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns); 390 nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns);
377 altmap = NULL; 391 altmap = NULL;
392 } else if (nd_pfn->mode == PFN_MODE_PMEM) {
393 nd_pfn->npfns = (resource_size(&nsio->res) - offset)
394 / PAGE_SIZE;
395 if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns)
396 dev_info(&nd_pfn->dev,
397 "number of pfns truncated from %lld to %ld\n",
398 le64_to_cpu(nd_pfn->pfn_sb->npfns),
399 nd_pfn->npfns);
400 altmap = & __altmap;
401 altmap->free = __phys_to_pfn(offset - SZ_8K);
402 altmap->alloc = 0;
378 } else { 403 } else {
379 rc = -ENXIO; 404 rc = -ENXIO;
380 goto err; 405 goto err;
@@ -382,8 +407,11 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
382 407
383 /* establish pfn range for lookup, and switch to direct map */ 408 /* establish pfn range for lookup, and switch to direct map */
384 pmem = dev_get_drvdata(dev); 409 pmem = dev_get_drvdata(dev);
410 q = pmem->pmem_queue;
385 devm_memunmap(dev, (void __force *) pmem->virt_addr); 411 devm_memunmap(dev, (void __force *) pmem->virt_addr);
386 pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &nsio->res); 412 pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &nsio->res,
413 &q->q_usage_counter, altmap);
414 pmem->pfn_flags |= PFN_MAP;
387 if (IS_ERR(pmem->virt_addr)) { 415 if (IS_ERR(pmem->virt_addr)) {
388 rc = PTR_ERR(pmem->virt_addr); 416 rc = PTR_ERR(pmem->virt_addr);
389 goto err; 417 goto err;
@@ -424,19 +452,22 @@ static int nd_pmem_probe(struct device *dev)
424 return -ENOMEM; 452 return -ENOMEM;
425 nvdimm_namespace_add_poison(ndns, &pmem->bb, 0); 453 nvdimm_namespace_add_poison(ndns, &pmem->bb, 0);
426 454
427 if (is_nd_btt(dev)) 455 if (is_nd_btt(dev)) {
456 /* btt allocates its own request_queue */
457 blk_cleanup_queue(pmem->pmem_queue);
458 pmem->pmem_queue = NULL;
428 return nvdimm_namespace_attach_btt(ndns); 459 return nvdimm_namespace_attach_btt(ndns);
460 }
429 461
430 if (is_nd_pfn(dev)) 462 if (is_nd_pfn(dev))
431 return nvdimm_namespace_attach_pfn(ndns); 463 return nvdimm_namespace_attach_pfn(ndns);
432 464
433 if (nd_btt_probe(ndns, pmem) == 0) { 465 if (nd_btt_probe(ndns, pmem) == 0 || nd_pfn_probe(ndns, pmem) == 0) {
434 /* we'll come back as btt-pmem */ 466 /*
435 return -ENXIO; 467 * We'll come back as either btt-pmem, or pfn-pmem, so
436 } 468 * drop the queue allocation for now.
437 469 */
438 if (nd_pfn_probe(ndns, pmem) == 0) { 470 blk_cleanup_queue(pmem->pmem_queue);
439 /* we'll come back as pfn-pmem */
440 return -ENXIO; 471 return -ENXIO;
441 } 472 }
442 473
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 94a8f4ab57bc..ce7b70181740 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -17,6 +17,7 @@
17#include <linux/completion.h> 17#include <linux/completion.h>
18#include <linux/interrupt.h> 18#include <linux/interrupt.h>
19#include <linux/platform_device.h> 19#include <linux/platform_device.h>
20#include <linux/pfn_t.h>
20#include <asm/extmem.h> 21#include <asm/extmem.h>
21#include <asm/io.h> 22#include <asm/io.h>
22 23
@@ -30,7 +31,7 @@ static void dcssblk_release(struct gendisk *disk, fmode_t mode);
30static blk_qc_t dcssblk_make_request(struct request_queue *q, 31static blk_qc_t dcssblk_make_request(struct request_queue *q,
31 struct bio *bio); 32 struct bio *bio);
32static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum, 33static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum,
33 void __pmem **kaddr, unsigned long *pfn); 34 void __pmem **kaddr, pfn_t *pfn);
34 35
35static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0"; 36static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0";
36 37
@@ -883,20 +884,18 @@ fail:
883 884
884static long 885static long
885dcssblk_direct_access (struct block_device *bdev, sector_t secnum, 886dcssblk_direct_access (struct block_device *bdev, sector_t secnum,
886 void __pmem **kaddr, unsigned long *pfn) 887 void __pmem **kaddr, pfn_t *pfn)
887{ 888{
888 struct dcssblk_dev_info *dev_info; 889 struct dcssblk_dev_info *dev_info;
889 unsigned long offset, dev_sz; 890 unsigned long offset, dev_sz;
890 void *addr;
891 891
892 dev_info = bdev->bd_disk->private_data; 892 dev_info = bdev->bd_disk->private_data;
893 if (!dev_info) 893 if (!dev_info)
894 return -ENODEV; 894 return -ENODEV;
895 dev_sz = dev_info->end - dev_info->start; 895 dev_sz = dev_info->end - dev_info->start;
896 offset = secnum * 512; 896 offset = secnum * 512;
897 addr = (void *) (dev_info->start + offset); 897 *kaddr = (void __pmem *) (dev_info->start + offset);
898 *pfn = virt_to_phys(addr) >> PAGE_SHIFT; 898 *pfn = __pfn_to_pfn_t(PFN_DOWN(dev_info->start + offset), PFN_DEV);
899 *kaddr = (void __pmem *) addr;
900 899
901 return dev_sz - offset; 900 return dev_sz - offset;
902} 901}
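
brd, pmem and dcssblk above all converge on the same ->direct_access() contract: report the kernel address plus a flagged pfn_t for a given sector. The fs/block_dev.c hunk below then bundles the in/out arguments into one control structure. A sketch of the two interfaces as the conversions assume them (compare include/linux/blkdev.h):

        struct blk_dax_ctl {
                sector_t sector;        /* in:  512-byte sector on the bdev */
                void __pmem *addr;      /* out: kernel mapping of that sector */
                long size;              /* in:  bytes wanted; result via return value */
                pfn_t pfn;              /* out: frame number plus PFN_DEV/PFN_MAP flags */
        };

        /* block_device_operations method: same shape, new pfn type */
        long (*direct_access)(struct block_device *bdev, sector_t sector,
                              void __pmem **kaddr, pfn_t *pfn);

dax_map_atomic() in the fs/dax.c hunk further down is the main consumer: it fills in sector/size, calls bdev_direct_access(), and keeps the queue pinned via blk_queue_enter() until dax_unmap_atomic().
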
diff --git a/fs/Kconfig b/fs/Kconfig
index 2bb1ef86c411..9adee0d7536e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -50,7 +50,8 @@ config FS_DAX_PMD
50 bool 50 bool
51 default FS_DAX 51 default FS_DAX
52 depends on FS_DAX 52 depends on FS_DAX
53 depends on BROKEN 53 depends on ZONE_DEVICE
54 depends on TRANSPARENT_HUGEPAGE
54 55
55endif # BLOCK 56endif # BLOCK
56 57
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 81c0705558be..530145b607c4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -455,10 +455,7 @@ EXPORT_SYMBOL_GPL(bdev_write_page);
455/** 455/**
456 * bdev_direct_access() - Get the address for directly-accessibly memory 456 * bdev_direct_access() - Get the address for directly-accessibly memory
457 * @bdev: The device containing the memory 457 * @bdev: The device containing the memory
458 * @sector: The offset within the device 458 * @dax: control and output parameters for ->direct_access
459 * @addr: Where to put the address of the memory
460 * @pfn: The Page Frame Number for the memory
461 * @size: The number of bytes requested
462 * 459 *
463 * If a block device is made up of directly addressable memory, this function 460 * If a block device is made up of directly addressable memory, this function
464 * will tell the caller the PFN and the address of the memory. The address 461 * will tell the caller the PFN and the address of the memory. The address
@@ -469,10 +466,10 @@ EXPORT_SYMBOL_GPL(bdev_write_page);
469 * Return: negative errno if an error occurs, otherwise the number of bytes 466 * Return: negative errno if an error occurs, otherwise the number of bytes
470 * accessible at this address. 467 * accessible at this address.
471 */ 468 */
472long bdev_direct_access(struct block_device *bdev, sector_t sector, 469long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
473 void __pmem **addr, unsigned long *pfn, long size)
474{ 470{
475 long avail; 471 sector_t sector = dax->sector;
472 long avail, size = dax->size;
476 const struct block_device_operations *ops = bdev->bd_disk->fops; 473 const struct block_device_operations *ops = bdev->bd_disk->fops;
477 474
478 /* 475 /*
@@ -491,9 +488,11 @@ long bdev_direct_access(struct block_device *bdev, sector_t sector,
491 sector += get_start_sect(bdev); 488 sector += get_start_sect(bdev);
492 if (sector % (PAGE_SIZE / 512)) 489 if (sector % (PAGE_SIZE / 512))
493 return -EINVAL; 490 return -EINVAL;
494 avail = ops->direct_access(bdev, sector, addr, pfn); 491 avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn);
495 if (!avail) 492 if (!avail)
496 return -ERANGE; 493 return -ERANGE;
494 if (avail > 0 && avail & ~PAGE_MASK)
495 return -ENXIO;
497 return min(avail, size); 496 return min(avail, size);
498} 497}
499EXPORT_SYMBOL_GPL(bdev_direct_access); 498EXPORT_SYMBOL_GPL(bdev_direct_access);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 0068e82217c3..0a2752b79e72 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3391,13 +3391,13 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
3391 * should have access to this page, we're safe to simply set 3391 * should have access to this page, we're safe to simply set
3392 * PG_locked without checking it first. 3392 * PG_locked without checking it first.
3393 */ 3393 */
3394 __set_page_locked(page); 3394 __SetPageLocked(page);
3395 rc = add_to_page_cache_locked(page, mapping, 3395 rc = add_to_page_cache_locked(page, mapping,
3396 page->index, gfp); 3396 page->index, gfp);
3397 3397
3398 /* give up if we can't stick it in the cache */ 3398 /* give up if we can't stick it in the cache */
3399 if (rc) { 3399 if (rc) {
3400 __clear_page_locked(page); 3400 __ClearPageLocked(page);
3401 return rc; 3401 return rc;
3402 } 3402 }
3403 3403
@@ -3418,9 +3418,9 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
3418 if (*bytes + PAGE_CACHE_SIZE > rsize) 3418 if (*bytes + PAGE_CACHE_SIZE > rsize)
3419 break; 3419 break;
3420 3420
3421 __set_page_locked(page); 3421 __SetPageLocked(page);
3422 if (add_to_page_cache_locked(page, mapping, page->index, gfp)) { 3422 if (add_to_page_cache_locked(page, mapping, page->index, gfp)) {
3423 __clear_page_locked(page); 3423 __ClearPageLocked(page);
3424 break; 3424 break;
3425 } 3425 }
3426 list_move_tail(&page->lru, tmplist); 3426 list_move_tail(&page->lru, tmplist);
diff --git a/fs/dax.c b/fs/dax.c
index 43671b68220e..7af879759064 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -28,54 +28,68 @@
28#include <linux/sched.h> 28#include <linux/sched.h>
29#include <linux/uio.h> 29#include <linux/uio.h>
30#include <linux/vmstat.h> 30#include <linux/vmstat.h>
31#include <linux/pfn_t.h>
32#include <linux/sizes.h>
33
34static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
35{
36 struct request_queue *q = bdev->bd_queue;
37 long rc = -EIO;
38
39 dax->addr = (void __pmem *) ERR_PTR(-EIO);
40 if (blk_queue_enter(q, true) != 0)
41 return rc;
42
43 rc = bdev_direct_access(bdev, dax);
44 if (rc < 0) {
45 dax->addr = (void __pmem *) ERR_PTR(rc);
46 blk_queue_exit(q);
47 return rc;
48 }
49 return rc;
50}
51
52static void dax_unmap_atomic(struct block_device *bdev,
53 const struct blk_dax_ctl *dax)
54{
55 if (IS_ERR(dax->addr))
56 return;
57 blk_queue_exit(bdev->bd_queue);
58}
31 59
32/* 60/*
33 * dax_clear_blocks() is called from within transaction context from XFS, 61 * dax_clear_blocks() is called from within transaction context from XFS,
34 * and hence this means the stack from this point must follow GFP_NOFS 62 * and hence this means the stack from this point must follow GFP_NOFS
35 * semantics for all operations. 63 * semantics for all operations.
36 */ 64 */
37int dax_clear_blocks(struct inode *inode, sector_t block, long size) 65int dax_clear_blocks(struct inode *inode, sector_t block, long _size)
38{ 66{
39 struct block_device *bdev = inode->i_sb->s_bdev; 67 struct block_device *bdev = inode->i_sb->s_bdev;
40 sector_t sector = block << (inode->i_blkbits - 9); 68 struct blk_dax_ctl dax = {
69 .sector = block << (inode->i_blkbits - 9),
70 .size = _size,
71 };
41 72
42 might_sleep(); 73 might_sleep();
43 do { 74 do {
44 void __pmem *addr; 75 long count, sz;
45 unsigned long pfn;
46 long count;
47 76
48 count = bdev_direct_access(bdev, sector, &addr, &pfn, size); 77 count = dax_map_atomic(bdev, &dax);
49 if (count < 0) 78 if (count < 0)
50 return count; 79 return count;
51 BUG_ON(size < count); 80 sz = min_t(long, count, SZ_128K);
52 while (count > 0) { 81 clear_pmem(dax.addr, sz);
53 unsigned pgsz = PAGE_SIZE - offset_in_page(addr); 82 dax.size -= sz;
54 if (pgsz > count) 83 dax.sector += sz / 512;
55 pgsz = count; 84 dax_unmap_atomic(bdev, &dax);
56 clear_pmem(addr, pgsz); 85 cond_resched();
57 addr += pgsz; 86 } while (dax.size);
58 size -= pgsz;
59 count -= pgsz;
60 BUG_ON(pgsz & 511);
61 sector += pgsz / 512;
62 cond_resched();
63 }
64 } while (size);
65 87
66 wmb_pmem(); 88 wmb_pmem();
67 return 0; 89 return 0;
68} 90}
69EXPORT_SYMBOL_GPL(dax_clear_blocks); 91EXPORT_SYMBOL_GPL(dax_clear_blocks);
70 92
71static long dax_get_addr(struct buffer_head *bh, void __pmem **addr,
72 unsigned blkbits)
73{
74 unsigned long pfn;
75 sector_t sector = bh->b_blocknr << (blkbits - 9);
76 return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size);
77}
78
79/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */ 93/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
80static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first, 94static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
81 loff_t pos, loff_t end) 95 loff_t pos, loff_t end)
@@ -105,19 +119,29 @@ static bool buffer_size_valid(struct buffer_head *bh)
105 return bh->b_state != 0; 119 return bh->b_state != 0;
106} 120}
107 121
122
123static sector_t to_sector(const struct buffer_head *bh,
124 const struct inode *inode)
125{
126 sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
127
128 return sector;
129}
130
108static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, 131static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
109 loff_t start, loff_t end, get_block_t get_block, 132 loff_t start, loff_t end, get_block_t get_block,
110 struct buffer_head *bh) 133 struct buffer_head *bh)
111{ 134{
112 ssize_t retval = 0; 135 loff_t pos = start, max = start, bh_max = start;
113 loff_t pos = start; 136 bool hole = false, need_wmb = false;
114 loff_t max = start; 137 struct block_device *bdev = NULL;
115 loff_t bh_max = start; 138 int rw = iov_iter_rw(iter), rc;
116 void __pmem *addr; 139 long map_len = 0;
117 bool hole = false; 140 struct blk_dax_ctl dax = {
118 bool need_wmb = false; 141 .addr = (void __pmem *) ERR_PTR(-EIO),
119 142 };
120 if (iov_iter_rw(iter) != WRITE) 143
144 if (rw == READ)
121 end = min(end, i_size_read(inode)); 145 end = min(end, i_size_read(inode));
122 146
123 while (pos < end) { 147 while (pos < end) {
@@ -132,13 +156,13 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
132 if (pos == bh_max) { 156 if (pos == bh_max) {
133 bh->b_size = PAGE_ALIGN(end - pos); 157 bh->b_size = PAGE_ALIGN(end - pos);
134 bh->b_state = 0; 158 bh->b_state = 0;
135 retval = get_block(inode, block, bh, 159 rc = get_block(inode, block, bh, rw == WRITE);
136 iov_iter_rw(iter) == WRITE); 160 if (rc)
137 if (retval)
138 break; 161 break;
139 if (!buffer_size_valid(bh)) 162 if (!buffer_size_valid(bh))
140 bh->b_size = 1 << blkbits; 163 bh->b_size = 1 << blkbits;
141 bh_max = pos - first + bh->b_size; 164 bh_max = pos - first + bh->b_size;
165 bdev = bh->b_bdev;
142 } else { 166 } else {
143 unsigned done = bh->b_size - 167 unsigned done = bh->b_size -
144 (bh_max - (pos - first)); 168 (bh_max - (pos - first));
@@ -146,47 +170,53 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
146 bh->b_size -= done; 170 bh->b_size -= done;
147 } 171 }
148 172
149 hole = iov_iter_rw(iter) != WRITE && !buffer_written(bh); 173 hole = rw == READ && !buffer_written(bh);
150 if (hole) { 174 if (hole) {
151 addr = NULL;
152 size = bh->b_size - first; 175 size = bh->b_size - first;
153 } else { 176 } else {
154 retval = dax_get_addr(bh, &addr, blkbits); 177 dax_unmap_atomic(bdev, &dax);
155 if (retval < 0) 178 dax.sector = to_sector(bh, inode);
179 dax.size = bh->b_size;
180 map_len = dax_map_atomic(bdev, &dax);
181 if (map_len < 0) {
182 rc = map_len;
156 break; 183 break;
184 }
157 if (buffer_unwritten(bh) || buffer_new(bh)) { 185 if (buffer_unwritten(bh) || buffer_new(bh)) {
158 dax_new_buf(addr, retval, first, pos, 186 dax_new_buf(dax.addr, map_len, first,
159 end); 187 pos, end);
160 need_wmb = true; 188 need_wmb = true;
161 } 189 }
162 addr += first; 190 dax.addr += first;
163 size = retval - first; 191 size = map_len - first;
164 } 192 }
165 max = min(pos + size, end); 193 max = min(pos + size, end);
166 } 194 }
167 195
168 if (iov_iter_rw(iter) == WRITE) { 196 if (iov_iter_rw(iter) == WRITE) {
169 len = copy_from_iter_pmem(addr, max - pos, iter); 197 len = copy_from_iter_pmem(dax.addr, max - pos, iter);
170 need_wmb = true; 198 need_wmb = true;
171 } else if (!hole) 199 } else if (!hole)
172 len = copy_to_iter((void __force *)addr, max - pos, 200 len = copy_to_iter((void __force *) dax.addr, max - pos,
173 iter); 201 iter);
174 else 202 else
175 len = iov_iter_zero(max - pos, iter); 203 len = iov_iter_zero(max - pos, iter);
176 204
177 if (!len) { 205 if (!len) {
178 retval = -EFAULT; 206 rc = -EFAULT;
179 break; 207 break;
180 } 208 }
181 209
182 pos += len; 210 pos += len;
183 addr += len; 211 if (!IS_ERR(dax.addr))
212 dax.addr += len;
184 } 213 }
185 214
186 if (need_wmb) 215 if (need_wmb)
187 wmb_pmem(); 216 wmb_pmem();
217 dax_unmap_atomic(bdev, &dax);
188 218
189 return (pos == start) ? retval : pos - start; 219 return (pos == start) ? rc : pos - start;
190} 220}
191 221
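The rename from 'retval' to 'rc' does not change dax_io()'s return convention: an error is only surfaced when no bytes were transferred, otherwise partial progress wins. A standalone sketch of that final expression (userspace C; -14 stands in for -EFAULT):

#include <stdio.h>

static long long dax_io_result(long long start, long long pos, int rc)
{
	return (pos == start) ? rc : pos - start;
}

int main(void)
{
	printf("%lld\n", dax_io_result(0, 0, -14));	/* no progress: the error (-EFAULT) */
	printf("%lld\n", dax_io_result(0, 4096, -14));	/* progress made: 4096 */
	return 0;
}
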
192/** 222/**
@@ -275,28 +305,35 @@ static int dax_load_hole(struct address_space *mapping, struct page *page,
275 return VM_FAULT_LOCKED; 305 return VM_FAULT_LOCKED;
276} 306}
277 307
278static int copy_user_bh(struct page *to, struct buffer_head *bh, 308static int copy_user_bh(struct page *to, struct inode *inode,
279 unsigned blkbits, unsigned long vaddr) 309 struct buffer_head *bh, unsigned long vaddr)
280{ 310{
281 void __pmem *vfrom; 311 struct blk_dax_ctl dax = {
312 .sector = to_sector(bh, inode),
313 .size = bh->b_size,
314 };
315 struct block_device *bdev = bh->b_bdev;
282 void *vto; 316 void *vto;
283 317
284 if (dax_get_addr(bh, &vfrom, blkbits) < 0) 318 if (dax_map_atomic(bdev, &dax) < 0)
285 return -EIO; 319 return PTR_ERR(dax.addr);
286 vto = kmap_atomic(to); 320 vto = kmap_atomic(to);
287 copy_user_page(vto, (void __force *)vfrom, vaddr, to); 321 copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
288 kunmap_atomic(vto); 322 kunmap_atomic(vto);
323 dax_unmap_atomic(bdev, &dax);
289 return 0; 324 return 0;
290} 325}
291 326
292static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, 327static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
293 struct vm_area_struct *vma, struct vm_fault *vmf) 328 struct vm_area_struct *vma, struct vm_fault *vmf)
294{ 329{
295 struct address_space *mapping = inode->i_mapping;
296 sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
297 unsigned long vaddr = (unsigned long)vmf->virtual_address; 330 unsigned long vaddr = (unsigned long)vmf->virtual_address;
298 void __pmem *addr; 331 struct address_space *mapping = inode->i_mapping;
299 unsigned long pfn; 332 struct block_device *bdev = bh->b_bdev;
333 struct blk_dax_ctl dax = {
334 .sector = to_sector(bh, inode),
335 .size = bh->b_size,
336 };
300 pgoff_t size; 337 pgoff_t size;
301 int error; 338 int error;
302 339
@@ -315,20 +352,18 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
315 goto out; 352 goto out;
316 } 353 }
317 354
318 error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size); 355 if (dax_map_atomic(bdev, &dax) < 0) {
319 if (error < 0) 356 error = PTR_ERR(dax.addr);
320 goto out;
321 if (error < PAGE_SIZE) {
322 error = -EIO;
323 goto out; 357 goto out;
324 } 358 }
325 359
326 if (buffer_unwritten(bh) || buffer_new(bh)) { 360 if (buffer_unwritten(bh) || buffer_new(bh)) {
327 clear_pmem(addr, PAGE_SIZE); 361 clear_pmem(dax.addr, PAGE_SIZE);
328 wmb_pmem(); 362 wmb_pmem();
329 } 363 }
364 dax_unmap_atomic(bdev, &dax);
330 365
331 error = vm_insert_mixed(vma, vaddr, pfn); 366 error = vm_insert_mixed(vma, vaddr, dax.pfn);
332 367
333 out: 368 out:
334 i_mmap_unlock_read(mapping); 369 i_mmap_unlock_read(mapping);
@@ -422,7 +457,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
422 if (vmf->cow_page) { 457 if (vmf->cow_page) {
423 struct page *new_page = vmf->cow_page; 458 struct page *new_page = vmf->cow_page;
424 if (buffer_written(&bh)) 459 if (buffer_written(&bh))
425 error = copy_user_bh(new_page, &bh, blkbits, vaddr); 460 error = copy_user_bh(new_page, inode, &bh, vaddr);
426 else 461 else
427 clear_user_highpage(new_page, vaddr); 462 clear_user_highpage(new_page, vaddr);
428 if (error) 463 if (error)
@@ -523,6 +558,24 @@ EXPORT_SYMBOL_GPL(dax_fault);
523 */ 558 */
524#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) 559#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
525 560
561static void __dax_dbg(struct buffer_head *bh, unsigned long address,
562 const char *reason, const char *fn)
563{
564 if (bh) {
565 char bname[BDEVNAME_SIZE];
566 bdevname(bh->b_bdev, bname);
567 pr_debug("%s: %s addr: %lx dev %s state %lx start %lld "
568 "length %zd fallback: %s\n", fn, current->comm,
569 address, bname, bh->b_state, (u64)bh->b_blocknr,
570 bh->b_size, reason);
571 } else {
572 pr_debug("%s: %s addr: %lx fallback: %s\n", fn,
573 current->comm, address, reason);
574 }
575}
576
577#define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd")
578
526int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, 579int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
527 pmd_t *pmd, unsigned int flags, get_block_t get_block, 580 pmd_t *pmd, unsigned int flags, get_block_t get_block,
528 dax_iodone_t complete_unwritten) 581 dax_iodone_t complete_unwritten)
@@ -534,41 +587,49 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
534 unsigned blkbits = inode->i_blkbits; 587 unsigned blkbits = inode->i_blkbits;
535 unsigned long pmd_addr = address & PMD_MASK; 588 unsigned long pmd_addr = address & PMD_MASK;
536 bool write = flags & FAULT_FLAG_WRITE; 589 bool write = flags & FAULT_FLAG_WRITE;
537 long length; 590 struct block_device *bdev;
538 void __pmem *kaddr;
539 pgoff_t size, pgoff; 591 pgoff_t size, pgoff;
540 sector_t block, sector; 592 sector_t block;
541 unsigned long pfn;
542 int result = 0; 593 int result = 0;
543 594
544 /* dax pmd mappings are broken wrt gup and fork */ 595 /* dax pmd mappings require pfn_t_devmap() */
545 if (!IS_ENABLED(CONFIG_FS_DAX_PMD)) 596 if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
546 return VM_FAULT_FALLBACK; 597 return VM_FAULT_FALLBACK;
547 598
548 /* Fall back to PTEs if we're going to COW */ 599 /* Fall back to PTEs if we're going to COW */
549 if (write && !(vma->vm_flags & VM_SHARED)) 600 if (write && !(vma->vm_flags & VM_SHARED)) {
601 split_huge_pmd(vma, pmd, address);
602 dax_pmd_dbg(NULL, address, "cow write");
550 return VM_FAULT_FALLBACK; 603 return VM_FAULT_FALLBACK;
604 }
551 /* If the PMD would extend outside the VMA */ 605 /* If the PMD would extend outside the VMA */
552 if (pmd_addr < vma->vm_start) 606 if (pmd_addr < vma->vm_start) {
607 dax_pmd_dbg(NULL, address, "vma start unaligned");
553 return VM_FAULT_FALLBACK; 608 return VM_FAULT_FALLBACK;
554 if ((pmd_addr + PMD_SIZE) > vma->vm_end) 609 }
610 if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
611 dax_pmd_dbg(NULL, address, "vma end unaligned");
555 return VM_FAULT_FALLBACK; 612 return VM_FAULT_FALLBACK;
613 }
556 614
557 pgoff = linear_page_index(vma, pmd_addr); 615 pgoff = linear_page_index(vma, pmd_addr);
558 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; 616 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
559 if (pgoff >= size) 617 if (pgoff >= size)
560 return VM_FAULT_SIGBUS; 618 return VM_FAULT_SIGBUS;
561 /* If the PMD would cover blocks out of the file */ 619 /* If the PMD would cover blocks out of the file */
562 if ((pgoff | PG_PMD_COLOUR) >= size) 620 if ((pgoff | PG_PMD_COLOUR) >= size) {
621 dax_pmd_dbg(NULL, address,
622 "offset + huge page size > file size");
563 return VM_FAULT_FALLBACK; 623 return VM_FAULT_FALLBACK;
624 }
564 625
565 memset(&bh, 0, sizeof(bh)); 626 memset(&bh, 0, sizeof(bh));
566 block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); 627 block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
567 628
568 bh.b_size = PMD_SIZE; 629 bh.b_size = PMD_SIZE;
569 length = get_block(inode, block, &bh, write); 630 if (get_block(inode, block, &bh, write) != 0)
570 if (length)
571 return VM_FAULT_SIGBUS; 631 return VM_FAULT_SIGBUS;
632 bdev = bh.b_bdev;
572 i_mmap_lock_read(mapping); 633 i_mmap_lock_read(mapping);
573 634
574 /* 635 /*
@@ -576,8 +637,10 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
576 * just fall back to PTEs. Calling get_block 512 times in a loop 637 * just fall back to PTEs. Calling get_block 512 times in a loop
577 * would be silly. 638 * would be silly.
578 */ 639 */
579 if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) 640 if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
641 dax_pmd_dbg(&bh, address, "allocated block too small");
580 goto fallback; 642 goto fallback;
643 }
581 644
582 /* 645 /*
583 * If we allocated new storage, make sure no process has any 646 * If we allocated new storage, make sure no process has any
@@ -600,57 +663,82 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
600 result = VM_FAULT_SIGBUS; 663 result = VM_FAULT_SIGBUS;
601 goto out; 664 goto out;
602 } 665 }
603 if ((pgoff | PG_PMD_COLOUR) >= size) 666 if ((pgoff | PG_PMD_COLOUR) >= size) {
667 dax_pmd_dbg(&bh, address, "pgoff unaligned");
604 goto fallback; 668 goto fallback;
669 }
605 670
606 if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) { 671 if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
607 spinlock_t *ptl; 672 spinlock_t *ptl;
608 pmd_t entry; 673 pmd_t entry;
609 struct page *zero_page = get_huge_zero_page(); 674 struct page *zero_page = get_huge_zero_page();
610 675
611 if (unlikely(!zero_page)) 676 if (unlikely(!zero_page)) {
677 dax_pmd_dbg(&bh, address, "no zero page");
612 goto fallback; 678 goto fallback;
679 }
613 680
614 ptl = pmd_lock(vma->vm_mm, pmd); 681 ptl = pmd_lock(vma->vm_mm, pmd);
615 if (!pmd_none(*pmd)) { 682 if (!pmd_none(*pmd)) {
616 spin_unlock(ptl); 683 spin_unlock(ptl);
684 dax_pmd_dbg(&bh, address, "pmd already present");
617 goto fallback; 685 goto fallback;
618 } 686 }
619 687
688 dev_dbg(part_to_dev(bdev->bd_part),
689 "%s: %s addr: %lx pfn: <zero> sect: %llx\n",
690 __func__, current->comm, address,
691 (unsigned long long) to_sector(&bh, inode));
692
620 entry = mk_pmd(zero_page, vma->vm_page_prot); 693 entry = mk_pmd(zero_page, vma->vm_page_prot);
621 entry = pmd_mkhuge(entry); 694 entry = pmd_mkhuge(entry);
622 set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry); 695 set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
623 result = VM_FAULT_NOPAGE; 696 result = VM_FAULT_NOPAGE;
624 spin_unlock(ptl); 697 spin_unlock(ptl);
625 } else { 698 } else {
626 sector = bh.b_blocknr << (blkbits - 9); 699 struct blk_dax_ctl dax = {
627 length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn, 700 .sector = to_sector(&bh, inode),
628 bh.b_size); 701 .size = PMD_SIZE,
702 };
703 long length = dax_map_atomic(bdev, &dax);
704
629 if (length < 0) { 705 if (length < 0) {
630 result = VM_FAULT_SIGBUS; 706 result = VM_FAULT_SIGBUS;
631 goto out; 707 goto out;
632 } 708 }
633 if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR)) 709 if (length < PMD_SIZE) {
710 dax_pmd_dbg(&bh, address, "dax-length too small");
711 dax_unmap_atomic(bdev, &dax);
634 goto fallback; 712 goto fallback;
713 }
714 if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
715 dax_pmd_dbg(&bh, address, "pfn unaligned");
716 dax_unmap_atomic(bdev, &dax);
717 goto fallback;
718 }
635 719
636 /* 720 if (!pfn_t_devmap(dax.pfn)) {
637 * TODO: teach vmf_insert_pfn_pmd() to support 721 dax_unmap_atomic(bdev, &dax);
638 * 'pte_special' for pmds 722 dax_pmd_dbg(&bh, address, "pfn not in memmap");
639 */
640 if (pfn_valid(pfn))
641 goto fallback; 723 goto fallback;
724 }
642 725
643 if (buffer_unwritten(&bh) || buffer_new(&bh)) { 726 if (buffer_unwritten(&bh) || buffer_new(&bh)) {
644 int i; 727 clear_pmem(dax.addr, PMD_SIZE);
645 for (i = 0; i < PTRS_PER_PMD; i++)
646 clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE);
647 wmb_pmem(); 728 wmb_pmem();
648 count_vm_event(PGMAJFAULT); 729 count_vm_event(PGMAJFAULT);
649 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); 730 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
650 result |= VM_FAULT_MAJOR; 731 result |= VM_FAULT_MAJOR;
651 } 732 }
652 733 dax_unmap_atomic(bdev, &dax);
653 result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write); 734
735 dev_dbg(part_to_dev(bdev->bd_part),
736 "%s: %s addr: %lx pfn: %lx sect: %llx\n",
737 __func__, current->comm, address,
738 pfn_t_to_pfn(dax.pfn),
739 (unsigned long long) dax.sector);
740 result |= vmf_insert_pfn_pmd(vma, address, pmd,
741 dax.pfn, write);
654 } 742 }
655 743
656 out: 744 out:
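Two of the fallback conditions that now log through dax_pmd_dbg() are pure arithmetic: the 2M-aligned chunk of the file around the faulting offset must lie entirely below the file size, and the pfn the driver hands back must itself be 2M-aligned. A standalone sketch of both tests (userspace C; 4K pages and 2M PMDs are assumed only for illustration):

#include <stdio.h>

#define PAGE_SHIFT	12
#define PMD_SHIFT	21
#define PG_PMD_COLOUR	((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1)	/* 511 */

int main(void)
{
	unsigned long size = 4096;	/* file size in pages (16 MiB of 4K pages) */
	unsigned long pgoff = 1024;	/* faulting page offset, 2M chunk fits in the file */
	unsigned long pfn = 0x12345;	/* hypothetical pfn, not 2M-aligned */

	printf("chunk inside file: %d\n", (pgoff | PG_PMD_COLOUR) < size);
	printf("pfn aligned:       %d\n", (pfn & PG_PMD_COLOUR) == 0);
	return 0;
}
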
@@ -752,12 +840,17 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
752 if (err < 0) 840 if (err < 0)
753 return err; 841 return err;
754 if (buffer_written(&bh)) { 842 if (buffer_written(&bh)) {
755 void __pmem *addr; 843 struct block_device *bdev = bh.b_bdev;
756 err = dax_get_addr(&bh, &addr, inode->i_blkbits); 844 struct blk_dax_ctl dax = {
757 if (err < 0) 845 .sector = to_sector(&bh, inode),
758 return err; 846 .size = PAGE_CACHE_SIZE,
759 clear_pmem(addr + offset, length); 847 };
848
849 if (dax_map_atomic(bdev, &dax) < 0)
850 return PTR_ERR(dax.addr);
851 clear_pmem(dax.addr + offset, length);
760 wmb_pmem(); 852 wmb_pmem();
853 dax_unmap_atomic(bdev, &dax);
761 } 854 }
762 855
763 return 0; 856 return 0;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 023f6a1f23cd..6915c950e6e8 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -677,9 +677,7 @@ void wbc_account_io(struct writeback_control *wbc, struct page *page,
677 if (!wbc->wb) 677 if (!wbc->wb)
678 return; 678 return;
679 679
680 rcu_read_lock();
681 id = mem_cgroup_css_from_page(page)->id; 680 id = mem_cgroup_css_from_page(page)->id;
682 rcu_read_unlock();
683 681
684 if (id == wbc->wb_id) { 682 if (id == wbc->wb_id) {
685 wbc->wb_bytes += bytes; 683 wbc->wb_bytes += bytes;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 47789292a582..8bbf7f3e2a27 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -324,11 +324,48 @@ static void remove_huge_page(struct page *page)
324 delete_from_page_cache(page); 324 delete_from_page_cache(page);
325} 325}
326 326
327static void
328hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
329{
330 struct vm_area_struct *vma;
331
332 /*
333 * end == 0 indicates that the entire range after
334 * start should be unmapped.
335 */
336 vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
337 unsigned long v_offset;
338 unsigned long v_end;
339
340 /*
341 * Can the expression below overflow on 32-bit arches?
342 * No, because the interval tree returns us only those vmas
343 * which overlap the truncated area starting at pgoff,
344 * and no vma on a 32-bit arch can span beyond the 4GB.
345 */
346 if (vma->vm_pgoff < start)
347 v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
348 else
349 v_offset = 0;
350
351 if (!end)
352 v_end = vma->vm_end;
353 else {
354 v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT)
355 + vma->vm_start;
356 if (v_end > vma->vm_end)
357 v_end = vma->vm_end;
358 }
359
360 unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
361 NULL);
362 }
363}
327 364
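hugetlb_vmdelete_list() above (moved up so the truncate and hole-punch paths can share it) converts a page-offset range into per-VMA virtual addresses; the rewritten version computes a local v_end based on vm_pgoff instead of reusing and modifying 'end' as the copy being removed further down did. A standalone sketch of that arithmetic (userspace C, hypothetical values, 4K pages assumed):

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	/* hypothetical VMA mapping file pages [256, 768) */
	unsigned long vm_pgoff = 256;
	unsigned long vm_start = 0x70000000UL;
	unsigned long vm_end = vm_start + ((768 - 256) << PAGE_SHIFT);
	unsigned long start = 300, end = 500;	/* hole-punch range, in pages */
	unsigned long v_offset, v_end;

	v_offset = (vm_pgoff < start) ? (start - vm_pgoff) << PAGE_SHIFT : 0;

	if (!end)
		v_end = vm_end;			/* 0 means "to the end of the file" */
	else {
		v_end = ((end - vm_pgoff) << PAGE_SHIFT) + vm_start;
		if (v_end > vm_end)
			v_end = vm_end;
	}

	printf("unmap [%#lx, %#lx)\n", vm_start + v_offset, v_end);
	return 0;
}
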
328/* 365/*
329 * remove_inode_hugepages handles two distinct cases: truncation and hole 366 * remove_inode_hugepages handles two distinct cases: truncation and hole
330 * punch. There are subtle differences in operation for each case. 367 * punch. There are subtle differences in operation for each case.
331 368 *
332 * truncation is indicated by end of range being LLONG_MAX 369 * truncation is indicated by end of range being LLONG_MAX
333 * In this case, we first scan the range and release found pages. 370 * In this case, we first scan the range and release found pages.
334 * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv 371 * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
@@ -379,6 +416,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
379 416
380 for (i = 0; i < pagevec_count(&pvec); ++i) { 417 for (i = 0; i < pagevec_count(&pvec); ++i) {
381 struct page *page = pvec.pages[i]; 418 struct page *page = pvec.pages[i];
419 bool rsv_on_error;
382 u32 hash; 420 u32 hash;
383 421
384 /* 422 /*
@@ -395,37 +433,43 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
395 mapping, next, 0); 433 mapping, next, 0);
396 mutex_lock(&hugetlb_fault_mutex_table[hash]); 434 mutex_lock(&hugetlb_fault_mutex_table[hash]);
397 435
398 lock_page(page); 436 /*
399 if (likely(!page_mapped(page))) { 437 * If page is mapped, it was faulted in after being
400 bool rsv_on_error = !PagePrivate(page); 438 * unmapped in caller. Unmap (again) now after taking
401 /* 439 * the fault mutex. The mutex will prevent faults
402 * We must free the huge page and remove 440 * until we finish removing the page.
403 * from page cache (remove_huge_page) BEFORE 441 *
404 * removing the region/reserve map 442 * This race can only happen in the hole punch case.
405 * (hugetlb_unreserve_pages). In rare out 443 * Getting here in a truncate operation is a bug.
406 * of memory conditions, removal of the 444 */
407 * region/reserve map could fail. Before 445 if (unlikely(page_mapped(page))) {
408 * free'ing the page, note PagePrivate which
409 * is used in case of error.
410 */
411 remove_huge_page(page);
412 freed++;
413 if (!truncate_op) {
414 if (unlikely(hugetlb_unreserve_pages(
415 inode, next,
416 next + 1, 1)))
417 hugetlb_fix_reserve_counts(
418 inode, rsv_on_error);
419 }
420 } else {
421 /*
422 * If page is mapped, it was faulted in after
423 * being unmapped. It indicates a race between
424 * hole punch and page fault. Do nothing in
425 * this case. Getting here in a truncate
426 * operation is a bug.
427 */
428 BUG_ON(truncate_op); 446 BUG_ON(truncate_op);
447
448 i_mmap_lock_write(mapping);
449 hugetlb_vmdelete_list(&mapping->i_mmap,
450 next * pages_per_huge_page(h),
451 (next + 1) * pages_per_huge_page(h));
452 i_mmap_unlock_write(mapping);
453 }
454
455 lock_page(page);
456 /*
457 * We must free the huge page and remove from page
458 * cache (remove_huge_page) BEFORE removing the
459 * region/reserve map (hugetlb_unreserve_pages). In
460 * rare out of memory conditions, removal of the
461 * region/reserve map could fail. Before free'ing
462 * the page, note PagePrivate which is used in case
463 * of error.
464 */
465 rsv_on_error = !PagePrivate(page);
466 remove_huge_page(page);
467 freed++;
468 if (!truncate_op) {
469 if (unlikely(hugetlb_unreserve_pages(inode,
470 next, next + 1, 1)))
471 hugetlb_fix_reserve_counts(inode,
472 rsv_on_error);
429 } 473 }
430 474
431 unlock_page(page); 475 unlock_page(page);
@@ -452,41 +496,6 @@ static void hugetlbfs_evict_inode(struct inode *inode)
452 clear_inode(inode); 496 clear_inode(inode);
453} 497}
454 498
455static inline void
456hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
457{
458 struct vm_area_struct *vma;
459
460 /*
461 * end == 0 indicates that the entire range after
462 * start should be unmapped.
463 */
464 vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
465 unsigned long v_offset;
466
467 /*
468 * Can the expression below overflow on 32-bit arches?
469 * No, because the interval tree returns us only those vmas
470 * which overlap the truncated area starting at pgoff,
471 * and no vma on a 32-bit arch can span beyond the 4GB.
472 */
473 if (vma->vm_pgoff < start)
474 v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
475 else
476 v_offset = 0;
477
478 if (end) {
479 end = ((end - start) << PAGE_SHIFT) +
480 vma->vm_start + v_offset;
481 if (end > vma->vm_end)
482 end = vma->vm_end;
483 } else
484 end = vma->vm_end;
485
486 unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
487 }
488}
489
490static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) 499static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
491{ 500{
492 pgoff_t pgoff; 501 pgoff_t pgoff;
@@ -708,7 +717,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
708/* 717/*
709 * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never 718 * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
710 * be taken from reclaim -- unlike regular filesystems. This needs an 719 * be taken from reclaim -- unlike regular filesystems. This needs an
711 * annotation because huge_pmd_share() does an allocation under 720 * annotation because huge_pmd_share() does an allocation under hugetlb's
712 * i_mmap_rwsem. 721 * i_mmap_rwsem.
713 */ 722 */
714static struct lock_class_key hugetlbfs_i_mmap_rwsem_key; 723static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 93484034a03d..b2855eea5405 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -103,9 +103,9 @@ u64 stable_page_flags(struct page *page)
103 * pseudo flags for the well known (anonymous) memory mapped pages 103 * pseudo flags for the well known (anonymous) memory mapped pages
104 * 104 *
105 * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the 105 * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the
106 * simple test in page_mapped() is not enough. 106 * simple test in page_mapcount() is not enough.
107 */ 107 */
108 if (!PageSlab(page) && page_mapped(page)) 108 if (!PageSlab(page) && page_mapcount(page))
109 u |= 1 << KPF_MMAP; 109 u |= 1 << KPF_MMAP;
110 if (PageAnon(page)) 110 if (PageAnon(page))
111 u |= 1 << KPF_ANON; 111 u |= 1 << KPF_ANON;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index a353b4c6e86e..65a1b6c69c11 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -466,9 +466,10 @@ struct mem_size_stats {
466}; 466};
467 467
468static void smaps_account(struct mem_size_stats *mss, struct page *page, 468static void smaps_account(struct mem_size_stats *mss, struct page *page,
469 unsigned long size, bool young, bool dirty) 469 bool compound, bool young, bool dirty)
470{ 470{
471 int mapcount; 471 int i, nr = compound ? HPAGE_PMD_NR : 1;
472 unsigned long size = nr * PAGE_SIZE;
472 473
473 if (PageAnon(page)) 474 if (PageAnon(page))
474 mss->anonymous += size; 475 mss->anonymous += size;
@@ -477,23 +478,37 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
477 /* Accumulate the size in pages that have been accessed. */ 478 /* Accumulate the size in pages that have been accessed. */
478 if (young || page_is_young(page) || PageReferenced(page)) 479 if (young || page_is_young(page) || PageReferenced(page))
479 mss->referenced += size; 480 mss->referenced += size;
480 mapcount = page_mapcount(page);
481 if (mapcount >= 2) {
482 u64 pss_delta;
483 481
484 if (dirty || PageDirty(page)) 482 /*
485 mss->shared_dirty += size; 483 * page_count(page) == 1 guarantees the page is mapped exactly once.
486 else 484 * If any subpage of the compound page mapped with PTE it would elevate
487 mss->shared_clean += size; 485 * page_count().
488 pss_delta = (u64)size << PSS_SHIFT; 486 */
489 do_div(pss_delta, mapcount); 487 if (page_count(page) == 1) {
490 mss->pss += pss_delta;
491 } else {
492 if (dirty || PageDirty(page)) 488 if (dirty || PageDirty(page))
493 mss->private_dirty += size; 489 mss->private_dirty += size;
494 else 490 else
495 mss->private_clean += size; 491 mss->private_clean += size;
496 mss->pss += (u64)size << PSS_SHIFT; 492 mss->pss += (u64)size << PSS_SHIFT;
493 return;
494 }
495
496 for (i = 0; i < nr; i++, page++) {
497 int mapcount = page_mapcount(page);
498
499 if (mapcount >= 2) {
500 if (dirty || PageDirty(page))
501 mss->shared_dirty += PAGE_SIZE;
502 else
503 mss->shared_clean += PAGE_SIZE;
504 mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
505 } else {
506 if (dirty || PageDirty(page))
507 mss->private_dirty += PAGE_SIZE;
508 else
509 mss->private_clean += PAGE_SIZE;
510 mss->pss += PAGE_SIZE << PSS_SHIFT;
511 }
497 } 512 }
498} 513}
499 514
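With the THP rework a compound page no longer has one meaningful mapcount, so smaps_account() now walks the subpages: the page_count() == 1 case is charged entirely to the process, otherwise each 4K subpage adds PAGE_SIZE/mapcount of proportional set size. A standalone sketch of that accumulation (userspace C; PSS_SHIFT 12 matches the value defined earlier in fs/proc/task_mmu.c):

#include <stdio.h>

#define PAGE_SIZE	4096ULL
#define PSS_SHIFT	12

int main(void)
{
	/* hypothetical mapcounts for four subpages of a compound page */
	int mapcount[] = { 1, 2, 2, 4 };
	unsigned long long pss = 0;
	unsigned int i;

	for (i = 0; i < sizeof(mapcount) / sizeof(mapcount[0]); i++)
		pss += (PAGE_SIZE << PSS_SHIFT) / mapcount[i];

	/* 4096 + 2048 + 2048 + 1024 = 9216 bytes of proportional set size */
	printf("pss = %llu bytes\n", pss >> PSS_SHIFT);
	return 0;
}
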
@@ -554,7 +569,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
554 569
555 if (!page) 570 if (!page)
556 return; 571 return;
557 smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte)); 572
573 smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte));
558} 574}
559 575
560#ifdef CONFIG_TRANSPARENT_HUGEPAGE 576#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -570,8 +586,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
570 if (IS_ERR_OR_NULL(page)) 586 if (IS_ERR_OR_NULL(page))
571 return; 587 return;
572 mss->anonymous_thp += HPAGE_PMD_SIZE; 588 mss->anonymous_thp += HPAGE_PMD_SIZE;
573 smaps_account(mss, page, HPAGE_PMD_SIZE, 589 smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd));
574 pmd_young(*pmd), pmd_dirty(*pmd));
575} 590}
576#else 591#else
577static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, 592static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
@@ -587,7 +602,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
587 pte_t *pte; 602 pte_t *pte;
588 spinlock_t *ptl; 603 spinlock_t *ptl;
589 604
590 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 605 if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
591 smaps_pmd_entry(pmd, addr, walk); 606 smaps_pmd_entry(pmd, addr, walk);
592 spin_unlock(ptl); 607 spin_unlock(ptl);
593 return 0; 608 return 0;
@@ -898,7 +913,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
898 spinlock_t *ptl; 913 spinlock_t *ptl;
899 struct page *page; 914 struct page *page;
900 915
901 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 916 if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
902 if (cp->type == CLEAR_REFS_SOFT_DIRTY) { 917 if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
903 clear_soft_dirty_pmd(vma, addr, pmd); 918 clear_soft_dirty_pmd(vma, addr, pmd);
904 goto out; 919 goto out;
@@ -1172,7 +1187,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
1172 int err = 0; 1187 int err = 0;
1173 1188
1174#ifdef CONFIG_TRANSPARENT_HUGEPAGE 1189#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1175 if (pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) { 1190 if (pmd_trans_huge_lock(pmdp, vma, &ptl)) {
1176 u64 flags = 0, frame = 0; 1191 u64 flags = 0, frame = 0;
1177 pmd_t pmd = *pmdp; 1192 pmd_t pmd = *pmdp;
1178 1193
@@ -1504,7 +1519,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
1504 pte_t *orig_pte; 1519 pte_t *orig_pte;
1505 pte_t *pte; 1520 pte_t *pte;
1506 1521
1507 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 1522 if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
1508 pte_t huge_pte = *(pte_t *)pmd; 1523 pte_t huge_pte = *(pte_t *)pmd;
1509 struct page *page; 1524 struct page *page;
1510 1525
diff --git a/fs/stat.c b/fs/stat.c
index d4a61d8dc021..bc045c7994e1 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -219,7 +219,7 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat
219# define choose_32_64(a,b) b 219# define choose_32_64(a,b) b
220#endif 220#endif
221 221
222#define valid_dev(x) choose_32_64(old_valid_dev,new_valid_dev)(x) 222#define valid_dev(x) choose_32_64(old_valid_dev(x),true)
223#define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x) 223#define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x)
224 224
225#ifndef INIT_STRUCT_STAT_PADDING 225#ifndef INIT_STRUCT_STAT_PADDING
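valid_dev() changes because new_valid_dev() (removed from kdev_t.h later in this series) always returned true, so on 64-bit configurations the check collapses to a constant; 32-bit builds still go through old_valid_dev(). A standalone sketch of the collapsed macro (userspace C; the MAJOR/MINOR helpers mirror the kernel's 12:20 dev_t split and the __SIZEOF_LONG__ test stands in for the kernel's BITS_PER_LONG check, both reimplemented here only for illustration):

#include <stdio.h>
#include <stdbool.h>

#define MINORBITS	20
#define MAJOR(dev)	((unsigned int)((dev) >> MINORBITS))
#define MINOR(dev)	((unsigned int)((dev) & ((1U << MINORBITS) - 1)))

static bool old_valid_dev(unsigned int dev)
{
	return MAJOR(dev) < 256 && MINOR(dev) < 256;	/* fits the old 8:8 encoding */
}

#if __SIZEOF_LONG__ == 4
# define choose_32_64(a, b) a
#else
# define choose_32_64(a, b) b
#endif

#define valid_dev(x)	choose_32_64(old_valid_dev(x), true)

int main(void)
{
	unsigned int dev = (300U << MINORBITS) | 5;	/* major 300 does not fit 8 bits */

	printf("valid_dev: %d\n", valid_dev(dev));	/* 1 on 64-bit, 0 on 32-bit */
	return 0;
}
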
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 3a6803cb0ec9..0b3c0d39ef75 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -1,6 +1,8 @@
1#ifndef _ASM_GENERIC_PGTABLE_H 1#ifndef _ASM_GENERIC_PGTABLE_H
2#define _ASM_GENERIC_PGTABLE_H 2#define _ASM_GENERIC_PGTABLE_H
3 3
4#include <linux/pfn.h>
5
4#ifndef __ASSEMBLY__ 6#ifndef __ASSEMBLY__
5#ifdef CONFIG_MMU 7#ifdef CONFIG_MMU
6 8
@@ -207,11 +209,6 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
207#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 209#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
208#endif 210#endif
209 211
210#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
211extern void pmdp_splitting_flush(struct vm_area_struct *vma,
212 unsigned long address, pmd_t *pmdp);
213#endif
214
215#ifndef pmdp_collapse_flush 212#ifndef pmdp_collapse_flush
216#ifdef CONFIG_TRANSPARENT_HUGEPAGE 213#ifdef CONFIG_TRANSPARENT_HUGEPAGE
217extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, 214extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
@@ -554,7 +551,7 @@ static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
554 * by vm_insert_pfn(). 551 * by vm_insert_pfn().
555 */ 552 */
556static inline int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, 553static inline int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
557 unsigned long pfn) 554 pfn_t pfn)
558{ 555{
559 return 0; 556 return 0;
560} 557}
@@ -589,7 +586,7 @@ extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
589 unsigned long pfn, unsigned long addr, 586 unsigned long pfn, unsigned long addr,
590 unsigned long size); 587 unsigned long size);
591extern int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, 588extern int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
592 unsigned long pfn); 589 pfn_t pfn);
593extern int track_pfn_copy(struct vm_area_struct *vma); 590extern int track_pfn_copy(struct vm_area_struct *vma);
594extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, 591extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
595 unsigned long size); 592 unsigned long size);
@@ -627,10 +624,6 @@ static inline int pmd_trans_huge(pmd_t pmd)
627{ 624{
628 return 0; 625 return 0;
629} 626}
630static inline int pmd_trans_splitting(pmd_t pmd)
631{
632 return 0;
633}
634#ifndef __HAVE_ARCH_PMD_WRITE 627#ifndef __HAVE_ARCH_PMD_WRITE
635static inline int pmd_write(pmd_t pmd) 628static inline int pmd_write(pmd_t pmd)
636{ 629{
diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index b58fd667f87b..af0254c09424 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -4,6 +4,7 @@
4/* References to section boundaries */ 4/* References to section boundaries */
5 5
6#include <linux/compiler.h> 6#include <linux/compiler.h>
7#include <linux/types.h>
7 8
8/* 9/*
9 * Usage guidelines: 10 * Usage guidelines:
@@ -63,4 +64,68 @@ static inline int arch_is_kernel_data(unsigned long addr)
63} 64}
64#endif 65#endif
65 66
67/**
68 * memory_contains - checks if an object is contained within a memory region
69 * @begin: virtual address of the beginning of the memory region
70 * @end: virtual address of the end of the memory region
71 * @virt: virtual address of the memory object
72 * @size: size of the memory object
73 *
74 * Returns: true if the object specified by @virt and @size is entirely
75 * contained within the memory region defined by @begin and @end, false
76 * otherwise.
77 */
78static inline bool memory_contains(void *begin, void *end, void *virt,
79 size_t size)
80{
81 return virt >= begin && virt + size <= end;
82}
83
84/**
85 * memory_intersects - checks if the region occupied by an object intersects
86 * with another memory region
 87 * @begin: virtual address of the beginning of the memory region
88 * @end: virtual address of the end of the memory region
89 * @virt: virtual address of the memory object
90 * @size: size of the memory object
91 *
92 * Returns: true if an object's memory region, specified by @virt and @size,
93 * intersects with the region specified by @begin and @end, false otherwise.
94 */
95static inline bool memory_intersects(void *begin, void *end, void *virt,
96 size_t size)
97{
98 void *vend = virt + size;
99
100 return (virt >= begin && virt < end) || (vend >= begin && vend < end);
101}
102
103/**
104 * init_section_contains - checks if an object is contained within the init
105 * section
106 * @virt: virtual address of the memory object
107 * @size: size of the memory object
108 *
109 * Returns: true if the object specified by @virt and @size is entirely
110 * contained within the init section, false otherwise.
111 */
112static inline bool init_section_contains(void *virt, size_t size)
113{
114 return memory_contains(__init_begin, __init_end, virt, size);
115}
116
117/**
118 * init_section_intersects - checks if the region occupied by an object
119 * intersects with the init section
120 * @virt: virtual address of the memory object
121 * @size: size of the memory object
122 *
123 * Returns: true if an object's memory region, specified by @virt and @size,
124 * intersects with the init section, false otherwise.
125 */
126static inline bool init_section_intersects(void *virt, size_t size)
127{
128 return memory_intersects(__init_begin, __init_end, virt, size);
129}
130
66#endif /* _ASM_GENERIC_SECTIONS_H_ */ 131#endif /* _ASM_GENERIC_SECTIONS_H_ */
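The helpers above are plain pointer-range checks against the section bounds exported by the linker script. A standalone sketch of the memory_contains() logic exercised on an ordinary array instead of __init_begin/__init_end (userspace C; char * casts are added because arithmetic on void * is a GNU extension):

#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>

static bool memory_contains(void *begin, void *end, void *virt, size_t size)
{
	return (char *)virt >= (char *)begin &&
	       (char *)virt + size <= (char *)end;
}

static char region[64];	/* stands in for a kernel section */

int main(void)
{
	printf("%d\n", memory_contains(region, region + 64, region + 8, 16));	/* 1 */
	printf("%d\n", memory_contains(region, region + 64, region + 60, 16));	/* 0 */
	return 0;
}
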
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c70e3588a48c..bfb64d672e19 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -15,6 +15,7 @@
15#include <linux/backing-dev-defs.h> 15#include <linux/backing-dev-defs.h>
16#include <linux/wait.h> 16#include <linux/wait.h>
17#include <linux/mempool.h> 17#include <linux/mempool.h>
18#include <linux/pfn.h>
18#include <linux/bio.h> 19#include <linux/bio.h>
19#include <linux/stringify.h> 20#include <linux/stringify.h>
20#include <linux/gfp.h> 21#include <linux/gfp.h>
@@ -1617,6 +1618,20 @@ static inline bool integrity_req_gap_front_merge(struct request *req,
1617 1618
1618#endif /* CONFIG_BLK_DEV_INTEGRITY */ 1619#endif /* CONFIG_BLK_DEV_INTEGRITY */
1619 1620
1621/**
1622 * struct blk_dax_ctl - control and output parameters for ->direct_access
1623 * @sector: (input) offset relative to a block_device
1624 * @addr: (output) kernel virtual address for @sector populated by driver
1625 * @pfn: (output) page frame number for @addr populated by driver
1626 * @size: (input) number of bytes requested
1627 */
1628struct blk_dax_ctl {
1629 sector_t sector;
1630 void __pmem *addr;
1631 long size;
1632 pfn_t pfn;
1633};
1634
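struct blk_dax_ctl bundles what used to be four loose bdev_direct_access() parameters; the driver fills in @addr and @pfn, and the return value says how many bytes are addressable starting at @sector. A sketch of the new calling convention (kernel context assumed, not standalone; 'sector' is a hypothetical input and error handling is reduced to the minimum):

static long example_direct_access(struct block_device *bdev, sector_t sector)
{
	struct blk_dax_ctl dax = {
		.sector = sector,
		.size = PAGE_SIZE,
	};
	long avail = bdev_direct_access(bdev, &dax);

	if (avail < 0)
		return avail;
	/* dax.addr is the __pmem mapping, dax.pfn the pfn_t, valid for 'avail' bytes */
	return avail;
}
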
1620struct block_device_operations { 1635struct block_device_operations {
1621 int (*open) (struct block_device *, fmode_t); 1636 int (*open) (struct block_device *, fmode_t);
1622 void (*release) (struct gendisk *, fmode_t); 1637 void (*release) (struct gendisk *, fmode_t);
@@ -1624,7 +1639,7 @@ struct block_device_operations {
1624 int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); 1639 int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
1625 int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); 1640 int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
1626 long (*direct_access)(struct block_device *, sector_t, void __pmem **, 1641 long (*direct_access)(struct block_device *, sector_t, void __pmem **,
1627 unsigned long *pfn); 1642 pfn_t *);
1628 unsigned int (*check_events) (struct gendisk *disk, 1643 unsigned int (*check_events) (struct gendisk *disk,
1629 unsigned int clearing); 1644 unsigned int clearing);
1630 /* ->media_changed() is DEPRECATED, use ->check_events() instead */ 1645 /* ->media_changed() is DEPRECATED, use ->check_events() instead */
@@ -1643,8 +1658,7 @@ extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int,
1643extern int bdev_read_page(struct block_device *, sector_t, struct page *); 1658extern int bdev_read_page(struct block_device *, sector_t, struct page *);
1644extern int bdev_write_page(struct block_device *, sector_t, struct page *, 1659extern int bdev_write_page(struct block_device *, sector_t, struct page *,
1645 struct writeback_control *); 1660 struct writeback_control *);
1646extern long bdev_direct_access(struct block_device *, sector_t, 1661extern long bdev_direct_access(struct block_device *, struct blk_dax_ctl *);
1647 void __pmem **addr, unsigned long *pfn, long size);
1648#else /* CONFIG_BLOCK */ 1662#else /* CONFIG_BLOCK */
1649 1663
1650struct block_device; 1664struct block_device;
diff --git a/include/linux/console.h b/include/linux/console.h
index bd194343c346..ea731af2451e 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -150,6 +150,7 @@ extern int console_trylock(void);
150extern void console_unlock(void); 150extern void console_unlock(void);
151extern void console_conditional_schedule(void); 151extern void console_conditional_schedule(void);
152extern void console_unblank(void); 152extern void console_unblank(void);
153extern void console_flush_on_panic(void);
153extern struct tty_driver *console_device(int *); 154extern struct tty_driver *console_device(int *);
154extern void console_stop(struct console *); 155extern void console_stop(struct console *);
155extern void console_start(struct console *); 156extern void console_start(struct console *);
diff --git a/include/linux/err.h b/include/linux/err.h
index a729120644d5..56762ab41713 100644
--- a/include/linux/err.h
+++ b/include/linux/err.h
@@ -37,7 +37,7 @@ static inline bool __must_check IS_ERR(__force const void *ptr)
37 37
38static inline bool __must_check IS_ERR_OR_NULL(__force const void *ptr) 38static inline bool __must_check IS_ERR_OR_NULL(__force const void *ptr)
39{ 39{
40 return !ptr || IS_ERR_VALUE((unsigned long)ptr); 40 return unlikely(!ptr) || IS_ERR_VALUE((unsigned long)ptr);
41} 41}
42 42
43/** 43/**
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index ecb080d6ff42..cfe81e10bd54 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -19,13 +19,16 @@ extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
19 unsigned long addr, 19 unsigned long addr,
20 pmd_t *pmd, 20 pmd_t *pmd,
21 unsigned int flags); 21 unsigned int flags);
22extern int madvise_free_huge_pmd(struct mmu_gather *tlb,
23 struct vm_area_struct *vma,
24 pmd_t *pmd, unsigned long addr, unsigned long next);
22extern int zap_huge_pmd(struct mmu_gather *tlb, 25extern int zap_huge_pmd(struct mmu_gather *tlb,
23 struct vm_area_struct *vma, 26 struct vm_area_struct *vma,
24 pmd_t *pmd, unsigned long addr); 27 pmd_t *pmd, unsigned long addr);
25extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 28extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
26 unsigned long addr, unsigned long end, 29 unsigned long addr, unsigned long end,
27 unsigned char *vec); 30 unsigned char *vec);
28extern int move_huge_pmd(struct vm_area_struct *vma, 31extern bool move_huge_pmd(struct vm_area_struct *vma,
29 struct vm_area_struct *new_vma, 32 struct vm_area_struct *new_vma,
30 unsigned long old_addr, 33 unsigned long old_addr,
31 unsigned long new_addr, unsigned long old_end, 34 unsigned long new_addr, unsigned long old_end,
@@ -34,8 +37,7 @@ extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
34 unsigned long addr, pgprot_t newprot, 37 unsigned long addr, pgprot_t newprot,
35 int prot_numa); 38 int prot_numa);
36int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *, 39int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *,
37 unsigned long pfn, bool write); 40 pfn_t pfn, bool write);
38
39enum transparent_hugepage_flag { 41enum transparent_hugepage_flag {
40 TRANSPARENT_HUGEPAGE_FLAG, 42 TRANSPARENT_HUGEPAGE_FLAG,
41 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 43 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
@@ -48,21 +50,13 @@ enum transparent_hugepage_flag {
48#endif 50#endif
49}; 51};
50 52
51enum page_check_address_pmd_flag {
52 PAGE_CHECK_ADDRESS_PMD_FLAG,
53 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG,
54 PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG,
55};
56extern pmd_t *page_check_address_pmd(struct page *page,
57 struct mm_struct *mm,
58 unsigned long address,
59 enum page_check_address_pmd_flag flag,
60 spinlock_t **ptl);
61
62#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) 53#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
63#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER) 54#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
64 55
65#ifdef CONFIG_TRANSPARENT_HUGEPAGE 56#ifdef CONFIG_TRANSPARENT_HUGEPAGE
57struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
58 pmd_t *pmd, int flags);
59
66#define HPAGE_PMD_SHIFT PMD_SHIFT 60#define HPAGE_PMD_SHIFT PMD_SHIFT
67#define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT) 61#define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT)
68#define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1)) 62#define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1))
@@ -95,30 +89,28 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
95#endif /* CONFIG_DEBUG_VM */ 89#endif /* CONFIG_DEBUG_VM */
96 90
97extern unsigned long transparent_hugepage_flags; 91extern unsigned long transparent_hugepage_flags;
98extern int split_huge_page_to_list(struct page *page, struct list_head *list); 92
93extern void prep_transhuge_page(struct page *page);
94extern void free_transhuge_page(struct page *page);
95
96int split_huge_page_to_list(struct page *page, struct list_head *list);
99static inline int split_huge_page(struct page *page) 97static inline int split_huge_page(struct page *page)
100{ 98{
101 return split_huge_page_to_list(page, NULL); 99 return split_huge_page_to_list(page, NULL);
102} 100}
103extern void __split_huge_page_pmd(struct vm_area_struct *vma, 101void deferred_split_huge_page(struct page *page);
104 unsigned long address, pmd_t *pmd); 102
105#define split_huge_page_pmd(__vma, __address, __pmd) \ 103void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
104 unsigned long address);
105
106#define split_huge_pmd(__vma, __pmd, __address) \
106 do { \ 107 do { \
107 pmd_t *____pmd = (__pmd); \ 108 pmd_t *____pmd = (__pmd); \
108 if (unlikely(pmd_trans_huge(*____pmd))) \ 109 if (pmd_trans_huge(*____pmd) \
109 __split_huge_page_pmd(__vma, __address, \ 110 || pmd_devmap(*____pmd)) \
110 ____pmd); \ 111 __split_huge_pmd(__vma, __pmd, __address); \
111 } while (0) 112 } while (0)
112#define wait_split_huge_page(__anon_vma, __pmd) \ 113
113 do { \
114 pmd_t *____pmd = (__pmd); \
115 anon_vma_lock_write(__anon_vma); \
116 anon_vma_unlock_write(__anon_vma); \
117 BUG_ON(pmd_trans_splitting(*____pmd) || \
118 pmd_trans_huge(*____pmd)); \
119 } while (0)
120extern void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
121 pmd_t *pmd);
122#if HPAGE_PMD_ORDER >= MAX_ORDER 114#if HPAGE_PMD_ORDER >= MAX_ORDER
123#error "hugepages can't be allocated by the buddy allocator" 115#error "hugepages can't be allocated by the buddy allocator"
124#endif 116#endif
@@ -128,17 +120,17 @@ extern void vma_adjust_trans_huge(struct vm_area_struct *vma,
128 unsigned long start, 120 unsigned long start,
129 unsigned long end, 121 unsigned long end,
130 long adjust_next); 122 long adjust_next);
131extern int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, 123extern bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
132 spinlock_t **ptl); 124 spinlock_t **ptl);
133/* mmap_sem must be held on entry */ 125/* mmap_sem must be held on entry */
134static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, 126static inline bool pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
135 spinlock_t **ptl) 127 spinlock_t **ptl)
136{ 128{
137 VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma); 129 VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma);
138 if (pmd_trans_huge(*pmd)) 130 if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd))
139 return __pmd_trans_huge_lock(pmd, vma, ptl); 131 return __pmd_trans_huge_lock(pmd, vma, ptl);
140 else 132 else
141 return 0; 133 return false;
142} 134}
143static inline int hpage_nr_pages(struct page *page) 135static inline int hpage_nr_pages(struct page *page)
144{ 136{
@@ -183,11 +175,8 @@ static inline int split_huge_page(struct page *page)
183{ 175{
184 return 0; 176 return 0;
185} 177}
186#define split_huge_page_pmd(__vma, __address, __pmd) \ 178static inline void deferred_split_huge_page(struct page *page) {}
187 do { } while (0) 179#define split_huge_pmd(__vma, __pmd, __address) \
188#define wait_split_huge_page(__anon_vma, __pmd) \
189 do { } while (0)
190#define split_huge_page_pmd_mm(__mm, __address, __pmd) \
191 do { } while (0) 180 do { } while (0)
192static inline int hugepage_madvise(struct vm_area_struct *vma, 181static inline int hugepage_madvise(struct vm_area_struct *vma,
193 unsigned long *vm_flags, int advice) 182 unsigned long *vm_flags, int advice)
@@ -201,10 +190,10 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
201 long adjust_next) 190 long adjust_next)
202{ 191{
203} 192}
204static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, 193static inline bool pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
205 spinlock_t **ptl) 194 spinlock_t **ptl)
206{ 195{
207 return 0; 196 return false;
208} 197}
209 198
210static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, 199static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -218,6 +207,12 @@ static inline bool is_huge_zero_page(struct page *page)
218 return false; 207 return false;
219} 208}
220 209
210
211static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
212 unsigned long addr, pmd_t *pmd, int flags)
213{
214 return NULL;
215}
221#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 216#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
222 217
223#endif /* _LINUX_HUGE_MM_H */ 218#endif /* _LINUX_HUGE_MM_H */
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index e76574d8f9b5..7d953c2542a8 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -8,6 +8,7 @@
8#include <linux/cgroup.h> 8#include <linux/cgroup.h>
9#include <linux/list.h> 9#include <linux/list.h>
10#include <linux/kref.h> 10#include <linux/kref.h>
11#include <asm/pgtable.h>
11 12
12struct ctl_table; 13struct ctl_table;
13struct user_struct; 14struct user_struct;
diff --git a/include/linux/io.h b/include/linux/io.h
index de64c1e53612..fffd88d7f426 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -89,21 +89,6 @@ void devm_memunmap(struct device *dev, void *addr);
89 89
90void *__devm_memremap_pages(struct device *dev, struct resource *res); 90void *__devm_memremap_pages(struct device *dev, struct resource *res);
91 91
92#ifdef CONFIG_ZONE_DEVICE
93void *devm_memremap_pages(struct device *dev, struct resource *res);
94#else
95static inline void *devm_memremap_pages(struct device *dev, struct resource *res)
96{
97 /*
98 * Fail attempts to call devm_memremap_pages() without
99 * ZONE_DEVICE support enabled, this requires callers to fall
100 * back to plain devm_memremap() based on config
101 */
102 WARN_ON_ONCE(1);
103 return ERR_PTR(-ENXIO);
104}
105#endif
106
107/* 92/*
108 * Some systems do not have legacy ISA devices. 93 * Some systems do not have legacy ISA devices.
109 * /dev/port is not a valid interface on these systems. 94 * /dev/port is not a valid interface on these systems.
diff --git a/include/linux/kdev_t.h b/include/linux/kdev_t.h
index 052c7b32cc91..8e9e288b08c1 100644
--- a/include/linux/kdev_t.h
+++ b/include/linux/kdev_t.h
@@ -35,11 +35,6 @@ static inline dev_t old_decode_dev(u16 val)
35 return MKDEV((val >> 8) & 255, val & 255); 35 return MKDEV((val >> 8) & 255, val & 255);
36} 36}
37 37
38static inline bool new_valid_dev(dev_t dev)
39{
40 return 1;
41}
42
43static inline u32 new_encode_dev(dev_t dev) 38static inline u32 new_encode_dev(dev_t dev)
44{ 39{
45 unsigned major = MAJOR(dev); 40 unsigned major = MAJOR(dev);
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 7311c3294e25..f31638c6e873 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -202,26 +202,26 @@ extern int _cond_resched(void);
202 202
203/** 203/**
204 * abs - return absolute value of an argument 204 * abs - return absolute value of an argument
205 * @x: the value. If it is unsigned type, it is converted to signed type first 205 * @x: the value. If it is unsigned type, it is converted to signed type first.
206 * (s64, long or int depending on its size). 206 * char is treated as if it was signed (regardless of whether it really is)
207 * but the macro's return type is preserved as char.
207 * 208 *
208 * Return: an absolute value of x. If x is 64-bit, macro's return type is s64, 209 * Return: an absolute value of x.
209 * otherwise it is signed long.
210 */ 210 */
211#define abs(x) __builtin_choose_expr(sizeof(x) == sizeof(s64), ({ \ 211#define abs(x) __abs_choose_expr(x, long long, \
212 s64 __x = (x); \ 212 __abs_choose_expr(x, long, \
213 (__x < 0) ? -__x : __x; \ 213 __abs_choose_expr(x, int, \
214 }), ({ \ 214 __abs_choose_expr(x, short, \
215 long ret; \ 215 __abs_choose_expr(x, char, \
216 if (sizeof(x) == sizeof(long)) { \ 216 __builtin_choose_expr( \
217 long __x = (x); \ 217 __builtin_types_compatible_p(typeof(x), char), \
218 ret = (__x < 0) ? -__x : __x; \ 218 (char)({ signed char __x = (x); __x<0?-__x:__x; }), \
219 } else { \ 219 ((void)0)))))))
220 int __x = (x); \ 220
221 ret = (__x < 0) ? -__x : __x; \ 221#define __abs_choose_expr(x, type, other) __builtin_choose_expr( \
222 } \ 222 __builtin_types_compatible_p(typeof(x), signed type) || \
223 ret; \ 223 __builtin_types_compatible_p(typeof(x), unsigned type), \
224 })) 224 ({ signed type __x = (x); __x < 0 ? -__x : __x; }), other)
225 225
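The reworked abs() keeps the argument's own width instead of forcing everything through s64 or long; char stays char thanks to the explicit cast in the innermost branch. A standalone demo of the type-tracking behaviour (GNU C, gcc or clang; the macros are copied from the hunk above and renamed kabs()/__kabs_choose_expr so they do not clash with libc):

#include <stdio.h>

#define __kabs_choose_expr(x, type, other) __builtin_choose_expr(	\
	__builtin_types_compatible_p(typeof(x),   signed type) ||	\
	__builtin_types_compatible_p(typeof(x), unsigned type),	\
	({ signed type __x = (x); __x < 0 ? -__x : __x; }), other)

#define kabs(x)	__kabs_choose_expr(x, long long,			\
		__kabs_choose_expr(x, long,				\
		__kabs_choose_expr(x, int,				\
		__kabs_choose_expr(x, short,				\
		__kabs_choose_expr(x, char,				\
		__builtin_choose_expr(					\
			__builtin_types_compatible_p(typeof(x), char),	\
			(char)({ signed char __x = (x); __x < 0 ? -__x : __x; }), \
			((void)0)))))))

int main(void)
{
	int i = -7;
	long long ll = -7;
	__auto_type ri = kabs(i);	/* result type follows the argument: int */
	__auto_type rll = kabs(ll);	/* long long */

	printf("sizeof int result:       %zu\n", sizeof(ri));
	printf("sizeof long long result: %zu\n", sizeof(rll));
	printf("kabs(-7) = %d\n", ri);
	return 0;
}
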
226/** 226/**
227 * reciprocal_scale - "scale" a value into range [0, ep_ro) 227 * reciprocal_scale - "scale" a value into range [0, ep_ro)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index f707f74055c3..861f690aa791 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -66,7 +66,7 @@
 66 * error pfns indicate that the gfn is in slot but failed to 66
67 * translate it to pfn on host. 67 * translate it to pfn on host.
68 */ 68 */
69static inline bool is_error_pfn(pfn_t pfn) 69static inline bool is_error_pfn(kvm_pfn_t pfn)
70{ 70{
71 return !!(pfn & KVM_PFN_ERR_MASK); 71 return !!(pfn & KVM_PFN_ERR_MASK);
72} 72}
@@ -76,13 +76,13 @@ static inline bool is_error_pfn(pfn_t pfn)
76 * translated to pfn - it is not in slot or failed to 76 * translated to pfn - it is not in slot or failed to
77 * translate it to pfn. 77 * translate it to pfn.
78 */ 78 */
79static inline bool is_error_noslot_pfn(pfn_t pfn) 79static inline bool is_error_noslot_pfn(kvm_pfn_t pfn)
80{ 80{
81 return !!(pfn & KVM_PFN_ERR_NOSLOT_MASK); 81 return !!(pfn & KVM_PFN_ERR_NOSLOT_MASK);
82} 82}
83 83
84/* noslot pfn indicates that the gfn is not in slot. */ 84/* noslot pfn indicates that the gfn is not in slot. */
85static inline bool is_noslot_pfn(pfn_t pfn) 85static inline bool is_noslot_pfn(kvm_pfn_t pfn)
86{ 86{
87 return pfn == KVM_PFN_NOSLOT; 87 return pfn == KVM_PFN_NOSLOT;
88} 88}
@@ -591,19 +591,20 @@ void kvm_release_page_clean(struct page *page);
591void kvm_release_page_dirty(struct page *page); 591void kvm_release_page_dirty(struct page *page);
592void kvm_set_page_accessed(struct page *page); 592void kvm_set_page_accessed(struct page *page);
593 593
594pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn); 594kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
595pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); 595kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
596pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, 596kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
597 bool *writable); 597 bool *writable);
598pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn); 598kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
599pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn); 599kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn);
600pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic, 600kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
601 bool *async, bool write_fault, bool *writable); 601 bool atomic, bool *async, bool write_fault,
602 bool *writable);
602 603
603void kvm_release_pfn_clean(pfn_t pfn); 604void kvm_release_pfn_clean(kvm_pfn_t pfn);
604void kvm_set_pfn_dirty(pfn_t pfn); 605void kvm_set_pfn_dirty(kvm_pfn_t pfn);
605void kvm_set_pfn_accessed(pfn_t pfn); 606void kvm_set_pfn_accessed(kvm_pfn_t pfn);
606void kvm_get_pfn(pfn_t pfn); 607void kvm_get_pfn(kvm_pfn_t pfn);
607 608
608int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, 609int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
609 int len); 610 int len);
@@ -629,8 +630,8 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
629 630
630struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu); 631struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu);
631struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn); 632struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn);
632pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn); 633kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn);
633pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); 634kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
634struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn); 635struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn);
635unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn); 636unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn);
636unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable); 637unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable);
@@ -811,7 +812,7 @@ void kvm_arch_sync_events(struct kvm *kvm);
811int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); 812int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
812void kvm_vcpu_kick(struct kvm_vcpu *vcpu); 813void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
813 814
814bool kvm_is_reserved_pfn(pfn_t pfn); 815bool kvm_is_reserved_pfn(kvm_pfn_t pfn);
815 816
816struct kvm_irq_ack_notifier { 817struct kvm_irq_ack_notifier {
817 struct hlist_node link; 818 struct hlist_node link;
@@ -965,7 +966,7 @@ static inline gfn_t gpa_to_gfn(gpa_t gpa)
965 return (gfn_t)(gpa >> PAGE_SHIFT); 966 return (gfn_t)(gpa >> PAGE_SHIFT);
966} 967}
967 968
968static inline hpa_t pfn_to_hpa(pfn_t pfn) 969static inline hpa_t pfn_to_hpa(kvm_pfn_t pfn)
969{ 970{
970 return (hpa_t)pfn << PAGE_SHIFT; 971 return (hpa_t)pfn << PAGE_SHIFT;
971} 972}
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 1b47a185c2f0..8bf259dae9f6 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -53,7 +53,7 @@ typedef unsigned long hva_t;
53typedef u64 hpa_t; 53typedef u64 hpa_t;
54typedef u64 hfn_t; 54typedef u64 hfn_t;
55 55
56typedef hfn_t pfn_t; 56typedef hfn_t kvm_pfn_t;
57 57
58struct gfn_to_hva_cache { 58struct gfn_to_hva_cache {
59 u64 generation; 59 u64 generation;
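
The only change in this header is the rename itself: KVM's frame-number type becomes kvm_pfn_t so the bare name pfn_t is free for the new flag-carrying type added in <linux/pfn.h> further down. A minimal sketch of why the two can now coexist in one translation unit; the conversion helper is hypothetical and assumes the KVM frame is ordinary, memmap-backed memory:

#include <linux/kvm_types.h>    /* kvm_pfn_t: still a plain integer frame number */
#include <linux/pfn_t.h>        /* pfn_t: struct wrapper with flag bits */

/* hypothetical helper, for illustration only */
static pfn_t kvm_frame_to_pfn_t(kvm_pfn_t kpfn)
{
        /* kvm_pfn_t is an integer, so it can feed the generic constructor */
        return pfn_to_pfn_t((unsigned long)kpfn);
}
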
diff --git a/include/linux/list.h b/include/linux/list.h
index 5356f4d661a7..30cf4200ab40 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -113,6 +113,17 @@ extern void __list_del_entry(struct list_head *entry);
113extern void list_del(struct list_head *entry); 113extern void list_del(struct list_head *entry);
114#endif 114#endif
115 115
116#ifdef CONFIG_DEBUG_LIST
117/*
118 * See devm_memremap_pages() which wants DEBUG_LIST=y to assert if one
119 * of the pages it allocates is ever passed to list_add()
120 */
121extern void list_force_poison(struct list_head *entry);
122#else
123/* fallback to the less strict LIST_POISON* definitions */
124#define list_force_poison list_del
125#endif
126
116/** 127/**
117 * list_replace - replace old entry by new one 128 * list_replace - replace old entry by new one
118 * @old : the element to be replaced 129 * @old : the element to be replaced
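
A hedged sketch of the caller pattern the comment above describes: code that hands pages over for device use wants any later list_add() on those pages to trip the DEBUG_LIST assertion instead of silently corrupting a list. The function name is hypothetical:

#include <linux/list.h>
#include <linux/mm_types.h>

/* hypothetical: called for each page handed over to a device mapping */
static void quarantine_page_lru(struct page *page)
{
        /*
         * With CONFIG_DEBUG_LIST=y the entry stays poisoned and a later
         * list_add(&page->lru, ...) asserts; otherwise this is list_del().
         */
        list_force_poison(&page->lru);
}
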
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 173fb44e22f1..3106ac1c895e 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -61,6 +61,14 @@ extern int memblock_debug;
61extern bool movable_node_enabled; 61extern bool movable_node_enabled;
62#endif /* CONFIG_MOVABLE_NODE */ 62#endif /* CONFIG_MOVABLE_NODE */
63 63
64#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
65#define __init_memblock __meminit
66#define __initdata_memblock __meminitdata
67#else
68#define __init_memblock
69#define __initdata_memblock
70#endif
71
64#define memblock_dbg(fmt, ...) \ 72#define memblock_dbg(fmt, ...) \
65 if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) 73 if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
66 74
@@ -166,7 +174,7 @@ static inline bool memblock_is_hotpluggable(struct memblock_region *m)
166 return m->flags & MEMBLOCK_HOTPLUG; 174 return m->flags & MEMBLOCK_HOTPLUG;
167} 175}
168 176
169static inline bool movable_node_is_enabled(void) 177static inline bool __init_memblock movable_node_is_enabled(void)
170{ 178{
171 return movable_node_enabled; 179 return movable_node_enabled;
172} 180}
@@ -405,14 +413,6 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo
405 for (idx = 0; idx < memblock_type->cnt; \ 413 for (idx = 0; idx < memblock_type->cnt; \
406 idx++,rgn = &memblock_type->regions[idx]) 414 idx++,rgn = &memblock_type->regions[idx])
407 415
408#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
409#define __init_memblock __meminit
410#define __initdata_memblock __meminitdata
411#else
412#define __init_memblock
413#define __initdata_memblock
414#endif
415
416#ifdef CONFIG_MEMTEST 416#ifdef CONFIG_MEMTEST
417extern void early_memtest(phys_addr_t start, phys_addr_t end); 417extern void early_memtest(phys_addr_t start, phys_addr_t end);
418#else 418#else
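
The #define block above is only moved, not changed: the __init_memblock annotation must be visible before the first inline function that carries it (movable_node_is_enabled()), otherwise the token is an unknown identifier at that point. A minimal illustration of the ordering rule; the section attribute is only an assumption of roughly what __meminit expands to:

/* must appear before any declaration that uses it */
#define __init_memblock_example __attribute__((__section__(".meminit.text")))

static inline bool __init_memblock_example movable_node_is_enabled_example(void)
{
        return true;
}
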
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 2292468f2a30..189f04d4d2ec 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -280,10 +280,12 @@ static inline void mem_cgroup_events(struct mem_cgroup *memcg,
280bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg); 280bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
281 281
282int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, 282int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
283 gfp_t gfp_mask, struct mem_cgroup **memcgp); 283 gfp_t gfp_mask, struct mem_cgroup **memcgp,
284 bool compound);
284void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, 285void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
285 bool lrucare); 286 bool lrucare, bool compound);
286void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg); 287void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
288 bool compound);
287void mem_cgroup_uncharge(struct page *page); 289void mem_cgroup_uncharge(struct page *page);
288void mem_cgroup_uncharge_list(struct list_head *page_list); 290void mem_cgroup_uncharge_list(struct list_head *page_list);
289 291
@@ -515,7 +517,8 @@ static inline bool mem_cgroup_low(struct mem_cgroup *root,
515 517
516static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, 518static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
517 gfp_t gfp_mask, 519 gfp_t gfp_mask,
518 struct mem_cgroup **memcgp) 520 struct mem_cgroup **memcgp,
521 bool compound)
519{ 522{
520 *memcgp = NULL; 523 *memcgp = NULL;
521 return 0; 524 return 0;
@@ -523,12 +526,13 @@ static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
523 526
524static inline void mem_cgroup_commit_charge(struct page *page, 527static inline void mem_cgroup_commit_charge(struct page *page,
525 struct mem_cgroup *memcg, 528 struct mem_cgroup *memcg,
526 bool lrucare) 529 bool lrucare, bool compound)
527{ 530{
528} 531}
529 532
530static inline void mem_cgroup_cancel_charge(struct page *page, 533static inline void mem_cgroup_cancel_charge(struct page *page,
531 struct mem_cgroup *memcg) 534 struct mem_cgroup *memcg,
535 bool compound)
532{ 536{
533} 537}
534 538
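
All three charge-path entry points grow a compound flag, so a caller charging a THP can say so once and have the whole huge page accounted. A hedged sketch of the resulting call pattern; the page-table step is a hypothetical stand-in for whatever the real caller does between try and commit:

#include <linux/memcontrol.h>
#include <linux/errno.h>

static int map_into_pagetable(struct page *page);      /* hypothetical step */

static int charge_and_map(struct page *page, struct mm_struct *mm,
                          gfp_t gfp, bool is_thp)
{
        struct mem_cgroup *memcg;

        if (mem_cgroup_try_charge(page, mm, gfp, &memcg, is_thp))
                return -ENOMEM;

        if (map_into_pagetable(page)) {
                mem_cgroup_cancel_charge(page, memcg, is_thp);
                return -EFAULT;
        }

        /* not an LRU-care path, hence lrucare == false */
        mem_cgroup_commit_charge(page, memcg, false, is_thp);
        return 0;
}
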
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 2ea574ff9714..43405992d027 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -275,7 +275,8 @@ extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
275extern bool is_memblock_offlined(struct memory_block *mem); 275extern bool is_memblock_offlined(struct memory_block *mem);
276extern void remove_memory(int nid, u64 start, u64 size); 276extern void remove_memory(int nid, u64 start, u64 size);
277extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn); 277extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn);
278extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms); 278extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
279 unsigned long map_offset);
279extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, 280extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
280 unsigned long pnum); 281 unsigned long pnum);
281 282
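
The new map_offset argument appears to carry the number of memmap pages a vmem_altmap reserved at the start of the section, so teardown can skip freeing them. The sketch below assumes that reading and uses vmem_altmap_offset() from the new memremap.h further down:

#include <linux/memory_hotplug.h>
#include <linux/memremap.h>

static void remove_one_section_example(struct zone *zone, struct mem_section *ms,
                                       struct vmem_altmap *altmap)
{
        unsigned long map_offset = altmap ? vmem_altmap_offset(altmap) : 0;

        sparse_remove_one_section(zone, ms, map_offset);
}
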
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
new file mode 100644
index 000000000000..bcaa634139a9
--- /dev/null
+++ b/include/linux/memremap.h
@@ -0,0 +1,114 @@
1#ifndef _LINUX_MEMREMAP_H_
2#define _LINUX_MEMREMAP_H_
3#include <linux/mm.h>
4#include <linux/ioport.h>
5#include <linux/percpu-refcount.h>
6
7struct resource;
8struct device;
9
10/**
11 * struct vmem_altmap - pre-allocated storage for vmemmap_populate
12 * @base_pfn: base of the entire dev_pagemap mapping
13 * @reserve: pages mapped, but reserved for driver use (relative to @base)
14 * @free: free pages set aside in the mapping for memmap storage
15 * @align: pages reserved to meet allocation alignments
16 * @alloc: track pages consumed, private to vmemmap_populate()
17 */
18struct vmem_altmap {
19 const unsigned long base_pfn;
20 const unsigned long reserve;
21 unsigned long free;
22 unsigned long align;
23 unsigned long alloc;
24};
25
26unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
27void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns);
28
29#if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_ZONE_DEVICE)
30struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start);
31#else
32static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
33{
34 return NULL;
35}
36#endif
37
38/**
39 * struct dev_pagemap - metadata for ZONE_DEVICE mappings
40 * @altmap: pre-allocated/reserved memory for vmemmap allocations
41 * @res: physical address range covered by @ref
42 * @ref: reference count that pins the devm_memremap_pages() mapping
43 * @dev: host device of the mapping for debug
44 */
45struct dev_pagemap {
46 struct vmem_altmap *altmap;
47 const struct resource *res;
48 struct percpu_ref *ref;
49 struct device *dev;
50};
51
52#ifdef CONFIG_ZONE_DEVICE
53void *devm_memremap_pages(struct device *dev, struct resource *res,
54 struct percpu_ref *ref, struct vmem_altmap *altmap);
55struct dev_pagemap *find_dev_pagemap(resource_size_t phys);
56#else
57static inline void *devm_memremap_pages(struct device *dev,
58 struct resource *res, struct percpu_ref *ref,
59 struct vmem_altmap *altmap)
60{
61 /*
62 * Fail attempts to call devm_memremap_pages() without
63 * ZONE_DEVICE support enabled, this requires callers to fall
64 * back to plain devm_memremap() based on config
65 */
66 WARN_ON_ONCE(1);
67 return ERR_PTR(-ENXIO);
68}
69
70static inline struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
71{
72 return NULL;
73}
74#endif
75
76/**
77 * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn
78 * @pfn: page frame number to lookup page_map
79 * @pgmap: optional known pgmap that already has a reference
80 *
81 * @pgmap allows the overhead of a lookup to be bypassed when @pfn lands in the
82 * same mapping.
83 */
84static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
85 struct dev_pagemap *pgmap)
86{
87 const struct resource *res = pgmap ? pgmap->res : NULL;
88 resource_size_t phys = PFN_PHYS(pfn);
89
90 /*
91 * In the cached case we're already holding a live reference so
92 * we can simply do a blind increment
93 */
94 if (res && phys >= res->start && phys <= res->end) {
95 percpu_ref_get(pgmap->ref);
96 return pgmap;
97 }
98
99 /* fall back to slow path lookup */
100 rcu_read_lock();
101 pgmap = find_dev_pagemap(phys);
102 if (pgmap && !percpu_ref_tryget_live(pgmap->ref))
103 pgmap = NULL;
104 rcu_read_unlock();
105
106 return pgmap;
107}
108
109static inline void put_dev_pagemap(struct dev_pagemap *pgmap)
110{
111 if (pgmap)
112 percpu_ref_put(pgmap->ref);
113}
114#endif /* _LINUX_MEMREMAP_H_ */
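
A hedged usage sketch for the new header: a driver maps its physical range with devm_memremap_pages() (the stub above shows failure comes back as an ERR_PTR), and hot paths can ask whether an arbitrary pfn belongs to some device mapping via get_dev_pagemap()/put_dev_pagemap(). The driver structure and function names are hypothetical:

#include <linux/memremap.h>
#include <linux/err.h>

struct my_pmem {                        /* hypothetical driver state */
        struct percpu_ref ref;
        struct resource *res;
        void *virt;
};

static int my_pmem_enable(struct device *dev, struct my_pmem *pmem)
{
        pmem->virt = devm_memremap_pages(dev, pmem->res, &pmem->ref, NULL);
        if (IS_ERR(pmem->virt))
                return PTR_ERR(pmem->virt);
        return 0;
}

static bool pfn_is_device_memory(unsigned long pfn)
{
        struct dev_pagemap *pgmap = get_dev_pagemap(pfn, NULL);

        if (!pgmap)
                return false;
        put_dev_pagemap(pgmap);         /* drop the reference taken above */
        return true;
}
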
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 839d9e9a1c38..f1cd22f2df1a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -16,6 +16,7 @@
16#include <linux/mm_types.h> 16#include <linux/mm_types.h>
17#include <linux/range.h> 17#include <linux/range.h>
18#include <linux/pfn.h> 18#include <linux/pfn.h>
19#include <linux/percpu-refcount.h>
19#include <linux/bit_spinlock.h> 20#include <linux/bit_spinlock.h>
20#include <linux/shrinker.h> 21#include <linux/shrinker.h>
21#include <linux/resource.h> 22#include <linux/resource.h>
@@ -329,6 +330,13 @@ struct inode;
329#define page_private(page) ((page)->private) 330#define page_private(page) ((page)->private)
330#define set_page_private(page, v) ((page)->private = (v)) 331#define set_page_private(page, v) ((page)->private = (v))
331 332
333#if !defined(__HAVE_ARCH_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE)
334static inline int pmd_devmap(pmd_t pmd)
335{
336 return 0;
337}
338#endif
339
332/* 340/*
333 * FIXME: take this include out, include page-flags.h in 341 * FIXME: take this include out, include page-flags.h in
334 * files which need it (119 of them) 342 * files which need it (119 of them)
@@ -410,39 +418,17 @@ static inline int is_vmalloc_or_module_addr(const void *x)
410 418
411extern void kvfree(const void *addr); 419extern void kvfree(const void *addr);
412 420
413static inline void compound_lock(struct page *page) 421static inline atomic_t *compound_mapcount_ptr(struct page *page)
414{ 422{
415#ifdef CONFIG_TRANSPARENT_HUGEPAGE 423 return &page[1].compound_mapcount;
416 VM_BUG_ON_PAGE(PageSlab(page), page);
417 bit_spin_lock(PG_compound_lock, &page->flags);
418#endif
419} 424}
420 425
421static inline void compound_unlock(struct page *page) 426static inline int compound_mapcount(struct page *page)
422{ 427{
423#ifdef CONFIG_TRANSPARENT_HUGEPAGE 428 if (!PageCompound(page))
424 VM_BUG_ON_PAGE(PageSlab(page), page); 429 return 0;
425 bit_spin_unlock(PG_compound_lock, &page->flags); 430 page = compound_head(page);
426#endif 431 return atomic_read(compound_mapcount_ptr(page)) + 1;
427}
428
429static inline unsigned long compound_lock_irqsave(struct page *page)
430{
431 unsigned long uninitialized_var(flags);
432#ifdef CONFIG_TRANSPARENT_HUGEPAGE
433 local_irq_save(flags);
434 compound_lock(page);
435#endif
436 return flags;
437}
438
439static inline void compound_unlock_irqrestore(struct page *page,
440 unsigned long flags)
441{
442#ifdef CONFIG_TRANSPARENT_HUGEPAGE
443 compound_unlock(page);
444 local_irq_restore(flags);
445#endif
446} 432}
447 433
448/* 434/*
@@ -455,61 +441,29 @@ static inline void page_mapcount_reset(struct page *page)
455 atomic_set(&(page)->_mapcount, -1); 441 atomic_set(&(page)->_mapcount, -1);
456} 442}
457 443
444int __page_mapcount(struct page *page);
445
458static inline int page_mapcount(struct page *page) 446static inline int page_mapcount(struct page *page)
459{ 447{
460 VM_BUG_ON_PAGE(PageSlab(page), page); 448 VM_BUG_ON_PAGE(PageSlab(page), page);
461 return atomic_read(&page->_mapcount) + 1;
462}
463 449
464static inline int page_count(struct page *page) 450 if (unlikely(PageCompound(page)))
465{ 451 return __page_mapcount(page);
466 return atomic_read(&compound_head(page)->_count); 452 return atomic_read(&page->_mapcount) + 1;
467}
468
469static inline bool __compound_tail_refcounted(struct page *page)
470{
471 return PageAnon(page) && !PageSlab(page) && !PageHeadHuge(page);
472}
473
474/*
475 * This takes a head page as parameter and tells if the
476 * tail page reference counting can be skipped.
477 *
478 * For this to be safe, PageSlab and PageHeadHuge must remain true on
479 * any given page where they return true here, until all tail pins
480 * have been released.
481 */
482static inline bool compound_tail_refcounted(struct page *page)
483{
484 VM_BUG_ON_PAGE(!PageHead(page), page);
485 return __compound_tail_refcounted(page);
486} 453}
487 454
488static inline void get_huge_page_tail(struct page *page) 455#ifdef CONFIG_TRANSPARENT_HUGEPAGE
456int total_mapcount(struct page *page);
457#else
458static inline int total_mapcount(struct page *page)
489{ 459{
490 /* 460 return page_mapcount(page);
491 * __split_huge_page_refcount() cannot run from under us.
492 */
493 VM_BUG_ON_PAGE(!PageTail(page), page);
494 VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
495 VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
496 if (compound_tail_refcounted(compound_head(page)))
497 atomic_inc(&page->_mapcount);
498} 461}
462#endif
499 463
500extern bool __get_page_tail(struct page *page); 464static inline int page_count(struct page *page)
501
502static inline void get_page(struct page *page)
503{ 465{
504 if (unlikely(PageTail(page))) 466 return atomic_read(&compound_head(page)->_count);
505 if (likely(__get_page_tail(page)))
506 return;
507 /*
508 * Getting a normal page or the head of a compound page
509 * requires to already have an elevated page->_count.
510 */
511 VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
512 atomic_inc(&page->_count);
513} 467}
514 468
515static inline struct page *virt_to_head_page(const void *x) 469static inline struct page *virt_to_head_page(const void *x)
@@ -528,7 +482,8 @@ static inline void init_page_count(struct page *page)
528 atomic_set(&page->_count, 1); 482 atomic_set(&page->_count, 1);
529} 483}
530 484
531void put_page(struct page *page); 485void __put_page(struct page *page);
486
532void put_pages_list(struct list_head *pages); 487void put_pages_list(struct list_head *pages);
533 488
534void split_page(struct page *page, unsigned int order); 489void split_page(struct page *page, unsigned int order);
@@ -548,6 +503,9 @@ enum compound_dtor_id {
548#ifdef CONFIG_HUGETLB_PAGE 503#ifdef CONFIG_HUGETLB_PAGE
549 HUGETLB_PAGE_DTOR, 504 HUGETLB_PAGE_DTOR,
550#endif 505#endif
506#ifdef CONFIG_TRANSPARENT_HUGEPAGE
507 TRANSHUGE_PAGE_DTOR,
508#endif
551 NR_COMPOUND_DTORS, 509 NR_COMPOUND_DTORS,
552}; 510};
553extern compound_page_dtor * const compound_page_dtors[]; 511extern compound_page_dtor * const compound_page_dtors[];
@@ -577,6 +535,8 @@ static inline void set_compound_order(struct page *page, unsigned int order)
577 page[1].compound_order = order; 535 page[1].compound_order = order;
578} 536}
579 537
538void free_compound_page(struct page *page);
539
580#ifdef CONFIG_MMU 540#ifdef CONFIG_MMU
581/* 541/*
582 * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when 542 * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
@@ -704,6 +664,51 @@ static inline enum zone_type page_zonenum(const struct page *page)
704 return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; 664 return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
705} 665}
706 666
667#ifdef CONFIG_ZONE_DEVICE
668void get_zone_device_page(struct page *page);
669void put_zone_device_page(struct page *page);
670static inline bool is_zone_device_page(const struct page *page)
671{
672 return page_zonenum(page) == ZONE_DEVICE;
673}
674#else
675static inline void get_zone_device_page(struct page *page)
676{
677}
678static inline void put_zone_device_page(struct page *page)
679{
680}
681static inline bool is_zone_device_page(const struct page *page)
682{
683 return false;
684}
685#endif
686
687static inline void get_page(struct page *page)
688{
689 page = compound_head(page);
690 /*
691 * Getting a normal page or the head of a compound page
692 * requires to already have an elevated page->_count.
693 */
694 VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
695 atomic_inc(&page->_count);
696
697 if (unlikely(is_zone_device_page(page)))
698 get_zone_device_page(page);
699}
700
701static inline void put_page(struct page *page)
702{
703 page = compound_head(page);
704
705 if (put_page_testzero(page))
706 __put_page(page);
707
708 if (unlikely(is_zone_device_page(page)))
709 put_zone_device_page(page);
710}
711
707#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) 712#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
708#define SECTION_IN_PAGE_FLAGS 713#define SECTION_IN_PAGE_FLAGS
709#endif 714#endif
@@ -993,10 +998,21 @@ static inline pgoff_t page_file_index(struct page *page)
993 998
994/* 999/*
995 * Return true if this page is mapped into pagetables. 1000 * Return true if this page is mapped into pagetables.
1001 * For compound page it returns true if any subpage of compound page is mapped.
996 */ 1002 */
997static inline int page_mapped(struct page *page) 1003static inline bool page_mapped(struct page *page)
998{ 1004{
999 return atomic_read(&(page)->_mapcount) >= 0; 1005 int i;
1006 if (likely(!PageCompound(page)))
1007 return atomic_read(&page->_mapcount) >= 0;
1008 page = compound_head(page);
1009 if (atomic_read(compound_mapcount_ptr(page)) >= 0)
1010 return true;
1011 for (i = 0; i < hpage_nr_pages(page); i++) {
1012 if (atomic_read(&page[i]._mapcount) >= 0)
1013 return true;
1014 }
1015 return false;
1000} 1016}
1001 1017
1002/* 1018/*
@@ -1084,7 +1100,7 @@ static inline bool shmem_mapping(struct address_space *mapping)
1084} 1100}
1085#endif 1101#endif
1086 1102
1087extern int can_do_mlock(void); 1103extern bool can_do_mlock(void);
1088extern int user_shm_lock(size_t, struct user_struct *); 1104extern int user_shm_lock(size_t, struct user_struct *);
1089extern void user_shm_unlock(size_t, struct user_struct *); 1105extern void user_shm_unlock(size_t, struct user_struct *);
1090 1106
@@ -1178,7 +1194,8 @@ int invalidate_inode_page(struct page *page);
1178extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, 1194extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
1179 unsigned long address, unsigned int flags); 1195 unsigned long address, unsigned int flags);
1180extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, 1196extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
1181 unsigned long address, unsigned int fault_flags); 1197 unsigned long address, unsigned int fault_flags,
1198 bool *unlocked);
1182#else 1199#else
1183static inline int handle_mm_fault(struct mm_struct *mm, 1200static inline int handle_mm_fault(struct mm_struct *mm,
1184 struct vm_area_struct *vma, unsigned long address, 1201 struct vm_area_struct *vma, unsigned long address,
@@ -1190,7 +1207,7 @@ static inline int handle_mm_fault(struct mm_struct *mm,
1190} 1207}
1191static inline int fixup_user_fault(struct task_struct *tsk, 1208static inline int fixup_user_fault(struct task_struct *tsk,
1192 struct mm_struct *mm, unsigned long address, 1209 struct mm_struct *mm, unsigned long address,
1193 unsigned int fault_flags) 1210 unsigned int fault_flags, bool *unlocked)
1194{ 1211{
1195 /* should never happen if there's no MMU */ 1212 /* should never happen if there's no MMU */
1196 BUG(); 1213 BUG();
@@ -1444,6 +1461,13 @@ static inline void sync_mm_rss(struct mm_struct *mm)
1444} 1461}
1445#endif 1462#endif
1446 1463
1464#ifndef __HAVE_ARCH_PTE_DEVMAP
1465static inline int pte_devmap(pte_t pte)
1466{
1467 return 0;
1468}
1469#endif
1470
1447int vma_wants_writenotify(struct vm_area_struct *vma); 1471int vma_wants_writenotify(struct vm_area_struct *vma);
1448 1472
1449extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, 1473extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
@@ -2114,7 +2138,7 @@ int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
2114int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, 2138int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2115 unsigned long pfn); 2139 unsigned long pfn);
2116int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, 2140int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
2117 unsigned long pfn); 2141 pfn_t pfn);
2118int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); 2142int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);
2119 2143
2120 2144
@@ -2224,7 +2248,14 @@ pud_t *vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node);
2224pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node); 2248pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
2225pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node); 2249pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node);
2226void *vmemmap_alloc_block(unsigned long size, int node); 2250void *vmemmap_alloc_block(unsigned long size, int node);
2227void *vmemmap_alloc_block_buf(unsigned long size, int node); 2251struct vmem_altmap;
2252void *__vmemmap_alloc_block_buf(unsigned long size, int node,
2253 struct vmem_altmap *altmap);
2254static inline void *vmemmap_alloc_block_buf(unsigned long size, int node)
2255{
2256 return __vmemmap_alloc_block_buf(size, node, NULL);
2257}
2258
2228void vmemmap_verify(pte_t *, int, unsigned long, unsigned long); 2259void vmemmap_verify(pte_t *, int, unsigned long, unsigned long);
2229int vmemmap_populate_basepages(unsigned long start, unsigned long end, 2260int vmemmap_populate_basepages(unsigned long start, unsigned long end,
2230 int node); 2261 int node);
@@ -2246,7 +2277,7 @@ extern int memory_failure(unsigned long pfn, int trapno, int flags);
2246extern void memory_failure_queue(unsigned long pfn, int trapno, int flags); 2277extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
2247extern int unpoison_memory(unsigned long pfn); 2278extern int unpoison_memory(unsigned long pfn);
2248extern int get_hwpoison_page(struct page *page); 2279extern int get_hwpoison_page(struct page *page);
2249extern void put_hwpoison_page(struct page *page); 2280#define put_hwpoison_page(page) put_page(page)
2250extern int sysctl_memory_failure_early_kill; 2281extern int sysctl_memory_failure_early_kill;
2251extern int sysctl_memory_failure_recovery; 2282extern int sysctl_memory_failure_recovery;
2252extern void shake_page(struct page *p, int access); 2283extern void shake_page(struct page *p, int access);
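
The biggest functional change in this header is that get_page()/put_page() now resolve the compound head themselves (and pin or unpin the hosting dev_pagemap for ZONE_DEVICE pages), and that a THP carries two kinds of mapcount. A short sketch of both, illustrative only:

#include <linux/mm.h>
#include <linux/printk.h>

static void hold_briefly(struct page *page)
{
        get_page(page);         /* safe on a tail page: acts on the head */
        /* ... use the page ... */
        put_page(page);         /* final put goes through __put_page() */
}

static void show_mapcounts(struct page *page)
{
        pr_info("subpage=%d compound=%d total=%d\n",
                page_mapcount(page),       /* this subpage's view */
                compound_mapcount(page),   /* PMD-level maps of the whole THP */
                total_mapcount(page));     /* all mappings combined */
}
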
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6bc9a0ce2253..d3ebb9d21a53 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -54,6 +54,8 @@ struct page {
54 * see PAGE_MAPPING_ANON below. 54 * see PAGE_MAPPING_ANON below.
55 */ 55 */
56 void *s_mem; /* slab first object */ 56 void *s_mem; /* slab first object */
57 atomic_t compound_mapcount; /* first tail page */
58 /* page_deferred_list().next -- second tail page */
57 }; 59 };
58 60
59 /* Second double word */ 61 /* Second double word */
@@ -61,6 +63,7 @@ struct page {
61 union { 63 union {
62 pgoff_t index; /* Our offset within mapping. */ 64 pgoff_t index; /* Our offset within mapping. */
63 void *freelist; /* sl[aou]b first free object */ 65 void *freelist; /* sl[aou]b first free object */
66 /* page_deferred_list().prev -- second tail page */
64 }; 67 };
65 68
66 union { 69 union {
@@ -81,20 +84,9 @@ struct page {
81 84
82 union { 85 union {
83 /* 86 /*
84 * Count of ptes mapped in 87 * Count of ptes mapped in mms, to show
85 * mms, to show when page is 88 * when page is mapped & limit reverse
86 * mapped & limit reverse map 89 * map searches.
87 * searches.
88 *
89 * Used also for tail pages
90 * refcounting instead of
91 * _count. Tail pages cannot
92 * be mapped and keeping the
93 * tail page _count zero at
94 * all times guarantees
95 * get_page_unless_zero() will
96 * never succeed on tail
97 * pages.
98 */ 90 */
99 atomic_t _mapcount; 91 atomic_t _mapcount;
100 92
@@ -124,6 +116,11 @@ struct page {
124 * Can be used as a generic list 116 * Can be used as a generic list
125 * by the page owner. 117 * by the page owner.
126 */ 118 */
119 struct dev_pagemap *pgmap; /* ZONE_DEVICE pages are never on an
120 * lru or handled by a slab
121 * allocator, this points to the
122 * hosting device page map.
123 */
127 struct { /* slub per cpu partial pages */ 124 struct { /* slub per cpu partial pages */
128 struct page *next; /* Next partial slab */ 125 struct page *next; /* Next partial slab */
129#ifdef CONFIG_64BIT 126#ifdef CONFIG_64BIT
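
The new fields are all parked in tail pages, so struct page itself does not grow: compound_mapcount and the deferred-split list live in the first and second tail page, and pgmap overlays the lru slot of ZONE_DEVICE pages, which are never on an LRU. A tiny illustration of the first of these, using the accessor added to mm.h above:

#include <linux/mm.h>

static int thp_pmd_mapcount(struct page *head)
{
        /* compound_mapcount_ptr() hides the "lives in head[1]" detail */
        return atomic_read(compound_mapcount_ptr(head)) + 1;
}
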
diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h
index 772362adf471..053824b0a412 100644
--- a/include/linux/mmdebug.h
+++ b/include/linux/mmdebug.h
@@ -56,4 +56,10 @@ void dump_mm(const struct mm_struct *mm);
56#define VIRTUAL_BUG_ON(cond) do { } while (0) 56#define VIRTUAL_BUG_ON(cond) do { } while (0)
57#endif 57#endif
58 58
59#ifdef CONFIG_DEBUG_VM_PGFLAGS
60#define VM_BUG_ON_PGFLAGS(cond, page) VM_BUG_ON_PAGE(cond, page)
61#else
62#define VM_BUG_ON_PGFLAGS(cond, page) BUILD_BUG_ON_INVALID(cond)
63#endif
64
59#endif 65#endif
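
VM_BUG_ON_PGFLAGS() is what the reworked page-flag policies below use for enforcement: with CONFIG_DEBUG_VM_PGFLAGS it is a real VM_BUG_ON_PAGE(), otherwise BUILD_BUG_ON_INVALID() still compiles the condition (so typos are caught) but emits no code. A hedged example of open-coded use:

#include <linux/mmdebug.h>
#include <linux/page-flags.h>

static void mark_head_referenced(struct page *page)
{
        VM_BUG_ON_PGFLAGS(PageTail(page), page);    /* callers must pass a head */
        SetPageReferenced(page);
}
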
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index bb53c7b86315..19724e6ebd26 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -101,9 +101,6 @@ enum pageflags {
101#ifdef CONFIG_MEMORY_FAILURE 101#ifdef CONFIG_MEMORY_FAILURE
102 PG_hwpoison, /* hardware poisoned page. Don't touch */ 102 PG_hwpoison, /* hardware poisoned page. Don't touch */
103#endif 103#endif
104#ifdef CONFIG_TRANSPARENT_HUGEPAGE
105 PG_compound_lock,
106#endif
107#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) 104#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
108 PG_young, 105 PG_young,
109 PG_idle, 106 PG_idle,
@@ -129,53 +126,104 @@ enum pageflags {
129 126
130 /* SLOB */ 127 /* SLOB */
131 PG_slob_free = PG_private, 128 PG_slob_free = PG_private,
129
130 /* Compound pages. Stored in first tail page's flags */
131 PG_double_map = PG_private_2,
132}; 132};
133 133
134#ifndef __GENERATING_BOUNDS_H 134#ifndef __GENERATING_BOUNDS_H
135 135
136struct page; /* forward declaration */
137
138static inline struct page *compound_head(struct page *page)
139{
140 unsigned long head = READ_ONCE(page->compound_head);
141
142 if (unlikely(head & 1))
143 return (struct page *) (head - 1);
144 return page;
145}
146
147static inline int PageTail(struct page *page)
148{
149 return READ_ONCE(page->compound_head) & 1;
150}
151
152static inline int PageCompound(struct page *page)
153{
154 return test_bit(PG_head, &page->flags) || PageTail(page);
155}
156
157/*
158 * Page flags policies wrt compound pages
159 *
160 * PF_ANY:
161 * the page flag is relevant for small, head and tail pages.
162 *
163 * PF_HEAD:
164 * for compound page all operations related to the page flag applied to
165 * head page.
166 *
167 * PF_NO_TAIL:
168 * modifications of the page flag must be done on small or head pages,
169 * checks can be done on tail pages too.
170 *
171 * PF_NO_COMPOUND:
172 * the page flag is not relevant for compound pages.
173 */
174#define PF_ANY(page, enforce) page
175#define PF_HEAD(page, enforce) compound_head(page)
176#define PF_NO_TAIL(page, enforce) ({ \
177 VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page); \
178 compound_head(page);})
179#define PF_NO_COMPOUND(page, enforce) ({ \
180 VM_BUG_ON_PGFLAGS(enforce && PageCompound(page), page); \
181 page;})
182
136/* 183/*
137 * Macros to create function definitions for page flags 184 * Macros to create function definitions for page flags
138 */ 185 */
139#define TESTPAGEFLAG(uname, lname) \ 186#define TESTPAGEFLAG(uname, lname, policy) \
140static inline int Page##uname(const struct page *page) \ 187static inline int Page##uname(struct page *page) \
141 { return test_bit(PG_##lname, &page->flags); } 188 { return test_bit(PG_##lname, &policy(page, 0)->flags); }
142 189
143#define SETPAGEFLAG(uname, lname) \ 190#define SETPAGEFLAG(uname, lname, policy) \
144static inline void SetPage##uname(struct page *page) \ 191static inline void SetPage##uname(struct page *page) \
145 { set_bit(PG_##lname, &page->flags); } 192 { set_bit(PG_##lname, &policy(page, 1)->flags); }
146 193
147#define CLEARPAGEFLAG(uname, lname) \ 194#define CLEARPAGEFLAG(uname, lname, policy) \
148static inline void ClearPage##uname(struct page *page) \ 195static inline void ClearPage##uname(struct page *page) \
149 { clear_bit(PG_##lname, &page->flags); } 196 { clear_bit(PG_##lname, &policy(page, 1)->flags); }
150 197
151#define __SETPAGEFLAG(uname, lname) \ 198#define __SETPAGEFLAG(uname, lname, policy) \
152static inline void __SetPage##uname(struct page *page) \ 199static inline void __SetPage##uname(struct page *page) \
153 { __set_bit(PG_##lname, &page->flags); } 200 { __set_bit(PG_##lname, &policy(page, 1)->flags); }
154 201
155#define __CLEARPAGEFLAG(uname, lname) \ 202#define __CLEARPAGEFLAG(uname, lname, policy) \
156static inline void __ClearPage##uname(struct page *page) \ 203static inline void __ClearPage##uname(struct page *page) \
157 { __clear_bit(PG_##lname, &page->flags); } 204 { __clear_bit(PG_##lname, &policy(page, 1)->flags); }
158 205
159#define TESTSETFLAG(uname, lname) \ 206#define TESTSETFLAG(uname, lname, policy) \
160static inline int TestSetPage##uname(struct page *page) \ 207static inline int TestSetPage##uname(struct page *page) \
161 { return test_and_set_bit(PG_##lname, &page->flags); } 208 { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); }
162 209
163#define TESTCLEARFLAG(uname, lname) \ 210#define TESTCLEARFLAG(uname, lname, policy) \
164static inline int TestClearPage##uname(struct page *page) \ 211static inline int TestClearPage##uname(struct page *page) \
165 { return test_and_clear_bit(PG_##lname, &page->flags); } 212 { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }
166
167#define __TESTCLEARFLAG(uname, lname) \
168static inline int __TestClearPage##uname(struct page *page) \
169 { return __test_and_clear_bit(PG_##lname, &page->flags); }
170 213
171#define PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \ 214#define PAGEFLAG(uname, lname, policy) \
172 SETPAGEFLAG(uname, lname) CLEARPAGEFLAG(uname, lname) 215 TESTPAGEFLAG(uname, lname, policy) \
216 SETPAGEFLAG(uname, lname, policy) \
217 CLEARPAGEFLAG(uname, lname, policy)
173 218
174#define __PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \ 219#define __PAGEFLAG(uname, lname, policy) \
175 __SETPAGEFLAG(uname, lname) __CLEARPAGEFLAG(uname, lname) 220 TESTPAGEFLAG(uname, lname, policy) \
221 __SETPAGEFLAG(uname, lname, policy) \
222 __CLEARPAGEFLAG(uname, lname, policy)
176 223
177#define TESTSCFLAG(uname, lname) \ 224#define TESTSCFLAG(uname, lname, policy) \
178 TESTSETFLAG(uname, lname) TESTCLEARFLAG(uname, lname) 225 TESTSETFLAG(uname, lname, policy) \
226 TESTCLEARFLAG(uname, lname, policy)
179 227
180#define TESTPAGEFLAG_FALSE(uname) \ 228#define TESTPAGEFLAG_FALSE(uname) \
181static inline int Page##uname(const struct page *page) { return 0; } 229static inline int Page##uname(const struct page *page) { return 0; }
@@ -195,56 +243,62 @@ static inline int TestSetPage##uname(struct page *page) { return 0; }
195#define TESTCLEARFLAG_FALSE(uname) \ 243#define TESTCLEARFLAG_FALSE(uname) \
196static inline int TestClearPage##uname(struct page *page) { return 0; } 244static inline int TestClearPage##uname(struct page *page) { return 0; }
197 245
198#define __TESTCLEARFLAG_FALSE(uname) \
199static inline int __TestClearPage##uname(struct page *page) { return 0; }
200
201#define PAGEFLAG_FALSE(uname) TESTPAGEFLAG_FALSE(uname) \ 246#define PAGEFLAG_FALSE(uname) TESTPAGEFLAG_FALSE(uname) \
202 SETPAGEFLAG_NOOP(uname) CLEARPAGEFLAG_NOOP(uname) 247 SETPAGEFLAG_NOOP(uname) CLEARPAGEFLAG_NOOP(uname)
203 248
204#define TESTSCFLAG_FALSE(uname) \ 249#define TESTSCFLAG_FALSE(uname) \
205 TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname) 250 TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname)
206 251
207struct page; /* forward declaration */ 252__PAGEFLAG(Locked, locked, PF_NO_TAIL)
208 253PAGEFLAG(Error, error, PF_NO_COMPOUND) TESTCLEARFLAG(Error, error, PF_NO_COMPOUND)
209TESTPAGEFLAG(Locked, locked) 254PAGEFLAG(Referenced, referenced, PF_HEAD)
210PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error) 255 TESTCLEARFLAG(Referenced, referenced, PF_HEAD)
211PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced) 256 __SETPAGEFLAG(Referenced, referenced, PF_HEAD)
212 __SETPAGEFLAG(Referenced, referenced) 257PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD)
213PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty) 258 __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD)
214PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru) 259PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD)
215PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active) 260PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)
216 TESTCLEARFLAG(Active, active) 261 TESTCLEARFLAG(Active, active, PF_HEAD)
217__PAGEFLAG(Slab, slab) 262__PAGEFLAG(Slab, slab, PF_NO_TAIL)
218PAGEFLAG(Checked, checked) /* Used by some filesystems */ 263__PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL)
219PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */ 264PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */
220PAGEFLAG(SavePinned, savepinned); /* Xen */ 265
221PAGEFLAG(Foreign, foreign); /* Xen */ 266/* Xen */
222PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved) 267PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND)
223PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked) 268 TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND)
224 __SETPAGEFLAG(SwapBacked, swapbacked) 269PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND);
225 270PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND);
226__PAGEFLAG(SlobFree, slob_free) 271
272PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
273 __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
274PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
275 __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
276 __SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
227 277
228/* 278/*
229 * Private page markings that may be used by the filesystem that owns the page 279 * Private page markings that may be used by the filesystem that owns the page
230 * for its own purposes. 280 * for its own purposes.
231 * - PG_private and PG_private_2 cause releasepage() and co to be invoked 281 * - PG_private and PG_private_2 cause releasepage() and co to be invoked
232 */ 282 */
233PAGEFLAG(Private, private) __SETPAGEFLAG(Private, private) 283PAGEFLAG(Private, private, PF_ANY) __SETPAGEFLAG(Private, private, PF_ANY)
234 __CLEARPAGEFLAG(Private, private) 284 __CLEARPAGEFLAG(Private, private, PF_ANY)
235PAGEFLAG(Private2, private_2) TESTSCFLAG(Private2, private_2) 285PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY)
236PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1) 286PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
287 TESTCLEARFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
237 288
238/* 289/*
239 * Only test-and-set exist for PG_writeback. The unconditional operators are 290 * Only test-and-set exist for PG_writeback. The unconditional operators are
240 * risky: they bypass page accounting. 291 * risky: they bypass page accounting.
241 */ 292 */
242TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback) 293TESTPAGEFLAG(Writeback, writeback, PF_NO_COMPOUND)
243PAGEFLAG(MappedToDisk, mappedtodisk) 294 TESTSCFLAG(Writeback, writeback, PF_NO_COMPOUND)
295PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_COMPOUND)
244 296
245/* PG_readahead is only used for reads; PG_reclaim is only for writes */ 297/* PG_readahead is only used for reads; PG_reclaim is only for writes */
246PAGEFLAG(Reclaim, reclaim) TESTCLEARFLAG(Reclaim, reclaim) 298PAGEFLAG(Reclaim, reclaim, PF_NO_COMPOUND)
247PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim) 299 TESTCLEARFLAG(Reclaim, reclaim, PF_NO_COMPOUND)
300PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND)
301 TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND)
248 302
249#ifdef CONFIG_HIGHMEM 303#ifdef CONFIG_HIGHMEM
250/* 304/*
@@ -257,31 +311,33 @@ PAGEFLAG_FALSE(HighMem)
257#endif 311#endif
258 312
259#ifdef CONFIG_SWAP 313#ifdef CONFIG_SWAP
260PAGEFLAG(SwapCache, swapcache) 314PAGEFLAG(SwapCache, swapcache, PF_NO_COMPOUND)
261#else 315#else
262PAGEFLAG_FALSE(SwapCache) 316PAGEFLAG_FALSE(SwapCache)
263#endif 317#endif
264 318
265PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable) 319PAGEFLAG(Unevictable, unevictable, PF_HEAD)
266 TESTCLEARFLAG(Unevictable, unevictable) 320 __CLEARPAGEFLAG(Unevictable, unevictable, PF_HEAD)
321 TESTCLEARFLAG(Unevictable, unevictable, PF_HEAD)
267 322
268#ifdef CONFIG_MMU 323#ifdef CONFIG_MMU
269PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked) 324PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
270 TESTSCFLAG(Mlocked, mlocked) __TESTCLEARFLAG(Mlocked, mlocked) 325 __CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
326 TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL)
271#else 327#else
272PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked) 328PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked)
273 TESTSCFLAG_FALSE(Mlocked) __TESTCLEARFLAG_FALSE(Mlocked) 329 TESTSCFLAG_FALSE(Mlocked)
274#endif 330#endif
275 331
276#ifdef CONFIG_ARCH_USES_PG_UNCACHED 332#ifdef CONFIG_ARCH_USES_PG_UNCACHED
277PAGEFLAG(Uncached, uncached) 333PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND)
278#else 334#else
279PAGEFLAG_FALSE(Uncached) 335PAGEFLAG_FALSE(Uncached)
280#endif 336#endif
281 337
282#ifdef CONFIG_MEMORY_FAILURE 338#ifdef CONFIG_MEMORY_FAILURE
283PAGEFLAG(HWPoison, hwpoison) 339PAGEFLAG(HWPoison, hwpoison, PF_ANY)
284TESTSCFLAG(HWPoison, hwpoison) 340TESTSCFLAG(HWPoison, hwpoison, PF_ANY)
285#define __PG_HWPOISON (1UL << PG_hwpoison) 341#define __PG_HWPOISON (1UL << PG_hwpoison)
286#else 342#else
287PAGEFLAG_FALSE(HWPoison) 343PAGEFLAG_FALSE(HWPoison)
@@ -289,10 +345,10 @@ PAGEFLAG_FALSE(HWPoison)
289#endif 345#endif
290 346
291#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) 347#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
292TESTPAGEFLAG(Young, young) 348TESTPAGEFLAG(Young, young, PF_ANY)
293SETPAGEFLAG(Young, young) 349SETPAGEFLAG(Young, young, PF_ANY)
294TESTCLEARFLAG(Young, young) 350TESTCLEARFLAG(Young, young, PF_ANY)
295PAGEFLAG(Idle, idle) 351PAGEFLAG(Idle, idle, PF_ANY)
296#endif 352#endif
297 353
298/* 354/*
@@ -317,6 +373,7 @@ PAGEFLAG(Idle, idle)
317 373
318static inline int PageAnon(struct page *page) 374static inline int PageAnon(struct page *page)
319{ 375{
376 page = compound_head(page);
320 return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; 377 return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
321} 378}
322 379
@@ -329,6 +386,7 @@ static inline int PageAnon(struct page *page)
329 */ 386 */
330static inline int PageKsm(struct page *page) 387static inline int PageKsm(struct page *page)
331{ 388{
389 page = compound_head(page);
332 return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == 390 return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
333 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); 391 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
334} 392}
@@ -340,8 +398,9 @@ u64 stable_page_flags(struct page *page);
340 398
341static inline int PageUptodate(struct page *page) 399static inline int PageUptodate(struct page *page)
342{ 400{
343 int ret = test_bit(PG_uptodate, &(page)->flags); 401 int ret;
344 402 page = compound_head(page);
403 ret = test_bit(PG_uptodate, &(page)->flags);
345 /* 404 /*
346 * Must ensure that the data we read out of the page is loaded 405 * Must ensure that the data we read out of the page is loaded
347 * _after_ we've loaded page->flags to check for PageUptodate. 406 * _after_ we've loaded page->flags to check for PageUptodate.
@@ -358,22 +417,24 @@ static inline int PageUptodate(struct page *page)
358 417
359static inline void __SetPageUptodate(struct page *page) 418static inline void __SetPageUptodate(struct page *page)
360{ 419{
420 VM_BUG_ON_PAGE(PageTail(page), page);
361 smp_wmb(); 421 smp_wmb();
362 __set_bit(PG_uptodate, &(page)->flags); 422 __set_bit(PG_uptodate, &page->flags);
363} 423}
364 424
365static inline void SetPageUptodate(struct page *page) 425static inline void SetPageUptodate(struct page *page)
366{ 426{
427 VM_BUG_ON_PAGE(PageTail(page), page);
367 /* 428 /*
368 * Memory barrier must be issued before setting the PG_uptodate bit, 429 * Memory barrier must be issued before setting the PG_uptodate bit,
369 * so that all previous stores issued in order to bring the page 430 * so that all previous stores issued in order to bring the page
370 * uptodate are actually visible before PageUptodate becomes true. 431 * uptodate are actually visible before PageUptodate becomes true.
371 */ 432 */
372 smp_wmb(); 433 smp_wmb();
373 set_bit(PG_uptodate, &(page)->flags); 434 set_bit(PG_uptodate, &page->flags);
374} 435}
375 436
376CLEARPAGEFLAG(Uptodate, uptodate) 437CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL)
377 438
378int test_clear_page_writeback(struct page *page); 439int test_clear_page_writeback(struct page *page);
379int __test_set_page_writeback(struct page *page, bool keep_write); 440int __test_set_page_writeback(struct page *page, bool keep_write);
@@ -393,12 +454,7 @@ static inline void set_page_writeback_keepwrite(struct page *page)
393 test_set_page_writeback_keepwrite(page); 454 test_set_page_writeback_keepwrite(page);
394} 455}
395 456
396__PAGEFLAG(Head, head) CLEARPAGEFLAG(Head, head) 457__PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY)
397
398static inline int PageTail(struct page *page)
399{
400 return READ_ONCE(page->compound_head) & 1;
401}
402 458
403static inline void set_compound_head(struct page *page, struct page *head) 459static inline void set_compound_head(struct page *page, struct page *head)
404{ 460{
@@ -410,20 +466,6 @@ static inline void clear_compound_head(struct page *page)
410 WRITE_ONCE(page->compound_head, 0); 466 WRITE_ONCE(page->compound_head, 0);
411} 467}
412 468
413static inline struct page *compound_head(struct page *page)
414{
415 unsigned long head = READ_ONCE(page->compound_head);
416
417 if (unlikely(head & 1))
418 return (struct page *) (head - 1);
419 return page;
420}
421
422static inline int PageCompound(struct page *page)
423{
424 return PageHead(page) || PageTail(page);
425
426}
427#ifdef CONFIG_TRANSPARENT_HUGEPAGE 469#ifdef CONFIG_TRANSPARENT_HUGEPAGE
428static inline void ClearPageCompound(struct page *page) 470static inline void ClearPageCompound(struct page *page)
429{ 471{
@@ -484,22 +526,43 @@ static inline int PageTransTail(struct page *page)
484 return PageTail(page); 526 return PageTail(page);
485} 527}
486 528
487#else 529/*
488 530 * PageDoubleMap indicates that the compound page is mapped with PTEs as well
489static inline int PageTransHuge(struct page *page) 531 * as PMDs.
532 *
533 * This is required for optimization of rmap operations for THP: we can postpone
534 * per small page mapcount accounting (and its overhead from atomic operations)
535 * until the first PMD split.
536 *
537 * For the page PageDoubleMap means ->_mapcount in all sub-pages is offset up
538 * by one. This reference will go away with last compound_mapcount.
539 *
540 * See also __split_huge_pmd_locked() and page_remove_anon_compound_rmap().
541 */
542static inline int PageDoubleMap(struct page *page)
490{ 543{
491 return 0; 544 return PageHead(page) && test_bit(PG_double_map, &page[1].flags);
492} 545}
493 546
494static inline int PageTransCompound(struct page *page) 547static inline int TestSetPageDoubleMap(struct page *page)
495{ 548{
496 return 0; 549 VM_BUG_ON_PAGE(!PageHead(page), page);
550 return test_and_set_bit(PG_double_map, &page[1].flags);
497} 551}
498 552
499static inline int PageTransTail(struct page *page) 553static inline int TestClearPageDoubleMap(struct page *page)
500{ 554{
501 return 0; 555 VM_BUG_ON_PAGE(!PageHead(page), page);
556 return test_and_clear_bit(PG_double_map, &page[1].flags);
502} 557}
558
559#else
560TESTPAGEFLAG_FALSE(TransHuge)
561TESTPAGEFLAG_FALSE(TransCompound)
562TESTPAGEFLAG_FALSE(TransTail)
563TESTPAGEFLAG_FALSE(DoubleMap)
564 TESTSETFLAG_FALSE(DoubleMap)
565 TESTCLEARFLAG_FALSE(DoubleMap)
503#endif 566#endif
504 567
505/* 568/*
@@ -583,12 +646,6 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
583#define __PG_MLOCKED 0 646#define __PG_MLOCKED 0
584#endif 647#endif
585 648
586#ifdef CONFIG_TRANSPARENT_HUGEPAGE
587#define __PG_COMPOUND_LOCK (1 << PG_compound_lock)
588#else
589#define __PG_COMPOUND_LOCK 0
590#endif
591
592/* 649/*
593 * Flags checked when a page is freed. Pages being freed should not have 650 * Flags checked when a page is freed. Pages being freed should not have
594 * these flags set. If they are, there is a problem. 651 * these flags set. If they are, there is a problem.
@@ -598,8 +655,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
598 1 << PG_private | 1 << PG_private_2 | \ 655 1 << PG_private | 1 << PG_private_2 | \
599 1 << PG_writeback | 1 << PG_reserved | \ 656 1 << PG_writeback | 1 << PG_reserved | \
600 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ 657 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \
601 1 << PG_unevictable | __PG_MLOCKED | \ 658 1 << PG_unevictable | __PG_MLOCKED)
602 __PG_COMPOUND_LOCK)
603 659
604/* 660/*
605 * Flags checked when a page is prepped for return by the page allocator. 661 * Flags checked when a page is prepped for return by the page allocator.
@@ -626,6 +682,10 @@ static inline int page_has_private(struct page *page)
626 return !!(page->flags & PAGE_FLAGS_PRIVATE); 682 return !!(page->flags & PAGE_FLAGS_PRIVATE);
627} 683}
628 684
685#undef PF_ANY
686#undef PF_HEAD
687#undef PF_NO_TAIL
688#undef PF_NO_COMPOUND
629#endif /* !__GENERATING_BOUNDS_H */ 689#endif /* !__GENERATING_BOUNDS_H */
630 690
631#endif /* PAGE_FLAGS_H */ 691#endif /* PAGE_FLAGS_H */
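
Hand-expanding one of the new policy-based definitions makes the mechanism concrete: PAGEFLAG(Dirty, dirty, PF_HEAD) produces accessors that first redirect any tail page to its compound head, roughly as below. Written out manually for illustration; the generated names are the real PageDirty()/SetPageDirty():

#include <linux/page-flags.h>

static inline int PageDirty_expansion(struct page *page)
{
        return test_bit(PG_dirty, &compound_head(page)->flags);
}

static inline void SetPageDirty_expansion(struct page *page)
{
        set_bit(PG_dirty, &compound_head(page)->flags);
}
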
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 26eabf5ec718..4d08b6c33557 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -394,10 +394,21 @@ static inline struct page *read_mapping_page(struct address_space *mapping,
394 */ 394 */
395static inline pgoff_t page_to_pgoff(struct page *page) 395static inline pgoff_t page_to_pgoff(struct page *page)
396{ 396{
397 pgoff_t pgoff;
398
397 if (unlikely(PageHeadHuge(page))) 399 if (unlikely(PageHeadHuge(page)))
398 return page->index << compound_order(page); 400 return page->index << compound_order(page);
399 else 401
402 if (likely(!PageTransTail(page)))
400 return page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 403 return page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
404
405 /*
406 * We don't initialize ->index for tail pages: calculate based on
407 * head page
408 */
409 pgoff = compound_head(page)->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
410 pgoff += page - compound_head(page);
411 return pgoff;
401} 412}
402 413
403/* 414/*
@@ -433,18 +444,9 @@ extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
433 unsigned int flags); 444 unsigned int flags);
434extern void unlock_page(struct page *page); 445extern void unlock_page(struct page *page);
435 446
436static inline void __set_page_locked(struct page *page)
437{
438 __set_bit(PG_locked, &page->flags);
439}
440
441static inline void __clear_page_locked(struct page *page)
442{
443 __clear_bit(PG_locked, &page->flags);
444}
445
446static inline int trylock_page(struct page *page) 447static inline int trylock_page(struct page *page)
447{ 448{
449 page = compound_head(page);
448 return (likely(!test_and_set_bit_lock(PG_locked, &page->flags))); 450 return (likely(!test_and_set_bit_lock(PG_locked, &page->flags)));
449} 451}
450 452
@@ -497,9 +499,9 @@ extern int wait_on_page_bit_killable_timeout(struct page *page,
497 499
498static inline int wait_on_page_locked_killable(struct page *page) 500static inline int wait_on_page_locked_killable(struct page *page)
499{ 501{
500 if (PageLocked(page)) 502 if (!PageLocked(page))
501 return wait_on_page_bit_killable(page, PG_locked); 503 return 0;
502 return 0; 504 return wait_on_page_bit_killable(compound_head(page), PG_locked);
503} 505}
504 506
505extern wait_queue_head_t *page_waitqueue(struct page *page); 507extern wait_queue_head_t *page_waitqueue(struct page *page);
@@ -518,7 +520,7 @@ static inline void wake_up_page(struct page *page, int bit)
518static inline void wait_on_page_locked(struct page *page) 520static inline void wait_on_page_locked(struct page *page)
519{ 521{
520 if (PageLocked(page)) 522 if (PageLocked(page))
521 wait_on_page_bit(page, PG_locked); 523 wait_on_page_bit(compound_head(page), PG_locked);
522} 524}
523 525
524/* 526/*
@@ -664,17 +666,17 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
664 666
665/* 667/*
666 * Like add_to_page_cache_locked, but used to add newly allocated pages: 668 * Like add_to_page_cache_locked, but used to add newly allocated pages:
667 * the page is new, so we can just run __set_page_locked() against it. 669 * the page is new, so we can just run __SetPageLocked() against it.
668 */ 670 */
669static inline int add_to_page_cache(struct page *page, 671static inline int add_to_page_cache(struct page *page,
670 struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) 672 struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
671{ 673{
672 int error; 674 int error;
673 675
674 __set_page_locked(page); 676 __SetPageLocked(page);
675 error = add_to_page_cache_locked(page, mapping, offset, gfp_mask); 677 error = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
676 if (unlikely(error)) 678 if (unlikely(error))
677 __clear_page_locked(page); 679 __ClearPageLocked(page);
678 return error; 680 return error;
679} 681}
680 682
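
Tail pages never get ->index initialised, so the new page_to_pgoff() derives a tail's file offset from its head. A by-hand equivalent for a non-hugetlb compound page, illustrative only:

#include <linux/pagemap.h>

static pgoff_t tail_pgoff_by_hand(struct page *tail)
{
        struct page *head = compound_head(tail);
        pgoff_t pgoff = head->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);

        /* same result as page_to_pgoff(tail) for a non-hugetlb THP */
        return pgoff + (tail - head);
}
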
diff --git a/include/linux/pfn.h b/include/linux/pfn.h
index 97f3e88aead4..2d8e49711b63 100644
--- a/include/linux/pfn.h
+++ b/include/linux/pfn.h
@@ -3,6 +3,15 @@
3 3
4#ifndef __ASSEMBLY__ 4#ifndef __ASSEMBLY__
5#include <linux/types.h> 5#include <linux/types.h>
6
7/*
8 * pfn_t: encapsulates a page-frame number that is optionally backed
9 * by memmap (struct page). Whether a pfn_t has a 'struct page'
10 * backing is indicated by flags in the high bits of the value.
11 */
12typedef struct {
13 unsigned long val;
14} pfn_t;
6#endif 15#endif
7 16
8#define PFN_ALIGN(x) (((unsigned long)(x) + (PAGE_SIZE - 1)) & PAGE_MASK) 17#define PFN_ALIGN(x) (((unsigned long)(x) + (PAGE_SIZE - 1)) & PAGE_MASK)
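
Wrapping the value in a one-member struct is deliberate: it lets the high bits carry flags (see the new pfn_t.h below) and makes the compiler reject accidental mixing of bare pfns with flagged ones. A minimal illustration; both helper functions are hypothetical:

#include <linux/pfn.h>

static void wants_pfn_t(pfn_t pfn) { (void)pfn.val; }           /* hypothetical */
static void wants_raw_pfn(unsigned long pfn) { (void)pfn; }     /* hypothetical */

static void type_safety_example(unsigned long raw)
{
        pfn_t pfn = { .val = raw };

        wants_pfn_t(pfn);       /* fine */
        wants_raw_pfn(raw);     /* fine */
        /* wants_pfn_t(raw);       rejected: incompatible types */
}
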
diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h
new file mode 100644
index 000000000000..0703b5360d31
--- /dev/null
+++ b/include/linux/pfn_t.h
@@ -0,0 +1,102 @@
1#ifndef _LINUX_PFN_T_H_
2#define _LINUX_PFN_T_H_
3#include <linux/mm.h>
4
5/*
6 * PFN_FLAGS_MASK - mask of all the possible valid pfn_t flags
7 * PFN_SG_CHAIN - pfn is a pointer to the next scatterlist entry
8 * PFN_SG_LAST - pfn references a page and is the last scatterlist entry
9 * PFN_DEV - pfn is not covered by system memmap by default
10 * PFN_MAP - pfn has a dynamic page mapping established by a device driver
11 */
12#define PFN_FLAGS_MASK (((unsigned long) ~PAGE_MASK) \
13 << (BITS_PER_LONG - PAGE_SHIFT))
14#define PFN_SG_CHAIN (1UL << (BITS_PER_LONG - 1))
15#define PFN_SG_LAST (1UL << (BITS_PER_LONG - 2))
16#define PFN_DEV (1UL << (BITS_PER_LONG - 3))
17#define PFN_MAP (1UL << (BITS_PER_LONG - 4))
18
19static inline pfn_t __pfn_to_pfn_t(unsigned long pfn, unsigned long flags)
20{
21 pfn_t pfn_t = { .val = pfn | (flags & PFN_FLAGS_MASK), };
22
23 return pfn_t;
24}
25
26/* a default pfn to pfn_t conversion assumes that @pfn is pfn_valid() */
27static inline pfn_t pfn_to_pfn_t(unsigned long pfn)
28{
29 return __pfn_to_pfn_t(pfn, 0);
30}
31
32extern pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags);
33
34static inline bool pfn_t_has_page(pfn_t pfn)
35{
36 return (pfn.val & PFN_MAP) == PFN_MAP || (pfn.val & PFN_DEV) == 0;
37}
38
39static inline unsigned long pfn_t_to_pfn(pfn_t pfn)
40{
41 return pfn.val & ~PFN_FLAGS_MASK;
42}
43
44static inline struct page *pfn_t_to_page(pfn_t pfn)
45{
46 if (pfn_t_has_page(pfn))
47 return pfn_to_page(pfn_t_to_pfn(pfn));
48 return NULL;
49}
50
51static inline dma_addr_t pfn_t_to_phys(pfn_t pfn)
52{
53 return PFN_PHYS(pfn_t_to_pfn(pfn));
54}
55
56static inline void *pfn_t_to_virt(pfn_t pfn)
57{
58 if (pfn_t_has_page(pfn))
59 return __va(pfn_t_to_phys(pfn));
60 return NULL;
61}
62
63static inline pfn_t page_to_pfn_t(struct page *page)
64{
65 return pfn_to_pfn_t(page_to_pfn(page));
66}
67
68static inline int pfn_t_valid(pfn_t pfn)
69{
70 return pfn_valid(pfn_t_to_pfn(pfn));
71}
72
73#ifdef CONFIG_MMU
74static inline pte_t pfn_t_pte(pfn_t pfn, pgprot_t pgprot)
75{
76 return pfn_pte(pfn_t_to_pfn(pfn), pgprot);
77}
78#endif
79
80#ifdef CONFIG_TRANSPARENT_HUGEPAGE
81static inline pmd_t pfn_t_pmd(pfn_t pfn, pgprot_t pgprot)
82{
83 return pfn_pmd(pfn_t_to_pfn(pfn), pgprot);
84}
85#endif
86
87#ifdef __HAVE_ARCH_PTE_DEVMAP
88static inline bool pfn_t_devmap(pfn_t pfn)
89{
90 const unsigned long flags = PFN_DEV|PFN_MAP;
91
92 return (pfn.val & flags) == flags;
93}
94#else
95static inline bool pfn_t_devmap(pfn_t pfn)
96{
97 return false;
98}
99pte_t pte_mkdevmap(pte_t pte);
100pmd_t pmd_mkdevmap(pmd_t pmd);
101#endif
102#endif /* _LINUX_PFN_T_H_ */
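
A hedged sketch of the intended consumer: a DAX-style fault path converts a device address into a pfn_t tagged PFN_DEV|PFN_MAP (so pfn_t_devmap() above is true) and hands it to vm_insert_mixed(), whose prototype in mm.h now takes a pfn_t. Everything except the pfn_t and mm helpers is hypothetical:

#include <linux/pfn_t.h>
#include <linux/mm.h>

static int dax_insert_example(struct vm_area_struct *vma, unsigned long vaddr,
                              dma_addr_t phys)
{
        pfn_t pfn = phys_to_pfn_t(phys, PFN_DEV | PFN_MAP);

        return vm_insert_mixed(vma, vaddr, pfn);
}
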
diff --git a/include/linux/poison.h b/include/linux/poison.h
index 317e16de09e5..4a27153574e2 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -27,11 +27,15 @@
27 * Magic number "tsta" to indicate a static timer initializer 27 * Magic number "tsta" to indicate a static timer initializer
28 * for the object debugging code. 28 * for the object debugging code.
29 */ 29 */
30#define TIMER_ENTRY_STATIC ((void *) 0x74737461) 30#define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA)
31 31
32/********** mm/debug-pagealloc.c **********/ 32/********** mm/debug-pagealloc.c **********/
33#define PAGE_POISON 0xaa 33#define PAGE_POISON 0xaa
34 34
35/********** mm/page_alloc.c ************/
36
37#define TAIL_MAPPING ((void *) 0x400 + POISON_POINTER_DELTA)
38
35/********** mm/slab.c **********/ 39/********** mm/slab.c **********/
36/* 40/*
37 * Magic nums for obj red zoning. 41 * Magic nums for obj red zoning.
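
TAIL_MAPPING is a new poison value for the ->mapping field of tail pages: a stray dereference faults (POISON_POINTER_DELTA keeps it in an unmapped range on the relevant architectures), and sanity checks can recognise a tail page whose head was freed under it. A hedged illustration of such a check, not the actual mm/page_alloc.c code:

#include <linux/poison.h>
#include <linux/mm_types.h>

static bool tail_mapping_intact(struct page *tail)
{
        return tail->mapping == TAIL_MAPPING;
}
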
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 9729565c25ff..9ccbdf2c1453 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -106,13 +106,13 @@ struct va_format {
106 106
107/* 107/*
108 * Dummy printk for disabled debugging statements to use whilst maintaining 108 * Dummy printk for disabled debugging statements to use whilst maintaining
109 * gcc's format and side-effect checking. 109 * gcc's format checking.
110 */ 110 */
111static inline __printf(1, 2) 111#define no_printk(fmt, ...) \
112int no_printk(const char *fmt, ...) 112do { \
113{ 113 if (0) \
114 return 0; 114 printk(fmt, ##__VA_ARGS__); \
115} 115} while (0)
116 116
117#ifdef CONFIG_EARLY_PRINTK 117#ifdef CONFIG_EARLY_PRINTK
118extern asmlinkage __printf(1, 2) 118extern asmlinkage __printf(1, 2)
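The macro form keeps gcc's printf-format checking while letting the compiler discard the statically dead call, including evaluation of its arguments. A user-space illustration of the same if (0) pattern, with a hypothetical no_printf() standing in for no_printk():

#include <stdio.h>

#define no_printf(fmt, ...)			\
do {						\
	if (0)					\
		printf(fmt, ##__VA_ARGS__);	\
} while (0)

int main(void)
{
	/* Still format-checked: a type mismatch here warns at compile
	 * time, yet the branch is statically dead and can be dropped. */
	no_printf("value = %d\n", 42);
	return 0;
}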
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 29446aeef36e..bdf597c4f0be 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -85,6 +85,7 @@ enum ttu_flags {
85 TTU_UNMAP = 1, /* unmap mode */ 85 TTU_UNMAP = 1, /* unmap mode */
86 TTU_MIGRATION = 2, /* migration mode */ 86 TTU_MIGRATION = 2, /* migration mode */
87 TTU_MUNLOCK = 4, /* munlock mode */ 87 TTU_MUNLOCK = 4, /* munlock mode */
88 TTU_LZFREE = 8, /* lazy free mode */
88 89
89 TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ 90 TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */
90 TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ 91 TTU_IGNORE_ACCESS = (1 << 9), /* don't age */
@@ -161,25 +162,31 @@ static inline void anon_vma_merge(struct vm_area_struct *vma,
161 162
162struct anon_vma *page_get_anon_vma(struct page *page); 163struct anon_vma *page_get_anon_vma(struct page *page);
163 164
165/* bitflags for do_page_add_anon_rmap() */
166#define RMAP_EXCLUSIVE 0x01
167#define RMAP_COMPOUND 0x02
168
164/* 169/*
165 * rmap interfaces called when adding or removing pte of page 170 * rmap interfaces called when adding or removing pte of page
166 */ 171 */
167void page_move_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); 172void page_move_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
168void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); 173void page_add_anon_rmap(struct page *, struct vm_area_struct *,
174 unsigned long, bool);
169void do_page_add_anon_rmap(struct page *, struct vm_area_struct *, 175void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
170 unsigned long, int); 176 unsigned long, int);
171void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); 177void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
178 unsigned long, bool);
172void page_add_file_rmap(struct page *); 179void page_add_file_rmap(struct page *);
173void page_remove_rmap(struct page *); 180void page_remove_rmap(struct page *, bool);
174 181
175void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, 182void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
176 unsigned long); 183 unsigned long);
177void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, 184void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *,
178 unsigned long); 185 unsigned long);
179 186
180static inline void page_dup_rmap(struct page *page) 187static inline void page_dup_rmap(struct page *page, bool compound)
181{ 188{
182 atomic_inc(&page->_mapcount); 189 atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount);
183} 190}
184 191
185/* 192/*
@@ -210,6 +217,25 @@ static inline pte_t *page_check_address(struct page *page, struct mm_struct *mm,
210} 217}
211 218
212/* 219/*
220 * Used by idle page tracking to check if a page was referenced via page
221 * tables.
222 */
223#ifdef CONFIG_TRANSPARENT_HUGEPAGE
224bool page_check_address_transhuge(struct page *page, struct mm_struct *mm,
225 unsigned long address, pmd_t **pmdp,
226 pte_t **ptep, spinlock_t **ptlp);
227#else
228static inline bool page_check_address_transhuge(struct page *page,
229 struct mm_struct *mm, unsigned long address,
230 pmd_t **pmdp, pte_t **ptep, spinlock_t **ptlp)
231{
232 *ptep = page_check_address(page, mm, address, ptlp, 0);
233 *pmdp = NULL;
234 return !!*ptep;
235}
236#endif
237
238/*
213 * Used by swapoff to help locate where page is expected in vma. 239 * Used by swapoff to help locate where page is expected in vma.
214 */ 240 */
215unsigned long page_address_in_vma(struct page *, struct vm_area_struct *); 241unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
@@ -286,5 +312,6 @@ static inline int page_mkclean(struct page *page)
286#define SWAP_AGAIN 1 312#define SWAP_AGAIN 1
287#define SWAP_FAIL 2 313#define SWAP_FAIL 2
288#define SWAP_MLOCK 3 314#define SWAP_MLOCK 3
315#define SWAP_LZFREE 4
289 316
290#endif /* _LINUX_RMAP_H */ 317#endif /* _LINUX_RMAP_H */
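A hedged caller sketch for the new page_check_address_transhuge() prototype, modelled on the idle-page-tracking use named in its comment; the young-bit handling and the lock/unmap conventions are assumptions drawn from the !THP fallback above, not from this hunk:

/* Sketch only: was @page recently referenced in @mm at @addr? */
static bool example_page_young(struct page *page, struct mm_struct *mm,
			       unsigned long addr)
{
	pmd_t *pmd;
	pte_t *pte;
	spinlock_t *ptl;
	bool young = false;

	if (!page_check_address_transhuge(page, mm, addr, &pmd, &pte, &ptl))
		return false;		/* not mapped here */
	if (pte) {			/* regular pte mapping */
		young = pte_young(*pte);
		pte_unmap(pte);
	} else if (pmd) {		/* PMD-mapped THP */
		young = pmd_young(*pmd);
	}
	spin_unlock(ptl);		/* assumed held on success, as in the
					 * page_check_address() fallback */
	return young;
}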
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 066bd21765ad..414e101cd061 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -307,6 +307,7 @@ extern void lru_add_drain_cpu(int cpu);
307extern void lru_add_drain_all(void); 307extern void lru_add_drain_all(void);
308extern void rotate_reclaimable_page(struct page *page); 308extern void rotate_reclaimable_page(struct page *page);
309extern void deactivate_file_page(struct page *page); 309extern void deactivate_file_page(struct page *page);
310extern void deactivate_page(struct page *page);
310extern void swap_setup(void); 311extern void swap_setup(void);
311 312
312extern void add_page_to_unevictable_list(struct page *page); 313extern void add_page_to_unevictable_list(struct page *page);
@@ -538,7 +539,8 @@ static inline int swp_swapcount(swp_entry_t entry)
538 return 0; 539 return 0;
539} 540}
540 541
541#define reuse_swap_page(page) (page_mapcount(page) == 1) 542#define reuse_swap_page(page) \
543 (!PageTransCompound(page) && page_mapcount(page) == 1)
542 544
543static inline int try_to_free_swap(struct page *page) 545static inline int try_to_free_swap(struct page *page)
544{ 546{
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index e623d392db0c..67c1dbd19c6d 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -25,6 +25,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
25 FOR_ALL_ZONES(PGALLOC), 25 FOR_ALL_ZONES(PGALLOC),
26 PGFREE, PGACTIVATE, PGDEACTIVATE, 26 PGFREE, PGACTIVATE, PGDEACTIVATE,
27 PGFAULT, PGMAJFAULT, 27 PGFAULT, PGMAJFAULT,
28 PGLAZYFREED,
28 FOR_ALL_ZONES(PGREFILL), 29 FOR_ALL_ZONES(PGREFILL),
29 FOR_ALL_ZONES(PGSTEAL_KSWAPD), 30 FOR_ALL_ZONES(PGSTEAL_KSWAPD),
30 FOR_ALL_ZONES(PGSTEAL_DIRECT), 31 FOR_ALL_ZONES(PGSTEAL_DIRECT),
@@ -68,7 +69,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
68 THP_FAULT_FALLBACK, 69 THP_FAULT_FALLBACK,
69 THP_COLLAPSE_ALLOC, 70 THP_COLLAPSE_ALLOC,
70 THP_COLLAPSE_ALLOC_FAILED, 71 THP_COLLAPSE_ALLOC_FAILED,
71 THP_SPLIT, 72 THP_SPLIT_PAGE,
73 THP_SPLIT_PAGE_FAILED,
74 THP_SPLIT_PMD,
72 THP_ZERO_PAGE_ALLOC, 75 THP_ZERO_PAGE_ALLOC,
73 THP_ZERO_PAGE_ALLOC_FAILED, 76 THP_ZERO_PAGE_ALLOC_FAILED,
74#endif 77#endif
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index 97d635cabac8..0f803d2783e3 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -22,6 +22,7 @@
22 EM( SCAN_PAGE_LRU, "page_not_in_lru") \ 22 EM( SCAN_PAGE_LRU, "page_not_in_lru") \
23 EM( SCAN_PAGE_LOCK, "page_locked") \ 23 EM( SCAN_PAGE_LOCK, "page_locked") \
24 EM( SCAN_PAGE_ANON, "page_not_anon") \ 24 EM( SCAN_PAGE_ANON, "page_not_anon") \
25 EM( SCAN_PAGE_COMPOUND, "page_compound") \
25 EM( SCAN_ANY_PROCESS, "no_process_for_page") \ 26 EM( SCAN_ANY_PROCESS, "no_process_for_page") \
26 EM( SCAN_VMA_NULL, "vma_null") \ 27 EM( SCAN_VMA_NULL, "vma_null") \
27 EM( SCAN_VMA_CHECK, "vma_check_failed") \ 28 EM( SCAN_VMA_CHECK, "vma_check_failed") \
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index a74dd84bbb6d..58274382a616 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -41,6 +41,7 @@
41#define MADV_DONTNEED 4 /* don't need these pages */ 41#define MADV_DONTNEED 4 /* don't need these pages */
42 42
43/* common parameters: try to keep these consistent across architectures */ 43/* common parameters: try to keep these consistent across architectures */
44#define MADV_FREE 8 /* free pages only if memory pressure */
44#define MADV_REMOVE 9 /* remove these pages & resources */ 45#define MADV_REMOVE 9 /* remove these pages & resources */
45#define MADV_DONTFORK 10 /* don't inherit across fork */ 46#define MADV_DONTFORK 10 /* don't inherit across fork */
46#define MADV_DOFORK 11 /* do inherit across fork */ 47#define MADV_DOFORK 11 /* do inherit across fork */
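A minimal user-space sketch of the new advice value: dirty an anonymous mapping, then tell the kernel the contents may be dropped lazily under memory pressure. The buffer size and the fallback define are illustrative:

#include <sys/mman.h>
#include <string.h>

#ifndef MADV_FREE
#define MADV_FREE 8	/* matches the value added above */
#endif

int main(void)
{
	size_t len = 1 << 20;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;
	memset(buf, 0xaa, len);		/* dirty the pages */
	/* Pages may now be reclaimed without writeback; reading them
	 * later returns either the old data or zero-filled pages. */
	madvise(buf, len, MADV_FREE);
	return 0;
}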
diff --git a/init/Kconfig b/init/Kconfig
index 5481b49e8c3f..4644217b2373 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -285,7 +285,7 @@ config FHANDLE
285 285
286config USELIB 286config USELIB
287 bool "uselib syscall" 287 bool "uselib syscall"
288 default y 288 def_bool ALPHA || M68K || SPARC || X86_32 || IA32_EMULATION
289 help 289 help
290 This option enables the uselib syscall, a system call used in the 290 This option enables the uselib syscall, a system call used in the
291 dynamic linker from libc5 and earlier. glibc does not use this 291 dynamic linker from libc5 and earlier. glibc does not use this
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index bb0669169716..0167679182c0 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -161,7 +161,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
161 const unsigned long mmun_end = addr + PAGE_SIZE; 161 const unsigned long mmun_end = addr + PAGE_SIZE;
162 struct mem_cgroup *memcg; 162 struct mem_cgroup *memcg;
163 163
164 err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg); 164 err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg,
165 false);
165 if (err) 166 if (err)
166 return err; 167 return err;
167 168
@@ -175,8 +176,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
175 goto unlock; 176 goto unlock;
176 177
177 get_page(kpage); 178 get_page(kpage);
178 page_add_new_anon_rmap(kpage, vma, addr); 179 page_add_new_anon_rmap(kpage, vma, addr, false);
179 mem_cgroup_commit_charge(kpage, memcg, false); 180 mem_cgroup_commit_charge(kpage, memcg, false, false);
180 lru_cache_add_active_or_unevictable(kpage, vma); 181 lru_cache_add_active_or_unevictable(kpage, vma);
181 182
182 if (!PageAnon(page)) { 183 if (!PageAnon(page)) {
@@ -188,7 +189,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
188 ptep_clear_flush_notify(vma, addr, ptep); 189 ptep_clear_flush_notify(vma, addr, ptep);
189 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); 190 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
190 191
191 page_remove_rmap(page); 192 page_remove_rmap(page, false);
192 if (!page_mapped(page)) 193 if (!page_mapped(page))
193 try_to_free_swap(page); 194 try_to_free_swap(page);
194 pte_unmap_unlock(ptep, ptl); 195 pte_unmap_unlock(ptep, ptl);
@@ -199,7 +200,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
199 200
200 err = 0; 201 err = 0;
201 unlock: 202 unlock:
202 mem_cgroup_cancel_charge(kpage, memcg); 203 mem_cgroup_cancel_charge(kpage, memcg, false);
203 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 204 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
204 unlock_page(page); 205 unlock_page(page);
205 return err; 206 return err;
diff --git a/kernel/futex.c b/kernel/futex.c
index 8a310e240cda..c6f514573b28 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -469,7 +469,8 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
469{ 469{
470 unsigned long address = (unsigned long)uaddr; 470 unsigned long address = (unsigned long)uaddr;
471 struct mm_struct *mm = current->mm; 471 struct mm_struct *mm = current->mm;
472 struct page *page, *page_head; 472 struct page *page;
473 struct address_space *mapping;
473 int err, ro = 0; 474 int err, ro = 0;
474 475
475 /* 476 /*
@@ -519,46 +520,9 @@ again:
519 else 520 else
520 err = 0; 521 err = 0;
521 522
522#ifdef CONFIG_TRANSPARENT_HUGEPAGE 523 lock_page(page);
523 page_head = page;
524 if (unlikely(PageTail(page))) {
525 put_page(page);
526 /* serialize against __split_huge_page_splitting() */
527 local_irq_disable();
528 if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
529 page_head = compound_head(page);
530 /*
531 * page_head is valid pointer but we must pin
532 * it before taking the PG_lock and/or
533 * PG_compound_lock. The moment we re-enable
534 * irqs __split_huge_page_splitting() can
535 * return and the head page can be freed from
536 * under us. We can't take the PG_lock and/or
537 * PG_compound_lock on a page that could be
538 * freed from under us.
539 */
540 if (page != page_head) {
541 get_page(page_head);
542 put_page(page);
543 }
544 local_irq_enable();
545 } else {
546 local_irq_enable();
547 goto again;
548 }
549 }
550#else
551 page_head = compound_head(page);
552 if (page != page_head) {
553 get_page(page_head);
554 put_page(page);
555 }
556#endif
557
558 lock_page(page_head);
559
560 /* 524 /*
561 * If page_head->mapping is NULL, then it cannot be a PageAnon 525 * If page->mapping is NULL, then it cannot be a PageAnon
562 * page; but it might be the ZERO_PAGE or in the gate area or 526 * page; but it might be the ZERO_PAGE or in the gate area or
563 * in a special mapping (all cases which we are happy to fail); 527 * in a special mapping (all cases which we are happy to fail);
564 * or it may have been a good file page when get_user_pages_fast 528 * or it may have been a good file page when get_user_pages_fast
@@ -570,12 +534,13 @@ again:
570 * 534 *
571 * The case we do have to guard against is when memory pressure made 535 * The case we do have to guard against is when memory pressure made
572 * shmem_writepage move it from filecache to swapcache beneath us: 536 * shmem_writepage move it from filecache to swapcache beneath us:
573 * an unlikely race, but we do need to retry for page_head->mapping. 537 * an unlikely race, but we do need to retry for page->mapping.
574 */ 538 */
575 if (!page_head->mapping) { 539 mapping = compound_head(page)->mapping;
576 int shmem_swizzled = PageSwapCache(page_head); 540 if (!mapping) {
577 unlock_page(page_head); 541 int shmem_swizzled = PageSwapCache(page);
578 put_page(page_head); 542 unlock_page(page);
543 put_page(page);
579 if (shmem_swizzled) 544 if (shmem_swizzled)
580 goto again; 545 goto again;
581 return -EFAULT; 546 return -EFAULT;
@@ -588,7 +553,7 @@ again:
588 * it's a read-only handle, it's expected that futexes attach to 553 * it's a read-only handle, it's expected that futexes attach to
589 * the object not the particular process. 554 * the object not the particular process.
590 */ 555 */
591 if (PageAnon(page_head)) { 556 if (PageAnon(page)) {
592 /* 557 /*
593 * A RO anonymous page will never change and thus doesn't make 558 * A RO anonymous page will never change and thus doesn't make
594 * sense for futex operations. 559 * sense for futex operations.
@@ -603,15 +568,15 @@ again:
603 key->private.address = address; 568 key->private.address = address;
604 } else { 569 } else {
605 key->both.offset |= FUT_OFF_INODE; /* inode-based key */ 570 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
606 key->shared.inode = page_head->mapping->host; 571 key->shared.inode = mapping->host;
607 key->shared.pgoff = basepage_index(page); 572 key->shared.pgoff = basepage_index(page);
608 } 573 }
609 574
610 get_futex_key_refs(key); /* implies MB (B) */ 575 get_futex_key_refs(key); /* implies MB (B) */
611 576
612out: 577out:
613 unlock_page(page_head); 578 unlock_page(page);
614 put_page(page_head); 579 put_page(page);
615 return err; 580 return err;
616} 581}
617 582
@@ -639,7 +604,7 @@ static int fault_in_user_writeable(u32 __user *uaddr)
639 604
640 down_read(&mm->mmap_sem); 605 down_read(&mm->mmap_sem);
641 ret = fixup_user_fault(current, mm, (unsigned long)uaddr, 606 ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
642 FAULT_FLAG_WRITE); 607 FAULT_FLAG_WRITE, NULL);
643 up_read(&mm->mmap_sem); 608 up_read(&mm->mmap_sem);
644 609
645 return ret < 0 ? ret : 0; 610 return ret < 0 ? ret : 0;
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 7658d32c5c78..e517a16cb426 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -10,8 +10,11 @@
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details. 11 * General Public License for more details.
12 */ 12 */
13#include <linux/radix-tree.h>
14#include <linux/memremap.h>
13#include <linux/device.h> 15#include <linux/device.h>
14#include <linux/types.h> 16#include <linux/types.h>
17#include <linux/pfn_t.h>
15#include <linux/io.h> 18#include <linux/io.h>
16#include <linux/mm.h> 19#include <linux/mm.h>
17#include <linux/memory_hotplug.h> 20#include <linux/memory_hotplug.h>
@@ -147,24 +150,127 @@ void devm_memunmap(struct device *dev, void *addr)
147} 150}
148EXPORT_SYMBOL(devm_memunmap); 151EXPORT_SYMBOL(devm_memunmap);
149 152
153pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags)
154{
155 return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags);
156}
157EXPORT_SYMBOL(phys_to_pfn_t);
158
150#ifdef CONFIG_ZONE_DEVICE 159#ifdef CONFIG_ZONE_DEVICE
160static DEFINE_MUTEX(pgmap_lock);
161static RADIX_TREE(pgmap_radix, GFP_KERNEL);
162#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
163#define SECTION_SIZE (1UL << PA_SECTION_SHIFT)
164
151struct page_map { 165struct page_map {
152 struct resource res; 166 struct resource res;
167 struct percpu_ref *ref;
168 struct dev_pagemap pgmap;
169 struct vmem_altmap altmap;
153}; 170};
154 171
155static void devm_memremap_pages_release(struct device *dev, void *res) 172void get_zone_device_page(struct page *page)
173{
174 percpu_ref_get(page->pgmap->ref);
175}
176EXPORT_SYMBOL(get_zone_device_page);
177
178void put_zone_device_page(struct page *page)
179{
180 put_dev_pagemap(page->pgmap);
181}
182EXPORT_SYMBOL(put_zone_device_page);
183
184static void pgmap_radix_release(struct resource *res)
185{
186 resource_size_t key;
187
188 mutex_lock(&pgmap_lock);
189 for (key = res->start; key <= res->end; key += SECTION_SIZE)
190 radix_tree_delete(&pgmap_radix, key >> PA_SECTION_SHIFT);
191 mutex_unlock(&pgmap_lock);
192}
193
194static unsigned long pfn_first(struct page_map *page_map)
195{
196 struct dev_pagemap *pgmap = &page_map->pgmap;
197 const struct resource *res = &page_map->res;
198 struct vmem_altmap *altmap = pgmap->altmap;
199 unsigned long pfn;
200
201 pfn = res->start >> PAGE_SHIFT;
202 if (altmap)
203 pfn += vmem_altmap_offset(altmap);
204 return pfn;
205}
206
207static unsigned long pfn_end(struct page_map *page_map)
208{
209 const struct resource *res = &page_map->res;
210
211 return (res->start + resource_size(res)) >> PAGE_SHIFT;
212}
213
214#define for_each_device_pfn(pfn, map) \
215 for (pfn = pfn_first(map); pfn < pfn_end(map); pfn++)
216
217static void devm_memremap_pages_release(struct device *dev, void *data)
156{ 218{
157 struct page_map *page_map = res; 219 struct page_map *page_map = data;
220 struct resource *res = &page_map->res;
221 resource_size_t align_start, align_size;
222 struct dev_pagemap *pgmap = &page_map->pgmap;
223
224 if (percpu_ref_tryget_live(pgmap->ref)) {
225 dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
226 percpu_ref_put(pgmap->ref);
227 }
228
229 pgmap_radix_release(res);
158 230
159 /* pages are dead and unused, undo the arch mapping */ 231 /* pages are dead and unused, undo the arch mapping */
160 arch_remove_memory(page_map->res.start, resource_size(&page_map->res)); 232 align_start = res->start & ~(SECTION_SIZE - 1);
233 align_size = ALIGN(resource_size(res), SECTION_SIZE);
234 arch_remove_memory(align_start, align_size);
235 dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc,
236 "%s: failed to free all reserved pages\n", __func__);
237}
238
239/* assumes rcu_read_lock() held at entry */
240struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
241{
242 struct page_map *page_map;
243
244 WARN_ON_ONCE(!rcu_read_lock_held());
245
246 page_map = radix_tree_lookup(&pgmap_radix, phys >> PA_SECTION_SHIFT);
247 return page_map ? &page_map->pgmap : NULL;
161} 248}
162 249
163void *devm_memremap_pages(struct device *dev, struct resource *res) 250/**
251 * devm_memremap_pages - remap and provide memmap backing for the given resource
252 * @dev: hosting device for @res
253 * @res: "host memory" address range
254 * @ref: a live per-cpu reference count
255 * @altmap: optional descriptor for allocating the memmap from @res
256 *
257 * Notes:
258 * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time
259 * (or devm release event).
260 *
261 * 2/ @res is expected to be a host memory range that could feasibly be
262 * treated as a "System RAM" range, i.e. not a device mmio range, but
263 * this is not enforced.
264 */
265void *devm_memremap_pages(struct device *dev, struct resource *res,
266 struct percpu_ref *ref, struct vmem_altmap *altmap)
164{ 267{
165 int is_ram = region_intersects(res->start, resource_size(res), 268 int is_ram = region_intersects(res->start, resource_size(res),
166 "System RAM"); 269 "System RAM");
270 resource_size_t key, align_start, align_size;
271 struct dev_pagemap *pgmap;
167 struct page_map *page_map; 272 struct page_map *page_map;
273 unsigned long pfn;
168 int error, nid; 274 int error, nid;
169 275
170 if (is_ram == REGION_MIXED) { 276 if (is_ram == REGION_MIXED) {
@@ -176,25 +282,120 @@ void *devm_memremap_pages(struct device *dev, struct resource *res)
176 if (is_ram == REGION_INTERSECTS) 282 if (is_ram == REGION_INTERSECTS)
177 return __va(res->start); 283 return __va(res->start);
178 284
285 if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) {
286 dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n",
287 __func__);
288 return ERR_PTR(-ENXIO);
289 }
290
291 if (!ref)
292 return ERR_PTR(-EINVAL);
293
179 page_map = devres_alloc_node(devm_memremap_pages_release, 294 page_map = devres_alloc_node(devm_memremap_pages_release,
180 sizeof(*page_map), GFP_KERNEL, dev_to_node(dev)); 295 sizeof(*page_map), GFP_KERNEL, dev_to_node(dev));
181 if (!page_map) 296 if (!page_map)
182 return ERR_PTR(-ENOMEM); 297 return ERR_PTR(-ENOMEM);
298 pgmap = &page_map->pgmap;
183 299
184 memcpy(&page_map->res, res, sizeof(*res)); 300 memcpy(&page_map->res, res, sizeof(*res));
185 301
302 pgmap->dev = dev;
303 if (altmap) {
304 memcpy(&page_map->altmap, altmap, sizeof(*altmap));
305 pgmap->altmap = &page_map->altmap;
306 }
307 pgmap->ref = ref;
308 pgmap->res = &page_map->res;
309
310 mutex_lock(&pgmap_lock);
311 error = 0;
312 for (key = res->start; key <= res->end; key += SECTION_SIZE) {
313 struct dev_pagemap *dup;
314
315 rcu_read_lock();
316 dup = find_dev_pagemap(key);
317 rcu_read_unlock();
318 if (dup) {
319 dev_err(dev, "%s: %pr collides with mapping for %s\n",
320 __func__, res, dev_name(dup->dev));
321 error = -EBUSY;
322 break;
323 }
324 error = radix_tree_insert(&pgmap_radix, key >> PA_SECTION_SHIFT,
325 page_map);
326 if (error) {
327 dev_err(dev, "%s: failed: %d\n", __func__, error);
328 break;
329 }
330 }
331 mutex_unlock(&pgmap_lock);
332 if (error)
333 goto err_radix;
334
186 nid = dev_to_node(dev); 335 nid = dev_to_node(dev);
187 if (nid < 0) 336 if (nid < 0)
188 nid = numa_mem_id(); 337 nid = numa_mem_id();
189 338
190 error = arch_add_memory(nid, res->start, resource_size(res), true); 339 align_start = res->start & ~(SECTION_SIZE - 1);
191 if (error) { 340 align_size = ALIGN(resource_size(res), SECTION_SIZE);
192 devres_free(page_map); 341 error = arch_add_memory(nid, align_start, align_size, true);
193 return ERR_PTR(error); 342 if (error)
194 } 343 goto err_add_memory;
195 344
345 for_each_device_pfn(pfn, page_map) {
346 struct page *page = pfn_to_page(pfn);
347
348 /* ZONE_DEVICE pages must never appear on a slab lru */
349 list_force_poison(&page->lru);
350 page->pgmap = pgmap;
351 }
196 devres_add(dev, page_map); 352 devres_add(dev, page_map);
197 return __va(res->start); 353 return __va(res->start);
354
355 err_add_memory:
356 err_radix:
357 pgmap_radix_release(res);
358 devres_free(page_map);
359 return ERR_PTR(error);
198} 360}
199EXPORT_SYMBOL(devm_memremap_pages); 361EXPORT_SYMBOL(devm_memremap_pages);
362
363unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
364{
365 /* number of pfns from base where pfn_to_page() is valid */
366 return altmap->reserve + altmap->free;
367}
368
369void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
370{
371 altmap->alloc -= nr_pfns;
372}
373
374#ifdef CONFIG_SPARSEMEM_VMEMMAP
375struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
376{
377 /*
378 * 'memmap_start' is the virtual address for the first "struct
379 * page" in this range of the vmemmap array. In the case of
 380 * CONFIG_SPARSEMEM_VMEMMAP a page_to_pfn conversion is simple
381 * pointer arithmetic, so we can perform this to_vmem_altmap()
382 * conversion without concern for the initialization state of
383 * the struct page fields.
384 */
385 struct page *page = (struct page *) memmap_start;
386 struct dev_pagemap *pgmap;
387
388 /*
 389 * Unconditionally retrieve a dev_pagemap associated with the
390 * given physical address, this is only for use in the
391 * arch_{add|remove}_memory() for setting up and tearing down
392 * the memmap.
393 */
394 rcu_read_lock();
395 pgmap = find_dev_pagemap(__pfn_to_phys(page_to_pfn(page)));
396 rcu_read_unlock();
397
398 return pgmap ? pgmap->altmap : NULL;
399}
400#endif /* CONFIG_SPARSEMEM_VMEMMAP */
200#endif /* CONFIG_ZONE_DEVICE */ 401#endif /* CONFIG_ZONE_DEVICE */
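A hedged driver-side sketch of the reworked devm_memremap_pages() signature documented above; the wrapper name and the way @res and @ref are obtained are assumptions, not part of this patch:

#include <linux/memremap.h>
#include <linux/err.h>

/* Sketch: remap a device range and get ZONE_DEVICE struct pages for it. */
static void *example_remap(struct device *dev, struct resource *res,
			   struct percpu_ref *ref)
{
	void *addr;

	/* NULL altmap: the memmap is allocated from regular memory */
	addr = devm_memremap_pages(dev, res, ref, NULL);
	if (IS_ERR(addr))
		return addr;
	/*
	 * Every pfn in @res now has a struct page with ->pgmap pointing
	 * at the registered dev_pagemap; @ref must stay live until the
	 * devres release runs, per note 1/ in the kernel-doc above.
	 */
	return addr;
}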
diff --git a/kernel/panic.c b/kernel/panic.c
index b333380c6bb2..d96469de72dc 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -180,8 +180,7 @@ void panic(const char *fmt, ...)
 180 * panic() is not being called from OOPS. 180 * panic() is not being called from OOPS.
181 */ 181 */
182 debug_locks_off(); 182 debug_locks_off();
183 console_trylock(); 183 console_flush_on_panic();
184 console_unlock();
185 184
186 if (!panic_blink) 185 if (!panic_blink)
187 panic_blink = no_blink; 186 panic_blink = no_blink;
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 2ce8826f1053..e79439134978 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -48,6 +48,7 @@
48#include <linux/uio.h> 48#include <linux/uio.h>
49 49
50#include <asm/uaccess.h> 50#include <asm/uaccess.h>
51#include <asm-generic/sections.h>
51 52
52#define CREATE_TRACE_POINTS 53#define CREATE_TRACE_POINTS
53#include <trace/events/printk.h> 54#include <trace/events/printk.h>
@@ -1660,7 +1661,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1660 const char *dict, size_t dictlen, 1661 const char *dict, size_t dictlen,
1661 const char *fmt, va_list args) 1662 const char *fmt, va_list args)
1662{ 1663{
1663 static int recursion_bug; 1664 static bool recursion_bug;
1664 static char textbuf[LOG_LINE_MAX]; 1665 static char textbuf[LOG_LINE_MAX];
1665 char *text = textbuf; 1666 char *text = textbuf;
1666 size_t text_len = 0; 1667 size_t text_len = 0;
@@ -1696,7 +1697,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1696 * it can be printed at the next appropriate moment: 1697 * it can be printed at the next appropriate moment:
1697 */ 1698 */
1698 if (!oops_in_progress && !lockdep_recursing(current)) { 1699 if (!oops_in_progress && !lockdep_recursing(current)) {
1699 recursion_bug = 1; 1700 recursion_bug = true;
1700 local_irq_restore(flags); 1701 local_irq_restore(flags);
1701 return 0; 1702 return 0;
1702 } 1703 }
@@ -1711,7 +1712,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1711 static const char recursion_msg[] = 1712 static const char recursion_msg[] =
1712 "BUG: recent printk recursion!"; 1713 "BUG: recent printk recursion!";
1713 1714
1714 recursion_bug = 0; 1715 recursion_bug = false;
1715 /* emit KERN_CRIT message */ 1716 /* emit KERN_CRIT message */
1716 printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, 1717 printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
1717 NULL, 0, recursion_msg, 1718 NULL, 0, recursion_msg,
@@ -2233,13 +2234,24 @@ void console_unlock(void)
2233 static u64 seen_seq; 2234 static u64 seen_seq;
2234 unsigned long flags; 2235 unsigned long flags;
2235 bool wake_klogd = false; 2236 bool wake_klogd = false;
2236 bool retry; 2237 bool do_cond_resched, retry;
2237 2238
2238 if (console_suspended) { 2239 if (console_suspended) {
2239 up_console_sem(); 2240 up_console_sem();
2240 return; 2241 return;
2241 } 2242 }
2242 2243
2244 /*
2245 * Console drivers are called under logbuf_lock, so
2246 * @console_may_schedule should be cleared before; however, we may
2247 * end up dumping a lot of lines, for example, if called from
2248 * console registration path, and should invoke cond_resched()
2249 * between lines if allowable. Not doing so can cause a very long
2250 * scheduling stall on a slow console leading to RCU stall and
 2251 * softlockup warnings, which exacerbate the issue with more
 2252 * messages, practically incapacitating the system.
2253 */
2254 do_cond_resched = console_may_schedule;
2243 console_may_schedule = 0; 2255 console_may_schedule = 0;
2244 2256
2245 /* flush buffered message fragment immediately to console */ 2257 /* flush buffered message fragment immediately to console */
@@ -2311,6 +2323,9 @@ skip:
2311 call_console_drivers(level, ext_text, ext_len, text, len); 2323 call_console_drivers(level, ext_text, ext_len, text, len);
2312 start_critical_timings(); 2324 start_critical_timings();
2313 local_irq_restore(flags); 2325 local_irq_restore(flags);
2326
2327 if (do_cond_resched)
2328 cond_resched();
2314 } 2329 }
2315 console_locked = 0; 2330 console_locked = 0;
2316 2331
@@ -2378,6 +2393,25 @@ void console_unblank(void)
2378 console_unlock(); 2393 console_unlock();
2379} 2394}
2380 2395
2396/**
2397 * console_flush_on_panic - flush console content on panic
2398 *
2399 * Immediately output all pending messages no matter what.
2400 */
2401void console_flush_on_panic(void)
2402{
2403 /*
2404 * If someone else is holding the console lock, trylock will fail
2405 * and may_schedule may be set. Ignore and proceed to unlock so
2406 * that messages are flushed out. As this can be called from any
2407 * context and we don't want to get preempted while flushing,
2408 * ensure may_schedule is cleared.
2409 */
2410 console_trylock();
2411 console_may_schedule = 0;
2412 console_unlock();
2413}
2414
2381/* 2415/*
2382 * Return the console tty driver structure and its associated index 2416 * Return the console tty driver structure and its associated index
2383 */ 2417 */
@@ -2658,13 +2692,36 @@ int unregister_console(struct console *console)
2658} 2692}
2659EXPORT_SYMBOL(unregister_console); 2693EXPORT_SYMBOL(unregister_console);
2660 2694
2695/*
2696 * Some boot consoles access data that is in the init section and which will
2697 * be discarded after the initcalls have been run. To make sure that no code
2698 * will access this data, unregister the boot consoles in a late initcall.
2699 *
2700 * If for some reason, such as deferred probe or the driver being a loadable
2701 * module, the real console hasn't registered yet at this point, there will
2702 * be a brief interval in which no messages are logged to the console, which
2703 * makes it difficult to diagnose problems that occur during this time.
2704 *
2705 * To mitigate this problem somewhat, only unregister consoles whose memory
2706 * intersects with the init section. Note that code exists elsewhere to get
2707 * rid of the boot console as soon as the proper console shows up, so there
2708 * won't be side-effects from postponing the removal.
2709 */
2661static int __init printk_late_init(void) 2710static int __init printk_late_init(void)
2662{ 2711{
2663 struct console *con; 2712 struct console *con;
2664 2713
2665 for_each_console(con) { 2714 for_each_console(con) {
2666 if (!keep_bootcon && con->flags & CON_BOOT) { 2715 if (!keep_bootcon && con->flags & CON_BOOT) {
2667 unregister_console(con); 2716 /*
2717 * Make sure to unregister boot consoles whose data
2718 * resides in the init section before the init section
2719 * is discarded. Boot consoles whose data will stick
2720 * around will automatically be unregistered when the
2721 * proper console replaces them.
2722 */
2723 if (init_section_intersects(con, sizeof(*con)))
2724 unregister_console(con);
2668 } 2725 }
2669 } 2726 }
2670 hotcpu_notifier(console_cpu_notify, 0); 2727 hotcpu_notifier(console_cpu_notify, 0);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index edb6de4f5908..a467e6c28a3b 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -529,8 +529,6 @@ static int __init cpu_stop_init(void)
529} 529}
530early_initcall(cpu_stop_init); 530early_initcall(cpu_stop_init);
531 531
532#if defined(CONFIG_SMP) || defined(CONFIG_HOTPLUG_CPU)
533
534static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) 532static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
535{ 533{
536 struct multi_stop_data msdata = { 534 struct multi_stop_data msdata = {
@@ -628,5 +626,3 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
628 mutex_unlock(&stop_cpus_mutex); 626 mutex_unlock(&stop_cpus_mutex);
629 return ret ?: done.ret; 627 return ret ?: done.ret;
630} 628}
631
632#endif /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index ee1ac1cc082c..f75a33f29f6e 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -580,6 +580,14 @@ config DEBUG_VM_RB
580 580
581 If unsure, say N. 581 If unsure, say N.
582 582
583config DEBUG_VM_PGFLAGS
584 bool "Debug page-flags operations"
585 depends on DEBUG_VM
586 help
587 Enables extra validation on page flags operations.
588
589 If unsure, say N.
590
583config DEBUG_VIRTUAL 591config DEBUG_VIRTUAL
584 bool "Debug VM translations" 592 bool "Debug VM translations"
585 depends on DEBUG_KERNEL && X86 593 depends on DEBUG_KERNEL && X86
@@ -1589,7 +1597,6 @@ config FAULT_INJECTION_STACKTRACE_FILTER
1589 1597
1590config LATENCYTOP 1598config LATENCYTOP
1591 bool "Latency measuring infrastructure" 1599 bool "Latency measuring infrastructure"
1592 depends on HAVE_LATENCYTOP_SUPPORT
1593 depends on DEBUG_KERNEL 1600 depends on DEBUG_KERNEL
1594 depends on STACKTRACE_SUPPORT 1601 depends on STACKTRACE_SUPPORT
1595 depends on PROC_FS 1602 depends on PROC_FS
diff --git a/lib/kasprintf.c b/lib/kasprintf.c
index f194e6e593e1..7f6c506a4942 100644
--- a/lib/kasprintf.c
+++ b/lib/kasprintf.c
@@ -13,19 +13,21 @@
13/* Simplified asprintf. */ 13/* Simplified asprintf. */
14char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap) 14char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap)
15{ 15{
16 unsigned int len; 16 unsigned int first, second;
17 char *p; 17 char *p;
18 va_list aq; 18 va_list aq;
19 19
20 va_copy(aq, ap); 20 va_copy(aq, ap);
21 len = vsnprintf(NULL, 0, fmt, aq); 21 first = vsnprintf(NULL, 0, fmt, aq);
22 va_end(aq); 22 va_end(aq);
23 23
24 p = kmalloc_track_caller(len+1, gfp); 24 p = kmalloc_track_caller(first+1, gfp);
25 if (!p) 25 if (!p)
26 return NULL; 26 return NULL;
27 27
28 vsnprintf(p, len+1, fmt, ap); 28 second = vsnprintf(p, first+1, fmt, ap);
29 WARN(first != second, "different return values (%u and %u) from vsnprintf(\"%s\", ...)",
30 first, second, fmt);
29 31
30 return p; 32 return p;
31} 33}
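The new WARN guards the classic measure/allocate/format pattern against the two vsnprintf() passes disagreeing (for instance when an argument string changes underneath the caller). A user-space illustration of the same two-pass idea, with a hypothetical xasprintf():

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

static char *xasprintf(const char *fmt, ...)
{
	va_list ap;
	int first, second;
	char *p;

	va_start(ap, fmt);
	first = vsnprintf(NULL, 0, fmt, ap);		/* measuring pass */
	va_end(ap);

	p = malloc(first + 1);
	if (!p)
		return NULL;

	va_start(ap, fmt);
	second = vsnprintf(p, first + 1, fmt, ap);	/* formatting pass */
	va_end(ap);

	if (first != second)	/* something changed between the passes */
		fprintf(stderr, "length mismatch: %d vs %d\n", first, second);
	return p;
}

int main(void)
{
	char *s = xasprintf("pid=%d", 1234);

	puts(s ? s : "(alloc failed)");
	free(s);
	return 0;
}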
diff --git a/lib/list_debug.c b/lib/list_debug.c
index 3859bf63561c..3345a089ef7b 100644
--- a/lib/list_debug.c
+++ b/lib/list_debug.c
@@ -12,6 +12,13 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/rculist.h> 13#include <linux/rculist.h>
14 14
15static struct list_head force_poison;
16void list_force_poison(struct list_head *entry)
17{
18 entry->next = &force_poison;
19 entry->prev = &force_poison;
20}
21
15/* 22/*
16 * Insert a new entry between two known consecutive entries. 23 * Insert a new entry between two known consecutive entries.
17 * 24 *
@@ -23,6 +30,8 @@ void __list_add(struct list_head *new,
23 struct list_head *prev, 30 struct list_head *prev,
24 struct list_head *next) 31 struct list_head *next)
25{ 32{
33 WARN(new->next == &force_poison || new->prev == &force_poison,
34 "list_add attempted on force-poisoned entry\n");
26 WARN(next->prev != prev, 35 WARN(next->prev != prev,
27 "list_add corruption. next->prev should be " 36 "list_add corruption. next->prev should be "
28 "prev (%p), but was %p. (next=%p).\n", 37 "prev (%p), but was %p. (next=%p).\n",
diff --git a/lib/test_printf.c b/lib/test_printf.c
index c5a666af9ba5..4f6ae60433bc 100644
--- a/lib/test_printf.c
+++ b/lib/test_printf.c
@@ -12,10 +12,13 @@
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/string.h> 13#include <linux/string.h>
14 14
15#include <linux/bitmap.h>
16#include <linux/dcache.h>
15#include <linux/socket.h> 17#include <linux/socket.h>
16#include <linux/in.h> 18#include <linux/in.h>
17 19
18#define BUF_SIZE 256 20#define BUF_SIZE 256
21#define PAD_SIZE 16
19#define FILL_CHAR '$' 22#define FILL_CHAR '$'
20 23
21#define PTR1 ((void*)0x01234567) 24#define PTR1 ((void*)0x01234567)
@@ -39,6 +42,7 @@
39static unsigned total_tests __initdata; 42static unsigned total_tests __initdata;
40static unsigned failed_tests __initdata; 43static unsigned failed_tests __initdata;
41static char *test_buffer __initdata; 44static char *test_buffer __initdata;
45static char *alloced_buffer __initdata;
42 46
43static int __printf(4, 0) __init 47static int __printf(4, 0) __init
44do_test(int bufsize, const char *expect, int elen, 48do_test(int bufsize, const char *expect, int elen,
@@ -49,7 +53,7 @@ do_test(int bufsize, const char *expect, int elen,
49 53
50 total_tests++; 54 total_tests++;
51 55
52 memset(test_buffer, FILL_CHAR, BUF_SIZE); 56 memset(alloced_buffer, FILL_CHAR, BUF_SIZE + 2*PAD_SIZE);
53 va_copy(aq, ap); 57 va_copy(aq, ap);
54 ret = vsnprintf(test_buffer, bufsize, fmt, aq); 58 ret = vsnprintf(test_buffer, bufsize, fmt, aq);
55 va_end(aq); 59 va_end(aq);
@@ -60,8 +64,13 @@ do_test(int bufsize, const char *expect, int elen,
60 return 1; 64 return 1;
61 } 65 }
62 66
67 if (memchr_inv(alloced_buffer, FILL_CHAR, PAD_SIZE)) {
68 pr_warn("vsnprintf(buf, %d, \"%s\", ...) wrote before buffer\n", bufsize, fmt);
69 return 1;
70 }
71
63 if (!bufsize) { 72 if (!bufsize) {
64 if (memchr_inv(test_buffer, FILL_CHAR, BUF_SIZE)) { 73 if (memchr_inv(test_buffer, FILL_CHAR, BUF_SIZE + PAD_SIZE)) {
65 pr_warn("vsnprintf(buf, 0, \"%s\", ...) wrote to buffer\n", 74 pr_warn("vsnprintf(buf, 0, \"%s\", ...) wrote to buffer\n",
66 fmt); 75 fmt);
67 return 1; 76 return 1;
@@ -76,6 +85,12 @@ do_test(int bufsize, const char *expect, int elen,
76 return 1; 85 return 1;
77 } 86 }
78 87
88 if (memchr_inv(test_buffer + written + 1, FILL_CHAR, BUF_SIZE + PAD_SIZE - (written + 1))) {
89 pr_warn("vsnprintf(buf, %d, \"%s\", ...) wrote beyond the nul-terminator\n",
90 bufsize, fmt);
91 return 1;
92 }
93
79 if (memcmp(test_buffer, expect, written)) { 94 if (memcmp(test_buffer, expect, written)) {
80 pr_warn("vsnprintf(buf, %d, \"%s\", ...) wrote '%s', expected '%.*s'\n", 95 pr_warn("vsnprintf(buf, %d, \"%s\", ...) wrote '%s', expected '%.*s'\n",
81 bufsize, fmt, test_buffer, written, expect); 96 bufsize, fmt, test_buffer, written, expect);
@@ -91,7 +106,12 @@ __test(const char *expect, int elen, const char *fmt, ...)
91 int rand; 106 int rand;
92 char *p; 107 char *p;
93 108
94 BUG_ON(elen >= BUF_SIZE); 109 if (elen >= BUF_SIZE) {
110 pr_err("error in test suite: expected output length %d too long. Format was '%s'.\n",
111 elen, fmt);
112 failed_tests++;
113 return;
114 }
95 115
96 va_start(ap, fmt); 116 va_start(ap, fmt);
97 117
@@ -109,6 +129,7 @@ __test(const char *expect, int elen, const char *fmt, ...)
109 129
110 p = kvasprintf(GFP_KERNEL, fmt, ap); 130 p = kvasprintf(GFP_KERNEL, fmt, ap);
111 if (p) { 131 if (p) {
132 total_tests++;
112 if (memcmp(p, expect, elen+1)) { 133 if (memcmp(p, expect, elen+1)) {
113 pr_warn("kvasprintf(..., \"%s\", ...) returned '%s', expected '%s'\n", 134 pr_warn("kvasprintf(..., \"%s\", ...) returned '%s', expected '%s'\n",
114 fmt, p, expect); 135 fmt, p, expect);
@@ -140,6 +161,30 @@ test_number(void)
140 test("0x1234abcd ", "%#-12x", 0x1234abcd); 161 test("0x1234abcd ", "%#-12x", 0x1234abcd);
141 test(" 0x1234abcd", "%#12x", 0x1234abcd); 162 test(" 0x1234abcd", "%#12x", 0x1234abcd);
142 test("0|001| 12|+123| 1234|-123|-1234", "%d|%03d|%3d|%+d|% d|%+d|% d", 0, 1, 12, 123, 1234, -123, -1234); 163 test("0|001| 12|+123| 1234|-123|-1234", "%d|%03d|%3d|%+d|% d|%+d|% d", 0, 1, 12, 123, 1234, -123, -1234);
164 test("0|1|1|128|255", "%hhu|%hhu|%hhu|%hhu|%hhu", 0, 1, 257, 128, -1);
165 test("0|1|1|-128|-1", "%hhd|%hhd|%hhd|%hhd|%hhd", 0, 1, 257, 128, -1);
166 test("2015122420151225", "%ho%ho%#ho", 1037, 5282, -11627);
167 /*
168 * POSIX/C99: »The result of converting zero with an explicit
169 * precision of zero shall be no characters.« Hence the output
170 * from the below test should really be "00|0||| ". However,
171 * the kernel's printf also produces a single 0 in that
172 * case. This test case simply documents the current
173 * behaviour.
174 */
175 test("00|0|0|0|0", "%.2d|%.1d|%.0d|%.*d|%1.0d", 0, 0, 0, 0, 0, 0);
176#ifndef __CHAR_UNSIGNED__
177 {
178 /*
179 * Passing a 'char' to a %02x specifier doesn't do
180 * what was presumably the intention when char is
181 * signed and the value is negative. One must either &
182 * with 0xff or cast to u8.
183 */
184 char val = -16;
185 test("0xfffffff0|0xf0|0xf0", "%#02x|%#02x|%#02x", val, val & 0xff, (u8)val);
186 }
187#endif
143} 188}
144 189
145static void __init 190static void __init
@@ -148,14 +193,23 @@ test_string(void)
148 test("", "%s%.0s", "", "123"); 193 test("", "%s%.0s", "", "123");
149 test("ABCD|abc|123", "%s|%.3s|%.*s", "ABCD", "abcdef", 3, "123456"); 194 test("ABCD|abc|123", "%s|%.3s|%.*s", "ABCD", "abcdef", 3, "123456");
150 test("1 | 2|3 | 4|5 ", "%-3s|%3s|%-*s|%*s|%*s", "1", "2", 3, "3", 3, "4", -3, "5"); 195 test("1 | 2|3 | 4|5 ", "%-3s|%3s|%-*s|%*s|%*s", "1", "2", 3, "3", 3, "4", -3, "5");
196 test("1234 ", "%-10.4s", "123456");
197 test(" 1234", "%10.4s", "123456");
151 /* 198 /*
152 * POSIX and C99 say that a missing precision should be 199 * POSIX and C99 say that a negative precision (which is only
153 * treated as a precision of 0. However, the kernel's printf 200 * possible to pass via a * argument) should be treated as if
154 * implementation treats this case as if the . wasn't 201 * the precision wasn't present, and that if the precision is
155 * present. Let's add a test case documenting the current 202 * omitted (as in %.s), the precision should be taken to be
156 * behaviour; should anyone ever feel the need to follow the 203 * 0. However, the kernel's printf behave exactly opposite,
157 * standards more closely, this can be revisited. 204 * treating a negative precision as 0 and treating an omitted
205 * precision specifier as if no precision was given.
206 *
207 * These test cases document the current behaviour; should
208 * anyone ever feel the need to follow the standards more
209 * closely, this can be revisited.
158 */ 210 */
211 test(" ", "%4.*s", -5, "123456");
212 test("123456", "%.s", "123456");
159 test("a||", "%.s|%.0s|%.*s", "a", "b", 0, "c"); 213 test("a||", "%.s|%.0s|%.*s", "a", "b", 0, "c");
160 test("a | | ", "%-3.s|%-3.0s|%-3.*s", "a", "b", 0, "c"); 214 test("a | | ", "%-3.s|%-3.0s|%-3.*s", "a", "b", 0, "c");
161} 215}
@@ -273,9 +327,35 @@ uuid(void)
273 test("03020100-0504-0706-0809-0A0B0C0D0E0F", "%pUL", uuid); 327 test("03020100-0504-0706-0809-0A0B0C0D0E0F", "%pUL", uuid);
274} 328}
275 329
330static struct dentry test_dentry[4] __initdata = {
331 { .d_parent = &test_dentry[0],
332 .d_name = QSTR_INIT(test_dentry[0].d_iname, 3),
333 .d_iname = "foo" },
334 { .d_parent = &test_dentry[0],
335 .d_name = QSTR_INIT(test_dentry[1].d_iname, 5),
336 .d_iname = "bravo" },
337 { .d_parent = &test_dentry[1],
338 .d_name = QSTR_INIT(test_dentry[2].d_iname, 4),
339 .d_iname = "alfa" },
340 { .d_parent = &test_dentry[2],
341 .d_name = QSTR_INIT(test_dentry[3].d_iname, 5),
342 .d_iname = "romeo" },
343};
344
276static void __init 345static void __init
277dentry(void) 346dentry(void)
278{ 347{
348 test("foo", "%pd", &test_dentry[0]);
349 test("foo", "%pd2", &test_dentry[0]);
350
351 test("romeo", "%pd", &test_dentry[3]);
352 test("alfa/romeo", "%pd2", &test_dentry[3]);
353 test("bravo/alfa/romeo", "%pd3", &test_dentry[3]);
354 test("/bravo/alfa/romeo", "%pd4", &test_dentry[3]);
355 test("/bravo/alfa", "%pd4", &test_dentry[2]);
356
357 test("bravo/alfa |bravo/alfa ", "%-12pd2|%*pd2", &test_dentry[2], -12, &test_dentry[2]);
358 test(" bravo/alfa| bravo/alfa", "%12pd2|%*pd2", &test_dentry[2], 12, &test_dentry[2]);
279} 359}
280 360
281static void __init 361static void __init
@@ -289,6 +369,20 @@ struct_clk(void)
289} 369}
290 370
291static void __init 371static void __init
372large_bitmap(void)
373{
374 const int nbits = 1 << 16;
375 unsigned long *bits = kcalloc(BITS_TO_LONGS(nbits), sizeof(long), GFP_KERNEL);
376 if (!bits)
377 return;
378
379 bitmap_set(bits, 1, 20);
380 bitmap_set(bits, 60000, 15);
381 test("1-20,60000-60014", "%*pbl", nbits, bits);
382 kfree(bits);
383}
384
385static void __init
292bitmap(void) 386bitmap(void)
293{ 387{
294 DECLARE_BITMAP(bits, 20); 388 DECLARE_BITMAP(bits, 20);
@@ -307,6 +401,8 @@ bitmap(void)
307 bitmap_fill(bits, 20); 401 bitmap_fill(bits, 20);
308 test("fffff|fffff", "%20pb|%*pb", bits, 20, bits); 402 test("fffff|fffff", "%20pb|%*pb", bits, 20, bits);
309 test("0-19|0-19", "%20pbl|%*pbl", bits, 20, bits); 403 test("0-19|0-19", "%20pbl|%*pbl", bits, 20, bits);
404
405 large_bitmap();
310} 406}
311 407
312static void __init 408static void __init
@@ -337,16 +433,17 @@ test_pointer(void)
337static int __init 433static int __init
338test_printf_init(void) 434test_printf_init(void)
339{ 435{
340 test_buffer = kmalloc(BUF_SIZE, GFP_KERNEL); 436 alloced_buffer = kmalloc(BUF_SIZE + 2*PAD_SIZE, GFP_KERNEL);
341 if (!test_buffer) 437 if (!alloced_buffer)
342 return -ENOMEM; 438 return -ENOMEM;
439 test_buffer = alloced_buffer + PAD_SIZE;
343 440
344 test_basic(); 441 test_basic();
345 test_number(); 442 test_number();
346 test_string(); 443 test_string();
347 test_pointer(); 444 test_pointer();
348 445
349 kfree(test_buffer); 446 kfree(alloced_buffer);
350 447
351 if (failed_tests == 0) 448 if (failed_tests == 0)
352 pr_info("all %u tests passed\n", total_tests); 449 pr_info("all %u tests passed\n", total_tests);
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index ac3f9476b776..48ff9c36644d 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -383,13 +383,14 @@ enum format_type {
383}; 383};
384 384
385struct printf_spec { 385struct printf_spec {
386 u8 type; /* format_type enum */ 386 unsigned int type:8; /* format_type enum */
387 u8 flags; /* flags to number() */ 387 signed int field_width:24; /* width of output field */
388 u8 base; /* number base, 8, 10 or 16 only */ 388 unsigned int flags:8; /* flags to number() */
389 u8 qualifier; /* number qualifier, one of 'hHlLtzZ' */ 389 unsigned int base:8; /* number base, 8, 10 or 16 only */
390 s16 field_width; /* width of output field */ 390 signed int precision:16; /* # of digits/chars */
391 s16 precision; /* # of digits/chars */ 391} __packed;
392}; 392#define FIELD_WIDTH_MAX ((1 << 23) - 1)
393#define PRECISION_MAX ((1 << 15) - 1)
393 394
394static noinline_for_stack 395static noinline_for_stack
395char *number(char *buf, char *end, unsigned long long num, 396char *number(char *buf, char *end, unsigned long long num,
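A user-space check of the 8-byte packing claim behind FIELD_WIDTH_MAX and PRECISION_MAX, mirroring the BUILD_BUG_ON() added in the next hunk; the struct is copied here purely for illustration, with the kernel-only __packed spelled out:

#include <stdio.h>

struct printf_spec {
	unsigned int type:8;		/* format_type enum */
	signed int field_width:24;	/* width of output field */
	unsigned int flags:8;		/* flags to number() */
	unsigned int base:8;		/* number base, 8, 10 or 16 only */
	signed int precision:16;	/* # of digits/chars */
} __attribute__((packed));

_Static_assert(sizeof(struct printf_spec) == 8, "printf_spec no longer fits in 8 bytes");

int main(void)
{
	printf("sizeof(struct printf_spec) = %zu\n", sizeof(struct printf_spec));
	return 0;
}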
@@ -402,6 +403,10 @@ char *number(char *buf, char *end, unsigned long long num,
402 int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10); 403 int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10);
403 int i; 404 int i;
404 bool is_zero = num == 0LL; 405 bool is_zero = num == 0LL;
406 int field_width = spec.field_width;
407 int precision = spec.precision;
408
409 BUILD_BUG_ON(sizeof(struct printf_spec) != 8);
405 410
406 /* locase = 0 or 0x20. ORing digits or letters with 'locase' 411 /* locase = 0 or 0x20. ORing digits or letters with 'locase'
407 * produces same digits or (maybe lowercased) letters */ 412 * produces same digits or (maybe lowercased) letters */
@@ -413,20 +418,20 @@ char *number(char *buf, char *end, unsigned long long num,
413 if ((signed long long)num < 0) { 418 if ((signed long long)num < 0) {
414 sign = '-'; 419 sign = '-';
415 num = -(signed long long)num; 420 num = -(signed long long)num;
416 spec.field_width--; 421 field_width--;
417 } else if (spec.flags & PLUS) { 422 } else if (spec.flags & PLUS) {
418 sign = '+'; 423 sign = '+';
419 spec.field_width--; 424 field_width--;
420 } else if (spec.flags & SPACE) { 425 } else if (spec.flags & SPACE) {
421 sign = ' '; 426 sign = ' ';
422 spec.field_width--; 427 field_width--;
423 } 428 }
424 } 429 }
425 if (need_pfx) { 430 if (need_pfx) {
426 if (spec.base == 16) 431 if (spec.base == 16)
427 spec.field_width -= 2; 432 field_width -= 2;
428 else if (!is_zero) 433 else if (!is_zero)
429 spec.field_width--; 434 field_width--;
430 } 435 }
431 436
432 /* generate full string in tmp[], in reverse order */ 437 /* generate full string in tmp[], in reverse order */
@@ -448,12 +453,12 @@ char *number(char *buf, char *end, unsigned long long num,
448 } 453 }
449 454
450 /* printing 100 using %2d gives "100", not "00" */ 455 /* printing 100 using %2d gives "100", not "00" */
451 if (i > spec.precision) 456 if (i > precision)
452 spec.precision = i; 457 precision = i;
453 /* leading space padding */ 458 /* leading space padding */
454 spec.field_width -= spec.precision; 459 field_width -= precision;
455 if (!(spec.flags & (ZEROPAD | LEFT))) { 460 if (!(spec.flags & (ZEROPAD | LEFT))) {
456 while (--spec.field_width >= 0) { 461 while (--field_width >= 0) {
457 if (buf < end) 462 if (buf < end)
458 *buf = ' '; 463 *buf = ' ';
459 ++buf; 464 ++buf;
@@ -482,14 +487,14 @@ char *number(char *buf, char *end, unsigned long long num,
482 if (!(spec.flags & LEFT)) { 487 if (!(spec.flags & LEFT)) {
483 char c = ' ' + (spec.flags & ZEROPAD); 488 char c = ' ' + (spec.flags & ZEROPAD);
484 BUILD_BUG_ON(' ' + ZEROPAD != '0'); 489 BUILD_BUG_ON(' ' + ZEROPAD != '0');
485 while (--spec.field_width >= 0) { 490 while (--field_width >= 0) {
486 if (buf < end) 491 if (buf < end)
487 *buf = c; 492 *buf = c;
488 ++buf; 493 ++buf;
489 } 494 }
490 } 495 }
491 /* hmm even more zero padding? */ 496 /* hmm even more zero padding? */
492 while (i <= --spec.precision) { 497 while (i <= --precision) {
493 if (buf < end) 498 if (buf < end)
494 *buf = '0'; 499 *buf = '0';
495 ++buf; 500 ++buf;
@@ -501,7 +506,7 @@ char *number(char *buf, char *end, unsigned long long num,
501 ++buf; 506 ++buf;
502 } 507 }
503 /* trailing space padding */ 508 /* trailing space padding */
504 while (--spec.field_width >= 0) { 509 while (--field_width >= 0) {
505 if (buf < end) 510 if (buf < end)
506 *buf = ' '; 511 *buf = ' ';
507 ++buf; 512 ++buf;
@@ -511,37 +516,20 @@ char *number(char *buf, char *end, unsigned long long num,
511} 516}
512 517
513static noinline_for_stack 518static noinline_for_stack
514char *string(char *buf, char *end, const char *s, struct printf_spec spec) 519char *special_hex_number(char *buf, char *end, unsigned long long num, int size)
515{ 520{
516 int len, i; 521 struct printf_spec spec;
517
518 if ((unsigned long)s < PAGE_SIZE)
519 s = "(null)";
520 522
521 len = strnlen(s, spec.precision); 523 spec.type = FORMAT_TYPE_PTR;
522 524 spec.field_width = 2 + 2 * size; /* 0x + hex */
523 if (!(spec.flags & LEFT)) { 525 spec.flags = SPECIAL | SMALL | ZEROPAD;
524 while (len < spec.field_width--) { 526 spec.base = 16;
525 if (buf < end) 527 spec.precision = -1;
526 *buf = ' ';
527 ++buf;
528 }
529 }
530 for (i = 0; i < len; ++i) {
531 if (buf < end)
532 *buf = *s;
533 ++buf; ++s;
534 }
535 while (len < spec.field_width--) {
536 if (buf < end)
537 *buf = ' ';
538 ++buf;
539 }
540 528
541 return buf; 529 return number(buf, end, num, spec);
542} 530}
543 531
544static void widen(char *buf, char *end, unsigned len, unsigned spaces) 532static void move_right(char *buf, char *end, unsigned len, unsigned spaces)
545{ 533{
546 size_t size; 534 size_t size;
547 if (buf >= end) /* nowhere to put anything */ 535 if (buf >= end) /* nowhere to put anything */
@@ -559,6 +547,56 @@ static void widen(char *buf, char *end, unsigned len, unsigned spaces)
559 memset(buf, ' ', spaces); 547 memset(buf, ' ', spaces);
560} 548}
561 549
550/*
551 * Handle field width padding for a string.
552 * @buf: current buffer position
553 * @n: length of string
554 * @end: end of output buffer
555 * @spec: for field width and flags
556 * Returns: new buffer position after padding.
557 */
558static noinline_for_stack
559char *widen_string(char *buf, int n, char *end, struct printf_spec spec)
560{
561 unsigned spaces;
562
563 if (likely(n >= spec.field_width))
564 return buf;
565 /* we want to pad the sucker */
566 spaces = spec.field_width - n;
567 if (!(spec.flags & LEFT)) {
568 move_right(buf - n, end, n, spaces);
569 return buf + spaces;
570 }
571 while (spaces--) {
572 if (buf < end)
573 *buf = ' ';
574 ++buf;
575 }
576 return buf;
577}
578
579static noinline_for_stack
580char *string(char *buf, char *end, const char *s, struct printf_spec spec)
581{
582 int len = 0;
583 size_t lim = spec.precision;
584
585 if ((unsigned long)s < PAGE_SIZE)
586 s = "(null)";
587
588 while (lim--) {
589 char c = *s++;
590 if (!c)
591 break;
592 if (buf < end)
593 *buf = c;
594 ++buf;
595 ++len;
596 }
597 return widen_string(buf, len, end, spec);
598}
599
562static noinline_for_stack 600static noinline_for_stack
563char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_spec spec, 601char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_spec spec,
564 const char *fmt) 602 const char *fmt)
@@ -600,20 +638,7 @@ char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_sp
600 *buf = c; 638 *buf = c;
601 } 639 }
602 rcu_read_unlock(); 640 rcu_read_unlock();
603 if (n < spec.field_width) { 641 return widen_string(buf, n, end, spec);
604 /* we want to pad the sucker */
605 unsigned spaces = spec.field_width - n;
606 if (!(spec.flags & LEFT)) {
607 widen(buf - n, end, n, spaces);
608 return buf + spaces;
609 }
610 while (spaces--) {
611 if (buf < end)
612 *buf = ' ';
613 ++buf;
614 }
615 }
616 return buf;
617} 642}
618 643
619#ifdef CONFIG_BLOCK 644#ifdef CONFIG_BLOCK
@@ -659,11 +684,7 @@ char *symbol_string(char *buf, char *end, void *ptr,
659 684
660 return string(buf, end, sym, spec); 685 return string(buf, end, sym, spec);
661#else 686#else
662 spec.field_width = 2 * sizeof(void *); 687 return special_hex_number(buf, end, value, sizeof(void *));
663 spec.flags |= SPECIAL | SMALL | ZEROPAD;
664 spec.base = 16;
665
666 return number(buf, end, value, spec);
667#endif 688#endif
668} 689}
669 690
@@ -1324,40 +1345,45 @@ char *uuid_string(char *buf, char *end, const u8 *addr,
1324 return string(buf, end, uuid, spec); 1345 return string(buf, end, uuid, spec);
1325} 1346}
1326 1347
1327static 1348static noinline_for_stack
1328char *netdev_feature_string(char *buf, char *end, const u8 *addr, 1349char *netdev_bits(char *buf, char *end, const void *addr, const char *fmt)
1329 struct printf_spec spec)
1330{ 1350{
1331 spec.flags |= SPECIAL | SMALL | ZEROPAD; 1351 unsigned long long num;
1332 if (spec.field_width == -1) 1352 int size;
1333 spec.field_width = 2 + 2 * sizeof(netdev_features_t);
1334 spec.base = 16;
1335 1353
1336 return number(buf, end, *(const netdev_features_t *)addr, spec); 1354 switch (fmt[1]) {
1355 case 'F':
1356 num = *(const netdev_features_t *)addr;
1357 size = sizeof(netdev_features_t);
1358 break;
1359 default:
1360 num = (unsigned long)addr;
1361 size = sizeof(unsigned long);
1362 break;
1363 }
1364
1365 return special_hex_number(buf, end, num, size);
1337} 1366}
1338 1367
1339static noinline_for_stack 1368static noinline_for_stack
1340char *address_val(char *buf, char *end, const void *addr, 1369char *address_val(char *buf, char *end, const void *addr, const char *fmt)
1341 struct printf_spec spec, const char *fmt)
1342{ 1370{
1343 unsigned long long num; 1371 unsigned long long num;
1344 1372 int size;
1345 spec.flags |= SPECIAL | SMALL | ZEROPAD;
1346 spec.base = 16;
1347 1373
1348 switch (fmt[1]) { 1374 switch (fmt[1]) {
1349 case 'd': 1375 case 'd':
1350 num = *(const dma_addr_t *)addr; 1376 num = *(const dma_addr_t *)addr;
1351 spec.field_width = sizeof(dma_addr_t) * 2 + 2; 1377 size = sizeof(dma_addr_t);
1352 break; 1378 break;
1353 case 'p': 1379 case 'p':
1354 default: 1380 default:
1355 num = *(const phys_addr_t *)addr; 1381 num = *(const phys_addr_t *)addr;
1356 spec.field_width = sizeof(phys_addr_t) * 2 + 2; 1382 size = sizeof(phys_addr_t);
1357 break; 1383 break;
1358 } 1384 }
1359 1385
1360 return number(buf, end, num, spec); 1386 return special_hex_number(buf, end, num, size);
1361} 1387}
1362 1388
1363static noinline_for_stack 1389static noinline_for_stack
@@ -1376,10 +1402,7 @@ char *clock(char *buf, char *end, struct clk *clk, struct printf_spec spec,
1376#ifdef CONFIG_COMMON_CLK 1402#ifdef CONFIG_COMMON_CLK
1377 return string(buf, end, __clk_get_name(clk), spec); 1403 return string(buf, end, __clk_get_name(clk), spec);
1378#else 1404#else
1379 spec.base = 16; 1405 return special_hex_number(buf, end, (unsigned long)clk, sizeof(unsigned long));
1380 spec.field_width = sizeof(unsigned long) * 2 + 2;
1381 spec.flags |= SPECIAL | SMALL | ZEROPAD;
1382 return number(buf, end, (unsigned long)clk, spec);
1383#endif 1406#endif
1384 } 1407 }
1385} 1408}
@@ -1609,13 +1632,9 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
1609 break; 1632 break;
1610 1633
1611 case 'N': 1634 case 'N':
1612 switch (fmt[1]) { 1635 return netdev_bits(buf, end, ptr, fmt);
1613 case 'F':
1614 return netdev_feature_string(buf, end, ptr, spec);
1615 }
1616 break;
1617 case 'a': 1636 case 'a':
1618 return address_val(buf, end, ptr, spec, fmt); 1637 return address_val(buf, end, ptr, fmt);
1619 case 'd': 1638 case 'd':
1620 return dentry_name(buf, end, ptr, spec, fmt); 1639 return dentry_name(buf, end, ptr, spec, fmt);
1621 case 'C': 1640 case 'C':
@@ -1664,6 +1683,7 @@ static noinline_for_stack
1664int format_decode(const char *fmt, struct printf_spec *spec) 1683int format_decode(const char *fmt, struct printf_spec *spec)
1665{ 1684{
1666 const char *start = fmt; 1685 const char *start = fmt;
1686 char qualifier;
1667 1687
1668 /* we finished early by reading the field width */ 1688 /* we finished early by reading the field width */
1669 if (spec->type == FORMAT_TYPE_WIDTH) { 1689 if (spec->type == FORMAT_TYPE_WIDTH) {
@@ -1746,16 +1766,16 @@ precision:
1746 1766
1747qualifier: 1767qualifier:
1748 /* get the conversion qualifier */ 1768 /* get the conversion qualifier */
1749 spec->qualifier = -1; 1769 qualifier = 0;
1750 if (*fmt == 'h' || _tolower(*fmt) == 'l' || 1770 if (*fmt == 'h' || _tolower(*fmt) == 'l' ||
1751 _tolower(*fmt) == 'z' || *fmt == 't') { 1771 _tolower(*fmt) == 'z' || *fmt == 't') {
1752 spec->qualifier = *fmt++; 1772 qualifier = *fmt++;
1753 if (unlikely(spec->qualifier == *fmt)) { 1773 if (unlikely(qualifier == *fmt)) {
1754 if (spec->qualifier == 'l') { 1774 if (qualifier == 'l') {
1755 spec->qualifier = 'L'; 1775 qualifier = 'L';
1756 ++fmt; 1776 ++fmt;
1757 } else if (spec->qualifier == 'h') { 1777 } else if (qualifier == 'h') {
1758 spec->qualifier = 'H'; 1778 qualifier = 'H';
1759 ++fmt; 1779 ++fmt;
1760 } 1780 }
1761 } 1781 }
@@ -1812,19 +1832,19 @@ qualifier:
1812 return fmt - start; 1832 return fmt - start;
1813 } 1833 }
1814 1834
1815 if (spec->qualifier == 'L') 1835 if (qualifier == 'L')
1816 spec->type = FORMAT_TYPE_LONG_LONG; 1836 spec->type = FORMAT_TYPE_LONG_LONG;
1817 else if (spec->qualifier == 'l') { 1837 else if (qualifier == 'l') {
1818 BUILD_BUG_ON(FORMAT_TYPE_ULONG + SIGN != FORMAT_TYPE_LONG); 1838 BUILD_BUG_ON(FORMAT_TYPE_ULONG + SIGN != FORMAT_TYPE_LONG);
1819 spec->type = FORMAT_TYPE_ULONG + (spec->flags & SIGN); 1839 spec->type = FORMAT_TYPE_ULONG + (spec->flags & SIGN);
1820 } else if (_tolower(spec->qualifier) == 'z') { 1840 } else if (_tolower(qualifier) == 'z') {
1821 spec->type = FORMAT_TYPE_SIZE_T; 1841 spec->type = FORMAT_TYPE_SIZE_T;
1822 } else if (spec->qualifier == 't') { 1842 } else if (qualifier == 't') {
1823 spec->type = FORMAT_TYPE_PTRDIFF; 1843 spec->type = FORMAT_TYPE_PTRDIFF;
1824 } else if (spec->qualifier == 'H') { 1844 } else if (qualifier == 'H') {
1825 BUILD_BUG_ON(FORMAT_TYPE_UBYTE + SIGN != FORMAT_TYPE_BYTE); 1845 BUILD_BUG_ON(FORMAT_TYPE_UBYTE + SIGN != FORMAT_TYPE_BYTE);
1826 spec->type = FORMAT_TYPE_UBYTE + (spec->flags & SIGN); 1846 spec->type = FORMAT_TYPE_UBYTE + (spec->flags & SIGN);
1827 } else if (spec->qualifier == 'h') { 1847 } else if (qualifier == 'h') {
1828 BUILD_BUG_ON(FORMAT_TYPE_USHORT + SIGN != FORMAT_TYPE_SHORT); 1848 BUILD_BUG_ON(FORMAT_TYPE_USHORT + SIGN != FORMAT_TYPE_SHORT);
1829 spec->type = FORMAT_TYPE_USHORT + (spec->flags & SIGN); 1849 spec->type = FORMAT_TYPE_USHORT + (spec->flags & SIGN);
1830 } else { 1850 } else {
@@ -1835,6 +1855,24 @@ qualifier:
1835 return ++fmt - start; 1855 return ++fmt - start;
1836} 1856}
1837 1857
1858static void
1859set_field_width(struct printf_spec *spec, int width)
1860{
1861 spec->field_width = width;
1862 if (WARN_ONCE(spec->field_width != width, "field width %d too large", width)) {
1863 spec->field_width = clamp(width, -FIELD_WIDTH_MAX, FIELD_WIDTH_MAX);
1864 }
1865}
1866
1867static void
1868set_precision(struct printf_spec *spec, int prec)
1869{
1870 spec->precision = prec;
1871 if (WARN_ONCE(spec->precision != prec, "precision %d too large", prec)) {
1872 spec->precision = clamp(prec, 0, PRECISION_MAX);
1873 }
1874}
1875
1838/** 1876/**
1839 * vsnprintf - Format a string and place it in a buffer 1877 * vsnprintf - Format a string and place it in a buffer
1840 * @buf: The buffer to place the result into 1878 * @buf: The buffer to place the result into
@@ -1902,11 +1940,11 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
1902 } 1940 }
1903 1941
1904 case FORMAT_TYPE_WIDTH: 1942 case FORMAT_TYPE_WIDTH:
1905 spec.field_width = va_arg(args, int); 1943 set_field_width(&spec, va_arg(args, int));
1906 break; 1944 break;
1907 1945
1908 case FORMAT_TYPE_PRECISION: 1946 case FORMAT_TYPE_PRECISION:
1909 spec.precision = va_arg(args, int); 1947 set_precision(&spec, va_arg(args, int));
1910 break; 1948 break;
1911 1949
1912 case FORMAT_TYPE_CHAR: { 1950 case FORMAT_TYPE_CHAR: {
@@ -2346,11 +2384,11 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf)
2346 } 2384 }
2347 2385
2348 case FORMAT_TYPE_WIDTH: 2386 case FORMAT_TYPE_WIDTH:
2349 spec.field_width = get_arg(int); 2387 set_field_width(&spec, get_arg(int));
2350 break; 2388 break;
2351 2389
2352 case FORMAT_TYPE_PRECISION: 2390 case FORMAT_TYPE_PRECISION:
2353 spec.precision = get_arg(int); 2391 set_precision(&spec, get_arg(int));
2354 break; 2392 break;
2355 2393
2356 case FORMAT_TYPE_CHAR: { 2394 case FORMAT_TYPE_CHAR: {
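
The lib/vsprintf.c hunks above share two themes. The repeated "base 16, SPECIAL | SMALL | ZEROPAD, field_width = 2 * size + 2" boilerplate is factored into special_hex_number(), and field width and precision now go through set_field_width()/set_precision(), which warn and clamp when a value no longer fits the narrowed struct printf_spec members. Below is a rough userspace model of that clamping check; the bitfield widths and the limit are assumptions made for the sketch, not the real lib/vsprintf.c layout, and it warns on every overflow rather than once.

#include <stdio.h>

/* Stand-in for struct printf_spec with narrow members; the widths here are
 * assumed for the sketch, not the kernel's exact layout.
 */
struct printf_spec {
    int field_width : 24;
    int precision   : 16;
};

#define FIELD_WIDTH_MAX ((1 << 23) - 1)   /* assumed limit for the model */

static int clamp_int(int v, int lo, int hi)
{
    return v < lo ? lo : (v > hi ? hi : v);
}

/* Mirrors the shape of the new set_field_width(): store, detect that the
 * value did not survive the narrow field, then warn and clamp.
 */
static void set_field_width(struct printf_spec *spec, int width)
{
    spec->field_width = width;
    if (spec->field_width != width) {
        fprintf(stderr, "field width %d too large\n", width);
        spec->field_width = clamp_int(width, -FIELD_WIDTH_MAX, FIELD_WIDTH_MAX);
    }
}

int main(void)
{
    struct printf_spec spec = { 0 };

    set_field_width(&spec, 40);       /* fits, kept as-is */
    set_field_width(&spec, 1 << 25);  /* overflows the bitfield, clamped */
    printf("field_width = %d\n", spec.field_width);
    return 0;
}
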
diff --git a/mm/debug.c b/mm/debug.c
index 5d2072ed8d5e..f05b2d5d6481 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -40,9 +40,6 @@ static const struct trace_print_flags pageflag_names[] = {
40#ifdef CONFIG_MEMORY_FAILURE 40#ifdef CONFIG_MEMORY_FAILURE
41 {1UL << PG_hwpoison, "hwpoison" }, 41 {1UL << PG_hwpoison, "hwpoison" },
42#endif 42#endif
43#ifdef CONFIG_TRANSPARENT_HUGEPAGE
44 {1UL << PG_compound_lock, "compound_lock" },
45#endif
46#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) 43#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
47 {1UL << PG_young, "young" }, 44 {1UL << PG_young, "young" },
48 {1UL << PG_idle, "idle" }, 45 {1UL << PG_idle, "idle" },
@@ -82,9 +79,12 @@ static void dump_flags(unsigned long flags,
82void dump_page_badflags(struct page *page, const char *reason, 79void dump_page_badflags(struct page *page, const char *reason,
83 unsigned long badflags) 80 unsigned long badflags)
84{ 81{
85 pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", 82 pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx",
86 page, atomic_read(&page->_count), page_mapcount(page), 83 page, atomic_read(&page->_count), page_mapcount(page),
87 page->mapping, page->index); 84 page->mapping, page->index);
85 if (PageCompound(page))
86 pr_cont(" compound_mapcount: %d", compound_mapcount(page));
87 pr_cont("\n");
88 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); 88 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
89 dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names)); 89 dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names));
90 if (reason) 90 if (reason)
diff --git a/mm/filemap.c b/mm/filemap.c
index ff42d31c891a..847ee43c2806 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -204,7 +204,7 @@ void __delete_from_page_cache(struct page *page, void *shadow,
204 __dec_zone_page_state(page, NR_FILE_PAGES); 204 __dec_zone_page_state(page, NR_FILE_PAGES);
205 if (PageSwapBacked(page)) 205 if (PageSwapBacked(page))
206 __dec_zone_page_state(page, NR_SHMEM); 206 __dec_zone_page_state(page, NR_SHMEM);
207 BUG_ON(page_mapped(page)); 207 VM_BUG_ON_PAGE(page_mapped(page), page);
208 208
209 /* 209 /*
210 * At this point page must be either written or cleaned by truncate. 210 * At this point page must be either written or cleaned by truncate.
@@ -618,7 +618,7 @@ static int __add_to_page_cache_locked(struct page *page,
618 618
619 if (!huge) { 619 if (!huge) {
620 error = mem_cgroup_try_charge(page, current->mm, 620 error = mem_cgroup_try_charge(page, current->mm,
621 gfp_mask, &memcg); 621 gfp_mask, &memcg, false);
622 if (error) 622 if (error)
623 return error; 623 return error;
624 } 624 }
@@ -626,7 +626,7 @@ static int __add_to_page_cache_locked(struct page *page,
626 error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); 626 error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
627 if (error) { 627 if (error) {
628 if (!huge) 628 if (!huge)
629 mem_cgroup_cancel_charge(page, memcg); 629 mem_cgroup_cancel_charge(page, memcg, false);
630 return error; 630 return error;
631 } 631 }
632 632
@@ -645,7 +645,7 @@ static int __add_to_page_cache_locked(struct page *page,
645 __inc_zone_page_state(page, NR_FILE_PAGES); 645 __inc_zone_page_state(page, NR_FILE_PAGES);
646 spin_unlock_irq(&mapping->tree_lock); 646 spin_unlock_irq(&mapping->tree_lock);
647 if (!huge) 647 if (!huge)
648 mem_cgroup_commit_charge(page, memcg, false); 648 mem_cgroup_commit_charge(page, memcg, false, false);
649 trace_mm_filemap_add_to_page_cache(page); 649 trace_mm_filemap_add_to_page_cache(page);
650 return 0; 650 return 0;
651err_insert: 651err_insert:
@@ -653,7 +653,7 @@ err_insert:
653 /* Leave page->index set: truncation relies upon it */ 653 /* Leave page->index set: truncation relies upon it */
654 spin_unlock_irq(&mapping->tree_lock); 654 spin_unlock_irq(&mapping->tree_lock);
655 if (!huge) 655 if (!huge)
656 mem_cgroup_cancel_charge(page, memcg); 656 mem_cgroup_cancel_charge(page, memcg, false);
657 page_cache_release(page); 657 page_cache_release(page);
658 return error; 658 return error;
659} 659}
@@ -682,11 +682,11 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
682 void *shadow = NULL; 682 void *shadow = NULL;
683 int ret; 683 int ret;
684 684
685 __set_page_locked(page); 685 __SetPageLocked(page);
686 ret = __add_to_page_cache_locked(page, mapping, offset, 686 ret = __add_to_page_cache_locked(page, mapping, offset,
687 gfp_mask, &shadow); 687 gfp_mask, &shadow);
688 if (unlikely(ret)) 688 if (unlikely(ret))
689 __clear_page_locked(page); 689 __ClearPageLocked(page);
690 else { 690 else {
691 /* 691 /*
692 * The page might have been evicted from cache only 692 * The page might have been evicted from cache only
@@ -809,6 +809,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue);
809 */ 809 */
810void unlock_page(struct page *page) 810void unlock_page(struct page *page)
811{ 811{
812 page = compound_head(page);
812 VM_BUG_ON_PAGE(!PageLocked(page), page); 813 VM_BUG_ON_PAGE(!PageLocked(page), page);
813 clear_bit_unlock(PG_locked, &page->flags); 814 clear_bit_unlock(PG_locked, &page->flags);
814 smp_mb__after_atomic(); 815 smp_mb__after_atomic();
@@ -873,18 +874,20 @@ EXPORT_SYMBOL_GPL(page_endio);
873 */ 874 */
874void __lock_page(struct page *page) 875void __lock_page(struct page *page)
875{ 876{
876 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); 877 struct page *page_head = compound_head(page);
878 DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
877 879
878 __wait_on_bit_lock(page_waitqueue(page), &wait, bit_wait_io, 880 __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io,
879 TASK_UNINTERRUPTIBLE); 881 TASK_UNINTERRUPTIBLE);
880} 882}
881EXPORT_SYMBOL(__lock_page); 883EXPORT_SYMBOL(__lock_page);
882 884
883int __lock_page_killable(struct page *page) 885int __lock_page_killable(struct page *page)
884{ 886{
885 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); 887 struct page *page_head = compound_head(page);
888 DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
886 889
887 return __wait_on_bit_lock(page_waitqueue(page), &wait, 890 return __wait_on_bit_lock(page_waitqueue(page_head), &wait,
888 bit_wait_io, TASK_KILLABLE); 891 bit_wait_io, TASK_KILLABLE);
889} 892}
890EXPORT_SYMBOL_GPL(__lock_page_killable); 893EXPORT_SYMBOL_GPL(__lock_page_killable);
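
Several of the mm/filemap.c changes above follow one pattern from the THP rework: the page-lock bit now lives only on the head page of a compound page, so unlock_page() and __lock_page() first translate whatever page they were handed through compound_head(). The userspace sketch below models only that redirection; struct fake_page and its boolean lock are stand-ins, not struct page.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-in for struct page: 'locked' plays the role of PG_locked on the
 * head page, and 'head' is NULL for a head page or points to its head for
 * a tail page.
 */
struct fake_page {
    bool locked;
    struct fake_page *head;
};

static struct fake_page *compound_head(struct fake_page *page)
{
    return page->head ? page->head : page;
}

static void lock_page(struct fake_page *page)
{
    compound_head(page)->locked = true;
}

static void unlock_page(struct fake_page *page)
{
    struct fake_page *head = compound_head(page);

    assert(head->locked);   /* mirrors VM_BUG_ON_PAGE(!PageLocked(page)) */
    head->locked = false;
}

int main(void)
{
    struct fake_page head = { .locked = false, .head = NULL };
    struct fake_page tail = { .locked = false, .head = &head };

    lock_page(&tail);       /* locking through a tail locks the head */
    printf("head locked: %d\n", head.locked);
    unlock_page(&tail);
    printf("head locked: %d\n", head.locked);
    return 0;
}
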
diff --git a/mm/gup.c b/mm/gup.c
index deafa2c91b36..b64a36175884 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -4,6 +4,7 @@
4#include <linux/spinlock.h> 4#include <linux/spinlock.h>
5 5
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/memremap.h>
7#include <linux/pagemap.h> 8#include <linux/pagemap.h>
8#include <linux/rmap.h> 9#include <linux/rmap.h>
9#include <linux/swap.h> 10#include <linux/swap.h>
@@ -62,6 +63,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
62 unsigned long address, pmd_t *pmd, unsigned int flags) 63 unsigned long address, pmd_t *pmd, unsigned int flags)
63{ 64{
64 struct mm_struct *mm = vma->vm_mm; 65 struct mm_struct *mm = vma->vm_mm;
66 struct dev_pagemap *pgmap = NULL;
65 struct page *page; 67 struct page *page;
66 spinlock_t *ptl; 68 spinlock_t *ptl;
67 pte_t *ptep, pte; 69 pte_t *ptep, pte;
@@ -98,7 +100,17 @@ retry:
98 } 100 }
99 101
100 page = vm_normal_page(vma, address, pte); 102 page = vm_normal_page(vma, address, pte);
101 if (unlikely(!page)) { 103 if (!page && pte_devmap(pte) && (flags & FOLL_GET)) {
104 /*
105 * Only return device mapping pages in the FOLL_GET case since
106 * they are only valid while holding the pgmap reference.
107 */
108 pgmap = get_dev_pagemap(pte_pfn(pte), NULL);
109 if (pgmap)
110 page = pte_page(pte);
111 else
112 goto no_page;
113 } else if (unlikely(!page)) {
102 if (flags & FOLL_DUMP) { 114 if (flags & FOLL_DUMP) {
103 /* Avoid special (like zero) pages in core dumps */ 115 /* Avoid special (like zero) pages in core dumps */
104 page = ERR_PTR(-EFAULT); 116 page = ERR_PTR(-EFAULT);
@@ -116,8 +128,28 @@ retry:
116 } 128 }
117 } 129 }
118 130
119 if (flags & FOLL_GET) 131 if (flags & FOLL_SPLIT && PageTransCompound(page)) {
120 get_page_foll(page); 132 int ret;
133 get_page(page);
134 pte_unmap_unlock(ptep, ptl);
135 lock_page(page);
136 ret = split_huge_page(page);
137 unlock_page(page);
138 put_page(page);
139 if (ret)
140 return ERR_PTR(ret);
141 goto retry;
142 }
143
144 if (flags & FOLL_GET) {
145 get_page(page);
146
147 /* drop the pgmap reference now that we hold the page */
148 if (pgmap) {
149 put_dev_pagemap(pgmap);
150 pgmap = NULL;
151 }
152 }
121 if (flags & FOLL_TOUCH) { 153 if (flags & FOLL_TOUCH) {
122 if ((flags & FOLL_WRITE) && 154 if ((flags & FOLL_WRITE) &&
123 !pte_dirty(pte) && !PageDirty(page)) 155 !pte_dirty(pte) && !PageDirty(page))
@@ -130,6 +162,10 @@ retry:
130 mark_page_accessed(page); 162 mark_page_accessed(page);
131 } 163 }
132 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { 164 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
165 /* Do not mlock pte-mapped THP */
166 if (PageTransCompound(page))
167 goto out;
168
133 /* 169 /*
134 * The preliminary mapping check is mainly to avoid the 170 * The preliminary mapping check is mainly to avoid the
135 * pointless overhead of lock_page on the ZERO_PAGE 171 * pointless overhead of lock_page on the ZERO_PAGE
@@ -220,27 +256,45 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
220 } 256 }
221 if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) 257 if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
222 return no_page_table(vma, flags); 258 return no_page_table(vma, flags);
223 if (pmd_trans_huge(*pmd)) { 259 if (pmd_devmap(*pmd)) {
224 if (flags & FOLL_SPLIT) {
225 split_huge_page_pmd(vma, address, pmd);
226 return follow_page_pte(vma, address, pmd, flags);
227 }
228 ptl = pmd_lock(mm, pmd); 260 ptl = pmd_lock(mm, pmd);
229 if (likely(pmd_trans_huge(*pmd))) { 261 page = follow_devmap_pmd(vma, address, pmd, flags);
230 if (unlikely(pmd_trans_splitting(*pmd))) { 262 spin_unlock(ptl);
231 spin_unlock(ptl); 263 if (page)
232 wait_split_huge_page(vma->anon_vma, pmd); 264 return page;
233 } else { 265 }
234 page = follow_trans_huge_pmd(vma, address, 266 if (likely(!pmd_trans_huge(*pmd)))
235 pmd, flags); 267 return follow_page_pte(vma, address, pmd, flags);
236 spin_unlock(ptl); 268
237 *page_mask = HPAGE_PMD_NR - 1; 269 ptl = pmd_lock(mm, pmd);
238 return page; 270 if (unlikely(!pmd_trans_huge(*pmd))) {
239 } 271 spin_unlock(ptl);
240 } else 272 return follow_page_pte(vma, address, pmd, flags);
273 }
274 if (flags & FOLL_SPLIT) {
275 int ret;
276 page = pmd_page(*pmd);
277 if (is_huge_zero_page(page)) {
278 spin_unlock(ptl);
279 ret = 0;
280 split_huge_pmd(vma, pmd, address);
281 } else {
282 get_page(page);
241 spin_unlock(ptl); 283 spin_unlock(ptl);
284 lock_page(page);
285 ret = split_huge_page(page);
286 unlock_page(page);
287 put_page(page);
288 }
289
290 return ret ? ERR_PTR(ret) :
291 follow_page_pte(vma, address, pmd, flags);
242 } 292 }
243 return follow_page_pte(vma, address, pmd, flags); 293
294 page = follow_trans_huge_pmd(vma, address, pmd, flags);
295 spin_unlock(ptl);
296 *page_mask = HPAGE_PMD_NR - 1;
297 return page;
244} 298}
245 299
246static int get_gate_page(struct mm_struct *mm, unsigned long address, 300static int get_gate_page(struct mm_struct *mm, unsigned long address,
@@ -564,6 +618,8 @@ EXPORT_SYMBOL(__get_user_pages);
564 * @mm: mm_struct of target mm 618 * @mm: mm_struct of target mm
565 * @address: user address 619 * @address: user address
566 * @fault_flags:flags to pass down to handle_mm_fault() 620 * @fault_flags:flags to pass down to handle_mm_fault()
 621 * @unlocked: did we unlock the mmap_sem while retrying; may be NULL if the
 622 * caller does not allow retry
567 * 623 *
568 * This is meant to be called in the specific scenario where for locking reasons 624 * This is meant to be called in the specific scenario where for locking reasons
569 * we try to access user memory in atomic context (within a pagefault_disable() 625 * we try to access user memory in atomic context (within a pagefault_disable()
@@ -575,22 +631,28 @@ EXPORT_SYMBOL(__get_user_pages);
575 * The main difference with get_user_pages() is that this function will 631 * The main difference with get_user_pages() is that this function will
576 * unconditionally call handle_mm_fault() which will in turn perform all the 632 * unconditionally call handle_mm_fault() which will in turn perform all the
577 * necessary SW fixup of the dirty and young bits in the PTE, while 633 * necessary SW fixup of the dirty and young bits in the PTE, while
578 * handle_mm_fault() only guarantees to update these in the struct page. 634 * get_user_pages() only guarantees to update these in the struct page.
579 * 635 *
580 * This is important for some architectures where those bits also gate the 636 * This is important for some architectures where those bits also gate the
581 * access permission to the page because they are maintained in software. On 637 * access permission to the page because they are maintained in software. On
582 * such architectures, gup() will not be enough to make a subsequent access 638 * such architectures, gup() will not be enough to make a subsequent access
583 * succeed. 639 * succeed.
584 * 640 *
 585 * This has the same semantics wrt the @mm->mmap_sem as does filemap_fault(). 641 * This function will not return with an unlocked mmap_sem, so it does not
 642 * have the same semantics wrt the @mm->mmap_sem as does filemap_fault().
586 */ 643 */
587int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, 644int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
588 unsigned long address, unsigned int fault_flags) 645 unsigned long address, unsigned int fault_flags,
646 bool *unlocked)
589{ 647{
590 struct vm_area_struct *vma; 648 struct vm_area_struct *vma;
591 vm_flags_t vm_flags; 649 vm_flags_t vm_flags;
592 int ret; 650 int ret, major = 0;
593 651
652 if (unlocked)
653 fault_flags |= FAULT_FLAG_ALLOW_RETRY;
654
655retry:
594 vma = find_extend_vma(mm, address); 656 vma = find_extend_vma(mm, address);
595 if (!vma || address < vma->vm_start) 657 if (!vma || address < vma->vm_start)
596 return -EFAULT; 658 return -EFAULT;
@@ -600,6 +662,7 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
600 return -EFAULT; 662 return -EFAULT;
601 663
602 ret = handle_mm_fault(mm, vma, address, fault_flags); 664 ret = handle_mm_fault(mm, vma, address, fault_flags);
665 major |= ret & VM_FAULT_MAJOR;
603 if (ret & VM_FAULT_ERROR) { 666 if (ret & VM_FAULT_ERROR) {
604 if (ret & VM_FAULT_OOM) 667 if (ret & VM_FAULT_OOM)
605 return -ENOMEM; 668 return -ENOMEM;
@@ -609,8 +672,19 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
609 return -EFAULT; 672 return -EFAULT;
610 BUG(); 673 BUG();
611 } 674 }
675
676 if (ret & VM_FAULT_RETRY) {
677 down_read(&mm->mmap_sem);
678 if (!(fault_flags & FAULT_FLAG_TRIED)) {
679 *unlocked = true;
680 fault_flags &= ~FAULT_FLAG_ALLOW_RETRY;
681 fault_flags |= FAULT_FLAG_TRIED;
682 goto retry;
683 }
684 }
685
612 if (tsk) { 686 if (tsk) {
613 if (ret & VM_FAULT_MAJOR) 687 if (major)
614 tsk->maj_flt++; 688 tsk->maj_flt++;
615 else 689 else
616 tsk->min_flt++; 690 tsk->min_flt++;
@@ -896,7 +970,6 @@ long populate_vma_page_range(struct vm_area_struct *vma,
896 gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK; 970 gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
897 if (vma->vm_flags & VM_LOCKONFAULT) 971 if (vma->vm_flags & VM_LOCKONFAULT)
898 gup_flags &= ~FOLL_POPULATE; 972 gup_flags &= ~FOLL_POPULATE;
899
900 /* 973 /*
901 * We want to touch writable mappings with a write fault in order 974 * We want to touch writable mappings with a write fault in order
902 * to break COW, except for shared mappings because these don't COW 975 * to break COW, except for shared mappings because these don't COW
@@ -1036,9 +1109,6 @@ struct page *get_dump_page(unsigned long addr)
1036 * *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free 1109 * *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free
1037 * pages containing page tables. 1110 * pages containing page tables.
1038 * 1111 *
1039 * *) THP splits will broadcast an IPI, this can be achieved by overriding
1040 * pmdp_splitting_flush.
1041 *
1042 * *) ptes can be read atomically by the architecture. 1112 * *) ptes can be read atomically by the architecture.
1043 * 1113 *
1044 * *) access_ok is sufficient to validate userspace address ranges. 1114 * *) access_ok is sufficient to validate userspace address ranges.
@@ -1066,7 +1136,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
1066 * for an example see gup_get_pte in arch/x86/mm/gup.c 1136 * for an example see gup_get_pte in arch/x86/mm/gup.c
1067 */ 1137 */
1068 pte_t pte = READ_ONCE(*ptep); 1138 pte_t pte = READ_ONCE(*ptep);
1069 struct page *page; 1139 struct page *head, *page;
1070 1140
1071 /* 1141 /*
1072 * Similar to the PMD case below, NUMA hinting must take slow 1142 * Similar to the PMD case below, NUMA hinting must take slow
@@ -1078,15 +1148,17 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
1078 1148
1079 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 1149 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
1080 page = pte_page(pte); 1150 page = pte_page(pte);
1151 head = compound_head(page);
1081 1152
1082 if (!page_cache_get_speculative(page)) 1153 if (!page_cache_get_speculative(head))
1083 goto pte_unmap; 1154 goto pte_unmap;
1084 1155
1085 if (unlikely(pte_val(pte) != pte_val(*ptep))) { 1156 if (unlikely(pte_val(pte) != pte_val(*ptep))) {
1086 put_page(page); 1157 put_page(head);
1087 goto pte_unmap; 1158 goto pte_unmap;
1088 } 1159 }
1089 1160
1161 VM_BUG_ON_PAGE(compound_head(page) != head, page);
1090 pages[*nr] = page; 1162 pages[*nr] = page;
1091 (*nr)++; 1163 (*nr)++;
1092 1164
@@ -1119,7 +1191,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
1119static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, 1191static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
1120 unsigned long end, int write, struct page **pages, int *nr) 1192 unsigned long end, int write, struct page **pages, int *nr)
1121{ 1193{
1122 struct page *head, *page, *tail; 1194 struct page *head, *page;
1123 int refs; 1195 int refs;
1124 1196
1125 if (write && !pmd_write(orig)) 1197 if (write && !pmd_write(orig))
@@ -1128,7 +1200,6 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
1128 refs = 0; 1200 refs = 0;
1129 head = pmd_page(orig); 1201 head = pmd_page(orig);
1130 page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); 1202 page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
1131 tail = page;
1132 do { 1203 do {
1133 VM_BUG_ON_PAGE(compound_head(page) != head, page); 1204 VM_BUG_ON_PAGE(compound_head(page) != head, page);
1134 pages[*nr] = page; 1205 pages[*nr] = page;
@@ -1149,24 +1220,13 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
1149 return 0; 1220 return 0;
1150 } 1221 }
1151 1222
1152 /*
1153 * Any tail pages need their mapcount reference taken before we
1154 * return. (This allows the THP code to bump their ref count when
1155 * they are split into base pages).
1156 */
1157 while (refs--) {
1158 if (PageTail(tail))
1159 get_huge_page_tail(tail);
1160 tail++;
1161 }
1162
1163 return 1; 1223 return 1;
1164} 1224}
1165 1225
1166static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, 1226static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
1167 unsigned long end, int write, struct page **pages, int *nr) 1227 unsigned long end, int write, struct page **pages, int *nr)
1168{ 1228{
1169 struct page *head, *page, *tail; 1229 struct page *head, *page;
1170 int refs; 1230 int refs;
1171 1231
1172 if (write && !pud_write(orig)) 1232 if (write && !pud_write(orig))
@@ -1175,7 +1235,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
1175 refs = 0; 1235 refs = 0;
1176 head = pud_page(orig); 1236 head = pud_page(orig);
1177 page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); 1237 page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
1178 tail = page;
1179 do { 1238 do {
1180 VM_BUG_ON_PAGE(compound_head(page) != head, page); 1239 VM_BUG_ON_PAGE(compound_head(page) != head, page);
1181 pages[*nr] = page; 1240 pages[*nr] = page;
@@ -1196,12 +1255,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
1196 return 0; 1255 return 0;
1197 } 1256 }
1198 1257
1199 while (refs--) {
1200 if (PageTail(tail))
1201 get_huge_page_tail(tail);
1202 tail++;
1203 }
1204
1205 return 1; 1258 return 1;
1206} 1259}
1207 1260
@@ -1210,7 +1263,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
1210 struct page **pages, int *nr) 1263 struct page **pages, int *nr)
1211{ 1264{
1212 int refs; 1265 int refs;
1213 struct page *head, *page, *tail; 1266 struct page *head, *page;
1214 1267
1215 if (write && !pgd_write(orig)) 1268 if (write && !pgd_write(orig))
1216 return 0; 1269 return 0;
@@ -1218,7 +1271,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
1218 refs = 0; 1271 refs = 0;
1219 head = pgd_page(orig); 1272 head = pgd_page(orig);
1220 page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); 1273 page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
1221 tail = page;
1222 do { 1274 do {
1223 VM_BUG_ON_PAGE(compound_head(page) != head, page); 1275 VM_BUG_ON_PAGE(compound_head(page) != head, page);
1224 pages[*nr] = page; 1276 pages[*nr] = page;
@@ -1239,12 +1291,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
1239 return 0; 1291 return 0;
1240 } 1292 }
1241 1293
1242 while (refs--) {
1243 if (PageTail(tail))
1244 get_huge_page_tail(tail);
1245 tail++;
1246 }
1247
1248 return 1; 1294 return 1;
1249} 1295}
1250 1296
@@ -1259,7 +1305,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
1259 pmd_t pmd = READ_ONCE(*pmdp); 1305 pmd_t pmd = READ_ONCE(*pmdp);
1260 1306
1261 next = pmd_addr_end(addr, end); 1307 next = pmd_addr_end(addr, end);
1262 if (pmd_none(pmd) || pmd_trans_splitting(pmd)) 1308 if (pmd_none(pmd))
1263 return 0; 1309 return 0;
1264 1310
1265 if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) { 1311 if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
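
Two ideas recur in the mm/gup.c diff above. FOLL_SPLIT now splits the compound page itself (take a reference, drop the page-table lock, lock the page, call split_huge_page(), retry), and the lockless fast-GUP path pins the compound head and then re-reads the PTE to detect a concurrent change. The sketch below models the second, speculative-pin-then-recheck step in plain userspace C; the types and the unconditional atomic increment are simplifications, not the kernel's page_cache_get_speculative().

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-ins: a refcounted "page" and a PTE modelled as an atomic word. */
struct fake_page {
    atomic_int refcount;
};

typedef atomic_ulong fake_pte_t;

static bool try_get_page_speculative(struct fake_page *head)
{
    /* the kernel uses page_cache_get_speculative(); model it as a plain
     * increment that always succeeds */
    atomic_fetch_add(&head->refcount, 1);
    return true;
}

/* Speculative pin, then re-read the PTE and back off if it changed. */
static bool gup_one_pte(fake_pte_t *ptep, struct fake_page *head)
{
    unsigned long pte = atomic_load(ptep);      /* READ_ONCE(*ptep) */

    if (!try_get_page_speculative(head))
        return false;

    if (atomic_load(ptep) != pte) {             /* PTE changed under us */
        atomic_fetch_sub(&head->refcount, 1);   /* put_page(head) */
        return false;
    }
    return true;                                /* head is safely pinned */
}

int main(void)
{
    struct fake_page head = { .refcount = 1 };
    fake_pte_t pte = 0x1000;

    printf("pinned: %d, refcount: %d\n",
           gup_one_pte(&pte, &head), atomic_load(&head.refcount));
    return 0;
}
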
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f952f055fdcf..b2db98136af9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -16,12 +16,16 @@
16#include <linux/swap.h> 16#include <linux/swap.h>
17#include <linux/shrinker.h> 17#include <linux/shrinker.h>
18#include <linux/mm_inline.h> 18#include <linux/mm_inline.h>
19#include <linux/swapops.h>
19#include <linux/dax.h> 20#include <linux/dax.h>
20#include <linux/kthread.h> 21#include <linux/kthread.h>
21#include <linux/khugepaged.h> 22#include <linux/khugepaged.h>
22#include <linux/freezer.h> 23#include <linux/freezer.h>
24#include <linux/pfn_t.h>
23#include <linux/mman.h> 25#include <linux/mman.h>
26#include <linux/memremap.h>
24#include <linux/pagemap.h> 27#include <linux/pagemap.h>
28#include <linux/debugfs.h>
25#include <linux/migrate.h> 29#include <linux/migrate.h>
26#include <linux/hashtable.h> 30#include <linux/hashtable.h>
27#include <linux/userfaultfd_k.h> 31#include <linux/userfaultfd_k.h>
@@ -45,6 +49,7 @@ enum scan_result {
45 SCAN_PAGE_LRU, 49 SCAN_PAGE_LRU,
46 SCAN_PAGE_LOCK, 50 SCAN_PAGE_LOCK,
47 SCAN_PAGE_ANON, 51 SCAN_PAGE_ANON,
52 SCAN_PAGE_COMPOUND,
48 SCAN_ANY_PROCESS, 53 SCAN_ANY_PROCESS,
49 SCAN_VMA_NULL, 54 SCAN_VMA_NULL,
50 SCAN_VMA_CHECK, 55 SCAN_VMA_CHECK,
@@ -133,6 +138,10 @@ static struct khugepaged_scan khugepaged_scan = {
133 .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), 138 .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
134}; 139};
135 140
141static DEFINE_SPINLOCK(split_queue_lock);
142static LIST_HEAD(split_queue);
143static unsigned long split_queue_len;
144static struct shrinker deferred_split_shrinker;
136 145
137static void set_recommended_min_free_kbytes(void) 146static void set_recommended_min_free_kbytes(void)
138{ 147{
@@ -665,6 +674,9 @@ static int __init hugepage_init(void)
665 err = register_shrinker(&huge_zero_page_shrinker); 674 err = register_shrinker(&huge_zero_page_shrinker);
666 if (err) 675 if (err)
667 goto err_hzp_shrinker; 676 goto err_hzp_shrinker;
677 err = register_shrinker(&deferred_split_shrinker);
678 if (err)
679 goto err_split_shrinker;
668 680
669 /* 681 /*
670 * By default disable transparent hugepages on smaller systems, 682 * By default disable transparent hugepages on smaller systems,
@@ -682,6 +694,8 @@ static int __init hugepage_init(void)
682 694
683 return 0; 695 return 0;
684err_khugepaged: 696err_khugepaged:
697 unregister_shrinker(&deferred_split_shrinker);
698err_split_shrinker:
685 unregister_shrinker(&huge_zero_page_shrinker); 699 unregister_shrinker(&huge_zero_page_shrinker);
686err_hzp_shrinker: 700err_hzp_shrinker:
687 khugepaged_slab_exit(); 701 khugepaged_slab_exit();
@@ -738,6 +752,27 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
738 return entry; 752 return entry;
739} 753}
740 754
755static inline struct list_head *page_deferred_list(struct page *page)
756{
757 /*
758 * ->lru in the tail pages is occupied by compound_head.
759 * Let's use ->mapping + ->index in the second tail page as list_head.
760 */
761 return (struct list_head *)&page[2].mapping;
762}
763
764void prep_transhuge_page(struct page *page)
765{
766 /*
767 * we use page->mapping and page->indexlru in second tail page
768 * as list_head: assuming THP order >= 2
769 */
770 BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
771
772 INIT_LIST_HEAD(page_deferred_list(page));
773 set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
774}
775
741static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, 776static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
742 struct vm_area_struct *vma, 777 struct vm_area_struct *vma,
743 unsigned long address, pmd_t *pmd, 778 unsigned long address, pmd_t *pmd,
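
prep_transhuge_page() and page_deferred_list() above rely on a layout trick: for a THP of order >= 2 the second tail page's ->mapping and ->index words are unused, so those two adjacent words are reinterpreted as the list_head that queues the page for deferred splitting (the tail's ->lru is already taken by compound_head). The userspace sketch below reproduces the same cast on a stand-in structure whose two fields are pointer-sized on a typical 64-bit build; it illustrates the aliasing only and is not kernel code.

#include <stdio.h>

struct list_head {
    struct list_head *next, *prev;
};

/* Stand-in for struct page with two adjacent pointer-sized words in the
 * positions of ->mapping and ->index.
 */
struct fake_page {
    void *mapping;          /* reused as list_head.next */
    unsigned long index;    /* reused as list_head.prev */
};

/* Same shape as the kernel helper: view &page[2].mapping as a list_head. */
static struct list_head *page_deferred_list(struct fake_page *pages)
{
    return (struct list_head *)&pages[2].mapping;
}

static void INIT_LIST_HEAD(struct list_head *list)
{
    list->next = list;
    list->prev = list;
}

int main(void)
{
    struct fake_page thp[4] = { 0 };    /* head page plus three tails */
    struct list_head *node = page_deferred_list(thp);

    INIT_LIST_HEAD(node);
    printf("deferred list empty: %d\n",
           node->next == node && node->prev == node);
    return 0;
}
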
@@ -751,7 +786,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
751 786
752 VM_BUG_ON_PAGE(!PageCompound(page), page); 787 VM_BUG_ON_PAGE(!PageCompound(page), page);
753 788
754 if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) { 789 if (mem_cgroup_try_charge(page, mm, gfp, &memcg, true)) {
755 put_page(page); 790 put_page(page);
756 count_vm_event(THP_FAULT_FALLBACK); 791 count_vm_event(THP_FAULT_FALLBACK);
757 return VM_FAULT_FALLBACK; 792 return VM_FAULT_FALLBACK;
@@ -759,7 +794,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
759 794
760 pgtable = pte_alloc_one(mm, haddr); 795 pgtable = pte_alloc_one(mm, haddr);
761 if (unlikely(!pgtable)) { 796 if (unlikely(!pgtable)) {
762 mem_cgroup_cancel_charge(page, memcg); 797 mem_cgroup_cancel_charge(page, memcg, true);
763 put_page(page); 798 put_page(page);
764 return VM_FAULT_OOM; 799 return VM_FAULT_OOM;
765 } 800 }
@@ -775,7 +810,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
775 ptl = pmd_lock(mm, pmd); 810 ptl = pmd_lock(mm, pmd);
776 if (unlikely(!pmd_none(*pmd))) { 811 if (unlikely(!pmd_none(*pmd))) {
777 spin_unlock(ptl); 812 spin_unlock(ptl);
778 mem_cgroup_cancel_charge(page, memcg); 813 mem_cgroup_cancel_charge(page, memcg, true);
779 put_page(page); 814 put_page(page);
780 pte_free(mm, pgtable); 815 pte_free(mm, pgtable);
781 } else { 816 } else {
@@ -786,7 +821,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
786 int ret; 821 int ret;
787 822
788 spin_unlock(ptl); 823 spin_unlock(ptl);
789 mem_cgroup_cancel_charge(page, memcg); 824 mem_cgroup_cancel_charge(page, memcg, true);
790 put_page(page); 825 put_page(page);
791 pte_free(mm, pgtable); 826 pte_free(mm, pgtable);
792 ret = handle_userfault(vma, address, flags, 827 ret = handle_userfault(vma, address, flags,
@@ -797,8 +832,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
797 832
798 entry = mk_huge_pmd(page, vma->vm_page_prot); 833 entry = mk_huge_pmd(page, vma->vm_page_prot);
799 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 834 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
800 page_add_new_anon_rmap(page, vma, haddr); 835 page_add_new_anon_rmap(page, vma, haddr, true);
801 mem_cgroup_commit_charge(page, memcg, false); 836 mem_cgroup_commit_charge(page, memcg, false, true);
802 lru_cache_add_active_or_unevictable(page, vma); 837 lru_cache_add_active_or_unevictable(page, vma);
803 pgtable_trans_huge_deposit(mm, pmd, pgtable); 838 pgtable_trans_huge_deposit(mm, pmd, pgtable);
804 set_pmd_at(mm, haddr, pmd, entry); 839 set_pmd_at(mm, haddr, pmd, entry);
@@ -892,32 +927,33 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
892 count_vm_event(THP_FAULT_FALLBACK); 927 count_vm_event(THP_FAULT_FALLBACK);
893 return VM_FAULT_FALLBACK; 928 return VM_FAULT_FALLBACK;
894 } 929 }
930 prep_transhuge_page(page);
895 return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp, 931 return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp,
896 flags); 932 flags);
897} 933}
898 934
899static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, 935static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
900 pmd_t *pmd, unsigned long pfn, pgprot_t prot, bool write) 936 pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write)
901{ 937{
902 struct mm_struct *mm = vma->vm_mm; 938 struct mm_struct *mm = vma->vm_mm;
903 pmd_t entry; 939 pmd_t entry;
904 spinlock_t *ptl; 940 spinlock_t *ptl;
905 941
906 ptl = pmd_lock(mm, pmd); 942 ptl = pmd_lock(mm, pmd);
907 if (pmd_none(*pmd)) { 943 entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
908 entry = pmd_mkhuge(pfn_pmd(pfn, prot)); 944 if (pfn_t_devmap(pfn))
909 if (write) { 945 entry = pmd_mkdevmap(entry);
910 entry = pmd_mkyoung(pmd_mkdirty(entry)); 946 if (write) {
911 entry = maybe_pmd_mkwrite(entry, vma); 947 entry = pmd_mkyoung(pmd_mkdirty(entry));
912 } 948 entry = maybe_pmd_mkwrite(entry, vma);
913 set_pmd_at(mm, addr, pmd, entry); 949 }
914 update_mmu_cache_pmd(vma, addr, pmd); 950 set_pmd_at(mm, addr, pmd, entry);
915 } 951 update_mmu_cache_pmd(vma, addr, pmd);
916 spin_unlock(ptl); 952 spin_unlock(ptl);
917} 953}
918 954
919int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, 955int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
920 pmd_t *pmd, unsigned long pfn, bool write) 956 pmd_t *pmd, pfn_t pfn, bool write)
921{ 957{
922 pgprot_t pgprot = vma->vm_page_prot; 958 pgprot_t pgprot = vma->vm_page_prot;
923 /* 959 /*
@@ -929,7 +965,7 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
929 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == 965 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
930 (VM_PFNMAP|VM_MIXEDMAP)); 966 (VM_PFNMAP|VM_MIXEDMAP));
931 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); 967 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
932 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); 968 BUG_ON(!pfn_t_devmap(pfn));
933 969
934 if (addr < vma->vm_start || addr >= vma->vm_end) 970 if (addr < vma->vm_start || addr >= vma->vm_end)
935 return VM_FAULT_SIGBUS; 971 return VM_FAULT_SIGBUS;
@@ -939,6 +975,63 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
939 return VM_FAULT_NOPAGE; 975 return VM_FAULT_NOPAGE;
940} 976}
941 977
978static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
979 pmd_t *pmd)
980{
981 pmd_t _pmd;
982
983 /*
984 * We should set the dirty bit only for FOLL_WRITE but for now
985 * the dirty bit in the pmd is meaningless. And if the dirty
986 * bit will become meaningful and we'll only set it with
987 * FOLL_WRITE, an atomic set_bit will be required on the pmd to
988 * set the young bit, instead of the current set_pmd_at.
989 */
990 _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
991 if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
992 pmd, _pmd, 1))
993 update_mmu_cache_pmd(vma, addr, pmd);
994}
995
996struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
997 pmd_t *pmd, int flags)
998{
999 unsigned long pfn = pmd_pfn(*pmd);
1000 struct mm_struct *mm = vma->vm_mm;
1001 struct dev_pagemap *pgmap;
1002 struct page *page;
1003
1004 assert_spin_locked(pmd_lockptr(mm, pmd));
1005
1006 if (flags & FOLL_WRITE && !pmd_write(*pmd))
1007 return NULL;
1008
1009 if (pmd_present(*pmd) && pmd_devmap(*pmd))
1010 /* pass */;
1011 else
1012 return NULL;
1013
1014 if (flags & FOLL_TOUCH)
1015 touch_pmd(vma, addr, pmd);
1016
1017 /*
1018 * device mapped pages can only be returned if the
1019 * caller will manage the page reference count.
1020 */
1021 if (!(flags & FOLL_GET))
1022 return ERR_PTR(-EEXIST);
1023
1024 pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
1025 pgmap = get_dev_pagemap(pfn, NULL);
1026 if (!pgmap)
1027 return ERR_PTR(-EFAULT);
1028 page = pfn_to_page(pfn);
1029 get_page(page);
1030 put_dev_pagemap(pgmap);
1031
1032 return page;
1033}
1034
942int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, 1035int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
943 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, 1036 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
944 struct vm_area_struct *vma) 1037 struct vm_area_struct *vma)
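
follow_devmap_pmd() above orders its references carefully: the dev_pagemap is what keeps device pages valid, so the pgmap reference is taken first, the page is pinned under it, and only then is the pgmap reference dropped, with the page pin keeping the mapping alive from then on (which is also why device pages are handed out only under FOLL_GET). A toy userspace model of that ordering follows; both structures and their counters are invented for the sketch.

#include <stdio.h>

/* Invented stand-ins for dev_pagemap and struct page. */
struct fake_pagemap { int refs; };
struct fake_page    { int refs; struct fake_pagemap *pgmap; };

static struct fake_pagemap *get_dev_pagemap(struct fake_page *page)
{
    page->pgmap->refs++;
    return page->pgmap;
}

static void put_dev_pagemap(struct fake_pagemap *pgmap) { pgmap->refs--; }
static void get_page(struct fake_page *page)            { page->refs++;  }

/* Take the pgmap reference, pin the page, then drop the pgmap reference:
 * the page pin is what keeps things alive for the caller afterwards.
 */
static struct fake_page *follow_devmap(struct fake_page *page)
{
    struct fake_pagemap *pgmap = get_dev_pagemap(page);

    if (!pgmap)
        return NULL;        /* never happens in this toy, kept for shape */
    get_page(page);
    put_dev_pagemap(pgmap);
    return page;
}

int main(void)
{
    struct fake_pagemap pgmap = { .refs = 1 };
    struct fake_page page = { .refs = 1, .pgmap = &pgmap };

    follow_devmap(&page);
    printf("page refs %d, pgmap refs %d\n", page.refs, pgmap.refs);
    return 0;
}
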
@@ -960,7 +1053,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
960 1053
961 ret = -EAGAIN; 1054 ret = -EAGAIN;
962 pmd = *src_pmd; 1055 pmd = *src_pmd;
963 if (unlikely(!pmd_trans_huge(pmd))) { 1056 if (unlikely(!pmd_trans_huge(pmd) && !pmd_devmap(pmd))) {
964 pte_free(dst_mm, pgtable); 1057 pte_free(dst_mm, pgtable);
965 goto out_unlock; 1058 goto out_unlock;
966 } 1059 }
@@ -983,26 +1076,20 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
983 goto out_unlock; 1076 goto out_unlock;
984 } 1077 }
985 1078
986 if (unlikely(pmd_trans_splitting(pmd))) { 1079 if (pmd_trans_huge(pmd)) {
987 /* split huge page running from under us */ 1080 /* thp accounting separate from pmd_devmap accounting */
988 spin_unlock(src_ptl); 1081 src_page = pmd_page(pmd);
989 spin_unlock(dst_ptl); 1082 VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
990 pte_free(dst_mm, pgtable); 1083 get_page(src_page);
991 1084 page_dup_rmap(src_page, true);
992 wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */ 1085 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
993 goto out; 1086 atomic_long_inc(&dst_mm->nr_ptes);
1087 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
994 } 1088 }
995 src_page = pmd_page(pmd);
996 VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
997 get_page(src_page);
998 page_dup_rmap(src_page);
999 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1000 1089
1001 pmdp_set_wrprotect(src_mm, addr, src_pmd); 1090 pmdp_set_wrprotect(src_mm, addr, src_pmd);
1002 pmd = pmd_mkold(pmd_wrprotect(pmd)); 1091 pmd = pmd_mkold(pmd_wrprotect(pmd));
1003 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
1004 set_pmd_at(dst_mm, addr, dst_pmd, pmd); 1092 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
1005 atomic_long_inc(&dst_mm->nr_ptes);
1006 1093
1007 ret = 0; 1094 ret = 0;
1008out_unlock: 1095out_unlock:
@@ -1035,37 +1122,6 @@ unlock:
1035 spin_unlock(ptl); 1122 spin_unlock(ptl);
1036} 1123}
1037 1124
1038/*
1039 * Save CONFIG_DEBUG_PAGEALLOC from faulting falsely on tail pages
1040 * during copy_user_huge_page()'s copy_page_rep(): in the case when
1041 * the source page gets split and a tail freed before copy completes.
1042 * Called under pmd_lock of checked pmd, so safe from splitting itself.
1043 */
1044static void get_user_huge_page(struct page *page)
1045{
1046 if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
1047 struct page *endpage = page + HPAGE_PMD_NR;
1048
1049 atomic_add(HPAGE_PMD_NR, &page->_count);
1050 while (++page < endpage)
1051 get_huge_page_tail(page);
1052 } else {
1053 get_page(page);
1054 }
1055}
1056
1057static void put_user_huge_page(struct page *page)
1058{
1059 if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
1060 struct page *endpage = page + HPAGE_PMD_NR;
1061
1062 while (page < endpage)
1063 put_page(page++);
1064 } else {
1065 put_page(page);
1066 }
1067}
1068
1069static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, 1125static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
1070 struct vm_area_struct *vma, 1126 struct vm_area_struct *vma,
1071 unsigned long address, 1127 unsigned long address,
@@ -1095,13 +1151,14 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
1095 vma, address, page_to_nid(page)); 1151 vma, address, page_to_nid(page));
1096 if (unlikely(!pages[i] || 1152 if (unlikely(!pages[i] ||
1097 mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL, 1153 mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL,
1098 &memcg))) { 1154 &memcg, false))) {
1099 if (pages[i]) 1155 if (pages[i])
1100 put_page(pages[i]); 1156 put_page(pages[i]);
1101 while (--i >= 0) { 1157 while (--i >= 0) {
1102 memcg = (void *)page_private(pages[i]); 1158 memcg = (void *)page_private(pages[i]);
1103 set_page_private(pages[i], 0); 1159 set_page_private(pages[i], 0);
1104 mem_cgroup_cancel_charge(pages[i], memcg); 1160 mem_cgroup_cancel_charge(pages[i], memcg,
1161 false);
1105 put_page(pages[i]); 1162 put_page(pages[i]);
1106 } 1163 }
1107 kfree(pages); 1164 kfree(pages);
@@ -1139,8 +1196,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
1139 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1196 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1140 memcg = (void *)page_private(pages[i]); 1197 memcg = (void *)page_private(pages[i]);
1141 set_page_private(pages[i], 0); 1198 set_page_private(pages[i], 0);
1142 page_add_new_anon_rmap(pages[i], vma, haddr); 1199 page_add_new_anon_rmap(pages[i], vma, haddr, false);
1143 mem_cgroup_commit_charge(pages[i], memcg, false); 1200 mem_cgroup_commit_charge(pages[i], memcg, false, false);
1144 lru_cache_add_active_or_unevictable(pages[i], vma); 1201 lru_cache_add_active_or_unevictable(pages[i], vma);
1145 pte = pte_offset_map(&_pmd, haddr); 1202 pte = pte_offset_map(&_pmd, haddr);
1146 VM_BUG_ON(!pte_none(*pte)); 1203 VM_BUG_ON(!pte_none(*pte));
@@ -1151,7 +1208,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
1151 1208
1152 smp_wmb(); /* make pte visible before pmd */ 1209 smp_wmb(); /* make pte visible before pmd */
1153 pmd_populate(mm, pmd, pgtable); 1210 pmd_populate(mm, pmd, pgtable);
1154 page_remove_rmap(page); 1211 page_remove_rmap(page, true);
1155 spin_unlock(ptl); 1212 spin_unlock(ptl);
1156 1213
1157 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1214 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
@@ -1168,7 +1225,7 @@ out_free_pages:
1168 for (i = 0; i < HPAGE_PMD_NR; i++) { 1225 for (i = 0; i < HPAGE_PMD_NR; i++) {
1169 memcg = (void *)page_private(pages[i]); 1226 memcg = (void *)page_private(pages[i]);
1170 set_page_private(pages[i], 0); 1227 set_page_private(pages[i], 0);
1171 mem_cgroup_cancel_charge(pages[i], memcg); 1228 mem_cgroup_cancel_charge(pages[i], memcg, false);
1172 put_page(pages[i]); 1229 put_page(pages[i]);
1173 } 1230 }
1174 kfree(pages); 1231 kfree(pages);
@@ -1198,7 +1255,17 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1198 1255
1199 page = pmd_page(orig_pmd); 1256 page = pmd_page(orig_pmd);
1200 VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page); 1257 VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
1201 if (page_mapcount(page) == 1) { 1258 /*
1259 * We can only reuse the page if nobody else maps the huge page or it's
1260 * part. We can do it by checking page_mapcount() on each sub-page, but
1261 * it's expensive.
1262 * The cheaper way is to check page_count() to be equal 1: every
1263 * mapcount takes page reference reference, so this way we can
1264 * guarantee, that the PMD is the only mapping.
1265 * This can give false negative if somebody pinned the page, but that's
1266 * fine.
1267 */
1268 if (page_mapcount(page) == 1 && page_count(page) == 1) {
1202 pmd_t entry; 1269 pmd_t entry;
1203 entry = pmd_mkyoung(orig_pmd); 1270 entry = pmd_mkyoung(orig_pmd);
1204 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1271 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -1207,7 +1274,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1207 ret |= VM_FAULT_WRITE; 1274 ret |= VM_FAULT_WRITE;
1208 goto out_unlock; 1275 goto out_unlock;
1209 } 1276 }
1210 get_user_huge_page(page); 1277 get_page(page);
1211 spin_unlock(ptl); 1278 spin_unlock(ptl);
1212alloc: 1279alloc:
1213 if (transparent_hugepage_enabled(vma) && 1280 if (transparent_hugepage_enabled(vma) &&
@@ -1217,30 +1284,33 @@ alloc:
1217 } else 1284 } else
1218 new_page = NULL; 1285 new_page = NULL;
1219 1286
1220 if (unlikely(!new_page)) { 1287 if (likely(new_page)) {
1288 prep_transhuge_page(new_page);
1289 } else {
1221 if (!page) { 1290 if (!page) {
1222 split_huge_page_pmd(vma, address, pmd); 1291 split_huge_pmd(vma, pmd, address);
1223 ret |= VM_FAULT_FALLBACK; 1292 ret |= VM_FAULT_FALLBACK;
1224 } else { 1293 } else {
1225 ret = do_huge_pmd_wp_page_fallback(mm, vma, address, 1294 ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
1226 pmd, orig_pmd, page, haddr); 1295 pmd, orig_pmd, page, haddr);
1227 if (ret & VM_FAULT_OOM) { 1296 if (ret & VM_FAULT_OOM) {
1228 split_huge_page(page); 1297 split_huge_pmd(vma, pmd, address);
1229 ret |= VM_FAULT_FALLBACK; 1298 ret |= VM_FAULT_FALLBACK;
1230 } 1299 }
1231 put_user_huge_page(page); 1300 put_page(page);
1232 } 1301 }
1233 count_vm_event(THP_FAULT_FALLBACK); 1302 count_vm_event(THP_FAULT_FALLBACK);
1234 goto out; 1303 goto out;
1235 } 1304 }
1236 1305
1237 if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg))) { 1306 if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg,
1307 true))) {
1238 put_page(new_page); 1308 put_page(new_page);
1239 if (page) { 1309 if (page) {
1240 split_huge_page(page); 1310 split_huge_pmd(vma, pmd, address);
1241 put_user_huge_page(page); 1311 put_page(page);
1242 } else 1312 } else
1243 split_huge_page_pmd(vma, address, pmd); 1313 split_huge_pmd(vma, pmd, address);
1244 ret |= VM_FAULT_FALLBACK; 1314 ret |= VM_FAULT_FALLBACK;
1245 count_vm_event(THP_FAULT_FALLBACK); 1315 count_vm_event(THP_FAULT_FALLBACK);
1246 goto out; 1316 goto out;
@@ -1260,10 +1330,10 @@ alloc:
1260 1330
1261 spin_lock(ptl); 1331 spin_lock(ptl);
1262 if (page) 1332 if (page)
1263 put_user_huge_page(page); 1333 put_page(page);
1264 if (unlikely(!pmd_same(*pmd, orig_pmd))) { 1334 if (unlikely(!pmd_same(*pmd, orig_pmd))) {
1265 spin_unlock(ptl); 1335 spin_unlock(ptl);
1266 mem_cgroup_cancel_charge(new_page, memcg); 1336 mem_cgroup_cancel_charge(new_page, memcg, true);
1267 put_page(new_page); 1337 put_page(new_page);
1268 goto out_mn; 1338 goto out_mn;
1269 } else { 1339 } else {
@@ -1271,8 +1341,8 @@ alloc:
1271 entry = mk_huge_pmd(new_page, vma->vm_page_prot); 1341 entry = mk_huge_pmd(new_page, vma->vm_page_prot);
1272 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1342 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1273 pmdp_huge_clear_flush_notify(vma, haddr, pmd); 1343 pmdp_huge_clear_flush_notify(vma, haddr, pmd);
1274 page_add_new_anon_rmap(new_page, vma, haddr); 1344 page_add_new_anon_rmap(new_page, vma, haddr, true);
1275 mem_cgroup_commit_charge(new_page, memcg, false); 1345 mem_cgroup_commit_charge(new_page, memcg, false, true);
1276 lru_cache_add_active_or_unevictable(new_page, vma); 1346 lru_cache_add_active_or_unevictable(new_page, vma);
1277 set_pmd_at(mm, haddr, pmd, entry); 1347 set_pmd_at(mm, haddr, pmd, entry);
1278 update_mmu_cache_pmd(vma, address, pmd); 1348 update_mmu_cache_pmd(vma, address, pmd);
@@ -1281,7 +1351,7 @@ alloc:
1281 put_huge_zero_page(); 1351 put_huge_zero_page();
1282 } else { 1352 } else {
1283 VM_BUG_ON_PAGE(!PageHead(page), page); 1353 VM_BUG_ON_PAGE(!PageHead(page), page);
1284 page_remove_rmap(page); 1354 page_remove_rmap(page, true);
1285 put_page(page); 1355 put_page(page);
1286 } 1356 }
1287 ret |= VM_FAULT_WRITE; 1357 ret |= VM_FAULT_WRITE;
@@ -1319,23 +1389,23 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1319 1389
1320 page = pmd_page(*pmd); 1390 page = pmd_page(*pmd);
1321 VM_BUG_ON_PAGE(!PageHead(page), page); 1391 VM_BUG_ON_PAGE(!PageHead(page), page);
1322 if (flags & FOLL_TOUCH) { 1392 if (flags & FOLL_TOUCH)
1323 pmd_t _pmd; 1393 touch_pmd(vma, addr, pmd);
1394 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
1324 /* 1395 /*
1325 * We should set the dirty bit only for FOLL_WRITE but 1396 * We don't mlock() pte-mapped THPs. This way we can avoid
1326 * for now the dirty bit in the pmd is meaningless. 1397 * leaking mlocked pages into non-VM_LOCKED VMAs.
1327 * And if the dirty bit will become meaningful and 1398 *
1328 * we'll only set it with FOLL_WRITE, an atomic 1399 * In most cases the pmd is the only mapping of the page as we
1329 * set_bit will be required on the pmd to set the 1400 * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
1330 * young bit, instead of the current set_pmd_at. 1401 * writable private mappings in populate_vma_page_range().
1402 *
1403 * The only scenario when we have the page shared here is if we
1404 * mlocking read-only mapping shared over fork(). We skip
1405 * mlocking such pages.
1331 */ 1406 */
1332 _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); 1407 if (compound_mapcount(page) == 1 && !PageDoubleMap(page) &&
1333 if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, 1408 page->mapping && trylock_page(page)) {
1334 pmd, _pmd, 1))
1335 update_mmu_cache_pmd(vma, addr, pmd);
1336 }
1337 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
1338 if (page->mapping && trylock_page(page)) {
1339 lru_add_drain(); 1409 lru_add_drain();
1340 if (page->mapping) 1410 if (page->mapping)
1341 mlock_vma_page(page); 1411 mlock_vma_page(page);
@@ -1345,7 +1415,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1345 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; 1415 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
1346 VM_BUG_ON_PAGE(!PageCompound(page), page); 1416 VM_BUG_ON_PAGE(!PageCompound(page), page);
1347 if (flags & FOLL_GET) 1417 if (flags & FOLL_GET)
1348 get_page_foll(page); 1418 get_page(page);
1349 1419
1350out: 1420out:
1351 return page; 1421 return page;
@@ -1480,13 +1550,84 @@ out:
1480 return 0; 1550 return 0;
1481} 1551}
1482 1552
1553int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1554 pmd_t *pmd, unsigned long addr, unsigned long next)
1555
1556{
1557 spinlock_t *ptl;
1558 pmd_t orig_pmd;
1559 struct page *page;
1560 struct mm_struct *mm = tlb->mm;
1561 int ret = 0;
1562
1563 if (!pmd_trans_huge_lock(pmd, vma, &ptl))
1564 goto out;
1565
1566 orig_pmd = *pmd;
1567 if (is_huge_zero_pmd(orig_pmd)) {
1568 ret = 1;
1569 goto out;
1570 }
1571
1572 page = pmd_page(orig_pmd);
1573 /*
 1574 * If other processes are mapping this page, we can't discard
 1575 * the page unless they all do MADV_FREE, so let's skip the page.
1576 */
1577 if (page_mapcount(page) != 1)
1578 goto out;
1579
1580 if (!trylock_page(page))
1581 goto out;
1582
1583 /*
 1584 * If the user wants to discard only part of the THP, split it so
 1585 * MADV_FREE will deactivate only those pages.
1586 */
1587 if (next - addr != HPAGE_PMD_SIZE) {
1588 get_page(page);
1589 spin_unlock(ptl);
1590 if (split_huge_page(page)) {
1591 put_page(page);
1592 unlock_page(page);
1593 goto out_unlocked;
1594 }
1595 put_page(page);
1596 unlock_page(page);
1597 ret = 1;
1598 goto out_unlocked;
1599 }
1600
1601 if (PageDirty(page))
1602 ClearPageDirty(page);
1603 unlock_page(page);
1604
1605 if (PageActive(page))
1606 deactivate_page(page);
1607
1608 if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
1609 orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
1610 tlb->fullmm);
1611 orig_pmd = pmd_mkold(orig_pmd);
1612 orig_pmd = pmd_mkclean(orig_pmd);
1613
1614 set_pmd_at(mm, addr, pmd, orig_pmd);
1615 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1616 }
1617 ret = 1;
1618out:
1619 spin_unlock(ptl);
1620out_unlocked:
1621 return ret;
1622}
1623
1483int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 1624int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1484 pmd_t *pmd, unsigned long addr) 1625 pmd_t *pmd, unsigned long addr)
1485{ 1626{
1486 pmd_t orig_pmd; 1627 pmd_t orig_pmd;
1487 spinlock_t *ptl; 1628 spinlock_t *ptl;
1488 1629
1489 if (__pmd_trans_huge_lock(pmd, vma, &ptl) != 1) 1630 if (!__pmd_trans_huge_lock(pmd, vma, &ptl))
1490 return 0; 1631 return 0;
1491 /* 1632 /*
1492 * For architectures like ppc64 we look at deposited pgtable 1633 * For architectures like ppc64 we look at deposited pgtable
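
madvise_free_huge_pmd() above implements MADV_FREE for a whole PMD: it skips the page if other processes map it, splits the THP when only part of the range is covered, and otherwise cleans and deactivates the page while marking the PMD old and clean so reclaim can later discard the memory instead of swapping it. The sketch below models only the final PMD bit manipulation in userspace; the bit positions are invented for the model and no page-table locking or TLB flushing is shown.

#include <stdio.h>

typedef unsigned long fake_pmd_t;

#define PMD_DIRTY  (1UL << 0)   /* invented bit positions for the model */
#define PMD_YOUNG  (1UL << 1)

static int        pmd_dirty(fake_pmd_t pmd)   { return !!(pmd & PMD_DIRTY); }
static int        pmd_young(fake_pmd_t pmd)   { return !!(pmd & PMD_YOUNG); }
static fake_pmd_t pmd_mkclean(fake_pmd_t pmd) { return pmd & ~PMD_DIRTY; }
static fake_pmd_t pmd_mkold(fake_pmd_t pmd)   { return pmd & ~PMD_YOUNG; }

/* Mirrors the tail of madvise_free_huge_pmd(): only touch the entry if it
 * is young or dirty, then age and clean it (the kernel also clears and
 * re-installs the PMD and flushes the TLB entry).
 */
static void madvise_free_pmd(fake_pmd_t *pmdp)
{
    fake_pmd_t pmd = *pmdp;

    if (pmd_young(pmd) || pmd_dirty(pmd)) {
        pmd = pmd_mkold(pmd);
        pmd = pmd_mkclean(pmd);
        *pmdp = pmd;
    }
}

int main(void)
{
    fake_pmd_t pmd = PMD_DIRTY | PMD_YOUNG | (0xabcUL << 12);

    madvise_free_pmd(&pmd);
    printf("young=%d dirty=%d pfn bits=%#lx\n",
           pmd_young(pmd), pmd_dirty(pmd), pmd >> 12);
    return 0;
}
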
@@ -1508,7 +1649,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1508 put_huge_zero_page(); 1649 put_huge_zero_page();
1509 } else { 1650 } else {
1510 struct page *page = pmd_page(orig_pmd); 1651 struct page *page = pmd_page(orig_pmd);
1511 page_remove_rmap(page); 1652 page_remove_rmap(page, true);
1512 VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); 1653 VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
1513 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); 1654 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1514 VM_BUG_ON_PAGE(!PageHead(page), page); 1655 VM_BUG_ON_PAGE(!PageHead(page), page);
@@ -1520,13 +1661,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1520 return 1; 1661 return 1;
1521} 1662}
1522 1663
1523int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, 1664bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
1524 unsigned long old_addr, 1665 unsigned long old_addr,
1525 unsigned long new_addr, unsigned long old_end, 1666 unsigned long new_addr, unsigned long old_end,
1526 pmd_t *old_pmd, pmd_t *new_pmd) 1667 pmd_t *old_pmd, pmd_t *new_pmd)
1527{ 1668{
1528 spinlock_t *old_ptl, *new_ptl; 1669 spinlock_t *old_ptl, *new_ptl;
1529 int ret = 0;
1530 pmd_t pmd; 1670 pmd_t pmd;
1531 1671
1532 struct mm_struct *mm = vma->vm_mm; 1672 struct mm_struct *mm = vma->vm_mm;
@@ -1535,7 +1675,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
1535 (new_addr & ~HPAGE_PMD_MASK) || 1675 (new_addr & ~HPAGE_PMD_MASK) ||
1536 old_end - old_addr < HPAGE_PMD_SIZE || 1676 old_end - old_addr < HPAGE_PMD_SIZE ||
1537 (new_vma->vm_flags & VM_NOHUGEPAGE)) 1677 (new_vma->vm_flags & VM_NOHUGEPAGE))
1538 goto out; 1678 return false;
1539 1679
1540 /* 1680 /*
1541 * The destination pmd shouldn't be established, free_pgtables() 1681 * The destination pmd shouldn't be established, free_pgtables()
@@ -1543,15 +1683,14 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
1543 */ 1683 */
1544 if (WARN_ON(!pmd_none(*new_pmd))) { 1684 if (WARN_ON(!pmd_none(*new_pmd))) {
1545 VM_BUG_ON(pmd_trans_huge(*new_pmd)); 1685 VM_BUG_ON(pmd_trans_huge(*new_pmd));
1546 goto out; 1686 return false;
1547 } 1687 }
1548 1688
1549 /* 1689 /*
1550 * We don't have to worry about the ordering of src and dst 1690 * We don't have to worry about the ordering of src and dst
1551 * ptlocks because exclusive mmap_sem prevents deadlock. 1691 * ptlocks because exclusive mmap_sem prevents deadlock.
1552 */ 1692 */
1553 ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl); 1693 if (__pmd_trans_huge_lock(old_pmd, vma, &old_ptl)) {
1554 if (ret == 1) {
1555 new_ptl = pmd_lockptr(mm, new_pmd); 1694 new_ptl = pmd_lockptr(mm, new_pmd);
1556 if (new_ptl != old_ptl) 1695 if (new_ptl != old_ptl)
1557 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); 1696 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -1567,9 +1706,9 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
1567 if (new_ptl != old_ptl) 1706 if (new_ptl != old_ptl)
1568 spin_unlock(new_ptl); 1707 spin_unlock(new_ptl);
1569 spin_unlock(old_ptl); 1708 spin_unlock(old_ptl);
1709 return true;
1570 } 1710 }
1571out: 1711 return false;
1572 return ret;
1573} 1712}
1574 1713
1575/* 1714/*
@@ -1585,7 +1724,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1585 spinlock_t *ptl; 1724 spinlock_t *ptl;
1586 int ret = 0; 1725 int ret = 0;
1587 1726
1588 if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 1727 if (__pmd_trans_huge_lock(pmd, vma, &ptl)) {
1589 pmd_t entry; 1728 pmd_t entry;
1590 bool preserve_write = prot_numa && pmd_write(*pmd); 1729 bool preserve_write = prot_numa && pmd_write(*pmd);
1591 ret = 1; 1730 ret = 1;
@@ -1616,405 +1755,19 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1616} 1755}
1617 1756
1618/* 1757/*
1619 * Returns 1 if a given pmd maps a stable (not under splitting) thp. 1758 * Returns true if a given pmd maps a thp, false otherwise.
1620 * Returns -1 if it maps a thp under splitting. Returns 0 otherwise.
1621 * 1759 *
1622 * Note that if it returns 1, this routine returns without unlocking page 1760 * Note that if it returns true, this routine returns without unlocking page
1623 * table locks. So callers must unlock them. 1761 * table lock. So callers must unlock it.
1624 */ 1762 */
1625int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, 1763bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
1626 spinlock_t **ptl) 1764 spinlock_t **ptl)
1627{ 1765{
1628 *ptl = pmd_lock(vma->vm_mm, pmd); 1766 *ptl = pmd_lock(vma->vm_mm, pmd);
1629 if (likely(pmd_trans_huge(*pmd))) { 1767 if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
1630 if (unlikely(pmd_trans_splitting(*pmd))) { 1768 return true;
1631 spin_unlock(*ptl);
1632 wait_split_huge_page(vma->anon_vma, pmd);
1633 return -1;
1634 } else {
1635 /* Thp mapped by 'pmd' is stable, so we can
1636 * handle it as it is. */
1637 return 1;
1638 }
1639 }
1640 spin_unlock(*ptl);
1641 return 0;
1642}
1643
1644/*
1645 * This function returns whether a given @page is mapped onto the @address
1646 * in the virtual space of @mm.
1647 *
1648 * When it's true, this function returns *pmd with holding the page table lock
1649 * and passing it back to the caller via @ptl.
1650 * If it's false, returns NULL without holding the page table lock.
1651 */
1652pmd_t *page_check_address_pmd(struct page *page,
1653 struct mm_struct *mm,
1654 unsigned long address,
1655 enum page_check_address_pmd_flag flag,
1656 spinlock_t **ptl)
1657{
1658 pgd_t *pgd;
1659 pud_t *pud;
1660 pmd_t *pmd;
1661
1662 if (address & ~HPAGE_PMD_MASK)
1663 return NULL;
1664
1665 pgd = pgd_offset(mm, address);
1666 if (!pgd_present(*pgd))
1667 return NULL;
1668 pud = pud_offset(pgd, address);
1669 if (!pud_present(*pud))
1670 return NULL;
1671 pmd = pmd_offset(pud, address);
1672
1673 *ptl = pmd_lock(mm, pmd);
1674 if (!pmd_present(*pmd))
1675 goto unlock;
1676 if (pmd_page(*pmd) != page)
1677 goto unlock;
1678 /*
1679 * split_vma() may create temporary aliased mappings. There is
1680 * no risk as long as all huge pmd are found and have their
1681 * splitting bit set before __split_huge_page_refcount
1682 * runs. Finding the same huge pmd more than once during the
1683 * same rmap walk is not a problem.
1684 */
1685 if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
1686 pmd_trans_splitting(*pmd))
1687 goto unlock;
1688 if (pmd_trans_huge(*pmd)) {
1689 VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
1690 !pmd_trans_splitting(*pmd));
1691 return pmd;
1692 }
1693unlock:
1694 spin_unlock(*ptl); 1769 spin_unlock(*ptl);
1695 return NULL; 1770 return false;
1696}
1697
1698static int __split_huge_page_splitting(struct page *page,
1699 struct vm_area_struct *vma,
1700 unsigned long address)
1701{
1702 struct mm_struct *mm = vma->vm_mm;
1703 spinlock_t *ptl;
1704 pmd_t *pmd;
1705 int ret = 0;
1706 /* For mmu_notifiers */
1707 const unsigned long mmun_start = address;
1708 const unsigned long mmun_end = address + HPAGE_PMD_SIZE;
1709
1710 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1711 pmd = page_check_address_pmd(page, mm, address,
1712 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl);
1713 if (pmd) {
1714 /*
1715 * We can't temporarily set the pmd to null in order
1716 * to split it, the pmd must remain marked huge at all
1717 * times or the VM won't take the pmd_trans_huge paths
1718 * and it won't wait on the anon_vma->root->rwsem to
1719 * serialize against split_huge_page*.
1720 */
1721 pmdp_splitting_flush(vma, address, pmd);
1722
1723 ret = 1;
1724 spin_unlock(ptl);
1725 }
1726 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1727
1728 return ret;
1729}
1730
1731static void __split_huge_page_refcount(struct page *page,
1732 struct list_head *list)
1733{
1734 int i;
1735 struct zone *zone = page_zone(page);
1736 struct lruvec *lruvec;
1737 int tail_count = 0;
1738
1739 /* prevent PageLRU to go away from under us, and freeze lru stats */
1740 spin_lock_irq(&zone->lru_lock);
1741 lruvec = mem_cgroup_page_lruvec(page, zone);
1742
1743 compound_lock(page);
1744 /* complete memcg works before add pages to LRU */
1745 mem_cgroup_split_huge_fixup(page);
1746
1747 for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
1748 struct page *page_tail = page + i;
1749
1750 /* tail_page->_mapcount cannot change */
1751 BUG_ON(page_mapcount(page_tail) < 0);
1752 tail_count += page_mapcount(page_tail);
1753 /* check for overflow */
1754 BUG_ON(tail_count < 0);
1755 BUG_ON(atomic_read(&page_tail->_count) != 0);
1756 /*
1757 * tail_page->_count is zero and not changing from
1758 * under us. But get_page_unless_zero() may be running
1759 * from under us on the tail_page. If we used
1760 * atomic_set() below instead of atomic_add(), we
1761 * would then run atomic_set() concurrently with
1762 * get_page_unless_zero(), and atomic_set() is
1763 * implemented in C not using locked ops. spin_unlock
1764 * on x86 sometime uses locked ops because of PPro
1765 * errata 66, 92, so unless somebody can guarantee
1766 * atomic_set() here would be safe on all archs (and
1767 * not only on x86), it's safer to use atomic_add().
1768 */
1769 atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
1770 &page_tail->_count);
1771
1772 /* after clearing PageTail the gup refcount can be released */
1773 smp_mb__after_atomic();
1774
1775 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1776 page_tail->flags |= (page->flags &
1777 ((1L << PG_referenced) |
1778 (1L << PG_swapbacked) |
1779 (1L << PG_mlocked) |
1780 (1L << PG_uptodate) |
1781 (1L << PG_active) |
1782 (1L << PG_unevictable)));
1783 page_tail->flags |= (1L << PG_dirty);
1784
1785 clear_compound_head(page_tail);
1786
1787 if (page_is_young(page))
1788 set_page_young(page_tail);
1789 if (page_is_idle(page))
1790 set_page_idle(page_tail);
1791
1792 /*
1793 * __split_huge_page_splitting() already set the
1794 * splitting bit in all pmd that could map this
1795 * hugepage, that will ensure no CPU can alter the
1796 * mapcount on the head page. The mapcount is only
1797 * accounted in the head page and it has to be
1798 * transferred to all tail pages in the below code. So
1799 * for this code to be safe, the mapcount can't change during
1800 * the split. But that doesn't mean userland can't
1801 * keep changing and reading the page contents while
1802 * we transfer the mapcount, so the pmd splitting
1803 * status is achieved setting a reserved bit in the
1804 * pmd, not by clearing the present bit.
1805 */
1806 page_tail->_mapcount = page->_mapcount;
1807
1808 BUG_ON(page_tail->mapping);
1809 page_tail->mapping = page->mapping;
1810
1811 page_tail->index = page->index + i;
1812 page_cpupid_xchg_last(page_tail, page_cpupid_last(page));
1813
1814 BUG_ON(!PageAnon(page_tail));
1815 BUG_ON(!PageUptodate(page_tail));
1816 BUG_ON(!PageDirty(page_tail));
1817 BUG_ON(!PageSwapBacked(page_tail));
1818
1819 lru_add_page_tail(page, page_tail, lruvec, list);
1820 }
1821 atomic_sub(tail_count, &page->_count);
1822 BUG_ON(atomic_read(&page->_count) <= 0);
1823
1824 __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
1825
1826 ClearPageCompound(page);
1827 compound_unlock(page);
1828 spin_unlock_irq(&zone->lru_lock);
1829
1830 for (i = 1; i < HPAGE_PMD_NR; i++) {
1831 struct page *page_tail = page + i;
1832 BUG_ON(page_count(page_tail) <= 0);
1833 /*
1834 * Tail pages may be freed if there wasn't any mapping
1835 * like if add_to_swap() is running on a lru page that
1836 * had its mapping zapped. And freeing these pages
1837 * requires taking the lru_lock so we do the put_page
1838 * of the tail pages after the split is complete.
1839 */
1840 put_page(page_tail);
1841 }
1842
1843 /*
1844 * Only the head page (now become a regular page) is required
1845 * to be pinned by the caller.
1846 */
1847 BUG_ON(page_count(page) <= 0);
1848}
1849
1850static int __split_huge_page_map(struct page *page,
1851 struct vm_area_struct *vma,
1852 unsigned long address)
1853{
1854 struct mm_struct *mm = vma->vm_mm;
1855 spinlock_t *ptl;
1856 pmd_t *pmd, _pmd;
1857 int ret = 0, i;
1858 pgtable_t pgtable;
1859 unsigned long haddr;
1860
1861 pmd = page_check_address_pmd(page, mm, address,
1862 PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl);
1863 if (pmd) {
1864 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
1865 pmd_populate(mm, &_pmd, pgtable);
1866 if (pmd_write(*pmd))
1867 BUG_ON(page_mapcount(page) != 1);
1868
1869 haddr = address;
1870 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
1871 pte_t *pte, entry;
1872 BUG_ON(PageCompound(page+i));
1873 /*
1874 * Note that NUMA hinting access restrictions are not
1875 * transferred to avoid any possibility of altering
1876 * permissions across VMAs.
1877 */
1878 entry = mk_pte(page + i, vma->vm_page_prot);
1879 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1880 if (!pmd_write(*pmd))
1881 entry = pte_wrprotect(entry);
1882 if (!pmd_young(*pmd))
1883 entry = pte_mkold(entry);
1884 pte = pte_offset_map(&_pmd, haddr);
1885 BUG_ON(!pte_none(*pte));
1886 set_pte_at(mm, haddr, pte, entry);
1887 pte_unmap(pte);
1888 }
1889
1890 smp_wmb(); /* make pte visible before pmd */
1891 /*
1892 * Up to this point the pmd is present and huge and
1893 * userland has the whole access to the hugepage
1894 * during the split (which happens in place). If we
1895 * overwrite the pmd with the not-huge version
1896 * pointing to the pte here (which of course we could
1897 * if all CPUs were bug free), userland could trigger
1898 * a small page size TLB miss on the small sized TLB
1899 * while the hugepage TLB entry is still established
1900 * in the huge TLB. Some CPU doesn't like that. See
1901 * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
1902 * Erratum 383 on page 93. Intel should be safe but is
1903 * also warns that it's only safe if the permission
1904 * and cache attributes of the two entries loaded in
1905 * the two TLB is identical (which should be the case
1906 * here). But it is generally safer to never allow
1907 * small and huge TLB entries for the same virtual
1908 * address to be loaded simultaneously. So instead of
1909 * doing "pmd_populate(); flush_pmd_tlb_range();" we first
1910 * mark the current pmd notpresent (atomically because
1911 * here the pmd_trans_huge and pmd_trans_splitting
1912 * must remain set at all times on the pmd until the
1913 * split is complete for this pmd), then we flush the
1914 * SMP TLB and finally we write the non-huge version
1915 * of the pmd entry with pmd_populate.
1916 */
1917 pmdp_invalidate(vma, address, pmd);
1918 pmd_populate(mm, pmd, pgtable);
1919 ret = 1;
1920 spin_unlock(ptl);
1921 }
1922
1923 return ret;
1924}
1925
1926/* must be called with anon_vma->root->rwsem held */
1927static void __split_huge_page(struct page *page,
1928 struct anon_vma *anon_vma,
1929 struct list_head *list)
1930{
1931 int mapcount, mapcount2;
1932 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1933 struct anon_vma_chain *avc;
1934
1935 BUG_ON(!PageHead(page));
1936 BUG_ON(PageTail(page));
1937
1938 mapcount = 0;
1939 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1940 struct vm_area_struct *vma = avc->vma;
1941 unsigned long addr = vma_address(page, vma);
1942 BUG_ON(is_vma_temporary_stack(vma));
1943 mapcount += __split_huge_page_splitting(page, vma, addr);
1944 }
1945 /*
1946 * It is critical that new vmas are added to the tail of the
1947 * anon_vma list. This guarantees that if copy_huge_pmd() runs
1948 * and establishes a child pmd before
1949 * __split_huge_page_splitting() freezes the parent pmd (so if
1950 * we fail to prevent copy_huge_pmd() from running until the
1951 * whole __split_huge_page() is complete), we will still see
1952 * the newly established pmd of the child later during the
1953 * walk, to be able to set it as pmd_trans_splitting too.
1954 */
1955 if (mapcount != page_mapcount(page)) {
1956 pr_err("mapcount %d page_mapcount %d\n",
1957 mapcount, page_mapcount(page));
1958 BUG();
1959 }
1960
1961 __split_huge_page_refcount(page, list);
1962
1963 mapcount2 = 0;
1964 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1965 struct vm_area_struct *vma = avc->vma;
1966 unsigned long addr = vma_address(page, vma);
1967 BUG_ON(is_vma_temporary_stack(vma));
1968 mapcount2 += __split_huge_page_map(page, vma, addr);
1969 }
1970 if (mapcount != mapcount2) {
1971 pr_err("mapcount %d mapcount2 %d page_mapcount %d\n",
1972 mapcount, mapcount2, page_mapcount(page));
1973 BUG();
1974 }
1975}
1976
1977/*
1978 * Split a hugepage into normal pages. This doesn't change the position of head
1979 * page. If @list is null, tail pages will be added to LRU list, otherwise, to
1980 * @list. Both head page and tail pages will inherit mapping, flags, and so on
1981 * from the hugepage.
1982 * Return 0 if the hugepage is split successfully otherwise return 1.
1983 */
1984int split_huge_page_to_list(struct page *page, struct list_head *list)
1985{
1986 struct anon_vma *anon_vma;
1987 int ret = 1;
1988
1989 BUG_ON(is_huge_zero_page(page));
1990 BUG_ON(!PageAnon(page));
1991
1992 /*
1993 * The caller does not necessarily hold an mmap_sem that would prevent
1994 * the anon_vma disappearing so we first we take a reference to it
1995 * and then lock the anon_vma for write. This is similar to
1996 * page_lock_anon_vma_read except the write lock is taken to serialise
1997 * against parallel split or collapse operations.
1998 */
1999 anon_vma = page_get_anon_vma(page);
2000 if (!anon_vma)
2001 goto out;
2002 anon_vma_lock_write(anon_vma);
2003
2004 ret = 0;
2005 if (!PageCompound(page))
2006 goto out_unlock;
2007
2008 BUG_ON(!PageSwapBacked(page));
2009 __split_huge_page(page, anon_vma, list);
2010 count_vm_event(THP_SPLIT);
2011
2012 BUG_ON(PageCompound(page));
2013out_unlock:
2014 anon_vma_unlock_write(anon_vma);
2015 put_anon_vma(anon_vma);
2016out:
2017 return ret;
2018} 1771}
2019 1772
2020#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE) 1773#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
@@ -2371,7 +2124,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
2371 * superfluous. 2124 * superfluous.
2372 */ 2125 */
2373 pte_clear(vma->vm_mm, address, _pte); 2126 pte_clear(vma->vm_mm, address, _pte);
2374 page_remove_rmap(src_page); 2127 page_remove_rmap(src_page, false);
2375 spin_unlock(ptl); 2128 spin_unlock(ptl);
2376 free_page_and_swap_cache(src_page); 2129 free_page_and_swap_cache(src_page);
2377 } 2130 }
@@ -2481,6 +2234,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
2481 return NULL; 2234 return NULL;
2482 } 2235 }
2483 2236
2237 prep_transhuge_page(*hpage);
2484 count_vm_event(THP_COLLAPSE_ALLOC); 2238 count_vm_event(THP_COLLAPSE_ALLOC);
2485 return *hpage; 2239 return *hpage;
2486} 2240}
@@ -2492,8 +2246,12 @@ static int khugepaged_find_target_node(void)
2492 2246
2493static inline struct page *alloc_hugepage(int defrag) 2247static inline struct page *alloc_hugepage(int defrag)
2494{ 2248{
2495 return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), 2249 struct page *page;
2496 HPAGE_PMD_ORDER); 2250
2251 page = alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER);
2252 if (page)
2253 prep_transhuge_page(page);
2254 return page;
2497} 2255}
2498 2256
2499static struct page *khugepaged_alloc_hugepage(bool *wait) 2257static struct page *khugepaged_alloc_hugepage(bool *wait)
@@ -2543,7 +2301,6 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
2543 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || 2301 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
2544 (vma->vm_flags & VM_NOHUGEPAGE)) 2302 (vma->vm_flags & VM_NOHUGEPAGE))
2545 return false; 2303 return false;
2546
2547 if (!vma->anon_vma || vma->vm_ops) 2304 if (!vma->anon_vma || vma->vm_ops)
2548 return false; 2305 return false;
2549 if (is_vma_temporary_stack(vma)) 2306 if (is_vma_temporary_stack(vma))
@@ -2583,7 +2340,7 @@ static void collapse_huge_page(struct mm_struct *mm,
2583 goto out_nolock; 2340 goto out_nolock;
2584 } 2341 }
2585 2342
2586 if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg))) { 2343 if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
2587 result = SCAN_CGROUP_CHARGE_FAIL; 2344 result = SCAN_CGROUP_CHARGE_FAIL;
2588 goto out_nolock; 2345 goto out_nolock;
2589 } 2346 }
@@ -2682,8 +2439,8 @@ static void collapse_huge_page(struct mm_struct *mm,
2682 2439
2683 spin_lock(pmd_ptl); 2440 spin_lock(pmd_ptl);
2684 BUG_ON(!pmd_none(*pmd)); 2441 BUG_ON(!pmd_none(*pmd));
2685 page_add_new_anon_rmap(new_page, vma, address); 2442 page_add_new_anon_rmap(new_page, vma, address, true);
2686 mem_cgroup_commit_charge(new_page, memcg, false); 2443 mem_cgroup_commit_charge(new_page, memcg, false, true);
2687 lru_cache_add_active_or_unevictable(new_page, vma); 2444 lru_cache_add_active_or_unevictable(new_page, vma);
2688 pgtable_trans_huge_deposit(mm, pmd, pgtable); 2445 pgtable_trans_huge_deposit(mm, pmd, pgtable);
2689 set_pmd_at(mm, address, pmd, _pmd); 2446 set_pmd_at(mm, address, pmd, _pmd);
@@ -2703,7 +2460,7 @@ out_nolock:
2703 trace_mm_collapse_huge_page(mm, isolated, result); 2460 trace_mm_collapse_huge_page(mm, isolated, result);
2704 return; 2461 return;
2705out: 2462out:
2706 mem_cgroup_cancel_charge(new_page, memcg); 2463 mem_cgroup_cancel_charge(new_page, memcg, true);
2707 goto out_up_write; 2464 goto out_up_write;
2708} 2465}
2709 2466
@@ -2755,6 +2512,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2755 result = SCAN_PAGE_NULL; 2512 result = SCAN_PAGE_NULL;
2756 goto out_unmap; 2513 goto out_unmap;
2757 } 2514 }
2515
2516 /* TODO: teach khugepaged to collapse THP mapped with pte */
2517 if (PageCompound(page)) {
2518 result = SCAN_PAGE_COMPOUND;
2519 goto out_unmap;
2520 }
2521
2758 /* 2522 /*
2759 * Record which node the original page is from and save this 2523 * Record which node the original page is from and save this
2760 * information to khugepaged_node_load[]. 2524 * information to khugepaged_node_load[].
@@ -2767,7 +2531,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2767 goto out_unmap; 2531 goto out_unmap;
2768 } 2532 }
2769 khugepaged_node_load[node]++; 2533 khugepaged_node_load[node]++;
2770 VM_BUG_ON_PAGE(PageCompound(page), page);
2771 if (!PageLRU(page)) { 2534 if (!PageLRU(page)) {
2772 result = SCAN_SCAN_ABORT; 2535 result = SCAN_SCAN_ABORT;
2773 goto out_unmap; 2536 goto out_unmap;
@@ -3040,8 +2803,8 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
3040 pmd_t _pmd; 2803 pmd_t _pmd;
3041 int i; 2804 int i;
3042 2805
3043 pmdp_huge_clear_flush_notify(vma, haddr, pmd);
3044 /* leave pmd empty until pte is filled */ 2806 /* leave pmd empty until pte is filled */
2807 pmdp_huge_clear_flush_notify(vma, haddr, pmd);
3045 2808
3046 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 2809 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
3047 pmd_populate(mm, &_pmd, pgtable); 2810 pmd_populate(mm, &_pmd, pgtable);
@@ -3060,66 +2823,153 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
3060 put_huge_zero_page(); 2823 put_huge_zero_page();
3061} 2824}
3062 2825
3063void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, 2826static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
3064 pmd_t *pmd) 2827 unsigned long haddr, bool freeze)
3065{ 2828{
3066 spinlock_t *ptl;
3067 struct page *page = NULL;
3068 struct mm_struct *mm = vma->vm_mm; 2829 struct mm_struct *mm = vma->vm_mm;
3069 unsigned long haddr = address & HPAGE_PMD_MASK; 2830 struct page *page;
3070 unsigned long mmun_start; /* For mmu_notifiers */ 2831 pgtable_t pgtable;
3071 unsigned long mmun_end; /* For mmu_notifiers */ 2832 pmd_t _pmd;
2833 bool young, write, dirty;
2834 int i;
3072 2835
3073 BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE); 2836 VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
2837 VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2838 VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
2839 VM_BUG_ON(!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd));
2840
2841 count_vm_event(THP_SPLIT_PMD);
3074 2842
3075 mmun_start = haddr;
3076 mmun_end = haddr + HPAGE_PMD_SIZE;
3077again:
3078 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
3079 ptl = pmd_lock(mm, pmd);
3080 if (unlikely(!pmd_trans_huge(*pmd)))
3081 goto unlock;
3082 if (vma_is_dax(vma)) { 2843 if (vma_is_dax(vma)) {
3083 pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); 2844 pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
3084 if (is_huge_zero_pmd(_pmd)) 2845 if (is_huge_zero_pmd(_pmd))
3085 put_huge_zero_page(); 2846 put_huge_zero_page();
2847 return;
3086 } else if (is_huge_zero_pmd(*pmd)) { 2848 } else if (is_huge_zero_pmd(*pmd)) {
3087 __split_huge_zero_page_pmd(vma, haddr, pmd); 2849 return __split_huge_zero_page_pmd(vma, haddr, pmd);
3088 } else {
3089 page = pmd_page(*pmd);
3090 VM_BUG_ON_PAGE(!page_count(page), page);
3091 get_page(page);
3092 } 2850 }
3093 unlock:
3094 spin_unlock(ptl);
3095 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
3096 2851
3097 if (!page) 2852 page = pmd_page(*pmd);
3098 return; 2853 VM_BUG_ON_PAGE(!page_count(page), page);
2854 atomic_add(HPAGE_PMD_NR - 1, &page->_count);
2855 write = pmd_write(*pmd);
2856 young = pmd_young(*pmd);
2857 dirty = pmd_dirty(*pmd);
3099 2858
3100 split_huge_page(page); 2859 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
3101 put_page(page); 2860 pmd_populate(mm, &_pmd, pgtable);
3102 2861
2862 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
2863 pte_t entry, *pte;
2864 /*
2865 * Note that NUMA hinting access restrictions are not
2866 * transferred to avoid any possibility of altering
2867 * permissions across VMAs.
2868 */
2869 if (freeze) {
2870 swp_entry_t swp_entry;
2871 swp_entry = make_migration_entry(page + i, write);
2872 entry = swp_entry_to_pte(swp_entry);
2873 } else {
2874 entry = mk_pte(page + i, vma->vm_page_prot);
2875 entry = maybe_mkwrite(entry, vma);
2876 if (!write)
2877 entry = pte_wrprotect(entry);
2878 if (!young)
2879 entry = pte_mkold(entry);
2880 }
2881 if (dirty)
2882 SetPageDirty(page + i);
2883 pte = pte_offset_map(&_pmd, haddr);
2884 BUG_ON(!pte_none(*pte));
2885 set_pte_at(mm, haddr, pte, entry);
2886 atomic_inc(&page[i]._mapcount);
2887 pte_unmap(pte);
2888 }
2889
2890 /*
2891 * Set PG_double_map before dropping compound_mapcount to avoid
2892 * false-negative page_mapped().
2893 */
2894 if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) {
2895 for (i = 0; i < HPAGE_PMD_NR; i++)
2896 atomic_inc(&page[i]._mapcount);
2897 }
2898
2899 if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
2900 /* Last compound_mapcount is gone. */
2901 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
2902 if (TestClearPageDoubleMap(page)) {
2903 /* No need in mapcount reference anymore */
2904 for (i = 0; i < HPAGE_PMD_NR; i++)
2905 atomic_dec(&page[i]._mapcount);
2906 }
2907 }
2908
2909 smp_wmb(); /* make pte visible before pmd */
3103 /* 2910 /*
3104 * We don't always have down_write of mmap_sem here: a racing 2911 * Up to this point the pmd is present and huge and userland has the
3105 * do_huge_pmd_wp_page() might have copied-on-write to another 2912 * whole access to the hugepage during the split (which happens in
3106 * huge page before our split_huge_page() got the anon_vma lock. 2913 * place). If we overwrite the pmd with the not-huge version pointing
2914 * to the pte here (which of course we could if all CPUs were bug
2915 * free), userland could trigger a small page size TLB miss on the
2916 * small sized TLB while the hugepage TLB entry is still established in
2917 * the huge TLB. Some CPU doesn't like that.
2918 * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
2919 * 383 on page 93. Intel should be safe but it also warns that it's
2920 * only safe if the permission and cache attributes of the two entries
2921 * loaded in the TLB are identical (which should be the case here).
2922 * But it is generally safer to never allow small and huge TLB entries
2923 * for the same virtual address to be loaded simultaneously. So instead
2924 * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
2925 * current pmd notpresent (atomically because here the pmd_trans_huge
2926 * and pmd_trans_splitting must remain set at all times on the pmd
2927 * until the split is complete for this pmd), then we flush the SMP TLB
2928 * and finally we write the non-huge version of the pmd entry with
2929 * pmd_populate.
3107 */ 2930 */
3108 if (unlikely(pmd_trans_huge(*pmd))) 2931 pmdp_invalidate(vma, haddr, pmd);
3109 goto again; 2932 pmd_populate(mm, pmd, pgtable);
2933
2934 if (freeze) {
2935 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
2936 page_remove_rmap(page + i, false);
2937 put_page(page + i);
2938 }
2939 }
3110} 2940}
3111 2941
3112void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, 2942void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
3113 pmd_t *pmd) 2943 unsigned long address)
3114{ 2944{
3115 struct vm_area_struct *vma; 2945 spinlock_t *ptl;
2946 struct mm_struct *mm = vma->vm_mm;
2947 struct page *page = NULL;
2948 unsigned long haddr = address & HPAGE_PMD_MASK;
3116 2949
3117 vma = find_vma(mm, address); 2950 mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
3118 BUG_ON(vma == NULL); 2951 ptl = pmd_lock(mm, pmd);
3119 split_huge_page_pmd(vma, address, pmd); 2952 if (pmd_trans_huge(*pmd)) {
2953 page = pmd_page(*pmd);
2954 if (PageMlocked(page))
2955 get_page(page);
2956 else
2957 page = NULL;
2958 } else if (!pmd_devmap(*pmd))
2959 goto out;
2960 __split_huge_pmd_locked(vma, pmd, haddr, false);
2961out:
2962 spin_unlock(ptl);
2963 mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE);
2964 if (page) {
2965 lock_page(page);
2966 munlock_vma_page(page);
2967 unlock_page(page);
2968 put_page(page);
2969 }
3120} 2970}
3121 2971
3122static void split_huge_page_address(struct mm_struct *mm, 2972static void split_huge_pmd_address(struct vm_area_struct *vma,
3123 unsigned long address) 2973 unsigned long address)
3124{ 2974{
3125 pgd_t *pgd; 2975 pgd_t *pgd;
@@ -3128,7 +2978,7 @@ static void split_huge_page_address(struct mm_struct *mm,
3128 2978
3129 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); 2979 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
3130 2980
3131 pgd = pgd_offset(mm, address); 2981 pgd = pgd_offset(vma->vm_mm, address);
3132 if (!pgd_present(*pgd)) 2982 if (!pgd_present(*pgd))
3133 return; 2983 return;
3134 2984
@@ -3137,13 +2987,13 @@ static void split_huge_page_address(struct mm_struct *mm,
3137 return; 2987 return;
3138 2988
3139 pmd = pmd_offset(pud, address); 2989 pmd = pmd_offset(pud, address);
3140 if (!pmd_present(*pmd)) 2990 if (!pmd_present(*pmd) || (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)))
3141 return; 2991 return;
3142 /* 2992 /*
3143 * Caller holds the mmap_sem write mode, so a huge pmd cannot 2993 * Caller holds the mmap_sem write mode, so a huge pmd cannot
3144 * materialize from under us. 2994 * materialize from under us.
3145 */ 2995 */
3146 split_huge_page_pmd_mm(mm, address, pmd); 2996 split_huge_pmd(vma, pmd, address);
3147} 2997}
3148 2998
3149void vma_adjust_trans_huge(struct vm_area_struct *vma, 2999void vma_adjust_trans_huge(struct vm_area_struct *vma,
@@ -3159,7 +3009,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
3159 if (start & ~HPAGE_PMD_MASK && 3009 if (start & ~HPAGE_PMD_MASK &&
3160 (start & HPAGE_PMD_MASK) >= vma->vm_start && 3010 (start & HPAGE_PMD_MASK) >= vma->vm_start &&
3161 (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) 3011 (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
3162 split_huge_page_address(vma->vm_mm, start); 3012 split_huge_pmd_address(vma, start);
3163 3013
3164 /* 3014 /*
3165 * If the new end address isn't hpage aligned and it could 3015 * If the new end address isn't hpage aligned and it could
@@ -3169,7 +3019,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
3169 if (end & ~HPAGE_PMD_MASK && 3019 if (end & ~HPAGE_PMD_MASK &&
3170 (end & HPAGE_PMD_MASK) >= vma->vm_start && 3020 (end & HPAGE_PMD_MASK) >= vma->vm_start &&
3171 (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) 3021 (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
3172 split_huge_page_address(vma->vm_mm, end); 3022 split_huge_pmd_address(vma, end);
3173 3023
3174 /* 3024 /*
3175 * If we're also updating the vma->vm_next->vm_start, if the new 3025 * If we're also updating the vma->vm_next->vm_start, if the new
@@ -3183,6 +3033,540 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
3183 if (nstart & ~HPAGE_PMD_MASK && 3033 if (nstart & ~HPAGE_PMD_MASK &&
3184 (nstart & HPAGE_PMD_MASK) >= next->vm_start && 3034 (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
3185 (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) 3035 (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
3186 split_huge_page_address(next->vm_mm, nstart); 3036 split_huge_pmd_address(next, nstart);
3037 }
3038}
3039
3040static void freeze_page_vma(struct vm_area_struct *vma, struct page *page,
3041 unsigned long address)
3042{
3043 unsigned long haddr = address & HPAGE_PMD_MASK;
3044 spinlock_t *ptl;
3045 pgd_t *pgd;
3046 pud_t *pud;
3047 pmd_t *pmd;
3048 pte_t *pte;
3049 int i, nr = HPAGE_PMD_NR;
3050
3051	 /* Skip pages which don't belong to the VMA */
3052 if (address < vma->vm_start) {
3053 int off = (vma->vm_start - address) >> PAGE_SHIFT;
3054 page += off;
3055 nr -= off;
3056 address = vma->vm_start;
3057 }
3058
3059 pgd = pgd_offset(vma->vm_mm, address);
3060 if (!pgd_present(*pgd))
3061 return;
3062 pud = pud_offset(pgd, address);
3063 if (!pud_present(*pud))
3064 return;
3065 pmd = pmd_offset(pud, address);
3066 ptl = pmd_lock(vma->vm_mm, pmd);
3067 if (!pmd_present(*pmd)) {
3068 spin_unlock(ptl);
3069 return;
3070 }
3071 if (pmd_trans_huge(*pmd)) {
3072 if (page == pmd_page(*pmd))
3073 __split_huge_pmd_locked(vma, pmd, haddr, true);
3074 spin_unlock(ptl);
3075 return;
3076 }
3077 spin_unlock(ptl);
3078
3079 pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
3080 for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
3081 pte_t entry, swp_pte;
3082 swp_entry_t swp_entry;
3083
3084 /*
3085		 * We've just crossed a page table boundary: need to map the next one.
3086		 * It can happen if the THP was mremapped to a non-PMD-aligned address.
3087 */
3088 if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
3089 pte_unmap_unlock(pte - 1, ptl);
3090 pmd = mm_find_pmd(vma->vm_mm, address);
3091 if (!pmd)
3092 return;
3093 pte = pte_offset_map_lock(vma->vm_mm, pmd,
3094 address, &ptl);
3095 }
3096
3097 if (!pte_present(*pte))
3098 continue;
3099 if (page_to_pfn(page) != pte_pfn(*pte))
3100 continue;
3101 flush_cache_page(vma, address, page_to_pfn(page));
3102 entry = ptep_clear_flush(vma, address, pte);
3103 if (pte_dirty(entry))
3104 SetPageDirty(page);
3105 swp_entry = make_migration_entry(page, pte_write(entry));
3106 swp_pte = swp_entry_to_pte(swp_entry);
3107 if (pte_soft_dirty(entry))
3108 swp_pte = pte_swp_mksoft_dirty(swp_pte);
3109 set_pte_at(vma->vm_mm, address, pte, swp_pte);
3110 page_remove_rmap(page, false);
3111 put_page(page);
3112 }
3113 pte_unmap_unlock(pte - 1, ptl);
3114}
3115
3116static void freeze_page(struct anon_vma *anon_vma, struct page *page)
3117{
3118 struct anon_vma_chain *avc;
3119 pgoff_t pgoff = page_to_pgoff(page);
3120
3121 VM_BUG_ON_PAGE(!PageHead(page), page);
3122
3123 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff,
3124 pgoff + HPAGE_PMD_NR - 1) {
3125 unsigned long address = __vma_address(page, avc->vma);
3126
3127 mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
3128 address, address + HPAGE_PMD_SIZE);
3129 freeze_page_vma(avc->vma, page, address);
3130 mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
3131 address, address + HPAGE_PMD_SIZE);
3132 }
3133}
3134
3135static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
3136 unsigned long address)
3137{
3138 spinlock_t *ptl;
3139 pmd_t *pmd;
3140 pte_t *pte, entry;
3141 swp_entry_t swp_entry;
3142 unsigned long haddr = address & HPAGE_PMD_MASK;
3143 int i, nr = HPAGE_PMD_NR;
3144
3145	 /* Skip pages which don't belong to the VMA */
3146 if (address < vma->vm_start) {
3147 int off = (vma->vm_start - address) >> PAGE_SHIFT;
3148 page += off;
3149 nr -= off;
3150 address = vma->vm_start;
3151 }
3152
3153 pmd = mm_find_pmd(vma->vm_mm, address);
3154 if (!pmd)
3155 return;
3156
3157 pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
3158 for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
3159 /*
3160		 * We've just crossed a page table boundary: need to map the next one.
3161		 * It can happen if the THP was mremapped to a non-PMD-aligned address.
3162 */
3163 if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
3164 pte_unmap_unlock(pte - 1, ptl);
3165 pmd = mm_find_pmd(vma->vm_mm, address);
3166 if (!pmd)
3167 return;
3168 pte = pte_offset_map_lock(vma->vm_mm, pmd,
3169 address, &ptl);
3170 }
3171
3172 if (!is_swap_pte(*pte))
3173 continue;
3174
3175 swp_entry = pte_to_swp_entry(*pte);
3176 if (!is_migration_entry(swp_entry))
3177 continue;
3178 if (migration_entry_to_page(swp_entry) != page)
3179 continue;
3180
3181 get_page(page);
3182 page_add_anon_rmap(page, vma, address, false);
3183
3184 entry = pte_mkold(mk_pte(page, vma->vm_page_prot));
3185 if (PageDirty(page))
3186 entry = pte_mkdirty(entry);
3187 if (is_write_migration_entry(swp_entry))
3188 entry = maybe_mkwrite(entry, vma);
3189
3190 flush_dcache_page(page);
3191 set_pte_at(vma->vm_mm, address, pte, entry);
3192
3193 /* No need to invalidate - it was non-present before */
3194 update_mmu_cache(vma, address, pte);
3195 }
3196 pte_unmap_unlock(pte - 1, ptl);
3197}
3198
3199static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
3200{
3201 struct anon_vma_chain *avc;
3202 pgoff_t pgoff = page_to_pgoff(page);
3203
3204 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
3205 pgoff, pgoff + HPAGE_PMD_NR - 1) {
3206 unsigned long address = __vma_address(page, avc->vma);
3207
3208 mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
3209 address, address + HPAGE_PMD_SIZE);
3210 unfreeze_page_vma(avc->vma, page, address);
3211 mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
3212 address, address + HPAGE_PMD_SIZE);
3213 }
3214}
3215
3216static int __split_huge_page_tail(struct page *head, int tail,
3217 struct lruvec *lruvec, struct list_head *list)
3218{
3219 int mapcount;
3220 struct page *page_tail = head + tail;
3221
3222 mapcount = atomic_read(&page_tail->_mapcount) + 1;
3223 VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail);
3224
3225 /*
3226 * tail_page->_count is zero and not changing from under us. But
3227 * get_page_unless_zero() may be running from under us on the
3228 * tail_page. If we used atomic_set() below instead of atomic_add(), we
3229 * would then run atomic_set() concurrently with
3230 * get_page_unless_zero(), and atomic_set() is implemented in C not
3231	 * using locked ops. spin_unlock on x86 sometimes uses locked ops
3232 * because of PPro errata 66, 92, so unless somebody can guarantee
3233 * atomic_set() here would be safe on all archs (and not only on x86),
3234 * it's safer to use atomic_add().
3235 */
3236 atomic_add(mapcount + 1, &page_tail->_count);
3237
3238
3239 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
3240 page_tail->flags |= (head->flags &
3241 ((1L << PG_referenced) |
3242 (1L << PG_swapbacked) |
3243 (1L << PG_mlocked) |
3244 (1L << PG_uptodate) |
3245 (1L << PG_active) |
3246 (1L << PG_locked) |
3247 (1L << PG_unevictable) |
3248 (1L << PG_dirty)));
3249
3250 /*
3251 * After clearing PageTail the gup refcount can be released.
3252 * Page flags also must be visible before we make the page non-compound.
3253 */
3254 smp_wmb();
3255
3256 clear_compound_head(page_tail);
3257
3258 if (page_is_young(head))
3259 set_page_young(page_tail);
3260 if (page_is_idle(head))
3261 set_page_idle(page_tail);
3262
3263 /* ->mapping in first tail page is compound_mapcount */
3264 VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
3265 page_tail);
3266 page_tail->mapping = head->mapping;
3267
3268 page_tail->index = head->index + tail;
3269 page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
3270 lru_add_page_tail(head, page_tail, lruvec, list);
3271
3272 return mapcount;
3273}
3274
3275static void __split_huge_page(struct page *page, struct list_head *list)
3276{
3277 struct page *head = compound_head(page);
3278 struct zone *zone = page_zone(head);
3279 struct lruvec *lruvec;
3280 int i, tail_mapcount;
3281
3282	 /* prevent PageLRU from going away from under us, and freeze lru stats */
3283 spin_lock_irq(&zone->lru_lock);
3284 lruvec = mem_cgroup_page_lruvec(head, zone);
3285
3286	 /* complete memcg work before adding pages to LRU */
3287 mem_cgroup_split_huge_fixup(head);
3288
3289 tail_mapcount = 0;
3290 for (i = HPAGE_PMD_NR - 1; i >= 1; i--)
3291 tail_mapcount += __split_huge_page_tail(head, i, lruvec, list);
3292 atomic_sub(tail_mapcount, &head->_count);
3293
3294 ClearPageCompound(head);
3295 spin_unlock_irq(&zone->lru_lock);
3296
3297 unfreeze_page(page_anon_vma(head), head);
3298
3299 for (i = 0; i < HPAGE_PMD_NR; i++) {
3300 struct page *subpage = head + i;
3301 if (subpage == page)
3302 continue;
3303 unlock_page(subpage);
3304
3305 /*
3306		 * Subpages may be freed if there wasn't any mapping left,
3307		 * e.g. if add_to_swap() is running on an LRU page that
3308		 * had its mapping zapped. Freeing these pages
3309		 * requires taking the lru_lock, so we do the put_page
3310		 * of the tail pages after the split is complete.
3311 */
3312 put_page(subpage);
3187 } 3313 }
3188} 3314}
3315
3316int total_mapcount(struct page *page)
3317{
3318 int i, ret;
3319
3320 VM_BUG_ON_PAGE(PageTail(page), page);
3321
3322 if (likely(!PageCompound(page)))
3323 return atomic_read(&page->_mapcount) + 1;
3324
3325 ret = compound_mapcount(page);
3326 if (PageHuge(page))
3327 return ret;
3328 for (i = 0; i < HPAGE_PMD_NR; i++)
3329 ret += atomic_read(&page[i]._mapcount) + 1;
3330 if (PageDoubleMap(page))
3331 ret -= HPAGE_PMD_NR;
3332 return ret;
3333}
3334
3335/*
3336 * This function splits a huge page into normal pages. @page can point to any
3337 * subpage of the huge page to split. The split doesn't change the position of @page.
3338 *
3339 * The caller must hold the only pin on @page, otherwise the split fails with -EBUSY.
3340 * The huge page must be locked.
3341 *
3342 * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
3343 *
3344 * Both head page and tail pages will inherit mapping, flags, and so on from
3345 * the hugepage.
3346 *
3347 * The GUP pin and PG_locked are transferred to @page. The remaining subpages
3348 * can be freed if they are not mapped.
3349 *
3350 * Returns 0 if the hugepage is split successfully.
3351 * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
3352 * us.
3353 */
3354int split_huge_page_to_list(struct page *page, struct list_head *list)
3355{
3356 struct page *head = compound_head(page);
3357 struct anon_vma *anon_vma;
3358 int count, mapcount, ret;
3359 bool mlocked;
3360
3361 VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
3362 VM_BUG_ON_PAGE(!PageAnon(page), page);
3363 VM_BUG_ON_PAGE(!PageLocked(page), page);
3364 VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
3365 VM_BUG_ON_PAGE(!PageCompound(page), page);
3366
3367 /*
3368 * The caller does not necessarily hold an mmap_sem that would prevent
3369	 * the anon_vma disappearing, so we first take a reference to it
3370 * and then lock the anon_vma for write. This is similar to
3371 * page_lock_anon_vma_read except the write lock is taken to serialise
3372 * against parallel split or collapse operations.
3373 */
3374 anon_vma = page_get_anon_vma(head);
3375 if (!anon_vma) {
3376 ret = -EBUSY;
3377 goto out;
3378 }
3379 anon_vma_lock_write(anon_vma);
3380
3381 /*
3382	 * Racy check whether we can split the page, before freeze_page()
3383	 * splits the PMDs
3384 */
3385 if (total_mapcount(head) != page_count(head) - 1) {
3386 ret = -EBUSY;
3387 goto out_unlock;
3388 }
3389
3390 mlocked = PageMlocked(page);
3391 freeze_page(anon_vma, head);
3392 VM_BUG_ON_PAGE(compound_mapcount(head), head);
3393
3394	 /* Make sure the page is not on a per-CPU pagevec, as that holds an extra pin */
3395 if (mlocked)
3396 lru_add_drain();
3397
3398 /* Prevent deferred_split_scan() touching ->_count */
3399 spin_lock(&split_queue_lock);
3400 count = page_count(head);
3401 mapcount = total_mapcount(head);
3402 if (!mapcount && count == 1) {
3403 if (!list_empty(page_deferred_list(head))) {
3404 split_queue_len--;
3405 list_del(page_deferred_list(head));
3406 }
3407 spin_unlock(&split_queue_lock);
3408 __split_huge_page(page, list);
3409 ret = 0;
3410 } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
3411 spin_unlock(&split_queue_lock);
3412 pr_alert("total_mapcount: %u, page_count(): %u\n",
3413 mapcount, count);
3414 if (PageTail(page))
3415 dump_page(head, NULL);
3416 dump_page(page, "total_mapcount(head) > 0");
3417 BUG();
3418 } else {
3419 spin_unlock(&split_queue_lock);
3420 unfreeze_page(anon_vma, head);
3421 ret = -EBUSY;
3422 }
3423
3424out_unlock:
3425 anon_vma_unlock_write(anon_vma);
3426 put_anon_vma(anon_vma);
3427out:
3428 count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
3429 return ret;
3430}
3431
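The split events counted in the functions above (THP_SPLIT_PAGE, THP_SPLIT_PAGE_FAILED, and THP_SPLIT_PMD in __split_huge_pmd_locked()) are exported through /proc/vmstat; the counter names below are assumed to be the lower-cased event names, e.g. thp_split_page. A small sketch for watching them while testing:

#include <stdio.h>
#include <string.h>

/* Dump the THP split counters from /proc/vmstat. */
int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f) {
		perror("/proc/vmstat");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		/* e.g. thp_split_page, thp_split_page_failed, thp_split_pmd */
		if (!strncmp(line, "thp_split", 9))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}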
3432void free_transhuge_page(struct page *page)
3433{
3434 unsigned long flags;
3435
3436 spin_lock_irqsave(&split_queue_lock, flags);
3437 if (!list_empty(page_deferred_list(page))) {
3438 split_queue_len--;
3439 list_del(page_deferred_list(page));
3440 }
3441 spin_unlock_irqrestore(&split_queue_lock, flags);
3442 free_compound_page(page);
3443}
3444
3445void deferred_split_huge_page(struct page *page)
3446{
3447 unsigned long flags;
3448
3449 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
3450
3451 spin_lock_irqsave(&split_queue_lock, flags);
3452 if (list_empty(page_deferred_list(page))) {
3453 list_add_tail(page_deferred_list(page), &split_queue);
3454 split_queue_len++;
3455 }
3456 spin_unlock_irqrestore(&split_queue_lock, flags);
3457}
3458
3459static unsigned long deferred_split_count(struct shrinker *shrink,
3460 struct shrink_control *sc)
3461{
3462 /*
3463	 * Splitting a page from split_queue will free up at least one page,
3464	 * at most HPAGE_PMD_NR - 1. We don't track the exact number,
3465	 * so let's use HPAGE_PMD_NR / 2 as a ballpark.
3466 */
3467 return ACCESS_ONCE(split_queue_len) * HPAGE_PMD_NR / 2;
3468}
3469
3470static unsigned long deferred_split_scan(struct shrinker *shrink,
3471 struct shrink_control *sc)
3472{
3473 unsigned long flags;
3474 LIST_HEAD(list), *pos, *next;
3475 struct page *page;
3476 int split = 0;
3477
3478 spin_lock_irqsave(&split_queue_lock, flags);
3479 list_splice_init(&split_queue, &list);
3480
3481 /* Take pin on all head pages to avoid freeing them under us */
3482 list_for_each_safe(pos, next, &list) {
3483 page = list_entry((void *)pos, struct page, mapping);
3484 page = compound_head(page);
3485 /* race with put_compound_page() */
3486 if (!get_page_unless_zero(page)) {
3487 list_del_init(page_deferred_list(page));
3488 split_queue_len--;
3489 }
3490 }
3491 spin_unlock_irqrestore(&split_queue_lock, flags);
3492
3493 list_for_each_safe(pos, next, &list) {
3494 page = list_entry((void *)pos, struct page, mapping);
3495 lock_page(page);
3496 /* split_huge_page() removes page from list on success */
3497 if (!split_huge_page(page))
3498 split++;
3499 unlock_page(page);
3500 put_page(page);
3501 }
3502
3503 spin_lock_irqsave(&split_queue_lock, flags);
3504 list_splice_tail(&list, &split_queue);
3505 spin_unlock_irqrestore(&split_queue_lock, flags);
3506
3507 return split * HPAGE_PMD_NR / 2;
3508}
3509
3510static struct shrinker deferred_split_shrinker = {
3511 .count_objects = deferred_split_count,
3512 .scan_objects = deferred_split_scan,
3513 .seeks = DEFAULT_SEEKS,
3514};
3515
3516#ifdef CONFIG_DEBUG_FS
3517static int split_huge_pages_set(void *data, u64 val)
3518{
3519 struct zone *zone;
3520 struct page *page;
3521 unsigned long pfn, max_zone_pfn;
3522 unsigned long total = 0, split = 0;
3523
3524 if (val != 1)
3525 return -EINVAL;
3526
3527 for_each_populated_zone(zone) {
3528 max_zone_pfn = zone_end_pfn(zone);
3529 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
3530 if (!pfn_valid(pfn))
3531 continue;
3532
3533 page = pfn_to_page(pfn);
3534 if (!get_page_unless_zero(page))
3535 continue;
3536
3537 if (zone != page_zone(page))
3538 goto next;
3539
3540 if (!PageHead(page) || !PageAnon(page) ||
3541 PageHuge(page))
3542 goto next;
3543
3544 total++;
3545 lock_page(page);
3546 if (!split_huge_page(page))
3547 split++;
3548 unlock_page(page);
3549next:
3550 put_page(page);
3551 }
3552 }
3553
3554 pr_info("%lu of %lu THP split", split, total);
3555
3556 return 0;
3557}
3558DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
3559 "%llu\n");
3560
3561static int __init split_huge_pages_debugfs(void)
3562{
3563 void *ret;
3564
3565 ret = debugfs_create_file("split_huge_pages", 0644, NULL, NULL,
3566 &split_huge_pages_fops);
3567 if (!ret)
3568 pr_warn("Failed to create split_huge_pages in debugfs");
3569 return 0;
3570}
3571late_initcall(split_huge_pages_debugfs);
3572#endif
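The debugfs knob registered above accepts only the value 1; writing it walks every populated zone and tries to split each anonymous THP head page it finds. A hedged userspace sketch of driving it (assumes debugfs is mounted at /sys/kernel/debug and the caller may write there):

#include <stdio.h>

int main(void)
{
	/* File created by split_huge_pages_debugfs() at the debugfs root. */
	FILE *f = fopen("/sys/kernel/debug/split_huge_pages", "w");

	if (!f) {
		perror("split_huge_pages");
		return 1;
	}
	/* split_huge_pages_set() returns -EINVAL for any value other than 1. */
	fputs("1\n", f);
	fclose(f);	/* flush the write; the kernel then logs "N of M THP split" */
	return 0;
}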
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index be934df69b85..12908dcf5831 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1267,8 +1267,8 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order)
1267 1267
1268 /* we rely on prep_new_huge_page to set the destructor */ 1268 /* we rely on prep_new_huge_page to set the destructor */
1269 set_compound_order(page, order); 1269 set_compound_order(page, order);
1270 __SetPageHead(page);
1271 __ClearPageReserved(page); 1270 __ClearPageReserved(page);
1271 __SetPageHead(page);
1272 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { 1272 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
1273 /* 1273 /*
1274 * For gigantic hugepages allocated through bootmem at 1274 * For gigantic hugepages allocated through bootmem at
@@ -3102,7 +3102,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
3102 entry = huge_ptep_get(src_pte); 3102 entry = huge_ptep_get(src_pte);
3103 ptepage = pte_page(entry); 3103 ptepage = pte_page(entry);
3104 get_page(ptepage); 3104 get_page(ptepage);
3105 page_dup_rmap(ptepage); 3105 page_dup_rmap(ptepage, true);
3106 set_huge_pte_at(dst, addr, dst_pte, entry); 3106 set_huge_pte_at(dst, addr, dst_pte, entry);
3107 hugetlb_count_add(pages_per_huge_page(h), dst); 3107 hugetlb_count_add(pages_per_huge_page(h), dst);
3108 } 3108 }
@@ -3186,7 +3186,7 @@ again:
3186 set_page_dirty(page); 3186 set_page_dirty(page);
3187 3187
3188 hugetlb_count_sub(pages_per_huge_page(h), mm); 3188 hugetlb_count_sub(pages_per_huge_page(h), mm);
3189 page_remove_rmap(page); 3189 page_remove_rmap(page, true);
3190 force_flush = !__tlb_remove_page(tlb, page); 3190 force_flush = !__tlb_remove_page(tlb, page);
3191 if (force_flush) { 3191 if (force_flush) {
3192 address += sz; 3192 address += sz;
@@ -3415,7 +3415,7 @@ retry_avoidcopy:
3415 mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); 3415 mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
3416 set_huge_pte_at(mm, address, ptep, 3416 set_huge_pte_at(mm, address, ptep,
3417 make_huge_pte(vma, new_page, 1)); 3417 make_huge_pte(vma, new_page, 1));
3418 page_remove_rmap(old_page); 3418 page_remove_rmap(old_page, true);
3419 hugepage_add_new_anon_rmap(new_page, vma, address); 3419 hugepage_add_new_anon_rmap(new_page, vma, address);
3420 /* Make the old page be freed below */ 3420 /* Make the old page be freed below */
3421 new_page = old_page; 3421 new_page = old_page;
@@ -3585,7 +3585,7 @@ retry:
3585 ClearPagePrivate(page); 3585 ClearPagePrivate(page);
3586 hugepage_add_new_anon_rmap(page, vma, address); 3586 hugepage_add_new_anon_rmap(page, vma, address);
3587 } else 3587 } else
3588 page_dup_rmap(page); 3588 page_dup_rmap(page, true);
3589 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) 3589 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
3590 && (vma->vm_flags & VM_SHARED))); 3590 && (vma->vm_flags & VM_SHARED)));
3591 set_huge_pte_at(mm, address, ptep, new_pte); 3591 set_huge_pte_at(mm, address, ptep, new_pte);
@@ -3865,7 +3865,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
3865same_page: 3865same_page:
3866 if (pages) { 3866 if (pages) {
3867 pages[i] = mem_map_offset(page, pfn_offset); 3867 pages[i] = mem_map_offset(page, pfn_offset);
3868 get_page_foll(pages[i]); 3868 get_page(pages[i]);
3869 } 3869 }
3870 3870
3871 if (vmas) 3871 if (vmas)
diff --git a/mm/internal.h b/mm/internal.h
index 38e24b89e4c4..ed8b5ffcf9b1 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -13,6 +13,7 @@
13 13
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/pagemap.h>
16 17
17/* 18/*
18 * The set of flags that only affect watermark checking and reclaim 19 * The set of flags that only affect watermark checking and reclaim
@@ -66,50 +67,6 @@ static inline void set_page_refcounted(struct page *page)
66 set_page_count(page, 1); 67 set_page_count(page, 1);
67} 68}
68 69
69static inline void __get_page_tail_foll(struct page *page,
70 bool get_page_head)
71{
72 /*
73 * If we're getting a tail page, the elevated page->_count is
74 * required only in the head page and we will elevate the head
75 * page->_count and tail page->_mapcount.
76 *
77 * We elevate page_tail->_mapcount for tail pages to force
78 * page_tail->_count to be zero at all times to avoid getting
79 * false positives from get_page_unless_zero() with
80 * speculative page access (like in
81 * page_cache_get_speculative()) on tail pages.
82 */
83 VM_BUG_ON_PAGE(atomic_read(&compound_head(page)->_count) <= 0, page);
84 if (get_page_head)
85 atomic_inc(&compound_head(page)->_count);
86 get_huge_page_tail(page);
87}
88
89/*
90 * This is meant to be called as the FOLL_GET operation of
91 * follow_page() and it must be called while holding the proper PT
92 * lock while the pte (or pmd_trans_huge) is still mapping the page.
93 */
94static inline void get_page_foll(struct page *page)
95{
96 if (unlikely(PageTail(page)))
97 /*
98 * This is safe only because
99 * __split_huge_page_refcount() can't run under
100 * get_page_foll() because we hold the proper PT lock.
101 */
102 __get_page_tail_foll(page, true);
103 else {
104 /*
105 * Getting a normal page or the head of a compound page
106 * requires to already have an elevated page->_count.
107 */
108 VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
109 atomic_inc(&page->_count);
110 }
111}
112
113extern unsigned long highest_memmap_pfn; 70extern unsigned long highest_memmap_pfn;
114 71
115/* 72/*
@@ -309,10 +266,27 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
309 266
310extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); 267extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
311 268
312#ifdef CONFIG_TRANSPARENT_HUGEPAGE 269/*
313extern unsigned long vma_address(struct page *page, 270 * At what user virtual address is page expected in @vma?
314 struct vm_area_struct *vma); 271 */
315#endif 272static inline unsigned long
273__vma_address(struct page *page, struct vm_area_struct *vma)
274{
275 pgoff_t pgoff = page_to_pgoff(page);
276 return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
277}
278
279static inline unsigned long
280vma_address(struct page *page, struct vm_area_struct *vma)
281{
282 unsigned long address = __vma_address(page, vma);
283
284 /* page should be within @vma mapping range */
285 VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
286
287 return address;
288}
289
316#else /* !CONFIG_MMU */ 290#else /* !CONFIG_MMU */
317static inline void clear_page_mlock(struct page *page) { } 291static inline void clear_page_mlock(struct page *page) { }
318static inline void mlock_vma_page(struct page *page) { } 292static inline void mlock_vma_page(struct page *page) { }
diff --git a/mm/ksm.c b/mm/ksm.c
index 2d162c5625f6..ca6d2a06a615 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -441,20 +441,6 @@ static void break_cow(struct rmap_item *rmap_item)
441 up_read(&mm->mmap_sem); 441 up_read(&mm->mmap_sem);
442} 442}
443 443
444static struct page *page_trans_compound_anon(struct page *page)
445{
446 if (PageTransCompound(page)) {
447 struct page *head = compound_head(page);
448 /*
449 * head may actually be splitted and freed from under
450 * us but it's ok here.
451 */
452 if (PageAnon(head))
453 return head;
454 }
455 return NULL;
456}
457
458static struct page *get_mergeable_page(struct rmap_item *rmap_item) 444static struct page *get_mergeable_page(struct rmap_item *rmap_item)
459{ 445{
460 struct mm_struct *mm = rmap_item->mm; 446 struct mm_struct *mm = rmap_item->mm;
@@ -470,7 +456,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
470 page = follow_page(vma, addr, FOLL_GET); 456 page = follow_page(vma, addr, FOLL_GET);
471 if (IS_ERR_OR_NULL(page)) 457 if (IS_ERR_OR_NULL(page))
472 goto out; 458 goto out;
473 if (PageAnon(page) || page_trans_compound_anon(page)) { 459 if (PageAnon(page)) {
474 flush_anon_page(vma, page, addr); 460 flush_anon_page(vma, page, addr);
475 flush_dcache_page(page); 461 flush_dcache_page(page);
476 } else { 462 } else {
@@ -956,13 +942,13 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
956 } 942 }
957 943
958 get_page(kpage); 944 get_page(kpage);
959 page_add_anon_rmap(kpage, vma, addr); 945 page_add_anon_rmap(kpage, vma, addr, false);
960 946
961 flush_cache_page(vma, addr, pte_pfn(*ptep)); 947 flush_cache_page(vma, addr, pte_pfn(*ptep));
962 ptep_clear_flush_notify(vma, addr, ptep); 948 ptep_clear_flush_notify(vma, addr, ptep);
963 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); 949 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
964 950
965 page_remove_rmap(page); 951 page_remove_rmap(page, false);
966 if (!page_mapped(page)) 952 if (!page_mapped(page))
967 try_to_free_swap(page); 953 try_to_free_swap(page);
968 put_page(page); 954 put_page(page);
@@ -975,33 +961,6 @@ out:
975 return err; 961 return err;
976} 962}
977 963
978static int page_trans_compound_anon_split(struct page *page)
979{
980 int ret = 0;
981 struct page *transhuge_head = page_trans_compound_anon(page);
982 if (transhuge_head) {
983 /* Get the reference on the head to split it. */
984 if (get_page_unless_zero(transhuge_head)) {
985 /*
986 * Recheck we got the reference while the head
987 * was still anonymous.
988 */
989 if (PageAnon(transhuge_head))
990 ret = split_huge_page(transhuge_head);
991 else
992 /*
993 * Retry later if split_huge_page run
994 * from under us.
995 */
996 ret = 1;
997 put_page(transhuge_head);
998 } else
999 /* Retry later if split_huge_page run from under us. */
1000 ret = 1;
1001 }
1002 return ret;
1003}
1004
1005/* 964/*
1006 * try_to_merge_one_page - take two pages and merge them into one 965 * try_to_merge_one_page - take two pages and merge them into one
1007 * @vma: the vma that holds the pte pointing to page 966 * @vma: the vma that holds the pte pointing to page
@@ -1020,9 +979,6 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
1020 if (page == kpage) /* ksm page forked */ 979 if (page == kpage) /* ksm page forked */
1021 return 0; 980 return 0;
1022 981
1023 if (PageTransCompound(page) && page_trans_compound_anon_split(page))
1024 goto out;
1025 BUG_ON(PageTransCompound(page));
1026 if (!PageAnon(page)) 982 if (!PageAnon(page))
1027 goto out; 983 goto out;
1028 984
@@ -1035,6 +991,13 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
1035 */ 991 */
1036 if (!trylock_page(page)) 992 if (!trylock_page(page))
1037 goto out; 993 goto out;
994
995 if (PageTransCompound(page)) {
996 err = split_huge_page(page);
997 if (err)
998 goto out_unlock;
999 }
1000
1038 /* 1001 /*
1039 * If this anonymous page is mapped only here, its pte may need 1002 * If this anonymous page is mapped only here, its pte may need
1040 * to be write-protected. If it's mapped elsewhere, all of its 1003 * to be write-protected. If it's mapped elsewhere, all of its
@@ -1050,6 +1013,12 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
1050 */ 1013 */
1051 set_page_stable_node(page, NULL); 1014 set_page_stable_node(page, NULL);
1052 mark_page_accessed(page); 1015 mark_page_accessed(page);
1016 /*
1017 * Page reclaim just frees a clean page with no dirty
1018 * ptes: make sure that the ksm page would be swapped.
1019 */
1020 if (!PageDirty(page))
1021 SetPageDirty(page);
1053 err = 0; 1022 err = 0;
1054 } else if (pages_identical(page, kpage)) 1023 } else if (pages_identical(page, kpage))
1055 err = replace_page(vma, page, kpage, orig_pte); 1024 err = replace_page(vma, page, kpage, orig_pte);
@@ -1065,6 +1034,7 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
1065 } 1034 }
1066 } 1035 }
1067 1036
1037out_unlock:
1068 unlock_page(page); 1038 unlock_page(page);
1069out: 1039out:
1070 return err; 1040 return err;
@@ -1635,8 +1605,7 @@ next_mm:
1635 cond_resched(); 1605 cond_resched();
1636 continue; 1606 continue;
1637 } 1607 }
1638 if (PageAnon(*page) || 1608 if (PageAnon(*page)) {
1639 page_trans_compound_anon(*page)) {
1640 flush_anon_page(vma, *page, ksm_scan.address); 1609 flush_anon_page(vma, *page, ksm_scan.address);
1641 flush_dcache_page(*page); 1610 flush_dcache_page(*page);
1642 rmap_item = get_next_rmap_item(slot, 1611 rmap_item = get_next_rmap_item(slot,
@@ -1899,7 +1868,7 @@ struct page *ksm_might_need_to_copy(struct page *page,
1899 1868
1900 SetPageDirty(new_page); 1869 SetPageDirty(new_page);
1901 __SetPageUptodate(new_page); 1870 __SetPageUptodate(new_page);
1902 __set_page_locked(new_page); 1871 __SetPageLocked(new_page);
1903 } 1872 }
1904 1873
1905 return new_page; 1874 return new_page;
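
With the ksm.c changes above, a compound page met during scanning or merging is split under the page lock via split_huge_page() instead of being filtered through page_trans_compound_anon(). The pages that reach this path still come from ranges registered with madvise(MADV_MERGEABLE); a minimal user-space sketch of such a registration, assuming CONFIG_KSM and a running ksmd (merging happens asynchronously):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_MERGEABLE
#define MADV_MERGEABLE 12	/* uapi value; may be missing from older libc headers */
#endif

int main(void)
{
	size_t len = 64 << 20;	/* 64 MiB of anonymous memory */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* identical content across many pages gives ksmd something to merge */
	memset(buf, 0x5a, len);

	/* mark the range as a merge candidate for the KSM scanner */
	if (madvise(buf, len, MADV_MERGEABLE))
		perror("madvise(MADV_MERGEABLE)");

	/* ksmd works in the background; progress is visible under /sys/kernel/mm/ksm/ */
	pause();
	return 0;
}
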
diff --git a/mm/madvise.c b/mm/madvise.c
index c889fcbb530e..f56825b6d2e1 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -20,6 +20,9 @@
20#include <linux/backing-dev.h> 20#include <linux/backing-dev.h>
21#include <linux/swap.h> 21#include <linux/swap.h>
22#include <linux/swapops.h> 22#include <linux/swapops.h>
23#include <linux/mmu_notifier.h>
24
25#include <asm/tlb.h>
23 26
24/* 27/*
25 * Any behaviour which results in changes to the vma->vm_flags needs to 28 * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -32,6 +35,7 @@ static int madvise_need_mmap_write(int behavior)
32 case MADV_REMOVE: 35 case MADV_REMOVE:
33 case MADV_WILLNEED: 36 case MADV_WILLNEED:
34 case MADV_DONTNEED: 37 case MADV_DONTNEED:
38 case MADV_FREE:
35 return 0; 39 return 0;
36 default: 40 default:
37 /* be safe, default to 1. list exceptions explicitly */ 41 /* be safe, default to 1. list exceptions explicitly */
@@ -256,6 +260,194 @@ static long madvise_willneed(struct vm_area_struct *vma,
256 return 0; 260 return 0;
257} 261}
258 262
263static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
264 unsigned long end, struct mm_walk *walk)
265
266{
267 struct mmu_gather *tlb = walk->private;
268 struct mm_struct *mm = tlb->mm;
269 struct vm_area_struct *vma = walk->vma;
270 spinlock_t *ptl;
271 pte_t *orig_pte, *pte, ptent;
272 struct page *page;
273 int nr_swap = 0;
274 unsigned long next;
275
276 next = pmd_addr_end(addr, end);
277 if (pmd_trans_huge(*pmd))
278 if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
279 goto next;
280
281 if (pmd_trans_unstable(pmd))
282 return 0;
283
284 orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
285 arch_enter_lazy_mmu_mode();
286 for (; addr != end; pte++, addr += PAGE_SIZE) {
287 ptent = *pte;
288
289 if (pte_none(ptent))
290 continue;
291 /*
292 * If the pte holds a swp_entry, just clear the page table
293 * entry to prevent a swap-in, which is more expensive than
294 * (page allocation + zeroing).
295 */
296 if (!pte_present(ptent)) {
297 swp_entry_t entry;
298
299 entry = pte_to_swp_entry(ptent);
300 if (non_swap_entry(entry))
301 continue;
302 nr_swap--;
303 free_swap_and_cache(entry);
304 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
305 continue;
306 }
307
308 page = vm_normal_page(vma, addr, ptent);
309 if (!page)
310 continue;
311
312 /*
313 * If the pmd isn't transhuge but the page is a THP
314 * owned by only this process, split it and
315 * deactivate all of its pages.
316 */
317 if (PageTransCompound(page)) {
318 if (page_mapcount(page) != 1)
319 goto out;
320 get_page(page);
321 if (!trylock_page(page)) {
322 put_page(page);
323 goto out;
324 }
325 pte_unmap_unlock(orig_pte, ptl);
326 if (split_huge_page(page)) {
327 unlock_page(page);
328 put_page(page);
329 pte_offset_map_lock(mm, pmd, addr, &ptl);
330 goto out;
331 }
332 put_page(page);
333 unlock_page(page);
334 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
335 pte--;
336 addr -= PAGE_SIZE;
337 continue;
338 }
339
340 VM_BUG_ON_PAGE(PageTransCompound(page), page);
341
342 if (PageSwapCache(page) || PageDirty(page)) {
343 if (!trylock_page(page))
344 continue;
345 /*
346 * If the page is shared with others, we can't clear
347 * PG_dirty of the page.
348 */
349 if (page_mapcount(page) != 1) {
350 unlock_page(page);
351 continue;
352 }
353
354 if (PageSwapCache(page) && !try_to_free_swap(page)) {
355 unlock_page(page);
356 continue;
357 }
358
359 ClearPageDirty(page);
360 unlock_page(page);
361 }
362
363 if (pte_young(ptent) || pte_dirty(ptent)) {
364 /*
365 * Some architectures (e.g. PPC) don't update the TLB
366 * with set_pte_at() and tlb_remove_tlb_entry(), so for
367 * portability, remap the pte as old and clean
368 * after clearing it.
369 */
370 ptent = ptep_get_and_clear_full(mm, addr, pte,
371 tlb->fullmm);
372
373 ptent = pte_mkold(ptent);
374 ptent = pte_mkclean(ptent);
375 set_pte_at(mm, addr, pte, ptent);
376 if (PageActive(page))
377 deactivate_page(page);
378 tlb_remove_tlb_entry(tlb, pte, addr);
379 }
380 }
381out:
382 if (nr_swap) {
383 if (current->mm == mm)
384 sync_mm_rss(mm);
385
386 add_mm_counter(mm, MM_SWAPENTS, nr_swap);
387 }
388 arch_leave_lazy_mmu_mode();
389 pte_unmap_unlock(orig_pte, ptl);
390 cond_resched();
391next:
392 return 0;
393}
394
395static void madvise_free_page_range(struct mmu_gather *tlb,
396 struct vm_area_struct *vma,
397 unsigned long addr, unsigned long end)
398{
399 struct mm_walk free_walk = {
400 .pmd_entry = madvise_free_pte_range,
401 .mm = vma->vm_mm,
402 .private = tlb,
403 };
404
405 tlb_start_vma(tlb, vma);
406 walk_page_range(addr, end, &free_walk);
407 tlb_end_vma(tlb, vma);
408}
409
410static int madvise_free_single_vma(struct vm_area_struct *vma,
411 unsigned long start_addr, unsigned long end_addr)
412{
413 unsigned long start, end;
414 struct mm_struct *mm = vma->vm_mm;
415 struct mmu_gather tlb;
416
417 if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
418 return -EINVAL;
419
420 /* MADV_FREE works for only anon vma at the moment */
421 if (!vma_is_anonymous(vma))
422 return -EINVAL;
423
424 start = max(vma->vm_start, start_addr);
425 if (start >= vma->vm_end)
426 return -EINVAL;
427 end = min(vma->vm_end, end_addr);
428 if (end <= vma->vm_start)
429 return -EINVAL;
430
431 lru_add_drain();
432 tlb_gather_mmu(&tlb, mm, start, end);
433 update_hiwater_rss(mm);
434
435 mmu_notifier_invalidate_range_start(mm, start, end);
436 madvise_free_page_range(&tlb, vma, start, end);
437 mmu_notifier_invalidate_range_end(mm, start, end);
438 tlb_finish_mmu(&tlb, start, end);
439
440 return 0;
441}
442
443static long madvise_free(struct vm_area_struct *vma,
444 struct vm_area_struct **prev,
445 unsigned long start, unsigned long end)
446{
447 *prev = vma;
448 return madvise_free_single_vma(vma, start, end);
449}
450
259/* 451/*
260 * Application no longer needs these pages. If the pages are dirty, 452 * Application no longer needs these pages. If the pages are dirty,
261 * it's OK to just throw them away. The app will be more careful about 453 * it's OK to just throw them away. The app will be more careful about
@@ -379,6 +571,14 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
379 return madvise_remove(vma, prev, start, end); 571 return madvise_remove(vma, prev, start, end);
380 case MADV_WILLNEED: 572 case MADV_WILLNEED:
381 return madvise_willneed(vma, prev, start, end); 573 return madvise_willneed(vma, prev, start, end);
574 case MADV_FREE:
575 /*
576 * XXX: In this implementation, MADV_FREE works like
577 * MADV_DONTNEED on a swapless system or when swap is full.
578 */
579 if (get_nr_swap_pages() > 0)
580 return madvise_free(vma, prev, start, end);
581 /* passthrough */
382 case MADV_DONTNEED: 582 case MADV_DONTNEED:
383 return madvise_dontneed(vma, prev, start, end); 583 return madvise_dontneed(vma, prev, start, end);
384 default: 584 default:
@@ -398,6 +598,7 @@ madvise_behavior_valid(int behavior)
398 case MADV_REMOVE: 598 case MADV_REMOVE:
399 case MADV_WILLNEED: 599 case MADV_WILLNEED:
400 case MADV_DONTNEED: 600 case MADV_DONTNEED:
601 case MADV_FREE:
401#ifdef CONFIG_KSM 602#ifdef CONFIG_KSM
402 case MADV_MERGEABLE: 603 case MADV_MERGEABLE:
403 case MADV_UNMERGEABLE: 604 case MADV_UNMERGEABLE:
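
A short user-space sketch of the new MADV_FREE hint follows, under the caveat spelled out in the XXX comment above: with no free swap the call falls through to MADV_DONTNEED behaviour. The fallback #define is assumed to match the generic uapi value introduced by this series:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_FREE
#define MADV_FREE 8	/* asm-generic uapi value from this series (assumed) */
#endif

int main(void)
{
	size_t len = 16 << 20;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(buf, 1, len);		/* dirty the anonymous pages */

	/*
	 * Tell the kernel the contents are disposable.  The mapping stays
	 * valid; clean pages are reclaimed lazily under memory pressure,
	 * and a later access either finds the old data still there or
	 * gets a fresh zero-filled page.
	 */
	if (madvise(buf, len, MADV_FREE)) {
		perror("madvise(MADV_FREE)");	/* e.g. EINVAL on kernels without it */
		return 1;
	}

	buf[0] = 2;	/* touching the range again after MADV_FREE is legal */
	return 0;
}
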
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 54eae4f19d80..0eda67376df4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -382,14 +382,11 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
382{ 382{
383 struct mem_cgroup *memcg; 383 struct mem_cgroup *memcg;
384 384
385 rcu_read_lock();
386
387 memcg = page->mem_cgroup; 385 memcg = page->mem_cgroup;
388 386
389 if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) 387 if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
390 memcg = root_mem_cgroup; 388 memcg = root_mem_cgroup;
391 389
392 rcu_read_unlock();
393 return &memcg->css; 390 return &memcg->css;
394} 391}
395 392
@@ -647,7 +644,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
647 644
648static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, 645static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
649 struct page *page, 646 struct page *page,
650 int nr_pages) 647 bool compound, int nr_pages)
651{ 648{
652 /* 649 /*
653 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is 650 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
@@ -660,9 +657,11 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
660 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], 657 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
661 nr_pages); 658 nr_pages);
662 659
663 if (PageTransHuge(page)) 660 if (compound) {
661 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
664 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], 662 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
665 nr_pages); 663 nr_pages);
664 }
666 665
667 /* pagein of a big page is an event. So, ignore page size */ 666 /* pagein of a big page is an event. So, ignore page size */
668 if (nr_pages > 0) 667 if (nr_pages > 0)
@@ -2431,9 +2430,7 @@ void __memcg_kmem_uncharge(struct page *page, int order)
2431 2430
2432/* 2431/*
2433 * Because tail pages are not marked as "used", set it. We're under 2432 * Because tail pages are not marked as "used", set it. We're under
2434 * zone->lru_lock, 'splitting on pmd' and compound_lock. 2433 * zone->lru_lock and migration entries setup in all page mappings.
2435 * charge/uncharge will be never happen and move_account() is done under
2436 * compound_lock(), so we don't have to take care of races.
2437 */ 2434 */
2438void mem_cgroup_split_huge_fixup(struct page *head) 2435void mem_cgroup_split_huge_fixup(struct page *head)
2439{ 2436{
@@ -3494,16 +3491,17 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
3494swap_buffers: 3491swap_buffers:
3495 /* Swap primary and spare array */ 3492 /* Swap primary and spare array */
3496 thresholds->spare = thresholds->primary; 3493 thresholds->spare = thresholds->primary;
3497 /* If all events are unregistered, free the spare array */
3498 if (!new) {
3499 kfree(thresholds->spare);
3500 thresholds->spare = NULL;
3501 }
3502 3494
3503 rcu_assign_pointer(thresholds->primary, new); 3495 rcu_assign_pointer(thresholds->primary, new);
3504 3496
3505 /* To be sure that nobody uses thresholds */ 3497 /* To be sure that nobody uses thresholds */
3506 synchronize_rcu(); 3498 synchronize_rcu();
3499
3500 /* If all events are unregistered, free the spare array */
3501 if (!new) {
3502 kfree(thresholds->spare);
3503 thresholds->spare = NULL;
3504 }
3507unlock: 3505unlock:
3508 mutex_unlock(&memcg->thresholds_lock); 3506 mutex_unlock(&memcg->thresholds_lock);
3509} 3507}
@@ -4505,38 +4503,30 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
4505 * @from: mem_cgroup which the page is moved from. 4503 * @from: mem_cgroup which the page is moved from.
4506 * @to: mem_cgroup which the page is moved to. @from != @to. 4504 * @to: mem_cgroup which the page is moved to. @from != @to.
4507 * 4505 *
4508 * The caller must confirm following. 4506 * The caller must make sure the page is not on LRU (isolate_page() is useful.)
4509 * - page is not on LRU (isolate_page() is useful.)
4510 * - compound_lock is held when nr_pages > 1
4511 * 4507 *
4512 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" 4508 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
4513 * from old cgroup. 4509 * from old cgroup.
4514 */ 4510 */
4515static int mem_cgroup_move_account(struct page *page, 4511static int mem_cgroup_move_account(struct page *page,
4516 unsigned int nr_pages, 4512 bool compound,
4517 struct mem_cgroup *from, 4513 struct mem_cgroup *from,
4518 struct mem_cgroup *to) 4514 struct mem_cgroup *to)
4519{ 4515{
4520 unsigned long flags; 4516 unsigned long flags;
4517 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
4521 int ret; 4518 int ret;
4522 bool anon; 4519 bool anon;
4523 4520
4524 VM_BUG_ON(from == to); 4521 VM_BUG_ON(from == to);
4525 VM_BUG_ON_PAGE(PageLRU(page), page); 4522 VM_BUG_ON_PAGE(PageLRU(page), page);
4526 /* 4523 VM_BUG_ON(compound && !PageTransHuge(page));
4527 * The page is isolated from LRU. So, collapse function
4528 * will not handle this page. But page splitting can happen.
4529 * Do this check under compound_page_lock(). The caller should
4530 * hold it.
4531 */
4532 ret = -EBUSY;
4533 if (nr_pages > 1 && !PageTransHuge(page))
4534 goto out;
4535 4524
4536 /* 4525 /*
4537 * Prevent mem_cgroup_replace_page() from looking at 4526 * Prevent mem_cgroup_replace_page() from looking at
4538 * page->mem_cgroup of its source page while we change it. 4527 * page->mem_cgroup of its source page while we change it.
4539 */ 4528 */
4529 ret = -EBUSY;
4540 if (!trylock_page(page)) 4530 if (!trylock_page(page))
4541 goto out; 4531 goto out;
4542 4532
@@ -4591,9 +4581,9 @@ static int mem_cgroup_move_account(struct page *page,
4591 ret = 0; 4581 ret = 0;
4592 4582
4593 local_irq_disable(); 4583 local_irq_disable();
4594 mem_cgroup_charge_statistics(to, page, nr_pages); 4584 mem_cgroup_charge_statistics(to, page, compound, nr_pages);
4595 memcg_check_events(to, page); 4585 memcg_check_events(to, page);
4596 mem_cgroup_charge_statistics(from, page, -nr_pages); 4586 mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
4597 memcg_check_events(from, page); 4587 memcg_check_events(from, page);
4598 local_irq_enable(); 4588 local_irq_enable();
4599out_unlock: 4589out_unlock:
@@ -4683,7 +4673,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
4683 pte_t *pte; 4673 pte_t *pte;
4684 spinlock_t *ptl; 4674 spinlock_t *ptl;
4685 4675
4686 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 4676 if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
4687 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) 4677 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
4688 mc.precharge += HPAGE_PMD_NR; 4678 mc.precharge += HPAGE_PMD_NR;
4689 spin_unlock(ptl); 4679 spin_unlock(ptl);
@@ -4871,17 +4861,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
4871 union mc_target target; 4861 union mc_target target;
4872 struct page *page; 4862 struct page *page;
4873 4863
4874 /* 4864 if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
4875 * We don't take compound_lock() here but no race with splitting thp
4876 * happens because:
4877 * - if pmd_trans_huge_lock() returns 1, the relevant thp is not
4878 * under splitting, which means there's no concurrent thp split,
4879 * - if another thread runs into split_huge_page() just after we
4880 * entered this if-block, the thread must wait for page table lock
4881 * to be unlocked in __split_huge_page_splitting(), where the main
4882 * part of thp split is not executed yet.
4883 */
4884 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
4885 if (mc.precharge < HPAGE_PMD_NR) { 4865 if (mc.precharge < HPAGE_PMD_NR) {
4886 spin_unlock(ptl); 4866 spin_unlock(ptl);
4887 return 0; 4867 return 0;
@@ -4890,7 +4870,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
4890 if (target_type == MC_TARGET_PAGE) { 4870 if (target_type == MC_TARGET_PAGE) {
4891 page = target.page; 4871 page = target.page;
4892 if (!isolate_lru_page(page)) { 4872 if (!isolate_lru_page(page)) {
4893 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, 4873 if (!mem_cgroup_move_account(page, true,
4894 mc.from, mc.to)) { 4874 mc.from, mc.to)) {
4895 mc.precharge -= HPAGE_PMD_NR; 4875 mc.precharge -= HPAGE_PMD_NR;
4896 mc.moved_charge += HPAGE_PMD_NR; 4876 mc.moved_charge += HPAGE_PMD_NR;
@@ -4917,9 +4897,18 @@ retry:
4917 switch (get_mctgt_type(vma, addr, ptent, &target)) { 4897 switch (get_mctgt_type(vma, addr, ptent, &target)) {
4918 case MC_TARGET_PAGE: 4898 case MC_TARGET_PAGE:
4919 page = target.page; 4899 page = target.page;
4900 /*
4901 * We can have a part of the split pmd here. Moving it
4902 * can be done but it would be too convoluted so simply
4903 * ignore such a partial THP and keep it in original
4904 * memcg. There should be somebody mapping the head.
4905 */
4906 if (PageTransCompound(page))
4907 goto put;
4920 if (isolate_lru_page(page)) 4908 if (isolate_lru_page(page))
4921 goto put; 4909 goto put;
4922 if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) { 4910 if (!mem_cgroup_move_account(page, false,
4911 mc.from, mc.to)) {
4923 mc.precharge--; 4912 mc.precharge--;
4924 /* we uncharge from mc.from later. */ 4913 /* we uncharge from mc.from later. */
4925 mc.moved_charge++; 4914 mc.moved_charge++;
@@ -5258,10 +5247,11 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
5258 * with mem_cgroup_cancel_charge() in case page instantiation fails. 5247 * with mem_cgroup_cancel_charge() in case page instantiation fails.
5259 */ 5248 */
5260int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, 5249int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
5261 gfp_t gfp_mask, struct mem_cgroup **memcgp) 5250 gfp_t gfp_mask, struct mem_cgroup **memcgp,
5251 bool compound)
5262{ 5252{
5263 struct mem_cgroup *memcg = NULL; 5253 struct mem_cgroup *memcg = NULL;
5264 unsigned int nr_pages = 1; 5254 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5265 int ret = 0; 5255 int ret = 0;
5266 5256
5267 if (mem_cgroup_disabled()) 5257 if (mem_cgroup_disabled())
@@ -5291,11 +5281,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
5291 } 5281 }
5292 } 5282 }
5293 5283
5294 if (PageTransHuge(page)) {
5295 nr_pages <<= compound_order(page);
5296 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
5297 }
5298
5299 if (!memcg) 5284 if (!memcg)
5300 memcg = get_mem_cgroup_from_mm(mm); 5285 memcg = get_mem_cgroup_from_mm(mm);
5301 5286
@@ -5324,9 +5309,9 @@ out:
5324 * Use mem_cgroup_cancel_charge() to cancel the transaction instead. 5309 * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
5325 */ 5310 */
5326void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, 5311void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
5327 bool lrucare) 5312 bool lrucare, bool compound)
5328{ 5313{
5329 unsigned int nr_pages = 1; 5314 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5330 5315
5331 VM_BUG_ON_PAGE(!page->mapping, page); 5316 VM_BUG_ON_PAGE(!page->mapping, page);
5332 VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); 5317 VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
@@ -5343,13 +5328,8 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
5343 5328
5344 commit_charge(page, memcg, lrucare); 5329 commit_charge(page, memcg, lrucare);
5345 5330
5346 if (PageTransHuge(page)) {
5347 nr_pages <<= compound_order(page);
5348 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
5349 }
5350
5351 local_irq_disable(); 5331 local_irq_disable();
5352 mem_cgroup_charge_statistics(memcg, page, nr_pages); 5332 mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
5353 memcg_check_events(memcg, page); 5333 memcg_check_events(memcg, page);
5354 local_irq_enable(); 5334 local_irq_enable();
5355 5335
@@ -5371,9 +5351,10 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
5371 * 5351 *
5372 * Cancel a charge transaction started by mem_cgroup_try_charge(). 5352 * Cancel a charge transaction started by mem_cgroup_try_charge().
5373 */ 5353 */
5374void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) 5354void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
5355 bool compound)
5375{ 5356{
5376 unsigned int nr_pages = 1; 5357 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5377 5358
5378 if (mem_cgroup_disabled()) 5359 if (mem_cgroup_disabled())
5379 return; 5360 return;
@@ -5385,11 +5366,6 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
5385 if (!memcg) 5366 if (!memcg)
5386 return; 5367 return;
5387 5368
5388 if (PageTransHuge(page)) {
5389 nr_pages <<= compound_order(page);
5390 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
5391 }
5392
5393 cancel_charge(memcg, nr_pages); 5369 cancel_charge(memcg, nr_pages);
5394} 5370}
5395 5371
@@ -5750,7 +5726,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
5750 * only synchronisation we have for updating the per-CPU variables. 5726 * only synchronisation we have for updating the per-CPU variables.
5751 */ 5727 */
5752 VM_BUG_ON(!irqs_disabled()); 5728 VM_BUG_ON(!irqs_disabled());
5753 mem_cgroup_charge_statistics(memcg, page, -1); 5729 mem_cgroup_charge_statistics(memcg, page, false, -1);
5754 memcg_check_events(memcg, page); 5730 memcg_check_events(memcg, page);
5755} 5731}
5756 5732
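
The memcontrol.c changes above stop guessing hugeness with PageTransHuge() inside the charge paths and instead take an explicit 'compound' flag from the caller, with nr_pages derived once as hpage_nr_pages(page) or 1. A toy model of just that accounting convention (not the kernel API; HPAGE_NR stands in for hpage_nr_pages() with 2 MiB THP on 4 KiB pages):

#include <stdbool.h>
#include <stdio.h>

#define HPAGE_NR 512	/* stand-in for hpage_nr_pages(): 2 MiB THP / 4 KiB pages */

static long stat_rss;		/* models MEM_CGROUP_STAT_RSS */
static long stat_rss_huge;	/* models MEM_CGROUP_STAT_RSS_HUGE */

/* the caller states compound-ness explicitly; the helper no longer sniffs the page */
static void toy_charge_statistics(bool compound, int nr_pages)
{
	stat_rss += nr_pages;
	if (compound)
		stat_rss_huge += nr_pages;
}

int main(void)
{
	toy_charge_statistics(false, 1);		/* ordinary page charged */
	toy_charge_statistics(true, HPAGE_NR);		/* THP charged as one unit */
	toy_charge_statistics(true, -HPAGE_NR);		/* ...and later uncharged */

	printf("rss=%ld rss_huge=%ld\n", stat_rss, stat_rss_huge);
	return 0;
}
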
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 8424b64711ac..ac595e7a3a95 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -882,15 +882,7 @@ int get_hwpoison_page(struct page *page)
882{ 882{
883 struct page *head = compound_head(page); 883 struct page *head = compound_head(page);
884 884
885 if (PageHuge(head)) 885 if (!PageHuge(head) && PageTransHuge(head)) {
886 return get_page_unless_zero(head);
887
888 /*
889 * Thp tail page has special refcounting rule (refcount of tail pages
890 * is stored in ->_mapcount,) so we can't call get_page_unless_zero()
891 * directly for tail pages.
892 */
893 if (PageTransHuge(head)) {
894 /* 886 /*
895 * Non anonymous thp exists only in allocation/free time. We 887 * Non anonymous thp exists only in allocation/free time. We
896 * can't handle such a case correctly, so let's give it up. 888 * can't handle such a case correctly, so let's give it up.
@@ -902,41 +894,12 @@ int get_hwpoison_page(struct page *page)
902 page_to_pfn(page)); 894 page_to_pfn(page));
903 return 0; 895 return 0;
904 } 896 }
905
906 if (get_page_unless_zero(head)) {
907 if (PageTail(page))
908 get_page(page);
909 return 1;
910 } else {
911 return 0;
912 }
913 } 897 }
914 898
915 return get_page_unless_zero(page); 899 return get_page_unless_zero(head);
916} 900}
917EXPORT_SYMBOL_GPL(get_hwpoison_page); 901EXPORT_SYMBOL_GPL(get_hwpoison_page);
918 902
919/**
920 * put_hwpoison_page() - Put refcount for memory error handling:
921 * @page: raw error page (hit by memory error)
922 */
923void put_hwpoison_page(struct page *page)
924{
925 struct page *head = compound_head(page);
926
927 if (PageHuge(head)) {
928 put_page(head);
929 return;
930 }
931
932 if (PageTransHuge(head))
933 if (page != head)
934 put_page(head);
935
936 put_page(page);
937}
938EXPORT_SYMBOL_GPL(put_hwpoison_page);
939
940/* 903/*
941 * Do all that is necessary to remove user space mappings. Unmap 904 * Do all that is necessary to remove user space mappings. Unmap
942 * the pages and send SIGBUS to the processes if the data was dirty. 905 * the pages and send SIGBUS to the processes if the data was dirty.
@@ -1149,7 +1112,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1149 } 1112 }
1150 1113
1151 if (!PageHuge(p) && PageTransHuge(hpage)) { 1114 if (!PageHuge(p) && PageTransHuge(hpage)) {
1115 lock_page(hpage);
1152 if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) { 1116 if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
1117 unlock_page(hpage);
1153 if (!PageAnon(hpage)) 1118 if (!PageAnon(hpage))
1154 pr_err("MCE: %#lx: non anonymous thp\n", pfn); 1119 pr_err("MCE: %#lx: non anonymous thp\n", pfn);
1155 else 1120 else
@@ -1159,6 +1124,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1159 put_hwpoison_page(p); 1124 put_hwpoison_page(p);
1160 return -EBUSY; 1125 return -EBUSY;
1161 } 1126 }
1127 unlock_page(hpage);
1128 get_hwpoison_page(p);
1129 put_hwpoison_page(hpage);
1162 VM_BUG_ON_PAGE(!page_count(p), p); 1130 VM_BUG_ON_PAGE(!page_count(p), p);
1163 hpage = compound_head(p); 1131 hpage = compound_head(p);
1164 } 1132 }
@@ -1166,7 +1134,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1166 /* 1134 /*
1167 * We ignore non-LRU pages for good reasons. 1135 * We ignore non-LRU pages for good reasons.
1168 * - PG_locked is only well defined for LRU pages and a few others 1136 * - PG_locked is only well defined for LRU pages and a few others
1169 * - to avoid races with __set_page_locked() 1137 * - to avoid races with __SetPageLocked()
1170 * - to avoid races with __SetPageSlab*() (and more non-atomic ops) 1138 * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
1171 * The check (unnecessarily) ignores LRU pages being isolated and 1139 * The check (unnecessarily) ignores LRU pages being isolated and
1172 * walked by the page reclaim code, however that's not a big loss. 1140 * walked by the page reclaim code, however that's not a big loss.
@@ -1572,7 +1540,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
1572 * Did it turn free? 1540 * Did it turn free?
1573 */ 1541 */
1574 ret = __get_any_page(page, pfn, 0); 1542 ret = __get_any_page(page, pfn, 0);
1575 if (!PageLRU(page)) { 1543 if (ret == 1 && !PageLRU(page)) {
1576 /* Drop page reference which is from __get_any_page() */ 1544 /* Drop page reference which is from __get_any_page() */
1577 put_hwpoison_page(page); 1545 put_hwpoison_page(page);
1578 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", 1546 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
@@ -1716,6 +1684,49 @@ static int __soft_offline_page(struct page *page, int flags)
1716 return ret; 1684 return ret;
1717} 1685}
1718 1686
1687static int soft_offline_in_use_page(struct page *page, int flags)
1688{
1689 int ret;
1690 struct page *hpage = compound_head(page);
1691
1692 if (!PageHuge(page) && PageTransHuge(hpage)) {
1693 lock_page(hpage);
1694 if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
1695 unlock_page(hpage);
1696 if (!PageAnon(hpage))
1697 pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page));
1698 else
1699 pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page));
1700 put_hwpoison_page(hpage);
1701 return -EBUSY;
1702 }
1703 unlock_page(hpage);
1704 get_hwpoison_page(page);
1705 put_hwpoison_page(hpage);
1706 }
1707
1708 if (PageHuge(page))
1709 ret = soft_offline_huge_page(page, flags);
1710 else
1711 ret = __soft_offline_page(page, flags);
1712
1713 return ret;
1714}
1715
1716static void soft_offline_free_page(struct page *page)
1717{
1718 if (PageHuge(page)) {
1719 struct page *hpage = compound_head(page);
1720
1721 set_page_hwpoison_huge_page(hpage);
1722 if (!dequeue_hwpoisoned_huge_page(hpage))
1723 num_poisoned_pages_add(1 << compound_order(hpage));
1724 } else {
1725 if (!TestSetPageHWPoison(page))
1726 num_poisoned_pages_inc();
1727 }
1728}
1729
1719/** 1730/**
1720 * soft_offline_page - Soft offline a page. 1731 * soft_offline_page - Soft offline a page.
1721 * @page: page to offline 1732 * @page: page to offline
@@ -1742,7 +1753,6 @@ int soft_offline_page(struct page *page, int flags)
1742{ 1753{
1743 int ret; 1754 int ret;
1744 unsigned long pfn = page_to_pfn(page); 1755 unsigned long pfn = page_to_pfn(page);
1745 struct page *hpage = compound_head(page);
1746 1756
1747 if (PageHWPoison(page)) { 1757 if (PageHWPoison(page)) {
1748 pr_info("soft offline: %#lx page already poisoned\n", pfn); 1758 pr_info("soft offline: %#lx page already poisoned\n", pfn);
@@ -1750,34 +1760,15 @@ int soft_offline_page(struct page *page, int flags)
1750 put_hwpoison_page(page); 1760 put_hwpoison_page(page);
1751 return -EBUSY; 1761 return -EBUSY;
1752 } 1762 }
1753 if (!PageHuge(page) && PageTransHuge(hpage)) {
1754 if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
1755 pr_info("soft offline: %#lx: failed to split THP\n",
1756 pfn);
1757 if (flags & MF_COUNT_INCREASED)
1758 put_hwpoison_page(page);
1759 return -EBUSY;
1760 }
1761 }
1762 1763
1763 get_online_mems(); 1764 get_online_mems();
1764
1765 ret = get_any_page(page, pfn, flags); 1765 ret = get_any_page(page, pfn, flags);
1766 put_online_mems(); 1766 put_online_mems();
1767 if (ret > 0) { /* for in-use pages */ 1767
1768 if (PageHuge(page)) 1768 if (ret > 0)
1769 ret = soft_offline_huge_page(page, flags); 1769 ret = soft_offline_in_use_page(page, flags);
1770 else 1770 else if (ret == 0)
1771 ret = __soft_offline_page(page, flags); 1771 soft_offline_free_page(page);
1772 } else if (ret == 0) { /* for free pages */ 1772
1773 if (PageHuge(page)) {
1774 set_page_hwpoison_huge_page(hpage);
1775 if (!dequeue_hwpoisoned_huge_page(hpage))
1776 num_poisoned_pages_add(1 << compound_order(hpage));
1777 } else {
1778 if (!TestSetPageHWPoison(page))
1779 num_poisoned_pages_inc();
1780 }
1781 }
1782 return ret; 1773 return ret;
1783} 1774}
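
The reworked soft-offline path above, now split into soft_offline_in_use_page() and soft_offline_free_page(), is still driven from the same user-visible entry points; one of them is the memory sysfs interface. A hedged sketch of poking it (root and CONFIG_MEMORY_FAILURE are assumed, and the physical address below is a placeholder, not a meaningful value):

#include <stdio.h>

int main(int argc, char **argv)
{
	/* physical address of the page to soft-offline, e.g. "0x1234000" (placeholder) */
	const char *paddr = argc > 1 ? argv[1] : "0x1234000";
	FILE *f = fopen("/sys/devices/system/memory/soft_offline_page", "w");

	if (!f) {
		perror("open soft_offline_page");
		return 1;
	}
	/* the kernel resolves the pfn and runs soft_offline_page() on it */
	if (fprintf(f, "%s\n", paddr) < 0)
		perror("write soft_offline_page");
	if (fclose(f))		/* sysfs often reports the error at flush time */
		perror("close soft_offline_page");
	return 0;
}
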
diff --git a/mm/memory.c b/mm/memory.c
index d4e4d37c1989..ff17850a52d9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -50,6 +50,7 @@
50#include <linux/export.h> 50#include <linux/export.h>
51#include <linux/delayacct.h> 51#include <linux/delayacct.h>
52#include <linux/init.h> 52#include <linux/init.h>
53#include <linux/pfn_t.h>
53#include <linux/writeback.h> 54#include <linux/writeback.h>
54#include <linux/memcontrol.h> 55#include <linux/memcontrol.h>
55#include <linux/mmu_notifier.h> 56#include <linux/mmu_notifier.h>
@@ -566,7 +567,6 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
566{ 567{
567 spinlock_t *ptl; 568 spinlock_t *ptl;
568 pgtable_t new = pte_alloc_one(mm, address); 569 pgtable_t new = pte_alloc_one(mm, address);
569 int wait_split_huge_page;
570 if (!new) 570 if (!new)
571 return -ENOMEM; 571 return -ENOMEM;
572 572
@@ -586,18 +586,14 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
586 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ 586 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
587 587
588 ptl = pmd_lock(mm, pmd); 588 ptl = pmd_lock(mm, pmd);
589 wait_split_huge_page = 0;
590 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ 589 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
591 atomic_long_inc(&mm->nr_ptes); 590 atomic_long_inc(&mm->nr_ptes);
592 pmd_populate(mm, pmd, new); 591 pmd_populate(mm, pmd, new);
593 new = NULL; 592 new = NULL;
594 } else if (unlikely(pmd_trans_splitting(*pmd))) 593 }
595 wait_split_huge_page = 1;
596 spin_unlock(ptl); 594 spin_unlock(ptl);
597 if (new) 595 if (new)
598 pte_free(mm, new); 596 pte_free(mm, new);
599 if (wait_split_huge_page)
600 wait_split_huge_page(vma->anon_vma, pmd);
601 return 0; 597 return 0;
602} 598}
603 599
@@ -613,8 +609,7 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
613 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ 609 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
614 pmd_populate_kernel(&init_mm, pmd, new); 610 pmd_populate_kernel(&init_mm, pmd, new);
615 new = NULL; 611 new = NULL;
616 } else 612 }
617 VM_BUG_ON(pmd_trans_splitting(*pmd));
618 spin_unlock(&init_mm.page_table_lock); 613 spin_unlock(&init_mm.page_table_lock);
619 if (new) 614 if (new)
620 pte_free_kernel(&init_mm, new); 615 pte_free_kernel(&init_mm, new);
@@ -870,7 +865,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
870 page = vm_normal_page(vma, addr, pte); 865 page = vm_normal_page(vma, addr, pte);
871 if (page) { 866 if (page) {
872 get_page(page); 867 get_page(page);
873 page_dup_rmap(page); 868 page_dup_rmap(page, false);
874 rss[mm_counter(page)]++; 869 rss[mm_counter(page)]++;
875 } 870 }
876 871
@@ -955,7 +950,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
955 src_pmd = pmd_offset(src_pud, addr); 950 src_pmd = pmd_offset(src_pud, addr);
956 do { 951 do {
957 next = pmd_addr_end(addr, end); 952 next = pmd_addr_end(addr, end);
958 if (pmd_trans_huge(*src_pmd)) { 953 if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) {
959 int err; 954 int err;
960 VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); 955 VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
961 err = copy_huge_pmd(dst_mm, src_mm, 956 err = copy_huge_pmd(dst_mm, src_mm,
@@ -1118,7 +1113,7 @@ again:
1118 mark_page_accessed(page); 1113 mark_page_accessed(page);
1119 } 1114 }
1120 rss[mm_counter(page)]--; 1115 rss[mm_counter(page)]--;
1121 page_remove_rmap(page); 1116 page_remove_rmap(page, false);
1122 if (unlikely(page_mapcount(page) < 0)) 1117 if (unlikely(page_mapcount(page) < 0))
1123 print_bad_pte(vma, addr, ptent, page); 1118 print_bad_pte(vma, addr, ptent, page);
1124 if (unlikely(!__tlb_remove_page(tlb, page))) { 1119 if (unlikely(!__tlb_remove_page(tlb, page))) {
@@ -1182,7 +1177,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1182 pmd = pmd_offset(pud, addr); 1177 pmd = pmd_offset(pud, addr);
1183 do { 1178 do {
1184 next = pmd_addr_end(addr, end); 1179 next = pmd_addr_end(addr, end);
1185 if (pmd_trans_huge(*pmd)) { 1180 if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
1186 if (next - addr != HPAGE_PMD_SIZE) { 1181 if (next - addr != HPAGE_PMD_SIZE) {
1187#ifdef CONFIG_DEBUG_VM 1182#ifdef CONFIG_DEBUG_VM
1188 if (!rwsem_is_locked(&tlb->mm->mmap_sem)) { 1183 if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
@@ -1193,7 +1188,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1193 BUG(); 1188 BUG();
1194 } 1189 }
1195#endif 1190#endif
1196 split_huge_page_pmd(vma, addr, pmd); 1191 split_huge_pmd(vma, pmd, addr);
1197 } else if (zap_huge_pmd(tlb, vma, pmd, addr)) 1192 } else if (zap_huge_pmd(tlb, vma, pmd, addr))
1198 goto next; 1193 goto next;
1199 /* fall through */ 1194 /* fall through */
@@ -1506,7 +1501,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
1506EXPORT_SYMBOL(vm_insert_page); 1501EXPORT_SYMBOL(vm_insert_page);
1507 1502
1508static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, 1503static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1509 unsigned long pfn, pgprot_t prot) 1504 pfn_t pfn, pgprot_t prot)
1510{ 1505{
1511 struct mm_struct *mm = vma->vm_mm; 1506 struct mm_struct *mm = vma->vm_mm;
1512 int retval; 1507 int retval;
@@ -1522,7 +1517,10 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1522 goto out_unlock; 1517 goto out_unlock;
1523 1518
1524 /* Ok, finally just insert the thing.. */ 1519 /* Ok, finally just insert the thing.. */
1525 entry = pte_mkspecial(pfn_pte(pfn, prot)); 1520 if (pfn_t_devmap(pfn))
1521 entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
1522 else
1523 entry = pte_mkspecial(pfn_t_pte(pfn, prot));
1526 set_pte_at(mm, addr, pte, entry); 1524 set_pte_at(mm, addr, pte, entry);
1527 update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ 1525 update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
1528 1526
@@ -1569,17 +1567,17 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1569 1567
1570 if (addr < vma->vm_start || addr >= vma->vm_end) 1568 if (addr < vma->vm_start || addr >= vma->vm_end)
1571 return -EFAULT; 1569 return -EFAULT;
1572 if (track_pfn_insert(vma, &pgprot, pfn)) 1570 if (track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)))
1573 return -EINVAL; 1571 return -EINVAL;
1574 1572
1575 ret = insert_pfn(vma, addr, pfn, pgprot); 1573 ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot);
1576 1574
1577 return ret; 1575 return ret;
1578} 1576}
1579EXPORT_SYMBOL(vm_insert_pfn); 1577EXPORT_SYMBOL(vm_insert_pfn);
1580 1578
1581int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, 1579int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1582 unsigned long pfn) 1580 pfn_t pfn)
1583{ 1581{
1584 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP)); 1582 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
1585 1583
@@ -1593,10 +1591,10 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1593 * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP 1591 * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
1594 * without pte special, it would there be refcounted as a normal page. 1592 * without pte special, it would there be refcounted as a normal page.
1595 */ 1593 */
1596 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) { 1594 if (!HAVE_PTE_SPECIAL && pfn_t_valid(pfn)) {
1597 struct page *page; 1595 struct page *page;
1598 1596
1599 page = pfn_to_page(pfn); 1597 page = pfn_t_to_page(pfn);
1600 return insert_page(vma, addr, page, vma->vm_page_prot); 1598 return insert_page(vma, addr, page, vma->vm_page_prot);
1601 } 1599 }
1602 return insert_pfn(vma, addr, pfn, vma->vm_page_prot); 1600 return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
@@ -2087,7 +2085,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
2087 cow_user_page(new_page, old_page, address, vma); 2085 cow_user_page(new_page, old_page, address, vma);
2088 } 2086 }
2089 2087
2090 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) 2088 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
2091 goto oom_free_new; 2089 goto oom_free_new;
2092 2090
2093 __SetPageUptodate(new_page); 2091 __SetPageUptodate(new_page);
@@ -2118,8 +2116,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
2118 * thread doing COW. 2116 * thread doing COW.
2119 */ 2117 */
2120 ptep_clear_flush_notify(vma, address, page_table); 2118 ptep_clear_flush_notify(vma, address, page_table);
2121 page_add_new_anon_rmap(new_page, vma, address); 2119 page_add_new_anon_rmap(new_page, vma, address, false);
2122 mem_cgroup_commit_charge(new_page, memcg, false); 2120 mem_cgroup_commit_charge(new_page, memcg, false, false);
2123 lru_cache_add_active_or_unevictable(new_page, vma); 2121 lru_cache_add_active_or_unevictable(new_page, vma);
2124 /* 2122 /*
2125 * We call the notify macro here because, when using secondary 2123 * We call the notify macro here because, when using secondary
@@ -2151,14 +2149,14 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
2151 * mapcount is visible. So transitively, TLBs to 2149 * mapcount is visible. So transitively, TLBs to
2152 * old page will be flushed before it can be reused. 2150 * old page will be flushed before it can be reused.
2153 */ 2151 */
2154 page_remove_rmap(old_page); 2152 page_remove_rmap(old_page, false);
2155 } 2153 }
2156 2154
2157 /* Free the old page.. */ 2155 /* Free the old page.. */
2158 new_page = old_page; 2156 new_page = old_page;
2159 page_copied = 1; 2157 page_copied = 1;
2160 } else { 2158 } else {
2161 mem_cgroup_cancel_charge(new_page, memcg); 2159 mem_cgroup_cancel_charge(new_page, memcg, false);
2162 } 2160 }
2163 2161
2164 if (new_page) 2162 if (new_page)
@@ -2173,7 +2171,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
2173 */ 2171 */
2174 if (page_copied && (vma->vm_flags & VM_LOCKED)) { 2172 if (page_copied && (vma->vm_flags & VM_LOCKED)) {
2175 lock_page(old_page); /* LRU manipulation */ 2173 lock_page(old_page); /* LRU manipulation */
2176 munlock_vma_page(old_page); 2174 if (PageMlocked(old_page))
2175 munlock_vma_page(old_page);
2177 unlock_page(old_page); 2176 unlock_page(old_page);
2178 } 2177 }
2179 page_cache_release(old_page); 2178 page_cache_release(old_page);
@@ -2533,7 +2532,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2533 goto out_page; 2532 goto out_page;
2534 } 2533 }
2535 2534
2536 if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) { 2535 if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) {
2537 ret = VM_FAULT_OOM; 2536 ret = VM_FAULT_OOM;
2538 goto out_page; 2537 goto out_page;
2539 } 2538 }
@@ -2567,7 +2566,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2567 pte = maybe_mkwrite(pte_mkdirty(pte), vma); 2566 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2568 flags &= ~FAULT_FLAG_WRITE; 2567 flags &= ~FAULT_FLAG_WRITE;
2569 ret |= VM_FAULT_WRITE; 2568 ret |= VM_FAULT_WRITE;
2570 exclusive = 1; 2569 exclusive = RMAP_EXCLUSIVE;
2571 } 2570 }
2572 flush_icache_page(vma, page); 2571 flush_icache_page(vma, page);
2573 if (pte_swp_soft_dirty(orig_pte)) 2572 if (pte_swp_soft_dirty(orig_pte))
@@ -2575,10 +2574,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2575 set_pte_at(mm, address, page_table, pte); 2574 set_pte_at(mm, address, page_table, pte);
2576 if (page == swapcache) { 2575 if (page == swapcache) {
2577 do_page_add_anon_rmap(page, vma, address, exclusive); 2576 do_page_add_anon_rmap(page, vma, address, exclusive);
2578 mem_cgroup_commit_charge(page, memcg, true); 2577 mem_cgroup_commit_charge(page, memcg, true, false);
2579 } else { /* ksm created a completely new copy */ 2578 } else { /* ksm created a completely new copy */
2580 page_add_new_anon_rmap(page, vma, address); 2579 page_add_new_anon_rmap(page, vma, address, false);
2581 mem_cgroup_commit_charge(page, memcg, false); 2580 mem_cgroup_commit_charge(page, memcg, false, false);
2582 lru_cache_add_active_or_unevictable(page, vma); 2581 lru_cache_add_active_or_unevictable(page, vma);
2583 } 2582 }
2584 2583
@@ -2613,7 +2612,7 @@ unlock:
2613out: 2612out:
2614 return ret; 2613 return ret;
2615out_nomap: 2614out_nomap:
2616 mem_cgroup_cancel_charge(page, memcg); 2615 mem_cgroup_cancel_charge(page, memcg, false);
2617 pte_unmap_unlock(page_table, ptl); 2616 pte_unmap_unlock(page_table, ptl);
2618out_page: 2617out_page:
2619 unlock_page(page); 2618 unlock_page(page);
@@ -2707,7 +2706,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2707 if (!page) 2706 if (!page)
2708 goto oom; 2707 goto oom;
2709 2708
2710 if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) 2709 if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false))
2711 goto oom_free_page; 2710 goto oom_free_page;
2712 2711
2713 /* 2712 /*
@@ -2728,15 +2727,15 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2728 /* Deliver the page fault to userland, check inside PT lock */ 2727 /* Deliver the page fault to userland, check inside PT lock */
2729 if (userfaultfd_missing(vma)) { 2728 if (userfaultfd_missing(vma)) {
2730 pte_unmap_unlock(page_table, ptl); 2729 pte_unmap_unlock(page_table, ptl);
2731 mem_cgroup_cancel_charge(page, memcg); 2730 mem_cgroup_cancel_charge(page, memcg, false);
2732 page_cache_release(page); 2731 page_cache_release(page);
2733 return handle_userfault(vma, address, flags, 2732 return handle_userfault(vma, address, flags,
2734 VM_UFFD_MISSING); 2733 VM_UFFD_MISSING);
2735 } 2734 }
2736 2735
2737 inc_mm_counter_fast(mm, MM_ANONPAGES); 2736 inc_mm_counter_fast(mm, MM_ANONPAGES);
2738 page_add_new_anon_rmap(page, vma, address); 2737 page_add_new_anon_rmap(page, vma, address, false);
2739 mem_cgroup_commit_charge(page, memcg, false); 2738 mem_cgroup_commit_charge(page, memcg, false, false);
2740 lru_cache_add_active_or_unevictable(page, vma); 2739 lru_cache_add_active_or_unevictable(page, vma);
2741setpte: 2740setpte:
2742 set_pte_at(mm, address, page_table, entry); 2741 set_pte_at(mm, address, page_table, entry);
@@ -2747,7 +2746,7 @@ unlock:
2747 pte_unmap_unlock(page_table, ptl); 2746 pte_unmap_unlock(page_table, ptl);
2748 return 0; 2747 return 0;
2749release: 2748release:
2750 mem_cgroup_cancel_charge(page, memcg); 2749 mem_cgroup_cancel_charge(page, memcg, false);
2751 page_cache_release(page); 2750 page_cache_release(page);
2752 goto unlock; 2751 goto unlock;
2753oom_free_page: 2752oom_free_page:
@@ -2824,7 +2823,7 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address,
2824 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2823 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2825 if (anon) { 2824 if (anon) {
2826 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); 2825 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
2827 page_add_new_anon_rmap(page, vma, address); 2826 page_add_new_anon_rmap(page, vma, address, false);
2828 } else { 2827 } else {
2829 inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); 2828 inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
2830 page_add_file_rmap(page); 2829 page_add_file_rmap(page);
@@ -3000,7 +2999,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3000 if (!new_page) 2999 if (!new_page)
3001 return VM_FAULT_OOM; 3000 return VM_FAULT_OOM;
3002 3001
3003 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) { 3002 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) {
3004 page_cache_release(new_page); 3003 page_cache_release(new_page);
3005 return VM_FAULT_OOM; 3004 return VM_FAULT_OOM;
3006 } 3005 }
@@ -3029,7 +3028,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3029 goto uncharge_out; 3028 goto uncharge_out;
3030 } 3029 }
3031 do_set_pte(vma, address, new_page, pte, true, true); 3030 do_set_pte(vma, address, new_page, pte, true, true);
3032 mem_cgroup_commit_charge(new_page, memcg, false); 3031 mem_cgroup_commit_charge(new_page, memcg, false, false);
3033 lru_cache_add_active_or_unevictable(new_page, vma); 3032 lru_cache_add_active_or_unevictable(new_page, vma);
3034 pte_unmap_unlock(pte, ptl); 3033 pte_unmap_unlock(pte, ptl);
3035 if (fault_page) { 3034 if (fault_page) {
@@ -3044,7 +3043,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3044 } 3043 }
3045 return ret; 3044 return ret;
3046uncharge_out: 3045uncharge_out:
3047 mem_cgroup_cancel_charge(new_page, memcg); 3046 mem_cgroup_cancel_charge(new_page, memcg, false);
3048 page_cache_release(new_page); 3047 page_cache_release(new_page);
3049 return ret; 3048 return ret;
3050} 3049}
@@ -3096,7 +3095,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3096 * pinned by vma->vm_file's reference. We rely on unlock_page()'s 3095 * pinned by vma->vm_file's reference. We rely on unlock_page()'s
3097 * release semantics to prevent the compiler from undoing this copying. 3096 * release semantics to prevent the compiler from undoing this copying.
3098 */ 3097 */
3099 mapping = fault_page->mapping; 3098 mapping = page_rmapping(fault_page);
3100 unlock_page(fault_page); 3099 unlock_page(fault_page);
3101 if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) { 3100 if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
3102 /* 3101 /*
@@ -3198,6 +3197,12 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3198 return 0; 3197 return 0;
3199 } 3198 }
3200 3199
3200 /* TODO: handle PTE-mapped THP */
3201 if (PageCompound(page)) {
3202 pte_unmap_unlock(ptep, ptl);
3203 return 0;
3204 }
3205
3201 /* 3206 /*
3202 * Avoid grouping on RO pages in general. RO pages shouldn't hurt as 3207 * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
3203 * much anyway since they can be in shared cache state. This misses 3208 * much anyway since they can be in shared cache state. This misses
@@ -3370,17 +3375,9 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3370 int ret; 3375 int ret;
3371 3376
3372 barrier(); 3377 barrier();
3373 if (pmd_trans_huge(orig_pmd)) { 3378 if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
3374 unsigned int dirty = flags & FAULT_FLAG_WRITE; 3379 unsigned int dirty = flags & FAULT_FLAG_WRITE;
3375 3380
3376 /*
3377 * If the pmd is splitting, return and retry the
3378 * the fault. Alternative: wait until the split
3379 * is done, and goto retry.
3380 */
3381 if (pmd_trans_splitting(orig_pmd))
3382 return 0;
3383
3384 if (pmd_protnone(orig_pmd)) 3381 if (pmd_protnone(orig_pmd))
3385 return do_huge_pmd_numa_page(mm, vma, address, 3382 return do_huge_pmd_numa_page(mm, vma, address,
3386 orig_pmd, pmd); 3383 orig_pmd, pmd);
@@ -3407,7 +3404,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3407 unlikely(__pte_alloc(mm, vma, pmd, address))) 3404 unlikely(__pte_alloc(mm, vma, pmd, address)))
3408 return VM_FAULT_OOM; 3405 return VM_FAULT_OOM;
3409 /* if an huge pmd materialized from under us just retry later */ 3406 /* if an huge pmd materialized from under us just retry later */
3410 if (unlikely(pmd_trans_huge(*pmd))) 3407 if (unlikely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
3411 return 0; 3408 return 0;
3412 /* 3409 /*
3413 * A regular pmd is established and it can't morph into a huge pmd 3410 * A regular pmd is established and it can't morph into a huge pmd
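
In the memory.c hunks above, insert_pfn() and vm_insert_mixed() now take a pfn_t, a pfn with type flags folded into one word, so the devmap case can be told apart from a plain special mapping by value alone. A toy model of that encoding (the flag bit positions are illustrative, not the kernel's exact layout):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* toy pfn_t: a pfn with type flags packed into the top bits (illustrative positions) */
#define TOY_PFN_DEV	(1ULL << 62)
#define TOY_PFN_MAP	(1ULL << 61)

typedef struct { uint64_t val; } toy_pfn_t;

static toy_pfn_t toy_pfn_to_pfn_t(uint64_t pfn, uint64_t flags)
{
	toy_pfn_t p = { pfn | flags };
	return p;
}

/* a devmap pfn carries both the DEV and MAP flags, mirroring pfn_t_devmap() */
static bool toy_pfn_t_devmap(toy_pfn_t p)
{
	return (p.val & TOY_PFN_DEV) && (p.val & TOY_PFN_MAP);
}

int main(void)
{
	toy_pfn_t dev = toy_pfn_to_pfn_t(0x12345, TOY_PFN_DEV | TOY_PFN_MAP);
	toy_pfn_t mem = toy_pfn_to_pfn_t(0x12345, 0);

	/* insert_pfn() picks pte_mkdevmap() for the first kind, pte_mkspecial() for the second */
	printf("devmap? %d %d\n", toy_pfn_t_devmap(dev), toy_pfn_t_devmap(mem));
	return 0;
}
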
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 92f95952692b..4af58a3a8ffa 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -17,6 +17,7 @@
17#include <linux/sysctl.h> 17#include <linux/sysctl.h>
18#include <linux/cpu.h> 18#include <linux/cpu.h>
19#include <linux/memory.h> 19#include <linux/memory.h>
20#include <linux/memremap.h>
20#include <linux/memory_hotplug.h> 21#include <linux/memory_hotplug.h>
21#include <linux/highmem.h> 22#include <linux/highmem.h>
22#include <linux/vmalloc.h> 23#include <linux/vmalloc.h>
@@ -506,10 +507,25 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
506 unsigned long i; 507 unsigned long i;
507 int err = 0; 508 int err = 0;
508 int start_sec, end_sec; 509 int start_sec, end_sec;
510 struct vmem_altmap *altmap;
511
509 /* during initialize mem_map, align hot-added range to section */ 512 /* during initialize mem_map, align hot-added range to section */
510 start_sec = pfn_to_section_nr(phys_start_pfn); 513 start_sec = pfn_to_section_nr(phys_start_pfn);
511 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); 514 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
512 515
516 altmap = to_vmem_altmap((unsigned long) pfn_to_page(phys_start_pfn));
517 if (altmap) {
518 /*
519 * Validate altmap is within bounds of the total request
520 */
521 if (altmap->base_pfn != phys_start_pfn
522 || vmem_altmap_offset(altmap) > nr_pages) {
523 pr_warn_once("memory add fail, invalid altmap\n");
524 return -EINVAL;
525 }
526 altmap->alloc = 0;
527 }
528
513 for (i = start_sec; i <= end_sec; i++) { 529 for (i = start_sec; i <= end_sec; i++) {
514 err = __add_section(nid, zone, section_nr_to_pfn(i)); 530 err = __add_section(nid, zone, section_nr_to_pfn(i));
515 531
@@ -731,7 +747,8 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn)
731 pgdat_resize_unlock(zone->zone_pgdat, &flags); 747 pgdat_resize_unlock(zone->zone_pgdat, &flags);
732} 748}
733 749
734static int __remove_section(struct zone *zone, struct mem_section *ms) 750static int __remove_section(struct zone *zone, struct mem_section *ms,
751 unsigned long map_offset)
735{ 752{
736 unsigned long start_pfn; 753 unsigned long start_pfn;
737 int scn_nr; 754 int scn_nr;
@@ -748,7 +765,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms)
748 start_pfn = section_nr_to_pfn(scn_nr); 765 start_pfn = section_nr_to_pfn(scn_nr);
749 __remove_zone(zone, start_pfn); 766 __remove_zone(zone, start_pfn);
750 767
751 sparse_remove_one_section(zone, ms); 768 sparse_remove_one_section(zone, ms, map_offset);
752 return 0; 769 return 0;
753} 770}
754 771
@@ -767,9 +784,32 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
767 unsigned long nr_pages) 784 unsigned long nr_pages)
768{ 785{
769 unsigned long i; 786 unsigned long i;
770 int sections_to_remove; 787 unsigned long map_offset = 0;
771 resource_size_t start, size; 788 int sections_to_remove, ret = 0;
772 int ret = 0; 789
790 /* In the ZONE_DEVICE case device driver owns the memory region */
791 if (is_dev_zone(zone)) {
792 struct page *page = pfn_to_page(phys_start_pfn);
793 struct vmem_altmap *altmap;
794
795 altmap = to_vmem_altmap((unsigned long) page);
796 if (altmap)
797 map_offset = vmem_altmap_offset(altmap);
798 } else {
799 resource_size_t start, size;
800
801 start = phys_start_pfn << PAGE_SHIFT;
802 size = nr_pages * PAGE_SIZE;
803
804 ret = release_mem_region_adjustable(&iomem_resource, start,
805 size);
806 if (ret) {
807 resource_size_t endres = start + size - 1;
808
809 pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
810 &start, &endres, ret);
811 }
812 }
773 813
774 /* 814 /*
775 * We can only remove entire sections 815 * We can only remove entire sections
@@ -777,23 +817,12 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
777 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); 817 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
778 BUG_ON(nr_pages % PAGES_PER_SECTION); 818 BUG_ON(nr_pages % PAGES_PER_SECTION);
779 819
780 start = phys_start_pfn << PAGE_SHIFT;
781 size = nr_pages * PAGE_SIZE;
782
783 /* in the ZONE_DEVICE case device driver owns the memory region */
784 if (!is_dev_zone(zone))
785 ret = release_mem_region_adjustable(&iomem_resource, start, size);
786 if (ret) {
787 resource_size_t endres = start + size - 1;
788
789 pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
790 &start, &endres, ret);
791 }
792
793 sections_to_remove = nr_pages / PAGES_PER_SECTION; 820 sections_to_remove = nr_pages / PAGES_PER_SECTION;
794 for (i = 0; i < sections_to_remove; i++) { 821 for (i = 0; i < sections_to_remove; i++) {
795 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; 822 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
796 ret = __remove_section(zone, __pfn_to_section(pfn)); 823
824 ret = __remove_section(zone, __pfn_to_section(pfn), map_offset);
825 map_offset = 0;
797 if (ret) 826 if (ret)
798 break; 827 break;
799 } 828 }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d8caff071a30..27d135408a22 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -489,14 +489,33 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
489 struct page *page; 489 struct page *page;
490 struct queue_pages *qp = walk->private; 490 struct queue_pages *qp = walk->private;
491 unsigned long flags = qp->flags; 491 unsigned long flags = qp->flags;
492 int nid; 492 int nid, ret;
493 pte_t *pte; 493 pte_t *pte;
494 spinlock_t *ptl; 494 spinlock_t *ptl;
495 495
496 split_huge_page_pmd(vma, addr, pmd); 496 if (pmd_trans_huge(*pmd)) {
497 if (pmd_trans_unstable(pmd)) 497 ptl = pmd_lock(walk->mm, pmd);
498 return 0; 498 if (pmd_trans_huge(*pmd)) {
499 page = pmd_page(*pmd);
500 if (is_huge_zero_page(page)) {
501 spin_unlock(ptl);
502 split_huge_pmd(vma, pmd, addr);
503 } else {
504 get_page(page);
505 spin_unlock(ptl);
506 lock_page(page);
507 ret = split_huge_page(page);
508 unlock_page(page);
509 put_page(page);
510 if (ret)
511 return 0;
512 }
513 } else {
514 spin_unlock(ptl);
515 }
516 }
499 517
518retry:
500 pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); 519 pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
501 for (; addr != end; pte++, addr += PAGE_SIZE) { 520 for (; addr != end; pte++, addr += PAGE_SIZE) {
502 if (!pte_present(*pte)) 521 if (!pte_present(*pte))
@@ -513,6 +532,21 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
513 nid = page_to_nid(page); 532 nid = page_to_nid(page);
514 if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) 533 if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
515 continue; 534 continue;
535 if (PageTail(page) && PageAnon(page)) {
536 get_page(page);
537 pte_unmap_unlock(pte, ptl);
538 lock_page(page);
539 ret = split_huge_page(page);
540 unlock_page(page);
541 put_page(page);
542 /* Failed to split -- skip. */
543 if (ret) {
544 pte = pte_offset_map_lock(walk->mm, pmd,
545 addr, &ptl);
546 continue;
547 }
548 goto retry;
549 }
516 550
517 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) 551 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
518 migrate_page_add(page, qp->pagelist, flags); 552 migrate_page_add(page, qp->pagelist, flags);
@@ -610,7 +644,8 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
610 644
611 if (flags & MPOL_MF_LAZY) { 645 if (flags & MPOL_MF_LAZY) {
612 /* Similar to task_numa_work, skip inaccessible VMAs */ 646 /* Similar to task_numa_work, skip inaccessible VMAs */
613 if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) 647 if (vma_migratable(vma) &&
648 vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
614 change_prot_numa(vma, start, endvma); 649 change_prot_numa(vma, start, endvma);
615 return 1; 650 return 1;
616 } 651 }
diff --git a/mm/migrate.c b/mm/migrate.c
index 7890d0bb5e23..b1034f9c77e7 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -165,9 +165,9 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
165 if (PageAnon(new)) 165 if (PageAnon(new))
166 hugepage_add_anon_rmap(new, vma, addr); 166 hugepage_add_anon_rmap(new, vma, addr);
167 else 167 else
168 page_dup_rmap(new); 168 page_dup_rmap(new, true);
169 } else if (PageAnon(new)) 169 } else if (PageAnon(new))
170 page_add_anon_rmap(new, vma, addr); 170 page_add_anon_rmap(new, vma, addr, false);
171 else 171 else
172 page_add_file_rmap(new); 172 page_add_file_rmap(new);
173 173
@@ -943,9 +943,13 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
943 goto out; 943 goto out;
944 } 944 }
945 945
946 if (unlikely(PageTransHuge(page))) 946 if (unlikely(PageTransHuge(page))) {
947 if (unlikely(split_huge_page(page))) 947 lock_page(page);
948 rc = split_huge_page(page);
949 unlock_page(page);
950 if (rc)
948 goto out; 951 goto out;
952 }
949 953
950 rc = __unmap_and_move(page, newpage, force, mode); 954 rc = __unmap_and_move(page, newpage, force, mode);
951 if (rc == MIGRATEPAGE_SUCCESS) 955 if (rc == MIGRATEPAGE_SUCCESS)
@@ -1756,6 +1760,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1756 HPAGE_PMD_ORDER); 1760 HPAGE_PMD_ORDER);
1757 if (!new_page) 1761 if (!new_page)
1758 goto out_fail; 1762 goto out_fail;
1763 prep_transhuge_page(new_page);
1759 1764
1760 isolated = numamigrate_isolate_page(pgdat, page); 1765 isolated = numamigrate_isolate_page(pgdat, page);
1761 if (!isolated) { 1766 if (!isolated) {
@@ -1767,7 +1772,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1767 flush_tlb_range(vma, mmun_start, mmun_end); 1772 flush_tlb_range(vma, mmun_start, mmun_end);
1768 1773
1769 /* Prepare a page as a migration target */ 1774 /* Prepare a page as a migration target */
1770 __set_page_locked(new_page); 1775 __SetPageLocked(new_page);
1771 SetPageSwapBacked(new_page); 1776 SetPageSwapBacked(new_page);
1772 1777
1773 /* anon mapping, we can simply copy page->mapping to the new page: */ 1778 /* anon mapping, we can simply copy page->mapping to the new page: */
@@ -1815,7 +1820,7 @@ fail_putback:
1815 * guarantee the copy is visible before the pagetable update. 1820 * guarantee the copy is visible before the pagetable update.
1816 */ 1821 */
1817 flush_cache_range(vma, mmun_start, mmun_end); 1822 flush_cache_range(vma, mmun_start, mmun_end);
1818 page_add_anon_rmap(new_page, vma, mmun_start); 1823 page_add_anon_rmap(new_page, vma, mmun_start, true);
1819 pmdp_huge_clear_flush_notify(vma, mmun_start, pmd); 1824 pmdp_huge_clear_flush_notify(vma, mmun_start, pmd);
1820 set_pmd_at(mm, mmun_start, pmd, entry); 1825 set_pmd_at(mm, mmun_start, pmd, entry);
1821 flush_tlb_range(vma, mmun_start, mmun_end); 1826 flush_tlb_range(vma, mmun_start, mmun_end);
@@ -1826,14 +1831,14 @@ fail_putback:
1826 flush_tlb_range(vma, mmun_start, mmun_end); 1831 flush_tlb_range(vma, mmun_start, mmun_end);
1827 mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); 1832 mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
1828 update_mmu_cache_pmd(vma, address, &entry); 1833 update_mmu_cache_pmd(vma, address, &entry);
1829 page_remove_rmap(new_page); 1834 page_remove_rmap(new_page, true);
1830 goto fail_putback; 1835 goto fail_putback;
1831 } 1836 }
1832 1837
1833 mlock_migrate_page(new_page, page); 1838 mlock_migrate_page(new_page, page);
1834 set_page_memcg(new_page, page_memcg(page)); 1839 set_page_memcg(new_page, page_memcg(page));
1835 set_page_memcg(page, NULL); 1840 set_page_memcg(page, NULL);
1836 page_remove_rmap(page); 1841 page_remove_rmap(page, true);
1837 1842
1838 spin_unlock(ptl); 1843 spin_unlock(ptl);
1839 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1844 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
diff --git a/mm/mincore.c b/mm/mincore.c
index 14bb9fb37f0c..2a565ed8bb49 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -117,7 +117,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
117 unsigned char *vec = walk->private; 117 unsigned char *vec = walk->private;
118 int nr = (end - addr) >> PAGE_SHIFT; 118 int nr = (end - addr) >> PAGE_SHIFT;
119 119
120 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 120 if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
121 memset(vec, 1, nr); 121 memset(vec, 1, nr);
122 spin_unlock(ptl); 122 spin_unlock(ptl);
123 goto out; 123 goto out;
diff --git a/mm/mlock.c b/mm/mlock.c
index 9cb87cbc4071..e1e2b1207bf2 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -24,13 +24,13 @@
24 24
25#include "internal.h" 25#include "internal.h"
26 26
27int can_do_mlock(void) 27bool can_do_mlock(void)
28{ 28{
29 if (rlimit(RLIMIT_MEMLOCK) != 0) 29 if (rlimit(RLIMIT_MEMLOCK) != 0)
30 return 1; 30 return true;
31 if (capable(CAP_IPC_LOCK)) 31 if (capable(CAP_IPC_LOCK))
32 return 1; 32 return true;
33 return 0; 33 return false;
34} 34}
35EXPORT_SYMBOL(can_do_mlock); 35EXPORT_SYMBOL(can_do_mlock);
36 36
@@ -82,6 +82,9 @@ void mlock_vma_page(struct page *page)
82 /* Serialize with page migration */ 82 /* Serialize with page migration */
83 BUG_ON(!PageLocked(page)); 83 BUG_ON(!PageLocked(page));
84 84
85 VM_BUG_ON_PAGE(PageTail(page), page);
86 VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
87
85 if (!TestSetPageMlocked(page)) { 88 if (!TestSetPageMlocked(page)) {
86 mod_zone_page_state(page_zone(page), NR_MLOCK, 89 mod_zone_page_state(page_zone(page), NR_MLOCK,
87 hpage_nr_pages(page)); 90 hpage_nr_pages(page));
@@ -178,6 +181,8 @@ unsigned int munlock_vma_page(struct page *page)
178 /* For try_to_munlock() and to serialize with page migration */ 181 /* For try_to_munlock() and to serialize with page migration */
179 BUG_ON(!PageLocked(page)); 182 BUG_ON(!PageLocked(page));
180 183
184 VM_BUG_ON_PAGE(PageTail(page), page);
185
181 /* 186 /*
182 * Serialize with any parallel __split_huge_page_refcount() which 187 * Serialize with any parallel __split_huge_page_refcount() which
183 * might otherwise copy PageMlocked to part of the tail pages before 188 * might otherwise copy PageMlocked to part of the tail pages before
@@ -388,6 +393,13 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
388 if (!page || page_zone_id(page) != zoneid) 393 if (!page || page_zone_id(page) != zoneid)
389 break; 394 break;
390 395
396 /*
397 * Do not use pagevec for PTE-mapped THP,
398 * munlock_vma_pages_range() will handle them.
399 */
400 if (PageTransCompound(page))
401 break;
402
391 get_page(page); 403 get_page(page);
392 /* 404 /*
393 * Increase the address that will be returned *before* the 405 * Increase the address that will be returned *before* the
@@ -444,7 +456,10 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
444 &page_mask); 456 &page_mask);
445 457
446 if (page && !IS_ERR(page)) { 458 if (page && !IS_ERR(page)) {
447 if (PageTransHuge(page)) { 459 if (PageTransTail(page)) {
460 VM_BUG_ON_PAGE(PageMlocked(page), page);
461 put_page(page); /* follow_page_mask() */
462 } else if (PageTransHuge(page)) {
448 lock_page(page); 463 lock_page(page);
449 /* 464 /*
450 * Any THP page found by follow_page_mask() may 465 * Any THP page found by follow_page_mask() may
@@ -477,8 +492,6 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
477 goto next; 492 goto next;
478 } 493 }
479 } 494 }
480 /* It's a bug to munlock in the middle of a THP page */
481 VM_BUG_ON((start >> PAGE_SHIFT) & page_mask);
482 page_increm = 1 + page_mask; 495 page_increm = 1 + page_mask;
483 start += page_increm * PAGE_SIZE; 496 start += page_increm * PAGE_SIZE;
484next: 497next:
diff --git a/mm/mmap.c b/mm/mmap.c
index b3f00b616b81..84b12624ceb0 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3184,10 +3184,16 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
3184 * mapping->flags avoid to take the same lock twice, if more than one 3184 * mapping->flags avoid to take the same lock twice, if more than one
3185 * vma in this mm is backed by the same anon_vma or address_space. 3185 * vma in this mm is backed by the same anon_vma or address_space.
3186 * 3186 *
3187 * We can take all the locks in random order because the VM code 3187 * We take locks in the following order, according to the comment at the beginning
3188 * taking i_mmap_rwsem or anon_vma->rwsem outside the mmap_sem never 3188 * of mm/rmap.c:
3189 * takes more than one of them in a row. Secondly we're protected 3189 * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
3190 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. 3190 * hugetlb mapping);
3191 * - all i_mmap_rwsem locks;
3192 * - all anon_vma->rwsem
3193 *
3194 * We can take all locks within these types randomly because the VM code
3195 * doesn't nest them and we are protected from parallel mm_take_all_locks() by
3196 * mm_all_locks_mutex.
3191 * 3197 *
3192 * mm_take_all_locks() and mm_drop_all_locks are expensive operations 3198 * mm_take_all_locks() and mm_drop_all_locks are expensive operations
3193 * that may have to take thousands of locks. 3199 * that may have to take thousands of locks.
@@ -3206,7 +3212,16 @@ int mm_take_all_locks(struct mm_struct *mm)
3206 for (vma = mm->mmap; vma; vma = vma->vm_next) { 3212 for (vma = mm->mmap; vma; vma = vma->vm_next) {
3207 if (signal_pending(current)) 3213 if (signal_pending(current))
3208 goto out_unlock; 3214 goto out_unlock;
3209 if (vma->vm_file && vma->vm_file->f_mapping) 3215 if (vma->vm_file && vma->vm_file->f_mapping &&
3216 is_vm_hugetlb_page(vma))
3217 vm_lock_mapping(mm, vma->vm_file->f_mapping);
3218 }
3219
3220 for (vma = mm->mmap; vma; vma = vma->vm_next) {
3221 if (signal_pending(current))
3222 goto out_unlock;
3223 if (vma->vm_file && vma->vm_file->f_mapping &&
3224 !is_vm_hugetlb_page(vma))
3210 vm_lock_mapping(mm, vma->vm_file->f_mapping); 3225 vm_lock_mapping(mm, vma->vm_file->f_mapping);
3211 } 3226 }
3212 3227
diff --git a/mm/mprotect.c b/mm/mprotect.c
index c764402c464f..8eb7bb40dc40 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -149,7 +149,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
149 unsigned long this_pages; 149 unsigned long this_pages;
150 150
151 next = pmd_addr_end(addr, end); 151 next = pmd_addr_end(addr, end);
152 if (!pmd_trans_huge(*pmd) && pmd_none_or_clear_bad(pmd)) 152 if (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)
153 && pmd_none_or_clear_bad(pmd))
153 continue; 154 continue;
154 155
155 /* invoke the mmu notifier if the pmd is populated */ 156 /* invoke the mmu notifier if the pmd is populated */
@@ -158,9 +159,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
158 mmu_notifier_invalidate_range_start(mm, mni_start, end); 159 mmu_notifier_invalidate_range_start(mm, mni_start, end);
159 } 160 }
160 161
161 if (pmd_trans_huge(*pmd)) { 162 if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
162 if (next - addr != HPAGE_PMD_SIZE) 163 if (next - addr != HPAGE_PMD_SIZE)
163 split_huge_page_pmd(vma, addr, pmd); 164 split_huge_pmd(vma, pmd, addr);
164 else { 165 else {
165 int nr_ptes = change_huge_pmd(vma, pmd, addr, 166 int nr_ptes = change_huge_pmd(vma, pmd, addr,
166 newprot, prot_numa); 167 newprot, prot_numa);
diff --git a/mm/mremap.c b/mm/mremap.c
index e55b157865d5..d77946a997f7 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -192,25 +192,24 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
192 if (!new_pmd) 192 if (!new_pmd)
193 break; 193 break;
194 if (pmd_trans_huge(*old_pmd)) { 194 if (pmd_trans_huge(*old_pmd)) {
195 int err = 0;
196 if (extent == HPAGE_PMD_SIZE) { 195 if (extent == HPAGE_PMD_SIZE) {
196 bool moved;
197 VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma, 197 VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma,
198 vma); 198 vma);
199 /* See comment in move_ptes() */ 199 /* See comment in move_ptes() */
200 if (need_rmap_locks) 200 if (need_rmap_locks)
201 anon_vma_lock_write(vma->anon_vma); 201 anon_vma_lock_write(vma->anon_vma);
202 err = move_huge_pmd(vma, new_vma, old_addr, 202 moved = move_huge_pmd(vma, new_vma, old_addr,
203 new_addr, old_end, 203 new_addr, old_end,
204 old_pmd, new_pmd); 204 old_pmd, new_pmd);
205 if (need_rmap_locks) 205 if (need_rmap_locks)
206 anon_vma_unlock_write(vma->anon_vma); 206 anon_vma_unlock_write(vma->anon_vma);
207 if (moved) {
208 need_flush = true;
209 continue;
210 }
207 } 211 }
208 if (err > 0) { 212 split_huge_pmd(vma, old_pmd, old_addr);
209 need_flush = true;
210 continue;
211 } else if (!err) {
212 split_huge_page_pmd(vma, old_addr, old_pmd);
213 }
214 VM_BUG_ON(pmd_trans_huge(*old_pmd)); 213 VM_BUG_ON(pmd_trans_huge(*old_pmd));
215 } 214 }
216 if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma, 215 if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ce63d603820f..63358d9f9aa9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -43,6 +43,7 @@
43#include <linux/vmalloc.h> 43#include <linux/vmalloc.h>
44#include <linux/vmstat.h> 44#include <linux/vmstat.h>
45#include <linux/mempolicy.h> 45#include <linux/mempolicy.h>
46#include <linux/memremap.h>
46#include <linux/stop_machine.h> 47#include <linux/stop_machine.h>
47#include <linux/sort.h> 48#include <linux/sort.h>
48#include <linux/pfn.h> 49#include <linux/pfn.h>
@@ -222,13 +223,15 @@ static char * const zone_names[MAX_NR_ZONES] = {
222#endif 223#endif
223}; 224};
224 225
225static void free_compound_page(struct page *page);
226compound_page_dtor * const compound_page_dtors[] = { 226compound_page_dtor * const compound_page_dtors[] = {
227 NULL, 227 NULL,
228 free_compound_page, 228 free_compound_page,
229#ifdef CONFIG_HUGETLB_PAGE 229#ifdef CONFIG_HUGETLB_PAGE
230 free_huge_page, 230 free_huge_page,
231#endif 231#endif
232#ifdef CONFIG_TRANSPARENT_HUGEPAGE
233 free_transhuge_page,
234#endif
232}; 235};
233 236
234int min_free_kbytes = 1024; 237int min_free_kbytes = 1024;
@@ -450,7 +453,7 @@ out:
450 * This usage means that zero-order pages may not be compound. 453 * This usage means that zero-order pages may not be compound.
451 */ 454 */
452 455
453static void free_compound_page(struct page *page) 456void free_compound_page(struct page *page)
454{ 457{
455 __free_pages_ok(page, compound_order(page)); 458 __free_pages_ok(page, compound_order(page));
456} 459}
@@ -466,8 +469,10 @@ void prep_compound_page(struct page *page, unsigned int order)
466 for (i = 1; i < nr_pages; i++) { 469 for (i = 1; i < nr_pages; i++) {
467 struct page *p = page + i; 470 struct page *p = page + i;
468 set_page_count(p, 0); 471 set_page_count(p, 0);
472 p->mapping = TAIL_MAPPING;
469 set_compound_head(p, page); 473 set_compound_head(p, page);
470 } 474 }
475 atomic_set(compound_mapcount_ptr(page), -1);
471} 476}
472 477
473#ifdef CONFIG_DEBUG_PAGEALLOC 478#ifdef CONFIG_DEBUG_PAGEALLOC
@@ -732,7 +737,7 @@ static inline int free_pages_check(struct page *page)
732 const char *bad_reason = NULL; 737 const char *bad_reason = NULL;
733 unsigned long bad_flags = 0; 738 unsigned long bad_flags = 0;
734 739
735 if (unlikely(page_mapcount(page))) 740 if (unlikely(atomic_read(&page->_mapcount) != -1))
736 bad_reason = "nonzero mapcount"; 741 bad_reason = "nonzero mapcount";
737 if (unlikely(page->mapping != NULL)) 742 if (unlikely(page->mapping != NULL))
738 bad_reason = "non-NULL mapping"; 743 bad_reason = "non-NULL mapping";
@@ -856,6 +861,27 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
856 ret = 0; 861 ret = 0;
857 goto out; 862 goto out;
858 } 863 }
864 switch (page - head_page) {
865 case 1:
866 /* the first tail page: ->mapping is compound_mapcount() */
867 if (unlikely(compound_mapcount(page))) {
868 bad_page(page, "nonzero compound_mapcount", 0);
869 goto out;
870 }
871 break;
872 case 2:
873 /*
874 * the second tail page: ->mapping is
875 * page_deferred_list().next -- ignore value.
876 */
877 break;
878 default:
879 if (page->mapping != TAIL_MAPPING) {
880 bad_page(page, "corrupted mapping in tail page", 0);
881 goto out;
882 }
883 break;
884 }
859 if (unlikely(!PageTail(page))) { 885 if (unlikely(!PageTail(page))) {
860 bad_page(page, "PageTail not set", 0); 886 bad_page(page, "PageTail not set", 0);
861 goto out; 887 goto out;
@@ -866,6 +892,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
866 } 892 }
867 ret = 0; 893 ret = 0;
868out: 894out:
895 page->mapping = NULL;
869 clear_compound_head(page); 896 clear_compound_head(page);
870 return ret; 897 return ret;
871} 898}
@@ -1329,7 +1356,7 @@ static inline int check_new_page(struct page *page)
1329 const char *bad_reason = NULL; 1356 const char *bad_reason = NULL;
1330 unsigned long bad_flags = 0; 1357 unsigned long bad_flags = 0;
1331 1358
1332 if (unlikely(page_mapcount(page))) 1359 if (unlikely(atomic_read(&page->_mapcount) != -1))
1333 bad_reason = "nonzero mapcount"; 1360 bad_reason = "nonzero mapcount";
1334 if (unlikely(page->mapping != NULL)) 1361 if (unlikely(page->mapping != NULL))
1335 bad_reason = "non-NULL mapping"; 1362 bad_reason = "non-NULL mapping";
@@ -4459,16 +4486,22 @@ static inline unsigned long wait_table_bits(unsigned long size)
4459void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 4486void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
4460 unsigned long start_pfn, enum memmap_context context) 4487 unsigned long start_pfn, enum memmap_context context)
4461{ 4488{
4462 pg_data_t *pgdat = NODE_DATA(nid); 4489 struct vmem_altmap *altmap = to_vmem_altmap(__pfn_to_phys(start_pfn));
4463 unsigned long end_pfn = start_pfn + size; 4490 unsigned long end_pfn = start_pfn + size;
4491 pg_data_t *pgdat = NODE_DATA(nid);
4464 unsigned long pfn; 4492 unsigned long pfn;
4465 struct zone *z;
4466 unsigned long nr_initialised = 0; 4493 unsigned long nr_initialised = 0;
4467 4494
4468 if (highest_memmap_pfn < end_pfn - 1) 4495 if (highest_memmap_pfn < end_pfn - 1)
4469 highest_memmap_pfn = end_pfn - 1; 4496 highest_memmap_pfn = end_pfn - 1;
4470 4497
4471 z = &pgdat->node_zones[zone]; 4498 /*
4499 * Honor reservation requested by the driver for this ZONE_DEVICE
4500 * memory
4501 */
4502 if (altmap && start_pfn == altmap->base_pfn)
4503 start_pfn += altmap->reserve;
4504
4472 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 4505 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
4473 /* 4506 /*
4474 * There can be holes in boot-time mem_map[]s 4507 * There can be holes in boot-time mem_map[]s
diff --git a/mm/page_idle.c b/mm/page_idle.c
index d5dd79041484..4ea9c4ef5146 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -55,25 +55,26 @@ static int page_idle_clear_pte_refs_one(struct page *page,
55 unsigned long addr, void *arg) 55 unsigned long addr, void *arg)
56{ 56{
57 struct mm_struct *mm = vma->vm_mm; 57 struct mm_struct *mm = vma->vm_mm;
58 spinlock_t *ptl;
59 pmd_t *pmd; 58 pmd_t *pmd;
60 pte_t *pte; 59 pte_t *pte;
60 spinlock_t *ptl;
61 bool referenced = false; 61 bool referenced = false;
62 62
63 if (unlikely(PageTransHuge(page))) { 63 if (!page_check_address_transhuge(page, mm, addr, &pmd, &pte, &ptl))
64 pmd = page_check_address_pmd(page, mm, addr, 64 return SWAP_AGAIN;
65 PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); 65
66 if (pmd) { 66 if (pte) {
67 referenced = pmdp_clear_young_notify(vma, addr, pmd); 67 referenced = ptep_clear_young_notify(vma, addr, pte);
68 spin_unlock(ptl); 68 pte_unmap(pte);
69 } 69 } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
70 referenced = pmdp_clear_young_notify(vma, addr, pmd);
70 } else { 71 } else {
71 pte = page_check_address(page, mm, addr, &ptl, 0); 72 /* unexpected pmd-mapped page? */
72 if (pte) { 73 WARN_ON_ONCE(1);
73 referenced = ptep_clear_young_notify(vma, addr, pte);
74 pte_unmap_unlock(pte, ptl);
75 }
76 } 74 }
75
76 spin_unlock(ptl);
77
77 if (referenced) { 78 if (referenced) {
78 clear_page_idle(page); 79 clear_page_idle(page);
79 /* 80 /*
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 5e139fec6c6c..92c4c36501e7 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -196,8 +196,10 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
196{ 196{
197 unsigned long pfn; 197 unsigned long pfn;
198 struct page *page; 198 struct page *page;
199 BUG_ON((start_pfn) & (pageblock_nr_pages - 1)); 199
200 BUG_ON((end_pfn) & (pageblock_nr_pages - 1)); 200 BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
201 BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
202
201 for (pfn = start_pfn; 203 for (pfn = start_pfn;
202 pfn < end_pfn; 204 pfn < end_pfn;
203 pfn += pageblock_nr_pages) { 205 pfn += pageblock_nr_pages) {
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 29f2f8b853ae..207244489a68 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -58,7 +58,7 @@ again:
58 if (!walk->pte_entry) 58 if (!walk->pte_entry)
59 continue; 59 continue;
60 60
61 split_huge_page_pmd_mm(walk->mm, addr, pmd); 61 split_huge_pmd(walk->vma, pmd, addr);
62 if (pmd_trans_unstable(pmd)) 62 if (pmd_trans_unstable(pmd))
63 goto again; 63 goto again;
64 err = walk_pte_range(pmd, addr, next, walk); 64 err = walk_pte_range(pmd, addr, next, walk);
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 4c681baff363..9d4767698a1c 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -132,25 +132,13 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
132{ 132{
133 pmd_t pmd; 133 pmd_t pmd;
134 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 134 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
135 VM_BUG_ON(!pmd_trans_huge(*pmdp)); 135 VM_BUG_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
136 pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); 136 pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
137 flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); 137 flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
138 return pmd; 138 return pmd;
139} 139}
140#endif 140#endif
141 141
142#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
143void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
144 pmd_t *pmdp)
145{
146 pmd_t pmd = pmd_mksplitting(*pmdp);
147 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
148 set_pmd_at(vma->vm_mm, address, pmdp, pmd);
149 /* tlb flush only to serialize against gup-fast */
150 flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
151}
152#endif
153
154#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT 142#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
155void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, 143void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
156 pgtable_t pgtable) 144 pgtable_t pgtable)
diff --git a/mm/rmap.c b/mm/rmap.c
index 622756c16ac8..79f3bf047f38 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -23,21 +23,22 @@
23 * inode->i_mutex (while writing or truncating, not reading or faulting) 23 * inode->i_mutex (while writing or truncating, not reading or faulting)
24 * mm->mmap_sem 24 * mm->mmap_sem
25 * page->flags PG_locked (lock_page) 25 * page->flags PG_locked (lock_page)
26 * mapping->i_mmap_rwsem 26 * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
27 * anon_vma->rwsem 27 * mapping->i_mmap_rwsem
28 * mm->page_table_lock or pte_lock 28 * anon_vma->rwsem
29 * zone->lru_lock (in mark_page_accessed, isolate_lru_page) 29 * mm->page_table_lock or pte_lock
30 * swap_lock (in swap_duplicate, swap_info_get) 30 * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
31 * mmlist_lock (in mmput, drain_mmlist and others) 31 * swap_lock (in swap_duplicate, swap_info_get)
32 * mapping->private_lock (in __set_page_dirty_buffers) 32 * mmlist_lock (in mmput, drain_mmlist and others)
33 * mem_cgroup_{begin,end}_page_stat (memcg->move_lock) 33 * mapping->private_lock (in __set_page_dirty_buffers)
34 * mapping->tree_lock (widely used) 34 * mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
35 * inode->i_lock (in set_page_dirty's __mark_inode_dirty) 35 * mapping->tree_lock (widely used)
36 * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) 36 * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
37 * sb_lock (within inode_lock in fs/fs-writeback.c) 37 * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
38 * mapping->tree_lock (widely used, in set_page_dirty, 38 * sb_lock (within inode_lock in fs/fs-writeback.c)
39 * in arch-dependent flush_dcache_mmap_lock, 39 * mapping->tree_lock (widely used, in set_page_dirty,
40 * within bdi.wb->list_lock in __sync_single_inode) 40 * in arch-dependent flush_dcache_mmap_lock,
41 * within bdi.wb->list_lock in __sync_single_inode)
41 * 42 *
42 * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon) 43 * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon)
43 * ->tasklist_lock 44 * ->tasklist_lock
@@ -567,27 +568,6 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
567 anon_vma_unlock_read(anon_vma); 568 anon_vma_unlock_read(anon_vma);
568} 569}
569 570
570/*
571 * At what user virtual address is page expected in @vma?
572 */
573static inline unsigned long
574__vma_address(struct page *page, struct vm_area_struct *vma)
575{
576 pgoff_t pgoff = page_to_pgoff(page);
577 return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
578}
579
580inline unsigned long
581vma_address(struct page *page, struct vm_area_struct *vma)
582{
583 unsigned long address = __vma_address(page, vma);
584
585 /* page should be within @vma mapping range */
586 VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
587
588 return address;
589}
590
591#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH 571#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
592static void percpu_flush_tlb_batch_pages(void *data) 572static void percpu_flush_tlb_batch_pages(void *data)
593{ 573{
@@ -819,6 +799,96 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
819 return 1; 799 return 1;
820} 800}
821 801
802#ifdef CONFIG_TRANSPARENT_HUGEPAGE
803/*
804 * Check that @page is mapped at @address into @mm. In contrast to
805 * page_check_address(), this function can handle transparent huge pages.
806 *
807 * On success returns true with pte mapped and locked. For PMD-mapped
808 * transparent huge pages *@ptep is set to NULL.
809 */
810bool page_check_address_transhuge(struct page *page, struct mm_struct *mm,
811 unsigned long address, pmd_t **pmdp,
812 pte_t **ptep, spinlock_t **ptlp)
813{
814 pgd_t *pgd;
815 pud_t *pud;
816 pmd_t *pmd;
817 pte_t *pte;
818 spinlock_t *ptl;
819
820 if (unlikely(PageHuge(page))) {
821 /* when pud is not present, pte will be NULL */
822 pte = huge_pte_offset(mm, address);
823 if (!pte)
824 return false;
825
826 ptl = huge_pte_lockptr(page_hstate(page), mm, pte);
827 pmd = NULL;
828 goto check_pte;
829 }
830
831 pgd = pgd_offset(mm, address);
832 if (!pgd_present(*pgd))
833 return false;
834 pud = pud_offset(pgd, address);
835 if (!pud_present(*pud))
836 return false;
837 pmd = pmd_offset(pud, address);
838
839 if (pmd_trans_huge(*pmd)) {
840 ptl = pmd_lock(mm, pmd);
841 if (!pmd_present(*pmd))
842 goto unlock_pmd;
843 if (unlikely(!pmd_trans_huge(*pmd))) {
844 spin_unlock(ptl);
845 goto map_pte;
846 }
847
848 if (pmd_page(*pmd) != page)
849 goto unlock_pmd;
850
851 pte = NULL;
852 goto found;
853unlock_pmd:
854 spin_unlock(ptl);
855 return false;
856 } else {
857 pmd_t pmde = *pmd;
858
859 barrier();
860 if (!pmd_present(pmde) || pmd_trans_huge(pmde))
861 return false;
862 }
863map_pte:
864 pte = pte_offset_map(pmd, address);
865 if (!pte_present(*pte)) {
866 pte_unmap(pte);
867 return false;
868 }
869
870 ptl = pte_lockptr(mm, pmd);
871check_pte:
872 spin_lock(ptl);
873
874 if (!pte_present(*pte)) {
875 pte_unmap_unlock(pte, ptl);
876 return false;
877 }
878
879 /* THP can be referenced by any subpage */
880 if (pte_pfn(*pte) - page_to_pfn(page) >= hpage_nr_pages(page)) {
881 pte_unmap_unlock(pte, ptl);
882 return false;
883 }
884found:
885 *ptep = pte;
886 *pmdp = pmd;
887 *ptlp = ptl;
888 return true;
889}
890#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
891
822struct page_referenced_arg { 892struct page_referenced_arg {
823 int mapcount; 893 int mapcount;
824 int referenced; 894 int referenced;
@@ -832,49 +902,24 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
832 unsigned long address, void *arg) 902 unsigned long address, void *arg)
833{ 903{
834 struct mm_struct *mm = vma->vm_mm; 904 struct mm_struct *mm = vma->vm_mm;
905 struct page_referenced_arg *pra = arg;
906 pmd_t *pmd;
907 pte_t *pte;
835 spinlock_t *ptl; 908 spinlock_t *ptl;
836 int referenced = 0; 909 int referenced = 0;
837 struct page_referenced_arg *pra = arg;
838
839 if (unlikely(PageTransHuge(page))) {
840 pmd_t *pmd;
841 910
842 /* 911 if (!page_check_address_transhuge(page, mm, address, &pmd, &pte, &ptl))
843 * rmap might return false positives; we must filter 912 return SWAP_AGAIN;
844 * these out using page_check_address_pmd().
845 */
846 pmd = page_check_address_pmd(page, mm, address,
847 PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
848 if (!pmd)
849 return SWAP_AGAIN;
850
851 if (vma->vm_flags & VM_LOCKED) {
852 spin_unlock(ptl);
853 pra->vm_flags |= VM_LOCKED;
854 return SWAP_FAIL; /* To break the loop */
855 }
856 913
857 /* go ahead even if the pmd is pmd_trans_splitting() */ 914 if (vma->vm_flags & VM_LOCKED) {
858 if (pmdp_clear_flush_young_notify(vma, address, pmd)) 915 if (pte)
859 referenced++; 916 pte_unmap(pte);
860 spin_unlock(ptl); 917 spin_unlock(ptl);
861 } else { 918 pra->vm_flags |= VM_LOCKED;
862 pte_t *pte; 919 return SWAP_FAIL; /* To break the loop */
863 920 }
864 /*
865 * rmap might return false positives; we must filter
866 * these out using page_check_address().
867 */
868 pte = page_check_address(page, mm, address, &ptl, 0);
869 if (!pte)
870 return SWAP_AGAIN;
871
872 if (vma->vm_flags & VM_LOCKED) {
873 pte_unmap_unlock(pte, ptl);
874 pra->vm_flags |= VM_LOCKED;
875 return SWAP_FAIL; /* To break the loop */
876 }
877 921
922 if (pte) {
878 if (ptep_clear_flush_young_notify(vma, address, pte)) { 923 if (ptep_clear_flush_young_notify(vma, address, pte)) {
879 /* 924 /*
880 * Don't treat a reference through a sequentially read 925 * Don't treat a reference through a sequentially read
@@ -886,8 +931,15 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
886 if (likely(!(vma->vm_flags & VM_SEQ_READ))) 931 if (likely(!(vma->vm_flags & VM_SEQ_READ)))
887 referenced++; 932 referenced++;
888 } 933 }
889 pte_unmap_unlock(pte, ptl); 934 pte_unmap(pte);
935 } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
936 if (pmdp_clear_flush_young_notify(vma, address, pmd))
937 referenced++;
938 } else {
939 /* unexpected pmd-mapped page? */
940 WARN_ON_ONCE(1);
890 } 941 }
942 spin_unlock(ptl);
891 943
892 if (referenced) 944 if (referenced)
893 clear_page_idle(page); 945 clear_page_idle(page);
@@ -935,7 +987,7 @@ int page_referenced(struct page *page,
935 int ret; 987 int ret;
936 int we_locked = 0; 988 int we_locked = 0;
937 struct page_referenced_arg pra = { 989 struct page_referenced_arg pra = {
938 .mapcount = page_mapcount(page), 990 .mapcount = total_mapcount(page),
939 .memcg = memcg, 991 .memcg = memcg,
940 }; 992 };
941 struct rmap_walk_control rwc = { 993 struct rmap_walk_control rwc = {
@@ -1124,7 +1176,7 @@ static void __page_check_anon_rmap(struct page *page,
1124 * over the call to page_add_new_anon_rmap. 1176 * over the call to page_add_new_anon_rmap.
1125 */ 1177 */
1126 BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root); 1178 BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
1127 BUG_ON(page->index != linear_page_index(vma, address)); 1179 BUG_ON(page_to_pgoff(page) != linear_page_index(vma, address));
1128#endif 1180#endif
1129} 1181}
1130 1182
@@ -1133,6 +1185,7 @@ static void __page_check_anon_rmap(struct page *page,
1133 * @page: the page to add the mapping to 1185 * @page: the page to add the mapping to
1134 * @vma: the vm area in which the mapping is added 1186 * @vma: the vm area in which the mapping is added
1135 * @address: the user virtual address mapped 1187 * @address: the user virtual address mapped
1188 * @compound: charge the page as compound or small page
1136 * 1189 *
1137 * The caller needs to hold the pte lock, and the page must be locked in 1190 * The caller needs to hold the pte lock, and the page must be locked in
1138 * the anon_vma case: to serialize mapping,index checking after setting, 1191 * the anon_vma case: to serialize mapping,index checking after setting,
@@ -1140,9 +1193,9 @@ static void __page_check_anon_rmap(struct page *page,
1140 * (but PageKsm is never downgraded to PageAnon). 1193 * (but PageKsm is never downgraded to PageAnon).
1141 */ 1194 */
1142void page_add_anon_rmap(struct page *page, 1195void page_add_anon_rmap(struct page *page,
1143 struct vm_area_struct *vma, unsigned long address) 1196 struct vm_area_struct *vma, unsigned long address, bool compound)
1144{ 1197{
1145 do_page_add_anon_rmap(page, vma, address, 0); 1198 do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0);
1146} 1199}
1147 1200
1148/* 1201/*
@@ -1151,29 +1204,44 @@ void page_add_anon_rmap(struct page *page,
1151 * Everybody else should continue to use page_add_anon_rmap above. 1204 * Everybody else should continue to use page_add_anon_rmap above.
1152 */ 1205 */
1153void do_page_add_anon_rmap(struct page *page, 1206void do_page_add_anon_rmap(struct page *page,
1154 struct vm_area_struct *vma, unsigned long address, int exclusive) 1207 struct vm_area_struct *vma, unsigned long address, int flags)
1155{ 1208{
1156 int first = atomic_inc_and_test(&page->_mapcount); 1209 bool compound = flags & RMAP_COMPOUND;
1210 bool first;
1211
1212 if (compound) {
1213 atomic_t *mapcount;
1214 VM_BUG_ON_PAGE(!PageLocked(page), page);
1215 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
1216 mapcount = compound_mapcount_ptr(page);
1217 first = atomic_inc_and_test(mapcount);
1218 } else {
1219 first = atomic_inc_and_test(&page->_mapcount);
1220 }
1221
1157 if (first) { 1222 if (first) {
1223 int nr = compound ? hpage_nr_pages(page) : 1;
1158 /* 1224 /*
1159 * We use the irq-unsafe __{inc|mod}_zone_page_stat because 1225 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
1160 * these counters are not modified in interrupt context, and 1226 * these counters are not modified in interrupt context, and
1161 * pte lock(a spinlock) is held, which implies preemption 1227 * pte lock(a spinlock) is held, which implies preemption
1162 * disabled. 1228 * disabled.
1163 */ 1229 */
1164 if (PageTransHuge(page)) 1230 if (compound) {
1165 __inc_zone_page_state(page, 1231 __inc_zone_page_state(page,
1166 NR_ANON_TRANSPARENT_HUGEPAGES); 1232 NR_ANON_TRANSPARENT_HUGEPAGES);
1167 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, 1233 }
1168 hpage_nr_pages(page)); 1234 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr);
1169 } 1235 }
1170 if (unlikely(PageKsm(page))) 1236 if (unlikely(PageKsm(page)))
1171 return; 1237 return;
1172 1238
1173 VM_BUG_ON_PAGE(!PageLocked(page), page); 1239 VM_BUG_ON_PAGE(!PageLocked(page), page);
1240
1174 /* address might be in next vma when migration races vma_adjust */ 1241 /* address might be in next vma when migration races vma_adjust */
1175 if (first) 1242 if (first)
1176 __page_set_anon_rmap(page, vma, address, exclusive); 1243 __page_set_anon_rmap(page, vma, address,
1244 flags & RMAP_EXCLUSIVE);
1177 else 1245 else
1178 __page_check_anon_rmap(page, vma, address); 1246 __page_check_anon_rmap(page, vma, address);
1179} 1247}
@@ -1183,21 +1251,31 @@ void do_page_add_anon_rmap(struct page *page,
1183 * @page: the page to add the mapping to 1251 * @page: the page to add the mapping to
1184 * @vma: the vm area in which the mapping is added 1252 * @vma: the vm area in which the mapping is added
1185 * @address: the user virtual address mapped 1253 * @address: the user virtual address mapped
1254 * @compound: charge the page as compound or small page
1186 * 1255 *
1187 * Same as page_add_anon_rmap but must only be called on *new* pages. 1256 * Same as page_add_anon_rmap but must only be called on *new* pages.
1188 * This means the inc-and-test can be bypassed. 1257 * This means the inc-and-test can be bypassed.
1189 * Page does not have to be locked. 1258 * Page does not have to be locked.
1190 */ 1259 */
1191void page_add_new_anon_rmap(struct page *page, 1260void page_add_new_anon_rmap(struct page *page,
1192 struct vm_area_struct *vma, unsigned long address) 1261 struct vm_area_struct *vma, unsigned long address, bool compound)
1193{ 1262{
1263 int nr = compound ? hpage_nr_pages(page) : 1;
1264
1194 VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); 1265 VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
1195 SetPageSwapBacked(page); 1266 SetPageSwapBacked(page);
1196 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ 1267 if (compound) {
1197 if (PageTransHuge(page)) 1268 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
1269 /* increment count (starts at -1) */
1270 atomic_set(compound_mapcount_ptr(page), 0);
1198 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 1271 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1199 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, 1272 } else {
1200 hpage_nr_pages(page)); 1273 /* Anon THP always mapped first with PMD */
1274 VM_BUG_ON_PAGE(PageTransCompound(page), page);
1275 /* increment count (starts at -1) */
1276 atomic_set(&page->_mapcount, 0);
1277 }
1278 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr);
1201 __page_set_anon_rmap(page, vma, address, 1); 1279 __page_set_anon_rmap(page, vma, address, 1);
1202} 1280}
1203 1281
@@ -1225,12 +1303,15 @@ static void page_remove_file_rmap(struct page *page)
1225 1303
1226 memcg = mem_cgroup_begin_page_stat(page); 1304 memcg = mem_cgroup_begin_page_stat(page);
1227 1305
1228 /* page still mapped by someone else? */ 1306 /* Hugepages are not counted in NR_FILE_MAPPED for now. */
1229 if (!atomic_add_negative(-1, &page->_mapcount)) 1307 if (unlikely(PageHuge(page))) {
1308 /* hugetlb pages are always mapped with pmds */
1309 atomic_dec(compound_mapcount_ptr(page));
1230 goto out; 1310 goto out;
1311 }
1231 1312
1232 /* Hugepages are not counted in NR_FILE_MAPPED for now. */ 1313 /* page still mapped by someone else? */
1233 if (unlikely(PageHuge(page))) 1314 if (!atomic_add_negative(-1, &page->_mapcount))
1234 goto out; 1315 goto out;
1235 1316
1236 /* 1317 /*
@@ -1247,41 +1328,79 @@ out:
1247 mem_cgroup_end_page_stat(memcg); 1328 mem_cgroup_end_page_stat(memcg);
1248} 1329}
1249 1330
1331static void page_remove_anon_compound_rmap(struct page *page)
1332{
1333 int i, nr;
1334
1335 if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
1336 return;
1337
1338 /* Hugepages are not counted in NR_ANON_PAGES for now. */
1339 if (unlikely(PageHuge(page)))
1340 return;
1341
1342 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
1343 return;
1344
1345 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1346
1347 if (TestClearPageDoubleMap(page)) {
1348 /*
1349 * Subpages can be mapped with PTEs too. Check how many of
1350 * them are still mapped.
1351 */
1352 for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
1353 if (atomic_add_negative(-1, &page[i]._mapcount))
1354 nr++;
1355 }
1356 } else {
1357 nr = HPAGE_PMD_NR;
1358 }
1359
1360 if (unlikely(PageMlocked(page)))
1361 clear_page_mlock(page);
1362
1363 if (nr) {
1364 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, -nr);
1365 deferred_split_huge_page(page);
1366 }
1367}
1368
1250/** 1369/**
1251 * page_remove_rmap - take down pte mapping from a page 1370 * page_remove_rmap - take down pte mapping from a page
1252 * @page: page to remove mapping from 1371 * @page: page to remove mapping from
1372 * @compound: uncharge the page as compound or small page
1253 * 1373 *
1254 * The caller needs to hold the pte lock. 1374 * The caller needs to hold the pte lock.
1255 */ 1375 */
1256void page_remove_rmap(struct page *page) 1376void page_remove_rmap(struct page *page, bool compound)
1257{ 1377{
1258 if (!PageAnon(page)) { 1378 if (!PageAnon(page)) {
1379 VM_BUG_ON_PAGE(compound && !PageHuge(page), page);
1259 page_remove_file_rmap(page); 1380 page_remove_file_rmap(page);
1260 return; 1381 return;
1261 } 1382 }
1262 1383
1384 if (compound)
1385 return page_remove_anon_compound_rmap(page);
1386
1263 /* page still mapped by someone else? */ 1387 /* page still mapped by someone else? */
1264 if (!atomic_add_negative(-1, &page->_mapcount)) 1388 if (!atomic_add_negative(-1, &page->_mapcount))
1265 return; 1389 return;
1266 1390
1267 /* Hugepages are not counted in NR_ANON_PAGES for now. */
1268 if (unlikely(PageHuge(page)))
1269 return;
1270
1271 /* 1391 /*
1272 * We use the irq-unsafe __{inc|mod}_zone_page_stat because 1392 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
1273 * these counters are not modified in interrupt context, and 1393 * these counters are not modified in interrupt context, and
1274 * pte lock(a spinlock) is held, which implies preemption disabled. 1394 * pte lock(a spinlock) is held, which implies preemption disabled.
1275 */ 1395 */
1276 if (PageTransHuge(page)) 1396 __dec_zone_page_state(page, NR_ANON_PAGES);
1277 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1278
1279 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
1280 -hpage_nr_pages(page));
1281 1397
1282 if (unlikely(PageMlocked(page))) 1398 if (unlikely(PageMlocked(page)))
1283 clear_page_mlock(page); 1399 clear_page_mlock(page);
1284 1400
1401 if (PageTransCompound(page))
1402 deferred_split_huge_page(compound_head(page));
1403
1285 /* 1404 /*
1286 * It would be tidy to reset the PageAnon mapping here, 1405 * It would be tidy to reset the PageAnon mapping here,
1287 * but that might overwrite a racing page_add_anon_rmap 1406 * but that might overwrite a racing page_add_anon_rmap
@@ -1293,6 +1412,11 @@ void page_remove_rmap(struct page *page)
1293 */ 1412 */
1294} 1413}
1295 1414
1415struct rmap_private {
1416 enum ttu_flags flags;
1417 int lazyfreed;
1418};
1419
1296/* 1420/*
1297 * @arg: enum ttu_flags will be passed to this argument 1421 * @arg: enum ttu_flags will be passed to this argument
1298 */ 1422 */
@@ -1304,7 +1428,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1304 pte_t pteval; 1428 pte_t pteval;
1305 spinlock_t *ptl; 1429 spinlock_t *ptl;
1306 int ret = SWAP_AGAIN; 1430 int ret = SWAP_AGAIN;
1307 enum ttu_flags flags = (enum ttu_flags)arg; 1431 struct rmap_private *rp = arg;
1432 enum ttu_flags flags = rp->flags;
1308 1433
1309 /* munlock has nothing to gain from examining un-locked vmas */ 1434 /* munlock has nothing to gain from examining un-locked vmas */
1310 if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) 1435 if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
@@ -1396,6 +1521,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1396 * See handle_pte_fault() ... 1521 * See handle_pte_fault() ...
1397 */ 1522 */
1398 VM_BUG_ON_PAGE(!PageSwapCache(page), page); 1523 VM_BUG_ON_PAGE(!PageSwapCache(page), page);
1524
1525 if (!PageDirty(page) && (flags & TTU_LZFREE)) {
1526 /* It's a freeable page by MADV_FREE */
1527 dec_mm_counter(mm, MM_ANONPAGES);
1528 rp->lazyfreed++;
1529 goto discard;
1530 }
1531
1399 if (swap_duplicate(entry) < 0) { 1532 if (swap_duplicate(entry) < 0) {
1400 set_pte_at(mm, address, pte, pteval); 1533 set_pte_at(mm, address, pte, pteval);
1401 ret = SWAP_FAIL; 1534 ret = SWAP_FAIL;
@@ -1416,7 +1549,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1416 } else 1549 } else
1417 dec_mm_counter(mm, mm_counter_file(page)); 1550 dec_mm_counter(mm, mm_counter_file(page));
1418 1551
1419 page_remove_rmap(page); 1552discard:
1553 page_remove_rmap(page, PageHuge(page));
1420 page_cache_release(page); 1554 page_cache_release(page);
1421 1555
1422out_unmap: 1556out_unmap:
@@ -1468,9 +1602,14 @@ static int page_not_mapped(struct page *page)
1468int try_to_unmap(struct page *page, enum ttu_flags flags) 1602int try_to_unmap(struct page *page, enum ttu_flags flags)
1469{ 1603{
1470 int ret; 1604 int ret;
1605 struct rmap_private rp = {
1606 .flags = flags,
1607 .lazyfreed = 0,
1608 };
1609
1471 struct rmap_walk_control rwc = { 1610 struct rmap_walk_control rwc = {
1472 .rmap_one = try_to_unmap_one, 1611 .rmap_one = try_to_unmap_one,
1473 .arg = (void *)flags, 1612 .arg = &rp,
1474 .done = page_not_mapped, 1613 .done = page_not_mapped,
1475 .anon_lock = page_lock_anon_vma_read, 1614 .anon_lock = page_lock_anon_vma_read,
1476 }; 1615 };
@@ -1490,8 +1629,11 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
1490 1629
1491 ret = rmap_walk(page, &rwc); 1630 ret = rmap_walk(page, &rwc);
1492 1631
1493 if (ret != SWAP_MLOCK && !page_mapped(page)) 1632 if (ret != SWAP_MLOCK && !page_mapped(page)) {
1494 ret = SWAP_SUCCESS; 1633 ret = SWAP_SUCCESS;
1634 if (rp.lazyfreed && !PageDirty(page))
1635 ret = SWAP_LZFREE;
1636 }
1495 return ret; 1637 return ret;
1496} 1638}
1497 1639
@@ -1513,9 +1655,14 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
1513int try_to_munlock(struct page *page) 1655int try_to_munlock(struct page *page)
1514{ 1656{
1515 int ret; 1657 int ret;
1658 struct rmap_private rp = {
1659 .flags = TTU_MUNLOCK,
1660 .lazyfreed = 0,
1661 };
1662
1516 struct rmap_walk_control rwc = { 1663 struct rmap_walk_control rwc = {
1517 .rmap_one = try_to_unmap_one, 1664 .rmap_one = try_to_unmap_one,
1518 .arg = (void *)TTU_MUNLOCK, 1665 .arg = &rp,
1519 .done = page_not_mapped, 1666 .done = page_not_mapped,
1520 .anon_lock = page_lock_anon_vma_read, 1667 .anon_lock = page_lock_anon_vma_read,
1521 1668
@@ -1698,7 +1845,7 @@ void hugepage_add_anon_rmap(struct page *page,
1698 BUG_ON(!PageLocked(page)); 1845 BUG_ON(!PageLocked(page));
1699 BUG_ON(!anon_vma); 1846 BUG_ON(!anon_vma);
1700 /* address might be in next vma when migration races vma_adjust */ 1847 /* address might be in next vma when migration races vma_adjust */
1701 first = atomic_inc_and_test(&page->_mapcount); 1848 first = atomic_inc_and_test(compound_mapcount_ptr(page));
1702 if (first) 1849 if (first)
1703 __hugepage_set_anon_rmap(page, vma, address, 0); 1850 __hugepage_set_anon_rmap(page, vma, address, 0);
1704} 1851}
@@ -1707,7 +1854,7 @@ void hugepage_add_new_anon_rmap(struct page *page,
1707 struct vm_area_struct *vma, unsigned long address) 1854 struct vm_area_struct *vma, unsigned long address)
1708{ 1855{
1709 BUG_ON(address < vma->vm_start || address >= vma->vm_end); 1856 BUG_ON(address < vma->vm_start || address >= vma->vm_end);
1710 atomic_set(&page->_mapcount, 0); 1857 atomic_set(compound_mapcount_ptr(page), 0);
1711 __hugepage_set_anon_rmap(page, vma, address, 1); 1858 __hugepage_set_anon_rmap(page, vma, address, 1);
1712} 1859}
1713#endif /* CONFIG_HUGETLB_PAGE */ 1860#endif /* CONFIG_HUGETLB_PAGE */
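As a reading aid only (not part of the patch): the page_check_address_transhuge() helper added above gives callers such as page_referenced_one() and page_idle_clear_pte_refs_one() one common calling convention. A minimal, hypothetical caller sketch, assuming the kernel-internal API exactly as introduced in these hunks:

/* Hypothetical caller sketch -- not part of the patch. On success the
 * lock returned in *ptl is held; *pte is valid for a PTE mapping and
 * NULL for a PMD-mapped THP.
 */
static int example_rmap_one(struct page *page, struct vm_area_struct *vma,
			    unsigned long address, void *arg)
{
	pmd_t *pmd;
	pte_t *pte;
	spinlock_t *ptl;

	if (!page_check_address_transhuge(page, vma->vm_mm, address,
					  &pmd, &pte, &ptl))
		return SWAP_AGAIN;	/* page not mapped at this address */

	if (pte)
		pte_unmap(pte);		/* PTE-mapped subpage */
	/* else: PMD-mapped THP, only *pmd is meaningful */

	spin_unlock(ptl);		/* drop the pte or pmd lock */
	return SWAP_AGAIN;
}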
diff --git a/mm/shmem.c b/mm/shmem.c
index 970ff5b80853..b98e1011858c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -810,7 +810,8 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
810 * the shmem_swaplist_mutex which might hold up shmem_writepage(). 810 * the shmem_swaplist_mutex which might hold up shmem_writepage().
811 * Charged back to the user (not to caller) when swap account is used. 811 * Charged back to the user (not to caller) when swap account is used.
812 */ 812 */
813 error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg); 813 error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg,
814 false);
814 if (error) 815 if (error)
815 goto out; 816 goto out;
816 /* No radix_tree_preload: swap entry keeps a place for page in tree */ 817 /* No radix_tree_preload: swap entry keeps a place for page in tree */
@@ -833,9 +834,9 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
833 if (error) { 834 if (error) {
834 if (error != -ENOMEM) 835 if (error != -ENOMEM)
835 error = 0; 836 error = 0;
836 mem_cgroup_cancel_charge(page, memcg); 837 mem_cgroup_cancel_charge(page, memcg, false);
837 } else 838 } else
838 mem_cgroup_commit_charge(page, memcg, true); 839 mem_cgroup_commit_charge(page, memcg, true, false);
839out: 840out:
840 unlock_page(page); 841 unlock_page(page);
841 page_cache_release(page); 842 page_cache_release(page);
@@ -1085,7 +1086,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
1085 copy_highpage(newpage, oldpage); 1086 copy_highpage(newpage, oldpage);
1086 flush_dcache_page(newpage); 1087 flush_dcache_page(newpage);
1087 1088
1088 __set_page_locked(newpage); 1089 __SetPageLocked(newpage);
1089 SetPageUptodate(newpage); 1090 SetPageUptodate(newpage);
1090 SetPageSwapBacked(newpage); 1091 SetPageSwapBacked(newpage);
1091 set_page_private(newpage, swap_index); 1092 set_page_private(newpage, swap_index);
@@ -1218,7 +1219,8 @@ repeat:
1218 goto failed; 1219 goto failed;
1219 } 1220 }
1220 1221
1221 error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg); 1222 error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg,
1223 false);
1222 if (!error) { 1224 if (!error) {
1223 error = shmem_add_to_page_cache(page, mapping, index, 1225 error = shmem_add_to_page_cache(page, mapping, index,
1224 swp_to_radix_entry(swap)); 1226 swp_to_radix_entry(swap));
@@ -1235,14 +1237,14 @@ repeat:
1235 * "repeat": reading a hole and writing should succeed. 1237 * "repeat": reading a hole and writing should succeed.
1236 */ 1238 */
1237 if (error) { 1239 if (error) {
1238 mem_cgroup_cancel_charge(page, memcg); 1240 mem_cgroup_cancel_charge(page, memcg, false);
1239 delete_from_swap_cache(page); 1241 delete_from_swap_cache(page);
1240 } 1242 }
1241 } 1243 }
1242 if (error) 1244 if (error)
1243 goto failed; 1245 goto failed;
1244 1246
1245 mem_cgroup_commit_charge(page, memcg, true); 1247 mem_cgroup_commit_charge(page, memcg, true, false);
1246 1248
1247 spin_lock(&info->lock); 1249 spin_lock(&info->lock);
1248 info->swapped--; 1250 info->swapped--;
@@ -1277,11 +1279,12 @@ repeat:
1277 } 1279 }
1278 1280
1279 __SetPageSwapBacked(page); 1281 __SetPageSwapBacked(page);
1280 __set_page_locked(page); 1282 __SetPageLocked(page);
1281 if (sgp == SGP_WRITE) 1283 if (sgp == SGP_WRITE)
1282 __SetPageReferenced(page); 1284 __SetPageReferenced(page);
1283 1285
1284 error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg); 1286 error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg,
1287 false);
1285 if (error) 1288 if (error)
1286 goto decused; 1289 goto decused;
1287 error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); 1290 error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
@@ -1291,10 +1294,10 @@ repeat:
1291 radix_tree_preload_end(); 1294 radix_tree_preload_end();
1292 } 1295 }
1293 if (error) { 1296 if (error) {
1294 mem_cgroup_cancel_charge(page, memcg); 1297 mem_cgroup_cancel_charge(page, memcg, false);
1295 goto decused; 1298 goto decused;
1296 } 1299 }
1297 mem_cgroup_commit_charge(page, memcg, false); 1300 mem_cgroup_commit_charge(page, memcg, false, false);
1298 lru_cache_add_anon(page); 1301 lru_cache_add_anon(page);
1299 1302
1300 spin_lock(&info->lock); 1303 spin_lock(&info->lock);
diff --git a/mm/slub.c b/mm/slub.c
index 2d0e610d195a..b21fd24b08b1 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -338,11 +338,13 @@ static inline int oo_objects(struct kmem_cache_order_objects x)
338 */ 338 */
339static __always_inline void slab_lock(struct page *page) 339static __always_inline void slab_lock(struct page *page)
340{ 340{
341 VM_BUG_ON_PAGE(PageTail(page), page);
341 bit_spin_lock(PG_locked, &page->flags); 342 bit_spin_lock(PG_locked, &page->flags);
342} 343}
343 344
344static __always_inline void slab_unlock(struct page *page) 345static __always_inline void slab_unlock(struct page *page)
345{ 346{
347 VM_BUG_ON_PAGE(PageTail(page), page);
346 __bit_spin_unlock(PG_locked, &page->flags); 348 __bit_spin_unlock(PG_locked, &page->flags);
347} 349}
348 350
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 4cba9c2783a1..b60802b3e5ea 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -20,6 +20,7 @@
20#include <linux/mm.h> 20#include <linux/mm.h>
21#include <linux/mmzone.h> 21#include <linux/mmzone.h>
22#include <linux/bootmem.h> 22#include <linux/bootmem.h>
23#include <linux/memremap.h>
23#include <linux/highmem.h> 24#include <linux/highmem.h>
24#include <linux/slab.h> 25#include <linux/slab.h>
25#include <linux/spinlock.h> 26#include <linux/spinlock.h>
@@ -70,7 +71,7 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
70} 71}
71 72
72/* need to make sure size is all the same during early stage */ 73/* need to make sure size is all the same during early stage */
73void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node) 74static void * __meminit alloc_block_buf(unsigned long size, int node)
74{ 75{
75 void *ptr; 76 void *ptr;
76 77
@@ -87,6 +88,77 @@ void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node)
87 return ptr; 88 return ptr;
88} 89}
89 90
91static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap)
92{
93 return altmap->base_pfn + altmap->reserve + altmap->alloc
94 + altmap->align;
95}
96
97static unsigned long __meminit vmem_altmap_nr_free(struct vmem_altmap *altmap)
98{
99 unsigned long allocated = altmap->alloc + altmap->align;
100
101 if (altmap->free > allocated)
102 return altmap->free - allocated;
103 return 0;
104}
105
106/**
107 * vmem_altmap_alloc - allocate pages from the vmem_altmap reservation
108 * @altmap - reserved page pool for the allocation
109 * @nr_pfns - size (in pages) of the allocation
110 *
111 * Allocations are aligned to the size of the request
112 */
113static unsigned long __meminit vmem_altmap_alloc(struct vmem_altmap *altmap,
114 unsigned long nr_pfns)
115{
116 unsigned long pfn = vmem_altmap_next_pfn(altmap);
117 unsigned long nr_align;
118
119 nr_align = 1UL << find_first_bit(&nr_pfns, BITS_PER_LONG);
120 nr_align = ALIGN(pfn, nr_align) - pfn;
121
122 if (nr_pfns + nr_align > vmem_altmap_nr_free(altmap))
123 return ULONG_MAX;
124 altmap->alloc += nr_pfns;
125 altmap->align += nr_align;
126 return pfn + nr_align;
127}
128
129static void * __meminit altmap_alloc_block_buf(unsigned long size,
130 struct vmem_altmap *altmap)
131{
132 unsigned long pfn, nr_pfns;
133 void *ptr;
134
135 if (size & ~PAGE_MASK) {
136 pr_warn_once("%s: allocations must be multiple of PAGE_SIZE (%ld)\n",
137 __func__, size);
138 return NULL;
139 }
140
141 nr_pfns = size >> PAGE_SHIFT;
142 pfn = vmem_altmap_alloc(altmap, nr_pfns);
143 if (pfn < ULONG_MAX)
144 ptr = __va(__pfn_to_phys(pfn));
145 else
146 ptr = NULL;
147 pr_debug("%s: pfn: %#lx alloc: %ld align: %ld nr: %#lx\n",
148 __func__, pfn, altmap->alloc, altmap->align, nr_pfns);
149
150 return ptr;
151}
152
153/* need to make sure size is all the same during early stage */
154void * __meminit __vmemmap_alloc_block_buf(unsigned long size, int node,
155 struct vmem_altmap *altmap)
156{
157 if (altmap)
158 return altmap_alloc_block_buf(size, altmap);
159 return alloc_block_buf(size, node);
160}
161
90void __meminit vmemmap_verify(pte_t *pte, int node, 162void __meminit vmemmap_verify(pte_t *pte, int node,
91 unsigned long start, unsigned long end) 163 unsigned long start, unsigned long end)
92{ 164{
@@ -103,7 +175,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
103 pte_t *pte = pte_offset_kernel(pmd, addr); 175 pte_t *pte = pte_offset_kernel(pmd, addr);
104 if (pte_none(*pte)) { 176 if (pte_none(*pte)) {
105 pte_t entry; 177 pte_t entry;
106 void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node); 178 void *p = alloc_block_buf(PAGE_SIZE, node);
107 if (!p) 179 if (!p)
108 return NULL; 180 return NULL;
109 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); 181 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
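
The new vmem_altmap path above lets the struct-page backing for a hot-added range be carved out of a reserved portion of the device itself instead of regular memory. The interesting piece is vmem_altmap_alloc(): each request is bumped up to the next boundary of the largest power of two dividing its size, and the padding pfns are charged to ->align so the free-space check stays honest. Below is a minimal userspace model of just that arithmetic; the struct and helper names are local stand-ins for illustration, not the kernel API.

#include <limits.h>
#include <stdio.h>

struct altmap_model {
	unsigned long base_pfn;	/* first pfn of the device reservation */
	unsigned long reserve;	/* pfns set aside, never handed out */
	unsigned long free;	/* pfns available for vmemmap backing */
	unsigned long align;	/* pfns lost to alignment so far */
	unsigned long alloc;	/* pfns handed out so far */
};

static unsigned long model_nr_free(const struct altmap_model *m)
{
	unsigned long allocated = m->alloc + m->align;

	return m->free > allocated ? m->free - allocated : 0;
}

static unsigned long model_alloc(struct altmap_model *m, unsigned long nr_pfns)
{
	unsigned long pfn = m->base_pfn + m->reserve + m->alloc + m->align;
	unsigned long align = nr_pfns & -nr_pfns;	/* largest power-of-two divisor of the request */
	unsigned long nr_align = ((pfn + align - 1) & ~(align - 1)) - pfn;

	if (nr_pfns + nr_align > model_nr_free(m))
		return ULONG_MAX;	/* pool exhausted, mirrors the kernel's failure value */
	m->alloc += nr_pfns;
	m->align += nr_align;
	return pfn + nr_align;
}

int main(void)
{
	struct altmap_model m = { .base_pfn = 0x100001, .free = 2048 };

	printf("first:  %#lx\n", model_alloc(&m, 512));	/* rounded up to a 512-pfn boundary */
	printf("second: %#lx\n", model_alloc(&m, 8));	/* follows immediately, already 8-pfn aligned */
	return 0;
}
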
diff --git a/mm/sparse.c b/mm/sparse.c
index d1b48b691ac8..3717ceed4177 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -748,7 +748,7 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
748 if (!memmap) 748 if (!memmap)
749 return; 749 return;
750 750
751 for (i = 0; i < PAGES_PER_SECTION; i++) { 751 for (i = 0; i < nr_pages; i++) {
752 if (PageHWPoison(&memmap[i])) { 752 if (PageHWPoison(&memmap[i])) {
753 atomic_long_sub(1, &num_poisoned_pages); 753 atomic_long_sub(1, &num_poisoned_pages);
754 ClearPageHWPoison(&memmap[i]); 754 ClearPageHWPoison(&memmap[i]);
@@ -788,7 +788,8 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
788 free_map_bootmem(memmap); 788 free_map_bootmem(memmap);
789} 789}
790 790
791void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) 791void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
792 unsigned long map_offset)
792{ 793{
793 struct page *memmap = NULL; 794 struct page *memmap = NULL;
794 unsigned long *usemap = NULL, flags; 795 unsigned long *usemap = NULL, flags;
@@ -804,7 +805,8 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
804 } 805 }
805 pgdat_resize_unlock(pgdat, &flags); 806 pgdat_resize_unlock(pgdat, &flags);
806 807
807 clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION); 808 clear_hwpoisoned_pages(memmap + map_offset,
809 PAGES_PER_SECTION - map_offset);
808 free_section_usemap(memmap, usemap); 810 free_section_usemap(memmap, usemap);
809} 811}
810#endif /* CONFIG_MEMORY_HOTREMOVE */ 812#endif /* CONFIG_MEMORY_HOTREMOVE */
diff --git a/mm/swap.c b/mm/swap.c
index 39395fb549c0..09fe5e97714a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -24,6 +24,7 @@
24#include <linux/export.h> 24#include <linux/export.h>
25#include <linux/mm_inline.h> 25#include <linux/mm_inline.h>
26#include <linux/percpu_counter.h> 26#include <linux/percpu_counter.h>
27#include <linux/memremap.h>
27#include <linux/percpu.h> 28#include <linux/percpu.h>
28#include <linux/cpu.h> 29#include <linux/cpu.h>
29#include <linux/notifier.h> 30#include <linux/notifier.h>
@@ -45,6 +46,7 @@ int page_cluster;
45static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); 46static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
46static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); 47static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
47static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); 48static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
49static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
48 50
49/* 51/*
50 * This path almost never happens for VM activity - pages are normally 52 * This path almost never happens for VM activity - pages are normally
@@ -89,260 +91,14 @@ static void __put_compound_page(struct page *page)
89 (*dtor)(page); 91 (*dtor)(page);
90} 92}
91 93
92/** 94void __put_page(struct page *page)
93 * Two special cases here: we could avoid taking compound_lock_irqsave
 94 * and could skip the tail refcounting (in _mapcount).
95 *
96 * 1. Hugetlbfs page:
97 *
98 * PageHeadHuge will remain true until the compound page
99 * is released and enters the buddy allocator, and it could
100 * not be split by __split_huge_page_refcount().
101 *
102 * So if we see PageHeadHuge set, and we have the tail page pin,
103 * then we could safely put head page.
104 *
105 * 2. Slab THP page:
106 *
107 * PG_slab is cleared before the slab frees the head page, and
108 * tail pin cannot be the last reference left on the head page,
109 * because the slab code is free to reuse the compound page
110 * after a kfree/kmem_cache_free without having to check if
 111 * there's any tail pin left. In turn all tail pins must always be
 112 * released while the head is still pinned by the slab code
 113 * and so we know PG_slab will still be set too.
114 *
115 * So if we see PageSlab set, and we have the tail page pin,
116 * then we could safely put head page.
117 */
118static __always_inline
119void put_unrefcounted_compound_page(struct page *page_head, struct page *page)
120{
121 /*
122 * If @page is a THP tail, we must read the tail page
123 * flags after the head page flags. The
124 * __split_huge_page_refcount side enforces write memory barriers
125 * between clearing PageTail and before the head page
126 * can be freed and reallocated.
127 */
128 smp_rmb();
129 if (likely(PageTail(page))) {
130 /*
131 * __split_huge_page_refcount cannot race
132 * here, see the comment above this function.
133 */
134 VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
135 if (put_page_testzero(page_head)) {
136 /*
137 * If this is the tail of a slab THP page,
138 * the tail pin must not be the last reference
139 * held on the page, because the PG_slab cannot
140 * be cleared before all tail pins (which skips
141 * the _mapcount tail refcounting) have been
142 * released.
143 *
144 * If this is the tail of a hugetlbfs page,
145 * the tail pin may be the last reference on
146 * the page instead, because PageHeadHuge will
147 * not go away until the compound page enters
148 * the buddy allocator.
149 */
150 VM_BUG_ON_PAGE(PageSlab(page_head), page_head);
151 __put_compound_page(page_head);
152 }
153 } else
154 /*
155 * __split_huge_page_refcount run before us,
156 * @page was a THP tail. The split @page_head
157 * has been freed and reallocated as slab or
158 * hugetlbfs page of smaller order (only
159 * possible if reallocated as slab on x86).
160 */
161 if (put_page_testzero(page))
162 __put_single_page(page);
163}
164
165static __always_inline
166void put_refcounted_compound_page(struct page *page_head, struct page *page)
167{
168 if (likely(page != page_head && get_page_unless_zero(page_head))) {
169 unsigned long flags;
170
171 /*
172 * @page_head wasn't a dangling pointer but it may not
173 * be a head page anymore by the time we obtain the
174 * lock. That is ok as long as it can't be freed from
175 * under us.
176 */
177 flags = compound_lock_irqsave(page_head);
178 if (unlikely(!PageTail(page))) {
179 /* __split_huge_page_refcount run before us */
180 compound_unlock_irqrestore(page_head, flags);
181 if (put_page_testzero(page_head)) {
182 /*
183 * The @page_head may have been freed
184 * and reallocated as a compound page
185 * of smaller order and then freed
186 * again. All we know is that it
187 * cannot have become: a THP page, a
188 * compound page of higher order, a
189 * tail page. That is because we
190 * still hold the refcount of the
191 * split THP tail and page_head was
192 * the THP head before the split.
193 */
194 if (PageHead(page_head))
195 __put_compound_page(page_head);
196 else
197 __put_single_page(page_head);
198 }
199out_put_single:
200 if (put_page_testzero(page))
201 __put_single_page(page);
202 return;
203 }
204 VM_BUG_ON_PAGE(page_head != compound_head(page), page);
205 /*
206 * We can release the refcount taken by
207 * get_page_unless_zero() now that
208 * __split_huge_page_refcount() is blocked on the
209 * compound_lock.
210 */
211 if (put_page_testzero(page_head))
212 VM_BUG_ON_PAGE(1, page_head);
213 /* __split_huge_page_refcount will wait now */
214 VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page);
215 atomic_dec(&page->_mapcount);
216 VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head);
217 VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
218 compound_unlock_irqrestore(page_head, flags);
219
220 if (put_page_testzero(page_head)) {
221 if (PageHead(page_head))
222 __put_compound_page(page_head);
223 else
224 __put_single_page(page_head);
225 }
226 } else {
227 /* @page_head is a dangling pointer */
228 VM_BUG_ON_PAGE(PageTail(page), page);
229 goto out_put_single;
230 }
231}
232
233static void put_compound_page(struct page *page)
234{
235 struct page *page_head;
236
237 /*
238 * We see the PageCompound set and PageTail not set, so @page maybe:
239 * 1. hugetlbfs head page, or
240 * 2. THP head page.
241 */
242 if (likely(!PageTail(page))) {
243 if (put_page_testzero(page)) {
244 /*
245 * By the time all refcounts have been released
246 * split_huge_page cannot run anymore from under us.
247 */
248 if (PageHead(page))
249 __put_compound_page(page);
250 else
251 __put_single_page(page);
252 }
253 return;
254 }
255
256 /*
257 * We see the PageCompound set and PageTail set, so @page maybe:
258 * 1. a tail hugetlbfs page, or
259 * 2. a tail THP page, or
260 * 3. a split THP page.
261 *
262 * Case 3 is possible, as we may race with
263 * __split_huge_page_refcount tearing down a THP page.
264 */
265 page_head = compound_head(page);
266 if (!__compound_tail_refcounted(page_head))
267 put_unrefcounted_compound_page(page_head, page);
268 else
269 put_refcounted_compound_page(page_head, page);
270}
271
272void put_page(struct page *page)
273{ 95{
274 if (unlikely(PageCompound(page))) 96 if (unlikely(PageCompound(page)))
275 put_compound_page(page); 97 __put_compound_page(page);
276 else if (put_page_testzero(page)) 98 else
277 __put_single_page(page); 99 __put_single_page(page);
278} 100}
279EXPORT_SYMBOL(put_page); 101EXPORT_SYMBOL(__put_page);
280
281/*
282 * This function is exported but must not be called by anything other
283 * than get_page(). It implements the slow path of get_page().
284 */
285bool __get_page_tail(struct page *page)
286{
287 /*
288 * This takes care of get_page() if run on a tail page
289 * returned by one of the get_user_pages/follow_page variants.
290 * get_user_pages/follow_page itself doesn't need the compound
291 * lock because it runs __get_page_tail_foll() under the
292 * proper PT lock that already serializes against
293 * split_huge_page().
294 */
295 unsigned long flags;
296 bool got;
297 struct page *page_head = compound_head(page);
298
299 /* Ref to put_compound_page() comment. */
300 if (!__compound_tail_refcounted(page_head)) {
301 smp_rmb();
302 if (likely(PageTail(page))) {
303 /*
304 * This is a hugetlbfs page or a slab
305 * page. __split_huge_page_refcount
306 * cannot race here.
307 */
308 VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
309 __get_page_tail_foll(page, true);
310 return true;
311 } else {
312 /*
313 * __split_huge_page_refcount run
314 * before us, "page" was a THP
315 * tail. The split page_head has been
316 * freed and reallocated as slab or
317 * hugetlbfs page of smaller order
318 * (only possible if reallocated as
319 * slab on x86).
320 */
321 return false;
322 }
323 }
324
325 got = false;
326 if (likely(page != page_head && get_page_unless_zero(page_head))) {
327 /*
328 * page_head wasn't a dangling pointer but it
329 * may not be a head page anymore by the time
330 * we obtain the lock. That is ok as long as it
331 * can't be freed from under us.
332 */
333 flags = compound_lock_irqsave(page_head);
334 /* here __split_huge_page_refcount won't run anymore */
335 if (likely(PageTail(page))) {
336 __get_page_tail_foll(page, false);
337 got = true;
338 }
339 compound_unlock_irqrestore(page_head, flags);
340 if (unlikely(!got))
341 put_page(page_head);
342 }
343 return got;
344}
345EXPORT_SYMBOL(__get_page_tail);
346 102
347/** 103/**
348 * put_pages_list() - release a list of pages 104 * put_pages_list() - release a list of pages
@@ -604,6 +360,7 @@ static void __lru_cache_activate_page(struct page *page)
604 */ 360 */
605void mark_page_accessed(struct page *page) 361void mark_page_accessed(struct page *page)
606{ 362{
363 page = compound_head(page);
607 if (!PageActive(page) && !PageUnevictable(page) && 364 if (!PageActive(page) && !PageUnevictable(page) &&
608 PageReferenced(page)) { 365 PageReferenced(page)) {
609 366
@@ -799,6 +556,24 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
799 update_page_reclaim_stat(lruvec, file, 0); 556 update_page_reclaim_stat(lruvec, file, 0);
800} 557}
801 558
559
560static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
561 void *arg)
562{
563 if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
564 int file = page_is_file_cache(page);
565 int lru = page_lru_base_type(page);
566
567 del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
568 ClearPageActive(page);
569 ClearPageReferenced(page);
570 add_page_to_lru_list(page, lruvec, lru);
571
572 __count_vm_event(PGDEACTIVATE);
573 update_page_reclaim_stat(lruvec, file, 0);
574 }
575}
576
802/* 577/*
803 * Drain pages out of the cpu's pagevecs. 578 * Drain pages out of the cpu's pagevecs.
804 * Either "cpu" is the current CPU, and preemption has already been 579 * Either "cpu" is the current CPU, and preemption has already been
@@ -825,6 +600,10 @@ void lru_add_drain_cpu(int cpu)
825 if (pagevec_count(pvec)) 600 if (pagevec_count(pvec))
826 pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); 601 pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
827 602
603 pvec = &per_cpu(lru_deactivate_pvecs, cpu);
604 if (pagevec_count(pvec))
605 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
606
828 activate_page_drain(cpu); 607 activate_page_drain(cpu);
829} 608}
830 609
@@ -854,6 +633,26 @@ void deactivate_file_page(struct page *page)
854 } 633 }
855} 634}
856 635
636/**
637 * deactivate_page - deactivate a page
638 * @page: page to deactivate
639 *
640 * deactivate_page() moves @page to the inactive list if @page was on the active
641 * list and was not an unevictable page. This is done to accelerate the reclaim
642 * of @page.
643 */
644void deactivate_page(struct page *page)
645{
646 if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
647 struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
648
649 page_cache_get(page);
650 if (!pagevec_add(pvec, page))
651 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
652 put_cpu_var(lru_deactivate_pvecs);
653 }
654}
655
857void lru_add_drain(void) 656void lru_add_drain(void)
858{ 657{
859 lru_add_drain_cpu(get_cpu()); 658 lru_add_drain_cpu(get_cpu());
@@ -883,6 +682,7 @@ void lru_add_drain_all(void)
883 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || 682 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
884 pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || 683 pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
885 pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || 684 pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
685 pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
886 need_activate_page_drain(cpu)) { 686 need_activate_page_drain(cpu)) {
887 INIT_WORK(work, lru_add_drain_per_cpu); 687 INIT_WORK(work, lru_add_drain_per_cpu);
888 schedule_work_on(cpu, work); 688 schedule_work_on(cpu, work);
@@ -918,15 +718,6 @@ void release_pages(struct page **pages, int nr, bool cold)
918 for (i = 0; i < nr; i++) { 718 for (i = 0; i < nr; i++) {
919 struct page *page = pages[i]; 719 struct page *page = pages[i];
920 720
921 if (unlikely(PageCompound(page))) {
922 if (zone) {
923 spin_unlock_irqrestore(&zone->lru_lock, flags);
924 zone = NULL;
925 }
926 put_compound_page(page);
927 continue;
928 }
929
930 /* 721 /*
931 * Make sure the IRQ-safe lock-holding time does not get 722 * Make sure the IRQ-safe lock-holding time does not get
932 * excessive with a continuous string of pages from the 723 * excessive with a continuous string of pages from the
@@ -937,9 +728,19 @@ void release_pages(struct page **pages, int nr, bool cold)
937 zone = NULL; 728 zone = NULL;
938 } 729 }
939 730
731 page = compound_head(page);
940 if (!put_page_testzero(page)) 732 if (!put_page_testzero(page))
941 continue; 733 continue;
942 734
735 if (PageCompound(page)) {
736 if (zone) {
737 spin_unlock_irqrestore(&zone->lru_lock, flags);
738 zone = NULL;
739 }
740 __put_compound_page(page);
741 continue;
742 }
743
943 if (PageLRU(page)) { 744 if (PageLRU(page)) {
944 struct zone *pagezone = page_zone(page); 745 struct zone *pagezone = page_zone(page);
945 746
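
Most of the mm/swap.c diff above is deletion: with compound pages now refcounted only through their head, the tail-pin special cases (put_unrefcounted_compound_page(), __get_page_tail() and friends) disappear and the release path collapses to "find the head, drop one reference, then dispatch on the final put". The toy below models only that shape in plain C; toy_page and its helpers are invented for the illustration and are not the kernel structures.

#include <stdbool.h>
#include <stdio.h>

struct toy_page {
	int refcount;
	bool compound;		/* head of a compound page? */
	struct toy_page *head;	/* points to self for non-tail pages */
};

static void toy_free_single(struct toy_page *p)   { printf("free single %p\n", (void *)p); }
static void toy_free_compound(struct toy_page *p) { printf("free compound %p\n", (void *)p); }

static void toy_put_page(struct toy_page *page)
{
	struct toy_page *head = page->head;	/* compound_head() */

	if (--head->refcount)			/* put_page_testzero() */
		return;
	if (head->compound)			/* PageCompound() -> __put_compound_page() */
		toy_free_compound(head);
	else
		toy_free_single(head);
}

int main(void)
{
	struct toy_page head = { .refcount = 2, .compound = true };
	struct toy_page tail = { .head = &head };

	head.head = &head;
	toy_put_page(&tail);	/* drops the head's reference, nothing freed yet */
	toy_put_page(&head);	/* last reference gone, compound destructor runs */
	return 0;
}
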
diff --git a/mm/swap_state.c b/mm/swap_state.c
index d504adb7fa5f..676ff2991380 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -185,13 +185,12 @@ int add_to_swap(struct page *page, struct list_head *list)
185 * deadlock in the swap out path. 185 * deadlock in the swap out path.
186 */ 186 */
187 /* 187 /*
188 * Add it to the swap cache and mark it dirty 188 * Add it to the swap cache.
189 */ 189 */
190 err = add_to_swap_cache(page, entry, 190 err = add_to_swap_cache(page, entry,
191 __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); 191 __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
192 192
193 if (!err) { /* Success */ 193 if (!err) {
194 SetPageDirty(page);
195 return 1; 194 return 1;
196 } else { /* -ENOMEM radix-tree allocation failure */ 195 } else { /* -ENOMEM radix-tree allocation failure */
197 /* 196 /*
@@ -353,7 +352,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
353 } 352 }
354 353
355 /* May fail (-ENOMEM) if radix-tree node allocation failed. */ 354 /* May fail (-ENOMEM) if radix-tree node allocation failed. */
356 __set_page_locked(new_page); 355 __SetPageLocked(new_page);
357 SetPageSwapBacked(new_page); 356 SetPageSwapBacked(new_page);
358 err = __add_to_swap_cache(new_page, entry); 357 err = __add_to_swap_cache(new_page, entry);
359 if (likely(!err)) { 358 if (likely(!err)) {
@@ -367,7 +366,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
367 } 366 }
368 radix_tree_preload_end(); 367 radix_tree_preload_end();
369 ClearPageSwapBacked(new_page); 368 ClearPageSwapBacked(new_page);
370 __clear_page_locked(new_page); 369 __ClearPageLocked(new_page);
371 /* 370 /*
372 * add_to_swap_cache() doesn't return -EEXIST, so we can safely 371 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
373 * clear SWAP_HAS_CACHE flag. 372 * clear SWAP_HAS_CACHE flag.
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e6b8591a3ed2..2bb30aa3a412 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -926,6 +926,9 @@ int reuse_swap_page(struct page *page)
926 VM_BUG_ON_PAGE(!PageLocked(page), page); 926 VM_BUG_ON_PAGE(!PageLocked(page), page);
927 if (unlikely(PageKsm(page))) 927 if (unlikely(PageKsm(page)))
928 return 0; 928 return 0;
929 /* The page is part of THP and cannot be reused */
930 if (PageTransCompound(page))
931 return 0;
929 count = page_mapcount(page); 932 count = page_mapcount(page);
930 if (count <= 1 && PageSwapCache(page)) { 933 if (count <= 1 && PageSwapCache(page)) {
931 count += page_swapcount(page); 934 count += page_swapcount(page);
@@ -1108,19 +1111,9 @@ unsigned int count_swap_pages(int type, int free)
1108} 1111}
1109#endif /* CONFIG_HIBERNATION */ 1112#endif /* CONFIG_HIBERNATION */
1110 1113
1111static inline int maybe_same_pte(pte_t pte, pte_t swp_pte) 1114static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
1112{ 1115{
1113#ifdef CONFIG_MEM_SOFT_DIRTY 1116 return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
1114 /*
1115 * When pte keeps soft dirty bit the pte generated
1116 * from swap entry does not has it, still it's same
1117 * pte from logical point of view.
1118 */
1119 pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte);
1120 return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty);
1121#else
1122 return pte_same(pte, swp_pte);
1123#endif
1124} 1117}
1125 1118
1126/* 1119/*
@@ -1142,14 +1135,15 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
1142 if (unlikely(!page)) 1135 if (unlikely(!page))
1143 return -ENOMEM; 1136 return -ENOMEM;
1144 1137
1145 if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg)) { 1138 if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
1139 &memcg, false)) {
1146 ret = -ENOMEM; 1140 ret = -ENOMEM;
1147 goto out_nolock; 1141 goto out_nolock;
1148 } 1142 }
1149 1143
1150 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 1144 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
1151 if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) { 1145 if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
1152 mem_cgroup_cancel_charge(page, memcg); 1146 mem_cgroup_cancel_charge(page, memcg, false);
1153 ret = 0; 1147 ret = 0;
1154 goto out; 1148 goto out;
1155 } 1149 }
@@ -1160,11 +1154,11 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
1160 set_pte_at(vma->vm_mm, addr, pte, 1154 set_pte_at(vma->vm_mm, addr, pte,
1161 pte_mkold(mk_pte(page, vma->vm_page_prot))); 1155 pte_mkold(mk_pte(page, vma->vm_page_prot)));
1162 if (page == swapcache) { 1156 if (page == swapcache) {
1163 page_add_anon_rmap(page, vma, addr); 1157 page_add_anon_rmap(page, vma, addr, false);
1164 mem_cgroup_commit_charge(page, memcg, true); 1158 mem_cgroup_commit_charge(page, memcg, true, false);
1165 } else { /* ksm created a completely new copy */ 1159 } else { /* ksm created a completely new copy */
1166 page_add_new_anon_rmap(page, vma, addr); 1160 page_add_new_anon_rmap(page, vma, addr, false);
1167 mem_cgroup_commit_charge(page, memcg, false); 1161 mem_cgroup_commit_charge(page, memcg, false, false);
1168 lru_cache_add_active_or_unevictable(page, vma); 1162 lru_cache_add_active_or_unevictable(page, vma);
1169 } 1163 }
1170 swap_free(entry); 1164 swap_free(entry);
@@ -1206,7 +1200,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
1206 * swapoff spends a _lot_ of time in this loop! 1200 * swapoff spends a _lot_ of time in this loop!
1207 * Test inline before going to call unuse_pte. 1201 * Test inline before going to call unuse_pte.
1208 */ 1202 */
1209 if (unlikely(maybe_same_pte(*pte, swp_pte))) { 1203 if (unlikely(pte_same_as_swp(*pte, swp_pte))) {
1210 pte_unmap(pte); 1204 pte_unmap(pte);
1211 ret = unuse_pte(vma, pmd, addr, entry, page); 1205 ret = unuse_pte(vma, pmd, addr, entry, page);
1212 if (ret) 1206 if (ret)
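
In mm/swapfile.c the CONFIG_MEM_SOFT_DIRTY #ifdef is folded into a single comparison: strip the soft-dirty bit from the pte that is actually in the page table, then compare it against the swap pte we expect. A standalone sketch of that idea follows, with a made-up flag bit standing in for the architecture-specific one.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FAKE_SWP_SOFT_DIRTY (1ULL << 55)	/* hypothetical soft-dirty bit for the example */

static bool pte_same_as_swp_model(uint64_t pte, uint64_t swp_pte)
{
	/* Clear the soft-dirty bit before comparing, instead of testing
	 * the expected value both with and without it. */
	return (pte & ~FAKE_SWP_SOFT_DIRTY) == swp_pte;
}

int main(void)
{
	uint64_t expected = 0x1234000ULL;

	printf("%d\n", pte_same_as_swp_model(expected, expected));			/* 1 */
	printf("%d\n", pte_same_as_swp_model(expected | FAKE_SWP_SOFT_DIRTY, expected));	/* 1 */
	printf("%d\n", pte_same_as_swp_model(expected | 1, expected));			/* 0 */
	return 0;
}
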
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 77fee9325a57..806b0c758c5b 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -63,7 +63,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
63 __SetPageUptodate(page); 63 __SetPageUptodate(page);
64 64
65 ret = -ENOMEM; 65 ret = -ENOMEM;
66 if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg)) 66 if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false))
67 goto out_release; 67 goto out_release;
68 68
69 _dst_pte = mk_pte(page, dst_vma->vm_page_prot); 69 _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
@@ -76,8 +76,8 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
76 goto out_release_uncharge_unlock; 76 goto out_release_uncharge_unlock;
77 77
78 inc_mm_counter(dst_mm, MM_ANONPAGES); 78 inc_mm_counter(dst_mm, MM_ANONPAGES);
79 page_add_new_anon_rmap(page, dst_vma, dst_addr); 79 page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
80 mem_cgroup_commit_charge(page, memcg, false); 80 mem_cgroup_commit_charge(page, memcg, false, false);
81 lru_cache_add_active_or_unevictable(page, dst_vma); 81 lru_cache_add_active_or_unevictable(page, dst_vma);
82 82
83 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); 83 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
@@ -91,7 +91,7 @@ out:
91 return ret; 91 return ret;
92out_release_uncharge_unlock: 92out_release_uncharge_unlock:
93 pte_unmap_unlock(dst_pte, ptl); 93 pte_unmap_unlock(dst_pte, ptl);
94 mem_cgroup_cancel_charge(page, memcg); 94 mem_cgroup_cancel_charge(page, memcg, false);
95out_release: 95out_release:
96 page_cache_release(page); 96 page_cache_release(page);
97 goto out; 97 goto out;
diff --git a/mm/util.c b/mm/util.c
index 2d28f7930043..6d1f9200f74e 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -386,7 +386,9 @@ struct anon_vma *page_anon_vma(struct page *page)
386 386
387struct address_space *page_mapping(struct page *page) 387struct address_space *page_mapping(struct page *page)
388{ 388{
389 unsigned long mapping; 389 struct address_space *mapping;
390
391 page = compound_head(page);
390 392
391 /* This happens if someone calls flush_dcache_page on slab page */ 393 /* This happens if someone calls flush_dcache_page on slab page */
392 if (unlikely(PageSlab(page))) 394 if (unlikely(PageSlab(page)))
@@ -399,11 +401,25 @@ struct address_space *page_mapping(struct page *page)
399 return swap_address_space(entry); 401 return swap_address_space(entry);
400 } 402 }
401 403
402 mapping = (unsigned long)page->mapping; 404 mapping = page->mapping;
403 if (mapping & PAGE_MAPPING_FLAGS) 405 if ((unsigned long)mapping & PAGE_MAPPING_FLAGS)
404 return NULL; 406 return NULL;
405 return page->mapping; 407 return mapping;
408}
409
410/* Slow path of page_mapcount() for compound pages */
411int __page_mapcount(struct page *page)
412{
413 int ret;
414
415 ret = atomic_read(&page->_mapcount) + 1;
416 page = compound_head(page);
417 ret += atomic_read(compound_mapcount_ptr(page)) + 1;
418 if (PageDoubleMap(page))
419 ret--;
420 return ret;
406} 421}
422EXPORT_SYMBOL_GPL(__page_mapcount);
407 423
408int overcommit_ratio_handler(struct ctl_table *table, int write, 424int overcommit_ratio_handler(struct ctl_table *table, int write,
409 void __user *buffer, size_t *lenp, 425 void __user *buffer, size_t *lenp,
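
The new __page_mapcount() in mm/util.c is the slow path for compound pages under the reworked THP refcounting: a subpage's mapcount is its own _mapcount plus the head's compound mapcount, minus one when PageDoubleMap is set so a mapping reflected in both counters is not counted twice. A small model of that bookkeeping; the names and sample values below are local to the example.

#include <stdbool.h>
#include <stdio.h>

struct mapcount_model {
	int tail_mapcount;	/* the subpage's own _mapcount, -1 based */
	int compound_mapcount;	/* compound_mapcount_ptr() value on the head, -1 based */
	bool double_map;	/* PageDoubleMap() on the head */
};

static int total_mapcount_model(const struct mapcount_model *m)
{
	int ret = m->tail_mapcount + 1;

	ret += m->compound_mapcount + 1;
	if (m->double_map)
		ret--;	/* one mapping is reflected in both counters */
	return ret;
}

int main(void)
{
	/* PMD-mapped only: subpage _mapcount still -1, one compound mapping. */
	struct mapcount_model pmd_only = { -1, 0, false };
	/* Subpage PTE-mapped once while the THP also keeps a PMD mapping;
	 * the overlap is recorded via double_map. */
	struct mapcount_model mixed = { 1, 0, true };

	printf("pmd only: %d\n", total_mapcount_model(&pmd_only));	/* 1 */
	printf("mixed:    %d\n", total_mapcount_model(&mixed));	/* 2 */
	return 0;
}
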
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 58ceeb107960..fb42a5bffe47 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -455,7 +455,7 @@ found:
455 free_vmap_cache = &va->rb_node; 455 free_vmap_cache = &va->rb_node;
456 spin_unlock(&vmap_area_lock); 456 spin_unlock(&vmap_area_lock);
457 457
458 BUG_ON(va->va_start & (align-1)); 458 BUG_ON(!IS_ALIGNED(va->va_start, align));
459 BUG_ON(va->va_start < vstart); 459 BUG_ON(va->va_start < vstart);
460 BUG_ON(va->va_end > vend); 460 BUG_ON(va->va_end > vend);
461 461
@@ -1086,7 +1086,7 @@ void vm_unmap_ram(const void *mem, unsigned int count)
1086 BUG_ON(!addr); 1086 BUG_ON(!addr);
1087 BUG_ON(addr < VMALLOC_START); 1087 BUG_ON(addr < VMALLOC_START);
1088 BUG_ON(addr > VMALLOC_END); 1088 BUG_ON(addr > VMALLOC_END);
1089 BUG_ON(addr & (PAGE_SIZE-1)); 1089 BUG_ON(!IS_ALIGNED(addr, PAGE_SIZE));
1090 1090
1091 debug_check_no_locks_freed(mem, size); 1091 debug_check_no_locks_freed(mem, size);
1092 vmap_debug_free_range(addr, addr+size); 1092 vmap_debug_free_range(addr, addr+size);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 108bd119f2f6..5ac86956ff9d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -906,6 +906,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
906 int may_enter_fs; 906 int may_enter_fs;
907 enum page_references references = PAGEREF_RECLAIM_CLEAN; 907 enum page_references references = PAGEREF_RECLAIM_CLEAN;
908 bool dirty, writeback; 908 bool dirty, writeback;
909 bool lazyfree = false;
910 int ret = SWAP_SUCCESS;
909 911
910 cond_resched(); 912 cond_resched();
911 913
@@ -1049,6 +1051,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
1049 goto keep_locked; 1051 goto keep_locked;
1050 if (!add_to_swap(page, page_list)) 1052 if (!add_to_swap(page, page_list))
1051 goto activate_locked; 1053 goto activate_locked;
1054 lazyfree = true;
1052 may_enter_fs = 1; 1055 may_enter_fs = 1;
1053 1056
1054 /* Adding to swap updated mapping */ 1057 /* Adding to swap updated mapping */
@@ -1060,14 +1063,17 @@ static unsigned long shrink_page_list(struct list_head *page_list,
1060 * processes. Try to unmap it here. 1063 * processes. Try to unmap it here.
1061 */ 1064 */
1062 if (page_mapped(page) && mapping) { 1065 if (page_mapped(page) && mapping) {
1063 switch (try_to_unmap(page, 1066 switch (ret = try_to_unmap(page, lazyfree ?
1064 ttu_flags|TTU_BATCH_FLUSH)) { 1067 (ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) :
1068 (ttu_flags | TTU_BATCH_FLUSH))) {
1065 case SWAP_FAIL: 1069 case SWAP_FAIL:
1066 goto activate_locked; 1070 goto activate_locked;
1067 case SWAP_AGAIN: 1071 case SWAP_AGAIN:
1068 goto keep_locked; 1072 goto keep_locked;
1069 case SWAP_MLOCK: 1073 case SWAP_MLOCK:
1070 goto cull_mlocked; 1074 goto cull_mlocked;
1075 case SWAP_LZFREE:
1076 goto lazyfree;
1071 case SWAP_SUCCESS: 1077 case SWAP_SUCCESS:
1072 ; /* try to free the page below */ 1078 ; /* try to free the page below */
1073 } 1079 }
@@ -1174,6 +1180,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
1174 } 1180 }
1175 } 1181 }
1176 1182
1183lazyfree:
1177 if (!mapping || !__remove_mapping(mapping, page, true)) 1184 if (!mapping || !__remove_mapping(mapping, page, true))
1178 goto keep_locked; 1185 goto keep_locked;
1179 1186
@@ -1184,8 +1191,11 @@ static unsigned long shrink_page_list(struct list_head *page_list,
1184 * we obviously don't have to worry about waking up a process 1191 * we obviously don't have to worry about waking up a process
1185 * waiting on the page lock, because there are no references. 1192 * waiting on the page lock, because there are no references.
1186 */ 1193 */
1187 __clear_page_locked(page); 1194 __ClearPageLocked(page);
1188free_it: 1195free_it:
1196 if (ret == SWAP_LZFREE)
1197 count_vm_event(PGLAZYFREED);
1198
1189 nr_reclaimed++; 1199 nr_reclaimed++;
1190 1200
1191 /* 1201 /*
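
The mm/vmscan.c hunks above wire MADV_FREE into reclaim: anonymous pages that were just added to swap are unmapped with the TTU_LZFREE hint, and when try_to_unmap() reports SWAP_LZFREE (the page was never dirtied after MADV_FREE) reclaim skips pageout, jumps to the new lazyfree: label and bumps the new PGLAZYFREED counter. The sketch below only models the dispatch shape of that switch; the enum and helper names are local to the example.

#include <stdio.h>

enum unmap_result { MODEL_SWAP_SUCCESS, MODEL_SWAP_AGAIN, MODEL_SWAP_FAIL,
		    MODEL_SWAP_MLOCK, MODEL_SWAP_LZFREE };

static const char *reclaim_one(enum unmap_result r)
{
	switch (r) {
	case MODEL_SWAP_FAIL:
		return "activate";			/* goto activate_locked */
	case MODEL_SWAP_AGAIN:
	case MODEL_SWAP_MLOCK:
		return "keep for now";			/* retry later or cull mlocked */
	case MODEL_SWAP_LZFREE:
		return "free, count PGLAZYFREED";	/* goto lazyfree, no pageout */
	case MODEL_SWAP_SUCCESS:
	default:
		return "try pageout, then free";
	}
}

int main(void)
{
	printf("%s\n", reclaim_one(MODEL_SWAP_LZFREE));
	printf("%s\n", reclaim_one(MODEL_SWAP_SUCCESS));
	return 0;
}
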
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 83a003bc3cae..64bd0aa13f75 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -783,6 +783,7 @@ const char * const vmstat_text[] = {
783 783
784 "pgfault", 784 "pgfault",
785 "pgmajfault", 785 "pgmajfault",
786 "pglazyfreed",
786 787
787 TEXTS_FOR_ZONES("pgrefill") 788 TEXTS_FOR_ZONES("pgrefill")
788 TEXTS_FOR_ZONES("pgsteal_kswapd") 789 TEXTS_FOR_ZONES("pgsteal_kswapd")
@@ -844,7 +845,9 @@ const char * const vmstat_text[] = {
844 "thp_fault_fallback", 845 "thp_fault_fallback",
845 "thp_collapse_alloc", 846 "thp_collapse_alloc",
846 "thp_collapse_alloc_failed", 847 "thp_collapse_alloc_failed",
847 "thp_split", 848 "thp_split_page",
849 "thp_split_page_failed",
850 "thp_split_pmd",
848 "thp_zero_page_alloc", 851 "thp_zero_page_alloc",
849 "thp_zero_page_alloc_failed", 852 "thp_zero_page_alloc_failed",
850#endif 853#endif
diff --git a/scripts/tags.sh b/scripts/tags.sh
index 262889046703..76f131ebc192 100755
--- a/scripts/tags.sh
+++ b/scripts/tags.sh
@@ -193,7 +193,6 @@ exuberant()
193 --regex-c++='/CLEARPAGEFLAG_NOOP\(([^,)]*).*/ClearPage\1/' \ 193 --regex-c++='/CLEARPAGEFLAG_NOOP\(([^,)]*).*/ClearPage\1/' \
194 --regex-c++='/__CLEARPAGEFLAG_NOOP\(([^,)]*).*/__ClearPage\1/' \ 194 --regex-c++='/__CLEARPAGEFLAG_NOOP\(([^,)]*).*/__ClearPage\1/' \
195 --regex-c++='/TESTCLEARFLAG_FALSE\(([^,)]*).*/TestClearPage\1/' \ 195 --regex-c++='/TESTCLEARFLAG_FALSE\(([^,)]*).*/TestClearPage\1/' \
196 --regex-c++='/__TESTCLEARFLAG_FALSE\(([^,)]*).*/__TestClearPage\1/' \
197 --regex-c++='/_PE\(([^,)]*).*/PEVENT_ERRNO__\1/' \ 196 --regex-c++='/_PE\(([^,)]*).*/PEVENT_ERRNO__\1/' \
198 --regex-c++='/TASK_PFA_TEST\([^,]*,\s*([^)]*)\)/task_\1/' \ 197 --regex-c++='/TASK_PFA_TEST\([^,]*,\s*([^)]*)\)/task_\1/' \
199 --regex-c++='/TASK_PFA_SET\([^,]*,\s*([^)]*)\)/task_set_\1/' \ 198 --regex-c++='/TASK_PFA_SET\([^,]*,\s*([^)]*)\)/task_set_\1/' \
@@ -260,7 +259,6 @@ emacs()
260 --regex='/CLEARPAGEFLAG_NOOP(\([^,)]*\).*/ClearPage\1/' \ 259 --regex='/CLEARPAGEFLAG_NOOP(\([^,)]*\).*/ClearPage\1/' \
261 --regex='/__CLEARPAGEFLAG_NOOP(\([^,)]*\).*/__ClearPage\1/' \ 260 --regex='/__CLEARPAGEFLAG_NOOP(\([^,)]*\).*/__ClearPage\1/' \
262 --regex='/TESTCLEARFLAG_FALSE(\([^,)]*\).*/TestClearPage\1/' \ 261 --regex='/TESTCLEARFLAG_FALSE(\([^,)]*\).*/TestClearPage\1/' \
263 --regex='/__TESTCLEARFLAG_FALSE(\([^,)]*\).*/__TestClearPage\1/' \
264 --regex='/TASK_PFA_TEST\([^,]*,\s*([^)]*)\)/task_\1/' \ 262 --regex='/TASK_PFA_TEST\([^,]*,\s*([^)]*)\)/task_\1/' \
265 --regex='/TASK_PFA_SET\([^,]*,\s*([^)]*)\)/task_set_\1/' \ 263 --regex='/TASK_PFA_SET\([^,]*,\s*([^)]*)\)/task_set_\1/' \
266 --regex='/TASK_PFA_CLEAR\([^,]*,\s*([^)]*)\)/task_clear_\1/' \ 264 --regex='/TASK_PFA_CLEAR\([^,]*,\s*([^)]*)\)/task_clear_\1/' \
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 314c7774652e..a11cfd20a6a0 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -111,7 +111,7 @@ static void hardware_disable_all(void);
111 111
112static void kvm_io_bus_destroy(struct kvm_io_bus *bus); 112static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
113 113
114static void kvm_release_pfn_dirty(pfn_t pfn); 114static void kvm_release_pfn_dirty(kvm_pfn_t pfn);
115static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn); 115static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn);
116 116
117__visible bool kvm_rebooting; 117__visible bool kvm_rebooting;
@@ -119,7 +119,7 @@ EXPORT_SYMBOL_GPL(kvm_rebooting);
119 119
120static bool largepages_enabled = true; 120static bool largepages_enabled = true;
121 121
122bool kvm_is_reserved_pfn(pfn_t pfn) 122bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
123{ 123{
124 if (pfn_valid(pfn)) 124 if (pfn_valid(pfn))
125 return PageReserved(pfn_to_page(pfn)); 125 return PageReserved(pfn_to_page(pfn));
@@ -1289,7 +1289,7 @@ static inline int check_user_page_hwpoison(unsigned long addr)
1289 * true indicates success, otherwise false is returned. 1289 * true indicates success, otherwise false is returned.
1290 */ 1290 */
1291static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async, 1291static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async,
1292 bool write_fault, bool *writable, pfn_t *pfn) 1292 bool write_fault, bool *writable, kvm_pfn_t *pfn)
1293{ 1293{
1294 struct page *page[1]; 1294 struct page *page[1];
1295 int npages; 1295 int npages;
@@ -1322,7 +1322,7 @@ static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async,
1322 * 1 indicates success, -errno is returned if error is detected. 1322 * 1 indicates success, -errno is returned if error is detected.
1323 */ 1323 */
1324static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, 1324static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
1325 bool *writable, pfn_t *pfn) 1325 bool *writable, kvm_pfn_t *pfn)
1326{ 1326{
1327 struct page *page[1]; 1327 struct page *page[1];
1328 int npages = 0; 1328 int npages = 0;
@@ -1386,11 +1386,11 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
1386 * 2): @write_fault = false && @writable, @writable will tell the caller 1386 * 2): @write_fault = false && @writable, @writable will tell the caller
1387 * whether the mapping is writable. 1387 * whether the mapping is writable.
1388 */ 1388 */
1389static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, 1389static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
1390 bool write_fault, bool *writable) 1390 bool write_fault, bool *writable)
1391{ 1391{
1392 struct vm_area_struct *vma; 1392 struct vm_area_struct *vma;
1393 pfn_t pfn = 0; 1393 kvm_pfn_t pfn = 0;
1394 int npages; 1394 int npages;
1395 1395
1396 /* we can do it either atomically or asynchronously, not both */ 1396 /* we can do it either atomically or asynchronously, not both */
@@ -1431,8 +1431,9 @@ exit:
1431 return pfn; 1431 return pfn;
1432} 1432}
1433 1433
1434pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic, 1434kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
1435 bool *async, bool write_fault, bool *writable) 1435 bool atomic, bool *async, bool write_fault,
1436 bool *writable)
1436{ 1437{
1437 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); 1438 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
1438 1439
@@ -1453,7 +1454,7 @@ pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
1453} 1454}
1454EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot); 1455EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
1455 1456
1456pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, 1457kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
1457 bool *writable) 1458 bool *writable)
1458{ 1459{
1459 return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL, 1460 return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
@@ -1461,37 +1462,37 @@ pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
1461} 1462}
1462EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); 1463EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
1463 1464
1464pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) 1465kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
1465{ 1466{
1466 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); 1467 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
1467} 1468}
1468EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); 1469EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
1469 1470
1470pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn) 1471kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
1471{ 1472{
1472 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL); 1473 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
1473} 1474}
1474EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); 1475EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
1475 1476
1476pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) 1477kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
1477{ 1478{
1478 return gfn_to_pfn_memslot_atomic(gfn_to_memslot(kvm, gfn), gfn); 1479 return gfn_to_pfn_memslot_atomic(gfn_to_memslot(kvm, gfn), gfn);
1479} 1480}
1480EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); 1481EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
1481 1482
1482pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn) 1483kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
1483{ 1484{
1484 return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); 1485 return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
1485} 1486}
1486EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic); 1487EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
1487 1488
1488pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) 1489kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
1489{ 1490{
1490 return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn); 1491 return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
1491} 1492}
1492EXPORT_SYMBOL_GPL(gfn_to_pfn); 1493EXPORT_SYMBOL_GPL(gfn_to_pfn);
1493 1494
1494pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) 1495kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
1495{ 1496{
1496 return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); 1497 return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
1497} 1498}
@@ -1514,7 +1515,7 @@ int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
1514} 1515}
1515EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); 1516EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
1516 1517
1517static struct page *kvm_pfn_to_page(pfn_t pfn) 1518static struct page *kvm_pfn_to_page(kvm_pfn_t pfn)
1518{ 1519{
1519 if (is_error_noslot_pfn(pfn)) 1520 if (is_error_noslot_pfn(pfn))
1520 return KVM_ERR_PTR_BAD_PAGE; 1521 return KVM_ERR_PTR_BAD_PAGE;
@@ -1529,7 +1530,7 @@ static struct page *kvm_pfn_to_page(pfn_t pfn)
1529 1530
1530struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) 1531struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
1531{ 1532{
1532 pfn_t pfn; 1533 kvm_pfn_t pfn;
1533 1534
1534 pfn = gfn_to_pfn(kvm, gfn); 1535 pfn = gfn_to_pfn(kvm, gfn);
1535 1536
@@ -1539,7 +1540,7 @@ EXPORT_SYMBOL_GPL(gfn_to_page);
1539 1540
1540struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn) 1541struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
1541{ 1542{
1542 pfn_t pfn; 1543 kvm_pfn_t pfn;
1543 1544
1544 pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn); 1545 pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn);
1545 1546
@@ -1555,7 +1556,7 @@ void kvm_release_page_clean(struct page *page)
1555} 1556}
1556EXPORT_SYMBOL_GPL(kvm_release_page_clean); 1557EXPORT_SYMBOL_GPL(kvm_release_page_clean);
1557 1558
1558void kvm_release_pfn_clean(pfn_t pfn) 1559void kvm_release_pfn_clean(kvm_pfn_t pfn)
1559{ 1560{
1560 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn)) 1561 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
1561 put_page(pfn_to_page(pfn)); 1562 put_page(pfn_to_page(pfn));
@@ -1570,13 +1571,13 @@ void kvm_release_page_dirty(struct page *page)
1570} 1571}
1571EXPORT_SYMBOL_GPL(kvm_release_page_dirty); 1572EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
1572 1573
1573static void kvm_release_pfn_dirty(pfn_t pfn) 1574static void kvm_release_pfn_dirty(kvm_pfn_t pfn)
1574{ 1575{
1575 kvm_set_pfn_dirty(pfn); 1576 kvm_set_pfn_dirty(pfn);
1576 kvm_release_pfn_clean(pfn); 1577 kvm_release_pfn_clean(pfn);
1577} 1578}
1578 1579
1579void kvm_set_pfn_dirty(pfn_t pfn) 1580void kvm_set_pfn_dirty(kvm_pfn_t pfn)
1580{ 1581{
1581 if (!kvm_is_reserved_pfn(pfn)) { 1582 if (!kvm_is_reserved_pfn(pfn)) {
1582 struct page *page = pfn_to_page(pfn); 1583 struct page *page = pfn_to_page(pfn);
@@ -1587,14 +1588,14 @@ void kvm_set_pfn_dirty(pfn_t pfn)
1587} 1588}
1588EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); 1589EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
1589 1590
1590void kvm_set_pfn_accessed(pfn_t pfn) 1591void kvm_set_pfn_accessed(kvm_pfn_t pfn)
1591{ 1592{
1592 if (!kvm_is_reserved_pfn(pfn)) 1593 if (!kvm_is_reserved_pfn(pfn))
1593 mark_page_accessed(pfn_to_page(pfn)); 1594 mark_page_accessed(pfn_to_page(pfn));
1594} 1595}
1595EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); 1596EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
1596 1597
1597void kvm_get_pfn(pfn_t pfn) 1598void kvm_get_pfn(kvm_pfn_t pfn)
1598{ 1599{
1599 if (!kvm_is_reserved_pfn(pfn)) 1600 if (!kvm_is_reserved_pfn(pfn))
1600 get_page(pfn_to_page(pfn)); 1601 get_page(pfn_to_page(pfn));